diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,428535 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 61215, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 2.3445475101470947, + "learning_rate": 0.0005999999996049288, + "loss": 10.4731, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 2.277515411376953, + "learning_rate": 0.0005999999984197153, + "loss": 9.9074, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 2.00109601020813, + "learning_rate": 0.0005999999964443598, + "loss": 9.3624, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 1.896841287612915, + "learning_rate": 0.0005999999936788619, + "loss": 8.9861, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 1.8488935232162476, + "learning_rate": 0.0005999999901232217, + "loss": 8.6725, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 1.8929357528686523, + "learning_rate": 0.0005999999857774393, + "loss": 8.2981, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 1.8532694578170776, + "learning_rate": 0.0005999999806415148, + "loss": 8.0649, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 1.7290151119232178, + "learning_rate": 0.0005999999747154479, + "loss": 7.7909, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 1.5064324140548706, + "learning_rate": 0.0005999999679992389, + "loss": 7.5804, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 1.3336801528930664, + "learning_rate": 0.0005999999604928875, + "loss": 7.4451, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 1.1641117334365845, + "learning_rate": 0.0005999999521963943, + "loss": 7.2529, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 1.0922843217849731, + "learning_rate": 0.0005999999431097587, + "loss": 6.9598, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 0.9296258091926575, + "learning_rate": 0.0005999999332329811, + "loss": 6.7569, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 0.8542349934577942, + "learning_rate": 0.0005999999225660615, + "loss": 6.7468, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 0.8458271622657776, + "learning_rate": 0.0005999999111089997, + "loss": 6.6628, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 0.6604408621788025, + "learning_rate": 0.0005999998988617959, + "loss": 6.7246, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 0.6733983755111694, + "learning_rate": 0.00059999988582445, + "loss": 6.5789, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 0.5909510850906372, + "learning_rate": 0.0005999998719969623, + "loss": 6.5809, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 0.7529872059822083, + "learning_rate": 0.0005999998573793326, + "loss": 6.4471, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 0.5217795968055725, + "learning_rate": 0.000599999841971561, + "loss": 6.5395, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 0.6023449897766113, + "learning_rate": 0.0005999998257736476, + "loss": 6.5889, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 0.570929229259491, + "learning_rate": 0.0005999998087855924, + "loss": 6.3548, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 0.62796950340271, + "learning_rate": 0.0005999997910073954, + "loss": 6.3005, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 0.5817890167236328, + "learning_rate": 0.0005999997724390567, + "loss": 6.4104, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 0.5530886650085449, + "learning_rate": 0.0005999997530805763, + "loss": 6.349, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 0.643932580947876, + "learning_rate": 0.0005999997329319543, + "loss": 6.3305, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 0.5512265563011169, + "learning_rate": 0.0005999997119931908, + "loss": 6.3231, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 0.6873916387557983, + "learning_rate": 0.0005999996902642858, + "loss": 6.1347, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 0.7107514142990112, + "learning_rate": 0.0005999996677452392, + "loss": 6.3309, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 0.6492154002189636, + "learning_rate": 0.0005999996444360514, + "loss": 6.1696, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 0.5806317925453186, + "learning_rate": 0.0005999996203367222, + "loss": 6.1405, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 0.6508877873420715, + "learning_rate": 0.0005999995954472518, + "loss": 6.1023, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 0.7064071893692017, + "learning_rate": 0.00059999956976764, + "loss": 6.0525, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 0.6533799767494202, + "learning_rate": 0.0005999995432978872, + "loss": 6.2293, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 0.5126307606697083, + "learning_rate": 0.0005999995160379933, + "loss": 6.0087, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 0.6564686298370361, + "learning_rate": 0.0005999994879879586, + "loss": 6.1167, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 0.6836735606193542, + "learning_rate": 0.0005999994591477828, + "loss": 5.8947, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 0.6649318337440491, + "learning_rate": 0.0005999994295174661, + "loss": 6.0403, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 0.6308470368385315, + "learning_rate": 0.0005999993990970087, + "loss": 6.0636, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 0.5832207202911377, + "learning_rate": 0.0005999993678864107, + "loss": 6.0658, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 0.5933493971824646, + "learning_rate": 0.000599999335885672, + "loss": 6.07, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 0.6838207244873047, + "learning_rate": 0.0005999993030947929, + "loss": 5.8559, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 0.6562783718109131, + "learning_rate": 0.0005999992695137733, + "loss": 5.7866, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 0.6013045310974121, + "learning_rate": 0.0005999992351426134, + "loss": 5.631, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 0.579399824142456, + "learning_rate": 0.0005999991999813132, + "loss": 6.0032, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 0.6205735206604004, + "learning_rate": 0.0005999991640298728, + "loss": 5.4523, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 0.5519394874572754, + "learning_rate": 0.0005999991272882925, + "loss": 5.6271, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 0.5361047983169556, + "learning_rate": 0.0005999990897565721, + "loss": 6.0044, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 0.6790111064910889, + "learning_rate": 0.0005999990514347119, + "loss": 5.683, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 0.6192212700843811, + "learning_rate": 0.0005999990123227119, + "loss": 5.8079, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 0.6024751663208008, + "learning_rate": 0.0005999989724205721, + "loss": 5.7658, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 0.7180526852607727, + "learning_rate": 0.000599998931728293, + "loss": 5.7054, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 0.7811439037322998, + "learning_rate": 0.0005999988902458743, + "loss": 5.7338, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 0.678833544254303, + "learning_rate": 0.0005999988479733163, + "loss": 5.7092, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 0.633277952671051, + "learning_rate": 0.0005999988049106191, + "loss": 5.6259, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 1.4287935495376587, + "learning_rate": 0.0005999987610577827, + "loss": 5.6371, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 0.7538682818412781, + "learning_rate": 0.0005999987164148074, + "loss": 5.7429, + "step": 57 + }, + { + "epoch": 0.0, + "grad_norm": 0.5953795909881592, + "learning_rate": 0.0005999986709816932, + "loss": 5.7226, + "step": 58 + }, + { + "epoch": 0.0, + "grad_norm": 0.7172217965126038, + "learning_rate": 0.0005999986247584401, + "loss": 5.5155, + "step": 59 + }, + { + "epoch": 0.0, + "grad_norm": 0.7222607135772705, + "learning_rate": 0.0005999985777450485, + "loss": 5.4241, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 0.7364643216133118, + "learning_rate": 0.0005999985299415183, + "loss": 5.5181, + "step": 61 + }, + { + "epoch": 0.0, + "grad_norm": 0.7325772047042847, + "learning_rate": 0.0005999984813478497, + "loss": 5.6243, + "step": 62 + }, + { + "epoch": 0.0, + "grad_norm": 0.6013771891593933, + "learning_rate": 0.000599998431964043, + "loss": 5.5167, + "step": 63 + }, + { + "epoch": 0.0, + "grad_norm": 0.5635715126991272, + "learning_rate": 0.000599998381790098, + "loss": 5.7081, + "step": 64 + }, + { + "epoch": 0.0, + "grad_norm": 0.6563093662261963, + "learning_rate": 0.000599998330826015, + "loss": 5.4648, + "step": 65 + }, + { + "epoch": 0.0, + "grad_norm": 0.5509313941001892, + "learning_rate": 0.0005999982790717942, + "loss": 5.4311, + "step": 66 + }, + { + "epoch": 0.0, + "grad_norm": 0.7320454716682434, + "learning_rate": 0.0005999982265274357, + "loss": 5.5097, + "step": 67 + }, + { + "epoch": 0.0, + "grad_norm": 0.6863231062889099, + "learning_rate": 0.0005999981731929395, + "loss": 5.5005, + "step": 68 + }, + { + "epoch": 0.0, + "grad_norm": 0.543449342250824, + "learning_rate": 0.0005999981190683059, + "loss": 5.5048, + "step": 69 + }, + { + "epoch": 0.0, + "grad_norm": 0.5858511328697205, + "learning_rate": 0.000599998064153535, + "loss": 5.5506, + "step": 70 + }, + { + "epoch": 0.0, + "grad_norm": 0.7430124878883362, + "learning_rate": 0.0005999980084486269, + "loss": 5.2905, + "step": 71 + }, + { + "epoch": 0.0, + "grad_norm": 0.5860936045646667, + "learning_rate": 0.0005999979519535819, + "loss": 5.5213, + "step": 72 + }, + { + "epoch": 0.0, + "grad_norm": 0.5965762138366699, + "learning_rate": 0.0005999978946683999, + "loss": 5.1871, + "step": 73 + }, + { + "epoch": 0.0, + "grad_norm": 0.6979895234107971, + "learning_rate": 0.0005999978365930813, + "loss": 5.3346, + "step": 74 + }, + { + "epoch": 0.0, + "grad_norm": 0.7160101532936096, + "learning_rate": 0.000599997777727626, + "loss": 5.1066, + "step": 75 + }, + { + "epoch": 0.0, + "grad_norm": 0.5626764297485352, + "learning_rate": 0.0005999977180720344, + "loss": 5.459, + "step": 76 + }, + { + "epoch": 0.0, + "grad_norm": 0.6080682277679443, + "learning_rate": 0.0005999976576263064, + "loss": 5.6085, + "step": 77 + }, + { + "epoch": 0.0, + "grad_norm": 0.6767348647117615, + "learning_rate": 0.0005999975963904425, + "loss": 5.4253, + "step": 78 + }, + { + "epoch": 0.0, + "grad_norm": 0.5835095047950745, + "learning_rate": 0.0005999975343644425, + "loss": 5.3682, + "step": 79 + }, + { + "epoch": 0.0, + "grad_norm": 0.5829843282699585, + "learning_rate": 0.0005999974715483068, + "loss": 5.2841, + "step": 80 + }, + { + "epoch": 0.0, + "grad_norm": 0.6556689739227295, + "learning_rate": 0.0005999974079420355, + "loss": 5.3828, + "step": 81 + }, + { + "epoch": 0.0, + "grad_norm": 0.5656118392944336, + "learning_rate": 0.0005999973435456287, + "loss": 5.4779, + "step": 82 + }, + { + "epoch": 0.0, + "grad_norm": 0.553550660610199, + "learning_rate": 0.0005999972783590868, + "loss": 5.4759, + "step": 83 + }, + { + "epoch": 0.0, + "grad_norm": 0.5812600255012512, + "learning_rate": 0.0005999972123824096, + "loss": 5.3517, + "step": 84 + }, + { + "epoch": 0.0, + "grad_norm": 0.6190153956413269, + "learning_rate": 0.0005999971456155976, + "loss": 5.4019, + "step": 85 + }, + { + "epoch": 0.0, + "grad_norm": 0.5636712312698364, + "learning_rate": 0.0005999970780586508, + "loss": 5.2974, + "step": 86 + }, + { + "epoch": 0.0, + "grad_norm": 0.546588659286499, + "learning_rate": 0.0005999970097115694, + "loss": 5.4874, + "step": 87 + }, + { + "epoch": 0.0, + "grad_norm": 0.6141658425331116, + "learning_rate": 0.0005999969405743537, + "loss": 5.431, + "step": 88 + }, + { + "epoch": 0.0, + "grad_norm": 0.5903357267379761, + "learning_rate": 0.0005999968706470037, + "loss": 5.4673, + "step": 89 + }, + { + "epoch": 0.0, + "grad_norm": 0.5161350965499878, + "learning_rate": 0.0005999967999295199, + "loss": 5.2426, + "step": 90 + }, + { + "epoch": 0.0, + "grad_norm": 0.6005253791809082, + "learning_rate": 0.000599996728421902, + "loss": 5.4007, + "step": 91 + }, + { + "epoch": 0.0, + "grad_norm": 0.5921294093132019, + "learning_rate": 0.0005999966561241506, + "loss": 5.3612, + "step": 92 + }, + { + "epoch": 0.0, + "grad_norm": 0.5736501812934875, + "learning_rate": 0.0005999965830362657, + "loss": 5.327, + "step": 93 + }, + { + "epoch": 0.0, + "grad_norm": 0.5713847279548645, + "learning_rate": 0.0005999965091582475, + "loss": 5.2216, + "step": 94 + }, + { + "epoch": 0.0, + "grad_norm": 0.583519697189331, + "learning_rate": 0.0005999964344900962, + "loss": 5.101, + "step": 95 + }, + { + "epoch": 0.0, + "grad_norm": 0.6061059236526489, + "learning_rate": 0.0005999963590318122, + "loss": 5.3313, + "step": 96 + }, + { + "epoch": 0.0, + "grad_norm": 0.605904221534729, + "learning_rate": 0.0005999962827833954, + "loss": 5.3037, + "step": 97 + }, + { + "epoch": 0.0, + "grad_norm": 0.5677119493484497, + "learning_rate": 0.0005999962057448462, + "loss": 5.2484, + "step": 98 + }, + { + "epoch": 0.0, + "grad_norm": 0.6556852459907532, + "learning_rate": 0.0005999961279161646, + "loss": 5.3307, + "step": 99 + }, + { + "epoch": 0.0, + "grad_norm": 0.6364889144897461, + "learning_rate": 0.000599996049297351, + "loss": 5.3118, + "step": 100 + }, + { + "epoch": 0.0, + "grad_norm": 0.6421851515769958, + "learning_rate": 0.0005999959698884056, + "loss": 5.3212, + "step": 101 + }, + { + "epoch": 0.0, + "grad_norm": 0.5902762413024902, + "learning_rate": 0.0005999958896893285, + "loss": 5.2373, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 0.5422645211219788, + "learning_rate": 0.00059999580870012, + "loss": 5.3965, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 0.6656525731086731, + "learning_rate": 0.0005999957269207802, + "loss": 5.3206, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 0.6076167821884155, + "learning_rate": 0.0005999956443513094, + "loss": 5.2297, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.5958263278007507, + "learning_rate": 0.0005999955609917078, + "loss": 5.2308, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 0.6043804287910461, + "learning_rate": 0.0005999954768419757, + "loss": 5.3056, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 0.5793024301528931, + "learning_rate": 0.0005999953919021132, + "loss": 5.1175, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 0.6292415857315063, + "learning_rate": 0.0005999953061721206, + "loss": 5.2404, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 0.539836585521698, + "learning_rate": 0.0005999952196519981, + "loss": 5.3884, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.5717549324035645, + "learning_rate": 0.0005999951323417458, + "loss": 5.172, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 0.5996467471122742, + "learning_rate": 0.0005999950442413642, + "loss": 5.1867, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 0.6268661022186279, + "learning_rate": 0.0005999949553508534, + "loss": 5.2663, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 0.6602498888969421, + "learning_rate": 0.0005999948656702136, + "loss": 5.0668, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 0.5790102481842041, + "learning_rate": 0.0005999947751994451, + "loss": 5.1133, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.6533108949661255, + "learning_rate": 0.000599994683938548, + "loss": 5.1798, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 0.6445215344429016, + "learning_rate": 0.0005999945918875227, + "loss": 5.1873, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 0.5656430125236511, + "learning_rate": 0.0005999944990463693, + "loss": 5.0912, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 0.5833689570426941, + "learning_rate": 0.0005999944054150882, + "loss": 5.1466, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 0.6031045317649841, + "learning_rate": 0.0005999943109936796, + "loss": 5.1367, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 0.6581068634986877, + "learning_rate": 0.0005999942157821436, + "loss": 5.1474, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 0.5878651738166809, + "learning_rate": 0.0005999941197804806, + "loss": 5.116, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 0.6598708629608154, + "learning_rate": 0.0005999940229886909, + "loss": 5.2586, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 0.6158697605133057, + "learning_rate": 0.0005999939254067746, + "loss": 5.2573, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 0.624995231628418, + "learning_rate": 0.0005999938270347321, + "loss": 5.2498, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 0.5899373292922974, + "learning_rate": 0.0005999937278725636, + "loss": 5.2714, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 0.5740645527839661, + "learning_rate": 0.0005999936279202693, + "loss": 5.1358, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 0.609225332736969, + "learning_rate": 0.0005999935271778495, + "loss": 5.2404, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 0.6659890413284302, + "learning_rate": 0.0005999934256453046, + "loss": 4.9877, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 0.5626158714294434, + "learning_rate": 0.0005999933233226347, + "loss": 5.2386, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 0.5483737587928772, + "learning_rate": 0.0005999932202098401, + "loss": 5.2031, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 0.6274495124816895, + "learning_rate": 0.0005999931163069211, + "loss": 5.1119, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 0.6077089309692383, + "learning_rate": 0.000599993011613878, + "loss": 4.9585, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 0.5846816301345825, + "learning_rate": 0.000599992906130711, + "loss": 5.2099, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 0.6238222718238831, + "learning_rate": 0.0005999927998574204, + "loss": 5.0893, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 0.720881462097168, + "learning_rate": 0.0005999926927940066, + "loss": 5.0845, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 0.4975273013114929, + "learning_rate": 0.0005999925849404696, + "loss": 5.0894, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 0.6107990741729736, + "learning_rate": 0.00059999247629681, + "loss": 5.0879, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 0.6482533812522888, + "learning_rate": 0.000599992366863028, + "loss": 5.1011, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 0.5770416855812073, + "learning_rate": 0.0005999922566391237, + "loss": 4.9711, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 0.5987694263458252, + "learning_rate": 0.0005999921456250976, + "loss": 5.2741, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 0.5902721285820007, + "learning_rate": 0.00059999203382095, + "loss": 4.9697, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 0.5815165042877197, + "learning_rate": 0.000599991921226681, + "loss": 5.0085, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 0.6174842119216919, + "learning_rate": 0.000599991807842291, + "loss": 5.1251, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 0.5789567232131958, + "learning_rate": 0.0005999916936677804, + "loss": 5.0212, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 0.5186668634414673, + "learning_rate": 0.0005999915787031493, + "loss": 5.0934, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 0.5524886250495911, + "learning_rate": 0.0005999914629483983, + "loss": 5.0444, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 0.6883447766304016, + "learning_rate": 0.0005999913464035273, + "loss": 5.0623, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 0.5598011016845703, + "learning_rate": 0.000599991229068537, + "loss": 5.1861, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 0.6614800691604614, + "learning_rate": 0.0005999911109434274, + "loss": 4.9011, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 0.5990419983863831, + "learning_rate": 0.0005999909920281991, + "loss": 4.9618, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 0.5454047322273254, + "learning_rate": 0.0005999908723228522, + "loss": 5.1169, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 0.6134856939315796, + "learning_rate": 0.000599990751827387, + "loss": 5.0239, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 0.6012156009674072, + "learning_rate": 0.000599990630541804, + "loss": 5.24, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 0.5873432755470276, + "learning_rate": 0.0005999905084661033, + "loss": 5.1557, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 0.6696448922157288, + "learning_rate": 0.0005999903856002855, + "loss": 5.0485, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 0.5300459265708923, + "learning_rate": 0.0005999902619443507, + "loss": 5.0126, + "step": 157 + }, + { + "epoch": 0.01, + "grad_norm": 0.5879744291305542, + "learning_rate": 0.0005999901374982992, + "loss": 4.8251, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 0.6002203822135925, + "learning_rate": 0.0005999900122621314, + "loss": 4.8551, + "step": 159 + }, + { + "epoch": 0.01, + "grad_norm": 0.584845244884491, + "learning_rate": 0.0005999898862358478, + "loss": 5.0479, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 0.7027909755706787, + "learning_rate": 0.0005999897594194485, + "loss": 5.1492, + "step": 161 + }, + { + "epoch": 0.01, + "grad_norm": 0.6529715061187744, + "learning_rate": 0.0005999896318129339, + "loss": 5.0112, + "step": 162 + }, + { + "epoch": 0.01, + "grad_norm": 0.7082669734954834, + "learning_rate": 0.0005999895034163043, + "loss": 4.8184, + "step": 163 + }, + { + "epoch": 0.01, + "grad_norm": 0.6252927184104919, + "learning_rate": 0.0005999893742295602, + "loss": 4.8673, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 0.6685312390327454, + "learning_rate": 0.0005999892442527017, + "loss": 4.9796, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 0.6606252193450928, + "learning_rate": 0.0005999891134857293, + "loss": 5.0516, + "step": 166 + }, + { + "epoch": 0.01, + "grad_norm": 0.6550689339637756, + "learning_rate": 0.0005999889819286433, + "loss": 4.9988, + "step": 167 + }, + { + "epoch": 0.01, + "grad_norm": 0.6406249403953552, + "learning_rate": 0.000599988849581444, + "loss": 5.2205, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 0.5724087953567505, + "learning_rate": 0.0005999887164441319, + "loss": 4.8884, + "step": 169 + }, + { + "epoch": 0.01, + "grad_norm": 0.6238783597946167, + "learning_rate": 0.0005999885825167073, + "loss": 5.0775, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 0.6989482641220093, + "learning_rate": 0.0005999884477991703, + "loss": 4.9027, + "step": 171 + }, + { + "epoch": 0.01, + "grad_norm": 0.5641810894012451, + "learning_rate": 0.0005999883122915216, + "loss": 5.0401, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 0.5841800570487976, + "learning_rate": 0.0005999881759937614, + "loss": 4.9143, + "step": 173 + }, + { + "epoch": 0.01, + "grad_norm": 0.6667760610580444, + "learning_rate": 0.0005999880389058902, + "loss": 4.7318, + "step": 174 + }, + { + "epoch": 0.01, + "grad_norm": 0.5549502372741699, + "learning_rate": 0.0005999879010279082, + "loss": 4.9401, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 0.562578022480011, + "learning_rate": 0.0005999877623598156, + "loss": 4.8758, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 0.6487370729446411, + "learning_rate": 0.0005999876229016132, + "loss": 4.8428, + "step": 177 + }, + { + "epoch": 0.01, + "grad_norm": 0.5677328705787659, + "learning_rate": 0.000599987482653301, + "loss": 4.94, + "step": 178 + }, + { + "epoch": 0.01, + "grad_norm": 0.5713781714439392, + "learning_rate": 0.0005999873416148795, + "loss": 4.9779, + "step": 179 + }, + { + "epoch": 0.01, + "grad_norm": 0.6296773552894592, + "learning_rate": 0.0005999871997863491, + "loss": 5.0549, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 0.609196126461029, + "learning_rate": 0.0005999870571677102, + "loss": 4.8211, + "step": 181 + }, + { + "epoch": 0.01, + "grad_norm": 0.5543152689933777, + "learning_rate": 0.0005999869137589631, + "loss": 4.9272, + "step": 182 + }, + { + "epoch": 0.01, + "grad_norm": 0.6066011190414429, + "learning_rate": 0.0005999867695601082, + "loss": 4.9566, + "step": 183 + }, + { + "epoch": 0.01, + "grad_norm": 0.5656384229660034, + "learning_rate": 0.0005999866245711458, + "loss": 4.9814, + "step": 184 + }, + { + "epoch": 0.01, + "grad_norm": 0.6074258685112, + "learning_rate": 0.0005999864787920765, + "loss": 4.723, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 0.5915899872779846, + "learning_rate": 0.0005999863322229005, + "loss": 4.6722, + "step": 186 + }, + { + "epoch": 0.01, + "grad_norm": 0.5853182673454285, + "learning_rate": 0.0005999861848636183, + "loss": 4.8963, + "step": 187 + }, + { + "epoch": 0.01, + "grad_norm": 0.5973837375640869, + "learning_rate": 0.00059998603671423, + "loss": 4.9082, + "step": 188 + }, + { + "epoch": 0.01, + "grad_norm": 0.5674445033073425, + "learning_rate": 0.0005999858877747365, + "loss": 4.8848, + "step": 189 + }, + { + "epoch": 0.01, + "grad_norm": 0.6249287128448486, + "learning_rate": 0.0005999857380451376, + "loss": 4.7804, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 0.5839101076126099, + "learning_rate": 0.0005999855875254343, + "loss": 4.9382, + "step": 191 + }, + { + "epoch": 0.01, + "grad_norm": 0.6714131236076355, + "learning_rate": 0.0005999854362156265, + "loss": 4.9891, + "step": 192 + }, + { + "epoch": 0.01, + "grad_norm": 0.6222569942474365, + "learning_rate": 0.0005999852841157148, + "loss": 4.8221, + "step": 193 + }, + { + "epoch": 0.01, + "grad_norm": 0.671981692314148, + "learning_rate": 0.0005999851312256997, + "loss": 4.7026, + "step": 194 + }, + { + "epoch": 0.01, + "grad_norm": 0.5763186812400818, + "learning_rate": 0.0005999849775455815, + "loss": 4.816, + "step": 195 + }, + { + "epoch": 0.01, + "grad_norm": 0.5959866642951965, + "learning_rate": 0.0005999848230753605, + "loss": 4.9962, + "step": 196 + }, + { + "epoch": 0.01, + "grad_norm": 0.6573556661605835, + "learning_rate": 0.0005999846678150372, + "loss": 4.8953, + "step": 197 + }, + { + "epoch": 0.01, + "grad_norm": 0.6091251373291016, + "learning_rate": 0.0005999845117646122, + "loss": 4.9322, + "step": 198 + }, + { + "epoch": 0.01, + "grad_norm": 0.6092210412025452, + "learning_rate": 0.0005999843549240856, + "loss": 5.0708, + "step": 199 + }, + { + "epoch": 0.01, + "grad_norm": 0.6042549014091492, + "learning_rate": 0.000599984197293458, + "loss": 4.8596, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 0.6179243326187134, + "learning_rate": 0.0005999840388727296, + "loss": 4.9343, + "step": 201 + }, + { + "epoch": 0.01, + "grad_norm": 0.558975875377655, + "learning_rate": 0.0005999838796619012, + "loss": 4.736, + "step": 202 + }, + { + "epoch": 0.01, + "grad_norm": 0.6428976058959961, + "learning_rate": 0.0005999837196609727, + "loss": 4.7473, + "step": 203 + }, + { + "epoch": 0.01, + "grad_norm": 0.6300376057624817, + "learning_rate": 0.0005999835588699451, + "loss": 4.5575, + "step": 204 + }, + { + "epoch": 0.01, + "grad_norm": 0.6382180452346802, + "learning_rate": 0.0005999833972888184, + "loss": 4.649, + "step": 205 + }, + { + "epoch": 0.01, + "grad_norm": 0.6212637424468994, + "learning_rate": 0.0005999832349175932, + "loss": 4.9914, + "step": 206 + }, + { + "epoch": 0.01, + "grad_norm": 0.5995153188705444, + "learning_rate": 0.0005999830717562699, + "loss": 4.7988, + "step": 207 + }, + { + "epoch": 0.01, + "grad_norm": 0.6228866577148438, + "learning_rate": 0.0005999829078048489, + "loss": 4.7471, + "step": 208 + }, + { + "epoch": 0.01, + "grad_norm": 0.5948235988616943, + "learning_rate": 0.0005999827430633307, + "loss": 4.7403, + "step": 209 + }, + { + "epoch": 0.01, + "grad_norm": 0.5553930401802063, + "learning_rate": 0.0005999825775317156, + "loss": 4.7738, + "step": 210 + }, + { + "epoch": 0.01, + "grad_norm": 0.58103346824646, + "learning_rate": 0.0005999824112100042, + "loss": 4.8885, + "step": 211 + }, + { + "epoch": 0.01, + "grad_norm": 0.6065656542778015, + "learning_rate": 0.0005999822440981968, + "loss": 4.7514, + "step": 212 + }, + { + "epoch": 0.01, + "grad_norm": 0.6187020540237427, + "learning_rate": 0.000599982076196294, + "loss": 4.7373, + "step": 213 + }, + { + "epoch": 0.01, + "grad_norm": 0.6189316511154175, + "learning_rate": 0.0005999819075042961, + "loss": 4.7754, + "step": 214 + }, + { + "epoch": 0.01, + "grad_norm": 0.6775714159011841, + "learning_rate": 0.0005999817380222035, + "loss": 4.7958, + "step": 215 + }, + { + "epoch": 0.01, + "grad_norm": 0.6709480285644531, + "learning_rate": 0.0005999815677500168, + "loss": 4.6884, + "step": 216 + }, + { + "epoch": 0.01, + "grad_norm": 0.5949779152870178, + "learning_rate": 0.0005999813966877363, + "loss": 4.7673, + "step": 217 + }, + { + "epoch": 0.01, + "grad_norm": 0.6341105699539185, + "learning_rate": 0.0005999812248353626, + "loss": 4.9962, + "step": 218 + }, + { + "epoch": 0.01, + "grad_norm": 0.7519387006759644, + "learning_rate": 0.0005999810521928962, + "loss": 4.685, + "step": 219 + }, + { + "epoch": 0.01, + "grad_norm": 0.6779834628105164, + "learning_rate": 0.0005999808787603373, + "loss": 4.8129, + "step": 220 + }, + { + "epoch": 0.01, + "grad_norm": 0.6366468667984009, + "learning_rate": 0.0005999807045376866, + "loss": 4.7325, + "step": 221 + }, + { + "epoch": 0.01, + "grad_norm": 0.6042128205299377, + "learning_rate": 0.0005999805295249444, + "loss": 4.6506, + "step": 222 + }, + { + "epoch": 0.01, + "grad_norm": 0.7438672780990601, + "learning_rate": 0.0005999803537221112, + "loss": 4.741, + "step": 223 + }, + { + "epoch": 0.01, + "grad_norm": 0.6630218029022217, + "learning_rate": 0.0005999801771291876, + "loss": 4.7997, + "step": 224 + }, + { + "epoch": 0.01, + "grad_norm": 0.5924032330513, + "learning_rate": 0.0005999799997461738, + "loss": 4.6237, + "step": 225 + }, + { + "epoch": 0.01, + "grad_norm": 0.6375287175178528, + "learning_rate": 0.0005999798215730705, + "loss": 4.6495, + "step": 226 + }, + { + "epoch": 0.01, + "grad_norm": 0.6222159266471863, + "learning_rate": 0.000599979642609878, + "loss": 4.7176, + "step": 227 + }, + { + "epoch": 0.01, + "grad_norm": 0.6462298631668091, + "learning_rate": 0.0005999794628565969, + "loss": 4.79, + "step": 228 + }, + { + "epoch": 0.01, + "grad_norm": 0.5806784629821777, + "learning_rate": 0.0005999792823132276, + "loss": 4.7, + "step": 229 + }, + { + "epoch": 0.01, + "grad_norm": 0.6010195016860962, + "learning_rate": 0.0005999791009797706, + "loss": 4.7524, + "step": 230 + }, + { + "epoch": 0.01, + "grad_norm": 0.594200849533081, + "learning_rate": 0.0005999789188562264, + "loss": 4.8351, + "step": 231 + }, + { + "epoch": 0.01, + "grad_norm": 0.647185206413269, + "learning_rate": 0.0005999787359425954, + "loss": 4.7011, + "step": 232 + }, + { + "epoch": 0.01, + "grad_norm": 0.6233699321746826, + "learning_rate": 0.0005999785522388783, + "loss": 4.8662, + "step": 233 + }, + { + "epoch": 0.01, + "grad_norm": 0.6353042721748352, + "learning_rate": 0.0005999783677450753, + "loss": 4.5278, + "step": 234 + }, + { + "epoch": 0.01, + "grad_norm": 0.6044039130210876, + "learning_rate": 0.000599978182461187, + "loss": 4.7688, + "step": 235 + }, + { + "epoch": 0.01, + "grad_norm": 0.5654269456863403, + "learning_rate": 0.000599977996387214, + "loss": 4.7677, + "step": 236 + }, + { + "epoch": 0.01, + "grad_norm": 0.6043969988822937, + "learning_rate": 0.0005999778095231566, + "loss": 4.6497, + "step": 237 + }, + { + "epoch": 0.01, + "grad_norm": 0.6131507754325867, + "learning_rate": 0.0005999776218690154, + "loss": 4.4584, + "step": 238 + }, + { + "epoch": 0.01, + "grad_norm": 0.6189717650413513, + "learning_rate": 0.0005999774334247909, + "loss": 4.7738, + "step": 239 + }, + { + "epoch": 0.01, + "grad_norm": 0.6453130841255188, + "learning_rate": 0.0005999772441904836, + "loss": 4.7744, + "step": 240 + }, + { + "epoch": 0.01, + "grad_norm": 0.5762782096862793, + "learning_rate": 0.0005999770541660939, + "loss": 4.6736, + "step": 241 + }, + { + "epoch": 0.01, + "grad_norm": 0.6359663009643555, + "learning_rate": 0.0005999768633516224, + "loss": 4.6294, + "step": 242 + }, + { + "epoch": 0.01, + "grad_norm": 0.5880213379859924, + "learning_rate": 0.0005999766717470696, + "loss": 4.7633, + "step": 243 + }, + { + "epoch": 0.01, + "grad_norm": 0.5674964189529419, + "learning_rate": 0.0005999764793524359, + "loss": 4.9202, + "step": 244 + }, + { + "epoch": 0.01, + "grad_norm": 0.6202216744422913, + "learning_rate": 0.000599976286167722, + "loss": 4.8709, + "step": 245 + }, + { + "epoch": 0.01, + "grad_norm": 0.6854368448257446, + "learning_rate": 0.0005999760921929283, + "loss": 4.5826, + "step": 246 + }, + { + "epoch": 0.01, + "grad_norm": 0.6487476825714111, + "learning_rate": 0.0005999758974280551, + "loss": 4.6156, + "step": 247 + }, + { + "epoch": 0.01, + "grad_norm": 0.650581955909729, + "learning_rate": 0.0005999757018731034, + "loss": 4.6199, + "step": 248 + }, + { + "epoch": 0.01, + "grad_norm": 0.6073902249336243, + "learning_rate": 0.0005999755055280732, + "loss": 4.6756, + "step": 249 + }, + { + "epoch": 0.01, + "grad_norm": 0.8026038408279419, + "learning_rate": 0.0005999753083929653, + "loss": 4.6487, + "step": 250 + }, + { + "epoch": 0.01, + "grad_norm": 0.5802706480026245, + "learning_rate": 0.0005999751104677803, + "loss": 4.8061, + "step": 251 + }, + { + "epoch": 0.01, + "grad_norm": 0.6185906529426575, + "learning_rate": 0.0005999749117525185, + "loss": 4.5397, + "step": 252 + }, + { + "epoch": 0.01, + "grad_norm": 0.6637834310531616, + "learning_rate": 0.0005999747122471805, + "loss": 4.7092, + "step": 253 + }, + { + "epoch": 0.01, + "grad_norm": 0.6466782093048096, + "learning_rate": 0.0005999745119517669, + "loss": 4.8699, + "step": 254 + }, + { + "epoch": 0.01, + "grad_norm": 0.7292912602424622, + "learning_rate": 0.0005999743108662781, + "loss": 4.6157, + "step": 255 + }, + { + "epoch": 0.01, + "grad_norm": 0.6685052514076233, + "learning_rate": 0.0005999741089907146, + "loss": 4.4988, + "step": 256 + }, + { + "epoch": 0.01, + "grad_norm": 0.6441407799720764, + "learning_rate": 0.0005999739063250772, + "loss": 4.7335, + "step": 257 + }, + { + "epoch": 0.01, + "grad_norm": 0.7168517708778381, + "learning_rate": 0.0005999737028693662, + "loss": 4.7464, + "step": 258 + }, + { + "epoch": 0.01, + "grad_norm": 0.5829117298126221, + "learning_rate": 0.0005999734986235822, + "loss": 4.6709, + "step": 259 + }, + { + "epoch": 0.01, + "grad_norm": 0.6130709052085876, + "learning_rate": 0.0005999732935877258, + "loss": 4.942, + "step": 260 + }, + { + "epoch": 0.01, + "grad_norm": 0.5860216617584229, + "learning_rate": 0.0005999730877617974, + "loss": 4.8475, + "step": 261 + }, + { + "epoch": 0.01, + "grad_norm": 0.6693971157073975, + "learning_rate": 0.0005999728811457975, + "loss": 4.6322, + "step": 262 + }, + { + "epoch": 0.01, + "grad_norm": 0.6591095924377441, + "learning_rate": 0.0005999726737397269, + "loss": 4.5757, + "step": 263 + }, + { + "epoch": 0.01, + "grad_norm": 0.6148874759674072, + "learning_rate": 0.0005999724655435862, + "loss": 4.6693, + "step": 264 + }, + { + "epoch": 0.01, + "grad_norm": 0.6252702474594116, + "learning_rate": 0.0005999722565573754, + "loss": 4.8816, + "step": 265 + }, + { + "epoch": 0.01, + "grad_norm": 0.6648889780044556, + "learning_rate": 0.0005999720467810956, + "loss": 4.8358, + "step": 266 + }, + { + "epoch": 0.01, + "grad_norm": 0.6808665990829468, + "learning_rate": 0.0005999718362147471, + "loss": 4.7751, + "step": 267 + }, + { + "epoch": 0.01, + "grad_norm": 0.6691820621490479, + "learning_rate": 0.0005999716248583306, + "loss": 4.7164, + "step": 268 + }, + { + "epoch": 0.01, + "grad_norm": 0.7447707653045654, + "learning_rate": 0.0005999714127118465, + "loss": 4.466, + "step": 269 + }, + { + "epoch": 0.01, + "grad_norm": 0.6744222640991211, + "learning_rate": 0.0005999711997752954, + "loss": 4.3235, + "step": 270 + }, + { + "epoch": 0.01, + "grad_norm": 0.7479273676872253, + "learning_rate": 0.000599970986048678, + "loss": 4.63, + "step": 271 + }, + { + "epoch": 0.01, + "grad_norm": 0.7194980382919312, + "learning_rate": 0.0005999707715319947, + "loss": 4.4956, + "step": 272 + }, + { + "epoch": 0.01, + "grad_norm": 0.6701153516769409, + "learning_rate": 0.000599970556225246, + "loss": 4.5292, + "step": 273 + }, + { + "epoch": 0.01, + "grad_norm": 0.653418779373169, + "learning_rate": 0.0005999703401284328, + "loss": 4.734, + "step": 274 + }, + { + "epoch": 0.01, + "grad_norm": 0.6312591433525085, + "learning_rate": 0.0005999701232415552, + "loss": 4.6344, + "step": 275 + }, + { + "epoch": 0.01, + "grad_norm": 0.6036155223846436, + "learning_rate": 0.0005999699055646143, + "loss": 4.5935, + "step": 276 + }, + { + "epoch": 0.01, + "grad_norm": 0.6546796560287476, + "learning_rate": 0.0005999696870976103, + "loss": 4.4745, + "step": 277 + }, + { + "epoch": 0.01, + "grad_norm": 0.6474147439002991, + "learning_rate": 0.0005999694678405438, + "loss": 4.3296, + "step": 278 + }, + { + "epoch": 0.01, + "grad_norm": 0.6968685984611511, + "learning_rate": 0.0005999692477934153, + "loss": 4.6786, + "step": 279 + }, + { + "epoch": 0.01, + "grad_norm": 0.6009911894798279, + "learning_rate": 0.0005999690269562259, + "loss": 4.6661, + "step": 280 + }, + { + "epoch": 0.01, + "grad_norm": 0.6383833289146423, + "learning_rate": 0.0005999688053289757, + "loss": 4.7877, + "step": 281 + }, + { + "epoch": 0.01, + "grad_norm": 0.6396824717521667, + "learning_rate": 0.0005999685829116652, + "loss": 4.6966, + "step": 282 + }, + { + "epoch": 0.01, + "grad_norm": 0.6602899432182312, + "learning_rate": 0.0005999683597042953, + "loss": 4.4193, + "step": 283 + }, + { + "epoch": 0.01, + "grad_norm": 0.6245065927505493, + "learning_rate": 0.0005999681357068665, + "loss": 4.4753, + "step": 284 + }, + { + "epoch": 0.01, + "grad_norm": 0.6217121481895447, + "learning_rate": 0.0005999679109193794, + "loss": 4.3305, + "step": 285 + }, + { + "epoch": 0.01, + "grad_norm": 0.5806468725204468, + "learning_rate": 0.0005999676853418345, + "loss": 4.57, + "step": 286 + }, + { + "epoch": 0.01, + "grad_norm": 0.7057567238807678, + "learning_rate": 0.0005999674589742325, + "loss": 4.7142, + "step": 287 + }, + { + "epoch": 0.01, + "grad_norm": 0.6089333891868591, + "learning_rate": 0.0005999672318165738, + "loss": 4.5676, + "step": 288 + }, + { + "epoch": 0.01, + "grad_norm": 0.6079134941101074, + "learning_rate": 0.0005999670038688592, + "loss": 4.7218, + "step": 289 + }, + { + "epoch": 0.01, + "grad_norm": 0.611993670463562, + "learning_rate": 0.0005999667751310893, + "loss": 4.5366, + "step": 290 + }, + { + "epoch": 0.01, + "grad_norm": 0.5751462578773499, + "learning_rate": 0.0005999665456032647, + "loss": 4.5681, + "step": 291 + }, + { + "epoch": 0.01, + "grad_norm": 0.6958464980125427, + "learning_rate": 0.0005999663152853858, + "loss": 4.714, + "step": 292 + }, + { + "epoch": 0.01, + "grad_norm": 0.7553857564926147, + "learning_rate": 0.0005999660841774534, + "loss": 4.6257, + "step": 293 + }, + { + "epoch": 0.01, + "grad_norm": 0.6197561621665955, + "learning_rate": 0.0005999658522794681, + "loss": 4.6357, + "step": 294 + }, + { + "epoch": 0.01, + "grad_norm": 0.5696096420288086, + "learning_rate": 0.0005999656195914305, + "loss": 4.7116, + "step": 295 + }, + { + "epoch": 0.01, + "grad_norm": 0.6267894506454468, + "learning_rate": 0.0005999653861133411, + "loss": 4.6124, + "step": 296 + }, + { + "epoch": 0.01, + "grad_norm": 0.6374806761741638, + "learning_rate": 0.0005999651518452006, + "loss": 4.4663, + "step": 297 + }, + { + "epoch": 0.01, + "grad_norm": 0.619802713394165, + "learning_rate": 0.0005999649167870097, + "loss": 4.6616, + "step": 298 + }, + { + "epoch": 0.01, + "grad_norm": 0.5879377722740173, + "learning_rate": 0.0005999646809387689, + "loss": 4.5237, + "step": 299 + }, + { + "epoch": 0.01, + "grad_norm": 0.6228669881820679, + "learning_rate": 0.0005999644443004789, + "loss": 4.7327, + "step": 300 + }, + { + "epoch": 0.01, + "grad_norm": 1.0745501518249512, + "learning_rate": 0.0005999642068721403, + "loss": 4.742, + "step": 301 + }, + { + "epoch": 0.01, + "grad_norm": 0.6069528460502625, + "learning_rate": 0.0005999639686537537, + "loss": 4.5265, + "step": 302 + }, + { + "epoch": 0.01, + "grad_norm": 0.6730210185050964, + "learning_rate": 0.0005999637296453195, + "loss": 4.7896, + "step": 303 + }, + { + "epoch": 0.01, + "grad_norm": 0.665211021900177, + "learning_rate": 0.0005999634898468389, + "loss": 4.4585, + "step": 304 + }, + { + "epoch": 0.01, + "grad_norm": 0.6540265679359436, + "learning_rate": 0.0005999632492583121, + "loss": 4.2711, + "step": 305 + }, + { + "epoch": 0.01, + "grad_norm": 0.6752997040748596, + "learning_rate": 0.0005999630078797397, + "loss": 4.6462, + "step": 306 + }, + { + "epoch": 0.02, + "grad_norm": 0.6478506326675415, + "learning_rate": 0.0005999627657111227, + "loss": 4.4006, + "step": 307 + }, + { + "epoch": 0.02, + "grad_norm": 0.6323385238647461, + "learning_rate": 0.0005999625227524613, + "loss": 4.4891, + "step": 308 + }, + { + "epoch": 0.02, + "grad_norm": 0.6497457027435303, + "learning_rate": 0.0005999622790037563, + "loss": 4.3089, + "step": 309 + }, + { + "epoch": 0.02, + "grad_norm": 0.6490249037742615, + "learning_rate": 0.0005999620344650085, + "loss": 4.6512, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 0.6243768930435181, + "learning_rate": 0.0005999617891362185, + "loss": 4.8247, + "step": 311 + }, + { + "epoch": 0.02, + "grad_norm": 0.6056337356567383, + "learning_rate": 0.0005999615430173868, + "loss": 4.6136, + "step": 312 + }, + { + "epoch": 0.02, + "grad_norm": 0.59369295835495, + "learning_rate": 0.0005999612961085141, + "loss": 4.5969, + "step": 313 + }, + { + "epoch": 0.02, + "grad_norm": 0.627199113368988, + "learning_rate": 0.0005999610484096011, + "loss": 4.4371, + "step": 314 + }, + { + "epoch": 0.02, + "grad_norm": 0.6465263962745667, + "learning_rate": 0.0005999607999206484, + "loss": 4.492, + "step": 315 + }, + { + "epoch": 0.02, + "grad_norm": 0.6473185420036316, + "learning_rate": 0.0005999605506416567, + "loss": 4.4302, + "step": 316 + }, + { + "epoch": 0.02, + "grad_norm": 0.6702262759208679, + "learning_rate": 0.0005999603005726267, + "loss": 4.6095, + "step": 317 + }, + { + "epoch": 0.02, + "grad_norm": 0.6607844829559326, + "learning_rate": 0.0005999600497135589, + "loss": 4.6373, + "step": 318 + }, + { + "epoch": 0.02, + "grad_norm": 0.6345941424369812, + "learning_rate": 0.0005999597980644541, + "loss": 4.602, + "step": 319 + }, + { + "epoch": 0.02, + "grad_norm": 0.8155503869056702, + "learning_rate": 0.0005999595456253129, + "loss": 4.7077, + "step": 320 + }, + { + "epoch": 0.02, + "grad_norm": 0.5928239822387695, + "learning_rate": 0.000599959292396136, + "loss": 4.5808, + "step": 321 + }, + { + "epoch": 0.02, + "grad_norm": 0.6200684309005737, + "learning_rate": 0.000599959038376924, + "loss": 4.4643, + "step": 322 + }, + { + "epoch": 0.02, + "grad_norm": 0.6029160618782043, + "learning_rate": 0.0005999587835676778, + "loss": 4.6415, + "step": 323 + }, + { + "epoch": 0.02, + "grad_norm": 0.6697453260421753, + "learning_rate": 0.0005999585279683977, + "loss": 4.706, + "step": 324 + }, + { + "epoch": 0.02, + "grad_norm": 0.6388944983482361, + "learning_rate": 0.0005999582715790847, + "loss": 4.4938, + "step": 325 + }, + { + "epoch": 0.02, + "grad_norm": 0.6525525450706482, + "learning_rate": 0.0005999580143997393, + "loss": 4.4293, + "step": 326 + }, + { + "epoch": 0.02, + "grad_norm": 0.6061077117919922, + "learning_rate": 0.0005999577564303621, + "loss": 4.4593, + "step": 327 + }, + { + "epoch": 0.02, + "grad_norm": 0.6646733283996582, + "learning_rate": 0.000599957497670954, + "loss": 4.5408, + "step": 328 + }, + { + "epoch": 0.02, + "grad_norm": 0.6451191306114197, + "learning_rate": 0.0005999572381215155, + "loss": 4.6175, + "step": 329 + }, + { + "epoch": 0.02, + "grad_norm": 0.6172031164169312, + "learning_rate": 0.0005999569777820475, + "loss": 4.3329, + "step": 330 + }, + { + "epoch": 0.02, + "grad_norm": 0.7403270602226257, + "learning_rate": 0.0005999567166525505, + "loss": 4.4766, + "step": 331 + }, + { + "epoch": 0.02, + "grad_norm": 0.7167713642120361, + "learning_rate": 0.0005999564547330251, + "loss": 4.6084, + "step": 332 + }, + { + "epoch": 0.02, + "grad_norm": 0.605120062828064, + "learning_rate": 0.0005999561920234722, + "loss": 4.6095, + "step": 333 + }, + { + "epoch": 0.02, + "grad_norm": 0.6738837957382202, + "learning_rate": 0.0005999559285238925, + "loss": 4.443, + "step": 334 + }, + { + "epoch": 0.02, + "grad_norm": 0.6576931476593018, + "learning_rate": 0.0005999556642342866, + "loss": 4.4609, + "step": 335 + }, + { + "epoch": 0.02, + "grad_norm": 0.6313718557357788, + "learning_rate": 0.0005999553991546552, + "loss": 4.4452, + "step": 336 + }, + { + "epoch": 0.02, + "grad_norm": 0.5991811156272888, + "learning_rate": 0.0005999551332849989, + "loss": 4.5898, + "step": 337 + }, + { + "epoch": 0.02, + "grad_norm": 0.6302996277809143, + "learning_rate": 0.0005999548666253186, + "loss": 4.3871, + "step": 338 + }, + { + "epoch": 0.02, + "grad_norm": 0.637596607208252, + "learning_rate": 0.0005999545991756149, + "loss": 4.563, + "step": 339 + }, + { + "epoch": 0.02, + "grad_norm": 0.6226162910461426, + "learning_rate": 0.0005999543309358886, + "loss": 4.5605, + "step": 340 + }, + { + "epoch": 0.02, + "grad_norm": 0.6292939186096191, + "learning_rate": 0.0005999540619061402, + "loss": 4.3628, + "step": 341 + }, + { + "epoch": 0.02, + "grad_norm": 0.6156478524208069, + "learning_rate": 0.0005999537920863705, + "loss": 4.3611, + "step": 342 + }, + { + "epoch": 0.02, + "grad_norm": 0.6901237368583679, + "learning_rate": 0.0005999535214765803, + "loss": 4.4794, + "step": 343 + }, + { + "epoch": 0.02, + "grad_norm": 0.5876708626747131, + "learning_rate": 0.0005999532500767703, + "loss": 4.5018, + "step": 344 + }, + { + "epoch": 0.02, + "grad_norm": 0.6149948239326477, + "learning_rate": 0.0005999529778869411, + "loss": 4.423, + "step": 345 + }, + { + "epoch": 0.02, + "grad_norm": 0.6238973140716553, + "learning_rate": 0.0005999527049070935, + "loss": 4.4507, + "step": 346 + }, + { + "epoch": 0.02, + "grad_norm": 0.6290735602378845, + "learning_rate": 0.0005999524311372282, + "loss": 4.6397, + "step": 347 + }, + { + "epoch": 0.02, + "grad_norm": 0.5849318504333496, + "learning_rate": 0.0005999521565773459, + "loss": 4.4881, + "step": 348 + }, + { + "epoch": 0.02, + "grad_norm": 0.6476309299468994, + "learning_rate": 0.0005999518812274474, + "loss": 4.4975, + "step": 349 + }, + { + "epoch": 0.02, + "grad_norm": 0.6004956364631653, + "learning_rate": 0.0005999516050875334, + "loss": 4.6363, + "step": 350 + }, + { + "epoch": 0.02, + "grad_norm": 0.640617311000824, + "learning_rate": 0.0005999513281576045, + "loss": 4.5486, + "step": 351 + }, + { + "epoch": 0.02, + "grad_norm": 0.6288664937019348, + "learning_rate": 0.0005999510504376616, + "loss": 4.523, + "step": 352 + }, + { + "epoch": 0.02, + "grad_norm": 0.6244643330574036, + "learning_rate": 0.0005999507719277054, + "loss": 4.4213, + "step": 353 + }, + { + "epoch": 0.02, + "grad_norm": 0.6235512495040894, + "learning_rate": 0.0005999504926277365, + "loss": 4.4436, + "step": 354 + }, + { + "epoch": 0.02, + "grad_norm": 0.6489076614379883, + "learning_rate": 0.0005999502125377557, + "loss": 4.3302, + "step": 355 + }, + { + "epoch": 0.02, + "grad_norm": 0.7294739484786987, + "learning_rate": 0.0005999499316577639, + "loss": 4.3603, + "step": 356 + }, + { + "epoch": 0.02, + "grad_norm": 0.6268585920333862, + "learning_rate": 0.0005999496499877615, + "loss": 4.5018, + "step": 357 + }, + { + "epoch": 0.02, + "grad_norm": 0.6179900765419006, + "learning_rate": 0.0005999493675277497, + "loss": 4.4187, + "step": 358 + }, + { + "epoch": 0.02, + "grad_norm": 0.6882840991020203, + "learning_rate": 0.0005999490842777288, + "loss": 4.3276, + "step": 359 + }, + { + "epoch": 0.02, + "grad_norm": 0.6455632448196411, + "learning_rate": 0.0005999488002376999, + "loss": 4.6731, + "step": 360 + }, + { + "epoch": 0.02, + "grad_norm": 0.6357992887496948, + "learning_rate": 0.0005999485154076634, + "loss": 4.3314, + "step": 361 + }, + { + "epoch": 0.02, + "grad_norm": 0.641894519329071, + "learning_rate": 0.0005999482297876204, + "loss": 4.5446, + "step": 362 + }, + { + "epoch": 0.02, + "grad_norm": 0.6699628829956055, + "learning_rate": 0.0005999479433775714, + "loss": 4.411, + "step": 363 + }, + { + "epoch": 0.02, + "grad_norm": 0.6083547472953796, + "learning_rate": 0.0005999476561775173, + "loss": 4.4048, + "step": 364 + }, + { + "epoch": 0.02, + "grad_norm": 0.6556709408760071, + "learning_rate": 0.0005999473681874587, + "loss": 4.6152, + "step": 365 + }, + { + "epoch": 0.02, + "grad_norm": 0.6575928330421448, + "learning_rate": 0.0005999470794073965, + "loss": 4.407, + "step": 366 + }, + { + "epoch": 0.02, + "grad_norm": 0.6290593147277832, + "learning_rate": 0.0005999467898373314, + "loss": 4.4726, + "step": 367 + }, + { + "epoch": 0.02, + "grad_norm": 0.6279957890510559, + "learning_rate": 0.0005999464994772643, + "loss": 4.5635, + "step": 368 + }, + { + "epoch": 0.02, + "grad_norm": 0.6792528033256531, + "learning_rate": 0.0005999462083271957, + "loss": 4.564, + "step": 369 + }, + { + "epoch": 0.02, + "grad_norm": 0.6177847981452942, + "learning_rate": 0.0005999459163871266, + "loss": 4.3569, + "step": 370 + }, + { + "epoch": 0.02, + "grad_norm": 0.6043108701705933, + "learning_rate": 0.0005999456236570577, + "loss": 4.5328, + "step": 371 + }, + { + "epoch": 0.02, + "grad_norm": 0.7578732371330261, + "learning_rate": 0.0005999453301369897, + "loss": 4.3748, + "step": 372 + }, + { + "epoch": 0.02, + "grad_norm": 0.6486274600028992, + "learning_rate": 0.0005999450358269234, + "loss": 4.4287, + "step": 373 + }, + { + "epoch": 0.02, + "grad_norm": 0.6370539665222168, + "learning_rate": 0.0005999447407268597, + "loss": 4.2936, + "step": 374 + }, + { + "epoch": 0.02, + "grad_norm": 0.5845693945884705, + "learning_rate": 0.0005999444448367993, + "loss": 4.2919, + "step": 375 + }, + { + "epoch": 0.02, + "grad_norm": 0.6829562783241272, + "learning_rate": 0.0005999441481567428, + "loss": 4.6738, + "step": 376 + }, + { + "epoch": 0.02, + "grad_norm": 0.6204245090484619, + "learning_rate": 0.0005999438506866911, + "loss": 4.5717, + "step": 377 + }, + { + "epoch": 0.02, + "grad_norm": 0.6141250133514404, + "learning_rate": 0.0005999435524266453, + "loss": 4.5029, + "step": 378 + }, + { + "epoch": 0.02, + "grad_norm": 0.6364489197731018, + "learning_rate": 0.0005999432533766056, + "loss": 4.2914, + "step": 379 + }, + { + "epoch": 0.02, + "grad_norm": 0.5723608732223511, + "learning_rate": 0.0005999429535365734, + "loss": 4.4488, + "step": 380 + }, + { + "epoch": 0.02, + "grad_norm": 0.6832333207130432, + "learning_rate": 0.0005999426529065489, + "loss": 4.3861, + "step": 381 + }, + { + "epoch": 0.02, + "grad_norm": 0.6163421869277954, + "learning_rate": 0.0005999423514865334, + "loss": 4.8537, + "step": 382 + }, + { + "epoch": 0.02, + "grad_norm": 0.6012544631958008, + "learning_rate": 0.0005999420492765273, + "loss": 4.525, + "step": 383 + }, + { + "epoch": 0.02, + "grad_norm": 0.6326162815093994, + "learning_rate": 0.0005999417462765318, + "loss": 4.4894, + "step": 384 + }, + { + "epoch": 0.02, + "grad_norm": 0.7766426801681519, + "learning_rate": 0.0005999414424865472, + "loss": 4.5683, + "step": 385 + }, + { + "epoch": 0.02, + "grad_norm": 0.5986698269844055, + "learning_rate": 0.0005999411379065746, + "loss": 4.5938, + "step": 386 + }, + { + "epoch": 0.02, + "grad_norm": 0.6302661895751953, + "learning_rate": 0.0005999408325366149, + "loss": 4.243, + "step": 387 + }, + { + "epoch": 0.02, + "grad_norm": 0.6291135549545288, + "learning_rate": 0.0005999405263766688, + "loss": 4.6281, + "step": 388 + }, + { + "epoch": 0.02, + "grad_norm": 0.6244451403617859, + "learning_rate": 0.000599940219426737, + "loss": 4.3295, + "step": 389 + }, + { + "epoch": 0.02, + "grad_norm": 0.6079283952713013, + "learning_rate": 0.0005999399116868203, + "loss": 4.3864, + "step": 390 + }, + { + "epoch": 0.02, + "grad_norm": 0.6124751567840576, + "learning_rate": 0.0005999396031569197, + "loss": 4.6859, + "step": 391 + }, + { + "epoch": 0.02, + "grad_norm": 0.6195314526557922, + "learning_rate": 0.0005999392938370358, + "loss": 4.6973, + "step": 392 + }, + { + "epoch": 0.02, + "grad_norm": 0.7498607039451599, + "learning_rate": 0.0005999389837271697, + "loss": 4.4272, + "step": 393 + }, + { + "epoch": 0.02, + "grad_norm": 0.7137424349784851, + "learning_rate": 0.0005999386728273219, + "loss": 4.5234, + "step": 394 + }, + { + "epoch": 0.02, + "grad_norm": 0.5930689573287964, + "learning_rate": 0.0005999383611374934, + "loss": 4.5479, + "step": 395 + }, + { + "epoch": 0.02, + "grad_norm": 0.6761220097541809, + "learning_rate": 0.000599938048657685, + "loss": 4.4618, + "step": 396 + }, + { + "epoch": 0.02, + "grad_norm": 0.6029164791107178, + "learning_rate": 0.0005999377353878975, + "loss": 4.362, + "step": 397 + }, + { + "epoch": 0.02, + "grad_norm": 0.6475263237953186, + "learning_rate": 0.0005999374213281318, + "loss": 4.5006, + "step": 398 + }, + { + "epoch": 0.02, + "grad_norm": 0.647782564163208, + "learning_rate": 0.0005999371064783885, + "loss": 4.2719, + "step": 399 + }, + { + "epoch": 0.02, + "grad_norm": 0.5890044569969177, + "learning_rate": 0.0005999367908386688, + "loss": 4.5949, + "step": 400 + }, + { + "epoch": 0.02, + "grad_norm": 0.627471923828125, + "learning_rate": 0.0005999364744089731, + "loss": 4.4189, + "step": 401 + }, + { + "epoch": 0.02, + "grad_norm": 0.5897189378738403, + "learning_rate": 0.0005999361571893026, + "loss": 4.4445, + "step": 402 + }, + { + "epoch": 0.02, + "grad_norm": 0.5901340246200562, + "learning_rate": 0.0005999358391796578, + "loss": 4.3554, + "step": 403 + }, + { + "epoch": 0.02, + "grad_norm": 0.6106919646263123, + "learning_rate": 0.00059993552038004, + "loss": 4.3254, + "step": 404 + }, + { + "epoch": 0.02, + "grad_norm": 0.7342495918273926, + "learning_rate": 0.0005999352007904495, + "loss": 4.3228, + "step": 405 + }, + { + "epoch": 0.02, + "grad_norm": 0.5682928562164307, + "learning_rate": 0.0005999348804108875, + "loss": 4.4674, + "step": 406 + }, + { + "epoch": 0.02, + "grad_norm": 0.6223228573799133, + "learning_rate": 0.0005999345592413548, + "loss": 4.4122, + "step": 407 + }, + { + "epoch": 0.02, + "grad_norm": 0.632500946521759, + "learning_rate": 0.0005999342372818523, + "loss": 4.5875, + "step": 408 + }, + { + "epoch": 0.02, + "grad_norm": 0.629377543926239, + "learning_rate": 0.0005999339145323805, + "loss": 4.271, + "step": 409 + }, + { + "epoch": 0.02, + "grad_norm": 0.6136090755462646, + "learning_rate": 0.0005999335909929405, + "loss": 4.4486, + "step": 410 + }, + { + "epoch": 0.02, + "grad_norm": 0.6139892339706421, + "learning_rate": 0.0005999332666635332, + "loss": 4.51, + "step": 411 + }, + { + "epoch": 0.02, + "grad_norm": 0.8123599290847778, + "learning_rate": 0.0005999329415441595, + "loss": 4.2626, + "step": 412 + }, + { + "epoch": 0.02, + "grad_norm": 0.6359814405441284, + "learning_rate": 0.0005999326156348201, + "loss": 4.5825, + "step": 413 + }, + { + "epoch": 0.02, + "grad_norm": 0.702951967716217, + "learning_rate": 0.000599932288935516, + "loss": 4.5257, + "step": 414 + }, + { + "epoch": 0.02, + "grad_norm": 0.6103016138076782, + "learning_rate": 0.0005999319614462479, + "loss": 4.4541, + "step": 415 + }, + { + "epoch": 0.02, + "grad_norm": 0.6839025616645813, + "learning_rate": 0.0005999316331670166, + "loss": 4.4537, + "step": 416 + }, + { + "epoch": 0.02, + "grad_norm": 0.6513589024543762, + "learning_rate": 0.0005999313040978233, + "loss": 4.2952, + "step": 417 + }, + { + "epoch": 0.02, + "grad_norm": 0.6502857804298401, + "learning_rate": 0.0005999309742386685, + "loss": 4.3789, + "step": 418 + }, + { + "epoch": 0.02, + "grad_norm": 0.6159502267837524, + "learning_rate": 0.0005999306435895533, + "loss": 4.484, + "step": 419 + }, + { + "epoch": 0.02, + "grad_norm": 0.5981889963150024, + "learning_rate": 0.0005999303121504786, + "loss": 4.3018, + "step": 420 + }, + { + "epoch": 0.02, + "grad_norm": 0.6399751901626587, + "learning_rate": 0.0005999299799214451, + "loss": 4.3074, + "step": 421 + }, + { + "epoch": 0.02, + "grad_norm": 0.6635521054267883, + "learning_rate": 0.0005999296469024537, + "loss": 4.399, + "step": 422 + }, + { + "epoch": 0.02, + "grad_norm": 0.5930505990982056, + "learning_rate": 0.0005999293130935054, + "loss": 4.4843, + "step": 423 + }, + { + "epoch": 0.02, + "grad_norm": 0.6015510559082031, + "learning_rate": 0.0005999289784946011, + "loss": 4.4935, + "step": 424 + }, + { + "epoch": 0.02, + "grad_norm": 0.690433919429779, + "learning_rate": 0.0005999286431057415, + "loss": 4.4129, + "step": 425 + }, + { + "epoch": 0.02, + "grad_norm": 0.7033190727233887, + "learning_rate": 0.0005999283069269276, + "loss": 4.4088, + "step": 426 + }, + { + "epoch": 0.02, + "grad_norm": 0.5974972248077393, + "learning_rate": 0.0005999279699581602, + "loss": 4.2984, + "step": 427 + }, + { + "epoch": 0.02, + "grad_norm": 0.6253489255905151, + "learning_rate": 0.0005999276321994402, + "loss": 4.2365, + "step": 428 + }, + { + "epoch": 0.02, + "grad_norm": 0.6363905668258667, + "learning_rate": 0.0005999272936507687, + "loss": 4.4612, + "step": 429 + }, + { + "epoch": 0.02, + "grad_norm": 0.6299953460693359, + "learning_rate": 0.0005999269543121463, + "loss": 4.2972, + "step": 430 + }, + { + "epoch": 0.02, + "grad_norm": 0.7025599479675293, + "learning_rate": 0.0005999266141835741, + "loss": 4.168, + "step": 431 + }, + { + "epoch": 0.02, + "grad_norm": 0.7058206796646118, + "learning_rate": 0.000599926273265053, + "loss": 4.2049, + "step": 432 + }, + { + "epoch": 0.02, + "grad_norm": 0.639482319355011, + "learning_rate": 0.0005999259315565837, + "loss": 4.2935, + "step": 433 + }, + { + "epoch": 0.02, + "grad_norm": 0.5956512093544006, + "learning_rate": 0.0005999255890581672, + "loss": 4.2922, + "step": 434 + }, + { + "epoch": 0.02, + "grad_norm": 0.6834009289741516, + "learning_rate": 0.0005999252457698045, + "loss": 4.1426, + "step": 435 + }, + { + "epoch": 0.02, + "grad_norm": 0.6566224694252014, + "learning_rate": 0.0005999249016914964, + "loss": 4.4575, + "step": 436 + }, + { + "epoch": 0.02, + "grad_norm": 0.6373875737190247, + "learning_rate": 0.0005999245568232438, + "loss": 4.634, + "step": 437 + }, + { + "epoch": 0.02, + "grad_norm": 0.637948751449585, + "learning_rate": 0.0005999242111650476, + "loss": 4.4227, + "step": 438 + }, + { + "epoch": 0.02, + "grad_norm": 0.713834285736084, + "learning_rate": 0.0005999238647169089, + "loss": 4.2999, + "step": 439 + }, + { + "epoch": 0.02, + "grad_norm": 0.606334924697876, + "learning_rate": 0.0005999235174788283, + "loss": 4.3889, + "step": 440 + }, + { + "epoch": 0.02, + "grad_norm": 0.6799372434616089, + "learning_rate": 0.0005999231694508069, + "loss": 4.3517, + "step": 441 + }, + { + "epoch": 0.02, + "grad_norm": 0.6416102051734924, + "learning_rate": 0.0005999228206328457, + "loss": 4.5535, + "step": 442 + }, + { + "epoch": 0.02, + "grad_norm": 0.6423821449279785, + "learning_rate": 0.0005999224710249454, + "loss": 4.0969, + "step": 443 + }, + { + "epoch": 0.02, + "grad_norm": 0.6177202463150024, + "learning_rate": 0.0005999221206271071, + "loss": 4.3001, + "step": 444 + }, + { + "epoch": 0.02, + "grad_norm": 0.5873739719390869, + "learning_rate": 0.0005999217694393317, + "loss": 4.3533, + "step": 445 + }, + { + "epoch": 0.02, + "grad_norm": 0.6107127666473389, + "learning_rate": 0.00059992141746162, + "loss": 4.2398, + "step": 446 + }, + { + "epoch": 0.02, + "grad_norm": 0.6472650170326233, + "learning_rate": 0.0005999210646939731, + "loss": 4.3991, + "step": 447 + }, + { + "epoch": 0.02, + "grad_norm": 0.611621081829071, + "learning_rate": 0.0005999207111363916, + "loss": 4.4359, + "step": 448 + }, + { + "epoch": 0.02, + "grad_norm": 0.6676717400550842, + "learning_rate": 0.000599920356788877, + "loss": 4.3993, + "step": 449 + }, + { + "epoch": 0.02, + "grad_norm": 0.6014063954353333, + "learning_rate": 0.0005999200016514296, + "loss": 4.5267, + "step": 450 + }, + { + "epoch": 0.02, + "grad_norm": 0.6274306774139404, + "learning_rate": 0.0005999196457240508, + "loss": 4.2282, + "step": 451 + }, + { + "epoch": 0.02, + "grad_norm": 0.6827574372291565, + "learning_rate": 0.0005999192890067413, + "loss": 4.3729, + "step": 452 + }, + { + "epoch": 0.02, + "grad_norm": 0.7055410742759705, + "learning_rate": 0.0005999189314995022, + "loss": 4.2916, + "step": 453 + }, + { + "epoch": 0.02, + "grad_norm": 0.654010534286499, + "learning_rate": 0.0005999185732023343, + "loss": 4.2391, + "step": 454 + }, + { + "epoch": 0.02, + "grad_norm": 0.5943931937217712, + "learning_rate": 0.0005999182141152385, + "loss": 4.2879, + "step": 455 + }, + { + "epoch": 0.02, + "grad_norm": 0.6318141222000122, + "learning_rate": 0.000599917854238216, + "loss": 4.2995, + "step": 456 + }, + { + "epoch": 0.02, + "grad_norm": 0.6332406401634216, + "learning_rate": 0.0005999174935712676, + "loss": 4.2975, + "step": 457 + }, + { + "epoch": 0.02, + "grad_norm": 0.654248058795929, + "learning_rate": 0.0005999171321143941, + "loss": 4.3066, + "step": 458 + }, + { + "epoch": 0.02, + "grad_norm": 0.6300703287124634, + "learning_rate": 0.0005999167698675967, + "loss": 4.4163, + "step": 459 + }, + { + "epoch": 0.02, + "grad_norm": 0.6364467740058899, + "learning_rate": 0.0005999164068308762, + "loss": 4.4665, + "step": 460 + }, + { + "epoch": 0.02, + "grad_norm": 0.650804340839386, + "learning_rate": 0.0005999160430042337, + "loss": 4.2126, + "step": 461 + }, + { + "epoch": 0.02, + "grad_norm": 0.6396269202232361, + "learning_rate": 0.00059991567838767, + "loss": 4.4289, + "step": 462 + }, + { + "epoch": 0.02, + "grad_norm": 0.7045847177505493, + "learning_rate": 0.0005999153129811861, + "loss": 4.1509, + "step": 463 + }, + { + "epoch": 0.02, + "grad_norm": 0.895271360874176, + "learning_rate": 0.0005999149467847831, + "loss": 4.3426, + "step": 464 + }, + { + "epoch": 0.02, + "grad_norm": 0.6522945165634155, + "learning_rate": 0.0005999145797984617, + "loss": 4.4832, + "step": 465 + }, + { + "epoch": 0.02, + "grad_norm": 0.6630837321281433, + "learning_rate": 0.000599914212022223, + "loss": 4.4587, + "step": 466 + }, + { + "epoch": 0.02, + "grad_norm": 0.6792482733726501, + "learning_rate": 0.0005999138434560681, + "loss": 4.2637, + "step": 467 + }, + { + "epoch": 0.02, + "grad_norm": 0.7289097309112549, + "learning_rate": 0.0005999134740999979, + "loss": 4.4558, + "step": 468 + }, + { + "epoch": 0.02, + "grad_norm": 0.6192370653152466, + "learning_rate": 0.0005999131039540131, + "loss": 4.3225, + "step": 469 + }, + { + "epoch": 0.02, + "grad_norm": 0.6023765206336975, + "learning_rate": 0.0005999127330181151, + "loss": 4.4778, + "step": 470 + }, + { + "epoch": 0.02, + "grad_norm": 0.6397039294242859, + "learning_rate": 0.0005999123612923046, + "loss": 4.4022, + "step": 471 + }, + { + "epoch": 0.02, + "grad_norm": 0.5905510187149048, + "learning_rate": 0.0005999119887765827, + "loss": 4.468, + "step": 472 + }, + { + "epoch": 0.02, + "grad_norm": 0.7561242580413818, + "learning_rate": 0.0005999116154709504, + "loss": 4.2759, + "step": 473 + }, + { + "epoch": 0.02, + "grad_norm": 0.6308673024177551, + "learning_rate": 0.0005999112413754084, + "loss": 4.2214, + "step": 474 + }, + { + "epoch": 0.02, + "grad_norm": 0.6405158638954163, + "learning_rate": 0.0005999108664899582, + "loss": 4.3654, + "step": 475 + }, + { + "epoch": 0.02, + "grad_norm": 0.6524988412857056, + "learning_rate": 0.0005999104908146003, + "loss": 4.5213, + "step": 476 + }, + { + "epoch": 0.02, + "grad_norm": 0.6339336037635803, + "learning_rate": 0.0005999101143493359, + "loss": 4.5757, + "step": 477 + }, + { + "epoch": 0.02, + "grad_norm": 0.6563646793365479, + "learning_rate": 0.000599909737094166, + "loss": 4.4423, + "step": 478 + }, + { + "epoch": 0.02, + "grad_norm": 0.6299603581428528, + "learning_rate": 0.0005999093590490917, + "loss": 4.1991, + "step": 479 + }, + { + "epoch": 0.02, + "grad_norm": 0.8304985761642456, + "learning_rate": 0.0005999089802141137, + "loss": 4.5737, + "step": 480 + }, + { + "epoch": 0.02, + "grad_norm": 0.6237910389900208, + "learning_rate": 0.0005999086005892332, + "loss": 4.298, + "step": 481 + }, + { + "epoch": 0.02, + "grad_norm": 0.6513141989707947, + "learning_rate": 0.0005999082201744512, + "loss": 3.9653, + "step": 482 + }, + { + "epoch": 0.02, + "grad_norm": 0.6443023085594177, + "learning_rate": 0.0005999078389697687, + "loss": 4.3509, + "step": 483 + }, + { + "epoch": 0.02, + "grad_norm": 0.6092896461486816, + "learning_rate": 0.0005999074569751865, + "loss": 4.4119, + "step": 484 + }, + { + "epoch": 0.02, + "grad_norm": 0.6280472874641418, + "learning_rate": 0.000599907074190706, + "loss": 4.1763, + "step": 485 + }, + { + "epoch": 0.02, + "grad_norm": 0.6605474352836609, + "learning_rate": 0.0005999066906163279, + "loss": 4.1999, + "step": 486 + }, + { + "epoch": 0.02, + "grad_norm": 0.633695125579834, + "learning_rate": 0.0005999063062520532, + "loss": 4.5811, + "step": 487 + }, + { + "epoch": 0.02, + "grad_norm": 0.6050964593887329, + "learning_rate": 0.0005999059210978832, + "loss": 4.2844, + "step": 488 + }, + { + "epoch": 0.02, + "grad_norm": 0.6477183699607849, + "learning_rate": 0.0005999055351538186, + "loss": 4.315, + "step": 489 + }, + { + "epoch": 0.02, + "grad_norm": 0.6902852654457092, + "learning_rate": 0.0005999051484198606, + "loss": 4.4602, + "step": 490 + }, + { + "epoch": 0.02, + "grad_norm": 0.9078108668327332, + "learning_rate": 0.00059990476089601, + "loss": 4.6133, + "step": 491 + }, + { + "epoch": 0.02, + "grad_norm": 0.6101062893867493, + "learning_rate": 0.000599904372582268, + "loss": 4.5548, + "step": 492 + }, + { + "epoch": 0.02, + "grad_norm": 0.6342147588729858, + "learning_rate": 0.0005999039834786357, + "loss": 4.232, + "step": 493 + }, + { + "epoch": 0.02, + "grad_norm": 0.6037482619285583, + "learning_rate": 0.0005999035935851142, + "loss": 4.4302, + "step": 494 + }, + { + "epoch": 0.02, + "grad_norm": 0.6121346950531006, + "learning_rate": 0.0005999032029017041, + "loss": 4.2391, + "step": 495 + }, + { + "epoch": 0.02, + "grad_norm": 0.6090756058692932, + "learning_rate": 0.0005999028114284067, + "loss": 4.4575, + "step": 496 + }, + { + "epoch": 0.02, + "grad_norm": 0.569670557975769, + "learning_rate": 0.0005999024191652231, + "loss": 4.2884, + "step": 497 + }, + { + "epoch": 0.02, + "grad_norm": 0.5985832810401917, + "learning_rate": 0.0005999020261121541, + "loss": 4.3026, + "step": 498 + }, + { + "epoch": 0.02, + "grad_norm": 0.6532287001609802, + "learning_rate": 0.000599901632269201, + "loss": 4.3152, + "step": 499 + }, + { + "epoch": 0.02, + "grad_norm": 0.5899796485900879, + "learning_rate": 0.0005999012376363647, + "loss": 4.1943, + "step": 500 + }, + { + "epoch": 0.02, + "grad_norm": 0.6288335919380188, + "learning_rate": 0.0005999008422136463, + "loss": 4.4126, + "step": 501 + }, + { + "epoch": 0.02, + "grad_norm": 0.6783986687660217, + "learning_rate": 0.0005999004460010467, + "loss": 4.1275, + "step": 502 + }, + { + "epoch": 0.02, + "grad_norm": 0.612891435623169, + "learning_rate": 0.0005999000489985671, + "loss": 4.3033, + "step": 503 + }, + { + "epoch": 0.02, + "grad_norm": 0.6028085947036743, + "learning_rate": 0.0005998996512062085, + "loss": 4.2218, + "step": 504 + }, + { + "epoch": 0.02, + "grad_norm": 0.5980657935142517, + "learning_rate": 0.0005998992526239719, + "loss": 4.2868, + "step": 505 + }, + { + "epoch": 0.02, + "grad_norm": 0.6469926238059998, + "learning_rate": 0.0005998988532518584, + "loss": 4.4216, + "step": 506 + }, + { + "epoch": 0.02, + "grad_norm": 0.6261191368103027, + "learning_rate": 0.0005998984530898691, + "loss": 4.1823, + "step": 507 + }, + { + "epoch": 0.02, + "grad_norm": 0.5999051928520203, + "learning_rate": 0.0005998980521380048, + "loss": 4.0664, + "step": 508 + }, + { + "epoch": 0.02, + "grad_norm": 0.6515395641326904, + "learning_rate": 0.000599897650396267, + "loss": 4.5476, + "step": 509 + }, + { + "epoch": 0.02, + "grad_norm": 0.5981988906860352, + "learning_rate": 0.0005998972478646564, + "loss": 4.2978, + "step": 510 + }, + { + "epoch": 0.03, + "grad_norm": 0.6036791205406189, + "learning_rate": 0.0005998968445431742, + "loss": 4.3236, + "step": 511 + }, + { + "epoch": 0.03, + "grad_norm": 0.6372940540313721, + "learning_rate": 0.0005998964404318213, + "loss": 4.2261, + "step": 512 + }, + { + "epoch": 0.03, + "grad_norm": 0.612691342830658, + "learning_rate": 0.0005998960355305989, + "loss": 4.2686, + "step": 513 + }, + { + "epoch": 0.03, + "grad_norm": 0.6639612913131714, + "learning_rate": 0.0005998956298395082, + "loss": 4.4662, + "step": 514 + }, + { + "epoch": 0.03, + "grad_norm": 0.6196693181991577, + "learning_rate": 0.00059989522335855, + "loss": 4.244, + "step": 515 + }, + { + "epoch": 0.03, + "grad_norm": 0.6042441725730896, + "learning_rate": 0.0005998948160877256, + "loss": 4.4187, + "step": 516 + }, + { + "epoch": 0.03, + "grad_norm": 0.584118127822876, + "learning_rate": 0.0005998944080270359, + "loss": 4.2727, + "step": 517 + }, + { + "epoch": 0.03, + "grad_norm": 0.6688718795776367, + "learning_rate": 0.0005998939991764821, + "loss": 4.4981, + "step": 518 + }, + { + "epoch": 0.03, + "grad_norm": 0.6350945234298706, + "learning_rate": 0.0005998935895360651, + "loss": 4.0629, + "step": 519 + }, + { + "epoch": 0.03, + "grad_norm": 0.6718504428863525, + "learning_rate": 0.0005998931791057863, + "loss": 4.3385, + "step": 520 + }, + { + "epoch": 0.03, + "grad_norm": 0.666595995426178, + "learning_rate": 0.0005998927678856464, + "loss": 4.1755, + "step": 521 + }, + { + "epoch": 0.03, + "grad_norm": 0.6215299963951111, + "learning_rate": 0.0005998923558756467, + "loss": 4.4248, + "step": 522 + }, + { + "epoch": 0.03, + "grad_norm": 0.6000812649726868, + "learning_rate": 0.0005998919430757883, + "loss": 4.3693, + "step": 523 + }, + { + "epoch": 0.03, + "grad_norm": 0.643562912940979, + "learning_rate": 0.0005998915294860722, + "loss": 3.9877, + "step": 524 + }, + { + "epoch": 0.03, + "grad_norm": 0.6312112808227539, + "learning_rate": 0.0005998911151064996, + "loss": 4.3255, + "step": 525 + }, + { + "epoch": 0.03, + "grad_norm": 0.6362340450286865, + "learning_rate": 0.0005998906999370716, + "loss": 4.2147, + "step": 526 + }, + { + "epoch": 0.03, + "grad_norm": 0.6441465616226196, + "learning_rate": 0.0005998902839777889, + "loss": 4.2851, + "step": 527 + }, + { + "epoch": 0.03, + "grad_norm": 0.6550605893135071, + "learning_rate": 0.0005998898672286532, + "loss": 4.167, + "step": 528 + }, + { + "epoch": 0.03, + "grad_norm": 0.6101124286651611, + "learning_rate": 0.0005998894496896651, + "loss": 4.5298, + "step": 529 + }, + { + "epoch": 0.03, + "grad_norm": 0.6387701630592346, + "learning_rate": 0.0005998890313608261, + "loss": 4.1748, + "step": 530 + }, + { + "epoch": 0.03, + "grad_norm": 0.6303190588951111, + "learning_rate": 0.0005998886122421369, + "loss": 4.2512, + "step": 531 + }, + { + "epoch": 0.03, + "grad_norm": 0.7182632088661194, + "learning_rate": 0.0005998881923335989, + "loss": 4.3773, + "step": 532 + }, + { + "epoch": 0.03, + "grad_norm": 0.5540059208869934, + "learning_rate": 0.0005998877716352132, + "loss": 4.3919, + "step": 533 + }, + { + "epoch": 0.03, + "grad_norm": 0.6034284234046936, + "learning_rate": 0.0005998873501469808, + "loss": 4.1638, + "step": 534 + }, + { + "epoch": 0.03, + "grad_norm": 0.6099483966827393, + "learning_rate": 0.0005998869278689028, + "loss": 4.3032, + "step": 535 + }, + { + "epoch": 0.03, + "grad_norm": 0.6097630262374878, + "learning_rate": 0.0005998865048009803, + "loss": 4.1468, + "step": 536 + }, + { + "epoch": 0.03, + "grad_norm": 0.5770888328552246, + "learning_rate": 0.0005998860809432145, + "loss": 4.2715, + "step": 537 + }, + { + "epoch": 0.03, + "grad_norm": 0.6351972818374634, + "learning_rate": 0.0005998856562956064, + "loss": 4.2979, + "step": 538 + }, + { + "epoch": 0.03, + "grad_norm": 0.6256553530693054, + "learning_rate": 0.0005998852308581573, + "loss": 4.1972, + "step": 539 + }, + { + "epoch": 0.03, + "grad_norm": 0.625809907913208, + "learning_rate": 0.0005998848046308682, + "loss": 3.9932, + "step": 540 + }, + { + "epoch": 0.03, + "grad_norm": 0.6050224900245667, + "learning_rate": 0.0005998843776137402, + "loss": 4.2899, + "step": 541 + }, + { + "epoch": 0.03, + "grad_norm": 0.617949366569519, + "learning_rate": 0.0005998839498067745, + "loss": 4.069, + "step": 542 + }, + { + "epoch": 0.03, + "grad_norm": 0.7680637836456299, + "learning_rate": 0.0005998835212099722, + "loss": 4.264, + "step": 543 + }, + { + "epoch": 0.03, + "grad_norm": 0.6679553389549255, + "learning_rate": 0.0005998830918233344, + "loss": 4.2505, + "step": 544 + }, + { + "epoch": 0.03, + "grad_norm": 1.0453165769577026, + "learning_rate": 0.0005998826616468622, + "loss": 4.356, + "step": 545 + }, + { + "epoch": 0.03, + "grad_norm": 0.6193425059318542, + "learning_rate": 0.0005998822306805568, + "loss": 4.4011, + "step": 546 + }, + { + "epoch": 0.03, + "grad_norm": 0.6391511559486389, + "learning_rate": 0.0005998817989244194, + "loss": 4.08, + "step": 547 + }, + { + "epoch": 0.03, + "grad_norm": 0.6139493584632874, + "learning_rate": 0.000599881366378451, + "loss": 4.206, + "step": 548 + }, + { + "epoch": 0.03, + "grad_norm": 0.6752720475196838, + "learning_rate": 0.0005998809330426528, + "loss": 4.1488, + "step": 549 + }, + { + "epoch": 0.03, + "grad_norm": 0.6094870567321777, + "learning_rate": 0.0005998804989170259, + "loss": 4.1779, + "step": 550 + }, + { + "epoch": 0.03, + "grad_norm": 0.5940812230110168, + "learning_rate": 0.0005998800640015715, + "loss": 4.3568, + "step": 551 + }, + { + "epoch": 0.03, + "grad_norm": 0.6211913228034973, + "learning_rate": 0.0005998796282962907, + "loss": 4.4739, + "step": 552 + }, + { + "epoch": 0.03, + "grad_norm": 0.5913729071617126, + "learning_rate": 0.0005998791918011847, + "loss": 4.2086, + "step": 553 + }, + { + "epoch": 0.03, + "grad_norm": 0.6634578704833984, + "learning_rate": 0.0005998787545162547, + "loss": 4.2911, + "step": 554 + }, + { + "epoch": 0.03, + "grad_norm": 0.636097252368927, + "learning_rate": 0.0005998783164415017, + "loss": 4.1991, + "step": 555 + }, + { + "epoch": 0.03, + "grad_norm": 0.6963991522789001, + "learning_rate": 0.0005998778775769269, + "loss": 4.0183, + "step": 556 + }, + { + "epoch": 0.03, + "grad_norm": 0.6141919493675232, + "learning_rate": 0.0005998774379225315, + "loss": 4.5183, + "step": 557 + }, + { + "epoch": 0.03, + "grad_norm": 0.5955811142921448, + "learning_rate": 0.0005998769974783167, + "loss": 4.3453, + "step": 558 + }, + { + "epoch": 0.03, + "grad_norm": 0.6223891377449036, + "learning_rate": 0.0005998765562442835, + "loss": 4.3894, + "step": 559 + }, + { + "epoch": 0.03, + "grad_norm": 0.6813943386077881, + "learning_rate": 0.0005998761142204333, + "loss": 4.2083, + "step": 560 + }, + { + "epoch": 0.03, + "grad_norm": 0.6669309735298157, + "learning_rate": 0.000599875671406767, + "loss": 4.566, + "step": 561 + }, + { + "epoch": 0.03, + "grad_norm": 0.6055184602737427, + "learning_rate": 0.0005998752278032859, + "loss": 4.0433, + "step": 562 + }, + { + "epoch": 0.03, + "grad_norm": 0.6754607558250427, + "learning_rate": 0.0005998747834099912, + "loss": 4.0311, + "step": 563 + }, + { + "epoch": 0.03, + "grad_norm": 0.6373229622840881, + "learning_rate": 0.000599874338226884, + "loss": 4.33, + "step": 564 + }, + { + "epoch": 0.03, + "grad_norm": 0.5642352104187012, + "learning_rate": 0.0005998738922539656, + "loss": 4.2819, + "step": 565 + }, + { + "epoch": 0.03, + "grad_norm": 0.5712973475456238, + "learning_rate": 0.000599873445491237, + "loss": 4.3986, + "step": 566 + }, + { + "epoch": 0.03, + "grad_norm": 0.6137987375259399, + "learning_rate": 0.0005998729979386994, + "loss": 4.4378, + "step": 567 + }, + { + "epoch": 0.03, + "grad_norm": 0.6173729300498962, + "learning_rate": 0.0005998725495963542, + "loss": 4.3934, + "step": 568 + }, + { + "epoch": 0.03, + "grad_norm": 0.6119434833526611, + "learning_rate": 0.0005998721004642024, + "loss": 4.3429, + "step": 569 + }, + { + "epoch": 0.03, + "grad_norm": 0.6111935377120972, + "learning_rate": 0.000599871650542245, + "loss": 4.2851, + "step": 570 + }, + { + "epoch": 0.03, + "grad_norm": 0.7171507477760315, + "learning_rate": 0.0005998711998304835, + "loss": 4.4319, + "step": 571 + }, + { + "epoch": 0.03, + "grad_norm": 0.5924344062805176, + "learning_rate": 0.000599870748328919, + "loss": 4.4248, + "step": 572 + }, + { + "epoch": 0.03, + "grad_norm": 0.6047422289848328, + "learning_rate": 0.0005998702960375526, + "loss": 4.3181, + "step": 573 + }, + { + "epoch": 0.03, + "grad_norm": 0.61612468957901, + "learning_rate": 0.0005998698429563856, + "loss": 4.2176, + "step": 574 + }, + { + "epoch": 0.03, + "grad_norm": 0.6250529289245605, + "learning_rate": 0.0005998693890854192, + "loss": 4.3786, + "step": 575 + }, + { + "epoch": 0.03, + "grad_norm": 0.6662085652351379, + "learning_rate": 0.0005998689344246544, + "loss": 4.4409, + "step": 576 + }, + { + "epoch": 0.03, + "grad_norm": 0.6052320599555969, + "learning_rate": 0.0005998684789740926, + "loss": 4.1686, + "step": 577 + }, + { + "epoch": 0.03, + "grad_norm": 0.6721617579460144, + "learning_rate": 0.0005998680227337351, + "loss": 4.1626, + "step": 578 + }, + { + "epoch": 0.03, + "grad_norm": 0.6326162815093994, + "learning_rate": 0.0005998675657035827, + "loss": 4.4329, + "step": 579 + }, + { + "epoch": 0.03, + "grad_norm": 0.6244572401046753, + "learning_rate": 0.0005998671078836369, + "loss": 4.3406, + "step": 580 + }, + { + "epoch": 0.03, + "grad_norm": 0.6222200989723206, + "learning_rate": 0.0005998666492738989, + "loss": 4.3156, + "step": 581 + }, + { + "epoch": 0.03, + "grad_norm": 0.6773638725280762, + "learning_rate": 0.0005998661898743698, + "loss": 4.2542, + "step": 582 + }, + { + "epoch": 0.03, + "grad_norm": 0.7391511797904968, + "learning_rate": 0.0005998657296850509, + "loss": 4.293, + "step": 583 + }, + { + "epoch": 0.03, + "grad_norm": 0.6230476498603821, + "learning_rate": 0.0005998652687059434, + "loss": 4.1702, + "step": 584 + }, + { + "epoch": 0.03, + "grad_norm": 0.6014927625656128, + "learning_rate": 0.0005998648069370485, + "loss": 4.0259, + "step": 585 + }, + { + "epoch": 0.03, + "grad_norm": 0.6436325311660767, + "learning_rate": 0.0005998643443783674, + "loss": 4.1263, + "step": 586 + }, + { + "epoch": 0.03, + "grad_norm": 0.6496565937995911, + "learning_rate": 0.0005998638810299013, + "loss": 4.0934, + "step": 587 + }, + { + "epoch": 0.03, + "grad_norm": 0.5946733951568604, + "learning_rate": 0.0005998634168916515, + "loss": 4.3685, + "step": 588 + }, + { + "epoch": 0.03, + "grad_norm": 0.5709477066993713, + "learning_rate": 0.0005998629519636191, + "loss": 4.0456, + "step": 589 + }, + { + "epoch": 0.03, + "grad_norm": 0.6664632558822632, + "learning_rate": 0.0005998624862458054, + "loss": 4.2432, + "step": 590 + }, + { + "epoch": 0.03, + "grad_norm": 0.6627117395401001, + "learning_rate": 0.0005998620197382117, + "loss": 4.492, + "step": 591 + }, + { + "epoch": 0.03, + "grad_norm": 0.6205899715423584, + "learning_rate": 0.0005998615524408391, + "loss": 4.3008, + "step": 592 + }, + { + "epoch": 0.03, + "grad_norm": 0.6240425109863281, + "learning_rate": 0.0005998610843536888, + "loss": 4.4099, + "step": 593 + }, + { + "epoch": 0.03, + "grad_norm": 0.6774874329566956, + "learning_rate": 0.0005998606154767621, + "loss": 4.2297, + "step": 594 + }, + { + "epoch": 0.03, + "grad_norm": 0.6011747121810913, + "learning_rate": 0.0005998601458100603, + "loss": 4.2218, + "step": 595 + }, + { + "epoch": 0.03, + "grad_norm": 0.6347803473472595, + "learning_rate": 0.0005998596753535847, + "loss": 4.2884, + "step": 596 + }, + { + "epoch": 0.03, + "grad_norm": 0.6915126442909241, + "learning_rate": 0.0005998592041073364, + "loss": 4.3879, + "step": 597 + }, + { + "epoch": 0.03, + "grad_norm": 0.6727957129478455, + "learning_rate": 0.0005998587320713165, + "loss": 4.0906, + "step": 598 + }, + { + "epoch": 0.03, + "grad_norm": 0.6125076413154602, + "learning_rate": 0.0005998582592455266, + "loss": 4.2448, + "step": 599 + }, + { + "epoch": 0.03, + "grad_norm": 0.6534925699234009, + "learning_rate": 0.0005998577856299677, + "loss": 4.1813, + "step": 600 + }, + { + "epoch": 0.03, + "grad_norm": 0.6901077628135681, + "learning_rate": 0.0005998573112246411, + "loss": 4.0296, + "step": 601 + }, + { + "epoch": 0.03, + "grad_norm": 0.6255195140838623, + "learning_rate": 0.000599856836029548, + "loss": 4.22, + "step": 602 + }, + { + "epoch": 0.03, + "grad_norm": 0.6073751449584961, + "learning_rate": 0.0005998563600446898, + "loss": 4.3775, + "step": 603 + }, + { + "epoch": 0.03, + "grad_norm": 0.6304476857185364, + "learning_rate": 0.0005998558832700675, + "loss": 4.5508, + "step": 604 + }, + { + "epoch": 0.03, + "grad_norm": 0.6237664222717285, + "learning_rate": 0.0005998554057056825, + "loss": 4.3309, + "step": 605 + }, + { + "epoch": 0.03, + "grad_norm": 0.6360412836074829, + "learning_rate": 0.0005998549273515362, + "loss": 4.2754, + "step": 606 + }, + { + "epoch": 0.03, + "grad_norm": 0.6329114437103271, + "learning_rate": 0.0005998544482076297, + "loss": 4.4678, + "step": 607 + }, + { + "epoch": 0.03, + "grad_norm": 0.6190249919891357, + "learning_rate": 0.0005998539682739643, + "loss": 4.0623, + "step": 608 + }, + { + "epoch": 0.03, + "grad_norm": 0.7699590921401978, + "learning_rate": 0.0005998534875505413, + "loss": 4.368, + "step": 609 + }, + { + "epoch": 0.03, + "grad_norm": 0.6379511952400208, + "learning_rate": 0.0005998530060373618, + "loss": 4.0845, + "step": 610 + }, + { + "epoch": 0.03, + "grad_norm": 0.6318610310554504, + "learning_rate": 0.0005998525237344272, + "loss": 4.1786, + "step": 611 + }, + { + "epoch": 0.03, + "grad_norm": 0.654819905757904, + "learning_rate": 0.0005998520406417388, + "loss": 4.219, + "step": 612 + }, + { + "epoch": 0.03, + "grad_norm": 0.6683518886566162, + "learning_rate": 0.0005998515567592979, + "loss": 4.333, + "step": 613 + }, + { + "epoch": 0.03, + "grad_norm": 0.6329825520515442, + "learning_rate": 0.0005998510720871057, + "loss": 4.2552, + "step": 614 + }, + { + "epoch": 0.03, + "grad_norm": 0.6308939456939697, + "learning_rate": 0.0005998505866251635, + "loss": 4.142, + "step": 615 + }, + { + "epoch": 0.03, + "grad_norm": 0.557669460773468, + "learning_rate": 0.0005998501003734724, + "loss": 4.2415, + "step": 616 + }, + { + "epoch": 0.03, + "grad_norm": 0.6747656464576721, + "learning_rate": 0.0005998496133320339, + "loss": 4.3223, + "step": 617 + }, + { + "epoch": 0.03, + "grad_norm": 0.6284106373786926, + "learning_rate": 0.0005998491255008494, + "loss": 4.3184, + "step": 618 + }, + { + "epoch": 0.03, + "grad_norm": 0.6386296153068542, + "learning_rate": 0.0005998486368799197, + "loss": 4.0359, + "step": 619 + }, + { + "epoch": 0.03, + "grad_norm": 0.6140933632850647, + "learning_rate": 0.0005998481474692466, + "loss": 4.0157, + "step": 620 + }, + { + "epoch": 0.03, + "grad_norm": 0.6379926204681396, + "learning_rate": 0.0005998476572688312, + "loss": 4.1319, + "step": 621 + }, + { + "epoch": 0.03, + "grad_norm": 0.5984822511672974, + "learning_rate": 0.0005998471662786747, + "loss": 4.157, + "step": 622 + }, + { + "epoch": 0.03, + "grad_norm": 0.5686042904853821, + "learning_rate": 0.0005998466744987786, + "loss": 4.2332, + "step": 623 + }, + { + "epoch": 0.03, + "grad_norm": 0.6098901629447937, + "learning_rate": 0.0005998461819291439, + "loss": 4.0068, + "step": 624 + }, + { + "epoch": 0.03, + "grad_norm": 0.7193558216094971, + "learning_rate": 0.0005998456885697722, + "loss": 4.1709, + "step": 625 + }, + { + "epoch": 0.03, + "grad_norm": 0.6442568898200989, + "learning_rate": 0.0005998451944206646, + "loss": 4.2862, + "step": 626 + }, + { + "epoch": 0.03, + "grad_norm": 0.610493004322052, + "learning_rate": 0.0005998446994818225, + "loss": 4.1749, + "step": 627 + }, + { + "epoch": 0.03, + "grad_norm": 0.6829988360404968, + "learning_rate": 0.0005998442037532471, + "loss": 4.2972, + "step": 628 + }, + { + "epoch": 0.03, + "grad_norm": 0.6680061221122742, + "learning_rate": 0.0005998437072349397, + "loss": 4.1303, + "step": 629 + }, + { + "epoch": 0.03, + "grad_norm": 0.6559966206550598, + "learning_rate": 0.0005998432099269019, + "loss": 4.1799, + "step": 630 + }, + { + "epoch": 0.03, + "grad_norm": 0.610496461391449, + "learning_rate": 0.0005998427118291347, + "loss": 4.2051, + "step": 631 + }, + { + "epoch": 0.03, + "grad_norm": 0.5574377775192261, + "learning_rate": 0.0005998422129416393, + "loss": 4.2476, + "step": 632 + }, + { + "epoch": 0.03, + "grad_norm": 0.6652995347976685, + "learning_rate": 0.0005998417132644175, + "loss": 4.1745, + "step": 633 + }, + { + "epoch": 0.03, + "grad_norm": 0.609061598777771, + "learning_rate": 0.0005998412127974702, + "loss": 4.3128, + "step": 634 + }, + { + "epoch": 0.03, + "grad_norm": 0.6088485717773438, + "learning_rate": 0.0005998407115407989, + "loss": 4.3749, + "step": 635 + }, + { + "epoch": 0.03, + "grad_norm": 0.6192953586578369, + "learning_rate": 0.0005998402094944048, + "loss": 3.9438, + "step": 636 + }, + { + "epoch": 0.03, + "grad_norm": 0.6353934407234192, + "learning_rate": 0.0005998397066582894, + "loss": 4.1745, + "step": 637 + }, + { + "epoch": 0.03, + "grad_norm": 1.0005871057510376, + "learning_rate": 0.0005998392030324539, + "loss": 4.5231, + "step": 638 + }, + { + "epoch": 0.03, + "grad_norm": 0.5929376482963562, + "learning_rate": 0.0005998386986168996, + "loss": 4.0178, + "step": 639 + }, + { + "epoch": 0.03, + "grad_norm": 0.6462288498878479, + "learning_rate": 0.0005998381934116279, + "loss": 4.1872, + "step": 640 + }, + { + "epoch": 0.03, + "grad_norm": 0.6652570366859436, + "learning_rate": 0.0005998376874166401, + "loss": 4.4516, + "step": 641 + }, + { + "epoch": 0.03, + "grad_norm": 0.6204379200935364, + "learning_rate": 0.0005998371806319375, + "loss": 4.2446, + "step": 642 + }, + { + "epoch": 0.03, + "grad_norm": 0.6597891449928284, + "learning_rate": 0.0005998366730575216, + "loss": 4.1447, + "step": 643 + }, + { + "epoch": 0.03, + "grad_norm": 0.708060622215271, + "learning_rate": 0.0005998361646933934, + "loss": 4.3571, + "step": 644 + }, + { + "epoch": 0.03, + "grad_norm": 0.5907878279685974, + "learning_rate": 0.0005998356555395546, + "loss": 4.0995, + "step": 645 + }, + { + "epoch": 0.03, + "grad_norm": 0.5822071433067322, + "learning_rate": 0.0005998351455960062, + "loss": 4.3305, + "step": 646 + }, + { + "epoch": 0.03, + "grad_norm": 0.6510571837425232, + "learning_rate": 0.0005998346348627499, + "loss": 4.2952, + "step": 647 + }, + { + "epoch": 0.03, + "grad_norm": 0.624159574508667, + "learning_rate": 0.0005998341233397869, + "loss": 4.1151, + "step": 648 + }, + { + "epoch": 0.03, + "grad_norm": 0.5877444744110107, + "learning_rate": 0.0005998336110271185, + "loss": 4.1762, + "step": 649 + }, + { + "epoch": 0.03, + "grad_norm": 0.664666473865509, + "learning_rate": 0.0005998330979247459, + "loss": 4.0597, + "step": 650 + }, + { + "epoch": 0.03, + "grad_norm": 0.5993242263793945, + "learning_rate": 0.0005998325840326708, + "loss": 4.3319, + "step": 651 + }, + { + "epoch": 0.03, + "grad_norm": 0.603900671005249, + "learning_rate": 0.0005998320693508944, + "loss": 4.1354, + "step": 652 + }, + { + "epoch": 0.03, + "grad_norm": 0.5658103227615356, + "learning_rate": 0.000599831553879418, + "loss": 4.3008, + "step": 653 + }, + { + "epoch": 0.03, + "grad_norm": 0.588505208492279, + "learning_rate": 0.0005998310376182429, + "loss": 3.9547, + "step": 654 + }, + { + "epoch": 0.03, + "grad_norm": 0.6646924018859863, + "learning_rate": 0.0005998305205673707, + "loss": 4.3278, + "step": 655 + }, + { + "epoch": 0.03, + "grad_norm": 0.5992798805236816, + "learning_rate": 0.0005998300027268024, + "loss": 4.0916, + "step": 656 + }, + { + "epoch": 0.03, + "grad_norm": 0.6579504013061523, + "learning_rate": 0.0005998294840965397, + "loss": 4.2061, + "step": 657 + }, + { + "epoch": 0.03, + "grad_norm": 0.6638225317001343, + "learning_rate": 0.0005998289646765839, + "loss": 4.3004, + "step": 658 + }, + { + "epoch": 0.03, + "grad_norm": 0.6212108135223389, + "learning_rate": 0.0005998284444669363, + "loss": 4.25, + "step": 659 + }, + { + "epoch": 0.03, + "grad_norm": 0.7648120522499084, + "learning_rate": 0.0005998279234675982, + "loss": 3.9266, + "step": 660 + }, + { + "epoch": 0.03, + "grad_norm": 0.6046731472015381, + "learning_rate": 0.0005998274016785711, + "loss": 4.0652, + "step": 661 + }, + { + "epoch": 0.03, + "grad_norm": 0.6333780884742737, + "learning_rate": 0.0005998268790998563, + "loss": 4.2736, + "step": 662 + }, + { + "epoch": 0.03, + "grad_norm": 0.6316869854927063, + "learning_rate": 0.0005998263557314553, + "loss": 4.2631, + "step": 663 + }, + { + "epoch": 0.03, + "grad_norm": 0.593485951423645, + "learning_rate": 0.0005998258315733692, + "loss": 4.3236, + "step": 664 + }, + { + "epoch": 0.03, + "grad_norm": 0.633284330368042, + "learning_rate": 0.0005998253066255997, + "loss": 4.0417, + "step": 665 + }, + { + "epoch": 0.03, + "grad_norm": 0.5952419638633728, + "learning_rate": 0.000599824780888148, + "loss": 4.3424, + "step": 666 + }, + { + "epoch": 0.03, + "grad_norm": 0.6344154477119446, + "learning_rate": 0.0005998242543610155, + "loss": 4.1042, + "step": 667 + }, + { + "epoch": 0.03, + "grad_norm": 0.6327022910118103, + "learning_rate": 0.0005998237270442037, + "loss": 4.174, + "step": 668 + }, + { + "epoch": 0.03, + "grad_norm": 0.7157790660858154, + "learning_rate": 0.0005998231989377139, + "loss": 4.0206, + "step": 669 + }, + { + "epoch": 0.03, + "grad_norm": 0.6288419365882874, + "learning_rate": 0.0005998226700415474, + "loss": 4.3456, + "step": 670 + }, + { + "epoch": 0.03, + "grad_norm": 0.6706027984619141, + "learning_rate": 0.0005998221403557058, + "loss": 3.9778, + "step": 671 + }, + { + "epoch": 0.03, + "grad_norm": 0.6601995825767517, + "learning_rate": 0.0005998216098801904, + "loss": 4.2968, + "step": 672 + }, + { + "epoch": 0.03, + "grad_norm": 0.6394633650779724, + "learning_rate": 0.0005998210786150024, + "loss": 4.3944, + "step": 673 + }, + { + "epoch": 0.03, + "grad_norm": 0.7500145435333252, + "learning_rate": 0.0005998205465601435, + "loss": 4.0035, + "step": 674 + }, + { + "epoch": 0.03, + "grad_norm": 0.6336989998817444, + "learning_rate": 0.000599820013715615, + "loss": 4.241, + "step": 675 + }, + { + "epoch": 0.03, + "grad_norm": 0.6461735367774963, + "learning_rate": 0.0005998194800814182, + "loss": 4.223, + "step": 676 + }, + { + "epoch": 0.03, + "grad_norm": 0.6548101902008057, + "learning_rate": 0.0005998189456575547, + "loss": 4.1025, + "step": 677 + }, + { + "epoch": 0.03, + "grad_norm": 0.6322773694992065, + "learning_rate": 0.0005998184104440257, + "loss": 4.2219, + "step": 678 + }, + { + "epoch": 0.03, + "grad_norm": 0.6030046939849854, + "learning_rate": 0.0005998178744408328, + "loss": 4.1642, + "step": 679 + }, + { + "epoch": 0.03, + "grad_norm": 0.6334202289581299, + "learning_rate": 0.0005998173376479773, + "loss": 4.2389, + "step": 680 + }, + { + "epoch": 0.03, + "grad_norm": 0.6108695864677429, + "learning_rate": 0.0005998168000654606, + "loss": 4.0175, + "step": 681 + }, + { + "epoch": 0.03, + "grad_norm": 0.6236847639083862, + "learning_rate": 0.0005998162616932841, + "loss": 4.3389, + "step": 682 + }, + { + "epoch": 0.03, + "grad_norm": 0.6061368584632874, + "learning_rate": 0.0005998157225314493, + "loss": 4.2202, + "step": 683 + }, + { + "epoch": 0.03, + "grad_norm": 0.6532244682312012, + "learning_rate": 0.0005998151825799576, + "loss": 4.0348, + "step": 684 + }, + { + "epoch": 0.03, + "grad_norm": 0.5872693061828613, + "learning_rate": 0.0005998146418388105, + "loss": 4.2631, + "step": 685 + }, + { + "epoch": 0.03, + "grad_norm": 0.6573641300201416, + "learning_rate": 0.0005998141003080092, + "loss": 4.1371, + "step": 686 + }, + { + "epoch": 0.03, + "grad_norm": 0.6546458601951599, + "learning_rate": 0.0005998135579875554, + "loss": 4.1695, + "step": 687 + }, + { + "epoch": 0.03, + "grad_norm": 0.6271075010299683, + "learning_rate": 0.0005998130148774502, + "loss": 4.2471, + "step": 688 + }, + { + "epoch": 0.03, + "grad_norm": 0.5936674475669861, + "learning_rate": 0.0005998124709776953, + "loss": 4.1321, + "step": 689 + }, + { + "epoch": 0.03, + "grad_norm": 0.5933693051338196, + "learning_rate": 0.0005998119262882921, + "loss": 4.1484, + "step": 690 + }, + { + "epoch": 0.03, + "grad_norm": 0.6198687553405762, + "learning_rate": 0.000599811380809242, + "loss": 4.4228, + "step": 691 + }, + { + "epoch": 0.03, + "grad_norm": 0.6083481907844543, + "learning_rate": 0.0005998108345405465, + "loss": 4.266, + "step": 692 + }, + { + "epoch": 0.03, + "grad_norm": 0.6238859295845032, + "learning_rate": 0.0005998102874822068, + "loss": 4.3885, + "step": 693 + }, + { + "epoch": 0.03, + "grad_norm": 0.6317378878593445, + "learning_rate": 0.0005998097396342245, + "loss": 4.2074, + "step": 694 + }, + { + "epoch": 0.03, + "grad_norm": 0.6172046661376953, + "learning_rate": 0.0005998091909966011, + "loss": 4.0928, + "step": 695 + }, + { + "epoch": 0.03, + "grad_norm": 0.6623038649559021, + "learning_rate": 0.000599808641569338, + "loss": 4.0324, + "step": 696 + }, + { + "epoch": 0.03, + "grad_norm": 0.6745285987854004, + "learning_rate": 0.0005998080913524365, + "loss": 4.3255, + "step": 697 + }, + { + "epoch": 0.03, + "grad_norm": 0.5575293302536011, + "learning_rate": 0.0005998075403458984, + "loss": 4.2219, + "step": 698 + }, + { + "epoch": 0.03, + "grad_norm": 0.6386272311210632, + "learning_rate": 0.0005998069885497249, + "loss": 4.236, + "step": 699 + }, + { + "epoch": 0.03, + "grad_norm": 0.7148878574371338, + "learning_rate": 0.0005998064359639173, + "loss": 4.089, + "step": 700 + }, + { + "epoch": 0.03, + "grad_norm": 0.6956779360771179, + "learning_rate": 0.0005998058825884775, + "loss": 4.1711, + "step": 701 + }, + { + "epoch": 0.03, + "grad_norm": 0.610063910484314, + "learning_rate": 0.0005998053284234067, + "loss": 4.3074, + "step": 702 + }, + { + "epoch": 0.03, + "grad_norm": 0.7103130221366882, + "learning_rate": 0.0005998047734687062, + "loss": 4.1535, + "step": 703 + }, + { + "epoch": 0.03, + "grad_norm": 0.6960937976837158, + "learning_rate": 0.0005998042177243776, + "loss": 3.9969, + "step": 704 + }, + { + "epoch": 0.03, + "grad_norm": 0.7523322105407715, + "learning_rate": 0.0005998036611904225, + "loss": 3.8138, + "step": 705 + }, + { + "epoch": 0.03, + "grad_norm": 0.6804961562156677, + "learning_rate": 0.0005998031038668422, + "loss": 4.1109, + "step": 706 + }, + { + "epoch": 0.03, + "grad_norm": 0.6763580441474915, + "learning_rate": 0.0005998025457536382, + "loss": 3.9702, + "step": 707 + }, + { + "epoch": 0.03, + "grad_norm": 0.8153755068778992, + "learning_rate": 0.0005998019868508121, + "loss": 4.0995, + "step": 708 + }, + { + "epoch": 0.03, + "grad_norm": 0.6804799437522888, + "learning_rate": 0.0005998014271583652, + "loss": 4.2456, + "step": 709 + }, + { + "epoch": 0.03, + "grad_norm": 0.6983351707458496, + "learning_rate": 0.0005998008666762989, + "loss": 3.9431, + "step": 710 + }, + { + "epoch": 0.03, + "grad_norm": 0.6080770492553711, + "learning_rate": 0.0005998003054046151, + "loss": 4.277, + "step": 711 + }, + { + "epoch": 0.03, + "grad_norm": 0.7080684900283813, + "learning_rate": 0.0005997997433433148, + "loss": 4.0764, + "step": 712 + }, + { + "epoch": 0.03, + "grad_norm": 0.6443161964416504, + "learning_rate": 0.0005997991804923997, + "loss": 4.0913, + "step": 713 + }, + { + "epoch": 0.03, + "grad_norm": 0.6476138234138489, + "learning_rate": 0.0005997986168518713, + "loss": 4.2866, + "step": 714 + }, + { + "epoch": 0.04, + "grad_norm": 0.603281557559967, + "learning_rate": 0.000599798052421731, + "loss": 4.1755, + "step": 715 + }, + { + "epoch": 0.04, + "grad_norm": 0.6660357713699341, + "learning_rate": 0.0005997974872019804, + "loss": 4.0975, + "step": 716 + }, + { + "epoch": 0.04, + "grad_norm": 0.6493331789970398, + "learning_rate": 0.0005997969211926208, + "loss": 4.2041, + "step": 717 + }, + { + "epoch": 0.04, + "grad_norm": 0.6349976062774658, + "learning_rate": 0.000599796354393654, + "loss": 4.0962, + "step": 718 + }, + { + "epoch": 0.04, + "grad_norm": 0.6662265062332153, + "learning_rate": 0.000599795786805081, + "loss": 4.0407, + "step": 719 + }, + { + "epoch": 0.04, + "grad_norm": 0.5812216997146606, + "learning_rate": 0.0005997952184269038, + "loss": 4.2963, + "step": 720 + }, + { + "epoch": 0.04, + "grad_norm": 0.5997583866119385, + "learning_rate": 0.0005997946492591237, + "loss": 4.0069, + "step": 721 + }, + { + "epoch": 0.04, + "grad_norm": 0.6585355997085571, + "learning_rate": 0.0005997940793017422, + "loss": 4.0195, + "step": 722 + }, + { + "epoch": 0.04, + "grad_norm": 0.6478659510612488, + "learning_rate": 0.0005997935085547606, + "loss": 4.1715, + "step": 723 + }, + { + "epoch": 0.04, + "grad_norm": 0.6393293142318726, + "learning_rate": 0.0005997929370181809, + "loss": 4.086, + "step": 724 + }, + { + "epoch": 0.04, + "grad_norm": 0.6129328608512878, + "learning_rate": 0.0005997923646920041, + "loss": 4.2473, + "step": 725 + }, + { + "epoch": 0.04, + "grad_norm": 0.6000876426696777, + "learning_rate": 0.0005997917915762319, + "loss": 3.8759, + "step": 726 + }, + { + "epoch": 0.04, + "grad_norm": 0.6004976034164429, + "learning_rate": 0.0005997912176708657, + "loss": 4.2669, + "step": 727 + }, + { + "epoch": 0.04, + "grad_norm": 0.698418140411377, + "learning_rate": 0.0005997906429759074, + "loss": 4.2933, + "step": 728 + }, + { + "epoch": 0.04, + "grad_norm": 0.6390034556388855, + "learning_rate": 0.0005997900674913581, + "loss": 4.2472, + "step": 729 + }, + { + "epoch": 0.04, + "grad_norm": 0.6226664781570435, + "learning_rate": 0.0005997894912172196, + "loss": 4.0674, + "step": 730 + }, + { + "epoch": 0.04, + "grad_norm": 0.6232365965843201, + "learning_rate": 0.0005997889141534931, + "loss": 4.3033, + "step": 731 + }, + { + "epoch": 0.04, + "grad_norm": 0.6623662114143372, + "learning_rate": 0.0005997883363001803, + "loss": 4.1758, + "step": 732 + }, + { + "epoch": 0.04, + "grad_norm": 0.9481949806213379, + "learning_rate": 0.0005997877576572828, + "loss": 4.5139, + "step": 733 + }, + { + "epoch": 0.04, + "grad_norm": 0.6264766454696655, + "learning_rate": 0.000599787178224802, + "loss": 4.109, + "step": 734 + }, + { + "epoch": 0.04, + "grad_norm": 0.6743102073669434, + "learning_rate": 0.0005997865980027395, + "loss": 4.422, + "step": 735 + }, + { + "epoch": 0.04, + "grad_norm": 0.688208281993866, + "learning_rate": 0.0005997860169910968, + "loss": 4.1469, + "step": 736 + }, + { + "epoch": 0.04, + "grad_norm": 0.5970245003700256, + "learning_rate": 0.0005997854351898754, + "loss": 4.2083, + "step": 737 + }, + { + "epoch": 0.04, + "grad_norm": 0.6264445781707764, + "learning_rate": 0.0005997848525990769, + "loss": 4.251, + "step": 738 + }, + { + "epoch": 0.04, + "grad_norm": 0.6513589024543762, + "learning_rate": 0.0005997842692187027, + "loss": 3.8674, + "step": 739 + }, + { + "epoch": 0.04, + "grad_norm": 0.6829968690872192, + "learning_rate": 0.0005997836850487545, + "loss": 3.9729, + "step": 740 + }, + { + "epoch": 0.04, + "grad_norm": 0.6139025688171387, + "learning_rate": 0.0005997831000892338, + "loss": 4.1896, + "step": 741 + }, + { + "epoch": 0.04, + "grad_norm": 0.6279214024543762, + "learning_rate": 0.0005997825143401421, + "loss": 4.1086, + "step": 742 + }, + { + "epoch": 0.04, + "grad_norm": 0.6188284158706665, + "learning_rate": 0.0005997819278014808, + "loss": 4.0587, + "step": 743 + }, + { + "epoch": 0.04, + "grad_norm": 0.6335148811340332, + "learning_rate": 0.0005997813404732517, + "loss": 3.9916, + "step": 744 + }, + { + "epoch": 0.04, + "grad_norm": 0.643291711807251, + "learning_rate": 0.0005997807523554563, + "loss": 4.2194, + "step": 745 + }, + { + "epoch": 0.04, + "grad_norm": 0.6920807957649231, + "learning_rate": 0.000599780163448096, + "loss": 3.9606, + "step": 746 + }, + { + "epoch": 0.04, + "grad_norm": 0.6325848698616028, + "learning_rate": 0.0005997795737511725, + "loss": 4.0133, + "step": 747 + }, + { + "epoch": 0.04, + "grad_norm": 0.6075696349143982, + "learning_rate": 0.0005997789832646874, + "loss": 4.1146, + "step": 748 + }, + { + "epoch": 0.04, + "grad_norm": 0.658141016960144, + "learning_rate": 0.000599778391988642, + "loss": 4.1188, + "step": 749 + }, + { + "epoch": 0.04, + "grad_norm": 0.5975100994110107, + "learning_rate": 0.000599777799923038, + "loss": 4.1244, + "step": 750 + }, + { + "epoch": 0.04, + "grad_norm": 0.6382097601890564, + "learning_rate": 0.0005997772070678771, + "loss": 4.0732, + "step": 751 + }, + { + "epoch": 0.04, + "grad_norm": 0.623789370059967, + "learning_rate": 0.0005997766134231606, + "loss": 4.1322, + "step": 752 + }, + { + "epoch": 0.04, + "grad_norm": 0.6385670900344849, + "learning_rate": 0.0005997760189888902, + "loss": 4.1762, + "step": 753 + }, + { + "epoch": 0.04, + "grad_norm": 0.640335738658905, + "learning_rate": 0.0005997754237650675, + "loss": 4.0456, + "step": 754 + }, + { + "epoch": 0.04, + "grad_norm": 0.6698756814002991, + "learning_rate": 0.0005997748277516941, + "loss": 4.3383, + "step": 755 + }, + { + "epoch": 0.04, + "grad_norm": 0.6713323593139648, + "learning_rate": 0.0005997742309487714, + "loss": 4.0747, + "step": 756 + }, + { + "epoch": 0.04, + "grad_norm": 0.61420738697052, + "learning_rate": 0.0005997736333563011, + "loss": 4.1113, + "step": 757 + }, + { + "epoch": 0.04, + "grad_norm": 0.6245611310005188, + "learning_rate": 0.0005997730349742847, + "loss": 4.3387, + "step": 758 + }, + { + "epoch": 0.04, + "grad_norm": 0.5801929235458374, + "learning_rate": 0.0005997724358027238, + "loss": 4.0914, + "step": 759 + }, + { + "epoch": 0.04, + "grad_norm": 0.6528918743133545, + "learning_rate": 0.0005997718358416201, + "loss": 4.148, + "step": 760 + }, + { + "epoch": 0.04, + "grad_norm": 0.5901684761047363, + "learning_rate": 0.000599771235090975, + "loss": 4.0853, + "step": 761 + }, + { + "epoch": 0.04, + "grad_norm": 0.6030728220939636, + "learning_rate": 0.0005997706335507902, + "loss": 4.071, + "step": 762 + }, + { + "epoch": 0.04, + "grad_norm": 0.624138355255127, + "learning_rate": 0.0005997700312210672, + "loss": 4.134, + "step": 763 + }, + { + "epoch": 0.04, + "grad_norm": 0.647909939289093, + "learning_rate": 0.0005997694281018077, + "loss": 4.2178, + "step": 764 + }, + { + "epoch": 0.04, + "grad_norm": 0.6185994148254395, + "learning_rate": 0.0005997688241930131, + "loss": 4.2237, + "step": 765 + }, + { + "epoch": 0.04, + "grad_norm": 0.6649905443191528, + "learning_rate": 0.0005997682194946852, + "loss": 4.4995, + "step": 766 + }, + { + "epoch": 0.04, + "grad_norm": 0.6107615828514099, + "learning_rate": 0.0005997676140068255, + "loss": 4.1487, + "step": 767 + }, + { + "epoch": 0.04, + "grad_norm": 0.6697407364845276, + "learning_rate": 0.0005997670077294355, + "loss": 4.0481, + "step": 768 + }, + { + "epoch": 0.04, + "grad_norm": 0.6354249715805054, + "learning_rate": 0.000599766400662517, + "loss": 4.0766, + "step": 769 + }, + { + "epoch": 0.04, + "grad_norm": 0.63181072473526, + "learning_rate": 0.0005997657928060715, + "loss": 3.8411, + "step": 770 + }, + { + "epoch": 0.04, + "grad_norm": 0.6216081976890564, + "learning_rate": 0.0005997651841601006, + "loss": 4.1454, + "step": 771 + }, + { + "epoch": 0.04, + "grad_norm": 0.6391425728797913, + "learning_rate": 0.0005997645747246058, + "loss": 4.0997, + "step": 772 + }, + { + "epoch": 0.04, + "grad_norm": 0.6952412724494934, + "learning_rate": 0.0005997639644995888, + "loss": 4.2179, + "step": 773 + }, + { + "epoch": 0.04, + "grad_norm": 0.6759364008903503, + "learning_rate": 0.0005997633534850514, + "loss": 4.1654, + "step": 774 + }, + { + "epoch": 0.04, + "grad_norm": 0.6045799851417542, + "learning_rate": 0.0005997627416809948, + "loss": 4.2034, + "step": 775 + }, + { + "epoch": 0.04, + "grad_norm": 0.6865895390510559, + "learning_rate": 0.0005997621290874209, + "loss": 4.0764, + "step": 776 + }, + { + "epoch": 0.04, + "grad_norm": 0.6113295555114746, + "learning_rate": 0.0005997615157043312, + "loss": 4.2651, + "step": 777 + }, + { + "epoch": 0.04, + "grad_norm": 0.687916100025177, + "learning_rate": 0.0005997609015317275, + "loss": 4.2646, + "step": 778 + }, + { + "epoch": 0.04, + "grad_norm": 0.6994914412498474, + "learning_rate": 0.0005997602865696111, + "loss": 3.9585, + "step": 779 + }, + { + "epoch": 0.04, + "grad_norm": 0.6061134338378906, + "learning_rate": 0.0005997596708179839, + "loss": 4.0875, + "step": 780 + }, + { + "epoch": 0.04, + "grad_norm": 0.606025218963623, + "learning_rate": 0.0005997590542768475, + "loss": 4.321, + "step": 781 + }, + { + "epoch": 0.04, + "grad_norm": 0.6358824968338013, + "learning_rate": 0.0005997584369462033, + "loss": 4.1621, + "step": 782 + }, + { + "epoch": 0.04, + "grad_norm": 0.6684304475784302, + "learning_rate": 0.0005997578188260531, + "loss": 4.2266, + "step": 783 + }, + { + "epoch": 0.04, + "grad_norm": 0.5966183543205261, + "learning_rate": 0.0005997571999163985, + "loss": 4.0322, + "step": 784 + }, + { + "epoch": 0.04, + "grad_norm": 0.5628737211227417, + "learning_rate": 0.0005997565802172411, + "loss": 3.8574, + "step": 785 + }, + { + "epoch": 0.04, + "grad_norm": 0.7218655347824097, + "learning_rate": 0.0005997559597285825, + "loss": 3.857, + "step": 786 + }, + { + "epoch": 0.04, + "grad_norm": 0.6253189444541931, + "learning_rate": 0.0005997553384504246, + "loss": 4.1211, + "step": 787 + }, + { + "epoch": 0.04, + "grad_norm": 0.7218075394630432, + "learning_rate": 0.0005997547163827686, + "loss": 3.6799, + "step": 788 + }, + { + "epoch": 0.04, + "grad_norm": 0.5922057628631592, + "learning_rate": 0.0005997540935256165, + "loss": 4.3404, + "step": 789 + }, + { + "epoch": 0.04, + "grad_norm": 0.6513999700546265, + "learning_rate": 0.0005997534698789698, + "loss": 4.0325, + "step": 790 + }, + { + "epoch": 0.04, + "grad_norm": 0.6125958561897278, + "learning_rate": 0.0005997528454428301, + "loss": 3.9919, + "step": 791 + }, + { + "epoch": 0.04, + "grad_norm": 0.6877921223640442, + "learning_rate": 0.0005997522202171991, + "loss": 4.1427, + "step": 792 + }, + { + "epoch": 0.04, + "grad_norm": 0.6257963180541992, + "learning_rate": 0.0005997515942020785, + "loss": 3.9095, + "step": 793 + }, + { + "epoch": 0.04, + "grad_norm": 0.6871378421783447, + "learning_rate": 0.0005997509673974699, + "loss": 4.2673, + "step": 794 + }, + { + "epoch": 0.04, + "grad_norm": 0.7322933077812195, + "learning_rate": 0.0005997503398033748, + "loss": 4.0279, + "step": 795 + }, + { + "epoch": 0.04, + "grad_norm": 0.6184875965118408, + "learning_rate": 0.0005997497114197952, + "loss": 4.0317, + "step": 796 + }, + { + "epoch": 0.04, + "grad_norm": 0.620513916015625, + "learning_rate": 0.0005997490822467323, + "loss": 4.1609, + "step": 797 + }, + { + "epoch": 0.04, + "grad_norm": 0.6280710101127625, + "learning_rate": 0.0005997484522841882, + "loss": 3.9997, + "step": 798 + }, + { + "epoch": 0.04, + "grad_norm": 0.6425851583480835, + "learning_rate": 0.0005997478215321642, + "loss": 4.3098, + "step": 799 + }, + { + "epoch": 0.04, + "grad_norm": 0.6624174118041992, + "learning_rate": 0.0005997471899906623, + "loss": 3.9097, + "step": 800 + }, + { + "epoch": 0.04, + "grad_norm": 0.6578688621520996, + "learning_rate": 0.0005997465576596839, + "loss": 4.2133, + "step": 801 + }, + { + "epoch": 0.04, + "grad_norm": 0.6733409762382507, + "learning_rate": 0.0005997459245392308, + "loss": 4.0564, + "step": 802 + }, + { + "epoch": 0.04, + "grad_norm": 0.6345073580741882, + "learning_rate": 0.0005997452906293045, + "loss": 4.1252, + "step": 803 + }, + { + "epoch": 0.04, + "grad_norm": 0.6336762309074402, + "learning_rate": 0.0005997446559299069, + "loss": 4.1081, + "step": 804 + }, + { + "epoch": 0.04, + "grad_norm": 0.6436794996261597, + "learning_rate": 0.0005997440204410395, + "loss": 4.0327, + "step": 805 + }, + { + "epoch": 0.04, + "grad_norm": 1.0349910259246826, + "learning_rate": 0.0005997433841627042, + "loss": 4.0259, + "step": 806 + }, + { + "epoch": 0.04, + "grad_norm": 0.6200885772705078, + "learning_rate": 0.0005997427470949023, + "loss": 4.1537, + "step": 807 + }, + { + "epoch": 0.04, + "grad_norm": 0.6146049499511719, + "learning_rate": 0.0005997421092376358, + "loss": 4.2592, + "step": 808 + }, + { + "epoch": 0.04, + "grad_norm": 0.6383852958679199, + "learning_rate": 0.0005997414705909062, + "loss": 3.9864, + "step": 809 + }, + { + "epoch": 0.04, + "grad_norm": 0.6226063966751099, + "learning_rate": 0.0005997408311547153, + "loss": 4.0929, + "step": 810 + }, + { + "epoch": 0.04, + "grad_norm": 0.6670148968696594, + "learning_rate": 0.0005997401909290648, + "loss": 4.1746, + "step": 811 + }, + { + "epoch": 0.04, + "grad_norm": 0.6614567637443542, + "learning_rate": 0.0005997395499139562, + "loss": 3.9935, + "step": 812 + }, + { + "epoch": 0.04, + "grad_norm": 0.5947785973548889, + "learning_rate": 0.0005997389081093914, + "loss": 4.0051, + "step": 813 + }, + { + "epoch": 0.04, + "grad_norm": 0.6563360691070557, + "learning_rate": 0.000599738265515372, + "loss": 4.1408, + "step": 814 + }, + { + "epoch": 0.04, + "grad_norm": 0.626630961894989, + "learning_rate": 0.0005997376221318996, + "loss": 4.2392, + "step": 815 + }, + { + "epoch": 0.04, + "grad_norm": 0.5788909792900085, + "learning_rate": 0.0005997369779589762, + "loss": 4.0375, + "step": 816 + }, + { + "epoch": 0.04, + "grad_norm": 0.5952694416046143, + "learning_rate": 0.0005997363329966031, + "loss": 4.0138, + "step": 817 + }, + { + "epoch": 0.04, + "grad_norm": 0.6254683136940002, + "learning_rate": 0.0005997356872447822, + "loss": 4.0292, + "step": 818 + }, + { + "epoch": 0.04, + "grad_norm": 0.5747913718223572, + "learning_rate": 0.0005997350407035153, + "loss": 4.1415, + "step": 819 + }, + { + "epoch": 0.04, + "grad_norm": 0.638823926448822, + "learning_rate": 0.0005997343933728038, + "loss": 4.1323, + "step": 820 + }, + { + "epoch": 0.04, + "grad_norm": 0.6529001593589783, + "learning_rate": 0.0005997337452526498, + "loss": 4.0443, + "step": 821 + }, + { + "epoch": 0.04, + "grad_norm": 0.62080317735672, + "learning_rate": 0.0005997330963430547, + "loss": 4.1244, + "step": 822 + }, + { + "epoch": 0.04, + "grad_norm": 0.6070016622543335, + "learning_rate": 0.0005997324466440202, + "loss": 4.1765, + "step": 823 + }, + { + "epoch": 0.04, + "grad_norm": 0.7610416412353516, + "learning_rate": 0.0005997317961555483, + "loss": 4.0887, + "step": 824 + }, + { + "epoch": 0.04, + "grad_norm": 0.6447186470031738, + "learning_rate": 0.0005997311448776404, + "loss": 4.2312, + "step": 825 + }, + { + "epoch": 0.04, + "grad_norm": 0.6255171298980713, + "learning_rate": 0.0005997304928102985, + "loss": 3.8122, + "step": 826 + }, + { + "epoch": 0.04, + "grad_norm": 0.6879158616065979, + "learning_rate": 0.0005997298399535241, + "loss": 4.0435, + "step": 827 + }, + { + "epoch": 0.04, + "grad_norm": 0.6047009229660034, + "learning_rate": 0.000599729186307319, + "loss": 4.0208, + "step": 828 + }, + { + "epoch": 0.04, + "grad_norm": 0.5998377203941345, + "learning_rate": 0.0005997285318716847, + "loss": 4.055, + "step": 829 + }, + { + "epoch": 0.04, + "grad_norm": 0.5977486371994019, + "learning_rate": 0.0005997278766466233, + "loss": 4.238, + "step": 830 + }, + { + "epoch": 0.04, + "grad_norm": 0.5870335698127747, + "learning_rate": 0.0005997272206321365, + "loss": 3.7925, + "step": 831 + }, + { + "epoch": 0.04, + "grad_norm": 0.6094736456871033, + "learning_rate": 0.0005997265638282256, + "loss": 4.2518, + "step": 832 + }, + { + "epoch": 0.04, + "grad_norm": 0.6728547215461731, + "learning_rate": 0.0005997259062348929, + "loss": 4.1802, + "step": 833 + }, + { + "epoch": 0.04, + "grad_norm": 0.5765520334243774, + "learning_rate": 0.0005997252478521396, + "loss": 3.8716, + "step": 834 + }, + { + "epoch": 0.04, + "grad_norm": 0.604479193687439, + "learning_rate": 0.0005997245886799679, + "loss": 4.1638, + "step": 835 + }, + { + "epoch": 0.04, + "grad_norm": 0.7261841297149658, + "learning_rate": 0.0005997239287183791, + "loss": 4.0187, + "step": 836 + }, + { + "epoch": 0.04, + "grad_norm": 0.6174228191375732, + "learning_rate": 0.0005997232679673752, + "loss": 4.0628, + "step": 837 + }, + { + "epoch": 0.04, + "grad_norm": 0.5952420234680176, + "learning_rate": 0.0005997226064269579, + "loss": 4.1506, + "step": 838 + }, + { + "epoch": 0.04, + "grad_norm": 0.5986089110374451, + "learning_rate": 0.0005997219440971291, + "loss": 4.0894, + "step": 839 + }, + { + "epoch": 0.04, + "grad_norm": 0.5981305241584778, + "learning_rate": 0.0005997212809778903, + "loss": 3.893, + "step": 840 + }, + { + "epoch": 0.04, + "grad_norm": 0.747908353805542, + "learning_rate": 0.0005997206170692432, + "loss": 4.2173, + "step": 841 + }, + { + "epoch": 0.04, + "grad_norm": 0.6060889959335327, + "learning_rate": 0.0005997199523711899, + "loss": 4.1177, + "step": 842 + }, + { + "epoch": 0.04, + "grad_norm": 0.6315199732780457, + "learning_rate": 0.0005997192868837317, + "loss": 4.1163, + "step": 843 + }, + { + "epoch": 0.04, + "grad_norm": 0.6924545764923096, + "learning_rate": 0.0005997186206068708, + "loss": 3.9744, + "step": 844 + }, + { + "epoch": 0.04, + "grad_norm": 0.6298468112945557, + "learning_rate": 0.0005997179535406086, + "loss": 4.0924, + "step": 845 + }, + { + "epoch": 0.04, + "grad_norm": 0.5951611399650574, + "learning_rate": 0.000599717285684947, + "loss": 3.8841, + "step": 846 + }, + { + "epoch": 0.04, + "grad_norm": 0.6348850727081299, + "learning_rate": 0.0005997166170398876, + "loss": 4.0741, + "step": 847 + }, + { + "epoch": 0.04, + "grad_norm": 0.6831207275390625, + "learning_rate": 0.0005997159476054326, + "loss": 3.9413, + "step": 848 + }, + { + "epoch": 0.04, + "grad_norm": 0.6047587394714355, + "learning_rate": 0.0005997152773815834, + "loss": 4.0532, + "step": 849 + }, + { + "epoch": 0.04, + "grad_norm": 0.6082019805908203, + "learning_rate": 0.0005997146063683417, + "loss": 4.0897, + "step": 850 + }, + { + "epoch": 0.04, + "grad_norm": 0.6189398169517517, + "learning_rate": 0.0005997139345657095, + "loss": 4.1492, + "step": 851 + }, + { + "epoch": 0.04, + "grad_norm": 0.624375581741333, + "learning_rate": 0.0005997132619736885, + "loss": 4.3931, + "step": 852 + }, + { + "epoch": 0.04, + "grad_norm": 0.6344230771064758, + "learning_rate": 0.0005997125885922805, + "loss": 3.903, + "step": 853 + }, + { + "epoch": 0.04, + "grad_norm": 0.6171305179595947, + "learning_rate": 0.0005997119144214872, + "loss": 4.1388, + "step": 854 + }, + { + "epoch": 0.04, + "grad_norm": 0.6373316645622253, + "learning_rate": 0.0005997112394613102, + "loss": 4.3171, + "step": 855 + }, + { + "epoch": 0.04, + "grad_norm": 0.5979803204536438, + "learning_rate": 0.0005997105637117516, + "loss": 4.0232, + "step": 856 + }, + { + "epoch": 0.04, + "grad_norm": 0.6416492462158203, + "learning_rate": 0.0005997098871728131, + "loss": 4.4098, + "step": 857 + }, + { + "epoch": 0.04, + "grad_norm": 0.6283490061759949, + "learning_rate": 0.0005997092098444964, + "loss": 3.9947, + "step": 858 + }, + { + "epoch": 0.04, + "grad_norm": 0.5568273663520813, + "learning_rate": 0.0005997085317268033, + "loss": 4.0547, + "step": 859 + }, + { + "epoch": 0.04, + "grad_norm": 0.6507592797279358, + "learning_rate": 0.0005997078528197356, + "loss": 3.9341, + "step": 860 + }, + { + "epoch": 0.04, + "grad_norm": 0.635384202003479, + "learning_rate": 0.0005997071731232952, + "loss": 4.0064, + "step": 861 + }, + { + "epoch": 0.04, + "grad_norm": 0.5979812145233154, + "learning_rate": 0.0005997064926374837, + "loss": 3.8342, + "step": 862 + }, + { + "epoch": 0.04, + "grad_norm": 0.6876376867294312, + "learning_rate": 0.000599705811362303, + "loss": 4.1938, + "step": 863 + }, + { + "epoch": 0.04, + "grad_norm": 0.6582483649253845, + "learning_rate": 0.0005997051292977549, + "loss": 4.0655, + "step": 864 + }, + { + "epoch": 0.04, + "grad_norm": 0.6212353706359863, + "learning_rate": 0.0005997044464438413, + "loss": 4.1384, + "step": 865 + }, + { + "epoch": 0.04, + "grad_norm": 0.5523200631141663, + "learning_rate": 0.0005997037628005636, + "loss": 4.1858, + "step": 866 + }, + { + "epoch": 0.04, + "grad_norm": 0.6378701329231262, + "learning_rate": 0.0005997030783679241, + "loss": 4.112, + "step": 867 + }, + { + "epoch": 0.04, + "grad_norm": 0.5988345742225647, + "learning_rate": 0.0005997023931459242, + "loss": 4.095, + "step": 868 + }, + { + "epoch": 0.04, + "grad_norm": 0.6165454983711243, + "learning_rate": 0.0005997017071345659, + "loss": 3.8559, + "step": 869 + }, + { + "epoch": 0.04, + "grad_norm": 0.6014971733093262, + "learning_rate": 0.000599701020333851, + "loss": 3.8783, + "step": 870 + }, + { + "epoch": 0.04, + "grad_norm": 0.6640937924385071, + "learning_rate": 0.0005997003327437812, + "loss": 4.1482, + "step": 871 + }, + { + "epoch": 0.04, + "grad_norm": 0.6299766302108765, + "learning_rate": 0.0005996996443643585, + "loss": 4.0896, + "step": 872 + }, + { + "epoch": 0.04, + "grad_norm": 0.6197918057441711, + "learning_rate": 0.0005996989551955847, + "loss": 3.9987, + "step": 873 + }, + { + "epoch": 0.04, + "grad_norm": 0.6115261316299438, + "learning_rate": 0.0005996982652374614, + "loss": 4.0155, + "step": 874 + }, + { + "epoch": 0.04, + "grad_norm": 0.6456928253173828, + "learning_rate": 0.0005996975744899906, + "loss": 4.2857, + "step": 875 + }, + { + "epoch": 0.04, + "grad_norm": 0.6480864882469177, + "learning_rate": 0.000599696882953174, + "loss": 3.983, + "step": 876 + }, + { + "epoch": 0.04, + "grad_norm": 0.6337156295776367, + "learning_rate": 0.0005996961906270136, + "loss": 4.0381, + "step": 877 + }, + { + "epoch": 0.04, + "grad_norm": 0.610868513584137, + "learning_rate": 0.0005996954975115111, + "loss": 4.123, + "step": 878 + }, + { + "epoch": 0.04, + "grad_norm": 0.6607982516288757, + "learning_rate": 0.0005996948036066683, + "loss": 4.2524, + "step": 879 + }, + { + "epoch": 0.04, + "grad_norm": 0.6953883171081543, + "learning_rate": 0.000599694108912487, + "loss": 3.9404, + "step": 880 + }, + { + "epoch": 0.04, + "grad_norm": 0.6293426156044006, + "learning_rate": 0.0005996934134289692, + "loss": 3.9467, + "step": 881 + }, + { + "epoch": 0.04, + "grad_norm": 0.6299402117729187, + "learning_rate": 0.0005996927171561166, + "loss": 4.2004, + "step": 882 + }, + { + "epoch": 0.04, + "grad_norm": 0.7695729732513428, + "learning_rate": 0.0005996920200939309, + "loss": 3.9446, + "step": 883 + }, + { + "epoch": 0.04, + "grad_norm": 0.662330687046051, + "learning_rate": 0.0005996913222424144, + "loss": 4.1542, + "step": 884 + }, + { + "epoch": 0.04, + "grad_norm": 0.6112087965011597, + "learning_rate": 0.0005996906236015684, + "loss": 4.1722, + "step": 885 + }, + { + "epoch": 0.04, + "grad_norm": 0.6271827220916748, + "learning_rate": 0.000599689924171395, + "loss": 3.9089, + "step": 886 + }, + { + "epoch": 0.04, + "grad_norm": 0.6005352735519409, + "learning_rate": 0.0005996892239518961, + "loss": 3.9845, + "step": 887 + }, + { + "epoch": 0.04, + "grad_norm": 0.6041455864906311, + "learning_rate": 0.0005996885229430734, + "loss": 4.1323, + "step": 888 + }, + { + "epoch": 0.04, + "grad_norm": 0.5910174250602722, + "learning_rate": 0.0005996878211449289, + "loss": 4.2648, + "step": 889 + }, + { + "epoch": 0.04, + "grad_norm": 0.6130709648132324, + "learning_rate": 0.0005996871185574643, + "loss": 4.0298, + "step": 890 + }, + { + "epoch": 0.04, + "grad_norm": 0.6232227087020874, + "learning_rate": 0.0005996864151806815, + "loss": 4.2897, + "step": 891 + }, + { + "epoch": 0.04, + "grad_norm": 0.6949803233146667, + "learning_rate": 0.0005996857110145823, + "loss": 4.0198, + "step": 892 + }, + { + "epoch": 0.04, + "grad_norm": 0.5966178774833679, + "learning_rate": 0.0005996850060591687, + "loss": 4.1471, + "step": 893 + }, + { + "epoch": 0.04, + "grad_norm": 0.6501827836036682, + "learning_rate": 0.0005996843003144424, + "loss": 4.0682, + "step": 894 + }, + { + "epoch": 0.04, + "grad_norm": 0.6076254844665527, + "learning_rate": 0.0005996835937804054, + "loss": 4.1748, + "step": 895 + }, + { + "epoch": 0.04, + "grad_norm": 0.5926339030265808, + "learning_rate": 0.0005996828864570594, + "loss": 4.019, + "step": 896 + }, + { + "epoch": 0.04, + "grad_norm": 0.6619225144386292, + "learning_rate": 0.0005996821783444064, + "loss": 3.9084, + "step": 897 + }, + { + "epoch": 0.04, + "grad_norm": 0.6074878573417664, + "learning_rate": 0.0005996814694424483, + "loss": 4.0513, + "step": 898 + }, + { + "epoch": 0.04, + "grad_norm": 0.7126258015632629, + "learning_rate": 0.0005996807597511868, + "loss": 4.0084, + "step": 899 + }, + { + "epoch": 0.04, + "grad_norm": 0.6678495407104492, + "learning_rate": 0.0005996800492706237, + "loss": 3.9053, + "step": 900 + }, + { + "epoch": 0.04, + "grad_norm": 0.6096392869949341, + "learning_rate": 0.0005996793380007613, + "loss": 3.9624, + "step": 901 + }, + { + "epoch": 0.04, + "grad_norm": 0.6041682362556458, + "learning_rate": 0.000599678625941601, + "loss": 4.0106, + "step": 902 + }, + { + "epoch": 0.04, + "grad_norm": 0.6434935331344604, + "learning_rate": 0.000599677913093145, + "loss": 4.0789, + "step": 903 + }, + { + "epoch": 0.04, + "grad_norm": 0.7096984386444092, + "learning_rate": 0.000599677199455395, + "loss": 4.1337, + "step": 904 + }, + { + "epoch": 0.04, + "grad_norm": 0.5979908108711243, + "learning_rate": 0.0005996764850283529, + "loss": 4.1843, + "step": 905 + }, + { + "epoch": 0.04, + "grad_norm": 0.6335223913192749, + "learning_rate": 0.0005996757698120206, + "loss": 4.2159, + "step": 906 + }, + { + "epoch": 0.04, + "grad_norm": 0.685057520866394, + "learning_rate": 0.0005996750538064001, + "loss": 3.8512, + "step": 907 + }, + { + "epoch": 0.04, + "grad_norm": 0.6344968676567078, + "learning_rate": 0.0005996743370114932, + "loss": 4.1279, + "step": 908 + }, + { + "epoch": 0.04, + "grad_norm": 0.6115414500236511, + "learning_rate": 0.0005996736194273017, + "loss": 4.1364, + "step": 909 + }, + { + "epoch": 0.04, + "grad_norm": 0.6755195260047913, + "learning_rate": 0.0005996729010538275, + "loss": 4.0321, + "step": 910 + }, + { + "epoch": 0.04, + "grad_norm": 0.5959694981575012, + "learning_rate": 0.0005996721818910725, + "loss": 4.2033, + "step": 911 + }, + { + "epoch": 0.04, + "grad_norm": 0.6436964869499207, + "learning_rate": 0.0005996714619390389, + "loss": 4.0886, + "step": 912 + }, + { + "epoch": 0.04, + "grad_norm": 0.6065239310264587, + "learning_rate": 0.0005996707411977281, + "loss": 4.1504, + "step": 913 + }, + { + "epoch": 0.04, + "grad_norm": 0.5962178707122803, + "learning_rate": 0.0005996700196671423, + "loss": 3.8577, + "step": 914 + }, + { + "epoch": 0.04, + "grad_norm": 0.6235368847846985, + "learning_rate": 0.0005996692973472835, + "loss": 4.1173, + "step": 915 + }, + { + "epoch": 0.04, + "grad_norm": 0.6299294233322144, + "learning_rate": 0.0005996685742381532, + "loss": 3.9826, + "step": 916 + }, + { + "epoch": 0.04, + "grad_norm": 0.6154347062110901, + "learning_rate": 0.0005996678503397537, + "loss": 4.0741, + "step": 917 + }, + { + "epoch": 0.04, + "grad_norm": 0.6413784623146057, + "learning_rate": 0.0005996671256520867, + "loss": 3.9614, + "step": 918 + }, + { + "epoch": 0.05, + "grad_norm": 0.631839394569397, + "learning_rate": 0.0005996664001751542, + "loss": 4.0662, + "step": 919 + }, + { + "epoch": 0.05, + "grad_norm": 0.6405590772628784, + "learning_rate": 0.0005996656739089581, + "loss": 4.0365, + "step": 920 + }, + { + "epoch": 0.05, + "grad_norm": 0.6293720006942749, + "learning_rate": 0.0005996649468535002, + "loss": 4.2496, + "step": 921 + }, + { + "epoch": 0.05, + "grad_norm": 0.6363210082054138, + "learning_rate": 0.0005996642190087825, + "loss": 3.9949, + "step": 922 + }, + { + "epoch": 0.05, + "grad_norm": 0.6418921947479248, + "learning_rate": 0.000599663490374807, + "loss": 4.0203, + "step": 923 + }, + { + "epoch": 0.05, + "grad_norm": 0.6134077906608582, + "learning_rate": 0.0005996627609515755, + "loss": 3.9338, + "step": 924 + }, + { + "epoch": 0.05, + "grad_norm": 0.6744232177734375, + "learning_rate": 0.00059966203073909, + "loss": 3.9921, + "step": 925 + }, + { + "epoch": 0.05, + "grad_norm": 0.6086677312850952, + "learning_rate": 0.0005996612997373524, + "loss": 3.9985, + "step": 926 + }, + { + "epoch": 0.05, + "grad_norm": 0.6225132942199707, + "learning_rate": 0.0005996605679463644, + "loss": 4.061, + "step": 927 + }, + { + "epoch": 0.05, + "grad_norm": 0.6579686999320984, + "learning_rate": 0.0005996598353661284, + "loss": 4.1547, + "step": 928 + }, + { + "epoch": 0.05, + "grad_norm": 0.6591225862503052, + "learning_rate": 0.0005996591019966459, + "loss": 4.1103, + "step": 929 + }, + { + "epoch": 0.05, + "grad_norm": 0.601435661315918, + "learning_rate": 0.0005996583678379191, + "loss": 3.9549, + "step": 930 + }, + { + "epoch": 0.05, + "grad_norm": 0.6121599674224854, + "learning_rate": 0.0005996576328899497, + "loss": 4.0187, + "step": 931 + }, + { + "epoch": 0.05, + "grad_norm": 0.62762850522995, + "learning_rate": 0.00059965689715274, + "loss": 4.0643, + "step": 932 + }, + { + "epoch": 0.05, + "grad_norm": 0.638881266117096, + "learning_rate": 0.0005996561606262914, + "loss": 4.0128, + "step": 933 + }, + { + "epoch": 0.05, + "grad_norm": 0.5881949067115784, + "learning_rate": 0.0005996554233106063, + "loss": 3.9358, + "step": 934 + }, + { + "epoch": 0.05, + "grad_norm": 0.6519078612327576, + "learning_rate": 0.0005996546852056865, + "loss": 3.9074, + "step": 935 + }, + { + "epoch": 0.05, + "grad_norm": 0.6900700926780701, + "learning_rate": 0.0005996539463115339, + "loss": 4.0253, + "step": 936 + }, + { + "epoch": 0.05, + "grad_norm": 0.6109591126441956, + "learning_rate": 0.0005996532066281505, + "loss": 4.1243, + "step": 937 + }, + { + "epoch": 0.05, + "grad_norm": 0.6539261341094971, + "learning_rate": 0.0005996524661555382, + "loss": 4.1093, + "step": 938 + }, + { + "epoch": 0.05, + "grad_norm": 0.6261901259422302, + "learning_rate": 0.000599651724893699, + "loss": 4.0016, + "step": 939 + }, + { + "epoch": 0.05, + "grad_norm": 0.6098498702049255, + "learning_rate": 0.0005996509828426347, + "loss": 4.0939, + "step": 940 + }, + { + "epoch": 0.05, + "grad_norm": 0.6256742477416992, + "learning_rate": 0.0005996502400023475, + "loss": 4.1468, + "step": 941 + }, + { + "epoch": 0.05, + "grad_norm": 0.6049453616142273, + "learning_rate": 0.0005996494963728392, + "loss": 4.1078, + "step": 942 + }, + { + "epoch": 0.05, + "grad_norm": 0.593472957611084, + "learning_rate": 0.0005996487519541119, + "loss": 4.2345, + "step": 943 + }, + { + "epoch": 0.05, + "grad_norm": 0.6599992513656616, + "learning_rate": 0.0005996480067461673, + "loss": 4.0287, + "step": 944 + }, + { + "epoch": 0.05, + "grad_norm": 0.6049973964691162, + "learning_rate": 0.0005996472607490076, + "loss": 3.9218, + "step": 945 + }, + { + "epoch": 0.05, + "grad_norm": 0.6687982678413391, + "learning_rate": 0.0005996465139626346, + "loss": 4.0952, + "step": 946 + }, + { + "epoch": 0.05, + "grad_norm": 0.6398851275444031, + "learning_rate": 0.0005996457663870505, + "loss": 3.7504, + "step": 947 + }, + { + "epoch": 0.05, + "grad_norm": 0.6496148109436035, + "learning_rate": 0.0005996450180222571, + "loss": 3.9623, + "step": 948 + }, + { + "epoch": 0.05, + "grad_norm": 0.6317514777183533, + "learning_rate": 0.0005996442688682562, + "loss": 4.1198, + "step": 949 + }, + { + "epoch": 0.05, + "grad_norm": 0.6482225656509399, + "learning_rate": 0.0005996435189250501, + "loss": 3.883, + "step": 950 + }, + { + "epoch": 0.05, + "grad_norm": 0.607512891292572, + "learning_rate": 0.0005996427681926406, + "loss": 4.0248, + "step": 951 + }, + { + "epoch": 0.05, + "grad_norm": 0.6879410147666931, + "learning_rate": 0.0005996420166710297, + "loss": 4.0689, + "step": 952 + }, + { + "epoch": 0.05, + "grad_norm": 0.6295230984687805, + "learning_rate": 0.0005996412643602196, + "loss": 4.2013, + "step": 953 + }, + { + "epoch": 0.05, + "grad_norm": 0.6885456442832947, + "learning_rate": 0.0005996405112602119, + "loss": 4.1025, + "step": 954 + }, + { + "epoch": 0.05, + "grad_norm": 0.5858647227287292, + "learning_rate": 0.0005996397573710087, + "loss": 4.0533, + "step": 955 + }, + { + "epoch": 0.05, + "grad_norm": 0.6196497082710266, + "learning_rate": 0.0005996390026926121, + "loss": 4.094, + "step": 956 + }, + { + "epoch": 0.05, + "grad_norm": 0.6471155285835266, + "learning_rate": 0.0005996382472250241, + "loss": 3.8297, + "step": 957 + }, + { + "epoch": 0.05, + "grad_norm": 0.6373564004898071, + "learning_rate": 0.0005996374909682465, + "loss": 4.142, + "step": 958 + }, + { + "epoch": 0.05, + "grad_norm": 0.6640136241912842, + "learning_rate": 0.0005996367339222815, + "loss": 4.2055, + "step": 959 + }, + { + "epoch": 0.05, + "grad_norm": 0.6296126842498779, + "learning_rate": 0.0005996359760871311, + "loss": 3.9527, + "step": 960 + }, + { + "epoch": 0.05, + "grad_norm": 0.6657034754753113, + "learning_rate": 0.0005996352174627971, + "loss": 4.1435, + "step": 961 + }, + { + "epoch": 0.05, + "grad_norm": 0.632871687412262, + "learning_rate": 0.0005996344580492817, + "loss": 3.8951, + "step": 962 + }, + { + "epoch": 0.05, + "grad_norm": 0.6474413275718689, + "learning_rate": 0.0005996336978465866, + "loss": 4.1062, + "step": 963 + }, + { + "epoch": 0.05, + "grad_norm": 0.5804777145385742, + "learning_rate": 0.0005996329368547141, + "loss": 3.9748, + "step": 964 + }, + { + "epoch": 0.05, + "grad_norm": 0.6162649989128113, + "learning_rate": 0.0005996321750736662, + "loss": 3.8642, + "step": 965 + }, + { + "epoch": 0.05, + "grad_norm": 0.5436090230941772, + "learning_rate": 0.0005996314125034447, + "loss": 4.0974, + "step": 966 + }, + { + "epoch": 0.05, + "grad_norm": 0.648061990737915, + "learning_rate": 0.0005996306491440517, + "loss": 3.9705, + "step": 967 + }, + { + "epoch": 0.05, + "grad_norm": 0.6477342844009399, + "learning_rate": 0.0005996298849954894, + "loss": 4.1531, + "step": 968 + }, + { + "epoch": 0.05, + "grad_norm": 0.6069393157958984, + "learning_rate": 0.0005996291200577596, + "loss": 4.0453, + "step": 969 + }, + { + "epoch": 0.05, + "grad_norm": 0.6199972629547119, + "learning_rate": 0.0005996283543308643, + "loss": 4.2185, + "step": 970 + }, + { + "epoch": 0.05, + "grad_norm": 0.6699131727218628, + "learning_rate": 0.0005996275878148055, + "loss": 4.0178, + "step": 971 + }, + { + "epoch": 0.05, + "grad_norm": 0.7701675295829773, + "learning_rate": 0.0005996268205095855, + "loss": 3.9939, + "step": 972 + }, + { + "epoch": 0.05, + "grad_norm": 0.5983409285545349, + "learning_rate": 0.000599626052415206, + "loss": 3.9987, + "step": 973 + }, + { + "epoch": 0.05, + "grad_norm": 0.6925124526023865, + "learning_rate": 0.0005996252835316692, + "loss": 3.917, + "step": 974 + }, + { + "epoch": 0.05, + "grad_norm": 0.6102199554443359, + "learning_rate": 0.000599624513858977, + "loss": 3.9685, + "step": 975 + }, + { + "epoch": 0.05, + "grad_norm": 0.6447969079017639, + "learning_rate": 0.0005996237433971315, + "loss": 4.3062, + "step": 976 + }, + { + "epoch": 0.05, + "grad_norm": 0.606885552406311, + "learning_rate": 0.0005996229721461347, + "loss": 4.2991, + "step": 977 + }, + { + "epoch": 0.05, + "grad_norm": 0.6305561065673828, + "learning_rate": 0.0005996222001059887, + "loss": 4.1132, + "step": 978 + }, + { + "epoch": 0.05, + "grad_norm": 0.6146825551986694, + "learning_rate": 0.0005996214272766955, + "loss": 3.8875, + "step": 979 + }, + { + "epoch": 0.05, + "grad_norm": 0.6634001731872559, + "learning_rate": 0.0005996206536582571, + "loss": 3.9914, + "step": 980 + }, + { + "epoch": 0.05, + "grad_norm": 0.623626708984375, + "learning_rate": 0.0005996198792506755, + "loss": 4.1724, + "step": 981 + }, + { + "epoch": 0.05, + "grad_norm": 0.6683825850486755, + "learning_rate": 0.000599619104053953, + "loss": 4.0585, + "step": 982 + }, + { + "epoch": 0.05, + "grad_norm": 0.5538197159767151, + "learning_rate": 0.0005996183280680914, + "loss": 3.9352, + "step": 983 + }, + { + "epoch": 0.05, + "grad_norm": 0.6251077055931091, + "learning_rate": 0.0005996175512930927, + "loss": 4.3014, + "step": 984 + }, + { + "epoch": 0.05, + "grad_norm": 0.593470573425293, + "learning_rate": 0.0005996167737289589, + "loss": 4.0789, + "step": 985 + }, + { + "epoch": 0.05, + "grad_norm": 0.6242483258247375, + "learning_rate": 0.0005996159953756923, + "loss": 4.0246, + "step": 986 + }, + { + "epoch": 0.05, + "grad_norm": 0.6253484487533569, + "learning_rate": 0.0005996152162332949, + "loss": 3.9758, + "step": 987 + }, + { + "epoch": 0.05, + "grad_norm": 0.6085364818572998, + "learning_rate": 0.0005996144363017686, + "loss": 3.8767, + "step": 988 + }, + { + "epoch": 0.05, + "grad_norm": 0.6171862483024597, + "learning_rate": 0.0005996136555811156, + "loss": 4.01, + "step": 989 + }, + { + "epoch": 0.05, + "grad_norm": 0.5950732231140137, + "learning_rate": 0.0005996128740713379, + "loss": 3.7366, + "step": 990 + }, + { + "epoch": 0.05, + "grad_norm": 0.6203235983848572, + "learning_rate": 0.0005996120917724374, + "loss": 4.0769, + "step": 991 + }, + { + "epoch": 0.05, + "grad_norm": 0.608241856098175, + "learning_rate": 0.0005996113086844165, + "loss": 3.8565, + "step": 992 + }, + { + "epoch": 0.05, + "grad_norm": 0.612503707408905, + "learning_rate": 0.0005996105248072769, + "loss": 4.1548, + "step": 993 + }, + { + "epoch": 0.05, + "grad_norm": 0.5867249369621277, + "learning_rate": 0.000599609740141021, + "loss": 3.8754, + "step": 994 + }, + { + "epoch": 0.05, + "grad_norm": 0.6069759726524353, + "learning_rate": 0.0005996089546856507, + "loss": 4.0749, + "step": 995 + }, + { + "epoch": 0.05, + "grad_norm": 0.5676235556602478, + "learning_rate": 0.000599608168441168, + "loss": 3.8812, + "step": 996 + }, + { + "epoch": 0.05, + "grad_norm": 0.6222755908966064, + "learning_rate": 0.0005996073814075751, + "loss": 3.8984, + "step": 997 + }, + { + "epoch": 0.05, + "grad_norm": 0.5938562750816345, + "learning_rate": 0.0005996065935848739, + "loss": 3.961, + "step": 998 + }, + { + "epoch": 0.05, + "grad_norm": 0.5748042464256287, + "learning_rate": 0.0005996058049730666, + "loss": 3.823, + "step": 999 + }, + { + "epoch": 0.05, + "grad_norm": 0.6213597059249878, + "learning_rate": 0.0005996050155721553, + "loss": 3.9996, + "step": 1000 + }, + { + "epoch": 0.05, + "grad_norm": 0.622829258441925, + "learning_rate": 0.0005996042253821422, + "loss": 3.9153, + "step": 1001 + }, + { + "epoch": 0.05, + "grad_norm": 0.5811119675636292, + "learning_rate": 0.000599603434403029, + "loss": 4.0641, + "step": 1002 + }, + { + "epoch": 0.05, + "grad_norm": 0.6401235461235046, + "learning_rate": 0.0005996026426348183, + "loss": 4.0072, + "step": 1003 + }, + { + "epoch": 0.05, + "grad_norm": 0.6247691512107849, + "learning_rate": 0.0005996018500775117, + "loss": 4.0948, + "step": 1004 + }, + { + "epoch": 0.05, + "grad_norm": 0.8084251284599304, + "learning_rate": 0.0005996010567311115, + "loss": 4.232, + "step": 1005 + }, + { + "epoch": 0.05, + "grad_norm": 0.611668586730957, + "learning_rate": 0.0005996002625956198, + "loss": 3.9149, + "step": 1006 + }, + { + "epoch": 0.05, + "grad_norm": 0.6284043788909912, + "learning_rate": 0.0005995994676710386, + "loss": 4.1114, + "step": 1007 + }, + { + "epoch": 0.05, + "grad_norm": 0.6413992047309875, + "learning_rate": 0.0005995986719573701, + "loss": 4.0627, + "step": 1008 + }, + { + "epoch": 0.05, + "grad_norm": 0.769960880279541, + "learning_rate": 0.0005995978754546165, + "loss": 4.0868, + "step": 1009 + }, + { + "epoch": 0.05, + "grad_norm": 0.6613034605979919, + "learning_rate": 0.0005995970781627795, + "loss": 3.9804, + "step": 1010 + }, + { + "epoch": 0.05, + "grad_norm": 0.6815891265869141, + "learning_rate": 0.0005995962800818617, + "loss": 3.9572, + "step": 1011 + }, + { + "epoch": 0.05, + "grad_norm": 0.6836370825767517, + "learning_rate": 0.0005995954812118648, + "loss": 3.9664, + "step": 1012 + }, + { + "epoch": 0.05, + "grad_norm": 0.6569324731826782, + "learning_rate": 0.000599594681552791, + "loss": 4.1149, + "step": 1013 + }, + { + "epoch": 0.05, + "grad_norm": 0.6019830703735352, + "learning_rate": 0.0005995938811046426, + "loss": 3.9853, + "step": 1014 + }, + { + "epoch": 0.05, + "grad_norm": 0.612848699092865, + "learning_rate": 0.0005995930798674216, + "loss": 4.155, + "step": 1015 + }, + { + "epoch": 0.05, + "grad_norm": 0.5986717343330383, + "learning_rate": 0.00059959227784113, + "loss": 4.0356, + "step": 1016 + }, + { + "epoch": 0.05, + "grad_norm": 0.6132051348686218, + "learning_rate": 0.0005995914750257701, + "loss": 3.7446, + "step": 1017 + }, + { + "epoch": 0.05, + "grad_norm": 0.6531481742858887, + "learning_rate": 0.0005995906714213439, + "loss": 3.7868, + "step": 1018 + }, + { + "epoch": 0.05, + "grad_norm": 0.6747217178344727, + "learning_rate": 0.0005995898670278535, + "loss": 4.0415, + "step": 1019 + }, + { + "epoch": 0.05, + "grad_norm": 0.6562297344207764, + "learning_rate": 0.0005995890618453009, + "loss": 3.7626, + "step": 1020 + }, + { + "epoch": 0.05, + "grad_norm": 0.8349602222442627, + "learning_rate": 0.0005995882558736885, + "loss": 3.96, + "step": 1021 + }, + { + "epoch": 0.05, + "grad_norm": 0.7195594906806946, + "learning_rate": 0.0005995874491130183, + "loss": 3.9682, + "step": 1022 + }, + { + "epoch": 0.05, + "grad_norm": 0.5842819809913635, + "learning_rate": 0.0005995866415632924, + "loss": 4.1458, + "step": 1023 + }, + { + "epoch": 0.05, + "grad_norm": 0.6021990776062012, + "learning_rate": 0.0005995858332245129, + "loss": 3.7984, + "step": 1024 + }, + { + "epoch": 0.05, + "grad_norm": 0.6178818941116333, + "learning_rate": 0.000599585024096682, + "loss": 3.95, + "step": 1025 + }, + { + "epoch": 0.05, + "grad_norm": 0.7198466062545776, + "learning_rate": 0.0005995842141798018, + "loss": 3.782, + "step": 1026 + }, + { + "epoch": 0.05, + "grad_norm": 0.6762264370918274, + "learning_rate": 0.0005995834034738744, + "loss": 4.1151, + "step": 1027 + }, + { + "epoch": 0.05, + "grad_norm": 0.6355637311935425, + "learning_rate": 0.0005995825919789021, + "loss": 4.1855, + "step": 1028 + }, + { + "epoch": 0.05, + "grad_norm": 0.6325914859771729, + "learning_rate": 0.0005995817796948869, + "loss": 3.8144, + "step": 1029 + }, + { + "epoch": 0.05, + "grad_norm": 0.6539778709411621, + "learning_rate": 0.0005995809666218308, + "loss": 4.0136, + "step": 1030 + }, + { + "epoch": 0.05, + "grad_norm": 0.701380729675293, + "learning_rate": 0.0005995801527597362, + "loss": 4.0071, + "step": 1031 + }, + { + "epoch": 0.05, + "grad_norm": 0.617536723613739, + "learning_rate": 0.0005995793381086051, + "loss": 3.9345, + "step": 1032 + }, + { + "epoch": 0.05, + "grad_norm": 0.5975783467292786, + "learning_rate": 0.0005995785226684397, + "loss": 3.9294, + "step": 1033 + }, + { + "epoch": 0.05, + "grad_norm": 0.5805318355560303, + "learning_rate": 0.0005995777064392421, + "loss": 4.0045, + "step": 1034 + }, + { + "epoch": 0.05, + "grad_norm": 0.6723288893699646, + "learning_rate": 0.0005995768894210144, + "loss": 3.9988, + "step": 1035 + }, + { + "epoch": 0.05, + "grad_norm": 0.7065977454185486, + "learning_rate": 0.000599576071613759, + "loss": 3.8351, + "step": 1036 + }, + { + "epoch": 0.05, + "grad_norm": 0.7041149139404297, + "learning_rate": 0.0005995752530174777, + "loss": 3.907, + "step": 1037 + }, + { + "epoch": 0.05, + "grad_norm": 0.6054835319519043, + "learning_rate": 0.0005995744336321729, + "loss": 3.8096, + "step": 1038 + }, + { + "epoch": 0.05, + "grad_norm": 0.6375568509101868, + "learning_rate": 0.0005995736134578468, + "loss": 3.9835, + "step": 1039 + }, + { + "epoch": 0.05, + "grad_norm": 0.6124348640441895, + "learning_rate": 0.0005995727924945014, + "loss": 3.9451, + "step": 1040 + }, + { + "epoch": 0.05, + "grad_norm": 0.787834644317627, + "learning_rate": 0.0005995719707421388, + "loss": 3.9742, + "step": 1041 + }, + { + "epoch": 0.05, + "grad_norm": 0.6277994513511658, + "learning_rate": 0.0005995711482007615, + "loss": 3.9475, + "step": 1042 + }, + { + "epoch": 0.05, + "grad_norm": 0.6354180574417114, + "learning_rate": 0.0005995703248703713, + "loss": 4.2671, + "step": 1043 + }, + { + "epoch": 0.05, + "grad_norm": 0.5767564177513123, + "learning_rate": 0.0005995695007509705, + "loss": 4.0588, + "step": 1044 + }, + { + "epoch": 0.05, + "grad_norm": 0.6037499904632568, + "learning_rate": 0.0005995686758425614, + "loss": 3.8606, + "step": 1045 + }, + { + "epoch": 0.05, + "grad_norm": 0.7180562019348145, + "learning_rate": 0.000599567850145146, + "loss": 3.6251, + "step": 1046 + }, + { + "epoch": 0.05, + "grad_norm": 0.6226548552513123, + "learning_rate": 0.0005995670236587265, + "loss": 3.8062, + "step": 1047 + }, + { + "epoch": 0.05, + "grad_norm": 0.6276943683624268, + "learning_rate": 0.0005995661963833051, + "loss": 4.17, + "step": 1048 + }, + { + "epoch": 0.05, + "grad_norm": 0.6406780481338501, + "learning_rate": 0.0005995653683188841, + "loss": 4.0168, + "step": 1049 + }, + { + "epoch": 0.05, + "grad_norm": 0.660855770111084, + "learning_rate": 0.0005995645394654655, + "loss": 3.7325, + "step": 1050 + }, + { + "epoch": 0.05, + "grad_norm": 0.7150101661682129, + "learning_rate": 0.0005995637098230516, + "loss": 4.2903, + "step": 1051 + }, + { + "epoch": 0.05, + "grad_norm": 0.6027021408081055, + "learning_rate": 0.0005995628793916445, + "loss": 3.9363, + "step": 1052 + }, + { + "epoch": 0.05, + "grad_norm": 0.6147125959396362, + "learning_rate": 0.0005995620481712464, + "loss": 4.002, + "step": 1053 + }, + { + "epoch": 0.05, + "grad_norm": 0.6540363430976868, + "learning_rate": 0.0005995612161618595, + "loss": 4.1231, + "step": 1054 + }, + { + "epoch": 0.05, + "grad_norm": 0.7382659316062927, + "learning_rate": 0.0005995603833634862, + "loss": 3.7431, + "step": 1055 + }, + { + "epoch": 0.05, + "grad_norm": 0.6010420918464661, + "learning_rate": 0.0005995595497761283, + "loss": 4.0042, + "step": 1056 + }, + { + "epoch": 0.05, + "grad_norm": 0.6575014591217041, + "learning_rate": 0.0005995587153997882, + "loss": 3.9075, + "step": 1057 + }, + { + "epoch": 0.05, + "grad_norm": 0.6506360769271851, + "learning_rate": 0.0005995578802344682, + "loss": 4.0998, + "step": 1058 + }, + { + "epoch": 0.05, + "grad_norm": 0.670628547668457, + "learning_rate": 0.0005995570442801703, + "loss": 3.9776, + "step": 1059 + }, + { + "epoch": 0.05, + "grad_norm": 0.6330320239067078, + "learning_rate": 0.0005995562075368969, + "loss": 3.9504, + "step": 1060 + }, + { + "epoch": 0.05, + "grad_norm": 0.604716956615448, + "learning_rate": 0.0005995553700046501, + "loss": 3.8521, + "step": 1061 + }, + { + "epoch": 0.05, + "grad_norm": 0.6562084555625916, + "learning_rate": 0.000599554531683432, + "loss": 4.1304, + "step": 1062 + }, + { + "epoch": 0.05, + "grad_norm": 0.6587114334106445, + "learning_rate": 0.000599553692573245, + "loss": 3.8732, + "step": 1063 + }, + { + "epoch": 0.05, + "grad_norm": 0.6262410283088684, + "learning_rate": 0.0005995528526740911, + "loss": 3.9372, + "step": 1064 + }, + { + "epoch": 0.05, + "grad_norm": 0.6062853336334229, + "learning_rate": 0.0005995520119859727, + "loss": 4.017, + "step": 1065 + }, + { + "epoch": 0.05, + "grad_norm": 0.6965979337692261, + "learning_rate": 0.000599551170508892, + "loss": 3.9419, + "step": 1066 + }, + { + "epoch": 0.05, + "grad_norm": 0.6158636212348938, + "learning_rate": 0.0005995503282428512, + "loss": 3.903, + "step": 1067 + }, + { + "epoch": 0.05, + "grad_norm": 0.6285214424133301, + "learning_rate": 0.0005995494851878524, + "loss": 4.057, + "step": 1068 + }, + { + "epoch": 0.05, + "grad_norm": 0.7115497589111328, + "learning_rate": 0.0005995486413438979, + "loss": 3.8139, + "step": 1069 + }, + { + "epoch": 0.05, + "grad_norm": 0.5884736776351929, + "learning_rate": 0.00059954779671099, + "loss": 3.9513, + "step": 1070 + }, + { + "epoch": 0.05, + "grad_norm": 0.6723683476448059, + "learning_rate": 0.0005995469512891308, + "loss": 3.9108, + "step": 1071 + }, + { + "epoch": 0.05, + "grad_norm": 0.605546236038208, + "learning_rate": 0.0005995461050783226, + "loss": 4.0745, + "step": 1072 + }, + { + "epoch": 0.05, + "grad_norm": 0.5845876336097717, + "learning_rate": 0.0005995452580785676, + "loss": 4.0279, + "step": 1073 + }, + { + "epoch": 0.05, + "grad_norm": 0.6146058440208435, + "learning_rate": 0.000599544410289868, + "loss": 4.0084, + "step": 1074 + }, + { + "epoch": 0.05, + "grad_norm": 0.6121573448181152, + "learning_rate": 0.0005995435617122261, + "loss": 3.9437, + "step": 1075 + }, + { + "epoch": 0.05, + "grad_norm": 0.6386142373085022, + "learning_rate": 0.0005995427123456441, + "loss": 4.1899, + "step": 1076 + }, + { + "epoch": 0.05, + "grad_norm": 0.607537567615509, + "learning_rate": 0.0005995418621901243, + "loss": 3.7694, + "step": 1077 + }, + { + "epoch": 0.05, + "grad_norm": 0.8437908291816711, + "learning_rate": 0.0005995410112456688, + "loss": 3.7674, + "step": 1078 + }, + { + "epoch": 0.05, + "grad_norm": 0.6184353232383728, + "learning_rate": 0.00059954015951228, + "loss": 3.8997, + "step": 1079 + }, + { + "epoch": 0.05, + "grad_norm": 0.6318781971931458, + "learning_rate": 0.0005995393069899601, + "loss": 3.9576, + "step": 1080 + }, + { + "epoch": 0.05, + "grad_norm": 0.6466130614280701, + "learning_rate": 0.0005995384536787112, + "loss": 4.0041, + "step": 1081 + }, + { + "epoch": 0.05, + "grad_norm": 0.668156087398529, + "learning_rate": 0.0005995375995785357, + "loss": 4.0425, + "step": 1082 + }, + { + "epoch": 0.05, + "grad_norm": 0.5986151695251465, + "learning_rate": 0.0005995367446894358, + "loss": 3.7717, + "step": 1083 + }, + { + "epoch": 0.05, + "grad_norm": 0.6351150870323181, + "learning_rate": 0.0005995358890114137, + "loss": 4.0434, + "step": 1084 + }, + { + "epoch": 0.05, + "grad_norm": 0.6070703864097595, + "learning_rate": 0.0005995350325444719, + "loss": 4.0091, + "step": 1085 + }, + { + "epoch": 0.05, + "grad_norm": 0.7131597399711609, + "learning_rate": 0.0005995341752886124, + "loss": 3.9062, + "step": 1086 + }, + { + "epoch": 0.05, + "grad_norm": 0.6343297958374023, + "learning_rate": 0.0005995333172438374, + "loss": 4.2335, + "step": 1087 + }, + { + "epoch": 0.05, + "grad_norm": 0.6075549721717834, + "learning_rate": 0.0005995324584101494, + "loss": 3.9601, + "step": 1088 + }, + { + "epoch": 0.05, + "grad_norm": 0.5800171494483948, + "learning_rate": 0.0005995315987875505, + "loss": 4.0497, + "step": 1089 + }, + { + "epoch": 0.05, + "grad_norm": 0.5841231346130371, + "learning_rate": 0.0005995307383760431, + "loss": 3.9826, + "step": 1090 + }, + { + "epoch": 0.05, + "grad_norm": 0.6168744564056396, + "learning_rate": 0.0005995298771756293, + "loss": 4.2034, + "step": 1091 + }, + { + "epoch": 0.05, + "grad_norm": 0.660862922668457, + "learning_rate": 0.0005995290151863114, + "loss": 4.1355, + "step": 1092 + }, + { + "epoch": 0.05, + "grad_norm": 0.7201418280601501, + "learning_rate": 0.0005995281524080919, + "loss": 3.7802, + "step": 1093 + }, + { + "epoch": 0.05, + "grad_norm": 0.6478432416915894, + "learning_rate": 0.0005995272888409728, + "loss": 3.8778, + "step": 1094 + }, + { + "epoch": 0.05, + "grad_norm": 0.6431102156639099, + "learning_rate": 0.0005995264244849564, + "loss": 3.8396, + "step": 1095 + }, + { + "epoch": 0.05, + "grad_norm": 0.6155719757080078, + "learning_rate": 0.0005995255593400452, + "loss": 3.9207, + "step": 1096 + }, + { + "epoch": 0.05, + "grad_norm": 0.6385898590087891, + "learning_rate": 0.0005995246934062412, + "loss": 3.9631, + "step": 1097 + }, + { + "epoch": 0.05, + "grad_norm": 0.6196228265762329, + "learning_rate": 0.0005995238266835468, + "loss": 3.9081, + "step": 1098 + }, + { + "epoch": 0.05, + "grad_norm": 0.6495975255966187, + "learning_rate": 0.0005995229591719643, + "loss": 4.0299, + "step": 1099 + }, + { + "epoch": 0.05, + "grad_norm": 0.6047267913818359, + "learning_rate": 0.000599522090871496, + "loss": 4.1038, + "step": 1100 + }, + { + "epoch": 0.05, + "grad_norm": 0.5929258465766907, + "learning_rate": 0.0005995212217821441, + "loss": 4.0514, + "step": 1101 + }, + { + "epoch": 0.05, + "grad_norm": 0.6673069000244141, + "learning_rate": 0.000599520351903911, + "loss": 3.8946, + "step": 1102 + }, + { + "epoch": 0.05, + "grad_norm": 0.597668468952179, + "learning_rate": 0.000599519481236799, + "loss": 3.8061, + "step": 1103 + }, + { + "epoch": 0.05, + "grad_norm": 0.6228914260864258, + "learning_rate": 0.0005995186097808102, + "loss": 4.0257, + "step": 1104 + }, + { + "epoch": 0.05, + "grad_norm": 0.6423768997192383, + "learning_rate": 0.0005995177375359471, + "loss": 4.0893, + "step": 1105 + }, + { + "epoch": 0.05, + "grad_norm": 0.6223159432411194, + "learning_rate": 0.0005995168645022118, + "loss": 4.0435, + "step": 1106 + }, + { + "epoch": 0.05, + "grad_norm": 0.6253622174263, + "learning_rate": 0.0005995159906796068, + "loss": 3.755, + "step": 1107 + }, + { + "epoch": 0.05, + "grad_norm": 0.6040705442428589, + "learning_rate": 0.0005995151160681344, + "loss": 4.1872, + "step": 1108 + }, + { + "epoch": 0.05, + "grad_norm": 0.5690031051635742, + "learning_rate": 0.0005995142406677967, + "loss": 3.9943, + "step": 1109 + }, + { + "epoch": 0.05, + "grad_norm": 0.6597207188606262, + "learning_rate": 0.0005995133644785963, + "loss": 3.9716, + "step": 1110 + }, + { + "epoch": 0.05, + "grad_norm": 0.6696934103965759, + "learning_rate": 0.0005995124875005352, + "loss": 3.759, + "step": 1111 + }, + { + "epoch": 0.05, + "grad_norm": 0.6375954747200012, + "learning_rate": 0.000599511609733616, + "loss": 3.8252, + "step": 1112 + }, + { + "epoch": 0.05, + "grad_norm": 0.636753499507904, + "learning_rate": 0.0005995107311778406, + "loss": 3.9468, + "step": 1113 + }, + { + "epoch": 0.05, + "grad_norm": 0.5825153589248657, + "learning_rate": 0.0005995098518332117, + "loss": 3.8568, + "step": 1114 + }, + { + "epoch": 0.05, + "grad_norm": 0.6210393309593201, + "learning_rate": 0.0005995089716997316, + "loss": 4.1555, + "step": 1115 + }, + { + "epoch": 0.05, + "grad_norm": 0.6214787364006042, + "learning_rate": 0.0005995080907774024, + "loss": 3.9536, + "step": 1116 + }, + { + "epoch": 0.05, + "grad_norm": 0.678767740726471, + "learning_rate": 0.0005995072090662267, + "loss": 3.835, + "step": 1117 + }, + { + "epoch": 0.05, + "grad_norm": 0.6073644161224365, + "learning_rate": 0.0005995063265662063, + "loss": 4.2358, + "step": 1118 + }, + { + "epoch": 0.05, + "grad_norm": 0.6090550422668457, + "learning_rate": 0.0005995054432773442, + "loss": 3.9443, + "step": 1119 + }, + { + "epoch": 0.05, + "grad_norm": 0.715369462966919, + "learning_rate": 0.0005995045591996423, + "loss": 4.0607, + "step": 1120 + }, + { + "epoch": 0.05, + "grad_norm": 0.5823421478271484, + "learning_rate": 0.0005995036743331031, + "loss": 4.0344, + "step": 1121 + }, + { + "epoch": 0.05, + "grad_norm": 0.6796244382858276, + "learning_rate": 0.0005995027886777287, + "loss": 3.9048, + "step": 1122 + }, + { + "epoch": 0.06, + "grad_norm": 0.5604933500289917, + "learning_rate": 0.0005995019022335217, + "loss": 4.0935, + "step": 1123 + }, + { + "epoch": 0.06, + "grad_norm": 0.6292743682861328, + "learning_rate": 0.0005995010150004844, + "loss": 3.9694, + "step": 1124 + }, + { + "epoch": 0.06, + "grad_norm": 0.6105656623840332, + "learning_rate": 0.0005995001269786189, + "loss": 3.8372, + "step": 1125 + }, + { + "epoch": 0.06, + "grad_norm": 0.6465719938278198, + "learning_rate": 0.0005994992381679277, + "loss": 3.7912, + "step": 1126 + }, + { + "epoch": 0.06, + "grad_norm": 0.704645574092865, + "learning_rate": 0.0005994983485684133, + "loss": 3.8925, + "step": 1127 + }, + { + "epoch": 0.06, + "grad_norm": 0.6321631073951721, + "learning_rate": 0.0005994974581800779, + "loss": 4.1433, + "step": 1128 + }, + { + "epoch": 0.06, + "grad_norm": 0.6182036399841309, + "learning_rate": 0.0005994965670029237, + "loss": 3.9114, + "step": 1129 + }, + { + "epoch": 0.06, + "grad_norm": 0.6689995527267456, + "learning_rate": 0.0005994956750369532, + "loss": 4.0865, + "step": 1130 + }, + { + "epoch": 0.06, + "grad_norm": 0.6217949986457825, + "learning_rate": 0.0005994947822821688, + "loss": 4.1664, + "step": 1131 + }, + { + "epoch": 0.06, + "grad_norm": 0.608690083026886, + "learning_rate": 0.0005994938887385727, + "loss": 4.0256, + "step": 1132 + }, + { + "epoch": 0.06, + "grad_norm": 0.6573523283004761, + "learning_rate": 0.0005994929944061675, + "loss": 3.9891, + "step": 1133 + }, + { + "epoch": 0.06, + "grad_norm": 0.6570764780044556, + "learning_rate": 0.0005994920992849552, + "loss": 3.9587, + "step": 1134 + }, + { + "epoch": 0.06, + "grad_norm": 0.5835046172142029, + "learning_rate": 0.0005994912033749385, + "loss": 4.1042, + "step": 1135 + }, + { + "epoch": 0.06, + "grad_norm": 0.6346054077148438, + "learning_rate": 0.0005994903066761196, + "loss": 4.0866, + "step": 1136 + }, + { + "epoch": 0.06, + "grad_norm": 0.6046596169471741, + "learning_rate": 0.0005994894091885007, + "loss": 3.9774, + "step": 1137 + }, + { + "epoch": 0.06, + "grad_norm": 0.6285713911056519, + "learning_rate": 0.0005994885109120845, + "loss": 3.9324, + "step": 1138 + }, + { + "epoch": 0.06, + "grad_norm": 0.5951323509216309, + "learning_rate": 0.0005994876118468731, + "loss": 3.8939, + "step": 1139 + }, + { + "epoch": 0.06, + "grad_norm": 0.6478222012519836, + "learning_rate": 0.0005994867119928689, + "loss": 3.9917, + "step": 1140 + }, + { + "epoch": 0.06, + "grad_norm": 0.6328380703926086, + "learning_rate": 0.0005994858113500746, + "loss": 3.8191, + "step": 1141 + }, + { + "epoch": 0.06, + "grad_norm": 0.6400364637374878, + "learning_rate": 0.0005994849099184921, + "loss": 4.0963, + "step": 1142 + }, + { + "epoch": 0.06, + "grad_norm": 0.5783833861351013, + "learning_rate": 0.000599484007698124, + "loss": 3.9709, + "step": 1143 + }, + { + "epoch": 0.06, + "grad_norm": 0.6741927862167358, + "learning_rate": 0.0005994831046889727, + "loss": 3.8906, + "step": 1144 + }, + { + "epoch": 0.06, + "grad_norm": 0.6302924156188965, + "learning_rate": 0.0005994822008910406, + "loss": 3.7855, + "step": 1145 + }, + { + "epoch": 0.06, + "grad_norm": 0.5713663101196289, + "learning_rate": 0.0005994812963043299, + "loss": 3.963, + "step": 1146 + }, + { + "epoch": 0.06, + "grad_norm": 0.5913187265396118, + "learning_rate": 0.0005994803909288431, + "loss": 3.9689, + "step": 1147 + }, + { + "epoch": 0.06, + "grad_norm": 0.6074280738830566, + "learning_rate": 0.0005994794847645827, + "loss": 3.9716, + "step": 1148 + }, + { + "epoch": 0.06, + "grad_norm": 0.626340925693512, + "learning_rate": 0.000599478577811551, + "loss": 4.1273, + "step": 1149 + }, + { + "epoch": 0.06, + "grad_norm": 0.5834325551986694, + "learning_rate": 0.0005994776700697503, + "loss": 3.8578, + "step": 1150 + }, + { + "epoch": 0.06, + "grad_norm": 0.621330738067627, + "learning_rate": 0.000599476761539183, + "loss": 3.8196, + "step": 1151 + }, + { + "epoch": 0.06, + "grad_norm": 0.6001361608505249, + "learning_rate": 0.0005994758522198516, + "loss": 3.9403, + "step": 1152 + }, + { + "epoch": 0.06, + "grad_norm": 0.6255820393562317, + "learning_rate": 0.0005994749421117584, + "loss": 4.1259, + "step": 1153 + }, + { + "epoch": 0.06, + "grad_norm": 0.631493330001831, + "learning_rate": 0.0005994740312149058, + "loss": 4.104, + "step": 1154 + }, + { + "epoch": 0.06, + "grad_norm": 0.6368297934532166, + "learning_rate": 0.0005994731195292964, + "loss": 4.0128, + "step": 1155 + }, + { + "epoch": 0.06, + "grad_norm": 0.6331319808959961, + "learning_rate": 0.0005994722070549323, + "loss": 3.8825, + "step": 1156 + }, + { + "epoch": 0.06, + "grad_norm": 0.5888481736183167, + "learning_rate": 0.000599471293791816, + "loss": 4.1243, + "step": 1157 + }, + { + "epoch": 0.06, + "grad_norm": 0.5803425312042236, + "learning_rate": 0.00059947037973995, + "loss": 3.9828, + "step": 1158 + }, + { + "epoch": 0.06, + "grad_norm": 0.5989425182342529, + "learning_rate": 0.0005994694648993368, + "loss": 3.7897, + "step": 1159 + }, + { + "epoch": 0.06, + "grad_norm": 0.6215832233428955, + "learning_rate": 0.0005994685492699785, + "loss": 3.6121, + "step": 1160 + }, + { + "epoch": 0.06, + "grad_norm": 0.6019163131713867, + "learning_rate": 0.0005994676328518778, + "loss": 4.244, + "step": 1161 + }, + { + "epoch": 0.06, + "grad_norm": 0.5976054668426514, + "learning_rate": 0.0005994667156450368, + "loss": 4.1499, + "step": 1162 + }, + { + "epoch": 0.06, + "grad_norm": 0.6580859422683716, + "learning_rate": 0.0005994657976494583, + "loss": 3.814, + "step": 1163 + }, + { + "epoch": 0.06, + "grad_norm": 0.6277784705162048, + "learning_rate": 0.0005994648788651445, + "loss": 4.0219, + "step": 1164 + }, + { + "epoch": 0.06, + "grad_norm": 0.6280736327171326, + "learning_rate": 0.0005994639592920976, + "loss": 4.0019, + "step": 1165 + }, + { + "epoch": 0.06, + "grad_norm": 0.7152631878852844, + "learning_rate": 0.0005994630389303205, + "loss": 3.9115, + "step": 1166 + }, + { + "epoch": 0.06, + "grad_norm": 0.6251851916313171, + "learning_rate": 0.0005994621177798153, + "loss": 4.104, + "step": 1167 + }, + { + "epoch": 0.06, + "grad_norm": 0.6360085606575012, + "learning_rate": 0.0005994611958405846, + "loss": 3.9413, + "step": 1168 + }, + { + "epoch": 0.06, + "grad_norm": 0.5834187865257263, + "learning_rate": 0.0005994602731126306, + "loss": 3.8453, + "step": 1169 + }, + { + "epoch": 0.06, + "grad_norm": 0.6263565421104431, + "learning_rate": 0.000599459349595956, + "loss": 3.9509, + "step": 1170 + }, + { + "epoch": 0.06, + "grad_norm": 0.6582068204879761, + "learning_rate": 0.000599458425290563, + "loss": 3.7823, + "step": 1171 + }, + { + "epoch": 0.06, + "grad_norm": 0.6431542038917542, + "learning_rate": 0.0005994575001964542, + "loss": 3.9593, + "step": 1172 + }, + { + "epoch": 0.06, + "grad_norm": 0.6388793587684631, + "learning_rate": 0.0005994565743136321, + "loss": 3.8094, + "step": 1173 + }, + { + "epoch": 0.06, + "grad_norm": 0.6203798055648804, + "learning_rate": 0.0005994556476420988, + "loss": 3.8218, + "step": 1174 + }, + { + "epoch": 0.06, + "grad_norm": 0.6218488812446594, + "learning_rate": 0.0005994547201818571, + "loss": 3.8657, + "step": 1175 + }, + { + "epoch": 0.06, + "grad_norm": 0.6742687225341797, + "learning_rate": 0.0005994537919329092, + "loss": 3.8143, + "step": 1176 + }, + { + "epoch": 0.06, + "grad_norm": 0.6174846887588501, + "learning_rate": 0.0005994528628952577, + "loss": 3.8962, + "step": 1177 + }, + { + "epoch": 0.06, + "grad_norm": 0.7459059357643127, + "learning_rate": 0.000599451933068905, + "loss": 4.0231, + "step": 1178 + }, + { + "epoch": 0.06, + "grad_norm": 0.602178692817688, + "learning_rate": 0.0005994510024538535, + "loss": 4.0269, + "step": 1179 + }, + { + "epoch": 0.06, + "grad_norm": 0.6028525233268738, + "learning_rate": 0.0005994500710501058, + "loss": 4.0828, + "step": 1180 + }, + { + "epoch": 0.06, + "grad_norm": 0.6567595601081848, + "learning_rate": 0.0005994491388576641, + "loss": 4.0124, + "step": 1181 + }, + { + "epoch": 0.06, + "grad_norm": 0.6130238771438599, + "learning_rate": 0.000599448205876531, + "loss": 4.0882, + "step": 1182 + }, + { + "epoch": 0.06, + "grad_norm": 0.5951813459396362, + "learning_rate": 0.000599447272106709, + "loss": 3.8659, + "step": 1183 + }, + { + "epoch": 0.06, + "grad_norm": 0.6704297065734863, + "learning_rate": 0.0005994463375482006, + "loss": 4.0565, + "step": 1184 + }, + { + "epoch": 0.06, + "grad_norm": 0.6474865674972534, + "learning_rate": 0.0005994454022010081, + "loss": 4.048, + "step": 1185 + }, + { + "epoch": 0.06, + "grad_norm": 0.5812511444091797, + "learning_rate": 0.0005994444660651339, + "loss": 3.8526, + "step": 1186 + }, + { + "epoch": 0.06, + "grad_norm": 0.5992949604988098, + "learning_rate": 0.0005994435291405808, + "loss": 3.9835, + "step": 1187 + }, + { + "epoch": 0.06, + "grad_norm": 0.6317020654678345, + "learning_rate": 0.0005994425914273509, + "loss": 3.8099, + "step": 1188 + }, + { + "epoch": 0.06, + "grad_norm": 0.6197634339332581, + "learning_rate": 0.000599441652925447, + "loss": 3.8242, + "step": 1189 + }, + { + "epoch": 0.06, + "grad_norm": 0.6547470092773438, + "learning_rate": 0.0005994407136348715, + "loss": 3.9396, + "step": 1190 + }, + { + "epoch": 0.06, + "grad_norm": 0.594266951084137, + "learning_rate": 0.0005994397735556267, + "loss": 3.9765, + "step": 1191 + }, + { + "epoch": 0.06, + "grad_norm": 0.647244930267334, + "learning_rate": 0.000599438832687715, + "loss": 3.8405, + "step": 1192 + }, + { + "epoch": 0.06, + "grad_norm": 0.6598258018493652, + "learning_rate": 0.0005994378910311393, + "loss": 3.7971, + "step": 1193 + }, + { + "epoch": 0.06, + "grad_norm": 0.6706022620201111, + "learning_rate": 0.0005994369485859016, + "loss": 3.9114, + "step": 1194 + }, + { + "epoch": 0.06, + "grad_norm": 0.5980822443962097, + "learning_rate": 0.0005994360053520047, + "loss": 3.8791, + "step": 1195 + }, + { + "epoch": 0.06, + "grad_norm": 0.6569104194641113, + "learning_rate": 0.000599435061329451, + "loss": 3.9084, + "step": 1196 + }, + { + "epoch": 0.06, + "grad_norm": 0.6087574362754822, + "learning_rate": 0.000599434116518243, + "loss": 3.7479, + "step": 1197 + }, + { + "epoch": 0.06, + "grad_norm": 0.6148558259010315, + "learning_rate": 0.0005994331709183831, + "loss": 3.9402, + "step": 1198 + }, + { + "epoch": 0.06, + "grad_norm": 0.6539002656936646, + "learning_rate": 0.0005994322245298739, + "loss": 3.9145, + "step": 1199 + }, + { + "epoch": 0.06, + "grad_norm": 0.6310211420059204, + "learning_rate": 0.0005994312773527178, + "loss": 3.8744, + "step": 1200 + }, + { + "epoch": 0.06, + "grad_norm": 0.630675196647644, + "learning_rate": 0.0005994303293869175, + "loss": 3.8331, + "step": 1201 + }, + { + "epoch": 0.06, + "grad_norm": 0.5651322603225708, + "learning_rate": 0.000599429380632475, + "loss": 4.0296, + "step": 1202 + }, + { + "epoch": 0.06, + "grad_norm": 0.5851694941520691, + "learning_rate": 0.0005994284310893934, + "loss": 3.6882, + "step": 1203 + }, + { + "epoch": 0.06, + "grad_norm": 0.6342992186546326, + "learning_rate": 0.0005994274807576749, + "loss": 3.8372, + "step": 1204 + }, + { + "epoch": 0.06, + "grad_norm": 0.5836645364761353, + "learning_rate": 0.0005994265296373221, + "loss": 4.0055, + "step": 1205 + }, + { + "epoch": 0.06, + "grad_norm": 0.628150999546051, + "learning_rate": 0.0005994255777283374, + "loss": 3.7181, + "step": 1206 + }, + { + "epoch": 0.06, + "grad_norm": 0.6232638955116272, + "learning_rate": 0.0005994246250307232, + "loss": 3.8581, + "step": 1207 + }, + { + "epoch": 0.06, + "grad_norm": 0.6497608423233032, + "learning_rate": 0.0005994236715444823, + "loss": 3.6943, + "step": 1208 + }, + { + "epoch": 0.06, + "grad_norm": 0.6445043087005615, + "learning_rate": 0.0005994227172696171, + "loss": 3.918, + "step": 1209 + }, + { + "epoch": 0.06, + "grad_norm": 0.5761111378669739, + "learning_rate": 0.0005994217622061301, + "loss": 3.9378, + "step": 1210 + }, + { + "epoch": 0.06, + "grad_norm": 0.5777541995048523, + "learning_rate": 0.0005994208063540237, + "loss": 3.8161, + "step": 1211 + }, + { + "epoch": 0.06, + "grad_norm": 0.6353874206542969, + "learning_rate": 0.0005994198497133006, + "loss": 3.8907, + "step": 1212 + }, + { + "epoch": 0.06, + "grad_norm": 0.6581094264984131, + "learning_rate": 0.0005994188922839633, + "loss": 3.8506, + "step": 1213 + }, + { + "epoch": 0.06, + "grad_norm": 0.69733065366745, + "learning_rate": 0.0005994179340660141, + "loss": 4.0021, + "step": 1214 + }, + { + "epoch": 0.06, + "grad_norm": 0.6014714241027832, + "learning_rate": 0.0005994169750594558, + "loss": 3.829, + "step": 1215 + }, + { + "epoch": 0.06, + "grad_norm": 0.6537860035896301, + "learning_rate": 0.0005994160152642908, + "loss": 3.9849, + "step": 1216 + }, + { + "epoch": 0.06, + "grad_norm": 0.7314767837524414, + "learning_rate": 0.0005994150546805216, + "loss": 3.9777, + "step": 1217 + }, + { + "epoch": 0.06, + "grad_norm": 0.6319352984428406, + "learning_rate": 0.0005994140933081507, + "loss": 3.7624, + "step": 1218 + }, + { + "epoch": 0.06, + "grad_norm": 0.5953729152679443, + "learning_rate": 0.0005994131311471808, + "loss": 3.8365, + "step": 1219 + }, + { + "epoch": 0.06, + "grad_norm": 0.609235405921936, + "learning_rate": 0.0005994121681976144, + "loss": 4.1573, + "step": 1220 + }, + { + "epoch": 0.06, + "grad_norm": 0.6277957558631897, + "learning_rate": 0.0005994112044594538, + "loss": 4.0577, + "step": 1221 + }, + { + "epoch": 0.06, + "grad_norm": 0.5987586379051208, + "learning_rate": 0.0005994102399327019, + "loss": 4.0041, + "step": 1222 + }, + { + "epoch": 0.06, + "grad_norm": 0.6414321660995483, + "learning_rate": 0.0005994092746173609, + "loss": 3.9939, + "step": 1223 + }, + { + "epoch": 0.06, + "grad_norm": 0.6084062457084656, + "learning_rate": 0.0005994083085134336, + "loss": 4.004, + "step": 1224 + }, + { + "epoch": 0.06, + "grad_norm": 0.6363257169723511, + "learning_rate": 0.0005994073416209225, + "loss": 3.7369, + "step": 1225 + }, + { + "epoch": 0.06, + "grad_norm": 0.6323297023773193, + "learning_rate": 0.0005994063739398299, + "loss": 3.8556, + "step": 1226 + }, + { + "epoch": 0.06, + "grad_norm": 0.7020496726036072, + "learning_rate": 0.0005994054054701585, + "loss": 3.9934, + "step": 1227 + }, + { + "epoch": 0.06, + "grad_norm": 0.6019614338874817, + "learning_rate": 0.000599404436211911, + "loss": 3.985, + "step": 1228 + }, + { + "epoch": 0.06, + "grad_norm": 0.6175892353057861, + "learning_rate": 0.0005994034661650899, + "loss": 3.9954, + "step": 1229 + }, + { + "epoch": 0.06, + "grad_norm": 0.611221969127655, + "learning_rate": 0.0005994024953296976, + "loss": 3.852, + "step": 1230 + }, + { + "epoch": 0.06, + "grad_norm": 0.5849113464355469, + "learning_rate": 0.0005994015237057368, + "loss": 3.7631, + "step": 1231 + }, + { + "epoch": 0.06, + "grad_norm": 0.6553965210914612, + "learning_rate": 0.0005994005512932099, + "loss": 4.1467, + "step": 1232 + }, + { + "epoch": 0.06, + "grad_norm": 0.6709908843040466, + "learning_rate": 0.0005993995780921197, + "loss": 4.0218, + "step": 1233 + }, + { + "epoch": 0.06, + "grad_norm": 0.6183760762214661, + "learning_rate": 0.0005993986041024686, + "loss": 3.8164, + "step": 1234 + }, + { + "epoch": 0.06, + "grad_norm": 0.5950647592544556, + "learning_rate": 0.0005993976293242591, + "loss": 4.1307, + "step": 1235 + }, + { + "epoch": 0.06, + "grad_norm": 0.6528424620628357, + "learning_rate": 0.000599396653757494, + "loss": 3.8645, + "step": 1236 + }, + { + "epoch": 0.06, + "grad_norm": 0.6041496396064758, + "learning_rate": 0.0005993956774021757, + "loss": 4.0503, + "step": 1237 + }, + { + "epoch": 0.06, + "grad_norm": 0.6346700191497803, + "learning_rate": 0.0005993947002583067, + "loss": 3.978, + "step": 1238 + }, + { + "epoch": 0.06, + "grad_norm": 0.6335063576698303, + "learning_rate": 0.0005993937223258898, + "loss": 3.8234, + "step": 1239 + }, + { + "epoch": 0.06, + "grad_norm": 0.6392975449562073, + "learning_rate": 0.0005993927436049273, + "loss": 3.9517, + "step": 1240 + }, + { + "epoch": 0.06, + "grad_norm": 0.5925444960594177, + "learning_rate": 0.0005993917640954221, + "loss": 3.798, + "step": 1241 + }, + { + "epoch": 0.06, + "grad_norm": 0.6346644163131714, + "learning_rate": 0.0005993907837973766, + "loss": 3.9064, + "step": 1242 + }, + { + "epoch": 0.06, + "grad_norm": 0.5913667678833008, + "learning_rate": 0.0005993898027107933, + "loss": 3.9169, + "step": 1243 + }, + { + "epoch": 0.06, + "grad_norm": 0.7143539190292358, + "learning_rate": 0.000599388820835675, + "loss": 3.7224, + "step": 1244 + }, + { + "epoch": 0.06, + "grad_norm": 0.6183956265449524, + "learning_rate": 0.000599387838172024, + "loss": 3.7269, + "step": 1245 + }, + { + "epoch": 0.06, + "grad_norm": 0.614376962184906, + "learning_rate": 0.0005993868547198432, + "loss": 3.9984, + "step": 1246 + }, + { + "epoch": 0.06, + "grad_norm": 0.5987918376922607, + "learning_rate": 0.000599385870479135, + "loss": 4.0514, + "step": 1247 + }, + { + "epoch": 0.06, + "grad_norm": 0.5989696383476257, + "learning_rate": 0.000599384885449902, + "loss": 3.9283, + "step": 1248 + }, + { + "epoch": 0.06, + "grad_norm": 0.6367683410644531, + "learning_rate": 0.0005993838996321468, + "loss": 4.1072, + "step": 1249 + }, + { + "epoch": 0.06, + "grad_norm": 0.6802504658699036, + "learning_rate": 0.0005993829130258721, + "loss": 3.7503, + "step": 1250 + }, + { + "epoch": 0.06, + "grad_norm": 0.6775780320167542, + "learning_rate": 0.0005993819256310804, + "loss": 4.0616, + "step": 1251 + }, + { + "epoch": 0.06, + "grad_norm": 0.5724057555198669, + "learning_rate": 0.0005993809374477744, + "loss": 3.735, + "step": 1252 + }, + { + "epoch": 0.06, + "grad_norm": 0.6689765453338623, + "learning_rate": 0.0005993799484759566, + "loss": 3.8647, + "step": 1253 + }, + { + "epoch": 0.06, + "grad_norm": 0.6421110033988953, + "learning_rate": 0.0005993789587156295, + "loss": 3.9385, + "step": 1254 + }, + { + "epoch": 0.06, + "grad_norm": 0.696961522102356, + "learning_rate": 0.000599377968166796, + "loss": 3.847, + "step": 1255 + }, + { + "epoch": 0.06, + "grad_norm": 0.6425986886024475, + "learning_rate": 0.0005993769768294583, + "loss": 3.857, + "step": 1256 + }, + { + "epoch": 0.06, + "grad_norm": 0.6519110798835754, + "learning_rate": 0.0005993759847036195, + "loss": 3.846, + "step": 1257 + }, + { + "epoch": 0.06, + "grad_norm": 0.5740974545478821, + "learning_rate": 0.000599374991789282, + "loss": 3.8013, + "step": 1258 + }, + { + "epoch": 0.06, + "grad_norm": 0.6745760440826416, + "learning_rate": 0.0005993739980864483, + "loss": 3.9241, + "step": 1259 + }, + { + "epoch": 0.06, + "grad_norm": 0.6441271305084229, + "learning_rate": 0.000599373003595121, + "loss": 3.7695, + "step": 1260 + }, + { + "epoch": 0.06, + "grad_norm": 0.6193081140518188, + "learning_rate": 0.0005993720083153029, + "loss": 3.985, + "step": 1261 + }, + { + "epoch": 0.06, + "grad_norm": 0.6134259700775146, + "learning_rate": 0.0005993710122469966, + "loss": 3.7961, + "step": 1262 + }, + { + "epoch": 0.06, + "grad_norm": 0.6477553248405457, + "learning_rate": 0.0005993700153902046, + "loss": 3.9387, + "step": 1263 + }, + { + "epoch": 0.06, + "grad_norm": 0.6375543475151062, + "learning_rate": 0.0005993690177449298, + "loss": 4.016, + "step": 1264 + }, + { + "epoch": 0.06, + "grad_norm": 0.615598738193512, + "learning_rate": 0.0005993680193111744, + "loss": 3.823, + "step": 1265 + }, + { + "epoch": 0.06, + "grad_norm": 0.6295831203460693, + "learning_rate": 0.0005993670200889412, + "loss": 3.7599, + "step": 1266 + }, + { + "epoch": 0.06, + "grad_norm": 0.6454052329063416, + "learning_rate": 0.0005993660200782331, + "loss": 3.9033, + "step": 1267 + }, + { + "epoch": 0.06, + "grad_norm": 0.6237940192222595, + "learning_rate": 0.0005993650192790525, + "loss": 3.7442, + "step": 1268 + }, + { + "epoch": 0.06, + "grad_norm": 0.6554262042045593, + "learning_rate": 0.0005993640176914019, + "loss": 3.8714, + "step": 1269 + }, + { + "epoch": 0.06, + "grad_norm": 0.591227114200592, + "learning_rate": 0.0005993630153152841, + "loss": 3.6987, + "step": 1270 + }, + { + "epoch": 0.06, + "grad_norm": 0.5855788588523865, + "learning_rate": 0.0005993620121507018, + "loss": 3.872, + "step": 1271 + }, + { + "epoch": 0.06, + "grad_norm": 0.6025470495223999, + "learning_rate": 0.0005993610081976575, + "loss": 3.8734, + "step": 1272 + }, + { + "epoch": 0.06, + "grad_norm": 0.6828283071517944, + "learning_rate": 0.0005993600034561539, + "loss": 3.8746, + "step": 1273 + }, + { + "epoch": 0.06, + "grad_norm": 0.6492578983306885, + "learning_rate": 0.0005993589979261939, + "loss": 3.7428, + "step": 1274 + }, + { + "epoch": 0.06, + "grad_norm": 0.6107031106948853, + "learning_rate": 0.0005993579916077796, + "loss": 3.7983, + "step": 1275 + }, + { + "epoch": 0.06, + "grad_norm": 0.6975229382514954, + "learning_rate": 0.0005993569845009142, + "loss": 3.6364, + "step": 1276 + }, + { + "epoch": 0.06, + "grad_norm": 0.6226152777671814, + "learning_rate": 0.0005993559766056, + "loss": 3.9707, + "step": 1277 + }, + { + "epoch": 0.06, + "grad_norm": 0.6262789368629456, + "learning_rate": 0.0005993549679218398, + "loss": 3.9105, + "step": 1278 + }, + { + "epoch": 0.06, + "grad_norm": 0.5812487006187439, + "learning_rate": 0.0005993539584496362, + "loss": 3.5877, + "step": 1279 + }, + { + "epoch": 0.06, + "grad_norm": 0.5997166037559509, + "learning_rate": 0.000599352948188992, + "loss": 3.8289, + "step": 1280 + }, + { + "epoch": 0.06, + "grad_norm": 0.5774847269058228, + "learning_rate": 0.0005993519371399096, + "loss": 4.0577, + "step": 1281 + }, + { + "epoch": 0.06, + "grad_norm": 0.6224757432937622, + "learning_rate": 0.0005993509253023919, + "loss": 3.7593, + "step": 1282 + }, + { + "epoch": 0.06, + "grad_norm": 0.5884726047515869, + "learning_rate": 0.0005993499126764415, + "loss": 4.0062, + "step": 1283 + }, + { + "epoch": 0.06, + "grad_norm": 0.6184850931167603, + "learning_rate": 0.0005993488992620608, + "loss": 3.7726, + "step": 1284 + }, + { + "epoch": 0.06, + "grad_norm": 0.6030580997467041, + "learning_rate": 0.000599347885059253, + "loss": 4.0764, + "step": 1285 + }, + { + "epoch": 0.06, + "grad_norm": 0.6678992509841919, + "learning_rate": 0.0005993468700680204, + "loss": 4.0444, + "step": 1286 + }, + { + "epoch": 0.06, + "grad_norm": 0.6198200583457947, + "learning_rate": 0.0005993458542883657, + "loss": 3.8496, + "step": 1287 + }, + { + "epoch": 0.06, + "grad_norm": 0.5855886936187744, + "learning_rate": 0.0005993448377202916, + "loss": 3.8943, + "step": 1288 + }, + { + "epoch": 0.06, + "grad_norm": 0.684818685054779, + "learning_rate": 0.0005993438203638009, + "loss": 3.8199, + "step": 1289 + }, + { + "epoch": 0.06, + "grad_norm": 0.5692946910858154, + "learning_rate": 0.0005993428022188961, + "loss": 3.9652, + "step": 1290 + }, + { + "epoch": 0.06, + "grad_norm": 0.6946492791175842, + "learning_rate": 0.00059934178328558, + "loss": 4.1611, + "step": 1291 + }, + { + "epoch": 0.06, + "grad_norm": 0.5940146446228027, + "learning_rate": 0.0005993407635638553, + "loss": 3.9545, + "step": 1292 + }, + { + "epoch": 0.06, + "grad_norm": 0.5907729864120483, + "learning_rate": 0.0005993397430537246, + "loss": 3.9311, + "step": 1293 + }, + { + "epoch": 0.06, + "grad_norm": 0.6589469909667969, + "learning_rate": 0.0005993387217551907, + "loss": 3.795, + "step": 1294 + }, + { + "epoch": 0.06, + "grad_norm": 0.5783873200416565, + "learning_rate": 0.0005993376996682561, + "loss": 3.9994, + "step": 1295 + }, + { + "epoch": 0.06, + "grad_norm": 0.6160930395126343, + "learning_rate": 0.0005993366767929237, + "loss": 3.9999, + "step": 1296 + }, + { + "epoch": 0.06, + "grad_norm": 0.5932180881500244, + "learning_rate": 0.000599335653129196, + "loss": 4.0102, + "step": 1297 + }, + { + "epoch": 0.06, + "grad_norm": 0.635309100151062, + "learning_rate": 0.0005993346286770759, + "loss": 3.7957, + "step": 1298 + }, + { + "epoch": 0.06, + "grad_norm": 0.6134257912635803, + "learning_rate": 0.000599333603436566, + "loss": 3.8315, + "step": 1299 + }, + { + "epoch": 0.06, + "grad_norm": 0.5721672773361206, + "learning_rate": 0.0005993325774076688, + "loss": 4.0402, + "step": 1300 + }, + { + "epoch": 0.06, + "grad_norm": 0.5775968432426453, + "learning_rate": 0.0005993315505903874, + "loss": 3.8583, + "step": 1301 + }, + { + "epoch": 0.06, + "grad_norm": 0.618675172328949, + "learning_rate": 0.0005993305229847242, + "loss": 3.8447, + "step": 1302 + }, + { + "epoch": 0.06, + "grad_norm": 0.6464390754699707, + "learning_rate": 0.0005993294945906821, + "loss": 4.1014, + "step": 1303 + }, + { + "epoch": 0.06, + "grad_norm": 0.660879373550415, + "learning_rate": 0.0005993284654082638, + "loss": 3.9139, + "step": 1304 + }, + { + "epoch": 0.06, + "grad_norm": 0.5997679233551025, + "learning_rate": 0.0005993274354374718, + "loss": 3.9929, + "step": 1305 + }, + { + "epoch": 0.06, + "grad_norm": 0.6345100998878479, + "learning_rate": 0.0005993264046783089, + "loss": 4.0084, + "step": 1306 + }, + { + "epoch": 0.06, + "grad_norm": 0.5924948453903198, + "learning_rate": 0.000599325373130778, + "loss": 3.8919, + "step": 1307 + }, + { + "epoch": 0.06, + "grad_norm": 0.6048806309700012, + "learning_rate": 0.0005993243407948815, + "loss": 3.8444, + "step": 1308 + }, + { + "epoch": 0.06, + "grad_norm": 0.5796457529067993, + "learning_rate": 0.0005993233076706224, + "loss": 3.8349, + "step": 1309 + }, + { + "epoch": 0.06, + "grad_norm": 0.6050766706466675, + "learning_rate": 0.0005993222737580033, + "loss": 3.9042, + "step": 1310 + }, + { + "epoch": 0.06, + "grad_norm": 0.6207561492919922, + "learning_rate": 0.0005993212390570269, + "loss": 4.2432, + "step": 1311 + }, + { + "epoch": 0.06, + "grad_norm": 0.722430408000946, + "learning_rate": 0.0005993202035676959, + "loss": 3.7475, + "step": 1312 + }, + { + "epoch": 0.06, + "grad_norm": 0.6052895784378052, + "learning_rate": 0.0005993191672900132, + "loss": 3.9763, + "step": 1313 + }, + { + "epoch": 0.06, + "grad_norm": 0.6674761772155762, + "learning_rate": 0.0005993181302239815, + "loss": 4.0753, + "step": 1314 + }, + { + "epoch": 0.06, + "grad_norm": 0.6250360012054443, + "learning_rate": 0.0005993170923696033, + "loss": 3.8684, + "step": 1315 + }, + { + "epoch": 0.06, + "grad_norm": 0.6339815855026245, + "learning_rate": 0.0005993160537268815, + "loss": 3.8393, + "step": 1316 + }, + { + "epoch": 0.06, + "grad_norm": 0.6206067204475403, + "learning_rate": 0.0005993150142958189, + "loss": 3.7968, + "step": 1317 + }, + { + "epoch": 0.06, + "grad_norm": 0.5794596672058105, + "learning_rate": 0.000599313974076418, + "loss": 3.9016, + "step": 1318 + }, + { + "epoch": 0.06, + "grad_norm": 0.706591784954071, + "learning_rate": 0.0005993129330686818, + "loss": 3.796, + "step": 1319 + }, + { + "epoch": 0.06, + "grad_norm": 0.6444628834724426, + "learning_rate": 0.000599311891272613, + "loss": 3.8181, + "step": 1320 + }, + { + "epoch": 0.06, + "grad_norm": 0.5789697766304016, + "learning_rate": 0.0005993108486882142, + "loss": 3.9068, + "step": 1321 + }, + { + "epoch": 0.06, + "grad_norm": 0.6501109004020691, + "learning_rate": 0.0005993098053154882, + "loss": 3.836, + "step": 1322 + }, + { + "epoch": 0.06, + "grad_norm": 0.6823614239692688, + "learning_rate": 0.0005993087611544379, + "loss": 3.8886, + "step": 1323 + }, + { + "epoch": 0.06, + "grad_norm": 0.5772250890731812, + "learning_rate": 0.0005993077162050658, + "loss": 3.7892, + "step": 1324 + }, + { + "epoch": 0.06, + "grad_norm": 0.674119770526886, + "learning_rate": 0.0005993066704673748, + "loss": 4.0057, + "step": 1325 + }, + { + "epoch": 0.06, + "grad_norm": 0.541368305683136, + "learning_rate": 0.0005993056239413677, + "loss": 3.9643, + "step": 1326 + }, + { + "epoch": 0.07, + "grad_norm": 0.6814593076705933, + "learning_rate": 0.0005993045766270472, + "loss": 3.8573, + "step": 1327 + }, + { + "epoch": 0.07, + "grad_norm": 0.607441782951355, + "learning_rate": 0.000599303528524416, + "loss": 3.8976, + "step": 1328 + }, + { + "epoch": 0.07, + "grad_norm": 0.5948410630226135, + "learning_rate": 0.0005993024796334768, + "loss": 3.7664, + "step": 1329 + }, + { + "epoch": 0.07, + "grad_norm": 0.6131972074508667, + "learning_rate": 0.0005993014299542327, + "loss": 4.0518, + "step": 1330 + }, + { + "epoch": 0.07, + "grad_norm": 0.6085745096206665, + "learning_rate": 0.0005993003794866861, + "loss": 3.8089, + "step": 1331 + }, + { + "epoch": 0.07, + "grad_norm": 0.6138327717781067, + "learning_rate": 0.0005992993282308398, + "loss": 3.8027, + "step": 1332 + }, + { + "epoch": 0.07, + "grad_norm": 0.6015024781227112, + "learning_rate": 0.0005992982761866968, + "loss": 3.7218, + "step": 1333 + }, + { + "epoch": 0.07, + "grad_norm": 0.6075776815414429, + "learning_rate": 0.0005992972233542597, + "loss": 3.8605, + "step": 1334 + }, + { + "epoch": 0.07, + "grad_norm": 0.603941798210144, + "learning_rate": 0.0005992961697335314, + "loss": 3.692, + "step": 1335 + }, + { + "epoch": 0.07, + "grad_norm": 0.6697930097579956, + "learning_rate": 0.0005992951153245146, + "loss": 3.7488, + "step": 1336 + }, + { + "epoch": 0.07, + "grad_norm": 0.6258661150932312, + "learning_rate": 0.000599294060127212, + "loss": 4.0558, + "step": 1337 + }, + { + "epoch": 0.07, + "grad_norm": 0.5765604972839355, + "learning_rate": 0.0005992930041416264, + "loss": 4.1127, + "step": 1338 + }, + { + "epoch": 0.07, + "grad_norm": 0.5883685946464539, + "learning_rate": 0.0005992919473677607, + "loss": 4.1005, + "step": 1339 + }, + { + "epoch": 0.07, + "grad_norm": 0.6068031191825867, + "learning_rate": 0.0005992908898056176, + "loss": 4.0732, + "step": 1340 + }, + { + "epoch": 0.07, + "grad_norm": 0.6263118386268616, + "learning_rate": 0.0005992898314551999, + "loss": 4.0635, + "step": 1341 + }, + { + "epoch": 0.07, + "grad_norm": 0.630369246006012, + "learning_rate": 0.0005992887723165102, + "loss": 3.9827, + "step": 1342 + }, + { + "epoch": 0.07, + "grad_norm": 0.5971052646636963, + "learning_rate": 0.0005992877123895516, + "loss": 3.8703, + "step": 1343 + }, + { + "epoch": 0.07, + "grad_norm": 0.6135636568069458, + "learning_rate": 0.0005992866516743269, + "loss": 3.8379, + "step": 1344 + }, + { + "epoch": 0.07, + "grad_norm": 0.63181471824646, + "learning_rate": 0.0005992855901708387, + "loss": 3.7176, + "step": 1345 + }, + { + "epoch": 0.07, + "grad_norm": 0.6445341110229492, + "learning_rate": 0.0005992845278790898, + "loss": 3.7877, + "step": 1346 + }, + { + "epoch": 0.07, + "grad_norm": 0.6358599662780762, + "learning_rate": 0.000599283464799083, + "loss": 3.6308, + "step": 1347 + }, + { + "epoch": 0.07, + "grad_norm": 0.6615734100341797, + "learning_rate": 0.0005992824009308212, + "loss": 3.8782, + "step": 1348 + }, + { + "epoch": 0.07, + "grad_norm": 0.6317094564437866, + "learning_rate": 0.0005992813362743072, + "loss": 3.935, + "step": 1349 + }, + { + "epoch": 0.07, + "grad_norm": 0.6740353107452393, + "learning_rate": 0.0005992802708295438, + "loss": 3.6289, + "step": 1350 + }, + { + "epoch": 0.07, + "grad_norm": 0.604249894618988, + "learning_rate": 0.0005992792045965336, + "loss": 3.8375, + "step": 1351 + }, + { + "epoch": 0.07, + "grad_norm": 0.6756212115287781, + "learning_rate": 0.0005992781375752797, + "loss": 3.8007, + "step": 1352 + }, + { + "epoch": 0.07, + "grad_norm": 0.5996575355529785, + "learning_rate": 0.0005992770697657848, + "loss": 3.8507, + "step": 1353 + }, + { + "epoch": 0.07, + "grad_norm": 0.6114326119422913, + "learning_rate": 0.0005992760011680516, + "loss": 4.0967, + "step": 1354 + }, + { + "epoch": 0.07, + "grad_norm": 0.6802350878715515, + "learning_rate": 0.000599274931782083, + "loss": 3.5996, + "step": 1355 + }, + { + "epoch": 0.07, + "grad_norm": 0.6232761740684509, + "learning_rate": 0.0005992738616078819, + "loss": 3.8647, + "step": 1356 + }, + { + "epoch": 0.07, + "grad_norm": 0.5817407369613647, + "learning_rate": 0.0005992727906454511, + "loss": 4.0256, + "step": 1357 + }, + { + "epoch": 0.07, + "grad_norm": 0.6215899586677551, + "learning_rate": 0.0005992717188947933, + "loss": 4.0016, + "step": 1358 + }, + { + "epoch": 0.07, + "grad_norm": 0.6303964257240295, + "learning_rate": 0.0005992706463559113, + "loss": 3.675, + "step": 1359 + }, + { + "epoch": 0.07, + "grad_norm": 0.6754851937294006, + "learning_rate": 0.0005992695730288081, + "loss": 3.977, + "step": 1360 + }, + { + "epoch": 0.07, + "grad_norm": 0.5957140922546387, + "learning_rate": 0.0005992684989134864, + "loss": 3.8991, + "step": 1361 + }, + { + "epoch": 0.07, + "grad_norm": 0.6352207660675049, + "learning_rate": 0.0005992674240099491, + "loss": 4.0731, + "step": 1362 + }, + { + "epoch": 0.07, + "grad_norm": 0.634380042552948, + "learning_rate": 0.000599266348318199, + "loss": 3.8853, + "step": 1363 + }, + { + "epoch": 0.07, + "grad_norm": 0.6464090347290039, + "learning_rate": 0.000599265271838239, + "loss": 3.8244, + "step": 1364 + }, + { + "epoch": 0.07, + "grad_norm": 0.6003564596176147, + "learning_rate": 0.0005992641945700717, + "loss": 3.7622, + "step": 1365 + }, + { + "epoch": 0.07, + "grad_norm": 0.674795389175415, + "learning_rate": 0.0005992631165137002, + "loss": 4.0823, + "step": 1366 + }, + { + "epoch": 0.07, + "grad_norm": 0.6135823726654053, + "learning_rate": 0.0005992620376691273, + "loss": 3.7498, + "step": 1367 + }, + { + "epoch": 0.07, + "grad_norm": 0.6516311764717102, + "learning_rate": 0.0005992609580363558, + "loss": 3.8798, + "step": 1368 + }, + { + "epoch": 0.07, + "grad_norm": 0.6563093066215515, + "learning_rate": 0.0005992598776153883, + "loss": 3.9035, + "step": 1369 + }, + { + "epoch": 0.07, + "grad_norm": 0.5973907709121704, + "learning_rate": 0.0005992587964062282, + "loss": 3.9296, + "step": 1370 + }, + { + "epoch": 0.07, + "grad_norm": 0.6241722702980042, + "learning_rate": 0.0005992577144088778, + "loss": 3.9025, + "step": 1371 + }, + { + "epoch": 0.07, + "grad_norm": 0.5976197123527527, + "learning_rate": 0.0005992566316233403, + "loss": 3.9788, + "step": 1372 + }, + { + "epoch": 0.07, + "grad_norm": 0.5596520304679871, + "learning_rate": 0.0005992555480496183, + "loss": 4.1813, + "step": 1373 + }, + { + "epoch": 0.07, + "grad_norm": 0.7279554009437561, + "learning_rate": 0.0005992544636877149, + "loss": 3.7869, + "step": 1374 + }, + { + "epoch": 0.07, + "grad_norm": 0.6230193376541138, + "learning_rate": 0.0005992533785376328, + "loss": 3.7344, + "step": 1375 + }, + { + "epoch": 0.07, + "grad_norm": 0.6615080833435059, + "learning_rate": 0.0005992522925993748, + "loss": 3.793, + "step": 1376 + }, + { + "epoch": 0.07, + "grad_norm": 0.6379623413085938, + "learning_rate": 0.000599251205872944, + "loss": 4.0211, + "step": 1377 + }, + { + "epoch": 0.07, + "grad_norm": 0.6381189823150635, + "learning_rate": 0.000599250118358343, + "loss": 3.9455, + "step": 1378 + }, + { + "epoch": 0.07, + "grad_norm": 0.6189027428627014, + "learning_rate": 0.0005992490300555748, + "loss": 3.8361, + "step": 1379 + }, + { + "epoch": 0.07, + "grad_norm": 0.5973135232925415, + "learning_rate": 0.0005992479409646424, + "loss": 4.1191, + "step": 1380 + }, + { + "epoch": 0.07, + "grad_norm": 0.6134343147277832, + "learning_rate": 0.0005992468510855483, + "loss": 3.5492, + "step": 1381 + }, + { + "epoch": 0.07, + "grad_norm": 0.5866838693618774, + "learning_rate": 0.0005992457604182957, + "loss": 3.7574, + "step": 1382 + }, + { + "epoch": 0.07, + "grad_norm": 0.6180101633071899, + "learning_rate": 0.0005992446689628873, + "loss": 3.9139, + "step": 1383 + }, + { + "epoch": 0.07, + "grad_norm": 0.6157495379447937, + "learning_rate": 0.0005992435767193261, + "loss": 3.9326, + "step": 1384 + }, + { + "epoch": 0.07, + "grad_norm": 0.6040421724319458, + "learning_rate": 0.0005992424836876149, + "loss": 3.9882, + "step": 1385 + }, + { + "epoch": 0.07, + "grad_norm": 0.577000081539154, + "learning_rate": 0.0005992413898677565, + "loss": 4.0152, + "step": 1386 + }, + { + "epoch": 0.07, + "grad_norm": 0.6416754722595215, + "learning_rate": 0.0005992402952597539, + "loss": 3.7716, + "step": 1387 + }, + { + "epoch": 0.07, + "grad_norm": 0.6492377519607544, + "learning_rate": 0.00059923919986361, + "loss": 3.7164, + "step": 1388 + }, + { + "epoch": 0.07, + "grad_norm": 0.6138007640838623, + "learning_rate": 0.0005992381036793277, + "loss": 3.7265, + "step": 1389 + }, + { + "epoch": 0.07, + "grad_norm": 0.6215953230857849, + "learning_rate": 0.0005992370067069097, + "loss": 3.9165, + "step": 1390 + }, + { + "epoch": 0.07, + "grad_norm": 0.5956389307975769, + "learning_rate": 0.0005992359089463591, + "loss": 3.9124, + "step": 1391 + }, + { + "epoch": 0.07, + "grad_norm": 0.5724150538444519, + "learning_rate": 0.0005992348103976786, + "loss": 3.8611, + "step": 1392 + }, + { + "epoch": 0.07, + "grad_norm": 0.6579985022544861, + "learning_rate": 0.0005992337110608713, + "loss": 3.4344, + "step": 1393 + }, + { + "epoch": 0.07, + "grad_norm": 0.5854646563529968, + "learning_rate": 0.00059923261093594, + "loss": 3.828, + "step": 1394 + }, + { + "epoch": 0.07, + "grad_norm": 0.6103180646896362, + "learning_rate": 0.0005992315100228875, + "loss": 3.8028, + "step": 1395 + }, + { + "epoch": 0.07, + "grad_norm": 0.5735791325569153, + "learning_rate": 0.0005992304083217168, + "loss": 3.9679, + "step": 1396 + }, + { + "epoch": 0.07, + "grad_norm": 0.5912772417068481, + "learning_rate": 0.0005992293058324308, + "loss": 3.7386, + "step": 1397 + }, + { + "epoch": 0.07, + "grad_norm": 0.6024033427238464, + "learning_rate": 0.0005992282025550325, + "loss": 3.8819, + "step": 1398 + }, + { + "epoch": 0.07, + "grad_norm": 0.5784563422203064, + "learning_rate": 0.0005992270984895246, + "loss": 3.7262, + "step": 1399 + }, + { + "epoch": 0.07, + "grad_norm": 0.6742603182792664, + "learning_rate": 0.0005992259936359102, + "loss": 3.7894, + "step": 1400 + }, + { + "epoch": 0.07, + "grad_norm": 0.5925095677375793, + "learning_rate": 0.0005992248879941919, + "loss": 3.9177, + "step": 1401 + }, + { + "epoch": 0.07, + "grad_norm": 0.5972510576248169, + "learning_rate": 0.0005992237815643732, + "loss": 3.8498, + "step": 1402 + }, + { + "epoch": 0.07, + "grad_norm": 0.6998504400253296, + "learning_rate": 0.0005992226743464563, + "loss": 3.9739, + "step": 1403 + }, + { + "epoch": 0.07, + "grad_norm": 0.6386610269546509, + "learning_rate": 0.0005992215663404446, + "loss": 3.9983, + "step": 1404 + }, + { + "epoch": 0.07, + "grad_norm": 0.6281149983406067, + "learning_rate": 0.0005992204575463409, + "loss": 3.82, + "step": 1405 + }, + { + "epoch": 0.07, + "grad_norm": 0.6512899994850159, + "learning_rate": 0.0005992193479641481, + "loss": 3.5674, + "step": 1406 + }, + { + "epoch": 0.07, + "grad_norm": 0.613243043422699, + "learning_rate": 0.0005992182375938692, + "loss": 3.8449, + "step": 1407 + }, + { + "epoch": 0.07, + "grad_norm": 0.6193988919258118, + "learning_rate": 0.0005992171264355069, + "loss": 3.8452, + "step": 1408 + }, + { + "epoch": 0.07, + "grad_norm": 0.6317744851112366, + "learning_rate": 0.0005992160144890643, + "loss": 3.7912, + "step": 1409 + }, + { + "epoch": 0.07, + "grad_norm": 0.604690432548523, + "learning_rate": 0.0005992149017545443, + "loss": 3.8883, + "step": 1410 + }, + { + "epoch": 0.07, + "grad_norm": 0.5913760662078857, + "learning_rate": 0.0005992137882319498, + "loss": 3.9683, + "step": 1411 + }, + { + "epoch": 0.07, + "grad_norm": 0.6362264156341553, + "learning_rate": 0.0005992126739212839, + "loss": 3.9218, + "step": 1412 + }, + { + "epoch": 0.07, + "grad_norm": 0.5891861319541931, + "learning_rate": 0.0005992115588225493, + "loss": 3.8159, + "step": 1413 + }, + { + "epoch": 0.07, + "grad_norm": 0.6369192004203796, + "learning_rate": 0.0005992104429357491, + "loss": 3.9504, + "step": 1414 + }, + { + "epoch": 0.07, + "grad_norm": 0.6242348551750183, + "learning_rate": 0.0005992093262608861, + "loss": 3.7089, + "step": 1415 + }, + { + "epoch": 0.07, + "grad_norm": 0.5739800333976746, + "learning_rate": 0.0005992082087979635, + "loss": 3.9268, + "step": 1416 + }, + { + "epoch": 0.07, + "grad_norm": 0.7472577095031738, + "learning_rate": 0.0005992070905469839, + "loss": 3.9839, + "step": 1417 + }, + { + "epoch": 0.07, + "grad_norm": 0.5870568156242371, + "learning_rate": 0.0005992059715079504, + "loss": 3.7954, + "step": 1418 + }, + { + "epoch": 0.07, + "grad_norm": 0.6301756501197815, + "learning_rate": 0.000599204851680866, + "loss": 3.9817, + "step": 1419 + }, + { + "epoch": 0.07, + "grad_norm": 0.6080438494682312, + "learning_rate": 0.0005992037310657336, + "loss": 3.7202, + "step": 1420 + }, + { + "epoch": 0.07, + "grad_norm": 0.6272107362747192, + "learning_rate": 0.0005992026096625561, + "loss": 3.6994, + "step": 1421 + }, + { + "epoch": 0.07, + "grad_norm": 0.6598101258277893, + "learning_rate": 0.0005992014874713366, + "loss": 3.9831, + "step": 1422 + }, + { + "epoch": 0.07, + "grad_norm": 0.6518803834915161, + "learning_rate": 0.0005992003644920779, + "loss": 3.6686, + "step": 1423 + }, + { + "epoch": 0.07, + "grad_norm": 0.5766271948814392, + "learning_rate": 0.0005991992407247831, + "loss": 3.6382, + "step": 1424 + }, + { + "epoch": 0.07, + "grad_norm": 0.6695455312728882, + "learning_rate": 0.0005991981161694549, + "loss": 3.7335, + "step": 1425 + }, + { + "epoch": 0.07, + "grad_norm": 0.5740464329719543, + "learning_rate": 0.0005991969908260966, + "loss": 3.9502, + "step": 1426 + }, + { + "epoch": 0.07, + "grad_norm": 0.6571002006530762, + "learning_rate": 0.0005991958646947111, + "loss": 3.7527, + "step": 1427 + }, + { + "epoch": 0.07, + "grad_norm": 0.6101444959640503, + "learning_rate": 0.0005991947377753011, + "loss": 3.9642, + "step": 1428 + }, + { + "epoch": 0.07, + "grad_norm": 0.6039525270462036, + "learning_rate": 0.0005991936100678697, + "loss": 3.6638, + "step": 1429 + }, + { + "epoch": 0.07, + "grad_norm": 0.7623158097267151, + "learning_rate": 0.0005991924815724201, + "loss": 3.9647, + "step": 1430 + }, + { + "epoch": 0.07, + "grad_norm": 0.5585444569587708, + "learning_rate": 0.000599191352288955, + "loss": 3.7619, + "step": 1431 + }, + { + "epoch": 0.07, + "grad_norm": 0.6341544985771179, + "learning_rate": 0.0005991902222174774, + "loss": 3.9703, + "step": 1432 + }, + { + "epoch": 0.07, + "grad_norm": 0.6054025888442993, + "learning_rate": 0.0005991890913579904, + "loss": 3.9349, + "step": 1433 + }, + { + "epoch": 0.07, + "grad_norm": 0.6357590556144714, + "learning_rate": 0.0005991879597104969, + "loss": 3.6478, + "step": 1434 + }, + { + "epoch": 0.07, + "grad_norm": 0.6057144403457642, + "learning_rate": 0.0005991868272749998, + "loss": 3.7992, + "step": 1435 + }, + { + "epoch": 0.07, + "grad_norm": 0.6412066221237183, + "learning_rate": 0.0005991856940515024, + "loss": 3.8659, + "step": 1436 + }, + { + "epoch": 0.07, + "grad_norm": 0.6738225817680359, + "learning_rate": 0.0005991845600400072, + "loss": 4.0249, + "step": 1437 + }, + { + "epoch": 0.07, + "grad_norm": 0.6093243360519409, + "learning_rate": 0.0005991834252405177, + "loss": 3.6715, + "step": 1438 + }, + { + "epoch": 0.07, + "grad_norm": 0.6469516158103943, + "learning_rate": 0.0005991822896530364, + "loss": 4.0306, + "step": 1439 + }, + { + "epoch": 0.07, + "grad_norm": 0.6203641891479492, + "learning_rate": 0.0005991811532775667, + "loss": 3.9434, + "step": 1440 + }, + { + "epoch": 0.07, + "grad_norm": 0.6038147807121277, + "learning_rate": 0.0005991800161141114, + "loss": 3.8991, + "step": 1441 + }, + { + "epoch": 0.07, + "grad_norm": 0.6124120950698853, + "learning_rate": 0.0005991788781626735, + "loss": 3.8865, + "step": 1442 + }, + { + "epoch": 0.07, + "grad_norm": 0.6234179735183716, + "learning_rate": 0.0005991777394232559, + "loss": 4.0951, + "step": 1443 + }, + { + "epoch": 0.07, + "grad_norm": 0.622256875038147, + "learning_rate": 0.0005991765998958618, + "loss": 3.8243, + "step": 1444 + }, + { + "epoch": 0.07, + "grad_norm": 0.5846306085586548, + "learning_rate": 0.0005991754595804942, + "loss": 3.9512, + "step": 1445 + }, + { + "epoch": 0.07, + "grad_norm": 0.6416991949081421, + "learning_rate": 0.0005991743184771558, + "loss": 3.8896, + "step": 1446 + }, + { + "epoch": 0.07, + "grad_norm": 0.6234089732170105, + "learning_rate": 0.00059917317658585, + "loss": 3.8233, + "step": 1447 + }, + { + "epoch": 0.07, + "grad_norm": 0.6189374327659607, + "learning_rate": 0.0005991720339065796, + "loss": 3.6576, + "step": 1448 + }, + { + "epoch": 0.07, + "grad_norm": 0.5880756378173828, + "learning_rate": 0.0005991708904393476, + "loss": 3.8074, + "step": 1449 + }, + { + "epoch": 0.07, + "grad_norm": 0.6507730484008789, + "learning_rate": 0.0005991697461841571, + "loss": 4.0946, + "step": 1450 + }, + { + "epoch": 0.07, + "grad_norm": 0.5725138783454895, + "learning_rate": 0.0005991686011410111, + "loss": 3.7917, + "step": 1451 + }, + { + "epoch": 0.07, + "grad_norm": 0.6646773219108582, + "learning_rate": 0.0005991674553099126, + "loss": 3.728, + "step": 1452 + }, + { + "epoch": 0.07, + "grad_norm": 0.5751292109489441, + "learning_rate": 0.0005991663086908644, + "loss": 3.7794, + "step": 1453 + }, + { + "epoch": 0.07, + "grad_norm": 0.6166512966156006, + "learning_rate": 0.0005991651612838698, + "loss": 3.9293, + "step": 1454 + }, + { + "epoch": 0.07, + "grad_norm": 0.6255661845207214, + "learning_rate": 0.0005991640130889319, + "loss": 3.8986, + "step": 1455 + }, + { + "epoch": 0.07, + "grad_norm": 0.5917665958404541, + "learning_rate": 0.0005991628641060534, + "loss": 3.8005, + "step": 1456 + }, + { + "epoch": 0.07, + "grad_norm": 0.5992826223373413, + "learning_rate": 0.0005991617143352375, + "loss": 3.8569, + "step": 1457 + }, + { + "epoch": 0.07, + "grad_norm": 0.6259722113609314, + "learning_rate": 0.0005991605637764872, + "loss": 3.9048, + "step": 1458 + }, + { + "epoch": 0.07, + "grad_norm": 0.6164818406105042, + "learning_rate": 0.0005991594124298056, + "loss": 3.9696, + "step": 1459 + }, + { + "epoch": 0.07, + "grad_norm": 0.5641024708747864, + "learning_rate": 0.0005991582602951958, + "loss": 3.7273, + "step": 1460 + }, + { + "epoch": 0.07, + "grad_norm": 0.6404457688331604, + "learning_rate": 0.0005991571073726605, + "loss": 3.7909, + "step": 1461 + }, + { + "epoch": 0.07, + "grad_norm": 0.6667171120643616, + "learning_rate": 0.0005991559536622031, + "loss": 3.7106, + "step": 1462 + }, + { + "epoch": 0.07, + "grad_norm": 0.5996845364570618, + "learning_rate": 0.0005991547991638264, + "loss": 3.8542, + "step": 1463 + }, + { + "epoch": 0.07, + "grad_norm": 0.6826181411743164, + "learning_rate": 0.0005991536438775337, + "loss": 4.0089, + "step": 1464 + }, + { + "epoch": 0.07, + "grad_norm": 0.6228737235069275, + "learning_rate": 0.0005991524878033277, + "loss": 3.7689, + "step": 1465 + }, + { + "epoch": 0.07, + "grad_norm": 0.6522650718688965, + "learning_rate": 0.0005991513309412116, + "loss": 3.9532, + "step": 1466 + }, + { + "epoch": 0.07, + "grad_norm": 0.6846997141838074, + "learning_rate": 0.0005991501732911886, + "loss": 3.8377, + "step": 1467 + }, + { + "epoch": 0.07, + "grad_norm": 0.6278219223022461, + "learning_rate": 0.0005991490148532616, + "loss": 3.7568, + "step": 1468 + }, + { + "epoch": 0.07, + "grad_norm": 0.6812443733215332, + "learning_rate": 0.0005991478556274336, + "loss": 3.8747, + "step": 1469 + }, + { + "epoch": 0.07, + "grad_norm": 0.7166856527328491, + "learning_rate": 0.0005991466956137077, + "loss": 3.9553, + "step": 1470 + }, + { + "epoch": 0.07, + "grad_norm": 0.5980615615844727, + "learning_rate": 0.0005991455348120871, + "loss": 4.0037, + "step": 1471 + }, + { + "epoch": 0.07, + "grad_norm": 0.6599940061569214, + "learning_rate": 0.0005991443732225746, + "loss": 3.7815, + "step": 1472 + }, + { + "epoch": 0.07, + "grad_norm": 0.6811275482177734, + "learning_rate": 0.0005991432108451735, + "loss": 3.9142, + "step": 1473 + }, + { + "epoch": 0.07, + "grad_norm": 0.59503173828125, + "learning_rate": 0.0005991420476798867, + "loss": 3.6994, + "step": 1474 + }, + { + "epoch": 0.07, + "grad_norm": 0.597121000289917, + "learning_rate": 0.0005991408837267173, + "loss": 3.911, + "step": 1475 + }, + { + "epoch": 0.07, + "grad_norm": 0.5894047021865845, + "learning_rate": 0.0005991397189856684, + "loss": 3.7512, + "step": 1476 + }, + { + "epoch": 0.07, + "grad_norm": 0.6419175267219543, + "learning_rate": 0.000599138553456743, + "loss": 3.892, + "step": 1477 + }, + { + "epoch": 0.07, + "grad_norm": 0.6146329641342163, + "learning_rate": 0.0005991373871399444, + "loss": 3.7919, + "step": 1478 + }, + { + "epoch": 0.07, + "grad_norm": 0.6020705103874207, + "learning_rate": 0.0005991362200352754, + "loss": 3.749, + "step": 1479 + }, + { + "epoch": 0.07, + "grad_norm": 0.6040729284286499, + "learning_rate": 0.0005991350521427391, + "loss": 3.8971, + "step": 1480 + }, + { + "epoch": 0.07, + "grad_norm": 0.6802822947502136, + "learning_rate": 0.0005991338834623386, + "loss": 3.9495, + "step": 1481 + }, + { + "epoch": 0.07, + "grad_norm": 0.7739499807357788, + "learning_rate": 0.0005991327139940771, + "loss": 4.0075, + "step": 1482 + }, + { + "epoch": 0.07, + "grad_norm": 0.7212753295898438, + "learning_rate": 0.0005991315437379576, + "loss": 3.7131, + "step": 1483 + }, + { + "epoch": 0.07, + "grad_norm": 0.5949774980545044, + "learning_rate": 0.0005991303726939832, + "loss": 3.9272, + "step": 1484 + }, + { + "epoch": 0.07, + "grad_norm": 0.6401901841163635, + "learning_rate": 0.0005991292008621569, + "loss": 3.9621, + "step": 1485 + }, + { + "epoch": 0.07, + "grad_norm": 0.6232290863990784, + "learning_rate": 0.0005991280282424819, + "loss": 3.8078, + "step": 1486 + }, + { + "epoch": 0.07, + "grad_norm": 0.644420862197876, + "learning_rate": 0.0005991268548349613, + "loss": 3.8307, + "step": 1487 + }, + { + "epoch": 0.07, + "grad_norm": 0.6507530808448792, + "learning_rate": 0.000599125680639598, + "loss": 3.85, + "step": 1488 + }, + { + "epoch": 0.07, + "grad_norm": 0.6406475305557251, + "learning_rate": 0.0005991245056563954, + "loss": 3.74, + "step": 1489 + }, + { + "epoch": 0.07, + "grad_norm": 0.6073602437973022, + "learning_rate": 0.0005991233298853562, + "loss": 3.7604, + "step": 1490 + }, + { + "epoch": 0.07, + "grad_norm": 0.6374882459640503, + "learning_rate": 0.0005991221533264839, + "loss": 3.842, + "step": 1491 + }, + { + "epoch": 0.07, + "grad_norm": 0.6282268762588501, + "learning_rate": 0.0005991209759797813, + "loss": 3.845, + "step": 1492 + }, + { + "epoch": 0.07, + "grad_norm": 0.6874997615814209, + "learning_rate": 0.0005991197978452517, + "loss": 4.1104, + "step": 1493 + }, + { + "epoch": 0.07, + "grad_norm": 0.663975715637207, + "learning_rate": 0.0005991186189228981, + "loss": 3.9287, + "step": 1494 + }, + { + "epoch": 0.07, + "grad_norm": 0.6389418840408325, + "learning_rate": 0.0005991174392127236, + "loss": 3.6839, + "step": 1495 + }, + { + "epoch": 0.07, + "grad_norm": 0.5825007557868958, + "learning_rate": 0.0005991162587147314, + "loss": 4.0187, + "step": 1496 + }, + { + "epoch": 0.07, + "grad_norm": 0.5879276394844055, + "learning_rate": 0.0005991150774289245, + "loss": 3.8135, + "step": 1497 + }, + { + "epoch": 0.07, + "grad_norm": 0.6391376852989197, + "learning_rate": 0.0005991138953553059, + "loss": 4.0896, + "step": 1498 + }, + { + "epoch": 0.07, + "grad_norm": 0.5957635045051575, + "learning_rate": 0.0005991127124938791, + "loss": 3.896, + "step": 1499 + }, + { + "epoch": 0.07, + "grad_norm": 0.6745945811271667, + "learning_rate": 0.0005991115288446468, + "loss": 3.4926, + "step": 1500 + }, + { + "epoch": 0.07, + "grad_norm": 0.6167849898338318, + "learning_rate": 0.0005991103444076124, + "loss": 3.8955, + "step": 1501 + }, + { + "epoch": 0.07, + "grad_norm": 0.6245574355125427, + "learning_rate": 0.000599109159182779, + "loss": 3.7455, + "step": 1502 + }, + { + "epoch": 0.07, + "grad_norm": 0.5934446454048157, + "learning_rate": 0.0005991079731701495, + "loss": 3.659, + "step": 1503 + }, + { + "epoch": 0.07, + "grad_norm": 0.5914957523345947, + "learning_rate": 0.0005991067863697271, + "loss": 3.7625, + "step": 1504 + }, + { + "epoch": 0.07, + "grad_norm": 0.6563999652862549, + "learning_rate": 0.0005991055987815151, + "loss": 4.0097, + "step": 1505 + }, + { + "epoch": 0.07, + "grad_norm": 0.6522777676582336, + "learning_rate": 0.0005991044104055166, + "loss": 3.9068, + "step": 1506 + }, + { + "epoch": 0.07, + "grad_norm": 0.6069871187210083, + "learning_rate": 0.0005991032212417345, + "loss": 3.9719, + "step": 1507 + }, + { + "epoch": 0.07, + "grad_norm": 0.630017876625061, + "learning_rate": 0.0005991020312901721, + "loss": 3.7736, + "step": 1508 + }, + { + "epoch": 0.07, + "grad_norm": 0.631232738494873, + "learning_rate": 0.0005991008405508326, + "loss": 3.6984, + "step": 1509 + }, + { + "epoch": 0.07, + "grad_norm": 0.6594641208648682, + "learning_rate": 0.000599099649023719, + "loss": 3.8675, + "step": 1510 + }, + { + "epoch": 0.07, + "grad_norm": 0.6070505976676941, + "learning_rate": 0.0005990984567088344, + "loss": 3.959, + "step": 1511 + }, + { + "epoch": 0.07, + "grad_norm": 0.6136875748634338, + "learning_rate": 0.0005990972636061821, + "loss": 3.8672, + "step": 1512 + }, + { + "epoch": 0.07, + "grad_norm": 0.7074840664863586, + "learning_rate": 0.0005990960697157652, + "loss": 3.8252, + "step": 1513 + }, + { + "epoch": 0.07, + "grad_norm": 0.6983718276023865, + "learning_rate": 0.0005990948750375868, + "loss": 3.9131, + "step": 1514 + }, + { + "epoch": 0.07, + "grad_norm": 0.6260075569152832, + "learning_rate": 0.0005990936795716501, + "loss": 3.6873, + "step": 1515 + }, + { + "epoch": 0.07, + "grad_norm": 0.6146523356437683, + "learning_rate": 0.0005990924833179582, + "loss": 3.8511, + "step": 1516 + }, + { + "epoch": 0.07, + "grad_norm": 0.586942732334137, + "learning_rate": 0.0005990912862765143, + "loss": 3.8194, + "step": 1517 + }, + { + "epoch": 0.07, + "grad_norm": 0.6059939861297607, + "learning_rate": 0.0005990900884473213, + "loss": 3.8406, + "step": 1518 + }, + { + "epoch": 0.07, + "grad_norm": 0.609937310218811, + "learning_rate": 0.0005990888898303827, + "loss": 3.4061, + "step": 1519 + }, + { + "epoch": 0.07, + "grad_norm": 0.59391850233078, + "learning_rate": 0.0005990876904257015, + "loss": 3.8884, + "step": 1520 + }, + { + "epoch": 0.07, + "grad_norm": 0.6493670344352722, + "learning_rate": 0.000599086490233281, + "loss": 3.9793, + "step": 1521 + }, + { + "epoch": 0.07, + "grad_norm": 0.6219658851623535, + "learning_rate": 0.000599085289253124, + "loss": 3.8025, + "step": 1522 + }, + { + "epoch": 0.07, + "grad_norm": 0.6138971447944641, + "learning_rate": 0.0005990840874852342, + "loss": 3.8796, + "step": 1523 + }, + { + "epoch": 0.07, + "grad_norm": 0.6053948998451233, + "learning_rate": 0.0005990828849296143, + "loss": 4.0149, + "step": 1524 + }, + { + "epoch": 0.07, + "grad_norm": 0.651043713092804, + "learning_rate": 0.0005990816815862678, + "loss": 3.9163, + "step": 1525 + }, + { + "epoch": 0.07, + "grad_norm": 0.6234416365623474, + "learning_rate": 0.0005990804774551975, + "loss": 3.7939, + "step": 1526 + }, + { + "epoch": 0.07, + "grad_norm": 0.5939651727676392, + "learning_rate": 0.0005990792725364068, + "loss": 3.8339, + "step": 1527 + }, + { + "epoch": 0.07, + "grad_norm": 0.6041796207427979, + "learning_rate": 0.000599078066829899, + "loss": 3.7584, + "step": 1528 + }, + { + "epoch": 0.07, + "grad_norm": 0.636833906173706, + "learning_rate": 0.0005990768603356771, + "loss": 3.8717, + "step": 1529 + }, + { + "epoch": 0.07, + "grad_norm": 0.6864349246025085, + "learning_rate": 0.0005990756530537442, + "loss": 3.786, + "step": 1530 + }, + { + "epoch": 0.08, + "grad_norm": 0.5589094161987305, + "learning_rate": 0.0005990744449841037, + "loss": 3.7716, + "step": 1531 + }, + { + "epoch": 0.08, + "grad_norm": 0.6430098414421082, + "learning_rate": 0.0005990732361267585, + "loss": 3.7062, + "step": 1532 + }, + { + "epoch": 0.08, + "grad_norm": 0.6288703083992004, + "learning_rate": 0.0005990720264817122, + "loss": 3.8552, + "step": 1533 + }, + { + "epoch": 0.08, + "grad_norm": 0.5796893835067749, + "learning_rate": 0.0005990708160489676, + "loss": 3.8646, + "step": 1534 + }, + { + "epoch": 0.08, + "grad_norm": 0.5982096791267395, + "learning_rate": 0.000599069604828528, + "loss": 3.9246, + "step": 1535 + }, + { + "epoch": 0.08, + "grad_norm": 0.5953184366226196, + "learning_rate": 0.0005990683928203967, + "loss": 3.7108, + "step": 1536 + }, + { + "epoch": 0.08, + "grad_norm": 0.621066153049469, + "learning_rate": 0.0005990671800245768, + "loss": 3.9115, + "step": 1537 + }, + { + "epoch": 0.08, + "grad_norm": 0.6250355839729309, + "learning_rate": 0.0005990659664410715, + "loss": 3.7263, + "step": 1538 + }, + { + "epoch": 0.08, + "grad_norm": 0.6018726229667664, + "learning_rate": 0.0005990647520698839, + "loss": 4.0379, + "step": 1539 + }, + { + "epoch": 0.08, + "grad_norm": 0.5801904201507568, + "learning_rate": 0.0005990635369110174, + "loss": 3.8927, + "step": 1540 + }, + { + "epoch": 0.08, + "grad_norm": 0.61872398853302, + "learning_rate": 0.0005990623209644751, + "loss": 3.806, + "step": 1541 + }, + { + "epoch": 0.08, + "grad_norm": 0.5771560072898865, + "learning_rate": 0.0005990611042302602, + "loss": 3.8253, + "step": 1542 + }, + { + "epoch": 0.08, + "grad_norm": 0.5889159440994263, + "learning_rate": 0.0005990598867083759, + "loss": 3.827, + "step": 1543 + }, + { + "epoch": 0.08, + "grad_norm": 0.5858890414237976, + "learning_rate": 0.0005990586683988254, + "loss": 3.9579, + "step": 1544 + }, + { + "epoch": 0.08, + "grad_norm": 0.6480501890182495, + "learning_rate": 0.0005990574493016119, + "loss": 3.844, + "step": 1545 + }, + { + "epoch": 0.08, + "grad_norm": 0.6195836067199707, + "learning_rate": 0.0005990562294167387, + "loss": 3.8677, + "step": 1546 + }, + { + "epoch": 0.08, + "grad_norm": 0.6144270300865173, + "learning_rate": 0.0005990550087442088, + "loss": 3.7589, + "step": 1547 + }, + { + "epoch": 0.08, + "grad_norm": 0.6432852745056152, + "learning_rate": 0.0005990537872840257, + "loss": 3.8853, + "step": 1548 + }, + { + "epoch": 0.08, + "grad_norm": 0.6419529914855957, + "learning_rate": 0.0005990525650361925, + "loss": 3.9961, + "step": 1549 + }, + { + "epoch": 0.08, + "grad_norm": 0.6389473080635071, + "learning_rate": 0.0005990513420007122, + "loss": 3.7316, + "step": 1550 + }, + { + "epoch": 0.08, + "grad_norm": 0.6231613159179688, + "learning_rate": 0.0005990501181775885, + "loss": 3.8515, + "step": 1551 + }, + { + "epoch": 0.08, + "grad_norm": 0.5940372347831726, + "learning_rate": 0.0005990488935668242, + "loss": 3.9313, + "step": 1552 + }, + { + "epoch": 0.08, + "grad_norm": 0.6202555298805237, + "learning_rate": 0.0005990476681684226, + "loss": 3.9028, + "step": 1553 + }, + { + "epoch": 0.08, + "grad_norm": 0.5787298083305359, + "learning_rate": 0.0005990464419823871, + "loss": 3.8886, + "step": 1554 + }, + { + "epoch": 0.08, + "grad_norm": 0.639350950717926, + "learning_rate": 0.0005990452150087208, + "loss": 3.8559, + "step": 1555 + }, + { + "epoch": 0.08, + "grad_norm": 0.5683527588844299, + "learning_rate": 0.000599043987247427, + "loss": 3.9704, + "step": 1556 + }, + { + "epoch": 0.08, + "grad_norm": 0.6336866617202759, + "learning_rate": 0.0005990427586985088, + "loss": 3.8659, + "step": 1557 + }, + { + "epoch": 0.08, + "grad_norm": 0.6109759211540222, + "learning_rate": 0.0005990415293619695, + "loss": 4.0727, + "step": 1558 + }, + { + "epoch": 0.08, + "grad_norm": 0.6308028101921082, + "learning_rate": 0.0005990402992378125, + "loss": 3.7325, + "step": 1559 + }, + { + "epoch": 0.08, + "grad_norm": 0.6601607799530029, + "learning_rate": 0.0005990390683260408, + "loss": 3.8482, + "step": 1560 + }, + { + "epoch": 0.08, + "grad_norm": 0.6257616877555847, + "learning_rate": 0.0005990378366266577, + "loss": 3.9858, + "step": 1561 + }, + { + "epoch": 0.08, + "grad_norm": 0.6197351813316345, + "learning_rate": 0.0005990366041396666, + "loss": 3.7637, + "step": 1562 + }, + { + "epoch": 0.08, + "grad_norm": 0.5823824405670166, + "learning_rate": 0.0005990353708650706, + "loss": 3.8847, + "step": 1563 + }, + { + "epoch": 0.08, + "grad_norm": 0.6095857620239258, + "learning_rate": 0.000599034136802873, + "loss": 3.9395, + "step": 1564 + }, + { + "epoch": 0.08, + "grad_norm": 0.6098863482475281, + "learning_rate": 0.000599032901953077, + "loss": 3.7885, + "step": 1565 + }, + { + "epoch": 0.08, + "grad_norm": 0.5995204448699951, + "learning_rate": 0.0005990316663156859, + "loss": 3.8914, + "step": 1566 + }, + { + "epoch": 0.08, + "grad_norm": 0.5945135354995728, + "learning_rate": 0.0005990304298907029, + "loss": 3.9679, + "step": 1567 + }, + { + "epoch": 0.08, + "grad_norm": 0.5825366973876953, + "learning_rate": 0.0005990291926781313, + "loss": 3.8133, + "step": 1568 + }, + { + "epoch": 0.08, + "grad_norm": 0.6503740549087524, + "learning_rate": 0.0005990279546779745, + "loss": 3.8859, + "step": 1569 + }, + { + "epoch": 0.08, + "grad_norm": 0.6176950335502625, + "learning_rate": 0.0005990267158902354, + "loss": 3.8342, + "step": 1570 + }, + { + "epoch": 0.08, + "grad_norm": 0.6380782127380371, + "learning_rate": 0.0005990254763149176, + "loss": 3.5816, + "step": 1571 + }, + { + "epoch": 0.08, + "grad_norm": 0.6276407837867737, + "learning_rate": 0.0005990242359520243, + "loss": 3.9316, + "step": 1572 + }, + { + "epoch": 0.08, + "grad_norm": 0.5958492159843445, + "learning_rate": 0.0005990229948015587, + "loss": 3.6879, + "step": 1573 + }, + { + "epoch": 0.08, + "grad_norm": 0.5630125403404236, + "learning_rate": 0.000599021752863524, + "loss": 3.7691, + "step": 1574 + }, + { + "epoch": 0.08, + "grad_norm": 0.6117847561836243, + "learning_rate": 0.0005990205101379236, + "loss": 3.73, + "step": 1575 + }, + { + "epoch": 0.08, + "grad_norm": 0.6020464301109314, + "learning_rate": 0.0005990192666247607, + "loss": 3.8803, + "step": 1576 + }, + { + "epoch": 0.08, + "grad_norm": 0.5830046534538269, + "learning_rate": 0.0005990180223240385, + "loss": 3.6786, + "step": 1577 + }, + { + "epoch": 0.08, + "grad_norm": 0.6638304591178894, + "learning_rate": 0.0005990167772357605, + "loss": 4.1394, + "step": 1578 + }, + { + "epoch": 0.08, + "grad_norm": 0.5913933515548706, + "learning_rate": 0.0005990155313599298, + "loss": 3.9041, + "step": 1579 + }, + { + "epoch": 0.08, + "grad_norm": 0.5918366312980652, + "learning_rate": 0.0005990142846965498, + "loss": 3.675, + "step": 1580 + }, + { + "epoch": 0.08, + "grad_norm": 0.6066710948944092, + "learning_rate": 0.0005990130372456237, + "loss": 3.7399, + "step": 1581 + }, + { + "epoch": 0.08, + "grad_norm": 0.5532450675964355, + "learning_rate": 0.0005990117890071548, + "loss": 3.6508, + "step": 1582 + }, + { + "epoch": 0.08, + "grad_norm": 0.5913922786712646, + "learning_rate": 0.0005990105399811464, + "loss": 3.7544, + "step": 1583 + }, + { + "epoch": 0.08, + "grad_norm": 0.5768578052520752, + "learning_rate": 0.0005990092901676018, + "loss": 3.748, + "step": 1584 + }, + { + "epoch": 0.08, + "grad_norm": 0.6337912678718567, + "learning_rate": 0.0005990080395665242, + "loss": 3.8081, + "step": 1585 + }, + { + "epoch": 0.08, + "grad_norm": 0.5935748815536499, + "learning_rate": 0.000599006788177917, + "loss": 3.8911, + "step": 1586 + }, + { + "epoch": 0.08, + "grad_norm": 0.6132523417472839, + "learning_rate": 0.0005990055360017835, + "loss": 3.9166, + "step": 1587 + }, + { + "epoch": 0.08, + "grad_norm": 0.6079237461090088, + "learning_rate": 0.000599004283038127, + "loss": 3.7937, + "step": 1588 + }, + { + "epoch": 0.08, + "grad_norm": 0.6035066843032837, + "learning_rate": 0.0005990030292869507, + "loss": 3.8474, + "step": 1589 + }, + { + "epoch": 0.08, + "grad_norm": 0.620436429977417, + "learning_rate": 0.0005990017747482579, + "loss": 3.6852, + "step": 1590 + }, + { + "epoch": 0.08, + "grad_norm": 0.6006262302398682, + "learning_rate": 0.0005990005194220521, + "loss": 3.7319, + "step": 1591 + }, + { + "epoch": 0.08, + "grad_norm": 0.6008548140525818, + "learning_rate": 0.0005989992633083364, + "loss": 4.134, + "step": 1592 + }, + { + "epoch": 0.08, + "grad_norm": 0.5870209336280823, + "learning_rate": 0.0005989980064071142, + "loss": 4.0057, + "step": 1593 + }, + { + "epoch": 0.08, + "grad_norm": 0.595600962638855, + "learning_rate": 0.0005989967487183888, + "loss": 3.8465, + "step": 1594 + }, + { + "epoch": 0.08, + "grad_norm": 0.5959302186965942, + "learning_rate": 0.0005989954902421634, + "loss": 3.701, + "step": 1595 + }, + { + "epoch": 0.08, + "grad_norm": 0.5945109724998474, + "learning_rate": 0.0005989942309784416, + "loss": 3.9228, + "step": 1596 + }, + { + "epoch": 0.08, + "grad_norm": 0.614081084728241, + "learning_rate": 0.0005989929709272264, + "loss": 3.7123, + "step": 1597 + }, + { + "epoch": 0.08, + "grad_norm": 0.59315425157547, + "learning_rate": 0.0005989917100885214, + "loss": 3.7997, + "step": 1598 + }, + { + "epoch": 0.08, + "grad_norm": 0.5761898756027222, + "learning_rate": 0.0005989904484623296, + "loss": 3.6235, + "step": 1599 + }, + { + "epoch": 0.08, + "grad_norm": 0.6658716201782227, + "learning_rate": 0.0005989891860486546, + "loss": 3.8065, + "step": 1600 + }, + { + "epoch": 0.08, + "grad_norm": 0.5968477129936218, + "learning_rate": 0.0005989879228474997, + "loss": 3.6979, + "step": 1601 + }, + { + "epoch": 0.08, + "grad_norm": 0.630476713180542, + "learning_rate": 0.0005989866588588681, + "loss": 3.6387, + "step": 1602 + }, + { + "epoch": 0.08, + "grad_norm": 0.6172245144844055, + "learning_rate": 0.0005989853940827631, + "loss": 3.687, + "step": 1603 + }, + { + "epoch": 0.08, + "grad_norm": 0.6125102043151855, + "learning_rate": 0.0005989841285191881, + "loss": 3.7127, + "step": 1604 + }, + { + "epoch": 0.08, + "grad_norm": 0.6071942448616028, + "learning_rate": 0.0005989828621681466, + "loss": 3.7638, + "step": 1605 + }, + { + "epoch": 0.08, + "grad_norm": 0.6062251329421997, + "learning_rate": 0.0005989815950296417, + "loss": 3.706, + "step": 1606 + }, + { + "epoch": 0.08, + "grad_norm": 0.5893798470497131, + "learning_rate": 0.0005989803271036769, + "loss": 3.8942, + "step": 1607 + }, + { + "epoch": 0.08, + "grad_norm": 0.5914480090141296, + "learning_rate": 0.0005989790583902553, + "loss": 3.7829, + "step": 1608 + }, + { + "epoch": 0.08, + "grad_norm": 0.6149440407752991, + "learning_rate": 0.0005989777888893805, + "loss": 3.6808, + "step": 1609 + }, + { + "epoch": 0.08, + "grad_norm": 0.8986303210258484, + "learning_rate": 0.0005989765186010557, + "loss": 3.543, + "step": 1610 + }, + { + "epoch": 0.08, + "grad_norm": 0.5829508900642395, + "learning_rate": 0.0005989752475252843, + "loss": 3.7894, + "step": 1611 + }, + { + "epoch": 0.08, + "grad_norm": 0.6794427633285522, + "learning_rate": 0.0005989739756620697, + "loss": 3.7064, + "step": 1612 + }, + { + "epoch": 0.08, + "grad_norm": 0.7055894732475281, + "learning_rate": 0.0005989727030114151, + "loss": 3.8842, + "step": 1613 + }, + { + "epoch": 0.08, + "grad_norm": 0.5715697407722473, + "learning_rate": 0.000598971429573324, + "loss": 3.9117, + "step": 1614 + }, + { + "epoch": 0.08, + "grad_norm": 0.6290931701660156, + "learning_rate": 0.0005989701553477996, + "loss": 3.5982, + "step": 1615 + }, + { + "epoch": 0.08, + "grad_norm": 0.5757427215576172, + "learning_rate": 0.0005989688803348455, + "loss": 3.6704, + "step": 1616 + }, + { + "epoch": 0.08, + "grad_norm": 0.6020493507385254, + "learning_rate": 0.0005989676045344648, + "loss": 3.5587, + "step": 1617 + }, + { + "epoch": 0.08, + "grad_norm": 0.5768998265266418, + "learning_rate": 0.000598966327946661, + "loss": 3.5811, + "step": 1618 + }, + { + "epoch": 0.08, + "grad_norm": 0.6348850131034851, + "learning_rate": 0.0005989650505714374, + "loss": 3.9682, + "step": 1619 + }, + { + "epoch": 0.08, + "grad_norm": 0.6411147713661194, + "learning_rate": 0.0005989637724087973, + "loss": 3.7195, + "step": 1620 + }, + { + "epoch": 0.08, + "grad_norm": 0.6555300354957581, + "learning_rate": 0.0005989624934587443, + "loss": 3.647, + "step": 1621 + }, + { + "epoch": 0.08, + "grad_norm": 0.6385052800178528, + "learning_rate": 0.0005989612137212816, + "loss": 3.8637, + "step": 1622 + }, + { + "epoch": 0.08, + "grad_norm": 0.6256349086761475, + "learning_rate": 0.0005989599331964127, + "loss": 3.765, + "step": 1623 + }, + { + "epoch": 0.08, + "grad_norm": 0.6104692220687866, + "learning_rate": 0.0005989586518841408, + "loss": 3.721, + "step": 1624 + }, + { + "epoch": 0.08, + "grad_norm": 0.5835363864898682, + "learning_rate": 0.0005989573697844692, + "loss": 3.9361, + "step": 1625 + }, + { + "epoch": 0.08, + "grad_norm": 0.6491838693618774, + "learning_rate": 0.0005989560868974015, + "loss": 3.8728, + "step": 1626 + }, + { + "epoch": 0.08, + "grad_norm": 0.602606475353241, + "learning_rate": 0.0005989548032229411, + "loss": 3.7802, + "step": 1627 + }, + { + "epoch": 0.08, + "grad_norm": 0.5809963941574097, + "learning_rate": 0.0005989535187610912, + "loss": 3.5699, + "step": 1628 + }, + { + "epoch": 0.08, + "grad_norm": 0.5906050801277161, + "learning_rate": 0.0005989522335118553, + "loss": 3.7924, + "step": 1629 + }, + { + "epoch": 0.08, + "grad_norm": 0.6175136566162109, + "learning_rate": 0.0005989509474752367, + "loss": 3.8836, + "step": 1630 + }, + { + "epoch": 0.08, + "grad_norm": 0.5688444972038269, + "learning_rate": 0.0005989496606512389, + "loss": 3.9392, + "step": 1631 + }, + { + "epoch": 0.08, + "grad_norm": 0.6032138466835022, + "learning_rate": 0.0005989483730398652, + "loss": 4.0405, + "step": 1632 + }, + { + "epoch": 0.08, + "grad_norm": 0.6493321657180786, + "learning_rate": 0.000598947084641119, + "loss": 3.824, + "step": 1633 + }, + { + "epoch": 0.08, + "grad_norm": 0.5763911008834839, + "learning_rate": 0.0005989457954550038, + "loss": 3.7681, + "step": 1634 + }, + { + "epoch": 0.08, + "grad_norm": 0.5691786408424377, + "learning_rate": 0.0005989445054815229, + "loss": 3.9562, + "step": 1635 + }, + { + "epoch": 0.08, + "grad_norm": 0.6753783226013184, + "learning_rate": 0.0005989432147206796, + "loss": 3.7078, + "step": 1636 + }, + { + "epoch": 0.08, + "grad_norm": 0.6098892092704773, + "learning_rate": 0.0005989419231724775, + "loss": 3.478, + "step": 1637 + }, + { + "epoch": 0.08, + "grad_norm": 0.6037442684173584, + "learning_rate": 0.0005989406308369199, + "loss": 3.73, + "step": 1638 + }, + { + "epoch": 0.08, + "grad_norm": 0.6473791003227234, + "learning_rate": 0.0005989393377140101, + "loss": 3.7052, + "step": 1639 + }, + { + "epoch": 0.08, + "grad_norm": 0.6919148564338684, + "learning_rate": 0.0005989380438037517, + "loss": 3.7036, + "step": 1640 + }, + { + "epoch": 0.08, + "grad_norm": 0.6580774188041687, + "learning_rate": 0.000598936749106148, + "loss": 3.8456, + "step": 1641 + }, + { + "epoch": 0.08, + "grad_norm": 0.5730836987495422, + "learning_rate": 0.0005989354536212024, + "loss": 3.5748, + "step": 1642 + }, + { + "epoch": 0.08, + "grad_norm": 0.5901459455490112, + "learning_rate": 0.0005989341573489185, + "loss": 3.7192, + "step": 1643 + }, + { + "epoch": 0.08, + "grad_norm": 0.6304981708526611, + "learning_rate": 0.0005989328602892994, + "loss": 3.9909, + "step": 1644 + }, + { + "epoch": 0.08, + "grad_norm": 0.6005657911300659, + "learning_rate": 0.0005989315624423487, + "loss": 3.6228, + "step": 1645 + }, + { + "epoch": 0.08, + "grad_norm": 0.61674565076828, + "learning_rate": 0.0005989302638080698, + "loss": 3.7769, + "step": 1646 + }, + { + "epoch": 0.08, + "grad_norm": 0.598010778427124, + "learning_rate": 0.0005989289643864662, + "loss": 3.7453, + "step": 1647 + }, + { + "epoch": 0.08, + "grad_norm": 0.6126073598861694, + "learning_rate": 0.0005989276641775412, + "loss": 3.6865, + "step": 1648 + }, + { + "epoch": 0.08, + "grad_norm": 0.6504325270652771, + "learning_rate": 0.0005989263631812982, + "loss": 3.6676, + "step": 1649 + }, + { + "epoch": 0.08, + "grad_norm": 0.6758439540863037, + "learning_rate": 0.0005989250613977408, + "loss": 3.8634, + "step": 1650 + }, + { + "epoch": 0.08, + "grad_norm": 0.5557368993759155, + "learning_rate": 0.0005989237588268722, + "loss": 3.7872, + "step": 1651 + }, + { + "epoch": 0.08, + "grad_norm": 0.6107842922210693, + "learning_rate": 0.0005989224554686961, + "loss": 3.6353, + "step": 1652 + }, + { + "epoch": 0.08, + "grad_norm": 0.5767660140991211, + "learning_rate": 0.0005989211513232157, + "loss": 3.9362, + "step": 1653 + }, + { + "epoch": 0.08, + "grad_norm": 0.6302205324172974, + "learning_rate": 0.0005989198463904344, + "loss": 3.8184, + "step": 1654 + }, + { + "epoch": 0.08, + "grad_norm": 0.6364771723747253, + "learning_rate": 0.0005989185406703559, + "loss": 3.93, + "step": 1655 + }, + { + "epoch": 0.08, + "grad_norm": 0.6427229046821594, + "learning_rate": 0.0005989172341629834, + "loss": 3.7459, + "step": 1656 + }, + { + "epoch": 0.08, + "grad_norm": 0.604668378829956, + "learning_rate": 0.0005989159268683205, + "loss": 3.7998, + "step": 1657 + }, + { + "epoch": 0.08, + "grad_norm": 0.6303444504737854, + "learning_rate": 0.0005989146187863705, + "loss": 3.8282, + "step": 1658 + }, + { + "epoch": 0.08, + "grad_norm": 0.5857745409011841, + "learning_rate": 0.0005989133099171371, + "loss": 3.8037, + "step": 1659 + }, + { + "epoch": 0.08, + "grad_norm": 0.5540629625320435, + "learning_rate": 0.0005989120002606233, + "loss": 3.9089, + "step": 1660 + }, + { + "epoch": 0.08, + "grad_norm": 0.732204794883728, + "learning_rate": 0.000598910689816833, + "loss": 3.7783, + "step": 1661 + }, + { + "epoch": 0.08, + "grad_norm": 0.6092149615287781, + "learning_rate": 0.0005989093785857694, + "loss": 3.9062, + "step": 1662 + }, + { + "epoch": 0.08, + "grad_norm": 0.5495967864990234, + "learning_rate": 0.0005989080665674361, + "loss": 3.8753, + "step": 1663 + }, + { + "epoch": 0.08, + "grad_norm": 0.6210874319076538, + "learning_rate": 0.0005989067537618364, + "loss": 3.8944, + "step": 1664 + }, + { + "epoch": 0.08, + "grad_norm": 0.6496333479881287, + "learning_rate": 0.0005989054401689739, + "loss": 3.9576, + "step": 1665 + }, + { + "epoch": 0.08, + "grad_norm": 0.6235400438308716, + "learning_rate": 0.0005989041257888519, + "loss": 3.8495, + "step": 1666 + }, + { + "epoch": 0.08, + "grad_norm": 0.5927395224571228, + "learning_rate": 0.000598902810621474, + "loss": 3.6989, + "step": 1667 + }, + { + "epoch": 0.08, + "grad_norm": 0.5796555280685425, + "learning_rate": 0.0005989014946668437, + "loss": 3.6428, + "step": 1668 + }, + { + "epoch": 0.08, + "grad_norm": 0.6373754739761353, + "learning_rate": 0.0005989001779249643, + "loss": 3.6226, + "step": 1669 + }, + { + "epoch": 0.08, + "grad_norm": 0.5925307273864746, + "learning_rate": 0.0005988988603958395, + "loss": 3.735, + "step": 1670 + }, + { + "epoch": 0.08, + "grad_norm": 0.6357890963554382, + "learning_rate": 0.0005988975420794724, + "loss": 3.7731, + "step": 1671 + }, + { + "epoch": 0.08, + "grad_norm": 0.6149181127548218, + "learning_rate": 0.0005988962229758668, + "loss": 3.844, + "step": 1672 + }, + { + "epoch": 0.08, + "grad_norm": 0.6464425921440125, + "learning_rate": 0.0005988949030850261, + "loss": 3.7181, + "step": 1673 + }, + { + "epoch": 0.08, + "grad_norm": 0.6317658424377441, + "learning_rate": 0.0005988935824069538, + "loss": 3.5902, + "step": 1674 + }, + { + "epoch": 0.08, + "grad_norm": 0.5768055319786072, + "learning_rate": 0.0005988922609416532, + "loss": 3.7552, + "step": 1675 + }, + { + "epoch": 0.08, + "grad_norm": 0.6145294904708862, + "learning_rate": 0.000598890938689128, + "loss": 4.0868, + "step": 1676 + }, + { + "epoch": 0.08, + "grad_norm": 0.6101502180099487, + "learning_rate": 0.0005988896156493815, + "loss": 3.8068, + "step": 1677 + }, + { + "epoch": 0.08, + "grad_norm": 0.6006686091423035, + "learning_rate": 0.0005988882918224175, + "loss": 3.8113, + "step": 1678 + }, + { + "epoch": 0.08, + "grad_norm": 0.6369596719741821, + "learning_rate": 0.0005988869672082389, + "loss": 3.5701, + "step": 1679 + }, + { + "epoch": 0.08, + "grad_norm": 0.6296815276145935, + "learning_rate": 0.0005988856418068498, + "loss": 3.7768, + "step": 1680 + }, + { + "epoch": 0.08, + "grad_norm": 0.6762238144874573, + "learning_rate": 0.0005988843156182534, + "loss": 3.77, + "step": 1681 + }, + { + "epoch": 0.08, + "grad_norm": 0.5993692278862, + "learning_rate": 0.0005988829886424532, + "loss": 3.8124, + "step": 1682 + }, + { + "epoch": 0.08, + "grad_norm": 0.6622273921966553, + "learning_rate": 0.0005988816608794527, + "loss": 3.7658, + "step": 1683 + }, + { + "epoch": 0.08, + "grad_norm": 0.5899795293807983, + "learning_rate": 0.0005988803323292556, + "loss": 3.7341, + "step": 1684 + }, + { + "epoch": 0.08, + "grad_norm": 0.6564462780952454, + "learning_rate": 0.0005988790029918651, + "loss": 3.6619, + "step": 1685 + }, + { + "epoch": 0.08, + "grad_norm": 0.6012856364250183, + "learning_rate": 0.0005988776728672847, + "loss": 3.7838, + "step": 1686 + }, + { + "epoch": 0.08, + "grad_norm": 0.5971396565437317, + "learning_rate": 0.0005988763419555182, + "loss": 3.7886, + "step": 1687 + }, + { + "epoch": 0.08, + "grad_norm": 0.5735385417938232, + "learning_rate": 0.0005988750102565689, + "loss": 3.8873, + "step": 1688 + }, + { + "epoch": 0.08, + "grad_norm": 0.5730529427528381, + "learning_rate": 0.0005988736777704403, + "loss": 3.7951, + "step": 1689 + }, + { + "epoch": 0.08, + "grad_norm": 0.5634326338768005, + "learning_rate": 0.0005988723444971361, + "loss": 3.9817, + "step": 1690 + }, + { + "epoch": 0.08, + "grad_norm": 0.5796641707420349, + "learning_rate": 0.0005988710104366595, + "loss": 3.6427, + "step": 1691 + }, + { + "epoch": 0.08, + "grad_norm": 0.5546978116035461, + "learning_rate": 0.0005988696755890142, + "loss": 3.839, + "step": 1692 + }, + { + "epoch": 0.08, + "grad_norm": 0.6504260897636414, + "learning_rate": 0.0005988683399542037, + "loss": 3.6866, + "step": 1693 + }, + { + "epoch": 0.08, + "grad_norm": 0.6330404281616211, + "learning_rate": 0.0005988670035322316, + "loss": 3.6307, + "step": 1694 + }, + { + "epoch": 0.08, + "grad_norm": 0.686377227306366, + "learning_rate": 0.0005988656663231012, + "loss": 3.5898, + "step": 1695 + }, + { + "epoch": 0.08, + "grad_norm": 0.6027424335479736, + "learning_rate": 0.0005988643283268162, + "loss": 3.6807, + "step": 1696 + }, + { + "epoch": 0.08, + "grad_norm": 0.6752616763114929, + "learning_rate": 0.0005988629895433801, + "loss": 3.5909, + "step": 1697 + }, + { + "epoch": 0.08, + "grad_norm": 0.5653634071350098, + "learning_rate": 0.0005988616499727965, + "loss": 3.9474, + "step": 1698 + }, + { + "epoch": 0.08, + "grad_norm": 0.6292624473571777, + "learning_rate": 0.0005988603096150686, + "loss": 3.7446, + "step": 1699 + }, + { + "epoch": 0.08, + "grad_norm": 0.6524072289466858, + "learning_rate": 0.0005988589684702004, + "loss": 3.8772, + "step": 1700 + }, + { + "epoch": 0.08, + "grad_norm": 0.6119723320007324, + "learning_rate": 0.0005988576265381951, + "loss": 3.5762, + "step": 1701 + }, + { + "epoch": 0.08, + "grad_norm": 0.6504294872283936, + "learning_rate": 0.0005988562838190562, + "loss": 3.6652, + "step": 1702 + }, + { + "epoch": 0.08, + "grad_norm": 0.6257925033569336, + "learning_rate": 0.0005988549403127875, + "loss": 3.6179, + "step": 1703 + }, + { + "epoch": 0.08, + "grad_norm": 0.6477129459381104, + "learning_rate": 0.0005988535960193924, + "loss": 3.7043, + "step": 1704 + }, + { + "epoch": 0.08, + "grad_norm": 0.6320105791091919, + "learning_rate": 0.0005988522509388744, + "loss": 3.7492, + "step": 1705 + }, + { + "epoch": 0.08, + "grad_norm": 0.5984609127044678, + "learning_rate": 0.0005988509050712371, + "loss": 3.8237, + "step": 1706 + }, + { + "epoch": 0.08, + "grad_norm": 0.5916309952735901, + "learning_rate": 0.0005988495584164841, + "loss": 3.5611, + "step": 1707 + }, + { + "epoch": 0.08, + "grad_norm": 0.5889941453933716, + "learning_rate": 0.0005988482109746188, + "loss": 3.6234, + "step": 1708 + }, + { + "epoch": 0.08, + "grad_norm": 0.5852148532867432, + "learning_rate": 0.0005988468627456448, + "loss": 3.5888, + "step": 1709 + }, + { + "epoch": 0.08, + "grad_norm": 0.6306596398353577, + "learning_rate": 0.0005988455137295657, + "loss": 3.9278, + "step": 1710 + }, + { + "epoch": 0.08, + "grad_norm": 0.63728266954422, + "learning_rate": 0.0005988441639263851, + "loss": 3.6708, + "step": 1711 + }, + { + "epoch": 0.08, + "grad_norm": 0.5717198848724365, + "learning_rate": 0.0005988428133361064, + "loss": 3.8049, + "step": 1712 + }, + { + "epoch": 0.08, + "grad_norm": 0.5758329033851624, + "learning_rate": 0.0005988414619587333, + "loss": 3.7912, + "step": 1713 + }, + { + "epoch": 0.08, + "grad_norm": 0.6172877550125122, + "learning_rate": 0.0005988401097942693, + "loss": 3.6147, + "step": 1714 + }, + { + "epoch": 0.08, + "grad_norm": 0.6377825140953064, + "learning_rate": 0.0005988387568427179, + "loss": 3.7007, + "step": 1715 + }, + { + "epoch": 0.08, + "grad_norm": 0.5383907556533813, + "learning_rate": 0.0005988374031040828, + "loss": 3.6447, + "step": 1716 + }, + { + "epoch": 0.08, + "grad_norm": 0.6975224018096924, + "learning_rate": 0.0005988360485783674, + "loss": 3.6552, + "step": 1717 + }, + { + "epoch": 0.08, + "grad_norm": 0.5704695582389832, + "learning_rate": 0.0005988346932655755, + "loss": 3.9312, + "step": 1718 + }, + { + "epoch": 0.08, + "grad_norm": 0.583870530128479, + "learning_rate": 0.0005988333371657104, + "loss": 3.8014, + "step": 1719 + }, + { + "epoch": 0.08, + "grad_norm": 0.5745240449905396, + "learning_rate": 0.0005988319802787759, + "loss": 3.9566, + "step": 1720 + }, + { + "epoch": 0.08, + "grad_norm": 0.5921694040298462, + "learning_rate": 0.0005988306226047754, + "loss": 3.7942, + "step": 1721 + }, + { + "epoch": 0.08, + "grad_norm": 0.586078405380249, + "learning_rate": 0.0005988292641437126, + "loss": 3.8475, + "step": 1722 + }, + { + "epoch": 0.08, + "grad_norm": 0.6041373610496521, + "learning_rate": 0.000598827904895591, + "loss": 3.7767, + "step": 1723 + }, + { + "epoch": 0.08, + "grad_norm": 0.6381456255912781, + "learning_rate": 0.0005988265448604143, + "loss": 3.7119, + "step": 1724 + }, + { + "epoch": 0.08, + "grad_norm": 0.6016688942909241, + "learning_rate": 0.0005988251840381858, + "loss": 3.9402, + "step": 1725 + }, + { + "epoch": 0.08, + "grad_norm": 0.6326636672019958, + "learning_rate": 0.0005988238224289094, + "loss": 3.89, + "step": 1726 + }, + { + "epoch": 0.08, + "grad_norm": 0.5598689317703247, + "learning_rate": 0.0005988224600325886, + "loss": 3.7806, + "step": 1727 + }, + { + "epoch": 0.08, + "grad_norm": 0.6054954528808594, + "learning_rate": 0.0005988210968492268, + "loss": 3.8775, + "step": 1728 + }, + { + "epoch": 0.08, + "grad_norm": 0.6688248515129089, + "learning_rate": 0.0005988197328788279, + "loss": 3.8513, + "step": 1729 + }, + { + "epoch": 0.08, + "grad_norm": 0.6302428841590881, + "learning_rate": 0.0005988183681213953, + "loss": 3.7177, + "step": 1730 + }, + { + "epoch": 0.08, + "grad_norm": 0.6412402391433716, + "learning_rate": 0.0005988170025769326, + "loss": 3.6202, + "step": 1731 + }, + { + "epoch": 0.08, + "grad_norm": 0.6302105784416199, + "learning_rate": 0.0005988156362454434, + "loss": 3.7965, + "step": 1732 + }, + { + "epoch": 0.08, + "grad_norm": 0.5867551565170288, + "learning_rate": 0.0005988142691269314, + "loss": 3.5873, + "step": 1733 + }, + { + "epoch": 0.08, + "grad_norm": 0.670638382434845, + "learning_rate": 0.0005988129012214, + "loss": 3.755, + "step": 1734 + }, + { + "epoch": 0.09, + "grad_norm": 0.6355201601982117, + "learning_rate": 0.000598811532528853, + "loss": 3.9034, + "step": 1735 + }, + { + "epoch": 0.09, + "grad_norm": 0.6599445939064026, + "learning_rate": 0.000598810163049294, + "loss": 4.0809, + "step": 1736 + }, + { + "epoch": 0.09, + "grad_norm": 0.5947389602661133, + "learning_rate": 0.0005988087927827264, + "loss": 3.8277, + "step": 1737 + }, + { + "epoch": 0.09, + "grad_norm": 0.6837261319160461, + "learning_rate": 0.000598807421729154, + "loss": 3.8107, + "step": 1738 + }, + { + "epoch": 0.09, + "grad_norm": 0.6405160427093506, + "learning_rate": 0.0005988060498885804, + "loss": 3.678, + "step": 1739 + }, + { + "epoch": 0.09, + "grad_norm": 0.6684130430221558, + "learning_rate": 0.0005988046772610091, + "loss": 3.8897, + "step": 1740 + }, + { + "epoch": 0.09, + "grad_norm": 0.6249805688858032, + "learning_rate": 0.0005988033038464438, + "loss": 3.9512, + "step": 1741 + }, + { + "epoch": 0.09, + "grad_norm": 0.591820478439331, + "learning_rate": 0.0005988019296448882, + "loss": 3.6929, + "step": 1742 + }, + { + "epoch": 0.09, + "grad_norm": 0.6458953022956848, + "learning_rate": 0.0005988005546563457, + "loss": 3.7441, + "step": 1743 + }, + { + "epoch": 0.09, + "grad_norm": 0.6687983870506287, + "learning_rate": 0.0005987991788808201, + "loss": 3.7556, + "step": 1744 + }, + { + "epoch": 0.09, + "grad_norm": 0.6560901999473572, + "learning_rate": 0.000598797802318315, + "loss": 3.7612, + "step": 1745 + }, + { + "epoch": 0.09, + "grad_norm": 0.6090633273124695, + "learning_rate": 0.000598796424968834, + "loss": 3.8793, + "step": 1746 + }, + { + "epoch": 0.09, + "grad_norm": 0.647350549697876, + "learning_rate": 0.0005987950468323806, + "loss": 3.8172, + "step": 1747 + }, + { + "epoch": 0.09, + "grad_norm": 0.6341491937637329, + "learning_rate": 0.0005987936679089586, + "loss": 3.9446, + "step": 1748 + }, + { + "epoch": 0.09, + "grad_norm": 0.5984036326408386, + "learning_rate": 0.0005987922881985717, + "loss": 3.6023, + "step": 1749 + }, + { + "epoch": 0.09, + "grad_norm": 0.6060883402824402, + "learning_rate": 0.0005987909077012233, + "loss": 3.8801, + "step": 1750 + }, + { + "epoch": 0.09, + "grad_norm": 0.6400834918022156, + "learning_rate": 0.0005987895264169172, + "loss": 3.8829, + "step": 1751 + }, + { + "epoch": 0.09, + "grad_norm": 0.6266987919807434, + "learning_rate": 0.0005987881443456569, + "loss": 3.6213, + "step": 1752 + }, + { + "epoch": 0.09, + "grad_norm": 0.6001605987548828, + "learning_rate": 0.0005987867614874463, + "loss": 3.7277, + "step": 1753 + }, + { + "epoch": 0.09, + "grad_norm": 0.6193922758102417, + "learning_rate": 0.0005987853778422887, + "loss": 3.8871, + "step": 1754 + }, + { + "epoch": 0.09, + "grad_norm": 0.6383598446846008, + "learning_rate": 0.000598783993410188, + "loss": 4.0488, + "step": 1755 + }, + { + "epoch": 0.09, + "grad_norm": 0.5988031029701233, + "learning_rate": 0.0005987826081911478, + "loss": 3.8313, + "step": 1756 + }, + { + "epoch": 0.09, + "grad_norm": 0.6383107304573059, + "learning_rate": 0.0005987812221851716, + "loss": 3.7355, + "step": 1757 + }, + { + "epoch": 0.09, + "grad_norm": 0.5527706146240234, + "learning_rate": 0.0005987798353922632, + "loss": 3.9147, + "step": 1758 + }, + { + "epoch": 0.09, + "grad_norm": 0.5697839856147766, + "learning_rate": 0.0005987784478124263, + "loss": 3.7764, + "step": 1759 + }, + { + "epoch": 0.09, + "grad_norm": 0.6100751757621765, + "learning_rate": 0.0005987770594456644, + "loss": 3.7297, + "step": 1760 + }, + { + "epoch": 0.09, + "grad_norm": 0.5448328256607056, + "learning_rate": 0.0005987756702919813, + "loss": 3.7812, + "step": 1761 + }, + { + "epoch": 0.09, + "grad_norm": 0.6359660625457764, + "learning_rate": 0.0005987742803513805, + "loss": 3.583, + "step": 1762 + }, + { + "epoch": 0.09, + "grad_norm": 0.5905768275260925, + "learning_rate": 0.0005987728896238657, + "loss": 3.7894, + "step": 1763 + }, + { + "epoch": 0.09, + "grad_norm": 0.6355024576187134, + "learning_rate": 0.0005987714981094407, + "loss": 3.7716, + "step": 1764 + }, + { + "epoch": 0.09, + "grad_norm": 0.5862598419189453, + "learning_rate": 0.0005987701058081091, + "loss": 3.7433, + "step": 1765 + }, + { + "epoch": 0.09, + "grad_norm": 0.6261117458343506, + "learning_rate": 0.0005987687127198745, + "loss": 3.8265, + "step": 1766 + }, + { + "epoch": 0.09, + "grad_norm": 0.5825438499450684, + "learning_rate": 0.0005987673188447406, + "loss": 3.598, + "step": 1767 + }, + { + "epoch": 0.09, + "grad_norm": 0.6495029926300049, + "learning_rate": 0.0005987659241827111, + "loss": 3.4557, + "step": 1768 + }, + { + "epoch": 0.09, + "grad_norm": 0.5948395729064941, + "learning_rate": 0.0005987645287337896, + "loss": 3.6985, + "step": 1769 + }, + { + "epoch": 0.09, + "grad_norm": 0.6291310787200928, + "learning_rate": 0.0005987631324979799, + "loss": 3.7719, + "step": 1770 + }, + { + "epoch": 0.09, + "grad_norm": 0.6354256868362427, + "learning_rate": 0.0005987617354752856, + "loss": 3.7825, + "step": 1771 + }, + { + "epoch": 0.09, + "grad_norm": 0.5604045391082764, + "learning_rate": 0.0005987603376657103, + "loss": 3.9863, + "step": 1772 + }, + { + "epoch": 0.09, + "grad_norm": 0.6706216931343079, + "learning_rate": 0.0005987589390692578, + "loss": 3.7109, + "step": 1773 + }, + { + "epoch": 0.09, + "grad_norm": 0.6647695302963257, + "learning_rate": 0.0005987575396859318, + "loss": 3.6884, + "step": 1774 + }, + { + "epoch": 0.09, + "grad_norm": 0.6067073941230774, + "learning_rate": 0.000598756139515736, + "loss": 3.7603, + "step": 1775 + }, + { + "epoch": 0.09, + "grad_norm": 0.6076330542564392, + "learning_rate": 0.0005987547385586739, + "loss": 3.887, + "step": 1776 + }, + { + "epoch": 0.09, + "grad_norm": 0.6393039226531982, + "learning_rate": 0.0005987533368147493, + "loss": 3.9307, + "step": 1777 + }, + { + "epoch": 0.09, + "grad_norm": 0.5665969848632812, + "learning_rate": 0.0005987519342839661, + "loss": 3.8139, + "step": 1778 + }, + { + "epoch": 0.09, + "grad_norm": 0.5718681812286377, + "learning_rate": 0.0005987505309663275, + "loss": 3.8209, + "step": 1779 + }, + { + "epoch": 0.09, + "grad_norm": 0.6644183993339539, + "learning_rate": 0.0005987491268618377, + "loss": 3.6834, + "step": 1780 + }, + { + "epoch": 0.09, + "grad_norm": 0.6793142557144165, + "learning_rate": 0.0005987477219705002, + "loss": 3.6879, + "step": 1781 + }, + { + "epoch": 0.09, + "grad_norm": 0.8465895652770996, + "learning_rate": 0.0005987463162923187, + "loss": 3.7265, + "step": 1782 + }, + { + "epoch": 0.09, + "grad_norm": 0.6196627020835876, + "learning_rate": 0.0005987449098272968, + "loss": 4.0914, + "step": 1783 + }, + { + "epoch": 0.09, + "grad_norm": 0.5777091383934021, + "learning_rate": 0.0005987435025754384, + "loss": 3.7122, + "step": 1784 + }, + { + "epoch": 0.09, + "grad_norm": 0.5573776960372925, + "learning_rate": 0.000598742094536747, + "loss": 4.0411, + "step": 1785 + }, + { + "epoch": 0.09, + "grad_norm": 0.6396626830101013, + "learning_rate": 0.0005987406857112266, + "loss": 3.6454, + "step": 1786 + }, + { + "epoch": 0.09, + "grad_norm": 0.5943922996520996, + "learning_rate": 0.0005987392760988805, + "loss": 3.7657, + "step": 1787 + }, + { + "epoch": 0.09, + "grad_norm": 0.6125734448432922, + "learning_rate": 0.0005987378656997128, + "loss": 3.7891, + "step": 1788 + }, + { + "epoch": 0.09, + "grad_norm": 0.6283426880836487, + "learning_rate": 0.000598736454513727, + "loss": 3.7106, + "step": 1789 + }, + { + "epoch": 0.09, + "grad_norm": 0.6084054708480835, + "learning_rate": 0.0005987350425409268, + "loss": 3.9321, + "step": 1790 + }, + { + "epoch": 0.09, + "grad_norm": 0.5988473296165466, + "learning_rate": 0.0005987336297813161, + "loss": 4.0916, + "step": 1791 + }, + { + "epoch": 0.09, + "grad_norm": 0.650684118270874, + "learning_rate": 0.0005987322162348985, + "loss": 4.0522, + "step": 1792 + }, + { + "epoch": 0.09, + "grad_norm": 0.5723590850830078, + "learning_rate": 0.0005987308019016778, + "loss": 3.8183, + "step": 1793 + }, + { + "epoch": 0.09, + "grad_norm": 0.5800668001174927, + "learning_rate": 0.0005987293867816575, + "loss": 3.7359, + "step": 1794 + }, + { + "epoch": 0.09, + "grad_norm": 0.5976575016975403, + "learning_rate": 0.0005987279708748416, + "loss": 3.9004, + "step": 1795 + }, + { + "epoch": 0.09, + "grad_norm": 0.6423118114471436, + "learning_rate": 0.0005987265541812336, + "loss": 3.5266, + "step": 1796 + }, + { + "epoch": 0.09, + "grad_norm": 0.5797125697135925, + "learning_rate": 0.0005987251367008374, + "loss": 3.9024, + "step": 1797 + }, + { + "epoch": 0.09, + "grad_norm": 0.5562116503715515, + "learning_rate": 0.0005987237184336568, + "loss": 3.6905, + "step": 1798 + }, + { + "epoch": 0.09, + "grad_norm": 0.6045980453491211, + "learning_rate": 0.0005987222993796953, + "loss": 3.6083, + "step": 1799 + }, + { + "epoch": 0.09, + "grad_norm": 0.696354866027832, + "learning_rate": 0.0005987208795389567, + "loss": 3.8496, + "step": 1800 + }, + { + "epoch": 0.09, + "grad_norm": 0.6290586590766907, + "learning_rate": 0.0005987194589114449, + "loss": 3.5814, + "step": 1801 + }, + { + "epoch": 0.09, + "grad_norm": 0.5684269070625305, + "learning_rate": 0.0005987180374971635, + "loss": 3.8016, + "step": 1802 + }, + { + "epoch": 0.09, + "grad_norm": 0.5839782953262329, + "learning_rate": 0.0005987166152961163, + "loss": 3.5221, + "step": 1803 + }, + { + "epoch": 0.09, + "grad_norm": 0.5923112630844116, + "learning_rate": 0.000598715192308307, + "loss": 3.5328, + "step": 1804 + }, + { + "epoch": 0.09, + "grad_norm": 0.5886315703392029, + "learning_rate": 0.0005987137685337393, + "loss": 4.0307, + "step": 1805 + }, + { + "epoch": 0.09, + "grad_norm": 0.6204468607902527, + "learning_rate": 0.0005987123439724172, + "loss": 3.8404, + "step": 1806 + }, + { + "epoch": 0.09, + "grad_norm": 0.5841030478477478, + "learning_rate": 0.0005987109186243441, + "loss": 3.3384, + "step": 1807 + }, + { + "epoch": 0.09, + "grad_norm": 0.5855621099472046, + "learning_rate": 0.0005987094924895241, + "loss": 3.7011, + "step": 1808 + }, + { + "epoch": 0.09, + "grad_norm": 0.6314960718154907, + "learning_rate": 0.0005987080655679607, + "loss": 3.7732, + "step": 1809 + }, + { + "epoch": 0.09, + "grad_norm": 0.5664706826210022, + "learning_rate": 0.0005987066378596577, + "loss": 3.8319, + "step": 1810 + }, + { + "epoch": 0.09, + "grad_norm": 0.6282777786254883, + "learning_rate": 0.0005987052093646189, + "loss": 3.6498, + "step": 1811 + }, + { + "epoch": 0.09, + "grad_norm": 0.5864231586456299, + "learning_rate": 0.0005987037800828482, + "loss": 3.8712, + "step": 1812 + }, + { + "epoch": 0.09, + "grad_norm": 0.6137101054191589, + "learning_rate": 0.0005987023500143491, + "loss": 3.6108, + "step": 1813 + }, + { + "epoch": 0.09, + "grad_norm": 0.6736273169517517, + "learning_rate": 0.0005987009191591254, + "loss": 3.6828, + "step": 1814 + }, + { + "epoch": 0.09, + "grad_norm": 0.6778322458267212, + "learning_rate": 0.0005986994875171811, + "loss": 3.6604, + "step": 1815 + }, + { + "epoch": 0.09, + "grad_norm": 0.6052943468093872, + "learning_rate": 0.0005986980550885198, + "loss": 3.9966, + "step": 1816 + }, + { + "epoch": 0.09, + "grad_norm": 0.6130955815315247, + "learning_rate": 0.0005986966218731454, + "loss": 3.6581, + "step": 1817 + }, + { + "epoch": 0.09, + "grad_norm": 0.61712247133255, + "learning_rate": 0.0005986951878710616, + "loss": 3.704, + "step": 1818 + }, + { + "epoch": 0.09, + "grad_norm": 0.6492055654525757, + "learning_rate": 0.0005986937530822721, + "loss": 3.7532, + "step": 1819 + }, + { + "epoch": 0.09, + "grad_norm": 0.6408119797706604, + "learning_rate": 0.0005986923175067806, + "loss": 3.8313, + "step": 1820 + }, + { + "epoch": 0.09, + "grad_norm": 0.577404797077179, + "learning_rate": 0.0005986908811445911, + "loss": 4.156, + "step": 1821 + }, + { + "epoch": 0.09, + "grad_norm": 0.6154677271842957, + "learning_rate": 0.0005986894439957074, + "loss": 3.8513, + "step": 1822 + }, + { + "epoch": 0.09, + "grad_norm": 0.5824177265167236, + "learning_rate": 0.0005986880060601332, + "loss": 3.7039, + "step": 1823 + }, + { + "epoch": 0.09, + "grad_norm": 0.6346336603164673, + "learning_rate": 0.0005986865673378722, + "loss": 3.801, + "step": 1824 + }, + { + "epoch": 0.09, + "grad_norm": 0.6315222978591919, + "learning_rate": 0.0005986851278289283, + "loss": 3.5617, + "step": 1825 + }, + { + "epoch": 0.09, + "grad_norm": 0.628288209438324, + "learning_rate": 0.0005986836875333052, + "loss": 3.6983, + "step": 1826 + }, + { + "epoch": 0.09, + "grad_norm": 0.6149936318397522, + "learning_rate": 0.0005986822464510067, + "loss": 3.8962, + "step": 1827 + }, + { + "epoch": 0.09, + "grad_norm": 0.5607312917709351, + "learning_rate": 0.0005986808045820367, + "loss": 3.8976, + "step": 1828 + }, + { + "epoch": 0.09, + "grad_norm": 0.6367286443710327, + "learning_rate": 0.000598679361926399, + "loss": 3.7893, + "step": 1829 + }, + { + "epoch": 0.09, + "grad_norm": 0.6114875078201294, + "learning_rate": 0.0005986779184840973, + "loss": 3.5419, + "step": 1830 + }, + { + "epoch": 0.09, + "grad_norm": 0.6330634355545044, + "learning_rate": 0.0005986764742551355, + "loss": 3.9017, + "step": 1831 + }, + { + "epoch": 0.09, + "grad_norm": 0.5696288347244263, + "learning_rate": 0.0005986750292395173, + "loss": 3.6879, + "step": 1832 + }, + { + "epoch": 0.09, + "grad_norm": 0.6309328079223633, + "learning_rate": 0.0005986735834372465, + "loss": 3.8622, + "step": 1833 + }, + { + "epoch": 0.09, + "grad_norm": 0.5509409308433533, + "learning_rate": 0.000598672136848327, + "loss": 3.8122, + "step": 1834 + }, + { + "epoch": 0.09, + "grad_norm": 0.6046048402786255, + "learning_rate": 0.0005986706894727627, + "loss": 3.7887, + "step": 1835 + }, + { + "epoch": 0.09, + "grad_norm": 0.6264524459838867, + "learning_rate": 0.0005986692413105571, + "loss": 3.9682, + "step": 1836 + }, + { + "epoch": 0.09, + "grad_norm": 0.6554989218711853, + "learning_rate": 0.0005986677923617142, + "loss": 4.0354, + "step": 1837 + }, + { + "epoch": 0.09, + "grad_norm": 0.5879736542701721, + "learning_rate": 0.0005986663426262379, + "loss": 3.8353, + "step": 1838 + }, + { + "epoch": 0.09, + "grad_norm": 0.6094560623168945, + "learning_rate": 0.0005986648921041319, + "loss": 3.7207, + "step": 1839 + }, + { + "epoch": 0.09, + "grad_norm": 0.5921733975410461, + "learning_rate": 0.0005986634407954001, + "loss": 3.7837, + "step": 1840 + }, + { + "epoch": 0.09, + "grad_norm": 0.5805111527442932, + "learning_rate": 0.0005986619887000463, + "loss": 3.7929, + "step": 1841 + }, + { + "epoch": 0.09, + "grad_norm": 0.6623033285140991, + "learning_rate": 0.0005986605358180743, + "loss": 3.669, + "step": 1842 + }, + { + "epoch": 0.09, + "grad_norm": 0.5905426144599915, + "learning_rate": 0.0005986590821494878, + "loss": 3.8428, + "step": 1843 + }, + { + "epoch": 0.09, + "grad_norm": 0.6057484149932861, + "learning_rate": 0.0005986576276942908, + "loss": 3.5711, + "step": 1844 + }, + { + "epoch": 0.09, + "grad_norm": 0.5728998184204102, + "learning_rate": 0.0005986561724524871, + "loss": 3.8205, + "step": 1845 + }, + { + "epoch": 0.09, + "grad_norm": 0.6299675703048706, + "learning_rate": 0.0005986547164240805, + "loss": 3.7557, + "step": 1846 + }, + { + "epoch": 0.09, + "grad_norm": 0.6935710906982422, + "learning_rate": 0.000598653259609075, + "loss": 3.6427, + "step": 1847 + }, + { + "epoch": 0.09, + "grad_norm": 0.6175175309181213, + "learning_rate": 0.0005986518020074742, + "loss": 3.7941, + "step": 1848 + }, + { + "epoch": 0.09, + "grad_norm": 0.5965023636817932, + "learning_rate": 0.000598650343619282, + "loss": 3.6033, + "step": 1849 + }, + { + "epoch": 0.09, + "grad_norm": 0.6782153844833374, + "learning_rate": 0.0005986488844445022, + "loss": 3.6862, + "step": 1850 + }, + { + "epoch": 0.09, + "grad_norm": 0.6123353838920593, + "learning_rate": 0.0005986474244831389, + "loss": 3.8051, + "step": 1851 + }, + { + "epoch": 0.09, + "grad_norm": 0.6483194828033447, + "learning_rate": 0.0005986459637351957, + "loss": 3.9932, + "step": 1852 + }, + { + "epoch": 0.09, + "grad_norm": 0.5647997856140137, + "learning_rate": 0.0005986445022006764, + "loss": 3.6815, + "step": 1853 + }, + { + "epoch": 0.09, + "grad_norm": 0.6573106646537781, + "learning_rate": 0.0005986430398795849, + "loss": 3.9537, + "step": 1854 + }, + { + "epoch": 0.09, + "grad_norm": 0.5951365828514099, + "learning_rate": 0.0005986415767719254, + "loss": 3.7697, + "step": 1855 + }, + { + "epoch": 0.09, + "grad_norm": 0.6256296038627625, + "learning_rate": 0.0005986401128777012, + "loss": 3.7458, + "step": 1856 + }, + { + "epoch": 0.09, + "grad_norm": 0.7118420600891113, + "learning_rate": 0.0005986386481969165, + "loss": 3.4327, + "step": 1857 + }, + { + "epoch": 0.09, + "grad_norm": 0.5771075487136841, + "learning_rate": 0.0005986371827295751, + "loss": 3.6589, + "step": 1858 + }, + { + "epoch": 0.09, + "grad_norm": 0.6408962607383728, + "learning_rate": 0.0005986357164756809, + "loss": 3.9137, + "step": 1859 + }, + { + "epoch": 0.09, + "grad_norm": 0.6218885183334351, + "learning_rate": 0.0005986342494352376, + "loss": 3.7506, + "step": 1860 + }, + { + "epoch": 0.09, + "grad_norm": 0.6180911064147949, + "learning_rate": 0.0005986327816082491, + "loss": 3.6067, + "step": 1861 + }, + { + "epoch": 0.09, + "grad_norm": 0.6280100345611572, + "learning_rate": 0.0005986313129947194, + "loss": 3.9251, + "step": 1862 + }, + { + "epoch": 0.09, + "grad_norm": 0.5840473175048828, + "learning_rate": 0.0005986298435946523, + "loss": 3.654, + "step": 1863 + }, + { + "epoch": 0.09, + "grad_norm": 0.5487592220306396, + "learning_rate": 0.0005986283734080517, + "loss": 3.7433, + "step": 1864 + }, + { + "epoch": 0.09, + "grad_norm": 0.6708935499191284, + "learning_rate": 0.0005986269024349213, + "loss": 3.6582, + "step": 1865 + }, + { + "epoch": 0.09, + "grad_norm": 0.5990443825721741, + "learning_rate": 0.0005986254306752653, + "loss": 3.7431, + "step": 1866 + }, + { + "epoch": 0.09, + "grad_norm": 0.6027263402938843, + "learning_rate": 0.0005986239581290873, + "loss": 3.6634, + "step": 1867 + }, + { + "epoch": 0.09, + "grad_norm": 0.6863844394683838, + "learning_rate": 0.0005986224847963912, + "loss": 3.727, + "step": 1868 + }, + { + "epoch": 0.09, + "grad_norm": 0.6209395527839661, + "learning_rate": 0.0005986210106771811, + "loss": 3.853, + "step": 1869 + }, + { + "epoch": 0.09, + "grad_norm": 0.6714705228805542, + "learning_rate": 0.0005986195357714605, + "loss": 3.7967, + "step": 1870 + }, + { + "epoch": 0.09, + "grad_norm": 0.5347830057144165, + "learning_rate": 0.0005986180600792337, + "loss": 3.8287, + "step": 1871 + }, + { + "epoch": 0.09, + "grad_norm": 0.6196743249893188, + "learning_rate": 0.0005986165836005044, + "loss": 3.8296, + "step": 1872 + }, + { + "epoch": 0.09, + "grad_norm": 0.5981053113937378, + "learning_rate": 0.0005986151063352763, + "loss": 3.7604, + "step": 1873 + }, + { + "epoch": 0.09, + "grad_norm": 0.5930126905441284, + "learning_rate": 0.0005986136282835537, + "loss": 3.7541, + "step": 1874 + }, + { + "epoch": 0.09, + "grad_norm": 0.604667067527771, + "learning_rate": 0.0005986121494453401, + "loss": 3.658, + "step": 1875 + }, + { + "epoch": 0.09, + "grad_norm": 0.6272248029708862, + "learning_rate": 0.0005986106698206398, + "loss": 3.6897, + "step": 1876 + }, + { + "epoch": 0.09, + "grad_norm": 0.572104811668396, + "learning_rate": 0.0005986091894094563, + "loss": 3.752, + "step": 1877 + }, + { + "epoch": 0.09, + "grad_norm": 0.6113089323043823, + "learning_rate": 0.0005986077082117936, + "loss": 3.6702, + "step": 1878 + }, + { + "epoch": 0.09, + "grad_norm": 0.6149545311927795, + "learning_rate": 0.0005986062262276559, + "loss": 3.8337, + "step": 1879 + }, + { + "epoch": 0.09, + "grad_norm": 0.5942882299423218, + "learning_rate": 0.0005986047434570466, + "loss": 3.546, + "step": 1880 + }, + { + "epoch": 0.09, + "grad_norm": 0.6164019703865051, + "learning_rate": 0.00059860325989997, + "loss": 3.713, + "step": 1881 + }, + { + "epoch": 0.09, + "grad_norm": 0.6244223713874817, + "learning_rate": 0.0005986017755564299, + "loss": 3.5671, + "step": 1882 + }, + { + "epoch": 0.09, + "grad_norm": 0.6063665747642517, + "learning_rate": 0.0005986002904264301, + "loss": 3.8345, + "step": 1883 + }, + { + "epoch": 0.09, + "grad_norm": 0.5902014374732971, + "learning_rate": 0.0005985988045099747, + "loss": 3.7012, + "step": 1884 + }, + { + "epoch": 0.09, + "grad_norm": 0.5616203546524048, + "learning_rate": 0.0005985973178070675, + "loss": 3.5898, + "step": 1885 + }, + { + "epoch": 0.09, + "grad_norm": 0.610362708568573, + "learning_rate": 0.0005985958303177124, + "loss": 3.7553, + "step": 1886 + }, + { + "epoch": 0.09, + "grad_norm": 0.5817036032676697, + "learning_rate": 0.0005985943420419134, + "loss": 3.6755, + "step": 1887 + }, + { + "epoch": 0.09, + "grad_norm": 0.6104779839515686, + "learning_rate": 0.0005985928529796743, + "loss": 3.6263, + "step": 1888 + }, + { + "epoch": 0.09, + "grad_norm": 0.5836775898933411, + "learning_rate": 0.000598591363130999, + "loss": 3.797, + "step": 1889 + }, + { + "epoch": 0.09, + "grad_norm": 0.60204017162323, + "learning_rate": 0.0005985898724958917, + "loss": 3.6424, + "step": 1890 + }, + { + "epoch": 0.09, + "grad_norm": 0.5760531425476074, + "learning_rate": 0.000598588381074356, + "loss": 3.6849, + "step": 1891 + }, + { + "epoch": 0.09, + "grad_norm": 0.6347927451133728, + "learning_rate": 0.0005985868888663961, + "loss": 3.6574, + "step": 1892 + }, + { + "epoch": 0.09, + "grad_norm": 0.6068776845932007, + "learning_rate": 0.0005985853958720157, + "loss": 4.0513, + "step": 1893 + }, + { + "epoch": 0.09, + "grad_norm": 0.6126516461372375, + "learning_rate": 0.0005985839020912189, + "loss": 3.9252, + "step": 1894 + }, + { + "epoch": 0.09, + "grad_norm": 0.5863696932792664, + "learning_rate": 0.0005985824075240096, + "loss": 3.7798, + "step": 1895 + }, + { + "epoch": 0.09, + "grad_norm": 0.6498505473136902, + "learning_rate": 0.0005985809121703916, + "loss": 3.8625, + "step": 1896 + }, + { + "epoch": 0.09, + "grad_norm": 0.5926463007926941, + "learning_rate": 0.000598579416030369, + "loss": 3.5835, + "step": 1897 + }, + { + "epoch": 0.09, + "grad_norm": 0.6369209885597229, + "learning_rate": 0.0005985779191039457, + "loss": 3.7343, + "step": 1898 + }, + { + "epoch": 0.09, + "grad_norm": 0.5705779194831848, + "learning_rate": 0.0005985764213911255, + "loss": 3.7468, + "step": 1899 + }, + { + "epoch": 0.09, + "grad_norm": 0.5890957713127136, + "learning_rate": 0.0005985749228919126, + "loss": 3.7234, + "step": 1900 + }, + { + "epoch": 0.09, + "grad_norm": 0.6325515508651733, + "learning_rate": 0.0005985734236063108, + "loss": 3.9433, + "step": 1901 + }, + { + "epoch": 0.09, + "grad_norm": 0.6154495477676392, + "learning_rate": 0.000598571923534324, + "loss": 3.8898, + "step": 1902 + }, + { + "epoch": 0.09, + "grad_norm": 0.5961557626724243, + "learning_rate": 0.0005985704226759563, + "loss": 3.7667, + "step": 1903 + }, + { + "epoch": 0.09, + "grad_norm": 0.5520904064178467, + "learning_rate": 0.0005985689210312114, + "loss": 3.959, + "step": 1904 + }, + { + "epoch": 0.09, + "grad_norm": 0.5621737241744995, + "learning_rate": 0.0005985674186000935, + "loss": 3.7459, + "step": 1905 + }, + { + "epoch": 0.09, + "grad_norm": 0.6092246174812317, + "learning_rate": 0.0005985659153826065, + "loss": 3.7063, + "step": 1906 + }, + { + "epoch": 0.09, + "grad_norm": 0.6204909086227417, + "learning_rate": 0.0005985644113787544, + "loss": 3.74, + "step": 1907 + }, + { + "epoch": 0.09, + "grad_norm": 0.618402361869812, + "learning_rate": 0.0005985629065885412, + "loss": 3.6501, + "step": 1908 + }, + { + "epoch": 0.09, + "grad_norm": 0.5384846329689026, + "learning_rate": 0.0005985614010119705, + "loss": 3.7754, + "step": 1909 + }, + { + "epoch": 0.09, + "grad_norm": 0.5484468340873718, + "learning_rate": 0.0005985598946490467, + "loss": 3.6136, + "step": 1910 + }, + { + "epoch": 0.09, + "grad_norm": 0.5607725977897644, + "learning_rate": 0.0005985583874997736, + "loss": 3.765, + "step": 1911 + }, + { + "epoch": 0.09, + "grad_norm": 0.5928760170936584, + "learning_rate": 0.0005985568795641551, + "loss": 3.6978, + "step": 1912 + }, + { + "epoch": 0.09, + "grad_norm": 0.6223922967910767, + "learning_rate": 0.0005985553708421951, + "loss": 3.8356, + "step": 1913 + }, + { + "epoch": 0.09, + "grad_norm": 0.6009296178817749, + "learning_rate": 0.0005985538613338979, + "loss": 3.8473, + "step": 1914 + }, + { + "epoch": 0.09, + "grad_norm": 0.5862912535667419, + "learning_rate": 0.0005985523510392673, + "loss": 3.8365, + "step": 1915 + }, + { + "epoch": 0.09, + "grad_norm": 0.5991998910903931, + "learning_rate": 0.0005985508399583072, + "loss": 3.6978, + "step": 1916 + }, + { + "epoch": 0.09, + "grad_norm": 0.6147690415382385, + "learning_rate": 0.0005985493280910217, + "loss": 3.631, + "step": 1917 + }, + { + "epoch": 0.09, + "grad_norm": 0.6772075295448303, + "learning_rate": 0.0005985478154374147, + "loss": 3.7764, + "step": 1918 + }, + { + "epoch": 0.09, + "grad_norm": 0.6254436373710632, + "learning_rate": 0.0005985463019974901, + "loss": 3.6895, + "step": 1919 + }, + { + "epoch": 0.09, + "grad_norm": 0.5893893837928772, + "learning_rate": 0.0005985447877712521, + "loss": 3.9327, + "step": 1920 + }, + { + "epoch": 0.09, + "grad_norm": 0.6510739922523499, + "learning_rate": 0.0005985432727587045, + "loss": 3.7668, + "step": 1921 + }, + { + "epoch": 0.09, + "grad_norm": 0.6356686949729919, + "learning_rate": 0.0005985417569598515, + "loss": 3.6375, + "step": 1922 + }, + { + "epoch": 0.09, + "grad_norm": 0.7121893763542175, + "learning_rate": 0.0005985402403746969, + "loss": 3.8785, + "step": 1923 + }, + { + "epoch": 0.09, + "grad_norm": 0.6303704977035522, + "learning_rate": 0.0005985387230032449, + "loss": 3.7847, + "step": 1924 + }, + { + "epoch": 0.09, + "grad_norm": 0.6012903451919556, + "learning_rate": 0.0005985372048454992, + "loss": 3.6605, + "step": 1925 + }, + { + "epoch": 0.09, + "grad_norm": 0.631798267364502, + "learning_rate": 0.000598535685901464, + "loss": 3.4997, + "step": 1926 + }, + { + "epoch": 0.09, + "grad_norm": 0.6432697176933289, + "learning_rate": 0.0005985341661711432, + "loss": 3.6908, + "step": 1927 + }, + { + "epoch": 0.09, + "grad_norm": 0.6546198129653931, + "learning_rate": 0.0005985326456545409, + "loss": 3.5806, + "step": 1928 + }, + { + "epoch": 0.09, + "grad_norm": 0.6076374053955078, + "learning_rate": 0.0005985311243516611, + "loss": 3.6955, + "step": 1929 + }, + { + "epoch": 0.09, + "grad_norm": 0.673281192779541, + "learning_rate": 0.0005985296022625078, + "loss": 3.8208, + "step": 1930 + }, + { + "epoch": 0.09, + "grad_norm": 0.614531397819519, + "learning_rate": 0.000598528079387085, + "loss": 3.7834, + "step": 1931 + }, + { + "epoch": 0.09, + "grad_norm": 0.6124190092086792, + "learning_rate": 0.0005985265557253965, + "loss": 4.1048, + "step": 1932 + }, + { + "epoch": 0.09, + "grad_norm": 0.5843042135238647, + "learning_rate": 0.0005985250312774466, + "loss": 3.6868, + "step": 1933 + }, + { + "epoch": 0.09, + "grad_norm": 0.5952600836753845, + "learning_rate": 0.0005985235060432393, + "loss": 3.7835, + "step": 1934 + }, + { + "epoch": 0.09, + "grad_norm": 0.5896327495574951, + "learning_rate": 0.0005985219800227785, + "loss": 3.6072, + "step": 1935 + }, + { + "epoch": 0.09, + "grad_norm": 0.5987282395362854, + "learning_rate": 0.0005985204532160683, + "loss": 3.8684, + "step": 1936 + }, + { + "epoch": 0.09, + "grad_norm": 0.661604642868042, + "learning_rate": 0.0005985189256231125, + "loss": 3.6414, + "step": 1937 + }, + { + "epoch": 0.09, + "grad_norm": 0.6821339726448059, + "learning_rate": 0.0005985173972439154, + "loss": 3.9158, + "step": 1938 + }, + { + "epoch": 0.1, + "grad_norm": 0.5843617916107178, + "learning_rate": 0.000598515868078481, + "loss": 3.8082, + "step": 1939 + }, + { + "epoch": 0.1, + "grad_norm": 0.5790466070175171, + "learning_rate": 0.0005985143381268132, + "loss": 3.8143, + "step": 1940 + }, + { + "epoch": 0.1, + "grad_norm": 0.639920175075531, + "learning_rate": 0.0005985128073889161, + "loss": 3.7693, + "step": 1941 + }, + { + "epoch": 0.1, + "grad_norm": 0.615778386592865, + "learning_rate": 0.0005985112758647937, + "loss": 3.6194, + "step": 1942 + }, + { + "epoch": 0.1, + "grad_norm": 0.6140244007110596, + "learning_rate": 0.00059850974355445, + "loss": 3.5417, + "step": 1943 + }, + { + "epoch": 0.1, + "grad_norm": 0.6662708520889282, + "learning_rate": 0.0005985082104578892, + "loss": 3.5064, + "step": 1944 + }, + { + "epoch": 0.1, + "grad_norm": 0.6774072051048279, + "learning_rate": 0.0005985066765751151, + "loss": 3.8021, + "step": 1945 + }, + { + "epoch": 0.1, + "grad_norm": 0.6490411758422852, + "learning_rate": 0.000598505141906132, + "loss": 3.5669, + "step": 1946 + }, + { + "epoch": 0.1, + "grad_norm": 0.6608024835586548, + "learning_rate": 0.0005985036064509437, + "loss": 3.6388, + "step": 1947 + }, + { + "epoch": 0.1, + "grad_norm": 0.5928901433944702, + "learning_rate": 0.0005985020702095543, + "loss": 3.6946, + "step": 1948 + }, + { + "epoch": 0.1, + "grad_norm": 0.6092929840087891, + "learning_rate": 0.000598500533181968, + "loss": 3.5945, + "step": 1949 + }, + { + "epoch": 0.1, + "grad_norm": 0.6284611821174622, + "learning_rate": 0.0005984989953681887, + "loss": 3.932, + "step": 1950 + }, + { + "epoch": 0.1, + "grad_norm": 0.5998930335044861, + "learning_rate": 0.0005984974567682205, + "loss": 3.8786, + "step": 1951 + }, + { + "epoch": 0.1, + "grad_norm": 0.6137663125991821, + "learning_rate": 0.0005984959173820674, + "loss": 3.6663, + "step": 1952 + }, + { + "epoch": 0.1, + "grad_norm": 0.6030510663986206, + "learning_rate": 0.0005984943772097337, + "loss": 3.7237, + "step": 1953 + }, + { + "epoch": 0.1, + "grad_norm": 0.5598939061164856, + "learning_rate": 0.0005984928362512231, + "loss": 3.652, + "step": 1954 + }, + { + "epoch": 0.1, + "grad_norm": 0.6012648940086365, + "learning_rate": 0.0005984912945065397, + "loss": 3.7399, + "step": 1955 + }, + { + "epoch": 0.1, + "grad_norm": 0.5890420079231262, + "learning_rate": 0.0005984897519756879, + "loss": 3.4773, + "step": 1956 + }, + { + "epoch": 0.1, + "grad_norm": 0.6079639792442322, + "learning_rate": 0.0005984882086586714, + "loss": 3.8788, + "step": 1957 + }, + { + "epoch": 0.1, + "grad_norm": 0.6101313233375549, + "learning_rate": 0.0005984866645554945, + "loss": 3.924, + "step": 1958 + }, + { + "epoch": 0.1, + "grad_norm": 0.5866898894309998, + "learning_rate": 0.000598485119666161, + "loss": 3.6906, + "step": 1959 + }, + { + "epoch": 0.1, + "grad_norm": 0.7030449509620667, + "learning_rate": 0.0005984835739906753, + "loss": 3.7014, + "step": 1960 + }, + { + "epoch": 0.1, + "grad_norm": 0.6568990349769592, + "learning_rate": 0.0005984820275290413, + "loss": 3.7585, + "step": 1961 + }, + { + "epoch": 0.1, + "grad_norm": 0.5689438581466675, + "learning_rate": 0.000598480480281263, + "loss": 3.6166, + "step": 1962 + }, + { + "epoch": 0.1, + "grad_norm": 0.6452824473381042, + "learning_rate": 0.0005984789322473446, + "loss": 3.7331, + "step": 1963 + }, + { + "epoch": 0.1, + "grad_norm": 0.5881951451301575, + "learning_rate": 0.0005984773834272902, + "loss": 3.7973, + "step": 1964 + }, + { + "epoch": 0.1, + "grad_norm": 0.5840572714805603, + "learning_rate": 0.0005984758338211037, + "loss": 3.763, + "step": 1965 + }, + { + "epoch": 0.1, + "grad_norm": 0.5961487889289856, + "learning_rate": 0.0005984742834287894, + "loss": 3.6024, + "step": 1966 + }, + { + "epoch": 0.1, + "grad_norm": 0.5628345608711243, + "learning_rate": 0.0005984727322503512, + "loss": 3.8128, + "step": 1967 + }, + { + "epoch": 0.1, + "grad_norm": 0.5680345296859741, + "learning_rate": 0.0005984711802857933, + "loss": 3.8158, + "step": 1968 + }, + { + "epoch": 0.1, + "grad_norm": 0.5904361605644226, + "learning_rate": 0.0005984696275351198, + "loss": 3.6442, + "step": 1969 + }, + { + "epoch": 0.1, + "grad_norm": 0.6462759971618652, + "learning_rate": 0.0005984680739983346, + "loss": 3.826, + "step": 1970 + }, + { + "epoch": 0.1, + "grad_norm": 0.5882953405380249, + "learning_rate": 0.0005984665196754421, + "loss": 3.7568, + "step": 1971 + }, + { + "epoch": 0.1, + "grad_norm": 0.5906187891960144, + "learning_rate": 0.0005984649645664461, + "loss": 3.7037, + "step": 1972 + }, + { + "epoch": 0.1, + "grad_norm": 0.5672358870506287, + "learning_rate": 0.0005984634086713509, + "loss": 3.8733, + "step": 1973 + }, + { + "epoch": 0.1, + "grad_norm": 0.6111389994621277, + "learning_rate": 0.0005984618519901605, + "loss": 3.617, + "step": 1974 + }, + { + "epoch": 0.1, + "grad_norm": 0.6110372543334961, + "learning_rate": 0.000598460294522879, + "loss": 3.7275, + "step": 1975 + }, + { + "epoch": 0.1, + "grad_norm": 0.5851089954376221, + "learning_rate": 0.0005984587362695105, + "loss": 3.6567, + "step": 1976 + }, + { + "epoch": 0.1, + "grad_norm": 0.6242319345474243, + "learning_rate": 0.0005984571772300592, + "loss": 3.698, + "step": 1977 + }, + { + "epoch": 0.1, + "grad_norm": 0.5997954607009888, + "learning_rate": 0.0005984556174045292, + "loss": 3.5633, + "step": 1978 + }, + { + "epoch": 0.1, + "grad_norm": 0.6077648401260376, + "learning_rate": 0.0005984540567929244, + "loss": 3.4955, + "step": 1979 + }, + { + "epoch": 0.1, + "grad_norm": 0.6089391112327576, + "learning_rate": 0.0005984524953952492, + "loss": 3.8301, + "step": 1980 + }, + { + "epoch": 0.1, + "grad_norm": 0.6587269902229309, + "learning_rate": 0.0005984509332115074, + "loss": 3.6826, + "step": 1981 + }, + { + "epoch": 0.1, + "grad_norm": 0.6356779336929321, + "learning_rate": 0.0005984493702417033, + "loss": 3.7191, + "step": 1982 + }, + { + "epoch": 0.1, + "grad_norm": 0.6114301085472107, + "learning_rate": 0.0005984478064858411, + "loss": 3.8376, + "step": 1983 + }, + { + "epoch": 0.1, + "grad_norm": 0.6225729584693909, + "learning_rate": 0.0005984462419439248, + "loss": 3.5604, + "step": 1984 + }, + { + "epoch": 0.1, + "grad_norm": 0.5687344074249268, + "learning_rate": 0.0005984446766159585, + "loss": 3.8095, + "step": 1985 + }, + { + "epoch": 0.1, + "grad_norm": 0.6089506149291992, + "learning_rate": 0.0005984431105019463, + "loss": 3.5823, + "step": 1986 + }, + { + "epoch": 0.1, + "grad_norm": 0.6365668177604675, + "learning_rate": 0.0005984415436018925, + "loss": 3.8444, + "step": 1987 + }, + { + "epoch": 0.1, + "grad_norm": 0.6325536370277405, + "learning_rate": 0.000598439975915801, + "loss": 3.8402, + "step": 1988 + }, + { + "epoch": 0.1, + "grad_norm": 0.6012512445449829, + "learning_rate": 0.0005984384074436761, + "loss": 3.7274, + "step": 1989 + }, + { + "epoch": 0.1, + "grad_norm": 0.5962508916854858, + "learning_rate": 0.0005984368381855219, + "loss": 3.6733, + "step": 1990 + }, + { + "epoch": 0.1, + "grad_norm": 0.618022084236145, + "learning_rate": 0.0005984352681413424, + "loss": 3.7132, + "step": 1991 + }, + { + "epoch": 0.1, + "grad_norm": 0.5967841148376465, + "learning_rate": 0.0005984336973111419, + "loss": 3.9452, + "step": 1992 + }, + { + "epoch": 0.1, + "grad_norm": 0.590883195400238, + "learning_rate": 0.0005984321256949245, + "loss": 3.5217, + "step": 1993 + }, + { + "epoch": 0.1, + "grad_norm": 0.5896095633506775, + "learning_rate": 0.0005984305532926943, + "loss": 3.7342, + "step": 1994 + }, + { + "epoch": 0.1, + "grad_norm": 0.5817958116531372, + "learning_rate": 0.0005984289801044554, + "loss": 3.6573, + "step": 1995 + }, + { + "epoch": 0.1, + "grad_norm": 0.586776077747345, + "learning_rate": 0.000598427406130212, + "loss": 3.7158, + "step": 1996 + }, + { + "epoch": 0.1, + "grad_norm": 0.61875981092453, + "learning_rate": 0.0005984258313699684, + "loss": 3.688, + "step": 1997 + }, + { + "epoch": 0.1, + "grad_norm": 0.5944945216178894, + "learning_rate": 0.0005984242558237285, + "loss": 3.8244, + "step": 1998 + }, + { + "epoch": 0.1, + "grad_norm": 0.62650465965271, + "learning_rate": 0.0005984226794914965, + "loss": 3.7835, + "step": 1999 + }, + { + "epoch": 0.1, + "grad_norm": 0.6252263188362122, + "learning_rate": 0.0005984211023732767, + "loss": 3.6683, + "step": 2000 + }, + { + "epoch": 0.1, + "grad_norm": 0.6127182841300964, + "learning_rate": 0.000598419524469073, + "loss": 3.7427, + "step": 2001 + }, + { + "epoch": 0.1, + "grad_norm": 0.5782539248466492, + "learning_rate": 0.0005984179457788897, + "loss": 3.529, + "step": 2002 + }, + { + "epoch": 0.1, + "grad_norm": 0.6279388666152954, + "learning_rate": 0.0005984163663027311, + "loss": 3.4904, + "step": 2003 + }, + { + "epoch": 0.1, + "grad_norm": 0.6365188360214233, + "learning_rate": 0.0005984147860406012, + "loss": 3.75, + "step": 2004 + }, + { + "epoch": 0.1, + "grad_norm": 0.6217809319496155, + "learning_rate": 0.0005984132049925041, + "loss": 3.6439, + "step": 2005 + }, + { + "epoch": 0.1, + "grad_norm": 0.6590561270713806, + "learning_rate": 0.0005984116231584441, + "loss": 3.7347, + "step": 2006 + }, + { + "epoch": 0.1, + "grad_norm": 0.5828232169151306, + "learning_rate": 0.0005984100405384253, + "loss": 3.7325, + "step": 2007 + }, + { + "epoch": 0.1, + "grad_norm": 0.5944784879684448, + "learning_rate": 0.0005984084571324519, + "loss": 3.7656, + "step": 2008 + }, + { + "epoch": 0.1, + "grad_norm": 0.6109013557434082, + "learning_rate": 0.0005984068729405281, + "loss": 4.0064, + "step": 2009 + }, + { + "epoch": 0.1, + "grad_norm": 0.6408974528312683, + "learning_rate": 0.000598405287962658, + "loss": 3.7111, + "step": 2010 + }, + { + "epoch": 0.1, + "grad_norm": 0.6209164261817932, + "learning_rate": 0.0005984037021988458, + "loss": 3.8332, + "step": 2011 + }, + { + "epoch": 0.1, + "grad_norm": 0.6082508563995361, + "learning_rate": 0.0005984021156490956, + "loss": 3.6786, + "step": 2012 + }, + { + "epoch": 0.1, + "grad_norm": 0.6087662577629089, + "learning_rate": 0.0005984005283134117, + "loss": 3.8417, + "step": 2013 + }, + { + "epoch": 0.1, + "grad_norm": 0.6204620599746704, + "learning_rate": 0.0005983989401917982, + "loss": 3.5133, + "step": 2014 + }, + { + "epoch": 0.1, + "grad_norm": 0.5919209122657776, + "learning_rate": 0.0005983973512842595, + "loss": 3.7543, + "step": 2015 + }, + { + "epoch": 0.1, + "grad_norm": 0.6268076300621033, + "learning_rate": 0.0005983957615907995, + "loss": 3.7173, + "step": 2016 + }, + { + "epoch": 0.1, + "grad_norm": 0.6411733031272888, + "learning_rate": 0.0005983941711114224, + "loss": 3.8086, + "step": 2017 + }, + { + "epoch": 0.1, + "grad_norm": 0.5614747405052185, + "learning_rate": 0.0005983925798461325, + "loss": 3.7973, + "step": 2018 + }, + { + "epoch": 0.1, + "grad_norm": 0.6606065630912781, + "learning_rate": 0.000598390987794934, + "loss": 3.6554, + "step": 2019 + }, + { + "epoch": 0.1, + "grad_norm": 0.5416567325592041, + "learning_rate": 0.0005983893949578311, + "loss": 3.9936, + "step": 2020 + }, + { + "epoch": 0.1, + "grad_norm": 0.7004711627960205, + "learning_rate": 0.000598387801334828, + "loss": 3.8626, + "step": 2021 + }, + { + "epoch": 0.1, + "grad_norm": 0.572654664516449, + "learning_rate": 0.0005983862069259288, + "loss": 3.517, + "step": 2022 + }, + { + "epoch": 0.1, + "grad_norm": 0.6083114147186279, + "learning_rate": 0.0005983846117311377, + "loss": 3.6627, + "step": 2023 + }, + { + "epoch": 0.1, + "grad_norm": 0.639316976070404, + "learning_rate": 0.0005983830157504591, + "loss": 3.6865, + "step": 2024 + }, + { + "epoch": 0.1, + "grad_norm": 0.715901792049408, + "learning_rate": 0.0005983814189838969, + "loss": 4.052, + "step": 2025 + }, + { + "epoch": 0.1, + "grad_norm": 0.581177830696106, + "learning_rate": 0.0005983798214314555, + "loss": 3.903, + "step": 2026 + }, + { + "epoch": 0.1, + "grad_norm": 0.5649012327194214, + "learning_rate": 0.0005983782230931391, + "loss": 3.772, + "step": 2027 + }, + { + "epoch": 0.1, + "grad_norm": 0.6346585154533386, + "learning_rate": 0.000598376623968952, + "loss": 3.5737, + "step": 2028 + }, + { + "epoch": 0.1, + "grad_norm": 0.70558762550354, + "learning_rate": 0.0005983750240588982, + "loss": 3.7745, + "step": 2029 + }, + { + "epoch": 0.1, + "grad_norm": 0.6112974286079407, + "learning_rate": 0.000598373423362982, + "loss": 3.7495, + "step": 2030 + }, + { + "epoch": 0.1, + "grad_norm": 0.5762995481491089, + "learning_rate": 0.0005983718218812075, + "loss": 3.8097, + "step": 2031 + }, + { + "epoch": 0.1, + "grad_norm": 0.5994111895561218, + "learning_rate": 0.0005983702196135793, + "loss": 3.5908, + "step": 2032 + }, + { + "epoch": 0.1, + "grad_norm": 0.6250690221786499, + "learning_rate": 0.0005983686165601012, + "loss": 3.8148, + "step": 2033 + }, + { + "epoch": 0.1, + "grad_norm": 0.5601446628570557, + "learning_rate": 0.0005983670127207777, + "loss": 3.8132, + "step": 2034 + }, + { + "epoch": 0.1, + "grad_norm": 0.619858980178833, + "learning_rate": 0.0005983654080956128, + "loss": 3.698, + "step": 2035 + }, + { + "epoch": 0.1, + "grad_norm": 0.6175944805145264, + "learning_rate": 0.0005983638026846108, + "loss": 3.7716, + "step": 2036 + }, + { + "epoch": 0.1, + "grad_norm": 0.6550002098083496, + "learning_rate": 0.0005983621964877761, + "loss": 3.9279, + "step": 2037 + }, + { + "epoch": 0.1, + "grad_norm": 0.5929962396621704, + "learning_rate": 0.0005983605895051127, + "loss": 3.7389, + "step": 2038 + }, + { + "epoch": 0.1, + "grad_norm": 0.6178693771362305, + "learning_rate": 0.0005983589817366249, + "loss": 3.6407, + "step": 2039 + }, + { + "epoch": 0.1, + "grad_norm": 0.6257408857345581, + "learning_rate": 0.0005983573731823172, + "loss": 3.8194, + "step": 2040 + }, + { + "epoch": 0.1, + "grad_norm": 0.6745170950889587, + "learning_rate": 0.0005983557638421933, + "loss": 3.7114, + "step": 2041 + }, + { + "epoch": 0.1, + "grad_norm": 0.6017802357673645, + "learning_rate": 0.000598354153716258, + "loss": 3.6747, + "step": 2042 + }, + { + "epoch": 0.1, + "grad_norm": 0.5711172223091125, + "learning_rate": 0.000598352542804515, + "loss": 3.7881, + "step": 2043 + }, + { + "epoch": 0.1, + "grad_norm": 0.6025456786155701, + "learning_rate": 0.000598350931106969, + "loss": 3.7451, + "step": 2044 + }, + { + "epoch": 0.1, + "grad_norm": 0.6247069239616394, + "learning_rate": 0.0005983493186236241, + "loss": 3.5347, + "step": 2045 + }, + { + "epoch": 0.1, + "grad_norm": 0.6139021515846252, + "learning_rate": 0.0005983477053544843, + "loss": 3.7963, + "step": 2046 + }, + { + "epoch": 0.1, + "grad_norm": 0.5898500084877014, + "learning_rate": 0.0005983460912995542, + "loss": 3.6405, + "step": 2047 + }, + { + "epoch": 0.1, + "grad_norm": 0.5590971112251282, + "learning_rate": 0.000598344476458838, + "loss": 3.8969, + "step": 2048 + }, + { + "epoch": 0.1, + "grad_norm": 0.613969624042511, + "learning_rate": 0.0005983428608323397, + "loss": 3.7783, + "step": 2049 + }, + { + "epoch": 0.1, + "grad_norm": 0.6687971353530884, + "learning_rate": 0.0005983412444200639, + "loss": 3.541, + "step": 2050 + }, + { + "epoch": 0.1, + "grad_norm": 0.5752463340759277, + "learning_rate": 0.0005983396272220146, + "loss": 3.7437, + "step": 2051 + }, + { + "epoch": 0.1, + "grad_norm": 0.6484838724136353, + "learning_rate": 0.000598338009238196, + "loss": 3.5128, + "step": 2052 + }, + { + "epoch": 0.1, + "grad_norm": 0.6085900664329529, + "learning_rate": 0.0005983363904686127, + "loss": 3.8193, + "step": 2053 + }, + { + "epoch": 0.1, + "grad_norm": 0.6057000756263733, + "learning_rate": 0.0005983347709132686, + "loss": 3.567, + "step": 2054 + }, + { + "epoch": 0.1, + "grad_norm": 0.580085039138794, + "learning_rate": 0.0005983331505721683, + "loss": 3.9294, + "step": 2055 + }, + { + "epoch": 0.1, + "grad_norm": 0.5920203924179077, + "learning_rate": 0.0005983315294453158, + "loss": 3.8553, + "step": 2056 + }, + { + "epoch": 0.1, + "grad_norm": 0.5826650857925415, + "learning_rate": 0.0005983299075327155, + "loss": 3.6751, + "step": 2057 + }, + { + "epoch": 0.1, + "grad_norm": 0.6206904053688049, + "learning_rate": 0.0005983282848343717, + "loss": 3.8432, + "step": 2058 + }, + { + "epoch": 0.1, + "grad_norm": 0.5721469521522522, + "learning_rate": 0.0005983266613502885, + "loss": 3.9363, + "step": 2059 + }, + { + "epoch": 0.1, + "grad_norm": 0.545362114906311, + "learning_rate": 0.0005983250370804702, + "loss": 3.6203, + "step": 2060 + }, + { + "epoch": 0.1, + "grad_norm": 0.5554171800613403, + "learning_rate": 0.0005983234120249213, + "loss": 3.9277, + "step": 2061 + }, + { + "epoch": 0.1, + "grad_norm": 0.606959879398346, + "learning_rate": 0.0005983217861836459, + "loss": 3.5465, + "step": 2062 + }, + { + "epoch": 0.1, + "grad_norm": 0.5941494703292847, + "learning_rate": 0.0005983201595566484, + "loss": 3.9184, + "step": 2063 + }, + { + "epoch": 0.1, + "grad_norm": 0.6147699952125549, + "learning_rate": 0.000598318532143933, + "loss": 3.7999, + "step": 2064 + }, + { + "epoch": 0.1, + "grad_norm": 0.570624589920044, + "learning_rate": 0.000598316903945504, + "loss": 3.6566, + "step": 2065 + }, + { + "epoch": 0.1, + "grad_norm": 0.5827869772911072, + "learning_rate": 0.0005983152749613656, + "loss": 3.7424, + "step": 2066 + }, + { + "epoch": 0.1, + "grad_norm": 0.6157439351081848, + "learning_rate": 0.0005983136451915222, + "loss": 3.569, + "step": 2067 + }, + { + "epoch": 0.1, + "grad_norm": 0.6273912787437439, + "learning_rate": 0.0005983120146359781, + "loss": 3.8541, + "step": 2068 + }, + { + "epoch": 0.1, + "grad_norm": 0.589424192905426, + "learning_rate": 0.0005983103832947376, + "loss": 3.7674, + "step": 2069 + }, + { + "epoch": 0.1, + "grad_norm": 0.5925363898277283, + "learning_rate": 0.0005983087511678049, + "loss": 3.7414, + "step": 2070 + }, + { + "epoch": 0.1, + "grad_norm": 0.6019349098205566, + "learning_rate": 0.0005983071182551845, + "loss": 3.8578, + "step": 2071 + }, + { + "epoch": 0.1, + "grad_norm": 0.5667245984077454, + "learning_rate": 0.0005983054845568803, + "loss": 3.7414, + "step": 2072 + }, + { + "epoch": 0.1, + "grad_norm": 0.5880571007728577, + "learning_rate": 0.0005983038500728971, + "loss": 3.8056, + "step": 2073 + }, + { + "epoch": 0.1, + "grad_norm": 0.6357612013816833, + "learning_rate": 0.0005983022148032389, + "loss": 3.624, + "step": 2074 + }, + { + "epoch": 0.1, + "grad_norm": 0.6150004267692566, + "learning_rate": 0.00059830057874791, + "loss": 3.6649, + "step": 2075 + }, + { + "epoch": 0.1, + "grad_norm": 0.5906575918197632, + "learning_rate": 0.0005982989419069148, + "loss": 3.6441, + "step": 2076 + }, + { + "epoch": 0.1, + "grad_norm": 0.6579732298851013, + "learning_rate": 0.0005982973042802578, + "loss": 3.7872, + "step": 2077 + }, + { + "epoch": 0.1, + "grad_norm": 0.5626339912414551, + "learning_rate": 0.0005982956658679429, + "loss": 3.6208, + "step": 2078 + }, + { + "epoch": 0.1, + "grad_norm": 0.6239688396453857, + "learning_rate": 0.0005982940266699747, + "loss": 3.5179, + "step": 2079 + }, + { + "epoch": 0.1, + "grad_norm": 0.5556694269180298, + "learning_rate": 0.0005982923866863574, + "loss": 3.732, + "step": 2080 + }, + { + "epoch": 0.1, + "grad_norm": 0.6616612672805786, + "learning_rate": 0.0005982907459170954, + "loss": 3.6723, + "step": 2081 + }, + { + "epoch": 0.1, + "grad_norm": 0.592753529548645, + "learning_rate": 0.0005982891043621929, + "loss": 3.8336, + "step": 2082 + }, + { + "epoch": 0.1, + "grad_norm": 0.6064931750297546, + "learning_rate": 0.0005982874620216543, + "loss": 3.6037, + "step": 2083 + }, + { + "epoch": 0.1, + "grad_norm": 0.6325314044952393, + "learning_rate": 0.000598285818895484, + "loss": 3.5522, + "step": 2084 + }, + { + "epoch": 0.1, + "grad_norm": 0.6174896955490112, + "learning_rate": 0.0005982841749836863, + "loss": 3.8025, + "step": 2085 + }, + { + "epoch": 0.1, + "grad_norm": 0.5929061770439148, + "learning_rate": 0.0005982825302862654, + "loss": 3.7384, + "step": 2086 + }, + { + "epoch": 0.1, + "grad_norm": 0.597075879573822, + "learning_rate": 0.0005982808848032258, + "loss": 3.8178, + "step": 2087 + }, + { + "epoch": 0.1, + "grad_norm": 0.5691017508506775, + "learning_rate": 0.0005982792385345717, + "loss": 3.6708, + "step": 2088 + }, + { + "epoch": 0.1, + "grad_norm": 0.6304391026496887, + "learning_rate": 0.0005982775914803075, + "loss": 3.8078, + "step": 2089 + }, + { + "epoch": 0.1, + "grad_norm": 0.6006346344947815, + "learning_rate": 0.0005982759436404376, + "loss": 3.7204, + "step": 2090 + }, + { + "epoch": 0.1, + "grad_norm": 0.6283980011940002, + "learning_rate": 0.0005982742950149661, + "loss": 3.4926, + "step": 2091 + }, + { + "epoch": 0.1, + "grad_norm": 0.6308605074882507, + "learning_rate": 0.0005982726456038977, + "loss": 3.7156, + "step": 2092 + }, + { + "epoch": 0.1, + "grad_norm": 0.6353732943534851, + "learning_rate": 0.0005982709954072365, + "loss": 3.6275, + "step": 2093 + }, + { + "epoch": 0.1, + "grad_norm": 0.7076538801193237, + "learning_rate": 0.0005982693444249868, + "loss": 3.7353, + "step": 2094 + }, + { + "epoch": 0.1, + "grad_norm": 0.5855659246444702, + "learning_rate": 0.0005982676926571532, + "loss": 3.555, + "step": 2095 + }, + { + "epoch": 0.1, + "grad_norm": 0.6150306463241577, + "learning_rate": 0.0005982660401037398, + "loss": 3.6312, + "step": 2096 + }, + { + "epoch": 0.1, + "grad_norm": 0.6198219656944275, + "learning_rate": 0.000598264386764751, + "loss": 3.7313, + "step": 2097 + }, + { + "epoch": 0.1, + "grad_norm": 0.6115104556083679, + "learning_rate": 0.0005982627326401914, + "loss": 3.8263, + "step": 2098 + }, + { + "epoch": 0.1, + "grad_norm": 0.6037588119506836, + "learning_rate": 0.000598261077730065, + "loss": 3.6085, + "step": 2099 + }, + { + "epoch": 0.1, + "grad_norm": 0.6247969269752502, + "learning_rate": 0.0005982594220343764, + "loss": 3.7709, + "step": 2100 + }, + { + "epoch": 0.1, + "grad_norm": 0.6451380848884583, + "learning_rate": 0.0005982577655531298, + "loss": 3.8026, + "step": 2101 + }, + { + "epoch": 0.1, + "grad_norm": 0.6023620963096619, + "learning_rate": 0.0005982561082863298, + "loss": 3.5155, + "step": 2102 + }, + { + "epoch": 0.1, + "grad_norm": 0.6898672580718994, + "learning_rate": 0.0005982544502339805, + "loss": 3.7734, + "step": 2103 + }, + { + "epoch": 0.1, + "grad_norm": 0.620043933391571, + "learning_rate": 0.0005982527913960863, + "loss": 3.7091, + "step": 2104 + }, + { + "epoch": 0.1, + "grad_norm": 0.6267081499099731, + "learning_rate": 0.0005982511317726518, + "loss": 3.7821, + "step": 2105 + }, + { + "epoch": 0.1, + "grad_norm": 0.5801389813423157, + "learning_rate": 0.0005982494713636812, + "loss": 3.5639, + "step": 2106 + }, + { + "epoch": 0.1, + "grad_norm": 0.5749569535255432, + "learning_rate": 0.0005982478101691788, + "loss": 3.7611, + "step": 2107 + }, + { + "epoch": 0.1, + "grad_norm": 0.6347259283065796, + "learning_rate": 0.0005982461481891491, + "loss": 3.6399, + "step": 2108 + }, + { + "epoch": 0.1, + "grad_norm": 0.6066511869430542, + "learning_rate": 0.0005982444854235964, + "loss": 3.6464, + "step": 2109 + }, + { + "epoch": 0.1, + "grad_norm": 0.6020014882087708, + "learning_rate": 0.0005982428218725252, + "loss": 3.5554, + "step": 2110 + }, + { + "epoch": 0.1, + "grad_norm": 0.6045134663581848, + "learning_rate": 0.0005982411575359398, + "loss": 3.835, + "step": 2111 + }, + { + "epoch": 0.1, + "grad_norm": 0.6153691411018372, + "learning_rate": 0.0005982394924138446, + "loss": 3.2436, + "step": 2112 + }, + { + "epoch": 0.1, + "grad_norm": 0.608252227306366, + "learning_rate": 0.0005982378265062439, + "loss": 3.7666, + "step": 2113 + }, + { + "epoch": 0.1, + "grad_norm": 0.5522031784057617, + "learning_rate": 0.0005982361598131422, + "loss": 3.5962, + "step": 2114 + }, + { + "epoch": 0.1, + "grad_norm": 0.6099326014518738, + "learning_rate": 0.0005982344923345439, + "loss": 3.5498, + "step": 2115 + }, + { + "epoch": 0.1, + "grad_norm": 0.6369093060493469, + "learning_rate": 0.0005982328240704533, + "loss": 3.6785, + "step": 2116 + }, + { + "epoch": 0.1, + "grad_norm": 0.5921229720115662, + "learning_rate": 0.0005982311550208749, + "loss": 3.7165, + "step": 2117 + }, + { + "epoch": 0.1, + "grad_norm": 0.6032543778419495, + "learning_rate": 0.000598229485185813, + "loss": 3.6678, + "step": 2118 + }, + { + "epoch": 0.1, + "grad_norm": 0.616646409034729, + "learning_rate": 0.000598227814565272, + "loss": 3.6858, + "step": 2119 + }, + { + "epoch": 0.1, + "grad_norm": 0.610694944858551, + "learning_rate": 0.0005982261431592564, + "loss": 3.5347, + "step": 2120 + }, + { + "epoch": 0.1, + "grad_norm": 0.5749186873435974, + "learning_rate": 0.0005982244709677704, + "loss": 3.8255, + "step": 2121 + }, + { + "epoch": 0.1, + "grad_norm": 0.5923564434051514, + "learning_rate": 0.0005982227979908186, + "loss": 3.4269, + "step": 2122 + }, + { + "epoch": 0.1, + "grad_norm": 0.6011732816696167, + "learning_rate": 0.0005982211242284054, + "loss": 3.4419, + "step": 2123 + }, + { + "epoch": 0.1, + "grad_norm": 0.5927369594573975, + "learning_rate": 0.0005982194496805351, + "loss": 3.6693, + "step": 2124 + }, + { + "epoch": 0.1, + "grad_norm": 0.575406551361084, + "learning_rate": 0.0005982177743472122, + "loss": 3.688, + "step": 2125 + }, + { + "epoch": 0.1, + "grad_norm": 0.6836567521095276, + "learning_rate": 0.000598216098228441, + "loss": 3.7712, + "step": 2126 + }, + { + "epoch": 0.1, + "grad_norm": 0.5466444492340088, + "learning_rate": 0.0005982144213242261, + "loss": 3.7642, + "step": 2127 + }, + { + "epoch": 0.1, + "grad_norm": 0.6006033420562744, + "learning_rate": 0.0005982127436345718, + "loss": 3.6812, + "step": 2128 + }, + { + "epoch": 0.1, + "grad_norm": 0.6072657704353333, + "learning_rate": 0.0005982110651594824, + "loss": 3.7972, + "step": 2129 + }, + { + "epoch": 0.1, + "grad_norm": 0.6421571373939514, + "learning_rate": 0.0005982093858989625, + "loss": 3.7497, + "step": 2130 + }, + { + "epoch": 0.1, + "grad_norm": 0.5948945879936218, + "learning_rate": 0.0005982077058530165, + "loss": 3.8954, + "step": 2131 + }, + { + "epoch": 0.1, + "grad_norm": 0.6586947441101074, + "learning_rate": 0.0005982060250216488, + "loss": 3.7188, + "step": 2132 + }, + { + "epoch": 0.1, + "grad_norm": 0.6014887094497681, + "learning_rate": 0.0005982043434048638, + "loss": 3.7739, + "step": 2133 + }, + { + "epoch": 0.1, + "grad_norm": 0.6018311977386475, + "learning_rate": 0.000598202661002666, + "loss": 3.6456, + "step": 2134 + }, + { + "epoch": 0.1, + "grad_norm": 0.6336399912834167, + "learning_rate": 0.0005982009778150596, + "loss": 3.6948, + "step": 2135 + }, + { + "epoch": 0.1, + "grad_norm": 0.586958646774292, + "learning_rate": 0.0005981992938420493, + "loss": 3.7109, + "step": 2136 + }, + { + "epoch": 0.1, + "grad_norm": 0.641817033290863, + "learning_rate": 0.0005981976090836396, + "loss": 3.9746, + "step": 2137 + }, + { + "epoch": 0.1, + "grad_norm": 0.5889933705329895, + "learning_rate": 0.0005981959235398347, + "loss": 3.5553, + "step": 2138 + }, + { + "epoch": 0.1, + "grad_norm": 0.5783056616783142, + "learning_rate": 0.0005981942372106391, + "loss": 3.7265, + "step": 2139 + }, + { + "epoch": 0.1, + "grad_norm": 0.6405118107795715, + "learning_rate": 0.0005981925500960574, + "loss": 3.8398, + "step": 2140 + }, + { + "epoch": 0.1, + "grad_norm": 0.585486114025116, + "learning_rate": 0.0005981908621960937, + "loss": 3.7428, + "step": 2141 + }, + { + "epoch": 0.1, + "grad_norm": 0.6322093605995178, + "learning_rate": 0.0005981891735107528, + "loss": 3.7375, + "step": 2142 + }, + { + "epoch": 0.11, + "grad_norm": 0.6491268873214722, + "learning_rate": 0.0005981874840400389, + "loss": 3.7564, + "step": 2143 + }, + { + "epoch": 0.11, + "grad_norm": 0.5832539796829224, + "learning_rate": 0.0005981857937839566, + "loss": 3.7131, + "step": 2144 + }, + { + "epoch": 0.11, + "grad_norm": 0.5929149389266968, + "learning_rate": 0.0005981841027425102, + "loss": 3.6175, + "step": 2145 + }, + { + "epoch": 0.11, + "grad_norm": 0.5840476155281067, + "learning_rate": 0.0005981824109157044, + "loss": 3.5056, + "step": 2146 + }, + { + "epoch": 0.11, + "grad_norm": 0.5910953879356384, + "learning_rate": 0.0005981807183035436, + "loss": 3.7863, + "step": 2147 + }, + { + "epoch": 0.11, + "grad_norm": 0.6023166179656982, + "learning_rate": 0.000598179024906032, + "loss": 3.4796, + "step": 2148 + }, + { + "epoch": 0.11, + "grad_norm": 0.6427032351493835, + "learning_rate": 0.0005981773307231743, + "loss": 3.5623, + "step": 2149 + }, + { + "epoch": 0.11, + "grad_norm": 0.5569535493850708, + "learning_rate": 0.0005981756357549749, + "loss": 3.7547, + "step": 2150 + }, + { + "epoch": 0.11, + "grad_norm": 0.6145084500312805, + "learning_rate": 0.0005981739400014383, + "loss": 3.8746, + "step": 2151 + }, + { + "epoch": 0.11, + "grad_norm": 0.6303276419639587, + "learning_rate": 0.0005981722434625689, + "loss": 3.8413, + "step": 2152 + }, + { + "epoch": 0.11, + "grad_norm": 0.6206766963005066, + "learning_rate": 0.0005981705461383712, + "loss": 3.7907, + "step": 2153 + }, + { + "epoch": 0.11, + "grad_norm": 0.6251280903816223, + "learning_rate": 0.0005981688480288496, + "loss": 3.8965, + "step": 2154 + }, + { + "epoch": 0.11, + "grad_norm": 0.6077283620834351, + "learning_rate": 0.0005981671491340087, + "loss": 3.6284, + "step": 2155 + }, + { + "epoch": 0.11, + "grad_norm": 0.6264916658401489, + "learning_rate": 0.0005981654494538528, + "loss": 3.7238, + "step": 2156 + }, + { + "epoch": 0.11, + "grad_norm": 0.5991299152374268, + "learning_rate": 0.0005981637489883866, + "loss": 3.7317, + "step": 2157 + }, + { + "epoch": 0.11, + "grad_norm": 0.6194969415664673, + "learning_rate": 0.0005981620477376144, + "loss": 3.7232, + "step": 2158 + }, + { + "epoch": 0.11, + "grad_norm": 0.5819993019104004, + "learning_rate": 0.0005981603457015409, + "loss": 3.7031, + "step": 2159 + }, + { + "epoch": 0.11, + "grad_norm": 0.6135178804397583, + "learning_rate": 0.0005981586428801703, + "loss": 3.7341, + "step": 2160 + }, + { + "epoch": 0.11, + "grad_norm": 0.5985665917396545, + "learning_rate": 0.0005981569392735072, + "loss": 3.774, + "step": 2161 + }, + { + "epoch": 0.11, + "grad_norm": 0.5931420922279358, + "learning_rate": 0.0005981552348815562, + "loss": 3.6447, + "step": 2162 + }, + { + "epoch": 0.11, + "grad_norm": 0.7057386636734009, + "learning_rate": 0.0005981535297043216, + "loss": 3.6028, + "step": 2163 + }, + { + "epoch": 0.11, + "grad_norm": 0.5932669043540955, + "learning_rate": 0.0005981518237418081, + "loss": 3.8519, + "step": 2164 + }, + { + "epoch": 0.11, + "grad_norm": 0.6052893996238708, + "learning_rate": 0.0005981501169940199, + "loss": 3.8123, + "step": 2165 + }, + { + "epoch": 0.11, + "grad_norm": 1.3449329137802124, + "learning_rate": 0.0005981484094609618, + "loss": 3.8314, + "step": 2166 + }, + { + "epoch": 0.11, + "grad_norm": 0.5868152976036072, + "learning_rate": 0.0005981467011426381, + "loss": 3.7069, + "step": 2167 + }, + { + "epoch": 0.11, + "grad_norm": 0.6063858866691589, + "learning_rate": 0.0005981449920390534, + "loss": 3.7754, + "step": 2168 + }, + { + "epoch": 0.11, + "grad_norm": 0.6096360087394714, + "learning_rate": 0.0005981432821502122, + "loss": 3.6768, + "step": 2169 + }, + { + "epoch": 0.11, + "grad_norm": 0.6433746814727783, + "learning_rate": 0.000598141571476119, + "loss": 3.781, + "step": 2170 + }, + { + "epoch": 0.11, + "grad_norm": 0.5999099016189575, + "learning_rate": 0.0005981398600167782, + "loss": 3.9962, + "step": 2171 + }, + { + "epoch": 0.11, + "grad_norm": 0.6435213685035706, + "learning_rate": 0.0005981381477721944, + "loss": 3.5761, + "step": 2172 + }, + { + "epoch": 0.11, + "grad_norm": 0.5516665577888489, + "learning_rate": 0.0005981364347423722, + "loss": 3.6282, + "step": 2173 + }, + { + "epoch": 0.11, + "grad_norm": 0.5798248052597046, + "learning_rate": 0.000598134720927316, + "loss": 3.54, + "step": 2174 + }, + { + "epoch": 0.11, + "grad_norm": 0.6035856604576111, + "learning_rate": 0.0005981330063270302, + "loss": 3.4801, + "step": 2175 + }, + { + "epoch": 0.11, + "grad_norm": 0.6492979526519775, + "learning_rate": 0.0005981312909415195, + "loss": 3.7736, + "step": 2176 + }, + { + "epoch": 0.11, + "grad_norm": 0.6277186870574951, + "learning_rate": 0.0005981295747707882, + "loss": 3.9117, + "step": 2177 + }, + { + "epoch": 0.11, + "grad_norm": 0.5910218954086304, + "learning_rate": 0.0005981278578148412, + "loss": 3.8235, + "step": 2178 + }, + { + "epoch": 0.11, + "grad_norm": 0.7807388305664062, + "learning_rate": 0.0005981261400736827, + "loss": 3.6833, + "step": 2179 + }, + { + "epoch": 0.11, + "grad_norm": 0.5943345427513123, + "learning_rate": 0.0005981244215473174, + "loss": 3.7929, + "step": 2180 + }, + { + "epoch": 0.11, + "grad_norm": 0.578851580619812, + "learning_rate": 0.0005981227022357497, + "loss": 3.7879, + "step": 2181 + }, + { + "epoch": 0.11, + "grad_norm": 0.6276848316192627, + "learning_rate": 0.0005981209821389841, + "loss": 3.4234, + "step": 2182 + }, + { + "epoch": 0.11, + "grad_norm": 0.6022424697875977, + "learning_rate": 0.0005981192612570253, + "loss": 3.7131, + "step": 2183 + }, + { + "epoch": 0.11, + "grad_norm": 0.669609546661377, + "learning_rate": 0.0005981175395898777, + "loss": 3.4337, + "step": 2184 + }, + { + "epoch": 0.11, + "grad_norm": 0.6255220174789429, + "learning_rate": 0.0005981158171375459, + "loss": 3.7007, + "step": 2185 + }, + { + "epoch": 0.11, + "grad_norm": 0.5757836699485779, + "learning_rate": 0.0005981140939000344, + "loss": 3.6742, + "step": 2186 + }, + { + "epoch": 0.11, + "grad_norm": 0.552218496799469, + "learning_rate": 0.0005981123698773478, + "loss": 3.7315, + "step": 2187 + }, + { + "epoch": 0.11, + "grad_norm": 0.5861135125160217, + "learning_rate": 0.0005981106450694904, + "loss": 3.842, + "step": 2188 + }, + { + "epoch": 0.11, + "grad_norm": 0.6071463823318481, + "learning_rate": 0.0005981089194764672, + "loss": 3.7262, + "step": 2189 + }, + { + "epoch": 0.11, + "grad_norm": 0.6482838988304138, + "learning_rate": 0.0005981071930982823, + "loss": 3.4363, + "step": 2190 + }, + { + "epoch": 0.11, + "grad_norm": 0.5629037618637085, + "learning_rate": 0.0005981054659349405, + "loss": 3.7168, + "step": 2191 + }, + { + "epoch": 0.11, + "grad_norm": 0.6590587496757507, + "learning_rate": 0.0005981037379864463, + "loss": 3.7393, + "step": 2192 + }, + { + "epoch": 0.11, + "grad_norm": 0.6444178819656372, + "learning_rate": 0.0005981020092528041, + "loss": 3.6663, + "step": 2193 + }, + { + "epoch": 0.11, + "grad_norm": 0.5685077905654907, + "learning_rate": 0.0005981002797340187, + "loss": 3.8345, + "step": 2194 + }, + { + "epoch": 0.11, + "grad_norm": 0.5644817352294922, + "learning_rate": 0.0005980985494300946, + "loss": 3.632, + "step": 2195 + }, + { + "epoch": 0.11, + "grad_norm": 0.6295031309127808, + "learning_rate": 0.000598096818341036, + "loss": 3.5742, + "step": 2196 + }, + { + "epoch": 0.11, + "grad_norm": 0.5638957023620605, + "learning_rate": 0.000598095086466848, + "loss": 3.7152, + "step": 2197 + }, + { + "epoch": 0.11, + "grad_norm": 0.5969396829605103, + "learning_rate": 0.0005980933538075349, + "loss": 3.5886, + "step": 2198 + }, + { + "epoch": 0.11, + "grad_norm": 0.5802797675132751, + "learning_rate": 0.0005980916203631011, + "loss": 3.6731, + "step": 2199 + }, + { + "epoch": 0.11, + "grad_norm": 0.5977463722229004, + "learning_rate": 0.0005980898861335515, + "loss": 3.6158, + "step": 2200 + }, + { + "epoch": 0.11, + "grad_norm": 0.5731961727142334, + "learning_rate": 0.0005980881511188904, + "loss": 3.8086, + "step": 2201 + }, + { + "epoch": 0.11, + "grad_norm": 0.5767385363578796, + "learning_rate": 0.0005980864153191226, + "loss": 3.5526, + "step": 2202 + }, + { + "epoch": 0.11, + "grad_norm": 0.6023370623588562, + "learning_rate": 0.0005980846787342524, + "loss": 3.4739, + "step": 2203 + }, + { + "epoch": 0.11, + "grad_norm": 0.5843697786331177, + "learning_rate": 0.0005980829413642847, + "loss": 3.6017, + "step": 2204 + }, + { + "epoch": 0.11, + "grad_norm": 0.5621604323387146, + "learning_rate": 0.0005980812032092238, + "loss": 3.8069, + "step": 2205 + }, + { + "epoch": 0.11, + "grad_norm": 0.6262988448143005, + "learning_rate": 0.0005980794642690744, + "loss": 3.8661, + "step": 2206 + }, + { + "epoch": 0.11, + "grad_norm": 0.594237744808197, + "learning_rate": 0.0005980777245438411, + "loss": 3.6751, + "step": 2207 + }, + { + "epoch": 0.11, + "grad_norm": 0.5916548371315002, + "learning_rate": 0.0005980759840335284, + "loss": 3.7278, + "step": 2208 + }, + { + "epoch": 0.11, + "grad_norm": 0.5857176184654236, + "learning_rate": 0.0005980742427381409, + "loss": 3.646, + "step": 2209 + }, + { + "epoch": 0.11, + "grad_norm": 0.6896742582321167, + "learning_rate": 0.0005980725006576832, + "loss": 3.5892, + "step": 2210 + }, + { + "epoch": 0.11, + "grad_norm": 0.678264856338501, + "learning_rate": 0.0005980707577921599, + "loss": 3.8561, + "step": 2211 + }, + { + "epoch": 0.11, + "grad_norm": 0.5566852688789368, + "learning_rate": 0.0005980690141415756, + "loss": 3.6284, + "step": 2212 + }, + { + "epoch": 0.11, + "grad_norm": 0.5891258716583252, + "learning_rate": 0.000598067269705935, + "loss": 3.4689, + "step": 2213 + }, + { + "epoch": 0.11, + "grad_norm": 0.6085823178291321, + "learning_rate": 0.0005980655244852424, + "loss": 3.8165, + "step": 2214 + }, + { + "epoch": 0.11, + "grad_norm": 0.6259714365005493, + "learning_rate": 0.0005980637784795027, + "loss": 3.454, + "step": 2215 + }, + { + "epoch": 0.11, + "grad_norm": 0.6584763526916504, + "learning_rate": 0.0005980620316887203, + "loss": 3.6297, + "step": 2216 + }, + { + "epoch": 0.11, + "grad_norm": 0.5733041167259216, + "learning_rate": 0.0005980602841128998, + "loss": 3.8796, + "step": 2217 + }, + { + "epoch": 0.11, + "grad_norm": 0.6462345123291016, + "learning_rate": 0.000598058535752046, + "loss": 3.6018, + "step": 2218 + }, + { + "epoch": 0.11, + "grad_norm": 0.6512056589126587, + "learning_rate": 0.0005980567866061634, + "loss": 3.7185, + "step": 2219 + }, + { + "epoch": 0.11, + "grad_norm": 0.6104543209075928, + "learning_rate": 0.0005980550366752565, + "loss": 3.313, + "step": 2220 + }, + { + "epoch": 0.11, + "grad_norm": 0.5999908447265625, + "learning_rate": 0.00059805328595933, + "loss": 3.7027, + "step": 2221 + }, + { + "epoch": 0.11, + "grad_norm": 0.5871838331222534, + "learning_rate": 0.0005980515344583886, + "loss": 3.8634, + "step": 2222 + }, + { + "epoch": 0.11, + "grad_norm": 0.5965851545333862, + "learning_rate": 0.0005980497821724366, + "loss": 3.6761, + "step": 2223 + }, + { + "epoch": 0.11, + "grad_norm": 0.6039659380912781, + "learning_rate": 0.000598048029101479, + "loss": 3.769, + "step": 2224 + }, + { + "epoch": 0.11, + "grad_norm": 0.6120340824127197, + "learning_rate": 0.0005980462752455204, + "loss": 3.5583, + "step": 2225 + }, + { + "epoch": 0.11, + "grad_norm": 0.605404257774353, + "learning_rate": 0.0005980445206045649, + "loss": 3.8279, + "step": 2226 + }, + { + "epoch": 0.11, + "grad_norm": 0.5787206888198853, + "learning_rate": 0.0005980427651786179, + "loss": 3.8282, + "step": 2227 + }, + { + "epoch": 0.11, + "grad_norm": 0.5788346529006958, + "learning_rate": 0.0005980410089676833, + "loss": 3.9292, + "step": 2228 + }, + { + "epoch": 0.11, + "grad_norm": 0.6174167394638062, + "learning_rate": 0.0005980392519717661, + "loss": 3.3578, + "step": 2229 + }, + { + "epoch": 0.11, + "grad_norm": 0.5495995283126831, + "learning_rate": 0.000598037494190871, + "loss": 3.7939, + "step": 2230 + }, + { + "epoch": 0.11, + "grad_norm": 0.6039677262306213, + "learning_rate": 0.0005980357356250023, + "loss": 3.7216, + "step": 2231 + }, + { + "epoch": 0.11, + "grad_norm": 0.598181962966919, + "learning_rate": 0.0005980339762741651, + "loss": 3.7851, + "step": 2232 + }, + { + "epoch": 0.11, + "grad_norm": 0.5918916463851929, + "learning_rate": 0.0005980322161383636, + "loss": 3.6826, + "step": 2233 + }, + { + "epoch": 0.11, + "grad_norm": 0.5864900350570679, + "learning_rate": 0.0005980304552176026, + "loss": 3.8629, + "step": 2234 + }, + { + "epoch": 0.11, + "grad_norm": 0.5640453696250916, + "learning_rate": 0.0005980286935118868, + "loss": 3.4874, + "step": 2235 + }, + { + "epoch": 0.11, + "grad_norm": 0.6437229514122009, + "learning_rate": 0.0005980269310212207, + "loss": 3.7286, + "step": 2236 + }, + { + "epoch": 0.11, + "grad_norm": 0.5866195559501648, + "learning_rate": 0.000598025167745609, + "loss": 3.6045, + "step": 2237 + }, + { + "epoch": 0.11, + "grad_norm": 0.6194109320640564, + "learning_rate": 0.0005980234036850565, + "loss": 3.7216, + "step": 2238 + }, + { + "epoch": 0.11, + "grad_norm": 0.6149755120277405, + "learning_rate": 0.0005980216388395676, + "loss": 3.7214, + "step": 2239 + }, + { + "epoch": 0.11, + "grad_norm": 0.5646103024482727, + "learning_rate": 0.0005980198732091471, + "loss": 3.6628, + "step": 2240 + }, + { + "epoch": 0.11, + "grad_norm": 0.6370061635971069, + "learning_rate": 0.0005980181067937996, + "loss": 3.7355, + "step": 2241 + }, + { + "epoch": 0.11, + "grad_norm": 0.6375163793563843, + "learning_rate": 0.0005980163395935297, + "loss": 3.6579, + "step": 2242 + }, + { + "epoch": 0.11, + "grad_norm": 0.600969135761261, + "learning_rate": 0.0005980145716083423, + "loss": 3.8107, + "step": 2243 + }, + { + "epoch": 0.11, + "grad_norm": 0.5838377475738525, + "learning_rate": 0.0005980128028382416, + "loss": 3.8588, + "step": 2244 + }, + { + "epoch": 0.11, + "grad_norm": 0.5946401357650757, + "learning_rate": 0.0005980110332832328, + "loss": 3.7246, + "step": 2245 + }, + { + "epoch": 0.11, + "grad_norm": 0.6003056764602661, + "learning_rate": 0.0005980092629433202, + "loss": 3.4673, + "step": 2246 + }, + { + "epoch": 0.11, + "grad_norm": 0.6012457609176636, + "learning_rate": 0.0005980074918185083, + "loss": 3.7586, + "step": 2247 + }, + { + "epoch": 0.11, + "grad_norm": 0.5687946677207947, + "learning_rate": 0.0005980057199088024, + "loss": 3.4091, + "step": 2248 + }, + { + "epoch": 0.11, + "grad_norm": 0.6248691082000732, + "learning_rate": 0.0005980039472142065, + "loss": 3.7712, + "step": 2249 + }, + { + "epoch": 0.11, + "grad_norm": 0.5778698325157166, + "learning_rate": 0.0005980021737347258, + "loss": 3.629, + "step": 2250 + }, + { + "epoch": 0.11, + "grad_norm": 1.24909245967865, + "learning_rate": 0.0005980003994703647, + "loss": 4.0102, + "step": 2251 + }, + { + "epoch": 0.11, + "grad_norm": 0.6096078157424927, + "learning_rate": 0.0005979986244211277, + "loss": 3.7681, + "step": 2252 + }, + { + "epoch": 0.11, + "grad_norm": 0.6022841334342957, + "learning_rate": 0.0005979968485870198, + "loss": 3.6394, + "step": 2253 + }, + { + "epoch": 0.11, + "grad_norm": 0.6294955015182495, + "learning_rate": 0.0005979950719680455, + "loss": 3.8566, + "step": 2254 + }, + { + "epoch": 0.11, + "grad_norm": 0.641569972038269, + "learning_rate": 0.0005979932945642096, + "loss": 3.5181, + "step": 2255 + }, + { + "epoch": 0.11, + "grad_norm": 0.5697000622749329, + "learning_rate": 0.0005979915163755168, + "loss": 3.6019, + "step": 2256 + }, + { + "epoch": 0.11, + "grad_norm": 0.6431049704551697, + "learning_rate": 0.0005979897374019715, + "loss": 3.6606, + "step": 2257 + }, + { + "epoch": 0.11, + "grad_norm": 0.6189910769462585, + "learning_rate": 0.0005979879576435786, + "loss": 3.6255, + "step": 2258 + }, + { + "epoch": 0.11, + "grad_norm": 0.6022100448608398, + "learning_rate": 0.0005979861771003429, + "loss": 3.7233, + "step": 2259 + }, + { + "epoch": 0.11, + "grad_norm": 0.6038528084754944, + "learning_rate": 0.0005979843957722688, + "loss": 3.8155, + "step": 2260 + }, + { + "epoch": 0.11, + "grad_norm": 0.5769616961479187, + "learning_rate": 0.0005979826136593612, + "loss": 3.4622, + "step": 2261 + }, + { + "epoch": 0.11, + "grad_norm": 0.5804604291915894, + "learning_rate": 0.0005979808307616248, + "loss": 3.7446, + "step": 2262 + }, + { + "epoch": 0.11, + "grad_norm": 0.5664138793945312, + "learning_rate": 0.0005979790470790642, + "loss": 3.9118, + "step": 2263 + }, + { + "epoch": 0.11, + "grad_norm": 0.6211099028587341, + "learning_rate": 0.0005979772626116841, + "loss": 3.7406, + "step": 2264 + }, + { + "epoch": 0.11, + "grad_norm": 0.624521791934967, + "learning_rate": 0.0005979754773594894, + "loss": 3.7997, + "step": 2265 + }, + { + "epoch": 0.11, + "grad_norm": 0.5502910614013672, + "learning_rate": 0.0005979736913224844, + "loss": 3.7596, + "step": 2266 + }, + { + "epoch": 0.11, + "grad_norm": 0.5801634788513184, + "learning_rate": 0.0005979719045006743, + "loss": 3.5966, + "step": 2267 + }, + { + "epoch": 0.11, + "grad_norm": 0.6068748831748962, + "learning_rate": 0.0005979701168940633, + "loss": 3.5563, + "step": 2268 + }, + { + "epoch": 0.11, + "grad_norm": 0.6216042041778564, + "learning_rate": 0.0005979683285026565, + "loss": 3.4848, + "step": 2269 + }, + { + "epoch": 0.11, + "grad_norm": 0.5503404140472412, + "learning_rate": 0.0005979665393264585, + "loss": 3.7462, + "step": 2270 + }, + { + "epoch": 0.11, + "grad_norm": 0.6132727265357971, + "learning_rate": 0.000597964749365474, + "loss": 3.8308, + "step": 2271 + }, + { + "epoch": 0.11, + "grad_norm": 0.6045131087303162, + "learning_rate": 0.0005979629586197075, + "loss": 3.877, + "step": 2272 + }, + { + "epoch": 0.11, + "grad_norm": 0.6383094191551208, + "learning_rate": 0.0005979611670891641, + "loss": 3.8716, + "step": 2273 + }, + { + "epoch": 0.11, + "grad_norm": 0.6388508677482605, + "learning_rate": 0.0005979593747738483, + "loss": 3.7067, + "step": 2274 + }, + { + "epoch": 0.11, + "grad_norm": 0.630363941192627, + "learning_rate": 0.0005979575816737648, + "loss": 3.6567, + "step": 2275 + }, + { + "epoch": 0.11, + "grad_norm": 0.6004341244697571, + "learning_rate": 0.0005979557877889184, + "loss": 3.6867, + "step": 2276 + }, + { + "epoch": 0.11, + "grad_norm": 0.5606918931007385, + "learning_rate": 0.0005979539931193137, + "loss": 3.6417, + "step": 2277 + }, + { + "epoch": 0.11, + "grad_norm": 0.5711063146591187, + "learning_rate": 0.0005979521976649556, + "loss": 3.607, + "step": 2278 + }, + { + "epoch": 0.11, + "grad_norm": 0.6434407830238342, + "learning_rate": 0.0005979504014258488, + "loss": 3.7508, + "step": 2279 + }, + { + "epoch": 0.11, + "grad_norm": 0.6066713929176331, + "learning_rate": 0.0005979486044019979, + "loss": 3.6702, + "step": 2280 + }, + { + "epoch": 0.11, + "grad_norm": 0.566117525100708, + "learning_rate": 0.0005979468065934078, + "loss": 3.7053, + "step": 2281 + }, + { + "epoch": 0.11, + "grad_norm": 0.6023035645484924, + "learning_rate": 0.000597945008000083, + "loss": 3.7924, + "step": 2282 + }, + { + "epoch": 0.11, + "grad_norm": 0.649734377861023, + "learning_rate": 0.0005979432086220287, + "loss": 3.6891, + "step": 2283 + }, + { + "epoch": 0.11, + "grad_norm": 0.6085857152938843, + "learning_rate": 0.000597941408459249, + "loss": 3.6721, + "step": 2284 + }, + { + "epoch": 0.11, + "grad_norm": 0.594468355178833, + "learning_rate": 0.0005979396075117492, + "loss": 3.7176, + "step": 2285 + }, + { + "epoch": 0.11, + "grad_norm": 0.5784687995910645, + "learning_rate": 0.0005979378057795337, + "loss": 3.6966, + "step": 2286 + }, + { + "epoch": 0.11, + "grad_norm": 0.586226224899292, + "learning_rate": 0.0005979360032626073, + "loss": 3.8213, + "step": 2287 + }, + { + "epoch": 0.11, + "grad_norm": 0.5555378198623657, + "learning_rate": 0.0005979341999609749, + "loss": 3.7659, + "step": 2288 + }, + { + "epoch": 0.11, + "grad_norm": 0.6186184287071228, + "learning_rate": 0.0005979323958746411, + "loss": 3.7084, + "step": 2289 + }, + { + "epoch": 0.11, + "grad_norm": 0.5890335440635681, + "learning_rate": 0.0005979305910036108, + "loss": 3.6129, + "step": 2290 + }, + { + "epoch": 0.11, + "grad_norm": 0.586846113204956, + "learning_rate": 0.0005979287853478886, + "loss": 3.7499, + "step": 2291 + }, + { + "epoch": 0.11, + "grad_norm": 0.6445232033729553, + "learning_rate": 0.0005979269789074793, + "loss": 3.5936, + "step": 2292 + }, + { + "epoch": 0.11, + "grad_norm": 0.5615684390068054, + "learning_rate": 0.0005979251716823877, + "loss": 3.8463, + "step": 2293 + }, + { + "epoch": 0.11, + "grad_norm": 0.6355283856391907, + "learning_rate": 0.0005979233636726186, + "loss": 3.5838, + "step": 2294 + }, + { + "epoch": 0.11, + "grad_norm": 0.5905806422233582, + "learning_rate": 0.0005979215548781766, + "loss": 3.5728, + "step": 2295 + }, + { + "epoch": 0.11, + "grad_norm": 0.6034395694732666, + "learning_rate": 0.0005979197452990665, + "loss": 3.3597, + "step": 2296 + }, + { + "epoch": 0.11, + "grad_norm": 0.5971759557723999, + "learning_rate": 0.0005979179349352932, + "loss": 3.7747, + "step": 2297 + }, + { + "epoch": 0.11, + "grad_norm": 0.5553016662597656, + "learning_rate": 0.0005979161237868615, + "loss": 3.7177, + "step": 2298 + }, + { + "epoch": 0.11, + "grad_norm": 0.6574896574020386, + "learning_rate": 0.000597914311853776, + "loss": 3.8484, + "step": 2299 + }, + { + "epoch": 0.11, + "grad_norm": 0.5912606120109558, + "learning_rate": 0.0005979124991360414, + "loss": 3.713, + "step": 2300 + }, + { + "epoch": 0.11, + "grad_norm": 0.5491510629653931, + "learning_rate": 0.0005979106856336628, + "loss": 3.7381, + "step": 2301 + }, + { + "epoch": 0.11, + "grad_norm": 0.5802510976791382, + "learning_rate": 0.0005979088713466447, + "loss": 3.7492, + "step": 2302 + }, + { + "epoch": 0.11, + "grad_norm": 0.5774723291397095, + "learning_rate": 0.000597907056274992, + "loss": 3.7597, + "step": 2303 + }, + { + "epoch": 0.11, + "grad_norm": 0.6084253191947937, + "learning_rate": 0.0005979052404187094, + "loss": 3.7054, + "step": 2304 + }, + { + "epoch": 0.11, + "grad_norm": 0.5598316788673401, + "learning_rate": 0.0005979034237778018, + "loss": 3.7911, + "step": 2305 + }, + { + "epoch": 0.11, + "grad_norm": 0.6073153614997864, + "learning_rate": 0.0005979016063522738, + "loss": 3.6144, + "step": 2306 + }, + { + "epoch": 0.11, + "grad_norm": 0.7126467227935791, + "learning_rate": 0.0005978997881421304, + "loss": 3.7347, + "step": 2307 + }, + { + "epoch": 0.11, + "grad_norm": 0.6775062084197998, + "learning_rate": 0.0005978979691473763, + "loss": 3.632, + "step": 2308 + }, + { + "epoch": 0.11, + "grad_norm": 0.6118178367614746, + "learning_rate": 0.0005978961493680162, + "loss": 3.5525, + "step": 2309 + }, + { + "epoch": 0.11, + "grad_norm": 0.6124061346054077, + "learning_rate": 0.000597894328804055, + "loss": 3.6992, + "step": 2310 + }, + { + "epoch": 0.11, + "grad_norm": 0.6024648547172546, + "learning_rate": 0.0005978925074554975, + "loss": 3.8113, + "step": 2311 + }, + { + "epoch": 0.11, + "grad_norm": 0.6617346405982971, + "learning_rate": 0.0005978906853223485, + "loss": 3.7497, + "step": 2312 + }, + { + "epoch": 0.11, + "grad_norm": 0.5842245221138, + "learning_rate": 0.0005978888624046127, + "loss": 3.5113, + "step": 2313 + }, + { + "epoch": 0.11, + "grad_norm": 0.6647869348526001, + "learning_rate": 0.0005978870387022949, + "loss": 3.7863, + "step": 2314 + }, + { + "epoch": 0.11, + "grad_norm": 0.6231033205986023, + "learning_rate": 0.0005978852142154001, + "loss": 3.8737, + "step": 2315 + }, + { + "epoch": 0.11, + "grad_norm": 0.5999862551689148, + "learning_rate": 0.000597883388943933, + "loss": 3.774, + "step": 2316 + }, + { + "epoch": 0.11, + "grad_norm": 0.6005386114120483, + "learning_rate": 0.0005978815628878982, + "loss": 3.832, + "step": 2317 + }, + { + "epoch": 0.11, + "grad_norm": 0.6149207353591919, + "learning_rate": 0.0005978797360473009, + "loss": 3.7553, + "step": 2318 + }, + { + "epoch": 0.11, + "grad_norm": 0.5701582431793213, + "learning_rate": 0.0005978779084221456, + "loss": 3.6775, + "step": 2319 + }, + { + "epoch": 0.11, + "grad_norm": 0.5574246644973755, + "learning_rate": 0.0005978760800124372, + "loss": 3.6211, + "step": 2320 + }, + { + "epoch": 0.11, + "grad_norm": 0.6085734963417053, + "learning_rate": 0.0005978742508181805, + "loss": 3.6918, + "step": 2321 + }, + { + "epoch": 0.11, + "grad_norm": 0.6652593612670898, + "learning_rate": 0.0005978724208393804, + "loss": 3.8293, + "step": 2322 + }, + { + "epoch": 0.11, + "grad_norm": 0.5725169777870178, + "learning_rate": 0.0005978705900760418, + "loss": 3.7319, + "step": 2323 + }, + { + "epoch": 0.11, + "grad_norm": 0.6140974164009094, + "learning_rate": 0.0005978687585281692, + "loss": 3.7711, + "step": 2324 + }, + { + "epoch": 0.11, + "grad_norm": 0.5903205871582031, + "learning_rate": 0.0005978669261957676, + "loss": 3.7894, + "step": 2325 + }, + { + "epoch": 0.11, + "grad_norm": 0.553845226764679, + "learning_rate": 0.000597865093078842, + "loss": 3.6955, + "step": 2326 + }, + { + "epoch": 0.11, + "grad_norm": 0.5786354541778564, + "learning_rate": 0.0005978632591773969, + "loss": 3.8862, + "step": 2327 + }, + { + "epoch": 0.11, + "grad_norm": 0.5850010514259338, + "learning_rate": 0.0005978614244914375, + "loss": 3.5009, + "step": 2328 + }, + { + "epoch": 0.11, + "grad_norm": 0.5797577500343323, + "learning_rate": 0.0005978595890209683, + "loss": 3.7438, + "step": 2329 + }, + { + "epoch": 0.11, + "grad_norm": 0.5699589848518372, + "learning_rate": 0.0005978577527659943, + "loss": 3.7208, + "step": 2330 + }, + { + "epoch": 0.11, + "grad_norm": 0.6231520771980286, + "learning_rate": 0.0005978559157265203, + "loss": 3.7847, + "step": 2331 + }, + { + "epoch": 0.11, + "grad_norm": 0.5472413897514343, + "learning_rate": 0.0005978540779025511, + "loss": 3.8139, + "step": 2332 + }, + { + "epoch": 0.11, + "grad_norm": 0.6323192119598389, + "learning_rate": 0.0005978522392940917, + "loss": 3.6501, + "step": 2333 + }, + { + "epoch": 0.11, + "grad_norm": 0.8912398219108582, + "learning_rate": 0.0005978503999011467, + "loss": 3.9693, + "step": 2334 + }, + { + "epoch": 0.11, + "grad_norm": 0.5911180377006531, + "learning_rate": 0.000597848559723721, + "loss": 3.5229, + "step": 2335 + }, + { + "epoch": 0.11, + "grad_norm": 0.5412778258323669, + "learning_rate": 0.0005978467187618198, + "loss": 3.7783, + "step": 2336 + }, + { + "epoch": 0.11, + "grad_norm": 0.5602182745933533, + "learning_rate": 0.0005978448770154474, + "loss": 3.6555, + "step": 2337 + }, + { + "epoch": 0.11, + "grad_norm": 0.6185539364814758, + "learning_rate": 0.000597843034484609, + "loss": 3.5993, + "step": 2338 + }, + { + "epoch": 0.11, + "grad_norm": 0.6103714108467102, + "learning_rate": 0.0005978411911693094, + "loss": 3.7237, + "step": 2339 + }, + { + "epoch": 0.11, + "grad_norm": 0.6211607456207275, + "learning_rate": 0.0005978393470695534, + "loss": 3.5363, + "step": 2340 + }, + { + "epoch": 0.11, + "grad_norm": 0.5573241710662842, + "learning_rate": 0.0005978375021853459, + "loss": 3.9038, + "step": 2341 + }, + { + "epoch": 0.11, + "grad_norm": 0.6002277731895447, + "learning_rate": 0.0005978356565166917, + "loss": 3.7787, + "step": 2342 + }, + { + "epoch": 0.11, + "grad_norm": 0.6228034496307373, + "learning_rate": 0.0005978338100635958, + "loss": 3.5716, + "step": 2343 + }, + { + "epoch": 0.11, + "grad_norm": 0.5964246988296509, + "learning_rate": 0.0005978319628260629, + "loss": 3.5264, + "step": 2344 + }, + { + "epoch": 0.11, + "grad_norm": 0.6132970452308655, + "learning_rate": 0.0005978301148040978, + "loss": 3.6864, + "step": 2345 + }, + { + "epoch": 0.11, + "grad_norm": 0.6156790852546692, + "learning_rate": 0.0005978282659977058, + "loss": 3.5236, + "step": 2346 + }, + { + "epoch": 0.12, + "grad_norm": 0.5750837326049805, + "learning_rate": 0.0005978264164068912, + "loss": 3.7859, + "step": 2347 + }, + { + "epoch": 0.12, + "grad_norm": 0.6026135087013245, + "learning_rate": 0.0005978245660316592, + "loss": 3.6397, + "step": 2348 + }, + { + "epoch": 0.12, + "grad_norm": 0.5814940929412842, + "learning_rate": 0.0005978227148720146, + "loss": 3.8765, + "step": 2349 + }, + { + "epoch": 0.12, + "grad_norm": 0.6358333826065063, + "learning_rate": 0.0005978208629279623, + "loss": 3.6133, + "step": 2350 + }, + { + "epoch": 0.12, + "grad_norm": 0.6395803689956665, + "learning_rate": 0.0005978190101995071, + "loss": 3.7048, + "step": 2351 + }, + { + "epoch": 0.12, + "grad_norm": 0.5972474813461304, + "learning_rate": 0.000597817156686654, + "loss": 3.7461, + "step": 2352 + }, + { + "epoch": 0.12, + "grad_norm": 0.5645350217819214, + "learning_rate": 0.0005978153023894079, + "loss": 3.6762, + "step": 2353 + }, + { + "epoch": 0.12, + "grad_norm": 0.6386092305183411, + "learning_rate": 0.0005978134473077736, + "loss": 3.5883, + "step": 2354 + }, + { + "epoch": 0.12, + "grad_norm": 0.578127384185791, + "learning_rate": 0.0005978115914417559, + "loss": 3.7026, + "step": 2355 + }, + { + "epoch": 0.12, + "grad_norm": 0.5714936256408691, + "learning_rate": 0.0005978097347913598, + "loss": 3.6753, + "step": 2356 + }, + { + "epoch": 0.12, + "grad_norm": 0.6167704463005066, + "learning_rate": 0.0005978078773565903, + "loss": 3.7017, + "step": 2357 + }, + { + "epoch": 0.12, + "grad_norm": 0.6012189984321594, + "learning_rate": 0.000597806019137452, + "loss": 3.6736, + "step": 2358 + }, + { + "epoch": 0.12, + "grad_norm": 0.6270298361778259, + "learning_rate": 0.00059780416013395, + "loss": 3.5279, + "step": 2359 + }, + { + "epoch": 0.12, + "grad_norm": 0.6216274499893188, + "learning_rate": 0.0005978023003460893, + "loss": 3.4673, + "step": 2360 + }, + { + "epoch": 0.12, + "grad_norm": 0.6030069589614868, + "learning_rate": 0.0005978004397738744, + "loss": 3.6831, + "step": 2361 + }, + { + "epoch": 0.12, + "grad_norm": 0.5585080981254578, + "learning_rate": 0.0005977985784173107, + "loss": 3.7218, + "step": 2362 + }, + { + "epoch": 0.12, + "grad_norm": 0.6055890321731567, + "learning_rate": 0.0005977967162764027, + "loss": 3.8694, + "step": 2363 + }, + { + "epoch": 0.12, + "grad_norm": 0.58843994140625, + "learning_rate": 0.0005977948533511555, + "loss": 3.5457, + "step": 2364 + }, + { + "epoch": 0.12, + "grad_norm": 0.6120928525924683, + "learning_rate": 0.0005977929896415741, + "loss": 3.7752, + "step": 2365 + }, + { + "epoch": 0.12, + "grad_norm": 0.6189221739768982, + "learning_rate": 0.000597791125147663, + "loss": 3.6428, + "step": 2366 + }, + { + "epoch": 0.12, + "grad_norm": 0.6048324108123779, + "learning_rate": 0.0005977892598694276, + "loss": 3.5921, + "step": 2367 + }, + { + "epoch": 0.12, + "grad_norm": 0.5761814117431641, + "learning_rate": 0.0005977873938068725, + "loss": 3.5946, + "step": 2368 + }, + { + "epoch": 0.12, + "grad_norm": 0.6603354811668396, + "learning_rate": 0.0005977855269600027, + "loss": 3.6483, + "step": 2369 + }, + { + "epoch": 0.12, + "grad_norm": 0.5804461240768433, + "learning_rate": 0.0005977836593288233, + "loss": 3.5695, + "step": 2370 + }, + { + "epoch": 0.12, + "grad_norm": 0.5955277681350708, + "learning_rate": 0.0005977817909133389, + "loss": 3.6131, + "step": 2371 + }, + { + "epoch": 0.12, + "grad_norm": 0.6651073098182678, + "learning_rate": 0.0005977799217135547, + "loss": 3.7153, + "step": 2372 + }, + { + "epoch": 0.12, + "grad_norm": 0.5861976146697998, + "learning_rate": 0.0005977780517294754, + "loss": 3.7365, + "step": 2373 + }, + { + "epoch": 0.12, + "grad_norm": 0.5709624290466309, + "learning_rate": 0.000597776180961106, + "loss": 3.6133, + "step": 2374 + }, + { + "epoch": 0.12, + "grad_norm": 0.6309444904327393, + "learning_rate": 0.0005977743094084514, + "loss": 3.681, + "step": 2375 + }, + { + "epoch": 0.12, + "grad_norm": 0.5742753148078918, + "learning_rate": 0.0005977724370715167, + "loss": 3.7932, + "step": 2376 + }, + { + "epoch": 0.12, + "grad_norm": 0.5696350336074829, + "learning_rate": 0.0005977705639503067, + "loss": 3.4466, + "step": 2377 + }, + { + "epoch": 0.12, + "grad_norm": 0.6605788469314575, + "learning_rate": 0.0005977686900448262, + "loss": 3.7482, + "step": 2378 + }, + { + "epoch": 0.12, + "grad_norm": 0.5416749715805054, + "learning_rate": 0.0005977668153550804, + "loss": 3.7162, + "step": 2379 + }, + { + "epoch": 0.12, + "grad_norm": 0.6018760204315186, + "learning_rate": 0.0005977649398810741, + "loss": 3.5618, + "step": 2380 + }, + { + "epoch": 0.12, + "grad_norm": 0.6329287886619568, + "learning_rate": 0.0005977630636228123, + "loss": 3.4429, + "step": 2381 + }, + { + "epoch": 0.12, + "grad_norm": 0.6521981358528137, + "learning_rate": 0.0005977611865802999, + "loss": 3.7848, + "step": 2382 + }, + { + "epoch": 0.12, + "grad_norm": 0.585591733455658, + "learning_rate": 0.0005977593087535417, + "loss": 3.6851, + "step": 2383 + }, + { + "epoch": 0.12, + "grad_norm": 0.5864081382751465, + "learning_rate": 0.0005977574301425429, + "loss": 3.6888, + "step": 2384 + }, + { + "epoch": 0.12, + "grad_norm": 0.6254026889801025, + "learning_rate": 0.0005977555507473083, + "loss": 3.7411, + "step": 2385 + }, + { + "epoch": 0.12, + "grad_norm": 0.6233348250389099, + "learning_rate": 0.0005977536705678429, + "loss": 3.9431, + "step": 2386 + }, + { + "epoch": 0.12, + "grad_norm": 0.583896279335022, + "learning_rate": 0.0005977517896041516, + "loss": 3.7437, + "step": 2387 + }, + { + "epoch": 0.12, + "grad_norm": 0.5994085073471069, + "learning_rate": 0.0005977499078562394, + "loss": 3.7577, + "step": 2388 + }, + { + "epoch": 0.12, + "grad_norm": 0.6116136908531189, + "learning_rate": 0.0005977480253241112, + "loss": 3.527, + "step": 2389 + }, + { + "epoch": 0.12, + "grad_norm": 0.6351898908615112, + "learning_rate": 0.0005977461420077721, + "loss": 3.627, + "step": 2390 + }, + { + "epoch": 0.12, + "grad_norm": 0.6024461984634399, + "learning_rate": 0.0005977442579072269, + "loss": 3.7272, + "step": 2391 + }, + { + "epoch": 0.12, + "grad_norm": 0.608726978302002, + "learning_rate": 0.0005977423730224807, + "loss": 3.7063, + "step": 2392 + }, + { + "epoch": 0.12, + "grad_norm": 0.7524157166481018, + "learning_rate": 0.0005977404873535383, + "loss": 3.5555, + "step": 2393 + }, + { + "epoch": 0.12, + "grad_norm": 0.6578160524368286, + "learning_rate": 0.0005977386009004048, + "loss": 3.7474, + "step": 2394 + }, + { + "epoch": 0.12, + "grad_norm": 0.6224360466003418, + "learning_rate": 0.0005977367136630852, + "loss": 3.6999, + "step": 2395 + }, + { + "epoch": 0.12, + "grad_norm": 0.6000403165817261, + "learning_rate": 0.0005977348256415843, + "loss": 3.6645, + "step": 2396 + }, + { + "epoch": 0.12, + "grad_norm": 0.5816790461540222, + "learning_rate": 0.0005977329368359072, + "loss": 3.4327, + "step": 2397 + }, + { + "epoch": 0.12, + "grad_norm": 0.6410402059555054, + "learning_rate": 0.0005977310472460588, + "loss": 3.4193, + "step": 2398 + }, + { + "epoch": 0.12, + "grad_norm": 0.607941210269928, + "learning_rate": 0.0005977291568720442, + "loss": 3.5642, + "step": 2399 + }, + { + "epoch": 0.12, + "grad_norm": 0.6291766166687012, + "learning_rate": 0.0005977272657138683, + "loss": 3.5734, + "step": 2400 + }, + { + "epoch": 0.12, + "grad_norm": 0.6615599989891052, + "learning_rate": 0.000597725373771536, + "loss": 3.7853, + "step": 2401 + }, + { + "epoch": 0.12, + "grad_norm": 0.5869166254997253, + "learning_rate": 0.0005977234810450524, + "loss": 3.6767, + "step": 2402 + }, + { + "epoch": 0.12, + "grad_norm": 0.647530198097229, + "learning_rate": 0.0005977215875344224, + "loss": 3.3796, + "step": 2403 + }, + { + "epoch": 0.12, + "grad_norm": 0.5894125699996948, + "learning_rate": 0.000597719693239651, + "loss": 3.3358, + "step": 2404 + }, + { + "epoch": 0.12, + "grad_norm": 0.5925880074501038, + "learning_rate": 0.0005977177981607434, + "loss": 3.5175, + "step": 2405 + }, + { + "epoch": 0.12, + "grad_norm": 0.6181460022926331, + "learning_rate": 0.0005977159022977043, + "loss": 3.7263, + "step": 2406 + }, + { + "epoch": 0.12, + "grad_norm": 0.5811930298805237, + "learning_rate": 0.0005977140056505389, + "loss": 3.8623, + "step": 2407 + }, + { + "epoch": 0.12, + "grad_norm": 0.6250795125961304, + "learning_rate": 0.000597712108219252, + "loss": 3.7118, + "step": 2408 + }, + { + "epoch": 0.12, + "grad_norm": 0.5673991441726685, + "learning_rate": 0.0005977102100038488, + "loss": 3.7807, + "step": 2409 + }, + { + "epoch": 0.12, + "grad_norm": 0.60816890001297, + "learning_rate": 0.0005977083110043341, + "loss": 3.5926, + "step": 2410 + }, + { + "epoch": 0.12, + "grad_norm": 0.5878978371620178, + "learning_rate": 0.0005977064112207131, + "loss": 3.7433, + "step": 2411 + }, + { + "epoch": 0.12, + "grad_norm": 0.5771352648735046, + "learning_rate": 0.0005977045106529906, + "loss": 3.7076, + "step": 2412 + }, + { + "epoch": 0.12, + "grad_norm": 0.5752608776092529, + "learning_rate": 0.0005977026093011719, + "loss": 3.6284, + "step": 2413 + }, + { + "epoch": 0.12, + "grad_norm": 0.5990986227989197, + "learning_rate": 0.0005977007071652616, + "loss": 3.7138, + "step": 2414 + }, + { + "epoch": 0.12, + "grad_norm": 0.5749576091766357, + "learning_rate": 0.0005976988042452651, + "loss": 3.5255, + "step": 2415 + }, + { + "epoch": 0.12, + "grad_norm": 0.5812985897064209, + "learning_rate": 0.0005976969005411871, + "loss": 3.8895, + "step": 2416 + }, + { + "epoch": 0.12, + "grad_norm": 0.5838669538497925, + "learning_rate": 0.0005976949960530329, + "loss": 3.604, + "step": 2417 + }, + { + "epoch": 0.12, + "grad_norm": 0.5880911946296692, + "learning_rate": 0.0005976930907808073, + "loss": 3.5379, + "step": 2418 + }, + { + "epoch": 0.12, + "grad_norm": 0.6260443925857544, + "learning_rate": 0.0005976911847245153, + "loss": 3.5423, + "step": 2419 + }, + { + "epoch": 0.12, + "grad_norm": 0.6164513826370239, + "learning_rate": 0.0005976892778841622, + "loss": 3.9167, + "step": 2420 + }, + { + "epoch": 0.12, + "grad_norm": 0.6131654977798462, + "learning_rate": 0.0005976873702597527, + "loss": 3.7101, + "step": 2421 + }, + { + "epoch": 0.12, + "grad_norm": 0.6063238978385925, + "learning_rate": 0.000597685461851292, + "loss": 3.6194, + "step": 2422 + }, + { + "epoch": 0.12, + "grad_norm": 0.5958883762359619, + "learning_rate": 0.0005976835526587851, + "loss": 3.8709, + "step": 2423 + }, + { + "epoch": 0.12, + "grad_norm": 0.623001217842102, + "learning_rate": 0.0005976816426822369, + "loss": 3.7071, + "step": 2424 + }, + { + "epoch": 0.12, + "grad_norm": 0.5648288726806641, + "learning_rate": 0.0005976797319216527, + "loss": 3.8756, + "step": 2425 + }, + { + "epoch": 0.12, + "grad_norm": 0.5784841179847717, + "learning_rate": 0.0005976778203770373, + "loss": 3.9232, + "step": 2426 + }, + { + "epoch": 0.12, + "grad_norm": 0.6105582118034363, + "learning_rate": 0.0005976759080483958, + "loss": 3.7534, + "step": 2427 + }, + { + "epoch": 0.12, + "grad_norm": 0.5793781280517578, + "learning_rate": 0.0005976739949357333, + "loss": 3.7578, + "step": 2428 + }, + { + "epoch": 0.12, + "grad_norm": 0.5981903672218323, + "learning_rate": 0.0005976720810390547, + "loss": 3.6772, + "step": 2429 + }, + { + "epoch": 0.12, + "grad_norm": 0.585236132144928, + "learning_rate": 0.0005976701663583652, + "loss": 3.5096, + "step": 2430 + }, + { + "epoch": 0.12, + "grad_norm": 0.6657644510269165, + "learning_rate": 0.0005976682508936699, + "loss": 3.6456, + "step": 2431 + }, + { + "epoch": 0.12, + "grad_norm": 0.5640720129013062, + "learning_rate": 0.0005976663346449735, + "loss": 3.8045, + "step": 2432 + }, + { + "epoch": 0.12, + "grad_norm": 0.6080378293991089, + "learning_rate": 0.0005976644176122813, + "loss": 3.732, + "step": 2433 + }, + { + "epoch": 0.12, + "grad_norm": 0.5977432727813721, + "learning_rate": 0.0005976624997955984, + "loss": 3.7909, + "step": 2434 + }, + { + "epoch": 0.12, + "grad_norm": 0.6109653115272522, + "learning_rate": 0.0005976605811949296, + "loss": 3.6774, + "step": 2435 + }, + { + "epoch": 0.12, + "grad_norm": 0.6155614256858826, + "learning_rate": 0.0005976586618102802, + "loss": 3.7383, + "step": 2436 + }, + { + "epoch": 0.12, + "grad_norm": 0.5923848748207092, + "learning_rate": 0.0005976567416416552, + "loss": 3.74, + "step": 2437 + }, + { + "epoch": 0.12, + "grad_norm": 0.6466518640518188, + "learning_rate": 0.0005976548206890597, + "loss": 3.8091, + "step": 2438 + }, + { + "epoch": 0.12, + "grad_norm": 0.6154281497001648, + "learning_rate": 0.0005976528989524985, + "loss": 3.5068, + "step": 2439 + }, + { + "epoch": 0.12, + "grad_norm": 0.5379983186721802, + "learning_rate": 0.000597650976431977, + "loss": 3.8493, + "step": 2440 + }, + { + "epoch": 0.12, + "grad_norm": 0.5665551424026489, + "learning_rate": 0.0005976490531275, + "loss": 3.6155, + "step": 2441 + }, + { + "epoch": 0.12, + "grad_norm": 0.5697283744812012, + "learning_rate": 0.0005976471290390727, + "loss": 3.8031, + "step": 2442 + }, + { + "epoch": 0.12, + "grad_norm": 0.6457461714744568, + "learning_rate": 0.0005976452041667002, + "loss": 3.6258, + "step": 2443 + }, + { + "epoch": 0.12, + "grad_norm": 0.5855603218078613, + "learning_rate": 0.0005976432785103875, + "loss": 3.6725, + "step": 2444 + }, + { + "epoch": 0.12, + "grad_norm": 0.642481803894043, + "learning_rate": 0.0005976413520701397, + "loss": 3.5866, + "step": 2445 + }, + { + "epoch": 0.12, + "grad_norm": 0.5984083414077759, + "learning_rate": 0.0005976394248459619, + "loss": 3.6047, + "step": 2446 + }, + { + "epoch": 0.12, + "grad_norm": 0.6179513931274414, + "learning_rate": 0.0005976374968378591, + "loss": 3.7578, + "step": 2447 + }, + { + "epoch": 0.12, + "grad_norm": 0.5873163938522339, + "learning_rate": 0.0005976355680458364, + "loss": 3.4911, + "step": 2448 + }, + { + "epoch": 0.12, + "grad_norm": 0.6037702560424805, + "learning_rate": 0.0005976336384698989, + "loss": 3.556, + "step": 2449 + }, + { + "epoch": 0.12, + "grad_norm": 0.6157534718513489, + "learning_rate": 0.0005976317081100517, + "loss": 3.5765, + "step": 2450 + }, + { + "epoch": 0.12, + "grad_norm": 0.5667474865913391, + "learning_rate": 0.0005976297769662997, + "loss": 3.6285, + "step": 2451 + }, + { + "epoch": 0.12, + "grad_norm": 0.5748934149742126, + "learning_rate": 0.0005976278450386483, + "loss": 3.6365, + "step": 2452 + }, + { + "epoch": 0.12, + "grad_norm": 0.5666347146034241, + "learning_rate": 0.0005976259123271025, + "loss": 3.5536, + "step": 2453 + }, + { + "epoch": 0.12, + "grad_norm": 0.6123989224433899, + "learning_rate": 0.0005976239788316671, + "loss": 3.6866, + "step": 2454 + }, + { + "epoch": 0.12, + "grad_norm": 0.5808203220367432, + "learning_rate": 0.0005976220445523476, + "loss": 3.6159, + "step": 2455 + }, + { + "epoch": 0.12, + "grad_norm": 0.6113956570625305, + "learning_rate": 0.0005976201094891489, + "loss": 3.6135, + "step": 2456 + }, + { + "epoch": 0.12, + "grad_norm": 0.5960624814033508, + "learning_rate": 0.0005976181736420762, + "loss": 3.7485, + "step": 2457 + }, + { + "epoch": 0.12, + "grad_norm": 0.6104657649993896, + "learning_rate": 0.0005976162370111343, + "loss": 3.8493, + "step": 2458 + }, + { + "epoch": 0.12, + "grad_norm": 0.5828893780708313, + "learning_rate": 0.0005976142995963285, + "loss": 3.6571, + "step": 2459 + }, + { + "epoch": 0.12, + "grad_norm": 0.6023302674293518, + "learning_rate": 0.000597612361397664, + "loss": 3.8401, + "step": 2460 + }, + { + "epoch": 0.12, + "grad_norm": 0.5271471738815308, + "learning_rate": 0.000597610422415146, + "loss": 3.8007, + "step": 2461 + }, + { + "epoch": 0.12, + "grad_norm": 0.6546827554702759, + "learning_rate": 0.0005976084826487792, + "loss": 3.4448, + "step": 2462 + }, + { + "epoch": 0.12, + "grad_norm": 0.610604465007782, + "learning_rate": 0.0005976065420985689, + "loss": 3.6248, + "step": 2463 + }, + { + "epoch": 0.12, + "grad_norm": 0.5978257060050964, + "learning_rate": 0.0005976046007645203, + "loss": 3.578, + "step": 2464 + }, + { + "epoch": 0.12, + "grad_norm": 0.599612832069397, + "learning_rate": 0.0005976026586466386, + "loss": 3.7923, + "step": 2465 + }, + { + "epoch": 0.12, + "grad_norm": 0.6239912509918213, + "learning_rate": 0.0005976007157449286, + "loss": 3.4729, + "step": 2466 + }, + { + "epoch": 0.12, + "grad_norm": 0.6227158904075623, + "learning_rate": 0.0005975987720593957, + "loss": 3.6351, + "step": 2467 + }, + { + "epoch": 0.12, + "grad_norm": 0.5865234732627869, + "learning_rate": 0.0005975968275900448, + "loss": 3.6103, + "step": 2468 + }, + { + "epoch": 0.12, + "grad_norm": 0.5831554532051086, + "learning_rate": 0.0005975948823368812, + "loss": 3.758, + "step": 2469 + }, + { + "epoch": 0.12, + "grad_norm": 0.6110011339187622, + "learning_rate": 0.0005975929362999099, + "loss": 3.6188, + "step": 2470 + }, + { + "epoch": 0.12, + "grad_norm": 0.6303837895393372, + "learning_rate": 0.000597590989479136, + "loss": 3.7157, + "step": 2471 + }, + { + "epoch": 0.12, + "grad_norm": 0.602101743221283, + "learning_rate": 0.000597589041874565, + "loss": 3.6808, + "step": 2472 + }, + { + "epoch": 0.12, + "grad_norm": 0.6086721420288086, + "learning_rate": 0.0005975870934862015, + "loss": 3.7579, + "step": 2473 + }, + { + "epoch": 0.12, + "grad_norm": 0.5447176694869995, + "learning_rate": 0.0005975851443140509, + "loss": 3.8872, + "step": 2474 + }, + { + "epoch": 0.12, + "grad_norm": 0.6065565347671509, + "learning_rate": 0.0005975831943581184, + "loss": 3.7509, + "step": 2475 + }, + { + "epoch": 0.12, + "grad_norm": 0.582368791103363, + "learning_rate": 0.0005975812436184089, + "loss": 3.6838, + "step": 2476 + }, + { + "epoch": 0.12, + "grad_norm": 0.5834957957267761, + "learning_rate": 0.0005975792920949278, + "loss": 3.7677, + "step": 2477 + }, + { + "epoch": 0.12, + "grad_norm": 0.5708243250846863, + "learning_rate": 0.00059757733978768, + "loss": 3.6435, + "step": 2478 + }, + { + "epoch": 0.12, + "grad_norm": 0.5927959084510803, + "learning_rate": 0.0005975753866966708, + "loss": 3.8947, + "step": 2479 + }, + { + "epoch": 0.12, + "grad_norm": 0.5893408060073853, + "learning_rate": 0.0005975734328219054, + "loss": 3.4956, + "step": 2480 + }, + { + "epoch": 0.12, + "grad_norm": 0.5960896611213684, + "learning_rate": 0.0005975714781633887, + "loss": 3.7718, + "step": 2481 + }, + { + "epoch": 0.12, + "grad_norm": 0.6134823560714722, + "learning_rate": 0.0005975695227211261, + "loss": 3.9268, + "step": 2482 + }, + { + "epoch": 0.12, + "grad_norm": 0.5897719264030457, + "learning_rate": 0.0005975675664951225, + "loss": 3.7412, + "step": 2483 + }, + { + "epoch": 0.12, + "grad_norm": 0.5636825561523438, + "learning_rate": 0.0005975656094853834, + "loss": 3.6555, + "step": 2484 + }, + { + "epoch": 0.12, + "grad_norm": 0.6075690984725952, + "learning_rate": 0.0005975636516919136, + "loss": 3.4279, + "step": 2485 + }, + { + "epoch": 0.12, + "grad_norm": 0.6178411245346069, + "learning_rate": 0.0005975616931147185, + "loss": 3.6293, + "step": 2486 + }, + { + "epoch": 0.12, + "grad_norm": 0.6706565022468567, + "learning_rate": 0.0005975597337538032, + "loss": 3.2851, + "step": 2487 + }, + { + "epoch": 0.12, + "grad_norm": 0.5771182775497437, + "learning_rate": 0.0005975577736091727, + "loss": 3.4126, + "step": 2488 + }, + { + "epoch": 0.12, + "grad_norm": 0.5679176449775696, + "learning_rate": 0.0005975558126808323, + "loss": 3.7656, + "step": 2489 + }, + { + "epoch": 0.12, + "grad_norm": 0.5970303416252136, + "learning_rate": 0.0005975538509687872, + "loss": 3.5579, + "step": 2490 + }, + { + "epoch": 0.12, + "grad_norm": 0.557059645652771, + "learning_rate": 0.0005975518884730425, + "loss": 3.693, + "step": 2491 + }, + { + "epoch": 0.12, + "grad_norm": 0.713829755783081, + "learning_rate": 0.0005975499251936034, + "loss": 3.7477, + "step": 2492 + }, + { + "epoch": 0.12, + "grad_norm": 0.5846167206764221, + "learning_rate": 0.000597547961130475, + "loss": 3.8181, + "step": 2493 + }, + { + "epoch": 0.12, + "grad_norm": 0.5585032105445862, + "learning_rate": 0.0005975459962836627, + "loss": 3.7385, + "step": 2494 + }, + { + "epoch": 0.12, + "grad_norm": 0.6127833724021912, + "learning_rate": 0.0005975440306531715, + "loss": 3.6375, + "step": 2495 + }, + { + "epoch": 0.12, + "grad_norm": 0.6165521144866943, + "learning_rate": 0.0005975420642390064, + "loss": 3.6615, + "step": 2496 + }, + { + "epoch": 0.12, + "grad_norm": 0.5974895358085632, + "learning_rate": 0.0005975400970411729, + "loss": 3.6218, + "step": 2497 + }, + { + "epoch": 0.12, + "grad_norm": 0.6264663338661194, + "learning_rate": 0.0005975381290596759, + "loss": 3.5796, + "step": 2498 + }, + { + "epoch": 0.12, + "grad_norm": 0.5844413638114929, + "learning_rate": 0.0005975361602945209, + "loss": 3.5147, + "step": 2499 + }, + { + "epoch": 0.12, + "grad_norm": 0.6054651141166687, + "learning_rate": 0.0005975341907457129, + "loss": 3.6625, + "step": 2500 + }, + { + "epoch": 0.12, + "grad_norm": 0.6386021971702576, + "learning_rate": 0.0005975322204132571, + "loss": 3.6716, + "step": 2501 + }, + { + "epoch": 0.12, + "grad_norm": 0.6154173612594604, + "learning_rate": 0.0005975302492971586, + "loss": 3.6322, + "step": 2502 + }, + { + "epoch": 0.12, + "grad_norm": 0.5977914929389954, + "learning_rate": 0.0005975282773974228, + "loss": 3.5976, + "step": 2503 + }, + { + "epoch": 0.12, + "grad_norm": 0.6002846360206604, + "learning_rate": 0.0005975263047140547, + "loss": 3.5668, + "step": 2504 + }, + { + "epoch": 0.12, + "grad_norm": 0.5692142844200134, + "learning_rate": 0.0005975243312470596, + "loss": 3.6982, + "step": 2505 + }, + { + "epoch": 0.12, + "grad_norm": 0.6204922795295715, + "learning_rate": 0.0005975223569964427, + "loss": 3.4332, + "step": 2506 + }, + { + "epoch": 0.12, + "grad_norm": 0.6070844531059265, + "learning_rate": 0.0005975203819622091, + "loss": 3.5652, + "step": 2507 + }, + { + "epoch": 0.12, + "grad_norm": 0.6402035355567932, + "learning_rate": 0.0005975184061443641, + "loss": 3.9418, + "step": 2508 + }, + { + "epoch": 0.12, + "grad_norm": 0.5677720308303833, + "learning_rate": 0.0005975164295429129, + "loss": 3.5175, + "step": 2509 + }, + { + "epoch": 0.12, + "grad_norm": 0.6889002323150635, + "learning_rate": 0.0005975144521578607, + "loss": 3.6094, + "step": 2510 + }, + { + "epoch": 0.12, + "grad_norm": 0.6181936264038086, + "learning_rate": 0.0005975124739892126, + "loss": 3.6502, + "step": 2511 + }, + { + "epoch": 0.12, + "grad_norm": 0.6118708848953247, + "learning_rate": 0.000597510495036974, + "loss": 3.7122, + "step": 2512 + }, + { + "epoch": 0.12, + "grad_norm": 0.5670859217643738, + "learning_rate": 0.0005975085153011499, + "loss": 3.677, + "step": 2513 + }, + { + "epoch": 0.12, + "grad_norm": 0.5545445680618286, + "learning_rate": 0.0005975065347817457, + "loss": 3.8839, + "step": 2514 + }, + { + "epoch": 0.12, + "grad_norm": 0.6169227361679077, + "learning_rate": 0.0005975045534787666, + "loss": 3.7485, + "step": 2515 + }, + { + "epoch": 0.12, + "grad_norm": 0.6185595393180847, + "learning_rate": 0.0005975025713922177, + "loss": 3.763, + "step": 2516 + }, + { + "epoch": 0.12, + "grad_norm": 0.5830788612365723, + "learning_rate": 0.0005975005885221044, + "loss": 3.66, + "step": 2517 + }, + { + "epoch": 0.12, + "grad_norm": 0.5783771276473999, + "learning_rate": 0.0005974986048684315, + "loss": 3.6327, + "step": 2518 + }, + { + "epoch": 0.12, + "grad_norm": 0.6497078537940979, + "learning_rate": 0.0005974966204312048, + "loss": 3.4842, + "step": 2519 + }, + { + "epoch": 0.12, + "grad_norm": 0.5857980251312256, + "learning_rate": 0.0005974946352104292, + "loss": 3.6608, + "step": 2520 + }, + { + "epoch": 0.12, + "grad_norm": 0.5564964413642883, + "learning_rate": 0.00059749264920611, + "loss": 3.7486, + "step": 2521 + }, + { + "epoch": 0.12, + "grad_norm": 0.6064032912254333, + "learning_rate": 0.0005974906624182523, + "loss": 3.7437, + "step": 2522 + }, + { + "epoch": 0.12, + "grad_norm": 0.5508198142051697, + "learning_rate": 0.0005974886748468616, + "loss": 3.4906, + "step": 2523 + }, + { + "epoch": 0.12, + "grad_norm": 0.588590145111084, + "learning_rate": 0.0005974866864919429, + "loss": 3.5258, + "step": 2524 + }, + { + "epoch": 0.12, + "grad_norm": 0.6488778591156006, + "learning_rate": 0.0005974846973535015, + "loss": 3.5921, + "step": 2525 + }, + { + "epoch": 0.12, + "grad_norm": 0.634125292301178, + "learning_rate": 0.0005974827074315427, + "loss": 3.5026, + "step": 2526 + }, + { + "epoch": 0.12, + "grad_norm": 0.6007183790206909, + "learning_rate": 0.0005974807167260717, + "loss": 3.5792, + "step": 2527 + }, + { + "epoch": 0.12, + "grad_norm": 0.5504261255264282, + "learning_rate": 0.0005974787252370937, + "loss": 3.6759, + "step": 2528 + }, + { + "epoch": 0.12, + "grad_norm": 0.6158027052879333, + "learning_rate": 0.000597476732964614, + "loss": 3.564, + "step": 2529 + }, + { + "epoch": 0.12, + "grad_norm": 0.6696975827217102, + "learning_rate": 0.0005974747399086379, + "loss": 3.4829, + "step": 2530 + }, + { + "epoch": 0.12, + "grad_norm": 0.6032002568244934, + "learning_rate": 0.0005974727460691706, + "loss": 3.7942, + "step": 2531 + }, + { + "epoch": 0.12, + "grad_norm": 0.5709253549575806, + "learning_rate": 0.0005974707514462173, + "loss": 3.8751, + "step": 2532 + }, + { + "epoch": 0.12, + "grad_norm": 0.6123468279838562, + "learning_rate": 0.0005974687560397832, + "loss": 3.5406, + "step": 2533 + }, + { + "epoch": 0.12, + "grad_norm": 0.7552351355552673, + "learning_rate": 0.0005974667598498737, + "loss": 3.6732, + "step": 2534 + }, + { + "epoch": 0.12, + "grad_norm": 0.6883453130722046, + "learning_rate": 0.0005974647628764941, + "loss": 3.5411, + "step": 2535 + }, + { + "epoch": 0.12, + "grad_norm": 0.614341676235199, + "learning_rate": 0.0005974627651196495, + "loss": 3.5317, + "step": 2536 + }, + { + "epoch": 0.12, + "grad_norm": 0.5992317795753479, + "learning_rate": 0.0005974607665793452, + "loss": 3.694, + "step": 2537 + }, + { + "epoch": 0.12, + "grad_norm": 0.5962361693382263, + "learning_rate": 0.0005974587672555865, + "loss": 3.5415, + "step": 2538 + }, + { + "epoch": 0.12, + "grad_norm": 0.6169871687889099, + "learning_rate": 0.0005974567671483786, + "loss": 3.5371, + "step": 2539 + }, + { + "epoch": 0.12, + "grad_norm": 0.575014591217041, + "learning_rate": 0.0005974547662577268, + "loss": 3.692, + "step": 2540 + }, + { + "epoch": 0.12, + "grad_norm": 0.66556316614151, + "learning_rate": 0.0005974527645836365, + "loss": 3.4761, + "step": 2541 + }, + { + "epoch": 0.12, + "grad_norm": 0.5992640852928162, + "learning_rate": 0.0005974507621261129, + "loss": 3.7176, + "step": 2542 + }, + { + "epoch": 0.12, + "grad_norm": 0.6112333536148071, + "learning_rate": 0.0005974487588851611, + "loss": 3.4594, + "step": 2543 + }, + { + "epoch": 0.12, + "grad_norm": 0.6187512874603271, + "learning_rate": 0.0005974467548607866, + "loss": 3.6286, + "step": 2544 + }, + { + "epoch": 0.12, + "grad_norm": 0.5970472693443298, + "learning_rate": 0.0005974447500529946, + "loss": 3.5783, + "step": 2545 + }, + { + "epoch": 0.12, + "grad_norm": 0.622491180896759, + "learning_rate": 0.0005974427444617903, + "loss": 3.935, + "step": 2546 + }, + { + "epoch": 0.12, + "grad_norm": 0.5881618857383728, + "learning_rate": 0.000597440738087179, + "loss": 3.7599, + "step": 2547 + }, + { + "epoch": 0.12, + "grad_norm": 0.5989762544631958, + "learning_rate": 0.0005974387309291662, + "loss": 3.5662, + "step": 2548 + }, + { + "epoch": 0.12, + "grad_norm": 0.5770282745361328, + "learning_rate": 0.000597436722987757, + "loss": 3.5526, + "step": 2549 + }, + { + "epoch": 0.12, + "grad_norm": 0.5764257907867432, + "learning_rate": 0.0005974347142629567, + "loss": 3.6311, + "step": 2550 + }, + { + "epoch": 0.13, + "grad_norm": 0.5779116749763489, + "learning_rate": 0.0005974327047547706, + "loss": 3.5682, + "step": 2551 + }, + { + "epoch": 0.13, + "grad_norm": 0.5993980169296265, + "learning_rate": 0.000597430694463204, + "loss": 3.9816, + "step": 2552 + }, + { + "epoch": 0.13, + "grad_norm": 0.6707353591918945, + "learning_rate": 0.0005974286833882621, + "loss": 3.7115, + "step": 2553 + }, + { + "epoch": 0.13, + "grad_norm": 0.5597850680351257, + "learning_rate": 0.0005974266715299504, + "loss": 3.5479, + "step": 2554 + }, + { + "epoch": 0.13, + "grad_norm": 0.5796374678611755, + "learning_rate": 0.0005974246588882741, + "loss": 3.6176, + "step": 2555 + }, + { + "epoch": 0.13, + "grad_norm": 0.6326395869255066, + "learning_rate": 0.0005974226454632384, + "loss": 3.728, + "step": 2556 + }, + { + "epoch": 0.13, + "grad_norm": 0.5709221363067627, + "learning_rate": 0.0005974206312548487, + "loss": 3.7504, + "step": 2557 + }, + { + "epoch": 0.13, + "grad_norm": 0.6459200978279114, + "learning_rate": 0.0005974186162631104, + "loss": 3.8128, + "step": 2558 + }, + { + "epoch": 0.13, + "grad_norm": 0.6132066249847412, + "learning_rate": 0.0005974166004880286, + "loss": 3.6669, + "step": 2559 + }, + { + "epoch": 0.13, + "grad_norm": 0.593331515789032, + "learning_rate": 0.0005974145839296088, + "loss": 3.7232, + "step": 2560 + }, + { + "epoch": 0.13, + "grad_norm": 0.5764754414558411, + "learning_rate": 0.0005974125665878561, + "loss": 3.4763, + "step": 2561 + }, + { + "epoch": 0.13, + "grad_norm": 0.5679061412811279, + "learning_rate": 0.000597410548462776, + "loss": 3.3199, + "step": 2562 + }, + { + "epoch": 0.13, + "grad_norm": 0.6383253931999207, + "learning_rate": 0.0005974085295543739, + "loss": 3.6122, + "step": 2563 + }, + { + "epoch": 0.13, + "grad_norm": 0.6798160672187805, + "learning_rate": 0.0005974065098626548, + "loss": 3.8301, + "step": 2564 + }, + { + "epoch": 0.13, + "grad_norm": 0.5811006426811218, + "learning_rate": 0.0005974044893876241, + "loss": 3.7606, + "step": 2565 + }, + { + "epoch": 0.13, + "grad_norm": 0.6266860961914062, + "learning_rate": 0.0005974024681292874, + "loss": 3.6762, + "step": 2566 + }, + { + "epoch": 0.13, + "grad_norm": 0.5711492896080017, + "learning_rate": 0.0005974004460876498, + "loss": 3.6717, + "step": 2567 + }, + { + "epoch": 0.13, + "grad_norm": 0.5752375721931458, + "learning_rate": 0.0005973984232627166, + "loss": 3.5786, + "step": 2568 + }, + { + "epoch": 0.13, + "grad_norm": 0.6051990985870361, + "learning_rate": 0.0005973963996544932, + "loss": 3.7287, + "step": 2569 + }, + { + "epoch": 0.13, + "grad_norm": 0.6400178074836731, + "learning_rate": 0.000597394375262985, + "loss": 3.4631, + "step": 2570 + }, + { + "epoch": 0.13, + "grad_norm": 0.6259292364120483, + "learning_rate": 0.0005973923500881971, + "loss": 3.5266, + "step": 2571 + }, + { + "epoch": 0.13, + "grad_norm": 0.5960250496864319, + "learning_rate": 0.0005973903241301352, + "loss": 3.5943, + "step": 2572 + }, + { + "epoch": 0.13, + "grad_norm": 0.6018271446228027, + "learning_rate": 0.0005973882973888042, + "loss": 3.6866, + "step": 2573 + }, + { + "epoch": 0.13, + "grad_norm": 0.5785349011421204, + "learning_rate": 0.0005973862698642097, + "loss": 3.7706, + "step": 2574 + }, + { + "epoch": 0.13, + "grad_norm": 0.7196159362792969, + "learning_rate": 0.0005973842415563572, + "loss": 3.6392, + "step": 2575 + }, + { + "epoch": 0.13, + "grad_norm": 0.6078731417655945, + "learning_rate": 0.0005973822124652516, + "loss": 3.8852, + "step": 2576 + }, + { + "epoch": 0.13, + "grad_norm": 0.6295037865638733, + "learning_rate": 0.0005973801825908987, + "loss": 3.5333, + "step": 2577 + }, + { + "epoch": 0.13, + "grad_norm": 0.5880255103111267, + "learning_rate": 0.0005973781519333034, + "loss": 3.6906, + "step": 2578 + }, + { + "epoch": 0.13, + "grad_norm": 0.5796219110488892, + "learning_rate": 0.0005973761204924714, + "loss": 3.6351, + "step": 2579 + }, + { + "epoch": 0.13, + "grad_norm": 0.5962153077125549, + "learning_rate": 0.0005973740882684079, + "loss": 3.7197, + "step": 2580 + }, + { + "epoch": 0.13, + "grad_norm": 0.5779426097869873, + "learning_rate": 0.0005973720552611183, + "loss": 3.7243, + "step": 2581 + }, + { + "epoch": 0.13, + "grad_norm": 0.6200467348098755, + "learning_rate": 0.000597370021470608, + "loss": 3.7223, + "step": 2582 + }, + { + "epoch": 0.13, + "grad_norm": 0.5809329152107239, + "learning_rate": 0.0005973679868968822, + "loss": 3.4354, + "step": 2583 + }, + { + "epoch": 0.13, + "grad_norm": 0.617591142654419, + "learning_rate": 0.0005973659515399463, + "loss": 3.667, + "step": 2584 + }, + { + "epoch": 0.13, + "grad_norm": 0.6338459253311157, + "learning_rate": 0.0005973639153998058, + "loss": 3.6246, + "step": 2585 + }, + { + "epoch": 0.13, + "grad_norm": 0.5794420838356018, + "learning_rate": 0.000597361878476466, + "loss": 3.6043, + "step": 2586 + }, + { + "epoch": 0.13, + "grad_norm": 0.6019072532653809, + "learning_rate": 0.0005973598407699322, + "loss": 3.6836, + "step": 2587 + }, + { + "epoch": 0.13, + "grad_norm": 0.5721585154533386, + "learning_rate": 0.0005973578022802098, + "loss": 3.7401, + "step": 2588 + }, + { + "epoch": 0.13, + "grad_norm": 0.6371548771858215, + "learning_rate": 0.0005973557630073042, + "loss": 3.5781, + "step": 2589 + }, + { + "epoch": 0.13, + "grad_norm": 0.5836586356163025, + "learning_rate": 0.0005973537229512208, + "loss": 3.7384, + "step": 2590 + }, + { + "epoch": 0.13, + "grad_norm": 0.5855005383491516, + "learning_rate": 0.0005973516821119649, + "loss": 3.6999, + "step": 2591 + }, + { + "epoch": 0.13, + "grad_norm": 0.5802505016326904, + "learning_rate": 0.0005973496404895419, + "loss": 3.7894, + "step": 2592 + }, + { + "epoch": 0.13, + "grad_norm": 0.6217300295829773, + "learning_rate": 0.0005973475980839571, + "loss": 3.7012, + "step": 2593 + }, + { + "epoch": 0.13, + "grad_norm": 0.657211422920227, + "learning_rate": 0.0005973455548952159, + "loss": 3.6203, + "step": 2594 + }, + { + "epoch": 0.13, + "grad_norm": 0.5941809415817261, + "learning_rate": 0.0005973435109233238, + "loss": 3.7269, + "step": 2595 + }, + { + "epoch": 0.13, + "grad_norm": 0.5752348303794861, + "learning_rate": 0.0005973414661682862, + "loss": 3.7513, + "step": 2596 + }, + { + "epoch": 0.13, + "grad_norm": 0.59751957654953, + "learning_rate": 0.0005973394206301083, + "loss": 3.5195, + "step": 2597 + }, + { + "epoch": 0.13, + "grad_norm": 0.5890262722969055, + "learning_rate": 0.0005973373743087956, + "loss": 3.4186, + "step": 2598 + }, + { + "epoch": 0.13, + "grad_norm": 0.5875232815742493, + "learning_rate": 0.0005973353272043535, + "loss": 3.5819, + "step": 2599 + }, + { + "epoch": 0.13, + "grad_norm": 0.5776680111885071, + "learning_rate": 0.0005973332793167874, + "loss": 3.6035, + "step": 2600 + }, + { + "epoch": 0.13, + "grad_norm": 0.652600884437561, + "learning_rate": 0.0005973312306461027, + "loss": 3.5718, + "step": 2601 + }, + { + "epoch": 0.13, + "grad_norm": 0.6072568893432617, + "learning_rate": 0.0005973291811923046, + "loss": 3.6524, + "step": 2602 + }, + { + "epoch": 0.13, + "grad_norm": 0.5894418954849243, + "learning_rate": 0.0005973271309553988, + "loss": 3.7459, + "step": 2603 + }, + { + "epoch": 0.13, + "grad_norm": 0.6451753377914429, + "learning_rate": 0.0005973250799353906, + "loss": 3.6723, + "step": 2604 + }, + { + "epoch": 0.13, + "grad_norm": 0.5642520785331726, + "learning_rate": 0.0005973230281322852, + "loss": 3.6376, + "step": 2605 + }, + { + "epoch": 0.13, + "grad_norm": 0.6151884198188782, + "learning_rate": 0.0005973209755460883, + "loss": 3.6376, + "step": 2606 + }, + { + "epoch": 0.13, + "grad_norm": 0.5698564052581787, + "learning_rate": 0.0005973189221768051, + "loss": 3.5028, + "step": 2607 + }, + { + "epoch": 0.13, + "grad_norm": 0.5662623047828674, + "learning_rate": 0.000597316868024441, + "loss": 3.7416, + "step": 2608 + }, + { + "epoch": 0.13, + "grad_norm": 0.6105310916900635, + "learning_rate": 0.0005973148130890017, + "loss": 3.506, + "step": 2609 + }, + { + "epoch": 0.13, + "grad_norm": 0.5945753455162048, + "learning_rate": 0.0005973127573704923, + "loss": 3.7686, + "step": 2610 + }, + { + "epoch": 0.13, + "grad_norm": 0.5580887794494629, + "learning_rate": 0.0005973107008689182, + "loss": 3.4169, + "step": 2611 + }, + { + "epoch": 0.13, + "grad_norm": 0.5982559323310852, + "learning_rate": 0.0005973086435842851, + "loss": 3.8084, + "step": 2612 + }, + { + "epoch": 0.13, + "grad_norm": 0.545141875743866, + "learning_rate": 0.0005973065855165981, + "loss": 3.7542, + "step": 2613 + }, + { + "epoch": 0.13, + "grad_norm": 0.5577916502952576, + "learning_rate": 0.0005973045266658629, + "loss": 3.7296, + "step": 2614 + }, + { + "epoch": 0.13, + "grad_norm": 0.6226997971534729, + "learning_rate": 0.0005973024670320848, + "loss": 3.61, + "step": 2615 + }, + { + "epoch": 0.13, + "grad_norm": 0.5903077721595764, + "learning_rate": 0.0005973004066152692, + "loss": 3.5793, + "step": 2616 + }, + { + "epoch": 0.13, + "grad_norm": 0.6483430862426758, + "learning_rate": 0.0005972983454154215, + "loss": 3.7537, + "step": 2617 + }, + { + "epoch": 0.13, + "grad_norm": 0.5781821608543396, + "learning_rate": 0.0005972962834325472, + "loss": 3.7462, + "step": 2618 + }, + { + "epoch": 0.13, + "grad_norm": 0.6237555742263794, + "learning_rate": 0.0005972942206666517, + "loss": 3.709, + "step": 2619 + }, + { + "epoch": 0.13, + "grad_norm": 0.538802444934845, + "learning_rate": 0.0005972921571177405, + "loss": 3.8094, + "step": 2620 + }, + { + "epoch": 0.13, + "grad_norm": 0.6104321479797363, + "learning_rate": 0.0005972900927858188, + "loss": 3.5672, + "step": 2621 + }, + { + "epoch": 0.13, + "grad_norm": 0.5900411605834961, + "learning_rate": 0.0005972880276708925, + "loss": 3.6144, + "step": 2622 + }, + { + "epoch": 0.13, + "grad_norm": 0.6186203360557556, + "learning_rate": 0.0005972859617729665, + "loss": 3.6618, + "step": 2623 + }, + { + "epoch": 0.13, + "grad_norm": 0.5977591872215271, + "learning_rate": 0.0005972838950920465, + "loss": 3.5381, + "step": 2624 + }, + { + "epoch": 0.13, + "grad_norm": 0.5627738237380981, + "learning_rate": 0.000597281827628138, + "loss": 3.5466, + "step": 2625 + }, + { + "epoch": 0.13, + "grad_norm": 0.5746119022369385, + "learning_rate": 0.0005972797593812464, + "loss": 3.4968, + "step": 2626 + }, + { + "epoch": 0.13, + "grad_norm": 0.6021314859390259, + "learning_rate": 0.0005972776903513771, + "loss": 3.6362, + "step": 2627 + }, + { + "epoch": 0.13, + "grad_norm": 0.5764266848564148, + "learning_rate": 0.0005972756205385356, + "loss": 3.5602, + "step": 2628 + }, + { + "epoch": 0.13, + "grad_norm": 0.560896098613739, + "learning_rate": 0.0005972735499427273, + "loss": 3.9325, + "step": 2629 + }, + { + "epoch": 0.13, + "grad_norm": 0.5999066829681396, + "learning_rate": 0.0005972714785639576, + "loss": 3.5947, + "step": 2630 + }, + { + "epoch": 0.13, + "grad_norm": 0.6318387389183044, + "learning_rate": 0.0005972694064022322, + "loss": 3.4485, + "step": 2631 + }, + { + "epoch": 0.13, + "grad_norm": 0.6000919938087463, + "learning_rate": 0.0005972673334575562, + "loss": 3.6263, + "step": 2632 + }, + { + "epoch": 0.13, + "grad_norm": 0.6186161041259766, + "learning_rate": 0.0005972652597299354, + "loss": 3.4619, + "step": 2633 + }, + { + "epoch": 0.13, + "grad_norm": 0.6438592076301575, + "learning_rate": 0.0005972631852193751, + "loss": 3.8048, + "step": 2634 + }, + { + "epoch": 0.13, + "grad_norm": 0.6455599069595337, + "learning_rate": 0.0005972611099258808, + "loss": 3.8894, + "step": 2635 + }, + { + "epoch": 0.13, + "grad_norm": 0.6064257025718689, + "learning_rate": 0.0005972590338494579, + "loss": 3.6067, + "step": 2636 + }, + { + "epoch": 0.13, + "grad_norm": 0.5977962613105774, + "learning_rate": 0.0005972569569901119, + "loss": 3.8438, + "step": 2637 + }, + { + "epoch": 0.13, + "grad_norm": 0.5873711109161377, + "learning_rate": 0.0005972548793478483, + "loss": 3.4951, + "step": 2638 + }, + { + "epoch": 0.13, + "grad_norm": 0.5406238436698914, + "learning_rate": 0.0005972528009226725, + "loss": 3.5039, + "step": 2639 + }, + { + "epoch": 0.13, + "grad_norm": 0.6180803179740906, + "learning_rate": 0.0005972507217145901, + "loss": 3.816, + "step": 2640 + }, + { + "epoch": 0.13, + "grad_norm": 0.5730354189872742, + "learning_rate": 0.0005972486417236065, + "loss": 3.5936, + "step": 2641 + }, + { + "epoch": 0.13, + "grad_norm": 0.5667976140975952, + "learning_rate": 0.0005972465609497273, + "loss": 3.7131, + "step": 2642 + }, + { + "epoch": 0.13, + "grad_norm": 0.5629318356513977, + "learning_rate": 0.0005972444793929577, + "loss": 3.7713, + "step": 2643 + }, + { + "epoch": 0.13, + "grad_norm": 0.6278749704360962, + "learning_rate": 0.0005972423970533033, + "loss": 3.6363, + "step": 2644 + }, + { + "epoch": 0.13, + "grad_norm": 0.637045681476593, + "learning_rate": 0.0005972403139307698, + "loss": 3.6883, + "step": 2645 + }, + { + "epoch": 0.13, + "grad_norm": 0.5682446956634521, + "learning_rate": 0.0005972382300253625, + "loss": 3.7383, + "step": 2646 + }, + { + "epoch": 0.13, + "grad_norm": 0.5715309977531433, + "learning_rate": 0.0005972361453370868, + "loss": 3.4358, + "step": 2647 + }, + { + "epoch": 0.13, + "grad_norm": 0.5956022143363953, + "learning_rate": 0.0005972340598659483, + "loss": 3.5463, + "step": 2648 + }, + { + "epoch": 0.13, + "grad_norm": 0.5942005515098572, + "learning_rate": 0.0005972319736119526, + "loss": 3.667, + "step": 2649 + }, + { + "epoch": 0.13, + "grad_norm": 0.6336799263954163, + "learning_rate": 0.000597229886575105, + "loss": 3.5427, + "step": 2650 + }, + { + "epoch": 0.13, + "grad_norm": 0.5908926129341125, + "learning_rate": 0.0005972277987554111, + "loss": 3.6074, + "step": 2651 + }, + { + "epoch": 0.13, + "grad_norm": 0.6374375224113464, + "learning_rate": 0.0005972257101528763, + "loss": 3.4208, + "step": 2652 + }, + { + "epoch": 0.13, + "grad_norm": 0.5974894762039185, + "learning_rate": 0.0005972236207675063, + "loss": 3.6029, + "step": 2653 + }, + { + "epoch": 0.13, + "grad_norm": 0.5751729011535645, + "learning_rate": 0.0005972215305993065, + "loss": 3.6742, + "step": 2654 + }, + { + "epoch": 0.13, + "grad_norm": 0.5705915689468384, + "learning_rate": 0.0005972194396482823, + "loss": 3.8229, + "step": 2655 + }, + { + "epoch": 0.13, + "grad_norm": 0.6375955939292908, + "learning_rate": 0.0005972173479144394, + "loss": 3.5791, + "step": 2656 + }, + { + "epoch": 0.13, + "grad_norm": 0.6054398417472839, + "learning_rate": 0.0005972152553977831, + "loss": 3.4664, + "step": 2657 + }, + { + "epoch": 0.13, + "grad_norm": 0.5961582064628601, + "learning_rate": 0.0005972131620983191, + "loss": 3.7149, + "step": 2658 + }, + { + "epoch": 0.13, + "grad_norm": 0.5579370856285095, + "learning_rate": 0.0005972110680160528, + "loss": 3.7323, + "step": 2659 + }, + { + "epoch": 0.13, + "grad_norm": 0.6265712380409241, + "learning_rate": 0.0005972089731509897, + "loss": 3.494, + "step": 2660 + }, + { + "epoch": 0.13, + "grad_norm": 0.5627157688140869, + "learning_rate": 0.0005972068775031353, + "loss": 3.8952, + "step": 2661 + }, + { + "epoch": 0.13, + "grad_norm": 0.6081935167312622, + "learning_rate": 0.0005972047810724954, + "loss": 3.6359, + "step": 2662 + }, + { + "epoch": 0.13, + "grad_norm": 0.6013399958610535, + "learning_rate": 0.0005972026838590753, + "loss": 3.7558, + "step": 2663 + }, + { + "epoch": 0.13, + "grad_norm": 0.7497528791427612, + "learning_rate": 0.0005972005858628803, + "loss": 3.5532, + "step": 2664 + }, + { + "epoch": 0.13, + "grad_norm": 0.5855658650398254, + "learning_rate": 0.0005971984870839162, + "loss": 3.8398, + "step": 2665 + }, + { + "epoch": 0.13, + "grad_norm": 0.5891215205192566, + "learning_rate": 0.0005971963875221886, + "loss": 3.7379, + "step": 2666 + }, + { + "epoch": 0.13, + "grad_norm": 0.5891420841217041, + "learning_rate": 0.000597194287177703, + "loss": 3.7002, + "step": 2667 + }, + { + "epoch": 0.13, + "grad_norm": 0.5906747579574585, + "learning_rate": 0.0005971921860504646, + "loss": 3.6601, + "step": 2668 + }, + { + "epoch": 0.13, + "grad_norm": 0.5525054931640625, + "learning_rate": 0.0005971900841404794, + "loss": 4.0302, + "step": 2669 + }, + { + "epoch": 0.13, + "grad_norm": 0.6058871746063232, + "learning_rate": 0.0005971879814477525, + "loss": 3.795, + "step": 2670 + }, + { + "epoch": 0.13, + "grad_norm": 0.5821624398231506, + "learning_rate": 0.0005971858779722898, + "loss": 3.6976, + "step": 2671 + }, + { + "epoch": 0.13, + "grad_norm": 0.5979516506195068, + "learning_rate": 0.0005971837737140966, + "loss": 3.5227, + "step": 2672 + }, + { + "epoch": 0.13, + "grad_norm": 0.558814287185669, + "learning_rate": 0.0005971816686731786, + "loss": 3.5519, + "step": 2673 + }, + { + "epoch": 0.13, + "grad_norm": 0.5667821168899536, + "learning_rate": 0.0005971795628495413, + "loss": 3.7112, + "step": 2674 + }, + { + "epoch": 0.13, + "grad_norm": 0.5700370669364929, + "learning_rate": 0.0005971774562431901, + "loss": 3.528, + "step": 2675 + }, + { + "epoch": 0.13, + "grad_norm": 0.5918428301811218, + "learning_rate": 0.0005971753488541307, + "loss": 3.6761, + "step": 2676 + }, + { + "epoch": 0.13, + "grad_norm": 0.6164225935935974, + "learning_rate": 0.0005971732406823688, + "loss": 3.6645, + "step": 2677 + }, + { + "epoch": 0.13, + "grad_norm": 0.5859614610671997, + "learning_rate": 0.0005971711317279096, + "loss": 3.698, + "step": 2678 + }, + { + "epoch": 0.13, + "grad_norm": 0.5908759236335754, + "learning_rate": 0.0005971690219907589, + "loss": 3.7241, + "step": 2679 + }, + { + "epoch": 0.13, + "grad_norm": 0.6182529330253601, + "learning_rate": 0.0005971669114709222, + "loss": 3.5972, + "step": 2680 + }, + { + "epoch": 0.13, + "grad_norm": 0.576896071434021, + "learning_rate": 0.000597164800168405, + "loss": 3.679, + "step": 2681 + }, + { + "epoch": 0.13, + "grad_norm": 0.5724910497665405, + "learning_rate": 0.000597162688083213, + "loss": 3.7296, + "step": 2682 + }, + { + "epoch": 0.13, + "grad_norm": 0.6576551198959351, + "learning_rate": 0.0005971605752153516, + "loss": 3.6643, + "step": 2683 + }, + { + "epoch": 0.13, + "grad_norm": 0.6664922833442688, + "learning_rate": 0.0005971584615648264, + "loss": 3.5636, + "step": 2684 + }, + { + "epoch": 0.13, + "grad_norm": 0.639898955821991, + "learning_rate": 0.000597156347131643, + "loss": 3.6449, + "step": 2685 + }, + { + "epoch": 0.13, + "grad_norm": 0.680196225643158, + "learning_rate": 0.000597154231915807, + "loss": 3.4298, + "step": 2686 + }, + { + "epoch": 0.13, + "grad_norm": 0.6258304119110107, + "learning_rate": 0.0005971521159173239, + "loss": 3.8802, + "step": 2687 + }, + { + "epoch": 0.13, + "grad_norm": 0.5518138408660889, + "learning_rate": 0.0005971499991361994, + "loss": 3.5749, + "step": 2688 + }, + { + "epoch": 0.13, + "grad_norm": 0.5616171956062317, + "learning_rate": 0.000597147881572439, + "loss": 3.7806, + "step": 2689 + }, + { + "epoch": 0.13, + "grad_norm": 0.5623195171356201, + "learning_rate": 0.0005971457632260482, + "loss": 3.4841, + "step": 2690 + }, + { + "epoch": 0.13, + "grad_norm": 0.6076975464820862, + "learning_rate": 0.0005971436440970326, + "loss": 3.7146, + "step": 2691 + }, + { + "epoch": 0.13, + "grad_norm": 0.5725387334823608, + "learning_rate": 0.0005971415241853979, + "loss": 3.622, + "step": 2692 + }, + { + "epoch": 0.13, + "grad_norm": 0.5814027786254883, + "learning_rate": 0.0005971394034911497, + "loss": 3.5529, + "step": 2693 + }, + { + "epoch": 0.13, + "grad_norm": 0.5468875169754028, + "learning_rate": 0.0005971372820142934, + "loss": 3.4851, + "step": 2694 + }, + { + "epoch": 0.13, + "grad_norm": 0.6255273222923279, + "learning_rate": 0.0005971351597548346, + "loss": 3.641, + "step": 2695 + }, + { + "epoch": 0.13, + "grad_norm": 0.6269606351852417, + "learning_rate": 0.000597133036712779, + "loss": 3.7258, + "step": 2696 + }, + { + "epoch": 0.13, + "grad_norm": 0.6093981266021729, + "learning_rate": 0.0005971309128881324, + "loss": 3.3623, + "step": 2697 + }, + { + "epoch": 0.13, + "grad_norm": 0.5752262473106384, + "learning_rate": 0.0005971287882808999, + "loss": 3.6917, + "step": 2698 + }, + { + "epoch": 0.13, + "grad_norm": 0.6046422123908997, + "learning_rate": 0.0005971266628910874, + "loss": 3.7567, + "step": 2699 + }, + { + "epoch": 0.13, + "grad_norm": 0.6624324917793274, + "learning_rate": 0.0005971245367187005, + "loss": 3.6769, + "step": 2700 + }, + { + "epoch": 0.13, + "grad_norm": 0.5971564054489136, + "learning_rate": 0.0005971224097637447, + "loss": 3.7973, + "step": 2701 + }, + { + "epoch": 0.13, + "grad_norm": 0.5755377411842346, + "learning_rate": 0.0005971202820262258, + "loss": 3.5409, + "step": 2702 + }, + { + "epoch": 0.13, + "grad_norm": 0.6195607781410217, + "learning_rate": 0.0005971181535061491, + "loss": 3.6003, + "step": 2703 + }, + { + "epoch": 0.13, + "grad_norm": 0.6000617742538452, + "learning_rate": 0.0005971160242035204, + "loss": 3.6304, + "step": 2704 + }, + { + "epoch": 0.13, + "grad_norm": 0.5995765328407288, + "learning_rate": 0.0005971138941183454, + "loss": 3.4578, + "step": 2705 + }, + { + "epoch": 0.13, + "grad_norm": 0.5783835053443909, + "learning_rate": 0.0005971117632506295, + "loss": 3.609, + "step": 2706 + }, + { + "epoch": 0.13, + "grad_norm": 0.5949788093566895, + "learning_rate": 0.0005971096316003783, + "loss": 3.7868, + "step": 2707 + }, + { + "epoch": 0.13, + "grad_norm": 0.5998488068580627, + "learning_rate": 0.0005971074991675976, + "loss": 3.599, + "step": 2708 + }, + { + "epoch": 0.13, + "grad_norm": 0.5909786820411682, + "learning_rate": 0.0005971053659522929, + "loss": 3.5773, + "step": 2709 + }, + { + "epoch": 0.13, + "grad_norm": 0.5821101665496826, + "learning_rate": 0.0005971032319544698, + "loss": 3.5233, + "step": 2710 + }, + { + "epoch": 0.13, + "grad_norm": 0.587090253829956, + "learning_rate": 0.000597101097174134, + "loss": 3.7505, + "step": 2711 + }, + { + "epoch": 0.13, + "grad_norm": 0.622183620929718, + "learning_rate": 0.0005970989616112913, + "loss": 3.3969, + "step": 2712 + }, + { + "epoch": 0.13, + "grad_norm": 0.6129473447799683, + "learning_rate": 0.0005970968252659468, + "loss": 3.6862, + "step": 2713 + }, + { + "epoch": 0.13, + "grad_norm": 0.659932017326355, + "learning_rate": 0.0005970946881381066, + "loss": 3.5053, + "step": 2714 + }, + { + "epoch": 0.13, + "grad_norm": 0.6454612016677856, + "learning_rate": 0.0005970925502277761, + "loss": 3.4388, + "step": 2715 + }, + { + "epoch": 0.13, + "grad_norm": 0.7548837065696716, + "learning_rate": 0.0005970904115349612, + "loss": 3.51, + "step": 2716 + }, + { + "epoch": 0.13, + "grad_norm": 0.6142184138298035, + "learning_rate": 0.000597088272059667, + "loss": 3.6681, + "step": 2717 + }, + { + "epoch": 0.13, + "grad_norm": 0.7135729193687439, + "learning_rate": 0.0005970861318018997, + "loss": 3.6722, + "step": 2718 + }, + { + "epoch": 0.13, + "grad_norm": 0.6415162682533264, + "learning_rate": 0.0005970839907616646, + "loss": 3.5688, + "step": 2719 + }, + { + "epoch": 0.13, + "grad_norm": 0.5897626280784607, + "learning_rate": 0.0005970818489389675, + "loss": 3.8595, + "step": 2720 + }, + { + "epoch": 0.13, + "grad_norm": 0.6751065850257874, + "learning_rate": 0.000597079706333814, + "loss": 3.435, + "step": 2721 + }, + { + "epoch": 0.13, + "grad_norm": 0.6195039749145508, + "learning_rate": 0.0005970775629462096, + "loss": 3.6634, + "step": 2722 + }, + { + "epoch": 0.13, + "grad_norm": 0.6431128978729248, + "learning_rate": 0.0005970754187761602, + "loss": 3.6171, + "step": 2723 + }, + { + "epoch": 0.13, + "grad_norm": 0.5993673205375671, + "learning_rate": 0.0005970732738236713, + "loss": 3.5535, + "step": 2724 + }, + { + "epoch": 0.13, + "grad_norm": 0.6270466446876526, + "learning_rate": 0.0005970711280887485, + "loss": 3.5313, + "step": 2725 + }, + { + "epoch": 0.13, + "grad_norm": 0.5823235511779785, + "learning_rate": 0.0005970689815713976, + "loss": 3.6458, + "step": 2726 + }, + { + "epoch": 0.13, + "grad_norm": 0.5822427272796631, + "learning_rate": 0.0005970668342716242, + "loss": 3.8305, + "step": 2727 + }, + { + "epoch": 0.13, + "grad_norm": 0.5477180480957031, + "learning_rate": 0.0005970646861894338, + "loss": 3.6481, + "step": 2728 + }, + { + "epoch": 0.13, + "grad_norm": 0.578444242477417, + "learning_rate": 0.0005970625373248323, + "loss": 3.5803, + "step": 2729 + }, + { + "epoch": 0.13, + "grad_norm": 0.6037389636039734, + "learning_rate": 0.0005970603876778252, + "loss": 3.5236, + "step": 2730 + }, + { + "epoch": 0.13, + "grad_norm": 0.6162851452827454, + "learning_rate": 0.0005970582372484182, + "loss": 3.6997, + "step": 2731 + }, + { + "epoch": 0.13, + "grad_norm": 0.6191104650497437, + "learning_rate": 0.0005970560860366171, + "loss": 3.6575, + "step": 2732 + }, + { + "epoch": 0.13, + "grad_norm": 0.6048828959465027, + "learning_rate": 0.0005970539340424273, + "loss": 3.3794, + "step": 2733 + }, + { + "epoch": 0.13, + "grad_norm": 0.6663912534713745, + "learning_rate": 0.0005970517812658547, + "loss": 3.3146, + "step": 2734 + }, + { + "epoch": 0.13, + "grad_norm": 0.5750559568405151, + "learning_rate": 0.0005970496277069048, + "loss": 3.5452, + "step": 2735 + }, + { + "epoch": 0.13, + "grad_norm": 0.5543981790542603, + "learning_rate": 0.0005970474733655833, + "loss": 3.8258, + "step": 2736 + }, + { + "epoch": 0.13, + "grad_norm": 0.5796617865562439, + "learning_rate": 0.0005970453182418961, + "loss": 3.7764, + "step": 2737 + }, + { + "epoch": 0.13, + "grad_norm": 0.5608425140380859, + "learning_rate": 0.0005970431623358485, + "loss": 3.4656, + "step": 2738 + }, + { + "epoch": 0.13, + "grad_norm": 0.6683704853057861, + "learning_rate": 0.0005970410056474466, + "loss": 3.7797, + "step": 2739 + }, + { + "epoch": 0.13, + "grad_norm": 0.6158978343009949, + "learning_rate": 0.0005970388481766957, + "loss": 3.7061, + "step": 2740 + }, + { + "epoch": 0.13, + "grad_norm": 0.5848861932754517, + "learning_rate": 0.0005970366899236015, + "loss": 3.5359, + "step": 2741 + }, + { + "epoch": 0.13, + "grad_norm": 0.5714642405509949, + "learning_rate": 0.0005970345308881701, + "loss": 3.4968, + "step": 2742 + }, + { + "epoch": 0.13, + "grad_norm": 0.5503971576690674, + "learning_rate": 0.0005970323710704068, + "loss": 3.5294, + "step": 2743 + }, + { + "epoch": 0.13, + "grad_norm": 0.6631685495376587, + "learning_rate": 0.0005970302104703174, + "loss": 3.4137, + "step": 2744 + }, + { + "epoch": 0.13, + "grad_norm": 0.5520222187042236, + "learning_rate": 0.0005970280490879076, + "loss": 3.5508, + "step": 2745 + }, + { + "epoch": 0.13, + "grad_norm": 0.5935842990875244, + "learning_rate": 0.0005970258869231831, + "loss": 3.6917, + "step": 2746 + }, + { + "epoch": 0.13, + "grad_norm": 0.6259281635284424, + "learning_rate": 0.0005970237239761496, + "loss": 3.5549, + "step": 2747 + }, + { + "epoch": 0.13, + "grad_norm": 0.6221252679824829, + "learning_rate": 0.0005970215602468127, + "loss": 3.5163, + "step": 2748 + }, + { + "epoch": 0.13, + "grad_norm": 0.572911262512207, + "learning_rate": 0.0005970193957351782, + "loss": 3.6548, + "step": 2749 + }, + { + "epoch": 0.13, + "grad_norm": 0.5745850801467896, + "learning_rate": 0.0005970172304412517, + "loss": 3.9044, + "step": 2750 + }, + { + "epoch": 0.13, + "grad_norm": 0.5390734672546387, + "learning_rate": 0.0005970150643650391, + "loss": 3.635, + "step": 2751 + }, + { + "epoch": 0.13, + "grad_norm": 0.6086849570274353, + "learning_rate": 0.0005970128975065459, + "loss": 3.527, + "step": 2752 + }, + { + "epoch": 0.13, + "grad_norm": 0.5953769683837891, + "learning_rate": 0.000597010729865778, + "loss": 3.5134, + "step": 2753 + }, + { + "epoch": 0.13, + "grad_norm": 0.5471598505973816, + "learning_rate": 0.0005970085614427409, + "loss": 3.4587, + "step": 2754 + }, + { + "epoch": 0.14, + "grad_norm": 0.5561854243278503, + "learning_rate": 0.0005970063922374404, + "loss": 3.6646, + "step": 2755 + }, + { + "epoch": 0.14, + "grad_norm": 0.5902842283248901, + "learning_rate": 0.0005970042222498823, + "loss": 3.4375, + "step": 2756 + }, + { + "epoch": 0.14, + "grad_norm": 0.6033221483230591, + "learning_rate": 0.0005970020514800722, + "loss": 3.4188, + "step": 2757 + }, + { + "epoch": 0.14, + "grad_norm": 0.5598551034927368, + "learning_rate": 0.0005969998799280158, + "loss": 3.5559, + "step": 2758 + }, + { + "epoch": 0.14, + "grad_norm": 0.5618605017662048, + "learning_rate": 0.0005969977075937189, + "loss": 3.778, + "step": 2759 + }, + { + "epoch": 0.14, + "grad_norm": 0.6076387763023376, + "learning_rate": 0.0005969955344771872, + "loss": 3.5922, + "step": 2760 + }, + { + "epoch": 0.14, + "grad_norm": 0.5863784551620483, + "learning_rate": 0.0005969933605784265, + "loss": 3.5362, + "step": 2761 + }, + { + "epoch": 0.14, + "grad_norm": 0.5936189293861389, + "learning_rate": 0.0005969911858974424, + "loss": 3.6277, + "step": 2762 + }, + { + "epoch": 0.14, + "grad_norm": 0.5283812880516052, + "learning_rate": 0.0005969890104342406, + "loss": 3.4249, + "step": 2763 + }, + { + "epoch": 0.14, + "grad_norm": 0.6723504066467285, + "learning_rate": 0.000596986834188827, + "loss": 3.583, + "step": 2764 + }, + { + "epoch": 0.14, + "grad_norm": 0.5366487503051758, + "learning_rate": 0.0005969846571612072, + "loss": 3.6369, + "step": 2765 + }, + { + "epoch": 0.14, + "grad_norm": 0.6037803292274475, + "learning_rate": 0.0005969824793513871, + "loss": 3.4389, + "step": 2766 + }, + { + "epoch": 0.14, + "grad_norm": 0.6620571613311768, + "learning_rate": 0.000596980300759372, + "loss": 3.6024, + "step": 2767 + }, + { + "epoch": 0.14, + "grad_norm": 0.5733856558799744, + "learning_rate": 0.0005969781213851682, + "loss": 3.4694, + "step": 2768 + }, + { + "epoch": 0.14, + "grad_norm": 0.5712093710899353, + "learning_rate": 0.000596975941228781, + "loss": 3.638, + "step": 2769 + }, + { + "epoch": 0.14, + "grad_norm": 0.6224941611289978, + "learning_rate": 0.0005969737602902165, + "loss": 3.4212, + "step": 2770 + }, + { + "epoch": 0.14, + "grad_norm": 0.6067854166030884, + "learning_rate": 0.0005969715785694802, + "loss": 3.6626, + "step": 2771 + }, + { + "epoch": 0.14, + "grad_norm": 0.5889960527420044, + "learning_rate": 0.0005969693960665779, + "loss": 3.641, + "step": 2772 + }, + { + "epoch": 0.14, + "grad_norm": 0.6073998212814331, + "learning_rate": 0.0005969672127815153, + "loss": 3.3504, + "step": 2773 + }, + { + "epoch": 0.14, + "grad_norm": 0.5916741490364075, + "learning_rate": 0.0005969650287142983, + "loss": 3.6483, + "step": 2774 + }, + { + "epoch": 0.14, + "grad_norm": 0.5662715435028076, + "learning_rate": 0.0005969628438649326, + "loss": 3.3069, + "step": 2775 + }, + { + "epoch": 0.14, + "grad_norm": 0.5828217267990112, + "learning_rate": 0.0005969606582334238, + "loss": 3.6519, + "step": 2776 + }, + { + "epoch": 0.14, + "grad_norm": 0.5853772759437561, + "learning_rate": 0.0005969584718197779, + "loss": 3.5239, + "step": 2777 + }, + { + "epoch": 0.14, + "grad_norm": 0.5956276655197144, + "learning_rate": 0.0005969562846240005, + "loss": 3.6071, + "step": 2778 + }, + { + "epoch": 0.14, + "grad_norm": 0.5762497782707214, + "learning_rate": 0.0005969540966460974, + "loss": 3.738, + "step": 2779 + }, + { + "epoch": 0.14, + "grad_norm": 0.5765044093132019, + "learning_rate": 0.0005969519078860742, + "loss": 3.575, + "step": 2780 + }, + { + "epoch": 0.14, + "grad_norm": 0.7366907596588135, + "learning_rate": 0.0005969497183439371, + "loss": 3.4483, + "step": 2781 + }, + { + "epoch": 0.14, + "grad_norm": 0.5674998164176941, + "learning_rate": 0.0005969475280196915, + "loss": 3.6753, + "step": 2782 + }, + { + "epoch": 0.14, + "grad_norm": 0.5822417140007019, + "learning_rate": 0.0005969453369133431, + "loss": 3.5733, + "step": 2783 + }, + { + "epoch": 0.14, + "grad_norm": 0.5587081909179688, + "learning_rate": 0.000596943145024898, + "loss": 3.5532, + "step": 2784 + }, + { + "epoch": 0.14, + "grad_norm": 0.5745068192481995, + "learning_rate": 0.0005969409523543618, + "loss": 3.5765, + "step": 2785 + }, + { + "epoch": 0.14, + "grad_norm": 0.5741488933563232, + "learning_rate": 0.0005969387589017402, + "loss": 3.6993, + "step": 2786 + }, + { + "epoch": 0.14, + "grad_norm": 0.5863240957260132, + "learning_rate": 0.0005969365646670391, + "loss": 3.6545, + "step": 2787 + }, + { + "epoch": 0.14, + "grad_norm": 0.6137883067131042, + "learning_rate": 0.0005969343696502642, + "loss": 3.6242, + "step": 2788 + }, + { + "epoch": 0.14, + "grad_norm": 0.593112587928772, + "learning_rate": 0.0005969321738514213, + "loss": 3.5533, + "step": 2789 + }, + { + "epoch": 0.14, + "grad_norm": 0.5809875130653381, + "learning_rate": 0.0005969299772705163, + "loss": 3.8685, + "step": 2790 + }, + { + "epoch": 0.14, + "grad_norm": 0.5593841671943665, + "learning_rate": 0.0005969277799075548, + "loss": 3.6496, + "step": 2791 + }, + { + "epoch": 0.14, + "grad_norm": 0.5753694176673889, + "learning_rate": 0.0005969255817625428, + "loss": 3.4793, + "step": 2792 + }, + { + "epoch": 0.14, + "grad_norm": 0.5818933248519897, + "learning_rate": 0.0005969233828354857, + "loss": 3.7245, + "step": 2793 + }, + { + "epoch": 0.14, + "grad_norm": 0.5473413467407227, + "learning_rate": 0.0005969211831263899, + "loss": 3.3412, + "step": 2794 + }, + { + "epoch": 0.14, + "grad_norm": 0.5579399466514587, + "learning_rate": 0.0005969189826352605, + "loss": 3.6375, + "step": 2795 + }, + { + "epoch": 0.14, + "grad_norm": 0.6152604222297668, + "learning_rate": 0.0005969167813621038, + "loss": 3.5726, + "step": 2796 + }, + { + "epoch": 0.14, + "grad_norm": 0.6010841727256775, + "learning_rate": 0.0005969145793069255, + "loss": 3.6222, + "step": 2797 + }, + { + "epoch": 0.14, + "grad_norm": 0.5537547469139099, + "learning_rate": 0.0005969123764697313, + "loss": 3.7334, + "step": 2798 + }, + { + "epoch": 0.14, + "grad_norm": 0.5809540748596191, + "learning_rate": 0.0005969101728505269, + "loss": 3.7945, + "step": 2799 + }, + { + "epoch": 0.14, + "grad_norm": 0.5480958223342896, + "learning_rate": 0.0005969079684493184, + "loss": 3.6835, + "step": 2800 + }, + { + "epoch": 0.14, + "grad_norm": 0.5858062505722046, + "learning_rate": 0.0005969057632661115, + "loss": 3.5088, + "step": 2801 + }, + { + "epoch": 0.14, + "grad_norm": 0.6234527230262756, + "learning_rate": 0.0005969035573009119, + "loss": 3.5747, + "step": 2802 + }, + { + "epoch": 0.14, + "grad_norm": 0.5944379568099976, + "learning_rate": 0.0005969013505537254, + "loss": 3.6146, + "step": 2803 + }, + { + "epoch": 0.14, + "grad_norm": 0.544714093208313, + "learning_rate": 0.0005968991430245579, + "loss": 3.5981, + "step": 2804 + }, + { + "epoch": 0.14, + "grad_norm": 0.5830975770950317, + "learning_rate": 0.0005968969347134154, + "loss": 3.5539, + "step": 2805 + }, + { + "epoch": 0.14, + "grad_norm": 0.5846774578094482, + "learning_rate": 0.0005968947256203033, + "loss": 3.4609, + "step": 2806 + }, + { + "epoch": 0.14, + "grad_norm": 0.5505405068397522, + "learning_rate": 0.0005968925157452276, + "loss": 3.5572, + "step": 2807 + }, + { + "epoch": 0.14, + "grad_norm": 0.584732174873352, + "learning_rate": 0.0005968903050881943, + "loss": 3.5472, + "step": 2808 + }, + { + "epoch": 0.14, + "grad_norm": 0.6293777227401733, + "learning_rate": 0.0005968880936492089, + "loss": 3.5822, + "step": 2809 + }, + { + "epoch": 0.14, + "grad_norm": 0.6841658353805542, + "learning_rate": 0.0005968858814282776, + "loss": 3.4138, + "step": 2810 + }, + { + "epoch": 0.14, + "grad_norm": 0.6176837086677551, + "learning_rate": 0.0005968836684254058, + "loss": 3.4622, + "step": 2811 + }, + { + "epoch": 0.14, + "grad_norm": 0.5423951148986816, + "learning_rate": 0.0005968814546405998, + "loss": 3.7367, + "step": 2812 + }, + { + "epoch": 0.14, + "grad_norm": 0.5896667242050171, + "learning_rate": 0.000596879240073865, + "loss": 3.8234, + "step": 2813 + }, + { + "epoch": 0.14, + "grad_norm": 0.6115859150886536, + "learning_rate": 0.0005968770247252074, + "loss": 3.5844, + "step": 2814 + }, + { + "epoch": 0.14, + "grad_norm": 0.5899752378463745, + "learning_rate": 0.000596874808594633, + "loss": 3.6039, + "step": 2815 + }, + { + "epoch": 0.14, + "grad_norm": 0.5383777022361755, + "learning_rate": 0.0005968725916821474, + "loss": 3.5118, + "step": 2816 + }, + { + "epoch": 0.14, + "grad_norm": 0.5764883756637573, + "learning_rate": 0.0005968703739877566, + "loss": 3.7468, + "step": 2817 + }, + { + "epoch": 0.14, + "grad_norm": 0.927655816078186, + "learning_rate": 0.0005968681555114663, + "loss": 3.5418, + "step": 2818 + }, + { + "epoch": 0.14, + "grad_norm": 0.5765186548233032, + "learning_rate": 0.0005968659362532824, + "loss": 3.6529, + "step": 2819 + }, + { + "epoch": 0.14, + "grad_norm": 0.5878188610076904, + "learning_rate": 0.0005968637162132108, + "loss": 3.6059, + "step": 2820 + }, + { + "epoch": 0.14, + "grad_norm": 0.637100875377655, + "learning_rate": 0.0005968614953912574, + "loss": 3.6037, + "step": 2821 + }, + { + "epoch": 0.14, + "grad_norm": 0.6326918601989746, + "learning_rate": 0.0005968592737874278, + "loss": 3.695, + "step": 2822 + }, + { + "epoch": 0.14, + "grad_norm": 0.5842844843864441, + "learning_rate": 0.000596857051401728, + "loss": 3.7953, + "step": 2823 + }, + { + "epoch": 0.14, + "grad_norm": 0.6204371452331543, + "learning_rate": 0.0005968548282341639, + "loss": 3.5894, + "step": 2824 + }, + { + "epoch": 0.14, + "grad_norm": 0.556685209274292, + "learning_rate": 0.0005968526042847413, + "loss": 3.606, + "step": 2825 + }, + { + "epoch": 0.14, + "grad_norm": 0.5710826516151428, + "learning_rate": 0.0005968503795534661, + "loss": 3.5026, + "step": 2826 + }, + { + "epoch": 0.14, + "grad_norm": 0.569648802280426, + "learning_rate": 0.0005968481540403441, + "loss": 3.6895, + "step": 2827 + }, + { + "epoch": 0.14, + "grad_norm": 0.585290253162384, + "learning_rate": 0.0005968459277453813, + "loss": 3.5407, + "step": 2828 + }, + { + "epoch": 0.14, + "grad_norm": 0.5872741937637329, + "learning_rate": 0.0005968437006685834, + "loss": 3.7963, + "step": 2829 + }, + { + "epoch": 0.14, + "grad_norm": 0.5374106168746948, + "learning_rate": 0.0005968414728099563, + "loss": 3.6502, + "step": 2830 + }, + { + "epoch": 0.14, + "grad_norm": 0.5915255546569824, + "learning_rate": 0.0005968392441695058, + "loss": 3.4514, + "step": 2831 + }, + { + "epoch": 0.14, + "grad_norm": 0.6637536287307739, + "learning_rate": 0.000596837014747238, + "loss": 3.5065, + "step": 2832 + }, + { + "epoch": 0.14, + "grad_norm": 0.5768295526504517, + "learning_rate": 0.0005968347845431586, + "loss": 3.5267, + "step": 2833 + }, + { + "epoch": 0.14, + "grad_norm": 0.595833957195282, + "learning_rate": 0.0005968325535572734, + "loss": 3.5768, + "step": 2834 + }, + { + "epoch": 0.14, + "grad_norm": 0.5429056286811829, + "learning_rate": 0.0005968303217895885, + "loss": 3.569, + "step": 2835 + }, + { + "epoch": 0.14, + "grad_norm": 0.6460859775543213, + "learning_rate": 0.0005968280892401095, + "loss": 3.503, + "step": 2836 + }, + { + "epoch": 0.14, + "grad_norm": 0.6241177916526794, + "learning_rate": 0.0005968258559088427, + "loss": 3.6161, + "step": 2837 + }, + { + "epoch": 0.14, + "grad_norm": 0.5874194502830505, + "learning_rate": 0.0005968236217957935, + "loss": 3.6232, + "step": 2838 + }, + { + "epoch": 0.14, + "grad_norm": 0.5746603608131409, + "learning_rate": 0.0005968213869009681, + "loss": 3.5134, + "step": 2839 + }, + { + "epoch": 0.14, + "grad_norm": 0.5978888273239136, + "learning_rate": 0.0005968191512243723, + "loss": 3.8172, + "step": 2840 + }, + { + "epoch": 0.14, + "grad_norm": 0.6809991002082825, + "learning_rate": 0.0005968169147660119, + "loss": 3.5159, + "step": 2841 + }, + { + "epoch": 0.14, + "grad_norm": 0.6131600141525269, + "learning_rate": 0.0005968146775258929, + "loss": 3.5549, + "step": 2842 + }, + { + "epoch": 0.14, + "grad_norm": 0.5964472889900208, + "learning_rate": 0.0005968124395040212, + "loss": 3.4967, + "step": 2843 + }, + { + "epoch": 0.14, + "grad_norm": 0.599494457244873, + "learning_rate": 0.0005968102007004026, + "loss": 3.5785, + "step": 2844 + }, + { + "epoch": 0.14, + "grad_norm": 0.5741751790046692, + "learning_rate": 0.0005968079611150431, + "loss": 3.7153, + "step": 2845 + }, + { + "epoch": 0.14, + "grad_norm": 0.5869643688201904, + "learning_rate": 0.0005968057207479486, + "loss": 3.5068, + "step": 2846 + }, + { + "epoch": 0.14, + "grad_norm": 0.5856844186782837, + "learning_rate": 0.0005968034795991249, + "loss": 3.8081, + "step": 2847 + }, + { + "epoch": 0.14, + "grad_norm": 0.6089456677436829, + "learning_rate": 0.000596801237668578, + "loss": 3.6488, + "step": 2848 + }, + { + "epoch": 0.14, + "grad_norm": 0.5712987780570984, + "learning_rate": 0.0005967989949563136, + "loss": 3.4939, + "step": 2849 + }, + { + "epoch": 0.14, + "grad_norm": 0.560558021068573, + "learning_rate": 0.000596796751462338, + "loss": 3.4375, + "step": 2850 + }, + { + "epoch": 0.14, + "grad_norm": 0.6255397200584412, + "learning_rate": 0.0005967945071866567, + "loss": 3.4126, + "step": 2851 + }, + { + "epoch": 0.14, + "grad_norm": 0.5810412764549255, + "learning_rate": 0.0005967922621292758, + "loss": 3.5257, + "step": 2852 + }, + { + "epoch": 0.14, + "grad_norm": 0.6136355400085449, + "learning_rate": 0.0005967900162902013, + "loss": 3.5239, + "step": 2853 + }, + { + "epoch": 0.14, + "grad_norm": 0.6601942777633667, + "learning_rate": 0.000596787769669439, + "loss": 3.6628, + "step": 2854 + }, + { + "epoch": 0.14, + "grad_norm": 0.5993295311927795, + "learning_rate": 0.0005967855222669947, + "loss": 3.4613, + "step": 2855 + }, + { + "epoch": 0.14, + "grad_norm": 0.6075938940048218, + "learning_rate": 0.0005967832740828746, + "loss": 3.4571, + "step": 2856 + }, + { + "epoch": 0.14, + "grad_norm": 0.7354562878608704, + "learning_rate": 0.0005967810251170844, + "loss": 3.6329, + "step": 2857 + }, + { + "epoch": 0.14, + "grad_norm": 0.5470739603042603, + "learning_rate": 0.0005967787753696301, + "loss": 3.309, + "step": 2858 + }, + { + "epoch": 0.14, + "grad_norm": 0.7858545780181885, + "learning_rate": 0.0005967765248405177, + "loss": 3.6904, + "step": 2859 + }, + { + "epoch": 0.14, + "grad_norm": 0.6135296821594238, + "learning_rate": 0.0005967742735297529, + "loss": 3.6427, + "step": 2860 + }, + { + "epoch": 0.14, + "grad_norm": 0.5984460711479187, + "learning_rate": 0.0005967720214373419, + "loss": 3.8465, + "step": 2861 + }, + { + "epoch": 0.14, + "grad_norm": 0.5901587009429932, + "learning_rate": 0.0005967697685632904, + "loss": 3.6866, + "step": 2862 + }, + { + "epoch": 0.14, + "grad_norm": 0.5759127140045166, + "learning_rate": 0.0005967675149076046, + "loss": 3.6211, + "step": 2863 + }, + { + "epoch": 0.14, + "grad_norm": 0.6124855279922485, + "learning_rate": 0.0005967652604702902, + "loss": 3.5057, + "step": 2864 + }, + { + "epoch": 0.14, + "grad_norm": 0.5732986927032471, + "learning_rate": 0.0005967630052513532, + "loss": 3.6978, + "step": 2865 + }, + { + "epoch": 0.14, + "grad_norm": 0.5618267059326172, + "learning_rate": 0.0005967607492507995, + "loss": 3.7548, + "step": 2866 + }, + { + "epoch": 0.14, + "grad_norm": 0.6002089381217957, + "learning_rate": 0.0005967584924686353, + "loss": 3.7316, + "step": 2867 + }, + { + "epoch": 0.14, + "grad_norm": 0.5847598314285278, + "learning_rate": 0.0005967562349048662, + "loss": 3.5734, + "step": 2868 + }, + { + "epoch": 0.14, + "grad_norm": 0.5832062363624573, + "learning_rate": 0.0005967539765594983, + "loss": 3.7337, + "step": 2869 + }, + { + "epoch": 0.14, + "grad_norm": 0.5734034776687622, + "learning_rate": 0.0005967517174325375, + "loss": 3.6052, + "step": 2870 + }, + { + "epoch": 0.14, + "grad_norm": 0.5718584656715393, + "learning_rate": 0.0005967494575239899, + "loss": 3.6527, + "step": 2871 + }, + { + "epoch": 0.14, + "grad_norm": 0.6056542992591858, + "learning_rate": 0.0005967471968338612, + "loss": 3.7136, + "step": 2872 + }, + { + "epoch": 0.14, + "grad_norm": 0.6068187952041626, + "learning_rate": 0.0005967449353621575, + "loss": 3.7391, + "step": 2873 + }, + { + "epoch": 0.14, + "grad_norm": 0.5216867327690125, + "learning_rate": 0.0005967426731088849, + "loss": 3.6024, + "step": 2874 + }, + { + "epoch": 0.14, + "grad_norm": 0.6825017929077148, + "learning_rate": 0.000596740410074049, + "loss": 3.6602, + "step": 2875 + }, + { + "epoch": 0.14, + "grad_norm": 0.5960718393325806, + "learning_rate": 0.0005967381462576561, + "loss": 3.6202, + "step": 2876 + }, + { + "epoch": 0.14, + "grad_norm": 0.594954252243042, + "learning_rate": 0.0005967358816597121, + "loss": 3.5797, + "step": 2877 + }, + { + "epoch": 0.14, + "grad_norm": 0.5977593660354614, + "learning_rate": 0.0005967336162802228, + "loss": 3.6304, + "step": 2878 + }, + { + "epoch": 0.14, + "grad_norm": 0.5887445211410522, + "learning_rate": 0.0005967313501191942, + "loss": 3.5324, + "step": 2879 + }, + { + "epoch": 0.14, + "grad_norm": 0.5791778564453125, + "learning_rate": 0.0005967290831766325, + "loss": 3.7551, + "step": 2880 + }, + { + "epoch": 0.14, + "grad_norm": 0.6645099520683289, + "learning_rate": 0.0005967268154525434, + "loss": 3.468, + "step": 2881 + }, + { + "epoch": 0.14, + "grad_norm": 0.5834534168243408, + "learning_rate": 0.000596724546946933, + "loss": 3.7626, + "step": 2882 + }, + { + "epoch": 0.14, + "grad_norm": 0.600719153881073, + "learning_rate": 0.0005967222776598072, + "loss": 3.6733, + "step": 2883 + }, + { + "epoch": 0.14, + "grad_norm": 0.6395867466926575, + "learning_rate": 0.0005967200075911722, + "loss": 3.5922, + "step": 2884 + }, + { + "epoch": 0.14, + "grad_norm": 0.5920771360397339, + "learning_rate": 0.0005967177367410335, + "loss": 3.7339, + "step": 2885 + }, + { + "epoch": 0.14, + "grad_norm": 0.5998722910881042, + "learning_rate": 0.0005967154651093977, + "loss": 3.6804, + "step": 2886 + }, + { + "epoch": 0.14, + "grad_norm": 0.6235655546188354, + "learning_rate": 0.0005967131926962703, + "loss": 3.6184, + "step": 2887 + }, + { + "epoch": 0.14, + "grad_norm": 0.5598371624946594, + "learning_rate": 0.0005967109195016575, + "loss": 3.4734, + "step": 2888 + }, + { + "epoch": 0.14, + "grad_norm": 0.5574737787246704, + "learning_rate": 0.0005967086455255651, + "loss": 3.7455, + "step": 2889 + }, + { + "epoch": 0.14, + "grad_norm": 0.60200434923172, + "learning_rate": 0.0005967063707679994, + "loss": 3.5133, + "step": 2890 + }, + { + "epoch": 0.14, + "grad_norm": 0.5801019668579102, + "learning_rate": 0.0005967040952289661, + "loss": 3.6737, + "step": 2891 + }, + { + "epoch": 0.14, + "grad_norm": 0.567669153213501, + "learning_rate": 0.0005967018189084715, + "loss": 3.7216, + "step": 2892 + }, + { + "epoch": 0.14, + "grad_norm": 0.7055478096008301, + "learning_rate": 0.0005966995418065212, + "loss": 3.4534, + "step": 2893 + }, + { + "epoch": 0.14, + "grad_norm": 0.562245786190033, + "learning_rate": 0.0005966972639231215, + "loss": 3.7154, + "step": 2894 + }, + { + "epoch": 0.14, + "grad_norm": 0.5916703343391418, + "learning_rate": 0.0005966949852582783, + "loss": 3.5089, + "step": 2895 + }, + { + "epoch": 0.14, + "grad_norm": 0.5672451853752136, + "learning_rate": 0.0005966927058119976, + "loss": 3.7155, + "step": 2896 + }, + { + "epoch": 0.14, + "grad_norm": 0.5992611646652222, + "learning_rate": 0.0005966904255842856, + "loss": 3.7891, + "step": 2897 + }, + { + "epoch": 0.14, + "grad_norm": 0.5881920456886292, + "learning_rate": 0.0005966881445751478, + "loss": 3.6009, + "step": 2898 + }, + { + "epoch": 0.14, + "grad_norm": 0.5571060180664062, + "learning_rate": 0.0005966858627845907, + "loss": 3.7047, + "step": 2899 + }, + { + "epoch": 0.14, + "grad_norm": 0.6121320128440857, + "learning_rate": 0.0005966835802126201, + "loss": 3.518, + "step": 2900 + }, + { + "epoch": 0.14, + "grad_norm": 0.6062599420547485, + "learning_rate": 0.000596681296859242, + "loss": 3.3848, + "step": 2901 + }, + { + "epoch": 0.14, + "grad_norm": 0.6015748381614685, + "learning_rate": 0.0005966790127244626, + "loss": 3.5153, + "step": 2902 + }, + { + "epoch": 0.14, + "grad_norm": 0.5726820230484009, + "learning_rate": 0.0005966767278082877, + "loss": 3.7427, + "step": 2903 + }, + { + "epoch": 0.14, + "grad_norm": 0.5963112115859985, + "learning_rate": 0.0005966744421107234, + "loss": 3.701, + "step": 2904 + }, + { + "epoch": 0.14, + "grad_norm": 0.5752584934234619, + "learning_rate": 0.0005966721556317757, + "loss": 3.7769, + "step": 2905 + }, + { + "epoch": 0.14, + "grad_norm": 0.56533282995224, + "learning_rate": 0.0005966698683714506, + "loss": 3.4874, + "step": 2906 + }, + { + "epoch": 0.14, + "grad_norm": 0.5514774918556213, + "learning_rate": 0.0005966675803297542, + "loss": 3.4779, + "step": 2907 + }, + { + "epoch": 0.14, + "grad_norm": 0.6327130794525146, + "learning_rate": 0.0005966652915066925, + "loss": 3.6413, + "step": 2908 + }, + { + "epoch": 0.14, + "grad_norm": 0.562054455280304, + "learning_rate": 0.0005966630019022715, + "loss": 3.7253, + "step": 2909 + }, + { + "epoch": 0.14, + "grad_norm": 0.5362357497215271, + "learning_rate": 0.0005966607115164974, + "loss": 3.7303, + "step": 2910 + }, + { + "epoch": 0.14, + "grad_norm": 0.5918092131614685, + "learning_rate": 0.0005966584203493759, + "loss": 3.5367, + "step": 2911 + }, + { + "epoch": 0.14, + "grad_norm": 0.6234297752380371, + "learning_rate": 0.0005966561284009132, + "loss": 3.6713, + "step": 2912 + }, + { + "epoch": 0.14, + "grad_norm": 0.6105513572692871, + "learning_rate": 0.0005966538356711154, + "loss": 3.6219, + "step": 2913 + }, + { + "epoch": 0.14, + "grad_norm": 0.527632474899292, + "learning_rate": 0.0005966515421599885, + "loss": 3.4177, + "step": 2914 + }, + { + "epoch": 0.14, + "grad_norm": 0.6240413188934326, + "learning_rate": 0.0005966492478675384, + "loss": 3.6584, + "step": 2915 + }, + { + "epoch": 0.14, + "grad_norm": 0.584805965423584, + "learning_rate": 0.0005966469527937716, + "loss": 3.5188, + "step": 2916 + }, + { + "epoch": 0.14, + "grad_norm": 0.581307053565979, + "learning_rate": 0.0005966446569386936, + "loss": 3.5399, + "step": 2917 + }, + { + "epoch": 0.14, + "grad_norm": 0.6673707962036133, + "learning_rate": 0.0005966423603023105, + "loss": 3.5465, + "step": 2918 + }, + { + "epoch": 0.14, + "grad_norm": 0.5942588448524475, + "learning_rate": 0.0005966400628846288, + "loss": 3.4065, + "step": 2919 + }, + { + "epoch": 0.14, + "grad_norm": 0.5521858930587769, + "learning_rate": 0.0005966377646856541, + "loss": 3.5278, + "step": 2920 + }, + { + "epoch": 0.14, + "grad_norm": 0.5489894151687622, + "learning_rate": 0.0005966354657053926, + "loss": 3.805, + "step": 2921 + }, + { + "epoch": 0.14, + "grad_norm": 1.0046864748001099, + "learning_rate": 0.0005966331659438505, + "loss": 3.5583, + "step": 2922 + }, + { + "epoch": 0.14, + "grad_norm": 0.5688363909721375, + "learning_rate": 0.0005966308654010337, + "loss": 3.5806, + "step": 2923 + }, + { + "epoch": 0.14, + "grad_norm": 0.5838121771812439, + "learning_rate": 0.0005966285640769482, + "loss": 3.404, + "step": 2924 + }, + { + "epoch": 0.14, + "grad_norm": 0.6157076954841614, + "learning_rate": 0.0005966262619716003, + "loss": 3.6069, + "step": 2925 + }, + { + "epoch": 0.14, + "grad_norm": 0.5905383229255676, + "learning_rate": 0.0005966239590849958, + "loss": 3.6116, + "step": 2926 + }, + { + "epoch": 0.14, + "grad_norm": 0.5673978328704834, + "learning_rate": 0.0005966216554171409, + "loss": 3.3259, + "step": 2927 + }, + { + "epoch": 0.14, + "grad_norm": 0.6059072613716125, + "learning_rate": 0.0005966193509680416, + "loss": 3.8699, + "step": 2928 + }, + { + "epoch": 0.14, + "grad_norm": 0.5982714295387268, + "learning_rate": 0.000596617045737704, + "loss": 3.6271, + "step": 2929 + }, + { + "epoch": 0.14, + "grad_norm": 0.5978359580039978, + "learning_rate": 0.0005966147397261344, + "loss": 3.55, + "step": 2930 + }, + { + "epoch": 0.14, + "grad_norm": 0.5846501588821411, + "learning_rate": 0.0005966124329333384, + "loss": 3.72, + "step": 2931 + }, + { + "epoch": 0.14, + "grad_norm": 0.5485678911209106, + "learning_rate": 0.0005966101253593226, + "loss": 3.6435, + "step": 2932 + }, + { + "epoch": 0.14, + "grad_norm": 0.6040960550308228, + "learning_rate": 0.0005966078170040925, + "loss": 3.4047, + "step": 2933 + }, + { + "epoch": 0.14, + "grad_norm": 0.629386842250824, + "learning_rate": 0.0005966055078676547, + "loss": 3.6221, + "step": 2934 + }, + { + "epoch": 0.14, + "grad_norm": 0.5997804999351501, + "learning_rate": 0.000596603197950015, + "loss": 4.0703, + "step": 2935 + }, + { + "epoch": 0.14, + "grad_norm": 0.6014472842216492, + "learning_rate": 0.0005966008872511796, + "loss": 3.6938, + "step": 2936 + }, + { + "epoch": 0.14, + "grad_norm": 0.5625265836715698, + "learning_rate": 0.0005965985757711545, + "loss": 3.6621, + "step": 2937 + }, + { + "epoch": 0.14, + "grad_norm": 0.5449039936065674, + "learning_rate": 0.0005965962635099459, + "loss": 3.648, + "step": 2938 + }, + { + "epoch": 0.14, + "grad_norm": 0.5726627707481384, + "learning_rate": 0.0005965939504675598, + "loss": 3.6625, + "step": 2939 + }, + { + "epoch": 0.14, + "grad_norm": 0.5476054549217224, + "learning_rate": 0.0005965916366440023, + "loss": 3.7134, + "step": 2940 + }, + { + "epoch": 0.14, + "grad_norm": 0.6098428964614868, + "learning_rate": 0.0005965893220392795, + "loss": 3.9104, + "step": 2941 + }, + { + "epoch": 0.14, + "grad_norm": 0.6405155062675476, + "learning_rate": 0.0005965870066533975, + "loss": 3.6453, + "step": 2942 + }, + { + "epoch": 0.14, + "grad_norm": 0.6072257161140442, + "learning_rate": 0.0005965846904863625, + "loss": 3.657, + "step": 2943 + }, + { + "epoch": 0.14, + "grad_norm": 0.5281662940979004, + "learning_rate": 0.0005965823735381803, + "loss": 3.8624, + "step": 2944 + }, + { + "epoch": 0.14, + "grad_norm": 0.5664449334144592, + "learning_rate": 0.0005965800558088573, + "loss": 3.7622, + "step": 2945 + }, + { + "epoch": 0.14, + "grad_norm": 0.5815826654434204, + "learning_rate": 0.0005965777372983996, + "loss": 3.7028, + "step": 2946 + }, + { + "epoch": 0.14, + "grad_norm": 0.5717793107032776, + "learning_rate": 0.0005965754180068132, + "loss": 3.5994, + "step": 2947 + }, + { + "epoch": 0.14, + "grad_norm": 0.56984543800354, + "learning_rate": 0.0005965730979341042, + "loss": 3.4422, + "step": 2948 + }, + { + "epoch": 0.14, + "grad_norm": 0.5990952849388123, + "learning_rate": 0.0005965707770802786, + "loss": 3.5145, + "step": 2949 + }, + { + "epoch": 0.14, + "grad_norm": 0.5824059247970581, + "learning_rate": 0.0005965684554453428, + "loss": 3.5848, + "step": 2950 + }, + { + "epoch": 0.14, + "grad_norm": 0.5779900550842285, + "learning_rate": 0.0005965661330293027, + "loss": 3.6382, + "step": 2951 + }, + { + "epoch": 0.14, + "grad_norm": 0.5480761528015137, + "learning_rate": 0.0005965638098321646, + "loss": 3.6473, + "step": 2952 + }, + { + "epoch": 0.14, + "grad_norm": 0.5885190367698669, + "learning_rate": 0.0005965614858539344, + "loss": 3.5578, + "step": 2953 + }, + { + "epoch": 0.14, + "grad_norm": 0.5397934913635254, + "learning_rate": 0.0005965591610946184, + "loss": 3.547, + "step": 2954 + }, + { + "epoch": 0.14, + "grad_norm": 0.5664386749267578, + "learning_rate": 0.0005965568355542225, + "loss": 3.7429, + "step": 2955 + }, + { + "epoch": 0.14, + "grad_norm": 0.5953521728515625, + "learning_rate": 0.0005965545092327531, + "loss": 3.6301, + "step": 2956 + }, + { + "epoch": 0.14, + "grad_norm": 0.5471416711807251, + "learning_rate": 0.0005965521821302162, + "loss": 3.6399, + "step": 2957 + }, + { + "epoch": 0.14, + "grad_norm": 0.5689468383789062, + "learning_rate": 0.0005965498542466178, + "loss": 3.6759, + "step": 2958 + }, + { + "epoch": 0.15, + "grad_norm": 0.5762372612953186, + "learning_rate": 0.0005965475255819642, + "loss": 3.6877, + "step": 2959 + }, + { + "epoch": 0.15, + "grad_norm": 0.5529139637947083, + "learning_rate": 0.0005965451961362615, + "loss": 3.4141, + "step": 2960 + }, + { + "epoch": 0.15, + "grad_norm": 0.5945937633514404, + "learning_rate": 0.0005965428659095158, + "loss": 3.5489, + "step": 2961 + }, + { + "epoch": 0.15, + "grad_norm": 0.548331081867218, + "learning_rate": 0.0005965405349017334, + "loss": 3.6366, + "step": 2962 + }, + { + "epoch": 0.15, + "grad_norm": 0.6198789477348328, + "learning_rate": 0.0005965382031129201, + "loss": 3.5774, + "step": 2963 + }, + { + "epoch": 0.15, + "grad_norm": 0.6308538913726807, + "learning_rate": 0.0005965358705430824, + "loss": 3.5777, + "step": 2964 + }, + { + "epoch": 0.15, + "grad_norm": 0.5835420489311218, + "learning_rate": 0.0005965335371922261, + "loss": 3.4532, + "step": 2965 + }, + { + "epoch": 0.15, + "grad_norm": 0.56312495470047, + "learning_rate": 0.0005965312030603578, + "loss": 3.6189, + "step": 2966 + }, + { + "epoch": 0.15, + "grad_norm": 0.6242998838424683, + "learning_rate": 0.0005965288681474832, + "loss": 3.4261, + "step": 2967 + }, + { + "epoch": 0.15, + "grad_norm": 0.5804765224456787, + "learning_rate": 0.0005965265324536087, + "loss": 3.6305, + "step": 2968 + }, + { + "epoch": 0.15, + "grad_norm": 0.5844853520393372, + "learning_rate": 0.0005965241959787404, + "loss": 3.4267, + "step": 2969 + }, + { + "epoch": 0.15, + "grad_norm": 0.5954545140266418, + "learning_rate": 0.0005965218587228844, + "loss": 3.615, + "step": 2970 + }, + { + "epoch": 0.15, + "grad_norm": 0.5783625245094299, + "learning_rate": 0.0005965195206860468, + "loss": 3.7189, + "step": 2971 + }, + { + "epoch": 0.15, + "grad_norm": 0.5781705975532532, + "learning_rate": 0.0005965171818682339, + "loss": 3.6033, + "step": 2972 + }, + { + "epoch": 0.15, + "grad_norm": 0.5989189147949219, + "learning_rate": 0.0005965148422694518, + "loss": 3.5753, + "step": 2973 + }, + { + "epoch": 0.15, + "grad_norm": 0.6061220169067383, + "learning_rate": 0.0005965125018897068, + "loss": 3.6133, + "step": 2974 + }, + { + "epoch": 0.15, + "grad_norm": 0.6106086373329163, + "learning_rate": 0.0005965101607290047, + "loss": 3.5082, + "step": 2975 + }, + { + "epoch": 0.15, + "grad_norm": 0.540738046169281, + "learning_rate": 0.0005965078187873522, + "loss": 3.3238, + "step": 2976 + }, + { + "epoch": 0.15, + "grad_norm": 0.5824552774429321, + "learning_rate": 0.0005965054760647549, + "loss": 3.611, + "step": 2977 + }, + { + "epoch": 0.15, + "grad_norm": 0.5599578619003296, + "learning_rate": 0.0005965031325612195, + "loss": 3.5603, + "step": 2978 + }, + { + "epoch": 0.15, + "grad_norm": 0.5765003561973572, + "learning_rate": 0.0005965007882767516, + "loss": 3.4808, + "step": 2979 + }, + { + "epoch": 0.15, + "grad_norm": 0.5761189460754395, + "learning_rate": 0.0005964984432113579, + "loss": 3.5618, + "step": 2980 + }, + { + "epoch": 0.15, + "grad_norm": 0.5370120406150818, + "learning_rate": 0.0005964960973650444, + "loss": 3.479, + "step": 2981 + }, + { + "epoch": 0.15, + "grad_norm": 0.5539388060569763, + "learning_rate": 0.0005964937507378171, + "loss": 3.5543, + "step": 2982 + }, + { + "epoch": 0.15, + "grad_norm": 0.5843010544776917, + "learning_rate": 0.0005964914033296824, + "loss": 3.4738, + "step": 2983 + }, + { + "epoch": 0.15, + "grad_norm": 0.5479416251182556, + "learning_rate": 0.0005964890551406463, + "loss": 3.6649, + "step": 2984 + }, + { + "epoch": 0.15, + "grad_norm": 0.590008020401001, + "learning_rate": 0.0005964867061707152, + "loss": 3.6951, + "step": 2985 + }, + { + "epoch": 0.15, + "grad_norm": 0.6158657073974609, + "learning_rate": 0.0005964843564198951, + "loss": 3.5394, + "step": 2986 + }, + { + "epoch": 0.15, + "grad_norm": 0.5672131180763245, + "learning_rate": 0.0005964820058881925, + "loss": 3.5563, + "step": 2987 + }, + { + "epoch": 0.15, + "grad_norm": 0.5937507748603821, + "learning_rate": 0.000596479654575613, + "loss": 3.7613, + "step": 2988 + }, + { + "epoch": 0.15, + "grad_norm": 0.6074742078781128, + "learning_rate": 0.0005964773024821633, + "loss": 3.6806, + "step": 2989 + }, + { + "epoch": 0.15, + "grad_norm": 0.5991113185882568, + "learning_rate": 0.0005964749496078495, + "loss": 3.528, + "step": 2990 + }, + { + "epoch": 0.15, + "grad_norm": 0.5741263031959534, + "learning_rate": 0.0005964725959526777, + "loss": 3.4517, + "step": 2991 + }, + { + "epoch": 0.15, + "grad_norm": 0.6311984658241272, + "learning_rate": 0.0005964702415166541, + "loss": 3.5116, + "step": 2992 + }, + { + "epoch": 0.15, + "grad_norm": 0.5775881409645081, + "learning_rate": 0.000596467886299785, + "loss": 3.4346, + "step": 2993 + }, + { + "epoch": 0.15, + "grad_norm": 0.5953471660614014, + "learning_rate": 0.0005964655303020766, + "loss": 3.566, + "step": 2994 + }, + { + "epoch": 0.15, + "grad_norm": 0.6620184779167175, + "learning_rate": 0.0005964631735235349, + "loss": 3.7845, + "step": 2995 + }, + { + "epoch": 0.15, + "grad_norm": 0.5985296964645386, + "learning_rate": 0.0005964608159641664, + "loss": 3.6863, + "step": 2996 + }, + { + "epoch": 0.15, + "grad_norm": 0.6449403166770935, + "learning_rate": 0.000596458457623977, + "loss": 3.5711, + "step": 2997 + }, + { + "epoch": 0.15, + "grad_norm": 0.583063006401062, + "learning_rate": 0.0005964560985029732, + "loss": 3.3602, + "step": 2998 + }, + { + "epoch": 0.15, + "grad_norm": 0.5750587582588196, + "learning_rate": 0.0005964537386011611, + "loss": 3.6342, + "step": 2999 + }, + { + "epoch": 0.15, + "grad_norm": 0.5808610320091248, + "learning_rate": 0.0005964513779185468, + "loss": 3.5019, + "step": 3000 + }, + { + "epoch": 0.15, + "grad_norm": 0.6237676739692688, + "learning_rate": 0.0005964490164551367, + "loss": 3.5213, + "step": 3001 + }, + { + "epoch": 0.15, + "grad_norm": 0.5874052047729492, + "learning_rate": 0.000596446654210937, + "loss": 3.3836, + "step": 3002 + }, + { + "epoch": 0.15, + "grad_norm": 0.5484369993209839, + "learning_rate": 0.0005964442911859538, + "loss": 3.2887, + "step": 3003 + }, + { + "epoch": 0.15, + "grad_norm": 0.5622039437294006, + "learning_rate": 0.0005964419273801934, + "loss": 3.5959, + "step": 3004 + }, + { + "epoch": 0.15, + "grad_norm": 0.601517915725708, + "learning_rate": 0.000596439562793662, + "loss": 3.588, + "step": 3005 + }, + { + "epoch": 0.15, + "grad_norm": 0.6053403615951538, + "learning_rate": 0.0005964371974263658, + "loss": 3.6096, + "step": 3006 + }, + { + "epoch": 0.15, + "grad_norm": 0.5661147832870483, + "learning_rate": 0.0005964348312783111, + "loss": 3.52, + "step": 3007 + }, + { + "epoch": 0.15, + "grad_norm": 0.6172541975975037, + "learning_rate": 0.0005964324643495042, + "loss": 3.6069, + "step": 3008 + }, + { + "epoch": 0.15, + "grad_norm": 0.5625926852226257, + "learning_rate": 0.0005964300966399511, + "loss": 3.6741, + "step": 3009 + }, + { + "epoch": 0.15, + "grad_norm": 0.5759730339050293, + "learning_rate": 0.0005964277281496583, + "loss": 3.5645, + "step": 3010 + }, + { + "epoch": 0.15, + "grad_norm": 0.5957322120666504, + "learning_rate": 0.0005964253588786318, + "loss": 3.5519, + "step": 3011 + }, + { + "epoch": 0.15, + "grad_norm": 0.5785747170448303, + "learning_rate": 0.000596422988826878, + "loss": 3.4793, + "step": 3012 + }, + { + "epoch": 0.15, + "grad_norm": 0.5878031849861145, + "learning_rate": 0.0005964206179944031, + "loss": 3.5527, + "step": 3013 + }, + { + "epoch": 0.15, + "grad_norm": 0.5843365788459778, + "learning_rate": 0.0005964182463812133, + "loss": 3.5102, + "step": 3014 + }, + { + "epoch": 0.15, + "grad_norm": 0.5953040719032288, + "learning_rate": 0.000596415873987315, + "loss": 3.6765, + "step": 3015 + }, + { + "epoch": 0.15, + "grad_norm": 0.5682270526885986, + "learning_rate": 0.0005964135008127143, + "loss": 3.6247, + "step": 3016 + }, + { + "epoch": 0.15, + "grad_norm": 0.5600559115409851, + "learning_rate": 0.0005964111268574174, + "loss": 3.5984, + "step": 3017 + }, + { + "epoch": 0.15, + "grad_norm": 0.6236439943313599, + "learning_rate": 0.0005964087521214307, + "loss": 3.5485, + "step": 3018 + }, + { + "epoch": 0.15, + "grad_norm": 0.5868257284164429, + "learning_rate": 0.0005964063766047602, + "loss": 3.6191, + "step": 3019 + }, + { + "epoch": 0.15, + "grad_norm": 0.5846828818321228, + "learning_rate": 0.0005964040003074125, + "loss": 3.3374, + "step": 3020 + }, + { + "epoch": 0.15, + "grad_norm": 0.5583348274230957, + "learning_rate": 0.0005964016232293939, + "loss": 3.6922, + "step": 3021 + }, + { + "epoch": 0.15, + "grad_norm": 0.6195686459541321, + "learning_rate": 0.0005963992453707102, + "loss": 3.708, + "step": 3022 + }, + { + "epoch": 0.15, + "grad_norm": 0.6074361801147461, + "learning_rate": 0.0005963968667313681, + "loss": 3.6087, + "step": 3023 + }, + { + "epoch": 0.15, + "grad_norm": 0.5756213665008545, + "learning_rate": 0.0005963944873113736, + "loss": 3.5665, + "step": 3024 + }, + { + "epoch": 0.15, + "grad_norm": 0.6131848096847534, + "learning_rate": 0.0005963921071107331, + "loss": 3.1928, + "step": 3025 + }, + { + "epoch": 0.15, + "grad_norm": 0.550272524356842, + "learning_rate": 0.0005963897261294528, + "loss": 3.6607, + "step": 3026 + }, + { + "epoch": 0.15, + "grad_norm": 0.6190971732139587, + "learning_rate": 0.000596387344367539, + "loss": 3.619, + "step": 3027 + }, + { + "epoch": 0.15, + "grad_norm": 0.6038632392883301, + "learning_rate": 0.000596384961824998, + "loss": 3.775, + "step": 3028 + }, + { + "epoch": 0.15, + "grad_norm": 0.5631963014602661, + "learning_rate": 0.0005963825785018361, + "loss": 3.5747, + "step": 3029 + }, + { + "epoch": 0.15, + "grad_norm": 0.5642056465148926, + "learning_rate": 0.0005963801943980595, + "loss": 3.542, + "step": 3030 + }, + { + "epoch": 0.15, + "grad_norm": 0.5921563506126404, + "learning_rate": 0.0005963778095136744, + "loss": 3.5717, + "step": 3031 + }, + { + "epoch": 0.15, + "grad_norm": 0.5655155777931213, + "learning_rate": 0.0005963754238486873, + "loss": 3.5262, + "step": 3032 + }, + { + "epoch": 0.15, + "grad_norm": 0.5852659940719604, + "learning_rate": 0.0005963730374031044, + "loss": 3.6345, + "step": 3033 + }, + { + "epoch": 0.15, + "grad_norm": 0.5611869096755981, + "learning_rate": 0.0005963706501769319, + "loss": 3.6346, + "step": 3034 + }, + { + "epoch": 0.15, + "grad_norm": 0.5403077602386475, + "learning_rate": 0.0005963682621701761, + "loss": 3.7496, + "step": 3035 + }, + { + "epoch": 0.15, + "grad_norm": 0.5627409219741821, + "learning_rate": 0.0005963658733828433, + "loss": 3.6472, + "step": 3036 + }, + { + "epoch": 0.15, + "grad_norm": 0.5713598132133484, + "learning_rate": 0.00059636348381494, + "loss": 3.3384, + "step": 3037 + }, + { + "epoch": 0.15, + "grad_norm": 0.6267187595367432, + "learning_rate": 0.0005963610934664722, + "loss": 3.5897, + "step": 3038 + }, + { + "epoch": 0.15, + "grad_norm": 0.6044197678565979, + "learning_rate": 0.0005963587023374463, + "loss": 3.7926, + "step": 3039 + }, + { + "epoch": 0.15, + "grad_norm": 0.6235830783843994, + "learning_rate": 0.0005963563104278687, + "loss": 3.7628, + "step": 3040 + }, + { + "epoch": 0.15, + "grad_norm": 0.6233013868331909, + "learning_rate": 0.0005963539177377456, + "loss": 3.4503, + "step": 3041 + }, + { + "epoch": 0.15, + "grad_norm": 0.5676363110542297, + "learning_rate": 0.0005963515242670833, + "loss": 3.5615, + "step": 3042 + }, + { + "epoch": 0.15, + "grad_norm": 0.6279856562614441, + "learning_rate": 0.0005963491300158881, + "loss": 3.6456, + "step": 3043 + }, + { + "epoch": 0.15, + "grad_norm": 0.5585012435913086, + "learning_rate": 0.0005963467349841663, + "loss": 3.5764, + "step": 3044 + }, + { + "epoch": 0.15, + "grad_norm": 0.5672016143798828, + "learning_rate": 0.0005963443391719242, + "loss": 3.3576, + "step": 3045 + }, + { + "epoch": 0.15, + "grad_norm": 0.6148715019226074, + "learning_rate": 0.0005963419425791683, + "loss": 3.7448, + "step": 3046 + }, + { + "epoch": 0.15, + "grad_norm": 0.5559766888618469, + "learning_rate": 0.0005963395452059046, + "loss": 3.7044, + "step": 3047 + }, + { + "epoch": 0.15, + "grad_norm": 0.5408651232719421, + "learning_rate": 0.0005963371470521396, + "loss": 3.5665, + "step": 3048 + }, + { + "epoch": 0.15, + "grad_norm": 0.5515286922454834, + "learning_rate": 0.0005963347481178796, + "loss": 3.5655, + "step": 3049 + }, + { + "epoch": 0.15, + "grad_norm": 0.5659925937652588, + "learning_rate": 0.0005963323484031309, + "loss": 3.7384, + "step": 3050 + }, + { + "epoch": 0.15, + "grad_norm": 0.5940973162651062, + "learning_rate": 0.0005963299479078999, + "loss": 3.6098, + "step": 3051 + }, + { + "epoch": 0.15, + "grad_norm": 0.5960044264793396, + "learning_rate": 0.0005963275466321928, + "loss": 3.5784, + "step": 3052 + }, + { + "epoch": 0.15, + "grad_norm": 0.5913186073303223, + "learning_rate": 0.000596325144576016, + "loss": 3.3696, + "step": 3053 + }, + { + "epoch": 0.15, + "grad_norm": 0.5472952723503113, + "learning_rate": 0.0005963227417393758, + "loss": 3.3607, + "step": 3054 + }, + { + "epoch": 0.15, + "grad_norm": 0.5894946455955505, + "learning_rate": 0.0005963203381222784, + "loss": 3.5893, + "step": 3055 + }, + { + "epoch": 0.15, + "grad_norm": 0.586334228515625, + "learning_rate": 0.0005963179337247303, + "loss": 3.6586, + "step": 3056 + }, + { + "epoch": 0.15, + "grad_norm": 0.5724272131919861, + "learning_rate": 0.0005963155285467379, + "loss": 3.7376, + "step": 3057 + }, + { + "epoch": 0.15, + "grad_norm": 0.5854930281639099, + "learning_rate": 0.0005963131225883074, + "loss": 3.6825, + "step": 3058 + }, + { + "epoch": 0.15, + "grad_norm": 0.5693349242210388, + "learning_rate": 0.0005963107158494451, + "loss": 3.5495, + "step": 3059 + }, + { + "epoch": 0.15, + "grad_norm": 0.5654905438423157, + "learning_rate": 0.0005963083083301575, + "loss": 3.6469, + "step": 3060 + }, + { + "epoch": 0.15, + "grad_norm": 0.5747386813163757, + "learning_rate": 0.0005963059000304507, + "loss": 3.6497, + "step": 3061 + }, + { + "epoch": 0.15, + "grad_norm": 0.6350069046020508, + "learning_rate": 0.0005963034909503311, + "loss": 3.6941, + "step": 3062 + }, + { + "epoch": 0.15, + "grad_norm": 0.6059046387672424, + "learning_rate": 0.0005963010810898054, + "loss": 3.5363, + "step": 3063 + }, + { + "epoch": 0.15, + "grad_norm": 0.5842269062995911, + "learning_rate": 0.0005962986704488796, + "loss": 3.7521, + "step": 3064 + }, + { + "epoch": 0.15, + "grad_norm": 0.5611490607261658, + "learning_rate": 0.00059629625902756, + "loss": 3.6577, + "step": 3065 + }, + { + "epoch": 0.15, + "grad_norm": 0.5932784676551819, + "learning_rate": 0.0005962938468258532, + "loss": 3.5643, + "step": 3066 + }, + { + "epoch": 0.15, + "grad_norm": 0.596534252166748, + "learning_rate": 0.0005962914338437654, + "loss": 3.7409, + "step": 3067 + }, + { + "epoch": 0.15, + "grad_norm": 0.5720816254615784, + "learning_rate": 0.0005962890200813029, + "loss": 3.7528, + "step": 3068 + }, + { + "epoch": 0.15, + "grad_norm": 0.5783271193504333, + "learning_rate": 0.0005962866055384722, + "loss": 3.8315, + "step": 3069 + }, + { + "epoch": 0.15, + "grad_norm": 0.5567073225975037, + "learning_rate": 0.0005962841902152796, + "loss": 3.5997, + "step": 3070 + }, + { + "epoch": 0.15, + "grad_norm": 0.5587060451507568, + "learning_rate": 0.0005962817741117316, + "loss": 3.7797, + "step": 3071 + }, + { + "epoch": 0.15, + "grad_norm": 0.5583739876747131, + "learning_rate": 0.0005962793572278342, + "loss": 3.768, + "step": 3072 + }, + { + "epoch": 0.15, + "grad_norm": 0.6153991222381592, + "learning_rate": 0.0005962769395635942, + "loss": 3.6803, + "step": 3073 + }, + { + "epoch": 0.15, + "grad_norm": 0.6166850328445435, + "learning_rate": 0.0005962745211190176, + "loss": 3.489, + "step": 3074 + }, + { + "epoch": 0.15, + "grad_norm": 0.6104187369346619, + "learning_rate": 0.000596272101894111, + "loss": 3.4945, + "step": 3075 + }, + { + "epoch": 0.15, + "grad_norm": 0.6122152805328369, + "learning_rate": 0.0005962696818888807, + "loss": 3.7251, + "step": 3076 + }, + { + "epoch": 0.15, + "grad_norm": 0.5618709921836853, + "learning_rate": 0.0005962672611033331, + "loss": 3.6053, + "step": 3077 + }, + { + "epoch": 0.15, + "grad_norm": 0.5944236516952515, + "learning_rate": 0.0005962648395374745, + "loss": 3.4591, + "step": 3078 + }, + { + "epoch": 0.15, + "grad_norm": 0.6525372266769409, + "learning_rate": 0.0005962624171913113, + "loss": 3.4751, + "step": 3079 + }, + { + "epoch": 0.15, + "grad_norm": 0.64317387342453, + "learning_rate": 0.00059625999406485, + "loss": 3.6205, + "step": 3080 + }, + { + "epoch": 0.15, + "grad_norm": 0.6131284236907959, + "learning_rate": 0.0005962575701580968, + "loss": 3.6735, + "step": 3081 + }, + { + "epoch": 0.15, + "grad_norm": 0.5894636511802673, + "learning_rate": 0.0005962551454710583, + "loss": 3.8265, + "step": 3082 + }, + { + "epoch": 0.15, + "grad_norm": 0.6023474335670471, + "learning_rate": 0.0005962527200037407, + "loss": 3.573, + "step": 3083 + }, + { + "epoch": 0.15, + "grad_norm": 0.5374550223350525, + "learning_rate": 0.0005962502937561505, + "loss": 3.467, + "step": 3084 + }, + { + "epoch": 0.15, + "grad_norm": 0.5970792174339294, + "learning_rate": 0.0005962478667282939, + "loss": 3.6305, + "step": 3085 + }, + { + "epoch": 0.15, + "grad_norm": 0.6187454462051392, + "learning_rate": 0.0005962454389201776, + "loss": 3.5442, + "step": 3086 + }, + { + "epoch": 0.15, + "grad_norm": 0.5832275152206421, + "learning_rate": 0.0005962430103318077, + "loss": 3.5218, + "step": 3087 + }, + { + "epoch": 0.15, + "grad_norm": 0.6036730408668518, + "learning_rate": 0.0005962405809631908, + "loss": 3.3699, + "step": 3088 + }, + { + "epoch": 0.15, + "grad_norm": 0.569485068321228, + "learning_rate": 0.0005962381508143332, + "loss": 3.6637, + "step": 3089 + }, + { + "epoch": 0.15, + "grad_norm": 0.587161660194397, + "learning_rate": 0.0005962357198852412, + "loss": 3.8073, + "step": 3090 + }, + { + "epoch": 0.15, + "grad_norm": 0.5674685835838318, + "learning_rate": 0.0005962332881759215, + "loss": 3.5457, + "step": 3091 + }, + { + "epoch": 0.15, + "grad_norm": 0.5693333745002747, + "learning_rate": 0.0005962308556863803, + "loss": 3.6018, + "step": 3092 + }, + { + "epoch": 0.15, + "grad_norm": 0.6043726801872253, + "learning_rate": 0.000596228422416624, + "loss": 3.5762, + "step": 3093 + }, + { + "epoch": 0.15, + "grad_norm": 0.5639926791191101, + "learning_rate": 0.0005962259883666591, + "loss": 3.8223, + "step": 3094 + }, + { + "epoch": 0.15, + "grad_norm": 0.6272081136703491, + "learning_rate": 0.0005962235535364918, + "loss": 3.6311, + "step": 3095 + }, + { + "epoch": 0.15, + "grad_norm": 0.5490891933441162, + "learning_rate": 0.0005962211179261288, + "loss": 3.455, + "step": 3096 + }, + { + "epoch": 0.15, + "grad_norm": 0.6012369990348816, + "learning_rate": 0.0005962186815355765, + "loss": 3.4856, + "step": 3097 + }, + { + "epoch": 0.15, + "grad_norm": 0.5988452434539795, + "learning_rate": 0.000596216244364841, + "loss": 3.5496, + "step": 3098 + }, + { + "epoch": 0.15, + "grad_norm": 0.6350785493850708, + "learning_rate": 0.000596213806413929, + "loss": 3.5515, + "step": 3099 + }, + { + "epoch": 0.15, + "grad_norm": 0.6114810705184937, + "learning_rate": 0.0005962113676828468, + "loss": 3.4899, + "step": 3100 + }, + { + "epoch": 0.15, + "grad_norm": 0.5787695646286011, + "learning_rate": 0.000596208928171601, + "loss": 3.5458, + "step": 3101 + }, + { + "epoch": 0.15, + "grad_norm": 0.5776200890541077, + "learning_rate": 0.0005962064878801977, + "loss": 3.4334, + "step": 3102 + }, + { + "epoch": 0.15, + "grad_norm": 0.5516855716705322, + "learning_rate": 0.0005962040468086436, + "loss": 3.4205, + "step": 3103 + }, + { + "epoch": 0.15, + "grad_norm": 0.5812780857086182, + "learning_rate": 0.0005962016049569451, + "loss": 3.7321, + "step": 3104 + }, + { + "epoch": 0.15, + "grad_norm": 0.5914744138717651, + "learning_rate": 0.0005961991623251085, + "loss": 3.6045, + "step": 3105 + }, + { + "epoch": 0.15, + "grad_norm": 0.5617961287498474, + "learning_rate": 0.0005961967189131403, + "loss": 3.6301, + "step": 3106 + }, + { + "epoch": 0.15, + "grad_norm": 0.6185020208358765, + "learning_rate": 0.0005961942747210469, + "loss": 3.5499, + "step": 3107 + }, + { + "epoch": 0.15, + "grad_norm": 0.609798014163971, + "learning_rate": 0.0005961918297488349, + "loss": 3.5542, + "step": 3108 + }, + { + "epoch": 0.15, + "grad_norm": 0.5684676766395569, + "learning_rate": 0.0005961893839965106, + "loss": 3.5955, + "step": 3109 + }, + { + "epoch": 0.15, + "grad_norm": 0.584693193435669, + "learning_rate": 0.0005961869374640804, + "loss": 3.3336, + "step": 3110 + }, + { + "epoch": 0.15, + "grad_norm": 0.6073812246322632, + "learning_rate": 0.0005961844901515509, + "loss": 3.669, + "step": 3111 + }, + { + "epoch": 0.15, + "grad_norm": 0.5767044425010681, + "learning_rate": 0.0005961820420589285, + "loss": 3.63, + "step": 3112 + }, + { + "epoch": 0.15, + "grad_norm": 0.5799092054367065, + "learning_rate": 0.0005961795931862194, + "loss": 3.4574, + "step": 3113 + }, + { + "epoch": 0.15, + "grad_norm": 0.545676052570343, + "learning_rate": 0.0005961771435334304, + "loss": 3.5919, + "step": 3114 + }, + { + "epoch": 0.15, + "grad_norm": 0.582565188407898, + "learning_rate": 0.0005961746931005678, + "loss": 3.664, + "step": 3115 + }, + { + "epoch": 0.15, + "grad_norm": 0.5571921467781067, + "learning_rate": 0.000596172241887638, + "loss": 3.5959, + "step": 3116 + }, + { + "epoch": 0.15, + "grad_norm": 0.5776621699333191, + "learning_rate": 0.0005961697898946475, + "loss": 3.5268, + "step": 3117 + }, + { + "epoch": 0.15, + "grad_norm": 0.6262937188148499, + "learning_rate": 0.0005961673371216029, + "loss": 3.4795, + "step": 3118 + }, + { + "epoch": 0.15, + "grad_norm": 0.6134164929389954, + "learning_rate": 0.0005961648835685105, + "loss": 3.782, + "step": 3119 + }, + { + "epoch": 0.15, + "grad_norm": 0.5699538588523865, + "learning_rate": 0.0005961624292353766, + "loss": 3.8283, + "step": 3120 + }, + { + "epoch": 0.15, + "grad_norm": 0.5842909216880798, + "learning_rate": 0.0005961599741222081, + "loss": 3.4827, + "step": 3121 + }, + { + "epoch": 0.15, + "grad_norm": 0.5857353806495667, + "learning_rate": 0.0005961575182290111, + "loss": 3.5638, + "step": 3122 + }, + { + "epoch": 0.15, + "grad_norm": 0.5917581915855408, + "learning_rate": 0.0005961550615557923, + "loss": 3.5723, + "step": 3123 + }, + { + "epoch": 0.15, + "grad_norm": 0.546079158782959, + "learning_rate": 0.000596152604102558, + "loss": 3.6738, + "step": 3124 + }, + { + "epoch": 0.15, + "grad_norm": 0.5460621118545532, + "learning_rate": 0.0005961501458693147, + "loss": 3.6153, + "step": 3125 + }, + { + "epoch": 0.15, + "grad_norm": 0.5750552415847778, + "learning_rate": 0.000596147686856069, + "loss": 3.5495, + "step": 3126 + }, + { + "epoch": 0.15, + "grad_norm": 0.55301833152771, + "learning_rate": 0.0005961452270628272, + "loss": 3.5873, + "step": 3127 + }, + { + "epoch": 0.15, + "grad_norm": 0.5648224949836731, + "learning_rate": 0.0005961427664895959, + "loss": 3.6474, + "step": 3128 + }, + { + "epoch": 0.15, + "grad_norm": 0.581761360168457, + "learning_rate": 0.0005961403051363814, + "loss": 3.477, + "step": 3129 + }, + { + "epoch": 0.15, + "grad_norm": 0.5733395218849182, + "learning_rate": 0.0005961378430031905, + "loss": 3.7321, + "step": 3130 + }, + { + "epoch": 0.15, + "grad_norm": 0.5752854347229004, + "learning_rate": 0.0005961353800900296, + "loss": 3.7605, + "step": 3131 + }, + { + "epoch": 0.15, + "grad_norm": 0.5976043343544006, + "learning_rate": 0.000596132916396905, + "loss": 3.7176, + "step": 3132 + }, + { + "epoch": 0.15, + "grad_norm": 0.6109990477561951, + "learning_rate": 0.0005961304519238232, + "loss": 3.9394, + "step": 3133 + }, + { + "epoch": 0.15, + "grad_norm": 0.6013369560241699, + "learning_rate": 0.0005961279866707909, + "loss": 3.4758, + "step": 3134 + }, + { + "epoch": 0.15, + "grad_norm": 0.5654555559158325, + "learning_rate": 0.0005961255206378144, + "loss": 3.6548, + "step": 3135 + }, + { + "epoch": 0.15, + "grad_norm": 0.5910729765892029, + "learning_rate": 0.0005961230538249003, + "loss": 3.5525, + "step": 3136 + }, + { + "epoch": 0.15, + "grad_norm": 0.6182341575622559, + "learning_rate": 0.0005961205862320551, + "loss": 3.5527, + "step": 3137 + }, + { + "epoch": 0.15, + "grad_norm": 0.5825470089912415, + "learning_rate": 0.0005961181178592852, + "loss": 3.6325, + "step": 3138 + }, + { + "epoch": 0.15, + "grad_norm": 0.6601877808570862, + "learning_rate": 0.0005961156487065972, + "loss": 3.3948, + "step": 3139 + }, + { + "epoch": 0.15, + "grad_norm": 0.5642703771591187, + "learning_rate": 0.0005961131787739976, + "loss": 3.5232, + "step": 3140 + }, + { + "epoch": 0.15, + "grad_norm": 0.5936177372932434, + "learning_rate": 0.0005961107080614929, + "loss": 3.6665, + "step": 3141 + }, + { + "epoch": 0.15, + "grad_norm": 0.5690498948097229, + "learning_rate": 0.0005961082365690894, + "loss": 3.4099, + "step": 3142 + }, + { + "epoch": 0.15, + "grad_norm": 0.6080211400985718, + "learning_rate": 0.0005961057642967939, + "loss": 3.402, + "step": 3143 + }, + { + "epoch": 0.15, + "grad_norm": 0.5955443382263184, + "learning_rate": 0.0005961032912446128, + "loss": 3.4212, + "step": 3144 + }, + { + "epoch": 0.15, + "grad_norm": 0.5814092755317688, + "learning_rate": 0.0005961008174125527, + "loss": 3.3803, + "step": 3145 + }, + { + "epoch": 0.15, + "grad_norm": 0.5517794489860535, + "learning_rate": 0.00059609834280062, + "loss": 3.5878, + "step": 3146 + }, + { + "epoch": 0.15, + "grad_norm": 0.5653300881385803, + "learning_rate": 0.0005960958674088211, + "loss": 3.535, + "step": 3147 + }, + { + "epoch": 0.15, + "grad_norm": 0.5799789428710938, + "learning_rate": 0.0005960933912371629, + "loss": 3.6731, + "step": 3148 + }, + { + "epoch": 0.15, + "grad_norm": 0.5752755403518677, + "learning_rate": 0.0005960909142856516, + "loss": 3.6781, + "step": 3149 + }, + { + "epoch": 0.15, + "grad_norm": 0.6208489537239075, + "learning_rate": 0.0005960884365542937, + "loss": 3.3908, + "step": 3150 + }, + { + "epoch": 0.15, + "grad_norm": 0.6199827790260315, + "learning_rate": 0.000596085958043096, + "loss": 3.7275, + "step": 3151 + }, + { + "epoch": 0.15, + "grad_norm": 0.6208821535110474, + "learning_rate": 0.0005960834787520648, + "loss": 3.4982, + "step": 3152 + }, + { + "epoch": 0.15, + "grad_norm": 0.5410616993904114, + "learning_rate": 0.0005960809986812067, + "loss": 3.5705, + "step": 3153 + }, + { + "epoch": 0.15, + "grad_norm": 0.5999816060066223, + "learning_rate": 0.0005960785178305283, + "loss": 3.4113, + "step": 3154 + }, + { + "epoch": 0.15, + "grad_norm": 0.6006971001625061, + "learning_rate": 0.000596076036200036, + "loss": 3.7676, + "step": 3155 + }, + { + "epoch": 0.15, + "grad_norm": 0.5484111309051514, + "learning_rate": 0.0005960735537897364, + "loss": 3.5441, + "step": 3156 + }, + { + "epoch": 0.15, + "grad_norm": 0.581990659236908, + "learning_rate": 0.000596071070599636, + "loss": 3.3492, + "step": 3157 + }, + { + "epoch": 0.15, + "grad_norm": 0.6446429491043091, + "learning_rate": 0.0005960685866297415, + "loss": 3.6986, + "step": 3158 + }, + { + "epoch": 0.15, + "grad_norm": 0.564018726348877, + "learning_rate": 0.0005960661018800592, + "loss": 3.4903, + "step": 3159 + }, + { + "epoch": 0.15, + "grad_norm": 0.5862210392951965, + "learning_rate": 0.0005960636163505958, + "loss": 3.5107, + "step": 3160 + }, + { + "epoch": 0.15, + "grad_norm": 0.5642343163490295, + "learning_rate": 0.0005960611300413578, + "loss": 3.6318, + "step": 3161 + }, + { + "epoch": 0.15, + "grad_norm": 0.5777455568313599, + "learning_rate": 0.0005960586429523518, + "loss": 3.417, + "step": 3162 + }, + { + "epoch": 0.16, + "grad_norm": 0.586602509021759, + "learning_rate": 0.0005960561550835843, + "loss": 3.5432, + "step": 3163 + }, + { + "epoch": 0.16, + "grad_norm": 0.5912312269210815, + "learning_rate": 0.0005960536664350619, + "loss": 3.6441, + "step": 3164 + }, + { + "epoch": 0.16, + "grad_norm": 0.6185685396194458, + "learning_rate": 0.0005960511770067911, + "loss": 3.5537, + "step": 3165 + }, + { + "epoch": 0.16, + "grad_norm": 0.6227981448173523, + "learning_rate": 0.0005960486867987784, + "loss": 3.4673, + "step": 3166 + }, + { + "epoch": 0.16, + "grad_norm": 0.6355583071708679, + "learning_rate": 0.0005960461958110305, + "loss": 3.5603, + "step": 3167 + }, + { + "epoch": 0.16, + "grad_norm": 0.6054497957229614, + "learning_rate": 0.0005960437040435539, + "loss": 3.5745, + "step": 3168 + }, + { + "epoch": 0.16, + "grad_norm": 0.61248379945755, + "learning_rate": 0.0005960412114963552, + "loss": 3.5392, + "step": 3169 + }, + { + "epoch": 0.16, + "grad_norm": 0.600622832775116, + "learning_rate": 0.0005960387181694408, + "loss": 3.629, + "step": 3170 + }, + { + "epoch": 0.16, + "grad_norm": 0.6262516379356384, + "learning_rate": 0.0005960362240628175, + "loss": 3.3767, + "step": 3171 + }, + { + "epoch": 0.16, + "grad_norm": 0.5428594350814819, + "learning_rate": 0.0005960337291764918, + "loss": 3.6812, + "step": 3172 + }, + { + "epoch": 0.16, + "grad_norm": 0.5798599123954773, + "learning_rate": 0.0005960312335104701, + "loss": 3.6121, + "step": 3173 + }, + { + "epoch": 0.16, + "grad_norm": 0.600669801235199, + "learning_rate": 0.0005960287370647591, + "loss": 3.5828, + "step": 3174 + }, + { + "epoch": 0.16, + "grad_norm": 0.5855419635772705, + "learning_rate": 0.0005960262398393654, + "loss": 3.7861, + "step": 3175 + }, + { + "epoch": 0.16, + "grad_norm": 0.5565866827964783, + "learning_rate": 0.0005960237418342957, + "loss": 3.7263, + "step": 3176 + }, + { + "epoch": 0.16, + "grad_norm": 0.5831587910652161, + "learning_rate": 0.0005960212430495564, + "loss": 3.7619, + "step": 3177 + }, + { + "epoch": 0.16, + "grad_norm": 0.6254531741142273, + "learning_rate": 0.000596018743485154, + "loss": 3.5224, + "step": 3178 + }, + { + "epoch": 0.16, + "grad_norm": 0.5760573744773865, + "learning_rate": 0.0005960162431410953, + "loss": 3.4772, + "step": 3179 + }, + { + "epoch": 0.16, + "grad_norm": 0.5323291420936584, + "learning_rate": 0.0005960137420173866, + "loss": 3.4991, + "step": 3180 + }, + { + "epoch": 0.16, + "grad_norm": 0.5438007116317749, + "learning_rate": 0.0005960112401140348, + "loss": 3.6223, + "step": 3181 + }, + { + "epoch": 0.16, + "grad_norm": 0.5264496207237244, + "learning_rate": 0.0005960087374310464, + "loss": 3.7063, + "step": 3182 + }, + { + "epoch": 0.16, + "grad_norm": 0.5956478118896484, + "learning_rate": 0.0005960062339684279, + "loss": 3.564, + "step": 3183 + }, + { + "epoch": 0.16, + "grad_norm": 0.6297171711921692, + "learning_rate": 0.000596003729726186, + "loss": 3.5353, + "step": 3184 + }, + { + "epoch": 0.16, + "grad_norm": 0.5785334706306458, + "learning_rate": 0.0005960012247043272, + "loss": 3.7652, + "step": 3185 + }, + { + "epoch": 0.16, + "grad_norm": 0.6340588331222534, + "learning_rate": 0.0005959987189028582, + "loss": 3.6217, + "step": 3186 + }, + { + "epoch": 0.16, + "grad_norm": 0.5726503133773804, + "learning_rate": 0.0005959962123217855, + "loss": 3.585, + "step": 3187 + }, + { + "epoch": 0.16, + "grad_norm": 0.5697149038314819, + "learning_rate": 0.0005959937049611156, + "loss": 3.4602, + "step": 3188 + }, + { + "epoch": 0.16, + "grad_norm": 0.5911883115768433, + "learning_rate": 0.0005959911968208554, + "loss": 3.4454, + "step": 3189 + }, + { + "epoch": 0.16, + "grad_norm": 0.579783022403717, + "learning_rate": 0.0005959886879010113, + "loss": 3.6873, + "step": 3190 + }, + { + "epoch": 0.16, + "grad_norm": 0.5348889827728271, + "learning_rate": 0.00059598617820159, + "loss": 3.8555, + "step": 3191 + }, + { + "epoch": 0.16, + "grad_norm": 0.5788812637329102, + "learning_rate": 0.000595983667722598, + "loss": 3.634, + "step": 3192 + }, + { + "epoch": 0.16, + "grad_norm": 0.5206723809242249, + "learning_rate": 0.0005959811564640421, + "loss": 3.3991, + "step": 3193 + }, + { + "epoch": 0.16, + "grad_norm": 0.5956487059593201, + "learning_rate": 0.0005959786444259287, + "loss": 3.4554, + "step": 3194 + }, + { + "epoch": 0.16, + "grad_norm": 0.5612884759902954, + "learning_rate": 0.0005959761316082646, + "loss": 3.6328, + "step": 3195 + }, + { + "epoch": 0.16, + "grad_norm": 0.5556085705757141, + "learning_rate": 0.0005959736180110563, + "loss": 3.6107, + "step": 3196 + }, + { + "epoch": 0.16, + "grad_norm": 0.5669443607330322, + "learning_rate": 0.0005959711036343104, + "loss": 3.5104, + "step": 3197 + }, + { + "epoch": 0.16, + "grad_norm": 0.5675809383392334, + "learning_rate": 0.0005959685884780337, + "loss": 3.5678, + "step": 3198 + }, + { + "epoch": 0.16, + "grad_norm": 0.605646014213562, + "learning_rate": 0.0005959660725422325, + "loss": 3.7298, + "step": 3199 + }, + { + "epoch": 0.16, + "grad_norm": 0.5824044942855835, + "learning_rate": 0.0005959635558269137, + "loss": 3.416, + "step": 3200 + }, + { + "epoch": 0.16, + "grad_norm": 0.5763425230979919, + "learning_rate": 0.0005959610383320839, + "loss": 3.6143, + "step": 3201 + }, + { + "epoch": 0.16, + "grad_norm": 0.5799658894538879, + "learning_rate": 0.0005959585200577497, + "loss": 3.5673, + "step": 3202 + }, + { + "epoch": 0.16, + "grad_norm": 0.5830967426300049, + "learning_rate": 0.0005959560010039177, + "loss": 3.5561, + "step": 3203 + }, + { + "epoch": 0.16, + "grad_norm": 0.5466955304145813, + "learning_rate": 0.0005959534811705946, + "loss": 3.6316, + "step": 3204 + }, + { + "epoch": 0.16, + "grad_norm": 0.6080002784729004, + "learning_rate": 0.0005959509605577869, + "loss": 3.6567, + "step": 3205 + }, + { + "epoch": 0.16, + "grad_norm": 0.5686078071594238, + "learning_rate": 0.0005959484391655013, + "loss": 3.5206, + "step": 3206 + }, + { + "epoch": 0.16, + "grad_norm": 0.6283111572265625, + "learning_rate": 0.0005959459169937445, + "loss": 3.669, + "step": 3207 + }, + { + "epoch": 0.16, + "grad_norm": 0.551060140132904, + "learning_rate": 0.0005959433940425231, + "loss": 3.5432, + "step": 3208 + }, + { + "epoch": 0.16, + "grad_norm": 0.5849263668060303, + "learning_rate": 0.0005959408703118437, + "loss": 3.463, + "step": 3209 + }, + { + "epoch": 0.16, + "grad_norm": 0.5780369639396667, + "learning_rate": 0.0005959383458017133, + "loss": 3.4947, + "step": 3210 + }, + { + "epoch": 0.16, + "grad_norm": 0.6128699779510498, + "learning_rate": 0.000595935820512138, + "loss": 3.3344, + "step": 3211 + }, + { + "epoch": 0.16, + "grad_norm": 0.5325568318367004, + "learning_rate": 0.0005959332944431246, + "loss": 3.5697, + "step": 3212 + }, + { + "epoch": 0.16, + "grad_norm": 0.6106698513031006, + "learning_rate": 0.00059593076759468, + "loss": 3.4766, + "step": 3213 + }, + { + "epoch": 0.16, + "grad_norm": 0.6008756160736084, + "learning_rate": 0.0005959282399668108, + "loss": 3.5924, + "step": 3214 + }, + { + "epoch": 0.16, + "grad_norm": 0.5620996952056885, + "learning_rate": 0.0005959257115595235, + "loss": 3.548, + "step": 3215 + }, + { + "epoch": 0.16, + "grad_norm": 0.6242149472236633, + "learning_rate": 0.0005959231823728247, + "loss": 3.6272, + "step": 3216 + }, + { + "epoch": 0.16, + "grad_norm": 0.6121542453765869, + "learning_rate": 0.0005959206524067213, + "loss": 3.6008, + "step": 3217 + }, + { + "epoch": 0.16, + "grad_norm": 0.6612833142280579, + "learning_rate": 0.0005959181216612199, + "loss": 3.3679, + "step": 3218 + }, + { + "epoch": 0.16, + "grad_norm": 0.6306580305099487, + "learning_rate": 0.000595915590136327, + "loss": 3.6513, + "step": 3219 + }, + { + "epoch": 0.16, + "grad_norm": 0.615294337272644, + "learning_rate": 0.0005959130578320495, + "loss": 3.4633, + "step": 3220 + }, + { + "epoch": 0.16, + "grad_norm": 0.5863372683525085, + "learning_rate": 0.0005959105247483939, + "loss": 3.6524, + "step": 3221 + }, + { + "epoch": 0.16, + "grad_norm": 0.702987551689148, + "learning_rate": 0.0005959079908853669, + "loss": 3.7698, + "step": 3222 + }, + { + "epoch": 0.16, + "grad_norm": 0.6178707480430603, + "learning_rate": 0.0005959054562429753, + "loss": 3.5101, + "step": 3223 + }, + { + "epoch": 0.16, + "grad_norm": 0.5774521231651306, + "learning_rate": 0.0005959029208212255, + "loss": 3.5955, + "step": 3224 + }, + { + "epoch": 0.16, + "grad_norm": 0.587511420249939, + "learning_rate": 0.0005959003846201245, + "loss": 3.5713, + "step": 3225 + }, + { + "epoch": 0.16, + "grad_norm": 0.5820519924163818, + "learning_rate": 0.0005958978476396788, + "loss": 3.1941, + "step": 3226 + }, + { + "epoch": 0.16, + "grad_norm": 0.5678663849830627, + "learning_rate": 0.0005958953098798952, + "loss": 3.6527, + "step": 3227 + }, + { + "epoch": 0.16, + "grad_norm": 0.547321617603302, + "learning_rate": 0.0005958927713407801, + "loss": 3.5103, + "step": 3228 + }, + { + "epoch": 0.16, + "grad_norm": 0.5541772246360779, + "learning_rate": 0.0005958902320223405, + "loss": 3.4871, + "step": 3229 + }, + { + "epoch": 0.16, + "grad_norm": 0.5748250484466553, + "learning_rate": 0.000595887691924583, + "loss": 3.4653, + "step": 3230 + }, + { + "epoch": 0.16, + "grad_norm": 0.6465830206871033, + "learning_rate": 0.0005958851510475142, + "loss": 3.5919, + "step": 3231 + }, + { + "epoch": 0.16, + "grad_norm": 0.5622817277908325, + "learning_rate": 0.000595882609391141, + "loss": 3.6361, + "step": 3232 + }, + { + "epoch": 0.16, + "grad_norm": 0.5414732098579407, + "learning_rate": 0.0005958800669554698, + "loss": 3.797, + "step": 3233 + }, + { + "epoch": 0.16, + "grad_norm": 0.5994971990585327, + "learning_rate": 0.0005958775237405074, + "loss": 3.5292, + "step": 3234 + }, + { + "epoch": 0.16, + "grad_norm": 0.6444474458694458, + "learning_rate": 0.0005958749797462607, + "loss": 3.5301, + "step": 3235 + }, + { + "epoch": 0.16, + "grad_norm": 0.5784667730331421, + "learning_rate": 0.0005958724349727362, + "loss": 3.4053, + "step": 3236 + }, + { + "epoch": 0.16, + "grad_norm": 0.5597053170204163, + "learning_rate": 0.0005958698894199406, + "loss": 3.5452, + "step": 3237 + }, + { + "epoch": 0.16, + "grad_norm": 0.5689312815666199, + "learning_rate": 0.0005958673430878807, + "loss": 3.6081, + "step": 3238 + }, + { + "epoch": 0.16, + "grad_norm": 0.5763968825340271, + "learning_rate": 0.0005958647959765632, + "loss": 3.3315, + "step": 3239 + }, + { + "epoch": 0.16, + "grad_norm": 0.5735750794410706, + "learning_rate": 0.0005958622480859947, + "loss": 3.5241, + "step": 3240 + }, + { + "epoch": 0.16, + "grad_norm": 0.5359008312225342, + "learning_rate": 0.0005958596994161819, + "loss": 3.7146, + "step": 3241 + }, + { + "epoch": 0.16, + "grad_norm": 0.6043967008590698, + "learning_rate": 0.0005958571499671318, + "loss": 3.6643, + "step": 3242 + }, + { + "epoch": 0.16, + "grad_norm": 0.588692843914032, + "learning_rate": 0.0005958545997388506, + "loss": 3.6953, + "step": 3243 + }, + { + "epoch": 0.16, + "grad_norm": 0.5518408417701721, + "learning_rate": 0.0005958520487313456, + "loss": 3.49, + "step": 3244 + }, + { + "epoch": 0.16, + "grad_norm": 0.6246479153633118, + "learning_rate": 0.0005958494969446231, + "loss": 3.5562, + "step": 3245 + }, + { + "epoch": 0.16, + "grad_norm": 0.6223381757736206, + "learning_rate": 0.0005958469443786901, + "loss": 3.5703, + "step": 3246 + }, + { + "epoch": 0.16, + "grad_norm": 0.5942324995994568, + "learning_rate": 0.0005958443910335531, + "loss": 3.4527, + "step": 3247 + }, + { + "epoch": 0.16, + "grad_norm": 0.5596261620521545, + "learning_rate": 0.0005958418369092189, + "loss": 3.63, + "step": 3248 + }, + { + "epoch": 0.16, + "grad_norm": 0.5686338543891907, + "learning_rate": 0.0005958392820056942, + "loss": 3.5783, + "step": 3249 + }, + { + "epoch": 0.16, + "grad_norm": 0.5892527103424072, + "learning_rate": 0.0005958367263229859, + "loss": 3.5206, + "step": 3250 + }, + { + "epoch": 0.16, + "grad_norm": 0.5530284643173218, + "learning_rate": 0.0005958341698611004, + "loss": 3.7264, + "step": 3251 + }, + { + "epoch": 0.16, + "grad_norm": 0.5221811532974243, + "learning_rate": 0.0005958316126200448, + "loss": 3.5932, + "step": 3252 + }, + { + "epoch": 0.16, + "grad_norm": 0.5697766542434692, + "learning_rate": 0.0005958290545998255, + "loss": 3.6294, + "step": 3253 + }, + { + "epoch": 0.16, + "grad_norm": 0.5952369570732117, + "learning_rate": 0.0005958264958004496, + "loss": 3.5551, + "step": 3254 + }, + { + "epoch": 0.16, + "grad_norm": 0.5782142281532288, + "learning_rate": 0.0005958239362219235, + "loss": 3.4631, + "step": 3255 + }, + { + "epoch": 0.16, + "grad_norm": 0.5828744173049927, + "learning_rate": 0.0005958213758642542, + "loss": 3.5134, + "step": 3256 + }, + { + "epoch": 0.16, + "grad_norm": 0.5539057850837708, + "learning_rate": 0.0005958188147274483, + "loss": 3.3305, + "step": 3257 + }, + { + "epoch": 0.16, + "grad_norm": 0.6173965930938721, + "learning_rate": 0.0005958162528115125, + "loss": 3.7218, + "step": 3258 + }, + { + "epoch": 0.16, + "grad_norm": 0.5583602786064148, + "learning_rate": 0.0005958136901164537, + "loss": 3.6743, + "step": 3259 + }, + { + "epoch": 0.16, + "grad_norm": 0.5693215131759644, + "learning_rate": 0.0005958111266422785, + "loss": 3.5812, + "step": 3260 + }, + { + "epoch": 0.16, + "grad_norm": 0.5246009826660156, + "learning_rate": 0.0005958085623889937, + "loss": 3.6991, + "step": 3261 + }, + { + "epoch": 0.16, + "grad_norm": 0.5803536772727966, + "learning_rate": 0.0005958059973566062, + "loss": 3.7107, + "step": 3262 + }, + { + "epoch": 0.16, + "grad_norm": 0.5437319874763489, + "learning_rate": 0.0005958034315451224, + "loss": 3.7399, + "step": 3263 + }, + { + "epoch": 0.16, + "grad_norm": 0.5739423036575317, + "learning_rate": 0.0005958008649545496, + "loss": 3.522, + "step": 3264 + }, + { + "epoch": 0.16, + "grad_norm": 0.6183177828788757, + "learning_rate": 0.000595798297584894, + "loss": 3.4463, + "step": 3265 + }, + { + "epoch": 0.16, + "grad_norm": 0.521632194519043, + "learning_rate": 0.0005957957294361628, + "loss": 3.57, + "step": 3266 + }, + { + "epoch": 0.16, + "grad_norm": 0.5398718118667603, + "learning_rate": 0.0005957931605083624, + "loss": 3.5119, + "step": 3267 + }, + { + "epoch": 0.16, + "grad_norm": 0.556581437587738, + "learning_rate": 0.0005957905908014999, + "loss": 3.6029, + "step": 3268 + }, + { + "epoch": 0.16, + "grad_norm": 0.5454385876655579, + "learning_rate": 0.0005957880203155818, + "loss": 3.4989, + "step": 3269 + }, + { + "epoch": 0.16, + "grad_norm": 0.5625463128089905, + "learning_rate": 0.000595785449050615, + "loss": 3.6914, + "step": 3270 + }, + { + "epoch": 0.16, + "grad_norm": 0.538007915019989, + "learning_rate": 0.0005957828770066063, + "loss": 3.5866, + "step": 3271 + }, + { + "epoch": 0.16, + "grad_norm": 0.5634467601776123, + "learning_rate": 0.0005957803041835623, + "loss": 3.8559, + "step": 3272 + }, + { + "epoch": 0.16, + "grad_norm": 0.5901991128921509, + "learning_rate": 0.0005957777305814901, + "loss": 3.4086, + "step": 3273 + }, + { + "epoch": 0.16, + "grad_norm": 0.582669734954834, + "learning_rate": 0.0005957751562003961, + "loss": 3.506, + "step": 3274 + }, + { + "epoch": 0.16, + "grad_norm": 0.5981149673461914, + "learning_rate": 0.0005957725810402874, + "loss": 3.717, + "step": 3275 + }, + { + "epoch": 0.16, + "grad_norm": 0.6032350063323975, + "learning_rate": 0.0005957700051011706, + "loss": 3.6699, + "step": 3276 + }, + { + "epoch": 0.16, + "grad_norm": 0.5754993557929993, + "learning_rate": 0.0005957674283830525, + "loss": 3.5732, + "step": 3277 + }, + { + "epoch": 0.16, + "grad_norm": 0.6205776929855347, + "learning_rate": 0.00059576485088594, + "loss": 3.6692, + "step": 3278 + }, + { + "epoch": 0.16, + "grad_norm": 0.5795102119445801, + "learning_rate": 0.0005957622726098397, + "loss": 3.574, + "step": 3279 + }, + { + "epoch": 0.16, + "grad_norm": 0.547251284122467, + "learning_rate": 0.0005957596935547586, + "loss": 3.7679, + "step": 3280 + }, + { + "epoch": 0.16, + "grad_norm": 0.5527950525283813, + "learning_rate": 0.0005957571137207032, + "loss": 3.4746, + "step": 3281 + }, + { + "epoch": 0.16, + "grad_norm": 0.5651281476020813, + "learning_rate": 0.0005957545331076806, + "loss": 3.5383, + "step": 3282 + }, + { + "epoch": 0.16, + "grad_norm": 0.599833071231842, + "learning_rate": 0.0005957519517156975, + "loss": 3.5415, + "step": 3283 + }, + { + "epoch": 0.16, + "grad_norm": 0.6276541352272034, + "learning_rate": 0.0005957493695447607, + "loss": 3.5067, + "step": 3284 + }, + { + "epoch": 0.16, + "grad_norm": 0.5757126212120056, + "learning_rate": 0.0005957467865948769, + "loss": 3.6089, + "step": 3285 + }, + { + "epoch": 0.16, + "grad_norm": 0.5513081550598145, + "learning_rate": 0.000595744202866053, + "loss": 3.6373, + "step": 3286 + }, + { + "epoch": 0.16, + "grad_norm": 0.5885028839111328, + "learning_rate": 0.0005957416183582958, + "loss": 3.7294, + "step": 3287 + }, + { + "epoch": 0.16, + "grad_norm": 0.5601800084114075, + "learning_rate": 0.0005957390330716121, + "loss": 3.5964, + "step": 3288 + }, + { + "epoch": 0.16, + "grad_norm": 0.6039920449256897, + "learning_rate": 0.0005957364470060087, + "loss": 3.3979, + "step": 3289 + }, + { + "epoch": 0.16, + "grad_norm": 0.5872266888618469, + "learning_rate": 0.0005957338601614924, + "loss": 3.6418, + "step": 3290 + }, + { + "epoch": 0.16, + "grad_norm": 0.5281251072883606, + "learning_rate": 0.0005957312725380701, + "loss": 3.5281, + "step": 3291 + }, + { + "epoch": 0.16, + "grad_norm": 0.5793739557266235, + "learning_rate": 0.0005957286841357485, + "loss": 3.6252, + "step": 3292 + }, + { + "epoch": 0.16, + "grad_norm": 0.6035564541816711, + "learning_rate": 0.0005957260949545345, + "loss": 3.4221, + "step": 3293 + }, + { + "epoch": 0.16, + "grad_norm": 0.5742641687393188, + "learning_rate": 0.0005957235049944347, + "loss": 3.7449, + "step": 3294 + }, + { + "epoch": 0.16, + "grad_norm": 0.5799552202224731, + "learning_rate": 0.0005957209142554563, + "loss": 3.7006, + "step": 3295 + }, + { + "epoch": 0.16, + "grad_norm": 0.5929350256919861, + "learning_rate": 0.0005957183227376058, + "loss": 3.5188, + "step": 3296 + }, + { + "epoch": 0.16, + "grad_norm": 0.5783431529998779, + "learning_rate": 0.0005957157304408902, + "loss": 3.4988, + "step": 3297 + }, + { + "epoch": 0.16, + "grad_norm": 0.6081163883209229, + "learning_rate": 0.0005957131373653163, + "loss": 3.7293, + "step": 3298 + }, + { + "epoch": 0.16, + "grad_norm": 0.5771412253379822, + "learning_rate": 0.0005957105435108909, + "loss": 3.5091, + "step": 3299 + }, + { + "epoch": 0.16, + "grad_norm": 0.5864212512969971, + "learning_rate": 0.0005957079488776208, + "loss": 3.6186, + "step": 3300 + }, + { + "epoch": 0.16, + "grad_norm": 0.5488468408584595, + "learning_rate": 0.0005957053534655129, + "loss": 3.617, + "step": 3301 + }, + { + "epoch": 0.16, + "grad_norm": 0.5773611068725586, + "learning_rate": 0.000595702757274574, + "loss": 3.6083, + "step": 3302 + }, + { + "epoch": 0.16, + "grad_norm": 0.5988139510154724, + "learning_rate": 0.0005957001603048109, + "loss": 3.4651, + "step": 3303 + }, + { + "epoch": 0.16, + "grad_norm": 0.5818193554878235, + "learning_rate": 0.0005956975625562305, + "loss": 3.3751, + "step": 3304 + }, + { + "epoch": 0.16, + "grad_norm": 0.5249636173248291, + "learning_rate": 0.0005956949640288397, + "loss": 3.6246, + "step": 3305 + }, + { + "epoch": 0.16, + "grad_norm": 0.5715710520744324, + "learning_rate": 0.0005956923647226453, + "loss": 3.5151, + "step": 3306 + }, + { + "epoch": 0.16, + "grad_norm": 0.5765474438667297, + "learning_rate": 0.000595689764637654, + "loss": 3.4035, + "step": 3307 + }, + { + "epoch": 0.16, + "grad_norm": 0.6147971153259277, + "learning_rate": 0.0005956871637738728, + "loss": 3.6643, + "step": 3308 + }, + { + "epoch": 0.16, + "grad_norm": 0.6140459775924683, + "learning_rate": 0.0005956845621313086, + "loss": 3.3888, + "step": 3309 + }, + { + "epoch": 0.16, + "grad_norm": 0.6852372884750366, + "learning_rate": 0.0005956819597099681, + "loss": 3.9077, + "step": 3310 + }, + { + "epoch": 0.16, + "grad_norm": 0.5585691928863525, + "learning_rate": 0.0005956793565098582, + "loss": 3.4769, + "step": 3311 + }, + { + "epoch": 0.16, + "grad_norm": 0.5834951996803284, + "learning_rate": 0.0005956767525309858, + "loss": 3.588, + "step": 3312 + }, + { + "epoch": 0.16, + "grad_norm": 0.6259799599647522, + "learning_rate": 0.0005956741477733578, + "loss": 3.5964, + "step": 3313 + }, + { + "epoch": 0.16, + "grad_norm": 0.5601211190223694, + "learning_rate": 0.0005956715422369809, + "loss": 3.5845, + "step": 3314 + }, + { + "epoch": 0.16, + "grad_norm": 0.5854201316833496, + "learning_rate": 0.0005956689359218621, + "loss": 3.4226, + "step": 3315 + }, + { + "epoch": 0.16, + "grad_norm": 0.5881748199462891, + "learning_rate": 0.0005956663288280082, + "loss": 3.4351, + "step": 3316 + }, + { + "epoch": 0.16, + "grad_norm": 0.6546245813369751, + "learning_rate": 0.0005956637209554262, + "loss": 3.44, + "step": 3317 + }, + { + "epoch": 0.16, + "grad_norm": 0.5833501815795898, + "learning_rate": 0.0005956611123041229, + "loss": 3.6924, + "step": 3318 + }, + { + "epoch": 0.16, + "grad_norm": 0.5682247877120972, + "learning_rate": 0.0005956585028741049, + "loss": 3.7348, + "step": 3319 + }, + { + "epoch": 0.16, + "grad_norm": 0.6428276896476746, + "learning_rate": 0.0005956558926653794, + "loss": 3.3296, + "step": 3320 + }, + { + "epoch": 0.16, + "grad_norm": 0.5545470714569092, + "learning_rate": 0.0005956532816779532, + "loss": 3.6842, + "step": 3321 + }, + { + "epoch": 0.16, + "grad_norm": 0.5805429220199585, + "learning_rate": 0.0005956506699118331, + "loss": 3.3973, + "step": 3322 + }, + { + "epoch": 0.16, + "grad_norm": 0.5832202434539795, + "learning_rate": 0.0005956480573670261, + "loss": 3.699, + "step": 3323 + }, + { + "epoch": 0.16, + "grad_norm": 0.5636213421821594, + "learning_rate": 0.000595645444043539, + "loss": 3.647, + "step": 3324 + }, + { + "epoch": 0.16, + "grad_norm": 0.599785327911377, + "learning_rate": 0.0005956428299413788, + "loss": 3.5346, + "step": 3325 + }, + { + "epoch": 0.16, + "grad_norm": 0.5496143698692322, + "learning_rate": 0.0005956402150605522, + "loss": 3.6491, + "step": 3326 + }, + { + "epoch": 0.16, + "grad_norm": 0.5501871109008789, + "learning_rate": 0.0005956375994010661, + "loss": 3.5646, + "step": 3327 + }, + { + "epoch": 0.16, + "grad_norm": 0.6170458793640137, + "learning_rate": 0.0005956349829629276, + "loss": 3.5171, + "step": 3328 + }, + { + "epoch": 0.16, + "grad_norm": 0.554207980632782, + "learning_rate": 0.0005956323657461434, + "loss": 3.4183, + "step": 3329 + }, + { + "epoch": 0.16, + "grad_norm": 0.6159272193908691, + "learning_rate": 0.0005956297477507204, + "loss": 3.5173, + "step": 3330 + }, + { + "epoch": 0.16, + "grad_norm": 0.5841159224510193, + "learning_rate": 0.0005956271289766654, + "loss": 3.5016, + "step": 3331 + }, + { + "epoch": 0.16, + "grad_norm": 0.5938801169395447, + "learning_rate": 0.0005956245094239857, + "loss": 3.557, + "step": 3332 + }, + { + "epoch": 0.16, + "grad_norm": 0.5655121207237244, + "learning_rate": 0.0005956218890926878, + "loss": 3.5488, + "step": 3333 + }, + { + "epoch": 0.16, + "grad_norm": 0.581397294998169, + "learning_rate": 0.0005956192679827788, + "loss": 3.6558, + "step": 3334 + }, + { + "epoch": 0.16, + "grad_norm": 0.5534145832061768, + "learning_rate": 0.0005956166460942654, + "loss": 3.6431, + "step": 3335 + }, + { + "epoch": 0.16, + "grad_norm": 0.6180419325828552, + "learning_rate": 0.0005956140234271549, + "loss": 3.5917, + "step": 3336 + }, + { + "epoch": 0.16, + "grad_norm": 0.5950748324394226, + "learning_rate": 0.0005956113999814537, + "loss": 3.5197, + "step": 3337 + }, + { + "epoch": 0.16, + "grad_norm": 0.5673226118087769, + "learning_rate": 0.0005956087757571691, + "loss": 3.465, + "step": 3338 + }, + { + "epoch": 0.16, + "grad_norm": 0.5869221091270447, + "learning_rate": 0.0005956061507543078, + "loss": 3.5244, + "step": 3339 + }, + { + "epoch": 0.16, + "grad_norm": 0.6073868870735168, + "learning_rate": 0.0005956035249728769, + "loss": 3.4927, + "step": 3340 + }, + { + "epoch": 0.16, + "grad_norm": 0.547737181186676, + "learning_rate": 0.0005956008984128831, + "loss": 3.4597, + "step": 3341 + }, + { + "epoch": 0.16, + "grad_norm": 0.5933374166488647, + "learning_rate": 0.0005955982710743336, + "loss": 3.5967, + "step": 3342 + }, + { + "epoch": 0.16, + "grad_norm": 0.564926266670227, + "learning_rate": 0.000595595642957235, + "loss": 3.5223, + "step": 3343 + }, + { + "epoch": 0.16, + "grad_norm": 0.625313937664032, + "learning_rate": 0.0005955930140615943, + "loss": 3.4885, + "step": 3344 + }, + { + "epoch": 0.16, + "grad_norm": 0.6144617199897766, + "learning_rate": 0.0005955903843874185, + "loss": 3.3671, + "step": 3345 + }, + { + "epoch": 0.16, + "grad_norm": 0.6368928551673889, + "learning_rate": 0.0005955877539347146, + "loss": 3.5404, + "step": 3346 + }, + { + "epoch": 0.16, + "grad_norm": 0.5616483688354492, + "learning_rate": 0.0005955851227034894, + "loss": 3.5257, + "step": 3347 + }, + { + "epoch": 0.16, + "grad_norm": 0.5560274720191956, + "learning_rate": 0.00059558249069375, + "loss": 3.5856, + "step": 3348 + }, + { + "epoch": 0.16, + "grad_norm": 0.6093305945396423, + "learning_rate": 0.0005955798579055031, + "loss": 3.6487, + "step": 3349 + }, + { + "epoch": 0.16, + "grad_norm": 0.5649324655532837, + "learning_rate": 0.0005955772243387556, + "loss": 3.5062, + "step": 3350 + }, + { + "epoch": 0.16, + "grad_norm": 0.5727810859680176, + "learning_rate": 0.0005955745899935147, + "loss": 3.2842, + "step": 3351 + }, + { + "epoch": 0.16, + "grad_norm": 0.6266840100288391, + "learning_rate": 0.0005955719548697873, + "loss": 3.7879, + "step": 3352 + }, + { + "epoch": 0.16, + "grad_norm": 0.6102510690689087, + "learning_rate": 0.0005955693189675802, + "loss": 3.71, + "step": 3353 + }, + { + "epoch": 0.16, + "grad_norm": 0.5587088465690613, + "learning_rate": 0.0005955666822869004, + "loss": 3.4238, + "step": 3354 + }, + { + "epoch": 0.16, + "grad_norm": 0.617514431476593, + "learning_rate": 0.0005955640448277548, + "loss": 3.6391, + "step": 3355 + }, + { + "epoch": 0.16, + "grad_norm": 0.5572774410247803, + "learning_rate": 0.0005955614065901504, + "loss": 3.6646, + "step": 3356 + }, + { + "epoch": 0.16, + "grad_norm": 0.598943829536438, + "learning_rate": 0.0005955587675740942, + "loss": 3.5632, + "step": 3357 + }, + { + "epoch": 0.16, + "grad_norm": 0.6389573812484741, + "learning_rate": 0.000595556127779593, + "loss": 3.4648, + "step": 3358 + }, + { + "epoch": 0.16, + "grad_norm": 0.5725786685943604, + "learning_rate": 0.0005955534872066538, + "loss": 3.387, + "step": 3359 + }, + { + "epoch": 0.16, + "grad_norm": 0.5625653862953186, + "learning_rate": 0.0005955508458552837, + "loss": 3.6279, + "step": 3360 + }, + { + "epoch": 0.16, + "grad_norm": 0.5734360814094543, + "learning_rate": 0.0005955482037254895, + "loss": 3.5787, + "step": 3361 + }, + { + "epoch": 0.16, + "grad_norm": 0.5375578999519348, + "learning_rate": 0.0005955455608172783, + "loss": 3.7722, + "step": 3362 + }, + { + "epoch": 0.16, + "grad_norm": 0.6041198968887329, + "learning_rate": 0.0005955429171306569, + "loss": 3.5613, + "step": 3363 + }, + { + "epoch": 0.16, + "grad_norm": 0.5527943968772888, + "learning_rate": 0.0005955402726656323, + "loss": 3.4843, + "step": 3364 + }, + { + "epoch": 0.16, + "grad_norm": 0.528593897819519, + "learning_rate": 0.0005955376274222116, + "loss": 3.5478, + "step": 3365 + }, + { + "epoch": 0.16, + "grad_norm": 0.5430445671081543, + "learning_rate": 0.0005955349814004016, + "loss": 3.5002, + "step": 3366 + }, + { + "epoch": 0.17, + "grad_norm": 0.5947743654251099, + "learning_rate": 0.0005955323346002094, + "loss": 3.454, + "step": 3367 + }, + { + "epoch": 0.17, + "grad_norm": 0.5306910872459412, + "learning_rate": 0.0005955296870216419, + "loss": 3.5488, + "step": 3368 + }, + { + "epoch": 0.17, + "grad_norm": 0.5531356930732727, + "learning_rate": 0.000595527038664706, + "loss": 3.7194, + "step": 3369 + }, + { + "epoch": 0.17, + "grad_norm": 0.601188600063324, + "learning_rate": 0.0005955243895294087, + "loss": 3.5034, + "step": 3370 + }, + { + "epoch": 0.17, + "grad_norm": 0.5490877628326416, + "learning_rate": 0.0005955217396157571, + "loss": 3.8795, + "step": 3371 + }, + { + "epoch": 0.17, + "grad_norm": 0.5771283507347107, + "learning_rate": 0.0005955190889237581, + "loss": 3.66, + "step": 3372 + }, + { + "epoch": 0.17, + "grad_norm": 0.5651893019676208, + "learning_rate": 0.0005955164374534187, + "loss": 3.3696, + "step": 3373 + }, + { + "epoch": 0.17, + "grad_norm": 0.5814728140830994, + "learning_rate": 0.0005955137852047459, + "loss": 3.6069, + "step": 3374 + }, + { + "epoch": 0.17, + "grad_norm": 0.5790256261825562, + "learning_rate": 0.0005955111321777467, + "loss": 3.4807, + "step": 3375 + }, + { + "epoch": 0.17, + "grad_norm": 0.5738055109977722, + "learning_rate": 0.0005955084783724281, + "loss": 3.4382, + "step": 3376 + }, + { + "epoch": 0.17, + "grad_norm": 0.5678678154945374, + "learning_rate": 0.0005955058237887968, + "loss": 3.3348, + "step": 3377 + }, + { + "epoch": 0.17, + "grad_norm": 0.5809141397476196, + "learning_rate": 0.0005955031684268603, + "loss": 3.5302, + "step": 3378 + }, + { + "epoch": 0.17, + "grad_norm": 0.5668545961380005, + "learning_rate": 0.0005955005122866251, + "loss": 3.437, + "step": 3379 + }, + { + "epoch": 0.17, + "grad_norm": 0.5487068891525269, + "learning_rate": 0.0005954978553680987, + "loss": 3.482, + "step": 3380 + }, + { + "epoch": 0.17, + "grad_norm": 0.5972830653190613, + "learning_rate": 0.0005954951976712876, + "loss": 3.5068, + "step": 3381 + }, + { + "epoch": 0.17, + "grad_norm": 0.5785419940948486, + "learning_rate": 0.0005954925391961991, + "loss": 3.4964, + "step": 3382 + }, + { + "epoch": 0.17, + "grad_norm": 0.5430408120155334, + "learning_rate": 0.00059548987994284, + "loss": 3.535, + "step": 3383 + }, + { + "epoch": 0.17, + "grad_norm": 0.5556647181510925, + "learning_rate": 0.0005954872199112175, + "loss": 3.589, + "step": 3384 + }, + { + "epoch": 0.17, + "grad_norm": 0.6139194369316101, + "learning_rate": 0.0005954845591013385, + "loss": 3.4409, + "step": 3385 + }, + { + "epoch": 0.17, + "grad_norm": 0.5702038407325745, + "learning_rate": 0.0005954818975132102, + "loss": 3.5252, + "step": 3386 + }, + { + "epoch": 0.17, + "grad_norm": 0.5292709469795227, + "learning_rate": 0.0005954792351468393, + "loss": 3.7446, + "step": 3387 + }, + { + "epoch": 0.17, + "grad_norm": 0.5479516983032227, + "learning_rate": 0.000595476572002233, + "loss": 3.789, + "step": 3388 + }, + { + "epoch": 0.17, + "grad_norm": 0.5670396089553833, + "learning_rate": 0.0005954739080793983, + "loss": 3.5434, + "step": 3389 + }, + { + "epoch": 0.17, + "grad_norm": 0.5650784969329834, + "learning_rate": 0.0005954712433783421, + "loss": 3.4521, + "step": 3390 + }, + { + "epoch": 0.17, + "grad_norm": 0.555449366569519, + "learning_rate": 0.0005954685778990715, + "loss": 3.6411, + "step": 3391 + }, + { + "epoch": 0.17, + "grad_norm": 0.5617675185203552, + "learning_rate": 0.0005954659116415936, + "loss": 3.5113, + "step": 3392 + }, + { + "epoch": 0.17, + "grad_norm": 0.545922040939331, + "learning_rate": 0.0005954632446059153, + "loss": 3.6585, + "step": 3393 + }, + { + "epoch": 0.17, + "grad_norm": 0.562699019908905, + "learning_rate": 0.0005954605767920437, + "loss": 3.5584, + "step": 3394 + }, + { + "epoch": 0.17, + "grad_norm": 0.5851036310195923, + "learning_rate": 0.0005954579081999859, + "loss": 3.3386, + "step": 3395 + }, + { + "epoch": 0.17, + "grad_norm": 0.6002635359764099, + "learning_rate": 0.0005954552388297488, + "loss": 3.6444, + "step": 3396 + }, + { + "epoch": 0.17, + "grad_norm": 0.5689345002174377, + "learning_rate": 0.0005954525686813394, + "loss": 3.5515, + "step": 3397 + }, + { + "epoch": 0.17, + "grad_norm": 0.5846789479255676, + "learning_rate": 0.0005954498977547648, + "loss": 3.5281, + "step": 3398 + }, + { + "epoch": 0.17, + "grad_norm": 0.5559812188148499, + "learning_rate": 0.0005954472260500321, + "loss": 3.4974, + "step": 3399 + }, + { + "epoch": 0.17, + "grad_norm": 0.6159769296646118, + "learning_rate": 0.0005954445535671482, + "loss": 3.4743, + "step": 3400 + }, + { + "epoch": 0.17, + "grad_norm": 0.6169503927230835, + "learning_rate": 0.0005954418803061201, + "loss": 3.3966, + "step": 3401 + }, + { + "epoch": 0.17, + "grad_norm": 0.5606275796890259, + "learning_rate": 0.0005954392062669551, + "loss": 3.6091, + "step": 3402 + }, + { + "epoch": 0.17, + "grad_norm": 0.5616119503974915, + "learning_rate": 0.0005954365314496602, + "loss": 3.3921, + "step": 3403 + }, + { + "epoch": 0.17, + "grad_norm": 0.5786251425743103, + "learning_rate": 0.0005954338558542421, + "loss": 3.5837, + "step": 3404 + }, + { + "epoch": 0.17, + "grad_norm": 0.645964503288269, + "learning_rate": 0.0005954311794807082, + "loss": 3.5519, + "step": 3405 + }, + { + "epoch": 0.17, + "grad_norm": 0.6374315619468689, + "learning_rate": 0.0005954285023290653, + "loss": 3.6006, + "step": 3406 + }, + { + "epoch": 0.17, + "grad_norm": 0.6593900322914124, + "learning_rate": 0.0005954258243993207, + "loss": 3.7227, + "step": 3407 + }, + { + "epoch": 0.17, + "grad_norm": 0.5792526602745056, + "learning_rate": 0.0005954231456914813, + "loss": 3.665, + "step": 3408 + }, + { + "epoch": 0.17, + "grad_norm": 0.5797976851463318, + "learning_rate": 0.0005954204662055542, + "loss": 3.3409, + "step": 3409 + }, + { + "epoch": 0.17, + "grad_norm": 0.5604798197746277, + "learning_rate": 0.0005954177859415465, + "loss": 3.416, + "step": 3410 + }, + { + "epoch": 0.17, + "grad_norm": 0.5253710150718689, + "learning_rate": 0.0005954151048994652, + "loss": 3.5158, + "step": 3411 + }, + { + "epoch": 0.17, + "grad_norm": 0.5831363797187805, + "learning_rate": 0.0005954124230793173, + "loss": 3.4324, + "step": 3412 + }, + { + "epoch": 0.17, + "grad_norm": 0.592633843421936, + "learning_rate": 0.00059540974048111, + "loss": 3.6425, + "step": 3413 + }, + { + "epoch": 0.17, + "grad_norm": 0.5312915444374084, + "learning_rate": 0.0005954070571048503, + "loss": 3.5972, + "step": 3414 + }, + { + "epoch": 0.17, + "grad_norm": 0.5762085914611816, + "learning_rate": 0.0005954043729505452, + "loss": 3.6361, + "step": 3415 + }, + { + "epoch": 0.17, + "grad_norm": 0.5579561591148376, + "learning_rate": 0.0005954016880182018, + "loss": 3.5233, + "step": 3416 + }, + { + "epoch": 0.17, + "grad_norm": 0.5626066327095032, + "learning_rate": 0.0005953990023078273, + "loss": 3.7441, + "step": 3417 + }, + { + "epoch": 0.17, + "grad_norm": 0.5891571044921875, + "learning_rate": 0.0005953963158194285, + "loss": 3.7897, + "step": 3418 + }, + { + "epoch": 0.17, + "grad_norm": 0.5446343421936035, + "learning_rate": 0.0005953936285530129, + "loss": 3.5544, + "step": 3419 + }, + { + "epoch": 0.17, + "grad_norm": 0.6051816940307617, + "learning_rate": 0.0005953909405085872, + "loss": 3.5061, + "step": 3420 + }, + { + "epoch": 0.17, + "grad_norm": 0.5892947316169739, + "learning_rate": 0.0005953882516861586, + "loss": 3.5149, + "step": 3421 + }, + { + "epoch": 0.17, + "grad_norm": 0.6093165874481201, + "learning_rate": 0.0005953855620857342, + "loss": 3.8573, + "step": 3422 + }, + { + "epoch": 0.17, + "grad_norm": 0.5854846239089966, + "learning_rate": 0.0005953828717073212, + "loss": 3.3395, + "step": 3423 + }, + { + "epoch": 0.17, + "grad_norm": 0.5802544951438904, + "learning_rate": 0.0005953801805509264, + "loss": 3.4692, + "step": 3424 + }, + { + "epoch": 0.17, + "grad_norm": 0.5809839367866516, + "learning_rate": 0.0005953774886165572, + "loss": 3.5708, + "step": 3425 + }, + { + "epoch": 0.17, + "grad_norm": 0.602876603603363, + "learning_rate": 0.0005953747959042204, + "loss": 3.4044, + "step": 3426 + }, + { + "epoch": 0.17, + "grad_norm": 0.6141166090965271, + "learning_rate": 0.0005953721024139233, + "loss": 3.3439, + "step": 3427 + }, + { + "epoch": 0.17, + "grad_norm": 0.5813528895378113, + "learning_rate": 0.000595369408145673, + "loss": 3.4252, + "step": 3428 + }, + { + "epoch": 0.17, + "grad_norm": 0.5566039085388184, + "learning_rate": 0.0005953667130994763, + "loss": 3.4517, + "step": 3429 + }, + { + "epoch": 0.17, + "grad_norm": 0.5614877939224243, + "learning_rate": 0.0005953640172753407, + "loss": 3.4067, + "step": 3430 + }, + { + "epoch": 0.17, + "grad_norm": 0.5762594938278198, + "learning_rate": 0.0005953613206732731, + "loss": 3.4889, + "step": 3431 + }, + { + "epoch": 0.17, + "grad_norm": 0.5990262031555176, + "learning_rate": 0.0005953586232932806, + "loss": 3.4092, + "step": 3432 + }, + { + "epoch": 0.17, + "grad_norm": 0.5427011251449585, + "learning_rate": 0.0005953559251353703, + "loss": 3.6497, + "step": 3433 + }, + { + "epoch": 0.17, + "grad_norm": 0.5713863968849182, + "learning_rate": 0.0005953532261995494, + "loss": 3.4972, + "step": 3434 + }, + { + "epoch": 0.17, + "grad_norm": 0.6282929182052612, + "learning_rate": 0.0005953505264858249, + "loss": 3.4623, + "step": 3435 + }, + { + "epoch": 0.17, + "grad_norm": 0.5442312359809875, + "learning_rate": 0.0005953478259942041, + "loss": 3.4691, + "step": 3436 + }, + { + "epoch": 0.17, + "grad_norm": 0.5707518458366394, + "learning_rate": 0.0005953451247246937, + "loss": 3.7231, + "step": 3437 + }, + { + "epoch": 0.17, + "grad_norm": 0.5848592519760132, + "learning_rate": 0.0005953424226773013, + "loss": 3.6997, + "step": 3438 + }, + { + "epoch": 0.17, + "grad_norm": 0.5771193504333496, + "learning_rate": 0.0005953397198520337, + "loss": 3.7628, + "step": 3439 + }, + { + "epoch": 0.17, + "grad_norm": 0.6253427863121033, + "learning_rate": 0.0005953370162488981, + "loss": 3.3619, + "step": 3440 + }, + { + "epoch": 0.17, + "grad_norm": 0.6061219573020935, + "learning_rate": 0.0005953343118679018, + "loss": 3.6352, + "step": 3441 + }, + { + "epoch": 0.17, + "grad_norm": 0.5454443693161011, + "learning_rate": 0.0005953316067090516, + "loss": 3.4859, + "step": 3442 + }, + { + "epoch": 0.17, + "grad_norm": 0.617382287979126, + "learning_rate": 0.0005953289007723548, + "loss": 3.5843, + "step": 3443 + }, + { + "epoch": 0.17, + "grad_norm": 0.5367785096168518, + "learning_rate": 0.0005953261940578186, + "loss": 3.4825, + "step": 3444 + }, + { + "epoch": 0.17, + "grad_norm": 0.5802174210548401, + "learning_rate": 0.00059532348656545, + "loss": 3.4618, + "step": 3445 + }, + { + "epoch": 0.17, + "grad_norm": 0.5566971302032471, + "learning_rate": 0.000595320778295256, + "loss": 3.5407, + "step": 3446 + }, + { + "epoch": 0.17, + "grad_norm": 0.5934160351753235, + "learning_rate": 0.0005953180692472441, + "loss": 3.5885, + "step": 3447 + }, + { + "epoch": 0.17, + "grad_norm": 0.5748066902160645, + "learning_rate": 0.0005953153594214212, + "loss": 3.8451, + "step": 3448 + }, + { + "epoch": 0.17, + "grad_norm": 0.6022830009460449, + "learning_rate": 0.0005953126488177946, + "loss": 3.5088, + "step": 3449 + }, + { + "epoch": 0.17, + "grad_norm": 0.6487731337547302, + "learning_rate": 0.0005953099374363711, + "loss": 3.4576, + "step": 3450 + }, + { + "epoch": 0.17, + "grad_norm": 0.6260390877723694, + "learning_rate": 0.0005953072252771581, + "loss": 3.4172, + "step": 3451 + }, + { + "epoch": 0.17, + "grad_norm": 0.5463585257530212, + "learning_rate": 0.0005953045123401628, + "loss": 3.6589, + "step": 3452 + }, + { + "epoch": 0.17, + "grad_norm": 0.6311070322990417, + "learning_rate": 0.0005953017986253922, + "loss": 3.6357, + "step": 3453 + }, + { + "epoch": 0.17, + "grad_norm": 0.5336967706680298, + "learning_rate": 0.0005952990841328536, + "loss": 3.4526, + "step": 3454 + }, + { + "epoch": 0.17, + "grad_norm": 0.5945968627929688, + "learning_rate": 0.0005952963688625538, + "loss": 3.5846, + "step": 3455 + }, + { + "epoch": 0.17, + "grad_norm": 0.5942106246948242, + "learning_rate": 0.0005952936528145004, + "loss": 3.5166, + "step": 3456 + }, + { + "epoch": 0.17, + "grad_norm": 0.5890370607376099, + "learning_rate": 0.0005952909359887002, + "loss": 3.7746, + "step": 3457 + }, + { + "epoch": 0.17, + "grad_norm": 0.5749627351760864, + "learning_rate": 0.0005952882183851606, + "loss": 3.4682, + "step": 3458 + }, + { + "epoch": 0.17, + "grad_norm": 0.59616619348526, + "learning_rate": 0.0005952855000038886, + "loss": 3.7142, + "step": 3459 + }, + { + "epoch": 0.17, + "grad_norm": 0.5930215716362, + "learning_rate": 0.0005952827808448916, + "loss": 3.7, + "step": 3460 + }, + { + "epoch": 0.17, + "grad_norm": 0.564595103263855, + "learning_rate": 0.0005952800609081764, + "loss": 3.5487, + "step": 3461 + }, + { + "epoch": 0.17, + "grad_norm": 0.5829174518585205, + "learning_rate": 0.0005952773401937504, + "loss": 3.6912, + "step": 3462 + }, + { + "epoch": 0.17, + "grad_norm": 0.5580412149429321, + "learning_rate": 0.0005952746187016208, + "loss": 3.5539, + "step": 3463 + }, + { + "epoch": 0.17, + "grad_norm": 0.5945863127708435, + "learning_rate": 0.0005952718964317945, + "loss": 3.567, + "step": 3464 + }, + { + "epoch": 0.17, + "grad_norm": 0.5504742860794067, + "learning_rate": 0.000595269173384279, + "loss": 3.6912, + "step": 3465 + }, + { + "epoch": 0.17, + "grad_norm": 0.6526525616645813, + "learning_rate": 0.0005952664495590813, + "loss": 3.4437, + "step": 3466 + }, + { + "epoch": 0.17, + "grad_norm": 0.5954388976097107, + "learning_rate": 0.0005952637249562085, + "loss": 3.6307, + "step": 3467 + }, + { + "epoch": 0.17, + "grad_norm": 0.712817907333374, + "learning_rate": 0.000595260999575668, + "loss": 3.6924, + "step": 3468 + }, + { + "epoch": 0.17, + "grad_norm": 0.6235787868499756, + "learning_rate": 0.0005952582734174667, + "loss": 3.5941, + "step": 3469 + }, + { + "epoch": 0.17, + "grad_norm": 0.5776662826538086, + "learning_rate": 0.0005952555464816122, + "loss": 3.8929, + "step": 3470 + }, + { + "epoch": 0.17, + "grad_norm": 0.6015415787696838, + "learning_rate": 0.0005952528187681111, + "loss": 3.6784, + "step": 3471 + }, + { + "epoch": 0.17, + "grad_norm": 0.5785424113273621, + "learning_rate": 0.0005952500902769711, + "loss": 3.7124, + "step": 3472 + }, + { + "epoch": 0.17, + "grad_norm": 0.6219342947006226, + "learning_rate": 0.0005952473610081991, + "loss": 3.6605, + "step": 3473 + }, + { + "epoch": 0.17, + "grad_norm": 0.5495757460594177, + "learning_rate": 0.0005952446309618023, + "loss": 3.5599, + "step": 3474 + }, + { + "epoch": 0.17, + "grad_norm": 0.5983782410621643, + "learning_rate": 0.0005952419001377881, + "loss": 3.5188, + "step": 3475 + }, + { + "epoch": 0.17, + "grad_norm": 0.6638477444648743, + "learning_rate": 0.0005952391685361634, + "loss": 3.6374, + "step": 3476 + }, + { + "epoch": 0.17, + "grad_norm": 0.5729101896286011, + "learning_rate": 0.0005952364361569358, + "loss": 3.4728, + "step": 3477 + }, + { + "epoch": 0.17, + "grad_norm": 0.5611720681190491, + "learning_rate": 0.000595233703000112, + "loss": 3.4283, + "step": 3478 + }, + { + "epoch": 0.17, + "grad_norm": 0.615308403968811, + "learning_rate": 0.0005952309690656997, + "loss": 3.6614, + "step": 3479 + }, + { + "epoch": 0.17, + "grad_norm": 0.6057369709014893, + "learning_rate": 0.0005952282343537057, + "loss": 3.4623, + "step": 3480 + }, + { + "epoch": 0.17, + "grad_norm": 0.6732720136642456, + "learning_rate": 0.0005952254988641373, + "loss": 3.7109, + "step": 3481 + }, + { + "epoch": 0.17, + "grad_norm": 0.5600842833518982, + "learning_rate": 0.0005952227625970018, + "loss": 3.6057, + "step": 3482 + }, + { + "epoch": 0.17, + "grad_norm": 0.5803719162940979, + "learning_rate": 0.0005952200255523063, + "loss": 3.6392, + "step": 3483 + }, + { + "epoch": 0.17, + "grad_norm": 0.6266722679138184, + "learning_rate": 0.0005952172877300582, + "loss": 3.5691, + "step": 3484 + }, + { + "epoch": 0.17, + "grad_norm": 0.5460732579231262, + "learning_rate": 0.0005952145491302645, + "loss": 3.3658, + "step": 3485 + }, + { + "epoch": 0.17, + "grad_norm": 0.5742864012718201, + "learning_rate": 0.0005952118097529325, + "loss": 3.7658, + "step": 3486 + }, + { + "epoch": 0.17, + "grad_norm": 0.5541781187057495, + "learning_rate": 0.0005952090695980695, + "loss": 3.4799, + "step": 3487 + }, + { + "epoch": 0.17, + "grad_norm": 0.6336858868598938, + "learning_rate": 0.0005952063286656824, + "loss": 3.2493, + "step": 3488 + }, + { + "epoch": 0.17, + "grad_norm": 0.5907337069511414, + "learning_rate": 0.0005952035869557788, + "loss": 3.4372, + "step": 3489 + }, + { + "epoch": 0.17, + "grad_norm": 0.5762500166893005, + "learning_rate": 0.0005952008444683657, + "loss": 3.8303, + "step": 3490 + }, + { + "epoch": 0.17, + "grad_norm": 0.580866813659668, + "learning_rate": 0.0005951981012034505, + "loss": 3.409, + "step": 3491 + }, + { + "epoch": 0.17, + "grad_norm": 0.5785665512084961, + "learning_rate": 0.0005951953571610403, + "loss": 3.4152, + "step": 3492 + }, + { + "epoch": 0.17, + "grad_norm": 0.5975906848907471, + "learning_rate": 0.0005951926123411422, + "loss": 3.554, + "step": 3493 + }, + { + "epoch": 0.17, + "grad_norm": 0.5543960332870483, + "learning_rate": 0.0005951898667437637, + "loss": 3.4609, + "step": 3494 + }, + { + "epoch": 0.17, + "grad_norm": 0.58534836769104, + "learning_rate": 0.0005951871203689118, + "loss": 3.5745, + "step": 3495 + }, + { + "epoch": 0.17, + "grad_norm": 0.5618305802345276, + "learning_rate": 0.0005951843732165938, + "loss": 3.6903, + "step": 3496 + }, + { + "epoch": 0.17, + "grad_norm": 0.6008855700492859, + "learning_rate": 0.0005951816252868172, + "loss": 3.5196, + "step": 3497 + }, + { + "epoch": 0.17, + "grad_norm": 0.6193830370903015, + "learning_rate": 0.0005951788765795889, + "loss": 3.4215, + "step": 3498 + }, + { + "epoch": 0.17, + "grad_norm": 0.5549221038818359, + "learning_rate": 0.0005951761270949161, + "loss": 3.4351, + "step": 3499 + }, + { + "epoch": 0.17, + "grad_norm": 0.5623039603233337, + "learning_rate": 0.0005951733768328063, + "loss": 3.6452, + "step": 3500 + }, + { + "epoch": 0.17, + "grad_norm": 0.5464016795158386, + "learning_rate": 0.0005951706257932667, + "loss": 3.518, + "step": 3501 + }, + { + "epoch": 0.17, + "grad_norm": 0.5889121294021606, + "learning_rate": 0.0005951678739763044, + "loss": 3.6592, + "step": 3502 + }, + { + "epoch": 0.17, + "grad_norm": 0.6298267245292664, + "learning_rate": 0.0005951651213819268, + "loss": 3.5137, + "step": 3503 + }, + { + "epoch": 0.17, + "grad_norm": 0.5713311433792114, + "learning_rate": 0.000595162368010141, + "loss": 3.5688, + "step": 3504 + }, + { + "epoch": 0.17, + "grad_norm": 0.610822856426239, + "learning_rate": 0.0005951596138609543, + "loss": 3.6497, + "step": 3505 + }, + { + "epoch": 0.17, + "grad_norm": 0.5696738362312317, + "learning_rate": 0.000595156858934374, + "loss": 3.3546, + "step": 3506 + }, + { + "epoch": 0.17, + "grad_norm": 0.5880289673805237, + "learning_rate": 0.0005951541032304075, + "loss": 3.7143, + "step": 3507 + }, + { + "epoch": 0.17, + "grad_norm": 0.6033392548561096, + "learning_rate": 0.0005951513467490617, + "loss": 3.5457, + "step": 3508 + }, + { + "epoch": 0.17, + "grad_norm": 0.5870084166526794, + "learning_rate": 0.0005951485894903441, + "loss": 3.4465, + "step": 3509 + }, + { + "epoch": 0.17, + "grad_norm": 0.5881246328353882, + "learning_rate": 0.0005951458314542619, + "loss": 3.6633, + "step": 3510 + }, + { + "epoch": 0.17, + "grad_norm": 0.5996487140655518, + "learning_rate": 0.0005951430726408224, + "loss": 3.5734, + "step": 3511 + }, + { + "epoch": 0.17, + "grad_norm": 0.5851162075996399, + "learning_rate": 0.0005951403130500329, + "loss": 3.5603, + "step": 3512 + }, + { + "epoch": 0.17, + "grad_norm": 0.5705832242965698, + "learning_rate": 0.0005951375526819006, + "loss": 3.5581, + "step": 3513 + }, + { + "epoch": 0.17, + "grad_norm": 0.5960780382156372, + "learning_rate": 0.0005951347915364327, + "loss": 3.4452, + "step": 3514 + }, + { + "epoch": 0.17, + "grad_norm": 0.5697128176689148, + "learning_rate": 0.0005951320296136367, + "loss": 3.6832, + "step": 3515 + }, + { + "epoch": 0.17, + "grad_norm": 0.5737714171409607, + "learning_rate": 0.0005951292669135197, + "loss": 3.6346, + "step": 3516 + }, + { + "epoch": 0.17, + "grad_norm": 0.6310943365097046, + "learning_rate": 0.0005951265034360889, + "loss": 3.3565, + "step": 3517 + }, + { + "epoch": 0.17, + "grad_norm": 0.5577863454818726, + "learning_rate": 0.0005951237391813518, + "loss": 3.4109, + "step": 3518 + }, + { + "epoch": 0.17, + "grad_norm": 0.5705747008323669, + "learning_rate": 0.0005951209741493154, + "loss": 3.4384, + "step": 3519 + }, + { + "epoch": 0.17, + "grad_norm": 0.5870479345321655, + "learning_rate": 0.0005951182083399874, + "loss": 3.5561, + "step": 3520 + }, + { + "epoch": 0.17, + "grad_norm": 0.5648481845855713, + "learning_rate": 0.0005951154417533748, + "loss": 3.3586, + "step": 3521 + }, + { + "epoch": 0.17, + "grad_norm": 0.5373155474662781, + "learning_rate": 0.0005951126743894846, + "loss": 3.7483, + "step": 3522 + }, + { + "epoch": 0.17, + "grad_norm": 0.5593510270118713, + "learning_rate": 0.0005951099062483248, + "loss": 3.4208, + "step": 3523 + }, + { + "epoch": 0.17, + "grad_norm": 0.5978959202766418, + "learning_rate": 0.0005951071373299021, + "loss": 3.5202, + "step": 3524 + }, + { + "epoch": 0.17, + "grad_norm": 0.5732042789459229, + "learning_rate": 0.0005951043676342241, + "loss": 3.5725, + "step": 3525 + }, + { + "epoch": 0.17, + "grad_norm": 0.5535354018211365, + "learning_rate": 0.000595101597161298, + "loss": 3.741, + "step": 3526 + }, + { + "epoch": 0.17, + "grad_norm": 0.5729936957359314, + "learning_rate": 0.0005950988259111309, + "loss": 3.7625, + "step": 3527 + }, + { + "epoch": 0.17, + "grad_norm": 0.5622599124908447, + "learning_rate": 0.0005950960538837305, + "loss": 3.3271, + "step": 3528 + }, + { + "epoch": 0.17, + "grad_norm": 0.5429548025131226, + "learning_rate": 0.0005950932810791038, + "loss": 3.5124, + "step": 3529 + }, + { + "epoch": 0.17, + "grad_norm": 0.5643887519836426, + "learning_rate": 0.0005950905074972581, + "loss": 3.4261, + "step": 3530 + }, + { + "epoch": 0.17, + "grad_norm": 0.548894464969635, + "learning_rate": 0.0005950877331382009, + "loss": 3.6266, + "step": 3531 + }, + { + "epoch": 0.17, + "grad_norm": 0.5614020228385925, + "learning_rate": 0.0005950849580019393, + "loss": 3.4156, + "step": 3532 + }, + { + "epoch": 0.17, + "grad_norm": 0.6196742653846741, + "learning_rate": 0.0005950821820884808, + "loss": 3.5224, + "step": 3533 + }, + { + "epoch": 0.17, + "grad_norm": 0.5436410903930664, + "learning_rate": 0.0005950794053978327, + "loss": 3.4044, + "step": 3534 + }, + { + "epoch": 0.17, + "grad_norm": 0.5510107278823853, + "learning_rate": 0.0005950766279300021, + "loss": 3.438, + "step": 3535 + }, + { + "epoch": 0.17, + "grad_norm": 0.6485077738761902, + "learning_rate": 0.0005950738496849965, + "loss": 3.4866, + "step": 3536 + }, + { + "epoch": 0.17, + "grad_norm": 0.5946866869926453, + "learning_rate": 0.0005950710706628232, + "loss": 3.5581, + "step": 3537 + }, + { + "epoch": 0.17, + "grad_norm": 0.6310849189758301, + "learning_rate": 0.0005950682908634895, + "loss": 3.4498, + "step": 3538 + }, + { + "epoch": 0.17, + "grad_norm": 0.5689077973365784, + "learning_rate": 0.0005950655102870026, + "loss": 3.4855, + "step": 3539 + }, + { + "epoch": 0.17, + "grad_norm": 0.5511724352836609, + "learning_rate": 0.00059506272893337, + "loss": 3.6545, + "step": 3540 + }, + { + "epoch": 0.17, + "grad_norm": 0.5716314911842346, + "learning_rate": 0.000595059946802599, + "loss": 3.5961, + "step": 3541 + }, + { + "epoch": 0.17, + "grad_norm": 0.5928378105163574, + "learning_rate": 0.0005950571638946968, + "loss": 3.3571, + "step": 3542 + }, + { + "epoch": 0.17, + "grad_norm": 0.5479989051818848, + "learning_rate": 0.0005950543802096708, + "loss": 3.328, + "step": 3543 + }, + { + "epoch": 0.17, + "grad_norm": 0.5652512311935425, + "learning_rate": 0.0005950515957475284, + "loss": 3.6244, + "step": 3544 + }, + { + "epoch": 0.17, + "grad_norm": 0.6006174683570862, + "learning_rate": 0.000595048810508277, + "loss": 3.5703, + "step": 3545 + }, + { + "epoch": 0.17, + "grad_norm": 0.5618359446525574, + "learning_rate": 0.0005950460244919236, + "loss": 3.5093, + "step": 3546 + }, + { + "epoch": 0.17, + "grad_norm": 0.6154499053955078, + "learning_rate": 0.0005950432376984758, + "loss": 3.6431, + "step": 3547 + }, + { + "epoch": 0.17, + "grad_norm": 0.5481139421463013, + "learning_rate": 0.000595040450127941, + "loss": 3.5728, + "step": 3548 + }, + { + "epoch": 0.17, + "grad_norm": 0.5588555932044983, + "learning_rate": 0.0005950376617803264, + "loss": 3.5644, + "step": 3549 + }, + { + "epoch": 0.17, + "grad_norm": 0.5340237021446228, + "learning_rate": 0.0005950348726556393, + "loss": 3.6245, + "step": 3550 + }, + { + "epoch": 0.17, + "grad_norm": 0.5997275710105896, + "learning_rate": 0.0005950320827538873, + "loss": 3.4301, + "step": 3551 + }, + { + "epoch": 0.17, + "grad_norm": 0.6511825919151306, + "learning_rate": 0.0005950292920750773, + "loss": 3.5778, + "step": 3552 + }, + { + "epoch": 0.17, + "grad_norm": 0.6428468823432922, + "learning_rate": 0.0005950265006192171, + "loss": 3.6624, + "step": 3553 + }, + { + "epoch": 0.17, + "grad_norm": 0.576602041721344, + "learning_rate": 0.0005950237083863139, + "loss": 3.6925, + "step": 3554 + }, + { + "epoch": 0.17, + "grad_norm": 0.5820669531822205, + "learning_rate": 0.0005950209153763751, + "loss": 3.4164, + "step": 3555 + }, + { + "epoch": 0.17, + "grad_norm": 0.5869800448417664, + "learning_rate": 0.0005950181215894078, + "loss": 3.6757, + "step": 3556 + }, + { + "epoch": 0.17, + "grad_norm": 0.5729996562004089, + "learning_rate": 0.0005950153270254196, + "loss": 3.4834, + "step": 3557 + }, + { + "epoch": 0.17, + "grad_norm": 0.5958981513977051, + "learning_rate": 0.0005950125316844179, + "loss": 3.5674, + "step": 3558 + }, + { + "epoch": 0.17, + "grad_norm": 0.6202664375305176, + "learning_rate": 0.0005950097355664098, + "loss": 3.4014, + "step": 3559 + }, + { + "epoch": 0.17, + "grad_norm": 0.5707253813743591, + "learning_rate": 0.000595006938671403, + "loss": 3.5684, + "step": 3560 + }, + { + "epoch": 0.17, + "grad_norm": 0.6256164908409119, + "learning_rate": 0.0005950041409994046, + "loss": 3.6933, + "step": 3561 + }, + { + "epoch": 0.17, + "grad_norm": 0.5959612131118774, + "learning_rate": 0.0005950013425504221, + "loss": 3.553, + "step": 3562 + }, + { + "epoch": 0.17, + "grad_norm": 0.8165073394775391, + "learning_rate": 0.0005949985433244628, + "loss": 3.6789, + "step": 3563 + }, + { + "epoch": 0.17, + "grad_norm": 0.5742766261100769, + "learning_rate": 0.0005949957433215342, + "loss": 3.2581, + "step": 3564 + }, + { + "epoch": 0.17, + "grad_norm": 0.5817901492118835, + "learning_rate": 0.0005949929425416435, + "loss": 3.4921, + "step": 3565 + }, + { + "epoch": 0.17, + "grad_norm": 0.561545729637146, + "learning_rate": 0.0005949901409847982, + "loss": 3.4838, + "step": 3566 + }, + { + "epoch": 0.17, + "grad_norm": 0.5388302803039551, + "learning_rate": 0.0005949873386510056, + "loss": 3.5186, + "step": 3567 + }, + { + "epoch": 0.17, + "grad_norm": 0.5865461826324463, + "learning_rate": 0.0005949845355402732, + "loss": 3.4544, + "step": 3568 + }, + { + "epoch": 0.17, + "grad_norm": 0.5656617879867554, + "learning_rate": 0.0005949817316526083, + "loss": 3.6103, + "step": 3569 + }, + { + "epoch": 0.17, + "grad_norm": 0.5684191584587097, + "learning_rate": 0.0005949789269880182, + "loss": 3.4794, + "step": 3570 + }, + { + "epoch": 0.18, + "grad_norm": 0.5902529358863831, + "learning_rate": 0.0005949761215465104, + "loss": 3.5839, + "step": 3571 + }, + { + "epoch": 0.18, + "grad_norm": 0.6156822443008423, + "learning_rate": 0.0005949733153280922, + "loss": 3.6531, + "step": 3572 + }, + { + "epoch": 0.18, + "grad_norm": 0.5745266079902649, + "learning_rate": 0.0005949705083327713, + "loss": 3.7956, + "step": 3573 + }, + { + "epoch": 0.18, + "grad_norm": 0.5861325860023499, + "learning_rate": 0.0005949677005605546, + "loss": 3.6641, + "step": 3574 + }, + { + "epoch": 0.18, + "grad_norm": 0.5582994818687439, + "learning_rate": 0.0005949648920114499, + "loss": 3.4384, + "step": 3575 + }, + { + "epoch": 0.18, + "grad_norm": 0.5631774663925171, + "learning_rate": 0.0005949620826854644, + "loss": 3.3741, + "step": 3576 + }, + { + "epoch": 0.18, + "grad_norm": 0.5708392858505249, + "learning_rate": 0.0005949592725826055, + "loss": 3.4894, + "step": 3577 + }, + { + "epoch": 0.18, + "grad_norm": 0.6094598770141602, + "learning_rate": 0.0005949564617028807, + "loss": 3.3773, + "step": 3578 + }, + { + "epoch": 0.18, + "grad_norm": 0.6077072620391846, + "learning_rate": 0.0005949536500462972, + "loss": 3.4769, + "step": 3579 + }, + { + "epoch": 0.18, + "grad_norm": 0.5706542134284973, + "learning_rate": 0.0005949508376128627, + "loss": 3.5485, + "step": 3580 + }, + { + "epoch": 0.18, + "grad_norm": 0.54489666223526, + "learning_rate": 0.0005949480244025844, + "loss": 3.4537, + "step": 3581 + }, + { + "epoch": 0.18, + "grad_norm": 0.560539960861206, + "learning_rate": 0.0005949452104154698, + "loss": 3.6606, + "step": 3582 + }, + { + "epoch": 0.18, + "grad_norm": 0.6367251873016357, + "learning_rate": 0.0005949423956515263, + "loss": 3.4471, + "step": 3583 + }, + { + "epoch": 0.18, + "grad_norm": 0.5641475915908813, + "learning_rate": 0.0005949395801107612, + "loss": 3.6803, + "step": 3584 + }, + { + "epoch": 0.18, + "grad_norm": 0.5714133977890015, + "learning_rate": 0.0005949367637931821, + "loss": 3.6771, + "step": 3585 + }, + { + "epoch": 0.18, + "grad_norm": 0.539216935634613, + "learning_rate": 0.0005949339466987963, + "loss": 3.5072, + "step": 3586 + }, + { + "epoch": 0.18, + "grad_norm": 0.5528108477592468, + "learning_rate": 0.0005949311288276113, + "loss": 3.6318, + "step": 3587 + }, + { + "epoch": 0.18, + "grad_norm": 0.5910629630088806, + "learning_rate": 0.0005949283101796342, + "loss": 3.3507, + "step": 3588 + }, + { + "epoch": 0.18, + "grad_norm": 0.640604555606842, + "learning_rate": 0.000594925490754873, + "loss": 3.4874, + "step": 3589 + }, + { + "epoch": 0.18, + "grad_norm": 0.6236400008201599, + "learning_rate": 0.0005949226705533348, + "loss": 3.4188, + "step": 3590 + }, + { + "epoch": 0.18, + "grad_norm": 0.5964713096618652, + "learning_rate": 0.0005949198495750268, + "loss": 3.4705, + "step": 3591 + }, + { + "epoch": 0.18, + "grad_norm": 0.5554364323616028, + "learning_rate": 0.0005949170278199569, + "loss": 3.4395, + "step": 3592 + }, + { + "epoch": 0.18, + "grad_norm": 0.5770800709724426, + "learning_rate": 0.0005949142052881322, + "loss": 3.5081, + "step": 3593 + }, + { + "epoch": 0.18, + "grad_norm": 0.6268064975738525, + "learning_rate": 0.0005949113819795603, + "loss": 3.5902, + "step": 3594 + }, + { + "epoch": 0.18, + "grad_norm": 0.5979000329971313, + "learning_rate": 0.0005949085578942486, + "loss": 3.3657, + "step": 3595 + }, + { + "epoch": 0.18, + "grad_norm": 0.5864534974098206, + "learning_rate": 0.0005949057330322044, + "loss": 3.6838, + "step": 3596 + }, + { + "epoch": 0.18, + "grad_norm": 0.5664005279541016, + "learning_rate": 0.0005949029073934354, + "loss": 3.4289, + "step": 3597 + }, + { + "epoch": 0.18, + "grad_norm": 0.5910850167274475, + "learning_rate": 0.0005949000809779488, + "loss": 3.5872, + "step": 3598 + }, + { + "epoch": 0.18, + "grad_norm": 0.590392529964447, + "learning_rate": 0.0005948972537857521, + "loss": 3.4473, + "step": 3599 + }, + { + "epoch": 0.18, + "grad_norm": 0.5588194727897644, + "learning_rate": 0.0005948944258168529, + "loss": 3.4485, + "step": 3600 + }, + { + "epoch": 0.18, + "grad_norm": 0.5453645586967468, + "learning_rate": 0.0005948915970712585, + "loss": 3.6381, + "step": 3601 + }, + { + "epoch": 0.18, + "grad_norm": 0.5611460208892822, + "learning_rate": 0.0005948887675489763, + "loss": 3.7556, + "step": 3602 + }, + { + "epoch": 0.18, + "grad_norm": 0.5924997329711914, + "learning_rate": 0.000594885937250014, + "loss": 3.5867, + "step": 3603 + }, + { + "epoch": 0.18, + "grad_norm": 0.5849661231040955, + "learning_rate": 0.0005948831061743788, + "loss": 3.5556, + "step": 3604 + }, + { + "epoch": 0.18, + "grad_norm": 0.5972135066986084, + "learning_rate": 0.0005948802743220782, + "loss": 3.4393, + "step": 3605 + }, + { + "epoch": 0.18, + "grad_norm": 0.5754001140594482, + "learning_rate": 0.0005948774416931197, + "loss": 3.5528, + "step": 3606 + }, + { + "epoch": 0.18, + "grad_norm": 0.592771053314209, + "learning_rate": 0.0005948746082875109, + "loss": 3.4963, + "step": 3607 + }, + { + "epoch": 0.18, + "grad_norm": 0.6338792443275452, + "learning_rate": 0.0005948717741052591, + "loss": 3.2424, + "step": 3608 + }, + { + "epoch": 0.18, + "grad_norm": 0.5741003751754761, + "learning_rate": 0.0005948689391463718, + "loss": 3.6585, + "step": 3609 + }, + { + "epoch": 0.18, + "grad_norm": 0.5677915811538696, + "learning_rate": 0.0005948661034108564, + "loss": 3.5649, + "step": 3610 + }, + { + "epoch": 0.18, + "grad_norm": 0.59786057472229, + "learning_rate": 0.0005948632668987204, + "loss": 3.6715, + "step": 3611 + }, + { + "epoch": 0.18, + "grad_norm": 0.5489624738693237, + "learning_rate": 0.0005948604296099714, + "loss": 3.3292, + "step": 3612 + }, + { + "epoch": 0.18, + "grad_norm": 0.5429912209510803, + "learning_rate": 0.0005948575915446167, + "loss": 3.6417, + "step": 3613 + }, + { + "epoch": 0.18, + "grad_norm": 0.637424647808075, + "learning_rate": 0.0005948547527026639, + "loss": 3.4017, + "step": 3614 + }, + { + "epoch": 0.18, + "grad_norm": 0.5805572867393494, + "learning_rate": 0.0005948519130841203, + "loss": 3.4764, + "step": 3615 + }, + { + "epoch": 0.18, + "grad_norm": 0.6204468011856079, + "learning_rate": 0.0005948490726889937, + "loss": 3.5658, + "step": 3616 + }, + { + "epoch": 0.18, + "grad_norm": 0.5520532131195068, + "learning_rate": 0.0005948462315172913, + "loss": 3.7156, + "step": 3617 + }, + { + "epoch": 0.18, + "grad_norm": 0.5868279933929443, + "learning_rate": 0.0005948433895690207, + "loss": 3.5782, + "step": 3618 + }, + { + "epoch": 0.18, + "grad_norm": 0.5735620856285095, + "learning_rate": 0.0005948405468441893, + "loss": 3.5567, + "step": 3619 + }, + { + "epoch": 0.18, + "grad_norm": 0.5258257389068604, + "learning_rate": 0.0005948377033428046, + "loss": 3.4518, + "step": 3620 + }, + { + "epoch": 0.18, + "grad_norm": 0.5907759070396423, + "learning_rate": 0.0005948348590648743, + "loss": 3.4862, + "step": 3621 + }, + { + "epoch": 0.18, + "grad_norm": 0.5513227581977844, + "learning_rate": 0.0005948320140104056, + "loss": 3.3782, + "step": 3622 + }, + { + "epoch": 0.18, + "grad_norm": 0.620806097984314, + "learning_rate": 0.0005948291681794062, + "loss": 3.4239, + "step": 3623 + }, + { + "epoch": 0.18, + "grad_norm": 0.6315174698829651, + "learning_rate": 0.0005948263215718834, + "loss": 3.5383, + "step": 3624 + }, + { + "epoch": 0.18, + "grad_norm": 0.6031731963157654, + "learning_rate": 0.0005948234741878449, + "loss": 3.4314, + "step": 3625 + }, + { + "epoch": 0.18, + "grad_norm": 0.6577680110931396, + "learning_rate": 0.0005948206260272981, + "loss": 3.4231, + "step": 3626 + }, + { + "epoch": 0.18, + "grad_norm": 0.5863077640533447, + "learning_rate": 0.0005948177770902504, + "loss": 3.5375, + "step": 3627 + }, + { + "epoch": 0.18, + "grad_norm": 0.5604285597801208, + "learning_rate": 0.0005948149273767095, + "loss": 3.5875, + "step": 3628 + }, + { + "epoch": 0.18, + "grad_norm": 0.6838080286979675, + "learning_rate": 0.0005948120768866828, + "loss": 3.4889, + "step": 3629 + }, + { + "epoch": 0.18, + "grad_norm": 0.5543857216835022, + "learning_rate": 0.0005948092256201779, + "loss": 3.6624, + "step": 3630 + }, + { + "epoch": 0.18, + "grad_norm": 0.5770386457443237, + "learning_rate": 0.0005948063735772022, + "loss": 3.6973, + "step": 3631 + }, + { + "epoch": 0.18, + "grad_norm": 0.6096564531326294, + "learning_rate": 0.0005948035207577632, + "loss": 3.6492, + "step": 3632 + }, + { + "epoch": 0.18, + "grad_norm": 0.6562588810920715, + "learning_rate": 0.0005948006671618685, + "loss": 3.6059, + "step": 3633 + }, + { + "epoch": 0.18, + "grad_norm": 0.576315701007843, + "learning_rate": 0.0005947978127895256, + "loss": 3.7477, + "step": 3634 + }, + { + "epoch": 0.18, + "grad_norm": 0.5762778520584106, + "learning_rate": 0.000594794957640742, + "loss": 3.5309, + "step": 3635 + }, + { + "epoch": 0.18, + "grad_norm": 0.596248984336853, + "learning_rate": 0.0005947921017155251, + "loss": 3.5666, + "step": 3636 + }, + { + "epoch": 0.18, + "grad_norm": 0.5929814577102661, + "learning_rate": 0.0005947892450138827, + "loss": 3.4502, + "step": 3637 + }, + { + "epoch": 0.18, + "grad_norm": 0.581706702709198, + "learning_rate": 0.000594786387535822, + "loss": 3.4126, + "step": 3638 + }, + { + "epoch": 0.18, + "grad_norm": 0.609158992767334, + "learning_rate": 0.0005947835292813507, + "loss": 3.6607, + "step": 3639 + }, + { + "epoch": 0.18, + "grad_norm": 0.56010901927948, + "learning_rate": 0.0005947806702504765, + "loss": 3.5313, + "step": 3640 + }, + { + "epoch": 0.18, + "grad_norm": 0.5166653990745544, + "learning_rate": 0.0005947778104432067, + "loss": 3.6868, + "step": 3641 + }, + { + "epoch": 0.18, + "grad_norm": 0.6067258715629578, + "learning_rate": 0.0005947749498595487, + "loss": 3.4264, + "step": 3642 + }, + { + "epoch": 0.18, + "grad_norm": 0.5833512544631958, + "learning_rate": 0.0005947720884995104, + "loss": 3.5246, + "step": 3643 + }, + { + "epoch": 0.18, + "grad_norm": 0.5999512076377869, + "learning_rate": 0.000594769226363099, + "loss": 3.3084, + "step": 3644 + }, + { + "epoch": 0.18, + "grad_norm": 0.5532363057136536, + "learning_rate": 0.0005947663634503223, + "loss": 3.6718, + "step": 3645 + }, + { + "epoch": 0.18, + "grad_norm": 0.5565996170043945, + "learning_rate": 0.0005947634997611876, + "loss": 3.4042, + "step": 3646 + }, + { + "epoch": 0.18, + "grad_norm": 0.6335086822509766, + "learning_rate": 0.0005947606352957027, + "loss": 3.5925, + "step": 3647 + }, + { + "epoch": 0.18, + "grad_norm": 0.5904930830001831, + "learning_rate": 0.0005947577700538751, + "loss": 3.4858, + "step": 3648 + }, + { + "epoch": 0.18, + "grad_norm": 0.5819961428642273, + "learning_rate": 0.000594754904035712, + "loss": 3.4756, + "step": 3649 + }, + { + "epoch": 0.18, + "grad_norm": 0.5676370859146118, + "learning_rate": 0.0005947520372412214, + "loss": 3.3994, + "step": 3650 + }, + { + "epoch": 0.18, + "grad_norm": 0.528371274471283, + "learning_rate": 0.0005947491696704105, + "loss": 3.5782, + "step": 3651 + }, + { + "epoch": 0.18, + "grad_norm": 0.5858977437019348, + "learning_rate": 0.0005947463013232872, + "loss": 3.5947, + "step": 3652 + }, + { + "epoch": 0.18, + "grad_norm": 0.6060065627098083, + "learning_rate": 0.0005947434321998588, + "loss": 3.3542, + "step": 3653 + }, + { + "epoch": 0.18, + "grad_norm": 0.6193976402282715, + "learning_rate": 0.000594740562300133, + "loss": 3.4906, + "step": 3654 + }, + { + "epoch": 0.18, + "grad_norm": 0.6060580015182495, + "learning_rate": 0.000594737691624117, + "loss": 3.4806, + "step": 3655 + }, + { + "epoch": 0.18, + "grad_norm": 0.6116194725036621, + "learning_rate": 0.000594734820171819, + "loss": 3.5376, + "step": 3656 + }, + { + "epoch": 0.18, + "grad_norm": 0.5761888027191162, + "learning_rate": 0.000594731947943246, + "loss": 3.5894, + "step": 3657 + }, + { + "epoch": 0.18, + "grad_norm": 0.5662036538124084, + "learning_rate": 0.0005947290749384059, + "loss": 3.5399, + "step": 3658 + }, + { + "epoch": 0.18, + "grad_norm": 0.6075831055641174, + "learning_rate": 0.0005947262011573059, + "loss": 3.7756, + "step": 3659 + }, + { + "epoch": 0.18, + "grad_norm": 0.5735023617744446, + "learning_rate": 0.000594723326599954, + "loss": 3.48, + "step": 3660 + }, + { + "epoch": 0.18, + "grad_norm": 0.5627445578575134, + "learning_rate": 0.0005947204512663575, + "loss": 3.6544, + "step": 3661 + }, + { + "epoch": 0.18, + "grad_norm": 0.5845668911933899, + "learning_rate": 0.0005947175751565241, + "loss": 3.4011, + "step": 3662 + }, + { + "epoch": 0.18, + "grad_norm": 0.5823236107826233, + "learning_rate": 0.0005947146982704613, + "loss": 3.2938, + "step": 3663 + }, + { + "epoch": 0.18, + "grad_norm": 0.5699519515037537, + "learning_rate": 0.0005947118206081767, + "loss": 3.7, + "step": 3664 + }, + { + "epoch": 0.18, + "grad_norm": 0.6400408744812012, + "learning_rate": 0.0005947089421696779, + "loss": 3.572, + "step": 3665 + }, + { + "epoch": 0.18, + "grad_norm": 0.5679846405982971, + "learning_rate": 0.0005947060629549725, + "loss": 3.7148, + "step": 3666 + }, + { + "epoch": 0.18, + "grad_norm": 0.5408234000205994, + "learning_rate": 0.0005947031829640681, + "loss": 3.35, + "step": 3667 + }, + { + "epoch": 0.18, + "grad_norm": 0.5761770606040955, + "learning_rate": 0.0005947003021969721, + "loss": 3.5302, + "step": 3668 + }, + { + "epoch": 0.18, + "grad_norm": 0.5421696305274963, + "learning_rate": 0.0005946974206536923, + "loss": 3.7173, + "step": 3669 + }, + { + "epoch": 0.18, + "grad_norm": 0.5892155170440674, + "learning_rate": 0.0005946945383342361, + "loss": 3.5065, + "step": 3670 + }, + { + "epoch": 0.18, + "grad_norm": 0.6017743349075317, + "learning_rate": 0.0005946916552386113, + "loss": 3.4401, + "step": 3671 + }, + { + "epoch": 0.18, + "grad_norm": 0.5975637435913086, + "learning_rate": 0.0005946887713668253, + "loss": 3.6127, + "step": 3672 + }, + { + "epoch": 0.18, + "grad_norm": 0.5787088871002197, + "learning_rate": 0.000594685886718886, + "loss": 3.635, + "step": 3673 + }, + { + "epoch": 0.18, + "grad_norm": 0.5916794538497925, + "learning_rate": 0.0005946830012948006, + "loss": 3.4734, + "step": 3674 + }, + { + "epoch": 0.18, + "grad_norm": 0.5836448669433594, + "learning_rate": 0.000594680115094577, + "loss": 3.4839, + "step": 3675 + }, + { + "epoch": 0.18, + "grad_norm": 0.5569721460342407, + "learning_rate": 0.0005946772281182226, + "loss": 3.3795, + "step": 3676 + }, + { + "epoch": 0.18, + "grad_norm": 0.6041418313980103, + "learning_rate": 0.000594674340365745, + "loss": 3.8186, + "step": 3677 + }, + { + "epoch": 0.18, + "grad_norm": 0.5707287788391113, + "learning_rate": 0.0005946714518371521, + "loss": 3.5052, + "step": 3678 + }, + { + "epoch": 0.18, + "grad_norm": 0.5691279768943787, + "learning_rate": 0.0005946685625324512, + "loss": 3.693, + "step": 3679 + }, + { + "epoch": 0.18, + "grad_norm": 0.545986533164978, + "learning_rate": 0.00059466567245165, + "loss": 3.4695, + "step": 3680 + }, + { + "epoch": 0.18, + "grad_norm": 0.5766247510910034, + "learning_rate": 0.0005946627815947562, + "loss": 3.6347, + "step": 3681 + }, + { + "epoch": 0.18, + "grad_norm": 0.624789834022522, + "learning_rate": 0.0005946598899617773, + "loss": 3.4847, + "step": 3682 + }, + { + "epoch": 0.18, + "grad_norm": 0.5886110067367554, + "learning_rate": 0.0005946569975527209, + "loss": 3.5243, + "step": 3683 + }, + { + "epoch": 0.18, + "grad_norm": 0.6018827557563782, + "learning_rate": 0.0005946541043675948, + "loss": 3.4334, + "step": 3684 + }, + { + "epoch": 0.18, + "grad_norm": 0.595812201499939, + "learning_rate": 0.0005946512104064065, + "loss": 3.551, + "step": 3685 + }, + { + "epoch": 0.18, + "grad_norm": 0.6566250920295715, + "learning_rate": 0.0005946483156691636, + "loss": 3.6164, + "step": 3686 + }, + { + "epoch": 0.18, + "grad_norm": 0.5355682373046875, + "learning_rate": 0.0005946454201558736, + "loss": 3.5166, + "step": 3687 + }, + { + "epoch": 0.18, + "grad_norm": 0.5876386761665344, + "learning_rate": 0.0005946425238665445, + "loss": 3.4562, + "step": 3688 + }, + { + "epoch": 0.18, + "grad_norm": 0.5719892978668213, + "learning_rate": 0.0005946396268011834, + "loss": 3.3293, + "step": 3689 + }, + { + "epoch": 0.18, + "grad_norm": 0.5640550255775452, + "learning_rate": 0.0005946367289597985, + "loss": 3.4951, + "step": 3690 + }, + { + "epoch": 0.18, + "grad_norm": 0.580362856388092, + "learning_rate": 0.0005946338303423971, + "loss": 3.4677, + "step": 3691 + }, + { + "epoch": 0.18, + "grad_norm": 0.604098379611969, + "learning_rate": 0.0005946309309489868, + "loss": 3.556, + "step": 3692 + }, + { + "epoch": 0.18, + "grad_norm": 0.6758443117141724, + "learning_rate": 0.0005946280307795753, + "loss": 3.2999, + "step": 3693 + }, + { + "epoch": 0.18, + "grad_norm": 0.6679175496101379, + "learning_rate": 0.0005946251298341704, + "loss": 3.4755, + "step": 3694 + }, + { + "epoch": 0.18, + "grad_norm": 0.5557239651679993, + "learning_rate": 0.0005946222281127796, + "loss": 3.712, + "step": 3695 + }, + { + "epoch": 0.18, + "grad_norm": 0.5595870018005371, + "learning_rate": 0.0005946193256154105, + "loss": 3.5436, + "step": 3696 + }, + { + "epoch": 0.18, + "grad_norm": 0.5639623403549194, + "learning_rate": 0.0005946164223420708, + "loss": 3.2984, + "step": 3697 + }, + { + "epoch": 0.18, + "grad_norm": 0.6050103902816772, + "learning_rate": 0.0005946135182927681, + "loss": 3.4194, + "step": 3698 + }, + { + "epoch": 0.18, + "grad_norm": 0.5879496335983276, + "learning_rate": 0.0005946106134675102, + "loss": 3.2472, + "step": 3699 + }, + { + "epoch": 0.18, + "grad_norm": 0.5664472579956055, + "learning_rate": 0.0005946077078663046, + "loss": 3.4548, + "step": 3700 + }, + { + "epoch": 0.18, + "grad_norm": 0.574516236782074, + "learning_rate": 0.000594604801489159, + "loss": 3.3961, + "step": 3701 + }, + { + "epoch": 0.18, + "grad_norm": 0.583021879196167, + "learning_rate": 0.000594601894336081, + "loss": 3.441, + "step": 3702 + }, + { + "epoch": 0.18, + "grad_norm": 0.5696473121643066, + "learning_rate": 0.0005945989864070783, + "loss": 3.5425, + "step": 3703 + }, + { + "epoch": 0.18, + "grad_norm": 0.5899128913879395, + "learning_rate": 0.0005945960777021587, + "loss": 3.449, + "step": 3704 + }, + { + "epoch": 0.18, + "grad_norm": 0.5821749567985535, + "learning_rate": 0.0005945931682213296, + "loss": 3.6383, + "step": 3705 + }, + { + "epoch": 0.18, + "grad_norm": 0.5889866352081299, + "learning_rate": 0.0005945902579645988, + "loss": 3.5592, + "step": 3706 + }, + { + "epoch": 0.18, + "grad_norm": 0.6704166531562805, + "learning_rate": 0.000594587346931974, + "loss": 3.2622, + "step": 3707 + }, + { + "epoch": 0.18, + "grad_norm": 0.5655115842819214, + "learning_rate": 0.0005945844351234628, + "loss": 3.5981, + "step": 3708 + }, + { + "epoch": 0.18, + "grad_norm": 0.5271401405334473, + "learning_rate": 0.0005945815225390729, + "loss": 3.6283, + "step": 3709 + }, + { + "epoch": 0.18, + "grad_norm": 0.5654386281967163, + "learning_rate": 0.0005945786091788119, + "loss": 3.3157, + "step": 3710 + }, + { + "epoch": 0.18, + "grad_norm": 0.5483503341674805, + "learning_rate": 0.0005945756950426876, + "loss": 3.4272, + "step": 3711 + }, + { + "epoch": 0.18, + "grad_norm": 0.554814875125885, + "learning_rate": 0.0005945727801307077, + "loss": 3.5075, + "step": 3712 + }, + { + "epoch": 0.18, + "grad_norm": 0.6500690579414368, + "learning_rate": 0.0005945698644428797, + "loss": 3.6308, + "step": 3713 + }, + { + "epoch": 0.18, + "grad_norm": 0.5715118050575256, + "learning_rate": 0.0005945669479792113, + "loss": 3.6008, + "step": 3714 + }, + { + "epoch": 0.18, + "grad_norm": 0.5506441593170166, + "learning_rate": 0.0005945640307397104, + "loss": 3.5656, + "step": 3715 + }, + { + "epoch": 0.18, + "grad_norm": 0.5786412358283997, + "learning_rate": 0.0005945611127243844, + "loss": 3.575, + "step": 3716 + }, + { + "epoch": 0.18, + "grad_norm": 0.6186723709106445, + "learning_rate": 0.0005945581939332412, + "loss": 3.6169, + "step": 3717 + }, + { + "epoch": 0.18, + "grad_norm": 0.5763697624206543, + "learning_rate": 0.0005945552743662883, + "loss": 3.7148, + "step": 3718 + }, + { + "epoch": 0.18, + "grad_norm": 0.659016489982605, + "learning_rate": 0.0005945523540235336, + "loss": 3.5355, + "step": 3719 + }, + { + "epoch": 0.18, + "grad_norm": 0.5776824951171875, + "learning_rate": 0.0005945494329049846, + "loss": 3.6071, + "step": 3720 + }, + { + "epoch": 0.18, + "grad_norm": 0.5591484904289246, + "learning_rate": 0.0005945465110106492, + "loss": 3.389, + "step": 3721 + }, + { + "epoch": 0.18, + "grad_norm": 0.5636910200119019, + "learning_rate": 0.0005945435883405348, + "loss": 3.594, + "step": 3722 + }, + { + "epoch": 0.18, + "grad_norm": 0.6116440296173096, + "learning_rate": 0.0005945406648946495, + "loss": 3.4926, + "step": 3723 + }, + { + "epoch": 0.18, + "grad_norm": 0.558946430683136, + "learning_rate": 0.0005945377406730007, + "loss": 3.6447, + "step": 3724 + }, + { + "epoch": 0.18, + "grad_norm": 0.5685746669769287, + "learning_rate": 0.0005945348156755962, + "loss": 3.4761, + "step": 3725 + }, + { + "epoch": 0.18, + "grad_norm": 0.9950370192527771, + "learning_rate": 0.0005945318899024435, + "loss": 3.7469, + "step": 3726 + }, + { + "epoch": 0.18, + "grad_norm": 0.7801612615585327, + "learning_rate": 0.0005945289633535506, + "loss": 3.6114, + "step": 3727 + }, + { + "epoch": 0.18, + "grad_norm": 0.6060027480125427, + "learning_rate": 0.0005945260360289251, + "loss": 3.4032, + "step": 3728 + }, + { + "epoch": 0.18, + "grad_norm": 0.580491840839386, + "learning_rate": 0.0005945231079285747, + "loss": 3.3195, + "step": 3729 + }, + { + "epoch": 0.18, + "grad_norm": 0.7394986748695374, + "learning_rate": 0.000594520179052507, + "loss": 3.6785, + "step": 3730 + }, + { + "epoch": 0.18, + "grad_norm": 0.5658180713653564, + "learning_rate": 0.00059451724940073, + "loss": 3.4843, + "step": 3731 + }, + { + "epoch": 0.18, + "grad_norm": 0.5555374622344971, + "learning_rate": 0.0005945143189732512, + "loss": 3.4209, + "step": 3732 + }, + { + "epoch": 0.18, + "grad_norm": 0.6545796990394592, + "learning_rate": 0.0005945113877700785, + "loss": 3.52, + "step": 3733 + }, + { + "epoch": 0.18, + "grad_norm": 0.5792942047119141, + "learning_rate": 0.0005945084557912192, + "loss": 3.5676, + "step": 3734 + }, + { + "epoch": 0.18, + "grad_norm": 0.587958574295044, + "learning_rate": 0.0005945055230366815, + "loss": 3.5171, + "step": 3735 + }, + { + "epoch": 0.18, + "grad_norm": 0.570886492729187, + "learning_rate": 0.0005945025895064728, + "loss": 3.5358, + "step": 3736 + }, + { + "epoch": 0.18, + "grad_norm": 0.5610484480857849, + "learning_rate": 0.0005944996552006011, + "loss": 3.6122, + "step": 3737 + }, + { + "epoch": 0.18, + "grad_norm": 0.6265158653259277, + "learning_rate": 0.000594496720119074, + "loss": 3.3024, + "step": 3738 + }, + { + "epoch": 0.18, + "grad_norm": 0.5752745866775513, + "learning_rate": 0.0005944937842618991, + "loss": 3.6183, + "step": 3739 + }, + { + "epoch": 0.18, + "grad_norm": 0.6028185486793518, + "learning_rate": 0.0005944908476290843, + "loss": 3.4493, + "step": 3740 + }, + { + "epoch": 0.18, + "grad_norm": 0.5684720873832703, + "learning_rate": 0.0005944879102206372, + "loss": 3.5517, + "step": 3741 + }, + { + "epoch": 0.18, + "grad_norm": 0.5804165005683899, + "learning_rate": 0.0005944849720365657, + "loss": 3.3608, + "step": 3742 + }, + { + "epoch": 0.18, + "grad_norm": 0.5693696737289429, + "learning_rate": 0.0005944820330768775, + "loss": 3.5387, + "step": 3743 + }, + { + "epoch": 0.18, + "grad_norm": 0.6432933807373047, + "learning_rate": 0.0005944790933415802, + "loss": 3.5484, + "step": 3744 + }, + { + "epoch": 0.18, + "grad_norm": 0.6080823540687561, + "learning_rate": 0.0005944761528306817, + "loss": 3.5617, + "step": 3745 + }, + { + "epoch": 0.18, + "grad_norm": 0.5980885028839111, + "learning_rate": 0.0005944732115441897, + "loss": 3.4468, + "step": 3746 + }, + { + "epoch": 0.18, + "grad_norm": 0.6021443605422974, + "learning_rate": 0.0005944702694821119, + "loss": 3.4973, + "step": 3747 + }, + { + "epoch": 0.18, + "grad_norm": 0.6065918803215027, + "learning_rate": 0.0005944673266444561, + "loss": 3.3864, + "step": 3748 + }, + { + "epoch": 0.18, + "grad_norm": 0.579862654209137, + "learning_rate": 0.00059446438303123, + "loss": 3.7255, + "step": 3749 + }, + { + "epoch": 0.18, + "grad_norm": 0.5811945199966431, + "learning_rate": 0.0005944614386424414, + "loss": 3.5034, + "step": 3750 + }, + { + "epoch": 0.18, + "grad_norm": 0.5766950249671936, + "learning_rate": 0.0005944584934780981, + "loss": 3.7607, + "step": 3751 + }, + { + "epoch": 0.18, + "grad_norm": 0.6201891899108887, + "learning_rate": 0.0005944555475382077, + "loss": 3.342, + "step": 3752 + }, + { + "epoch": 0.18, + "grad_norm": 0.5696931481361389, + "learning_rate": 0.0005944526008227781, + "loss": 3.419, + "step": 3753 + }, + { + "epoch": 0.18, + "grad_norm": 0.6073271036148071, + "learning_rate": 0.000594449653331817, + "loss": 3.3548, + "step": 3754 + }, + { + "epoch": 0.18, + "grad_norm": 0.5775006413459778, + "learning_rate": 0.0005944467050653323, + "loss": 3.3832, + "step": 3755 + }, + { + "epoch": 0.18, + "grad_norm": 0.6126733422279358, + "learning_rate": 0.0005944437560233314, + "loss": 3.3462, + "step": 3756 + }, + { + "epoch": 0.18, + "grad_norm": 0.7034265995025635, + "learning_rate": 0.0005944408062058225, + "loss": 3.4659, + "step": 3757 + }, + { + "epoch": 0.18, + "grad_norm": 0.60416179895401, + "learning_rate": 0.0005944378556128132, + "loss": 3.2714, + "step": 3758 + }, + { + "epoch": 0.18, + "grad_norm": 0.640256404876709, + "learning_rate": 0.0005944349042443112, + "loss": 3.5203, + "step": 3759 + }, + { + "epoch": 0.18, + "grad_norm": 0.590840220451355, + "learning_rate": 0.0005944319521003243, + "loss": 3.5765, + "step": 3760 + }, + { + "epoch": 0.18, + "grad_norm": 0.561656653881073, + "learning_rate": 0.0005944289991808604, + "loss": 3.509, + "step": 3761 + }, + { + "epoch": 0.18, + "grad_norm": 0.5961998105049133, + "learning_rate": 0.0005944260454859271, + "loss": 3.3526, + "step": 3762 + }, + { + "epoch": 0.18, + "grad_norm": 0.5496677756309509, + "learning_rate": 0.0005944230910155322, + "loss": 3.7351, + "step": 3763 + }, + { + "epoch": 0.18, + "grad_norm": 0.5810037851333618, + "learning_rate": 0.0005944201357696837, + "loss": 3.4154, + "step": 3764 + }, + { + "epoch": 0.18, + "grad_norm": 0.6055195927619934, + "learning_rate": 0.0005944171797483891, + "loss": 3.3845, + "step": 3765 + }, + { + "epoch": 0.18, + "grad_norm": 0.5705102682113647, + "learning_rate": 0.0005944142229516563, + "loss": 3.4328, + "step": 3766 + }, + { + "epoch": 0.18, + "grad_norm": 0.5758143067359924, + "learning_rate": 0.0005944112653794931, + "loss": 3.31, + "step": 3767 + }, + { + "epoch": 0.18, + "grad_norm": 0.5275263786315918, + "learning_rate": 0.0005944083070319074, + "loss": 3.6008, + "step": 3768 + }, + { + "epoch": 0.18, + "grad_norm": 0.5653240084648132, + "learning_rate": 0.0005944053479089069, + "loss": 3.4932, + "step": 3769 + }, + { + "epoch": 0.18, + "grad_norm": 0.5635776519775391, + "learning_rate": 0.0005944023880104992, + "loss": 3.5663, + "step": 3770 + }, + { + "epoch": 0.18, + "grad_norm": 0.6032636165618896, + "learning_rate": 0.0005943994273366923, + "loss": 3.2103, + "step": 3771 + }, + { + "epoch": 0.18, + "grad_norm": 0.6400607228279114, + "learning_rate": 0.000594396465887494, + "loss": 3.3427, + "step": 3772 + }, + { + "epoch": 0.18, + "grad_norm": 0.5439187288284302, + "learning_rate": 0.0005943935036629122, + "loss": 3.5142, + "step": 3773 + }, + { + "epoch": 0.18, + "grad_norm": 0.7817490100860596, + "learning_rate": 0.0005943905406629545, + "loss": 3.3834, + "step": 3774 + }, + { + "epoch": 0.19, + "grad_norm": 0.5938287973403931, + "learning_rate": 0.0005943875768876286, + "loss": 3.4816, + "step": 3775 + }, + { + "epoch": 0.19, + "grad_norm": 0.5887666940689087, + "learning_rate": 0.0005943846123369426, + "loss": 3.7435, + "step": 3776 + }, + { + "epoch": 0.19, + "grad_norm": 0.5787019729614258, + "learning_rate": 0.0005943816470109042, + "loss": 3.5568, + "step": 3777 + }, + { + "epoch": 0.19, + "grad_norm": 0.5176889896392822, + "learning_rate": 0.0005943786809095213, + "loss": 3.514, + "step": 3778 + }, + { + "epoch": 0.19, + "grad_norm": 0.5439308285713196, + "learning_rate": 0.0005943757140328014, + "loss": 3.416, + "step": 3779 + }, + { + "epoch": 0.19, + "grad_norm": 0.5765101909637451, + "learning_rate": 0.0005943727463807527, + "loss": 3.6599, + "step": 3780 + }, + { + "epoch": 0.19, + "grad_norm": 0.5823919177055359, + "learning_rate": 0.0005943697779533828, + "loss": 3.4311, + "step": 3781 + }, + { + "epoch": 0.19, + "grad_norm": 0.5826864242553711, + "learning_rate": 0.0005943668087506994, + "loss": 3.5703, + "step": 3782 + }, + { + "epoch": 0.19, + "grad_norm": 0.5674658417701721, + "learning_rate": 0.0005943638387727107, + "loss": 3.3271, + "step": 3783 + }, + { + "epoch": 0.19, + "grad_norm": 0.564477801322937, + "learning_rate": 0.0005943608680194241, + "loss": 3.3989, + "step": 3784 + }, + { + "epoch": 0.19, + "grad_norm": 0.5395248532295227, + "learning_rate": 0.0005943578964908478, + "loss": 3.5984, + "step": 3785 + }, + { + "epoch": 0.19, + "grad_norm": 0.5394876599311829, + "learning_rate": 0.0005943549241869894, + "loss": 3.4793, + "step": 3786 + }, + { + "epoch": 0.19, + "grad_norm": 0.5413987636566162, + "learning_rate": 0.0005943519511078568, + "loss": 3.4323, + "step": 3787 + }, + { + "epoch": 0.19, + "grad_norm": 0.57403963804245, + "learning_rate": 0.0005943489772534576, + "loss": 3.6065, + "step": 3788 + }, + { + "epoch": 0.19, + "grad_norm": 0.5907705426216125, + "learning_rate": 0.0005943460026238001, + "loss": 3.5587, + "step": 3789 + }, + { + "epoch": 0.19, + "grad_norm": 0.5364418625831604, + "learning_rate": 0.0005943430272188917, + "loss": 3.5492, + "step": 3790 + }, + { + "epoch": 0.19, + "grad_norm": 0.5795174241065979, + "learning_rate": 0.0005943400510387405, + "loss": 3.6131, + "step": 3791 + }, + { + "epoch": 0.19, + "grad_norm": 0.5407680869102478, + "learning_rate": 0.0005943370740833543, + "loss": 3.5612, + "step": 3792 + }, + { + "epoch": 0.19, + "grad_norm": 0.6448293924331665, + "learning_rate": 0.0005943340963527409, + "loss": 3.3383, + "step": 3793 + }, + { + "epoch": 0.19, + "grad_norm": 0.562232255935669, + "learning_rate": 0.0005943311178469079, + "loss": 3.4904, + "step": 3794 + }, + { + "epoch": 0.19, + "grad_norm": 0.5268046259880066, + "learning_rate": 0.0005943281385658636, + "loss": 3.7969, + "step": 3795 + }, + { + "epoch": 0.19, + "grad_norm": 0.5817152261734009, + "learning_rate": 0.0005943251585096156, + "loss": 3.5373, + "step": 3796 + }, + { + "epoch": 0.19, + "grad_norm": 0.523223876953125, + "learning_rate": 0.0005943221776781719, + "loss": 3.4779, + "step": 3797 + }, + { + "epoch": 0.19, + "grad_norm": 0.6563833355903625, + "learning_rate": 0.0005943191960715399, + "loss": 3.5764, + "step": 3798 + }, + { + "epoch": 0.19, + "grad_norm": 0.6011426448822021, + "learning_rate": 0.000594316213689728, + "loss": 3.5565, + "step": 3799 + }, + { + "epoch": 0.19, + "grad_norm": 0.5760040283203125, + "learning_rate": 0.0005943132305327438, + "loss": 3.6364, + "step": 3800 + }, + { + "epoch": 0.19, + "grad_norm": 0.551845371723175, + "learning_rate": 0.0005943102466005952, + "loss": 3.481, + "step": 3801 + }, + { + "epoch": 0.19, + "grad_norm": 0.5699048638343811, + "learning_rate": 0.0005943072618932901, + "loss": 3.3893, + "step": 3802 + }, + { + "epoch": 0.19, + "grad_norm": 0.5873308777809143, + "learning_rate": 0.0005943042764108363, + "loss": 3.6338, + "step": 3803 + }, + { + "epoch": 0.19, + "grad_norm": 0.5894830226898193, + "learning_rate": 0.0005943012901532416, + "loss": 3.4884, + "step": 3804 + }, + { + "epoch": 0.19, + "grad_norm": 0.5429089069366455, + "learning_rate": 0.0005942983031205139, + "loss": 3.4782, + "step": 3805 + }, + { + "epoch": 0.19, + "grad_norm": 0.5496631264686584, + "learning_rate": 0.0005942953153126613, + "loss": 3.7168, + "step": 3806 + }, + { + "epoch": 0.19, + "grad_norm": 0.5998590588569641, + "learning_rate": 0.0005942923267296913, + "loss": 3.7289, + "step": 3807 + }, + { + "epoch": 0.19, + "grad_norm": 0.6006838083267212, + "learning_rate": 0.000594289337371612, + "loss": 3.4137, + "step": 3808 + }, + { + "epoch": 0.19, + "grad_norm": 0.5551097393035889, + "learning_rate": 0.0005942863472384313, + "loss": 3.7467, + "step": 3809 + }, + { + "epoch": 0.19, + "grad_norm": 0.5894075036048889, + "learning_rate": 0.0005942833563301569, + "loss": 3.3947, + "step": 3810 + }, + { + "epoch": 0.19, + "grad_norm": 0.5639740228652954, + "learning_rate": 0.0005942803646467968, + "loss": 3.6335, + "step": 3811 + }, + { + "epoch": 0.19, + "grad_norm": 0.5757173299789429, + "learning_rate": 0.0005942773721883588, + "loss": 3.6161, + "step": 3812 + }, + { + "epoch": 0.19, + "grad_norm": 0.547919750213623, + "learning_rate": 0.000594274378954851, + "loss": 3.5195, + "step": 3813 + }, + { + "epoch": 0.19, + "grad_norm": 0.543421745300293, + "learning_rate": 0.000594271384946281, + "loss": 3.2388, + "step": 3814 + }, + { + "epoch": 0.19, + "grad_norm": 0.5859833359718323, + "learning_rate": 0.0005942683901626569, + "loss": 3.5582, + "step": 3815 + }, + { + "epoch": 0.19, + "grad_norm": 0.5319817066192627, + "learning_rate": 0.0005942653946039864, + "loss": 3.574, + "step": 3816 + }, + { + "epoch": 0.19, + "grad_norm": 0.5493303537368774, + "learning_rate": 0.0005942623982702775, + "loss": 3.4146, + "step": 3817 + }, + { + "epoch": 0.19, + "grad_norm": 0.5733358860015869, + "learning_rate": 0.000594259401161538, + "loss": 3.543, + "step": 3818 + }, + { + "epoch": 0.19, + "grad_norm": 0.5682793855667114, + "learning_rate": 0.000594256403277776, + "loss": 3.6779, + "step": 3819 + }, + { + "epoch": 0.19, + "grad_norm": 0.5126398801803589, + "learning_rate": 0.0005942534046189992, + "loss": 3.5189, + "step": 3820 + }, + { + "epoch": 0.19, + "grad_norm": 0.5806546211242676, + "learning_rate": 0.0005942504051852155, + "loss": 3.4849, + "step": 3821 + }, + { + "epoch": 0.19, + "grad_norm": 0.5452494025230408, + "learning_rate": 0.000594247404976433, + "loss": 3.5726, + "step": 3822 + }, + { + "epoch": 0.19, + "grad_norm": 0.5558573007583618, + "learning_rate": 0.0005942444039926594, + "loss": 3.5881, + "step": 3823 + }, + { + "epoch": 0.19, + "grad_norm": 0.5447998642921448, + "learning_rate": 0.0005942414022339027, + "loss": 3.4945, + "step": 3824 + }, + { + "epoch": 0.19, + "grad_norm": 0.6169920563697815, + "learning_rate": 0.0005942383997001707, + "loss": 3.3484, + "step": 3825 + }, + { + "epoch": 0.19, + "grad_norm": 0.5817279815673828, + "learning_rate": 0.0005942353963914714, + "loss": 3.4676, + "step": 3826 + }, + { + "epoch": 0.19, + "grad_norm": 0.5804048776626587, + "learning_rate": 0.0005942323923078127, + "loss": 3.2754, + "step": 3827 + }, + { + "epoch": 0.19, + "grad_norm": 0.5827670097351074, + "learning_rate": 0.0005942293874492025, + "loss": 3.4499, + "step": 3828 + }, + { + "epoch": 0.19, + "grad_norm": 0.5710329413414001, + "learning_rate": 0.0005942263818156488, + "loss": 3.5819, + "step": 3829 + }, + { + "epoch": 0.19, + "grad_norm": 0.6096348762512207, + "learning_rate": 0.0005942233754071594, + "loss": 3.3934, + "step": 3830 + }, + { + "epoch": 0.19, + "grad_norm": 0.5973950028419495, + "learning_rate": 0.0005942203682237422, + "loss": 3.4731, + "step": 3831 + }, + { + "epoch": 0.19, + "grad_norm": 0.5598435401916504, + "learning_rate": 0.0005942173602654053, + "loss": 3.5913, + "step": 3832 + }, + { + "epoch": 0.19, + "grad_norm": 0.6751794219017029, + "learning_rate": 0.0005942143515321563, + "loss": 3.576, + "step": 3833 + }, + { + "epoch": 0.19, + "grad_norm": 0.5890194773674011, + "learning_rate": 0.0005942113420240034, + "loss": 3.6024, + "step": 3834 + }, + { + "epoch": 0.19, + "grad_norm": 0.6194825768470764, + "learning_rate": 0.0005942083317409545, + "loss": 3.3712, + "step": 3835 + }, + { + "epoch": 0.19, + "grad_norm": 0.5517776608467102, + "learning_rate": 0.0005942053206830175, + "loss": 3.4849, + "step": 3836 + }, + { + "epoch": 0.19, + "grad_norm": 0.5492923855781555, + "learning_rate": 0.0005942023088502001, + "loss": 3.3793, + "step": 3837 + }, + { + "epoch": 0.19, + "grad_norm": 0.5847030282020569, + "learning_rate": 0.0005941992962425107, + "loss": 3.6312, + "step": 3838 + }, + { + "epoch": 0.19, + "grad_norm": 0.5900754928588867, + "learning_rate": 0.0005941962828599568, + "loss": 3.5081, + "step": 3839 + }, + { + "epoch": 0.19, + "grad_norm": 0.5455581545829773, + "learning_rate": 0.0005941932687025467, + "loss": 3.4151, + "step": 3840 + }, + { + "epoch": 0.19, + "grad_norm": 0.5705889463424683, + "learning_rate": 0.000594190253770288, + "loss": 3.6245, + "step": 3841 + }, + { + "epoch": 0.19, + "grad_norm": 0.5608065724372864, + "learning_rate": 0.0005941872380631888, + "loss": 3.368, + "step": 3842 + }, + { + "epoch": 0.19, + "grad_norm": 0.579862117767334, + "learning_rate": 0.000594184221581257, + "loss": 3.5874, + "step": 3843 + }, + { + "epoch": 0.19, + "grad_norm": 0.570897102355957, + "learning_rate": 0.0005941812043245006, + "loss": 3.5095, + "step": 3844 + }, + { + "epoch": 0.19, + "grad_norm": 0.5384157299995422, + "learning_rate": 0.0005941781862929275, + "loss": 3.6649, + "step": 3845 + }, + { + "epoch": 0.19, + "grad_norm": 0.5707191824913025, + "learning_rate": 0.0005941751674865457, + "loss": 3.7034, + "step": 3846 + }, + { + "epoch": 0.19, + "grad_norm": 0.5771855711936951, + "learning_rate": 0.0005941721479053632, + "loss": 3.5403, + "step": 3847 + }, + { + "epoch": 0.19, + "grad_norm": 0.5326586961746216, + "learning_rate": 0.0005941691275493878, + "loss": 3.6345, + "step": 3848 + }, + { + "epoch": 0.19, + "grad_norm": 0.6068933010101318, + "learning_rate": 0.0005941661064186276, + "loss": 3.5202, + "step": 3849 + }, + { + "epoch": 0.19, + "grad_norm": 0.5824964642524719, + "learning_rate": 0.0005941630845130905, + "loss": 3.7016, + "step": 3850 + }, + { + "epoch": 0.19, + "grad_norm": 0.5302342176437378, + "learning_rate": 0.0005941600618327843, + "loss": 3.5209, + "step": 3851 + }, + { + "epoch": 0.19, + "grad_norm": 0.5523245334625244, + "learning_rate": 0.0005941570383777171, + "loss": 3.6873, + "step": 3852 + }, + { + "epoch": 0.19, + "grad_norm": 0.5614722967147827, + "learning_rate": 0.0005941540141478971, + "loss": 3.5292, + "step": 3853 + }, + { + "epoch": 0.19, + "grad_norm": 0.5354779958724976, + "learning_rate": 0.0005941509891433319, + "loss": 3.3006, + "step": 3854 + }, + { + "epoch": 0.19, + "grad_norm": 0.6500388383865356, + "learning_rate": 0.0005941479633640296, + "loss": 3.5103, + "step": 3855 + }, + { + "epoch": 0.19, + "grad_norm": 0.5664093494415283, + "learning_rate": 0.0005941449368099982, + "loss": 3.4756, + "step": 3856 + }, + { + "epoch": 0.19, + "grad_norm": 0.5725468397140503, + "learning_rate": 0.0005941419094812456, + "loss": 3.4823, + "step": 3857 + }, + { + "epoch": 0.19, + "grad_norm": 0.5943357348442078, + "learning_rate": 0.0005941388813777799, + "loss": 3.3334, + "step": 3858 + }, + { + "epoch": 0.19, + "grad_norm": 0.6101706027984619, + "learning_rate": 0.000594135852499609, + "loss": 3.1905, + "step": 3859 + }, + { + "epoch": 0.19, + "grad_norm": 0.5723959803581238, + "learning_rate": 0.0005941328228467408, + "loss": 3.4729, + "step": 3860 + }, + { + "epoch": 0.19, + "grad_norm": 0.5894114971160889, + "learning_rate": 0.0005941297924191833, + "loss": 3.3462, + "step": 3861 + }, + { + "epoch": 0.19, + "grad_norm": 0.5975579023361206, + "learning_rate": 0.0005941267612169447, + "loss": 3.4768, + "step": 3862 + }, + { + "epoch": 0.19, + "grad_norm": 0.5968523621559143, + "learning_rate": 0.0005941237292400326, + "loss": 3.5171, + "step": 3863 + }, + { + "epoch": 0.19, + "grad_norm": 0.5831944942474365, + "learning_rate": 0.0005941206964884554, + "loss": 3.2998, + "step": 3864 + }, + { + "epoch": 0.19, + "grad_norm": 0.581929624080658, + "learning_rate": 0.0005941176629622207, + "loss": 3.3597, + "step": 3865 + }, + { + "epoch": 0.19, + "grad_norm": 0.565826952457428, + "learning_rate": 0.0005941146286613368, + "loss": 3.4829, + "step": 3866 + }, + { + "epoch": 0.19, + "grad_norm": 0.5366962552070618, + "learning_rate": 0.0005941115935858116, + "loss": 3.8651, + "step": 3867 + }, + { + "epoch": 0.19, + "grad_norm": 0.5875154733657837, + "learning_rate": 0.000594108557735653, + "loss": 3.4664, + "step": 3868 + }, + { + "epoch": 0.19, + "grad_norm": 0.5598399639129639, + "learning_rate": 0.0005941055211108691, + "loss": 3.6399, + "step": 3869 + }, + { + "epoch": 0.19, + "grad_norm": 0.5694958567619324, + "learning_rate": 0.0005941024837114679, + "loss": 3.4511, + "step": 3870 + }, + { + "epoch": 0.19, + "grad_norm": 0.5622062683105469, + "learning_rate": 0.0005940994455374574, + "loss": 3.4977, + "step": 3871 + }, + { + "epoch": 0.19, + "grad_norm": 0.5713880062103271, + "learning_rate": 0.0005940964065888454, + "loss": 3.4234, + "step": 3872 + }, + { + "epoch": 0.19, + "grad_norm": 0.5845184326171875, + "learning_rate": 0.0005940933668656402, + "loss": 3.4577, + "step": 3873 + }, + { + "epoch": 0.19, + "grad_norm": 0.5796321034431458, + "learning_rate": 0.0005940903263678495, + "loss": 3.5307, + "step": 3874 + }, + { + "epoch": 0.19, + "grad_norm": 0.5476338267326355, + "learning_rate": 0.0005940872850954816, + "loss": 3.4395, + "step": 3875 + }, + { + "epoch": 0.19, + "grad_norm": 0.579772412776947, + "learning_rate": 0.0005940842430485443, + "loss": 3.3787, + "step": 3876 + }, + { + "epoch": 0.19, + "grad_norm": 0.5444009304046631, + "learning_rate": 0.0005940812002270458, + "loss": 3.6889, + "step": 3877 + }, + { + "epoch": 0.19, + "grad_norm": 0.5588319301605225, + "learning_rate": 0.0005940781566309941, + "loss": 3.6946, + "step": 3878 + }, + { + "epoch": 0.19, + "grad_norm": 0.5704074501991272, + "learning_rate": 0.0005940751122603971, + "loss": 3.5999, + "step": 3879 + }, + { + "epoch": 0.19, + "grad_norm": 0.7025076746940613, + "learning_rate": 0.0005940720671152627, + "loss": 3.2713, + "step": 3880 + }, + { + "epoch": 0.19, + "grad_norm": 0.8407301306724548, + "learning_rate": 0.0005940690211955991, + "loss": 3.4089, + "step": 3881 + }, + { + "epoch": 0.19, + "grad_norm": 0.5442054867744446, + "learning_rate": 0.0005940659745014143, + "loss": 3.5208, + "step": 3882 + }, + { + "epoch": 0.19, + "grad_norm": 0.5735301375389099, + "learning_rate": 0.0005940629270327165, + "loss": 3.7339, + "step": 3883 + }, + { + "epoch": 0.19, + "grad_norm": 0.5662720203399658, + "learning_rate": 0.0005940598787895132, + "loss": 3.5411, + "step": 3884 + }, + { + "epoch": 0.19, + "grad_norm": 0.635032057762146, + "learning_rate": 0.000594056829771813, + "loss": 3.6586, + "step": 3885 + }, + { + "epoch": 0.19, + "grad_norm": 0.5698032379150391, + "learning_rate": 0.0005940537799796237, + "loss": 3.5767, + "step": 3886 + }, + { + "epoch": 0.19, + "grad_norm": 0.5576255321502686, + "learning_rate": 0.0005940507294129534, + "loss": 3.6144, + "step": 3887 + }, + { + "epoch": 0.19, + "grad_norm": 0.5743609070777893, + "learning_rate": 0.0005940476780718099, + "loss": 3.4329, + "step": 3888 + }, + { + "epoch": 0.19, + "grad_norm": 0.5626585483551025, + "learning_rate": 0.0005940446259562015, + "loss": 3.62, + "step": 3889 + }, + { + "epoch": 0.19, + "grad_norm": 0.5813789963722229, + "learning_rate": 0.0005940415730661361, + "loss": 3.4282, + "step": 3890 + }, + { + "epoch": 0.19, + "grad_norm": 0.6178430318832397, + "learning_rate": 0.0005940385194016219, + "loss": 3.612, + "step": 3891 + }, + { + "epoch": 0.19, + "grad_norm": 0.569376528263092, + "learning_rate": 0.0005940354649626667, + "loss": 3.4726, + "step": 3892 + }, + { + "epoch": 0.19, + "grad_norm": 0.5537204742431641, + "learning_rate": 0.0005940324097492788, + "loss": 3.5413, + "step": 3893 + }, + { + "epoch": 0.19, + "grad_norm": 0.5710381269454956, + "learning_rate": 0.000594029353761466, + "loss": 3.4135, + "step": 3894 + }, + { + "epoch": 0.19, + "grad_norm": 0.6075487732887268, + "learning_rate": 0.0005940262969992365, + "loss": 3.3629, + "step": 3895 + }, + { + "epoch": 0.19, + "grad_norm": 0.5630260705947876, + "learning_rate": 0.0005940232394625984, + "loss": 3.4259, + "step": 3896 + }, + { + "epoch": 0.19, + "grad_norm": 0.5889552235603333, + "learning_rate": 0.0005940201811515595, + "loss": 3.4283, + "step": 3897 + }, + { + "epoch": 0.19, + "grad_norm": 0.593014657497406, + "learning_rate": 0.0005940171220661281, + "loss": 3.4631, + "step": 3898 + }, + { + "epoch": 0.19, + "grad_norm": 0.5618011355400085, + "learning_rate": 0.0005940140622063123, + "loss": 3.4224, + "step": 3899 + }, + { + "epoch": 0.19, + "grad_norm": 0.5981526970863342, + "learning_rate": 0.0005940110015721199, + "loss": 3.2224, + "step": 3900 + }, + { + "epoch": 0.19, + "grad_norm": 0.5683342814445496, + "learning_rate": 0.0005940079401635592, + "loss": 3.5997, + "step": 3901 + }, + { + "epoch": 0.19, + "grad_norm": 0.5758498311042786, + "learning_rate": 0.0005940048779806381, + "loss": 3.498, + "step": 3902 + }, + { + "epoch": 0.19, + "grad_norm": 0.553376317024231, + "learning_rate": 0.0005940018150233647, + "loss": 3.613, + "step": 3903 + }, + { + "epoch": 0.19, + "grad_norm": 0.5808395147323608, + "learning_rate": 0.0005939987512917472, + "loss": 3.5252, + "step": 3904 + }, + { + "epoch": 0.19, + "grad_norm": 0.5820940732955933, + "learning_rate": 0.0005939956867857936, + "loss": 3.7314, + "step": 3905 + }, + { + "epoch": 0.19, + "grad_norm": 0.5412302017211914, + "learning_rate": 0.0005939926215055119, + "loss": 3.5086, + "step": 3906 + }, + { + "epoch": 0.19, + "grad_norm": 0.5737446546554565, + "learning_rate": 0.0005939895554509102, + "loss": 3.3745, + "step": 3907 + }, + { + "epoch": 0.19, + "grad_norm": 0.5778170228004456, + "learning_rate": 0.0005939864886219966, + "loss": 3.6322, + "step": 3908 + }, + { + "epoch": 0.19, + "grad_norm": 0.5643134713172913, + "learning_rate": 0.0005939834210187792, + "loss": 3.6112, + "step": 3909 + }, + { + "epoch": 0.19, + "grad_norm": 0.5836457014083862, + "learning_rate": 0.0005939803526412658, + "loss": 3.6238, + "step": 3910 + }, + { + "epoch": 0.19, + "grad_norm": 0.5769031047821045, + "learning_rate": 0.0005939772834894651, + "loss": 3.4665, + "step": 3911 + }, + { + "epoch": 0.19, + "grad_norm": 0.5801408290863037, + "learning_rate": 0.0005939742135633846, + "loss": 3.1557, + "step": 3912 + }, + { + "epoch": 0.19, + "grad_norm": 0.5858706831932068, + "learning_rate": 0.0005939711428630326, + "loss": 3.5962, + "step": 3913 + }, + { + "epoch": 0.19, + "grad_norm": 0.6179161071777344, + "learning_rate": 0.0005939680713884173, + "loss": 3.5575, + "step": 3914 + }, + { + "epoch": 0.19, + "grad_norm": 0.5316437482833862, + "learning_rate": 0.0005939649991395466, + "loss": 3.2445, + "step": 3915 + }, + { + "epoch": 0.19, + "grad_norm": 0.5476315021514893, + "learning_rate": 0.0005939619261164286, + "loss": 3.5819, + "step": 3916 + }, + { + "epoch": 0.19, + "grad_norm": 0.5380899906158447, + "learning_rate": 0.0005939588523190716, + "loss": 3.647, + "step": 3917 + }, + { + "epoch": 0.19, + "grad_norm": 0.6630106568336487, + "learning_rate": 0.0005939557777474833, + "loss": 3.3215, + "step": 3918 + }, + { + "epoch": 0.19, + "grad_norm": 0.5557861924171448, + "learning_rate": 0.0005939527024016723, + "loss": 3.5431, + "step": 3919 + }, + { + "epoch": 0.19, + "grad_norm": 0.5520122647285461, + "learning_rate": 0.0005939496262816464, + "loss": 3.5015, + "step": 3920 + }, + { + "epoch": 0.19, + "grad_norm": 0.593203604221344, + "learning_rate": 0.0005939465493874139, + "loss": 3.6008, + "step": 3921 + }, + { + "epoch": 0.19, + "grad_norm": 0.5582143068313599, + "learning_rate": 0.0005939434717189825, + "loss": 3.3972, + "step": 3922 + }, + { + "epoch": 0.19, + "grad_norm": 0.55299311876297, + "learning_rate": 0.0005939403932763607, + "loss": 3.5225, + "step": 3923 + }, + { + "epoch": 0.19, + "grad_norm": 0.5243273377418518, + "learning_rate": 0.0005939373140595565, + "loss": 3.5162, + "step": 3924 + }, + { + "epoch": 0.19, + "grad_norm": 0.5494229793548584, + "learning_rate": 0.0005939342340685778, + "loss": 3.4238, + "step": 3925 + }, + { + "epoch": 0.19, + "grad_norm": 0.56685471534729, + "learning_rate": 0.0005939311533034331, + "loss": 3.3776, + "step": 3926 + }, + { + "epoch": 0.19, + "grad_norm": 0.5525105595588684, + "learning_rate": 0.0005939280717641302, + "loss": 3.5084, + "step": 3927 + }, + { + "epoch": 0.19, + "grad_norm": 0.6117692589759827, + "learning_rate": 0.0005939249894506773, + "loss": 3.4447, + "step": 3928 + }, + { + "epoch": 0.19, + "grad_norm": 0.6117774844169617, + "learning_rate": 0.0005939219063630826, + "loss": 3.5056, + "step": 3929 + }, + { + "epoch": 0.19, + "grad_norm": 0.5624076724052429, + "learning_rate": 0.0005939188225013543, + "loss": 3.4238, + "step": 3930 + }, + { + "epoch": 0.19, + "grad_norm": 0.591431736946106, + "learning_rate": 0.0005939157378655002, + "loss": 3.4953, + "step": 3931 + }, + { + "epoch": 0.19, + "grad_norm": 0.5721966624259949, + "learning_rate": 0.0005939126524555287, + "loss": 3.4561, + "step": 3932 + }, + { + "epoch": 0.19, + "grad_norm": 0.5344234108924866, + "learning_rate": 0.0005939095662714479, + "loss": 3.4767, + "step": 3933 + }, + { + "epoch": 0.19, + "grad_norm": 0.6061483025550842, + "learning_rate": 0.0005939064793132658, + "loss": 3.6087, + "step": 3934 + }, + { + "epoch": 0.19, + "grad_norm": 0.5591382384300232, + "learning_rate": 0.0005939033915809905, + "loss": 3.5391, + "step": 3935 + }, + { + "epoch": 0.19, + "grad_norm": 0.5948025584220886, + "learning_rate": 0.0005939003030746302, + "loss": 3.5729, + "step": 3936 + }, + { + "epoch": 0.19, + "grad_norm": 0.6076129674911499, + "learning_rate": 0.0005938972137941933, + "loss": 3.4074, + "step": 3937 + }, + { + "epoch": 0.19, + "grad_norm": 0.5574817061424255, + "learning_rate": 0.0005938941237396876, + "loss": 3.371, + "step": 3938 + }, + { + "epoch": 0.19, + "grad_norm": 0.5657892823219299, + "learning_rate": 0.0005938910329111213, + "loss": 3.2757, + "step": 3939 + }, + { + "epoch": 0.19, + "grad_norm": 0.5320510864257812, + "learning_rate": 0.0005938879413085026, + "loss": 3.5539, + "step": 3940 + }, + { + "epoch": 0.19, + "grad_norm": 0.594287097454071, + "learning_rate": 0.0005938848489318396, + "loss": 3.3186, + "step": 3941 + }, + { + "epoch": 0.19, + "grad_norm": 0.5167534351348877, + "learning_rate": 0.0005938817557811405, + "loss": 3.4876, + "step": 3942 + }, + { + "epoch": 0.19, + "grad_norm": 0.5355756282806396, + "learning_rate": 0.0005938786618564134, + "loss": 3.6319, + "step": 3943 + }, + { + "epoch": 0.19, + "grad_norm": 0.5719373226165771, + "learning_rate": 0.0005938755671576664, + "loss": 3.6176, + "step": 3944 + }, + { + "epoch": 0.19, + "grad_norm": 0.5378432869911194, + "learning_rate": 0.0005938724716849078, + "loss": 3.2131, + "step": 3945 + }, + { + "epoch": 0.19, + "grad_norm": 0.5518651008605957, + "learning_rate": 0.0005938693754381456, + "loss": 3.3925, + "step": 3946 + }, + { + "epoch": 0.19, + "grad_norm": 0.5597622990608215, + "learning_rate": 0.0005938662784173881, + "loss": 3.436, + "step": 3947 + }, + { + "epoch": 0.19, + "grad_norm": 0.5614737868309021, + "learning_rate": 0.0005938631806226434, + "loss": 3.696, + "step": 3948 + }, + { + "epoch": 0.19, + "grad_norm": 0.5528543591499329, + "learning_rate": 0.0005938600820539195, + "loss": 3.4416, + "step": 3949 + }, + { + "epoch": 0.19, + "grad_norm": 0.588862955570221, + "learning_rate": 0.0005938569827112247, + "loss": 3.5648, + "step": 3950 + }, + { + "epoch": 0.19, + "grad_norm": 0.590054452419281, + "learning_rate": 0.0005938538825945673, + "loss": 3.3912, + "step": 3951 + }, + { + "epoch": 0.19, + "grad_norm": 0.5708290934562683, + "learning_rate": 0.0005938507817039553, + "loss": 3.5998, + "step": 3952 + }, + { + "epoch": 0.19, + "grad_norm": 0.5861159563064575, + "learning_rate": 0.0005938476800393968, + "loss": 3.4476, + "step": 3953 + }, + { + "epoch": 0.19, + "grad_norm": 0.5528053641319275, + "learning_rate": 0.0005938445776009002, + "loss": 3.4957, + "step": 3954 + }, + { + "epoch": 0.19, + "grad_norm": 0.5492965579032898, + "learning_rate": 0.0005938414743884734, + "loss": 3.3417, + "step": 3955 + }, + { + "epoch": 0.19, + "grad_norm": 0.575512707233429, + "learning_rate": 0.0005938383704021247, + "loss": 3.398, + "step": 3956 + }, + { + "epoch": 0.19, + "grad_norm": 0.5666360855102539, + "learning_rate": 0.0005938352656418624, + "loss": 3.5014, + "step": 3957 + }, + { + "epoch": 0.19, + "grad_norm": 0.6097762584686279, + "learning_rate": 0.0005938321601076944, + "loss": 3.5334, + "step": 3958 + }, + { + "epoch": 0.19, + "grad_norm": 0.5508065819740295, + "learning_rate": 0.0005938290537996292, + "loss": 3.4517, + "step": 3959 + }, + { + "epoch": 0.19, + "grad_norm": 0.5451350808143616, + "learning_rate": 0.0005938259467176748, + "loss": 3.3272, + "step": 3960 + }, + { + "epoch": 0.19, + "grad_norm": 0.5097694993019104, + "learning_rate": 0.0005938228388618393, + "loss": 3.3928, + "step": 3961 + }, + { + "epoch": 0.19, + "grad_norm": 0.5869589447975159, + "learning_rate": 0.0005938197302321312, + "loss": 3.5485, + "step": 3962 + }, + { + "epoch": 0.19, + "grad_norm": 0.5630464553833008, + "learning_rate": 0.0005938166208285582, + "loss": 3.41, + "step": 3963 + }, + { + "epoch": 0.19, + "grad_norm": 0.6036396622657776, + "learning_rate": 0.0005938135106511289, + "loss": 3.6272, + "step": 3964 + }, + { + "epoch": 0.19, + "grad_norm": 0.6252908706665039, + "learning_rate": 0.0005938103996998514, + "loss": 3.3919, + "step": 3965 + }, + { + "epoch": 0.19, + "grad_norm": 0.5575089454650879, + "learning_rate": 0.0005938072879747338, + "loss": 3.47, + "step": 3966 + }, + { + "epoch": 0.19, + "grad_norm": 0.5552845597267151, + "learning_rate": 0.0005938041754757844, + "loss": 3.384, + "step": 3967 + }, + { + "epoch": 0.19, + "grad_norm": 0.5382720828056335, + "learning_rate": 0.0005938010622030112, + "loss": 3.4518, + "step": 3968 + }, + { + "epoch": 0.19, + "grad_norm": 0.5792542695999146, + "learning_rate": 0.0005937979481564227, + "loss": 3.3024, + "step": 3969 + }, + { + "epoch": 0.19, + "grad_norm": 0.574998676776886, + "learning_rate": 0.000593794833336027, + "loss": 3.4806, + "step": 3970 + }, + { + "epoch": 0.19, + "grad_norm": 0.6154451966285706, + "learning_rate": 0.000593791717741832, + "loss": 3.3694, + "step": 3971 + }, + { + "epoch": 0.19, + "grad_norm": 0.5885059237480164, + "learning_rate": 0.0005937886013738464, + "loss": 3.3851, + "step": 3972 + }, + { + "epoch": 0.19, + "grad_norm": 0.5663660168647766, + "learning_rate": 0.000593785484232078, + "loss": 3.5687, + "step": 3973 + }, + { + "epoch": 0.19, + "grad_norm": 0.5735329389572144, + "learning_rate": 0.0005937823663165352, + "loss": 3.3934, + "step": 3974 + }, + { + "epoch": 0.19, + "grad_norm": 0.6525985598564148, + "learning_rate": 0.0005937792476272262, + "loss": 3.328, + "step": 3975 + }, + { + "epoch": 0.19, + "grad_norm": 0.586041271686554, + "learning_rate": 0.0005937761281641594, + "loss": 3.4332, + "step": 3976 + }, + { + "epoch": 0.19, + "grad_norm": 0.563795804977417, + "learning_rate": 0.0005937730079273426, + "loss": 3.6729, + "step": 3977 + }, + { + "epoch": 0.19, + "grad_norm": 0.5492075681686401, + "learning_rate": 0.0005937698869167842, + "loss": 3.4315, + "step": 3978 + }, + { + "epoch": 0.2, + "grad_norm": 0.6151273846626282, + "learning_rate": 0.0005937667651324925, + "loss": 3.7117, + "step": 3979 + }, + { + "epoch": 0.2, + "grad_norm": 0.5914956331253052, + "learning_rate": 0.0005937636425744757, + "loss": 3.4685, + "step": 3980 + }, + { + "epoch": 0.2, + "grad_norm": 0.5682784914970398, + "learning_rate": 0.0005937605192427422, + "loss": 3.2269, + "step": 3981 + }, + { + "epoch": 0.2, + "grad_norm": 0.5962632298469543, + "learning_rate": 0.0005937573951372997, + "loss": 3.4843, + "step": 3982 + }, + { + "epoch": 0.2, + "grad_norm": 0.6012601852416992, + "learning_rate": 0.0005937542702581569, + "loss": 3.4149, + "step": 3983 + }, + { + "epoch": 0.2, + "grad_norm": 0.556812047958374, + "learning_rate": 0.0005937511446053219, + "loss": 3.5567, + "step": 3984 + }, + { + "epoch": 0.2, + "grad_norm": 0.6080185174942017, + "learning_rate": 0.0005937480181788029, + "loss": 3.5601, + "step": 3985 + }, + { + "epoch": 0.2, + "grad_norm": 0.6003567576408386, + "learning_rate": 0.0005937448909786083, + "loss": 3.5517, + "step": 3986 + }, + { + "epoch": 0.2, + "grad_norm": 0.575144350528717, + "learning_rate": 0.0005937417630047459, + "loss": 3.415, + "step": 3987 + }, + { + "epoch": 0.2, + "grad_norm": 0.5906019806861877, + "learning_rate": 0.0005937386342572244, + "loss": 3.6005, + "step": 3988 + }, + { + "epoch": 0.2, + "grad_norm": 0.564968466758728, + "learning_rate": 0.0005937355047360519, + "loss": 3.4319, + "step": 3989 + }, + { + "epoch": 0.2, + "grad_norm": 0.5609432458877563, + "learning_rate": 0.0005937323744412365, + "loss": 3.3004, + "step": 3990 + }, + { + "epoch": 0.2, + "grad_norm": 0.5350099205970764, + "learning_rate": 0.0005937292433727866, + "loss": 3.5976, + "step": 3991 + }, + { + "epoch": 0.2, + "grad_norm": 0.5905284881591797, + "learning_rate": 0.0005937261115307104, + "loss": 3.5769, + "step": 3992 + }, + { + "epoch": 0.2, + "grad_norm": 0.5806431770324707, + "learning_rate": 0.0005937229789150162, + "loss": 3.6226, + "step": 3993 + }, + { + "epoch": 0.2, + "grad_norm": 0.5419069528579712, + "learning_rate": 0.0005937198455257122, + "loss": 3.8644, + "step": 3994 + }, + { + "epoch": 0.2, + "grad_norm": 0.5670180916786194, + "learning_rate": 0.0005937167113628067, + "loss": 3.4316, + "step": 3995 + }, + { + "epoch": 0.2, + "grad_norm": 0.5351924896240234, + "learning_rate": 0.0005937135764263077, + "loss": 3.5708, + "step": 3996 + }, + { + "epoch": 0.2, + "grad_norm": 0.5488182306289673, + "learning_rate": 0.0005937104407162238, + "loss": 3.5572, + "step": 3997 + }, + { + "epoch": 0.2, + "grad_norm": 0.5685787200927734, + "learning_rate": 0.0005937073042325631, + "loss": 3.4464, + "step": 3998 + }, + { + "epoch": 0.2, + "grad_norm": 0.5353884696960449, + "learning_rate": 0.000593704166975334, + "loss": 3.4783, + "step": 3999 + }, + { + "epoch": 0.2, + "grad_norm": 0.6061869859695435, + "learning_rate": 0.0005937010289445444, + "loss": 3.2644, + "step": 4000 + }, + { + "epoch": 0.2, + "grad_norm": 0.571092426776886, + "learning_rate": 0.000593697890140203, + "loss": 3.5456, + "step": 4001 + }, + { + "epoch": 0.2, + "grad_norm": 0.6306237578392029, + "learning_rate": 0.0005936947505623179, + "loss": 3.3862, + "step": 4002 + }, + { + "epoch": 0.2, + "grad_norm": 0.5207166075706482, + "learning_rate": 0.0005936916102108974, + "loss": 3.3985, + "step": 4003 + }, + { + "epoch": 0.2, + "grad_norm": 0.6106595396995544, + "learning_rate": 0.0005936884690859495, + "loss": 3.6107, + "step": 4004 + }, + { + "epoch": 0.2, + "grad_norm": 0.6063200235366821, + "learning_rate": 0.0005936853271874829, + "loss": 3.5122, + "step": 4005 + }, + { + "epoch": 0.2, + "grad_norm": 0.5411254167556763, + "learning_rate": 0.0005936821845155055, + "loss": 3.4418, + "step": 4006 + }, + { + "epoch": 0.2, + "grad_norm": 0.5724605321884155, + "learning_rate": 0.0005936790410700259, + "loss": 3.455, + "step": 4007 + }, + { + "epoch": 0.2, + "grad_norm": 0.5903576016426086, + "learning_rate": 0.0005936758968510521, + "loss": 3.4542, + "step": 4008 + }, + { + "epoch": 0.2, + "grad_norm": 0.5617760419845581, + "learning_rate": 0.0005936727518585925, + "loss": 3.5287, + "step": 4009 + }, + { + "epoch": 0.2, + "grad_norm": 0.57075035572052, + "learning_rate": 0.0005936696060926555, + "loss": 3.54, + "step": 4010 + }, + { + "epoch": 0.2, + "grad_norm": 0.5305947065353394, + "learning_rate": 0.0005936664595532493, + "loss": 3.7036, + "step": 4011 + }, + { + "epoch": 0.2, + "grad_norm": 0.5543831586837769, + "learning_rate": 0.000593663312240382, + "loss": 3.4049, + "step": 4012 + }, + { + "epoch": 0.2, + "grad_norm": 0.5600700974464417, + "learning_rate": 0.0005936601641540622, + "loss": 3.8008, + "step": 4013 + }, + { + "epoch": 0.2, + "grad_norm": 0.5779480934143066, + "learning_rate": 0.0005936570152942979, + "loss": 3.7728, + "step": 4014 + }, + { + "epoch": 0.2, + "grad_norm": 0.5971601009368896, + "learning_rate": 0.0005936538656610977, + "loss": 3.4096, + "step": 4015 + }, + { + "epoch": 0.2, + "grad_norm": 0.5580905675888062, + "learning_rate": 0.0005936507152544695, + "loss": 3.6679, + "step": 4016 + }, + { + "epoch": 0.2, + "grad_norm": 0.6468621492385864, + "learning_rate": 0.0005936475640744221, + "loss": 3.6336, + "step": 4017 + }, + { + "epoch": 0.2, + "grad_norm": 0.6058899164199829, + "learning_rate": 0.0005936444121209634, + "loss": 3.4921, + "step": 4018 + }, + { + "epoch": 0.2, + "grad_norm": 0.5818850994110107, + "learning_rate": 0.0005936412593941019, + "loss": 3.4677, + "step": 4019 + }, + { + "epoch": 0.2, + "grad_norm": 0.5825283527374268, + "learning_rate": 0.0005936381058938457, + "loss": 3.4282, + "step": 4020 + }, + { + "epoch": 0.2, + "grad_norm": 0.5536349415779114, + "learning_rate": 0.0005936349516202034, + "loss": 3.3787, + "step": 4021 + }, + { + "epoch": 0.2, + "grad_norm": 0.664323091506958, + "learning_rate": 0.000593631796573183, + "loss": 3.4861, + "step": 4022 + }, + { + "epoch": 0.2, + "grad_norm": 0.5751873254776001, + "learning_rate": 0.0005936286407527931, + "loss": 3.5405, + "step": 4023 + }, + { + "epoch": 0.2, + "grad_norm": 0.5620585083961487, + "learning_rate": 0.0005936254841590417, + "loss": 3.2407, + "step": 4024 + }, + { + "epoch": 0.2, + "grad_norm": 0.5601255893707275, + "learning_rate": 0.0005936223267919374, + "loss": 3.3897, + "step": 4025 + }, + { + "epoch": 0.2, + "grad_norm": 0.6205532550811768, + "learning_rate": 0.0005936191686514884, + "loss": 3.5952, + "step": 4026 + }, + { + "epoch": 0.2, + "grad_norm": 0.5451340675354004, + "learning_rate": 0.000593616009737703, + "loss": 3.3103, + "step": 4027 + }, + { + "epoch": 0.2, + "grad_norm": 0.5882269144058228, + "learning_rate": 0.0005936128500505896, + "loss": 3.274, + "step": 4028 + }, + { + "epoch": 0.2, + "grad_norm": 0.5673192739486694, + "learning_rate": 0.0005936096895901564, + "loss": 3.5805, + "step": 4029 + }, + { + "epoch": 0.2, + "grad_norm": 0.5817726850509644, + "learning_rate": 0.0005936065283564117, + "loss": 3.4446, + "step": 4030 + }, + { + "epoch": 0.2, + "grad_norm": 0.6373004913330078, + "learning_rate": 0.0005936033663493641, + "loss": 3.662, + "step": 4031 + }, + { + "epoch": 0.2, + "grad_norm": 0.5789350271224976, + "learning_rate": 0.0005936002035690216, + "loss": 3.4954, + "step": 4032 + }, + { + "epoch": 0.2, + "grad_norm": 0.5490628480911255, + "learning_rate": 0.0005935970400153927, + "loss": 3.4452, + "step": 4033 + }, + { + "epoch": 0.2, + "grad_norm": 0.611883282661438, + "learning_rate": 0.0005935938756884857, + "loss": 3.5924, + "step": 4034 + }, + { + "epoch": 0.2, + "grad_norm": 0.5716962814331055, + "learning_rate": 0.000593590710588309, + "loss": 3.4642, + "step": 4035 + }, + { + "epoch": 0.2, + "grad_norm": 0.5935686230659485, + "learning_rate": 0.0005935875447148707, + "loss": 3.3395, + "step": 4036 + }, + { + "epoch": 0.2, + "grad_norm": 0.5877057909965515, + "learning_rate": 0.0005935843780681795, + "loss": 3.3124, + "step": 4037 + }, + { + "epoch": 0.2, + "grad_norm": 0.6173941493034363, + "learning_rate": 0.0005935812106482433, + "loss": 3.606, + "step": 4038 + }, + { + "epoch": 0.2, + "grad_norm": 0.5606823563575745, + "learning_rate": 0.0005935780424550709, + "loss": 3.6931, + "step": 4039 + }, + { + "epoch": 0.2, + "grad_norm": 0.6052460074424744, + "learning_rate": 0.0005935748734886704, + "loss": 3.4013, + "step": 4040 + }, + { + "epoch": 0.2, + "grad_norm": 0.586379885673523, + "learning_rate": 0.0005935717037490501, + "loss": 3.4105, + "step": 4041 + }, + { + "epoch": 0.2, + "grad_norm": 0.5679966807365417, + "learning_rate": 0.0005935685332362186, + "loss": 3.5497, + "step": 4042 + }, + { + "epoch": 0.2, + "grad_norm": 0.5713095664978027, + "learning_rate": 0.000593565361950184, + "loss": 3.4705, + "step": 4043 + }, + { + "epoch": 0.2, + "grad_norm": 0.5471493005752563, + "learning_rate": 0.0005935621898909547, + "loss": 3.4439, + "step": 4044 + }, + { + "epoch": 0.2, + "grad_norm": 0.5692678093910217, + "learning_rate": 0.0005935590170585391, + "loss": 3.4251, + "step": 4045 + }, + { + "epoch": 0.2, + "grad_norm": 0.5359514951705933, + "learning_rate": 0.0005935558434529456, + "loss": 3.705, + "step": 4046 + }, + { + "epoch": 0.2, + "grad_norm": 0.6026199460029602, + "learning_rate": 0.0005935526690741824, + "loss": 3.5045, + "step": 4047 + }, + { + "epoch": 0.2, + "grad_norm": 0.5974529385566711, + "learning_rate": 0.0005935494939222581, + "loss": 3.3607, + "step": 4048 + }, + { + "epoch": 0.2, + "grad_norm": 0.583665132522583, + "learning_rate": 0.0005935463179971808, + "loss": 3.7261, + "step": 4049 + }, + { + "epoch": 0.2, + "grad_norm": 0.5661903619766235, + "learning_rate": 0.0005935431412989591, + "loss": 3.591, + "step": 4050 + }, + { + "epoch": 0.2, + "grad_norm": 0.5681086778640747, + "learning_rate": 0.0005935399638276012, + "loss": 3.5113, + "step": 4051 + }, + { + "epoch": 0.2, + "grad_norm": 0.5938650965690613, + "learning_rate": 0.0005935367855831155, + "loss": 3.4974, + "step": 4052 + }, + { + "epoch": 0.2, + "grad_norm": 0.5689106583595276, + "learning_rate": 0.0005935336065655104, + "loss": 3.4858, + "step": 4053 + }, + { + "epoch": 0.2, + "grad_norm": 0.552765965461731, + "learning_rate": 0.0005935304267747943, + "loss": 3.6311, + "step": 4054 + }, + { + "epoch": 0.2, + "grad_norm": 0.5879870057106018, + "learning_rate": 0.0005935272462109756, + "loss": 3.4236, + "step": 4055 + }, + { + "epoch": 0.2, + "grad_norm": 0.5558753609657288, + "learning_rate": 0.0005935240648740625, + "loss": 3.447, + "step": 4056 + }, + { + "epoch": 0.2, + "grad_norm": 0.5875343084335327, + "learning_rate": 0.0005935208827640636, + "loss": 3.5598, + "step": 4057 + }, + { + "epoch": 0.2, + "grad_norm": 0.580474853515625, + "learning_rate": 0.0005935176998809873, + "loss": 3.5608, + "step": 4058 + }, + { + "epoch": 0.2, + "grad_norm": 0.5907713770866394, + "learning_rate": 0.0005935145162248416, + "loss": 3.2178, + "step": 4059 + }, + { + "epoch": 0.2, + "grad_norm": 0.5516918301582336, + "learning_rate": 0.0005935113317956353, + "loss": 3.4183, + "step": 4060 + }, + { + "epoch": 0.2, + "grad_norm": 0.5646939277648926, + "learning_rate": 0.0005935081465933767, + "loss": 3.4653, + "step": 4061 + }, + { + "epoch": 0.2, + "grad_norm": 0.5440240502357483, + "learning_rate": 0.000593504960618074, + "loss": 3.7186, + "step": 4062 + }, + { + "epoch": 0.2, + "grad_norm": 0.5458465218544006, + "learning_rate": 0.0005935017738697357, + "loss": 3.4928, + "step": 4063 + }, + { + "epoch": 0.2, + "grad_norm": 0.6689546704292297, + "learning_rate": 0.0005934985863483704, + "loss": 3.3547, + "step": 4064 + }, + { + "epoch": 0.2, + "grad_norm": 0.6123103499412537, + "learning_rate": 0.0005934953980539861, + "loss": 3.375, + "step": 4065 + }, + { + "epoch": 0.2, + "grad_norm": 0.5664320588111877, + "learning_rate": 0.0005934922089865915, + "loss": 3.449, + "step": 4066 + }, + { + "epoch": 0.2, + "grad_norm": 0.6381657719612122, + "learning_rate": 0.0005934890191461949, + "loss": 3.4441, + "step": 4067 + }, + { + "epoch": 0.2, + "grad_norm": 0.5956173539161682, + "learning_rate": 0.0005934858285328048, + "loss": 3.6867, + "step": 4068 + }, + { + "epoch": 0.2, + "grad_norm": 0.6595749855041504, + "learning_rate": 0.0005934826371464294, + "loss": 3.3421, + "step": 4069 + }, + { + "epoch": 0.2, + "grad_norm": 0.5597966313362122, + "learning_rate": 0.0005934794449870772, + "loss": 3.5673, + "step": 4070 + }, + { + "epoch": 0.2, + "grad_norm": 0.5948126912117004, + "learning_rate": 0.0005934762520547566, + "loss": 3.7263, + "step": 4071 + }, + { + "epoch": 0.2, + "grad_norm": 0.5999248027801514, + "learning_rate": 0.0005934730583494761, + "loss": 3.4787, + "step": 4072 + }, + { + "epoch": 0.2, + "grad_norm": 0.6411767601966858, + "learning_rate": 0.0005934698638712441, + "loss": 3.4928, + "step": 4073 + }, + { + "epoch": 0.2, + "grad_norm": 0.5557070374488831, + "learning_rate": 0.0005934666686200689, + "loss": 3.5091, + "step": 4074 + }, + { + "epoch": 0.2, + "grad_norm": 0.5832461714744568, + "learning_rate": 0.0005934634725959589, + "loss": 3.5301, + "step": 4075 + }, + { + "epoch": 0.2, + "grad_norm": 0.5976592302322388, + "learning_rate": 0.0005934602757989227, + "loss": 3.4888, + "step": 4076 + }, + { + "epoch": 0.2, + "grad_norm": 0.5408334732055664, + "learning_rate": 0.0005934570782289686, + "loss": 3.6005, + "step": 4077 + }, + { + "epoch": 0.2, + "grad_norm": 0.5419313311576843, + "learning_rate": 0.000593453879886105, + "loss": 3.5205, + "step": 4078 + }, + { + "epoch": 0.2, + "grad_norm": 0.6286296844482422, + "learning_rate": 0.0005934506807703403, + "loss": 3.4126, + "step": 4079 + }, + { + "epoch": 0.2, + "grad_norm": 0.5880137085914612, + "learning_rate": 0.0005934474808816831, + "loss": 3.5638, + "step": 4080 + }, + { + "epoch": 0.2, + "grad_norm": 0.5868574976921082, + "learning_rate": 0.0005934442802201416, + "loss": 3.4796, + "step": 4081 + }, + { + "epoch": 0.2, + "grad_norm": 0.5653226375579834, + "learning_rate": 0.0005934410787857245, + "loss": 3.0373, + "step": 4082 + }, + { + "epoch": 0.2, + "grad_norm": 0.5920324921607971, + "learning_rate": 0.0005934378765784399, + "loss": 3.5852, + "step": 4083 + }, + { + "epoch": 0.2, + "grad_norm": 0.5790802836418152, + "learning_rate": 0.0005934346735982964, + "loss": 3.6043, + "step": 4084 + }, + { + "epoch": 0.2, + "grad_norm": 0.5777429938316345, + "learning_rate": 0.0005934314698453026, + "loss": 3.3492, + "step": 4085 + }, + { + "epoch": 0.2, + "grad_norm": 0.569974422454834, + "learning_rate": 0.0005934282653194667, + "loss": 3.4926, + "step": 4086 + }, + { + "epoch": 0.2, + "grad_norm": 0.6177379488945007, + "learning_rate": 0.0005934250600207972, + "loss": 3.6035, + "step": 4087 + }, + { + "epoch": 0.2, + "grad_norm": 0.5291566848754883, + "learning_rate": 0.0005934218539493026, + "loss": 3.5606, + "step": 4088 + }, + { + "epoch": 0.2, + "grad_norm": 0.5401890873908997, + "learning_rate": 0.0005934186471049913, + "loss": 3.4682, + "step": 4089 + }, + { + "epoch": 0.2, + "grad_norm": 0.5783869028091431, + "learning_rate": 0.0005934154394878717, + "loss": 3.4999, + "step": 4090 + }, + { + "epoch": 0.2, + "grad_norm": 0.5341452360153198, + "learning_rate": 0.0005934122310979524, + "loss": 3.4623, + "step": 4091 + }, + { + "epoch": 0.2, + "grad_norm": 0.5385696887969971, + "learning_rate": 0.0005934090219352416, + "loss": 3.6001, + "step": 4092 + }, + { + "epoch": 0.2, + "grad_norm": 0.6240125298500061, + "learning_rate": 0.000593405811999748, + "loss": 3.3475, + "step": 4093 + }, + { + "epoch": 0.2, + "grad_norm": 0.5452415347099304, + "learning_rate": 0.00059340260129148, + "loss": 3.4982, + "step": 4094 + }, + { + "epoch": 0.2, + "grad_norm": 0.5494929552078247, + "learning_rate": 0.000593399389810446, + "loss": 3.4822, + "step": 4095 + }, + { + "epoch": 0.2, + "grad_norm": 0.55720055103302, + "learning_rate": 0.0005933961775566543, + "loss": 3.4194, + "step": 4096 + }, + { + "epoch": 0.2, + "grad_norm": 0.5275405049324036, + "learning_rate": 0.0005933929645301138, + "loss": 3.4621, + "step": 4097 + }, + { + "epoch": 0.2, + "grad_norm": 0.559727668762207, + "learning_rate": 0.0005933897507308325, + "loss": 3.5763, + "step": 4098 + }, + { + "epoch": 0.2, + "grad_norm": 0.5319221019744873, + "learning_rate": 0.0005933865361588192, + "loss": 3.3231, + "step": 4099 + }, + { + "epoch": 0.2, + "grad_norm": 0.5701916217803955, + "learning_rate": 0.0005933833208140822, + "loss": 3.5887, + "step": 4100 + }, + { + "epoch": 0.2, + "grad_norm": 0.6000038981437683, + "learning_rate": 0.0005933801046966299, + "loss": 3.5659, + "step": 4101 + }, + { + "epoch": 0.2, + "grad_norm": 0.5542453527450562, + "learning_rate": 0.0005933768878064709, + "loss": 3.6189, + "step": 4102 + }, + { + "epoch": 0.2, + "grad_norm": 0.5546266436576843, + "learning_rate": 0.0005933736701436137, + "loss": 3.4361, + "step": 4103 + }, + { + "epoch": 0.2, + "grad_norm": 0.5334687232971191, + "learning_rate": 0.0005933704517080666, + "loss": 3.5126, + "step": 4104 + }, + { + "epoch": 0.2, + "grad_norm": 0.5554494857788086, + "learning_rate": 0.0005933672324998382, + "loss": 3.291, + "step": 4105 + }, + { + "epoch": 0.2, + "grad_norm": 0.5818085670471191, + "learning_rate": 0.0005933640125189371, + "loss": 3.2228, + "step": 4106 + }, + { + "epoch": 0.2, + "grad_norm": 0.5691803097724915, + "learning_rate": 0.0005933607917653715, + "loss": 3.4671, + "step": 4107 + }, + { + "epoch": 0.2, + "grad_norm": 0.5687941908836365, + "learning_rate": 0.0005933575702391501, + "loss": 3.3455, + "step": 4108 + }, + { + "epoch": 0.2, + "grad_norm": 0.5801530480384827, + "learning_rate": 0.0005933543479402814, + "loss": 3.5252, + "step": 4109 + }, + { + "epoch": 0.2, + "grad_norm": 0.5499988198280334, + "learning_rate": 0.0005933511248687738, + "loss": 3.4909, + "step": 4110 + }, + { + "epoch": 0.2, + "grad_norm": 0.5364803671836853, + "learning_rate": 0.0005933479010246357, + "loss": 3.6644, + "step": 4111 + }, + { + "epoch": 0.2, + "grad_norm": 0.5526125431060791, + "learning_rate": 0.0005933446764078758, + "loss": 3.3869, + "step": 4112 + }, + { + "epoch": 0.2, + "grad_norm": 0.5705166459083557, + "learning_rate": 0.0005933414510185023, + "loss": 3.4363, + "step": 4113 + }, + { + "epoch": 0.2, + "grad_norm": 0.6104483008384705, + "learning_rate": 0.0005933382248565239, + "loss": 3.4521, + "step": 4114 + }, + { + "epoch": 0.2, + "grad_norm": 0.6703182458877563, + "learning_rate": 0.0005933349979219492, + "loss": 3.538, + "step": 4115 + }, + { + "epoch": 0.2, + "grad_norm": 0.5792509913444519, + "learning_rate": 0.0005933317702147865, + "loss": 3.4318, + "step": 4116 + }, + { + "epoch": 0.2, + "grad_norm": 0.5312807559967041, + "learning_rate": 0.0005933285417350444, + "loss": 3.477, + "step": 4117 + }, + { + "epoch": 0.2, + "grad_norm": 0.5642654895782471, + "learning_rate": 0.0005933253124827313, + "loss": 3.5236, + "step": 4118 + }, + { + "epoch": 0.2, + "grad_norm": 0.5508056879043579, + "learning_rate": 0.0005933220824578559, + "loss": 3.6048, + "step": 4119 + }, + { + "epoch": 0.2, + "grad_norm": 0.5829192399978638, + "learning_rate": 0.0005933188516604264, + "loss": 3.4721, + "step": 4120 + }, + { + "epoch": 0.2, + "grad_norm": 0.5596129298210144, + "learning_rate": 0.0005933156200904516, + "loss": 3.2922, + "step": 4121 + }, + { + "epoch": 0.2, + "grad_norm": 0.5868514776229858, + "learning_rate": 0.00059331238774794, + "loss": 3.4381, + "step": 4122 + }, + { + "epoch": 0.2, + "grad_norm": 0.5288069248199463, + "learning_rate": 0.0005933091546328999, + "loss": 3.3479, + "step": 4123 + }, + { + "epoch": 0.2, + "grad_norm": 0.5082468390464783, + "learning_rate": 0.00059330592074534, + "loss": 3.5534, + "step": 4124 + }, + { + "epoch": 0.2, + "grad_norm": 0.5732336640357971, + "learning_rate": 0.0005933026860852688, + "loss": 3.3907, + "step": 4125 + }, + { + "epoch": 0.2, + "grad_norm": 0.5370706915855408, + "learning_rate": 0.0005932994506526947, + "loss": 3.6477, + "step": 4126 + }, + { + "epoch": 0.2, + "grad_norm": 0.5239009857177734, + "learning_rate": 0.0005932962144476262, + "loss": 3.6473, + "step": 4127 + }, + { + "epoch": 0.2, + "grad_norm": 0.566159725189209, + "learning_rate": 0.000593292977470072, + "loss": 3.687, + "step": 4128 + }, + { + "epoch": 0.2, + "grad_norm": 0.5577471256256104, + "learning_rate": 0.0005932897397200405, + "loss": 3.5333, + "step": 4129 + }, + { + "epoch": 0.2, + "grad_norm": 0.6108332276344299, + "learning_rate": 0.0005932865011975403, + "loss": 3.4125, + "step": 4130 + }, + { + "epoch": 0.2, + "grad_norm": 0.5488579869270325, + "learning_rate": 0.00059328326190258, + "loss": 3.4491, + "step": 4131 + }, + { + "epoch": 0.2, + "grad_norm": 0.522066056728363, + "learning_rate": 0.000593280021835168, + "loss": 3.4321, + "step": 4132 + }, + { + "epoch": 0.2, + "grad_norm": 0.5913230776786804, + "learning_rate": 0.0005932767809953128, + "loss": 3.6254, + "step": 4133 + }, + { + "epoch": 0.2, + "grad_norm": 0.5286851525306702, + "learning_rate": 0.0005932735393830229, + "loss": 3.493, + "step": 4134 + }, + { + "epoch": 0.2, + "grad_norm": 0.5211114883422852, + "learning_rate": 0.0005932702969983071, + "loss": 3.7179, + "step": 4135 + }, + { + "epoch": 0.2, + "grad_norm": 0.5448617339134216, + "learning_rate": 0.0005932670538411737, + "loss": 3.5312, + "step": 4136 + }, + { + "epoch": 0.2, + "grad_norm": 0.5416504740715027, + "learning_rate": 0.0005932638099116314, + "loss": 3.523, + "step": 4137 + }, + { + "epoch": 0.2, + "grad_norm": 0.5332238078117371, + "learning_rate": 0.0005932605652096887, + "loss": 3.5286, + "step": 4138 + }, + { + "epoch": 0.2, + "grad_norm": 0.5574766993522644, + "learning_rate": 0.0005932573197353539, + "loss": 3.2139, + "step": 4139 + }, + { + "epoch": 0.2, + "grad_norm": 0.5327970385551453, + "learning_rate": 0.000593254073488636, + "loss": 3.4693, + "step": 4140 + }, + { + "epoch": 0.2, + "grad_norm": 0.5722629427909851, + "learning_rate": 0.0005932508264695431, + "loss": 3.429, + "step": 4141 + }, + { + "epoch": 0.2, + "grad_norm": 0.5846360921859741, + "learning_rate": 0.0005932475786780841, + "loss": 3.4747, + "step": 4142 + }, + { + "epoch": 0.2, + "grad_norm": 0.5634985566139221, + "learning_rate": 0.0005932443301142673, + "loss": 3.4314, + "step": 4143 + }, + { + "epoch": 0.2, + "grad_norm": 0.5535609722137451, + "learning_rate": 0.0005932410807781015, + "loss": 3.4342, + "step": 4144 + }, + { + "epoch": 0.2, + "grad_norm": 0.5832380652427673, + "learning_rate": 0.0005932378306695951, + "loss": 3.2958, + "step": 4145 + }, + { + "epoch": 0.2, + "grad_norm": 0.5615767240524292, + "learning_rate": 0.0005932345797887566, + "loss": 3.4979, + "step": 4146 + }, + { + "epoch": 0.2, + "grad_norm": 0.5781761407852173, + "learning_rate": 0.0005932313281355947, + "loss": 3.5425, + "step": 4147 + }, + { + "epoch": 0.2, + "grad_norm": 0.5604632496833801, + "learning_rate": 0.000593228075710118, + "loss": 3.5805, + "step": 4148 + }, + { + "epoch": 0.2, + "grad_norm": 0.6127885580062866, + "learning_rate": 0.0005932248225123349, + "loss": 3.4723, + "step": 4149 + }, + { + "epoch": 0.2, + "grad_norm": 0.5799558162689209, + "learning_rate": 0.000593221568542254, + "loss": 3.4249, + "step": 4150 + }, + { + "epoch": 0.2, + "grad_norm": 0.5683193802833557, + "learning_rate": 0.0005932183137998839, + "loss": 3.5735, + "step": 4151 + }, + { + "epoch": 0.2, + "grad_norm": 0.5393085479736328, + "learning_rate": 0.0005932150582852333, + "loss": 3.3668, + "step": 4152 + }, + { + "epoch": 0.2, + "grad_norm": 0.6012470126152039, + "learning_rate": 0.0005932118019983107, + "loss": 3.3904, + "step": 4153 + }, + { + "epoch": 0.2, + "grad_norm": 0.5704240798950195, + "learning_rate": 0.0005932085449391245, + "loss": 3.4991, + "step": 4154 + }, + { + "epoch": 0.2, + "grad_norm": 0.6046105623245239, + "learning_rate": 0.0005932052871076835, + "loss": 3.531, + "step": 4155 + }, + { + "epoch": 0.2, + "grad_norm": 0.5393555760383606, + "learning_rate": 0.0005932020285039963, + "loss": 3.6832, + "step": 4156 + }, + { + "epoch": 0.2, + "grad_norm": 0.5603122115135193, + "learning_rate": 0.0005931987691280712, + "loss": 3.5048, + "step": 4157 + }, + { + "epoch": 0.2, + "grad_norm": 0.5598936676979065, + "learning_rate": 0.0005931955089799172, + "loss": 3.4841, + "step": 4158 + }, + { + "epoch": 0.2, + "grad_norm": 0.6121480464935303, + "learning_rate": 0.0005931922480595424, + "loss": 3.4819, + "step": 4159 + }, + { + "epoch": 0.2, + "grad_norm": 0.5565993189811707, + "learning_rate": 0.0005931889863669558, + "loss": 3.5906, + "step": 4160 + }, + { + "epoch": 0.2, + "grad_norm": 0.5580215454101562, + "learning_rate": 0.0005931857239021657, + "loss": 3.4656, + "step": 4161 + }, + { + "epoch": 0.2, + "grad_norm": 0.604782223701477, + "learning_rate": 0.000593182460665181, + "loss": 3.3578, + "step": 4162 + }, + { + "epoch": 0.2, + "grad_norm": 0.5579813122749329, + "learning_rate": 0.00059317919665601, + "loss": 3.5514, + "step": 4163 + }, + { + "epoch": 0.2, + "grad_norm": 0.5879456400871277, + "learning_rate": 0.0005931759318746615, + "loss": 3.5933, + "step": 4164 + }, + { + "epoch": 0.2, + "grad_norm": 0.5648056268692017, + "learning_rate": 0.000593172666321144, + "loss": 3.4556, + "step": 4165 + }, + { + "epoch": 0.2, + "grad_norm": 0.6041473150253296, + "learning_rate": 0.0005931693999954661, + "loss": 3.6027, + "step": 4166 + }, + { + "epoch": 0.2, + "grad_norm": 0.5803984999656677, + "learning_rate": 0.0005931661328976365, + "loss": 3.3593, + "step": 4167 + }, + { + "epoch": 0.2, + "grad_norm": 0.5941168665885925, + "learning_rate": 0.0005931628650276636, + "loss": 3.4784, + "step": 4168 + }, + { + "epoch": 0.2, + "grad_norm": 0.5952200293540955, + "learning_rate": 0.0005931595963855562, + "loss": 3.4204, + "step": 4169 + }, + { + "epoch": 0.2, + "grad_norm": 0.5529075860977173, + "learning_rate": 0.0005931563269713228, + "loss": 3.5687, + "step": 4170 + }, + { + "epoch": 0.2, + "grad_norm": 0.5690220594406128, + "learning_rate": 0.000593153056784972, + "loss": 3.5338, + "step": 4171 + }, + { + "epoch": 0.2, + "grad_norm": 0.6204742789268494, + "learning_rate": 0.0005931497858265126, + "loss": 3.4925, + "step": 4172 + }, + { + "epoch": 0.2, + "grad_norm": 0.6174020171165466, + "learning_rate": 0.000593146514095953, + "loss": 3.4493, + "step": 4173 + }, + { + "epoch": 0.2, + "grad_norm": 0.5436307191848755, + "learning_rate": 0.000593143241593302, + "loss": 3.5383, + "step": 4174 + }, + { + "epoch": 0.2, + "grad_norm": 0.5812988877296448, + "learning_rate": 0.0005931399683185679, + "loss": 3.265, + "step": 4175 + }, + { + "epoch": 0.2, + "grad_norm": 0.6355542540550232, + "learning_rate": 0.0005931366942717597, + "loss": 3.4879, + "step": 4176 + }, + { + "epoch": 0.2, + "grad_norm": 0.5769966840744019, + "learning_rate": 0.0005931334194528859, + "loss": 3.4083, + "step": 4177 + }, + { + "epoch": 0.2, + "grad_norm": 0.5602624416351318, + "learning_rate": 0.000593130143861955, + "loss": 3.4455, + "step": 4178 + }, + { + "epoch": 0.2, + "grad_norm": 0.5522497892379761, + "learning_rate": 0.0005931268674989758, + "loss": 3.589, + "step": 4179 + }, + { + "epoch": 0.2, + "grad_norm": 0.6143437623977661, + "learning_rate": 0.0005931235903639568, + "loss": 3.39, + "step": 4180 + }, + { + "epoch": 0.2, + "grad_norm": 0.5582066774368286, + "learning_rate": 0.0005931203124569066, + "loss": 3.2465, + "step": 4181 + }, + { + "epoch": 0.2, + "grad_norm": 0.5512022972106934, + "learning_rate": 0.000593117033777834, + "loss": 3.3506, + "step": 4182 + }, + { + "epoch": 0.2, + "grad_norm": 0.5688873529434204, + "learning_rate": 0.0005931137543267476, + "loss": 3.6862, + "step": 4183 + }, + { + "epoch": 0.21, + "grad_norm": 0.5612702369689941, + "learning_rate": 0.0005931104741036558, + "loss": 3.4739, + "step": 4184 + }, + { + "epoch": 0.21, + "grad_norm": 0.5855376720428467, + "learning_rate": 0.0005931071931085676, + "loss": 3.4008, + "step": 4185 + }, + { + "epoch": 0.21, + "grad_norm": 0.5829062461853027, + "learning_rate": 0.0005931039113414914, + "loss": 3.3817, + "step": 4186 + }, + { + "epoch": 0.21, + "grad_norm": 0.5704607963562012, + "learning_rate": 0.0005931006288024358, + "loss": 3.5417, + "step": 4187 + }, + { + "epoch": 0.21, + "grad_norm": 0.5722986459732056, + "learning_rate": 0.0005930973454914097, + "loss": 3.6125, + "step": 4188 + }, + { + "epoch": 0.21, + "grad_norm": 0.5302977561950684, + "learning_rate": 0.0005930940614084216, + "loss": 3.2082, + "step": 4189 + }, + { + "epoch": 0.21, + "grad_norm": 0.5311842560768127, + "learning_rate": 0.00059309077655348, + "loss": 3.4686, + "step": 4190 + }, + { + "epoch": 0.21, + "grad_norm": 0.5523838996887207, + "learning_rate": 0.0005930874909265938, + "loss": 3.525, + "step": 4191 + }, + { + "epoch": 0.21, + "grad_norm": 0.6022278070449829, + "learning_rate": 0.0005930842045277716, + "loss": 3.3398, + "step": 4192 + }, + { + "epoch": 0.21, + "grad_norm": 0.588112473487854, + "learning_rate": 0.000593080917357022, + "loss": 3.4084, + "step": 4193 + }, + { + "epoch": 0.21, + "grad_norm": 0.5542445182800293, + "learning_rate": 0.0005930776294143536, + "loss": 3.2733, + "step": 4194 + }, + { + "epoch": 0.21, + "grad_norm": 0.6074942350387573, + "learning_rate": 0.0005930743406997752, + "loss": 3.5434, + "step": 4195 + }, + { + "epoch": 0.21, + "grad_norm": 0.5761960744857788, + "learning_rate": 0.0005930710512132954, + "loss": 3.6245, + "step": 4196 + }, + { + "epoch": 0.21, + "grad_norm": 0.5809390544891357, + "learning_rate": 0.0005930677609549227, + "loss": 3.5375, + "step": 4197 + }, + { + "epoch": 0.21, + "grad_norm": 0.572761595249176, + "learning_rate": 0.000593064469924666, + "loss": 3.5079, + "step": 4198 + }, + { + "epoch": 0.21, + "grad_norm": 0.5556721687316895, + "learning_rate": 0.000593061178122534, + "loss": 3.4619, + "step": 4199 + }, + { + "epoch": 0.21, + "grad_norm": 0.5726044178009033, + "learning_rate": 0.0005930578855485351, + "loss": 3.3863, + "step": 4200 + }, + { + "epoch": 0.21, + "grad_norm": 0.5410187840461731, + "learning_rate": 0.0005930545922026783, + "loss": 3.3838, + "step": 4201 + }, + { + "epoch": 0.21, + "grad_norm": 0.5719176530838013, + "learning_rate": 0.0005930512980849719, + "loss": 3.6133, + "step": 4202 + }, + { + "epoch": 0.21, + "grad_norm": 0.5681977272033691, + "learning_rate": 0.0005930480031954249, + "loss": 3.5757, + "step": 4203 + }, + { + "epoch": 0.21, + "grad_norm": 0.5701371431350708, + "learning_rate": 0.0005930447075340458, + "loss": 3.2612, + "step": 4204 + }, + { + "epoch": 0.21, + "grad_norm": 0.8378312587738037, + "learning_rate": 0.0005930414111008435, + "loss": 3.5506, + "step": 4205 + }, + { + "epoch": 0.21, + "grad_norm": 0.6150882840156555, + "learning_rate": 0.0005930381138958263, + "loss": 3.4457, + "step": 4206 + }, + { + "epoch": 0.21, + "grad_norm": 0.5677423477172852, + "learning_rate": 0.0005930348159190031, + "loss": 3.3982, + "step": 4207 + }, + { + "epoch": 0.21, + "grad_norm": 0.6335375308990479, + "learning_rate": 0.0005930315171703827, + "loss": 3.3809, + "step": 4208 + }, + { + "epoch": 0.21, + "grad_norm": 0.5192340612411499, + "learning_rate": 0.0005930282176499738, + "loss": 3.4675, + "step": 4209 + }, + { + "epoch": 0.21, + "grad_norm": 0.5766927599906921, + "learning_rate": 0.0005930249173577848, + "loss": 3.5231, + "step": 4210 + }, + { + "epoch": 0.21, + "grad_norm": 0.5748723149299622, + "learning_rate": 0.0005930216162938246, + "loss": 3.4761, + "step": 4211 + }, + { + "epoch": 0.21, + "grad_norm": 0.549691379070282, + "learning_rate": 0.0005930183144581019, + "loss": 3.5352, + "step": 4212 + }, + { + "epoch": 0.21, + "grad_norm": 0.5617498755455017, + "learning_rate": 0.0005930150118506253, + "loss": 3.5251, + "step": 4213 + }, + { + "epoch": 0.21, + "grad_norm": 0.5673946738243103, + "learning_rate": 0.0005930117084714036, + "loss": 3.4009, + "step": 4214 + }, + { + "epoch": 0.21, + "grad_norm": 0.5976900458335876, + "learning_rate": 0.0005930084043204454, + "loss": 3.5733, + "step": 4215 + }, + { + "epoch": 0.21, + "grad_norm": 0.5517022013664246, + "learning_rate": 0.0005930050993977594, + "loss": 3.4349, + "step": 4216 + }, + { + "epoch": 0.21, + "grad_norm": 0.6215957999229431, + "learning_rate": 0.0005930017937033545, + "loss": 3.7329, + "step": 4217 + }, + { + "epoch": 0.21, + "grad_norm": 0.5812935829162598, + "learning_rate": 0.0005929984872372391, + "loss": 3.5745, + "step": 4218 + }, + { + "epoch": 0.21, + "grad_norm": 0.5533018112182617, + "learning_rate": 0.0005929951799994222, + "loss": 3.5439, + "step": 4219 + }, + { + "epoch": 0.21, + "grad_norm": 0.556189775466919, + "learning_rate": 0.0005929918719899123, + "loss": 3.6134, + "step": 4220 + }, + { + "epoch": 0.21, + "grad_norm": 0.5521323084831238, + "learning_rate": 0.0005929885632087183, + "loss": 3.3643, + "step": 4221 + }, + { + "epoch": 0.21, + "grad_norm": 0.6353476643562317, + "learning_rate": 0.0005929852536558487, + "loss": 3.2529, + "step": 4222 + }, + { + "epoch": 0.21, + "grad_norm": 0.6318731904029846, + "learning_rate": 0.0005929819433313124, + "loss": 3.5494, + "step": 4223 + }, + { + "epoch": 0.21, + "grad_norm": 0.5776653289794922, + "learning_rate": 0.0005929786322351181, + "loss": 3.4537, + "step": 4224 + }, + { + "epoch": 0.21, + "grad_norm": 0.5921760201454163, + "learning_rate": 0.0005929753203672743, + "loss": 3.5448, + "step": 4225 + }, + { + "epoch": 0.21, + "grad_norm": 0.5609504580497742, + "learning_rate": 0.00059297200772779, + "loss": 3.4304, + "step": 4226 + }, + { + "epoch": 0.21, + "grad_norm": 0.5797039866447449, + "learning_rate": 0.0005929686943166738, + "loss": 3.4262, + "step": 4227 + }, + { + "epoch": 0.21, + "grad_norm": 0.578444242477417, + "learning_rate": 0.0005929653801339344, + "loss": 3.2239, + "step": 4228 + }, + { + "epoch": 0.21, + "grad_norm": 0.5497398376464844, + "learning_rate": 0.0005929620651795806, + "loss": 3.4644, + "step": 4229 + }, + { + "epoch": 0.21, + "grad_norm": 0.5600916743278503, + "learning_rate": 0.0005929587494536212, + "loss": 3.4196, + "step": 4230 + }, + { + "epoch": 0.21, + "grad_norm": 0.5911729335784912, + "learning_rate": 0.0005929554329560647, + "loss": 3.2735, + "step": 4231 + }, + { + "epoch": 0.21, + "grad_norm": 0.5636935234069824, + "learning_rate": 0.00059295211568692, + "loss": 3.4634, + "step": 4232 + }, + { + "epoch": 0.21, + "grad_norm": 0.5678104758262634, + "learning_rate": 0.0005929487976461959, + "loss": 3.5156, + "step": 4233 + }, + { + "epoch": 0.21, + "grad_norm": 0.5603824853897095, + "learning_rate": 0.0005929454788339009, + "loss": 3.3868, + "step": 4234 + }, + { + "epoch": 0.21, + "grad_norm": 0.5836139917373657, + "learning_rate": 0.000592942159250044, + "loss": 3.5746, + "step": 4235 + }, + { + "epoch": 0.21, + "grad_norm": 0.7062385678291321, + "learning_rate": 0.0005929388388946338, + "loss": 3.3934, + "step": 4236 + }, + { + "epoch": 0.21, + "grad_norm": 0.5844118595123291, + "learning_rate": 0.000592935517767679, + "loss": 3.494, + "step": 4237 + }, + { + "epoch": 0.21, + "grad_norm": 0.5271828174591064, + "learning_rate": 0.0005929321958691885, + "loss": 3.4183, + "step": 4238 + }, + { + "epoch": 0.21, + "grad_norm": 0.528325617313385, + "learning_rate": 0.000592928873199171, + "loss": 3.3322, + "step": 4239 + }, + { + "epoch": 0.21, + "grad_norm": 0.5398619174957275, + "learning_rate": 0.0005929255497576353, + "loss": 3.4529, + "step": 4240 + }, + { + "epoch": 0.21, + "grad_norm": 0.5449317693710327, + "learning_rate": 0.0005929222255445899, + "loss": 3.6722, + "step": 4241 + }, + { + "epoch": 0.21, + "grad_norm": 0.5887119770050049, + "learning_rate": 0.0005929189005600438, + "loss": 3.5835, + "step": 4242 + }, + { + "epoch": 0.21, + "grad_norm": 0.5551977753639221, + "learning_rate": 0.0005929155748040057, + "loss": 3.3044, + "step": 4243 + }, + { + "epoch": 0.21, + "grad_norm": 0.5693026781082153, + "learning_rate": 0.0005929122482764844, + "loss": 3.3308, + "step": 4244 + }, + { + "epoch": 0.21, + "grad_norm": 0.6203973293304443, + "learning_rate": 0.0005929089209774883, + "loss": 3.5217, + "step": 4245 + }, + { + "epoch": 0.21, + "grad_norm": 0.6212824583053589, + "learning_rate": 0.0005929055929070269, + "loss": 3.4606, + "step": 4246 + }, + { + "epoch": 0.21, + "grad_norm": 0.5740115642547607, + "learning_rate": 0.0005929022640651083, + "loss": 3.4078, + "step": 4247 + }, + { + "epoch": 0.21, + "grad_norm": 0.5441243052482605, + "learning_rate": 0.0005928989344517415, + "loss": 3.4294, + "step": 4248 + }, + { + "epoch": 0.21, + "grad_norm": 0.6401486396789551, + "learning_rate": 0.0005928956040669353, + "loss": 3.4507, + "step": 4249 + }, + { + "epoch": 0.21, + "grad_norm": 0.5190750360488892, + "learning_rate": 0.0005928922729106986, + "loss": 3.4062, + "step": 4250 + }, + { + "epoch": 0.21, + "grad_norm": 0.5887166857719421, + "learning_rate": 0.0005928889409830398, + "loss": 3.5247, + "step": 4251 + }, + { + "epoch": 0.21, + "grad_norm": 0.5587601065635681, + "learning_rate": 0.0005928856082839681, + "loss": 3.462, + "step": 4252 + }, + { + "epoch": 0.21, + "grad_norm": 0.5775784254074097, + "learning_rate": 0.0005928822748134919, + "loss": 3.5834, + "step": 4253 + }, + { + "epoch": 0.21, + "grad_norm": 0.5799462795257568, + "learning_rate": 0.0005928789405716203, + "loss": 3.524, + "step": 4254 + }, + { + "epoch": 0.21, + "grad_norm": 0.6155045628547668, + "learning_rate": 0.0005928756055583619, + "loss": 3.4856, + "step": 4255 + }, + { + "epoch": 0.21, + "grad_norm": 0.5401473641395569, + "learning_rate": 0.0005928722697737254, + "loss": 3.4083, + "step": 4256 + }, + { + "epoch": 0.21, + "grad_norm": 0.5645717978477478, + "learning_rate": 0.0005928689332177199, + "loss": 3.6458, + "step": 4257 + }, + { + "epoch": 0.21, + "grad_norm": 0.5968051552772522, + "learning_rate": 0.0005928655958903538, + "loss": 3.6914, + "step": 4258 + }, + { + "epoch": 0.21, + "grad_norm": 0.5574849843978882, + "learning_rate": 0.0005928622577916362, + "loss": 3.4648, + "step": 4259 + }, + { + "epoch": 0.21, + "grad_norm": 0.5590183138847351, + "learning_rate": 0.0005928589189215757, + "loss": 3.5068, + "step": 4260 + }, + { + "epoch": 0.21, + "grad_norm": 0.5854844450950623, + "learning_rate": 0.0005928555792801812, + "loss": 3.4404, + "step": 4261 + }, + { + "epoch": 0.21, + "grad_norm": 0.5542050004005432, + "learning_rate": 0.0005928522388674616, + "loss": 3.7172, + "step": 4262 + }, + { + "epoch": 0.21, + "grad_norm": 0.5726750493049622, + "learning_rate": 0.0005928488976834254, + "loss": 3.5171, + "step": 4263 + }, + { + "epoch": 0.21, + "grad_norm": 0.5503663420677185, + "learning_rate": 0.0005928455557280815, + "loss": 3.5306, + "step": 4264 + }, + { + "epoch": 0.21, + "grad_norm": 0.5870798826217651, + "learning_rate": 0.000592842213001439, + "loss": 3.416, + "step": 4265 + }, + { + "epoch": 0.21, + "grad_norm": 0.5509695410728455, + "learning_rate": 0.0005928388695035064, + "loss": 3.6081, + "step": 4266 + }, + { + "epoch": 0.21, + "grad_norm": 0.578829824924469, + "learning_rate": 0.0005928355252342925, + "loss": 3.4515, + "step": 4267 + }, + { + "epoch": 0.21, + "grad_norm": 0.6161765456199646, + "learning_rate": 0.0005928321801938061, + "loss": 3.5032, + "step": 4268 + }, + { + "epoch": 0.21, + "grad_norm": 0.5503228306770325, + "learning_rate": 0.0005928288343820563, + "loss": 3.3577, + "step": 4269 + }, + { + "epoch": 0.21, + "grad_norm": 0.550617516040802, + "learning_rate": 0.0005928254877990515, + "loss": 3.2324, + "step": 4270 + }, + { + "epoch": 0.21, + "grad_norm": 0.5668909549713135, + "learning_rate": 0.0005928221404448008, + "loss": 3.431, + "step": 4271 + }, + { + "epoch": 0.21, + "grad_norm": 0.5765318274497986, + "learning_rate": 0.000592818792319313, + "loss": 3.5756, + "step": 4272 + }, + { + "epoch": 0.21, + "grad_norm": 0.6077347993850708, + "learning_rate": 0.0005928154434225968, + "loss": 3.5527, + "step": 4273 + }, + { + "epoch": 0.21, + "grad_norm": 0.5627986788749695, + "learning_rate": 0.0005928120937546611, + "loss": 3.8691, + "step": 4274 + }, + { + "epoch": 0.21, + "grad_norm": 0.5574711561203003, + "learning_rate": 0.0005928087433155147, + "loss": 3.5428, + "step": 4275 + }, + { + "epoch": 0.21, + "grad_norm": 0.5420995950698853, + "learning_rate": 0.0005928053921051664, + "loss": 3.4421, + "step": 4276 + }, + { + "epoch": 0.21, + "grad_norm": 0.6338256001472473, + "learning_rate": 0.000592802040123625, + "loss": 3.3063, + "step": 4277 + }, + { + "epoch": 0.21, + "grad_norm": 0.572799026966095, + "learning_rate": 0.0005927986873708995, + "loss": 3.4216, + "step": 4278 + }, + { + "epoch": 0.21, + "grad_norm": 0.5351685285568237, + "learning_rate": 0.0005927953338469986, + "loss": 3.391, + "step": 4279 + }, + { + "epoch": 0.21, + "grad_norm": 0.5662745237350464, + "learning_rate": 0.0005927919795519311, + "loss": 3.5054, + "step": 4280 + }, + { + "epoch": 0.21, + "grad_norm": 0.5973455905914307, + "learning_rate": 0.0005927886244857058, + "loss": 3.5656, + "step": 4281 + }, + { + "epoch": 0.21, + "grad_norm": 0.5676031112670898, + "learning_rate": 0.0005927852686483317, + "loss": 3.2299, + "step": 4282 + }, + { + "epoch": 0.21, + "grad_norm": 0.5569466352462769, + "learning_rate": 0.0005927819120398175, + "loss": 3.4992, + "step": 4283 + }, + { + "epoch": 0.21, + "grad_norm": 0.5708233714103699, + "learning_rate": 0.0005927785546601721, + "loss": 3.5776, + "step": 4284 + }, + { + "epoch": 0.21, + "grad_norm": 0.5976619124412537, + "learning_rate": 0.0005927751965094044, + "loss": 3.3507, + "step": 4285 + }, + { + "epoch": 0.21, + "grad_norm": 0.5370445847511292, + "learning_rate": 0.0005927718375875231, + "loss": 3.3449, + "step": 4286 + }, + { + "epoch": 0.21, + "grad_norm": 0.5878073573112488, + "learning_rate": 0.0005927684778945371, + "loss": 3.4856, + "step": 4287 + }, + { + "epoch": 0.21, + "grad_norm": 0.5714808702468872, + "learning_rate": 0.0005927651174304553, + "loss": 3.6395, + "step": 4288 + }, + { + "epoch": 0.21, + "grad_norm": 0.5233124494552612, + "learning_rate": 0.0005927617561952866, + "loss": 3.4073, + "step": 4289 + }, + { + "epoch": 0.21, + "grad_norm": 0.5612361431121826, + "learning_rate": 0.0005927583941890398, + "loss": 3.3293, + "step": 4290 + }, + { + "epoch": 0.21, + "grad_norm": 0.5414638519287109, + "learning_rate": 0.0005927550314117235, + "loss": 3.4124, + "step": 4291 + }, + { + "epoch": 0.21, + "grad_norm": 0.586334764957428, + "learning_rate": 0.0005927516678633471, + "loss": 3.387, + "step": 4292 + }, + { + "epoch": 0.21, + "grad_norm": 0.5697146654129028, + "learning_rate": 0.000592748303543919, + "loss": 3.6256, + "step": 4293 + }, + { + "epoch": 0.21, + "grad_norm": 0.5857806205749512, + "learning_rate": 0.0005927449384534482, + "loss": 3.5125, + "step": 4294 + }, + { + "epoch": 0.21, + "grad_norm": 0.5624878406524658, + "learning_rate": 0.0005927415725919435, + "loss": 3.5866, + "step": 4295 + }, + { + "epoch": 0.21, + "grad_norm": 0.5444654226303101, + "learning_rate": 0.0005927382059594139, + "loss": 3.6301, + "step": 4296 + }, + { + "epoch": 0.21, + "grad_norm": 0.5403390526771545, + "learning_rate": 0.0005927348385558682, + "loss": 3.4554, + "step": 4297 + }, + { + "epoch": 0.21, + "grad_norm": 0.5376538634300232, + "learning_rate": 0.0005927314703813154, + "loss": 3.464, + "step": 4298 + }, + { + "epoch": 0.21, + "grad_norm": 0.5038751363754272, + "learning_rate": 0.000592728101435764, + "loss": 3.5199, + "step": 4299 + }, + { + "epoch": 0.21, + "grad_norm": 0.5603412389755249, + "learning_rate": 0.0005927247317192233, + "loss": 3.2781, + "step": 4300 + }, + { + "epoch": 0.21, + "grad_norm": 0.5391646027565002, + "learning_rate": 0.0005927213612317019, + "loss": 3.533, + "step": 4301 + }, + { + "epoch": 0.21, + "grad_norm": 0.6210039258003235, + "learning_rate": 0.0005927179899732088, + "loss": 3.4641, + "step": 4302 + }, + { + "epoch": 0.21, + "grad_norm": 0.5959204435348511, + "learning_rate": 0.0005927146179437529, + "loss": 3.5338, + "step": 4303 + }, + { + "epoch": 0.21, + "grad_norm": 0.53571617603302, + "learning_rate": 0.000592711245143343, + "loss": 3.5324, + "step": 4304 + }, + { + "epoch": 0.21, + "grad_norm": 0.5784791707992554, + "learning_rate": 0.000592707871571988, + "loss": 3.6262, + "step": 4305 + }, + { + "epoch": 0.21, + "grad_norm": 0.5615180134773254, + "learning_rate": 0.0005927044972296969, + "loss": 3.4118, + "step": 4306 + }, + { + "epoch": 0.21, + "grad_norm": 0.5299209952354431, + "learning_rate": 0.0005927011221164783, + "loss": 3.7879, + "step": 4307 + }, + { + "epoch": 0.21, + "grad_norm": 0.5538467168807983, + "learning_rate": 0.0005926977462323414, + "loss": 3.3704, + "step": 4308 + }, + { + "epoch": 0.21, + "grad_norm": 0.6038463711738586, + "learning_rate": 0.0005926943695772949, + "loss": 3.517, + "step": 4309 + }, + { + "epoch": 0.21, + "grad_norm": 0.5968214869499207, + "learning_rate": 0.0005926909921513477, + "loss": 3.4321, + "step": 4310 + }, + { + "epoch": 0.21, + "grad_norm": 0.5371940732002258, + "learning_rate": 0.0005926876139545089, + "loss": 3.4314, + "step": 4311 + }, + { + "epoch": 0.21, + "grad_norm": 0.6479960680007935, + "learning_rate": 0.0005926842349867873, + "loss": 3.4106, + "step": 4312 + }, + { + "epoch": 0.21, + "grad_norm": 0.5392265319824219, + "learning_rate": 0.0005926808552481917, + "loss": 3.6161, + "step": 4313 + }, + { + "epoch": 0.21, + "grad_norm": 0.572577714920044, + "learning_rate": 0.000592677474738731, + "loss": 3.5508, + "step": 4314 + }, + { + "epoch": 0.21, + "grad_norm": 0.5596196055412292, + "learning_rate": 0.0005926740934584141, + "loss": 3.5483, + "step": 4315 + }, + { + "epoch": 0.21, + "grad_norm": 0.5759965777397156, + "learning_rate": 0.0005926707114072501, + "loss": 3.411, + "step": 4316 + }, + { + "epoch": 0.21, + "grad_norm": 0.53867506980896, + "learning_rate": 0.0005926673285852477, + "loss": 3.5226, + "step": 4317 + }, + { + "epoch": 0.21, + "grad_norm": 0.5533181428909302, + "learning_rate": 0.0005926639449924158, + "loss": 3.5157, + "step": 4318 + }, + { + "epoch": 0.21, + "grad_norm": 0.5320466756820679, + "learning_rate": 0.0005926605606287635, + "loss": 3.6545, + "step": 4319 + }, + { + "epoch": 0.21, + "grad_norm": 0.5725945234298706, + "learning_rate": 0.0005926571754942996, + "loss": 3.4898, + "step": 4320 + }, + { + "epoch": 0.21, + "grad_norm": 0.633961021900177, + "learning_rate": 0.000592653789589033, + "loss": 3.4104, + "step": 4321 + }, + { + "epoch": 0.21, + "grad_norm": 0.5438200831413269, + "learning_rate": 0.0005926504029129726, + "loss": 3.4986, + "step": 4322 + }, + { + "epoch": 0.21, + "grad_norm": 0.5423779487609863, + "learning_rate": 0.0005926470154661275, + "loss": 3.2405, + "step": 4323 + }, + { + "epoch": 0.21, + "grad_norm": 0.5243359804153442, + "learning_rate": 0.0005926436272485064, + "loss": 3.6668, + "step": 4324 + }, + { + "epoch": 0.21, + "grad_norm": 0.5646772384643555, + "learning_rate": 0.0005926402382601183, + "loss": 3.2187, + "step": 4325 + }, + { + "epoch": 0.21, + "grad_norm": 0.5363373756408691, + "learning_rate": 0.0005926368485009721, + "loss": 3.6333, + "step": 4326 + }, + { + "epoch": 0.21, + "grad_norm": 0.5637429356575012, + "learning_rate": 0.0005926334579710768, + "loss": 3.7082, + "step": 4327 + }, + { + "epoch": 0.21, + "grad_norm": 0.5417162775993347, + "learning_rate": 0.0005926300666704413, + "loss": 3.6215, + "step": 4328 + }, + { + "epoch": 0.21, + "grad_norm": 0.5589267015457153, + "learning_rate": 0.0005926266745990745, + "loss": 3.4844, + "step": 4329 + }, + { + "epoch": 0.21, + "grad_norm": 0.5703160762786865, + "learning_rate": 0.0005926232817569853, + "loss": 3.4402, + "step": 4330 + }, + { + "epoch": 0.21, + "grad_norm": 0.5476822257041931, + "learning_rate": 0.0005926198881441828, + "loss": 3.4692, + "step": 4331 + }, + { + "epoch": 0.21, + "grad_norm": 0.5585446357727051, + "learning_rate": 0.0005926164937606758, + "loss": 3.5845, + "step": 4332 + }, + { + "epoch": 0.21, + "grad_norm": 0.540649950504303, + "learning_rate": 0.0005926130986064733, + "loss": 3.4894, + "step": 4333 + }, + { + "epoch": 0.21, + "grad_norm": 0.5446892976760864, + "learning_rate": 0.0005926097026815842, + "loss": 3.4582, + "step": 4334 + }, + { + "epoch": 0.21, + "grad_norm": 0.5253145694732666, + "learning_rate": 0.0005926063059860173, + "loss": 3.4895, + "step": 4335 + }, + { + "epoch": 0.21, + "grad_norm": 0.5359606742858887, + "learning_rate": 0.0005926029085197819, + "loss": 3.4371, + "step": 4336 + }, + { + "epoch": 0.21, + "grad_norm": 0.5448620915412903, + "learning_rate": 0.0005925995102828867, + "loss": 3.4806, + "step": 4337 + }, + { + "epoch": 0.21, + "grad_norm": 0.5616055130958557, + "learning_rate": 0.0005925961112753406, + "loss": 3.3458, + "step": 4338 + }, + { + "epoch": 0.21, + "grad_norm": 0.584311842918396, + "learning_rate": 0.0005925927114971527, + "loss": 3.4402, + "step": 4339 + }, + { + "epoch": 0.21, + "grad_norm": 0.5829342603683472, + "learning_rate": 0.000592589310948332, + "loss": 3.4649, + "step": 4340 + }, + { + "epoch": 0.21, + "grad_norm": 0.5744354128837585, + "learning_rate": 0.0005925859096288874, + "loss": 3.5193, + "step": 4341 + }, + { + "epoch": 0.21, + "grad_norm": 0.5684018731117249, + "learning_rate": 0.0005925825075388277, + "loss": 3.4638, + "step": 4342 + }, + { + "epoch": 0.21, + "grad_norm": 0.5490890741348267, + "learning_rate": 0.000592579104678162, + "loss": 3.593, + "step": 4343 + }, + { + "epoch": 0.21, + "grad_norm": 0.6014746427536011, + "learning_rate": 0.0005925757010468993, + "loss": 3.2675, + "step": 4344 + }, + { + "epoch": 0.21, + "grad_norm": 0.6437888145446777, + "learning_rate": 0.0005925722966450485, + "loss": 3.4341, + "step": 4345 + }, + { + "epoch": 0.21, + "grad_norm": 0.5803835391998291, + "learning_rate": 0.0005925688914726185, + "loss": 3.368, + "step": 4346 + }, + { + "epoch": 0.21, + "grad_norm": 0.5558992028236389, + "learning_rate": 0.0005925654855296183, + "loss": 3.5642, + "step": 4347 + }, + { + "epoch": 0.21, + "grad_norm": 0.6441301703453064, + "learning_rate": 0.0005925620788160571, + "loss": 3.2692, + "step": 4348 + }, + { + "epoch": 0.21, + "grad_norm": 0.5592367649078369, + "learning_rate": 0.0005925586713319436, + "loss": 3.5717, + "step": 4349 + }, + { + "epoch": 0.21, + "grad_norm": 0.5774462223052979, + "learning_rate": 0.0005925552630772869, + "loss": 3.5024, + "step": 4350 + }, + { + "epoch": 0.21, + "grad_norm": 0.6486428380012512, + "learning_rate": 0.0005925518540520958, + "loss": 3.4881, + "step": 4351 + }, + { + "epoch": 0.21, + "grad_norm": 0.5728597044944763, + "learning_rate": 0.0005925484442563795, + "loss": 3.3829, + "step": 4352 + }, + { + "epoch": 0.21, + "grad_norm": 0.5495771765708923, + "learning_rate": 0.000592545033690147, + "loss": 3.5991, + "step": 4353 + }, + { + "epoch": 0.21, + "grad_norm": 0.5710741281509399, + "learning_rate": 0.0005925416223534071, + "loss": 3.3564, + "step": 4354 + }, + { + "epoch": 0.21, + "grad_norm": 0.6084657311439514, + "learning_rate": 0.0005925382102461689, + "loss": 3.3813, + "step": 4355 + }, + { + "epoch": 0.21, + "grad_norm": 0.5266708135604858, + "learning_rate": 0.0005925347973684414, + "loss": 3.3215, + "step": 4356 + }, + { + "epoch": 0.21, + "grad_norm": 0.6160021424293518, + "learning_rate": 0.0005925313837202334, + "loss": 3.5516, + "step": 4357 + }, + { + "epoch": 0.21, + "grad_norm": 0.5864583253860474, + "learning_rate": 0.0005925279693015541, + "loss": 3.5683, + "step": 4358 + }, + { + "epoch": 0.21, + "grad_norm": 0.5891723036766052, + "learning_rate": 0.0005925245541124124, + "loss": 3.3166, + "step": 4359 + }, + { + "epoch": 0.21, + "grad_norm": 0.5659295916557312, + "learning_rate": 0.0005925211381528175, + "loss": 3.4871, + "step": 4360 + }, + { + "epoch": 0.21, + "grad_norm": 0.609416127204895, + "learning_rate": 0.000592517721422778, + "loss": 3.4384, + "step": 4361 + }, + { + "epoch": 0.21, + "grad_norm": 0.518798828125, + "learning_rate": 0.0005925143039223032, + "loss": 3.3999, + "step": 4362 + }, + { + "epoch": 0.21, + "grad_norm": 0.6183592081069946, + "learning_rate": 0.000592510885651402, + "loss": 3.5708, + "step": 4363 + }, + { + "epoch": 0.21, + "grad_norm": 0.522781491279602, + "learning_rate": 0.0005925074666100834, + "loss": 3.5036, + "step": 4364 + }, + { + "epoch": 0.21, + "grad_norm": 0.600847601890564, + "learning_rate": 0.0005925040467983564, + "loss": 3.4813, + "step": 4365 + }, + { + "epoch": 0.21, + "grad_norm": 0.556460440158844, + "learning_rate": 0.0005925006262162302, + "loss": 3.5809, + "step": 4366 + }, + { + "epoch": 0.21, + "grad_norm": 0.5647059082984924, + "learning_rate": 0.0005924972048637135, + "loss": 3.5571, + "step": 4367 + }, + { + "epoch": 0.21, + "grad_norm": 0.5725759267807007, + "learning_rate": 0.0005924937827408154, + "loss": 3.4548, + "step": 4368 + }, + { + "epoch": 0.21, + "grad_norm": 0.5902033448219299, + "learning_rate": 0.0005924903598475451, + "loss": 3.4404, + "step": 4369 + }, + { + "epoch": 0.21, + "grad_norm": 0.723042905330658, + "learning_rate": 0.0005924869361839115, + "loss": 3.5888, + "step": 4370 + }, + { + "epoch": 0.21, + "grad_norm": 0.7233627438545227, + "learning_rate": 0.0005924835117499235, + "loss": 3.3951, + "step": 4371 + }, + { + "epoch": 0.21, + "grad_norm": 0.5561408996582031, + "learning_rate": 0.0005924800865455903, + "loss": 3.6309, + "step": 4372 + }, + { + "epoch": 0.21, + "grad_norm": 0.5627115964889526, + "learning_rate": 0.0005924766605709209, + "loss": 3.5393, + "step": 4373 + }, + { + "epoch": 0.21, + "grad_norm": 0.5121928453445435, + "learning_rate": 0.000592473233825924, + "loss": 3.5696, + "step": 4374 + }, + { + "epoch": 0.21, + "grad_norm": 0.6076629757881165, + "learning_rate": 0.0005924698063106091, + "loss": 3.4886, + "step": 4375 + }, + { + "epoch": 0.21, + "grad_norm": 0.5451004505157471, + "learning_rate": 0.0005924663780249851, + "loss": 3.3686, + "step": 4376 + }, + { + "epoch": 0.21, + "grad_norm": 0.582217276096344, + "learning_rate": 0.0005924629489690608, + "loss": 3.5121, + "step": 4377 + }, + { + "epoch": 0.21, + "grad_norm": 0.5545997023582458, + "learning_rate": 0.0005924595191428454, + "loss": 3.5433, + "step": 4378 + }, + { + "epoch": 0.21, + "grad_norm": 0.5341416001319885, + "learning_rate": 0.0005924560885463479, + "loss": 3.5336, + "step": 4379 + }, + { + "epoch": 0.21, + "grad_norm": 0.6326627731323242, + "learning_rate": 0.0005924526571795774, + "loss": 3.4165, + "step": 4380 + }, + { + "epoch": 0.21, + "grad_norm": 0.5663824677467346, + "learning_rate": 0.0005924492250425428, + "loss": 3.5157, + "step": 4381 + }, + { + "epoch": 0.21, + "grad_norm": 0.5439797043800354, + "learning_rate": 0.0005924457921352533, + "loss": 3.4163, + "step": 4382 + }, + { + "epoch": 0.21, + "grad_norm": 0.5896198749542236, + "learning_rate": 0.0005924423584577178, + "loss": 3.5275, + "step": 4383 + }, + { + "epoch": 0.21, + "grad_norm": 0.5586664080619812, + "learning_rate": 0.0005924389240099454, + "loss": 3.576, + "step": 4384 + }, + { + "epoch": 0.21, + "grad_norm": 0.5929638743400574, + "learning_rate": 0.0005924354887919452, + "loss": 3.5569, + "step": 4385 + }, + { + "epoch": 0.21, + "grad_norm": 0.6475085020065308, + "learning_rate": 0.0005924320528037263, + "loss": 3.602, + "step": 4386 + }, + { + "epoch": 0.21, + "grad_norm": 0.5904268622398376, + "learning_rate": 0.0005924286160452974, + "loss": 3.4897, + "step": 4387 + }, + { + "epoch": 0.22, + "grad_norm": 0.5506060719490051, + "learning_rate": 0.000592425178516668, + "loss": 3.5867, + "step": 4388 + }, + { + "epoch": 0.22, + "grad_norm": 0.6361829042434692, + "learning_rate": 0.000592421740217847, + "loss": 3.6221, + "step": 4389 + }, + { + "epoch": 0.22, + "grad_norm": 0.5692065954208374, + "learning_rate": 0.0005924183011488433, + "loss": 3.6743, + "step": 4390 + }, + { + "epoch": 0.22, + "grad_norm": 0.5525909066200256, + "learning_rate": 0.000592414861309666, + "loss": 3.4595, + "step": 4391 + }, + { + "epoch": 0.22, + "grad_norm": 0.5622319579124451, + "learning_rate": 0.0005924114207003243, + "loss": 3.5213, + "step": 4392 + }, + { + "epoch": 0.22, + "grad_norm": 0.5368682146072388, + "learning_rate": 0.0005924079793208273, + "loss": 3.5174, + "step": 4393 + }, + { + "epoch": 0.22, + "grad_norm": 0.5590381622314453, + "learning_rate": 0.000592404537171184, + "loss": 3.4644, + "step": 4394 + }, + { + "epoch": 0.22, + "grad_norm": 0.5523700714111328, + "learning_rate": 0.0005924010942514032, + "loss": 3.3879, + "step": 4395 + }, + { + "epoch": 0.22, + "grad_norm": 0.5766987204551697, + "learning_rate": 0.0005923976505614944, + "loss": 3.7217, + "step": 4396 + }, + { + "epoch": 0.22, + "grad_norm": 0.5824809670448303, + "learning_rate": 0.0005923942061014663, + "loss": 3.459, + "step": 4397 + }, + { + "epoch": 0.22, + "grad_norm": 0.5317344665527344, + "learning_rate": 0.0005923907608713282, + "loss": 3.3727, + "step": 4398 + }, + { + "epoch": 0.22, + "grad_norm": 0.5514039993286133, + "learning_rate": 0.0005923873148710892, + "loss": 3.3122, + "step": 4399 + }, + { + "epoch": 0.22, + "grad_norm": 0.5581009387969971, + "learning_rate": 0.0005923838681007581, + "loss": 3.6301, + "step": 4400 + }, + { + "epoch": 0.22, + "grad_norm": 0.5631245374679565, + "learning_rate": 0.0005923804205603442, + "loss": 3.1971, + "step": 4401 + }, + { + "epoch": 0.22, + "grad_norm": 0.5507313013076782, + "learning_rate": 0.0005923769722498566, + "loss": 3.4534, + "step": 4402 + }, + { + "epoch": 0.22, + "grad_norm": 0.5625492334365845, + "learning_rate": 0.0005923735231693043, + "loss": 3.5629, + "step": 4403 + }, + { + "epoch": 0.22, + "grad_norm": 0.599851131439209, + "learning_rate": 0.0005923700733186965, + "loss": 3.6198, + "step": 4404 + }, + { + "epoch": 0.22, + "grad_norm": 0.6086528897285461, + "learning_rate": 0.0005923666226980421, + "loss": 3.5182, + "step": 4405 + }, + { + "epoch": 0.22, + "grad_norm": 0.5781559944152832, + "learning_rate": 0.0005923631713073503, + "loss": 3.3736, + "step": 4406 + }, + { + "epoch": 0.22, + "grad_norm": 0.5884630680084229, + "learning_rate": 0.0005923597191466301, + "loss": 3.4333, + "step": 4407 + }, + { + "epoch": 0.22, + "grad_norm": 0.5860154628753662, + "learning_rate": 0.0005923562662158907, + "loss": 3.5609, + "step": 4408 + }, + { + "epoch": 0.22, + "grad_norm": 0.5450155138969421, + "learning_rate": 0.0005923528125151412, + "loss": 3.3551, + "step": 4409 + }, + { + "epoch": 0.22, + "grad_norm": 0.6374961733818054, + "learning_rate": 0.0005923493580443907, + "loss": 3.4167, + "step": 4410 + }, + { + "epoch": 0.22, + "grad_norm": 0.562510073184967, + "learning_rate": 0.0005923459028036482, + "loss": 3.3272, + "step": 4411 + }, + { + "epoch": 0.22, + "grad_norm": 0.5314709544181824, + "learning_rate": 0.0005923424467929229, + "loss": 3.4734, + "step": 4412 + }, + { + "epoch": 0.22, + "grad_norm": 0.5496847629547119, + "learning_rate": 0.0005923389900122239, + "loss": 3.0824, + "step": 4413 + }, + { + "epoch": 0.22, + "grad_norm": 0.5986039638519287, + "learning_rate": 0.0005923355324615601, + "loss": 3.5452, + "step": 4414 + }, + { + "epoch": 0.22, + "grad_norm": 0.5644450187683105, + "learning_rate": 0.0005923320741409409, + "loss": 3.3707, + "step": 4415 + }, + { + "epoch": 0.22, + "grad_norm": 0.5469602346420288, + "learning_rate": 0.0005923286150503753, + "loss": 3.5989, + "step": 4416 + }, + { + "epoch": 0.22, + "grad_norm": 0.5269728899002075, + "learning_rate": 0.0005923251551898722, + "loss": 3.6738, + "step": 4417 + }, + { + "epoch": 0.22, + "grad_norm": 0.5269190073013306, + "learning_rate": 0.000592321694559441, + "loss": 3.6033, + "step": 4418 + }, + { + "epoch": 0.22, + "grad_norm": 0.5562129020690918, + "learning_rate": 0.0005923182331590908, + "loss": 3.4572, + "step": 4419 + }, + { + "epoch": 0.22, + "grad_norm": 0.5670672655105591, + "learning_rate": 0.0005923147709888305, + "loss": 3.4981, + "step": 4420 + }, + { + "epoch": 0.22, + "grad_norm": 0.5788316130638123, + "learning_rate": 0.0005923113080486695, + "loss": 3.3878, + "step": 4421 + }, + { + "epoch": 0.22, + "grad_norm": 0.585292637348175, + "learning_rate": 0.0005923078443386166, + "loss": 3.3861, + "step": 4422 + }, + { + "epoch": 0.22, + "grad_norm": 0.5587526559829712, + "learning_rate": 0.0005923043798586812, + "loss": 3.3619, + "step": 4423 + }, + { + "epoch": 0.22, + "grad_norm": 0.5967184901237488, + "learning_rate": 0.0005923009146088724, + "loss": 3.3487, + "step": 4424 + }, + { + "epoch": 0.22, + "grad_norm": 0.5900785326957703, + "learning_rate": 0.0005922974485891991, + "loss": 3.4949, + "step": 4425 + }, + { + "epoch": 0.22, + "grad_norm": 0.5535380840301514, + "learning_rate": 0.0005922939817996706, + "loss": 3.5115, + "step": 4426 + }, + { + "epoch": 0.22, + "grad_norm": 0.5610527992248535, + "learning_rate": 0.000592290514240296, + "loss": 3.4378, + "step": 4427 + }, + { + "epoch": 0.22, + "grad_norm": 0.5495424270629883, + "learning_rate": 0.0005922870459110844, + "loss": 3.2887, + "step": 4428 + }, + { + "epoch": 0.22, + "grad_norm": 0.5786449313163757, + "learning_rate": 0.000592283576812045, + "loss": 3.4501, + "step": 4429 + }, + { + "epoch": 0.22, + "grad_norm": 0.5528695583343506, + "learning_rate": 0.000592280106943187, + "loss": 3.58, + "step": 4430 + }, + { + "epoch": 0.22, + "grad_norm": 0.6839733719825745, + "learning_rate": 0.0005922766363045195, + "loss": 3.3403, + "step": 4431 + }, + { + "epoch": 0.22, + "grad_norm": 0.568471372127533, + "learning_rate": 0.0005922731648960514, + "loss": 3.4483, + "step": 4432 + }, + { + "epoch": 0.22, + "grad_norm": 0.5260776877403259, + "learning_rate": 0.0005922696927177921, + "loss": 3.7186, + "step": 4433 + }, + { + "epoch": 0.22, + "grad_norm": 0.5697051882743835, + "learning_rate": 0.0005922662197697507, + "loss": 3.3252, + "step": 4434 + }, + { + "epoch": 0.22, + "grad_norm": 0.5838139057159424, + "learning_rate": 0.0005922627460519363, + "loss": 3.5432, + "step": 4435 + }, + { + "epoch": 0.22, + "grad_norm": 0.5762350559234619, + "learning_rate": 0.0005922592715643582, + "loss": 3.4488, + "step": 4436 + }, + { + "epoch": 0.22, + "grad_norm": 0.5904020071029663, + "learning_rate": 0.0005922557963070252, + "loss": 3.5981, + "step": 4437 + }, + { + "epoch": 0.22, + "grad_norm": 0.5588435530662537, + "learning_rate": 0.0005922523202799468, + "loss": 3.6139, + "step": 4438 + }, + { + "epoch": 0.22, + "grad_norm": 0.5399730205535889, + "learning_rate": 0.000592248843483132, + "loss": 3.3933, + "step": 4439 + }, + { + "epoch": 0.22, + "grad_norm": 0.57188481092453, + "learning_rate": 0.0005922453659165901, + "loss": 3.5267, + "step": 4440 + }, + { + "epoch": 0.22, + "grad_norm": 0.5735987424850464, + "learning_rate": 0.0005922418875803301, + "loss": 3.4599, + "step": 4441 + }, + { + "epoch": 0.22, + "grad_norm": 0.5924454927444458, + "learning_rate": 0.0005922384084743611, + "loss": 3.754, + "step": 4442 + }, + { + "epoch": 0.22, + "grad_norm": 0.5900004506111145, + "learning_rate": 0.0005922349285986925, + "loss": 3.3268, + "step": 4443 + }, + { + "epoch": 0.22, + "grad_norm": 0.5811010003089905, + "learning_rate": 0.0005922314479533333, + "loss": 3.3628, + "step": 4444 + }, + { + "epoch": 0.22, + "grad_norm": 0.5504472255706787, + "learning_rate": 0.0005922279665382927, + "loss": 3.5839, + "step": 4445 + }, + { + "epoch": 0.22, + "grad_norm": 0.5668983459472656, + "learning_rate": 0.0005922244843535798, + "loss": 3.6179, + "step": 4446 + }, + { + "epoch": 0.22, + "grad_norm": 0.5283928513526917, + "learning_rate": 0.000592221001399204, + "loss": 3.3998, + "step": 4447 + }, + { + "epoch": 0.22, + "grad_norm": 0.5632615685462952, + "learning_rate": 0.0005922175176751742, + "loss": 3.3972, + "step": 4448 + }, + { + "epoch": 0.22, + "grad_norm": 0.5324033498764038, + "learning_rate": 0.0005922140331814997, + "loss": 3.4857, + "step": 4449 + }, + { + "epoch": 0.22, + "grad_norm": 0.5987648963928223, + "learning_rate": 0.0005922105479181898, + "loss": 3.5155, + "step": 4450 + }, + { + "epoch": 0.22, + "grad_norm": 0.5266212821006775, + "learning_rate": 0.0005922070618852535, + "loss": 3.3693, + "step": 4451 + }, + { + "epoch": 0.22, + "grad_norm": 0.5755586624145508, + "learning_rate": 0.0005922035750827, + "loss": 3.5271, + "step": 4452 + }, + { + "epoch": 0.22, + "grad_norm": 0.6109917163848877, + "learning_rate": 0.0005922000875105385, + "loss": 3.3581, + "step": 4453 + }, + { + "epoch": 0.22, + "grad_norm": 0.5387493968009949, + "learning_rate": 0.0005921965991687783, + "loss": 3.539, + "step": 4454 + }, + { + "epoch": 0.22, + "grad_norm": 0.5238696336746216, + "learning_rate": 0.0005921931100574284, + "loss": 3.6753, + "step": 4455 + }, + { + "epoch": 0.22, + "grad_norm": 0.583580493927002, + "learning_rate": 0.0005921896201764981, + "loss": 3.4583, + "step": 4456 + }, + { + "epoch": 0.22, + "grad_norm": 0.5637732148170471, + "learning_rate": 0.0005921861295259966, + "loss": 3.689, + "step": 4457 + }, + { + "epoch": 0.22, + "grad_norm": 0.5128093957901001, + "learning_rate": 0.000592182638105933, + "loss": 3.5987, + "step": 4458 + }, + { + "epoch": 0.22, + "grad_norm": 0.6136223673820496, + "learning_rate": 0.0005921791459163167, + "loss": 3.4421, + "step": 4459 + }, + { + "epoch": 0.22, + "grad_norm": 0.573216438293457, + "learning_rate": 0.0005921756529571566, + "loss": 3.3476, + "step": 4460 + }, + { + "epoch": 0.22, + "grad_norm": 0.5862829685211182, + "learning_rate": 0.0005921721592284621, + "loss": 3.5071, + "step": 4461 + }, + { + "epoch": 0.22, + "grad_norm": 0.5221441984176636, + "learning_rate": 0.0005921686647302424, + "loss": 3.4102, + "step": 4462 + }, + { + "epoch": 0.22, + "grad_norm": 0.6147505640983582, + "learning_rate": 0.0005921651694625067, + "loss": 3.5414, + "step": 4463 + }, + { + "epoch": 0.22, + "grad_norm": 0.5590131282806396, + "learning_rate": 0.0005921616734252641, + "loss": 3.4173, + "step": 4464 + }, + { + "epoch": 0.22, + "grad_norm": 0.5548772811889648, + "learning_rate": 0.0005921581766185239, + "loss": 3.3825, + "step": 4465 + }, + { + "epoch": 0.22, + "grad_norm": 0.6083531975746155, + "learning_rate": 0.0005921546790422953, + "loss": 3.3853, + "step": 4466 + }, + { + "epoch": 0.22, + "grad_norm": 0.5427078604698181, + "learning_rate": 0.0005921511806965875, + "loss": 3.4407, + "step": 4467 + }, + { + "epoch": 0.22, + "grad_norm": 0.5490184426307678, + "learning_rate": 0.0005921476815814098, + "loss": 3.521, + "step": 4468 + }, + { + "epoch": 0.22, + "grad_norm": 0.5580472350120544, + "learning_rate": 0.0005921441816967711, + "loss": 3.4545, + "step": 4469 + }, + { + "epoch": 0.22, + "grad_norm": 0.5435226559638977, + "learning_rate": 0.000592140681042681, + "loss": 3.5815, + "step": 4470 + }, + { + "epoch": 0.22, + "grad_norm": 0.5188127160072327, + "learning_rate": 0.0005921371796191486, + "loss": 3.5472, + "step": 4471 + }, + { + "epoch": 0.22, + "grad_norm": 0.5608434081077576, + "learning_rate": 0.0005921336774261831, + "loss": 3.374, + "step": 4472 + }, + { + "epoch": 0.22, + "grad_norm": 0.5581082105636597, + "learning_rate": 0.0005921301744637937, + "loss": 3.5685, + "step": 4473 + }, + { + "epoch": 0.22, + "grad_norm": 0.5264663696289062, + "learning_rate": 0.0005921266707319896, + "loss": 3.5169, + "step": 4474 + }, + { + "epoch": 0.22, + "grad_norm": 0.5787158608436584, + "learning_rate": 0.0005921231662307801, + "loss": 3.4266, + "step": 4475 + }, + { + "epoch": 0.22, + "grad_norm": 0.5512024164199829, + "learning_rate": 0.0005921196609601744, + "loss": 3.6085, + "step": 4476 + }, + { + "epoch": 0.22, + "grad_norm": 0.559208333492279, + "learning_rate": 0.0005921161549201818, + "loss": 3.3556, + "step": 4477 + }, + { + "epoch": 0.22, + "grad_norm": 0.5857759118080139, + "learning_rate": 0.0005921126481108112, + "loss": 3.5417, + "step": 4478 + }, + { + "epoch": 0.22, + "grad_norm": 0.5337649583816528, + "learning_rate": 0.0005921091405320724, + "loss": 3.4392, + "step": 4479 + }, + { + "epoch": 0.22, + "grad_norm": 0.5375748872756958, + "learning_rate": 0.0005921056321839742, + "loss": 3.4743, + "step": 4480 + }, + { + "epoch": 0.22, + "grad_norm": 0.5559386610984802, + "learning_rate": 0.0005921021230665261, + "loss": 3.4687, + "step": 4481 + }, + { + "epoch": 0.22, + "grad_norm": 0.5421441793441772, + "learning_rate": 0.000592098613179737, + "loss": 3.3652, + "step": 4482 + }, + { + "epoch": 0.22, + "grad_norm": 0.5195907354354858, + "learning_rate": 0.0005920951025236166, + "loss": 3.6266, + "step": 4483 + }, + { + "epoch": 0.22, + "grad_norm": 0.600489616394043, + "learning_rate": 0.0005920915910981739, + "loss": 3.3397, + "step": 4484 + }, + { + "epoch": 0.22, + "grad_norm": 0.5409472584724426, + "learning_rate": 0.000592088078903418, + "loss": 3.2999, + "step": 4485 + }, + { + "epoch": 0.22, + "grad_norm": 0.522708535194397, + "learning_rate": 0.0005920845659393584, + "loss": 3.3678, + "step": 4486 + }, + { + "epoch": 0.22, + "grad_norm": 0.53269362449646, + "learning_rate": 0.0005920810522060042, + "loss": 3.5437, + "step": 4487 + }, + { + "epoch": 0.22, + "grad_norm": 0.5417482256889343, + "learning_rate": 0.0005920775377033648, + "loss": 3.5003, + "step": 4488 + }, + { + "epoch": 0.22, + "grad_norm": 0.5512046813964844, + "learning_rate": 0.0005920740224314494, + "loss": 3.3525, + "step": 4489 + }, + { + "epoch": 0.22, + "grad_norm": 0.5806989073753357, + "learning_rate": 0.0005920705063902672, + "loss": 3.4915, + "step": 4490 + }, + { + "epoch": 0.22, + "grad_norm": 0.5602362155914307, + "learning_rate": 0.0005920669895798275, + "loss": 3.5072, + "step": 4491 + }, + { + "epoch": 0.22, + "grad_norm": 0.5372531414031982, + "learning_rate": 0.0005920634720001396, + "loss": 3.3132, + "step": 4492 + }, + { + "epoch": 0.22, + "grad_norm": 0.6997660994529724, + "learning_rate": 0.0005920599536512126, + "loss": 3.2909, + "step": 4493 + }, + { + "epoch": 0.22, + "grad_norm": 0.5167723894119263, + "learning_rate": 0.000592056434533056, + "loss": 3.498, + "step": 4494 + }, + { + "epoch": 0.22, + "grad_norm": 0.5544456243515015, + "learning_rate": 0.0005920529146456789, + "loss": 3.393, + "step": 4495 + }, + { + "epoch": 0.22, + "grad_norm": 0.5869852304458618, + "learning_rate": 0.0005920493939890907, + "loss": 3.5451, + "step": 4496 + }, + { + "epoch": 0.22, + "grad_norm": 0.5493007898330688, + "learning_rate": 0.0005920458725633005, + "loss": 3.4858, + "step": 4497 + }, + { + "epoch": 0.22, + "grad_norm": 0.5228421092033386, + "learning_rate": 0.0005920423503683178, + "loss": 3.5755, + "step": 4498 + }, + { + "epoch": 0.22, + "grad_norm": 0.5734871029853821, + "learning_rate": 0.0005920388274041516, + "loss": 3.4937, + "step": 4499 + }, + { + "epoch": 0.22, + "grad_norm": 0.6023586392402649, + "learning_rate": 0.0005920353036708115, + "loss": 3.3827, + "step": 4500 + }, + { + "epoch": 0.22, + "grad_norm": 0.5095198750495911, + "learning_rate": 0.0005920317791683065, + "loss": 3.4317, + "step": 4501 + }, + { + "epoch": 0.22, + "grad_norm": 0.5262815356254578, + "learning_rate": 0.0005920282538966461, + "loss": 3.2033, + "step": 4502 + }, + { + "epoch": 0.22, + "grad_norm": 0.5463591814041138, + "learning_rate": 0.0005920247278558394, + "loss": 3.4123, + "step": 4503 + }, + { + "epoch": 0.22, + "grad_norm": 0.5262253880500793, + "learning_rate": 0.0005920212010458957, + "loss": 3.5664, + "step": 4504 + }, + { + "epoch": 0.22, + "grad_norm": 0.5638479590415955, + "learning_rate": 0.0005920176734668244, + "loss": 3.3824, + "step": 4505 + }, + { + "epoch": 0.22, + "grad_norm": 0.5075425505638123, + "learning_rate": 0.0005920141451186349, + "loss": 3.5525, + "step": 4506 + }, + { + "epoch": 0.22, + "grad_norm": 0.5334774255752563, + "learning_rate": 0.0005920106160013362, + "loss": 3.4679, + "step": 4507 + }, + { + "epoch": 0.22, + "grad_norm": 0.5502597689628601, + "learning_rate": 0.0005920070861149377, + "loss": 3.5242, + "step": 4508 + }, + { + "epoch": 0.22, + "grad_norm": 0.5628333687782288, + "learning_rate": 0.0005920035554594489, + "loss": 3.4435, + "step": 4509 + }, + { + "epoch": 0.22, + "grad_norm": 0.5412539839744568, + "learning_rate": 0.0005920000240348788, + "loss": 3.5723, + "step": 4510 + }, + { + "epoch": 0.22, + "grad_norm": 0.5367388725280762, + "learning_rate": 0.0005919964918412368, + "loss": 3.3911, + "step": 4511 + }, + { + "epoch": 0.22, + "grad_norm": 0.5794838070869446, + "learning_rate": 0.0005919929588785323, + "loss": 3.4213, + "step": 4512 + }, + { + "epoch": 0.22, + "grad_norm": 0.5445544123649597, + "learning_rate": 0.0005919894251467746, + "loss": 3.4142, + "step": 4513 + }, + { + "epoch": 0.22, + "grad_norm": 0.5294917225837708, + "learning_rate": 0.0005919858906459728, + "loss": 3.5728, + "step": 4514 + }, + { + "epoch": 0.22, + "grad_norm": 0.5747779607772827, + "learning_rate": 0.0005919823553761365, + "loss": 3.7209, + "step": 4515 + }, + { + "epoch": 0.22, + "grad_norm": 0.5731784105300903, + "learning_rate": 0.0005919788193372748, + "loss": 3.3981, + "step": 4516 + }, + { + "epoch": 0.22, + "grad_norm": 0.5659194588661194, + "learning_rate": 0.000591975282529397, + "loss": 3.4661, + "step": 4517 + }, + { + "epoch": 0.22, + "grad_norm": 0.6080614924430847, + "learning_rate": 0.0005919717449525127, + "loss": 3.448, + "step": 4518 + }, + { + "epoch": 0.22, + "grad_norm": 0.5205955505371094, + "learning_rate": 0.0005919682066066309, + "loss": 3.7119, + "step": 4519 + }, + { + "epoch": 0.22, + "grad_norm": 0.6132055521011353, + "learning_rate": 0.000591964667491761, + "loss": 3.5377, + "step": 4520 + }, + { + "epoch": 0.22, + "grad_norm": 0.5384352207183838, + "learning_rate": 0.0005919611276079124, + "loss": 3.5972, + "step": 4521 + }, + { + "epoch": 0.22, + "grad_norm": 0.5405988693237305, + "learning_rate": 0.0005919575869550944, + "loss": 3.4752, + "step": 4522 + }, + { + "epoch": 0.22, + "grad_norm": 0.5446063280105591, + "learning_rate": 0.0005919540455333163, + "loss": 3.5515, + "step": 4523 + }, + { + "epoch": 0.22, + "grad_norm": 0.6086515188217163, + "learning_rate": 0.0005919505033425875, + "loss": 3.5652, + "step": 4524 + }, + { + "epoch": 0.22, + "grad_norm": 0.5298771262168884, + "learning_rate": 0.0005919469603829171, + "loss": 3.3091, + "step": 4525 + }, + { + "epoch": 0.22, + "grad_norm": 0.5765856504440308, + "learning_rate": 0.0005919434166543146, + "loss": 3.3966, + "step": 4526 + }, + { + "epoch": 0.22, + "grad_norm": 0.569276750087738, + "learning_rate": 0.0005919398721567895, + "loss": 3.454, + "step": 4527 + }, + { + "epoch": 0.22, + "grad_norm": 0.568664014339447, + "learning_rate": 0.0005919363268903508, + "loss": 3.6629, + "step": 4528 + }, + { + "epoch": 0.22, + "grad_norm": 0.569466233253479, + "learning_rate": 0.000591932780855008, + "loss": 3.4888, + "step": 4529 + }, + { + "epoch": 0.22, + "grad_norm": 0.5888025760650635, + "learning_rate": 0.0005919292340507706, + "loss": 3.3833, + "step": 4530 + }, + { + "epoch": 0.22, + "grad_norm": 0.6375117897987366, + "learning_rate": 0.0005919256864776476, + "loss": 3.7058, + "step": 4531 + }, + { + "epoch": 0.22, + "grad_norm": 0.5959801077842712, + "learning_rate": 0.0005919221381356486, + "loss": 3.3804, + "step": 4532 + }, + { + "epoch": 0.22, + "grad_norm": 0.5718854069709778, + "learning_rate": 0.0005919185890247828, + "loss": 3.48, + "step": 4533 + }, + { + "epoch": 0.22, + "grad_norm": 0.5459699630737305, + "learning_rate": 0.0005919150391450597, + "loss": 3.4685, + "step": 4534 + }, + { + "epoch": 0.22, + "grad_norm": 0.5691114068031311, + "learning_rate": 0.0005919114884964886, + "loss": 3.4821, + "step": 4535 + }, + { + "epoch": 0.22, + "grad_norm": 0.5469587445259094, + "learning_rate": 0.0005919079370790789, + "loss": 3.4524, + "step": 4536 + }, + { + "epoch": 0.22, + "grad_norm": 0.5794917941093445, + "learning_rate": 0.0005919043848928397, + "loss": 3.5051, + "step": 4537 + }, + { + "epoch": 0.22, + "grad_norm": 0.5750076770782471, + "learning_rate": 0.0005919008319377805, + "loss": 3.3637, + "step": 4538 + }, + { + "epoch": 0.22, + "grad_norm": 0.5511584281921387, + "learning_rate": 0.0005918972782139108, + "loss": 3.3772, + "step": 4539 + }, + { + "epoch": 0.22, + "grad_norm": 0.5298534631729126, + "learning_rate": 0.0005918937237212397, + "loss": 3.6845, + "step": 4540 + }, + { + "epoch": 0.22, + "grad_norm": 0.5647959113121033, + "learning_rate": 0.0005918901684597769, + "loss": 3.2087, + "step": 4541 + }, + { + "epoch": 0.22, + "grad_norm": 0.5687061548233032, + "learning_rate": 0.0005918866124295315, + "loss": 3.4084, + "step": 4542 + }, + { + "epoch": 0.22, + "grad_norm": 0.5853356122970581, + "learning_rate": 0.000591883055630513, + "loss": 3.4805, + "step": 4543 + }, + { + "epoch": 0.22, + "grad_norm": 0.5594838261604309, + "learning_rate": 0.0005918794980627305, + "loss": 3.728, + "step": 4544 + }, + { + "epoch": 0.22, + "grad_norm": 0.5661375522613525, + "learning_rate": 0.0005918759397261936, + "loss": 3.4592, + "step": 4545 + }, + { + "epoch": 0.22, + "grad_norm": 0.5649911165237427, + "learning_rate": 0.0005918723806209119, + "loss": 3.4811, + "step": 4546 + }, + { + "epoch": 0.22, + "grad_norm": 0.568805456161499, + "learning_rate": 0.0005918688207468942, + "loss": 3.3881, + "step": 4547 + }, + { + "epoch": 0.22, + "grad_norm": 0.5687074661254883, + "learning_rate": 0.0005918652601041505, + "loss": 3.4324, + "step": 4548 + }, + { + "epoch": 0.22, + "grad_norm": 0.5697916150093079, + "learning_rate": 0.0005918616986926898, + "loss": 3.5119, + "step": 4549 + }, + { + "epoch": 0.22, + "grad_norm": 0.5741244554519653, + "learning_rate": 0.0005918581365125215, + "loss": 3.5548, + "step": 4550 + }, + { + "epoch": 0.22, + "grad_norm": 0.5744023323059082, + "learning_rate": 0.000591854573563655, + "loss": 3.5327, + "step": 4551 + }, + { + "epoch": 0.22, + "grad_norm": 0.5800146460533142, + "learning_rate": 0.0005918510098460997, + "loss": 3.4031, + "step": 4552 + }, + { + "epoch": 0.22, + "grad_norm": 0.5622119307518005, + "learning_rate": 0.000591847445359865, + "loss": 3.4033, + "step": 4553 + }, + { + "epoch": 0.22, + "grad_norm": 0.5262593626976013, + "learning_rate": 0.0005918438801049605, + "loss": 3.2841, + "step": 4554 + }, + { + "epoch": 0.22, + "grad_norm": 0.5564959645271301, + "learning_rate": 0.0005918403140813952, + "loss": 3.6266, + "step": 4555 + }, + { + "epoch": 0.22, + "grad_norm": 0.574227511882782, + "learning_rate": 0.0005918367472891787, + "loss": 3.3961, + "step": 4556 + }, + { + "epoch": 0.22, + "grad_norm": 0.563045084476471, + "learning_rate": 0.0005918331797283204, + "loss": 3.6054, + "step": 4557 + }, + { + "epoch": 0.22, + "grad_norm": 0.5304876565933228, + "learning_rate": 0.0005918296113988297, + "loss": 3.3966, + "step": 4558 + }, + { + "epoch": 0.22, + "grad_norm": 0.5531924962997437, + "learning_rate": 0.0005918260423007159, + "loss": 3.2937, + "step": 4559 + }, + { + "epoch": 0.22, + "grad_norm": 0.5169435739517212, + "learning_rate": 0.0005918224724339885, + "loss": 3.5655, + "step": 4560 + }, + { + "epoch": 0.22, + "grad_norm": 0.6689804196357727, + "learning_rate": 0.0005918189017986569, + "loss": 3.6632, + "step": 4561 + }, + { + "epoch": 0.22, + "grad_norm": 0.5568976998329163, + "learning_rate": 0.0005918153303947304, + "loss": 3.6612, + "step": 4562 + }, + { + "epoch": 0.22, + "grad_norm": 0.5638584494590759, + "learning_rate": 0.0005918117582222185, + "loss": 3.4756, + "step": 4563 + }, + { + "epoch": 0.22, + "grad_norm": 0.5415082573890686, + "learning_rate": 0.0005918081852811307, + "loss": 3.5922, + "step": 4564 + }, + { + "epoch": 0.22, + "grad_norm": 0.5457674264907837, + "learning_rate": 0.0005918046115714762, + "loss": 3.6223, + "step": 4565 + }, + { + "epoch": 0.22, + "grad_norm": 0.5880576968193054, + "learning_rate": 0.0005918010370932645, + "loss": 3.3749, + "step": 4566 + }, + { + "epoch": 0.22, + "grad_norm": 0.572864830493927, + "learning_rate": 0.0005917974618465051, + "loss": 3.428, + "step": 4567 + }, + { + "epoch": 0.22, + "grad_norm": 0.5527423024177551, + "learning_rate": 0.0005917938858312073, + "loss": 3.3815, + "step": 4568 + }, + { + "epoch": 0.22, + "grad_norm": 0.6350066661834717, + "learning_rate": 0.0005917903090473806, + "loss": 3.1924, + "step": 4569 + }, + { + "epoch": 0.22, + "grad_norm": 0.5983766913414001, + "learning_rate": 0.0005917867314950343, + "loss": 3.4771, + "step": 4570 + }, + { + "epoch": 0.22, + "grad_norm": 0.5613915324211121, + "learning_rate": 0.0005917831531741779, + "loss": 3.4837, + "step": 4571 + }, + { + "epoch": 0.22, + "grad_norm": 0.5356848835945129, + "learning_rate": 0.000591779574084821, + "loss": 3.5368, + "step": 4572 + }, + { + "epoch": 0.22, + "grad_norm": 0.5657932162284851, + "learning_rate": 0.0005917759942269727, + "loss": 3.3458, + "step": 4573 + }, + { + "epoch": 0.22, + "grad_norm": 0.5904025435447693, + "learning_rate": 0.0005917724136006425, + "loss": 3.3554, + "step": 4574 + }, + { + "epoch": 0.22, + "grad_norm": 0.5392588376998901, + "learning_rate": 0.00059176883220584, + "loss": 3.4159, + "step": 4575 + }, + { + "epoch": 0.22, + "grad_norm": 0.5528022646903992, + "learning_rate": 0.0005917652500425747, + "loss": 3.5421, + "step": 4576 + }, + { + "epoch": 0.22, + "grad_norm": 0.5817191004753113, + "learning_rate": 0.0005917616671108557, + "loss": 3.4776, + "step": 4577 + }, + { + "epoch": 0.22, + "grad_norm": 0.5895118713378906, + "learning_rate": 0.0005917580834106927, + "loss": 3.4579, + "step": 4578 + }, + { + "epoch": 0.22, + "grad_norm": 0.5503705143928528, + "learning_rate": 0.0005917544989420951, + "loss": 3.4782, + "step": 4579 + }, + { + "epoch": 0.22, + "grad_norm": 0.5728640556335449, + "learning_rate": 0.0005917509137050723, + "loss": 3.2202, + "step": 4580 + }, + { + "epoch": 0.22, + "grad_norm": 0.5362029671669006, + "learning_rate": 0.0005917473276996336, + "loss": 3.4599, + "step": 4581 + }, + { + "epoch": 0.22, + "grad_norm": 0.5472753643989563, + "learning_rate": 0.0005917437409257886, + "loss": 3.5085, + "step": 4582 + }, + { + "epoch": 0.22, + "grad_norm": 0.575137197971344, + "learning_rate": 0.0005917401533835469, + "loss": 3.4713, + "step": 4583 + }, + { + "epoch": 0.22, + "grad_norm": 0.5501198768615723, + "learning_rate": 0.0005917365650729176, + "loss": 3.3051, + "step": 4584 + }, + { + "epoch": 0.22, + "grad_norm": 0.5528760552406311, + "learning_rate": 0.0005917329759939105, + "loss": 3.4805, + "step": 4585 + }, + { + "epoch": 0.22, + "grad_norm": 0.5414803624153137, + "learning_rate": 0.0005917293861465347, + "loss": 3.577, + "step": 4586 + }, + { + "epoch": 0.22, + "grad_norm": 0.5812093615531921, + "learning_rate": 0.0005917257955308, + "loss": 3.5206, + "step": 4587 + }, + { + "epoch": 0.22, + "grad_norm": 0.6059795022010803, + "learning_rate": 0.0005917222041467156, + "loss": 3.4236, + "step": 4588 + }, + { + "epoch": 0.22, + "grad_norm": 0.5378461480140686, + "learning_rate": 0.0005917186119942912, + "loss": 3.4233, + "step": 4589 + }, + { + "epoch": 0.22, + "grad_norm": 0.5715885758399963, + "learning_rate": 0.000591715019073536, + "loss": 3.3245, + "step": 4590 + }, + { + "epoch": 0.22, + "grad_norm": 0.5676418542861938, + "learning_rate": 0.0005917114253844596, + "loss": 3.1704, + "step": 4591 + }, + { + "epoch": 0.23, + "grad_norm": 0.5657680034637451, + "learning_rate": 0.0005917078309270714, + "loss": 3.3586, + "step": 4592 + }, + { + "epoch": 0.23, + "grad_norm": 0.5191043019294739, + "learning_rate": 0.000591704235701381, + "loss": 3.3325, + "step": 4593 + }, + { + "epoch": 0.23, + "grad_norm": 0.567353367805481, + "learning_rate": 0.0005917006397073977, + "loss": 3.6905, + "step": 4594 + }, + { + "epoch": 0.23, + "grad_norm": 0.5378613471984863, + "learning_rate": 0.0005916970429451311, + "loss": 3.7788, + "step": 4595 + }, + { + "epoch": 0.23, + "grad_norm": 0.548173725605011, + "learning_rate": 0.0005916934454145906, + "loss": 3.3993, + "step": 4596 + }, + { + "epoch": 0.23, + "grad_norm": 0.5689377188682556, + "learning_rate": 0.0005916898471157858, + "loss": 3.5089, + "step": 4597 + }, + { + "epoch": 0.23, + "grad_norm": 0.5914126634597778, + "learning_rate": 0.000591686248048726, + "loss": 3.4533, + "step": 4598 + }, + { + "epoch": 0.23, + "grad_norm": 0.533582866191864, + "learning_rate": 0.0005916826482134207, + "loss": 3.5641, + "step": 4599 + }, + { + "epoch": 0.23, + "grad_norm": 0.5263054370880127, + "learning_rate": 0.0005916790476098796, + "loss": 3.397, + "step": 4600 + }, + { + "epoch": 0.23, + "grad_norm": 0.5878801941871643, + "learning_rate": 0.000591675446238112, + "loss": 3.4843, + "step": 4601 + }, + { + "epoch": 0.23, + "grad_norm": 0.7907947301864624, + "learning_rate": 0.0005916718440981272, + "loss": 3.5027, + "step": 4602 + }, + { + "epoch": 0.23, + "grad_norm": 0.5525874495506287, + "learning_rate": 0.000591668241189935, + "loss": 3.3515, + "step": 4603 + }, + { + "epoch": 0.23, + "grad_norm": 0.5373077988624573, + "learning_rate": 0.0005916646375135449, + "loss": 3.2774, + "step": 4604 + }, + { + "epoch": 0.23, + "grad_norm": 0.5783985257148743, + "learning_rate": 0.0005916610330689661, + "loss": 3.3954, + "step": 4605 + }, + { + "epoch": 0.23, + "grad_norm": 0.5470697283744812, + "learning_rate": 0.0005916574278562085, + "loss": 3.3739, + "step": 4606 + }, + { + "epoch": 0.23, + "grad_norm": 0.5767296552658081, + "learning_rate": 0.0005916538218752812, + "loss": 3.3787, + "step": 4607 + }, + { + "epoch": 0.23, + "grad_norm": 0.5586828589439392, + "learning_rate": 0.000591650215126194, + "loss": 3.4687, + "step": 4608 + }, + { + "epoch": 0.23, + "grad_norm": 0.5796303153038025, + "learning_rate": 0.000591646607608956, + "loss": 3.7055, + "step": 4609 + }, + { + "epoch": 0.23, + "grad_norm": 0.5355789065361023, + "learning_rate": 0.0005916429993235772, + "loss": 3.5942, + "step": 4610 + }, + { + "epoch": 0.23, + "grad_norm": 0.5896856188774109, + "learning_rate": 0.0005916393902700667, + "loss": 3.4948, + "step": 4611 + }, + { + "epoch": 0.23, + "grad_norm": 0.5608619451522827, + "learning_rate": 0.0005916357804484344, + "loss": 3.5695, + "step": 4612 + }, + { + "epoch": 0.23, + "grad_norm": 0.5785589218139648, + "learning_rate": 0.0005916321698586894, + "loss": 3.5872, + "step": 4613 + }, + { + "epoch": 0.23, + "grad_norm": 0.5732633471488953, + "learning_rate": 0.0005916285585008415, + "loss": 3.3187, + "step": 4614 + }, + { + "epoch": 0.23, + "grad_norm": 0.5910802483558655, + "learning_rate": 0.0005916249463748999, + "loss": 3.4223, + "step": 4615 + }, + { + "epoch": 0.23, + "grad_norm": 0.5657915472984314, + "learning_rate": 0.0005916213334808745, + "loss": 3.5133, + "step": 4616 + }, + { + "epoch": 0.23, + "grad_norm": 0.522301435470581, + "learning_rate": 0.0005916177198187746, + "loss": 3.4913, + "step": 4617 + }, + { + "epoch": 0.23, + "grad_norm": 0.5693953037261963, + "learning_rate": 0.0005916141053886097, + "loss": 3.3059, + "step": 4618 + }, + { + "epoch": 0.23, + "grad_norm": 0.5592297911643982, + "learning_rate": 0.0005916104901903894, + "loss": 3.3731, + "step": 4619 + }, + { + "epoch": 0.23, + "grad_norm": 0.5223372578620911, + "learning_rate": 0.0005916068742241232, + "loss": 3.1434, + "step": 4620 + }, + { + "epoch": 0.23, + "grad_norm": 0.5213208794593811, + "learning_rate": 0.0005916032574898206, + "loss": 3.3789, + "step": 4621 + }, + { + "epoch": 0.23, + "grad_norm": 0.5590026378631592, + "learning_rate": 0.0005915996399874911, + "loss": 3.5919, + "step": 4622 + }, + { + "epoch": 0.23, + "grad_norm": 0.5489098429679871, + "learning_rate": 0.0005915960217171444, + "loss": 3.3545, + "step": 4623 + }, + { + "epoch": 0.23, + "grad_norm": 0.5755786299705505, + "learning_rate": 0.0005915924026787898, + "loss": 3.5858, + "step": 4624 + }, + { + "epoch": 0.23, + "grad_norm": 0.5431007742881775, + "learning_rate": 0.0005915887828724369, + "loss": 3.5246, + "step": 4625 + }, + { + "epoch": 0.23, + "grad_norm": 0.5259215235710144, + "learning_rate": 0.0005915851622980954, + "loss": 3.5859, + "step": 4626 + }, + { + "epoch": 0.23, + "grad_norm": 0.5726738572120667, + "learning_rate": 0.0005915815409557745, + "loss": 3.7115, + "step": 4627 + }, + { + "epoch": 0.23, + "grad_norm": 0.5531672239303589, + "learning_rate": 0.000591577918845484, + "loss": 3.4266, + "step": 4628 + }, + { + "epoch": 0.23, + "grad_norm": 0.5267097353935242, + "learning_rate": 0.0005915742959672335, + "loss": 3.5795, + "step": 4629 + }, + { + "epoch": 0.23, + "grad_norm": 0.5696015357971191, + "learning_rate": 0.0005915706723210323, + "loss": 3.2015, + "step": 4630 + }, + { + "epoch": 0.23, + "grad_norm": 0.5455690622329712, + "learning_rate": 0.00059156704790689, + "loss": 3.44, + "step": 4631 + }, + { + "epoch": 0.23, + "grad_norm": 0.6064535975456238, + "learning_rate": 0.0005915634227248163, + "loss": 3.2651, + "step": 4632 + }, + { + "epoch": 0.23, + "grad_norm": 0.5588998794555664, + "learning_rate": 0.0005915597967748207, + "loss": 3.383, + "step": 4633 + }, + { + "epoch": 0.23, + "grad_norm": 0.5298521518707275, + "learning_rate": 0.0005915561700569127, + "loss": 3.5665, + "step": 4634 + }, + { + "epoch": 0.23, + "grad_norm": 0.5695735812187195, + "learning_rate": 0.0005915525425711018, + "loss": 3.4372, + "step": 4635 + }, + { + "epoch": 0.23, + "grad_norm": 0.5828069448471069, + "learning_rate": 0.0005915489143173976, + "loss": 3.4493, + "step": 4636 + }, + { + "epoch": 0.23, + "grad_norm": 0.5570802092552185, + "learning_rate": 0.0005915452852958098, + "loss": 3.6557, + "step": 4637 + }, + { + "epoch": 0.23, + "grad_norm": 0.5548609495162964, + "learning_rate": 0.0005915416555063478, + "loss": 3.5455, + "step": 4638 + }, + { + "epoch": 0.23, + "grad_norm": 0.6037477850914001, + "learning_rate": 0.000591538024949021, + "loss": 3.5569, + "step": 4639 + }, + { + "epoch": 0.23, + "grad_norm": 0.6033467054367065, + "learning_rate": 0.0005915343936238394, + "loss": 3.5211, + "step": 4640 + }, + { + "epoch": 0.23, + "grad_norm": 0.6203020811080933, + "learning_rate": 0.0005915307615308122, + "loss": 3.6181, + "step": 4641 + }, + { + "epoch": 0.23, + "grad_norm": 0.5475682616233826, + "learning_rate": 0.000591527128669949, + "loss": 3.5266, + "step": 4642 + }, + { + "epoch": 0.23, + "grad_norm": 0.5472739934921265, + "learning_rate": 0.0005915234950412596, + "loss": 3.6185, + "step": 4643 + }, + { + "epoch": 0.23, + "grad_norm": 0.5932005047798157, + "learning_rate": 0.0005915198606447533, + "loss": 3.3851, + "step": 4644 + }, + { + "epoch": 0.23, + "grad_norm": 0.5578650832176208, + "learning_rate": 0.0005915162254804398, + "loss": 3.4982, + "step": 4645 + }, + { + "epoch": 0.23, + "grad_norm": 0.5666883587837219, + "learning_rate": 0.0005915125895483288, + "loss": 3.3541, + "step": 4646 + }, + { + "epoch": 0.23, + "grad_norm": 0.6000686883926392, + "learning_rate": 0.0005915089528484295, + "loss": 3.376, + "step": 4647 + }, + { + "epoch": 0.23, + "grad_norm": 0.593292772769928, + "learning_rate": 0.0005915053153807519, + "loss": 3.5401, + "step": 4648 + }, + { + "epoch": 0.23, + "grad_norm": 0.5368260145187378, + "learning_rate": 0.0005915016771453054, + "loss": 3.6079, + "step": 4649 + }, + { + "epoch": 0.23, + "grad_norm": 0.6727637648582458, + "learning_rate": 0.0005914980381420996, + "loss": 3.4033, + "step": 4650 + }, + { + "epoch": 0.23, + "grad_norm": 0.541647732257843, + "learning_rate": 0.0005914943983711439, + "loss": 3.3189, + "step": 4651 + }, + { + "epoch": 0.23, + "grad_norm": 0.5538272857666016, + "learning_rate": 0.0005914907578324481, + "loss": 3.4644, + "step": 4652 + }, + { + "epoch": 0.23, + "grad_norm": 0.5518693923950195, + "learning_rate": 0.0005914871165260219, + "loss": 3.6281, + "step": 4653 + }, + { + "epoch": 0.23, + "grad_norm": 0.5680675506591797, + "learning_rate": 0.0005914834744518747, + "loss": 3.4995, + "step": 4654 + }, + { + "epoch": 0.23, + "grad_norm": 0.5549336671829224, + "learning_rate": 0.000591479831610016, + "loss": 3.264, + "step": 4655 + }, + { + "epoch": 0.23, + "grad_norm": 0.5913617014884949, + "learning_rate": 0.0005914761880004555, + "loss": 3.2961, + "step": 4656 + }, + { + "epoch": 0.23, + "grad_norm": 0.5764253735542297, + "learning_rate": 0.000591472543623203, + "loss": 3.5085, + "step": 4657 + }, + { + "epoch": 0.23, + "grad_norm": 0.5552957057952881, + "learning_rate": 0.0005914688984782677, + "loss": 3.3739, + "step": 4658 + }, + { + "epoch": 0.23, + "grad_norm": 0.5654916763305664, + "learning_rate": 0.0005914652525656596, + "loss": 3.3516, + "step": 4659 + }, + { + "epoch": 0.23, + "grad_norm": 0.5623388886451721, + "learning_rate": 0.0005914616058853881, + "loss": 3.5072, + "step": 4660 + }, + { + "epoch": 0.23, + "grad_norm": 0.5291627645492554, + "learning_rate": 0.0005914579584374627, + "loss": 3.4728, + "step": 4661 + }, + { + "epoch": 0.23, + "grad_norm": 0.5866690874099731, + "learning_rate": 0.0005914543102218933, + "loss": 3.3841, + "step": 4662 + }, + { + "epoch": 0.23, + "grad_norm": 0.5244299173355103, + "learning_rate": 0.0005914506612386891, + "loss": 3.4587, + "step": 4663 + }, + { + "epoch": 0.23, + "grad_norm": 0.5474516153335571, + "learning_rate": 0.0005914470114878602, + "loss": 3.5114, + "step": 4664 + }, + { + "epoch": 0.23, + "grad_norm": 0.56264728307724, + "learning_rate": 0.0005914433609694158, + "loss": 3.5291, + "step": 4665 + }, + { + "epoch": 0.23, + "grad_norm": 0.5429409146308899, + "learning_rate": 0.0005914397096833658, + "loss": 3.7083, + "step": 4666 + }, + { + "epoch": 0.23, + "grad_norm": 0.5317074656486511, + "learning_rate": 0.0005914360576297196, + "loss": 3.4968, + "step": 4667 + }, + { + "epoch": 0.23, + "grad_norm": 0.5686256885528564, + "learning_rate": 0.0005914324048084869, + "loss": 3.4443, + "step": 4668 + }, + { + "epoch": 0.23, + "grad_norm": 0.5332667827606201, + "learning_rate": 0.0005914287512196775, + "loss": 3.525, + "step": 4669 + }, + { + "epoch": 0.23, + "grad_norm": 0.5805628299713135, + "learning_rate": 0.0005914250968633006, + "loss": 3.5423, + "step": 4670 + }, + { + "epoch": 0.23, + "grad_norm": 0.5518739819526672, + "learning_rate": 0.0005914214417393663, + "loss": 3.5086, + "step": 4671 + }, + { + "epoch": 0.23, + "grad_norm": 0.6050487160682678, + "learning_rate": 0.0005914177858478837, + "loss": 3.4918, + "step": 4672 + }, + { + "epoch": 0.23, + "grad_norm": 0.5575416088104248, + "learning_rate": 0.000591414129188863, + "loss": 3.4099, + "step": 4673 + }, + { + "epoch": 0.23, + "grad_norm": 0.5643271803855896, + "learning_rate": 0.0005914104717623136, + "loss": 3.2613, + "step": 4674 + }, + { + "epoch": 0.23, + "grad_norm": 0.5856224298477173, + "learning_rate": 0.000591406813568245, + "loss": 3.3773, + "step": 4675 + }, + { + "epoch": 0.23, + "grad_norm": 0.5721701383590698, + "learning_rate": 0.0005914031546066669, + "loss": 3.3944, + "step": 4676 + }, + { + "epoch": 0.23, + "grad_norm": 0.5789268016815186, + "learning_rate": 0.000591399494877589, + "loss": 3.5411, + "step": 4677 + }, + { + "epoch": 0.23, + "grad_norm": 0.5379511713981628, + "learning_rate": 0.000591395834381021, + "loss": 3.5688, + "step": 4678 + }, + { + "epoch": 0.23, + "grad_norm": 0.5450015068054199, + "learning_rate": 0.0005913921731169724, + "loss": 3.5541, + "step": 4679 + }, + { + "epoch": 0.23, + "grad_norm": 0.5848316550254822, + "learning_rate": 0.0005913885110854529, + "loss": 3.628, + "step": 4680 + }, + { + "epoch": 0.23, + "grad_norm": 0.5671524405479431, + "learning_rate": 0.0005913848482864721, + "loss": 3.629, + "step": 4681 + }, + { + "epoch": 0.23, + "grad_norm": 0.5497124791145325, + "learning_rate": 0.0005913811847200397, + "loss": 3.6733, + "step": 4682 + }, + { + "epoch": 0.23, + "grad_norm": 0.5373255610466003, + "learning_rate": 0.0005913775203861654, + "loss": 3.5201, + "step": 4683 + }, + { + "epoch": 0.23, + "grad_norm": 0.5729827880859375, + "learning_rate": 0.0005913738552848587, + "loss": 3.3868, + "step": 4684 + }, + { + "epoch": 0.23, + "grad_norm": 0.5580928921699524, + "learning_rate": 0.0005913701894161294, + "loss": 3.4114, + "step": 4685 + }, + { + "epoch": 0.23, + "grad_norm": 0.5716227889060974, + "learning_rate": 0.000591366522779987, + "loss": 3.5178, + "step": 4686 + }, + { + "epoch": 0.23, + "grad_norm": 0.539944052696228, + "learning_rate": 0.0005913628553764413, + "loss": 3.4705, + "step": 4687 + }, + { + "epoch": 0.23, + "grad_norm": 0.5653459429740906, + "learning_rate": 0.000591359187205502, + "loss": 3.3099, + "step": 4688 + }, + { + "epoch": 0.23, + "grad_norm": 0.5607462525367737, + "learning_rate": 0.0005913555182671786, + "loss": 3.6437, + "step": 4689 + }, + { + "epoch": 0.23, + "grad_norm": 0.5527281165122986, + "learning_rate": 0.0005913518485614809, + "loss": 3.4185, + "step": 4690 + }, + { + "epoch": 0.23, + "grad_norm": 0.5707682967185974, + "learning_rate": 0.0005913481780884185, + "loss": 3.4184, + "step": 4691 + }, + { + "epoch": 0.23, + "grad_norm": 0.5300978422164917, + "learning_rate": 0.000591344506848001, + "loss": 3.2963, + "step": 4692 + }, + { + "epoch": 0.23, + "grad_norm": 0.553466260433197, + "learning_rate": 0.0005913408348402382, + "loss": 3.3923, + "step": 4693 + }, + { + "epoch": 0.23, + "grad_norm": 0.5744982361793518, + "learning_rate": 0.0005913371620651396, + "loss": 3.3072, + "step": 4694 + }, + { + "epoch": 0.23, + "grad_norm": 0.5825265645980835, + "learning_rate": 0.0005913334885227151, + "loss": 3.3266, + "step": 4695 + }, + { + "epoch": 0.23, + "grad_norm": 0.5940053462982178, + "learning_rate": 0.0005913298142129741, + "loss": 3.2624, + "step": 4696 + }, + { + "epoch": 0.23, + "grad_norm": 0.5737009644508362, + "learning_rate": 0.0005913261391359266, + "loss": 3.2402, + "step": 4697 + }, + { + "epoch": 0.23, + "grad_norm": 0.5772563219070435, + "learning_rate": 0.0005913224632915821, + "loss": 3.541, + "step": 4698 + }, + { + "epoch": 0.23, + "grad_norm": 0.5606215000152588, + "learning_rate": 0.0005913187866799503, + "loss": 3.2515, + "step": 4699 + }, + { + "epoch": 0.23, + "grad_norm": 0.6204817295074463, + "learning_rate": 0.0005913151093010408, + "loss": 3.6065, + "step": 4700 + }, + { + "epoch": 0.23, + "grad_norm": 0.5818078517913818, + "learning_rate": 0.0005913114311548635, + "loss": 3.582, + "step": 4701 + }, + { + "epoch": 0.23, + "grad_norm": 0.556207001209259, + "learning_rate": 0.0005913077522414278, + "loss": 3.4011, + "step": 4702 + }, + { + "epoch": 0.23, + "grad_norm": 0.5517778992652893, + "learning_rate": 0.0005913040725607437, + "loss": 3.362, + "step": 4703 + }, + { + "epoch": 0.23, + "grad_norm": 0.5277376174926758, + "learning_rate": 0.0005913003921128206, + "loss": 3.5635, + "step": 4704 + }, + { + "epoch": 0.23, + "grad_norm": 0.5681208968162537, + "learning_rate": 0.0005912967108976684, + "loss": 3.4729, + "step": 4705 + }, + { + "epoch": 0.23, + "grad_norm": 0.554221510887146, + "learning_rate": 0.0005912930289152967, + "loss": 3.5178, + "step": 4706 + }, + { + "epoch": 0.23, + "grad_norm": 0.5068381428718567, + "learning_rate": 0.0005912893461657152, + "loss": 3.5696, + "step": 4707 + }, + { + "epoch": 0.23, + "grad_norm": 0.5582959055900574, + "learning_rate": 0.0005912856626489337, + "loss": 3.3457, + "step": 4708 + }, + { + "epoch": 0.23, + "grad_norm": 0.5499424338340759, + "learning_rate": 0.0005912819783649617, + "loss": 3.5449, + "step": 4709 + }, + { + "epoch": 0.23, + "grad_norm": 0.5410943627357483, + "learning_rate": 0.0005912782933138091, + "loss": 3.5805, + "step": 4710 + }, + { + "epoch": 0.23, + "grad_norm": 0.5599467754364014, + "learning_rate": 0.0005912746074954856, + "loss": 3.4815, + "step": 4711 + }, + { + "epoch": 0.23, + "grad_norm": 0.6126967072486877, + "learning_rate": 0.0005912709209100007, + "loss": 3.2459, + "step": 4712 + }, + { + "epoch": 0.23, + "grad_norm": 0.5697960257530212, + "learning_rate": 0.0005912672335573643, + "loss": 3.4214, + "step": 4713 + }, + { + "epoch": 0.23, + "grad_norm": 0.5213540196418762, + "learning_rate": 0.0005912635454375861, + "loss": 3.6434, + "step": 4714 + }, + { + "epoch": 0.23, + "grad_norm": 0.667456328868866, + "learning_rate": 0.0005912598565506758, + "loss": 3.4364, + "step": 4715 + }, + { + "epoch": 0.23, + "grad_norm": 0.5707606077194214, + "learning_rate": 0.0005912561668966431, + "loss": 3.4236, + "step": 4716 + }, + { + "epoch": 0.23, + "grad_norm": 0.551876962184906, + "learning_rate": 0.0005912524764754976, + "loss": 3.4234, + "step": 4717 + }, + { + "epoch": 0.23, + "grad_norm": 0.5813961029052734, + "learning_rate": 0.0005912487852872492, + "loss": 3.4069, + "step": 4718 + }, + { + "epoch": 0.23, + "grad_norm": 0.5820927023887634, + "learning_rate": 0.0005912450933319075, + "loss": 3.429, + "step": 4719 + }, + { + "epoch": 0.23, + "grad_norm": 0.6033832430839539, + "learning_rate": 0.0005912414006094824, + "loss": 3.5748, + "step": 4720 + }, + { + "epoch": 0.23, + "grad_norm": 0.5736304521560669, + "learning_rate": 0.0005912377071199834, + "loss": 3.2263, + "step": 4721 + }, + { + "epoch": 0.23, + "grad_norm": 0.555508553981781, + "learning_rate": 0.0005912340128634205, + "loss": 3.3534, + "step": 4722 + }, + { + "epoch": 0.23, + "grad_norm": 0.5986608266830444, + "learning_rate": 0.000591230317839803, + "loss": 3.4823, + "step": 4723 + }, + { + "epoch": 0.23, + "grad_norm": 0.6043596267700195, + "learning_rate": 0.000591226622049141, + "loss": 3.4663, + "step": 4724 + }, + { + "epoch": 0.23, + "grad_norm": 0.540687620639801, + "learning_rate": 0.0005912229254914442, + "loss": 3.3754, + "step": 4725 + }, + { + "epoch": 0.23, + "grad_norm": 0.6195804476737976, + "learning_rate": 0.0005912192281667221, + "loss": 3.39, + "step": 4726 + }, + { + "epoch": 0.23, + "grad_norm": 0.5378120541572571, + "learning_rate": 0.0005912155300749846, + "loss": 3.5522, + "step": 4727 + }, + { + "epoch": 0.23, + "grad_norm": 0.5554460287094116, + "learning_rate": 0.0005912118312162416, + "loss": 3.3615, + "step": 4728 + }, + { + "epoch": 0.23, + "grad_norm": 0.5363547205924988, + "learning_rate": 0.0005912081315905026, + "loss": 3.3284, + "step": 4729 + }, + { + "epoch": 0.23, + "grad_norm": 0.6088721752166748, + "learning_rate": 0.0005912044311977775, + "loss": 3.4141, + "step": 4730 + }, + { + "epoch": 0.23, + "grad_norm": 0.5486693382263184, + "learning_rate": 0.0005912007300380758, + "loss": 3.3685, + "step": 4731 + }, + { + "epoch": 0.23, + "grad_norm": 0.558586597442627, + "learning_rate": 0.0005911970281114075, + "loss": 3.4063, + "step": 4732 + }, + { + "epoch": 0.23, + "grad_norm": 0.5668739676475525, + "learning_rate": 0.0005911933254177824, + "loss": 3.3792, + "step": 4733 + }, + { + "epoch": 0.23, + "grad_norm": 0.5825808644294739, + "learning_rate": 0.00059118962195721, + "loss": 3.6577, + "step": 4734 + }, + { + "epoch": 0.23, + "grad_norm": 0.5412962436676025, + "learning_rate": 0.0005911859177297002, + "loss": 3.2624, + "step": 4735 + }, + { + "epoch": 0.23, + "grad_norm": 0.5681858658790588, + "learning_rate": 0.0005911822127352628, + "loss": 3.3996, + "step": 4736 + }, + { + "epoch": 0.23, + "grad_norm": 0.6266666650772095, + "learning_rate": 0.0005911785069739073, + "loss": 3.5886, + "step": 4737 + }, + { + "epoch": 0.23, + "grad_norm": 0.5447356700897217, + "learning_rate": 0.0005911748004456438, + "loss": 3.3494, + "step": 4738 + }, + { + "epoch": 0.23, + "grad_norm": 0.5406894683837891, + "learning_rate": 0.0005911710931504818, + "loss": 3.4121, + "step": 4739 + }, + { + "epoch": 0.23, + "grad_norm": 0.5392957329750061, + "learning_rate": 0.0005911673850884313, + "loss": 3.3898, + "step": 4740 + }, + { + "epoch": 0.23, + "grad_norm": 0.5871279239654541, + "learning_rate": 0.000591163676259502, + "loss": 3.081, + "step": 4741 + }, + { + "epoch": 0.23, + "grad_norm": 0.6096645593643188, + "learning_rate": 0.0005911599666637035, + "loss": 3.508, + "step": 4742 + }, + { + "epoch": 0.23, + "grad_norm": 0.5398980379104614, + "learning_rate": 0.0005911562563010457, + "loss": 3.5063, + "step": 4743 + }, + { + "epoch": 0.23, + "grad_norm": 0.5835353136062622, + "learning_rate": 0.0005911525451715383, + "loss": 3.477, + "step": 4744 + }, + { + "epoch": 0.23, + "grad_norm": 0.5687283277511597, + "learning_rate": 0.0005911488332751911, + "loss": 3.4038, + "step": 4745 + }, + { + "epoch": 0.23, + "grad_norm": 0.5824450254440308, + "learning_rate": 0.000591145120612014, + "loss": 3.5479, + "step": 4746 + }, + { + "epoch": 0.23, + "grad_norm": 0.5424936413764954, + "learning_rate": 0.0005911414071820167, + "loss": 3.4889, + "step": 4747 + }, + { + "epoch": 0.23, + "grad_norm": 0.5513694882392883, + "learning_rate": 0.000591137692985209, + "loss": 3.4297, + "step": 4748 + }, + { + "epoch": 0.23, + "grad_norm": 0.5256475210189819, + "learning_rate": 0.0005911339780216005, + "loss": 3.6925, + "step": 4749 + }, + { + "epoch": 0.23, + "grad_norm": 0.5420495867729187, + "learning_rate": 0.0005911302622912013, + "loss": 3.6353, + "step": 4750 + }, + { + "epoch": 0.23, + "grad_norm": 0.5440043807029724, + "learning_rate": 0.000591126545794021, + "loss": 3.3512, + "step": 4751 + }, + { + "epoch": 0.23, + "grad_norm": 0.5715298652648926, + "learning_rate": 0.0005911228285300692, + "loss": 3.3806, + "step": 4752 + }, + { + "epoch": 0.23, + "grad_norm": 0.5849560499191284, + "learning_rate": 0.0005911191104993561, + "loss": 3.4816, + "step": 4753 + }, + { + "epoch": 0.23, + "grad_norm": 0.5608013272285461, + "learning_rate": 0.0005911153917018912, + "loss": 3.527, + "step": 4754 + }, + { + "epoch": 0.23, + "grad_norm": 0.584721028804779, + "learning_rate": 0.0005911116721376844, + "loss": 3.284, + "step": 4755 + }, + { + "epoch": 0.23, + "grad_norm": 0.5911849737167358, + "learning_rate": 0.0005911079518067455, + "loss": 3.4843, + "step": 4756 + }, + { + "epoch": 0.23, + "grad_norm": 0.569872260093689, + "learning_rate": 0.0005911042307090843, + "loss": 3.5176, + "step": 4757 + }, + { + "epoch": 0.23, + "grad_norm": 0.6122729778289795, + "learning_rate": 0.0005911005088447105, + "loss": 3.6845, + "step": 4758 + }, + { + "epoch": 0.23, + "grad_norm": 0.552301287651062, + "learning_rate": 0.0005910967862136341, + "loss": 3.5472, + "step": 4759 + }, + { + "epoch": 0.23, + "grad_norm": 0.5530393719673157, + "learning_rate": 0.0005910930628158647, + "loss": 3.6068, + "step": 4760 + }, + { + "epoch": 0.23, + "grad_norm": 0.5497402548789978, + "learning_rate": 0.0005910893386514122, + "loss": 3.3727, + "step": 4761 + }, + { + "epoch": 0.23, + "grad_norm": 0.6003914475440979, + "learning_rate": 0.0005910856137202865, + "loss": 3.5103, + "step": 4762 + }, + { + "epoch": 0.23, + "grad_norm": 0.5288448333740234, + "learning_rate": 0.0005910818880224973, + "loss": 3.4836, + "step": 4763 + }, + { + "epoch": 0.23, + "grad_norm": 0.5383844375610352, + "learning_rate": 0.0005910781615580542, + "loss": 3.3261, + "step": 4764 + }, + { + "epoch": 0.23, + "grad_norm": 0.5371490120887756, + "learning_rate": 0.0005910744343269674, + "loss": 3.5052, + "step": 4765 + }, + { + "epoch": 0.23, + "grad_norm": 0.5520905256271362, + "learning_rate": 0.0005910707063292466, + "loss": 3.5325, + "step": 4766 + }, + { + "epoch": 0.23, + "grad_norm": 0.5796260237693787, + "learning_rate": 0.0005910669775649016, + "loss": 3.1702, + "step": 4767 + }, + { + "epoch": 0.23, + "grad_norm": 0.6260196566581726, + "learning_rate": 0.0005910632480339421, + "loss": 3.4242, + "step": 4768 + }, + { + "epoch": 0.23, + "grad_norm": 0.5613893270492554, + "learning_rate": 0.0005910595177363781, + "loss": 3.6865, + "step": 4769 + }, + { + "epoch": 0.23, + "grad_norm": 0.5682705044746399, + "learning_rate": 0.0005910557866722193, + "loss": 3.1459, + "step": 4770 + }, + { + "epoch": 0.23, + "grad_norm": 0.6285190582275391, + "learning_rate": 0.0005910520548414756, + "loss": 3.3947, + "step": 4771 + }, + { + "epoch": 0.23, + "grad_norm": 0.5714618563652039, + "learning_rate": 0.0005910483222441568, + "loss": 3.5817, + "step": 4772 + }, + { + "epoch": 0.23, + "grad_norm": 0.5577132105827332, + "learning_rate": 0.0005910445888802727, + "loss": 3.4281, + "step": 4773 + }, + { + "epoch": 0.23, + "grad_norm": 0.6071662306785583, + "learning_rate": 0.0005910408547498332, + "loss": 3.3626, + "step": 4774 + }, + { + "epoch": 0.23, + "grad_norm": 0.5966734886169434, + "learning_rate": 0.000591037119852848, + "loss": 3.4659, + "step": 4775 + }, + { + "epoch": 0.23, + "grad_norm": 0.5586175322532654, + "learning_rate": 0.0005910333841893271, + "loss": 3.3256, + "step": 4776 + }, + { + "epoch": 0.23, + "grad_norm": 0.5855554938316345, + "learning_rate": 0.0005910296477592803, + "loss": 3.7103, + "step": 4777 + }, + { + "epoch": 0.23, + "grad_norm": 0.52793288230896, + "learning_rate": 0.0005910259105627174, + "loss": 3.6772, + "step": 4778 + }, + { + "epoch": 0.23, + "grad_norm": 0.5505519509315491, + "learning_rate": 0.0005910221725996483, + "loss": 3.278, + "step": 4779 + }, + { + "epoch": 0.23, + "grad_norm": 0.5763952136039734, + "learning_rate": 0.0005910184338700828, + "loss": 3.4688, + "step": 4780 + }, + { + "epoch": 0.23, + "grad_norm": 0.6565443873405457, + "learning_rate": 0.0005910146943740308, + "loss": 3.5173, + "step": 4781 + }, + { + "epoch": 0.23, + "grad_norm": 0.582163393497467, + "learning_rate": 0.0005910109541115019, + "loss": 3.1918, + "step": 4782 + }, + { + "epoch": 0.23, + "grad_norm": 0.5900031328201294, + "learning_rate": 0.0005910072130825063, + "loss": 3.4327, + "step": 4783 + }, + { + "epoch": 0.23, + "grad_norm": 0.6079528331756592, + "learning_rate": 0.0005910034712870538, + "loss": 3.4477, + "step": 4784 + }, + { + "epoch": 0.23, + "grad_norm": 0.5783188939094543, + "learning_rate": 0.0005909997287251539, + "loss": 3.5502, + "step": 4785 + }, + { + "epoch": 0.23, + "grad_norm": 0.5941360592842102, + "learning_rate": 0.000590995985396817, + "loss": 3.351, + "step": 4786 + }, + { + "epoch": 0.23, + "grad_norm": 0.6161750555038452, + "learning_rate": 0.0005909922413020525, + "loss": 3.5587, + "step": 4787 + }, + { + "epoch": 0.23, + "grad_norm": 0.5669690370559692, + "learning_rate": 0.0005909884964408705, + "loss": 3.5122, + "step": 4788 + }, + { + "epoch": 0.23, + "grad_norm": 0.5724545121192932, + "learning_rate": 0.0005909847508132808, + "loss": 3.3534, + "step": 4789 + }, + { + "epoch": 0.23, + "grad_norm": 0.5895712375640869, + "learning_rate": 0.0005909810044192932, + "loss": 3.4805, + "step": 4790 + }, + { + "epoch": 0.23, + "grad_norm": 0.542855441570282, + "learning_rate": 0.0005909772572589177, + "loss": 3.5939, + "step": 4791 + }, + { + "epoch": 0.23, + "grad_norm": 0.5481388568878174, + "learning_rate": 0.0005909735093321642, + "loss": 3.4121, + "step": 4792 + }, + { + "epoch": 0.23, + "grad_norm": 0.5825479626655579, + "learning_rate": 0.0005909697606390423, + "loss": 3.5954, + "step": 4793 + }, + { + "epoch": 0.23, + "grad_norm": 0.5931483507156372, + "learning_rate": 0.0005909660111795621, + "loss": 3.2799, + "step": 4794 + }, + { + "epoch": 0.23, + "grad_norm": 0.530947208404541, + "learning_rate": 0.0005909622609537334, + "loss": 3.5938, + "step": 4795 + }, + { + "epoch": 0.24, + "grad_norm": 0.5414092540740967, + "learning_rate": 0.0005909585099615662, + "loss": 3.6877, + "step": 4796 + }, + { + "epoch": 0.24, + "grad_norm": 0.5826252698898315, + "learning_rate": 0.0005909547582030702, + "loss": 3.4782, + "step": 4797 + }, + { + "epoch": 0.24, + "grad_norm": 0.5572245717048645, + "learning_rate": 0.0005909510056782554, + "loss": 3.3905, + "step": 4798 + }, + { + "epoch": 0.24, + "grad_norm": 0.564272403717041, + "learning_rate": 0.0005909472523871317, + "loss": 3.5094, + "step": 4799 + }, + { + "epoch": 0.24, + "grad_norm": 0.5508630871772766, + "learning_rate": 0.0005909434983297089, + "loss": 3.501, + "step": 4800 + }, + { + "epoch": 0.24, + "grad_norm": 0.5623564720153809, + "learning_rate": 0.0005909397435059967, + "loss": 3.3671, + "step": 4801 + }, + { + "epoch": 0.24, + "grad_norm": 0.5778913497924805, + "learning_rate": 0.0005909359879160055, + "loss": 3.5242, + "step": 4802 + }, + { + "epoch": 0.24, + "grad_norm": 0.5555720329284668, + "learning_rate": 0.0005909322315597448, + "loss": 3.4082, + "step": 4803 + }, + { + "epoch": 0.24, + "grad_norm": 0.5474628806114197, + "learning_rate": 0.0005909284744372245, + "loss": 3.3297, + "step": 4804 + }, + { + "epoch": 0.24, + "grad_norm": 0.534807026386261, + "learning_rate": 0.0005909247165484547, + "loss": 3.3332, + "step": 4805 + }, + { + "epoch": 0.24, + "grad_norm": 0.524055540561676, + "learning_rate": 0.0005909209578934452, + "loss": 3.583, + "step": 4806 + }, + { + "epoch": 0.24, + "grad_norm": 0.5375897288322449, + "learning_rate": 0.0005909171984722058, + "loss": 3.4536, + "step": 4807 + }, + { + "epoch": 0.24, + "grad_norm": 0.5646499395370483, + "learning_rate": 0.0005909134382847466, + "loss": 3.3704, + "step": 4808 + }, + { + "epoch": 0.24, + "grad_norm": 0.5710200071334839, + "learning_rate": 0.0005909096773310773, + "loss": 3.5032, + "step": 4809 + }, + { + "epoch": 0.24, + "grad_norm": 0.5503785610198975, + "learning_rate": 0.000590905915611208, + "loss": 3.4326, + "step": 4810 + }, + { + "epoch": 0.24, + "grad_norm": 0.5581929087638855, + "learning_rate": 0.0005909021531251483, + "loss": 3.5753, + "step": 4811 + }, + { + "epoch": 0.24, + "grad_norm": 0.6241434216499329, + "learning_rate": 0.0005908983898729084, + "loss": 3.4697, + "step": 4812 + }, + { + "epoch": 0.24, + "grad_norm": 0.6147471070289612, + "learning_rate": 0.0005908946258544982, + "loss": 3.2685, + "step": 4813 + }, + { + "epoch": 0.24, + "grad_norm": 0.6412365436553955, + "learning_rate": 0.0005908908610699275, + "loss": 3.4211, + "step": 4814 + }, + { + "epoch": 0.24, + "grad_norm": 0.5611572861671448, + "learning_rate": 0.0005908870955192064, + "loss": 3.4937, + "step": 4815 + }, + { + "epoch": 0.24, + "grad_norm": 0.5360367298126221, + "learning_rate": 0.0005908833292023445, + "loss": 3.6016, + "step": 4816 + }, + { + "epoch": 0.24, + "grad_norm": 0.546721339225769, + "learning_rate": 0.0005908795621193519, + "loss": 3.2601, + "step": 4817 + }, + { + "epoch": 0.24, + "grad_norm": 0.5728215575218201, + "learning_rate": 0.0005908757942702385, + "loss": 3.4103, + "step": 4818 + }, + { + "epoch": 0.24, + "grad_norm": 0.556920051574707, + "learning_rate": 0.0005908720256550143, + "loss": 3.3727, + "step": 4819 + }, + { + "epoch": 0.24, + "grad_norm": 0.5450972318649292, + "learning_rate": 0.0005908682562736892, + "loss": 3.4042, + "step": 4820 + }, + { + "epoch": 0.24, + "grad_norm": 0.5655497908592224, + "learning_rate": 0.000590864486126273, + "loss": 3.3996, + "step": 4821 + }, + { + "epoch": 0.24, + "grad_norm": 0.5585508942604065, + "learning_rate": 0.0005908607152127757, + "loss": 3.4254, + "step": 4822 + }, + { + "epoch": 0.24, + "grad_norm": 0.542323648929596, + "learning_rate": 0.0005908569435332072, + "loss": 3.457, + "step": 4823 + }, + { + "epoch": 0.24, + "grad_norm": 0.5362521409988403, + "learning_rate": 0.0005908531710875777, + "loss": 3.5526, + "step": 4824 + }, + { + "epoch": 0.24, + "grad_norm": 0.5256375670433044, + "learning_rate": 0.0005908493978758969, + "loss": 3.4572, + "step": 4825 + }, + { + "epoch": 0.24, + "grad_norm": 0.5696166157722473, + "learning_rate": 0.0005908456238981747, + "loss": 3.4514, + "step": 4826 + }, + { + "epoch": 0.24, + "grad_norm": 0.5578446984291077, + "learning_rate": 0.000590841849154421, + "loss": 3.5851, + "step": 4827 + }, + { + "epoch": 0.24, + "grad_norm": 0.5300297737121582, + "learning_rate": 0.0005908380736446459, + "loss": 3.5034, + "step": 4828 + }, + { + "epoch": 0.24, + "grad_norm": 0.5648180246353149, + "learning_rate": 0.0005908342973688594, + "loss": 3.599, + "step": 4829 + }, + { + "epoch": 0.24, + "grad_norm": 0.6014158129692078, + "learning_rate": 0.0005908305203270712, + "loss": 3.4826, + "step": 4830 + }, + { + "epoch": 0.24, + "grad_norm": 0.5411103367805481, + "learning_rate": 0.0005908267425192914, + "loss": 3.4111, + "step": 4831 + }, + { + "epoch": 0.24, + "grad_norm": 0.5625185370445251, + "learning_rate": 0.00059082296394553, + "loss": 3.403, + "step": 4832 + }, + { + "epoch": 0.24, + "grad_norm": 0.6025658845901489, + "learning_rate": 0.0005908191846057968, + "loss": 3.329, + "step": 4833 + }, + { + "epoch": 0.24, + "grad_norm": 0.5506299734115601, + "learning_rate": 0.0005908154045001019, + "loss": 3.6375, + "step": 4834 + }, + { + "epoch": 0.24, + "grad_norm": 0.5933164358139038, + "learning_rate": 0.0005908116236284551, + "loss": 3.2049, + "step": 4835 + }, + { + "epoch": 0.24, + "grad_norm": 0.5729756951332092, + "learning_rate": 0.0005908078419908666, + "loss": 3.3538, + "step": 4836 + }, + { + "epoch": 0.24, + "grad_norm": 0.5527672171592712, + "learning_rate": 0.000590804059587346, + "loss": 3.3438, + "step": 4837 + }, + { + "epoch": 0.24, + "grad_norm": 0.5585240721702576, + "learning_rate": 0.0005908002764179037, + "loss": 3.5065, + "step": 4838 + }, + { + "epoch": 0.24, + "grad_norm": 0.5482339262962341, + "learning_rate": 0.0005907964924825493, + "loss": 3.425, + "step": 4839 + }, + { + "epoch": 0.24, + "grad_norm": 0.5115832686424255, + "learning_rate": 0.000590792707781293, + "loss": 3.4516, + "step": 4840 + }, + { + "epoch": 0.24, + "grad_norm": 0.5796265006065369, + "learning_rate": 0.0005907889223141446, + "loss": 3.4701, + "step": 4841 + }, + { + "epoch": 0.24, + "grad_norm": 0.5543243288993835, + "learning_rate": 0.0005907851360811142, + "loss": 3.2507, + "step": 4842 + }, + { + "epoch": 0.24, + "grad_norm": 0.5635089874267578, + "learning_rate": 0.0005907813490822116, + "loss": 3.3394, + "step": 4843 + }, + { + "epoch": 0.24, + "grad_norm": 0.5561545491218567, + "learning_rate": 0.000590777561317447, + "loss": 3.3604, + "step": 4844 + }, + { + "epoch": 0.24, + "grad_norm": 0.5824812650680542, + "learning_rate": 0.0005907737727868302, + "loss": 3.5275, + "step": 4845 + }, + { + "epoch": 0.24, + "grad_norm": 0.5519053339958191, + "learning_rate": 0.0005907699834903713, + "loss": 3.5574, + "step": 4846 + }, + { + "epoch": 0.24, + "grad_norm": 0.5963732004165649, + "learning_rate": 0.0005907661934280801, + "loss": 3.4281, + "step": 4847 + }, + { + "epoch": 0.24, + "grad_norm": 0.5388736724853516, + "learning_rate": 0.0005907624025999669, + "loss": 3.3095, + "step": 4848 + }, + { + "epoch": 0.24, + "grad_norm": 0.5201981067657471, + "learning_rate": 0.0005907586110060414, + "loss": 3.2326, + "step": 4849 + }, + { + "epoch": 0.24, + "grad_norm": 0.5812944769859314, + "learning_rate": 0.0005907548186463137, + "loss": 3.4784, + "step": 4850 + }, + { + "epoch": 0.24, + "grad_norm": 0.5888471603393555, + "learning_rate": 0.0005907510255207936, + "loss": 3.5884, + "step": 4851 + }, + { + "epoch": 0.24, + "grad_norm": 0.5519949793815613, + "learning_rate": 0.0005907472316294914, + "loss": 3.3247, + "step": 4852 + }, + { + "epoch": 0.24, + "grad_norm": 0.5584381818771362, + "learning_rate": 0.0005907434369724169, + "loss": 3.4411, + "step": 4853 + }, + { + "epoch": 0.24, + "grad_norm": 0.5249735713005066, + "learning_rate": 0.0005907396415495802, + "loss": 3.4842, + "step": 4854 + }, + { + "epoch": 0.24, + "grad_norm": 0.5697634816169739, + "learning_rate": 0.0005907358453609912, + "loss": 3.5065, + "step": 4855 + }, + { + "epoch": 0.24, + "grad_norm": 0.5187658071517944, + "learning_rate": 0.00059073204840666, + "loss": 3.3634, + "step": 4856 + }, + { + "epoch": 0.24, + "grad_norm": 0.5130595564842224, + "learning_rate": 0.0005907282506865965, + "loss": 3.2991, + "step": 4857 + }, + { + "epoch": 0.24, + "grad_norm": 0.5928294658660889, + "learning_rate": 0.0005907244522008107, + "loss": 3.4294, + "step": 4858 + }, + { + "epoch": 0.24, + "grad_norm": 0.5565031170845032, + "learning_rate": 0.0005907206529493127, + "loss": 3.7071, + "step": 4859 + }, + { + "epoch": 0.24, + "grad_norm": 0.5738771557807922, + "learning_rate": 0.0005907168529321124, + "loss": 3.3815, + "step": 4860 + }, + { + "epoch": 0.24, + "grad_norm": 0.5372669696807861, + "learning_rate": 0.0005907130521492198, + "loss": 3.4133, + "step": 4861 + }, + { + "epoch": 0.24, + "grad_norm": 0.7971889972686768, + "learning_rate": 0.0005907092506006452, + "loss": 3.5911, + "step": 4862 + }, + { + "epoch": 0.24, + "grad_norm": 0.6166131496429443, + "learning_rate": 0.0005907054482863981, + "loss": 3.4924, + "step": 4863 + }, + { + "epoch": 0.24, + "grad_norm": 0.6117991209030151, + "learning_rate": 0.0005907016452064889, + "loss": 3.4626, + "step": 4864 + }, + { + "epoch": 0.24, + "grad_norm": 0.5576360821723938, + "learning_rate": 0.0005906978413609276, + "loss": 3.4898, + "step": 4865 + }, + { + "epoch": 0.24, + "grad_norm": 0.5783229470252991, + "learning_rate": 0.000590694036749724, + "loss": 3.3548, + "step": 4866 + }, + { + "epoch": 0.24, + "grad_norm": 0.5609897375106812, + "learning_rate": 0.0005906902313728885, + "loss": 3.3822, + "step": 4867 + }, + { + "epoch": 0.24, + "grad_norm": 0.5815325379371643, + "learning_rate": 0.0005906864252304305, + "loss": 3.5328, + "step": 4868 + }, + { + "epoch": 0.24, + "grad_norm": 0.5757078528404236, + "learning_rate": 0.0005906826183223607, + "loss": 3.4499, + "step": 4869 + }, + { + "epoch": 0.24, + "grad_norm": 0.5703591108322144, + "learning_rate": 0.0005906788106486888, + "loss": 3.4756, + "step": 4870 + }, + { + "epoch": 0.24, + "grad_norm": 0.551869809627533, + "learning_rate": 0.0005906750022094246, + "loss": 3.6005, + "step": 4871 + }, + { + "epoch": 0.24, + "grad_norm": 0.643150269985199, + "learning_rate": 0.0005906711930045787, + "loss": 3.4636, + "step": 4872 + }, + { + "epoch": 0.24, + "grad_norm": 0.5789183378219604, + "learning_rate": 0.0005906673830341606, + "loss": 3.5697, + "step": 4873 + }, + { + "epoch": 0.24, + "grad_norm": 0.5472760796546936, + "learning_rate": 0.0005906635722981807, + "loss": 3.5206, + "step": 4874 + }, + { + "epoch": 0.24, + "grad_norm": 0.6037294268608093, + "learning_rate": 0.0005906597607966488, + "loss": 3.4146, + "step": 4875 + }, + { + "epoch": 0.24, + "grad_norm": 0.5475078225135803, + "learning_rate": 0.000590655948529575, + "loss": 3.5947, + "step": 4876 + }, + { + "epoch": 0.24, + "grad_norm": 0.5711190700531006, + "learning_rate": 0.0005906521354969695, + "loss": 3.2974, + "step": 4877 + }, + { + "epoch": 0.24, + "grad_norm": 0.5613605380058289, + "learning_rate": 0.0005906483216988421, + "loss": 3.2993, + "step": 4878 + }, + { + "epoch": 0.24, + "grad_norm": 0.5774216055870056, + "learning_rate": 0.000590644507135203, + "loss": 3.3731, + "step": 4879 + }, + { + "epoch": 0.24, + "grad_norm": 0.5997605323791504, + "learning_rate": 0.0005906406918060622, + "loss": 3.5623, + "step": 4880 + }, + { + "epoch": 0.24, + "grad_norm": 0.5883737206459045, + "learning_rate": 0.0005906368757114298, + "loss": 3.6096, + "step": 4881 + }, + { + "epoch": 0.24, + "grad_norm": 0.5710089802742004, + "learning_rate": 0.0005906330588513157, + "loss": 3.4704, + "step": 4882 + }, + { + "epoch": 0.24, + "grad_norm": 0.516525149345398, + "learning_rate": 0.0005906292412257301, + "loss": 3.4233, + "step": 4883 + }, + { + "epoch": 0.24, + "grad_norm": 0.585911750793457, + "learning_rate": 0.000590625422834683, + "loss": 3.4795, + "step": 4884 + }, + { + "epoch": 0.24, + "grad_norm": 0.5662615299224854, + "learning_rate": 0.0005906216036781845, + "loss": 3.5267, + "step": 4885 + }, + { + "epoch": 0.24, + "grad_norm": 0.5360583066940308, + "learning_rate": 0.0005906177837562446, + "loss": 3.7512, + "step": 4886 + }, + { + "epoch": 0.24, + "grad_norm": 0.5965997576713562, + "learning_rate": 0.0005906139630688735, + "loss": 3.4446, + "step": 4887 + }, + { + "epoch": 0.24, + "grad_norm": 0.5111775994300842, + "learning_rate": 0.0005906101416160809, + "loss": 3.4171, + "step": 4888 + }, + { + "epoch": 0.24, + "grad_norm": 0.5461533069610596, + "learning_rate": 0.0005906063193978773, + "loss": 3.5264, + "step": 4889 + }, + { + "epoch": 0.24, + "grad_norm": 0.543048620223999, + "learning_rate": 0.0005906024964142726, + "loss": 3.2279, + "step": 4890 + }, + { + "epoch": 0.24, + "grad_norm": 0.5315186977386475, + "learning_rate": 0.0005905986726652769, + "loss": 3.5112, + "step": 4891 + }, + { + "epoch": 0.24, + "grad_norm": 0.5514733791351318, + "learning_rate": 0.0005905948481509, + "loss": 3.1409, + "step": 4892 + }, + { + "epoch": 0.24, + "grad_norm": 0.5441703200340271, + "learning_rate": 0.0005905910228711524, + "loss": 3.3357, + "step": 4893 + }, + { + "epoch": 0.24, + "grad_norm": 0.5863307118415833, + "learning_rate": 0.0005905871968260438, + "loss": 3.5144, + "step": 4894 + }, + { + "epoch": 0.24, + "grad_norm": 0.5528578162193298, + "learning_rate": 0.0005905833700155847, + "loss": 3.3631, + "step": 4895 + }, + { + "epoch": 0.24, + "grad_norm": 0.565290093421936, + "learning_rate": 0.0005905795424397848, + "loss": 3.3386, + "step": 4896 + }, + { + "epoch": 0.24, + "grad_norm": 0.5290336012840271, + "learning_rate": 0.0005905757140986542, + "loss": 3.6237, + "step": 4897 + }, + { + "epoch": 0.24, + "grad_norm": 0.5270799398422241, + "learning_rate": 0.0005905718849922031, + "loss": 3.4527, + "step": 4898 + }, + { + "epoch": 0.24, + "grad_norm": 0.5542150735855103, + "learning_rate": 0.0005905680551204416, + "loss": 3.3004, + "step": 4899 + }, + { + "epoch": 0.24, + "grad_norm": 0.5722207427024841, + "learning_rate": 0.0005905642244833799, + "loss": 3.4647, + "step": 4900 + }, + { + "epoch": 0.24, + "grad_norm": 0.5691162943840027, + "learning_rate": 0.000590560393081028, + "loss": 3.4091, + "step": 4901 + }, + { + "epoch": 0.24, + "grad_norm": 0.5475846529006958, + "learning_rate": 0.0005905565609133957, + "loss": 3.6877, + "step": 4902 + }, + { + "epoch": 0.24, + "grad_norm": 0.5672742128372192, + "learning_rate": 0.0005905527279804934, + "loss": 3.4401, + "step": 4903 + }, + { + "epoch": 0.24, + "grad_norm": 0.49786025285720825, + "learning_rate": 0.0005905488942823312, + "loss": 3.4634, + "step": 4904 + }, + { + "epoch": 0.24, + "grad_norm": 0.5762016773223877, + "learning_rate": 0.0005905450598189191, + "loss": 3.3126, + "step": 4905 + }, + { + "epoch": 0.24, + "grad_norm": 0.5646076798439026, + "learning_rate": 0.0005905412245902673, + "loss": 3.4117, + "step": 4906 + }, + { + "epoch": 0.24, + "grad_norm": 0.6327397227287292, + "learning_rate": 0.0005905373885963857, + "loss": 3.3765, + "step": 4907 + }, + { + "epoch": 0.24, + "grad_norm": 0.5711735486984253, + "learning_rate": 0.0005905335518372846, + "loss": 3.3918, + "step": 4908 + }, + { + "epoch": 0.24, + "grad_norm": 0.5844196677207947, + "learning_rate": 0.0005905297143129741, + "loss": 3.3744, + "step": 4909 + }, + { + "epoch": 0.24, + "grad_norm": 0.603782057762146, + "learning_rate": 0.0005905258760234641, + "loss": 3.3061, + "step": 4910 + }, + { + "epoch": 0.24, + "grad_norm": 0.5405983328819275, + "learning_rate": 0.000590522036968765, + "loss": 3.6347, + "step": 4911 + }, + { + "epoch": 0.24, + "grad_norm": 0.5765902400016785, + "learning_rate": 0.0005905181971488867, + "loss": 3.4907, + "step": 4912 + }, + { + "epoch": 0.24, + "grad_norm": 0.6272974014282227, + "learning_rate": 0.0005905143565638394, + "loss": 3.2816, + "step": 4913 + }, + { + "epoch": 0.24, + "grad_norm": 0.57854825258255, + "learning_rate": 0.0005905105152136332, + "loss": 3.4262, + "step": 4914 + }, + { + "epoch": 0.24, + "grad_norm": 0.578022301197052, + "learning_rate": 0.0005905066730982782, + "loss": 3.321, + "step": 4915 + }, + { + "epoch": 0.24, + "grad_norm": 0.5480340123176575, + "learning_rate": 0.0005905028302177845, + "loss": 3.5696, + "step": 4916 + }, + { + "epoch": 0.24, + "grad_norm": 0.529426634311676, + "learning_rate": 0.0005904989865721622, + "loss": 3.4581, + "step": 4917 + }, + { + "epoch": 0.24, + "grad_norm": 0.523251473903656, + "learning_rate": 0.0005904951421614216, + "loss": 3.5019, + "step": 4918 + }, + { + "epoch": 0.24, + "grad_norm": 0.5666603446006775, + "learning_rate": 0.0005904912969855727, + "loss": 3.5147, + "step": 4919 + }, + { + "epoch": 0.24, + "grad_norm": 0.6100955605506897, + "learning_rate": 0.0005904874510446256, + "loss": 3.4469, + "step": 4920 + }, + { + "epoch": 0.24, + "grad_norm": 0.5637255907058716, + "learning_rate": 0.0005904836043385905, + "loss": 3.3005, + "step": 4921 + }, + { + "epoch": 0.24, + "grad_norm": 0.5765202641487122, + "learning_rate": 0.0005904797568674774, + "loss": 3.4327, + "step": 4922 + }, + { + "epoch": 0.24, + "grad_norm": 0.5553877353668213, + "learning_rate": 0.0005904759086312965, + "loss": 3.3015, + "step": 4923 + }, + { + "epoch": 0.24, + "grad_norm": 0.5453650951385498, + "learning_rate": 0.0005904720596300581, + "loss": 3.4105, + "step": 4924 + }, + { + "epoch": 0.24, + "grad_norm": 0.5456913113594055, + "learning_rate": 0.0005904682098637721, + "loss": 3.4166, + "step": 4925 + }, + { + "epoch": 0.24, + "grad_norm": 0.5444839596748352, + "learning_rate": 0.0005904643593324487, + "loss": 3.5552, + "step": 4926 + }, + { + "epoch": 0.24, + "grad_norm": 0.5601552724838257, + "learning_rate": 0.0005904605080360982, + "loss": 3.5365, + "step": 4927 + }, + { + "epoch": 0.24, + "grad_norm": 0.5500659346580505, + "learning_rate": 0.0005904566559747305, + "loss": 3.4039, + "step": 4928 + }, + { + "epoch": 0.24, + "grad_norm": 0.5786320567131042, + "learning_rate": 0.000590452803148356, + "loss": 3.4035, + "step": 4929 + }, + { + "epoch": 0.24, + "grad_norm": 0.5525652766227722, + "learning_rate": 0.0005904489495569846, + "loss": 3.2293, + "step": 4930 + }, + { + "epoch": 0.24, + "grad_norm": 0.5330731272697449, + "learning_rate": 0.0005904450952006267, + "loss": 3.5334, + "step": 4931 + }, + { + "epoch": 0.24, + "grad_norm": 0.551021158695221, + "learning_rate": 0.0005904412400792921, + "loss": 3.4606, + "step": 4932 + }, + { + "epoch": 0.24, + "grad_norm": 0.5765814781188965, + "learning_rate": 0.0005904373841929913, + "loss": 3.5593, + "step": 4933 + }, + { + "epoch": 0.24, + "grad_norm": 0.5668351054191589, + "learning_rate": 0.0005904335275417344, + "loss": 3.3573, + "step": 4934 + }, + { + "epoch": 0.24, + "grad_norm": 0.5682421326637268, + "learning_rate": 0.0005904296701255314, + "loss": 3.3952, + "step": 4935 + }, + { + "epoch": 0.24, + "grad_norm": 0.5964768528938293, + "learning_rate": 0.0005904258119443925, + "loss": 3.5121, + "step": 4936 + }, + { + "epoch": 0.24, + "grad_norm": 0.5435137748718262, + "learning_rate": 0.000590421952998328, + "loss": 3.4751, + "step": 4937 + }, + { + "epoch": 0.24, + "grad_norm": 0.5703478455543518, + "learning_rate": 0.0005904180932873479, + "loss": 3.3983, + "step": 4938 + }, + { + "epoch": 0.24, + "grad_norm": 0.5706227421760559, + "learning_rate": 0.0005904142328114624, + "loss": 3.2301, + "step": 4939 + }, + { + "epoch": 0.24, + "grad_norm": 0.5686094164848328, + "learning_rate": 0.0005904103715706819, + "loss": 3.5421, + "step": 4940 + }, + { + "epoch": 0.24, + "grad_norm": 0.6083675026893616, + "learning_rate": 0.0005904065095650162, + "loss": 3.4906, + "step": 4941 + }, + { + "epoch": 0.24, + "grad_norm": 0.6154701113700867, + "learning_rate": 0.0005904026467944757, + "loss": 3.319, + "step": 4942 + }, + { + "epoch": 0.24, + "grad_norm": 0.589542031288147, + "learning_rate": 0.0005903987832590703, + "loss": 3.4153, + "step": 4943 + }, + { + "epoch": 0.24, + "grad_norm": 0.5331887602806091, + "learning_rate": 0.0005903949189588107, + "loss": 3.37, + "step": 4944 + }, + { + "epoch": 0.24, + "grad_norm": 0.5606480240821838, + "learning_rate": 0.0005903910538937067, + "loss": 3.4934, + "step": 4945 + }, + { + "epoch": 0.24, + "grad_norm": 0.6116060614585876, + "learning_rate": 0.0005903871880637684, + "loss": 3.5101, + "step": 4946 + }, + { + "epoch": 0.24, + "grad_norm": 0.5732858180999756, + "learning_rate": 0.0005903833214690064, + "loss": 3.4792, + "step": 4947 + }, + { + "epoch": 0.24, + "grad_norm": 0.5375874638557434, + "learning_rate": 0.0005903794541094304, + "loss": 3.619, + "step": 4948 + }, + { + "epoch": 0.24, + "grad_norm": 0.552217423915863, + "learning_rate": 0.0005903755859850508, + "loss": 3.3901, + "step": 4949 + }, + { + "epoch": 0.24, + "grad_norm": 0.544503927230835, + "learning_rate": 0.0005903717170958779, + "loss": 3.3962, + "step": 4950 + }, + { + "epoch": 0.24, + "grad_norm": 0.5237309336662292, + "learning_rate": 0.0005903678474419217, + "loss": 3.4463, + "step": 4951 + }, + { + "epoch": 0.24, + "grad_norm": 0.5415375828742981, + "learning_rate": 0.0005903639770231925, + "loss": 3.2885, + "step": 4952 + }, + { + "epoch": 0.24, + "grad_norm": 0.5329828858375549, + "learning_rate": 0.0005903601058397005, + "loss": 3.2622, + "step": 4953 + }, + { + "epoch": 0.24, + "grad_norm": 0.5593845248222351, + "learning_rate": 0.0005903562338914558, + "loss": 3.6063, + "step": 4954 + }, + { + "epoch": 0.24, + "grad_norm": 0.689546525478363, + "learning_rate": 0.0005903523611784687, + "loss": 3.3436, + "step": 4955 + }, + { + "epoch": 0.24, + "grad_norm": 0.5579712986946106, + "learning_rate": 0.0005903484877007493, + "loss": 3.392, + "step": 4956 + }, + { + "epoch": 0.24, + "grad_norm": 0.5876078009605408, + "learning_rate": 0.0005903446134583079, + "loss": 3.5828, + "step": 4957 + }, + { + "epoch": 0.24, + "grad_norm": 0.6026819348335266, + "learning_rate": 0.0005903407384511547, + "loss": 3.5553, + "step": 4958 + }, + { + "epoch": 0.24, + "grad_norm": 0.5198916792869568, + "learning_rate": 0.0005903368626793, + "loss": 3.4045, + "step": 4959 + }, + { + "epoch": 0.24, + "grad_norm": 0.5764784216880798, + "learning_rate": 0.0005903329861427537, + "loss": 3.3433, + "step": 4960 + }, + { + "epoch": 0.24, + "grad_norm": 0.5851700305938721, + "learning_rate": 0.0005903291088415261, + "loss": 3.554, + "step": 4961 + }, + { + "epoch": 0.24, + "grad_norm": 0.5475395321846008, + "learning_rate": 0.0005903252307756276, + "loss": 3.3761, + "step": 4962 + }, + { + "epoch": 0.24, + "grad_norm": 0.6040002107620239, + "learning_rate": 0.0005903213519450683, + "loss": 3.7124, + "step": 4963 + }, + { + "epoch": 0.24, + "grad_norm": 0.5730305910110474, + "learning_rate": 0.0005903174723498586, + "loss": 3.4037, + "step": 4964 + }, + { + "epoch": 0.24, + "grad_norm": 0.5729190111160278, + "learning_rate": 0.0005903135919900084, + "loss": 3.1942, + "step": 4965 + }, + { + "epoch": 0.24, + "grad_norm": 0.6518261432647705, + "learning_rate": 0.000590309710865528, + "loss": 3.5579, + "step": 4966 + }, + { + "epoch": 0.24, + "grad_norm": 0.5518261790275574, + "learning_rate": 0.0005903058289764279, + "loss": 3.6733, + "step": 4967 + }, + { + "epoch": 0.24, + "grad_norm": 0.5408662557601929, + "learning_rate": 0.000590301946322718, + "loss": 3.2613, + "step": 4968 + }, + { + "epoch": 0.24, + "grad_norm": 0.5802552700042725, + "learning_rate": 0.0005902980629044085, + "loss": 3.5049, + "step": 4969 + }, + { + "epoch": 0.24, + "grad_norm": 0.5875983238220215, + "learning_rate": 0.00059029417872151, + "loss": 3.3783, + "step": 4970 + }, + { + "epoch": 0.24, + "grad_norm": 0.5996633768081665, + "learning_rate": 0.0005902902937740325, + "loss": 3.516, + "step": 4971 + }, + { + "epoch": 0.24, + "grad_norm": 0.5604908466339111, + "learning_rate": 0.0005902864080619861, + "loss": 3.4725, + "step": 4972 + }, + { + "epoch": 0.24, + "grad_norm": 0.5704894065856934, + "learning_rate": 0.0005902825215853812, + "loss": 3.3333, + "step": 4973 + }, + { + "epoch": 0.24, + "grad_norm": 0.5649724006652832, + "learning_rate": 0.000590278634344228, + "loss": 3.3508, + "step": 4974 + }, + { + "epoch": 0.24, + "grad_norm": 0.5423358678817749, + "learning_rate": 0.0005902747463385368, + "loss": 3.4517, + "step": 4975 + }, + { + "epoch": 0.24, + "grad_norm": 0.5924620032310486, + "learning_rate": 0.0005902708575683177, + "loss": 3.4768, + "step": 4976 + }, + { + "epoch": 0.24, + "grad_norm": 0.5734094381332397, + "learning_rate": 0.0005902669680335811, + "loss": 3.3845, + "step": 4977 + }, + { + "epoch": 0.24, + "grad_norm": 0.5569986701011658, + "learning_rate": 0.0005902630777343372, + "loss": 3.5556, + "step": 4978 + }, + { + "epoch": 0.24, + "grad_norm": 0.555212676525116, + "learning_rate": 0.0005902591866705962, + "loss": 3.3604, + "step": 4979 + }, + { + "epoch": 0.24, + "grad_norm": 0.5357699990272522, + "learning_rate": 0.0005902552948423682, + "loss": 3.4253, + "step": 4980 + }, + { + "epoch": 0.24, + "grad_norm": 0.539989173412323, + "learning_rate": 0.0005902514022496638, + "loss": 3.2653, + "step": 4981 + }, + { + "epoch": 0.24, + "grad_norm": 0.539340615272522, + "learning_rate": 0.0005902475088924929, + "loss": 3.6315, + "step": 4982 + }, + { + "epoch": 0.24, + "grad_norm": 0.5658690929412842, + "learning_rate": 0.0005902436147708661, + "loss": 3.3263, + "step": 4983 + }, + { + "epoch": 0.24, + "grad_norm": 0.5437087416648865, + "learning_rate": 0.0005902397198847934, + "loss": 3.3987, + "step": 4984 + }, + { + "epoch": 0.24, + "grad_norm": 0.5789981484413147, + "learning_rate": 0.0005902358242342851, + "loss": 3.5408, + "step": 4985 + }, + { + "epoch": 0.24, + "grad_norm": 0.5366504192352295, + "learning_rate": 0.0005902319278193516, + "loss": 3.4576, + "step": 4986 + }, + { + "epoch": 0.24, + "grad_norm": 0.5638992190361023, + "learning_rate": 0.000590228030640003, + "loss": 3.4145, + "step": 4987 + }, + { + "epoch": 0.24, + "grad_norm": 0.5469755530357361, + "learning_rate": 0.0005902241326962496, + "loss": 3.3786, + "step": 4988 + }, + { + "epoch": 0.24, + "grad_norm": 0.5611241459846497, + "learning_rate": 0.0005902202339881017, + "loss": 3.4466, + "step": 4989 + }, + { + "epoch": 0.24, + "grad_norm": 0.5863252282142639, + "learning_rate": 0.0005902163345155696, + "loss": 3.3491, + "step": 4990 + }, + { + "epoch": 0.24, + "grad_norm": 0.577661395072937, + "learning_rate": 0.0005902124342786635, + "loss": 3.1473, + "step": 4991 + }, + { + "epoch": 0.24, + "grad_norm": 0.5453828573226929, + "learning_rate": 0.0005902085332773935, + "loss": 3.3708, + "step": 4992 + }, + { + "epoch": 0.24, + "grad_norm": 0.5631303787231445, + "learning_rate": 0.0005902046315117704, + "loss": 3.4, + "step": 4993 + }, + { + "epoch": 0.24, + "grad_norm": 0.5475294589996338, + "learning_rate": 0.000590200728981804, + "loss": 3.3217, + "step": 4994 + }, + { + "epoch": 0.24, + "grad_norm": 0.5487865805625916, + "learning_rate": 0.0005901968256875047, + "loss": 3.3504, + "step": 4995 + }, + { + "epoch": 0.24, + "grad_norm": 0.5545656085014343, + "learning_rate": 0.0005901929216288828, + "loss": 3.4953, + "step": 4996 + }, + { + "epoch": 0.24, + "grad_norm": 0.5044790506362915, + "learning_rate": 0.0005901890168059487, + "loss": 3.142, + "step": 4997 + }, + { + "epoch": 0.24, + "grad_norm": 0.5951768755912781, + "learning_rate": 0.0005901851112187125, + "loss": 3.4991, + "step": 4998 + }, + { + "epoch": 0.24, + "grad_norm": 0.540361225605011, + "learning_rate": 0.0005901812048671847, + "loss": 3.4398, + "step": 4999 + }, + { + "epoch": 0.25, + "grad_norm": 0.5717666745185852, + "learning_rate": 0.0005901772977513754, + "loss": 3.3718, + "step": 5000 + }, + { + "epoch": 0.25, + "grad_norm": 0.5859256386756897, + "learning_rate": 0.0005901733898712948, + "loss": 3.2956, + "step": 5001 + }, + { + "epoch": 0.25, + "grad_norm": 0.5552880764007568, + "learning_rate": 0.0005901694812269533, + "loss": 3.5566, + "step": 5002 + }, + { + "epoch": 0.25, + "grad_norm": 0.5178077220916748, + "learning_rate": 0.0005901655718183615, + "loss": 3.503, + "step": 5003 + }, + { + "epoch": 0.25, + "grad_norm": 0.5720773339271545, + "learning_rate": 0.0005901616616455292, + "loss": 3.2363, + "step": 5004 + }, + { + "epoch": 0.25, + "grad_norm": 0.5559942126274109, + "learning_rate": 0.0005901577507084671, + "loss": 3.4998, + "step": 5005 + }, + { + "epoch": 0.25, + "grad_norm": 0.5338559746742249, + "learning_rate": 0.0005901538390071853, + "loss": 3.6233, + "step": 5006 + }, + { + "epoch": 0.25, + "grad_norm": 0.5544248223304749, + "learning_rate": 0.0005901499265416939, + "loss": 3.4523, + "step": 5007 + }, + { + "epoch": 0.25, + "grad_norm": 0.5183117985725403, + "learning_rate": 0.0005901460133120037, + "loss": 3.4746, + "step": 5008 + }, + { + "epoch": 0.25, + "grad_norm": 0.5503129959106445, + "learning_rate": 0.0005901420993181247, + "loss": 3.5572, + "step": 5009 + }, + { + "epoch": 0.25, + "grad_norm": 0.6068310141563416, + "learning_rate": 0.000590138184560067, + "loss": 3.5988, + "step": 5010 + }, + { + "epoch": 0.25, + "grad_norm": 0.5316990613937378, + "learning_rate": 0.0005901342690378413, + "loss": 3.6803, + "step": 5011 + }, + { + "epoch": 0.25, + "grad_norm": 0.516459047794342, + "learning_rate": 0.0005901303527514579, + "loss": 3.3902, + "step": 5012 + }, + { + "epoch": 0.25, + "grad_norm": 0.5768346786499023, + "learning_rate": 0.0005901264357009269, + "loss": 3.572, + "step": 5013 + }, + { + "epoch": 0.25, + "grad_norm": 0.5403907299041748, + "learning_rate": 0.0005901225178862588, + "loss": 3.5612, + "step": 5014 + }, + { + "epoch": 0.25, + "grad_norm": 0.5107371807098389, + "learning_rate": 0.0005901185993074637, + "loss": 3.4745, + "step": 5015 + }, + { + "epoch": 0.25, + "grad_norm": 0.527996301651001, + "learning_rate": 0.0005901146799645521, + "loss": 3.5359, + "step": 5016 + }, + { + "epoch": 0.25, + "grad_norm": 0.5773131847381592, + "learning_rate": 0.0005901107598575342, + "loss": 3.4109, + "step": 5017 + }, + { + "epoch": 0.25, + "grad_norm": 0.5386675596237183, + "learning_rate": 0.0005901068389864204, + "loss": 3.4633, + "step": 5018 + }, + { + "epoch": 0.25, + "grad_norm": 0.5442395806312561, + "learning_rate": 0.0005901029173512211, + "loss": 3.5547, + "step": 5019 + }, + { + "epoch": 0.25, + "grad_norm": 0.5196928977966309, + "learning_rate": 0.0005900989949519465, + "loss": 3.5074, + "step": 5020 + }, + { + "epoch": 0.25, + "grad_norm": 0.5516992211341858, + "learning_rate": 0.000590095071788607, + "loss": 3.3833, + "step": 5021 + }, + { + "epoch": 0.25, + "grad_norm": 0.5801973342895508, + "learning_rate": 0.0005900911478612129, + "loss": 3.2449, + "step": 5022 + }, + { + "epoch": 0.25, + "grad_norm": 0.5835938453674316, + "learning_rate": 0.0005900872231697745, + "loss": 3.4805, + "step": 5023 + }, + { + "epoch": 0.25, + "grad_norm": 0.5422643423080444, + "learning_rate": 0.0005900832977143022, + "loss": 3.347, + "step": 5024 + }, + { + "epoch": 0.25, + "grad_norm": 0.5615981221199036, + "learning_rate": 0.0005900793714948065, + "loss": 3.234, + "step": 5025 + }, + { + "epoch": 0.25, + "grad_norm": 0.616595447063446, + "learning_rate": 0.0005900754445112974, + "loss": 3.3415, + "step": 5026 + }, + { + "epoch": 0.25, + "grad_norm": 0.5145601034164429, + "learning_rate": 0.0005900715167637854, + "loss": 3.222, + "step": 5027 + }, + { + "epoch": 0.25, + "grad_norm": 0.5435516834259033, + "learning_rate": 0.0005900675882522808, + "loss": 3.5811, + "step": 5028 + }, + { + "epoch": 0.25, + "grad_norm": 0.5470258593559265, + "learning_rate": 0.0005900636589767941, + "loss": 3.4718, + "step": 5029 + }, + { + "epoch": 0.25, + "grad_norm": 0.5767030119895935, + "learning_rate": 0.0005900597289373355, + "loss": 3.4194, + "step": 5030 + }, + { + "epoch": 0.25, + "grad_norm": 0.5893784761428833, + "learning_rate": 0.0005900557981339153, + "loss": 3.4108, + "step": 5031 + }, + { + "epoch": 0.25, + "grad_norm": 0.5466517806053162, + "learning_rate": 0.0005900518665665441, + "loss": 3.439, + "step": 5032 + }, + { + "epoch": 0.25, + "grad_norm": 0.544185996055603, + "learning_rate": 0.000590047934235232, + "loss": 3.5536, + "step": 5033 + }, + { + "epoch": 0.25, + "grad_norm": 0.5449085235595703, + "learning_rate": 0.0005900440011399895, + "loss": 3.527, + "step": 5034 + }, + { + "epoch": 0.25, + "grad_norm": 0.5282718539237976, + "learning_rate": 0.000590040067280827, + "loss": 3.6722, + "step": 5035 + }, + { + "epoch": 0.25, + "grad_norm": 0.5545849204063416, + "learning_rate": 0.0005900361326577547, + "loss": 3.5084, + "step": 5036 + }, + { + "epoch": 0.25, + "grad_norm": 0.5555055141448975, + "learning_rate": 0.0005900321972707831, + "loss": 3.2376, + "step": 5037 + }, + { + "epoch": 0.25, + "grad_norm": 0.5830196142196655, + "learning_rate": 0.0005900282611199225, + "loss": 3.3006, + "step": 5038 + }, + { + "epoch": 0.25, + "grad_norm": 0.5802530646324158, + "learning_rate": 0.0005900243242051833, + "loss": 3.2766, + "step": 5039 + }, + { + "epoch": 0.25, + "grad_norm": 0.5483977198600769, + "learning_rate": 0.0005900203865265757, + "loss": 3.5217, + "step": 5040 + }, + { + "epoch": 0.25, + "grad_norm": 0.5744450092315674, + "learning_rate": 0.0005900164480841104, + "loss": 3.4969, + "step": 5041 + }, + { + "epoch": 0.25, + "grad_norm": 0.5572537779808044, + "learning_rate": 0.0005900125088777974, + "loss": 3.4738, + "step": 5042 + }, + { + "epoch": 0.25, + "grad_norm": 0.5599087476730347, + "learning_rate": 0.0005900085689076474, + "loss": 3.4334, + "step": 5043 + }, + { + "epoch": 0.25, + "grad_norm": 0.5435726046562195, + "learning_rate": 0.0005900046281736707, + "loss": 3.5784, + "step": 5044 + }, + { + "epoch": 0.25, + "grad_norm": 0.5599098205566406, + "learning_rate": 0.0005900006866758774, + "loss": 3.4954, + "step": 5045 + }, + { + "epoch": 0.25, + "grad_norm": 0.5361737608909607, + "learning_rate": 0.0005899967444142783, + "loss": 3.5627, + "step": 5046 + }, + { + "epoch": 0.25, + "grad_norm": 0.5649425387382507, + "learning_rate": 0.0005899928013888835, + "loss": 3.447, + "step": 5047 + }, + { + "epoch": 0.25, + "grad_norm": 0.568342924118042, + "learning_rate": 0.0005899888575997035, + "loss": 3.4439, + "step": 5048 + }, + { + "epoch": 0.25, + "grad_norm": 0.5820963382720947, + "learning_rate": 0.0005899849130467488, + "loss": 3.4019, + "step": 5049 + }, + { + "epoch": 0.25, + "grad_norm": 0.5288267135620117, + "learning_rate": 0.0005899809677300294, + "loss": 3.6183, + "step": 5050 + }, + { + "epoch": 0.25, + "grad_norm": 0.5664812326431274, + "learning_rate": 0.0005899770216495561, + "loss": 3.4155, + "step": 5051 + }, + { + "epoch": 0.25, + "grad_norm": 0.5544307827949524, + "learning_rate": 0.000589973074805339, + "loss": 3.1679, + "step": 5052 + }, + { + "epoch": 0.25, + "grad_norm": 0.5682546496391296, + "learning_rate": 0.0005899691271973888, + "loss": 3.3364, + "step": 5053 + }, + { + "epoch": 0.25, + "grad_norm": 0.5925281643867493, + "learning_rate": 0.0005899651788257155, + "loss": 3.2977, + "step": 5054 + }, + { + "epoch": 0.25, + "grad_norm": 0.5429676175117493, + "learning_rate": 0.00058996122969033, + "loss": 3.4784, + "step": 5055 + }, + { + "epoch": 0.25, + "grad_norm": 0.5814798474311829, + "learning_rate": 0.0005899572797912422, + "loss": 3.2982, + "step": 5056 + }, + { + "epoch": 0.25, + "grad_norm": 0.5307891964912415, + "learning_rate": 0.0005899533291284628, + "loss": 3.6792, + "step": 5057 + }, + { + "epoch": 0.25, + "grad_norm": 0.570961594581604, + "learning_rate": 0.0005899493777020021, + "loss": 3.4474, + "step": 5058 + }, + { + "epoch": 0.25, + "grad_norm": 0.5177574157714844, + "learning_rate": 0.0005899454255118707, + "loss": 3.3287, + "step": 5059 + }, + { + "epoch": 0.25, + "grad_norm": 0.5786210894584656, + "learning_rate": 0.0005899414725580787, + "loss": 3.4678, + "step": 5060 + }, + { + "epoch": 0.25, + "grad_norm": 0.5470947623252869, + "learning_rate": 0.0005899375188406366, + "loss": 3.5344, + "step": 5061 + }, + { + "epoch": 0.25, + "grad_norm": 0.5230193138122559, + "learning_rate": 0.000589933564359555, + "loss": 3.5553, + "step": 5062 + }, + { + "epoch": 0.25, + "grad_norm": 0.5864706635475159, + "learning_rate": 0.0005899296091148442, + "loss": 3.3547, + "step": 5063 + }, + { + "epoch": 0.25, + "grad_norm": 0.5561924576759338, + "learning_rate": 0.0005899256531065145, + "loss": 3.6271, + "step": 5064 + }, + { + "epoch": 0.25, + "grad_norm": 0.5769221186637878, + "learning_rate": 0.0005899216963345766, + "loss": 3.4765, + "step": 5065 + }, + { + "epoch": 0.25, + "grad_norm": 0.5741177201271057, + "learning_rate": 0.0005899177387990406, + "loss": 3.5332, + "step": 5066 + }, + { + "epoch": 0.25, + "grad_norm": 0.550760805606842, + "learning_rate": 0.000589913780499917, + "loss": 3.583, + "step": 5067 + }, + { + "epoch": 0.25, + "grad_norm": 0.6087910532951355, + "learning_rate": 0.0005899098214372164, + "loss": 3.2774, + "step": 5068 + }, + { + "epoch": 0.25, + "grad_norm": 0.6020672917366028, + "learning_rate": 0.0005899058616109492, + "loss": 3.5222, + "step": 5069 + }, + { + "epoch": 0.25, + "grad_norm": 0.5288599133491516, + "learning_rate": 0.0005899019010211258, + "loss": 3.5523, + "step": 5070 + }, + { + "epoch": 0.25, + "grad_norm": 0.6012958288192749, + "learning_rate": 0.0005898979396677564, + "loss": 3.5154, + "step": 5071 + }, + { + "epoch": 0.25, + "grad_norm": 0.5777899622917175, + "learning_rate": 0.0005898939775508517, + "loss": 3.2862, + "step": 5072 + }, + { + "epoch": 0.25, + "grad_norm": 0.5450665950775146, + "learning_rate": 0.000589890014670422, + "loss": 3.2721, + "step": 5073 + }, + { + "epoch": 0.25, + "grad_norm": 0.5397341847419739, + "learning_rate": 0.0005898860510264778, + "loss": 3.3992, + "step": 5074 + }, + { + "epoch": 0.25, + "grad_norm": 0.5461152195930481, + "learning_rate": 0.0005898820866190297, + "loss": 3.4282, + "step": 5075 + }, + { + "epoch": 0.25, + "grad_norm": 0.5408706665039062, + "learning_rate": 0.0005898781214480878, + "loss": 3.5096, + "step": 5076 + }, + { + "epoch": 0.25, + "grad_norm": 0.5572208166122437, + "learning_rate": 0.0005898741555136627, + "loss": 3.4326, + "step": 5077 + }, + { + "epoch": 0.25, + "grad_norm": 0.5309155583381653, + "learning_rate": 0.0005898701888157649, + "loss": 3.3239, + "step": 5078 + }, + { + "epoch": 0.25, + "grad_norm": 0.5635367035865784, + "learning_rate": 0.0005898662213544048, + "loss": 3.2986, + "step": 5079 + }, + { + "epoch": 0.25, + "grad_norm": 0.5413909554481506, + "learning_rate": 0.0005898622531295929, + "loss": 3.5096, + "step": 5080 + }, + { + "epoch": 0.25, + "grad_norm": 0.5198115110397339, + "learning_rate": 0.0005898582841413397, + "loss": 3.3075, + "step": 5081 + }, + { + "epoch": 0.25, + "grad_norm": 0.5086342692375183, + "learning_rate": 0.0005898543143896555, + "loss": 3.3621, + "step": 5082 + }, + { + "epoch": 0.25, + "grad_norm": 0.6958548426628113, + "learning_rate": 0.0005898503438745507, + "loss": 3.461, + "step": 5083 + }, + { + "epoch": 0.25, + "grad_norm": 0.5100436210632324, + "learning_rate": 0.0005898463725960359, + "loss": 3.3348, + "step": 5084 + }, + { + "epoch": 0.25, + "grad_norm": 0.6006938815116882, + "learning_rate": 0.0005898424005541216, + "loss": 3.4243, + "step": 5085 + }, + { + "epoch": 0.25, + "grad_norm": 0.5753837823867798, + "learning_rate": 0.0005898384277488182, + "loss": 3.5554, + "step": 5086 + }, + { + "epoch": 0.25, + "grad_norm": 0.5907991528511047, + "learning_rate": 0.0005898344541801362, + "loss": 3.4331, + "step": 5087 + }, + { + "epoch": 0.25, + "grad_norm": 0.5146076679229736, + "learning_rate": 0.000589830479848086, + "loss": 3.3868, + "step": 5088 + }, + { + "epoch": 0.25, + "grad_norm": 0.5464553833007812, + "learning_rate": 0.0005898265047526781, + "loss": 3.468, + "step": 5089 + }, + { + "epoch": 0.25, + "grad_norm": 0.5454960465431213, + "learning_rate": 0.000589822528893923, + "loss": 3.4598, + "step": 5090 + }, + { + "epoch": 0.25, + "grad_norm": 0.5484764575958252, + "learning_rate": 0.000589818552271831, + "loss": 3.3738, + "step": 5091 + }, + { + "epoch": 0.25, + "grad_norm": 0.554803729057312, + "learning_rate": 0.0005898145748864128, + "loss": 3.1957, + "step": 5092 + }, + { + "epoch": 0.25, + "grad_norm": 0.5293271541595459, + "learning_rate": 0.0005898105967376788, + "loss": 3.5345, + "step": 5093 + }, + { + "epoch": 0.25, + "grad_norm": 0.56730717420578, + "learning_rate": 0.0005898066178256394, + "loss": 3.4173, + "step": 5094 + }, + { + "epoch": 0.25, + "grad_norm": 0.5431801080703735, + "learning_rate": 0.0005898026381503051, + "loss": 3.3932, + "step": 5095 + }, + { + "epoch": 0.25, + "grad_norm": 0.6341773867607117, + "learning_rate": 0.0005897986577116865, + "loss": 3.318, + "step": 5096 + }, + { + "epoch": 0.25, + "grad_norm": 0.5350258946418762, + "learning_rate": 0.0005897946765097941, + "loss": 3.4459, + "step": 5097 + }, + { + "epoch": 0.25, + "grad_norm": 0.5501015186309814, + "learning_rate": 0.0005897906945446382, + "loss": 3.2029, + "step": 5098 + }, + { + "epoch": 0.25, + "grad_norm": 0.5346304178237915, + "learning_rate": 0.0005897867118162294, + "loss": 3.4928, + "step": 5099 + }, + { + "epoch": 0.25, + "grad_norm": 0.6124736070632935, + "learning_rate": 0.0005897827283245781, + "loss": 3.65, + "step": 5100 + }, + { + "epoch": 0.25, + "grad_norm": 0.5016577839851379, + "learning_rate": 0.000589778744069695, + "loss": 3.3579, + "step": 5101 + }, + { + "epoch": 0.25, + "grad_norm": 0.541294276714325, + "learning_rate": 0.0005897747590515904, + "loss": 3.1981, + "step": 5102 + }, + { + "epoch": 0.25, + "grad_norm": 0.5059472322463989, + "learning_rate": 0.0005897707732702748, + "loss": 3.3077, + "step": 5103 + }, + { + "epoch": 0.25, + "grad_norm": 0.5518838763237, + "learning_rate": 0.0005897667867257588, + "loss": 3.3557, + "step": 5104 + }, + { + "epoch": 0.25, + "grad_norm": 0.5457679033279419, + "learning_rate": 0.0005897627994180529, + "loss": 3.3617, + "step": 5105 + }, + { + "epoch": 0.25, + "grad_norm": 0.6038949489593506, + "learning_rate": 0.0005897588113471674, + "loss": 3.2915, + "step": 5106 + }, + { + "epoch": 0.25, + "grad_norm": 0.590950071811676, + "learning_rate": 0.000589754822513113, + "loss": 3.1439, + "step": 5107 + }, + { + "epoch": 0.25, + "grad_norm": 0.5621350407600403, + "learning_rate": 0.0005897508329159003, + "loss": 3.2661, + "step": 5108 + }, + { + "epoch": 0.25, + "grad_norm": 0.5230415463447571, + "learning_rate": 0.0005897468425555395, + "loss": 3.5213, + "step": 5109 + }, + { + "epoch": 0.25, + "grad_norm": 0.5285176038742065, + "learning_rate": 0.0005897428514320414, + "loss": 3.3294, + "step": 5110 + }, + { + "epoch": 0.25, + "grad_norm": 0.5499192476272583, + "learning_rate": 0.0005897388595454164, + "loss": 3.4348, + "step": 5111 + }, + { + "epoch": 0.25, + "grad_norm": 0.5603438019752502, + "learning_rate": 0.000589734866895675, + "loss": 3.4721, + "step": 5112 + }, + { + "epoch": 0.25, + "grad_norm": 0.5261187553405762, + "learning_rate": 0.0005897308734828276, + "loss": 3.4034, + "step": 5113 + }, + { + "epoch": 0.25, + "grad_norm": 0.5853512287139893, + "learning_rate": 0.0005897268793068848, + "loss": 3.3507, + "step": 5114 + }, + { + "epoch": 0.25, + "grad_norm": 0.586706817150116, + "learning_rate": 0.0005897228843678574, + "loss": 3.2244, + "step": 5115 + }, + { + "epoch": 0.25, + "grad_norm": 0.5517246127128601, + "learning_rate": 0.0005897188886657555, + "loss": 3.5185, + "step": 5116 + }, + { + "epoch": 0.25, + "grad_norm": 0.5283602476119995, + "learning_rate": 0.00058971489220059, + "loss": 3.2524, + "step": 5117 + }, + { + "epoch": 0.25, + "grad_norm": 0.5656755566596985, + "learning_rate": 0.000589710894972371, + "loss": 3.5088, + "step": 5118 + }, + { + "epoch": 0.25, + "grad_norm": 0.5920532941818237, + "learning_rate": 0.0005897068969811094, + "loss": 3.5174, + "step": 5119 + }, + { + "epoch": 0.25, + "grad_norm": 0.5770808458328247, + "learning_rate": 0.0005897028982268155, + "loss": 3.304, + "step": 5120 + }, + { + "epoch": 0.25, + "grad_norm": 0.542290210723877, + "learning_rate": 0.0005896988987094999, + "loss": 3.5272, + "step": 5121 + }, + { + "epoch": 0.25, + "grad_norm": 0.5746126770973206, + "learning_rate": 0.0005896948984291733, + "loss": 3.6472, + "step": 5122 + }, + { + "epoch": 0.25, + "grad_norm": 0.5588904023170471, + "learning_rate": 0.0005896908973858459, + "loss": 3.5337, + "step": 5123 + }, + { + "epoch": 0.25, + "grad_norm": 0.5217200517654419, + "learning_rate": 0.0005896868955795286, + "loss": 3.3863, + "step": 5124 + }, + { + "epoch": 0.25, + "grad_norm": 0.5870938897132874, + "learning_rate": 0.0005896828930102316, + "loss": 3.3279, + "step": 5125 + }, + { + "epoch": 0.25, + "grad_norm": 0.6563788652420044, + "learning_rate": 0.0005896788896779658, + "loss": 3.2821, + "step": 5126 + }, + { + "epoch": 0.25, + "grad_norm": 0.5480024218559265, + "learning_rate": 0.0005896748855827414, + "loss": 3.3585, + "step": 5127 + }, + { + "epoch": 0.25, + "grad_norm": 0.5863211154937744, + "learning_rate": 0.0005896708807245693, + "loss": 3.4657, + "step": 5128 + }, + { + "epoch": 0.25, + "grad_norm": 0.6124269366264343, + "learning_rate": 0.0005896668751034596, + "loss": 3.5115, + "step": 5129 + }, + { + "epoch": 0.25, + "grad_norm": 0.6593347191810608, + "learning_rate": 0.0005896628687194232, + "loss": 3.3722, + "step": 5130 + }, + { + "epoch": 0.25, + "grad_norm": 0.5311346054077148, + "learning_rate": 0.0005896588615724706, + "loss": 3.2849, + "step": 5131 + }, + { + "epoch": 0.25, + "grad_norm": 0.5981443524360657, + "learning_rate": 0.0005896548536626123, + "loss": 3.2468, + "step": 5132 + }, + { + "epoch": 0.25, + "grad_norm": 0.5324131846427917, + "learning_rate": 0.0005896508449898587, + "loss": 3.4398, + "step": 5133 + }, + { + "epoch": 0.25, + "grad_norm": 0.5485259294509888, + "learning_rate": 0.0005896468355542206, + "loss": 3.3877, + "step": 5134 + }, + { + "epoch": 0.25, + "grad_norm": 0.6059028506278992, + "learning_rate": 0.0005896428253557086, + "loss": 3.2956, + "step": 5135 + }, + { + "epoch": 0.25, + "grad_norm": 0.5261827111244202, + "learning_rate": 0.0005896388143943331, + "loss": 3.5208, + "step": 5136 + }, + { + "epoch": 0.25, + "grad_norm": 0.596929669380188, + "learning_rate": 0.0005896348026701047, + "loss": 3.3708, + "step": 5137 + }, + { + "epoch": 0.25, + "grad_norm": 0.5352926254272461, + "learning_rate": 0.0005896307901830339, + "loss": 3.3242, + "step": 5138 + }, + { + "epoch": 0.25, + "grad_norm": 0.5876359939575195, + "learning_rate": 0.0005896267769331313, + "loss": 3.6109, + "step": 5139 + }, + { + "epoch": 0.25, + "grad_norm": 0.5615381598472595, + "learning_rate": 0.0005896227629204077, + "loss": 3.2824, + "step": 5140 + }, + { + "epoch": 0.25, + "grad_norm": 0.5441862940788269, + "learning_rate": 0.0005896187481448734, + "loss": 3.316, + "step": 5141 + }, + { + "epoch": 0.25, + "grad_norm": 0.5573780536651611, + "learning_rate": 0.000589614732606539, + "loss": 3.3629, + "step": 5142 + }, + { + "epoch": 0.25, + "grad_norm": 0.5792946815490723, + "learning_rate": 0.0005896107163054151, + "loss": 3.4812, + "step": 5143 + }, + { + "epoch": 0.25, + "grad_norm": 0.5667078495025635, + "learning_rate": 0.0005896066992415125, + "loss": 3.4833, + "step": 5144 + }, + { + "epoch": 0.25, + "grad_norm": 0.5536587834358215, + "learning_rate": 0.0005896026814148414, + "loss": 3.3131, + "step": 5145 + }, + { + "epoch": 0.25, + "grad_norm": 0.5453284978866577, + "learning_rate": 0.0005895986628254126, + "loss": 3.518, + "step": 5146 + }, + { + "epoch": 0.25, + "grad_norm": 0.6266303062438965, + "learning_rate": 0.0005895946434732366, + "loss": 3.3667, + "step": 5147 + }, + { + "epoch": 0.25, + "grad_norm": 0.5294222831726074, + "learning_rate": 0.0005895906233583242, + "loss": 3.3286, + "step": 5148 + }, + { + "epoch": 0.25, + "grad_norm": 0.6126430034637451, + "learning_rate": 0.0005895866024806858, + "loss": 3.3122, + "step": 5149 + }, + { + "epoch": 0.25, + "grad_norm": 0.5357118248939514, + "learning_rate": 0.0005895825808403321, + "loss": 3.3943, + "step": 5150 + }, + { + "epoch": 0.25, + "grad_norm": 0.5523558855056763, + "learning_rate": 0.0005895785584372734, + "loss": 3.5076, + "step": 5151 + }, + { + "epoch": 0.25, + "grad_norm": 0.7012884020805359, + "learning_rate": 0.0005895745352715205, + "loss": 3.0987, + "step": 5152 + }, + { + "epoch": 0.25, + "grad_norm": 0.5519027709960938, + "learning_rate": 0.0005895705113430842, + "loss": 3.6015, + "step": 5153 + }, + { + "epoch": 0.25, + "grad_norm": 0.6840552091598511, + "learning_rate": 0.0005895664866519748, + "loss": 3.2324, + "step": 5154 + }, + { + "epoch": 0.25, + "grad_norm": 0.5728353261947632, + "learning_rate": 0.000589562461198203, + "loss": 3.5145, + "step": 5155 + }, + { + "epoch": 0.25, + "grad_norm": 0.5669745802879333, + "learning_rate": 0.0005895584349817794, + "loss": 3.2349, + "step": 5156 + }, + { + "epoch": 0.25, + "grad_norm": 0.551141083240509, + "learning_rate": 0.0005895544080027147, + "loss": 3.4344, + "step": 5157 + }, + { + "epoch": 0.25, + "grad_norm": 0.5899537205696106, + "learning_rate": 0.0005895503802610193, + "loss": 3.5101, + "step": 5158 + }, + { + "epoch": 0.25, + "grad_norm": 0.541935384273529, + "learning_rate": 0.000589546351756704, + "loss": 3.6462, + "step": 5159 + }, + { + "epoch": 0.25, + "grad_norm": 0.5857089757919312, + "learning_rate": 0.0005895423224897792, + "loss": 3.5987, + "step": 5160 + }, + { + "epoch": 0.25, + "grad_norm": 0.5397253632545471, + "learning_rate": 0.0005895382924602558, + "loss": 3.4983, + "step": 5161 + }, + { + "epoch": 0.25, + "grad_norm": 0.5339411497116089, + "learning_rate": 0.0005895342616681442, + "loss": 3.3218, + "step": 5162 + }, + { + "epoch": 0.25, + "grad_norm": 0.5548738241195679, + "learning_rate": 0.0005895302301134551, + "loss": 3.5583, + "step": 5163 + }, + { + "epoch": 0.25, + "grad_norm": 0.5891041159629822, + "learning_rate": 0.0005895261977961991, + "loss": 3.4377, + "step": 5164 + }, + { + "epoch": 0.25, + "grad_norm": 0.569484531879425, + "learning_rate": 0.0005895221647163867, + "loss": 3.3795, + "step": 5165 + }, + { + "epoch": 0.25, + "grad_norm": 0.5073286890983582, + "learning_rate": 0.0005895181308740288, + "loss": 3.6983, + "step": 5166 + }, + { + "epoch": 0.25, + "grad_norm": 0.550396203994751, + "learning_rate": 0.0005895140962691358, + "loss": 3.3491, + "step": 5167 + }, + { + "epoch": 0.25, + "grad_norm": 0.502250611782074, + "learning_rate": 0.0005895100609017184, + "loss": 3.3437, + "step": 5168 + }, + { + "epoch": 0.25, + "grad_norm": 0.5587743520736694, + "learning_rate": 0.0005895060247717871, + "loss": 3.6706, + "step": 5169 + }, + { + "epoch": 0.25, + "grad_norm": 0.5262305736541748, + "learning_rate": 0.0005895019878793527, + "loss": 3.4108, + "step": 5170 + }, + { + "epoch": 0.25, + "grad_norm": 0.5481409430503845, + "learning_rate": 0.0005894979502244259, + "loss": 3.4605, + "step": 5171 + }, + { + "epoch": 0.25, + "grad_norm": 0.5923982858657837, + "learning_rate": 0.0005894939118070171, + "loss": 3.4484, + "step": 5172 + }, + { + "epoch": 0.25, + "grad_norm": 0.5860485434532166, + "learning_rate": 0.0005894898726271371, + "loss": 3.0184, + "step": 5173 + }, + { + "epoch": 0.25, + "grad_norm": 0.5705693364143372, + "learning_rate": 0.0005894858326847965, + "loss": 3.439, + "step": 5174 + }, + { + "epoch": 0.25, + "grad_norm": 0.5961979031562805, + "learning_rate": 0.0005894817919800059, + "loss": 3.5159, + "step": 5175 + }, + { + "epoch": 0.25, + "grad_norm": 0.5484743714332581, + "learning_rate": 0.000589477750512776, + "loss": 3.2929, + "step": 5176 + }, + { + "epoch": 0.25, + "grad_norm": 0.628601610660553, + "learning_rate": 0.0005894737082831173, + "loss": 3.3603, + "step": 5177 + }, + { + "epoch": 0.25, + "grad_norm": 0.5901480317115784, + "learning_rate": 0.0005894696652910408, + "loss": 3.3222, + "step": 5178 + }, + { + "epoch": 0.25, + "grad_norm": 0.555828869342804, + "learning_rate": 0.0005894656215365567, + "loss": 3.4859, + "step": 5179 + }, + { + "epoch": 0.25, + "grad_norm": 0.5342446565628052, + "learning_rate": 0.000589461577019676, + "loss": 3.7213, + "step": 5180 + }, + { + "epoch": 0.25, + "grad_norm": 0.5517928004264832, + "learning_rate": 0.000589457531740409, + "loss": 3.5133, + "step": 5181 + }, + { + "epoch": 0.25, + "grad_norm": 0.5539923906326294, + "learning_rate": 0.0005894534856987668, + "loss": 3.4874, + "step": 5182 + }, + { + "epoch": 0.25, + "grad_norm": 0.5563471913337708, + "learning_rate": 0.0005894494388947598, + "loss": 3.4176, + "step": 5183 + }, + { + "epoch": 0.25, + "grad_norm": 0.5729748606681824, + "learning_rate": 0.0005894453913283986, + "loss": 3.4297, + "step": 5184 + }, + { + "epoch": 0.25, + "grad_norm": 0.577461302280426, + "learning_rate": 0.0005894413429996939, + "loss": 3.0731, + "step": 5185 + }, + { + "epoch": 0.25, + "grad_norm": 0.5624281764030457, + "learning_rate": 0.0005894372939086565, + "loss": 3.6104, + "step": 5186 + }, + { + "epoch": 0.25, + "grad_norm": 0.593137800693512, + "learning_rate": 0.0005894332440552969, + "loss": 3.4637, + "step": 5187 + }, + { + "epoch": 0.25, + "grad_norm": 0.5267358422279358, + "learning_rate": 0.0005894291934396259, + "loss": 3.2419, + "step": 5188 + }, + { + "epoch": 0.25, + "grad_norm": 0.6192349195480347, + "learning_rate": 0.0005894251420616541, + "loss": 3.5089, + "step": 5189 + }, + { + "epoch": 0.25, + "grad_norm": 0.5877721905708313, + "learning_rate": 0.0005894210899213921, + "loss": 3.4892, + "step": 5190 + }, + { + "epoch": 0.25, + "grad_norm": 0.7060837149620056, + "learning_rate": 0.0005894170370188508, + "loss": 3.3611, + "step": 5191 + }, + { + "epoch": 0.25, + "grad_norm": 0.5571918487548828, + "learning_rate": 0.0005894129833540405, + "loss": 3.5587, + "step": 5192 + }, + { + "epoch": 0.25, + "grad_norm": 0.5204548239707947, + "learning_rate": 0.0005894089289269724, + "loss": 3.4699, + "step": 5193 + }, + { + "epoch": 0.25, + "grad_norm": 0.5898545384407043, + "learning_rate": 0.0005894048737376567, + "loss": 3.499, + "step": 5194 + }, + { + "epoch": 0.25, + "grad_norm": 0.5386577248573303, + "learning_rate": 0.0005894008177861043, + "loss": 3.3448, + "step": 5195 + }, + { + "epoch": 0.25, + "grad_norm": 0.5543986558914185, + "learning_rate": 0.0005893967610723257, + "loss": 3.4441, + "step": 5196 + }, + { + "epoch": 0.25, + "grad_norm": 0.6119479537010193, + "learning_rate": 0.0005893927035963319, + "loss": 3.3068, + "step": 5197 + }, + { + "epoch": 0.25, + "grad_norm": 0.5174164772033691, + "learning_rate": 0.0005893886453581334, + "loss": 3.2137, + "step": 5198 + }, + { + "epoch": 0.25, + "grad_norm": 0.5318208932876587, + "learning_rate": 0.0005893845863577409, + "loss": 3.4068, + "step": 5199 + }, + { + "epoch": 0.25, + "grad_norm": 0.558030366897583, + "learning_rate": 0.000589380526595165, + "loss": 3.1989, + "step": 5200 + }, + { + "epoch": 0.25, + "grad_norm": 0.5202679634094238, + "learning_rate": 0.0005893764660704166, + "loss": 3.3643, + "step": 5201 + }, + { + "epoch": 0.25, + "grad_norm": 0.5347263813018799, + "learning_rate": 0.0005893724047835062, + "loss": 3.4693, + "step": 5202 + }, + { + "epoch": 0.25, + "grad_norm": 0.5433123111724854, + "learning_rate": 0.0005893683427344445, + "loss": 3.2793, + "step": 5203 + }, + { + "epoch": 0.26, + "grad_norm": 0.5789240598678589, + "learning_rate": 0.0005893642799232425, + "loss": 3.1991, + "step": 5204 + }, + { + "epoch": 0.26, + "grad_norm": 0.5516834259033203, + "learning_rate": 0.0005893602163499105, + "loss": 3.4335, + "step": 5205 + }, + { + "epoch": 0.26, + "grad_norm": 0.5537156462669373, + "learning_rate": 0.0005893561520144594, + "loss": 3.3853, + "step": 5206 + }, + { + "epoch": 0.26, + "grad_norm": 0.5437060594558716, + "learning_rate": 0.0005893520869168999, + "loss": 3.301, + "step": 5207 + }, + { + "epoch": 0.26, + "grad_norm": 0.6070511341094971, + "learning_rate": 0.0005893480210572427, + "loss": 3.4726, + "step": 5208 + }, + { + "epoch": 0.26, + "grad_norm": 0.5556560754776001, + "learning_rate": 0.0005893439544354984, + "loss": 3.2371, + "step": 5209 + }, + { + "epoch": 0.26, + "grad_norm": 0.5555329322814941, + "learning_rate": 0.000589339887051678, + "loss": 3.3675, + "step": 5210 + }, + { + "epoch": 0.26, + "grad_norm": 0.551421582698822, + "learning_rate": 0.0005893358189057919, + "loss": 3.3801, + "step": 5211 + }, + { + "epoch": 0.26, + "grad_norm": 0.5763034224510193, + "learning_rate": 0.0005893317499978511, + "loss": 3.5617, + "step": 5212 + }, + { + "epoch": 0.26, + "grad_norm": 0.5456756353378296, + "learning_rate": 0.0005893276803278659, + "loss": 3.5274, + "step": 5213 + }, + { + "epoch": 0.26, + "grad_norm": 0.5538662672042847, + "learning_rate": 0.0005893236098958474, + "loss": 3.3474, + "step": 5214 + }, + { + "epoch": 0.26, + "grad_norm": 0.5430862307548523, + "learning_rate": 0.0005893195387018062, + "loss": 3.3443, + "step": 5215 + }, + { + "epoch": 0.26, + "grad_norm": 0.5130735039710999, + "learning_rate": 0.000589315466745753, + "loss": 3.637, + "step": 5216 + }, + { + "epoch": 0.26, + "grad_norm": 0.5832960605621338, + "learning_rate": 0.0005893113940276985, + "loss": 3.319, + "step": 5217 + }, + { + "epoch": 0.26, + "grad_norm": 0.5417028665542603, + "learning_rate": 0.0005893073205476535, + "loss": 3.3201, + "step": 5218 + }, + { + "epoch": 0.26, + "grad_norm": 0.5512108206748962, + "learning_rate": 0.0005893032463056288, + "loss": 3.2845, + "step": 5219 + }, + { + "epoch": 0.26, + "grad_norm": 0.5375921726226807, + "learning_rate": 0.0005892991713016348, + "loss": 3.642, + "step": 5220 + }, + { + "epoch": 0.26, + "grad_norm": 0.5193976759910583, + "learning_rate": 0.0005892950955356827, + "loss": 3.5162, + "step": 5221 + }, + { + "epoch": 0.26, + "grad_norm": 0.5780069231987, + "learning_rate": 0.0005892910190077828, + "loss": 3.5228, + "step": 5222 + }, + { + "epoch": 0.26, + "grad_norm": 0.5876449346542358, + "learning_rate": 0.0005892869417179461, + "loss": 3.2904, + "step": 5223 + }, + { + "epoch": 0.26, + "grad_norm": 0.5421982407569885, + "learning_rate": 0.0005892828636661833, + "loss": 3.5897, + "step": 5224 + }, + { + "epoch": 0.26, + "grad_norm": 0.6510359048843384, + "learning_rate": 0.0005892787848525052, + "loss": 3.2834, + "step": 5225 + }, + { + "epoch": 0.26, + "grad_norm": 0.5625903010368347, + "learning_rate": 0.0005892747052769222, + "loss": 3.2265, + "step": 5226 + }, + { + "epoch": 0.26, + "grad_norm": 0.5502780079841614, + "learning_rate": 0.0005892706249394455, + "loss": 3.3097, + "step": 5227 + }, + { + "epoch": 0.26, + "grad_norm": 0.5297096967697144, + "learning_rate": 0.0005892665438400856, + "loss": 3.6185, + "step": 5228 + }, + { + "epoch": 0.26, + "grad_norm": 0.5778887271881104, + "learning_rate": 0.0005892624619788533, + "loss": 3.4486, + "step": 5229 + }, + { + "epoch": 0.26, + "grad_norm": 0.5754767656326294, + "learning_rate": 0.0005892583793557594, + "loss": 3.3823, + "step": 5230 + }, + { + "epoch": 0.26, + "grad_norm": 0.5356746315956116, + "learning_rate": 0.0005892542959708145, + "loss": 3.686, + "step": 5231 + }, + { + "epoch": 0.26, + "grad_norm": 0.5547672510147095, + "learning_rate": 0.0005892502118240295, + "loss": 3.4749, + "step": 5232 + }, + { + "epoch": 0.26, + "grad_norm": 0.5451996326446533, + "learning_rate": 0.0005892461269154151, + "loss": 3.2667, + "step": 5233 + }, + { + "epoch": 0.26, + "grad_norm": 0.5528019070625305, + "learning_rate": 0.0005892420412449821, + "loss": 3.3068, + "step": 5234 + }, + { + "epoch": 0.26, + "grad_norm": 0.5734788179397583, + "learning_rate": 0.0005892379548127411, + "loss": 3.4308, + "step": 5235 + }, + { + "epoch": 0.26, + "grad_norm": 0.6340581774711609, + "learning_rate": 0.000589233867618703, + "loss": 3.4311, + "step": 5236 + }, + { + "epoch": 0.26, + "grad_norm": 0.6577187180519104, + "learning_rate": 0.0005892297796628787, + "loss": 3.4902, + "step": 5237 + }, + { + "epoch": 0.26, + "grad_norm": 0.5538560152053833, + "learning_rate": 0.0005892256909452788, + "loss": 3.5402, + "step": 5238 + }, + { + "epoch": 0.26, + "grad_norm": 0.5807637572288513, + "learning_rate": 0.0005892216014659141, + "loss": 3.4806, + "step": 5239 + }, + { + "epoch": 0.26, + "grad_norm": 0.6450725793838501, + "learning_rate": 0.0005892175112247952, + "loss": 3.3013, + "step": 5240 + }, + { + "epoch": 0.26, + "grad_norm": 0.5808170437812805, + "learning_rate": 0.0005892134202219332, + "loss": 3.4958, + "step": 5241 + }, + { + "epoch": 0.26, + "grad_norm": 0.5749102234840393, + "learning_rate": 0.0005892093284573386, + "loss": 3.4446, + "step": 5242 + }, + { + "epoch": 0.26, + "grad_norm": 0.5990297794342041, + "learning_rate": 0.0005892052359310224, + "loss": 3.2321, + "step": 5243 + }, + { + "epoch": 0.26, + "grad_norm": 0.5670678019523621, + "learning_rate": 0.0005892011426429952, + "loss": 3.5992, + "step": 5244 + }, + { + "epoch": 0.26, + "grad_norm": 0.5572386384010315, + "learning_rate": 0.0005891970485932679, + "loss": 3.3791, + "step": 5245 + }, + { + "epoch": 0.26, + "grad_norm": 0.5236799120903015, + "learning_rate": 0.0005891929537818512, + "loss": 3.5792, + "step": 5246 + }, + { + "epoch": 0.26, + "grad_norm": 0.595807671546936, + "learning_rate": 0.0005891888582087559, + "loss": 3.1541, + "step": 5247 + }, + { + "epoch": 0.26, + "grad_norm": 0.5384485125541687, + "learning_rate": 0.0005891847618739929, + "loss": 3.0939, + "step": 5248 + }, + { + "epoch": 0.26, + "grad_norm": 0.5316618084907532, + "learning_rate": 0.0005891806647775727, + "loss": 3.6524, + "step": 5249 + }, + { + "epoch": 0.26, + "grad_norm": 0.527287483215332, + "learning_rate": 0.0005891765669195064, + "loss": 3.4803, + "step": 5250 + }, + { + "epoch": 0.26, + "grad_norm": 0.5352149605751038, + "learning_rate": 0.0005891724682998046, + "loss": 3.4944, + "step": 5251 + }, + { + "epoch": 0.26, + "grad_norm": 0.5439637303352356, + "learning_rate": 0.0005891683689184784, + "loss": 3.3703, + "step": 5252 + }, + { + "epoch": 0.26, + "grad_norm": 0.5407761335372925, + "learning_rate": 0.0005891642687755382, + "loss": 3.635, + "step": 5253 + }, + { + "epoch": 0.26, + "grad_norm": 0.5867874622344971, + "learning_rate": 0.0005891601678709949, + "loss": 3.3837, + "step": 5254 + }, + { + "epoch": 0.26, + "grad_norm": 0.5279407501220703, + "learning_rate": 0.0005891560662048595, + "loss": 3.3589, + "step": 5255 + }, + { + "epoch": 0.26, + "grad_norm": 0.549512505531311, + "learning_rate": 0.0005891519637771426, + "loss": 3.3413, + "step": 5256 + }, + { + "epoch": 0.26, + "grad_norm": 0.566560685634613, + "learning_rate": 0.0005891478605878551, + "loss": 3.4066, + "step": 5257 + }, + { + "epoch": 0.26, + "grad_norm": 0.5679735541343689, + "learning_rate": 0.0005891437566370078, + "loss": 3.3625, + "step": 5258 + }, + { + "epoch": 0.26, + "grad_norm": 0.5230506062507629, + "learning_rate": 0.0005891396519246114, + "loss": 3.3698, + "step": 5259 + }, + { + "epoch": 0.26, + "grad_norm": 0.5605278611183167, + "learning_rate": 0.0005891355464506769, + "loss": 3.3979, + "step": 5260 + }, + { + "epoch": 0.26, + "grad_norm": 0.5670188069343567, + "learning_rate": 0.000589131440215215, + "loss": 3.5514, + "step": 5261 + }, + { + "epoch": 0.26, + "grad_norm": 0.5283556580543518, + "learning_rate": 0.0005891273332182365, + "loss": 3.416, + "step": 5262 + }, + { + "epoch": 0.26, + "grad_norm": 0.5201714038848877, + "learning_rate": 0.0005891232254597523, + "loss": 3.3487, + "step": 5263 + }, + { + "epoch": 0.26, + "grad_norm": 0.6400845646858215, + "learning_rate": 0.0005891191169397731, + "loss": 3.2951, + "step": 5264 + }, + { + "epoch": 0.26, + "grad_norm": 0.5510321259498596, + "learning_rate": 0.0005891150076583098, + "loss": 3.4685, + "step": 5265 + }, + { + "epoch": 0.26, + "grad_norm": 0.5668927431106567, + "learning_rate": 0.0005891108976153732, + "loss": 3.4804, + "step": 5266 + }, + { + "epoch": 0.26, + "grad_norm": 0.5134303569793701, + "learning_rate": 0.0005891067868109743, + "loss": 3.3349, + "step": 5267 + }, + { + "epoch": 0.26, + "grad_norm": 0.6687813997268677, + "learning_rate": 0.0005891026752451235, + "loss": 3.4913, + "step": 5268 + }, + { + "epoch": 0.26, + "grad_norm": 0.5363355278968811, + "learning_rate": 0.0005890985629178321, + "loss": 3.3884, + "step": 5269 + }, + { + "epoch": 0.26, + "grad_norm": 0.5483888387680054, + "learning_rate": 0.0005890944498291106, + "loss": 3.2948, + "step": 5270 + }, + { + "epoch": 0.26, + "grad_norm": 0.5730322599411011, + "learning_rate": 0.00058909033597897, + "loss": 3.4693, + "step": 5271 + }, + { + "epoch": 0.26, + "grad_norm": 0.6208261847496033, + "learning_rate": 0.0005890862213674211, + "loss": 3.403, + "step": 5272 + }, + { + "epoch": 0.26, + "grad_norm": 0.5502722263336182, + "learning_rate": 0.0005890821059944748, + "loss": 3.442, + "step": 5273 + }, + { + "epoch": 0.26, + "grad_norm": 0.5494133234024048, + "learning_rate": 0.0005890779898601417, + "loss": 3.4079, + "step": 5274 + }, + { + "epoch": 0.26, + "grad_norm": 0.5719727873802185, + "learning_rate": 0.0005890738729644329, + "loss": 3.4966, + "step": 5275 + }, + { + "epoch": 0.26, + "grad_norm": 0.5802255868911743, + "learning_rate": 0.0005890697553073591, + "loss": 3.4647, + "step": 5276 + }, + { + "epoch": 0.26, + "grad_norm": 0.5484011769294739, + "learning_rate": 0.0005890656368889313, + "loss": 3.4309, + "step": 5277 + }, + { + "epoch": 0.26, + "grad_norm": 0.5723143219947815, + "learning_rate": 0.0005890615177091601, + "loss": 3.6246, + "step": 5278 + }, + { + "epoch": 0.26, + "grad_norm": 0.5343371033668518, + "learning_rate": 0.0005890573977680567, + "loss": 3.6963, + "step": 5279 + }, + { + "epoch": 0.26, + "grad_norm": 0.5718325972557068, + "learning_rate": 0.0005890532770656316, + "loss": 3.487, + "step": 5280 + }, + { + "epoch": 0.26, + "grad_norm": 0.5279609560966492, + "learning_rate": 0.0005890491556018958, + "loss": 3.4661, + "step": 5281 + }, + { + "epoch": 0.26, + "grad_norm": 0.5968024134635925, + "learning_rate": 0.0005890450333768601, + "loss": 3.533, + "step": 5282 + }, + { + "epoch": 0.26, + "grad_norm": 0.5400145053863525, + "learning_rate": 0.0005890409103905356, + "loss": 3.2575, + "step": 5283 + }, + { + "epoch": 0.26, + "grad_norm": 0.5489025712013245, + "learning_rate": 0.0005890367866429329, + "loss": 3.36, + "step": 5284 + }, + { + "epoch": 0.26, + "grad_norm": 0.600946843624115, + "learning_rate": 0.0005890326621340628, + "loss": 3.5745, + "step": 5285 + }, + { + "epoch": 0.26, + "grad_norm": 0.5801662802696228, + "learning_rate": 0.0005890285368639363, + "loss": 3.4846, + "step": 5286 + }, + { + "epoch": 0.26, + "grad_norm": 0.5930183529853821, + "learning_rate": 0.0005890244108325645, + "loss": 3.3913, + "step": 5287 + }, + { + "epoch": 0.26, + "grad_norm": 0.5204430818557739, + "learning_rate": 0.0005890202840399579, + "loss": 3.3981, + "step": 5288 + }, + { + "epoch": 0.26, + "grad_norm": 0.5407831072807312, + "learning_rate": 0.0005890161564861274, + "loss": 3.6175, + "step": 5289 + }, + { + "epoch": 0.26, + "grad_norm": 0.5350310206413269, + "learning_rate": 0.000589012028171084, + "loss": 3.6356, + "step": 5290 + }, + { + "epoch": 0.26, + "grad_norm": 0.601344645023346, + "learning_rate": 0.0005890078990948386, + "loss": 3.2246, + "step": 5291 + }, + { + "epoch": 0.26, + "grad_norm": 0.5307313799858093, + "learning_rate": 0.000589003769257402, + "loss": 3.6117, + "step": 5292 + }, + { + "epoch": 0.26, + "grad_norm": 0.5290243029594421, + "learning_rate": 0.000588999638658785, + "loss": 3.4505, + "step": 5293 + }, + { + "epoch": 0.26, + "grad_norm": 0.5569483637809753, + "learning_rate": 0.0005889955072989986, + "loss": 3.3312, + "step": 5294 + }, + { + "epoch": 0.26, + "grad_norm": 0.5312089920043945, + "learning_rate": 0.0005889913751780537, + "loss": 3.5256, + "step": 5295 + }, + { + "epoch": 0.26, + "grad_norm": 0.6026173830032349, + "learning_rate": 0.0005889872422959612, + "loss": 3.4126, + "step": 5296 + }, + { + "epoch": 0.26, + "grad_norm": 0.6224252581596375, + "learning_rate": 0.0005889831086527318, + "loss": 3.483, + "step": 5297 + }, + { + "epoch": 0.26, + "grad_norm": 0.5267722606658936, + "learning_rate": 0.0005889789742483766, + "loss": 3.5036, + "step": 5298 + }, + { + "epoch": 0.26, + "grad_norm": 0.5711367130279541, + "learning_rate": 0.0005889748390829064, + "loss": 3.3469, + "step": 5299 + }, + { + "epoch": 0.26, + "grad_norm": 0.5197421312332153, + "learning_rate": 0.000588970703156332, + "loss": 3.3095, + "step": 5300 + }, + { + "epoch": 0.26, + "grad_norm": 0.552913248538971, + "learning_rate": 0.0005889665664686644, + "loss": 3.483, + "step": 5301 + }, + { + "epoch": 0.26, + "grad_norm": 0.5599504113197327, + "learning_rate": 0.0005889624290199145, + "loss": 3.2881, + "step": 5302 + }, + { + "epoch": 0.26, + "grad_norm": 0.5224385261535645, + "learning_rate": 0.0005889582908100932, + "loss": 3.6117, + "step": 5303 + }, + { + "epoch": 0.26, + "grad_norm": 0.5339375734329224, + "learning_rate": 0.0005889541518392114, + "loss": 3.4773, + "step": 5304 + }, + { + "epoch": 0.26, + "grad_norm": 0.5676855444908142, + "learning_rate": 0.0005889500121072799, + "loss": 3.365, + "step": 5305 + }, + { + "epoch": 0.26, + "grad_norm": 0.6814284920692444, + "learning_rate": 0.0005889458716143097, + "loss": 3.5605, + "step": 5306 + }, + { + "epoch": 0.26, + "grad_norm": 0.52662593126297, + "learning_rate": 0.0005889417303603117, + "loss": 3.5284, + "step": 5307 + }, + { + "epoch": 0.26, + "grad_norm": 0.5496254563331604, + "learning_rate": 0.0005889375883452967, + "loss": 3.4624, + "step": 5308 + }, + { + "epoch": 0.26, + "grad_norm": 0.5711499452590942, + "learning_rate": 0.0005889334455692758, + "loss": 3.4269, + "step": 5309 + }, + { + "epoch": 0.26, + "grad_norm": 0.5623414516448975, + "learning_rate": 0.0005889293020322597, + "loss": 3.4268, + "step": 5310 + }, + { + "epoch": 0.26, + "grad_norm": 0.5185620188713074, + "learning_rate": 0.0005889251577342597, + "loss": 3.3941, + "step": 5311 + }, + { + "epoch": 0.26, + "grad_norm": 0.5413085222244263, + "learning_rate": 0.0005889210126752861, + "loss": 3.5135, + "step": 5312 + }, + { + "epoch": 0.26, + "grad_norm": 0.5592106580734253, + "learning_rate": 0.0005889168668553504, + "loss": 3.3922, + "step": 5313 + }, + { + "epoch": 0.26, + "grad_norm": 0.5406152606010437, + "learning_rate": 0.0005889127202744631, + "loss": 3.4864, + "step": 5314 + }, + { + "epoch": 0.26, + "grad_norm": 0.5414677262306213, + "learning_rate": 0.0005889085729326354, + "loss": 3.3056, + "step": 5315 + }, + { + "epoch": 0.26, + "grad_norm": 0.5268755555152893, + "learning_rate": 0.0005889044248298781, + "loss": 3.2041, + "step": 5316 + }, + { + "epoch": 0.26, + "grad_norm": 0.5885822772979736, + "learning_rate": 0.0005889002759662021, + "loss": 3.4368, + "step": 5317 + }, + { + "epoch": 0.26, + "grad_norm": 0.5720695853233337, + "learning_rate": 0.0005888961263416185, + "loss": 3.4827, + "step": 5318 + }, + { + "epoch": 0.26, + "grad_norm": 0.5145799517631531, + "learning_rate": 0.000588891975956138, + "loss": 3.4205, + "step": 5319 + }, + { + "epoch": 0.26, + "grad_norm": 0.5636258721351624, + "learning_rate": 0.0005888878248097717, + "loss": 3.4612, + "step": 5320 + }, + { + "epoch": 0.26, + "grad_norm": 0.5309170484542847, + "learning_rate": 0.0005888836729025304, + "loss": 3.3744, + "step": 5321 + }, + { + "epoch": 0.26, + "grad_norm": 0.5823953747749329, + "learning_rate": 0.0005888795202344251, + "loss": 3.1887, + "step": 5322 + }, + { + "epoch": 0.26, + "grad_norm": 0.5687710046768188, + "learning_rate": 0.0005888753668054667, + "loss": 3.4677, + "step": 5323 + }, + { + "epoch": 0.26, + "grad_norm": 0.5707094669342041, + "learning_rate": 0.0005888712126156663, + "loss": 3.3657, + "step": 5324 + }, + { + "epoch": 0.26, + "grad_norm": 0.5369641184806824, + "learning_rate": 0.0005888670576650346, + "loss": 3.3792, + "step": 5325 + }, + { + "epoch": 0.26, + "grad_norm": 0.550995409488678, + "learning_rate": 0.0005888629019535828, + "loss": 3.3031, + "step": 5326 + }, + { + "epoch": 0.26, + "grad_norm": 0.5561549067497253, + "learning_rate": 0.0005888587454813216, + "loss": 3.3467, + "step": 5327 + }, + { + "epoch": 0.26, + "grad_norm": 0.564005434513092, + "learning_rate": 0.0005888545882482622, + "loss": 3.262, + "step": 5328 + }, + { + "epoch": 0.26, + "grad_norm": 0.5233489871025085, + "learning_rate": 0.0005888504302544152, + "loss": 3.1761, + "step": 5329 + }, + { + "epoch": 0.26, + "grad_norm": 0.586366593837738, + "learning_rate": 0.0005888462714997919, + "loss": 3.2845, + "step": 5330 + }, + { + "epoch": 0.26, + "grad_norm": 0.6343865990638733, + "learning_rate": 0.000588842111984403, + "loss": 3.5545, + "step": 5331 + }, + { + "epoch": 0.26, + "grad_norm": 0.66603022813797, + "learning_rate": 0.0005888379517082597, + "loss": 3.4753, + "step": 5332 + }, + { + "epoch": 0.26, + "grad_norm": 0.5965749621391296, + "learning_rate": 0.0005888337906713728, + "loss": 3.3929, + "step": 5333 + }, + { + "epoch": 0.26, + "grad_norm": 0.5687329173088074, + "learning_rate": 0.0005888296288737531, + "loss": 3.4986, + "step": 5334 + }, + { + "epoch": 0.26, + "grad_norm": 0.5994446277618408, + "learning_rate": 0.0005888254663154119, + "loss": 3.5974, + "step": 5335 + }, + { + "epoch": 0.26, + "grad_norm": 0.5664075613021851, + "learning_rate": 0.00058882130299636, + "loss": 3.451, + "step": 5336 + }, + { + "epoch": 0.26, + "grad_norm": 0.5647047758102417, + "learning_rate": 0.0005888171389166083, + "loss": 3.4053, + "step": 5337 + }, + { + "epoch": 0.26, + "grad_norm": 0.5460997819900513, + "learning_rate": 0.0005888129740761679, + "loss": 3.2895, + "step": 5338 + }, + { + "epoch": 0.26, + "grad_norm": 0.5719659328460693, + "learning_rate": 0.0005888088084750497, + "loss": 3.2872, + "step": 5339 + }, + { + "epoch": 0.26, + "grad_norm": 0.5610400438308716, + "learning_rate": 0.0005888046421132647, + "loss": 3.3737, + "step": 5340 + }, + { + "epoch": 0.26, + "grad_norm": 0.5502822995185852, + "learning_rate": 0.0005888004749908239, + "loss": 3.3927, + "step": 5341 + }, + { + "epoch": 0.26, + "grad_norm": 0.6235296726226807, + "learning_rate": 0.0005887963071077382, + "loss": 3.4243, + "step": 5342 + }, + { + "epoch": 0.26, + "grad_norm": 0.5775569081306458, + "learning_rate": 0.0005887921384640186, + "loss": 3.4377, + "step": 5343 + }, + { + "epoch": 0.26, + "grad_norm": 0.5198445320129395, + "learning_rate": 0.000588787969059676, + "loss": 3.308, + "step": 5344 + }, + { + "epoch": 0.26, + "grad_norm": 0.5459919571876526, + "learning_rate": 0.0005887837988947216, + "loss": 3.5653, + "step": 5345 + }, + { + "epoch": 0.26, + "grad_norm": 0.5754595994949341, + "learning_rate": 0.0005887796279691661, + "loss": 3.226, + "step": 5346 + }, + { + "epoch": 0.26, + "grad_norm": 0.5614638328552246, + "learning_rate": 0.0005887754562830207, + "loss": 3.4762, + "step": 5347 + }, + { + "epoch": 0.26, + "grad_norm": 0.5696617960929871, + "learning_rate": 0.0005887712838362963, + "loss": 3.2895, + "step": 5348 + }, + { + "epoch": 0.26, + "grad_norm": 0.529895544052124, + "learning_rate": 0.000588767110629004, + "loss": 3.3031, + "step": 5349 + }, + { + "epoch": 0.26, + "grad_norm": 0.5466099977493286, + "learning_rate": 0.0005887629366611547, + "loss": 3.6197, + "step": 5350 + }, + { + "epoch": 0.26, + "grad_norm": 0.6054186820983887, + "learning_rate": 0.0005887587619327594, + "loss": 3.3346, + "step": 5351 + }, + { + "epoch": 0.26, + "grad_norm": 0.551434338092804, + "learning_rate": 0.0005887545864438291, + "loss": 3.6942, + "step": 5352 + }, + { + "epoch": 0.26, + "grad_norm": 0.5595240592956543, + "learning_rate": 0.0005887504101943746, + "loss": 3.2481, + "step": 5353 + }, + { + "epoch": 0.26, + "grad_norm": 0.5537325739860535, + "learning_rate": 0.0005887462331844072, + "loss": 3.3643, + "step": 5354 + }, + { + "epoch": 0.26, + "grad_norm": 0.5237861275672913, + "learning_rate": 0.0005887420554139379, + "loss": 3.0849, + "step": 5355 + }, + { + "epoch": 0.26, + "grad_norm": 0.5428269505500793, + "learning_rate": 0.0005887378768829776, + "loss": 3.6186, + "step": 5356 + }, + { + "epoch": 0.26, + "grad_norm": 0.5731378793716431, + "learning_rate": 0.0005887336975915372, + "loss": 3.4173, + "step": 5357 + }, + { + "epoch": 0.26, + "grad_norm": 0.5460600852966309, + "learning_rate": 0.0005887295175396279, + "loss": 3.294, + "step": 5358 + }, + { + "epoch": 0.26, + "grad_norm": 0.5621720552444458, + "learning_rate": 0.0005887253367272605, + "loss": 3.19, + "step": 5359 + }, + { + "epoch": 0.26, + "grad_norm": 0.5691437125205994, + "learning_rate": 0.0005887211551544462, + "loss": 3.346, + "step": 5360 + }, + { + "epoch": 0.26, + "grad_norm": 0.6207178235054016, + "learning_rate": 0.0005887169728211959, + "loss": 3.4977, + "step": 5361 + }, + { + "epoch": 0.26, + "grad_norm": 0.5653191804885864, + "learning_rate": 0.0005887127897275206, + "loss": 3.2567, + "step": 5362 + }, + { + "epoch": 0.26, + "grad_norm": 0.6655686497688293, + "learning_rate": 0.0005887086058734316, + "loss": 3.4619, + "step": 5363 + }, + { + "epoch": 0.26, + "grad_norm": 0.6186230182647705, + "learning_rate": 0.0005887044212589395, + "loss": 3.4998, + "step": 5364 + }, + { + "epoch": 0.26, + "grad_norm": 0.5586534142494202, + "learning_rate": 0.0005887002358840556, + "loss": 3.4456, + "step": 5365 + }, + { + "epoch": 0.26, + "grad_norm": 0.561849057674408, + "learning_rate": 0.0005886960497487908, + "loss": 3.335, + "step": 5366 + }, + { + "epoch": 0.26, + "grad_norm": 0.5541581511497498, + "learning_rate": 0.0005886918628531562, + "loss": 3.2474, + "step": 5367 + }, + { + "epoch": 0.26, + "grad_norm": 0.5545457005500793, + "learning_rate": 0.0005886876751971628, + "loss": 3.3815, + "step": 5368 + }, + { + "epoch": 0.26, + "grad_norm": 0.5654429197311401, + "learning_rate": 0.0005886834867808215, + "loss": 3.2011, + "step": 5369 + }, + { + "epoch": 0.26, + "grad_norm": 0.635900616645813, + "learning_rate": 0.0005886792976041436, + "loss": 3.4373, + "step": 5370 + }, + { + "epoch": 0.26, + "grad_norm": 0.5833329558372498, + "learning_rate": 0.0005886751076671399, + "loss": 3.2561, + "step": 5371 + }, + { + "epoch": 0.26, + "grad_norm": 0.565444827079773, + "learning_rate": 0.0005886709169698216, + "loss": 3.3935, + "step": 5372 + }, + { + "epoch": 0.26, + "grad_norm": 0.5286258459091187, + "learning_rate": 0.0005886667255121995, + "loss": 3.5707, + "step": 5373 + }, + { + "epoch": 0.26, + "grad_norm": 0.5440570712089539, + "learning_rate": 0.000588662533294285, + "loss": 3.311, + "step": 5374 + }, + { + "epoch": 0.26, + "grad_norm": 0.6369450092315674, + "learning_rate": 0.0005886583403160889, + "loss": 3.6501, + "step": 5375 + }, + { + "epoch": 0.26, + "grad_norm": 0.5417674779891968, + "learning_rate": 0.0005886541465776223, + "loss": 3.288, + "step": 5376 + }, + { + "epoch": 0.26, + "grad_norm": 0.6063926815986633, + "learning_rate": 0.0005886499520788961, + "loss": 3.2348, + "step": 5377 + }, + { + "epoch": 0.26, + "grad_norm": 0.5412531495094299, + "learning_rate": 0.0005886457568199216, + "loss": 3.6838, + "step": 5378 + }, + { + "epoch": 0.26, + "grad_norm": 0.5758473873138428, + "learning_rate": 0.0005886415608007096, + "loss": 3.5398, + "step": 5379 + }, + { + "epoch": 0.26, + "grad_norm": 0.577717661857605, + "learning_rate": 0.0005886373640212714, + "loss": 3.4554, + "step": 5380 + }, + { + "epoch": 0.26, + "grad_norm": 0.5586448907852173, + "learning_rate": 0.000588633166481618, + "loss": 3.5677, + "step": 5381 + }, + { + "epoch": 0.26, + "grad_norm": 0.574151873588562, + "learning_rate": 0.0005886289681817601, + "loss": 3.3092, + "step": 5382 + }, + { + "epoch": 0.26, + "grad_norm": 0.5311241149902344, + "learning_rate": 0.0005886247691217093, + "loss": 3.5122, + "step": 5383 + }, + { + "epoch": 0.26, + "grad_norm": 0.5521003007888794, + "learning_rate": 0.0005886205693014764, + "loss": 3.4751, + "step": 5384 + }, + { + "epoch": 0.26, + "grad_norm": 0.5857923030853271, + "learning_rate": 0.0005886163687210724, + "loss": 3.531, + "step": 5385 + }, + { + "epoch": 0.26, + "grad_norm": 0.5671178102493286, + "learning_rate": 0.0005886121673805083, + "loss": 3.3799, + "step": 5386 + }, + { + "epoch": 0.26, + "grad_norm": 0.523662805557251, + "learning_rate": 0.0005886079652797954, + "loss": 3.6136, + "step": 5387 + }, + { + "epoch": 0.26, + "grad_norm": 0.5966594219207764, + "learning_rate": 0.0005886037624189448, + "loss": 3.1987, + "step": 5388 + }, + { + "epoch": 0.26, + "grad_norm": 0.5404354929924011, + "learning_rate": 0.0005885995587979672, + "loss": 3.469, + "step": 5389 + }, + { + "epoch": 0.26, + "grad_norm": 0.565900981426239, + "learning_rate": 0.0005885953544168741, + "loss": 3.4291, + "step": 5390 + }, + { + "epoch": 0.26, + "grad_norm": 0.5991320610046387, + "learning_rate": 0.0005885911492756763, + "loss": 3.4353, + "step": 5391 + }, + { + "epoch": 0.26, + "grad_norm": 0.5885935425758362, + "learning_rate": 0.0005885869433743849, + "loss": 3.3665, + "step": 5392 + }, + { + "epoch": 0.26, + "grad_norm": 0.5500056743621826, + "learning_rate": 0.0005885827367130112, + "loss": 3.3849, + "step": 5393 + }, + { + "epoch": 0.26, + "grad_norm": 0.6144444942474365, + "learning_rate": 0.000588578529291566, + "loss": 3.4082, + "step": 5394 + }, + { + "epoch": 0.26, + "grad_norm": 0.5883260369300842, + "learning_rate": 0.0005885743211100603, + "loss": 3.5509, + "step": 5395 + }, + { + "epoch": 0.26, + "grad_norm": 0.5883315205574036, + "learning_rate": 0.0005885701121685057, + "loss": 3.4782, + "step": 5396 + }, + { + "epoch": 0.26, + "grad_norm": 0.5557137131690979, + "learning_rate": 0.0005885659024669128, + "loss": 3.3478, + "step": 5397 + }, + { + "epoch": 0.26, + "grad_norm": 0.5621869564056396, + "learning_rate": 0.0005885616920052928, + "loss": 3.2406, + "step": 5398 + }, + { + "epoch": 0.26, + "grad_norm": 0.6264198422431946, + "learning_rate": 0.000588557480783657, + "loss": 3.2804, + "step": 5399 + }, + { + "epoch": 0.26, + "grad_norm": 0.543046236038208, + "learning_rate": 0.0005885532688020163, + "loss": 3.3372, + "step": 5400 + }, + { + "epoch": 0.26, + "grad_norm": 0.5603328943252563, + "learning_rate": 0.0005885490560603818, + "loss": 3.2055, + "step": 5401 + }, + { + "epoch": 0.26, + "grad_norm": 0.5257130861282349, + "learning_rate": 0.0005885448425587645, + "loss": 3.3474, + "step": 5402 + }, + { + "epoch": 0.26, + "grad_norm": 0.5107335448265076, + "learning_rate": 0.0005885406282971756, + "loss": 3.5721, + "step": 5403 + }, + { + "epoch": 0.26, + "grad_norm": 0.5753149390220642, + "learning_rate": 0.0005885364132756264, + "loss": 3.4103, + "step": 5404 + }, + { + "epoch": 0.26, + "grad_norm": 0.5694027543067932, + "learning_rate": 0.0005885321974941277, + "loss": 3.4066, + "step": 5405 + }, + { + "epoch": 0.26, + "grad_norm": 0.5609138607978821, + "learning_rate": 0.0005885279809526908, + "loss": 3.3846, + "step": 5406 + }, + { + "epoch": 0.26, + "grad_norm": 0.5206153988838196, + "learning_rate": 0.0005885237636513267, + "loss": 3.5455, + "step": 5407 + }, + { + "epoch": 0.27, + "grad_norm": 0.6045171022415161, + "learning_rate": 0.0005885195455900465, + "loss": 3.2741, + "step": 5408 + }, + { + "epoch": 0.27, + "grad_norm": 0.565991997718811, + "learning_rate": 0.0005885153267688613, + "loss": 3.5252, + "step": 5409 + }, + { + "epoch": 0.27, + "grad_norm": 0.5464850664138794, + "learning_rate": 0.0005885111071877823, + "loss": 3.2544, + "step": 5410 + }, + { + "epoch": 0.27, + "grad_norm": 0.567910373210907, + "learning_rate": 0.0005885068868468206, + "loss": 3.5726, + "step": 5411 + }, + { + "epoch": 0.27, + "grad_norm": 0.5260792374610901, + "learning_rate": 0.0005885026657459873, + "loss": 3.3725, + "step": 5412 + }, + { + "epoch": 0.27, + "grad_norm": 0.5899006128311157, + "learning_rate": 0.0005884984438852934, + "loss": 3.2411, + "step": 5413 + }, + { + "epoch": 0.27, + "grad_norm": 0.6080503463745117, + "learning_rate": 0.0005884942212647502, + "loss": 3.32, + "step": 5414 + }, + { + "epoch": 0.27, + "grad_norm": 0.5826466083526611, + "learning_rate": 0.0005884899978843688, + "loss": 3.2586, + "step": 5415 + }, + { + "epoch": 0.27, + "grad_norm": 0.5410050749778748, + "learning_rate": 0.0005884857737441601, + "loss": 3.3174, + "step": 5416 + }, + { + "epoch": 0.27, + "grad_norm": 0.5297449827194214, + "learning_rate": 0.0005884815488441356, + "loss": 3.3025, + "step": 5417 + }, + { + "epoch": 0.27, + "grad_norm": 0.5570391416549683, + "learning_rate": 0.0005884773231843062, + "loss": 3.5041, + "step": 5418 + }, + { + "epoch": 0.27, + "grad_norm": 0.5632867217063904, + "learning_rate": 0.0005884730967646828, + "loss": 3.4786, + "step": 5419 + }, + { + "epoch": 0.27, + "grad_norm": 0.5535479784011841, + "learning_rate": 0.0005884688695852769, + "loss": 3.2442, + "step": 5420 + }, + { + "epoch": 0.27, + "grad_norm": 0.5231944918632507, + "learning_rate": 0.0005884646416460997, + "loss": 3.5603, + "step": 5421 + }, + { + "epoch": 0.27, + "grad_norm": 0.5422855019569397, + "learning_rate": 0.000588460412947162, + "loss": 3.2527, + "step": 5422 + }, + { + "epoch": 0.27, + "grad_norm": 0.5713430643081665, + "learning_rate": 0.000588456183488475, + "loss": 3.5105, + "step": 5423 + }, + { + "epoch": 0.27, + "grad_norm": 0.568607747554779, + "learning_rate": 0.0005884519532700501, + "loss": 3.363, + "step": 5424 + }, + { + "epoch": 0.27, + "grad_norm": 0.5918583273887634, + "learning_rate": 0.0005884477222918981, + "loss": 3.3211, + "step": 5425 + }, + { + "epoch": 0.27, + "grad_norm": 0.6431357264518738, + "learning_rate": 0.0005884434905540305, + "loss": 3.2004, + "step": 5426 + }, + { + "epoch": 0.27, + "grad_norm": 0.5770518183708191, + "learning_rate": 0.0005884392580564581, + "loss": 3.4413, + "step": 5427 + }, + { + "epoch": 0.27, + "grad_norm": 0.6105692982673645, + "learning_rate": 0.0005884350247991923, + "loss": 3.4229, + "step": 5428 + }, + { + "epoch": 0.27, + "grad_norm": 0.5172119736671448, + "learning_rate": 0.0005884307907822441, + "loss": 3.3484, + "step": 5429 + }, + { + "epoch": 0.27, + "grad_norm": 0.6026398539543152, + "learning_rate": 0.0005884265560056246, + "loss": 3.2793, + "step": 5430 + }, + { + "epoch": 0.27, + "grad_norm": 0.5257851481437683, + "learning_rate": 0.0005884223204693453, + "loss": 3.324, + "step": 5431 + }, + { + "epoch": 0.27, + "grad_norm": 0.5769349932670593, + "learning_rate": 0.000588418084173417, + "loss": 3.3327, + "step": 5432 + }, + { + "epoch": 0.27, + "grad_norm": 0.5765005350112915, + "learning_rate": 0.000588413847117851, + "loss": 3.557, + "step": 5433 + }, + { + "epoch": 0.27, + "grad_norm": 0.5662025213241577, + "learning_rate": 0.0005884096093026584, + "loss": 3.4275, + "step": 5434 + }, + { + "epoch": 0.27, + "grad_norm": 0.5836684107780457, + "learning_rate": 0.0005884053707278504, + "loss": 3.4793, + "step": 5435 + }, + { + "epoch": 0.27, + "grad_norm": 0.5512568354606628, + "learning_rate": 0.0005884011313934381, + "loss": 3.2952, + "step": 5436 + }, + { + "epoch": 0.27, + "grad_norm": 0.5661933422088623, + "learning_rate": 0.0005883968912994328, + "loss": 3.4282, + "step": 5437 + }, + { + "epoch": 0.27, + "grad_norm": 0.5790491700172424, + "learning_rate": 0.0005883926504458456, + "loss": 3.3611, + "step": 5438 + }, + { + "epoch": 0.27, + "grad_norm": 0.5423797369003296, + "learning_rate": 0.0005883884088326877, + "loss": 3.3749, + "step": 5439 + }, + { + "epoch": 0.27, + "grad_norm": 0.5996915698051453, + "learning_rate": 0.0005883841664599701, + "loss": 3.1857, + "step": 5440 + }, + { + "epoch": 0.27, + "grad_norm": 0.5513800382614136, + "learning_rate": 0.0005883799233277042, + "loss": 3.2941, + "step": 5441 + }, + { + "epoch": 0.27, + "grad_norm": 0.5614347457885742, + "learning_rate": 0.000588375679435901, + "loss": 3.4229, + "step": 5442 + }, + { + "epoch": 0.27, + "grad_norm": 0.5885158777236938, + "learning_rate": 0.0005883714347845718, + "loss": 3.4624, + "step": 5443 + }, + { + "epoch": 0.27, + "grad_norm": 0.5393982529640198, + "learning_rate": 0.0005883671893737279, + "loss": 3.2643, + "step": 5444 + }, + { + "epoch": 0.27, + "grad_norm": 0.5851886868476868, + "learning_rate": 0.0005883629432033801, + "loss": 3.4327, + "step": 5445 + }, + { + "epoch": 0.27, + "grad_norm": 0.5609333515167236, + "learning_rate": 0.0005883586962735399, + "loss": 3.2944, + "step": 5446 + }, + { + "epoch": 0.27, + "grad_norm": 0.5521209239959717, + "learning_rate": 0.0005883544485842183, + "loss": 3.3134, + "step": 5447 + }, + { + "epoch": 0.27, + "grad_norm": 0.5714461803436279, + "learning_rate": 0.0005883502001354267, + "loss": 3.5679, + "step": 5448 + }, + { + "epoch": 0.27, + "grad_norm": 0.5831002593040466, + "learning_rate": 0.0005883459509271762, + "loss": 3.2632, + "step": 5449 + }, + { + "epoch": 0.27, + "grad_norm": 0.5336947441101074, + "learning_rate": 0.0005883417009594778, + "loss": 3.3042, + "step": 5450 + }, + { + "epoch": 0.27, + "grad_norm": 0.575208306312561, + "learning_rate": 0.0005883374502323429, + "loss": 3.3451, + "step": 5451 + }, + { + "epoch": 0.27, + "grad_norm": 0.5348706841468811, + "learning_rate": 0.0005883331987457827, + "loss": 3.4311, + "step": 5452 + }, + { + "epoch": 0.27, + "grad_norm": 0.5652807950973511, + "learning_rate": 0.0005883289464998083, + "loss": 3.4136, + "step": 5453 + }, + { + "epoch": 0.27, + "grad_norm": 0.6097990870475769, + "learning_rate": 0.000588324693494431, + "loss": 3.3811, + "step": 5454 + }, + { + "epoch": 0.27, + "grad_norm": 0.5866896510124207, + "learning_rate": 0.0005883204397296619, + "loss": 3.2683, + "step": 5455 + }, + { + "epoch": 0.27, + "grad_norm": 0.5691618323326111, + "learning_rate": 0.0005883161852055122, + "loss": 3.3863, + "step": 5456 + }, + { + "epoch": 0.27, + "grad_norm": 0.6082558035850525, + "learning_rate": 0.0005883119299219932, + "loss": 3.3178, + "step": 5457 + }, + { + "epoch": 0.27, + "grad_norm": 0.5235224962234497, + "learning_rate": 0.0005883076738791161, + "loss": 3.5059, + "step": 5458 + }, + { + "epoch": 0.27, + "grad_norm": 0.537198007106781, + "learning_rate": 0.0005883034170768921, + "loss": 3.3683, + "step": 5459 + }, + { + "epoch": 0.27, + "grad_norm": 0.5196539163589478, + "learning_rate": 0.0005882991595153323, + "loss": 3.3288, + "step": 5460 + }, + { + "epoch": 0.27, + "grad_norm": 0.5324885249137878, + "learning_rate": 0.0005882949011944481, + "loss": 3.3257, + "step": 5461 + }, + { + "epoch": 0.27, + "grad_norm": 0.5442826747894287, + "learning_rate": 0.0005882906421142505, + "loss": 3.3675, + "step": 5462 + }, + { + "epoch": 0.27, + "grad_norm": 0.5731666684150696, + "learning_rate": 0.0005882863822747508, + "loss": 3.4277, + "step": 5463 + }, + { + "epoch": 0.27, + "grad_norm": 0.598007321357727, + "learning_rate": 0.0005882821216759604, + "loss": 3.4906, + "step": 5464 + }, + { + "epoch": 0.27, + "grad_norm": 0.5194315910339355, + "learning_rate": 0.0005882778603178903, + "loss": 3.2031, + "step": 5465 + }, + { + "epoch": 0.27, + "grad_norm": 0.5207369327545166, + "learning_rate": 0.0005882735982005519, + "loss": 3.2482, + "step": 5466 + }, + { + "epoch": 0.27, + "grad_norm": 0.5437124371528625, + "learning_rate": 0.0005882693353239562, + "loss": 3.5863, + "step": 5467 + }, + { + "epoch": 0.27, + "grad_norm": 0.5464715957641602, + "learning_rate": 0.0005882650716881146, + "loss": 3.4643, + "step": 5468 + }, + { + "epoch": 0.27, + "grad_norm": 0.5588339567184448, + "learning_rate": 0.0005882608072930381, + "loss": 3.3967, + "step": 5469 + }, + { + "epoch": 0.27, + "grad_norm": 0.5152556896209717, + "learning_rate": 0.0005882565421387383, + "loss": 3.324, + "step": 5470 + }, + { + "epoch": 0.27, + "grad_norm": 0.5201581120491028, + "learning_rate": 0.0005882522762252262, + "loss": 3.1271, + "step": 5471 + }, + { + "epoch": 0.27, + "grad_norm": 0.5199649930000305, + "learning_rate": 0.0005882480095525132, + "loss": 3.491, + "step": 5472 + }, + { + "epoch": 0.27, + "grad_norm": 0.5226982235908508, + "learning_rate": 0.0005882437421206102, + "loss": 3.4153, + "step": 5473 + }, + { + "epoch": 0.27, + "grad_norm": 0.5411034226417542, + "learning_rate": 0.0005882394739295287, + "loss": 3.4042, + "step": 5474 + }, + { + "epoch": 0.27, + "grad_norm": 0.554895281791687, + "learning_rate": 0.0005882352049792801, + "loss": 3.384, + "step": 5475 + }, + { + "epoch": 0.27, + "grad_norm": 0.538006603717804, + "learning_rate": 0.0005882309352698752, + "loss": 3.4293, + "step": 5476 + }, + { + "epoch": 0.27, + "grad_norm": 0.5578272342681885, + "learning_rate": 0.0005882266648013256, + "loss": 3.4783, + "step": 5477 + }, + { + "epoch": 0.27, + "grad_norm": 0.5473056435585022, + "learning_rate": 0.0005882223935736424, + "loss": 3.5646, + "step": 5478 + }, + { + "epoch": 0.27, + "grad_norm": 0.5610681176185608, + "learning_rate": 0.0005882181215868369, + "loss": 3.5254, + "step": 5479 + }, + { + "epoch": 0.27, + "grad_norm": 0.5292629599571228, + "learning_rate": 0.0005882138488409204, + "loss": 3.376, + "step": 5480 + }, + { + "epoch": 0.27, + "grad_norm": 0.5061684846878052, + "learning_rate": 0.000588209575335904, + "loss": 3.5303, + "step": 5481 + }, + { + "epoch": 0.27, + "grad_norm": 0.5533640384674072, + "learning_rate": 0.0005882053010717991, + "loss": 3.4842, + "step": 5482 + }, + { + "epoch": 0.27, + "grad_norm": 0.5831032991409302, + "learning_rate": 0.0005882010260486169, + "loss": 3.5035, + "step": 5483 + }, + { + "epoch": 0.27, + "grad_norm": 0.5656222701072693, + "learning_rate": 0.0005881967502663687, + "loss": 3.5032, + "step": 5484 + }, + { + "epoch": 0.27, + "grad_norm": 0.6056988835334778, + "learning_rate": 0.0005881924737250655, + "loss": 3.5061, + "step": 5485 + }, + { + "epoch": 0.27, + "grad_norm": 0.5542979836463928, + "learning_rate": 0.0005881881964247191, + "loss": 3.4491, + "step": 5486 + }, + { + "epoch": 0.27, + "grad_norm": 0.5534904599189758, + "learning_rate": 0.0005881839183653402, + "loss": 3.4249, + "step": 5487 + }, + { + "epoch": 0.27, + "grad_norm": 0.5802499651908875, + "learning_rate": 0.0005881796395469406, + "loss": 3.242, + "step": 5488 + }, + { + "epoch": 0.27, + "grad_norm": 0.6037202477455139, + "learning_rate": 0.000588175359969531, + "loss": 3.4127, + "step": 5489 + }, + { + "epoch": 0.27, + "grad_norm": 0.5480342507362366, + "learning_rate": 0.0005881710796331231, + "loss": 3.2675, + "step": 5490 + }, + { + "epoch": 0.27, + "grad_norm": 0.5399625301361084, + "learning_rate": 0.000588166798537728, + "loss": 3.4004, + "step": 5491 + }, + { + "epoch": 0.27, + "grad_norm": 0.5208690166473389, + "learning_rate": 0.000588162516683357, + "loss": 3.2553, + "step": 5492 + }, + { + "epoch": 0.27, + "grad_norm": 0.5354910492897034, + "learning_rate": 0.0005881582340700215, + "loss": 3.2685, + "step": 5493 + }, + { + "epoch": 0.27, + "grad_norm": 0.6398562788963318, + "learning_rate": 0.0005881539506977326, + "loss": 3.7163, + "step": 5494 + }, + { + "epoch": 0.27, + "grad_norm": 0.6458652019500732, + "learning_rate": 0.0005881496665665016, + "loss": 3.4954, + "step": 5495 + }, + { + "epoch": 0.27, + "grad_norm": 0.5615726113319397, + "learning_rate": 0.0005881453816763398, + "loss": 3.3978, + "step": 5496 + }, + { + "epoch": 0.27, + "grad_norm": 0.5364488959312439, + "learning_rate": 0.0005881410960272587, + "loss": 3.3949, + "step": 5497 + }, + { + "epoch": 0.27, + "grad_norm": 0.5817398428916931, + "learning_rate": 0.0005881368096192693, + "loss": 3.3621, + "step": 5498 + }, + { + "epoch": 0.27, + "grad_norm": 0.8163599371910095, + "learning_rate": 0.000588132522452383, + "loss": 3.6545, + "step": 5499 + }, + { + "epoch": 0.27, + "grad_norm": 0.5170142650604248, + "learning_rate": 0.0005881282345266111, + "loss": 3.4511, + "step": 5500 + }, + { + "epoch": 0.27, + "grad_norm": 0.5624502897262573, + "learning_rate": 0.0005881239458419648, + "loss": 3.3277, + "step": 5501 + }, + { + "epoch": 0.27, + "grad_norm": 0.5259974002838135, + "learning_rate": 0.0005881196563984555, + "loss": 3.3884, + "step": 5502 + }, + { + "epoch": 0.27, + "grad_norm": 0.5127625465393066, + "learning_rate": 0.0005881153661960946, + "loss": 3.4118, + "step": 5503 + }, + { + "epoch": 0.27, + "grad_norm": 0.5912742018699646, + "learning_rate": 0.0005881110752348931, + "loss": 3.5946, + "step": 5504 + }, + { + "epoch": 0.27, + "grad_norm": 0.5364376902580261, + "learning_rate": 0.0005881067835148626, + "loss": 3.4842, + "step": 5505 + }, + { + "epoch": 0.27, + "grad_norm": 0.5497884154319763, + "learning_rate": 0.0005881024910360143, + "loss": 3.3788, + "step": 5506 + }, + { + "epoch": 0.27, + "grad_norm": 0.5624418258666992, + "learning_rate": 0.0005880981977983594, + "loss": 3.1812, + "step": 5507 + }, + { + "epoch": 0.27, + "grad_norm": 0.548305332660675, + "learning_rate": 0.0005880939038019093, + "loss": 3.2861, + "step": 5508 + }, + { + "epoch": 0.27, + "grad_norm": 0.6290483474731445, + "learning_rate": 0.0005880896090466754, + "loss": 3.2389, + "step": 5509 + }, + { + "epoch": 0.27, + "grad_norm": 0.6057081818580627, + "learning_rate": 0.0005880853135326688, + "loss": 3.102, + "step": 5510 + }, + { + "epoch": 0.27, + "grad_norm": 0.567807674407959, + "learning_rate": 0.0005880810172599009, + "loss": 3.3086, + "step": 5511 + }, + { + "epoch": 0.27, + "grad_norm": 0.5722109079360962, + "learning_rate": 0.0005880767202283832, + "loss": 3.2491, + "step": 5512 + }, + { + "epoch": 0.27, + "grad_norm": 0.5263040661811829, + "learning_rate": 0.0005880724224381267, + "loss": 3.4022, + "step": 5513 + }, + { + "epoch": 0.27, + "grad_norm": 0.5402182340621948, + "learning_rate": 0.0005880681238891429, + "loss": 3.6702, + "step": 5514 + }, + { + "epoch": 0.27, + "grad_norm": 0.5795789361000061, + "learning_rate": 0.0005880638245814433, + "loss": 3.3956, + "step": 5515 + }, + { + "epoch": 0.27, + "grad_norm": 0.6076276302337646, + "learning_rate": 0.0005880595245150388, + "loss": 3.3292, + "step": 5516 + }, + { + "epoch": 0.27, + "grad_norm": 0.5449411869049072, + "learning_rate": 0.0005880552236899411, + "loss": 3.4199, + "step": 5517 + }, + { + "epoch": 0.27, + "grad_norm": 0.5408446192741394, + "learning_rate": 0.0005880509221061613, + "loss": 3.5047, + "step": 5518 + }, + { + "epoch": 0.27, + "grad_norm": 0.548406183719635, + "learning_rate": 0.0005880466197637108, + "loss": 3.639, + "step": 5519 + }, + { + "epoch": 0.27, + "grad_norm": 0.6209466457366943, + "learning_rate": 0.000588042316662601, + "loss": 3.4597, + "step": 5520 + }, + { + "epoch": 0.27, + "grad_norm": 0.5157639980316162, + "learning_rate": 0.0005880380128028431, + "loss": 3.3713, + "step": 5521 + }, + { + "epoch": 0.27, + "grad_norm": 0.5012747645378113, + "learning_rate": 0.0005880337081844485, + "loss": 3.3632, + "step": 5522 + }, + { + "epoch": 0.27, + "grad_norm": 0.5477715134620667, + "learning_rate": 0.0005880294028074286, + "loss": 3.4232, + "step": 5523 + }, + { + "epoch": 0.27, + "grad_norm": 0.6210871338844299, + "learning_rate": 0.0005880250966717946, + "loss": 3.2222, + "step": 5524 + }, + { + "epoch": 0.27, + "grad_norm": 0.577610433101654, + "learning_rate": 0.0005880207897775581, + "loss": 3.5465, + "step": 5525 + }, + { + "epoch": 0.27, + "grad_norm": 0.5438306331634521, + "learning_rate": 0.0005880164821247301, + "loss": 3.2486, + "step": 5526 + }, + { + "epoch": 0.27, + "grad_norm": 0.5276567935943604, + "learning_rate": 0.0005880121737133221, + "loss": 3.3924, + "step": 5527 + }, + { + "epoch": 0.27, + "grad_norm": 0.5347236394882202, + "learning_rate": 0.0005880078645433456, + "loss": 3.1702, + "step": 5528 + }, + { + "epoch": 0.27, + "grad_norm": 0.532122015953064, + "learning_rate": 0.0005880035546148118, + "loss": 3.6436, + "step": 5529 + }, + { + "epoch": 0.27, + "grad_norm": 0.6177424788475037, + "learning_rate": 0.000587999243927732, + "loss": 3.4205, + "step": 5530 + }, + { + "epoch": 0.27, + "grad_norm": 0.5172913670539856, + "learning_rate": 0.0005879949324821177, + "loss": 3.4277, + "step": 5531 + }, + { + "epoch": 0.27, + "grad_norm": 0.5155248641967773, + "learning_rate": 0.0005879906202779801, + "loss": 3.3915, + "step": 5532 + }, + { + "epoch": 0.27, + "grad_norm": 0.5313438177108765, + "learning_rate": 0.0005879863073153306, + "loss": 3.1238, + "step": 5533 + }, + { + "epoch": 0.27, + "grad_norm": 0.5252282619476318, + "learning_rate": 0.0005879819935941806, + "loss": 3.5629, + "step": 5534 + }, + { + "epoch": 0.27, + "grad_norm": 0.5261170268058777, + "learning_rate": 0.0005879776791145416, + "loss": 3.6691, + "step": 5535 + }, + { + "epoch": 0.27, + "grad_norm": 0.5184322595596313, + "learning_rate": 0.0005879733638764246, + "loss": 3.5832, + "step": 5536 + }, + { + "epoch": 0.27, + "grad_norm": 0.5469010472297668, + "learning_rate": 0.0005879690478798413, + "loss": 3.4247, + "step": 5537 + }, + { + "epoch": 0.27, + "grad_norm": 0.5656020045280457, + "learning_rate": 0.0005879647311248029, + "loss": 3.3261, + "step": 5538 + }, + { + "epoch": 0.27, + "grad_norm": 0.526353657245636, + "learning_rate": 0.0005879604136113209, + "loss": 3.4339, + "step": 5539 + }, + { + "epoch": 0.27, + "grad_norm": 0.5460639595985413, + "learning_rate": 0.0005879560953394066, + "loss": 3.2131, + "step": 5540 + }, + { + "epoch": 0.27, + "grad_norm": 0.5069341659545898, + "learning_rate": 0.0005879517763090712, + "loss": 3.4322, + "step": 5541 + }, + { + "epoch": 0.27, + "grad_norm": 0.5094472169876099, + "learning_rate": 0.0005879474565203263, + "loss": 3.36, + "step": 5542 + }, + { + "epoch": 0.27, + "grad_norm": 0.520026445388794, + "learning_rate": 0.0005879431359731833, + "loss": 3.4691, + "step": 5543 + }, + { + "epoch": 0.27, + "grad_norm": 0.5083997249603271, + "learning_rate": 0.0005879388146676535, + "loss": 3.3143, + "step": 5544 + }, + { + "epoch": 0.27, + "grad_norm": 0.5479220747947693, + "learning_rate": 0.0005879344926037482, + "loss": 3.5481, + "step": 5545 + }, + { + "epoch": 0.27, + "grad_norm": 0.561329185962677, + "learning_rate": 0.0005879301697814789, + "loss": 3.4544, + "step": 5546 + }, + { + "epoch": 0.27, + "grad_norm": 0.5535502433776855, + "learning_rate": 0.000587925846200857, + "loss": 3.5186, + "step": 5547 + }, + { + "epoch": 0.27, + "grad_norm": 0.5314511656761169, + "learning_rate": 0.0005879215218618937, + "loss": 3.5539, + "step": 5548 + }, + { + "epoch": 0.27, + "grad_norm": 0.4937261641025543, + "learning_rate": 0.0005879171967646006, + "loss": 3.5703, + "step": 5549 + }, + { + "epoch": 0.27, + "grad_norm": 0.5664466619491577, + "learning_rate": 0.0005879128709089889, + "loss": 3.454, + "step": 5550 + }, + { + "epoch": 0.27, + "grad_norm": 0.513667106628418, + "learning_rate": 0.0005879085442950703, + "loss": 3.2555, + "step": 5551 + }, + { + "epoch": 0.27, + "grad_norm": 0.5058261156082153, + "learning_rate": 0.000587904216922856, + "loss": 3.5081, + "step": 5552 + }, + { + "epoch": 0.27, + "grad_norm": 0.5401785373687744, + "learning_rate": 0.0005878998887923572, + "loss": 3.4128, + "step": 5553 + }, + { + "epoch": 0.27, + "grad_norm": 0.5664740800857544, + "learning_rate": 0.0005878955599035857, + "loss": 3.5442, + "step": 5554 + }, + { + "epoch": 0.27, + "grad_norm": 0.5653204321861267, + "learning_rate": 0.0005878912302565526, + "loss": 3.4864, + "step": 5555 + }, + { + "epoch": 0.27, + "grad_norm": 0.5319250822067261, + "learning_rate": 0.0005878868998512694, + "loss": 3.5001, + "step": 5556 + }, + { + "epoch": 0.27, + "grad_norm": 0.5475034713745117, + "learning_rate": 0.0005878825686877476, + "loss": 3.4573, + "step": 5557 + }, + { + "epoch": 0.27, + "grad_norm": 0.5714825391769409, + "learning_rate": 0.0005878782367659984, + "loss": 3.3223, + "step": 5558 + }, + { + "epoch": 0.27, + "grad_norm": 0.5734561681747437, + "learning_rate": 0.0005878739040860334, + "loss": 3.3125, + "step": 5559 + }, + { + "epoch": 0.27, + "grad_norm": 0.5484195947647095, + "learning_rate": 0.0005878695706478639, + "loss": 3.4178, + "step": 5560 + }, + { + "epoch": 0.27, + "grad_norm": 0.5906480550765991, + "learning_rate": 0.0005878652364515014, + "loss": 3.4166, + "step": 5561 + }, + { + "epoch": 0.27, + "grad_norm": 0.5374751091003418, + "learning_rate": 0.0005878609014969572, + "loss": 3.4265, + "step": 5562 + }, + { + "epoch": 0.27, + "grad_norm": 0.5320459008216858, + "learning_rate": 0.0005878565657842428, + "loss": 3.3379, + "step": 5563 + }, + { + "epoch": 0.27, + "grad_norm": 0.5613856911659241, + "learning_rate": 0.0005878522293133697, + "loss": 3.3711, + "step": 5564 + }, + { + "epoch": 0.27, + "grad_norm": 0.5332615971565247, + "learning_rate": 0.0005878478920843492, + "loss": 3.4358, + "step": 5565 + }, + { + "epoch": 0.27, + "grad_norm": 0.5565606951713562, + "learning_rate": 0.0005878435540971926, + "loss": 3.3137, + "step": 5566 + }, + { + "epoch": 0.27, + "grad_norm": 0.5280033946037292, + "learning_rate": 0.0005878392153519117, + "loss": 3.4746, + "step": 5567 + }, + { + "epoch": 0.27, + "grad_norm": 0.5665175914764404, + "learning_rate": 0.0005878348758485176, + "loss": 3.4319, + "step": 5568 + }, + { + "epoch": 0.27, + "grad_norm": 0.5359581708908081, + "learning_rate": 0.0005878305355870218, + "loss": 3.4903, + "step": 5569 + }, + { + "epoch": 0.27, + "grad_norm": 0.5408956408500671, + "learning_rate": 0.0005878261945674358, + "loss": 3.4426, + "step": 5570 + }, + { + "epoch": 0.27, + "grad_norm": 0.5556782484054565, + "learning_rate": 0.0005878218527897709, + "loss": 3.4188, + "step": 5571 + }, + { + "epoch": 0.27, + "grad_norm": 0.549187183380127, + "learning_rate": 0.0005878175102540387, + "loss": 3.4785, + "step": 5572 + }, + { + "epoch": 0.27, + "grad_norm": 0.5007501244544983, + "learning_rate": 0.0005878131669602506, + "loss": 3.254, + "step": 5573 + }, + { + "epoch": 0.27, + "grad_norm": 0.5330648422241211, + "learning_rate": 0.000587808822908418, + "loss": 3.5062, + "step": 5574 + }, + { + "epoch": 0.27, + "grad_norm": 0.5661302208900452, + "learning_rate": 0.0005878044780985523, + "loss": 3.555, + "step": 5575 + }, + { + "epoch": 0.27, + "grad_norm": 0.5426991581916809, + "learning_rate": 0.000587800132530665, + "loss": 3.7042, + "step": 5576 + }, + { + "epoch": 0.27, + "grad_norm": 0.5796633958816528, + "learning_rate": 0.0005877957862047676, + "loss": 3.3673, + "step": 5577 + }, + { + "epoch": 0.27, + "grad_norm": 0.5920323133468628, + "learning_rate": 0.0005877914391208716, + "loss": 3.3901, + "step": 5578 + }, + { + "epoch": 0.27, + "grad_norm": 0.5278183817863464, + "learning_rate": 0.0005877870912789882, + "loss": 3.4738, + "step": 5579 + }, + { + "epoch": 0.27, + "grad_norm": 0.506721556186676, + "learning_rate": 0.0005877827426791289, + "loss": 3.2758, + "step": 5580 + }, + { + "epoch": 0.27, + "grad_norm": 0.5367504954338074, + "learning_rate": 0.0005877783933213054, + "loss": 3.3765, + "step": 5581 + }, + { + "epoch": 0.27, + "grad_norm": 0.5439772009849548, + "learning_rate": 0.0005877740432055288, + "loss": 3.6383, + "step": 5582 + }, + { + "epoch": 0.27, + "grad_norm": 0.5529850721359253, + "learning_rate": 0.0005877696923318109, + "loss": 3.3865, + "step": 5583 + }, + { + "epoch": 0.27, + "grad_norm": 0.5937544703483582, + "learning_rate": 0.000587765340700163, + "loss": 3.6044, + "step": 5584 + }, + { + "epoch": 0.27, + "grad_norm": 0.5193443894386292, + "learning_rate": 0.0005877609883105966, + "loss": 3.2867, + "step": 5585 + }, + { + "epoch": 0.27, + "grad_norm": 0.5450002551078796, + "learning_rate": 0.0005877566351631231, + "loss": 3.4058, + "step": 5586 + }, + { + "epoch": 0.27, + "grad_norm": 0.5292603373527527, + "learning_rate": 0.0005877522812577539, + "loss": 3.5574, + "step": 5587 + }, + { + "epoch": 0.27, + "grad_norm": 0.5539287328720093, + "learning_rate": 0.0005877479265945008, + "loss": 3.3719, + "step": 5588 + }, + { + "epoch": 0.27, + "grad_norm": 0.5461103320121765, + "learning_rate": 0.0005877435711733748, + "loss": 3.3806, + "step": 5589 + }, + { + "epoch": 0.27, + "grad_norm": 0.5659968256950378, + "learning_rate": 0.0005877392149943877, + "loss": 3.3854, + "step": 5590 + }, + { + "epoch": 0.27, + "grad_norm": 0.6119036674499512, + "learning_rate": 0.0005877348580575509, + "loss": 3.2403, + "step": 5591 + }, + { + "epoch": 0.27, + "grad_norm": 0.5469770431518555, + "learning_rate": 0.000587730500362876, + "loss": 3.428, + "step": 5592 + }, + { + "epoch": 0.27, + "grad_norm": 0.5068824887275696, + "learning_rate": 0.0005877261419103741, + "loss": 3.5666, + "step": 5593 + }, + { + "epoch": 0.27, + "grad_norm": 0.5368444919586182, + "learning_rate": 0.000587721782700057, + "loss": 3.2824, + "step": 5594 + }, + { + "epoch": 0.27, + "grad_norm": 0.5606796145439148, + "learning_rate": 0.0005877174227319362, + "loss": 3.2626, + "step": 5595 + }, + { + "epoch": 0.27, + "grad_norm": 0.5552815794944763, + "learning_rate": 0.0005877130620060229, + "loss": 3.2393, + "step": 5596 + }, + { + "epoch": 0.27, + "grad_norm": 0.591176450252533, + "learning_rate": 0.000587708700522329, + "loss": 3.339, + "step": 5597 + }, + { + "epoch": 0.27, + "grad_norm": 0.5659617781639099, + "learning_rate": 0.0005877043382808655, + "loss": 3.582, + "step": 5598 + }, + { + "epoch": 0.27, + "grad_norm": 0.5646779537200928, + "learning_rate": 0.0005876999752816443, + "loss": 3.5119, + "step": 5599 + }, + { + "epoch": 0.27, + "grad_norm": 0.5375205278396606, + "learning_rate": 0.0005876956115246767, + "loss": 3.39, + "step": 5600 + }, + { + "epoch": 0.27, + "grad_norm": 0.5745865106582642, + "learning_rate": 0.0005876912470099742, + "loss": 3.515, + "step": 5601 + }, + { + "epoch": 0.27, + "grad_norm": 0.6667315363883972, + "learning_rate": 0.0005876868817375483, + "loss": 3.2686, + "step": 5602 + }, + { + "epoch": 0.27, + "grad_norm": 0.5387046933174133, + "learning_rate": 0.0005876825157074107, + "loss": 3.3706, + "step": 5603 + }, + { + "epoch": 0.27, + "grad_norm": 0.5582010746002197, + "learning_rate": 0.0005876781489195725, + "loss": 3.5323, + "step": 5604 + }, + { + "epoch": 0.27, + "grad_norm": 0.5929326415061951, + "learning_rate": 0.0005876737813740456, + "loss": 3.3948, + "step": 5605 + }, + { + "epoch": 0.27, + "grad_norm": 0.5750752091407776, + "learning_rate": 0.0005876694130708412, + "loss": 3.2972, + "step": 5606 + }, + { + "epoch": 0.27, + "grad_norm": 0.5479739904403687, + "learning_rate": 0.0005876650440099709, + "loss": 3.5172, + "step": 5607 + }, + { + "epoch": 0.27, + "grad_norm": 0.5554889440536499, + "learning_rate": 0.0005876606741914462, + "loss": 3.399, + "step": 5608 + }, + { + "epoch": 0.27, + "grad_norm": 0.5355261564254761, + "learning_rate": 0.0005876563036152788, + "loss": 3.5163, + "step": 5609 + }, + { + "epoch": 0.27, + "grad_norm": 0.5420413613319397, + "learning_rate": 0.00058765193228148, + "loss": 3.3165, + "step": 5610 + }, + { + "epoch": 0.27, + "grad_norm": 0.5698093771934509, + "learning_rate": 0.0005876475601900614, + "loss": 3.0865, + "step": 5611 + }, + { + "epoch": 0.28, + "grad_norm": 0.5109629034996033, + "learning_rate": 0.0005876431873410344, + "loss": 3.4757, + "step": 5612 + }, + { + "epoch": 0.28, + "grad_norm": 0.5421175360679626, + "learning_rate": 0.0005876388137344107, + "loss": 3.422, + "step": 5613 + }, + { + "epoch": 0.28, + "grad_norm": 0.5593693256378174, + "learning_rate": 0.0005876344393702016, + "loss": 3.1981, + "step": 5614 + }, + { + "epoch": 0.28, + "grad_norm": 0.5609410405158997, + "learning_rate": 0.0005876300642484189, + "loss": 3.4627, + "step": 5615 + }, + { + "epoch": 0.28, + "grad_norm": 0.5721429586410522, + "learning_rate": 0.0005876256883690739, + "loss": 3.5989, + "step": 5616 + }, + { + "epoch": 0.28, + "grad_norm": 0.531762957572937, + "learning_rate": 0.0005876213117321781, + "loss": 3.5232, + "step": 5617 + }, + { + "epoch": 0.28, + "grad_norm": 0.6050277352333069, + "learning_rate": 0.0005876169343377432, + "loss": 3.2072, + "step": 5618 + }, + { + "epoch": 0.28, + "grad_norm": 0.536950945854187, + "learning_rate": 0.0005876125561857806, + "loss": 3.4877, + "step": 5619 + }, + { + "epoch": 0.28, + "grad_norm": 0.5331342816352844, + "learning_rate": 0.0005876081772763019, + "loss": 3.3034, + "step": 5620 + }, + { + "epoch": 0.28, + "grad_norm": 0.5625576972961426, + "learning_rate": 0.0005876037976093188, + "loss": 3.2909, + "step": 5621 + }, + { + "epoch": 0.28, + "grad_norm": 0.5438462495803833, + "learning_rate": 0.0005875994171848424, + "loss": 3.5022, + "step": 5622 + }, + { + "epoch": 0.28, + "grad_norm": 0.5512886643409729, + "learning_rate": 0.0005875950360028846, + "loss": 3.4472, + "step": 5623 + }, + { + "epoch": 0.28, + "grad_norm": 0.5511341094970703, + "learning_rate": 0.0005875906540634567, + "loss": 3.1134, + "step": 5624 + }, + { + "epoch": 0.28, + "grad_norm": 0.5950229167938232, + "learning_rate": 0.0005875862713665705, + "loss": 3.5233, + "step": 5625 + }, + { + "epoch": 0.28, + "grad_norm": 0.5315006375312805, + "learning_rate": 0.0005875818879122373, + "loss": 3.6534, + "step": 5626 + }, + { + "epoch": 0.28, + "grad_norm": 0.5488346815109253, + "learning_rate": 0.0005875775037004689, + "loss": 3.3631, + "step": 5627 + }, + { + "epoch": 0.28, + "grad_norm": 0.5246545672416687, + "learning_rate": 0.0005875731187312766, + "loss": 3.5114, + "step": 5628 + }, + { + "epoch": 0.28, + "grad_norm": 0.5800779461860657, + "learning_rate": 0.000587568733004672, + "loss": 3.4159, + "step": 5629 + }, + { + "epoch": 0.28, + "grad_norm": 0.5169866681098938, + "learning_rate": 0.0005875643465206668, + "loss": 3.2865, + "step": 5630 + }, + { + "epoch": 0.28, + "grad_norm": 0.5645882487297058, + "learning_rate": 0.0005875599592792724, + "loss": 3.4028, + "step": 5631 + }, + { + "epoch": 0.28, + "grad_norm": 0.5463460087776184, + "learning_rate": 0.0005875555712805005, + "loss": 3.4289, + "step": 5632 + }, + { + "epoch": 0.28, + "grad_norm": 0.5568528771400452, + "learning_rate": 0.0005875511825243624, + "loss": 3.5229, + "step": 5633 + }, + { + "epoch": 0.28, + "grad_norm": 0.529748260974884, + "learning_rate": 0.00058754679301087, + "loss": 3.4635, + "step": 5634 + }, + { + "epoch": 0.28, + "grad_norm": 0.5450758337974548, + "learning_rate": 0.0005875424027400346, + "loss": 3.1628, + "step": 5635 + }, + { + "epoch": 0.28, + "grad_norm": 0.536340594291687, + "learning_rate": 0.0005875380117118679, + "loss": 3.4757, + "step": 5636 + }, + { + "epoch": 0.28, + "grad_norm": 0.5831730961799622, + "learning_rate": 0.0005875336199263814, + "loss": 3.4245, + "step": 5637 + }, + { + "epoch": 0.28, + "grad_norm": 0.532107949256897, + "learning_rate": 0.0005875292273835865, + "loss": 3.304, + "step": 5638 + }, + { + "epoch": 0.28, + "grad_norm": 0.5280243754386902, + "learning_rate": 0.0005875248340834953, + "loss": 3.2755, + "step": 5639 + }, + { + "epoch": 0.28, + "grad_norm": 0.5418915748596191, + "learning_rate": 0.0005875204400261187, + "loss": 3.337, + "step": 5640 + }, + { + "epoch": 0.28, + "grad_norm": 0.5245290398597717, + "learning_rate": 0.0005875160452114689, + "loss": 3.528, + "step": 5641 + }, + { + "epoch": 0.28, + "grad_norm": 0.5836740732192993, + "learning_rate": 0.0005875116496395569, + "loss": 3.3991, + "step": 5642 + }, + { + "epoch": 0.28, + "grad_norm": 0.5641128420829773, + "learning_rate": 0.0005875072533103946, + "loss": 3.2812, + "step": 5643 + }, + { + "epoch": 0.28, + "grad_norm": 0.5404115915298462, + "learning_rate": 0.0005875028562239936, + "loss": 3.3253, + "step": 5644 + }, + { + "epoch": 0.28, + "grad_norm": 0.5600898861885071, + "learning_rate": 0.0005874984583803653, + "loss": 3.2992, + "step": 5645 + }, + { + "epoch": 0.28, + "grad_norm": 0.5351735949516296, + "learning_rate": 0.0005874940597795215, + "loss": 3.5004, + "step": 5646 + }, + { + "epoch": 0.28, + "grad_norm": 0.561495840549469, + "learning_rate": 0.0005874896604214737, + "loss": 3.392, + "step": 5647 + }, + { + "epoch": 0.28, + "grad_norm": 0.5557546615600586, + "learning_rate": 0.0005874852603062334, + "loss": 3.2979, + "step": 5648 + }, + { + "epoch": 0.28, + "grad_norm": 0.5082600116729736, + "learning_rate": 0.0005874808594338122, + "loss": 3.5587, + "step": 5649 + }, + { + "epoch": 0.28, + "grad_norm": 0.5817442536354065, + "learning_rate": 0.0005874764578042218, + "loss": 3.571, + "step": 5650 + }, + { + "epoch": 0.28, + "grad_norm": 0.5154877305030823, + "learning_rate": 0.0005874720554174738, + "loss": 3.4611, + "step": 5651 + }, + { + "epoch": 0.28, + "grad_norm": 0.5388178825378418, + "learning_rate": 0.0005874676522735796, + "loss": 3.2679, + "step": 5652 + }, + { + "epoch": 0.28, + "grad_norm": 0.568677544593811, + "learning_rate": 0.000587463248372551, + "loss": 3.3036, + "step": 5653 + }, + { + "epoch": 0.28, + "grad_norm": 0.520805835723877, + "learning_rate": 0.0005874588437143996, + "loss": 3.561, + "step": 5654 + }, + { + "epoch": 0.28, + "grad_norm": 0.5892708897590637, + "learning_rate": 0.0005874544382991368, + "loss": 3.3356, + "step": 5655 + }, + { + "epoch": 0.28, + "grad_norm": 0.5433066487312317, + "learning_rate": 0.0005874500321267743, + "loss": 3.4601, + "step": 5656 + }, + { + "epoch": 0.28, + "grad_norm": 0.6382086873054504, + "learning_rate": 0.000587445625197324, + "loss": 3.5814, + "step": 5657 + }, + { + "epoch": 0.28, + "grad_norm": 0.5490455031394958, + "learning_rate": 0.000587441217510797, + "loss": 3.4123, + "step": 5658 + }, + { + "epoch": 0.28, + "grad_norm": 0.5612518191337585, + "learning_rate": 0.0005874368090672053, + "loss": 3.3378, + "step": 5659 + }, + { + "epoch": 0.28, + "grad_norm": 0.578199565410614, + "learning_rate": 0.0005874323998665603, + "loss": 3.3815, + "step": 5660 + }, + { + "epoch": 0.28, + "grad_norm": 0.5689168572425842, + "learning_rate": 0.0005874279899088735, + "loss": 3.3816, + "step": 5661 + }, + { + "epoch": 0.28, + "grad_norm": 0.5449416637420654, + "learning_rate": 0.0005874235791941569, + "loss": 3.4455, + "step": 5662 + }, + { + "epoch": 0.28, + "grad_norm": 0.5448278784751892, + "learning_rate": 0.0005874191677224218, + "loss": 3.6087, + "step": 5663 + }, + { + "epoch": 0.28, + "grad_norm": 0.521791934967041, + "learning_rate": 0.00058741475549368, + "loss": 3.3265, + "step": 5664 + }, + { + "epoch": 0.28, + "grad_norm": 0.5685543417930603, + "learning_rate": 0.000587410342507943, + "loss": 3.4758, + "step": 5665 + }, + { + "epoch": 0.28, + "grad_norm": 0.5723705291748047, + "learning_rate": 0.0005874059287652225, + "loss": 3.3845, + "step": 5666 + }, + { + "epoch": 0.28, + "grad_norm": 0.5475627779960632, + "learning_rate": 0.0005874015142655302, + "loss": 3.4733, + "step": 5667 + }, + { + "epoch": 0.28, + "grad_norm": 0.565759539604187, + "learning_rate": 0.0005873970990088775, + "loss": 3.0086, + "step": 5668 + }, + { + "epoch": 0.28, + "grad_norm": 0.5776841640472412, + "learning_rate": 0.0005873926829952762, + "loss": 3.4155, + "step": 5669 + }, + { + "epoch": 0.28, + "grad_norm": 0.5339114665985107, + "learning_rate": 0.0005873882662247378, + "loss": 3.4948, + "step": 5670 + }, + { + "epoch": 0.28, + "grad_norm": 0.5639076828956604, + "learning_rate": 0.000587383848697274, + "loss": 3.1924, + "step": 5671 + }, + { + "epoch": 0.28, + "grad_norm": 0.5452633500099182, + "learning_rate": 0.0005873794304128966, + "loss": 3.151, + "step": 5672 + }, + { + "epoch": 0.28, + "grad_norm": 0.5319305658340454, + "learning_rate": 0.000587375011371617, + "loss": 3.4559, + "step": 5673 + }, + { + "epoch": 0.28, + "grad_norm": 0.551712691783905, + "learning_rate": 0.000587370591573447, + "loss": 3.5398, + "step": 5674 + }, + { + "epoch": 0.28, + "grad_norm": 0.5740674138069153, + "learning_rate": 0.000587366171018398, + "loss": 3.415, + "step": 5675 + }, + { + "epoch": 0.28, + "grad_norm": 0.5360549688339233, + "learning_rate": 0.000587361749706482, + "loss": 3.3225, + "step": 5676 + }, + { + "epoch": 0.28, + "grad_norm": 0.5476113557815552, + "learning_rate": 0.0005873573276377103, + "loss": 3.6012, + "step": 5677 + }, + { + "epoch": 0.28, + "grad_norm": 0.5887551307678223, + "learning_rate": 0.0005873529048120948, + "loss": 3.3345, + "step": 5678 + }, + { + "epoch": 0.28, + "grad_norm": 0.5409665107727051, + "learning_rate": 0.000587348481229647, + "loss": 3.3906, + "step": 5679 + }, + { + "epoch": 0.28, + "grad_norm": 0.5526324510574341, + "learning_rate": 0.0005873440568903785, + "loss": 3.3438, + "step": 5680 + }, + { + "epoch": 0.28, + "grad_norm": 0.5718168616294861, + "learning_rate": 0.0005873396317943013, + "loss": 3.4642, + "step": 5681 + }, + { + "epoch": 0.28, + "grad_norm": 0.5631795525550842, + "learning_rate": 0.0005873352059414267, + "loss": 3.2674, + "step": 5682 + }, + { + "epoch": 0.28, + "grad_norm": 0.6147184371948242, + "learning_rate": 0.0005873307793317664, + "loss": 3.2464, + "step": 5683 + }, + { + "epoch": 0.28, + "grad_norm": 0.6067205667495728, + "learning_rate": 0.0005873263519653323, + "loss": 3.4046, + "step": 5684 + }, + { + "epoch": 0.28, + "grad_norm": 0.542724072933197, + "learning_rate": 0.0005873219238421356, + "loss": 3.5222, + "step": 5685 + }, + { + "epoch": 0.28, + "grad_norm": 0.5695931315422058, + "learning_rate": 0.0005873174949621885, + "loss": 3.4056, + "step": 5686 + }, + { + "epoch": 0.28, + "grad_norm": 0.5388402938842773, + "learning_rate": 0.0005873130653255023, + "loss": 3.4411, + "step": 5687 + }, + { + "epoch": 0.28, + "grad_norm": 0.6163471341133118, + "learning_rate": 0.0005873086349320888, + "loss": 3.3085, + "step": 5688 + }, + { + "epoch": 0.28, + "grad_norm": 0.5419004559516907, + "learning_rate": 0.0005873042037819597, + "loss": 3.5004, + "step": 5689 + }, + { + "epoch": 0.28, + "grad_norm": 0.5461073517799377, + "learning_rate": 0.0005872997718751265, + "loss": 3.3721, + "step": 5690 + }, + { + "epoch": 0.28, + "grad_norm": 0.5264971852302551, + "learning_rate": 0.0005872953392116011, + "loss": 3.3873, + "step": 5691 + }, + { + "epoch": 0.28, + "grad_norm": 0.5653229355812073, + "learning_rate": 0.0005872909057913951, + "loss": 3.3069, + "step": 5692 + }, + { + "epoch": 0.28, + "grad_norm": 0.5882543325424194, + "learning_rate": 0.00058728647161452, + "loss": 3.1904, + "step": 5693 + }, + { + "epoch": 0.28, + "grad_norm": 0.5486871600151062, + "learning_rate": 0.0005872820366809877, + "loss": 3.2986, + "step": 5694 + }, + { + "epoch": 0.28, + "grad_norm": 0.53853440284729, + "learning_rate": 0.0005872776009908099, + "loss": 3.4683, + "step": 5695 + }, + { + "epoch": 0.28, + "grad_norm": 0.5672714710235596, + "learning_rate": 0.000587273164543998, + "loss": 3.4573, + "step": 5696 + }, + { + "epoch": 0.28, + "grad_norm": 0.5596035122871399, + "learning_rate": 0.000587268727340564, + "loss": 3.556, + "step": 5697 + }, + { + "epoch": 0.28, + "grad_norm": 0.5245596766471863, + "learning_rate": 0.0005872642893805194, + "loss": 3.3127, + "step": 5698 + }, + { + "epoch": 0.28, + "grad_norm": 0.5364984273910522, + "learning_rate": 0.0005872598506638761, + "loss": 3.3348, + "step": 5699 + }, + { + "epoch": 0.28, + "grad_norm": 0.5501704812049866, + "learning_rate": 0.0005872554111906454, + "loss": 3.352, + "step": 5700 + }, + { + "epoch": 0.28, + "grad_norm": 0.5427786707878113, + "learning_rate": 0.0005872509709608394, + "loss": 3.4774, + "step": 5701 + }, + { + "epoch": 0.28, + "grad_norm": 0.5833746194839478, + "learning_rate": 0.0005872465299744696, + "loss": 3.2135, + "step": 5702 + }, + { + "epoch": 0.28, + "grad_norm": 0.5424846410751343, + "learning_rate": 0.0005872420882315476, + "loss": 3.4114, + "step": 5703 + }, + { + "epoch": 0.28, + "grad_norm": 0.5223133563995361, + "learning_rate": 0.0005872376457320853, + "loss": 3.2703, + "step": 5704 + }, + { + "epoch": 0.28, + "grad_norm": 0.5719478726387024, + "learning_rate": 0.0005872332024760944, + "loss": 3.5279, + "step": 5705 + }, + { + "epoch": 0.28, + "grad_norm": 0.5417484641075134, + "learning_rate": 0.0005872287584635864, + "loss": 3.5221, + "step": 5706 + }, + { + "epoch": 0.28, + "grad_norm": 0.5238466262817383, + "learning_rate": 0.0005872243136945732, + "loss": 3.3905, + "step": 5707 + }, + { + "epoch": 0.28, + "grad_norm": 0.5247589349746704, + "learning_rate": 0.0005872198681690664, + "loss": 3.3364, + "step": 5708 + }, + { + "epoch": 0.28, + "grad_norm": 0.5623005628585815, + "learning_rate": 0.0005872154218870778, + "loss": 3.3918, + "step": 5709 + }, + { + "epoch": 0.28, + "grad_norm": 0.6546081900596619, + "learning_rate": 0.0005872109748486189, + "loss": 3.6369, + "step": 5710 + }, + { + "epoch": 0.28, + "grad_norm": 0.5542550683021545, + "learning_rate": 0.0005872065270537017, + "loss": 3.4674, + "step": 5711 + }, + { + "epoch": 0.28, + "grad_norm": 0.5579176545143127, + "learning_rate": 0.0005872020785023379, + "loss": 3.3265, + "step": 5712 + }, + { + "epoch": 0.28, + "grad_norm": 0.6107970476150513, + "learning_rate": 0.0005871976291945388, + "loss": 3.5986, + "step": 5713 + }, + { + "epoch": 0.28, + "grad_norm": 0.534400463104248, + "learning_rate": 0.0005871931791303167, + "loss": 3.5339, + "step": 5714 + }, + { + "epoch": 0.28, + "grad_norm": 0.5474011898040771, + "learning_rate": 0.000587188728309683, + "loss": 3.2851, + "step": 5715 + }, + { + "epoch": 0.28, + "grad_norm": 0.5572850108146667, + "learning_rate": 0.0005871842767326492, + "loss": 3.3349, + "step": 5716 + }, + { + "epoch": 0.28, + "grad_norm": 0.6158696413040161, + "learning_rate": 0.0005871798243992276, + "loss": 3.3417, + "step": 5717 + }, + { + "epoch": 0.28, + "grad_norm": 0.5303359627723694, + "learning_rate": 0.0005871753713094294, + "loss": 3.5762, + "step": 5718 + }, + { + "epoch": 0.28, + "grad_norm": 0.5638172030448914, + "learning_rate": 0.0005871709174632666, + "loss": 3.3489, + "step": 5719 + }, + { + "epoch": 0.28, + "grad_norm": 0.5369972586631775, + "learning_rate": 0.0005871664628607509, + "loss": 3.2923, + "step": 5720 + }, + { + "epoch": 0.28, + "grad_norm": 0.6055926084518433, + "learning_rate": 0.000587162007501894, + "loss": 3.145, + "step": 5721 + }, + { + "epoch": 0.28, + "grad_norm": 0.5599731802940369, + "learning_rate": 0.0005871575513867076, + "loss": 3.5095, + "step": 5722 + }, + { + "epoch": 0.28, + "grad_norm": 0.5579766035079956, + "learning_rate": 0.0005871530945152035, + "loss": 3.6883, + "step": 5723 + }, + { + "epoch": 0.28, + "grad_norm": 0.5651584267616272, + "learning_rate": 0.0005871486368873934, + "loss": 3.4495, + "step": 5724 + }, + { + "epoch": 0.28, + "grad_norm": 0.6099211573600769, + "learning_rate": 0.000587144178503289, + "loss": 3.2469, + "step": 5725 + }, + { + "epoch": 0.28, + "grad_norm": 0.556201159954071, + "learning_rate": 0.0005871397193629022, + "loss": 3.5754, + "step": 5726 + }, + { + "epoch": 0.28, + "grad_norm": 0.5447879433631897, + "learning_rate": 0.0005871352594662446, + "loss": 3.4798, + "step": 5727 + }, + { + "epoch": 0.28, + "grad_norm": 0.5088903307914734, + "learning_rate": 0.000587130798813328, + "loss": 3.4272, + "step": 5728 + }, + { + "epoch": 0.28, + "grad_norm": 0.5600134134292603, + "learning_rate": 0.0005871263374041642, + "loss": 3.3428, + "step": 5729 + }, + { + "epoch": 0.28, + "grad_norm": 0.5825614333152771, + "learning_rate": 0.0005871218752387647, + "loss": 3.3385, + "step": 5730 + }, + { + "epoch": 0.28, + "grad_norm": 0.5817743539810181, + "learning_rate": 0.0005871174123171415, + "loss": 3.3358, + "step": 5731 + }, + { + "epoch": 0.28, + "grad_norm": 0.5553821325302124, + "learning_rate": 0.0005871129486393064, + "loss": 3.3266, + "step": 5732 + }, + { + "epoch": 0.28, + "grad_norm": 0.583930253982544, + "learning_rate": 0.0005871084842052711, + "loss": 3.3519, + "step": 5733 + }, + { + "epoch": 0.28, + "grad_norm": 0.5544360280036926, + "learning_rate": 0.0005871040190150471, + "loss": 3.2843, + "step": 5734 + }, + { + "epoch": 0.28, + "grad_norm": 0.5816003084182739, + "learning_rate": 0.0005870995530686465, + "loss": 3.5052, + "step": 5735 + }, + { + "epoch": 0.28, + "grad_norm": 0.5861048698425293, + "learning_rate": 0.0005870950863660808, + "loss": 3.4918, + "step": 5736 + }, + { + "epoch": 0.28, + "grad_norm": 0.5465466380119324, + "learning_rate": 0.000587090618907362, + "loss": 3.5772, + "step": 5737 + }, + { + "epoch": 0.28, + "grad_norm": 0.5588349103927612, + "learning_rate": 0.0005870861506925018, + "loss": 3.5592, + "step": 5738 + }, + { + "epoch": 0.28, + "grad_norm": 0.558573842048645, + "learning_rate": 0.0005870816817215119, + "loss": 3.4306, + "step": 5739 + }, + { + "epoch": 0.28, + "grad_norm": 0.5814093351364136, + "learning_rate": 0.0005870772119944041, + "loss": 3.3825, + "step": 5740 + }, + { + "epoch": 0.28, + "grad_norm": 0.5688466429710388, + "learning_rate": 0.0005870727415111901, + "loss": 3.4397, + "step": 5741 + }, + { + "epoch": 0.28, + "grad_norm": 0.5587722063064575, + "learning_rate": 0.0005870682702718817, + "loss": 3.4223, + "step": 5742 + }, + { + "epoch": 0.28, + "grad_norm": 0.578432559967041, + "learning_rate": 0.000587063798276491, + "loss": 3.6142, + "step": 5743 + }, + { + "epoch": 0.28, + "grad_norm": 0.5772233605384827, + "learning_rate": 0.0005870593255250293, + "loss": 3.4018, + "step": 5744 + }, + { + "epoch": 0.28, + "grad_norm": 0.5719894170761108, + "learning_rate": 0.0005870548520175086, + "loss": 3.3818, + "step": 5745 + }, + { + "epoch": 0.28, + "grad_norm": 0.5581337213516235, + "learning_rate": 0.0005870503777539406, + "loss": 3.6375, + "step": 5746 + }, + { + "epoch": 0.28, + "grad_norm": 0.5520333051681519, + "learning_rate": 0.0005870459027343373, + "loss": 3.5316, + "step": 5747 + }, + { + "epoch": 0.28, + "grad_norm": 0.524156928062439, + "learning_rate": 0.0005870414269587102, + "loss": 3.2556, + "step": 5748 + }, + { + "epoch": 0.28, + "grad_norm": 0.5664880871772766, + "learning_rate": 0.0005870369504270713, + "loss": 3.3596, + "step": 5749 + }, + { + "epoch": 0.28, + "grad_norm": 0.5417056679725647, + "learning_rate": 0.0005870324731394323, + "loss": 3.4458, + "step": 5750 + }, + { + "epoch": 0.28, + "grad_norm": 0.5310121774673462, + "learning_rate": 0.0005870279950958051, + "loss": 3.3986, + "step": 5751 + }, + { + "epoch": 0.28, + "grad_norm": 0.5482550263404846, + "learning_rate": 0.0005870235162962012, + "loss": 3.2324, + "step": 5752 + }, + { + "epoch": 0.28, + "grad_norm": 0.6072642207145691, + "learning_rate": 0.0005870190367406327, + "loss": 3.5553, + "step": 5753 + }, + { + "epoch": 0.28, + "grad_norm": 0.5456782579421997, + "learning_rate": 0.0005870145564291113, + "loss": 3.18, + "step": 5754 + }, + { + "epoch": 0.28, + "grad_norm": 0.5461390018463135, + "learning_rate": 0.0005870100753616488, + "loss": 3.4445, + "step": 5755 + }, + { + "epoch": 0.28, + "grad_norm": 0.5472182035446167, + "learning_rate": 0.000587005593538257, + "loss": 3.4471, + "step": 5756 + }, + { + "epoch": 0.28, + "grad_norm": 0.5583928823471069, + "learning_rate": 0.0005870011109589477, + "loss": 3.3076, + "step": 5757 + }, + { + "epoch": 0.28, + "grad_norm": 0.5505167841911316, + "learning_rate": 0.0005869966276237327, + "loss": 3.3274, + "step": 5758 + }, + { + "epoch": 0.28, + "grad_norm": 0.5618990659713745, + "learning_rate": 0.0005869921435326238, + "loss": 3.4508, + "step": 5759 + }, + { + "epoch": 0.28, + "grad_norm": 0.5286376476287842, + "learning_rate": 0.0005869876586856328, + "loss": 3.207, + "step": 5760 + }, + { + "epoch": 0.28, + "grad_norm": 0.5474581718444824, + "learning_rate": 0.0005869831730827715, + "loss": 3.6744, + "step": 5761 + }, + { + "epoch": 0.28, + "grad_norm": 0.5827766060829163, + "learning_rate": 0.0005869786867240519, + "loss": 3.3691, + "step": 5762 + }, + { + "epoch": 0.28, + "grad_norm": 0.5204379558563232, + "learning_rate": 0.0005869741996094856, + "loss": 3.7098, + "step": 5763 + }, + { + "epoch": 0.28, + "grad_norm": 0.5318394899368286, + "learning_rate": 0.0005869697117390846, + "loss": 3.5282, + "step": 5764 + }, + { + "epoch": 0.28, + "grad_norm": 0.520143985748291, + "learning_rate": 0.0005869652231128604, + "loss": 3.621, + "step": 5765 + }, + { + "epoch": 0.28, + "grad_norm": 0.5361225008964539, + "learning_rate": 0.0005869607337308251, + "loss": 3.4061, + "step": 5766 + }, + { + "epoch": 0.28, + "grad_norm": 0.6079183220863342, + "learning_rate": 0.0005869562435929905, + "loss": 3.387, + "step": 5767 + }, + { + "epoch": 0.28, + "grad_norm": 0.5326195359230042, + "learning_rate": 0.0005869517526993684, + "loss": 3.4427, + "step": 5768 + }, + { + "epoch": 0.28, + "grad_norm": 0.541530191898346, + "learning_rate": 0.0005869472610499706, + "loss": 3.4417, + "step": 5769 + }, + { + "epoch": 0.28, + "grad_norm": 0.5230389833450317, + "learning_rate": 0.0005869427686448088, + "loss": 3.2022, + "step": 5770 + }, + { + "epoch": 0.28, + "grad_norm": 0.566440761089325, + "learning_rate": 0.0005869382754838951, + "loss": 3.3521, + "step": 5771 + }, + { + "epoch": 0.28, + "grad_norm": 0.5397255420684814, + "learning_rate": 0.0005869337815672413, + "loss": 3.5468, + "step": 5772 + }, + { + "epoch": 0.28, + "grad_norm": 0.5574547648429871, + "learning_rate": 0.0005869292868948589, + "loss": 3.13, + "step": 5773 + }, + { + "epoch": 0.28, + "grad_norm": 0.5912591218948364, + "learning_rate": 0.0005869247914667601, + "loss": 3.2856, + "step": 5774 + }, + { + "epoch": 0.28, + "grad_norm": 0.5619760751724243, + "learning_rate": 0.0005869202952829567, + "loss": 3.3057, + "step": 5775 + }, + { + "epoch": 0.28, + "grad_norm": 0.5392544865608215, + "learning_rate": 0.0005869157983434604, + "loss": 3.4446, + "step": 5776 + }, + { + "epoch": 0.28, + "grad_norm": 0.5367756485939026, + "learning_rate": 0.000586911300648283, + "loss": 3.4562, + "step": 5777 + }, + { + "epoch": 0.28, + "grad_norm": 0.5628572702407837, + "learning_rate": 0.0005869068021974366, + "loss": 3.5371, + "step": 5778 + }, + { + "epoch": 0.28, + "grad_norm": 0.5055538415908813, + "learning_rate": 0.0005869023029909328, + "loss": 3.5217, + "step": 5779 + }, + { + "epoch": 0.28, + "grad_norm": 0.546389102935791, + "learning_rate": 0.0005868978030287836, + "loss": 3.4341, + "step": 5780 + }, + { + "epoch": 0.28, + "grad_norm": 0.5764334201812744, + "learning_rate": 0.0005868933023110008, + "loss": 3.0859, + "step": 5781 + }, + { + "epoch": 0.28, + "grad_norm": 0.6767985820770264, + "learning_rate": 0.0005868888008375963, + "loss": 3.332, + "step": 5782 + }, + { + "epoch": 0.28, + "grad_norm": 0.5384335517883301, + "learning_rate": 0.0005868842986085818, + "loss": 3.4159, + "step": 5783 + }, + { + "epoch": 0.28, + "grad_norm": 0.5437288284301758, + "learning_rate": 0.0005868797956239693, + "loss": 3.2161, + "step": 5784 + }, + { + "epoch": 0.28, + "grad_norm": 0.5375307202339172, + "learning_rate": 0.0005868752918837707, + "loss": 3.3611, + "step": 5785 + }, + { + "epoch": 0.28, + "grad_norm": 0.5247658491134644, + "learning_rate": 0.0005868707873879978, + "loss": 3.2946, + "step": 5786 + }, + { + "epoch": 0.28, + "grad_norm": 0.5466168522834778, + "learning_rate": 0.0005868662821366624, + "loss": 3.4666, + "step": 5787 + }, + { + "epoch": 0.28, + "grad_norm": 0.5654569864273071, + "learning_rate": 0.0005868617761297764, + "loss": 3.4604, + "step": 5788 + }, + { + "epoch": 0.28, + "grad_norm": 0.5402109026908875, + "learning_rate": 0.0005868572693673516, + "loss": 3.359, + "step": 5789 + }, + { + "epoch": 0.28, + "grad_norm": 0.517817497253418, + "learning_rate": 0.0005868527618494001, + "loss": 3.3345, + "step": 5790 + }, + { + "epoch": 0.28, + "grad_norm": 0.49960243701934814, + "learning_rate": 0.0005868482535759337, + "loss": 3.4715, + "step": 5791 + }, + { + "epoch": 0.28, + "grad_norm": 0.540615439414978, + "learning_rate": 0.000586843744546964, + "loss": 3.4621, + "step": 5792 + }, + { + "epoch": 0.28, + "grad_norm": 0.5866577625274658, + "learning_rate": 0.0005868392347625032, + "loss": 3.2817, + "step": 5793 + }, + { + "epoch": 0.28, + "grad_norm": 0.5393642783164978, + "learning_rate": 0.0005868347242225632, + "loss": 3.2075, + "step": 5794 + }, + { + "epoch": 0.28, + "grad_norm": 0.5420916080474854, + "learning_rate": 0.0005868302129271555, + "loss": 3.4428, + "step": 5795 + }, + { + "epoch": 0.28, + "grad_norm": 0.5432751774787903, + "learning_rate": 0.0005868257008762924, + "loss": 3.4497, + "step": 5796 + }, + { + "epoch": 0.28, + "grad_norm": 0.5415337085723877, + "learning_rate": 0.0005868211880699854, + "loss": 3.3068, + "step": 5797 + }, + { + "epoch": 0.28, + "grad_norm": 0.5475549697875977, + "learning_rate": 0.0005868166745082468, + "loss": 3.4784, + "step": 5798 + }, + { + "epoch": 0.28, + "grad_norm": 0.5423837900161743, + "learning_rate": 0.0005868121601910882, + "loss": 3.2155, + "step": 5799 + }, + { + "epoch": 0.28, + "grad_norm": 0.52655428647995, + "learning_rate": 0.0005868076451185215, + "loss": 3.4475, + "step": 5800 + }, + { + "epoch": 0.28, + "grad_norm": 0.5762920379638672, + "learning_rate": 0.0005868031292905586, + "loss": 3.3285, + "step": 5801 + }, + { + "epoch": 0.28, + "grad_norm": 0.5149837732315063, + "learning_rate": 0.0005867986127072116, + "loss": 3.5152, + "step": 5802 + }, + { + "epoch": 0.28, + "grad_norm": 0.5225587487220764, + "learning_rate": 0.0005867940953684921, + "loss": 3.276, + "step": 5803 + }, + { + "epoch": 0.28, + "grad_norm": 0.6301308870315552, + "learning_rate": 0.0005867895772744124, + "loss": 3.3187, + "step": 5804 + }, + { + "epoch": 0.28, + "grad_norm": 0.5626131892204285, + "learning_rate": 0.0005867850584249841, + "loss": 3.4969, + "step": 5805 + }, + { + "epoch": 0.28, + "grad_norm": 0.5513469576835632, + "learning_rate": 0.0005867805388202189, + "loss": 3.2877, + "step": 5806 + }, + { + "epoch": 0.28, + "grad_norm": 0.5425329208374023, + "learning_rate": 0.0005867760184601292, + "loss": 3.5523, + "step": 5807 + }, + { + "epoch": 0.28, + "grad_norm": 0.5266916155815125, + "learning_rate": 0.0005867714973447265, + "loss": 3.0832, + "step": 5808 + }, + { + "epoch": 0.28, + "grad_norm": 0.5299080014228821, + "learning_rate": 0.000586766975474023, + "loss": 3.4578, + "step": 5809 + }, + { + "epoch": 0.28, + "grad_norm": 0.510150671005249, + "learning_rate": 0.0005867624528480303, + "loss": 3.3181, + "step": 5810 + }, + { + "epoch": 0.28, + "grad_norm": 0.5414705872535706, + "learning_rate": 0.0005867579294667606, + "loss": 3.4233, + "step": 5811 + }, + { + "epoch": 0.28, + "grad_norm": 0.5347448587417603, + "learning_rate": 0.0005867534053302258, + "loss": 3.6438, + "step": 5812 + }, + { + "epoch": 0.28, + "grad_norm": 0.5466963648796082, + "learning_rate": 0.0005867488804384377, + "loss": 3.5877, + "step": 5813 + }, + { + "epoch": 0.28, + "grad_norm": 0.5780187249183655, + "learning_rate": 0.0005867443547914081, + "loss": 3.4495, + "step": 5814 + }, + { + "epoch": 0.28, + "grad_norm": 0.5186501145362854, + "learning_rate": 0.0005867398283891491, + "loss": 3.4562, + "step": 5815 + }, + { + "epoch": 0.29, + "grad_norm": 0.5260766744613647, + "learning_rate": 0.0005867353012316725, + "loss": 3.2264, + "step": 5816 + }, + { + "epoch": 0.29, + "grad_norm": 0.5569272041320801, + "learning_rate": 0.0005867307733189905, + "loss": 3.3634, + "step": 5817 + }, + { + "epoch": 0.29, + "grad_norm": 0.5397284030914307, + "learning_rate": 0.0005867262446511147, + "loss": 3.2977, + "step": 5818 + }, + { + "epoch": 0.29, + "grad_norm": 0.5917802453041077, + "learning_rate": 0.0005867217152280571, + "loss": 3.5556, + "step": 5819 + }, + { + "epoch": 0.29, + "grad_norm": 0.5505132079124451, + "learning_rate": 0.0005867171850498298, + "loss": 3.4379, + "step": 5820 + }, + { + "epoch": 0.29, + "grad_norm": 0.5808700919151306, + "learning_rate": 0.0005867126541164445, + "loss": 3.3594, + "step": 5821 + }, + { + "epoch": 0.29, + "grad_norm": 0.5809784531593323, + "learning_rate": 0.0005867081224279133, + "loss": 3.4292, + "step": 5822 + }, + { + "epoch": 0.29, + "grad_norm": 0.5430020093917847, + "learning_rate": 0.0005867035899842481, + "loss": 3.3711, + "step": 5823 + }, + { + "epoch": 0.29, + "grad_norm": 0.6083835959434509, + "learning_rate": 0.0005866990567854608, + "loss": 3.4043, + "step": 5824 + }, + { + "epoch": 0.29, + "grad_norm": 0.5165572166442871, + "learning_rate": 0.0005866945228315634, + "loss": 3.6597, + "step": 5825 + }, + { + "epoch": 0.29, + "grad_norm": 0.5558134913444519, + "learning_rate": 0.0005866899881225678, + "loss": 3.2103, + "step": 5826 + }, + { + "epoch": 0.29, + "grad_norm": 0.5334741473197937, + "learning_rate": 0.0005866854526584859, + "loss": 3.087, + "step": 5827 + }, + { + "epoch": 0.29, + "grad_norm": 0.5371214747428894, + "learning_rate": 0.0005866809164393297, + "loss": 3.4371, + "step": 5828 + }, + { + "epoch": 0.29, + "grad_norm": 0.5531008839607239, + "learning_rate": 0.0005866763794651111, + "loss": 3.2865, + "step": 5829 + }, + { + "epoch": 0.29, + "grad_norm": 0.5216488242149353, + "learning_rate": 0.0005866718417358421, + "loss": 3.4875, + "step": 5830 + }, + { + "epoch": 0.29, + "grad_norm": 0.542599618434906, + "learning_rate": 0.0005866673032515347, + "loss": 3.3621, + "step": 5831 + }, + { + "epoch": 0.29, + "grad_norm": 0.5730884075164795, + "learning_rate": 0.0005866627640122008, + "loss": 3.2273, + "step": 5832 + }, + { + "epoch": 0.29, + "grad_norm": 0.5188013911247253, + "learning_rate": 0.0005866582240178523, + "loss": 3.2409, + "step": 5833 + }, + { + "epoch": 0.29, + "grad_norm": 0.5457878708839417, + "learning_rate": 0.0005866536832685013, + "loss": 3.2904, + "step": 5834 + }, + { + "epoch": 0.29, + "grad_norm": 0.569889485836029, + "learning_rate": 0.0005866491417641595, + "loss": 3.4469, + "step": 5835 + }, + { + "epoch": 0.29, + "grad_norm": 0.5618494153022766, + "learning_rate": 0.0005866445995048392, + "loss": 3.3618, + "step": 5836 + }, + { + "epoch": 0.29, + "grad_norm": 0.5545337796211243, + "learning_rate": 0.0005866400564905521, + "loss": 3.3963, + "step": 5837 + }, + { + "epoch": 0.29, + "grad_norm": 0.5397124290466309, + "learning_rate": 0.0005866355127213102, + "loss": 3.4804, + "step": 5838 + }, + { + "epoch": 0.29, + "grad_norm": 0.5252190828323364, + "learning_rate": 0.0005866309681971256, + "loss": 3.5564, + "step": 5839 + }, + { + "epoch": 0.29, + "grad_norm": 0.5801165699958801, + "learning_rate": 0.0005866264229180102, + "loss": 3.5835, + "step": 5840 + }, + { + "epoch": 0.29, + "grad_norm": 0.536383330821991, + "learning_rate": 0.000586621876883976, + "loss": 3.4126, + "step": 5841 + }, + { + "epoch": 0.29, + "grad_norm": 0.5214142203330994, + "learning_rate": 0.0005866173300950349, + "loss": 3.2283, + "step": 5842 + }, + { + "epoch": 0.29, + "grad_norm": 0.5730195641517639, + "learning_rate": 0.0005866127825511989, + "loss": 3.5788, + "step": 5843 + }, + { + "epoch": 0.29, + "grad_norm": 0.527881383895874, + "learning_rate": 0.00058660823425248, + "loss": 3.5447, + "step": 5844 + }, + { + "epoch": 0.29, + "grad_norm": 0.5443788766860962, + "learning_rate": 0.0005866036851988901, + "loss": 3.2235, + "step": 5845 + }, + { + "epoch": 0.29, + "grad_norm": 0.6077579855918884, + "learning_rate": 0.0005865991353904413, + "loss": 3.3991, + "step": 5846 + }, + { + "epoch": 0.29, + "grad_norm": 0.578214704990387, + "learning_rate": 0.0005865945848271455, + "loss": 3.2017, + "step": 5847 + }, + { + "epoch": 0.29, + "grad_norm": 0.6207023859024048, + "learning_rate": 0.0005865900335090149, + "loss": 3.4464, + "step": 5848 + }, + { + "epoch": 0.29, + "grad_norm": 0.5436177253723145, + "learning_rate": 0.0005865854814360612, + "loss": 3.3529, + "step": 5849 + }, + { + "epoch": 0.29, + "grad_norm": 0.6536390781402588, + "learning_rate": 0.0005865809286082963, + "loss": 3.4443, + "step": 5850 + }, + { + "epoch": 0.29, + "grad_norm": 0.5636897087097168, + "learning_rate": 0.0005865763750257327, + "loss": 3.4031, + "step": 5851 + }, + { + "epoch": 0.29, + "grad_norm": 0.5775346159934998, + "learning_rate": 0.0005865718206883819, + "loss": 3.2281, + "step": 5852 + }, + { + "epoch": 0.29, + "grad_norm": 0.5930302143096924, + "learning_rate": 0.0005865672655962561, + "loss": 3.2219, + "step": 5853 + }, + { + "epoch": 0.29, + "grad_norm": 0.5139269828796387, + "learning_rate": 0.0005865627097493672, + "loss": 3.5864, + "step": 5854 + }, + { + "epoch": 0.29, + "grad_norm": 0.5036550760269165, + "learning_rate": 0.0005865581531477274, + "loss": 3.5422, + "step": 5855 + }, + { + "epoch": 0.29, + "grad_norm": 0.6104763746261597, + "learning_rate": 0.0005865535957913486, + "loss": 3.4522, + "step": 5856 + }, + { + "epoch": 0.29, + "grad_norm": 0.5676093697547913, + "learning_rate": 0.0005865490376802427, + "loss": 3.4295, + "step": 5857 + }, + { + "epoch": 0.29, + "grad_norm": 0.538794994354248, + "learning_rate": 0.0005865444788144217, + "loss": 3.2915, + "step": 5858 + }, + { + "epoch": 0.29, + "grad_norm": 0.5480157732963562, + "learning_rate": 0.0005865399191938979, + "loss": 3.5967, + "step": 5859 + }, + { + "epoch": 0.29, + "grad_norm": 0.5287188291549683, + "learning_rate": 0.000586535358818683, + "loss": 3.4753, + "step": 5860 + }, + { + "epoch": 0.29, + "grad_norm": 0.5639832615852356, + "learning_rate": 0.0005865307976887891, + "loss": 3.1815, + "step": 5861 + }, + { + "epoch": 0.29, + "grad_norm": 0.5048469305038452, + "learning_rate": 0.0005865262358042281, + "loss": 3.4191, + "step": 5862 + }, + { + "epoch": 0.29, + "grad_norm": 0.5484185814857483, + "learning_rate": 0.0005865216731650123, + "loss": 3.4258, + "step": 5863 + }, + { + "epoch": 0.29, + "grad_norm": 0.5514761209487915, + "learning_rate": 0.0005865171097711535, + "loss": 3.2908, + "step": 5864 + }, + { + "epoch": 0.29, + "grad_norm": 0.6602954864501953, + "learning_rate": 0.0005865125456226638, + "loss": 3.5131, + "step": 5865 + }, + { + "epoch": 0.29, + "grad_norm": 0.6218395829200745, + "learning_rate": 0.0005865079807195552, + "loss": 3.3947, + "step": 5866 + }, + { + "epoch": 0.29, + "grad_norm": 0.5158183574676514, + "learning_rate": 0.0005865034150618397, + "loss": 3.2724, + "step": 5867 + }, + { + "epoch": 0.29, + "grad_norm": 0.5524356961250305, + "learning_rate": 0.0005864988486495293, + "loss": 3.1983, + "step": 5868 + }, + { + "epoch": 0.29, + "grad_norm": 0.5388345718383789, + "learning_rate": 0.000586494281482636, + "loss": 3.4331, + "step": 5869 + }, + { + "epoch": 0.29, + "grad_norm": 0.532199501991272, + "learning_rate": 0.0005864897135611721, + "loss": 3.3253, + "step": 5870 + }, + { + "epoch": 0.29, + "grad_norm": 0.6542239189147949, + "learning_rate": 0.0005864851448851493, + "loss": 3.3342, + "step": 5871 + }, + { + "epoch": 0.29, + "grad_norm": 0.5302449464797974, + "learning_rate": 0.0005864805754545798, + "loss": 3.2958, + "step": 5872 + }, + { + "epoch": 0.29, + "grad_norm": 0.5232753753662109, + "learning_rate": 0.0005864760052694756, + "loss": 3.6324, + "step": 5873 + }, + { + "epoch": 0.29, + "grad_norm": 0.6836511492729187, + "learning_rate": 0.0005864714343298488, + "loss": 3.4663, + "step": 5874 + }, + { + "epoch": 0.29, + "grad_norm": 0.5641322135925293, + "learning_rate": 0.0005864668626357112, + "loss": 3.3574, + "step": 5875 + }, + { + "epoch": 0.29, + "grad_norm": 0.5231789350509644, + "learning_rate": 0.0005864622901870753, + "loss": 3.4077, + "step": 5876 + }, + { + "epoch": 0.29, + "grad_norm": 0.5678579807281494, + "learning_rate": 0.0005864577169839525, + "loss": 3.2796, + "step": 5877 + }, + { + "epoch": 0.29, + "grad_norm": 0.5374036431312561, + "learning_rate": 0.0005864531430263555, + "loss": 3.342, + "step": 5878 + }, + { + "epoch": 0.29, + "grad_norm": 0.5334300994873047, + "learning_rate": 0.000586448568314296, + "loss": 3.6034, + "step": 5879 + }, + { + "epoch": 0.29, + "grad_norm": 0.5521325469017029, + "learning_rate": 0.0005864439928477859, + "loss": 3.1953, + "step": 5880 + }, + { + "epoch": 0.29, + "grad_norm": 0.563149094581604, + "learning_rate": 0.0005864394166268376, + "loss": 3.4593, + "step": 5881 + }, + { + "epoch": 0.29, + "grad_norm": 0.5999089479446411, + "learning_rate": 0.000586434839651463, + "loss": 3.5447, + "step": 5882 + }, + { + "epoch": 0.29, + "grad_norm": 0.5314249992370605, + "learning_rate": 0.0005864302619216742, + "loss": 3.0678, + "step": 5883 + }, + { + "epoch": 0.29, + "grad_norm": 0.5752025842666626, + "learning_rate": 0.0005864256834374832, + "loss": 3.3008, + "step": 5884 + }, + { + "epoch": 0.29, + "grad_norm": 0.6359145045280457, + "learning_rate": 0.0005864211041989021, + "loss": 3.1086, + "step": 5885 + }, + { + "epoch": 0.29, + "grad_norm": 0.5294399857521057, + "learning_rate": 0.0005864165242059427, + "loss": 3.4525, + "step": 5886 + }, + { + "epoch": 0.29, + "grad_norm": 0.5078052878379822, + "learning_rate": 0.0005864119434586176, + "loss": 3.6026, + "step": 5887 + }, + { + "epoch": 0.29, + "grad_norm": 0.5631575584411621, + "learning_rate": 0.0005864073619569383, + "loss": 3.3792, + "step": 5888 + }, + { + "epoch": 0.29, + "grad_norm": 0.5679768323898315, + "learning_rate": 0.0005864027797009173, + "loss": 3.169, + "step": 5889 + }, + { + "epoch": 0.29, + "grad_norm": 0.5411725044250488, + "learning_rate": 0.0005863981966905665, + "loss": 3.448, + "step": 5890 + }, + { + "epoch": 0.29, + "grad_norm": 0.5465644598007202, + "learning_rate": 0.0005863936129258979, + "loss": 3.3566, + "step": 5891 + }, + { + "epoch": 0.29, + "grad_norm": 0.5652481317520142, + "learning_rate": 0.0005863890284069236, + "loss": 3.1992, + "step": 5892 + }, + { + "epoch": 0.29, + "grad_norm": 0.5486741065979004, + "learning_rate": 0.0005863844431336559, + "loss": 3.1683, + "step": 5893 + }, + { + "epoch": 0.29, + "grad_norm": 0.5372270941734314, + "learning_rate": 0.0005863798571061065, + "loss": 3.3014, + "step": 5894 + }, + { + "epoch": 0.29, + "grad_norm": 0.609737753868103, + "learning_rate": 0.0005863752703242878, + "loss": 3.0947, + "step": 5895 + }, + { + "epoch": 0.29, + "grad_norm": 0.5803235769271851, + "learning_rate": 0.0005863706827882116, + "loss": 3.4442, + "step": 5896 + }, + { + "epoch": 0.29, + "grad_norm": 0.560324490070343, + "learning_rate": 0.0005863660944978904, + "loss": 3.2564, + "step": 5897 + }, + { + "epoch": 0.29, + "grad_norm": 0.532012403011322, + "learning_rate": 0.0005863615054533357, + "loss": 3.5374, + "step": 5898 + }, + { + "epoch": 0.29, + "grad_norm": 0.6232673525810242, + "learning_rate": 0.0005863569156545601, + "loss": 3.4147, + "step": 5899 + }, + { + "epoch": 0.29, + "grad_norm": 0.5468286871910095, + "learning_rate": 0.0005863523251015755, + "loss": 3.4548, + "step": 5900 + }, + { + "epoch": 0.29, + "grad_norm": 0.5706323981285095, + "learning_rate": 0.0005863477337943939, + "loss": 3.3646, + "step": 5901 + }, + { + "epoch": 0.29, + "grad_norm": 0.5574482083320618, + "learning_rate": 0.0005863431417330275, + "loss": 3.5875, + "step": 5902 + }, + { + "epoch": 0.29, + "grad_norm": 0.5878828763961792, + "learning_rate": 0.0005863385489174883, + "loss": 3.4706, + "step": 5903 + }, + { + "epoch": 0.29, + "grad_norm": 0.5340185761451721, + "learning_rate": 0.0005863339553477887, + "loss": 3.5311, + "step": 5904 + }, + { + "epoch": 0.29, + "grad_norm": 0.5711298584938049, + "learning_rate": 0.0005863293610239404, + "loss": 3.2338, + "step": 5905 + }, + { + "epoch": 0.29, + "grad_norm": 0.5839642286300659, + "learning_rate": 0.0005863247659459557, + "loss": 3.3848, + "step": 5906 + }, + { + "epoch": 0.29, + "grad_norm": 0.5418100953102112, + "learning_rate": 0.0005863201701138466, + "loss": 3.4485, + "step": 5907 + }, + { + "epoch": 0.29, + "grad_norm": 0.5416857600212097, + "learning_rate": 0.0005863155735276254, + "loss": 3.3346, + "step": 5908 + }, + { + "epoch": 0.29, + "grad_norm": 0.5420189499855042, + "learning_rate": 0.000586310976187304, + "loss": 3.391, + "step": 5909 + }, + { + "epoch": 0.29, + "grad_norm": 0.5418663024902344, + "learning_rate": 0.0005863063780928946, + "loss": 3.5192, + "step": 5910 + }, + { + "epoch": 0.29, + "grad_norm": 0.5965156555175781, + "learning_rate": 0.0005863017792444092, + "loss": 3.3288, + "step": 5911 + }, + { + "epoch": 0.29, + "grad_norm": 0.5291689038276672, + "learning_rate": 0.0005862971796418603, + "loss": 3.4977, + "step": 5912 + }, + { + "epoch": 0.29, + "grad_norm": 0.5859507322311401, + "learning_rate": 0.0005862925792852595, + "loss": 3.3662, + "step": 5913 + }, + { + "epoch": 0.29, + "grad_norm": 0.6109771728515625, + "learning_rate": 0.0005862879781746192, + "loss": 3.2146, + "step": 5914 + }, + { + "epoch": 0.29, + "grad_norm": 0.5541035532951355, + "learning_rate": 0.0005862833763099515, + "loss": 3.3712, + "step": 5915 + }, + { + "epoch": 0.29, + "grad_norm": 0.5736545324325562, + "learning_rate": 0.0005862787736912684, + "loss": 3.3146, + "step": 5916 + }, + { + "epoch": 0.29, + "grad_norm": 0.6064590215682983, + "learning_rate": 0.0005862741703185822, + "loss": 3.2912, + "step": 5917 + }, + { + "epoch": 0.29, + "grad_norm": 0.5344297885894775, + "learning_rate": 0.0005862695661919048, + "loss": 3.3818, + "step": 5918 + }, + { + "epoch": 0.29, + "grad_norm": 0.5522978901863098, + "learning_rate": 0.0005862649613112486, + "loss": 3.325, + "step": 5919 + }, + { + "epoch": 0.29, + "grad_norm": 0.5202211141586304, + "learning_rate": 0.0005862603556766254, + "loss": 3.3577, + "step": 5920 + }, + { + "epoch": 0.29, + "grad_norm": 0.5692675113677979, + "learning_rate": 0.0005862557492880477, + "loss": 3.6632, + "step": 5921 + }, + { + "epoch": 0.29, + "grad_norm": 0.5373372435569763, + "learning_rate": 0.0005862511421455274, + "loss": 3.3315, + "step": 5922 + }, + { + "epoch": 0.29, + "grad_norm": 0.5706237554550171, + "learning_rate": 0.0005862465342490766, + "loss": 3.3529, + "step": 5923 + }, + { + "epoch": 0.29, + "grad_norm": 0.5713226199150085, + "learning_rate": 0.0005862419255987076, + "loss": 3.1983, + "step": 5924 + }, + { + "epoch": 0.29, + "grad_norm": 0.567156195640564, + "learning_rate": 0.0005862373161944325, + "loss": 3.279, + "step": 5925 + }, + { + "epoch": 0.29, + "grad_norm": 0.5587449669837952, + "learning_rate": 0.0005862327060362633, + "loss": 3.3058, + "step": 5926 + }, + { + "epoch": 0.29, + "grad_norm": 0.5511670112609863, + "learning_rate": 0.0005862280951242123, + "loss": 3.5264, + "step": 5927 + }, + { + "epoch": 0.29, + "grad_norm": 0.5433431267738342, + "learning_rate": 0.0005862234834582916, + "loss": 3.3096, + "step": 5928 + }, + { + "epoch": 0.29, + "grad_norm": 0.5212329030036926, + "learning_rate": 0.0005862188710385131, + "loss": 3.2682, + "step": 5929 + }, + { + "epoch": 0.29, + "grad_norm": 0.5597579479217529, + "learning_rate": 0.0005862142578648895, + "loss": 3.3191, + "step": 5930 + }, + { + "epoch": 0.29, + "grad_norm": 0.5288726687431335, + "learning_rate": 0.0005862096439374325, + "loss": 3.4722, + "step": 5931 + }, + { + "epoch": 0.29, + "grad_norm": 0.5642876029014587, + "learning_rate": 0.0005862050292561544, + "loss": 3.4802, + "step": 5932 + }, + { + "epoch": 0.29, + "grad_norm": 0.5844772458076477, + "learning_rate": 0.0005862004138210673, + "loss": 3.3419, + "step": 5933 + }, + { + "epoch": 0.29, + "grad_norm": 0.52292799949646, + "learning_rate": 0.0005861957976321834, + "loss": 3.49, + "step": 5934 + }, + { + "epoch": 0.29, + "grad_norm": 0.5201006531715393, + "learning_rate": 0.0005861911806895148, + "loss": 3.429, + "step": 5935 + }, + { + "epoch": 0.29, + "grad_norm": 0.5457620024681091, + "learning_rate": 0.0005861865629930738, + "loss": 3.5125, + "step": 5936 + }, + { + "epoch": 0.29, + "grad_norm": 0.5805454254150391, + "learning_rate": 0.0005861819445428724, + "loss": 3.5591, + "step": 5937 + }, + { + "epoch": 0.29, + "grad_norm": 0.5340372323989868, + "learning_rate": 0.0005861773253389228, + "loss": 3.2514, + "step": 5938 + }, + { + "epoch": 0.29, + "grad_norm": 0.5509132146835327, + "learning_rate": 0.0005861727053812373, + "loss": 3.3563, + "step": 5939 + }, + { + "epoch": 0.29, + "grad_norm": 0.5151408314704895, + "learning_rate": 0.0005861680846698279, + "loss": 3.291, + "step": 5940 + }, + { + "epoch": 0.29, + "grad_norm": 0.517645537853241, + "learning_rate": 0.0005861634632047069, + "loss": 3.3734, + "step": 5941 + }, + { + "epoch": 0.29, + "grad_norm": 0.5592371821403503, + "learning_rate": 0.0005861588409858864, + "loss": 3.3977, + "step": 5942 + }, + { + "epoch": 0.29, + "grad_norm": 0.5972470641136169, + "learning_rate": 0.0005861542180133785, + "loss": 3.274, + "step": 5943 + }, + { + "epoch": 0.29, + "grad_norm": 0.5444090366363525, + "learning_rate": 0.0005861495942871955, + "loss": 3.2464, + "step": 5944 + }, + { + "epoch": 0.29, + "grad_norm": 0.5292772054672241, + "learning_rate": 0.0005861449698073497, + "loss": 3.4715, + "step": 5945 + }, + { + "epoch": 0.29, + "grad_norm": 0.5135490298271179, + "learning_rate": 0.0005861403445738529, + "loss": 3.339, + "step": 5946 + }, + { + "epoch": 0.29, + "grad_norm": 0.5963358283042908, + "learning_rate": 0.0005861357185867176, + "loss": 3.3574, + "step": 5947 + }, + { + "epoch": 0.29, + "grad_norm": 0.5650749802589417, + "learning_rate": 0.0005861310918459559, + "loss": 3.2734, + "step": 5948 + }, + { + "epoch": 0.29, + "grad_norm": 0.5786062479019165, + "learning_rate": 0.0005861264643515799, + "loss": 3.3048, + "step": 5949 + }, + { + "epoch": 0.29, + "grad_norm": 0.578070878982544, + "learning_rate": 0.0005861218361036019, + "loss": 3.2631, + "step": 5950 + }, + { + "epoch": 0.29, + "grad_norm": 0.5257975459098816, + "learning_rate": 0.000586117207102034, + "loss": 3.6505, + "step": 5951 + }, + { + "epoch": 0.29, + "grad_norm": 0.5603009462356567, + "learning_rate": 0.0005861125773468886, + "loss": 3.297, + "step": 5952 + }, + { + "epoch": 0.29, + "grad_norm": 0.6115564703941345, + "learning_rate": 0.0005861079468381776, + "loss": 3.466, + "step": 5953 + }, + { + "epoch": 0.29, + "grad_norm": 0.5789349675178528, + "learning_rate": 0.0005861033155759133, + "loss": 3.4272, + "step": 5954 + }, + { + "epoch": 0.29, + "grad_norm": 0.565288245677948, + "learning_rate": 0.000586098683560108, + "loss": 3.5096, + "step": 5955 + }, + { + "epoch": 0.29, + "grad_norm": 0.563439667224884, + "learning_rate": 0.0005860940507907738, + "loss": 3.2744, + "step": 5956 + }, + { + "epoch": 0.29, + "grad_norm": 0.5860108137130737, + "learning_rate": 0.0005860894172679229, + "loss": 3.4212, + "step": 5957 + }, + { + "epoch": 0.29, + "grad_norm": 0.5446553826332092, + "learning_rate": 0.0005860847829915676, + "loss": 3.363, + "step": 5958 + }, + { + "epoch": 0.29, + "grad_norm": 0.5710722208023071, + "learning_rate": 0.0005860801479617201, + "loss": 3.2973, + "step": 5959 + }, + { + "epoch": 0.29, + "grad_norm": 0.5353807806968689, + "learning_rate": 0.0005860755121783924, + "loss": 3.5851, + "step": 5960 + }, + { + "epoch": 0.29, + "grad_norm": 0.5594916939735413, + "learning_rate": 0.0005860708756415969, + "loss": 3.5233, + "step": 5961 + }, + { + "epoch": 0.29, + "grad_norm": 0.5313096642494202, + "learning_rate": 0.0005860662383513458, + "loss": 3.3611, + "step": 5962 + }, + { + "epoch": 0.29, + "grad_norm": 0.5173554420471191, + "learning_rate": 0.0005860616003076514, + "loss": 3.4742, + "step": 5963 + }, + { + "epoch": 0.29, + "grad_norm": 0.5095874071121216, + "learning_rate": 0.0005860569615105256, + "loss": 3.3907, + "step": 5964 + }, + { + "epoch": 0.29, + "grad_norm": 0.5513678193092346, + "learning_rate": 0.0005860523219599809, + "loss": 3.3048, + "step": 5965 + }, + { + "epoch": 0.29, + "grad_norm": 0.5937461853027344, + "learning_rate": 0.0005860476816560294, + "loss": 3.3119, + "step": 5966 + }, + { + "epoch": 0.29, + "grad_norm": 0.5583381056785583, + "learning_rate": 0.0005860430405986833, + "loss": 3.3584, + "step": 5967 + }, + { + "epoch": 0.29, + "grad_norm": 0.5448343753814697, + "learning_rate": 0.000586038398787955, + "loss": 3.5657, + "step": 5968 + }, + { + "epoch": 0.29, + "grad_norm": 0.5197991132736206, + "learning_rate": 0.0005860337562238566, + "loss": 3.3398, + "step": 5969 + }, + { + "epoch": 0.29, + "grad_norm": 0.5560016632080078, + "learning_rate": 0.0005860291129064003, + "loss": 3.2854, + "step": 5970 + }, + { + "epoch": 0.29, + "grad_norm": 0.5649117231369019, + "learning_rate": 0.0005860244688355984, + "loss": 3.3855, + "step": 5971 + }, + { + "epoch": 0.29, + "grad_norm": 0.56504225730896, + "learning_rate": 0.000586019824011463, + "loss": 3.3499, + "step": 5972 + }, + { + "epoch": 0.29, + "grad_norm": 0.5295236706733704, + "learning_rate": 0.0005860151784340065, + "loss": 3.2858, + "step": 5973 + }, + { + "epoch": 0.29, + "grad_norm": 0.5536357164382935, + "learning_rate": 0.0005860105321032411, + "loss": 3.3838, + "step": 5974 + }, + { + "epoch": 0.29, + "grad_norm": 0.5174973011016846, + "learning_rate": 0.0005860058850191789, + "loss": 3.3564, + "step": 5975 + }, + { + "epoch": 0.29, + "grad_norm": 0.5602399110794067, + "learning_rate": 0.0005860012371818324, + "loss": 3.3537, + "step": 5976 + }, + { + "epoch": 0.29, + "grad_norm": 0.5399395823478699, + "learning_rate": 0.0005859965885912135, + "loss": 3.35, + "step": 5977 + }, + { + "epoch": 0.29, + "grad_norm": 0.5447744131088257, + "learning_rate": 0.0005859919392473348, + "loss": 3.2962, + "step": 5978 + }, + { + "epoch": 0.29, + "grad_norm": 0.5593130588531494, + "learning_rate": 0.0005859872891502083, + "loss": 3.3472, + "step": 5979 + }, + { + "epoch": 0.29, + "grad_norm": 0.5140863656997681, + "learning_rate": 0.0005859826382998462, + "loss": 3.303, + "step": 5980 + }, + { + "epoch": 0.29, + "grad_norm": 0.5375498533248901, + "learning_rate": 0.0005859779866962609, + "loss": 3.3374, + "step": 5981 + }, + { + "epoch": 0.29, + "grad_norm": 0.5679370164871216, + "learning_rate": 0.0005859733343394648, + "loss": 3.4518, + "step": 5982 + }, + { + "epoch": 0.29, + "grad_norm": 0.5210931301116943, + "learning_rate": 0.0005859686812294698, + "loss": 3.4724, + "step": 5983 + }, + { + "epoch": 0.29, + "grad_norm": 0.5706584453582764, + "learning_rate": 0.0005859640273662885, + "loss": 3.5289, + "step": 5984 + }, + { + "epoch": 0.29, + "grad_norm": 0.5336747169494629, + "learning_rate": 0.0005859593727499329, + "loss": 3.4766, + "step": 5985 + }, + { + "epoch": 0.29, + "grad_norm": 0.5442331433296204, + "learning_rate": 0.0005859547173804152, + "loss": 3.2875, + "step": 5986 + }, + { + "epoch": 0.29, + "grad_norm": 0.5738904476165771, + "learning_rate": 0.000585950061257748, + "loss": 3.6571, + "step": 5987 + }, + { + "epoch": 0.29, + "grad_norm": 0.7155367136001587, + "learning_rate": 0.0005859454043819433, + "loss": 3.3561, + "step": 5988 + }, + { + "epoch": 0.29, + "grad_norm": 0.526063084602356, + "learning_rate": 0.0005859407467530134, + "loss": 3.3691, + "step": 5989 + }, + { + "epoch": 0.29, + "grad_norm": 0.5534820556640625, + "learning_rate": 0.0005859360883709707, + "loss": 3.3648, + "step": 5990 + }, + { + "epoch": 0.29, + "grad_norm": 0.5640885829925537, + "learning_rate": 0.0005859314292358274, + "loss": 3.6089, + "step": 5991 + }, + { + "epoch": 0.29, + "grad_norm": 0.5343618392944336, + "learning_rate": 0.0005859267693475956, + "loss": 3.5103, + "step": 5992 + }, + { + "epoch": 0.29, + "grad_norm": 0.5363438725471497, + "learning_rate": 0.0005859221087062878, + "loss": 3.4727, + "step": 5993 + }, + { + "epoch": 0.29, + "grad_norm": 0.5537747144699097, + "learning_rate": 0.0005859174473119162, + "loss": 3.1938, + "step": 5994 + }, + { + "epoch": 0.29, + "grad_norm": 0.5269249081611633, + "learning_rate": 0.0005859127851644931, + "loss": 3.1289, + "step": 5995 + }, + { + "epoch": 0.29, + "grad_norm": 0.5200663208961487, + "learning_rate": 0.0005859081222640306, + "loss": 3.3508, + "step": 5996 + }, + { + "epoch": 0.29, + "grad_norm": 0.5903448462486267, + "learning_rate": 0.0005859034586105412, + "loss": 3.3433, + "step": 5997 + }, + { + "epoch": 0.29, + "grad_norm": 0.5499140024185181, + "learning_rate": 0.0005858987942040371, + "loss": 3.4114, + "step": 5998 + }, + { + "epoch": 0.29, + "grad_norm": 0.5757197737693787, + "learning_rate": 0.0005858941290445307, + "loss": 3.3508, + "step": 5999 + }, + { + "epoch": 0.29, + "grad_norm": 0.5636634826660156, + "learning_rate": 0.000585889463132034, + "loss": 3.495, + "step": 6000 + }, + { + "epoch": 0.29, + "grad_norm": 0.5337130427360535, + "learning_rate": 0.0005858847964665598, + "loss": 3.4583, + "step": 6001 + }, + { + "epoch": 0.29, + "grad_norm": 0.5628725290298462, + "learning_rate": 0.0005858801290481197, + "loss": 3.6494, + "step": 6002 + }, + { + "epoch": 0.29, + "grad_norm": 0.5964218974113464, + "learning_rate": 0.0005858754608767266, + "loss": 3.3848, + "step": 6003 + }, + { + "epoch": 0.29, + "grad_norm": 0.5543503165245056, + "learning_rate": 0.0005858707919523924, + "loss": 3.3203, + "step": 6004 + }, + { + "epoch": 0.29, + "grad_norm": 0.5465249419212341, + "learning_rate": 0.0005858661222751297, + "loss": 3.3155, + "step": 6005 + }, + { + "epoch": 0.29, + "grad_norm": 0.5593113303184509, + "learning_rate": 0.0005858614518449506, + "loss": 3.268, + "step": 6006 + }, + { + "epoch": 0.29, + "grad_norm": 0.5494705438613892, + "learning_rate": 0.0005858567806618673, + "loss": 3.4332, + "step": 6007 + }, + { + "epoch": 0.29, + "grad_norm": 0.5505009293556213, + "learning_rate": 0.0005858521087258924, + "loss": 3.4089, + "step": 6008 + }, + { + "epoch": 0.29, + "grad_norm": 0.5421280264854431, + "learning_rate": 0.000585847436037038, + "loss": 3.431, + "step": 6009 + }, + { + "epoch": 0.29, + "grad_norm": 0.5362679362297058, + "learning_rate": 0.0005858427625953166, + "loss": 3.4296, + "step": 6010 + }, + { + "epoch": 0.29, + "grad_norm": 0.5678845047950745, + "learning_rate": 0.0005858380884007403, + "loss": 3.2674, + "step": 6011 + }, + { + "epoch": 0.29, + "grad_norm": 0.5496119856834412, + "learning_rate": 0.0005858334134533214, + "loss": 3.5641, + "step": 6012 + }, + { + "epoch": 0.29, + "grad_norm": 0.5399487018585205, + "learning_rate": 0.0005858287377530723, + "loss": 3.2081, + "step": 6013 + }, + { + "epoch": 0.29, + "grad_norm": 0.5608866214752197, + "learning_rate": 0.0005858240613000054, + "loss": 3.3, + "step": 6014 + }, + { + "epoch": 0.29, + "grad_norm": 0.5753394365310669, + "learning_rate": 0.0005858193840941329, + "loss": 3.3895, + "step": 6015 + }, + { + "epoch": 0.29, + "grad_norm": 0.545498788356781, + "learning_rate": 0.0005858147061354672, + "loss": 3.5889, + "step": 6016 + }, + { + "epoch": 0.29, + "grad_norm": 0.5470389723777771, + "learning_rate": 0.0005858100274240205, + "loss": 3.4151, + "step": 6017 + }, + { + "epoch": 0.29, + "grad_norm": 0.5118675827980042, + "learning_rate": 0.0005858053479598053, + "loss": 3.2757, + "step": 6018 + }, + { + "epoch": 0.29, + "grad_norm": 0.520729660987854, + "learning_rate": 0.0005858006677428337, + "loss": 3.4047, + "step": 6019 + }, + { + "epoch": 0.3, + "grad_norm": 0.5678765773773193, + "learning_rate": 0.0005857959867731181, + "loss": 3.2805, + "step": 6020 + }, + { + "epoch": 0.3, + "grad_norm": 0.5853541493415833, + "learning_rate": 0.000585791305050671, + "loss": 3.3365, + "step": 6021 + }, + { + "epoch": 0.3, + "grad_norm": 0.5466600656509399, + "learning_rate": 0.0005857866225755045, + "loss": 3.2846, + "step": 6022 + }, + { + "epoch": 0.3, + "grad_norm": 0.5336818099021912, + "learning_rate": 0.0005857819393476312, + "loss": 3.4511, + "step": 6023 + }, + { + "epoch": 0.3, + "grad_norm": 0.5479405522346497, + "learning_rate": 0.0005857772553670631, + "loss": 3.2949, + "step": 6024 + }, + { + "epoch": 0.3, + "grad_norm": 0.5725945830345154, + "learning_rate": 0.0005857725706338129, + "loss": 3.2428, + "step": 6025 + }, + { + "epoch": 0.3, + "grad_norm": 0.6460245847702026, + "learning_rate": 0.0005857678851478925, + "loss": 3.3048, + "step": 6026 + }, + { + "epoch": 0.3, + "grad_norm": 0.5461933016777039, + "learning_rate": 0.0005857631989093147, + "loss": 3.1917, + "step": 6027 + }, + { + "epoch": 0.3, + "grad_norm": 0.615522563457489, + "learning_rate": 0.0005857585119180915, + "loss": 3.2753, + "step": 6028 + }, + { + "epoch": 0.3, + "grad_norm": 0.5617178678512573, + "learning_rate": 0.0005857538241742354, + "loss": 3.2788, + "step": 6029 + }, + { + "epoch": 0.3, + "grad_norm": 0.5808992981910706, + "learning_rate": 0.0005857491356777587, + "loss": 3.4891, + "step": 6030 + }, + { + "epoch": 0.3, + "grad_norm": 0.49388301372528076, + "learning_rate": 0.0005857444464286739, + "loss": 3.6251, + "step": 6031 + }, + { + "epoch": 0.3, + "grad_norm": 0.5432161688804626, + "learning_rate": 0.0005857397564269931, + "loss": 3.35, + "step": 6032 + }, + { + "epoch": 0.3, + "grad_norm": 0.62075275182724, + "learning_rate": 0.0005857350656727289, + "loss": 3.3793, + "step": 6033 + }, + { + "epoch": 0.3, + "grad_norm": 0.5460484027862549, + "learning_rate": 0.0005857303741658933, + "loss": 3.3715, + "step": 6034 + }, + { + "epoch": 0.3, + "grad_norm": 0.5340959429740906, + "learning_rate": 0.0005857256819064991, + "loss": 3.2426, + "step": 6035 + }, + { + "epoch": 0.3, + "grad_norm": 0.547926127910614, + "learning_rate": 0.0005857209888945583, + "loss": 3.6102, + "step": 6036 + }, + { + "epoch": 0.3, + "grad_norm": 0.540779709815979, + "learning_rate": 0.0005857162951300835, + "loss": 3.4833, + "step": 6037 + }, + { + "epoch": 0.3, + "grad_norm": 0.5335472822189331, + "learning_rate": 0.0005857116006130869, + "loss": 3.438, + "step": 6038 + }, + { + "epoch": 0.3, + "grad_norm": 0.5480554699897766, + "learning_rate": 0.0005857069053435809, + "loss": 3.4828, + "step": 6039 + }, + { + "epoch": 0.3, + "grad_norm": 0.5643362998962402, + "learning_rate": 0.000585702209321578, + "loss": 3.3695, + "step": 6040 + }, + { + "epoch": 0.3, + "grad_norm": 0.5678135752677917, + "learning_rate": 0.0005856975125470904, + "loss": 3.3233, + "step": 6041 + }, + { + "epoch": 0.3, + "grad_norm": 0.5982794165611267, + "learning_rate": 0.0005856928150201306, + "loss": 3.4825, + "step": 6042 + }, + { + "epoch": 0.3, + "grad_norm": 0.5603371858596802, + "learning_rate": 0.0005856881167407109, + "loss": 3.4292, + "step": 6043 + }, + { + "epoch": 0.3, + "grad_norm": 0.5519452691078186, + "learning_rate": 0.0005856834177088436, + "loss": 3.6237, + "step": 6044 + }, + { + "epoch": 0.3, + "grad_norm": 0.5269837379455566, + "learning_rate": 0.0005856787179245412, + "loss": 3.5085, + "step": 6045 + }, + { + "epoch": 0.3, + "grad_norm": 0.5146785378456116, + "learning_rate": 0.000585674017387816, + "loss": 3.3698, + "step": 6046 + }, + { + "epoch": 0.3, + "grad_norm": 0.5637503862380981, + "learning_rate": 0.0005856693160986805, + "loss": 3.2, + "step": 6047 + }, + { + "epoch": 0.3, + "grad_norm": 0.5364857316017151, + "learning_rate": 0.000585664614057147, + "loss": 3.4152, + "step": 6048 + }, + { + "epoch": 0.3, + "grad_norm": 0.562177300453186, + "learning_rate": 0.0005856599112632277, + "loss": 3.415, + "step": 6049 + }, + { + "epoch": 0.3, + "grad_norm": 0.548433780670166, + "learning_rate": 0.0005856552077169354, + "loss": 3.2522, + "step": 6050 + }, + { + "epoch": 0.3, + "grad_norm": 0.5104949474334717, + "learning_rate": 0.0005856505034182822, + "loss": 3.5135, + "step": 6051 + }, + { + "epoch": 0.3, + "grad_norm": 0.5480595231056213, + "learning_rate": 0.0005856457983672805, + "loss": 3.6652, + "step": 6052 + }, + { + "epoch": 0.3, + "grad_norm": 0.5037899017333984, + "learning_rate": 0.0005856410925639428, + "loss": 3.4469, + "step": 6053 + }, + { + "epoch": 0.3, + "grad_norm": 0.5276090502738953, + "learning_rate": 0.0005856363860082813, + "loss": 3.3164, + "step": 6054 + }, + { + "epoch": 0.3, + "grad_norm": 0.5726231336593628, + "learning_rate": 0.0005856316787003086, + "loss": 3.4393, + "step": 6055 + }, + { + "epoch": 0.3, + "grad_norm": 0.530457079410553, + "learning_rate": 0.0005856269706400371, + "loss": 3.3204, + "step": 6056 + }, + { + "epoch": 0.3, + "grad_norm": 0.5776810050010681, + "learning_rate": 0.000585622261827479, + "loss": 3.0907, + "step": 6057 + }, + { + "epoch": 0.3, + "grad_norm": 0.5454444289207458, + "learning_rate": 0.000585617552262647, + "loss": 3.1768, + "step": 6058 + }, + { + "epoch": 0.3, + "grad_norm": 0.5331177115440369, + "learning_rate": 0.0005856128419455532, + "loss": 3.5736, + "step": 6059 + }, + { + "epoch": 0.3, + "grad_norm": 0.5558076500892639, + "learning_rate": 0.0005856081308762102, + "loss": 3.5581, + "step": 6060 + }, + { + "epoch": 0.3, + "grad_norm": 0.6561071872711182, + "learning_rate": 0.0005856034190546304, + "loss": 3.4186, + "step": 6061 + }, + { + "epoch": 0.3, + "grad_norm": 0.5517882704734802, + "learning_rate": 0.000585598706480826, + "loss": 3.2384, + "step": 6062 + }, + { + "epoch": 0.3, + "grad_norm": 0.5598825812339783, + "learning_rate": 0.0005855939931548097, + "loss": 3.2194, + "step": 6063 + }, + { + "epoch": 0.3, + "grad_norm": 0.508906900882721, + "learning_rate": 0.0005855892790765937, + "loss": 3.4494, + "step": 6064 + }, + { + "epoch": 0.3, + "grad_norm": 0.5944477319717407, + "learning_rate": 0.0005855845642461907, + "loss": 3.4631, + "step": 6065 + }, + { + "epoch": 0.3, + "grad_norm": 0.5435997247695923, + "learning_rate": 0.0005855798486636127, + "loss": 3.2613, + "step": 6066 + }, + { + "epoch": 0.3, + "grad_norm": 0.5678590536117554, + "learning_rate": 0.0005855751323288724, + "loss": 3.4153, + "step": 6067 + }, + { + "epoch": 0.3, + "grad_norm": 0.5375375151634216, + "learning_rate": 0.0005855704152419822, + "loss": 3.378, + "step": 6068 + }, + { + "epoch": 0.3, + "grad_norm": 0.5496307611465454, + "learning_rate": 0.0005855656974029544, + "loss": 3.4264, + "step": 6069 + }, + { + "epoch": 0.3, + "grad_norm": 0.6323420405387878, + "learning_rate": 0.0005855609788118017, + "loss": 3.4229, + "step": 6070 + }, + { + "epoch": 0.3, + "grad_norm": 0.5859168171882629, + "learning_rate": 0.0005855562594685362, + "loss": 3.4765, + "step": 6071 + }, + { + "epoch": 0.3, + "grad_norm": 0.5254231095314026, + "learning_rate": 0.0005855515393731704, + "loss": 3.326, + "step": 6072 + }, + { + "epoch": 0.3, + "grad_norm": 0.49722519516944885, + "learning_rate": 0.0005855468185257169, + "loss": 3.4279, + "step": 6073 + }, + { + "epoch": 0.3, + "grad_norm": 0.5665181279182434, + "learning_rate": 0.0005855420969261881, + "loss": 3.4913, + "step": 6074 + }, + { + "epoch": 0.3, + "grad_norm": 0.5734388828277588, + "learning_rate": 0.0005855373745745962, + "loss": 3.1717, + "step": 6075 + }, + { + "epoch": 0.3, + "grad_norm": 0.5337647795677185, + "learning_rate": 0.0005855326514709539, + "loss": 3.2627, + "step": 6076 + }, + { + "epoch": 0.3, + "grad_norm": 0.532321572303772, + "learning_rate": 0.0005855279276152736, + "loss": 3.3021, + "step": 6077 + }, + { + "epoch": 0.3, + "grad_norm": 0.5565383434295654, + "learning_rate": 0.0005855232030075677, + "loss": 3.3376, + "step": 6078 + }, + { + "epoch": 0.3, + "grad_norm": 0.5236389636993408, + "learning_rate": 0.0005855184776478485, + "loss": 3.5663, + "step": 6079 + }, + { + "epoch": 0.3, + "grad_norm": 0.5841774940490723, + "learning_rate": 0.0005855137515361286, + "loss": 3.2004, + "step": 6080 + }, + { + "epoch": 0.3, + "grad_norm": 0.5455382466316223, + "learning_rate": 0.0005855090246724205, + "loss": 3.3466, + "step": 6081 + }, + { + "epoch": 0.3, + "grad_norm": 0.6889823079109192, + "learning_rate": 0.0005855042970567366, + "loss": 3.4259, + "step": 6082 + }, + { + "epoch": 0.3, + "grad_norm": 0.5753301978111267, + "learning_rate": 0.0005854995686890893, + "loss": 3.2617, + "step": 6083 + }, + { + "epoch": 0.3, + "grad_norm": 0.5561864972114563, + "learning_rate": 0.0005854948395694911, + "loss": 3.2474, + "step": 6084 + }, + { + "epoch": 0.3, + "grad_norm": 0.5513240694999695, + "learning_rate": 0.0005854901096979543, + "loss": 3.2588, + "step": 6085 + }, + { + "epoch": 0.3, + "grad_norm": 0.5519996285438538, + "learning_rate": 0.0005854853790744917, + "loss": 3.4927, + "step": 6086 + }, + { + "epoch": 0.3, + "grad_norm": 0.5103303790092468, + "learning_rate": 0.0005854806476991154, + "loss": 3.6893, + "step": 6087 + }, + { + "epoch": 0.3, + "grad_norm": 0.5609255433082581, + "learning_rate": 0.000585475915571838, + "loss": 3.2162, + "step": 6088 + }, + { + "epoch": 0.3, + "grad_norm": 0.5276150703430176, + "learning_rate": 0.000585471182692672, + "loss": 3.3367, + "step": 6089 + }, + { + "epoch": 0.3, + "grad_norm": 0.5595441460609436, + "learning_rate": 0.00058546644906163, + "loss": 3.4075, + "step": 6090 + }, + { + "epoch": 0.3, + "grad_norm": 0.5784932971000671, + "learning_rate": 0.0005854617146787242, + "loss": 3.2866, + "step": 6091 + }, + { + "epoch": 0.3, + "grad_norm": 0.5267335176467896, + "learning_rate": 0.000585456979543967, + "loss": 3.5419, + "step": 6092 + }, + { + "epoch": 0.3, + "grad_norm": 0.5266568064689636, + "learning_rate": 0.0005854522436573714, + "loss": 3.2447, + "step": 6093 + }, + { + "epoch": 0.3, + "grad_norm": 0.5461187958717346, + "learning_rate": 0.0005854475070189493, + "loss": 3.4585, + "step": 6094 + }, + { + "epoch": 0.3, + "grad_norm": 0.594099760055542, + "learning_rate": 0.0005854427696287134, + "loss": 3.42, + "step": 6095 + }, + { + "epoch": 0.3, + "grad_norm": 0.5209415555000305, + "learning_rate": 0.0005854380314866763, + "loss": 3.6037, + "step": 6096 + }, + { + "epoch": 0.3, + "grad_norm": 0.5774303078651428, + "learning_rate": 0.0005854332925928503, + "loss": 3.5351, + "step": 6097 + }, + { + "epoch": 0.3, + "grad_norm": 0.5695366263389587, + "learning_rate": 0.000585428552947248, + "loss": 3.3669, + "step": 6098 + }, + { + "epoch": 0.3, + "grad_norm": 0.5497203469276428, + "learning_rate": 0.0005854238125498817, + "loss": 3.3797, + "step": 6099 + }, + { + "epoch": 0.3, + "grad_norm": 0.5589160323143005, + "learning_rate": 0.0005854190714007641, + "loss": 3.3957, + "step": 6100 + }, + { + "epoch": 0.3, + "grad_norm": 0.8883815407752991, + "learning_rate": 0.0005854143294999075, + "loss": 3.3728, + "step": 6101 + }, + { + "epoch": 0.3, + "grad_norm": 0.5451207756996155, + "learning_rate": 0.0005854095868473246, + "loss": 3.6537, + "step": 6102 + }, + { + "epoch": 0.3, + "grad_norm": 0.5537768006324768, + "learning_rate": 0.0005854048434430277, + "loss": 3.3309, + "step": 6103 + }, + { + "epoch": 0.3, + "grad_norm": 0.5225555896759033, + "learning_rate": 0.0005854000992870295, + "loss": 3.4239, + "step": 6104 + }, + { + "epoch": 0.3, + "grad_norm": 0.5561574101448059, + "learning_rate": 0.0005853953543793422, + "loss": 3.3518, + "step": 6105 + }, + { + "epoch": 0.3, + "grad_norm": 0.5772457122802734, + "learning_rate": 0.0005853906087199785, + "loss": 3.5618, + "step": 6106 + }, + { + "epoch": 0.3, + "grad_norm": 0.5690733194351196, + "learning_rate": 0.0005853858623089509, + "loss": 3.2823, + "step": 6107 + }, + { + "epoch": 0.3, + "grad_norm": 0.5398492813110352, + "learning_rate": 0.0005853811151462719, + "loss": 3.4709, + "step": 6108 + }, + { + "epoch": 0.3, + "grad_norm": 0.5477753281593323, + "learning_rate": 0.0005853763672319538, + "loss": 3.4316, + "step": 6109 + }, + { + "epoch": 0.3, + "grad_norm": 0.5604354739189148, + "learning_rate": 0.0005853716185660095, + "loss": 3.4123, + "step": 6110 + }, + { + "epoch": 0.3, + "grad_norm": 0.5720632076263428, + "learning_rate": 0.0005853668691484512, + "loss": 3.3442, + "step": 6111 + }, + { + "epoch": 0.3, + "grad_norm": 0.5532767176628113, + "learning_rate": 0.0005853621189792913, + "loss": 3.2747, + "step": 6112 + }, + { + "epoch": 0.3, + "grad_norm": 0.5390136241912842, + "learning_rate": 0.0005853573680585427, + "loss": 3.2519, + "step": 6113 + }, + { + "epoch": 0.3, + "grad_norm": 0.5394711494445801, + "learning_rate": 0.0005853526163862177, + "loss": 3.4465, + "step": 6114 + }, + { + "epoch": 0.3, + "grad_norm": 0.5646769404411316, + "learning_rate": 0.0005853478639623287, + "loss": 3.2085, + "step": 6115 + }, + { + "epoch": 0.3, + "grad_norm": 0.5330178737640381, + "learning_rate": 0.0005853431107868886, + "loss": 3.5491, + "step": 6116 + }, + { + "epoch": 0.3, + "grad_norm": 0.554105281829834, + "learning_rate": 0.0005853383568599094, + "loss": 3.7143, + "step": 6117 + }, + { + "epoch": 0.3, + "grad_norm": 0.5287169218063354, + "learning_rate": 0.000585333602181404, + "loss": 3.3217, + "step": 6118 + }, + { + "epoch": 0.3, + "grad_norm": 0.5341808199882507, + "learning_rate": 0.0005853288467513848, + "loss": 3.2725, + "step": 6119 + }, + { + "epoch": 0.3, + "grad_norm": 0.5601862072944641, + "learning_rate": 0.0005853240905698642, + "loss": 3.3502, + "step": 6120 + }, + { + "epoch": 0.3, + "grad_norm": 0.58427894115448, + "learning_rate": 0.0005853193336368551, + "loss": 3.4909, + "step": 6121 + }, + { + "epoch": 0.3, + "grad_norm": 0.523365318775177, + "learning_rate": 0.0005853145759523695, + "loss": 3.47, + "step": 6122 + }, + { + "epoch": 0.3, + "grad_norm": 0.5647141337394714, + "learning_rate": 0.0005853098175164204, + "loss": 3.2372, + "step": 6123 + }, + { + "epoch": 0.3, + "grad_norm": 0.5395342111587524, + "learning_rate": 0.0005853050583290202, + "loss": 3.4103, + "step": 6124 + }, + { + "epoch": 0.3, + "grad_norm": 0.5265369415283203, + "learning_rate": 0.0005853002983901812, + "loss": 3.3473, + "step": 6125 + }, + { + "epoch": 0.3, + "grad_norm": 0.5595860481262207, + "learning_rate": 0.0005852955376999163, + "loss": 3.4987, + "step": 6126 + }, + { + "epoch": 0.3, + "grad_norm": 0.6001688241958618, + "learning_rate": 0.0005852907762582377, + "loss": 3.3633, + "step": 6127 + }, + { + "epoch": 0.3, + "grad_norm": 0.517382800579071, + "learning_rate": 0.0005852860140651583, + "loss": 3.3587, + "step": 6128 + }, + { + "epoch": 0.3, + "grad_norm": 0.5585851669311523, + "learning_rate": 0.0005852812511206902, + "loss": 3.4912, + "step": 6129 + }, + { + "epoch": 0.3, + "grad_norm": 0.5613790154457092, + "learning_rate": 0.0005852764874248464, + "loss": 3.2364, + "step": 6130 + }, + { + "epoch": 0.3, + "grad_norm": 0.5714898109436035, + "learning_rate": 0.0005852717229776392, + "loss": 3.3002, + "step": 6131 + }, + { + "epoch": 0.3, + "grad_norm": 0.59192955493927, + "learning_rate": 0.000585266957779081, + "loss": 3.3714, + "step": 6132 + }, + { + "epoch": 0.3, + "grad_norm": 0.5501763820648193, + "learning_rate": 0.0005852621918291846, + "loss": 3.2259, + "step": 6133 + }, + { + "epoch": 0.3, + "grad_norm": 0.5687995553016663, + "learning_rate": 0.0005852574251279626, + "loss": 3.3774, + "step": 6134 + }, + { + "epoch": 0.3, + "grad_norm": 0.5306016802787781, + "learning_rate": 0.0005852526576754274, + "loss": 3.319, + "step": 6135 + }, + { + "epoch": 0.3, + "grad_norm": 0.5609244704246521, + "learning_rate": 0.0005852478894715917, + "loss": 3.363, + "step": 6136 + }, + { + "epoch": 0.3, + "grad_norm": 0.5303353667259216, + "learning_rate": 0.0005852431205164678, + "loss": 3.3597, + "step": 6137 + }, + { + "epoch": 0.3, + "grad_norm": 0.5196649432182312, + "learning_rate": 0.0005852383508100685, + "loss": 3.3383, + "step": 6138 + }, + { + "epoch": 0.3, + "grad_norm": 0.5467498302459717, + "learning_rate": 0.0005852335803524062, + "loss": 3.4312, + "step": 6139 + }, + { + "epoch": 0.3, + "grad_norm": 0.5493993163108826, + "learning_rate": 0.0005852288091434936, + "loss": 3.209, + "step": 6140 + }, + { + "epoch": 0.3, + "grad_norm": 0.534140944480896, + "learning_rate": 0.0005852240371833432, + "loss": 3.3898, + "step": 6141 + }, + { + "epoch": 0.3, + "grad_norm": 0.5165014863014221, + "learning_rate": 0.0005852192644719675, + "loss": 3.1214, + "step": 6142 + }, + { + "epoch": 0.3, + "grad_norm": 0.6956821084022522, + "learning_rate": 0.0005852144910093792, + "loss": 3.2089, + "step": 6143 + }, + { + "epoch": 0.3, + "grad_norm": 0.544562816619873, + "learning_rate": 0.0005852097167955909, + "loss": 3.3128, + "step": 6144 + }, + { + "epoch": 0.3, + "grad_norm": 0.6112951040267944, + "learning_rate": 0.0005852049418306151, + "loss": 3.3378, + "step": 6145 + }, + { + "epoch": 0.3, + "grad_norm": 0.545120358467102, + "learning_rate": 0.0005852001661144643, + "loss": 3.4138, + "step": 6146 + }, + { + "epoch": 0.3, + "grad_norm": 0.5408572554588318, + "learning_rate": 0.0005851953896471512, + "loss": 3.3295, + "step": 6147 + }, + { + "epoch": 0.3, + "grad_norm": 0.5871040225028992, + "learning_rate": 0.0005851906124286882, + "loss": 3.3294, + "step": 6148 + }, + { + "epoch": 0.3, + "grad_norm": 0.5361684560775757, + "learning_rate": 0.0005851858344590881, + "loss": 3.4245, + "step": 6149 + }, + { + "epoch": 0.3, + "grad_norm": 0.541067361831665, + "learning_rate": 0.0005851810557383634, + "loss": 3.3993, + "step": 6150 + }, + { + "epoch": 0.3, + "grad_norm": 0.5347387194633484, + "learning_rate": 0.0005851762762665267, + "loss": 3.1216, + "step": 6151 + }, + { + "epoch": 0.3, + "grad_norm": 0.5603194236755371, + "learning_rate": 0.0005851714960435906, + "loss": 3.416, + "step": 6152 + }, + { + "epoch": 0.3, + "grad_norm": 0.7698899507522583, + "learning_rate": 0.0005851667150695676, + "loss": 3.2229, + "step": 6153 + }, + { + "epoch": 0.3, + "grad_norm": 0.5307655930519104, + "learning_rate": 0.0005851619333444703, + "loss": 3.2918, + "step": 6154 + }, + { + "epoch": 0.3, + "grad_norm": 0.5312530994415283, + "learning_rate": 0.0005851571508683115, + "loss": 3.5303, + "step": 6155 + }, + { + "epoch": 0.3, + "grad_norm": 0.5486868619918823, + "learning_rate": 0.0005851523676411036, + "loss": 3.3959, + "step": 6156 + }, + { + "epoch": 0.3, + "grad_norm": 0.5212375521659851, + "learning_rate": 0.0005851475836628591, + "loss": 3.2698, + "step": 6157 + }, + { + "epoch": 0.3, + "grad_norm": 0.5626296997070312, + "learning_rate": 0.0005851427989335909, + "loss": 3.4387, + "step": 6158 + }, + { + "epoch": 0.3, + "grad_norm": 0.5545399785041809, + "learning_rate": 0.0005851380134533114, + "loss": 3.3047, + "step": 6159 + }, + { + "epoch": 0.3, + "grad_norm": 0.5794121026992798, + "learning_rate": 0.0005851332272220332, + "loss": 3.3047, + "step": 6160 + }, + { + "epoch": 0.3, + "grad_norm": 0.6324692964553833, + "learning_rate": 0.0005851284402397691, + "loss": 3.3857, + "step": 6161 + }, + { + "epoch": 0.3, + "grad_norm": 0.530517041683197, + "learning_rate": 0.0005851236525065314, + "loss": 3.4357, + "step": 6162 + }, + { + "epoch": 0.3, + "grad_norm": 0.5403311252593994, + "learning_rate": 0.000585118864022333, + "loss": 3.3202, + "step": 6163 + }, + { + "epoch": 0.3, + "grad_norm": 0.7685026526451111, + "learning_rate": 0.0005851140747871863, + "loss": 3.2808, + "step": 6164 + }, + { + "epoch": 0.3, + "grad_norm": 0.6147613525390625, + "learning_rate": 0.0005851092848011039, + "loss": 3.5743, + "step": 6165 + }, + { + "epoch": 0.3, + "grad_norm": 0.5476180911064148, + "learning_rate": 0.0005851044940640987, + "loss": 3.6459, + "step": 6166 + }, + { + "epoch": 0.3, + "grad_norm": 0.5747238397598267, + "learning_rate": 0.000585099702576183, + "loss": 3.4143, + "step": 6167 + }, + { + "epoch": 0.3, + "grad_norm": 0.565980076789856, + "learning_rate": 0.0005850949103373697, + "loss": 3.3223, + "step": 6168 + }, + { + "epoch": 0.3, + "grad_norm": 0.5255545973777771, + "learning_rate": 0.0005850901173476712, + "loss": 3.2993, + "step": 6169 + }, + { + "epoch": 0.3, + "grad_norm": 0.5456163883209229, + "learning_rate": 0.0005850853236071003, + "loss": 3.4025, + "step": 6170 + }, + { + "epoch": 0.3, + "grad_norm": 0.5844244956970215, + "learning_rate": 0.0005850805291156692, + "loss": 3.6342, + "step": 6171 + }, + { + "epoch": 0.3, + "grad_norm": 0.5095750689506531, + "learning_rate": 0.0005850757338733911, + "loss": 3.3355, + "step": 6172 + }, + { + "epoch": 0.3, + "grad_norm": 0.6050037741661072, + "learning_rate": 0.0005850709378802785, + "loss": 3.4111, + "step": 6173 + }, + { + "epoch": 0.3, + "grad_norm": 0.5732804536819458, + "learning_rate": 0.0005850661411363437, + "loss": 3.5586, + "step": 6174 + }, + { + "epoch": 0.3, + "grad_norm": 0.5283846855163574, + "learning_rate": 0.0005850613436415996, + "loss": 3.237, + "step": 6175 + }, + { + "epoch": 0.3, + "grad_norm": 0.5460174083709717, + "learning_rate": 0.0005850565453960586, + "loss": 3.171, + "step": 6176 + }, + { + "epoch": 0.3, + "grad_norm": 0.5901962518692017, + "learning_rate": 0.0005850517463997339, + "loss": 3.3324, + "step": 6177 + }, + { + "epoch": 0.3, + "grad_norm": 0.5234120488166809, + "learning_rate": 0.0005850469466526376, + "loss": 3.4535, + "step": 6178 + }, + { + "epoch": 0.3, + "grad_norm": 0.5157477855682373, + "learning_rate": 0.0005850421461547823, + "loss": 3.4048, + "step": 6179 + }, + { + "epoch": 0.3, + "grad_norm": 0.518635094165802, + "learning_rate": 0.000585037344906181, + "loss": 3.3394, + "step": 6180 + }, + { + "epoch": 0.3, + "grad_norm": 0.5400442481040955, + "learning_rate": 0.0005850325429068462, + "loss": 3.3305, + "step": 6181 + }, + { + "epoch": 0.3, + "grad_norm": 0.5420893430709839, + "learning_rate": 0.0005850277401567906, + "loss": 3.4494, + "step": 6182 + }, + { + "epoch": 0.3, + "grad_norm": 0.5632343292236328, + "learning_rate": 0.0005850229366560268, + "loss": 3.411, + "step": 6183 + }, + { + "epoch": 0.3, + "grad_norm": 0.5508242249488831, + "learning_rate": 0.0005850181324045673, + "loss": 3.1892, + "step": 6184 + }, + { + "epoch": 0.3, + "grad_norm": 0.5260775089263916, + "learning_rate": 0.0005850133274024249, + "loss": 3.0536, + "step": 6185 + }, + { + "epoch": 0.3, + "grad_norm": 0.5268356800079346, + "learning_rate": 0.0005850085216496123, + "loss": 3.4394, + "step": 6186 + }, + { + "epoch": 0.3, + "grad_norm": 0.4986163079738617, + "learning_rate": 0.0005850037151461421, + "loss": 3.4986, + "step": 6187 + }, + { + "epoch": 0.3, + "grad_norm": 0.5085906386375427, + "learning_rate": 0.0005849989078920271, + "loss": 3.6332, + "step": 6188 + }, + { + "epoch": 0.3, + "grad_norm": 0.5513754487037659, + "learning_rate": 0.0005849940998872796, + "loss": 3.1801, + "step": 6189 + }, + { + "epoch": 0.3, + "grad_norm": 0.5337885022163391, + "learning_rate": 0.0005849892911319126, + "loss": 3.3828, + "step": 6190 + }, + { + "epoch": 0.3, + "grad_norm": 0.5386898517608643, + "learning_rate": 0.0005849844816259387, + "loss": 3.4224, + "step": 6191 + }, + { + "epoch": 0.3, + "grad_norm": 0.5503227710723877, + "learning_rate": 0.0005849796713693704, + "loss": 3.2951, + "step": 6192 + }, + { + "epoch": 0.3, + "grad_norm": 0.5732369422912598, + "learning_rate": 0.0005849748603622205, + "loss": 3.4267, + "step": 6193 + }, + { + "epoch": 0.3, + "grad_norm": 0.5297451615333557, + "learning_rate": 0.0005849700486045018, + "loss": 3.3835, + "step": 6194 + }, + { + "epoch": 0.3, + "grad_norm": 0.5413727760314941, + "learning_rate": 0.0005849652360962268, + "loss": 3.3712, + "step": 6195 + }, + { + "epoch": 0.3, + "grad_norm": 0.5539155006408691, + "learning_rate": 0.000584960422837408, + "loss": 3.3727, + "step": 6196 + }, + { + "epoch": 0.3, + "grad_norm": 0.5665326118469238, + "learning_rate": 0.0005849556088280585, + "loss": 3.3948, + "step": 6197 + }, + { + "epoch": 0.3, + "grad_norm": 0.5332841873168945, + "learning_rate": 0.0005849507940681907, + "loss": 3.5232, + "step": 6198 + }, + { + "epoch": 0.3, + "grad_norm": 0.5573357939720154, + "learning_rate": 0.0005849459785578174, + "loss": 3.3837, + "step": 6199 + }, + { + "epoch": 0.3, + "grad_norm": 0.5696620345115662, + "learning_rate": 0.0005849411622969511, + "loss": 3.4472, + "step": 6200 + }, + { + "epoch": 0.3, + "grad_norm": 0.5535147190093994, + "learning_rate": 0.0005849363452856048, + "loss": 3.5722, + "step": 6201 + }, + { + "epoch": 0.3, + "grad_norm": 0.5474180579185486, + "learning_rate": 0.0005849315275237908, + "loss": 3.2806, + "step": 6202 + }, + { + "epoch": 0.3, + "grad_norm": 0.519481360912323, + "learning_rate": 0.0005849267090115222, + "loss": 3.3679, + "step": 6203 + }, + { + "epoch": 0.3, + "grad_norm": 0.5893501043319702, + "learning_rate": 0.0005849218897488115, + "loss": 3.3842, + "step": 6204 + }, + { + "epoch": 0.3, + "grad_norm": 0.5267314910888672, + "learning_rate": 0.0005849170697356711, + "loss": 3.5745, + "step": 6205 + }, + { + "epoch": 0.3, + "grad_norm": 0.5750004649162292, + "learning_rate": 0.0005849122489721142, + "loss": 3.2855, + "step": 6206 + }, + { + "epoch": 0.3, + "grad_norm": 0.5196614265441895, + "learning_rate": 0.0005849074274581531, + "loss": 3.4182, + "step": 6207 + }, + { + "epoch": 0.3, + "grad_norm": 0.5150504112243652, + "learning_rate": 0.0005849026051938009, + "loss": 3.5894, + "step": 6208 + }, + { + "epoch": 0.3, + "grad_norm": 0.532528817653656, + "learning_rate": 0.0005848977821790699, + "loss": 3.3033, + "step": 6209 + }, + { + "epoch": 0.3, + "grad_norm": 0.63649982213974, + "learning_rate": 0.0005848929584139731, + "loss": 3.3009, + "step": 6210 + }, + { + "epoch": 0.3, + "grad_norm": 0.5199731588363647, + "learning_rate": 0.0005848881338985229, + "loss": 3.6216, + "step": 6211 + }, + { + "epoch": 0.3, + "grad_norm": 0.5677109956741333, + "learning_rate": 0.0005848833086327323, + "loss": 3.5486, + "step": 6212 + }, + { + "epoch": 0.3, + "grad_norm": 0.5644501447677612, + "learning_rate": 0.0005848784826166139, + "loss": 3.2385, + "step": 6213 + }, + { + "epoch": 0.3, + "grad_norm": 0.6257779002189636, + "learning_rate": 0.0005848736558501804, + "loss": 3.3152, + "step": 6214 + }, + { + "epoch": 0.3, + "grad_norm": 0.6935915350914001, + "learning_rate": 0.0005848688283334445, + "loss": 3.4666, + "step": 6215 + }, + { + "epoch": 0.3, + "grad_norm": 0.5401797890663147, + "learning_rate": 0.0005848640000664188, + "loss": 3.4822, + "step": 6216 + }, + { + "epoch": 0.3, + "grad_norm": 0.5422528982162476, + "learning_rate": 0.0005848591710491164, + "loss": 3.32, + "step": 6217 + }, + { + "epoch": 0.3, + "grad_norm": 0.5878642201423645, + "learning_rate": 0.0005848543412815496, + "loss": 3.1503, + "step": 6218 + }, + { + "epoch": 0.3, + "grad_norm": 0.5432202219963074, + "learning_rate": 0.0005848495107637312, + "loss": 3.3538, + "step": 6219 + }, + { + "epoch": 0.3, + "grad_norm": 0.6053672432899475, + "learning_rate": 0.0005848446794956742, + "loss": 3.1726, + "step": 6220 + }, + { + "epoch": 0.3, + "grad_norm": 0.5622955560684204, + "learning_rate": 0.0005848398474773911, + "loss": 3.2979, + "step": 6221 + }, + { + "epoch": 0.3, + "grad_norm": 0.5476831197738647, + "learning_rate": 0.0005848350147088946, + "loss": 3.5093, + "step": 6222 + }, + { + "epoch": 0.3, + "grad_norm": 0.5248444676399231, + "learning_rate": 0.0005848301811901974, + "loss": 3.3824, + "step": 6223 + }, + { + "epoch": 0.31, + "grad_norm": 0.5185007452964783, + "learning_rate": 0.0005848253469213125, + "loss": 3.4452, + "step": 6224 + }, + { + "epoch": 0.31, + "grad_norm": 0.618705153465271, + "learning_rate": 0.0005848205119022524, + "loss": 3.6843, + "step": 6225 + }, + { + "epoch": 0.31, + "grad_norm": 0.5357537865638733, + "learning_rate": 0.0005848156761330298, + "loss": 3.4044, + "step": 6226 + }, + { + "epoch": 0.31, + "grad_norm": 0.5574560761451721, + "learning_rate": 0.0005848108396136576, + "loss": 3.2504, + "step": 6227 + }, + { + "epoch": 0.31, + "grad_norm": 0.5414639115333557, + "learning_rate": 0.0005848060023441484, + "loss": 3.439, + "step": 6228 + }, + { + "epoch": 0.31, + "grad_norm": 0.5341112017631531, + "learning_rate": 0.0005848011643245151, + "loss": 3.283, + "step": 6229 + }, + { + "epoch": 0.31, + "grad_norm": 0.5189714431762695, + "learning_rate": 0.0005847963255547704, + "loss": 3.3486, + "step": 6230 + }, + { + "epoch": 0.31, + "grad_norm": 0.6320374608039856, + "learning_rate": 0.0005847914860349268, + "loss": 3.5425, + "step": 6231 + }, + { + "epoch": 0.31, + "grad_norm": 0.5230095386505127, + "learning_rate": 0.0005847866457649973, + "loss": 3.3306, + "step": 6232 + }, + { + "epoch": 0.31, + "grad_norm": 0.5646961331367493, + "learning_rate": 0.0005847818047449946, + "loss": 3.4545, + "step": 6233 + }, + { + "epoch": 0.31, + "grad_norm": 0.5346679091453552, + "learning_rate": 0.0005847769629749314, + "loss": 3.4056, + "step": 6234 + }, + { + "epoch": 0.31, + "grad_norm": 0.5118999481201172, + "learning_rate": 0.0005847721204548206, + "loss": 3.3682, + "step": 6235 + }, + { + "epoch": 0.31, + "grad_norm": 0.5368824005126953, + "learning_rate": 0.0005847672771846748, + "loss": 3.37, + "step": 6236 + }, + { + "epoch": 0.31, + "grad_norm": 0.563027024269104, + "learning_rate": 0.0005847624331645067, + "loss": 3.4851, + "step": 6237 + }, + { + "epoch": 0.31, + "grad_norm": 0.5190998911857605, + "learning_rate": 0.0005847575883943292, + "loss": 3.4153, + "step": 6238 + }, + { + "epoch": 0.31, + "grad_norm": 0.5824965834617615, + "learning_rate": 0.000584752742874155, + "loss": 3.5383, + "step": 6239 + }, + { + "epoch": 0.31, + "grad_norm": 0.5584920048713684, + "learning_rate": 0.000584747896603997, + "loss": 3.2722, + "step": 6240 + }, + { + "epoch": 0.31, + "grad_norm": 0.5643733739852905, + "learning_rate": 0.0005847430495838677, + "loss": 3.4414, + "step": 6241 + }, + { + "epoch": 0.31, + "grad_norm": 0.5031020045280457, + "learning_rate": 0.00058473820181378, + "loss": 3.3009, + "step": 6242 + }, + { + "epoch": 0.31, + "grad_norm": 0.5333477854728699, + "learning_rate": 0.0005847333532937467, + "loss": 3.3855, + "step": 6243 + }, + { + "epoch": 0.31, + "grad_norm": 0.5437228679656982, + "learning_rate": 0.0005847285040237807, + "loss": 3.2759, + "step": 6244 + }, + { + "epoch": 0.31, + "grad_norm": 0.5025436878204346, + "learning_rate": 0.0005847236540038944, + "loss": 3.3027, + "step": 6245 + }, + { + "epoch": 0.31, + "grad_norm": 0.5325117111206055, + "learning_rate": 0.0005847188032341009, + "loss": 3.5029, + "step": 6246 + }, + { + "epoch": 0.31, + "grad_norm": 0.4821946918964386, + "learning_rate": 0.0005847139517144128, + "loss": 3.5097, + "step": 6247 + }, + { + "epoch": 0.31, + "grad_norm": 0.5105711221694946, + "learning_rate": 0.0005847090994448431, + "loss": 3.3654, + "step": 6248 + }, + { + "epoch": 0.31, + "grad_norm": 0.5041024088859558, + "learning_rate": 0.0005847042464254043, + "loss": 3.4881, + "step": 6249 + }, + { + "epoch": 0.31, + "grad_norm": 0.5586902499198914, + "learning_rate": 0.0005846993926561093, + "loss": 3.2771, + "step": 6250 + }, + { + "epoch": 0.31, + "grad_norm": 0.5156290531158447, + "learning_rate": 0.0005846945381369709, + "loss": 3.3685, + "step": 6251 + }, + { + "epoch": 0.31, + "grad_norm": 0.5516939759254456, + "learning_rate": 0.0005846896828680019, + "loss": 3.4328, + "step": 6252 + }, + { + "epoch": 0.31, + "grad_norm": 0.628324031829834, + "learning_rate": 0.0005846848268492151, + "loss": 3.164, + "step": 6253 + }, + { + "epoch": 0.31, + "grad_norm": 0.5220317840576172, + "learning_rate": 0.0005846799700806232, + "loss": 3.5043, + "step": 6254 + }, + { + "epoch": 0.31, + "grad_norm": 0.523783802986145, + "learning_rate": 0.000584675112562239, + "loss": 3.5469, + "step": 6255 + }, + { + "epoch": 0.31, + "grad_norm": 0.5619279146194458, + "learning_rate": 0.0005846702542940755, + "loss": 3.3155, + "step": 6256 + }, + { + "epoch": 0.31, + "grad_norm": 0.6332241296768188, + "learning_rate": 0.0005846653952761452, + "loss": 3.1784, + "step": 6257 + }, + { + "epoch": 0.31, + "grad_norm": 0.5562300682067871, + "learning_rate": 0.0005846605355084611, + "loss": 3.2124, + "step": 6258 + }, + { + "epoch": 0.31, + "grad_norm": 0.5105780363082886, + "learning_rate": 0.0005846556749910358, + "loss": 3.3471, + "step": 6259 + }, + { + "epoch": 0.31, + "grad_norm": 0.5315980911254883, + "learning_rate": 0.0005846508137238825, + "loss": 3.2881, + "step": 6260 + }, + { + "epoch": 0.31, + "grad_norm": 0.5600804686546326, + "learning_rate": 0.0005846459517070135, + "loss": 3.4305, + "step": 6261 + }, + { + "epoch": 0.31, + "grad_norm": 0.5662427544593811, + "learning_rate": 0.000584641088940442, + "loss": 3.3034, + "step": 6262 + }, + { + "epoch": 0.31, + "grad_norm": 0.5419402122497559, + "learning_rate": 0.0005846362254241805, + "loss": 3.4267, + "step": 6263 + }, + { + "epoch": 0.31, + "grad_norm": 0.5330461263656616, + "learning_rate": 0.000584631361158242, + "loss": 3.4127, + "step": 6264 + }, + { + "epoch": 0.31, + "grad_norm": 0.5632162690162659, + "learning_rate": 0.0005846264961426394, + "loss": 3.1738, + "step": 6265 + }, + { + "epoch": 0.31, + "grad_norm": 0.5450085997581482, + "learning_rate": 0.0005846216303773853, + "loss": 3.4474, + "step": 6266 + }, + { + "epoch": 0.31, + "grad_norm": 0.5258246660232544, + "learning_rate": 0.0005846167638624926, + "loss": 3.3843, + "step": 6267 + }, + { + "epoch": 0.31, + "grad_norm": 0.5241590738296509, + "learning_rate": 0.0005846118965979741, + "loss": 3.4877, + "step": 6268 + }, + { + "epoch": 0.31, + "grad_norm": 0.5271158218383789, + "learning_rate": 0.0005846070285838427, + "loss": 3.3302, + "step": 6269 + }, + { + "epoch": 0.31, + "grad_norm": 0.5716249942779541, + "learning_rate": 0.0005846021598201111, + "loss": 3.6957, + "step": 6270 + }, + { + "epoch": 0.31, + "grad_norm": 0.546226441860199, + "learning_rate": 0.0005845972903067922, + "loss": 3.4328, + "step": 6271 + }, + { + "epoch": 0.31, + "grad_norm": 0.532817006111145, + "learning_rate": 0.0005845924200438987, + "loss": 3.4793, + "step": 6272 + }, + { + "epoch": 0.31, + "grad_norm": 0.5522794723510742, + "learning_rate": 0.0005845875490314437, + "loss": 3.2225, + "step": 6273 + }, + { + "epoch": 0.31, + "grad_norm": 0.5274179577827454, + "learning_rate": 0.0005845826772694399, + "loss": 2.9809, + "step": 6274 + }, + { + "epoch": 0.31, + "grad_norm": 0.5768852829933167, + "learning_rate": 0.0005845778047578999, + "loss": 3.161, + "step": 6275 + }, + { + "epoch": 0.31, + "grad_norm": 0.5117722749710083, + "learning_rate": 0.0005845729314968369, + "loss": 3.3208, + "step": 6276 + }, + { + "epoch": 0.31, + "grad_norm": 0.6170226335525513, + "learning_rate": 0.0005845680574862635, + "loss": 3.3823, + "step": 6277 + }, + { + "epoch": 0.31, + "grad_norm": 0.567700207233429, + "learning_rate": 0.0005845631827261927, + "loss": 3.5089, + "step": 6278 + }, + { + "epoch": 0.31, + "grad_norm": 0.5353153347969055, + "learning_rate": 0.0005845583072166371, + "loss": 3.3691, + "step": 6279 + }, + { + "epoch": 0.31, + "grad_norm": 0.5365519523620605, + "learning_rate": 0.0005845534309576097, + "loss": 3.432, + "step": 6280 + }, + { + "epoch": 0.31, + "grad_norm": 0.5466122627258301, + "learning_rate": 0.0005845485539491234, + "loss": 3.3228, + "step": 6281 + }, + { + "epoch": 0.31, + "grad_norm": 0.6011998653411865, + "learning_rate": 0.0005845436761911909, + "loss": 3.0104, + "step": 6282 + }, + { + "epoch": 0.31, + "grad_norm": 0.5234431624412537, + "learning_rate": 0.0005845387976838251, + "loss": 3.3694, + "step": 6283 + }, + { + "epoch": 0.31, + "grad_norm": 0.540686309337616, + "learning_rate": 0.000584533918427039, + "loss": 3.6565, + "step": 6284 + }, + { + "epoch": 0.31, + "grad_norm": 0.5174767971038818, + "learning_rate": 0.0005845290384208453, + "loss": 3.5169, + "step": 6285 + }, + { + "epoch": 0.31, + "grad_norm": 0.5468271970748901, + "learning_rate": 0.0005845241576652567, + "loss": 3.3747, + "step": 6286 + }, + { + "epoch": 0.31, + "grad_norm": 0.5398919582366943, + "learning_rate": 0.0005845192761602864, + "loss": 3.6541, + "step": 6287 + }, + { + "epoch": 0.31, + "grad_norm": 0.5341367125511169, + "learning_rate": 0.000584514393905947, + "loss": 3.3902, + "step": 6288 + }, + { + "epoch": 0.31, + "grad_norm": 0.5506887435913086, + "learning_rate": 0.0005845095109022514, + "loss": 3.4969, + "step": 6289 + }, + { + "epoch": 0.31, + "grad_norm": 0.5707558393478394, + "learning_rate": 0.0005845046271492127, + "loss": 3.2885, + "step": 6290 + }, + { + "epoch": 0.31, + "grad_norm": 0.49530622363090515, + "learning_rate": 0.0005844997426468434, + "loss": 3.297, + "step": 6291 + }, + { + "epoch": 0.31, + "grad_norm": 0.5162578821182251, + "learning_rate": 0.0005844948573951565, + "loss": 3.3581, + "step": 6292 + }, + { + "epoch": 0.31, + "grad_norm": 0.5304474234580994, + "learning_rate": 0.000584489971394165, + "loss": 3.4776, + "step": 6293 + }, + { + "epoch": 0.31, + "grad_norm": 0.5445380806922913, + "learning_rate": 0.0005844850846438816, + "loss": 3.4076, + "step": 6294 + }, + { + "epoch": 0.31, + "grad_norm": 0.5633144974708557, + "learning_rate": 0.0005844801971443193, + "loss": 3.4512, + "step": 6295 + }, + { + "epoch": 0.31, + "grad_norm": 0.5193982124328613, + "learning_rate": 0.0005844753088954908, + "loss": 3.404, + "step": 6296 + }, + { + "epoch": 0.31, + "grad_norm": 0.5558420419692993, + "learning_rate": 0.0005844704198974093, + "loss": 3.5152, + "step": 6297 + }, + { + "epoch": 0.31, + "grad_norm": 0.535561203956604, + "learning_rate": 0.0005844655301500873, + "loss": 3.3756, + "step": 6298 + }, + { + "epoch": 0.31, + "grad_norm": 0.5079712271690369, + "learning_rate": 0.0005844606396535378, + "loss": 3.5013, + "step": 6299 + }, + { + "epoch": 0.31, + "grad_norm": 0.5386590361595154, + "learning_rate": 0.0005844557484077738, + "loss": 3.1547, + "step": 6300 + }, + { + "epoch": 0.31, + "grad_norm": 0.5513303875923157, + "learning_rate": 0.000584450856412808, + "loss": 3.5032, + "step": 6301 + }, + { + "epoch": 0.31, + "grad_norm": 0.5191980600357056, + "learning_rate": 0.0005844459636686535, + "loss": 3.388, + "step": 6302 + }, + { + "epoch": 0.31, + "grad_norm": 0.5659469366073608, + "learning_rate": 0.0005844410701753231, + "loss": 3.3341, + "step": 6303 + }, + { + "epoch": 0.31, + "grad_norm": 0.5289974808692932, + "learning_rate": 0.0005844361759328295, + "loss": 3.4693, + "step": 6304 + }, + { + "epoch": 0.31, + "grad_norm": 0.5283775925636292, + "learning_rate": 0.000584431280941186, + "loss": 3.4818, + "step": 6305 + }, + { + "epoch": 0.31, + "grad_norm": 0.5795498490333557, + "learning_rate": 0.000584426385200405, + "loss": 3.4025, + "step": 6306 + }, + { + "epoch": 0.31, + "grad_norm": 0.5070279240608215, + "learning_rate": 0.0005844214887104998, + "loss": 3.2794, + "step": 6307 + }, + { + "epoch": 0.31, + "grad_norm": 0.5226815342903137, + "learning_rate": 0.000584416591471483, + "loss": 3.4417, + "step": 6308 + }, + { + "epoch": 0.31, + "grad_norm": 0.5250857472419739, + "learning_rate": 0.0005844116934833678, + "loss": 3.5051, + "step": 6309 + }, + { + "epoch": 0.31, + "grad_norm": 0.5460673570632935, + "learning_rate": 0.0005844067947461669, + "loss": 3.3849, + "step": 6310 + }, + { + "epoch": 0.31, + "grad_norm": 0.5464668869972229, + "learning_rate": 0.0005844018952598931, + "loss": 3.2028, + "step": 6311 + }, + { + "epoch": 0.31, + "grad_norm": 0.5466782450675964, + "learning_rate": 0.0005843969950245595, + "loss": 3.1737, + "step": 6312 + }, + { + "epoch": 0.31, + "grad_norm": 0.5372828841209412, + "learning_rate": 0.0005843920940401792, + "loss": 3.5099, + "step": 6313 + }, + { + "epoch": 0.31, + "grad_norm": 0.5676745176315308, + "learning_rate": 0.0005843871923067645, + "loss": 3.353, + "step": 6314 + }, + { + "epoch": 0.31, + "grad_norm": 0.5780575275421143, + "learning_rate": 0.0005843822898243289, + "loss": 3.2481, + "step": 6315 + }, + { + "epoch": 0.31, + "grad_norm": 0.7814470529556274, + "learning_rate": 0.000584377386592885, + "loss": 3.3843, + "step": 6316 + }, + { + "epoch": 0.31, + "grad_norm": 0.5636649131774902, + "learning_rate": 0.0005843724826124457, + "loss": 3.5316, + "step": 6317 + }, + { + "epoch": 0.31, + "grad_norm": 0.548147976398468, + "learning_rate": 0.0005843675778830241, + "loss": 3.1795, + "step": 6318 + }, + { + "epoch": 0.31, + "grad_norm": 0.5491986870765686, + "learning_rate": 0.000584362672404633, + "loss": 3.2468, + "step": 6319 + }, + { + "epoch": 0.31, + "grad_norm": 0.5343765020370483, + "learning_rate": 0.0005843577661772854, + "loss": 3.3482, + "step": 6320 + }, + { + "epoch": 0.31, + "grad_norm": 0.5225366950035095, + "learning_rate": 0.0005843528592009941, + "loss": 3.6224, + "step": 6321 + }, + { + "epoch": 0.31, + "grad_norm": 0.5188947319984436, + "learning_rate": 0.0005843479514757721, + "loss": 3.2754, + "step": 6322 + }, + { + "epoch": 0.31, + "grad_norm": 0.5342457294464111, + "learning_rate": 0.0005843430430016324, + "loss": 3.4175, + "step": 6323 + }, + { + "epoch": 0.31, + "grad_norm": 0.5652335286140442, + "learning_rate": 0.0005843381337785877, + "loss": 3.2715, + "step": 6324 + }, + { + "epoch": 0.31, + "grad_norm": 0.49999839067459106, + "learning_rate": 0.0005843332238066512, + "loss": 3.3006, + "step": 6325 + }, + { + "epoch": 0.31, + "grad_norm": 0.5379924178123474, + "learning_rate": 0.0005843283130858357, + "loss": 3.2138, + "step": 6326 + }, + { + "epoch": 0.31, + "grad_norm": 0.5372043251991272, + "learning_rate": 0.0005843234016161542, + "loss": 3.3537, + "step": 6327 + }, + { + "epoch": 0.31, + "grad_norm": 0.5262093544006348, + "learning_rate": 0.0005843184893976194, + "loss": 3.3396, + "step": 6328 + }, + { + "epoch": 0.31, + "grad_norm": 0.5059529542922974, + "learning_rate": 0.0005843135764302446, + "loss": 3.4341, + "step": 6329 + }, + { + "epoch": 0.31, + "grad_norm": 0.5077370405197144, + "learning_rate": 0.0005843086627140425, + "loss": 3.1674, + "step": 6330 + }, + { + "epoch": 0.31, + "grad_norm": 0.5159595012664795, + "learning_rate": 0.000584303748249026, + "loss": 3.429, + "step": 6331 + }, + { + "epoch": 0.31, + "grad_norm": 0.5877441763877869, + "learning_rate": 0.0005842988330352082, + "loss": 3.3886, + "step": 6332 + }, + { + "epoch": 0.31, + "grad_norm": 0.5474517941474915, + "learning_rate": 0.000584293917072602, + "loss": 3.3393, + "step": 6333 + }, + { + "epoch": 0.31, + "grad_norm": 0.5149897336959839, + "learning_rate": 0.0005842890003612204, + "loss": 3.4596, + "step": 6334 + }, + { + "epoch": 0.31, + "grad_norm": 0.5382705330848694, + "learning_rate": 0.0005842840829010762, + "loss": 3.428, + "step": 6335 + }, + { + "epoch": 0.31, + "grad_norm": 0.5204023718833923, + "learning_rate": 0.0005842791646921825, + "loss": 3.5418, + "step": 6336 + }, + { + "epoch": 0.31, + "grad_norm": 0.5262681245803833, + "learning_rate": 0.0005842742457345523, + "loss": 3.3259, + "step": 6337 + }, + { + "epoch": 0.31, + "grad_norm": 0.5385437607765198, + "learning_rate": 0.0005842693260281981, + "loss": 3.2901, + "step": 6338 + }, + { + "epoch": 0.31, + "grad_norm": 0.5480816960334778, + "learning_rate": 0.0005842644055731335, + "loss": 3.2381, + "step": 6339 + }, + { + "epoch": 0.31, + "grad_norm": 0.5376442670822144, + "learning_rate": 0.0005842594843693711, + "loss": 3.0217, + "step": 6340 + }, + { + "epoch": 0.31, + "grad_norm": 0.5409801602363586, + "learning_rate": 0.0005842545624169239, + "loss": 3.2004, + "step": 6341 + }, + { + "epoch": 0.31, + "grad_norm": 0.5036441087722778, + "learning_rate": 0.0005842496397158049, + "loss": 3.4932, + "step": 6342 + }, + { + "epoch": 0.31, + "grad_norm": 0.6348100304603577, + "learning_rate": 0.000584244716266027, + "loss": 3.4698, + "step": 6343 + }, + { + "epoch": 0.31, + "grad_norm": 0.5811209082603455, + "learning_rate": 0.0005842397920676032, + "loss": 3.4319, + "step": 6344 + }, + { + "epoch": 0.31, + "grad_norm": 0.574324369430542, + "learning_rate": 0.0005842348671205466, + "loss": 3.3559, + "step": 6345 + }, + { + "epoch": 0.31, + "grad_norm": 0.5267350077629089, + "learning_rate": 0.00058422994142487, + "loss": 3.3692, + "step": 6346 + }, + { + "epoch": 0.31, + "grad_norm": 0.5143805742263794, + "learning_rate": 0.0005842250149805865, + "loss": 3.4085, + "step": 6347 + }, + { + "epoch": 0.31, + "grad_norm": 0.5390629172325134, + "learning_rate": 0.000584220087787709, + "loss": 3.2466, + "step": 6348 + }, + { + "epoch": 0.31, + "grad_norm": 0.5482839941978455, + "learning_rate": 0.0005842151598462504, + "loss": 3.4562, + "step": 6349 + }, + { + "epoch": 0.31, + "grad_norm": 0.5902968049049377, + "learning_rate": 0.0005842102311562238, + "loss": 3.4197, + "step": 6350 + }, + { + "epoch": 0.31, + "grad_norm": 0.561595618724823, + "learning_rate": 0.0005842053017176422, + "loss": 3.4425, + "step": 6351 + }, + { + "epoch": 0.31, + "grad_norm": 0.5435536503791809, + "learning_rate": 0.0005842003715305185, + "loss": 3.595, + "step": 6352 + }, + { + "epoch": 0.31, + "grad_norm": 0.5846390724182129, + "learning_rate": 0.0005841954405948656, + "loss": 3.5844, + "step": 6353 + }, + { + "epoch": 0.31, + "grad_norm": 0.5653446912765503, + "learning_rate": 0.0005841905089106968, + "loss": 3.3855, + "step": 6354 + }, + { + "epoch": 0.31, + "grad_norm": 0.5833756327629089, + "learning_rate": 0.0005841855764780248, + "loss": 3.1619, + "step": 6355 + }, + { + "epoch": 0.31, + "grad_norm": 0.565251350402832, + "learning_rate": 0.0005841806432968626, + "loss": 3.2028, + "step": 6356 + }, + { + "epoch": 0.31, + "grad_norm": 0.5358942747116089, + "learning_rate": 0.0005841757093672234, + "loss": 3.2987, + "step": 6357 + }, + { + "epoch": 0.31, + "grad_norm": 0.5292371511459351, + "learning_rate": 0.0005841707746891201, + "loss": 3.6083, + "step": 6358 + }, + { + "epoch": 0.31, + "grad_norm": 0.606774628162384, + "learning_rate": 0.0005841658392625656, + "loss": 3.3125, + "step": 6359 + }, + { + "epoch": 0.31, + "grad_norm": 0.5326677560806274, + "learning_rate": 0.000584160903087573, + "loss": 3.3493, + "step": 6360 + }, + { + "epoch": 0.31, + "grad_norm": 0.5448054671287537, + "learning_rate": 0.0005841559661641552, + "loss": 3.3649, + "step": 6361 + }, + { + "epoch": 0.31, + "grad_norm": 0.5255556106567383, + "learning_rate": 0.0005841510284923253, + "loss": 3.3162, + "step": 6362 + }, + { + "epoch": 0.31, + "grad_norm": 0.5699189305305481, + "learning_rate": 0.0005841460900720963, + "loss": 3.2386, + "step": 6363 + }, + { + "epoch": 0.31, + "grad_norm": 0.5408067107200623, + "learning_rate": 0.0005841411509034812, + "loss": 3.5189, + "step": 6364 + }, + { + "epoch": 0.31, + "grad_norm": 0.561336100101471, + "learning_rate": 0.000584136210986493, + "loss": 3.4752, + "step": 6365 + }, + { + "epoch": 0.31, + "grad_norm": 0.5111557245254517, + "learning_rate": 0.0005841312703211447, + "loss": 3.3528, + "step": 6366 + }, + { + "epoch": 0.31, + "grad_norm": 0.532857358455658, + "learning_rate": 0.0005841263289074493, + "loss": 3.4392, + "step": 6367 + }, + { + "epoch": 0.31, + "grad_norm": 0.5691311359405518, + "learning_rate": 0.0005841213867454198, + "loss": 3.1776, + "step": 6368 + }, + { + "epoch": 0.31, + "grad_norm": 0.5483730435371399, + "learning_rate": 0.0005841164438350693, + "loss": 3.3162, + "step": 6369 + }, + { + "epoch": 0.31, + "grad_norm": 0.5207633376121521, + "learning_rate": 0.0005841115001764107, + "loss": 3.3971, + "step": 6370 + }, + { + "epoch": 0.31, + "grad_norm": 0.5637426972389221, + "learning_rate": 0.0005841065557694572, + "loss": 3.3626, + "step": 6371 + }, + { + "epoch": 0.31, + "grad_norm": 0.5457177758216858, + "learning_rate": 0.0005841016106142216, + "loss": 3.4505, + "step": 6372 + }, + { + "epoch": 0.31, + "grad_norm": 0.5361736416816711, + "learning_rate": 0.0005840966647107171, + "loss": 3.5193, + "step": 6373 + }, + { + "epoch": 0.31, + "grad_norm": 0.5250360369682312, + "learning_rate": 0.0005840917180589566, + "loss": 3.3164, + "step": 6374 + }, + { + "epoch": 0.31, + "grad_norm": 0.5771521925926208, + "learning_rate": 0.0005840867706589531, + "loss": 3.1594, + "step": 6375 + }, + { + "epoch": 0.31, + "grad_norm": 0.5609606504440308, + "learning_rate": 0.0005840818225107199, + "loss": 3.1954, + "step": 6376 + }, + { + "epoch": 0.31, + "grad_norm": 0.5278244018554688, + "learning_rate": 0.0005840768736142698, + "loss": 3.6396, + "step": 6377 + }, + { + "epoch": 0.31, + "grad_norm": 0.5072911381721497, + "learning_rate": 0.0005840719239696159, + "loss": 3.4169, + "step": 6378 + }, + { + "epoch": 0.31, + "grad_norm": 0.5723280906677246, + "learning_rate": 0.0005840669735767712, + "loss": 3.2226, + "step": 6379 + }, + { + "epoch": 0.31, + "grad_norm": 0.5588771104812622, + "learning_rate": 0.0005840620224357487, + "loss": 3.4331, + "step": 6380 + }, + { + "epoch": 0.31, + "grad_norm": 0.5622850656509399, + "learning_rate": 0.0005840570705465616, + "loss": 3.5246, + "step": 6381 + }, + { + "epoch": 0.31, + "grad_norm": 0.5287413597106934, + "learning_rate": 0.0005840521179092228, + "loss": 3.5396, + "step": 6382 + }, + { + "epoch": 0.31, + "grad_norm": 0.6131024360656738, + "learning_rate": 0.0005840471645237454, + "loss": 3.4931, + "step": 6383 + }, + { + "epoch": 0.31, + "grad_norm": 0.49073895812034607, + "learning_rate": 0.0005840422103901425, + "loss": 3.3222, + "step": 6384 + }, + { + "epoch": 0.31, + "grad_norm": 0.5380456447601318, + "learning_rate": 0.000584037255508427, + "loss": 3.4926, + "step": 6385 + }, + { + "epoch": 0.31, + "grad_norm": 0.500093400478363, + "learning_rate": 0.000584032299878612, + "loss": 3.5383, + "step": 6386 + }, + { + "epoch": 0.31, + "grad_norm": 0.5496594905853271, + "learning_rate": 0.0005840273435007106, + "loss": 3.4764, + "step": 6387 + }, + { + "epoch": 0.31, + "grad_norm": 0.5659000277519226, + "learning_rate": 0.0005840223863747359, + "loss": 3.4011, + "step": 6388 + }, + { + "epoch": 0.31, + "grad_norm": 0.5337103605270386, + "learning_rate": 0.0005840174285007009, + "loss": 3.3357, + "step": 6389 + }, + { + "epoch": 0.31, + "grad_norm": 0.5471076965332031, + "learning_rate": 0.0005840124698786186, + "loss": 3.3584, + "step": 6390 + }, + { + "epoch": 0.31, + "grad_norm": 0.5406061410903931, + "learning_rate": 0.0005840075105085021, + "loss": 3.5359, + "step": 6391 + }, + { + "epoch": 0.31, + "grad_norm": 0.5463137626647949, + "learning_rate": 0.0005840025503903645, + "loss": 3.29, + "step": 6392 + }, + { + "epoch": 0.31, + "grad_norm": 0.5283336043357849, + "learning_rate": 0.0005839975895242189, + "loss": 3.6057, + "step": 6393 + }, + { + "epoch": 0.31, + "grad_norm": 0.5675161480903625, + "learning_rate": 0.0005839926279100783, + "loss": 3.2287, + "step": 6394 + }, + { + "epoch": 0.31, + "grad_norm": 0.5471466183662415, + "learning_rate": 0.0005839876655479557, + "loss": 3.4739, + "step": 6395 + }, + { + "epoch": 0.31, + "grad_norm": 0.5402804017066956, + "learning_rate": 0.0005839827024378643, + "loss": 3.3865, + "step": 6396 + }, + { + "epoch": 0.31, + "grad_norm": 0.5770726799964905, + "learning_rate": 0.000583977738579817, + "loss": 3.3723, + "step": 6397 + }, + { + "epoch": 0.31, + "grad_norm": 0.5275018811225891, + "learning_rate": 0.0005839727739738271, + "loss": 3.3587, + "step": 6398 + }, + { + "epoch": 0.31, + "grad_norm": 0.5402635931968689, + "learning_rate": 0.0005839678086199076, + "loss": 3.6001, + "step": 6399 + }, + { + "epoch": 0.31, + "grad_norm": 0.6085510849952698, + "learning_rate": 0.0005839628425180714, + "loss": 3.303, + "step": 6400 + }, + { + "epoch": 0.31, + "grad_norm": 0.626964271068573, + "learning_rate": 0.0005839578756683318, + "loss": 3.3263, + "step": 6401 + }, + { + "epoch": 0.31, + "grad_norm": 0.5337768197059631, + "learning_rate": 0.0005839529080707019, + "loss": 3.4304, + "step": 6402 + }, + { + "epoch": 0.31, + "grad_norm": 0.5228325128555298, + "learning_rate": 0.0005839479397251946, + "loss": 3.2449, + "step": 6403 + }, + { + "epoch": 0.31, + "grad_norm": 0.565669596195221, + "learning_rate": 0.000583942970631823, + "loss": 3.3662, + "step": 6404 + }, + { + "epoch": 0.31, + "grad_norm": 0.5665708780288696, + "learning_rate": 0.0005839380007906003, + "loss": 3.339, + "step": 6405 + }, + { + "epoch": 0.31, + "grad_norm": 0.530850350856781, + "learning_rate": 0.0005839330302015396, + "loss": 3.4299, + "step": 6406 + }, + { + "epoch": 0.31, + "grad_norm": 0.5689116716384888, + "learning_rate": 0.000583928058864654, + "loss": 3.4183, + "step": 6407 + }, + { + "epoch": 0.31, + "grad_norm": 0.5822932720184326, + "learning_rate": 0.0005839230867799565, + "loss": 3.3617, + "step": 6408 + }, + { + "epoch": 0.31, + "grad_norm": 0.5526020526885986, + "learning_rate": 0.0005839181139474601, + "loss": 3.3813, + "step": 6409 + }, + { + "epoch": 0.31, + "grad_norm": 0.5911718606948853, + "learning_rate": 0.0005839131403671782, + "loss": 3.3916, + "step": 6410 + }, + { + "epoch": 0.31, + "grad_norm": 0.501130223274231, + "learning_rate": 0.0005839081660391236, + "loss": 3.2104, + "step": 6411 + }, + { + "epoch": 0.31, + "grad_norm": 0.5649345517158508, + "learning_rate": 0.0005839031909633096, + "loss": 3.3394, + "step": 6412 + }, + { + "epoch": 0.31, + "grad_norm": 0.5334262847900391, + "learning_rate": 0.0005838982151397492, + "loss": 3.2688, + "step": 6413 + }, + { + "epoch": 0.31, + "grad_norm": 0.5680837035179138, + "learning_rate": 0.0005838932385684556, + "loss": 3.5027, + "step": 6414 + }, + { + "epoch": 0.31, + "grad_norm": 0.5568577647209167, + "learning_rate": 0.0005838882612494417, + "loss": 3.3118, + "step": 6415 + }, + { + "epoch": 0.31, + "grad_norm": 0.5233654379844666, + "learning_rate": 0.0005838832831827209, + "loss": 3.3249, + "step": 6416 + }, + { + "epoch": 0.31, + "grad_norm": 0.5565677881240845, + "learning_rate": 0.0005838783043683062, + "loss": 3.257, + "step": 6417 + }, + { + "epoch": 0.31, + "grad_norm": 0.588458776473999, + "learning_rate": 0.0005838733248062105, + "loss": 3.3794, + "step": 6418 + }, + { + "epoch": 0.31, + "grad_norm": 0.543366551399231, + "learning_rate": 0.0005838683444964473, + "loss": 3.4565, + "step": 6419 + }, + { + "epoch": 0.31, + "grad_norm": 0.508150041103363, + "learning_rate": 0.0005838633634390293, + "loss": 3.2742, + "step": 6420 + }, + { + "epoch": 0.31, + "grad_norm": 0.5089104175567627, + "learning_rate": 0.00058385838163397, + "loss": 3.1958, + "step": 6421 + }, + { + "epoch": 0.31, + "grad_norm": 0.5183136463165283, + "learning_rate": 0.0005838533990812822, + "loss": 3.4799, + "step": 6422 + }, + { + "epoch": 0.31, + "grad_norm": 0.5491008758544922, + "learning_rate": 0.0005838484157809794, + "loss": 3.281, + "step": 6423 + }, + { + "epoch": 0.31, + "grad_norm": 0.5273602604866028, + "learning_rate": 0.0005838434317330743, + "loss": 3.5572, + "step": 6424 + }, + { + "epoch": 0.31, + "grad_norm": 0.5344679951667786, + "learning_rate": 0.0005838384469375803, + "loss": 3.3987, + "step": 6425 + }, + { + "epoch": 0.31, + "grad_norm": 0.5330580472946167, + "learning_rate": 0.0005838334613945105, + "loss": 3.1804, + "step": 6426 + }, + { + "epoch": 0.31, + "grad_norm": 0.5547220706939697, + "learning_rate": 0.0005838284751038779, + "loss": 3.6428, + "step": 6427 + }, + { + "epoch": 0.32, + "grad_norm": 0.5338019728660583, + "learning_rate": 0.0005838234880656957, + "loss": 3.3878, + "step": 6428 + }, + { + "epoch": 0.32, + "grad_norm": 0.54743891954422, + "learning_rate": 0.0005838185002799771, + "loss": 3.4011, + "step": 6429 + }, + { + "epoch": 0.32, + "grad_norm": 0.5494392514228821, + "learning_rate": 0.0005838135117467352, + "loss": 3.3812, + "step": 6430 + }, + { + "epoch": 0.32, + "grad_norm": 0.6107029318809509, + "learning_rate": 0.0005838085224659832, + "loss": 3.3056, + "step": 6431 + }, + { + "epoch": 0.32, + "grad_norm": 0.5496015548706055, + "learning_rate": 0.0005838035324377341, + "loss": 3.3015, + "step": 6432 + }, + { + "epoch": 0.32, + "grad_norm": 0.5744660496711731, + "learning_rate": 0.0005837985416620011, + "loss": 3.6286, + "step": 6433 + }, + { + "epoch": 0.32, + "grad_norm": 0.5377488732337952, + "learning_rate": 0.0005837935501387975, + "loss": 3.4111, + "step": 6434 + }, + { + "epoch": 0.32, + "grad_norm": 0.5246378779411316, + "learning_rate": 0.0005837885578681361, + "loss": 3.4557, + "step": 6435 + }, + { + "epoch": 0.32, + "grad_norm": 0.5327603220939636, + "learning_rate": 0.0005837835648500304, + "loss": 3.3453, + "step": 6436 + }, + { + "epoch": 0.32, + "grad_norm": 0.5735540986061096, + "learning_rate": 0.0005837785710844934, + "loss": 3.4068, + "step": 6437 + }, + { + "epoch": 0.32, + "grad_norm": 0.5326542854309082, + "learning_rate": 0.0005837735765715381, + "loss": 3.5506, + "step": 6438 + }, + { + "epoch": 0.32, + "grad_norm": 0.6021717190742493, + "learning_rate": 0.000583768581311178, + "loss": 3.4768, + "step": 6439 + }, + { + "epoch": 0.32, + "grad_norm": 0.5350591540336609, + "learning_rate": 0.0005837635853034259, + "loss": 3.3959, + "step": 6440 + }, + { + "epoch": 0.32, + "grad_norm": 0.5394638180732727, + "learning_rate": 0.0005837585885482953, + "loss": 3.4337, + "step": 6441 + }, + { + "epoch": 0.32, + "grad_norm": 0.5362926125526428, + "learning_rate": 0.0005837535910457991, + "loss": 3.3943, + "step": 6442 + }, + { + "epoch": 0.32, + "grad_norm": 0.5286582112312317, + "learning_rate": 0.0005837485927959505, + "loss": 3.4177, + "step": 6443 + }, + { + "epoch": 0.32, + "grad_norm": 0.5231122970581055, + "learning_rate": 0.0005837435937987628, + "loss": 3.5713, + "step": 6444 + }, + { + "epoch": 0.32, + "grad_norm": 0.541363537311554, + "learning_rate": 0.0005837385940542491, + "loss": 3.2653, + "step": 6445 + }, + { + "epoch": 0.32, + "grad_norm": 0.5405844449996948, + "learning_rate": 0.0005837335935624225, + "loss": 3.4653, + "step": 6446 + }, + { + "epoch": 0.32, + "grad_norm": 0.5295756459236145, + "learning_rate": 0.0005837285923232963, + "loss": 3.3157, + "step": 6447 + }, + { + "epoch": 0.32, + "grad_norm": 0.5377822518348694, + "learning_rate": 0.0005837235903368834, + "loss": 3.3217, + "step": 6448 + }, + { + "epoch": 0.32, + "grad_norm": 0.5518754720687866, + "learning_rate": 0.0005837185876031973, + "loss": 3.2826, + "step": 6449 + }, + { + "epoch": 0.32, + "grad_norm": 0.5562870502471924, + "learning_rate": 0.000583713584122251, + "loss": 3.1871, + "step": 6450 + }, + { + "epoch": 0.32, + "grad_norm": 0.5364195704460144, + "learning_rate": 0.0005837085798940577, + "loss": 3.4522, + "step": 6451 + }, + { + "epoch": 0.32, + "grad_norm": 0.5666096210479736, + "learning_rate": 0.0005837035749186307, + "loss": 3.5498, + "step": 6452 + }, + { + "epoch": 0.32, + "grad_norm": 0.5489985942840576, + "learning_rate": 0.000583698569195983, + "loss": 3.2855, + "step": 6453 + }, + { + "epoch": 0.32, + "grad_norm": 0.5230780839920044, + "learning_rate": 0.0005836935627261279, + "loss": 3.4296, + "step": 6454 + }, + { + "epoch": 0.32, + "grad_norm": 0.5031773447990417, + "learning_rate": 0.0005836885555090786, + "loss": 3.455, + "step": 6455 + }, + { + "epoch": 0.32, + "grad_norm": 0.5535476207733154, + "learning_rate": 0.0005836835475448482, + "loss": 3.2767, + "step": 6456 + }, + { + "epoch": 0.32, + "grad_norm": 0.5760499835014343, + "learning_rate": 0.0005836785388334499, + "loss": 3.5469, + "step": 6457 + }, + { + "epoch": 0.32, + "grad_norm": 0.5332911610603333, + "learning_rate": 0.0005836735293748969, + "loss": 3.4841, + "step": 6458 + }, + { + "epoch": 0.32, + "grad_norm": 0.5543957352638245, + "learning_rate": 0.0005836685191692026, + "loss": 3.3279, + "step": 6459 + }, + { + "epoch": 0.32, + "grad_norm": 0.576471209526062, + "learning_rate": 0.0005836635082163798, + "loss": 3.4943, + "step": 6460 + }, + { + "epoch": 0.32, + "grad_norm": 0.5583447813987732, + "learning_rate": 0.000583658496516442, + "loss": 3.3526, + "step": 6461 + }, + { + "epoch": 0.32, + "grad_norm": 0.5250188112258911, + "learning_rate": 0.0005836534840694024, + "loss": 3.4248, + "step": 6462 + }, + { + "epoch": 0.32, + "grad_norm": 0.5952808856964111, + "learning_rate": 0.0005836484708752739, + "loss": 3.368, + "step": 6463 + }, + { + "epoch": 0.32, + "grad_norm": 0.5756708979606628, + "learning_rate": 0.0005836434569340701, + "loss": 3.3606, + "step": 6464 + }, + { + "epoch": 0.32, + "grad_norm": 0.5491698384284973, + "learning_rate": 0.000583638442245804, + "loss": 3.4689, + "step": 6465 + }, + { + "epoch": 0.32, + "grad_norm": 0.5349763631820679, + "learning_rate": 0.0005836334268104887, + "loss": 3.4542, + "step": 6466 + }, + { + "epoch": 0.32, + "grad_norm": 0.5523869395256042, + "learning_rate": 0.0005836284106281377, + "loss": 3.3126, + "step": 6467 + }, + { + "epoch": 0.32, + "grad_norm": 0.5572420358657837, + "learning_rate": 0.0005836233936987639, + "loss": 3.4631, + "step": 6468 + }, + { + "epoch": 0.32, + "grad_norm": 0.5321294665336609, + "learning_rate": 0.0005836183760223808, + "loss": 3.3318, + "step": 6469 + }, + { + "epoch": 0.32, + "grad_norm": 0.5739319324493408, + "learning_rate": 0.0005836133575990015, + "loss": 3.5455, + "step": 6470 + }, + { + "epoch": 0.32, + "grad_norm": 0.5573357939720154, + "learning_rate": 0.0005836083384286391, + "loss": 3.3142, + "step": 6471 + }, + { + "epoch": 0.32, + "grad_norm": 0.6728717684745789, + "learning_rate": 0.0005836033185113069, + "loss": 3.4165, + "step": 6472 + }, + { + "epoch": 0.32, + "grad_norm": 0.5487765073776245, + "learning_rate": 0.0005835982978470182, + "loss": 3.3554, + "step": 6473 + }, + { + "epoch": 0.32, + "grad_norm": 0.5465099215507507, + "learning_rate": 0.0005835932764357861, + "loss": 3.2763, + "step": 6474 + }, + { + "epoch": 0.32, + "grad_norm": 0.5527744293212891, + "learning_rate": 0.0005835882542776239, + "loss": 3.2369, + "step": 6475 + }, + { + "epoch": 0.32, + "grad_norm": 0.6429268717765808, + "learning_rate": 0.0005835832313725449, + "loss": 3.3702, + "step": 6476 + }, + { + "epoch": 0.32, + "grad_norm": 0.5672469735145569, + "learning_rate": 0.0005835782077205623, + "loss": 3.4388, + "step": 6477 + }, + { + "epoch": 0.32, + "grad_norm": 0.5245232582092285, + "learning_rate": 0.0005835731833216891, + "loss": 3.5464, + "step": 6478 + }, + { + "epoch": 0.32, + "grad_norm": 0.5858415961265564, + "learning_rate": 0.0005835681581759387, + "loss": 3.3392, + "step": 6479 + }, + { + "epoch": 0.32, + "grad_norm": 0.5938719511032104, + "learning_rate": 0.0005835631322833244, + "loss": 3.3036, + "step": 6480 + }, + { + "epoch": 0.32, + "grad_norm": 0.5960376262664795, + "learning_rate": 0.0005835581056438595, + "loss": 3.3089, + "step": 6481 + }, + { + "epoch": 0.32, + "grad_norm": 0.5122845768928528, + "learning_rate": 0.000583553078257557, + "loss": 3.4641, + "step": 6482 + }, + { + "epoch": 0.32, + "grad_norm": 0.5184789299964905, + "learning_rate": 0.0005835480501244302, + "loss": 3.2639, + "step": 6483 + }, + { + "epoch": 0.32, + "grad_norm": 0.5567031502723694, + "learning_rate": 0.0005835430212444927, + "loss": 3.6092, + "step": 6484 + }, + { + "epoch": 0.32, + "grad_norm": 0.6219637393951416, + "learning_rate": 0.0005835379916177572, + "loss": 3.4196, + "step": 6485 + }, + { + "epoch": 0.32, + "grad_norm": 0.5328686833381653, + "learning_rate": 0.0005835329612442372, + "loss": 3.523, + "step": 6486 + }, + { + "epoch": 0.32, + "grad_norm": 0.5400885939598083, + "learning_rate": 0.000583527930123946, + "loss": 3.0661, + "step": 6487 + }, + { + "epoch": 0.32, + "grad_norm": 0.5412009954452515, + "learning_rate": 0.0005835228982568968, + "loss": 3.2229, + "step": 6488 + }, + { + "epoch": 0.32, + "grad_norm": 0.5066078901290894, + "learning_rate": 0.0005835178656431029, + "loss": 3.2135, + "step": 6489 + }, + { + "epoch": 0.32, + "grad_norm": 0.6025939583778381, + "learning_rate": 0.0005835128322825774, + "loss": 3.5742, + "step": 6490 + }, + { + "epoch": 0.32, + "grad_norm": 0.5483773350715637, + "learning_rate": 0.0005835077981753337, + "loss": 3.4047, + "step": 6491 + }, + { + "epoch": 0.32, + "grad_norm": 0.5667142868041992, + "learning_rate": 0.0005835027633213851, + "loss": 3.3673, + "step": 6492 + }, + { + "epoch": 0.32, + "grad_norm": 0.5416460633277893, + "learning_rate": 0.0005834977277207447, + "loss": 3.5253, + "step": 6493 + }, + { + "epoch": 0.32, + "grad_norm": 0.5411655902862549, + "learning_rate": 0.0005834926913734259, + "loss": 3.2967, + "step": 6494 + }, + { + "epoch": 0.32, + "grad_norm": 0.5534458756446838, + "learning_rate": 0.0005834876542794418, + "loss": 3.2466, + "step": 6495 + }, + { + "epoch": 0.32, + "grad_norm": 0.5487003922462463, + "learning_rate": 0.0005834826164388059, + "loss": 3.4169, + "step": 6496 + }, + { + "epoch": 0.32, + "grad_norm": 0.5103386044502258, + "learning_rate": 0.0005834775778515313, + "loss": 3.4592, + "step": 6497 + }, + { + "epoch": 0.32, + "grad_norm": 0.5426819324493408, + "learning_rate": 0.0005834725385176312, + "loss": 3.4416, + "step": 6498 + }, + { + "epoch": 0.32, + "grad_norm": 0.5770543217658997, + "learning_rate": 0.0005834674984371191, + "loss": 3.3999, + "step": 6499 + }, + { + "epoch": 0.32, + "grad_norm": 0.5331725478172302, + "learning_rate": 0.0005834624576100082, + "loss": 3.4512, + "step": 6500 + }, + { + "epoch": 0.32, + "grad_norm": 0.5609892010688782, + "learning_rate": 0.0005834574160363117, + "loss": 3.461, + "step": 6501 + }, + { + "epoch": 0.32, + "grad_norm": 0.5909136533737183, + "learning_rate": 0.000583452373716043, + "loss": 3.3132, + "step": 6502 + }, + { + "epoch": 0.32, + "grad_norm": 0.55914705991745, + "learning_rate": 0.0005834473306492152, + "loss": 3.5681, + "step": 6503 + }, + { + "epoch": 0.32, + "grad_norm": 0.5517709255218506, + "learning_rate": 0.0005834422868358416, + "loss": 3.3071, + "step": 6504 + }, + { + "epoch": 0.32, + "grad_norm": 0.5129667520523071, + "learning_rate": 0.0005834372422759358, + "loss": 3.2362, + "step": 6505 + }, + { + "epoch": 0.32, + "grad_norm": 0.5354728102684021, + "learning_rate": 0.0005834321969695107, + "loss": 3.3641, + "step": 6506 + }, + { + "epoch": 0.32, + "grad_norm": 0.5102858543395996, + "learning_rate": 0.0005834271509165798, + "loss": 3.1551, + "step": 6507 + }, + { + "epoch": 0.32, + "grad_norm": 0.5487684011459351, + "learning_rate": 0.0005834221041171563, + "loss": 3.5077, + "step": 6508 + }, + { + "epoch": 0.32, + "grad_norm": 0.5984534025192261, + "learning_rate": 0.0005834170565712535, + "loss": 3.3783, + "step": 6509 + }, + { + "epoch": 0.32, + "grad_norm": 0.5358244180679321, + "learning_rate": 0.0005834120082788847, + "loss": 3.3893, + "step": 6510 + }, + { + "epoch": 0.32, + "grad_norm": 0.5399655103683472, + "learning_rate": 0.0005834069592400632, + "loss": 3.3156, + "step": 6511 + }, + { + "epoch": 0.32, + "grad_norm": 0.5290285348892212, + "learning_rate": 0.0005834019094548025, + "loss": 3.1687, + "step": 6512 + }, + { + "epoch": 0.32, + "grad_norm": 0.5532265901565552, + "learning_rate": 0.0005833968589231155, + "loss": 3.4857, + "step": 6513 + }, + { + "epoch": 0.32, + "grad_norm": 0.5471756458282471, + "learning_rate": 0.0005833918076450158, + "loss": 3.2192, + "step": 6514 + }, + { + "epoch": 0.32, + "grad_norm": 0.5379656553268433, + "learning_rate": 0.0005833867556205165, + "loss": 3.4739, + "step": 6515 + }, + { + "epoch": 0.32, + "grad_norm": 0.5359077453613281, + "learning_rate": 0.0005833817028496313, + "loss": 3.3655, + "step": 6516 + }, + { + "epoch": 0.32, + "grad_norm": 0.5260913968086243, + "learning_rate": 0.000583376649332373, + "loss": 3.315, + "step": 6517 + }, + { + "epoch": 0.32, + "grad_norm": 0.5894081592559814, + "learning_rate": 0.0005833715950687552, + "loss": 3.1694, + "step": 6518 + }, + { + "epoch": 0.32, + "grad_norm": 0.5539742708206177, + "learning_rate": 0.0005833665400587911, + "loss": 3.3007, + "step": 6519 + }, + { + "epoch": 0.32, + "grad_norm": 0.506983757019043, + "learning_rate": 0.0005833614843024942, + "loss": 3.3166, + "step": 6520 + }, + { + "epoch": 0.32, + "grad_norm": 0.5148934721946716, + "learning_rate": 0.0005833564277998776, + "loss": 3.3476, + "step": 6521 + }, + { + "epoch": 0.32, + "grad_norm": 0.5183430910110474, + "learning_rate": 0.0005833513705509547, + "loss": 3.5182, + "step": 6522 + }, + { + "epoch": 0.32, + "grad_norm": 0.5564296841621399, + "learning_rate": 0.0005833463125557389, + "loss": 3.2431, + "step": 6523 + }, + { + "epoch": 0.32, + "grad_norm": 0.5783804655075073, + "learning_rate": 0.0005833412538142433, + "loss": 3.3268, + "step": 6524 + }, + { + "epoch": 0.32, + "grad_norm": 0.5203677415847778, + "learning_rate": 0.0005833361943264815, + "loss": 3.3579, + "step": 6525 + }, + { + "epoch": 0.32, + "grad_norm": 0.5871049165725708, + "learning_rate": 0.0005833311340924666, + "loss": 3.3097, + "step": 6526 + }, + { + "epoch": 0.32, + "grad_norm": 0.556999146938324, + "learning_rate": 0.0005833260731122121, + "loss": 3.254, + "step": 6527 + }, + { + "epoch": 0.32, + "grad_norm": 0.5999124646186829, + "learning_rate": 0.0005833210113857313, + "loss": 3.439, + "step": 6528 + }, + { + "epoch": 0.32, + "grad_norm": 0.5835153460502625, + "learning_rate": 0.0005833159489130373, + "loss": 3.4526, + "step": 6529 + }, + { + "epoch": 0.32, + "grad_norm": 0.521670401096344, + "learning_rate": 0.0005833108856941438, + "loss": 3.4603, + "step": 6530 + }, + { + "epoch": 0.32, + "grad_norm": 0.5895057916641235, + "learning_rate": 0.0005833058217290638, + "loss": 3.4598, + "step": 6531 + }, + { + "epoch": 0.32, + "grad_norm": 0.5201472043991089, + "learning_rate": 0.0005833007570178109, + "loss": 3.5078, + "step": 6532 + }, + { + "epoch": 0.32, + "grad_norm": 0.5210258364677429, + "learning_rate": 0.0005832956915603982, + "loss": 3.3483, + "step": 6533 + }, + { + "epoch": 0.32, + "grad_norm": 0.5231567621231079, + "learning_rate": 0.0005832906253568392, + "loss": 3.4721, + "step": 6534 + }, + { + "epoch": 0.32, + "grad_norm": 0.5170723795890808, + "learning_rate": 0.0005832855584071474, + "loss": 3.3227, + "step": 6535 + }, + { + "epoch": 0.32, + "grad_norm": 0.5135728716850281, + "learning_rate": 0.0005832804907113358, + "loss": 3.1659, + "step": 6536 + }, + { + "epoch": 0.32, + "grad_norm": 0.5827286243438721, + "learning_rate": 0.0005832754222694179, + "loss": 3.3854, + "step": 6537 + }, + { + "epoch": 0.32, + "grad_norm": 0.5649005174636841, + "learning_rate": 0.000583270353081407, + "loss": 3.2509, + "step": 6538 + }, + { + "epoch": 0.32, + "grad_norm": 0.585883378982544, + "learning_rate": 0.0005832652831473166, + "loss": 3.3499, + "step": 6539 + }, + { + "epoch": 0.32, + "grad_norm": 0.5253201723098755, + "learning_rate": 0.00058326021246716, + "loss": 3.4087, + "step": 6540 + }, + { + "epoch": 0.32, + "grad_norm": 0.4951026141643524, + "learning_rate": 0.0005832551410409505, + "loss": 3.4144, + "step": 6541 + }, + { + "epoch": 0.32, + "grad_norm": 0.5574667453765869, + "learning_rate": 0.0005832500688687014, + "loss": 3.2957, + "step": 6542 + }, + { + "epoch": 0.32, + "grad_norm": 0.5781322121620178, + "learning_rate": 0.0005832449959504262, + "loss": 3.3506, + "step": 6543 + }, + { + "epoch": 0.32, + "grad_norm": 0.6145309209823608, + "learning_rate": 0.0005832399222861381, + "loss": 3.4637, + "step": 6544 + }, + { + "epoch": 0.32, + "grad_norm": 0.5183820724487305, + "learning_rate": 0.0005832348478758507, + "loss": 3.5608, + "step": 6545 + }, + { + "epoch": 0.32, + "grad_norm": 0.5703016519546509, + "learning_rate": 0.0005832297727195771, + "loss": 3.3926, + "step": 6546 + }, + { + "epoch": 0.32, + "grad_norm": 0.5798136591911316, + "learning_rate": 0.0005832246968173309, + "loss": 3.3352, + "step": 6547 + }, + { + "epoch": 0.32, + "grad_norm": 0.5665884017944336, + "learning_rate": 0.0005832196201691252, + "loss": 3.2569, + "step": 6548 + }, + { + "epoch": 0.32, + "grad_norm": 0.5170712471008301, + "learning_rate": 0.0005832145427749737, + "loss": 3.4392, + "step": 6549 + }, + { + "epoch": 0.32, + "grad_norm": 0.5397904515266418, + "learning_rate": 0.0005832094646348894, + "loss": 3.5144, + "step": 6550 + }, + { + "epoch": 0.32, + "grad_norm": 0.6113349795341492, + "learning_rate": 0.000583204385748886, + "loss": 3.3113, + "step": 6551 + }, + { + "epoch": 0.32, + "grad_norm": 0.5789533853530884, + "learning_rate": 0.0005831993061169768, + "loss": 3.2817, + "step": 6552 + }, + { + "epoch": 0.32, + "grad_norm": 0.5456516742706299, + "learning_rate": 0.000583194225739175, + "loss": 3.327, + "step": 6553 + }, + { + "epoch": 0.32, + "grad_norm": 0.5421919822692871, + "learning_rate": 0.0005831891446154942, + "loss": 3.2764, + "step": 6554 + }, + { + "epoch": 0.32, + "grad_norm": 0.5785422325134277, + "learning_rate": 0.0005831840627459476, + "loss": 3.0956, + "step": 6555 + }, + { + "epoch": 0.32, + "grad_norm": 0.5162643194198608, + "learning_rate": 0.0005831789801305488, + "loss": 3.4063, + "step": 6556 + }, + { + "epoch": 0.32, + "grad_norm": 0.6994883418083191, + "learning_rate": 0.0005831738967693109, + "loss": 3.3117, + "step": 6557 + }, + { + "epoch": 0.32, + "grad_norm": 0.5696769952774048, + "learning_rate": 0.0005831688126622477, + "loss": 3.2208, + "step": 6558 + }, + { + "epoch": 0.32, + "grad_norm": 0.5177335739135742, + "learning_rate": 0.0005831637278093722, + "loss": 3.2212, + "step": 6559 + }, + { + "epoch": 0.32, + "grad_norm": 0.5111486315727234, + "learning_rate": 0.000583158642210698, + "loss": 3.4638, + "step": 6560 + }, + { + "epoch": 0.32, + "grad_norm": 0.5310366153717041, + "learning_rate": 0.0005831535558662383, + "loss": 3.2656, + "step": 6561 + }, + { + "epoch": 0.32, + "grad_norm": 0.5311713218688965, + "learning_rate": 0.0005831484687760067, + "loss": 3.6579, + "step": 6562 + }, + { + "epoch": 0.32, + "grad_norm": 0.6105450987815857, + "learning_rate": 0.0005831433809400166, + "loss": 3.3503, + "step": 6563 + }, + { + "epoch": 0.32, + "grad_norm": 0.5762481689453125, + "learning_rate": 0.0005831382923582812, + "loss": 3.4268, + "step": 6564 + }, + { + "epoch": 0.32, + "grad_norm": 0.5664306879043579, + "learning_rate": 0.0005831332030308142, + "loss": 3.2995, + "step": 6565 + }, + { + "epoch": 0.32, + "grad_norm": 0.5523332953453064, + "learning_rate": 0.0005831281129576286, + "loss": 3.0233, + "step": 6566 + }, + { + "epoch": 0.32, + "grad_norm": 0.5818634033203125, + "learning_rate": 0.0005831230221387382, + "loss": 3.4713, + "step": 6567 + }, + { + "epoch": 0.32, + "grad_norm": 0.5426694750785828, + "learning_rate": 0.0005831179305741562, + "loss": 3.3265, + "step": 6568 + }, + { + "epoch": 0.32, + "grad_norm": 0.5816428661346436, + "learning_rate": 0.000583112838263896, + "loss": 3.1057, + "step": 6569 + }, + { + "epoch": 0.32, + "grad_norm": 0.5406796932220459, + "learning_rate": 0.0005831077452079712, + "loss": 2.973, + "step": 6570 + }, + { + "epoch": 0.32, + "grad_norm": 0.5280852317810059, + "learning_rate": 0.000583102651406395, + "loss": 3.3631, + "step": 6571 + }, + { + "epoch": 0.32, + "grad_norm": 0.5578991174697876, + "learning_rate": 0.0005830975568591809, + "loss": 3.3991, + "step": 6572 + }, + { + "epoch": 0.32, + "grad_norm": 0.585803210735321, + "learning_rate": 0.0005830924615663423, + "loss": 3.4668, + "step": 6573 + }, + { + "epoch": 0.32, + "grad_norm": 0.5431290864944458, + "learning_rate": 0.0005830873655278927, + "loss": 3.1047, + "step": 6574 + }, + { + "epoch": 0.32, + "grad_norm": 0.5530149340629578, + "learning_rate": 0.0005830822687438455, + "loss": 3.4276, + "step": 6575 + }, + { + "epoch": 0.32, + "grad_norm": 0.5447601675987244, + "learning_rate": 0.000583077171214214, + "loss": 3.2974, + "step": 6576 + }, + { + "epoch": 0.32, + "grad_norm": 0.5636044144630432, + "learning_rate": 0.0005830720729390118, + "loss": 3.1732, + "step": 6577 + }, + { + "epoch": 0.32, + "grad_norm": 0.5744757056236267, + "learning_rate": 0.0005830669739182522, + "loss": 3.3084, + "step": 6578 + }, + { + "epoch": 0.32, + "grad_norm": 0.5662171840667725, + "learning_rate": 0.0005830618741519486, + "loss": 3.5397, + "step": 6579 + }, + { + "epoch": 0.32, + "grad_norm": 0.5077491998672485, + "learning_rate": 0.0005830567736401145, + "loss": 2.9837, + "step": 6580 + }, + { + "epoch": 0.32, + "grad_norm": 0.5053831338882446, + "learning_rate": 0.0005830516723827634, + "loss": 3.2128, + "step": 6581 + }, + { + "epoch": 0.32, + "grad_norm": 0.5454685688018799, + "learning_rate": 0.0005830465703799086, + "loss": 3.5079, + "step": 6582 + }, + { + "epoch": 0.32, + "grad_norm": 0.5580922961235046, + "learning_rate": 0.0005830414676315636, + "loss": 3.4775, + "step": 6583 + }, + { + "epoch": 0.32, + "grad_norm": 0.5592604279518127, + "learning_rate": 0.000583036364137742, + "loss": 3.2971, + "step": 6584 + }, + { + "epoch": 0.32, + "grad_norm": 0.5692855715751648, + "learning_rate": 0.0005830312598984569, + "loss": 3.1209, + "step": 6585 + }, + { + "epoch": 0.32, + "grad_norm": 0.5731725692749023, + "learning_rate": 0.0005830261549137221, + "loss": 3.2845, + "step": 6586 + }, + { + "epoch": 0.32, + "grad_norm": 0.5284132957458496, + "learning_rate": 0.0005830210491835508, + "loss": 3.438, + "step": 6587 + }, + { + "epoch": 0.32, + "grad_norm": 0.5840345621109009, + "learning_rate": 0.0005830159427079565, + "loss": 3.4824, + "step": 6588 + }, + { + "epoch": 0.32, + "grad_norm": 0.5384155511856079, + "learning_rate": 0.0005830108354869528, + "loss": 3.466, + "step": 6589 + }, + { + "epoch": 0.32, + "grad_norm": 0.5243905782699585, + "learning_rate": 0.0005830057275205529, + "loss": 3.3626, + "step": 6590 + }, + { + "epoch": 0.32, + "grad_norm": 0.4902263581752777, + "learning_rate": 0.0005830006188087704, + "loss": 3.22, + "step": 6591 + }, + { + "epoch": 0.32, + "grad_norm": 0.5410398840904236, + "learning_rate": 0.0005829955093516187, + "loss": 3.317, + "step": 6592 + }, + { + "epoch": 0.32, + "grad_norm": 0.5666494965553284, + "learning_rate": 0.0005829903991491113, + "loss": 3.5118, + "step": 6593 + }, + { + "epoch": 0.32, + "grad_norm": 0.5500527024269104, + "learning_rate": 0.0005829852882012616, + "loss": 3.1609, + "step": 6594 + }, + { + "epoch": 0.32, + "grad_norm": 0.5293498039245605, + "learning_rate": 0.0005829801765080833, + "loss": 3.5884, + "step": 6595 + }, + { + "epoch": 0.32, + "grad_norm": 0.5184808969497681, + "learning_rate": 0.0005829750640695896, + "loss": 3.4994, + "step": 6596 + }, + { + "epoch": 0.32, + "grad_norm": 0.5679014921188354, + "learning_rate": 0.000582969950885794, + "loss": 3.182, + "step": 6597 + }, + { + "epoch": 0.32, + "grad_norm": 0.5611880421638489, + "learning_rate": 0.0005829648369567099, + "loss": 3.3356, + "step": 6598 + }, + { + "epoch": 0.32, + "grad_norm": 0.5921120643615723, + "learning_rate": 0.0005829597222823511, + "loss": 3.2854, + "step": 6599 + }, + { + "epoch": 0.32, + "grad_norm": 0.62581467628479, + "learning_rate": 0.0005829546068627306, + "loss": 3.3484, + "step": 6600 + }, + { + "epoch": 0.32, + "grad_norm": 0.554572343826294, + "learning_rate": 0.0005829494906978624, + "loss": 3.4792, + "step": 6601 + }, + { + "epoch": 0.32, + "grad_norm": 0.535103976726532, + "learning_rate": 0.0005829443737877595, + "loss": 3.5559, + "step": 6602 + }, + { + "epoch": 0.32, + "grad_norm": 0.5401231050491333, + "learning_rate": 0.0005829392561324357, + "loss": 3.3311, + "step": 6603 + }, + { + "epoch": 0.32, + "grad_norm": 0.5672370791435242, + "learning_rate": 0.0005829341377319042, + "loss": 3.452, + "step": 6604 + }, + { + "epoch": 0.32, + "grad_norm": 0.5229573845863342, + "learning_rate": 0.0005829290185861788, + "loss": 3.1934, + "step": 6605 + }, + { + "epoch": 0.32, + "grad_norm": 0.5550198554992676, + "learning_rate": 0.0005829238986952728, + "loss": 3.2653, + "step": 6606 + }, + { + "epoch": 0.32, + "grad_norm": 0.4899026155471802, + "learning_rate": 0.0005829187780591995, + "loss": 3.5498, + "step": 6607 + }, + { + "epoch": 0.32, + "grad_norm": 0.5529863238334656, + "learning_rate": 0.0005829136566779727, + "loss": 3.4041, + "step": 6608 + }, + { + "epoch": 0.32, + "grad_norm": 0.5543391704559326, + "learning_rate": 0.0005829085345516058, + "loss": 3.5464, + "step": 6609 + }, + { + "epoch": 0.32, + "grad_norm": 0.5163125991821289, + "learning_rate": 0.0005829034116801123, + "loss": 3.556, + "step": 6610 + }, + { + "epoch": 0.32, + "grad_norm": 0.5830318331718445, + "learning_rate": 0.0005828982880635057, + "loss": 3.4059, + "step": 6611 + }, + { + "epoch": 0.32, + "grad_norm": 0.5730868577957153, + "learning_rate": 0.0005828931637017992, + "loss": 3.2501, + "step": 6612 + }, + { + "epoch": 0.32, + "grad_norm": 0.5114001631736755, + "learning_rate": 0.0005828880385950068, + "loss": 3.3564, + "step": 6613 + }, + { + "epoch": 0.32, + "grad_norm": 0.5583293437957764, + "learning_rate": 0.0005828829127431418, + "loss": 3.1739, + "step": 6614 + }, + { + "epoch": 0.32, + "grad_norm": 0.5196751952171326, + "learning_rate": 0.0005828777861462174, + "loss": 3.4315, + "step": 6615 + }, + { + "epoch": 0.32, + "grad_norm": 0.5976796746253967, + "learning_rate": 0.0005828726588042475, + "loss": 3.3738, + "step": 6616 + }, + { + "epoch": 0.32, + "grad_norm": 0.5232165455818176, + "learning_rate": 0.0005828675307172455, + "loss": 3.3511, + "step": 6617 + }, + { + "epoch": 0.32, + "grad_norm": 0.5699440240859985, + "learning_rate": 0.0005828624018852247, + "loss": 3.4635, + "step": 6618 + }, + { + "epoch": 0.32, + "grad_norm": 0.5138837099075317, + "learning_rate": 0.0005828572723081989, + "loss": 3.33, + "step": 6619 + }, + { + "epoch": 0.32, + "grad_norm": 0.5367459654808044, + "learning_rate": 0.0005828521419861814, + "loss": 3.3618, + "step": 6620 + }, + { + "epoch": 0.32, + "grad_norm": 0.5628336668014526, + "learning_rate": 0.0005828470109191859, + "loss": 3.4253, + "step": 6621 + }, + { + "epoch": 0.32, + "grad_norm": 0.5682950615882874, + "learning_rate": 0.0005828418791072258, + "loss": 3.3819, + "step": 6622 + }, + { + "epoch": 0.32, + "grad_norm": 0.5145696401596069, + "learning_rate": 0.0005828367465503145, + "loss": 3.409, + "step": 6623 + }, + { + "epoch": 0.32, + "grad_norm": 0.5609208345413208, + "learning_rate": 0.0005828316132484657, + "loss": 3.471, + "step": 6624 + }, + { + "epoch": 0.32, + "grad_norm": 0.514314591884613, + "learning_rate": 0.0005828264792016929, + "loss": 3.4783, + "step": 6625 + }, + { + "epoch": 0.32, + "grad_norm": 0.5764790773391724, + "learning_rate": 0.0005828213444100096, + "loss": 3.5846, + "step": 6626 + }, + { + "epoch": 0.32, + "grad_norm": 0.5133751630783081, + "learning_rate": 0.0005828162088734293, + "loss": 3.4859, + "step": 6627 + }, + { + "epoch": 0.32, + "grad_norm": 0.5235024094581604, + "learning_rate": 0.0005828110725919655, + "loss": 3.3398, + "step": 6628 + }, + { + "epoch": 0.32, + "grad_norm": 0.5416542291641235, + "learning_rate": 0.0005828059355656317, + "loss": 3.279, + "step": 6629 + }, + { + "epoch": 0.32, + "grad_norm": 0.5646631121635437, + "learning_rate": 0.0005828007977944416, + "loss": 3.3315, + "step": 6630 + }, + { + "epoch": 0.32, + "grad_norm": 0.5523773431777954, + "learning_rate": 0.0005827956592784087, + "loss": 3.3439, + "step": 6631 + }, + { + "epoch": 0.33, + "grad_norm": 0.5373170375823975, + "learning_rate": 0.0005827905200175463, + "loss": 3.3712, + "step": 6632 + }, + { + "epoch": 0.33, + "grad_norm": 0.5419214367866516, + "learning_rate": 0.0005827853800118682, + "loss": 3.3848, + "step": 6633 + }, + { + "epoch": 0.33, + "grad_norm": 0.5145456194877625, + "learning_rate": 0.0005827802392613877, + "loss": 3.4817, + "step": 6634 + }, + { + "epoch": 0.33, + "grad_norm": 0.5141541957855225, + "learning_rate": 0.0005827750977661184, + "loss": 3.3013, + "step": 6635 + }, + { + "epoch": 0.33, + "grad_norm": 0.519413948059082, + "learning_rate": 0.0005827699555260742, + "loss": 3.2296, + "step": 6636 + }, + { + "epoch": 0.33, + "grad_norm": 0.5240925550460815, + "learning_rate": 0.0005827648125412683, + "loss": 3.3045, + "step": 6637 + }, + { + "epoch": 0.33, + "grad_norm": 0.5714395046234131, + "learning_rate": 0.0005827596688117141, + "loss": 3.1928, + "step": 6638 + }, + { + "epoch": 0.33, + "grad_norm": 0.5431802868843079, + "learning_rate": 0.0005827545243374256, + "loss": 3.2881, + "step": 6639 + }, + { + "epoch": 0.33, + "grad_norm": 0.542210578918457, + "learning_rate": 0.0005827493791184159, + "loss": 3.4252, + "step": 6640 + }, + { + "epoch": 0.33, + "grad_norm": 0.5310395956039429, + "learning_rate": 0.0005827442331546987, + "loss": 3.4693, + "step": 6641 + }, + { + "epoch": 0.33, + "grad_norm": 0.5362105369567871, + "learning_rate": 0.0005827390864462878, + "loss": 3.5127, + "step": 6642 + }, + { + "epoch": 0.33, + "grad_norm": 0.5233688354492188, + "learning_rate": 0.0005827339389931966, + "loss": 3.5151, + "step": 6643 + }, + { + "epoch": 0.33, + "grad_norm": 0.5598340630531311, + "learning_rate": 0.0005827287907954385, + "loss": 3.3003, + "step": 6644 + }, + { + "epoch": 0.33, + "grad_norm": 0.5226010680198669, + "learning_rate": 0.0005827236418530272, + "loss": 3.3553, + "step": 6645 + }, + { + "epoch": 0.33, + "grad_norm": 0.6050033569335938, + "learning_rate": 0.0005827184921659761, + "loss": 3.1462, + "step": 6646 + }, + { + "epoch": 0.33, + "grad_norm": 0.5034282207489014, + "learning_rate": 0.0005827133417342991, + "loss": 3.5462, + "step": 6647 + }, + { + "epoch": 0.33, + "grad_norm": 0.5270561575889587, + "learning_rate": 0.0005827081905580095, + "loss": 3.5129, + "step": 6648 + }, + { + "epoch": 0.33, + "grad_norm": 0.5257344245910645, + "learning_rate": 0.0005827030386371209, + "loss": 3.3749, + "step": 6649 + }, + { + "epoch": 0.33, + "grad_norm": 0.6640146374702454, + "learning_rate": 0.0005826978859716469, + "loss": 3.0764, + "step": 6650 + }, + { + "epoch": 0.33, + "grad_norm": 0.5421690940856934, + "learning_rate": 0.0005826927325616012, + "loss": 3.2901, + "step": 6651 + }, + { + "epoch": 0.33, + "grad_norm": 0.5615790486335754, + "learning_rate": 0.0005826875784069971, + "loss": 3.3495, + "step": 6652 + }, + { + "epoch": 0.33, + "grad_norm": 0.5812866687774658, + "learning_rate": 0.0005826824235078484, + "loss": 3.6013, + "step": 6653 + }, + { + "epoch": 0.33, + "grad_norm": 0.5673434138298035, + "learning_rate": 0.0005826772678641685, + "loss": 3.313, + "step": 6654 + }, + { + "epoch": 0.33, + "grad_norm": 0.5085522532463074, + "learning_rate": 0.0005826721114759711, + "loss": 3.3503, + "step": 6655 + }, + { + "epoch": 0.33, + "grad_norm": 0.5811455249786377, + "learning_rate": 0.0005826669543432699, + "loss": 3.5832, + "step": 6656 + }, + { + "epoch": 0.33, + "grad_norm": 0.584208607673645, + "learning_rate": 0.0005826617964660783, + "loss": 3.2827, + "step": 6657 + }, + { + "epoch": 0.33, + "grad_norm": 0.547170877456665, + "learning_rate": 0.0005826566378444099, + "loss": 3.2459, + "step": 6658 + }, + { + "epoch": 0.33, + "grad_norm": 0.5642180442810059, + "learning_rate": 0.0005826514784782783, + "loss": 3.3707, + "step": 6659 + }, + { + "epoch": 0.33, + "grad_norm": 0.5796110033988953, + "learning_rate": 0.000582646318367697, + "loss": 3.4914, + "step": 6660 + }, + { + "epoch": 0.33, + "grad_norm": 0.5225422978401184, + "learning_rate": 0.0005826411575126798, + "loss": 3.3454, + "step": 6661 + }, + { + "epoch": 0.33, + "grad_norm": 0.5265496969223022, + "learning_rate": 0.0005826359959132402, + "loss": 3.2391, + "step": 6662 + }, + { + "epoch": 0.33, + "grad_norm": 0.48403891921043396, + "learning_rate": 0.0005826308335693919, + "loss": 3.2319, + "step": 6663 + }, + { + "epoch": 0.33, + "grad_norm": 0.5459921360015869, + "learning_rate": 0.0005826256704811481, + "loss": 3.488, + "step": 6664 + }, + { + "epoch": 0.33, + "grad_norm": 0.524363100528717, + "learning_rate": 0.0005826205066485229, + "loss": 3.5475, + "step": 6665 + }, + { + "epoch": 0.33, + "grad_norm": 0.5568280816078186, + "learning_rate": 0.0005826153420715295, + "loss": 3.3387, + "step": 6666 + }, + { + "epoch": 0.33, + "grad_norm": 0.5115675330162048, + "learning_rate": 0.0005826101767501818, + "loss": 3.5857, + "step": 6667 + }, + { + "epoch": 0.33, + "grad_norm": 0.5281095504760742, + "learning_rate": 0.0005826050106844934, + "loss": 3.3025, + "step": 6668 + }, + { + "epoch": 0.33, + "grad_norm": 0.5385881066322327, + "learning_rate": 0.0005825998438744775, + "loss": 3.244, + "step": 6669 + }, + { + "epoch": 0.33, + "grad_norm": 0.5694405436515808, + "learning_rate": 0.0005825946763201482, + "loss": 3.2063, + "step": 6670 + }, + { + "epoch": 0.33, + "grad_norm": 0.5485799908638, + "learning_rate": 0.0005825895080215188, + "loss": 3.3736, + "step": 6671 + }, + { + "epoch": 0.33, + "grad_norm": 0.5174115300178528, + "learning_rate": 0.000582584338978603, + "loss": 3.3348, + "step": 6672 + }, + { + "epoch": 0.33, + "grad_norm": 0.5306274890899658, + "learning_rate": 0.0005825791691914146, + "loss": 3.4455, + "step": 6673 + }, + { + "epoch": 0.33, + "grad_norm": 0.5434350371360779, + "learning_rate": 0.000582573998659967, + "loss": 3.4376, + "step": 6674 + }, + { + "epoch": 0.33, + "grad_norm": 0.570429801940918, + "learning_rate": 0.0005825688273842738, + "loss": 3.3823, + "step": 6675 + }, + { + "epoch": 0.33, + "grad_norm": 0.5130692720413208, + "learning_rate": 0.0005825636553643488, + "loss": 3.2729, + "step": 6676 + }, + { + "epoch": 0.33, + "grad_norm": 0.5442194938659668, + "learning_rate": 0.0005825584826002054, + "loss": 3.547, + "step": 6677 + }, + { + "epoch": 0.33, + "grad_norm": 0.6229523420333862, + "learning_rate": 0.0005825533090918574, + "loss": 3.3318, + "step": 6678 + }, + { + "epoch": 0.33, + "grad_norm": 0.5505388975143433, + "learning_rate": 0.0005825481348393183, + "loss": 3.4383, + "step": 6679 + }, + { + "epoch": 0.33, + "grad_norm": 0.5161044597625732, + "learning_rate": 0.0005825429598426018, + "loss": 3.3366, + "step": 6680 + }, + { + "epoch": 0.33, + "grad_norm": 0.5270329117774963, + "learning_rate": 0.0005825377841017215, + "loss": 3.296, + "step": 6681 + }, + { + "epoch": 0.33, + "grad_norm": 0.5343664884567261, + "learning_rate": 0.0005825326076166912, + "loss": 3.3817, + "step": 6682 + }, + { + "epoch": 0.33, + "grad_norm": 0.5493436455726624, + "learning_rate": 0.0005825274303875242, + "loss": 3.3233, + "step": 6683 + }, + { + "epoch": 0.33, + "grad_norm": 0.5526965260505676, + "learning_rate": 0.0005825222524142345, + "loss": 3.1901, + "step": 6684 + }, + { + "epoch": 0.33, + "grad_norm": 0.5074105858802795, + "learning_rate": 0.0005825170736968354, + "loss": 3.3946, + "step": 6685 + }, + { + "epoch": 0.33, + "grad_norm": 0.5705122947692871, + "learning_rate": 0.0005825118942353408, + "loss": 3.2233, + "step": 6686 + }, + { + "epoch": 0.33, + "grad_norm": 0.5070473551750183, + "learning_rate": 0.0005825067140297641, + "loss": 3.2129, + "step": 6687 + }, + { + "epoch": 0.33, + "grad_norm": 0.5425854325294495, + "learning_rate": 0.0005825015330801192, + "loss": 3.2749, + "step": 6688 + }, + { + "epoch": 0.33, + "grad_norm": 0.5736327767372131, + "learning_rate": 0.0005824963513864197, + "loss": 3.2822, + "step": 6689 + }, + { + "epoch": 0.33, + "grad_norm": 0.5530392527580261, + "learning_rate": 0.0005824911689486791, + "loss": 3.2252, + "step": 6690 + }, + { + "epoch": 0.33, + "grad_norm": 0.5579785704612732, + "learning_rate": 0.0005824859857669111, + "loss": 3.6146, + "step": 6691 + }, + { + "epoch": 0.33, + "grad_norm": 0.671022891998291, + "learning_rate": 0.0005824808018411294, + "loss": 3.1428, + "step": 6692 + }, + { + "epoch": 0.33, + "grad_norm": 0.5243225693702698, + "learning_rate": 0.0005824756171713477, + "loss": 3.3311, + "step": 6693 + }, + { + "epoch": 0.33, + "grad_norm": 0.6820229291915894, + "learning_rate": 0.0005824704317575795, + "loss": 3.2211, + "step": 6694 + }, + { + "epoch": 0.33, + "grad_norm": 0.5242649912834167, + "learning_rate": 0.0005824652455998385, + "loss": 3.3226, + "step": 6695 + }, + { + "epoch": 0.33, + "grad_norm": 0.5282347202301025, + "learning_rate": 0.0005824600586981386, + "loss": 3.4453, + "step": 6696 + }, + { + "epoch": 0.33, + "grad_norm": 0.5148776173591614, + "learning_rate": 0.0005824548710524931, + "loss": 3.3774, + "step": 6697 + }, + { + "epoch": 0.33, + "grad_norm": 0.5305414795875549, + "learning_rate": 0.0005824496826629159, + "loss": 3.282, + "step": 6698 + }, + { + "epoch": 0.33, + "grad_norm": 1.0000241994857788, + "learning_rate": 0.0005824444935294206, + "loss": 3.4245, + "step": 6699 + }, + { + "epoch": 0.33, + "grad_norm": 0.5258133411407471, + "learning_rate": 0.0005824393036520208, + "loss": 3.4044, + "step": 6700 + }, + { + "epoch": 0.33, + "grad_norm": 0.6157311201095581, + "learning_rate": 0.0005824341130307302, + "loss": 3.3356, + "step": 6701 + }, + { + "epoch": 0.33, + "grad_norm": 0.5688565969467163, + "learning_rate": 0.0005824289216655626, + "loss": 3.3015, + "step": 6702 + }, + { + "epoch": 0.33, + "grad_norm": 0.5846591591835022, + "learning_rate": 0.0005824237295565315, + "loss": 3.5027, + "step": 6703 + }, + { + "epoch": 0.33, + "grad_norm": 0.5892431139945984, + "learning_rate": 0.0005824185367036507, + "loss": 3.3725, + "step": 6704 + }, + { + "epoch": 0.33, + "grad_norm": 0.5611383318901062, + "learning_rate": 0.0005824133431069338, + "loss": 3.0607, + "step": 6705 + }, + { + "epoch": 0.33, + "grad_norm": 0.5581179857254028, + "learning_rate": 0.0005824081487663945, + "loss": 3.1806, + "step": 6706 + }, + { + "epoch": 0.33, + "grad_norm": 0.5461733341217041, + "learning_rate": 0.0005824029536820466, + "loss": 3.1481, + "step": 6707 + }, + { + "epoch": 0.33, + "grad_norm": 0.5615188479423523, + "learning_rate": 0.0005823977578539035, + "loss": 3.3157, + "step": 6708 + }, + { + "epoch": 0.33, + "grad_norm": 0.5543760061264038, + "learning_rate": 0.0005823925612819792, + "loss": 3.1724, + "step": 6709 + }, + { + "epoch": 0.33, + "grad_norm": 0.512069046497345, + "learning_rate": 0.0005823873639662871, + "loss": 3.476, + "step": 6710 + }, + { + "epoch": 0.33, + "grad_norm": 0.7344093918800354, + "learning_rate": 0.0005823821659068411, + "loss": 3.3028, + "step": 6711 + }, + { + "epoch": 0.33, + "grad_norm": 0.5303009152412415, + "learning_rate": 0.0005823769671036549, + "loss": 3.3448, + "step": 6712 + }, + { + "epoch": 0.33, + "grad_norm": 0.5423128008842468, + "learning_rate": 0.0005823717675567419, + "loss": 3.2607, + "step": 6713 + }, + { + "epoch": 0.33, + "grad_norm": 0.5232293009757996, + "learning_rate": 0.0005823665672661161, + "loss": 3.2688, + "step": 6714 + }, + { + "epoch": 0.33, + "grad_norm": 0.5623641610145569, + "learning_rate": 0.0005823613662317912, + "loss": 3.2962, + "step": 6715 + }, + { + "epoch": 0.33, + "grad_norm": 0.6031582355499268, + "learning_rate": 0.0005823561644537807, + "loss": 3.3641, + "step": 6716 + }, + { + "epoch": 0.33, + "grad_norm": 0.5149546265602112, + "learning_rate": 0.0005823509619320986, + "loss": 3.4208, + "step": 6717 + }, + { + "epoch": 0.33, + "grad_norm": 0.5367235541343689, + "learning_rate": 0.0005823457586667582, + "loss": 3.2141, + "step": 6718 + }, + { + "epoch": 0.33, + "grad_norm": 0.5604205131530762, + "learning_rate": 0.0005823405546577735, + "loss": 3.5344, + "step": 6719 + }, + { + "epoch": 0.33, + "grad_norm": 0.5116593241691589, + "learning_rate": 0.000582335349905158, + "loss": 3.5164, + "step": 6720 + }, + { + "epoch": 0.33, + "grad_norm": 0.5447813272476196, + "learning_rate": 0.0005823301444089256, + "loss": 3.4071, + "step": 6721 + }, + { + "epoch": 0.33, + "grad_norm": 0.5833255052566528, + "learning_rate": 0.0005823249381690899, + "loss": 3.3173, + "step": 6722 + }, + { + "epoch": 0.33, + "grad_norm": 0.5849522352218628, + "learning_rate": 0.0005823197311856647, + "loss": 3.1083, + "step": 6723 + }, + { + "epoch": 0.33, + "grad_norm": 0.5260932445526123, + "learning_rate": 0.0005823145234586638, + "loss": 3.3465, + "step": 6724 + }, + { + "epoch": 0.33, + "grad_norm": 0.5599755644798279, + "learning_rate": 0.0005823093149881005, + "loss": 3.3558, + "step": 6725 + }, + { + "epoch": 0.33, + "grad_norm": 0.5124487280845642, + "learning_rate": 0.0005823041057739889, + "loss": 3.589, + "step": 6726 + }, + { + "epoch": 0.33, + "grad_norm": 0.5424213409423828, + "learning_rate": 0.0005822988958163427, + "loss": 3.3946, + "step": 6727 + }, + { + "epoch": 0.33, + "grad_norm": 0.5511559844017029, + "learning_rate": 0.0005822936851151755, + "loss": 3.3763, + "step": 6728 + }, + { + "epoch": 0.33, + "grad_norm": 0.5458391904830933, + "learning_rate": 0.0005822884736705011, + "loss": 3.2826, + "step": 6729 + }, + { + "epoch": 0.33, + "grad_norm": 0.5722623467445374, + "learning_rate": 0.000582283261482333, + "loss": 3.2023, + "step": 6730 + }, + { + "epoch": 0.33, + "grad_norm": 0.5854646563529968, + "learning_rate": 0.0005822780485506854, + "loss": 3.2832, + "step": 6731 + }, + { + "epoch": 0.33, + "grad_norm": 0.5965773463249207, + "learning_rate": 0.0005822728348755716, + "loss": 3.1899, + "step": 6732 + }, + { + "epoch": 0.33, + "grad_norm": 0.5229557156562805, + "learning_rate": 0.0005822676204570054, + "loss": 3.3824, + "step": 6733 + }, + { + "epoch": 0.33, + "grad_norm": 0.527096688747406, + "learning_rate": 0.0005822624052950006, + "loss": 3.4582, + "step": 6734 + }, + { + "epoch": 0.33, + "grad_norm": 0.5202319025993347, + "learning_rate": 0.0005822571893895711, + "loss": 3.4314, + "step": 6735 + }, + { + "epoch": 0.33, + "grad_norm": 0.5747625827789307, + "learning_rate": 0.0005822519727407304, + "loss": 3.345, + "step": 6736 + }, + { + "epoch": 0.33, + "grad_norm": 0.498573362827301, + "learning_rate": 0.0005822467553484923, + "loss": 3.4014, + "step": 6737 + }, + { + "epoch": 0.33, + "grad_norm": 0.5384262800216675, + "learning_rate": 0.0005822415372128706, + "loss": 3.3868, + "step": 6738 + }, + { + "epoch": 0.33, + "grad_norm": 0.5200839042663574, + "learning_rate": 0.000582236318333879, + "loss": 3.3441, + "step": 6739 + }, + { + "epoch": 0.33, + "grad_norm": 0.5176569819450378, + "learning_rate": 0.0005822310987115314, + "loss": 3.2837, + "step": 6740 + }, + { + "epoch": 0.33, + "grad_norm": 0.6018693447113037, + "learning_rate": 0.0005822258783458412, + "loss": 3.2213, + "step": 6741 + }, + { + "epoch": 0.33, + "grad_norm": 0.5896228551864624, + "learning_rate": 0.0005822206572368224, + "loss": 3.339, + "step": 6742 + }, + { + "epoch": 0.33, + "grad_norm": 0.5164976119995117, + "learning_rate": 0.0005822154353844887, + "loss": 3.4402, + "step": 6743 + }, + { + "epoch": 0.33, + "grad_norm": 0.5089805722236633, + "learning_rate": 0.0005822102127888539, + "loss": 3.1868, + "step": 6744 + }, + { + "epoch": 0.33, + "grad_norm": 0.5495955348014832, + "learning_rate": 0.0005822049894499316, + "loss": 3.5313, + "step": 6745 + }, + { + "epoch": 0.33, + "grad_norm": 0.5287194848060608, + "learning_rate": 0.0005821997653677359, + "loss": 3.1508, + "step": 6746 + }, + { + "epoch": 0.33, + "grad_norm": 0.6048151254653931, + "learning_rate": 0.0005821945405422802, + "loss": 3.1749, + "step": 6747 + }, + { + "epoch": 0.33, + "grad_norm": 0.5822990536689758, + "learning_rate": 0.0005821893149735784, + "loss": 3.3297, + "step": 6748 + }, + { + "epoch": 0.33, + "grad_norm": 0.518795907497406, + "learning_rate": 0.0005821840886616441, + "loss": 3.3508, + "step": 6749 + }, + { + "epoch": 0.33, + "grad_norm": 0.608176052570343, + "learning_rate": 0.0005821788616064914, + "loss": 3.2698, + "step": 6750 + }, + { + "epoch": 0.33, + "grad_norm": 0.5752763748168945, + "learning_rate": 0.0005821736338081339, + "loss": 3.3454, + "step": 6751 + }, + { + "epoch": 0.33, + "grad_norm": 0.5169047117233276, + "learning_rate": 0.0005821684052665851, + "loss": 3.609, + "step": 6752 + }, + { + "epoch": 0.33, + "grad_norm": 0.529773473739624, + "learning_rate": 0.0005821631759818593, + "loss": 3.369, + "step": 6753 + }, + { + "epoch": 0.33, + "grad_norm": 0.543318510055542, + "learning_rate": 0.00058215794595397, + "loss": 3.4376, + "step": 6754 + }, + { + "epoch": 0.33, + "grad_norm": 0.6167493462562561, + "learning_rate": 0.0005821527151829308, + "loss": 3.2169, + "step": 6755 + }, + { + "epoch": 0.33, + "grad_norm": 0.5135380625724792, + "learning_rate": 0.0005821474836687559, + "loss": 3.2988, + "step": 6756 + }, + { + "epoch": 0.33, + "grad_norm": 0.597224235534668, + "learning_rate": 0.0005821422514114585, + "loss": 3.3903, + "step": 6757 + }, + { + "epoch": 0.33, + "grad_norm": 0.5847658514976501, + "learning_rate": 0.0005821370184110529, + "loss": 3.6174, + "step": 6758 + }, + { + "epoch": 0.33, + "grad_norm": 0.5715068578720093, + "learning_rate": 0.0005821317846675527, + "loss": 3.3343, + "step": 6759 + }, + { + "epoch": 0.33, + "grad_norm": 0.5231801271438599, + "learning_rate": 0.0005821265501809716, + "loss": 3.3706, + "step": 6760 + }, + { + "epoch": 0.33, + "grad_norm": 0.5947988033294678, + "learning_rate": 0.0005821213149513235, + "loss": 3.2535, + "step": 6761 + }, + { + "epoch": 0.33, + "grad_norm": 0.5078067779541016, + "learning_rate": 0.0005821160789786222, + "loss": 3.412, + "step": 6762 + }, + { + "epoch": 0.33, + "grad_norm": 0.5282332897186279, + "learning_rate": 0.0005821108422628813, + "loss": 3.2963, + "step": 6763 + }, + { + "epoch": 0.33, + "grad_norm": 0.5280579924583435, + "learning_rate": 0.0005821056048041149, + "loss": 3.4438, + "step": 6764 + }, + { + "epoch": 0.33, + "grad_norm": 0.5364978909492493, + "learning_rate": 0.0005821003666023366, + "loss": 3.3856, + "step": 6765 + }, + { + "epoch": 0.33, + "grad_norm": 0.5377125144004822, + "learning_rate": 0.0005820951276575601, + "loss": 3.4609, + "step": 6766 + }, + { + "epoch": 0.33, + "grad_norm": 0.5311643481254578, + "learning_rate": 0.0005820898879697994, + "loss": 3.3563, + "step": 6767 + }, + { + "epoch": 0.33, + "grad_norm": 0.5513588190078735, + "learning_rate": 0.0005820846475390683, + "loss": 3.4952, + "step": 6768 + }, + { + "epoch": 0.33, + "grad_norm": 0.5474352240562439, + "learning_rate": 0.0005820794063653805, + "loss": 3.6479, + "step": 6769 + }, + { + "epoch": 0.33, + "grad_norm": 0.5823585987091064, + "learning_rate": 0.0005820741644487497, + "loss": 3.1314, + "step": 6770 + }, + { + "epoch": 0.33, + "grad_norm": 0.5332926511764526, + "learning_rate": 0.0005820689217891899, + "loss": 3.4825, + "step": 6771 + }, + { + "epoch": 0.33, + "grad_norm": 0.5201532244682312, + "learning_rate": 0.0005820636783867149, + "loss": 3.4215, + "step": 6772 + }, + { + "epoch": 0.33, + "grad_norm": 0.573883056640625, + "learning_rate": 0.0005820584342413385, + "loss": 3.3778, + "step": 6773 + }, + { + "epoch": 0.33, + "grad_norm": 0.6504254937171936, + "learning_rate": 0.0005820531893530744, + "loss": 3.3315, + "step": 6774 + }, + { + "epoch": 0.33, + "grad_norm": 0.5481976270675659, + "learning_rate": 0.0005820479437219365, + "loss": 3.4299, + "step": 6775 + }, + { + "epoch": 0.33, + "grad_norm": 0.5290274024009705, + "learning_rate": 0.0005820426973479386, + "loss": 3.4469, + "step": 6776 + }, + { + "epoch": 0.33, + "grad_norm": 0.5465989112854004, + "learning_rate": 0.0005820374502310945, + "loss": 3.4553, + "step": 6777 + }, + { + "epoch": 0.33, + "grad_norm": 0.5408307313919067, + "learning_rate": 0.0005820322023714179, + "loss": 3.2675, + "step": 6778 + }, + { + "epoch": 0.33, + "grad_norm": 0.5560708045959473, + "learning_rate": 0.0005820269537689229, + "loss": 3.235, + "step": 6779 + }, + { + "epoch": 0.33, + "grad_norm": 0.5538557171821594, + "learning_rate": 0.0005820217044236232, + "loss": 3.3199, + "step": 6780 + }, + { + "epoch": 0.33, + "grad_norm": 0.5382668972015381, + "learning_rate": 0.0005820164543355326, + "loss": 3.5514, + "step": 6781 + }, + { + "epoch": 0.33, + "grad_norm": 0.5423834323883057, + "learning_rate": 0.0005820112035046648, + "loss": 3.2956, + "step": 6782 + }, + { + "epoch": 0.33, + "grad_norm": 0.5152990221977234, + "learning_rate": 0.0005820059519310339, + "loss": 3.2701, + "step": 6783 + }, + { + "epoch": 0.33, + "grad_norm": 0.5263355374336243, + "learning_rate": 0.0005820006996146536, + "loss": 3.4816, + "step": 6784 + }, + { + "epoch": 0.33, + "grad_norm": 0.5338313579559326, + "learning_rate": 0.0005819954465555377, + "loss": 3.2517, + "step": 6785 + }, + { + "epoch": 0.33, + "grad_norm": 0.5005643367767334, + "learning_rate": 0.0005819901927537, + "loss": 3.1299, + "step": 6786 + }, + { + "epoch": 0.33, + "grad_norm": 0.5044447183609009, + "learning_rate": 0.0005819849382091545, + "loss": 3.501, + "step": 6787 + }, + { + "epoch": 0.33, + "grad_norm": 0.5595723390579224, + "learning_rate": 0.0005819796829219149, + "loss": 3.4216, + "step": 6788 + }, + { + "epoch": 0.33, + "grad_norm": 0.5340045690536499, + "learning_rate": 0.0005819744268919951, + "loss": 3.4126, + "step": 6789 + }, + { + "epoch": 0.33, + "grad_norm": 0.5521731972694397, + "learning_rate": 0.000581969170119409, + "loss": 3.5361, + "step": 6790 + }, + { + "epoch": 0.33, + "grad_norm": 0.546519935131073, + "learning_rate": 0.0005819639126041702, + "loss": 3.5096, + "step": 6791 + }, + { + "epoch": 0.33, + "grad_norm": 0.5405811071395874, + "learning_rate": 0.0005819586543462928, + "loss": 3.2427, + "step": 6792 + }, + { + "epoch": 0.33, + "grad_norm": 0.5262058973312378, + "learning_rate": 0.0005819533953457906, + "loss": 3.4118, + "step": 6793 + }, + { + "epoch": 0.33, + "grad_norm": 0.5601271986961365, + "learning_rate": 0.0005819481356026774, + "loss": 3.2993, + "step": 6794 + }, + { + "epoch": 0.33, + "grad_norm": 0.5209169387817383, + "learning_rate": 0.0005819428751169671, + "loss": 3.2435, + "step": 6795 + }, + { + "epoch": 0.33, + "grad_norm": 0.5251837968826294, + "learning_rate": 0.0005819376138886734, + "loss": 3.2903, + "step": 6796 + }, + { + "epoch": 0.33, + "grad_norm": 0.5272718667984009, + "learning_rate": 0.0005819323519178105, + "loss": 3.3486, + "step": 6797 + }, + { + "epoch": 0.33, + "grad_norm": 0.6075021624565125, + "learning_rate": 0.0005819270892043919, + "loss": 3.413, + "step": 6798 + }, + { + "epoch": 0.33, + "grad_norm": 0.5117523670196533, + "learning_rate": 0.0005819218257484315, + "loss": 3.4657, + "step": 6799 + }, + { + "epoch": 0.33, + "grad_norm": 0.4947341978549957, + "learning_rate": 0.0005819165615499435, + "loss": 3.5324, + "step": 6800 + }, + { + "epoch": 0.33, + "grad_norm": 0.5065931081771851, + "learning_rate": 0.0005819112966089415, + "loss": 3.4816, + "step": 6801 + }, + { + "epoch": 0.33, + "grad_norm": 0.5459311604499817, + "learning_rate": 0.0005819060309254393, + "loss": 3.2451, + "step": 6802 + }, + { + "epoch": 0.33, + "grad_norm": 0.5343260169029236, + "learning_rate": 0.0005819007644994509, + "loss": 3.4584, + "step": 6803 + }, + { + "epoch": 0.33, + "grad_norm": 0.514397382736206, + "learning_rate": 0.0005818954973309901, + "loss": 3.4568, + "step": 6804 + }, + { + "epoch": 0.33, + "grad_norm": 0.5538575053215027, + "learning_rate": 0.0005818902294200708, + "loss": 3.208, + "step": 6805 + }, + { + "epoch": 0.33, + "grad_norm": 0.5236798524856567, + "learning_rate": 0.0005818849607667069, + "loss": 3.3301, + "step": 6806 + }, + { + "epoch": 0.33, + "grad_norm": 0.5536655187606812, + "learning_rate": 0.0005818796913709123, + "loss": 3.4512, + "step": 6807 + }, + { + "epoch": 0.33, + "grad_norm": 0.5622990727424622, + "learning_rate": 0.0005818744212327009, + "loss": 3.5709, + "step": 6808 + }, + { + "epoch": 0.33, + "grad_norm": 0.5513145327568054, + "learning_rate": 0.0005818691503520865, + "loss": 3.4054, + "step": 6809 + }, + { + "epoch": 0.33, + "grad_norm": 0.5191030502319336, + "learning_rate": 0.0005818638787290829, + "loss": 3.3703, + "step": 6810 + }, + { + "epoch": 0.33, + "grad_norm": 0.5005136132240295, + "learning_rate": 0.0005818586063637041, + "loss": 3.3643, + "step": 6811 + }, + { + "epoch": 0.33, + "grad_norm": 0.5312431454658508, + "learning_rate": 0.0005818533332559642, + "loss": 3.4871, + "step": 6812 + }, + { + "epoch": 0.33, + "grad_norm": 0.5487942099571228, + "learning_rate": 0.0005818480594058766, + "loss": 3.3361, + "step": 6813 + }, + { + "epoch": 0.33, + "grad_norm": 0.5031387209892273, + "learning_rate": 0.0005818427848134556, + "loss": 3.5931, + "step": 6814 + }, + { + "epoch": 0.33, + "grad_norm": 0.5074050426483154, + "learning_rate": 0.0005818375094787148, + "loss": 3.3569, + "step": 6815 + }, + { + "epoch": 0.33, + "grad_norm": 0.5995979905128479, + "learning_rate": 0.0005818322334016684, + "loss": 3.412, + "step": 6816 + }, + { + "epoch": 0.33, + "grad_norm": 0.5569592118263245, + "learning_rate": 0.0005818269565823301, + "loss": 3.4492, + "step": 6817 + }, + { + "epoch": 0.33, + "grad_norm": 0.5373505353927612, + "learning_rate": 0.0005818216790207137, + "loss": 3.3863, + "step": 6818 + }, + { + "epoch": 0.33, + "grad_norm": 0.513282835483551, + "learning_rate": 0.0005818164007168335, + "loss": 3.1659, + "step": 6819 + }, + { + "epoch": 0.33, + "grad_norm": 0.5331324338912964, + "learning_rate": 0.0005818111216707029, + "loss": 3.1737, + "step": 6820 + }, + { + "epoch": 0.33, + "grad_norm": 0.5432617664337158, + "learning_rate": 0.0005818058418823361, + "loss": 3.5055, + "step": 6821 + }, + { + "epoch": 0.33, + "grad_norm": 0.5340288877487183, + "learning_rate": 0.000581800561351747, + "loss": 3.3134, + "step": 6822 + }, + { + "epoch": 0.33, + "grad_norm": 0.523755669593811, + "learning_rate": 0.0005817952800789494, + "loss": 3.41, + "step": 6823 + }, + { + "epoch": 0.33, + "grad_norm": 0.5807611346244812, + "learning_rate": 0.0005817899980639573, + "loss": 3.3649, + "step": 6824 + }, + { + "epoch": 0.33, + "grad_norm": 0.5344754457473755, + "learning_rate": 0.0005817847153067846, + "loss": 3.3996, + "step": 6825 + }, + { + "epoch": 0.33, + "grad_norm": 0.5475096106529236, + "learning_rate": 0.0005817794318074452, + "loss": 3.2395, + "step": 6826 + }, + { + "epoch": 0.33, + "grad_norm": 0.5475925803184509, + "learning_rate": 0.0005817741475659529, + "loss": 3.0788, + "step": 6827 + }, + { + "epoch": 0.33, + "grad_norm": 0.5353468060493469, + "learning_rate": 0.0005817688625823218, + "loss": 3.3761, + "step": 6828 + }, + { + "epoch": 0.33, + "grad_norm": 0.5556272864341736, + "learning_rate": 0.0005817635768565657, + "loss": 3.4141, + "step": 6829 + }, + { + "epoch": 0.33, + "grad_norm": 0.5247024297714233, + "learning_rate": 0.0005817582903886986, + "loss": 3.2435, + "step": 6830 + }, + { + "epoch": 0.33, + "grad_norm": 0.5360187292098999, + "learning_rate": 0.0005817530031787344, + "loss": 3.3121, + "step": 6831 + }, + { + "epoch": 0.33, + "grad_norm": 0.5367879867553711, + "learning_rate": 0.000581747715226687, + "loss": 3.3657, + "step": 6832 + }, + { + "epoch": 0.33, + "grad_norm": 0.5176432728767395, + "learning_rate": 0.0005817424265325703, + "loss": 3.3106, + "step": 6833 + }, + { + "epoch": 0.33, + "grad_norm": 0.5033512115478516, + "learning_rate": 0.0005817371370963984, + "loss": 3.3319, + "step": 6834 + }, + { + "epoch": 0.33, + "grad_norm": 0.5678747892379761, + "learning_rate": 0.0005817318469181849, + "loss": 3.3628, + "step": 6835 + }, + { + "epoch": 0.34, + "grad_norm": 0.5114911794662476, + "learning_rate": 0.000581726555997944, + "loss": 3.1957, + "step": 6836 + }, + { + "epoch": 0.34, + "grad_norm": 0.5610429644584656, + "learning_rate": 0.0005817212643356897, + "loss": 3.1318, + "step": 6837 + }, + { + "epoch": 0.34, + "grad_norm": 0.5097834467887878, + "learning_rate": 0.0005817159719314358, + "loss": 3.2942, + "step": 6838 + }, + { + "epoch": 0.34, + "grad_norm": 0.5260331630706787, + "learning_rate": 0.0005817106787851962, + "loss": 3.3047, + "step": 6839 + }, + { + "epoch": 0.34, + "grad_norm": 0.5727056860923767, + "learning_rate": 0.0005817053848969848, + "loss": 3.4154, + "step": 6840 + }, + { + "epoch": 0.34, + "grad_norm": 0.5135591626167297, + "learning_rate": 0.0005817000902668157, + "loss": 3.5154, + "step": 6841 + }, + { + "epoch": 0.34, + "grad_norm": 0.5463142395019531, + "learning_rate": 0.0005816947948947028, + "loss": 3.2277, + "step": 6842 + }, + { + "epoch": 0.34, + "grad_norm": 0.5426071882247925, + "learning_rate": 0.00058168949878066, + "loss": 3.4398, + "step": 6843 + }, + { + "epoch": 0.34, + "grad_norm": 0.5238614082336426, + "learning_rate": 0.0005816842019247012, + "loss": 3.3057, + "step": 6844 + }, + { + "epoch": 0.34, + "grad_norm": 0.4938346743583679, + "learning_rate": 0.0005816789043268406, + "loss": 3.3241, + "step": 6845 + }, + { + "epoch": 0.34, + "grad_norm": 0.5384917259216309, + "learning_rate": 0.0005816736059870918, + "loss": 3.2526, + "step": 6846 + }, + { + "epoch": 0.34, + "grad_norm": 0.5270363092422485, + "learning_rate": 0.000581668306905469, + "loss": 3.3041, + "step": 6847 + }, + { + "epoch": 0.34, + "grad_norm": 0.508563220500946, + "learning_rate": 0.0005816630070819861, + "loss": 3.3541, + "step": 6848 + }, + { + "epoch": 0.34, + "grad_norm": 0.5612684488296509, + "learning_rate": 0.000581657706516657, + "loss": 3.463, + "step": 6849 + }, + { + "epoch": 0.34, + "grad_norm": 0.5991032123565674, + "learning_rate": 0.0005816524052094957, + "loss": 3.1475, + "step": 6850 + }, + { + "epoch": 0.34, + "grad_norm": 0.5217996835708618, + "learning_rate": 0.0005816471031605163, + "loss": 3.565, + "step": 6851 + }, + { + "epoch": 0.34, + "grad_norm": 0.5394172072410583, + "learning_rate": 0.0005816418003697324, + "loss": 3.2722, + "step": 6852 + }, + { + "epoch": 0.34, + "grad_norm": 0.5643725395202637, + "learning_rate": 0.0005816364968371584, + "loss": 3.2096, + "step": 6853 + }, + { + "epoch": 0.34, + "grad_norm": 0.5298509001731873, + "learning_rate": 0.000581631192562808, + "loss": 3.379, + "step": 6854 + }, + { + "epoch": 0.34, + "grad_norm": 0.5548331141471863, + "learning_rate": 0.0005816258875466953, + "loss": 3.1849, + "step": 6855 + }, + { + "epoch": 0.34, + "grad_norm": 0.4925016760826111, + "learning_rate": 0.0005816205817888342, + "loss": 3.3093, + "step": 6856 + }, + { + "epoch": 0.34, + "grad_norm": 0.5486854910850525, + "learning_rate": 0.0005816152752892387, + "loss": 3.3507, + "step": 6857 + }, + { + "epoch": 0.34, + "grad_norm": 0.5491653084754944, + "learning_rate": 0.0005816099680479226, + "loss": 3.3045, + "step": 6858 + }, + { + "epoch": 0.34, + "grad_norm": 0.5340956449508667, + "learning_rate": 0.0005816046600649002, + "loss": 3.3908, + "step": 6859 + }, + { + "epoch": 0.34, + "grad_norm": 0.5760660767555237, + "learning_rate": 0.0005815993513401853, + "loss": 3.2897, + "step": 6860 + }, + { + "epoch": 0.34, + "grad_norm": 0.5405675768852234, + "learning_rate": 0.0005815940418737918, + "loss": 3.3984, + "step": 6861 + }, + { + "epoch": 0.34, + "grad_norm": 0.5545721054077148, + "learning_rate": 0.0005815887316657339, + "loss": 3.6153, + "step": 6862 + }, + { + "epoch": 0.34, + "grad_norm": 0.5303523540496826, + "learning_rate": 0.0005815834207160255, + "loss": 3.4198, + "step": 6863 + }, + { + "epoch": 0.34, + "grad_norm": 0.5388019680976868, + "learning_rate": 0.0005815781090246805, + "loss": 3.3454, + "step": 6864 + }, + { + "epoch": 0.34, + "grad_norm": 0.5352962613105774, + "learning_rate": 0.0005815727965917129, + "loss": 3.2491, + "step": 6865 + }, + { + "epoch": 0.34, + "grad_norm": 0.5232275128364563, + "learning_rate": 0.0005815674834171368, + "loss": 3.3865, + "step": 6866 + }, + { + "epoch": 0.34, + "grad_norm": 0.5305415987968445, + "learning_rate": 0.0005815621695009662, + "loss": 3.25, + "step": 6867 + }, + { + "epoch": 0.34, + "grad_norm": 0.5452413558959961, + "learning_rate": 0.000581556854843215, + "loss": 3.3942, + "step": 6868 + }, + { + "epoch": 0.34, + "grad_norm": 0.5450773239135742, + "learning_rate": 0.0005815515394438971, + "loss": 3.3143, + "step": 6869 + }, + { + "epoch": 0.34, + "grad_norm": 0.541786789894104, + "learning_rate": 0.0005815462233030267, + "loss": 3.5092, + "step": 6870 + }, + { + "epoch": 0.34, + "grad_norm": 0.5301926136016846, + "learning_rate": 0.0005815409064206178, + "loss": 3.5044, + "step": 6871 + }, + { + "epoch": 0.34, + "grad_norm": 0.5375513434410095, + "learning_rate": 0.0005815355887966843, + "loss": 3.2972, + "step": 6872 + }, + { + "epoch": 0.34, + "grad_norm": 0.5402582883834839, + "learning_rate": 0.0005815302704312403, + "loss": 3.2579, + "step": 6873 + }, + { + "epoch": 0.34, + "grad_norm": 0.5533377528190613, + "learning_rate": 0.0005815249513242996, + "loss": 3.5222, + "step": 6874 + }, + { + "epoch": 0.34, + "grad_norm": 0.5436393022537231, + "learning_rate": 0.0005815196314758765, + "loss": 3.1836, + "step": 6875 + }, + { + "epoch": 0.34, + "grad_norm": 0.5396994948387146, + "learning_rate": 0.0005815143108859848, + "loss": 3.374, + "step": 6876 + }, + { + "epoch": 0.34, + "grad_norm": 0.5397756099700928, + "learning_rate": 0.0005815089895546386, + "loss": 3.2155, + "step": 6877 + }, + { + "epoch": 0.34, + "grad_norm": 0.5924909710884094, + "learning_rate": 0.0005815036674818519, + "loss": 3.35, + "step": 6878 + }, + { + "epoch": 0.34, + "grad_norm": 0.5628549456596375, + "learning_rate": 0.0005814983446676388, + "loss": 3.44, + "step": 6879 + }, + { + "epoch": 0.34, + "grad_norm": 0.5356522798538208, + "learning_rate": 0.0005814930211120132, + "loss": 3.2106, + "step": 6880 + }, + { + "epoch": 0.34, + "grad_norm": 0.5571137070655823, + "learning_rate": 0.0005814876968149891, + "loss": 3.4285, + "step": 6881 + }, + { + "epoch": 0.34, + "grad_norm": 0.579041600227356, + "learning_rate": 0.0005814823717765806, + "loss": 3.3484, + "step": 6882 + }, + { + "epoch": 0.34, + "grad_norm": 0.4999987781047821, + "learning_rate": 0.0005814770459968016, + "loss": 3.3225, + "step": 6883 + }, + { + "epoch": 0.34, + "grad_norm": 0.5245411396026611, + "learning_rate": 0.0005814717194756663, + "loss": 3.3605, + "step": 6884 + }, + { + "epoch": 0.34, + "grad_norm": 0.5306369066238403, + "learning_rate": 0.0005814663922131888, + "loss": 3.4097, + "step": 6885 + }, + { + "epoch": 0.34, + "grad_norm": 0.5047025084495544, + "learning_rate": 0.0005814610642093828, + "loss": 3.287, + "step": 6886 + }, + { + "epoch": 0.34, + "grad_norm": 0.5388005971908569, + "learning_rate": 0.0005814557354642627, + "loss": 3.2558, + "step": 6887 + }, + { + "epoch": 0.34, + "grad_norm": 0.5090034008026123, + "learning_rate": 0.0005814504059778422, + "loss": 3.2232, + "step": 6888 + }, + { + "epoch": 0.34, + "grad_norm": 0.523379921913147, + "learning_rate": 0.0005814450757501355, + "loss": 3.2021, + "step": 6889 + }, + { + "epoch": 0.34, + "grad_norm": 0.5205253958702087, + "learning_rate": 0.0005814397447811568, + "loss": 3.5486, + "step": 6890 + }, + { + "epoch": 0.34, + "grad_norm": 0.5232972502708435, + "learning_rate": 0.0005814344130709198, + "loss": 3.3859, + "step": 6891 + }, + { + "epoch": 0.34, + "grad_norm": 0.548040509223938, + "learning_rate": 0.0005814290806194388, + "loss": 3.2865, + "step": 6892 + }, + { + "epoch": 0.34, + "grad_norm": 0.5398008227348328, + "learning_rate": 0.0005814237474267277, + "loss": 3.3062, + "step": 6893 + }, + { + "epoch": 0.34, + "grad_norm": 0.5877861380577087, + "learning_rate": 0.0005814184134928008, + "loss": 3.3348, + "step": 6894 + }, + { + "epoch": 0.34, + "grad_norm": 0.5505077242851257, + "learning_rate": 0.0005814130788176718, + "loss": 3.3898, + "step": 6895 + }, + { + "epoch": 0.34, + "grad_norm": 0.5634656548500061, + "learning_rate": 0.0005814077434013549, + "loss": 3.2064, + "step": 6896 + }, + { + "epoch": 0.34, + "grad_norm": 0.5058372616767883, + "learning_rate": 0.0005814024072438642, + "loss": 3.3888, + "step": 6897 + }, + { + "epoch": 0.34, + "grad_norm": 0.5141379237174988, + "learning_rate": 0.0005813970703452138, + "loss": 3.174, + "step": 6898 + }, + { + "epoch": 0.34, + "grad_norm": 0.5080946683883667, + "learning_rate": 0.0005813917327054175, + "loss": 3.1621, + "step": 6899 + }, + { + "epoch": 0.34, + "grad_norm": 0.533840000629425, + "learning_rate": 0.0005813863943244897, + "loss": 3.4227, + "step": 6900 + }, + { + "epoch": 0.34, + "grad_norm": 0.5628681182861328, + "learning_rate": 0.0005813810552024441, + "loss": 3.1773, + "step": 6901 + }, + { + "epoch": 0.34, + "grad_norm": 0.5388134717941284, + "learning_rate": 0.0005813757153392951, + "loss": 3.5305, + "step": 6902 + }, + { + "epoch": 0.34, + "grad_norm": 0.5274122953414917, + "learning_rate": 0.0005813703747350566, + "loss": 3.3009, + "step": 6903 + }, + { + "epoch": 0.34, + "grad_norm": 0.5223855972290039, + "learning_rate": 0.0005813650333897426, + "loss": 3.3918, + "step": 6904 + }, + { + "epoch": 0.34, + "grad_norm": 0.5121346712112427, + "learning_rate": 0.0005813596913033673, + "loss": 3.3949, + "step": 6905 + }, + { + "epoch": 0.34, + "grad_norm": 0.5447664260864258, + "learning_rate": 0.0005813543484759448, + "loss": 3.3467, + "step": 6906 + }, + { + "epoch": 0.34, + "grad_norm": 0.5025432705879211, + "learning_rate": 0.000581349004907489, + "loss": 3.388, + "step": 6907 + }, + { + "epoch": 0.34, + "grad_norm": 0.5353335738182068, + "learning_rate": 0.0005813436605980142, + "loss": 3.2766, + "step": 6908 + }, + { + "epoch": 0.34, + "grad_norm": 0.503914475440979, + "learning_rate": 0.0005813383155475342, + "loss": 3.2077, + "step": 6909 + }, + { + "epoch": 0.34, + "grad_norm": 0.5467348098754883, + "learning_rate": 0.0005813329697560633, + "loss": 3.2751, + "step": 6910 + }, + { + "epoch": 0.34, + "grad_norm": 0.5445327162742615, + "learning_rate": 0.0005813276232236155, + "loss": 3.3215, + "step": 6911 + }, + { + "epoch": 0.34, + "grad_norm": 0.5562853217124939, + "learning_rate": 0.0005813222759502047, + "loss": 3.4057, + "step": 6912 + }, + { + "epoch": 0.34, + "grad_norm": 0.5421975255012512, + "learning_rate": 0.0005813169279358454, + "loss": 3.2963, + "step": 6913 + }, + { + "epoch": 0.34, + "grad_norm": 0.5326264500617981, + "learning_rate": 0.0005813115791805513, + "loss": 3.2552, + "step": 6914 + }, + { + "epoch": 0.34, + "grad_norm": 0.5453374981880188, + "learning_rate": 0.0005813062296843368, + "loss": 3.5338, + "step": 6915 + }, + { + "epoch": 0.34, + "grad_norm": 0.531548023223877, + "learning_rate": 0.0005813008794472158, + "loss": 3.3438, + "step": 6916 + }, + { + "epoch": 0.34, + "grad_norm": 0.586500346660614, + "learning_rate": 0.0005812955284692022, + "loss": 3.2636, + "step": 6917 + }, + { + "epoch": 0.34, + "grad_norm": 0.523780345916748, + "learning_rate": 0.0005812901767503105, + "loss": 3.4048, + "step": 6918 + }, + { + "epoch": 0.34, + "grad_norm": 0.5507670640945435, + "learning_rate": 0.0005812848242905547, + "loss": 3.3771, + "step": 6919 + }, + { + "epoch": 0.34, + "grad_norm": 0.5263717174530029, + "learning_rate": 0.0005812794710899486, + "loss": 3.1895, + "step": 6920 + }, + { + "epoch": 0.34, + "grad_norm": 0.5932714939117432, + "learning_rate": 0.0005812741171485066, + "loss": 3.3006, + "step": 6921 + }, + { + "epoch": 0.34, + "grad_norm": 0.5689055919647217, + "learning_rate": 0.0005812687624662427, + "loss": 3.3535, + "step": 6922 + }, + { + "epoch": 0.34, + "grad_norm": 0.5490853786468506, + "learning_rate": 0.000581263407043171, + "loss": 3.2321, + "step": 6923 + }, + { + "epoch": 0.34, + "grad_norm": 0.5083225965499878, + "learning_rate": 0.0005812580508793056, + "loss": 3.2538, + "step": 6924 + }, + { + "epoch": 0.34, + "grad_norm": 0.5350843071937561, + "learning_rate": 0.0005812526939746607, + "loss": 3.351, + "step": 6925 + }, + { + "epoch": 0.34, + "grad_norm": 0.5658883452415466, + "learning_rate": 0.0005812473363292503, + "loss": 3.2112, + "step": 6926 + }, + { + "epoch": 0.34, + "grad_norm": 0.5066301822662354, + "learning_rate": 0.0005812419779430885, + "loss": 3.5274, + "step": 6927 + }, + { + "epoch": 0.34, + "grad_norm": 0.5520161986351013, + "learning_rate": 0.0005812366188161894, + "loss": 3.1782, + "step": 6928 + }, + { + "epoch": 0.34, + "grad_norm": 0.5627199411392212, + "learning_rate": 0.0005812312589485673, + "loss": 3.2834, + "step": 6929 + }, + { + "epoch": 0.34, + "grad_norm": 0.544322669506073, + "learning_rate": 0.0005812258983402363, + "loss": 3.5101, + "step": 6930 + }, + { + "epoch": 0.34, + "grad_norm": 0.5476049184799194, + "learning_rate": 0.0005812205369912102, + "loss": 3.2291, + "step": 6931 + }, + { + "epoch": 0.34, + "grad_norm": 0.5459288358688354, + "learning_rate": 0.0005812151749015034, + "loss": 3.2593, + "step": 6932 + }, + { + "epoch": 0.34, + "grad_norm": 0.533470094203949, + "learning_rate": 0.0005812098120711299, + "loss": 3.3264, + "step": 6933 + }, + { + "epoch": 0.34, + "grad_norm": 0.5248565077781677, + "learning_rate": 0.0005812044485001039, + "loss": 3.2764, + "step": 6934 + }, + { + "epoch": 0.34, + "grad_norm": 0.5313982963562012, + "learning_rate": 0.0005811990841884395, + "loss": 3.4671, + "step": 6935 + }, + { + "epoch": 0.34, + "grad_norm": 0.695208728313446, + "learning_rate": 0.0005811937191361508, + "loss": 3.5434, + "step": 6936 + }, + { + "epoch": 0.34, + "grad_norm": 0.5338233709335327, + "learning_rate": 0.0005811883533432521, + "loss": 3.249, + "step": 6937 + }, + { + "epoch": 0.34, + "grad_norm": 0.5622416138648987, + "learning_rate": 0.0005811829868097572, + "loss": 3.4326, + "step": 6938 + }, + { + "epoch": 0.34, + "grad_norm": 0.5586525797843933, + "learning_rate": 0.0005811776195356805, + "loss": 3.3476, + "step": 6939 + }, + { + "epoch": 0.34, + "grad_norm": 0.5135695934295654, + "learning_rate": 0.0005811722515210361, + "loss": 3.3332, + "step": 6940 + }, + { + "epoch": 0.34, + "grad_norm": 0.5429893732070923, + "learning_rate": 0.0005811668827658381, + "loss": 3.2199, + "step": 6941 + }, + { + "epoch": 0.34, + "grad_norm": 0.6040959358215332, + "learning_rate": 0.0005811615132701005, + "loss": 3.493, + "step": 6942 + }, + { + "epoch": 0.34, + "grad_norm": 0.5325751304626465, + "learning_rate": 0.0005811561430338378, + "loss": 3.2832, + "step": 6943 + }, + { + "epoch": 0.34, + "grad_norm": 0.5440836548805237, + "learning_rate": 0.0005811507720570638, + "loss": 3.2621, + "step": 6944 + }, + { + "epoch": 0.34, + "grad_norm": 0.5436229109764099, + "learning_rate": 0.0005811454003397928, + "loss": 3.3883, + "step": 6945 + }, + { + "epoch": 0.34, + "grad_norm": 0.5637691020965576, + "learning_rate": 0.0005811400278820391, + "loss": 3.4486, + "step": 6946 + }, + { + "epoch": 0.34, + "grad_norm": 0.5535193085670471, + "learning_rate": 0.0005811346546838165, + "loss": 3.4498, + "step": 6947 + }, + { + "epoch": 0.34, + "grad_norm": 0.5341395139694214, + "learning_rate": 0.0005811292807451393, + "loss": 3.4532, + "step": 6948 + }, + { + "epoch": 0.34, + "grad_norm": 0.6911645531654358, + "learning_rate": 0.0005811239060660217, + "loss": 3.2766, + "step": 6949 + }, + { + "epoch": 0.34, + "grad_norm": 0.5011906027793884, + "learning_rate": 0.0005811185306464779, + "loss": 3.1654, + "step": 6950 + }, + { + "epoch": 0.34, + "grad_norm": 0.5271753668785095, + "learning_rate": 0.0005811131544865218, + "loss": 3.3565, + "step": 6951 + }, + { + "epoch": 0.34, + "grad_norm": 0.5180072784423828, + "learning_rate": 0.000581107777586168, + "loss": 3.4378, + "step": 6952 + }, + { + "epoch": 0.34, + "grad_norm": 0.541093111038208, + "learning_rate": 0.0005811023999454303, + "loss": 3.2264, + "step": 6953 + }, + { + "epoch": 0.34, + "grad_norm": 0.5188407897949219, + "learning_rate": 0.000581097021564323, + "loss": 3.2942, + "step": 6954 + }, + { + "epoch": 0.34, + "grad_norm": 0.5392166376113892, + "learning_rate": 0.0005810916424428602, + "loss": 3.5065, + "step": 6955 + }, + { + "epoch": 0.34, + "grad_norm": 0.5347229242324829, + "learning_rate": 0.0005810862625810562, + "loss": 3.2041, + "step": 6956 + }, + { + "epoch": 0.34, + "grad_norm": 0.6093183159828186, + "learning_rate": 0.000581080881978925, + "loss": 3.5634, + "step": 6957 + }, + { + "epoch": 0.34, + "grad_norm": 0.5233674645423889, + "learning_rate": 0.0005810755006364809, + "loss": 3.2121, + "step": 6958 + }, + { + "epoch": 0.34, + "grad_norm": 0.5367003679275513, + "learning_rate": 0.0005810701185537379, + "loss": 3.4925, + "step": 6959 + }, + { + "epoch": 0.34, + "grad_norm": 0.5209534764289856, + "learning_rate": 0.0005810647357307105, + "loss": 3.292, + "step": 6960 + }, + { + "epoch": 0.34, + "grad_norm": 0.5548787713050842, + "learning_rate": 0.0005810593521674125, + "loss": 3.1859, + "step": 6961 + }, + { + "epoch": 0.34, + "grad_norm": 0.5430557131767273, + "learning_rate": 0.0005810539678638584, + "loss": 3.4861, + "step": 6962 + }, + { + "epoch": 0.34, + "grad_norm": 0.5306611657142639, + "learning_rate": 0.0005810485828200622, + "loss": 3.4295, + "step": 6963 + }, + { + "epoch": 0.34, + "grad_norm": 0.6463212966918945, + "learning_rate": 0.000581043197036038, + "loss": 3.4153, + "step": 6964 + }, + { + "epoch": 0.34, + "grad_norm": 0.5019036531448364, + "learning_rate": 0.0005810378105118002, + "loss": 3.3514, + "step": 6965 + }, + { + "epoch": 0.34, + "grad_norm": 0.5331908464431763, + "learning_rate": 0.0005810324232473629, + "loss": 3.4269, + "step": 6966 + }, + { + "epoch": 0.34, + "grad_norm": 0.5343552827835083, + "learning_rate": 0.0005810270352427403, + "loss": 3.5078, + "step": 6967 + }, + { + "epoch": 0.34, + "grad_norm": 0.561147928237915, + "learning_rate": 0.0005810216464979466, + "loss": 3.233, + "step": 6968 + }, + { + "epoch": 0.34, + "grad_norm": 0.5656723380088806, + "learning_rate": 0.0005810162570129958, + "loss": 3.1724, + "step": 6969 + }, + { + "epoch": 0.34, + "grad_norm": 0.5117201805114746, + "learning_rate": 0.0005810108667879025, + "loss": 3.3606, + "step": 6970 + }, + { + "epoch": 0.34, + "grad_norm": 0.5727672576904297, + "learning_rate": 0.0005810054758226805, + "loss": 3.4015, + "step": 6971 + }, + { + "epoch": 0.34, + "grad_norm": 0.594057559967041, + "learning_rate": 0.0005810000841173442, + "loss": 3.4109, + "step": 6972 + }, + { + "epoch": 0.34, + "grad_norm": 0.5242696404457092, + "learning_rate": 0.0005809946916719077, + "loss": 3.1948, + "step": 6973 + }, + { + "epoch": 0.34, + "grad_norm": 0.5410612225532532, + "learning_rate": 0.0005809892984863854, + "loss": 3.3574, + "step": 6974 + }, + { + "epoch": 0.34, + "grad_norm": 0.5484248995780945, + "learning_rate": 0.0005809839045607913, + "loss": 3.1889, + "step": 6975 + }, + { + "epoch": 0.34, + "grad_norm": 0.5360888242721558, + "learning_rate": 0.0005809785098951396, + "loss": 3.3528, + "step": 6976 + }, + { + "epoch": 0.34, + "grad_norm": 0.6561615467071533, + "learning_rate": 0.0005809731144894445, + "loss": 3.3181, + "step": 6977 + }, + { + "epoch": 0.34, + "grad_norm": 0.5585454106330872, + "learning_rate": 0.0005809677183437206, + "loss": 3.3861, + "step": 6978 + }, + { + "epoch": 0.34, + "grad_norm": 0.539271354675293, + "learning_rate": 0.0005809623214579816, + "loss": 3.3186, + "step": 6979 + }, + { + "epoch": 0.34, + "grad_norm": 0.5142511129379272, + "learning_rate": 0.0005809569238322419, + "loss": 3.296, + "step": 6980 + }, + { + "epoch": 0.34, + "grad_norm": 0.5410292148590088, + "learning_rate": 0.0005809515254665158, + "loss": 3.3056, + "step": 6981 + }, + { + "epoch": 0.34, + "grad_norm": 0.5308331847190857, + "learning_rate": 0.0005809461263608175, + "loss": 3.3361, + "step": 6982 + }, + { + "epoch": 0.34, + "grad_norm": 0.546271562576294, + "learning_rate": 0.000580940726515161, + "loss": 3.4194, + "step": 6983 + }, + { + "epoch": 0.34, + "grad_norm": 0.566936731338501, + "learning_rate": 0.0005809353259295608, + "loss": 3.3992, + "step": 6984 + }, + { + "epoch": 0.34, + "grad_norm": 0.5126430988311768, + "learning_rate": 0.000580929924604031, + "loss": 3.2163, + "step": 6985 + }, + { + "epoch": 0.34, + "grad_norm": 0.4886528551578522, + "learning_rate": 0.000580924522538586, + "loss": 3.3871, + "step": 6986 + }, + { + "epoch": 0.34, + "grad_norm": 0.5845642685890198, + "learning_rate": 0.0005809191197332397, + "loss": 3.4001, + "step": 6987 + }, + { + "epoch": 0.34, + "grad_norm": 0.5310481786727905, + "learning_rate": 0.0005809137161880065, + "loss": 3.5392, + "step": 6988 + }, + { + "epoch": 0.34, + "grad_norm": 0.5669585466384888, + "learning_rate": 0.0005809083119029006, + "loss": 3.4265, + "step": 6989 + }, + { + "epoch": 0.34, + "grad_norm": 0.5161418914794922, + "learning_rate": 0.0005809029068779365, + "loss": 3.3468, + "step": 6990 + }, + { + "epoch": 0.34, + "grad_norm": 0.5583595037460327, + "learning_rate": 0.0005808975011131279, + "loss": 3.4198, + "step": 6991 + }, + { + "epoch": 0.34, + "grad_norm": 0.5388359427452087, + "learning_rate": 0.0005808920946084896, + "loss": 3.4827, + "step": 6992 + }, + { + "epoch": 0.34, + "grad_norm": 0.5691794157028198, + "learning_rate": 0.0005808866873640353, + "loss": 3.5184, + "step": 6993 + }, + { + "epoch": 0.34, + "grad_norm": 0.6655071973800659, + "learning_rate": 0.0005808812793797797, + "loss": 3.3542, + "step": 6994 + }, + { + "epoch": 0.34, + "grad_norm": 0.5387119054794312, + "learning_rate": 0.000580875870655737, + "loss": 3.2979, + "step": 6995 + }, + { + "epoch": 0.34, + "grad_norm": 0.6775738596916199, + "learning_rate": 0.0005808704611919211, + "loss": 3.3308, + "step": 6996 + }, + { + "epoch": 0.34, + "grad_norm": 0.537527859210968, + "learning_rate": 0.0005808650509883465, + "loss": 3.4031, + "step": 6997 + }, + { + "epoch": 0.34, + "grad_norm": 0.5527935028076172, + "learning_rate": 0.0005808596400450275, + "loss": 3.251, + "step": 6998 + }, + { + "epoch": 0.34, + "grad_norm": 0.5826919674873352, + "learning_rate": 0.0005808542283619781, + "loss": 3.3252, + "step": 6999 + }, + { + "epoch": 0.34, + "grad_norm": 0.5161682367324829, + "learning_rate": 0.0005808488159392129, + "loss": 3.0676, + "step": 7000 + }, + { + "epoch": 0.34, + "grad_norm": 0.5332803130149841, + "learning_rate": 0.0005808434027767459, + "loss": 3.5714, + "step": 7001 + }, + { + "epoch": 0.34, + "grad_norm": 0.5207781791687012, + "learning_rate": 0.0005808379888745914, + "loss": 3.3575, + "step": 7002 + }, + { + "epoch": 0.34, + "grad_norm": 0.5248399376869202, + "learning_rate": 0.0005808325742327636, + "loss": 3.0861, + "step": 7003 + }, + { + "epoch": 0.34, + "grad_norm": 0.547214686870575, + "learning_rate": 0.0005808271588512771, + "loss": 3.4018, + "step": 7004 + }, + { + "epoch": 0.34, + "grad_norm": 0.49481138586997986, + "learning_rate": 0.0005808217427301456, + "loss": 3.2186, + "step": 7005 + }, + { + "epoch": 0.34, + "grad_norm": 0.567725658416748, + "learning_rate": 0.0005808163258693839, + "loss": 3.5048, + "step": 7006 + }, + { + "epoch": 0.34, + "grad_norm": 0.6113198399543762, + "learning_rate": 0.000580810908269006, + "loss": 3.2416, + "step": 7007 + }, + { + "epoch": 0.34, + "grad_norm": 0.5307554006576538, + "learning_rate": 0.0005808054899290262, + "loss": 3.2915, + "step": 7008 + }, + { + "epoch": 0.34, + "grad_norm": 0.5398176312446594, + "learning_rate": 0.0005808000708494587, + "loss": 3.2426, + "step": 7009 + }, + { + "epoch": 0.34, + "grad_norm": 0.5282324552536011, + "learning_rate": 0.0005807946510303179, + "loss": 3.5821, + "step": 7010 + }, + { + "epoch": 0.34, + "grad_norm": 0.5628719925880432, + "learning_rate": 0.0005807892304716181, + "loss": 3.5204, + "step": 7011 + }, + { + "epoch": 0.34, + "grad_norm": 0.5679787397384644, + "learning_rate": 0.0005807838091733734, + "loss": 3.3541, + "step": 7012 + }, + { + "epoch": 0.34, + "grad_norm": 0.5467495322227478, + "learning_rate": 0.0005807783871355982, + "loss": 3.2906, + "step": 7013 + }, + { + "epoch": 0.34, + "grad_norm": 0.5178360939025879, + "learning_rate": 0.0005807729643583069, + "loss": 3.3828, + "step": 7014 + }, + { + "epoch": 0.34, + "grad_norm": 0.5609511137008667, + "learning_rate": 0.0005807675408415134, + "loss": 3.3343, + "step": 7015 + }, + { + "epoch": 0.34, + "grad_norm": 0.5502193570137024, + "learning_rate": 0.0005807621165852325, + "loss": 3.3203, + "step": 7016 + }, + { + "epoch": 0.34, + "grad_norm": 0.5327548980712891, + "learning_rate": 0.000580756691589478, + "loss": 3.3419, + "step": 7017 + }, + { + "epoch": 0.34, + "grad_norm": 0.506458044052124, + "learning_rate": 0.0005807512658542646, + "loss": 3.2494, + "step": 7018 + }, + { + "epoch": 0.34, + "grad_norm": 0.5278050303459167, + "learning_rate": 0.0005807458393796062, + "loss": 3.3536, + "step": 7019 + }, + { + "epoch": 0.34, + "grad_norm": 0.530926525592804, + "learning_rate": 0.0005807404121655175, + "loss": 3.3645, + "step": 7020 + }, + { + "epoch": 0.34, + "grad_norm": 0.6142529845237732, + "learning_rate": 0.0005807349842120124, + "loss": 3.3903, + "step": 7021 + }, + { + "epoch": 0.34, + "grad_norm": 0.5566715002059937, + "learning_rate": 0.0005807295555191055, + "loss": 3.3908, + "step": 7022 + }, + { + "epoch": 0.34, + "grad_norm": 0.5471014380455017, + "learning_rate": 0.0005807241260868109, + "loss": 3.4871, + "step": 7023 + }, + { + "epoch": 0.34, + "grad_norm": 0.584735095500946, + "learning_rate": 0.000580718695915143, + "loss": 3.3576, + "step": 7024 + }, + { + "epoch": 0.34, + "grad_norm": 0.5210456848144531, + "learning_rate": 0.0005807132650041162, + "loss": 3.3165, + "step": 7025 + }, + { + "epoch": 0.34, + "grad_norm": 0.5415894985198975, + "learning_rate": 0.0005807078333537445, + "loss": 3.4889, + "step": 7026 + }, + { + "epoch": 0.34, + "grad_norm": 0.519156813621521, + "learning_rate": 0.0005807024009640425, + "loss": 3.3567, + "step": 7027 + }, + { + "epoch": 0.34, + "grad_norm": 0.5499240159988403, + "learning_rate": 0.0005806969678350243, + "loss": 3.3908, + "step": 7028 + }, + { + "epoch": 0.34, + "grad_norm": 0.5356557369232178, + "learning_rate": 0.0005806915339667044, + "loss": 3.1221, + "step": 7029 + }, + { + "epoch": 0.34, + "grad_norm": 0.562337338924408, + "learning_rate": 0.000580686099359097, + "loss": 3.1642, + "step": 7030 + }, + { + "epoch": 0.34, + "grad_norm": 0.5245587229728699, + "learning_rate": 0.0005806806640122164, + "loss": 3.277, + "step": 7031 + }, + { + "epoch": 0.34, + "grad_norm": 0.548549473285675, + "learning_rate": 0.000580675227926077, + "loss": 3.497, + "step": 7032 + }, + { + "epoch": 0.34, + "grad_norm": 0.5365231037139893, + "learning_rate": 0.0005806697911006931, + "loss": 3.257, + "step": 7033 + }, + { + "epoch": 0.34, + "grad_norm": 0.5649599432945251, + "learning_rate": 0.0005806643535360789, + "loss": 3.2649, + "step": 7034 + }, + { + "epoch": 0.34, + "grad_norm": 0.5196275115013123, + "learning_rate": 0.0005806589152322489, + "loss": 3.2268, + "step": 7035 + }, + { + "epoch": 0.34, + "grad_norm": 0.509590744972229, + "learning_rate": 0.0005806534761892172, + "loss": 3.2333, + "step": 7036 + }, + { + "epoch": 0.34, + "grad_norm": 0.5706533789634705, + "learning_rate": 0.0005806480364069983, + "loss": 3.011, + "step": 7037 + }, + { + "epoch": 0.34, + "grad_norm": 0.5456781983375549, + "learning_rate": 0.0005806425958856065, + "loss": 3.5429, + "step": 7038 + }, + { + "epoch": 0.34, + "grad_norm": 0.5550026893615723, + "learning_rate": 0.0005806371546250562, + "loss": 3.4187, + "step": 7039 + }, + { + "epoch": 0.35, + "grad_norm": 0.5485456585884094, + "learning_rate": 0.0005806317126253616, + "loss": 3.2612, + "step": 7040 + }, + { + "epoch": 0.35, + "grad_norm": 0.5675588250160217, + "learning_rate": 0.0005806262698865369, + "loss": 3.4519, + "step": 7041 + }, + { + "epoch": 0.35, + "grad_norm": 0.5755375623703003, + "learning_rate": 0.0005806208264085968, + "loss": 3.2874, + "step": 7042 + }, + { + "epoch": 0.35, + "grad_norm": 0.5225034952163696, + "learning_rate": 0.0005806153821915554, + "loss": 3.317, + "step": 7043 + }, + { + "epoch": 0.35, + "grad_norm": 0.5519276261329651, + "learning_rate": 0.0005806099372354271, + "loss": 3.4839, + "step": 7044 + }, + { + "epoch": 0.35, + "grad_norm": 0.5228083729743958, + "learning_rate": 0.0005806044915402262, + "loss": 3.334, + "step": 7045 + }, + { + "epoch": 0.35, + "grad_norm": 0.5322780013084412, + "learning_rate": 0.0005805990451059671, + "loss": 3.397, + "step": 7046 + }, + { + "epoch": 0.35, + "grad_norm": 0.5328549742698669, + "learning_rate": 0.0005805935979326641, + "loss": 3.4732, + "step": 7047 + }, + { + "epoch": 0.35, + "grad_norm": 0.48566094040870667, + "learning_rate": 0.0005805881500203316, + "loss": 3.3144, + "step": 7048 + }, + { + "epoch": 0.35, + "grad_norm": 0.5601107478141785, + "learning_rate": 0.0005805827013689839, + "loss": 3.3341, + "step": 7049 + }, + { + "epoch": 0.35, + "grad_norm": 0.5418280363082886, + "learning_rate": 0.0005805772519786353, + "loss": 3.4561, + "step": 7050 + }, + { + "epoch": 0.35, + "grad_norm": 0.5339364409446716, + "learning_rate": 0.0005805718018493003, + "loss": 3.4457, + "step": 7051 + }, + { + "epoch": 0.35, + "grad_norm": 0.6148756146430969, + "learning_rate": 0.0005805663509809932, + "loss": 3.3762, + "step": 7052 + }, + { + "epoch": 0.35, + "grad_norm": 0.5323485136032104, + "learning_rate": 0.0005805608993737282, + "loss": 3.3379, + "step": 7053 + }, + { + "epoch": 0.35, + "grad_norm": 0.5185231566429138, + "learning_rate": 0.0005805554470275199, + "loss": 3.1767, + "step": 7054 + }, + { + "epoch": 0.35, + "grad_norm": 0.54229736328125, + "learning_rate": 0.0005805499939423826, + "loss": 3.3344, + "step": 7055 + }, + { + "epoch": 0.35, + "grad_norm": 0.5611851215362549, + "learning_rate": 0.0005805445401183305, + "loss": 3.4762, + "step": 7056 + }, + { + "epoch": 0.35, + "grad_norm": 0.5249059200286865, + "learning_rate": 0.000580539085555378, + "loss": 3.4588, + "step": 7057 + }, + { + "epoch": 0.35, + "grad_norm": 0.5460208654403687, + "learning_rate": 0.0005805336302535397, + "loss": 3.3557, + "step": 7058 + }, + { + "epoch": 0.35, + "grad_norm": 0.6470723152160645, + "learning_rate": 0.0005805281742128297, + "loss": 3.0215, + "step": 7059 + }, + { + "epoch": 0.35, + "grad_norm": 0.5494178533554077, + "learning_rate": 0.0005805227174332625, + "loss": 3.3288, + "step": 7060 + }, + { + "epoch": 0.35, + "grad_norm": 0.5545464754104614, + "learning_rate": 0.0005805172599148525, + "loss": 3.222, + "step": 7061 + }, + { + "epoch": 0.35, + "grad_norm": 0.5664929747581482, + "learning_rate": 0.0005805118016576139, + "loss": 3.2427, + "step": 7062 + }, + { + "epoch": 0.35, + "grad_norm": 0.589393138885498, + "learning_rate": 0.0005805063426615613, + "loss": 3.3364, + "step": 7063 + }, + { + "epoch": 0.35, + "grad_norm": 0.5010703802108765, + "learning_rate": 0.000580500882926709, + "loss": 3.4066, + "step": 7064 + }, + { + "epoch": 0.35, + "grad_norm": 0.5556477904319763, + "learning_rate": 0.0005804954224530712, + "loss": 3.4683, + "step": 7065 + }, + { + "epoch": 0.35, + "grad_norm": 0.5511776208877563, + "learning_rate": 0.0005804899612406627, + "loss": 3.3322, + "step": 7066 + }, + { + "epoch": 0.35, + "grad_norm": 0.5432040095329285, + "learning_rate": 0.0005804844992894975, + "loss": 3.2741, + "step": 7067 + }, + { + "epoch": 0.35, + "grad_norm": 0.5588909983634949, + "learning_rate": 0.00058047903659959, + "loss": 3.6012, + "step": 7068 + }, + { + "epoch": 0.35, + "grad_norm": 0.5545442700386047, + "learning_rate": 0.0005804735731709548, + "loss": 3.373, + "step": 7069 + }, + { + "epoch": 0.35, + "grad_norm": 0.5283875465393066, + "learning_rate": 0.0005804681090036062, + "loss": 3.2354, + "step": 7070 + }, + { + "epoch": 0.35, + "grad_norm": 0.4935014843940735, + "learning_rate": 0.0005804626440975585, + "loss": 3.3775, + "step": 7071 + }, + { + "epoch": 0.35, + "grad_norm": 0.5301677584648132, + "learning_rate": 0.0005804571784528262, + "loss": 3.3929, + "step": 7072 + }, + { + "epoch": 0.35, + "grad_norm": 0.5176547169685364, + "learning_rate": 0.0005804517120694238, + "loss": 3.4168, + "step": 7073 + }, + { + "epoch": 0.35, + "grad_norm": 0.5569431781768799, + "learning_rate": 0.0005804462449473654, + "loss": 3.4039, + "step": 7074 + }, + { + "epoch": 0.35, + "grad_norm": 0.5683629512786865, + "learning_rate": 0.0005804407770866656, + "loss": 3.4013, + "step": 7075 + }, + { + "epoch": 0.35, + "grad_norm": 0.5122573375701904, + "learning_rate": 0.0005804353084873388, + "loss": 3.2245, + "step": 7076 + }, + { + "epoch": 0.35, + "grad_norm": 0.5609269142150879, + "learning_rate": 0.0005804298391493993, + "loss": 3.2241, + "step": 7077 + }, + { + "epoch": 0.35, + "grad_norm": 0.5435025691986084, + "learning_rate": 0.0005804243690728617, + "loss": 3.2807, + "step": 7078 + }, + { + "epoch": 0.35, + "grad_norm": 0.5618283152580261, + "learning_rate": 0.0005804188982577402, + "loss": 3.1999, + "step": 7079 + }, + { + "epoch": 0.35, + "grad_norm": 0.5037075877189636, + "learning_rate": 0.0005804134267040494, + "loss": 3.3382, + "step": 7080 + }, + { + "epoch": 0.35, + "grad_norm": 0.5446479916572571, + "learning_rate": 0.0005804079544118034, + "loss": 3.1628, + "step": 7081 + }, + { + "epoch": 0.35, + "grad_norm": 0.5279077291488647, + "learning_rate": 0.0005804024813810169, + "loss": 3.2108, + "step": 7082 + }, + { + "epoch": 0.35, + "grad_norm": 0.556573748588562, + "learning_rate": 0.0005803970076117043, + "loss": 3.47, + "step": 7083 + }, + { + "epoch": 0.35, + "grad_norm": 0.5409570932388306, + "learning_rate": 0.0005803915331038799, + "loss": 3.2393, + "step": 7084 + }, + { + "epoch": 0.35, + "grad_norm": 0.5191570520401001, + "learning_rate": 0.0005803860578575581, + "loss": 3.3679, + "step": 7085 + }, + { + "epoch": 0.35, + "grad_norm": 0.5882177352905273, + "learning_rate": 0.0005803805818727535, + "loss": 3.1786, + "step": 7086 + }, + { + "epoch": 0.35, + "grad_norm": 0.5755952000617981, + "learning_rate": 0.0005803751051494803, + "loss": 3.4671, + "step": 7087 + }, + { + "epoch": 0.35, + "grad_norm": 0.5264267325401306, + "learning_rate": 0.000580369627687753, + "loss": 3.1961, + "step": 7088 + }, + { + "epoch": 0.35, + "grad_norm": 0.5818674564361572, + "learning_rate": 0.0005803641494875861, + "loss": 3.334, + "step": 7089 + }, + { + "epoch": 0.35, + "grad_norm": 0.529707670211792, + "learning_rate": 0.000580358670548994, + "loss": 3.3026, + "step": 7090 + }, + { + "epoch": 0.35, + "grad_norm": 0.5402799844741821, + "learning_rate": 0.0005803531908719912, + "loss": 3.4426, + "step": 7091 + }, + { + "epoch": 0.35, + "grad_norm": 0.5018097162246704, + "learning_rate": 0.000580347710456592, + "loss": 3.1922, + "step": 7092 + }, + { + "epoch": 0.35, + "grad_norm": 0.5327255129814148, + "learning_rate": 0.0005803422293028109, + "loss": 3.2424, + "step": 7093 + }, + { + "epoch": 0.35, + "grad_norm": 0.5543902516365051, + "learning_rate": 0.0005803367474106623, + "loss": 3.463, + "step": 7094 + }, + { + "epoch": 0.35, + "grad_norm": 0.5371736288070679, + "learning_rate": 0.0005803312647801607, + "loss": 3.2514, + "step": 7095 + }, + { + "epoch": 0.35, + "grad_norm": 0.5503165125846863, + "learning_rate": 0.0005803257814113204, + "loss": 3.5273, + "step": 7096 + }, + { + "epoch": 0.35, + "grad_norm": 0.5683706998825073, + "learning_rate": 0.0005803202973041561, + "loss": 3.189, + "step": 7097 + }, + { + "epoch": 0.35, + "grad_norm": 0.5162700414657593, + "learning_rate": 0.0005803148124586819, + "loss": 3.3167, + "step": 7098 + }, + { + "epoch": 0.35, + "grad_norm": 0.49133065342903137, + "learning_rate": 0.0005803093268749125, + "loss": 3.1603, + "step": 7099 + }, + { + "epoch": 0.35, + "grad_norm": 0.5067523717880249, + "learning_rate": 0.0005803038405528622, + "loss": 3.4655, + "step": 7100 + }, + { + "epoch": 0.35, + "grad_norm": 0.5425522923469543, + "learning_rate": 0.0005802983534925457, + "loss": 3.4017, + "step": 7101 + }, + { + "epoch": 0.35, + "grad_norm": 0.5662040114402771, + "learning_rate": 0.0005802928656939773, + "loss": 3.4728, + "step": 7102 + }, + { + "epoch": 0.35, + "grad_norm": 0.5476053953170776, + "learning_rate": 0.0005802873771571712, + "loss": 3.5488, + "step": 7103 + }, + { + "epoch": 0.35, + "grad_norm": 0.5239982604980469, + "learning_rate": 0.0005802818878821424, + "loss": 3.3556, + "step": 7104 + }, + { + "epoch": 0.35, + "grad_norm": 0.5249889492988586, + "learning_rate": 0.0005802763978689048, + "loss": 3.5085, + "step": 7105 + }, + { + "epoch": 0.35, + "grad_norm": 0.5688517689704895, + "learning_rate": 0.0005802709071174732, + "loss": 3.3199, + "step": 7106 + }, + { + "epoch": 0.35, + "grad_norm": 0.5531274080276489, + "learning_rate": 0.0005802654156278619, + "loss": 3.3684, + "step": 7107 + }, + { + "epoch": 0.35, + "grad_norm": 0.5954955220222473, + "learning_rate": 0.0005802599234000855, + "loss": 3.3379, + "step": 7108 + }, + { + "epoch": 0.35, + "grad_norm": 0.5915130376815796, + "learning_rate": 0.0005802544304341584, + "loss": 3.4219, + "step": 7109 + }, + { + "epoch": 0.35, + "grad_norm": 0.5060703754425049, + "learning_rate": 0.000580248936730095, + "loss": 3.2657, + "step": 7110 + }, + { + "epoch": 0.35, + "grad_norm": 0.5294564366340637, + "learning_rate": 0.0005802434422879099, + "loss": 3.4222, + "step": 7111 + }, + { + "epoch": 0.35, + "grad_norm": 0.5669923424720764, + "learning_rate": 0.0005802379471076175, + "loss": 3.4991, + "step": 7112 + }, + { + "epoch": 0.35, + "grad_norm": 0.550014317035675, + "learning_rate": 0.0005802324511892323, + "loss": 3.3429, + "step": 7113 + }, + { + "epoch": 0.35, + "grad_norm": 0.5213134288787842, + "learning_rate": 0.0005802269545327688, + "loss": 3.4838, + "step": 7114 + }, + { + "epoch": 0.35, + "grad_norm": 0.5911935567855835, + "learning_rate": 0.0005802214571382413, + "loss": 3.267, + "step": 7115 + }, + { + "epoch": 0.35, + "grad_norm": 0.5569819808006287, + "learning_rate": 0.0005802159590056644, + "loss": 3.4411, + "step": 7116 + }, + { + "epoch": 0.35, + "grad_norm": 0.5387932658195496, + "learning_rate": 0.0005802104601350528, + "loss": 3.5086, + "step": 7117 + }, + { + "epoch": 0.35, + "grad_norm": 0.5420069694519043, + "learning_rate": 0.0005802049605264205, + "loss": 3.3445, + "step": 7118 + }, + { + "epoch": 0.35, + "grad_norm": 0.5610042214393616, + "learning_rate": 0.0005801994601797825, + "loss": 3.3451, + "step": 7119 + }, + { + "epoch": 0.35, + "grad_norm": 0.5179895162582397, + "learning_rate": 0.000580193959095153, + "loss": 3.3824, + "step": 7120 + }, + { + "epoch": 0.35, + "grad_norm": 0.545372486114502, + "learning_rate": 0.0005801884572725464, + "loss": 3.5493, + "step": 7121 + }, + { + "epoch": 0.35, + "grad_norm": 0.5903968214988708, + "learning_rate": 0.0005801829547119775, + "loss": 3.4681, + "step": 7122 + }, + { + "epoch": 0.35, + "grad_norm": 0.6884878873825073, + "learning_rate": 0.0005801774514134605, + "loss": 3.4401, + "step": 7123 + }, + { + "epoch": 0.35, + "grad_norm": 0.5330374836921692, + "learning_rate": 0.00058017194737701, + "loss": 3.5123, + "step": 7124 + }, + { + "epoch": 0.35, + "grad_norm": 0.5485833883285522, + "learning_rate": 0.0005801664426026405, + "loss": 3.4519, + "step": 7125 + }, + { + "epoch": 0.35, + "grad_norm": 0.6042609214782715, + "learning_rate": 0.0005801609370903666, + "loss": 3.3484, + "step": 7126 + }, + { + "epoch": 0.35, + "grad_norm": 0.5192358493804932, + "learning_rate": 0.0005801554308402028, + "loss": 3.3385, + "step": 7127 + }, + { + "epoch": 0.35, + "grad_norm": 0.5295864343643188, + "learning_rate": 0.0005801499238521634, + "loss": 3.4082, + "step": 7128 + }, + { + "epoch": 0.35, + "grad_norm": 0.5500878691673279, + "learning_rate": 0.000580144416126263, + "loss": 3.2593, + "step": 7129 + }, + { + "epoch": 0.35, + "grad_norm": 0.5443781614303589, + "learning_rate": 0.0005801389076625161, + "loss": 3.3759, + "step": 7130 + }, + { + "epoch": 0.35, + "grad_norm": 0.5370346307754517, + "learning_rate": 0.0005801333984609372, + "loss": 3.1128, + "step": 7131 + }, + { + "epoch": 0.35, + "grad_norm": 0.5285878777503967, + "learning_rate": 0.0005801278885215409, + "loss": 3.3864, + "step": 7132 + }, + { + "epoch": 0.35, + "grad_norm": 0.6192882657051086, + "learning_rate": 0.0005801223778443417, + "loss": 3.2203, + "step": 7133 + }, + { + "epoch": 0.35, + "grad_norm": 0.5949166417121887, + "learning_rate": 0.000580116866429354, + "loss": 3.527, + "step": 7134 + }, + { + "epoch": 0.35, + "grad_norm": 0.5258962512016296, + "learning_rate": 0.0005801113542765925, + "loss": 3.4491, + "step": 7135 + }, + { + "epoch": 0.35, + "grad_norm": 0.524845540523529, + "learning_rate": 0.0005801058413860714, + "loss": 3.2764, + "step": 7136 + }, + { + "epoch": 0.35, + "grad_norm": 0.5003640651702881, + "learning_rate": 0.0005801003277578055, + "loss": 3.3281, + "step": 7137 + }, + { + "epoch": 0.35, + "grad_norm": 0.5418901443481445, + "learning_rate": 0.0005800948133918094, + "loss": 3.2208, + "step": 7138 + }, + { + "epoch": 0.35, + "grad_norm": 0.5504566431045532, + "learning_rate": 0.0005800892982880973, + "loss": 3.1584, + "step": 7139 + }, + { + "epoch": 0.35, + "grad_norm": 0.5194109082221985, + "learning_rate": 0.000580083782446684, + "loss": 3.641, + "step": 7140 + }, + { + "epoch": 0.35, + "grad_norm": 0.5327237248420715, + "learning_rate": 0.0005800782658675838, + "loss": 3.1637, + "step": 7141 + }, + { + "epoch": 0.35, + "grad_norm": 0.6268619894981384, + "learning_rate": 0.0005800727485508114, + "loss": 3.264, + "step": 7142 + }, + { + "epoch": 0.35, + "grad_norm": 0.5085632801055908, + "learning_rate": 0.0005800672304963813, + "loss": 3.5367, + "step": 7143 + }, + { + "epoch": 0.35, + "grad_norm": 0.6155527830123901, + "learning_rate": 0.000580061711704308, + "loss": 3.3654, + "step": 7144 + }, + { + "epoch": 0.35, + "grad_norm": 0.5261454582214355, + "learning_rate": 0.0005800561921746061, + "loss": 3.4477, + "step": 7145 + }, + { + "epoch": 0.35, + "grad_norm": 0.5312758684158325, + "learning_rate": 0.0005800506719072899, + "loss": 3.4262, + "step": 7146 + }, + { + "epoch": 0.35, + "grad_norm": 0.5167414546012878, + "learning_rate": 0.0005800451509023744, + "loss": 3.4457, + "step": 7147 + }, + { + "epoch": 0.35, + "grad_norm": 0.5222433805465698, + "learning_rate": 0.0005800396291598737, + "loss": 3.4803, + "step": 7148 + }, + { + "epoch": 0.35, + "grad_norm": 0.49773696064949036, + "learning_rate": 0.0005800341066798025, + "loss": 3.1294, + "step": 7149 + }, + { + "epoch": 0.35, + "grad_norm": 0.5385370254516602, + "learning_rate": 0.0005800285834621754, + "loss": 3.4034, + "step": 7150 + }, + { + "epoch": 0.35, + "grad_norm": 0.5338417887687683, + "learning_rate": 0.000580023059507007, + "loss": 3.1577, + "step": 7151 + }, + { + "epoch": 0.35, + "grad_norm": 0.5068842768669128, + "learning_rate": 0.0005800175348143116, + "loss": 3.5412, + "step": 7152 + }, + { + "epoch": 0.35, + "grad_norm": 0.49130746722221375, + "learning_rate": 0.0005800120093841039, + "loss": 3.4468, + "step": 7153 + }, + { + "epoch": 0.35, + "grad_norm": 0.5758814215660095, + "learning_rate": 0.0005800064832163985, + "loss": 3.3889, + "step": 7154 + }, + { + "epoch": 0.35, + "grad_norm": 0.5376800298690796, + "learning_rate": 0.00058000095631121, + "loss": 3.4071, + "step": 7155 + }, + { + "epoch": 0.35, + "grad_norm": 0.5141198635101318, + "learning_rate": 0.0005799954286685527, + "loss": 3.5247, + "step": 7156 + }, + { + "epoch": 0.35, + "grad_norm": 0.5330366492271423, + "learning_rate": 0.0005799899002884415, + "loss": 3.0816, + "step": 7157 + }, + { + "epoch": 0.35, + "grad_norm": 0.4993706941604614, + "learning_rate": 0.0005799843711708908, + "loss": 3.3283, + "step": 7158 + }, + { + "epoch": 0.35, + "grad_norm": 0.520387589931488, + "learning_rate": 0.0005799788413159149, + "loss": 3.4417, + "step": 7159 + }, + { + "epoch": 0.35, + "grad_norm": 0.5464620590209961, + "learning_rate": 0.0005799733107235288, + "loss": 3.5151, + "step": 7160 + }, + { + "epoch": 0.35, + "grad_norm": 0.5394158363342285, + "learning_rate": 0.0005799677793937469, + "loss": 3.4495, + "step": 7161 + }, + { + "epoch": 0.35, + "grad_norm": 0.505730390548706, + "learning_rate": 0.0005799622473265837, + "loss": 3.3232, + "step": 7162 + }, + { + "epoch": 0.35, + "grad_norm": 0.5446467995643616, + "learning_rate": 0.0005799567145220539, + "loss": 3.3166, + "step": 7163 + }, + { + "epoch": 0.35, + "grad_norm": 0.5053635835647583, + "learning_rate": 0.0005799511809801719, + "loss": 3.2405, + "step": 7164 + }, + { + "epoch": 0.35, + "grad_norm": 0.5116280317306519, + "learning_rate": 0.0005799456467009523, + "loss": 3.0451, + "step": 7165 + }, + { + "epoch": 0.35, + "grad_norm": 0.575804591178894, + "learning_rate": 0.0005799401116844099, + "loss": 3.4171, + "step": 7166 + }, + { + "epoch": 0.35, + "grad_norm": 0.5386016368865967, + "learning_rate": 0.000579934575930559, + "loss": 3.5537, + "step": 7167 + }, + { + "epoch": 0.35, + "grad_norm": 0.58470219373703, + "learning_rate": 0.0005799290394394144, + "loss": 3.4352, + "step": 7168 + }, + { + "epoch": 0.35, + "grad_norm": 0.5334919691085815, + "learning_rate": 0.0005799235022109906, + "loss": 3.4018, + "step": 7169 + }, + { + "epoch": 0.35, + "grad_norm": 0.513592004776001, + "learning_rate": 0.000579917964245302, + "loss": 3.5992, + "step": 7170 + }, + { + "epoch": 0.35, + "grad_norm": 0.5190378427505493, + "learning_rate": 0.0005799124255423634, + "loss": 3.3433, + "step": 7171 + }, + { + "epoch": 0.35, + "grad_norm": 0.5345826148986816, + "learning_rate": 0.0005799068861021895, + "loss": 3.469, + "step": 7172 + }, + { + "epoch": 0.35, + "grad_norm": 0.5253705382347107, + "learning_rate": 0.0005799013459247946, + "loss": 3.2886, + "step": 7173 + }, + { + "epoch": 0.35, + "grad_norm": 0.5313470363616943, + "learning_rate": 0.0005798958050101935, + "loss": 3.2108, + "step": 7174 + }, + { + "epoch": 0.35, + "grad_norm": 0.545667290687561, + "learning_rate": 0.0005798902633584006, + "loss": 3.5013, + "step": 7175 + }, + { + "epoch": 0.35, + "grad_norm": 0.5079378485679626, + "learning_rate": 0.0005798847209694308, + "loss": 3.3649, + "step": 7176 + }, + { + "epoch": 0.35, + "grad_norm": 0.555418074131012, + "learning_rate": 0.0005798791778432984, + "loss": 3.406, + "step": 7177 + }, + { + "epoch": 0.35, + "grad_norm": 0.5127460956573486, + "learning_rate": 0.0005798736339800181, + "loss": 3.333, + "step": 7178 + }, + { + "epoch": 0.35, + "grad_norm": 0.5296385288238525, + "learning_rate": 0.0005798680893796045, + "loss": 3.2033, + "step": 7179 + }, + { + "epoch": 0.35, + "grad_norm": 0.5440773367881775, + "learning_rate": 0.0005798625440420721, + "loss": 3.533, + "step": 7180 + }, + { + "epoch": 0.35, + "grad_norm": 0.5760077834129333, + "learning_rate": 0.0005798569979674358, + "loss": 3.3711, + "step": 7181 + }, + { + "epoch": 0.35, + "grad_norm": 0.550422728061676, + "learning_rate": 0.00057985145115571, + "loss": 3.3078, + "step": 7182 + }, + { + "epoch": 0.35, + "grad_norm": 0.5506702661514282, + "learning_rate": 0.0005798459036069094, + "loss": 3.2914, + "step": 7183 + }, + { + "epoch": 0.35, + "grad_norm": 0.5331199169158936, + "learning_rate": 0.0005798403553210484, + "loss": 3.2955, + "step": 7184 + }, + { + "epoch": 0.35, + "grad_norm": 0.5018191337585449, + "learning_rate": 0.0005798348062981419, + "loss": 3.2215, + "step": 7185 + }, + { + "epoch": 0.35, + "grad_norm": 0.5439038872718811, + "learning_rate": 0.0005798292565382042, + "loss": 3.3194, + "step": 7186 + }, + { + "epoch": 0.35, + "grad_norm": 0.5371338725090027, + "learning_rate": 0.0005798237060412502, + "loss": 3.4208, + "step": 7187 + }, + { + "epoch": 0.35, + "grad_norm": 0.5246415138244629, + "learning_rate": 0.0005798181548072943, + "loss": 3.2842, + "step": 7188 + }, + { + "epoch": 0.35, + "grad_norm": 0.519935131072998, + "learning_rate": 0.0005798126028363514, + "loss": 3.1714, + "step": 7189 + }, + { + "epoch": 0.35, + "grad_norm": 0.5213209390640259, + "learning_rate": 0.0005798070501284359, + "loss": 3.1316, + "step": 7190 + }, + { + "epoch": 0.35, + "grad_norm": 0.5461356043815613, + "learning_rate": 0.0005798014966835625, + "loss": 3.3031, + "step": 7191 + }, + { + "epoch": 0.35, + "grad_norm": 0.5040096044540405, + "learning_rate": 0.0005797959425017457, + "loss": 3.3874, + "step": 7192 + }, + { + "epoch": 0.35, + "grad_norm": 0.5159637331962585, + "learning_rate": 0.0005797903875830004, + "loss": 3.2251, + "step": 7193 + }, + { + "epoch": 0.35, + "grad_norm": 0.5395895838737488, + "learning_rate": 0.000579784831927341, + "loss": 3.4672, + "step": 7194 + }, + { + "epoch": 0.35, + "grad_norm": 0.5256237387657166, + "learning_rate": 0.000579779275534782, + "loss": 3.4937, + "step": 7195 + }, + { + "epoch": 0.35, + "grad_norm": 0.5334175825119019, + "learning_rate": 0.0005797737184053385, + "loss": 3.2622, + "step": 7196 + }, + { + "epoch": 0.35, + "grad_norm": 0.5539845824241638, + "learning_rate": 0.0005797681605390248, + "loss": 3.3592, + "step": 7197 + }, + { + "epoch": 0.35, + "grad_norm": 0.5329002737998962, + "learning_rate": 0.0005797626019358556, + "loss": 3.2035, + "step": 7198 + }, + { + "epoch": 0.35, + "grad_norm": 0.5471001267433167, + "learning_rate": 0.0005797570425958454, + "loss": 3.4235, + "step": 7199 + }, + { + "epoch": 0.35, + "grad_norm": 0.5086562633514404, + "learning_rate": 0.0005797514825190092, + "loss": 3.2384, + "step": 7200 + }, + { + "epoch": 0.35, + "grad_norm": 0.5643491744995117, + "learning_rate": 0.0005797459217053613, + "loss": 3.4946, + "step": 7201 + }, + { + "epoch": 0.35, + "grad_norm": 0.5237635374069214, + "learning_rate": 0.0005797403601549166, + "loss": 3.3411, + "step": 7202 + }, + { + "epoch": 0.35, + "grad_norm": 0.5038819909095764, + "learning_rate": 0.0005797347978676895, + "loss": 3.5791, + "step": 7203 + }, + { + "epoch": 0.35, + "grad_norm": 0.49819880723953247, + "learning_rate": 0.0005797292348436949, + "loss": 3.4183, + "step": 7204 + }, + { + "epoch": 0.35, + "grad_norm": 0.5045859813690186, + "learning_rate": 0.0005797236710829473, + "loss": 3.3744, + "step": 7205 + }, + { + "epoch": 0.35, + "grad_norm": 0.5460410714149475, + "learning_rate": 0.0005797181065854613, + "loss": 3.3562, + "step": 7206 + }, + { + "epoch": 0.35, + "grad_norm": 0.5919173955917358, + "learning_rate": 0.0005797125413512517, + "loss": 3.3013, + "step": 7207 + }, + { + "epoch": 0.35, + "grad_norm": 0.5470105409622192, + "learning_rate": 0.0005797069753803332, + "loss": 3.2812, + "step": 7208 + }, + { + "epoch": 0.35, + "grad_norm": 0.5546271204948425, + "learning_rate": 0.0005797014086727201, + "loss": 3.5578, + "step": 7209 + }, + { + "epoch": 0.35, + "grad_norm": 0.5407612323760986, + "learning_rate": 0.0005796958412284275, + "loss": 3.5534, + "step": 7210 + }, + { + "epoch": 0.35, + "grad_norm": 0.5482301115989685, + "learning_rate": 0.0005796902730474698, + "loss": 3.2728, + "step": 7211 + }, + { + "epoch": 0.35, + "grad_norm": 0.5478767156600952, + "learning_rate": 0.0005796847041298619, + "loss": 3.1991, + "step": 7212 + }, + { + "epoch": 0.35, + "grad_norm": 0.5303657054901123, + "learning_rate": 0.0005796791344756182, + "loss": 3.3981, + "step": 7213 + }, + { + "epoch": 0.35, + "grad_norm": 0.5501422882080078, + "learning_rate": 0.0005796735640847535, + "loss": 3.2946, + "step": 7214 + }, + { + "epoch": 0.35, + "grad_norm": 0.5230575203895569, + "learning_rate": 0.0005796679929572826, + "loss": 3.2646, + "step": 7215 + }, + { + "epoch": 0.35, + "grad_norm": 0.5465124249458313, + "learning_rate": 0.0005796624210932197, + "loss": 3.0865, + "step": 7216 + }, + { + "epoch": 0.35, + "grad_norm": 0.5589669346809387, + "learning_rate": 0.00057965684849258, + "loss": 3.1481, + "step": 7217 + }, + { + "epoch": 0.35, + "grad_norm": 0.5898823142051697, + "learning_rate": 0.000579651275155378, + "loss": 3.25, + "step": 7218 + }, + { + "epoch": 0.35, + "grad_norm": 0.5371426939964294, + "learning_rate": 0.0005796457010816284, + "loss": 3.1897, + "step": 7219 + }, + { + "epoch": 0.35, + "grad_norm": 0.5486535429954529, + "learning_rate": 0.0005796401262713457, + "loss": 3.1218, + "step": 7220 + }, + { + "epoch": 0.35, + "grad_norm": 0.5335965156555176, + "learning_rate": 0.0005796345507245448, + "loss": 3.2851, + "step": 7221 + }, + { + "epoch": 0.35, + "grad_norm": 0.5595109462738037, + "learning_rate": 0.0005796289744412404, + "loss": 3.4765, + "step": 7222 + }, + { + "epoch": 0.35, + "grad_norm": 0.5471145510673523, + "learning_rate": 0.000579623397421447, + "loss": 3.4348, + "step": 7223 + }, + { + "epoch": 0.35, + "grad_norm": 0.5449061393737793, + "learning_rate": 0.0005796178196651794, + "loss": 3.2696, + "step": 7224 + }, + { + "epoch": 0.35, + "grad_norm": 0.5270853042602539, + "learning_rate": 0.0005796122411724523, + "loss": 3.5372, + "step": 7225 + }, + { + "epoch": 0.35, + "grad_norm": 0.5229194164276123, + "learning_rate": 0.0005796066619432803, + "loss": 3.3925, + "step": 7226 + }, + { + "epoch": 0.35, + "grad_norm": 0.564228892326355, + "learning_rate": 0.0005796010819776782, + "loss": 3.325, + "step": 7227 + }, + { + "epoch": 0.35, + "grad_norm": 0.55915766954422, + "learning_rate": 0.0005795955012756607, + "loss": 3.4594, + "step": 7228 + }, + { + "epoch": 0.35, + "grad_norm": 0.513458251953125, + "learning_rate": 0.0005795899198372423, + "loss": 3.3471, + "step": 7229 + }, + { + "epoch": 0.35, + "grad_norm": 0.5892640948295593, + "learning_rate": 0.0005795843376624381, + "loss": 3.2332, + "step": 7230 + }, + { + "epoch": 0.35, + "grad_norm": 0.5395854115486145, + "learning_rate": 0.0005795787547512624, + "loss": 3.1295, + "step": 7231 + }, + { + "epoch": 0.35, + "grad_norm": 0.5873109698295593, + "learning_rate": 0.0005795731711037301, + "loss": 3.2344, + "step": 7232 + }, + { + "epoch": 0.35, + "grad_norm": 0.5463156700134277, + "learning_rate": 0.0005795675867198559, + "loss": 3.2807, + "step": 7233 + }, + { + "epoch": 0.35, + "grad_norm": 0.5152543783187866, + "learning_rate": 0.0005795620015996545, + "loss": 3.3083, + "step": 7234 + }, + { + "epoch": 0.35, + "grad_norm": 0.630656361579895, + "learning_rate": 0.0005795564157431405, + "loss": 3.5325, + "step": 7235 + }, + { + "epoch": 0.35, + "grad_norm": 0.5535502433776855, + "learning_rate": 0.0005795508291503288, + "loss": 3.4074, + "step": 7236 + }, + { + "epoch": 0.35, + "grad_norm": 0.5285792946815491, + "learning_rate": 0.0005795452418212339, + "loss": 3.3968, + "step": 7237 + }, + { + "epoch": 0.35, + "grad_norm": 0.548895537853241, + "learning_rate": 0.0005795396537558707, + "loss": 3.3346, + "step": 7238 + }, + { + "epoch": 0.35, + "grad_norm": 0.5185685157775879, + "learning_rate": 0.0005795340649542539, + "loss": 3.5812, + "step": 7239 + }, + { + "epoch": 0.35, + "grad_norm": 0.5098907351493835, + "learning_rate": 0.0005795284754163981, + "loss": 3.3364, + "step": 7240 + }, + { + "epoch": 0.35, + "grad_norm": 0.5207344889640808, + "learning_rate": 0.0005795228851423182, + "loss": 3.1888, + "step": 7241 + }, + { + "epoch": 0.35, + "grad_norm": 0.5404914021492004, + "learning_rate": 0.0005795172941320287, + "loss": 3.4547, + "step": 7242 + }, + { + "epoch": 0.35, + "grad_norm": 0.6177796125411987, + "learning_rate": 0.0005795117023855446, + "loss": 3.2094, + "step": 7243 + }, + { + "epoch": 0.36, + "grad_norm": 0.5546385049819946, + "learning_rate": 0.0005795061099028802, + "loss": 3.4063, + "step": 7244 + }, + { + "epoch": 0.36, + "grad_norm": 0.5809313058853149, + "learning_rate": 0.0005795005166840507, + "loss": 3.3204, + "step": 7245 + }, + { + "epoch": 0.36, + "grad_norm": 0.6135513186454773, + "learning_rate": 0.0005794949227290705, + "loss": 3.2965, + "step": 7246 + }, + { + "epoch": 0.36, + "grad_norm": 0.5403055548667908, + "learning_rate": 0.0005794893280379546, + "loss": 3.3973, + "step": 7247 + }, + { + "epoch": 0.36, + "grad_norm": 0.5268582105636597, + "learning_rate": 0.0005794837326107175, + "loss": 3.4849, + "step": 7248 + }, + { + "epoch": 0.36, + "grad_norm": 0.5323728919029236, + "learning_rate": 0.0005794781364473741, + "loss": 3.5033, + "step": 7249 + }, + { + "epoch": 0.36, + "grad_norm": 0.5301493406295776, + "learning_rate": 0.000579472539547939, + "loss": 3.3231, + "step": 7250 + }, + { + "epoch": 0.36, + "grad_norm": 0.5442898273468018, + "learning_rate": 0.0005794669419124271, + "loss": 3.1438, + "step": 7251 + }, + { + "epoch": 0.36, + "grad_norm": 0.6318520903587341, + "learning_rate": 0.0005794613435408531, + "loss": 3.1192, + "step": 7252 + }, + { + "epoch": 0.36, + "grad_norm": 0.5194680094718933, + "learning_rate": 0.0005794557444332316, + "loss": 3.3424, + "step": 7253 + }, + { + "epoch": 0.36, + "grad_norm": 0.5083394646644592, + "learning_rate": 0.0005794501445895774, + "loss": 3.1855, + "step": 7254 + }, + { + "epoch": 0.36, + "grad_norm": 0.512860119342804, + "learning_rate": 0.0005794445440099054, + "loss": 3.291, + "step": 7255 + }, + { + "epoch": 0.36, + "grad_norm": 0.5411530137062073, + "learning_rate": 0.0005794389426942302, + "loss": 3.3483, + "step": 7256 + }, + { + "epoch": 0.36, + "grad_norm": 0.5959362387657166, + "learning_rate": 0.0005794333406425667, + "loss": 3.4056, + "step": 7257 + }, + { + "epoch": 0.36, + "grad_norm": 0.596243143081665, + "learning_rate": 0.0005794277378549296, + "loss": 3.4625, + "step": 7258 + }, + { + "epoch": 0.36, + "grad_norm": 0.523287832736969, + "learning_rate": 0.0005794221343313334, + "loss": 3.4901, + "step": 7259 + }, + { + "epoch": 0.36, + "grad_norm": 0.5272862315177917, + "learning_rate": 0.0005794165300717932, + "loss": 3.3664, + "step": 7260 + }, + { + "epoch": 0.36, + "grad_norm": 0.5093209147453308, + "learning_rate": 0.0005794109250763236, + "loss": 3.3563, + "step": 7261 + }, + { + "epoch": 0.36, + "grad_norm": 0.5650598406791687, + "learning_rate": 0.0005794053193449394, + "loss": 3.0961, + "step": 7262 + }, + { + "epoch": 0.36, + "grad_norm": 0.5295237898826599, + "learning_rate": 0.0005793997128776554, + "loss": 3.4869, + "step": 7263 + }, + { + "epoch": 0.36, + "grad_norm": 0.5343896150588989, + "learning_rate": 0.0005793941056744863, + "loss": 3.316, + "step": 7264 + }, + { + "epoch": 0.36, + "grad_norm": 0.5440590977668762, + "learning_rate": 0.000579388497735447, + "loss": 3.2155, + "step": 7265 + }, + { + "epoch": 0.36, + "grad_norm": 0.4939824938774109, + "learning_rate": 0.000579382889060552, + "loss": 3.3011, + "step": 7266 + }, + { + "epoch": 0.36, + "grad_norm": 0.49455365538597107, + "learning_rate": 0.0005793772796498163, + "loss": 3.4065, + "step": 7267 + }, + { + "epoch": 0.36, + "grad_norm": 0.5292657613754272, + "learning_rate": 0.0005793716695032546, + "loss": 3.4599, + "step": 7268 + }, + { + "epoch": 0.36, + "grad_norm": 0.5179243683815002, + "learning_rate": 0.0005793660586208818, + "loss": 3.5574, + "step": 7269 + }, + { + "epoch": 0.36, + "grad_norm": 0.5624231100082397, + "learning_rate": 0.0005793604470027124, + "loss": 3.2575, + "step": 7270 + }, + { + "epoch": 0.36, + "grad_norm": 0.5110817551612854, + "learning_rate": 0.0005793548346487614, + "loss": 3.4008, + "step": 7271 + }, + { + "epoch": 0.36, + "grad_norm": 0.5481024384498596, + "learning_rate": 0.0005793492215590435, + "loss": 3.1985, + "step": 7272 + }, + { + "epoch": 0.36, + "grad_norm": 0.5263863205909729, + "learning_rate": 0.0005793436077335736, + "loss": 3.2327, + "step": 7273 + }, + { + "epoch": 0.36, + "grad_norm": 0.5095751881599426, + "learning_rate": 0.0005793379931723664, + "loss": 3.5152, + "step": 7274 + }, + { + "epoch": 0.36, + "grad_norm": 0.5422204732894897, + "learning_rate": 0.0005793323778754367, + "loss": 3.341, + "step": 7275 + }, + { + "epoch": 0.36, + "grad_norm": 0.5029591917991638, + "learning_rate": 0.0005793267618427991, + "loss": 3.4303, + "step": 7276 + }, + { + "epoch": 0.36, + "grad_norm": 0.5158606767654419, + "learning_rate": 0.0005793211450744688, + "loss": 3.4689, + "step": 7277 + }, + { + "epoch": 0.36, + "grad_norm": 0.565763533115387, + "learning_rate": 0.0005793155275704601, + "loss": 3.2644, + "step": 7278 + }, + { + "epoch": 0.36, + "grad_norm": 0.5302228331565857, + "learning_rate": 0.0005793099093307883, + "loss": 3.4085, + "step": 7279 + }, + { + "epoch": 0.36, + "grad_norm": 0.5519428849220276, + "learning_rate": 0.0005793042903554679, + "loss": 3.349, + "step": 7280 + }, + { + "epoch": 0.36, + "grad_norm": 0.5244438648223877, + "learning_rate": 0.0005792986706445137, + "loss": 3.2967, + "step": 7281 + }, + { + "epoch": 0.36, + "grad_norm": 0.5649428367614746, + "learning_rate": 0.0005792930501979406, + "loss": 3.2556, + "step": 7282 + }, + { + "epoch": 0.36, + "grad_norm": 0.512695848941803, + "learning_rate": 0.0005792874290157633, + "loss": 3.3095, + "step": 7283 + }, + { + "epoch": 0.36, + "grad_norm": 0.5236402750015259, + "learning_rate": 0.0005792818070979967, + "loss": 3.4541, + "step": 7284 + }, + { + "epoch": 0.36, + "grad_norm": 0.5257241725921631, + "learning_rate": 0.0005792761844446555, + "loss": 3.3157, + "step": 7285 + }, + { + "epoch": 0.36, + "grad_norm": 0.5090532898902893, + "learning_rate": 0.0005792705610557546, + "loss": 3.4076, + "step": 7286 + }, + { + "epoch": 0.36, + "grad_norm": 0.5210464000701904, + "learning_rate": 0.0005792649369313088, + "loss": 3.1769, + "step": 7287 + }, + { + "epoch": 0.36, + "grad_norm": 0.5331592559814453, + "learning_rate": 0.0005792593120713329, + "loss": 3.4404, + "step": 7288 + }, + { + "epoch": 0.36, + "grad_norm": 0.5243934988975525, + "learning_rate": 0.0005792536864758418, + "loss": 3.2593, + "step": 7289 + }, + { + "epoch": 0.36, + "grad_norm": 0.5213882327079773, + "learning_rate": 0.0005792480601448502, + "loss": 3.3763, + "step": 7290 + }, + { + "epoch": 0.36, + "grad_norm": 0.5196213722229004, + "learning_rate": 0.0005792424330783729, + "loss": 3.3627, + "step": 7291 + }, + { + "epoch": 0.36, + "grad_norm": 0.5319061875343323, + "learning_rate": 0.0005792368052764248, + "loss": 3.3825, + "step": 7292 + }, + { + "epoch": 0.36, + "grad_norm": 0.5190185308456421, + "learning_rate": 0.0005792311767390207, + "loss": 3.3911, + "step": 7293 + }, + { + "epoch": 0.36, + "grad_norm": 0.5538293123245239, + "learning_rate": 0.0005792255474661753, + "loss": 3.5731, + "step": 7294 + }, + { + "epoch": 0.36, + "grad_norm": 0.5294688940048218, + "learning_rate": 0.0005792199174579038, + "loss": 3.4866, + "step": 7295 + }, + { + "epoch": 0.36, + "grad_norm": 0.5441796183586121, + "learning_rate": 0.0005792142867142206, + "loss": 3.4322, + "step": 7296 + }, + { + "epoch": 0.36, + "grad_norm": 0.49350860714912415, + "learning_rate": 0.0005792086552351407, + "loss": 3.1275, + "step": 7297 + }, + { + "epoch": 0.36, + "grad_norm": 0.569159746170044, + "learning_rate": 0.0005792030230206789, + "loss": 3.3663, + "step": 7298 + }, + { + "epoch": 0.36, + "grad_norm": 0.5132843255996704, + "learning_rate": 0.0005791973900708502, + "loss": 3.1675, + "step": 7299 + }, + { + "epoch": 0.36, + "grad_norm": 0.6056221723556519, + "learning_rate": 0.0005791917563856692, + "loss": 3.4195, + "step": 7300 + }, + { + "epoch": 0.36, + "grad_norm": 0.532596230506897, + "learning_rate": 0.000579186121965151, + "loss": 3.183, + "step": 7301 + }, + { + "epoch": 0.36, + "grad_norm": 0.5601078867912292, + "learning_rate": 0.0005791804868093101, + "loss": 3.228, + "step": 7302 + }, + { + "epoch": 0.36, + "grad_norm": 0.5013242959976196, + "learning_rate": 0.0005791748509181616, + "loss": 3.269, + "step": 7303 + }, + { + "epoch": 0.36, + "grad_norm": 0.5339405536651611, + "learning_rate": 0.0005791692142917203, + "loss": 3.416, + "step": 7304 + }, + { + "epoch": 0.36, + "grad_norm": 0.5372835993766785, + "learning_rate": 0.000579163576930001, + "loss": 3.4051, + "step": 7305 + }, + { + "epoch": 0.36, + "grad_norm": 0.5756890773773193, + "learning_rate": 0.0005791579388330186, + "loss": 3.5151, + "step": 7306 + }, + { + "epoch": 0.36, + "grad_norm": 0.5298717617988586, + "learning_rate": 0.0005791523000007878, + "loss": 3.3441, + "step": 7307 + }, + { + "epoch": 0.36, + "grad_norm": 0.5848795175552368, + "learning_rate": 0.0005791466604333237, + "loss": 3.4685, + "step": 7308 + }, + { + "epoch": 0.36, + "grad_norm": 0.5432910323143005, + "learning_rate": 0.0005791410201306409, + "loss": 3.4025, + "step": 7309 + }, + { + "epoch": 0.36, + "grad_norm": 0.5559453964233398, + "learning_rate": 0.0005791353790927545, + "loss": 3.3999, + "step": 7310 + }, + { + "epoch": 0.36, + "grad_norm": 0.5492426753044128, + "learning_rate": 0.0005791297373196791, + "loss": 3.4373, + "step": 7311 + }, + { + "epoch": 0.36, + "grad_norm": 0.5173240900039673, + "learning_rate": 0.0005791240948114297, + "loss": 3.2916, + "step": 7312 + }, + { + "epoch": 0.36, + "grad_norm": 0.5640772581100464, + "learning_rate": 0.0005791184515680213, + "loss": 3.447, + "step": 7313 + }, + { + "epoch": 0.36, + "grad_norm": 0.5233004093170166, + "learning_rate": 0.0005791128075894685, + "loss": 3.5497, + "step": 7314 + }, + { + "epoch": 0.36, + "grad_norm": 0.5180065631866455, + "learning_rate": 0.0005791071628757862, + "loss": 3.3526, + "step": 7315 + }, + { + "epoch": 0.36, + "grad_norm": 0.5473960041999817, + "learning_rate": 0.0005791015174269895, + "loss": 3.1132, + "step": 7316 + }, + { + "epoch": 0.36, + "grad_norm": 0.5282230377197266, + "learning_rate": 0.0005790958712430931, + "loss": 3.4481, + "step": 7317 + }, + { + "epoch": 0.36, + "grad_norm": 0.5316844582557678, + "learning_rate": 0.0005790902243241119, + "loss": 3.4213, + "step": 7318 + }, + { + "epoch": 0.36, + "grad_norm": 0.5286957621574402, + "learning_rate": 0.0005790845766700608, + "loss": 3.5678, + "step": 7319 + }, + { + "epoch": 0.36, + "grad_norm": 0.49466997385025024, + "learning_rate": 0.0005790789282809545, + "loss": 3.1566, + "step": 7320 + }, + { + "epoch": 0.36, + "grad_norm": 0.5194963812828064, + "learning_rate": 0.0005790732791568081, + "loss": 3.2, + "step": 7321 + }, + { + "epoch": 0.36, + "grad_norm": 0.5403347611427307, + "learning_rate": 0.0005790676292976363, + "loss": 3.5393, + "step": 7322 + }, + { + "epoch": 0.36, + "grad_norm": 0.5521409511566162, + "learning_rate": 0.0005790619787034542, + "loss": 3.1492, + "step": 7323 + }, + { + "epoch": 0.36, + "grad_norm": 0.5029959678649902, + "learning_rate": 0.0005790563273742764, + "loss": 3.2783, + "step": 7324 + }, + { + "epoch": 0.36, + "grad_norm": 0.5743463039398193, + "learning_rate": 0.0005790506753101181, + "loss": 3.3343, + "step": 7325 + }, + { + "epoch": 0.36, + "grad_norm": 0.5631399154663086, + "learning_rate": 0.0005790450225109939, + "loss": 3.4623, + "step": 7326 + }, + { + "epoch": 0.36, + "grad_norm": 0.5515002608299255, + "learning_rate": 0.0005790393689769188, + "loss": 3.5293, + "step": 7327 + }, + { + "epoch": 0.36, + "grad_norm": 0.5547970533370972, + "learning_rate": 0.0005790337147079078, + "loss": 3.3389, + "step": 7328 + }, + { + "epoch": 0.36, + "grad_norm": 0.5851017832756042, + "learning_rate": 0.0005790280597039757, + "loss": 3.4403, + "step": 7329 + }, + { + "epoch": 0.36, + "grad_norm": 0.5544925332069397, + "learning_rate": 0.0005790224039651374, + "loss": 3.4946, + "step": 7330 + }, + { + "epoch": 0.36, + "grad_norm": 0.5337487459182739, + "learning_rate": 0.0005790167474914077, + "loss": 3.379, + "step": 7331 + }, + { + "epoch": 0.36, + "grad_norm": 0.5269622206687927, + "learning_rate": 0.0005790110902828017, + "loss": 3.2976, + "step": 7332 + }, + { + "epoch": 0.36, + "grad_norm": 0.5398240089416504, + "learning_rate": 0.000579005432339334, + "loss": 3.4284, + "step": 7333 + }, + { + "epoch": 0.36, + "grad_norm": 0.5220947861671448, + "learning_rate": 0.0005789997736610199, + "loss": 3.2636, + "step": 7334 + }, + { + "epoch": 0.36, + "grad_norm": 0.5519540309906006, + "learning_rate": 0.000578994114247874, + "loss": 3.1985, + "step": 7335 + }, + { + "epoch": 0.36, + "grad_norm": 0.604718029499054, + "learning_rate": 0.0005789884540999112, + "loss": 3.5494, + "step": 7336 + }, + { + "epoch": 0.36, + "grad_norm": 0.5395439863204956, + "learning_rate": 0.0005789827932171466, + "loss": 3.4658, + "step": 7337 + }, + { + "epoch": 0.36, + "grad_norm": 0.5418725609779358, + "learning_rate": 0.0005789771315995951, + "loss": 3.4172, + "step": 7338 + }, + { + "epoch": 0.36, + "grad_norm": 0.5764614343643188, + "learning_rate": 0.0005789714692472714, + "loss": 3.2426, + "step": 7339 + }, + { + "epoch": 0.36, + "grad_norm": 0.5681049227714539, + "learning_rate": 0.0005789658061601905, + "loss": 3.3262, + "step": 7340 + }, + { + "epoch": 0.36, + "grad_norm": 0.5404443144798279, + "learning_rate": 0.0005789601423383675, + "loss": 3.6112, + "step": 7341 + }, + { + "epoch": 0.36, + "grad_norm": 0.5424375534057617, + "learning_rate": 0.000578954477781817, + "loss": 3.5564, + "step": 7342 + }, + { + "epoch": 0.36, + "grad_norm": 0.5254672765731812, + "learning_rate": 0.0005789488124905542, + "loss": 3.271, + "step": 7343 + }, + { + "epoch": 0.36, + "grad_norm": 0.5020593404769897, + "learning_rate": 0.000578943146464594, + "loss": 3.3267, + "step": 7344 + }, + { + "epoch": 0.36, + "grad_norm": 0.5463967323303223, + "learning_rate": 0.0005789374797039511, + "loss": 3.4191, + "step": 7345 + }, + { + "epoch": 0.36, + "grad_norm": 0.5019806623458862, + "learning_rate": 0.0005789318122086406, + "loss": 3.2696, + "step": 7346 + }, + { + "epoch": 0.36, + "grad_norm": 0.5514928698539734, + "learning_rate": 0.0005789261439786774, + "loss": 3.1895, + "step": 7347 + }, + { + "epoch": 0.36, + "grad_norm": 0.512698769569397, + "learning_rate": 0.0005789204750140764, + "loss": 3.4189, + "step": 7348 + }, + { + "epoch": 0.36, + "grad_norm": 0.5055906772613525, + "learning_rate": 0.0005789148053148525, + "loss": 3.2841, + "step": 7349 + }, + { + "epoch": 0.36, + "grad_norm": 0.527269721031189, + "learning_rate": 0.0005789091348810208, + "loss": 3.5163, + "step": 7350 + }, + { + "epoch": 0.36, + "grad_norm": 0.5826881527900696, + "learning_rate": 0.0005789034637125961, + "loss": 3.4445, + "step": 7351 + }, + { + "epoch": 0.36, + "grad_norm": 0.5765799283981323, + "learning_rate": 0.0005788977918095932, + "loss": 3.2363, + "step": 7352 + }, + { + "epoch": 0.36, + "grad_norm": 0.5874758362770081, + "learning_rate": 0.0005788921191720273, + "loss": 3.5972, + "step": 7353 + }, + { + "epoch": 0.36, + "grad_norm": 0.5443326234817505, + "learning_rate": 0.0005788864457999133, + "loss": 3.3818, + "step": 7354 + }, + { + "epoch": 0.36, + "grad_norm": 0.5976613759994507, + "learning_rate": 0.000578880771693266, + "loss": 3.1729, + "step": 7355 + }, + { + "epoch": 0.36, + "grad_norm": 0.571810781955719, + "learning_rate": 0.0005788750968521003, + "loss": 3.3752, + "step": 7356 + }, + { + "epoch": 0.36, + "grad_norm": 0.5518274307250977, + "learning_rate": 0.0005788694212764314, + "loss": 3.3239, + "step": 7357 + }, + { + "epoch": 0.36, + "grad_norm": 0.5681546330451965, + "learning_rate": 0.0005788637449662742, + "loss": 3.3544, + "step": 7358 + }, + { + "epoch": 0.36, + "grad_norm": 0.5265949368476868, + "learning_rate": 0.0005788580679216434, + "loss": 3.3936, + "step": 7359 + }, + { + "epoch": 0.36, + "grad_norm": 0.5548690557479858, + "learning_rate": 0.0005788523901425541, + "loss": 3.2901, + "step": 7360 + }, + { + "epoch": 0.36, + "grad_norm": 0.5343828201293945, + "learning_rate": 0.0005788467116290214, + "loss": 3.2131, + "step": 7361 + }, + { + "epoch": 0.36, + "grad_norm": 0.5076448321342468, + "learning_rate": 0.0005788410323810602, + "loss": 3.3964, + "step": 7362 + }, + { + "epoch": 0.36, + "grad_norm": 0.49037474393844604, + "learning_rate": 0.0005788353523986852, + "loss": 3.2415, + "step": 7363 + }, + { + "epoch": 0.36, + "grad_norm": 0.5263360142707825, + "learning_rate": 0.0005788296716819116, + "loss": 3.4041, + "step": 7364 + }, + { + "epoch": 0.36, + "grad_norm": 0.5553779006004333, + "learning_rate": 0.0005788239902307543, + "loss": 3.3971, + "step": 7365 + }, + { + "epoch": 0.36, + "grad_norm": 0.5806408524513245, + "learning_rate": 0.0005788183080452283, + "loss": 3.1735, + "step": 7366 + }, + { + "epoch": 0.36, + "grad_norm": 0.5436879396438599, + "learning_rate": 0.0005788126251253486, + "loss": 3.5634, + "step": 7367 + }, + { + "epoch": 0.36, + "grad_norm": 0.5230838656425476, + "learning_rate": 0.00057880694147113, + "loss": 3.2974, + "step": 7368 + }, + { + "epoch": 0.36, + "grad_norm": 0.630568265914917, + "learning_rate": 0.0005788012570825875, + "loss": 3.2782, + "step": 7369 + }, + { + "epoch": 0.36, + "grad_norm": 0.5829651951789856, + "learning_rate": 0.0005787955719597362, + "loss": 3.0793, + "step": 7370 + }, + { + "epoch": 0.36, + "grad_norm": 0.5027291774749756, + "learning_rate": 0.0005787898861025912, + "loss": 3.4173, + "step": 7371 + }, + { + "epoch": 0.36, + "grad_norm": 0.5039538145065308, + "learning_rate": 0.0005787841995111671, + "loss": 3.2034, + "step": 7372 + }, + { + "epoch": 0.36, + "grad_norm": 0.5734065175056458, + "learning_rate": 0.0005787785121854791, + "loss": 3.459, + "step": 7373 + }, + { + "epoch": 0.36, + "grad_norm": 0.5559008717536926, + "learning_rate": 0.0005787728241255422, + "loss": 3.1911, + "step": 7374 + }, + { + "epoch": 0.36, + "grad_norm": 0.5591650605201721, + "learning_rate": 0.0005787671353313712, + "loss": 3.1949, + "step": 7375 + }, + { + "epoch": 0.36, + "grad_norm": 0.52269047498703, + "learning_rate": 0.0005787614458029813, + "loss": 3.3678, + "step": 7376 + }, + { + "epoch": 0.36, + "grad_norm": 0.5631672143936157, + "learning_rate": 0.0005787557555403875, + "loss": 3.3729, + "step": 7377 + }, + { + "epoch": 0.36, + "grad_norm": 0.5107264518737793, + "learning_rate": 0.0005787500645436047, + "loss": 3.3954, + "step": 7378 + }, + { + "epoch": 0.36, + "grad_norm": 0.5732921361923218, + "learning_rate": 0.0005787443728126478, + "loss": 3.4472, + "step": 7379 + }, + { + "epoch": 0.36, + "grad_norm": 0.5226930975914001, + "learning_rate": 0.0005787386803475318, + "loss": 3.4802, + "step": 7380 + }, + { + "epoch": 0.36, + "grad_norm": 0.5663575530052185, + "learning_rate": 0.0005787329871482717, + "loss": 3.3462, + "step": 7381 + }, + { + "epoch": 0.36, + "grad_norm": 0.5493120551109314, + "learning_rate": 0.0005787272932148827, + "loss": 3.2889, + "step": 7382 + }, + { + "epoch": 0.36, + "grad_norm": 0.5059551000595093, + "learning_rate": 0.0005787215985473797, + "loss": 3.3501, + "step": 7383 + }, + { + "epoch": 0.36, + "grad_norm": 0.5528244972229004, + "learning_rate": 0.0005787159031457776, + "loss": 3.3196, + "step": 7384 + }, + { + "epoch": 0.36, + "grad_norm": 0.519409716129303, + "learning_rate": 0.0005787102070100914, + "loss": 3.2856, + "step": 7385 + }, + { + "epoch": 0.36, + "grad_norm": 0.5072803497314453, + "learning_rate": 0.0005787045101403362, + "loss": 3.4766, + "step": 7386 + }, + { + "epoch": 0.36, + "grad_norm": 0.5262261033058167, + "learning_rate": 0.0005786988125365269, + "loss": 3.3746, + "step": 7387 + }, + { + "epoch": 0.36, + "grad_norm": 0.5350049734115601, + "learning_rate": 0.0005786931141986786, + "loss": 3.2329, + "step": 7388 + }, + { + "epoch": 0.36, + "grad_norm": 0.5463743805885315, + "learning_rate": 0.0005786874151268062, + "loss": 3.2722, + "step": 7389 + }, + { + "epoch": 0.36, + "grad_norm": 0.5149025321006775, + "learning_rate": 0.0005786817153209249, + "loss": 3.243, + "step": 7390 + }, + { + "epoch": 0.36, + "grad_norm": 0.5367695093154907, + "learning_rate": 0.0005786760147810496, + "loss": 3.4026, + "step": 7391 + }, + { + "epoch": 0.36, + "grad_norm": 0.5625688433647156, + "learning_rate": 0.0005786703135071952, + "loss": 3.3234, + "step": 7392 + }, + { + "epoch": 0.36, + "grad_norm": 0.5635698437690735, + "learning_rate": 0.000578664611499377, + "loss": 3.3182, + "step": 7393 + }, + { + "epoch": 0.36, + "grad_norm": 0.5578081607818604, + "learning_rate": 0.0005786589087576097, + "loss": 3.1293, + "step": 7394 + }, + { + "epoch": 0.36, + "grad_norm": 0.5266342163085938, + "learning_rate": 0.0005786532052819085, + "loss": 3.4332, + "step": 7395 + }, + { + "epoch": 0.36, + "grad_norm": 0.524929940700531, + "learning_rate": 0.0005786475010722883, + "loss": 3.1655, + "step": 7396 + }, + { + "epoch": 0.36, + "grad_norm": 0.521431028842926, + "learning_rate": 0.0005786417961287643, + "loss": 3.2723, + "step": 7397 + }, + { + "epoch": 0.36, + "grad_norm": 0.5528025031089783, + "learning_rate": 0.0005786360904513515, + "loss": 3.495, + "step": 7398 + }, + { + "epoch": 0.36, + "grad_norm": 0.47876498103141785, + "learning_rate": 0.0005786303840400647, + "loss": 3.2928, + "step": 7399 + }, + { + "epoch": 0.36, + "grad_norm": 0.5592080354690552, + "learning_rate": 0.0005786246768949193, + "loss": 3.1918, + "step": 7400 + }, + { + "epoch": 0.36, + "grad_norm": 0.5398855209350586, + "learning_rate": 0.0005786189690159299, + "loss": 3.3853, + "step": 7401 + }, + { + "epoch": 0.36, + "grad_norm": 0.5718713402748108, + "learning_rate": 0.0005786132604031119, + "loss": 3.3739, + "step": 7402 + }, + { + "epoch": 0.36, + "grad_norm": 0.5837875604629517, + "learning_rate": 0.0005786075510564801, + "loss": 3.3774, + "step": 7403 + }, + { + "epoch": 0.36, + "grad_norm": 0.5221856236457825, + "learning_rate": 0.0005786018409760497, + "loss": 3.4436, + "step": 7404 + }, + { + "epoch": 0.36, + "grad_norm": 0.5874915719032288, + "learning_rate": 0.0005785961301618356, + "loss": 3.3879, + "step": 7405 + }, + { + "epoch": 0.36, + "grad_norm": 0.5608260035514832, + "learning_rate": 0.000578590418613853, + "loss": 3.3331, + "step": 7406 + }, + { + "epoch": 0.36, + "grad_norm": 0.5786241292953491, + "learning_rate": 0.0005785847063321168, + "loss": 3.2905, + "step": 7407 + }, + { + "epoch": 0.36, + "grad_norm": 0.5414553284645081, + "learning_rate": 0.0005785789933166422, + "loss": 3.2087, + "step": 7408 + }, + { + "epoch": 0.36, + "grad_norm": 0.5226951837539673, + "learning_rate": 0.000578573279567444, + "loss": 3.4015, + "step": 7409 + }, + { + "epoch": 0.36, + "grad_norm": 0.5430212616920471, + "learning_rate": 0.0005785675650845374, + "loss": 3.208, + "step": 7410 + }, + { + "epoch": 0.36, + "grad_norm": 0.5425985455513, + "learning_rate": 0.0005785618498679375, + "loss": 3.3109, + "step": 7411 + }, + { + "epoch": 0.36, + "grad_norm": 0.5451919436454773, + "learning_rate": 0.0005785561339176593, + "loss": 3.1937, + "step": 7412 + }, + { + "epoch": 0.36, + "grad_norm": 0.5355823636054993, + "learning_rate": 0.0005785504172337178, + "loss": 3.2868, + "step": 7413 + }, + { + "epoch": 0.36, + "grad_norm": 0.4789738059043884, + "learning_rate": 0.0005785446998161282, + "loss": 3.4312, + "step": 7414 + }, + { + "epoch": 0.36, + "grad_norm": 0.5494072437286377, + "learning_rate": 0.0005785389816649054, + "loss": 3.481, + "step": 7415 + }, + { + "epoch": 0.36, + "grad_norm": 0.529240608215332, + "learning_rate": 0.0005785332627800645, + "loss": 3.231, + "step": 7416 + }, + { + "epoch": 0.36, + "grad_norm": 0.5651838779449463, + "learning_rate": 0.0005785275431616207, + "loss": 3.3466, + "step": 7417 + }, + { + "epoch": 0.36, + "grad_norm": 0.5189810395240784, + "learning_rate": 0.0005785218228095889, + "loss": 3.2454, + "step": 7418 + }, + { + "epoch": 0.36, + "grad_norm": 0.5465260744094849, + "learning_rate": 0.0005785161017239842, + "loss": 3.3845, + "step": 7419 + }, + { + "epoch": 0.36, + "grad_norm": 0.5511559844017029, + "learning_rate": 0.0005785103799048218, + "loss": 3.4355, + "step": 7420 + }, + { + "epoch": 0.36, + "grad_norm": 0.5183231830596924, + "learning_rate": 0.0005785046573521165, + "loss": 3.3434, + "step": 7421 + }, + { + "epoch": 0.36, + "grad_norm": 0.5259350538253784, + "learning_rate": 0.0005784989340658837, + "loss": 3.3595, + "step": 7422 + }, + { + "epoch": 0.36, + "grad_norm": 0.5619202852249146, + "learning_rate": 0.000578493210046138, + "loss": 3.4631, + "step": 7423 + }, + { + "epoch": 0.36, + "grad_norm": 0.5505042672157288, + "learning_rate": 0.000578487485292895, + "loss": 3.3808, + "step": 7424 + }, + { + "epoch": 0.36, + "grad_norm": 0.5139535665512085, + "learning_rate": 0.0005784817598061696, + "loss": 3.2009, + "step": 7425 + }, + { + "epoch": 0.36, + "grad_norm": 0.580228328704834, + "learning_rate": 0.0005784760335859769, + "loss": 3.1475, + "step": 7426 + }, + { + "epoch": 0.36, + "grad_norm": 0.535790205001831, + "learning_rate": 0.0005784703066323317, + "loss": 3.2759, + "step": 7427 + }, + { + "epoch": 0.36, + "grad_norm": 0.5042516589164734, + "learning_rate": 0.0005784645789452494, + "loss": 3.1972, + "step": 7428 + }, + { + "epoch": 0.36, + "grad_norm": 0.5909183025360107, + "learning_rate": 0.000578458850524745, + "loss": 3.3301, + "step": 7429 + }, + { + "epoch": 0.36, + "grad_norm": 0.5318119525909424, + "learning_rate": 0.0005784531213708336, + "loss": 3.4631, + "step": 7430 + }, + { + "epoch": 0.36, + "grad_norm": 0.4836869537830353, + "learning_rate": 0.0005784473914835302, + "loss": 3.5986, + "step": 7431 + }, + { + "epoch": 0.36, + "grad_norm": 0.5234432816505432, + "learning_rate": 0.0005784416608628501, + "loss": 3.1308, + "step": 7432 + }, + { + "epoch": 0.36, + "grad_norm": 0.6105952858924866, + "learning_rate": 0.0005784359295088081, + "loss": 3.3292, + "step": 7433 + }, + { + "epoch": 0.36, + "grad_norm": 0.600591242313385, + "learning_rate": 0.0005784301974214195, + "loss": 3.3737, + "step": 7434 + }, + { + "epoch": 0.36, + "grad_norm": 0.48450472950935364, + "learning_rate": 0.0005784244646006993, + "loss": 3.2875, + "step": 7435 + }, + { + "epoch": 0.36, + "grad_norm": 0.6669772863388062, + "learning_rate": 0.0005784187310466628, + "loss": 3.239, + "step": 7436 + }, + { + "epoch": 0.36, + "grad_norm": 0.5766210556030273, + "learning_rate": 0.0005784129967593249, + "loss": 3.2816, + "step": 7437 + }, + { + "epoch": 0.36, + "grad_norm": 0.5472173690795898, + "learning_rate": 0.0005784072617387006, + "loss": 3.3769, + "step": 7438 + }, + { + "epoch": 0.36, + "grad_norm": 0.5345730185508728, + "learning_rate": 0.0005784015259848053, + "loss": 3.4177, + "step": 7439 + }, + { + "epoch": 0.36, + "grad_norm": 0.5472462177276611, + "learning_rate": 0.0005783957894976538, + "loss": 3.3872, + "step": 7440 + }, + { + "epoch": 0.36, + "grad_norm": 0.5371647477149963, + "learning_rate": 0.0005783900522772615, + "loss": 3.4841, + "step": 7441 + }, + { + "epoch": 0.36, + "grad_norm": 0.5290404558181763, + "learning_rate": 0.0005783843143236433, + "loss": 3.2963, + "step": 7442 + }, + { + "epoch": 0.36, + "grad_norm": 0.5215094685554504, + "learning_rate": 0.0005783785756368145, + "loss": 3.3373, + "step": 7443 + }, + { + "epoch": 0.36, + "grad_norm": 0.61955726146698, + "learning_rate": 0.0005783728362167901, + "loss": 3.3818, + "step": 7444 + }, + { + "epoch": 0.36, + "grad_norm": 0.5387982130050659, + "learning_rate": 0.0005783670960635851, + "loss": 3.3528, + "step": 7445 + }, + { + "epoch": 0.36, + "grad_norm": 0.553604245185852, + "learning_rate": 0.0005783613551772149, + "loss": 3.3234, + "step": 7446 + }, + { + "epoch": 0.36, + "grad_norm": 0.5161128640174866, + "learning_rate": 0.0005783556135576942, + "loss": 3.2385, + "step": 7447 + }, + { + "epoch": 0.37, + "grad_norm": 0.5468826293945312, + "learning_rate": 0.0005783498712050386, + "loss": 3.3768, + "step": 7448 + }, + { + "epoch": 0.37, + "grad_norm": 0.5205458402633667, + "learning_rate": 0.000578344128119263, + "loss": 3.4234, + "step": 7449 + }, + { + "epoch": 0.37, + "grad_norm": 0.5659615397453308, + "learning_rate": 0.0005783383843003825, + "loss": 3.2642, + "step": 7450 + }, + { + "epoch": 0.37, + "grad_norm": 0.6167935132980347, + "learning_rate": 0.0005783326397484123, + "loss": 3.4605, + "step": 7451 + }, + { + "epoch": 0.37, + "grad_norm": 0.5576913356781006, + "learning_rate": 0.0005783268944633675, + "loss": 3.4278, + "step": 7452 + }, + { + "epoch": 0.37, + "grad_norm": 0.5447071194648743, + "learning_rate": 0.0005783211484452632, + "loss": 3.2907, + "step": 7453 + }, + { + "epoch": 0.37, + "grad_norm": 0.49073418974876404, + "learning_rate": 0.0005783154016941145, + "loss": 3.3534, + "step": 7454 + }, + { + "epoch": 0.37, + "grad_norm": 0.5985206365585327, + "learning_rate": 0.0005783096542099366, + "loss": 3.1996, + "step": 7455 + }, + { + "epoch": 0.37, + "grad_norm": 0.5504509806632996, + "learning_rate": 0.0005783039059927448, + "loss": 3.3993, + "step": 7456 + }, + { + "epoch": 0.37, + "grad_norm": 0.5634692907333374, + "learning_rate": 0.0005782981570425539, + "loss": 3.3579, + "step": 7457 + }, + { + "epoch": 0.37, + "grad_norm": 0.49616754055023193, + "learning_rate": 0.0005782924073593792, + "loss": 3.3781, + "step": 7458 + }, + { + "epoch": 0.37, + "grad_norm": 0.533212423324585, + "learning_rate": 0.0005782866569432359, + "loss": 3.1977, + "step": 7459 + }, + { + "epoch": 0.37, + "grad_norm": 0.5390735864639282, + "learning_rate": 0.0005782809057941392, + "loss": 3.358, + "step": 7460 + }, + { + "epoch": 0.37, + "grad_norm": 0.5744284391403198, + "learning_rate": 0.000578275153912104, + "loss": 3.3979, + "step": 7461 + }, + { + "epoch": 0.37, + "grad_norm": 0.5401898622512817, + "learning_rate": 0.0005782694012971458, + "loss": 3.557, + "step": 7462 + }, + { + "epoch": 0.37, + "grad_norm": 0.5606440305709839, + "learning_rate": 0.0005782636479492793, + "loss": 3.2477, + "step": 7463 + }, + { + "epoch": 0.37, + "grad_norm": 0.48706990480422974, + "learning_rate": 0.0005782578938685201, + "loss": 3.5734, + "step": 7464 + }, + { + "epoch": 0.37, + "grad_norm": 0.5521231293678284, + "learning_rate": 0.0005782521390548831, + "loss": 3.3121, + "step": 7465 + }, + { + "epoch": 0.37, + "grad_norm": 0.5380448698997498, + "learning_rate": 0.0005782463835083834, + "loss": 3.4478, + "step": 7466 + }, + { + "epoch": 0.37, + "grad_norm": 0.5551396608352661, + "learning_rate": 0.0005782406272290364, + "loss": 3.2938, + "step": 7467 + }, + { + "epoch": 0.37, + "grad_norm": 0.5721957683563232, + "learning_rate": 0.0005782348702168572, + "loss": 3.3066, + "step": 7468 + }, + { + "epoch": 0.37, + "grad_norm": 0.5749885439872742, + "learning_rate": 0.0005782291124718608, + "loss": 3.3865, + "step": 7469 + }, + { + "epoch": 0.37, + "grad_norm": 0.5412680506706238, + "learning_rate": 0.0005782233539940625, + "loss": 3.1974, + "step": 7470 + }, + { + "epoch": 0.37, + "grad_norm": 0.530744731426239, + "learning_rate": 0.0005782175947834774, + "loss": 3.42, + "step": 7471 + }, + { + "epoch": 0.37, + "grad_norm": 0.5663665533065796, + "learning_rate": 0.0005782118348401207, + "loss": 3.4735, + "step": 7472 + }, + { + "epoch": 0.37, + "grad_norm": 0.5773357152938843, + "learning_rate": 0.0005782060741640075, + "loss": 3.287, + "step": 7473 + }, + { + "epoch": 0.37, + "grad_norm": 0.5321745872497559, + "learning_rate": 0.000578200312755153, + "loss": 3.2528, + "step": 7474 + }, + { + "epoch": 0.37, + "grad_norm": 0.543752908706665, + "learning_rate": 0.0005781945506135726, + "loss": 3.1304, + "step": 7475 + }, + { + "epoch": 0.37, + "grad_norm": 0.5263063311576843, + "learning_rate": 0.0005781887877392812, + "loss": 3.5356, + "step": 7476 + }, + { + "epoch": 0.37, + "grad_norm": 0.5132307410240173, + "learning_rate": 0.0005781830241322942, + "loss": 3.0732, + "step": 7477 + }, + { + "epoch": 0.37, + "grad_norm": 0.5528070330619812, + "learning_rate": 0.0005781772597926266, + "loss": 3.5516, + "step": 7478 + }, + { + "epoch": 0.37, + "grad_norm": 0.5620551109313965, + "learning_rate": 0.0005781714947202935, + "loss": 3.5392, + "step": 7479 + }, + { + "epoch": 0.37, + "grad_norm": 0.5856842398643494, + "learning_rate": 0.0005781657289153103, + "loss": 3.4985, + "step": 7480 + }, + { + "epoch": 0.37, + "grad_norm": 0.49245211482048035, + "learning_rate": 0.0005781599623776922, + "loss": 3.2934, + "step": 7481 + }, + { + "epoch": 0.37, + "grad_norm": 0.5112742781639099, + "learning_rate": 0.0005781541951074541, + "loss": 3.39, + "step": 7482 + }, + { + "epoch": 0.37, + "grad_norm": 0.567556619644165, + "learning_rate": 0.0005781484271046115, + "loss": 3.3748, + "step": 7483 + }, + { + "epoch": 0.37, + "grad_norm": 0.5500913262367249, + "learning_rate": 0.0005781426583691794, + "loss": 3.1794, + "step": 7484 + }, + { + "epoch": 0.37, + "grad_norm": 0.588733434677124, + "learning_rate": 0.0005781368889011731, + "loss": 3.3915, + "step": 7485 + }, + { + "epoch": 0.37, + "grad_norm": 0.5132182240486145, + "learning_rate": 0.0005781311187006079, + "loss": 3.2548, + "step": 7486 + }, + { + "epoch": 0.37, + "grad_norm": 0.5105894804000854, + "learning_rate": 0.0005781253477674987, + "loss": 3.4658, + "step": 7487 + }, + { + "epoch": 0.37, + "grad_norm": 0.5359848737716675, + "learning_rate": 0.0005781195761018609, + "loss": 3.2688, + "step": 7488 + }, + { + "epoch": 0.37, + "grad_norm": 0.5214976072311401, + "learning_rate": 0.0005781138037037096, + "loss": 3.3225, + "step": 7489 + }, + { + "epoch": 0.37, + "grad_norm": 0.5228166580200195, + "learning_rate": 0.0005781080305730603, + "loss": 3.2782, + "step": 7490 + }, + { + "epoch": 0.37, + "grad_norm": 0.5464044809341431, + "learning_rate": 0.0005781022567099277, + "loss": 3.2469, + "step": 7491 + }, + { + "epoch": 0.37, + "grad_norm": 0.5573117733001709, + "learning_rate": 0.0005780964821143274, + "loss": 3.0187, + "step": 7492 + }, + { + "epoch": 0.37, + "grad_norm": 0.5702837109565735, + "learning_rate": 0.0005780907067862744, + "loss": 3.2428, + "step": 7493 + }, + { + "epoch": 0.37, + "grad_norm": 0.6031702160835266, + "learning_rate": 0.0005780849307257842, + "loss": 3.4579, + "step": 7494 + }, + { + "epoch": 0.37, + "grad_norm": 0.5179644823074341, + "learning_rate": 0.0005780791539328716, + "loss": 3.1622, + "step": 7495 + }, + { + "epoch": 0.37, + "grad_norm": 0.5007571578025818, + "learning_rate": 0.0005780733764075521, + "loss": 3.4954, + "step": 7496 + }, + { + "epoch": 0.37, + "grad_norm": 0.5171201825141907, + "learning_rate": 0.0005780675981498409, + "loss": 3.379, + "step": 7497 + }, + { + "epoch": 0.37, + "grad_norm": 0.5398955941200256, + "learning_rate": 0.0005780618191597531, + "loss": 3.3197, + "step": 7498 + }, + { + "epoch": 0.37, + "grad_norm": 0.5368615984916687, + "learning_rate": 0.000578056039437304, + "loss": 3.5917, + "step": 7499 + }, + { + "epoch": 0.37, + "grad_norm": 0.5254148244857788, + "learning_rate": 0.0005780502589825087, + "loss": 3.3423, + "step": 7500 + }, + { + "epoch": 0.37, + "grad_norm": 0.49917274713516235, + "learning_rate": 0.0005780444777953827, + "loss": 3.3729, + "step": 7501 + }, + { + "epoch": 0.37, + "grad_norm": 0.579575777053833, + "learning_rate": 0.000578038695875941, + "loss": 3.1839, + "step": 7502 + }, + { + "epoch": 0.37, + "grad_norm": 0.5500216484069824, + "learning_rate": 0.0005780329132241989, + "loss": 3.4427, + "step": 7503 + }, + { + "epoch": 0.37, + "grad_norm": 0.5117226243019104, + "learning_rate": 0.0005780271298401715, + "loss": 3.043, + "step": 7504 + }, + { + "epoch": 0.37, + "grad_norm": 0.57242751121521, + "learning_rate": 0.0005780213457238743, + "loss": 3.2203, + "step": 7505 + }, + { + "epoch": 0.37, + "grad_norm": 0.538989245891571, + "learning_rate": 0.0005780155608753223, + "loss": 3.4584, + "step": 7506 + }, + { + "epoch": 0.37, + "grad_norm": 0.5599838495254517, + "learning_rate": 0.0005780097752945307, + "loss": 3.4769, + "step": 7507 + }, + { + "epoch": 0.37, + "grad_norm": 0.5817453265190125, + "learning_rate": 0.0005780039889815151, + "loss": 3.4676, + "step": 7508 + }, + { + "epoch": 0.37, + "grad_norm": 0.5717339515686035, + "learning_rate": 0.0005779982019362903, + "loss": 3.1948, + "step": 7509 + }, + { + "epoch": 0.37, + "grad_norm": 0.5515158772468567, + "learning_rate": 0.0005779924141588718, + "loss": 3.3845, + "step": 7510 + }, + { + "epoch": 0.37, + "grad_norm": 0.5656583905220032, + "learning_rate": 0.0005779866256492749, + "loss": 3.3043, + "step": 7511 + }, + { + "epoch": 0.37, + "grad_norm": 0.5461367964744568, + "learning_rate": 0.0005779808364075146, + "loss": 3.2265, + "step": 7512 + }, + { + "epoch": 0.37, + "grad_norm": 0.5367287397384644, + "learning_rate": 0.0005779750464336062, + "loss": 3.3777, + "step": 7513 + }, + { + "epoch": 0.37, + "grad_norm": 0.5329413414001465, + "learning_rate": 0.0005779692557275651, + "loss": 3.3883, + "step": 7514 + }, + { + "epoch": 0.37, + "grad_norm": 0.5147910714149475, + "learning_rate": 0.0005779634642894066, + "loss": 3.1705, + "step": 7515 + }, + { + "epoch": 0.37, + "grad_norm": 0.5768789052963257, + "learning_rate": 0.0005779576721191457, + "loss": 3.2288, + "step": 7516 + }, + { + "epoch": 0.37, + "grad_norm": 0.5086293816566467, + "learning_rate": 0.0005779518792167978, + "loss": 3.3029, + "step": 7517 + }, + { + "epoch": 0.37, + "grad_norm": 0.5314181447029114, + "learning_rate": 0.0005779460855823782, + "loss": 3.5192, + "step": 7518 + }, + { + "epoch": 0.37, + "grad_norm": 0.5063535571098328, + "learning_rate": 0.0005779402912159021, + "loss": 3.3481, + "step": 7519 + }, + { + "epoch": 0.37, + "grad_norm": 0.5236873030662537, + "learning_rate": 0.0005779344961173847, + "loss": 3.3957, + "step": 7520 + }, + { + "epoch": 0.37, + "grad_norm": 0.5629249215126038, + "learning_rate": 0.0005779287002868413, + "loss": 3.2262, + "step": 7521 + }, + { + "epoch": 0.37, + "grad_norm": 0.6165677905082703, + "learning_rate": 0.0005779229037242873, + "loss": 3.184, + "step": 7522 + }, + { + "epoch": 0.37, + "grad_norm": 0.5311621427536011, + "learning_rate": 0.0005779171064297378, + "loss": 3.3883, + "step": 7523 + }, + { + "epoch": 0.37, + "grad_norm": 0.5682266354560852, + "learning_rate": 0.0005779113084032082, + "loss": 3.4386, + "step": 7524 + }, + { + "epoch": 0.37, + "grad_norm": 0.5127180814743042, + "learning_rate": 0.0005779055096447136, + "loss": 3.3266, + "step": 7525 + }, + { + "epoch": 0.37, + "grad_norm": 0.49226856231689453, + "learning_rate": 0.0005778997101542694, + "loss": 3.3149, + "step": 7526 + }, + { + "epoch": 0.37, + "grad_norm": 0.5309831500053406, + "learning_rate": 0.0005778939099318908, + "loss": 3.2658, + "step": 7527 + }, + { + "epoch": 0.37, + "grad_norm": 0.5468692779541016, + "learning_rate": 0.0005778881089775933, + "loss": 3.3354, + "step": 7528 + }, + { + "epoch": 0.37, + "grad_norm": 0.5492680072784424, + "learning_rate": 0.0005778823072913918, + "loss": 3.3651, + "step": 7529 + }, + { + "epoch": 0.37, + "grad_norm": 0.5209020972251892, + "learning_rate": 0.0005778765048733019, + "loss": 3.3828, + "step": 7530 + }, + { + "epoch": 0.37, + "grad_norm": 0.5251026153564453, + "learning_rate": 0.0005778707017233387, + "loss": 3.4814, + "step": 7531 + }, + { + "epoch": 0.37, + "grad_norm": 0.5343338847160339, + "learning_rate": 0.0005778648978415176, + "loss": 3.3883, + "step": 7532 + }, + { + "epoch": 0.37, + "grad_norm": 0.5628687143325806, + "learning_rate": 0.0005778590932278537, + "loss": 3.4574, + "step": 7533 + }, + { + "epoch": 0.37, + "grad_norm": 0.5103763341903687, + "learning_rate": 0.0005778532878823625, + "loss": 3.1286, + "step": 7534 + }, + { + "epoch": 0.37, + "grad_norm": 0.5265288949012756, + "learning_rate": 0.0005778474818050593, + "loss": 3.2203, + "step": 7535 + }, + { + "epoch": 0.37, + "grad_norm": 0.5499712824821472, + "learning_rate": 0.0005778416749959592, + "loss": 3.1883, + "step": 7536 + }, + { + "epoch": 0.37, + "grad_norm": 0.5209925174713135, + "learning_rate": 0.0005778358674550778, + "loss": 3.2346, + "step": 7537 + }, + { + "epoch": 0.37, + "grad_norm": 0.5306970477104187, + "learning_rate": 0.00057783005918243, + "loss": 3.3883, + "step": 7538 + }, + { + "epoch": 0.37, + "grad_norm": 0.506366491317749, + "learning_rate": 0.0005778242501780313, + "loss": 3.2689, + "step": 7539 + }, + { + "epoch": 0.37, + "grad_norm": 0.5261942744255066, + "learning_rate": 0.0005778184404418971, + "loss": 3.5414, + "step": 7540 + }, + { + "epoch": 0.37, + "grad_norm": 0.539635181427002, + "learning_rate": 0.0005778126299740425, + "loss": 3.2619, + "step": 7541 + }, + { + "epoch": 0.37, + "grad_norm": 0.5381528735160828, + "learning_rate": 0.0005778068187744829, + "loss": 3.2378, + "step": 7542 + }, + { + "epoch": 0.37, + "grad_norm": 0.5434399843215942, + "learning_rate": 0.0005778010068432336, + "loss": 3.1842, + "step": 7543 + }, + { + "epoch": 0.37, + "grad_norm": 0.5563649535179138, + "learning_rate": 0.0005777951941803099, + "loss": 3.3198, + "step": 7544 + }, + { + "epoch": 0.37, + "grad_norm": 0.5185420513153076, + "learning_rate": 0.0005777893807857273, + "loss": 3.3414, + "step": 7545 + }, + { + "epoch": 0.37, + "grad_norm": 0.5348244905471802, + "learning_rate": 0.0005777835666595007, + "loss": 3.2746, + "step": 7546 + }, + { + "epoch": 0.37, + "grad_norm": 0.5372639894485474, + "learning_rate": 0.0005777777518016458, + "loss": 3.2459, + "step": 7547 + }, + { + "epoch": 0.37, + "grad_norm": 0.5764162540435791, + "learning_rate": 0.0005777719362121777, + "loss": 3.3808, + "step": 7548 + }, + { + "epoch": 0.37, + "grad_norm": 0.533909022808075, + "learning_rate": 0.0005777661198911118, + "loss": 3.4382, + "step": 7549 + }, + { + "epoch": 0.37, + "grad_norm": 0.5675473809242249, + "learning_rate": 0.0005777603028384634, + "loss": 3.1793, + "step": 7550 + }, + { + "epoch": 0.37, + "grad_norm": 0.5419423580169678, + "learning_rate": 0.0005777544850542477, + "loss": 3.309, + "step": 7551 + }, + { + "epoch": 0.37, + "grad_norm": 0.5336681008338928, + "learning_rate": 0.0005777486665384802, + "loss": 3.4056, + "step": 7552 + }, + { + "epoch": 0.37, + "grad_norm": 0.5690992474555969, + "learning_rate": 0.0005777428472911763, + "loss": 3.4831, + "step": 7553 + }, + { + "epoch": 0.37, + "grad_norm": 0.5622754693031311, + "learning_rate": 0.000577737027312351, + "loss": 3.3529, + "step": 7554 + }, + { + "epoch": 0.37, + "grad_norm": 0.5477057695388794, + "learning_rate": 0.0005777312066020199, + "loss": 3.4004, + "step": 7555 + }, + { + "epoch": 0.37, + "grad_norm": 0.5189183950424194, + "learning_rate": 0.0005777253851601984, + "loss": 3.319, + "step": 7556 + }, + { + "epoch": 0.37, + "grad_norm": 0.5486992001533508, + "learning_rate": 0.0005777195629869015, + "loss": 3.3856, + "step": 7557 + }, + { + "epoch": 0.37, + "grad_norm": 0.5244187712669373, + "learning_rate": 0.0005777137400821448, + "loss": 3.2891, + "step": 7558 + }, + { + "epoch": 0.37, + "grad_norm": 0.5623378157615662, + "learning_rate": 0.0005777079164459436, + "loss": 3.3083, + "step": 7559 + }, + { + "epoch": 0.37, + "grad_norm": 0.8148874044418335, + "learning_rate": 0.000577702092078313, + "loss": 3.3492, + "step": 7560 + }, + { + "epoch": 0.37, + "grad_norm": 0.5145521759986877, + "learning_rate": 0.0005776962669792687, + "loss": 3.2571, + "step": 7561 + }, + { + "epoch": 0.37, + "grad_norm": 0.5764961242675781, + "learning_rate": 0.0005776904411488259, + "loss": 3.4779, + "step": 7562 + }, + { + "epoch": 0.37, + "grad_norm": 0.518203616142273, + "learning_rate": 0.0005776846145869997, + "loss": 3.4448, + "step": 7563 + }, + { + "epoch": 0.37, + "grad_norm": 0.6169568300247192, + "learning_rate": 0.0005776787872938059, + "loss": 3.3586, + "step": 7564 + }, + { + "epoch": 0.37, + "grad_norm": 0.47566235065460205, + "learning_rate": 0.0005776729592692596, + "loss": 3.5285, + "step": 7565 + }, + { + "epoch": 0.37, + "grad_norm": 0.5244961380958557, + "learning_rate": 0.0005776671305133761, + "loss": 3.3522, + "step": 7566 + }, + { + "epoch": 0.37, + "grad_norm": 0.5160118937492371, + "learning_rate": 0.0005776613010261708, + "loss": 3.2136, + "step": 7567 + }, + { + "epoch": 0.37, + "grad_norm": 0.6301264762878418, + "learning_rate": 0.0005776554708076591, + "loss": 3.1727, + "step": 7568 + }, + { + "epoch": 0.37, + "grad_norm": 0.48011279106140137, + "learning_rate": 0.0005776496398578562, + "loss": 3.3688, + "step": 7569 + }, + { + "epoch": 0.37, + "grad_norm": 0.5248762369155884, + "learning_rate": 0.0005776438081767778, + "loss": 3.159, + "step": 7570 + }, + { + "epoch": 0.37, + "grad_norm": 0.5746804475784302, + "learning_rate": 0.000577637975764439, + "loss": 3.5644, + "step": 7571 + }, + { + "epoch": 0.37, + "grad_norm": 0.5382371544837952, + "learning_rate": 0.0005776321426208551, + "loss": 3.3489, + "step": 7572 + }, + { + "epoch": 0.37, + "grad_norm": 0.5431787371635437, + "learning_rate": 0.0005776263087460416, + "loss": 3.0477, + "step": 7573 + }, + { + "epoch": 0.37, + "grad_norm": 0.5550752878189087, + "learning_rate": 0.0005776204741400138, + "loss": 3.3814, + "step": 7574 + }, + { + "epoch": 0.37, + "grad_norm": 0.5394303798675537, + "learning_rate": 0.0005776146388027872, + "loss": 3.32, + "step": 7575 + }, + { + "epoch": 0.37, + "grad_norm": 0.5841784477233887, + "learning_rate": 0.000577608802734377, + "loss": 3.4877, + "step": 7576 + }, + { + "epoch": 0.37, + "grad_norm": 0.5106384754180908, + "learning_rate": 0.0005776029659347986, + "loss": 3.2927, + "step": 7577 + }, + { + "epoch": 0.37, + "grad_norm": 0.5529481172561646, + "learning_rate": 0.0005775971284040675, + "loss": 3.414, + "step": 7578 + }, + { + "epoch": 0.37, + "grad_norm": 0.5255731344223022, + "learning_rate": 0.0005775912901421989, + "loss": 3.2171, + "step": 7579 + }, + { + "epoch": 0.37, + "grad_norm": 0.4985384941101074, + "learning_rate": 0.0005775854511492084, + "loss": 3.4437, + "step": 7580 + }, + { + "epoch": 0.37, + "grad_norm": 0.5763261318206787, + "learning_rate": 0.0005775796114251111, + "loss": 3.3172, + "step": 7581 + }, + { + "epoch": 0.37, + "grad_norm": 0.5372400879859924, + "learning_rate": 0.0005775737709699227, + "loss": 3.5312, + "step": 7582 + }, + { + "epoch": 0.37, + "grad_norm": 0.5711855888366699, + "learning_rate": 0.0005775679297836582, + "loss": 3.3675, + "step": 7583 + }, + { + "epoch": 0.37, + "grad_norm": 0.5218439698219299, + "learning_rate": 0.0005775620878663333, + "loss": 3.2173, + "step": 7584 + }, + { + "epoch": 0.37, + "grad_norm": 0.5360540151596069, + "learning_rate": 0.0005775562452179632, + "loss": 3.2145, + "step": 7585 + }, + { + "epoch": 0.37, + "grad_norm": 0.5195031762123108, + "learning_rate": 0.0005775504018385635, + "loss": 3.5188, + "step": 7586 + }, + { + "epoch": 0.37, + "grad_norm": 0.5842971801757812, + "learning_rate": 0.0005775445577281494, + "loss": 3.3013, + "step": 7587 + }, + { + "epoch": 0.37, + "grad_norm": 0.5489726662635803, + "learning_rate": 0.0005775387128867363, + "loss": 3.1806, + "step": 7588 + }, + { + "epoch": 0.37, + "grad_norm": 0.529917299747467, + "learning_rate": 0.0005775328673143398, + "loss": 3.1544, + "step": 7589 + }, + { + "epoch": 0.37, + "grad_norm": 0.5455032587051392, + "learning_rate": 0.0005775270210109751, + "loss": 3.2344, + "step": 7590 + }, + { + "epoch": 0.37, + "grad_norm": 0.5468717217445374, + "learning_rate": 0.0005775211739766574, + "loss": 3.3418, + "step": 7591 + }, + { + "epoch": 0.37, + "grad_norm": 0.5684468746185303, + "learning_rate": 0.0005775153262114025, + "loss": 3.2833, + "step": 7592 + }, + { + "epoch": 0.37, + "grad_norm": 0.502007246017456, + "learning_rate": 0.0005775094777152257, + "loss": 3.2564, + "step": 7593 + }, + { + "epoch": 0.37, + "grad_norm": 0.5651172995567322, + "learning_rate": 0.0005775036284881422, + "loss": 3.4975, + "step": 7594 + }, + { + "epoch": 0.37, + "grad_norm": 0.525607705116272, + "learning_rate": 0.0005774977785301677, + "loss": 3.3124, + "step": 7595 + }, + { + "epoch": 0.37, + "grad_norm": 0.5294498205184937, + "learning_rate": 0.0005774919278413174, + "loss": 3.3201, + "step": 7596 + }, + { + "epoch": 0.37, + "grad_norm": 0.5207985639572144, + "learning_rate": 0.0005774860764216068, + "loss": 3.4086, + "step": 7597 + }, + { + "epoch": 0.37, + "grad_norm": 0.5538904666900635, + "learning_rate": 0.0005774802242710513, + "loss": 3.2833, + "step": 7598 + }, + { + "epoch": 0.37, + "grad_norm": 0.5455402135848999, + "learning_rate": 0.0005774743713896661, + "loss": 3.3564, + "step": 7599 + }, + { + "epoch": 0.37, + "grad_norm": 0.48715144395828247, + "learning_rate": 0.000577468517777467, + "loss": 3.3534, + "step": 7600 + }, + { + "epoch": 0.37, + "grad_norm": 0.520661473274231, + "learning_rate": 0.0005774626634344692, + "loss": 3.5091, + "step": 7601 + }, + { + "epoch": 0.37, + "grad_norm": 0.5360146760940552, + "learning_rate": 0.000577456808360688, + "loss": 3.4542, + "step": 7602 + }, + { + "epoch": 0.37, + "grad_norm": 0.511243462562561, + "learning_rate": 0.0005774509525561392, + "loss": 3.2027, + "step": 7603 + }, + { + "epoch": 0.37, + "grad_norm": 0.5435131788253784, + "learning_rate": 0.0005774450960208378, + "loss": 3.2947, + "step": 7604 + }, + { + "epoch": 0.37, + "grad_norm": 0.5104213953018188, + "learning_rate": 0.0005774392387547995, + "loss": 3.3127, + "step": 7605 + }, + { + "epoch": 0.37, + "grad_norm": 0.5034766793251038, + "learning_rate": 0.0005774333807580395, + "loss": 3.3856, + "step": 7606 + }, + { + "epoch": 0.37, + "grad_norm": 0.5869839787483215, + "learning_rate": 0.0005774275220305735, + "loss": 3.308, + "step": 7607 + }, + { + "epoch": 0.37, + "grad_norm": 0.4974309206008911, + "learning_rate": 0.0005774216625724168, + "loss": 3.4021, + "step": 7608 + }, + { + "epoch": 0.37, + "grad_norm": 0.5040432810783386, + "learning_rate": 0.0005774158023835848, + "loss": 2.9991, + "step": 7609 + }, + { + "epoch": 0.37, + "grad_norm": 0.5178181529045105, + "learning_rate": 0.000577409941464093, + "loss": 3.2486, + "step": 7610 + }, + { + "epoch": 0.37, + "grad_norm": 0.5407399535179138, + "learning_rate": 0.0005774040798139567, + "loss": 3.3858, + "step": 7611 + }, + { + "epoch": 0.37, + "grad_norm": 0.5059396028518677, + "learning_rate": 0.0005773982174331915, + "loss": 3.431, + "step": 7612 + }, + { + "epoch": 0.37, + "grad_norm": 0.5154821276664734, + "learning_rate": 0.0005773923543218128, + "loss": 3.2791, + "step": 7613 + }, + { + "epoch": 0.37, + "grad_norm": 0.5808987021446228, + "learning_rate": 0.000577386490479836, + "loss": 3.3297, + "step": 7614 + }, + { + "epoch": 0.37, + "grad_norm": 0.5294091701507568, + "learning_rate": 0.0005773806259072766, + "loss": 3.3628, + "step": 7615 + }, + { + "epoch": 0.37, + "grad_norm": 0.49198436737060547, + "learning_rate": 0.00057737476060415, + "loss": 3.441, + "step": 7616 + }, + { + "epoch": 0.37, + "grad_norm": 0.5616235733032227, + "learning_rate": 0.0005773688945704717, + "loss": 3.4603, + "step": 7617 + }, + { + "epoch": 0.37, + "grad_norm": 0.5396645665168762, + "learning_rate": 0.0005773630278062571, + "loss": 3.3925, + "step": 7618 + }, + { + "epoch": 0.37, + "grad_norm": 0.5407871007919312, + "learning_rate": 0.0005773571603115216, + "loss": 3.4043, + "step": 7619 + }, + { + "epoch": 0.37, + "grad_norm": 0.5107448697090149, + "learning_rate": 0.0005773512920862808, + "loss": 3.4727, + "step": 7620 + }, + { + "epoch": 0.37, + "grad_norm": 0.5386175513267517, + "learning_rate": 0.00057734542313055, + "loss": 3.2539, + "step": 7621 + }, + { + "epoch": 0.37, + "grad_norm": 0.5494099855422974, + "learning_rate": 0.0005773395534443448, + "loss": 3.3771, + "step": 7622 + }, + { + "epoch": 0.37, + "grad_norm": 0.5173628926277161, + "learning_rate": 0.0005773336830276805, + "loss": 3.379, + "step": 7623 + }, + { + "epoch": 0.37, + "grad_norm": 0.5712496638298035, + "learning_rate": 0.0005773278118805727, + "loss": 3.1727, + "step": 7624 + }, + { + "epoch": 0.37, + "grad_norm": 0.524523138999939, + "learning_rate": 0.0005773219400030369, + "loss": 3.1427, + "step": 7625 + }, + { + "epoch": 0.37, + "grad_norm": 0.5556202530860901, + "learning_rate": 0.0005773160673950883, + "loss": 3.4616, + "step": 7626 + }, + { + "epoch": 0.37, + "grad_norm": 0.5599361658096313, + "learning_rate": 0.0005773101940567427, + "loss": 3.3032, + "step": 7627 + }, + { + "epoch": 0.37, + "grad_norm": 0.5155829191207886, + "learning_rate": 0.0005773043199880154, + "loss": 3.3193, + "step": 7628 + }, + { + "epoch": 0.37, + "grad_norm": 0.5319119095802307, + "learning_rate": 0.0005772984451889219, + "loss": 3.3039, + "step": 7629 + }, + { + "epoch": 0.37, + "grad_norm": 0.5668798089027405, + "learning_rate": 0.0005772925696594776, + "loss": 3.3281, + "step": 7630 + }, + { + "epoch": 0.37, + "grad_norm": 0.5063618421554565, + "learning_rate": 0.000577286693399698, + "loss": 3.2928, + "step": 7631 + }, + { + "epoch": 0.37, + "grad_norm": 0.5319681763648987, + "learning_rate": 0.0005772808164095988, + "loss": 3.2887, + "step": 7632 + }, + { + "epoch": 0.37, + "grad_norm": 0.5006822347640991, + "learning_rate": 0.0005772749386891951, + "loss": 3.5356, + "step": 7633 + }, + { + "epoch": 0.37, + "grad_norm": 0.5074585676193237, + "learning_rate": 0.0005772690602385026, + "loss": 3.4013, + "step": 7634 + }, + { + "epoch": 0.37, + "grad_norm": 0.5565813779830933, + "learning_rate": 0.0005772631810575369, + "loss": 3.4353, + "step": 7635 + }, + { + "epoch": 0.37, + "grad_norm": 0.549435019493103, + "learning_rate": 0.0005772573011463131, + "loss": 3.0845, + "step": 7636 + }, + { + "epoch": 0.37, + "grad_norm": 0.5290880799293518, + "learning_rate": 0.0005772514205048472, + "loss": 3.459, + "step": 7637 + }, + { + "epoch": 0.37, + "grad_norm": 0.5667818784713745, + "learning_rate": 0.0005772455391331542, + "loss": 3.2927, + "step": 7638 + }, + { + "epoch": 0.37, + "grad_norm": 0.5045775175094604, + "learning_rate": 0.0005772396570312499, + "loss": 3.2672, + "step": 7639 + }, + { + "epoch": 0.37, + "grad_norm": 0.5503361821174622, + "learning_rate": 0.0005772337741991497, + "loss": 3.2494, + "step": 7640 + }, + { + "epoch": 0.37, + "grad_norm": 0.5238841772079468, + "learning_rate": 0.0005772278906368692, + "loss": 3.2805, + "step": 7641 + }, + { + "epoch": 0.37, + "grad_norm": 0.5158736109733582, + "learning_rate": 0.0005772220063444236, + "loss": 3.3447, + "step": 7642 + }, + { + "epoch": 0.37, + "grad_norm": 0.5552991032600403, + "learning_rate": 0.0005772161213218286, + "loss": 3.1852, + "step": 7643 + }, + { + "epoch": 0.37, + "grad_norm": 0.5120370388031006, + "learning_rate": 0.0005772102355690998, + "loss": 3.2797, + "step": 7644 + }, + { + "epoch": 0.37, + "grad_norm": 0.47802504897117615, + "learning_rate": 0.0005772043490862525, + "loss": 3.2269, + "step": 7645 + }, + { + "epoch": 0.37, + "grad_norm": 0.5293357372283936, + "learning_rate": 0.0005771984618733024, + "loss": 3.5585, + "step": 7646 + }, + { + "epoch": 0.37, + "grad_norm": 0.5576381683349609, + "learning_rate": 0.0005771925739302649, + "loss": 3.5653, + "step": 7647 + }, + { + "epoch": 0.37, + "grad_norm": 0.6383746266365051, + "learning_rate": 0.0005771866852571554, + "loss": 3.4087, + "step": 7648 + }, + { + "epoch": 0.37, + "grad_norm": 0.5233350992202759, + "learning_rate": 0.0005771807958539895, + "loss": 3.5551, + "step": 7649 + }, + { + "epoch": 0.37, + "grad_norm": 0.6100677847862244, + "learning_rate": 0.0005771749057207828, + "loss": 3.2505, + "step": 7650 + }, + { + "epoch": 0.37, + "grad_norm": 0.49944281578063965, + "learning_rate": 0.0005771690148575508, + "loss": 3.3978, + "step": 7651 + }, + { + "epoch": 0.38, + "grad_norm": 0.5267223119735718, + "learning_rate": 0.0005771631232643088, + "loss": 3.1858, + "step": 7652 + }, + { + "epoch": 0.38, + "grad_norm": 0.5045062899589539, + "learning_rate": 0.0005771572309410726, + "loss": 3.4002, + "step": 7653 + }, + { + "epoch": 0.38, + "grad_norm": 0.5398784279823303, + "learning_rate": 0.0005771513378878576, + "loss": 3.6054, + "step": 7654 + }, + { + "epoch": 0.38, + "grad_norm": 0.5356148481369019, + "learning_rate": 0.0005771454441046793, + "loss": 3.5048, + "step": 7655 + }, + { + "epoch": 0.38, + "grad_norm": 0.5511022806167603, + "learning_rate": 0.0005771395495915531, + "loss": 3.1728, + "step": 7656 + }, + { + "epoch": 0.38, + "grad_norm": 0.4920971691608429, + "learning_rate": 0.0005771336543484948, + "loss": 3.2441, + "step": 7657 + }, + { + "epoch": 0.38, + "grad_norm": 0.5204853415489197, + "learning_rate": 0.0005771277583755198, + "loss": 3.4215, + "step": 7658 + }, + { + "epoch": 0.38, + "grad_norm": 0.5585867762565613, + "learning_rate": 0.0005771218616726436, + "loss": 3.2756, + "step": 7659 + }, + { + "epoch": 0.38, + "grad_norm": 0.5565081238746643, + "learning_rate": 0.0005771159642398817, + "loss": 3.4225, + "step": 7660 + }, + { + "epoch": 0.38, + "grad_norm": 0.5324299335479736, + "learning_rate": 0.0005771100660772497, + "loss": 3.127, + "step": 7661 + }, + { + "epoch": 0.38, + "grad_norm": 0.516318678855896, + "learning_rate": 0.0005771041671847632, + "loss": 3.4088, + "step": 7662 + }, + { + "epoch": 0.38, + "grad_norm": 0.5677664279937744, + "learning_rate": 0.0005770982675624374, + "loss": 3.3326, + "step": 7663 + }, + { + "epoch": 0.38, + "grad_norm": 0.5322689414024353, + "learning_rate": 0.0005770923672102883, + "loss": 3.3283, + "step": 7664 + }, + { + "epoch": 0.38, + "grad_norm": 0.6116513013839722, + "learning_rate": 0.0005770864661283312, + "loss": 3.3218, + "step": 7665 + }, + { + "epoch": 0.38, + "grad_norm": 0.5484536290168762, + "learning_rate": 0.0005770805643165819, + "loss": 3.2955, + "step": 7666 + }, + { + "epoch": 0.38, + "grad_norm": 0.5231142044067383, + "learning_rate": 0.0005770746617750553, + "loss": 3.4475, + "step": 7667 + }, + { + "epoch": 0.38, + "grad_norm": 0.5095486640930176, + "learning_rate": 0.0005770687585037676, + "loss": 3.4287, + "step": 7668 + }, + { + "epoch": 0.38, + "grad_norm": 0.544468343257904, + "learning_rate": 0.0005770628545027341, + "loss": 3.3266, + "step": 7669 + }, + { + "epoch": 0.38, + "grad_norm": 0.5009661912918091, + "learning_rate": 0.0005770569497719703, + "loss": 3.4781, + "step": 7670 + }, + { + "epoch": 0.38, + "grad_norm": 0.510226845741272, + "learning_rate": 0.0005770510443114918, + "loss": 3.3332, + "step": 7671 + }, + { + "epoch": 0.38, + "grad_norm": 0.5290097594261169, + "learning_rate": 0.0005770451381213144, + "loss": 3.4185, + "step": 7672 + }, + { + "epoch": 0.38, + "grad_norm": 0.5284835696220398, + "learning_rate": 0.0005770392312014532, + "loss": 3.2657, + "step": 7673 + }, + { + "epoch": 0.38, + "grad_norm": 0.4978867471218109, + "learning_rate": 0.000577033323551924, + "loss": 3.3484, + "step": 7674 + }, + { + "epoch": 0.38, + "grad_norm": 0.5280537009239197, + "learning_rate": 0.0005770274151727424, + "loss": 3.3948, + "step": 7675 + }, + { + "epoch": 0.38, + "grad_norm": 0.5177137851715088, + "learning_rate": 0.0005770215060639239, + "loss": 3.2166, + "step": 7676 + }, + { + "epoch": 0.38, + "grad_norm": 0.5431517362594604, + "learning_rate": 0.000577015596225484, + "loss": 3.1223, + "step": 7677 + }, + { + "epoch": 0.38, + "grad_norm": 0.5485773682594299, + "learning_rate": 0.0005770096856574383, + "loss": 3.2481, + "step": 7678 + }, + { + "epoch": 0.38, + "grad_norm": 0.5501081347465515, + "learning_rate": 0.0005770037743598025, + "loss": 3.5521, + "step": 7679 + }, + { + "epoch": 0.38, + "grad_norm": 0.5404449701309204, + "learning_rate": 0.000576997862332592, + "loss": 3.4757, + "step": 7680 + }, + { + "epoch": 0.38, + "grad_norm": 0.5793304443359375, + "learning_rate": 0.0005769919495758225, + "loss": 3.1084, + "step": 7681 + }, + { + "epoch": 0.38, + "grad_norm": 0.559897243976593, + "learning_rate": 0.0005769860360895096, + "loss": 3.0807, + "step": 7682 + }, + { + "epoch": 0.38, + "grad_norm": 0.5876379609107971, + "learning_rate": 0.0005769801218736686, + "loss": 3.2023, + "step": 7683 + }, + { + "epoch": 0.38, + "grad_norm": 0.548350989818573, + "learning_rate": 0.0005769742069283154, + "loss": 3.2818, + "step": 7684 + }, + { + "epoch": 0.38, + "grad_norm": 0.5384724140167236, + "learning_rate": 0.0005769682912534653, + "loss": 3.2043, + "step": 7685 + }, + { + "epoch": 0.38, + "grad_norm": 0.5407583713531494, + "learning_rate": 0.0005769623748491342, + "loss": 3.3275, + "step": 7686 + }, + { + "epoch": 0.38, + "grad_norm": 0.5203924179077148, + "learning_rate": 0.0005769564577153374, + "loss": 3.3591, + "step": 7687 + }, + { + "epoch": 0.38, + "grad_norm": 0.5495003461837769, + "learning_rate": 0.0005769505398520907, + "loss": 3.2688, + "step": 7688 + }, + { + "epoch": 0.38, + "grad_norm": 0.5348663926124573, + "learning_rate": 0.0005769446212594094, + "loss": 3.3437, + "step": 7689 + }, + { + "epoch": 0.38, + "grad_norm": 0.5982791185379028, + "learning_rate": 0.0005769387019373094, + "loss": 3.398, + "step": 7690 + }, + { + "epoch": 0.38, + "grad_norm": 0.5527102947235107, + "learning_rate": 0.0005769327818858062, + "loss": 3.4327, + "step": 7691 + }, + { + "epoch": 0.38, + "grad_norm": 0.5997951626777649, + "learning_rate": 0.0005769268611049152, + "loss": 3.4901, + "step": 7692 + }, + { + "epoch": 0.38, + "grad_norm": 0.5478419065475464, + "learning_rate": 0.0005769209395946522, + "loss": 3.3142, + "step": 7693 + }, + { + "epoch": 0.38, + "grad_norm": 0.49550652503967285, + "learning_rate": 0.0005769150173550328, + "loss": 3.5203, + "step": 7694 + }, + { + "epoch": 0.38, + "grad_norm": 0.5617064833641052, + "learning_rate": 0.0005769090943860724, + "loss": 3.5058, + "step": 7695 + }, + { + "epoch": 0.38, + "grad_norm": 0.4888365566730499, + "learning_rate": 0.0005769031706877869, + "loss": 3.4375, + "step": 7696 + }, + { + "epoch": 0.38, + "grad_norm": 0.5538713335990906, + "learning_rate": 0.0005768972462601916, + "loss": 3.3487, + "step": 7697 + }, + { + "epoch": 0.38, + "grad_norm": 0.5713244080543518, + "learning_rate": 0.0005768913211033024, + "loss": 3.2288, + "step": 7698 + }, + { + "epoch": 0.38, + "grad_norm": 0.5322604775428772, + "learning_rate": 0.0005768853952171346, + "loss": 3.5587, + "step": 7699 + }, + { + "epoch": 0.38, + "grad_norm": 0.5296763777732849, + "learning_rate": 0.000576879468601704, + "loss": 3.3886, + "step": 7700 + }, + { + "epoch": 0.38, + "grad_norm": 0.581745982170105, + "learning_rate": 0.0005768735412570262, + "loss": 3.3111, + "step": 7701 + }, + { + "epoch": 0.38, + "grad_norm": 0.5159407258033752, + "learning_rate": 0.0005768676131831168, + "loss": 3.2317, + "step": 7702 + }, + { + "epoch": 0.38, + "grad_norm": 0.527847945690155, + "learning_rate": 0.0005768616843799913, + "loss": 3.2123, + "step": 7703 + }, + { + "epoch": 0.38, + "grad_norm": 0.5626876354217529, + "learning_rate": 0.0005768557548476654, + "loss": 3.1141, + "step": 7704 + }, + { + "epoch": 0.38, + "grad_norm": 0.5151354670524597, + "learning_rate": 0.0005768498245861548, + "loss": 3.2901, + "step": 7705 + }, + { + "epoch": 0.38, + "grad_norm": 0.538934051990509, + "learning_rate": 0.000576843893595475, + "loss": 3.3142, + "step": 7706 + }, + { + "epoch": 0.38, + "grad_norm": 0.5391485095024109, + "learning_rate": 0.0005768379618756417, + "loss": 3.1658, + "step": 7707 + }, + { + "epoch": 0.38, + "grad_norm": 0.5370870232582092, + "learning_rate": 0.0005768320294266705, + "loss": 3.2983, + "step": 7708 + }, + { + "epoch": 0.38, + "grad_norm": 0.5493991374969482, + "learning_rate": 0.0005768260962485769, + "loss": 3.3358, + "step": 7709 + }, + { + "epoch": 0.38, + "grad_norm": 0.5690867900848389, + "learning_rate": 0.0005768201623413768, + "loss": 3.4018, + "step": 7710 + }, + { + "epoch": 0.38, + "grad_norm": 0.5145018100738525, + "learning_rate": 0.0005768142277050856, + "loss": 3.3555, + "step": 7711 + }, + { + "epoch": 0.38, + "grad_norm": 0.526140034198761, + "learning_rate": 0.000576808292339719, + "loss": 3.4113, + "step": 7712 + }, + { + "epoch": 0.38, + "grad_norm": 0.5112320780754089, + "learning_rate": 0.0005768023562452926, + "loss": 3.3697, + "step": 7713 + }, + { + "epoch": 0.38, + "grad_norm": 0.5290682315826416, + "learning_rate": 0.0005767964194218221, + "loss": 3.5244, + "step": 7714 + }, + { + "epoch": 0.38, + "grad_norm": 0.5239760279655457, + "learning_rate": 0.0005767904818693231, + "loss": 3.4218, + "step": 7715 + }, + { + "epoch": 0.38, + "grad_norm": 0.5848803520202637, + "learning_rate": 0.0005767845435878113, + "loss": 3.298, + "step": 7716 + }, + { + "epoch": 0.38, + "grad_norm": 0.5328184366226196, + "learning_rate": 0.0005767786045773021, + "loss": 3.2251, + "step": 7717 + }, + { + "epoch": 0.38, + "grad_norm": 0.5391929745674133, + "learning_rate": 0.0005767726648378115, + "loss": 3.2584, + "step": 7718 + }, + { + "epoch": 0.38, + "grad_norm": 0.5395445823669434, + "learning_rate": 0.0005767667243693548, + "loss": 3.3085, + "step": 7719 + }, + { + "epoch": 0.38, + "grad_norm": 0.5276076197624207, + "learning_rate": 0.0005767607831719479, + "loss": 3.3174, + "step": 7720 + }, + { + "epoch": 0.38, + "grad_norm": 0.4999656677246094, + "learning_rate": 0.0005767548412456064, + "loss": 3.3946, + "step": 7721 + }, + { + "epoch": 0.38, + "grad_norm": 0.5640438795089722, + "learning_rate": 0.0005767488985903459, + "loss": 3.4772, + "step": 7722 + }, + { + "epoch": 0.38, + "grad_norm": 0.5889086723327637, + "learning_rate": 0.000576742955206182, + "loss": 3.4004, + "step": 7723 + }, + { + "epoch": 0.38, + "grad_norm": 0.5023418068885803, + "learning_rate": 0.0005767370110931306, + "loss": 3.4566, + "step": 7724 + }, + { + "epoch": 0.38, + "grad_norm": 0.5705462694168091, + "learning_rate": 0.000576731066251207, + "loss": 3.2547, + "step": 7725 + }, + { + "epoch": 0.38, + "grad_norm": 0.6652705073356628, + "learning_rate": 0.000576725120680427, + "loss": 3.1061, + "step": 7726 + }, + { + "epoch": 0.38, + "grad_norm": 0.5656840205192566, + "learning_rate": 0.0005767191743808064, + "loss": 3.1697, + "step": 7727 + }, + { + "epoch": 0.38, + "grad_norm": 0.5446447134017944, + "learning_rate": 0.0005767132273523606, + "loss": 3.2713, + "step": 7728 + }, + { + "epoch": 0.38, + "grad_norm": 0.5088135004043579, + "learning_rate": 0.0005767072795951056, + "loss": 3.439, + "step": 7729 + }, + { + "epoch": 0.38, + "grad_norm": 0.573800802230835, + "learning_rate": 0.0005767013311090567, + "loss": 3.5233, + "step": 7730 + }, + { + "epoch": 0.38, + "grad_norm": 0.5233531594276428, + "learning_rate": 0.0005766953818942299, + "loss": 3.368, + "step": 7731 + }, + { + "epoch": 0.38, + "grad_norm": 0.5602032542228699, + "learning_rate": 0.0005766894319506406, + "loss": 3.5343, + "step": 7732 + }, + { + "epoch": 0.38, + "grad_norm": 0.5444862246513367, + "learning_rate": 0.0005766834812783047, + "loss": 3.3899, + "step": 7733 + }, + { + "epoch": 0.38, + "grad_norm": 0.5610346794128418, + "learning_rate": 0.0005766775298772377, + "loss": 3.263, + "step": 7734 + }, + { + "epoch": 0.38, + "grad_norm": 0.5168694257736206, + "learning_rate": 0.0005766715777474553, + "loss": 3.3323, + "step": 7735 + }, + { + "epoch": 0.38, + "grad_norm": 0.5316835641860962, + "learning_rate": 0.0005766656248889732, + "loss": 3.4178, + "step": 7736 + }, + { + "epoch": 0.38, + "grad_norm": 0.5672079920768738, + "learning_rate": 0.0005766596713018072, + "loss": 3.3652, + "step": 7737 + }, + { + "epoch": 0.38, + "grad_norm": 0.5037599205970764, + "learning_rate": 0.0005766537169859728, + "loss": 3.3349, + "step": 7738 + }, + { + "epoch": 0.38, + "grad_norm": 0.538493812084198, + "learning_rate": 0.0005766477619414858, + "loss": 3.4587, + "step": 7739 + }, + { + "epoch": 0.38, + "grad_norm": 0.5200104713439941, + "learning_rate": 0.0005766418061683618, + "loss": 3.3614, + "step": 7740 + }, + { + "epoch": 0.38, + "grad_norm": 0.5429470539093018, + "learning_rate": 0.0005766358496666165, + "loss": 3.2853, + "step": 7741 + }, + { + "epoch": 0.38, + "grad_norm": 0.5296519994735718, + "learning_rate": 0.0005766298924362656, + "loss": 3.3077, + "step": 7742 + }, + { + "epoch": 0.38, + "grad_norm": 0.5855775475502014, + "learning_rate": 0.0005766239344773249, + "loss": 3.3771, + "step": 7743 + }, + { + "epoch": 0.38, + "grad_norm": 0.6540366411209106, + "learning_rate": 0.0005766179757898098, + "loss": 3.432, + "step": 7744 + }, + { + "epoch": 0.38, + "grad_norm": 0.5268910527229309, + "learning_rate": 0.0005766120163737364, + "loss": 3.3832, + "step": 7745 + }, + { + "epoch": 0.38, + "grad_norm": 0.5402625799179077, + "learning_rate": 0.00057660605622912, + "loss": 3.2677, + "step": 7746 + }, + { + "epoch": 0.38, + "grad_norm": 0.545888364315033, + "learning_rate": 0.0005766000953559767, + "loss": 3.1326, + "step": 7747 + }, + { + "epoch": 0.38, + "grad_norm": 0.5113720893859863, + "learning_rate": 0.0005765941337543218, + "loss": 3.3382, + "step": 7748 + }, + { + "epoch": 0.38, + "grad_norm": 0.5178648829460144, + "learning_rate": 0.0005765881714241714, + "loss": 3.3942, + "step": 7749 + }, + { + "epoch": 0.38, + "grad_norm": 0.5671051740646362, + "learning_rate": 0.0005765822083655407, + "loss": 3.3048, + "step": 7750 + }, + { + "epoch": 0.38, + "grad_norm": 0.501137375831604, + "learning_rate": 0.0005765762445784459, + "loss": 3.2334, + "step": 7751 + }, + { + "epoch": 0.38, + "grad_norm": 0.5226545333862305, + "learning_rate": 0.0005765702800629023, + "loss": 3.3512, + "step": 7752 + }, + { + "epoch": 0.38, + "grad_norm": 0.5014787912368774, + "learning_rate": 0.000576564314818926, + "loss": 3.318, + "step": 7753 + }, + { + "epoch": 0.38, + "grad_norm": 0.5254672169685364, + "learning_rate": 0.0005765583488465324, + "loss": 3.3825, + "step": 7754 + }, + { + "epoch": 0.38, + "grad_norm": 0.5159661173820496, + "learning_rate": 0.0005765523821457374, + "loss": 3.3095, + "step": 7755 + }, + { + "epoch": 0.38, + "grad_norm": 0.5051149725914001, + "learning_rate": 0.0005765464147165566, + "loss": 3.2341, + "step": 7756 + }, + { + "epoch": 0.38, + "grad_norm": 0.5197758078575134, + "learning_rate": 0.0005765404465590059, + "loss": 3.2489, + "step": 7757 + }, + { + "epoch": 0.38, + "grad_norm": 0.5094234943389893, + "learning_rate": 0.0005765344776731008, + "loss": 3.2445, + "step": 7758 + }, + { + "epoch": 0.38, + "grad_norm": 0.5342963337898254, + "learning_rate": 0.000576528508058857, + "loss": 3.5146, + "step": 7759 + }, + { + "epoch": 0.38, + "grad_norm": 0.5151779055595398, + "learning_rate": 0.0005765225377162904, + "loss": 3.1479, + "step": 7760 + }, + { + "epoch": 0.38, + "grad_norm": 0.5261240601539612, + "learning_rate": 0.0005765165666454167, + "loss": 3.0576, + "step": 7761 + }, + { + "epoch": 0.38, + "grad_norm": 0.5607361793518066, + "learning_rate": 0.0005765105948462516, + "loss": 3.211, + "step": 7762 + }, + { + "epoch": 0.38, + "grad_norm": 0.534424364566803, + "learning_rate": 0.0005765046223188108, + "loss": 3.3502, + "step": 7763 + }, + { + "epoch": 0.38, + "grad_norm": 0.5433931946754456, + "learning_rate": 0.00057649864906311, + "loss": 3.3541, + "step": 7764 + }, + { + "epoch": 0.38, + "grad_norm": 0.5713850855827332, + "learning_rate": 0.0005764926750791649, + "loss": 3.1849, + "step": 7765 + }, + { + "epoch": 0.38, + "grad_norm": 0.5363174676895142, + "learning_rate": 0.0005764867003669914, + "loss": 3.297, + "step": 7766 + }, + { + "epoch": 0.38, + "grad_norm": 0.5349758267402649, + "learning_rate": 0.0005764807249266052, + "loss": 3.2353, + "step": 7767 + }, + { + "epoch": 0.38, + "grad_norm": 0.5053718686103821, + "learning_rate": 0.0005764747487580218, + "loss": 3.0331, + "step": 7768 + }, + { + "epoch": 0.38, + "grad_norm": 0.5225927233695984, + "learning_rate": 0.0005764687718612572, + "loss": 3.2927, + "step": 7769 + }, + { + "epoch": 0.38, + "grad_norm": 0.5191579461097717, + "learning_rate": 0.0005764627942363271, + "loss": 3.2522, + "step": 7770 + }, + { + "epoch": 0.38, + "grad_norm": 0.530640721321106, + "learning_rate": 0.0005764568158832472, + "loss": 3.1888, + "step": 7771 + }, + { + "epoch": 0.38, + "grad_norm": 0.5546700358390808, + "learning_rate": 0.0005764508368020333, + "loss": 2.9471, + "step": 7772 + }, + { + "epoch": 0.38, + "grad_norm": 0.541641354560852, + "learning_rate": 0.0005764448569927009, + "loss": 3.1619, + "step": 7773 + }, + { + "epoch": 0.38, + "grad_norm": 0.5144984722137451, + "learning_rate": 0.0005764388764552662, + "loss": 3.1562, + "step": 7774 + }, + { + "epoch": 0.38, + "grad_norm": 0.5483419299125671, + "learning_rate": 0.0005764328951897446, + "loss": 3.5068, + "step": 7775 + }, + { + "epoch": 0.38, + "grad_norm": 0.5148709416389465, + "learning_rate": 0.0005764269131961519, + "loss": 3.3056, + "step": 7776 + }, + { + "epoch": 0.38, + "grad_norm": 0.535964846611023, + "learning_rate": 0.000576420930474504, + "loss": 3.4626, + "step": 7777 + }, + { + "epoch": 0.38, + "grad_norm": 0.5585061311721802, + "learning_rate": 0.0005764149470248166, + "loss": 3.3747, + "step": 7778 + }, + { + "epoch": 0.38, + "grad_norm": 0.5796385407447815, + "learning_rate": 0.0005764089628471054, + "loss": 3.3358, + "step": 7779 + }, + { + "epoch": 0.38, + "grad_norm": 0.5621188282966614, + "learning_rate": 0.000576402977941386, + "loss": 3.5115, + "step": 7780 + }, + { + "epoch": 0.38, + "grad_norm": 0.5454630255699158, + "learning_rate": 0.0005763969923076746, + "loss": 3.4531, + "step": 7781 + }, + { + "epoch": 0.38, + "grad_norm": 0.5305492877960205, + "learning_rate": 0.0005763910059459865, + "loss": 3.3743, + "step": 7782 + }, + { + "epoch": 0.38, + "grad_norm": 0.5111590027809143, + "learning_rate": 0.0005763850188563378, + "loss": 3.1609, + "step": 7783 + }, + { + "epoch": 0.38, + "grad_norm": 0.520412266254425, + "learning_rate": 0.0005763790310387441, + "loss": 3.5552, + "step": 7784 + }, + { + "epoch": 0.38, + "grad_norm": 0.5311888456344604, + "learning_rate": 0.0005763730424932213, + "loss": 3.3538, + "step": 7785 + }, + { + "epoch": 0.38, + "grad_norm": 0.5347929000854492, + "learning_rate": 0.0005763670532197851, + "loss": 3.4564, + "step": 7786 + }, + { + "epoch": 0.38, + "grad_norm": 0.5463716983795166, + "learning_rate": 0.0005763610632184512, + "loss": 3.3186, + "step": 7787 + }, + { + "epoch": 0.38, + "grad_norm": 0.5599271655082703, + "learning_rate": 0.0005763550724892355, + "loss": 3.3339, + "step": 7788 + }, + { + "epoch": 0.38, + "grad_norm": 0.545688271522522, + "learning_rate": 0.0005763490810321537, + "loss": 3.3977, + "step": 7789 + }, + { + "epoch": 0.38, + "grad_norm": 0.5224574208259583, + "learning_rate": 0.0005763430888472217, + "loss": 3.2594, + "step": 7790 + }, + { + "epoch": 0.38, + "grad_norm": 0.49941286444664, + "learning_rate": 0.0005763370959344549, + "loss": 3.185, + "step": 7791 + }, + { + "epoch": 0.38, + "grad_norm": 0.5086252689361572, + "learning_rate": 0.0005763311022938696, + "loss": 3.3799, + "step": 7792 + }, + { + "epoch": 0.38, + "grad_norm": 0.5339295268058777, + "learning_rate": 0.0005763251079254814, + "loss": 3.5027, + "step": 7793 + }, + { + "epoch": 0.38, + "grad_norm": 0.5528396368026733, + "learning_rate": 0.000576319112829306, + "loss": 3.3822, + "step": 7794 + }, + { + "epoch": 0.38, + "grad_norm": 0.5613994002342224, + "learning_rate": 0.0005763131170053591, + "loss": 3.439, + "step": 7795 + }, + { + "epoch": 0.38, + "grad_norm": 0.5107088685035706, + "learning_rate": 0.0005763071204536568, + "loss": 3.243, + "step": 7796 + }, + { + "epoch": 0.38, + "grad_norm": 0.5415818691253662, + "learning_rate": 0.0005763011231742146, + "loss": 3.1863, + "step": 7797 + }, + { + "epoch": 0.38, + "grad_norm": 0.550011396408081, + "learning_rate": 0.0005762951251670485, + "loss": 3.3939, + "step": 7798 + }, + { + "epoch": 0.38, + "grad_norm": 0.5543647408485413, + "learning_rate": 0.0005762891264321742, + "loss": 3.4366, + "step": 7799 + }, + { + "epoch": 0.38, + "grad_norm": 0.5226086974143982, + "learning_rate": 0.0005762831269696075, + "loss": 3.3135, + "step": 7800 + }, + { + "epoch": 0.38, + "grad_norm": 0.5125158429145813, + "learning_rate": 0.0005762771267793642, + "loss": 3.5191, + "step": 7801 + }, + { + "epoch": 0.38, + "grad_norm": 0.485068678855896, + "learning_rate": 0.0005762711258614602, + "loss": 3.5, + "step": 7802 + }, + { + "epoch": 0.38, + "grad_norm": 0.49382463097572327, + "learning_rate": 0.0005762651242159111, + "loss": 3.3202, + "step": 7803 + }, + { + "epoch": 0.38, + "grad_norm": 0.5490413904190063, + "learning_rate": 0.0005762591218427328, + "loss": 3.1602, + "step": 7804 + }, + { + "epoch": 0.38, + "grad_norm": 0.496092826128006, + "learning_rate": 0.0005762531187419413, + "loss": 3.511, + "step": 7805 + }, + { + "epoch": 0.38, + "grad_norm": 0.5007348656654358, + "learning_rate": 0.0005762471149135522, + "loss": 3.2596, + "step": 7806 + }, + { + "epoch": 0.38, + "grad_norm": 0.5107442736625671, + "learning_rate": 0.0005762411103575813, + "loss": 3.3431, + "step": 7807 + }, + { + "epoch": 0.38, + "grad_norm": 0.5263485908508301, + "learning_rate": 0.0005762351050740445, + "loss": 3.3426, + "step": 7808 + }, + { + "epoch": 0.38, + "grad_norm": 0.5867173671722412, + "learning_rate": 0.0005762290990629576, + "loss": 3.3641, + "step": 7809 + }, + { + "epoch": 0.38, + "grad_norm": 0.5284026861190796, + "learning_rate": 0.0005762230923243365, + "loss": 3.2272, + "step": 7810 + }, + { + "epoch": 0.38, + "grad_norm": 0.5352510809898376, + "learning_rate": 0.0005762170848581968, + "loss": 3.447, + "step": 7811 + }, + { + "epoch": 0.38, + "grad_norm": 0.4992859959602356, + "learning_rate": 0.0005762110766645546, + "loss": 3.4186, + "step": 7812 + }, + { + "epoch": 0.38, + "grad_norm": 0.514480471611023, + "learning_rate": 0.0005762050677434254, + "loss": 3.2831, + "step": 7813 + }, + { + "epoch": 0.38, + "grad_norm": 0.5168455243110657, + "learning_rate": 0.0005761990580948254, + "loss": 3.3549, + "step": 7814 + }, + { + "epoch": 0.38, + "grad_norm": 0.5284212827682495, + "learning_rate": 0.0005761930477187701, + "loss": 3.3554, + "step": 7815 + }, + { + "epoch": 0.38, + "grad_norm": 0.5353116393089294, + "learning_rate": 0.0005761870366152755, + "loss": 3.2887, + "step": 7816 + }, + { + "epoch": 0.38, + "grad_norm": 0.5134118795394897, + "learning_rate": 0.0005761810247843574, + "loss": 3.4749, + "step": 7817 + }, + { + "epoch": 0.38, + "grad_norm": 0.49037235975265503, + "learning_rate": 0.0005761750122260317, + "loss": 3.1841, + "step": 7818 + }, + { + "epoch": 0.38, + "grad_norm": 0.5198505520820618, + "learning_rate": 0.0005761689989403141, + "loss": 3.4457, + "step": 7819 + }, + { + "epoch": 0.38, + "grad_norm": 0.5264098048210144, + "learning_rate": 0.0005761629849272205, + "loss": 3.3493, + "step": 7820 + }, + { + "epoch": 0.38, + "grad_norm": 0.5358461141586304, + "learning_rate": 0.0005761569701867668, + "loss": 3.309, + "step": 7821 + }, + { + "epoch": 0.38, + "grad_norm": 0.5349216461181641, + "learning_rate": 0.0005761509547189688, + "loss": 3.2732, + "step": 7822 + }, + { + "epoch": 0.38, + "grad_norm": 0.498106986284256, + "learning_rate": 0.0005761449385238422, + "loss": 3.1092, + "step": 7823 + }, + { + "epoch": 0.38, + "grad_norm": 0.5381754040718079, + "learning_rate": 0.0005761389216014031, + "loss": 3.3537, + "step": 7824 + }, + { + "epoch": 0.38, + "grad_norm": 0.535024106502533, + "learning_rate": 0.0005761329039516671, + "loss": 3.269, + "step": 7825 + }, + { + "epoch": 0.38, + "grad_norm": 0.5659539103507996, + "learning_rate": 0.0005761268855746503, + "loss": 3.3276, + "step": 7826 + }, + { + "epoch": 0.38, + "grad_norm": 0.5511694550514221, + "learning_rate": 0.0005761208664703684, + "loss": 3.3624, + "step": 7827 + }, + { + "epoch": 0.38, + "grad_norm": 0.5096381902694702, + "learning_rate": 0.0005761148466388373, + "loss": 3.2467, + "step": 7828 + }, + { + "epoch": 0.38, + "grad_norm": 0.5487003326416016, + "learning_rate": 0.0005761088260800728, + "loss": 3.361, + "step": 7829 + }, + { + "epoch": 0.38, + "grad_norm": 0.5437702536582947, + "learning_rate": 0.0005761028047940907, + "loss": 3.4645, + "step": 7830 + }, + { + "epoch": 0.38, + "grad_norm": 0.5526220798492432, + "learning_rate": 0.0005760967827809072, + "loss": 3.1801, + "step": 7831 + }, + { + "epoch": 0.38, + "grad_norm": 0.5293183326721191, + "learning_rate": 0.0005760907600405377, + "loss": 3.2272, + "step": 7832 + }, + { + "epoch": 0.38, + "grad_norm": 0.5198914408683777, + "learning_rate": 0.0005760847365729984, + "loss": 3.3003, + "step": 7833 + }, + { + "epoch": 0.38, + "grad_norm": 0.5536202788352966, + "learning_rate": 0.0005760787123783049, + "loss": 2.9891, + "step": 7834 + }, + { + "epoch": 0.38, + "grad_norm": 0.5597622990608215, + "learning_rate": 0.0005760726874564732, + "loss": 3.0627, + "step": 7835 + }, + { + "epoch": 0.38, + "grad_norm": 0.5665072202682495, + "learning_rate": 0.0005760666618075192, + "loss": 3.2544, + "step": 7836 + }, + { + "epoch": 0.38, + "grad_norm": 0.6050432324409485, + "learning_rate": 0.0005760606354314588, + "loss": 3.3163, + "step": 7837 + }, + { + "epoch": 0.38, + "grad_norm": 0.5545769929885864, + "learning_rate": 0.0005760546083283079, + "loss": 3.1321, + "step": 7838 + }, + { + "epoch": 0.38, + "grad_norm": 0.5636788010597229, + "learning_rate": 0.0005760485804980821, + "loss": 3.1955, + "step": 7839 + }, + { + "epoch": 0.38, + "grad_norm": 0.5258702635765076, + "learning_rate": 0.0005760425519407976, + "loss": 3.274, + "step": 7840 + }, + { + "epoch": 0.38, + "grad_norm": 0.5311688184738159, + "learning_rate": 0.0005760365226564701, + "loss": 3.2431, + "step": 7841 + }, + { + "epoch": 0.38, + "grad_norm": 0.5252965092658997, + "learning_rate": 0.0005760304926451155, + "loss": 3.3003, + "step": 7842 + }, + { + "epoch": 0.38, + "grad_norm": 0.5277718901634216, + "learning_rate": 0.0005760244619067498, + "loss": 3.3049, + "step": 7843 + }, + { + "epoch": 0.38, + "grad_norm": 0.5588723421096802, + "learning_rate": 0.0005760184304413887, + "loss": 3.0993, + "step": 7844 + }, + { + "epoch": 0.38, + "grad_norm": 0.5348266363143921, + "learning_rate": 0.0005760123982490481, + "loss": 3.3248, + "step": 7845 + }, + { + "epoch": 0.38, + "grad_norm": 0.5221610069274902, + "learning_rate": 0.0005760063653297441, + "loss": 3.1574, + "step": 7846 + }, + { + "epoch": 0.38, + "grad_norm": 0.5521312355995178, + "learning_rate": 0.0005760003316834924, + "loss": 3.2215, + "step": 7847 + }, + { + "epoch": 0.38, + "grad_norm": 0.570512056350708, + "learning_rate": 0.0005759942973103089, + "loss": 3.478, + "step": 7848 + }, + { + "epoch": 0.38, + "grad_norm": 0.5131711363792419, + "learning_rate": 0.0005759882622102096, + "loss": 3.3638, + "step": 7849 + }, + { + "epoch": 0.38, + "grad_norm": 0.5566141605377197, + "learning_rate": 0.0005759822263832103, + "loss": 3.0577, + "step": 7850 + }, + { + "epoch": 0.38, + "grad_norm": 0.5762760043144226, + "learning_rate": 0.0005759761898293269, + "loss": 3.275, + "step": 7851 + }, + { + "epoch": 0.38, + "grad_norm": 0.5507886409759521, + "learning_rate": 0.0005759701525485754, + "loss": 3.1812, + "step": 7852 + }, + { + "epoch": 0.38, + "grad_norm": 0.5555328726768494, + "learning_rate": 0.0005759641145409716, + "loss": 3.3363, + "step": 7853 + }, + { + "epoch": 0.38, + "grad_norm": 0.5610153079032898, + "learning_rate": 0.0005759580758065315, + "loss": 3.3007, + "step": 7854 + }, + { + "epoch": 0.38, + "grad_norm": 0.5262787938117981, + "learning_rate": 0.0005759520363452709, + "loss": 3.5069, + "step": 7855 + }, + { + "epoch": 0.39, + "grad_norm": 0.5241034030914307, + "learning_rate": 0.0005759459961572057, + "loss": 3.3264, + "step": 7856 + }, + { + "epoch": 0.39, + "grad_norm": 0.5599466562271118, + "learning_rate": 0.0005759399552423518, + "loss": 3.4544, + "step": 7857 + }, + { + "epoch": 0.39, + "grad_norm": 0.5439044833183289, + "learning_rate": 0.0005759339136007253, + "loss": 3.3155, + "step": 7858 + }, + { + "epoch": 0.39, + "grad_norm": 0.5432597994804382, + "learning_rate": 0.0005759278712323419, + "loss": 3.2769, + "step": 7859 + }, + { + "epoch": 0.39, + "grad_norm": 0.550878643989563, + "learning_rate": 0.0005759218281372175, + "loss": 3.5132, + "step": 7860 + }, + { + "epoch": 0.39, + "grad_norm": 0.5362837910652161, + "learning_rate": 0.0005759157843153683, + "loss": 3.2445, + "step": 7861 + }, + { + "epoch": 0.39, + "grad_norm": 0.5157976150512695, + "learning_rate": 0.00057590973976681, + "loss": 3.3654, + "step": 7862 + }, + { + "epoch": 0.39, + "grad_norm": 0.5195072293281555, + "learning_rate": 0.0005759036944915585, + "loss": 3.2124, + "step": 7863 + }, + { + "epoch": 0.39, + "grad_norm": 0.5618158578872681, + "learning_rate": 0.0005758976484896296, + "loss": 3.1182, + "step": 7864 + }, + { + "epoch": 0.39, + "grad_norm": 0.5115683078765869, + "learning_rate": 0.0005758916017610396, + "loss": 3.4023, + "step": 7865 + }, + { + "epoch": 0.39, + "grad_norm": 0.5243606567382812, + "learning_rate": 0.0005758855543058042, + "loss": 3.2927, + "step": 7866 + }, + { + "epoch": 0.39, + "grad_norm": 0.5262304544448853, + "learning_rate": 0.0005758795061239393, + "loss": 3.3996, + "step": 7867 + }, + { + "epoch": 0.39, + "grad_norm": 0.5342805981636047, + "learning_rate": 0.0005758734572154609, + "loss": 3.443, + "step": 7868 + }, + { + "epoch": 0.39, + "grad_norm": 0.49876853823661804, + "learning_rate": 0.0005758674075803848, + "loss": 3.5409, + "step": 7869 + }, + { + "epoch": 0.39, + "grad_norm": 0.534772515296936, + "learning_rate": 0.000575861357218727, + "loss": 3.3427, + "step": 7870 + }, + { + "epoch": 0.39, + "grad_norm": 0.5307782292366028, + "learning_rate": 0.0005758553061305038, + "loss": 3.3184, + "step": 7871 + }, + { + "epoch": 0.39, + "grad_norm": 0.5947427153587341, + "learning_rate": 0.0005758492543157305, + "loss": 3.1305, + "step": 7872 + }, + { + "epoch": 0.39, + "grad_norm": 0.5112699270248413, + "learning_rate": 0.0005758432017744235, + "loss": 3.2861, + "step": 7873 + }, + { + "epoch": 0.39, + "grad_norm": 0.5101796388626099, + "learning_rate": 0.0005758371485065986, + "loss": 3.3975, + "step": 7874 + }, + { + "epoch": 0.39, + "grad_norm": 0.5383381247520447, + "learning_rate": 0.0005758310945122717, + "loss": 3.2371, + "step": 7875 + }, + { + "epoch": 0.39, + "grad_norm": 0.5568742156028748, + "learning_rate": 0.0005758250397914587, + "loss": 3.3188, + "step": 7876 + }, + { + "epoch": 0.39, + "grad_norm": 0.5260884761810303, + "learning_rate": 0.0005758189843441757, + "loss": 3.3591, + "step": 7877 + }, + { + "epoch": 0.39, + "grad_norm": 0.5004744529724121, + "learning_rate": 0.0005758129281704386, + "loss": 3.2018, + "step": 7878 + }, + { + "epoch": 0.39, + "grad_norm": 0.5695815682411194, + "learning_rate": 0.0005758068712702633, + "loss": 3.0783, + "step": 7879 + }, + { + "epoch": 0.39, + "grad_norm": 0.5590498447418213, + "learning_rate": 0.0005758008136436658, + "loss": 3.3908, + "step": 7880 + }, + { + "epoch": 0.39, + "grad_norm": 0.5661531090736389, + "learning_rate": 0.0005757947552906621, + "loss": 3.2036, + "step": 7881 + }, + { + "epoch": 0.39, + "grad_norm": 0.5372530817985535, + "learning_rate": 0.0005757886962112679, + "loss": 3.3096, + "step": 7882 + }, + { + "epoch": 0.39, + "grad_norm": 0.5631973147392273, + "learning_rate": 0.0005757826364054995, + "loss": 3.3011, + "step": 7883 + }, + { + "epoch": 0.39, + "grad_norm": 0.5350229740142822, + "learning_rate": 0.0005757765758733727, + "loss": 3.4572, + "step": 7884 + }, + { + "epoch": 0.39, + "grad_norm": 0.5403389930725098, + "learning_rate": 0.0005757705146149034, + "loss": 3.3266, + "step": 7885 + }, + { + "epoch": 0.39, + "grad_norm": 0.5325131416320801, + "learning_rate": 0.0005757644526301078, + "loss": 3.3954, + "step": 7886 + }, + { + "epoch": 0.39, + "grad_norm": 0.5278269052505493, + "learning_rate": 0.0005757583899190016, + "loss": 3.3926, + "step": 7887 + }, + { + "epoch": 0.39, + "grad_norm": 0.5290004014968872, + "learning_rate": 0.0005757523264816009, + "loss": 3.5282, + "step": 7888 + }, + { + "epoch": 0.39, + "grad_norm": 0.5585773587226868, + "learning_rate": 0.0005757462623179215, + "loss": 3.5418, + "step": 7889 + }, + { + "epoch": 0.39, + "grad_norm": 0.5469244122505188, + "learning_rate": 0.0005757401974279796, + "loss": 3.2753, + "step": 7890 + }, + { + "epoch": 0.39, + "grad_norm": 0.6067792177200317, + "learning_rate": 0.0005757341318117911, + "loss": 3.3368, + "step": 7891 + }, + { + "epoch": 0.39, + "grad_norm": 0.5292524099349976, + "learning_rate": 0.0005757280654693721, + "loss": 3.2878, + "step": 7892 + }, + { + "epoch": 0.39, + "grad_norm": 0.6546759605407715, + "learning_rate": 0.0005757219984007382, + "loss": 3.1904, + "step": 7893 + }, + { + "epoch": 0.39, + "grad_norm": 0.5419657826423645, + "learning_rate": 0.0005757159306059057, + "loss": 3.4508, + "step": 7894 + }, + { + "epoch": 0.39, + "grad_norm": 0.5196870565414429, + "learning_rate": 0.0005757098620848905, + "loss": 3.4328, + "step": 7895 + }, + { + "epoch": 0.39, + "grad_norm": 0.5152660608291626, + "learning_rate": 0.0005757037928377087, + "loss": 3.3568, + "step": 7896 + }, + { + "epoch": 0.39, + "grad_norm": 0.5874919891357422, + "learning_rate": 0.0005756977228643761, + "loss": 3.4204, + "step": 7897 + }, + { + "epoch": 0.39, + "grad_norm": 0.5220939517021179, + "learning_rate": 0.0005756916521649088, + "loss": 3.5503, + "step": 7898 + }, + { + "epoch": 0.39, + "grad_norm": 0.5331912636756897, + "learning_rate": 0.0005756855807393227, + "loss": 3.2373, + "step": 7899 + }, + { + "epoch": 0.39, + "grad_norm": 0.5303639769554138, + "learning_rate": 0.0005756795085876338, + "loss": 3.2296, + "step": 7900 + }, + { + "epoch": 0.39, + "grad_norm": 0.522990882396698, + "learning_rate": 0.0005756734357098581, + "loss": 3.2013, + "step": 7901 + }, + { + "epoch": 0.39, + "grad_norm": 0.5241119861602783, + "learning_rate": 0.0005756673621060117, + "loss": 3.0806, + "step": 7902 + }, + { + "epoch": 0.39, + "grad_norm": 0.5303024649620056, + "learning_rate": 0.0005756612877761105, + "loss": 3.3074, + "step": 7903 + }, + { + "epoch": 0.39, + "grad_norm": 0.5631526112556458, + "learning_rate": 0.0005756552127201706, + "loss": 3.3211, + "step": 7904 + }, + { + "epoch": 0.39, + "grad_norm": 0.5668905377388, + "learning_rate": 0.0005756491369382078, + "loss": 3.2826, + "step": 7905 + }, + { + "epoch": 0.39, + "grad_norm": 0.5107238292694092, + "learning_rate": 0.0005756430604302383, + "loss": 3.2575, + "step": 7906 + }, + { + "epoch": 0.39, + "grad_norm": 0.5243762731552124, + "learning_rate": 0.0005756369831962779, + "loss": 3.294, + "step": 7907 + }, + { + "epoch": 0.39, + "grad_norm": 0.5488375425338745, + "learning_rate": 0.0005756309052363429, + "loss": 3.6128, + "step": 7908 + }, + { + "epoch": 0.39, + "grad_norm": 0.5057998299598694, + "learning_rate": 0.0005756248265504491, + "loss": 3.294, + "step": 7909 + }, + { + "epoch": 0.39, + "grad_norm": 0.544693648815155, + "learning_rate": 0.0005756187471386126, + "loss": 3.2293, + "step": 7910 + }, + { + "epoch": 0.39, + "grad_norm": 0.5097287893295288, + "learning_rate": 0.0005756126670008492, + "loss": 3.1767, + "step": 7911 + }, + { + "epoch": 0.39, + "grad_norm": 0.5124967098236084, + "learning_rate": 0.0005756065861371751, + "loss": 3.0868, + "step": 7912 + }, + { + "epoch": 0.39, + "grad_norm": 0.5405759215354919, + "learning_rate": 0.0005756005045476064, + "loss": 3.4881, + "step": 7913 + }, + { + "epoch": 0.39, + "grad_norm": 0.5262176394462585, + "learning_rate": 0.000575594422232159, + "loss": 3.4049, + "step": 7914 + }, + { + "epoch": 0.39, + "grad_norm": 0.5306533575057983, + "learning_rate": 0.0005755883391908489, + "loss": 3.3627, + "step": 7915 + }, + { + "epoch": 0.39, + "grad_norm": 0.5325959920883179, + "learning_rate": 0.0005755822554236921, + "loss": 3.4184, + "step": 7916 + }, + { + "epoch": 0.39, + "grad_norm": 0.5592959523200989, + "learning_rate": 0.0005755761709307048, + "loss": 3.1438, + "step": 7917 + }, + { + "epoch": 0.39, + "grad_norm": 0.5043314695358276, + "learning_rate": 0.0005755700857119028, + "loss": 3.2669, + "step": 7918 + }, + { + "epoch": 0.39, + "grad_norm": 0.5433175563812256, + "learning_rate": 0.0005755639997673022, + "loss": 3.2691, + "step": 7919 + }, + { + "epoch": 0.39, + "grad_norm": 0.5013863444328308, + "learning_rate": 0.0005755579130969191, + "loss": 3.1457, + "step": 7920 + }, + { + "epoch": 0.39, + "grad_norm": 0.5583698749542236, + "learning_rate": 0.0005755518257007696, + "loss": 3.6578, + "step": 7921 + }, + { + "epoch": 0.39, + "grad_norm": 0.5930216312408447, + "learning_rate": 0.0005755457375788694, + "loss": 3.2728, + "step": 7922 + }, + { + "epoch": 0.39, + "grad_norm": 0.5604255795478821, + "learning_rate": 0.000575539648731235, + "loss": 3.2173, + "step": 7923 + }, + { + "epoch": 0.39, + "grad_norm": 0.5618053674697876, + "learning_rate": 0.000575533559157882, + "loss": 3.3222, + "step": 7924 + }, + { + "epoch": 0.39, + "grad_norm": 0.5377988219261169, + "learning_rate": 0.0005755274688588268, + "loss": 3.3963, + "step": 7925 + }, + { + "epoch": 0.39, + "grad_norm": 0.5500739216804504, + "learning_rate": 0.0005755213778340852, + "loss": 3.6868, + "step": 7926 + }, + { + "epoch": 0.39, + "grad_norm": 0.5406518578529358, + "learning_rate": 0.0005755152860836733, + "loss": 3.374, + "step": 7927 + }, + { + "epoch": 0.39, + "grad_norm": 0.5654966235160828, + "learning_rate": 0.0005755091936076071, + "loss": 3.3569, + "step": 7928 + }, + { + "epoch": 0.39, + "grad_norm": 0.5690451860427856, + "learning_rate": 0.0005755031004059028, + "loss": 3.2013, + "step": 7929 + }, + { + "epoch": 0.39, + "grad_norm": 0.6072752475738525, + "learning_rate": 0.0005754970064785763, + "loss": 3.3478, + "step": 7930 + }, + { + "epoch": 0.39, + "grad_norm": 0.5231303572654724, + "learning_rate": 0.0005754909118256438, + "loss": 3.1507, + "step": 7931 + }, + { + "epoch": 0.39, + "grad_norm": 0.5188618898391724, + "learning_rate": 0.0005754848164471211, + "loss": 3.3347, + "step": 7932 + }, + { + "epoch": 0.39, + "grad_norm": 0.5124830007553101, + "learning_rate": 0.0005754787203430245, + "loss": 3.5574, + "step": 7933 + }, + { + "epoch": 0.39, + "grad_norm": 0.5399743914604187, + "learning_rate": 0.0005754726235133701, + "loss": 3.4946, + "step": 7934 + }, + { + "epoch": 0.39, + "grad_norm": 0.526694655418396, + "learning_rate": 0.0005754665259581736, + "loss": 3.2302, + "step": 7935 + }, + { + "epoch": 0.39, + "grad_norm": 0.5251884460449219, + "learning_rate": 0.0005754604276774515, + "loss": 3.5666, + "step": 7936 + }, + { + "epoch": 0.39, + "grad_norm": 0.5409374237060547, + "learning_rate": 0.0005754543286712193, + "loss": 3.2799, + "step": 7937 + }, + { + "epoch": 0.39, + "grad_norm": 0.49520036578178406, + "learning_rate": 0.0005754482289394938, + "loss": 3.3421, + "step": 7938 + }, + { + "epoch": 0.39, + "grad_norm": 0.5165755152702332, + "learning_rate": 0.0005754421284822905, + "loss": 3.0666, + "step": 7939 + }, + { + "epoch": 0.39, + "grad_norm": 0.5416868329048157, + "learning_rate": 0.0005754360272996256, + "loss": 3.2758, + "step": 7940 + }, + { + "epoch": 0.39, + "grad_norm": 0.53125, + "learning_rate": 0.0005754299253915153, + "loss": 3.2919, + "step": 7941 + }, + { + "epoch": 0.39, + "grad_norm": 0.5226501822471619, + "learning_rate": 0.0005754238227579755, + "loss": 3.2847, + "step": 7942 + }, + { + "epoch": 0.39, + "grad_norm": 0.5210117101669312, + "learning_rate": 0.0005754177193990225, + "loss": 3.4445, + "step": 7943 + }, + { + "epoch": 0.39, + "grad_norm": 0.5392207503318787, + "learning_rate": 0.0005754116153146721, + "loss": 3.2395, + "step": 7944 + }, + { + "epoch": 0.39, + "grad_norm": 0.5218811631202698, + "learning_rate": 0.0005754055105049406, + "loss": 3.3672, + "step": 7945 + }, + { + "epoch": 0.39, + "grad_norm": 0.5551583170890808, + "learning_rate": 0.0005753994049698439, + "loss": 3.483, + "step": 7946 + }, + { + "epoch": 0.39, + "grad_norm": 0.5441166758537292, + "learning_rate": 0.0005753932987093983, + "loss": 3.2976, + "step": 7947 + }, + { + "epoch": 0.39, + "grad_norm": 0.5118980407714844, + "learning_rate": 0.0005753871917236195, + "loss": 3.258, + "step": 7948 + }, + { + "epoch": 0.39, + "grad_norm": 0.51288902759552, + "learning_rate": 0.000575381084012524, + "loss": 3.1806, + "step": 7949 + }, + { + "epoch": 0.39, + "grad_norm": 0.5351516008377075, + "learning_rate": 0.0005753749755761277, + "loss": 3.2706, + "step": 7950 + }, + { + "epoch": 0.39, + "grad_norm": 0.5109082460403442, + "learning_rate": 0.0005753688664144467, + "loss": 3.0689, + "step": 7951 + }, + { + "epoch": 0.39, + "grad_norm": 0.49430152773857117, + "learning_rate": 0.0005753627565274972, + "loss": 3.5631, + "step": 7952 + }, + { + "epoch": 0.39, + "grad_norm": 0.5566909909248352, + "learning_rate": 0.0005753566459152952, + "loss": 3.3404, + "step": 7953 + }, + { + "epoch": 0.39, + "grad_norm": 0.576958179473877, + "learning_rate": 0.0005753505345778567, + "loss": 3.2608, + "step": 7954 + }, + { + "epoch": 0.39, + "grad_norm": 0.5318806767463684, + "learning_rate": 0.0005753444225151978, + "loss": 3.2429, + "step": 7955 + }, + { + "epoch": 0.39, + "grad_norm": 0.5288330316543579, + "learning_rate": 0.0005753383097273348, + "loss": 3.4162, + "step": 7956 + }, + { + "epoch": 0.39, + "grad_norm": 0.5240846276283264, + "learning_rate": 0.0005753321962142837, + "loss": 3.142, + "step": 7957 + }, + { + "epoch": 0.39, + "grad_norm": 0.5280718207359314, + "learning_rate": 0.0005753260819760605, + "loss": 3.2296, + "step": 7958 + }, + { + "epoch": 0.39, + "grad_norm": 0.5125327110290527, + "learning_rate": 0.0005753199670126815, + "loss": 3.3329, + "step": 7959 + }, + { + "epoch": 0.39, + "grad_norm": 0.5324898362159729, + "learning_rate": 0.0005753138513241626, + "loss": 3.3372, + "step": 7960 + }, + { + "epoch": 0.39, + "grad_norm": 0.519921064376831, + "learning_rate": 0.00057530773491052, + "loss": 3.653, + "step": 7961 + }, + { + "epoch": 0.39, + "grad_norm": 0.5081679821014404, + "learning_rate": 0.0005753016177717699, + "loss": 3.2574, + "step": 7962 + }, + { + "epoch": 0.39, + "grad_norm": 0.5309244990348816, + "learning_rate": 0.0005752954999079282, + "loss": 3.3269, + "step": 7963 + }, + { + "epoch": 0.39, + "grad_norm": 0.5274438261985779, + "learning_rate": 0.0005752893813190112, + "loss": 3.3909, + "step": 7964 + }, + { + "epoch": 0.39, + "grad_norm": 0.5856743454933167, + "learning_rate": 0.000575283262005035, + "loss": 3.2714, + "step": 7965 + }, + { + "epoch": 0.39, + "grad_norm": 0.528982400894165, + "learning_rate": 0.0005752771419660156, + "loss": 3.2405, + "step": 7966 + }, + { + "epoch": 0.39, + "grad_norm": 0.5737332701683044, + "learning_rate": 0.0005752710212019692, + "loss": 3.5454, + "step": 7967 + }, + { + "epoch": 0.39, + "grad_norm": 0.5274087190628052, + "learning_rate": 0.000575264899712912, + "loss": 3.3418, + "step": 7968 + }, + { + "epoch": 0.39, + "grad_norm": 0.5080131888389587, + "learning_rate": 0.0005752587774988598, + "loss": 3.3738, + "step": 7969 + }, + { + "epoch": 0.39, + "grad_norm": 0.4994853138923645, + "learning_rate": 0.0005752526545598291, + "loss": 3.4255, + "step": 7970 + }, + { + "epoch": 0.39, + "grad_norm": 0.5325427651405334, + "learning_rate": 0.0005752465308958358, + "loss": 3.3933, + "step": 7971 + }, + { + "epoch": 0.39, + "grad_norm": 0.5056576728820801, + "learning_rate": 0.0005752404065068963, + "loss": 3.4345, + "step": 7972 + }, + { + "epoch": 0.39, + "grad_norm": 0.5503882765769958, + "learning_rate": 0.0005752342813930264, + "loss": 3.1064, + "step": 7973 + }, + { + "epoch": 0.39, + "grad_norm": 0.537910521030426, + "learning_rate": 0.0005752281555542423, + "loss": 3.264, + "step": 7974 + }, + { + "epoch": 0.39, + "grad_norm": 0.5261189937591553, + "learning_rate": 0.0005752220289905604, + "loss": 3.375, + "step": 7975 + }, + { + "epoch": 0.39, + "grad_norm": 0.5032466650009155, + "learning_rate": 0.0005752159017019965, + "loss": 3.2479, + "step": 7976 + }, + { + "epoch": 0.39, + "grad_norm": 0.5199475884437561, + "learning_rate": 0.0005752097736885669, + "loss": 3.4072, + "step": 7977 + }, + { + "epoch": 0.39, + "grad_norm": 0.5200490951538086, + "learning_rate": 0.0005752036449502878, + "loss": 3.2218, + "step": 7978 + }, + { + "epoch": 0.39, + "grad_norm": 0.5323976874351501, + "learning_rate": 0.0005751975154871752, + "loss": 3.3384, + "step": 7979 + }, + { + "epoch": 0.39, + "grad_norm": 0.5439580082893372, + "learning_rate": 0.0005751913852992454, + "loss": 3.3075, + "step": 7980 + }, + { + "epoch": 0.39, + "grad_norm": 0.5520688891410828, + "learning_rate": 0.0005751852543865143, + "loss": 3.2518, + "step": 7981 + }, + { + "epoch": 0.39, + "grad_norm": 0.5149745345115662, + "learning_rate": 0.0005751791227489983, + "loss": 3.2855, + "step": 7982 + }, + { + "epoch": 0.39, + "grad_norm": 0.6031532287597656, + "learning_rate": 0.0005751729903867134, + "loss": 2.9604, + "step": 7983 + }, + { + "epoch": 0.39, + "grad_norm": 0.5211678147315979, + "learning_rate": 0.0005751668572996758, + "loss": 3.5004, + "step": 7984 + }, + { + "epoch": 0.39, + "grad_norm": 0.5391950011253357, + "learning_rate": 0.0005751607234879017, + "loss": 3.3339, + "step": 7985 + }, + { + "epoch": 0.39, + "grad_norm": 0.5196065306663513, + "learning_rate": 0.0005751545889514072, + "loss": 3.2076, + "step": 7986 + }, + { + "epoch": 0.39, + "grad_norm": 0.5143944025039673, + "learning_rate": 0.0005751484536902084, + "loss": 3.1234, + "step": 7987 + }, + { + "epoch": 0.39, + "grad_norm": 0.5726907253265381, + "learning_rate": 0.0005751423177043216, + "loss": 3.3739, + "step": 7988 + }, + { + "epoch": 0.39, + "grad_norm": 0.5271036028862, + "learning_rate": 0.0005751361809937629, + "loss": 3.4486, + "step": 7989 + }, + { + "epoch": 0.39, + "grad_norm": 0.5144461393356323, + "learning_rate": 0.0005751300435585483, + "loss": 3.5016, + "step": 7990 + }, + { + "epoch": 0.39, + "grad_norm": 0.5567069053649902, + "learning_rate": 0.0005751239053986944, + "loss": 3.3089, + "step": 7991 + }, + { + "epoch": 0.39, + "grad_norm": 0.5412687659263611, + "learning_rate": 0.0005751177665142169, + "loss": 3.2718, + "step": 7992 + }, + { + "epoch": 0.39, + "grad_norm": 0.5389328598976135, + "learning_rate": 0.0005751116269051321, + "loss": 3.1432, + "step": 7993 + }, + { + "epoch": 0.39, + "grad_norm": 0.5082566142082214, + "learning_rate": 0.0005751054865714562, + "loss": 3.3403, + "step": 7994 + }, + { + "epoch": 0.39, + "grad_norm": 0.51983243227005, + "learning_rate": 0.0005750993455132055, + "loss": 3.2146, + "step": 7995 + }, + { + "epoch": 0.39, + "grad_norm": 0.5300633311271667, + "learning_rate": 0.0005750932037303961, + "loss": 3.234, + "step": 7996 + }, + { + "epoch": 0.39, + "grad_norm": 0.5436065196990967, + "learning_rate": 0.0005750870612230442, + "loss": 3.2246, + "step": 7997 + }, + { + "epoch": 0.39, + "grad_norm": 0.5409985184669495, + "learning_rate": 0.0005750809179911657, + "loss": 3.1161, + "step": 7998 + }, + { + "epoch": 0.39, + "grad_norm": 0.5061076879501343, + "learning_rate": 0.0005750747740347771, + "loss": 3.2811, + "step": 7999 + }, + { + "epoch": 0.39, + "grad_norm": 0.5530107617378235, + "learning_rate": 0.0005750686293538945, + "loss": 3.2673, + "step": 8000 + }, + { + "epoch": 0.39, + "grad_norm": 0.5269939303398132, + "learning_rate": 0.0005750624839485341, + "loss": 3.3215, + "step": 8001 + }, + { + "epoch": 0.39, + "grad_norm": 0.5110950469970703, + "learning_rate": 0.000575056337818712, + "loss": 3.5774, + "step": 8002 + }, + { + "epoch": 0.39, + "grad_norm": 0.5932079553604126, + "learning_rate": 0.0005750501909644445, + "loss": 3.5213, + "step": 8003 + }, + { + "epoch": 0.39, + "grad_norm": 0.5554304122924805, + "learning_rate": 0.0005750440433857477, + "loss": 3.2986, + "step": 8004 + }, + { + "epoch": 0.39, + "grad_norm": 0.5720034241676331, + "learning_rate": 0.0005750378950826378, + "loss": 3.3181, + "step": 8005 + }, + { + "epoch": 0.39, + "grad_norm": 0.5665091276168823, + "learning_rate": 0.000575031746055131, + "loss": 2.9911, + "step": 8006 + }, + { + "epoch": 0.39, + "grad_norm": 0.5483243465423584, + "learning_rate": 0.0005750255963032436, + "loss": 3.3377, + "step": 8007 + }, + { + "epoch": 0.39, + "grad_norm": 0.5081855058670044, + "learning_rate": 0.0005750194458269916, + "loss": 3.2402, + "step": 8008 + }, + { + "epoch": 0.39, + "grad_norm": 0.5034002065658569, + "learning_rate": 0.0005750132946263914, + "loss": 3.2496, + "step": 8009 + }, + { + "epoch": 0.39, + "grad_norm": 0.6338850259780884, + "learning_rate": 0.0005750071427014591, + "loss": 3.3337, + "step": 8010 + }, + { + "epoch": 0.39, + "grad_norm": 0.5657280683517456, + "learning_rate": 0.0005750009900522109, + "loss": 3.4191, + "step": 8011 + }, + { + "epoch": 0.39, + "grad_norm": 0.5098117589950562, + "learning_rate": 0.000574994836678663, + "loss": 3.3851, + "step": 8012 + }, + { + "epoch": 0.39, + "grad_norm": 0.5298253297805786, + "learning_rate": 0.0005749886825808316, + "loss": 3.3719, + "step": 8013 + }, + { + "epoch": 0.39, + "grad_norm": 0.5312026143074036, + "learning_rate": 0.000574982527758733, + "loss": 3.4547, + "step": 8014 + }, + { + "epoch": 0.39, + "grad_norm": 0.5396552085876465, + "learning_rate": 0.0005749763722123832, + "loss": 3.2734, + "step": 8015 + }, + { + "epoch": 0.39, + "grad_norm": 0.5509809255599976, + "learning_rate": 0.0005749702159417988, + "loss": 3.4702, + "step": 8016 + }, + { + "epoch": 0.39, + "grad_norm": 0.5270836353302002, + "learning_rate": 0.0005749640589469956, + "loss": 3.3203, + "step": 8017 + }, + { + "epoch": 0.39, + "grad_norm": 0.540736973285675, + "learning_rate": 0.00057495790122799, + "loss": 3.1879, + "step": 8018 + }, + { + "epoch": 0.39, + "grad_norm": 0.531613290309906, + "learning_rate": 0.0005749517427847982, + "loss": 3.5024, + "step": 8019 + }, + { + "epoch": 0.39, + "grad_norm": 0.5209075808525085, + "learning_rate": 0.0005749455836174365, + "loss": 3.5551, + "step": 8020 + }, + { + "epoch": 0.39, + "grad_norm": 0.536415159702301, + "learning_rate": 0.0005749394237259209, + "loss": 3.2383, + "step": 8021 + }, + { + "epoch": 0.39, + "grad_norm": 0.5413359999656677, + "learning_rate": 0.0005749332631102679, + "loss": 3.1601, + "step": 8022 + }, + { + "epoch": 0.39, + "grad_norm": 0.5244982242584229, + "learning_rate": 0.0005749271017704935, + "loss": 3.2752, + "step": 8023 + }, + { + "epoch": 0.39, + "grad_norm": 0.5341219305992126, + "learning_rate": 0.0005749209397066142, + "loss": 3.1301, + "step": 8024 + }, + { + "epoch": 0.39, + "grad_norm": 0.53799968957901, + "learning_rate": 0.0005749147769186459, + "loss": 3.2056, + "step": 8025 + }, + { + "epoch": 0.39, + "grad_norm": 0.5490437746047974, + "learning_rate": 0.0005749086134066051, + "loss": 3.5046, + "step": 8026 + }, + { + "epoch": 0.39, + "grad_norm": 0.5136128664016724, + "learning_rate": 0.0005749024491705078, + "loss": 3.2401, + "step": 8027 + }, + { + "epoch": 0.39, + "grad_norm": 0.566075325012207, + "learning_rate": 0.0005748962842103703, + "loss": 3.2781, + "step": 8028 + }, + { + "epoch": 0.39, + "grad_norm": 0.5798686742782593, + "learning_rate": 0.0005748901185262091, + "loss": 3.3664, + "step": 8029 + }, + { + "epoch": 0.39, + "grad_norm": 0.541175365447998, + "learning_rate": 0.0005748839521180401, + "loss": 3.4175, + "step": 8030 + }, + { + "epoch": 0.39, + "grad_norm": 0.5209834575653076, + "learning_rate": 0.0005748777849858797, + "loss": 3.3311, + "step": 8031 + }, + { + "epoch": 0.39, + "grad_norm": 0.5575250387191772, + "learning_rate": 0.0005748716171297441, + "loss": 3.3556, + "step": 8032 + }, + { + "epoch": 0.39, + "grad_norm": 0.5445090532302856, + "learning_rate": 0.0005748654485496496, + "loss": 3.2137, + "step": 8033 + }, + { + "epoch": 0.39, + "grad_norm": 0.5050318241119385, + "learning_rate": 0.0005748592792456124, + "loss": 3.5883, + "step": 8034 + }, + { + "epoch": 0.39, + "grad_norm": 0.5381130576133728, + "learning_rate": 0.0005748531092176487, + "loss": 3.3816, + "step": 8035 + }, + { + "epoch": 0.39, + "grad_norm": 0.5211299657821655, + "learning_rate": 0.0005748469384657749, + "loss": 3.405, + "step": 8036 + }, + { + "epoch": 0.39, + "grad_norm": 0.543992817401886, + "learning_rate": 0.0005748407669900071, + "loss": 3.1935, + "step": 8037 + }, + { + "epoch": 0.39, + "grad_norm": 0.5342070460319519, + "learning_rate": 0.0005748345947903615, + "loss": 3.2672, + "step": 8038 + }, + { + "epoch": 0.39, + "grad_norm": 0.5374672412872314, + "learning_rate": 0.0005748284218668546, + "loss": 3.6805, + "step": 8039 + }, + { + "epoch": 0.39, + "grad_norm": 0.5109804272651672, + "learning_rate": 0.0005748222482195026, + "loss": 3.3104, + "step": 8040 + }, + { + "epoch": 0.39, + "grad_norm": 0.4903632402420044, + "learning_rate": 0.0005748160738483216, + "loss": 3.3117, + "step": 8041 + }, + { + "epoch": 0.39, + "grad_norm": 0.525119423866272, + "learning_rate": 0.0005748098987533279, + "loss": 3.3496, + "step": 8042 + }, + { + "epoch": 0.39, + "grad_norm": 0.5801721215248108, + "learning_rate": 0.0005748037229345379, + "loss": 3.2369, + "step": 8043 + }, + { + "epoch": 0.39, + "grad_norm": 0.514396071434021, + "learning_rate": 0.0005747975463919677, + "loss": 3.2263, + "step": 8044 + }, + { + "epoch": 0.39, + "grad_norm": 0.5463517904281616, + "learning_rate": 0.0005747913691256338, + "loss": 3.296, + "step": 8045 + }, + { + "epoch": 0.39, + "grad_norm": 0.5206927061080933, + "learning_rate": 0.0005747851911355521, + "loss": 3.3216, + "step": 8046 + }, + { + "epoch": 0.39, + "grad_norm": 0.5000367760658264, + "learning_rate": 0.0005747790124217393, + "loss": 3.2014, + "step": 8047 + }, + { + "epoch": 0.39, + "grad_norm": 0.5487034320831299, + "learning_rate": 0.0005747728329842113, + "loss": 3.0561, + "step": 8048 + }, + { + "epoch": 0.39, + "grad_norm": 0.5461079478263855, + "learning_rate": 0.0005747666528229847, + "loss": 3.3097, + "step": 8049 + }, + { + "epoch": 0.39, + "grad_norm": 0.4785720407962799, + "learning_rate": 0.0005747604719380754, + "loss": 3.4331, + "step": 8050 + }, + { + "epoch": 0.39, + "grad_norm": 0.5695480704307556, + "learning_rate": 0.0005747542903295, + "loss": 3.2418, + "step": 8051 + }, + { + "epoch": 0.39, + "grad_norm": 0.5708943009376526, + "learning_rate": 0.0005747481079972749, + "loss": 3.3734, + "step": 8052 + }, + { + "epoch": 0.39, + "grad_norm": 0.5381008386611938, + "learning_rate": 0.0005747419249414159, + "loss": 3.4608, + "step": 8053 + }, + { + "epoch": 0.39, + "grad_norm": 0.5541656017303467, + "learning_rate": 0.0005747357411619396, + "loss": 3.3497, + "step": 8054 + }, + { + "epoch": 0.39, + "grad_norm": 0.5573221445083618, + "learning_rate": 0.0005747295566588622, + "loss": 3.3627, + "step": 8055 + }, + { + "epoch": 0.39, + "grad_norm": 0.5886656641960144, + "learning_rate": 0.0005747233714322001, + "loss": 3.399, + "step": 8056 + }, + { + "epoch": 0.39, + "grad_norm": 0.5136348605155945, + "learning_rate": 0.0005747171854819694, + "loss": 3.3561, + "step": 8057 + }, + { + "epoch": 0.39, + "grad_norm": 0.5364802479743958, + "learning_rate": 0.0005747109988081865, + "loss": 3.4321, + "step": 8058 + }, + { + "epoch": 0.39, + "grad_norm": 0.5847378969192505, + "learning_rate": 0.000574704811410868, + "loss": 3.3716, + "step": 8059 + }, + { + "epoch": 0.4, + "grad_norm": 0.5139754414558411, + "learning_rate": 0.0005746986232900295, + "loss": 3.3843, + "step": 8060 + }, + { + "epoch": 0.4, + "grad_norm": 0.5183747410774231, + "learning_rate": 0.0005746924344456879, + "loss": 3.3141, + "step": 8061 + }, + { + "epoch": 0.4, + "grad_norm": 0.5344926714897156, + "learning_rate": 0.0005746862448778593, + "loss": 3.4683, + "step": 8062 + }, + { + "epoch": 0.4, + "grad_norm": 0.5372681021690369, + "learning_rate": 0.0005746800545865599, + "loss": 3.4784, + "step": 8063 + }, + { + "epoch": 0.4, + "grad_norm": 0.5549917817115784, + "learning_rate": 0.0005746738635718061, + "loss": 3.2525, + "step": 8064 + }, + { + "epoch": 0.4, + "grad_norm": 0.5214190483093262, + "learning_rate": 0.0005746676718336143, + "loss": 3.4208, + "step": 8065 + }, + { + "epoch": 0.4, + "grad_norm": 0.5115858912467957, + "learning_rate": 0.0005746614793720007, + "loss": 3.1698, + "step": 8066 + }, + { + "epoch": 0.4, + "grad_norm": 0.571573793888092, + "learning_rate": 0.0005746552861869815, + "loss": 3.3276, + "step": 8067 + }, + { + "epoch": 0.4, + "grad_norm": 0.5278741121292114, + "learning_rate": 0.0005746490922785733, + "loss": 3.3125, + "step": 8068 + }, + { + "epoch": 0.4, + "grad_norm": 0.5263270139694214, + "learning_rate": 0.0005746428976467922, + "loss": 3.39, + "step": 8069 + }, + { + "epoch": 0.4, + "grad_norm": 0.553209125995636, + "learning_rate": 0.0005746367022916545, + "loss": 3.1092, + "step": 8070 + }, + { + "epoch": 0.4, + "grad_norm": 0.5743881464004517, + "learning_rate": 0.0005746305062131765, + "loss": 3.3103, + "step": 8071 + }, + { + "epoch": 0.4, + "grad_norm": 0.5375149846076965, + "learning_rate": 0.0005746243094113748, + "loss": 3.4892, + "step": 8072 + }, + { + "epoch": 0.4, + "grad_norm": 0.5758310556411743, + "learning_rate": 0.0005746181118862655, + "loss": 3.199, + "step": 8073 + }, + { + "epoch": 0.4, + "grad_norm": 0.522111177444458, + "learning_rate": 0.0005746119136378648, + "loss": 3.2247, + "step": 8074 + }, + { + "epoch": 0.4, + "grad_norm": 0.5190290808677673, + "learning_rate": 0.0005746057146661892, + "loss": 3.1963, + "step": 8075 + }, + { + "epoch": 0.4, + "grad_norm": 0.5556468367576599, + "learning_rate": 0.0005745995149712552, + "loss": 3.0931, + "step": 8076 + }, + { + "epoch": 0.4, + "grad_norm": 0.5247904658317566, + "learning_rate": 0.0005745933145530787, + "loss": 3.28, + "step": 8077 + }, + { + "epoch": 0.4, + "grad_norm": 0.5039224624633789, + "learning_rate": 0.0005745871134116763, + "loss": 3.3245, + "step": 8078 + }, + { + "epoch": 0.4, + "grad_norm": 0.5405135750770569, + "learning_rate": 0.0005745809115470643, + "loss": 3.1882, + "step": 8079 + }, + { + "epoch": 0.4, + "grad_norm": 0.5452626347541809, + "learning_rate": 0.000574574708959259, + "loss": 3.3268, + "step": 8080 + }, + { + "epoch": 0.4, + "grad_norm": 0.5153411626815796, + "learning_rate": 0.000574568505648277, + "loss": 3.3195, + "step": 8081 + }, + { + "epoch": 0.4, + "grad_norm": 0.5305858254432678, + "learning_rate": 0.0005745623016141341, + "loss": 3.4117, + "step": 8082 + }, + { + "epoch": 0.4, + "grad_norm": 0.5426940321922302, + "learning_rate": 0.000574556096856847, + "loss": 3.1285, + "step": 8083 + }, + { + "epoch": 0.4, + "grad_norm": 0.4963018596172333, + "learning_rate": 0.000574549891376432, + "loss": 3.2248, + "step": 8084 + }, + { + "epoch": 0.4, + "grad_norm": 0.5105463266372681, + "learning_rate": 0.0005745436851729055, + "loss": 3.2956, + "step": 8085 + }, + { + "epoch": 0.4, + "grad_norm": 0.5522632598876953, + "learning_rate": 0.0005745374782462837, + "loss": 3.0467, + "step": 8086 + }, + { + "epoch": 0.4, + "grad_norm": 0.5475056767463684, + "learning_rate": 0.000574531270596583, + "loss": 3.382, + "step": 8087 + }, + { + "epoch": 0.4, + "grad_norm": 0.501108705997467, + "learning_rate": 0.0005745250622238198, + "loss": 3.5365, + "step": 8088 + }, + { + "epoch": 0.4, + "grad_norm": 0.5344222187995911, + "learning_rate": 0.0005745188531280105, + "loss": 3.2848, + "step": 8089 + }, + { + "epoch": 0.4, + "grad_norm": 0.5425450801849365, + "learning_rate": 0.0005745126433091712, + "loss": 3.5062, + "step": 8090 + }, + { + "epoch": 0.4, + "grad_norm": 0.5173064470291138, + "learning_rate": 0.0005745064327673185, + "loss": 3.1953, + "step": 8091 + }, + { + "epoch": 0.4, + "grad_norm": 0.5049318671226501, + "learning_rate": 0.0005745002215024687, + "loss": 3.4188, + "step": 8092 + }, + { + "epoch": 0.4, + "grad_norm": 0.5095267295837402, + "learning_rate": 0.0005744940095146381, + "loss": 3.5542, + "step": 8093 + }, + { + "epoch": 0.4, + "grad_norm": 0.5197057127952576, + "learning_rate": 0.0005744877968038432, + "loss": 3.341, + "step": 8094 + }, + { + "epoch": 0.4, + "grad_norm": 0.4955214262008667, + "learning_rate": 0.0005744815833701003, + "loss": 3.3152, + "step": 8095 + }, + { + "epoch": 0.4, + "grad_norm": 0.5291468501091003, + "learning_rate": 0.0005744753692134256, + "loss": 3.3581, + "step": 8096 + }, + { + "epoch": 0.4, + "grad_norm": 0.5096137523651123, + "learning_rate": 0.0005744691543338357, + "loss": 3.4981, + "step": 8097 + }, + { + "epoch": 0.4, + "grad_norm": 0.47097456455230713, + "learning_rate": 0.0005744629387313469, + "loss": 3.2238, + "step": 8098 + }, + { + "epoch": 0.4, + "grad_norm": 0.4965920150279999, + "learning_rate": 0.0005744567224059754, + "loss": 3.5343, + "step": 8099 + }, + { + "epoch": 0.4, + "grad_norm": 0.5325508713722229, + "learning_rate": 0.0005744505053577379, + "loss": 3.2636, + "step": 8100 + }, + { + "epoch": 0.4, + "grad_norm": 0.5449921488761902, + "learning_rate": 0.0005744442875866504, + "loss": 3.4296, + "step": 8101 + }, + { + "epoch": 0.4, + "grad_norm": 0.5376118421554565, + "learning_rate": 0.0005744380690927295, + "loss": 3.2763, + "step": 8102 + }, + { + "epoch": 0.4, + "grad_norm": 0.5433290600776672, + "learning_rate": 0.0005744318498759917, + "loss": 3.2533, + "step": 8103 + }, + { + "epoch": 0.4, + "grad_norm": 0.5353100299835205, + "learning_rate": 0.000574425629936453, + "loss": 3.3458, + "step": 8104 + }, + { + "epoch": 0.4, + "grad_norm": 0.5136173963546753, + "learning_rate": 0.0005744194092741302, + "loss": 3.3949, + "step": 8105 + }, + { + "epoch": 0.4, + "grad_norm": 0.5367599725723267, + "learning_rate": 0.0005744131878890394, + "loss": 3.2921, + "step": 8106 + }, + { + "epoch": 0.4, + "grad_norm": 0.5330642461776733, + "learning_rate": 0.0005744069657811971, + "loss": 3.3031, + "step": 8107 + }, + { + "epoch": 0.4, + "grad_norm": 0.5547467470169067, + "learning_rate": 0.0005744007429506195, + "loss": 3.5355, + "step": 8108 + }, + { + "epoch": 0.4, + "grad_norm": 0.47983911633491516, + "learning_rate": 0.0005743945193973233, + "loss": 3.4107, + "step": 8109 + }, + { + "epoch": 0.4, + "grad_norm": 0.5580479502677917, + "learning_rate": 0.0005743882951213247, + "loss": 3.2559, + "step": 8110 + }, + { + "epoch": 0.4, + "grad_norm": 0.5408802628517151, + "learning_rate": 0.0005743820701226402, + "loss": 3.2648, + "step": 8111 + }, + { + "epoch": 0.4, + "grad_norm": 0.5535223484039307, + "learning_rate": 0.0005743758444012861, + "loss": 3.3305, + "step": 8112 + }, + { + "epoch": 0.4, + "grad_norm": 0.5365628600120544, + "learning_rate": 0.0005743696179572788, + "loss": 3.4065, + "step": 8113 + }, + { + "epoch": 0.4, + "grad_norm": 0.6094711422920227, + "learning_rate": 0.0005743633907906348, + "loss": 3.5076, + "step": 8114 + }, + { + "epoch": 0.4, + "grad_norm": 0.5201326608657837, + "learning_rate": 0.0005743571629013704, + "loss": 3.4203, + "step": 8115 + }, + { + "epoch": 0.4, + "grad_norm": 0.5079147219657898, + "learning_rate": 0.000574350934289502, + "loss": 3.2888, + "step": 8116 + }, + { + "epoch": 0.4, + "grad_norm": 0.5209977030754089, + "learning_rate": 0.0005743447049550461, + "loss": 3.292, + "step": 8117 + }, + { + "epoch": 0.4, + "grad_norm": 0.524752140045166, + "learning_rate": 0.000574338474898019, + "loss": 3.129, + "step": 8118 + }, + { + "epoch": 0.4, + "grad_norm": 0.5393475890159607, + "learning_rate": 0.0005743322441184372, + "loss": 3.1805, + "step": 8119 + }, + { + "epoch": 0.4, + "grad_norm": 0.5741223096847534, + "learning_rate": 0.000574326012616317, + "loss": 3.3835, + "step": 8120 + }, + { + "epoch": 0.4, + "grad_norm": 0.531087338924408, + "learning_rate": 0.0005743197803916749, + "loss": 3.4494, + "step": 8121 + }, + { + "epoch": 0.4, + "grad_norm": 0.5499753952026367, + "learning_rate": 0.0005743135474445273, + "loss": 3.1866, + "step": 8122 + }, + { + "epoch": 0.4, + "grad_norm": 0.5074490904808044, + "learning_rate": 0.0005743073137748906, + "loss": 3.3362, + "step": 8123 + }, + { + "epoch": 0.4, + "grad_norm": 0.4964973032474518, + "learning_rate": 0.0005743010793827813, + "loss": 3.1807, + "step": 8124 + }, + { + "epoch": 0.4, + "grad_norm": 0.4979309141635895, + "learning_rate": 0.0005742948442682157, + "loss": 3.3788, + "step": 8125 + }, + { + "epoch": 0.4, + "grad_norm": 0.5132046341896057, + "learning_rate": 0.0005742886084312102, + "loss": 3.2557, + "step": 8126 + }, + { + "epoch": 0.4, + "grad_norm": 0.5504202246665955, + "learning_rate": 0.0005742823718717814, + "loss": 3.3969, + "step": 8127 + }, + { + "epoch": 0.4, + "grad_norm": 0.5701044797897339, + "learning_rate": 0.0005742761345899455, + "loss": 3.2887, + "step": 8128 + }, + { + "epoch": 0.4, + "grad_norm": 0.5031047463417053, + "learning_rate": 0.0005742698965857192, + "loss": 3.2875, + "step": 8129 + }, + { + "epoch": 0.4, + "grad_norm": 0.512702465057373, + "learning_rate": 0.0005742636578591186, + "loss": 3.2581, + "step": 8130 + }, + { + "epoch": 0.4, + "grad_norm": 0.5461293458938599, + "learning_rate": 0.0005742574184101605, + "loss": 3.3255, + "step": 8131 + }, + { + "epoch": 0.4, + "grad_norm": 0.5259397625923157, + "learning_rate": 0.000574251178238861, + "loss": 2.9756, + "step": 8132 + }, + { + "epoch": 0.4, + "grad_norm": 0.5428778529167175, + "learning_rate": 0.0005742449373452367, + "loss": 3.2215, + "step": 8133 + }, + { + "epoch": 0.4, + "grad_norm": 0.5212182402610779, + "learning_rate": 0.0005742386957293041, + "loss": 3.4854, + "step": 8134 + }, + { + "epoch": 0.4, + "grad_norm": 0.537875235080719, + "learning_rate": 0.0005742324533910794, + "loss": 3.467, + "step": 8135 + }, + { + "epoch": 0.4, + "grad_norm": 0.5199194550514221, + "learning_rate": 0.0005742262103305793, + "loss": 3.432, + "step": 8136 + }, + { + "epoch": 0.4, + "grad_norm": 0.498753160238266, + "learning_rate": 0.0005742199665478201, + "loss": 3.2888, + "step": 8137 + }, + { + "epoch": 0.4, + "grad_norm": 0.5349212288856506, + "learning_rate": 0.0005742137220428184, + "loss": 3.3626, + "step": 8138 + }, + { + "epoch": 0.4, + "grad_norm": 0.5592776536941528, + "learning_rate": 0.0005742074768155903, + "loss": 3.5535, + "step": 8139 + }, + { + "epoch": 0.4, + "grad_norm": 0.5438594818115234, + "learning_rate": 0.0005742012308661527, + "loss": 3.3263, + "step": 8140 + }, + { + "epoch": 0.4, + "grad_norm": 0.5591511726379395, + "learning_rate": 0.0005741949841945217, + "loss": 3.191, + "step": 8141 + }, + { + "epoch": 0.4, + "grad_norm": 0.5086604356765747, + "learning_rate": 0.000574188736800714, + "loss": 3.4601, + "step": 8142 + }, + { + "epoch": 0.4, + "grad_norm": 0.5508185029029846, + "learning_rate": 0.0005741824886847458, + "loss": 3.2594, + "step": 8143 + }, + { + "epoch": 0.4, + "grad_norm": 0.5638971328735352, + "learning_rate": 0.0005741762398466337, + "loss": 3.366, + "step": 8144 + }, + { + "epoch": 0.4, + "grad_norm": 0.5249466300010681, + "learning_rate": 0.0005741699902863943, + "loss": 3.3142, + "step": 8145 + }, + { + "epoch": 0.4, + "grad_norm": 0.5074349045753479, + "learning_rate": 0.0005741637400040438, + "loss": 3.4865, + "step": 8146 + }, + { + "epoch": 0.4, + "grad_norm": 0.5369219779968262, + "learning_rate": 0.0005741574889995987, + "loss": 3.2189, + "step": 8147 + }, + { + "epoch": 0.4, + "grad_norm": 0.546882688999176, + "learning_rate": 0.0005741512372730757, + "loss": 3.2913, + "step": 8148 + }, + { + "epoch": 0.4, + "grad_norm": 0.5176022052764893, + "learning_rate": 0.000574144984824491, + "loss": 3.2776, + "step": 8149 + }, + { + "epoch": 0.4, + "grad_norm": 0.5151086449623108, + "learning_rate": 0.0005741387316538612, + "loss": 3.3433, + "step": 8150 + }, + { + "epoch": 0.4, + "grad_norm": 0.5456839799880981, + "learning_rate": 0.0005741324777612027, + "loss": 3.3781, + "step": 8151 + }, + { + "epoch": 0.4, + "grad_norm": 0.591968834400177, + "learning_rate": 0.0005741262231465321, + "loss": 3.3347, + "step": 8152 + }, + { + "epoch": 0.4, + "grad_norm": 0.5454823970794678, + "learning_rate": 0.0005741199678098657, + "loss": 3.6877, + "step": 8153 + }, + { + "epoch": 0.4, + "grad_norm": 0.5745152831077576, + "learning_rate": 0.00057411371175122, + "loss": 3.3773, + "step": 8154 + }, + { + "epoch": 0.4, + "grad_norm": 0.5442945957183838, + "learning_rate": 0.0005741074549706117, + "loss": 3.4401, + "step": 8155 + }, + { + "epoch": 0.4, + "grad_norm": 0.5125526785850525, + "learning_rate": 0.000574101197468057, + "loss": 3.3032, + "step": 8156 + }, + { + "epoch": 0.4, + "grad_norm": 0.5886013507843018, + "learning_rate": 0.0005740949392435726, + "loss": 3.3589, + "step": 8157 + }, + { + "epoch": 0.4, + "grad_norm": 0.5841769576072693, + "learning_rate": 0.0005740886802971748, + "loss": 3.1734, + "step": 8158 + }, + { + "epoch": 0.4, + "grad_norm": 0.5075914263725281, + "learning_rate": 0.0005740824206288801, + "loss": 3.2588, + "step": 8159 + }, + { + "epoch": 0.4, + "grad_norm": 0.5713216662406921, + "learning_rate": 0.0005740761602387052, + "loss": 3.1835, + "step": 8160 + }, + { + "epoch": 0.4, + "grad_norm": 0.5094101428985596, + "learning_rate": 0.0005740698991266664, + "loss": 3.5436, + "step": 8161 + }, + { + "epoch": 0.4, + "grad_norm": 0.49177950620651245, + "learning_rate": 0.0005740636372927802, + "loss": 3.3805, + "step": 8162 + }, + { + "epoch": 0.4, + "grad_norm": 0.5621153116226196, + "learning_rate": 0.0005740573747370632, + "loss": 3.1933, + "step": 8163 + }, + { + "epoch": 0.4, + "grad_norm": 0.5192760229110718, + "learning_rate": 0.0005740511114595317, + "loss": 3.2768, + "step": 8164 + }, + { + "epoch": 0.4, + "grad_norm": 0.5471800565719604, + "learning_rate": 0.0005740448474602024, + "loss": 3.3567, + "step": 8165 + }, + { + "epoch": 0.4, + "grad_norm": 0.541714608669281, + "learning_rate": 0.0005740385827390916, + "loss": 3.1748, + "step": 8166 + }, + { + "epoch": 0.4, + "grad_norm": 0.4933028519153595, + "learning_rate": 0.000574032317296216, + "loss": 3.3155, + "step": 8167 + }, + { + "epoch": 0.4, + "grad_norm": 0.5285734534263611, + "learning_rate": 0.0005740260511315921, + "loss": 3.0752, + "step": 8168 + }, + { + "epoch": 0.4, + "grad_norm": 0.5299739837646484, + "learning_rate": 0.0005740197842452362, + "loss": 3.3437, + "step": 8169 + }, + { + "epoch": 0.4, + "grad_norm": 0.5052409768104553, + "learning_rate": 0.000574013516637165, + "loss": 3.4159, + "step": 8170 + }, + { + "epoch": 0.4, + "grad_norm": 0.522113561630249, + "learning_rate": 0.0005740072483073948, + "loss": 3.3624, + "step": 8171 + }, + { + "epoch": 0.4, + "grad_norm": 0.5488013625144958, + "learning_rate": 0.0005740009792559424, + "loss": 3.338, + "step": 8172 + }, + { + "epoch": 0.4, + "grad_norm": 0.5112448334693909, + "learning_rate": 0.000573994709482824, + "loss": 3.3995, + "step": 8173 + }, + { + "epoch": 0.4, + "grad_norm": 0.5400604605674744, + "learning_rate": 0.0005739884389880564, + "loss": 3.0325, + "step": 8174 + }, + { + "epoch": 0.4, + "grad_norm": 0.5453541874885559, + "learning_rate": 0.000573982167771656, + "loss": 3.333, + "step": 8175 + }, + { + "epoch": 0.4, + "grad_norm": 0.5471153855323792, + "learning_rate": 0.0005739758958336392, + "loss": 3.2572, + "step": 8176 + }, + { + "epoch": 0.4, + "grad_norm": 0.5193156003952026, + "learning_rate": 0.0005739696231740226, + "loss": 3.1392, + "step": 8177 + }, + { + "epoch": 0.4, + "grad_norm": 0.5482691526412964, + "learning_rate": 0.0005739633497928228, + "loss": 3.2852, + "step": 8178 + }, + { + "epoch": 0.4, + "grad_norm": 0.5617150664329529, + "learning_rate": 0.0005739570756900563, + "loss": 3.3936, + "step": 8179 + }, + { + "epoch": 0.4, + "grad_norm": 0.5217198133468628, + "learning_rate": 0.0005739508008657394, + "loss": 3.2155, + "step": 8180 + }, + { + "epoch": 0.4, + "grad_norm": 0.5516659021377563, + "learning_rate": 0.000573944525319889, + "loss": 3.2566, + "step": 8181 + }, + { + "epoch": 0.4, + "grad_norm": 0.5190207362174988, + "learning_rate": 0.0005739382490525214, + "loss": 3.6173, + "step": 8182 + }, + { + "epoch": 0.4, + "grad_norm": 0.5515297055244446, + "learning_rate": 0.0005739319720636532, + "loss": 3.1461, + "step": 8183 + }, + { + "epoch": 0.4, + "grad_norm": 0.4930954873561859, + "learning_rate": 0.0005739256943533007, + "loss": 3.3663, + "step": 8184 + }, + { + "epoch": 0.4, + "grad_norm": 0.5274080038070679, + "learning_rate": 0.0005739194159214808, + "loss": 3.5295, + "step": 8185 + }, + { + "epoch": 0.4, + "grad_norm": 0.5591486096382141, + "learning_rate": 0.00057391313676821, + "loss": 3.4101, + "step": 8186 + }, + { + "epoch": 0.4, + "grad_norm": 0.5552020072937012, + "learning_rate": 0.0005739068568935046, + "loss": 3.3131, + "step": 8187 + }, + { + "epoch": 0.4, + "grad_norm": 0.5581715106964111, + "learning_rate": 0.0005739005762973813, + "loss": 3.1061, + "step": 8188 + }, + { + "epoch": 0.4, + "grad_norm": 0.5154845714569092, + "learning_rate": 0.0005738942949798564, + "loss": 3.3115, + "step": 8189 + }, + { + "epoch": 0.4, + "grad_norm": 0.49115267395973206, + "learning_rate": 0.0005738880129409469, + "loss": 3.395, + "step": 8190 + }, + { + "epoch": 0.4, + "grad_norm": 0.5890899896621704, + "learning_rate": 0.0005738817301806689, + "loss": 3.1336, + "step": 8191 + }, + { + "epoch": 0.4, + "grad_norm": 0.5730346441268921, + "learning_rate": 0.0005738754466990393, + "loss": 3.2823, + "step": 8192 + }, + { + "epoch": 0.4, + "grad_norm": 0.5348773002624512, + "learning_rate": 0.0005738691624960743, + "loss": 3.3428, + "step": 8193 + }, + { + "epoch": 0.4, + "grad_norm": 0.5359728336334229, + "learning_rate": 0.0005738628775717907, + "loss": 3.3814, + "step": 8194 + }, + { + "epoch": 0.4, + "grad_norm": 0.5193948149681091, + "learning_rate": 0.000573856591926205, + "loss": 3.3435, + "step": 8195 + }, + { + "epoch": 0.4, + "grad_norm": 0.5473117232322693, + "learning_rate": 0.0005738503055593338, + "loss": 3.5604, + "step": 8196 + }, + { + "epoch": 0.4, + "grad_norm": 0.5555245280265808, + "learning_rate": 0.0005738440184711934, + "loss": 3.3051, + "step": 8197 + }, + { + "epoch": 0.4, + "grad_norm": 0.6440548896789551, + "learning_rate": 0.0005738377306618008, + "loss": 3.357, + "step": 8198 + }, + { + "epoch": 0.4, + "grad_norm": 0.4919479489326477, + "learning_rate": 0.0005738314421311722, + "loss": 3.1436, + "step": 8199 + }, + { + "epoch": 0.4, + "grad_norm": 0.5028539299964905, + "learning_rate": 0.0005738251528793244, + "loss": 3.3776, + "step": 8200 + }, + { + "epoch": 0.4, + "grad_norm": 0.5393859148025513, + "learning_rate": 0.0005738188629062737, + "loss": 3.3237, + "step": 8201 + }, + { + "epoch": 0.4, + "grad_norm": 0.5415965914726257, + "learning_rate": 0.0005738125722120368, + "loss": 3.3788, + "step": 8202 + }, + { + "epoch": 0.4, + "grad_norm": 0.5266854763031006, + "learning_rate": 0.0005738062807966303, + "loss": 3.3418, + "step": 8203 + }, + { + "epoch": 0.4, + "grad_norm": 0.5485467314720154, + "learning_rate": 0.0005737999886600709, + "loss": 3.3907, + "step": 8204 + }, + { + "epoch": 0.4, + "grad_norm": 0.5207034945487976, + "learning_rate": 0.0005737936958023749, + "loss": 3.3886, + "step": 8205 + }, + { + "epoch": 0.4, + "grad_norm": 0.5115718841552734, + "learning_rate": 0.0005737874022235589, + "loss": 3.4131, + "step": 8206 + }, + { + "epoch": 0.4, + "grad_norm": 0.52555251121521, + "learning_rate": 0.0005737811079236397, + "loss": 3.308, + "step": 8207 + }, + { + "epoch": 0.4, + "grad_norm": 0.5632022023200989, + "learning_rate": 0.0005737748129026338, + "loss": 3.2812, + "step": 8208 + }, + { + "epoch": 0.4, + "grad_norm": 0.5216935873031616, + "learning_rate": 0.0005737685171605577, + "loss": 3.3546, + "step": 8209 + }, + { + "epoch": 0.4, + "grad_norm": 0.5155556201934814, + "learning_rate": 0.0005737622206974279, + "loss": 3.3297, + "step": 8210 + }, + { + "epoch": 0.4, + "grad_norm": 0.5012245178222656, + "learning_rate": 0.0005737559235132612, + "loss": 3.4148, + "step": 8211 + }, + { + "epoch": 0.4, + "grad_norm": 0.8418143391609192, + "learning_rate": 0.000573749625608074, + "loss": 3.4831, + "step": 8212 + }, + { + "epoch": 0.4, + "grad_norm": 0.5532170534133911, + "learning_rate": 0.0005737433269818831, + "loss": 3.3624, + "step": 8213 + }, + { + "epoch": 0.4, + "grad_norm": 0.5478039979934692, + "learning_rate": 0.0005737370276347049, + "loss": 3.1837, + "step": 8214 + }, + { + "epoch": 0.4, + "grad_norm": 0.5057279467582703, + "learning_rate": 0.0005737307275665561, + "loss": 3.2443, + "step": 8215 + }, + { + "epoch": 0.4, + "grad_norm": 0.5062552690505981, + "learning_rate": 0.0005737244267774531, + "loss": 3.423, + "step": 8216 + }, + { + "epoch": 0.4, + "grad_norm": 0.5578494071960449, + "learning_rate": 0.0005737181252674127, + "loss": 3.2908, + "step": 8217 + }, + { + "epoch": 0.4, + "grad_norm": 0.5365298390388489, + "learning_rate": 0.0005737118230364515, + "loss": 3.4519, + "step": 8218 + }, + { + "epoch": 0.4, + "grad_norm": 0.4922030568122864, + "learning_rate": 0.0005737055200845861, + "loss": 3.3189, + "step": 8219 + }, + { + "epoch": 0.4, + "grad_norm": 0.5830522775650024, + "learning_rate": 0.0005736992164118328, + "loss": 3.1601, + "step": 8220 + }, + { + "epoch": 0.4, + "grad_norm": 0.5145552158355713, + "learning_rate": 0.0005736929120182086, + "loss": 3.4044, + "step": 8221 + }, + { + "epoch": 0.4, + "grad_norm": 0.5255774259567261, + "learning_rate": 0.0005736866069037299, + "loss": 3.2494, + "step": 8222 + }, + { + "epoch": 0.4, + "grad_norm": 0.518876314163208, + "learning_rate": 0.0005736803010684134, + "loss": 3.1429, + "step": 8223 + }, + { + "epoch": 0.4, + "grad_norm": 0.49960964918136597, + "learning_rate": 0.0005736739945122756, + "loss": 3.1982, + "step": 8224 + }, + { + "epoch": 0.4, + "grad_norm": 0.5569823980331421, + "learning_rate": 0.0005736676872353331, + "loss": 3.2566, + "step": 8225 + }, + { + "epoch": 0.4, + "grad_norm": 0.5008841753005981, + "learning_rate": 0.0005736613792376027, + "loss": 3.3866, + "step": 8226 + }, + { + "epoch": 0.4, + "grad_norm": 0.5104358196258545, + "learning_rate": 0.0005736550705191009, + "loss": 3.291, + "step": 8227 + }, + { + "epoch": 0.4, + "grad_norm": 0.5353780388832092, + "learning_rate": 0.0005736487610798443, + "loss": 3.2402, + "step": 8228 + }, + { + "epoch": 0.4, + "grad_norm": 0.5313364863395691, + "learning_rate": 0.0005736424509198495, + "loss": 3.5111, + "step": 8229 + }, + { + "epoch": 0.4, + "grad_norm": 0.520453155040741, + "learning_rate": 0.0005736361400391332, + "loss": 3.4149, + "step": 8230 + }, + { + "epoch": 0.4, + "grad_norm": 0.5445863008499146, + "learning_rate": 0.0005736298284377119, + "loss": 3.2695, + "step": 8231 + }, + { + "epoch": 0.4, + "grad_norm": 0.5336853265762329, + "learning_rate": 0.0005736235161156023, + "loss": 3.4905, + "step": 8232 + }, + { + "epoch": 0.4, + "grad_norm": 0.5520970225334167, + "learning_rate": 0.000573617203072821, + "loss": 3.3523, + "step": 8233 + }, + { + "epoch": 0.4, + "grad_norm": 0.5282114744186401, + "learning_rate": 0.0005736108893093847, + "loss": 3.2539, + "step": 8234 + }, + { + "epoch": 0.4, + "grad_norm": 0.560897946357727, + "learning_rate": 0.00057360457482531, + "loss": 3.3655, + "step": 8235 + }, + { + "epoch": 0.4, + "grad_norm": 0.5395894646644592, + "learning_rate": 0.0005735982596206133, + "loss": 3.3413, + "step": 8236 + }, + { + "epoch": 0.4, + "grad_norm": 0.5417824983596802, + "learning_rate": 0.0005735919436953117, + "loss": 3.2808, + "step": 8237 + }, + { + "epoch": 0.4, + "grad_norm": 0.5005999207496643, + "learning_rate": 0.0005735856270494214, + "loss": 3.3514, + "step": 8238 + }, + { + "epoch": 0.4, + "grad_norm": 0.5334196090698242, + "learning_rate": 0.0005735793096829593, + "loss": 3.4646, + "step": 8239 + }, + { + "epoch": 0.4, + "grad_norm": 0.571017861366272, + "learning_rate": 0.0005735729915959419, + "loss": 3.3096, + "step": 8240 + }, + { + "epoch": 0.4, + "grad_norm": 0.560081422328949, + "learning_rate": 0.0005735666727883861, + "loss": 3.3187, + "step": 8241 + }, + { + "epoch": 0.4, + "grad_norm": 0.5151913166046143, + "learning_rate": 0.000573560353260308, + "loss": 3.3688, + "step": 8242 + }, + { + "epoch": 0.4, + "grad_norm": 0.5449792742729187, + "learning_rate": 0.0005735540330117247, + "loss": 3.1677, + "step": 8243 + }, + { + "epoch": 0.4, + "grad_norm": 0.530566394329071, + "learning_rate": 0.0005735477120426528, + "loss": 3.0801, + "step": 8244 + }, + { + "epoch": 0.4, + "grad_norm": 0.5325320363044739, + "learning_rate": 0.0005735413903531089, + "loss": 3.2273, + "step": 8245 + }, + { + "epoch": 0.4, + "grad_norm": 0.5118095874786377, + "learning_rate": 0.0005735350679431095, + "loss": 3.2628, + "step": 8246 + }, + { + "epoch": 0.4, + "grad_norm": 0.5123176574707031, + "learning_rate": 0.0005735287448126714, + "loss": 3.2, + "step": 8247 + }, + { + "epoch": 0.4, + "grad_norm": 0.5000588297843933, + "learning_rate": 0.0005735224209618113, + "loss": 3.2256, + "step": 8248 + }, + { + "epoch": 0.4, + "grad_norm": 0.6550585627555847, + "learning_rate": 0.0005735160963905458, + "loss": 3.3236, + "step": 8249 + }, + { + "epoch": 0.4, + "grad_norm": 0.5188483595848083, + "learning_rate": 0.0005735097710988914, + "loss": 3.2484, + "step": 8250 + }, + { + "epoch": 0.4, + "grad_norm": 0.5245037078857422, + "learning_rate": 0.000573503445086865, + "loss": 3.1947, + "step": 8251 + }, + { + "epoch": 0.4, + "grad_norm": 0.5324363708496094, + "learning_rate": 0.0005734971183544832, + "loss": 3.2969, + "step": 8252 + }, + { + "epoch": 0.4, + "grad_norm": 0.5693351626396179, + "learning_rate": 0.0005734907909017624, + "loss": 3.4193, + "step": 8253 + }, + { + "epoch": 0.4, + "grad_norm": 0.5394246578216553, + "learning_rate": 0.0005734844627287197, + "loss": 3.3065, + "step": 8254 + }, + { + "epoch": 0.4, + "grad_norm": 0.5450414419174194, + "learning_rate": 0.0005734781338353716, + "loss": 3.4544, + "step": 8255 + }, + { + "epoch": 0.4, + "grad_norm": 0.5223867297172546, + "learning_rate": 0.0005734718042217345, + "loss": 3.3402, + "step": 8256 + }, + { + "epoch": 0.4, + "grad_norm": 0.5268417000770569, + "learning_rate": 0.0005734654738878256, + "loss": 3.254, + "step": 8257 + }, + { + "epoch": 0.4, + "grad_norm": 0.5623879432678223, + "learning_rate": 0.0005734591428336611, + "loss": 3.2588, + "step": 8258 + }, + { + "epoch": 0.4, + "grad_norm": 0.5085955858230591, + "learning_rate": 0.0005734528110592577, + "loss": 3.3621, + "step": 8259 + }, + { + "epoch": 0.4, + "grad_norm": 0.5223268270492554, + "learning_rate": 0.0005734464785646325, + "loss": 3.201, + "step": 8260 + }, + { + "epoch": 0.4, + "grad_norm": 0.4991510808467865, + "learning_rate": 0.0005734401453498018, + "loss": 3.2424, + "step": 8261 + }, + { + "epoch": 0.4, + "grad_norm": 0.5398638248443604, + "learning_rate": 0.0005734338114147823, + "loss": 3.4962, + "step": 8262 + }, + { + "epoch": 0.4, + "grad_norm": 0.5050841569900513, + "learning_rate": 0.0005734274767595908, + "loss": 3.4429, + "step": 8263 + }, + { + "epoch": 0.4, + "grad_norm": 0.5685188174247742, + "learning_rate": 0.000573421141384244, + "loss": 3.1578, + "step": 8264 + }, + { + "epoch": 0.41, + "grad_norm": 0.5070170164108276, + "learning_rate": 0.0005734148052887585, + "loss": 3.3676, + "step": 8265 + }, + { + "epoch": 0.41, + "grad_norm": 0.5665714144706726, + "learning_rate": 0.0005734084684731511, + "loss": 3.2815, + "step": 8266 + }, + { + "epoch": 0.41, + "grad_norm": 0.498685359954834, + "learning_rate": 0.0005734021309374383, + "loss": 3.3352, + "step": 8267 + }, + { + "epoch": 0.41, + "grad_norm": 0.509119987487793, + "learning_rate": 0.000573395792681637, + "loss": 3.2356, + "step": 8268 + }, + { + "epoch": 0.41, + "grad_norm": 0.5538164973258972, + "learning_rate": 0.0005733894537057638, + "loss": 3.2961, + "step": 8269 + }, + { + "epoch": 0.41, + "grad_norm": 0.5612331628799438, + "learning_rate": 0.0005733831140098353, + "loss": 3.3455, + "step": 8270 + }, + { + "epoch": 0.41, + "grad_norm": 0.5053423047065735, + "learning_rate": 0.0005733767735938683, + "loss": 3.3879, + "step": 8271 + }, + { + "epoch": 0.41, + "grad_norm": 0.56037837266922, + "learning_rate": 0.0005733704324578795, + "loss": 3.0991, + "step": 8272 + }, + { + "epoch": 0.41, + "grad_norm": 0.5380472540855408, + "learning_rate": 0.0005733640906018856, + "loss": 3.2985, + "step": 8273 + }, + { + "epoch": 0.41, + "grad_norm": 0.5246527791023254, + "learning_rate": 0.0005733577480259034, + "loss": 3.3638, + "step": 8274 + }, + { + "epoch": 0.41, + "grad_norm": 0.5476314425468445, + "learning_rate": 0.0005733514047299494, + "loss": 3.4797, + "step": 8275 + }, + { + "epoch": 0.41, + "grad_norm": 0.5380063652992249, + "learning_rate": 0.0005733450607140404, + "loss": 3.3618, + "step": 8276 + }, + { + "epoch": 0.41, + "grad_norm": 0.5321431159973145, + "learning_rate": 0.0005733387159781931, + "loss": 3.3288, + "step": 8277 + }, + { + "epoch": 0.41, + "grad_norm": 0.6225263476371765, + "learning_rate": 0.0005733323705224243, + "loss": 3.3563, + "step": 8278 + }, + { + "epoch": 0.41, + "grad_norm": 0.5668192505836487, + "learning_rate": 0.0005733260243467506, + "loss": 3.2277, + "step": 8279 + }, + { + "epoch": 0.41, + "grad_norm": 0.5005346536636353, + "learning_rate": 0.0005733196774511888, + "loss": 3.4365, + "step": 8280 + }, + { + "epoch": 0.41, + "grad_norm": 0.597763180732727, + "learning_rate": 0.0005733133298357555, + "loss": 3.102, + "step": 8281 + }, + { + "epoch": 0.41, + "grad_norm": 0.5184804797172546, + "learning_rate": 0.0005733069815004675, + "loss": 3.5517, + "step": 8282 + }, + { + "epoch": 0.41, + "grad_norm": 0.540427565574646, + "learning_rate": 0.0005733006324453415, + "loss": 3.3953, + "step": 8283 + }, + { + "epoch": 0.41, + "grad_norm": 0.595478355884552, + "learning_rate": 0.0005732942826703943, + "loss": 3.3344, + "step": 8284 + }, + { + "epoch": 0.41, + "grad_norm": 0.551795482635498, + "learning_rate": 0.0005732879321756426, + "loss": 3.3396, + "step": 8285 + }, + { + "epoch": 0.41, + "grad_norm": 0.5238455533981323, + "learning_rate": 0.0005732815809611029, + "loss": 3.5336, + "step": 8286 + }, + { + "epoch": 0.41, + "grad_norm": 0.5102473497390747, + "learning_rate": 0.0005732752290267921, + "loss": 3.5597, + "step": 8287 + }, + { + "epoch": 0.41, + "grad_norm": 0.5258164405822754, + "learning_rate": 0.0005732688763727273, + "loss": 3.4711, + "step": 8288 + }, + { + "epoch": 0.41, + "grad_norm": 0.5059826970100403, + "learning_rate": 0.0005732625229989245, + "loss": 3.2522, + "step": 8289 + }, + { + "epoch": 0.41, + "grad_norm": 0.5301188826560974, + "learning_rate": 0.0005732561689054009, + "loss": 3.5621, + "step": 8290 + }, + { + "epoch": 0.41, + "grad_norm": 0.5123043060302734, + "learning_rate": 0.0005732498140921732, + "loss": 3.4555, + "step": 8291 + }, + { + "epoch": 0.41, + "grad_norm": 0.5111595988273621, + "learning_rate": 0.000573243458559258, + "loss": 3.3539, + "step": 8292 + }, + { + "epoch": 0.41, + "grad_norm": 0.6053318977355957, + "learning_rate": 0.0005732371023066721, + "loss": 3.4818, + "step": 8293 + }, + { + "epoch": 0.41, + "grad_norm": 0.5144967436790466, + "learning_rate": 0.0005732307453344323, + "loss": 3.4808, + "step": 8294 + }, + { + "epoch": 0.41, + "grad_norm": 0.5194787383079529, + "learning_rate": 0.0005732243876425554, + "loss": 3.1705, + "step": 8295 + }, + { + "epoch": 0.41, + "grad_norm": 0.5327216386795044, + "learning_rate": 0.0005732180292310579, + "loss": 3.1993, + "step": 8296 + }, + { + "epoch": 0.41, + "grad_norm": 0.49364081025123596, + "learning_rate": 0.0005732116700999568, + "loss": 3.4751, + "step": 8297 + }, + { + "epoch": 0.41, + "grad_norm": 0.5248584747314453, + "learning_rate": 0.0005732053102492687, + "loss": 3.7122, + "step": 8298 + }, + { + "epoch": 0.41, + "grad_norm": 0.5066429376602173, + "learning_rate": 0.0005731989496790105, + "loss": 3.073, + "step": 8299 + }, + { + "epoch": 0.41, + "grad_norm": 0.5377814173698425, + "learning_rate": 0.0005731925883891986, + "loss": 3.3839, + "step": 8300 + }, + { + "epoch": 0.41, + "grad_norm": 0.5316858291625977, + "learning_rate": 0.0005731862263798502, + "loss": 3.0971, + "step": 8301 + }, + { + "epoch": 0.41, + "grad_norm": 0.5127855539321899, + "learning_rate": 0.0005731798636509817, + "loss": 3.1861, + "step": 8302 + }, + { + "epoch": 0.41, + "grad_norm": 0.5574333071708679, + "learning_rate": 0.0005731735002026102, + "loss": 3.3133, + "step": 8303 + }, + { + "epoch": 0.41, + "grad_norm": 0.5213868021965027, + "learning_rate": 0.0005731671360347521, + "loss": 3.1549, + "step": 8304 + }, + { + "epoch": 0.41, + "grad_norm": 0.5839649438858032, + "learning_rate": 0.0005731607711474244, + "loss": 3.3224, + "step": 8305 + }, + { + "epoch": 0.41, + "grad_norm": 0.524940013885498, + "learning_rate": 0.0005731544055406439, + "loss": 3.5019, + "step": 8306 + }, + { + "epoch": 0.41, + "grad_norm": 0.5284970998764038, + "learning_rate": 0.0005731480392144272, + "loss": 3.2947, + "step": 8307 + }, + { + "epoch": 0.41, + "grad_norm": 0.5189017057418823, + "learning_rate": 0.000573141672168791, + "loss": 3.0949, + "step": 8308 + }, + { + "epoch": 0.41, + "grad_norm": 0.5156539678573608, + "learning_rate": 0.0005731353044037524, + "loss": 3.3861, + "step": 8309 + }, + { + "epoch": 0.41, + "grad_norm": 0.49928221106529236, + "learning_rate": 0.0005731289359193278, + "loss": 3.4869, + "step": 8310 + }, + { + "epoch": 0.41, + "grad_norm": 0.546218991279602, + "learning_rate": 0.0005731225667155343, + "loss": 3.2345, + "step": 8311 + }, + { + "epoch": 0.41, + "grad_norm": 0.5048373937606812, + "learning_rate": 0.0005731161967923885, + "loss": 3.1884, + "step": 8312 + }, + { + "epoch": 0.41, + "grad_norm": 0.5357444286346436, + "learning_rate": 0.0005731098261499071, + "loss": 3.3281, + "step": 8313 + }, + { + "epoch": 0.41, + "grad_norm": 0.5371036529541016, + "learning_rate": 0.0005731034547881071, + "loss": 3.3279, + "step": 8314 + }, + { + "epoch": 0.41, + "grad_norm": 0.539368748664856, + "learning_rate": 0.000573097082707005, + "loss": 3.4323, + "step": 8315 + }, + { + "epoch": 0.41, + "grad_norm": 0.48763561248779297, + "learning_rate": 0.0005730907099066179, + "loss": 3.5516, + "step": 8316 + }, + { + "epoch": 0.41, + "grad_norm": 0.5571647882461548, + "learning_rate": 0.0005730843363869624, + "loss": 3.4282, + "step": 8317 + }, + { + "epoch": 0.41, + "grad_norm": 0.51971834897995, + "learning_rate": 0.0005730779621480552, + "loss": 3.2306, + "step": 8318 + }, + { + "epoch": 0.41, + "grad_norm": 0.5173705816268921, + "learning_rate": 0.0005730715871899133, + "loss": 3.3458, + "step": 8319 + }, + { + "epoch": 0.41, + "grad_norm": 0.5432358980178833, + "learning_rate": 0.0005730652115125533, + "loss": 3.3075, + "step": 8320 + }, + { + "epoch": 0.41, + "grad_norm": 0.5046690106391907, + "learning_rate": 0.0005730588351159922, + "loss": 3.2926, + "step": 8321 + }, + { + "epoch": 0.41, + "grad_norm": 0.5098605155944824, + "learning_rate": 0.0005730524580002468, + "loss": 3.21, + "step": 8322 + }, + { + "epoch": 0.41, + "grad_norm": 0.5081766247749329, + "learning_rate": 0.0005730460801653335, + "loss": 3.401, + "step": 8323 + }, + { + "epoch": 0.41, + "grad_norm": 0.5278263092041016, + "learning_rate": 0.0005730397016112694, + "loss": 3.2034, + "step": 8324 + }, + { + "epoch": 0.41, + "grad_norm": 0.5464687347412109, + "learning_rate": 0.0005730333223380714, + "loss": 3.4792, + "step": 8325 + }, + { + "epoch": 0.41, + "grad_norm": 0.5445109009742737, + "learning_rate": 0.0005730269423457561, + "loss": 3.5362, + "step": 8326 + }, + { + "epoch": 0.41, + "grad_norm": 0.5181647539138794, + "learning_rate": 0.0005730205616343405, + "loss": 3.1952, + "step": 8327 + }, + { + "epoch": 0.41, + "grad_norm": 0.5573515892028809, + "learning_rate": 0.0005730141802038413, + "loss": 3.1039, + "step": 8328 + }, + { + "epoch": 0.41, + "grad_norm": 0.5178354978561401, + "learning_rate": 0.0005730077980542751, + "loss": 3.536, + "step": 8329 + }, + { + "epoch": 0.41, + "grad_norm": 0.5695675015449524, + "learning_rate": 0.000573001415185659, + "loss": 3.3917, + "step": 8330 + }, + { + "epoch": 0.41, + "grad_norm": 0.5261296629905701, + "learning_rate": 0.0005729950315980098, + "loss": 3.4801, + "step": 8331 + }, + { + "epoch": 0.41, + "grad_norm": 0.5165058374404907, + "learning_rate": 0.0005729886472913441, + "loss": 3.3878, + "step": 8332 + }, + { + "epoch": 0.41, + "grad_norm": 0.513191819190979, + "learning_rate": 0.0005729822622656788, + "loss": 3.4338, + "step": 8333 + }, + { + "epoch": 0.41, + "grad_norm": 0.5528197884559631, + "learning_rate": 0.0005729758765210309, + "loss": 3.5439, + "step": 8334 + }, + { + "epoch": 0.41, + "grad_norm": 0.5654256343841553, + "learning_rate": 0.000572969490057417, + "loss": 3.2453, + "step": 8335 + }, + { + "epoch": 0.41, + "grad_norm": 0.5060741901397705, + "learning_rate": 0.0005729631028748539, + "loss": 3.3054, + "step": 8336 + }, + { + "epoch": 0.41, + "grad_norm": 0.5616364479064941, + "learning_rate": 0.0005729567149733587, + "loss": 3.2475, + "step": 8337 + }, + { + "epoch": 0.41, + "grad_norm": 0.4996776878833771, + "learning_rate": 0.0005729503263529479, + "loss": 3.3329, + "step": 8338 + }, + { + "epoch": 0.41, + "grad_norm": 0.519364595413208, + "learning_rate": 0.0005729439370136387, + "loss": 3.1648, + "step": 8339 + }, + { + "epoch": 0.41, + "grad_norm": 0.5395146012306213, + "learning_rate": 0.0005729375469554474, + "loss": 3.2874, + "step": 8340 + }, + { + "epoch": 0.41, + "grad_norm": 0.5185738205909729, + "learning_rate": 0.0005729311561783913, + "loss": 3.092, + "step": 8341 + }, + { + "epoch": 0.41, + "grad_norm": 0.5481864809989929, + "learning_rate": 0.0005729247646824871, + "loss": 3.2024, + "step": 8342 + }, + { + "epoch": 0.41, + "grad_norm": 0.5273303389549255, + "learning_rate": 0.0005729183724677515, + "loss": 3.1049, + "step": 8343 + }, + { + "epoch": 0.41, + "grad_norm": 0.5579976439476013, + "learning_rate": 0.0005729119795342014, + "loss": 3.2024, + "step": 8344 + }, + { + "epoch": 0.41, + "grad_norm": 0.5214023590087891, + "learning_rate": 0.0005729055858818537, + "loss": 3.4417, + "step": 8345 + }, + { + "epoch": 0.41, + "grad_norm": 0.5415693521499634, + "learning_rate": 0.0005728991915107252, + "loss": 3.4384, + "step": 8346 + }, + { + "epoch": 0.41, + "grad_norm": 0.5267033576965332, + "learning_rate": 0.0005728927964208329, + "loss": 3.1189, + "step": 8347 + }, + { + "epoch": 0.41, + "grad_norm": 0.5356072783470154, + "learning_rate": 0.0005728864006121934, + "loss": 3.3506, + "step": 8348 + }, + { + "epoch": 0.41, + "grad_norm": 0.5404013395309448, + "learning_rate": 0.0005728800040848235, + "loss": 3.3664, + "step": 8349 + }, + { + "epoch": 0.41, + "grad_norm": 0.5271034836769104, + "learning_rate": 0.0005728736068387404, + "loss": 3.3377, + "step": 8350 + }, + { + "epoch": 0.41, + "grad_norm": 0.5246227383613586, + "learning_rate": 0.0005728672088739606, + "loss": 3.4265, + "step": 8351 + }, + { + "epoch": 0.41, + "grad_norm": 0.4726375937461853, + "learning_rate": 0.0005728608101905012, + "loss": 3.2613, + "step": 8352 + }, + { + "epoch": 0.41, + "grad_norm": 0.5876821875572205, + "learning_rate": 0.0005728544107883788, + "loss": 3.246, + "step": 8353 + }, + { + "epoch": 0.41, + "grad_norm": 0.5646312832832336, + "learning_rate": 0.0005728480106676105, + "loss": 3.5403, + "step": 8354 + }, + { + "epoch": 0.41, + "grad_norm": 0.5237587690353394, + "learning_rate": 0.000572841609828213, + "loss": 3.3111, + "step": 8355 + }, + { + "epoch": 0.41, + "grad_norm": 0.5066969394683838, + "learning_rate": 0.0005728352082702032, + "loss": 3.2042, + "step": 8356 + }, + { + "epoch": 0.41, + "grad_norm": 0.5408262610435486, + "learning_rate": 0.000572828805993598, + "loss": 3.1741, + "step": 8357 + }, + { + "epoch": 0.41, + "grad_norm": 0.4968174695968628, + "learning_rate": 0.0005728224029984142, + "loss": 3.2532, + "step": 8358 + }, + { + "epoch": 0.41, + "grad_norm": 0.5312142372131348, + "learning_rate": 0.0005728159992846687, + "loss": 3.1769, + "step": 8359 + }, + { + "epoch": 0.41, + "grad_norm": 0.5489519834518433, + "learning_rate": 0.0005728095948523784, + "loss": 3.2044, + "step": 8360 + }, + { + "epoch": 0.41, + "grad_norm": 0.5215625166893005, + "learning_rate": 0.0005728031897015601, + "loss": 3.2395, + "step": 8361 + }, + { + "epoch": 0.41, + "grad_norm": 0.4845183491706848, + "learning_rate": 0.0005727967838322307, + "loss": 3.3525, + "step": 8362 + }, + { + "epoch": 0.41, + "grad_norm": 0.5335796475410461, + "learning_rate": 0.0005727903772444071, + "loss": 3.3049, + "step": 8363 + }, + { + "epoch": 0.41, + "grad_norm": 0.5164490342140198, + "learning_rate": 0.0005727839699381062, + "loss": 3.1817, + "step": 8364 + }, + { + "epoch": 0.41, + "grad_norm": 0.5204142332077026, + "learning_rate": 0.0005727775619133446, + "loss": 3.0639, + "step": 8365 + }, + { + "epoch": 0.41, + "grad_norm": 0.5255699753761292, + "learning_rate": 0.0005727711531701396, + "loss": 3.285, + "step": 8366 + }, + { + "epoch": 0.41, + "grad_norm": 0.527921736240387, + "learning_rate": 0.0005727647437085078, + "loss": 3.4429, + "step": 8367 + }, + { + "epoch": 0.41, + "grad_norm": 0.5462525486946106, + "learning_rate": 0.0005727583335284662, + "loss": 3.5266, + "step": 8368 + }, + { + "epoch": 0.41, + "grad_norm": 0.5473728179931641, + "learning_rate": 0.0005727519226300317, + "loss": 3.3949, + "step": 8369 + }, + { + "epoch": 0.41, + "grad_norm": 0.5657687187194824, + "learning_rate": 0.000572745511013221, + "loss": 3.1723, + "step": 8370 + }, + { + "epoch": 0.41, + "grad_norm": 0.5343332886695862, + "learning_rate": 0.0005727390986780513, + "loss": 3.2042, + "step": 8371 + }, + { + "epoch": 0.41, + "grad_norm": 0.4889446794986725, + "learning_rate": 0.0005727326856245391, + "loss": 3.2929, + "step": 8372 + }, + { + "epoch": 0.41, + "grad_norm": 0.5153390169143677, + "learning_rate": 0.0005727262718527014, + "loss": 3.5404, + "step": 8373 + }, + { + "epoch": 0.41, + "grad_norm": 0.5464727282524109, + "learning_rate": 0.0005727198573625555, + "loss": 3.449, + "step": 8374 + }, + { + "epoch": 0.41, + "grad_norm": 0.5315839648246765, + "learning_rate": 0.0005727134421541179, + "loss": 3.2474, + "step": 8375 + }, + { + "epoch": 0.41, + "grad_norm": 0.5293451547622681, + "learning_rate": 0.0005727070262274054, + "loss": 3.3761, + "step": 8376 + }, + { + "epoch": 0.41, + "grad_norm": 0.5146985054016113, + "learning_rate": 0.0005727006095824352, + "loss": 3.2667, + "step": 8377 + }, + { + "epoch": 0.41, + "grad_norm": 0.5047792196273804, + "learning_rate": 0.0005726941922192242, + "loss": 3.4058, + "step": 8378 + }, + { + "epoch": 0.41, + "grad_norm": 0.5004801154136658, + "learning_rate": 0.0005726877741377889, + "loss": 3.2643, + "step": 8379 + }, + { + "epoch": 0.41, + "grad_norm": 0.5195822715759277, + "learning_rate": 0.0005726813553381467, + "loss": 3.354, + "step": 8380 + }, + { + "epoch": 0.41, + "grad_norm": 0.5108522772789001, + "learning_rate": 0.0005726749358203142, + "loss": 3.2618, + "step": 8381 + }, + { + "epoch": 0.41, + "grad_norm": 0.5223830342292786, + "learning_rate": 0.0005726685155843083, + "loss": 3.3946, + "step": 8382 + }, + { + "epoch": 0.41, + "grad_norm": 0.49974021315574646, + "learning_rate": 0.0005726620946301462, + "loss": 3.1381, + "step": 8383 + }, + { + "epoch": 0.41, + "grad_norm": 0.5291284918785095, + "learning_rate": 0.0005726556729578445, + "loss": 3.3743, + "step": 8384 + }, + { + "epoch": 0.41, + "grad_norm": 0.5420185327529907, + "learning_rate": 0.0005726492505674204, + "loss": 3.245, + "step": 8385 + }, + { + "epoch": 0.41, + "grad_norm": 0.5729690790176392, + "learning_rate": 0.0005726428274588904, + "loss": 3.1877, + "step": 8386 + }, + { + "epoch": 0.41, + "grad_norm": 0.5272477269172668, + "learning_rate": 0.0005726364036322718, + "loss": 3.1366, + "step": 8387 + }, + { + "epoch": 0.41, + "grad_norm": 0.5126119256019592, + "learning_rate": 0.0005726299790875814, + "loss": 3.2684, + "step": 8388 + }, + { + "epoch": 0.41, + "grad_norm": 0.5218641757965088, + "learning_rate": 0.000572623553824836, + "loss": 3.1348, + "step": 8389 + }, + { + "epoch": 0.41, + "grad_norm": 0.505780041217804, + "learning_rate": 0.0005726171278440527, + "loss": 3.2774, + "step": 8390 + }, + { + "epoch": 0.41, + "grad_norm": 0.5334300994873047, + "learning_rate": 0.0005726107011452484, + "loss": 3.3003, + "step": 8391 + }, + { + "epoch": 0.41, + "grad_norm": 0.5299844741821289, + "learning_rate": 0.0005726042737284399, + "loss": 3.1663, + "step": 8392 + }, + { + "epoch": 0.41, + "grad_norm": 0.5310911536216736, + "learning_rate": 0.0005725978455936443, + "loss": 3.3384, + "step": 8393 + }, + { + "epoch": 0.41, + "grad_norm": 0.5203625559806824, + "learning_rate": 0.0005725914167408784, + "loss": 3.3432, + "step": 8394 + }, + { + "epoch": 0.41, + "grad_norm": 0.5236132144927979, + "learning_rate": 0.000572584987170159, + "loss": 3.2808, + "step": 8395 + }, + { + "epoch": 0.41, + "grad_norm": 0.5143657326698303, + "learning_rate": 0.0005725785568815034, + "loss": 3.4455, + "step": 8396 + }, + { + "epoch": 0.41, + "grad_norm": 0.5301834344863892, + "learning_rate": 0.0005725721258749282, + "loss": 3.3219, + "step": 8397 + }, + { + "epoch": 0.41, + "grad_norm": 0.5046020150184631, + "learning_rate": 0.0005725656941504507, + "loss": 3.4859, + "step": 8398 + }, + { + "epoch": 0.41, + "grad_norm": 0.49806228280067444, + "learning_rate": 0.0005725592617080873, + "loss": 3.3874, + "step": 8399 + }, + { + "epoch": 0.41, + "grad_norm": 0.5187866687774658, + "learning_rate": 0.0005725528285478555, + "loss": 3.5443, + "step": 8400 + }, + { + "epoch": 0.41, + "grad_norm": 0.5415318608283997, + "learning_rate": 0.000572546394669772, + "loss": 3.2546, + "step": 8401 + }, + { + "epoch": 0.41, + "grad_norm": 0.4997556209564209, + "learning_rate": 0.0005725399600738537, + "loss": 3.4427, + "step": 8402 + }, + { + "epoch": 0.41, + "grad_norm": 0.5891625285148621, + "learning_rate": 0.0005725335247601176, + "loss": 3.5401, + "step": 8403 + }, + { + "epoch": 0.41, + "grad_norm": 0.6055923104286194, + "learning_rate": 0.0005725270887285806, + "loss": 3.1657, + "step": 8404 + }, + { + "epoch": 0.41, + "grad_norm": 0.5357984304428101, + "learning_rate": 0.0005725206519792597, + "loss": 3.3858, + "step": 8405 + }, + { + "epoch": 0.41, + "grad_norm": 0.5380067825317383, + "learning_rate": 0.0005725142145121719, + "loss": 3.2251, + "step": 8406 + }, + { + "epoch": 0.41, + "grad_norm": 0.5393832325935364, + "learning_rate": 0.0005725077763273341, + "loss": 3.3881, + "step": 8407 + }, + { + "epoch": 0.41, + "grad_norm": 0.4998052716255188, + "learning_rate": 0.0005725013374247633, + "loss": 3.5416, + "step": 8408 + }, + { + "epoch": 0.41, + "grad_norm": 0.5088112950325012, + "learning_rate": 0.0005724948978044763, + "loss": 3.5218, + "step": 8409 + }, + { + "epoch": 0.41, + "grad_norm": 0.5030485987663269, + "learning_rate": 0.0005724884574664903, + "loss": 3.4079, + "step": 8410 + }, + { + "epoch": 0.41, + "grad_norm": 0.5525428652763367, + "learning_rate": 0.000572482016410822, + "loss": 3.2823, + "step": 8411 + }, + { + "epoch": 0.41, + "grad_norm": 0.5205178260803223, + "learning_rate": 0.0005724755746374887, + "loss": 3.3983, + "step": 8412 + }, + { + "epoch": 0.41, + "grad_norm": 0.5077677369117737, + "learning_rate": 0.0005724691321465071, + "loss": 3.2847, + "step": 8413 + }, + { + "epoch": 0.41, + "grad_norm": 0.568877100944519, + "learning_rate": 0.0005724626889378942, + "loss": 3.323, + "step": 8414 + }, + { + "epoch": 0.41, + "grad_norm": 0.563330352306366, + "learning_rate": 0.0005724562450116669, + "loss": 3.1247, + "step": 8415 + }, + { + "epoch": 0.41, + "grad_norm": 0.5267123579978943, + "learning_rate": 0.0005724498003678425, + "loss": 3.296, + "step": 8416 + }, + { + "epoch": 0.41, + "grad_norm": 0.483675479888916, + "learning_rate": 0.0005724433550064376, + "loss": 3.4229, + "step": 8417 + }, + { + "epoch": 0.41, + "grad_norm": 0.5056666731834412, + "learning_rate": 0.0005724369089274695, + "loss": 3.3706, + "step": 8418 + }, + { + "epoch": 0.41, + "grad_norm": 0.5181419253349304, + "learning_rate": 0.000572430462130955, + "loss": 3.2534, + "step": 8419 + }, + { + "epoch": 0.41, + "grad_norm": 0.502122700214386, + "learning_rate": 0.000572424014616911, + "loss": 3.5067, + "step": 8420 + }, + { + "epoch": 0.41, + "grad_norm": 0.5571039319038391, + "learning_rate": 0.0005724175663853545, + "loss": 2.9924, + "step": 8421 + }, + { + "epoch": 0.41, + "grad_norm": 0.5149720907211304, + "learning_rate": 0.0005724111174363026, + "loss": 3.1874, + "step": 8422 + }, + { + "epoch": 0.41, + "grad_norm": 0.5191240310668945, + "learning_rate": 0.0005724046677697724, + "loss": 3.4348, + "step": 8423 + }, + { + "epoch": 0.41, + "grad_norm": 0.5127260684967041, + "learning_rate": 0.0005723982173857806, + "loss": 3.1189, + "step": 8424 + }, + { + "epoch": 0.41, + "grad_norm": 0.5314476490020752, + "learning_rate": 0.0005723917662843444, + "loss": 3.1581, + "step": 8425 + }, + { + "epoch": 0.41, + "grad_norm": 0.5084034204483032, + "learning_rate": 0.0005723853144654806, + "loss": 3.2397, + "step": 8426 + }, + { + "epoch": 0.41, + "grad_norm": 0.4968039095401764, + "learning_rate": 0.0005723788619292064, + "loss": 3.2453, + "step": 8427 + }, + { + "epoch": 0.41, + "grad_norm": 0.5596404671669006, + "learning_rate": 0.0005723724086755386, + "loss": 3.329, + "step": 8428 + }, + { + "epoch": 0.41, + "grad_norm": 0.5200359225273132, + "learning_rate": 0.0005723659547044944, + "loss": 3.4476, + "step": 8429 + }, + { + "epoch": 0.41, + "grad_norm": 0.48586541414260864, + "learning_rate": 0.0005723595000160906, + "loss": 3.3514, + "step": 8430 + }, + { + "epoch": 0.41, + "grad_norm": 0.5171741843223572, + "learning_rate": 0.0005723530446103443, + "loss": 3.2959, + "step": 8431 + }, + { + "epoch": 0.41, + "grad_norm": 0.4921620190143585, + "learning_rate": 0.0005723465884872726, + "loss": 3.3176, + "step": 8432 + }, + { + "epoch": 0.41, + "grad_norm": 0.5125603675842285, + "learning_rate": 0.0005723401316468923, + "loss": 3.377, + "step": 8433 + }, + { + "epoch": 0.41, + "grad_norm": 0.5400583744049072, + "learning_rate": 0.0005723336740892206, + "loss": 3.1951, + "step": 8434 + }, + { + "epoch": 0.41, + "grad_norm": 0.5424138903617859, + "learning_rate": 0.0005723272158142744, + "loss": 3.1451, + "step": 8435 + }, + { + "epoch": 0.41, + "grad_norm": 0.5870009064674377, + "learning_rate": 0.0005723207568220707, + "loss": 3.5986, + "step": 8436 + }, + { + "epoch": 0.41, + "grad_norm": 0.5539098381996155, + "learning_rate": 0.0005723142971126265, + "loss": 3.2467, + "step": 8437 + }, + { + "epoch": 0.41, + "grad_norm": 0.520626425743103, + "learning_rate": 0.0005723078366859588, + "loss": 3.4634, + "step": 8438 + }, + { + "epoch": 0.41, + "grad_norm": 0.501064658164978, + "learning_rate": 0.0005723013755420847, + "loss": 3.1619, + "step": 8439 + }, + { + "epoch": 0.41, + "grad_norm": 0.5047374963760376, + "learning_rate": 0.0005722949136810212, + "loss": 3.2784, + "step": 8440 + }, + { + "epoch": 0.41, + "grad_norm": 0.5190777778625488, + "learning_rate": 0.0005722884511027853, + "loss": 3.4443, + "step": 8441 + }, + { + "epoch": 0.41, + "grad_norm": 0.5325513482093811, + "learning_rate": 0.0005722819878073942, + "loss": 3.2189, + "step": 8442 + }, + { + "epoch": 0.41, + "grad_norm": 0.5083960294723511, + "learning_rate": 0.0005722755237948645, + "loss": 3.4314, + "step": 8443 + }, + { + "epoch": 0.41, + "grad_norm": 0.5294110178947449, + "learning_rate": 0.0005722690590652136, + "loss": 3.4491, + "step": 8444 + }, + { + "epoch": 0.41, + "grad_norm": 0.5367835164070129, + "learning_rate": 0.0005722625936184583, + "loss": 3.3437, + "step": 8445 + }, + { + "epoch": 0.41, + "grad_norm": 0.5289018750190735, + "learning_rate": 0.0005722561274546158, + "loss": 3.4518, + "step": 8446 + }, + { + "epoch": 0.41, + "grad_norm": 0.5518658757209778, + "learning_rate": 0.0005722496605737031, + "loss": 3.4819, + "step": 8447 + }, + { + "epoch": 0.41, + "grad_norm": 0.5337439179420471, + "learning_rate": 0.0005722431929757371, + "loss": 3.3336, + "step": 8448 + }, + { + "epoch": 0.41, + "grad_norm": 0.5443679094314575, + "learning_rate": 0.0005722367246607349, + "loss": 3.2325, + "step": 8449 + }, + { + "epoch": 0.41, + "grad_norm": 0.5037354230880737, + "learning_rate": 0.0005722302556287137, + "loss": 3.1895, + "step": 8450 + }, + { + "epoch": 0.41, + "grad_norm": 0.51902174949646, + "learning_rate": 0.0005722237858796903, + "loss": 3.3569, + "step": 8451 + }, + { + "epoch": 0.41, + "grad_norm": 0.5206155776977539, + "learning_rate": 0.0005722173154136818, + "loss": 3.2948, + "step": 8452 + }, + { + "epoch": 0.41, + "grad_norm": 0.5317664742469788, + "learning_rate": 0.0005722108442307054, + "loss": 3.4704, + "step": 8453 + }, + { + "epoch": 0.41, + "grad_norm": 0.527557909488678, + "learning_rate": 0.000572204372330778, + "loss": 3.2911, + "step": 8454 + }, + { + "epoch": 0.41, + "grad_norm": 0.5252202153205872, + "learning_rate": 0.0005721978997139165, + "loss": 3.3554, + "step": 8455 + }, + { + "epoch": 0.41, + "grad_norm": 0.5341652035713196, + "learning_rate": 0.0005721914263801382, + "loss": 3.0762, + "step": 8456 + }, + { + "epoch": 0.41, + "grad_norm": 0.5101057887077332, + "learning_rate": 0.0005721849523294602, + "loss": 3.3628, + "step": 8457 + }, + { + "epoch": 0.41, + "grad_norm": 0.5079684853553772, + "learning_rate": 0.0005721784775618993, + "loss": 3.4156, + "step": 8458 + }, + { + "epoch": 0.41, + "grad_norm": 0.5565444231033325, + "learning_rate": 0.0005721720020774727, + "loss": 3.3037, + "step": 8459 + }, + { + "epoch": 0.41, + "grad_norm": 0.5274984240531921, + "learning_rate": 0.0005721655258761973, + "loss": 3.2268, + "step": 8460 + }, + { + "epoch": 0.41, + "grad_norm": 0.5040982365608215, + "learning_rate": 0.0005721590489580904, + "loss": 3.4673, + "step": 8461 + }, + { + "epoch": 0.41, + "grad_norm": 0.5225953459739685, + "learning_rate": 0.000572152571323169, + "loss": 3.4349, + "step": 8462 + }, + { + "epoch": 0.41, + "grad_norm": 0.5098921656608582, + "learning_rate": 0.0005721460929714501, + "loss": 3.4011, + "step": 8463 + }, + { + "epoch": 0.41, + "grad_norm": 0.5205847024917603, + "learning_rate": 0.0005721396139029507, + "loss": 3.2664, + "step": 8464 + }, + { + "epoch": 0.41, + "grad_norm": 0.5714619755744934, + "learning_rate": 0.000572133134117688, + "loss": 3.2306, + "step": 8465 + }, + { + "epoch": 0.41, + "grad_norm": 0.5325881838798523, + "learning_rate": 0.0005721266536156789, + "loss": 3.219, + "step": 8466 + }, + { + "epoch": 0.41, + "grad_norm": 0.5481316447257996, + "learning_rate": 0.0005721201723969407, + "loss": 3.3076, + "step": 8467 + }, + { + "epoch": 0.41, + "grad_norm": 0.4882158637046814, + "learning_rate": 0.0005721136904614901, + "loss": 3.3831, + "step": 8468 + }, + { + "epoch": 0.42, + "grad_norm": 0.5075830817222595, + "learning_rate": 0.0005721072078093447, + "loss": 3.3366, + "step": 8469 + }, + { + "epoch": 0.42, + "grad_norm": 0.5224390029907227, + "learning_rate": 0.0005721007244405211, + "loss": 3.4743, + "step": 8470 + }, + { + "epoch": 0.42, + "grad_norm": 0.5543192028999329, + "learning_rate": 0.0005720942403550366, + "loss": 3.2743, + "step": 8471 + }, + { + "epoch": 0.42, + "grad_norm": 0.572250485420227, + "learning_rate": 0.0005720877555529082, + "loss": 3.2516, + "step": 8472 + }, + { + "epoch": 0.42, + "grad_norm": 0.5291645526885986, + "learning_rate": 0.0005720812700341531, + "loss": 3.2301, + "step": 8473 + }, + { + "epoch": 0.42, + "grad_norm": 0.5106573104858398, + "learning_rate": 0.0005720747837987882, + "loss": 3.3192, + "step": 8474 + }, + { + "epoch": 0.42, + "grad_norm": 0.5043050646781921, + "learning_rate": 0.0005720682968468308, + "loss": 3.3815, + "step": 8475 + }, + { + "epoch": 0.42, + "grad_norm": 0.5045797824859619, + "learning_rate": 0.0005720618091782978, + "loss": 3.2098, + "step": 8476 + }, + { + "epoch": 0.42, + "grad_norm": 0.5489840507507324, + "learning_rate": 0.0005720553207932064, + "loss": 3.3597, + "step": 8477 + }, + { + "epoch": 0.42, + "grad_norm": 0.5759950876235962, + "learning_rate": 0.0005720488316915736, + "loss": 3.2866, + "step": 8478 + }, + { + "epoch": 0.42, + "grad_norm": 0.4919251799583435, + "learning_rate": 0.0005720423418734164, + "loss": 3.3322, + "step": 8479 + }, + { + "epoch": 0.42, + "grad_norm": 0.5042932033538818, + "learning_rate": 0.0005720358513387522, + "loss": 3.3563, + "step": 8480 + }, + { + "epoch": 0.42, + "grad_norm": 0.5451899766921997, + "learning_rate": 0.0005720293600875979, + "loss": 3.3275, + "step": 8481 + }, + { + "epoch": 0.42, + "grad_norm": 0.5713815689086914, + "learning_rate": 0.0005720228681199707, + "loss": 3.2709, + "step": 8482 + }, + { + "epoch": 0.42, + "grad_norm": 0.5565445423126221, + "learning_rate": 0.0005720163754358874, + "loss": 3.3623, + "step": 8483 + }, + { + "epoch": 0.42, + "grad_norm": 0.5319048166275024, + "learning_rate": 0.0005720098820353655, + "loss": 3.2325, + "step": 8484 + }, + { + "epoch": 0.42, + "grad_norm": 0.5170350670814514, + "learning_rate": 0.0005720033879184219, + "loss": 3.2892, + "step": 8485 + }, + { + "epoch": 0.42, + "grad_norm": 0.5210342407226562, + "learning_rate": 0.0005719968930850736, + "loss": 3.2944, + "step": 8486 + }, + { + "epoch": 0.42, + "grad_norm": 0.5216629505157471, + "learning_rate": 0.000571990397535338, + "loss": 3.4078, + "step": 8487 + }, + { + "epoch": 0.42, + "grad_norm": 0.5704594254493713, + "learning_rate": 0.0005719839012692319, + "loss": 3.3264, + "step": 8488 + }, + { + "epoch": 0.42, + "grad_norm": 0.5190528035163879, + "learning_rate": 0.0005719774042867726, + "loss": 3.11, + "step": 8489 + }, + { + "epoch": 0.42, + "grad_norm": 0.5578228235244751, + "learning_rate": 0.0005719709065879771, + "loss": 3.4529, + "step": 8490 + }, + { + "epoch": 0.42, + "grad_norm": 0.5134179592132568, + "learning_rate": 0.0005719644081728627, + "loss": 3.1287, + "step": 8491 + }, + { + "epoch": 0.42, + "grad_norm": 0.5582857131958008, + "learning_rate": 0.0005719579090414464, + "loss": 3.1355, + "step": 8492 + }, + { + "epoch": 0.42, + "grad_norm": 0.5763298273086548, + "learning_rate": 0.0005719514091937451, + "loss": 3.2548, + "step": 8493 + }, + { + "epoch": 0.42, + "grad_norm": 0.5173172950744629, + "learning_rate": 0.0005719449086297762, + "loss": 3.3526, + "step": 8494 + }, + { + "epoch": 0.42, + "grad_norm": 1.0055021047592163, + "learning_rate": 0.0005719384073495569, + "loss": 3.1682, + "step": 8495 + }, + { + "epoch": 0.42, + "grad_norm": 0.49112802743911743, + "learning_rate": 0.000571931905353104, + "loss": 3.5877, + "step": 8496 + }, + { + "epoch": 0.42, + "grad_norm": 0.5321609377861023, + "learning_rate": 0.0005719254026404349, + "loss": 3.2739, + "step": 8497 + }, + { + "epoch": 0.42, + "grad_norm": 0.5343760848045349, + "learning_rate": 0.0005719188992115667, + "loss": 3.4689, + "step": 8498 + }, + { + "epoch": 0.42, + "grad_norm": 0.5921098589897156, + "learning_rate": 0.0005719123950665162, + "loss": 3.229, + "step": 8499 + }, + { + "epoch": 0.42, + "grad_norm": 0.5534712672233582, + "learning_rate": 0.0005719058902053009, + "loss": 3.2989, + "step": 8500 + }, + { + "epoch": 0.42, + "grad_norm": 0.513491690158844, + "learning_rate": 0.000571899384627938, + "loss": 3.183, + "step": 8501 + }, + { + "epoch": 0.42, + "grad_norm": 0.5323536396026611, + "learning_rate": 0.0005718928783344442, + "loss": 3.2725, + "step": 8502 + }, + { + "epoch": 0.42, + "grad_norm": 0.6444754004478455, + "learning_rate": 0.0005718863713248371, + "loss": 3.3666, + "step": 8503 + }, + { + "epoch": 0.42, + "grad_norm": 0.48075416684150696, + "learning_rate": 0.0005718798635991334, + "loss": 3.1333, + "step": 8504 + }, + { + "epoch": 0.42, + "grad_norm": 0.507790207862854, + "learning_rate": 0.0005718733551573506, + "loss": 3.3297, + "step": 8505 + }, + { + "epoch": 0.42, + "grad_norm": 0.5401341915130615, + "learning_rate": 0.0005718668459995056, + "loss": 3.3617, + "step": 8506 + }, + { + "epoch": 0.42, + "grad_norm": 0.5216659307479858, + "learning_rate": 0.0005718603361256157, + "loss": 3.3293, + "step": 8507 + }, + { + "epoch": 0.42, + "grad_norm": 0.5784571170806885, + "learning_rate": 0.0005718538255356981, + "loss": 3.2978, + "step": 8508 + }, + { + "epoch": 0.42, + "grad_norm": 0.5394245982170105, + "learning_rate": 0.0005718473142297697, + "loss": 3.1971, + "step": 8509 + }, + { + "epoch": 0.42, + "grad_norm": 0.547707200050354, + "learning_rate": 0.0005718408022078479, + "loss": 3.4488, + "step": 8510 + }, + { + "epoch": 0.42, + "grad_norm": 0.5773078203201294, + "learning_rate": 0.0005718342894699497, + "loss": 3.4022, + "step": 8511 + }, + { + "epoch": 0.42, + "grad_norm": 0.5116841197013855, + "learning_rate": 0.0005718277760160922, + "loss": 3.5378, + "step": 8512 + }, + { + "epoch": 0.42, + "grad_norm": 0.5021461844444275, + "learning_rate": 0.0005718212618462928, + "loss": 3.2106, + "step": 8513 + }, + { + "epoch": 0.42, + "grad_norm": 0.5065045952796936, + "learning_rate": 0.0005718147469605684, + "loss": 3.2924, + "step": 8514 + }, + { + "epoch": 0.42, + "grad_norm": 0.5146911144256592, + "learning_rate": 0.0005718082313589363, + "loss": 3.4854, + "step": 8515 + }, + { + "epoch": 0.42, + "grad_norm": 0.5704720616340637, + "learning_rate": 0.0005718017150414137, + "loss": 3.1635, + "step": 8516 + }, + { + "epoch": 0.42, + "grad_norm": 0.5921428799629211, + "learning_rate": 0.0005717951980080176, + "loss": 3.3375, + "step": 8517 + }, + { + "epoch": 0.42, + "grad_norm": 0.5015804767608643, + "learning_rate": 0.0005717886802587653, + "loss": 3.2446, + "step": 8518 + }, + { + "epoch": 0.42, + "grad_norm": 0.5033897161483765, + "learning_rate": 0.0005717821617936739, + "loss": 3.4058, + "step": 8519 + }, + { + "epoch": 0.42, + "grad_norm": 0.5112386345863342, + "learning_rate": 0.0005717756426127606, + "loss": 3.4671, + "step": 8520 + }, + { + "epoch": 0.42, + "grad_norm": 0.5142545104026794, + "learning_rate": 0.0005717691227160426, + "loss": 3.3429, + "step": 8521 + }, + { + "epoch": 0.42, + "grad_norm": 0.5595449209213257, + "learning_rate": 0.000571762602103537, + "loss": 3.4415, + "step": 8522 + }, + { + "epoch": 0.42, + "grad_norm": 0.5375891327857971, + "learning_rate": 0.000571756080775261, + "loss": 3.2085, + "step": 8523 + }, + { + "epoch": 0.42, + "grad_norm": 0.5820460319519043, + "learning_rate": 0.0005717495587312318, + "loss": 3.2163, + "step": 8524 + }, + { + "epoch": 0.42, + "grad_norm": 0.5265014171600342, + "learning_rate": 0.0005717430359714666, + "loss": 3.0536, + "step": 8525 + }, + { + "epoch": 0.42, + "grad_norm": 0.5012522339820862, + "learning_rate": 0.0005717365124959824, + "loss": 3.2223, + "step": 8526 + }, + { + "epoch": 0.42, + "grad_norm": 0.5129319429397583, + "learning_rate": 0.0005717299883047967, + "loss": 3.1573, + "step": 8527 + }, + { + "epoch": 0.42, + "grad_norm": 0.5185990333557129, + "learning_rate": 0.0005717234633979265, + "loss": 3.1524, + "step": 8528 + }, + { + "epoch": 0.42, + "grad_norm": 0.5384962558746338, + "learning_rate": 0.0005717169377753888, + "loss": 3.2809, + "step": 8529 + }, + { + "epoch": 0.42, + "grad_norm": 0.5573104023933411, + "learning_rate": 0.0005717104114372012, + "loss": 3.3692, + "step": 8530 + }, + { + "epoch": 0.42, + "grad_norm": 0.5535567402839661, + "learning_rate": 0.0005717038843833805, + "loss": 3.2744, + "step": 8531 + }, + { + "epoch": 0.42, + "grad_norm": 0.543593168258667, + "learning_rate": 0.0005716973566139441, + "loss": 3.3109, + "step": 8532 + }, + { + "epoch": 0.42, + "grad_norm": 0.5009624361991882, + "learning_rate": 0.0005716908281289092, + "loss": 3.4917, + "step": 8533 + }, + { + "epoch": 0.42, + "grad_norm": 0.5481317043304443, + "learning_rate": 0.000571684298928293, + "loss": 3.3068, + "step": 8534 + }, + { + "epoch": 0.42, + "grad_norm": 0.538657546043396, + "learning_rate": 0.0005716777690121125, + "loss": 3.409, + "step": 8535 + }, + { + "epoch": 0.42, + "grad_norm": 0.5731958150863647, + "learning_rate": 0.0005716712383803851, + "loss": 3.1841, + "step": 8536 + }, + { + "epoch": 0.42, + "grad_norm": 0.5232669115066528, + "learning_rate": 0.000571664707033128, + "loss": 3.231, + "step": 8537 + }, + { + "epoch": 0.42, + "grad_norm": 0.5260790586471558, + "learning_rate": 0.0005716581749703583, + "loss": 3.3928, + "step": 8538 + }, + { + "epoch": 0.42, + "grad_norm": 0.5380903482437134, + "learning_rate": 0.0005716516421920932, + "loss": 3.2905, + "step": 8539 + }, + { + "epoch": 0.42, + "grad_norm": 0.5121652483940125, + "learning_rate": 0.00057164510869835, + "loss": 3.3719, + "step": 8540 + }, + { + "epoch": 0.42, + "grad_norm": 0.5237699747085571, + "learning_rate": 0.0005716385744891459, + "loss": 3.273, + "step": 8541 + }, + { + "epoch": 0.42, + "grad_norm": 0.6156141757965088, + "learning_rate": 0.000571632039564498, + "loss": 3.1406, + "step": 8542 + }, + { + "epoch": 0.42, + "grad_norm": 0.48015645146369934, + "learning_rate": 0.0005716255039244235, + "loss": 3.4731, + "step": 8543 + }, + { + "epoch": 0.42, + "grad_norm": 0.5515682697296143, + "learning_rate": 0.0005716189675689399, + "loss": 3.1267, + "step": 8544 + }, + { + "epoch": 0.42, + "grad_norm": 0.5330041646957397, + "learning_rate": 0.0005716124304980642, + "loss": 3.2132, + "step": 8545 + }, + { + "epoch": 0.42, + "grad_norm": 0.537356972694397, + "learning_rate": 0.0005716058927118135, + "loss": 3.3974, + "step": 8546 + }, + { + "epoch": 0.42, + "grad_norm": 0.5180966854095459, + "learning_rate": 0.0005715993542102052, + "loss": 3.3525, + "step": 8547 + }, + { + "epoch": 0.42, + "grad_norm": 0.5257582664489746, + "learning_rate": 0.0005715928149932565, + "loss": 3.2291, + "step": 8548 + }, + { + "epoch": 0.42, + "grad_norm": 0.5318938493728638, + "learning_rate": 0.0005715862750609845, + "loss": 2.958, + "step": 8549 + }, + { + "epoch": 0.42, + "grad_norm": 0.5303199887275696, + "learning_rate": 0.0005715797344134067, + "loss": 3.3393, + "step": 8550 + }, + { + "epoch": 0.42, + "grad_norm": 0.5268381834030151, + "learning_rate": 0.0005715731930505401, + "loss": 3.3426, + "step": 8551 + }, + { + "epoch": 0.42, + "grad_norm": 0.6098791360855103, + "learning_rate": 0.0005715666509724019, + "loss": 3.0801, + "step": 8552 + }, + { + "epoch": 0.42, + "grad_norm": 0.5364266037940979, + "learning_rate": 0.0005715601081790094, + "loss": 3.0393, + "step": 8553 + }, + { + "epoch": 0.42, + "grad_norm": 0.5054658651351929, + "learning_rate": 0.0005715535646703798, + "loss": 3.3621, + "step": 8554 + }, + { + "epoch": 0.42, + "grad_norm": 0.5238931179046631, + "learning_rate": 0.0005715470204465305, + "loss": 3.2215, + "step": 8555 + }, + { + "epoch": 0.42, + "grad_norm": 0.5390506386756897, + "learning_rate": 0.0005715404755074785, + "loss": 3.4286, + "step": 8556 + }, + { + "epoch": 0.42, + "grad_norm": 0.5367797613143921, + "learning_rate": 0.0005715339298532412, + "loss": 3.428, + "step": 8557 + }, + { + "epoch": 0.42, + "grad_norm": 0.5821202397346497, + "learning_rate": 0.0005715273834838358, + "loss": 3.3585, + "step": 8558 + }, + { + "epoch": 0.42, + "grad_norm": 0.523921549320221, + "learning_rate": 0.0005715208363992794, + "loss": 3.2719, + "step": 8559 + }, + { + "epoch": 0.42, + "grad_norm": 0.5809316635131836, + "learning_rate": 0.0005715142885995895, + "loss": 3.1228, + "step": 8560 + }, + { + "epoch": 0.42, + "grad_norm": 0.5245625376701355, + "learning_rate": 0.0005715077400847832, + "loss": 3.771, + "step": 8561 + }, + { + "epoch": 0.42, + "grad_norm": 0.5554155707359314, + "learning_rate": 0.0005715011908548778, + "loss": 3.1731, + "step": 8562 + }, + { + "epoch": 0.42, + "grad_norm": 0.530573308467865, + "learning_rate": 0.0005714946409098905, + "loss": 3.2805, + "step": 8563 + }, + { + "epoch": 0.42, + "grad_norm": 0.5505214333534241, + "learning_rate": 0.0005714880902498385, + "loss": 3.2432, + "step": 8564 + }, + { + "epoch": 0.42, + "grad_norm": 0.5658882260322571, + "learning_rate": 0.0005714815388747391, + "loss": 3.2784, + "step": 8565 + }, + { + "epoch": 0.42, + "grad_norm": 0.5260425209999084, + "learning_rate": 0.0005714749867846097, + "loss": 3.2037, + "step": 8566 + }, + { + "epoch": 0.42, + "grad_norm": 0.4990243911743164, + "learning_rate": 0.0005714684339794674, + "loss": 3.3172, + "step": 8567 + }, + { + "epoch": 0.42, + "grad_norm": 0.5792827606201172, + "learning_rate": 0.0005714618804593295, + "loss": 3.5224, + "step": 8568 + }, + { + "epoch": 0.42, + "grad_norm": 0.5181724429130554, + "learning_rate": 0.0005714553262242131, + "loss": 3.3583, + "step": 8569 + }, + { + "epoch": 0.42, + "grad_norm": 0.564866304397583, + "learning_rate": 0.0005714487712741357, + "loss": 3.4391, + "step": 8570 + }, + { + "epoch": 0.42, + "grad_norm": 0.4975354075431824, + "learning_rate": 0.0005714422156091146, + "loss": 3.2768, + "step": 8571 + }, + { + "epoch": 0.42, + "grad_norm": 0.5749588012695312, + "learning_rate": 0.0005714356592291668, + "loss": 3.2569, + "step": 8572 + }, + { + "epoch": 0.42, + "grad_norm": 0.49790215492248535, + "learning_rate": 0.0005714291021343097, + "loss": 3.2321, + "step": 8573 + }, + { + "epoch": 0.42, + "grad_norm": 0.5509989857673645, + "learning_rate": 0.0005714225443245607, + "loss": 3.2786, + "step": 8574 + }, + { + "epoch": 0.42, + "grad_norm": 0.5187534093856812, + "learning_rate": 0.0005714159857999368, + "loss": 3.241, + "step": 8575 + }, + { + "epoch": 0.42, + "grad_norm": 0.5033349394798279, + "learning_rate": 0.0005714094265604556, + "loss": 3.3021, + "step": 8576 + }, + { + "epoch": 0.42, + "grad_norm": 0.5202603936195374, + "learning_rate": 0.0005714028666061341, + "loss": 3.1149, + "step": 8577 + }, + { + "epoch": 0.42, + "grad_norm": 0.544248104095459, + "learning_rate": 0.0005713963059369898, + "loss": 3.5272, + "step": 8578 + }, + { + "epoch": 0.42, + "grad_norm": 0.5000052452087402, + "learning_rate": 0.0005713897445530396, + "loss": 3.278, + "step": 8579 + }, + { + "epoch": 0.42, + "grad_norm": 0.5294511318206787, + "learning_rate": 0.0005713831824543013, + "loss": 3.3534, + "step": 8580 + }, + { + "epoch": 0.42, + "grad_norm": 0.5449314713478088, + "learning_rate": 0.0005713766196407919, + "loss": 3.1439, + "step": 8581 + }, + { + "epoch": 0.42, + "grad_norm": 0.5595104694366455, + "learning_rate": 0.0005713700561125286, + "loss": 3.2132, + "step": 8582 + }, + { + "epoch": 0.42, + "grad_norm": 0.5275298953056335, + "learning_rate": 0.0005713634918695288, + "loss": 3.5211, + "step": 8583 + }, + { + "epoch": 0.42, + "grad_norm": 0.5486336946487427, + "learning_rate": 0.0005713569269118099, + "loss": 3.3394, + "step": 8584 + }, + { + "epoch": 0.42, + "grad_norm": 0.5621734261512756, + "learning_rate": 0.0005713503612393889, + "loss": 3.2691, + "step": 8585 + }, + { + "epoch": 0.42, + "grad_norm": 0.5324231386184692, + "learning_rate": 0.0005713437948522834, + "loss": 3.1905, + "step": 8586 + }, + { + "epoch": 0.42, + "grad_norm": 0.5473950505256653, + "learning_rate": 0.0005713372277505106, + "loss": 3.2829, + "step": 8587 + }, + { + "epoch": 0.42, + "grad_norm": 0.5286551713943481, + "learning_rate": 0.0005713306599340877, + "loss": 3.2689, + "step": 8588 + }, + { + "epoch": 0.42, + "grad_norm": 0.5888059139251709, + "learning_rate": 0.000571324091403032, + "loss": 3.4355, + "step": 8589 + }, + { + "epoch": 0.42, + "grad_norm": 0.5260636210441589, + "learning_rate": 0.000571317522157361, + "loss": 3.4292, + "step": 8590 + }, + { + "epoch": 0.42, + "grad_norm": 0.5505531430244446, + "learning_rate": 0.0005713109521970918, + "loss": 3.3409, + "step": 8591 + }, + { + "epoch": 0.42, + "grad_norm": 0.5252068638801575, + "learning_rate": 0.0005713043815222418, + "loss": 3.3137, + "step": 8592 + }, + { + "epoch": 0.42, + "grad_norm": 0.5074268579483032, + "learning_rate": 0.0005712978101328281, + "loss": 3.2255, + "step": 8593 + }, + { + "epoch": 0.42, + "grad_norm": 0.4985024631023407, + "learning_rate": 0.0005712912380288683, + "loss": 3.3656, + "step": 8594 + }, + { + "epoch": 0.42, + "grad_norm": 0.5729339122772217, + "learning_rate": 0.0005712846652103796, + "loss": 3.1283, + "step": 8595 + }, + { + "epoch": 0.42, + "grad_norm": 0.5986528396606445, + "learning_rate": 0.0005712780916773794, + "loss": 3.2123, + "step": 8596 + }, + { + "epoch": 0.42, + "grad_norm": 0.5517857670783997, + "learning_rate": 0.0005712715174298848, + "loss": 3.3586, + "step": 8597 + }, + { + "epoch": 0.42, + "grad_norm": 0.5476078391075134, + "learning_rate": 0.0005712649424679132, + "loss": 3.2001, + "step": 8598 + }, + { + "epoch": 0.42, + "grad_norm": 0.5720165371894836, + "learning_rate": 0.000571258366791482, + "loss": 2.9817, + "step": 8599 + }, + { + "epoch": 0.42, + "grad_norm": 0.5489450693130493, + "learning_rate": 0.0005712517904006085, + "loss": 3.4279, + "step": 8600 + }, + { + "epoch": 0.42, + "grad_norm": 0.5727310180664062, + "learning_rate": 0.0005712452132953099, + "loss": 3.1009, + "step": 8601 + }, + { + "epoch": 0.42, + "grad_norm": 0.5166281461715698, + "learning_rate": 0.0005712386354756037, + "loss": 3.3062, + "step": 8602 + }, + { + "epoch": 0.42, + "grad_norm": 0.5071576237678528, + "learning_rate": 0.0005712320569415071, + "loss": 3.4458, + "step": 8603 + }, + { + "epoch": 0.42, + "grad_norm": 0.5760843753814697, + "learning_rate": 0.0005712254776930374, + "loss": 3.3133, + "step": 8604 + }, + { + "epoch": 0.42, + "grad_norm": 0.5532389283180237, + "learning_rate": 0.0005712188977302121, + "loss": 3.1873, + "step": 8605 + }, + { + "epoch": 0.42, + "grad_norm": 0.5423989295959473, + "learning_rate": 0.0005712123170530484, + "loss": 3.0871, + "step": 8606 + }, + { + "epoch": 0.42, + "grad_norm": 0.5491202473640442, + "learning_rate": 0.0005712057356615637, + "loss": 3.176, + "step": 8607 + }, + { + "epoch": 0.42, + "grad_norm": 0.5447011590003967, + "learning_rate": 0.0005711991535557751, + "loss": 3.1241, + "step": 8608 + }, + { + "epoch": 0.42, + "grad_norm": 0.5630657076835632, + "learning_rate": 0.0005711925707357002, + "loss": 3.3227, + "step": 8609 + }, + { + "epoch": 0.42, + "grad_norm": 0.4960578978061676, + "learning_rate": 0.0005711859872013563, + "loss": 3.3174, + "step": 8610 + }, + { + "epoch": 0.42, + "grad_norm": 0.5437517166137695, + "learning_rate": 0.0005711794029527607, + "loss": 3.3928, + "step": 8611 + }, + { + "epoch": 0.42, + "grad_norm": 0.5181750059127808, + "learning_rate": 0.0005711728179899308, + "loss": 3.3892, + "step": 8612 + }, + { + "epoch": 0.42, + "grad_norm": 0.5108785033226013, + "learning_rate": 0.0005711662323128838, + "loss": 3.3299, + "step": 8613 + }, + { + "epoch": 0.42, + "grad_norm": 0.4989277720451355, + "learning_rate": 0.0005711596459216372, + "loss": 3.3234, + "step": 8614 + }, + { + "epoch": 0.42, + "grad_norm": 0.5421345233917236, + "learning_rate": 0.0005711530588162082, + "loss": 3.3554, + "step": 8615 + }, + { + "epoch": 0.42, + "grad_norm": 0.5189564228057861, + "learning_rate": 0.0005711464709966142, + "loss": 3.2916, + "step": 8616 + }, + { + "epoch": 0.42, + "grad_norm": 0.5410189628601074, + "learning_rate": 0.0005711398824628727, + "loss": 3.3517, + "step": 8617 + }, + { + "epoch": 0.42, + "grad_norm": 0.5351953506469727, + "learning_rate": 0.0005711332932150008, + "loss": 3.1984, + "step": 8618 + }, + { + "epoch": 0.42, + "grad_norm": 0.5187792181968689, + "learning_rate": 0.0005711267032530161, + "loss": 3.3928, + "step": 8619 + }, + { + "epoch": 0.42, + "grad_norm": 0.5344939231872559, + "learning_rate": 0.0005711201125769358, + "loss": 3.3316, + "step": 8620 + }, + { + "epoch": 0.42, + "grad_norm": 0.5304107666015625, + "learning_rate": 0.0005711135211867773, + "loss": 3.4969, + "step": 8621 + }, + { + "epoch": 0.42, + "grad_norm": 0.5163986682891846, + "learning_rate": 0.0005711069290825579, + "loss": 3.117, + "step": 8622 + }, + { + "epoch": 0.42, + "grad_norm": 0.5484994649887085, + "learning_rate": 0.0005711003362642951, + "loss": 3.3076, + "step": 8623 + }, + { + "epoch": 0.42, + "grad_norm": 0.5722675323486328, + "learning_rate": 0.0005710937427320062, + "loss": 3.437, + "step": 8624 + }, + { + "epoch": 0.42, + "grad_norm": 0.5870904326438904, + "learning_rate": 0.0005710871484857085, + "loss": 3.3008, + "step": 8625 + }, + { + "epoch": 0.42, + "grad_norm": 0.5617459416389465, + "learning_rate": 0.0005710805535254195, + "loss": 3.244, + "step": 8626 + }, + { + "epoch": 0.42, + "grad_norm": 0.5233004689216614, + "learning_rate": 0.0005710739578511564, + "loss": 3.3681, + "step": 8627 + }, + { + "epoch": 0.42, + "grad_norm": 0.5104897618293762, + "learning_rate": 0.0005710673614629367, + "loss": 3.3192, + "step": 8628 + }, + { + "epoch": 0.42, + "grad_norm": 0.5213035345077515, + "learning_rate": 0.0005710607643607778, + "loss": 3.4488, + "step": 8629 + }, + { + "epoch": 0.42, + "grad_norm": 0.5398159623146057, + "learning_rate": 0.0005710541665446969, + "loss": 3.4134, + "step": 8630 + }, + { + "epoch": 0.42, + "grad_norm": 0.5116217136383057, + "learning_rate": 0.0005710475680147115, + "loss": 3.1915, + "step": 8631 + }, + { + "epoch": 0.42, + "grad_norm": 0.5108386278152466, + "learning_rate": 0.0005710409687708391, + "loss": 3.2113, + "step": 8632 + }, + { + "epoch": 0.42, + "grad_norm": 0.5582148432731628, + "learning_rate": 0.0005710343688130968, + "loss": 3.2279, + "step": 8633 + }, + { + "epoch": 0.42, + "grad_norm": 0.5281783938407898, + "learning_rate": 0.0005710277681415022, + "loss": 3.4379, + "step": 8634 + }, + { + "epoch": 0.42, + "grad_norm": 0.4925077259540558, + "learning_rate": 0.0005710211667560726, + "loss": 3.2618, + "step": 8635 + }, + { + "epoch": 0.42, + "grad_norm": 0.5662412047386169, + "learning_rate": 0.0005710145646568254, + "loss": 3.2086, + "step": 8636 + }, + { + "epoch": 0.42, + "grad_norm": 0.528595507144928, + "learning_rate": 0.0005710079618437781, + "loss": 3.452, + "step": 8637 + }, + { + "epoch": 0.42, + "grad_norm": 0.5423139333724976, + "learning_rate": 0.000571001358316948, + "loss": 3.2282, + "step": 8638 + }, + { + "epoch": 0.42, + "grad_norm": 0.5846478939056396, + "learning_rate": 0.0005709947540763524, + "loss": 3.3114, + "step": 8639 + }, + { + "epoch": 0.42, + "grad_norm": 0.543500542640686, + "learning_rate": 0.0005709881491220087, + "loss": 3.2656, + "step": 8640 + }, + { + "epoch": 0.42, + "grad_norm": 0.5225181579589844, + "learning_rate": 0.0005709815434539344, + "loss": 3.2979, + "step": 8641 + }, + { + "epoch": 0.42, + "grad_norm": 0.5145890712738037, + "learning_rate": 0.0005709749370721469, + "loss": 3.0179, + "step": 8642 + }, + { + "epoch": 0.42, + "grad_norm": 0.527047336101532, + "learning_rate": 0.0005709683299766635, + "loss": 3.1937, + "step": 8643 + }, + { + "epoch": 0.42, + "grad_norm": 0.558195948600769, + "learning_rate": 0.0005709617221675017, + "loss": 3.3409, + "step": 8644 + }, + { + "epoch": 0.42, + "grad_norm": 0.5157774686813354, + "learning_rate": 0.000570955113644679, + "loss": 3.4396, + "step": 8645 + }, + { + "epoch": 0.42, + "grad_norm": 0.5344122052192688, + "learning_rate": 0.0005709485044082125, + "loss": 3.3056, + "step": 8646 + }, + { + "epoch": 0.42, + "grad_norm": 0.5069317817687988, + "learning_rate": 0.0005709418944581199, + "loss": 3.2913, + "step": 8647 + }, + { + "epoch": 0.42, + "grad_norm": 0.5138603448867798, + "learning_rate": 0.0005709352837944184, + "loss": 3.3517, + "step": 8648 + }, + { + "epoch": 0.42, + "grad_norm": 0.5492541193962097, + "learning_rate": 0.0005709286724171256, + "loss": 3.3809, + "step": 8649 + }, + { + "epoch": 0.42, + "grad_norm": 0.6118084192276001, + "learning_rate": 0.0005709220603262587, + "loss": 3.1827, + "step": 8650 + }, + { + "epoch": 0.42, + "grad_norm": 0.522902250289917, + "learning_rate": 0.0005709154475218354, + "loss": 3.2828, + "step": 8651 + }, + { + "epoch": 0.42, + "grad_norm": 0.5477918386459351, + "learning_rate": 0.0005709088340038729, + "loss": 3.431, + "step": 8652 + }, + { + "epoch": 0.42, + "grad_norm": 0.577610433101654, + "learning_rate": 0.0005709022197723886, + "loss": 3.0517, + "step": 8653 + }, + { + "epoch": 0.42, + "grad_norm": 0.563598096370697, + "learning_rate": 0.0005708956048273999, + "loss": 3.1275, + "step": 8654 + }, + { + "epoch": 0.42, + "grad_norm": 0.5188009738922119, + "learning_rate": 0.0005708889891689245, + "loss": 3.1314, + "step": 8655 + }, + { + "epoch": 0.42, + "grad_norm": 0.5175809264183044, + "learning_rate": 0.0005708823727969796, + "loss": 3.1442, + "step": 8656 + }, + { + "epoch": 0.42, + "grad_norm": 0.5397357940673828, + "learning_rate": 0.0005708757557115826, + "loss": 3.1639, + "step": 8657 + }, + { + "epoch": 0.42, + "grad_norm": 1.0455191135406494, + "learning_rate": 0.000570869137912751, + "loss": 3.2676, + "step": 8658 + }, + { + "epoch": 0.42, + "grad_norm": 0.571491003036499, + "learning_rate": 0.0005708625194005023, + "loss": 3.4802, + "step": 8659 + }, + { + "epoch": 0.42, + "grad_norm": 0.5649874210357666, + "learning_rate": 0.0005708559001748538, + "loss": 3.1757, + "step": 8660 + }, + { + "epoch": 0.42, + "grad_norm": 0.5234768986701965, + "learning_rate": 0.000570849280235823, + "loss": 3.3455, + "step": 8661 + }, + { + "epoch": 0.42, + "grad_norm": 0.5115209817886353, + "learning_rate": 0.0005708426595834273, + "loss": 3.167, + "step": 8662 + }, + { + "epoch": 0.42, + "grad_norm": 0.5697651505470276, + "learning_rate": 0.0005708360382176841, + "loss": 3.4317, + "step": 8663 + }, + { + "epoch": 0.42, + "grad_norm": 0.5905675292015076, + "learning_rate": 0.0005708294161386109, + "loss": 3.2357, + "step": 8664 + }, + { + "epoch": 0.42, + "grad_norm": 0.5598018765449524, + "learning_rate": 0.0005708227933462252, + "loss": 3.2358, + "step": 8665 + }, + { + "epoch": 0.42, + "grad_norm": 0.5009368658065796, + "learning_rate": 0.0005708161698405444, + "loss": 3.2861, + "step": 8666 + }, + { + "epoch": 0.42, + "grad_norm": 0.5382165312767029, + "learning_rate": 0.0005708095456215859, + "loss": 3.3066, + "step": 8667 + }, + { + "epoch": 0.42, + "grad_norm": 0.5542304515838623, + "learning_rate": 0.0005708029206893672, + "loss": 3.4175, + "step": 8668 + }, + { + "epoch": 0.42, + "grad_norm": 0.5310438871383667, + "learning_rate": 0.0005707962950439057, + "loss": 3.6001, + "step": 8669 + }, + { + "epoch": 0.42, + "grad_norm": 0.5058013200759888, + "learning_rate": 0.0005707896686852189, + "loss": 3.3482, + "step": 8670 + }, + { + "epoch": 0.42, + "grad_norm": 0.5328607559204102, + "learning_rate": 0.0005707830416133243, + "loss": 3.2293, + "step": 8671 + }, + { + "epoch": 0.42, + "grad_norm": 0.4994487464427948, + "learning_rate": 0.0005707764138282391, + "loss": 3.3414, + "step": 8672 + }, + { + "epoch": 0.43, + "grad_norm": 0.547566294670105, + "learning_rate": 0.0005707697853299811, + "loss": 3.4422, + "step": 8673 + }, + { + "epoch": 0.43, + "grad_norm": 0.5558493137359619, + "learning_rate": 0.0005707631561185675, + "loss": 3.3349, + "step": 8674 + }, + { + "epoch": 0.43, + "grad_norm": 0.5128504633903503, + "learning_rate": 0.0005707565261940158, + "loss": 3.3234, + "step": 8675 + }, + { + "epoch": 0.43, + "grad_norm": 0.543178141117096, + "learning_rate": 0.0005707498955563437, + "loss": 3.3392, + "step": 8676 + }, + { + "epoch": 0.43, + "grad_norm": 0.5692942142486572, + "learning_rate": 0.0005707432642055683, + "loss": 3.4206, + "step": 8677 + }, + { + "epoch": 0.43, + "grad_norm": 0.49869203567504883, + "learning_rate": 0.0005707366321417073, + "loss": 3.104, + "step": 8678 + }, + { + "epoch": 0.43, + "grad_norm": 0.5554705262184143, + "learning_rate": 0.0005707299993647782, + "loss": 3.2966, + "step": 8679 + }, + { + "epoch": 0.43, + "grad_norm": 0.5459063053131104, + "learning_rate": 0.0005707233658747983, + "loss": 3.4036, + "step": 8680 + }, + { + "epoch": 0.43, + "grad_norm": 0.5476892590522766, + "learning_rate": 0.0005707167316717852, + "loss": 3.3779, + "step": 8681 + }, + { + "epoch": 0.43, + "grad_norm": 0.5351525545120239, + "learning_rate": 0.0005707100967557563, + "loss": 3.26, + "step": 8682 + }, + { + "epoch": 0.43, + "grad_norm": 0.5344018936157227, + "learning_rate": 0.0005707034611267291, + "loss": 3.2521, + "step": 8683 + }, + { + "epoch": 0.43, + "grad_norm": 0.5126794576644897, + "learning_rate": 0.0005706968247847212, + "loss": 3.086, + "step": 8684 + }, + { + "epoch": 0.43, + "grad_norm": 0.5328701734542847, + "learning_rate": 0.0005706901877297498, + "loss": 3.2423, + "step": 8685 + }, + { + "epoch": 0.43, + "grad_norm": 0.6240466237068176, + "learning_rate": 0.0005706835499618326, + "loss": 3.3378, + "step": 8686 + }, + { + "epoch": 0.43, + "grad_norm": 0.49643224477767944, + "learning_rate": 0.000570676911480987, + "loss": 3.2525, + "step": 8687 + }, + { + "epoch": 0.43, + "grad_norm": 0.5192905068397522, + "learning_rate": 0.0005706702722872305, + "loss": 3.369, + "step": 8688 + }, + { + "epoch": 0.43, + "grad_norm": 0.5646792650222778, + "learning_rate": 0.0005706636323805807, + "loss": 3.3023, + "step": 8689 + }, + { + "epoch": 0.43, + "grad_norm": 0.5123174786567688, + "learning_rate": 0.0005706569917610548, + "loss": 3.2993, + "step": 8690 + }, + { + "epoch": 0.43, + "grad_norm": 0.5409421324729919, + "learning_rate": 0.0005706503504286707, + "loss": 3.163, + "step": 8691 + }, + { + "epoch": 0.43, + "grad_norm": 0.5075324773788452, + "learning_rate": 0.0005706437083834456, + "loss": 3.4438, + "step": 8692 + }, + { + "epoch": 0.43, + "grad_norm": 0.4756852686405182, + "learning_rate": 0.000570637065625397, + "loss": 3.3624, + "step": 8693 + }, + { + "epoch": 0.43, + "grad_norm": 0.5012529492378235, + "learning_rate": 0.0005706304221545424, + "loss": 3.2455, + "step": 8694 + }, + { + "epoch": 0.43, + "grad_norm": 0.5726290345191956, + "learning_rate": 0.0005706237779708994, + "loss": 3.11, + "step": 8695 + }, + { + "epoch": 0.43, + "grad_norm": 0.5796265006065369, + "learning_rate": 0.0005706171330744854, + "loss": 3.2597, + "step": 8696 + }, + { + "epoch": 0.43, + "grad_norm": 0.5194861888885498, + "learning_rate": 0.000570610487465318, + "loss": 3.0619, + "step": 8697 + }, + { + "epoch": 0.43, + "grad_norm": 0.5499964952468872, + "learning_rate": 0.0005706038411434147, + "loss": 3.4847, + "step": 8698 + }, + { + "epoch": 0.43, + "grad_norm": 0.5542723536491394, + "learning_rate": 0.000570597194108793, + "loss": 3.3495, + "step": 8699 + }, + { + "epoch": 0.43, + "grad_norm": 0.5347164273262024, + "learning_rate": 0.0005705905463614702, + "loss": 3.288, + "step": 8700 + }, + { + "epoch": 0.43, + "grad_norm": 0.5562871098518372, + "learning_rate": 0.0005705838979014642, + "loss": 3.3801, + "step": 8701 + }, + { + "epoch": 0.43, + "grad_norm": 0.5251455307006836, + "learning_rate": 0.0005705772487287921, + "loss": 3.3115, + "step": 8702 + }, + { + "epoch": 0.43, + "grad_norm": 0.526947021484375, + "learning_rate": 0.0005705705988434716, + "loss": 3.2882, + "step": 8703 + }, + { + "epoch": 0.43, + "grad_norm": 0.5250048637390137, + "learning_rate": 0.0005705639482455204, + "loss": 3.1308, + "step": 8704 + }, + { + "epoch": 0.43, + "grad_norm": 0.5624373555183411, + "learning_rate": 0.0005705572969349556, + "loss": 3.2379, + "step": 8705 + }, + { + "epoch": 0.43, + "grad_norm": 0.600304365158081, + "learning_rate": 0.000570550644911795, + "loss": 3.2652, + "step": 8706 + }, + { + "epoch": 0.43, + "grad_norm": 0.5497757196426392, + "learning_rate": 0.0005705439921760562, + "loss": 3.2458, + "step": 8707 + }, + { + "epoch": 0.43, + "grad_norm": 0.5430121421813965, + "learning_rate": 0.0005705373387277566, + "loss": 3.4981, + "step": 8708 + }, + { + "epoch": 0.43, + "grad_norm": 0.5202553868293762, + "learning_rate": 0.0005705306845669137, + "loss": 3.3059, + "step": 8709 + }, + { + "epoch": 0.43, + "grad_norm": 0.53725266456604, + "learning_rate": 0.0005705240296935448, + "loss": 3.321, + "step": 8710 + }, + { + "epoch": 0.43, + "grad_norm": 0.5087431073188782, + "learning_rate": 0.000570517374107668, + "loss": 3.255, + "step": 8711 + }, + { + "epoch": 0.43, + "grad_norm": 0.5445797443389893, + "learning_rate": 0.0005705107178093004, + "loss": 3.2109, + "step": 8712 + }, + { + "epoch": 0.43, + "grad_norm": 0.563112199306488, + "learning_rate": 0.0005705040607984595, + "loss": 3.322, + "step": 8713 + }, + { + "epoch": 0.43, + "grad_norm": 0.544439435005188, + "learning_rate": 0.000570497403075163, + "loss": 3.0042, + "step": 8714 + }, + { + "epoch": 0.43, + "grad_norm": 0.5439892411231995, + "learning_rate": 0.0005704907446394285, + "loss": 3.3405, + "step": 8715 + }, + { + "epoch": 0.43, + "grad_norm": 0.554537296295166, + "learning_rate": 0.0005704840854912732, + "loss": 3.3383, + "step": 8716 + }, + { + "epoch": 0.43, + "grad_norm": 0.506321907043457, + "learning_rate": 0.0005704774256307151, + "loss": 3.3718, + "step": 8717 + }, + { + "epoch": 0.43, + "grad_norm": 0.5402343273162842, + "learning_rate": 0.0005704707650577716, + "loss": 3.1319, + "step": 8718 + }, + { + "epoch": 0.43, + "grad_norm": 0.5468218326568604, + "learning_rate": 0.0005704641037724599, + "loss": 3.2299, + "step": 8719 + }, + { + "epoch": 0.43, + "grad_norm": 0.5793018341064453, + "learning_rate": 0.000570457441774798, + "loss": 3.1532, + "step": 8720 + }, + { + "epoch": 0.43, + "grad_norm": 0.5519994497299194, + "learning_rate": 0.0005704507790648031, + "loss": 3.2421, + "step": 8721 + }, + { + "epoch": 0.43, + "grad_norm": 0.5348733067512512, + "learning_rate": 0.0005704441156424931, + "loss": 3.1543, + "step": 8722 + }, + { + "epoch": 0.43, + "grad_norm": 0.5524995923042297, + "learning_rate": 0.0005704374515078853, + "loss": 3.2966, + "step": 8723 + }, + { + "epoch": 0.43, + "grad_norm": 0.5177032351493835, + "learning_rate": 0.0005704307866609971, + "loss": 3.4132, + "step": 8724 + }, + { + "epoch": 0.43, + "grad_norm": 0.5006693005561829, + "learning_rate": 0.0005704241211018464, + "loss": 3.2054, + "step": 8725 + }, + { + "epoch": 0.43, + "grad_norm": 0.5278719663619995, + "learning_rate": 0.0005704174548304506, + "loss": 3.2603, + "step": 8726 + }, + { + "epoch": 0.43, + "grad_norm": 0.4858294129371643, + "learning_rate": 0.0005704107878468272, + "loss": 3.3033, + "step": 8727 + }, + { + "epoch": 0.43, + "grad_norm": 0.5394824147224426, + "learning_rate": 0.0005704041201509939, + "loss": 3.3758, + "step": 8728 + }, + { + "epoch": 0.43, + "grad_norm": 0.5565510988235474, + "learning_rate": 0.0005703974517429681, + "loss": 3.0167, + "step": 8729 + }, + { + "epoch": 0.43, + "grad_norm": 0.533441960811615, + "learning_rate": 0.0005703907826227676, + "loss": 3.5257, + "step": 8730 + }, + { + "epoch": 0.43, + "grad_norm": 0.5071123242378235, + "learning_rate": 0.0005703841127904097, + "loss": 3.347, + "step": 8731 + }, + { + "epoch": 0.43, + "grad_norm": 0.5355621576309204, + "learning_rate": 0.000570377442245912, + "loss": 3.074, + "step": 8732 + }, + { + "epoch": 0.43, + "grad_norm": 0.5217567086219788, + "learning_rate": 0.0005703707709892923, + "loss": 3.4603, + "step": 8733 + }, + { + "epoch": 0.43, + "grad_norm": 0.5563116669654846, + "learning_rate": 0.0005703640990205681, + "loss": 3.2609, + "step": 8734 + }, + { + "epoch": 0.43, + "grad_norm": 0.5034108757972717, + "learning_rate": 0.0005703574263397566, + "loss": 2.9131, + "step": 8735 + }, + { + "epoch": 0.43, + "grad_norm": 0.5095783472061157, + "learning_rate": 0.000570350752946876, + "loss": 3.3354, + "step": 8736 + }, + { + "epoch": 0.43, + "grad_norm": 0.542126476764679, + "learning_rate": 0.0005703440788419435, + "loss": 3.272, + "step": 8737 + }, + { + "epoch": 0.43, + "grad_norm": 0.5572932362556458, + "learning_rate": 0.0005703374040249765, + "loss": 3.3902, + "step": 8738 + }, + { + "epoch": 0.43, + "grad_norm": 0.5626958012580872, + "learning_rate": 0.000570330728495993, + "loss": 3.2746, + "step": 8739 + }, + { + "epoch": 0.43, + "grad_norm": 0.5667341947555542, + "learning_rate": 0.0005703240522550102, + "loss": 3.3586, + "step": 8740 + }, + { + "epoch": 0.43, + "grad_norm": 0.556536078453064, + "learning_rate": 0.0005703173753020461, + "loss": 3.227, + "step": 8741 + }, + { + "epoch": 0.43, + "grad_norm": 0.530508816242218, + "learning_rate": 0.0005703106976371179, + "loss": 3.5442, + "step": 8742 + }, + { + "epoch": 0.43, + "grad_norm": 0.5346199870109558, + "learning_rate": 0.0005703040192602435, + "loss": 3.5053, + "step": 8743 + }, + { + "epoch": 0.43, + "grad_norm": 0.5241973996162415, + "learning_rate": 0.0005702973401714402, + "loss": 3.4193, + "step": 8744 + }, + { + "epoch": 0.43, + "grad_norm": 0.590114414691925, + "learning_rate": 0.0005702906603707256, + "loss": 3.1247, + "step": 8745 + }, + { + "epoch": 0.43, + "grad_norm": 0.5349878668785095, + "learning_rate": 0.0005702839798581176, + "loss": 3.3134, + "step": 8746 + }, + { + "epoch": 0.43, + "grad_norm": 0.598888099193573, + "learning_rate": 0.0005702772986336337, + "loss": 3.546, + "step": 8747 + }, + { + "epoch": 0.43, + "grad_norm": 0.4993423521518707, + "learning_rate": 0.0005702706166972912, + "loss": 3.3099, + "step": 8748 + }, + { + "epoch": 0.43, + "grad_norm": 0.5180084705352783, + "learning_rate": 0.000570263934049108, + "loss": 3.175, + "step": 8749 + }, + { + "epoch": 0.43, + "grad_norm": 0.5331127643585205, + "learning_rate": 0.0005702572506891017, + "loss": 3.0252, + "step": 8750 + }, + { + "epoch": 0.43, + "grad_norm": 0.5388214588165283, + "learning_rate": 0.0005702505666172897, + "loss": 3.2869, + "step": 8751 + }, + { + "epoch": 0.43, + "grad_norm": 0.5273463129997253, + "learning_rate": 0.0005702438818336897, + "loss": 3.2632, + "step": 8752 + }, + { + "epoch": 0.43, + "grad_norm": 0.5084626078605652, + "learning_rate": 0.0005702371963383194, + "loss": 3.3897, + "step": 8753 + }, + { + "epoch": 0.43, + "grad_norm": 0.5808398127555847, + "learning_rate": 0.0005702305101311963, + "loss": 3.3616, + "step": 8754 + }, + { + "epoch": 0.43, + "grad_norm": 0.5917160511016846, + "learning_rate": 0.000570223823212338, + "loss": 3.1535, + "step": 8755 + }, + { + "epoch": 0.43, + "grad_norm": 0.5147940516471863, + "learning_rate": 0.0005702171355817623, + "loss": 3.1924, + "step": 8756 + }, + { + "epoch": 0.43, + "grad_norm": 0.5295524597167969, + "learning_rate": 0.0005702104472394866, + "loss": 3.2706, + "step": 8757 + }, + { + "epoch": 0.43, + "grad_norm": 0.5461099743843079, + "learning_rate": 0.0005702037581855285, + "loss": 3.4574, + "step": 8758 + }, + { + "epoch": 0.43, + "grad_norm": 0.5407048463821411, + "learning_rate": 0.0005701970684199057, + "loss": 3.1396, + "step": 8759 + }, + { + "epoch": 0.43, + "grad_norm": 0.5329182744026184, + "learning_rate": 0.000570190377942636, + "loss": 3.1328, + "step": 8760 + }, + { + "epoch": 0.43, + "grad_norm": 0.5270434617996216, + "learning_rate": 0.0005701836867537367, + "loss": 3.4657, + "step": 8761 + }, + { + "epoch": 0.43, + "grad_norm": 0.5661664009094238, + "learning_rate": 0.0005701769948532257, + "loss": 3.1477, + "step": 8762 + }, + { + "epoch": 0.43, + "grad_norm": 0.526901364326477, + "learning_rate": 0.0005701703022411203, + "loss": 3.3944, + "step": 8763 + }, + { + "epoch": 0.43, + "grad_norm": 0.5473800897598267, + "learning_rate": 0.0005701636089174384, + "loss": 3.4588, + "step": 8764 + }, + { + "epoch": 0.43, + "grad_norm": 0.5030317902565002, + "learning_rate": 0.0005701569148821976, + "loss": 3.2712, + "step": 8765 + }, + { + "epoch": 0.43, + "grad_norm": 0.5436576008796692, + "learning_rate": 0.0005701502201354154, + "loss": 3.2773, + "step": 8766 + }, + { + "epoch": 0.43, + "grad_norm": 0.5438995361328125, + "learning_rate": 0.0005701435246771095, + "loss": 3.1112, + "step": 8767 + }, + { + "epoch": 0.43, + "grad_norm": 0.51812344789505, + "learning_rate": 0.0005701368285072977, + "loss": 3.4097, + "step": 8768 + }, + { + "epoch": 0.43, + "grad_norm": 0.5479652285575867, + "learning_rate": 0.0005701301316259973, + "loss": 3.1205, + "step": 8769 + }, + { + "epoch": 0.43, + "grad_norm": 0.5035589337348938, + "learning_rate": 0.0005701234340332262, + "loss": 3.3524, + "step": 8770 + }, + { + "epoch": 0.43, + "grad_norm": 0.5446529388427734, + "learning_rate": 0.000570116735729002, + "loss": 3.2749, + "step": 8771 + }, + { + "epoch": 0.43, + "grad_norm": 0.5573705434799194, + "learning_rate": 0.0005701100367133422, + "loss": 3.4812, + "step": 8772 + }, + { + "epoch": 0.43, + "grad_norm": 0.517190158367157, + "learning_rate": 0.0005701033369862647, + "loss": 3.1171, + "step": 8773 + }, + { + "epoch": 0.43, + "grad_norm": 0.5003644824028015, + "learning_rate": 0.0005700966365477869, + "loss": 3.3361, + "step": 8774 + }, + { + "epoch": 0.43, + "grad_norm": 0.6054016947746277, + "learning_rate": 0.0005700899353979265, + "loss": 3.0853, + "step": 8775 + }, + { + "epoch": 0.43, + "grad_norm": 0.46385738253593445, + "learning_rate": 0.0005700832335367012, + "loss": 3.4381, + "step": 8776 + }, + { + "epoch": 0.43, + "grad_norm": 0.495850145816803, + "learning_rate": 0.0005700765309641287, + "loss": 3.2427, + "step": 8777 + }, + { + "epoch": 0.43, + "grad_norm": 0.5276760458946228, + "learning_rate": 0.0005700698276802266, + "loss": 3.2379, + "step": 8778 + }, + { + "epoch": 0.43, + "grad_norm": 0.5193692445755005, + "learning_rate": 0.0005700631236850124, + "loss": 3.1412, + "step": 8779 + }, + { + "epoch": 0.43, + "grad_norm": 0.537988543510437, + "learning_rate": 0.0005700564189785041, + "loss": 3.1968, + "step": 8780 + }, + { + "epoch": 0.43, + "grad_norm": 0.5210827589035034, + "learning_rate": 0.0005700497135607191, + "loss": 3.3336, + "step": 8781 + }, + { + "epoch": 0.43, + "grad_norm": 0.5374914407730103, + "learning_rate": 0.0005700430074316751, + "loss": 3.2019, + "step": 8782 + }, + { + "epoch": 0.43, + "grad_norm": 0.5302662253379822, + "learning_rate": 0.0005700363005913898, + "loss": 3.2672, + "step": 8783 + }, + { + "epoch": 0.43, + "grad_norm": 0.5213940143585205, + "learning_rate": 0.0005700295930398809, + "loss": 3.3535, + "step": 8784 + }, + { + "epoch": 0.43, + "grad_norm": 0.509855329990387, + "learning_rate": 0.000570022884777166, + "loss": 3.3465, + "step": 8785 + }, + { + "epoch": 0.43, + "grad_norm": 0.5186008214950562, + "learning_rate": 0.0005700161758032628, + "loss": 3.3766, + "step": 8786 + }, + { + "epoch": 0.43, + "grad_norm": 0.558765709400177, + "learning_rate": 0.0005700094661181889, + "loss": 3.1944, + "step": 8787 + }, + { + "epoch": 0.43, + "grad_norm": 0.5422574877738953, + "learning_rate": 0.000570002755721962, + "loss": 3.2641, + "step": 8788 + }, + { + "epoch": 0.43, + "grad_norm": 0.5169627070426941, + "learning_rate": 0.0005699960446145999, + "loss": 3.2564, + "step": 8789 + }, + { + "epoch": 0.43, + "grad_norm": 0.5492352843284607, + "learning_rate": 0.0005699893327961201, + "loss": 3.0885, + "step": 8790 + }, + { + "epoch": 0.43, + "grad_norm": 0.553865909576416, + "learning_rate": 0.0005699826202665405, + "loss": 3.4303, + "step": 8791 + }, + { + "epoch": 0.43, + "grad_norm": 0.5586493611335754, + "learning_rate": 0.0005699759070258785, + "loss": 3.1598, + "step": 8792 + }, + { + "epoch": 0.43, + "grad_norm": 0.5349708199501038, + "learning_rate": 0.000569969193074152, + "loss": 3.3447, + "step": 8793 + }, + { + "epoch": 0.43, + "grad_norm": 0.5063890814781189, + "learning_rate": 0.0005699624784113785, + "loss": 3.3572, + "step": 8794 + }, + { + "epoch": 0.43, + "grad_norm": 0.5007393956184387, + "learning_rate": 0.0005699557630375759, + "loss": 3.3862, + "step": 8795 + }, + { + "epoch": 0.43, + "grad_norm": 0.4956451654434204, + "learning_rate": 0.0005699490469527617, + "loss": 3.3052, + "step": 8796 + }, + { + "epoch": 0.43, + "grad_norm": 0.5122515559196472, + "learning_rate": 0.0005699423301569536, + "loss": 3.3706, + "step": 8797 + }, + { + "epoch": 0.43, + "grad_norm": 0.5375229716300964, + "learning_rate": 0.0005699356126501695, + "loss": 3.3361, + "step": 8798 + }, + { + "epoch": 0.43, + "grad_norm": 0.48476994037628174, + "learning_rate": 0.0005699288944324268, + "loss": 3.3599, + "step": 8799 + }, + { + "epoch": 0.43, + "grad_norm": 0.5187641978263855, + "learning_rate": 0.0005699221755037435, + "loss": 3.2034, + "step": 8800 + }, + { + "epoch": 0.43, + "grad_norm": 0.5315556526184082, + "learning_rate": 0.000569915455864137, + "loss": 3.5175, + "step": 8801 + }, + { + "epoch": 0.43, + "grad_norm": 0.5142885446548462, + "learning_rate": 0.0005699087355136252, + "loss": 3.1606, + "step": 8802 + }, + { + "epoch": 0.43, + "grad_norm": 0.5275764465332031, + "learning_rate": 0.0005699020144522257, + "loss": 3.4446, + "step": 8803 + }, + { + "epoch": 0.43, + "grad_norm": 0.5340292453765869, + "learning_rate": 0.0005698952926799563, + "loss": 3.3616, + "step": 8804 + }, + { + "epoch": 0.43, + "grad_norm": 0.50909823179245, + "learning_rate": 0.0005698885701968347, + "loss": 3.388, + "step": 8805 + }, + { + "epoch": 0.43, + "grad_norm": 0.5391430258750916, + "learning_rate": 0.0005698818470028784, + "loss": 3.2484, + "step": 8806 + }, + { + "epoch": 0.43, + "grad_norm": 0.5389807224273682, + "learning_rate": 0.0005698751230981053, + "loss": 3.1581, + "step": 8807 + }, + { + "epoch": 0.43, + "grad_norm": 0.4988233745098114, + "learning_rate": 0.0005698683984825331, + "loss": 3.0371, + "step": 8808 + }, + { + "epoch": 0.43, + "grad_norm": 0.5417897701263428, + "learning_rate": 0.0005698616731561794, + "loss": 3.3176, + "step": 8809 + }, + { + "epoch": 0.43, + "grad_norm": 0.5258243680000305, + "learning_rate": 0.0005698549471190621, + "loss": 3.5196, + "step": 8810 + }, + { + "epoch": 0.43, + "grad_norm": 0.48488008975982666, + "learning_rate": 0.0005698482203711988, + "loss": 3.2598, + "step": 8811 + }, + { + "epoch": 0.43, + "grad_norm": 0.48625877499580383, + "learning_rate": 0.0005698414929126071, + "loss": 3.3822, + "step": 8812 + }, + { + "epoch": 0.43, + "grad_norm": 0.5102300047874451, + "learning_rate": 0.0005698347647433049, + "loss": 3.3212, + "step": 8813 + }, + { + "epoch": 0.43, + "grad_norm": 0.5392574667930603, + "learning_rate": 0.0005698280358633099, + "loss": 3.1794, + "step": 8814 + }, + { + "epoch": 0.43, + "grad_norm": 0.5915513038635254, + "learning_rate": 0.0005698213062726397, + "loss": 3.3302, + "step": 8815 + }, + { + "epoch": 0.43, + "grad_norm": 0.5354930758476257, + "learning_rate": 0.0005698145759713122, + "loss": 3.2355, + "step": 8816 + }, + { + "epoch": 0.43, + "grad_norm": 0.5436181426048279, + "learning_rate": 0.000569807844959345, + "loss": 3.3668, + "step": 8817 + }, + { + "epoch": 0.43, + "grad_norm": 0.5166468024253845, + "learning_rate": 0.0005698011132367558, + "loss": 3.1715, + "step": 8818 + }, + { + "epoch": 0.43, + "grad_norm": 0.5697653889656067, + "learning_rate": 0.0005697943808035625, + "loss": 3.271, + "step": 8819 + }, + { + "epoch": 0.43, + "grad_norm": 0.5263750553131104, + "learning_rate": 0.0005697876476597826, + "loss": 3.5375, + "step": 8820 + }, + { + "epoch": 0.43, + "grad_norm": 0.5435929894447327, + "learning_rate": 0.0005697809138054341, + "loss": 3.3333, + "step": 8821 + }, + { + "epoch": 0.43, + "grad_norm": 0.5712611079216003, + "learning_rate": 0.0005697741792405344, + "loss": 3.453, + "step": 8822 + }, + { + "epoch": 0.43, + "grad_norm": 0.5347647666931152, + "learning_rate": 0.0005697674439651017, + "loss": 3.164, + "step": 8823 + }, + { + "epoch": 0.43, + "grad_norm": 0.549929141998291, + "learning_rate": 0.0005697607079791533, + "loss": 3.4472, + "step": 8824 + }, + { + "epoch": 0.43, + "grad_norm": 0.5421603918075562, + "learning_rate": 0.0005697539712827071, + "loss": 3.0966, + "step": 8825 + }, + { + "epoch": 0.43, + "grad_norm": 0.5252081751823425, + "learning_rate": 0.0005697472338757808, + "loss": 3.1905, + "step": 8826 + }, + { + "epoch": 0.43, + "grad_norm": 0.5157675743103027, + "learning_rate": 0.0005697404957583923, + "loss": 3.2351, + "step": 8827 + }, + { + "epoch": 0.43, + "grad_norm": 0.5952437520027161, + "learning_rate": 0.0005697337569305594, + "loss": 3.0903, + "step": 8828 + }, + { + "epoch": 0.43, + "grad_norm": 0.5244645476341248, + "learning_rate": 0.0005697270173922994, + "loss": 3.2001, + "step": 8829 + }, + { + "epoch": 0.43, + "grad_norm": 0.5517503619194031, + "learning_rate": 0.0005697202771436305, + "loss": 3.1902, + "step": 8830 + }, + { + "epoch": 0.43, + "grad_norm": 0.5599244236946106, + "learning_rate": 0.0005697135361845703, + "loss": 3.2742, + "step": 8831 + }, + { + "epoch": 0.43, + "grad_norm": 0.5345979332923889, + "learning_rate": 0.0005697067945151365, + "loss": 3.5683, + "step": 8832 + }, + { + "epoch": 0.43, + "grad_norm": 0.5085006356239319, + "learning_rate": 0.000569700052135347, + "loss": 3.2911, + "step": 8833 + }, + { + "epoch": 0.43, + "grad_norm": 0.5625810027122498, + "learning_rate": 0.0005696933090452193, + "loss": 3.1879, + "step": 8834 + }, + { + "epoch": 0.43, + "grad_norm": 0.5771826505661011, + "learning_rate": 0.0005696865652447715, + "loss": 3.2465, + "step": 8835 + }, + { + "epoch": 0.43, + "grad_norm": 0.5013545155525208, + "learning_rate": 0.0005696798207340211, + "loss": 3.1855, + "step": 8836 + }, + { + "epoch": 0.43, + "grad_norm": 0.5615394115447998, + "learning_rate": 0.000569673075512986, + "loss": 3.4138, + "step": 8837 + }, + { + "epoch": 0.43, + "grad_norm": 0.5521506071090698, + "learning_rate": 0.0005696663295816839, + "loss": 3.1211, + "step": 8838 + }, + { + "epoch": 0.43, + "grad_norm": 0.713085412979126, + "learning_rate": 0.0005696595829401325, + "loss": 3.1726, + "step": 8839 + }, + { + "epoch": 0.43, + "grad_norm": 0.5880343914031982, + "learning_rate": 0.0005696528355883497, + "loss": 3.3201, + "step": 8840 + }, + { + "epoch": 0.43, + "grad_norm": 0.6011033654212952, + "learning_rate": 0.0005696460875263534, + "loss": 3.243, + "step": 8841 + }, + { + "epoch": 0.43, + "grad_norm": 0.5444574356079102, + "learning_rate": 0.000569639338754161, + "loss": 3.5191, + "step": 8842 + }, + { + "epoch": 0.43, + "grad_norm": 0.5401180386543274, + "learning_rate": 0.0005696325892717906, + "loss": 3.2096, + "step": 8843 + }, + { + "epoch": 0.43, + "grad_norm": 0.5681575536727905, + "learning_rate": 0.0005696258390792598, + "loss": 3.2179, + "step": 8844 + }, + { + "epoch": 0.43, + "grad_norm": 0.505577027797699, + "learning_rate": 0.0005696190881765864, + "loss": 3.5161, + "step": 8845 + }, + { + "epoch": 0.43, + "grad_norm": 0.5092687606811523, + "learning_rate": 0.0005696123365637882, + "loss": 3.3775, + "step": 8846 + }, + { + "epoch": 0.43, + "grad_norm": 0.4876072108745575, + "learning_rate": 0.000569605584240883, + "loss": 3.3778, + "step": 8847 + }, + { + "epoch": 0.43, + "grad_norm": 0.5532077550888062, + "learning_rate": 0.0005695988312078886, + "loss": 3.2693, + "step": 8848 + }, + { + "epoch": 0.43, + "grad_norm": 0.5383113622665405, + "learning_rate": 0.0005695920774648227, + "loss": 3.2532, + "step": 8849 + }, + { + "epoch": 0.43, + "grad_norm": 0.5278736352920532, + "learning_rate": 0.0005695853230117033, + "loss": 3.286, + "step": 8850 + }, + { + "epoch": 0.43, + "grad_norm": 0.5102056264877319, + "learning_rate": 0.000569578567848548, + "loss": 3.493, + "step": 8851 + }, + { + "epoch": 0.43, + "grad_norm": 0.5252404808998108, + "learning_rate": 0.0005695718119753746, + "loss": 3.2592, + "step": 8852 + }, + { + "epoch": 0.43, + "grad_norm": 0.5343819856643677, + "learning_rate": 0.0005695650553922009, + "loss": 3.2568, + "step": 8853 + }, + { + "epoch": 0.43, + "grad_norm": 0.5407352447509766, + "learning_rate": 0.0005695582980990448, + "loss": 3.2887, + "step": 8854 + }, + { + "epoch": 0.43, + "grad_norm": 0.5392612218856812, + "learning_rate": 0.0005695515400959239, + "loss": 3.4683, + "step": 8855 + }, + { + "epoch": 0.43, + "grad_norm": 0.5008077025413513, + "learning_rate": 0.0005695447813828562, + "loss": 3.4033, + "step": 8856 + }, + { + "epoch": 0.43, + "grad_norm": 0.5120012164115906, + "learning_rate": 0.0005695380219598594, + "loss": 3.3343, + "step": 8857 + }, + { + "epoch": 0.43, + "grad_norm": 0.545326292514801, + "learning_rate": 0.0005695312618269513, + "loss": 3.2657, + "step": 8858 + }, + { + "epoch": 0.43, + "grad_norm": 0.550786018371582, + "learning_rate": 0.0005695245009841497, + "loss": 3.4312, + "step": 8859 + }, + { + "epoch": 0.43, + "grad_norm": 0.5332869291305542, + "learning_rate": 0.0005695177394314725, + "loss": 3.4009, + "step": 8860 + }, + { + "epoch": 0.43, + "grad_norm": 0.5279282927513123, + "learning_rate": 0.0005695109771689375, + "loss": 3.1493, + "step": 8861 + }, + { + "epoch": 0.43, + "grad_norm": 0.5188350081443787, + "learning_rate": 0.0005695042141965624, + "loss": 3.4996, + "step": 8862 + }, + { + "epoch": 0.43, + "grad_norm": 0.5265781283378601, + "learning_rate": 0.000569497450514365, + "loss": 3.4578, + "step": 8863 + }, + { + "epoch": 0.43, + "grad_norm": 0.5063434839248657, + "learning_rate": 0.0005694906861223632, + "loss": 3.4383, + "step": 8864 + }, + { + "epoch": 0.43, + "grad_norm": 0.5559127926826477, + "learning_rate": 0.0005694839210205749, + "loss": 3.2548, + "step": 8865 + }, + { + "epoch": 0.43, + "grad_norm": 0.5212278962135315, + "learning_rate": 0.0005694771552090177, + "loss": 3.1999, + "step": 8866 + }, + { + "epoch": 0.43, + "grad_norm": 0.5368652939796448, + "learning_rate": 0.0005694703886877097, + "loss": 3.1888, + "step": 8867 + }, + { + "epoch": 0.43, + "grad_norm": 0.5175959467887878, + "learning_rate": 0.0005694636214566684, + "loss": 3.3606, + "step": 8868 + }, + { + "epoch": 0.43, + "grad_norm": 0.5428808927536011, + "learning_rate": 0.0005694568535159118, + "loss": 3.2056, + "step": 8869 + }, + { + "epoch": 0.43, + "grad_norm": 0.5059577822685242, + "learning_rate": 0.0005694500848654578, + "loss": 3.1956, + "step": 8870 + }, + { + "epoch": 0.43, + "grad_norm": 0.5455737113952637, + "learning_rate": 0.000569443315505324, + "loss": 3.5257, + "step": 8871 + }, + { + "epoch": 0.43, + "grad_norm": 0.5467769503593445, + "learning_rate": 0.0005694365454355284, + "loss": 3.4259, + "step": 8872 + }, + { + "epoch": 0.43, + "grad_norm": 0.5153681635856628, + "learning_rate": 0.0005694297746560888, + "loss": 3.5655, + "step": 8873 + }, + { + "epoch": 0.43, + "grad_norm": 0.4906521141529083, + "learning_rate": 0.0005694230031670231, + "loss": 3.5068, + "step": 8874 + }, + { + "epoch": 0.43, + "grad_norm": 0.5056790113449097, + "learning_rate": 0.0005694162309683489, + "loss": 3.2567, + "step": 8875 + }, + { + "epoch": 0.43, + "grad_norm": 0.5721079111099243, + "learning_rate": 0.0005694094580600843, + "loss": 3.158, + "step": 8876 + }, + { + "epoch": 0.44, + "grad_norm": 0.5457658767700195, + "learning_rate": 0.0005694026844422471, + "loss": 3.2658, + "step": 8877 + }, + { + "epoch": 0.44, + "grad_norm": 0.5211271047592163, + "learning_rate": 0.000569395910114855, + "loss": 3.3445, + "step": 8878 + }, + { + "epoch": 0.44, + "grad_norm": 0.5187593102455139, + "learning_rate": 0.000569389135077926, + "loss": 3.2324, + "step": 8879 + }, + { + "epoch": 0.44, + "grad_norm": 0.49315252900123596, + "learning_rate": 0.0005693823593314778, + "loss": 3.2866, + "step": 8880 + }, + { + "epoch": 0.44, + "grad_norm": 0.5174716711044312, + "learning_rate": 0.0005693755828755283, + "loss": 3.0987, + "step": 8881 + }, + { + "epoch": 0.44, + "grad_norm": 0.5270132422447205, + "learning_rate": 0.0005693688057100953, + "loss": 3.3338, + "step": 8882 + }, + { + "epoch": 0.44, + "grad_norm": 0.5709814429283142, + "learning_rate": 0.0005693620278351968, + "loss": 3.2527, + "step": 8883 + }, + { + "epoch": 0.44, + "grad_norm": 0.50661700963974, + "learning_rate": 0.0005693552492508505, + "loss": 3.1687, + "step": 8884 + }, + { + "epoch": 0.44, + "grad_norm": 0.5201055407524109, + "learning_rate": 0.0005693484699570744, + "loss": 3.2431, + "step": 8885 + }, + { + "epoch": 0.44, + "grad_norm": 0.5174275040626526, + "learning_rate": 0.0005693416899538861, + "loss": 3.1476, + "step": 8886 + }, + { + "epoch": 0.44, + "grad_norm": 0.5571829676628113, + "learning_rate": 0.0005693349092413038, + "loss": 3.349, + "step": 8887 + }, + { + "epoch": 0.44, + "grad_norm": 0.5371866226196289, + "learning_rate": 0.0005693281278193452, + "loss": 3.1509, + "step": 8888 + }, + { + "epoch": 0.44, + "grad_norm": 0.5208328366279602, + "learning_rate": 0.000569321345688028, + "loss": 3.4085, + "step": 8889 + }, + { + "epoch": 0.44, + "grad_norm": 0.5333773493766785, + "learning_rate": 0.0005693145628473703, + "loss": 3.3804, + "step": 8890 + }, + { + "epoch": 0.44, + "grad_norm": 0.5705960392951965, + "learning_rate": 0.0005693077792973899, + "loss": 3.4182, + "step": 8891 + }, + { + "epoch": 0.44, + "grad_norm": 0.5110260844230652, + "learning_rate": 0.0005693009950381046, + "loss": 3.2802, + "step": 8892 + }, + { + "epoch": 0.44, + "grad_norm": 0.4911125600337982, + "learning_rate": 0.0005692942100695322, + "loss": 3.244, + "step": 8893 + }, + { + "epoch": 0.44, + "grad_norm": 0.6147732138633728, + "learning_rate": 0.0005692874243916908, + "loss": 3.2172, + "step": 8894 + }, + { + "epoch": 0.44, + "grad_norm": 0.5848454236984253, + "learning_rate": 0.0005692806380045981, + "loss": 3.4469, + "step": 8895 + }, + { + "epoch": 0.44, + "grad_norm": 0.5394555330276489, + "learning_rate": 0.000569273850908272, + "loss": 3.4887, + "step": 8896 + }, + { + "epoch": 0.44, + "grad_norm": 0.5056954622268677, + "learning_rate": 0.0005692670631027304, + "loss": 3.3544, + "step": 8897 + }, + { + "epoch": 0.44, + "grad_norm": 0.4951151907444, + "learning_rate": 0.0005692602745879913, + "loss": 3.2587, + "step": 8898 + }, + { + "epoch": 0.44, + "grad_norm": 0.5151358246803284, + "learning_rate": 0.0005692534853640723, + "loss": 3.3862, + "step": 8899 + }, + { + "epoch": 0.44, + "grad_norm": 0.5135519504547119, + "learning_rate": 0.0005692466954309915, + "loss": 3.2097, + "step": 8900 + }, + { + "epoch": 0.44, + "grad_norm": 0.5229550004005432, + "learning_rate": 0.0005692399047887667, + "loss": 3.291, + "step": 8901 + }, + { + "epoch": 0.44, + "grad_norm": 0.5192024111747742, + "learning_rate": 0.000569233113437416, + "loss": 3.2481, + "step": 8902 + }, + { + "epoch": 0.44, + "grad_norm": 0.5763697624206543, + "learning_rate": 0.0005692263213769569, + "loss": 3.1955, + "step": 8903 + }, + { + "epoch": 0.44, + "grad_norm": 0.5331725478172302, + "learning_rate": 0.0005692195286074075, + "loss": 3.2931, + "step": 8904 + }, + { + "epoch": 0.44, + "grad_norm": 0.5367273092269897, + "learning_rate": 0.0005692127351287857, + "loss": 3.2269, + "step": 8905 + }, + { + "epoch": 0.44, + "grad_norm": 0.5095797777175903, + "learning_rate": 0.0005692059409411094, + "loss": 3.3072, + "step": 8906 + }, + { + "epoch": 0.44, + "grad_norm": 0.5251290202140808, + "learning_rate": 0.0005691991460443964, + "loss": 3.3444, + "step": 8907 + }, + { + "epoch": 0.44, + "grad_norm": 0.5581363439559937, + "learning_rate": 0.0005691923504386646, + "loss": 3.3821, + "step": 8908 + }, + { + "epoch": 0.44, + "grad_norm": 0.536482572555542, + "learning_rate": 0.000569185554123932, + "loss": 3.3329, + "step": 8909 + }, + { + "epoch": 0.44, + "grad_norm": 0.5598181486129761, + "learning_rate": 0.0005691787571002165, + "loss": 3.3854, + "step": 8910 + }, + { + "epoch": 0.44, + "grad_norm": 0.5411391854286194, + "learning_rate": 0.000569171959367536, + "loss": 3.2254, + "step": 8911 + }, + { + "epoch": 0.44, + "grad_norm": 0.5239635109901428, + "learning_rate": 0.0005691651609259083, + "loss": 3.4488, + "step": 8912 + }, + { + "epoch": 0.44, + "grad_norm": 0.5380442142486572, + "learning_rate": 0.0005691583617753514, + "loss": 3.601, + "step": 8913 + }, + { + "epoch": 0.44, + "grad_norm": 0.5128017663955688, + "learning_rate": 0.0005691515619158831, + "loss": 3.39, + "step": 8914 + }, + { + "epoch": 0.44, + "grad_norm": 0.5668172240257263, + "learning_rate": 0.0005691447613475214, + "loss": 3.4607, + "step": 8915 + }, + { + "epoch": 0.44, + "grad_norm": 0.5155226588249207, + "learning_rate": 0.0005691379600702843, + "loss": 3.1785, + "step": 8916 + }, + { + "epoch": 0.44, + "grad_norm": 0.53493332862854, + "learning_rate": 0.0005691311580841896, + "loss": 3.3619, + "step": 8917 + }, + { + "epoch": 0.44, + "grad_norm": 0.5263345241546631, + "learning_rate": 0.0005691243553892551, + "loss": 3.3008, + "step": 8918 + }, + { + "epoch": 0.44, + "grad_norm": 0.5928928852081299, + "learning_rate": 0.0005691175519854989, + "loss": 3.1524, + "step": 8919 + }, + { + "epoch": 0.44, + "grad_norm": 0.5617950558662415, + "learning_rate": 0.0005691107478729389, + "loss": 3.0835, + "step": 8920 + }, + { + "epoch": 0.44, + "grad_norm": 0.5107870697975159, + "learning_rate": 0.0005691039430515929, + "loss": 3.248, + "step": 8921 + }, + { + "epoch": 0.44, + "grad_norm": 0.5091202259063721, + "learning_rate": 0.000569097137521479, + "loss": 3.3436, + "step": 8922 + }, + { + "epoch": 0.44, + "grad_norm": 0.5600295066833496, + "learning_rate": 0.000569090331282615, + "loss": 3.1758, + "step": 8923 + }, + { + "epoch": 0.44, + "grad_norm": 0.5212362408638, + "learning_rate": 0.0005690835243350188, + "loss": 3.4895, + "step": 8924 + }, + { + "epoch": 0.44, + "grad_norm": 0.5386121273040771, + "learning_rate": 0.0005690767166787084, + "loss": 3.277, + "step": 8925 + }, + { + "epoch": 0.44, + "grad_norm": 0.5397125482559204, + "learning_rate": 0.0005690699083137018, + "loss": 3.354, + "step": 8926 + }, + { + "epoch": 0.44, + "grad_norm": 0.5409417152404785, + "learning_rate": 0.0005690630992400168, + "loss": 3.2076, + "step": 8927 + }, + { + "epoch": 0.44, + "grad_norm": 0.5249380469322205, + "learning_rate": 0.0005690562894576713, + "loss": 3.3417, + "step": 8928 + }, + { + "epoch": 0.44, + "grad_norm": 0.4952598810195923, + "learning_rate": 0.0005690494789666834, + "loss": 3.2278, + "step": 8929 + }, + { + "epoch": 0.44, + "grad_norm": 0.5684417486190796, + "learning_rate": 0.0005690426677670709, + "loss": 3.0598, + "step": 8930 + }, + { + "epoch": 0.44, + "grad_norm": 0.5358139872550964, + "learning_rate": 0.0005690358558588519, + "loss": 3.4284, + "step": 8931 + }, + { + "epoch": 0.44, + "grad_norm": 0.5555673241615295, + "learning_rate": 0.0005690290432420441, + "loss": 3.2636, + "step": 8932 + }, + { + "epoch": 0.44, + "grad_norm": 0.5253645777702332, + "learning_rate": 0.0005690222299166656, + "loss": 3.4941, + "step": 8933 + }, + { + "epoch": 0.44, + "grad_norm": 0.5221039056777954, + "learning_rate": 0.0005690154158827344, + "loss": 3.3307, + "step": 8934 + }, + { + "epoch": 0.44, + "grad_norm": 0.5121043920516968, + "learning_rate": 0.0005690086011402683, + "loss": 3.2696, + "step": 8935 + }, + { + "epoch": 0.44, + "grad_norm": 0.5969629883766174, + "learning_rate": 0.0005690017856892853, + "loss": 3.1725, + "step": 8936 + }, + { + "epoch": 0.44, + "grad_norm": 0.5254464745521545, + "learning_rate": 0.0005689949695298034, + "loss": 3.3222, + "step": 8937 + }, + { + "epoch": 0.44, + "grad_norm": 0.5058313608169556, + "learning_rate": 0.0005689881526618405, + "loss": 3.3466, + "step": 8938 + }, + { + "epoch": 0.44, + "grad_norm": 0.49000418186187744, + "learning_rate": 0.0005689813350854147, + "loss": 3.4226, + "step": 8939 + }, + { + "epoch": 0.44, + "grad_norm": 0.5364489555358887, + "learning_rate": 0.0005689745168005437, + "loss": 3.3661, + "step": 8940 + }, + { + "epoch": 0.44, + "grad_norm": 0.5190027952194214, + "learning_rate": 0.0005689676978072456, + "loss": 3.2002, + "step": 8941 + }, + { + "epoch": 0.44, + "grad_norm": 0.5191941261291504, + "learning_rate": 0.0005689608781055383, + "loss": 3.0918, + "step": 8942 + }, + { + "epoch": 0.44, + "grad_norm": 0.5378019213676453, + "learning_rate": 0.0005689540576954399, + "loss": 3.3341, + "step": 8943 + }, + { + "epoch": 0.44, + "grad_norm": 0.6198469400405884, + "learning_rate": 0.0005689472365769683, + "loss": 3.3351, + "step": 8944 + }, + { + "epoch": 0.44, + "grad_norm": 0.6274257302284241, + "learning_rate": 0.0005689404147501413, + "loss": 3.1836, + "step": 8945 + }, + { + "epoch": 0.44, + "grad_norm": 0.5766898393630981, + "learning_rate": 0.000568933592214977, + "loss": 3.3648, + "step": 8946 + }, + { + "epoch": 0.44, + "grad_norm": 0.5012130737304688, + "learning_rate": 0.0005689267689714934, + "loss": 3.3627, + "step": 8947 + }, + { + "epoch": 0.44, + "grad_norm": 0.5235632061958313, + "learning_rate": 0.0005689199450197086, + "loss": 3.3629, + "step": 8948 + }, + { + "epoch": 0.44, + "grad_norm": 0.4902224838733673, + "learning_rate": 0.0005689131203596404, + "loss": 3.4249, + "step": 8949 + }, + { + "epoch": 0.44, + "grad_norm": 0.555653989315033, + "learning_rate": 0.0005689062949913067, + "loss": 3.2133, + "step": 8950 + }, + { + "epoch": 0.44, + "grad_norm": 0.49776995182037354, + "learning_rate": 0.0005688994689147256, + "loss": 3.3663, + "step": 8951 + }, + { + "epoch": 0.44, + "grad_norm": 0.5030352473258972, + "learning_rate": 0.000568892642129915, + "loss": 3.1925, + "step": 8952 + }, + { + "epoch": 0.44, + "grad_norm": 0.4929159879684448, + "learning_rate": 0.000568885814636893, + "loss": 3.2832, + "step": 8953 + }, + { + "epoch": 0.44, + "grad_norm": 0.5239042639732361, + "learning_rate": 0.0005688789864356775, + "loss": 3.2955, + "step": 8954 + }, + { + "epoch": 0.44, + "grad_norm": 0.5698967576026917, + "learning_rate": 0.0005688721575262865, + "loss": 3.205, + "step": 8955 + }, + { + "epoch": 0.44, + "grad_norm": 0.5118188858032227, + "learning_rate": 0.0005688653279087379, + "loss": 3.1984, + "step": 8956 + }, + { + "epoch": 0.44, + "grad_norm": 0.6158947944641113, + "learning_rate": 0.0005688584975830498, + "loss": 3.1382, + "step": 8957 + }, + { + "epoch": 0.44, + "grad_norm": 0.571174144744873, + "learning_rate": 0.0005688516665492403, + "loss": 3.1952, + "step": 8958 + }, + { + "epoch": 0.44, + "grad_norm": 0.5016655921936035, + "learning_rate": 0.0005688448348073271, + "loss": 3.2333, + "step": 8959 + }, + { + "epoch": 0.44, + "grad_norm": 0.5244669914245605, + "learning_rate": 0.0005688380023573284, + "loss": 3.3077, + "step": 8960 + }, + { + "epoch": 0.44, + "grad_norm": 0.542945146560669, + "learning_rate": 0.0005688311691992621, + "loss": 3.1278, + "step": 8961 + }, + { + "epoch": 0.44, + "grad_norm": 0.5659850835800171, + "learning_rate": 0.0005688243353331462, + "loss": 3.2479, + "step": 8962 + }, + { + "epoch": 0.44, + "grad_norm": 0.5096282362937927, + "learning_rate": 0.0005688175007589989, + "loss": 3.3169, + "step": 8963 + }, + { + "epoch": 0.44, + "grad_norm": 0.5422225594520569, + "learning_rate": 0.0005688106654768379, + "loss": 3.412, + "step": 8964 + }, + { + "epoch": 0.44, + "grad_norm": 0.5580344200134277, + "learning_rate": 0.0005688038294866814, + "loss": 3.2899, + "step": 8965 + }, + { + "epoch": 0.44, + "grad_norm": 0.5104421973228455, + "learning_rate": 0.0005687969927885474, + "loss": 3.1282, + "step": 8966 + }, + { + "epoch": 0.44, + "grad_norm": 0.5075674057006836, + "learning_rate": 0.0005687901553824537, + "loss": 3.4159, + "step": 8967 + }, + { + "epoch": 0.44, + "grad_norm": 0.5359368324279785, + "learning_rate": 0.0005687833172684186, + "loss": 3.4478, + "step": 8968 + }, + { + "epoch": 0.44, + "grad_norm": 0.5409534573554993, + "learning_rate": 0.0005687764784464599, + "loss": 3.3085, + "step": 8969 + }, + { + "epoch": 0.44, + "grad_norm": 0.5197051763534546, + "learning_rate": 0.0005687696389165956, + "loss": 3.0945, + "step": 8970 + }, + { + "epoch": 0.44, + "grad_norm": 0.5289903283119202, + "learning_rate": 0.000568762798678844, + "loss": 3.4001, + "step": 8971 + }, + { + "epoch": 0.44, + "grad_norm": 0.5642632842063904, + "learning_rate": 0.0005687559577332228, + "loss": 3.5348, + "step": 8972 + }, + { + "epoch": 0.44, + "grad_norm": 0.6010560989379883, + "learning_rate": 0.0005687491160797501, + "loss": 3.2148, + "step": 8973 + }, + { + "epoch": 0.44, + "grad_norm": 0.5142990350723267, + "learning_rate": 0.000568742273718444, + "loss": 3.2754, + "step": 8974 + }, + { + "epoch": 0.44, + "grad_norm": 0.5444654822349548, + "learning_rate": 0.0005687354306493225, + "loss": 3.4776, + "step": 8975 + }, + { + "epoch": 0.44, + "grad_norm": 0.5075336694717407, + "learning_rate": 0.0005687285868724035, + "loss": 3.1729, + "step": 8976 + }, + { + "epoch": 0.44, + "grad_norm": 0.5050414204597473, + "learning_rate": 0.0005687217423877051, + "loss": 3.244, + "step": 8977 + }, + { + "epoch": 0.44, + "grad_norm": 0.49637553095817566, + "learning_rate": 0.0005687148971952455, + "loss": 3.3921, + "step": 8978 + }, + { + "epoch": 0.44, + "grad_norm": 0.5068024396896362, + "learning_rate": 0.0005687080512950426, + "loss": 3.228, + "step": 8979 + }, + { + "epoch": 0.44, + "grad_norm": 0.512059211730957, + "learning_rate": 0.0005687012046871143, + "loss": 3.2777, + "step": 8980 + }, + { + "epoch": 0.44, + "grad_norm": 0.5337809920310974, + "learning_rate": 0.0005686943573714787, + "loss": 3.4469, + "step": 8981 + }, + { + "epoch": 0.44, + "grad_norm": 0.5400496125221252, + "learning_rate": 0.0005686875093481539, + "loss": 3.2248, + "step": 8982 + }, + { + "epoch": 0.44, + "grad_norm": 0.5225552916526794, + "learning_rate": 0.0005686806606171579, + "loss": 3.4751, + "step": 8983 + }, + { + "epoch": 0.44, + "grad_norm": 0.5124357342720032, + "learning_rate": 0.0005686738111785088, + "loss": 3.1647, + "step": 8984 + }, + { + "epoch": 0.44, + "grad_norm": 0.5976647734642029, + "learning_rate": 0.0005686669610322246, + "loss": 3.3468, + "step": 8985 + }, + { + "epoch": 0.44, + "grad_norm": 0.5431848764419556, + "learning_rate": 0.0005686601101783232, + "loss": 3.3075, + "step": 8986 + }, + { + "epoch": 0.44, + "grad_norm": 0.522193193435669, + "learning_rate": 0.000568653258616823, + "loss": 3.2333, + "step": 8987 + }, + { + "epoch": 0.44, + "grad_norm": 0.5003888607025146, + "learning_rate": 0.0005686464063477416, + "loss": 3.4794, + "step": 8988 + }, + { + "epoch": 0.44, + "grad_norm": 0.5476301312446594, + "learning_rate": 0.0005686395533710973, + "loss": 3.1188, + "step": 8989 + }, + { + "epoch": 0.44, + "grad_norm": 0.5447575449943542, + "learning_rate": 0.0005686326996869083, + "loss": 3.3467, + "step": 8990 + }, + { + "epoch": 0.44, + "grad_norm": 0.5251429677009583, + "learning_rate": 0.0005686258452951923, + "loss": 3.2945, + "step": 8991 + }, + { + "epoch": 0.44, + "grad_norm": 0.5147581100463867, + "learning_rate": 0.0005686189901959675, + "loss": 3.4831, + "step": 8992 + }, + { + "epoch": 0.44, + "grad_norm": 0.5492880344390869, + "learning_rate": 0.000568612134389252, + "loss": 3.4656, + "step": 8993 + }, + { + "epoch": 0.44, + "grad_norm": 0.511838436126709, + "learning_rate": 0.000568605277875064, + "loss": 3.2636, + "step": 8994 + }, + { + "epoch": 0.44, + "grad_norm": 0.5325186848640442, + "learning_rate": 0.0005685984206534212, + "loss": 3.3853, + "step": 8995 + }, + { + "epoch": 0.44, + "grad_norm": 0.528266966342926, + "learning_rate": 0.000568591562724342, + "loss": 3.3297, + "step": 8996 + }, + { + "epoch": 0.44, + "grad_norm": 0.5761194229125977, + "learning_rate": 0.0005685847040878441, + "loss": 3.1765, + "step": 8997 + }, + { + "epoch": 0.44, + "grad_norm": 0.4952467978000641, + "learning_rate": 0.000568577844743946, + "loss": 3.4474, + "step": 8998 + }, + { + "epoch": 0.44, + "grad_norm": 0.5307065844535828, + "learning_rate": 0.0005685709846926654, + "loss": 3.3741, + "step": 8999 + }, + { + "epoch": 0.44, + "grad_norm": 0.5305284857749939, + "learning_rate": 0.0005685641239340206, + "loss": 3.2478, + "step": 9000 + }, + { + "epoch": 0.44, + "grad_norm": 0.5165554881095886, + "learning_rate": 0.0005685572624680295, + "loss": 3.1295, + "step": 9001 + }, + { + "epoch": 0.44, + "grad_norm": 0.5124072432518005, + "learning_rate": 0.0005685504002947104, + "loss": 3.3352, + "step": 9002 + }, + { + "epoch": 0.44, + "grad_norm": 0.5218518376350403, + "learning_rate": 0.0005685435374140811, + "loss": 3.2078, + "step": 9003 + }, + { + "epoch": 0.44, + "grad_norm": 0.574737548828125, + "learning_rate": 0.0005685366738261598, + "loss": 3.2878, + "step": 9004 + }, + { + "epoch": 0.44, + "grad_norm": 0.5492660403251648, + "learning_rate": 0.0005685298095309646, + "loss": 3.2467, + "step": 9005 + }, + { + "epoch": 0.44, + "grad_norm": 0.5594704151153564, + "learning_rate": 0.0005685229445285137, + "loss": 3.4172, + "step": 9006 + }, + { + "epoch": 0.44, + "grad_norm": 0.5028058290481567, + "learning_rate": 0.000568516078818825, + "loss": 3.3881, + "step": 9007 + }, + { + "epoch": 0.44, + "grad_norm": 0.5461245179176331, + "learning_rate": 0.0005685092124019165, + "loss": 3.5241, + "step": 9008 + }, + { + "epoch": 0.44, + "grad_norm": 0.5365086793899536, + "learning_rate": 0.0005685023452778065, + "loss": 3.3182, + "step": 9009 + }, + { + "epoch": 0.44, + "grad_norm": 0.530041515827179, + "learning_rate": 0.0005684954774465129, + "loss": 3.4351, + "step": 9010 + }, + { + "epoch": 0.44, + "grad_norm": 0.5450629591941833, + "learning_rate": 0.000568488608908054, + "loss": 3.386, + "step": 9011 + }, + { + "epoch": 0.44, + "grad_norm": 0.5048748254776001, + "learning_rate": 0.0005684817396624476, + "loss": 3.6001, + "step": 9012 + }, + { + "epoch": 0.44, + "grad_norm": 0.5259529948234558, + "learning_rate": 0.0005684748697097121, + "loss": 3.0533, + "step": 9013 + }, + { + "epoch": 0.44, + "grad_norm": 0.5435417294502258, + "learning_rate": 0.0005684679990498655, + "loss": 3.4664, + "step": 9014 + }, + { + "epoch": 0.44, + "grad_norm": 0.5544543266296387, + "learning_rate": 0.0005684611276829259, + "loss": 3.151, + "step": 9015 + }, + { + "epoch": 0.44, + "grad_norm": 0.536688506603241, + "learning_rate": 0.0005684542556089112, + "loss": 3.3264, + "step": 9016 + }, + { + "epoch": 0.44, + "grad_norm": 0.5237370133399963, + "learning_rate": 0.0005684473828278398, + "loss": 3.3703, + "step": 9017 + }, + { + "epoch": 0.44, + "grad_norm": 0.5127825140953064, + "learning_rate": 0.0005684405093397296, + "loss": 3.2351, + "step": 9018 + }, + { + "epoch": 0.44, + "grad_norm": 0.5225836634635925, + "learning_rate": 0.0005684336351445987, + "loss": 3.2658, + "step": 9019 + }, + { + "epoch": 0.44, + "grad_norm": 0.5963146686553955, + "learning_rate": 0.0005684267602424652, + "loss": 3.2942, + "step": 9020 + }, + { + "epoch": 0.44, + "grad_norm": 0.5185006260871887, + "learning_rate": 0.0005684198846333475, + "loss": 3.1149, + "step": 9021 + }, + { + "epoch": 0.44, + "grad_norm": 0.5315390229225159, + "learning_rate": 0.0005684130083172634, + "loss": 3.1857, + "step": 9022 + }, + { + "epoch": 0.44, + "grad_norm": 0.5717655420303345, + "learning_rate": 0.0005684061312942309, + "loss": 3.3572, + "step": 9023 + }, + { + "epoch": 0.44, + "grad_norm": 0.559047520160675, + "learning_rate": 0.0005683992535642686, + "loss": 3.3638, + "step": 9024 + }, + { + "epoch": 0.44, + "grad_norm": 0.5396157503128052, + "learning_rate": 0.0005683923751273941, + "loss": 3.2502, + "step": 9025 + }, + { + "epoch": 0.44, + "grad_norm": 0.5662112832069397, + "learning_rate": 0.0005683854959836259, + "loss": 3.244, + "step": 9026 + }, + { + "epoch": 0.44, + "grad_norm": 0.49871206283569336, + "learning_rate": 0.0005683786161329819, + "loss": 3.1904, + "step": 9027 + }, + { + "epoch": 0.44, + "grad_norm": 0.5973621606826782, + "learning_rate": 0.0005683717355754802, + "loss": 3.2459, + "step": 9028 + }, + { + "epoch": 0.44, + "grad_norm": 0.5342198014259338, + "learning_rate": 0.000568364854311139, + "loss": 3.1354, + "step": 9029 + }, + { + "epoch": 0.44, + "grad_norm": 0.54073566198349, + "learning_rate": 0.0005683579723399765, + "loss": 3.2937, + "step": 9030 + }, + { + "epoch": 0.44, + "grad_norm": 0.5744414329528809, + "learning_rate": 0.0005683510896620108, + "loss": 3.2309, + "step": 9031 + }, + { + "epoch": 0.44, + "grad_norm": 0.5294274687767029, + "learning_rate": 0.0005683442062772598, + "loss": 3.1342, + "step": 9032 + }, + { + "epoch": 0.44, + "grad_norm": 0.5062122941017151, + "learning_rate": 0.0005683373221857419, + "loss": 3.3708, + "step": 9033 + }, + { + "epoch": 0.44, + "grad_norm": 0.5301145315170288, + "learning_rate": 0.0005683304373874752, + "loss": 3.3389, + "step": 9034 + }, + { + "epoch": 0.44, + "grad_norm": 0.5067840814590454, + "learning_rate": 0.0005683235518824777, + "loss": 3.3437, + "step": 9035 + }, + { + "epoch": 0.44, + "grad_norm": 0.5595483183860779, + "learning_rate": 0.0005683166656707676, + "loss": 3.315, + "step": 9036 + }, + { + "epoch": 0.44, + "grad_norm": 0.5125157833099365, + "learning_rate": 0.0005683097787523631, + "loss": 3.3297, + "step": 9037 + }, + { + "epoch": 0.44, + "grad_norm": 0.5199974179267883, + "learning_rate": 0.0005683028911272822, + "loss": 3.23, + "step": 9038 + }, + { + "epoch": 0.44, + "grad_norm": 0.5390259027481079, + "learning_rate": 0.0005682960027955431, + "loss": 3.2136, + "step": 9039 + }, + { + "epoch": 0.44, + "grad_norm": 0.500478982925415, + "learning_rate": 0.0005682891137571641, + "loss": 3.3919, + "step": 9040 + }, + { + "epoch": 0.44, + "grad_norm": 0.5096060037612915, + "learning_rate": 0.0005682822240121631, + "loss": 3.5207, + "step": 9041 + }, + { + "epoch": 0.44, + "grad_norm": 0.4984170198440552, + "learning_rate": 0.0005682753335605583, + "loss": 3.3042, + "step": 9042 + }, + { + "epoch": 0.44, + "grad_norm": 0.5067201852798462, + "learning_rate": 0.000568268442402368, + "loss": 3.231, + "step": 9043 + }, + { + "epoch": 0.44, + "grad_norm": 0.5161800384521484, + "learning_rate": 0.0005682615505376103, + "loss": 3.296, + "step": 9044 + }, + { + "epoch": 0.44, + "grad_norm": 0.5189632773399353, + "learning_rate": 0.0005682546579663032, + "loss": 3.2647, + "step": 9045 + }, + { + "epoch": 0.44, + "grad_norm": 0.5524479150772095, + "learning_rate": 0.000568247764688465, + "loss": 3.1366, + "step": 9046 + }, + { + "epoch": 0.44, + "grad_norm": 0.5149971842765808, + "learning_rate": 0.0005682408707041138, + "loss": 3.3692, + "step": 9047 + }, + { + "epoch": 0.44, + "grad_norm": 0.5103289484977722, + "learning_rate": 0.0005682339760132677, + "loss": 3.6341, + "step": 9048 + }, + { + "epoch": 0.44, + "grad_norm": 0.5169664621353149, + "learning_rate": 0.000568227080615945, + "loss": 3.3884, + "step": 9049 + }, + { + "epoch": 0.44, + "grad_norm": 0.5336402058601379, + "learning_rate": 0.0005682201845121638, + "loss": 3.3195, + "step": 9050 + }, + { + "epoch": 0.44, + "grad_norm": 0.4936128556728363, + "learning_rate": 0.0005682132877019424, + "loss": 3.2705, + "step": 9051 + }, + { + "epoch": 0.44, + "grad_norm": 0.49011480808258057, + "learning_rate": 0.0005682063901852986, + "loss": 3.3899, + "step": 9052 + }, + { + "epoch": 0.44, + "grad_norm": 0.5118272304534912, + "learning_rate": 0.0005681994919622508, + "loss": 3.3134, + "step": 9053 + }, + { + "epoch": 0.44, + "grad_norm": 0.5226544737815857, + "learning_rate": 0.0005681925930328172, + "loss": 3.2044, + "step": 9054 + }, + { + "epoch": 0.44, + "grad_norm": 0.5248229503631592, + "learning_rate": 0.0005681856933970159, + "loss": 3.0951, + "step": 9055 + }, + { + "epoch": 0.44, + "grad_norm": 0.533872127532959, + "learning_rate": 0.0005681787930548652, + "loss": 3.3359, + "step": 9056 + }, + { + "epoch": 0.44, + "grad_norm": 0.5588023066520691, + "learning_rate": 0.0005681718920063831, + "loss": 3.3904, + "step": 9057 + }, + { + "epoch": 0.44, + "grad_norm": 0.5113193988800049, + "learning_rate": 0.0005681649902515878, + "loss": 3.3799, + "step": 9058 + }, + { + "epoch": 0.44, + "grad_norm": 0.5519219040870667, + "learning_rate": 0.0005681580877904977, + "loss": 3.2737, + "step": 9059 + }, + { + "epoch": 0.44, + "grad_norm": 0.5289116501808167, + "learning_rate": 0.0005681511846231306, + "loss": 3.4338, + "step": 9060 + }, + { + "epoch": 0.44, + "grad_norm": 0.6075866222381592, + "learning_rate": 0.000568144280749505, + "loss": 3.1228, + "step": 9061 + }, + { + "epoch": 0.44, + "grad_norm": 0.5013885498046875, + "learning_rate": 0.0005681373761696389, + "loss": 3.1611, + "step": 9062 + }, + { + "epoch": 0.44, + "grad_norm": 0.5552061200141907, + "learning_rate": 0.0005681304708835506, + "loss": 3.2345, + "step": 9063 + }, + { + "epoch": 0.44, + "grad_norm": 0.5864258408546448, + "learning_rate": 0.0005681235648912581, + "loss": 3.2384, + "step": 9064 + }, + { + "epoch": 0.44, + "grad_norm": 0.5120783448219299, + "learning_rate": 0.00056811665819278, + "loss": 3.3637, + "step": 9065 + }, + { + "epoch": 0.44, + "grad_norm": 0.49182936549186707, + "learning_rate": 0.0005681097507881342, + "loss": 3.3427, + "step": 9066 + }, + { + "epoch": 0.44, + "grad_norm": 0.49345386028289795, + "learning_rate": 0.0005681028426773387, + "loss": 3.2404, + "step": 9067 + }, + { + "epoch": 0.44, + "grad_norm": 0.5188148617744446, + "learning_rate": 0.0005680959338604121, + "loss": 3.4539, + "step": 9068 + }, + { + "epoch": 0.44, + "grad_norm": 0.5250046849250793, + "learning_rate": 0.0005680890243373722, + "loss": 3.4335, + "step": 9069 + }, + { + "epoch": 0.44, + "grad_norm": 0.5325458645820618, + "learning_rate": 0.0005680821141082375, + "loss": 3.18, + "step": 9070 + }, + { + "epoch": 0.44, + "grad_norm": 0.5744656324386597, + "learning_rate": 0.0005680752031730262, + "loss": 3.3223, + "step": 9071 + }, + { + "epoch": 0.44, + "grad_norm": 0.5349627733230591, + "learning_rate": 0.0005680682915317562, + "loss": 3.1908, + "step": 9072 + }, + { + "epoch": 0.44, + "grad_norm": 0.5127280354499817, + "learning_rate": 0.0005680613791844461, + "loss": 3.29, + "step": 9073 + }, + { + "epoch": 0.44, + "grad_norm": 0.5042176246643066, + "learning_rate": 0.000568054466131114, + "loss": 3.3658, + "step": 9074 + }, + { + "epoch": 0.44, + "grad_norm": 0.5064307451248169, + "learning_rate": 0.0005680475523717778, + "loss": 3.3205, + "step": 9075 + }, + { + "epoch": 0.44, + "grad_norm": 0.48436474800109863, + "learning_rate": 0.0005680406379064561, + "loss": 3.2207, + "step": 9076 + }, + { + "epoch": 0.44, + "grad_norm": 0.5284563302993774, + "learning_rate": 0.0005680337227351668, + "loss": 3.3994, + "step": 9077 + }, + { + "epoch": 0.44, + "grad_norm": 0.5303294658660889, + "learning_rate": 0.0005680268068579282, + "loss": 3.4003, + "step": 9078 + }, + { + "epoch": 0.44, + "grad_norm": 0.5509895086288452, + "learning_rate": 0.0005680198902747589, + "loss": 3.2329, + "step": 9079 + }, + { + "epoch": 0.44, + "grad_norm": 0.5539824366569519, + "learning_rate": 0.0005680129729856765, + "loss": 3.1104, + "step": 9080 + }, + { + "epoch": 0.45, + "grad_norm": 0.6219602823257446, + "learning_rate": 0.0005680060549906996, + "loss": 3.4123, + "step": 9081 + }, + { + "epoch": 0.45, + "grad_norm": 0.5088189244270325, + "learning_rate": 0.0005679991362898463, + "loss": 3.2834, + "step": 9082 + }, + { + "epoch": 0.45, + "grad_norm": 0.557954728603363, + "learning_rate": 0.0005679922168831348, + "loss": 3.2455, + "step": 9083 + }, + { + "epoch": 0.45, + "grad_norm": 0.6321285367012024, + "learning_rate": 0.0005679852967705836, + "loss": 3.1699, + "step": 9084 + }, + { + "epoch": 0.45, + "grad_norm": 0.585474967956543, + "learning_rate": 0.0005679783759522105, + "loss": 3.1827, + "step": 9085 + }, + { + "epoch": 0.45, + "grad_norm": 0.5444757342338562, + "learning_rate": 0.000567971454428034, + "loss": 3.3533, + "step": 9086 + }, + { + "epoch": 0.45, + "grad_norm": 0.4752892553806305, + "learning_rate": 0.0005679645321980723, + "loss": 3.256, + "step": 9087 + }, + { + "epoch": 0.45, + "grad_norm": 0.5205745100975037, + "learning_rate": 0.0005679576092623435, + "loss": 3.4033, + "step": 9088 + }, + { + "epoch": 0.45, + "grad_norm": 0.5516600608825684, + "learning_rate": 0.000567950685620866, + "loss": 3.3387, + "step": 9089 + }, + { + "epoch": 0.45, + "grad_norm": 0.5286070704460144, + "learning_rate": 0.0005679437612736578, + "loss": 3.2433, + "step": 9090 + }, + { + "epoch": 0.45, + "grad_norm": 0.5709437131881714, + "learning_rate": 0.0005679368362207375, + "loss": 3.2315, + "step": 9091 + }, + { + "epoch": 0.45, + "grad_norm": 0.5004037022590637, + "learning_rate": 0.0005679299104621231, + "loss": 3.3357, + "step": 9092 + }, + { + "epoch": 0.45, + "grad_norm": 0.5113794207572937, + "learning_rate": 0.0005679229839978329, + "loss": 3.2806, + "step": 9093 + }, + { + "epoch": 0.45, + "grad_norm": 0.5075574517250061, + "learning_rate": 0.0005679160568278851, + "loss": 3.6071, + "step": 9094 + }, + { + "epoch": 0.45, + "grad_norm": 0.5023056268692017, + "learning_rate": 0.0005679091289522979, + "loss": 3.3308, + "step": 9095 + }, + { + "epoch": 0.45, + "grad_norm": 0.5051378607749939, + "learning_rate": 0.0005679022003710896, + "loss": 3.4196, + "step": 9096 + }, + { + "epoch": 0.45, + "grad_norm": 0.5265752077102661, + "learning_rate": 0.0005678952710842785, + "loss": 3.5432, + "step": 9097 + }, + { + "epoch": 0.45, + "grad_norm": 0.49984413385391235, + "learning_rate": 0.0005678883410918828, + "loss": 3.4198, + "step": 9098 + }, + { + "epoch": 0.45, + "grad_norm": 0.5776906609535217, + "learning_rate": 0.0005678814103939209, + "loss": 3.4116, + "step": 9099 + }, + { + "epoch": 0.45, + "grad_norm": 0.5122084617614746, + "learning_rate": 0.0005678744789904108, + "loss": 3.3746, + "step": 9100 + }, + { + "epoch": 0.45, + "grad_norm": 0.5504657626152039, + "learning_rate": 0.0005678675468813709, + "loss": 3.4364, + "step": 9101 + }, + { + "epoch": 0.45, + "grad_norm": 0.5166839361190796, + "learning_rate": 0.0005678606140668194, + "loss": 3.4628, + "step": 9102 + }, + { + "epoch": 0.45, + "grad_norm": 0.4547307789325714, + "learning_rate": 0.0005678536805467746, + "loss": 3.5267, + "step": 9103 + }, + { + "epoch": 0.45, + "grad_norm": 0.5224431157112122, + "learning_rate": 0.0005678467463212549, + "loss": 3.2876, + "step": 9104 + }, + { + "epoch": 0.45, + "grad_norm": 0.501089334487915, + "learning_rate": 0.0005678398113902782, + "loss": 3.4004, + "step": 9105 + }, + { + "epoch": 0.45, + "grad_norm": 0.5165853500366211, + "learning_rate": 0.0005678328757538631, + "loss": 3.3895, + "step": 9106 + }, + { + "epoch": 0.45, + "grad_norm": 0.5630010366439819, + "learning_rate": 0.0005678259394120277, + "loss": 3.4256, + "step": 9107 + }, + { + "epoch": 0.45, + "grad_norm": 0.5082736015319824, + "learning_rate": 0.0005678190023647903, + "loss": 3.3324, + "step": 9108 + }, + { + "epoch": 0.45, + "grad_norm": 0.5279152989387512, + "learning_rate": 0.0005678120646121694, + "loss": 3.4226, + "step": 9109 + }, + { + "epoch": 0.45, + "grad_norm": 0.5158643126487732, + "learning_rate": 0.0005678051261541828, + "loss": 3.1352, + "step": 9110 + }, + { + "epoch": 0.45, + "grad_norm": 0.5294057130813599, + "learning_rate": 0.0005677981869908491, + "loss": 3.0135, + "step": 9111 + }, + { + "epoch": 0.45, + "grad_norm": 0.5090487599372864, + "learning_rate": 0.0005677912471221866, + "loss": 3.396, + "step": 9112 + }, + { + "epoch": 0.45, + "grad_norm": 0.5615133047103882, + "learning_rate": 0.0005677843065482135, + "loss": 3.1692, + "step": 9113 + }, + { + "epoch": 0.45, + "grad_norm": 0.5106341242790222, + "learning_rate": 0.000567777365268948, + "loss": 3.3025, + "step": 9114 + }, + { + "epoch": 0.45, + "grad_norm": 0.5407706499099731, + "learning_rate": 0.0005677704232844084, + "loss": 3.2903, + "step": 9115 + }, + { + "epoch": 0.45, + "grad_norm": 0.5446071028709412, + "learning_rate": 0.0005677634805946131, + "loss": 3.311, + "step": 9116 + }, + { + "epoch": 0.45, + "grad_norm": 0.5780032277107239, + "learning_rate": 0.0005677565371995804, + "loss": 3.4569, + "step": 9117 + }, + { + "epoch": 0.45, + "grad_norm": 0.49284398555755615, + "learning_rate": 0.0005677495930993284, + "loss": 3.1733, + "step": 9118 + }, + { + "epoch": 0.45, + "grad_norm": 0.48607781529426575, + "learning_rate": 0.0005677426482938756, + "loss": 3.3819, + "step": 9119 + }, + { + "epoch": 0.45, + "grad_norm": 0.520286500453949, + "learning_rate": 0.0005677357027832401, + "loss": 3.0716, + "step": 9120 + }, + { + "epoch": 0.45, + "grad_norm": 0.5000066757202148, + "learning_rate": 0.0005677287565674404, + "loss": 3.2683, + "step": 9121 + }, + { + "epoch": 0.45, + "grad_norm": 0.5120892524719238, + "learning_rate": 0.0005677218096464946, + "loss": 3.349, + "step": 9122 + }, + { + "epoch": 0.45, + "grad_norm": 0.5167591571807861, + "learning_rate": 0.0005677148620204211, + "loss": 3.4684, + "step": 9123 + }, + { + "epoch": 0.45, + "grad_norm": 0.50194251537323, + "learning_rate": 0.0005677079136892381, + "loss": 3.2571, + "step": 9124 + }, + { + "epoch": 0.45, + "grad_norm": 0.5270057320594788, + "learning_rate": 0.000567700964652964, + "loss": 3.3654, + "step": 9125 + }, + { + "epoch": 0.45, + "grad_norm": 0.5042098760604858, + "learning_rate": 0.0005676940149116172, + "loss": 3.3709, + "step": 9126 + }, + { + "epoch": 0.45, + "grad_norm": 0.4862194061279297, + "learning_rate": 0.0005676870644652158, + "loss": 3.3156, + "step": 9127 + }, + { + "epoch": 0.45, + "grad_norm": 0.5414255261421204, + "learning_rate": 0.0005676801133137782, + "loss": 3.191, + "step": 9128 + }, + { + "epoch": 0.45, + "grad_norm": 0.5115101933479309, + "learning_rate": 0.0005676731614573228, + "loss": 3.276, + "step": 9129 + }, + { + "epoch": 0.45, + "grad_norm": 0.4912344515323639, + "learning_rate": 0.0005676662088958676, + "loss": 3.393, + "step": 9130 + }, + { + "epoch": 0.45, + "grad_norm": 0.5179233551025391, + "learning_rate": 0.0005676592556294312, + "loss": 3.2392, + "step": 9131 + }, + { + "epoch": 0.45, + "grad_norm": 0.5112375617027283, + "learning_rate": 0.0005676523016580319, + "loss": 3.3681, + "step": 9132 + }, + { + "epoch": 0.45, + "grad_norm": 0.5025057792663574, + "learning_rate": 0.0005676453469816878, + "loss": 3.3225, + "step": 9133 + }, + { + "epoch": 0.45, + "grad_norm": 0.4907090961933136, + "learning_rate": 0.0005676383916004174, + "loss": 3.3381, + "step": 9134 + }, + { + "epoch": 0.45, + "grad_norm": 0.5072884559631348, + "learning_rate": 0.0005676314355142392, + "loss": 3.2997, + "step": 9135 + }, + { + "epoch": 0.45, + "grad_norm": 0.5344531536102295, + "learning_rate": 0.0005676244787231711, + "loss": 3.1497, + "step": 9136 + }, + { + "epoch": 0.45, + "grad_norm": 0.5543556809425354, + "learning_rate": 0.0005676175212272317, + "loss": 3.5104, + "step": 9137 + }, + { + "epoch": 0.45, + "grad_norm": 0.5010618567466736, + "learning_rate": 0.0005676105630264392, + "loss": 3.6381, + "step": 9138 + }, + { + "epoch": 0.45, + "grad_norm": 0.5100370645523071, + "learning_rate": 0.000567603604120812, + "loss": 3.1735, + "step": 9139 + }, + { + "epoch": 0.45, + "grad_norm": 0.5232434868812561, + "learning_rate": 0.0005675966445103684, + "loss": 3.5975, + "step": 9140 + }, + { + "epoch": 0.45, + "grad_norm": 0.5133650898933411, + "learning_rate": 0.0005675896841951268, + "loss": 3.4042, + "step": 9141 + }, + { + "epoch": 0.45, + "grad_norm": 0.526075541973114, + "learning_rate": 0.0005675827231751055, + "loss": 3.4472, + "step": 9142 + }, + { + "epoch": 0.45, + "grad_norm": 0.5056864619255066, + "learning_rate": 0.0005675757614503227, + "loss": 3.1239, + "step": 9143 + }, + { + "epoch": 0.45, + "grad_norm": 0.5270462036132812, + "learning_rate": 0.0005675687990207969, + "loss": 3.4077, + "step": 9144 + }, + { + "epoch": 0.45, + "grad_norm": 0.5691565275192261, + "learning_rate": 0.0005675618358865463, + "loss": 3.3887, + "step": 9145 + }, + { + "epoch": 0.45, + "grad_norm": 0.47389543056488037, + "learning_rate": 0.0005675548720475894, + "loss": 3.3965, + "step": 9146 + }, + { + "epoch": 0.45, + "grad_norm": 0.5528371930122375, + "learning_rate": 0.0005675479075039444, + "loss": 3.1438, + "step": 9147 + }, + { + "epoch": 0.45, + "grad_norm": 0.48816508054733276, + "learning_rate": 0.0005675409422556297, + "loss": 3.2555, + "step": 9148 + }, + { + "epoch": 0.45, + "grad_norm": 0.5429815649986267, + "learning_rate": 0.0005675339763026638, + "loss": 3.1632, + "step": 9149 + }, + { + "epoch": 0.45, + "grad_norm": 0.5055248141288757, + "learning_rate": 0.0005675270096450648, + "loss": 3.3858, + "step": 9150 + }, + { + "epoch": 0.45, + "grad_norm": 0.5444815158843994, + "learning_rate": 0.000567520042282851, + "loss": 3.1408, + "step": 9151 + }, + { + "epoch": 0.45, + "grad_norm": 0.5034661889076233, + "learning_rate": 0.000567513074216041, + "loss": 3.4106, + "step": 9152 + }, + { + "epoch": 0.45, + "grad_norm": 0.5205017924308777, + "learning_rate": 0.0005675061054446531, + "loss": 3.3771, + "step": 9153 + }, + { + "epoch": 0.45, + "grad_norm": 0.5044941306114197, + "learning_rate": 0.0005674991359687055, + "loss": 3.1945, + "step": 9154 + }, + { + "epoch": 0.45, + "grad_norm": 0.5016582608222961, + "learning_rate": 0.0005674921657882168, + "loss": 3.1752, + "step": 9155 + }, + { + "epoch": 0.45, + "grad_norm": 0.524653434753418, + "learning_rate": 0.0005674851949032052, + "loss": 3.1112, + "step": 9156 + }, + { + "epoch": 0.45, + "grad_norm": 0.551464319229126, + "learning_rate": 0.0005674782233136889, + "loss": 3.4564, + "step": 9157 + }, + { + "epoch": 0.45, + "grad_norm": 0.6055057644844055, + "learning_rate": 0.0005674712510196865, + "loss": 3.235, + "step": 9158 + }, + { + "epoch": 0.45, + "grad_norm": 0.5390385389328003, + "learning_rate": 0.0005674642780212165, + "loss": 3.2609, + "step": 9159 + }, + { + "epoch": 0.45, + "grad_norm": 0.5527321100234985, + "learning_rate": 0.0005674573043182969, + "loss": 3.1562, + "step": 9160 + }, + { + "epoch": 0.45, + "grad_norm": 0.5559296607971191, + "learning_rate": 0.0005674503299109462, + "loss": 3.1802, + "step": 9161 + }, + { + "epoch": 0.45, + "grad_norm": 0.5836023092269897, + "learning_rate": 0.0005674433547991828, + "loss": 3.2304, + "step": 9162 + }, + { + "epoch": 0.45, + "grad_norm": 0.5389158725738525, + "learning_rate": 0.0005674363789830251, + "loss": 3.1442, + "step": 9163 + }, + { + "epoch": 0.45, + "grad_norm": 0.48464417457580566, + "learning_rate": 0.0005674294024624914, + "loss": 3.4488, + "step": 9164 + }, + { + "epoch": 0.45, + "grad_norm": 0.5225753784179688, + "learning_rate": 0.0005674224252376003, + "loss": 3.2688, + "step": 9165 + }, + { + "epoch": 0.45, + "grad_norm": 0.516726553440094, + "learning_rate": 0.0005674154473083699, + "loss": 3.3803, + "step": 9166 + }, + { + "epoch": 0.45, + "grad_norm": 0.5179518461227417, + "learning_rate": 0.0005674084686748186, + "loss": 3.438, + "step": 9167 + }, + { + "epoch": 0.45, + "grad_norm": 0.5502138733863831, + "learning_rate": 0.000567401489336965, + "loss": 3.1833, + "step": 9168 + }, + { + "epoch": 0.45, + "grad_norm": 0.523209273815155, + "learning_rate": 0.0005673945092948273, + "loss": 3.2305, + "step": 9169 + }, + { + "epoch": 0.45, + "grad_norm": 0.48779064416885376, + "learning_rate": 0.0005673875285484238, + "loss": 3.3943, + "step": 9170 + }, + { + "epoch": 0.45, + "grad_norm": 0.5196689963340759, + "learning_rate": 0.0005673805470977732, + "loss": 3.2378, + "step": 9171 + }, + { + "epoch": 0.45, + "grad_norm": 0.5514273643493652, + "learning_rate": 0.0005673735649428934, + "loss": 3.2909, + "step": 9172 + }, + { + "epoch": 0.45, + "grad_norm": 0.5025139451026917, + "learning_rate": 0.0005673665820838034, + "loss": 3.3022, + "step": 9173 + }, + { + "epoch": 0.45, + "grad_norm": 0.5027960538864136, + "learning_rate": 0.000567359598520521, + "loss": 3.2242, + "step": 9174 + }, + { + "epoch": 0.45, + "grad_norm": 0.5085102319717407, + "learning_rate": 0.0005673526142530651, + "loss": 3.2399, + "step": 9175 + }, + { + "epoch": 0.45, + "grad_norm": 0.5565027594566345, + "learning_rate": 0.0005673456292814539, + "loss": 3.2157, + "step": 9176 + }, + { + "epoch": 0.45, + "grad_norm": 0.5032663941383362, + "learning_rate": 0.0005673386436057056, + "loss": 3.2285, + "step": 9177 + }, + { + "epoch": 0.45, + "grad_norm": 0.5043344497680664, + "learning_rate": 0.0005673316572258389, + "loss": 3.3707, + "step": 9178 + }, + { + "epoch": 0.45, + "grad_norm": 0.5071136951446533, + "learning_rate": 0.000567324670141872, + "loss": 3.1895, + "step": 9179 + }, + { + "epoch": 0.45, + "grad_norm": 0.4878983795642853, + "learning_rate": 0.0005673176823538232, + "loss": 3.1579, + "step": 9180 + }, + { + "epoch": 0.45, + "grad_norm": 0.5063034892082214, + "learning_rate": 0.0005673106938617113, + "loss": 3.3481, + "step": 9181 + }, + { + "epoch": 0.45, + "grad_norm": 0.5148180723190308, + "learning_rate": 0.0005673037046655544, + "loss": 3.4275, + "step": 9182 + }, + { + "epoch": 0.45, + "grad_norm": 0.529994547367096, + "learning_rate": 0.0005672967147653709, + "loss": 3.321, + "step": 9183 + }, + { + "epoch": 0.45, + "grad_norm": 0.5083233118057251, + "learning_rate": 0.0005672897241611795, + "loss": 3.2562, + "step": 9184 + }, + { + "epoch": 0.45, + "grad_norm": 0.5259013772010803, + "learning_rate": 0.0005672827328529982, + "loss": 3.4605, + "step": 9185 + }, + { + "epoch": 0.45, + "grad_norm": 0.5973625779151917, + "learning_rate": 0.0005672757408408458, + "loss": 3.2049, + "step": 9186 + }, + { + "epoch": 0.45, + "grad_norm": 0.5016379952430725, + "learning_rate": 0.0005672687481247404, + "loss": 2.9897, + "step": 9187 + }, + { + "epoch": 0.45, + "grad_norm": 0.4864937961101532, + "learning_rate": 0.0005672617547047006, + "loss": 3.4482, + "step": 9188 + }, + { + "epoch": 0.45, + "grad_norm": 0.5325981974601746, + "learning_rate": 0.0005672547605807449, + "loss": 3.1332, + "step": 9189 + }, + { + "epoch": 0.45, + "grad_norm": 0.5618669390678406, + "learning_rate": 0.0005672477657528913, + "loss": 3.6335, + "step": 9190 + }, + { + "epoch": 0.45, + "grad_norm": 0.5770689249038696, + "learning_rate": 0.0005672407702211588, + "loss": 3.4217, + "step": 9191 + }, + { + "epoch": 0.45, + "grad_norm": 0.5329024791717529, + "learning_rate": 0.0005672337739855654, + "loss": 3.2829, + "step": 9192 + }, + { + "epoch": 0.45, + "grad_norm": 0.5350424647331238, + "learning_rate": 0.0005672267770461297, + "loss": 3.2882, + "step": 9193 + }, + { + "epoch": 0.45, + "grad_norm": 0.5432996153831482, + "learning_rate": 0.00056721977940287, + "loss": 3.2945, + "step": 9194 + }, + { + "epoch": 0.45, + "grad_norm": 0.5327097773551941, + "learning_rate": 0.000567212781055805, + "loss": 3.4755, + "step": 9195 + }, + { + "epoch": 0.45, + "grad_norm": 0.5117045044898987, + "learning_rate": 0.000567205782004953, + "loss": 3.1168, + "step": 9196 + }, + { + "epoch": 0.45, + "grad_norm": 0.5237148404121399, + "learning_rate": 0.0005671987822503322, + "loss": 3.2491, + "step": 9197 + }, + { + "epoch": 0.45, + "grad_norm": 0.5158869624137878, + "learning_rate": 0.0005671917817919613, + "loss": 3.2337, + "step": 9198 + }, + { + "epoch": 0.45, + "grad_norm": 0.5397765040397644, + "learning_rate": 0.0005671847806298587, + "loss": 3.3685, + "step": 9199 + }, + { + "epoch": 0.45, + "grad_norm": 0.5293693542480469, + "learning_rate": 0.0005671777787640428, + "loss": 3.0961, + "step": 9200 + }, + { + "epoch": 0.45, + "grad_norm": 0.522372305393219, + "learning_rate": 0.000567170776194532, + "loss": 3.3425, + "step": 9201 + }, + { + "epoch": 0.45, + "grad_norm": 0.5064888596534729, + "learning_rate": 0.000567163772921345, + "loss": 3.3638, + "step": 9202 + }, + { + "epoch": 0.45, + "grad_norm": 0.5614068508148193, + "learning_rate": 0.0005671567689444997, + "loss": 3.305, + "step": 9203 + }, + { + "epoch": 0.45, + "grad_norm": 0.5083727240562439, + "learning_rate": 0.0005671497642640151, + "loss": 3.1742, + "step": 9204 + }, + { + "epoch": 0.45, + "grad_norm": 0.5845643281936646, + "learning_rate": 0.0005671427588799094, + "loss": 3.2797, + "step": 9205 + }, + { + "epoch": 0.45, + "grad_norm": 0.5405354499816895, + "learning_rate": 0.000567135752792201, + "loss": 3.4243, + "step": 9206 + }, + { + "epoch": 0.45, + "grad_norm": 0.5075390934944153, + "learning_rate": 0.0005671287460009086, + "loss": 3.3293, + "step": 9207 + }, + { + "epoch": 0.45, + "grad_norm": 0.5443362593650818, + "learning_rate": 0.0005671217385060504, + "loss": 3.3097, + "step": 9208 + }, + { + "epoch": 0.45, + "grad_norm": 0.5043942332267761, + "learning_rate": 0.0005671147303076449, + "loss": 3.2578, + "step": 9209 + }, + { + "epoch": 0.45, + "grad_norm": 0.5327485799789429, + "learning_rate": 0.0005671077214057107, + "loss": 3.3638, + "step": 9210 + }, + { + "epoch": 0.45, + "grad_norm": 0.5666617751121521, + "learning_rate": 0.0005671007118002662, + "loss": 3.3362, + "step": 9211 + }, + { + "epoch": 0.45, + "grad_norm": 0.5105660557746887, + "learning_rate": 0.0005670937014913297, + "loss": 3.3562, + "step": 9212 + }, + { + "epoch": 0.45, + "grad_norm": 0.5232953429222107, + "learning_rate": 0.0005670866904789199, + "loss": 3.3715, + "step": 9213 + }, + { + "epoch": 0.45, + "grad_norm": 0.5395106077194214, + "learning_rate": 0.0005670796787630552, + "loss": 3.213, + "step": 9214 + }, + { + "epoch": 0.45, + "grad_norm": 0.48984065651893616, + "learning_rate": 0.0005670726663437539, + "loss": 3.3097, + "step": 9215 + }, + { + "epoch": 0.45, + "grad_norm": 0.5296399593353271, + "learning_rate": 0.0005670656532210346, + "loss": 3.3367, + "step": 9216 + }, + { + "epoch": 0.45, + "grad_norm": 0.4997063875198364, + "learning_rate": 0.0005670586393949159, + "loss": 3.4825, + "step": 9217 + }, + { + "epoch": 0.45, + "grad_norm": 0.5573795437812805, + "learning_rate": 0.000567051624865416, + "loss": 3.2277, + "step": 9218 + }, + { + "epoch": 0.45, + "grad_norm": 0.5867475271224976, + "learning_rate": 0.0005670446096325536, + "loss": 3.4567, + "step": 9219 + }, + { + "epoch": 0.45, + "grad_norm": 0.5648623704910278, + "learning_rate": 0.000567037593696347, + "loss": 3.3481, + "step": 9220 + }, + { + "epoch": 0.45, + "grad_norm": 0.5602989792823792, + "learning_rate": 0.0005670305770568148, + "loss": 3.4067, + "step": 9221 + }, + { + "epoch": 0.45, + "grad_norm": 0.5342902541160583, + "learning_rate": 0.0005670235597139755, + "loss": 3.1211, + "step": 9222 + }, + { + "epoch": 0.45, + "grad_norm": 0.5073335766792297, + "learning_rate": 0.0005670165416678476, + "loss": 3.3586, + "step": 9223 + }, + { + "epoch": 0.45, + "grad_norm": 0.5396925806999207, + "learning_rate": 0.0005670095229184494, + "loss": 3.4748, + "step": 9224 + }, + { + "epoch": 0.45, + "grad_norm": 0.5194887518882751, + "learning_rate": 0.0005670025034657995, + "loss": 3.4222, + "step": 9225 + }, + { + "epoch": 0.45, + "grad_norm": 0.56588214635849, + "learning_rate": 0.0005669954833099164, + "loss": 3.3498, + "step": 9226 + }, + { + "epoch": 0.45, + "grad_norm": 0.5211643576622009, + "learning_rate": 0.0005669884624508186, + "loss": 3.431, + "step": 9227 + }, + { + "epoch": 0.45, + "grad_norm": 0.5741928219795227, + "learning_rate": 0.0005669814408885246, + "loss": 3.4504, + "step": 9228 + }, + { + "epoch": 0.45, + "grad_norm": 0.5376482009887695, + "learning_rate": 0.0005669744186230528, + "loss": 3.3266, + "step": 9229 + }, + { + "epoch": 0.45, + "grad_norm": 0.5111083388328552, + "learning_rate": 0.0005669673956544219, + "loss": 3.049, + "step": 9230 + }, + { + "epoch": 0.45, + "grad_norm": 0.5390723943710327, + "learning_rate": 0.0005669603719826501, + "loss": 3.1554, + "step": 9231 + }, + { + "epoch": 0.45, + "grad_norm": 0.5268939733505249, + "learning_rate": 0.0005669533476077561, + "loss": 3.4162, + "step": 9232 + }, + { + "epoch": 0.45, + "grad_norm": 0.528060793876648, + "learning_rate": 0.0005669463225297585, + "loss": 3.2039, + "step": 9233 + }, + { + "epoch": 0.45, + "grad_norm": 0.5250667333602905, + "learning_rate": 0.0005669392967486755, + "loss": 3.2844, + "step": 9234 + }, + { + "epoch": 0.45, + "grad_norm": 0.557022213935852, + "learning_rate": 0.0005669322702645257, + "loss": 3.3297, + "step": 9235 + }, + { + "epoch": 0.45, + "grad_norm": 0.5029047131538391, + "learning_rate": 0.0005669252430773279, + "loss": 3.2711, + "step": 9236 + }, + { + "epoch": 0.45, + "grad_norm": 0.5264648199081421, + "learning_rate": 0.0005669182151871003, + "loss": 3.3115, + "step": 9237 + }, + { + "epoch": 0.45, + "grad_norm": 0.5282554626464844, + "learning_rate": 0.0005669111865938614, + "loss": 3.3636, + "step": 9238 + }, + { + "epoch": 0.45, + "grad_norm": 0.4783882200717926, + "learning_rate": 0.0005669041572976299, + "loss": 3.4196, + "step": 9239 + }, + { + "epoch": 0.45, + "grad_norm": 0.5388532280921936, + "learning_rate": 0.0005668971272984242, + "loss": 3.3577, + "step": 9240 + }, + { + "epoch": 0.45, + "grad_norm": 0.5190770030021667, + "learning_rate": 0.0005668900965962627, + "loss": 3.252, + "step": 9241 + }, + { + "epoch": 0.45, + "grad_norm": 0.4901539087295532, + "learning_rate": 0.0005668830651911642, + "loss": 3.4138, + "step": 9242 + }, + { + "epoch": 0.45, + "grad_norm": 0.5431410074234009, + "learning_rate": 0.000566876033083147, + "loss": 3.2324, + "step": 9243 + }, + { + "epoch": 0.45, + "grad_norm": 0.5126324892044067, + "learning_rate": 0.0005668690002722297, + "loss": 3.2226, + "step": 9244 + }, + { + "epoch": 0.45, + "grad_norm": 0.5156897306442261, + "learning_rate": 0.0005668619667584309, + "loss": 3.2097, + "step": 9245 + }, + { + "epoch": 0.45, + "grad_norm": 0.49389657378196716, + "learning_rate": 0.000566854932541769, + "loss": 3.284, + "step": 9246 + }, + { + "epoch": 0.45, + "grad_norm": 0.5490319728851318, + "learning_rate": 0.0005668478976222624, + "loss": 3.4974, + "step": 9247 + }, + { + "epoch": 0.45, + "grad_norm": 0.5475131273269653, + "learning_rate": 0.00056684086199993, + "loss": 3.4208, + "step": 9248 + }, + { + "epoch": 0.45, + "grad_norm": 0.5155820846557617, + "learning_rate": 0.00056683382567479, + "loss": 3.4763, + "step": 9249 + }, + { + "epoch": 0.45, + "grad_norm": 0.5059611201286316, + "learning_rate": 0.0005668267886468611, + "loss": 3.223, + "step": 9250 + }, + { + "epoch": 0.45, + "grad_norm": 0.5430670380592346, + "learning_rate": 0.0005668197509161618, + "loss": 3.1563, + "step": 9251 + }, + { + "epoch": 0.45, + "grad_norm": 0.5147794485092163, + "learning_rate": 0.0005668127124827106, + "loss": 3.218, + "step": 9252 + }, + { + "epoch": 0.45, + "grad_norm": 0.5046300888061523, + "learning_rate": 0.0005668056733465262, + "loss": 3.118, + "step": 9253 + }, + { + "epoch": 0.45, + "grad_norm": 0.5530743598937988, + "learning_rate": 0.0005667986335076269, + "loss": 3.2957, + "step": 9254 + }, + { + "epoch": 0.45, + "grad_norm": 0.5372881889343262, + "learning_rate": 0.0005667915929660313, + "loss": 3.3998, + "step": 9255 + }, + { + "epoch": 0.45, + "grad_norm": 0.5331935286521912, + "learning_rate": 0.000566784551721758, + "loss": 3.4255, + "step": 9256 + }, + { + "epoch": 0.45, + "grad_norm": 0.5113346576690674, + "learning_rate": 0.0005667775097748255, + "loss": 3.1996, + "step": 9257 + }, + { + "epoch": 0.45, + "grad_norm": 0.4899602234363556, + "learning_rate": 0.0005667704671252525, + "loss": 3.484, + "step": 9258 + }, + { + "epoch": 0.45, + "grad_norm": 0.535860538482666, + "learning_rate": 0.0005667634237730573, + "loss": 3.1899, + "step": 9259 + }, + { + "epoch": 0.45, + "grad_norm": 0.5384031534194946, + "learning_rate": 0.0005667563797182586, + "loss": 3.3316, + "step": 9260 + }, + { + "epoch": 0.45, + "grad_norm": 0.5427521467208862, + "learning_rate": 0.0005667493349608751, + "loss": 3.3506, + "step": 9261 + }, + { + "epoch": 0.45, + "grad_norm": 0.543279230594635, + "learning_rate": 0.000566742289500925, + "loss": 3.2267, + "step": 9262 + }, + { + "epoch": 0.45, + "grad_norm": 0.5359872579574585, + "learning_rate": 0.0005667352433384272, + "loss": 3.2173, + "step": 9263 + }, + { + "epoch": 0.45, + "grad_norm": 0.5161938667297363, + "learning_rate": 0.0005667281964734, + "loss": 3.2177, + "step": 9264 + }, + { + "epoch": 0.45, + "grad_norm": 0.5487397313117981, + "learning_rate": 0.0005667211489058621, + "loss": 3.3548, + "step": 9265 + }, + { + "epoch": 0.45, + "grad_norm": 0.5496209263801575, + "learning_rate": 0.0005667141006358321, + "loss": 3.1292, + "step": 9266 + }, + { + "epoch": 0.45, + "grad_norm": 0.5191566348075867, + "learning_rate": 0.0005667070516633283, + "loss": 3.31, + "step": 9267 + }, + { + "epoch": 0.45, + "grad_norm": 0.5273240804672241, + "learning_rate": 0.0005667000019883696, + "loss": 2.9271, + "step": 9268 + }, + { + "epoch": 0.45, + "grad_norm": 0.5322637557983398, + "learning_rate": 0.0005666929516109744, + "loss": 3.2176, + "step": 9269 + }, + { + "epoch": 0.45, + "grad_norm": 0.5463858246803284, + "learning_rate": 0.0005666859005311612, + "loss": 3.1465, + "step": 9270 + }, + { + "epoch": 0.45, + "grad_norm": 0.5420017838478088, + "learning_rate": 0.0005666788487489488, + "loss": 3.1499, + "step": 9271 + }, + { + "epoch": 0.45, + "grad_norm": 0.5984858274459839, + "learning_rate": 0.0005666717962643555, + "loss": 3.293, + "step": 9272 + }, + { + "epoch": 0.45, + "grad_norm": 0.5361172556877136, + "learning_rate": 0.0005666647430774001, + "loss": 3.3239, + "step": 9273 + }, + { + "epoch": 0.45, + "grad_norm": 0.5419718623161316, + "learning_rate": 0.000566657689188101, + "loss": 3.3363, + "step": 9274 + }, + { + "epoch": 0.45, + "grad_norm": 0.6194941997528076, + "learning_rate": 0.000566650634596477, + "loss": 3.2487, + "step": 9275 + }, + { + "epoch": 0.45, + "grad_norm": 0.5028418302536011, + "learning_rate": 0.0005666435793025465, + "loss": 3.5507, + "step": 9276 + }, + { + "epoch": 0.45, + "grad_norm": 0.5127885937690735, + "learning_rate": 0.0005666365233063281, + "loss": 3.4255, + "step": 9277 + }, + { + "epoch": 0.45, + "grad_norm": 0.5219062566757202, + "learning_rate": 0.0005666294666078404, + "loss": 3.1941, + "step": 9278 + }, + { + "epoch": 0.45, + "grad_norm": 0.515256404876709, + "learning_rate": 0.0005666224092071019, + "loss": 3.3369, + "step": 9279 + }, + { + "epoch": 0.45, + "grad_norm": 0.5556721687316895, + "learning_rate": 0.0005666153511041315, + "loss": 3.3815, + "step": 9280 + }, + { + "epoch": 0.45, + "grad_norm": 0.5763710737228394, + "learning_rate": 0.0005666082922989475, + "loss": 3.3639, + "step": 9281 + }, + { + "epoch": 0.45, + "grad_norm": 0.5201305747032166, + "learning_rate": 0.0005666012327915686, + "loss": 3.2809, + "step": 9282 + }, + { + "epoch": 0.45, + "grad_norm": 0.5183270573616028, + "learning_rate": 0.0005665941725820133, + "loss": 3.1675, + "step": 9283 + }, + { + "epoch": 0.45, + "grad_norm": 0.533555805683136, + "learning_rate": 0.0005665871116703003, + "loss": 3.195, + "step": 9284 + }, + { + "epoch": 0.46, + "grad_norm": 0.5322220325469971, + "learning_rate": 0.000566580050056448, + "loss": 3.3518, + "step": 9285 + }, + { + "epoch": 0.46, + "grad_norm": 0.5173676013946533, + "learning_rate": 0.0005665729877404753, + "loss": 3.3363, + "step": 9286 + }, + { + "epoch": 0.46, + "grad_norm": 0.5588550567626953, + "learning_rate": 0.0005665659247224006, + "loss": 3.3185, + "step": 9287 + }, + { + "epoch": 0.46, + "grad_norm": 0.5288047790527344, + "learning_rate": 0.0005665588610022426, + "loss": 3.2531, + "step": 9288 + }, + { + "epoch": 0.46, + "grad_norm": 0.4969140589237213, + "learning_rate": 0.0005665517965800199, + "loss": 3.3127, + "step": 9289 + }, + { + "epoch": 0.46, + "grad_norm": 0.5432141423225403, + "learning_rate": 0.000566544731455751, + "loss": 3.082, + "step": 9290 + }, + { + "epoch": 0.46, + "grad_norm": 0.5413832664489746, + "learning_rate": 0.0005665376656294545, + "loss": 3.3992, + "step": 9291 + }, + { + "epoch": 0.46, + "grad_norm": 0.5215909481048584, + "learning_rate": 0.0005665305991011492, + "loss": 3.5054, + "step": 9292 + }, + { + "epoch": 0.46, + "grad_norm": 0.5591144561767578, + "learning_rate": 0.0005665235318708537, + "loss": 3.3934, + "step": 9293 + }, + { + "epoch": 0.46, + "grad_norm": 0.6027560234069824, + "learning_rate": 0.0005665164639385863, + "loss": 3.252, + "step": 9294 + }, + { + "epoch": 0.46, + "grad_norm": 0.5260102152824402, + "learning_rate": 0.0005665093953043658, + "loss": 3.2683, + "step": 9295 + }, + { + "epoch": 0.46, + "grad_norm": 0.525118887424469, + "learning_rate": 0.0005665023259682111, + "loss": 3.277, + "step": 9296 + }, + { + "epoch": 0.46, + "grad_norm": 0.4957008957862854, + "learning_rate": 0.0005664952559301403, + "loss": 3.1469, + "step": 9297 + }, + { + "epoch": 0.46, + "grad_norm": 0.5985009074211121, + "learning_rate": 0.0005664881851901725, + "loss": 3.195, + "step": 9298 + }, + { + "epoch": 0.46, + "grad_norm": 0.5201311111450195, + "learning_rate": 0.0005664811137483259, + "loss": 3.4488, + "step": 9299 + }, + { + "epoch": 0.46, + "grad_norm": 0.5100109577178955, + "learning_rate": 0.0005664740416046195, + "loss": 3.1562, + "step": 9300 + }, + { + "epoch": 0.46, + "grad_norm": 0.49282416701316833, + "learning_rate": 0.0005664669687590717, + "loss": 3.3542, + "step": 9301 + }, + { + "epoch": 0.46, + "grad_norm": 0.5214360356330872, + "learning_rate": 0.0005664598952117012, + "loss": 3.2439, + "step": 9302 + }, + { + "epoch": 0.46, + "grad_norm": 0.5333632826805115, + "learning_rate": 0.0005664528209625265, + "loss": 3.2928, + "step": 9303 + }, + { + "epoch": 0.46, + "grad_norm": 0.5121089816093445, + "learning_rate": 0.0005664457460115665, + "loss": 3.3875, + "step": 9304 + }, + { + "epoch": 0.46, + "grad_norm": 0.48593372106552124, + "learning_rate": 0.0005664386703588396, + "loss": 3.3829, + "step": 9305 + }, + { + "epoch": 0.46, + "grad_norm": 0.5539550185203552, + "learning_rate": 0.0005664315940043645, + "loss": 3.427, + "step": 9306 + }, + { + "epoch": 0.46, + "grad_norm": 0.5157627463340759, + "learning_rate": 0.00056642451694816, + "loss": 3.3568, + "step": 9307 + }, + { + "epoch": 0.46, + "grad_norm": 0.5772610902786255, + "learning_rate": 0.0005664174391902444, + "loss": 3.0817, + "step": 9308 + }, + { + "epoch": 0.46, + "grad_norm": 0.5100268125534058, + "learning_rate": 0.0005664103607306367, + "loss": 3.1593, + "step": 9309 + }, + { + "epoch": 0.46, + "grad_norm": 0.5215173363685608, + "learning_rate": 0.0005664032815693553, + "loss": 3.1953, + "step": 9310 + }, + { + "epoch": 0.46, + "grad_norm": 0.5628623962402344, + "learning_rate": 0.000566396201706419, + "loss": 3.1241, + "step": 9311 + }, + { + "epoch": 0.46, + "grad_norm": 0.5145275592803955, + "learning_rate": 0.0005663891211418463, + "loss": 3.2781, + "step": 9312 + }, + { + "epoch": 0.46, + "grad_norm": 0.545166015625, + "learning_rate": 0.0005663820398756559, + "loss": 3.3879, + "step": 9313 + }, + { + "epoch": 0.46, + "grad_norm": 0.5119915008544922, + "learning_rate": 0.0005663749579078665, + "loss": 3.0349, + "step": 9314 + }, + { + "epoch": 0.46, + "grad_norm": 0.5298326015472412, + "learning_rate": 0.0005663678752384968, + "loss": 3.3834, + "step": 9315 + }, + { + "epoch": 0.46, + "grad_norm": 0.5123884081840515, + "learning_rate": 0.0005663607918675654, + "loss": 3.3531, + "step": 9316 + }, + { + "epoch": 0.46, + "grad_norm": 0.5344944000244141, + "learning_rate": 0.0005663537077950908, + "loss": 3.4021, + "step": 9317 + }, + { + "epoch": 0.46, + "grad_norm": 0.5218607187271118, + "learning_rate": 0.0005663466230210919, + "loss": 3.1947, + "step": 9318 + }, + { + "epoch": 0.46, + "grad_norm": 0.5089520215988159, + "learning_rate": 0.0005663395375455872, + "loss": 3.3224, + "step": 9319 + }, + { + "epoch": 0.46, + "grad_norm": 0.5154445171356201, + "learning_rate": 0.0005663324513685954, + "loss": 3.4781, + "step": 9320 + }, + { + "epoch": 0.46, + "grad_norm": 0.4960334002971649, + "learning_rate": 0.0005663253644901351, + "loss": 3.316, + "step": 9321 + }, + { + "epoch": 0.46, + "grad_norm": 0.5327979326248169, + "learning_rate": 0.0005663182769102252, + "loss": 3.3396, + "step": 9322 + }, + { + "epoch": 0.46, + "grad_norm": 0.5505029559135437, + "learning_rate": 0.0005663111886288842, + "loss": 3.2301, + "step": 9323 + }, + { + "epoch": 0.46, + "grad_norm": 0.5501486659049988, + "learning_rate": 0.0005663040996461308, + "loss": 3.4768, + "step": 9324 + }, + { + "epoch": 0.46, + "grad_norm": 0.5302287936210632, + "learning_rate": 0.0005662970099619835, + "loss": 3.1638, + "step": 9325 + }, + { + "epoch": 0.46, + "grad_norm": 0.5448341965675354, + "learning_rate": 0.0005662899195764612, + "loss": 3.17, + "step": 9326 + }, + { + "epoch": 0.46, + "grad_norm": 0.5440930724143982, + "learning_rate": 0.0005662828284895826, + "loss": 3.4745, + "step": 9327 + }, + { + "epoch": 0.46, + "grad_norm": 0.5151364803314209, + "learning_rate": 0.0005662757367013662, + "loss": 3.1387, + "step": 9328 + }, + { + "epoch": 0.46, + "grad_norm": 0.4982331395149231, + "learning_rate": 0.0005662686442118308, + "loss": 3.285, + "step": 9329 + }, + { + "epoch": 0.46, + "grad_norm": 0.5508311986923218, + "learning_rate": 0.0005662615510209949, + "loss": 3.2912, + "step": 9330 + }, + { + "epoch": 0.46, + "grad_norm": 0.4659712016582489, + "learning_rate": 0.0005662544571288775, + "loss": 3.4545, + "step": 9331 + }, + { + "epoch": 0.46, + "grad_norm": 0.5672176480293274, + "learning_rate": 0.0005662473625354969, + "loss": 3.2206, + "step": 9332 + }, + { + "epoch": 0.46, + "grad_norm": 0.5322652459144592, + "learning_rate": 0.0005662402672408722, + "loss": 3.2292, + "step": 9333 + }, + { + "epoch": 0.46, + "grad_norm": 0.5741870403289795, + "learning_rate": 0.0005662331712450216, + "loss": 3.2488, + "step": 9334 + }, + { + "epoch": 0.46, + "grad_norm": 0.5742843151092529, + "learning_rate": 0.0005662260745479643, + "loss": 3.36, + "step": 9335 + }, + { + "epoch": 0.46, + "grad_norm": 0.5701401233673096, + "learning_rate": 0.0005662189771497187, + "loss": 3.166, + "step": 9336 + }, + { + "epoch": 0.46, + "grad_norm": 0.5188326239585876, + "learning_rate": 0.0005662118790503035, + "loss": 3.4782, + "step": 9337 + }, + { + "epoch": 0.46, + "grad_norm": 0.5277048945426941, + "learning_rate": 0.0005662047802497376, + "loss": 3.4702, + "step": 9338 + }, + { + "epoch": 0.46, + "grad_norm": 0.5392853021621704, + "learning_rate": 0.0005661976807480394, + "loss": 3.5907, + "step": 9339 + }, + { + "epoch": 0.46, + "grad_norm": 0.5241400599479675, + "learning_rate": 0.0005661905805452277, + "loss": 3.5178, + "step": 9340 + }, + { + "epoch": 0.46, + "grad_norm": 0.5726556181907654, + "learning_rate": 0.0005661834796413214, + "loss": 3.3445, + "step": 9341 + }, + { + "epoch": 0.46, + "grad_norm": 0.5296681523323059, + "learning_rate": 0.0005661763780363389, + "loss": 3.3437, + "step": 9342 + }, + { + "epoch": 0.46, + "grad_norm": 0.5572839379310608, + "learning_rate": 0.0005661692757302991, + "loss": 3.4079, + "step": 9343 + }, + { + "epoch": 0.46, + "grad_norm": 0.5110607147216797, + "learning_rate": 0.0005661621727232206, + "loss": 3.2675, + "step": 9344 + }, + { + "epoch": 0.46, + "grad_norm": 0.4931510388851166, + "learning_rate": 0.0005661550690151222, + "loss": 3.3693, + "step": 9345 + }, + { + "epoch": 0.46, + "grad_norm": 0.5317081212997437, + "learning_rate": 0.0005661479646060227, + "loss": 3.2439, + "step": 9346 + }, + { + "epoch": 0.46, + "grad_norm": 0.5172111392021179, + "learning_rate": 0.0005661408594959405, + "loss": 3.3955, + "step": 9347 + }, + { + "epoch": 0.46, + "grad_norm": 0.567136824131012, + "learning_rate": 0.0005661337536848946, + "loss": 3.4301, + "step": 9348 + }, + { + "epoch": 0.46, + "grad_norm": 0.5417160987854004, + "learning_rate": 0.0005661266471729035, + "loss": 3.268, + "step": 9349 + }, + { + "epoch": 0.46, + "grad_norm": 0.5237059593200684, + "learning_rate": 0.0005661195399599861, + "loss": 3.0861, + "step": 9350 + }, + { + "epoch": 0.46, + "grad_norm": 0.5109018087387085, + "learning_rate": 0.0005661124320461611, + "loss": 3.3038, + "step": 9351 + }, + { + "epoch": 0.46, + "grad_norm": 0.4800640940666199, + "learning_rate": 0.0005661053234314471, + "loss": 3.2938, + "step": 9352 + }, + { + "epoch": 0.46, + "grad_norm": 0.5349900126457214, + "learning_rate": 0.000566098214115863, + "loss": 3.1207, + "step": 9353 + }, + { + "epoch": 0.46, + "grad_norm": 0.49598759412765503, + "learning_rate": 0.0005660911040994272, + "loss": 3.2266, + "step": 9354 + }, + { + "epoch": 0.46, + "grad_norm": 0.5334964394569397, + "learning_rate": 0.0005660839933821588, + "loss": 3.2708, + "step": 9355 + }, + { + "epoch": 0.46, + "grad_norm": 0.6042339205741882, + "learning_rate": 0.0005660768819640764, + "loss": 3.1651, + "step": 9356 + }, + { + "epoch": 0.46, + "grad_norm": 0.5126491785049438, + "learning_rate": 0.0005660697698451985, + "loss": 3.1133, + "step": 9357 + }, + { + "epoch": 0.46, + "grad_norm": 0.5190067291259766, + "learning_rate": 0.0005660626570255442, + "loss": 3.5437, + "step": 9358 + }, + { + "epoch": 0.46, + "grad_norm": 0.5068770051002502, + "learning_rate": 0.0005660555435051321, + "loss": 3.4367, + "step": 9359 + }, + { + "epoch": 0.46, + "grad_norm": 0.5406444668769836, + "learning_rate": 0.0005660484292839807, + "loss": 3.3291, + "step": 9360 + }, + { + "epoch": 0.46, + "grad_norm": 0.46386462450027466, + "learning_rate": 0.0005660413143621091, + "loss": 3.2917, + "step": 9361 + }, + { + "epoch": 0.46, + "grad_norm": 0.5372816920280457, + "learning_rate": 0.000566034198739536, + "loss": 3.2199, + "step": 9362 + }, + { + "epoch": 0.46, + "grad_norm": 0.5300079584121704, + "learning_rate": 0.0005660270824162798, + "loss": 3.2802, + "step": 9363 + }, + { + "epoch": 0.46, + "grad_norm": 0.5637562274932861, + "learning_rate": 0.0005660199653923594, + "loss": 3.2477, + "step": 9364 + }, + { + "epoch": 0.46, + "grad_norm": 0.5250990390777588, + "learning_rate": 0.0005660128476677939, + "loss": 3.078, + "step": 9365 + }, + { + "epoch": 0.46, + "grad_norm": 0.5225361585617065, + "learning_rate": 0.0005660057292426016, + "loss": 3.4664, + "step": 9366 + }, + { + "epoch": 0.46, + "grad_norm": 0.4957283139228821, + "learning_rate": 0.0005659986101168013, + "loss": 3.3938, + "step": 9367 + }, + { + "epoch": 0.46, + "grad_norm": 0.5098510384559631, + "learning_rate": 0.0005659914902904121, + "loss": 3.2841, + "step": 9368 + }, + { + "epoch": 0.46, + "grad_norm": 0.5317561030387878, + "learning_rate": 0.0005659843697634522, + "loss": 3.2659, + "step": 9369 + }, + { + "epoch": 0.46, + "grad_norm": 0.4984907805919647, + "learning_rate": 0.0005659772485359409, + "loss": 3.1747, + "step": 9370 + }, + { + "epoch": 0.46, + "grad_norm": 0.5244981646537781, + "learning_rate": 0.0005659701266078966, + "loss": 3.2435, + "step": 9371 + }, + { + "epoch": 0.46, + "grad_norm": 0.5228347182273865, + "learning_rate": 0.0005659630039793383, + "loss": 3.2482, + "step": 9372 + }, + { + "epoch": 0.46, + "grad_norm": 0.5450955629348755, + "learning_rate": 0.0005659558806502845, + "loss": 3.3684, + "step": 9373 + }, + { + "epoch": 0.46, + "grad_norm": 0.5306031107902527, + "learning_rate": 0.0005659487566207542, + "loss": 2.9565, + "step": 9374 + }, + { + "epoch": 0.46, + "grad_norm": 0.5437656044960022, + "learning_rate": 0.000565941631890766, + "loss": 3.2759, + "step": 9375 + }, + { + "epoch": 0.46, + "grad_norm": 0.5219473838806152, + "learning_rate": 0.0005659345064603387, + "loss": 3.3767, + "step": 9376 + }, + { + "epoch": 0.46, + "grad_norm": 0.5270979404449463, + "learning_rate": 0.000565927380329491, + "loss": 3.4866, + "step": 9377 + }, + { + "epoch": 0.46, + "grad_norm": 0.5120222568511963, + "learning_rate": 0.0005659202534982419, + "loss": 3.1391, + "step": 9378 + }, + { + "epoch": 0.46, + "grad_norm": 0.520464301109314, + "learning_rate": 0.00056591312596661, + "loss": 3.2149, + "step": 9379 + }, + { + "epoch": 0.46, + "grad_norm": 0.5516645908355713, + "learning_rate": 0.0005659059977346141, + "loss": 3.2735, + "step": 9380 + }, + { + "epoch": 0.46, + "grad_norm": 0.519219696521759, + "learning_rate": 0.000565898868802273, + "loss": 3.242, + "step": 9381 + }, + { + "epoch": 0.46, + "grad_norm": 0.5575689077377319, + "learning_rate": 0.0005658917391696054, + "loss": 3.366, + "step": 9382 + }, + { + "epoch": 0.46, + "grad_norm": 0.5153242349624634, + "learning_rate": 0.0005658846088366302, + "loss": 3.2188, + "step": 9383 + }, + { + "epoch": 0.46, + "grad_norm": 0.5371017456054688, + "learning_rate": 0.000565877477803366, + "loss": 3.3019, + "step": 9384 + }, + { + "epoch": 0.46, + "grad_norm": 0.57710200548172, + "learning_rate": 0.0005658703460698318, + "loss": 3.4646, + "step": 9385 + }, + { + "epoch": 0.46, + "grad_norm": 0.5319957137107849, + "learning_rate": 0.0005658632136360461, + "loss": 3.1126, + "step": 9386 + }, + { + "epoch": 0.46, + "grad_norm": 0.5224001407623291, + "learning_rate": 0.0005658560805020281, + "loss": 3.384, + "step": 9387 + }, + { + "epoch": 0.46, + "grad_norm": 0.5222063660621643, + "learning_rate": 0.0005658489466677963, + "loss": 3.3125, + "step": 9388 + }, + { + "epoch": 0.46, + "grad_norm": 0.5301948189735413, + "learning_rate": 0.0005658418121333694, + "loss": 3.3855, + "step": 9389 + }, + { + "epoch": 0.46, + "grad_norm": 0.506697952747345, + "learning_rate": 0.0005658346768987664, + "loss": 3.4046, + "step": 9390 + }, + { + "epoch": 0.46, + "grad_norm": 0.5180991291999817, + "learning_rate": 0.0005658275409640062, + "loss": 3.2995, + "step": 9391 + }, + { + "epoch": 0.46, + "grad_norm": 0.5402610898017883, + "learning_rate": 0.0005658204043291072, + "loss": 3.412, + "step": 9392 + }, + { + "epoch": 0.46, + "grad_norm": 0.5179954171180725, + "learning_rate": 0.0005658132669940885, + "loss": 3.4606, + "step": 9393 + }, + { + "epoch": 0.46, + "grad_norm": 0.5407754778862, + "learning_rate": 0.0005658061289589687, + "loss": 3.0892, + "step": 9394 + }, + { + "epoch": 0.46, + "grad_norm": 0.5019816756248474, + "learning_rate": 0.0005657989902237669, + "loss": 3.2621, + "step": 9395 + }, + { + "epoch": 0.46, + "grad_norm": 0.5457536578178406, + "learning_rate": 0.0005657918507885016, + "loss": 3.1853, + "step": 9396 + }, + { + "epoch": 0.46, + "grad_norm": 0.5361157059669495, + "learning_rate": 0.0005657847106531916, + "loss": 3.1843, + "step": 9397 + }, + { + "epoch": 0.46, + "grad_norm": 0.5209142565727234, + "learning_rate": 0.000565777569817856, + "loss": 3.4549, + "step": 9398 + }, + { + "epoch": 0.46, + "grad_norm": 0.49910151958465576, + "learning_rate": 0.0005657704282825133, + "loss": 3.4455, + "step": 9399 + }, + { + "epoch": 0.46, + "grad_norm": 0.5092660188674927, + "learning_rate": 0.0005657632860471826, + "loss": 3.1643, + "step": 9400 + }, + { + "epoch": 0.46, + "grad_norm": 0.5319451093673706, + "learning_rate": 0.0005657561431118824, + "loss": 3.1362, + "step": 9401 + }, + { + "epoch": 0.46, + "grad_norm": 0.5203292369842529, + "learning_rate": 0.0005657489994766318, + "loss": 3.3171, + "step": 9402 + }, + { + "epoch": 0.46, + "grad_norm": 0.5382462739944458, + "learning_rate": 0.0005657418551414494, + "loss": 3.3471, + "step": 9403 + }, + { + "epoch": 0.46, + "grad_norm": 0.5401602387428284, + "learning_rate": 0.0005657347101063541, + "loss": 3.415, + "step": 9404 + }, + { + "epoch": 0.46, + "grad_norm": 0.5223503112792969, + "learning_rate": 0.0005657275643713648, + "loss": 3.2212, + "step": 9405 + }, + { + "epoch": 0.46, + "grad_norm": 0.4994029998779297, + "learning_rate": 0.0005657204179365001, + "loss": 3.1509, + "step": 9406 + }, + { + "epoch": 0.46, + "grad_norm": 0.5694056749343872, + "learning_rate": 0.000565713270801779, + "loss": 3.3337, + "step": 9407 + }, + { + "epoch": 0.46, + "grad_norm": 0.5182304978370667, + "learning_rate": 0.0005657061229672203, + "loss": 3.4504, + "step": 9408 + }, + { + "epoch": 0.46, + "grad_norm": 0.5066438317298889, + "learning_rate": 0.0005656989744328428, + "loss": 2.98, + "step": 9409 + }, + { + "epoch": 0.46, + "grad_norm": 0.5442813634872437, + "learning_rate": 0.0005656918251986654, + "loss": 3.2891, + "step": 9410 + }, + { + "epoch": 0.46, + "grad_norm": 0.5112155675888062, + "learning_rate": 0.0005656846752647068, + "loss": 3.4086, + "step": 9411 + }, + { + "epoch": 0.46, + "grad_norm": 0.4860506057739258, + "learning_rate": 0.0005656775246309859, + "loss": 3.1496, + "step": 9412 + }, + { + "epoch": 0.46, + "grad_norm": 0.5639499425888062, + "learning_rate": 0.0005656703732975215, + "loss": 3.4534, + "step": 9413 + }, + { + "epoch": 0.46, + "grad_norm": 0.49762552976608276, + "learning_rate": 0.0005656632212643326, + "loss": 3.0931, + "step": 9414 + }, + { + "epoch": 0.46, + "grad_norm": 0.5720370411872864, + "learning_rate": 0.0005656560685314378, + "loss": 3.3915, + "step": 9415 + }, + { + "epoch": 0.46, + "grad_norm": 0.5175341963768005, + "learning_rate": 0.000565648915098856, + "loss": 3.3709, + "step": 9416 + }, + { + "epoch": 0.46, + "grad_norm": 0.5201534032821655, + "learning_rate": 0.0005656417609666061, + "loss": 3.2131, + "step": 9417 + }, + { + "epoch": 0.46, + "grad_norm": 0.5602389574050903, + "learning_rate": 0.000565634606134707, + "loss": 3.4146, + "step": 9418 + }, + { + "epoch": 0.46, + "grad_norm": 0.5673112869262695, + "learning_rate": 0.0005656274506031775, + "loss": 3.29, + "step": 9419 + }, + { + "epoch": 0.46, + "grad_norm": 0.5292515754699707, + "learning_rate": 0.0005656202943720363, + "loss": 3.2578, + "step": 9420 + }, + { + "epoch": 0.46, + "grad_norm": 0.4988608658313751, + "learning_rate": 0.0005656131374413024, + "loss": 3.1482, + "step": 9421 + }, + { + "epoch": 0.46, + "grad_norm": 0.5025928020477295, + "learning_rate": 0.0005656059798109947, + "loss": 3.3671, + "step": 9422 + }, + { + "epoch": 0.46, + "grad_norm": 0.5336331725120544, + "learning_rate": 0.0005655988214811318, + "loss": 3.1554, + "step": 9423 + }, + { + "epoch": 0.46, + "grad_norm": 0.4792225956916809, + "learning_rate": 0.0005655916624517328, + "loss": 3.3616, + "step": 9424 + }, + { + "epoch": 0.46, + "grad_norm": 0.5044949650764465, + "learning_rate": 0.0005655845027228164, + "loss": 3.2776, + "step": 9425 + }, + { + "epoch": 0.46, + "grad_norm": 0.519375205039978, + "learning_rate": 0.0005655773422944017, + "loss": 3.4197, + "step": 9426 + }, + { + "epoch": 0.46, + "grad_norm": 0.5419018268585205, + "learning_rate": 0.0005655701811665073, + "loss": 3.1409, + "step": 9427 + }, + { + "epoch": 0.46, + "grad_norm": 0.5148293972015381, + "learning_rate": 0.0005655630193391522, + "loss": 3.2837, + "step": 9428 + }, + { + "epoch": 0.46, + "grad_norm": 0.522705078125, + "learning_rate": 0.0005655558568123551, + "loss": 3.2122, + "step": 9429 + }, + { + "epoch": 0.46, + "grad_norm": 0.4861201345920563, + "learning_rate": 0.0005655486935861352, + "loss": 3.1855, + "step": 9430 + }, + { + "epoch": 0.46, + "grad_norm": 0.4865737557411194, + "learning_rate": 0.0005655415296605109, + "loss": 3.1497, + "step": 9431 + }, + { + "epoch": 0.46, + "grad_norm": 0.5147886276245117, + "learning_rate": 0.0005655343650355014, + "loss": 3.5226, + "step": 9432 + }, + { + "epoch": 0.46, + "grad_norm": 0.5273894667625427, + "learning_rate": 0.0005655271997111256, + "loss": 3.3109, + "step": 9433 + }, + { + "epoch": 0.46, + "grad_norm": 0.5883224010467529, + "learning_rate": 0.0005655200336874021, + "loss": 3.2307, + "step": 9434 + }, + { + "epoch": 0.46, + "grad_norm": 0.535631537437439, + "learning_rate": 0.00056551286696435, + "loss": 3.1471, + "step": 9435 + }, + { + "epoch": 0.46, + "grad_norm": 0.5643936991691589, + "learning_rate": 0.0005655056995419881, + "loss": 3.1523, + "step": 9436 + }, + { + "epoch": 0.46, + "grad_norm": 0.5332732200622559, + "learning_rate": 0.0005654985314203354, + "loss": 3.3777, + "step": 9437 + }, + { + "epoch": 0.46, + "grad_norm": 0.4967159628868103, + "learning_rate": 0.0005654913625994105, + "loss": 3.5025, + "step": 9438 + }, + { + "epoch": 0.46, + "grad_norm": 0.5050669312477112, + "learning_rate": 0.0005654841930792325, + "loss": 3.2085, + "step": 9439 + }, + { + "epoch": 0.46, + "grad_norm": 0.5131103992462158, + "learning_rate": 0.0005654770228598202, + "loss": 3.3931, + "step": 9440 + }, + { + "epoch": 0.46, + "grad_norm": 0.5723204612731934, + "learning_rate": 0.0005654698519411925, + "loss": 3.2395, + "step": 9441 + }, + { + "epoch": 0.46, + "grad_norm": 0.521449089050293, + "learning_rate": 0.0005654626803233684, + "loss": 3.2252, + "step": 9442 + }, + { + "epoch": 0.46, + "grad_norm": 0.559038519859314, + "learning_rate": 0.0005654555080063665, + "loss": 3.2708, + "step": 9443 + }, + { + "epoch": 0.46, + "grad_norm": 0.5154927968978882, + "learning_rate": 0.000565448334990206, + "loss": 3.03, + "step": 9444 + }, + { + "epoch": 0.46, + "grad_norm": 0.6243354678153992, + "learning_rate": 0.0005654411612749058, + "loss": 3.3827, + "step": 9445 + }, + { + "epoch": 0.46, + "grad_norm": 0.6051532030105591, + "learning_rate": 0.0005654339868604846, + "loss": 3.2296, + "step": 9446 + }, + { + "epoch": 0.46, + "grad_norm": 0.5143436789512634, + "learning_rate": 0.0005654268117469613, + "loss": 3.2727, + "step": 9447 + }, + { + "epoch": 0.46, + "grad_norm": 0.485236257314682, + "learning_rate": 0.0005654196359343548, + "loss": 3.2742, + "step": 9448 + }, + { + "epoch": 0.46, + "grad_norm": 0.536080539226532, + "learning_rate": 0.0005654124594226841, + "loss": 3.2754, + "step": 9449 + }, + { + "epoch": 0.46, + "grad_norm": 0.648050844669342, + "learning_rate": 0.0005654052822119681, + "loss": 3.0665, + "step": 9450 + }, + { + "epoch": 0.46, + "grad_norm": 0.5583487749099731, + "learning_rate": 0.0005653981043022257, + "loss": 3.2576, + "step": 9451 + }, + { + "epoch": 0.46, + "grad_norm": 0.5086358785629272, + "learning_rate": 0.0005653909256934757, + "loss": 3.3566, + "step": 9452 + }, + { + "epoch": 0.46, + "grad_norm": 0.5181884765625, + "learning_rate": 0.0005653837463857371, + "loss": 3.1578, + "step": 9453 + }, + { + "epoch": 0.46, + "grad_norm": 0.4989009499549866, + "learning_rate": 0.0005653765663790288, + "loss": 2.9139, + "step": 9454 + }, + { + "epoch": 0.46, + "grad_norm": 0.5821312069892883, + "learning_rate": 0.0005653693856733698, + "loss": 3.2885, + "step": 9455 + }, + { + "epoch": 0.46, + "grad_norm": 0.5479455590248108, + "learning_rate": 0.0005653622042687788, + "loss": 3.0113, + "step": 9456 + }, + { + "epoch": 0.46, + "grad_norm": 0.5062962770462036, + "learning_rate": 0.0005653550221652747, + "loss": 3.2562, + "step": 9457 + }, + { + "epoch": 0.46, + "grad_norm": 0.5756667256355286, + "learning_rate": 0.0005653478393628767, + "loss": 3.1515, + "step": 9458 + }, + { + "epoch": 0.46, + "grad_norm": 0.5819968581199646, + "learning_rate": 0.0005653406558616034, + "loss": 3.3712, + "step": 9459 + }, + { + "epoch": 0.46, + "grad_norm": 0.53989577293396, + "learning_rate": 0.0005653334716614741, + "loss": 3.1972, + "step": 9460 + }, + { + "epoch": 0.46, + "grad_norm": 0.5572423934936523, + "learning_rate": 0.0005653262867625074, + "loss": 3.3386, + "step": 9461 + }, + { + "epoch": 0.46, + "grad_norm": 0.5422435998916626, + "learning_rate": 0.0005653191011647223, + "loss": 3.1865, + "step": 9462 + }, + { + "epoch": 0.46, + "grad_norm": 0.5119712352752686, + "learning_rate": 0.0005653119148681378, + "loss": 3.346, + "step": 9463 + }, + { + "epoch": 0.46, + "grad_norm": 0.5083188414573669, + "learning_rate": 0.0005653047278727728, + "loss": 3.3056, + "step": 9464 + }, + { + "epoch": 0.46, + "grad_norm": 0.5463467836380005, + "learning_rate": 0.0005652975401786461, + "loss": 3.3122, + "step": 9465 + }, + { + "epoch": 0.46, + "grad_norm": 0.5052841305732727, + "learning_rate": 0.0005652903517857768, + "loss": 3.3023, + "step": 9466 + }, + { + "epoch": 0.46, + "grad_norm": 0.5321139097213745, + "learning_rate": 0.0005652831626941838, + "loss": 3.2202, + "step": 9467 + }, + { + "epoch": 0.46, + "grad_norm": 0.5242906808853149, + "learning_rate": 0.0005652759729038859, + "loss": 3.1611, + "step": 9468 + }, + { + "epoch": 0.46, + "grad_norm": 0.5150046944618225, + "learning_rate": 0.0005652687824149022, + "loss": 3.3922, + "step": 9469 + }, + { + "epoch": 0.46, + "grad_norm": 0.5529156923294067, + "learning_rate": 0.0005652615912272516, + "loss": 3.2372, + "step": 9470 + }, + { + "epoch": 0.46, + "grad_norm": 0.4883171021938324, + "learning_rate": 0.000565254399340953, + "loss": 3.1305, + "step": 9471 + }, + { + "epoch": 0.46, + "grad_norm": 0.5113022923469543, + "learning_rate": 0.0005652472067560254, + "loss": 3.4073, + "step": 9472 + }, + { + "epoch": 0.46, + "grad_norm": 0.5019404292106628, + "learning_rate": 0.0005652400134724877, + "loss": 3.2087, + "step": 9473 + }, + { + "epoch": 0.46, + "grad_norm": 0.5218349695205688, + "learning_rate": 0.0005652328194903587, + "loss": 3.2981, + "step": 9474 + }, + { + "epoch": 0.46, + "grad_norm": 0.6199938058853149, + "learning_rate": 0.0005652256248096577, + "loss": 3.3046, + "step": 9475 + }, + { + "epoch": 0.46, + "grad_norm": 0.5042975544929504, + "learning_rate": 0.0005652184294304034, + "loss": 3.3814, + "step": 9476 + }, + { + "epoch": 0.46, + "grad_norm": 0.5005037784576416, + "learning_rate": 0.0005652112333526147, + "loss": 3.316, + "step": 9477 + }, + { + "epoch": 0.46, + "grad_norm": 0.507548451423645, + "learning_rate": 0.0005652040365763107, + "loss": 3.2449, + "step": 9478 + }, + { + "epoch": 0.46, + "grad_norm": 0.5326326489448547, + "learning_rate": 0.0005651968391015104, + "loss": 3.333, + "step": 9479 + }, + { + "epoch": 0.46, + "grad_norm": 0.5457335114479065, + "learning_rate": 0.0005651896409282326, + "loss": 3.2439, + "step": 9480 + }, + { + "epoch": 0.46, + "grad_norm": 0.5438616275787354, + "learning_rate": 0.0005651824420564962, + "loss": 3.0847, + "step": 9481 + }, + { + "epoch": 0.46, + "grad_norm": 0.5413506627082825, + "learning_rate": 0.0005651752424863205, + "loss": 3.3258, + "step": 9482 + }, + { + "epoch": 0.46, + "grad_norm": 0.5376286506652832, + "learning_rate": 0.0005651680422177241, + "loss": 3.0409, + "step": 9483 + }, + { + "epoch": 0.46, + "grad_norm": 0.5599796772003174, + "learning_rate": 0.0005651608412507262, + "loss": 3.2176, + "step": 9484 + }, + { + "epoch": 0.46, + "grad_norm": 0.5151841044425964, + "learning_rate": 0.0005651536395853456, + "loss": 3.1449, + "step": 9485 + }, + { + "epoch": 0.46, + "grad_norm": 0.5488869547843933, + "learning_rate": 0.0005651464372216015, + "loss": 3.2857, + "step": 9486 + }, + { + "epoch": 0.46, + "grad_norm": 0.5191092491149902, + "learning_rate": 0.0005651392341595125, + "loss": 3.3297, + "step": 9487 + }, + { + "epoch": 0.46, + "grad_norm": 0.5657668709754944, + "learning_rate": 0.0005651320303990978, + "loss": 3.1369, + "step": 9488 + }, + { + "epoch": 0.47, + "grad_norm": 0.5233826637268066, + "learning_rate": 0.0005651248259403765, + "loss": 3.2528, + "step": 9489 + }, + { + "epoch": 0.47, + "grad_norm": 0.49316510558128357, + "learning_rate": 0.0005651176207833673, + "loss": 3.2797, + "step": 9490 + }, + { + "epoch": 0.47, + "grad_norm": 0.5391998291015625, + "learning_rate": 0.0005651104149280894, + "loss": 3.2407, + "step": 9491 + }, + { + "epoch": 0.47, + "grad_norm": 0.7097063660621643, + "learning_rate": 0.0005651032083745616, + "loss": 3.3675, + "step": 9492 + }, + { + "epoch": 0.47, + "grad_norm": 0.5312466621398926, + "learning_rate": 0.0005650960011228031, + "loss": 3.2063, + "step": 9493 + }, + { + "epoch": 0.47, + "grad_norm": 0.5219338536262512, + "learning_rate": 0.0005650887931728326, + "loss": 3.3838, + "step": 9494 + }, + { + "epoch": 0.47, + "grad_norm": 0.5633711218833923, + "learning_rate": 0.0005650815845246694, + "loss": 3.084, + "step": 9495 + }, + { + "epoch": 0.47, + "grad_norm": 0.5363718867301941, + "learning_rate": 0.0005650743751783321, + "loss": 3.2394, + "step": 9496 + }, + { + "epoch": 0.47, + "grad_norm": 0.5066879987716675, + "learning_rate": 0.0005650671651338401, + "loss": 3.3002, + "step": 9497 + }, + { + "epoch": 0.47, + "grad_norm": 0.5164818167686462, + "learning_rate": 0.0005650599543912121, + "loss": 2.9735, + "step": 9498 + }, + { + "epoch": 0.47, + "grad_norm": 0.5876210331916809, + "learning_rate": 0.0005650527429504673, + "loss": 3.4418, + "step": 9499 + }, + { + "epoch": 0.47, + "grad_norm": 0.4980337917804718, + "learning_rate": 0.0005650455308116245, + "loss": 3.3927, + "step": 9500 + }, + { + "epoch": 0.47, + "grad_norm": 0.5478733777999878, + "learning_rate": 0.0005650383179747028, + "loss": 3.2894, + "step": 9501 + }, + { + "epoch": 0.47, + "grad_norm": 0.5227766036987305, + "learning_rate": 0.0005650311044397212, + "loss": 3.3741, + "step": 9502 + }, + { + "epoch": 0.47, + "grad_norm": 0.4926789700984955, + "learning_rate": 0.0005650238902066987, + "loss": 3.3118, + "step": 9503 + }, + { + "epoch": 0.47, + "grad_norm": 0.5030087828636169, + "learning_rate": 0.0005650166752756542, + "loss": 3.4107, + "step": 9504 + }, + { + "epoch": 0.47, + "grad_norm": 0.5236660838127136, + "learning_rate": 0.0005650094596466068, + "loss": 3.2949, + "step": 9505 + }, + { + "epoch": 0.47, + "grad_norm": 0.5276412963867188, + "learning_rate": 0.0005650022433195755, + "loss": 3.2227, + "step": 9506 + }, + { + "epoch": 0.47, + "grad_norm": 0.5344411730766296, + "learning_rate": 0.0005649950262945794, + "loss": 3.1652, + "step": 9507 + }, + { + "epoch": 0.47, + "grad_norm": 0.5086687207221985, + "learning_rate": 0.0005649878085716372, + "loss": 3.1716, + "step": 9508 + }, + { + "epoch": 0.47, + "grad_norm": 0.5827347636222839, + "learning_rate": 0.0005649805901507682, + "loss": 3.265, + "step": 9509 + }, + { + "epoch": 0.47, + "grad_norm": 0.5286895036697388, + "learning_rate": 0.0005649733710319913, + "loss": 3.3054, + "step": 9510 + }, + { + "epoch": 0.47, + "grad_norm": 0.5271221399307251, + "learning_rate": 0.0005649661512153256, + "loss": 3.2053, + "step": 9511 + }, + { + "epoch": 0.47, + "grad_norm": 0.4985242784023285, + "learning_rate": 0.00056495893070079, + "loss": 3.1904, + "step": 9512 + }, + { + "epoch": 0.47, + "grad_norm": 0.5197968482971191, + "learning_rate": 0.0005649517094884036, + "loss": 3.2541, + "step": 9513 + }, + { + "epoch": 0.47, + "grad_norm": 0.5603257417678833, + "learning_rate": 0.0005649444875781853, + "loss": 3.3489, + "step": 9514 + }, + { + "epoch": 0.47, + "grad_norm": 0.5142351388931274, + "learning_rate": 0.0005649372649701544, + "loss": 3.2883, + "step": 9515 + }, + { + "epoch": 0.47, + "grad_norm": 0.6001418232917786, + "learning_rate": 0.0005649300416643296, + "loss": 3.1696, + "step": 9516 + }, + { + "epoch": 0.47, + "grad_norm": 0.5239596962928772, + "learning_rate": 0.0005649228176607301, + "loss": 3.3235, + "step": 9517 + }, + { + "epoch": 0.47, + "grad_norm": 0.48557403683662415, + "learning_rate": 0.0005649155929593748, + "loss": 3.1674, + "step": 9518 + }, + { + "epoch": 0.47, + "grad_norm": 0.5428034663200378, + "learning_rate": 0.0005649083675602829, + "loss": 3.1312, + "step": 9519 + }, + { + "epoch": 0.47, + "grad_norm": 0.5654087662696838, + "learning_rate": 0.0005649011414634733, + "loss": 3.5056, + "step": 9520 + }, + { + "epoch": 0.47, + "grad_norm": 0.4970563054084778, + "learning_rate": 0.000564893914668965, + "loss": 3.3086, + "step": 9521 + }, + { + "epoch": 0.47, + "grad_norm": 0.5339492559432983, + "learning_rate": 0.0005648866871767772, + "loss": 3.2905, + "step": 9522 + }, + { + "epoch": 0.47, + "grad_norm": 0.5164199471473694, + "learning_rate": 0.0005648794589869289, + "loss": 3.1285, + "step": 9523 + }, + { + "epoch": 0.47, + "grad_norm": 0.476354718208313, + "learning_rate": 0.000564872230099439, + "loss": 3.2268, + "step": 9524 + }, + { + "epoch": 0.47, + "grad_norm": 0.5380869507789612, + "learning_rate": 0.0005648650005143267, + "loss": 3.0886, + "step": 9525 + }, + { + "epoch": 0.47, + "grad_norm": 0.5686689019203186, + "learning_rate": 0.0005648577702316108, + "loss": 3.353, + "step": 9526 + }, + { + "epoch": 0.47, + "grad_norm": 0.5547523498535156, + "learning_rate": 0.0005648505392513107, + "loss": 3.2541, + "step": 9527 + }, + { + "epoch": 0.47, + "grad_norm": 0.5288465619087219, + "learning_rate": 0.0005648433075734451, + "loss": 3.3771, + "step": 9528 + }, + { + "epoch": 0.47, + "grad_norm": 0.5291219353675842, + "learning_rate": 0.0005648360751980332, + "loss": 3.2426, + "step": 9529 + }, + { + "epoch": 0.47, + "grad_norm": 0.5199901461601257, + "learning_rate": 0.0005648288421250942, + "loss": 3.245, + "step": 9530 + }, + { + "epoch": 0.47, + "grad_norm": 0.5697982311248779, + "learning_rate": 0.0005648216083546469, + "loss": 3.3015, + "step": 9531 + }, + { + "epoch": 0.47, + "grad_norm": 0.5655345916748047, + "learning_rate": 0.0005648143738867104, + "loss": 3.3584, + "step": 9532 + }, + { + "epoch": 0.47, + "grad_norm": 0.5142018795013428, + "learning_rate": 0.0005648071387213039, + "loss": 3.2386, + "step": 9533 + }, + { + "epoch": 0.47, + "grad_norm": 0.5273463129997253, + "learning_rate": 0.0005647999028584463, + "loss": 3.211, + "step": 9534 + }, + { + "epoch": 0.47, + "grad_norm": 0.58909672498703, + "learning_rate": 0.0005647926662981568, + "loss": 3.4849, + "step": 9535 + }, + { + "epoch": 0.47, + "grad_norm": 0.7461726665496826, + "learning_rate": 0.0005647854290404543, + "loss": 3.036, + "step": 9536 + }, + { + "epoch": 0.47, + "grad_norm": 0.5585423707962036, + "learning_rate": 0.0005647781910853579, + "loss": 3.2072, + "step": 9537 + }, + { + "epoch": 0.47, + "grad_norm": 0.5416821241378784, + "learning_rate": 0.0005647709524328867, + "loss": 3.3082, + "step": 9538 + }, + { + "epoch": 0.47, + "grad_norm": 0.5571542382240295, + "learning_rate": 0.0005647637130830599, + "loss": 3.4454, + "step": 9539 + }, + { + "epoch": 0.47, + "grad_norm": 0.5479061007499695, + "learning_rate": 0.0005647564730358963, + "loss": 3.3225, + "step": 9540 + }, + { + "epoch": 0.47, + "grad_norm": 0.5047709941864014, + "learning_rate": 0.0005647492322914152, + "loss": 3.4261, + "step": 9541 + }, + { + "epoch": 0.47, + "grad_norm": 0.5049424171447754, + "learning_rate": 0.0005647419908496355, + "loss": 3.3516, + "step": 9542 + }, + { + "epoch": 0.47, + "grad_norm": 0.5358158349990845, + "learning_rate": 0.0005647347487105764, + "loss": 3.309, + "step": 9543 + }, + { + "epoch": 0.47, + "grad_norm": 0.5347208380699158, + "learning_rate": 0.0005647275058742569, + "loss": 3.3036, + "step": 9544 + }, + { + "epoch": 0.47, + "grad_norm": 0.5153633952140808, + "learning_rate": 0.000564720262340696, + "loss": 3.3403, + "step": 9545 + }, + { + "epoch": 0.47, + "grad_norm": 0.5642601847648621, + "learning_rate": 0.0005647130181099131, + "loss": 3.3031, + "step": 9546 + }, + { + "epoch": 0.47, + "grad_norm": 0.5254157185554504, + "learning_rate": 0.0005647057731819269, + "loss": 3.3091, + "step": 9547 + }, + { + "epoch": 0.47, + "grad_norm": 0.5136579275131226, + "learning_rate": 0.0005646985275567566, + "loss": 3.1616, + "step": 9548 + }, + { + "epoch": 0.47, + "grad_norm": 0.504160463809967, + "learning_rate": 0.0005646912812344214, + "loss": 3.2302, + "step": 9549 + }, + { + "epoch": 0.47, + "grad_norm": 0.5003314018249512, + "learning_rate": 0.0005646840342149403, + "loss": 3.0769, + "step": 9550 + }, + { + "epoch": 0.47, + "grad_norm": 0.5534130334854126, + "learning_rate": 0.0005646767864983325, + "loss": 3.3755, + "step": 9551 + }, + { + "epoch": 0.47, + "grad_norm": 0.5688060522079468, + "learning_rate": 0.0005646695380846168, + "loss": 3.3327, + "step": 9552 + }, + { + "epoch": 0.47, + "grad_norm": 0.5155788660049438, + "learning_rate": 0.0005646622889738125, + "loss": 3.2615, + "step": 9553 + }, + { + "epoch": 0.47, + "grad_norm": 0.5471298098564148, + "learning_rate": 0.0005646550391659387, + "loss": 3.3885, + "step": 9554 + }, + { + "epoch": 0.47, + "grad_norm": 0.5412099957466125, + "learning_rate": 0.0005646477886610145, + "loss": 3.215, + "step": 9555 + }, + { + "epoch": 0.47, + "grad_norm": 0.4741232693195343, + "learning_rate": 0.0005646405374590589, + "loss": 3.4638, + "step": 9556 + }, + { + "epoch": 0.47, + "grad_norm": 0.5171995162963867, + "learning_rate": 0.0005646332855600911, + "loss": 3.1037, + "step": 9557 + }, + { + "epoch": 0.47, + "grad_norm": 0.48581480979919434, + "learning_rate": 0.0005646260329641302, + "loss": 3.3405, + "step": 9558 + }, + { + "epoch": 0.47, + "grad_norm": 0.5740549564361572, + "learning_rate": 0.0005646187796711951, + "loss": 2.8654, + "step": 9559 + }, + { + "epoch": 0.47, + "grad_norm": 0.5170774459838867, + "learning_rate": 0.0005646115256813053, + "loss": 3.2295, + "step": 9560 + }, + { + "epoch": 0.47, + "grad_norm": 0.5482763648033142, + "learning_rate": 0.0005646042709944794, + "loss": 3.3797, + "step": 9561 + }, + { + "epoch": 0.47, + "grad_norm": 0.4993712604045868, + "learning_rate": 0.0005645970156107369, + "loss": 3.3476, + "step": 9562 + }, + { + "epoch": 0.47, + "grad_norm": 0.5390969514846802, + "learning_rate": 0.0005645897595300967, + "loss": 3.2817, + "step": 9563 + }, + { + "epoch": 0.47, + "grad_norm": 0.53498375415802, + "learning_rate": 0.0005645825027525781, + "loss": 3.3194, + "step": 9564 + }, + { + "epoch": 0.47, + "grad_norm": 0.528544008731842, + "learning_rate": 0.0005645752452782001, + "loss": 3.3329, + "step": 9565 + }, + { + "epoch": 0.47, + "grad_norm": 0.5482951402664185, + "learning_rate": 0.0005645679871069817, + "loss": 3.2478, + "step": 9566 + }, + { + "epoch": 0.47, + "grad_norm": 0.5523584485054016, + "learning_rate": 0.0005645607282389423, + "loss": 3.0492, + "step": 9567 + }, + { + "epoch": 0.47, + "grad_norm": 0.5092670321464539, + "learning_rate": 0.0005645534686741009, + "loss": 3.1282, + "step": 9568 + }, + { + "epoch": 0.47, + "grad_norm": 0.5134148597717285, + "learning_rate": 0.0005645462084124765, + "loss": 3.2474, + "step": 9569 + }, + { + "epoch": 0.47, + "grad_norm": 0.5111011862754822, + "learning_rate": 0.0005645389474540882, + "loss": 3.1094, + "step": 9570 + }, + { + "epoch": 0.47, + "grad_norm": 0.5071569085121155, + "learning_rate": 0.0005645316857989553, + "loss": 3.3619, + "step": 9571 + }, + { + "epoch": 0.47, + "grad_norm": 0.49778759479522705, + "learning_rate": 0.0005645244234470969, + "loss": 3.1761, + "step": 9572 + }, + { + "epoch": 0.47, + "grad_norm": 0.5030775666236877, + "learning_rate": 0.0005645171603985321, + "loss": 3.3071, + "step": 9573 + }, + { + "epoch": 0.47, + "grad_norm": 0.5095931887626648, + "learning_rate": 0.0005645098966532798, + "loss": 3.4006, + "step": 9574 + }, + { + "epoch": 0.47, + "grad_norm": 0.5154526233673096, + "learning_rate": 0.0005645026322113596, + "loss": 3.3653, + "step": 9575 + }, + { + "epoch": 0.47, + "grad_norm": 0.4855976700782776, + "learning_rate": 0.0005644953670727902, + "loss": 3.4481, + "step": 9576 + }, + { + "epoch": 0.47, + "grad_norm": 0.5199698805809021, + "learning_rate": 0.0005644881012375909, + "loss": 3.3039, + "step": 9577 + }, + { + "epoch": 0.47, + "grad_norm": 0.5196504592895508, + "learning_rate": 0.0005644808347057809, + "loss": 3.4343, + "step": 9578 + }, + { + "epoch": 0.47, + "grad_norm": 0.5275589227676392, + "learning_rate": 0.0005644735674773793, + "loss": 3.3105, + "step": 9579 + }, + { + "epoch": 0.47, + "grad_norm": 0.4889765977859497, + "learning_rate": 0.0005644662995524051, + "loss": 3.2342, + "step": 9580 + }, + { + "epoch": 0.47, + "grad_norm": 0.5421981811523438, + "learning_rate": 0.0005644590309308778, + "loss": 3.4189, + "step": 9581 + }, + { + "epoch": 0.47, + "grad_norm": 0.5198723077774048, + "learning_rate": 0.0005644517616128161, + "loss": 3.3944, + "step": 9582 + }, + { + "epoch": 0.47, + "grad_norm": 0.518722653388977, + "learning_rate": 0.0005644444915982394, + "loss": 3.4588, + "step": 9583 + }, + { + "epoch": 0.47, + "grad_norm": 0.5175428986549377, + "learning_rate": 0.0005644372208871668, + "loss": 3.2401, + "step": 9584 + }, + { + "epoch": 0.47, + "grad_norm": 0.5273626446723938, + "learning_rate": 0.0005644299494796175, + "loss": 3.3321, + "step": 9585 + }, + { + "epoch": 0.47, + "grad_norm": 0.5376038551330566, + "learning_rate": 0.0005644226773756107, + "loss": 3.1802, + "step": 9586 + }, + { + "epoch": 0.47, + "grad_norm": 0.5051103234291077, + "learning_rate": 0.0005644154045751652, + "loss": 3.2906, + "step": 9587 + }, + { + "epoch": 0.47, + "grad_norm": 0.5152342319488525, + "learning_rate": 0.0005644081310783006, + "loss": 3.231, + "step": 9588 + }, + { + "epoch": 0.47, + "grad_norm": 0.490278959274292, + "learning_rate": 0.0005644008568850359, + "loss": 3.333, + "step": 9589 + }, + { + "epoch": 0.47, + "grad_norm": 0.5564194321632385, + "learning_rate": 0.0005643935819953901, + "loss": 3.1443, + "step": 9590 + }, + { + "epoch": 0.47, + "grad_norm": 0.5331935286521912, + "learning_rate": 0.0005643863064093825, + "loss": 3.2971, + "step": 9591 + }, + { + "epoch": 0.47, + "grad_norm": 0.5207473635673523, + "learning_rate": 0.0005643790301270323, + "loss": 3.3384, + "step": 9592 + }, + { + "epoch": 0.47, + "grad_norm": 0.5474972724914551, + "learning_rate": 0.0005643717531483586, + "loss": 3.2845, + "step": 9593 + }, + { + "epoch": 0.47, + "grad_norm": 0.534555196762085, + "learning_rate": 0.0005643644754733805, + "loss": 3.2346, + "step": 9594 + }, + { + "epoch": 0.47, + "grad_norm": 0.5441542267799377, + "learning_rate": 0.0005643571971021174, + "loss": 3.3468, + "step": 9595 + }, + { + "epoch": 0.47, + "grad_norm": 0.5864543914794922, + "learning_rate": 0.0005643499180345882, + "loss": 3.3281, + "step": 9596 + }, + { + "epoch": 0.47, + "grad_norm": 0.5105965733528137, + "learning_rate": 0.0005643426382708124, + "loss": 3.2396, + "step": 9597 + }, + { + "epoch": 0.47, + "grad_norm": 0.5301392674446106, + "learning_rate": 0.0005643353578108088, + "loss": 3.3751, + "step": 9598 + }, + { + "epoch": 0.47, + "grad_norm": 0.5022128820419312, + "learning_rate": 0.0005643280766545967, + "loss": 3.393, + "step": 9599 + }, + { + "epoch": 0.47, + "grad_norm": 0.5763443112373352, + "learning_rate": 0.0005643207948021954, + "loss": 3.0997, + "step": 9600 + }, + { + "epoch": 0.47, + "grad_norm": 0.5480446815490723, + "learning_rate": 0.000564313512253624, + "loss": 3.2441, + "step": 9601 + }, + { + "epoch": 0.47, + "grad_norm": 0.5295218229293823, + "learning_rate": 0.0005643062290089017, + "loss": 3.4164, + "step": 9602 + }, + { + "epoch": 0.47, + "grad_norm": 0.524186909198761, + "learning_rate": 0.0005642989450680474, + "loss": 3.4485, + "step": 9603 + }, + { + "epoch": 0.47, + "grad_norm": 0.5457528829574585, + "learning_rate": 0.0005642916604310809, + "loss": 3.5014, + "step": 9604 + }, + { + "epoch": 0.47, + "grad_norm": 0.5163187384605408, + "learning_rate": 0.0005642843750980209, + "loss": 3.359, + "step": 9605 + }, + { + "epoch": 0.47, + "grad_norm": 0.5899530053138733, + "learning_rate": 0.0005642770890688866, + "loss": 3.4003, + "step": 9606 + }, + { + "epoch": 0.47, + "grad_norm": 0.5038844347000122, + "learning_rate": 0.0005642698023436974, + "loss": 3.2742, + "step": 9607 + }, + { + "epoch": 0.47, + "grad_norm": 0.5313959717750549, + "learning_rate": 0.0005642625149224724, + "loss": 3.1566, + "step": 9608 + }, + { + "epoch": 0.47, + "grad_norm": 0.5297811627388, + "learning_rate": 0.0005642552268052309, + "loss": 3.3764, + "step": 9609 + }, + { + "epoch": 0.47, + "grad_norm": 0.5239699482917786, + "learning_rate": 0.0005642479379919918, + "loss": 3.2483, + "step": 9610 + }, + { + "epoch": 0.47, + "grad_norm": 0.5261852145195007, + "learning_rate": 0.0005642406484827746, + "loss": 3.2288, + "step": 9611 + }, + { + "epoch": 0.47, + "grad_norm": 0.5086889266967773, + "learning_rate": 0.0005642333582775984, + "loss": 3.2811, + "step": 9612 + }, + { + "epoch": 0.47, + "grad_norm": 0.5237385034561157, + "learning_rate": 0.0005642260673764822, + "loss": 3.2338, + "step": 9613 + }, + { + "epoch": 0.47, + "grad_norm": 0.5552691221237183, + "learning_rate": 0.0005642187757794456, + "loss": 3.2998, + "step": 9614 + }, + { + "epoch": 0.47, + "grad_norm": 0.5657722353935242, + "learning_rate": 0.0005642114834865076, + "loss": 3.2386, + "step": 9615 + }, + { + "epoch": 0.47, + "grad_norm": 0.49646058678627014, + "learning_rate": 0.0005642041904976873, + "loss": 3.404, + "step": 9616 + }, + { + "epoch": 0.47, + "grad_norm": 0.5342143177986145, + "learning_rate": 0.000564196896813004, + "loss": 3.1347, + "step": 9617 + }, + { + "epoch": 0.47, + "grad_norm": 0.49228495359420776, + "learning_rate": 0.0005641896024324769, + "loss": 3.4549, + "step": 9618 + }, + { + "epoch": 0.47, + "grad_norm": 0.5277411937713623, + "learning_rate": 0.0005641823073561253, + "loss": 3.385, + "step": 9619 + }, + { + "epoch": 0.47, + "grad_norm": 0.5513384938240051, + "learning_rate": 0.0005641750115839685, + "loss": 3.2151, + "step": 9620 + }, + { + "epoch": 0.47, + "grad_norm": 0.5188488960266113, + "learning_rate": 0.0005641677151160253, + "loss": 3.2605, + "step": 9621 + }, + { + "epoch": 0.47, + "grad_norm": 0.5301359295845032, + "learning_rate": 0.0005641604179523153, + "loss": 3.0863, + "step": 9622 + }, + { + "epoch": 0.47, + "grad_norm": 0.5934592485427856, + "learning_rate": 0.0005641531200928575, + "loss": 3.33, + "step": 9623 + }, + { + "epoch": 0.47, + "grad_norm": 0.5196259617805481, + "learning_rate": 0.0005641458215376713, + "loss": 3.1144, + "step": 9624 + }, + { + "epoch": 0.47, + "grad_norm": 0.5472568273544312, + "learning_rate": 0.0005641385222867758, + "loss": 3.197, + "step": 9625 + }, + { + "epoch": 0.47, + "grad_norm": 0.5343920588493347, + "learning_rate": 0.0005641312223401904, + "loss": 3.3338, + "step": 9626 + }, + { + "epoch": 0.47, + "grad_norm": 0.5346314907073975, + "learning_rate": 0.000564123921697934, + "loss": 3.0962, + "step": 9627 + }, + { + "epoch": 0.47, + "grad_norm": 0.5246593356132507, + "learning_rate": 0.0005641166203600262, + "loss": 3.342, + "step": 9628 + }, + { + "epoch": 0.47, + "grad_norm": 0.5096229910850525, + "learning_rate": 0.000564109318326486, + "loss": 3.1208, + "step": 9629 + }, + { + "epoch": 0.47, + "grad_norm": 0.5147344470024109, + "learning_rate": 0.0005641020155973326, + "loss": 3.4918, + "step": 9630 + }, + { + "epoch": 0.47, + "grad_norm": 0.5902569890022278, + "learning_rate": 0.0005640947121725853, + "loss": 3.4023, + "step": 9631 + }, + { + "epoch": 0.47, + "grad_norm": 0.5098241567611694, + "learning_rate": 0.0005640874080522635, + "loss": 3.2977, + "step": 9632 + }, + { + "epoch": 0.47, + "grad_norm": 0.5633296966552734, + "learning_rate": 0.0005640801032363862, + "loss": 3.3381, + "step": 9633 + }, + { + "epoch": 0.47, + "grad_norm": 0.5749189257621765, + "learning_rate": 0.0005640727977249728, + "loss": 3.3493, + "step": 9634 + }, + { + "epoch": 0.47, + "grad_norm": 0.5577384829521179, + "learning_rate": 0.0005640654915180424, + "loss": 3.2943, + "step": 9635 + }, + { + "epoch": 0.47, + "grad_norm": 0.5394460558891296, + "learning_rate": 0.0005640581846156143, + "loss": 3.1971, + "step": 9636 + }, + { + "epoch": 0.47, + "grad_norm": 0.6581524610519409, + "learning_rate": 0.0005640508770177079, + "loss": 3.4347, + "step": 9637 + }, + { + "epoch": 0.47, + "grad_norm": 0.5236822962760925, + "learning_rate": 0.0005640435687243421, + "loss": 3.273, + "step": 9638 + }, + { + "epoch": 0.47, + "grad_norm": 0.5210537910461426, + "learning_rate": 0.0005640362597355365, + "loss": 3.2335, + "step": 9639 + }, + { + "epoch": 0.47, + "grad_norm": 0.5516870021820068, + "learning_rate": 0.0005640289500513101, + "loss": 3.3469, + "step": 9640 + }, + { + "epoch": 0.47, + "grad_norm": 0.5240686535835266, + "learning_rate": 0.0005640216396716824, + "loss": 3.5318, + "step": 9641 + }, + { + "epoch": 0.47, + "grad_norm": 0.48787635564804077, + "learning_rate": 0.0005640143285966724, + "loss": 3.423, + "step": 9642 + }, + { + "epoch": 0.47, + "grad_norm": 0.5234872102737427, + "learning_rate": 0.0005640070168262996, + "loss": 3.292, + "step": 9643 + }, + { + "epoch": 0.47, + "grad_norm": 0.5378441214561462, + "learning_rate": 0.000563999704360583, + "loss": 3.2868, + "step": 9644 + }, + { + "epoch": 0.47, + "grad_norm": 0.5410236716270447, + "learning_rate": 0.000563992391199542, + "loss": 3.2939, + "step": 9645 + }, + { + "epoch": 0.47, + "grad_norm": 0.5149595737457275, + "learning_rate": 0.0005639850773431959, + "loss": 3.3569, + "step": 9646 + }, + { + "epoch": 0.47, + "grad_norm": 0.5150244235992432, + "learning_rate": 0.0005639777627915639, + "loss": 3.2811, + "step": 9647 + }, + { + "epoch": 0.47, + "grad_norm": 0.5107737183570862, + "learning_rate": 0.0005639704475446653, + "loss": 3.2627, + "step": 9648 + }, + { + "epoch": 0.47, + "grad_norm": 0.5012353658676147, + "learning_rate": 0.0005639631316025193, + "loss": 3.2458, + "step": 9649 + }, + { + "epoch": 0.47, + "grad_norm": 0.5054120421409607, + "learning_rate": 0.0005639558149651452, + "loss": 3.141, + "step": 9650 + }, + { + "epoch": 0.47, + "grad_norm": 0.5207725763320923, + "learning_rate": 0.0005639484976325623, + "loss": 3.3233, + "step": 9651 + }, + { + "epoch": 0.47, + "grad_norm": 0.5636855363845825, + "learning_rate": 0.0005639411796047898, + "loss": 3.1278, + "step": 9652 + }, + { + "epoch": 0.47, + "grad_norm": 0.5159328579902649, + "learning_rate": 0.0005639338608818471, + "loss": 3.422, + "step": 9653 + }, + { + "epoch": 0.47, + "grad_norm": 0.5192365050315857, + "learning_rate": 0.0005639265414637534, + "loss": 3.2066, + "step": 9654 + }, + { + "epoch": 0.47, + "grad_norm": 0.5930947065353394, + "learning_rate": 0.0005639192213505279, + "loss": 3.2735, + "step": 9655 + }, + { + "epoch": 0.47, + "grad_norm": 0.5183570981025696, + "learning_rate": 0.0005639119005421901, + "loss": 3.1819, + "step": 9656 + }, + { + "epoch": 0.47, + "grad_norm": 0.5415933132171631, + "learning_rate": 0.000563904579038759, + "loss": 3.3708, + "step": 9657 + }, + { + "epoch": 0.47, + "grad_norm": 0.5286021828651428, + "learning_rate": 0.0005638972568402542, + "loss": 3.3341, + "step": 9658 + }, + { + "epoch": 0.47, + "grad_norm": 0.523844838142395, + "learning_rate": 0.0005638899339466948, + "loss": 3.3128, + "step": 9659 + }, + { + "epoch": 0.47, + "grad_norm": 0.5575231909751892, + "learning_rate": 0.0005638826103580999, + "loss": 3.2247, + "step": 9660 + }, + { + "epoch": 0.47, + "grad_norm": 0.5189796686172485, + "learning_rate": 0.0005638752860744891, + "loss": 3.3183, + "step": 9661 + }, + { + "epoch": 0.47, + "grad_norm": 0.5277138352394104, + "learning_rate": 0.0005638679610958817, + "loss": 3.2104, + "step": 9662 + }, + { + "epoch": 0.47, + "grad_norm": 0.500163733959198, + "learning_rate": 0.0005638606354222967, + "loss": 3.3856, + "step": 9663 + }, + { + "epoch": 0.47, + "grad_norm": 0.555261492729187, + "learning_rate": 0.0005638533090537536, + "loss": 3.3742, + "step": 9664 + }, + { + "epoch": 0.47, + "grad_norm": 0.5257711410522461, + "learning_rate": 0.0005638459819902718, + "loss": 3.2025, + "step": 9665 + }, + { + "epoch": 0.47, + "grad_norm": 0.5524579286575317, + "learning_rate": 0.0005638386542318703, + "loss": 3.2449, + "step": 9666 + }, + { + "epoch": 0.47, + "grad_norm": 0.561862051486969, + "learning_rate": 0.0005638313257785685, + "loss": 3.1967, + "step": 9667 + }, + { + "epoch": 0.47, + "grad_norm": 0.5200432538986206, + "learning_rate": 0.0005638239966303859, + "loss": 3.2598, + "step": 9668 + }, + { + "epoch": 0.47, + "grad_norm": 0.524965226650238, + "learning_rate": 0.0005638166667873417, + "loss": 3.4027, + "step": 9669 + }, + { + "epoch": 0.47, + "grad_norm": 0.5578727722167969, + "learning_rate": 0.0005638093362494551, + "loss": 3.0414, + "step": 9670 + }, + { + "epoch": 0.47, + "grad_norm": 0.5654264092445374, + "learning_rate": 0.0005638020050167456, + "loss": 3.3246, + "step": 9671 + }, + { + "epoch": 0.47, + "grad_norm": 0.5263561606407166, + "learning_rate": 0.0005637946730892323, + "loss": 3.2548, + "step": 9672 + }, + { + "epoch": 0.47, + "grad_norm": 0.5265440940856934, + "learning_rate": 0.0005637873404669345, + "loss": 3.1688, + "step": 9673 + }, + { + "epoch": 0.47, + "grad_norm": 0.5504220128059387, + "learning_rate": 0.0005637800071498717, + "loss": 3.1825, + "step": 9674 + }, + { + "epoch": 0.47, + "grad_norm": 0.5497219562530518, + "learning_rate": 0.0005637726731380631, + "loss": 3.1871, + "step": 9675 + }, + { + "epoch": 0.47, + "grad_norm": 0.5197966694831848, + "learning_rate": 0.0005637653384315281, + "loss": 3.5599, + "step": 9676 + }, + { + "epoch": 0.47, + "grad_norm": 0.5388951897621155, + "learning_rate": 0.0005637580030302859, + "loss": 3.3868, + "step": 9677 + }, + { + "epoch": 0.47, + "grad_norm": 0.5066190361976624, + "learning_rate": 0.000563750666934356, + "loss": 3.1101, + "step": 9678 + }, + { + "epoch": 0.47, + "grad_norm": 0.5020130276679993, + "learning_rate": 0.0005637433301437575, + "loss": 3.2261, + "step": 9679 + }, + { + "epoch": 0.47, + "grad_norm": 0.5464184880256653, + "learning_rate": 0.0005637359926585099, + "loss": 3.2281, + "step": 9680 + }, + { + "epoch": 0.47, + "grad_norm": 0.5577077865600586, + "learning_rate": 0.0005637286544786323, + "loss": 3.3123, + "step": 9681 + }, + { + "epoch": 0.47, + "grad_norm": 0.564420759677887, + "learning_rate": 0.0005637213156041443, + "loss": 3.0541, + "step": 9682 + }, + { + "epoch": 0.47, + "grad_norm": 0.4688968360424042, + "learning_rate": 0.000563713976035065, + "loss": 3.2077, + "step": 9683 + }, + { + "epoch": 0.47, + "grad_norm": 0.4943731129169464, + "learning_rate": 0.000563706635771414, + "loss": 3.2392, + "step": 9684 + }, + { + "epoch": 0.47, + "grad_norm": 0.5055580139160156, + "learning_rate": 0.0005636992948132103, + "loss": 3.3449, + "step": 9685 + }, + { + "epoch": 0.47, + "grad_norm": 0.4915640950202942, + "learning_rate": 0.0005636919531604736, + "loss": 3.3726, + "step": 9686 + }, + { + "epoch": 0.47, + "grad_norm": 0.4966067969799042, + "learning_rate": 0.0005636846108132229, + "loss": 3.322, + "step": 9687 + }, + { + "epoch": 0.47, + "grad_norm": 0.5646604299545288, + "learning_rate": 0.0005636772677714777, + "loss": 3.1713, + "step": 9688 + }, + { + "epoch": 0.47, + "grad_norm": 0.5298094153404236, + "learning_rate": 0.0005636699240352574, + "loss": 3.2453, + "step": 9689 + }, + { + "epoch": 0.47, + "grad_norm": 0.48152562975883484, + "learning_rate": 0.0005636625796045813, + "loss": 3.3521, + "step": 9690 + }, + { + "epoch": 0.47, + "grad_norm": 0.5157354474067688, + "learning_rate": 0.0005636552344794685, + "loss": 3.3635, + "step": 9691 + }, + { + "epoch": 0.47, + "grad_norm": 0.5314420461654663, + "learning_rate": 0.0005636478886599387, + "loss": 3.1824, + "step": 9692 + }, + { + "epoch": 0.48, + "grad_norm": 0.49262553453445435, + "learning_rate": 0.0005636405421460112, + "loss": 3.0643, + "step": 9693 + }, + { + "epoch": 0.48, + "grad_norm": 0.5675431489944458, + "learning_rate": 0.000563633194937705, + "loss": 3.4652, + "step": 9694 + }, + { + "epoch": 0.48, + "grad_norm": 0.5083454847335815, + "learning_rate": 0.0005636258470350399, + "loss": 3.1061, + "step": 9695 + }, + { + "epoch": 0.48, + "grad_norm": 0.5256954431533813, + "learning_rate": 0.0005636184984380349, + "loss": 3.2333, + "step": 9696 + }, + { + "epoch": 0.48, + "grad_norm": 0.5341222286224365, + "learning_rate": 0.0005636111491467097, + "loss": 3.3005, + "step": 9697 + }, + { + "epoch": 0.48, + "grad_norm": 0.5235113501548767, + "learning_rate": 0.0005636037991610833, + "loss": 3.3963, + "step": 9698 + }, + { + "epoch": 0.48, + "grad_norm": 0.5327295064926147, + "learning_rate": 0.0005635964484811753, + "loss": 3.0668, + "step": 9699 + }, + { + "epoch": 0.48, + "grad_norm": 0.5342981815338135, + "learning_rate": 0.000563589097107005, + "loss": 3.271, + "step": 9700 + }, + { + "epoch": 0.48, + "grad_norm": 0.5108175277709961, + "learning_rate": 0.0005635817450385918, + "loss": 3.4472, + "step": 9701 + }, + { + "epoch": 0.48, + "grad_norm": 0.5050833225250244, + "learning_rate": 0.0005635743922759548, + "loss": 3.4441, + "step": 9702 + }, + { + "epoch": 0.48, + "grad_norm": 0.5103448033332825, + "learning_rate": 0.0005635670388191137, + "loss": 3.4166, + "step": 9703 + }, + { + "epoch": 0.48, + "grad_norm": 0.5414007902145386, + "learning_rate": 0.0005635596846680878, + "loss": 3.2219, + "step": 9704 + }, + { + "epoch": 0.48, + "grad_norm": 0.5395253300666809, + "learning_rate": 0.0005635523298228964, + "loss": 3.3887, + "step": 9705 + }, + { + "epoch": 0.48, + "grad_norm": 0.5145513415336609, + "learning_rate": 0.0005635449742835588, + "loss": 3.3582, + "step": 9706 + }, + { + "epoch": 0.48, + "grad_norm": 0.5107079148292542, + "learning_rate": 0.0005635376180500945, + "loss": 3.2202, + "step": 9707 + }, + { + "epoch": 0.48, + "grad_norm": 0.496855229139328, + "learning_rate": 0.0005635302611225228, + "loss": 3.3866, + "step": 9708 + }, + { + "epoch": 0.48, + "grad_norm": 0.5205982327461243, + "learning_rate": 0.0005635229035008632, + "loss": 3.0983, + "step": 9709 + }, + { + "epoch": 0.48, + "grad_norm": 0.5210973024368286, + "learning_rate": 0.0005635155451851349, + "loss": 3.2638, + "step": 9710 + }, + { + "epoch": 0.48, + "grad_norm": 0.5027773380279541, + "learning_rate": 0.0005635081861753575, + "loss": 3.3372, + "step": 9711 + }, + { + "epoch": 0.48, + "grad_norm": 0.5695503950119019, + "learning_rate": 0.0005635008264715501, + "loss": 3.3357, + "step": 9712 + }, + { + "epoch": 0.48, + "grad_norm": 0.5400940179824829, + "learning_rate": 0.0005634934660737323, + "loss": 3.0366, + "step": 9713 + }, + { + "epoch": 0.48, + "grad_norm": 0.6115588545799255, + "learning_rate": 0.0005634861049819234, + "loss": 3.3567, + "step": 9714 + }, + { + "epoch": 0.48, + "grad_norm": 0.5351577401161194, + "learning_rate": 0.0005634787431961428, + "loss": 3.1951, + "step": 9715 + }, + { + "epoch": 0.48, + "grad_norm": 0.5367692112922668, + "learning_rate": 0.00056347138071641, + "loss": 3.216, + "step": 9716 + }, + { + "epoch": 0.48, + "grad_norm": 0.5334668159484863, + "learning_rate": 0.0005634640175427441, + "loss": 3.4465, + "step": 9717 + }, + { + "epoch": 0.48, + "grad_norm": 0.5270911455154419, + "learning_rate": 0.0005634566536751648, + "loss": 3.2577, + "step": 9718 + }, + { + "epoch": 0.48, + "grad_norm": 0.547909677028656, + "learning_rate": 0.0005634492891136914, + "loss": 3.154, + "step": 9719 + }, + { + "epoch": 0.48, + "grad_norm": 0.5077096819877625, + "learning_rate": 0.0005634419238583433, + "loss": 3.2528, + "step": 9720 + }, + { + "epoch": 0.48, + "grad_norm": 0.5063366889953613, + "learning_rate": 0.0005634345579091398, + "loss": 3.2683, + "step": 9721 + }, + { + "epoch": 0.48, + "grad_norm": 0.5194924473762512, + "learning_rate": 0.0005634271912661003, + "loss": 3.1052, + "step": 9722 + }, + { + "epoch": 0.48, + "grad_norm": 0.4994848966598511, + "learning_rate": 0.0005634198239292444, + "loss": 3.3142, + "step": 9723 + }, + { + "epoch": 0.48, + "grad_norm": 0.5550405383110046, + "learning_rate": 0.0005634124558985913, + "loss": 3.217, + "step": 9724 + }, + { + "epoch": 0.48, + "grad_norm": 0.5168113708496094, + "learning_rate": 0.0005634050871741606, + "loss": 3.0289, + "step": 9725 + }, + { + "epoch": 0.48, + "grad_norm": 0.5467432737350464, + "learning_rate": 0.0005633977177559715, + "loss": 3.2294, + "step": 9726 + }, + { + "epoch": 0.48, + "grad_norm": 0.5153245329856873, + "learning_rate": 0.0005633903476440434, + "loss": 3.3251, + "step": 9727 + }, + { + "epoch": 0.48, + "grad_norm": 0.5289653539657593, + "learning_rate": 0.0005633829768383961, + "loss": 3.0288, + "step": 9728 + }, + { + "epoch": 0.48, + "grad_norm": 0.5196574330329895, + "learning_rate": 0.0005633756053390485, + "loss": 3.0948, + "step": 9729 + }, + { + "epoch": 0.48, + "grad_norm": 0.5112874507904053, + "learning_rate": 0.0005633682331460204, + "loss": 3.3054, + "step": 9730 + }, + { + "epoch": 0.48, + "grad_norm": 0.5316709876060486, + "learning_rate": 0.000563360860259331, + "loss": 3.2422, + "step": 9731 + }, + { + "epoch": 0.48, + "grad_norm": 0.5173203349113464, + "learning_rate": 0.0005633534866789997, + "loss": 3.3232, + "step": 9732 + }, + { + "epoch": 0.48, + "grad_norm": 0.5282646417617798, + "learning_rate": 0.000563346112405046, + "loss": 3.2662, + "step": 9733 + }, + { + "epoch": 0.48, + "grad_norm": 0.598261296749115, + "learning_rate": 0.0005633387374374894, + "loss": 2.9985, + "step": 9734 + }, + { + "epoch": 0.48, + "grad_norm": 0.4993882179260254, + "learning_rate": 0.0005633313617763493, + "loss": 3.2471, + "step": 9735 + }, + { + "epoch": 0.48, + "grad_norm": 0.5630433559417725, + "learning_rate": 0.0005633239854216449, + "loss": 3.095, + "step": 9736 + }, + { + "epoch": 0.48, + "grad_norm": 0.47922441363334656, + "learning_rate": 0.0005633166083733959, + "loss": 3.1201, + "step": 9737 + }, + { + "epoch": 0.48, + "grad_norm": 0.5252969264984131, + "learning_rate": 0.0005633092306316216, + "loss": 3.2614, + "step": 9738 + }, + { + "epoch": 0.48, + "grad_norm": 0.513140857219696, + "learning_rate": 0.0005633018521963415, + "loss": 3.1924, + "step": 9739 + }, + { + "epoch": 0.48, + "grad_norm": 0.6242732405662537, + "learning_rate": 0.0005632944730675749, + "loss": 3.2182, + "step": 9740 + }, + { + "epoch": 0.48, + "grad_norm": 0.5124731659889221, + "learning_rate": 0.0005632870932453415, + "loss": 3.3061, + "step": 9741 + }, + { + "epoch": 0.48, + "grad_norm": 0.49745190143585205, + "learning_rate": 0.0005632797127296605, + "loss": 3.3783, + "step": 9742 + }, + { + "epoch": 0.48, + "grad_norm": 0.5272019505500793, + "learning_rate": 0.0005632723315205513, + "loss": 3.2325, + "step": 9743 + }, + { + "epoch": 0.48, + "grad_norm": 0.5917782187461853, + "learning_rate": 0.0005632649496180336, + "loss": 3.1377, + "step": 9744 + }, + { + "epoch": 0.48, + "grad_norm": 0.5247296094894409, + "learning_rate": 0.0005632575670221266, + "loss": 3.1735, + "step": 9745 + }, + { + "epoch": 0.48, + "grad_norm": 0.546837329864502, + "learning_rate": 0.0005632501837328498, + "loss": 3.6283, + "step": 9746 + }, + { + "epoch": 0.48, + "grad_norm": 0.5424903035163879, + "learning_rate": 0.0005632427997502227, + "loss": 3.3488, + "step": 9747 + }, + { + "epoch": 0.48, + "grad_norm": 0.5044673681259155, + "learning_rate": 0.0005632354150742648, + "loss": 3.2579, + "step": 9748 + }, + { + "epoch": 0.48, + "grad_norm": 0.5371702909469604, + "learning_rate": 0.0005632280297049954, + "loss": 3.1453, + "step": 9749 + }, + { + "epoch": 0.48, + "grad_norm": 0.5304502844810486, + "learning_rate": 0.0005632206436424342, + "loss": 3.2807, + "step": 9750 + }, + { + "epoch": 0.48, + "grad_norm": 0.6061875820159912, + "learning_rate": 0.0005632132568866002, + "loss": 2.9629, + "step": 9751 + }, + { + "epoch": 0.48, + "grad_norm": 0.5296114683151245, + "learning_rate": 0.0005632058694375134, + "loss": 3.2677, + "step": 9752 + }, + { + "epoch": 0.48, + "grad_norm": 0.5466916561126709, + "learning_rate": 0.0005631984812951928, + "loss": 3.4509, + "step": 9753 + }, + { + "epoch": 0.48, + "grad_norm": 0.5157197117805481, + "learning_rate": 0.0005631910924596582, + "loss": 3.1786, + "step": 9754 + }, + { + "epoch": 0.48, + "grad_norm": 0.5921932458877563, + "learning_rate": 0.0005631837029309288, + "loss": 3.1227, + "step": 9755 + }, + { + "epoch": 0.48, + "grad_norm": 0.4948883652687073, + "learning_rate": 0.0005631763127090242, + "loss": 3.3028, + "step": 9756 + }, + { + "epoch": 0.48, + "grad_norm": 0.543792724609375, + "learning_rate": 0.0005631689217939639, + "loss": 3.1953, + "step": 9757 + }, + { + "epoch": 0.48, + "grad_norm": 0.4762703478336334, + "learning_rate": 0.0005631615301857673, + "loss": 3.0992, + "step": 9758 + }, + { + "epoch": 0.48, + "grad_norm": 0.5200537443161011, + "learning_rate": 0.0005631541378844538, + "loss": 3.2991, + "step": 9759 + }, + { + "epoch": 0.48, + "grad_norm": 0.5312259793281555, + "learning_rate": 0.000563146744890043, + "loss": 3.2549, + "step": 9760 + }, + { + "epoch": 0.48, + "grad_norm": 0.547489583492279, + "learning_rate": 0.0005631393512025544, + "loss": 3.392, + "step": 9761 + }, + { + "epoch": 0.48, + "grad_norm": 0.5420605540275574, + "learning_rate": 0.0005631319568220072, + "loss": 3.5638, + "step": 9762 + }, + { + "epoch": 0.48, + "grad_norm": 0.5814469456672668, + "learning_rate": 0.0005631245617484211, + "loss": 3.1061, + "step": 9763 + }, + { + "epoch": 0.48, + "grad_norm": 0.48665496706962585, + "learning_rate": 0.0005631171659818158, + "loss": 3.3398, + "step": 9764 + }, + { + "epoch": 0.48, + "grad_norm": 0.506272554397583, + "learning_rate": 0.0005631097695222103, + "loss": 3.3197, + "step": 9765 + }, + { + "epoch": 0.48, + "grad_norm": 0.49559804797172546, + "learning_rate": 0.0005631023723696243, + "loss": 3.3218, + "step": 9766 + }, + { + "epoch": 0.48, + "grad_norm": 0.5244449377059937, + "learning_rate": 0.0005630949745240773, + "loss": 3.3369, + "step": 9767 + }, + { + "epoch": 0.48, + "grad_norm": 0.5261397957801819, + "learning_rate": 0.0005630875759855889, + "loss": 3.3897, + "step": 9768 + }, + { + "epoch": 0.48, + "grad_norm": 0.5492829084396362, + "learning_rate": 0.0005630801767541782, + "loss": 3.1185, + "step": 9769 + }, + { + "epoch": 0.48, + "grad_norm": 0.5077092051506042, + "learning_rate": 0.0005630727768298652, + "loss": 3.4211, + "step": 9770 + }, + { + "epoch": 0.48, + "grad_norm": 0.5366680026054382, + "learning_rate": 0.000563065376212669, + "loss": 3.2115, + "step": 9771 + }, + { + "epoch": 0.48, + "grad_norm": 0.5005910992622375, + "learning_rate": 0.0005630579749026093, + "loss": 3.2202, + "step": 9772 + }, + { + "epoch": 0.48, + "grad_norm": 0.5190508365631104, + "learning_rate": 0.0005630505728997055, + "loss": 3.3077, + "step": 9773 + }, + { + "epoch": 0.48, + "grad_norm": 0.5151124000549316, + "learning_rate": 0.000563043170203977, + "loss": 3.2981, + "step": 9774 + }, + { + "epoch": 0.48, + "grad_norm": 0.5408289432525635, + "learning_rate": 0.0005630357668154435, + "loss": 3.288, + "step": 9775 + }, + { + "epoch": 0.48, + "grad_norm": 0.5253680944442749, + "learning_rate": 0.0005630283627341245, + "loss": 3.4114, + "step": 9776 + }, + { + "epoch": 0.48, + "grad_norm": 0.5189976096153259, + "learning_rate": 0.0005630209579600393, + "loss": 3.1448, + "step": 9777 + }, + { + "epoch": 0.48, + "grad_norm": 0.5061872601509094, + "learning_rate": 0.0005630135524932076, + "loss": 3.1726, + "step": 9778 + }, + { + "epoch": 0.48, + "grad_norm": 0.5295987725257874, + "learning_rate": 0.0005630061463336488, + "loss": 3.1084, + "step": 9779 + }, + { + "epoch": 0.48, + "grad_norm": 0.5112553238868713, + "learning_rate": 0.0005629987394813824, + "loss": 3.4138, + "step": 9780 + }, + { + "epoch": 0.48, + "grad_norm": 0.5010356903076172, + "learning_rate": 0.0005629913319364279, + "loss": 3.0226, + "step": 9781 + }, + { + "epoch": 0.48, + "grad_norm": 0.497075617313385, + "learning_rate": 0.0005629839236988048, + "loss": 3.3942, + "step": 9782 + }, + { + "epoch": 0.48, + "grad_norm": 0.5749728679656982, + "learning_rate": 0.0005629765147685328, + "loss": 3.2406, + "step": 9783 + }, + { + "epoch": 0.48, + "grad_norm": 0.5109654664993286, + "learning_rate": 0.0005629691051456312, + "loss": 3.4464, + "step": 9784 + }, + { + "epoch": 0.48, + "grad_norm": 0.4870873689651489, + "learning_rate": 0.0005629616948301196, + "loss": 3.3477, + "step": 9785 + }, + { + "epoch": 0.48, + "grad_norm": 0.5036649107933044, + "learning_rate": 0.0005629542838220175, + "loss": 3.3398, + "step": 9786 + }, + { + "epoch": 0.48, + "grad_norm": 0.4647713303565979, + "learning_rate": 0.0005629468721213444, + "loss": 3.4662, + "step": 9787 + }, + { + "epoch": 0.48, + "grad_norm": 0.5171986818313599, + "learning_rate": 0.0005629394597281199, + "loss": 3.3284, + "step": 9788 + }, + { + "epoch": 0.48, + "grad_norm": 0.5243804454803467, + "learning_rate": 0.0005629320466423634, + "loss": 3.3182, + "step": 9789 + }, + { + "epoch": 0.48, + "grad_norm": 0.5532283186912537, + "learning_rate": 0.0005629246328640945, + "loss": 3.3757, + "step": 9790 + }, + { + "epoch": 0.48, + "grad_norm": 0.5032833814620972, + "learning_rate": 0.0005629172183933327, + "loss": 3.3534, + "step": 9791 + }, + { + "epoch": 0.48, + "grad_norm": 0.5246580839157104, + "learning_rate": 0.0005629098032300978, + "loss": 3.408, + "step": 9792 + }, + { + "epoch": 0.48, + "grad_norm": 0.5527105927467346, + "learning_rate": 0.0005629023873744087, + "loss": 3.2838, + "step": 9793 + }, + { + "epoch": 0.48, + "grad_norm": 0.5229583978652954, + "learning_rate": 0.0005628949708262856, + "loss": 3.3617, + "step": 9794 + }, + { + "epoch": 0.48, + "grad_norm": 0.5168598294258118, + "learning_rate": 0.0005628875535857476, + "loss": 3.4933, + "step": 9795 + }, + { + "epoch": 0.48, + "grad_norm": 0.4852995276451111, + "learning_rate": 0.0005628801356528144, + "loss": 3.2868, + "step": 9796 + }, + { + "epoch": 0.48, + "grad_norm": 0.5106096267700195, + "learning_rate": 0.0005628727170275055, + "loss": 3.188, + "step": 9797 + }, + { + "epoch": 0.48, + "grad_norm": 0.5226311683654785, + "learning_rate": 0.0005628652977098405, + "loss": 3.4544, + "step": 9798 + }, + { + "epoch": 0.48, + "grad_norm": 0.5285156965255737, + "learning_rate": 0.0005628578776998389, + "loss": 2.9966, + "step": 9799 + }, + { + "epoch": 0.48, + "grad_norm": 0.5192917585372925, + "learning_rate": 0.0005628504569975201, + "loss": 3.5259, + "step": 9800 + }, + { + "epoch": 0.48, + "grad_norm": 0.5123999714851379, + "learning_rate": 0.0005628430356029039, + "loss": 3.1751, + "step": 9801 + }, + { + "epoch": 0.48, + "grad_norm": 0.484468013048172, + "learning_rate": 0.0005628356135160097, + "loss": 3.3334, + "step": 9802 + }, + { + "epoch": 0.48, + "grad_norm": 0.5368399620056152, + "learning_rate": 0.0005628281907368571, + "loss": 3.4312, + "step": 9803 + }, + { + "epoch": 0.48, + "grad_norm": 0.5847996473312378, + "learning_rate": 0.0005628207672654656, + "loss": 3.4378, + "step": 9804 + }, + { + "epoch": 0.48, + "grad_norm": 0.487942099571228, + "learning_rate": 0.0005628133431018548, + "loss": 3.1906, + "step": 9805 + }, + { + "epoch": 0.48, + "grad_norm": 0.4921421706676483, + "learning_rate": 0.0005628059182460442, + "loss": 3.1767, + "step": 9806 + }, + { + "epoch": 0.48, + "grad_norm": 0.5178766250610352, + "learning_rate": 0.0005627984926980535, + "loss": 3.2761, + "step": 9807 + }, + { + "epoch": 0.48, + "grad_norm": 0.5124492049217224, + "learning_rate": 0.000562791066457902, + "loss": 3.4485, + "step": 9808 + }, + { + "epoch": 0.48, + "grad_norm": 0.5111625790596008, + "learning_rate": 0.0005627836395256095, + "loss": 3.3804, + "step": 9809 + }, + { + "epoch": 0.48, + "grad_norm": 0.5112327337265015, + "learning_rate": 0.0005627762119011955, + "loss": 3.3067, + "step": 9810 + }, + { + "epoch": 0.48, + "grad_norm": 0.5200767517089844, + "learning_rate": 0.0005627687835846794, + "loss": 3.2413, + "step": 9811 + }, + { + "epoch": 0.48, + "grad_norm": 0.5200941562652588, + "learning_rate": 0.000562761354576081, + "loss": 3.3482, + "step": 9812 + }, + { + "epoch": 0.48, + "grad_norm": 0.5365529656410217, + "learning_rate": 0.0005627539248754196, + "loss": 3.281, + "step": 9813 + }, + { + "epoch": 0.48, + "grad_norm": 0.5436131954193115, + "learning_rate": 0.0005627464944827151, + "loss": 3.3693, + "step": 9814 + }, + { + "epoch": 0.48, + "grad_norm": 0.545846164226532, + "learning_rate": 0.0005627390633979869, + "loss": 3.3813, + "step": 9815 + }, + { + "epoch": 0.48, + "grad_norm": 0.5612288117408752, + "learning_rate": 0.0005627316316212544, + "loss": 3.1317, + "step": 9816 + }, + { + "epoch": 0.48, + "grad_norm": 0.5418693423271179, + "learning_rate": 0.0005627241991525376, + "loss": 3.3768, + "step": 9817 + }, + { + "epoch": 0.48, + "grad_norm": 0.5217850804328918, + "learning_rate": 0.0005627167659918557, + "loss": 3.2675, + "step": 9818 + }, + { + "epoch": 0.48, + "grad_norm": 0.5639944076538086, + "learning_rate": 0.0005627093321392283, + "loss": 3.313, + "step": 9819 + }, + { + "epoch": 0.48, + "grad_norm": 0.5243082642555237, + "learning_rate": 0.0005627018975946752, + "loss": 3.4244, + "step": 9820 + }, + { + "epoch": 0.48, + "grad_norm": 0.4913342595100403, + "learning_rate": 0.0005626944623582158, + "loss": 3.3463, + "step": 9821 + }, + { + "epoch": 0.48, + "grad_norm": 0.5058574080467224, + "learning_rate": 0.0005626870264298698, + "loss": 3.402, + "step": 9822 + }, + { + "epoch": 0.48, + "grad_norm": 0.5007573962211609, + "learning_rate": 0.0005626795898096568, + "loss": 3.3769, + "step": 9823 + }, + { + "epoch": 0.48, + "grad_norm": 0.5387548804283142, + "learning_rate": 0.0005626721524975962, + "loss": 3.2397, + "step": 9824 + }, + { + "epoch": 0.48, + "grad_norm": 0.5633268356323242, + "learning_rate": 0.0005626647144937076, + "loss": 3.4537, + "step": 9825 + }, + { + "epoch": 0.48, + "grad_norm": 0.48922309279441833, + "learning_rate": 0.0005626572757980109, + "loss": 3.2293, + "step": 9826 + }, + { + "epoch": 0.48, + "grad_norm": 0.5135512948036194, + "learning_rate": 0.0005626498364105254, + "loss": 3.2582, + "step": 9827 + }, + { + "epoch": 0.48, + "grad_norm": 0.5401769280433655, + "learning_rate": 0.0005626423963312707, + "loss": 3.4899, + "step": 9828 + }, + { + "epoch": 0.48, + "grad_norm": 0.5234643816947937, + "learning_rate": 0.0005626349555602666, + "loss": 3.3206, + "step": 9829 + }, + { + "epoch": 0.48, + "grad_norm": 0.5322685837745667, + "learning_rate": 0.0005626275140975326, + "loss": 3.2296, + "step": 9830 + }, + { + "epoch": 0.48, + "grad_norm": 0.5047280192375183, + "learning_rate": 0.0005626200719430881, + "loss": 3.2615, + "step": 9831 + }, + { + "epoch": 0.48, + "grad_norm": 0.5207504630088806, + "learning_rate": 0.0005626126290969529, + "loss": 3.2849, + "step": 9832 + }, + { + "epoch": 0.48, + "grad_norm": 0.4990766942501068, + "learning_rate": 0.0005626051855591467, + "loss": 3.21, + "step": 9833 + }, + { + "epoch": 0.48, + "grad_norm": 0.5085784792900085, + "learning_rate": 0.0005625977413296889, + "loss": 3.4235, + "step": 9834 + }, + { + "epoch": 0.48, + "grad_norm": 0.5034537315368652, + "learning_rate": 0.0005625902964085992, + "loss": 3.2626, + "step": 9835 + }, + { + "epoch": 0.48, + "grad_norm": 0.6476481556892395, + "learning_rate": 0.0005625828507958973, + "loss": 3.1521, + "step": 9836 + }, + { + "epoch": 0.48, + "grad_norm": 0.5102798938751221, + "learning_rate": 0.0005625754044916025, + "loss": 3.2906, + "step": 9837 + }, + { + "epoch": 0.48, + "grad_norm": 0.510310173034668, + "learning_rate": 0.0005625679574957349, + "loss": 2.9805, + "step": 9838 + }, + { + "epoch": 0.48, + "grad_norm": 0.5330062508583069, + "learning_rate": 0.0005625605098083135, + "loss": 3.287, + "step": 9839 + }, + { + "epoch": 0.48, + "grad_norm": 0.5602695941925049, + "learning_rate": 0.0005625530614293584, + "loss": 3.1953, + "step": 9840 + }, + { + "epoch": 0.48, + "grad_norm": 0.5196672677993774, + "learning_rate": 0.0005625456123588892, + "loss": 3.2402, + "step": 9841 + }, + { + "epoch": 0.48, + "grad_norm": 0.6540876626968384, + "learning_rate": 0.0005625381625969252, + "loss": 3.3599, + "step": 9842 + }, + { + "epoch": 0.48, + "grad_norm": 0.5737609267234802, + "learning_rate": 0.0005625307121434862, + "loss": 3.2897, + "step": 9843 + }, + { + "epoch": 0.48, + "grad_norm": 0.5284690260887146, + "learning_rate": 0.0005625232609985919, + "loss": 3.2414, + "step": 9844 + }, + { + "epoch": 0.48, + "grad_norm": 0.5034304261207581, + "learning_rate": 0.0005625158091622619, + "loss": 3.2033, + "step": 9845 + }, + { + "epoch": 0.48, + "grad_norm": 0.49428412318229675, + "learning_rate": 0.0005625083566345158, + "loss": 3.1545, + "step": 9846 + }, + { + "epoch": 0.48, + "grad_norm": 0.5276312828063965, + "learning_rate": 0.0005625009034153732, + "loss": 3.1025, + "step": 9847 + }, + { + "epoch": 0.48, + "grad_norm": 0.5318641066551208, + "learning_rate": 0.0005624934495048535, + "loss": 3.2542, + "step": 9848 + }, + { + "epoch": 0.48, + "grad_norm": 0.5393041372299194, + "learning_rate": 0.0005624859949029768, + "loss": 3.2845, + "step": 9849 + }, + { + "epoch": 0.48, + "grad_norm": 0.5345759391784668, + "learning_rate": 0.0005624785396097625, + "loss": 3.2756, + "step": 9850 + }, + { + "epoch": 0.48, + "grad_norm": 0.5293691754341125, + "learning_rate": 0.0005624710836252302, + "loss": 3.2409, + "step": 9851 + }, + { + "epoch": 0.48, + "grad_norm": 0.5240424275398254, + "learning_rate": 0.0005624636269493995, + "loss": 3.1066, + "step": 9852 + }, + { + "epoch": 0.48, + "grad_norm": 0.5183370113372803, + "learning_rate": 0.0005624561695822903, + "loss": 3.2444, + "step": 9853 + }, + { + "epoch": 0.48, + "grad_norm": 0.5233088731765747, + "learning_rate": 0.0005624487115239219, + "loss": 3.0997, + "step": 9854 + }, + { + "epoch": 0.48, + "grad_norm": 0.5566701889038086, + "learning_rate": 0.0005624412527743142, + "loss": 3.6284, + "step": 9855 + }, + { + "epoch": 0.48, + "grad_norm": 0.5460590720176697, + "learning_rate": 0.0005624337933334867, + "loss": 3.2746, + "step": 9856 + }, + { + "epoch": 0.48, + "grad_norm": 0.5318386554718018, + "learning_rate": 0.0005624263332014591, + "loss": 3.38, + "step": 9857 + }, + { + "epoch": 0.48, + "grad_norm": 0.5089846253395081, + "learning_rate": 0.0005624188723782511, + "loss": 3.2773, + "step": 9858 + }, + { + "epoch": 0.48, + "grad_norm": 0.5393161177635193, + "learning_rate": 0.0005624114108638822, + "loss": 2.969, + "step": 9859 + }, + { + "epoch": 0.48, + "grad_norm": 0.5289279818534851, + "learning_rate": 0.0005624039486583721, + "loss": 3.1732, + "step": 9860 + }, + { + "epoch": 0.48, + "grad_norm": 0.5223667621612549, + "learning_rate": 0.0005623964857617407, + "loss": 3.0991, + "step": 9861 + }, + { + "epoch": 0.48, + "grad_norm": 0.5194090604782104, + "learning_rate": 0.0005623890221740074, + "loss": 3.1106, + "step": 9862 + }, + { + "epoch": 0.48, + "grad_norm": 0.5405828952789307, + "learning_rate": 0.0005623815578951918, + "loss": 2.9179, + "step": 9863 + }, + { + "epoch": 0.48, + "grad_norm": 0.49917301535606384, + "learning_rate": 0.0005623740929253136, + "loss": 3.1937, + "step": 9864 + }, + { + "epoch": 0.48, + "grad_norm": 0.5121216773986816, + "learning_rate": 0.0005623666272643927, + "loss": 3.483, + "step": 9865 + }, + { + "epoch": 0.48, + "grad_norm": 0.5729875564575195, + "learning_rate": 0.0005623591609124486, + "loss": 3.2528, + "step": 9866 + }, + { + "epoch": 0.48, + "grad_norm": 0.47562557458877563, + "learning_rate": 0.0005623516938695009, + "loss": 3.1612, + "step": 9867 + }, + { + "epoch": 0.48, + "grad_norm": 0.5255392789840698, + "learning_rate": 0.0005623442261355694, + "loss": 3.2327, + "step": 9868 + }, + { + "epoch": 0.48, + "grad_norm": 0.5365827083587646, + "learning_rate": 0.0005623367577106736, + "loss": 3.2066, + "step": 9869 + }, + { + "epoch": 0.48, + "grad_norm": 0.5231798887252808, + "learning_rate": 0.0005623292885948333, + "loss": 3.2322, + "step": 9870 + }, + { + "epoch": 0.48, + "grad_norm": 0.5149793028831482, + "learning_rate": 0.0005623218187880682, + "loss": 3.4431, + "step": 9871 + }, + { + "epoch": 0.48, + "grad_norm": 0.5159502625465393, + "learning_rate": 0.0005623143482903979, + "loss": 3.103, + "step": 9872 + }, + { + "epoch": 0.48, + "grad_norm": 0.523520290851593, + "learning_rate": 0.0005623068771018419, + "loss": 3.2667, + "step": 9873 + }, + { + "epoch": 0.48, + "grad_norm": 0.5203685760498047, + "learning_rate": 0.0005622994052224203, + "loss": 3.3393, + "step": 9874 + }, + { + "epoch": 0.48, + "grad_norm": 0.5236057639122009, + "learning_rate": 0.0005622919326521525, + "loss": 3.3034, + "step": 9875 + }, + { + "epoch": 0.48, + "grad_norm": 0.5350485444068909, + "learning_rate": 0.0005622844593910583, + "loss": 3.4209, + "step": 9876 + }, + { + "epoch": 0.48, + "grad_norm": 0.48950937390327454, + "learning_rate": 0.0005622769854391571, + "loss": 3.3428, + "step": 9877 + }, + { + "epoch": 0.48, + "grad_norm": 0.5686964988708496, + "learning_rate": 0.0005622695107964689, + "loss": 3.0339, + "step": 9878 + }, + { + "epoch": 0.48, + "grad_norm": 0.5592756271362305, + "learning_rate": 0.0005622620354630133, + "loss": 3.3535, + "step": 9879 + }, + { + "epoch": 0.48, + "grad_norm": 0.4988493025302887, + "learning_rate": 0.00056225455943881, + "loss": 3.3169, + "step": 9880 + }, + { + "epoch": 0.48, + "grad_norm": 0.5318952202796936, + "learning_rate": 0.0005622470827238786, + "loss": 3.171, + "step": 9881 + }, + { + "epoch": 0.48, + "grad_norm": 0.5253759622573853, + "learning_rate": 0.000562239605318239, + "loss": 3.4359, + "step": 9882 + }, + { + "epoch": 0.48, + "grad_norm": 0.4880097806453705, + "learning_rate": 0.0005622321272219105, + "loss": 3.3283, + "step": 9883 + }, + { + "epoch": 0.48, + "grad_norm": 0.49652519822120667, + "learning_rate": 0.0005622246484349132, + "loss": 3.1138, + "step": 9884 + }, + { + "epoch": 0.48, + "grad_norm": 0.554382860660553, + "learning_rate": 0.0005622171689572666, + "loss": 3.1591, + "step": 9885 + }, + { + "epoch": 0.48, + "grad_norm": 0.5065361857414246, + "learning_rate": 0.0005622096887889905, + "loss": 3.1288, + "step": 9886 + }, + { + "epoch": 0.48, + "grad_norm": 0.5004228353500366, + "learning_rate": 0.0005622022079301045, + "loss": 3.1548, + "step": 9887 + }, + { + "epoch": 0.48, + "grad_norm": 0.5251879096031189, + "learning_rate": 0.0005621947263806284, + "loss": 3.2474, + "step": 9888 + }, + { + "epoch": 0.48, + "grad_norm": 0.5283453464508057, + "learning_rate": 0.0005621872441405818, + "loss": 3.3388, + "step": 9889 + }, + { + "epoch": 0.48, + "grad_norm": 0.5356844067573547, + "learning_rate": 0.0005621797612099845, + "loss": 3.3577, + "step": 9890 + }, + { + "epoch": 0.48, + "grad_norm": 0.5235812067985535, + "learning_rate": 0.0005621722775888561, + "loss": 3.1009, + "step": 9891 + }, + { + "epoch": 0.48, + "grad_norm": 0.5591863989830017, + "learning_rate": 0.0005621647932772164, + "loss": 3.3538, + "step": 9892 + }, + { + "epoch": 0.48, + "grad_norm": 0.48452454805374146, + "learning_rate": 0.000562157308275085, + "loss": 3.429, + "step": 9893 + }, + { + "epoch": 0.48, + "grad_norm": 0.5546316504478455, + "learning_rate": 0.0005621498225824818, + "loss": 3.214, + "step": 9894 + }, + { + "epoch": 0.48, + "grad_norm": 0.48921647667884827, + "learning_rate": 0.0005621423361994264, + "loss": 3.2568, + "step": 9895 + }, + { + "epoch": 0.48, + "grad_norm": 0.5067411661148071, + "learning_rate": 0.0005621348491259386, + "loss": 3.5054, + "step": 9896 + }, + { + "epoch": 0.49, + "grad_norm": 0.5032419562339783, + "learning_rate": 0.000562127361362038, + "loss": 3.3825, + "step": 9897 + }, + { + "epoch": 0.49, + "grad_norm": 0.48625296354293823, + "learning_rate": 0.0005621198729077444, + "loss": 3.0214, + "step": 9898 + }, + { + "epoch": 0.49, + "grad_norm": 0.5369133949279785, + "learning_rate": 0.0005621123837630776, + "loss": 3.1147, + "step": 9899 + }, + { + "epoch": 0.49, + "grad_norm": 0.5231306552886963, + "learning_rate": 0.000562104893928057, + "loss": 3.3092, + "step": 9900 + }, + { + "epoch": 0.49, + "grad_norm": 0.5136045217514038, + "learning_rate": 0.0005620974034027026, + "loss": 3.3411, + "step": 9901 + }, + { + "epoch": 0.49, + "grad_norm": 0.5047800540924072, + "learning_rate": 0.0005620899121870342, + "loss": 3.3716, + "step": 9902 + }, + { + "epoch": 0.49, + "grad_norm": 0.5415762662887573, + "learning_rate": 0.0005620824202810713, + "loss": 3.2862, + "step": 9903 + }, + { + "epoch": 0.49, + "grad_norm": 0.5388754606246948, + "learning_rate": 0.0005620749276848339, + "loss": 3.3603, + "step": 9904 + }, + { + "epoch": 0.49, + "grad_norm": 0.4989524781703949, + "learning_rate": 0.0005620674343983415, + "loss": 3.2319, + "step": 9905 + }, + { + "epoch": 0.49, + "grad_norm": 0.5338003039360046, + "learning_rate": 0.0005620599404216138, + "loss": 3.2099, + "step": 9906 + }, + { + "epoch": 0.49, + "grad_norm": 0.483467698097229, + "learning_rate": 0.0005620524457546708, + "loss": 3.3824, + "step": 9907 + }, + { + "epoch": 0.49, + "grad_norm": 0.5273773670196533, + "learning_rate": 0.000562044950397532, + "loss": 3.2784, + "step": 9908 + }, + { + "epoch": 0.49, + "grad_norm": 0.5121115446090698, + "learning_rate": 0.0005620374543502173, + "loss": 3.2377, + "step": 9909 + }, + { + "epoch": 0.49, + "grad_norm": 0.5008372068405151, + "learning_rate": 0.0005620299576127463, + "loss": 3.3862, + "step": 9910 + }, + { + "epoch": 0.49, + "grad_norm": 0.49955323338508606, + "learning_rate": 0.0005620224601851389, + "loss": 3.4437, + "step": 9911 + }, + { + "epoch": 0.49, + "grad_norm": 0.5031404495239258, + "learning_rate": 0.0005620149620674147, + "loss": 3.3564, + "step": 9912 + }, + { + "epoch": 0.49, + "grad_norm": 0.5203202962875366, + "learning_rate": 0.0005620074632595936, + "loss": 3.1263, + "step": 9913 + }, + { + "epoch": 0.49, + "grad_norm": 0.48752161860466003, + "learning_rate": 0.0005619999637616951, + "loss": 3.476, + "step": 9914 + }, + { + "epoch": 0.49, + "grad_norm": 0.5217284560203552, + "learning_rate": 0.0005619924635737393, + "loss": 3.3237, + "step": 9915 + }, + { + "epoch": 0.49, + "grad_norm": 0.5151270031929016, + "learning_rate": 0.0005619849626957457, + "loss": 3.2331, + "step": 9916 + }, + { + "epoch": 0.49, + "grad_norm": 0.549656093120575, + "learning_rate": 0.0005619774611277342, + "loss": 3.2498, + "step": 9917 + }, + { + "epoch": 0.49, + "grad_norm": 0.568009078502655, + "learning_rate": 0.0005619699588697243, + "loss": 3.3618, + "step": 9918 + }, + { + "epoch": 0.49, + "grad_norm": 0.4984912872314453, + "learning_rate": 0.0005619624559217361, + "loss": 3.3981, + "step": 9919 + }, + { + "epoch": 0.49, + "grad_norm": 0.5053259134292603, + "learning_rate": 0.0005619549522837891, + "loss": 3.5771, + "step": 9920 + }, + { + "epoch": 0.49, + "grad_norm": 0.5003093481063843, + "learning_rate": 0.0005619474479559033, + "loss": 3.3099, + "step": 9921 + }, + { + "epoch": 0.49, + "grad_norm": 0.5107256770133972, + "learning_rate": 0.0005619399429380983, + "loss": 3.2729, + "step": 9922 + }, + { + "epoch": 0.49, + "grad_norm": 0.5822942852973938, + "learning_rate": 0.0005619324372303938, + "loss": 3.1418, + "step": 9923 + }, + { + "epoch": 0.49, + "grad_norm": 0.6355964541435242, + "learning_rate": 0.0005619249308328098, + "loss": 3.2895, + "step": 9924 + }, + { + "epoch": 0.49, + "grad_norm": 0.5443882942199707, + "learning_rate": 0.0005619174237453658, + "loss": 3.3586, + "step": 9925 + }, + { + "epoch": 0.49, + "grad_norm": 0.5277007818222046, + "learning_rate": 0.0005619099159680818, + "loss": 3.2578, + "step": 9926 + }, + { + "epoch": 0.49, + "grad_norm": 0.49704602360725403, + "learning_rate": 0.0005619024075009775, + "loss": 3.2784, + "step": 9927 + }, + { + "epoch": 0.49, + "grad_norm": 0.5114836692810059, + "learning_rate": 0.0005618948983440727, + "loss": 3.0513, + "step": 9928 + }, + { + "epoch": 0.49, + "grad_norm": 0.5052823424339294, + "learning_rate": 0.0005618873884973871, + "loss": 3.4613, + "step": 9929 + }, + { + "epoch": 0.49, + "grad_norm": 0.5057597756385803, + "learning_rate": 0.0005618798779609405, + "loss": 3.2938, + "step": 9930 + }, + { + "epoch": 0.49, + "grad_norm": 0.5410943031311035, + "learning_rate": 0.0005618723667347526, + "loss": 3.2756, + "step": 9931 + }, + { + "epoch": 0.49, + "grad_norm": 0.5320307612419128, + "learning_rate": 0.0005618648548188434, + "loss": 3.2589, + "step": 9932 + }, + { + "epoch": 0.49, + "grad_norm": 0.5032922029495239, + "learning_rate": 0.0005618573422132327, + "loss": 3.308, + "step": 9933 + }, + { + "epoch": 0.49, + "grad_norm": 0.49512460827827454, + "learning_rate": 0.0005618498289179399, + "loss": 3.2621, + "step": 9934 + }, + { + "epoch": 0.49, + "grad_norm": 0.5111963152885437, + "learning_rate": 0.0005618423149329853, + "loss": 3.2512, + "step": 9935 + }, + { + "epoch": 0.49, + "grad_norm": 0.5099007487297058, + "learning_rate": 0.0005618348002583883, + "loss": 3.4597, + "step": 9936 + }, + { + "epoch": 0.49, + "grad_norm": 0.5154657363891602, + "learning_rate": 0.0005618272848941687, + "loss": 3.4915, + "step": 9937 + }, + { + "epoch": 0.49, + "grad_norm": 0.5354853272438049, + "learning_rate": 0.0005618197688403466, + "loss": 3.3357, + "step": 9938 + }, + { + "epoch": 0.49, + "grad_norm": 0.5375204682350159, + "learning_rate": 0.0005618122520969416, + "loss": 3.2745, + "step": 9939 + }, + { + "epoch": 0.49, + "grad_norm": 0.5439879894256592, + "learning_rate": 0.0005618047346639735, + "loss": 3.3741, + "step": 9940 + }, + { + "epoch": 0.49, + "grad_norm": 0.5084054470062256, + "learning_rate": 0.0005617972165414621, + "loss": 3.4072, + "step": 9941 + }, + { + "epoch": 0.49, + "grad_norm": 0.5047259330749512, + "learning_rate": 0.0005617896977294271, + "loss": 3.1825, + "step": 9942 + }, + { + "epoch": 0.49, + "grad_norm": 0.5263615846633911, + "learning_rate": 0.0005617821782278886, + "loss": 3.3154, + "step": 9943 + }, + { + "epoch": 0.49, + "grad_norm": 0.5237258076667786, + "learning_rate": 0.0005617746580368661, + "loss": 3.3176, + "step": 9944 + }, + { + "epoch": 0.49, + "grad_norm": 0.535914957523346, + "learning_rate": 0.0005617671371563797, + "loss": 3.2534, + "step": 9945 + }, + { + "epoch": 0.49, + "grad_norm": 0.5265949368476868, + "learning_rate": 0.0005617596155864489, + "loss": 3.3145, + "step": 9946 + }, + { + "epoch": 0.49, + "grad_norm": 0.49210605025291443, + "learning_rate": 0.0005617520933270937, + "loss": 3.4768, + "step": 9947 + }, + { + "epoch": 0.49, + "grad_norm": 0.5354414582252502, + "learning_rate": 0.0005617445703783337, + "loss": 3.0916, + "step": 9948 + }, + { + "epoch": 0.49, + "grad_norm": 0.525729775428772, + "learning_rate": 0.0005617370467401891, + "loss": 3.5307, + "step": 9949 + }, + { + "epoch": 0.49, + "grad_norm": 0.5057360529899597, + "learning_rate": 0.0005617295224126794, + "loss": 3.1895, + "step": 9950 + }, + { + "epoch": 0.49, + "grad_norm": 0.5312747955322266, + "learning_rate": 0.0005617219973958244, + "loss": 3.3153, + "step": 9951 + }, + { + "epoch": 0.49, + "grad_norm": 0.5353907942771912, + "learning_rate": 0.0005617144716896441, + "loss": 3.3866, + "step": 9952 + }, + { + "epoch": 0.49, + "grad_norm": 0.4996834397315979, + "learning_rate": 0.0005617069452941584, + "loss": 3.0625, + "step": 9953 + }, + { + "epoch": 0.49, + "grad_norm": 0.5687936544418335, + "learning_rate": 0.0005616994182093869, + "loss": 3.1249, + "step": 9954 + }, + { + "epoch": 0.49, + "grad_norm": 0.5160571336746216, + "learning_rate": 0.0005616918904353494, + "loss": 3.2989, + "step": 9955 + }, + { + "epoch": 0.49, + "grad_norm": 0.5176610350608826, + "learning_rate": 0.0005616843619720658, + "loss": 3.2189, + "step": 9956 + }, + { + "epoch": 0.49, + "grad_norm": 0.5297330021858215, + "learning_rate": 0.000561676832819556, + "loss": 3.2166, + "step": 9957 + }, + { + "epoch": 0.49, + "grad_norm": 0.5623044371604919, + "learning_rate": 0.00056166930297784, + "loss": 3.4027, + "step": 9958 + }, + { + "epoch": 0.49, + "grad_norm": 0.5169060230255127, + "learning_rate": 0.0005616617724469371, + "loss": 3.3397, + "step": 9959 + }, + { + "epoch": 0.49, + "grad_norm": 0.5446101427078247, + "learning_rate": 0.0005616542412268677, + "loss": 3.3629, + "step": 9960 + }, + { + "epoch": 0.49, + "grad_norm": 0.5260056853294373, + "learning_rate": 0.0005616467093176511, + "loss": 3.0683, + "step": 9961 + }, + { + "epoch": 0.49, + "grad_norm": 0.5235419869422913, + "learning_rate": 0.0005616391767193077, + "loss": 3.1902, + "step": 9962 + }, + { + "epoch": 0.49, + "grad_norm": 0.514287531375885, + "learning_rate": 0.0005616316434318569, + "loss": 3.2299, + "step": 9963 + }, + { + "epoch": 0.49, + "grad_norm": 0.5356868505477905, + "learning_rate": 0.0005616241094553188, + "loss": 3.4199, + "step": 9964 + }, + { + "epoch": 0.49, + "grad_norm": 0.5147073864936829, + "learning_rate": 0.000561616574789713, + "loss": 3.2718, + "step": 9965 + }, + { + "epoch": 0.49, + "grad_norm": 0.5343377590179443, + "learning_rate": 0.0005616090394350596, + "loss": 3.2803, + "step": 9966 + }, + { + "epoch": 0.49, + "grad_norm": 0.5024563670158386, + "learning_rate": 0.0005616015033913784, + "loss": 3.204, + "step": 9967 + }, + { + "epoch": 0.49, + "grad_norm": 0.5427229404449463, + "learning_rate": 0.0005615939666586891, + "loss": 3.395, + "step": 9968 + }, + { + "epoch": 0.49, + "grad_norm": 0.5278394222259521, + "learning_rate": 0.0005615864292370116, + "loss": 3.07, + "step": 9969 + }, + { + "epoch": 0.49, + "grad_norm": 0.5011979937553406, + "learning_rate": 0.0005615788911263659, + "loss": 3.3395, + "step": 9970 + }, + { + "epoch": 0.49, + "grad_norm": 0.5226938724517822, + "learning_rate": 0.0005615713523267716, + "loss": 3.1389, + "step": 9971 + }, + { + "epoch": 0.49, + "grad_norm": 0.5529884696006775, + "learning_rate": 0.0005615638128382488, + "loss": 3.2872, + "step": 9972 + }, + { + "epoch": 0.49, + "grad_norm": 0.5180261731147766, + "learning_rate": 0.0005615562726608173, + "loss": 2.9213, + "step": 9973 + }, + { + "epoch": 0.49, + "grad_norm": 0.49697425961494446, + "learning_rate": 0.0005615487317944969, + "loss": 3.3555, + "step": 9974 + }, + { + "epoch": 0.49, + "grad_norm": 0.5160676836967468, + "learning_rate": 0.0005615411902393073, + "loss": 3.175, + "step": 9975 + }, + { + "epoch": 0.49, + "grad_norm": 0.49578121304512024, + "learning_rate": 0.0005615336479952687, + "loss": 3.1123, + "step": 9976 + }, + { + "epoch": 0.49, + "grad_norm": 0.5369651317596436, + "learning_rate": 0.0005615261050624007, + "loss": 3.2317, + "step": 9977 + }, + { + "epoch": 0.49, + "grad_norm": 0.557094156742096, + "learning_rate": 0.0005615185614407234, + "loss": 3.2406, + "step": 9978 + }, + { + "epoch": 0.49, + "grad_norm": 0.5667169094085693, + "learning_rate": 0.0005615110171302565, + "loss": 3.2702, + "step": 9979 + }, + { + "epoch": 0.49, + "grad_norm": 0.5118554830551147, + "learning_rate": 0.0005615034721310199, + "loss": 3.2947, + "step": 9980 + }, + { + "epoch": 0.49, + "grad_norm": 0.4968426823616028, + "learning_rate": 0.0005614959264430335, + "loss": 3.251, + "step": 9981 + }, + { + "epoch": 0.49, + "grad_norm": 0.5141085386276245, + "learning_rate": 0.000561488380066317, + "loss": 3.1599, + "step": 9982 + }, + { + "epoch": 0.49, + "grad_norm": 0.49446555972099304, + "learning_rate": 0.0005614808330008906, + "loss": 3.3511, + "step": 9983 + }, + { + "epoch": 0.49, + "grad_norm": 0.5145710110664368, + "learning_rate": 0.0005614732852467741, + "loss": 3.2855, + "step": 9984 + }, + { + "epoch": 0.49, + "grad_norm": 0.5225349068641663, + "learning_rate": 0.0005614657368039871, + "loss": 3.3159, + "step": 9985 + }, + { + "epoch": 0.49, + "grad_norm": 0.5085193514823914, + "learning_rate": 0.0005614581876725497, + "loss": 3.1745, + "step": 9986 + }, + { + "epoch": 0.49, + "grad_norm": 0.5428699851036072, + "learning_rate": 0.0005614506378524818, + "loss": 3.2747, + "step": 9987 + }, + { + "epoch": 0.49, + "grad_norm": 0.495169073343277, + "learning_rate": 0.0005614430873438032, + "loss": 3.3072, + "step": 9988 + }, + { + "epoch": 0.49, + "grad_norm": 0.558083176612854, + "learning_rate": 0.0005614355361465338, + "loss": 3.3659, + "step": 9989 + }, + { + "epoch": 0.49, + "grad_norm": 0.5748363733291626, + "learning_rate": 0.0005614279842606936, + "loss": 3.3423, + "step": 9990 + }, + { + "epoch": 0.49, + "grad_norm": 0.5683549046516418, + "learning_rate": 0.0005614204316863023, + "loss": 3.1963, + "step": 9991 + }, + { + "epoch": 0.49, + "grad_norm": 0.576992392539978, + "learning_rate": 0.00056141287842338, + "loss": 3.1108, + "step": 9992 + }, + { + "epoch": 0.49, + "grad_norm": 0.545221209526062, + "learning_rate": 0.0005614053244719464, + "loss": 3.2083, + "step": 9993 + }, + { + "epoch": 0.49, + "grad_norm": 0.5298340320587158, + "learning_rate": 0.0005613977698320215, + "loss": 3.231, + "step": 9994 + }, + { + "epoch": 0.49, + "grad_norm": 0.5359015464782715, + "learning_rate": 0.0005613902145036253, + "loss": 3.2543, + "step": 9995 + }, + { + "epoch": 0.49, + "grad_norm": 0.518440842628479, + "learning_rate": 0.0005613826584867775, + "loss": 3.2654, + "step": 9996 + }, + { + "epoch": 0.49, + "grad_norm": 0.5107799768447876, + "learning_rate": 0.000561375101781498, + "loss": 3.2014, + "step": 9997 + }, + { + "epoch": 0.49, + "grad_norm": 0.5043929219245911, + "learning_rate": 0.0005613675443878069, + "loss": 3.167, + "step": 9998 + }, + { + "epoch": 0.49, + "grad_norm": 0.5088926553726196, + "learning_rate": 0.0005613599863057238, + "loss": 3.1162, + "step": 9999 + }, + { + "epoch": 0.49, + "grad_norm": 0.5218754410743713, + "learning_rate": 0.000561352427535269, + "loss": 3.3379, + "step": 10000 + }, + { + "epoch": 0.49, + "grad_norm": 0.521481990814209, + "learning_rate": 0.0005613448680764621, + "loss": 3.3661, + "step": 10001 + }, + { + "epoch": 0.49, + "grad_norm": 0.5064173936843872, + "learning_rate": 0.0005613373079293232, + "loss": 3.3448, + "step": 10002 + }, + { + "epoch": 0.49, + "grad_norm": 0.5323746204376221, + "learning_rate": 0.0005613297470938721, + "loss": 3.0224, + "step": 10003 + }, + { + "epoch": 0.49, + "grad_norm": 0.5698615312576294, + "learning_rate": 0.0005613221855701287, + "loss": 3.3162, + "step": 10004 + }, + { + "epoch": 0.49, + "grad_norm": 0.5549126267433167, + "learning_rate": 0.000561314623358113, + "loss": 2.9439, + "step": 10005 + }, + { + "epoch": 0.49, + "grad_norm": 0.5148659944534302, + "learning_rate": 0.0005613070604578448, + "loss": 3.1785, + "step": 10006 + }, + { + "epoch": 0.49, + "grad_norm": 0.5279960632324219, + "learning_rate": 0.0005612994968693442, + "loss": 3.1706, + "step": 10007 + }, + { + "epoch": 0.49, + "grad_norm": 0.5391876697540283, + "learning_rate": 0.0005612919325926308, + "loss": 3.3005, + "step": 10008 + }, + { + "epoch": 0.49, + "grad_norm": 0.5231080651283264, + "learning_rate": 0.0005612843676277249, + "loss": 3.3432, + "step": 10009 + }, + { + "epoch": 0.49, + "grad_norm": 0.5668114423751831, + "learning_rate": 0.0005612768019746464, + "loss": 3.3572, + "step": 10010 + }, + { + "epoch": 0.49, + "grad_norm": 0.5466375946998596, + "learning_rate": 0.0005612692356334149, + "loss": 3.2859, + "step": 10011 + }, + { + "epoch": 0.49, + "grad_norm": 0.5328760147094727, + "learning_rate": 0.0005612616686040505, + "loss": 3.1786, + "step": 10012 + }, + { + "epoch": 0.49, + "grad_norm": 0.5143555998802185, + "learning_rate": 0.0005612541008865733, + "loss": 3.2706, + "step": 10013 + }, + { + "epoch": 0.49, + "grad_norm": 0.5146706104278564, + "learning_rate": 0.0005612465324810029, + "loss": 3.4123, + "step": 10014 + }, + { + "epoch": 0.49, + "grad_norm": 0.4874396324157715, + "learning_rate": 0.0005612389633873594, + "loss": 3.3607, + "step": 10015 + }, + { + "epoch": 0.49, + "grad_norm": 0.5126667022705078, + "learning_rate": 0.0005612313936056629, + "loss": 3.2169, + "step": 10016 + }, + { + "epoch": 0.49, + "grad_norm": 0.5450606346130371, + "learning_rate": 0.0005612238231359331, + "loss": 3.3098, + "step": 10017 + }, + { + "epoch": 0.49, + "grad_norm": 0.5393866896629333, + "learning_rate": 0.0005612162519781901, + "loss": 3.3113, + "step": 10018 + }, + { + "epoch": 0.49, + "grad_norm": 0.5736554265022278, + "learning_rate": 0.0005612086801324536, + "loss": 3.3441, + "step": 10019 + }, + { + "epoch": 0.49, + "grad_norm": 0.5407893061637878, + "learning_rate": 0.0005612011075987439, + "loss": 3.3276, + "step": 10020 + }, + { + "epoch": 0.49, + "grad_norm": 0.5435847043991089, + "learning_rate": 0.0005611935343770806, + "loss": 3.1054, + "step": 10021 + }, + { + "epoch": 0.49, + "grad_norm": 0.5143806338310242, + "learning_rate": 0.0005611859604674839, + "loss": 3.2321, + "step": 10022 + }, + { + "epoch": 0.49, + "grad_norm": 0.5631576776504517, + "learning_rate": 0.0005611783858699736, + "loss": 3.2555, + "step": 10023 + }, + { + "epoch": 0.49, + "grad_norm": 0.5366818904876709, + "learning_rate": 0.0005611708105845697, + "loss": 3.1924, + "step": 10024 + }, + { + "epoch": 0.49, + "grad_norm": 0.5211443901062012, + "learning_rate": 0.0005611632346112921, + "loss": 3.3326, + "step": 10025 + }, + { + "epoch": 0.49, + "grad_norm": 0.5157060623168945, + "learning_rate": 0.000561155657950161, + "loss": 3.3635, + "step": 10026 + }, + { + "epoch": 0.49, + "grad_norm": 0.5200645327568054, + "learning_rate": 0.0005611480806011959, + "loss": 3.3153, + "step": 10027 + }, + { + "epoch": 0.49, + "grad_norm": 0.5269324779510498, + "learning_rate": 0.0005611405025644171, + "loss": 3.2965, + "step": 10028 + }, + { + "epoch": 0.49, + "grad_norm": 0.4967326521873474, + "learning_rate": 0.0005611329238398446, + "loss": 3.1998, + "step": 10029 + }, + { + "epoch": 0.49, + "grad_norm": 0.5370751619338989, + "learning_rate": 0.0005611253444274981, + "loss": 3.2197, + "step": 10030 + }, + { + "epoch": 0.49, + "grad_norm": 0.581263542175293, + "learning_rate": 0.0005611177643273977, + "loss": 3.2003, + "step": 10031 + }, + { + "epoch": 0.49, + "grad_norm": 0.5175623297691345, + "learning_rate": 0.0005611101835395634, + "loss": 3.2288, + "step": 10032 + }, + { + "epoch": 0.49, + "grad_norm": 0.5144805312156677, + "learning_rate": 0.0005611026020640151, + "loss": 3.2167, + "step": 10033 + }, + { + "epoch": 0.49, + "grad_norm": 0.5305891633033752, + "learning_rate": 0.0005610950199007728, + "loss": 3.3823, + "step": 10034 + }, + { + "epoch": 0.49, + "grad_norm": 0.5469315052032471, + "learning_rate": 0.0005610874370498566, + "loss": 3.0891, + "step": 10035 + }, + { + "epoch": 0.49, + "grad_norm": 0.54054856300354, + "learning_rate": 0.0005610798535112862, + "loss": 3.3481, + "step": 10036 + }, + { + "epoch": 0.49, + "grad_norm": 0.5348857641220093, + "learning_rate": 0.0005610722692850817, + "loss": 3.3687, + "step": 10037 + }, + { + "epoch": 0.49, + "grad_norm": 0.5133238434791565, + "learning_rate": 0.0005610646843712631, + "loss": 3.3272, + "step": 10038 + }, + { + "epoch": 0.49, + "grad_norm": 0.5264869928359985, + "learning_rate": 0.0005610570987698504, + "loss": 3.1792, + "step": 10039 + }, + { + "epoch": 0.49, + "grad_norm": 0.5270009636878967, + "learning_rate": 0.0005610495124808635, + "loss": 3.1658, + "step": 10040 + }, + { + "epoch": 0.49, + "grad_norm": 0.5224774479866028, + "learning_rate": 0.0005610419255043225, + "loss": 3.1901, + "step": 10041 + }, + { + "epoch": 0.49, + "grad_norm": 0.5436100363731384, + "learning_rate": 0.0005610343378402473, + "loss": 3.2561, + "step": 10042 + }, + { + "epoch": 0.49, + "grad_norm": 0.5261750817298889, + "learning_rate": 0.0005610267494886578, + "loss": 3.3767, + "step": 10043 + }, + { + "epoch": 0.49, + "grad_norm": 0.5026256442070007, + "learning_rate": 0.0005610191604495741, + "loss": 3.0808, + "step": 10044 + }, + { + "epoch": 0.49, + "grad_norm": 0.5132651329040527, + "learning_rate": 0.0005610115707230162, + "loss": 3.2861, + "step": 10045 + }, + { + "epoch": 0.49, + "grad_norm": 0.5228791236877441, + "learning_rate": 0.0005610039803090041, + "loss": 3.1605, + "step": 10046 + }, + { + "epoch": 0.49, + "grad_norm": 0.5274104475975037, + "learning_rate": 0.0005609963892075577, + "loss": 3.2428, + "step": 10047 + }, + { + "epoch": 0.49, + "grad_norm": 0.53509920835495, + "learning_rate": 0.0005609887974186969, + "loss": 3.4265, + "step": 10048 + }, + { + "epoch": 0.49, + "grad_norm": 0.5528433918952942, + "learning_rate": 0.000560981204942442, + "loss": 3.2642, + "step": 10049 + }, + { + "epoch": 0.49, + "grad_norm": 0.5362628102302551, + "learning_rate": 0.0005609736117788129, + "loss": 3.0798, + "step": 10050 + }, + { + "epoch": 0.49, + "grad_norm": 0.5504554510116577, + "learning_rate": 0.0005609660179278295, + "loss": 3.1879, + "step": 10051 + }, + { + "epoch": 0.49, + "grad_norm": 0.5313317179679871, + "learning_rate": 0.0005609584233895117, + "loss": 3.3346, + "step": 10052 + }, + { + "epoch": 0.49, + "grad_norm": 0.5195586085319519, + "learning_rate": 0.0005609508281638798, + "loss": 3.3072, + "step": 10053 + }, + { + "epoch": 0.49, + "grad_norm": 0.5247373580932617, + "learning_rate": 0.0005609432322509534, + "loss": 3.222, + "step": 10054 + }, + { + "epoch": 0.49, + "grad_norm": 0.5279762744903564, + "learning_rate": 0.0005609356356507529, + "loss": 3.2947, + "step": 10055 + }, + { + "epoch": 0.49, + "grad_norm": 0.509883463382721, + "learning_rate": 0.0005609280383632981, + "loss": 3.4618, + "step": 10056 + }, + { + "epoch": 0.49, + "grad_norm": 0.5072588324546814, + "learning_rate": 0.0005609204403886092, + "loss": 3.0462, + "step": 10057 + }, + { + "epoch": 0.49, + "grad_norm": 0.5225916504859924, + "learning_rate": 0.000560912841726706, + "loss": 3.3883, + "step": 10058 + }, + { + "epoch": 0.49, + "grad_norm": 0.5001477599143982, + "learning_rate": 0.0005609052423776085, + "loss": 3.3117, + "step": 10059 + }, + { + "epoch": 0.49, + "grad_norm": 0.5187198519706726, + "learning_rate": 0.0005608976423413369, + "loss": 3.2966, + "step": 10060 + }, + { + "epoch": 0.49, + "grad_norm": 0.5253342390060425, + "learning_rate": 0.0005608900416179111, + "loss": 3.3214, + "step": 10061 + }, + { + "epoch": 0.49, + "grad_norm": 0.5546568632125854, + "learning_rate": 0.0005608824402073512, + "loss": 3.4283, + "step": 10062 + }, + { + "epoch": 0.49, + "grad_norm": 0.5639400482177734, + "learning_rate": 0.0005608748381096771, + "loss": 3.3312, + "step": 10063 + }, + { + "epoch": 0.49, + "grad_norm": 0.5478060245513916, + "learning_rate": 0.0005608672353249089, + "loss": 3.4401, + "step": 10064 + }, + { + "epoch": 0.49, + "grad_norm": 0.5374152660369873, + "learning_rate": 0.0005608596318530665, + "loss": 3.4056, + "step": 10065 + }, + { + "epoch": 0.49, + "grad_norm": 0.5391674041748047, + "learning_rate": 0.0005608520276941701, + "loss": 3.2503, + "step": 10066 + }, + { + "epoch": 0.49, + "grad_norm": 0.504303514957428, + "learning_rate": 0.0005608444228482398, + "loss": 3.4028, + "step": 10067 + }, + { + "epoch": 0.49, + "grad_norm": 0.5163530111312866, + "learning_rate": 0.0005608368173152953, + "loss": 3.2145, + "step": 10068 + }, + { + "epoch": 0.49, + "grad_norm": 0.5681028366088867, + "learning_rate": 0.0005608292110953569, + "loss": 3.2051, + "step": 10069 + }, + { + "epoch": 0.49, + "grad_norm": 0.5483840703964233, + "learning_rate": 0.0005608216041884446, + "loss": 3.1229, + "step": 10070 + }, + { + "epoch": 0.49, + "grad_norm": 0.5046508312225342, + "learning_rate": 0.0005608139965945783, + "loss": 3.1301, + "step": 10071 + }, + { + "epoch": 0.49, + "grad_norm": 0.5168729424476624, + "learning_rate": 0.0005608063883137782, + "loss": 3.4912, + "step": 10072 + }, + { + "epoch": 0.49, + "grad_norm": 0.5135037302970886, + "learning_rate": 0.0005607987793460642, + "loss": 3.4096, + "step": 10073 + }, + { + "epoch": 0.49, + "grad_norm": 0.524517834186554, + "learning_rate": 0.0005607911696914565, + "loss": 3.1052, + "step": 10074 + }, + { + "epoch": 0.49, + "grad_norm": 0.4915313422679901, + "learning_rate": 0.0005607835593499749, + "loss": 3.2567, + "step": 10075 + }, + { + "epoch": 0.49, + "grad_norm": 0.503527045249939, + "learning_rate": 0.0005607759483216398, + "loss": 3.0298, + "step": 10076 + }, + { + "epoch": 0.49, + "grad_norm": 0.5238627791404724, + "learning_rate": 0.0005607683366064709, + "loss": 3.1529, + "step": 10077 + }, + { + "epoch": 0.49, + "grad_norm": 0.5229913592338562, + "learning_rate": 0.0005607607242044884, + "loss": 3.3084, + "step": 10078 + }, + { + "epoch": 0.49, + "grad_norm": 0.5081816911697388, + "learning_rate": 0.0005607531111157123, + "loss": 3.2387, + "step": 10079 + }, + { + "epoch": 0.49, + "grad_norm": 0.5463985800743103, + "learning_rate": 0.0005607454973401627, + "loss": 3.1542, + "step": 10080 + }, + { + "epoch": 0.49, + "grad_norm": 0.5712737441062927, + "learning_rate": 0.0005607378828778598, + "loss": 3.3672, + "step": 10081 + }, + { + "epoch": 0.49, + "grad_norm": 0.48129820823669434, + "learning_rate": 0.0005607302677288233, + "loss": 3.5522, + "step": 10082 + }, + { + "epoch": 0.49, + "grad_norm": 0.5331260561943054, + "learning_rate": 0.0005607226518930735, + "loss": 3.2012, + "step": 10083 + }, + { + "epoch": 0.49, + "grad_norm": 0.5193986296653748, + "learning_rate": 0.0005607150353706305, + "loss": 3.3374, + "step": 10084 + }, + { + "epoch": 0.49, + "grad_norm": 0.5116517543792725, + "learning_rate": 0.0005607074181615141, + "loss": 3.2151, + "step": 10085 + }, + { + "epoch": 0.49, + "grad_norm": 0.5041857957839966, + "learning_rate": 0.0005606998002657447, + "loss": 3.2215, + "step": 10086 + }, + { + "epoch": 0.49, + "grad_norm": 0.5375064015388489, + "learning_rate": 0.0005606921816833422, + "loss": 3.3488, + "step": 10087 + }, + { + "epoch": 0.49, + "grad_norm": 0.5213835835456848, + "learning_rate": 0.0005606845624143266, + "loss": 3.346, + "step": 10088 + }, + { + "epoch": 0.49, + "grad_norm": 0.5121163725852966, + "learning_rate": 0.000560676942458718, + "loss": 3.0301, + "step": 10089 + }, + { + "epoch": 0.49, + "grad_norm": 0.5614414811134338, + "learning_rate": 0.0005606693218165366, + "loss": 3.2334, + "step": 10090 + }, + { + "epoch": 0.49, + "grad_norm": 0.5325416326522827, + "learning_rate": 0.0005606617004878024, + "loss": 3.3862, + "step": 10091 + }, + { + "epoch": 0.49, + "grad_norm": 0.5521125793457031, + "learning_rate": 0.0005606540784725353, + "loss": 3.3762, + "step": 10092 + }, + { + "epoch": 0.49, + "grad_norm": 0.5565793514251709, + "learning_rate": 0.0005606464557707555, + "loss": 3.1573, + "step": 10093 + }, + { + "epoch": 0.49, + "grad_norm": 0.5189270973205566, + "learning_rate": 0.0005606388323824832, + "loss": 3.4819, + "step": 10094 + }, + { + "epoch": 0.49, + "grad_norm": 0.5117472410202026, + "learning_rate": 0.0005606312083077384, + "loss": 3.2461, + "step": 10095 + }, + { + "epoch": 0.49, + "grad_norm": 0.5243008732795715, + "learning_rate": 0.0005606235835465412, + "loss": 3.0837, + "step": 10096 + }, + { + "epoch": 0.49, + "grad_norm": 0.5307744145393372, + "learning_rate": 0.0005606159580989115, + "loss": 3.2632, + "step": 10097 + }, + { + "epoch": 0.49, + "grad_norm": 0.5260905027389526, + "learning_rate": 0.0005606083319648695, + "loss": 3.2347, + "step": 10098 + }, + { + "epoch": 0.49, + "grad_norm": 0.5161594152450562, + "learning_rate": 0.0005606007051444354, + "loss": 3.214, + "step": 10099 + }, + { + "epoch": 0.49, + "grad_norm": 0.5047897100448608, + "learning_rate": 0.0005605930776376293, + "loss": 3.1914, + "step": 10100 + }, + { + "epoch": 0.5, + "grad_norm": 0.4992629885673523, + "learning_rate": 0.000560585449444471, + "loss": 3.4533, + "step": 10101 + }, + { + "epoch": 0.5, + "grad_norm": 0.4943458139896393, + "learning_rate": 0.0005605778205649808, + "loss": 3.217, + "step": 10102 + }, + { + "epoch": 0.5, + "grad_norm": 0.5585146546363831, + "learning_rate": 0.0005605701909991789, + "loss": 3.0695, + "step": 10103 + }, + { + "epoch": 0.5, + "grad_norm": 0.5517671704292297, + "learning_rate": 0.0005605625607470851, + "loss": 3.0032, + "step": 10104 + }, + { + "epoch": 0.5, + "grad_norm": 0.5213403105735779, + "learning_rate": 0.0005605549298087197, + "loss": 3.3822, + "step": 10105 + }, + { + "epoch": 0.5, + "grad_norm": 0.5177978277206421, + "learning_rate": 0.0005605472981841028, + "loss": 3.4062, + "step": 10106 + }, + { + "epoch": 0.5, + "grad_norm": 0.5317561030387878, + "learning_rate": 0.0005605396658732545, + "loss": 3.2718, + "step": 10107 + }, + { + "epoch": 0.5, + "grad_norm": 0.5072512626647949, + "learning_rate": 0.0005605320328761948, + "loss": 3.3612, + "step": 10108 + }, + { + "epoch": 0.5, + "grad_norm": 0.519949734210968, + "learning_rate": 0.000560524399192944, + "loss": 3.4358, + "step": 10109 + }, + { + "epoch": 0.5, + "grad_norm": 0.4792541563510895, + "learning_rate": 0.0005605167648235218, + "loss": 3.2888, + "step": 10110 + }, + { + "epoch": 0.5, + "grad_norm": 0.5226483345031738, + "learning_rate": 0.0005605091297679488, + "loss": 3.1335, + "step": 10111 + }, + { + "epoch": 0.5, + "grad_norm": 0.5207281112670898, + "learning_rate": 0.0005605014940262449, + "loss": 3.3549, + "step": 10112 + }, + { + "epoch": 0.5, + "grad_norm": 0.5150095224380493, + "learning_rate": 0.0005604938575984301, + "loss": 3.0484, + "step": 10113 + }, + { + "epoch": 0.5, + "grad_norm": 0.5432024002075195, + "learning_rate": 0.0005604862204845245, + "loss": 3.1677, + "step": 10114 + }, + { + "epoch": 0.5, + "grad_norm": 0.5010020732879639, + "learning_rate": 0.0005604785826845486, + "loss": 3.2577, + "step": 10115 + }, + { + "epoch": 0.5, + "grad_norm": 0.49979767203330994, + "learning_rate": 0.0005604709441985222, + "loss": 3.3097, + "step": 10116 + }, + { + "epoch": 0.5, + "grad_norm": 0.5717746615409851, + "learning_rate": 0.0005604633050264652, + "loss": 3.1078, + "step": 10117 + }, + { + "epoch": 0.5, + "grad_norm": 0.530775785446167, + "learning_rate": 0.0005604556651683982, + "loss": 3.0593, + "step": 10118 + }, + { + "epoch": 0.5, + "grad_norm": 0.5330826640129089, + "learning_rate": 0.000560448024624341, + "loss": 3.3647, + "step": 10119 + }, + { + "epoch": 0.5, + "grad_norm": 0.4870454668998718, + "learning_rate": 0.000560440383394314, + "loss": 3.3083, + "step": 10120 + }, + { + "epoch": 0.5, + "grad_norm": 0.5176739692687988, + "learning_rate": 0.000560432741478337, + "loss": 3.3033, + "step": 10121 + }, + { + "epoch": 0.5, + "grad_norm": 0.5079305768013, + "learning_rate": 0.0005604250988764303, + "loss": 3.3905, + "step": 10122 + }, + { + "epoch": 0.5, + "grad_norm": 0.5102031826972961, + "learning_rate": 0.000560417455588614, + "loss": 3.2952, + "step": 10123 + }, + { + "epoch": 0.5, + "grad_norm": 0.5234565138816833, + "learning_rate": 0.0005604098116149083, + "loss": 3.4796, + "step": 10124 + }, + { + "epoch": 0.5, + "grad_norm": 0.48953551054000854, + "learning_rate": 0.0005604021669553332, + "loss": 3.135, + "step": 10125 + }, + { + "epoch": 0.5, + "grad_norm": 0.5201693773269653, + "learning_rate": 0.000560394521609909, + "loss": 3.2396, + "step": 10126 + }, + { + "epoch": 0.5, + "grad_norm": 0.5162103176116943, + "learning_rate": 0.0005603868755786557, + "loss": 3.3356, + "step": 10127 + }, + { + "epoch": 0.5, + "grad_norm": 0.535090982913971, + "learning_rate": 0.0005603792288615935, + "loss": 3.2915, + "step": 10128 + }, + { + "epoch": 0.5, + "grad_norm": 0.5197641253471375, + "learning_rate": 0.0005603715814587425, + "loss": 3.1518, + "step": 10129 + }, + { + "epoch": 0.5, + "grad_norm": 0.5615101456642151, + "learning_rate": 0.0005603639333701228, + "loss": 3.3298, + "step": 10130 + }, + { + "epoch": 0.5, + "grad_norm": 0.5671878457069397, + "learning_rate": 0.0005603562845957548, + "loss": 3.2195, + "step": 10131 + }, + { + "epoch": 0.5, + "grad_norm": 0.654164731502533, + "learning_rate": 0.0005603486351356582, + "loss": 3.2873, + "step": 10132 + }, + { + "epoch": 0.5, + "grad_norm": 0.5772500038146973, + "learning_rate": 0.0005603409849898535, + "loss": 3.2613, + "step": 10133 + }, + { + "epoch": 0.5, + "grad_norm": 0.5550999641418457, + "learning_rate": 0.0005603333341583607, + "loss": 3.2879, + "step": 10134 + }, + { + "epoch": 0.5, + "grad_norm": 0.49901139736175537, + "learning_rate": 0.0005603256826412002, + "loss": 3.1976, + "step": 10135 + }, + { + "epoch": 0.5, + "grad_norm": 0.5248491764068604, + "learning_rate": 0.0005603180304383917, + "loss": 3.4905, + "step": 10136 + }, + { + "epoch": 0.5, + "grad_norm": 0.5227196216583252, + "learning_rate": 0.0005603103775499559, + "loss": 3.3214, + "step": 10137 + }, + { + "epoch": 0.5, + "grad_norm": 0.5142403841018677, + "learning_rate": 0.0005603027239759124, + "loss": 3.1934, + "step": 10138 + }, + { + "epoch": 0.5, + "grad_norm": 0.5276763439178467, + "learning_rate": 0.0005602950697162817, + "loss": 3.1553, + "step": 10139 + }, + { + "epoch": 0.5, + "grad_norm": 0.4911149740219116, + "learning_rate": 0.0005602874147710838, + "loss": 3.3715, + "step": 10140 + }, + { + "epoch": 0.5, + "grad_norm": 0.5246410965919495, + "learning_rate": 0.0005602797591403391, + "loss": 3.1999, + "step": 10141 + }, + { + "epoch": 0.5, + "grad_norm": 0.5695706605911255, + "learning_rate": 0.0005602721028240675, + "loss": 3.4837, + "step": 10142 + }, + { + "epoch": 0.5, + "grad_norm": 0.5213264226913452, + "learning_rate": 0.0005602644458222893, + "loss": 3.258, + "step": 10143 + }, + { + "epoch": 0.5, + "grad_norm": 0.5553370714187622, + "learning_rate": 0.0005602567881350248, + "loss": 3.3955, + "step": 10144 + }, + { + "epoch": 0.5, + "grad_norm": 0.5400976538658142, + "learning_rate": 0.0005602491297622939, + "loss": 3.2611, + "step": 10145 + }, + { + "epoch": 0.5, + "grad_norm": 0.5228300094604492, + "learning_rate": 0.0005602414707041168, + "loss": 3.2218, + "step": 10146 + }, + { + "epoch": 0.5, + "grad_norm": 0.5486548542976379, + "learning_rate": 0.0005602338109605137, + "loss": 3.2305, + "step": 10147 + }, + { + "epoch": 0.5, + "grad_norm": 0.5071641802787781, + "learning_rate": 0.0005602261505315049, + "loss": 3.2951, + "step": 10148 + }, + { + "epoch": 0.5, + "grad_norm": 0.5746123790740967, + "learning_rate": 0.0005602184894171106, + "loss": 3.3812, + "step": 10149 + }, + { + "epoch": 0.5, + "grad_norm": 0.5090572237968445, + "learning_rate": 0.0005602108276173507, + "loss": 3.2406, + "step": 10150 + }, + { + "epoch": 0.5, + "grad_norm": 0.5613428354263306, + "learning_rate": 0.0005602031651322457, + "loss": 3.248, + "step": 10151 + }, + { + "epoch": 0.5, + "grad_norm": 0.49390625953674316, + "learning_rate": 0.0005601955019618156, + "loss": 3.1618, + "step": 10152 + }, + { + "epoch": 0.5, + "grad_norm": 0.5180720686912537, + "learning_rate": 0.0005601878381060807, + "loss": 3.3309, + "step": 10153 + }, + { + "epoch": 0.5, + "grad_norm": 0.4854278266429901, + "learning_rate": 0.0005601801735650609, + "loss": 3.4998, + "step": 10154 + }, + { + "epoch": 0.5, + "grad_norm": 0.539308488368988, + "learning_rate": 0.0005601725083387767, + "loss": 3.3767, + "step": 10155 + }, + { + "epoch": 0.5, + "grad_norm": 0.5451698899269104, + "learning_rate": 0.0005601648424272482, + "loss": 3.0673, + "step": 10156 + }, + { + "epoch": 0.5, + "grad_norm": 0.5229968428611755, + "learning_rate": 0.0005601571758304957, + "loss": 3.2044, + "step": 10157 + }, + { + "epoch": 0.5, + "grad_norm": 0.5304858088493347, + "learning_rate": 0.0005601495085485391, + "loss": 3.4131, + "step": 10158 + }, + { + "epoch": 0.5, + "grad_norm": 0.5545448064804077, + "learning_rate": 0.0005601418405813989, + "loss": 3.3089, + "step": 10159 + }, + { + "epoch": 0.5, + "grad_norm": 0.47743499279022217, + "learning_rate": 0.000560134171929095, + "loss": 3.2294, + "step": 10160 + }, + { + "epoch": 0.5, + "grad_norm": 0.5139665007591248, + "learning_rate": 0.0005601265025916478, + "loss": 3.2909, + "step": 10161 + }, + { + "epoch": 0.5, + "grad_norm": 0.54677414894104, + "learning_rate": 0.0005601188325690774, + "loss": 3.0912, + "step": 10162 + }, + { + "epoch": 0.5, + "grad_norm": 0.5394816994667053, + "learning_rate": 0.0005601111618614041, + "loss": 3.1764, + "step": 10163 + }, + { + "epoch": 0.5, + "grad_norm": 0.5438968539237976, + "learning_rate": 0.0005601034904686482, + "loss": 3.1558, + "step": 10164 + }, + { + "epoch": 0.5, + "grad_norm": 0.5133037567138672, + "learning_rate": 0.0005600958183908296, + "loss": 3.2835, + "step": 10165 + }, + { + "epoch": 0.5, + "grad_norm": 0.5024428963661194, + "learning_rate": 0.0005600881456279687, + "loss": 3.1066, + "step": 10166 + }, + { + "epoch": 0.5, + "grad_norm": 0.5286892652511597, + "learning_rate": 0.0005600804721800857, + "loss": 3.1596, + "step": 10167 + }, + { + "epoch": 0.5, + "grad_norm": 0.5346733927726746, + "learning_rate": 0.0005600727980472009, + "loss": 3.4603, + "step": 10168 + }, + { + "epoch": 0.5, + "grad_norm": 0.51430344581604, + "learning_rate": 0.0005600651232293343, + "loss": 3.3469, + "step": 10169 + }, + { + "epoch": 0.5, + "grad_norm": 0.4911240041255951, + "learning_rate": 0.0005600574477265062, + "loss": 3.4604, + "step": 10170 + }, + { + "epoch": 0.5, + "grad_norm": 0.5240756273269653, + "learning_rate": 0.0005600497715387368, + "loss": 3.4024, + "step": 10171 + }, + { + "epoch": 0.5, + "grad_norm": 0.5592271685600281, + "learning_rate": 0.0005600420946660464, + "loss": 2.9897, + "step": 10172 + }, + { + "epoch": 0.5, + "grad_norm": 0.49334201216697693, + "learning_rate": 0.0005600344171084551, + "loss": 3.098, + "step": 10173 + }, + { + "epoch": 0.5, + "grad_norm": 0.5433202385902405, + "learning_rate": 0.0005600267388659833, + "loss": 3.0928, + "step": 10174 + }, + { + "epoch": 0.5, + "grad_norm": 0.5227448344230652, + "learning_rate": 0.0005600190599386511, + "loss": 3.0703, + "step": 10175 + }, + { + "epoch": 0.5, + "grad_norm": 0.5543997287750244, + "learning_rate": 0.0005600113803264787, + "loss": 3.0732, + "step": 10176 + }, + { + "epoch": 0.5, + "grad_norm": 0.4942059814929962, + "learning_rate": 0.0005600037000294863, + "loss": 3.2383, + "step": 10177 + }, + { + "epoch": 0.5, + "grad_norm": 0.5727730393409729, + "learning_rate": 0.0005599960190476943, + "loss": 3.1809, + "step": 10178 + }, + { + "epoch": 0.5, + "grad_norm": 0.5260288119316101, + "learning_rate": 0.0005599883373811228, + "loss": 3.3584, + "step": 10179 + }, + { + "epoch": 0.5, + "grad_norm": 0.5285677313804626, + "learning_rate": 0.0005599806550297921, + "loss": 3.2441, + "step": 10180 + }, + { + "epoch": 0.5, + "grad_norm": 0.5523196458816528, + "learning_rate": 0.0005599729719937222, + "loss": 3.2402, + "step": 10181 + }, + { + "epoch": 0.5, + "grad_norm": 0.5142799019813538, + "learning_rate": 0.0005599652882729337, + "loss": 3.1881, + "step": 10182 + }, + { + "epoch": 0.5, + "grad_norm": 0.5352577567100525, + "learning_rate": 0.0005599576038674466, + "loss": 3.3204, + "step": 10183 + }, + { + "epoch": 0.5, + "grad_norm": 0.5273776650428772, + "learning_rate": 0.0005599499187772813, + "loss": 3.105, + "step": 10184 + }, + { + "epoch": 0.5, + "grad_norm": 0.5525439381599426, + "learning_rate": 0.0005599422330024578, + "loss": 3.2628, + "step": 10185 + }, + { + "epoch": 0.5, + "grad_norm": 0.5069778561592102, + "learning_rate": 0.0005599345465429965, + "loss": 3.0637, + "step": 10186 + }, + { + "epoch": 0.5, + "grad_norm": 0.5232580900192261, + "learning_rate": 0.0005599268593989177, + "loss": 3.1459, + "step": 10187 + }, + { + "epoch": 0.5, + "grad_norm": 0.5141770839691162, + "learning_rate": 0.0005599191715702416, + "loss": 3.2485, + "step": 10188 + }, + { + "epoch": 0.5, + "grad_norm": 0.5252525210380554, + "learning_rate": 0.0005599114830569884, + "loss": 3.5268, + "step": 10189 + }, + { + "epoch": 0.5, + "grad_norm": 0.552005410194397, + "learning_rate": 0.0005599037938591782, + "loss": 3.3599, + "step": 10190 + }, + { + "epoch": 0.5, + "grad_norm": 0.4965636134147644, + "learning_rate": 0.0005598961039768316, + "loss": 3.1345, + "step": 10191 + }, + { + "epoch": 0.5, + "grad_norm": 0.5195544362068176, + "learning_rate": 0.0005598884134099687, + "loss": 3.2845, + "step": 10192 + }, + { + "epoch": 0.5, + "grad_norm": 0.5534344911575317, + "learning_rate": 0.0005598807221586097, + "loss": 3.2648, + "step": 10193 + }, + { + "epoch": 0.5, + "grad_norm": 0.5047644376754761, + "learning_rate": 0.0005598730302227749, + "loss": 3.363, + "step": 10194 + }, + { + "epoch": 0.5, + "grad_norm": 0.5294030904769897, + "learning_rate": 0.0005598653376024845, + "loss": 3.3243, + "step": 10195 + }, + { + "epoch": 0.5, + "grad_norm": 0.4891115725040436, + "learning_rate": 0.0005598576442977588, + "loss": 3.1471, + "step": 10196 + }, + { + "epoch": 0.5, + "grad_norm": 0.4774000942707062, + "learning_rate": 0.0005598499503086182, + "loss": 3.5423, + "step": 10197 + }, + { + "epoch": 0.5, + "grad_norm": 0.5198843479156494, + "learning_rate": 0.0005598422556350827, + "loss": 3.0888, + "step": 10198 + }, + { + "epoch": 0.5, + "grad_norm": 0.5236388444900513, + "learning_rate": 0.0005598345602771728, + "loss": 3.2707, + "step": 10199 + }, + { + "epoch": 0.5, + "grad_norm": 0.5683999061584473, + "learning_rate": 0.0005598268642349086, + "loss": 3.4979, + "step": 10200 + }, + { + "epoch": 0.5, + "grad_norm": 0.5080311298370361, + "learning_rate": 0.0005598191675083105, + "loss": 3.2974, + "step": 10201 + }, + { + "epoch": 0.5, + "grad_norm": 0.5282857418060303, + "learning_rate": 0.0005598114700973988, + "loss": 3.2676, + "step": 10202 + }, + { + "epoch": 0.5, + "grad_norm": 0.5361818671226501, + "learning_rate": 0.0005598037720021935, + "loss": 3.1697, + "step": 10203 + }, + { + "epoch": 0.5, + "grad_norm": 0.5248768925666809, + "learning_rate": 0.0005597960732227152, + "loss": 3.2677, + "step": 10204 + }, + { + "epoch": 0.5, + "grad_norm": 0.5688296556472778, + "learning_rate": 0.000559788373758984, + "loss": 3.2665, + "step": 10205 + }, + { + "epoch": 0.5, + "grad_norm": 0.5076455473899841, + "learning_rate": 0.0005597806736110202, + "loss": 3.2625, + "step": 10206 + }, + { + "epoch": 0.5, + "grad_norm": 0.5241527557373047, + "learning_rate": 0.0005597729727788442, + "loss": 3.3101, + "step": 10207 + }, + { + "epoch": 0.5, + "grad_norm": 0.561117947101593, + "learning_rate": 0.0005597652712624761, + "loss": 3.3028, + "step": 10208 + }, + { + "epoch": 0.5, + "grad_norm": 0.49607518315315247, + "learning_rate": 0.0005597575690619362, + "loss": 3.3582, + "step": 10209 + }, + { + "epoch": 0.5, + "grad_norm": 0.5208153128623962, + "learning_rate": 0.000559749866177245, + "loss": 3.2614, + "step": 10210 + }, + { + "epoch": 0.5, + "grad_norm": 0.47716373205184937, + "learning_rate": 0.0005597421626084225, + "loss": 3.0946, + "step": 10211 + }, + { + "epoch": 0.5, + "grad_norm": 0.5305400490760803, + "learning_rate": 0.0005597344583554893, + "loss": 3.3251, + "step": 10212 + }, + { + "epoch": 0.5, + "grad_norm": 0.49316057562828064, + "learning_rate": 0.0005597267534184654, + "loss": 3.4436, + "step": 10213 + }, + { + "epoch": 0.5, + "grad_norm": 0.5044198036193848, + "learning_rate": 0.0005597190477973712, + "loss": 3.4866, + "step": 10214 + }, + { + "epoch": 0.5, + "grad_norm": 0.5129861831665039, + "learning_rate": 0.0005597113414922271, + "loss": 3.1051, + "step": 10215 + }, + { + "epoch": 0.5, + "grad_norm": 0.5174407362937927, + "learning_rate": 0.0005597036345030532, + "loss": 3.2929, + "step": 10216 + }, + { + "epoch": 0.5, + "grad_norm": 0.5471522808074951, + "learning_rate": 0.00055969592682987, + "loss": 3.3473, + "step": 10217 + }, + { + "epoch": 0.5, + "grad_norm": 0.5389727354049683, + "learning_rate": 0.0005596882184726976, + "loss": 3.4021, + "step": 10218 + }, + { + "epoch": 0.5, + "grad_norm": 0.5008918642997742, + "learning_rate": 0.0005596805094315565, + "loss": 3.2045, + "step": 10219 + }, + { + "epoch": 0.5, + "grad_norm": 0.5358201265335083, + "learning_rate": 0.0005596727997064669, + "loss": 3.5891, + "step": 10220 + }, + { + "epoch": 0.5, + "grad_norm": 0.5327726602554321, + "learning_rate": 0.0005596650892974492, + "loss": 3.3117, + "step": 10221 + }, + { + "epoch": 0.5, + "grad_norm": 0.549594521522522, + "learning_rate": 0.0005596573782045236, + "loss": 3.2777, + "step": 10222 + }, + { + "epoch": 0.5, + "grad_norm": 0.4677071273326874, + "learning_rate": 0.0005596496664277104, + "loss": 3.1865, + "step": 10223 + }, + { + "epoch": 0.5, + "grad_norm": 0.5113584995269775, + "learning_rate": 0.0005596419539670299, + "loss": 3.4198, + "step": 10224 + }, + { + "epoch": 0.5, + "grad_norm": 0.5231432914733887, + "learning_rate": 0.0005596342408225024, + "loss": 3.2867, + "step": 10225 + }, + { + "epoch": 0.5, + "grad_norm": 0.5117649435997009, + "learning_rate": 0.0005596265269941485, + "loss": 3.266, + "step": 10226 + }, + { + "epoch": 0.5, + "grad_norm": 0.5720617771148682, + "learning_rate": 0.0005596188124819881, + "loss": 3.4351, + "step": 10227 + }, + { + "epoch": 0.5, + "grad_norm": 0.5223726034164429, + "learning_rate": 0.0005596110972860419, + "loss": 3.2824, + "step": 10228 + }, + { + "epoch": 0.5, + "grad_norm": 0.5558088421821594, + "learning_rate": 0.0005596033814063299, + "loss": 3.3895, + "step": 10229 + }, + { + "epoch": 0.5, + "grad_norm": 0.5451314449310303, + "learning_rate": 0.0005595956648428726, + "loss": 3.3244, + "step": 10230 + }, + { + "epoch": 0.5, + "grad_norm": 0.5551238059997559, + "learning_rate": 0.0005595879475956902, + "loss": 3.2968, + "step": 10231 + }, + { + "epoch": 0.5, + "grad_norm": 0.5394368171691895, + "learning_rate": 0.0005595802296648031, + "loss": 3.0322, + "step": 10232 + }, + { + "epoch": 0.5, + "grad_norm": 0.5566186904907227, + "learning_rate": 0.0005595725110502317, + "loss": 3.4952, + "step": 10233 + }, + { + "epoch": 0.5, + "grad_norm": 0.508893609046936, + "learning_rate": 0.0005595647917519962, + "loss": 3.338, + "step": 10234 + }, + { + "epoch": 0.5, + "grad_norm": 0.5071843266487122, + "learning_rate": 0.0005595570717701172, + "loss": 3.3411, + "step": 10235 + }, + { + "epoch": 0.5, + "grad_norm": 0.5460813045501709, + "learning_rate": 0.0005595493511046146, + "loss": 3.3642, + "step": 10236 + }, + { + "epoch": 0.5, + "grad_norm": 0.5000070929527283, + "learning_rate": 0.000559541629755509, + "loss": 3.4009, + "step": 10237 + }, + { + "epoch": 0.5, + "grad_norm": 0.5307058691978455, + "learning_rate": 0.0005595339077228207, + "loss": 3.2637, + "step": 10238 + }, + { + "epoch": 0.5, + "grad_norm": 0.5203930735588074, + "learning_rate": 0.0005595261850065701, + "loss": 3.3635, + "step": 10239 + }, + { + "epoch": 0.5, + "grad_norm": 0.5376899242401123, + "learning_rate": 0.0005595184616067775, + "loss": 3.3816, + "step": 10240 + }, + { + "epoch": 0.5, + "grad_norm": 0.661049485206604, + "learning_rate": 0.0005595107375234631, + "loss": 3.3454, + "step": 10241 + }, + { + "epoch": 0.5, + "grad_norm": 0.505993664264679, + "learning_rate": 0.0005595030127566474, + "loss": 3.274, + "step": 10242 + }, + { + "epoch": 0.5, + "grad_norm": 0.4986911118030548, + "learning_rate": 0.0005594952873063507, + "loss": 3.2775, + "step": 10243 + }, + { + "epoch": 0.5, + "grad_norm": 0.5380048155784607, + "learning_rate": 0.0005594875611725934, + "loss": 3.2546, + "step": 10244 + }, + { + "epoch": 0.5, + "grad_norm": 0.5448837280273438, + "learning_rate": 0.0005594798343553959, + "loss": 3.0941, + "step": 10245 + }, + { + "epoch": 0.5, + "grad_norm": 0.5074895024299622, + "learning_rate": 0.0005594721068547783, + "loss": 3.5444, + "step": 10246 + }, + { + "epoch": 0.5, + "grad_norm": 0.5164600014686584, + "learning_rate": 0.0005594643786707611, + "loss": 3.3968, + "step": 10247 + }, + { + "epoch": 0.5, + "grad_norm": 0.49650079011917114, + "learning_rate": 0.0005594566498033647, + "loss": 3.4343, + "step": 10248 + }, + { + "epoch": 0.5, + "grad_norm": 0.4930596351623535, + "learning_rate": 0.0005594489202526094, + "loss": 3.3165, + "step": 10249 + }, + { + "epoch": 0.5, + "grad_norm": 0.5038466453552246, + "learning_rate": 0.0005594411900185156, + "loss": 3.3183, + "step": 10250 + }, + { + "epoch": 0.5, + "grad_norm": 0.5192158818244934, + "learning_rate": 0.0005594334591011036, + "loss": 3.2953, + "step": 10251 + }, + { + "epoch": 0.5, + "grad_norm": 0.5114601254463196, + "learning_rate": 0.0005594257275003938, + "loss": 3.4671, + "step": 10252 + }, + { + "epoch": 0.5, + "grad_norm": 0.5503695011138916, + "learning_rate": 0.0005594179952164067, + "loss": 3.4004, + "step": 10253 + }, + { + "epoch": 0.5, + "grad_norm": 0.49898719787597656, + "learning_rate": 0.0005594102622491624, + "loss": 3.2809, + "step": 10254 + }, + { + "epoch": 0.5, + "grad_norm": 0.5273557305335999, + "learning_rate": 0.0005594025285986814, + "loss": 3.3543, + "step": 10255 + }, + { + "epoch": 0.5, + "grad_norm": 0.5539382100105286, + "learning_rate": 0.0005593947942649841, + "loss": 3.2133, + "step": 10256 + }, + { + "epoch": 0.5, + "grad_norm": 0.5285389423370361, + "learning_rate": 0.0005593870592480908, + "loss": 3.2378, + "step": 10257 + }, + { + "epoch": 0.5, + "grad_norm": 0.504951000213623, + "learning_rate": 0.000559379323548022, + "loss": 3.3102, + "step": 10258 + }, + { + "epoch": 0.5, + "grad_norm": 0.5255095362663269, + "learning_rate": 0.0005593715871647978, + "loss": 3.2966, + "step": 10259 + }, + { + "epoch": 0.5, + "grad_norm": 0.48558107018470764, + "learning_rate": 0.0005593638500984389, + "loss": 3.4101, + "step": 10260 + }, + { + "epoch": 0.5, + "grad_norm": 0.5173567533493042, + "learning_rate": 0.0005593561123489654, + "loss": 3.4406, + "step": 10261 + }, + { + "epoch": 0.5, + "grad_norm": 0.5314114093780518, + "learning_rate": 0.0005593483739163979, + "loss": 3.244, + "step": 10262 + }, + { + "epoch": 0.5, + "grad_norm": 0.502029538154602, + "learning_rate": 0.0005593406348007567, + "loss": 3.39, + "step": 10263 + }, + { + "epoch": 0.5, + "grad_norm": 0.5283157825469971, + "learning_rate": 0.0005593328950020623, + "loss": 3.2418, + "step": 10264 + }, + { + "epoch": 0.5, + "grad_norm": 0.5121799111366272, + "learning_rate": 0.0005593251545203348, + "loss": 3.2263, + "step": 10265 + }, + { + "epoch": 0.5, + "grad_norm": 0.517483651638031, + "learning_rate": 0.0005593174133555948, + "loss": 3.3841, + "step": 10266 + }, + { + "epoch": 0.5, + "grad_norm": 0.5155675411224365, + "learning_rate": 0.0005593096715078626, + "loss": 3.0287, + "step": 10267 + }, + { + "epoch": 0.5, + "grad_norm": 0.5659499168395996, + "learning_rate": 0.0005593019289771587, + "loss": 3.11, + "step": 10268 + }, + { + "epoch": 0.5, + "grad_norm": 0.5271885395050049, + "learning_rate": 0.0005592941857635034, + "loss": 3.3701, + "step": 10269 + }, + { + "epoch": 0.5, + "grad_norm": 0.5610504746437073, + "learning_rate": 0.0005592864418669171, + "loss": 3.2141, + "step": 10270 + }, + { + "epoch": 0.5, + "grad_norm": 0.5438143014907837, + "learning_rate": 0.0005592786972874203, + "loss": 3.1933, + "step": 10271 + }, + { + "epoch": 0.5, + "grad_norm": 0.5469740033149719, + "learning_rate": 0.0005592709520250331, + "loss": 3.3807, + "step": 10272 + }, + { + "epoch": 0.5, + "grad_norm": 0.530945360660553, + "learning_rate": 0.0005592632060797762, + "loss": 3.3132, + "step": 10273 + }, + { + "epoch": 0.5, + "grad_norm": 0.5154049396514893, + "learning_rate": 0.00055925545945167, + "loss": 3.2693, + "step": 10274 + }, + { + "epoch": 0.5, + "grad_norm": 0.5086045861244202, + "learning_rate": 0.0005592477121407347, + "loss": 3.267, + "step": 10275 + }, + { + "epoch": 0.5, + "grad_norm": 0.5159934163093567, + "learning_rate": 0.000559239964146991, + "loss": 3.1684, + "step": 10276 + }, + { + "epoch": 0.5, + "grad_norm": 0.4987911283969879, + "learning_rate": 0.000559232215470459, + "loss": 3.238, + "step": 10277 + }, + { + "epoch": 0.5, + "grad_norm": 0.5027976632118225, + "learning_rate": 0.0005592244661111592, + "loss": 3.2168, + "step": 10278 + }, + { + "epoch": 0.5, + "grad_norm": 0.5337111353874207, + "learning_rate": 0.0005592167160691121, + "loss": 3.2408, + "step": 10279 + }, + { + "epoch": 0.5, + "grad_norm": 0.4724106192588806, + "learning_rate": 0.000559208965344338, + "loss": 3.201, + "step": 10280 + }, + { + "epoch": 0.5, + "grad_norm": 0.5122838616371155, + "learning_rate": 0.0005592012139368574, + "loss": 3.1215, + "step": 10281 + }, + { + "epoch": 0.5, + "grad_norm": 0.5199552774429321, + "learning_rate": 0.0005591934618466908, + "loss": 3.2236, + "step": 10282 + }, + { + "epoch": 0.5, + "grad_norm": 0.506280243396759, + "learning_rate": 0.0005591857090738584, + "loss": 3.0693, + "step": 10283 + }, + { + "epoch": 0.5, + "grad_norm": 0.5122705101966858, + "learning_rate": 0.0005591779556183807, + "loss": 3.2787, + "step": 10284 + }, + { + "epoch": 0.5, + "grad_norm": 0.47597208619117737, + "learning_rate": 0.0005591702014802782, + "loss": 3.3259, + "step": 10285 + }, + { + "epoch": 0.5, + "grad_norm": 0.5174579620361328, + "learning_rate": 0.0005591624466595712, + "loss": 3.2819, + "step": 10286 + }, + { + "epoch": 0.5, + "grad_norm": 0.5105884075164795, + "learning_rate": 0.0005591546911562802, + "loss": 3.4321, + "step": 10287 + }, + { + "epoch": 0.5, + "grad_norm": 0.5002108812332153, + "learning_rate": 0.0005591469349704257, + "loss": 3.2793, + "step": 10288 + }, + { + "epoch": 0.5, + "grad_norm": 0.5126345157623291, + "learning_rate": 0.000559139178102028, + "loss": 3.4978, + "step": 10289 + }, + { + "epoch": 0.5, + "grad_norm": 0.5179390907287598, + "learning_rate": 0.0005591314205511075, + "loss": 3.3352, + "step": 10290 + }, + { + "epoch": 0.5, + "grad_norm": 0.49281585216522217, + "learning_rate": 0.0005591236623176849, + "loss": 3.2815, + "step": 10291 + }, + { + "epoch": 0.5, + "grad_norm": 0.5143146514892578, + "learning_rate": 0.0005591159034017803, + "loss": 3.4356, + "step": 10292 + }, + { + "epoch": 0.5, + "grad_norm": 0.5127323865890503, + "learning_rate": 0.0005591081438034143, + "loss": 3.2153, + "step": 10293 + }, + { + "epoch": 0.5, + "grad_norm": 0.5019605755805969, + "learning_rate": 0.0005591003835226073, + "loss": 3.3087, + "step": 10294 + }, + { + "epoch": 0.5, + "grad_norm": 0.5273397564888, + "learning_rate": 0.0005590926225593798, + "loss": 3.2319, + "step": 10295 + }, + { + "epoch": 0.5, + "grad_norm": 0.528471052646637, + "learning_rate": 0.0005590848609137521, + "loss": 3.3845, + "step": 10296 + }, + { + "epoch": 0.5, + "grad_norm": 0.5653498768806458, + "learning_rate": 0.0005590770985857448, + "loss": 3.2638, + "step": 10297 + }, + { + "epoch": 0.5, + "grad_norm": 0.5153144001960754, + "learning_rate": 0.0005590693355753784, + "loss": 3.3797, + "step": 10298 + }, + { + "epoch": 0.5, + "grad_norm": 0.5238440632820129, + "learning_rate": 0.0005590615718826731, + "loss": 3.3346, + "step": 10299 + }, + { + "epoch": 0.5, + "grad_norm": 0.5217507481575012, + "learning_rate": 0.0005590538075076494, + "loss": 3.0556, + "step": 10300 + }, + { + "epoch": 0.5, + "grad_norm": 0.5109837055206299, + "learning_rate": 0.000559046042450328, + "loss": 3.4186, + "step": 10301 + }, + { + "epoch": 0.5, + "grad_norm": 0.5007416009902954, + "learning_rate": 0.0005590382767107291, + "loss": 3.403, + "step": 10302 + }, + { + "epoch": 0.5, + "grad_norm": 0.5148555040359497, + "learning_rate": 0.0005590305102888732, + "loss": 3.2634, + "step": 10303 + }, + { + "epoch": 0.5, + "grad_norm": 0.5685636401176453, + "learning_rate": 0.0005590227431847809, + "loss": 3.3305, + "step": 10304 + }, + { + "epoch": 0.51, + "grad_norm": 0.5556630492210388, + "learning_rate": 0.0005590149753984724, + "loss": 3.3192, + "step": 10305 + }, + { + "epoch": 0.51, + "grad_norm": 0.545971691608429, + "learning_rate": 0.0005590072069299684, + "loss": 3.1871, + "step": 10306 + }, + { + "epoch": 0.51, + "grad_norm": 0.5093954801559448, + "learning_rate": 0.0005589994377792893, + "loss": 3.3295, + "step": 10307 + }, + { + "epoch": 0.51, + "grad_norm": 0.5161018967628479, + "learning_rate": 0.0005589916679464554, + "loss": 3.2058, + "step": 10308 + }, + { + "epoch": 0.51, + "grad_norm": 0.527915358543396, + "learning_rate": 0.0005589838974314874, + "loss": 3.2002, + "step": 10309 + }, + { + "epoch": 0.51, + "grad_norm": 0.5070471167564392, + "learning_rate": 0.0005589761262344056, + "loss": 3.0882, + "step": 10310 + }, + { + "epoch": 0.51, + "grad_norm": 0.4909028708934784, + "learning_rate": 0.0005589683543552305, + "loss": 3.2828, + "step": 10311 + }, + { + "epoch": 0.51, + "grad_norm": 0.5422300696372986, + "learning_rate": 0.0005589605817939826, + "loss": 3.3773, + "step": 10312 + }, + { + "epoch": 0.51, + "grad_norm": 0.5772907137870789, + "learning_rate": 0.0005589528085506824, + "loss": 3.2922, + "step": 10313 + }, + { + "epoch": 0.51, + "grad_norm": 0.5355245471000671, + "learning_rate": 0.0005589450346253504, + "loss": 3.1342, + "step": 10314 + }, + { + "epoch": 0.51, + "grad_norm": 0.5125998258590698, + "learning_rate": 0.0005589372600180069, + "loss": 3.4883, + "step": 10315 + }, + { + "epoch": 0.51, + "grad_norm": 0.5136677622795105, + "learning_rate": 0.0005589294847286726, + "loss": 3.3296, + "step": 10316 + }, + { + "epoch": 0.51, + "grad_norm": 0.514228105545044, + "learning_rate": 0.0005589217087573678, + "loss": 3.2156, + "step": 10317 + }, + { + "epoch": 0.51, + "grad_norm": 0.5587817430496216, + "learning_rate": 0.000558913932104113, + "loss": 3.2759, + "step": 10318 + }, + { + "epoch": 0.51, + "grad_norm": 0.6269665360450745, + "learning_rate": 0.0005589061547689288, + "loss": 3.2579, + "step": 10319 + }, + { + "epoch": 0.51, + "grad_norm": 0.5368645787239075, + "learning_rate": 0.0005588983767518356, + "loss": 3.3245, + "step": 10320 + }, + { + "epoch": 0.51, + "grad_norm": 0.5467808842658997, + "learning_rate": 0.0005588905980528539, + "loss": 3.272, + "step": 10321 + }, + { + "epoch": 0.51, + "grad_norm": 0.5453903079032898, + "learning_rate": 0.0005588828186720041, + "loss": 3.2743, + "step": 10322 + }, + { + "epoch": 0.51, + "grad_norm": 0.5781856179237366, + "learning_rate": 0.000558875038609307, + "loss": 3.2912, + "step": 10323 + }, + { + "epoch": 0.51, + "grad_norm": 0.6446077227592468, + "learning_rate": 0.0005588672578647827, + "loss": 3.2426, + "step": 10324 + }, + { + "epoch": 0.51, + "grad_norm": 0.5255220532417297, + "learning_rate": 0.0005588594764384519, + "loss": 3.4687, + "step": 10325 + }, + { + "epoch": 0.51, + "grad_norm": 0.5455769896507263, + "learning_rate": 0.0005588516943303352, + "loss": 3.2468, + "step": 10326 + }, + { + "epoch": 0.51, + "grad_norm": 0.5492979288101196, + "learning_rate": 0.0005588439115404527, + "loss": 3.4958, + "step": 10327 + }, + { + "epoch": 0.51, + "grad_norm": 0.5501328110694885, + "learning_rate": 0.0005588361280688252, + "loss": 3.2044, + "step": 10328 + }, + { + "epoch": 0.51, + "grad_norm": 0.5918759107589722, + "learning_rate": 0.0005588283439154733, + "loss": 3.4665, + "step": 10329 + }, + { + "epoch": 0.51, + "grad_norm": 0.5502012968063354, + "learning_rate": 0.0005588205590804173, + "loss": 3.2085, + "step": 10330 + }, + { + "epoch": 0.51, + "grad_norm": 0.49773234128952026, + "learning_rate": 0.0005588127735636776, + "loss": 3.3999, + "step": 10331 + }, + { + "epoch": 0.51, + "grad_norm": 0.542114794254303, + "learning_rate": 0.0005588049873652751, + "loss": 3.321, + "step": 10332 + }, + { + "epoch": 0.51, + "grad_norm": 0.5828530788421631, + "learning_rate": 0.00055879720048523, + "loss": 3.2922, + "step": 10333 + }, + { + "epoch": 0.51, + "grad_norm": 0.5640475153923035, + "learning_rate": 0.0005587894129235628, + "loss": 3.1788, + "step": 10334 + }, + { + "epoch": 0.51, + "grad_norm": 0.5116139650344849, + "learning_rate": 0.0005587816246802941, + "loss": 3.1212, + "step": 10335 + }, + { + "epoch": 0.51, + "grad_norm": 0.4903753399848938, + "learning_rate": 0.0005587738357554445, + "loss": 3.1048, + "step": 10336 + }, + { + "epoch": 0.51, + "grad_norm": 0.5303298234939575, + "learning_rate": 0.0005587660461490343, + "loss": 3.1958, + "step": 10337 + }, + { + "epoch": 0.51, + "grad_norm": 0.5109275579452515, + "learning_rate": 0.0005587582558610843, + "loss": 3.3446, + "step": 10338 + }, + { + "epoch": 0.51, + "grad_norm": 0.5509363412857056, + "learning_rate": 0.0005587504648916147, + "loss": 3.2626, + "step": 10339 + }, + { + "epoch": 0.51, + "grad_norm": 0.510272204875946, + "learning_rate": 0.0005587426732406463, + "loss": 3.3572, + "step": 10340 + }, + { + "epoch": 0.51, + "grad_norm": 0.5146096348762512, + "learning_rate": 0.0005587348809081993, + "loss": 3.1036, + "step": 10341 + }, + { + "epoch": 0.51, + "grad_norm": 0.48640674352645874, + "learning_rate": 0.0005587270878942947, + "loss": 3.4293, + "step": 10342 + }, + { + "epoch": 0.51, + "grad_norm": 0.5620465278625488, + "learning_rate": 0.0005587192941989525, + "loss": 3.2765, + "step": 10343 + }, + { + "epoch": 0.51, + "grad_norm": 0.5385037064552307, + "learning_rate": 0.0005587114998221935, + "loss": 3.0954, + "step": 10344 + }, + { + "epoch": 0.51, + "grad_norm": 0.547918975353241, + "learning_rate": 0.0005587037047640383, + "loss": 3.2205, + "step": 10345 + }, + { + "epoch": 0.51, + "grad_norm": 0.5199856758117676, + "learning_rate": 0.0005586959090245074, + "loss": 3.3202, + "step": 10346 + }, + { + "epoch": 0.51, + "grad_norm": 0.49615347385406494, + "learning_rate": 0.000558688112603621, + "loss": 3.1684, + "step": 10347 + }, + { + "epoch": 0.51, + "grad_norm": 0.5756198763847351, + "learning_rate": 0.0005586803155014001, + "loss": 3.2479, + "step": 10348 + }, + { + "epoch": 0.51, + "grad_norm": 0.5044625997543335, + "learning_rate": 0.000558672517717865, + "loss": 3.2301, + "step": 10349 + }, + { + "epoch": 0.51, + "grad_norm": 0.5165237784385681, + "learning_rate": 0.0005586647192530362, + "loss": 3.1831, + "step": 10350 + }, + { + "epoch": 0.51, + "grad_norm": 0.5043236613273621, + "learning_rate": 0.0005586569201069344, + "loss": 3.2906, + "step": 10351 + }, + { + "epoch": 0.51, + "grad_norm": 0.5174095034599304, + "learning_rate": 0.00055864912027958, + "loss": 3.3525, + "step": 10352 + }, + { + "epoch": 0.51, + "grad_norm": 0.539576530456543, + "learning_rate": 0.0005586413197709936, + "loss": 3.2209, + "step": 10353 + }, + { + "epoch": 0.51, + "grad_norm": 0.49933311343193054, + "learning_rate": 0.0005586335185811957, + "loss": 3.3751, + "step": 10354 + }, + { + "epoch": 0.51, + "grad_norm": 0.48638248443603516, + "learning_rate": 0.0005586257167102069, + "loss": 3.1653, + "step": 10355 + }, + { + "epoch": 0.51, + "grad_norm": 0.5505256652832031, + "learning_rate": 0.0005586179141580478, + "loss": 2.9672, + "step": 10356 + }, + { + "epoch": 0.51, + "grad_norm": 0.5323838591575623, + "learning_rate": 0.0005586101109247389, + "loss": 3.0727, + "step": 10357 + }, + { + "epoch": 0.51, + "grad_norm": 0.5171400904655457, + "learning_rate": 0.0005586023070103006, + "loss": 3.1937, + "step": 10358 + }, + { + "epoch": 0.51, + "grad_norm": 0.5195598006248474, + "learning_rate": 0.0005585945024147537, + "loss": 3.6333, + "step": 10359 + }, + { + "epoch": 0.51, + "grad_norm": 0.49022892117500305, + "learning_rate": 0.0005585866971381186, + "loss": 3.1178, + "step": 10360 + }, + { + "epoch": 0.51, + "grad_norm": 0.5279939770698547, + "learning_rate": 0.000558578891180416, + "loss": 3.2818, + "step": 10361 + }, + { + "epoch": 0.51, + "grad_norm": 0.5240474343299866, + "learning_rate": 0.0005585710845416663, + "loss": 3.2201, + "step": 10362 + }, + { + "epoch": 0.51, + "grad_norm": 0.5336095094680786, + "learning_rate": 0.0005585632772218901, + "loss": 3.1314, + "step": 10363 + }, + { + "epoch": 0.51, + "grad_norm": 0.4804680347442627, + "learning_rate": 0.000558555469221108, + "loss": 3.27, + "step": 10364 + }, + { + "epoch": 0.51, + "grad_norm": 0.5072349309921265, + "learning_rate": 0.0005585476605393406, + "loss": 3.3179, + "step": 10365 + }, + { + "epoch": 0.51, + "grad_norm": 0.6415109634399414, + "learning_rate": 0.0005585398511766084, + "loss": 3.493, + "step": 10366 + }, + { + "epoch": 0.51, + "grad_norm": 0.49252402782440186, + "learning_rate": 0.0005585320411329321, + "loss": 3.1779, + "step": 10367 + }, + { + "epoch": 0.51, + "grad_norm": 0.5037877559661865, + "learning_rate": 0.000558524230408332, + "loss": 3.2647, + "step": 10368 + }, + { + "epoch": 0.51, + "grad_norm": 0.5191027522087097, + "learning_rate": 0.0005585164190028287, + "loss": 3.3537, + "step": 10369 + }, + { + "epoch": 0.51, + "grad_norm": 0.49495670199394226, + "learning_rate": 0.0005585086069164432, + "loss": 3.3338, + "step": 10370 + }, + { + "epoch": 0.51, + "grad_norm": 0.5146889090538025, + "learning_rate": 0.0005585007941491957, + "loss": 3.2084, + "step": 10371 + }, + { + "epoch": 0.51, + "grad_norm": 0.5505595207214355, + "learning_rate": 0.0005584929807011068, + "loss": 3.2504, + "step": 10372 + }, + { + "epoch": 0.51, + "grad_norm": 0.4919992983341217, + "learning_rate": 0.0005584851665721972, + "loss": 3.2404, + "step": 10373 + }, + { + "epoch": 0.51, + "grad_norm": 0.49141108989715576, + "learning_rate": 0.0005584773517624873, + "loss": 3.3036, + "step": 10374 + }, + { + "epoch": 0.51, + "grad_norm": 0.5041980743408203, + "learning_rate": 0.0005584695362719979, + "loss": 3.4077, + "step": 10375 + }, + { + "epoch": 0.51, + "grad_norm": 0.5285553336143494, + "learning_rate": 0.0005584617201007494, + "loss": 3.3163, + "step": 10376 + }, + { + "epoch": 0.51, + "grad_norm": 0.5297923684120178, + "learning_rate": 0.0005584539032487626, + "loss": 3.0486, + "step": 10377 + }, + { + "epoch": 0.51, + "grad_norm": 0.4896916151046753, + "learning_rate": 0.0005584460857160578, + "loss": 3.5066, + "step": 10378 + }, + { + "epoch": 0.51, + "grad_norm": 0.528682291507721, + "learning_rate": 0.0005584382675026558, + "loss": 3.2385, + "step": 10379 + }, + { + "epoch": 0.51, + "grad_norm": 0.4898264706134796, + "learning_rate": 0.0005584304486085772, + "loss": 3.1054, + "step": 10380 + }, + { + "epoch": 0.51, + "grad_norm": 0.5090500116348267, + "learning_rate": 0.0005584226290338425, + "loss": 3.3509, + "step": 10381 + }, + { + "epoch": 0.51, + "grad_norm": 0.5073500871658325, + "learning_rate": 0.0005584148087784724, + "loss": 3.526, + "step": 10382 + }, + { + "epoch": 0.51, + "grad_norm": 0.5196703672409058, + "learning_rate": 0.0005584069878424873, + "loss": 3.2417, + "step": 10383 + }, + { + "epoch": 0.51, + "grad_norm": 0.4907332956790924, + "learning_rate": 0.0005583991662259082, + "loss": 3.2636, + "step": 10384 + }, + { + "epoch": 0.51, + "grad_norm": 0.47070619463920593, + "learning_rate": 0.000558391343928755, + "loss": 3.2755, + "step": 10385 + }, + { + "epoch": 0.51, + "grad_norm": 0.5050802230834961, + "learning_rate": 0.000558383520951049, + "loss": 3.3161, + "step": 10386 + }, + { + "epoch": 0.51, + "grad_norm": 0.5075258612632751, + "learning_rate": 0.0005583756972928106, + "loss": 3.1821, + "step": 10387 + }, + { + "epoch": 0.51, + "grad_norm": 0.5390806198120117, + "learning_rate": 0.0005583678729540602, + "loss": 3.2685, + "step": 10388 + }, + { + "epoch": 0.51, + "grad_norm": 0.5114220380783081, + "learning_rate": 0.0005583600479348185, + "loss": 3.1876, + "step": 10389 + }, + { + "epoch": 0.51, + "grad_norm": 0.5140728950500488, + "learning_rate": 0.0005583522222351062, + "loss": 3.2796, + "step": 10390 + }, + { + "epoch": 0.51, + "grad_norm": 0.5219741463661194, + "learning_rate": 0.0005583443958549439, + "loss": 3.3553, + "step": 10391 + }, + { + "epoch": 0.51, + "grad_norm": 0.5452612042427063, + "learning_rate": 0.0005583365687943521, + "loss": 3.4334, + "step": 10392 + }, + { + "epoch": 0.51, + "grad_norm": 0.6287804841995239, + "learning_rate": 0.0005583287410533516, + "loss": 3.1848, + "step": 10393 + }, + { + "epoch": 0.51, + "grad_norm": 0.5615096092224121, + "learning_rate": 0.0005583209126319629, + "loss": 3.4768, + "step": 10394 + }, + { + "epoch": 0.51, + "grad_norm": 0.5138780474662781, + "learning_rate": 0.0005583130835302066, + "loss": 3.3886, + "step": 10395 + }, + { + "epoch": 0.51, + "grad_norm": 0.5071299076080322, + "learning_rate": 0.0005583052537481034, + "loss": 3.3302, + "step": 10396 + }, + { + "epoch": 0.51, + "grad_norm": 0.5131478905677795, + "learning_rate": 0.0005582974232856738, + "loss": 3.4358, + "step": 10397 + }, + { + "epoch": 0.51, + "grad_norm": 0.5130304098129272, + "learning_rate": 0.0005582895921429385, + "loss": 3.1993, + "step": 10398 + }, + { + "epoch": 0.51, + "grad_norm": 0.5018404722213745, + "learning_rate": 0.0005582817603199182, + "loss": 3.4178, + "step": 10399 + }, + { + "epoch": 0.51, + "grad_norm": 0.5256871581077576, + "learning_rate": 0.0005582739278166334, + "loss": 3.0488, + "step": 10400 + }, + { + "epoch": 0.51, + "grad_norm": 0.5261520743370056, + "learning_rate": 0.0005582660946331047, + "loss": 3.1773, + "step": 10401 + }, + { + "epoch": 0.51, + "grad_norm": 0.5217165946960449, + "learning_rate": 0.0005582582607693529, + "loss": 3.1666, + "step": 10402 + }, + { + "epoch": 0.51, + "grad_norm": 0.5378689169883728, + "learning_rate": 0.0005582504262253986, + "loss": 3.199, + "step": 10403 + }, + { + "epoch": 0.51, + "grad_norm": 0.5194031596183777, + "learning_rate": 0.0005582425910012624, + "loss": 3.3709, + "step": 10404 + }, + { + "epoch": 0.51, + "grad_norm": 0.5214276909828186, + "learning_rate": 0.0005582347550969648, + "loss": 3.2128, + "step": 10405 + }, + { + "epoch": 0.51, + "grad_norm": 0.5072099566459656, + "learning_rate": 0.0005582269185125264, + "loss": 3.1792, + "step": 10406 + }, + { + "epoch": 0.51, + "grad_norm": 0.6110155582427979, + "learning_rate": 0.0005582190812479683, + "loss": 3.3741, + "step": 10407 + }, + { + "epoch": 0.51, + "grad_norm": 0.517043948173523, + "learning_rate": 0.0005582112433033107, + "loss": 3.3555, + "step": 10408 + }, + { + "epoch": 0.51, + "grad_norm": 0.5081747174263, + "learning_rate": 0.0005582034046785745, + "loss": 3.4135, + "step": 10409 + }, + { + "epoch": 0.51, + "grad_norm": 0.5313897132873535, + "learning_rate": 0.0005581955653737801, + "loss": 3.2883, + "step": 10410 + }, + { + "epoch": 0.51, + "grad_norm": 0.5237902998924255, + "learning_rate": 0.0005581877253889482, + "loss": 3.3033, + "step": 10411 + }, + { + "epoch": 0.51, + "grad_norm": 0.49405860900878906, + "learning_rate": 0.0005581798847240997, + "loss": 3.0508, + "step": 10412 + }, + { + "epoch": 0.51, + "grad_norm": 0.49449262022972107, + "learning_rate": 0.000558172043379255, + "loss": 3.268, + "step": 10413 + }, + { + "epoch": 0.51, + "grad_norm": 0.5103479623794556, + "learning_rate": 0.0005581642013544347, + "loss": 3.18, + "step": 10414 + }, + { + "epoch": 0.51, + "grad_norm": 0.4870624244213104, + "learning_rate": 0.0005581563586496597, + "loss": 3.2905, + "step": 10415 + }, + { + "epoch": 0.51, + "grad_norm": 0.5494616031646729, + "learning_rate": 0.0005581485152649506, + "loss": 3.3204, + "step": 10416 + }, + { + "epoch": 0.51, + "grad_norm": 0.5629361271858215, + "learning_rate": 0.0005581406712003279, + "loss": 3.3171, + "step": 10417 + }, + { + "epoch": 0.51, + "grad_norm": 0.48867589235305786, + "learning_rate": 0.0005581328264558123, + "loss": 3.3654, + "step": 10418 + }, + { + "epoch": 0.51, + "grad_norm": 0.5334605574607849, + "learning_rate": 0.0005581249810314245, + "loss": 3.0235, + "step": 10419 + }, + { + "epoch": 0.51, + "grad_norm": 0.5167070031166077, + "learning_rate": 0.0005581171349271853, + "loss": 3.2379, + "step": 10420 + }, + { + "epoch": 0.51, + "grad_norm": 0.5437859892845154, + "learning_rate": 0.0005581092881431152, + "loss": 3.2161, + "step": 10421 + }, + { + "epoch": 0.51, + "grad_norm": 0.4782818853855133, + "learning_rate": 0.0005581014406792348, + "loss": 3.2091, + "step": 10422 + }, + { + "epoch": 0.51, + "grad_norm": 0.5339621901512146, + "learning_rate": 0.0005580935925355652, + "loss": 3.1981, + "step": 10423 + }, + { + "epoch": 0.51, + "grad_norm": 0.5722216963768005, + "learning_rate": 0.0005580857437121264, + "loss": 3.2892, + "step": 10424 + }, + { + "epoch": 0.51, + "grad_norm": 0.5274963974952698, + "learning_rate": 0.0005580778942089396, + "loss": 3.4175, + "step": 10425 + }, + { + "epoch": 0.51, + "grad_norm": 0.5084352493286133, + "learning_rate": 0.0005580700440260253, + "loss": 3.3634, + "step": 10426 + }, + { + "epoch": 0.51, + "grad_norm": 0.49948909878730774, + "learning_rate": 0.000558062193163404, + "loss": 3.2063, + "step": 10427 + }, + { + "epoch": 0.51, + "grad_norm": 0.5016568899154663, + "learning_rate": 0.0005580543416210967, + "loss": 3.0554, + "step": 10428 + }, + { + "epoch": 0.51, + "grad_norm": 0.49527838826179504, + "learning_rate": 0.000558046489399124, + "loss": 3.0898, + "step": 10429 + }, + { + "epoch": 0.51, + "grad_norm": 0.5031401515007019, + "learning_rate": 0.0005580386364975064, + "loss": 3.2988, + "step": 10430 + }, + { + "epoch": 0.51, + "grad_norm": 0.5411937832832336, + "learning_rate": 0.0005580307829162646, + "loss": 3.3959, + "step": 10431 + }, + { + "epoch": 0.51, + "grad_norm": 0.48796623945236206, + "learning_rate": 0.0005580229286554195, + "loss": 3.2403, + "step": 10432 + }, + { + "epoch": 0.51, + "grad_norm": 0.502121090888977, + "learning_rate": 0.0005580150737149916, + "loss": 3.2067, + "step": 10433 + }, + { + "epoch": 0.51, + "grad_norm": 0.5142828822135925, + "learning_rate": 0.0005580072180950017, + "loss": 3.2503, + "step": 10434 + }, + { + "epoch": 0.51, + "grad_norm": 0.4943016767501831, + "learning_rate": 0.0005579993617954704, + "loss": 3.4234, + "step": 10435 + }, + { + "epoch": 0.51, + "grad_norm": 0.4971558153629303, + "learning_rate": 0.0005579915048164186, + "loss": 3.2003, + "step": 10436 + }, + { + "epoch": 0.51, + "grad_norm": 0.508538544178009, + "learning_rate": 0.0005579836471578665, + "loss": 3.299, + "step": 10437 + }, + { + "epoch": 0.51, + "grad_norm": 0.5330371856689453, + "learning_rate": 0.0005579757888198353, + "loss": 3.2828, + "step": 10438 + }, + { + "epoch": 0.51, + "grad_norm": 0.48706308007240295, + "learning_rate": 0.0005579679298023456, + "loss": 3.369, + "step": 10439 + }, + { + "epoch": 0.51, + "grad_norm": 0.519385039806366, + "learning_rate": 0.0005579600701054179, + "loss": 3.3164, + "step": 10440 + }, + { + "epoch": 0.51, + "grad_norm": 0.504608154296875, + "learning_rate": 0.0005579522097290732, + "loss": 3.338, + "step": 10441 + }, + { + "epoch": 0.51, + "grad_norm": 0.5118958353996277, + "learning_rate": 0.0005579443486733318, + "loss": 3.3114, + "step": 10442 + }, + { + "epoch": 0.51, + "grad_norm": 0.49348047375679016, + "learning_rate": 0.0005579364869382148, + "loss": 3.2707, + "step": 10443 + }, + { + "epoch": 0.51, + "grad_norm": 0.5011881589889526, + "learning_rate": 0.0005579286245237426, + "loss": 3.2466, + "step": 10444 + }, + { + "epoch": 0.51, + "grad_norm": 0.49209505319595337, + "learning_rate": 0.0005579207614299361, + "loss": 3.1962, + "step": 10445 + }, + { + "epoch": 0.51, + "grad_norm": 0.5165194869041443, + "learning_rate": 0.000557912897656816, + "loss": 3.2352, + "step": 10446 + }, + { + "epoch": 0.51, + "grad_norm": 0.48565101623535156, + "learning_rate": 0.0005579050332044029, + "loss": 3.4099, + "step": 10447 + }, + { + "epoch": 0.51, + "grad_norm": 0.5325493812561035, + "learning_rate": 0.0005578971680727177, + "loss": 3.5164, + "step": 10448 + }, + { + "epoch": 0.51, + "grad_norm": 0.5225394368171692, + "learning_rate": 0.0005578893022617808, + "loss": 3.5059, + "step": 10449 + }, + { + "epoch": 0.51, + "grad_norm": 0.5450068712234497, + "learning_rate": 0.0005578814357716132, + "loss": 3.3856, + "step": 10450 + }, + { + "epoch": 0.51, + "grad_norm": 0.4797419011592865, + "learning_rate": 0.0005578735686022355, + "loss": 3.2904, + "step": 10451 + }, + { + "epoch": 0.51, + "grad_norm": 0.5698765516281128, + "learning_rate": 0.0005578657007536685, + "loss": 3.274, + "step": 10452 + }, + { + "epoch": 0.51, + "grad_norm": 0.5215989947319031, + "learning_rate": 0.0005578578322259328, + "loss": 3.422, + "step": 10453 + }, + { + "epoch": 0.51, + "grad_norm": 0.5165630578994751, + "learning_rate": 0.0005578499630190493, + "loss": 3.3273, + "step": 10454 + }, + { + "epoch": 0.51, + "grad_norm": 0.5286875367164612, + "learning_rate": 0.0005578420931330385, + "loss": 3.302, + "step": 10455 + }, + { + "epoch": 0.51, + "grad_norm": 0.5140888690948486, + "learning_rate": 0.0005578342225679213, + "loss": 3.1989, + "step": 10456 + }, + { + "epoch": 0.51, + "grad_norm": 0.4854901432991028, + "learning_rate": 0.0005578263513237185, + "loss": 3.417, + "step": 10457 + }, + { + "epoch": 0.51, + "grad_norm": 0.5346885323524475, + "learning_rate": 0.0005578184794004506, + "loss": 3.4183, + "step": 10458 + }, + { + "epoch": 0.51, + "grad_norm": 0.5289451479911804, + "learning_rate": 0.0005578106067981384, + "loss": 3.2724, + "step": 10459 + }, + { + "epoch": 0.51, + "grad_norm": 0.536060631275177, + "learning_rate": 0.0005578027335168027, + "loss": 3.3203, + "step": 10460 + }, + { + "epoch": 0.51, + "grad_norm": 0.5051462650299072, + "learning_rate": 0.0005577948595564643, + "loss": 3.2259, + "step": 10461 + }, + { + "epoch": 0.51, + "grad_norm": 0.483619749546051, + "learning_rate": 0.0005577869849171437, + "loss": 3.3036, + "step": 10462 + }, + { + "epoch": 0.51, + "grad_norm": 0.5087090730667114, + "learning_rate": 0.0005577791095988619, + "loss": 3.4634, + "step": 10463 + }, + { + "epoch": 0.51, + "grad_norm": 0.5173831582069397, + "learning_rate": 0.0005577712336016395, + "loss": 3.4216, + "step": 10464 + }, + { + "epoch": 0.51, + "grad_norm": 0.504951536655426, + "learning_rate": 0.0005577633569254974, + "loss": 3.3021, + "step": 10465 + }, + { + "epoch": 0.51, + "grad_norm": 0.5104922652244568, + "learning_rate": 0.0005577554795704561, + "loss": 3.2196, + "step": 10466 + }, + { + "epoch": 0.51, + "grad_norm": 0.4943409264087677, + "learning_rate": 0.0005577476015365365, + "loss": 3.1075, + "step": 10467 + }, + { + "epoch": 0.51, + "grad_norm": 0.5140161514282227, + "learning_rate": 0.0005577397228237593, + "loss": 3.2358, + "step": 10468 + }, + { + "epoch": 0.51, + "grad_norm": 0.5718799829483032, + "learning_rate": 0.0005577318434321453, + "loss": 3.1215, + "step": 10469 + }, + { + "epoch": 0.51, + "grad_norm": 0.5070701241493225, + "learning_rate": 0.0005577239633617153, + "loss": 3.1489, + "step": 10470 + }, + { + "epoch": 0.51, + "grad_norm": 0.5295852422714233, + "learning_rate": 0.0005577160826124899, + "loss": 3.2577, + "step": 10471 + }, + { + "epoch": 0.51, + "grad_norm": 0.5276656150817871, + "learning_rate": 0.0005577082011844898, + "loss": 3.2257, + "step": 10472 + }, + { + "epoch": 0.51, + "grad_norm": 0.6001366376876831, + "learning_rate": 0.0005577003190777361, + "loss": 3.268, + "step": 10473 + }, + { + "epoch": 0.51, + "grad_norm": 0.5033782720565796, + "learning_rate": 0.0005576924362922493, + "loss": 3.3577, + "step": 10474 + }, + { + "epoch": 0.51, + "grad_norm": 0.5481687784194946, + "learning_rate": 0.0005576845528280503, + "loss": 3.1024, + "step": 10475 + }, + { + "epoch": 0.51, + "grad_norm": 0.5187335014343262, + "learning_rate": 0.0005576766686851595, + "loss": 3.2345, + "step": 10476 + }, + { + "epoch": 0.51, + "grad_norm": 0.5212278962135315, + "learning_rate": 0.0005576687838635983, + "loss": 3.2455, + "step": 10477 + }, + { + "epoch": 0.51, + "grad_norm": 0.5137622356414795, + "learning_rate": 0.0005576608983633868, + "loss": 3.3298, + "step": 10478 + }, + { + "epoch": 0.51, + "grad_norm": 0.544165313243866, + "learning_rate": 0.0005576530121845463, + "loss": 3.0631, + "step": 10479 + }, + { + "epoch": 0.51, + "grad_norm": 0.5154332518577576, + "learning_rate": 0.0005576451253270973, + "loss": 3.3821, + "step": 10480 + }, + { + "epoch": 0.51, + "grad_norm": 0.5267524719238281, + "learning_rate": 0.0005576372377910605, + "loss": 3.2283, + "step": 10481 + }, + { + "epoch": 0.51, + "grad_norm": 0.5340297818183899, + "learning_rate": 0.000557629349576457, + "loss": 3.1936, + "step": 10482 + }, + { + "epoch": 0.51, + "grad_norm": 0.5131736397743225, + "learning_rate": 0.0005576214606833073, + "loss": 3.2363, + "step": 10483 + }, + { + "epoch": 0.51, + "grad_norm": 0.5175976753234863, + "learning_rate": 0.0005576135711116322, + "loss": 3.3296, + "step": 10484 + }, + { + "epoch": 0.51, + "grad_norm": 0.5071966052055359, + "learning_rate": 0.0005576056808614526, + "loss": 3.3664, + "step": 10485 + }, + { + "epoch": 0.51, + "grad_norm": 0.5538497567176819, + "learning_rate": 0.0005575977899327892, + "loss": 3.1944, + "step": 10486 + }, + { + "epoch": 0.51, + "grad_norm": 0.5503365993499756, + "learning_rate": 0.0005575898983256627, + "loss": 3.2746, + "step": 10487 + }, + { + "epoch": 0.51, + "grad_norm": 0.5471153855323792, + "learning_rate": 0.000557582006040094, + "loss": 3.4136, + "step": 10488 + }, + { + "epoch": 0.51, + "grad_norm": 0.5127497911453247, + "learning_rate": 0.000557574113076104, + "loss": 3.4066, + "step": 10489 + }, + { + "epoch": 0.51, + "grad_norm": 0.5495885610580444, + "learning_rate": 0.0005575662194337133, + "loss": 3.1525, + "step": 10490 + }, + { + "epoch": 0.51, + "grad_norm": 0.5071145296096802, + "learning_rate": 0.0005575583251129426, + "loss": 3.3041, + "step": 10491 + }, + { + "epoch": 0.51, + "grad_norm": 0.5305414199829102, + "learning_rate": 0.000557550430113813, + "loss": 3.3928, + "step": 10492 + }, + { + "epoch": 0.51, + "grad_norm": 0.5200974345207214, + "learning_rate": 0.0005575425344363452, + "loss": 3.4053, + "step": 10493 + }, + { + "epoch": 0.51, + "grad_norm": 0.5152339339256287, + "learning_rate": 0.0005575346380805598, + "loss": 3.306, + "step": 10494 + }, + { + "epoch": 0.51, + "grad_norm": 0.5076627135276794, + "learning_rate": 0.0005575267410464778, + "loss": 3.414, + "step": 10495 + }, + { + "epoch": 0.51, + "grad_norm": 0.5367875695228577, + "learning_rate": 0.0005575188433341198, + "loss": 3.3654, + "step": 10496 + }, + { + "epoch": 0.51, + "grad_norm": 0.5150578618049622, + "learning_rate": 0.0005575109449435068, + "loss": 3.2771, + "step": 10497 + }, + { + "epoch": 0.51, + "grad_norm": 0.5308785438537598, + "learning_rate": 0.0005575030458746595, + "loss": 3.4856, + "step": 10498 + }, + { + "epoch": 0.51, + "grad_norm": 0.5408422946929932, + "learning_rate": 0.0005574951461275989, + "loss": 3.018, + "step": 10499 + }, + { + "epoch": 0.51, + "grad_norm": 0.5197494626045227, + "learning_rate": 0.0005574872457023455, + "loss": 3.2124, + "step": 10500 + }, + { + "epoch": 0.51, + "grad_norm": 0.5191333293914795, + "learning_rate": 0.0005574793445989202, + "loss": 3.4525, + "step": 10501 + }, + { + "epoch": 0.51, + "grad_norm": 0.5601697564125061, + "learning_rate": 0.000557471442817344, + "loss": 3.3425, + "step": 10502 + }, + { + "epoch": 0.51, + "grad_norm": 0.5880317687988281, + "learning_rate": 0.0005574635403576374, + "loss": 3.363, + "step": 10503 + }, + { + "epoch": 0.51, + "grad_norm": 0.528934121131897, + "learning_rate": 0.0005574556372198215, + "loss": 3.2255, + "step": 10504 + }, + { + "epoch": 0.51, + "grad_norm": 0.5256865620613098, + "learning_rate": 0.000557447733403917, + "loss": 3.3382, + "step": 10505 + }, + { + "epoch": 0.51, + "grad_norm": 0.534885823726654, + "learning_rate": 0.0005574398289099448, + "loss": 3.1408, + "step": 10506 + }, + { + "epoch": 0.51, + "grad_norm": 0.5375804901123047, + "learning_rate": 0.0005574319237379255, + "loss": 3.2966, + "step": 10507 + }, + { + "epoch": 0.51, + "grad_norm": 0.5356113314628601, + "learning_rate": 0.00055742401788788, + "loss": 2.8656, + "step": 10508 + }, + { + "epoch": 0.52, + "grad_norm": 0.5177241563796997, + "learning_rate": 0.0005574161113598293, + "loss": 3.3429, + "step": 10509 + }, + { + "epoch": 0.52, + "grad_norm": 0.619787871837616, + "learning_rate": 0.0005574082041537941, + "loss": 3.3243, + "step": 10510 + }, + { + "epoch": 0.52, + "grad_norm": 0.5078191161155701, + "learning_rate": 0.0005574002962697953, + "loss": 3.2323, + "step": 10511 + }, + { + "epoch": 0.52, + "grad_norm": 0.49133309721946716, + "learning_rate": 0.0005573923877078534, + "loss": 3.2236, + "step": 10512 + }, + { + "epoch": 0.52, + "grad_norm": 0.5388320088386536, + "learning_rate": 0.0005573844784679897, + "loss": 3.1184, + "step": 10513 + }, + { + "epoch": 0.52, + "grad_norm": 0.5303979516029358, + "learning_rate": 0.0005573765685502247, + "loss": 3.5108, + "step": 10514 + }, + { + "epoch": 0.52, + "grad_norm": 0.5063710808753967, + "learning_rate": 0.0005573686579545795, + "loss": 3.2074, + "step": 10515 + }, + { + "epoch": 0.52, + "grad_norm": 0.533225417137146, + "learning_rate": 0.0005573607466810747, + "loss": 2.9851, + "step": 10516 + }, + { + "epoch": 0.52, + "grad_norm": 0.48221254348754883, + "learning_rate": 0.0005573528347297312, + "loss": 3.3277, + "step": 10517 + }, + { + "epoch": 0.52, + "grad_norm": 0.4855428636074066, + "learning_rate": 0.0005573449221005699, + "loss": 3.3781, + "step": 10518 + }, + { + "epoch": 0.52, + "grad_norm": 0.5387424826622009, + "learning_rate": 0.0005573370087936117, + "loss": 3.4274, + "step": 10519 + }, + { + "epoch": 0.52, + "grad_norm": 0.5350310802459717, + "learning_rate": 0.0005573290948088772, + "loss": 3.2629, + "step": 10520 + }, + { + "epoch": 0.52, + "grad_norm": 0.5230748653411865, + "learning_rate": 0.0005573211801463874, + "loss": 3.1636, + "step": 10521 + }, + { + "epoch": 0.52, + "grad_norm": 0.5564438104629517, + "learning_rate": 0.000557313264806163, + "loss": 3.2858, + "step": 10522 + }, + { + "epoch": 0.52, + "grad_norm": 0.4955500066280365, + "learning_rate": 0.0005573053487882252, + "loss": 3.2284, + "step": 10523 + }, + { + "epoch": 0.52, + "grad_norm": 0.5798243284225464, + "learning_rate": 0.0005572974320925946, + "loss": 3.3389, + "step": 10524 + }, + { + "epoch": 0.52, + "grad_norm": 0.5217620134353638, + "learning_rate": 0.000557289514719292, + "loss": 3.2003, + "step": 10525 + }, + { + "epoch": 0.52, + "grad_norm": 0.49824416637420654, + "learning_rate": 0.0005572815966683385, + "loss": 3.1126, + "step": 10526 + }, + { + "epoch": 0.52, + "grad_norm": 0.5172311663627625, + "learning_rate": 0.0005572736779397546, + "loss": 3.2446, + "step": 10527 + }, + { + "epoch": 0.52, + "grad_norm": 0.5369932651519775, + "learning_rate": 0.0005572657585335614, + "loss": 3.3792, + "step": 10528 + }, + { + "epoch": 0.52, + "grad_norm": 0.5192260146141052, + "learning_rate": 0.0005572578384497797, + "loss": 3.4216, + "step": 10529 + }, + { + "epoch": 0.52, + "grad_norm": 0.5092270374298096, + "learning_rate": 0.0005572499176884305, + "loss": 3.4087, + "step": 10530 + }, + { + "epoch": 0.52, + "grad_norm": 0.5477033853530884, + "learning_rate": 0.0005572419962495344, + "loss": 3.2542, + "step": 10531 + }, + { + "epoch": 0.52, + "grad_norm": 0.49869391322135925, + "learning_rate": 0.0005572340741331124, + "loss": 3.5372, + "step": 10532 + }, + { + "epoch": 0.52, + "grad_norm": 0.5513463616371155, + "learning_rate": 0.0005572261513391854, + "loss": 3.2842, + "step": 10533 + }, + { + "epoch": 0.52, + "grad_norm": 0.5208010673522949, + "learning_rate": 0.0005572182278677741, + "loss": 3.2494, + "step": 10534 + }, + { + "epoch": 0.52, + "grad_norm": 0.5104653239250183, + "learning_rate": 0.0005572103037188996, + "loss": 3.4666, + "step": 10535 + }, + { + "epoch": 0.52, + "grad_norm": 0.49619725346565247, + "learning_rate": 0.0005572023788925827, + "loss": 3.3879, + "step": 10536 + }, + { + "epoch": 0.52, + "grad_norm": 0.5280769467353821, + "learning_rate": 0.000557194453388844, + "loss": 3.2656, + "step": 10537 + }, + { + "epoch": 0.52, + "grad_norm": 0.5171984434127808, + "learning_rate": 0.0005571865272077049, + "loss": 3.3996, + "step": 10538 + }, + { + "epoch": 0.52, + "grad_norm": 0.4976312816143036, + "learning_rate": 0.0005571786003491858, + "loss": 3.3502, + "step": 10539 + }, + { + "epoch": 0.52, + "grad_norm": 0.5273638963699341, + "learning_rate": 0.0005571706728133078, + "loss": 3.3182, + "step": 10540 + }, + { + "epoch": 0.52, + "grad_norm": 0.5085924863815308, + "learning_rate": 0.0005571627446000917, + "loss": 3.1397, + "step": 10541 + }, + { + "epoch": 0.52, + "grad_norm": 0.5273411870002747, + "learning_rate": 0.0005571548157095585, + "loss": 3.1939, + "step": 10542 + }, + { + "epoch": 0.52, + "grad_norm": 0.5445072054862976, + "learning_rate": 0.000557146886141729, + "loss": 3.1008, + "step": 10543 + }, + { + "epoch": 0.52, + "grad_norm": 0.5404632091522217, + "learning_rate": 0.0005571389558966241, + "loss": 3.4246, + "step": 10544 + }, + { + "epoch": 0.52, + "grad_norm": 0.5002350211143494, + "learning_rate": 0.0005571310249742647, + "loss": 3.4459, + "step": 10545 + }, + { + "epoch": 0.52, + "grad_norm": 0.5209913849830627, + "learning_rate": 0.0005571230933746716, + "loss": 3.0648, + "step": 10546 + }, + { + "epoch": 0.52, + "grad_norm": 0.5363181233406067, + "learning_rate": 0.0005571151610978658, + "loss": 3.2458, + "step": 10547 + }, + { + "epoch": 0.52, + "grad_norm": 0.5626364350318909, + "learning_rate": 0.0005571072281438681, + "loss": 2.9742, + "step": 10548 + }, + { + "epoch": 0.52, + "grad_norm": 0.5303053259849548, + "learning_rate": 0.0005570992945126994, + "loss": 3.2247, + "step": 10549 + }, + { + "epoch": 0.52, + "grad_norm": 0.5701689720153809, + "learning_rate": 0.0005570913602043808, + "loss": 3.2445, + "step": 10550 + }, + { + "epoch": 0.52, + "grad_norm": 0.49203193187713623, + "learning_rate": 0.0005570834252189329, + "loss": 3.265, + "step": 10551 + }, + { + "epoch": 0.52, + "grad_norm": 0.5791632533073425, + "learning_rate": 0.0005570754895563767, + "loss": 3.2336, + "step": 10552 + }, + { + "epoch": 0.52, + "grad_norm": 0.5477674007415771, + "learning_rate": 0.0005570675532167333, + "loss": 3.1324, + "step": 10553 + }, + { + "epoch": 0.52, + "grad_norm": 0.49030929803848267, + "learning_rate": 0.0005570596162000233, + "loss": 3.076, + "step": 10554 + }, + { + "epoch": 0.52, + "grad_norm": 0.5654690861701965, + "learning_rate": 0.0005570516785062678, + "loss": 3.1474, + "step": 10555 + }, + { + "epoch": 0.52, + "grad_norm": 0.5409621000289917, + "learning_rate": 0.0005570437401354877, + "loss": 3.0283, + "step": 10556 + }, + { + "epoch": 0.52, + "grad_norm": 0.4912637174129486, + "learning_rate": 0.0005570358010877038, + "loss": 3.0622, + "step": 10557 + }, + { + "epoch": 0.52, + "grad_norm": 0.5478519797325134, + "learning_rate": 0.000557027861362937, + "loss": 3.1291, + "step": 10558 + }, + { + "epoch": 0.52, + "grad_norm": 0.5594099760055542, + "learning_rate": 0.0005570199209612084, + "loss": 3.2931, + "step": 10559 + }, + { + "epoch": 0.52, + "grad_norm": 0.5263882279396057, + "learning_rate": 0.0005570119798825388, + "loss": 3.3384, + "step": 10560 + }, + { + "epoch": 0.52, + "grad_norm": 0.5052241683006287, + "learning_rate": 0.000557004038126949, + "loss": 3.4222, + "step": 10561 + }, + { + "epoch": 0.52, + "grad_norm": 0.5351318120956421, + "learning_rate": 0.0005569960956944601, + "loss": 3.2503, + "step": 10562 + }, + { + "epoch": 0.52, + "grad_norm": 0.5089155435562134, + "learning_rate": 0.0005569881525850929, + "loss": 3.1974, + "step": 10563 + }, + { + "epoch": 0.52, + "grad_norm": 0.5822809338569641, + "learning_rate": 0.0005569802087988684, + "loss": 3.2786, + "step": 10564 + }, + { + "epoch": 0.52, + "grad_norm": 0.5087013840675354, + "learning_rate": 0.0005569722643358075, + "loss": 3.3187, + "step": 10565 + }, + { + "epoch": 0.52, + "grad_norm": 0.5044753551483154, + "learning_rate": 0.000556964319195931, + "loss": 3.4414, + "step": 10566 + }, + { + "epoch": 0.52, + "grad_norm": 0.5216333866119385, + "learning_rate": 0.0005569563733792601, + "loss": 3.3294, + "step": 10567 + }, + { + "epoch": 0.52, + "grad_norm": 0.5563536882400513, + "learning_rate": 0.0005569484268858155, + "loss": 2.9519, + "step": 10568 + }, + { + "epoch": 0.52, + "grad_norm": 0.5203812122344971, + "learning_rate": 0.000556940479715618, + "loss": 3.3726, + "step": 10569 + }, + { + "epoch": 0.52, + "grad_norm": 0.545968770980835, + "learning_rate": 0.000556932531868689, + "loss": 3.1397, + "step": 10570 + }, + { + "epoch": 0.52, + "grad_norm": 0.5787644386291504, + "learning_rate": 0.000556924583345049, + "loss": 3.2113, + "step": 10571 + }, + { + "epoch": 0.52, + "grad_norm": 0.5733756422996521, + "learning_rate": 0.0005569166341447192, + "loss": 3.5598, + "step": 10572 + }, + { + "epoch": 0.52, + "grad_norm": 0.541483998298645, + "learning_rate": 0.0005569086842677203, + "loss": 3.2303, + "step": 10573 + }, + { + "epoch": 0.52, + "grad_norm": 0.549351692199707, + "learning_rate": 0.0005569007337140736, + "loss": 3.354, + "step": 10574 + }, + { + "epoch": 0.52, + "grad_norm": 0.5365926623344421, + "learning_rate": 0.0005568927824837996, + "loss": 3.2604, + "step": 10575 + }, + { + "epoch": 0.52, + "grad_norm": 0.5326177477836609, + "learning_rate": 0.0005568848305769195, + "loss": 3.2394, + "step": 10576 + }, + { + "epoch": 0.52, + "grad_norm": 0.523600697517395, + "learning_rate": 0.0005568768779934542, + "loss": 3.1083, + "step": 10577 + }, + { + "epoch": 0.52, + "grad_norm": 0.5898261666297913, + "learning_rate": 0.0005568689247334247, + "loss": 3.1423, + "step": 10578 + }, + { + "epoch": 0.52, + "grad_norm": 0.5717616677284241, + "learning_rate": 0.0005568609707968518, + "loss": 3.1968, + "step": 10579 + }, + { + "epoch": 0.52, + "grad_norm": 0.49297675490379333, + "learning_rate": 0.0005568530161837566, + "loss": 3.2594, + "step": 10580 + }, + { + "epoch": 0.52, + "grad_norm": 0.505689263343811, + "learning_rate": 0.0005568450608941599, + "loss": 3.187, + "step": 10581 + }, + { + "epoch": 0.52, + "grad_norm": 0.5439349412918091, + "learning_rate": 0.0005568371049280827, + "loss": 3.2561, + "step": 10582 + }, + { + "epoch": 0.52, + "grad_norm": 0.5405676960945129, + "learning_rate": 0.0005568291482855462, + "loss": 3.353, + "step": 10583 + }, + { + "epoch": 0.52, + "grad_norm": 0.4949883818626404, + "learning_rate": 0.000556821190966571, + "loss": 3.2305, + "step": 10584 + }, + { + "epoch": 0.52, + "grad_norm": 0.5206522941589355, + "learning_rate": 0.0005568132329711783, + "loss": 3.2696, + "step": 10585 + }, + { + "epoch": 0.52, + "grad_norm": 0.5137814283370972, + "learning_rate": 0.0005568052742993889, + "loss": 3.1519, + "step": 10586 + }, + { + "epoch": 0.52, + "grad_norm": 0.577721118927002, + "learning_rate": 0.0005567973149512239, + "loss": 3.2501, + "step": 10587 + }, + { + "epoch": 0.52, + "grad_norm": 0.4995252192020416, + "learning_rate": 0.0005567893549267042, + "loss": 3.2882, + "step": 10588 + }, + { + "epoch": 0.52, + "grad_norm": 0.5364012122154236, + "learning_rate": 0.0005567813942258506, + "loss": 3.3206, + "step": 10589 + }, + { + "epoch": 0.52, + "grad_norm": 0.5193110108375549, + "learning_rate": 0.0005567734328486844, + "loss": 3.1889, + "step": 10590 + }, + { + "epoch": 0.52, + "grad_norm": 0.510722815990448, + "learning_rate": 0.0005567654707952262, + "loss": 3.3038, + "step": 10591 + }, + { + "epoch": 0.52, + "grad_norm": 0.5020973682403564, + "learning_rate": 0.0005567575080654974, + "loss": 3.25, + "step": 10592 + }, + { + "epoch": 0.52, + "grad_norm": 0.5226955413818359, + "learning_rate": 0.0005567495446595187, + "loss": 3.3096, + "step": 10593 + }, + { + "epoch": 0.52, + "grad_norm": 0.5112230777740479, + "learning_rate": 0.000556741580577311, + "loss": 3.2976, + "step": 10594 + }, + { + "epoch": 0.52, + "grad_norm": 0.5107260942459106, + "learning_rate": 0.0005567336158188955, + "loss": 3.3089, + "step": 10595 + }, + { + "epoch": 0.52, + "grad_norm": 0.5062245726585388, + "learning_rate": 0.0005567256503842929, + "loss": 3.1591, + "step": 10596 + }, + { + "epoch": 0.52, + "grad_norm": 0.625127375125885, + "learning_rate": 0.0005567176842735244, + "loss": 3.3238, + "step": 10597 + }, + { + "epoch": 0.52, + "grad_norm": 0.48830580711364746, + "learning_rate": 0.000556709717486611, + "loss": 3.2201, + "step": 10598 + }, + { + "epoch": 0.52, + "grad_norm": 0.5055554509162903, + "learning_rate": 0.0005567017500235736, + "loss": 3.2346, + "step": 10599 + }, + { + "epoch": 0.52, + "grad_norm": 0.558253824710846, + "learning_rate": 0.0005566937818844332, + "loss": 3.3789, + "step": 10600 + }, + { + "epoch": 0.52, + "grad_norm": 0.5219190120697021, + "learning_rate": 0.0005566858130692107, + "loss": 3.1847, + "step": 10601 + }, + { + "epoch": 0.52, + "grad_norm": 0.5320719480514526, + "learning_rate": 0.0005566778435779272, + "loss": 3.3797, + "step": 10602 + }, + { + "epoch": 0.52, + "grad_norm": 0.5657633543014526, + "learning_rate": 0.0005566698734106037, + "loss": 3.2031, + "step": 10603 + }, + { + "epoch": 0.52, + "grad_norm": 0.6036418080329895, + "learning_rate": 0.0005566619025672611, + "loss": 3.3975, + "step": 10604 + }, + { + "epoch": 0.52, + "grad_norm": 0.5115952491760254, + "learning_rate": 0.0005566539310479206, + "loss": 3.058, + "step": 10605 + }, + { + "epoch": 0.52, + "grad_norm": 0.5792054533958435, + "learning_rate": 0.0005566459588526028, + "loss": 3.3311, + "step": 10606 + }, + { + "epoch": 0.52, + "grad_norm": 0.5313816070556641, + "learning_rate": 0.000556637985981329, + "loss": 3.428, + "step": 10607 + }, + { + "epoch": 0.52, + "grad_norm": 0.5156925320625305, + "learning_rate": 0.0005566300124341203, + "loss": 3.2343, + "step": 10608 + }, + { + "epoch": 0.52, + "grad_norm": 0.5339927077293396, + "learning_rate": 0.0005566220382109974, + "loss": 3.0971, + "step": 10609 + }, + { + "epoch": 0.52, + "grad_norm": 0.5031298995018005, + "learning_rate": 0.0005566140633119814, + "loss": 3.2465, + "step": 10610 + }, + { + "epoch": 0.52, + "grad_norm": 0.5161144137382507, + "learning_rate": 0.0005566060877370934, + "loss": 3.1803, + "step": 10611 + }, + { + "epoch": 0.52, + "grad_norm": 0.49942952394485474, + "learning_rate": 0.0005565981114863544, + "loss": 3.3168, + "step": 10612 + }, + { + "epoch": 0.52, + "grad_norm": 0.5302490592002869, + "learning_rate": 0.0005565901345597853, + "loss": 3.2728, + "step": 10613 + }, + { + "epoch": 0.52, + "grad_norm": 0.49967092275619507, + "learning_rate": 0.0005565821569574072, + "loss": 3.3966, + "step": 10614 + }, + { + "epoch": 0.52, + "grad_norm": 0.516776978969574, + "learning_rate": 0.000556574178679241, + "loss": 3.1585, + "step": 10615 + }, + { + "epoch": 0.52, + "grad_norm": 0.5194513201713562, + "learning_rate": 0.0005565661997253079, + "loss": 3.3124, + "step": 10616 + }, + { + "epoch": 0.52, + "grad_norm": 0.5056691765785217, + "learning_rate": 0.0005565582200956288, + "loss": 3.3491, + "step": 10617 + }, + { + "epoch": 0.52, + "grad_norm": 0.5017831921577454, + "learning_rate": 0.0005565502397902246, + "loss": 3.2153, + "step": 10618 + }, + { + "epoch": 0.52, + "grad_norm": 0.5000698566436768, + "learning_rate": 0.0005565422588091165, + "loss": 3.4327, + "step": 10619 + }, + { + "epoch": 0.52, + "grad_norm": 0.5435037016868591, + "learning_rate": 0.0005565342771523255, + "loss": 3.0531, + "step": 10620 + }, + { + "epoch": 0.52, + "grad_norm": 0.5276373624801636, + "learning_rate": 0.0005565262948198726, + "loss": 3.4252, + "step": 10621 + }, + { + "epoch": 0.52, + "grad_norm": 0.5036376118659973, + "learning_rate": 0.0005565183118117787, + "loss": 3.1393, + "step": 10622 + }, + { + "epoch": 0.52, + "grad_norm": 0.5199164748191833, + "learning_rate": 0.0005565103281280652, + "loss": 3.2729, + "step": 10623 + }, + { + "epoch": 0.52, + "grad_norm": 0.5530470013618469, + "learning_rate": 0.0005565023437687526, + "loss": 3.4069, + "step": 10624 + }, + { + "epoch": 0.52, + "grad_norm": 0.5330382585525513, + "learning_rate": 0.0005564943587338622, + "loss": 3.2335, + "step": 10625 + }, + { + "epoch": 0.52, + "grad_norm": 0.530340313911438, + "learning_rate": 0.0005564863730234151, + "loss": 3.1747, + "step": 10626 + }, + { + "epoch": 0.52, + "grad_norm": 0.5398244261741638, + "learning_rate": 0.0005564783866374323, + "loss": 3.407, + "step": 10627 + }, + { + "epoch": 0.52, + "grad_norm": 0.4917842447757721, + "learning_rate": 0.0005564703995759347, + "loss": 3.144, + "step": 10628 + }, + { + "epoch": 0.52, + "grad_norm": 0.5320751667022705, + "learning_rate": 0.0005564624118389435, + "loss": 3.1363, + "step": 10629 + }, + { + "epoch": 0.52, + "grad_norm": 0.5649660229682922, + "learning_rate": 0.0005564544234264797, + "loss": 3.283, + "step": 10630 + }, + { + "epoch": 0.52, + "grad_norm": 0.5005009174346924, + "learning_rate": 0.0005564464343385642, + "loss": 3.2569, + "step": 10631 + }, + { + "epoch": 0.52, + "grad_norm": 0.5064327120780945, + "learning_rate": 0.0005564384445752181, + "loss": 3.066, + "step": 10632 + }, + { + "epoch": 0.52, + "grad_norm": 0.5261727571487427, + "learning_rate": 0.0005564304541364626, + "loss": 3.186, + "step": 10633 + }, + { + "epoch": 0.52, + "grad_norm": 0.49180030822753906, + "learning_rate": 0.0005564224630223186, + "loss": 3.3107, + "step": 10634 + }, + { + "epoch": 0.52, + "grad_norm": 0.5282382369041443, + "learning_rate": 0.0005564144712328072, + "loss": 3.2852, + "step": 10635 + }, + { + "epoch": 0.52, + "grad_norm": 0.5133998394012451, + "learning_rate": 0.0005564064787679494, + "loss": 3.2587, + "step": 10636 + }, + { + "epoch": 0.52, + "grad_norm": 0.5127143859863281, + "learning_rate": 0.0005563984856277662, + "loss": 3.475, + "step": 10637 + }, + { + "epoch": 0.52, + "grad_norm": 0.5018812417984009, + "learning_rate": 0.000556390491812279, + "loss": 2.9832, + "step": 10638 + }, + { + "epoch": 0.52, + "grad_norm": 0.5890460014343262, + "learning_rate": 0.0005563824973215083, + "loss": 3.3091, + "step": 10639 + }, + { + "epoch": 0.52, + "grad_norm": 0.5279850363731384, + "learning_rate": 0.0005563745021554756, + "loss": 3.4503, + "step": 10640 + }, + { + "epoch": 0.52, + "grad_norm": 0.5249155759811401, + "learning_rate": 0.0005563665063142018, + "loss": 3.0548, + "step": 10641 + }, + { + "epoch": 0.52, + "grad_norm": 0.5137816667556763, + "learning_rate": 0.0005563585097977079, + "loss": 3.4102, + "step": 10642 + }, + { + "epoch": 0.52, + "grad_norm": 0.5169107913970947, + "learning_rate": 0.000556350512606015, + "loss": 3.3964, + "step": 10643 + }, + { + "epoch": 0.52, + "grad_norm": 0.5247167944908142, + "learning_rate": 0.0005563425147391442, + "loss": 3.1934, + "step": 10644 + }, + { + "epoch": 0.52, + "grad_norm": 0.5554961562156677, + "learning_rate": 0.0005563345161971165, + "loss": 3.6094, + "step": 10645 + }, + { + "epoch": 0.52, + "grad_norm": 0.5095405578613281, + "learning_rate": 0.0005563265169799532, + "loss": 3.5217, + "step": 10646 + }, + { + "epoch": 0.52, + "grad_norm": 0.5305556058883667, + "learning_rate": 0.0005563185170876751, + "loss": 3.1786, + "step": 10647 + }, + { + "epoch": 0.52, + "grad_norm": 0.5047399997711182, + "learning_rate": 0.0005563105165203034, + "loss": 3.1298, + "step": 10648 + }, + { + "epoch": 0.52, + "grad_norm": 0.5853273272514343, + "learning_rate": 0.0005563025152778591, + "loss": 3.0866, + "step": 10649 + }, + { + "epoch": 0.52, + "grad_norm": 0.5093114972114563, + "learning_rate": 0.0005562945133603633, + "loss": 3.273, + "step": 10650 + }, + { + "epoch": 0.52, + "grad_norm": 0.4893055558204651, + "learning_rate": 0.000556286510767837, + "loss": 3.3782, + "step": 10651 + }, + { + "epoch": 0.52, + "grad_norm": 0.6072062849998474, + "learning_rate": 0.0005562785075003013, + "loss": 3.24, + "step": 10652 + }, + { + "epoch": 0.52, + "grad_norm": 0.49338966608047485, + "learning_rate": 0.0005562705035577775, + "loss": 3.4271, + "step": 10653 + }, + { + "epoch": 0.52, + "grad_norm": 0.5136591792106628, + "learning_rate": 0.0005562624989402864, + "loss": 3.2597, + "step": 10654 + }, + { + "epoch": 0.52, + "grad_norm": 0.5117157101631165, + "learning_rate": 0.0005562544936478492, + "loss": 3.2494, + "step": 10655 + }, + { + "epoch": 0.52, + "grad_norm": 0.4980575442314148, + "learning_rate": 0.0005562464876804871, + "loss": 3.3992, + "step": 10656 + }, + { + "epoch": 0.52, + "grad_norm": 0.5555065870285034, + "learning_rate": 0.000556238481038221, + "loss": 3.2717, + "step": 10657 + }, + { + "epoch": 0.52, + "grad_norm": 0.5585774779319763, + "learning_rate": 0.000556230473721072, + "loss": 3.2388, + "step": 10658 + }, + { + "epoch": 0.52, + "grad_norm": 0.6022112965583801, + "learning_rate": 0.0005562224657290613, + "loss": 3.0722, + "step": 10659 + }, + { + "epoch": 0.52, + "grad_norm": 0.5127832889556885, + "learning_rate": 0.0005562144570622099, + "loss": 3.3021, + "step": 10660 + }, + { + "epoch": 0.52, + "grad_norm": 0.5011223554611206, + "learning_rate": 0.000556206447720539, + "loss": 3.3003, + "step": 10661 + }, + { + "epoch": 0.52, + "grad_norm": 0.5553777813911438, + "learning_rate": 0.0005561984377040695, + "loss": 3.1519, + "step": 10662 + }, + { + "epoch": 0.52, + "grad_norm": 0.5582361817359924, + "learning_rate": 0.0005561904270128227, + "loss": 3.1381, + "step": 10663 + }, + { + "epoch": 0.52, + "grad_norm": 0.5032232403755188, + "learning_rate": 0.0005561824156468196, + "loss": 3.1779, + "step": 10664 + }, + { + "epoch": 0.52, + "grad_norm": 0.516594409942627, + "learning_rate": 0.0005561744036060814, + "loss": 3.3612, + "step": 10665 + }, + { + "epoch": 0.52, + "grad_norm": 0.5512834191322327, + "learning_rate": 0.0005561663908906291, + "loss": 3.2972, + "step": 10666 + }, + { + "epoch": 0.52, + "grad_norm": 0.5587075352668762, + "learning_rate": 0.0005561583775004837, + "loss": 3.1885, + "step": 10667 + }, + { + "epoch": 0.52, + "grad_norm": 0.6395102739334106, + "learning_rate": 0.0005561503634356666, + "loss": 3.368, + "step": 10668 + }, + { + "epoch": 0.52, + "grad_norm": 0.572404146194458, + "learning_rate": 0.0005561423486961987, + "loss": 3.0949, + "step": 10669 + }, + { + "epoch": 0.52, + "grad_norm": 0.5496918559074402, + "learning_rate": 0.000556134333282101, + "loss": 3.1419, + "step": 10670 + }, + { + "epoch": 0.52, + "grad_norm": 0.5133854150772095, + "learning_rate": 0.0005561263171933949, + "loss": 3.3315, + "step": 10671 + }, + { + "epoch": 0.52, + "grad_norm": 0.5696301460266113, + "learning_rate": 0.0005561183004301015, + "loss": 3.1278, + "step": 10672 + }, + { + "epoch": 0.52, + "grad_norm": 0.5422386527061462, + "learning_rate": 0.0005561102829922415, + "loss": 3.2289, + "step": 10673 + }, + { + "epoch": 0.52, + "grad_norm": 0.5507988333702087, + "learning_rate": 0.0005561022648798364, + "loss": 3.2526, + "step": 10674 + }, + { + "epoch": 0.52, + "grad_norm": 0.5414395332336426, + "learning_rate": 0.0005560942460929074, + "loss": 3.0932, + "step": 10675 + }, + { + "epoch": 0.52, + "grad_norm": 0.611114501953125, + "learning_rate": 0.0005560862266314752, + "loss": 3.0416, + "step": 10676 + }, + { + "epoch": 0.52, + "grad_norm": 0.5457938313484192, + "learning_rate": 0.0005560782064955613, + "loss": 3.1892, + "step": 10677 + }, + { + "epoch": 0.52, + "grad_norm": 0.5073613524436951, + "learning_rate": 0.0005560701856851866, + "loss": 3.3985, + "step": 10678 + }, + { + "epoch": 0.52, + "grad_norm": 0.5243719220161438, + "learning_rate": 0.0005560621642003723, + "loss": 3.1556, + "step": 10679 + }, + { + "epoch": 0.52, + "grad_norm": 0.5766971707344055, + "learning_rate": 0.0005560541420411398, + "loss": 3.3581, + "step": 10680 + }, + { + "epoch": 0.52, + "grad_norm": 0.5084385871887207, + "learning_rate": 0.0005560461192075097, + "loss": 3.1972, + "step": 10681 + }, + { + "epoch": 0.52, + "grad_norm": 0.5689043402671814, + "learning_rate": 0.0005560380956995035, + "loss": 3.2245, + "step": 10682 + }, + { + "epoch": 0.52, + "grad_norm": 0.506496787071228, + "learning_rate": 0.0005560300715171421, + "loss": 3.2089, + "step": 10683 + }, + { + "epoch": 0.52, + "grad_norm": 0.5448225736618042, + "learning_rate": 0.0005560220466604469, + "loss": 3.1088, + "step": 10684 + }, + { + "epoch": 0.52, + "grad_norm": 0.5289160013198853, + "learning_rate": 0.0005560140211294389, + "loss": 3.3693, + "step": 10685 + }, + { + "epoch": 0.52, + "grad_norm": 0.5228204131126404, + "learning_rate": 0.0005560059949241392, + "loss": 3.4617, + "step": 10686 + }, + { + "epoch": 0.52, + "grad_norm": 0.49474701285362244, + "learning_rate": 0.000555997968044569, + "loss": 3.2549, + "step": 10687 + }, + { + "epoch": 0.52, + "grad_norm": 0.5213100910186768, + "learning_rate": 0.0005559899404907493, + "loss": 3.2791, + "step": 10688 + }, + { + "epoch": 0.52, + "grad_norm": 0.5579019784927368, + "learning_rate": 0.0005559819122627016, + "loss": 3.0853, + "step": 10689 + }, + { + "epoch": 0.52, + "grad_norm": 0.5246168375015259, + "learning_rate": 0.0005559738833604466, + "loss": 3.1685, + "step": 10690 + }, + { + "epoch": 0.52, + "grad_norm": 0.5287438631057739, + "learning_rate": 0.0005559658537840058, + "loss": 3.2581, + "step": 10691 + }, + { + "epoch": 0.52, + "grad_norm": 0.576284646987915, + "learning_rate": 0.0005559578235334002, + "loss": 3.3451, + "step": 10692 + }, + { + "epoch": 0.52, + "grad_norm": 0.5133945345878601, + "learning_rate": 0.0005559497926086508, + "loss": 3.2493, + "step": 10693 + }, + { + "epoch": 0.52, + "grad_norm": 0.5034992098808289, + "learning_rate": 0.000555941761009779, + "loss": 3.3255, + "step": 10694 + }, + { + "epoch": 0.52, + "grad_norm": 0.5439068078994751, + "learning_rate": 0.0005559337287368058, + "loss": 3.2783, + "step": 10695 + }, + { + "epoch": 0.52, + "grad_norm": 0.5081217288970947, + "learning_rate": 0.0005559256957897525, + "loss": 3.2928, + "step": 10696 + }, + { + "epoch": 0.52, + "grad_norm": 0.5664759278297424, + "learning_rate": 0.0005559176621686402, + "loss": 3.2541, + "step": 10697 + }, + { + "epoch": 0.52, + "grad_norm": 0.5075600743293762, + "learning_rate": 0.0005559096278734899, + "loss": 3.1877, + "step": 10698 + }, + { + "epoch": 0.52, + "grad_norm": 0.5304969549179077, + "learning_rate": 0.000555901592904323, + "loss": 3.347, + "step": 10699 + }, + { + "epoch": 0.52, + "grad_norm": 0.5134689211845398, + "learning_rate": 0.0005558935572611605, + "loss": 3.2153, + "step": 10700 + }, + { + "epoch": 0.52, + "grad_norm": 0.5504531264305115, + "learning_rate": 0.0005558855209440236, + "loss": 3.1492, + "step": 10701 + }, + { + "epoch": 0.52, + "grad_norm": 0.5000514984130859, + "learning_rate": 0.0005558774839529335, + "loss": 3.472, + "step": 10702 + }, + { + "epoch": 0.52, + "grad_norm": 0.532200276851654, + "learning_rate": 0.0005558694462879113, + "loss": 3.1464, + "step": 10703 + }, + { + "epoch": 0.52, + "grad_norm": 0.5475481152534485, + "learning_rate": 0.0005558614079489784, + "loss": 2.9702, + "step": 10704 + }, + { + "epoch": 0.52, + "grad_norm": 0.5486966967582703, + "learning_rate": 0.0005558533689361557, + "loss": 3.5168, + "step": 10705 + }, + { + "epoch": 0.52, + "grad_norm": 0.5467349886894226, + "learning_rate": 0.0005558453292494644, + "loss": 3.4869, + "step": 10706 + }, + { + "epoch": 0.52, + "grad_norm": 0.720486581325531, + "learning_rate": 0.0005558372888889258, + "loss": 3.0787, + "step": 10707 + }, + { + "epoch": 0.52, + "grad_norm": 0.5721949338912964, + "learning_rate": 0.0005558292478545611, + "loss": 3.5098, + "step": 10708 + }, + { + "epoch": 0.52, + "grad_norm": 0.4928334355354309, + "learning_rate": 0.0005558212061463912, + "loss": 3.5137, + "step": 10709 + }, + { + "epoch": 0.52, + "grad_norm": 0.5080165863037109, + "learning_rate": 0.0005558131637644376, + "loss": 3.3198, + "step": 10710 + }, + { + "epoch": 0.52, + "grad_norm": 0.5456277132034302, + "learning_rate": 0.0005558051207087214, + "loss": 3.2869, + "step": 10711 + }, + { + "epoch": 0.52, + "grad_norm": 0.48766282200813293, + "learning_rate": 0.0005557970769792636, + "loss": 3.1597, + "step": 10712 + }, + { + "epoch": 0.53, + "grad_norm": 0.5692100524902344, + "learning_rate": 0.0005557890325760856, + "loss": 3.2611, + "step": 10713 + }, + { + "epoch": 0.53, + "grad_norm": 0.5719293355941772, + "learning_rate": 0.0005557809874992086, + "loss": 3.0606, + "step": 10714 + }, + { + "epoch": 0.53, + "grad_norm": 0.5092300772666931, + "learning_rate": 0.0005557729417486536, + "loss": 3.1642, + "step": 10715 + }, + { + "epoch": 0.53, + "grad_norm": 0.5151323676109314, + "learning_rate": 0.0005557648953244419, + "loss": 3.2684, + "step": 10716 + }, + { + "epoch": 0.53, + "grad_norm": 0.48842179775238037, + "learning_rate": 0.0005557568482265947, + "loss": 3.3302, + "step": 10717 + }, + { + "epoch": 0.53, + "grad_norm": 0.5149058699607849, + "learning_rate": 0.0005557488004551332, + "loss": 3.3089, + "step": 10718 + }, + { + "epoch": 0.53, + "grad_norm": 0.5239120125770569, + "learning_rate": 0.0005557407520100785, + "loss": 3.2215, + "step": 10719 + }, + { + "epoch": 0.53, + "grad_norm": 0.5721972584724426, + "learning_rate": 0.000555732702891452, + "loss": 3.2324, + "step": 10720 + }, + { + "epoch": 0.53, + "grad_norm": 0.5425095558166504, + "learning_rate": 0.0005557246530992748, + "loss": 3.3185, + "step": 10721 + }, + { + "epoch": 0.53, + "grad_norm": 0.5574762225151062, + "learning_rate": 0.000555716602633568, + "loss": 3.4277, + "step": 10722 + }, + { + "epoch": 0.53, + "grad_norm": 0.5534297823905945, + "learning_rate": 0.0005557085514943529, + "loss": 3.3156, + "step": 10723 + }, + { + "epoch": 0.53, + "grad_norm": 0.5003551244735718, + "learning_rate": 0.0005557004996816507, + "loss": 3.3441, + "step": 10724 + }, + { + "epoch": 0.53, + "grad_norm": 0.5311870574951172, + "learning_rate": 0.0005556924471954826, + "loss": 3.235, + "step": 10725 + }, + { + "epoch": 0.53, + "grad_norm": 0.5108062624931335, + "learning_rate": 0.0005556843940358698, + "loss": 3.1934, + "step": 10726 + }, + { + "epoch": 0.53, + "grad_norm": 0.5077083110809326, + "learning_rate": 0.0005556763402028334, + "loss": 3.2491, + "step": 10727 + }, + { + "epoch": 0.53, + "grad_norm": 0.5326809883117676, + "learning_rate": 0.0005556682856963949, + "loss": 3.3736, + "step": 10728 + }, + { + "epoch": 0.53, + "grad_norm": 0.49083372950553894, + "learning_rate": 0.0005556602305165752, + "loss": 3.0318, + "step": 10729 + }, + { + "epoch": 0.53, + "grad_norm": 0.49490422010421753, + "learning_rate": 0.0005556521746633957, + "loss": 3.1, + "step": 10730 + }, + { + "epoch": 0.53, + "grad_norm": 0.48100271821022034, + "learning_rate": 0.0005556441181368776, + "loss": 3.2817, + "step": 10731 + }, + { + "epoch": 0.53, + "grad_norm": 0.5259497761726379, + "learning_rate": 0.0005556360609370421, + "loss": 3.3625, + "step": 10732 + }, + { + "epoch": 0.53, + "grad_norm": 0.5028544068336487, + "learning_rate": 0.0005556280030639103, + "loss": 3.2937, + "step": 10733 + }, + { + "epoch": 0.53, + "grad_norm": 0.628638744354248, + "learning_rate": 0.0005556199445175037, + "loss": 3.2895, + "step": 10734 + }, + { + "epoch": 0.53, + "grad_norm": 0.517248809337616, + "learning_rate": 0.0005556118852978433, + "loss": 3.4441, + "step": 10735 + }, + { + "epoch": 0.53, + "grad_norm": 0.5241116285324097, + "learning_rate": 0.0005556038254049503, + "loss": 3.071, + "step": 10736 + }, + { + "epoch": 0.53, + "grad_norm": 0.5196915864944458, + "learning_rate": 0.0005555957648388461, + "loss": 3.4301, + "step": 10737 + }, + { + "epoch": 0.53, + "grad_norm": 0.5392457246780396, + "learning_rate": 0.000555587703599552, + "loss": 3.0426, + "step": 10738 + }, + { + "epoch": 0.53, + "grad_norm": 0.49311745166778564, + "learning_rate": 0.0005555796416870888, + "loss": 3.3097, + "step": 10739 + }, + { + "epoch": 0.53, + "grad_norm": 0.643158495426178, + "learning_rate": 0.0005555715791014781, + "loss": 3.2897, + "step": 10740 + }, + { + "epoch": 0.53, + "grad_norm": 0.5130305290222168, + "learning_rate": 0.0005555635158427412, + "loss": 3.2176, + "step": 10741 + }, + { + "epoch": 0.53, + "grad_norm": 0.5505951046943665, + "learning_rate": 0.000555555451910899, + "loss": 3.2898, + "step": 10742 + }, + { + "epoch": 0.53, + "grad_norm": 0.5373878479003906, + "learning_rate": 0.000555547387305973, + "loss": 3.2094, + "step": 10743 + }, + { + "epoch": 0.53, + "grad_norm": 0.4722157418727875, + "learning_rate": 0.0005555393220279843, + "loss": 3.1176, + "step": 10744 + }, + { + "epoch": 0.53, + "grad_norm": 0.5164155960083008, + "learning_rate": 0.0005555312560769542, + "loss": 3.1844, + "step": 10745 + }, + { + "epoch": 0.53, + "grad_norm": 0.49586886167526245, + "learning_rate": 0.000555523189452904, + "loss": 3.0007, + "step": 10746 + }, + { + "epoch": 0.53, + "grad_norm": 0.5387834906578064, + "learning_rate": 0.0005555151221558549, + "loss": 3.2779, + "step": 10747 + }, + { + "epoch": 0.53, + "grad_norm": 0.5266426205635071, + "learning_rate": 0.0005555070541858281, + "loss": 3.1793, + "step": 10748 + }, + { + "epoch": 0.53, + "grad_norm": 0.5374503135681152, + "learning_rate": 0.000555498985542845, + "loss": 3.2667, + "step": 10749 + }, + { + "epoch": 0.53, + "grad_norm": 0.5462484359741211, + "learning_rate": 0.0005554909162269266, + "loss": 3.189, + "step": 10750 + }, + { + "epoch": 0.53, + "grad_norm": 0.5269687175750732, + "learning_rate": 0.0005554828462380945, + "loss": 3.3992, + "step": 10751 + }, + { + "epoch": 0.53, + "grad_norm": 0.5425649881362915, + "learning_rate": 0.0005554747755763695, + "loss": 3.2771, + "step": 10752 + }, + { + "epoch": 0.53, + "grad_norm": 0.5352199077606201, + "learning_rate": 0.0005554667042417733, + "loss": 3.5624, + "step": 10753 + }, + { + "epoch": 0.53, + "grad_norm": 0.5189509987831116, + "learning_rate": 0.000555458632234327, + "loss": 3.2905, + "step": 10754 + }, + { + "epoch": 0.53, + "grad_norm": 0.5382845401763916, + "learning_rate": 0.0005554505595540516, + "loss": 3.1669, + "step": 10755 + }, + { + "epoch": 0.53, + "grad_norm": 0.5350643396377563, + "learning_rate": 0.0005554424862009688, + "loss": 3.202, + "step": 10756 + }, + { + "epoch": 0.53, + "grad_norm": 0.5131836533546448, + "learning_rate": 0.0005554344121750994, + "loss": 3.1861, + "step": 10757 + }, + { + "epoch": 0.53, + "grad_norm": 0.5483985543251038, + "learning_rate": 0.0005554263374764651, + "loss": 3.2958, + "step": 10758 + }, + { + "epoch": 0.53, + "grad_norm": 0.5102946758270264, + "learning_rate": 0.000555418262105087, + "loss": 3.2614, + "step": 10759 + }, + { + "epoch": 0.53, + "grad_norm": 0.5368705987930298, + "learning_rate": 0.0005554101860609864, + "loss": 3.3872, + "step": 10760 + }, + { + "epoch": 0.53, + "grad_norm": 0.49759748578071594, + "learning_rate": 0.0005554021093441844, + "loss": 3.3422, + "step": 10761 + }, + { + "epoch": 0.53, + "grad_norm": 0.5840714573860168, + "learning_rate": 0.0005553940319547024, + "loss": 3.3937, + "step": 10762 + }, + { + "epoch": 0.53, + "grad_norm": 0.5512921214103699, + "learning_rate": 0.0005553859538925617, + "loss": 3.3002, + "step": 10763 + }, + { + "epoch": 0.53, + "grad_norm": 0.49647557735443115, + "learning_rate": 0.0005553778751577836, + "loss": 3.3018, + "step": 10764 + }, + { + "epoch": 0.53, + "grad_norm": 0.550879955291748, + "learning_rate": 0.0005553697957503893, + "loss": 3.3197, + "step": 10765 + }, + { + "epoch": 0.53, + "grad_norm": 0.5185007452964783, + "learning_rate": 0.0005553617156704001, + "loss": 3.2382, + "step": 10766 + }, + { + "epoch": 0.53, + "grad_norm": 0.510525643825531, + "learning_rate": 0.0005553536349178372, + "loss": 3.3413, + "step": 10767 + }, + { + "epoch": 0.53, + "grad_norm": 0.5063631534576416, + "learning_rate": 0.0005553455534927221, + "loss": 3.1335, + "step": 10768 + }, + { + "epoch": 0.53, + "grad_norm": 0.5001246929168701, + "learning_rate": 0.000555337471395076, + "loss": 3.1726, + "step": 10769 + }, + { + "epoch": 0.53, + "grad_norm": 0.5542845129966736, + "learning_rate": 0.00055532938862492, + "loss": 3.1714, + "step": 10770 + }, + { + "epoch": 0.53, + "grad_norm": 0.5402297377586365, + "learning_rate": 0.0005553213051822755, + "loss": 3.2763, + "step": 10771 + }, + { + "epoch": 0.53, + "grad_norm": 0.5092653632164001, + "learning_rate": 0.0005553132210671639, + "loss": 3.2897, + "step": 10772 + }, + { + "epoch": 0.53, + "grad_norm": 0.5712873935699463, + "learning_rate": 0.0005553051362796064, + "loss": 3.3383, + "step": 10773 + }, + { + "epoch": 0.53, + "grad_norm": 0.5029581189155579, + "learning_rate": 0.0005552970508196243, + "loss": 3.1195, + "step": 10774 + }, + { + "epoch": 0.53, + "grad_norm": 0.558640718460083, + "learning_rate": 0.0005552889646872389, + "loss": 3.1996, + "step": 10775 + }, + { + "epoch": 0.53, + "grad_norm": 0.5104977488517761, + "learning_rate": 0.0005552808778824715, + "loss": 3.1922, + "step": 10776 + }, + { + "epoch": 0.53, + "grad_norm": 0.4940243363380432, + "learning_rate": 0.0005552727904053435, + "loss": 2.9714, + "step": 10777 + }, + { + "epoch": 0.53, + "grad_norm": 0.5271364450454712, + "learning_rate": 0.0005552647022558759, + "loss": 3.2721, + "step": 10778 + }, + { + "epoch": 0.53, + "grad_norm": 0.5337794423103333, + "learning_rate": 0.0005552566134340904, + "loss": 3.3958, + "step": 10779 + }, + { + "epoch": 0.53, + "grad_norm": 0.4681585133075714, + "learning_rate": 0.000555248523940008, + "loss": 3.3627, + "step": 10780 + }, + { + "epoch": 0.53, + "grad_norm": 0.5498515367507935, + "learning_rate": 0.0005552404337736501, + "loss": 3.3218, + "step": 10781 + }, + { + "epoch": 0.53, + "grad_norm": 0.4920918941497803, + "learning_rate": 0.000555232342935038, + "loss": 3.2476, + "step": 10782 + }, + { + "epoch": 0.53, + "grad_norm": 0.5251589417457581, + "learning_rate": 0.0005552242514241931, + "loss": 3.4136, + "step": 10783 + }, + { + "epoch": 0.53, + "grad_norm": 0.49055927991867065, + "learning_rate": 0.0005552161592411366, + "loss": 3.4924, + "step": 10784 + }, + { + "epoch": 0.53, + "grad_norm": 0.5254443287849426, + "learning_rate": 0.0005552080663858899, + "loss": 3.1118, + "step": 10785 + }, + { + "epoch": 0.53, + "grad_norm": 0.5273293852806091, + "learning_rate": 0.0005551999728584742, + "loss": 3.2376, + "step": 10786 + }, + { + "epoch": 0.53, + "grad_norm": 0.46797534823417664, + "learning_rate": 0.000555191878658911, + "loss": 3.1969, + "step": 10787 + }, + { + "epoch": 0.53, + "grad_norm": 0.5711906552314758, + "learning_rate": 0.0005551837837872213, + "loss": 3.3446, + "step": 10788 + }, + { + "epoch": 0.53, + "grad_norm": 0.5303103923797607, + "learning_rate": 0.0005551756882434268, + "loss": 3.1252, + "step": 10789 + }, + { + "epoch": 0.53, + "grad_norm": 0.515155017375946, + "learning_rate": 0.0005551675920275486, + "loss": 3.2225, + "step": 10790 + }, + { + "epoch": 0.53, + "grad_norm": 0.5145278573036194, + "learning_rate": 0.0005551594951396081, + "loss": 3.5122, + "step": 10791 + }, + { + "epoch": 0.53, + "grad_norm": 0.4963783025741577, + "learning_rate": 0.0005551513975796265, + "loss": 3.4213, + "step": 10792 + }, + { + "epoch": 0.53, + "grad_norm": 0.5196395516395569, + "learning_rate": 0.0005551432993476254, + "loss": 3.2661, + "step": 10793 + }, + { + "epoch": 0.53, + "grad_norm": 0.5125848054885864, + "learning_rate": 0.0005551352004436258, + "loss": 3.2938, + "step": 10794 + }, + { + "epoch": 0.53, + "grad_norm": 0.5345282554626465, + "learning_rate": 0.0005551271008676492, + "loss": 3.1414, + "step": 10795 + }, + { + "epoch": 0.53, + "grad_norm": 0.4970857799053192, + "learning_rate": 0.0005551190006197169, + "loss": 3.3573, + "step": 10796 + }, + { + "epoch": 0.53, + "grad_norm": 0.49079298973083496, + "learning_rate": 0.0005551108996998503, + "loss": 3.3713, + "step": 10797 + }, + { + "epoch": 0.53, + "grad_norm": 0.502100944519043, + "learning_rate": 0.0005551027981080707, + "loss": 3.2045, + "step": 10798 + }, + { + "epoch": 0.53, + "grad_norm": 0.5889394879341125, + "learning_rate": 0.0005550946958443994, + "loss": 3.3565, + "step": 10799 + }, + { + "epoch": 0.53, + "grad_norm": 0.5061367154121399, + "learning_rate": 0.0005550865929088577, + "loss": 3.349, + "step": 10800 + }, + { + "epoch": 0.53, + "grad_norm": 0.5013979077339172, + "learning_rate": 0.000555078489301467, + "loss": 3.3525, + "step": 10801 + }, + { + "epoch": 0.53, + "grad_norm": 0.49582046270370483, + "learning_rate": 0.0005550703850222487, + "loss": 3.2322, + "step": 10802 + }, + { + "epoch": 0.53, + "grad_norm": 0.5003193020820618, + "learning_rate": 0.0005550622800712242, + "loss": 3.3309, + "step": 10803 + }, + { + "epoch": 0.53, + "grad_norm": 0.5031837821006775, + "learning_rate": 0.0005550541744484147, + "loss": 3.1939, + "step": 10804 + }, + { + "epoch": 0.53, + "grad_norm": 0.5380825400352478, + "learning_rate": 0.0005550460681538415, + "loss": 3.2945, + "step": 10805 + }, + { + "epoch": 0.53, + "grad_norm": 0.5140364766120911, + "learning_rate": 0.0005550379611875261, + "loss": 3.3461, + "step": 10806 + }, + { + "epoch": 0.53, + "grad_norm": 0.5664733052253723, + "learning_rate": 0.0005550298535494898, + "loss": 3.0924, + "step": 10807 + }, + { + "epoch": 0.53, + "grad_norm": 0.5468794703483582, + "learning_rate": 0.000555021745239754, + "loss": 3.0776, + "step": 10808 + }, + { + "epoch": 0.53, + "grad_norm": 0.5097938776016235, + "learning_rate": 0.00055501363625834, + "loss": 3.4317, + "step": 10809 + }, + { + "epoch": 0.53, + "grad_norm": 0.5171865820884705, + "learning_rate": 0.000555005526605269, + "loss": 3.1843, + "step": 10810 + }, + { + "epoch": 0.53, + "grad_norm": 0.544694721698761, + "learning_rate": 0.0005549974162805626, + "loss": 3.1868, + "step": 10811 + }, + { + "epoch": 0.53, + "grad_norm": 0.4788232147693634, + "learning_rate": 0.0005549893052842421, + "loss": 3.4821, + "step": 10812 + }, + { + "epoch": 0.53, + "grad_norm": 0.5630718469619751, + "learning_rate": 0.0005549811936163288, + "loss": 3.2703, + "step": 10813 + }, + { + "epoch": 0.53, + "grad_norm": 0.5108457207679749, + "learning_rate": 0.0005549730812768441, + "loss": 3.2386, + "step": 10814 + }, + { + "epoch": 0.53, + "grad_norm": 0.5172486305236816, + "learning_rate": 0.0005549649682658096, + "loss": 3.4601, + "step": 10815 + }, + { + "epoch": 0.53, + "grad_norm": 0.5107886791229248, + "learning_rate": 0.0005549568545832462, + "loss": 3.1461, + "step": 10816 + }, + { + "epoch": 0.53, + "grad_norm": 0.5882880091667175, + "learning_rate": 0.0005549487402291756, + "loss": 3.2976, + "step": 10817 + }, + { + "epoch": 0.53, + "grad_norm": 0.5461258292198181, + "learning_rate": 0.000554940625203619, + "loss": 3.3833, + "step": 10818 + }, + { + "epoch": 0.53, + "grad_norm": 0.5405899882316589, + "learning_rate": 0.0005549325095065979, + "loss": 3.3489, + "step": 10819 + }, + { + "epoch": 0.53, + "grad_norm": 0.5153059959411621, + "learning_rate": 0.0005549243931381336, + "loss": 2.9776, + "step": 10820 + }, + { + "epoch": 0.53, + "grad_norm": 0.49118930101394653, + "learning_rate": 0.0005549162760982477, + "loss": 3.0449, + "step": 10821 + }, + { + "epoch": 0.53, + "grad_norm": 0.5056452751159668, + "learning_rate": 0.0005549081583869612, + "loss": 3.4947, + "step": 10822 + }, + { + "epoch": 0.53, + "grad_norm": 0.5282566547393799, + "learning_rate": 0.0005549000400042958, + "loss": 3.3881, + "step": 10823 + }, + { + "epoch": 0.53, + "grad_norm": 0.5301047563552856, + "learning_rate": 0.0005548919209502726, + "loss": 3.1563, + "step": 10824 + }, + { + "epoch": 0.53, + "grad_norm": 0.5251234769821167, + "learning_rate": 0.0005548838012249132, + "loss": 3.2803, + "step": 10825 + }, + { + "epoch": 0.53, + "grad_norm": 0.523571252822876, + "learning_rate": 0.000554875680828239, + "loss": 3.318, + "step": 10826 + }, + { + "epoch": 0.53, + "grad_norm": 0.5182716846466064, + "learning_rate": 0.0005548675597602711, + "loss": 3.3921, + "step": 10827 + }, + { + "epoch": 0.53, + "grad_norm": 0.5157158970832825, + "learning_rate": 0.0005548594380210313, + "loss": 3.0689, + "step": 10828 + }, + { + "epoch": 0.53, + "grad_norm": 0.5221959948539734, + "learning_rate": 0.0005548513156105407, + "loss": 3.1948, + "step": 10829 + }, + { + "epoch": 0.53, + "grad_norm": 0.49166935682296753, + "learning_rate": 0.0005548431925288208, + "loss": 3.1451, + "step": 10830 + }, + { + "epoch": 0.53, + "grad_norm": 0.5118061900138855, + "learning_rate": 0.000554835068775893, + "loss": 3.2432, + "step": 10831 + }, + { + "epoch": 0.53, + "grad_norm": 0.49349334836006165, + "learning_rate": 0.0005548269443517787, + "loss": 3.2898, + "step": 10832 + }, + { + "epoch": 0.53, + "grad_norm": 0.5066019892692566, + "learning_rate": 0.0005548188192564991, + "loss": 3.2891, + "step": 10833 + }, + { + "epoch": 0.53, + "grad_norm": 0.5938020944595337, + "learning_rate": 0.000554810693490076, + "loss": 3.2116, + "step": 10834 + }, + { + "epoch": 0.53, + "grad_norm": 0.5334991216659546, + "learning_rate": 0.0005548025670525304, + "loss": 3.1081, + "step": 10835 + }, + { + "epoch": 0.53, + "grad_norm": 0.502778947353363, + "learning_rate": 0.000554794439943884, + "loss": 3.233, + "step": 10836 + }, + { + "epoch": 0.53, + "grad_norm": 0.5045210123062134, + "learning_rate": 0.0005547863121641581, + "loss": 3.314, + "step": 10837 + }, + { + "epoch": 0.53, + "grad_norm": 0.5293318033218384, + "learning_rate": 0.000554778183713374, + "loss": 3.3784, + "step": 10838 + }, + { + "epoch": 0.53, + "grad_norm": 0.518131673336029, + "learning_rate": 0.0005547700545915531, + "loss": 3.3426, + "step": 10839 + }, + { + "epoch": 0.53, + "grad_norm": 0.4948231279850006, + "learning_rate": 0.0005547619247987171, + "loss": 3.0953, + "step": 10840 + }, + { + "epoch": 0.53, + "grad_norm": 0.5470211505889893, + "learning_rate": 0.0005547537943348871, + "loss": 3.1605, + "step": 10841 + }, + { + "epoch": 0.53, + "grad_norm": 0.5280324816703796, + "learning_rate": 0.0005547456632000846, + "loss": 3.2811, + "step": 10842 + }, + { + "epoch": 0.53, + "grad_norm": 0.5659511089324951, + "learning_rate": 0.0005547375313943312, + "loss": 3.2445, + "step": 10843 + }, + { + "epoch": 0.53, + "grad_norm": 0.5293766856193542, + "learning_rate": 0.000554729398917648, + "loss": 3.2949, + "step": 10844 + }, + { + "epoch": 0.53, + "grad_norm": 0.5941742062568665, + "learning_rate": 0.0005547212657700568, + "loss": 3.202, + "step": 10845 + }, + { + "epoch": 0.53, + "grad_norm": 0.5199886560440063, + "learning_rate": 0.0005547131319515787, + "loss": 3.1408, + "step": 10846 + }, + { + "epoch": 0.53, + "grad_norm": 0.5088639259338379, + "learning_rate": 0.0005547049974622351, + "loss": 3.2688, + "step": 10847 + }, + { + "epoch": 0.53, + "grad_norm": 0.5248528718948364, + "learning_rate": 0.0005546968623020477, + "loss": 3.2334, + "step": 10848 + }, + { + "epoch": 0.53, + "grad_norm": 0.5154558420181274, + "learning_rate": 0.0005546887264710377, + "loss": 3.2585, + "step": 10849 + }, + { + "epoch": 0.53, + "grad_norm": 0.5221732258796692, + "learning_rate": 0.0005546805899692267, + "loss": 3.2253, + "step": 10850 + }, + { + "epoch": 0.53, + "grad_norm": 0.5415873527526855, + "learning_rate": 0.000554672452796636, + "loss": 3.1749, + "step": 10851 + }, + { + "epoch": 0.53, + "grad_norm": 0.4975447952747345, + "learning_rate": 0.000554664314953287, + "loss": 3.0391, + "step": 10852 + }, + { + "epoch": 0.53, + "grad_norm": 0.4918574094772339, + "learning_rate": 0.0005546561764392014, + "loss": 3.1746, + "step": 10853 + }, + { + "epoch": 0.53, + "grad_norm": 0.516654372215271, + "learning_rate": 0.0005546480372544003, + "loss": 3.2713, + "step": 10854 + }, + { + "epoch": 0.53, + "grad_norm": 0.5436745285987854, + "learning_rate": 0.0005546398973989053, + "loss": 3.0698, + "step": 10855 + }, + { + "epoch": 0.53, + "grad_norm": 0.5498283505439758, + "learning_rate": 0.0005546317568727379, + "loss": 3.3623, + "step": 10856 + }, + { + "epoch": 0.53, + "grad_norm": 0.5308879613876343, + "learning_rate": 0.0005546236156759194, + "loss": 3.2788, + "step": 10857 + }, + { + "epoch": 0.53, + "grad_norm": 0.535848081111908, + "learning_rate": 0.0005546154738084711, + "loss": 3.3993, + "step": 10858 + }, + { + "epoch": 0.53, + "grad_norm": 0.6059902310371399, + "learning_rate": 0.0005546073312704149, + "loss": 3.2072, + "step": 10859 + }, + { + "epoch": 0.53, + "grad_norm": 0.5028658509254456, + "learning_rate": 0.0005545991880617719, + "loss": 3.2419, + "step": 10860 + }, + { + "epoch": 0.53, + "grad_norm": 0.5084434747695923, + "learning_rate": 0.0005545910441825636, + "loss": 3.2826, + "step": 10861 + }, + { + "epoch": 0.53, + "grad_norm": 0.5181224942207336, + "learning_rate": 0.0005545828996328116, + "loss": 3.1966, + "step": 10862 + }, + { + "epoch": 0.53, + "grad_norm": 0.5151437520980835, + "learning_rate": 0.0005545747544125371, + "loss": 3.169, + "step": 10863 + }, + { + "epoch": 0.53, + "grad_norm": 0.4924545884132385, + "learning_rate": 0.0005545666085217619, + "loss": 3.254, + "step": 10864 + }, + { + "epoch": 0.53, + "grad_norm": 0.5671659708023071, + "learning_rate": 0.000554558461960507, + "loss": 3.4059, + "step": 10865 + }, + { + "epoch": 0.53, + "grad_norm": 0.5134097337722778, + "learning_rate": 0.0005545503147287941, + "loss": 3.1668, + "step": 10866 + }, + { + "epoch": 0.53, + "grad_norm": 0.5386476516723633, + "learning_rate": 0.0005545421668266448, + "loss": 3.2502, + "step": 10867 + }, + { + "epoch": 0.53, + "grad_norm": 0.5015349984169006, + "learning_rate": 0.0005545340182540805, + "loss": 3.3447, + "step": 10868 + }, + { + "epoch": 0.53, + "grad_norm": 0.4915396273136139, + "learning_rate": 0.0005545258690111224, + "loss": 3.302, + "step": 10869 + }, + { + "epoch": 0.53, + "grad_norm": 0.5859756469726562, + "learning_rate": 0.0005545177190977923, + "loss": 3.1863, + "step": 10870 + }, + { + "epoch": 0.53, + "grad_norm": 0.500820517539978, + "learning_rate": 0.0005545095685141113, + "loss": 3.3541, + "step": 10871 + }, + { + "epoch": 0.53, + "grad_norm": 0.5108267068862915, + "learning_rate": 0.0005545014172601014, + "loss": 3.0616, + "step": 10872 + }, + { + "epoch": 0.53, + "grad_norm": 0.5519191026687622, + "learning_rate": 0.0005544932653357835, + "loss": 3.3225, + "step": 10873 + }, + { + "epoch": 0.53, + "grad_norm": 0.4729250967502594, + "learning_rate": 0.0005544851127411793, + "loss": 3.2311, + "step": 10874 + }, + { + "epoch": 0.53, + "grad_norm": 0.5374777317047119, + "learning_rate": 0.0005544769594763104, + "loss": 3.1297, + "step": 10875 + }, + { + "epoch": 0.53, + "grad_norm": 0.4953976273536682, + "learning_rate": 0.0005544688055411981, + "loss": 3.0189, + "step": 10876 + }, + { + "epoch": 0.53, + "grad_norm": 0.6086330413818359, + "learning_rate": 0.0005544606509358639, + "loss": 3.0814, + "step": 10877 + }, + { + "epoch": 0.53, + "grad_norm": 0.5771964192390442, + "learning_rate": 0.0005544524956603294, + "loss": 3.194, + "step": 10878 + }, + { + "epoch": 0.53, + "grad_norm": 0.5086356997489929, + "learning_rate": 0.000554444339714616, + "loss": 3.1714, + "step": 10879 + }, + { + "epoch": 0.53, + "grad_norm": 0.4892556071281433, + "learning_rate": 0.0005544361830987451, + "loss": 3.2823, + "step": 10880 + }, + { + "epoch": 0.53, + "grad_norm": 0.5008576512336731, + "learning_rate": 0.0005544280258127383, + "loss": 3.2336, + "step": 10881 + }, + { + "epoch": 0.53, + "grad_norm": 0.5014019012451172, + "learning_rate": 0.0005544198678566171, + "loss": 3.321, + "step": 10882 + }, + { + "epoch": 0.53, + "grad_norm": 0.5316650867462158, + "learning_rate": 0.0005544117092304029, + "loss": 3.4712, + "step": 10883 + }, + { + "epoch": 0.53, + "grad_norm": 0.4876498878002167, + "learning_rate": 0.0005544035499341172, + "loss": 3.4199, + "step": 10884 + }, + { + "epoch": 0.53, + "grad_norm": 0.5168114304542542, + "learning_rate": 0.0005543953899677815, + "loss": 3.1632, + "step": 10885 + }, + { + "epoch": 0.53, + "grad_norm": 0.5220661163330078, + "learning_rate": 0.0005543872293314174, + "loss": 3.0831, + "step": 10886 + }, + { + "epoch": 0.53, + "grad_norm": 0.4988704025745392, + "learning_rate": 0.0005543790680250461, + "loss": 3.2857, + "step": 10887 + }, + { + "epoch": 0.53, + "grad_norm": 0.4903241991996765, + "learning_rate": 0.0005543709060486895, + "loss": 3.2408, + "step": 10888 + }, + { + "epoch": 0.53, + "grad_norm": 0.5027443170547485, + "learning_rate": 0.0005543627434023688, + "loss": 3.2762, + "step": 10889 + }, + { + "epoch": 0.53, + "grad_norm": 0.4971694350242615, + "learning_rate": 0.0005543545800861055, + "loss": 3.1649, + "step": 10890 + }, + { + "epoch": 0.53, + "grad_norm": 0.5153796672821045, + "learning_rate": 0.0005543464160999214, + "loss": 3.3079, + "step": 10891 + }, + { + "epoch": 0.53, + "grad_norm": 0.5482482314109802, + "learning_rate": 0.0005543382514438376, + "loss": 3.0886, + "step": 10892 + }, + { + "epoch": 0.53, + "grad_norm": 0.5379815101623535, + "learning_rate": 0.0005543300861178759, + "loss": 3.1402, + "step": 10893 + }, + { + "epoch": 0.53, + "grad_norm": 0.5130733251571655, + "learning_rate": 0.0005543219201220576, + "loss": 3.5, + "step": 10894 + }, + { + "epoch": 0.53, + "grad_norm": 0.47875428199768066, + "learning_rate": 0.0005543137534564044, + "loss": 3.1322, + "step": 10895 + }, + { + "epoch": 0.53, + "grad_norm": 0.5776912569999695, + "learning_rate": 0.0005543055861209376, + "loss": 3.1212, + "step": 10896 + }, + { + "epoch": 0.53, + "grad_norm": 0.5165498852729797, + "learning_rate": 0.0005542974181156789, + "loss": 3.1643, + "step": 10897 + }, + { + "epoch": 0.53, + "grad_norm": 0.5151985883712769, + "learning_rate": 0.0005542892494406499, + "loss": 3.0556, + "step": 10898 + }, + { + "epoch": 0.53, + "grad_norm": 0.5367438793182373, + "learning_rate": 0.0005542810800958718, + "loss": 3.2911, + "step": 10899 + }, + { + "epoch": 0.53, + "grad_norm": 0.49575287103652954, + "learning_rate": 0.0005542729100813662, + "loss": 3.3014, + "step": 10900 + }, + { + "epoch": 0.53, + "grad_norm": 0.5376871824264526, + "learning_rate": 0.0005542647393971548, + "loss": 3.2976, + "step": 10901 + }, + { + "epoch": 0.53, + "grad_norm": 0.5368502140045166, + "learning_rate": 0.000554256568043259, + "loss": 3.4037, + "step": 10902 + }, + { + "epoch": 0.53, + "grad_norm": 0.5264731049537659, + "learning_rate": 0.0005542483960197002, + "loss": 3.3177, + "step": 10903 + }, + { + "epoch": 0.53, + "grad_norm": 0.5258291363716125, + "learning_rate": 0.0005542402233265003, + "loss": 3.0652, + "step": 10904 + }, + { + "epoch": 0.53, + "grad_norm": 0.5016001462936401, + "learning_rate": 0.0005542320499636804, + "loss": 3.3955, + "step": 10905 + }, + { + "epoch": 0.53, + "grad_norm": 0.5093668103218079, + "learning_rate": 0.0005542238759312623, + "loss": 3.1226, + "step": 10906 + }, + { + "epoch": 0.53, + "grad_norm": 0.504332959651947, + "learning_rate": 0.0005542157012292673, + "loss": 3.2307, + "step": 10907 + }, + { + "epoch": 0.53, + "grad_norm": 0.5304985642433167, + "learning_rate": 0.0005542075258577172, + "loss": 3.1603, + "step": 10908 + }, + { + "epoch": 0.53, + "grad_norm": 0.5355924367904663, + "learning_rate": 0.0005541993498166334, + "loss": 3.4154, + "step": 10909 + }, + { + "epoch": 0.53, + "grad_norm": 0.598781943321228, + "learning_rate": 0.0005541911731060374, + "loss": 3.4022, + "step": 10910 + }, + { + "epoch": 0.53, + "grad_norm": 0.5250735282897949, + "learning_rate": 0.0005541829957259508, + "loss": 3.0233, + "step": 10911 + }, + { + "epoch": 0.53, + "grad_norm": 0.5747032761573792, + "learning_rate": 0.0005541748176763951, + "loss": 3.1773, + "step": 10912 + }, + { + "epoch": 0.53, + "grad_norm": 0.5299064517021179, + "learning_rate": 0.0005541666389573918, + "loss": 2.9314, + "step": 10913 + }, + { + "epoch": 0.53, + "grad_norm": 0.4818550646305084, + "learning_rate": 0.0005541584595689624, + "loss": 3.2298, + "step": 10914 + }, + { + "epoch": 0.53, + "grad_norm": 0.5289633274078369, + "learning_rate": 0.0005541502795111288, + "loss": 3.26, + "step": 10915 + }, + { + "epoch": 0.53, + "grad_norm": 0.5091275572776794, + "learning_rate": 0.000554142098783912, + "loss": 3.2175, + "step": 10916 + }, + { + "epoch": 0.54, + "grad_norm": 0.5333389639854431, + "learning_rate": 0.000554133917387334, + "loss": 3.6551, + "step": 10917 + }, + { + "epoch": 0.54, + "grad_norm": 0.4829789400100708, + "learning_rate": 0.0005541257353214161, + "loss": 3.1711, + "step": 10918 + }, + { + "epoch": 0.54, + "grad_norm": 0.5595918297767639, + "learning_rate": 0.00055411755258618, + "loss": 3.2935, + "step": 10919 + }, + { + "epoch": 0.54, + "grad_norm": 0.5571411848068237, + "learning_rate": 0.000554109369181647, + "loss": 3.2141, + "step": 10920 + }, + { + "epoch": 0.54, + "grad_norm": 0.534190833568573, + "learning_rate": 0.000554101185107839, + "loss": 3.3717, + "step": 10921 + }, + { + "epoch": 0.54, + "grad_norm": 0.502724289894104, + "learning_rate": 0.0005540930003647773, + "loss": 3.2407, + "step": 10922 + }, + { + "epoch": 0.54, + "grad_norm": 0.5559344291687012, + "learning_rate": 0.0005540848149524835, + "loss": 3.1769, + "step": 10923 + }, + { + "epoch": 0.54, + "grad_norm": 0.5261440277099609, + "learning_rate": 0.0005540766288709792, + "loss": 3.3852, + "step": 10924 + }, + { + "epoch": 0.54, + "grad_norm": 0.5682399272918701, + "learning_rate": 0.000554068442120286, + "loss": 3.2593, + "step": 10925 + }, + { + "epoch": 0.54, + "grad_norm": 0.4791615605354309, + "learning_rate": 0.0005540602547004255, + "loss": 3.428, + "step": 10926 + }, + { + "epoch": 0.54, + "grad_norm": 0.5266600847244263, + "learning_rate": 0.000554052066611419, + "loss": 3.3388, + "step": 10927 + }, + { + "epoch": 0.54, + "grad_norm": 0.5460447072982788, + "learning_rate": 0.0005540438778532885, + "loss": 3.0518, + "step": 10928 + }, + { + "epoch": 0.54, + "grad_norm": 0.534444272518158, + "learning_rate": 0.0005540356884260551, + "loss": 3.2383, + "step": 10929 + }, + { + "epoch": 0.54, + "grad_norm": 0.5190091729164124, + "learning_rate": 0.0005540274983297407, + "loss": 3.2742, + "step": 10930 + }, + { + "epoch": 0.54, + "grad_norm": 0.5553072094917297, + "learning_rate": 0.0005540193075643668, + "loss": 3.304, + "step": 10931 + }, + { + "epoch": 0.54, + "grad_norm": 0.5311570167541504, + "learning_rate": 0.0005540111161299547, + "loss": 3.3071, + "step": 10932 + }, + { + "epoch": 0.54, + "grad_norm": 0.523281455039978, + "learning_rate": 0.0005540029240265265, + "loss": 3.2171, + "step": 10933 + }, + { + "epoch": 0.54, + "grad_norm": 0.5270448327064514, + "learning_rate": 0.0005539947312541033, + "loss": 3.3145, + "step": 10934 + }, + { + "epoch": 0.54, + "grad_norm": 0.5168893933296204, + "learning_rate": 0.0005539865378127069, + "loss": 3.2543, + "step": 10935 + }, + { + "epoch": 0.54, + "grad_norm": 0.5505146384239197, + "learning_rate": 0.0005539783437023588, + "loss": 3.0689, + "step": 10936 + }, + { + "epoch": 0.54, + "grad_norm": 0.48953521251678467, + "learning_rate": 0.0005539701489230807, + "loss": 3.2071, + "step": 10937 + }, + { + "epoch": 0.54, + "grad_norm": 0.5777466297149658, + "learning_rate": 0.000553961953474894, + "loss": 3.228, + "step": 10938 + }, + { + "epoch": 0.54, + "grad_norm": 0.538305938243866, + "learning_rate": 0.0005539537573578205, + "loss": 3.2121, + "step": 10939 + }, + { + "epoch": 0.54, + "grad_norm": 0.5043644905090332, + "learning_rate": 0.0005539455605718817, + "loss": 3.3311, + "step": 10940 + }, + { + "epoch": 0.54, + "grad_norm": 0.5230538845062256, + "learning_rate": 0.000553937363117099, + "loss": 3.0433, + "step": 10941 + }, + { + "epoch": 0.54, + "grad_norm": 0.49755704402923584, + "learning_rate": 0.0005539291649934943, + "loss": 3.2049, + "step": 10942 + }, + { + "epoch": 0.54, + "grad_norm": 0.49073851108551025, + "learning_rate": 0.0005539209662010889, + "loss": 3.0794, + "step": 10943 + }, + { + "epoch": 0.54, + "grad_norm": 0.5318127274513245, + "learning_rate": 0.0005539127667399048, + "loss": 3.3468, + "step": 10944 + }, + { + "epoch": 0.54, + "grad_norm": 0.5030101537704468, + "learning_rate": 0.0005539045666099632, + "loss": 3.3754, + "step": 10945 + }, + { + "epoch": 0.54, + "grad_norm": 0.5165680646896362, + "learning_rate": 0.0005538963658112858, + "loss": 3.102, + "step": 10946 + }, + { + "epoch": 0.54, + "grad_norm": 0.48056358098983765, + "learning_rate": 0.0005538881643438943, + "loss": 3.3966, + "step": 10947 + }, + { + "epoch": 0.54, + "grad_norm": 0.517524242401123, + "learning_rate": 0.0005538799622078101, + "loss": 3.3516, + "step": 10948 + }, + { + "epoch": 0.54, + "grad_norm": 0.5031284689903259, + "learning_rate": 0.000553871759403055, + "loss": 3.0759, + "step": 10949 + }, + { + "epoch": 0.54, + "grad_norm": 0.5510088801383972, + "learning_rate": 0.0005538635559296507, + "loss": 3.1852, + "step": 10950 + }, + { + "epoch": 0.54, + "grad_norm": 0.48190462589263916, + "learning_rate": 0.0005538553517876185, + "loss": 3.5855, + "step": 10951 + }, + { + "epoch": 0.54, + "grad_norm": 0.518423318862915, + "learning_rate": 0.0005538471469769802, + "loss": 3.4513, + "step": 10952 + }, + { + "epoch": 0.54, + "grad_norm": 0.5671064257621765, + "learning_rate": 0.0005538389414977573, + "loss": 3.1931, + "step": 10953 + }, + { + "epoch": 0.54, + "grad_norm": 0.5123745799064636, + "learning_rate": 0.0005538307353499715, + "loss": 3.2022, + "step": 10954 + }, + { + "epoch": 0.54, + "grad_norm": 0.4641689658164978, + "learning_rate": 0.0005538225285336445, + "loss": 3.3417, + "step": 10955 + }, + { + "epoch": 0.54, + "grad_norm": 0.5528790950775146, + "learning_rate": 0.0005538143210487977, + "loss": 3.3302, + "step": 10956 + }, + { + "epoch": 0.54, + "grad_norm": 0.5319774150848389, + "learning_rate": 0.000553806112895453, + "loss": 3.1984, + "step": 10957 + }, + { + "epoch": 0.54, + "grad_norm": 0.5352564454078674, + "learning_rate": 0.0005537979040736317, + "loss": 3.2959, + "step": 10958 + }, + { + "epoch": 0.54, + "grad_norm": 0.5923522710800171, + "learning_rate": 0.0005537896945833555, + "loss": 3.2073, + "step": 10959 + }, + { + "epoch": 0.54, + "grad_norm": 0.5047934651374817, + "learning_rate": 0.0005537814844246462, + "loss": 3.5024, + "step": 10960 + }, + { + "epoch": 0.54, + "grad_norm": 0.5356642007827759, + "learning_rate": 0.0005537732735975252, + "loss": 3.2996, + "step": 10961 + }, + { + "epoch": 0.54, + "grad_norm": 0.5496769547462463, + "learning_rate": 0.0005537650621020143, + "loss": 3.2524, + "step": 10962 + }, + { + "epoch": 0.54, + "grad_norm": 0.5538159608840942, + "learning_rate": 0.0005537568499381351, + "loss": 3.1377, + "step": 10963 + }, + { + "epoch": 0.54, + "grad_norm": 0.5073235630989075, + "learning_rate": 0.0005537486371059092, + "loss": 3.4908, + "step": 10964 + }, + { + "epoch": 0.54, + "grad_norm": 0.5352749228477478, + "learning_rate": 0.0005537404236053582, + "loss": 3.3659, + "step": 10965 + }, + { + "epoch": 0.54, + "grad_norm": 0.501549482345581, + "learning_rate": 0.0005537322094365038, + "loss": 3.4548, + "step": 10966 + }, + { + "epoch": 0.54, + "grad_norm": 0.5620731115341187, + "learning_rate": 0.0005537239945993675, + "loss": 3.1772, + "step": 10967 + }, + { + "epoch": 0.54, + "grad_norm": 0.5060946345329285, + "learning_rate": 0.0005537157790939711, + "loss": 3.1628, + "step": 10968 + }, + { + "epoch": 0.54, + "grad_norm": 0.5136206150054932, + "learning_rate": 0.0005537075629203361, + "loss": 3.1592, + "step": 10969 + }, + { + "epoch": 0.54, + "grad_norm": 0.5491225123405457, + "learning_rate": 0.0005536993460784843, + "loss": 3.1226, + "step": 10970 + }, + { + "epoch": 0.54, + "grad_norm": 0.5510766506195068, + "learning_rate": 0.0005536911285684372, + "loss": 3.2825, + "step": 10971 + }, + { + "epoch": 0.54, + "grad_norm": 0.59326171875, + "learning_rate": 0.0005536829103902164, + "loss": 3.2038, + "step": 10972 + }, + { + "epoch": 0.54, + "grad_norm": 0.5343856811523438, + "learning_rate": 0.0005536746915438438, + "loss": 3.2069, + "step": 10973 + }, + { + "epoch": 0.54, + "grad_norm": 0.524446964263916, + "learning_rate": 0.0005536664720293408, + "loss": 3.1553, + "step": 10974 + }, + { + "epoch": 0.54, + "grad_norm": 0.5818180441856384, + "learning_rate": 0.0005536582518467292, + "loss": 3.2898, + "step": 10975 + }, + { + "epoch": 0.54, + "grad_norm": 0.49687114357948303, + "learning_rate": 0.0005536500309960304, + "loss": 3.4254, + "step": 10976 + }, + { + "epoch": 0.54, + "grad_norm": 0.5242406725883484, + "learning_rate": 0.0005536418094772664, + "loss": 3.1369, + "step": 10977 + }, + { + "epoch": 0.54, + "grad_norm": 0.5363040566444397, + "learning_rate": 0.0005536335872904587, + "loss": 3.2857, + "step": 10978 + }, + { + "epoch": 0.54, + "grad_norm": 0.5277572870254517, + "learning_rate": 0.0005536253644356289, + "loss": 3.2198, + "step": 10979 + }, + { + "epoch": 0.54, + "grad_norm": 0.5251345634460449, + "learning_rate": 0.0005536171409127986, + "loss": 3.3776, + "step": 10980 + }, + { + "epoch": 0.54, + "grad_norm": 0.4895996153354645, + "learning_rate": 0.0005536089167219897, + "loss": 3.1118, + "step": 10981 + }, + { + "epoch": 0.54, + "grad_norm": 0.550209105014801, + "learning_rate": 0.0005536006918632236, + "loss": 3.3635, + "step": 10982 + }, + { + "epoch": 0.54, + "grad_norm": 0.5238742232322693, + "learning_rate": 0.0005535924663365221, + "loss": 3.37, + "step": 10983 + }, + { + "epoch": 0.54, + "grad_norm": 0.5147226452827454, + "learning_rate": 0.0005535842401419071, + "loss": 3.3436, + "step": 10984 + }, + { + "epoch": 0.54, + "grad_norm": 0.5709726214408875, + "learning_rate": 0.0005535760132793997, + "loss": 3.2941, + "step": 10985 + }, + { + "epoch": 0.54, + "grad_norm": 0.5367663502693176, + "learning_rate": 0.000553567785749022, + "loss": 3.2, + "step": 10986 + }, + { + "epoch": 0.54, + "grad_norm": 0.5051316022872925, + "learning_rate": 0.0005535595575507955, + "loss": 3.2523, + "step": 10987 + }, + { + "epoch": 0.54, + "grad_norm": 0.502051830291748, + "learning_rate": 0.0005535513286847421, + "loss": 3.2888, + "step": 10988 + }, + { + "epoch": 0.54, + "grad_norm": 0.5447306632995605, + "learning_rate": 0.0005535430991508831, + "loss": 3.2808, + "step": 10989 + }, + { + "epoch": 0.54, + "grad_norm": 0.5220550894737244, + "learning_rate": 0.0005535348689492404, + "loss": 3.1534, + "step": 10990 + }, + { + "epoch": 0.54, + "grad_norm": 0.6093209385871887, + "learning_rate": 0.0005535266380798358, + "loss": 3.1002, + "step": 10991 + }, + { + "epoch": 0.54, + "grad_norm": 0.51113361120224, + "learning_rate": 0.0005535184065426907, + "loss": 3.2006, + "step": 10992 + }, + { + "epoch": 0.54, + "grad_norm": 0.5078108310699463, + "learning_rate": 0.000553510174337827, + "loss": 3.1692, + "step": 10993 + }, + { + "epoch": 0.54, + "grad_norm": 0.5096676349639893, + "learning_rate": 0.0005535019414652662, + "loss": 3.2649, + "step": 10994 + }, + { + "epoch": 0.54, + "grad_norm": 0.6298714280128479, + "learning_rate": 0.0005534937079250301, + "loss": 3.0877, + "step": 10995 + }, + { + "epoch": 0.54, + "grad_norm": 0.5195075869560242, + "learning_rate": 0.0005534854737171402, + "loss": 3.3914, + "step": 10996 + }, + { + "epoch": 0.54, + "grad_norm": 0.5262744426727295, + "learning_rate": 0.0005534772388416186, + "loss": 3.3941, + "step": 10997 + }, + { + "epoch": 0.54, + "grad_norm": 0.5323526263237, + "learning_rate": 0.0005534690032984866, + "loss": 3.2896, + "step": 10998 + }, + { + "epoch": 0.54, + "grad_norm": 0.567234456539154, + "learning_rate": 0.0005534607670877661, + "loss": 3.1214, + "step": 10999 + }, + { + "epoch": 0.54, + "grad_norm": 0.49547919631004333, + "learning_rate": 0.0005534525302094787, + "loss": 3.4712, + "step": 11000 + }, + { + "epoch": 0.54, + "grad_norm": 0.5253562331199646, + "learning_rate": 0.0005534442926636461, + "loss": 3.4296, + "step": 11001 + }, + { + "epoch": 0.54, + "grad_norm": 0.536155104637146, + "learning_rate": 0.00055343605445029, + "loss": 3.2442, + "step": 11002 + }, + { + "epoch": 0.54, + "grad_norm": 0.5386752486228943, + "learning_rate": 0.0005534278155694321, + "loss": 3.1203, + "step": 11003 + }, + { + "epoch": 0.54, + "grad_norm": 0.5196104049682617, + "learning_rate": 0.0005534195760210941, + "loss": 3.2681, + "step": 11004 + }, + { + "epoch": 0.54, + "grad_norm": 0.5381636023521423, + "learning_rate": 0.0005534113358052977, + "loss": 3.2436, + "step": 11005 + }, + { + "epoch": 0.54, + "grad_norm": 0.5484285950660706, + "learning_rate": 0.0005534030949220646, + "loss": 3.2857, + "step": 11006 + }, + { + "epoch": 0.54, + "grad_norm": 0.5643191933631897, + "learning_rate": 0.0005533948533714166, + "loss": 3.2674, + "step": 11007 + }, + { + "epoch": 0.54, + "grad_norm": 0.5526049733161926, + "learning_rate": 0.0005533866111533753, + "loss": 3.2277, + "step": 11008 + }, + { + "epoch": 0.54, + "grad_norm": 0.5223939418792725, + "learning_rate": 0.0005533783682679624, + "loss": 3.3033, + "step": 11009 + }, + { + "epoch": 0.54, + "grad_norm": 0.5286209583282471, + "learning_rate": 0.0005533701247151996, + "loss": 3.2965, + "step": 11010 + }, + { + "epoch": 0.54, + "grad_norm": 0.5266342759132385, + "learning_rate": 0.0005533618804951087, + "loss": 3.2701, + "step": 11011 + }, + { + "epoch": 0.54, + "grad_norm": 0.5430400967597961, + "learning_rate": 0.0005533536356077113, + "loss": 3.0469, + "step": 11012 + }, + { + "epoch": 0.54, + "grad_norm": 0.49393025040626526, + "learning_rate": 0.0005533453900530294, + "loss": 3.1746, + "step": 11013 + }, + { + "epoch": 0.54, + "grad_norm": 0.5422281622886658, + "learning_rate": 0.0005533371438310843, + "loss": 3.3271, + "step": 11014 + }, + { + "epoch": 0.54, + "grad_norm": 0.5507173538208008, + "learning_rate": 0.000553328896941898, + "loss": 3.2458, + "step": 11015 + }, + { + "epoch": 0.54, + "grad_norm": 0.4943542778491974, + "learning_rate": 0.000553320649385492, + "loss": 3.2087, + "step": 11016 + }, + { + "epoch": 0.54, + "grad_norm": 0.5512304902076721, + "learning_rate": 0.0005533124011618884, + "loss": 3.1788, + "step": 11017 + }, + { + "epoch": 0.54, + "grad_norm": 0.45192670822143555, + "learning_rate": 0.0005533041522711085, + "loss": 3.1242, + "step": 11018 + }, + { + "epoch": 0.54, + "grad_norm": 0.5453776717185974, + "learning_rate": 0.0005532959027131743, + "loss": 3.3347, + "step": 11019 + }, + { + "epoch": 0.54, + "grad_norm": 0.5177527070045471, + "learning_rate": 0.0005532876524881075, + "loss": 3.2072, + "step": 11020 + }, + { + "epoch": 0.54, + "grad_norm": 0.5608527660369873, + "learning_rate": 0.0005532794015959296, + "loss": 3.2669, + "step": 11021 + }, + { + "epoch": 0.54, + "grad_norm": 0.5439431667327881, + "learning_rate": 0.0005532711500366625, + "loss": 3.1555, + "step": 11022 + }, + { + "epoch": 0.54, + "grad_norm": 0.5050414204597473, + "learning_rate": 0.000553262897810328, + "loss": 3.3673, + "step": 11023 + }, + { + "epoch": 0.54, + "grad_norm": 0.6056579947471619, + "learning_rate": 0.0005532546449169478, + "loss": 3.0847, + "step": 11024 + }, + { + "epoch": 0.54, + "grad_norm": 0.50806725025177, + "learning_rate": 0.0005532463913565436, + "loss": 3.1318, + "step": 11025 + }, + { + "epoch": 0.54, + "grad_norm": 0.47920647263526917, + "learning_rate": 0.0005532381371291372, + "loss": 3.307, + "step": 11026 + }, + { + "epoch": 0.54, + "grad_norm": 0.49545514583587646, + "learning_rate": 0.0005532298822347501, + "loss": 3.369, + "step": 11027 + }, + { + "epoch": 0.54, + "grad_norm": 0.4789004921913147, + "learning_rate": 0.0005532216266734044, + "loss": 3.2807, + "step": 11028 + }, + { + "epoch": 0.54, + "grad_norm": 0.5694847106933594, + "learning_rate": 0.0005532133704451216, + "loss": 3.2739, + "step": 11029 + }, + { + "epoch": 0.54, + "grad_norm": 0.5591051578521729, + "learning_rate": 0.0005532051135499236, + "loss": 3.2335, + "step": 11030 + }, + { + "epoch": 0.54, + "grad_norm": 0.5124754309654236, + "learning_rate": 0.000553196855987832, + "loss": 3.0694, + "step": 11031 + }, + { + "epoch": 0.54, + "grad_norm": 0.5239121317863464, + "learning_rate": 0.0005531885977588686, + "loss": 3.3521, + "step": 11032 + }, + { + "epoch": 0.54, + "grad_norm": 0.5368714332580566, + "learning_rate": 0.0005531803388630551, + "loss": 3.2614, + "step": 11033 + }, + { + "epoch": 0.54, + "grad_norm": 0.5138024687767029, + "learning_rate": 0.0005531720793004135, + "loss": 3.2224, + "step": 11034 + }, + { + "epoch": 0.54, + "grad_norm": 0.5147234201431274, + "learning_rate": 0.0005531638190709651, + "loss": 3.2612, + "step": 11035 + }, + { + "epoch": 0.54, + "grad_norm": 0.5324978828430176, + "learning_rate": 0.0005531555581747321, + "loss": 3.4883, + "step": 11036 + }, + { + "epoch": 0.54, + "grad_norm": 0.501106321811676, + "learning_rate": 0.0005531472966117361, + "loss": 3.2922, + "step": 11037 + }, + { + "epoch": 0.54, + "grad_norm": 0.5278347134590149, + "learning_rate": 0.0005531390343819987, + "loss": 3.2189, + "step": 11038 + }, + { + "epoch": 0.54, + "grad_norm": 0.5315659046173096, + "learning_rate": 0.0005531307714855419, + "loss": 3.2379, + "step": 11039 + }, + { + "epoch": 0.54, + "grad_norm": 0.523786187171936, + "learning_rate": 0.0005531225079223874, + "loss": 3.2896, + "step": 11040 + }, + { + "epoch": 0.54, + "grad_norm": 0.5235302448272705, + "learning_rate": 0.0005531142436925569, + "loss": 3.5168, + "step": 11041 + }, + { + "epoch": 0.54, + "grad_norm": 0.492496132850647, + "learning_rate": 0.0005531059787960721, + "loss": 3.2794, + "step": 11042 + }, + { + "epoch": 0.54, + "grad_norm": 0.5163391828536987, + "learning_rate": 0.0005530977132329549, + "loss": 3.0546, + "step": 11043 + }, + { + "epoch": 0.54, + "grad_norm": 0.5126833915710449, + "learning_rate": 0.0005530894470032271, + "loss": 3.1981, + "step": 11044 + }, + { + "epoch": 0.54, + "grad_norm": 0.5161365270614624, + "learning_rate": 0.0005530811801069104, + "loss": 3.2133, + "step": 11045 + }, + { + "epoch": 0.54, + "grad_norm": 0.5676143169403076, + "learning_rate": 0.0005530729125440265, + "loss": 3.117, + "step": 11046 + }, + { + "epoch": 0.54, + "grad_norm": 0.5352276563644409, + "learning_rate": 0.0005530646443145973, + "loss": 3.2627, + "step": 11047 + }, + { + "epoch": 0.54, + "grad_norm": 0.5018179416656494, + "learning_rate": 0.0005530563754186444, + "loss": 3.2552, + "step": 11048 + }, + { + "epoch": 0.54, + "grad_norm": 0.5012510418891907, + "learning_rate": 0.0005530481058561899, + "loss": 3.2785, + "step": 11049 + }, + { + "epoch": 0.54, + "grad_norm": 0.5244871973991394, + "learning_rate": 0.0005530398356272554, + "loss": 3.3138, + "step": 11050 + }, + { + "epoch": 0.54, + "grad_norm": 0.5109242796897888, + "learning_rate": 0.0005530315647318626, + "loss": 3.3587, + "step": 11051 + }, + { + "epoch": 0.54, + "grad_norm": 0.5416744351387024, + "learning_rate": 0.0005530232931700333, + "loss": 3.182, + "step": 11052 + }, + { + "epoch": 0.54, + "grad_norm": 0.5090746283531189, + "learning_rate": 0.0005530150209417894, + "loss": 3.1963, + "step": 11053 + }, + { + "epoch": 0.54, + "grad_norm": 0.5169641971588135, + "learning_rate": 0.0005530067480471526, + "loss": 3.3494, + "step": 11054 + }, + { + "epoch": 0.54, + "grad_norm": 0.4892723560333252, + "learning_rate": 0.0005529984744861448, + "loss": 3.3338, + "step": 11055 + }, + { + "epoch": 0.54, + "grad_norm": 0.5152621865272522, + "learning_rate": 0.0005529902002587877, + "loss": 3.3415, + "step": 11056 + }, + { + "epoch": 0.54, + "grad_norm": 0.4981408417224884, + "learning_rate": 0.000552981925365103, + "loss": 3.2972, + "step": 11057 + }, + { + "epoch": 0.54, + "grad_norm": 0.5364402532577515, + "learning_rate": 0.0005529736498051127, + "loss": 3.2753, + "step": 11058 + }, + { + "epoch": 0.54, + "grad_norm": 0.4937974214553833, + "learning_rate": 0.0005529653735788383, + "loss": 3.2908, + "step": 11059 + }, + { + "epoch": 0.54, + "grad_norm": 0.5432580709457397, + "learning_rate": 0.0005529570966863021, + "loss": 3.2897, + "step": 11060 + }, + { + "epoch": 0.54, + "grad_norm": 0.5275623798370361, + "learning_rate": 0.0005529488191275253, + "loss": 3.3915, + "step": 11061 + }, + { + "epoch": 0.54, + "grad_norm": 0.5411865711212158, + "learning_rate": 0.0005529405409025302, + "loss": 3.2582, + "step": 11062 + }, + { + "epoch": 0.54, + "grad_norm": 0.5429436564445496, + "learning_rate": 0.0005529322620113382, + "loss": 3.267, + "step": 11063 + }, + { + "epoch": 0.54, + "grad_norm": 0.5083034038543701, + "learning_rate": 0.0005529239824539715, + "loss": 3.3395, + "step": 11064 + }, + { + "epoch": 0.54, + "grad_norm": 0.49724969267845154, + "learning_rate": 0.0005529157022304516, + "loss": 3.0817, + "step": 11065 + }, + { + "epoch": 0.54, + "grad_norm": 0.48776689171791077, + "learning_rate": 0.0005529074213408004, + "loss": 3.2054, + "step": 11066 + }, + { + "epoch": 0.54, + "grad_norm": 0.4902147054672241, + "learning_rate": 0.0005528991397850397, + "loss": 3.3422, + "step": 11067 + }, + { + "epoch": 0.54, + "grad_norm": 0.5294910073280334, + "learning_rate": 0.0005528908575631914, + "loss": 3.3306, + "step": 11068 + }, + { + "epoch": 0.54, + "grad_norm": 0.5070090293884277, + "learning_rate": 0.0005528825746752773, + "loss": 3.4023, + "step": 11069 + }, + { + "epoch": 0.54, + "grad_norm": 0.5285083055496216, + "learning_rate": 0.0005528742911213191, + "loss": 3.2324, + "step": 11070 + }, + { + "epoch": 0.54, + "grad_norm": 0.513079822063446, + "learning_rate": 0.0005528660069013387, + "loss": 3.2539, + "step": 11071 + }, + { + "epoch": 0.54, + "grad_norm": 0.5381377339363098, + "learning_rate": 0.0005528577220153579, + "loss": 3.4106, + "step": 11072 + }, + { + "epoch": 0.54, + "grad_norm": 0.4675285518169403, + "learning_rate": 0.0005528494364633985, + "loss": 3.4012, + "step": 11073 + }, + { + "epoch": 0.54, + "grad_norm": 0.5249191522598267, + "learning_rate": 0.0005528411502454824, + "loss": 3.1627, + "step": 11074 + }, + { + "epoch": 0.54, + "grad_norm": 0.49168458580970764, + "learning_rate": 0.0005528328633616313, + "loss": 3.2428, + "step": 11075 + }, + { + "epoch": 0.54, + "grad_norm": 0.4863361716270447, + "learning_rate": 0.0005528245758118671, + "loss": 3.2086, + "step": 11076 + }, + { + "epoch": 0.54, + "grad_norm": 0.5234729647636414, + "learning_rate": 0.0005528162875962117, + "loss": 3.4566, + "step": 11077 + }, + { + "epoch": 0.54, + "grad_norm": 0.5098854303359985, + "learning_rate": 0.0005528079987146868, + "loss": 3.3018, + "step": 11078 + }, + { + "epoch": 0.54, + "grad_norm": 0.5080091953277588, + "learning_rate": 0.0005527997091673143, + "loss": 3.3924, + "step": 11079 + }, + { + "epoch": 0.54, + "grad_norm": 0.5155888795852661, + "learning_rate": 0.000552791418954116, + "loss": 3.1819, + "step": 11080 + }, + { + "epoch": 0.54, + "grad_norm": 0.5143377780914307, + "learning_rate": 0.0005527831280751138, + "loss": 3.5005, + "step": 11081 + }, + { + "epoch": 0.54, + "grad_norm": 0.5565736293792725, + "learning_rate": 0.0005527748365303295, + "loss": 3.198, + "step": 11082 + }, + { + "epoch": 0.54, + "grad_norm": 0.5252052545547485, + "learning_rate": 0.0005527665443197849, + "loss": 3.1299, + "step": 11083 + }, + { + "epoch": 0.54, + "grad_norm": 0.5647584199905396, + "learning_rate": 0.0005527582514435017, + "loss": 3.5219, + "step": 11084 + }, + { + "epoch": 0.54, + "grad_norm": 0.581468939781189, + "learning_rate": 0.0005527499579015021, + "loss": 3.3298, + "step": 11085 + }, + { + "epoch": 0.54, + "grad_norm": 0.5266270041465759, + "learning_rate": 0.0005527416636938077, + "loss": 3.1852, + "step": 11086 + }, + { + "epoch": 0.54, + "grad_norm": 0.5025612115859985, + "learning_rate": 0.0005527333688204405, + "loss": 3.3013, + "step": 11087 + }, + { + "epoch": 0.54, + "grad_norm": 0.5126590132713318, + "learning_rate": 0.0005527250732814222, + "loss": 3.4074, + "step": 11088 + }, + { + "epoch": 0.54, + "grad_norm": 0.5125699043273926, + "learning_rate": 0.0005527167770767746, + "loss": 3.0508, + "step": 11089 + }, + { + "epoch": 0.54, + "grad_norm": 0.5985158085823059, + "learning_rate": 0.0005527084802065197, + "loss": 3.5684, + "step": 11090 + }, + { + "epoch": 0.54, + "grad_norm": 0.4821082353591919, + "learning_rate": 0.0005527001826706793, + "loss": 3.2999, + "step": 11091 + }, + { + "epoch": 0.54, + "grad_norm": 0.5150079131126404, + "learning_rate": 0.0005526918844692752, + "loss": 3.3448, + "step": 11092 + }, + { + "epoch": 0.54, + "grad_norm": 0.5428050756454468, + "learning_rate": 0.0005526835856023294, + "loss": 3.0969, + "step": 11093 + }, + { + "epoch": 0.54, + "grad_norm": 0.5639786720275879, + "learning_rate": 0.0005526752860698636, + "loss": 3.2585, + "step": 11094 + }, + { + "epoch": 0.54, + "grad_norm": 0.5006402730941772, + "learning_rate": 0.0005526669858718998, + "loss": 3.2274, + "step": 11095 + }, + { + "epoch": 0.54, + "grad_norm": 0.5905807614326477, + "learning_rate": 0.0005526586850084596, + "loss": 3.3441, + "step": 11096 + }, + { + "epoch": 0.54, + "grad_norm": 0.5085511207580566, + "learning_rate": 0.0005526503834795654, + "loss": 3.416, + "step": 11097 + }, + { + "epoch": 0.54, + "grad_norm": 0.54239422082901, + "learning_rate": 0.0005526420812852384, + "loss": 2.8477, + "step": 11098 + }, + { + "epoch": 0.54, + "grad_norm": 0.5083803534507751, + "learning_rate": 0.0005526337784255009, + "loss": 3.3743, + "step": 11099 + }, + { + "epoch": 0.54, + "grad_norm": 0.4984433352947235, + "learning_rate": 0.0005526254749003746, + "loss": 3.4834, + "step": 11100 + }, + { + "epoch": 0.54, + "grad_norm": 0.5210543870925903, + "learning_rate": 0.0005526171707098814, + "loss": 3.336, + "step": 11101 + }, + { + "epoch": 0.54, + "grad_norm": 0.5217922925949097, + "learning_rate": 0.0005526088658540433, + "loss": 3.1881, + "step": 11102 + }, + { + "epoch": 0.54, + "grad_norm": 0.4963856339454651, + "learning_rate": 0.000552600560332882, + "loss": 3.1313, + "step": 11103 + }, + { + "epoch": 0.54, + "grad_norm": 0.5491219758987427, + "learning_rate": 0.0005525922541464195, + "loss": 3.2162, + "step": 11104 + }, + { + "epoch": 0.54, + "grad_norm": 0.5014486908912659, + "learning_rate": 0.0005525839472946774, + "loss": 3.2393, + "step": 11105 + }, + { + "epoch": 0.54, + "grad_norm": 0.5988680720329285, + "learning_rate": 0.000552575639777678, + "loss": 3.3308, + "step": 11106 + }, + { + "epoch": 0.54, + "grad_norm": 0.5203366279602051, + "learning_rate": 0.0005525673315954428, + "loss": 3.3049, + "step": 11107 + }, + { + "epoch": 0.54, + "grad_norm": 0.5146699547767639, + "learning_rate": 0.0005525590227479941, + "loss": 3.0487, + "step": 11108 + }, + { + "epoch": 0.54, + "grad_norm": 0.5197408199310303, + "learning_rate": 0.0005525507132353533, + "loss": 3.1794, + "step": 11109 + }, + { + "epoch": 0.54, + "grad_norm": 0.5432579517364502, + "learning_rate": 0.0005525424030575427, + "loss": 3.0737, + "step": 11110 + }, + { + "epoch": 0.54, + "grad_norm": 0.5454299449920654, + "learning_rate": 0.000552534092214584, + "loss": 3.1482, + "step": 11111 + }, + { + "epoch": 0.54, + "grad_norm": 0.5839552283287048, + "learning_rate": 0.000552525780706499, + "loss": 3.2161, + "step": 11112 + }, + { + "epoch": 0.54, + "grad_norm": 0.5181378126144409, + "learning_rate": 0.0005525174685333098, + "loss": 3.2954, + "step": 11113 + }, + { + "epoch": 0.54, + "grad_norm": 0.5586148500442505, + "learning_rate": 0.0005525091556950381, + "loss": 3.1837, + "step": 11114 + }, + { + "epoch": 0.54, + "grad_norm": 0.5175696015357971, + "learning_rate": 0.000552500842191706, + "loss": 3.1286, + "step": 11115 + }, + { + "epoch": 0.54, + "grad_norm": 0.5837456583976746, + "learning_rate": 0.0005524925280233351, + "loss": 3.1588, + "step": 11116 + }, + { + "epoch": 0.54, + "grad_norm": 0.5126967430114746, + "learning_rate": 0.0005524842131899476, + "loss": 3.4051, + "step": 11117 + }, + { + "epoch": 0.54, + "grad_norm": 0.49251213669776917, + "learning_rate": 0.0005524758976915652, + "loss": 3.2309, + "step": 11118 + }, + { + "epoch": 0.54, + "grad_norm": 0.5178259015083313, + "learning_rate": 0.0005524675815282099, + "loss": 3.2385, + "step": 11119 + }, + { + "epoch": 0.54, + "grad_norm": 0.5161910653114319, + "learning_rate": 0.0005524592646999035, + "loss": 3.1784, + "step": 11120 + }, + { + "epoch": 0.55, + "grad_norm": 0.5630325078964233, + "learning_rate": 0.0005524509472066683, + "loss": 3.0025, + "step": 11121 + }, + { + "epoch": 0.55, + "grad_norm": 0.5052022337913513, + "learning_rate": 0.0005524426290485255, + "loss": 3.1637, + "step": 11122 + }, + { + "epoch": 0.55, + "grad_norm": 0.5926064252853394, + "learning_rate": 0.0005524343102254976, + "loss": 3.3637, + "step": 11123 + }, + { + "epoch": 0.55, + "grad_norm": 0.5273416638374329, + "learning_rate": 0.0005524259907376063, + "loss": 3.2147, + "step": 11124 + }, + { + "epoch": 0.55, + "grad_norm": 0.552270233631134, + "learning_rate": 0.0005524176705848735, + "loss": 3.1198, + "step": 11125 + }, + { + "epoch": 0.55, + "grad_norm": 0.5399708151817322, + "learning_rate": 0.0005524093497673211, + "loss": 3.0797, + "step": 11126 + }, + { + "epoch": 0.55, + "grad_norm": 0.5599607229232788, + "learning_rate": 0.000552401028284971, + "loss": 3.0212, + "step": 11127 + }, + { + "epoch": 0.55, + "grad_norm": 0.536972165107727, + "learning_rate": 0.0005523927061378453, + "loss": 2.9896, + "step": 11128 + }, + { + "epoch": 0.55, + "grad_norm": 0.5732285976409912, + "learning_rate": 0.0005523843833259658, + "loss": 3.1458, + "step": 11129 + }, + { + "epoch": 0.55, + "grad_norm": 0.5276511311531067, + "learning_rate": 0.0005523760598493544, + "loss": 3.5314, + "step": 11130 + }, + { + "epoch": 0.55, + "grad_norm": 0.506087064743042, + "learning_rate": 0.0005523677357080329, + "loss": 3.3586, + "step": 11131 + }, + { + "epoch": 0.55, + "grad_norm": 0.5099312663078308, + "learning_rate": 0.0005523594109020233, + "loss": 3.328, + "step": 11132 + }, + { + "epoch": 0.55, + "grad_norm": 0.51421719789505, + "learning_rate": 0.0005523510854313478, + "loss": 3.3486, + "step": 11133 + }, + { + "epoch": 0.55, + "grad_norm": 0.5033894181251526, + "learning_rate": 0.000552342759296028, + "loss": 3.2523, + "step": 11134 + }, + { + "epoch": 0.55, + "grad_norm": 0.5557559132575989, + "learning_rate": 0.000552334432496086, + "loss": 3.4129, + "step": 11135 + }, + { + "epoch": 0.55, + "grad_norm": 0.5153203010559082, + "learning_rate": 0.0005523261050315435, + "loss": 3.4043, + "step": 11136 + }, + { + "epoch": 0.55, + "grad_norm": 0.5677420496940613, + "learning_rate": 0.0005523177769024228, + "loss": 3.0835, + "step": 11137 + }, + { + "epoch": 0.55, + "grad_norm": 0.555735170841217, + "learning_rate": 0.0005523094481087455, + "loss": 3.2663, + "step": 11138 + }, + { + "epoch": 0.55, + "grad_norm": 0.5250891447067261, + "learning_rate": 0.0005523011186505338, + "loss": 3.1598, + "step": 11139 + }, + { + "epoch": 0.55, + "grad_norm": 0.501929759979248, + "learning_rate": 0.0005522927885278094, + "loss": 3.3389, + "step": 11140 + }, + { + "epoch": 0.55, + "grad_norm": 0.5323585271835327, + "learning_rate": 0.0005522844577405943, + "loss": 3.2071, + "step": 11141 + }, + { + "epoch": 0.55, + "grad_norm": 0.5253964066505432, + "learning_rate": 0.0005522761262889106, + "loss": 3.0697, + "step": 11142 + }, + { + "epoch": 0.55, + "grad_norm": 0.5121307969093323, + "learning_rate": 0.0005522677941727801, + "loss": 3.1454, + "step": 11143 + }, + { + "epoch": 0.55, + "grad_norm": 0.5765665173530579, + "learning_rate": 0.0005522594613922248, + "loss": 3.3376, + "step": 11144 + }, + { + "epoch": 0.55, + "grad_norm": 0.5566869974136353, + "learning_rate": 0.0005522511279472666, + "loss": 3.2357, + "step": 11145 + }, + { + "epoch": 0.55, + "grad_norm": 0.540515124797821, + "learning_rate": 0.0005522427938379275, + "loss": 3.2131, + "step": 11146 + }, + { + "epoch": 0.55, + "grad_norm": 0.5782142877578735, + "learning_rate": 0.0005522344590642294, + "loss": 3.3415, + "step": 11147 + }, + { + "epoch": 0.55, + "grad_norm": 0.5115688443183899, + "learning_rate": 0.0005522261236261943, + "loss": 3.1543, + "step": 11148 + }, + { + "epoch": 0.55, + "grad_norm": 0.5313143730163574, + "learning_rate": 0.0005522177875238441, + "loss": 3.3948, + "step": 11149 + }, + { + "epoch": 0.55, + "grad_norm": 0.563582718372345, + "learning_rate": 0.0005522094507572009, + "loss": 3.2437, + "step": 11150 + }, + { + "epoch": 0.55, + "grad_norm": 0.514954686164856, + "learning_rate": 0.0005522011133262864, + "loss": 3.3182, + "step": 11151 + }, + { + "epoch": 0.55, + "grad_norm": 0.5384813547134399, + "learning_rate": 0.0005521927752311227, + "loss": 3.309, + "step": 11152 + }, + { + "epoch": 0.55, + "grad_norm": 0.5301558971405029, + "learning_rate": 0.0005521844364717319, + "loss": 3.1738, + "step": 11153 + }, + { + "epoch": 0.55, + "grad_norm": 0.5261275768280029, + "learning_rate": 0.0005521760970481357, + "loss": 3.2791, + "step": 11154 + }, + { + "epoch": 0.55, + "grad_norm": 0.539004385471344, + "learning_rate": 0.0005521677569603563, + "loss": 3.172, + "step": 11155 + }, + { + "epoch": 0.55, + "grad_norm": 0.5053558945655823, + "learning_rate": 0.0005521594162084155, + "loss": 3.207, + "step": 11156 + }, + { + "epoch": 0.55, + "grad_norm": 0.4946291744709015, + "learning_rate": 0.0005521510747923353, + "loss": 3.1878, + "step": 11157 + }, + { + "epoch": 0.55, + "grad_norm": 0.5328413844108582, + "learning_rate": 0.0005521427327121376, + "loss": 3.2071, + "step": 11158 + }, + { + "epoch": 0.55, + "grad_norm": 0.49959659576416016, + "learning_rate": 0.0005521343899678447, + "loss": 3.3371, + "step": 11159 + }, + { + "epoch": 0.55, + "grad_norm": 0.535815417766571, + "learning_rate": 0.0005521260465594782, + "loss": 3.2255, + "step": 11160 + }, + { + "epoch": 0.55, + "grad_norm": 0.5187474489212036, + "learning_rate": 0.0005521177024870602, + "loss": 3.2528, + "step": 11161 + }, + { + "epoch": 0.55, + "grad_norm": 0.5385481715202332, + "learning_rate": 0.0005521093577506128, + "loss": 3.2155, + "step": 11162 + }, + { + "epoch": 0.55, + "grad_norm": 0.5287864804267883, + "learning_rate": 0.0005521010123501578, + "loss": 3.2346, + "step": 11163 + }, + { + "epoch": 0.55, + "grad_norm": 0.501727819442749, + "learning_rate": 0.0005520926662857174, + "loss": 3.2213, + "step": 11164 + }, + { + "epoch": 0.55, + "grad_norm": 0.5046778321266174, + "learning_rate": 0.0005520843195573132, + "loss": 3.3777, + "step": 11165 + }, + { + "epoch": 0.55, + "grad_norm": 0.5383339524269104, + "learning_rate": 0.0005520759721649676, + "loss": 3.3155, + "step": 11166 + }, + { + "epoch": 0.55, + "grad_norm": 0.49202319979667664, + "learning_rate": 0.0005520676241087023, + "loss": 3.0838, + "step": 11167 + }, + { + "epoch": 0.55, + "grad_norm": 0.4806743562221527, + "learning_rate": 0.0005520592753885394, + "loss": 3.3125, + "step": 11168 + }, + { + "epoch": 0.55, + "grad_norm": 0.5282479524612427, + "learning_rate": 0.0005520509260045009, + "loss": 3.1906, + "step": 11169 + }, + { + "epoch": 0.55, + "grad_norm": 0.5135144591331482, + "learning_rate": 0.0005520425759566087, + "loss": 3.1592, + "step": 11170 + }, + { + "epoch": 0.55, + "grad_norm": 0.5296515822410583, + "learning_rate": 0.000552034225244885, + "loss": 3.3004, + "step": 11171 + }, + { + "epoch": 0.55, + "grad_norm": 0.5206128358840942, + "learning_rate": 0.0005520258738693516, + "loss": 3.308, + "step": 11172 + }, + { + "epoch": 0.55, + "grad_norm": 0.5016621351242065, + "learning_rate": 0.0005520175218300305, + "loss": 3.2374, + "step": 11173 + }, + { + "epoch": 0.55, + "grad_norm": 0.5277829170227051, + "learning_rate": 0.0005520091691269438, + "loss": 3.2679, + "step": 11174 + }, + { + "epoch": 0.55, + "grad_norm": 0.48501524329185486, + "learning_rate": 0.0005520008157601134, + "loss": 3.3124, + "step": 11175 + }, + { + "epoch": 0.55, + "grad_norm": 0.5397695899009705, + "learning_rate": 0.0005519924617295613, + "loss": 3.0827, + "step": 11176 + }, + { + "epoch": 0.55, + "grad_norm": 0.4927702844142914, + "learning_rate": 0.0005519841070353097, + "loss": 3.2994, + "step": 11177 + }, + { + "epoch": 0.55, + "grad_norm": 0.48005515336990356, + "learning_rate": 0.0005519757516773804, + "loss": 3.4319, + "step": 11178 + }, + { + "epoch": 0.55, + "grad_norm": 0.5230315327644348, + "learning_rate": 0.0005519673956557954, + "loss": 3.2013, + "step": 11179 + }, + { + "epoch": 0.55, + "grad_norm": 0.5246223211288452, + "learning_rate": 0.0005519590389705769, + "loss": 3.2824, + "step": 11180 + }, + { + "epoch": 0.55, + "grad_norm": 0.5252644419670105, + "learning_rate": 0.0005519506816217466, + "loss": 3.323, + "step": 11181 + }, + { + "epoch": 0.55, + "grad_norm": 0.5019978284835815, + "learning_rate": 0.0005519423236093268, + "loss": 3.3634, + "step": 11182 + }, + { + "epoch": 0.55, + "grad_norm": 0.5068698525428772, + "learning_rate": 0.0005519339649333394, + "loss": 3.2813, + "step": 11183 + }, + { + "epoch": 0.55, + "grad_norm": 0.5238425731658936, + "learning_rate": 0.0005519256055938064, + "loss": 3.2488, + "step": 11184 + }, + { + "epoch": 0.55, + "grad_norm": 0.51482093334198, + "learning_rate": 0.0005519172455907499, + "loss": 3.3138, + "step": 11185 + }, + { + "epoch": 0.55, + "grad_norm": 0.5190244317054749, + "learning_rate": 0.0005519088849241918, + "loss": 3.246, + "step": 11186 + }, + { + "epoch": 0.55, + "grad_norm": 0.49023398756980896, + "learning_rate": 0.0005519005235941542, + "loss": 3.2134, + "step": 11187 + }, + { + "epoch": 0.55, + "grad_norm": 0.5314106345176697, + "learning_rate": 0.0005518921616006591, + "loss": 3.134, + "step": 11188 + }, + { + "epoch": 0.55, + "grad_norm": 0.4904386103153229, + "learning_rate": 0.0005518837989437285, + "loss": 3.3674, + "step": 11189 + }, + { + "epoch": 0.55, + "grad_norm": 0.49414244294166565, + "learning_rate": 0.0005518754356233845, + "loss": 3.0914, + "step": 11190 + }, + { + "epoch": 0.55, + "grad_norm": 0.5230140089988708, + "learning_rate": 0.000551867071639649, + "loss": 3.1609, + "step": 11191 + }, + { + "epoch": 0.55, + "grad_norm": 0.5059942007064819, + "learning_rate": 0.0005518587069925442, + "loss": 3.2041, + "step": 11192 + }, + { + "epoch": 0.55, + "grad_norm": 0.5030388832092285, + "learning_rate": 0.000551850341682092, + "loss": 3.1284, + "step": 11193 + }, + { + "epoch": 0.55, + "grad_norm": 0.5302931666374207, + "learning_rate": 0.0005518419757083145, + "loss": 3.2589, + "step": 11194 + }, + { + "epoch": 0.55, + "grad_norm": 0.5348262786865234, + "learning_rate": 0.0005518336090712337, + "loss": 3.3606, + "step": 11195 + }, + { + "epoch": 0.55, + "grad_norm": 0.5146247148513794, + "learning_rate": 0.0005518252417708716, + "loss": 3.214, + "step": 11196 + }, + { + "epoch": 0.55, + "grad_norm": 0.5075222253799438, + "learning_rate": 0.0005518168738072502, + "loss": 3.259, + "step": 11197 + }, + { + "epoch": 0.55, + "grad_norm": 0.5404617190361023, + "learning_rate": 0.0005518085051803918, + "loss": 3.3196, + "step": 11198 + }, + { + "epoch": 0.55, + "grad_norm": 0.5376437306404114, + "learning_rate": 0.0005518001358903182, + "loss": 3.0802, + "step": 11199 + }, + { + "epoch": 0.55, + "grad_norm": 0.5305649638175964, + "learning_rate": 0.0005517917659370513, + "loss": 3.2274, + "step": 11200 + }, + { + "epoch": 0.55, + "grad_norm": 0.533257246017456, + "learning_rate": 0.0005517833953206135, + "loss": 3.1717, + "step": 11201 + }, + { + "epoch": 0.55, + "grad_norm": 0.5252943634986877, + "learning_rate": 0.0005517750240410268, + "loss": 3.0743, + "step": 11202 + }, + { + "epoch": 0.55, + "grad_norm": 0.4910027086734772, + "learning_rate": 0.000551766652098313, + "loss": 3.5248, + "step": 11203 + }, + { + "epoch": 0.55, + "grad_norm": 0.5113924741744995, + "learning_rate": 0.0005517582794924943, + "loss": 3.1122, + "step": 11204 + }, + { + "epoch": 0.55, + "grad_norm": 0.5466858744621277, + "learning_rate": 0.0005517499062235928, + "loss": 3.2754, + "step": 11205 + }, + { + "epoch": 0.55, + "grad_norm": 0.5288564562797546, + "learning_rate": 0.0005517415322916305, + "loss": 3.3129, + "step": 11206 + }, + { + "epoch": 0.55, + "grad_norm": 0.5409068465232849, + "learning_rate": 0.0005517331576966294, + "loss": 3.2236, + "step": 11207 + }, + { + "epoch": 0.55, + "grad_norm": 0.581696093082428, + "learning_rate": 0.0005517247824386117, + "loss": 3.5911, + "step": 11208 + }, + { + "epoch": 0.55, + "grad_norm": 0.4891887903213501, + "learning_rate": 0.0005517164065175993, + "loss": 3.3352, + "step": 11209 + }, + { + "epoch": 0.55, + "grad_norm": 0.5119558572769165, + "learning_rate": 0.0005517080299336143, + "loss": 3.1011, + "step": 11210 + }, + { + "epoch": 0.55, + "grad_norm": 0.5541062355041504, + "learning_rate": 0.0005516996526866788, + "loss": 3.2571, + "step": 11211 + }, + { + "epoch": 0.55, + "grad_norm": 0.5030975341796875, + "learning_rate": 0.0005516912747768149, + "loss": 3.1197, + "step": 11212 + }, + { + "epoch": 0.55, + "grad_norm": 0.5093387365341187, + "learning_rate": 0.0005516828962040446, + "loss": 3.2713, + "step": 11213 + }, + { + "epoch": 0.55, + "grad_norm": 0.525682806968689, + "learning_rate": 0.00055167451696839, + "loss": 3.2748, + "step": 11214 + }, + { + "epoch": 0.55, + "grad_norm": 0.565834105014801, + "learning_rate": 0.0005516661370698732, + "loss": 3.0386, + "step": 11215 + }, + { + "epoch": 0.55, + "grad_norm": 0.5513453483581543, + "learning_rate": 0.0005516577565085162, + "loss": 3.2004, + "step": 11216 + }, + { + "epoch": 0.55, + "grad_norm": 0.5197407007217407, + "learning_rate": 0.0005516493752843411, + "loss": 3.1911, + "step": 11217 + }, + { + "epoch": 0.55, + "grad_norm": 0.5003480315208435, + "learning_rate": 0.00055164099339737, + "loss": 3.3038, + "step": 11218 + }, + { + "epoch": 0.55, + "grad_norm": 0.5131300687789917, + "learning_rate": 0.0005516326108476248, + "loss": 3.2264, + "step": 11219 + }, + { + "epoch": 0.55, + "grad_norm": 0.528200626373291, + "learning_rate": 0.0005516242276351279, + "loss": 3.1164, + "step": 11220 + }, + { + "epoch": 0.55, + "grad_norm": 0.5266100764274597, + "learning_rate": 0.0005516158437599011, + "loss": 3.2815, + "step": 11221 + }, + { + "epoch": 0.55, + "grad_norm": 0.5277635455131531, + "learning_rate": 0.0005516074592219666, + "loss": 3.2759, + "step": 11222 + }, + { + "epoch": 0.55, + "grad_norm": 0.5396936535835266, + "learning_rate": 0.0005515990740213466, + "loss": 3.2679, + "step": 11223 + }, + { + "epoch": 0.55, + "grad_norm": 0.5293653607368469, + "learning_rate": 0.000551590688158063, + "loss": 3.111, + "step": 11224 + }, + { + "epoch": 0.55, + "grad_norm": 0.539237380027771, + "learning_rate": 0.0005515823016321379, + "loss": 3.2653, + "step": 11225 + }, + { + "epoch": 0.55, + "grad_norm": 0.5173228979110718, + "learning_rate": 0.0005515739144435934, + "loss": 3.2804, + "step": 11226 + }, + { + "epoch": 0.55, + "grad_norm": 0.569195032119751, + "learning_rate": 0.0005515655265924518, + "loss": 3.2265, + "step": 11227 + }, + { + "epoch": 0.55, + "grad_norm": 0.5007268786430359, + "learning_rate": 0.0005515571380787348, + "loss": 3.4115, + "step": 11228 + }, + { + "epoch": 0.55, + "grad_norm": 0.5260988473892212, + "learning_rate": 0.0005515487489024649, + "loss": 3.2786, + "step": 11229 + }, + { + "epoch": 0.55, + "grad_norm": 0.5149096250534058, + "learning_rate": 0.0005515403590636639, + "loss": 3.382, + "step": 11230 + }, + { + "epoch": 0.55, + "grad_norm": 0.5453810691833496, + "learning_rate": 0.000551531968562354, + "loss": 3.4435, + "step": 11231 + }, + { + "epoch": 0.55, + "grad_norm": 0.5204135775566101, + "learning_rate": 0.0005515235773985573, + "loss": 3.2987, + "step": 11232 + }, + { + "epoch": 0.55, + "grad_norm": 0.5300283432006836, + "learning_rate": 0.000551515185572296, + "loss": 3.264, + "step": 11233 + }, + { + "epoch": 0.55, + "grad_norm": 0.5270791053771973, + "learning_rate": 0.000551506793083592, + "loss": 3.3112, + "step": 11234 + }, + { + "epoch": 0.55, + "grad_norm": 0.5587947368621826, + "learning_rate": 0.0005514983999324676, + "loss": 3.2582, + "step": 11235 + }, + { + "epoch": 0.55, + "grad_norm": 0.5228792428970337, + "learning_rate": 0.0005514900061189447, + "loss": 3.2563, + "step": 11236 + }, + { + "epoch": 0.55, + "grad_norm": 0.507921040058136, + "learning_rate": 0.0005514816116430456, + "loss": 3.4942, + "step": 11237 + }, + { + "epoch": 0.55, + "grad_norm": 0.5501565337181091, + "learning_rate": 0.0005514732165047923, + "loss": 3.2608, + "step": 11238 + }, + { + "epoch": 0.55, + "grad_norm": 0.5317707657814026, + "learning_rate": 0.000551464820704207, + "loss": 3.058, + "step": 11239 + }, + { + "epoch": 0.55, + "grad_norm": 0.5452500581741333, + "learning_rate": 0.0005514564242413118, + "loss": 2.9297, + "step": 11240 + }, + { + "epoch": 0.55, + "grad_norm": 0.5187104344367981, + "learning_rate": 0.0005514480271161287, + "loss": 3.1697, + "step": 11241 + }, + { + "epoch": 0.55, + "grad_norm": 0.5452839732170105, + "learning_rate": 0.0005514396293286798, + "loss": 3.2642, + "step": 11242 + }, + { + "epoch": 0.55, + "grad_norm": 0.5251529812812805, + "learning_rate": 0.0005514312308789875, + "loss": 3.0369, + "step": 11243 + }, + { + "epoch": 0.55, + "grad_norm": 0.5177854299545288, + "learning_rate": 0.0005514228317670736, + "loss": 3.2987, + "step": 11244 + }, + { + "epoch": 0.55, + "grad_norm": 0.5245817303657532, + "learning_rate": 0.0005514144319929604, + "loss": 3.1808, + "step": 11245 + }, + { + "epoch": 0.55, + "grad_norm": 0.5251028537750244, + "learning_rate": 0.0005514060315566699, + "loss": 3.0786, + "step": 11246 + }, + { + "epoch": 0.55, + "grad_norm": 0.513904333114624, + "learning_rate": 0.0005513976304582243, + "loss": 3.1704, + "step": 11247 + }, + { + "epoch": 0.55, + "grad_norm": 0.5236141681671143, + "learning_rate": 0.0005513892286976458, + "loss": 3.3471, + "step": 11248 + }, + { + "epoch": 0.55, + "grad_norm": 0.5166938304901123, + "learning_rate": 0.0005513808262749564, + "loss": 3.2649, + "step": 11249 + }, + { + "epoch": 0.55, + "grad_norm": 0.5092709064483643, + "learning_rate": 0.0005513724231901783, + "loss": 3.2016, + "step": 11250 + }, + { + "epoch": 0.55, + "grad_norm": 0.5671427845954895, + "learning_rate": 0.0005513640194433336, + "loss": 3.0024, + "step": 11251 + }, + { + "epoch": 0.55, + "grad_norm": 0.4978698194026947, + "learning_rate": 0.0005513556150344445, + "loss": 3.2126, + "step": 11252 + }, + { + "epoch": 0.55, + "grad_norm": 0.5027517676353455, + "learning_rate": 0.000551347209963533, + "loss": 3.2967, + "step": 11253 + }, + { + "epoch": 0.55, + "grad_norm": 0.5220968723297119, + "learning_rate": 0.0005513388042306214, + "loss": 3.4618, + "step": 11254 + }, + { + "epoch": 0.55, + "grad_norm": 0.49995917081832886, + "learning_rate": 0.0005513303978357317, + "loss": 3.2012, + "step": 11255 + }, + { + "epoch": 0.55, + "grad_norm": 0.5419469475746155, + "learning_rate": 0.0005513219907788861, + "loss": 3.134, + "step": 11256 + }, + { + "epoch": 0.55, + "grad_norm": 0.5099649429321289, + "learning_rate": 0.0005513135830601068, + "loss": 3.1283, + "step": 11257 + }, + { + "epoch": 0.55, + "grad_norm": 0.5278074741363525, + "learning_rate": 0.0005513051746794159, + "loss": 3.3438, + "step": 11258 + }, + { + "epoch": 0.55, + "grad_norm": 0.5176796317100525, + "learning_rate": 0.0005512967656368355, + "loss": 3.0982, + "step": 11259 + }, + { + "epoch": 0.55, + "grad_norm": 0.5078525543212891, + "learning_rate": 0.0005512883559323878, + "loss": 3.4361, + "step": 11260 + }, + { + "epoch": 0.55, + "grad_norm": 0.4989159405231476, + "learning_rate": 0.0005512799455660948, + "loss": 3.2102, + "step": 11261 + }, + { + "epoch": 0.55, + "grad_norm": 0.5165753960609436, + "learning_rate": 0.000551271534537979, + "loss": 3.2573, + "step": 11262 + }, + { + "epoch": 0.55, + "grad_norm": 0.5196847319602966, + "learning_rate": 0.0005512631228480622, + "loss": 3.3376, + "step": 11263 + }, + { + "epoch": 0.55, + "grad_norm": 0.49779194593429565, + "learning_rate": 0.0005512547104963668, + "loss": 3.5134, + "step": 11264 + }, + { + "epoch": 0.55, + "grad_norm": 0.5193706750869751, + "learning_rate": 0.0005512462974829147, + "loss": 3.2742, + "step": 11265 + }, + { + "epoch": 0.55, + "grad_norm": 0.5080481171607971, + "learning_rate": 0.0005512378838077283, + "loss": 3.1623, + "step": 11266 + }, + { + "epoch": 0.55, + "grad_norm": 0.5434474945068359, + "learning_rate": 0.0005512294694708296, + "loss": 3.3106, + "step": 11267 + }, + { + "epoch": 0.55, + "grad_norm": 0.4940679967403412, + "learning_rate": 0.0005512210544722408, + "loss": 3.2186, + "step": 11268 + }, + { + "epoch": 0.55, + "grad_norm": 0.5241498947143555, + "learning_rate": 0.0005512126388119842, + "loss": 3.1095, + "step": 11269 + }, + { + "epoch": 0.55, + "grad_norm": 0.5188072323799133, + "learning_rate": 0.0005512042224900818, + "loss": 3.3153, + "step": 11270 + }, + { + "epoch": 0.55, + "grad_norm": 0.492482990026474, + "learning_rate": 0.000551195805506556, + "loss": 3.2382, + "step": 11271 + }, + { + "epoch": 0.55, + "grad_norm": 0.49130314588546753, + "learning_rate": 0.0005511873878614286, + "loss": 3.236, + "step": 11272 + }, + { + "epoch": 0.55, + "grad_norm": 0.517808735370636, + "learning_rate": 0.0005511789695547219, + "loss": 3.3371, + "step": 11273 + }, + { + "epoch": 0.55, + "grad_norm": 0.5449178218841553, + "learning_rate": 0.0005511705505864582, + "loss": 3.0906, + "step": 11274 + }, + { + "epoch": 0.55, + "grad_norm": 0.5152904987335205, + "learning_rate": 0.0005511621309566597, + "loss": 3.2692, + "step": 11275 + }, + { + "epoch": 0.55, + "grad_norm": 0.554939866065979, + "learning_rate": 0.0005511537106653484, + "loss": 3.3965, + "step": 11276 + }, + { + "epoch": 0.55, + "grad_norm": 0.5003484487533569, + "learning_rate": 0.0005511452897125467, + "loss": 3.1112, + "step": 11277 + }, + { + "epoch": 0.55, + "grad_norm": 0.5301329493522644, + "learning_rate": 0.0005511368680982765, + "loss": 3.3394, + "step": 11278 + }, + { + "epoch": 0.55, + "grad_norm": 0.4994280934333801, + "learning_rate": 0.0005511284458225601, + "loss": 3.1309, + "step": 11279 + }, + { + "epoch": 0.55, + "grad_norm": 0.5375825762748718, + "learning_rate": 0.0005511200228854197, + "loss": 3.2027, + "step": 11280 + }, + { + "epoch": 0.55, + "grad_norm": 0.532985270023346, + "learning_rate": 0.0005511115992868776, + "loss": 3.2824, + "step": 11281 + }, + { + "epoch": 0.55, + "grad_norm": 0.48315703868865967, + "learning_rate": 0.0005511031750269558, + "loss": 3.2923, + "step": 11282 + }, + { + "epoch": 0.55, + "grad_norm": 0.5368021726608276, + "learning_rate": 0.0005510947501056766, + "loss": 3.2745, + "step": 11283 + }, + { + "epoch": 0.55, + "grad_norm": 0.5001078844070435, + "learning_rate": 0.0005510863245230621, + "loss": 3.2778, + "step": 11284 + }, + { + "epoch": 0.55, + "grad_norm": 0.49867165088653564, + "learning_rate": 0.0005510778982791345, + "loss": 3.0507, + "step": 11285 + }, + { + "epoch": 0.55, + "grad_norm": 0.5319203734397888, + "learning_rate": 0.0005510694713739162, + "loss": 3.1747, + "step": 11286 + }, + { + "epoch": 0.55, + "grad_norm": 0.5031033158302307, + "learning_rate": 0.0005510610438074291, + "loss": 3.2447, + "step": 11287 + }, + { + "epoch": 0.55, + "grad_norm": 0.4784884452819824, + "learning_rate": 0.0005510526155796956, + "loss": 3.4212, + "step": 11288 + }, + { + "epoch": 0.55, + "grad_norm": 0.5477344989776611, + "learning_rate": 0.0005510441866907377, + "loss": 3.1972, + "step": 11289 + }, + { + "epoch": 0.55, + "grad_norm": 0.49922096729278564, + "learning_rate": 0.000551035757140578, + "loss": 3.3556, + "step": 11290 + }, + { + "epoch": 0.55, + "grad_norm": 0.4894360899925232, + "learning_rate": 0.0005510273269292382, + "loss": 3.2473, + "step": 11291 + }, + { + "epoch": 0.55, + "grad_norm": 0.5014991760253906, + "learning_rate": 0.0005510188960567408, + "loss": 3.2122, + "step": 11292 + }, + { + "epoch": 0.55, + "grad_norm": 0.5156797170639038, + "learning_rate": 0.0005510104645231079, + "loss": 3.3523, + "step": 11293 + }, + { + "epoch": 0.55, + "grad_norm": 0.510955274105072, + "learning_rate": 0.0005510020323283618, + "loss": 3.1125, + "step": 11294 + }, + { + "epoch": 0.55, + "grad_norm": 0.5010038614273071, + "learning_rate": 0.0005509935994725245, + "loss": 3.2618, + "step": 11295 + }, + { + "epoch": 0.55, + "grad_norm": 0.5552043318748474, + "learning_rate": 0.0005509851659556185, + "loss": 3.0575, + "step": 11296 + }, + { + "epoch": 0.55, + "grad_norm": 0.48965537548065186, + "learning_rate": 0.0005509767317776659, + "loss": 3.1812, + "step": 11297 + }, + { + "epoch": 0.55, + "grad_norm": 0.521115243434906, + "learning_rate": 0.0005509682969386888, + "loss": 3.2456, + "step": 11298 + }, + { + "epoch": 0.55, + "grad_norm": 0.544768214225769, + "learning_rate": 0.0005509598614387096, + "loss": 3.3468, + "step": 11299 + }, + { + "epoch": 0.55, + "grad_norm": 0.5138852596282959, + "learning_rate": 0.0005509514252777503, + "loss": 3.4115, + "step": 11300 + }, + { + "epoch": 0.55, + "grad_norm": 0.5543062090873718, + "learning_rate": 0.0005509429884558334, + "loss": 3.3333, + "step": 11301 + }, + { + "epoch": 0.55, + "grad_norm": 0.5786649584770203, + "learning_rate": 0.0005509345509729808, + "loss": 3.0643, + "step": 11302 + }, + { + "epoch": 0.55, + "grad_norm": 0.5284539461135864, + "learning_rate": 0.000550926112829215, + "loss": 3.4618, + "step": 11303 + }, + { + "epoch": 0.55, + "grad_norm": 0.5231096744537354, + "learning_rate": 0.000550917674024558, + "loss": 3.213, + "step": 11304 + }, + { + "epoch": 0.55, + "grad_norm": 0.5191011428833008, + "learning_rate": 0.0005509092345590323, + "loss": 3.1695, + "step": 11305 + }, + { + "epoch": 0.55, + "grad_norm": 0.5704165101051331, + "learning_rate": 0.0005509007944326598, + "loss": 3.2244, + "step": 11306 + }, + { + "epoch": 0.55, + "grad_norm": 0.5259694457054138, + "learning_rate": 0.000550892353645463, + "loss": 3.0671, + "step": 11307 + }, + { + "epoch": 0.55, + "grad_norm": 0.5437265634536743, + "learning_rate": 0.0005508839121974641, + "loss": 3.2964, + "step": 11308 + }, + { + "epoch": 0.55, + "grad_norm": 0.5166676044464111, + "learning_rate": 0.0005508754700886851, + "loss": 3.16, + "step": 11309 + }, + { + "epoch": 0.55, + "grad_norm": 0.5371460318565369, + "learning_rate": 0.0005508670273191485, + "loss": 3.0633, + "step": 11310 + }, + { + "epoch": 0.55, + "grad_norm": 0.5051896572113037, + "learning_rate": 0.0005508585838888764, + "loss": 3.2427, + "step": 11311 + }, + { + "epoch": 0.55, + "grad_norm": 0.6367805600166321, + "learning_rate": 0.000550850139797891, + "loss": 3.3534, + "step": 11312 + }, + { + "epoch": 0.55, + "grad_norm": 0.5592988729476929, + "learning_rate": 0.0005508416950462146, + "loss": 3.0948, + "step": 11313 + }, + { + "epoch": 0.55, + "grad_norm": 0.5036086440086365, + "learning_rate": 0.0005508332496338696, + "loss": 3.1853, + "step": 11314 + }, + { + "epoch": 0.55, + "grad_norm": 0.49412891268730164, + "learning_rate": 0.000550824803560878, + "loss": 3.1354, + "step": 11315 + }, + { + "epoch": 0.55, + "grad_norm": 0.5339794158935547, + "learning_rate": 0.0005508163568272622, + "loss": 3.2264, + "step": 11316 + }, + { + "epoch": 0.55, + "grad_norm": 0.5197786092758179, + "learning_rate": 0.0005508079094330443, + "loss": 3.388, + "step": 11317 + }, + { + "epoch": 0.55, + "grad_norm": 0.5386886596679688, + "learning_rate": 0.0005507994613782466, + "loss": 3.0609, + "step": 11318 + }, + { + "epoch": 0.55, + "grad_norm": 0.5509684085845947, + "learning_rate": 0.0005507910126628915, + "loss": 3.131, + "step": 11319 + }, + { + "epoch": 0.55, + "grad_norm": 0.5496838688850403, + "learning_rate": 0.0005507825632870011, + "loss": 3.4422, + "step": 11320 + }, + { + "epoch": 0.55, + "grad_norm": 0.5109300017356873, + "learning_rate": 0.0005507741132505977, + "loss": 3.2252, + "step": 11321 + }, + { + "epoch": 0.55, + "grad_norm": 0.48941949009895325, + "learning_rate": 0.0005507656625537035, + "loss": 3.2382, + "step": 11322 + }, + { + "epoch": 0.55, + "grad_norm": 0.545982301235199, + "learning_rate": 0.0005507572111963408, + "loss": 3.2181, + "step": 11323 + }, + { + "epoch": 0.55, + "grad_norm": 0.5077481269836426, + "learning_rate": 0.000550748759178532, + "loss": 3.2736, + "step": 11324 + }, + { + "epoch": 0.56, + "grad_norm": 0.5178395509719849, + "learning_rate": 0.0005507403065002991, + "loss": 3.249, + "step": 11325 + }, + { + "epoch": 0.56, + "grad_norm": 0.5448976159095764, + "learning_rate": 0.0005507318531616645, + "loss": 3.2074, + "step": 11326 + }, + { + "epoch": 0.56, + "grad_norm": 0.4983932375907898, + "learning_rate": 0.0005507233991626504, + "loss": 3.309, + "step": 11327 + }, + { + "epoch": 0.56, + "grad_norm": 0.5093890428543091, + "learning_rate": 0.0005507149445032793, + "loss": 3.2261, + "step": 11328 + }, + { + "epoch": 0.56, + "grad_norm": 0.4824458658695221, + "learning_rate": 0.0005507064891835732, + "loss": 3.1697, + "step": 11329 + }, + { + "epoch": 0.56, + "grad_norm": 0.5757153034210205, + "learning_rate": 0.0005506980332035543, + "loss": 3.1732, + "step": 11330 + }, + { + "epoch": 0.56, + "grad_norm": 0.5348081588745117, + "learning_rate": 0.0005506895765632452, + "loss": 2.9841, + "step": 11331 + }, + { + "epoch": 0.56, + "grad_norm": 0.526430070400238, + "learning_rate": 0.0005506811192626679, + "loss": 3.1298, + "step": 11332 + }, + { + "epoch": 0.56, + "grad_norm": 0.5291725993156433, + "learning_rate": 0.0005506726613018449, + "loss": 3.2061, + "step": 11333 + }, + { + "epoch": 0.56, + "grad_norm": 0.5114479064941406, + "learning_rate": 0.0005506642026807983, + "loss": 3.3891, + "step": 11334 + }, + { + "epoch": 0.56, + "grad_norm": 0.544650673866272, + "learning_rate": 0.0005506557433995505, + "loss": 3.2341, + "step": 11335 + }, + { + "epoch": 0.56, + "grad_norm": 0.5388392806053162, + "learning_rate": 0.0005506472834581236, + "loss": 3.1413, + "step": 11336 + }, + { + "epoch": 0.56, + "grad_norm": 0.5075357556343079, + "learning_rate": 0.0005506388228565401, + "loss": 3.1682, + "step": 11337 + }, + { + "epoch": 0.56, + "grad_norm": 0.5130732655525208, + "learning_rate": 0.000550630361594822, + "loss": 3.2566, + "step": 11338 + }, + { + "epoch": 0.56, + "grad_norm": 0.5059003233909607, + "learning_rate": 0.0005506218996729919, + "loss": 3.1209, + "step": 11339 + }, + { + "epoch": 0.56, + "grad_norm": 0.5154292583465576, + "learning_rate": 0.0005506134370910719, + "loss": 3.1587, + "step": 11340 + }, + { + "epoch": 0.56, + "grad_norm": 0.5048404932022095, + "learning_rate": 0.0005506049738490844, + "loss": 3.2718, + "step": 11341 + }, + { + "epoch": 0.56, + "grad_norm": 0.6062512397766113, + "learning_rate": 0.0005505965099470516, + "loss": 3.2463, + "step": 11342 + }, + { + "epoch": 0.56, + "grad_norm": 0.5517935156822205, + "learning_rate": 0.0005505880453849958, + "loss": 3.2156, + "step": 11343 + }, + { + "epoch": 0.56, + "grad_norm": 0.49947217106819153, + "learning_rate": 0.0005505795801629393, + "loss": 3.1784, + "step": 11344 + }, + { + "epoch": 0.56, + "grad_norm": 0.4772196114063263, + "learning_rate": 0.0005505711142809043, + "loss": 3.2223, + "step": 11345 + }, + { + "epoch": 0.56, + "grad_norm": 0.49474474787712097, + "learning_rate": 0.0005505626477389134, + "loss": 3.4315, + "step": 11346 + }, + { + "epoch": 0.56, + "grad_norm": 0.527829110622406, + "learning_rate": 0.0005505541805369888, + "loss": 3.2979, + "step": 11347 + }, + { + "epoch": 0.56, + "grad_norm": 0.5038018822669983, + "learning_rate": 0.0005505457126751524, + "loss": 3.0596, + "step": 11348 + }, + { + "epoch": 0.56, + "grad_norm": 0.5035970211029053, + "learning_rate": 0.000550537244153427, + "loss": 3.267, + "step": 11349 + }, + { + "epoch": 0.56, + "grad_norm": 0.5311546325683594, + "learning_rate": 0.0005505287749718348, + "loss": 3.2461, + "step": 11350 + }, + { + "epoch": 0.56, + "grad_norm": 0.5109077095985413, + "learning_rate": 0.0005505203051303978, + "loss": 3.3966, + "step": 11351 + }, + { + "epoch": 0.56, + "grad_norm": 0.5348523855209351, + "learning_rate": 0.0005505118346291386, + "loss": 3.3345, + "step": 11352 + }, + { + "epoch": 0.56, + "grad_norm": 0.5685103535652161, + "learning_rate": 0.0005505033634680795, + "loss": 3.1709, + "step": 11353 + }, + { + "epoch": 0.56, + "grad_norm": 0.533107578754425, + "learning_rate": 0.0005504948916472427, + "loss": 3.1036, + "step": 11354 + }, + { + "epoch": 0.56, + "grad_norm": 0.5173946619033813, + "learning_rate": 0.0005504864191666506, + "loss": 3.3519, + "step": 11355 + }, + { + "epoch": 0.56, + "grad_norm": 0.5562966465950012, + "learning_rate": 0.0005504779460263255, + "loss": 3.2072, + "step": 11356 + }, + { + "epoch": 0.56, + "grad_norm": 0.5066145062446594, + "learning_rate": 0.0005504694722262897, + "loss": 3.1163, + "step": 11357 + }, + { + "epoch": 0.56, + "grad_norm": 0.5840992331504822, + "learning_rate": 0.0005504609977665655, + "loss": 3.5294, + "step": 11358 + }, + { + "epoch": 0.56, + "grad_norm": 0.5080714225769043, + "learning_rate": 0.0005504525226471752, + "loss": 3.2118, + "step": 11359 + }, + { + "epoch": 0.56, + "grad_norm": 0.5689548254013062, + "learning_rate": 0.0005504440468681412, + "loss": 3.426, + "step": 11360 + }, + { + "epoch": 0.56, + "grad_norm": 0.5232775211334229, + "learning_rate": 0.0005504355704294857, + "loss": 3.3447, + "step": 11361 + }, + { + "epoch": 0.56, + "grad_norm": 0.530229926109314, + "learning_rate": 0.0005504270933312311, + "loss": 3.1767, + "step": 11362 + }, + { + "epoch": 0.56, + "grad_norm": 0.4973139762878418, + "learning_rate": 0.0005504186155733998, + "loss": 3.5491, + "step": 11363 + }, + { + "epoch": 0.56, + "grad_norm": 0.5430886149406433, + "learning_rate": 0.0005504101371560141, + "loss": 3.0946, + "step": 11364 + }, + { + "epoch": 0.56, + "grad_norm": 0.5411336421966553, + "learning_rate": 0.0005504016580790963, + "loss": 3.306, + "step": 11365 + }, + { + "epoch": 0.56, + "grad_norm": 0.5088608860969543, + "learning_rate": 0.0005503931783426686, + "loss": 3.3453, + "step": 11366 + }, + { + "epoch": 0.56, + "grad_norm": 0.5100044012069702, + "learning_rate": 0.0005503846979467535, + "loss": 3.177, + "step": 11367 + }, + { + "epoch": 0.56, + "grad_norm": 0.49586084485054016, + "learning_rate": 0.0005503762168913733, + "loss": 3.3249, + "step": 11368 + }, + { + "epoch": 0.56, + "grad_norm": 0.5439696311950684, + "learning_rate": 0.0005503677351765503, + "loss": 3.2196, + "step": 11369 + }, + { + "epoch": 0.56, + "grad_norm": 0.5079057216644287, + "learning_rate": 0.0005503592528023069, + "loss": 3.3114, + "step": 11370 + }, + { + "epoch": 0.56, + "grad_norm": 0.5126676559448242, + "learning_rate": 0.0005503507697686654, + "loss": 3.4438, + "step": 11371 + }, + { + "epoch": 0.56, + "grad_norm": 0.5350159406661987, + "learning_rate": 0.0005503422860756482, + "loss": 3.2688, + "step": 11372 + }, + { + "epoch": 0.56, + "grad_norm": 0.5482049584388733, + "learning_rate": 0.0005503338017232776, + "loss": 3.2605, + "step": 11373 + }, + { + "epoch": 0.56, + "grad_norm": 0.5451943278312683, + "learning_rate": 0.0005503253167115759, + "loss": 3.1051, + "step": 11374 + }, + { + "epoch": 0.56, + "grad_norm": 0.610254168510437, + "learning_rate": 0.0005503168310405655, + "loss": 3.2985, + "step": 11375 + }, + { + "epoch": 0.56, + "grad_norm": 0.5316271185874939, + "learning_rate": 0.0005503083447102688, + "loss": 3.3086, + "step": 11376 + }, + { + "epoch": 0.56, + "grad_norm": 0.5323793292045593, + "learning_rate": 0.000550299857720708, + "loss": 3.2379, + "step": 11377 + }, + { + "epoch": 0.56, + "grad_norm": 0.5014329552650452, + "learning_rate": 0.0005502913700719056, + "loss": 3.2504, + "step": 11378 + }, + { + "epoch": 0.56, + "grad_norm": 0.548941433429718, + "learning_rate": 0.0005502828817638838, + "loss": 3.0864, + "step": 11379 + }, + { + "epoch": 0.56, + "grad_norm": 0.5736619234085083, + "learning_rate": 0.0005502743927966652, + "loss": 3.2717, + "step": 11380 + }, + { + "epoch": 0.56, + "grad_norm": 0.5364370942115784, + "learning_rate": 0.0005502659031702721, + "loss": 3.0777, + "step": 11381 + }, + { + "epoch": 0.56, + "grad_norm": 0.6484993100166321, + "learning_rate": 0.0005502574128847266, + "loss": 3.3747, + "step": 11382 + }, + { + "epoch": 0.56, + "grad_norm": 0.5159845948219299, + "learning_rate": 0.0005502489219400512, + "loss": 3.377, + "step": 11383 + }, + { + "epoch": 0.56, + "grad_norm": 0.48935675621032715, + "learning_rate": 0.0005502404303362684, + "loss": 3.2342, + "step": 11384 + }, + { + "epoch": 0.56, + "grad_norm": 0.5167993307113647, + "learning_rate": 0.0005502319380734005, + "loss": 3.5881, + "step": 11385 + }, + { + "epoch": 0.56, + "grad_norm": 0.5177289843559265, + "learning_rate": 0.0005502234451514697, + "loss": 3.392, + "step": 11386 + }, + { + "epoch": 0.56, + "grad_norm": 0.5180376172065735, + "learning_rate": 0.0005502149515704985, + "loss": 2.9029, + "step": 11387 + }, + { + "epoch": 0.56, + "grad_norm": 0.5249106884002686, + "learning_rate": 0.0005502064573305094, + "loss": 3.2103, + "step": 11388 + }, + { + "epoch": 0.56, + "grad_norm": 0.5171917676925659, + "learning_rate": 0.0005501979624315247, + "loss": 3.109, + "step": 11389 + }, + { + "epoch": 0.56, + "grad_norm": 0.5251325368881226, + "learning_rate": 0.0005501894668735666, + "loss": 3.3081, + "step": 11390 + }, + { + "epoch": 0.56, + "grad_norm": 0.518473207950592, + "learning_rate": 0.0005501809706566575, + "loss": 3.0674, + "step": 11391 + }, + { + "epoch": 0.56, + "grad_norm": 0.5217577815055847, + "learning_rate": 0.00055017247378082, + "loss": 3.2443, + "step": 11392 + }, + { + "epoch": 0.56, + "grad_norm": 0.498160719871521, + "learning_rate": 0.0005501639762460764, + "loss": 3.1407, + "step": 11393 + }, + { + "epoch": 0.56, + "grad_norm": 0.5372119545936584, + "learning_rate": 0.000550155478052449, + "loss": 3.3184, + "step": 11394 + }, + { + "epoch": 0.56, + "grad_norm": 0.550507664680481, + "learning_rate": 0.0005501469791999601, + "loss": 3.2081, + "step": 11395 + }, + { + "epoch": 0.56, + "grad_norm": 0.5311989784240723, + "learning_rate": 0.0005501384796886323, + "loss": 3.3632, + "step": 11396 + }, + { + "epoch": 0.56, + "grad_norm": 0.5252476334571838, + "learning_rate": 0.0005501299795184878, + "loss": 3.1677, + "step": 11397 + }, + { + "epoch": 0.56, + "grad_norm": 0.5009608268737793, + "learning_rate": 0.0005501214786895491, + "loss": 3.2492, + "step": 11398 + }, + { + "epoch": 0.56, + "grad_norm": 0.5100443959236145, + "learning_rate": 0.0005501129772018387, + "loss": 3.4272, + "step": 11399 + }, + { + "epoch": 0.56, + "grad_norm": 0.5339254140853882, + "learning_rate": 0.0005501044750553788, + "loss": 3.2937, + "step": 11400 + }, + { + "epoch": 0.56, + "grad_norm": 0.5647916793823242, + "learning_rate": 0.0005500959722501917, + "loss": 3.2429, + "step": 11401 + }, + { + "epoch": 0.56, + "grad_norm": 0.48013341426849365, + "learning_rate": 0.0005500874687863, + "loss": 3.2782, + "step": 11402 + }, + { + "epoch": 0.56, + "grad_norm": 0.5085362195968628, + "learning_rate": 0.000550078964663726, + "loss": 3.148, + "step": 11403 + }, + { + "epoch": 0.56, + "grad_norm": 0.5075708031654358, + "learning_rate": 0.0005500704598824923, + "loss": 3.2333, + "step": 11404 + }, + { + "epoch": 0.56, + "grad_norm": 0.5158693194389343, + "learning_rate": 0.0005500619544426211, + "loss": 3.276, + "step": 11405 + }, + { + "epoch": 0.56, + "grad_norm": 0.5510281920433044, + "learning_rate": 0.0005500534483441347, + "loss": 3.0602, + "step": 11406 + }, + { + "epoch": 0.56, + "grad_norm": 0.492349237203598, + "learning_rate": 0.0005500449415870558, + "loss": 3.3378, + "step": 11407 + }, + { + "epoch": 0.56, + "grad_norm": 0.502224862575531, + "learning_rate": 0.0005500364341714066, + "loss": 2.86, + "step": 11408 + }, + { + "epoch": 0.56, + "grad_norm": 0.5376916527748108, + "learning_rate": 0.0005500279260972095, + "loss": 3.1087, + "step": 11409 + }, + { + "epoch": 0.56, + "grad_norm": 0.5204702615737915, + "learning_rate": 0.0005500194173644869, + "loss": 3.235, + "step": 11410 + }, + { + "epoch": 0.56, + "grad_norm": 0.5284196734428406, + "learning_rate": 0.0005500109079732613, + "loss": 3.2917, + "step": 11411 + }, + { + "epoch": 0.56, + "grad_norm": 0.5423819422721863, + "learning_rate": 0.0005500023979235552, + "loss": 3.2564, + "step": 11412 + }, + { + "epoch": 0.56, + "grad_norm": 0.7053835988044739, + "learning_rate": 0.0005499938872153908, + "loss": 3.2405, + "step": 11413 + }, + { + "epoch": 0.56, + "grad_norm": 0.5075749754905701, + "learning_rate": 0.0005499853758487907, + "loss": 3.2424, + "step": 11414 + }, + { + "epoch": 0.56, + "grad_norm": 0.5351574420928955, + "learning_rate": 0.0005499768638237771, + "loss": 3.2685, + "step": 11415 + }, + { + "epoch": 0.56, + "grad_norm": 0.49611949920654297, + "learning_rate": 0.0005499683511403727, + "loss": 3.2747, + "step": 11416 + }, + { + "epoch": 0.56, + "grad_norm": 0.5512820482254028, + "learning_rate": 0.0005499598377985997, + "loss": 3.2599, + "step": 11417 + }, + { + "epoch": 0.56, + "grad_norm": 0.5308202505111694, + "learning_rate": 0.0005499513237984807, + "loss": 3.134, + "step": 11418 + }, + { + "epoch": 0.56, + "grad_norm": 0.5064956545829773, + "learning_rate": 0.0005499428091400378, + "loss": 3.2349, + "step": 11419 + }, + { + "epoch": 0.56, + "grad_norm": 0.5472379326820374, + "learning_rate": 0.0005499342938232938, + "loss": 3.1336, + "step": 11420 + }, + { + "epoch": 0.56, + "grad_norm": 0.5334680676460266, + "learning_rate": 0.0005499257778482709, + "loss": 3.2006, + "step": 11421 + }, + { + "epoch": 0.56, + "grad_norm": 0.5283361077308655, + "learning_rate": 0.0005499172612149916, + "loss": 3.5046, + "step": 11422 + }, + { + "epoch": 0.56, + "grad_norm": 0.5037579536437988, + "learning_rate": 0.0005499087439234784, + "loss": 3.2281, + "step": 11423 + }, + { + "epoch": 0.56, + "grad_norm": 0.5362147688865662, + "learning_rate": 0.0005499002259737536, + "loss": 3.2373, + "step": 11424 + }, + { + "epoch": 0.56, + "grad_norm": 0.5454772710800171, + "learning_rate": 0.0005498917073658397, + "loss": 3.1557, + "step": 11425 + }, + { + "epoch": 0.56, + "grad_norm": 0.5577088594436646, + "learning_rate": 0.0005498831880997591, + "loss": 3.3548, + "step": 11426 + }, + { + "epoch": 0.56, + "grad_norm": 0.688560962677002, + "learning_rate": 0.0005498746681755343, + "loss": 3.0268, + "step": 11427 + }, + { + "epoch": 0.56, + "grad_norm": 0.5097750425338745, + "learning_rate": 0.0005498661475931879, + "loss": 3.438, + "step": 11428 + }, + { + "epoch": 0.56, + "grad_norm": 0.5364587903022766, + "learning_rate": 0.000549857626352742, + "loss": 3.2104, + "step": 11429 + }, + { + "epoch": 0.56, + "grad_norm": 0.5260832905769348, + "learning_rate": 0.0005498491044542191, + "loss": 3.4208, + "step": 11430 + }, + { + "epoch": 0.56, + "grad_norm": 0.49136224389076233, + "learning_rate": 0.0005498405818976418, + "loss": 3.4387, + "step": 11431 + }, + { + "epoch": 0.56, + "grad_norm": 0.5791602730751038, + "learning_rate": 0.0005498320586830326, + "loss": 3.4698, + "step": 11432 + }, + { + "epoch": 0.56, + "grad_norm": 0.5048266053199768, + "learning_rate": 0.0005498235348104137, + "loss": 3.1668, + "step": 11433 + }, + { + "epoch": 0.56, + "grad_norm": 0.5733696818351746, + "learning_rate": 0.0005498150102798078, + "loss": 3.261, + "step": 11434 + }, + { + "epoch": 0.56, + "grad_norm": 0.5145251750946045, + "learning_rate": 0.0005498064850912373, + "loss": 3.3864, + "step": 11435 + }, + { + "epoch": 0.56, + "grad_norm": 0.5857335329055786, + "learning_rate": 0.0005497979592447244, + "loss": 3.1253, + "step": 11436 + }, + { + "epoch": 0.56, + "grad_norm": 0.5221096277236938, + "learning_rate": 0.0005497894327402918, + "loss": 3.1466, + "step": 11437 + }, + { + "epoch": 0.56, + "grad_norm": 0.4997885525226593, + "learning_rate": 0.0005497809055779619, + "loss": 3.1797, + "step": 11438 + }, + { + "epoch": 0.56, + "grad_norm": 0.5484127998352051, + "learning_rate": 0.0005497723777577573, + "loss": 3.1413, + "step": 11439 + }, + { + "epoch": 0.56, + "grad_norm": 0.5325717926025391, + "learning_rate": 0.0005497638492797002, + "loss": 3.3583, + "step": 11440 + }, + { + "epoch": 0.56, + "grad_norm": 0.5324811339378357, + "learning_rate": 0.0005497553201438132, + "loss": 3.111, + "step": 11441 + }, + { + "epoch": 0.56, + "grad_norm": 0.5136557221412659, + "learning_rate": 0.0005497467903501188, + "loss": 3.2754, + "step": 11442 + }, + { + "epoch": 0.56, + "grad_norm": 0.5221689343452454, + "learning_rate": 0.0005497382598986394, + "loss": 3.3697, + "step": 11443 + }, + { + "epoch": 0.56, + "grad_norm": 0.5226287245750427, + "learning_rate": 0.0005497297287893975, + "loss": 3.5221, + "step": 11444 + }, + { + "epoch": 0.56, + "grad_norm": 0.5215665102005005, + "learning_rate": 0.0005497211970224156, + "loss": 3.2144, + "step": 11445 + }, + { + "epoch": 0.56, + "grad_norm": 0.5560000538825989, + "learning_rate": 0.000549712664597716, + "loss": 3.0434, + "step": 11446 + }, + { + "epoch": 0.56, + "grad_norm": 0.5306897163391113, + "learning_rate": 0.0005497041315153215, + "loss": 3.2474, + "step": 11447 + }, + { + "epoch": 0.56, + "grad_norm": 0.5063650012016296, + "learning_rate": 0.0005496955977752541, + "loss": 3.2657, + "step": 11448 + }, + { + "epoch": 0.56, + "grad_norm": 0.5555657148361206, + "learning_rate": 0.0005496870633775367, + "loss": 3.2211, + "step": 11449 + }, + { + "epoch": 0.56, + "grad_norm": 0.5415468811988831, + "learning_rate": 0.0005496785283221917, + "loss": 3.3836, + "step": 11450 + }, + { + "epoch": 0.56, + "grad_norm": 0.6452375054359436, + "learning_rate": 0.0005496699926092415, + "loss": 3.2545, + "step": 11451 + }, + { + "epoch": 0.56, + "grad_norm": 0.5206488966941833, + "learning_rate": 0.0005496614562387085, + "loss": 3.2159, + "step": 11452 + }, + { + "epoch": 0.56, + "grad_norm": 0.4810924232006073, + "learning_rate": 0.0005496529192106153, + "loss": 3.2473, + "step": 11453 + }, + { + "epoch": 0.56, + "grad_norm": 0.5150770545005798, + "learning_rate": 0.0005496443815249843, + "loss": 3.1742, + "step": 11454 + }, + { + "epoch": 0.56, + "grad_norm": 0.5128140449523926, + "learning_rate": 0.0005496358431818381, + "loss": 3.0848, + "step": 11455 + }, + { + "epoch": 0.56, + "grad_norm": 0.5324752926826477, + "learning_rate": 0.0005496273041811991, + "loss": 3.066, + "step": 11456 + }, + { + "epoch": 0.56, + "grad_norm": 0.5085709095001221, + "learning_rate": 0.0005496187645230898, + "loss": 3.216, + "step": 11457 + }, + { + "epoch": 0.56, + "grad_norm": 0.500564455986023, + "learning_rate": 0.0005496102242075328, + "loss": 3.2649, + "step": 11458 + }, + { + "epoch": 0.56, + "grad_norm": 0.5043260455131531, + "learning_rate": 0.0005496016832345505, + "loss": 3.0378, + "step": 11459 + }, + { + "epoch": 0.56, + "grad_norm": 0.496572345495224, + "learning_rate": 0.0005495931416041654, + "loss": 3.1889, + "step": 11460 + }, + { + "epoch": 0.56, + "grad_norm": 0.5041834712028503, + "learning_rate": 0.0005495845993163999, + "loss": 3.3908, + "step": 11461 + }, + { + "epoch": 0.56, + "grad_norm": 0.50934898853302, + "learning_rate": 0.0005495760563712768, + "loss": 3.3334, + "step": 11462 + }, + { + "epoch": 0.56, + "grad_norm": 0.5718405246734619, + "learning_rate": 0.0005495675127688181, + "loss": 3.1535, + "step": 11463 + }, + { + "epoch": 0.56, + "grad_norm": 0.48958179354667664, + "learning_rate": 0.0005495589685090468, + "loss": 3.0113, + "step": 11464 + }, + { + "epoch": 0.56, + "grad_norm": 0.5216549038887024, + "learning_rate": 0.0005495504235919852, + "loss": 3.3366, + "step": 11465 + }, + { + "epoch": 0.56, + "grad_norm": 0.4936378300189972, + "learning_rate": 0.0005495418780176558, + "loss": 3.3092, + "step": 11466 + }, + { + "epoch": 0.56, + "grad_norm": 0.5067612528800964, + "learning_rate": 0.0005495333317860812, + "loss": 3.3701, + "step": 11467 + }, + { + "epoch": 0.56, + "grad_norm": 0.5563675165176392, + "learning_rate": 0.0005495247848972837, + "loss": 3.2486, + "step": 11468 + }, + { + "epoch": 0.56, + "grad_norm": 0.5286310911178589, + "learning_rate": 0.0005495162373512859, + "loss": 3.2765, + "step": 11469 + }, + { + "epoch": 0.56, + "grad_norm": 0.5466699004173279, + "learning_rate": 0.0005495076891481104, + "loss": 3.1424, + "step": 11470 + }, + { + "epoch": 0.56, + "grad_norm": 0.5529357194900513, + "learning_rate": 0.0005494991402877797, + "loss": 3.1241, + "step": 11471 + }, + { + "epoch": 0.56, + "grad_norm": 0.5391454696655273, + "learning_rate": 0.0005494905907703164, + "loss": 3.1807, + "step": 11472 + }, + { + "epoch": 0.56, + "grad_norm": 0.5178135633468628, + "learning_rate": 0.0005494820405957427, + "loss": 3.2315, + "step": 11473 + }, + { + "epoch": 0.56, + "grad_norm": 0.5371395945549011, + "learning_rate": 0.0005494734897640815, + "loss": 3.3146, + "step": 11474 + }, + { + "epoch": 0.56, + "grad_norm": 0.4930516183376312, + "learning_rate": 0.0005494649382753549, + "loss": 3.1017, + "step": 11475 + }, + { + "epoch": 0.56, + "grad_norm": 0.518638551235199, + "learning_rate": 0.0005494563861295858, + "loss": 3.098, + "step": 11476 + }, + { + "epoch": 0.56, + "grad_norm": 0.515612781047821, + "learning_rate": 0.0005494478333267966, + "loss": 3.2221, + "step": 11477 + }, + { + "epoch": 0.56, + "grad_norm": 0.5380525588989258, + "learning_rate": 0.0005494392798670099, + "loss": 3.3513, + "step": 11478 + }, + { + "epoch": 0.56, + "grad_norm": 0.5430471301078796, + "learning_rate": 0.0005494307257502479, + "loss": 3.0634, + "step": 11479 + }, + { + "epoch": 0.56, + "grad_norm": 0.5702585577964783, + "learning_rate": 0.0005494221709765335, + "loss": 3.1669, + "step": 11480 + }, + { + "epoch": 0.56, + "grad_norm": 0.5334585905075073, + "learning_rate": 0.0005494136155458892, + "loss": 3.167, + "step": 11481 + }, + { + "epoch": 0.56, + "grad_norm": 0.5363363027572632, + "learning_rate": 0.0005494050594583373, + "loss": 3.2432, + "step": 11482 + }, + { + "epoch": 0.56, + "grad_norm": 0.5021289587020874, + "learning_rate": 0.0005493965027139005, + "loss": 3.4602, + "step": 11483 + }, + { + "epoch": 0.56, + "grad_norm": 0.5429815649986267, + "learning_rate": 0.0005493879453126013, + "loss": 3.1518, + "step": 11484 + }, + { + "epoch": 0.56, + "grad_norm": 0.48367413878440857, + "learning_rate": 0.0005493793872544625, + "loss": 3.1878, + "step": 11485 + }, + { + "epoch": 0.56, + "grad_norm": 0.5013885498046875, + "learning_rate": 0.0005493708285395061, + "loss": 3.0526, + "step": 11486 + }, + { + "epoch": 0.56, + "grad_norm": 0.5268964767456055, + "learning_rate": 0.000549362269167755, + "loss": 3.109, + "step": 11487 + }, + { + "epoch": 0.56, + "grad_norm": 0.5033860802650452, + "learning_rate": 0.0005493537091392316, + "loss": 3.3082, + "step": 11488 + }, + { + "epoch": 0.56, + "grad_norm": 0.535088300704956, + "learning_rate": 0.0005493451484539586, + "loss": 3.2002, + "step": 11489 + }, + { + "epoch": 0.56, + "grad_norm": 0.49352458119392395, + "learning_rate": 0.0005493365871119584, + "loss": 3.4408, + "step": 11490 + }, + { + "epoch": 0.56, + "grad_norm": 0.5212691426277161, + "learning_rate": 0.0005493280251132538, + "loss": 3.0987, + "step": 11491 + }, + { + "epoch": 0.56, + "grad_norm": 0.500873327255249, + "learning_rate": 0.000549319462457867, + "loss": 3.3256, + "step": 11492 + }, + { + "epoch": 0.56, + "grad_norm": 0.5535351037979126, + "learning_rate": 0.0005493108991458207, + "loss": 3.1447, + "step": 11493 + }, + { + "epoch": 0.56, + "grad_norm": 0.495970219373703, + "learning_rate": 0.0005493023351771376, + "loss": 3.2712, + "step": 11494 + }, + { + "epoch": 0.56, + "grad_norm": 0.5443682670593262, + "learning_rate": 0.0005492937705518401, + "loss": 3.0544, + "step": 11495 + }, + { + "epoch": 0.56, + "grad_norm": 0.5153055787086487, + "learning_rate": 0.0005492852052699507, + "loss": 3.3102, + "step": 11496 + }, + { + "epoch": 0.56, + "grad_norm": 0.48461559414863586, + "learning_rate": 0.0005492766393314921, + "loss": 3.2207, + "step": 11497 + }, + { + "epoch": 0.56, + "grad_norm": 0.541000247001648, + "learning_rate": 0.0005492680727364868, + "loss": 3.2273, + "step": 11498 + }, + { + "epoch": 0.56, + "grad_norm": 0.53602534532547, + "learning_rate": 0.0005492595054849572, + "loss": 3.2255, + "step": 11499 + }, + { + "epoch": 0.56, + "grad_norm": 0.46997231245040894, + "learning_rate": 0.0005492509375769264, + "loss": 3.197, + "step": 11500 + }, + { + "epoch": 0.56, + "grad_norm": 0.4768153429031372, + "learning_rate": 0.0005492423690124164, + "loss": 3.2706, + "step": 11501 + }, + { + "epoch": 0.56, + "grad_norm": 0.5005391836166382, + "learning_rate": 0.0005492337997914499, + "loss": 3.0595, + "step": 11502 + }, + { + "epoch": 0.56, + "grad_norm": 0.5174161195755005, + "learning_rate": 0.0005492252299140497, + "loss": 3.493, + "step": 11503 + }, + { + "epoch": 0.56, + "grad_norm": 0.5118815898895264, + "learning_rate": 0.000549216659380238, + "loss": 3.2988, + "step": 11504 + }, + { + "epoch": 0.56, + "grad_norm": 0.5185562372207642, + "learning_rate": 0.0005492080881900377, + "loss": 3.3501, + "step": 11505 + }, + { + "epoch": 0.56, + "grad_norm": 0.5072935819625854, + "learning_rate": 0.0005491995163434712, + "loss": 3.4849, + "step": 11506 + }, + { + "epoch": 0.56, + "grad_norm": 0.49706950783729553, + "learning_rate": 0.0005491909438405612, + "loss": 3.4126, + "step": 11507 + }, + { + "epoch": 0.56, + "grad_norm": 0.5042622685432434, + "learning_rate": 0.0005491823706813303, + "loss": 3.1778, + "step": 11508 + }, + { + "epoch": 0.56, + "grad_norm": 0.5073032379150391, + "learning_rate": 0.0005491737968658009, + "loss": 3.041, + "step": 11509 + }, + { + "epoch": 0.56, + "grad_norm": 0.5095604658126831, + "learning_rate": 0.0005491652223939956, + "loss": 3.3084, + "step": 11510 + }, + { + "epoch": 0.56, + "grad_norm": 0.5435943603515625, + "learning_rate": 0.0005491566472659372, + "loss": 3.1515, + "step": 11511 + }, + { + "epoch": 0.56, + "grad_norm": 0.51333087682724, + "learning_rate": 0.0005491480714816479, + "loss": 3.2505, + "step": 11512 + }, + { + "epoch": 0.56, + "grad_norm": 0.528915286064148, + "learning_rate": 0.0005491394950411508, + "loss": 3.1406, + "step": 11513 + }, + { + "epoch": 0.56, + "grad_norm": 0.5109853744506836, + "learning_rate": 0.000549130917944468, + "loss": 3.2142, + "step": 11514 + }, + { + "epoch": 0.56, + "grad_norm": 0.5281078219413757, + "learning_rate": 0.0005491223401916225, + "loss": 3.0415, + "step": 11515 + }, + { + "epoch": 0.56, + "grad_norm": 0.506549596786499, + "learning_rate": 0.0005491137617826366, + "loss": 3.6062, + "step": 11516 + }, + { + "epoch": 0.56, + "grad_norm": 0.5103456974029541, + "learning_rate": 0.000549105182717533, + "loss": 3.3239, + "step": 11517 + }, + { + "epoch": 0.56, + "grad_norm": 0.5172756910324097, + "learning_rate": 0.0005490966029963343, + "loss": 3.1329, + "step": 11518 + }, + { + "epoch": 0.56, + "grad_norm": 0.5389090776443481, + "learning_rate": 0.0005490880226190632, + "loss": 3.1909, + "step": 11519 + }, + { + "epoch": 0.56, + "grad_norm": 0.5944215059280396, + "learning_rate": 0.000549079441585742, + "loss": 3.2218, + "step": 11520 + }, + { + "epoch": 0.56, + "grad_norm": 0.525435745716095, + "learning_rate": 0.0005490708598963936, + "loss": 3.2495, + "step": 11521 + }, + { + "epoch": 0.56, + "grad_norm": 0.511767566204071, + "learning_rate": 0.0005490622775510404, + "loss": 3.2823, + "step": 11522 + }, + { + "epoch": 0.56, + "grad_norm": 0.5037621855735779, + "learning_rate": 0.0005490536945497052, + "loss": 3.3753, + "step": 11523 + }, + { + "epoch": 0.56, + "grad_norm": 0.5477989315986633, + "learning_rate": 0.0005490451108924104, + "loss": 3.299, + "step": 11524 + }, + { + "epoch": 0.56, + "grad_norm": 0.5245879292488098, + "learning_rate": 0.0005490365265791787, + "loss": 3.1515, + "step": 11525 + }, + { + "epoch": 0.56, + "grad_norm": 0.48814061284065247, + "learning_rate": 0.0005490279416100328, + "loss": 3.154, + "step": 11526 + }, + { + "epoch": 0.56, + "grad_norm": 0.5687476992607117, + "learning_rate": 0.0005490193559849953, + "loss": 3.1192, + "step": 11527 + }, + { + "epoch": 0.56, + "grad_norm": 0.49493011832237244, + "learning_rate": 0.0005490107697040886, + "loss": 3.0917, + "step": 11528 + }, + { + "epoch": 0.57, + "grad_norm": 0.5287251472473145, + "learning_rate": 0.0005490021827673355, + "loss": 3.2482, + "step": 11529 + }, + { + "epoch": 0.57, + "grad_norm": 0.5165507197380066, + "learning_rate": 0.0005489935951747586, + "loss": 3.2542, + "step": 11530 + }, + { + "epoch": 0.57, + "grad_norm": 0.48569759726524353, + "learning_rate": 0.0005489850069263804, + "loss": 3.2343, + "step": 11531 + }, + { + "epoch": 0.57, + "grad_norm": 0.5181214809417725, + "learning_rate": 0.0005489764180222237, + "loss": 3.5893, + "step": 11532 + }, + { + "epoch": 0.57, + "grad_norm": 0.5267981886863708, + "learning_rate": 0.0005489678284623109, + "loss": 3.2945, + "step": 11533 + }, + { + "epoch": 0.57, + "grad_norm": 0.5052371025085449, + "learning_rate": 0.0005489592382466649, + "loss": 3.2504, + "step": 11534 + }, + { + "epoch": 0.57, + "grad_norm": 0.5270872116088867, + "learning_rate": 0.0005489506473753082, + "loss": 3.1425, + "step": 11535 + }, + { + "epoch": 0.57, + "grad_norm": 0.5012282133102417, + "learning_rate": 0.0005489420558482634, + "loss": 3.3208, + "step": 11536 + }, + { + "epoch": 0.57, + "grad_norm": 0.5245894193649292, + "learning_rate": 0.0005489334636655529, + "loss": 3.1897, + "step": 11537 + }, + { + "epoch": 0.57, + "grad_norm": 0.5544889569282532, + "learning_rate": 0.0005489248708271999, + "loss": 3.4318, + "step": 11538 + }, + { + "epoch": 0.57, + "grad_norm": 0.5470820069313049, + "learning_rate": 0.0005489162773332265, + "loss": 3.0173, + "step": 11539 + }, + { + "epoch": 0.57, + "grad_norm": 0.5013363361358643, + "learning_rate": 0.0005489076831836555, + "loss": 3.3224, + "step": 11540 + }, + { + "epoch": 0.57, + "grad_norm": 0.514118492603302, + "learning_rate": 0.0005488990883785097, + "loss": 3.0869, + "step": 11541 + }, + { + "epoch": 0.57, + "grad_norm": 0.5037225484848022, + "learning_rate": 0.0005488904929178115, + "loss": 3.1977, + "step": 11542 + }, + { + "epoch": 0.57, + "grad_norm": 0.5562716722488403, + "learning_rate": 0.0005488818968015836, + "loss": 3.3027, + "step": 11543 + }, + { + "epoch": 0.57, + "grad_norm": 0.4860294759273529, + "learning_rate": 0.0005488733000298488, + "loss": 3.2274, + "step": 11544 + }, + { + "epoch": 0.57, + "grad_norm": 0.5283397436141968, + "learning_rate": 0.0005488647026026296, + "loss": 3.3057, + "step": 11545 + }, + { + "epoch": 0.57, + "grad_norm": 0.5059245824813843, + "learning_rate": 0.0005488561045199487, + "loss": 3.3249, + "step": 11546 + }, + { + "epoch": 0.57, + "grad_norm": 0.5175250172615051, + "learning_rate": 0.0005488475057818286, + "loss": 3.2407, + "step": 11547 + }, + { + "epoch": 0.57, + "grad_norm": 0.505158007144928, + "learning_rate": 0.0005488389063882922, + "loss": 3.3951, + "step": 11548 + }, + { + "epoch": 0.57, + "grad_norm": 0.48427003622055054, + "learning_rate": 0.0005488303063393619, + "loss": 3.2815, + "step": 11549 + }, + { + "epoch": 0.57, + "grad_norm": 0.49035337567329407, + "learning_rate": 0.0005488217056350605, + "loss": 3.3064, + "step": 11550 + }, + { + "epoch": 0.57, + "grad_norm": 0.5698608756065369, + "learning_rate": 0.0005488131042754107, + "loss": 3.2583, + "step": 11551 + }, + { + "epoch": 0.57, + "grad_norm": 0.5469956994056702, + "learning_rate": 0.0005488045022604349, + "loss": 3.324, + "step": 11552 + }, + { + "epoch": 0.57, + "grad_norm": 0.5337932109832764, + "learning_rate": 0.000548795899590156, + "loss": 3.0973, + "step": 11553 + }, + { + "epoch": 0.57, + "grad_norm": 0.5492282509803772, + "learning_rate": 0.0005487872962645966, + "loss": 3.218, + "step": 11554 + }, + { + "epoch": 0.57, + "grad_norm": 0.5477828979492188, + "learning_rate": 0.0005487786922837793, + "loss": 3.224, + "step": 11555 + }, + { + "epoch": 0.57, + "grad_norm": 0.4647465944290161, + "learning_rate": 0.0005487700876477268, + "loss": 3.1174, + "step": 11556 + }, + { + "epoch": 0.57, + "grad_norm": 0.5192469954490662, + "learning_rate": 0.0005487614823564619, + "loss": 3.1721, + "step": 11557 + }, + { + "epoch": 0.57, + "grad_norm": 0.5096564888954163, + "learning_rate": 0.000548752876410007, + "loss": 3.0154, + "step": 11558 + }, + { + "epoch": 0.57, + "grad_norm": 0.5024121999740601, + "learning_rate": 0.0005487442698083848, + "loss": 3.3643, + "step": 11559 + }, + { + "epoch": 0.57, + "grad_norm": 0.5501764416694641, + "learning_rate": 0.0005487356625516183, + "loss": 3.3475, + "step": 11560 + }, + { + "epoch": 0.57, + "grad_norm": 0.5160467624664307, + "learning_rate": 0.0005487270546397299, + "loss": 3.1848, + "step": 11561 + }, + { + "epoch": 0.57, + "grad_norm": 0.5589426159858704, + "learning_rate": 0.0005487184460727422, + "loss": 2.9357, + "step": 11562 + }, + { + "epoch": 0.57, + "grad_norm": 0.5194653868675232, + "learning_rate": 0.000548709836850678, + "loss": 3.4209, + "step": 11563 + }, + { + "epoch": 0.57, + "grad_norm": 0.5185741782188416, + "learning_rate": 0.0005487012269735599, + "loss": 3.1228, + "step": 11564 + }, + { + "epoch": 0.57, + "grad_norm": 0.4845937192440033, + "learning_rate": 0.0005486926164414108, + "loss": 3.3671, + "step": 11565 + }, + { + "epoch": 0.57, + "grad_norm": 0.561070442199707, + "learning_rate": 0.0005486840052542531, + "loss": 3.0674, + "step": 11566 + }, + { + "epoch": 0.57, + "grad_norm": 0.5006464719772339, + "learning_rate": 0.0005486753934121095, + "loss": 3.2313, + "step": 11567 + }, + { + "epoch": 0.57, + "grad_norm": 0.49923497438430786, + "learning_rate": 0.0005486667809150029, + "loss": 3.453, + "step": 11568 + }, + { + "epoch": 0.57, + "grad_norm": 0.5308060050010681, + "learning_rate": 0.0005486581677629558, + "loss": 3.2375, + "step": 11569 + }, + { + "epoch": 0.57, + "grad_norm": 0.5206416249275208, + "learning_rate": 0.000548649553955991, + "loss": 3.1938, + "step": 11570 + }, + { + "epoch": 0.57, + "grad_norm": 0.5626609325408936, + "learning_rate": 0.0005486409394941311, + "loss": 3.2765, + "step": 11571 + }, + { + "epoch": 0.57, + "grad_norm": 0.5191032290458679, + "learning_rate": 0.0005486323243773988, + "loss": 3.3281, + "step": 11572 + }, + { + "epoch": 0.57, + "grad_norm": 0.503855288028717, + "learning_rate": 0.0005486237086058169, + "loss": 3.0798, + "step": 11573 + }, + { + "epoch": 0.57, + "grad_norm": 0.5209179520606995, + "learning_rate": 0.0005486150921794079, + "loss": 3.1159, + "step": 11574 + }, + { + "epoch": 0.57, + "grad_norm": 0.514619767665863, + "learning_rate": 0.0005486064750981946, + "loss": 3.2129, + "step": 11575 + }, + { + "epoch": 0.57, + "grad_norm": 0.5100206136703491, + "learning_rate": 0.0005485978573621997, + "loss": 3.2254, + "step": 11576 + }, + { + "epoch": 0.57, + "grad_norm": 0.48534801602363586, + "learning_rate": 0.0005485892389714459, + "loss": 3.2878, + "step": 11577 + }, + { + "epoch": 0.57, + "grad_norm": 0.5108082294464111, + "learning_rate": 0.000548580619925956, + "loss": 3.3976, + "step": 11578 + }, + { + "epoch": 0.57, + "grad_norm": 0.5104441046714783, + "learning_rate": 0.0005485720002257524, + "loss": 3.0612, + "step": 11579 + }, + { + "epoch": 0.57, + "grad_norm": 0.5448250770568848, + "learning_rate": 0.000548563379870858, + "loss": 3.1129, + "step": 11580 + }, + { + "epoch": 0.57, + "grad_norm": 0.4960390329360962, + "learning_rate": 0.0005485547588612956, + "loss": 3.0571, + "step": 11581 + }, + { + "epoch": 0.57, + "grad_norm": 0.515153706073761, + "learning_rate": 0.0005485461371970878, + "loss": 3.1635, + "step": 11582 + }, + { + "epoch": 0.57, + "grad_norm": 0.5046236515045166, + "learning_rate": 0.0005485375148782572, + "loss": 3.1507, + "step": 11583 + }, + { + "epoch": 0.57, + "grad_norm": 0.5140905976295471, + "learning_rate": 0.0005485288919048266, + "loss": 3.2966, + "step": 11584 + }, + { + "epoch": 0.57, + "grad_norm": 0.5744099617004395, + "learning_rate": 0.0005485202682768189, + "loss": 3.2489, + "step": 11585 + }, + { + "epoch": 0.57, + "grad_norm": 0.49185919761657715, + "learning_rate": 0.0005485116439942566, + "loss": 3.2854, + "step": 11586 + }, + { + "epoch": 0.57, + "grad_norm": 0.5120055675506592, + "learning_rate": 0.0005485030190571623, + "loss": 3.2128, + "step": 11587 + }, + { + "epoch": 0.57, + "grad_norm": 0.5046406388282776, + "learning_rate": 0.000548494393465559, + "loss": 2.9364, + "step": 11588 + }, + { + "epoch": 0.57, + "grad_norm": 0.5551308393478394, + "learning_rate": 0.0005484857672194693, + "loss": 3.1919, + "step": 11589 + }, + { + "epoch": 0.57, + "grad_norm": 0.5424243807792664, + "learning_rate": 0.0005484771403189158, + "loss": 3.4059, + "step": 11590 + }, + { + "epoch": 0.57, + "grad_norm": 0.5322892069816589, + "learning_rate": 0.0005484685127639215, + "loss": 3.1558, + "step": 11591 + }, + { + "epoch": 0.57, + "grad_norm": 0.5228599905967712, + "learning_rate": 0.0005484598845545089, + "loss": 3.4993, + "step": 11592 + }, + { + "epoch": 0.57, + "grad_norm": 0.5182601809501648, + "learning_rate": 0.0005484512556907007, + "loss": 3.2109, + "step": 11593 + }, + { + "epoch": 0.57, + "grad_norm": 0.5158981084823608, + "learning_rate": 0.0005484426261725198, + "loss": 3.4712, + "step": 11594 + }, + { + "epoch": 0.57, + "grad_norm": 0.565634548664093, + "learning_rate": 0.0005484339959999887, + "loss": 3.5278, + "step": 11595 + }, + { + "epoch": 0.57, + "grad_norm": 0.5130377411842346, + "learning_rate": 0.0005484253651731305, + "loss": 3.1502, + "step": 11596 + }, + { + "epoch": 0.57, + "grad_norm": 0.5539126992225647, + "learning_rate": 0.0005484167336919675, + "loss": 3.3652, + "step": 11597 + }, + { + "epoch": 0.57, + "grad_norm": 0.5684749484062195, + "learning_rate": 0.0005484081015565226, + "loss": 3.0461, + "step": 11598 + }, + { + "epoch": 0.57, + "grad_norm": 0.5369781851768494, + "learning_rate": 0.0005483994687668187, + "loss": 3.3655, + "step": 11599 + }, + { + "epoch": 0.57, + "grad_norm": 0.5240543484687805, + "learning_rate": 0.0005483908353228784, + "loss": 3.387, + "step": 11600 + }, + { + "epoch": 0.57, + "grad_norm": 0.520671010017395, + "learning_rate": 0.0005483822012247244, + "loss": 3.1823, + "step": 11601 + }, + { + "epoch": 0.57, + "grad_norm": 0.4907285273075104, + "learning_rate": 0.0005483735664723795, + "loss": 3.2514, + "step": 11602 + }, + { + "epoch": 0.57, + "grad_norm": 0.519844651222229, + "learning_rate": 0.0005483649310658665, + "loss": 3.4297, + "step": 11603 + }, + { + "epoch": 0.57, + "grad_norm": 0.5508689880371094, + "learning_rate": 0.0005483562950052079, + "loss": 3.2559, + "step": 11604 + }, + { + "epoch": 0.57, + "grad_norm": 0.595680832862854, + "learning_rate": 0.0005483476582904268, + "loss": 3.2321, + "step": 11605 + }, + { + "epoch": 0.57, + "grad_norm": 0.4791205823421478, + "learning_rate": 0.0005483390209215456, + "loss": 3.0654, + "step": 11606 + }, + { + "epoch": 0.57, + "grad_norm": 0.4992697238922119, + "learning_rate": 0.0005483303828985873, + "loss": 3.3038, + "step": 11607 + }, + { + "epoch": 0.57, + "grad_norm": 0.5372535586357117, + "learning_rate": 0.0005483217442215745, + "loss": 3.349, + "step": 11608 + }, + { + "epoch": 0.57, + "grad_norm": 0.5181151032447815, + "learning_rate": 0.0005483131048905301, + "loss": 3.1583, + "step": 11609 + }, + { + "epoch": 0.57, + "grad_norm": 0.5052180886268616, + "learning_rate": 0.0005483044649054767, + "loss": 3.4163, + "step": 11610 + }, + { + "epoch": 0.57, + "grad_norm": 0.5012765526771545, + "learning_rate": 0.0005482958242664373, + "loss": 3.4786, + "step": 11611 + }, + { + "epoch": 0.57, + "grad_norm": 0.5442771911621094, + "learning_rate": 0.0005482871829734342, + "loss": 3.1562, + "step": 11612 + }, + { + "epoch": 0.57, + "grad_norm": 0.5303670167922974, + "learning_rate": 0.0005482785410264907, + "loss": 3.2916, + "step": 11613 + }, + { + "epoch": 0.57, + "grad_norm": 0.5204977989196777, + "learning_rate": 0.0005482698984256291, + "loss": 3.2238, + "step": 11614 + }, + { + "epoch": 0.57, + "grad_norm": 0.4961937367916107, + "learning_rate": 0.0005482612551708725, + "loss": 3.3505, + "step": 11615 + }, + { + "epoch": 0.57, + "grad_norm": 0.5263996124267578, + "learning_rate": 0.0005482526112622435, + "loss": 3.1029, + "step": 11616 + }, + { + "epoch": 0.57, + "grad_norm": 0.5191607475280762, + "learning_rate": 0.0005482439666997648, + "loss": 3.1744, + "step": 11617 + }, + { + "epoch": 0.57, + "grad_norm": 0.5280757546424866, + "learning_rate": 0.0005482353214834594, + "loss": 3.1277, + "step": 11618 + }, + { + "epoch": 0.57, + "grad_norm": 0.583758533000946, + "learning_rate": 0.0005482266756133498, + "loss": 3.2388, + "step": 11619 + }, + { + "epoch": 0.57, + "grad_norm": 0.49690064787864685, + "learning_rate": 0.0005482180290894592, + "loss": 3.2565, + "step": 11620 + }, + { + "epoch": 0.57, + "grad_norm": 0.4999449849128723, + "learning_rate": 0.0005482093819118098, + "loss": 3.1643, + "step": 11621 + }, + { + "epoch": 0.57, + "grad_norm": 0.4750107228755951, + "learning_rate": 0.0005482007340804248, + "loss": 2.9841, + "step": 11622 + }, + { + "epoch": 0.57, + "grad_norm": 0.5504389405250549, + "learning_rate": 0.0005481920855953268, + "loss": 3.0009, + "step": 11623 + }, + { + "epoch": 0.57, + "grad_norm": 0.5175572037696838, + "learning_rate": 0.0005481834364565386, + "loss": 3.3435, + "step": 11624 + }, + { + "epoch": 0.57, + "grad_norm": 0.5125237107276917, + "learning_rate": 0.000548174786664083, + "loss": 3.5311, + "step": 11625 + }, + { + "epoch": 0.57, + "grad_norm": 0.556337296962738, + "learning_rate": 0.0005481661362179827, + "loss": 3.2554, + "step": 11626 + }, + { + "epoch": 0.57, + "grad_norm": 0.5328450202941895, + "learning_rate": 0.0005481574851182606, + "loss": 3.2114, + "step": 11627 + }, + { + "epoch": 0.57, + "grad_norm": 0.5400496125221252, + "learning_rate": 0.0005481488333649395, + "loss": 3.2479, + "step": 11628 + }, + { + "epoch": 0.57, + "grad_norm": 0.529948890209198, + "learning_rate": 0.0005481401809580421, + "loss": 3.2482, + "step": 11629 + }, + { + "epoch": 0.57, + "grad_norm": 0.5066934823989868, + "learning_rate": 0.0005481315278975911, + "loss": 3.4325, + "step": 11630 + }, + { + "epoch": 0.57, + "grad_norm": 0.4977806806564331, + "learning_rate": 0.0005481228741836096, + "loss": 3.3261, + "step": 11631 + }, + { + "epoch": 0.57, + "grad_norm": 0.526506245136261, + "learning_rate": 0.0005481142198161201, + "loss": 3.1154, + "step": 11632 + }, + { + "epoch": 0.57, + "grad_norm": 0.6684457659721375, + "learning_rate": 0.0005481055647951456, + "loss": 3.2972, + "step": 11633 + }, + { + "epoch": 0.57, + "grad_norm": 0.49738237261772156, + "learning_rate": 0.0005480969091207086, + "loss": 3.2738, + "step": 11634 + }, + { + "epoch": 0.57, + "grad_norm": 0.5682615637779236, + "learning_rate": 0.0005480882527928322, + "loss": 3.3088, + "step": 11635 + }, + { + "epoch": 0.57, + "grad_norm": 0.5242875814437866, + "learning_rate": 0.0005480795958115391, + "loss": 3.2557, + "step": 11636 + }, + { + "epoch": 0.57, + "grad_norm": 0.5072360038757324, + "learning_rate": 0.0005480709381768521, + "loss": 3.0437, + "step": 11637 + }, + { + "epoch": 0.57, + "grad_norm": 0.5002800822257996, + "learning_rate": 0.000548062279888794, + "loss": 3.2534, + "step": 11638 + }, + { + "epoch": 0.57, + "grad_norm": 0.5526995062828064, + "learning_rate": 0.0005480536209473874, + "loss": 3.0043, + "step": 11639 + }, + { + "epoch": 0.57, + "grad_norm": 0.5290579199790955, + "learning_rate": 0.0005480449613526555, + "loss": 3.2353, + "step": 11640 + }, + { + "epoch": 0.57, + "grad_norm": 0.5533804893493652, + "learning_rate": 0.0005480363011046208, + "loss": 3.3581, + "step": 11641 + }, + { + "epoch": 0.57, + "grad_norm": 0.4938673973083496, + "learning_rate": 0.0005480276402033064, + "loss": 3.2501, + "step": 11642 + }, + { + "epoch": 0.57, + "grad_norm": 0.5148259401321411, + "learning_rate": 0.0005480189786487348, + "loss": 3.3157, + "step": 11643 + }, + { + "epoch": 0.57, + "grad_norm": 0.5525953769683838, + "learning_rate": 0.0005480103164409289, + "loss": 3.0652, + "step": 11644 + }, + { + "epoch": 0.57, + "grad_norm": 0.5157591104507446, + "learning_rate": 0.0005480016535799117, + "loss": 3.2691, + "step": 11645 + }, + { + "epoch": 0.57, + "grad_norm": 0.4905431866645813, + "learning_rate": 0.0005479929900657057, + "loss": 3.1825, + "step": 11646 + }, + { + "epoch": 0.57, + "grad_norm": 0.49346473813056946, + "learning_rate": 0.000547984325898334, + "loss": 3.2634, + "step": 11647 + }, + { + "epoch": 0.57, + "grad_norm": 0.5140097141265869, + "learning_rate": 0.0005479756610778194, + "loss": 3.1073, + "step": 11648 + }, + { + "epoch": 0.57, + "grad_norm": 0.4972335398197174, + "learning_rate": 0.0005479669956041844, + "loss": 3.2337, + "step": 11649 + }, + { + "epoch": 0.57, + "grad_norm": 0.5696159601211548, + "learning_rate": 0.0005479583294774522, + "loss": 3.2024, + "step": 11650 + }, + { + "epoch": 0.57, + "grad_norm": 0.5201365947723389, + "learning_rate": 0.0005479496626976455, + "loss": 3.4275, + "step": 11651 + }, + { + "epoch": 0.57, + "grad_norm": 0.5507444143295288, + "learning_rate": 0.000547940995264787, + "loss": 3.0116, + "step": 11652 + }, + { + "epoch": 0.57, + "grad_norm": 0.5264685153961182, + "learning_rate": 0.0005479323271788997, + "loss": 3.2018, + "step": 11653 + }, + { + "epoch": 0.57, + "grad_norm": 0.5618388652801514, + "learning_rate": 0.0005479236584400065, + "loss": 3.2548, + "step": 11654 + }, + { + "epoch": 0.57, + "grad_norm": 0.5223483443260193, + "learning_rate": 0.0005479149890481299, + "loss": 3.2424, + "step": 11655 + }, + { + "epoch": 0.57, + "grad_norm": 0.5244974493980408, + "learning_rate": 0.000547906319003293, + "loss": 3.221, + "step": 11656 + }, + { + "epoch": 0.57, + "grad_norm": 0.5352416634559631, + "learning_rate": 0.0005478976483055185, + "loss": 3.1539, + "step": 11657 + }, + { + "epoch": 0.57, + "grad_norm": 0.5786653757095337, + "learning_rate": 0.0005478889769548295, + "loss": 3.2923, + "step": 11658 + }, + { + "epoch": 0.57, + "grad_norm": 0.5236338376998901, + "learning_rate": 0.0005478803049512484, + "loss": 3.1554, + "step": 11659 + }, + { + "epoch": 0.57, + "grad_norm": 0.529576301574707, + "learning_rate": 0.0005478716322947985, + "loss": 3.2727, + "step": 11660 + }, + { + "epoch": 0.57, + "grad_norm": 0.5126217007637024, + "learning_rate": 0.0005478629589855022, + "loss": 3.3197, + "step": 11661 + }, + { + "epoch": 0.57, + "grad_norm": 0.5071096420288086, + "learning_rate": 0.0005478542850233827, + "loss": 3.4426, + "step": 11662 + }, + { + "epoch": 0.57, + "grad_norm": 0.5255385637283325, + "learning_rate": 0.0005478456104084627, + "loss": 3.2004, + "step": 11663 + }, + { + "epoch": 0.57, + "grad_norm": 0.5327421426773071, + "learning_rate": 0.0005478369351407651, + "loss": 3.1227, + "step": 11664 + }, + { + "epoch": 0.57, + "grad_norm": 0.5336896181106567, + "learning_rate": 0.0005478282592203126, + "loss": 3.2945, + "step": 11665 + }, + { + "epoch": 0.57, + "grad_norm": 0.4557580351829529, + "learning_rate": 0.0005478195826471282, + "loss": 3.2835, + "step": 11666 + }, + { + "epoch": 0.57, + "grad_norm": 0.5098157525062561, + "learning_rate": 0.0005478109054212349, + "loss": 3.3144, + "step": 11667 + }, + { + "epoch": 0.57, + "grad_norm": 0.5083901286125183, + "learning_rate": 0.0005478022275426551, + "loss": 3.1161, + "step": 11668 + }, + { + "epoch": 0.57, + "grad_norm": 0.5286469459533691, + "learning_rate": 0.000547793549011412, + "loss": 3.2938, + "step": 11669 + }, + { + "epoch": 0.57, + "grad_norm": 0.5404655933380127, + "learning_rate": 0.0005477848698275285, + "loss": 3.1669, + "step": 11670 + }, + { + "epoch": 0.57, + "grad_norm": 0.5109778046607971, + "learning_rate": 0.0005477761899910272, + "loss": 3.2569, + "step": 11671 + }, + { + "epoch": 0.57, + "grad_norm": 0.5234958529472351, + "learning_rate": 0.0005477675095019312, + "loss": 3.2914, + "step": 11672 + }, + { + "epoch": 0.57, + "grad_norm": 0.5528802871704102, + "learning_rate": 0.0005477588283602632, + "loss": 3.2845, + "step": 11673 + }, + { + "epoch": 0.57, + "grad_norm": 0.5386664867401123, + "learning_rate": 0.0005477501465660461, + "loss": 3.344, + "step": 11674 + }, + { + "epoch": 0.57, + "grad_norm": 0.5150508284568787, + "learning_rate": 0.0005477414641193028, + "loss": 3.3954, + "step": 11675 + }, + { + "epoch": 0.57, + "grad_norm": 0.4998857378959656, + "learning_rate": 0.0005477327810200562, + "loss": 3.3447, + "step": 11676 + }, + { + "epoch": 0.57, + "grad_norm": 0.5428280234336853, + "learning_rate": 0.0005477240972683292, + "loss": 3.1212, + "step": 11677 + }, + { + "epoch": 0.57, + "grad_norm": 0.5053800940513611, + "learning_rate": 0.0005477154128641445, + "loss": 3.2313, + "step": 11678 + }, + { + "epoch": 0.57, + "grad_norm": 0.586479663848877, + "learning_rate": 0.0005477067278075251, + "loss": 3.3651, + "step": 11679 + }, + { + "epoch": 0.57, + "grad_norm": 0.5042681694030762, + "learning_rate": 0.000547698042098494, + "loss": 3.2991, + "step": 11680 + }, + { + "epoch": 0.57, + "grad_norm": 0.5273053646087646, + "learning_rate": 0.0005476893557370737, + "loss": 3.1918, + "step": 11681 + }, + { + "epoch": 0.57, + "grad_norm": 0.5238834023475647, + "learning_rate": 0.0005476806687232874, + "loss": 3.3427, + "step": 11682 + }, + { + "epoch": 0.57, + "grad_norm": 0.5385345220565796, + "learning_rate": 0.0005476719810571579, + "loss": 3.2813, + "step": 11683 + }, + { + "epoch": 0.57, + "grad_norm": 0.4971548318862915, + "learning_rate": 0.000547663292738708, + "loss": 3.2153, + "step": 11684 + }, + { + "epoch": 0.57, + "grad_norm": 0.5180023908615112, + "learning_rate": 0.0005476546037679608, + "loss": 3.3399, + "step": 11685 + }, + { + "epoch": 0.57, + "grad_norm": 0.510409414768219, + "learning_rate": 0.0005476459141449388, + "loss": 3.1909, + "step": 11686 + }, + { + "epoch": 0.57, + "grad_norm": 0.555425763130188, + "learning_rate": 0.0005476372238696653, + "loss": 3.1216, + "step": 11687 + }, + { + "epoch": 0.57, + "grad_norm": 0.5270985960960388, + "learning_rate": 0.0005476285329421629, + "loss": 3.1847, + "step": 11688 + }, + { + "epoch": 0.57, + "grad_norm": 0.5697065591812134, + "learning_rate": 0.0005476198413624548, + "loss": 3.1772, + "step": 11689 + }, + { + "epoch": 0.57, + "grad_norm": 0.514995813369751, + "learning_rate": 0.0005476111491305635, + "loss": 3.3174, + "step": 11690 + }, + { + "epoch": 0.57, + "grad_norm": 0.5724436044692993, + "learning_rate": 0.0005476024562465121, + "loss": 3.3164, + "step": 11691 + }, + { + "epoch": 0.57, + "grad_norm": 0.5155641436576843, + "learning_rate": 0.0005475937627103237, + "loss": 3.0523, + "step": 11692 + }, + { + "epoch": 0.57, + "grad_norm": 0.4877888560295105, + "learning_rate": 0.0005475850685220208, + "loss": 3.2307, + "step": 11693 + }, + { + "epoch": 0.57, + "grad_norm": 0.49294334650039673, + "learning_rate": 0.0005475763736816264, + "loss": 3.0084, + "step": 11694 + }, + { + "epoch": 0.57, + "grad_norm": 0.4982193410396576, + "learning_rate": 0.0005475676781891636, + "loss": 3.3148, + "step": 11695 + }, + { + "epoch": 0.57, + "grad_norm": 0.6280152797698975, + "learning_rate": 0.0005475589820446552, + "loss": 3.3086, + "step": 11696 + }, + { + "epoch": 0.57, + "grad_norm": 0.5116281509399414, + "learning_rate": 0.000547550285248124, + "loss": 3.2566, + "step": 11697 + }, + { + "epoch": 0.57, + "grad_norm": 0.5089260339736938, + "learning_rate": 0.000547541587799593, + "loss": 3.1644, + "step": 11698 + }, + { + "epoch": 0.57, + "grad_norm": 0.5232623815536499, + "learning_rate": 0.0005475328896990851, + "loss": 3.2846, + "step": 11699 + }, + { + "epoch": 0.57, + "grad_norm": 0.5016197562217712, + "learning_rate": 0.0005475241909466234, + "loss": 3.0528, + "step": 11700 + }, + { + "epoch": 0.57, + "grad_norm": 0.5107477307319641, + "learning_rate": 0.0005475154915422304, + "loss": 3.0808, + "step": 11701 + }, + { + "epoch": 0.57, + "grad_norm": 0.5307403206825256, + "learning_rate": 0.0005475067914859292, + "loss": 3.2766, + "step": 11702 + }, + { + "epoch": 0.57, + "grad_norm": 0.50713050365448, + "learning_rate": 0.0005474980907777428, + "loss": 3.3566, + "step": 11703 + }, + { + "epoch": 0.57, + "grad_norm": 0.4886563718318939, + "learning_rate": 0.0005474893894176941, + "loss": 3.3369, + "step": 11704 + }, + { + "epoch": 0.57, + "grad_norm": 0.5123024582862854, + "learning_rate": 0.000547480687405806, + "loss": 3.1893, + "step": 11705 + }, + { + "epoch": 0.57, + "grad_norm": 0.510797917842865, + "learning_rate": 0.0005474719847421015, + "loss": 3.0718, + "step": 11706 + }, + { + "epoch": 0.57, + "grad_norm": 0.558925449848175, + "learning_rate": 0.0005474632814266031, + "loss": 2.9562, + "step": 11707 + }, + { + "epoch": 0.57, + "grad_norm": 0.5396509170532227, + "learning_rate": 0.0005474545774593343, + "loss": 3.4154, + "step": 11708 + }, + { + "epoch": 0.57, + "grad_norm": 0.5025230050086975, + "learning_rate": 0.0005474458728403176, + "loss": 3.0728, + "step": 11709 + }, + { + "epoch": 0.57, + "grad_norm": 0.5266216993331909, + "learning_rate": 0.0005474371675695762, + "loss": 3.4107, + "step": 11710 + }, + { + "epoch": 0.57, + "grad_norm": 0.5457305312156677, + "learning_rate": 0.000547428461647133, + "loss": 3.1522, + "step": 11711 + }, + { + "epoch": 0.57, + "grad_norm": 0.522469162940979, + "learning_rate": 0.0005474197550730107, + "loss": 3.5021, + "step": 11712 + }, + { + "epoch": 0.57, + "grad_norm": 0.4818090796470642, + "learning_rate": 0.0005474110478472325, + "loss": 3.0651, + "step": 11713 + }, + { + "epoch": 0.57, + "grad_norm": 0.520215630531311, + "learning_rate": 0.0005474023399698212, + "loss": 3.3468, + "step": 11714 + }, + { + "epoch": 0.57, + "grad_norm": 0.5050302743911743, + "learning_rate": 0.0005473936314407996, + "loss": 3.144, + "step": 11715 + }, + { + "epoch": 0.57, + "grad_norm": 0.5783846378326416, + "learning_rate": 0.0005473849222601909, + "loss": 3.2924, + "step": 11716 + }, + { + "epoch": 0.57, + "grad_norm": 0.4962663948535919, + "learning_rate": 0.0005473762124280179, + "loss": 3.0763, + "step": 11717 + }, + { + "epoch": 0.57, + "grad_norm": 0.48002344369888306, + "learning_rate": 0.0005473675019443036, + "loss": 3.1622, + "step": 11718 + }, + { + "epoch": 0.57, + "grad_norm": 0.5629715919494629, + "learning_rate": 0.0005473587908090709, + "loss": 3.351, + "step": 11719 + }, + { + "epoch": 0.57, + "grad_norm": 0.5525310039520264, + "learning_rate": 0.0005473500790223428, + "loss": 3.2598, + "step": 11720 + }, + { + "epoch": 0.57, + "grad_norm": 0.47623834013938904, + "learning_rate": 0.000547341366584142, + "loss": 3.2051, + "step": 11721 + }, + { + "epoch": 0.57, + "grad_norm": 0.546389639377594, + "learning_rate": 0.0005473326534944918, + "loss": 3.1717, + "step": 11722 + }, + { + "epoch": 0.57, + "grad_norm": 0.4834481477737427, + "learning_rate": 0.000547323939753415, + "loss": 3.2224, + "step": 11723 + }, + { + "epoch": 0.57, + "grad_norm": 0.5237901210784912, + "learning_rate": 0.0005473152253609345, + "loss": 3.1499, + "step": 11724 + }, + { + "epoch": 0.57, + "grad_norm": 0.5636874437332153, + "learning_rate": 0.0005473065103170733, + "loss": 3.1321, + "step": 11725 + }, + { + "epoch": 0.57, + "grad_norm": 0.5102803707122803, + "learning_rate": 0.0005472977946218543, + "loss": 3.3097, + "step": 11726 + }, + { + "epoch": 0.57, + "grad_norm": 0.5490740537643433, + "learning_rate": 0.0005472890782753006, + "loss": 3.3188, + "step": 11727 + }, + { + "epoch": 0.57, + "grad_norm": 0.5862623453140259, + "learning_rate": 0.000547280361277435, + "loss": 3.1409, + "step": 11728 + }, + { + "epoch": 0.57, + "grad_norm": 0.48724859952926636, + "learning_rate": 0.0005472716436282806, + "loss": 3.4519, + "step": 11729 + }, + { + "epoch": 0.57, + "grad_norm": 0.5731695294380188, + "learning_rate": 0.0005472629253278601, + "loss": 3.2008, + "step": 11730 + }, + { + "epoch": 0.57, + "grad_norm": 0.5652557611465454, + "learning_rate": 0.0005472542063761968, + "loss": 3.2627, + "step": 11731 + }, + { + "epoch": 0.57, + "grad_norm": 0.49549639225006104, + "learning_rate": 0.0005472454867733134, + "loss": 3.2696, + "step": 11732 + }, + { + "epoch": 0.58, + "grad_norm": 0.5207010507583618, + "learning_rate": 0.0005472367665192331, + "loss": 3.2412, + "step": 11733 + }, + { + "epoch": 0.58, + "grad_norm": 0.5095499157905579, + "learning_rate": 0.0005472280456139786, + "loss": 3.066, + "step": 11734 + }, + { + "epoch": 0.58, + "grad_norm": 0.5058441162109375, + "learning_rate": 0.000547219324057573, + "loss": 3.3581, + "step": 11735 + }, + { + "epoch": 0.58, + "grad_norm": 0.536139190196991, + "learning_rate": 0.0005472106018500394, + "loss": 3.2069, + "step": 11736 + }, + { + "epoch": 0.58, + "grad_norm": 0.5336572527885437, + "learning_rate": 0.0005472018789914007, + "loss": 3.1781, + "step": 11737 + }, + { + "epoch": 0.58, + "grad_norm": 0.5066656470298767, + "learning_rate": 0.0005471931554816797, + "loss": 3.0132, + "step": 11738 + }, + { + "epoch": 0.58, + "grad_norm": 0.53529292345047, + "learning_rate": 0.0005471844313208995, + "loss": 3.2795, + "step": 11739 + }, + { + "epoch": 0.58, + "grad_norm": 0.4876529276371002, + "learning_rate": 0.0005471757065090831, + "loss": 3.2349, + "step": 11740 + }, + { + "epoch": 0.58, + "grad_norm": 0.5300586819648743, + "learning_rate": 0.0005471669810462534, + "loss": 3.1156, + "step": 11741 + }, + { + "epoch": 0.58, + "grad_norm": 0.5383363962173462, + "learning_rate": 0.0005471582549324336, + "loss": 3.3422, + "step": 11742 + }, + { + "epoch": 0.58, + "grad_norm": 0.5305922031402588, + "learning_rate": 0.0005471495281676464, + "loss": 3.1445, + "step": 11743 + }, + { + "epoch": 0.58, + "grad_norm": 0.5225808620452881, + "learning_rate": 0.000547140800751915, + "loss": 3.3356, + "step": 11744 + }, + { + "epoch": 0.58, + "grad_norm": 0.5275900959968567, + "learning_rate": 0.0005471320726852621, + "loss": 3.3428, + "step": 11745 + }, + { + "epoch": 0.58, + "grad_norm": 0.553205132484436, + "learning_rate": 0.000547123343967711, + "loss": 3.2428, + "step": 11746 + }, + { + "epoch": 0.58, + "grad_norm": 0.5640978217124939, + "learning_rate": 0.0005471146145992847, + "loss": 3.0525, + "step": 11747 + }, + { + "epoch": 0.58, + "grad_norm": 0.5505227446556091, + "learning_rate": 0.0005471058845800059, + "loss": 3.059, + "step": 11748 + }, + { + "epoch": 0.58, + "grad_norm": 0.5284337997436523, + "learning_rate": 0.0005470971539098978, + "loss": 3.3115, + "step": 11749 + }, + { + "epoch": 0.58, + "grad_norm": 0.5046818256378174, + "learning_rate": 0.0005470884225889834, + "loss": 3.2677, + "step": 11750 + }, + { + "epoch": 0.58, + "grad_norm": 0.5719025135040283, + "learning_rate": 0.0005470796906172855, + "loss": 3.1408, + "step": 11751 + }, + { + "epoch": 0.58, + "grad_norm": 0.5198563933372498, + "learning_rate": 0.0005470709579948274, + "loss": 3.3498, + "step": 11752 + }, + { + "epoch": 0.58, + "grad_norm": 0.5236194133758545, + "learning_rate": 0.0005470622247216319, + "loss": 3.255, + "step": 11753 + }, + { + "epoch": 0.58, + "grad_norm": 0.5159012675285339, + "learning_rate": 0.000547053490797722, + "loss": 3.4876, + "step": 11754 + }, + { + "epoch": 0.58, + "grad_norm": 0.4822523891925812, + "learning_rate": 0.0005470447562231209, + "loss": 3.2904, + "step": 11755 + }, + { + "epoch": 0.58, + "grad_norm": 0.5354002118110657, + "learning_rate": 0.0005470360209978513, + "loss": 3.123, + "step": 11756 + }, + { + "epoch": 0.58, + "grad_norm": 0.5133538246154785, + "learning_rate": 0.0005470272851219364, + "loss": 3.412, + "step": 11757 + }, + { + "epoch": 0.58, + "grad_norm": 0.5027098655700684, + "learning_rate": 0.0005470185485953992, + "loss": 3.2522, + "step": 11758 + }, + { + "epoch": 0.58, + "grad_norm": 0.5310078859329224, + "learning_rate": 0.0005470098114182627, + "loss": 3.2146, + "step": 11759 + }, + { + "epoch": 0.58, + "grad_norm": 0.5589718818664551, + "learning_rate": 0.0005470010735905499, + "loss": 3.2976, + "step": 11760 + }, + { + "epoch": 0.58, + "grad_norm": 0.5355909466743469, + "learning_rate": 0.0005469923351122837, + "loss": 3.2431, + "step": 11761 + }, + { + "epoch": 0.58, + "grad_norm": 0.5391919016838074, + "learning_rate": 0.0005469835959834873, + "loss": 3.0428, + "step": 11762 + }, + { + "epoch": 0.58, + "grad_norm": 0.5123609304428101, + "learning_rate": 0.0005469748562041837, + "loss": 3.2564, + "step": 11763 + }, + { + "epoch": 0.58, + "grad_norm": 0.4937504827976227, + "learning_rate": 0.0005469661157743958, + "loss": 3.2483, + "step": 11764 + }, + { + "epoch": 0.58, + "grad_norm": 0.5278680920600891, + "learning_rate": 0.0005469573746941467, + "loss": 3.2399, + "step": 11765 + }, + { + "epoch": 0.58, + "grad_norm": 0.5318676233291626, + "learning_rate": 0.0005469486329634593, + "loss": 3.1255, + "step": 11766 + }, + { + "epoch": 0.58, + "grad_norm": 0.5315147638320923, + "learning_rate": 0.0005469398905823569, + "loss": 3.3234, + "step": 11767 + }, + { + "epoch": 0.58, + "grad_norm": 0.5593608617782593, + "learning_rate": 0.0005469311475508622, + "loss": 3.253, + "step": 11768 + }, + { + "epoch": 0.58, + "grad_norm": 0.5003385543823242, + "learning_rate": 0.0005469224038689985, + "loss": 2.979, + "step": 11769 + }, + { + "epoch": 0.58, + "grad_norm": 0.530859649181366, + "learning_rate": 0.0005469136595367887, + "loss": 3.1236, + "step": 11770 + }, + { + "epoch": 0.58, + "grad_norm": 0.5268099308013916, + "learning_rate": 0.0005469049145542558, + "loss": 3.4322, + "step": 11771 + }, + { + "epoch": 0.58, + "grad_norm": 0.5292220115661621, + "learning_rate": 0.0005468961689214228, + "loss": 3.2846, + "step": 11772 + }, + { + "epoch": 0.58, + "grad_norm": 0.5274138450622559, + "learning_rate": 0.0005468874226383128, + "loss": 3.1513, + "step": 11773 + }, + { + "epoch": 0.58, + "grad_norm": 0.5348718762397766, + "learning_rate": 0.000546878675704949, + "loss": 3.2605, + "step": 11774 + }, + { + "epoch": 0.58, + "grad_norm": 0.5165255665779114, + "learning_rate": 0.0005468699281213541, + "loss": 3.1522, + "step": 11775 + }, + { + "epoch": 0.58, + "grad_norm": 0.5076016783714294, + "learning_rate": 0.0005468611798875515, + "loss": 3.2869, + "step": 11776 + }, + { + "epoch": 0.58, + "grad_norm": 0.5093604922294617, + "learning_rate": 0.0005468524310035639, + "loss": 3.0047, + "step": 11777 + }, + { + "epoch": 0.58, + "grad_norm": 0.5292362570762634, + "learning_rate": 0.0005468436814694146, + "loss": 3.1524, + "step": 11778 + }, + { + "epoch": 0.58, + "grad_norm": 0.5151748657226562, + "learning_rate": 0.0005468349312851265, + "loss": 3.1051, + "step": 11779 + }, + { + "epoch": 0.58, + "grad_norm": 0.5270185470581055, + "learning_rate": 0.0005468261804507228, + "loss": 3.2477, + "step": 11780 + }, + { + "epoch": 0.58, + "grad_norm": 0.5115005970001221, + "learning_rate": 0.0005468174289662264, + "loss": 3.265, + "step": 11781 + }, + { + "epoch": 0.58, + "grad_norm": 0.5105668902397156, + "learning_rate": 0.0005468086768316604, + "loss": 3.189, + "step": 11782 + }, + { + "epoch": 0.58, + "grad_norm": 0.4958377778530121, + "learning_rate": 0.0005467999240470478, + "loss": 3.2936, + "step": 11783 + }, + { + "epoch": 0.58, + "grad_norm": 0.5410537719726562, + "learning_rate": 0.0005467911706124117, + "loss": 3.2262, + "step": 11784 + }, + { + "epoch": 0.58, + "grad_norm": 0.5103792548179626, + "learning_rate": 0.0005467824165277752, + "loss": 3.2767, + "step": 11785 + }, + { + "epoch": 0.58, + "grad_norm": 0.623610258102417, + "learning_rate": 0.0005467736617931613, + "loss": 3.2318, + "step": 11786 + }, + { + "epoch": 0.58, + "grad_norm": 0.5323020219802856, + "learning_rate": 0.000546764906408593, + "loss": 3.1768, + "step": 11787 + }, + { + "epoch": 0.58, + "grad_norm": 0.5189688205718994, + "learning_rate": 0.0005467561503740934, + "loss": 3.1885, + "step": 11788 + }, + { + "epoch": 0.58, + "grad_norm": 0.5025799870491028, + "learning_rate": 0.0005467473936896857, + "loss": 3.3244, + "step": 11789 + }, + { + "epoch": 0.58, + "grad_norm": 0.5281747579574585, + "learning_rate": 0.0005467386363553927, + "loss": 3.3322, + "step": 11790 + }, + { + "epoch": 0.58, + "grad_norm": 0.5055098533630371, + "learning_rate": 0.0005467298783712378, + "loss": 3.1912, + "step": 11791 + }, + { + "epoch": 0.58, + "grad_norm": 0.5109471678733826, + "learning_rate": 0.0005467211197372438, + "loss": 3.3548, + "step": 11792 + }, + { + "epoch": 0.58, + "grad_norm": 0.49919557571411133, + "learning_rate": 0.0005467123604534338, + "loss": 3.2776, + "step": 11793 + }, + { + "epoch": 0.58, + "grad_norm": 0.5183613300323486, + "learning_rate": 0.0005467036005198311, + "loss": 3.3152, + "step": 11794 + }, + { + "epoch": 0.58, + "grad_norm": 0.49261006712913513, + "learning_rate": 0.0005466948399364584, + "loss": 3.0799, + "step": 11795 + }, + { + "epoch": 0.58, + "grad_norm": 0.5078513622283936, + "learning_rate": 0.0005466860787033391, + "loss": 3.2225, + "step": 11796 + }, + { + "epoch": 0.58, + "grad_norm": 0.5038175582885742, + "learning_rate": 0.000546677316820496, + "loss": 3.3051, + "step": 11797 + }, + { + "epoch": 0.58, + "grad_norm": 0.5619712471961975, + "learning_rate": 0.0005466685542879525, + "loss": 3.2478, + "step": 11798 + }, + { + "epoch": 0.58, + "grad_norm": 0.4969761371612549, + "learning_rate": 0.0005466597911057314, + "loss": 3.2508, + "step": 11799 + }, + { + "epoch": 0.58, + "grad_norm": 0.5520473122596741, + "learning_rate": 0.0005466510272738559, + "loss": 3.3678, + "step": 11800 + }, + { + "epoch": 0.58, + "grad_norm": 0.5152010917663574, + "learning_rate": 0.000546642262792349, + "loss": 3.2858, + "step": 11801 + }, + { + "epoch": 0.58, + "grad_norm": 0.5011783242225647, + "learning_rate": 0.000546633497661234, + "loss": 3.2719, + "step": 11802 + }, + { + "epoch": 0.58, + "grad_norm": 0.5367514491081238, + "learning_rate": 0.0005466247318805336, + "loss": 3.3372, + "step": 11803 + }, + { + "epoch": 0.58, + "grad_norm": 0.5518689155578613, + "learning_rate": 0.0005466159654502713, + "loss": 3.2038, + "step": 11804 + }, + { + "epoch": 0.58, + "grad_norm": 0.5553038716316223, + "learning_rate": 0.00054660719837047, + "loss": 3.082, + "step": 11805 + }, + { + "epoch": 0.58, + "grad_norm": 0.5008841753005981, + "learning_rate": 0.0005465984306411528, + "loss": 3.0853, + "step": 11806 + }, + { + "epoch": 0.58, + "grad_norm": 0.48236528038978577, + "learning_rate": 0.0005465896622623427, + "loss": 3.2213, + "step": 11807 + }, + { + "epoch": 0.58, + "grad_norm": 0.5906662940979004, + "learning_rate": 0.000546580893234063, + "loss": 3.1436, + "step": 11808 + }, + { + "epoch": 0.58, + "grad_norm": 0.5112290382385254, + "learning_rate": 0.0005465721235563365, + "loss": 3.2082, + "step": 11809 + }, + { + "epoch": 0.58, + "grad_norm": 0.5187637805938721, + "learning_rate": 0.0005465633532291867, + "loss": 3.2795, + "step": 11810 + }, + { + "epoch": 0.58, + "grad_norm": 0.5141165852546692, + "learning_rate": 0.0005465545822526364, + "loss": 3.191, + "step": 11811 + }, + { + "epoch": 0.58, + "grad_norm": 0.6304354071617126, + "learning_rate": 0.0005465458106267087, + "loss": 3.3437, + "step": 11812 + }, + { + "epoch": 0.58, + "grad_norm": 0.5287776589393616, + "learning_rate": 0.0005465370383514269, + "loss": 3.1787, + "step": 11813 + }, + { + "epoch": 0.58, + "grad_norm": 0.5059935450553894, + "learning_rate": 0.000546528265426814, + "loss": 3.3609, + "step": 11814 + }, + { + "epoch": 0.58, + "grad_norm": 0.5362207293510437, + "learning_rate": 0.000546519491852893, + "loss": 3.1434, + "step": 11815 + }, + { + "epoch": 0.58, + "grad_norm": 0.5879662036895752, + "learning_rate": 0.0005465107176296872, + "loss": 3.4093, + "step": 11816 + }, + { + "epoch": 0.58, + "grad_norm": 0.5185505747795105, + "learning_rate": 0.0005465019427572195, + "loss": 3.2697, + "step": 11817 + }, + { + "epoch": 0.58, + "grad_norm": 0.5067775845527649, + "learning_rate": 0.0005464931672355131, + "loss": 3.4235, + "step": 11818 + }, + { + "epoch": 0.58, + "grad_norm": 0.5255836248397827, + "learning_rate": 0.0005464843910645913, + "loss": 3.3745, + "step": 11819 + }, + { + "epoch": 0.58, + "grad_norm": 0.6073008179664612, + "learning_rate": 0.0005464756142444769, + "loss": 3.182, + "step": 11820 + }, + { + "epoch": 0.58, + "grad_norm": 0.48989614844322205, + "learning_rate": 0.0005464668367751933, + "loss": 3.4873, + "step": 11821 + }, + { + "epoch": 0.58, + "grad_norm": 0.6135048270225525, + "learning_rate": 0.0005464580586567634, + "loss": 3.3022, + "step": 11822 + }, + { + "epoch": 0.58, + "grad_norm": 0.5023560523986816, + "learning_rate": 0.0005464492798892104, + "loss": 3.2013, + "step": 11823 + }, + { + "epoch": 0.58, + "grad_norm": 0.5405312180519104, + "learning_rate": 0.0005464405004725574, + "loss": 3.3605, + "step": 11824 + }, + { + "epoch": 0.58, + "grad_norm": 0.5376933217048645, + "learning_rate": 0.0005464317204068276, + "loss": 3.1126, + "step": 11825 + }, + { + "epoch": 0.58, + "grad_norm": 0.5398737788200378, + "learning_rate": 0.0005464229396920441, + "loss": 3.1471, + "step": 11826 + }, + { + "epoch": 0.58, + "grad_norm": 0.5017811059951782, + "learning_rate": 0.0005464141583282299, + "loss": 3.3562, + "step": 11827 + }, + { + "epoch": 0.58, + "grad_norm": 0.49591362476348877, + "learning_rate": 0.0005464053763154083, + "loss": 3.1727, + "step": 11828 + }, + { + "epoch": 0.58, + "grad_norm": 0.5274151563644409, + "learning_rate": 0.0005463965936536024, + "loss": 3.0793, + "step": 11829 + }, + { + "epoch": 0.58, + "grad_norm": 0.4952261745929718, + "learning_rate": 0.0005463878103428353, + "loss": 3.4623, + "step": 11830 + }, + { + "epoch": 0.58, + "grad_norm": 0.5281954407691956, + "learning_rate": 0.0005463790263831301, + "loss": 3.3092, + "step": 11831 + }, + { + "epoch": 0.58, + "grad_norm": 0.5254419445991516, + "learning_rate": 0.0005463702417745099, + "loss": 3.333, + "step": 11832 + }, + { + "epoch": 0.58, + "grad_norm": 0.5291889309883118, + "learning_rate": 0.0005463614565169979, + "loss": 3.2975, + "step": 11833 + }, + { + "epoch": 0.58, + "grad_norm": 0.5549447536468506, + "learning_rate": 0.0005463526706106173, + "loss": 3.3199, + "step": 11834 + }, + { + "epoch": 0.58, + "grad_norm": 0.5520808696746826, + "learning_rate": 0.0005463438840553912, + "loss": 3.1179, + "step": 11835 + }, + { + "epoch": 0.58, + "grad_norm": 0.5752543807029724, + "learning_rate": 0.0005463350968513426, + "loss": 3.4448, + "step": 11836 + }, + { + "epoch": 0.58, + "grad_norm": 0.5210770964622498, + "learning_rate": 0.0005463263089984948, + "loss": 3.3781, + "step": 11837 + }, + { + "epoch": 0.58, + "grad_norm": 0.5394595265388489, + "learning_rate": 0.000546317520496871, + "loss": 3.0691, + "step": 11838 + }, + { + "epoch": 0.58, + "grad_norm": 0.5369167327880859, + "learning_rate": 0.0005463087313464942, + "loss": 3.299, + "step": 11839 + }, + { + "epoch": 0.58, + "grad_norm": 0.5101680755615234, + "learning_rate": 0.0005462999415473877, + "loss": 3.1371, + "step": 11840 + }, + { + "epoch": 0.58, + "grad_norm": 0.4878643751144409, + "learning_rate": 0.0005462911510995744, + "loss": 3.277, + "step": 11841 + }, + { + "epoch": 0.58, + "grad_norm": 0.513389527797699, + "learning_rate": 0.0005462823600030776, + "loss": 3.2535, + "step": 11842 + }, + { + "epoch": 0.58, + "grad_norm": 0.5329154133796692, + "learning_rate": 0.0005462735682579205, + "loss": 3.4121, + "step": 11843 + }, + { + "epoch": 0.58, + "grad_norm": 0.4917752146720886, + "learning_rate": 0.0005462647758641263, + "loss": 3.3547, + "step": 11844 + }, + { + "epoch": 0.58, + "grad_norm": 0.49476316571235657, + "learning_rate": 0.000546255982821718, + "loss": 3.0001, + "step": 11845 + }, + { + "epoch": 0.58, + "grad_norm": 0.4978811740875244, + "learning_rate": 0.0005462471891307189, + "loss": 3.2179, + "step": 11846 + }, + { + "epoch": 0.58, + "grad_norm": 0.5276978611946106, + "learning_rate": 0.000546238394791152, + "loss": 3.4435, + "step": 11847 + }, + { + "epoch": 0.58, + "grad_norm": 0.5227974653244019, + "learning_rate": 0.0005462295998030406, + "loss": 3.4436, + "step": 11848 + }, + { + "epoch": 0.58, + "grad_norm": 0.5526250004768372, + "learning_rate": 0.0005462208041664079, + "loss": 3.4602, + "step": 11849 + }, + { + "epoch": 0.58, + "grad_norm": 0.5186265110969543, + "learning_rate": 0.0005462120078812769, + "loss": 3.1297, + "step": 11850 + }, + { + "epoch": 0.58, + "grad_norm": 0.5635437965393066, + "learning_rate": 0.0005462032109476709, + "loss": 2.9613, + "step": 11851 + }, + { + "epoch": 0.58, + "grad_norm": 0.5280086398124695, + "learning_rate": 0.000546194413365613, + "loss": 3.1061, + "step": 11852 + }, + { + "epoch": 0.58, + "grad_norm": 0.5283723473548889, + "learning_rate": 0.0005461856151351264, + "loss": 3.1436, + "step": 11853 + }, + { + "epoch": 0.58, + "grad_norm": 0.5015434622764587, + "learning_rate": 0.0005461768162562342, + "loss": 3.307, + "step": 11854 + }, + { + "epoch": 0.58, + "grad_norm": 0.5084378719329834, + "learning_rate": 0.0005461680167289598, + "loss": 3.2405, + "step": 11855 + }, + { + "epoch": 0.58, + "grad_norm": 0.5142874717712402, + "learning_rate": 0.0005461592165533261, + "loss": 3.2759, + "step": 11856 + }, + { + "epoch": 0.58, + "grad_norm": 0.5148090720176697, + "learning_rate": 0.0005461504157293563, + "loss": 2.9513, + "step": 11857 + }, + { + "epoch": 0.58, + "grad_norm": 0.5030063390731812, + "learning_rate": 0.0005461416142570739, + "loss": 3.3158, + "step": 11858 + }, + { + "epoch": 0.58, + "grad_norm": 0.4925903379917145, + "learning_rate": 0.0005461328121365018, + "loss": 3.3714, + "step": 11859 + }, + { + "epoch": 0.58, + "grad_norm": 0.5264309644699097, + "learning_rate": 0.0005461240093676632, + "loss": 3.271, + "step": 11860 + }, + { + "epoch": 0.58, + "grad_norm": 0.5105083584785461, + "learning_rate": 0.0005461152059505814, + "loss": 3.2343, + "step": 11861 + }, + { + "epoch": 0.58, + "grad_norm": 0.48054060339927673, + "learning_rate": 0.0005461064018852794, + "loss": 3.1617, + "step": 11862 + }, + { + "epoch": 0.58, + "grad_norm": 0.5289759635925293, + "learning_rate": 0.0005460975971717805, + "loss": 3.2449, + "step": 11863 + }, + { + "epoch": 0.58, + "grad_norm": 0.5321834087371826, + "learning_rate": 0.000546088791810108, + "loss": 3.2072, + "step": 11864 + }, + { + "epoch": 0.58, + "grad_norm": 0.6084837913513184, + "learning_rate": 0.0005460799858002849, + "loss": 3.2506, + "step": 11865 + }, + { + "epoch": 0.58, + "grad_norm": 0.5481894016265869, + "learning_rate": 0.0005460711791423344, + "loss": 3.1303, + "step": 11866 + }, + { + "epoch": 0.58, + "grad_norm": 0.5393480658531189, + "learning_rate": 0.0005460623718362799, + "loss": 3.2954, + "step": 11867 + }, + { + "epoch": 0.58, + "grad_norm": 0.5366723537445068, + "learning_rate": 0.0005460535638821444, + "loss": 3.1237, + "step": 11868 + }, + { + "epoch": 0.58, + "grad_norm": 0.505320131778717, + "learning_rate": 0.0005460447552799512, + "loss": 3.2153, + "step": 11869 + }, + { + "epoch": 0.58, + "grad_norm": 0.5377161502838135, + "learning_rate": 0.0005460359460297235, + "loss": 3.1654, + "step": 11870 + }, + { + "epoch": 0.58, + "grad_norm": 0.49337872862815857, + "learning_rate": 0.0005460271361314845, + "loss": 3.2592, + "step": 11871 + }, + { + "epoch": 0.58, + "grad_norm": 0.5618535280227661, + "learning_rate": 0.0005460183255852573, + "loss": 3.2642, + "step": 11872 + }, + { + "epoch": 0.58, + "grad_norm": 0.4871905446052551, + "learning_rate": 0.0005460095143910652, + "loss": 3.3339, + "step": 11873 + }, + { + "epoch": 0.58, + "grad_norm": 0.5062762498855591, + "learning_rate": 0.0005460007025489313, + "loss": 3.3657, + "step": 11874 + }, + { + "epoch": 0.58, + "grad_norm": 0.530390739440918, + "learning_rate": 0.0005459918900588789, + "loss": 3.303, + "step": 11875 + }, + { + "epoch": 0.58, + "grad_norm": 0.5175230503082275, + "learning_rate": 0.0005459830769209314, + "loss": 3.0723, + "step": 11876 + }, + { + "epoch": 0.58, + "grad_norm": 0.5422409176826477, + "learning_rate": 0.0005459742631351118, + "loss": 3.2174, + "step": 11877 + }, + { + "epoch": 0.58, + "grad_norm": 0.5116949081420898, + "learning_rate": 0.0005459654487014431, + "loss": 3.0453, + "step": 11878 + }, + { + "epoch": 0.58, + "grad_norm": 0.49938473105430603, + "learning_rate": 0.0005459566336199488, + "loss": 3.021, + "step": 11879 + }, + { + "epoch": 0.58, + "grad_norm": 0.5277251601219177, + "learning_rate": 0.0005459478178906522, + "loss": 3.2052, + "step": 11880 + }, + { + "epoch": 0.58, + "grad_norm": 0.5035902261734009, + "learning_rate": 0.0005459390015135762, + "loss": 3.2446, + "step": 11881 + }, + { + "epoch": 0.58, + "grad_norm": 0.5442661643028259, + "learning_rate": 0.0005459301844887444, + "loss": 3.1998, + "step": 11882 + }, + { + "epoch": 0.58, + "grad_norm": 0.5180203914642334, + "learning_rate": 0.0005459213668161797, + "loss": 3.4836, + "step": 11883 + }, + { + "epoch": 0.58, + "grad_norm": 0.5293683409690857, + "learning_rate": 0.0005459125484959054, + "loss": 3.3354, + "step": 11884 + }, + { + "epoch": 0.58, + "grad_norm": 0.4947066307067871, + "learning_rate": 0.0005459037295279449, + "loss": 2.9769, + "step": 11885 + }, + { + "epoch": 0.58, + "grad_norm": 0.5257008671760559, + "learning_rate": 0.0005458949099123214, + "loss": 3.3676, + "step": 11886 + }, + { + "epoch": 0.58, + "grad_norm": 0.4895974397659302, + "learning_rate": 0.0005458860896490577, + "loss": 3.1573, + "step": 11887 + }, + { + "epoch": 0.58, + "grad_norm": 0.56827712059021, + "learning_rate": 0.0005458772687381776, + "loss": 3.0959, + "step": 11888 + }, + { + "epoch": 0.58, + "grad_norm": 0.5436473488807678, + "learning_rate": 0.000545868447179704, + "loss": 3.3533, + "step": 11889 + }, + { + "epoch": 0.58, + "grad_norm": 0.5245484113693237, + "learning_rate": 0.0005458596249736604, + "loss": 3.3021, + "step": 11890 + }, + { + "epoch": 0.58, + "grad_norm": 0.555914044380188, + "learning_rate": 0.0005458508021200697, + "loss": 3.2511, + "step": 11891 + }, + { + "epoch": 0.58, + "grad_norm": 0.5292685031890869, + "learning_rate": 0.0005458419786189552, + "loss": 3.3015, + "step": 11892 + }, + { + "epoch": 0.58, + "grad_norm": 0.5257068276405334, + "learning_rate": 0.0005458331544703405, + "loss": 3.1262, + "step": 11893 + }, + { + "epoch": 0.58, + "grad_norm": 0.5110155940055847, + "learning_rate": 0.0005458243296742485, + "loss": 3.2781, + "step": 11894 + }, + { + "epoch": 0.58, + "grad_norm": 0.5501812100410461, + "learning_rate": 0.0005458155042307024, + "loss": 3.0971, + "step": 11895 + }, + { + "epoch": 0.58, + "grad_norm": 0.5484451651573181, + "learning_rate": 0.0005458066781397257, + "loss": 3.0403, + "step": 11896 + }, + { + "epoch": 0.58, + "grad_norm": 0.5546119213104248, + "learning_rate": 0.0005457978514013415, + "loss": 3.3563, + "step": 11897 + }, + { + "epoch": 0.58, + "grad_norm": 0.5238593220710754, + "learning_rate": 0.000545789024015573, + "loss": 3.3929, + "step": 11898 + }, + { + "epoch": 0.58, + "grad_norm": 0.5623632669448853, + "learning_rate": 0.0005457801959824435, + "loss": 3.2798, + "step": 11899 + }, + { + "epoch": 0.58, + "grad_norm": 0.5106022357940674, + "learning_rate": 0.0005457713673019764, + "loss": 3.1605, + "step": 11900 + }, + { + "epoch": 0.58, + "grad_norm": 0.5641065835952759, + "learning_rate": 0.0005457625379741947, + "loss": 3.364, + "step": 11901 + }, + { + "epoch": 0.58, + "grad_norm": 0.5487107634544373, + "learning_rate": 0.0005457537079991218, + "loss": 3.1397, + "step": 11902 + }, + { + "epoch": 0.58, + "grad_norm": 0.5087454915046692, + "learning_rate": 0.000545744877376781, + "loss": 3.1786, + "step": 11903 + }, + { + "epoch": 0.58, + "grad_norm": 0.5044289827346802, + "learning_rate": 0.0005457360461071953, + "loss": 3.2765, + "step": 11904 + }, + { + "epoch": 0.58, + "grad_norm": 0.5639960169792175, + "learning_rate": 0.0005457272141903884, + "loss": 3.3457, + "step": 11905 + }, + { + "epoch": 0.58, + "grad_norm": 0.5312978029251099, + "learning_rate": 0.0005457183816263831, + "loss": 3.1708, + "step": 11906 + }, + { + "epoch": 0.58, + "grad_norm": 0.562142550945282, + "learning_rate": 0.000545709548415203, + "loss": 3.4045, + "step": 11907 + }, + { + "epoch": 0.58, + "grad_norm": 0.5207469463348389, + "learning_rate": 0.0005457007145568712, + "loss": 3.2396, + "step": 11908 + }, + { + "epoch": 0.58, + "grad_norm": 0.513844907283783, + "learning_rate": 0.0005456918800514109, + "loss": 3.2508, + "step": 11909 + }, + { + "epoch": 0.58, + "grad_norm": 0.5175647735595703, + "learning_rate": 0.0005456830448988456, + "loss": 3.0191, + "step": 11910 + }, + { + "epoch": 0.58, + "grad_norm": 0.5360447764396667, + "learning_rate": 0.0005456742090991984, + "loss": 3.2997, + "step": 11911 + }, + { + "epoch": 0.58, + "grad_norm": 0.5037137866020203, + "learning_rate": 0.0005456653726524926, + "loss": 3.2219, + "step": 11912 + }, + { + "epoch": 0.58, + "grad_norm": 0.5330907106399536, + "learning_rate": 0.0005456565355587515, + "loss": 3.4176, + "step": 11913 + }, + { + "epoch": 0.58, + "grad_norm": 0.511675238609314, + "learning_rate": 0.0005456476978179984, + "loss": 3.3674, + "step": 11914 + }, + { + "epoch": 0.58, + "grad_norm": 0.5179082751274109, + "learning_rate": 0.0005456388594302565, + "loss": 3.3472, + "step": 11915 + }, + { + "epoch": 0.58, + "grad_norm": 0.5486281514167786, + "learning_rate": 0.0005456300203955491, + "loss": 3.4349, + "step": 11916 + }, + { + "epoch": 0.58, + "grad_norm": 0.516790509223938, + "learning_rate": 0.0005456211807138996, + "loss": 2.9798, + "step": 11917 + }, + { + "epoch": 0.58, + "grad_norm": 0.5260696411132812, + "learning_rate": 0.0005456123403853311, + "loss": 3.0199, + "step": 11918 + }, + { + "epoch": 0.58, + "grad_norm": 0.4837647080421448, + "learning_rate": 0.0005456034994098671, + "loss": 3.19, + "step": 11919 + }, + { + "epoch": 0.58, + "grad_norm": 0.5303565263748169, + "learning_rate": 0.0005455946577875304, + "loss": 3.3105, + "step": 11920 + }, + { + "epoch": 0.58, + "grad_norm": 0.4975842535495758, + "learning_rate": 0.0005455858155183449, + "loss": 3.4884, + "step": 11921 + }, + { + "epoch": 0.58, + "grad_norm": 0.5344429612159729, + "learning_rate": 0.0005455769726023336, + "loss": 3.1137, + "step": 11922 + }, + { + "epoch": 0.58, + "grad_norm": 0.5470472574234009, + "learning_rate": 0.0005455681290395199, + "loss": 3.0215, + "step": 11923 + }, + { + "epoch": 0.58, + "grad_norm": 0.49586766958236694, + "learning_rate": 0.0005455592848299269, + "loss": 3.1258, + "step": 11924 + }, + { + "epoch": 0.58, + "grad_norm": 0.49502456188201904, + "learning_rate": 0.000545550439973578, + "loss": 3.188, + "step": 11925 + }, + { + "epoch": 0.58, + "grad_norm": 0.5571303963661194, + "learning_rate": 0.0005455415944704966, + "loss": 3.2725, + "step": 11926 + }, + { + "epoch": 0.58, + "grad_norm": 0.4836958944797516, + "learning_rate": 0.0005455327483207057, + "loss": 3.2533, + "step": 11927 + }, + { + "epoch": 0.58, + "grad_norm": 0.49969571828842163, + "learning_rate": 0.0005455239015242289, + "loss": 3.3009, + "step": 11928 + }, + { + "epoch": 0.58, + "grad_norm": 0.5281708240509033, + "learning_rate": 0.0005455150540810894, + "loss": 3.2464, + "step": 11929 + }, + { + "epoch": 0.58, + "grad_norm": 0.5075505375862122, + "learning_rate": 0.0005455062059913106, + "loss": 3.2988, + "step": 11930 + }, + { + "epoch": 0.58, + "grad_norm": 0.5302159190177917, + "learning_rate": 0.0005454973572549156, + "loss": 3.2058, + "step": 11931 + }, + { + "epoch": 0.58, + "grad_norm": 0.5521146059036255, + "learning_rate": 0.0005454885078719277, + "loss": 3.3473, + "step": 11932 + }, + { + "epoch": 0.58, + "grad_norm": 0.5224472880363464, + "learning_rate": 0.0005454796578423705, + "loss": 3.1948, + "step": 11933 + }, + { + "epoch": 0.58, + "grad_norm": 0.5724395513534546, + "learning_rate": 0.000545470807166267, + "loss": 3.4027, + "step": 11934 + }, + { + "epoch": 0.58, + "grad_norm": 0.5843696594238281, + "learning_rate": 0.0005454619558436407, + "loss": 3.277, + "step": 11935 + }, + { + "epoch": 0.58, + "grad_norm": 0.5131762027740479, + "learning_rate": 0.0005454531038745148, + "loss": 3.3245, + "step": 11936 + }, + { + "epoch": 0.59, + "grad_norm": 0.5358636975288391, + "learning_rate": 0.0005454442512589127, + "loss": 3.1341, + "step": 11937 + }, + { + "epoch": 0.59, + "grad_norm": 0.5540893077850342, + "learning_rate": 0.0005454353979968576, + "loss": 3.2948, + "step": 11938 + }, + { + "epoch": 0.59, + "grad_norm": 0.48805323243141174, + "learning_rate": 0.000545426544088373, + "loss": 3.2973, + "step": 11939 + }, + { + "epoch": 0.59, + "grad_norm": 0.4978611171245575, + "learning_rate": 0.0005454176895334822, + "loss": 3.3075, + "step": 11940 + }, + { + "epoch": 0.59, + "grad_norm": 0.49105244874954224, + "learning_rate": 0.0005454088343322083, + "loss": 3.202, + "step": 11941 + }, + { + "epoch": 0.59, + "grad_norm": 0.4822368621826172, + "learning_rate": 0.0005453999784845747, + "loss": 3.2363, + "step": 11942 + }, + { + "epoch": 0.59, + "grad_norm": 0.5346906185150146, + "learning_rate": 0.0005453911219906049, + "loss": 3.2, + "step": 11943 + }, + { + "epoch": 0.59, + "grad_norm": 0.5279293060302734, + "learning_rate": 0.0005453822648503221, + "loss": 3.1051, + "step": 11944 + }, + { + "epoch": 0.59, + "grad_norm": 0.5374941825866699, + "learning_rate": 0.0005453734070637496, + "loss": 3.2868, + "step": 11945 + }, + { + "epoch": 0.59, + "grad_norm": 0.5096423625946045, + "learning_rate": 0.0005453645486309109, + "loss": 3.2491, + "step": 11946 + }, + { + "epoch": 0.59, + "grad_norm": 0.5027688145637512, + "learning_rate": 0.000545355689551829, + "loss": 3.252, + "step": 11947 + }, + { + "epoch": 0.59, + "grad_norm": 0.5206319689750671, + "learning_rate": 0.0005453468298265275, + "loss": 3.3398, + "step": 11948 + }, + { + "epoch": 0.59, + "grad_norm": 0.5208166241645813, + "learning_rate": 0.0005453379694550297, + "loss": 3.3007, + "step": 11949 + }, + { + "epoch": 0.59, + "grad_norm": 0.5166978240013123, + "learning_rate": 0.0005453291084373589, + "loss": 3.2176, + "step": 11950 + }, + { + "epoch": 0.59, + "grad_norm": 0.5170909762382507, + "learning_rate": 0.0005453202467735384, + "loss": 3.1967, + "step": 11951 + }, + { + "epoch": 0.59, + "grad_norm": 0.5434339642524719, + "learning_rate": 0.0005453113844635916, + "loss": 3.2238, + "step": 11952 + }, + { + "epoch": 0.59, + "grad_norm": 0.4791112244129181, + "learning_rate": 0.0005453025215075419, + "loss": 3.1606, + "step": 11953 + }, + { + "epoch": 0.59, + "grad_norm": 0.49351251125335693, + "learning_rate": 0.0005452936579054125, + "loss": 3.0225, + "step": 11954 + }, + { + "epoch": 0.59, + "grad_norm": 0.5247668623924255, + "learning_rate": 0.0005452847936572268, + "loss": 3.0996, + "step": 11955 + }, + { + "epoch": 0.59, + "grad_norm": 0.5701345801353455, + "learning_rate": 0.0005452759287630081, + "loss": 3.3449, + "step": 11956 + }, + { + "epoch": 0.59, + "grad_norm": 0.5012340545654297, + "learning_rate": 0.00054526706322278, + "loss": 3.1187, + "step": 11957 + }, + { + "epoch": 0.59, + "grad_norm": 0.5431010723114014, + "learning_rate": 0.0005452581970365655, + "loss": 3.3408, + "step": 11958 + }, + { + "epoch": 0.59, + "grad_norm": 0.5214460492134094, + "learning_rate": 0.0005452493302043882, + "loss": 3.1243, + "step": 11959 + }, + { + "epoch": 0.59, + "grad_norm": 0.49427273869514465, + "learning_rate": 0.0005452404627262713, + "loss": 3.08, + "step": 11960 + }, + { + "epoch": 0.59, + "grad_norm": 0.5280971527099609, + "learning_rate": 0.0005452315946022383, + "loss": 3.253, + "step": 11961 + }, + { + "epoch": 0.59, + "grad_norm": 0.5085782408714294, + "learning_rate": 0.0005452227258323124, + "loss": 3.1777, + "step": 11962 + }, + { + "epoch": 0.59, + "grad_norm": 0.5629658102989197, + "learning_rate": 0.000545213856416517, + "loss": 3.2163, + "step": 11963 + }, + { + "epoch": 0.59, + "grad_norm": 0.49250122904777527, + "learning_rate": 0.0005452049863548756, + "loss": 3.1596, + "step": 11964 + }, + { + "epoch": 0.59, + "grad_norm": 0.5141192078590393, + "learning_rate": 0.0005451961156474113, + "loss": 3.182, + "step": 11965 + }, + { + "epoch": 0.59, + "grad_norm": 0.4984816908836365, + "learning_rate": 0.0005451872442941478, + "loss": 3.0893, + "step": 11966 + }, + { + "epoch": 0.59, + "grad_norm": 0.4933840334415436, + "learning_rate": 0.0005451783722951082, + "loss": 3.0771, + "step": 11967 + }, + { + "epoch": 0.59, + "grad_norm": 0.564106822013855, + "learning_rate": 0.000545169499650316, + "loss": 3.4349, + "step": 11968 + }, + { + "epoch": 0.59, + "grad_norm": 0.4997556805610657, + "learning_rate": 0.0005451606263597945, + "loss": 3.2064, + "step": 11969 + }, + { + "epoch": 0.59, + "grad_norm": 0.4757207930088043, + "learning_rate": 0.000545151752423567, + "loss": 3.1864, + "step": 11970 + }, + { + "epoch": 0.59, + "grad_norm": 0.4952181577682495, + "learning_rate": 0.0005451428778416571, + "loss": 3.0965, + "step": 11971 + }, + { + "epoch": 0.59, + "grad_norm": 0.5092923045158386, + "learning_rate": 0.0005451340026140879, + "loss": 3.2239, + "step": 11972 + }, + { + "epoch": 0.59, + "grad_norm": 0.5279220342636108, + "learning_rate": 0.000545125126740883, + "loss": 3.4003, + "step": 11973 + }, + { + "epoch": 0.59, + "grad_norm": 0.4941803216934204, + "learning_rate": 0.0005451162502220657, + "loss": 3.2384, + "step": 11974 + }, + { + "epoch": 0.59, + "grad_norm": 0.5303682684898376, + "learning_rate": 0.0005451073730576594, + "loss": 3.373, + "step": 11975 + }, + { + "epoch": 0.59, + "grad_norm": 0.548751175403595, + "learning_rate": 0.0005450984952476874, + "loss": 3.1797, + "step": 11976 + }, + { + "epoch": 0.59, + "grad_norm": 0.509413480758667, + "learning_rate": 0.0005450896167921731, + "loss": 3.1192, + "step": 11977 + }, + { + "epoch": 0.59, + "grad_norm": 0.5299882888793945, + "learning_rate": 0.00054508073769114, + "loss": 2.747, + "step": 11978 + }, + { + "epoch": 0.59, + "grad_norm": 0.5232790112495422, + "learning_rate": 0.0005450718579446113, + "loss": 3.3651, + "step": 11979 + }, + { + "epoch": 0.59, + "grad_norm": 0.5589876174926758, + "learning_rate": 0.0005450629775526106, + "loss": 3.2521, + "step": 11980 + }, + { + "epoch": 0.59, + "grad_norm": 0.4836844205856323, + "learning_rate": 0.0005450540965151612, + "loss": 3.0705, + "step": 11981 + }, + { + "epoch": 0.59, + "grad_norm": 0.5362475514411926, + "learning_rate": 0.0005450452148322864, + "loss": 3.1405, + "step": 11982 + }, + { + "epoch": 0.59, + "grad_norm": 0.5328556895256042, + "learning_rate": 0.0005450363325040096, + "loss": 3.2575, + "step": 11983 + }, + { + "epoch": 0.59, + "grad_norm": 0.5442496538162231, + "learning_rate": 0.0005450274495303544, + "loss": 3.1584, + "step": 11984 + }, + { + "epoch": 0.59, + "grad_norm": 0.5376067757606506, + "learning_rate": 0.0005450185659113439, + "loss": 3.3907, + "step": 11985 + }, + { + "epoch": 0.59, + "grad_norm": 0.5082165598869324, + "learning_rate": 0.0005450096816470017, + "loss": 3.1008, + "step": 11986 + }, + { + "epoch": 0.59, + "grad_norm": 0.4951854944229126, + "learning_rate": 0.0005450007967373512, + "loss": 3.1832, + "step": 11987 + }, + { + "epoch": 0.59, + "grad_norm": 0.5797527432441711, + "learning_rate": 0.0005449919111824157, + "loss": 3.1662, + "step": 11988 + }, + { + "epoch": 0.59, + "grad_norm": 0.5416431427001953, + "learning_rate": 0.0005449830249822187, + "loss": 3.1162, + "step": 11989 + }, + { + "epoch": 0.59, + "grad_norm": 0.5033136010169983, + "learning_rate": 0.0005449741381367836, + "loss": 3.1388, + "step": 11990 + }, + { + "epoch": 0.59, + "grad_norm": 0.5002380609512329, + "learning_rate": 0.0005449652506461337, + "loss": 3.31, + "step": 11991 + }, + { + "epoch": 0.59, + "grad_norm": 0.49132394790649414, + "learning_rate": 0.0005449563625102926, + "loss": 3.1882, + "step": 11992 + }, + { + "epoch": 0.59, + "grad_norm": 0.4861402213573456, + "learning_rate": 0.0005449474737292834, + "loss": 3.4393, + "step": 11993 + }, + { + "epoch": 0.59, + "grad_norm": 0.5278277397155762, + "learning_rate": 0.0005449385843031298, + "loss": 3.2842, + "step": 11994 + }, + { + "epoch": 0.59, + "grad_norm": 0.5361764430999756, + "learning_rate": 0.000544929694231855, + "loss": 3.2113, + "step": 11995 + }, + { + "epoch": 0.59, + "grad_norm": 0.5028777122497559, + "learning_rate": 0.0005449208035154827, + "loss": 3.294, + "step": 11996 + }, + { + "epoch": 0.59, + "grad_norm": 0.5272602438926697, + "learning_rate": 0.000544911912154036, + "loss": 3.3859, + "step": 11997 + }, + { + "epoch": 0.59, + "grad_norm": 0.5307137966156006, + "learning_rate": 0.0005449030201475385, + "loss": 3.1559, + "step": 11998 + }, + { + "epoch": 0.59, + "grad_norm": 0.510999858379364, + "learning_rate": 0.0005448941274960136, + "loss": 3.1829, + "step": 11999 + }, + { + "epoch": 0.59, + "grad_norm": 0.5812216997146606, + "learning_rate": 0.0005448852341994846, + "loss": 3.1065, + "step": 12000 + }, + { + "epoch": 0.59, + "grad_norm": 0.5590925812721252, + "learning_rate": 0.0005448763402579752, + "loss": 3.1302, + "step": 12001 + }, + { + "epoch": 0.59, + "grad_norm": 0.5319423079490662, + "learning_rate": 0.0005448674456715085, + "loss": 2.961, + "step": 12002 + }, + { + "epoch": 0.59, + "grad_norm": 0.5174445509910583, + "learning_rate": 0.0005448585504401082, + "loss": 2.9525, + "step": 12003 + }, + { + "epoch": 0.59, + "grad_norm": 0.5351834893226624, + "learning_rate": 0.0005448496545637975, + "loss": 3.1183, + "step": 12004 + }, + { + "epoch": 0.59, + "grad_norm": 0.5270566940307617, + "learning_rate": 0.0005448407580425999, + "loss": 3.2682, + "step": 12005 + }, + { + "epoch": 0.59, + "grad_norm": 0.5064947009086609, + "learning_rate": 0.000544831860876539, + "loss": 3.3034, + "step": 12006 + }, + { + "epoch": 0.59, + "grad_norm": 0.5744919180870056, + "learning_rate": 0.0005448229630656379, + "loss": 3.0479, + "step": 12007 + }, + { + "epoch": 0.59, + "grad_norm": 0.5197696089744568, + "learning_rate": 0.0005448140646099204, + "loss": 3.0092, + "step": 12008 + }, + { + "epoch": 0.59, + "grad_norm": 0.52765291929245, + "learning_rate": 0.0005448051655094098, + "loss": 3.1997, + "step": 12009 + }, + { + "epoch": 0.59, + "grad_norm": 0.5932416915893555, + "learning_rate": 0.0005447962657641294, + "loss": 3.3712, + "step": 12010 + }, + { + "epoch": 0.59, + "grad_norm": 0.5595377087593079, + "learning_rate": 0.0005447873653741027, + "loss": 3.0408, + "step": 12011 + }, + { + "epoch": 0.59, + "grad_norm": 0.5186452269554138, + "learning_rate": 0.0005447784643393533, + "loss": 3.5143, + "step": 12012 + }, + { + "epoch": 0.59, + "grad_norm": 0.5053780674934387, + "learning_rate": 0.0005447695626599045, + "loss": 3.2042, + "step": 12013 + }, + { + "epoch": 0.59, + "grad_norm": 0.5170993208885193, + "learning_rate": 0.0005447606603357798, + "loss": 3.0966, + "step": 12014 + }, + { + "epoch": 0.59, + "grad_norm": 0.5181581377983093, + "learning_rate": 0.0005447517573670026, + "loss": 3.1413, + "step": 12015 + }, + { + "epoch": 0.59, + "grad_norm": 0.48989567160606384, + "learning_rate": 0.0005447428537535963, + "loss": 3.1706, + "step": 12016 + }, + { + "epoch": 0.59, + "grad_norm": 0.5295213460922241, + "learning_rate": 0.0005447339494955845, + "loss": 3.2038, + "step": 12017 + }, + { + "epoch": 0.59, + "grad_norm": 0.5177062749862671, + "learning_rate": 0.0005447250445929906, + "loss": 3.3209, + "step": 12018 + }, + { + "epoch": 0.59, + "grad_norm": 0.5109267234802246, + "learning_rate": 0.0005447161390458381, + "loss": 3.2872, + "step": 12019 + }, + { + "epoch": 0.59, + "grad_norm": 0.5133575201034546, + "learning_rate": 0.0005447072328541502, + "loss": 3.2706, + "step": 12020 + }, + { + "epoch": 0.59, + "grad_norm": 0.5542067289352417, + "learning_rate": 0.0005446983260179506, + "loss": 3.3886, + "step": 12021 + }, + { + "epoch": 0.59, + "grad_norm": 0.537536084651947, + "learning_rate": 0.0005446894185372628, + "loss": 3.4365, + "step": 12022 + }, + { + "epoch": 0.59, + "grad_norm": 0.5550865530967712, + "learning_rate": 0.0005446805104121101, + "loss": 3.3174, + "step": 12023 + }, + { + "epoch": 0.59, + "grad_norm": 0.5242879390716553, + "learning_rate": 0.000544671601642516, + "loss": 3.1944, + "step": 12024 + }, + { + "epoch": 0.59, + "grad_norm": 0.5210665464401245, + "learning_rate": 0.000544662692228504, + "loss": 3.1458, + "step": 12025 + }, + { + "epoch": 0.59, + "grad_norm": 0.48247066140174866, + "learning_rate": 0.0005446537821700977, + "loss": 3.376, + "step": 12026 + }, + { + "epoch": 0.59, + "grad_norm": 0.5202993154525757, + "learning_rate": 0.0005446448714673202, + "loss": 3.286, + "step": 12027 + }, + { + "epoch": 0.59, + "grad_norm": 0.5529611706733704, + "learning_rate": 0.0005446359601201953, + "loss": 3.0672, + "step": 12028 + }, + { + "epoch": 0.59, + "grad_norm": 0.536654531955719, + "learning_rate": 0.0005446270481287463, + "loss": 3.1053, + "step": 12029 + }, + { + "epoch": 0.59, + "grad_norm": 0.5165926814079285, + "learning_rate": 0.0005446181354929969, + "loss": 3.3346, + "step": 12030 + }, + { + "epoch": 0.59, + "grad_norm": 0.5090608596801758, + "learning_rate": 0.0005446092222129703, + "loss": 3.1495, + "step": 12031 + }, + { + "epoch": 0.59, + "grad_norm": 0.5462985038757324, + "learning_rate": 0.00054460030828869, + "loss": 3.0608, + "step": 12032 + }, + { + "epoch": 0.59, + "grad_norm": 0.48308515548706055, + "learning_rate": 0.0005445913937201797, + "loss": 2.9185, + "step": 12033 + }, + { + "epoch": 0.59, + "grad_norm": 0.5395585894584656, + "learning_rate": 0.0005445824785074627, + "loss": 3.3225, + "step": 12034 + }, + { + "epoch": 0.59, + "grad_norm": 0.4958081543445587, + "learning_rate": 0.0005445735626505624, + "loss": 3.2987, + "step": 12035 + }, + { + "epoch": 0.59, + "grad_norm": 0.5074117183685303, + "learning_rate": 0.0005445646461495025, + "loss": 3.2805, + "step": 12036 + }, + { + "epoch": 0.59, + "grad_norm": 0.5241624712944031, + "learning_rate": 0.0005445557290043064, + "loss": 3.3391, + "step": 12037 + }, + { + "epoch": 0.59, + "grad_norm": 0.520176112651825, + "learning_rate": 0.0005445468112149977, + "loss": 3.4373, + "step": 12038 + }, + { + "epoch": 0.59, + "grad_norm": 0.5940264463424683, + "learning_rate": 0.0005445378927815995, + "loss": 3.0772, + "step": 12039 + }, + { + "epoch": 0.59, + "grad_norm": 0.5212997794151306, + "learning_rate": 0.0005445289737041357, + "loss": 3.279, + "step": 12040 + }, + { + "epoch": 0.59, + "grad_norm": 0.5169647336006165, + "learning_rate": 0.0005445200539826297, + "loss": 3.2505, + "step": 12041 + }, + { + "epoch": 0.59, + "grad_norm": 0.5246007442474365, + "learning_rate": 0.0005445111336171048, + "loss": 3.4347, + "step": 12042 + }, + { + "epoch": 0.59, + "grad_norm": 0.524836003780365, + "learning_rate": 0.0005445022126075847, + "loss": 3.0207, + "step": 12043 + }, + { + "epoch": 0.59, + "grad_norm": 0.490573525428772, + "learning_rate": 0.0005444932909540928, + "loss": 3.2743, + "step": 12044 + }, + { + "epoch": 0.59, + "grad_norm": 0.5303407907485962, + "learning_rate": 0.0005444843686566528, + "loss": 3.2337, + "step": 12045 + }, + { + "epoch": 0.59, + "grad_norm": 0.5297519564628601, + "learning_rate": 0.0005444754457152878, + "loss": 3.3453, + "step": 12046 + }, + { + "epoch": 0.59, + "grad_norm": 0.6097574830055237, + "learning_rate": 0.0005444665221300216, + "loss": 3.0906, + "step": 12047 + }, + { + "epoch": 0.59, + "grad_norm": 0.5111755132675171, + "learning_rate": 0.0005444575979008777, + "loss": 3.4244, + "step": 12048 + }, + { + "epoch": 0.59, + "grad_norm": 0.5661916136741638, + "learning_rate": 0.0005444486730278794, + "loss": 3.2577, + "step": 12049 + }, + { + "epoch": 0.59, + "grad_norm": 0.5134075880050659, + "learning_rate": 0.0005444397475110504, + "loss": 3.1203, + "step": 12050 + }, + { + "epoch": 0.59, + "grad_norm": 0.5311046242713928, + "learning_rate": 0.0005444308213504143, + "loss": 3.376, + "step": 12051 + }, + { + "epoch": 0.59, + "grad_norm": 0.5159074068069458, + "learning_rate": 0.0005444218945459943, + "loss": 3.2704, + "step": 12052 + }, + { + "epoch": 0.59, + "grad_norm": 0.5199549794197083, + "learning_rate": 0.000544412967097814, + "loss": 3.3863, + "step": 12053 + }, + { + "epoch": 0.59, + "grad_norm": 0.5141023397445679, + "learning_rate": 0.000544404039005897, + "loss": 3.1942, + "step": 12054 + }, + { + "epoch": 0.59, + "grad_norm": 0.5393960475921631, + "learning_rate": 0.000544395110270267, + "loss": 3.2088, + "step": 12055 + }, + { + "epoch": 0.59, + "grad_norm": 0.5226369500160217, + "learning_rate": 0.0005443861808909472, + "loss": 3.2853, + "step": 12056 + }, + { + "epoch": 0.59, + "grad_norm": 0.5066924691200256, + "learning_rate": 0.0005443772508679611, + "loss": 3.0143, + "step": 12057 + }, + { + "epoch": 0.59, + "grad_norm": 0.5037077069282532, + "learning_rate": 0.0005443683202013325, + "loss": 3.4013, + "step": 12058 + }, + { + "epoch": 0.59, + "grad_norm": 0.534941554069519, + "learning_rate": 0.0005443593888910847, + "loss": 3.2482, + "step": 12059 + }, + { + "epoch": 0.59, + "grad_norm": 0.5512545108795166, + "learning_rate": 0.0005443504569372414, + "loss": 3.177, + "step": 12060 + }, + { + "epoch": 0.59, + "grad_norm": 0.5608600974082947, + "learning_rate": 0.000544341524339826, + "loss": 3.3516, + "step": 12061 + }, + { + "epoch": 0.59, + "grad_norm": 0.5026220679283142, + "learning_rate": 0.0005443325910988619, + "loss": 3.2668, + "step": 12062 + }, + { + "epoch": 0.59, + "grad_norm": 0.5782676935195923, + "learning_rate": 0.0005443236572143729, + "loss": 3.1726, + "step": 12063 + }, + { + "epoch": 0.59, + "grad_norm": 0.5567470788955688, + "learning_rate": 0.0005443147226863824, + "loss": 3.1268, + "step": 12064 + }, + { + "epoch": 0.59, + "grad_norm": 0.4905277192592621, + "learning_rate": 0.000544305787514914, + "loss": 3.0573, + "step": 12065 + }, + { + "epoch": 0.59, + "grad_norm": 0.5269580483436584, + "learning_rate": 0.000544296851699991, + "loss": 3.2479, + "step": 12066 + }, + { + "epoch": 0.59, + "grad_norm": 0.5122688412666321, + "learning_rate": 0.0005442879152416373, + "loss": 3.2095, + "step": 12067 + }, + { + "epoch": 0.59, + "grad_norm": 0.4919382631778717, + "learning_rate": 0.0005442789781398761, + "loss": 3.3365, + "step": 12068 + }, + { + "epoch": 0.59, + "grad_norm": 0.5227946639060974, + "learning_rate": 0.0005442700403947312, + "loss": 2.9657, + "step": 12069 + }, + { + "epoch": 0.59, + "grad_norm": 0.5085049867630005, + "learning_rate": 0.0005442611020062259, + "loss": 3.358, + "step": 12070 + }, + { + "epoch": 0.59, + "grad_norm": 0.5325589776039124, + "learning_rate": 0.0005442521629743839, + "loss": 3.1012, + "step": 12071 + }, + { + "epoch": 0.59, + "grad_norm": 0.5010466575622559, + "learning_rate": 0.0005442432232992289, + "loss": 3.1746, + "step": 12072 + }, + { + "epoch": 0.59, + "grad_norm": 0.5115364193916321, + "learning_rate": 0.0005442342829807841, + "loss": 3.1592, + "step": 12073 + }, + { + "epoch": 0.59, + "grad_norm": 0.4746970236301422, + "learning_rate": 0.0005442253420190732, + "loss": 3.3611, + "step": 12074 + }, + { + "epoch": 0.59, + "grad_norm": 0.5140778422355652, + "learning_rate": 0.0005442164004141196, + "loss": 3.2885, + "step": 12075 + }, + { + "epoch": 0.59, + "grad_norm": 0.480934202671051, + "learning_rate": 0.0005442074581659472, + "loss": 3.3753, + "step": 12076 + }, + { + "epoch": 0.59, + "grad_norm": 0.4761304557323456, + "learning_rate": 0.0005441985152745794, + "loss": 3.2269, + "step": 12077 + }, + { + "epoch": 0.59, + "grad_norm": 0.5257875323295593, + "learning_rate": 0.0005441895717400396, + "loss": 3.1432, + "step": 12078 + }, + { + "epoch": 0.59, + "grad_norm": 0.5360661745071411, + "learning_rate": 0.0005441806275623515, + "loss": 3.1401, + "step": 12079 + }, + { + "epoch": 0.59, + "grad_norm": 0.48718738555908203, + "learning_rate": 0.0005441716827415385, + "loss": 3.3443, + "step": 12080 + }, + { + "epoch": 0.59, + "grad_norm": 0.4796687066555023, + "learning_rate": 0.0005441627372776244, + "loss": 3.2142, + "step": 12081 + }, + { + "epoch": 0.59, + "grad_norm": 0.47105666995048523, + "learning_rate": 0.0005441537911706327, + "loss": 3.0479, + "step": 12082 + }, + { + "epoch": 0.59, + "grad_norm": 0.5267301201820374, + "learning_rate": 0.0005441448444205868, + "loss": 3.3318, + "step": 12083 + }, + { + "epoch": 0.59, + "grad_norm": 0.5082873106002808, + "learning_rate": 0.0005441358970275104, + "loss": 3.2415, + "step": 12084 + }, + { + "epoch": 0.59, + "grad_norm": 0.5294603705406189, + "learning_rate": 0.000544126948991427, + "loss": 3.1729, + "step": 12085 + }, + { + "epoch": 0.59, + "grad_norm": 0.536239504814148, + "learning_rate": 0.0005441180003123603, + "loss": 3.1646, + "step": 12086 + }, + { + "epoch": 0.59, + "grad_norm": 0.5099353194236755, + "learning_rate": 0.0005441090509903336, + "loss": 3.2599, + "step": 12087 + }, + { + "epoch": 0.59, + "grad_norm": 0.5732430219650269, + "learning_rate": 0.0005441001010253707, + "loss": 3.2231, + "step": 12088 + }, + { + "epoch": 0.59, + "grad_norm": 0.5284761786460876, + "learning_rate": 0.0005440911504174952, + "loss": 3.5178, + "step": 12089 + }, + { + "epoch": 0.59, + "grad_norm": 0.5436829924583435, + "learning_rate": 0.0005440821991667306, + "loss": 3.4769, + "step": 12090 + }, + { + "epoch": 0.59, + "grad_norm": 0.5140102505683899, + "learning_rate": 0.0005440732472731004, + "loss": 3.2641, + "step": 12091 + }, + { + "epoch": 0.59, + "grad_norm": 0.5060403347015381, + "learning_rate": 0.0005440642947366281, + "loss": 3.1533, + "step": 12092 + }, + { + "epoch": 0.59, + "grad_norm": 0.5052133798599243, + "learning_rate": 0.0005440553415573375, + "loss": 3.3999, + "step": 12093 + }, + { + "epoch": 0.59, + "grad_norm": 0.6156617999076843, + "learning_rate": 0.0005440463877352522, + "loss": 3.3402, + "step": 12094 + }, + { + "epoch": 0.59, + "grad_norm": 0.5013222098350525, + "learning_rate": 0.0005440374332703956, + "loss": 3.1037, + "step": 12095 + }, + { + "epoch": 0.59, + "grad_norm": 0.515588641166687, + "learning_rate": 0.0005440284781627915, + "loss": 3.2543, + "step": 12096 + }, + { + "epoch": 0.59, + "grad_norm": 0.49556267261505127, + "learning_rate": 0.0005440195224124633, + "loss": 3.2262, + "step": 12097 + }, + { + "epoch": 0.59, + "grad_norm": 0.528473436832428, + "learning_rate": 0.0005440105660194345, + "loss": 3.1852, + "step": 12098 + }, + { + "epoch": 0.59, + "grad_norm": 0.5471101999282837, + "learning_rate": 0.0005440016089837289, + "loss": 3.2124, + "step": 12099 + }, + { + "epoch": 0.59, + "grad_norm": 0.5259342193603516, + "learning_rate": 0.0005439926513053701, + "loss": 3.318, + "step": 12100 + }, + { + "epoch": 0.59, + "grad_norm": 0.4963459372520447, + "learning_rate": 0.0005439836929843815, + "loss": 3.2905, + "step": 12101 + }, + { + "epoch": 0.59, + "grad_norm": 0.5480161309242249, + "learning_rate": 0.0005439747340207869, + "loss": 3.1982, + "step": 12102 + }, + { + "epoch": 0.59, + "grad_norm": 0.4799657464027405, + "learning_rate": 0.0005439657744146098, + "loss": 3.4103, + "step": 12103 + }, + { + "epoch": 0.59, + "grad_norm": 0.4825492203235626, + "learning_rate": 0.0005439568141658738, + "loss": 3.2936, + "step": 12104 + }, + { + "epoch": 0.59, + "grad_norm": 0.5499624609947205, + "learning_rate": 0.0005439478532746024, + "loss": 3.1542, + "step": 12105 + }, + { + "epoch": 0.59, + "grad_norm": 0.5210323333740234, + "learning_rate": 0.0005439388917408194, + "loss": 3.364, + "step": 12106 + }, + { + "epoch": 0.59, + "grad_norm": 0.5199629664421082, + "learning_rate": 0.0005439299295645482, + "loss": 2.9365, + "step": 12107 + }, + { + "epoch": 0.59, + "grad_norm": 0.5032206773757935, + "learning_rate": 0.0005439209667458127, + "loss": 3.0098, + "step": 12108 + }, + { + "epoch": 0.59, + "grad_norm": 0.5243465304374695, + "learning_rate": 0.0005439120032846361, + "loss": 3.1939, + "step": 12109 + }, + { + "epoch": 0.59, + "grad_norm": 0.4972172975540161, + "learning_rate": 0.0005439030391810424, + "loss": 3.5505, + "step": 12110 + }, + { + "epoch": 0.59, + "grad_norm": 0.5937232971191406, + "learning_rate": 0.000543894074435055, + "loss": 3.0929, + "step": 12111 + }, + { + "epoch": 0.59, + "grad_norm": 0.5489984154701233, + "learning_rate": 0.0005438851090466975, + "loss": 3.2707, + "step": 12112 + }, + { + "epoch": 0.59, + "grad_norm": 0.5671371221542358, + "learning_rate": 0.0005438761430159936, + "loss": 3.1847, + "step": 12113 + }, + { + "epoch": 0.59, + "grad_norm": 0.5630182027816772, + "learning_rate": 0.0005438671763429668, + "loss": 3.1924, + "step": 12114 + }, + { + "epoch": 0.59, + "grad_norm": 0.5923741459846497, + "learning_rate": 0.0005438582090276408, + "loss": 3.3754, + "step": 12115 + }, + { + "epoch": 0.59, + "grad_norm": 0.5446171164512634, + "learning_rate": 0.0005438492410700394, + "loss": 3.2884, + "step": 12116 + }, + { + "epoch": 0.59, + "grad_norm": 0.5040573477745056, + "learning_rate": 0.0005438402724701857, + "loss": 3.4074, + "step": 12117 + }, + { + "epoch": 0.59, + "grad_norm": 0.5180044174194336, + "learning_rate": 0.0005438313032281039, + "loss": 3.0427, + "step": 12118 + }, + { + "epoch": 0.59, + "grad_norm": 0.5119923949241638, + "learning_rate": 0.0005438223333438173, + "loss": 3.0863, + "step": 12119 + }, + { + "epoch": 0.59, + "grad_norm": 0.5041249394416809, + "learning_rate": 0.0005438133628173496, + "loss": 3.0335, + "step": 12120 + }, + { + "epoch": 0.59, + "grad_norm": 0.5423731207847595, + "learning_rate": 0.0005438043916487245, + "loss": 3.2203, + "step": 12121 + }, + { + "epoch": 0.59, + "grad_norm": 0.5120278000831604, + "learning_rate": 0.0005437954198379654, + "loss": 3.2297, + "step": 12122 + }, + { + "epoch": 0.59, + "grad_norm": 0.5203986167907715, + "learning_rate": 0.0005437864473850962, + "loss": 3.1436, + "step": 12123 + }, + { + "epoch": 0.59, + "grad_norm": 0.5062956213951111, + "learning_rate": 0.0005437774742901403, + "loss": 3.225, + "step": 12124 + }, + { + "epoch": 0.59, + "grad_norm": 0.5282447934150696, + "learning_rate": 0.0005437685005531216, + "loss": 3.3557, + "step": 12125 + }, + { + "epoch": 0.59, + "grad_norm": 0.5864943861961365, + "learning_rate": 0.0005437595261740635, + "loss": 3.3101, + "step": 12126 + }, + { + "epoch": 0.59, + "grad_norm": 0.5011259913444519, + "learning_rate": 0.0005437505511529897, + "loss": 3.2331, + "step": 12127 + }, + { + "epoch": 0.59, + "grad_norm": 0.5472565293312073, + "learning_rate": 0.0005437415754899239, + "loss": 3.4375, + "step": 12128 + }, + { + "epoch": 0.59, + "grad_norm": 0.49552929401397705, + "learning_rate": 0.0005437325991848896, + "loss": 3.3217, + "step": 12129 + }, + { + "epoch": 0.59, + "grad_norm": 0.5005829334259033, + "learning_rate": 0.0005437236222379107, + "loss": 3.3369, + "step": 12130 + }, + { + "epoch": 0.59, + "grad_norm": 0.5018550753593445, + "learning_rate": 0.0005437146446490107, + "loss": 3.3297, + "step": 12131 + }, + { + "epoch": 0.59, + "grad_norm": 0.5203503370285034, + "learning_rate": 0.000543705666418213, + "loss": 3.2202, + "step": 12132 + }, + { + "epoch": 0.59, + "grad_norm": 0.568457841873169, + "learning_rate": 0.0005436966875455417, + "loss": 3.3885, + "step": 12133 + }, + { + "epoch": 0.59, + "grad_norm": 0.5276675224304199, + "learning_rate": 0.0005436877080310202, + "loss": 3.2712, + "step": 12134 + }, + { + "epoch": 0.59, + "grad_norm": 0.5073103308677673, + "learning_rate": 0.000543678727874672, + "loss": 3.3254, + "step": 12135 + }, + { + "epoch": 0.59, + "grad_norm": 0.48787346482276917, + "learning_rate": 0.0005436697470765212, + "loss": 3.3454, + "step": 12136 + }, + { + "epoch": 0.59, + "grad_norm": 0.5986006259918213, + "learning_rate": 0.000543660765636591, + "loss": 3.2068, + "step": 12137 + }, + { + "epoch": 0.59, + "grad_norm": 0.5651094317436218, + "learning_rate": 0.0005436517835549052, + "loss": 3.1003, + "step": 12138 + }, + { + "epoch": 0.59, + "grad_norm": 0.49831265211105347, + "learning_rate": 0.0005436428008314876, + "loss": 3.2614, + "step": 12139 + }, + { + "epoch": 0.59, + "grad_norm": 0.5459215044975281, + "learning_rate": 0.0005436338174663617, + "loss": 3.2675, + "step": 12140 + }, + { + "epoch": 0.6, + "grad_norm": 0.5546776652336121, + "learning_rate": 0.0005436248334595512, + "loss": 3.3903, + "step": 12141 + }, + { + "epoch": 0.6, + "grad_norm": 0.518644392490387, + "learning_rate": 0.0005436158488110798, + "loss": 3.2527, + "step": 12142 + }, + { + "epoch": 0.6, + "grad_norm": 0.5478095412254333, + "learning_rate": 0.0005436068635209712, + "loss": 3.0529, + "step": 12143 + }, + { + "epoch": 0.6, + "grad_norm": 0.497527152299881, + "learning_rate": 0.0005435978775892489, + "loss": 3.2598, + "step": 12144 + }, + { + "epoch": 0.6, + "grad_norm": 0.5034574270248413, + "learning_rate": 0.0005435888910159367, + "loss": 3.1994, + "step": 12145 + }, + { + "epoch": 0.6, + "grad_norm": 0.5118417739868164, + "learning_rate": 0.0005435799038010583, + "loss": 3.4717, + "step": 12146 + }, + { + "epoch": 0.6, + "grad_norm": 0.5360592007637024, + "learning_rate": 0.0005435709159446374, + "loss": 3.2304, + "step": 12147 + }, + { + "epoch": 0.6, + "grad_norm": 0.4851125478744507, + "learning_rate": 0.0005435619274466974, + "loss": 3.1554, + "step": 12148 + }, + { + "epoch": 0.6, + "grad_norm": 0.4884193539619446, + "learning_rate": 0.0005435529383072622, + "loss": 3.0128, + "step": 12149 + }, + { + "epoch": 0.6, + "grad_norm": 0.5373410582542419, + "learning_rate": 0.0005435439485263554, + "loss": 3.152, + "step": 12150 + }, + { + "epoch": 0.6, + "grad_norm": 0.5832005143165588, + "learning_rate": 0.0005435349581040007, + "loss": 3.1407, + "step": 12151 + }, + { + "epoch": 0.6, + "grad_norm": 0.5295130014419556, + "learning_rate": 0.0005435259670402219, + "loss": 3.1837, + "step": 12152 + }, + { + "epoch": 0.6, + "grad_norm": 0.5200740694999695, + "learning_rate": 0.0005435169753350425, + "loss": 3.1467, + "step": 12153 + }, + { + "epoch": 0.6, + "grad_norm": 0.5249675512313843, + "learning_rate": 0.0005435079829884862, + "loss": 3.1414, + "step": 12154 + }, + { + "epoch": 0.6, + "grad_norm": 0.5205420851707458, + "learning_rate": 0.0005434989900005769, + "loss": 3.291, + "step": 12155 + }, + { + "epoch": 0.6, + "grad_norm": 0.4993976950645447, + "learning_rate": 0.000543489996371338, + "loss": 3.1936, + "step": 12156 + }, + { + "epoch": 0.6, + "grad_norm": 0.49298644065856934, + "learning_rate": 0.0005434810021007933, + "loss": 3.2909, + "step": 12157 + }, + { + "epoch": 0.6, + "grad_norm": 0.5064563751220703, + "learning_rate": 0.0005434720071889666, + "loss": 3.2605, + "step": 12158 + }, + { + "epoch": 0.6, + "grad_norm": 0.4936967194080353, + "learning_rate": 0.0005434630116358814, + "loss": 3.1549, + "step": 12159 + }, + { + "epoch": 0.6, + "grad_norm": 0.4993574917316437, + "learning_rate": 0.0005434540154415615, + "loss": 3.1781, + "step": 12160 + }, + { + "epoch": 0.6, + "grad_norm": 0.5252702236175537, + "learning_rate": 0.0005434450186060305, + "loss": 3.3435, + "step": 12161 + }, + { + "epoch": 0.6, + "grad_norm": 0.5163652896881104, + "learning_rate": 0.0005434360211293123, + "loss": 3.0923, + "step": 12162 + }, + { + "epoch": 0.6, + "grad_norm": 0.5049394965171814, + "learning_rate": 0.0005434270230114305, + "loss": 3.1294, + "step": 12163 + }, + { + "epoch": 0.6, + "grad_norm": 0.578566312789917, + "learning_rate": 0.0005434180242524086, + "loss": 3.3308, + "step": 12164 + }, + { + "epoch": 0.6, + "grad_norm": 0.5135256052017212, + "learning_rate": 0.0005434090248522706, + "loss": 3.1645, + "step": 12165 + }, + { + "epoch": 0.6, + "grad_norm": 0.4964609146118164, + "learning_rate": 0.00054340002481104, + "loss": 3.3597, + "step": 12166 + }, + { + "epoch": 0.6, + "grad_norm": 0.5104126334190369, + "learning_rate": 0.0005433910241287407, + "loss": 2.9874, + "step": 12167 + }, + { + "epoch": 0.6, + "grad_norm": 0.6239469051361084, + "learning_rate": 0.0005433820228053962, + "loss": 3.2153, + "step": 12168 + }, + { + "epoch": 0.6, + "grad_norm": 0.4951239824295044, + "learning_rate": 0.0005433730208410303, + "loss": 3.0318, + "step": 12169 + }, + { + "epoch": 0.6, + "grad_norm": 0.506628155708313, + "learning_rate": 0.0005433640182356666, + "loss": 3.3427, + "step": 12170 + }, + { + "epoch": 0.6, + "grad_norm": 0.6572966575622559, + "learning_rate": 0.0005433550149893291, + "loss": 3.2291, + "step": 12171 + }, + { + "epoch": 0.6, + "grad_norm": 0.5308460593223572, + "learning_rate": 0.0005433460111020413, + "loss": 3.1009, + "step": 12172 + }, + { + "epoch": 0.6, + "grad_norm": 0.5671391487121582, + "learning_rate": 0.0005433370065738268, + "loss": 3.1697, + "step": 12173 + }, + { + "epoch": 0.6, + "grad_norm": 0.5267753005027771, + "learning_rate": 0.0005433280014047095, + "loss": 3.2779, + "step": 12174 + }, + { + "epoch": 0.6, + "grad_norm": 0.5052096247673035, + "learning_rate": 0.0005433189955947131, + "loss": 3.2854, + "step": 12175 + }, + { + "epoch": 0.6, + "grad_norm": 0.5118441581726074, + "learning_rate": 0.0005433099891438614, + "loss": 3.276, + "step": 12176 + }, + { + "epoch": 0.6, + "grad_norm": 0.504606306552887, + "learning_rate": 0.0005433009820521779, + "loss": 3.3353, + "step": 12177 + }, + { + "epoch": 0.6, + "grad_norm": 0.5112560391426086, + "learning_rate": 0.0005432919743196865, + "loss": 3.2776, + "step": 12178 + }, + { + "epoch": 0.6, + "grad_norm": 0.5266879200935364, + "learning_rate": 0.0005432829659464107, + "loss": 3.3279, + "step": 12179 + }, + { + "epoch": 0.6, + "grad_norm": 0.5236815214157104, + "learning_rate": 0.0005432739569323745, + "loss": 3.1972, + "step": 12180 + }, + { + "epoch": 0.6, + "grad_norm": 0.5193027257919312, + "learning_rate": 0.0005432649472776015, + "loss": 3.1793, + "step": 12181 + }, + { + "epoch": 0.6, + "grad_norm": 0.5326626300811768, + "learning_rate": 0.0005432559369821156, + "loss": 3.2743, + "step": 12182 + }, + { + "epoch": 0.6, + "grad_norm": 0.5254206657409668, + "learning_rate": 0.0005432469260459402, + "loss": 3.4511, + "step": 12183 + }, + { + "epoch": 0.6, + "grad_norm": 0.5383450388908386, + "learning_rate": 0.0005432379144690992, + "loss": 3.1277, + "step": 12184 + }, + { + "epoch": 0.6, + "grad_norm": 0.48813480138778687, + "learning_rate": 0.0005432289022516164, + "loss": 2.9519, + "step": 12185 + }, + { + "epoch": 0.6, + "grad_norm": 0.5338407158851624, + "learning_rate": 0.0005432198893935155, + "loss": 3.1829, + "step": 12186 + }, + { + "epoch": 0.6, + "grad_norm": 0.5344692468643188, + "learning_rate": 0.0005432108758948201, + "loss": 3.2572, + "step": 12187 + }, + { + "epoch": 0.6, + "grad_norm": 0.5389733910560608, + "learning_rate": 0.0005432018617555542, + "loss": 3.1464, + "step": 12188 + }, + { + "epoch": 0.6, + "grad_norm": 0.49581897258758545, + "learning_rate": 0.0005431928469757415, + "loss": 3.1573, + "step": 12189 + }, + { + "epoch": 0.6, + "grad_norm": 0.5306205749511719, + "learning_rate": 0.0005431838315554054, + "loss": 3.3115, + "step": 12190 + }, + { + "epoch": 0.6, + "grad_norm": 0.5148359537124634, + "learning_rate": 0.0005431748154945701, + "loss": 3.2685, + "step": 12191 + }, + { + "epoch": 0.6, + "grad_norm": 0.5625180602073669, + "learning_rate": 0.0005431657987932589, + "loss": 3.1569, + "step": 12192 + }, + { + "epoch": 0.6, + "grad_norm": 0.48063063621520996, + "learning_rate": 0.000543156781451496, + "loss": 3.3465, + "step": 12193 + }, + { + "epoch": 0.6, + "grad_norm": 0.49699029326438904, + "learning_rate": 0.0005431477634693048, + "loss": 3.2766, + "step": 12194 + }, + { + "epoch": 0.6, + "grad_norm": 0.4957977533340454, + "learning_rate": 0.0005431387448467091, + "loss": 3.4118, + "step": 12195 + }, + { + "epoch": 0.6, + "grad_norm": 0.5320689678192139, + "learning_rate": 0.000543129725583733, + "loss": 3.2341, + "step": 12196 + }, + { + "epoch": 0.6, + "grad_norm": 0.4783723056316376, + "learning_rate": 0.0005431207056803999, + "loss": 3.1873, + "step": 12197 + }, + { + "epoch": 0.6, + "grad_norm": 0.5043808817863464, + "learning_rate": 0.0005431116851367336, + "loss": 3.0182, + "step": 12198 + }, + { + "epoch": 0.6, + "grad_norm": 0.5587702393531799, + "learning_rate": 0.000543102663952758, + "loss": 3.2452, + "step": 12199 + }, + { + "epoch": 0.6, + "grad_norm": 0.5152283310890198, + "learning_rate": 0.0005430936421284967, + "loss": 3.2618, + "step": 12200 + }, + { + "epoch": 0.6, + "grad_norm": 0.5216172337532043, + "learning_rate": 0.0005430846196639734, + "loss": 3.1672, + "step": 12201 + }, + { + "epoch": 0.6, + "grad_norm": 0.5034514665603638, + "learning_rate": 0.0005430755965592121, + "loss": 3.1348, + "step": 12202 + }, + { + "epoch": 0.6, + "grad_norm": 0.5430858731269836, + "learning_rate": 0.0005430665728142366, + "loss": 3.2768, + "step": 12203 + }, + { + "epoch": 0.6, + "grad_norm": 0.5175026059150696, + "learning_rate": 0.0005430575484290704, + "loss": 3.2739, + "step": 12204 + }, + { + "epoch": 0.6, + "grad_norm": 0.5324364304542542, + "learning_rate": 0.0005430485234037373, + "loss": 3.3467, + "step": 12205 + }, + { + "epoch": 0.6, + "grad_norm": 0.5634534955024719, + "learning_rate": 0.0005430394977382613, + "loss": 3.0756, + "step": 12206 + }, + { + "epoch": 0.6, + "grad_norm": 0.5331823229789734, + "learning_rate": 0.000543030471432666, + "loss": 3.3163, + "step": 12207 + }, + { + "epoch": 0.6, + "grad_norm": 0.5143170356750488, + "learning_rate": 0.0005430214444869752, + "loss": 3.16, + "step": 12208 + }, + { + "epoch": 0.6, + "grad_norm": 0.5276603102684021, + "learning_rate": 0.0005430124169012127, + "loss": 3.182, + "step": 12209 + }, + { + "epoch": 0.6, + "grad_norm": 0.5391786694526672, + "learning_rate": 0.0005430033886754022, + "loss": 3.0836, + "step": 12210 + }, + { + "epoch": 0.6, + "grad_norm": 0.5072814226150513, + "learning_rate": 0.0005429943598095677, + "loss": 3.2794, + "step": 12211 + }, + { + "epoch": 0.6, + "grad_norm": 0.5004345774650574, + "learning_rate": 0.0005429853303037328, + "loss": 3.3006, + "step": 12212 + }, + { + "epoch": 0.6, + "grad_norm": 0.5201953053474426, + "learning_rate": 0.0005429763001579211, + "loss": 3.2392, + "step": 12213 + }, + { + "epoch": 0.6, + "grad_norm": 0.6965551972389221, + "learning_rate": 0.0005429672693721567, + "loss": 3.0654, + "step": 12214 + }, + { + "epoch": 0.6, + "grad_norm": 0.5072376132011414, + "learning_rate": 0.0005429582379464634, + "loss": 3.4563, + "step": 12215 + }, + { + "epoch": 0.6, + "grad_norm": 0.47854286432266235, + "learning_rate": 0.0005429492058808647, + "loss": 3.2517, + "step": 12216 + }, + { + "epoch": 0.6, + "grad_norm": 0.5493162870407104, + "learning_rate": 0.0005429401731753846, + "loss": 3.3376, + "step": 12217 + }, + { + "epoch": 0.6, + "grad_norm": 0.5450829863548279, + "learning_rate": 0.0005429311398300468, + "loss": 3.2878, + "step": 12218 + }, + { + "epoch": 0.6, + "grad_norm": 0.4950675368309021, + "learning_rate": 0.0005429221058448752, + "loss": 3.0321, + "step": 12219 + }, + { + "epoch": 0.6, + "grad_norm": 0.5706702470779419, + "learning_rate": 0.0005429130712198935, + "loss": 3.4247, + "step": 12220 + }, + { + "epoch": 0.6, + "grad_norm": 0.4959094226360321, + "learning_rate": 0.0005429040359551256, + "loss": 3.1983, + "step": 12221 + }, + { + "epoch": 0.6, + "grad_norm": 0.5183280110359192, + "learning_rate": 0.0005428950000505951, + "loss": 3.0507, + "step": 12222 + }, + { + "epoch": 0.6, + "grad_norm": 0.4983315169811249, + "learning_rate": 0.0005428859635063258, + "loss": 3.3056, + "step": 12223 + }, + { + "epoch": 0.6, + "grad_norm": 0.5146493315696716, + "learning_rate": 0.0005428769263223418, + "loss": 3.3479, + "step": 12224 + }, + { + "epoch": 0.6, + "grad_norm": 0.4943070709705353, + "learning_rate": 0.0005428678884986667, + "loss": 3.1996, + "step": 12225 + }, + { + "epoch": 0.6, + "grad_norm": 0.49658817052841187, + "learning_rate": 0.0005428588500353243, + "loss": 3.1671, + "step": 12226 + }, + { + "epoch": 0.6, + "grad_norm": 0.5471540689468384, + "learning_rate": 0.0005428498109323384, + "loss": 3.1609, + "step": 12227 + }, + { + "epoch": 0.6, + "grad_norm": 0.5084481835365295, + "learning_rate": 0.0005428407711897329, + "loss": 3.2232, + "step": 12228 + }, + { + "epoch": 0.6, + "grad_norm": 0.5170078277587891, + "learning_rate": 0.0005428317308075315, + "loss": 3.3208, + "step": 12229 + }, + { + "epoch": 0.6, + "grad_norm": 0.4958844780921936, + "learning_rate": 0.0005428226897857581, + "loss": 3.4691, + "step": 12230 + }, + { + "epoch": 0.6, + "grad_norm": 0.5121991634368896, + "learning_rate": 0.0005428136481244363, + "loss": 3.3307, + "step": 12231 + }, + { + "epoch": 0.6, + "grad_norm": 0.5569097399711609, + "learning_rate": 0.0005428046058235903, + "loss": 3.2827, + "step": 12232 + }, + { + "epoch": 0.6, + "grad_norm": 0.5006715655326843, + "learning_rate": 0.0005427955628832436, + "loss": 3.4895, + "step": 12233 + }, + { + "epoch": 0.6, + "grad_norm": 0.5267584919929504, + "learning_rate": 0.0005427865193034202, + "loss": 3.0273, + "step": 12234 + }, + { + "epoch": 0.6, + "grad_norm": 0.5398766994476318, + "learning_rate": 0.0005427774750841437, + "loss": 3.2227, + "step": 12235 + }, + { + "epoch": 0.6, + "grad_norm": 0.49153032898902893, + "learning_rate": 0.0005427684302254381, + "loss": 3.2155, + "step": 12236 + }, + { + "epoch": 0.6, + "grad_norm": 0.55174720287323, + "learning_rate": 0.0005427593847273272, + "loss": 3.3676, + "step": 12237 + }, + { + "epoch": 0.6, + "grad_norm": 0.5181941986083984, + "learning_rate": 0.0005427503385898348, + "loss": 3.2572, + "step": 12238 + }, + { + "epoch": 0.6, + "grad_norm": 0.5273623466491699, + "learning_rate": 0.0005427412918129847, + "loss": 3.0727, + "step": 12239 + }, + { + "epoch": 0.6, + "grad_norm": 0.5499759912490845, + "learning_rate": 0.0005427322443968007, + "loss": 3.2162, + "step": 12240 + }, + { + "epoch": 0.6, + "grad_norm": 0.4892823100090027, + "learning_rate": 0.0005427231963413069, + "loss": 3.2164, + "step": 12241 + }, + { + "epoch": 0.6, + "grad_norm": 0.49758782982826233, + "learning_rate": 0.0005427141476465266, + "loss": 3.1819, + "step": 12242 + }, + { + "epoch": 0.6, + "grad_norm": 0.5292083024978638, + "learning_rate": 0.0005427050983124842, + "loss": 3.241, + "step": 12243 + }, + { + "epoch": 0.6, + "grad_norm": 0.5434751510620117, + "learning_rate": 0.0005426960483392031, + "loss": 3.2821, + "step": 12244 + }, + { + "epoch": 0.6, + "grad_norm": 0.5042057037353516, + "learning_rate": 0.0005426869977267075, + "loss": 3.2146, + "step": 12245 + }, + { + "epoch": 0.6, + "grad_norm": 0.5056325197219849, + "learning_rate": 0.000542677946475021, + "loss": 3.3742, + "step": 12246 + }, + { + "epoch": 0.6, + "grad_norm": 0.5204561352729797, + "learning_rate": 0.0005426688945841674, + "loss": 3.2397, + "step": 12247 + }, + { + "epoch": 0.6, + "grad_norm": 0.5179333686828613, + "learning_rate": 0.0005426598420541707, + "loss": 3.2824, + "step": 12248 + }, + { + "epoch": 0.6, + "grad_norm": 0.5086974501609802, + "learning_rate": 0.0005426507888850547, + "loss": 3.2344, + "step": 12249 + }, + { + "epoch": 0.6, + "grad_norm": 0.49645212292671204, + "learning_rate": 0.0005426417350768431, + "loss": 3.0371, + "step": 12250 + }, + { + "epoch": 0.6, + "grad_norm": 0.4945175051689148, + "learning_rate": 0.00054263268062956, + "loss": 3.2515, + "step": 12251 + }, + { + "epoch": 0.6, + "grad_norm": 0.5622349381446838, + "learning_rate": 0.0005426236255432291, + "loss": 3.3922, + "step": 12252 + }, + { + "epoch": 0.6, + "grad_norm": 0.5245631337165833, + "learning_rate": 0.0005426145698178742, + "loss": 3.1202, + "step": 12253 + }, + { + "epoch": 0.6, + "grad_norm": 0.5517042279243469, + "learning_rate": 0.0005426055134535192, + "loss": 3.0991, + "step": 12254 + }, + { + "epoch": 0.6, + "grad_norm": 0.5217844843864441, + "learning_rate": 0.000542596456450188, + "loss": 3.2649, + "step": 12255 + }, + { + "epoch": 0.6, + "grad_norm": 0.47473424673080444, + "learning_rate": 0.0005425873988079045, + "loss": 3.2014, + "step": 12256 + }, + { + "epoch": 0.6, + "grad_norm": 0.5011487007141113, + "learning_rate": 0.0005425783405266924, + "loss": 3.3579, + "step": 12257 + }, + { + "epoch": 0.6, + "grad_norm": 0.5660378336906433, + "learning_rate": 0.0005425692816065757, + "loss": 3.3866, + "step": 12258 + }, + { + "epoch": 0.6, + "grad_norm": 0.5503248572349548, + "learning_rate": 0.0005425602220475781, + "loss": 3.3629, + "step": 12259 + }, + { + "epoch": 0.6, + "grad_norm": 0.5156776309013367, + "learning_rate": 0.0005425511618497237, + "loss": 3.1339, + "step": 12260 + }, + { + "epoch": 0.6, + "grad_norm": 0.5958825349807739, + "learning_rate": 0.0005425421010130361, + "loss": 3.1596, + "step": 12261 + }, + { + "epoch": 0.6, + "grad_norm": 0.5076464414596558, + "learning_rate": 0.0005425330395375392, + "loss": 3.287, + "step": 12262 + }, + { + "epoch": 0.6, + "grad_norm": 0.4932123124599457, + "learning_rate": 0.0005425239774232572, + "loss": 3.3516, + "step": 12263 + }, + { + "epoch": 0.6, + "grad_norm": 0.5037664175033569, + "learning_rate": 0.0005425149146702135, + "loss": 3.1989, + "step": 12264 + }, + { + "epoch": 0.6, + "grad_norm": 0.5109959244728088, + "learning_rate": 0.0005425058512784323, + "loss": 3.3902, + "step": 12265 + }, + { + "epoch": 0.6, + "grad_norm": 0.49323901534080505, + "learning_rate": 0.0005424967872479373, + "loss": 3.4398, + "step": 12266 + }, + { + "epoch": 0.6, + "grad_norm": 0.49260270595550537, + "learning_rate": 0.0005424877225787525, + "loss": 3.293, + "step": 12267 + }, + { + "epoch": 0.6, + "grad_norm": 0.5285913348197937, + "learning_rate": 0.0005424786572709015, + "loss": 3.452, + "step": 12268 + }, + { + "epoch": 0.6, + "grad_norm": 0.5172354578971863, + "learning_rate": 0.0005424695913244087, + "loss": 3.3493, + "step": 12269 + }, + { + "epoch": 0.6, + "grad_norm": 0.5332759022712708, + "learning_rate": 0.0005424605247392975, + "loss": 3.1187, + "step": 12270 + }, + { + "epoch": 0.6, + "grad_norm": 0.5140502452850342, + "learning_rate": 0.0005424514575155919, + "loss": 3.3081, + "step": 12271 + }, + { + "epoch": 0.6, + "grad_norm": 0.49481186270713806, + "learning_rate": 0.0005424423896533159, + "loss": 3.1223, + "step": 12272 + }, + { + "epoch": 0.6, + "grad_norm": 0.5081273913383484, + "learning_rate": 0.0005424333211524932, + "loss": 3.1317, + "step": 12273 + }, + { + "epoch": 0.6, + "grad_norm": 0.4956912100315094, + "learning_rate": 0.0005424242520131479, + "loss": 3.4277, + "step": 12274 + }, + { + "epoch": 0.6, + "grad_norm": 0.5238935947418213, + "learning_rate": 0.0005424151822353038, + "loss": 3.2959, + "step": 12275 + }, + { + "epoch": 0.6, + "grad_norm": 0.5190920233726501, + "learning_rate": 0.0005424061118189847, + "loss": 3.226, + "step": 12276 + }, + { + "epoch": 0.6, + "grad_norm": 0.5076124668121338, + "learning_rate": 0.0005423970407642145, + "loss": 3.2697, + "step": 12277 + }, + { + "epoch": 0.6, + "grad_norm": 0.4859389066696167, + "learning_rate": 0.0005423879690710172, + "loss": 3.1284, + "step": 12278 + }, + { + "epoch": 0.6, + "grad_norm": 0.5037886500358582, + "learning_rate": 0.0005423788967394166, + "loss": 3.3524, + "step": 12279 + }, + { + "epoch": 0.6, + "grad_norm": 0.5088861584663391, + "learning_rate": 0.0005423698237694368, + "loss": 3.3835, + "step": 12280 + }, + { + "epoch": 0.6, + "grad_norm": 0.4993892312049866, + "learning_rate": 0.0005423607501611013, + "loss": 3.1905, + "step": 12281 + }, + { + "epoch": 0.6, + "grad_norm": 0.5249192714691162, + "learning_rate": 0.0005423516759144343, + "loss": 3.4946, + "step": 12282 + }, + { + "epoch": 0.6, + "grad_norm": 0.5226430296897888, + "learning_rate": 0.0005423426010294597, + "loss": 3.1759, + "step": 12283 + }, + { + "epoch": 0.6, + "grad_norm": 0.5157943964004517, + "learning_rate": 0.0005423335255062013, + "loss": 3.0393, + "step": 12284 + }, + { + "epoch": 0.6, + "grad_norm": 0.4765145182609558, + "learning_rate": 0.000542324449344683, + "loss": 3.2868, + "step": 12285 + }, + { + "epoch": 0.6, + "grad_norm": 0.539223849773407, + "learning_rate": 0.0005423153725449287, + "loss": 3.2323, + "step": 12286 + }, + { + "epoch": 0.6, + "grad_norm": 0.5330362319946289, + "learning_rate": 0.0005423062951069624, + "loss": 3.2413, + "step": 12287 + }, + { + "epoch": 0.6, + "grad_norm": 0.5498042106628418, + "learning_rate": 0.000542297217030808, + "loss": 2.9149, + "step": 12288 + }, + { + "epoch": 0.6, + "grad_norm": 0.5433230400085449, + "learning_rate": 0.0005422881383164893, + "loss": 3.1215, + "step": 12289 + }, + { + "epoch": 0.6, + "grad_norm": 0.5228878855705261, + "learning_rate": 0.0005422790589640303, + "loss": 3.2642, + "step": 12290 + }, + { + "epoch": 0.6, + "grad_norm": 0.5186182260513306, + "learning_rate": 0.0005422699789734548, + "loss": 3.4114, + "step": 12291 + }, + { + "epoch": 0.6, + "grad_norm": 0.5254524946212769, + "learning_rate": 0.0005422608983447868, + "loss": 3.4245, + "step": 12292 + }, + { + "epoch": 0.6, + "grad_norm": 0.5335171222686768, + "learning_rate": 0.0005422518170780504, + "loss": 3.0748, + "step": 12293 + }, + { + "epoch": 0.6, + "grad_norm": 0.5440496206283569, + "learning_rate": 0.0005422427351732692, + "loss": 3.0979, + "step": 12294 + }, + { + "epoch": 0.6, + "grad_norm": 0.5325355529785156, + "learning_rate": 0.0005422336526304673, + "loss": 3.2867, + "step": 12295 + }, + { + "epoch": 0.6, + "grad_norm": 0.536151647567749, + "learning_rate": 0.0005422245694496685, + "loss": 3.3285, + "step": 12296 + }, + { + "epoch": 0.6, + "grad_norm": 0.516470730304718, + "learning_rate": 0.0005422154856308969, + "loss": 3.3174, + "step": 12297 + }, + { + "epoch": 0.6, + "grad_norm": 0.4913516938686371, + "learning_rate": 0.0005422064011741763, + "loss": 3.1424, + "step": 12298 + }, + { + "epoch": 0.6, + "grad_norm": 0.4844023883342743, + "learning_rate": 0.0005421973160795306, + "loss": 3.3294, + "step": 12299 + }, + { + "epoch": 0.6, + "grad_norm": 0.5266075134277344, + "learning_rate": 0.0005421882303469839, + "loss": 3.1666, + "step": 12300 + }, + { + "epoch": 0.6, + "grad_norm": 0.5134689807891846, + "learning_rate": 0.0005421791439765599, + "loss": 3.1568, + "step": 12301 + }, + { + "epoch": 0.6, + "grad_norm": 0.5349195599555969, + "learning_rate": 0.0005421700569682827, + "loss": 3.3157, + "step": 12302 + }, + { + "epoch": 0.6, + "grad_norm": 0.5083217024803162, + "learning_rate": 0.0005421609693221761, + "loss": 3.0988, + "step": 12303 + }, + { + "epoch": 0.6, + "grad_norm": 0.49216586351394653, + "learning_rate": 0.0005421518810382641, + "loss": 3.3646, + "step": 12304 + }, + { + "epoch": 0.6, + "grad_norm": 0.5193181037902832, + "learning_rate": 0.0005421427921165708, + "loss": 3.3176, + "step": 12305 + }, + { + "epoch": 0.6, + "grad_norm": 0.5635499358177185, + "learning_rate": 0.0005421337025571199, + "loss": 3.1367, + "step": 12306 + }, + { + "epoch": 0.6, + "grad_norm": 0.5421886444091797, + "learning_rate": 0.0005421246123599354, + "loss": 3.2597, + "step": 12307 + }, + { + "epoch": 0.6, + "grad_norm": 0.49263355135917664, + "learning_rate": 0.0005421155215250413, + "loss": 3.3185, + "step": 12308 + }, + { + "epoch": 0.6, + "grad_norm": 0.4998137652873993, + "learning_rate": 0.0005421064300524615, + "loss": 3.2735, + "step": 12309 + }, + { + "epoch": 0.6, + "grad_norm": 0.5159027576446533, + "learning_rate": 0.00054209733794222, + "loss": 3.1562, + "step": 12310 + }, + { + "epoch": 0.6, + "grad_norm": 0.48767736554145813, + "learning_rate": 0.0005420882451943406, + "loss": 3.0237, + "step": 12311 + }, + { + "epoch": 0.6, + "grad_norm": 0.5056929588317871, + "learning_rate": 0.0005420791518088474, + "loss": 3.2519, + "step": 12312 + }, + { + "epoch": 0.6, + "grad_norm": 0.5624472498893738, + "learning_rate": 0.0005420700577857643, + "loss": 3.0888, + "step": 12313 + }, + { + "epoch": 0.6, + "grad_norm": 0.5267937183380127, + "learning_rate": 0.0005420609631251154, + "loss": 3.1991, + "step": 12314 + }, + { + "epoch": 0.6, + "grad_norm": 0.5424567461013794, + "learning_rate": 0.0005420518678269244, + "loss": 3.2991, + "step": 12315 + }, + { + "epoch": 0.6, + "grad_norm": 0.4930018484592438, + "learning_rate": 0.0005420427718912153, + "loss": 3.123, + "step": 12316 + }, + { + "epoch": 0.6, + "grad_norm": 0.5012913346290588, + "learning_rate": 0.0005420336753180122, + "loss": 3.3452, + "step": 12317 + }, + { + "epoch": 0.6, + "grad_norm": 0.5235104560852051, + "learning_rate": 0.0005420245781073389, + "loss": 3.1449, + "step": 12318 + }, + { + "epoch": 0.6, + "grad_norm": 0.5059866905212402, + "learning_rate": 0.0005420154802592195, + "loss": 3.213, + "step": 12319 + }, + { + "epoch": 0.6, + "grad_norm": 0.5268304944038391, + "learning_rate": 0.0005420063817736778, + "loss": 3.2597, + "step": 12320 + }, + { + "epoch": 0.6, + "grad_norm": 0.5179598331451416, + "learning_rate": 0.000541997282650738, + "loss": 3.0205, + "step": 12321 + }, + { + "epoch": 0.6, + "grad_norm": 0.501977264881134, + "learning_rate": 0.0005419881828904239, + "loss": 3.1745, + "step": 12322 + }, + { + "epoch": 0.6, + "grad_norm": 0.5240980386734009, + "learning_rate": 0.0005419790824927595, + "loss": 3.2458, + "step": 12323 + }, + { + "epoch": 0.6, + "grad_norm": 0.49430304765701294, + "learning_rate": 0.0005419699814577687, + "loss": 3.4038, + "step": 12324 + }, + { + "epoch": 0.6, + "grad_norm": 0.5261852145195007, + "learning_rate": 0.0005419608797854757, + "loss": 3.3711, + "step": 12325 + }, + { + "epoch": 0.6, + "grad_norm": 0.5073800683021545, + "learning_rate": 0.0005419517774759041, + "loss": 3.1167, + "step": 12326 + }, + { + "epoch": 0.6, + "grad_norm": 0.5025413036346436, + "learning_rate": 0.0005419426745290782, + "loss": 3.3158, + "step": 12327 + }, + { + "epoch": 0.6, + "grad_norm": 0.5052799582481384, + "learning_rate": 0.0005419335709450218, + "loss": 3.131, + "step": 12328 + }, + { + "epoch": 0.6, + "grad_norm": 0.5107625722885132, + "learning_rate": 0.000541924466723759, + "loss": 3.4689, + "step": 12329 + }, + { + "epoch": 0.6, + "grad_norm": 0.5340820550918579, + "learning_rate": 0.0005419153618653137, + "loss": 3.3352, + "step": 12330 + }, + { + "epoch": 0.6, + "grad_norm": 0.5223778486251831, + "learning_rate": 0.0005419062563697098, + "loss": 3.2638, + "step": 12331 + }, + { + "epoch": 0.6, + "grad_norm": 0.5212966203689575, + "learning_rate": 0.0005418971502369716, + "loss": 3.2023, + "step": 12332 + }, + { + "epoch": 0.6, + "grad_norm": 0.5198991298675537, + "learning_rate": 0.0005418880434671226, + "loss": 3.2943, + "step": 12333 + }, + { + "epoch": 0.6, + "grad_norm": 0.5650694966316223, + "learning_rate": 0.0005418789360601872, + "loss": 3.0997, + "step": 12334 + }, + { + "epoch": 0.6, + "grad_norm": 0.5771616101264954, + "learning_rate": 0.0005418698280161892, + "loss": 3.07, + "step": 12335 + }, + { + "epoch": 0.6, + "grad_norm": 0.5297648906707764, + "learning_rate": 0.0005418607193351526, + "loss": 3.3583, + "step": 12336 + }, + { + "epoch": 0.6, + "grad_norm": 0.5116552710533142, + "learning_rate": 0.0005418516100171015, + "loss": 3.1286, + "step": 12337 + }, + { + "epoch": 0.6, + "grad_norm": 0.5247451663017273, + "learning_rate": 0.0005418425000620597, + "loss": 2.9956, + "step": 12338 + }, + { + "epoch": 0.6, + "grad_norm": 0.5303729772567749, + "learning_rate": 0.0005418333894700513, + "loss": 3.1847, + "step": 12339 + }, + { + "epoch": 0.6, + "grad_norm": 0.5321176648139954, + "learning_rate": 0.0005418242782411004, + "loss": 3.2272, + "step": 12340 + }, + { + "epoch": 0.6, + "grad_norm": 0.5363842844963074, + "learning_rate": 0.0005418151663752307, + "loss": 3.1136, + "step": 12341 + }, + { + "epoch": 0.6, + "grad_norm": 0.543472409248352, + "learning_rate": 0.0005418060538724666, + "loss": 3.3665, + "step": 12342 + }, + { + "epoch": 0.6, + "grad_norm": 0.5340068936347961, + "learning_rate": 0.0005417969407328318, + "loss": 3.1082, + "step": 12343 + }, + { + "epoch": 0.6, + "grad_norm": 0.49749499559402466, + "learning_rate": 0.0005417878269563504, + "loss": 3.0852, + "step": 12344 + }, + { + "epoch": 0.6, + "grad_norm": 0.5240546464920044, + "learning_rate": 0.0005417787125430464, + "loss": 3.1989, + "step": 12345 + }, + { + "epoch": 0.61, + "grad_norm": 0.5031521320343018, + "learning_rate": 0.0005417695974929438, + "loss": 3.1801, + "step": 12346 + }, + { + "epoch": 0.61, + "grad_norm": 0.5448225736618042, + "learning_rate": 0.0005417604818060666, + "loss": 3.4148, + "step": 12347 + }, + { + "epoch": 0.61, + "grad_norm": 0.5050467848777771, + "learning_rate": 0.0005417513654824387, + "loss": 3.1923, + "step": 12348 + }, + { + "epoch": 0.61, + "grad_norm": 0.5299510359764099, + "learning_rate": 0.0005417422485220844, + "loss": 3.1741, + "step": 12349 + }, + { + "epoch": 0.61, + "grad_norm": 0.5421825647354126, + "learning_rate": 0.0005417331309250275, + "loss": 3.2135, + "step": 12350 + }, + { + "epoch": 0.61, + "grad_norm": 0.5428870916366577, + "learning_rate": 0.0005417240126912921, + "loss": 3.1527, + "step": 12351 + }, + { + "epoch": 0.61, + "grad_norm": 0.6039544939994812, + "learning_rate": 0.0005417148938209021, + "loss": 3.3517, + "step": 12352 + }, + { + "epoch": 0.61, + "grad_norm": 0.5433862805366516, + "learning_rate": 0.0005417057743138816, + "loss": 3.2371, + "step": 12353 + }, + { + "epoch": 0.61, + "grad_norm": 0.48752960562705994, + "learning_rate": 0.0005416966541702546, + "loss": 3.1304, + "step": 12354 + }, + { + "epoch": 0.61, + "grad_norm": 0.529007613658905, + "learning_rate": 0.0005416875333900451, + "loss": 3.1325, + "step": 12355 + }, + { + "epoch": 0.61, + "grad_norm": 0.5171936750411987, + "learning_rate": 0.0005416784119732773, + "loss": 3.1296, + "step": 12356 + }, + { + "epoch": 0.61, + "grad_norm": 0.5062511563301086, + "learning_rate": 0.0005416692899199749, + "loss": 3.3456, + "step": 12357 + }, + { + "epoch": 0.61, + "grad_norm": 0.5134553909301758, + "learning_rate": 0.0005416601672301622, + "loss": 3.0025, + "step": 12358 + }, + { + "epoch": 0.61, + "grad_norm": 0.5461899042129517, + "learning_rate": 0.0005416510439038631, + "loss": 3.4576, + "step": 12359 + }, + { + "epoch": 0.61, + "grad_norm": 0.6372771859169006, + "learning_rate": 0.0005416419199411016, + "loss": 3.4149, + "step": 12360 + }, + { + "epoch": 0.61, + "grad_norm": 0.5487025380134583, + "learning_rate": 0.0005416327953419018, + "loss": 3.2196, + "step": 12361 + }, + { + "epoch": 0.61, + "grad_norm": 0.5107635855674744, + "learning_rate": 0.0005416236701062878, + "loss": 3.1029, + "step": 12362 + }, + { + "epoch": 0.61, + "grad_norm": 0.556648850440979, + "learning_rate": 0.0005416145442342836, + "loss": 3.2373, + "step": 12363 + }, + { + "epoch": 0.61, + "grad_norm": 0.5657925605773926, + "learning_rate": 0.0005416054177259131, + "loss": 3.1103, + "step": 12364 + }, + { + "epoch": 0.61, + "grad_norm": 0.49510687589645386, + "learning_rate": 0.0005415962905812004, + "loss": 3.2521, + "step": 12365 + }, + { + "epoch": 0.61, + "grad_norm": 0.5163629651069641, + "learning_rate": 0.0005415871628001696, + "loss": 3.2231, + "step": 12366 + }, + { + "epoch": 0.61, + "grad_norm": 0.5414348244667053, + "learning_rate": 0.0005415780343828447, + "loss": 3.0145, + "step": 12367 + }, + { + "epoch": 0.61, + "grad_norm": 0.5388548374176025, + "learning_rate": 0.0005415689053292497, + "loss": 3.0301, + "step": 12368 + }, + { + "epoch": 0.61, + "grad_norm": 0.49389225244522095, + "learning_rate": 0.0005415597756394089, + "loss": 3.2222, + "step": 12369 + }, + { + "epoch": 0.61, + "grad_norm": 0.5336668491363525, + "learning_rate": 0.0005415506453133459, + "loss": 3.2878, + "step": 12370 + }, + { + "epoch": 0.61, + "grad_norm": 0.500521719455719, + "learning_rate": 0.000541541514351085, + "loss": 3.113, + "step": 12371 + }, + { + "epoch": 0.61, + "grad_norm": 0.4869195520877838, + "learning_rate": 0.0005415323827526505, + "loss": 3.1353, + "step": 12372 + }, + { + "epoch": 0.61, + "grad_norm": 0.5147513747215271, + "learning_rate": 0.000541523250518066, + "loss": 3.3804, + "step": 12373 + }, + { + "epoch": 0.61, + "grad_norm": 0.5055763721466064, + "learning_rate": 0.0005415141176473557, + "loss": 3.3596, + "step": 12374 + }, + { + "epoch": 0.61, + "grad_norm": 0.5058522820472717, + "learning_rate": 0.0005415049841405437, + "loss": 3.0117, + "step": 12375 + }, + { + "epoch": 0.61, + "grad_norm": 0.5772499442100525, + "learning_rate": 0.0005414958499976541, + "loss": 3.1889, + "step": 12376 + }, + { + "epoch": 0.61, + "grad_norm": 0.5016292333602905, + "learning_rate": 0.0005414867152187108, + "loss": 3.1536, + "step": 12377 + }, + { + "epoch": 0.61, + "grad_norm": 0.5072944760322571, + "learning_rate": 0.0005414775798037382, + "loss": 3.2119, + "step": 12378 + }, + { + "epoch": 0.61, + "grad_norm": 0.5283709168434143, + "learning_rate": 0.0005414684437527599, + "loss": 2.9134, + "step": 12379 + }, + { + "epoch": 0.61, + "grad_norm": 0.5712698101997375, + "learning_rate": 0.0005414593070658004, + "loss": 3.2677, + "step": 12380 + }, + { + "epoch": 0.61, + "grad_norm": 0.5036458373069763, + "learning_rate": 0.0005414501697428833, + "loss": 3.2085, + "step": 12381 + }, + { + "epoch": 0.61, + "grad_norm": 0.5280167460441589, + "learning_rate": 0.0005414410317840331, + "loss": 3.1025, + "step": 12382 + }, + { + "epoch": 0.61, + "grad_norm": 0.548110842704773, + "learning_rate": 0.0005414318931892737, + "loss": 3.3154, + "step": 12383 + }, + { + "epoch": 0.61, + "grad_norm": 0.5638526082038879, + "learning_rate": 0.0005414227539586291, + "loss": 3.189, + "step": 12384 + }, + { + "epoch": 0.61, + "grad_norm": 0.519481897354126, + "learning_rate": 0.0005414136140921234, + "loss": 3.1508, + "step": 12385 + }, + { + "epoch": 0.61, + "grad_norm": 0.49351656436920166, + "learning_rate": 0.0005414044735897808, + "loss": 3.2086, + "step": 12386 + }, + { + "epoch": 0.61, + "grad_norm": 0.47887444496154785, + "learning_rate": 0.0005413953324516251, + "loss": 3.0731, + "step": 12387 + }, + { + "epoch": 0.61, + "grad_norm": 0.5507675409317017, + "learning_rate": 0.0005413861906776807, + "loss": 3.3115, + "step": 12388 + }, + { + "epoch": 0.61, + "grad_norm": 0.518807590007782, + "learning_rate": 0.0005413770482679714, + "loss": 3.5088, + "step": 12389 + }, + { + "epoch": 0.61, + "grad_norm": 0.5343676209449768, + "learning_rate": 0.0005413679052225216, + "loss": 3.2752, + "step": 12390 + }, + { + "epoch": 0.61, + "grad_norm": 0.5148628354072571, + "learning_rate": 0.0005413587615413551, + "loss": 3.1195, + "step": 12391 + }, + { + "epoch": 0.61, + "grad_norm": 0.5338741540908813, + "learning_rate": 0.000541349617224496, + "loss": 3.0901, + "step": 12392 + }, + { + "epoch": 0.61, + "grad_norm": 0.508698046207428, + "learning_rate": 0.0005413404722719686, + "loss": 3.3288, + "step": 12393 + }, + { + "epoch": 0.61, + "grad_norm": 0.5255823731422424, + "learning_rate": 0.0005413313266837968, + "loss": 3.2126, + "step": 12394 + }, + { + "epoch": 0.61, + "grad_norm": 0.5159070491790771, + "learning_rate": 0.0005413221804600047, + "loss": 3.0548, + "step": 12395 + }, + { + "epoch": 0.61, + "grad_norm": 0.5951764583587646, + "learning_rate": 0.0005413130336006164, + "loss": 3.3417, + "step": 12396 + }, + { + "epoch": 0.61, + "grad_norm": 0.5076818466186523, + "learning_rate": 0.000541303886105656, + "loss": 3.3209, + "step": 12397 + }, + { + "epoch": 0.61, + "grad_norm": 0.5986706018447876, + "learning_rate": 0.0005412947379751477, + "loss": 3.2027, + "step": 12398 + }, + { + "epoch": 0.61, + "grad_norm": 0.506370484828949, + "learning_rate": 0.0005412855892091154, + "loss": 3.1447, + "step": 12399 + }, + { + "epoch": 0.61, + "grad_norm": 0.5274400115013123, + "learning_rate": 0.0005412764398075833, + "loss": 3.3458, + "step": 12400 + }, + { + "epoch": 0.61, + "grad_norm": 0.5394174456596375, + "learning_rate": 0.0005412672897705755, + "loss": 3.3062, + "step": 12401 + }, + { + "epoch": 0.61, + "grad_norm": 0.49921727180480957, + "learning_rate": 0.0005412581390981161, + "loss": 3.3473, + "step": 12402 + }, + { + "epoch": 0.61, + "grad_norm": 0.5664904713630676, + "learning_rate": 0.0005412489877902292, + "loss": 3.2196, + "step": 12403 + }, + { + "epoch": 0.61, + "grad_norm": 0.5725209712982178, + "learning_rate": 0.0005412398358469389, + "loss": 3.3237, + "step": 12404 + }, + { + "epoch": 0.61, + "grad_norm": 0.5521411299705505, + "learning_rate": 0.0005412306832682693, + "loss": 3.1473, + "step": 12405 + }, + { + "epoch": 0.61, + "grad_norm": 0.5140300393104553, + "learning_rate": 0.0005412215300542446, + "loss": 3.2564, + "step": 12406 + }, + { + "epoch": 0.61, + "grad_norm": 0.5515618920326233, + "learning_rate": 0.0005412123762048885, + "loss": 3.029, + "step": 12407 + }, + { + "epoch": 0.61, + "grad_norm": 0.5705258846282959, + "learning_rate": 0.0005412032217202257, + "loss": 3.2977, + "step": 12408 + }, + { + "epoch": 0.61, + "grad_norm": 0.5424354672431946, + "learning_rate": 0.00054119406660028, + "loss": 3.1524, + "step": 12409 + }, + { + "epoch": 0.61, + "grad_norm": 0.5054551959037781, + "learning_rate": 0.0005411849108450756, + "loss": 3.2744, + "step": 12410 + }, + { + "epoch": 0.61, + "grad_norm": 0.5675023794174194, + "learning_rate": 0.0005411757544546364, + "loss": 3.1539, + "step": 12411 + }, + { + "epoch": 0.61, + "grad_norm": 0.5317476391792297, + "learning_rate": 0.0005411665974289867, + "loss": 3.2369, + "step": 12412 + }, + { + "epoch": 0.61, + "grad_norm": 0.5437188744544983, + "learning_rate": 0.0005411574397681507, + "loss": 3.3225, + "step": 12413 + }, + { + "epoch": 0.61, + "grad_norm": 0.5426942706108093, + "learning_rate": 0.0005411482814721523, + "loss": 3.2466, + "step": 12414 + }, + { + "epoch": 0.61, + "grad_norm": 0.5315844416618347, + "learning_rate": 0.0005411391225410159, + "loss": 3.1621, + "step": 12415 + }, + { + "epoch": 0.61, + "grad_norm": 0.5135928392410278, + "learning_rate": 0.0005411299629747654, + "loss": 3.1404, + "step": 12416 + }, + { + "epoch": 0.61, + "grad_norm": 0.5293831825256348, + "learning_rate": 0.000541120802773425, + "loss": 3.1356, + "step": 12417 + }, + { + "epoch": 0.61, + "grad_norm": 0.5011104941368103, + "learning_rate": 0.0005411116419370188, + "loss": 3.281, + "step": 12418 + }, + { + "epoch": 0.61, + "grad_norm": 0.524491012096405, + "learning_rate": 0.0005411024804655708, + "loss": 3.1678, + "step": 12419 + }, + { + "epoch": 0.61, + "grad_norm": 0.5337167978286743, + "learning_rate": 0.0005410933183591053, + "loss": 3.1052, + "step": 12420 + }, + { + "epoch": 0.61, + "grad_norm": 0.49512872099876404, + "learning_rate": 0.0005410841556176465, + "loss": 3.3314, + "step": 12421 + }, + { + "epoch": 0.61, + "grad_norm": 0.5637658834457397, + "learning_rate": 0.0005410749922412184, + "loss": 3.0071, + "step": 12422 + }, + { + "epoch": 0.61, + "grad_norm": 0.5093816518783569, + "learning_rate": 0.0005410658282298451, + "loss": 3.38, + "step": 12423 + }, + { + "epoch": 0.61, + "grad_norm": 0.5493909120559692, + "learning_rate": 0.0005410566635835509, + "loss": 3.4419, + "step": 12424 + }, + { + "epoch": 0.61, + "grad_norm": 0.6119068264961243, + "learning_rate": 0.0005410474983023599, + "loss": 3.3356, + "step": 12425 + }, + { + "epoch": 0.61, + "grad_norm": 0.5353181958198547, + "learning_rate": 0.000541038332386296, + "loss": 3.2801, + "step": 12426 + }, + { + "epoch": 0.61, + "grad_norm": 0.5060677528381348, + "learning_rate": 0.0005410291658353837, + "loss": 3.1649, + "step": 12427 + }, + { + "epoch": 0.61, + "grad_norm": 0.5227444171905518, + "learning_rate": 0.0005410199986496467, + "loss": 3.4991, + "step": 12428 + }, + { + "epoch": 0.61, + "grad_norm": 0.5346323251724243, + "learning_rate": 0.0005410108308291097, + "loss": 3.2646, + "step": 12429 + }, + { + "epoch": 0.61, + "grad_norm": 0.5016912221908569, + "learning_rate": 0.0005410016623737964, + "loss": 3.328, + "step": 12430 + }, + { + "epoch": 0.61, + "grad_norm": 0.5151358246803284, + "learning_rate": 0.0005409924932837312, + "loss": 3.2546, + "step": 12431 + }, + { + "epoch": 0.61, + "grad_norm": 0.5453892350196838, + "learning_rate": 0.0005409833235589381, + "loss": 3.1258, + "step": 12432 + }, + { + "epoch": 0.61, + "grad_norm": 0.5162804126739502, + "learning_rate": 0.0005409741531994413, + "loss": 3.212, + "step": 12433 + }, + { + "epoch": 0.61, + "grad_norm": 0.5229930877685547, + "learning_rate": 0.0005409649822052648, + "loss": 2.9828, + "step": 12434 + }, + { + "epoch": 0.61, + "grad_norm": 0.5364524722099304, + "learning_rate": 0.0005409558105764332, + "loss": 3.2532, + "step": 12435 + }, + { + "epoch": 0.61, + "grad_norm": 0.5143020153045654, + "learning_rate": 0.0005409466383129701, + "loss": 3.1768, + "step": 12436 + }, + { + "epoch": 0.61, + "grad_norm": 0.523679256439209, + "learning_rate": 0.0005409374654149001, + "loss": 3.3917, + "step": 12437 + }, + { + "epoch": 0.61, + "grad_norm": 0.5435823798179626, + "learning_rate": 0.0005409282918822472, + "loss": 3.1854, + "step": 12438 + }, + { + "epoch": 0.61, + "grad_norm": 0.5507246255874634, + "learning_rate": 0.0005409191177150355, + "loss": 3.1831, + "step": 12439 + }, + { + "epoch": 0.61, + "grad_norm": 0.5211560726165771, + "learning_rate": 0.0005409099429132891, + "loss": 3.2665, + "step": 12440 + }, + { + "epoch": 0.61, + "grad_norm": 0.49892207980155945, + "learning_rate": 0.0005409007674770324, + "loss": 2.9391, + "step": 12441 + }, + { + "epoch": 0.61, + "grad_norm": 0.5270420908927917, + "learning_rate": 0.0005408915914062894, + "loss": 3.2581, + "step": 12442 + }, + { + "epoch": 0.61, + "grad_norm": 0.518926203250885, + "learning_rate": 0.0005408824147010844, + "loss": 3.2074, + "step": 12443 + }, + { + "epoch": 0.61, + "grad_norm": 0.4994022846221924, + "learning_rate": 0.0005408732373614414, + "loss": 3.0999, + "step": 12444 + }, + { + "epoch": 0.61, + "grad_norm": 0.5447221994400024, + "learning_rate": 0.0005408640593873846, + "loss": 3.1956, + "step": 12445 + }, + { + "epoch": 0.61, + "grad_norm": 0.5384201407432556, + "learning_rate": 0.0005408548807789383, + "loss": 3.3842, + "step": 12446 + }, + { + "epoch": 0.61, + "grad_norm": 0.5086652040481567, + "learning_rate": 0.0005408457015361266, + "loss": 3.2858, + "step": 12447 + }, + { + "epoch": 0.61, + "grad_norm": 0.49078619480133057, + "learning_rate": 0.0005408365216589736, + "loss": 3.143, + "step": 12448 + }, + { + "epoch": 0.61, + "grad_norm": 0.5547512769699097, + "learning_rate": 0.0005408273411475036, + "loss": 3.374, + "step": 12449 + }, + { + "epoch": 0.61, + "grad_norm": 0.49764230847358704, + "learning_rate": 0.0005408181600017407, + "loss": 3.1671, + "step": 12450 + }, + { + "epoch": 0.61, + "grad_norm": 0.5081911087036133, + "learning_rate": 0.0005408089782217092, + "loss": 3.1078, + "step": 12451 + }, + { + "epoch": 0.61, + "grad_norm": 0.5074054002761841, + "learning_rate": 0.0005407997958074331, + "loss": 3.1706, + "step": 12452 + }, + { + "epoch": 0.61, + "grad_norm": 0.47185125946998596, + "learning_rate": 0.0005407906127589368, + "loss": 3.1016, + "step": 12453 + }, + { + "epoch": 0.61, + "grad_norm": 0.5095272064208984, + "learning_rate": 0.0005407814290762442, + "loss": 3.3671, + "step": 12454 + }, + { + "epoch": 0.61, + "grad_norm": 0.5170059204101562, + "learning_rate": 0.0005407722447593798, + "loss": 3.1382, + "step": 12455 + }, + { + "epoch": 0.61, + "grad_norm": 0.5567418336868286, + "learning_rate": 0.0005407630598083676, + "loss": 3.2116, + "step": 12456 + }, + { + "epoch": 0.61, + "grad_norm": 0.48573851585388184, + "learning_rate": 0.0005407538742232317, + "loss": 3.3303, + "step": 12457 + }, + { + "epoch": 0.61, + "grad_norm": 0.5352392792701721, + "learning_rate": 0.0005407446880039966, + "loss": 2.8769, + "step": 12458 + }, + { + "epoch": 0.61, + "grad_norm": 0.6195808053016663, + "learning_rate": 0.0005407355011506862, + "loss": 3.3433, + "step": 12459 + }, + { + "epoch": 0.61, + "grad_norm": 0.5522502660751343, + "learning_rate": 0.000540726313663325, + "loss": 3.2809, + "step": 12460 + }, + { + "epoch": 0.61, + "grad_norm": 0.5253701210021973, + "learning_rate": 0.0005407171255419369, + "loss": 3.3492, + "step": 12461 + }, + { + "epoch": 0.61, + "grad_norm": 0.5422989130020142, + "learning_rate": 0.0005407079367865462, + "loss": 3.2108, + "step": 12462 + }, + { + "epoch": 0.61, + "grad_norm": 0.478427916765213, + "learning_rate": 0.0005406987473971772, + "loss": 3.0953, + "step": 12463 + }, + { + "epoch": 0.61, + "grad_norm": 0.616083562374115, + "learning_rate": 0.0005406895573738539, + "loss": 3.2778, + "step": 12464 + }, + { + "epoch": 0.61, + "grad_norm": 0.5093544125556946, + "learning_rate": 0.0005406803667166008, + "loss": 3.3096, + "step": 12465 + }, + { + "epoch": 0.61, + "grad_norm": 0.5077793598175049, + "learning_rate": 0.0005406711754254418, + "loss": 3.2769, + "step": 12466 + }, + { + "epoch": 0.61, + "grad_norm": 0.52507483959198, + "learning_rate": 0.0005406619835004012, + "loss": 3.2368, + "step": 12467 + }, + { + "epoch": 0.61, + "grad_norm": 0.500822126865387, + "learning_rate": 0.0005406527909415033, + "loss": 3.068, + "step": 12468 + }, + { + "epoch": 0.61, + "grad_norm": 0.5564792156219482, + "learning_rate": 0.0005406435977487723, + "loss": 3.1324, + "step": 12469 + }, + { + "epoch": 0.61, + "grad_norm": 0.5204517245292664, + "learning_rate": 0.0005406344039222325, + "loss": 3.0969, + "step": 12470 + }, + { + "epoch": 0.61, + "grad_norm": 0.5361521244049072, + "learning_rate": 0.0005406252094619079, + "loss": 3.3305, + "step": 12471 + }, + { + "epoch": 0.61, + "grad_norm": 0.5398157238960266, + "learning_rate": 0.0005406160143678226, + "loss": 3.0892, + "step": 12472 + }, + { + "epoch": 0.61, + "grad_norm": 0.5248488187789917, + "learning_rate": 0.0005406068186400013, + "loss": 3.1276, + "step": 12473 + }, + { + "epoch": 0.61, + "grad_norm": 0.5540500283241272, + "learning_rate": 0.0005405976222784678, + "loss": 3.1021, + "step": 12474 + }, + { + "epoch": 0.61, + "grad_norm": 0.5601980686187744, + "learning_rate": 0.0005405884252832465, + "loss": 2.9468, + "step": 12475 + }, + { + "epoch": 0.61, + "grad_norm": 0.5255300998687744, + "learning_rate": 0.0005405792276543616, + "loss": 3.3761, + "step": 12476 + }, + { + "epoch": 0.61, + "grad_norm": 0.5468989014625549, + "learning_rate": 0.0005405700293918373, + "loss": 3.3636, + "step": 12477 + }, + { + "epoch": 0.61, + "grad_norm": 0.5150614976882935, + "learning_rate": 0.000540560830495698, + "loss": 3.3567, + "step": 12478 + }, + { + "epoch": 0.61, + "grad_norm": 0.555590033531189, + "learning_rate": 0.0005405516309659674, + "loss": 3.0297, + "step": 12479 + }, + { + "epoch": 0.61, + "grad_norm": 0.5096856951713562, + "learning_rate": 0.0005405424308026704, + "loss": 3.2316, + "step": 12480 + }, + { + "epoch": 0.61, + "grad_norm": 0.5092878341674805, + "learning_rate": 0.0005405332300058308, + "loss": 3.5083, + "step": 12481 + }, + { + "epoch": 0.61, + "grad_norm": 0.5776546597480774, + "learning_rate": 0.0005405240285754731, + "loss": 3.1405, + "step": 12482 + }, + { + "epoch": 0.61, + "grad_norm": 0.5385265946388245, + "learning_rate": 0.0005405148265116213, + "loss": 3.3177, + "step": 12483 + }, + { + "epoch": 0.61, + "grad_norm": 0.5115643739700317, + "learning_rate": 0.0005405056238142997, + "loss": 3.3791, + "step": 12484 + }, + { + "epoch": 0.61, + "grad_norm": 0.4903174340724945, + "learning_rate": 0.0005404964204835327, + "loss": 3.3797, + "step": 12485 + }, + { + "epoch": 0.61, + "grad_norm": 0.5272238254547119, + "learning_rate": 0.0005404872165193443, + "loss": 3.2622, + "step": 12486 + }, + { + "epoch": 0.61, + "grad_norm": 0.5495355725288391, + "learning_rate": 0.0005404780119217589, + "loss": 3.0021, + "step": 12487 + }, + { + "epoch": 0.61, + "grad_norm": 0.5022799968719482, + "learning_rate": 0.0005404688066908006, + "loss": 3.26, + "step": 12488 + }, + { + "epoch": 0.61, + "grad_norm": 0.5563308596611023, + "learning_rate": 0.0005404596008264939, + "loss": 3.3463, + "step": 12489 + }, + { + "epoch": 0.61, + "grad_norm": 0.5790652632713318, + "learning_rate": 0.0005404503943288628, + "loss": 3.3775, + "step": 12490 + }, + { + "epoch": 0.61, + "grad_norm": 0.5320955514907837, + "learning_rate": 0.0005404411871979316, + "loss": 3.2666, + "step": 12491 + }, + { + "epoch": 0.61, + "grad_norm": 0.5261737704277039, + "learning_rate": 0.0005404319794337246, + "loss": 3.1438, + "step": 12492 + }, + { + "epoch": 0.61, + "grad_norm": 0.4732470214366913, + "learning_rate": 0.0005404227710362661, + "loss": 3.1054, + "step": 12493 + }, + { + "epoch": 0.61, + "grad_norm": 0.5220340490341187, + "learning_rate": 0.0005404135620055803, + "loss": 3.2321, + "step": 12494 + }, + { + "epoch": 0.61, + "grad_norm": 0.4781638979911804, + "learning_rate": 0.0005404043523416914, + "loss": 3.1854, + "step": 12495 + }, + { + "epoch": 0.61, + "grad_norm": 0.5066351294517517, + "learning_rate": 0.0005403951420446237, + "loss": 3.2492, + "step": 12496 + }, + { + "epoch": 0.61, + "grad_norm": 0.49951955676078796, + "learning_rate": 0.0005403859311144015, + "loss": 3.1887, + "step": 12497 + }, + { + "epoch": 0.61, + "grad_norm": 0.5178000330924988, + "learning_rate": 0.000540376719551049, + "loss": 3.2451, + "step": 12498 + }, + { + "epoch": 0.61, + "grad_norm": 0.5040697455406189, + "learning_rate": 0.0005403675073545906, + "loss": 3.2617, + "step": 12499 + }, + { + "epoch": 0.61, + "grad_norm": 0.48904526233673096, + "learning_rate": 0.0005403582945250503, + "loss": 3.0489, + "step": 12500 + }, + { + "epoch": 0.61, + "grad_norm": 0.5385371446609497, + "learning_rate": 0.0005403490810624525, + "loss": 3.1477, + "step": 12501 + }, + { + "epoch": 0.61, + "grad_norm": 0.5342295169830322, + "learning_rate": 0.0005403398669668215, + "loss": 3.1246, + "step": 12502 + }, + { + "epoch": 0.61, + "grad_norm": 0.5917371511459351, + "learning_rate": 0.0005403306522381815, + "loss": 3.241, + "step": 12503 + }, + { + "epoch": 0.61, + "grad_norm": 0.5648188591003418, + "learning_rate": 0.0005403214368765569, + "loss": 3.3202, + "step": 12504 + }, + { + "epoch": 0.61, + "grad_norm": 0.534196674823761, + "learning_rate": 0.0005403122208819718, + "loss": 3.3616, + "step": 12505 + }, + { + "epoch": 0.61, + "grad_norm": 0.5107386112213135, + "learning_rate": 0.0005403030042544506, + "loss": 3.179, + "step": 12506 + }, + { + "epoch": 0.61, + "grad_norm": 0.5261774659156799, + "learning_rate": 0.0005402937869940177, + "loss": 3.5787, + "step": 12507 + }, + { + "epoch": 0.61, + "grad_norm": 0.4821683168411255, + "learning_rate": 0.000540284569100697, + "loss": 3.2806, + "step": 12508 + }, + { + "epoch": 0.61, + "grad_norm": 0.5442585349082947, + "learning_rate": 0.0005402753505745131, + "loss": 3.3793, + "step": 12509 + }, + { + "epoch": 0.61, + "grad_norm": 0.5089293122291565, + "learning_rate": 0.0005402661314154901, + "loss": 3.3455, + "step": 12510 + }, + { + "epoch": 0.61, + "grad_norm": 0.5478642582893372, + "learning_rate": 0.0005402569116236525, + "loss": 3.1172, + "step": 12511 + }, + { + "epoch": 0.61, + "grad_norm": 0.4844317138195038, + "learning_rate": 0.0005402476911990242, + "loss": 3.3636, + "step": 12512 + }, + { + "epoch": 0.61, + "grad_norm": 0.5121679902076721, + "learning_rate": 0.0005402384701416298, + "loss": 3.1472, + "step": 12513 + }, + { + "epoch": 0.61, + "grad_norm": 0.4980253279209137, + "learning_rate": 0.0005402292484514935, + "loss": 3.3732, + "step": 12514 + }, + { + "epoch": 0.61, + "grad_norm": 0.49577051401138306, + "learning_rate": 0.0005402200261286396, + "loss": 3.2947, + "step": 12515 + }, + { + "epoch": 0.61, + "grad_norm": 0.5190374255180359, + "learning_rate": 0.0005402108031730923, + "loss": 3.0081, + "step": 12516 + }, + { + "epoch": 0.61, + "grad_norm": 0.5585235953330994, + "learning_rate": 0.0005402015795848762, + "loss": 3.2278, + "step": 12517 + }, + { + "epoch": 0.61, + "grad_norm": 0.5044002532958984, + "learning_rate": 0.0005401923553640152, + "loss": 3.397, + "step": 12518 + }, + { + "epoch": 0.61, + "grad_norm": 0.5685398578643799, + "learning_rate": 0.0005401831305105339, + "loss": 3.2287, + "step": 12519 + }, + { + "epoch": 0.61, + "grad_norm": 0.5105231404304504, + "learning_rate": 0.0005401739050244563, + "loss": 3.2115, + "step": 12520 + }, + { + "epoch": 0.61, + "grad_norm": 0.49602973461151123, + "learning_rate": 0.000540164678905807, + "loss": 3.2298, + "step": 12521 + }, + { + "epoch": 0.61, + "grad_norm": 0.5373128056526184, + "learning_rate": 0.00054015545215461, + "loss": 2.917, + "step": 12522 + }, + { + "epoch": 0.61, + "grad_norm": 0.5047098994255066, + "learning_rate": 0.0005401462247708899, + "loss": 3.4197, + "step": 12523 + }, + { + "epoch": 0.61, + "grad_norm": 0.5524264574050903, + "learning_rate": 0.0005401369967546708, + "loss": 3.0935, + "step": 12524 + }, + { + "epoch": 0.61, + "grad_norm": 0.5501293540000916, + "learning_rate": 0.000540127768105977, + "loss": 3.1648, + "step": 12525 + }, + { + "epoch": 0.61, + "grad_norm": 0.5153632760047913, + "learning_rate": 0.0005401185388248329, + "loss": 3.1659, + "step": 12526 + }, + { + "epoch": 0.61, + "grad_norm": 0.48313668370246887, + "learning_rate": 0.0005401093089112628, + "loss": 3.0783, + "step": 12527 + }, + { + "epoch": 0.61, + "grad_norm": 0.5056579113006592, + "learning_rate": 0.0005401000783652911, + "loss": 3.2297, + "step": 12528 + }, + { + "epoch": 0.61, + "grad_norm": 0.5305315256118774, + "learning_rate": 0.0005400908471869419, + "loss": 3.1893, + "step": 12529 + }, + { + "epoch": 0.61, + "grad_norm": 0.5503982901573181, + "learning_rate": 0.0005400816153762396, + "loss": 3.1965, + "step": 12530 + }, + { + "epoch": 0.61, + "grad_norm": 0.4872680902481079, + "learning_rate": 0.0005400723829332085, + "loss": 3.1287, + "step": 12531 + }, + { + "epoch": 0.61, + "grad_norm": 0.5678309798240662, + "learning_rate": 0.000540063149857873, + "loss": 3.1886, + "step": 12532 + }, + { + "epoch": 0.61, + "grad_norm": 0.4857582151889801, + "learning_rate": 0.0005400539161502574, + "loss": 3.208, + "step": 12533 + }, + { + "epoch": 0.61, + "grad_norm": 0.5303822755813599, + "learning_rate": 0.000540044681810386, + "loss": 3.339, + "step": 12534 + }, + { + "epoch": 0.61, + "grad_norm": 0.5294134616851807, + "learning_rate": 0.000540035446838283, + "loss": 3.2367, + "step": 12535 + }, + { + "epoch": 0.61, + "grad_norm": 0.49902549386024475, + "learning_rate": 0.0005400262112339728, + "loss": 3.2169, + "step": 12536 + }, + { + "epoch": 0.61, + "grad_norm": 0.4956739544868469, + "learning_rate": 0.0005400169749974798, + "loss": 3.1918, + "step": 12537 + }, + { + "epoch": 0.61, + "grad_norm": 0.5438934564590454, + "learning_rate": 0.0005400077381288284, + "loss": 3.1434, + "step": 12538 + }, + { + "epoch": 0.61, + "grad_norm": 0.4825858771800995, + "learning_rate": 0.0005399985006280427, + "loss": 3.0899, + "step": 12539 + }, + { + "epoch": 0.61, + "grad_norm": 0.7768726944923401, + "learning_rate": 0.0005399892624951472, + "loss": 3.2963, + "step": 12540 + }, + { + "epoch": 0.61, + "grad_norm": 0.5147074460983276, + "learning_rate": 0.0005399800237301661, + "loss": 3.1657, + "step": 12541 + }, + { + "epoch": 0.61, + "grad_norm": 0.5385922789573669, + "learning_rate": 0.0005399707843331238, + "loss": 3.1703, + "step": 12542 + }, + { + "epoch": 0.61, + "grad_norm": 0.5428267121315002, + "learning_rate": 0.0005399615443040447, + "loss": 3.0775, + "step": 12543 + }, + { + "epoch": 0.61, + "grad_norm": 0.5166685581207275, + "learning_rate": 0.000539952303642953, + "loss": 3.2899, + "step": 12544 + }, + { + "epoch": 0.61, + "grad_norm": 0.5144116282463074, + "learning_rate": 0.0005399430623498732, + "loss": 3.3296, + "step": 12545 + }, + { + "epoch": 0.61, + "grad_norm": 0.49154841899871826, + "learning_rate": 0.0005399338204248295, + "loss": 3.0634, + "step": 12546 + }, + { + "epoch": 0.61, + "grad_norm": 0.5445835590362549, + "learning_rate": 0.0005399245778678464, + "loss": 3.3533, + "step": 12547 + }, + { + "epoch": 0.61, + "grad_norm": 0.4968005418777466, + "learning_rate": 0.0005399153346789479, + "loss": 3.3812, + "step": 12548 + }, + { + "epoch": 0.61, + "grad_norm": 0.5119567513465881, + "learning_rate": 0.0005399060908581587, + "loss": 3.2191, + "step": 12549 + }, + { + "epoch": 0.62, + "grad_norm": 0.5225988626480103, + "learning_rate": 0.0005398968464055031, + "loss": 3.2854, + "step": 12550 + }, + { + "epoch": 0.62, + "grad_norm": 0.5341635942459106, + "learning_rate": 0.0005398876013210053, + "loss": 3.2697, + "step": 12551 + }, + { + "epoch": 0.62, + "grad_norm": 0.5326067805290222, + "learning_rate": 0.0005398783556046897, + "loss": 3.535, + "step": 12552 + }, + { + "epoch": 0.62, + "grad_norm": 0.5414754152297974, + "learning_rate": 0.0005398691092565808, + "loss": 3.3331, + "step": 12553 + }, + { + "epoch": 0.62, + "grad_norm": 0.5281270146369934, + "learning_rate": 0.0005398598622767027, + "loss": 3.454, + "step": 12554 + }, + { + "epoch": 0.62, + "grad_norm": 0.5076268911361694, + "learning_rate": 0.0005398506146650799, + "loss": 3.4165, + "step": 12555 + }, + { + "epoch": 0.62, + "grad_norm": 0.5204314589500427, + "learning_rate": 0.0005398413664217368, + "loss": 3.1195, + "step": 12556 + }, + { + "epoch": 0.62, + "grad_norm": 0.5105859041213989, + "learning_rate": 0.0005398321175466977, + "loss": 3.2604, + "step": 12557 + }, + { + "epoch": 0.62, + "grad_norm": 0.5473527908325195, + "learning_rate": 0.0005398228680399869, + "loss": 3.1417, + "step": 12558 + }, + { + "epoch": 0.62, + "grad_norm": 0.4897322356700897, + "learning_rate": 0.0005398136179016288, + "loss": 3.2681, + "step": 12559 + }, + { + "epoch": 0.62, + "grad_norm": 0.532136857509613, + "learning_rate": 0.000539804367131648, + "loss": 3.2423, + "step": 12560 + }, + { + "epoch": 0.62, + "grad_norm": 0.532255232334137, + "learning_rate": 0.0005397951157300684, + "loss": 3.3202, + "step": 12561 + }, + { + "epoch": 0.62, + "grad_norm": 0.531471312046051, + "learning_rate": 0.0005397858636969148, + "loss": 3.0657, + "step": 12562 + }, + { + "epoch": 0.62, + "grad_norm": 0.5550201535224915, + "learning_rate": 0.0005397766110322112, + "loss": 3.203, + "step": 12563 + }, + { + "epoch": 0.62, + "grad_norm": 0.519936740398407, + "learning_rate": 0.0005397673577359822, + "loss": 3.2974, + "step": 12564 + }, + { + "epoch": 0.62, + "grad_norm": 0.4963712990283966, + "learning_rate": 0.0005397581038082521, + "loss": 3.2021, + "step": 12565 + }, + { + "epoch": 0.62, + "grad_norm": 0.5781660676002502, + "learning_rate": 0.0005397488492490455, + "loss": 3.1871, + "step": 12566 + }, + { + "epoch": 0.62, + "grad_norm": 0.49295535683631897, + "learning_rate": 0.0005397395940583864, + "loss": 3.127, + "step": 12567 + }, + { + "epoch": 0.62, + "grad_norm": 0.5267174243927002, + "learning_rate": 0.0005397303382362994, + "loss": 2.8856, + "step": 12568 + }, + { + "epoch": 0.62, + "grad_norm": 0.5607599020004272, + "learning_rate": 0.0005397210817828088, + "loss": 3.1008, + "step": 12569 + }, + { + "epoch": 0.62, + "grad_norm": 0.5201016068458557, + "learning_rate": 0.000539711824697939, + "loss": 3.1056, + "step": 12570 + }, + { + "epoch": 0.62, + "grad_norm": 0.5651845932006836, + "learning_rate": 0.0005397025669817144, + "loss": 3.2542, + "step": 12571 + }, + { + "epoch": 0.62, + "grad_norm": 0.5444151163101196, + "learning_rate": 0.0005396933086341593, + "loss": 3.0831, + "step": 12572 + }, + { + "epoch": 0.62, + "grad_norm": 0.5270144939422607, + "learning_rate": 0.0005396840496552982, + "loss": 2.9548, + "step": 12573 + }, + { + "epoch": 0.62, + "grad_norm": 0.5137320756912231, + "learning_rate": 0.0005396747900451555, + "loss": 3.1978, + "step": 12574 + }, + { + "epoch": 0.62, + "grad_norm": 0.5519542694091797, + "learning_rate": 0.0005396655298037555, + "loss": 3.2202, + "step": 12575 + }, + { + "epoch": 0.62, + "grad_norm": 0.5144039392471313, + "learning_rate": 0.0005396562689311226, + "loss": 3.1693, + "step": 12576 + }, + { + "epoch": 0.62, + "grad_norm": 0.5353249907493591, + "learning_rate": 0.0005396470074272812, + "loss": 3.404, + "step": 12577 + }, + { + "epoch": 0.62, + "grad_norm": 0.5302972793579102, + "learning_rate": 0.0005396377452922558, + "loss": 3.4043, + "step": 12578 + }, + { + "epoch": 0.62, + "grad_norm": 0.4982793927192688, + "learning_rate": 0.0005396284825260705, + "loss": 3.2489, + "step": 12579 + }, + { + "epoch": 0.62, + "grad_norm": 0.5199756026268005, + "learning_rate": 0.0005396192191287502, + "loss": 3.1789, + "step": 12580 + }, + { + "epoch": 0.62, + "grad_norm": 0.5148780345916748, + "learning_rate": 0.0005396099551003187, + "loss": 3.2521, + "step": 12581 + }, + { + "epoch": 0.62, + "grad_norm": 0.5244265198707581, + "learning_rate": 0.0005396006904408009, + "loss": 3.3569, + "step": 12582 + }, + { + "epoch": 0.62, + "grad_norm": 0.5343596935272217, + "learning_rate": 0.0005395914251502208, + "loss": 3.3348, + "step": 12583 + }, + { + "epoch": 0.62, + "grad_norm": 0.514910101890564, + "learning_rate": 0.0005395821592286031, + "loss": 3.4142, + "step": 12584 + }, + { + "epoch": 0.62, + "grad_norm": 0.5111855864524841, + "learning_rate": 0.0005395728926759721, + "loss": 3.0777, + "step": 12585 + }, + { + "epoch": 0.62, + "grad_norm": 0.5080044269561768, + "learning_rate": 0.0005395636254923522, + "loss": 3.3565, + "step": 12586 + }, + { + "epoch": 0.62, + "grad_norm": 0.47200343012809753, + "learning_rate": 0.0005395543576777679, + "loss": 3.1417, + "step": 12587 + }, + { + "epoch": 0.62, + "grad_norm": 0.5408815741539001, + "learning_rate": 0.0005395450892322433, + "loss": 3.3413, + "step": 12588 + }, + { + "epoch": 0.62, + "grad_norm": 0.5425565242767334, + "learning_rate": 0.0005395358201558032, + "loss": 3.2498, + "step": 12589 + }, + { + "epoch": 0.62, + "grad_norm": 0.5002942681312561, + "learning_rate": 0.0005395265504484719, + "loss": 3.2378, + "step": 12590 + }, + { + "epoch": 0.62, + "grad_norm": 0.5082552433013916, + "learning_rate": 0.0005395172801102736, + "loss": 3.2024, + "step": 12591 + }, + { + "epoch": 0.62, + "grad_norm": 0.5224912166595459, + "learning_rate": 0.000539508009141233, + "loss": 3.215, + "step": 12592 + }, + { + "epoch": 0.62, + "grad_norm": 0.7015027403831482, + "learning_rate": 0.0005394987375413745, + "loss": 3.2016, + "step": 12593 + }, + { + "epoch": 0.62, + "grad_norm": 0.4700780510902405, + "learning_rate": 0.0005394894653107222, + "loss": 3.3461, + "step": 12594 + }, + { + "epoch": 0.62, + "grad_norm": 0.49604517221450806, + "learning_rate": 0.0005394801924493008, + "loss": 3.2246, + "step": 12595 + }, + { + "epoch": 0.62, + "grad_norm": 0.5033439993858337, + "learning_rate": 0.0005394709189571347, + "loss": 3.2227, + "step": 12596 + }, + { + "epoch": 0.62, + "grad_norm": 0.5192400217056274, + "learning_rate": 0.0005394616448342483, + "loss": 3.3658, + "step": 12597 + }, + { + "epoch": 0.62, + "grad_norm": 0.5239402055740356, + "learning_rate": 0.0005394523700806659, + "loss": 2.9431, + "step": 12598 + }, + { + "epoch": 0.62, + "grad_norm": 0.5193118453025818, + "learning_rate": 0.0005394430946964122, + "loss": 3.3063, + "step": 12599 + }, + { + "epoch": 0.62, + "grad_norm": 0.5492883324623108, + "learning_rate": 0.0005394338186815114, + "loss": 3.1588, + "step": 12600 + }, + { + "epoch": 0.62, + "grad_norm": 0.48858433961868286, + "learning_rate": 0.000539424542035988, + "loss": 3.3016, + "step": 12601 + }, + { + "epoch": 0.62, + "grad_norm": 0.5181791186332703, + "learning_rate": 0.0005394152647598664, + "loss": 3.2007, + "step": 12602 + }, + { + "epoch": 0.62, + "grad_norm": 0.5329108238220215, + "learning_rate": 0.0005394059868531711, + "loss": 3.1491, + "step": 12603 + }, + { + "epoch": 0.62, + "grad_norm": 0.5719881653785706, + "learning_rate": 0.0005393967083159266, + "loss": 3.2346, + "step": 12604 + }, + { + "epoch": 0.62, + "grad_norm": 0.509117841720581, + "learning_rate": 0.0005393874291481571, + "loss": 3.1653, + "step": 12605 + }, + { + "epoch": 0.62, + "grad_norm": 0.5386404395103455, + "learning_rate": 0.0005393781493498872, + "loss": 3.3633, + "step": 12606 + }, + { + "epoch": 0.62, + "grad_norm": 0.5088648200035095, + "learning_rate": 0.0005393688689211413, + "loss": 3.1949, + "step": 12607 + }, + { + "epoch": 0.62, + "grad_norm": 0.5584010481834412, + "learning_rate": 0.000539359587861944, + "loss": 3.4235, + "step": 12608 + }, + { + "epoch": 0.62, + "grad_norm": 0.6642793416976929, + "learning_rate": 0.0005393503061723196, + "loss": 3.3751, + "step": 12609 + }, + { + "epoch": 0.62, + "grad_norm": 0.5091199278831482, + "learning_rate": 0.0005393410238522924, + "loss": 3.1341, + "step": 12610 + }, + { + "epoch": 0.62, + "grad_norm": 0.5205773711204529, + "learning_rate": 0.0005393317409018871, + "loss": 3.2806, + "step": 12611 + }, + { + "epoch": 0.62, + "grad_norm": 0.5145056247711182, + "learning_rate": 0.0005393224573211281, + "loss": 3.3279, + "step": 12612 + }, + { + "epoch": 0.62, + "grad_norm": 0.5004215240478516, + "learning_rate": 0.0005393131731100398, + "loss": 3.3095, + "step": 12613 + }, + { + "epoch": 0.62, + "grad_norm": 0.5290515422821045, + "learning_rate": 0.0005393038882686466, + "loss": 3.2428, + "step": 12614 + }, + { + "epoch": 0.62, + "grad_norm": 0.5009429454803467, + "learning_rate": 0.000539294602796973, + "loss": 3.0593, + "step": 12615 + }, + { + "epoch": 0.62, + "grad_norm": 0.6031820178031921, + "learning_rate": 0.0005392853166950436, + "loss": 3.1133, + "step": 12616 + }, + { + "epoch": 0.62, + "grad_norm": 0.5044199824333191, + "learning_rate": 0.0005392760299628825, + "loss": 3.3295, + "step": 12617 + }, + { + "epoch": 0.62, + "grad_norm": 0.582173228263855, + "learning_rate": 0.0005392667426005146, + "loss": 3.172, + "step": 12618 + }, + { + "epoch": 0.62, + "grad_norm": 0.5244518518447876, + "learning_rate": 0.000539257454607964, + "loss": 3.4616, + "step": 12619 + }, + { + "epoch": 0.62, + "grad_norm": 0.5106006264686584, + "learning_rate": 0.0005392481659852554, + "loss": 3.453, + "step": 12620 + }, + { + "epoch": 0.62, + "grad_norm": 0.5275260806083679, + "learning_rate": 0.0005392388767324132, + "loss": 3.2022, + "step": 12621 + }, + { + "epoch": 0.62, + "grad_norm": 0.6484015583992004, + "learning_rate": 0.0005392295868494617, + "loss": 3.1695, + "step": 12622 + }, + { + "epoch": 0.62, + "grad_norm": 0.5001370310783386, + "learning_rate": 0.0005392202963364256, + "loss": 3.1173, + "step": 12623 + }, + { + "epoch": 0.62, + "grad_norm": 0.5128294229507446, + "learning_rate": 0.0005392110051933293, + "loss": 3.3133, + "step": 12624 + }, + { + "epoch": 0.62, + "grad_norm": 0.48935312032699585, + "learning_rate": 0.0005392017134201973, + "loss": 3.2482, + "step": 12625 + }, + { + "epoch": 0.62, + "grad_norm": 0.546270489692688, + "learning_rate": 0.0005391924210170539, + "loss": 3.2341, + "step": 12626 + }, + { + "epoch": 0.62, + "grad_norm": 0.5206765532493591, + "learning_rate": 0.0005391831279839237, + "loss": 3.1757, + "step": 12627 + }, + { + "epoch": 0.62, + "grad_norm": 0.5980457067489624, + "learning_rate": 0.0005391738343208313, + "loss": 3.1083, + "step": 12628 + }, + { + "epoch": 0.62, + "grad_norm": 0.5202022194862366, + "learning_rate": 0.0005391645400278009, + "loss": 3.256, + "step": 12629 + }, + { + "epoch": 0.62, + "grad_norm": 0.5010168552398682, + "learning_rate": 0.0005391552451048571, + "loss": 3.1127, + "step": 12630 + }, + { + "epoch": 0.62, + "grad_norm": 0.5590848326683044, + "learning_rate": 0.0005391459495520247, + "loss": 2.9533, + "step": 12631 + }, + { + "epoch": 0.62, + "grad_norm": 0.49834656715393066, + "learning_rate": 0.0005391366533693276, + "loss": 3.1844, + "step": 12632 + }, + { + "epoch": 0.62, + "grad_norm": 0.5144838690757751, + "learning_rate": 0.0005391273565567906, + "loss": 3.1728, + "step": 12633 + }, + { + "epoch": 0.62, + "grad_norm": 0.5475999712944031, + "learning_rate": 0.0005391180591144383, + "loss": 3.3422, + "step": 12634 + }, + { + "epoch": 0.62, + "grad_norm": 0.5091504454612732, + "learning_rate": 0.000539108761042295, + "loss": 3.1376, + "step": 12635 + }, + { + "epoch": 0.62, + "grad_norm": 0.5275076031684875, + "learning_rate": 0.0005390994623403853, + "loss": 3.3421, + "step": 12636 + }, + { + "epoch": 0.62, + "grad_norm": 0.530569851398468, + "learning_rate": 0.0005390901630087336, + "loss": 3.1941, + "step": 12637 + }, + { + "epoch": 0.62, + "grad_norm": 0.5221152901649475, + "learning_rate": 0.0005390808630473643, + "loss": 3.2616, + "step": 12638 + }, + { + "epoch": 0.62, + "grad_norm": 0.5298863649368286, + "learning_rate": 0.0005390715624563021, + "loss": 3.2261, + "step": 12639 + }, + { + "epoch": 0.62, + "grad_norm": 0.513576865196228, + "learning_rate": 0.0005390622612355715, + "loss": 3.2288, + "step": 12640 + }, + { + "epoch": 0.62, + "grad_norm": 0.4933810234069824, + "learning_rate": 0.0005390529593851969, + "loss": 3.261, + "step": 12641 + }, + { + "epoch": 0.62, + "grad_norm": 0.5063499808311462, + "learning_rate": 0.0005390436569052027, + "loss": 3.1588, + "step": 12642 + }, + { + "epoch": 0.62, + "grad_norm": 0.4862244725227356, + "learning_rate": 0.0005390343537956135, + "loss": 3.3587, + "step": 12643 + }, + { + "epoch": 0.62, + "grad_norm": 0.49645307660102844, + "learning_rate": 0.0005390250500564539, + "loss": 3.0493, + "step": 12644 + }, + { + "epoch": 0.62, + "grad_norm": 0.506386399269104, + "learning_rate": 0.0005390157456877483, + "loss": 3.3449, + "step": 12645 + }, + { + "epoch": 0.62, + "grad_norm": 0.5540480613708496, + "learning_rate": 0.0005390064406895212, + "loss": 3.3795, + "step": 12646 + }, + { + "epoch": 0.62, + "grad_norm": 0.5100905895233154, + "learning_rate": 0.0005389971350617972, + "loss": 3.2195, + "step": 12647 + }, + { + "epoch": 0.62, + "grad_norm": 0.5141370296478271, + "learning_rate": 0.0005389878288046007, + "loss": 3.3391, + "step": 12648 + }, + { + "epoch": 0.62, + "grad_norm": 0.49485722184181213, + "learning_rate": 0.0005389785219179562, + "loss": 3.0948, + "step": 12649 + }, + { + "epoch": 0.62, + "grad_norm": 0.5505667328834534, + "learning_rate": 0.0005389692144018883, + "loss": 3.1628, + "step": 12650 + }, + { + "epoch": 0.62, + "grad_norm": 0.5638545751571655, + "learning_rate": 0.0005389599062564216, + "loss": 3.0784, + "step": 12651 + }, + { + "epoch": 0.62, + "grad_norm": 0.5764116048812866, + "learning_rate": 0.0005389505974815803, + "loss": 3.0416, + "step": 12652 + }, + { + "epoch": 0.62, + "grad_norm": 0.516578197479248, + "learning_rate": 0.0005389412880773891, + "loss": 3.4401, + "step": 12653 + }, + { + "epoch": 0.62, + "grad_norm": 0.5281229019165039, + "learning_rate": 0.0005389319780438728, + "loss": 3.1235, + "step": 12654 + }, + { + "epoch": 0.62, + "grad_norm": 0.5611839294433594, + "learning_rate": 0.0005389226673810554, + "loss": 3.2922, + "step": 12655 + }, + { + "epoch": 0.62, + "grad_norm": 0.5751029849052429, + "learning_rate": 0.0005389133560889617, + "loss": 3.0524, + "step": 12656 + }, + { + "epoch": 0.62, + "grad_norm": 0.5245303511619568, + "learning_rate": 0.0005389040441676164, + "loss": 3.2482, + "step": 12657 + }, + { + "epoch": 0.62, + "grad_norm": 0.5082685351371765, + "learning_rate": 0.0005388947316170437, + "loss": 3.2612, + "step": 12658 + }, + { + "epoch": 0.62, + "grad_norm": 0.5107821822166443, + "learning_rate": 0.0005388854184372682, + "loss": 3.3678, + "step": 12659 + }, + { + "epoch": 0.62, + "grad_norm": 0.5098000168800354, + "learning_rate": 0.0005388761046283146, + "loss": 3.1828, + "step": 12660 + }, + { + "epoch": 0.62, + "grad_norm": 0.506803035736084, + "learning_rate": 0.0005388667901902071, + "loss": 3.3027, + "step": 12661 + }, + { + "epoch": 0.62, + "grad_norm": 0.5258462429046631, + "learning_rate": 0.0005388574751229707, + "loss": 3.2059, + "step": 12662 + }, + { + "epoch": 0.62, + "grad_norm": 0.5080270767211914, + "learning_rate": 0.0005388481594266295, + "loss": 3.2258, + "step": 12663 + }, + { + "epoch": 0.62, + "grad_norm": 0.5246100425720215, + "learning_rate": 0.0005388388431012083, + "loss": 3.2327, + "step": 12664 + }, + { + "epoch": 0.62, + "grad_norm": 0.5078802108764648, + "learning_rate": 0.0005388295261467315, + "loss": 3.1975, + "step": 12665 + }, + { + "epoch": 0.62, + "grad_norm": 0.5111520290374756, + "learning_rate": 0.0005388202085632237, + "loss": 3.2364, + "step": 12666 + }, + { + "epoch": 0.62, + "grad_norm": 0.5226684212684631, + "learning_rate": 0.0005388108903507095, + "loss": 3.118, + "step": 12667 + }, + { + "epoch": 0.62, + "grad_norm": 0.5708491802215576, + "learning_rate": 0.0005388015715092133, + "loss": 3.3362, + "step": 12668 + }, + { + "epoch": 0.62, + "grad_norm": 0.5695480108261108, + "learning_rate": 0.0005387922520387597, + "loss": 3.2954, + "step": 12669 + }, + { + "epoch": 0.62, + "grad_norm": 0.4956531822681427, + "learning_rate": 0.0005387829319393735, + "loss": 3.1396, + "step": 12670 + }, + { + "epoch": 0.62, + "grad_norm": 0.5610358715057373, + "learning_rate": 0.0005387736112110787, + "loss": 3.1429, + "step": 12671 + }, + { + "epoch": 0.62, + "grad_norm": 0.5279606580734253, + "learning_rate": 0.0005387642898539004, + "loss": 3.1699, + "step": 12672 + }, + { + "epoch": 0.62, + "grad_norm": 0.5159085988998413, + "learning_rate": 0.0005387549678678627, + "loss": 3.1701, + "step": 12673 + }, + { + "epoch": 0.62, + "grad_norm": 0.5232102870941162, + "learning_rate": 0.0005387456452529904, + "loss": 3.2938, + "step": 12674 + }, + { + "epoch": 0.62, + "grad_norm": 0.5625350475311279, + "learning_rate": 0.0005387363220093082, + "loss": 3.1154, + "step": 12675 + }, + { + "epoch": 0.62, + "grad_norm": 0.524451732635498, + "learning_rate": 0.0005387269981368403, + "loss": 3.2826, + "step": 12676 + }, + { + "epoch": 0.62, + "grad_norm": 0.5030291080474854, + "learning_rate": 0.0005387176736356116, + "loss": 3.0269, + "step": 12677 + }, + { + "epoch": 0.62, + "grad_norm": 0.5304322242736816, + "learning_rate": 0.0005387083485056463, + "loss": 3.3728, + "step": 12678 + }, + { + "epoch": 0.62, + "grad_norm": 0.5527458786964417, + "learning_rate": 0.0005386990227469693, + "loss": 3.2158, + "step": 12679 + }, + { + "epoch": 0.62, + "grad_norm": 0.5265601277351379, + "learning_rate": 0.0005386896963596049, + "loss": 3.3758, + "step": 12680 + }, + { + "epoch": 0.62, + "grad_norm": 0.5212921500205994, + "learning_rate": 0.0005386803693435778, + "loss": 3.3079, + "step": 12681 + }, + { + "epoch": 0.62, + "grad_norm": 0.5151424407958984, + "learning_rate": 0.0005386710416989126, + "loss": 3.2643, + "step": 12682 + }, + { + "epoch": 0.62, + "grad_norm": 0.5002523064613342, + "learning_rate": 0.0005386617134256337, + "loss": 3.2302, + "step": 12683 + }, + { + "epoch": 0.62, + "grad_norm": 0.5380643010139465, + "learning_rate": 0.0005386523845237659, + "loss": 3.1046, + "step": 12684 + }, + { + "epoch": 0.62, + "grad_norm": 0.5118758082389832, + "learning_rate": 0.0005386430549933336, + "loss": 3.192, + "step": 12685 + }, + { + "epoch": 0.62, + "grad_norm": 0.5095431804656982, + "learning_rate": 0.0005386337248343613, + "loss": 3.3801, + "step": 12686 + }, + { + "epoch": 0.62, + "grad_norm": 0.5566089749336243, + "learning_rate": 0.0005386243940468738, + "loss": 3.0442, + "step": 12687 + }, + { + "epoch": 0.62, + "grad_norm": 0.4650208055973053, + "learning_rate": 0.0005386150626308957, + "loss": 3.0336, + "step": 12688 + }, + { + "epoch": 0.62, + "grad_norm": 0.5460222959518433, + "learning_rate": 0.0005386057305864513, + "loss": 3.206, + "step": 12689 + }, + { + "epoch": 0.62, + "grad_norm": 0.5706539154052734, + "learning_rate": 0.0005385963979135653, + "loss": 3.3201, + "step": 12690 + }, + { + "epoch": 0.62, + "grad_norm": 0.5206764936447144, + "learning_rate": 0.0005385870646122624, + "loss": 3.2854, + "step": 12691 + }, + { + "epoch": 0.62, + "grad_norm": 0.4984837770462036, + "learning_rate": 0.000538577730682567, + "loss": 3.2708, + "step": 12692 + }, + { + "epoch": 0.62, + "grad_norm": 0.5224167108535767, + "learning_rate": 0.0005385683961245039, + "loss": 3.2557, + "step": 12693 + }, + { + "epoch": 0.62, + "grad_norm": 0.5294681787490845, + "learning_rate": 0.0005385590609380974, + "loss": 3.1294, + "step": 12694 + }, + { + "epoch": 0.62, + "grad_norm": 0.544699490070343, + "learning_rate": 0.0005385497251233724, + "loss": 3.3876, + "step": 12695 + }, + { + "epoch": 0.62, + "grad_norm": 0.5481709241867065, + "learning_rate": 0.0005385403886803532, + "loss": 3.2215, + "step": 12696 + }, + { + "epoch": 0.62, + "grad_norm": 0.4763781428337097, + "learning_rate": 0.0005385310516090646, + "loss": 3.4355, + "step": 12697 + }, + { + "epoch": 0.62, + "grad_norm": 0.48930293321609497, + "learning_rate": 0.0005385217139095311, + "loss": 3.1579, + "step": 12698 + }, + { + "epoch": 0.62, + "grad_norm": 0.48442885279655457, + "learning_rate": 0.0005385123755817773, + "loss": 3.3198, + "step": 12699 + }, + { + "epoch": 0.62, + "grad_norm": 0.551783561706543, + "learning_rate": 0.0005385030366258278, + "loss": 3.3245, + "step": 12700 + }, + { + "epoch": 0.62, + "grad_norm": 0.4957975149154663, + "learning_rate": 0.0005384936970417073, + "loss": 3.1174, + "step": 12701 + }, + { + "epoch": 0.62, + "grad_norm": 0.5431187152862549, + "learning_rate": 0.0005384843568294401, + "loss": 3.0538, + "step": 12702 + }, + { + "epoch": 0.62, + "grad_norm": 0.5256251096725464, + "learning_rate": 0.0005384750159890512, + "loss": 3.1746, + "step": 12703 + }, + { + "epoch": 0.62, + "grad_norm": 0.5222598910331726, + "learning_rate": 0.0005384656745205649, + "loss": 3.4051, + "step": 12704 + }, + { + "epoch": 0.62, + "grad_norm": 0.5458401441574097, + "learning_rate": 0.000538456332424006, + "loss": 3.1197, + "step": 12705 + }, + { + "epoch": 0.62, + "grad_norm": 0.4959386885166168, + "learning_rate": 0.0005384469896993989, + "loss": 3.2583, + "step": 12706 + }, + { + "epoch": 0.62, + "grad_norm": 0.5014752745628357, + "learning_rate": 0.0005384376463467683, + "loss": 3.1556, + "step": 12707 + }, + { + "epoch": 0.62, + "grad_norm": 0.491812527179718, + "learning_rate": 0.0005384283023661389, + "loss": 3.2109, + "step": 12708 + }, + { + "epoch": 0.62, + "grad_norm": 0.5517925024032593, + "learning_rate": 0.0005384189577575352, + "loss": 3.2206, + "step": 12709 + }, + { + "epoch": 0.62, + "grad_norm": 0.504330039024353, + "learning_rate": 0.000538409612520982, + "loss": 3.1117, + "step": 12710 + }, + { + "epoch": 0.62, + "grad_norm": 0.5130345225334167, + "learning_rate": 0.0005384002666565036, + "loss": 3.1259, + "step": 12711 + }, + { + "epoch": 0.62, + "grad_norm": 0.511404275894165, + "learning_rate": 0.0005383909201641247, + "loss": 3.2433, + "step": 12712 + }, + { + "epoch": 0.62, + "grad_norm": 0.5336930155754089, + "learning_rate": 0.0005383815730438702, + "loss": 3.3069, + "step": 12713 + }, + { + "epoch": 0.62, + "grad_norm": 0.5323059558868408, + "learning_rate": 0.0005383722252957644, + "loss": 3.0323, + "step": 12714 + }, + { + "epoch": 0.62, + "grad_norm": 0.540348470211029, + "learning_rate": 0.000538362876919832, + "loss": 2.9597, + "step": 12715 + }, + { + "epoch": 0.62, + "grad_norm": 0.5608701109886169, + "learning_rate": 0.0005383535279160978, + "loss": 3.2359, + "step": 12716 + }, + { + "epoch": 0.62, + "grad_norm": 0.5046235918998718, + "learning_rate": 0.0005383441782845863, + "loss": 3.0872, + "step": 12717 + }, + { + "epoch": 0.62, + "grad_norm": 0.5153566002845764, + "learning_rate": 0.0005383348280253219, + "loss": 3.1468, + "step": 12718 + }, + { + "epoch": 0.62, + "grad_norm": 0.521328866481781, + "learning_rate": 0.0005383254771383296, + "loss": 3.3589, + "step": 12719 + }, + { + "epoch": 0.62, + "grad_norm": 0.505582332611084, + "learning_rate": 0.0005383161256236337, + "loss": 3.3211, + "step": 12720 + }, + { + "epoch": 0.62, + "grad_norm": 0.5521395206451416, + "learning_rate": 0.0005383067734812592, + "loss": 3.3091, + "step": 12721 + }, + { + "epoch": 0.62, + "grad_norm": 0.5566513538360596, + "learning_rate": 0.0005382974207112304, + "loss": 3.1199, + "step": 12722 + }, + { + "epoch": 0.62, + "grad_norm": 0.5230540633201599, + "learning_rate": 0.000538288067313572, + "loss": 3.3551, + "step": 12723 + }, + { + "epoch": 0.62, + "grad_norm": 0.5009475350379944, + "learning_rate": 0.0005382787132883087, + "loss": 3.4286, + "step": 12724 + }, + { + "epoch": 0.62, + "grad_norm": 0.5285469889640808, + "learning_rate": 0.0005382693586354653, + "loss": 3.1925, + "step": 12725 + }, + { + "epoch": 0.62, + "grad_norm": 0.5307457447052002, + "learning_rate": 0.000538260003355066, + "loss": 3.2882, + "step": 12726 + }, + { + "epoch": 0.62, + "grad_norm": 0.5530621409416199, + "learning_rate": 0.0005382506474471359, + "loss": 3.1743, + "step": 12727 + }, + { + "epoch": 0.62, + "grad_norm": 0.5210942625999451, + "learning_rate": 0.0005382412909116993, + "loss": 3.0778, + "step": 12728 + }, + { + "epoch": 0.62, + "grad_norm": 0.4954610764980316, + "learning_rate": 0.0005382319337487812, + "loss": 3.3454, + "step": 12729 + }, + { + "epoch": 0.62, + "grad_norm": 0.5154182314872742, + "learning_rate": 0.0005382225759584058, + "loss": 3.3812, + "step": 12730 + }, + { + "epoch": 0.62, + "grad_norm": 0.5248116850852966, + "learning_rate": 0.0005382132175405982, + "loss": 3.4016, + "step": 12731 + }, + { + "epoch": 0.62, + "grad_norm": 0.524318277835846, + "learning_rate": 0.0005382038584953828, + "loss": 3.1419, + "step": 12732 + }, + { + "epoch": 0.62, + "grad_norm": 0.5205380916595459, + "learning_rate": 0.0005381944988227842, + "loss": 3.2683, + "step": 12733 + }, + { + "epoch": 0.62, + "grad_norm": 0.5063703060150146, + "learning_rate": 0.000538185138522827, + "loss": 3.3365, + "step": 12734 + }, + { + "epoch": 0.62, + "grad_norm": 0.5397539734840393, + "learning_rate": 0.0005381757775955362, + "loss": 3.23, + "step": 12735 + }, + { + "epoch": 0.62, + "grad_norm": 0.5201455950737, + "learning_rate": 0.0005381664160409362, + "loss": 3.2598, + "step": 12736 + }, + { + "epoch": 0.62, + "grad_norm": 0.5498931407928467, + "learning_rate": 0.0005381570538590517, + "loss": 3.2865, + "step": 12737 + }, + { + "epoch": 0.62, + "grad_norm": 0.4814511239528656, + "learning_rate": 0.0005381476910499073, + "loss": 3.4717, + "step": 12738 + }, + { + "epoch": 0.62, + "grad_norm": 0.49921301007270813, + "learning_rate": 0.0005381383276135277, + "loss": 3.3063, + "step": 12739 + }, + { + "epoch": 0.62, + "grad_norm": 0.5252735614776611, + "learning_rate": 0.0005381289635499376, + "loss": 3.1865, + "step": 12740 + }, + { + "epoch": 0.62, + "grad_norm": 0.5268913507461548, + "learning_rate": 0.0005381195988591617, + "loss": 3.1435, + "step": 12741 + }, + { + "epoch": 0.62, + "grad_norm": 0.4931754171848297, + "learning_rate": 0.0005381102335412245, + "loss": 3.4524, + "step": 12742 + }, + { + "epoch": 0.62, + "grad_norm": 0.5258716940879822, + "learning_rate": 0.0005381008675961509, + "loss": 3.3626, + "step": 12743 + }, + { + "epoch": 0.62, + "grad_norm": 0.5764732360839844, + "learning_rate": 0.0005380915010239654, + "loss": 3.3538, + "step": 12744 + }, + { + "epoch": 0.62, + "grad_norm": 0.49837490916252136, + "learning_rate": 0.0005380821338246926, + "loss": 3.1224, + "step": 12745 + }, + { + "epoch": 0.62, + "grad_norm": 0.5089791417121887, + "learning_rate": 0.0005380727659983573, + "loss": 3.1517, + "step": 12746 + }, + { + "epoch": 0.62, + "grad_norm": 0.5005397796630859, + "learning_rate": 0.0005380633975449842, + "loss": 3.1022, + "step": 12747 + }, + { + "epoch": 0.62, + "grad_norm": 0.5430983901023865, + "learning_rate": 0.0005380540284645979, + "loss": 3.3593, + "step": 12748 + }, + { + "epoch": 0.62, + "grad_norm": 0.5346560478210449, + "learning_rate": 0.0005380446587572231, + "loss": 3.2986, + "step": 12749 + }, + { + "epoch": 0.62, + "grad_norm": 0.5033416152000427, + "learning_rate": 0.0005380352884228846, + "loss": 3.4088, + "step": 12750 + }, + { + "epoch": 0.62, + "grad_norm": 0.49493205547332764, + "learning_rate": 0.0005380259174616068, + "loss": 3.3954, + "step": 12751 + }, + { + "epoch": 0.62, + "grad_norm": 0.49495062232017517, + "learning_rate": 0.0005380165458734147, + "loss": 3.3133, + "step": 12752 + }, + { + "epoch": 0.62, + "grad_norm": 0.5404918193817139, + "learning_rate": 0.0005380071736583327, + "loss": 3.1616, + "step": 12753 + }, + { + "epoch": 0.63, + "grad_norm": 0.5051382184028625, + "learning_rate": 0.0005379978008163857, + "loss": 3.103, + "step": 12754 + }, + { + "epoch": 0.63, + "grad_norm": 0.5010060667991638, + "learning_rate": 0.0005379884273475982, + "loss": 3.4863, + "step": 12755 + }, + { + "epoch": 0.63, + "grad_norm": 0.4935106933116913, + "learning_rate": 0.0005379790532519951, + "loss": 3.2361, + "step": 12756 + }, + { + "epoch": 0.63, + "grad_norm": 0.5263615846633911, + "learning_rate": 0.0005379696785296008, + "loss": 3.1684, + "step": 12757 + }, + { + "epoch": 0.63, + "grad_norm": 0.5205957293510437, + "learning_rate": 0.0005379603031804404, + "loss": 3.1844, + "step": 12758 + }, + { + "epoch": 0.63, + "grad_norm": 0.5138325095176697, + "learning_rate": 0.0005379509272045381, + "loss": 3.2793, + "step": 12759 + }, + { + "epoch": 0.63, + "grad_norm": 0.5030960440635681, + "learning_rate": 0.000537941550601919, + "loss": 3.3753, + "step": 12760 + }, + { + "epoch": 0.63, + "grad_norm": 0.49174514412879944, + "learning_rate": 0.0005379321733726077, + "loss": 3.2894, + "step": 12761 + }, + { + "epoch": 0.63, + "grad_norm": 0.5075502395629883, + "learning_rate": 0.0005379227955166287, + "loss": 3.1492, + "step": 12762 + }, + { + "epoch": 0.63, + "grad_norm": 0.47649258375167847, + "learning_rate": 0.0005379134170340069, + "loss": 3.2046, + "step": 12763 + }, + { + "epoch": 0.63, + "grad_norm": 0.5589198470115662, + "learning_rate": 0.000537904037924767, + "loss": 3.3309, + "step": 12764 + }, + { + "epoch": 0.63, + "grad_norm": 0.47373253107070923, + "learning_rate": 0.0005378946581889336, + "loss": 3.1731, + "step": 12765 + }, + { + "epoch": 0.63, + "grad_norm": 0.5284726023674011, + "learning_rate": 0.0005378852778265315, + "loss": 3.2825, + "step": 12766 + }, + { + "epoch": 0.63, + "grad_norm": 0.5329920053482056, + "learning_rate": 0.0005378758968375854, + "loss": 3.1192, + "step": 12767 + }, + { + "epoch": 0.63, + "grad_norm": 0.5024005770683289, + "learning_rate": 0.0005378665152221198, + "loss": 3.2978, + "step": 12768 + }, + { + "epoch": 0.63, + "grad_norm": 0.49619409441947937, + "learning_rate": 0.0005378571329801596, + "loss": 3.3436, + "step": 12769 + }, + { + "epoch": 0.63, + "grad_norm": 0.5017001628875732, + "learning_rate": 0.0005378477501117296, + "loss": 3.3643, + "step": 12770 + }, + { + "epoch": 0.63, + "grad_norm": 0.5197016596794128, + "learning_rate": 0.0005378383666168545, + "loss": 3.1311, + "step": 12771 + }, + { + "epoch": 0.63, + "grad_norm": 0.5079033374786377, + "learning_rate": 0.0005378289824955587, + "loss": 3.3026, + "step": 12772 + }, + { + "epoch": 0.63, + "grad_norm": 0.4886915981769562, + "learning_rate": 0.0005378195977478672, + "loss": 3.2003, + "step": 12773 + }, + { + "epoch": 0.63, + "grad_norm": 0.51219242811203, + "learning_rate": 0.0005378102123738046, + "loss": 3.3336, + "step": 12774 + }, + { + "epoch": 0.63, + "grad_norm": 0.5035862326622009, + "learning_rate": 0.0005378008263733959, + "loss": 3.1417, + "step": 12775 + }, + { + "epoch": 0.63, + "grad_norm": 0.5228604078292847, + "learning_rate": 0.0005377914397466653, + "loss": 3.2611, + "step": 12776 + }, + { + "epoch": 0.63, + "grad_norm": 0.5679159760475159, + "learning_rate": 0.000537782052493638, + "loss": 3.4281, + "step": 12777 + }, + { + "epoch": 0.63, + "grad_norm": 0.5072061419487, + "learning_rate": 0.0005377726646143384, + "loss": 3.1225, + "step": 12778 + }, + { + "epoch": 0.63, + "grad_norm": 0.4947572350502014, + "learning_rate": 0.0005377632761087915, + "loss": 3.2278, + "step": 12779 + }, + { + "epoch": 0.63, + "grad_norm": 0.5147088170051575, + "learning_rate": 0.0005377538869770218, + "loss": 3.0943, + "step": 12780 + }, + { + "epoch": 0.63, + "grad_norm": 0.4995141923427582, + "learning_rate": 0.0005377444972190541, + "loss": 3.4808, + "step": 12781 + }, + { + "epoch": 0.63, + "grad_norm": 0.5687293410301208, + "learning_rate": 0.0005377351068349132, + "loss": 3.3684, + "step": 12782 + }, + { + "epoch": 0.63, + "grad_norm": 0.4961227476596832, + "learning_rate": 0.0005377257158246237, + "loss": 3.145, + "step": 12783 + }, + { + "epoch": 0.63, + "grad_norm": 0.5262210965156555, + "learning_rate": 0.0005377163241882105, + "loss": 3.2739, + "step": 12784 + }, + { + "epoch": 0.63, + "grad_norm": 0.47961992025375366, + "learning_rate": 0.0005377069319256983, + "loss": 3.0336, + "step": 12785 + }, + { + "epoch": 0.63, + "grad_norm": 0.5090808272361755, + "learning_rate": 0.0005376975390371116, + "loss": 3.2362, + "step": 12786 + }, + { + "epoch": 0.63, + "grad_norm": 0.5485349297523499, + "learning_rate": 0.0005376881455224755, + "loss": 2.8677, + "step": 12787 + }, + { + "epoch": 0.63, + "grad_norm": 0.5236116647720337, + "learning_rate": 0.0005376787513818145, + "loss": 3.3714, + "step": 12788 + }, + { + "epoch": 0.63, + "grad_norm": 0.5215224623680115, + "learning_rate": 0.0005376693566151535, + "loss": 3.1176, + "step": 12789 + }, + { + "epoch": 0.63, + "grad_norm": 1.1414709091186523, + "learning_rate": 0.0005376599612225171, + "loss": 3.4685, + "step": 12790 + }, + { + "epoch": 0.63, + "grad_norm": 0.5067881345748901, + "learning_rate": 0.00053765056520393, + "loss": 3.2304, + "step": 12791 + }, + { + "epoch": 0.63, + "grad_norm": 0.5385640263557434, + "learning_rate": 0.0005376411685594171, + "loss": 3.2995, + "step": 12792 + }, + { + "epoch": 0.63, + "grad_norm": 0.6865431666374207, + "learning_rate": 0.0005376317712890032, + "loss": 3.2136, + "step": 12793 + }, + { + "epoch": 0.63, + "grad_norm": 0.5289443135261536, + "learning_rate": 0.0005376223733927129, + "loss": 3.317, + "step": 12794 + }, + { + "epoch": 0.63, + "grad_norm": 0.5437031984329224, + "learning_rate": 0.0005376129748705709, + "loss": 3.1993, + "step": 12795 + }, + { + "epoch": 0.63, + "grad_norm": 0.53926020860672, + "learning_rate": 0.0005376035757226022, + "loss": 3.2593, + "step": 12796 + }, + { + "epoch": 0.63, + "grad_norm": 0.6048049330711365, + "learning_rate": 0.0005375941759488313, + "loss": 3.2746, + "step": 12797 + }, + { + "epoch": 0.63, + "grad_norm": 0.5292719602584839, + "learning_rate": 0.0005375847755492831, + "loss": 3.1617, + "step": 12798 + }, + { + "epoch": 0.63, + "grad_norm": 0.5354344844818115, + "learning_rate": 0.0005375753745239823, + "loss": 3.1443, + "step": 12799 + }, + { + "epoch": 0.63, + "grad_norm": 0.5445627570152283, + "learning_rate": 0.0005375659728729537, + "loss": 3.2476, + "step": 12800 + }, + { + "epoch": 0.63, + "grad_norm": 0.5120712518692017, + "learning_rate": 0.0005375565705962222, + "loss": 3.1865, + "step": 12801 + }, + { + "epoch": 0.63, + "grad_norm": 0.5278573036193848, + "learning_rate": 0.0005375471676938123, + "loss": 3.3421, + "step": 12802 + }, + { + "epoch": 0.63, + "grad_norm": 0.4721059501171112, + "learning_rate": 0.0005375377641657487, + "loss": 3.3415, + "step": 12803 + }, + { + "epoch": 0.63, + "grad_norm": 0.5143515467643738, + "learning_rate": 0.0005375283600120565, + "loss": 3.4199, + "step": 12804 + }, + { + "epoch": 0.63, + "grad_norm": 0.5790268778800964, + "learning_rate": 0.0005375189552327604, + "loss": 3.0928, + "step": 12805 + }, + { + "epoch": 0.63, + "grad_norm": 0.5175043940544128, + "learning_rate": 0.000537509549827885, + "loss": 3.5718, + "step": 12806 + }, + { + "epoch": 0.63, + "grad_norm": 0.5538207292556763, + "learning_rate": 0.0005375001437974552, + "loss": 3.1262, + "step": 12807 + }, + { + "epoch": 0.63, + "grad_norm": 0.49472522735595703, + "learning_rate": 0.0005374907371414956, + "loss": 3.3913, + "step": 12808 + }, + { + "epoch": 0.63, + "grad_norm": 0.5046636462211609, + "learning_rate": 0.0005374813298600312, + "loss": 3.2723, + "step": 12809 + }, + { + "epoch": 0.63, + "grad_norm": 0.538169264793396, + "learning_rate": 0.0005374719219530867, + "loss": 3.3067, + "step": 12810 + }, + { + "epoch": 0.63, + "grad_norm": 0.5092310309410095, + "learning_rate": 0.0005374625134206868, + "loss": 3.0894, + "step": 12811 + }, + { + "epoch": 0.63, + "grad_norm": 0.5437710881233215, + "learning_rate": 0.0005374531042628564, + "loss": 3.0988, + "step": 12812 + }, + { + "epoch": 0.63, + "grad_norm": 0.5265883207321167, + "learning_rate": 0.0005374436944796202, + "loss": 3.1499, + "step": 12813 + }, + { + "epoch": 0.63, + "grad_norm": 0.5095208287239075, + "learning_rate": 0.0005374342840710029, + "loss": 3.3266, + "step": 12814 + }, + { + "epoch": 0.63, + "grad_norm": 0.5680007338523865, + "learning_rate": 0.0005374248730370295, + "loss": 3.3421, + "step": 12815 + }, + { + "epoch": 0.63, + "grad_norm": 0.5302740931510925, + "learning_rate": 0.0005374154613777246, + "loss": 3.314, + "step": 12816 + }, + { + "epoch": 0.63, + "grad_norm": 0.5018924474716187, + "learning_rate": 0.0005374060490931132, + "loss": 3.4018, + "step": 12817 + }, + { + "epoch": 0.63, + "grad_norm": 0.5072476267814636, + "learning_rate": 0.0005373966361832199, + "loss": 3.1629, + "step": 12818 + }, + { + "epoch": 0.63, + "grad_norm": 0.5976406931877136, + "learning_rate": 0.0005373872226480695, + "loss": 3.2877, + "step": 12819 + }, + { + "epoch": 0.63, + "grad_norm": 0.5225663781166077, + "learning_rate": 0.0005373778084876869, + "loss": 3.3186, + "step": 12820 + }, + { + "epoch": 0.63, + "grad_norm": 0.5068392753601074, + "learning_rate": 0.0005373683937020967, + "loss": 3.1388, + "step": 12821 + }, + { + "epoch": 0.63, + "grad_norm": 0.5025680065155029, + "learning_rate": 0.000537358978291324, + "loss": 3.3463, + "step": 12822 + }, + { + "epoch": 0.63, + "grad_norm": 0.5425643920898438, + "learning_rate": 0.0005373495622553934, + "loss": 3.1294, + "step": 12823 + }, + { + "epoch": 0.63, + "grad_norm": 0.5345323085784912, + "learning_rate": 0.0005373401455943298, + "loss": 3.2662, + "step": 12824 + }, + { + "epoch": 0.63, + "grad_norm": 0.5626019239425659, + "learning_rate": 0.0005373307283081577, + "loss": 3.4571, + "step": 12825 + }, + { + "epoch": 0.63, + "grad_norm": 0.5290101170539856, + "learning_rate": 0.0005373213103969024, + "loss": 3.3035, + "step": 12826 + }, + { + "epoch": 0.63, + "grad_norm": 0.5548025965690613, + "learning_rate": 0.0005373118918605883, + "loss": 3.1378, + "step": 12827 + }, + { + "epoch": 0.63, + "grad_norm": 0.5364861488342285, + "learning_rate": 0.0005373024726992403, + "loss": 3.3674, + "step": 12828 + }, + { + "epoch": 0.63, + "grad_norm": 0.5761748552322388, + "learning_rate": 0.0005372930529128833, + "loss": 3.2908, + "step": 12829 + }, + { + "epoch": 0.63, + "grad_norm": 0.5271551012992859, + "learning_rate": 0.0005372836325015422, + "loss": 3.2864, + "step": 12830 + }, + { + "epoch": 0.63, + "grad_norm": 0.5387271046638489, + "learning_rate": 0.0005372742114652415, + "loss": 3.0969, + "step": 12831 + }, + { + "epoch": 0.63, + "grad_norm": 0.48338204622268677, + "learning_rate": 0.0005372647898040062, + "loss": 3.1388, + "step": 12832 + }, + { + "epoch": 0.63, + "grad_norm": 0.5032204389572144, + "learning_rate": 0.0005372553675178612, + "loss": 3.0124, + "step": 12833 + }, + { + "epoch": 0.63, + "grad_norm": 0.5275563597679138, + "learning_rate": 0.0005372459446068312, + "loss": 3.3673, + "step": 12834 + }, + { + "epoch": 0.63, + "grad_norm": 0.5216962695121765, + "learning_rate": 0.000537236521070941, + "loss": 3.2244, + "step": 12835 + }, + { + "epoch": 0.63, + "grad_norm": 0.4898160398006439, + "learning_rate": 0.0005372270969102156, + "loss": 3.2835, + "step": 12836 + }, + { + "epoch": 0.63, + "grad_norm": 0.5112550854682922, + "learning_rate": 0.0005372176721246795, + "loss": 3.2138, + "step": 12837 + }, + { + "epoch": 0.63, + "grad_norm": 0.541236937046051, + "learning_rate": 0.0005372082467143578, + "loss": 3.1371, + "step": 12838 + }, + { + "epoch": 0.63, + "grad_norm": 0.5265771150588989, + "learning_rate": 0.0005371988206792752, + "loss": 3.2119, + "step": 12839 + }, + { + "epoch": 0.63, + "grad_norm": 0.5151164531707764, + "learning_rate": 0.0005371893940194566, + "loss": 3.1975, + "step": 12840 + }, + { + "epoch": 0.63, + "grad_norm": 0.4924558997154236, + "learning_rate": 0.0005371799667349267, + "loss": 3.1876, + "step": 12841 + }, + { + "epoch": 0.63, + "grad_norm": 0.5264354348182678, + "learning_rate": 0.0005371705388257105, + "loss": 3.152, + "step": 12842 + }, + { + "epoch": 0.63, + "grad_norm": 0.5298112034797668, + "learning_rate": 0.0005371611102918327, + "loss": 3.111, + "step": 12843 + }, + { + "epoch": 0.63, + "grad_norm": 0.5277727842330933, + "learning_rate": 0.0005371516811333182, + "loss": 3.1264, + "step": 12844 + }, + { + "epoch": 0.63, + "grad_norm": 0.5544726252555847, + "learning_rate": 0.0005371422513501919, + "loss": 3.0475, + "step": 12845 + }, + { + "epoch": 0.63, + "grad_norm": 0.4931381940841675, + "learning_rate": 0.0005371328209424783, + "loss": 3.3639, + "step": 12846 + }, + { + "epoch": 0.63, + "grad_norm": 0.4993634819984436, + "learning_rate": 0.0005371233899102027, + "loss": 3.1516, + "step": 12847 + }, + { + "epoch": 0.63, + "grad_norm": 0.5138599276542664, + "learning_rate": 0.0005371139582533896, + "loss": 3.198, + "step": 12848 + }, + { + "epoch": 0.63, + "grad_norm": 0.5093522071838379, + "learning_rate": 0.000537104525972064, + "loss": 3.03, + "step": 12849 + }, + { + "epoch": 0.63, + "grad_norm": 0.4839041531085968, + "learning_rate": 0.0005370950930662508, + "loss": 3.2278, + "step": 12850 + }, + { + "epoch": 0.63, + "grad_norm": 0.48701995611190796, + "learning_rate": 0.0005370856595359746, + "loss": 3.2649, + "step": 12851 + }, + { + "epoch": 0.63, + "grad_norm": 0.5446614027023315, + "learning_rate": 0.0005370762253812605, + "loss": 3.0454, + "step": 12852 + }, + { + "epoch": 0.63, + "grad_norm": 0.500033438205719, + "learning_rate": 0.0005370667906021332, + "loss": 3.0629, + "step": 12853 + }, + { + "epoch": 0.63, + "grad_norm": 0.5227843523025513, + "learning_rate": 0.0005370573551986178, + "loss": 3.3169, + "step": 12854 + }, + { + "epoch": 0.63, + "grad_norm": 0.4906866252422333, + "learning_rate": 0.0005370479191707387, + "loss": 3.2001, + "step": 12855 + }, + { + "epoch": 0.63, + "grad_norm": 0.5372707843780518, + "learning_rate": 0.0005370384825185211, + "loss": 3.0848, + "step": 12856 + }, + { + "epoch": 0.63, + "grad_norm": 0.5140169262886047, + "learning_rate": 0.0005370290452419898, + "loss": 3.3082, + "step": 12857 + }, + { + "epoch": 0.63, + "grad_norm": 0.5246527791023254, + "learning_rate": 0.0005370196073411696, + "loss": 3.0989, + "step": 12858 + }, + { + "epoch": 0.63, + "grad_norm": 0.5074948668479919, + "learning_rate": 0.0005370101688160852, + "loss": 3.1239, + "step": 12859 + }, + { + "epoch": 0.63, + "grad_norm": 0.5050709843635559, + "learning_rate": 0.0005370007296667617, + "loss": 3.1019, + "step": 12860 + }, + { + "epoch": 0.63, + "grad_norm": 0.5377639532089233, + "learning_rate": 0.000536991289893224, + "loss": 3.1435, + "step": 12861 + }, + { + "epoch": 0.63, + "grad_norm": 0.526069164276123, + "learning_rate": 0.0005369818494954968, + "loss": 3.48, + "step": 12862 + }, + { + "epoch": 0.63, + "grad_norm": 0.47031062841415405, + "learning_rate": 0.0005369724084736051, + "loss": 3.3851, + "step": 12863 + }, + { + "epoch": 0.63, + "grad_norm": 0.510369598865509, + "learning_rate": 0.0005369629668275736, + "loss": 3.2161, + "step": 12864 + }, + { + "epoch": 0.63, + "grad_norm": 0.5503350496292114, + "learning_rate": 0.0005369535245574272, + "loss": 3.0545, + "step": 12865 + }, + { + "epoch": 0.63, + "grad_norm": 0.48866593837738037, + "learning_rate": 0.0005369440816631909, + "loss": 3.3215, + "step": 12866 + }, + { + "epoch": 0.63, + "grad_norm": 0.553887665271759, + "learning_rate": 0.0005369346381448894, + "loss": 3.1592, + "step": 12867 + }, + { + "epoch": 0.63, + "grad_norm": 0.5026930570602417, + "learning_rate": 0.0005369251940025478, + "loss": 3.1913, + "step": 12868 + }, + { + "epoch": 0.63, + "grad_norm": 0.49965840578079224, + "learning_rate": 0.0005369157492361907, + "loss": 3.1695, + "step": 12869 + }, + { + "epoch": 0.63, + "grad_norm": 0.5119639039039612, + "learning_rate": 0.0005369063038458432, + "loss": 3.1687, + "step": 12870 + }, + { + "epoch": 0.63, + "grad_norm": 0.5644335746765137, + "learning_rate": 0.00053689685783153, + "loss": 3.3533, + "step": 12871 + }, + { + "epoch": 0.63, + "grad_norm": 0.5100526213645935, + "learning_rate": 0.0005368874111932761, + "loss": 3.2576, + "step": 12872 + }, + { + "epoch": 0.63, + "grad_norm": 0.5507298111915588, + "learning_rate": 0.0005368779639311064, + "loss": 3.2868, + "step": 12873 + }, + { + "epoch": 0.63, + "grad_norm": 0.519503116607666, + "learning_rate": 0.0005368685160450457, + "loss": 3.2348, + "step": 12874 + }, + { + "epoch": 0.63, + "grad_norm": 0.5227647423744202, + "learning_rate": 0.000536859067535119, + "loss": 3.1734, + "step": 12875 + }, + { + "epoch": 0.63, + "grad_norm": 0.5177075862884521, + "learning_rate": 0.000536849618401351, + "loss": 2.9512, + "step": 12876 + }, + { + "epoch": 0.63, + "grad_norm": 0.49735453724861145, + "learning_rate": 0.0005368401686437667, + "loss": 3.137, + "step": 12877 + }, + { + "epoch": 0.63, + "grad_norm": 0.5045885443687439, + "learning_rate": 0.0005368307182623909, + "loss": 3.2829, + "step": 12878 + }, + { + "epoch": 0.63, + "grad_norm": 0.4974973499774933, + "learning_rate": 0.0005368212672572487, + "loss": 3.4202, + "step": 12879 + }, + { + "epoch": 0.63, + "grad_norm": 0.49687159061431885, + "learning_rate": 0.0005368118156283648, + "loss": 3.3512, + "step": 12880 + }, + { + "epoch": 0.63, + "grad_norm": 0.5176152586936951, + "learning_rate": 0.0005368023633757642, + "loss": 3.3083, + "step": 12881 + }, + { + "epoch": 0.63, + "grad_norm": 0.4925430417060852, + "learning_rate": 0.0005367929104994717, + "loss": 3.1931, + "step": 12882 + }, + { + "epoch": 0.63, + "grad_norm": 0.5005999207496643, + "learning_rate": 0.0005367834569995122, + "loss": 3.2917, + "step": 12883 + }, + { + "epoch": 0.63, + "grad_norm": 0.5244042277336121, + "learning_rate": 0.0005367740028759108, + "loss": 3.0702, + "step": 12884 + }, + { + "epoch": 0.63, + "grad_norm": 0.567849338054657, + "learning_rate": 0.0005367645481286921, + "loss": 3.2763, + "step": 12885 + }, + { + "epoch": 0.63, + "grad_norm": 0.5182596445083618, + "learning_rate": 0.0005367550927578812, + "loss": 3.2015, + "step": 12886 + }, + { + "epoch": 0.63, + "grad_norm": 0.5369324684143066, + "learning_rate": 0.0005367456367635029, + "loss": 3.1494, + "step": 12887 + }, + { + "epoch": 0.63, + "grad_norm": 0.5310854911804199, + "learning_rate": 0.0005367361801455823, + "loss": 3.4733, + "step": 12888 + }, + { + "epoch": 0.63, + "grad_norm": 0.4995528757572174, + "learning_rate": 0.0005367267229041441, + "loss": 3.316, + "step": 12889 + }, + { + "epoch": 0.63, + "grad_norm": 0.5234096646308899, + "learning_rate": 0.000536717265039213, + "loss": 3.3181, + "step": 12890 + }, + { + "epoch": 0.63, + "grad_norm": 0.5279734134674072, + "learning_rate": 0.0005367078065508146, + "loss": 3.3797, + "step": 12891 + }, + { + "epoch": 0.63, + "grad_norm": 0.563451886177063, + "learning_rate": 0.0005366983474389732, + "loss": 3.2402, + "step": 12892 + }, + { + "epoch": 0.63, + "grad_norm": 0.5074998140335083, + "learning_rate": 0.0005366888877037138, + "loss": 3.2033, + "step": 12893 + }, + { + "epoch": 0.63, + "grad_norm": 0.5854350924491882, + "learning_rate": 0.0005366794273450615, + "loss": 3.3255, + "step": 12894 + }, + { + "epoch": 0.63, + "grad_norm": 0.5305065512657166, + "learning_rate": 0.0005366699663630413, + "loss": 3.1053, + "step": 12895 + }, + { + "epoch": 0.63, + "grad_norm": 0.5451061725616455, + "learning_rate": 0.0005366605047576778, + "loss": 3.2225, + "step": 12896 + }, + { + "epoch": 0.63, + "grad_norm": 0.48388218879699707, + "learning_rate": 0.000536651042528996, + "loss": 3.3632, + "step": 12897 + }, + { + "epoch": 0.63, + "grad_norm": 0.5109940767288208, + "learning_rate": 0.000536641579677021, + "loss": 3.1846, + "step": 12898 + }, + { + "epoch": 0.63, + "grad_norm": 0.5055460333824158, + "learning_rate": 0.0005366321162017778, + "loss": 3.1477, + "step": 12899 + }, + { + "epoch": 0.63, + "grad_norm": 0.5371001958847046, + "learning_rate": 0.0005366226521032908, + "loss": 3.1253, + "step": 12900 + }, + { + "epoch": 0.63, + "grad_norm": 0.5191012024879456, + "learning_rate": 0.0005366131873815855, + "loss": 3.2262, + "step": 12901 + }, + { + "epoch": 0.63, + "grad_norm": 0.5225028991699219, + "learning_rate": 0.0005366037220366866, + "loss": 3.5542, + "step": 12902 + }, + { + "epoch": 0.63, + "grad_norm": 0.49432340264320374, + "learning_rate": 0.0005365942560686189, + "loss": 3.3985, + "step": 12903 + }, + { + "epoch": 0.63, + "grad_norm": 0.5132206082344055, + "learning_rate": 0.0005365847894774076, + "loss": 3.161, + "step": 12904 + }, + { + "epoch": 0.63, + "grad_norm": 0.550641655921936, + "learning_rate": 0.0005365753222630774, + "loss": 3.4166, + "step": 12905 + }, + { + "epoch": 0.63, + "grad_norm": 0.5244418382644653, + "learning_rate": 0.0005365658544256533, + "loss": 3.1923, + "step": 12906 + }, + { + "epoch": 0.63, + "grad_norm": 0.5510836839675903, + "learning_rate": 0.0005365563859651604, + "loss": 3.1081, + "step": 12907 + }, + { + "epoch": 0.63, + "grad_norm": 0.5539757013320923, + "learning_rate": 0.0005365469168816235, + "loss": 3.3491, + "step": 12908 + }, + { + "epoch": 0.63, + "grad_norm": 0.5009015798568726, + "learning_rate": 0.0005365374471750674, + "loss": 3.2821, + "step": 12909 + }, + { + "epoch": 0.63, + "grad_norm": 0.4771251976490021, + "learning_rate": 0.0005365279768455173, + "loss": 3.1718, + "step": 12910 + }, + { + "epoch": 0.63, + "grad_norm": 0.5260068774223328, + "learning_rate": 0.000536518505892998, + "loss": 3.1779, + "step": 12911 + }, + { + "epoch": 0.63, + "grad_norm": 0.526455283164978, + "learning_rate": 0.0005365090343175345, + "loss": 3.057, + "step": 12912 + }, + { + "epoch": 0.63, + "grad_norm": 0.5597702860832214, + "learning_rate": 0.0005364995621191516, + "loss": 3.1074, + "step": 12913 + }, + { + "epoch": 0.63, + "grad_norm": 0.5866166353225708, + "learning_rate": 0.0005364900892978746, + "loss": 3.0222, + "step": 12914 + }, + { + "epoch": 0.63, + "grad_norm": 0.49002647399902344, + "learning_rate": 0.000536480615853728, + "loss": 3.3856, + "step": 12915 + }, + { + "epoch": 0.63, + "grad_norm": 0.5181958675384521, + "learning_rate": 0.0005364711417867371, + "loss": 3.288, + "step": 12916 + }, + { + "epoch": 0.63, + "grad_norm": 0.5336577892303467, + "learning_rate": 0.0005364616670969266, + "loss": 3.3546, + "step": 12917 + }, + { + "epoch": 0.63, + "grad_norm": 0.5243266224861145, + "learning_rate": 0.0005364521917843217, + "loss": 3.3334, + "step": 12918 + }, + { + "epoch": 0.63, + "grad_norm": 0.5071207880973816, + "learning_rate": 0.0005364427158489472, + "loss": 3.1865, + "step": 12919 + }, + { + "epoch": 0.63, + "grad_norm": 0.5273388028144836, + "learning_rate": 0.000536433239290828, + "loss": 3.3596, + "step": 12920 + }, + { + "epoch": 0.63, + "grad_norm": 0.5106172561645508, + "learning_rate": 0.0005364237621099893, + "loss": 3.2821, + "step": 12921 + }, + { + "epoch": 0.63, + "grad_norm": 0.5156787037849426, + "learning_rate": 0.0005364142843064558, + "loss": 3.2522, + "step": 12922 + }, + { + "epoch": 0.63, + "grad_norm": 0.593102216720581, + "learning_rate": 0.0005364048058802527, + "loss": 3.1004, + "step": 12923 + }, + { + "epoch": 0.63, + "grad_norm": 0.5650702118873596, + "learning_rate": 0.0005363953268314048, + "loss": 3.3883, + "step": 12924 + }, + { + "epoch": 0.63, + "grad_norm": 0.5169339179992676, + "learning_rate": 0.0005363858471599369, + "loss": 3.2, + "step": 12925 + }, + { + "epoch": 0.63, + "grad_norm": 0.5006090998649597, + "learning_rate": 0.0005363763668658744, + "loss": 3.3361, + "step": 12926 + }, + { + "epoch": 0.63, + "grad_norm": 0.533066987991333, + "learning_rate": 0.000536366885949242, + "loss": 3.225, + "step": 12927 + }, + { + "epoch": 0.63, + "grad_norm": 0.5449780225753784, + "learning_rate": 0.0005363574044100647, + "loss": 3.1673, + "step": 12928 + }, + { + "epoch": 0.63, + "grad_norm": 0.49450409412384033, + "learning_rate": 0.0005363479222483674, + "loss": 3.3605, + "step": 12929 + }, + { + "epoch": 0.63, + "grad_norm": 0.49364954233169556, + "learning_rate": 0.0005363384394641753, + "loss": 3.1942, + "step": 12930 + }, + { + "epoch": 0.63, + "grad_norm": 0.4991125762462616, + "learning_rate": 0.0005363289560575131, + "loss": 3.0315, + "step": 12931 + }, + { + "epoch": 0.63, + "grad_norm": 0.6006268858909607, + "learning_rate": 0.000536319472028406, + "loss": 2.9982, + "step": 12932 + }, + { + "epoch": 0.63, + "grad_norm": 0.49599677324295044, + "learning_rate": 0.0005363099873768787, + "loss": 3.0228, + "step": 12933 + }, + { + "epoch": 0.63, + "grad_norm": 0.5204678177833557, + "learning_rate": 0.0005363005021029566, + "loss": 3.0848, + "step": 12934 + }, + { + "epoch": 0.63, + "grad_norm": 0.5052371621131897, + "learning_rate": 0.0005362910162066644, + "loss": 3.0953, + "step": 12935 + }, + { + "epoch": 0.63, + "grad_norm": 0.5127009153366089, + "learning_rate": 0.0005362815296880272, + "loss": 3.5021, + "step": 12936 + }, + { + "epoch": 0.63, + "grad_norm": 0.5664146542549133, + "learning_rate": 0.0005362720425470698, + "loss": 3.1467, + "step": 12937 + }, + { + "epoch": 0.63, + "grad_norm": 0.5321229100227356, + "learning_rate": 0.0005362625547838173, + "loss": 3.3785, + "step": 12938 + }, + { + "epoch": 0.63, + "grad_norm": 0.5562167167663574, + "learning_rate": 0.0005362530663982948, + "loss": 3.4932, + "step": 12939 + }, + { + "epoch": 0.63, + "grad_norm": 0.508590817451477, + "learning_rate": 0.0005362435773905271, + "loss": 3.2824, + "step": 12940 + }, + { + "epoch": 0.63, + "grad_norm": 0.511133074760437, + "learning_rate": 0.0005362340877605394, + "loss": 3.0221, + "step": 12941 + }, + { + "epoch": 0.63, + "grad_norm": 0.5249322652816772, + "learning_rate": 0.0005362245975083566, + "loss": 3.1149, + "step": 12942 + }, + { + "epoch": 0.63, + "grad_norm": 0.5249238014221191, + "learning_rate": 0.0005362151066340035, + "loss": 3.0869, + "step": 12943 + }, + { + "epoch": 0.63, + "grad_norm": 0.5448753237724304, + "learning_rate": 0.0005362056151375054, + "loss": 3.0745, + "step": 12944 + }, + { + "epoch": 0.63, + "grad_norm": 0.49821531772613525, + "learning_rate": 0.0005361961230188871, + "loss": 3.2544, + "step": 12945 + }, + { + "epoch": 0.63, + "grad_norm": 0.5204089879989624, + "learning_rate": 0.0005361866302781736, + "loss": 3.0721, + "step": 12946 + }, + { + "epoch": 0.63, + "grad_norm": 0.5314644575119019, + "learning_rate": 0.0005361771369153901, + "loss": 3.1426, + "step": 12947 + }, + { + "epoch": 0.63, + "grad_norm": 0.6273903250694275, + "learning_rate": 0.0005361676429305615, + "loss": 3.1625, + "step": 12948 + }, + { + "epoch": 0.63, + "grad_norm": 0.510073184967041, + "learning_rate": 0.0005361581483237127, + "loss": 3.3129, + "step": 12949 + }, + { + "epoch": 0.63, + "grad_norm": 0.5302249193191528, + "learning_rate": 0.0005361486530948688, + "loss": 3.4637, + "step": 12950 + }, + { + "epoch": 0.63, + "grad_norm": 0.49914729595184326, + "learning_rate": 0.0005361391572440547, + "loss": 3.4225, + "step": 12951 + }, + { + "epoch": 0.63, + "grad_norm": 0.5495375394821167, + "learning_rate": 0.0005361296607712956, + "loss": 3.0738, + "step": 12952 + }, + { + "epoch": 0.63, + "grad_norm": 0.5367246270179749, + "learning_rate": 0.0005361201636766165, + "loss": 3.1163, + "step": 12953 + }, + { + "epoch": 0.63, + "grad_norm": 0.5100537538528442, + "learning_rate": 0.0005361106659600423, + "loss": 3.1204, + "step": 12954 + }, + { + "epoch": 0.63, + "grad_norm": 0.49203306436538696, + "learning_rate": 0.000536101167621598, + "loss": 3.3834, + "step": 12955 + }, + { + "epoch": 0.63, + "grad_norm": 0.5398481488227844, + "learning_rate": 0.0005360916686613087, + "loss": 3.2211, + "step": 12956 + }, + { + "epoch": 0.63, + "grad_norm": 0.5405081510543823, + "learning_rate": 0.0005360821690791992, + "loss": 3.1053, + "step": 12957 + }, + { + "epoch": 0.64, + "grad_norm": 0.5752179622650146, + "learning_rate": 0.000536072668875295, + "loss": 2.9739, + "step": 12958 + }, + { + "epoch": 0.64, + "grad_norm": 0.4982735514640808, + "learning_rate": 0.0005360631680496206, + "loss": 3.289, + "step": 12959 + }, + { + "epoch": 0.64, + "grad_norm": 0.500749409198761, + "learning_rate": 0.0005360536666022014, + "loss": 3.1595, + "step": 12960 + }, + { + "epoch": 0.64, + "grad_norm": 0.5099477171897888, + "learning_rate": 0.000536044164533062, + "loss": 3.2093, + "step": 12961 + }, + { + "epoch": 0.64, + "grad_norm": 0.5565930604934692, + "learning_rate": 0.000536034661842228, + "loss": 3.0367, + "step": 12962 + }, + { + "epoch": 0.64, + "grad_norm": 0.5251843929290771, + "learning_rate": 0.0005360251585297239, + "loss": 3.1134, + "step": 12963 + }, + { + "epoch": 0.64, + "grad_norm": 0.5190328359603882, + "learning_rate": 0.0005360156545955752, + "loss": 3.377, + "step": 12964 + }, + { + "epoch": 0.64, + "grad_norm": 0.4984540343284607, + "learning_rate": 0.0005360061500398065, + "loss": 3.4222, + "step": 12965 + }, + { + "epoch": 0.64, + "grad_norm": 0.5332692861557007, + "learning_rate": 0.0005359966448624431, + "loss": 3.0324, + "step": 12966 + }, + { + "epoch": 0.64, + "grad_norm": 0.5812960863113403, + "learning_rate": 0.00053598713906351, + "loss": 3.3179, + "step": 12967 + }, + { + "epoch": 0.64, + "grad_norm": 0.498300164937973, + "learning_rate": 0.0005359776326430321, + "loss": 3.3108, + "step": 12968 + }, + { + "epoch": 0.64, + "grad_norm": 0.4898775517940521, + "learning_rate": 0.0005359681256010345, + "loss": 3.215, + "step": 12969 + }, + { + "epoch": 0.64, + "grad_norm": 0.5427769422531128, + "learning_rate": 0.0005359586179375424, + "loss": 3.2795, + "step": 12970 + }, + { + "epoch": 0.64, + "grad_norm": 0.517987072467804, + "learning_rate": 0.0005359491096525806, + "loss": 2.9976, + "step": 12971 + }, + { + "epoch": 0.64, + "grad_norm": 0.5325692296028137, + "learning_rate": 0.0005359396007461743, + "loss": 3.327, + "step": 12972 + }, + { + "epoch": 0.64, + "grad_norm": 0.5042824149131775, + "learning_rate": 0.0005359300912183485, + "loss": 3.1492, + "step": 12973 + }, + { + "epoch": 0.64, + "grad_norm": 0.5307673811912537, + "learning_rate": 0.0005359205810691282, + "loss": 3.3302, + "step": 12974 + }, + { + "epoch": 0.64, + "grad_norm": 0.5268490314483643, + "learning_rate": 0.0005359110702985385, + "loss": 3.3567, + "step": 12975 + }, + { + "epoch": 0.64, + "grad_norm": 0.5066551566123962, + "learning_rate": 0.0005359015589066046, + "loss": 3.2215, + "step": 12976 + }, + { + "epoch": 0.64, + "grad_norm": 0.503842294216156, + "learning_rate": 0.0005358920468933511, + "loss": 3.2734, + "step": 12977 + }, + { + "epoch": 0.64, + "grad_norm": 0.5022884011268616, + "learning_rate": 0.0005358825342588035, + "loss": 3.2803, + "step": 12978 + }, + { + "epoch": 0.64, + "grad_norm": 0.5310874581336975, + "learning_rate": 0.0005358730210029869, + "loss": 3.2037, + "step": 12979 + }, + { + "epoch": 0.64, + "grad_norm": 0.509129524230957, + "learning_rate": 0.0005358635071259259, + "loss": 3.3409, + "step": 12980 + }, + { + "epoch": 0.64, + "grad_norm": 0.548210620880127, + "learning_rate": 0.0005358539926276459, + "loss": 3.2471, + "step": 12981 + }, + { + "epoch": 0.64, + "grad_norm": 0.5100131630897522, + "learning_rate": 0.0005358444775081718, + "loss": 3.1354, + "step": 12982 + }, + { + "epoch": 0.64, + "grad_norm": 0.5340960621833801, + "learning_rate": 0.0005358349617675289, + "loss": 3.2023, + "step": 12983 + }, + { + "epoch": 0.64, + "grad_norm": 0.5364866256713867, + "learning_rate": 0.000535825445405742, + "loss": 3.2475, + "step": 12984 + }, + { + "epoch": 0.64, + "grad_norm": 0.5168478488922119, + "learning_rate": 0.0005358159284228363, + "loss": 3.0523, + "step": 12985 + }, + { + "epoch": 0.64, + "grad_norm": 0.49745577573776245, + "learning_rate": 0.0005358064108188366, + "loss": 3.1681, + "step": 12986 + }, + { + "epoch": 0.64, + "grad_norm": 0.5580443739891052, + "learning_rate": 0.0005357968925937685, + "loss": 3.4102, + "step": 12987 + }, + { + "epoch": 0.64, + "grad_norm": 0.5218451023101807, + "learning_rate": 0.0005357873737476565, + "loss": 3.279, + "step": 12988 + }, + { + "epoch": 0.64, + "grad_norm": 0.5404812693595886, + "learning_rate": 0.0005357778542805262, + "loss": 3.4815, + "step": 12989 + }, + { + "epoch": 0.64, + "grad_norm": 0.5017077326774597, + "learning_rate": 0.0005357683341924023, + "loss": 3.3535, + "step": 12990 + }, + { + "epoch": 0.64, + "grad_norm": 0.49366968870162964, + "learning_rate": 0.00053575881348331, + "loss": 3.1195, + "step": 12991 + }, + { + "epoch": 0.64, + "grad_norm": 0.5663856863975525, + "learning_rate": 0.0005357492921532743, + "loss": 3.3108, + "step": 12992 + }, + { + "epoch": 0.64, + "grad_norm": 0.5067052841186523, + "learning_rate": 0.0005357397702023204, + "loss": 3.3169, + "step": 12993 + }, + { + "epoch": 0.64, + "grad_norm": 0.5267317295074463, + "learning_rate": 0.0005357302476304732, + "loss": 3.0451, + "step": 12994 + }, + { + "epoch": 0.64, + "grad_norm": 0.5492493510246277, + "learning_rate": 0.000535720724437758, + "loss": 3.091, + "step": 12995 + }, + { + "epoch": 0.64, + "grad_norm": 0.4951799511909485, + "learning_rate": 0.0005357112006241998, + "loss": 3.1907, + "step": 12996 + }, + { + "epoch": 0.64, + "grad_norm": 0.5452933311462402, + "learning_rate": 0.0005357016761898236, + "loss": 3.1006, + "step": 12997 + }, + { + "epoch": 0.64, + "grad_norm": 0.5037853121757507, + "learning_rate": 0.0005356921511346545, + "loss": 3.2203, + "step": 12998 + }, + { + "epoch": 0.64, + "grad_norm": 0.48112764954566956, + "learning_rate": 0.0005356826254587177, + "loss": 3.2942, + "step": 12999 + }, + { + "epoch": 0.64, + "grad_norm": 0.49758175015449524, + "learning_rate": 0.0005356730991620382, + "loss": 3.2869, + "step": 13000 + }, + { + "epoch": 0.64, + "grad_norm": 0.5027998089790344, + "learning_rate": 0.0005356635722446412, + "loss": 3.1453, + "step": 13001 + }, + { + "epoch": 0.64, + "grad_norm": 0.5159273147583008, + "learning_rate": 0.0005356540447065516, + "loss": 3.162, + "step": 13002 + }, + { + "epoch": 0.64, + "grad_norm": 0.5522950887680054, + "learning_rate": 0.0005356445165477947, + "loss": 3.164, + "step": 13003 + }, + { + "epoch": 0.64, + "grad_norm": 0.5712302923202515, + "learning_rate": 0.0005356349877683954, + "loss": 3.1857, + "step": 13004 + }, + { + "epoch": 0.64, + "grad_norm": 0.4907442331314087, + "learning_rate": 0.0005356254583683789, + "loss": 3.4676, + "step": 13005 + }, + { + "epoch": 0.64, + "grad_norm": 0.49316293001174927, + "learning_rate": 0.0005356159283477703, + "loss": 3.274, + "step": 13006 + }, + { + "epoch": 0.64, + "grad_norm": 0.5303720831871033, + "learning_rate": 0.0005356063977065948, + "loss": 3.1756, + "step": 13007 + }, + { + "epoch": 0.64, + "grad_norm": 0.4843452572822571, + "learning_rate": 0.0005355968664448772, + "loss": 3.0976, + "step": 13008 + }, + { + "epoch": 0.64, + "grad_norm": 0.5406621098518372, + "learning_rate": 0.0005355873345626429, + "loss": 3.216, + "step": 13009 + }, + { + "epoch": 0.64, + "grad_norm": 0.5066521167755127, + "learning_rate": 0.0005355778020599168, + "loss": 3.3491, + "step": 13010 + }, + { + "epoch": 0.64, + "grad_norm": 0.5344280004501343, + "learning_rate": 0.0005355682689367243, + "loss": 3.3312, + "step": 13011 + }, + { + "epoch": 0.64, + "grad_norm": 0.5455307364463806, + "learning_rate": 0.0005355587351930902, + "loss": 3.2931, + "step": 13012 + }, + { + "epoch": 0.64, + "grad_norm": 0.46568596363067627, + "learning_rate": 0.0005355492008290397, + "loss": 3.2364, + "step": 13013 + }, + { + "epoch": 0.64, + "grad_norm": 0.4989795684814453, + "learning_rate": 0.000535539665844598, + "loss": 3.3095, + "step": 13014 + }, + { + "epoch": 0.64, + "grad_norm": 0.5595158338546753, + "learning_rate": 0.0005355301302397901, + "loss": 3.3839, + "step": 13015 + }, + { + "epoch": 0.64, + "grad_norm": 0.4899807870388031, + "learning_rate": 0.0005355205940146412, + "loss": 3.3119, + "step": 13016 + }, + { + "epoch": 0.64, + "grad_norm": 0.48893171548843384, + "learning_rate": 0.0005355110571691764, + "loss": 3.2628, + "step": 13017 + }, + { + "epoch": 0.64, + "grad_norm": 0.5232262015342712, + "learning_rate": 0.0005355015197034207, + "loss": 3.281, + "step": 13018 + }, + { + "epoch": 0.64, + "grad_norm": 0.5392224788665771, + "learning_rate": 0.0005354919816173995, + "loss": 3.2029, + "step": 13019 + }, + { + "epoch": 0.64, + "grad_norm": 0.5342455506324768, + "learning_rate": 0.0005354824429111376, + "loss": 3.2832, + "step": 13020 + }, + { + "epoch": 0.64, + "grad_norm": 0.48179611563682556, + "learning_rate": 0.0005354729035846603, + "loss": 3.0406, + "step": 13021 + }, + { + "epoch": 0.64, + "grad_norm": 0.4980888068675995, + "learning_rate": 0.0005354633636379927, + "loss": 3.2632, + "step": 13022 + }, + { + "epoch": 0.64, + "grad_norm": 0.5115153193473816, + "learning_rate": 0.0005354538230711598, + "loss": 3.1863, + "step": 13023 + }, + { + "epoch": 0.64, + "grad_norm": 0.5516042709350586, + "learning_rate": 0.0005354442818841869, + "loss": 3.2411, + "step": 13024 + }, + { + "epoch": 0.64, + "grad_norm": 0.4984360933303833, + "learning_rate": 0.0005354347400770992, + "loss": 3.1063, + "step": 13025 + }, + { + "epoch": 0.64, + "grad_norm": 0.5270858407020569, + "learning_rate": 0.0005354251976499217, + "loss": 3.2525, + "step": 13026 + }, + { + "epoch": 0.64, + "grad_norm": 0.5250088572502136, + "learning_rate": 0.0005354156546026794, + "loss": 3.3425, + "step": 13027 + }, + { + "epoch": 0.64, + "grad_norm": 0.5461434125900269, + "learning_rate": 0.0005354061109353976, + "loss": 3.3327, + "step": 13028 + }, + { + "epoch": 0.64, + "grad_norm": 0.5004782676696777, + "learning_rate": 0.0005353965666481015, + "loss": 3.1885, + "step": 13029 + }, + { + "epoch": 0.64, + "grad_norm": 0.5391425490379333, + "learning_rate": 0.0005353870217408161, + "loss": 3.0705, + "step": 13030 + }, + { + "epoch": 0.64, + "grad_norm": 0.5242264866828918, + "learning_rate": 0.0005353774762135666, + "loss": 3.1418, + "step": 13031 + }, + { + "epoch": 0.64, + "grad_norm": 0.5164439678192139, + "learning_rate": 0.000535367930066378, + "loss": 3.2949, + "step": 13032 + }, + { + "epoch": 0.64, + "grad_norm": 0.5032666921615601, + "learning_rate": 0.0005353583832992758, + "loss": 3.257, + "step": 13033 + }, + { + "epoch": 0.64, + "grad_norm": 0.5135819315910339, + "learning_rate": 0.0005353488359122848, + "loss": 3.1992, + "step": 13034 + }, + { + "epoch": 0.64, + "grad_norm": 0.5079220533370972, + "learning_rate": 0.0005353392879054302, + "loss": 3.2749, + "step": 13035 + }, + { + "epoch": 0.64, + "grad_norm": 0.5061142444610596, + "learning_rate": 0.0005353297392787373, + "loss": 3.1821, + "step": 13036 + }, + { + "epoch": 0.64, + "grad_norm": 0.5289643406867981, + "learning_rate": 0.000535320190032231, + "loss": 3.29, + "step": 13037 + }, + { + "epoch": 0.64, + "grad_norm": 0.5101862549781799, + "learning_rate": 0.0005353106401659367, + "loss": 3.2464, + "step": 13038 + }, + { + "epoch": 0.64, + "grad_norm": 0.5174379348754883, + "learning_rate": 0.0005353010896798796, + "loss": 3.1531, + "step": 13039 + }, + { + "epoch": 0.64, + "grad_norm": 0.5504981875419617, + "learning_rate": 0.0005352915385740845, + "loss": 2.8933, + "step": 13040 + }, + { + "epoch": 0.64, + "grad_norm": 0.534186840057373, + "learning_rate": 0.0005352819868485769, + "loss": 3.2163, + "step": 13041 + }, + { + "epoch": 0.64, + "grad_norm": 0.5198753476142883, + "learning_rate": 0.0005352724345033818, + "loss": 3.1988, + "step": 13042 + }, + { + "epoch": 0.64, + "grad_norm": 0.5213879942893982, + "learning_rate": 0.0005352628815385244, + "loss": 3.1679, + "step": 13043 + }, + { + "epoch": 0.64, + "grad_norm": 0.5233698487281799, + "learning_rate": 0.0005352533279540298, + "loss": 3.0413, + "step": 13044 + }, + { + "epoch": 0.64, + "grad_norm": 0.5285971164703369, + "learning_rate": 0.0005352437737499232, + "loss": 3.0021, + "step": 13045 + }, + { + "epoch": 0.64, + "grad_norm": 0.5171172618865967, + "learning_rate": 0.0005352342189262298, + "loss": 3.1727, + "step": 13046 + }, + { + "epoch": 0.64, + "grad_norm": 0.5061637163162231, + "learning_rate": 0.0005352246634829748, + "loss": 3.5304, + "step": 13047 + }, + { + "epoch": 0.64, + "grad_norm": 0.535328209400177, + "learning_rate": 0.0005352151074201832, + "loss": 3.1083, + "step": 13048 + }, + { + "epoch": 0.64, + "grad_norm": 0.5753775835037231, + "learning_rate": 0.0005352055507378804, + "loss": 3.189, + "step": 13049 + }, + { + "epoch": 0.64, + "grad_norm": 0.5696597695350647, + "learning_rate": 0.0005351959934360913, + "loss": 3.121, + "step": 13050 + }, + { + "epoch": 0.64, + "grad_norm": 0.5169010758399963, + "learning_rate": 0.0005351864355148413, + "loss": 3.13, + "step": 13051 + }, + { + "epoch": 0.64, + "grad_norm": 0.6170527935028076, + "learning_rate": 0.0005351768769741556, + "loss": 3.1525, + "step": 13052 + }, + { + "epoch": 0.64, + "grad_norm": 0.4995497465133667, + "learning_rate": 0.000535167317814059, + "loss": 3.2603, + "step": 13053 + }, + { + "epoch": 0.64, + "grad_norm": 0.5279785990715027, + "learning_rate": 0.0005351577580345771, + "loss": 3.3865, + "step": 13054 + }, + { + "epoch": 0.64, + "grad_norm": 0.5221437811851501, + "learning_rate": 0.000535148197635735, + "loss": 3.3428, + "step": 13055 + }, + { + "epoch": 0.64, + "grad_norm": 0.5271676182746887, + "learning_rate": 0.0005351386366175577, + "loss": 3.1217, + "step": 13056 + }, + { + "epoch": 0.64, + "grad_norm": 0.5257646441459656, + "learning_rate": 0.0005351290749800705, + "loss": 3.2126, + "step": 13057 + }, + { + "epoch": 0.64, + "grad_norm": 0.5410515666007996, + "learning_rate": 0.0005351195127232986, + "loss": 3.0734, + "step": 13058 + }, + { + "epoch": 0.64, + "grad_norm": 0.5229355692863464, + "learning_rate": 0.0005351099498472671, + "loss": 3.1239, + "step": 13059 + }, + { + "epoch": 0.64, + "grad_norm": 0.5615711212158203, + "learning_rate": 0.0005351003863520013, + "loss": 3.0884, + "step": 13060 + }, + { + "epoch": 0.64, + "grad_norm": 0.5424404740333557, + "learning_rate": 0.0005350908222375263, + "loss": 3.2056, + "step": 13061 + }, + { + "epoch": 0.64, + "grad_norm": 0.537496030330658, + "learning_rate": 0.0005350812575038673, + "loss": 3.3297, + "step": 13062 + }, + { + "epoch": 0.64, + "grad_norm": 0.49330636858940125, + "learning_rate": 0.0005350716921510495, + "loss": 3.385, + "step": 13063 + }, + { + "epoch": 0.64, + "grad_norm": 0.5436633229255676, + "learning_rate": 0.0005350621261790982, + "loss": 3.198, + "step": 13064 + }, + { + "epoch": 0.64, + "grad_norm": 0.4999907910823822, + "learning_rate": 0.0005350525595880384, + "loss": 3.4466, + "step": 13065 + }, + { + "epoch": 0.64, + "grad_norm": 0.5513013005256653, + "learning_rate": 0.0005350429923778954, + "loss": 3.4655, + "step": 13066 + }, + { + "epoch": 0.64, + "grad_norm": 0.5286356806755066, + "learning_rate": 0.0005350334245486943, + "loss": 3.3866, + "step": 13067 + }, + { + "epoch": 0.64, + "grad_norm": 0.5245810747146606, + "learning_rate": 0.0005350238561004606, + "loss": 2.9812, + "step": 13068 + }, + { + "epoch": 0.64, + "grad_norm": 0.534548282623291, + "learning_rate": 0.0005350142870332192, + "loss": 3.1293, + "step": 13069 + }, + { + "epoch": 0.64, + "grad_norm": 0.5076059699058533, + "learning_rate": 0.0005350047173469953, + "loss": 3.3001, + "step": 13070 + }, + { + "epoch": 0.64, + "grad_norm": 0.5420432686805725, + "learning_rate": 0.0005349951470418144, + "loss": 3.0998, + "step": 13071 + }, + { + "epoch": 0.64, + "grad_norm": 0.5068901777267456, + "learning_rate": 0.0005349855761177014, + "loss": 3.2598, + "step": 13072 + }, + { + "epoch": 0.64, + "grad_norm": 0.5439526438713074, + "learning_rate": 0.0005349760045746816, + "loss": 3.4108, + "step": 13073 + }, + { + "epoch": 0.64, + "grad_norm": 0.4926943778991699, + "learning_rate": 0.0005349664324127803, + "loss": 3.2626, + "step": 13074 + }, + { + "epoch": 0.64, + "grad_norm": 0.5149872899055481, + "learning_rate": 0.0005349568596320225, + "loss": 3.2164, + "step": 13075 + }, + { + "epoch": 0.64, + "grad_norm": 0.5110384225845337, + "learning_rate": 0.0005349472862324337, + "loss": 3.1819, + "step": 13076 + }, + { + "epoch": 0.64, + "grad_norm": 0.5247209072113037, + "learning_rate": 0.0005349377122140388, + "loss": 3.2829, + "step": 13077 + }, + { + "epoch": 0.64, + "grad_norm": 0.553516149520874, + "learning_rate": 0.0005349281375768634, + "loss": 3.2516, + "step": 13078 + }, + { + "epoch": 0.64, + "grad_norm": 0.5458715558052063, + "learning_rate": 0.0005349185623209324, + "loss": 3.2085, + "step": 13079 + }, + { + "epoch": 0.64, + "grad_norm": 0.4971589744091034, + "learning_rate": 0.0005349089864462711, + "loss": 3.255, + "step": 13080 + }, + { + "epoch": 0.64, + "grad_norm": 0.5303791761398315, + "learning_rate": 0.0005348994099529047, + "loss": 3.2376, + "step": 13081 + }, + { + "epoch": 0.64, + "grad_norm": 0.5025544762611389, + "learning_rate": 0.0005348898328408584, + "loss": 3.2325, + "step": 13082 + }, + { + "epoch": 0.64, + "grad_norm": 0.49780189990997314, + "learning_rate": 0.0005348802551101578, + "loss": 2.9287, + "step": 13083 + }, + { + "epoch": 0.64, + "grad_norm": 0.5410318970680237, + "learning_rate": 0.0005348706767608275, + "loss": 3.2634, + "step": 13084 + }, + { + "epoch": 0.64, + "grad_norm": 0.5140213966369629, + "learning_rate": 0.0005348610977928931, + "loss": 3.1781, + "step": 13085 + }, + { + "epoch": 0.64, + "grad_norm": 0.5628545880317688, + "learning_rate": 0.0005348515182063799, + "loss": 3.239, + "step": 13086 + }, + { + "epoch": 0.64, + "grad_norm": 0.5181366205215454, + "learning_rate": 0.0005348419380013128, + "loss": 3.2054, + "step": 13087 + }, + { + "epoch": 0.64, + "grad_norm": 0.5096615552902222, + "learning_rate": 0.0005348323571777174, + "loss": 3.1634, + "step": 13088 + }, + { + "epoch": 0.64, + "grad_norm": 0.5191645622253418, + "learning_rate": 0.0005348227757356187, + "loss": 3.2104, + "step": 13089 + }, + { + "epoch": 0.64, + "grad_norm": 0.5057798624038696, + "learning_rate": 0.0005348131936750419, + "loss": 3.177, + "step": 13090 + }, + { + "epoch": 0.64, + "grad_norm": 0.4993162453174591, + "learning_rate": 0.0005348036109960125, + "loss": 3.3898, + "step": 13091 + }, + { + "epoch": 0.64, + "grad_norm": 0.5280262231826782, + "learning_rate": 0.0005347940276985555, + "loss": 3.3019, + "step": 13092 + }, + { + "epoch": 0.64, + "grad_norm": 0.5497944951057434, + "learning_rate": 0.0005347844437826962, + "loss": 3.2199, + "step": 13093 + }, + { + "epoch": 0.64, + "grad_norm": 0.47965291142463684, + "learning_rate": 0.0005347748592484599, + "loss": 3.2582, + "step": 13094 + }, + { + "epoch": 0.64, + "grad_norm": 0.5250326991081238, + "learning_rate": 0.0005347652740958718, + "loss": 3.3034, + "step": 13095 + }, + { + "epoch": 0.64, + "grad_norm": 0.5235711336135864, + "learning_rate": 0.0005347556883249572, + "loss": 3.5181, + "step": 13096 + }, + { + "epoch": 0.64, + "grad_norm": 0.4954131245613098, + "learning_rate": 0.0005347461019357412, + "loss": 3.2653, + "step": 13097 + }, + { + "epoch": 0.64, + "grad_norm": 0.5206896066665649, + "learning_rate": 0.0005347365149282492, + "loss": 3.1443, + "step": 13098 + }, + { + "epoch": 0.64, + "grad_norm": 0.5158017873764038, + "learning_rate": 0.0005347269273025064, + "loss": 3.2635, + "step": 13099 + }, + { + "epoch": 0.64, + "grad_norm": 0.5218068957328796, + "learning_rate": 0.0005347173390585381, + "loss": 3.2463, + "step": 13100 + }, + { + "epoch": 0.64, + "grad_norm": 0.5204295516014099, + "learning_rate": 0.0005347077501963694, + "loss": 3.2756, + "step": 13101 + }, + { + "epoch": 0.64, + "grad_norm": 0.5045682191848755, + "learning_rate": 0.0005346981607160257, + "loss": 3.2225, + "step": 13102 + }, + { + "epoch": 0.64, + "grad_norm": 0.5521380305290222, + "learning_rate": 0.0005346885706175321, + "loss": 3.0098, + "step": 13103 + }, + { + "epoch": 0.64, + "grad_norm": 0.5172122716903687, + "learning_rate": 0.0005346789799009141, + "loss": 3.2474, + "step": 13104 + }, + { + "epoch": 0.64, + "grad_norm": 0.5108321905136108, + "learning_rate": 0.0005346693885661968, + "loss": 3.132, + "step": 13105 + }, + { + "epoch": 0.64, + "grad_norm": 0.4934992790222168, + "learning_rate": 0.0005346597966134056, + "loss": 3.0254, + "step": 13106 + }, + { + "epoch": 0.64, + "grad_norm": 0.5101711750030518, + "learning_rate": 0.0005346502040425655, + "loss": 3.4279, + "step": 13107 + }, + { + "epoch": 0.64, + "grad_norm": 0.5210503935813904, + "learning_rate": 0.000534640610853702, + "loss": 3.2178, + "step": 13108 + }, + { + "epoch": 0.64, + "grad_norm": 0.496926873922348, + "learning_rate": 0.0005346310170468402, + "loss": 3.1698, + "step": 13109 + }, + { + "epoch": 0.64, + "grad_norm": 0.49992474913597107, + "learning_rate": 0.0005346214226220055, + "loss": 3.2576, + "step": 13110 + }, + { + "epoch": 0.64, + "grad_norm": 0.5001792311668396, + "learning_rate": 0.0005346118275792232, + "loss": 3.4211, + "step": 13111 + }, + { + "epoch": 0.64, + "grad_norm": 0.5183314681053162, + "learning_rate": 0.0005346022319185185, + "loss": 3.19, + "step": 13112 + }, + { + "epoch": 0.64, + "grad_norm": 0.4935111403465271, + "learning_rate": 0.0005345926356399166, + "loss": 3.3512, + "step": 13113 + }, + { + "epoch": 0.64, + "grad_norm": 0.5626166462898254, + "learning_rate": 0.0005345830387434428, + "loss": 3.3638, + "step": 13114 + }, + { + "epoch": 0.64, + "grad_norm": 0.5216866731643677, + "learning_rate": 0.0005345734412291226, + "loss": 3.244, + "step": 13115 + }, + { + "epoch": 0.64, + "grad_norm": 0.5332128405570984, + "learning_rate": 0.000534563843096981, + "loss": 3.2552, + "step": 13116 + }, + { + "epoch": 0.64, + "grad_norm": 0.47880974411964417, + "learning_rate": 0.0005345542443470434, + "loss": 3.1028, + "step": 13117 + }, + { + "epoch": 0.64, + "grad_norm": 0.4967505931854248, + "learning_rate": 0.000534544644979335, + "loss": 3.004, + "step": 13118 + }, + { + "epoch": 0.64, + "grad_norm": 0.5178172588348389, + "learning_rate": 0.0005345350449938811, + "loss": 3.2698, + "step": 13119 + }, + { + "epoch": 0.64, + "grad_norm": 0.5368995070457458, + "learning_rate": 0.0005345254443907072, + "loss": 3.3812, + "step": 13120 + }, + { + "epoch": 0.64, + "grad_norm": 0.4906540513038635, + "learning_rate": 0.0005345158431698383, + "loss": 3.1979, + "step": 13121 + }, + { + "epoch": 0.64, + "grad_norm": 0.5134233236312866, + "learning_rate": 0.0005345062413312998, + "loss": 3.3588, + "step": 13122 + }, + { + "epoch": 0.64, + "grad_norm": 0.5044988989830017, + "learning_rate": 0.0005344966388751171, + "loss": 3.1731, + "step": 13123 + }, + { + "epoch": 0.64, + "grad_norm": 0.5189638137817383, + "learning_rate": 0.0005344870358013152, + "loss": 3.1494, + "step": 13124 + }, + { + "epoch": 0.64, + "grad_norm": 0.6163929104804993, + "learning_rate": 0.0005344774321099197, + "loss": 3.2465, + "step": 13125 + }, + { + "epoch": 0.64, + "grad_norm": 0.5382804870605469, + "learning_rate": 0.0005344678278009557, + "loss": 3.1525, + "step": 13126 + }, + { + "epoch": 0.64, + "grad_norm": 0.5891967415809631, + "learning_rate": 0.0005344582228744486, + "loss": 3.1272, + "step": 13127 + }, + { + "epoch": 0.64, + "grad_norm": 0.5315770506858826, + "learning_rate": 0.0005344486173304236, + "loss": 2.9923, + "step": 13128 + }, + { + "epoch": 0.64, + "grad_norm": 0.5364977121353149, + "learning_rate": 0.0005344390111689061, + "loss": 3.2317, + "step": 13129 + }, + { + "epoch": 0.64, + "grad_norm": 0.5116824507713318, + "learning_rate": 0.0005344294043899215, + "loss": 3.1648, + "step": 13130 + }, + { + "epoch": 0.64, + "grad_norm": 0.5573520660400391, + "learning_rate": 0.0005344197969934949, + "loss": 3.353, + "step": 13131 + }, + { + "epoch": 0.64, + "grad_norm": 0.5192342400550842, + "learning_rate": 0.0005344101889796516, + "loss": 3.1479, + "step": 13132 + }, + { + "epoch": 0.64, + "grad_norm": 0.49575215578079224, + "learning_rate": 0.0005344005803484171, + "loss": 3.4167, + "step": 13133 + }, + { + "epoch": 0.64, + "grad_norm": 0.5160652995109558, + "learning_rate": 0.0005343909710998164, + "loss": 3.2124, + "step": 13134 + }, + { + "epoch": 0.64, + "grad_norm": 0.5108956098556519, + "learning_rate": 0.0005343813612338751, + "loss": 3.3316, + "step": 13135 + }, + { + "epoch": 0.64, + "grad_norm": 0.528973400592804, + "learning_rate": 0.0005343717507506184, + "loss": 3.0333, + "step": 13136 + }, + { + "epoch": 0.64, + "grad_norm": 0.5308817028999329, + "learning_rate": 0.0005343621396500716, + "loss": 3.1145, + "step": 13137 + }, + { + "epoch": 0.64, + "grad_norm": 0.5422894358634949, + "learning_rate": 0.00053435252793226, + "loss": 3.2858, + "step": 13138 + }, + { + "epoch": 0.64, + "grad_norm": 0.4965519607067108, + "learning_rate": 0.000534342915597209, + "loss": 3.2758, + "step": 13139 + }, + { + "epoch": 0.64, + "grad_norm": 0.5743324160575867, + "learning_rate": 0.0005343333026449438, + "loss": 3.2823, + "step": 13140 + }, + { + "epoch": 0.64, + "grad_norm": 0.4840207099914551, + "learning_rate": 0.0005343236890754898, + "loss": 3.0271, + "step": 13141 + }, + { + "epoch": 0.64, + "grad_norm": 0.5335875749588013, + "learning_rate": 0.0005343140748888724, + "loss": 3.2321, + "step": 13142 + }, + { + "epoch": 0.64, + "grad_norm": 0.5198099613189697, + "learning_rate": 0.0005343044600851166, + "loss": 3.1172, + "step": 13143 + }, + { + "epoch": 0.64, + "grad_norm": 0.5045928359031677, + "learning_rate": 0.0005342948446642481, + "loss": 3.2568, + "step": 13144 + }, + { + "epoch": 0.64, + "grad_norm": 0.49999576807022095, + "learning_rate": 0.0005342852286262921, + "loss": 3.2756, + "step": 13145 + }, + { + "epoch": 0.64, + "grad_norm": 0.5080724954605103, + "learning_rate": 0.0005342756119712737, + "loss": 3.3996, + "step": 13146 + }, + { + "epoch": 0.64, + "grad_norm": 0.48446428775787354, + "learning_rate": 0.0005342659946992187, + "loss": 3.1447, + "step": 13147 + }, + { + "epoch": 0.64, + "grad_norm": 0.5702810883522034, + "learning_rate": 0.000534256376810152, + "loss": 3.2459, + "step": 13148 + }, + { + "epoch": 0.64, + "grad_norm": 0.5255053639411926, + "learning_rate": 0.0005342467583040991, + "loss": 3.0702, + "step": 13149 + }, + { + "epoch": 0.64, + "grad_norm": 0.5071660280227661, + "learning_rate": 0.0005342371391810853, + "loss": 3.3868, + "step": 13150 + }, + { + "epoch": 0.64, + "grad_norm": 0.5754165053367615, + "learning_rate": 0.000534227519441136, + "loss": 3.2203, + "step": 13151 + }, + { + "epoch": 0.64, + "grad_norm": 0.5076290965080261, + "learning_rate": 0.0005342178990842765, + "loss": 3.4923, + "step": 13152 + }, + { + "epoch": 0.64, + "grad_norm": 0.5148435831069946, + "learning_rate": 0.0005342082781105321, + "loss": 3.3192, + "step": 13153 + }, + { + "epoch": 0.64, + "grad_norm": 0.5352534651756287, + "learning_rate": 0.0005341986565199281, + "loss": 3.2992, + "step": 13154 + }, + { + "epoch": 0.64, + "grad_norm": 0.5055733323097229, + "learning_rate": 0.0005341890343124902, + "loss": 3.1765, + "step": 13155 + }, + { + "epoch": 0.64, + "grad_norm": 0.5165096521377563, + "learning_rate": 0.0005341794114882431, + "loss": 3.4136, + "step": 13156 + }, + { + "epoch": 0.64, + "grad_norm": 0.53326416015625, + "learning_rate": 0.0005341697880472126, + "loss": 3.2336, + "step": 13157 + }, + { + "epoch": 0.64, + "grad_norm": 0.512313187122345, + "learning_rate": 0.000534160163989424, + "loss": 3.2201, + "step": 13158 + }, + { + "epoch": 0.64, + "grad_norm": 0.5026753544807434, + "learning_rate": 0.0005341505393149026, + "loss": 3.3028, + "step": 13159 + }, + { + "epoch": 0.64, + "grad_norm": 0.4956597685813904, + "learning_rate": 0.0005341409140236738, + "loss": 3.1856, + "step": 13160 + }, + { + "epoch": 0.64, + "grad_norm": 0.5383108854293823, + "learning_rate": 0.0005341312881157628, + "loss": 3.2079, + "step": 13161 + }, + { + "epoch": 0.65, + "grad_norm": 0.5031716823577881, + "learning_rate": 0.000534121661591195, + "loss": 3.363, + "step": 13162 + }, + { + "epoch": 0.65, + "grad_norm": 0.564416766166687, + "learning_rate": 0.0005341120344499959, + "loss": 3.1272, + "step": 13163 + }, + { + "epoch": 0.65, + "grad_norm": 0.5095632672309875, + "learning_rate": 0.0005341024066921907, + "loss": 3.2337, + "step": 13164 + }, + { + "epoch": 0.65, + "grad_norm": 0.520420253276825, + "learning_rate": 0.0005340927783178049, + "loss": 3.2769, + "step": 13165 + }, + { + "epoch": 0.65, + "grad_norm": 0.5509207248687744, + "learning_rate": 0.0005340831493268637, + "loss": 3.1734, + "step": 13166 + }, + { + "epoch": 0.65, + "grad_norm": 0.5348342657089233, + "learning_rate": 0.0005340735197193924, + "loss": 3.1599, + "step": 13167 + }, + { + "epoch": 0.65, + "grad_norm": 0.5090433955192566, + "learning_rate": 0.0005340638894954167, + "loss": 3.352, + "step": 13168 + }, + { + "epoch": 0.65, + "grad_norm": 0.5051810145378113, + "learning_rate": 0.0005340542586549617, + "loss": 3.0708, + "step": 13169 + }, + { + "epoch": 0.65, + "grad_norm": 0.5810332298278809, + "learning_rate": 0.0005340446271980528, + "loss": 3.34, + "step": 13170 + }, + { + "epoch": 0.65, + "grad_norm": 0.5068036913871765, + "learning_rate": 0.0005340349951247156, + "loss": 3.2238, + "step": 13171 + }, + { + "epoch": 0.65, + "grad_norm": 0.5000148415565491, + "learning_rate": 0.000534025362434975, + "loss": 3.3234, + "step": 13172 + }, + { + "epoch": 0.65, + "grad_norm": 0.5451956987380981, + "learning_rate": 0.0005340157291288568, + "loss": 2.9447, + "step": 13173 + }, + { + "epoch": 0.65, + "grad_norm": 0.524332582950592, + "learning_rate": 0.000534006095206386, + "loss": 3.1001, + "step": 13174 + }, + { + "epoch": 0.65, + "grad_norm": 0.48256099224090576, + "learning_rate": 0.0005339964606675883, + "loss": 3.1865, + "step": 13175 + }, + { + "epoch": 0.65, + "grad_norm": 0.5315820574760437, + "learning_rate": 0.000533986825512489, + "loss": 3.0933, + "step": 13176 + }, + { + "epoch": 0.65, + "grad_norm": 0.5178393721580505, + "learning_rate": 0.0005339771897411134, + "loss": 3.1253, + "step": 13177 + }, + { + "epoch": 0.65, + "grad_norm": 0.48656055331230164, + "learning_rate": 0.0005339675533534869, + "loss": 3.2523, + "step": 13178 + }, + { + "epoch": 0.65, + "grad_norm": 0.5785982608795166, + "learning_rate": 0.000533957916349635, + "loss": 3.1421, + "step": 13179 + }, + { + "epoch": 0.65, + "grad_norm": 0.5141577124595642, + "learning_rate": 0.0005339482787295828, + "loss": 3.4436, + "step": 13180 + }, + { + "epoch": 0.65, + "grad_norm": 0.5167346000671387, + "learning_rate": 0.000533938640493356, + "loss": 3.1199, + "step": 13181 + }, + { + "epoch": 0.65, + "grad_norm": 0.5306900143623352, + "learning_rate": 0.0005339290016409797, + "loss": 3.3397, + "step": 13182 + }, + { + "epoch": 0.65, + "grad_norm": 0.5225565433502197, + "learning_rate": 0.0005339193621724795, + "loss": 3.1148, + "step": 13183 + }, + { + "epoch": 0.65, + "grad_norm": 0.5168359875679016, + "learning_rate": 0.0005339097220878808, + "loss": 3.0261, + "step": 13184 + }, + { + "epoch": 0.65, + "grad_norm": 0.5109102725982666, + "learning_rate": 0.0005339000813872088, + "loss": 3.1654, + "step": 13185 + }, + { + "epoch": 0.65, + "grad_norm": 0.5324611663818359, + "learning_rate": 0.0005338904400704891, + "loss": 3.291, + "step": 13186 + }, + { + "epoch": 0.65, + "grad_norm": 0.5554729104042053, + "learning_rate": 0.0005338807981377469, + "loss": 3.4504, + "step": 13187 + }, + { + "epoch": 0.65, + "grad_norm": 0.5127044320106506, + "learning_rate": 0.0005338711555890077, + "loss": 3.3947, + "step": 13188 + }, + { + "epoch": 0.65, + "grad_norm": 0.5299892425537109, + "learning_rate": 0.000533861512424297, + "loss": 3.148, + "step": 13189 + }, + { + "epoch": 0.65, + "grad_norm": 0.5451264381408691, + "learning_rate": 0.0005338518686436399, + "loss": 3.4564, + "step": 13190 + }, + { + "epoch": 0.65, + "grad_norm": 0.5090279579162598, + "learning_rate": 0.0005338422242470621, + "loss": 3.114, + "step": 13191 + }, + { + "epoch": 0.65, + "grad_norm": 0.519496500492096, + "learning_rate": 0.0005338325792345888, + "loss": 2.9807, + "step": 13192 + }, + { + "epoch": 0.65, + "grad_norm": 0.5295326113700867, + "learning_rate": 0.0005338229336062456, + "loss": 3.3167, + "step": 13193 + }, + { + "epoch": 0.65, + "grad_norm": 0.5315036177635193, + "learning_rate": 0.0005338132873620577, + "loss": 3.0778, + "step": 13194 + }, + { + "epoch": 0.65, + "grad_norm": 0.5226171016693115, + "learning_rate": 0.0005338036405020507, + "loss": 3.1866, + "step": 13195 + }, + { + "epoch": 0.65, + "grad_norm": 0.49867236614227295, + "learning_rate": 0.0005337939930262497, + "loss": 3.1654, + "step": 13196 + }, + { + "epoch": 0.65, + "grad_norm": 0.5161440372467041, + "learning_rate": 0.0005337843449346807, + "loss": 3.1951, + "step": 13197 + }, + { + "epoch": 0.65, + "grad_norm": 0.49722862243652344, + "learning_rate": 0.0005337746962273684, + "loss": 3.2434, + "step": 13198 + }, + { + "epoch": 0.65, + "grad_norm": 0.5370376706123352, + "learning_rate": 0.0005337650469043387, + "loss": 3.2291, + "step": 13199 + }, + { + "epoch": 0.65, + "grad_norm": 0.5049730539321899, + "learning_rate": 0.0005337553969656166, + "loss": 3.291, + "step": 13200 + }, + { + "epoch": 0.65, + "grad_norm": 0.5508888363838196, + "learning_rate": 0.000533745746411228, + "loss": 3.1669, + "step": 13201 + }, + { + "epoch": 0.65, + "grad_norm": 0.49294355511665344, + "learning_rate": 0.0005337360952411981, + "loss": 3.0508, + "step": 13202 + }, + { + "epoch": 0.65, + "grad_norm": 0.5044154524803162, + "learning_rate": 0.0005337264434555522, + "loss": 3.2915, + "step": 13203 + }, + { + "epoch": 0.65, + "grad_norm": 0.5183781385421753, + "learning_rate": 0.000533716791054316, + "loss": 3.0519, + "step": 13204 + }, + { + "epoch": 0.65, + "grad_norm": 0.559165358543396, + "learning_rate": 0.0005337071380375146, + "loss": 3.2492, + "step": 13205 + }, + { + "epoch": 0.65, + "grad_norm": 0.5487591624259949, + "learning_rate": 0.0005336974844051736, + "loss": 3.2115, + "step": 13206 + }, + { + "epoch": 0.65, + "grad_norm": 0.5180480480194092, + "learning_rate": 0.0005336878301573185, + "loss": 3.2974, + "step": 13207 + }, + { + "epoch": 0.65, + "grad_norm": 0.4945024251937866, + "learning_rate": 0.0005336781752939744, + "loss": 3.1486, + "step": 13208 + }, + { + "epoch": 0.65, + "grad_norm": 0.4666735529899597, + "learning_rate": 0.0005336685198151672, + "loss": 3.1456, + "step": 13209 + }, + { + "epoch": 0.65, + "grad_norm": 0.5270425081253052, + "learning_rate": 0.0005336588637209219, + "loss": 3.1666, + "step": 13210 + }, + { + "epoch": 0.65, + "grad_norm": 0.5256056785583496, + "learning_rate": 0.0005336492070112642, + "loss": 3.1653, + "step": 13211 + }, + { + "epoch": 0.65, + "grad_norm": 0.4974687695503235, + "learning_rate": 0.0005336395496862195, + "loss": 3.209, + "step": 13212 + }, + { + "epoch": 0.65, + "grad_norm": 0.5208483934402466, + "learning_rate": 0.0005336298917458131, + "loss": 3.1261, + "step": 13213 + }, + { + "epoch": 0.65, + "grad_norm": 0.5191110372543335, + "learning_rate": 0.0005336202331900707, + "loss": 3.0473, + "step": 13214 + }, + { + "epoch": 0.65, + "grad_norm": 0.5726637244224548, + "learning_rate": 0.0005336105740190175, + "loss": 2.9428, + "step": 13215 + }, + { + "epoch": 0.65, + "grad_norm": 0.520617663860321, + "learning_rate": 0.000533600914232679, + "loss": 3.0333, + "step": 13216 + }, + { + "epoch": 0.65, + "grad_norm": 0.51925128698349, + "learning_rate": 0.0005335912538310805, + "loss": 3.0626, + "step": 13217 + }, + { + "epoch": 0.65, + "grad_norm": 0.5106421113014221, + "learning_rate": 0.0005335815928142478, + "loss": 3.2478, + "step": 13218 + }, + { + "epoch": 0.65, + "grad_norm": 0.4910167455673218, + "learning_rate": 0.000533571931182206, + "loss": 3.4039, + "step": 13219 + }, + { + "epoch": 0.65, + "grad_norm": 0.5045503973960876, + "learning_rate": 0.0005335622689349807, + "loss": 3.1105, + "step": 13220 + }, + { + "epoch": 0.65, + "grad_norm": 0.53183513879776, + "learning_rate": 0.0005335526060725975, + "loss": 3.3134, + "step": 13221 + }, + { + "epoch": 0.65, + "grad_norm": 0.5390097498893738, + "learning_rate": 0.0005335429425950814, + "loss": 3.1459, + "step": 13222 + }, + { + "epoch": 0.65, + "grad_norm": 0.53877192735672, + "learning_rate": 0.0005335332785024583, + "loss": 3.1017, + "step": 13223 + }, + { + "epoch": 0.65, + "grad_norm": 0.5238897800445557, + "learning_rate": 0.0005335236137947536, + "loss": 3.1823, + "step": 13224 + }, + { + "epoch": 0.65, + "grad_norm": 0.5160094499588013, + "learning_rate": 0.0005335139484719925, + "loss": 3.0258, + "step": 13225 + }, + { + "epoch": 0.65, + "grad_norm": 0.512133002281189, + "learning_rate": 0.0005335042825342005, + "loss": 3.1001, + "step": 13226 + }, + { + "epoch": 0.65, + "grad_norm": 0.4885791838169098, + "learning_rate": 0.0005334946159814033, + "loss": 3.3233, + "step": 13227 + }, + { + "epoch": 0.65, + "grad_norm": 0.5112274289131165, + "learning_rate": 0.0005334849488136263, + "loss": 3.269, + "step": 13228 + }, + { + "epoch": 0.65, + "grad_norm": 0.4955732524394989, + "learning_rate": 0.0005334752810308948, + "loss": 3.3469, + "step": 13229 + }, + { + "epoch": 0.65, + "grad_norm": 0.5273498296737671, + "learning_rate": 0.0005334656126332343, + "loss": 3.2339, + "step": 13230 + }, + { + "epoch": 0.65, + "grad_norm": 0.5362980961799622, + "learning_rate": 0.0005334559436206702, + "loss": 3.2441, + "step": 13231 + }, + { + "epoch": 0.65, + "grad_norm": 0.5081391334533691, + "learning_rate": 0.0005334462739932282, + "loss": 3.2659, + "step": 13232 + }, + { + "epoch": 0.65, + "grad_norm": 0.5640556812286377, + "learning_rate": 0.0005334366037509337, + "loss": 3.192, + "step": 13233 + }, + { + "epoch": 0.65, + "grad_norm": 0.5298064351081848, + "learning_rate": 0.000533426932893812, + "loss": 3.1988, + "step": 13234 + }, + { + "epoch": 0.65, + "grad_norm": 0.5060878396034241, + "learning_rate": 0.0005334172614218887, + "loss": 3.3089, + "step": 13235 + }, + { + "epoch": 0.65, + "grad_norm": 0.5255646705627441, + "learning_rate": 0.0005334075893351893, + "loss": 3.212, + "step": 13236 + }, + { + "epoch": 0.65, + "grad_norm": 0.5083935856819153, + "learning_rate": 0.0005333979166337393, + "loss": 3.2419, + "step": 13237 + }, + { + "epoch": 0.65, + "grad_norm": 0.5213572382926941, + "learning_rate": 0.000533388243317564, + "loss": 3.3522, + "step": 13238 + }, + { + "epoch": 0.65, + "grad_norm": 0.495538592338562, + "learning_rate": 0.000533378569386689, + "loss": 3.3924, + "step": 13239 + }, + { + "epoch": 0.65, + "grad_norm": 0.5392500758171082, + "learning_rate": 0.0005333688948411398, + "loss": 2.9577, + "step": 13240 + }, + { + "epoch": 0.65, + "grad_norm": 0.5462858080863953, + "learning_rate": 0.0005333592196809418, + "loss": 3.1583, + "step": 13241 + }, + { + "epoch": 0.65, + "grad_norm": 0.584762454032898, + "learning_rate": 0.0005333495439061206, + "loss": 3.1343, + "step": 13242 + }, + { + "epoch": 0.65, + "grad_norm": 0.520191490650177, + "learning_rate": 0.0005333398675167015, + "loss": 3.3106, + "step": 13243 + }, + { + "epoch": 0.65, + "grad_norm": 0.5066478252410889, + "learning_rate": 0.0005333301905127101, + "loss": 3.3081, + "step": 13244 + }, + { + "epoch": 0.65, + "grad_norm": 0.5514070391654968, + "learning_rate": 0.000533320512894172, + "loss": 3.1183, + "step": 13245 + }, + { + "epoch": 0.65, + "grad_norm": 0.5034546256065369, + "learning_rate": 0.0005333108346611124, + "loss": 3.0596, + "step": 13246 + }, + { + "epoch": 0.65, + "grad_norm": 0.5097323656082153, + "learning_rate": 0.0005333011558135572, + "loss": 3.1469, + "step": 13247 + }, + { + "epoch": 0.65, + "grad_norm": 0.5640053153038025, + "learning_rate": 0.0005332914763515314, + "loss": 3.1606, + "step": 13248 + }, + { + "epoch": 0.65, + "grad_norm": 0.5292096138000488, + "learning_rate": 0.0005332817962750609, + "loss": 3.1932, + "step": 13249 + }, + { + "epoch": 0.65, + "grad_norm": 0.4695495367050171, + "learning_rate": 0.0005332721155841711, + "loss": 3.147, + "step": 13250 + }, + { + "epoch": 0.65, + "grad_norm": 0.512503445148468, + "learning_rate": 0.0005332624342788873, + "loss": 3.1208, + "step": 13251 + }, + { + "epoch": 0.65, + "grad_norm": 0.5124956965446472, + "learning_rate": 0.0005332527523592353, + "loss": 3.1443, + "step": 13252 + }, + { + "epoch": 0.65, + "grad_norm": 0.5241851210594177, + "learning_rate": 0.0005332430698252403, + "loss": 3.1076, + "step": 13253 + }, + { + "epoch": 0.65, + "grad_norm": 0.5366250872612, + "learning_rate": 0.0005332333866769279, + "loss": 3.1135, + "step": 13254 + }, + { + "epoch": 0.65, + "grad_norm": 0.506049394607544, + "learning_rate": 0.0005332237029143238, + "loss": 3.411, + "step": 13255 + }, + { + "epoch": 0.65, + "grad_norm": 0.5145257711410522, + "learning_rate": 0.0005332140185374532, + "loss": 3.2594, + "step": 13256 + }, + { + "epoch": 0.65, + "grad_norm": 0.5252572894096375, + "learning_rate": 0.0005332043335463419, + "loss": 3.2745, + "step": 13257 + }, + { + "epoch": 0.65, + "grad_norm": 0.4891444146633148, + "learning_rate": 0.0005331946479410152, + "loss": 3.124, + "step": 13258 + }, + { + "epoch": 0.65, + "grad_norm": 0.569198727607727, + "learning_rate": 0.0005331849617214987, + "loss": 3.3658, + "step": 13259 + }, + { + "epoch": 0.65, + "grad_norm": 0.627887487411499, + "learning_rate": 0.0005331752748878179, + "loss": 3.0741, + "step": 13260 + }, + { + "epoch": 0.65, + "grad_norm": 0.5502414703369141, + "learning_rate": 0.0005331655874399982, + "loss": 3.1345, + "step": 13261 + }, + { + "epoch": 0.65, + "grad_norm": 0.5486642122268677, + "learning_rate": 0.0005331558993780653, + "loss": 3.1981, + "step": 13262 + }, + { + "epoch": 0.65, + "grad_norm": 0.5511811375617981, + "learning_rate": 0.0005331462107020446, + "loss": 3.1109, + "step": 13263 + }, + { + "epoch": 0.65, + "grad_norm": 0.5167766809463501, + "learning_rate": 0.0005331365214119617, + "loss": 3.2288, + "step": 13264 + }, + { + "epoch": 0.65, + "grad_norm": 0.5129167437553406, + "learning_rate": 0.000533126831507842, + "loss": 3.3583, + "step": 13265 + }, + { + "epoch": 0.65, + "grad_norm": 0.5186481475830078, + "learning_rate": 0.0005331171409897112, + "loss": 3.18, + "step": 13266 + }, + { + "epoch": 0.65, + "grad_norm": 0.5730112195014954, + "learning_rate": 0.0005331074498575946, + "loss": 3.0874, + "step": 13267 + }, + { + "epoch": 0.65, + "grad_norm": 0.5106200575828552, + "learning_rate": 0.000533097758111518, + "loss": 3.2888, + "step": 13268 + }, + { + "epoch": 0.65, + "grad_norm": 0.5146166682243347, + "learning_rate": 0.0005330880657515066, + "loss": 3.0715, + "step": 13269 + }, + { + "epoch": 0.65, + "grad_norm": 0.49323368072509766, + "learning_rate": 0.0005330783727775861, + "loss": 3.1591, + "step": 13270 + }, + { + "epoch": 0.65, + "grad_norm": 0.5173508524894714, + "learning_rate": 0.000533068679189782, + "loss": 3.092, + "step": 13271 + }, + { + "epoch": 0.65, + "grad_norm": 0.51826411485672, + "learning_rate": 0.00053305898498812, + "loss": 3.228, + "step": 13272 + }, + { + "epoch": 0.65, + "grad_norm": 0.5417203307151794, + "learning_rate": 0.0005330492901726255, + "loss": 3.1268, + "step": 13273 + }, + { + "epoch": 0.65, + "grad_norm": 0.5598623752593994, + "learning_rate": 0.0005330395947433238, + "loss": 3.1918, + "step": 13274 + }, + { + "epoch": 0.65, + "grad_norm": 0.5296295881271362, + "learning_rate": 0.0005330298987002408, + "loss": 3.1156, + "step": 13275 + }, + { + "epoch": 0.65, + "grad_norm": 0.4983462989330292, + "learning_rate": 0.0005330202020434019, + "loss": 3.0815, + "step": 13276 + }, + { + "epoch": 0.65, + "grad_norm": 0.604824423789978, + "learning_rate": 0.0005330105047728326, + "loss": 3.241, + "step": 13277 + }, + { + "epoch": 0.65, + "grad_norm": 0.5088874101638794, + "learning_rate": 0.0005330008068885585, + "loss": 3.255, + "step": 13278 + }, + { + "epoch": 0.65, + "grad_norm": 0.5178427696228027, + "learning_rate": 0.0005329911083906051, + "loss": 3.2774, + "step": 13279 + }, + { + "epoch": 0.65, + "grad_norm": 0.510136604309082, + "learning_rate": 0.0005329814092789979, + "loss": 3.4761, + "step": 13280 + }, + { + "epoch": 0.65, + "grad_norm": 0.48275598883628845, + "learning_rate": 0.0005329717095537627, + "loss": 3.2948, + "step": 13281 + }, + { + "epoch": 0.65, + "grad_norm": 0.5280422568321228, + "learning_rate": 0.0005329620092149247, + "loss": 3.0466, + "step": 13282 + }, + { + "epoch": 0.65, + "grad_norm": 0.5315813422203064, + "learning_rate": 0.0005329523082625097, + "loss": 3.0598, + "step": 13283 + }, + { + "epoch": 0.65, + "grad_norm": 0.49926483631134033, + "learning_rate": 0.0005329426066965431, + "loss": 3.2156, + "step": 13284 + }, + { + "epoch": 0.65, + "grad_norm": 0.5536932349205017, + "learning_rate": 0.0005329329045170503, + "loss": 3.3147, + "step": 13285 + }, + { + "epoch": 0.65, + "grad_norm": 0.5208083391189575, + "learning_rate": 0.0005329232017240573, + "loss": 3.1502, + "step": 13286 + }, + { + "epoch": 0.65, + "grad_norm": 0.5383280515670776, + "learning_rate": 0.0005329134983175894, + "loss": 3.2438, + "step": 13287 + }, + { + "epoch": 0.65, + "grad_norm": 0.5244266986846924, + "learning_rate": 0.0005329037942976721, + "loss": 3.4281, + "step": 13288 + }, + { + "epoch": 0.65, + "grad_norm": 0.47546955943107605, + "learning_rate": 0.0005328940896643311, + "loss": 3.2289, + "step": 13289 + }, + { + "epoch": 0.65, + "grad_norm": 0.5083588361740112, + "learning_rate": 0.0005328843844175918, + "loss": 3.138, + "step": 13290 + }, + { + "epoch": 0.65, + "grad_norm": 0.522491455078125, + "learning_rate": 0.0005328746785574799, + "loss": 3.1468, + "step": 13291 + }, + { + "epoch": 0.65, + "grad_norm": 0.49923422932624817, + "learning_rate": 0.0005328649720840209, + "loss": 3.1271, + "step": 13292 + }, + { + "epoch": 0.65, + "grad_norm": 0.5488773584365845, + "learning_rate": 0.0005328552649972405, + "loss": 3.1722, + "step": 13293 + }, + { + "epoch": 0.65, + "grad_norm": 0.5081570744514465, + "learning_rate": 0.0005328455572971639, + "loss": 3.3069, + "step": 13294 + }, + { + "epoch": 0.65, + "grad_norm": 0.48642170429229736, + "learning_rate": 0.0005328358489838171, + "loss": 3.3906, + "step": 13295 + }, + { + "epoch": 0.65, + "grad_norm": 0.5340069532394409, + "learning_rate": 0.0005328261400572254, + "loss": 3.1015, + "step": 13296 + }, + { + "epoch": 0.65, + "grad_norm": 0.5175917744636536, + "learning_rate": 0.0005328164305174146, + "loss": 3.2171, + "step": 13297 + }, + { + "epoch": 0.65, + "grad_norm": 0.5808233022689819, + "learning_rate": 0.00053280672036441, + "loss": 3.2978, + "step": 13298 + }, + { + "epoch": 0.65, + "grad_norm": 0.4955079257488251, + "learning_rate": 0.0005327970095982372, + "loss": 3.1519, + "step": 13299 + }, + { + "epoch": 0.65, + "grad_norm": 0.6096027493476868, + "learning_rate": 0.0005327872982189221, + "loss": 3.2015, + "step": 13300 + }, + { + "epoch": 0.65, + "grad_norm": 0.5957887172698975, + "learning_rate": 0.0005327775862264899, + "loss": 3.286, + "step": 13301 + }, + { + "epoch": 0.65, + "grad_norm": 0.5077028274536133, + "learning_rate": 0.0005327678736209664, + "loss": 3.1815, + "step": 13302 + }, + { + "epoch": 0.65, + "grad_norm": 0.5380714535713196, + "learning_rate": 0.0005327581604023772, + "loss": 3.4044, + "step": 13303 + }, + { + "epoch": 0.65, + "grad_norm": 0.6547355651855469, + "learning_rate": 0.0005327484465707477, + "loss": 3.2944, + "step": 13304 + }, + { + "epoch": 0.65, + "grad_norm": 0.5356435179710388, + "learning_rate": 0.0005327387321261035, + "loss": 2.9592, + "step": 13305 + }, + { + "epoch": 0.65, + "grad_norm": 0.48445892333984375, + "learning_rate": 0.0005327290170684705, + "loss": 3.4262, + "step": 13306 + }, + { + "epoch": 0.65, + "grad_norm": 0.5008205771446228, + "learning_rate": 0.0005327193013978739, + "loss": 3.0121, + "step": 13307 + }, + { + "epoch": 0.65, + "grad_norm": 0.5113664269447327, + "learning_rate": 0.0005327095851143394, + "loss": 3.3788, + "step": 13308 + }, + { + "epoch": 0.65, + "grad_norm": 0.5016259551048279, + "learning_rate": 0.0005326998682178927, + "loss": 3.0934, + "step": 13309 + }, + { + "epoch": 0.65, + "grad_norm": 0.48941028118133545, + "learning_rate": 0.0005326901507085594, + "loss": 3.364, + "step": 13310 + }, + { + "epoch": 0.65, + "grad_norm": 0.49986913800239563, + "learning_rate": 0.000532680432586365, + "loss": 3.3927, + "step": 13311 + }, + { + "epoch": 0.65, + "grad_norm": 0.5154104828834534, + "learning_rate": 0.0005326707138513351, + "loss": 3.3246, + "step": 13312 + }, + { + "epoch": 0.65, + "grad_norm": 0.5213533639907837, + "learning_rate": 0.0005326609945034953, + "loss": 3.1087, + "step": 13313 + }, + { + "epoch": 0.65, + "grad_norm": 0.5237452387809753, + "learning_rate": 0.0005326512745428713, + "loss": 3.347, + "step": 13314 + }, + { + "epoch": 0.65, + "grad_norm": 0.49681347608566284, + "learning_rate": 0.0005326415539694885, + "loss": 3.1432, + "step": 13315 + }, + { + "epoch": 0.65, + "grad_norm": 0.5009694695472717, + "learning_rate": 0.0005326318327833726, + "loss": 3.307, + "step": 13316 + }, + { + "epoch": 0.65, + "grad_norm": 0.5398823618888855, + "learning_rate": 0.0005326221109845493, + "loss": 3.1889, + "step": 13317 + }, + { + "epoch": 0.65, + "grad_norm": 0.4797592759132385, + "learning_rate": 0.0005326123885730441, + "loss": 3.2475, + "step": 13318 + }, + { + "epoch": 0.65, + "grad_norm": 0.5444583296775818, + "learning_rate": 0.0005326026655488827, + "loss": 3.0342, + "step": 13319 + }, + { + "epoch": 0.65, + "grad_norm": 0.5243876576423645, + "learning_rate": 0.0005325929419120906, + "loss": 3.2708, + "step": 13320 + }, + { + "epoch": 0.65, + "grad_norm": 0.46254366636276245, + "learning_rate": 0.0005325832176626934, + "loss": 2.979, + "step": 13321 + }, + { + "epoch": 0.65, + "grad_norm": 0.5210396647453308, + "learning_rate": 0.0005325734928007168, + "loss": 3.1684, + "step": 13322 + }, + { + "epoch": 0.65, + "grad_norm": 0.5120729207992554, + "learning_rate": 0.0005325637673261864, + "loss": 3.1615, + "step": 13323 + }, + { + "epoch": 0.65, + "grad_norm": 0.5329714417457581, + "learning_rate": 0.0005325540412391279, + "loss": 3.3009, + "step": 13324 + }, + { + "epoch": 0.65, + "grad_norm": 0.5156061053276062, + "learning_rate": 0.0005325443145395666, + "loss": 3.1694, + "step": 13325 + }, + { + "epoch": 0.65, + "grad_norm": 0.5308995842933655, + "learning_rate": 0.0005325345872275285, + "loss": 3.163, + "step": 13326 + }, + { + "epoch": 0.65, + "grad_norm": 0.5136690735816956, + "learning_rate": 0.0005325248593030389, + "loss": 3.5507, + "step": 13327 + }, + { + "epoch": 0.65, + "grad_norm": 0.5340861082077026, + "learning_rate": 0.0005325151307661237, + "loss": 3.4215, + "step": 13328 + }, + { + "epoch": 0.65, + "grad_norm": 0.6588315367698669, + "learning_rate": 0.0005325054016168083, + "loss": 3.4133, + "step": 13329 + }, + { + "epoch": 0.65, + "grad_norm": 0.49800702929496765, + "learning_rate": 0.0005324956718551185, + "loss": 3.2623, + "step": 13330 + }, + { + "epoch": 0.65, + "grad_norm": 0.5641326308250427, + "learning_rate": 0.0005324859414810798, + "loss": 3.0368, + "step": 13331 + }, + { + "epoch": 0.65, + "grad_norm": 0.49139222502708435, + "learning_rate": 0.0005324762104947179, + "loss": 3.432, + "step": 13332 + }, + { + "epoch": 0.65, + "grad_norm": 0.5017745494842529, + "learning_rate": 0.0005324664788960583, + "loss": 3.1099, + "step": 13333 + }, + { + "epoch": 0.65, + "grad_norm": 0.5510955452919006, + "learning_rate": 0.0005324567466851269, + "loss": 3.08, + "step": 13334 + }, + { + "epoch": 0.65, + "grad_norm": 0.5165101885795593, + "learning_rate": 0.0005324470138619492, + "loss": 3.2788, + "step": 13335 + }, + { + "epoch": 0.65, + "grad_norm": 0.5383206605911255, + "learning_rate": 0.0005324372804265505, + "loss": 3.1441, + "step": 13336 + }, + { + "epoch": 0.65, + "grad_norm": 0.5414730906486511, + "learning_rate": 0.000532427546378957, + "loss": 3.2794, + "step": 13337 + }, + { + "epoch": 0.65, + "grad_norm": 0.5007972717285156, + "learning_rate": 0.000532417811719194, + "loss": 3.2736, + "step": 13338 + }, + { + "epoch": 0.65, + "grad_norm": 0.543908953666687, + "learning_rate": 0.0005324080764472871, + "loss": 2.9989, + "step": 13339 + }, + { + "epoch": 0.65, + "grad_norm": 0.5582568049430847, + "learning_rate": 0.0005323983405632623, + "loss": 3.1517, + "step": 13340 + }, + { + "epoch": 0.65, + "grad_norm": 0.4982832968235016, + "learning_rate": 0.0005323886040671448, + "loss": 3.2608, + "step": 13341 + }, + { + "epoch": 0.65, + "grad_norm": 0.49134111404418945, + "learning_rate": 0.0005323788669589606, + "loss": 3.1525, + "step": 13342 + }, + { + "epoch": 0.65, + "grad_norm": 0.5180835723876953, + "learning_rate": 0.000532369129238735, + "loss": 3.0439, + "step": 13343 + }, + { + "epoch": 0.65, + "grad_norm": 0.49943089485168457, + "learning_rate": 0.000532359390906494, + "loss": 3.311, + "step": 13344 + }, + { + "epoch": 0.65, + "grad_norm": 0.5632797479629517, + "learning_rate": 0.0005323496519622629, + "loss": 3.1605, + "step": 13345 + }, + { + "epoch": 0.65, + "grad_norm": 0.4958808720111847, + "learning_rate": 0.0005323399124060677, + "loss": 3.3853, + "step": 13346 + }, + { + "epoch": 0.65, + "grad_norm": 0.5369716286659241, + "learning_rate": 0.0005323301722379338, + "loss": 3.6639, + "step": 13347 + }, + { + "epoch": 0.65, + "grad_norm": 0.5046685934066772, + "learning_rate": 0.000532320431457887, + "loss": 3.2543, + "step": 13348 + }, + { + "epoch": 0.65, + "grad_norm": 0.528698205947876, + "learning_rate": 0.0005323106900659529, + "loss": 2.9939, + "step": 13349 + }, + { + "epoch": 0.65, + "grad_norm": 0.5998996496200562, + "learning_rate": 0.0005323009480621571, + "loss": 3.2342, + "step": 13350 + }, + { + "epoch": 0.65, + "grad_norm": 0.5694158673286438, + "learning_rate": 0.0005322912054465253, + "loss": 3.0568, + "step": 13351 + }, + { + "epoch": 0.65, + "grad_norm": 0.4973612129688263, + "learning_rate": 0.0005322814622190831, + "loss": 3.1143, + "step": 13352 + }, + { + "epoch": 0.65, + "grad_norm": 0.5280294418334961, + "learning_rate": 0.0005322717183798564, + "loss": 3.1217, + "step": 13353 + }, + { + "epoch": 0.65, + "grad_norm": 0.507739245891571, + "learning_rate": 0.0005322619739288706, + "loss": 3.0354, + "step": 13354 + }, + { + "epoch": 0.65, + "grad_norm": 0.5100513696670532, + "learning_rate": 0.0005322522288661515, + "loss": 3.0251, + "step": 13355 + }, + { + "epoch": 0.65, + "grad_norm": 0.49843087792396545, + "learning_rate": 0.0005322424831917247, + "loss": 3.152, + "step": 13356 + }, + { + "epoch": 0.65, + "grad_norm": 0.5560824275016785, + "learning_rate": 0.000532232736905616, + "loss": 3.3028, + "step": 13357 + }, + { + "epoch": 0.65, + "grad_norm": 0.5524668097496033, + "learning_rate": 0.0005322229900078507, + "loss": 3.2832, + "step": 13358 + }, + { + "epoch": 0.65, + "grad_norm": 0.5191108584403992, + "learning_rate": 0.0005322132424984549, + "loss": 3.136, + "step": 13359 + }, + { + "epoch": 0.65, + "grad_norm": 0.49663254618644714, + "learning_rate": 0.0005322034943774542, + "loss": 3.0543, + "step": 13360 + }, + { + "epoch": 0.65, + "grad_norm": 0.5162781476974487, + "learning_rate": 0.0005321937456448741, + "loss": 3.2929, + "step": 13361 + }, + { + "epoch": 0.65, + "grad_norm": 0.5527589321136475, + "learning_rate": 0.0005321839963007402, + "loss": 3.4472, + "step": 13362 + }, + { + "epoch": 0.65, + "grad_norm": 0.539014458656311, + "learning_rate": 0.0005321742463450786, + "loss": 3.2797, + "step": 13363 + }, + { + "epoch": 0.65, + "grad_norm": 0.5228904485702515, + "learning_rate": 0.0005321644957779146, + "loss": 3.2703, + "step": 13364 + }, + { + "epoch": 0.65, + "grad_norm": 0.5191890597343445, + "learning_rate": 0.000532154744599274, + "loss": 3.118, + "step": 13365 + }, + { + "epoch": 0.66, + "grad_norm": 0.5481828451156616, + "learning_rate": 0.0005321449928091825, + "loss": 3.2102, + "step": 13366 + }, + { + "epoch": 0.66, + "grad_norm": 0.5154120326042175, + "learning_rate": 0.0005321352404076659, + "loss": 3.273, + "step": 13367 + }, + { + "epoch": 0.66, + "grad_norm": 0.5242748260498047, + "learning_rate": 0.0005321254873947495, + "loss": 3.2006, + "step": 13368 + }, + { + "epoch": 0.66, + "grad_norm": 0.5365203619003296, + "learning_rate": 0.0005321157337704594, + "loss": 2.8698, + "step": 13369 + }, + { + "epoch": 0.66, + "grad_norm": 0.5272378325462341, + "learning_rate": 0.000532105979534821, + "loss": 3.1841, + "step": 13370 + }, + { + "epoch": 0.66, + "grad_norm": 0.5126152634620667, + "learning_rate": 0.0005320962246878602, + "loss": 3.1408, + "step": 13371 + }, + { + "epoch": 0.66, + "grad_norm": 0.5221778154373169, + "learning_rate": 0.0005320864692296026, + "loss": 3.2598, + "step": 13372 + }, + { + "epoch": 0.66, + "grad_norm": 0.5323341488838196, + "learning_rate": 0.000532076713160074, + "loss": 3.3795, + "step": 13373 + }, + { + "epoch": 0.66, + "grad_norm": 0.5273595452308655, + "learning_rate": 0.0005320669564792999, + "loss": 3.5628, + "step": 13374 + }, + { + "epoch": 0.66, + "grad_norm": 0.5162096619606018, + "learning_rate": 0.0005320571991873061, + "loss": 3.1631, + "step": 13375 + }, + { + "epoch": 0.66, + "grad_norm": 0.49325329065322876, + "learning_rate": 0.0005320474412841183, + "loss": 3.4696, + "step": 13376 + }, + { + "epoch": 0.66, + "grad_norm": 0.5000008344650269, + "learning_rate": 0.0005320376827697622, + "loss": 3.1889, + "step": 13377 + }, + { + "epoch": 0.66, + "grad_norm": 0.5023176074028015, + "learning_rate": 0.0005320279236442635, + "loss": 3.2494, + "step": 13378 + }, + { + "epoch": 0.66, + "grad_norm": 0.4893326759338379, + "learning_rate": 0.000532018163907648, + "loss": 3.1743, + "step": 13379 + }, + { + "epoch": 0.66, + "grad_norm": 0.5529983639717102, + "learning_rate": 0.0005320084035599413, + "loss": 3.0792, + "step": 13380 + }, + { + "epoch": 0.66, + "grad_norm": 0.5161319375038147, + "learning_rate": 0.000531998642601169, + "loss": 3.2193, + "step": 13381 + }, + { + "epoch": 0.66, + "grad_norm": 0.4804127812385559, + "learning_rate": 0.000531988881031357, + "loss": 3.3955, + "step": 13382 + }, + { + "epoch": 0.66, + "grad_norm": 0.5218368172645569, + "learning_rate": 0.0005319791188505309, + "loss": 3.2941, + "step": 13383 + }, + { + "epoch": 0.66, + "grad_norm": 0.5038884878158569, + "learning_rate": 0.0005319693560587164, + "loss": 3.384, + "step": 13384 + }, + { + "epoch": 0.66, + "grad_norm": 0.503178060054779, + "learning_rate": 0.0005319595926559392, + "loss": 3.2356, + "step": 13385 + }, + { + "epoch": 0.66, + "grad_norm": 0.48960548639297485, + "learning_rate": 0.0005319498286422252, + "loss": 3.1029, + "step": 13386 + }, + { + "epoch": 0.66, + "grad_norm": 0.5522580146789551, + "learning_rate": 0.0005319400640176, + "loss": 3.2378, + "step": 13387 + }, + { + "epoch": 0.66, + "grad_norm": 0.5147445201873779, + "learning_rate": 0.0005319302987820894, + "loss": 3.3786, + "step": 13388 + }, + { + "epoch": 0.66, + "grad_norm": 0.5028669238090515, + "learning_rate": 0.0005319205329357188, + "loss": 3.2985, + "step": 13389 + }, + { + "epoch": 0.66, + "grad_norm": 0.5062679648399353, + "learning_rate": 0.0005319107664785144, + "loss": 3.2392, + "step": 13390 + }, + { + "epoch": 0.66, + "grad_norm": 0.5048500895500183, + "learning_rate": 0.0005319009994105014, + "loss": 3.3131, + "step": 13391 + }, + { + "epoch": 0.66, + "grad_norm": 0.5031437277793884, + "learning_rate": 0.000531891231731706, + "loss": 3.485, + "step": 13392 + }, + { + "epoch": 0.66, + "grad_norm": 0.5089054703712463, + "learning_rate": 0.0005318814634421537, + "loss": 3.082, + "step": 13393 + }, + { + "epoch": 0.66, + "grad_norm": 0.5406572222709656, + "learning_rate": 0.0005318716945418701, + "loss": 3.1211, + "step": 13394 + }, + { + "epoch": 0.66, + "grad_norm": 0.5037758350372314, + "learning_rate": 0.0005318619250308812, + "loss": 3.055, + "step": 13395 + }, + { + "epoch": 0.66, + "grad_norm": 0.48916196823120117, + "learning_rate": 0.0005318521549092126, + "loss": 3.3232, + "step": 13396 + }, + { + "epoch": 0.66, + "grad_norm": 0.5082511305809021, + "learning_rate": 0.0005318423841768901, + "loss": 3.4669, + "step": 13397 + }, + { + "epoch": 0.66, + "grad_norm": 0.5099027156829834, + "learning_rate": 0.0005318326128339393, + "loss": 3.3092, + "step": 13398 + }, + { + "epoch": 0.66, + "grad_norm": 0.5375298261642456, + "learning_rate": 0.0005318228408803861, + "loss": 3.4794, + "step": 13399 + }, + { + "epoch": 0.66, + "grad_norm": 0.5250174403190613, + "learning_rate": 0.0005318130683162561, + "loss": 3.2933, + "step": 13400 + }, + { + "epoch": 0.66, + "grad_norm": 0.5176973938941956, + "learning_rate": 0.0005318032951415751, + "loss": 3.2749, + "step": 13401 + }, + { + "epoch": 0.66, + "grad_norm": 0.5219639539718628, + "learning_rate": 0.0005317935213563687, + "loss": 3.4369, + "step": 13402 + }, + { + "epoch": 0.66, + "grad_norm": 0.49903932213783264, + "learning_rate": 0.000531783746960663, + "loss": 3.1772, + "step": 13403 + }, + { + "epoch": 0.66, + "grad_norm": 0.5053322911262512, + "learning_rate": 0.0005317739719544834, + "loss": 3.415, + "step": 13404 + }, + { + "epoch": 0.66, + "grad_norm": 0.5507119297981262, + "learning_rate": 0.0005317641963378557, + "loss": 3.0963, + "step": 13405 + }, + { + "epoch": 0.66, + "grad_norm": 0.5222508311271667, + "learning_rate": 0.0005317544201108058, + "loss": 3.2718, + "step": 13406 + }, + { + "epoch": 0.66, + "grad_norm": 0.509367048740387, + "learning_rate": 0.0005317446432733594, + "loss": 3.0838, + "step": 13407 + }, + { + "epoch": 0.66, + "grad_norm": 0.5182288885116577, + "learning_rate": 0.0005317348658255421, + "loss": 3.1657, + "step": 13408 + }, + { + "epoch": 0.66, + "grad_norm": 0.5248708128929138, + "learning_rate": 0.0005317250877673798, + "loss": 3.3512, + "step": 13409 + }, + { + "epoch": 0.66, + "grad_norm": 0.5264621376991272, + "learning_rate": 0.0005317153090988983, + "loss": 3.1901, + "step": 13410 + }, + { + "epoch": 0.66, + "grad_norm": 0.5019949078559875, + "learning_rate": 0.0005317055298201232, + "loss": 3.4335, + "step": 13411 + }, + { + "epoch": 0.66, + "grad_norm": 0.5084396004676819, + "learning_rate": 0.0005316957499310802, + "loss": 3.3224, + "step": 13412 + }, + { + "epoch": 0.66, + "grad_norm": 0.5858707427978516, + "learning_rate": 0.0005316859694317954, + "loss": 3.0917, + "step": 13413 + }, + { + "epoch": 0.66, + "grad_norm": 0.5144307613372803, + "learning_rate": 0.0005316761883222943, + "loss": 3.4099, + "step": 13414 + }, + { + "epoch": 0.66, + "grad_norm": 0.5141854286193848, + "learning_rate": 0.0005316664066026026, + "loss": 3.2635, + "step": 13415 + }, + { + "epoch": 0.66, + "grad_norm": 0.5495926737785339, + "learning_rate": 0.0005316566242727463, + "loss": 3.3313, + "step": 13416 + }, + { + "epoch": 0.66, + "grad_norm": 0.5578013062477112, + "learning_rate": 0.000531646841332751, + "loss": 3.4108, + "step": 13417 + }, + { + "epoch": 0.66, + "grad_norm": 0.5008851885795593, + "learning_rate": 0.0005316370577826424, + "loss": 3.2526, + "step": 13418 + }, + { + "epoch": 0.66, + "grad_norm": 0.5154222846031189, + "learning_rate": 0.0005316272736224464, + "loss": 3.2379, + "step": 13419 + }, + { + "epoch": 0.66, + "grad_norm": 0.527387261390686, + "learning_rate": 0.0005316174888521888, + "loss": 3.3232, + "step": 13420 + }, + { + "epoch": 0.66, + "grad_norm": 0.5717757344245911, + "learning_rate": 0.0005316077034718952, + "loss": 3.1741, + "step": 13421 + }, + { + "epoch": 0.66, + "grad_norm": 0.5230823159217834, + "learning_rate": 0.0005315979174815916, + "loss": 3.273, + "step": 13422 + }, + { + "epoch": 0.66, + "grad_norm": 0.521920382976532, + "learning_rate": 0.0005315881308813038, + "loss": 3.2103, + "step": 13423 + }, + { + "epoch": 0.66, + "grad_norm": 0.5073776245117188, + "learning_rate": 0.0005315783436710572, + "loss": 3.25, + "step": 13424 + }, + { + "epoch": 0.66, + "grad_norm": 0.5398444533348083, + "learning_rate": 0.0005315685558508779, + "loss": 3.1903, + "step": 13425 + }, + { + "epoch": 0.66, + "grad_norm": 0.4858630299568176, + "learning_rate": 0.0005315587674207914, + "loss": 3.2599, + "step": 13426 + }, + { + "epoch": 0.66, + "grad_norm": 0.5114261507987976, + "learning_rate": 0.0005315489783808239, + "loss": 3.1523, + "step": 13427 + }, + { + "epoch": 0.66, + "grad_norm": 0.5169461965560913, + "learning_rate": 0.0005315391887310009, + "loss": 3.1635, + "step": 13428 + }, + { + "epoch": 0.66, + "grad_norm": 0.49738162755966187, + "learning_rate": 0.0005315293984713482, + "loss": 3.0964, + "step": 13429 + }, + { + "epoch": 0.66, + "grad_norm": 0.5174451470375061, + "learning_rate": 0.0005315196076018917, + "loss": 3.2218, + "step": 13430 + }, + { + "epoch": 0.66, + "grad_norm": 0.5436557531356812, + "learning_rate": 0.000531509816122657, + "loss": 3.0565, + "step": 13431 + }, + { + "epoch": 0.66, + "grad_norm": 0.5209143161773682, + "learning_rate": 0.0005315000240336702, + "loss": 3.1762, + "step": 13432 + }, + { + "epoch": 0.66, + "grad_norm": 0.5072414875030518, + "learning_rate": 0.0005314902313349566, + "loss": 3.4239, + "step": 13433 + }, + { + "epoch": 0.66, + "grad_norm": 0.4978218674659729, + "learning_rate": 0.0005314804380265425, + "loss": 3.3271, + "step": 13434 + }, + { + "epoch": 0.66, + "grad_norm": 0.5045718550682068, + "learning_rate": 0.0005314706441084535, + "loss": 3.0421, + "step": 13435 + }, + { + "epoch": 0.66, + "grad_norm": 0.5093064904212952, + "learning_rate": 0.0005314608495807151, + "loss": 3.2584, + "step": 13436 + }, + { + "epoch": 0.66, + "grad_norm": 0.5230816602706909, + "learning_rate": 0.0005314510544433536, + "loss": 3.1959, + "step": 13437 + }, + { + "epoch": 0.66, + "grad_norm": 0.547500491142273, + "learning_rate": 0.0005314412586963945, + "loss": 3.1111, + "step": 13438 + }, + { + "epoch": 0.66, + "grad_norm": 0.5420114398002625, + "learning_rate": 0.0005314314623398637, + "loss": 3.2622, + "step": 13439 + }, + { + "epoch": 0.66, + "grad_norm": 0.5222113728523254, + "learning_rate": 0.0005314216653737869, + "loss": 3.1762, + "step": 13440 + }, + { + "epoch": 0.66, + "grad_norm": 0.5479421615600586, + "learning_rate": 0.00053141186779819, + "loss": 3.0739, + "step": 13441 + }, + { + "epoch": 0.66, + "grad_norm": 0.5246508121490479, + "learning_rate": 0.0005314020696130989, + "loss": 3.2307, + "step": 13442 + }, + { + "epoch": 0.66, + "grad_norm": 0.5006526112556458, + "learning_rate": 0.0005313922708185391, + "loss": 3.3417, + "step": 13443 + }, + { + "epoch": 0.66, + "grad_norm": 0.5077061653137207, + "learning_rate": 0.0005313824714145367, + "loss": 3.0799, + "step": 13444 + }, + { + "epoch": 0.66, + "grad_norm": 0.4841029942035675, + "learning_rate": 0.0005313726714011173, + "loss": 3.1433, + "step": 13445 + }, + { + "epoch": 0.66, + "grad_norm": 0.5092795491218567, + "learning_rate": 0.000531362870778307, + "loss": 3.2824, + "step": 13446 + }, + { + "epoch": 0.66, + "grad_norm": 0.5002308487892151, + "learning_rate": 0.0005313530695461313, + "loss": 3.2313, + "step": 13447 + }, + { + "epoch": 0.66, + "grad_norm": 0.5429970026016235, + "learning_rate": 0.0005313432677046162, + "loss": 3.2798, + "step": 13448 + }, + { + "epoch": 0.66, + "grad_norm": 0.5142826437950134, + "learning_rate": 0.0005313334652537873, + "loss": 3.0374, + "step": 13449 + }, + { + "epoch": 0.66, + "grad_norm": 0.5216500759124756, + "learning_rate": 0.0005313236621936707, + "loss": 3.401, + "step": 13450 + }, + { + "epoch": 0.66, + "grad_norm": 0.5026384592056274, + "learning_rate": 0.0005313138585242921, + "loss": 3.2493, + "step": 13451 + }, + { + "epoch": 0.66, + "grad_norm": 0.5115340352058411, + "learning_rate": 0.0005313040542456772, + "loss": 3.0881, + "step": 13452 + }, + { + "epoch": 0.66, + "grad_norm": 0.5016939043998718, + "learning_rate": 0.0005312942493578519, + "loss": 3.0879, + "step": 13453 + }, + { + "epoch": 0.66, + "grad_norm": 0.459194540977478, + "learning_rate": 0.0005312844438608423, + "loss": 2.9753, + "step": 13454 + }, + { + "epoch": 0.66, + "grad_norm": 0.5059602856636047, + "learning_rate": 0.0005312746377546739, + "loss": 3.1455, + "step": 13455 + }, + { + "epoch": 0.66, + "grad_norm": 0.49334749579429626, + "learning_rate": 0.0005312648310393726, + "loss": 3.1937, + "step": 13456 + }, + { + "epoch": 0.66, + "grad_norm": 0.5341232419013977, + "learning_rate": 0.0005312550237149641, + "loss": 3.2977, + "step": 13457 + }, + { + "epoch": 0.66, + "grad_norm": 0.5050813555717468, + "learning_rate": 0.0005312452157814746, + "loss": 3.3302, + "step": 13458 + }, + { + "epoch": 0.66, + "grad_norm": 0.5001168251037598, + "learning_rate": 0.0005312354072389296, + "loss": 3.0763, + "step": 13459 + }, + { + "epoch": 0.66, + "grad_norm": 0.5147970914840698, + "learning_rate": 0.000531225598087355, + "loss": 3.235, + "step": 13460 + }, + { + "epoch": 0.66, + "grad_norm": 0.529277503490448, + "learning_rate": 0.0005312157883267767, + "loss": 3.2548, + "step": 13461 + }, + { + "epoch": 0.66, + "grad_norm": 0.49942076206207275, + "learning_rate": 0.0005312059779572205, + "loss": 3.106, + "step": 13462 + }, + { + "epoch": 0.66, + "grad_norm": 0.5275735259056091, + "learning_rate": 0.0005311961669787124, + "loss": 3.0986, + "step": 13463 + }, + { + "epoch": 0.66, + "grad_norm": 0.5004652738571167, + "learning_rate": 0.0005311863553912778, + "loss": 3.2899, + "step": 13464 + }, + { + "epoch": 0.66, + "grad_norm": 0.5354148149490356, + "learning_rate": 0.0005311765431949431, + "loss": 3.2583, + "step": 13465 + }, + { + "epoch": 0.66, + "grad_norm": 0.5163960456848145, + "learning_rate": 0.0005311667303897337, + "loss": 3.1102, + "step": 13466 + }, + { + "epoch": 0.66, + "grad_norm": 0.5226514935493469, + "learning_rate": 0.0005311569169756757, + "loss": 3.1032, + "step": 13467 + }, + { + "epoch": 0.66, + "grad_norm": 0.5380831956863403, + "learning_rate": 0.000531147102952795, + "loss": 3.3771, + "step": 13468 + }, + { + "epoch": 0.66, + "grad_norm": 0.5179706811904907, + "learning_rate": 0.0005311372883211171, + "loss": 3.2395, + "step": 13469 + }, + { + "epoch": 0.66, + "grad_norm": 0.5135579705238342, + "learning_rate": 0.0005311274730806681, + "loss": 3.3656, + "step": 13470 + }, + { + "epoch": 0.66, + "grad_norm": 0.500043511390686, + "learning_rate": 0.0005311176572314739, + "loss": 3.354, + "step": 13471 + }, + { + "epoch": 0.66, + "grad_norm": 0.5214130282402039, + "learning_rate": 0.0005311078407735602, + "loss": 3.2447, + "step": 13472 + }, + { + "epoch": 0.66, + "grad_norm": 0.5328757762908936, + "learning_rate": 0.000531098023706953, + "loss": 3.0472, + "step": 13473 + }, + { + "epoch": 0.66, + "grad_norm": 0.49766138195991516, + "learning_rate": 0.000531088206031678, + "loss": 3.2254, + "step": 13474 + }, + { + "epoch": 0.66, + "grad_norm": 0.5517077445983887, + "learning_rate": 0.0005310783877477612, + "loss": 3.3586, + "step": 13475 + }, + { + "epoch": 0.66, + "grad_norm": 0.5423430800437927, + "learning_rate": 0.0005310685688552284, + "loss": 3.0909, + "step": 13476 + }, + { + "epoch": 0.66, + "grad_norm": 0.519158124923706, + "learning_rate": 0.0005310587493541054, + "loss": 3.1953, + "step": 13477 + }, + { + "epoch": 0.66, + "grad_norm": 0.5957249999046326, + "learning_rate": 0.0005310489292444182, + "loss": 3.1486, + "step": 13478 + }, + { + "epoch": 0.66, + "grad_norm": 0.5332064032554626, + "learning_rate": 0.0005310391085261926, + "loss": 3.3535, + "step": 13479 + }, + { + "epoch": 0.66, + "grad_norm": 0.5151717066764832, + "learning_rate": 0.0005310292871994544, + "loss": 3.2729, + "step": 13480 + }, + { + "epoch": 0.66, + "grad_norm": 0.5478836297988892, + "learning_rate": 0.0005310194652642295, + "loss": 3.1139, + "step": 13481 + }, + { + "epoch": 0.66, + "grad_norm": 0.5319055318832397, + "learning_rate": 0.0005310096427205437, + "loss": 3.1867, + "step": 13482 + }, + { + "epoch": 0.66, + "grad_norm": 0.5295076966285706, + "learning_rate": 0.0005309998195684231, + "loss": 3.4644, + "step": 13483 + }, + { + "epoch": 0.66, + "grad_norm": 0.5261130332946777, + "learning_rate": 0.0005309899958078934, + "loss": 3.1089, + "step": 13484 + }, + { + "epoch": 0.66, + "grad_norm": 0.504030168056488, + "learning_rate": 0.0005309801714389805, + "loss": 3.1605, + "step": 13485 + }, + { + "epoch": 0.66, + "grad_norm": 0.5086562037467957, + "learning_rate": 0.0005309703464617103, + "loss": 3.2354, + "step": 13486 + }, + { + "epoch": 0.66, + "grad_norm": 0.49703478813171387, + "learning_rate": 0.0005309605208761087, + "loss": 3.2152, + "step": 13487 + }, + { + "epoch": 0.66, + "grad_norm": 0.5037059783935547, + "learning_rate": 0.0005309506946822015, + "loss": 3.1068, + "step": 13488 + }, + { + "epoch": 0.66, + "grad_norm": 0.49587151408195496, + "learning_rate": 0.0005309408678800145, + "loss": 3.2943, + "step": 13489 + }, + { + "epoch": 0.66, + "grad_norm": 0.5075664520263672, + "learning_rate": 0.0005309310404695739, + "loss": 3.2823, + "step": 13490 + }, + { + "epoch": 0.66, + "grad_norm": 0.5008799433708191, + "learning_rate": 0.0005309212124509052, + "loss": 3.1233, + "step": 13491 + }, + { + "epoch": 0.66, + "grad_norm": 0.49640950560569763, + "learning_rate": 0.0005309113838240346, + "loss": 3.316, + "step": 13492 + }, + { + "epoch": 0.66, + "grad_norm": 0.5615620613098145, + "learning_rate": 0.0005309015545889878, + "loss": 3.3059, + "step": 13493 + }, + { + "epoch": 0.66, + "grad_norm": 0.5070755481719971, + "learning_rate": 0.0005308917247457907, + "loss": 3.2173, + "step": 13494 + }, + { + "epoch": 0.66, + "grad_norm": 0.48888513445854187, + "learning_rate": 0.0005308818942944693, + "loss": 3.1687, + "step": 13495 + }, + { + "epoch": 0.66, + "grad_norm": 0.5695043802261353, + "learning_rate": 0.0005308720632350494, + "loss": 3.2426, + "step": 13496 + }, + { + "epoch": 0.66, + "grad_norm": 0.533906102180481, + "learning_rate": 0.000530862231567557, + "loss": 3.1716, + "step": 13497 + }, + { + "epoch": 0.66, + "grad_norm": 0.5260034203529358, + "learning_rate": 0.0005308523992920178, + "loss": 3.2352, + "step": 13498 + }, + { + "epoch": 0.66, + "grad_norm": 0.5359194278717041, + "learning_rate": 0.0005308425664084579, + "loss": 3.2188, + "step": 13499 + }, + { + "epoch": 0.66, + "grad_norm": 0.49645182490348816, + "learning_rate": 0.0005308327329169029, + "loss": 3.2868, + "step": 13500 + }, + { + "epoch": 0.66, + "grad_norm": 0.5202005505561829, + "learning_rate": 0.0005308228988173792, + "loss": 3.3061, + "step": 13501 + }, + { + "epoch": 0.66, + "grad_norm": 0.5067334771156311, + "learning_rate": 0.0005308130641099122, + "loss": 3.0534, + "step": 13502 + }, + { + "epoch": 0.66, + "grad_norm": 0.5037301778793335, + "learning_rate": 0.0005308032287945281, + "loss": 3.3825, + "step": 13503 + }, + { + "epoch": 0.66, + "grad_norm": 0.6401285529136658, + "learning_rate": 0.0005307933928712527, + "loss": 3.1316, + "step": 13504 + }, + { + "epoch": 0.66, + "grad_norm": 0.516437292098999, + "learning_rate": 0.000530783556340112, + "loss": 3.095, + "step": 13505 + }, + { + "epoch": 0.66, + "grad_norm": 0.5353245139122009, + "learning_rate": 0.0005307737192011316, + "loss": 3.0556, + "step": 13506 + }, + { + "epoch": 0.66, + "grad_norm": 0.5425240993499756, + "learning_rate": 0.0005307638814543378, + "loss": 3.2073, + "step": 13507 + }, + { + "epoch": 0.66, + "grad_norm": 0.5350566506385803, + "learning_rate": 0.0005307540430997563, + "loss": 3.1884, + "step": 13508 + }, + { + "epoch": 0.66, + "grad_norm": 0.5366530418395996, + "learning_rate": 0.0005307442041374131, + "loss": 3.2401, + "step": 13509 + }, + { + "epoch": 0.66, + "grad_norm": 0.5320677161216736, + "learning_rate": 0.0005307343645673342, + "loss": 3.1042, + "step": 13510 + }, + { + "epoch": 0.66, + "grad_norm": 0.5887209177017212, + "learning_rate": 0.0005307245243895451, + "loss": 3.0003, + "step": 13511 + }, + { + "epoch": 0.66, + "grad_norm": 0.5200484991073608, + "learning_rate": 0.0005307146836040722, + "loss": 3.3779, + "step": 13512 + }, + { + "epoch": 0.66, + "grad_norm": 0.5377318263053894, + "learning_rate": 0.0005307048422109412, + "loss": 3.0319, + "step": 13513 + }, + { + "epoch": 0.66, + "grad_norm": 0.5469724535942078, + "learning_rate": 0.000530695000210178, + "loss": 3.1924, + "step": 13514 + }, + { + "epoch": 0.66, + "grad_norm": 0.5550596117973328, + "learning_rate": 0.0005306851576018086, + "loss": 3.2543, + "step": 13515 + }, + { + "epoch": 0.66, + "grad_norm": 0.49396204948425293, + "learning_rate": 0.0005306753143858588, + "loss": 3.2256, + "step": 13516 + }, + { + "epoch": 0.66, + "grad_norm": 0.49809572100639343, + "learning_rate": 0.0005306654705623547, + "loss": 3.362, + "step": 13517 + }, + { + "epoch": 0.66, + "grad_norm": 0.4874090254306793, + "learning_rate": 0.0005306556261313222, + "loss": 3.0973, + "step": 13518 + }, + { + "epoch": 0.66, + "grad_norm": 0.5111352801322937, + "learning_rate": 0.0005306457810927872, + "loss": 3.4853, + "step": 13519 + }, + { + "epoch": 0.66, + "grad_norm": 0.5032990574836731, + "learning_rate": 0.0005306359354467754, + "loss": 3.2901, + "step": 13520 + }, + { + "epoch": 0.66, + "grad_norm": 0.5103004574775696, + "learning_rate": 0.0005306260891933131, + "loss": 3.3242, + "step": 13521 + }, + { + "epoch": 0.66, + "grad_norm": 0.5028958916664124, + "learning_rate": 0.000530616242332426, + "loss": 3.232, + "step": 13522 + }, + { + "epoch": 0.66, + "grad_norm": 0.5463312864303589, + "learning_rate": 0.0005306063948641401, + "loss": 3.2748, + "step": 13523 + }, + { + "epoch": 0.66, + "grad_norm": 0.5075322985649109, + "learning_rate": 0.0005305965467884813, + "loss": 3.1649, + "step": 13524 + }, + { + "epoch": 0.66, + "grad_norm": 0.5177689790725708, + "learning_rate": 0.0005305866981054757, + "loss": 3.1198, + "step": 13525 + }, + { + "epoch": 0.66, + "grad_norm": 0.5131239295005798, + "learning_rate": 0.000530576848815149, + "loss": 3.29, + "step": 13526 + }, + { + "epoch": 0.66, + "grad_norm": 0.4989679455757141, + "learning_rate": 0.0005305669989175273, + "loss": 3.0434, + "step": 13527 + }, + { + "epoch": 0.66, + "grad_norm": 0.5141971111297607, + "learning_rate": 0.0005305571484126365, + "loss": 3.1655, + "step": 13528 + }, + { + "epoch": 0.66, + "grad_norm": 0.5180733799934387, + "learning_rate": 0.0005305472973005025, + "loss": 3.3093, + "step": 13529 + }, + { + "epoch": 0.66, + "grad_norm": 0.5635950565338135, + "learning_rate": 0.0005305374455811514, + "loss": 3.1966, + "step": 13530 + }, + { + "epoch": 0.66, + "grad_norm": 0.5000056624412537, + "learning_rate": 0.0005305275932546089, + "loss": 3.1471, + "step": 13531 + }, + { + "epoch": 0.66, + "grad_norm": 0.5147111415863037, + "learning_rate": 0.0005305177403209011, + "loss": 3.0226, + "step": 13532 + }, + { + "epoch": 0.66, + "grad_norm": 0.48060378432273865, + "learning_rate": 0.0005305078867800541, + "loss": 3.1014, + "step": 13533 + }, + { + "epoch": 0.66, + "grad_norm": 0.569724440574646, + "learning_rate": 0.0005304980326320935, + "loss": 3.2077, + "step": 13534 + }, + { + "epoch": 0.66, + "grad_norm": 0.5264111161231995, + "learning_rate": 0.0005304881778770455, + "loss": 3.0713, + "step": 13535 + }, + { + "epoch": 0.66, + "grad_norm": 0.512325644493103, + "learning_rate": 0.000530478322514936, + "loss": 3.3984, + "step": 13536 + }, + { + "epoch": 0.66, + "grad_norm": 0.5474256873130798, + "learning_rate": 0.000530468466545791, + "loss": 3.0077, + "step": 13537 + }, + { + "epoch": 0.66, + "grad_norm": 0.5356321334838867, + "learning_rate": 0.0005304586099696364, + "loss": 3.3486, + "step": 13538 + }, + { + "epoch": 0.66, + "grad_norm": 0.5049594044685364, + "learning_rate": 0.0005304487527864982, + "loss": 3.3997, + "step": 13539 + }, + { + "epoch": 0.66, + "grad_norm": 0.5442296862602234, + "learning_rate": 0.0005304388949964022, + "loss": 3.3842, + "step": 13540 + }, + { + "epoch": 0.66, + "grad_norm": 0.5284900665283203, + "learning_rate": 0.0005304290365993747, + "loss": 3.0788, + "step": 13541 + }, + { + "epoch": 0.66, + "grad_norm": 0.5183324217796326, + "learning_rate": 0.0005304191775954414, + "loss": 3.157, + "step": 13542 + }, + { + "epoch": 0.66, + "grad_norm": 0.49122077226638794, + "learning_rate": 0.0005304093179846281, + "loss": 3.0097, + "step": 13543 + }, + { + "epoch": 0.66, + "grad_norm": 0.5494361519813538, + "learning_rate": 0.0005303994577669612, + "loss": 3.0168, + "step": 13544 + }, + { + "epoch": 0.66, + "grad_norm": 0.5453931093215942, + "learning_rate": 0.0005303895969424665, + "loss": 3.2503, + "step": 13545 + }, + { + "epoch": 0.66, + "grad_norm": 0.5414052605628967, + "learning_rate": 0.0005303797355111699, + "loss": 3.0531, + "step": 13546 + }, + { + "epoch": 0.66, + "grad_norm": 0.5246322751045227, + "learning_rate": 0.0005303698734730974, + "loss": 3.3351, + "step": 13547 + }, + { + "epoch": 0.66, + "grad_norm": 0.5333435535430908, + "learning_rate": 0.0005303600108282749, + "loss": 3.4273, + "step": 13548 + }, + { + "epoch": 0.66, + "grad_norm": 0.511567234992981, + "learning_rate": 0.0005303501475767287, + "loss": 3.1325, + "step": 13549 + }, + { + "epoch": 0.66, + "grad_norm": 0.4871523082256317, + "learning_rate": 0.0005303402837184844, + "loss": 3.2711, + "step": 13550 + }, + { + "epoch": 0.66, + "grad_norm": 0.5091209411621094, + "learning_rate": 0.0005303304192535681, + "loss": 3.3666, + "step": 13551 + }, + { + "epoch": 0.66, + "grad_norm": 0.5344579815864563, + "learning_rate": 0.0005303205541820058, + "loss": 3.1501, + "step": 13552 + }, + { + "epoch": 0.66, + "grad_norm": 0.5157179832458496, + "learning_rate": 0.0005303106885038235, + "loss": 3.2942, + "step": 13553 + }, + { + "epoch": 0.66, + "grad_norm": 0.4973103404045105, + "learning_rate": 0.0005303008222190472, + "loss": 3.1057, + "step": 13554 + }, + { + "epoch": 0.66, + "grad_norm": 0.5082616209983826, + "learning_rate": 0.0005302909553277029, + "loss": 3.079, + "step": 13555 + }, + { + "epoch": 0.66, + "grad_norm": 0.491454154253006, + "learning_rate": 0.0005302810878298165, + "loss": 3.2827, + "step": 13556 + }, + { + "epoch": 0.66, + "grad_norm": 0.5277991890907288, + "learning_rate": 0.000530271219725414, + "loss": 3.3536, + "step": 13557 + }, + { + "epoch": 0.66, + "grad_norm": 0.49939367175102234, + "learning_rate": 0.0005302613510145215, + "loss": 3.1075, + "step": 13558 + }, + { + "epoch": 0.66, + "grad_norm": 0.5459829568862915, + "learning_rate": 0.0005302514816971648, + "loss": 3.2634, + "step": 13559 + }, + { + "epoch": 0.66, + "grad_norm": 0.5232791304588318, + "learning_rate": 0.0005302416117733701, + "loss": 3.1575, + "step": 13560 + }, + { + "epoch": 0.66, + "grad_norm": 0.5249155163764954, + "learning_rate": 0.0005302317412431632, + "loss": 3.1753, + "step": 13561 + }, + { + "epoch": 0.66, + "grad_norm": 0.5318909883499146, + "learning_rate": 0.0005302218701065703, + "loss": 3.177, + "step": 13562 + }, + { + "epoch": 0.66, + "grad_norm": 0.49357926845550537, + "learning_rate": 0.0005302119983636174, + "loss": 3.2127, + "step": 13563 + }, + { + "epoch": 0.66, + "grad_norm": 0.5603306293487549, + "learning_rate": 0.0005302021260143303, + "loss": 3.2759, + "step": 13564 + }, + { + "epoch": 0.66, + "grad_norm": 0.49928614497184753, + "learning_rate": 0.0005301922530587351, + "loss": 3.3354, + "step": 13565 + }, + { + "epoch": 0.66, + "grad_norm": 0.5590007901191711, + "learning_rate": 0.0005301823794968577, + "loss": 3.1456, + "step": 13566 + }, + { + "epoch": 0.66, + "grad_norm": 0.5050297975540161, + "learning_rate": 0.0005301725053287243, + "loss": 3.1094, + "step": 13567 + }, + { + "epoch": 0.66, + "grad_norm": 0.532940149307251, + "learning_rate": 0.0005301626305543608, + "loss": 3.2555, + "step": 13568 + }, + { + "epoch": 0.66, + "grad_norm": 0.511152982711792, + "learning_rate": 0.0005301527551737933, + "loss": 3.3773, + "step": 13569 + }, + { + "epoch": 0.67, + "grad_norm": 0.5149011611938477, + "learning_rate": 0.0005301428791870476, + "loss": 3.2087, + "step": 13570 + }, + { + "epoch": 0.67, + "grad_norm": 0.5097518563270569, + "learning_rate": 0.00053013300259415, + "loss": 3.2675, + "step": 13571 + }, + { + "epoch": 0.67, + "grad_norm": 0.5337220430374146, + "learning_rate": 0.0005301231253951263, + "loss": 3.3959, + "step": 13572 + }, + { + "epoch": 0.67, + "grad_norm": 0.5316744446754456, + "learning_rate": 0.0005301132475900026, + "loss": 3.4082, + "step": 13573 + }, + { + "epoch": 0.67, + "grad_norm": 0.48296254873275757, + "learning_rate": 0.0005301033691788048, + "loss": 3.2966, + "step": 13574 + }, + { + "epoch": 0.67, + "grad_norm": 0.5001648664474487, + "learning_rate": 0.0005300934901615591, + "loss": 3.2072, + "step": 13575 + }, + { + "epoch": 0.67, + "grad_norm": 0.5572800040245056, + "learning_rate": 0.0005300836105382914, + "loss": 3.2663, + "step": 13576 + }, + { + "epoch": 0.67, + "grad_norm": 0.5312752723693848, + "learning_rate": 0.0005300737303090277, + "loss": 3.2437, + "step": 13577 + }, + { + "epoch": 0.67, + "grad_norm": 0.528778076171875, + "learning_rate": 0.0005300638494737941, + "loss": 3.2888, + "step": 13578 + }, + { + "epoch": 0.67, + "grad_norm": 0.5192068219184875, + "learning_rate": 0.0005300539680326168, + "loss": 3.2725, + "step": 13579 + }, + { + "epoch": 0.67, + "grad_norm": 0.5534399747848511, + "learning_rate": 0.0005300440859855214, + "loss": 3.1588, + "step": 13580 + }, + { + "epoch": 0.67, + "grad_norm": 0.5094475746154785, + "learning_rate": 0.0005300342033325342, + "loss": 3.4536, + "step": 13581 + }, + { + "epoch": 0.67, + "grad_norm": 0.48778852820396423, + "learning_rate": 0.0005300243200736811, + "loss": 3.2469, + "step": 13582 + }, + { + "epoch": 0.67, + "grad_norm": 0.47840654850006104, + "learning_rate": 0.0005300144362089883, + "loss": 3.2045, + "step": 13583 + }, + { + "epoch": 0.67, + "grad_norm": 0.5369465947151184, + "learning_rate": 0.0005300045517384818, + "loss": 3.1993, + "step": 13584 + }, + { + "epoch": 0.67, + "grad_norm": 0.5591325759887695, + "learning_rate": 0.0005299946666621875, + "loss": 3.1054, + "step": 13585 + }, + { + "epoch": 0.67, + "grad_norm": 0.5063976645469666, + "learning_rate": 0.0005299847809801314, + "loss": 2.9926, + "step": 13586 + }, + { + "epoch": 0.67, + "grad_norm": 0.5150634050369263, + "learning_rate": 0.0005299748946923399, + "loss": 3.2566, + "step": 13587 + }, + { + "epoch": 0.67, + "grad_norm": 0.5185797214508057, + "learning_rate": 0.0005299650077988386, + "loss": 3.0262, + "step": 13588 + }, + { + "epoch": 0.67, + "grad_norm": 0.5387712717056274, + "learning_rate": 0.0005299551202996537, + "loss": 3.0643, + "step": 13589 + }, + { + "epoch": 0.67, + "grad_norm": 0.5186799764633179, + "learning_rate": 0.0005299452321948114, + "loss": 3.2493, + "step": 13590 + }, + { + "epoch": 0.67, + "grad_norm": 0.5232559442520142, + "learning_rate": 0.0005299353434843376, + "loss": 3.5282, + "step": 13591 + }, + { + "epoch": 0.67, + "grad_norm": 0.5263158679008484, + "learning_rate": 0.0005299254541682583, + "loss": 3.0349, + "step": 13592 + }, + { + "epoch": 0.67, + "grad_norm": 0.5348055362701416, + "learning_rate": 0.0005299155642465996, + "loss": 3.1144, + "step": 13593 + }, + { + "epoch": 0.67, + "grad_norm": 0.515724241733551, + "learning_rate": 0.0005299056737193876, + "loss": 3.3351, + "step": 13594 + }, + { + "epoch": 0.67, + "grad_norm": 0.5149891972541809, + "learning_rate": 0.0005298957825866482, + "loss": 3.1768, + "step": 13595 + }, + { + "epoch": 0.67, + "grad_norm": 0.509054958820343, + "learning_rate": 0.0005298858908484076, + "loss": 3.3885, + "step": 13596 + }, + { + "epoch": 0.67, + "grad_norm": 0.5293143391609192, + "learning_rate": 0.0005298759985046919, + "loss": 3.3963, + "step": 13597 + }, + { + "epoch": 0.67, + "grad_norm": 0.5564351677894592, + "learning_rate": 0.0005298661055555269, + "loss": 3.2376, + "step": 13598 + }, + { + "epoch": 0.67, + "grad_norm": 0.5585691928863525, + "learning_rate": 0.000529856212000939, + "loss": 3.3014, + "step": 13599 + }, + { + "epoch": 0.67, + "grad_norm": 0.5429052114486694, + "learning_rate": 0.000529846317840954, + "loss": 3.2396, + "step": 13600 + }, + { + "epoch": 0.67, + "grad_norm": 0.5204060673713684, + "learning_rate": 0.000529836423075598, + "loss": 3.3343, + "step": 13601 + }, + { + "epoch": 0.67, + "grad_norm": 0.4938051998615265, + "learning_rate": 0.0005298265277048971, + "loss": 3.2435, + "step": 13602 + }, + { + "epoch": 0.67, + "grad_norm": 0.4976741373538971, + "learning_rate": 0.0005298166317288774, + "loss": 3.2764, + "step": 13603 + }, + { + "epoch": 0.67, + "grad_norm": 0.5170921683311462, + "learning_rate": 0.0005298067351475649, + "loss": 3.1395, + "step": 13604 + }, + { + "epoch": 0.67, + "grad_norm": 0.4929213225841522, + "learning_rate": 0.0005297968379609858, + "loss": 3.2229, + "step": 13605 + }, + { + "epoch": 0.67, + "grad_norm": 0.5679933428764343, + "learning_rate": 0.0005297869401691658, + "loss": 3.3445, + "step": 13606 + }, + { + "epoch": 0.67, + "grad_norm": 0.5538800954818726, + "learning_rate": 0.0005297770417721314, + "loss": 3.2866, + "step": 13607 + }, + { + "epoch": 0.67, + "grad_norm": 0.5196000933647156, + "learning_rate": 0.0005297671427699084, + "loss": 3.204, + "step": 13608 + }, + { + "epoch": 0.67, + "grad_norm": 0.5372135043144226, + "learning_rate": 0.0005297572431625229, + "loss": 3.2732, + "step": 13609 + }, + { + "epoch": 0.67, + "grad_norm": 0.5452011227607727, + "learning_rate": 0.0005297473429500013, + "loss": 3.1766, + "step": 13610 + }, + { + "epoch": 0.67, + "grad_norm": 0.4952123463153839, + "learning_rate": 0.0005297374421323692, + "loss": 3.1995, + "step": 13611 + }, + { + "epoch": 0.67, + "grad_norm": 0.506626307964325, + "learning_rate": 0.000529727540709653, + "loss": 3.1581, + "step": 13612 + }, + { + "epoch": 0.67, + "grad_norm": 0.5172677636146545, + "learning_rate": 0.0005297176386818786, + "loss": 2.8921, + "step": 13613 + }, + { + "epoch": 0.67, + "grad_norm": 0.5049740076065063, + "learning_rate": 0.0005297077360490722, + "loss": 3.208, + "step": 13614 + }, + { + "epoch": 0.67, + "grad_norm": 0.5029763579368591, + "learning_rate": 0.0005296978328112598, + "loss": 3.1353, + "step": 13615 + }, + { + "epoch": 0.67, + "grad_norm": 0.5007016658782959, + "learning_rate": 0.0005296879289684675, + "loss": 3.1918, + "step": 13616 + }, + { + "epoch": 0.67, + "grad_norm": 0.5236932039260864, + "learning_rate": 0.0005296780245207215, + "loss": 3.0716, + "step": 13617 + }, + { + "epoch": 0.67, + "grad_norm": 0.5022386908531189, + "learning_rate": 0.0005296681194680477, + "loss": 3.3136, + "step": 13618 + }, + { + "epoch": 0.67, + "grad_norm": 0.50746089220047, + "learning_rate": 0.0005296582138104723, + "loss": 3.3146, + "step": 13619 + }, + { + "epoch": 0.67, + "grad_norm": 0.482505738735199, + "learning_rate": 0.0005296483075480213, + "loss": 3.3466, + "step": 13620 + }, + { + "epoch": 0.67, + "grad_norm": 0.49940067529678345, + "learning_rate": 0.0005296384006807209, + "loss": 3.2996, + "step": 13621 + }, + { + "epoch": 0.67, + "grad_norm": 0.49896591901779175, + "learning_rate": 0.0005296284932085972, + "loss": 3.1868, + "step": 13622 + }, + { + "epoch": 0.67, + "grad_norm": 0.5380591750144958, + "learning_rate": 0.0005296185851316761, + "loss": 3.0733, + "step": 13623 + }, + { + "epoch": 0.67, + "grad_norm": 0.5520894527435303, + "learning_rate": 0.0005296086764499839, + "loss": 3.1411, + "step": 13624 + }, + { + "epoch": 0.67, + "grad_norm": 0.5142804980278015, + "learning_rate": 0.0005295987671635468, + "loss": 3.4073, + "step": 13625 + }, + { + "epoch": 0.67, + "grad_norm": 0.5468404293060303, + "learning_rate": 0.0005295888572723906, + "loss": 3.0009, + "step": 13626 + }, + { + "epoch": 0.67, + "grad_norm": 0.5099403858184814, + "learning_rate": 0.0005295789467765414, + "loss": 3.3411, + "step": 13627 + }, + { + "epoch": 0.67, + "grad_norm": 0.4918704330921173, + "learning_rate": 0.0005295690356760256, + "loss": 3.0391, + "step": 13628 + }, + { + "epoch": 0.67, + "grad_norm": 0.5046103596687317, + "learning_rate": 0.0005295591239708691, + "loss": 3.4971, + "step": 13629 + }, + { + "epoch": 0.67, + "grad_norm": 0.5209892392158508, + "learning_rate": 0.0005295492116610982, + "loss": 2.7844, + "step": 13630 + }, + { + "epoch": 0.67, + "grad_norm": 0.5438646078109741, + "learning_rate": 0.0005295392987467387, + "loss": 3.1362, + "step": 13631 + }, + { + "epoch": 0.67, + "grad_norm": 0.5239036083221436, + "learning_rate": 0.0005295293852278168, + "loss": 3.3859, + "step": 13632 + }, + { + "epoch": 0.67, + "grad_norm": 0.529381275177002, + "learning_rate": 0.0005295194711043588, + "loss": 3.2098, + "step": 13633 + }, + { + "epoch": 0.67, + "grad_norm": 0.5509489178657532, + "learning_rate": 0.0005295095563763907, + "loss": 3.2301, + "step": 13634 + }, + { + "epoch": 0.67, + "grad_norm": 0.5186215043067932, + "learning_rate": 0.0005294996410439384, + "loss": 3.159, + "step": 13635 + }, + { + "epoch": 0.67, + "grad_norm": 0.5146812796592712, + "learning_rate": 0.0005294897251070283, + "loss": 3.2299, + "step": 13636 + }, + { + "epoch": 0.67, + "grad_norm": 0.5205144882202148, + "learning_rate": 0.0005294798085656865, + "loss": 3.1246, + "step": 13637 + }, + { + "epoch": 0.67, + "grad_norm": 0.5245778560638428, + "learning_rate": 0.0005294698914199391, + "loss": 3.3632, + "step": 13638 + }, + { + "epoch": 0.67, + "grad_norm": 0.5007055997848511, + "learning_rate": 0.0005294599736698121, + "loss": 3.1628, + "step": 13639 + }, + { + "epoch": 0.67, + "grad_norm": 0.5703233480453491, + "learning_rate": 0.0005294500553153316, + "loss": 3.3017, + "step": 13640 + }, + { + "epoch": 0.67, + "grad_norm": 0.5297021269798279, + "learning_rate": 0.0005294401363565239, + "loss": 3.3155, + "step": 13641 + }, + { + "epoch": 0.67, + "grad_norm": 0.5470933318138123, + "learning_rate": 0.000529430216793415, + "loss": 3.1739, + "step": 13642 + }, + { + "epoch": 0.67, + "grad_norm": 0.5373654961585999, + "learning_rate": 0.0005294202966260312, + "loss": 3.4286, + "step": 13643 + }, + { + "epoch": 0.67, + "grad_norm": 0.5070244669914246, + "learning_rate": 0.0005294103758543983, + "loss": 3.1327, + "step": 13644 + }, + { + "epoch": 0.67, + "grad_norm": 0.5400650501251221, + "learning_rate": 0.0005294004544785428, + "loss": 3.1624, + "step": 13645 + }, + { + "epoch": 0.67, + "grad_norm": 0.5214609503746033, + "learning_rate": 0.0005293905324984905, + "loss": 3.1664, + "step": 13646 + }, + { + "epoch": 0.67, + "grad_norm": 0.5696268081665039, + "learning_rate": 0.0005293806099142677, + "loss": 3.1741, + "step": 13647 + }, + { + "epoch": 0.67, + "grad_norm": 0.5091151595115662, + "learning_rate": 0.0005293706867259006, + "loss": 3.3379, + "step": 13648 + }, + { + "epoch": 0.67, + "grad_norm": 0.5270219445228577, + "learning_rate": 0.0005293607629334152, + "loss": 3.1869, + "step": 13649 + }, + { + "epoch": 0.67, + "grad_norm": 0.5341525077819824, + "learning_rate": 0.0005293508385368378, + "loss": 3.1583, + "step": 13650 + }, + { + "epoch": 0.67, + "grad_norm": 0.5189785361289978, + "learning_rate": 0.0005293409135361943, + "loss": 3.286, + "step": 13651 + }, + { + "epoch": 0.67, + "grad_norm": 0.5022042393684387, + "learning_rate": 0.0005293309879315111, + "loss": 3.2813, + "step": 13652 + }, + { + "epoch": 0.67, + "grad_norm": 0.4763084053993225, + "learning_rate": 0.0005293210617228141, + "loss": 3.3161, + "step": 13653 + }, + { + "epoch": 0.67, + "grad_norm": 0.510564386844635, + "learning_rate": 0.0005293111349101296, + "loss": 3.1931, + "step": 13654 + }, + { + "epoch": 0.67, + "grad_norm": 0.531646728515625, + "learning_rate": 0.0005293012074934836, + "loss": 3.2027, + "step": 13655 + }, + { + "epoch": 0.67, + "grad_norm": 0.5853457450866699, + "learning_rate": 0.0005292912794729025, + "loss": 3.1992, + "step": 13656 + }, + { + "epoch": 0.67, + "grad_norm": 0.5224567651748657, + "learning_rate": 0.0005292813508484122, + "loss": 3.3783, + "step": 13657 + }, + { + "epoch": 0.67, + "grad_norm": 0.5682322978973389, + "learning_rate": 0.000529271421620039, + "loss": 3.0409, + "step": 13658 + }, + { + "epoch": 0.67, + "grad_norm": 0.48867014050483704, + "learning_rate": 0.000529261491787809, + "loss": 3.2267, + "step": 13659 + }, + { + "epoch": 0.67, + "grad_norm": 0.5091564059257507, + "learning_rate": 0.0005292515613517483, + "loss": 3.2667, + "step": 13660 + }, + { + "epoch": 0.67, + "grad_norm": 0.5306949615478516, + "learning_rate": 0.0005292416303118832, + "loss": 3.027, + "step": 13661 + }, + { + "epoch": 0.67, + "grad_norm": 0.48959165811538696, + "learning_rate": 0.0005292316986682396, + "loss": 3.1372, + "step": 13662 + }, + { + "epoch": 0.67, + "grad_norm": 0.5132994055747986, + "learning_rate": 0.000529221766420844, + "loss": 3.0894, + "step": 13663 + }, + { + "epoch": 0.67, + "grad_norm": 0.48111510276794434, + "learning_rate": 0.0005292118335697223, + "loss": 3.4498, + "step": 13664 + }, + { + "epoch": 0.67, + "grad_norm": 0.522191047668457, + "learning_rate": 0.0005292019001149008, + "loss": 3.1612, + "step": 13665 + }, + { + "epoch": 0.67, + "grad_norm": 0.4841899275779724, + "learning_rate": 0.0005291919660564055, + "loss": 3.299, + "step": 13666 + }, + { + "epoch": 0.67, + "grad_norm": 0.5242030620574951, + "learning_rate": 0.0005291820313942627, + "loss": 3.4106, + "step": 13667 + }, + { + "epoch": 0.67, + "grad_norm": 0.5671795010566711, + "learning_rate": 0.0005291720961284986, + "loss": 3.073, + "step": 13668 + }, + { + "epoch": 0.67, + "grad_norm": 0.537661612033844, + "learning_rate": 0.0005291621602591393, + "loss": 3.2446, + "step": 13669 + }, + { + "epoch": 0.67, + "grad_norm": 0.530839204788208, + "learning_rate": 0.0005291522237862109, + "loss": 3.369, + "step": 13670 + }, + { + "epoch": 0.67, + "grad_norm": 0.5125889182090759, + "learning_rate": 0.0005291422867097397, + "loss": 3.1252, + "step": 13671 + }, + { + "epoch": 0.67, + "grad_norm": 0.5530248880386353, + "learning_rate": 0.0005291323490297518, + "loss": 3.025, + "step": 13672 + }, + { + "epoch": 0.67, + "grad_norm": 0.5228330492973328, + "learning_rate": 0.0005291224107462734, + "loss": 3.2608, + "step": 13673 + }, + { + "epoch": 0.67, + "grad_norm": 0.5220538973808289, + "learning_rate": 0.0005291124718593307, + "loss": 3.1958, + "step": 13674 + }, + { + "epoch": 0.67, + "grad_norm": 0.5133023858070374, + "learning_rate": 0.0005291025323689497, + "loss": 3.341, + "step": 13675 + }, + { + "epoch": 0.67, + "grad_norm": 0.5261237621307373, + "learning_rate": 0.0005290925922751569, + "loss": 3.1028, + "step": 13676 + }, + { + "epoch": 0.67, + "grad_norm": 0.5116633176803589, + "learning_rate": 0.0005290826515779782, + "loss": 3.0981, + "step": 13677 + }, + { + "epoch": 0.67, + "grad_norm": 0.4918628931045532, + "learning_rate": 0.00052907271027744, + "loss": 3.0695, + "step": 13678 + }, + { + "epoch": 0.67, + "grad_norm": 0.5067062973976135, + "learning_rate": 0.0005290627683735682, + "loss": 3.1897, + "step": 13679 + }, + { + "epoch": 0.67, + "grad_norm": 0.5566603541374207, + "learning_rate": 0.0005290528258663892, + "loss": 3.2876, + "step": 13680 + }, + { + "epoch": 0.67, + "grad_norm": 0.5340036153793335, + "learning_rate": 0.0005290428827559292, + "loss": 3.0107, + "step": 13681 + }, + { + "epoch": 0.67, + "grad_norm": 0.5094714164733887, + "learning_rate": 0.0005290329390422142, + "loss": 3.1796, + "step": 13682 + }, + { + "epoch": 0.67, + "grad_norm": 0.5311017632484436, + "learning_rate": 0.0005290229947252707, + "loss": 3.1173, + "step": 13683 + }, + { + "epoch": 0.67, + "grad_norm": 0.5469943881034851, + "learning_rate": 0.0005290130498051246, + "loss": 3.2645, + "step": 13684 + }, + { + "epoch": 0.67, + "grad_norm": 0.49231579899787903, + "learning_rate": 0.0005290031042818022, + "loss": 3.0617, + "step": 13685 + }, + { + "epoch": 0.67, + "grad_norm": 0.5386189222335815, + "learning_rate": 0.0005289931581553297, + "loss": 3.2606, + "step": 13686 + }, + { + "epoch": 0.67, + "grad_norm": 0.5233540534973145, + "learning_rate": 0.0005289832114257333, + "loss": 3.2946, + "step": 13687 + }, + { + "epoch": 0.67, + "grad_norm": 0.5471063256263733, + "learning_rate": 0.0005289732640930393, + "loss": 3.0275, + "step": 13688 + }, + { + "epoch": 0.67, + "grad_norm": 0.5296842455863953, + "learning_rate": 0.0005289633161572737, + "loss": 3.3652, + "step": 13689 + }, + { + "epoch": 0.67, + "grad_norm": 0.541083037853241, + "learning_rate": 0.0005289533676184627, + "loss": 3.2787, + "step": 13690 + }, + { + "epoch": 0.67, + "grad_norm": 0.527202308177948, + "learning_rate": 0.0005289434184766326, + "loss": 3.1486, + "step": 13691 + }, + { + "epoch": 0.67, + "grad_norm": 0.5069360136985779, + "learning_rate": 0.0005289334687318098, + "loss": 3.2396, + "step": 13692 + }, + { + "epoch": 0.67, + "grad_norm": 0.5278681516647339, + "learning_rate": 0.00052892351838402, + "loss": 3.3923, + "step": 13693 + }, + { + "epoch": 0.67, + "grad_norm": 0.5264055728912354, + "learning_rate": 0.0005289135674332899, + "loss": 3.342, + "step": 13694 + }, + { + "epoch": 0.67, + "grad_norm": 0.5079107284545898, + "learning_rate": 0.0005289036158796455, + "loss": 3.0714, + "step": 13695 + }, + { + "epoch": 0.67, + "grad_norm": 0.5393988490104675, + "learning_rate": 0.000528893663723113, + "loss": 3.156, + "step": 13696 + }, + { + "epoch": 0.67, + "grad_norm": 0.5057147145271301, + "learning_rate": 0.0005288837109637187, + "loss": 3.2109, + "step": 13697 + }, + { + "epoch": 0.67, + "grad_norm": 0.5622931718826294, + "learning_rate": 0.0005288737576014887, + "loss": 3.3168, + "step": 13698 + }, + { + "epoch": 0.67, + "grad_norm": 0.5340046286582947, + "learning_rate": 0.0005288638036364493, + "loss": 3.1038, + "step": 13699 + }, + { + "epoch": 0.67, + "grad_norm": 0.5570096969604492, + "learning_rate": 0.0005288538490686267, + "loss": 3.2489, + "step": 13700 + }, + { + "epoch": 0.67, + "grad_norm": 0.5238069891929626, + "learning_rate": 0.0005288438938980471, + "loss": 3.2243, + "step": 13701 + }, + { + "epoch": 0.67, + "grad_norm": 0.48438480496406555, + "learning_rate": 0.0005288339381247367, + "loss": 3.119, + "step": 13702 + }, + { + "epoch": 0.67, + "grad_norm": 0.5663520693778992, + "learning_rate": 0.0005288239817487217, + "loss": 3.2317, + "step": 13703 + }, + { + "epoch": 0.67, + "grad_norm": 0.5262908935546875, + "learning_rate": 0.0005288140247700285, + "loss": 3.1207, + "step": 13704 + }, + { + "epoch": 0.67, + "grad_norm": 0.5518969893455505, + "learning_rate": 0.0005288040671886831, + "loss": 3.1587, + "step": 13705 + }, + { + "epoch": 0.67, + "grad_norm": 0.5191221833229065, + "learning_rate": 0.0005287941090047118, + "loss": 3.2533, + "step": 13706 + }, + { + "epoch": 0.67, + "grad_norm": 0.5094031095504761, + "learning_rate": 0.0005287841502181409, + "loss": 3.2238, + "step": 13707 + }, + { + "epoch": 0.67, + "grad_norm": 0.512729823589325, + "learning_rate": 0.0005287741908289967, + "loss": 3.2058, + "step": 13708 + }, + { + "epoch": 0.67, + "grad_norm": 0.5242785215377808, + "learning_rate": 0.0005287642308373051, + "loss": 3.2444, + "step": 13709 + }, + { + "epoch": 0.67, + "grad_norm": 0.5172874927520752, + "learning_rate": 0.0005287542702430926, + "loss": 3.3319, + "step": 13710 + }, + { + "epoch": 0.67, + "grad_norm": 0.48693451285362244, + "learning_rate": 0.0005287443090463854, + "loss": 3.3414, + "step": 13711 + }, + { + "epoch": 0.67, + "grad_norm": 0.5203180313110352, + "learning_rate": 0.0005287343472472097, + "loss": 3.1379, + "step": 13712 + }, + { + "epoch": 0.67, + "grad_norm": 0.5015645623207092, + "learning_rate": 0.0005287243848455918, + "loss": 3.2083, + "step": 13713 + }, + { + "epoch": 0.67, + "grad_norm": 0.5029230117797852, + "learning_rate": 0.0005287144218415579, + "loss": 2.9887, + "step": 13714 + }, + { + "epoch": 0.67, + "grad_norm": 0.523784875869751, + "learning_rate": 0.0005287044582351341, + "loss": 2.9404, + "step": 13715 + }, + { + "epoch": 0.67, + "grad_norm": 0.49073922634124756, + "learning_rate": 0.000528694494026347, + "loss": 3.119, + "step": 13716 + }, + { + "epoch": 0.67, + "grad_norm": 0.5473636984825134, + "learning_rate": 0.0005286845292152224, + "loss": 3.2291, + "step": 13717 + }, + { + "epoch": 0.67, + "grad_norm": 0.4929855763912201, + "learning_rate": 0.0005286745638017868, + "loss": 3.1537, + "step": 13718 + }, + { + "epoch": 0.67, + "grad_norm": 0.5156190991401672, + "learning_rate": 0.0005286645977860664, + "loss": 3.1542, + "step": 13719 + }, + { + "epoch": 0.67, + "grad_norm": 0.4985933303833008, + "learning_rate": 0.0005286546311680876, + "loss": 3.0746, + "step": 13720 + }, + { + "epoch": 0.67, + "grad_norm": 0.5488167405128479, + "learning_rate": 0.0005286446639478764, + "loss": 3.2484, + "step": 13721 + }, + { + "epoch": 0.67, + "grad_norm": 0.5215466022491455, + "learning_rate": 0.0005286346961254591, + "loss": 3.024, + "step": 13722 + }, + { + "epoch": 0.67, + "grad_norm": 0.5182087421417236, + "learning_rate": 0.0005286247277008621, + "loss": 3.2172, + "step": 13723 + }, + { + "epoch": 0.67, + "grad_norm": 0.5041150450706482, + "learning_rate": 0.0005286147586741115, + "loss": 3.0219, + "step": 13724 + }, + { + "epoch": 0.67, + "grad_norm": 0.5143710970878601, + "learning_rate": 0.0005286047890452337, + "loss": 3.1054, + "step": 13725 + }, + { + "epoch": 0.67, + "grad_norm": 0.5302610993385315, + "learning_rate": 0.0005285948188142549, + "loss": 3.19, + "step": 13726 + }, + { + "epoch": 0.67, + "grad_norm": 0.5241024494171143, + "learning_rate": 0.0005285848479812012, + "loss": 3.2024, + "step": 13727 + }, + { + "epoch": 0.67, + "grad_norm": 0.5108481645584106, + "learning_rate": 0.0005285748765460991, + "loss": 3.3537, + "step": 13728 + }, + { + "epoch": 0.67, + "grad_norm": 0.4921933710575104, + "learning_rate": 0.0005285649045089748, + "loss": 3.1969, + "step": 13729 + }, + { + "epoch": 0.67, + "grad_norm": 0.5298369526863098, + "learning_rate": 0.0005285549318698544, + "loss": 3.1773, + "step": 13730 + }, + { + "epoch": 0.67, + "grad_norm": 0.5141018033027649, + "learning_rate": 0.0005285449586287644, + "loss": 3.2114, + "step": 13731 + }, + { + "epoch": 0.67, + "grad_norm": 0.5361664295196533, + "learning_rate": 0.0005285349847857309, + "loss": 3.0577, + "step": 13732 + }, + { + "epoch": 0.67, + "grad_norm": 0.5357168912887573, + "learning_rate": 0.0005285250103407804, + "loss": 3.2206, + "step": 13733 + }, + { + "epoch": 0.67, + "grad_norm": 0.5472400784492493, + "learning_rate": 0.0005285150352939388, + "loss": 3.0433, + "step": 13734 + }, + { + "epoch": 0.67, + "grad_norm": 0.49933162331581116, + "learning_rate": 0.0005285050596452326, + "loss": 3.4927, + "step": 13735 + }, + { + "epoch": 0.67, + "grad_norm": 0.5269633531570435, + "learning_rate": 0.0005284950833946882, + "loss": 3.1318, + "step": 13736 + }, + { + "epoch": 0.67, + "grad_norm": 0.5392086505889893, + "learning_rate": 0.0005284851065423316, + "loss": 3.3404, + "step": 13737 + }, + { + "epoch": 0.67, + "grad_norm": 0.4980263411998749, + "learning_rate": 0.0005284751290881893, + "loss": 3.2672, + "step": 13738 + }, + { + "epoch": 0.67, + "grad_norm": 0.49644413590431213, + "learning_rate": 0.0005284651510322874, + "loss": 3.2669, + "step": 13739 + }, + { + "epoch": 0.67, + "grad_norm": 0.5201795697212219, + "learning_rate": 0.0005284551723746523, + "loss": 3.4163, + "step": 13740 + }, + { + "epoch": 0.67, + "grad_norm": 0.5354999899864197, + "learning_rate": 0.0005284451931153102, + "loss": 3.2036, + "step": 13741 + }, + { + "epoch": 0.67, + "grad_norm": 0.5267964601516724, + "learning_rate": 0.0005284352132542873, + "loss": 3.118, + "step": 13742 + }, + { + "epoch": 0.67, + "grad_norm": 0.5029120445251465, + "learning_rate": 0.0005284252327916102, + "loss": 3.2955, + "step": 13743 + }, + { + "epoch": 0.67, + "grad_norm": 0.5081202983856201, + "learning_rate": 0.000528415251727305, + "loss": 3.2133, + "step": 13744 + }, + { + "epoch": 0.67, + "grad_norm": 0.517963707447052, + "learning_rate": 0.000528405270061398, + "loss": 2.9726, + "step": 13745 + }, + { + "epoch": 0.67, + "grad_norm": 0.55925452709198, + "learning_rate": 0.0005283952877939153, + "loss": 3.3627, + "step": 13746 + }, + { + "epoch": 0.67, + "grad_norm": 0.5444297790527344, + "learning_rate": 0.0005283853049248834, + "loss": 3.1062, + "step": 13747 + }, + { + "epoch": 0.67, + "grad_norm": 0.48813849687576294, + "learning_rate": 0.0005283753214543287, + "loss": 3.2675, + "step": 13748 + }, + { + "epoch": 0.67, + "grad_norm": 0.5234923362731934, + "learning_rate": 0.0005283653373822773, + "loss": 3.2118, + "step": 13749 + }, + { + "epoch": 0.67, + "grad_norm": 0.487904816865921, + "learning_rate": 0.0005283553527087556, + "loss": 3.2319, + "step": 13750 + }, + { + "epoch": 0.67, + "grad_norm": 0.5277583599090576, + "learning_rate": 0.0005283453674337898, + "loss": 3.3372, + "step": 13751 + }, + { + "epoch": 0.67, + "grad_norm": 0.5574771165847778, + "learning_rate": 0.0005283353815574063, + "loss": 3.1972, + "step": 13752 + }, + { + "epoch": 0.67, + "grad_norm": 0.5329998731613159, + "learning_rate": 0.0005283253950796312, + "loss": 3.1427, + "step": 13753 + }, + { + "epoch": 0.67, + "grad_norm": 0.4901135265827179, + "learning_rate": 0.000528315408000491, + "loss": 3.5074, + "step": 13754 + }, + { + "epoch": 0.67, + "grad_norm": 0.524285078048706, + "learning_rate": 0.000528305420320012, + "loss": 3.2045, + "step": 13755 + }, + { + "epoch": 0.67, + "grad_norm": 0.5120623707771301, + "learning_rate": 0.0005282954320382205, + "loss": 3.2821, + "step": 13756 + }, + { + "epoch": 0.67, + "grad_norm": 0.4949445426464081, + "learning_rate": 0.0005282854431551428, + "loss": 3.238, + "step": 13757 + }, + { + "epoch": 0.67, + "grad_norm": 0.5326241850852966, + "learning_rate": 0.0005282754536708051, + "loss": 3.193, + "step": 13758 + }, + { + "epoch": 0.67, + "grad_norm": 0.5440043807029724, + "learning_rate": 0.0005282654635852339, + "loss": 3.0779, + "step": 13759 + }, + { + "epoch": 0.67, + "grad_norm": 0.5041990876197815, + "learning_rate": 0.0005282554728984551, + "loss": 3.1848, + "step": 13760 + }, + { + "epoch": 0.67, + "grad_norm": 0.5535484552383423, + "learning_rate": 0.0005282454816104956, + "loss": 3.2104, + "step": 13761 + }, + { + "epoch": 0.67, + "grad_norm": 0.503193736076355, + "learning_rate": 0.0005282354897213812, + "loss": 3.0778, + "step": 13762 + }, + { + "epoch": 0.67, + "grad_norm": 0.49220454692840576, + "learning_rate": 0.0005282254972311386, + "loss": 3.2279, + "step": 13763 + }, + { + "epoch": 0.67, + "grad_norm": 0.516409695148468, + "learning_rate": 0.000528215504139794, + "loss": 3.0826, + "step": 13764 + }, + { + "epoch": 0.67, + "grad_norm": 0.5269769430160522, + "learning_rate": 0.0005282055104473736, + "loss": 3.3278, + "step": 13765 + }, + { + "epoch": 0.67, + "grad_norm": 0.5283230543136597, + "learning_rate": 0.0005281955161539039, + "loss": 3.1647, + "step": 13766 + }, + { + "epoch": 0.67, + "grad_norm": 0.5059794187545776, + "learning_rate": 0.0005281855212594111, + "loss": 3.2336, + "step": 13767 + }, + { + "epoch": 0.67, + "grad_norm": 0.53988116979599, + "learning_rate": 0.0005281755257639214, + "loss": 3.0983, + "step": 13768 + }, + { + "epoch": 0.67, + "grad_norm": 0.520753800868988, + "learning_rate": 0.0005281655296674615, + "loss": 3.1827, + "step": 13769 + }, + { + "epoch": 0.67, + "grad_norm": 0.5291000008583069, + "learning_rate": 0.0005281555329700574, + "loss": 3.2596, + "step": 13770 + }, + { + "epoch": 0.67, + "grad_norm": 0.5507010817527771, + "learning_rate": 0.0005281455356717355, + "loss": 3.1211, + "step": 13771 + }, + { + "epoch": 0.67, + "grad_norm": 0.49522608518600464, + "learning_rate": 0.0005281355377725223, + "loss": 3.1505, + "step": 13772 + }, + { + "epoch": 0.67, + "grad_norm": 0.5150662660598755, + "learning_rate": 0.0005281255392724438, + "loss": 3.3192, + "step": 13773 + }, + { + "epoch": 0.68, + "grad_norm": 0.5103153586387634, + "learning_rate": 0.0005281155401715267, + "loss": 3.0925, + "step": 13774 + }, + { + "epoch": 0.68, + "grad_norm": 0.5034166574478149, + "learning_rate": 0.000528105540469797, + "loss": 3.3163, + "step": 13775 + }, + { + "epoch": 0.68, + "grad_norm": 0.5109154582023621, + "learning_rate": 0.0005280955401672814, + "loss": 3.3177, + "step": 13776 + }, + { + "epoch": 0.68, + "grad_norm": 0.4964618682861328, + "learning_rate": 0.0005280855392640059, + "loss": 3.0376, + "step": 13777 + }, + { + "epoch": 0.68, + "grad_norm": 0.47001469135284424, + "learning_rate": 0.000528075537759997, + "loss": 3.3065, + "step": 13778 + }, + { + "epoch": 0.68, + "grad_norm": 0.49114564061164856, + "learning_rate": 0.0005280655356552811, + "loss": 3.3961, + "step": 13779 + }, + { + "epoch": 0.68, + "grad_norm": 0.5325284600257874, + "learning_rate": 0.0005280555329498845, + "loss": 3.2993, + "step": 13780 + }, + { + "epoch": 0.68, + "grad_norm": 0.5186975598335266, + "learning_rate": 0.0005280455296438333, + "loss": 3.1416, + "step": 13781 + }, + { + "epoch": 0.68, + "grad_norm": 0.5972892045974731, + "learning_rate": 0.0005280355257371544, + "loss": 3.4019, + "step": 13782 + }, + { + "epoch": 0.68, + "grad_norm": 0.5253905653953552, + "learning_rate": 0.0005280255212298735, + "loss": 3.1386, + "step": 13783 + }, + { + "epoch": 0.68, + "grad_norm": 0.5274996161460876, + "learning_rate": 0.0005280155161220174, + "loss": 3.2277, + "step": 13784 + }, + { + "epoch": 0.68, + "grad_norm": 0.5044057369232178, + "learning_rate": 0.0005280055104136123, + "loss": 3.2136, + "step": 13785 + }, + { + "epoch": 0.68, + "grad_norm": 0.49711552262306213, + "learning_rate": 0.0005279955041046845, + "loss": 3.3523, + "step": 13786 + }, + { + "epoch": 0.68, + "grad_norm": 0.5024986267089844, + "learning_rate": 0.0005279854971952606, + "loss": 3.0944, + "step": 13787 + }, + { + "epoch": 0.68, + "grad_norm": 0.5386544466018677, + "learning_rate": 0.0005279754896853667, + "loss": 3.4201, + "step": 13788 + }, + { + "epoch": 0.68, + "grad_norm": 0.5329682230949402, + "learning_rate": 0.0005279654815750291, + "loss": 3.145, + "step": 13789 + }, + { + "epoch": 0.68, + "grad_norm": 0.5430646538734436, + "learning_rate": 0.0005279554728642744, + "loss": 3.4528, + "step": 13790 + }, + { + "epoch": 0.68, + "grad_norm": 0.5068038105964661, + "learning_rate": 0.0005279454635531289, + "loss": 3.2641, + "step": 13791 + }, + { + "epoch": 0.68, + "grad_norm": 0.4926636219024658, + "learning_rate": 0.0005279354536416188, + "loss": 3.1893, + "step": 13792 + }, + { + "epoch": 0.68, + "grad_norm": 0.5283586382865906, + "learning_rate": 0.0005279254431297707, + "loss": 3.173, + "step": 13793 + }, + { + "epoch": 0.68, + "grad_norm": 0.5072885155677795, + "learning_rate": 0.0005279154320176107, + "loss": 3.0823, + "step": 13794 + }, + { + "epoch": 0.68, + "grad_norm": 0.4954681694507599, + "learning_rate": 0.0005279054203051655, + "loss": 3.2519, + "step": 13795 + }, + { + "epoch": 0.68, + "grad_norm": 0.50550377368927, + "learning_rate": 0.0005278954079924611, + "loss": 3.2629, + "step": 13796 + }, + { + "epoch": 0.68, + "grad_norm": 0.5151498913764954, + "learning_rate": 0.0005278853950795242, + "loss": 3.0185, + "step": 13797 + }, + { + "epoch": 0.68, + "grad_norm": 0.5135114192962646, + "learning_rate": 0.0005278753815663811, + "loss": 3.1437, + "step": 13798 + }, + { + "epoch": 0.68, + "grad_norm": 0.49339157342910767, + "learning_rate": 0.0005278653674530579, + "loss": 3.2054, + "step": 13799 + }, + { + "epoch": 0.68, + "grad_norm": 0.5128827691078186, + "learning_rate": 0.0005278553527395813, + "loss": 3.1548, + "step": 13800 + }, + { + "epoch": 0.68, + "grad_norm": 0.5171065330505371, + "learning_rate": 0.0005278453374259776, + "loss": 3.2015, + "step": 13801 + }, + { + "epoch": 0.68, + "grad_norm": 0.5121150612831116, + "learning_rate": 0.0005278353215122729, + "loss": 3.3808, + "step": 13802 + }, + { + "epoch": 0.68, + "grad_norm": 0.5693663954734802, + "learning_rate": 0.0005278253049984939, + "loss": 3.0008, + "step": 13803 + }, + { + "epoch": 0.68, + "grad_norm": 0.506459653377533, + "learning_rate": 0.000527815287884667, + "loss": 3.16, + "step": 13804 + }, + { + "epoch": 0.68, + "grad_norm": 0.5166449546813965, + "learning_rate": 0.0005278052701708184, + "loss": 3.1424, + "step": 13805 + }, + { + "epoch": 0.68, + "grad_norm": 0.5361778736114502, + "learning_rate": 0.0005277952518569746, + "loss": 3.4089, + "step": 13806 + }, + { + "epoch": 0.68, + "grad_norm": 0.5384201407432556, + "learning_rate": 0.000527785232943162, + "loss": 3.3915, + "step": 13807 + }, + { + "epoch": 0.68, + "grad_norm": 0.581087052822113, + "learning_rate": 0.0005277752134294067, + "loss": 3.2422, + "step": 13808 + }, + { + "epoch": 0.68, + "grad_norm": 0.4882732331752777, + "learning_rate": 0.0005277651933157355, + "loss": 3.3192, + "step": 13809 + }, + { + "epoch": 0.68, + "grad_norm": 0.5098223686218262, + "learning_rate": 0.0005277551726021746, + "loss": 3.0037, + "step": 13810 + }, + { + "epoch": 0.68, + "grad_norm": 0.522487223148346, + "learning_rate": 0.0005277451512887504, + "loss": 3.3347, + "step": 13811 + }, + { + "epoch": 0.68, + "grad_norm": 0.5181809663772583, + "learning_rate": 0.0005277351293754894, + "loss": 3.2624, + "step": 13812 + }, + { + "epoch": 0.68, + "grad_norm": 0.5491748452186584, + "learning_rate": 0.0005277251068624178, + "loss": 3.2306, + "step": 13813 + }, + { + "epoch": 0.68, + "grad_norm": 0.5368053913116455, + "learning_rate": 0.0005277150837495621, + "loss": 3.2325, + "step": 13814 + }, + { + "epoch": 0.68, + "grad_norm": 0.5411176085472107, + "learning_rate": 0.0005277050600369487, + "loss": 3.1715, + "step": 13815 + }, + { + "epoch": 0.68, + "grad_norm": 0.49559369683265686, + "learning_rate": 0.000527695035724604, + "loss": 3.0924, + "step": 13816 + }, + { + "epoch": 0.68, + "grad_norm": 0.5154446363449097, + "learning_rate": 0.0005276850108125544, + "loss": 3.6344, + "step": 13817 + }, + { + "epoch": 0.68, + "grad_norm": 0.5071130990982056, + "learning_rate": 0.0005276749853008263, + "loss": 3.0258, + "step": 13818 + }, + { + "epoch": 0.68, + "grad_norm": 0.5003767013549805, + "learning_rate": 0.0005276649591894459, + "loss": 3.0506, + "step": 13819 + }, + { + "epoch": 0.68, + "grad_norm": 0.5963944792747498, + "learning_rate": 0.0005276549324784401, + "loss": 3.0792, + "step": 13820 + }, + { + "epoch": 0.68, + "grad_norm": 0.5032363533973694, + "learning_rate": 0.0005276449051678349, + "loss": 3.1895, + "step": 13821 + }, + { + "epoch": 0.68, + "grad_norm": 0.48575350642204285, + "learning_rate": 0.000527634877257657, + "loss": 3.0814, + "step": 13822 + }, + { + "epoch": 0.68, + "grad_norm": 0.5336856245994568, + "learning_rate": 0.0005276248487479325, + "loss": 3.0062, + "step": 13823 + }, + { + "epoch": 0.68, + "grad_norm": 0.519405722618103, + "learning_rate": 0.0005276148196386879, + "loss": 3.1346, + "step": 13824 + }, + { + "epoch": 0.68, + "grad_norm": 0.5641560554504395, + "learning_rate": 0.0005276047899299497, + "loss": 3.1063, + "step": 13825 + }, + { + "epoch": 0.68, + "grad_norm": 0.49971362948417664, + "learning_rate": 0.0005275947596217442, + "loss": 3.1934, + "step": 13826 + }, + { + "epoch": 0.68, + "grad_norm": 0.5345155596733093, + "learning_rate": 0.0005275847287140981, + "loss": 3.1373, + "step": 13827 + }, + { + "epoch": 0.68, + "grad_norm": 0.5394874811172485, + "learning_rate": 0.0005275746972070376, + "loss": 3.301, + "step": 13828 + }, + { + "epoch": 0.68, + "grad_norm": 0.5135223865509033, + "learning_rate": 0.0005275646651005891, + "loss": 3.301, + "step": 13829 + }, + { + "epoch": 0.68, + "grad_norm": 0.5040108561515808, + "learning_rate": 0.000527554632394779, + "loss": 3.2598, + "step": 13830 + }, + { + "epoch": 0.68, + "grad_norm": 0.5344460010528564, + "learning_rate": 0.0005275445990896338, + "loss": 3.1245, + "step": 13831 + }, + { + "epoch": 0.68, + "grad_norm": 0.4787933826446533, + "learning_rate": 0.00052753456518518, + "loss": 3.3899, + "step": 13832 + }, + { + "epoch": 0.68, + "grad_norm": 0.5142536759376526, + "learning_rate": 0.0005275245306814439, + "loss": 3.1408, + "step": 13833 + }, + { + "epoch": 0.68, + "grad_norm": 0.5277648568153381, + "learning_rate": 0.000527514495578452, + "loss": 3.1087, + "step": 13834 + }, + { + "epoch": 0.68, + "grad_norm": 0.5267074704170227, + "learning_rate": 0.0005275044598762307, + "loss": 3.192, + "step": 13835 + }, + { + "epoch": 0.68, + "grad_norm": 0.5166609883308411, + "learning_rate": 0.0005274944235748064, + "loss": 3.3189, + "step": 13836 + }, + { + "epoch": 0.68, + "grad_norm": 0.5297030210494995, + "learning_rate": 0.0005274843866742056, + "loss": 3.0285, + "step": 13837 + }, + { + "epoch": 0.68, + "grad_norm": 0.5255849361419678, + "learning_rate": 0.0005274743491744548, + "loss": 3.2142, + "step": 13838 + }, + { + "epoch": 0.68, + "grad_norm": 0.49405017495155334, + "learning_rate": 0.0005274643110755801, + "loss": 3.3999, + "step": 13839 + }, + { + "epoch": 0.68, + "grad_norm": 0.4815720319747925, + "learning_rate": 0.0005274542723776083, + "loss": 3.4118, + "step": 13840 + }, + { + "epoch": 0.68, + "grad_norm": 0.5125225186347961, + "learning_rate": 0.0005274442330805658, + "loss": 3.1375, + "step": 13841 + }, + { + "epoch": 0.68, + "grad_norm": 0.5525969862937927, + "learning_rate": 0.0005274341931844788, + "loss": 3.3452, + "step": 13842 + }, + { + "epoch": 0.68, + "grad_norm": 0.5333914756774902, + "learning_rate": 0.0005274241526893741, + "loss": 3.3247, + "step": 13843 + }, + { + "epoch": 0.68, + "grad_norm": 0.5027710199356079, + "learning_rate": 0.0005274141115952779, + "loss": 3.228, + "step": 13844 + }, + { + "epoch": 0.68, + "grad_norm": 0.5438841581344604, + "learning_rate": 0.0005274040699022167, + "loss": 3.2234, + "step": 13845 + }, + { + "epoch": 0.68, + "grad_norm": 0.5225579142570496, + "learning_rate": 0.0005273940276102168, + "loss": 3.165, + "step": 13846 + }, + { + "epoch": 0.68, + "grad_norm": 0.5789764523506165, + "learning_rate": 0.000527383984719305, + "loss": 3.1446, + "step": 13847 + }, + { + "epoch": 0.68, + "grad_norm": 0.5264090895652771, + "learning_rate": 0.0005273739412295073, + "loss": 3.2548, + "step": 13848 + }, + { + "epoch": 0.68, + "grad_norm": 0.5331341028213501, + "learning_rate": 0.0005273638971408506, + "loss": 3.1594, + "step": 13849 + }, + { + "epoch": 0.68, + "grad_norm": 0.5145845413208008, + "learning_rate": 0.0005273538524533612, + "loss": 3.1882, + "step": 13850 + }, + { + "epoch": 0.68, + "grad_norm": 0.5996436476707458, + "learning_rate": 0.0005273438071670653, + "loss": 3.359, + "step": 13851 + }, + { + "epoch": 0.68, + "grad_norm": 0.5678809881210327, + "learning_rate": 0.0005273337612819897, + "loss": 3.3849, + "step": 13852 + }, + { + "epoch": 0.68, + "grad_norm": 0.49239474534988403, + "learning_rate": 0.0005273237147981607, + "loss": 3.1734, + "step": 13853 + }, + { + "epoch": 0.68, + "grad_norm": 0.5626592040061951, + "learning_rate": 0.0005273136677156048, + "loss": 3.0927, + "step": 13854 + }, + { + "epoch": 0.68, + "grad_norm": 0.49832597374916077, + "learning_rate": 0.0005273036200343484, + "loss": 3.3677, + "step": 13855 + }, + { + "epoch": 0.68, + "grad_norm": 0.6042376160621643, + "learning_rate": 0.000527293571754418, + "loss": 3.2694, + "step": 13856 + }, + { + "epoch": 0.68, + "grad_norm": 0.5671773552894592, + "learning_rate": 0.0005272835228758401, + "loss": 2.8949, + "step": 13857 + }, + { + "epoch": 0.68, + "grad_norm": 0.5112889409065247, + "learning_rate": 0.0005272734733986411, + "loss": 3.0336, + "step": 13858 + }, + { + "epoch": 0.68, + "grad_norm": 0.5109499096870422, + "learning_rate": 0.0005272634233228476, + "loss": 3.3543, + "step": 13859 + }, + { + "epoch": 0.68, + "grad_norm": 0.5124879479408264, + "learning_rate": 0.000527253372648486, + "loss": 3.2364, + "step": 13860 + }, + { + "epoch": 0.68, + "grad_norm": 0.5452317595481873, + "learning_rate": 0.0005272433213755827, + "loss": 3.1102, + "step": 13861 + }, + { + "epoch": 0.68, + "grad_norm": 0.5369084477424622, + "learning_rate": 0.0005272332695041642, + "loss": 3.2783, + "step": 13862 + }, + { + "epoch": 0.68, + "grad_norm": 0.5925998091697693, + "learning_rate": 0.0005272232170342569, + "loss": 3.0603, + "step": 13863 + }, + { + "epoch": 0.68, + "grad_norm": 0.5368659496307373, + "learning_rate": 0.0005272131639658876, + "loss": 3.1947, + "step": 13864 + }, + { + "epoch": 0.68, + "grad_norm": 0.516013503074646, + "learning_rate": 0.0005272031102990824, + "loss": 3.3695, + "step": 13865 + }, + { + "epoch": 0.68, + "grad_norm": 0.5412189960479736, + "learning_rate": 0.000527193056033868, + "loss": 2.8892, + "step": 13866 + }, + { + "epoch": 0.68, + "grad_norm": 0.5142117142677307, + "learning_rate": 0.0005271830011702708, + "loss": 3.292, + "step": 13867 + }, + { + "epoch": 0.68, + "grad_norm": 0.4763852059841156, + "learning_rate": 0.0005271729457083173, + "loss": 3.2973, + "step": 13868 + }, + { + "epoch": 0.68, + "grad_norm": 0.5418897271156311, + "learning_rate": 0.000527162889648034, + "loss": 3.2097, + "step": 13869 + }, + { + "epoch": 0.68, + "grad_norm": 0.5203564763069153, + "learning_rate": 0.0005271528329894473, + "loss": 3.0707, + "step": 13870 + }, + { + "epoch": 0.68, + "grad_norm": 0.5115311741828918, + "learning_rate": 0.0005271427757325839, + "loss": 3.2188, + "step": 13871 + }, + { + "epoch": 0.68, + "grad_norm": 0.5365582704544067, + "learning_rate": 0.00052713271787747, + "loss": 3.2721, + "step": 13872 + }, + { + "epoch": 0.68, + "grad_norm": 0.529913067817688, + "learning_rate": 0.0005271226594241323, + "loss": 3.2094, + "step": 13873 + }, + { + "epoch": 0.68, + "grad_norm": 0.5000056028366089, + "learning_rate": 0.0005271126003725972, + "loss": 2.963, + "step": 13874 + }, + { + "epoch": 0.68, + "grad_norm": 0.5235258936882019, + "learning_rate": 0.0005271025407228912, + "loss": 3.294, + "step": 13875 + }, + { + "epoch": 0.68, + "grad_norm": 0.5482226610183716, + "learning_rate": 0.0005270924804750408, + "loss": 3.2941, + "step": 13876 + }, + { + "epoch": 0.68, + "grad_norm": 0.5100428462028503, + "learning_rate": 0.0005270824196290726, + "loss": 3.2611, + "step": 13877 + }, + { + "epoch": 0.68, + "grad_norm": 0.5015908479690552, + "learning_rate": 0.000527072358185013, + "loss": 3.1427, + "step": 13878 + }, + { + "epoch": 0.68, + "grad_norm": 0.5557044744491577, + "learning_rate": 0.0005270622961428883, + "loss": 3.1989, + "step": 13879 + }, + { + "epoch": 0.68, + "grad_norm": 0.529284656047821, + "learning_rate": 0.0005270522335027255, + "loss": 3.1541, + "step": 13880 + }, + { + "epoch": 0.68, + "grad_norm": 0.5141466856002808, + "learning_rate": 0.0005270421702645506, + "loss": 3.3865, + "step": 13881 + }, + { + "epoch": 0.68, + "grad_norm": 0.5499520897865295, + "learning_rate": 0.0005270321064283905, + "loss": 3.1756, + "step": 13882 + }, + { + "epoch": 0.68, + "grad_norm": 0.5323184728622437, + "learning_rate": 0.0005270220419942714, + "loss": 3.2529, + "step": 13883 + }, + { + "epoch": 0.68, + "grad_norm": 0.5083839893341064, + "learning_rate": 0.00052701197696222, + "loss": 3.2344, + "step": 13884 + }, + { + "epoch": 0.68, + "grad_norm": 0.5136963129043579, + "learning_rate": 0.0005270019113322626, + "loss": 3.2175, + "step": 13885 + }, + { + "epoch": 0.68, + "grad_norm": 0.5176054239273071, + "learning_rate": 0.000526991845104426, + "loss": 3.0358, + "step": 13886 + }, + { + "epoch": 0.68, + "grad_norm": 0.49073687195777893, + "learning_rate": 0.0005269817782787365, + "loss": 3.1841, + "step": 13887 + }, + { + "epoch": 0.68, + "grad_norm": 0.495728999376297, + "learning_rate": 0.0005269717108552208, + "loss": 3.3771, + "step": 13888 + }, + { + "epoch": 0.68, + "grad_norm": 0.5247365832328796, + "learning_rate": 0.0005269616428339052, + "loss": 3.1361, + "step": 13889 + }, + { + "epoch": 0.68, + "grad_norm": 0.5086815357208252, + "learning_rate": 0.0005269515742148163, + "loss": 3.1449, + "step": 13890 + }, + { + "epoch": 0.68, + "grad_norm": 0.5526536703109741, + "learning_rate": 0.0005269415049979807, + "loss": 3.1725, + "step": 13891 + }, + { + "epoch": 0.68, + "grad_norm": 0.5365892052650452, + "learning_rate": 0.0005269314351834247, + "loss": 3.2791, + "step": 13892 + }, + { + "epoch": 0.68, + "grad_norm": 0.5318396687507629, + "learning_rate": 0.000526921364771175, + "loss": 3.0982, + "step": 13893 + }, + { + "epoch": 0.68, + "grad_norm": 0.5801922082901001, + "learning_rate": 0.0005269112937612582, + "loss": 3.0354, + "step": 13894 + }, + { + "epoch": 0.68, + "grad_norm": 0.4854373335838318, + "learning_rate": 0.0005269012221537008, + "loss": 3.0311, + "step": 13895 + }, + { + "epoch": 0.68, + "grad_norm": 0.5157737731933594, + "learning_rate": 0.0005268911499485291, + "loss": 3.2959, + "step": 13896 + }, + { + "epoch": 0.68, + "grad_norm": 0.5271645784378052, + "learning_rate": 0.0005268810771457698, + "loss": 3.4125, + "step": 13897 + }, + { + "epoch": 0.68, + "grad_norm": 0.5193085074424744, + "learning_rate": 0.0005268710037454494, + "loss": 3.1424, + "step": 13898 + }, + { + "epoch": 0.68, + "grad_norm": 0.5354040861129761, + "learning_rate": 0.0005268609297475944, + "loss": 3.0197, + "step": 13899 + }, + { + "epoch": 0.68, + "grad_norm": 0.5036852955818176, + "learning_rate": 0.0005268508551522315, + "loss": 3.3795, + "step": 13900 + }, + { + "epoch": 0.68, + "grad_norm": 0.527779757976532, + "learning_rate": 0.000526840779959387, + "loss": 3.1321, + "step": 13901 + }, + { + "epoch": 0.68, + "grad_norm": 0.5030642747879028, + "learning_rate": 0.0005268307041690877, + "loss": 3.3091, + "step": 13902 + }, + { + "epoch": 0.68, + "grad_norm": 0.4996885061264038, + "learning_rate": 0.0005268206277813598, + "loss": 3.2489, + "step": 13903 + }, + { + "epoch": 0.68, + "grad_norm": 0.5028959512710571, + "learning_rate": 0.0005268105507962301, + "loss": 3.2326, + "step": 13904 + }, + { + "epoch": 0.68, + "grad_norm": 0.49086108803749084, + "learning_rate": 0.000526800473213725, + "loss": 3.3486, + "step": 13905 + }, + { + "epoch": 0.68, + "grad_norm": 0.508350133895874, + "learning_rate": 0.0005267903950338711, + "loss": 3.2829, + "step": 13906 + }, + { + "epoch": 0.68, + "grad_norm": 0.54703289270401, + "learning_rate": 0.000526780316256695, + "loss": 3.3257, + "step": 13907 + }, + { + "epoch": 0.68, + "grad_norm": 0.5806416273117065, + "learning_rate": 0.0005267702368822232, + "loss": 3.2072, + "step": 13908 + }, + { + "epoch": 0.68, + "grad_norm": 0.4797351062297821, + "learning_rate": 0.0005267601569104823, + "loss": 3.5585, + "step": 13909 + }, + { + "epoch": 0.68, + "grad_norm": 0.5201489329338074, + "learning_rate": 0.0005267500763414986, + "loss": 3.0275, + "step": 13910 + }, + { + "epoch": 0.68, + "grad_norm": 0.5400049686431885, + "learning_rate": 0.000526739995175299, + "loss": 3.2039, + "step": 13911 + }, + { + "epoch": 0.68, + "grad_norm": 0.5291928052902222, + "learning_rate": 0.0005267299134119098, + "loss": 3.1484, + "step": 13912 + }, + { + "epoch": 0.68, + "grad_norm": 0.49172693490982056, + "learning_rate": 0.0005267198310513577, + "loss": 3.0451, + "step": 13913 + }, + { + "epoch": 0.68, + "grad_norm": 0.5199759006500244, + "learning_rate": 0.0005267097480936691, + "loss": 3.1473, + "step": 13914 + }, + { + "epoch": 0.68, + "grad_norm": 0.5097964406013489, + "learning_rate": 0.0005266996645388708, + "loss": 3.0738, + "step": 13915 + }, + { + "epoch": 0.68, + "grad_norm": 0.5464935898780823, + "learning_rate": 0.0005266895803869891, + "loss": 3.2276, + "step": 13916 + }, + { + "epoch": 0.68, + "grad_norm": 0.4840008616447449, + "learning_rate": 0.0005266794956380507, + "loss": 3.1427, + "step": 13917 + }, + { + "epoch": 0.68, + "grad_norm": 0.5694568753242493, + "learning_rate": 0.0005266694102920822, + "loss": 3.1599, + "step": 13918 + }, + { + "epoch": 0.68, + "grad_norm": 0.5026246905326843, + "learning_rate": 0.00052665932434911, + "loss": 3.2966, + "step": 13919 + }, + { + "epoch": 0.68, + "grad_norm": 0.5052033066749573, + "learning_rate": 0.0005266492378091608, + "loss": 3.3566, + "step": 13920 + }, + { + "epoch": 0.68, + "grad_norm": 0.5257059931755066, + "learning_rate": 0.0005266391506722611, + "loss": 3.1304, + "step": 13921 + }, + { + "epoch": 0.68, + "grad_norm": 0.5384652018547058, + "learning_rate": 0.0005266290629384376, + "loss": 3.2555, + "step": 13922 + }, + { + "epoch": 0.68, + "grad_norm": 0.5102568864822388, + "learning_rate": 0.0005266189746077167, + "loss": 3.2415, + "step": 13923 + }, + { + "epoch": 0.68, + "grad_norm": 0.5261817574501038, + "learning_rate": 0.000526608885680125, + "loss": 3.3374, + "step": 13924 + }, + { + "epoch": 0.68, + "grad_norm": 0.4975242018699646, + "learning_rate": 0.0005265987961556891, + "loss": 3.2718, + "step": 13925 + }, + { + "epoch": 0.68, + "grad_norm": 0.5032918453216553, + "learning_rate": 0.0005265887060344356, + "loss": 3.2327, + "step": 13926 + }, + { + "epoch": 0.68, + "grad_norm": 0.6491032242774963, + "learning_rate": 0.000526578615316391, + "loss": 3.183, + "step": 13927 + }, + { + "epoch": 0.68, + "grad_norm": 0.5199459791183472, + "learning_rate": 0.0005265685240015821, + "loss": 3.2972, + "step": 13928 + }, + { + "epoch": 0.68, + "grad_norm": 0.5117924809455872, + "learning_rate": 0.0005265584320900353, + "loss": 3.1965, + "step": 13929 + }, + { + "epoch": 0.68, + "grad_norm": 0.5264638662338257, + "learning_rate": 0.000526548339581777, + "loss": 3.0494, + "step": 13930 + }, + { + "epoch": 0.68, + "grad_norm": 0.5094579458236694, + "learning_rate": 0.0005265382464768341, + "loss": 3.18, + "step": 13931 + }, + { + "epoch": 0.68, + "grad_norm": 0.4896301329135895, + "learning_rate": 0.0005265281527752331, + "loss": 3.3233, + "step": 13932 + }, + { + "epoch": 0.68, + "grad_norm": 0.4931809604167938, + "learning_rate": 0.0005265180584770004, + "loss": 3.3084, + "step": 13933 + }, + { + "epoch": 0.68, + "grad_norm": 0.5263796448707581, + "learning_rate": 0.0005265079635821628, + "loss": 3.3991, + "step": 13934 + }, + { + "epoch": 0.68, + "grad_norm": 0.5130126476287842, + "learning_rate": 0.0005264978680907469, + "loss": 3.2796, + "step": 13935 + }, + { + "epoch": 0.68, + "grad_norm": 0.5254062414169312, + "learning_rate": 0.000526487772002779, + "loss": 3.2386, + "step": 13936 + }, + { + "epoch": 0.68, + "grad_norm": 0.5328313708305359, + "learning_rate": 0.0005264776753182861, + "loss": 3.0323, + "step": 13937 + }, + { + "epoch": 0.68, + "grad_norm": 0.4820843040943146, + "learning_rate": 0.0005264675780372945, + "loss": 3.3463, + "step": 13938 + }, + { + "epoch": 0.68, + "grad_norm": 0.5069538950920105, + "learning_rate": 0.0005264574801598309, + "loss": 3.1807, + "step": 13939 + }, + { + "epoch": 0.68, + "grad_norm": 0.5414431691169739, + "learning_rate": 0.0005264473816859219, + "loss": 3.4286, + "step": 13940 + }, + { + "epoch": 0.68, + "grad_norm": 0.5100494623184204, + "learning_rate": 0.0005264372826155941, + "loss": 3.1482, + "step": 13941 + }, + { + "epoch": 0.68, + "grad_norm": 0.5338406562805176, + "learning_rate": 0.000526427182948874, + "loss": 3.3412, + "step": 13942 + }, + { + "epoch": 0.68, + "grad_norm": 0.48375827074050903, + "learning_rate": 0.0005264170826857883, + "loss": 3.1934, + "step": 13943 + }, + { + "epoch": 0.68, + "grad_norm": 0.5023521184921265, + "learning_rate": 0.0005264069818263636, + "loss": 3.1973, + "step": 13944 + }, + { + "epoch": 0.68, + "grad_norm": 0.5352455377578735, + "learning_rate": 0.0005263968803706265, + "loss": 3.0357, + "step": 13945 + }, + { + "epoch": 0.68, + "grad_norm": 0.4851732552051544, + "learning_rate": 0.0005263867783186036, + "loss": 3.5018, + "step": 13946 + }, + { + "epoch": 0.68, + "grad_norm": 0.5262126326560974, + "learning_rate": 0.0005263766756703213, + "loss": 3.0649, + "step": 13947 + }, + { + "epoch": 0.68, + "grad_norm": 0.51038658618927, + "learning_rate": 0.0005263665724258066, + "loss": 3.3052, + "step": 13948 + }, + { + "epoch": 0.68, + "grad_norm": 0.49130168557167053, + "learning_rate": 0.000526356468585086, + "loss": 3.0464, + "step": 13949 + }, + { + "epoch": 0.68, + "grad_norm": 0.525030791759491, + "learning_rate": 0.0005263463641481858, + "loss": 3.4954, + "step": 13950 + }, + { + "epoch": 0.68, + "grad_norm": 0.5531750917434692, + "learning_rate": 0.000526336259115133, + "loss": 3.3464, + "step": 13951 + }, + { + "epoch": 0.68, + "grad_norm": 0.5086334347724915, + "learning_rate": 0.0005263261534859539, + "loss": 3.421, + "step": 13952 + }, + { + "epoch": 0.68, + "grad_norm": 0.5100414156913757, + "learning_rate": 0.0005263160472606754, + "loss": 3.261, + "step": 13953 + }, + { + "epoch": 0.68, + "grad_norm": 0.5146250128746033, + "learning_rate": 0.0005263059404393239, + "loss": 3.0504, + "step": 13954 + }, + { + "epoch": 0.68, + "grad_norm": 0.5014898180961609, + "learning_rate": 0.0005262958330219262, + "loss": 3.3744, + "step": 13955 + }, + { + "epoch": 0.68, + "grad_norm": 0.5280792117118835, + "learning_rate": 0.0005262857250085088, + "loss": 3.1879, + "step": 13956 + }, + { + "epoch": 0.68, + "grad_norm": 0.48315441608428955, + "learning_rate": 0.0005262756163990982, + "loss": 3.1047, + "step": 13957 + }, + { + "epoch": 0.68, + "grad_norm": 0.508493959903717, + "learning_rate": 0.0005262655071937214, + "loss": 3.3101, + "step": 13958 + }, + { + "epoch": 0.68, + "grad_norm": 0.5163034796714783, + "learning_rate": 0.0005262553973924047, + "loss": 3.4717, + "step": 13959 + }, + { + "epoch": 0.68, + "grad_norm": 0.5181001424789429, + "learning_rate": 0.0005262452869951749, + "loss": 3.2164, + "step": 13960 + }, + { + "epoch": 0.68, + "grad_norm": 0.526516854763031, + "learning_rate": 0.0005262351760020584, + "loss": 3.2027, + "step": 13961 + }, + { + "epoch": 0.68, + "grad_norm": 0.5265016555786133, + "learning_rate": 0.000526225064413082, + "loss": 3.2067, + "step": 13962 + }, + { + "epoch": 0.68, + "grad_norm": 0.508037269115448, + "learning_rate": 0.0005262149522282724, + "loss": 3.2605, + "step": 13963 + }, + { + "epoch": 0.68, + "grad_norm": 0.5375155806541443, + "learning_rate": 0.0005262048394476562, + "loss": 3.3783, + "step": 13964 + }, + { + "epoch": 0.68, + "grad_norm": 0.5235868096351624, + "learning_rate": 0.0005261947260712599, + "loss": 3.3954, + "step": 13965 + }, + { + "epoch": 0.68, + "grad_norm": 0.5014271140098572, + "learning_rate": 0.0005261846120991103, + "loss": 3.2368, + "step": 13966 + }, + { + "epoch": 0.68, + "grad_norm": 0.5021702647209167, + "learning_rate": 0.0005261744975312339, + "loss": 3.1191, + "step": 13967 + }, + { + "epoch": 0.68, + "grad_norm": 0.5597658753395081, + "learning_rate": 0.0005261643823676574, + "loss": 3.0137, + "step": 13968 + }, + { + "epoch": 0.68, + "grad_norm": 0.4978190064430237, + "learning_rate": 0.0005261542666084074, + "loss": 3.3998, + "step": 13969 + }, + { + "epoch": 0.68, + "grad_norm": 0.5252299308776855, + "learning_rate": 0.0005261441502535107, + "loss": 3.0559, + "step": 13970 + }, + { + "epoch": 0.68, + "grad_norm": 0.5657221674919128, + "learning_rate": 0.0005261340333029938, + "loss": 3.1933, + "step": 13971 + }, + { + "epoch": 0.68, + "grad_norm": 0.5250703692436218, + "learning_rate": 0.0005261239157568833, + "loss": 3.3627, + "step": 13972 + }, + { + "epoch": 0.68, + "grad_norm": 0.5377254486083984, + "learning_rate": 0.0005261137976152059, + "loss": 3.1583, + "step": 13973 + }, + { + "epoch": 0.68, + "grad_norm": 0.49435439705848694, + "learning_rate": 0.0005261036788779884, + "loss": 3.2259, + "step": 13974 + }, + { + "epoch": 0.68, + "grad_norm": 0.5244966149330139, + "learning_rate": 0.0005260935595452573, + "loss": 3.2558, + "step": 13975 + }, + { + "epoch": 0.68, + "grad_norm": 0.4984242916107178, + "learning_rate": 0.0005260834396170393, + "loss": 3.1271, + "step": 13976 + }, + { + "epoch": 0.68, + "grad_norm": 0.5036216974258423, + "learning_rate": 0.000526073319093361, + "loss": 3.2616, + "step": 13977 + }, + { + "epoch": 0.69, + "grad_norm": 0.5375218987464905, + "learning_rate": 0.000526063197974249, + "loss": 3.0446, + "step": 13978 + }, + { + "epoch": 0.69, + "grad_norm": 0.5373899936676025, + "learning_rate": 0.00052605307625973, + "loss": 3.123, + "step": 13979 + }, + { + "epoch": 0.69, + "grad_norm": 0.5408071279525757, + "learning_rate": 0.0005260429539498308, + "loss": 3.0805, + "step": 13980 + }, + { + "epoch": 0.69, + "grad_norm": 0.5188453197479248, + "learning_rate": 0.000526032831044578, + "loss": 3.1694, + "step": 13981 + }, + { + "epoch": 0.69, + "grad_norm": 0.5271825194358826, + "learning_rate": 0.0005260227075439983, + "loss": 3.1811, + "step": 13982 + }, + { + "epoch": 0.69, + "grad_norm": 0.47875532507896423, + "learning_rate": 0.0005260125834481181, + "loss": 3.494, + "step": 13983 + }, + { + "epoch": 0.69, + "grad_norm": 0.5747538208961487, + "learning_rate": 0.0005260024587569644, + "loss": 3.1184, + "step": 13984 + }, + { + "epoch": 0.69, + "grad_norm": 0.5151486992835999, + "learning_rate": 0.0005259923334705636, + "loss": 2.9681, + "step": 13985 + }, + { + "epoch": 0.69, + "grad_norm": 0.534481406211853, + "learning_rate": 0.0005259822075889426, + "loss": 3.2856, + "step": 13986 + }, + { + "epoch": 0.69, + "grad_norm": 0.51065993309021, + "learning_rate": 0.0005259720811121278, + "loss": 3.1268, + "step": 13987 + }, + { + "epoch": 0.69, + "grad_norm": 0.5007278323173523, + "learning_rate": 0.0005259619540401462, + "loss": 3.0136, + "step": 13988 + }, + { + "epoch": 0.69, + "grad_norm": 0.5178660154342651, + "learning_rate": 0.0005259518263730242, + "loss": 3.1685, + "step": 13989 + }, + { + "epoch": 0.69, + "grad_norm": 0.5047518610954285, + "learning_rate": 0.0005259416981107886, + "loss": 3.2531, + "step": 13990 + }, + { + "epoch": 0.69, + "grad_norm": 0.5168188810348511, + "learning_rate": 0.000525931569253466, + "loss": 3.3957, + "step": 13991 + }, + { + "epoch": 0.69, + "grad_norm": 0.5354238152503967, + "learning_rate": 0.0005259214398010833, + "loss": 3.3066, + "step": 13992 + }, + { + "epoch": 0.69, + "grad_norm": 0.5306437611579895, + "learning_rate": 0.0005259113097536667, + "loss": 3.2913, + "step": 13993 + }, + { + "epoch": 0.69, + "grad_norm": 0.5510126948356628, + "learning_rate": 0.0005259011791112435, + "loss": 3.0978, + "step": 13994 + }, + { + "epoch": 0.69, + "grad_norm": 0.5774776339530945, + "learning_rate": 0.00052589104787384, + "loss": 3.133, + "step": 13995 + }, + { + "epoch": 0.69, + "grad_norm": 0.5207593441009521, + "learning_rate": 0.0005258809160414827, + "loss": 3.1222, + "step": 13996 + }, + { + "epoch": 0.69, + "grad_norm": 0.5279645919799805, + "learning_rate": 0.0005258707836141989, + "loss": 3.2507, + "step": 13997 + }, + { + "epoch": 0.69, + "grad_norm": 0.5136027336120605, + "learning_rate": 0.0005258606505920147, + "loss": 3.4416, + "step": 13998 + }, + { + "epoch": 0.69, + "grad_norm": 0.49309471249580383, + "learning_rate": 0.0005258505169749571, + "loss": 3.1352, + "step": 13999 + }, + { + "epoch": 0.69, + "grad_norm": 0.5222823023796082, + "learning_rate": 0.0005258403827630527, + "loss": 3.3458, + "step": 14000 + }, + { + "epoch": 0.69, + "grad_norm": 0.4934551417827606, + "learning_rate": 0.0005258302479563282, + "loss": 3.129, + "step": 14001 + }, + { + "epoch": 0.69, + "grad_norm": 0.5131190419197083, + "learning_rate": 0.0005258201125548103, + "loss": 3.2386, + "step": 14002 + }, + { + "epoch": 0.69, + "grad_norm": 0.6161550879478455, + "learning_rate": 0.0005258099765585256, + "loss": 3.0293, + "step": 14003 + }, + { + "epoch": 0.69, + "grad_norm": 0.5266488194465637, + "learning_rate": 0.0005257998399675009, + "loss": 3.2425, + "step": 14004 + }, + { + "epoch": 0.69, + "grad_norm": 0.5947927832603455, + "learning_rate": 0.0005257897027817629, + "loss": 3.2695, + "step": 14005 + }, + { + "epoch": 0.69, + "grad_norm": 0.5191298723220825, + "learning_rate": 0.0005257795650013382, + "loss": 3.0016, + "step": 14006 + }, + { + "epoch": 0.69, + "grad_norm": 0.5333731770515442, + "learning_rate": 0.0005257694266262537, + "loss": 3.1303, + "step": 14007 + }, + { + "epoch": 0.69, + "grad_norm": 0.5200604200363159, + "learning_rate": 0.000525759287656536, + "loss": 3.3124, + "step": 14008 + }, + { + "epoch": 0.69, + "grad_norm": 0.5588985681533813, + "learning_rate": 0.0005257491480922117, + "loss": 3.262, + "step": 14009 + }, + { + "epoch": 0.69, + "grad_norm": 0.49545398354530334, + "learning_rate": 0.0005257390079333077, + "loss": 3.2407, + "step": 14010 + }, + { + "epoch": 0.69, + "grad_norm": 0.550879180431366, + "learning_rate": 0.0005257288671798504, + "loss": 3.0096, + "step": 14011 + }, + { + "epoch": 0.69, + "grad_norm": 0.5142801403999329, + "learning_rate": 0.0005257187258318668, + "loss": 3.261, + "step": 14012 + }, + { + "epoch": 0.69, + "grad_norm": 0.5323050022125244, + "learning_rate": 0.0005257085838893836, + "loss": 3.0495, + "step": 14013 + }, + { + "epoch": 0.69, + "grad_norm": 0.5185598731040955, + "learning_rate": 0.0005256984413524274, + "loss": 3.1516, + "step": 14014 + }, + { + "epoch": 0.69, + "grad_norm": 0.5257108807563782, + "learning_rate": 0.0005256882982210249, + "loss": 3.1726, + "step": 14015 + }, + { + "epoch": 0.69, + "grad_norm": 0.5254387259483337, + "learning_rate": 0.0005256781544952028, + "loss": 3.2881, + "step": 14016 + }, + { + "epoch": 0.69, + "grad_norm": 0.514013409614563, + "learning_rate": 0.000525668010174988, + "loss": 3.06, + "step": 14017 + }, + { + "epoch": 0.69, + "grad_norm": 0.5110748410224915, + "learning_rate": 0.000525657865260407, + "loss": 3.283, + "step": 14018 + }, + { + "epoch": 0.69, + "grad_norm": 0.5237401127815247, + "learning_rate": 0.0005256477197514866, + "loss": 3.1188, + "step": 14019 + }, + { + "epoch": 0.69, + "grad_norm": 0.4896359145641327, + "learning_rate": 0.0005256375736482536, + "loss": 3.2941, + "step": 14020 + }, + { + "epoch": 0.69, + "grad_norm": 0.5119338035583496, + "learning_rate": 0.0005256274269507346, + "loss": 3.1067, + "step": 14021 + }, + { + "epoch": 0.69, + "grad_norm": 0.5138943791389465, + "learning_rate": 0.0005256172796589564, + "loss": 3.2943, + "step": 14022 + }, + { + "epoch": 0.69, + "grad_norm": 0.5650306940078735, + "learning_rate": 0.0005256071317729457, + "loss": 3.0597, + "step": 14023 + }, + { + "epoch": 0.69, + "grad_norm": 0.5293331146240234, + "learning_rate": 0.0005255969832927293, + "loss": 3.2667, + "step": 14024 + }, + { + "epoch": 0.69, + "grad_norm": 0.5199422836303711, + "learning_rate": 0.0005255868342183338, + "loss": 3.1008, + "step": 14025 + }, + { + "epoch": 0.69, + "grad_norm": 0.53669273853302, + "learning_rate": 0.000525576684549786, + "loss": 3.1676, + "step": 14026 + }, + { + "epoch": 0.69, + "grad_norm": 0.5880951285362244, + "learning_rate": 0.0005255665342871126, + "loss": 3.3137, + "step": 14027 + }, + { + "epoch": 0.69, + "grad_norm": 0.5066351890563965, + "learning_rate": 0.0005255563834303404, + "loss": 3.1104, + "step": 14028 + }, + { + "epoch": 0.69, + "grad_norm": 0.495430052280426, + "learning_rate": 0.000525546231979496, + "loss": 3.0237, + "step": 14029 + }, + { + "epoch": 0.69, + "grad_norm": 0.47963449358940125, + "learning_rate": 0.0005255360799346064, + "loss": 3.1957, + "step": 14030 + }, + { + "epoch": 0.69, + "grad_norm": 0.48525768518447876, + "learning_rate": 0.0005255259272956981, + "loss": 3.1917, + "step": 14031 + }, + { + "epoch": 0.69, + "grad_norm": 0.5137525796890259, + "learning_rate": 0.000525515774062798, + "loss": 2.9276, + "step": 14032 + }, + { + "epoch": 0.69, + "grad_norm": 0.4915049076080322, + "learning_rate": 0.0005255056202359325, + "loss": 2.9671, + "step": 14033 + }, + { + "epoch": 0.69, + "grad_norm": 0.5103715062141418, + "learning_rate": 0.0005254954658151288, + "loss": 3.0266, + "step": 14034 + }, + { + "epoch": 0.69, + "grad_norm": 0.5029316544532776, + "learning_rate": 0.0005254853108004134, + "loss": 3.164, + "step": 14035 + }, + { + "epoch": 0.69, + "grad_norm": 0.5544018149375916, + "learning_rate": 0.0005254751551918131, + "loss": 3.2264, + "step": 14036 + }, + { + "epoch": 0.69, + "grad_norm": 0.5042638182640076, + "learning_rate": 0.0005254649989893547, + "loss": 2.8606, + "step": 14037 + }, + { + "epoch": 0.69, + "grad_norm": 0.49873411655426025, + "learning_rate": 0.0005254548421930647, + "loss": 3.2909, + "step": 14038 + }, + { + "epoch": 0.69, + "grad_norm": 0.5256275534629822, + "learning_rate": 0.0005254446848029701, + "loss": 3.1487, + "step": 14039 + }, + { + "epoch": 0.69, + "grad_norm": 0.5733166337013245, + "learning_rate": 0.0005254345268190976, + "loss": 2.9514, + "step": 14040 + }, + { + "epoch": 0.69, + "grad_norm": 0.4876699447631836, + "learning_rate": 0.000525424368241474, + "loss": 3.1855, + "step": 14041 + }, + { + "epoch": 0.69, + "grad_norm": 0.5447889566421509, + "learning_rate": 0.000525414209070126, + "loss": 3.2617, + "step": 14042 + }, + { + "epoch": 0.69, + "grad_norm": 0.48302027583122253, + "learning_rate": 0.0005254040493050802, + "loss": 3.1729, + "step": 14043 + }, + { + "epoch": 0.69, + "grad_norm": 0.4758540093898773, + "learning_rate": 0.0005253938889463638, + "loss": 3.3485, + "step": 14044 + }, + { + "epoch": 0.69, + "grad_norm": 0.489118754863739, + "learning_rate": 0.000525383727994003, + "loss": 3.2733, + "step": 14045 + }, + { + "epoch": 0.69, + "grad_norm": 0.5639674067497253, + "learning_rate": 0.000525373566448025, + "loss": 3.0852, + "step": 14046 + }, + { + "epoch": 0.69, + "grad_norm": 0.5075635313987732, + "learning_rate": 0.0005253634043084562, + "loss": 3.3726, + "step": 14047 + }, + { + "epoch": 0.69, + "grad_norm": 0.5024601817131042, + "learning_rate": 0.0005253532415753238, + "loss": 3.2864, + "step": 14048 + }, + { + "epoch": 0.69, + "grad_norm": 0.5034321546554565, + "learning_rate": 0.0005253430782486542, + "loss": 3.3487, + "step": 14049 + }, + { + "epoch": 0.69, + "grad_norm": 0.5592278242111206, + "learning_rate": 0.0005253329143284744, + "loss": 3.247, + "step": 14050 + }, + { + "epoch": 0.69, + "grad_norm": 0.5167191028594971, + "learning_rate": 0.000525322749814811, + "loss": 3.0507, + "step": 14051 + }, + { + "epoch": 0.69, + "grad_norm": 0.49541527032852173, + "learning_rate": 0.0005253125847076908, + "loss": 3.1837, + "step": 14052 + }, + { + "epoch": 0.69, + "grad_norm": 0.5572511553764343, + "learning_rate": 0.0005253024190071407, + "loss": 3.1698, + "step": 14053 + }, + { + "epoch": 0.69, + "grad_norm": 0.47061023116111755, + "learning_rate": 0.0005252922527131873, + "loss": 3.1254, + "step": 14054 + }, + { + "epoch": 0.69, + "grad_norm": 0.5373715162277222, + "learning_rate": 0.0005252820858258576, + "loss": 3.251, + "step": 14055 + }, + { + "epoch": 0.69, + "grad_norm": 0.5261591672897339, + "learning_rate": 0.000525271918345178, + "loss": 3.2149, + "step": 14056 + }, + { + "epoch": 0.69, + "grad_norm": 0.525307834148407, + "learning_rate": 0.0005252617502711758, + "loss": 3.1451, + "step": 14057 + }, + { + "epoch": 0.69, + "grad_norm": 0.5310854315757751, + "learning_rate": 0.0005252515816038773, + "loss": 3.1049, + "step": 14058 + }, + { + "epoch": 0.69, + "grad_norm": 0.5353348255157471, + "learning_rate": 0.0005252414123433095, + "loss": 3.3399, + "step": 14059 + }, + { + "epoch": 0.69, + "grad_norm": 0.5103008151054382, + "learning_rate": 0.0005252312424894993, + "loss": 3.5181, + "step": 14060 + }, + { + "epoch": 0.69, + "grad_norm": 0.5458347201347351, + "learning_rate": 0.0005252210720424733, + "loss": 3.3634, + "step": 14061 + }, + { + "epoch": 0.69, + "grad_norm": 0.4898107647895813, + "learning_rate": 0.0005252109010022583, + "loss": 3.2373, + "step": 14062 + }, + { + "epoch": 0.69, + "grad_norm": 0.5220617055892944, + "learning_rate": 0.0005252007293688812, + "loss": 3.1624, + "step": 14063 + }, + { + "epoch": 0.69, + "grad_norm": 0.5146247148513794, + "learning_rate": 0.0005251905571423686, + "loss": 3.045, + "step": 14064 + }, + { + "epoch": 0.69, + "grad_norm": 0.5100995302200317, + "learning_rate": 0.0005251803843227475, + "loss": 3.2578, + "step": 14065 + }, + { + "epoch": 0.69, + "grad_norm": 0.490809828042984, + "learning_rate": 0.0005251702109100447, + "loss": 3.16, + "step": 14066 + }, + { + "epoch": 0.69, + "grad_norm": 0.5795995593070984, + "learning_rate": 0.0005251600369042868, + "loss": 3.0737, + "step": 14067 + }, + { + "epoch": 0.69, + "grad_norm": 0.5179703831672668, + "learning_rate": 0.0005251498623055006, + "loss": 3.2451, + "step": 14068 + }, + { + "epoch": 0.69, + "grad_norm": 0.518638014793396, + "learning_rate": 0.0005251396871137131, + "loss": 3.3138, + "step": 14069 + }, + { + "epoch": 0.69, + "grad_norm": 0.49439534544944763, + "learning_rate": 0.000525129511328951, + "loss": 3.1177, + "step": 14070 + }, + { + "epoch": 0.69, + "grad_norm": 0.5000263452529907, + "learning_rate": 0.0005251193349512411, + "loss": 3.3391, + "step": 14071 + }, + { + "epoch": 0.69, + "grad_norm": 0.513812780380249, + "learning_rate": 0.0005251091579806101, + "loss": 3.2079, + "step": 14072 + }, + { + "epoch": 0.69, + "grad_norm": 0.5052744746208191, + "learning_rate": 0.000525098980417085, + "loss": 2.9954, + "step": 14073 + }, + { + "epoch": 0.69, + "grad_norm": 0.48377129435539246, + "learning_rate": 0.0005250888022606925, + "loss": 3.0341, + "step": 14074 + }, + { + "epoch": 0.69, + "grad_norm": 0.5143635869026184, + "learning_rate": 0.0005250786235114594, + "loss": 3.0281, + "step": 14075 + }, + { + "epoch": 0.69, + "grad_norm": 0.5064332485198975, + "learning_rate": 0.0005250684441694125, + "loss": 3.2199, + "step": 14076 + }, + { + "epoch": 0.69, + "grad_norm": 0.5439924001693726, + "learning_rate": 0.0005250582642345786, + "loss": 3.0897, + "step": 14077 + }, + { + "epoch": 0.69, + "grad_norm": 0.50617915391922, + "learning_rate": 0.0005250480837069846, + "loss": 3.278, + "step": 14078 + }, + { + "epoch": 0.69, + "grad_norm": 0.48605844378471375, + "learning_rate": 0.0005250379025866572, + "loss": 3.1856, + "step": 14079 + }, + { + "epoch": 0.69, + "grad_norm": 0.49857380986213684, + "learning_rate": 0.0005250277208736233, + "loss": 3.1719, + "step": 14080 + }, + { + "epoch": 0.69, + "grad_norm": 0.6834864020347595, + "learning_rate": 0.0005250175385679098, + "loss": 3.2893, + "step": 14081 + }, + { + "epoch": 0.69, + "grad_norm": 0.5134391188621521, + "learning_rate": 0.0005250073556695433, + "loss": 3.1539, + "step": 14082 + }, + { + "epoch": 0.69, + "grad_norm": 0.5330235958099365, + "learning_rate": 0.0005249971721785506, + "loss": 3.1599, + "step": 14083 + }, + { + "epoch": 0.69, + "grad_norm": 0.5358902812004089, + "learning_rate": 0.0005249869880949589, + "loss": 3.1398, + "step": 14084 + }, + { + "epoch": 0.69, + "grad_norm": 0.5489894151687622, + "learning_rate": 0.0005249768034187947, + "loss": 3.195, + "step": 14085 + }, + { + "epoch": 0.69, + "grad_norm": 0.505240261554718, + "learning_rate": 0.0005249666181500848, + "loss": 3.2595, + "step": 14086 + }, + { + "epoch": 0.69, + "grad_norm": 0.5279876589775085, + "learning_rate": 0.0005249564322888562, + "loss": 3.1531, + "step": 14087 + }, + { + "epoch": 0.69, + "grad_norm": 0.5321412682533264, + "learning_rate": 0.0005249462458351358, + "loss": 3.1561, + "step": 14088 + }, + { + "epoch": 0.69, + "grad_norm": 0.506250262260437, + "learning_rate": 0.00052493605878895, + "loss": 3.286, + "step": 14089 + }, + { + "epoch": 0.69, + "grad_norm": 0.528944730758667, + "learning_rate": 0.0005249258711503261, + "loss": 2.914, + "step": 14090 + }, + { + "epoch": 0.69, + "grad_norm": 0.5110275149345398, + "learning_rate": 0.0005249156829192906, + "loss": 3.1901, + "step": 14091 + }, + { + "epoch": 0.69, + "grad_norm": 0.4941984713077545, + "learning_rate": 0.0005249054940958707, + "loss": 3.1032, + "step": 14092 + }, + { + "epoch": 0.69, + "grad_norm": 0.5150231719017029, + "learning_rate": 0.0005248953046800928, + "loss": 3.1427, + "step": 14093 + }, + { + "epoch": 0.69, + "grad_norm": 0.4935740530490875, + "learning_rate": 0.000524885114671984, + "loss": 3.1807, + "step": 14094 + }, + { + "epoch": 0.69, + "grad_norm": 0.5428426265716553, + "learning_rate": 0.0005248749240715712, + "loss": 3.2272, + "step": 14095 + }, + { + "epoch": 0.69, + "grad_norm": 0.4966670274734497, + "learning_rate": 0.000524864732878881, + "loss": 3.1482, + "step": 14096 + }, + { + "epoch": 0.69, + "grad_norm": 0.5251358151435852, + "learning_rate": 0.0005248545410939404, + "loss": 3.2162, + "step": 14097 + }, + { + "epoch": 0.69, + "grad_norm": 0.5258474946022034, + "learning_rate": 0.0005248443487167762, + "loss": 3.1545, + "step": 14098 + }, + { + "epoch": 0.69, + "grad_norm": 0.5385447144508362, + "learning_rate": 0.0005248341557474152, + "loss": 3.1439, + "step": 14099 + }, + { + "epoch": 0.69, + "grad_norm": 0.5289866328239441, + "learning_rate": 0.0005248239621858845, + "loss": 3.294, + "step": 14100 + }, + { + "epoch": 0.69, + "grad_norm": 0.5030156373977661, + "learning_rate": 0.0005248137680322106, + "loss": 3.2187, + "step": 14101 + }, + { + "epoch": 0.69, + "grad_norm": 0.48551294207572937, + "learning_rate": 0.0005248035732864205, + "loss": 3.4024, + "step": 14102 + }, + { + "epoch": 0.69, + "grad_norm": 0.5139172673225403, + "learning_rate": 0.000524793377948541, + "loss": 3.0436, + "step": 14103 + }, + { + "epoch": 0.69, + "grad_norm": 0.529897928237915, + "learning_rate": 0.0005247831820185991, + "loss": 3.2084, + "step": 14104 + }, + { + "epoch": 0.69, + "grad_norm": 0.5231662392616272, + "learning_rate": 0.0005247729854966215, + "loss": 3.0257, + "step": 14105 + }, + { + "epoch": 0.69, + "grad_norm": 0.569053053855896, + "learning_rate": 0.0005247627883826353, + "loss": 3.2904, + "step": 14106 + }, + { + "epoch": 0.69, + "grad_norm": 0.49440523982048035, + "learning_rate": 0.0005247525906766669, + "loss": 3.3116, + "step": 14107 + }, + { + "epoch": 0.69, + "grad_norm": 0.5238437056541443, + "learning_rate": 0.0005247423923787437, + "loss": 3.3411, + "step": 14108 + }, + { + "epoch": 0.69, + "grad_norm": 0.5570396780967712, + "learning_rate": 0.0005247321934888921, + "loss": 3.2735, + "step": 14109 + }, + { + "epoch": 0.69, + "grad_norm": 0.5721766352653503, + "learning_rate": 0.0005247219940071392, + "loss": 3.2418, + "step": 14110 + }, + { + "epoch": 0.69, + "grad_norm": 0.5046465396881104, + "learning_rate": 0.0005247117939335119, + "loss": 3.1536, + "step": 14111 + }, + { + "epoch": 0.69, + "grad_norm": 0.522420346736908, + "learning_rate": 0.0005247015932680368, + "loss": 3.1374, + "step": 14112 + }, + { + "epoch": 0.69, + "grad_norm": 0.5400798320770264, + "learning_rate": 0.0005246913920107411, + "loss": 3.3477, + "step": 14113 + }, + { + "epoch": 0.69, + "grad_norm": 0.5389352440834045, + "learning_rate": 0.0005246811901616514, + "loss": 3.0954, + "step": 14114 + }, + { + "epoch": 0.69, + "grad_norm": 0.6275335550308228, + "learning_rate": 0.0005246709877207948, + "loss": 3.3585, + "step": 14115 + }, + { + "epoch": 0.69, + "grad_norm": 0.4925820827484131, + "learning_rate": 0.000524660784688198, + "loss": 3.2751, + "step": 14116 + }, + { + "epoch": 0.69, + "grad_norm": 0.5348528027534485, + "learning_rate": 0.000524650581063888, + "loss": 3.2773, + "step": 14117 + }, + { + "epoch": 0.69, + "grad_norm": 0.5362456440925598, + "learning_rate": 0.0005246403768478916, + "loss": 3.1904, + "step": 14118 + }, + { + "epoch": 0.69, + "grad_norm": 0.5220507979393005, + "learning_rate": 0.0005246301720402356, + "loss": 3.2062, + "step": 14119 + }, + { + "epoch": 0.69, + "grad_norm": 0.5358459949493408, + "learning_rate": 0.0005246199666409469, + "loss": 3.21, + "step": 14120 + }, + { + "epoch": 0.69, + "grad_norm": 0.5036259293556213, + "learning_rate": 0.0005246097606500526, + "loss": 3.1651, + "step": 14121 + }, + { + "epoch": 0.69, + "grad_norm": 0.5509597063064575, + "learning_rate": 0.0005245995540675793, + "loss": 3.1898, + "step": 14122 + }, + { + "epoch": 0.69, + "grad_norm": 0.5109611749649048, + "learning_rate": 0.000524589346893554, + "loss": 3.3053, + "step": 14123 + }, + { + "epoch": 0.69, + "grad_norm": 0.563474714756012, + "learning_rate": 0.0005245791391280037, + "loss": 3.2851, + "step": 14124 + }, + { + "epoch": 0.69, + "grad_norm": 0.49557414650917053, + "learning_rate": 0.000524568930770955, + "loss": 3.2507, + "step": 14125 + }, + { + "epoch": 0.69, + "grad_norm": 0.4812037944793701, + "learning_rate": 0.0005245587218224351, + "loss": 3.3388, + "step": 14126 + }, + { + "epoch": 0.69, + "grad_norm": 0.5291796326637268, + "learning_rate": 0.0005245485122824707, + "loss": 3.0031, + "step": 14127 + }, + { + "epoch": 0.69, + "grad_norm": 0.5801912546157837, + "learning_rate": 0.0005245383021510887, + "loss": 3.3179, + "step": 14128 + }, + { + "epoch": 0.69, + "grad_norm": 0.4930591881275177, + "learning_rate": 0.0005245280914283161, + "loss": 3.2406, + "step": 14129 + }, + { + "epoch": 0.69, + "grad_norm": 0.5279679894447327, + "learning_rate": 0.0005245178801141796, + "loss": 3.3534, + "step": 14130 + }, + { + "epoch": 0.69, + "grad_norm": 0.4926796555519104, + "learning_rate": 0.0005245076682087064, + "loss": 3.3052, + "step": 14131 + }, + { + "epoch": 0.69, + "grad_norm": 0.6082884073257446, + "learning_rate": 0.000524497455711923, + "loss": 3.1572, + "step": 14132 + }, + { + "epoch": 0.69, + "grad_norm": 0.4980970025062561, + "learning_rate": 0.0005244872426238567, + "loss": 3.0342, + "step": 14133 + }, + { + "epoch": 0.69, + "grad_norm": 0.5105233192443848, + "learning_rate": 0.000524477028944534, + "loss": 3.1551, + "step": 14134 + }, + { + "epoch": 0.69, + "grad_norm": 0.5064019560813904, + "learning_rate": 0.0005244668146739822, + "loss": 3.1884, + "step": 14135 + }, + { + "epoch": 0.69, + "grad_norm": 0.5264427661895752, + "learning_rate": 0.0005244565998122278, + "loss": 3.3935, + "step": 14136 + }, + { + "epoch": 0.69, + "grad_norm": 0.5206676721572876, + "learning_rate": 0.0005244463843592981, + "loss": 3.1003, + "step": 14137 + }, + { + "epoch": 0.69, + "grad_norm": 0.5028414726257324, + "learning_rate": 0.0005244361683152197, + "loss": 3.3872, + "step": 14138 + }, + { + "epoch": 0.69, + "grad_norm": 0.5728408694267273, + "learning_rate": 0.0005244259516800198, + "loss": 3.1309, + "step": 14139 + }, + { + "epoch": 0.69, + "grad_norm": 0.5234643816947937, + "learning_rate": 0.0005244157344537249, + "loss": 3.1336, + "step": 14140 + }, + { + "epoch": 0.69, + "grad_norm": 0.5184975862503052, + "learning_rate": 0.0005244055166363623, + "loss": 3.2717, + "step": 14141 + }, + { + "epoch": 0.69, + "grad_norm": 0.5001710057258606, + "learning_rate": 0.0005243952982279587, + "loss": 3.249, + "step": 14142 + }, + { + "epoch": 0.69, + "grad_norm": 0.5176953673362732, + "learning_rate": 0.0005243850792285411, + "loss": 3.1428, + "step": 14143 + }, + { + "epoch": 0.69, + "grad_norm": 0.5145552754402161, + "learning_rate": 0.0005243748596381364, + "loss": 3.0941, + "step": 14144 + }, + { + "epoch": 0.69, + "grad_norm": 0.5094621777534485, + "learning_rate": 0.0005243646394567714, + "loss": 3.3651, + "step": 14145 + }, + { + "epoch": 0.69, + "grad_norm": 0.4982603192329407, + "learning_rate": 0.0005243544186844733, + "loss": 3.4314, + "step": 14146 + }, + { + "epoch": 0.69, + "grad_norm": 0.48291024565696716, + "learning_rate": 0.0005243441973212687, + "loss": 3.4426, + "step": 14147 + }, + { + "epoch": 0.69, + "grad_norm": 0.5711972713470459, + "learning_rate": 0.0005243339753671847, + "loss": 3.273, + "step": 14148 + }, + { + "epoch": 0.69, + "grad_norm": 0.5552250146865845, + "learning_rate": 0.0005243237528222482, + "loss": 3.2662, + "step": 14149 + }, + { + "epoch": 0.69, + "grad_norm": 0.5337475538253784, + "learning_rate": 0.0005243135296864861, + "loss": 3.1347, + "step": 14150 + }, + { + "epoch": 0.69, + "grad_norm": 0.4788720905780792, + "learning_rate": 0.0005243033059599253, + "loss": 3.1474, + "step": 14151 + }, + { + "epoch": 0.69, + "grad_norm": 0.5156588554382324, + "learning_rate": 0.0005242930816425928, + "loss": 3.2473, + "step": 14152 + }, + { + "epoch": 0.69, + "grad_norm": 0.49395278096199036, + "learning_rate": 0.0005242828567345154, + "loss": 3.2504, + "step": 14153 + }, + { + "epoch": 0.69, + "grad_norm": 0.5037361979484558, + "learning_rate": 0.0005242726312357202, + "loss": 3.1234, + "step": 14154 + }, + { + "epoch": 0.69, + "grad_norm": 0.5027718544006348, + "learning_rate": 0.0005242624051462341, + "loss": 3.1226, + "step": 14155 + }, + { + "epoch": 0.69, + "grad_norm": 0.5126724243164062, + "learning_rate": 0.0005242521784660839, + "loss": 3.1279, + "step": 14156 + }, + { + "epoch": 0.69, + "grad_norm": 0.5130273103713989, + "learning_rate": 0.0005242419511952966, + "loss": 3.2485, + "step": 14157 + }, + { + "epoch": 0.69, + "grad_norm": 0.5258603692054749, + "learning_rate": 0.0005242317233338992, + "loss": 3.1893, + "step": 14158 + }, + { + "epoch": 0.69, + "grad_norm": 0.503662645816803, + "learning_rate": 0.0005242214948819186, + "loss": 3.2815, + "step": 14159 + }, + { + "epoch": 0.69, + "grad_norm": 0.4841233789920807, + "learning_rate": 0.0005242112658393817, + "loss": 3.1095, + "step": 14160 + }, + { + "epoch": 0.69, + "grad_norm": 0.5432723760604858, + "learning_rate": 0.0005242010362063156, + "loss": 3.4093, + "step": 14161 + }, + { + "epoch": 0.69, + "grad_norm": 0.5304633378982544, + "learning_rate": 0.0005241908059827469, + "loss": 3.2416, + "step": 14162 + }, + { + "epoch": 0.69, + "grad_norm": 0.5919955372810364, + "learning_rate": 0.0005241805751687029, + "loss": 3.2266, + "step": 14163 + }, + { + "epoch": 0.69, + "grad_norm": 0.5040702819824219, + "learning_rate": 0.0005241703437642104, + "loss": 3.0357, + "step": 14164 + }, + { + "epoch": 0.69, + "grad_norm": 0.5021560788154602, + "learning_rate": 0.0005241601117692964, + "loss": 3.1437, + "step": 14165 + }, + { + "epoch": 0.69, + "grad_norm": 0.513043224811554, + "learning_rate": 0.0005241498791839877, + "loss": 3.0822, + "step": 14166 + }, + { + "epoch": 0.69, + "grad_norm": 0.4927295446395874, + "learning_rate": 0.0005241396460083114, + "loss": 3.3873, + "step": 14167 + }, + { + "epoch": 0.69, + "grad_norm": 0.49543529748916626, + "learning_rate": 0.0005241294122422945, + "loss": 3.126, + "step": 14168 + }, + { + "epoch": 0.69, + "grad_norm": 0.528400719165802, + "learning_rate": 0.0005241191778859637, + "loss": 3.4217, + "step": 14169 + }, + { + "epoch": 0.69, + "grad_norm": 0.5334087014198303, + "learning_rate": 0.0005241089429393462, + "loss": 3.2066, + "step": 14170 + }, + { + "epoch": 0.69, + "grad_norm": 0.4597061574459076, + "learning_rate": 0.0005240987074024689, + "loss": 3.4921, + "step": 14171 + }, + { + "epoch": 0.69, + "grad_norm": 0.5277274250984192, + "learning_rate": 0.0005240884712753588, + "loss": 3.4269, + "step": 14172 + }, + { + "epoch": 0.69, + "grad_norm": 0.5084794759750366, + "learning_rate": 0.0005240782345580427, + "loss": 3.0579, + "step": 14173 + }, + { + "epoch": 0.69, + "grad_norm": 0.5290378332138062, + "learning_rate": 0.0005240679972505476, + "loss": 3.2095, + "step": 14174 + }, + { + "epoch": 0.69, + "grad_norm": 0.5595150589942932, + "learning_rate": 0.0005240577593529006, + "loss": 3.1068, + "step": 14175 + }, + { + "epoch": 0.69, + "grad_norm": 0.4827249050140381, + "learning_rate": 0.0005240475208651286, + "loss": 3.3068, + "step": 14176 + }, + { + "epoch": 0.69, + "grad_norm": 0.5036142468452454, + "learning_rate": 0.0005240372817872585, + "loss": 3.2103, + "step": 14177 + }, + { + "epoch": 0.69, + "grad_norm": 0.5806594491004944, + "learning_rate": 0.0005240270421193173, + "loss": 3.1885, + "step": 14178 + }, + { + "epoch": 0.69, + "grad_norm": 0.5014117360115051, + "learning_rate": 0.0005240168018613321, + "loss": 3.2471, + "step": 14179 + }, + { + "epoch": 0.69, + "grad_norm": 0.5045899152755737, + "learning_rate": 0.0005240065610133296, + "loss": 3.3355, + "step": 14180 + }, + { + "epoch": 0.69, + "grad_norm": 0.5164101719856262, + "learning_rate": 0.000523996319575337, + "loss": 3.111, + "step": 14181 + }, + { + "epoch": 0.7, + "grad_norm": 0.504895031452179, + "learning_rate": 0.0005239860775473811, + "loss": 3.2782, + "step": 14182 + }, + { + "epoch": 0.7, + "grad_norm": 0.5253392457962036, + "learning_rate": 0.0005239758349294891, + "loss": 3.3066, + "step": 14183 + }, + { + "epoch": 0.7, + "grad_norm": 0.5302526950836182, + "learning_rate": 0.0005239655917216879, + "loss": 3.176, + "step": 14184 + }, + { + "epoch": 0.7, + "grad_norm": 0.5363248586654663, + "learning_rate": 0.0005239553479240044, + "loss": 3.1524, + "step": 14185 + }, + { + "epoch": 0.7, + "grad_norm": 0.5327321290969849, + "learning_rate": 0.0005239451035364656, + "loss": 3.3454, + "step": 14186 + }, + { + "epoch": 0.7, + "grad_norm": 0.5333127975463867, + "learning_rate": 0.0005239348585590985, + "loss": 3.0064, + "step": 14187 + }, + { + "epoch": 0.7, + "grad_norm": 0.5116428732872009, + "learning_rate": 0.0005239246129919299, + "loss": 3.0169, + "step": 14188 + }, + { + "epoch": 0.7, + "grad_norm": 0.5070028305053711, + "learning_rate": 0.0005239143668349872, + "loss": 3.157, + "step": 14189 + }, + { + "epoch": 0.7, + "grad_norm": 0.5059760808944702, + "learning_rate": 0.0005239041200882971, + "loss": 3.4573, + "step": 14190 + }, + { + "epoch": 0.7, + "grad_norm": 0.5042681097984314, + "learning_rate": 0.0005238938727518867, + "loss": 3.2877, + "step": 14191 + }, + { + "epoch": 0.7, + "grad_norm": 0.8668105602264404, + "learning_rate": 0.0005238836248257829, + "loss": 3.3942, + "step": 14192 + }, + { + "epoch": 0.7, + "grad_norm": 0.5118007659912109, + "learning_rate": 0.0005238733763100126, + "loss": 3.0267, + "step": 14193 + }, + { + "epoch": 0.7, + "grad_norm": 0.527990460395813, + "learning_rate": 0.0005238631272046029, + "loss": 3.4337, + "step": 14194 + }, + { + "epoch": 0.7, + "grad_norm": 0.5032305717468262, + "learning_rate": 0.0005238528775095808, + "loss": 3.2735, + "step": 14195 + }, + { + "epoch": 0.7, + "grad_norm": 0.48045584559440613, + "learning_rate": 0.0005238426272249735, + "loss": 3.2052, + "step": 14196 + }, + { + "epoch": 0.7, + "grad_norm": 0.5078832507133484, + "learning_rate": 0.0005238323763508077, + "loss": 2.9881, + "step": 14197 + }, + { + "epoch": 0.7, + "grad_norm": 0.49898645281791687, + "learning_rate": 0.0005238221248871105, + "loss": 3.0997, + "step": 14198 + }, + { + "epoch": 0.7, + "grad_norm": 0.5208724737167358, + "learning_rate": 0.0005238118728339089, + "loss": 3.1622, + "step": 14199 + }, + { + "epoch": 0.7, + "grad_norm": 0.49348101019859314, + "learning_rate": 0.0005238016201912298, + "loss": 3.2991, + "step": 14200 + }, + { + "epoch": 0.7, + "grad_norm": 0.5580134391784668, + "learning_rate": 0.0005237913669591004, + "loss": 3.2781, + "step": 14201 + }, + { + "epoch": 0.7, + "grad_norm": 0.5383898615837097, + "learning_rate": 0.0005237811131375476, + "loss": 3.2633, + "step": 14202 + }, + { + "epoch": 0.7, + "grad_norm": 0.5041082501411438, + "learning_rate": 0.0005237708587265984, + "loss": 3.2228, + "step": 14203 + }, + { + "epoch": 0.7, + "grad_norm": 0.5141589641571045, + "learning_rate": 0.0005237606037262799, + "loss": 3.0724, + "step": 14204 + }, + { + "epoch": 0.7, + "grad_norm": 0.509434700012207, + "learning_rate": 0.0005237503481366189, + "loss": 3.1054, + "step": 14205 + }, + { + "epoch": 0.7, + "grad_norm": 0.5451233386993408, + "learning_rate": 0.0005237400919576426, + "loss": 3.0023, + "step": 14206 + }, + { + "epoch": 0.7, + "grad_norm": 0.5041611790657043, + "learning_rate": 0.000523729835189378, + "loss": 3.2864, + "step": 14207 + }, + { + "epoch": 0.7, + "grad_norm": 0.48671042919158936, + "learning_rate": 0.0005237195778318521, + "loss": 3.2833, + "step": 14208 + }, + { + "epoch": 0.7, + "grad_norm": 0.5079613924026489, + "learning_rate": 0.0005237093198850917, + "loss": 3.0954, + "step": 14209 + }, + { + "epoch": 0.7, + "grad_norm": 0.5360010862350464, + "learning_rate": 0.0005236990613491242, + "loss": 3.2204, + "step": 14210 + }, + { + "epoch": 0.7, + "grad_norm": 0.5029088258743286, + "learning_rate": 0.0005236888022239763, + "loss": 3.1848, + "step": 14211 + }, + { + "epoch": 0.7, + "grad_norm": 0.5785709023475647, + "learning_rate": 0.0005236785425096753, + "loss": 3.1963, + "step": 14212 + }, + { + "epoch": 0.7, + "grad_norm": 0.5002698302268982, + "learning_rate": 0.0005236682822062479, + "loss": 3.2937, + "step": 14213 + }, + { + "epoch": 0.7, + "grad_norm": 0.46872082352638245, + "learning_rate": 0.0005236580213137214, + "loss": 3.2813, + "step": 14214 + }, + { + "epoch": 0.7, + "grad_norm": 0.5846667289733887, + "learning_rate": 0.0005236477598321227, + "loss": 3.2123, + "step": 14215 + }, + { + "epoch": 0.7, + "grad_norm": 0.5419370532035828, + "learning_rate": 0.0005236374977614788, + "loss": 3.1255, + "step": 14216 + }, + { + "epoch": 0.7, + "grad_norm": 0.5296844840049744, + "learning_rate": 0.0005236272351018167, + "loss": 3.2611, + "step": 14217 + }, + { + "epoch": 0.7, + "grad_norm": 0.5379822254180908, + "learning_rate": 0.0005236169718531637, + "loss": 3.15, + "step": 14218 + }, + { + "epoch": 0.7, + "grad_norm": 0.5059786438941956, + "learning_rate": 0.0005236067080155465, + "loss": 3.3715, + "step": 14219 + }, + { + "epoch": 0.7, + "grad_norm": 0.48545315861701965, + "learning_rate": 0.0005235964435889923, + "loss": 3.2246, + "step": 14220 + }, + { + "epoch": 0.7, + "grad_norm": 0.5349769592285156, + "learning_rate": 0.0005235861785735281, + "loss": 3.1605, + "step": 14221 + }, + { + "epoch": 0.7, + "grad_norm": 0.5449730157852173, + "learning_rate": 0.0005235759129691809, + "loss": 3.2828, + "step": 14222 + }, + { + "epoch": 0.7, + "grad_norm": 0.5503379106521606, + "learning_rate": 0.0005235656467759778, + "loss": 3.2192, + "step": 14223 + }, + { + "epoch": 0.7, + "grad_norm": 0.5100115537643433, + "learning_rate": 0.0005235553799939458, + "loss": 3.2576, + "step": 14224 + }, + { + "epoch": 0.7, + "grad_norm": 0.5590293407440186, + "learning_rate": 0.0005235451126231119, + "loss": 3.2798, + "step": 14225 + }, + { + "epoch": 0.7, + "grad_norm": 0.5304601788520813, + "learning_rate": 0.0005235348446635034, + "loss": 3.0178, + "step": 14226 + }, + { + "epoch": 0.7, + "grad_norm": 0.5042637586593628, + "learning_rate": 0.0005235245761151468, + "loss": 3.1506, + "step": 14227 + }, + { + "epoch": 0.7, + "grad_norm": 0.5219356417655945, + "learning_rate": 0.0005235143069780698, + "loss": 3.2972, + "step": 14228 + }, + { + "epoch": 0.7, + "grad_norm": 0.5481365919113159, + "learning_rate": 0.000523504037252299, + "loss": 3.2286, + "step": 14229 + }, + { + "epoch": 0.7, + "grad_norm": 0.48297563195228577, + "learning_rate": 0.0005234937669378615, + "loss": 3.3976, + "step": 14230 + }, + { + "epoch": 0.7, + "grad_norm": 0.5044030547142029, + "learning_rate": 0.0005234834960347846, + "loss": 3.2173, + "step": 14231 + }, + { + "epoch": 0.7, + "grad_norm": 0.5015493035316467, + "learning_rate": 0.0005234732245430952, + "loss": 3.3385, + "step": 14232 + }, + { + "epoch": 0.7, + "grad_norm": 0.5298082232475281, + "learning_rate": 0.0005234629524628201, + "loss": 3.3677, + "step": 14233 + }, + { + "epoch": 0.7, + "grad_norm": 0.5369641780853271, + "learning_rate": 0.0005234526797939868, + "loss": 3.42, + "step": 14234 + }, + { + "epoch": 0.7, + "grad_norm": 0.6291206479072571, + "learning_rate": 0.000523442406536622, + "loss": 3.1479, + "step": 14235 + }, + { + "epoch": 0.7, + "grad_norm": 0.4945876896381378, + "learning_rate": 0.000523432132690753, + "loss": 3.1374, + "step": 14236 + }, + { + "epoch": 0.7, + "grad_norm": 0.519692063331604, + "learning_rate": 0.0005234218582564067, + "loss": 3.0743, + "step": 14237 + }, + { + "epoch": 0.7, + "grad_norm": 0.5264421701431274, + "learning_rate": 0.0005234115832336103, + "loss": 3.3572, + "step": 14238 + }, + { + "epoch": 0.7, + "grad_norm": 0.49894416332244873, + "learning_rate": 0.0005234013076223907, + "loss": 3.1509, + "step": 14239 + }, + { + "epoch": 0.7, + "grad_norm": 0.519243597984314, + "learning_rate": 0.0005233910314227751, + "loss": 3.2018, + "step": 14240 + }, + { + "epoch": 0.7, + "grad_norm": 0.5167336463928223, + "learning_rate": 0.0005233807546347906, + "loss": 3.3301, + "step": 14241 + }, + { + "epoch": 0.7, + "grad_norm": 0.5419256687164307, + "learning_rate": 0.000523370477258464, + "loss": 3.3605, + "step": 14242 + }, + { + "epoch": 0.7, + "grad_norm": 0.5188645124435425, + "learning_rate": 0.0005233601992938228, + "loss": 3.3297, + "step": 14243 + }, + { + "epoch": 0.7, + "grad_norm": 0.5342188477516174, + "learning_rate": 0.0005233499207408936, + "loss": 3.2517, + "step": 14244 + }, + { + "epoch": 0.7, + "grad_norm": 0.5165455341339111, + "learning_rate": 0.0005233396415997038, + "loss": 3.2128, + "step": 14245 + }, + { + "epoch": 0.7, + "grad_norm": 0.5181511640548706, + "learning_rate": 0.0005233293618702804, + "loss": 3.0307, + "step": 14246 + }, + { + "epoch": 0.7, + "grad_norm": 0.5308560729026794, + "learning_rate": 0.0005233190815526502, + "loss": 3.439, + "step": 14247 + }, + { + "epoch": 0.7, + "grad_norm": 0.4866609573364258, + "learning_rate": 0.0005233088006468407, + "loss": 3.0744, + "step": 14248 + }, + { + "epoch": 0.7, + "grad_norm": 0.5222994089126587, + "learning_rate": 0.0005232985191528788, + "loss": 3.1265, + "step": 14249 + }, + { + "epoch": 0.7, + "grad_norm": 0.495818555355072, + "learning_rate": 0.0005232882370707916, + "loss": 3.4966, + "step": 14250 + }, + { + "epoch": 0.7, + "grad_norm": 0.5098391175270081, + "learning_rate": 0.0005232779544006061, + "loss": 3.2485, + "step": 14251 + }, + { + "epoch": 0.7, + "grad_norm": 0.5358620882034302, + "learning_rate": 0.0005232676711423495, + "loss": 3.2363, + "step": 14252 + }, + { + "epoch": 0.7, + "grad_norm": 0.5330485105514526, + "learning_rate": 0.0005232573872960488, + "loss": 3.2469, + "step": 14253 + }, + { + "epoch": 0.7, + "grad_norm": 0.5173335075378418, + "learning_rate": 0.0005232471028617311, + "loss": 3.0878, + "step": 14254 + }, + { + "epoch": 0.7, + "grad_norm": 0.510068953037262, + "learning_rate": 0.0005232368178394234, + "loss": 3.1082, + "step": 14255 + }, + { + "epoch": 0.7, + "grad_norm": 0.567221462726593, + "learning_rate": 0.000523226532229153, + "loss": 3.1399, + "step": 14256 + }, + { + "epoch": 0.7, + "grad_norm": 0.5452208518981934, + "learning_rate": 0.0005232162460309468, + "loss": 3.119, + "step": 14257 + }, + { + "epoch": 0.7, + "grad_norm": 0.5163204669952393, + "learning_rate": 0.000523205959244832, + "loss": 3.2187, + "step": 14258 + }, + { + "epoch": 0.7, + "grad_norm": 0.4994533061981201, + "learning_rate": 0.0005231956718708357, + "loss": 3.1825, + "step": 14259 + }, + { + "epoch": 0.7, + "grad_norm": 0.5001878142356873, + "learning_rate": 0.0005231853839089849, + "loss": 3.2424, + "step": 14260 + }, + { + "epoch": 0.7, + "grad_norm": 0.5246791243553162, + "learning_rate": 0.0005231750953593068, + "loss": 3.2498, + "step": 14261 + }, + { + "epoch": 0.7, + "grad_norm": 0.4972473084926605, + "learning_rate": 0.0005231648062218285, + "loss": 3.193, + "step": 14262 + }, + { + "epoch": 0.7, + "grad_norm": 0.5774008631706238, + "learning_rate": 0.0005231545164965769, + "loss": 3.0555, + "step": 14263 + }, + { + "epoch": 0.7, + "grad_norm": 0.5512023568153381, + "learning_rate": 0.0005231442261835794, + "loss": 3.2012, + "step": 14264 + }, + { + "epoch": 0.7, + "grad_norm": 0.5204340815544128, + "learning_rate": 0.0005231339352828628, + "loss": 3.0874, + "step": 14265 + }, + { + "epoch": 0.7, + "grad_norm": 0.5237892866134644, + "learning_rate": 0.0005231236437944545, + "loss": 3.1603, + "step": 14266 + }, + { + "epoch": 0.7, + "grad_norm": 0.5312855839729309, + "learning_rate": 0.0005231133517183814, + "loss": 3.1177, + "step": 14267 + }, + { + "epoch": 0.7, + "grad_norm": 0.48745495080947876, + "learning_rate": 0.0005231030590546707, + "loss": 3.268, + "step": 14268 + }, + { + "epoch": 0.7, + "grad_norm": 0.5135250091552734, + "learning_rate": 0.0005230927658033494, + "loss": 3.395, + "step": 14269 + }, + { + "epoch": 0.7, + "grad_norm": 0.5581345558166504, + "learning_rate": 0.0005230824719644448, + "loss": 3.1017, + "step": 14270 + }, + { + "epoch": 0.7, + "grad_norm": 0.47780802845954895, + "learning_rate": 0.0005230721775379837, + "loss": 3.1137, + "step": 14271 + }, + { + "epoch": 0.7, + "grad_norm": 0.49343234300613403, + "learning_rate": 0.0005230618825239937, + "loss": 3.2004, + "step": 14272 + }, + { + "epoch": 0.7, + "grad_norm": 0.5608261823654175, + "learning_rate": 0.0005230515869225013, + "loss": 3.1291, + "step": 14273 + }, + { + "epoch": 0.7, + "grad_norm": 0.510046124458313, + "learning_rate": 0.0005230412907335343, + "loss": 3.3241, + "step": 14274 + }, + { + "epoch": 0.7, + "grad_norm": 0.506389856338501, + "learning_rate": 0.0005230309939571193, + "loss": 3.1, + "step": 14275 + }, + { + "epoch": 0.7, + "grad_norm": 0.4831939935684204, + "learning_rate": 0.0005230206965932836, + "loss": 3.2499, + "step": 14276 + }, + { + "epoch": 0.7, + "grad_norm": 0.506610095500946, + "learning_rate": 0.0005230103986420542, + "loss": 3.371, + "step": 14277 + }, + { + "epoch": 0.7, + "grad_norm": 0.49668189883232117, + "learning_rate": 0.0005230001001034585, + "loss": 3.105, + "step": 14278 + }, + { + "epoch": 0.7, + "grad_norm": 0.5084254145622253, + "learning_rate": 0.0005229898009775233, + "loss": 3.1564, + "step": 14279 + }, + { + "epoch": 0.7, + "grad_norm": 0.5354201793670654, + "learning_rate": 0.000522979501264276, + "loss": 3.2659, + "step": 14280 + }, + { + "epoch": 0.7, + "grad_norm": 0.5142485499382019, + "learning_rate": 0.0005229692009637437, + "loss": 3.0802, + "step": 14281 + }, + { + "epoch": 0.7, + "grad_norm": 0.5173102617263794, + "learning_rate": 0.0005229589000759531, + "loss": 3.0241, + "step": 14282 + }, + { + "epoch": 0.7, + "grad_norm": 0.5071712732315063, + "learning_rate": 0.0005229485986009319, + "loss": 3.2652, + "step": 14283 + }, + { + "epoch": 0.7, + "grad_norm": 0.4926835596561432, + "learning_rate": 0.0005229382965387069, + "loss": 3.1913, + "step": 14284 + }, + { + "epoch": 0.7, + "grad_norm": 0.503774106502533, + "learning_rate": 0.0005229279938893055, + "loss": 3.2452, + "step": 14285 + }, + { + "epoch": 0.7, + "grad_norm": 0.4997362196445465, + "learning_rate": 0.0005229176906527545, + "loss": 3.3595, + "step": 14286 + }, + { + "epoch": 0.7, + "grad_norm": 0.5188296437263489, + "learning_rate": 0.0005229073868290813, + "loss": 3.1387, + "step": 14287 + }, + { + "epoch": 0.7, + "grad_norm": 0.5256169438362122, + "learning_rate": 0.000522897082418313, + "loss": 3.1277, + "step": 14288 + }, + { + "epoch": 0.7, + "grad_norm": 0.5132431387901306, + "learning_rate": 0.0005228867774204765, + "loss": 3.1984, + "step": 14289 + }, + { + "epoch": 0.7, + "grad_norm": 0.5352613925933838, + "learning_rate": 0.0005228764718355993, + "loss": 3.301, + "step": 14290 + }, + { + "epoch": 0.7, + "grad_norm": 0.5036006569862366, + "learning_rate": 0.0005228661656637082, + "loss": 3.2298, + "step": 14291 + }, + { + "epoch": 0.7, + "grad_norm": 0.4817676544189453, + "learning_rate": 0.0005228558589048306, + "loss": 3.3918, + "step": 14292 + }, + { + "epoch": 0.7, + "grad_norm": 0.528864324092865, + "learning_rate": 0.0005228455515589935, + "loss": 3.0981, + "step": 14293 + }, + { + "epoch": 0.7, + "grad_norm": 0.5127184391021729, + "learning_rate": 0.0005228352436262243, + "loss": 3.3858, + "step": 14294 + }, + { + "epoch": 0.7, + "grad_norm": 0.5853077173233032, + "learning_rate": 0.0005228249351065498, + "loss": 3.2205, + "step": 14295 + }, + { + "epoch": 0.7, + "grad_norm": 0.5480331182479858, + "learning_rate": 0.0005228146259999972, + "loss": 3.0766, + "step": 14296 + }, + { + "epoch": 0.7, + "grad_norm": 0.5229378342628479, + "learning_rate": 0.000522804316306594, + "loss": 3.0965, + "step": 14297 + }, + { + "epoch": 0.7, + "grad_norm": 0.5020848512649536, + "learning_rate": 0.0005227940060263669, + "loss": 3.3201, + "step": 14298 + }, + { + "epoch": 0.7, + "grad_norm": 0.5315490365028381, + "learning_rate": 0.0005227836951593434, + "loss": 3.2358, + "step": 14299 + }, + { + "epoch": 0.7, + "grad_norm": 0.5193626880645752, + "learning_rate": 0.0005227733837055504, + "loss": 3.0192, + "step": 14300 + }, + { + "epoch": 0.7, + "grad_norm": 0.5127216577529907, + "learning_rate": 0.0005227630716650152, + "loss": 3.179, + "step": 14301 + }, + { + "epoch": 0.7, + "grad_norm": 0.5541197061538696, + "learning_rate": 0.0005227527590377651, + "loss": 3.1317, + "step": 14302 + }, + { + "epoch": 0.7, + "grad_norm": 0.5353065133094788, + "learning_rate": 0.0005227424458238269, + "loss": 3.1438, + "step": 14303 + }, + { + "epoch": 0.7, + "grad_norm": 0.4878837764263153, + "learning_rate": 0.0005227321320232281, + "loss": 3.3127, + "step": 14304 + }, + { + "epoch": 0.7, + "grad_norm": 0.5021807551383972, + "learning_rate": 0.0005227218176359957, + "loss": 3.2301, + "step": 14305 + }, + { + "epoch": 0.7, + "grad_norm": 0.4997186064720154, + "learning_rate": 0.0005227115026621568, + "loss": 3.1486, + "step": 14306 + }, + { + "epoch": 0.7, + "grad_norm": 0.5215864181518555, + "learning_rate": 0.0005227011871017388, + "loss": 3.1568, + "step": 14307 + }, + { + "epoch": 0.7, + "grad_norm": 0.5061655044555664, + "learning_rate": 0.0005226908709547687, + "loss": 3.1592, + "step": 14308 + }, + { + "epoch": 0.7, + "grad_norm": 0.5245349407196045, + "learning_rate": 0.0005226805542212737, + "loss": 3.3514, + "step": 14309 + }, + { + "epoch": 0.7, + "grad_norm": 0.4831959307193756, + "learning_rate": 0.000522670236901281, + "loss": 3.2534, + "step": 14310 + }, + { + "epoch": 0.7, + "grad_norm": 0.47621917724609375, + "learning_rate": 0.0005226599189948176, + "loss": 3.0129, + "step": 14311 + }, + { + "epoch": 0.7, + "grad_norm": 0.5170857310295105, + "learning_rate": 0.000522649600501911, + "loss": 3.073, + "step": 14312 + }, + { + "epoch": 0.7, + "grad_norm": 0.5005858540534973, + "learning_rate": 0.0005226392814225881, + "loss": 3.114, + "step": 14313 + }, + { + "epoch": 0.7, + "grad_norm": 0.5574570894241333, + "learning_rate": 0.0005226289617568763, + "loss": 3.3234, + "step": 14314 + }, + { + "epoch": 0.7, + "grad_norm": 0.5117168426513672, + "learning_rate": 0.0005226186415048026, + "loss": 3.1659, + "step": 14315 + }, + { + "epoch": 0.7, + "grad_norm": 0.49425193667411804, + "learning_rate": 0.0005226083206663941, + "loss": 2.9738, + "step": 14316 + }, + { + "epoch": 0.7, + "grad_norm": 0.5152395963668823, + "learning_rate": 0.0005225979992416783, + "loss": 3.2, + "step": 14317 + }, + { + "epoch": 0.7, + "grad_norm": 0.5114201903343201, + "learning_rate": 0.0005225876772306822, + "loss": 3.2103, + "step": 14318 + }, + { + "epoch": 0.7, + "grad_norm": 0.48314422369003296, + "learning_rate": 0.0005225773546334328, + "loss": 3.2442, + "step": 14319 + }, + { + "epoch": 0.7, + "grad_norm": 0.5234989523887634, + "learning_rate": 0.0005225670314499577, + "loss": 3.1864, + "step": 14320 + }, + { + "epoch": 0.7, + "grad_norm": 0.5077502727508545, + "learning_rate": 0.0005225567076802838, + "loss": 3.2501, + "step": 14321 + }, + { + "epoch": 0.7, + "grad_norm": 0.4953576624393463, + "learning_rate": 0.0005225463833244384, + "loss": 3.2259, + "step": 14322 + }, + { + "epoch": 0.7, + "grad_norm": 0.5094617605209351, + "learning_rate": 0.0005225360583824487, + "loss": 3.1856, + "step": 14323 + }, + { + "epoch": 0.7, + "grad_norm": 0.5310658812522888, + "learning_rate": 0.0005225257328543417, + "loss": 3.2715, + "step": 14324 + }, + { + "epoch": 0.7, + "grad_norm": 0.5143598318099976, + "learning_rate": 0.0005225154067401448, + "loss": 3.2914, + "step": 14325 + }, + { + "epoch": 0.7, + "grad_norm": 0.5416876673698425, + "learning_rate": 0.0005225050800398851, + "loss": 3.3223, + "step": 14326 + }, + { + "epoch": 0.7, + "grad_norm": 0.5177731513977051, + "learning_rate": 0.00052249475275359, + "loss": 3.1994, + "step": 14327 + }, + { + "epoch": 0.7, + "grad_norm": 0.4947225749492645, + "learning_rate": 0.0005224844248812864, + "loss": 3.2301, + "step": 14328 + }, + { + "epoch": 0.7, + "grad_norm": 0.5090954303741455, + "learning_rate": 0.0005224740964230017, + "loss": 3.1208, + "step": 14329 + }, + { + "epoch": 0.7, + "grad_norm": 0.5198858976364136, + "learning_rate": 0.0005224637673787631, + "loss": 3.2342, + "step": 14330 + }, + { + "epoch": 0.7, + "grad_norm": 0.5442454814910889, + "learning_rate": 0.0005224534377485977, + "loss": 3.0242, + "step": 14331 + }, + { + "epoch": 0.7, + "grad_norm": 0.6403926610946655, + "learning_rate": 0.0005224431075325327, + "loss": 3.0215, + "step": 14332 + }, + { + "epoch": 0.7, + "grad_norm": 0.49326714873313904, + "learning_rate": 0.0005224327767305954, + "loss": 3.4625, + "step": 14333 + }, + { + "epoch": 0.7, + "grad_norm": 0.527711808681488, + "learning_rate": 0.000522422445342813, + "loss": 3.3671, + "step": 14334 + }, + { + "epoch": 0.7, + "grad_norm": 0.5429787039756775, + "learning_rate": 0.0005224121133692127, + "loss": 3.3924, + "step": 14335 + }, + { + "epoch": 0.7, + "grad_norm": 0.5077711343765259, + "learning_rate": 0.0005224017808098217, + "loss": 3.2152, + "step": 14336 + }, + { + "epoch": 0.7, + "grad_norm": 0.48497462272644043, + "learning_rate": 0.0005223914476646671, + "loss": 3.1022, + "step": 14337 + }, + { + "epoch": 0.7, + "grad_norm": 0.5042962431907654, + "learning_rate": 0.0005223811139337763, + "loss": 3.3529, + "step": 14338 + }, + { + "epoch": 0.7, + "grad_norm": 0.5740790963172913, + "learning_rate": 0.0005223707796171765, + "loss": 3.0065, + "step": 14339 + }, + { + "epoch": 0.7, + "grad_norm": 0.5047106742858887, + "learning_rate": 0.0005223604447148947, + "loss": 3.2321, + "step": 14340 + }, + { + "epoch": 0.7, + "grad_norm": 0.5074807405471802, + "learning_rate": 0.0005223501092269584, + "loss": 3.2166, + "step": 14341 + }, + { + "epoch": 0.7, + "grad_norm": 0.49849480390548706, + "learning_rate": 0.0005223397731533947, + "loss": 2.8486, + "step": 14342 + }, + { + "epoch": 0.7, + "grad_norm": 0.4887546896934509, + "learning_rate": 0.0005223294364942309, + "loss": 3.2793, + "step": 14343 + }, + { + "epoch": 0.7, + "grad_norm": 0.5052506923675537, + "learning_rate": 0.0005223190992494941, + "loss": 3.2666, + "step": 14344 + }, + { + "epoch": 0.7, + "grad_norm": 0.530553936958313, + "learning_rate": 0.0005223087614192116, + "loss": 3.3803, + "step": 14345 + }, + { + "epoch": 0.7, + "grad_norm": 0.5107871294021606, + "learning_rate": 0.0005222984230034105, + "loss": 3.2682, + "step": 14346 + }, + { + "epoch": 0.7, + "grad_norm": 0.5242406725883484, + "learning_rate": 0.0005222880840021183, + "loss": 3.1691, + "step": 14347 + }, + { + "epoch": 0.7, + "grad_norm": 0.5011206269264221, + "learning_rate": 0.0005222777444153618, + "loss": 3.5911, + "step": 14348 + }, + { + "epoch": 0.7, + "grad_norm": 0.5178399085998535, + "learning_rate": 0.0005222674042431688, + "loss": 3.2566, + "step": 14349 + }, + { + "epoch": 0.7, + "grad_norm": 0.5099117159843445, + "learning_rate": 0.000522257063485566, + "loss": 3.159, + "step": 14350 + }, + { + "epoch": 0.7, + "grad_norm": 0.5087069869041443, + "learning_rate": 0.000522246722142581, + "loss": 3.0981, + "step": 14351 + }, + { + "epoch": 0.7, + "grad_norm": 0.49150705337524414, + "learning_rate": 0.0005222363802142409, + "loss": 3.2154, + "step": 14352 + }, + { + "epoch": 0.7, + "grad_norm": 0.5360817313194275, + "learning_rate": 0.0005222260377005729, + "loss": 3.0334, + "step": 14353 + }, + { + "epoch": 0.7, + "grad_norm": 0.7099424600601196, + "learning_rate": 0.0005222156946016043, + "loss": 3.1963, + "step": 14354 + }, + { + "epoch": 0.7, + "grad_norm": 0.5025527477264404, + "learning_rate": 0.0005222053509173623, + "loss": 3.0433, + "step": 14355 + }, + { + "epoch": 0.7, + "grad_norm": 0.543745219707489, + "learning_rate": 0.0005221950066478742, + "loss": 3.0136, + "step": 14356 + }, + { + "epoch": 0.7, + "grad_norm": 0.5021082758903503, + "learning_rate": 0.0005221846617931672, + "loss": 2.9967, + "step": 14357 + }, + { + "epoch": 0.7, + "grad_norm": 0.5673491358757019, + "learning_rate": 0.0005221743163532686, + "loss": 3.2684, + "step": 14358 + }, + { + "epoch": 0.7, + "grad_norm": 0.5165929198265076, + "learning_rate": 0.0005221639703282057, + "loss": 3.236, + "step": 14359 + }, + { + "epoch": 0.7, + "grad_norm": 0.5241513848304749, + "learning_rate": 0.0005221536237180054, + "loss": 3.178, + "step": 14360 + }, + { + "epoch": 0.7, + "grad_norm": 0.48851609230041504, + "learning_rate": 0.0005221432765226955, + "loss": 3.2317, + "step": 14361 + }, + { + "epoch": 0.7, + "grad_norm": 0.5319517254829407, + "learning_rate": 0.0005221329287423027, + "loss": 3.3219, + "step": 14362 + }, + { + "epoch": 0.7, + "grad_norm": 0.5038327574729919, + "learning_rate": 0.0005221225803768546, + "loss": 3.2557, + "step": 14363 + }, + { + "epoch": 0.7, + "grad_norm": 0.5223231911659241, + "learning_rate": 0.0005221122314263785, + "loss": 3.1857, + "step": 14364 + }, + { + "epoch": 0.7, + "grad_norm": 0.4955352544784546, + "learning_rate": 0.0005221018818909014, + "loss": 3.1251, + "step": 14365 + }, + { + "epoch": 0.7, + "grad_norm": 0.5417662262916565, + "learning_rate": 0.0005220915317704507, + "loss": 3.0374, + "step": 14366 + }, + { + "epoch": 0.7, + "grad_norm": 0.4988352060317993, + "learning_rate": 0.0005220811810650537, + "loss": 3.1441, + "step": 14367 + }, + { + "epoch": 0.7, + "grad_norm": 0.4997967779636383, + "learning_rate": 0.0005220708297747375, + "loss": 3.267, + "step": 14368 + }, + { + "epoch": 0.7, + "grad_norm": 0.5113928914070129, + "learning_rate": 0.0005220604778995296, + "loss": 3.1211, + "step": 14369 + }, + { + "epoch": 0.7, + "grad_norm": 0.5090401768684387, + "learning_rate": 0.0005220501254394571, + "loss": 3.2306, + "step": 14370 + }, + { + "epoch": 0.7, + "grad_norm": 0.5390316843986511, + "learning_rate": 0.0005220397723945473, + "loss": 3.3519, + "step": 14371 + }, + { + "epoch": 0.7, + "grad_norm": 0.5376928448677063, + "learning_rate": 0.0005220294187648275, + "loss": 3.0629, + "step": 14372 + }, + { + "epoch": 0.7, + "grad_norm": 0.5060685276985168, + "learning_rate": 0.0005220190645503248, + "loss": 3.2316, + "step": 14373 + }, + { + "epoch": 0.7, + "grad_norm": 0.5190165638923645, + "learning_rate": 0.0005220087097510668, + "loss": 3.1309, + "step": 14374 + }, + { + "epoch": 0.7, + "grad_norm": 0.5150943398475647, + "learning_rate": 0.0005219983543670805, + "loss": 3.2837, + "step": 14375 + }, + { + "epoch": 0.7, + "grad_norm": 0.4947223663330078, + "learning_rate": 0.0005219879983983933, + "loss": 3.376, + "step": 14376 + }, + { + "epoch": 0.7, + "grad_norm": 0.48892343044281006, + "learning_rate": 0.0005219776418450323, + "loss": 3.24, + "step": 14377 + }, + { + "epoch": 0.7, + "grad_norm": 0.5334641337394714, + "learning_rate": 0.0005219672847070251, + "loss": 3.2448, + "step": 14378 + }, + { + "epoch": 0.7, + "grad_norm": 0.5088352560997009, + "learning_rate": 0.0005219569269843987, + "loss": 3.1581, + "step": 14379 + }, + { + "epoch": 0.7, + "grad_norm": 0.49996280670166016, + "learning_rate": 0.0005219465686771805, + "loss": 3.2747, + "step": 14380 + }, + { + "epoch": 0.7, + "grad_norm": 0.5581712126731873, + "learning_rate": 0.0005219362097853977, + "loss": 3.135, + "step": 14381 + }, + { + "epoch": 0.7, + "grad_norm": 0.5300365686416626, + "learning_rate": 0.0005219258503090776, + "loss": 3.2328, + "step": 14382 + }, + { + "epoch": 0.7, + "grad_norm": 0.4860009551048279, + "learning_rate": 0.0005219154902482476, + "loss": 3.2522, + "step": 14383 + }, + { + "epoch": 0.7, + "grad_norm": 0.5062441825866699, + "learning_rate": 0.0005219051296029349, + "loss": 3.3731, + "step": 14384 + }, + { + "epoch": 0.7, + "grad_norm": 0.5869806408882141, + "learning_rate": 0.0005218947683731667, + "loss": 3.0537, + "step": 14385 + }, + { + "epoch": 0.71, + "grad_norm": 0.5176935791969299, + "learning_rate": 0.0005218844065589704, + "loss": 2.9489, + "step": 14386 + }, + { + "epoch": 0.71, + "grad_norm": 0.5050328373908997, + "learning_rate": 0.0005218740441603735, + "loss": 3.3333, + "step": 14387 + }, + { + "epoch": 0.71, + "grad_norm": 0.5312533378601074, + "learning_rate": 0.0005218636811774029, + "loss": 3.2162, + "step": 14388 + }, + { + "epoch": 0.71, + "grad_norm": 0.5098238587379456, + "learning_rate": 0.000521853317610086, + "loss": 3.1103, + "step": 14389 + }, + { + "epoch": 0.71, + "grad_norm": 0.48738306760787964, + "learning_rate": 0.0005218429534584502, + "loss": 3.2526, + "step": 14390 + }, + { + "epoch": 0.71, + "grad_norm": 0.5178094506263733, + "learning_rate": 0.0005218325887225228, + "loss": 3.2081, + "step": 14391 + }, + { + "epoch": 0.71, + "grad_norm": 0.5126038789749146, + "learning_rate": 0.000521822223402331, + "loss": 3.1075, + "step": 14392 + }, + { + "epoch": 0.71, + "grad_norm": 0.49259766936302185, + "learning_rate": 0.0005218118574979023, + "loss": 3.2309, + "step": 14393 + }, + { + "epoch": 0.71, + "grad_norm": 0.5296828746795654, + "learning_rate": 0.0005218014910092636, + "loss": 3.294, + "step": 14394 + }, + { + "epoch": 0.71, + "grad_norm": 0.5134131908416748, + "learning_rate": 0.0005217911239364427, + "loss": 3.2609, + "step": 14395 + }, + { + "epoch": 0.71, + "grad_norm": 0.5110315680503845, + "learning_rate": 0.0005217807562794666, + "loss": 3.2463, + "step": 14396 + }, + { + "epoch": 0.71, + "grad_norm": 0.5391361713409424, + "learning_rate": 0.0005217703880383626, + "loss": 2.8263, + "step": 14397 + }, + { + "epoch": 0.71, + "grad_norm": 0.5444517135620117, + "learning_rate": 0.0005217600192131582, + "loss": 3.1325, + "step": 14398 + }, + { + "epoch": 0.71, + "grad_norm": 0.5090447664260864, + "learning_rate": 0.0005217496498038805, + "loss": 2.9955, + "step": 14399 + }, + { + "epoch": 0.71, + "grad_norm": 0.5326402187347412, + "learning_rate": 0.0005217392798105569, + "loss": 2.8706, + "step": 14400 + }, + { + "epoch": 0.71, + "grad_norm": 0.586733877658844, + "learning_rate": 0.0005217289092332147, + "loss": 3.1483, + "step": 14401 + }, + { + "epoch": 0.71, + "grad_norm": 0.534562349319458, + "learning_rate": 0.0005217185380718812, + "loss": 3.0548, + "step": 14402 + }, + { + "epoch": 0.71, + "grad_norm": 0.49788811802864075, + "learning_rate": 0.0005217081663265838, + "loss": 3.1239, + "step": 14403 + }, + { + "epoch": 0.71, + "grad_norm": 0.5514092445373535, + "learning_rate": 0.0005216977939973498, + "loss": 3.2406, + "step": 14404 + }, + { + "epoch": 0.71, + "grad_norm": 0.5975626707077026, + "learning_rate": 0.0005216874210842064, + "loss": 3.0378, + "step": 14405 + }, + { + "epoch": 0.71, + "grad_norm": 0.5058104395866394, + "learning_rate": 0.0005216770475871811, + "loss": 3.1995, + "step": 14406 + }, + { + "epoch": 0.71, + "grad_norm": 0.5320433974266052, + "learning_rate": 0.0005216666735063009, + "loss": 2.9181, + "step": 14407 + }, + { + "epoch": 0.71, + "grad_norm": 0.5036849975585938, + "learning_rate": 0.0005216562988415935, + "loss": 3.3894, + "step": 14408 + }, + { + "epoch": 0.71, + "grad_norm": 0.5307196378707886, + "learning_rate": 0.0005216459235930861, + "loss": 3.0559, + "step": 14409 + }, + { + "epoch": 0.71, + "grad_norm": 0.5142438411712646, + "learning_rate": 0.0005216355477608059, + "loss": 3.0828, + "step": 14410 + }, + { + "epoch": 0.71, + "grad_norm": 0.5658327341079712, + "learning_rate": 0.0005216251713447804, + "loss": 3.2546, + "step": 14411 + }, + { + "epoch": 0.71, + "grad_norm": 0.5084229111671448, + "learning_rate": 0.0005216147943450368, + "loss": 3.3371, + "step": 14412 + }, + { + "epoch": 0.71, + "grad_norm": 0.5430961847305298, + "learning_rate": 0.0005216044167616025, + "loss": 3.2217, + "step": 14413 + }, + { + "epoch": 0.71, + "grad_norm": 0.5450170636177063, + "learning_rate": 0.0005215940385945048, + "loss": 3.1991, + "step": 14414 + }, + { + "epoch": 0.71, + "grad_norm": 0.49537381529808044, + "learning_rate": 0.000521583659843771, + "loss": 3.4065, + "step": 14415 + }, + { + "epoch": 0.71, + "grad_norm": 0.5271323919296265, + "learning_rate": 0.0005215732805094286, + "loss": 3.2777, + "step": 14416 + }, + { + "epoch": 0.71, + "grad_norm": 0.5401394367218018, + "learning_rate": 0.0005215629005915047, + "loss": 3.1833, + "step": 14417 + }, + { + "epoch": 0.71, + "grad_norm": 0.5234045386314392, + "learning_rate": 0.0005215525200900269, + "loss": 3.0229, + "step": 14418 + }, + { + "epoch": 0.71, + "grad_norm": 0.5828486680984497, + "learning_rate": 0.0005215421390050224, + "loss": 3.1473, + "step": 14419 + }, + { + "epoch": 0.71, + "grad_norm": 0.5277825593948364, + "learning_rate": 0.0005215317573365185, + "loss": 3.0845, + "step": 14420 + }, + { + "epoch": 0.71, + "grad_norm": 0.5510663390159607, + "learning_rate": 0.0005215213750845425, + "loss": 3.2722, + "step": 14421 + }, + { + "epoch": 0.71, + "grad_norm": 0.5259878039360046, + "learning_rate": 0.0005215109922491218, + "loss": 3.0543, + "step": 14422 + }, + { + "epoch": 0.71, + "grad_norm": 0.4986727237701416, + "learning_rate": 0.000521500608830284, + "loss": 3.2939, + "step": 14423 + }, + { + "epoch": 0.71, + "grad_norm": 0.558155357837677, + "learning_rate": 0.000521490224828056, + "loss": 3.0415, + "step": 14424 + }, + { + "epoch": 0.71, + "grad_norm": 0.5337893962860107, + "learning_rate": 0.0005214798402424655, + "loss": 2.949, + "step": 14425 + }, + { + "epoch": 0.71, + "grad_norm": 0.5568438172340393, + "learning_rate": 0.0005214694550735396, + "loss": 3.4036, + "step": 14426 + }, + { + "epoch": 0.71, + "grad_norm": 0.52046138048172, + "learning_rate": 0.000521459069321306, + "loss": 3.054, + "step": 14427 + }, + { + "epoch": 0.71, + "grad_norm": 0.4952624440193176, + "learning_rate": 0.0005214486829857916, + "loss": 3.2659, + "step": 14428 + }, + { + "epoch": 0.71, + "grad_norm": 0.584016740322113, + "learning_rate": 0.0005214382960670241, + "loss": 3.0175, + "step": 14429 + }, + { + "epoch": 0.71, + "grad_norm": 0.5094707608222961, + "learning_rate": 0.0005214279085650308, + "loss": 3.0569, + "step": 14430 + }, + { + "epoch": 0.71, + "grad_norm": 0.5051601529121399, + "learning_rate": 0.0005214175204798388, + "loss": 3.0209, + "step": 14431 + }, + { + "epoch": 0.71, + "grad_norm": 0.4987492263317108, + "learning_rate": 0.0005214071318114759, + "loss": 3.2559, + "step": 14432 + }, + { + "epoch": 0.71, + "grad_norm": 0.4821774661540985, + "learning_rate": 0.0005213967425599692, + "loss": 3.23, + "step": 14433 + }, + { + "epoch": 0.71, + "grad_norm": 0.5031430125236511, + "learning_rate": 0.0005213863527253459, + "loss": 3.2185, + "step": 14434 + }, + { + "epoch": 0.71, + "grad_norm": 0.52392578125, + "learning_rate": 0.0005213759623076337, + "loss": 3.1812, + "step": 14435 + }, + { + "epoch": 0.71, + "grad_norm": 0.4971693754196167, + "learning_rate": 0.0005213655713068598, + "loss": 3.0771, + "step": 14436 + }, + { + "epoch": 0.71, + "grad_norm": 0.5507715344429016, + "learning_rate": 0.0005213551797230516, + "loss": 3.0654, + "step": 14437 + }, + { + "epoch": 0.71, + "grad_norm": 0.5817131996154785, + "learning_rate": 0.0005213447875562365, + "loss": 3.1453, + "step": 14438 + }, + { + "epoch": 0.71, + "grad_norm": 0.5093923211097717, + "learning_rate": 0.0005213343948064417, + "loss": 2.9989, + "step": 14439 + }, + { + "epoch": 0.71, + "grad_norm": 0.5584835410118103, + "learning_rate": 0.0005213240014736947, + "loss": 3.1431, + "step": 14440 + }, + { + "epoch": 0.71, + "grad_norm": 0.534837007522583, + "learning_rate": 0.000521313607558023, + "loss": 3.1652, + "step": 14441 + }, + { + "epoch": 0.71, + "grad_norm": 0.532855749130249, + "learning_rate": 0.0005213032130594537, + "loss": 3.1471, + "step": 14442 + }, + { + "epoch": 0.71, + "grad_norm": 0.5114381909370422, + "learning_rate": 0.0005212928179780144, + "loss": 3.1387, + "step": 14443 + }, + { + "epoch": 0.71, + "grad_norm": 0.5551025867462158, + "learning_rate": 0.0005212824223137325, + "loss": 3.2062, + "step": 14444 + }, + { + "epoch": 0.71, + "grad_norm": 0.5081254839897156, + "learning_rate": 0.0005212720260666352, + "loss": 3.4413, + "step": 14445 + }, + { + "epoch": 0.71, + "grad_norm": 0.5733745098114014, + "learning_rate": 0.00052126162923675, + "loss": 3.2314, + "step": 14446 + }, + { + "epoch": 0.71, + "grad_norm": 0.5209436416625977, + "learning_rate": 0.0005212512318241042, + "loss": 3.4682, + "step": 14447 + }, + { + "epoch": 0.71, + "grad_norm": 0.5001239776611328, + "learning_rate": 0.0005212408338287252, + "loss": 3.1519, + "step": 14448 + }, + { + "epoch": 0.71, + "grad_norm": 0.5506751537322998, + "learning_rate": 0.0005212304352506405, + "loss": 2.9797, + "step": 14449 + }, + { + "epoch": 0.71, + "grad_norm": 0.5498661398887634, + "learning_rate": 0.0005212200360898775, + "loss": 3.0644, + "step": 14450 + }, + { + "epoch": 0.71, + "grad_norm": 0.5015347599983215, + "learning_rate": 0.0005212096363464633, + "loss": 3.1071, + "step": 14451 + }, + { + "epoch": 0.71, + "grad_norm": 0.5187351107597351, + "learning_rate": 0.0005211992360204256, + "loss": 3.1041, + "step": 14452 + }, + { + "epoch": 0.71, + "grad_norm": 0.5389336943626404, + "learning_rate": 0.0005211888351117917, + "loss": 3.1903, + "step": 14453 + }, + { + "epoch": 0.71, + "grad_norm": 0.5327998995780945, + "learning_rate": 0.0005211784336205889, + "loss": 3.3395, + "step": 14454 + }, + { + "epoch": 0.71, + "grad_norm": 0.5216488838195801, + "learning_rate": 0.0005211680315468448, + "loss": 3.2868, + "step": 14455 + }, + { + "epoch": 0.71, + "grad_norm": 0.5144593715667725, + "learning_rate": 0.0005211576288905865, + "loss": 3.35, + "step": 14456 + }, + { + "epoch": 0.71, + "grad_norm": 0.5322148203849792, + "learning_rate": 0.0005211472256518416, + "loss": 3.0846, + "step": 14457 + }, + { + "epoch": 0.71, + "grad_norm": 0.4917519688606262, + "learning_rate": 0.0005211368218306377, + "loss": 3.1299, + "step": 14458 + }, + { + "epoch": 0.71, + "grad_norm": 0.5577586889266968, + "learning_rate": 0.0005211264174270016, + "loss": 3.156, + "step": 14459 + }, + { + "epoch": 0.71, + "grad_norm": 0.5201316475868225, + "learning_rate": 0.0005211160124409613, + "loss": 3.2895, + "step": 14460 + }, + { + "epoch": 0.71, + "grad_norm": 0.49748873710632324, + "learning_rate": 0.000521105606872544, + "loss": 3.2544, + "step": 14461 + }, + { + "epoch": 0.71, + "grad_norm": 0.5247087478637695, + "learning_rate": 0.000521095200721777, + "loss": 3.1305, + "step": 14462 + }, + { + "epoch": 0.71, + "grad_norm": 0.5503666996955872, + "learning_rate": 0.0005210847939886878, + "loss": 3.2872, + "step": 14463 + }, + { + "epoch": 0.71, + "grad_norm": 0.5127733945846558, + "learning_rate": 0.0005210743866733039, + "loss": 3.3832, + "step": 14464 + }, + { + "epoch": 0.71, + "grad_norm": 0.5337607860565186, + "learning_rate": 0.0005210639787756525, + "loss": 3.2879, + "step": 14465 + }, + { + "epoch": 0.71, + "grad_norm": 0.4939156770706177, + "learning_rate": 0.0005210535702957612, + "loss": 3.2596, + "step": 14466 + }, + { + "epoch": 0.71, + "grad_norm": 0.5159709453582764, + "learning_rate": 0.0005210431612336573, + "loss": 3.3191, + "step": 14467 + }, + { + "epoch": 0.71, + "grad_norm": 0.5352545976638794, + "learning_rate": 0.0005210327515893683, + "loss": 3.1018, + "step": 14468 + }, + { + "epoch": 0.71, + "grad_norm": 0.5087549686431885, + "learning_rate": 0.0005210223413629214, + "loss": 3.0488, + "step": 14469 + }, + { + "epoch": 0.71, + "grad_norm": 0.5398578643798828, + "learning_rate": 0.0005210119305543443, + "loss": 3.1926, + "step": 14470 + }, + { + "epoch": 0.71, + "grad_norm": 0.9343258738517761, + "learning_rate": 0.0005210015191636643, + "loss": 3.0479, + "step": 14471 + }, + { + "epoch": 0.71, + "grad_norm": 0.5741486549377441, + "learning_rate": 0.0005209911071909089, + "loss": 3.1191, + "step": 14472 + }, + { + "epoch": 0.71, + "grad_norm": 0.5154914855957031, + "learning_rate": 0.0005209806946361055, + "loss": 3.2, + "step": 14473 + }, + { + "epoch": 0.71, + "grad_norm": 0.5743831992149353, + "learning_rate": 0.0005209702814992813, + "loss": 3.374, + "step": 14474 + }, + { + "epoch": 0.71, + "grad_norm": 0.5344333648681641, + "learning_rate": 0.000520959867780464, + "loss": 3.467, + "step": 14475 + }, + { + "epoch": 0.71, + "grad_norm": 0.5519991517066956, + "learning_rate": 0.0005209494534796809, + "loss": 2.9471, + "step": 14476 + }, + { + "epoch": 0.71, + "grad_norm": 0.5173640251159668, + "learning_rate": 0.0005209390385969595, + "loss": 3.24, + "step": 14477 + }, + { + "epoch": 0.71, + "grad_norm": 0.5470741987228394, + "learning_rate": 0.0005209286231323271, + "loss": 3.1831, + "step": 14478 + }, + { + "epoch": 0.71, + "grad_norm": 0.6194097399711609, + "learning_rate": 0.0005209182070858114, + "loss": 3.1974, + "step": 14479 + }, + { + "epoch": 0.71, + "grad_norm": 0.4889790415763855, + "learning_rate": 0.0005209077904574396, + "loss": 2.9937, + "step": 14480 + }, + { + "epoch": 0.71, + "grad_norm": 0.5154456496238708, + "learning_rate": 0.0005208973732472391, + "loss": 3.2068, + "step": 14481 + }, + { + "epoch": 0.71, + "grad_norm": 0.4870891273021698, + "learning_rate": 0.0005208869554552375, + "loss": 3.0451, + "step": 14482 + }, + { + "epoch": 0.71, + "grad_norm": 0.5939205288887024, + "learning_rate": 0.0005208765370814622, + "loss": 3.2206, + "step": 14483 + }, + { + "epoch": 0.71, + "grad_norm": 0.4910772442817688, + "learning_rate": 0.0005208661181259406, + "loss": 3.1084, + "step": 14484 + }, + { + "epoch": 0.71, + "grad_norm": 0.5324827432632446, + "learning_rate": 0.0005208556985887001, + "loss": 3.1209, + "step": 14485 + }, + { + "epoch": 0.71, + "grad_norm": 0.497478723526001, + "learning_rate": 0.0005208452784697681, + "loss": 3.2166, + "step": 14486 + }, + { + "epoch": 0.71, + "grad_norm": 0.5420011878013611, + "learning_rate": 0.0005208348577691723, + "loss": 3.2863, + "step": 14487 + }, + { + "epoch": 0.71, + "grad_norm": 0.5050036311149597, + "learning_rate": 0.0005208244364869399, + "loss": 3.0082, + "step": 14488 + }, + { + "epoch": 0.71, + "grad_norm": 0.4819994568824768, + "learning_rate": 0.0005208140146230985, + "loss": 3.2324, + "step": 14489 + }, + { + "epoch": 0.71, + "grad_norm": 0.5056858658790588, + "learning_rate": 0.0005208035921776755, + "loss": 3.2479, + "step": 14490 + }, + { + "epoch": 0.71, + "grad_norm": 0.5183698534965515, + "learning_rate": 0.0005207931691506982, + "loss": 3.206, + "step": 14491 + }, + { + "epoch": 0.71, + "grad_norm": 0.5309738516807556, + "learning_rate": 0.0005207827455421943, + "loss": 3.2727, + "step": 14492 + }, + { + "epoch": 0.71, + "grad_norm": 0.5266025066375732, + "learning_rate": 0.0005207723213521911, + "loss": 3.1433, + "step": 14493 + }, + { + "epoch": 0.71, + "grad_norm": 0.5019662976264954, + "learning_rate": 0.0005207618965807161, + "loss": 3.224, + "step": 14494 + }, + { + "epoch": 0.71, + "grad_norm": 0.5374035239219666, + "learning_rate": 0.0005207514712277968, + "loss": 3.1059, + "step": 14495 + }, + { + "epoch": 0.71, + "grad_norm": 0.5742165446281433, + "learning_rate": 0.0005207410452934605, + "loss": 3.2328, + "step": 14496 + }, + { + "epoch": 0.71, + "grad_norm": 0.4858763813972473, + "learning_rate": 0.0005207306187777348, + "loss": 3.0767, + "step": 14497 + }, + { + "epoch": 0.71, + "grad_norm": 0.5236046314239502, + "learning_rate": 0.0005207201916806473, + "loss": 3.3342, + "step": 14498 + }, + { + "epoch": 0.71, + "grad_norm": 0.5222908854484558, + "learning_rate": 0.000520709764002225, + "loss": 3.2661, + "step": 14499 + }, + { + "epoch": 0.71, + "grad_norm": 0.5480522513389587, + "learning_rate": 0.0005206993357424959, + "loss": 3.132, + "step": 14500 + }, + { + "epoch": 0.71, + "grad_norm": 0.5132131576538086, + "learning_rate": 0.0005206889069014871, + "loss": 3.3595, + "step": 14501 + }, + { + "epoch": 0.71, + "grad_norm": 0.494584858417511, + "learning_rate": 0.0005206784774792263, + "loss": 3.3245, + "step": 14502 + }, + { + "epoch": 0.71, + "grad_norm": 0.4954333007335663, + "learning_rate": 0.0005206680474757407, + "loss": 3.2417, + "step": 14503 + }, + { + "epoch": 0.71, + "grad_norm": 0.5081412196159363, + "learning_rate": 0.0005206576168910581, + "loss": 3.0736, + "step": 14504 + }, + { + "epoch": 0.71, + "grad_norm": 0.5705395340919495, + "learning_rate": 0.0005206471857252057, + "loss": 3.0512, + "step": 14505 + }, + { + "epoch": 0.71, + "grad_norm": 0.5100386142730713, + "learning_rate": 0.000520636753978211, + "loss": 3.082, + "step": 14506 + }, + { + "epoch": 0.71, + "grad_norm": 0.5036544799804688, + "learning_rate": 0.0005206263216501018, + "loss": 3.2025, + "step": 14507 + }, + { + "epoch": 0.71, + "grad_norm": 0.5994665026664734, + "learning_rate": 0.0005206158887409052, + "loss": 3.2004, + "step": 14508 + }, + { + "epoch": 0.71, + "grad_norm": 0.50201815366745, + "learning_rate": 0.0005206054552506487, + "loss": 3.1203, + "step": 14509 + }, + { + "epoch": 0.71, + "grad_norm": 0.5317625999450684, + "learning_rate": 0.00052059502117936, + "loss": 2.9033, + "step": 14510 + }, + { + "epoch": 0.71, + "grad_norm": 0.5502082705497742, + "learning_rate": 0.0005205845865270664, + "loss": 3.1199, + "step": 14511 + }, + { + "epoch": 0.71, + "grad_norm": 0.5232048034667969, + "learning_rate": 0.0005205741512937955, + "loss": 3.1681, + "step": 14512 + }, + { + "epoch": 0.71, + "grad_norm": 0.5692151784896851, + "learning_rate": 0.0005205637154795748, + "loss": 3.1412, + "step": 14513 + }, + { + "epoch": 0.71, + "grad_norm": 0.47583895921707153, + "learning_rate": 0.0005205532790844317, + "loss": 3.3911, + "step": 14514 + }, + { + "epoch": 0.71, + "grad_norm": 0.5234794616699219, + "learning_rate": 0.0005205428421083936, + "loss": 3.0717, + "step": 14515 + }, + { + "epoch": 0.71, + "grad_norm": 0.5292587876319885, + "learning_rate": 0.0005205324045514882, + "loss": 3.2203, + "step": 14516 + }, + { + "epoch": 0.71, + "grad_norm": 0.4843212068080902, + "learning_rate": 0.0005205219664137428, + "loss": 3.317, + "step": 14517 + }, + { + "epoch": 0.71, + "grad_norm": 0.5709378719329834, + "learning_rate": 0.000520511527695185, + "loss": 3.3233, + "step": 14518 + }, + { + "epoch": 0.71, + "grad_norm": 0.4811621308326721, + "learning_rate": 0.0005205010883958423, + "loss": 3.2231, + "step": 14519 + }, + { + "epoch": 0.71, + "grad_norm": 0.5761101841926575, + "learning_rate": 0.0005204906485157423, + "loss": 3.431, + "step": 14520 + }, + { + "epoch": 0.71, + "grad_norm": 0.5028489828109741, + "learning_rate": 0.0005204802080549122, + "loss": 3.1521, + "step": 14521 + }, + { + "epoch": 0.71, + "grad_norm": 0.5608251690864563, + "learning_rate": 0.0005204697670133798, + "loss": 2.9437, + "step": 14522 + }, + { + "epoch": 0.71, + "grad_norm": 0.5372280478477478, + "learning_rate": 0.0005204593253911724, + "loss": 3.249, + "step": 14523 + }, + { + "epoch": 0.71, + "grad_norm": 0.5417999029159546, + "learning_rate": 0.0005204488831883174, + "loss": 3.2695, + "step": 14524 + }, + { + "epoch": 0.71, + "grad_norm": 0.5193753242492676, + "learning_rate": 0.0005204384404048426, + "loss": 3.1652, + "step": 14525 + }, + { + "epoch": 0.71, + "grad_norm": 0.528472900390625, + "learning_rate": 0.0005204279970407754, + "loss": 3.1903, + "step": 14526 + }, + { + "epoch": 0.71, + "grad_norm": 0.5966113805770874, + "learning_rate": 0.0005204175530961432, + "loss": 3.2464, + "step": 14527 + }, + { + "epoch": 0.71, + "grad_norm": 0.5065594911575317, + "learning_rate": 0.0005204071085709737, + "loss": 3.4082, + "step": 14528 + }, + { + "epoch": 0.71, + "grad_norm": 0.5471876859664917, + "learning_rate": 0.0005203966634652942, + "loss": 3.0131, + "step": 14529 + }, + { + "epoch": 0.71, + "grad_norm": 0.5287893414497375, + "learning_rate": 0.0005203862177791324, + "loss": 3.1887, + "step": 14530 + }, + { + "epoch": 0.71, + "grad_norm": 0.4954758882522583, + "learning_rate": 0.0005203757715125157, + "loss": 3.2469, + "step": 14531 + }, + { + "epoch": 0.71, + "grad_norm": 0.4946754276752472, + "learning_rate": 0.0005203653246654715, + "loss": 3.3233, + "step": 14532 + }, + { + "epoch": 0.71, + "grad_norm": 0.5238100290298462, + "learning_rate": 0.0005203548772380276, + "loss": 3.1324, + "step": 14533 + }, + { + "epoch": 0.71, + "grad_norm": 0.5408486127853394, + "learning_rate": 0.0005203444292302112, + "loss": 3.1249, + "step": 14534 + }, + { + "epoch": 0.71, + "grad_norm": 0.4843558073043823, + "learning_rate": 0.0005203339806420501, + "loss": 3.1595, + "step": 14535 + }, + { + "epoch": 0.71, + "grad_norm": 0.5062605142593384, + "learning_rate": 0.0005203235314735717, + "loss": 3.1967, + "step": 14536 + }, + { + "epoch": 0.71, + "grad_norm": 0.5224248766899109, + "learning_rate": 0.0005203130817248035, + "loss": 3.4137, + "step": 14537 + }, + { + "epoch": 0.71, + "grad_norm": 0.5459402799606323, + "learning_rate": 0.000520302631395773, + "loss": 2.9693, + "step": 14538 + }, + { + "epoch": 0.71, + "grad_norm": 0.5118008852005005, + "learning_rate": 0.0005202921804865078, + "loss": 3.4552, + "step": 14539 + }, + { + "epoch": 0.71, + "grad_norm": 0.516243040561676, + "learning_rate": 0.0005202817289970353, + "loss": 3.3095, + "step": 14540 + }, + { + "epoch": 0.71, + "grad_norm": 0.4901047945022583, + "learning_rate": 0.0005202712769273833, + "loss": 3.1625, + "step": 14541 + }, + { + "epoch": 0.71, + "grad_norm": 0.5035889148712158, + "learning_rate": 0.0005202608242775791, + "loss": 3.2717, + "step": 14542 + }, + { + "epoch": 0.71, + "grad_norm": 0.5227107405662537, + "learning_rate": 0.0005202503710476502, + "loss": 3.3589, + "step": 14543 + }, + { + "epoch": 0.71, + "grad_norm": 0.5514225959777832, + "learning_rate": 0.0005202399172376242, + "loss": 3.1658, + "step": 14544 + }, + { + "epoch": 0.71, + "grad_norm": 0.5554363131523132, + "learning_rate": 0.0005202294628475287, + "loss": 3.0967, + "step": 14545 + }, + { + "epoch": 0.71, + "grad_norm": 0.47724464535713196, + "learning_rate": 0.0005202190078773912, + "loss": 3.2781, + "step": 14546 + }, + { + "epoch": 0.71, + "grad_norm": 0.5094786882400513, + "learning_rate": 0.0005202085523272393, + "loss": 3.3368, + "step": 14547 + }, + { + "epoch": 0.71, + "grad_norm": 0.4955451190471649, + "learning_rate": 0.0005201980961971002, + "loss": 3.1882, + "step": 14548 + }, + { + "epoch": 0.71, + "grad_norm": 0.5031975507736206, + "learning_rate": 0.0005201876394870018, + "loss": 3.0168, + "step": 14549 + }, + { + "epoch": 0.71, + "grad_norm": 0.5426763892173767, + "learning_rate": 0.0005201771821969716, + "loss": 3.14, + "step": 14550 + }, + { + "epoch": 0.71, + "grad_norm": 0.4851762056350708, + "learning_rate": 0.0005201667243270371, + "loss": 3.2685, + "step": 14551 + }, + { + "epoch": 0.71, + "grad_norm": 0.5311375856399536, + "learning_rate": 0.0005201562658772258, + "loss": 3.4027, + "step": 14552 + }, + { + "epoch": 0.71, + "grad_norm": 0.5157365798950195, + "learning_rate": 0.0005201458068475652, + "loss": 3.1562, + "step": 14553 + }, + { + "epoch": 0.71, + "grad_norm": 0.5381741523742676, + "learning_rate": 0.0005201353472380829, + "loss": 3.1626, + "step": 14554 + }, + { + "epoch": 0.71, + "grad_norm": 0.532410204410553, + "learning_rate": 0.0005201248870488065, + "loss": 3.3552, + "step": 14555 + }, + { + "epoch": 0.71, + "grad_norm": 0.5098461508750916, + "learning_rate": 0.0005201144262797636, + "loss": 3.1537, + "step": 14556 + }, + { + "epoch": 0.71, + "grad_norm": 0.5130876898765564, + "learning_rate": 0.0005201039649309815, + "loss": 3.2291, + "step": 14557 + }, + { + "epoch": 0.71, + "grad_norm": 0.512928307056427, + "learning_rate": 0.0005200935030024881, + "loss": 3.0487, + "step": 14558 + }, + { + "epoch": 0.71, + "grad_norm": 0.5555957555770874, + "learning_rate": 0.0005200830404943106, + "loss": 3.1302, + "step": 14559 + }, + { + "epoch": 0.71, + "grad_norm": 0.49892672896385193, + "learning_rate": 0.0005200725774064768, + "loss": 3.2873, + "step": 14560 + }, + { + "epoch": 0.71, + "grad_norm": 0.5405455827713013, + "learning_rate": 0.0005200621137390141, + "loss": 3.1677, + "step": 14561 + }, + { + "epoch": 0.71, + "grad_norm": 0.48926350474357605, + "learning_rate": 0.0005200516494919502, + "loss": 3.2558, + "step": 14562 + }, + { + "epoch": 0.71, + "grad_norm": 0.5370481014251709, + "learning_rate": 0.0005200411846653127, + "loss": 3.2475, + "step": 14563 + }, + { + "epoch": 0.71, + "grad_norm": 0.5003460049629211, + "learning_rate": 0.000520030719259129, + "loss": 3.1576, + "step": 14564 + }, + { + "epoch": 0.71, + "grad_norm": 0.48471590876579285, + "learning_rate": 0.0005200202532734267, + "loss": 3.1899, + "step": 14565 + }, + { + "epoch": 0.71, + "grad_norm": 0.5002859234809875, + "learning_rate": 0.0005200097867082335, + "loss": 3.2503, + "step": 14566 + }, + { + "epoch": 0.71, + "grad_norm": 0.5285899639129639, + "learning_rate": 0.0005199993195635768, + "loss": 3.1374, + "step": 14567 + }, + { + "epoch": 0.71, + "grad_norm": 0.5553933382034302, + "learning_rate": 0.0005199888518394841, + "loss": 3.1874, + "step": 14568 + }, + { + "epoch": 0.71, + "grad_norm": 0.7586709856987, + "learning_rate": 0.0005199783835359833, + "loss": 3.2705, + "step": 14569 + }, + { + "epoch": 0.71, + "grad_norm": 0.5392246842384338, + "learning_rate": 0.0005199679146531017, + "loss": 3.0995, + "step": 14570 + }, + { + "epoch": 0.71, + "grad_norm": 0.5427459478378296, + "learning_rate": 0.000519957445190867, + "loss": 3.0778, + "step": 14571 + }, + { + "epoch": 0.71, + "grad_norm": 0.5674456357955933, + "learning_rate": 0.0005199469751493065, + "loss": 3.1027, + "step": 14572 + }, + { + "epoch": 0.71, + "grad_norm": 0.5392853021621704, + "learning_rate": 0.0005199365045284482, + "loss": 3.2362, + "step": 14573 + }, + { + "epoch": 0.71, + "grad_norm": 0.5736402869224548, + "learning_rate": 0.0005199260333283195, + "loss": 3.0357, + "step": 14574 + }, + { + "epoch": 0.71, + "grad_norm": 0.5310801267623901, + "learning_rate": 0.0005199155615489478, + "loss": 3.3081, + "step": 14575 + }, + { + "epoch": 0.71, + "grad_norm": 0.5260907411575317, + "learning_rate": 0.000519905089190361, + "loss": 3.1554, + "step": 14576 + }, + { + "epoch": 0.71, + "grad_norm": 0.4972694218158722, + "learning_rate": 0.0005198946162525864, + "loss": 3.1766, + "step": 14577 + }, + { + "epoch": 0.71, + "grad_norm": 0.472507506608963, + "learning_rate": 0.0005198841427356517, + "loss": 3.1027, + "step": 14578 + }, + { + "epoch": 0.71, + "grad_norm": 0.6582967042922974, + "learning_rate": 0.0005198736686395846, + "loss": 3.3213, + "step": 14579 + }, + { + "epoch": 0.71, + "grad_norm": 0.5455946326255798, + "learning_rate": 0.0005198631939644124, + "loss": 3.1553, + "step": 14580 + }, + { + "epoch": 0.71, + "grad_norm": 0.5628792643547058, + "learning_rate": 0.000519852718710163, + "loss": 3.0593, + "step": 14581 + }, + { + "epoch": 0.71, + "grad_norm": 0.5313958525657654, + "learning_rate": 0.0005198422428768639, + "loss": 3.2098, + "step": 14582 + }, + { + "epoch": 0.71, + "grad_norm": 0.5243238806724548, + "learning_rate": 0.0005198317664645424, + "loss": 3.2207, + "step": 14583 + }, + { + "epoch": 0.71, + "grad_norm": 0.5564098358154297, + "learning_rate": 0.0005198212894732266, + "loss": 3.0649, + "step": 14584 + }, + { + "epoch": 0.71, + "grad_norm": 0.5318445563316345, + "learning_rate": 0.0005198108119029437, + "loss": 3.2918, + "step": 14585 + }, + { + "epoch": 0.71, + "grad_norm": 0.5206500291824341, + "learning_rate": 0.0005198003337537214, + "loss": 3.3708, + "step": 14586 + }, + { + "epoch": 0.71, + "grad_norm": 0.5864635109901428, + "learning_rate": 0.0005197898550255874, + "loss": 3.2293, + "step": 14587 + }, + { + "epoch": 0.71, + "grad_norm": 0.497435986995697, + "learning_rate": 0.0005197793757185692, + "loss": 3.2024, + "step": 14588 + }, + { + "epoch": 0.71, + "grad_norm": 0.5075864195823669, + "learning_rate": 0.0005197688958326944, + "loss": 3.2853, + "step": 14589 + }, + { + "epoch": 0.72, + "grad_norm": 0.5199374556541443, + "learning_rate": 0.0005197584153679906, + "loss": 3.2211, + "step": 14590 + }, + { + "epoch": 0.72, + "grad_norm": 0.5085850358009338, + "learning_rate": 0.0005197479343244854, + "loss": 3.43, + "step": 14591 + }, + { + "epoch": 0.72, + "grad_norm": 0.5507273077964783, + "learning_rate": 0.0005197374527022065, + "loss": 3.3325, + "step": 14592 + }, + { + "epoch": 0.72, + "grad_norm": 0.5263931155204773, + "learning_rate": 0.0005197269705011815, + "loss": 3.1009, + "step": 14593 + }, + { + "epoch": 0.72, + "grad_norm": 0.5148525834083557, + "learning_rate": 0.0005197164877214379, + "loss": 3.0378, + "step": 14594 + }, + { + "epoch": 0.72, + "grad_norm": 0.5128089189529419, + "learning_rate": 0.0005197060043630032, + "loss": 3.2473, + "step": 14595 + }, + { + "epoch": 0.72, + "grad_norm": 0.4934292733669281, + "learning_rate": 0.0005196955204259053, + "loss": 3.1324, + "step": 14596 + }, + { + "epoch": 0.72, + "grad_norm": 0.5595125555992126, + "learning_rate": 0.0005196850359101716, + "loss": 3.201, + "step": 14597 + }, + { + "epoch": 0.72, + "grad_norm": 0.5288695693016052, + "learning_rate": 0.0005196745508158299, + "loss": 3.0758, + "step": 14598 + }, + { + "epoch": 0.72, + "grad_norm": 0.5381522178649902, + "learning_rate": 0.0005196640651429078, + "loss": 3.0904, + "step": 14599 + }, + { + "epoch": 0.72, + "grad_norm": 0.502053439617157, + "learning_rate": 0.0005196535788914326, + "loss": 3.2181, + "step": 14600 + }, + { + "epoch": 0.72, + "grad_norm": 0.49978283047676086, + "learning_rate": 0.0005196430920614323, + "loss": 3.274, + "step": 14601 + }, + { + "epoch": 0.72, + "grad_norm": 0.5054643750190735, + "learning_rate": 0.0005196326046529344, + "loss": 3.2331, + "step": 14602 + }, + { + "epoch": 0.72, + "grad_norm": 0.5227307677268982, + "learning_rate": 0.0005196221166659663, + "loss": 3.3232, + "step": 14603 + }, + { + "epoch": 0.72, + "grad_norm": 0.5008553862571716, + "learning_rate": 0.000519611628100556, + "loss": 3.2415, + "step": 14604 + }, + { + "epoch": 0.72, + "grad_norm": 0.5128350853919983, + "learning_rate": 0.0005196011389567308, + "loss": 3.219, + "step": 14605 + }, + { + "epoch": 0.72, + "grad_norm": 0.5377113223075867, + "learning_rate": 0.0005195906492345186, + "loss": 3.0948, + "step": 14606 + }, + { + "epoch": 0.72, + "grad_norm": 0.5005555152893066, + "learning_rate": 0.0005195801589339468, + "loss": 3.3787, + "step": 14607 + }, + { + "epoch": 0.72, + "grad_norm": 0.5084679126739502, + "learning_rate": 0.0005195696680550431, + "loss": 3.2606, + "step": 14608 + }, + { + "epoch": 0.72, + "grad_norm": 0.4670720100402832, + "learning_rate": 0.0005195591765978352, + "loss": 3.2102, + "step": 14609 + }, + { + "epoch": 0.72, + "grad_norm": 0.5008261799812317, + "learning_rate": 0.0005195486845623507, + "loss": 3.3554, + "step": 14610 + }, + { + "epoch": 0.72, + "grad_norm": 0.5201194286346436, + "learning_rate": 0.0005195381919486171, + "loss": 3.1835, + "step": 14611 + }, + { + "epoch": 0.72, + "grad_norm": 0.5437430143356323, + "learning_rate": 0.0005195276987566623, + "loss": 3.1677, + "step": 14612 + }, + { + "epoch": 0.72, + "grad_norm": 0.5281556844711304, + "learning_rate": 0.0005195172049865138, + "loss": 3.3595, + "step": 14613 + }, + { + "epoch": 0.72, + "grad_norm": 0.5061221122741699, + "learning_rate": 0.0005195067106381992, + "loss": 3.2528, + "step": 14614 + }, + { + "epoch": 0.72, + "grad_norm": 0.5027444362640381, + "learning_rate": 0.000519496215711746, + "loss": 3.115, + "step": 14615 + }, + { + "epoch": 0.72, + "grad_norm": 0.5213714241981506, + "learning_rate": 0.0005194857202071822, + "loss": 3.3346, + "step": 14616 + }, + { + "epoch": 0.72, + "grad_norm": 0.5137941837310791, + "learning_rate": 0.0005194752241245352, + "loss": 3.2455, + "step": 14617 + }, + { + "epoch": 0.72, + "grad_norm": 0.520012617111206, + "learning_rate": 0.0005194647274638327, + "loss": 3.3542, + "step": 14618 + }, + { + "epoch": 0.72, + "grad_norm": 0.5151776075363159, + "learning_rate": 0.0005194542302251024, + "loss": 3.2251, + "step": 14619 + }, + { + "epoch": 0.72, + "grad_norm": 0.5593071579933167, + "learning_rate": 0.0005194437324083718, + "loss": 2.9776, + "step": 14620 + }, + { + "epoch": 0.72, + "grad_norm": 0.4902730882167816, + "learning_rate": 0.0005194332340136686, + "loss": 3.2225, + "step": 14621 + }, + { + "epoch": 0.72, + "grad_norm": 0.5338214039802551, + "learning_rate": 0.0005194227350410205, + "loss": 3.3136, + "step": 14622 + }, + { + "epoch": 0.72, + "grad_norm": 0.5151000022888184, + "learning_rate": 0.0005194122354904553, + "loss": 3.224, + "step": 14623 + }, + { + "epoch": 0.72, + "grad_norm": 0.5185823440551758, + "learning_rate": 0.0005194017353620004, + "loss": 3.149, + "step": 14624 + }, + { + "epoch": 0.72, + "grad_norm": 0.5511698126792908, + "learning_rate": 0.0005193912346556836, + "loss": 3.1918, + "step": 14625 + }, + { + "epoch": 0.72, + "grad_norm": 0.5243788361549377, + "learning_rate": 0.0005193807333715324, + "loss": 3.1177, + "step": 14626 + }, + { + "epoch": 0.72, + "grad_norm": 0.5167205333709717, + "learning_rate": 0.0005193702315095746, + "loss": 2.923, + "step": 14627 + }, + { + "epoch": 0.72, + "grad_norm": 0.6894581317901611, + "learning_rate": 0.0005193597290698379, + "loss": 3.3533, + "step": 14628 + }, + { + "epoch": 0.72, + "grad_norm": 0.5190028548240662, + "learning_rate": 0.0005193492260523499, + "loss": 3.2836, + "step": 14629 + }, + { + "epoch": 0.72, + "grad_norm": 0.508630633354187, + "learning_rate": 0.0005193387224571381, + "loss": 3.4301, + "step": 14630 + }, + { + "epoch": 0.72, + "grad_norm": 0.5017585158348083, + "learning_rate": 0.0005193282182842305, + "loss": 3.2973, + "step": 14631 + }, + { + "epoch": 0.72, + "grad_norm": 0.5430968403816223, + "learning_rate": 0.0005193177135336544, + "loss": 3.1365, + "step": 14632 + }, + { + "epoch": 0.72, + "grad_norm": 0.5301803350448608, + "learning_rate": 0.0005193072082054379, + "loss": 3.2685, + "step": 14633 + }, + { + "epoch": 0.72, + "grad_norm": 0.5263997912406921, + "learning_rate": 0.0005192967022996083, + "loss": 3.4263, + "step": 14634 + }, + { + "epoch": 0.72, + "grad_norm": 0.5006689429283142, + "learning_rate": 0.0005192861958161933, + "loss": 3.3041, + "step": 14635 + }, + { + "epoch": 0.72, + "grad_norm": 0.5174351334571838, + "learning_rate": 0.0005192756887552207, + "loss": 3.1699, + "step": 14636 + }, + { + "epoch": 0.72, + "grad_norm": 0.6178990602493286, + "learning_rate": 0.0005192651811167183, + "loss": 3.1885, + "step": 14637 + }, + { + "epoch": 0.72, + "grad_norm": 0.5171335339546204, + "learning_rate": 0.0005192546729007135, + "loss": 3.1908, + "step": 14638 + }, + { + "epoch": 0.72, + "grad_norm": 0.5851891040802002, + "learning_rate": 0.000519244164107234, + "loss": 3.2078, + "step": 14639 + }, + { + "epoch": 0.72, + "grad_norm": 0.49289432168006897, + "learning_rate": 0.0005192336547363076, + "loss": 3.2048, + "step": 14640 + }, + { + "epoch": 0.72, + "grad_norm": 0.4947859048843384, + "learning_rate": 0.0005192231447879621, + "loss": 3.1892, + "step": 14641 + }, + { + "epoch": 0.72, + "grad_norm": 0.5246251821517944, + "learning_rate": 0.0005192126342622249, + "loss": 2.9959, + "step": 14642 + }, + { + "epoch": 0.72, + "grad_norm": 0.5482833385467529, + "learning_rate": 0.0005192021231591238, + "loss": 3.3374, + "step": 14643 + }, + { + "epoch": 0.72, + "grad_norm": 0.5282585620880127, + "learning_rate": 0.0005191916114786865, + "loss": 3.2919, + "step": 14644 + }, + { + "epoch": 0.72, + "grad_norm": 0.5180543661117554, + "learning_rate": 0.0005191810992209407, + "loss": 3.2924, + "step": 14645 + }, + { + "epoch": 0.72, + "grad_norm": 0.5106507539749146, + "learning_rate": 0.0005191705863859141, + "loss": 3.0478, + "step": 14646 + }, + { + "epoch": 0.72, + "grad_norm": 0.501225471496582, + "learning_rate": 0.0005191600729736343, + "loss": 3.3096, + "step": 14647 + }, + { + "epoch": 0.72, + "grad_norm": 0.5608596205711365, + "learning_rate": 0.000519149558984129, + "loss": 3.1157, + "step": 14648 + }, + { + "epoch": 0.72, + "grad_norm": 0.521439790725708, + "learning_rate": 0.0005191390444174261, + "loss": 3.2124, + "step": 14649 + }, + { + "epoch": 0.72, + "grad_norm": 0.4991474747657776, + "learning_rate": 0.0005191285292735529, + "loss": 3.2962, + "step": 14650 + }, + { + "epoch": 0.72, + "grad_norm": 0.5133585333824158, + "learning_rate": 0.0005191180135525375, + "loss": 3.3118, + "step": 14651 + }, + { + "epoch": 0.72, + "grad_norm": 0.4970352053642273, + "learning_rate": 0.0005191074972544073, + "loss": 3.3128, + "step": 14652 + }, + { + "epoch": 0.72, + "grad_norm": 0.5411221385002136, + "learning_rate": 0.0005190969803791903, + "loss": 3.2144, + "step": 14653 + }, + { + "epoch": 0.72, + "grad_norm": 0.5432132482528687, + "learning_rate": 0.0005190864629269139, + "loss": 3.1416, + "step": 14654 + }, + { + "epoch": 0.72, + "grad_norm": 0.599610447883606, + "learning_rate": 0.0005190759448976059, + "loss": 2.9844, + "step": 14655 + }, + { + "epoch": 0.72, + "grad_norm": 0.49764925241470337, + "learning_rate": 0.0005190654262912941, + "loss": 3.1174, + "step": 14656 + }, + { + "epoch": 0.72, + "grad_norm": 0.49829450249671936, + "learning_rate": 0.000519054907108006, + "loss": 3.2279, + "step": 14657 + }, + { + "epoch": 0.72, + "grad_norm": 0.5487377643585205, + "learning_rate": 0.0005190443873477696, + "loss": 3.0803, + "step": 14658 + }, + { + "epoch": 0.72, + "grad_norm": 0.5312570333480835, + "learning_rate": 0.0005190338670106124, + "loss": 3.2438, + "step": 14659 + }, + { + "epoch": 0.72, + "grad_norm": 0.5211238265037537, + "learning_rate": 0.0005190233460965621, + "loss": 3.2646, + "step": 14660 + }, + { + "epoch": 0.72, + "grad_norm": 0.5218424797058105, + "learning_rate": 0.0005190128246056465, + "loss": 3.0871, + "step": 14661 + }, + { + "epoch": 0.72, + "grad_norm": 0.5989399552345276, + "learning_rate": 0.0005190023025378932, + "loss": 3.2677, + "step": 14662 + }, + { + "epoch": 0.72, + "grad_norm": 0.5372099876403809, + "learning_rate": 0.00051899177989333, + "loss": 3.164, + "step": 14663 + }, + { + "epoch": 0.72, + "grad_norm": 0.4821583926677704, + "learning_rate": 0.0005189812566719847, + "loss": 3.1532, + "step": 14664 + }, + { + "epoch": 0.72, + "grad_norm": 0.5003272891044617, + "learning_rate": 0.0005189707328738848, + "loss": 3.3264, + "step": 14665 + }, + { + "epoch": 0.72, + "grad_norm": 0.521091103553772, + "learning_rate": 0.0005189602084990581, + "loss": 3.1722, + "step": 14666 + }, + { + "epoch": 0.72, + "grad_norm": 0.5140360593795776, + "learning_rate": 0.0005189496835475325, + "loss": 3.2693, + "step": 14667 + }, + { + "epoch": 0.72, + "grad_norm": 0.5883575081825256, + "learning_rate": 0.0005189391580193354, + "loss": 3.0789, + "step": 14668 + }, + { + "epoch": 0.72, + "grad_norm": 0.4931708574295044, + "learning_rate": 0.0005189286319144949, + "loss": 3.1453, + "step": 14669 + }, + { + "epoch": 0.72, + "grad_norm": 0.5307677984237671, + "learning_rate": 0.0005189181052330384, + "loss": 3.3826, + "step": 14670 + }, + { + "epoch": 0.72, + "grad_norm": 0.4849635362625122, + "learning_rate": 0.0005189075779749937, + "loss": 3.0497, + "step": 14671 + }, + { + "epoch": 0.72, + "grad_norm": 0.49971646070480347, + "learning_rate": 0.0005188970501403886, + "loss": 3.0944, + "step": 14672 + }, + { + "epoch": 0.72, + "grad_norm": 0.521876871585846, + "learning_rate": 0.0005188865217292508, + "loss": 3.1896, + "step": 14673 + }, + { + "epoch": 0.72, + "grad_norm": 0.4860338568687439, + "learning_rate": 0.000518875992741608, + "loss": 3.2229, + "step": 14674 + }, + { + "epoch": 0.72, + "grad_norm": 0.5542223453521729, + "learning_rate": 0.0005188654631774881, + "loss": 3.0625, + "step": 14675 + }, + { + "epoch": 0.72, + "grad_norm": 0.5281686186790466, + "learning_rate": 0.0005188549330369186, + "loss": 3.1883, + "step": 14676 + }, + { + "epoch": 0.72, + "grad_norm": 0.49884727597236633, + "learning_rate": 0.0005188444023199272, + "loss": 3.141, + "step": 14677 + }, + { + "epoch": 0.72, + "grad_norm": 0.5445032715797424, + "learning_rate": 0.0005188338710265419, + "loss": 3.2494, + "step": 14678 + }, + { + "epoch": 0.72, + "grad_norm": 0.5325313806533813, + "learning_rate": 0.0005188233391567903, + "loss": 3.1469, + "step": 14679 + }, + { + "epoch": 0.72, + "grad_norm": 0.5177390575408936, + "learning_rate": 0.0005188128067107002, + "loss": 2.9586, + "step": 14680 + }, + { + "epoch": 0.72, + "grad_norm": 0.5098357200622559, + "learning_rate": 0.0005188022736882991, + "loss": 3.1629, + "step": 14681 + }, + { + "epoch": 0.72, + "grad_norm": 0.4955054521560669, + "learning_rate": 0.0005187917400896149, + "loss": 3.3163, + "step": 14682 + }, + { + "epoch": 0.72, + "grad_norm": 0.5325889587402344, + "learning_rate": 0.0005187812059146756, + "loss": 3.1758, + "step": 14683 + }, + { + "epoch": 0.72, + "grad_norm": 0.5700262188911438, + "learning_rate": 0.0005187706711635086, + "loss": 3.1304, + "step": 14684 + }, + { + "epoch": 0.72, + "grad_norm": 0.517220139503479, + "learning_rate": 0.0005187601358361417, + "loss": 3.0101, + "step": 14685 + }, + { + "epoch": 0.72, + "grad_norm": 0.5129212737083435, + "learning_rate": 0.0005187495999326027, + "loss": 2.9816, + "step": 14686 + }, + { + "epoch": 0.72, + "grad_norm": 0.5115025043487549, + "learning_rate": 0.0005187390634529194, + "loss": 3.1781, + "step": 14687 + }, + { + "epoch": 0.72, + "grad_norm": 0.5292808413505554, + "learning_rate": 0.0005187285263971196, + "loss": 3.129, + "step": 14688 + }, + { + "epoch": 0.72, + "grad_norm": 0.5144400596618652, + "learning_rate": 0.0005187179887652307, + "loss": 3.1161, + "step": 14689 + }, + { + "epoch": 0.72, + "grad_norm": 0.525768518447876, + "learning_rate": 0.000518707450557281, + "loss": 3.2922, + "step": 14690 + }, + { + "epoch": 0.72, + "grad_norm": 0.5286670327186584, + "learning_rate": 0.0005186969117732977, + "loss": 3.4953, + "step": 14691 + }, + { + "epoch": 0.72, + "grad_norm": 0.5114057660102844, + "learning_rate": 0.000518686372413309, + "loss": 3.3082, + "step": 14692 + }, + { + "epoch": 0.72, + "grad_norm": 0.513731062412262, + "learning_rate": 0.0005186758324773425, + "loss": 3.048, + "step": 14693 + }, + { + "epoch": 0.72, + "grad_norm": 0.5025843977928162, + "learning_rate": 0.0005186652919654259, + "loss": 3.2755, + "step": 14694 + }, + { + "epoch": 0.72, + "grad_norm": 0.615983247756958, + "learning_rate": 0.0005186547508775869, + "loss": 3.111, + "step": 14695 + }, + { + "epoch": 0.72, + "grad_norm": 0.5085950493812561, + "learning_rate": 0.0005186442092138535, + "loss": 3.2742, + "step": 14696 + }, + { + "epoch": 0.72, + "grad_norm": 0.5130026936531067, + "learning_rate": 0.0005186336669742533, + "loss": 3.1467, + "step": 14697 + }, + { + "epoch": 0.72, + "grad_norm": 0.5810296535491943, + "learning_rate": 0.000518623124158814, + "loss": 3.2415, + "step": 14698 + }, + { + "epoch": 0.72, + "grad_norm": 0.5049729943275452, + "learning_rate": 0.0005186125807675636, + "loss": 3.09, + "step": 14699 + }, + { + "epoch": 0.72, + "grad_norm": 0.5015254616737366, + "learning_rate": 0.0005186020368005297, + "loss": 3.0512, + "step": 14700 + }, + { + "epoch": 0.72, + "grad_norm": 0.5339839458465576, + "learning_rate": 0.00051859149225774, + "loss": 3.2614, + "step": 14701 + }, + { + "epoch": 0.72, + "grad_norm": 0.5117117762565613, + "learning_rate": 0.0005185809471392226, + "loss": 3.4624, + "step": 14702 + }, + { + "epoch": 0.72, + "grad_norm": 0.5019668936729431, + "learning_rate": 0.0005185704014450048, + "loss": 3.3371, + "step": 14703 + }, + { + "epoch": 0.72, + "grad_norm": 0.5273573994636536, + "learning_rate": 0.0005185598551751148, + "loss": 3.3187, + "step": 14704 + }, + { + "epoch": 0.72, + "grad_norm": 0.5079138278961182, + "learning_rate": 0.0005185493083295802, + "loss": 3.2523, + "step": 14705 + }, + { + "epoch": 0.72, + "grad_norm": 0.5488572120666504, + "learning_rate": 0.0005185387609084286, + "loss": 3.1907, + "step": 14706 + }, + { + "epoch": 0.72, + "grad_norm": 0.47864294052124023, + "learning_rate": 0.0005185282129116882, + "loss": 2.9153, + "step": 14707 + }, + { + "epoch": 0.72, + "grad_norm": 0.516621470451355, + "learning_rate": 0.0005185176643393864, + "loss": 3.2739, + "step": 14708 + }, + { + "epoch": 0.72, + "grad_norm": 0.5150627493858337, + "learning_rate": 0.0005185071151915512, + "loss": 3.2769, + "step": 14709 + }, + { + "epoch": 0.72, + "grad_norm": 0.5592693090438843, + "learning_rate": 0.0005184965654682103, + "loss": 3.2601, + "step": 14710 + }, + { + "epoch": 0.72, + "grad_norm": 0.5248083472251892, + "learning_rate": 0.0005184860151693914, + "loss": 3.0784, + "step": 14711 + }, + { + "epoch": 0.72, + "grad_norm": 0.5441768765449524, + "learning_rate": 0.0005184754642951224, + "loss": 3.0752, + "step": 14712 + }, + { + "epoch": 0.72, + "grad_norm": 0.4891248047351837, + "learning_rate": 0.0005184649128454312, + "loss": 3.2164, + "step": 14713 + }, + { + "epoch": 0.72, + "grad_norm": 0.5104601383209229, + "learning_rate": 0.0005184543608203454, + "loss": 2.9591, + "step": 14714 + }, + { + "epoch": 0.72, + "grad_norm": 0.5093756318092346, + "learning_rate": 0.0005184438082198929, + "loss": 3.1289, + "step": 14715 + }, + { + "epoch": 0.72, + "grad_norm": 0.5416193604469299, + "learning_rate": 0.0005184332550441013, + "loss": 3.1514, + "step": 14716 + }, + { + "epoch": 0.72, + "grad_norm": 0.5121513605117798, + "learning_rate": 0.0005184227012929987, + "loss": 3.3509, + "step": 14717 + }, + { + "epoch": 0.72, + "grad_norm": 0.5216866135597229, + "learning_rate": 0.0005184121469666127, + "loss": 3.1625, + "step": 14718 + }, + { + "epoch": 0.72, + "grad_norm": 0.5179713368415833, + "learning_rate": 0.0005184015920649711, + "loss": 3.0819, + "step": 14719 + }, + { + "epoch": 0.72, + "grad_norm": 0.4934249222278595, + "learning_rate": 0.0005183910365881018, + "loss": 3.0533, + "step": 14720 + }, + { + "epoch": 0.72, + "grad_norm": 0.5014771819114685, + "learning_rate": 0.0005183804805360324, + "loss": 3.4464, + "step": 14721 + }, + { + "epoch": 0.72, + "grad_norm": 0.4989439845085144, + "learning_rate": 0.000518369923908791, + "loss": 3.0768, + "step": 14722 + }, + { + "epoch": 0.72, + "grad_norm": 0.5312348008155823, + "learning_rate": 0.0005183593667064052, + "loss": 3.1842, + "step": 14723 + }, + { + "epoch": 0.72, + "grad_norm": 0.5148173570632935, + "learning_rate": 0.0005183488089289029, + "loss": 3.1817, + "step": 14724 + }, + { + "epoch": 0.72, + "grad_norm": 0.5735609531402588, + "learning_rate": 0.0005183382505763117, + "loss": 3.1671, + "step": 14725 + }, + { + "epoch": 0.72, + "grad_norm": 0.4953717887401581, + "learning_rate": 0.0005183276916486597, + "loss": 3.0436, + "step": 14726 + }, + { + "epoch": 0.72, + "grad_norm": 0.5508190393447876, + "learning_rate": 0.0005183171321459745, + "loss": 3.1293, + "step": 14727 + }, + { + "epoch": 0.72, + "grad_norm": 0.4912906289100647, + "learning_rate": 0.0005183065720682842, + "loss": 3.2873, + "step": 14728 + }, + { + "epoch": 0.72, + "grad_norm": 0.5511575937271118, + "learning_rate": 0.0005182960114156162, + "loss": 3.1012, + "step": 14729 + }, + { + "epoch": 0.72, + "grad_norm": 0.4874035716056824, + "learning_rate": 0.0005182854501879986, + "loss": 3.3522, + "step": 14730 + }, + { + "epoch": 0.72, + "grad_norm": 0.5335097312927246, + "learning_rate": 0.000518274888385459, + "loss": 3.0398, + "step": 14731 + }, + { + "epoch": 0.72, + "grad_norm": 0.5298565626144409, + "learning_rate": 0.0005182643260080254, + "loss": 2.8668, + "step": 14732 + }, + { + "epoch": 0.72, + "grad_norm": 0.5011436939239502, + "learning_rate": 0.0005182537630557257, + "loss": 3.2198, + "step": 14733 + }, + { + "epoch": 0.72, + "grad_norm": 0.5561090111732483, + "learning_rate": 0.0005182431995285875, + "loss": 2.8162, + "step": 14734 + }, + { + "epoch": 0.72, + "grad_norm": 0.5102478265762329, + "learning_rate": 0.0005182326354266387, + "loss": 3.1414, + "step": 14735 + }, + { + "epoch": 0.72, + "grad_norm": 0.472723126411438, + "learning_rate": 0.0005182220707499072, + "loss": 2.9028, + "step": 14736 + }, + { + "epoch": 0.72, + "grad_norm": 0.5202345252037048, + "learning_rate": 0.0005182115054984207, + "loss": 3.2935, + "step": 14737 + }, + { + "epoch": 0.72, + "grad_norm": 0.5131343007087708, + "learning_rate": 0.0005182009396722071, + "loss": 3.2739, + "step": 14738 + }, + { + "epoch": 0.72, + "grad_norm": 0.5101212859153748, + "learning_rate": 0.0005181903732712943, + "loss": 3.2873, + "step": 14739 + }, + { + "epoch": 0.72, + "grad_norm": 0.5080827474594116, + "learning_rate": 0.0005181798062957099, + "loss": 3.0049, + "step": 14740 + }, + { + "epoch": 0.72, + "grad_norm": 0.526172935962677, + "learning_rate": 0.000518169238745482, + "loss": 3.2314, + "step": 14741 + }, + { + "epoch": 0.72, + "grad_norm": 0.5397858619689941, + "learning_rate": 0.0005181586706206382, + "loss": 3.241, + "step": 14742 + }, + { + "epoch": 0.72, + "grad_norm": 0.5016712546348572, + "learning_rate": 0.0005181481019212066, + "loss": 3.3635, + "step": 14743 + }, + { + "epoch": 0.72, + "grad_norm": 0.5157707929611206, + "learning_rate": 0.0005181375326472147, + "loss": 3.2207, + "step": 14744 + }, + { + "epoch": 0.72, + "grad_norm": 0.5005051493644714, + "learning_rate": 0.0005181269627986908, + "loss": 3.1462, + "step": 14745 + }, + { + "epoch": 0.72, + "grad_norm": 0.47533586621284485, + "learning_rate": 0.0005181163923756622, + "loss": 3.1661, + "step": 14746 + }, + { + "epoch": 0.72, + "grad_norm": 0.537551760673523, + "learning_rate": 0.000518105821378157, + "loss": 3.185, + "step": 14747 + }, + { + "epoch": 0.72, + "grad_norm": 0.5036092400550842, + "learning_rate": 0.0005180952498062032, + "loss": 3.1514, + "step": 14748 + }, + { + "epoch": 0.72, + "grad_norm": 0.5005072355270386, + "learning_rate": 0.0005180846776598285, + "loss": 3.1253, + "step": 14749 + }, + { + "epoch": 0.72, + "grad_norm": 0.5564939379692078, + "learning_rate": 0.0005180741049390605, + "loss": 3.2356, + "step": 14750 + }, + { + "epoch": 0.72, + "grad_norm": 0.584366500377655, + "learning_rate": 0.0005180635316439275, + "loss": 3.0674, + "step": 14751 + }, + { + "epoch": 0.72, + "grad_norm": 0.5277496576309204, + "learning_rate": 0.000518052957774457, + "loss": 3.2329, + "step": 14752 + }, + { + "epoch": 0.72, + "grad_norm": 0.5059505105018616, + "learning_rate": 0.000518042383330677, + "loss": 3.2356, + "step": 14753 + }, + { + "epoch": 0.72, + "grad_norm": 0.5543511509895325, + "learning_rate": 0.0005180318083126154, + "loss": 3.39, + "step": 14754 + }, + { + "epoch": 0.72, + "grad_norm": 0.537144124507904, + "learning_rate": 0.0005180212327202999, + "loss": 3.078, + "step": 14755 + }, + { + "epoch": 0.72, + "grad_norm": 0.5184658765792847, + "learning_rate": 0.0005180106565537585, + "loss": 3.2036, + "step": 14756 + }, + { + "epoch": 0.72, + "grad_norm": 0.5322108268737793, + "learning_rate": 0.000518000079813019, + "loss": 3.2073, + "step": 14757 + }, + { + "epoch": 0.72, + "grad_norm": 0.511556088924408, + "learning_rate": 0.0005179895024981092, + "loss": 3.1481, + "step": 14758 + }, + { + "epoch": 0.72, + "grad_norm": 0.5586161017417908, + "learning_rate": 0.0005179789246090568, + "loss": 3.3132, + "step": 14759 + }, + { + "epoch": 0.72, + "grad_norm": 0.5228317379951477, + "learning_rate": 0.0005179683461458901, + "loss": 3.1418, + "step": 14760 + }, + { + "epoch": 0.72, + "grad_norm": 0.4944567084312439, + "learning_rate": 0.0005179577671086367, + "loss": 3.1762, + "step": 14761 + }, + { + "epoch": 0.72, + "grad_norm": 0.6001697182655334, + "learning_rate": 0.0005179471874973245, + "loss": 3.1919, + "step": 14762 + }, + { + "epoch": 0.72, + "grad_norm": 0.5296616554260254, + "learning_rate": 0.0005179366073119813, + "loss": 3.1888, + "step": 14763 + }, + { + "epoch": 0.72, + "grad_norm": 0.5390952825546265, + "learning_rate": 0.000517926026552635, + "loss": 3.0015, + "step": 14764 + }, + { + "epoch": 0.72, + "grad_norm": 0.5541010499000549, + "learning_rate": 0.0005179154452193135, + "loss": 3.1304, + "step": 14765 + }, + { + "epoch": 0.72, + "grad_norm": 0.49909520149230957, + "learning_rate": 0.0005179048633120447, + "loss": 3.1821, + "step": 14766 + }, + { + "epoch": 0.72, + "grad_norm": 0.5527284145355225, + "learning_rate": 0.0005178942808308564, + "loss": 3.2751, + "step": 14767 + }, + { + "epoch": 0.72, + "grad_norm": 0.5055661201477051, + "learning_rate": 0.0005178836977757765, + "loss": 3.2823, + "step": 14768 + }, + { + "epoch": 0.72, + "grad_norm": 0.5182890892028809, + "learning_rate": 0.0005178731141468329, + "loss": 3.1544, + "step": 14769 + }, + { + "epoch": 0.72, + "grad_norm": 0.4902885854244232, + "learning_rate": 0.0005178625299440534, + "loss": 3.0724, + "step": 14770 + }, + { + "epoch": 0.72, + "grad_norm": 0.5040931701660156, + "learning_rate": 0.0005178519451674659, + "loss": 3.378, + "step": 14771 + }, + { + "epoch": 0.72, + "grad_norm": 0.5123295187950134, + "learning_rate": 0.0005178413598170984, + "loss": 3.3552, + "step": 14772 + }, + { + "epoch": 0.72, + "grad_norm": 0.533523440361023, + "learning_rate": 0.0005178307738929786, + "loss": 3.1523, + "step": 14773 + }, + { + "epoch": 0.72, + "grad_norm": 0.5177558064460754, + "learning_rate": 0.0005178201873951346, + "loss": 3.139, + "step": 14774 + }, + { + "epoch": 0.72, + "grad_norm": 0.5482215285301208, + "learning_rate": 0.000517809600323594, + "loss": 2.9881, + "step": 14775 + }, + { + "epoch": 0.72, + "grad_norm": 0.5305909514427185, + "learning_rate": 0.0005177990126783849, + "loss": 3.1347, + "step": 14776 + }, + { + "epoch": 0.72, + "grad_norm": 0.5373917818069458, + "learning_rate": 0.0005177884244595352, + "loss": 3.3311, + "step": 14777 + }, + { + "epoch": 0.72, + "grad_norm": 0.5221949815750122, + "learning_rate": 0.0005177778356670725, + "loss": 3.154, + "step": 14778 + }, + { + "epoch": 0.72, + "grad_norm": 0.5249523520469666, + "learning_rate": 0.000517767246301025, + "loss": 3.026, + "step": 14779 + }, + { + "epoch": 0.72, + "grad_norm": 0.5519317388534546, + "learning_rate": 0.0005177566563614205, + "loss": 3.0845, + "step": 14780 + }, + { + "epoch": 0.72, + "grad_norm": 0.48520752787590027, + "learning_rate": 0.0005177460658482868, + "loss": 3.1916, + "step": 14781 + }, + { + "epoch": 0.72, + "grad_norm": 0.5459604263305664, + "learning_rate": 0.0005177354747616519, + "loss": 3.2222, + "step": 14782 + }, + { + "epoch": 0.72, + "grad_norm": 0.4783353805541992, + "learning_rate": 0.0005177248831015437, + "loss": 3.1407, + "step": 14783 + }, + { + "epoch": 0.72, + "grad_norm": 0.5260900259017944, + "learning_rate": 0.00051771429086799, + "loss": 3.2664, + "step": 14784 + }, + { + "epoch": 0.72, + "grad_norm": 0.5184537768363953, + "learning_rate": 0.0005177036980610188, + "loss": 3.3091, + "step": 14785 + }, + { + "epoch": 0.72, + "grad_norm": 0.523328959941864, + "learning_rate": 0.000517693104680658, + "loss": 3.2127, + "step": 14786 + }, + { + "epoch": 0.72, + "grad_norm": 0.4887447953224182, + "learning_rate": 0.0005176825107269353, + "loss": 3.177, + "step": 14787 + }, + { + "epoch": 0.72, + "grad_norm": 0.5406696200370789, + "learning_rate": 0.000517671916199879, + "loss": 3.3842, + "step": 14788 + }, + { + "epoch": 0.72, + "grad_norm": 0.5218521952629089, + "learning_rate": 0.0005176613210995166, + "loss": 3.0918, + "step": 14789 + }, + { + "epoch": 0.72, + "grad_norm": 0.5580902695655823, + "learning_rate": 0.0005176507254258763, + "loss": 3.2155, + "step": 14790 + }, + { + "epoch": 0.72, + "grad_norm": 0.5313786268234253, + "learning_rate": 0.0005176401291789857, + "loss": 3.1422, + "step": 14791 + }, + { + "epoch": 0.72, + "grad_norm": 0.5215513110160828, + "learning_rate": 0.000517629532358873, + "loss": 3.1058, + "step": 14792 + }, + { + "epoch": 0.72, + "grad_norm": 0.490644633769989, + "learning_rate": 0.000517618934965566, + "loss": 3.084, + "step": 14793 + }, + { + "epoch": 0.73, + "grad_norm": 0.5126657485961914, + "learning_rate": 0.0005176083369990925, + "loss": 2.951, + "step": 14794 + }, + { + "epoch": 0.73, + "grad_norm": 0.5562359690666199, + "learning_rate": 0.0005175977384594807, + "loss": 3.1351, + "step": 14795 + }, + { + "epoch": 0.73, + "grad_norm": 0.5439273715019226, + "learning_rate": 0.0005175871393467581, + "loss": 3.0672, + "step": 14796 + }, + { + "epoch": 0.73, + "grad_norm": 0.5346477031707764, + "learning_rate": 0.000517576539660953, + "loss": 3.1833, + "step": 14797 + }, + { + "epoch": 0.73, + "grad_norm": 0.5201503038406372, + "learning_rate": 0.0005175659394020932, + "loss": 3.2397, + "step": 14798 + }, + { + "epoch": 0.73, + "grad_norm": 0.5136426091194153, + "learning_rate": 0.0005175553385702065, + "loss": 3.2395, + "step": 14799 + }, + { + "epoch": 0.73, + "grad_norm": 0.5076440572738647, + "learning_rate": 0.000517544737165321, + "loss": 3.1516, + "step": 14800 + }, + { + "epoch": 0.73, + "grad_norm": 0.531037449836731, + "learning_rate": 0.0005175341351874645, + "loss": 3.0899, + "step": 14801 + }, + { + "epoch": 0.73, + "grad_norm": 0.5360001921653748, + "learning_rate": 0.0005175235326366649, + "loss": 3.3677, + "step": 14802 + }, + { + "epoch": 0.73, + "grad_norm": 0.5268040299415588, + "learning_rate": 0.0005175129295129503, + "loss": 3.1516, + "step": 14803 + }, + { + "epoch": 0.73, + "grad_norm": 0.5175145864486694, + "learning_rate": 0.0005175023258163483, + "loss": 3.1313, + "step": 14804 + }, + { + "epoch": 0.73, + "grad_norm": 0.5107941627502441, + "learning_rate": 0.000517491721546887, + "loss": 3.1514, + "step": 14805 + }, + { + "epoch": 0.73, + "grad_norm": 0.525874137878418, + "learning_rate": 0.0005174811167045946, + "loss": 3.1506, + "step": 14806 + }, + { + "epoch": 0.73, + "grad_norm": 0.49850302934646606, + "learning_rate": 0.0005174705112894987, + "loss": 3.4543, + "step": 14807 + }, + { + "epoch": 0.73, + "grad_norm": 0.5240212678909302, + "learning_rate": 0.0005174599053016273, + "loss": 3.2027, + "step": 14808 + }, + { + "epoch": 0.73, + "grad_norm": 0.5271751880645752, + "learning_rate": 0.0005174492987410084, + "loss": 3.2392, + "step": 14809 + }, + { + "epoch": 0.73, + "grad_norm": 0.4971039295196533, + "learning_rate": 0.00051743869160767, + "loss": 3.1203, + "step": 14810 + }, + { + "epoch": 0.73, + "grad_norm": 0.5178433060646057, + "learning_rate": 0.0005174280839016398, + "loss": 3.1699, + "step": 14811 + }, + { + "epoch": 0.73, + "grad_norm": 0.5342043042182922, + "learning_rate": 0.0005174174756229458, + "loss": 3.0835, + "step": 14812 + }, + { + "epoch": 0.73, + "grad_norm": 0.5262744426727295, + "learning_rate": 0.0005174068667716162, + "loss": 3.2602, + "step": 14813 + }, + { + "epoch": 0.73, + "grad_norm": 0.5131629109382629, + "learning_rate": 0.0005173962573476786, + "loss": 3.2553, + "step": 14814 + }, + { + "epoch": 0.73, + "grad_norm": 0.5120997428894043, + "learning_rate": 0.0005173856473511612, + "loss": 3.2564, + "step": 14815 + }, + { + "epoch": 0.73, + "grad_norm": 0.5193552374839783, + "learning_rate": 0.0005173750367820917, + "loss": 3.2926, + "step": 14816 + }, + { + "epoch": 0.73, + "grad_norm": 0.4874288737773895, + "learning_rate": 0.0005173644256404983, + "loss": 3.1219, + "step": 14817 + }, + { + "epoch": 0.73, + "grad_norm": 0.5450944304466248, + "learning_rate": 0.0005173538139264087, + "loss": 3.2068, + "step": 14818 + }, + { + "epoch": 0.73, + "grad_norm": 0.5270337462425232, + "learning_rate": 0.0005173432016398513, + "loss": 3.3498, + "step": 14819 + }, + { + "epoch": 0.73, + "grad_norm": 0.5095805525779724, + "learning_rate": 0.0005173325887808535, + "loss": 3.1008, + "step": 14820 + }, + { + "epoch": 0.73, + "grad_norm": 0.5138509273529053, + "learning_rate": 0.0005173219753494435, + "loss": 3.0766, + "step": 14821 + }, + { + "epoch": 0.73, + "grad_norm": 0.5434461236000061, + "learning_rate": 0.0005173113613456493, + "loss": 3.1396, + "step": 14822 + }, + { + "epoch": 0.73, + "grad_norm": 0.5392305850982666, + "learning_rate": 0.0005173007467694986, + "loss": 3.1582, + "step": 14823 + }, + { + "epoch": 0.73, + "grad_norm": 0.5202492475509644, + "learning_rate": 0.0005172901316210197, + "loss": 3.1764, + "step": 14824 + }, + { + "epoch": 0.73, + "grad_norm": 0.5307274460792542, + "learning_rate": 0.0005172795159002405, + "loss": 3.3271, + "step": 14825 + }, + { + "epoch": 0.73, + "grad_norm": 0.5426248908042908, + "learning_rate": 0.0005172688996071889, + "loss": 3.0903, + "step": 14826 + }, + { + "epoch": 0.73, + "grad_norm": 0.5029236078262329, + "learning_rate": 0.0005172582827418927, + "loss": 3.386, + "step": 14827 + }, + { + "epoch": 0.73, + "grad_norm": 0.522150993347168, + "learning_rate": 0.0005172476653043799, + "loss": 3.2013, + "step": 14828 + }, + { + "epoch": 0.73, + "grad_norm": 0.5246309638023376, + "learning_rate": 0.0005172370472946787, + "loss": 3.2427, + "step": 14829 + }, + { + "epoch": 0.73, + "grad_norm": 0.5152744650840759, + "learning_rate": 0.000517226428712817, + "loss": 3.1461, + "step": 14830 + }, + { + "epoch": 0.73, + "grad_norm": 0.5182197690010071, + "learning_rate": 0.0005172158095588227, + "loss": 3.0218, + "step": 14831 + }, + { + "epoch": 0.73, + "grad_norm": 0.5244132280349731, + "learning_rate": 0.0005172051898327235, + "loss": 3.2129, + "step": 14832 + }, + { + "epoch": 0.73, + "grad_norm": 0.5407620072364807, + "learning_rate": 0.0005171945695345478, + "loss": 3.0372, + "step": 14833 + }, + { + "epoch": 0.73, + "grad_norm": 0.5058204531669617, + "learning_rate": 0.0005171839486643234, + "loss": 2.9253, + "step": 14834 + }, + { + "epoch": 0.73, + "grad_norm": 0.5572190284729004, + "learning_rate": 0.0005171733272220783, + "loss": 3.2299, + "step": 14835 + }, + { + "epoch": 0.73, + "grad_norm": 0.5125101208686829, + "learning_rate": 0.0005171627052078404, + "loss": 3.3526, + "step": 14836 + }, + { + "epoch": 0.73, + "grad_norm": 0.5376558899879456, + "learning_rate": 0.0005171520826216377, + "loss": 3.2156, + "step": 14837 + }, + { + "epoch": 0.73, + "grad_norm": 0.5481821298599243, + "learning_rate": 0.0005171414594634983, + "loss": 3.2858, + "step": 14838 + }, + { + "epoch": 0.73, + "grad_norm": 0.4958679974079132, + "learning_rate": 0.0005171308357334501, + "loss": 3.258, + "step": 14839 + }, + { + "epoch": 0.73, + "grad_norm": 0.5072594285011292, + "learning_rate": 0.000517120211431521, + "loss": 3.2469, + "step": 14840 + }, + { + "epoch": 0.73, + "grad_norm": 0.5196676850318909, + "learning_rate": 0.0005171095865577391, + "loss": 3.1494, + "step": 14841 + }, + { + "epoch": 0.73, + "grad_norm": 0.5403569936752319, + "learning_rate": 0.0005170989611121323, + "loss": 3.1397, + "step": 14842 + }, + { + "epoch": 0.73, + "grad_norm": 0.4967591464519501, + "learning_rate": 0.0005170883350947286, + "loss": 3.2614, + "step": 14843 + }, + { + "epoch": 0.73, + "grad_norm": 0.5076732635498047, + "learning_rate": 0.000517077708505556, + "loss": 2.9743, + "step": 14844 + }, + { + "epoch": 0.73, + "grad_norm": 0.5110801458358765, + "learning_rate": 0.0005170670813446425, + "loss": 3.1071, + "step": 14845 + }, + { + "epoch": 0.73, + "grad_norm": 0.48647382855415344, + "learning_rate": 0.0005170564536120161, + "loss": 3.2916, + "step": 14846 + }, + { + "epoch": 0.73, + "grad_norm": 0.5668714046478271, + "learning_rate": 0.0005170458253077048, + "loss": 3.0199, + "step": 14847 + }, + { + "epoch": 0.73, + "grad_norm": 0.48818862438201904, + "learning_rate": 0.0005170351964317364, + "loss": 3.1778, + "step": 14848 + }, + { + "epoch": 0.73, + "grad_norm": 0.5142984390258789, + "learning_rate": 0.0005170245669841393, + "loss": 3.3728, + "step": 14849 + }, + { + "epoch": 0.73, + "grad_norm": 0.5312206149101257, + "learning_rate": 0.0005170139369649412, + "loss": 3.0934, + "step": 14850 + }, + { + "epoch": 0.73, + "grad_norm": 0.5142450928688049, + "learning_rate": 0.0005170033063741701, + "loss": 3.1013, + "step": 14851 + }, + { + "epoch": 0.73, + "grad_norm": 0.5436277985572815, + "learning_rate": 0.0005169926752118541, + "loss": 3.2664, + "step": 14852 + }, + { + "epoch": 0.73, + "grad_norm": 0.545096218585968, + "learning_rate": 0.0005169820434780211, + "loss": 3.1664, + "step": 14853 + }, + { + "epoch": 0.73, + "grad_norm": 0.5273535847663879, + "learning_rate": 0.0005169714111726992, + "loss": 2.9818, + "step": 14854 + }, + { + "epoch": 0.73, + "grad_norm": 0.5400977730751038, + "learning_rate": 0.0005169607782959163, + "loss": 3.1989, + "step": 14855 + }, + { + "epoch": 0.73, + "grad_norm": 0.5369027853012085, + "learning_rate": 0.0005169501448477005, + "loss": 3.1274, + "step": 14856 + }, + { + "epoch": 0.73, + "grad_norm": 0.48228007555007935, + "learning_rate": 0.0005169395108280797, + "loss": 3.3068, + "step": 14857 + }, + { + "epoch": 0.73, + "grad_norm": 0.562002420425415, + "learning_rate": 0.0005169288762370821, + "loss": 3.0936, + "step": 14858 + }, + { + "epoch": 0.73, + "grad_norm": 0.5516139268875122, + "learning_rate": 0.0005169182410747356, + "loss": 3.0671, + "step": 14859 + }, + { + "epoch": 0.73, + "grad_norm": 0.5317583680152893, + "learning_rate": 0.0005169076053410681, + "loss": 3.0629, + "step": 14860 + }, + { + "epoch": 0.73, + "grad_norm": 0.5046291947364807, + "learning_rate": 0.0005168969690361077, + "loss": 3.1502, + "step": 14861 + }, + { + "epoch": 0.73, + "grad_norm": 0.5158634781837463, + "learning_rate": 0.0005168863321598825, + "loss": 3.157, + "step": 14862 + }, + { + "epoch": 0.73, + "grad_norm": 0.5672610402107239, + "learning_rate": 0.0005168756947124204, + "loss": 3.0771, + "step": 14863 + }, + { + "epoch": 0.73, + "grad_norm": 0.5045456290245056, + "learning_rate": 0.0005168650566937496, + "loss": 3.2721, + "step": 14864 + }, + { + "epoch": 0.73, + "grad_norm": 0.5558025240898132, + "learning_rate": 0.0005168544181038978, + "loss": 3.1883, + "step": 14865 + }, + { + "epoch": 0.73, + "grad_norm": 0.5100647807121277, + "learning_rate": 0.0005168437789428932, + "loss": 3.1412, + "step": 14866 + }, + { + "epoch": 0.73, + "grad_norm": 0.4676547050476074, + "learning_rate": 0.0005168331392107639, + "loss": 3.1914, + "step": 14867 + }, + { + "epoch": 0.73, + "grad_norm": 0.5415322780609131, + "learning_rate": 0.0005168224989075378, + "loss": 3.0172, + "step": 14868 + }, + { + "epoch": 0.73, + "grad_norm": 0.5330214500427246, + "learning_rate": 0.000516811858033243, + "loss": 3.2398, + "step": 14869 + }, + { + "epoch": 0.73, + "grad_norm": 0.6251876354217529, + "learning_rate": 0.0005168012165879074, + "loss": 3.0238, + "step": 14870 + }, + { + "epoch": 0.73, + "grad_norm": 0.5353378653526306, + "learning_rate": 0.0005167905745715592, + "loss": 3.2086, + "step": 14871 + }, + { + "epoch": 0.73, + "grad_norm": 0.537301242351532, + "learning_rate": 0.0005167799319842264, + "loss": 2.9889, + "step": 14872 + }, + { + "epoch": 0.73, + "grad_norm": 0.5349966883659363, + "learning_rate": 0.0005167692888259368, + "loss": 3.3271, + "step": 14873 + }, + { + "epoch": 0.73, + "grad_norm": 0.5379777550697327, + "learning_rate": 0.0005167586450967189, + "loss": 2.9083, + "step": 14874 + }, + { + "epoch": 0.73, + "grad_norm": 0.5365452766418457, + "learning_rate": 0.0005167480007966002, + "loss": 3.2886, + "step": 14875 + }, + { + "epoch": 0.73, + "grad_norm": 0.5008100867271423, + "learning_rate": 0.000516737355925609, + "loss": 3.1322, + "step": 14876 + }, + { + "epoch": 0.73, + "grad_norm": 0.5189768671989441, + "learning_rate": 0.0005167267104837735, + "loss": 3.1345, + "step": 14877 + }, + { + "epoch": 0.73, + "grad_norm": 0.5536531209945679, + "learning_rate": 0.0005167160644711214, + "loss": 3.0225, + "step": 14878 + }, + { + "epoch": 0.73, + "grad_norm": 0.4946756660938263, + "learning_rate": 0.000516705417887681, + "loss": 3.4478, + "step": 14879 + }, + { + "epoch": 0.73, + "grad_norm": 0.5488638877868652, + "learning_rate": 0.00051669477073348, + "loss": 2.9697, + "step": 14880 + }, + { + "epoch": 0.73, + "grad_norm": 0.520703911781311, + "learning_rate": 0.0005166841230085469, + "loss": 3.1385, + "step": 14881 + }, + { + "epoch": 0.73, + "grad_norm": 0.5039485096931458, + "learning_rate": 0.0005166734747129094, + "loss": 3.1356, + "step": 14882 + }, + { + "epoch": 0.73, + "grad_norm": 0.4801254868507385, + "learning_rate": 0.0005166628258465958, + "loss": 3.2317, + "step": 14883 + }, + { + "epoch": 0.73, + "grad_norm": 0.5044119358062744, + "learning_rate": 0.0005166521764096339, + "loss": 2.9742, + "step": 14884 + }, + { + "epoch": 0.73, + "grad_norm": 0.5282803773880005, + "learning_rate": 0.0005166415264020519, + "loss": 3.0444, + "step": 14885 + }, + { + "epoch": 0.73, + "grad_norm": 0.5237149596214294, + "learning_rate": 0.0005166308758238779, + "loss": 3.3579, + "step": 14886 + }, + { + "epoch": 0.73, + "grad_norm": 0.5436862111091614, + "learning_rate": 0.0005166202246751397, + "loss": 3.1705, + "step": 14887 + }, + { + "epoch": 0.73, + "grad_norm": 0.5225011110305786, + "learning_rate": 0.0005166095729558656, + "loss": 3.145, + "step": 14888 + }, + { + "epoch": 0.73, + "grad_norm": 0.5690309405326843, + "learning_rate": 0.0005165989206660836, + "loss": 2.8944, + "step": 14889 + }, + { + "epoch": 0.73, + "grad_norm": 0.5105469226837158, + "learning_rate": 0.0005165882678058217, + "loss": 3.1488, + "step": 14890 + }, + { + "epoch": 0.73, + "grad_norm": 0.5054097175598145, + "learning_rate": 0.000516577614375108, + "loss": 3.4983, + "step": 14891 + }, + { + "epoch": 0.73, + "grad_norm": 0.570755660533905, + "learning_rate": 0.0005165669603739704, + "loss": 3.2432, + "step": 14892 + }, + { + "epoch": 0.73, + "grad_norm": 0.5138176679611206, + "learning_rate": 0.0005165563058024373, + "loss": 3.1639, + "step": 14893 + }, + { + "epoch": 0.73, + "grad_norm": 0.4894483685493469, + "learning_rate": 0.0005165456506605365, + "loss": 2.9985, + "step": 14894 + }, + { + "epoch": 0.73, + "grad_norm": 0.4852317273616791, + "learning_rate": 0.0005165349949482962, + "loss": 3.2036, + "step": 14895 + }, + { + "epoch": 0.73, + "grad_norm": 0.5104283094406128, + "learning_rate": 0.0005165243386657442, + "loss": 3.2489, + "step": 14896 + }, + { + "epoch": 0.73, + "grad_norm": 0.5486336350440979, + "learning_rate": 0.000516513681812909, + "loss": 3.2546, + "step": 14897 + }, + { + "epoch": 0.73, + "grad_norm": 0.5464646816253662, + "learning_rate": 0.0005165030243898183, + "loss": 3.1661, + "step": 14898 + }, + { + "epoch": 0.73, + "grad_norm": 0.494334876537323, + "learning_rate": 0.0005164923663965003, + "loss": 3.3491, + "step": 14899 + }, + { + "epoch": 0.73, + "grad_norm": 0.51130211353302, + "learning_rate": 0.0005164817078329831, + "loss": 3.4166, + "step": 14900 + }, + { + "epoch": 0.73, + "grad_norm": 0.5445241928100586, + "learning_rate": 0.0005164710486992947, + "loss": 3.2104, + "step": 14901 + }, + { + "epoch": 0.73, + "grad_norm": 0.5161874294281006, + "learning_rate": 0.0005164603889954633, + "loss": 3.291, + "step": 14902 + }, + { + "epoch": 0.73, + "grad_norm": 0.5364497303962708, + "learning_rate": 0.0005164497287215169, + "loss": 3.3265, + "step": 14903 + }, + { + "epoch": 0.73, + "grad_norm": 0.4792921841144562, + "learning_rate": 0.0005164390678774834, + "loss": 3.2952, + "step": 14904 + }, + { + "epoch": 0.73, + "grad_norm": 0.5327908396720886, + "learning_rate": 0.0005164284064633912, + "loss": 3.1207, + "step": 14905 + }, + { + "epoch": 0.73, + "grad_norm": 0.5640168190002441, + "learning_rate": 0.0005164177444792683, + "loss": 3.2393, + "step": 14906 + }, + { + "epoch": 0.73, + "grad_norm": 0.5080165863037109, + "learning_rate": 0.0005164070819251425, + "loss": 3.1347, + "step": 14907 + }, + { + "epoch": 0.73, + "grad_norm": 0.4810307025909424, + "learning_rate": 0.0005163964188010424, + "loss": 3.2237, + "step": 14908 + }, + { + "epoch": 0.73, + "grad_norm": 0.5038141012191772, + "learning_rate": 0.0005163857551069954, + "loss": 3.2509, + "step": 14909 + }, + { + "epoch": 0.73, + "grad_norm": 0.5256493091583252, + "learning_rate": 0.0005163750908430303, + "loss": 3.1796, + "step": 14910 + }, + { + "epoch": 0.73, + "grad_norm": 0.5214911699295044, + "learning_rate": 0.0005163644260091746, + "loss": 3.211, + "step": 14911 + }, + { + "epoch": 0.73, + "grad_norm": 0.5438799858093262, + "learning_rate": 0.0005163537606054568, + "loss": 3.1117, + "step": 14912 + }, + { + "epoch": 0.73, + "grad_norm": 0.5240724682807922, + "learning_rate": 0.0005163430946319047, + "loss": 3.3282, + "step": 14913 + }, + { + "epoch": 0.73, + "grad_norm": 0.5152091979980469, + "learning_rate": 0.0005163324280885467, + "loss": 3.1758, + "step": 14914 + }, + { + "epoch": 0.73, + "grad_norm": 0.4732866585254669, + "learning_rate": 0.0005163217609754105, + "loss": 3.0374, + "step": 14915 + }, + { + "epoch": 0.73, + "grad_norm": 0.48373696208000183, + "learning_rate": 0.0005163110932925245, + "loss": 3.2967, + "step": 14916 + }, + { + "epoch": 0.73, + "grad_norm": 0.5080105662345886, + "learning_rate": 0.0005163004250399168, + "loss": 3.1916, + "step": 14917 + }, + { + "epoch": 0.73, + "grad_norm": 0.4790027141571045, + "learning_rate": 0.0005162897562176154, + "loss": 3.2397, + "step": 14918 + }, + { + "epoch": 0.73, + "grad_norm": 0.491432249546051, + "learning_rate": 0.0005162790868256484, + "loss": 3.2743, + "step": 14919 + }, + { + "epoch": 0.73, + "grad_norm": 0.5296748876571655, + "learning_rate": 0.0005162684168640439, + "loss": 2.8877, + "step": 14920 + }, + { + "epoch": 0.73, + "grad_norm": 0.5081189274787903, + "learning_rate": 0.00051625774633283, + "loss": 3.2914, + "step": 14921 + }, + { + "epoch": 0.73, + "grad_norm": 0.5037716031074524, + "learning_rate": 0.0005162470752320347, + "loss": 3.2631, + "step": 14922 + }, + { + "epoch": 0.73, + "grad_norm": 0.5493853688240051, + "learning_rate": 0.0005162364035616863, + "loss": 3.2527, + "step": 14923 + }, + { + "epoch": 0.73, + "grad_norm": 0.48433980345726013, + "learning_rate": 0.0005162257313218128, + "loss": 3.1472, + "step": 14924 + }, + { + "epoch": 0.73, + "grad_norm": 0.5239858627319336, + "learning_rate": 0.0005162150585124425, + "loss": 3.2365, + "step": 14925 + }, + { + "epoch": 0.73, + "grad_norm": 0.5162671208381653, + "learning_rate": 0.0005162043851336032, + "loss": 3.1116, + "step": 14926 + }, + { + "epoch": 0.73, + "grad_norm": 0.532788097858429, + "learning_rate": 0.0005161937111853232, + "loss": 3.185, + "step": 14927 + }, + { + "epoch": 0.73, + "grad_norm": 0.5161779522895813, + "learning_rate": 0.0005161830366676305, + "loss": 3.2383, + "step": 14928 + }, + { + "epoch": 0.73, + "grad_norm": 0.5137925744056702, + "learning_rate": 0.0005161723615805534, + "loss": 3.2344, + "step": 14929 + }, + { + "epoch": 0.73, + "grad_norm": 0.5238530039787292, + "learning_rate": 0.0005161616859241199, + "loss": 3.2037, + "step": 14930 + }, + { + "epoch": 0.73, + "grad_norm": 0.5177023410797119, + "learning_rate": 0.0005161510096983581, + "loss": 3.035, + "step": 14931 + }, + { + "epoch": 0.73, + "grad_norm": 0.49137774109840393, + "learning_rate": 0.0005161403329032961, + "loss": 3.2177, + "step": 14932 + }, + { + "epoch": 0.73, + "grad_norm": 0.5207857489585876, + "learning_rate": 0.000516129655538962, + "loss": 3.2774, + "step": 14933 + }, + { + "epoch": 0.73, + "grad_norm": 0.5206941962242126, + "learning_rate": 0.0005161189776053841, + "loss": 3.054, + "step": 14934 + }, + { + "epoch": 0.73, + "grad_norm": 0.5176581144332886, + "learning_rate": 0.0005161082991025904, + "loss": 3.2821, + "step": 14935 + }, + { + "epoch": 0.73, + "grad_norm": 0.5066308975219727, + "learning_rate": 0.000516097620030609, + "loss": 3.3941, + "step": 14936 + }, + { + "epoch": 0.73, + "grad_norm": 0.5001108646392822, + "learning_rate": 0.000516086940389468, + "loss": 3.1497, + "step": 14937 + }, + { + "epoch": 0.73, + "grad_norm": 0.5629680156707764, + "learning_rate": 0.0005160762601791956, + "loss": 3.4361, + "step": 14938 + }, + { + "epoch": 0.73, + "grad_norm": 0.5518137216567993, + "learning_rate": 0.00051606557939982, + "loss": 3.2953, + "step": 14939 + }, + { + "epoch": 0.73, + "grad_norm": 0.5315781831741333, + "learning_rate": 0.0005160548980513692, + "loss": 3.1599, + "step": 14940 + }, + { + "epoch": 0.73, + "grad_norm": 0.4899667501449585, + "learning_rate": 0.0005160442161338713, + "loss": 3.2092, + "step": 14941 + }, + { + "epoch": 0.73, + "grad_norm": 0.5403282046318054, + "learning_rate": 0.0005160335336473547, + "loss": 3.0352, + "step": 14942 + }, + { + "epoch": 0.73, + "grad_norm": 0.5120472311973572, + "learning_rate": 0.0005160228505918472, + "loss": 3.3255, + "step": 14943 + }, + { + "epoch": 0.73, + "grad_norm": 0.5195842385292053, + "learning_rate": 0.0005160121669673771, + "loss": 3.2938, + "step": 14944 + }, + { + "epoch": 0.73, + "grad_norm": 0.5495728254318237, + "learning_rate": 0.0005160014827739726, + "loss": 3.2412, + "step": 14945 + }, + { + "epoch": 0.73, + "grad_norm": 0.5019406080245972, + "learning_rate": 0.0005159907980116617, + "loss": 3.3542, + "step": 14946 + }, + { + "epoch": 0.73, + "grad_norm": 0.5161232352256775, + "learning_rate": 0.0005159801126804726, + "loss": 3.064, + "step": 14947 + }, + { + "epoch": 0.73, + "grad_norm": 0.4725062847137451, + "learning_rate": 0.0005159694267804335, + "loss": 2.9515, + "step": 14948 + }, + { + "epoch": 0.73, + "grad_norm": 0.5322039127349854, + "learning_rate": 0.0005159587403115724, + "loss": 3.1031, + "step": 14949 + }, + { + "epoch": 0.73, + "grad_norm": 0.48917528986930847, + "learning_rate": 0.0005159480532739176, + "loss": 3.3152, + "step": 14950 + }, + { + "epoch": 0.73, + "grad_norm": 0.5044992566108704, + "learning_rate": 0.0005159373656674972, + "loss": 3.304, + "step": 14951 + }, + { + "epoch": 0.73, + "grad_norm": 0.5023238658905029, + "learning_rate": 0.0005159266774923393, + "loss": 2.9814, + "step": 14952 + }, + { + "epoch": 0.73, + "grad_norm": 0.49678921699523926, + "learning_rate": 0.0005159159887484721, + "loss": 3.2713, + "step": 14953 + }, + { + "epoch": 0.73, + "grad_norm": 0.5398493409156799, + "learning_rate": 0.0005159052994359239, + "loss": 2.8908, + "step": 14954 + }, + { + "epoch": 0.73, + "grad_norm": 0.5190320014953613, + "learning_rate": 0.0005158946095547225, + "loss": 3.1101, + "step": 14955 + }, + { + "epoch": 0.73, + "grad_norm": 0.4837747812271118, + "learning_rate": 0.0005158839191048963, + "loss": 3.3732, + "step": 14956 + }, + { + "epoch": 0.73, + "grad_norm": 0.5262941718101501, + "learning_rate": 0.0005158732280864735, + "loss": 3.095, + "step": 14957 + }, + { + "epoch": 0.73, + "grad_norm": 0.5336409211158752, + "learning_rate": 0.000515862536499482, + "loss": 3.3913, + "step": 14958 + }, + { + "epoch": 0.73, + "grad_norm": 0.5454764366149902, + "learning_rate": 0.0005158518443439502, + "loss": 3.3525, + "step": 14959 + }, + { + "epoch": 0.73, + "grad_norm": 0.536488950252533, + "learning_rate": 0.0005158411516199061, + "loss": 3.2649, + "step": 14960 + }, + { + "epoch": 0.73, + "grad_norm": 0.5544682741165161, + "learning_rate": 0.000515830458327378, + "loss": 3.0752, + "step": 14961 + }, + { + "epoch": 0.73, + "grad_norm": 0.5361686944961548, + "learning_rate": 0.0005158197644663941, + "loss": 3.163, + "step": 14962 + }, + { + "epoch": 0.73, + "grad_norm": 0.5054564476013184, + "learning_rate": 0.0005158090700369824, + "loss": 3.0706, + "step": 14963 + }, + { + "epoch": 0.73, + "grad_norm": 0.5609951019287109, + "learning_rate": 0.0005157983750391711, + "loss": 3.2753, + "step": 14964 + }, + { + "epoch": 0.73, + "grad_norm": 0.5623591542243958, + "learning_rate": 0.0005157876794729885, + "loss": 3.1528, + "step": 14965 + }, + { + "epoch": 0.73, + "grad_norm": 0.5499200224876404, + "learning_rate": 0.0005157769833384626, + "loss": 3.0385, + "step": 14966 + }, + { + "epoch": 0.73, + "grad_norm": 0.47295448184013367, + "learning_rate": 0.0005157662866356217, + "loss": 3.0952, + "step": 14967 + }, + { + "epoch": 0.73, + "grad_norm": 0.5192577838897705, + "learning_rate": 0.0005157555893644939, + "loss": 3.3865, + "step": 14968 + }, + { + "epoch": 0.73, + "grad_norm": 0.51607346534729, + "learning_rate": 0.0005157448915251074, + "loss": 3.0292, + "step": 14969 + }, + { + "epoch": 0.73, + "grad_norm": 0.5210020542144775, + "learning_rate": 0.0005157341931174904, + "loss": 3.3449, + "step": 14970 + }, + { + "epoch": 0.73, + "grad_norm": 0.5266844034194946, + "learning_rate": 0.0005157234941416711, + "loss": 3.2106, + "step": 14971 + }, + { + "epoch": 0.73, + "grad_norm": 0.6120948791503906, + "learning_rate": 0.0005157127945976776, + "loss": 2.9999, + "step": 14972 + }, + { + "epoch": 0.73, + "grad_norm": 0.5157740116119385, + "learning_rate": 0.000515702094485538, + "loss": 3.1832, + "step": 14973 + }, + { + "epoch": 0.73, + "grad_norm": 0.49761828780174255, + "learning_rate": 0.0005156913938052808, + "loss": 3.1624, + "step": 14974 + }, + { + "epoch": 0.73, + "grad_norm": 0.5079160928726196, + "learning_rate": 0.0005156806925569338, + "loss": 3.2627, + "step": 14975 + }, + { + "epoch": 0.73, + "grad_norm": 0.5453952550888062, + "learning_rate": 0.0005156699907405255, + "loss": 2.9036, + "step": 14976 + }, + { + "epoch": 0.73, + "grad_norm": 0.5448424220085144, + "learning_rate": 0.0005156592883560838, + "loss": 3.1669, + "step": 14977 + }, + { + "epoch": 0.73, + "grad_norm": 0.49395671486854553, + "learning_rate": 0.0005156485854036371, + "loss": 3.2462, + "step": 14978 + }, + { + "epoch": 0.73, + "grad_norm": 0.5351203680038452, + "learning_rate": 0.0005156378818832136, + "loss": 3.3148, + "step": 14979 + }, + { + "epoch": 0.73, + "grad_norm": 0.5157082080841064, + "learning_rate": 0.0005156271777948414, + "loss": 3.3561, + "step": 14980 + }, + { + "epoch": 0.73, + "grad_norm": 0.5183131098747253, + "learning_rate": 0.0005156164731385487, + "loss": 3.1139, + "step": 14981 + }, + { + "epoch": 0.73, + "grad_norm": 0.5393839478492737, + "learning_rate": 0.0005156057679143636, + "loss": 3.2187, + "step": 14982 + }, + { + "epoch": 0.73, + "grad_norm": 0.49937084317207336, + "learning_rate": 0.0005155950621223145, + "loss": 3.1017, + "step": 14983 + }, + { + "epoch": 0.73, + "grad_norm": 0.5061917901039124, + "learning_rate": 0.0005155843557624296, + "loss": 3.0941, + "step": 14984 + }, + { + "epoch": 0.73, + "grad_norm": 0.5656406283378601, + "learning_rate": 0.0005155736488347369, + "loss": 3.0482, + "step": 14985 + }, + { + "epoch": 0.73, + "grad_norm": 0.5063902735710144, + "learning_rate": 0.0005155629413392646, + "loss": 3.0954, + "step": 14986 + }, + { + "epoch": 0.73, + "grad_norm": 0.5024520754814148, + "learning_rate": 0.0005155522332760411, + "loss": 3.0767, + "step": 14987 + }, + { + "epoch": 0.73, + "grad_norm": 0.83547043800354, + "learning_rate": 0.0005155415246450945, + "loss": 3.0165, + "step": 14988 + }, + { + "epoch": 0.73, + "grad_norm": 0.5294227600097656, + "learning_rate": 0.000515530815446453, + "loss": 3.2607, + "step": 14989 + }, + { + "epoch": 0.73, + "grad_norm": 0.5271424651145935, + "learning_rate": 0.0005155201056801449, + "loss": 3.1893, + "step": 14990 + }, + { + "epoch": 0.73, + "grad_norm": 0.5043739080429077, + "learning_rate": 0.0005155093953461981, + "loss": 3.1544, + "step": 14991 + }, + { + "epoch": 0.73, + "grad_norm": 0.5016458034515381, + "learning_rate": 0.0005154986844446411, + "loss": 3.099, + "step": 14992 + }, + { + "epoch": 0.73, + "grad_norm": 0.5234816670417786, + "learning_rate": 0.0005154879729755022, + "loss": 3.2483, + "step": 14993 + }, + { + "epoch": 0.73, + "grad_norm": 0.5132349133491516, + "learning_rate": 0.0005154772609388093, + "loss": 3.215, + "step": 14994 + }, + { + "epoch": 0.73, + "grad_norm": 0.5096055865287781, + "learning_rate": 0.0005154665483345909, + "loss": 3.289, + "step": 14995 + }, + { + "epoch": 0.73, + "grad_norm": 0.5225401520729065, + "learning_rate": 0.0005154558351628749, + "loss": 3.285, + "step": 14996 + }, + { + "epoch": 0.73, + "grad_norm": 0.5399203300476074, + "learning_rate": 0.0005154451214236898, + "loss": 3.0405, + "step": 14997 + }, + { + "epoch": 0.74, + "grad_norm": 0.5019733905792236, + "learning_rate": 0.0005154344071170637, + "loss": 3.2109, + "step": 14998 + }, + { + "epoch": 0.74, + "grad_norm": 0.5068507790565491, + "learning_rate": 0.0005154236922430248, + "loss": 3.0886, + "step": 14999 + }, + { + "epoch": 0.74, + "grad_norm": 0.5227283835411072, + "learning_rate": 0.0005154129768016013, + "loss": 3.0491, + "step": 15000 + }, + { + "epoch": 0.74, + "grad_norm": 0.5454354882240295, + "learning_rate": 0.0005154022607928217, + "loss": 3.2926, + "step": 15001 + }, + { + "epoch": 0.74, + "grad_norm": 0.5321445465087891, + "learning_rate": 0.0005153915442167138, + "loss": 3.1433, + "step": 15002 + }, + { + "epoch": 0.74, + "grad_norm": 0.5160802006721497, + "learning_rate": 0.0005153808270733061, + "loss": 3.0392, + "step": 15003 + }, + { + "epoch": 0.74, + "grad_norm": 0.5624576210975647, + "learning_rate": 0.0005153701093626267, + "loss": 3.2465, + "step": 15004 + }, + { + "epoch": 0.74, + "grad_norm": 0.5116503238677979, + "learning_rate": 0.0005153593910847039, + "loss": 3.4115, + "step": 15005 + }, + { + "epoch": 0.74, + "grad_norm": 0.4815455377101898, + "learning_rate": 0.000515348672239566, + "loss": 3.2247, + "step": 15006 + }, + { + "epoch": 0.74, + "grad_norm": 0.5008863806724548, + "learning_rate": 0.0005153379528272411, + "loss": 2.9155, + "step": 15007 + }, + { + "epoch": 0.74, + "grad_norm": 0.5305734276771545, + "learning_rate": 0.0005153272328477573, + "loss": 3.3725, + "step": 15008 + }, + { + "epoch": 0.74, + "grad_norm": 0.49309682846069336, + "learning_rate": 0.0005153165123011432, + "loss": 3.2633, + "step": 15009 + }, + { + "epoch": 0.74, + "grad_norm": 0.5189629793167114, + "learning_rate": 0.0005153057911874267, + "loss": 3.1755, + "step": 15010 + }, + { + "epoch": 0.74, + "grad_norm": 0.5381687879562378, + "learning_rate": 0.0005152950695066364, + "loss": 3.222, + "step": 15011 + }, + { + "epoch": 0.74, + "grad_norm": 0.4915025234222412, + "learning_rate": 0.0005152843472588001, + "loss": 3.1748, + "step": 15012 + }, + { + "epoch": 0.74, + "grad_norm": 0.5047193169593811, + "learning_rate": 0.0005152736244439464, + "loss": 3.1085, + "step": 15013 + }, + { + "epoch": 0.74, + "grad_norm": 0.5417593717575073, + "learning_rate": 0.0005152629010621033, + "loss": 3.1649, + "step": 15014 + }, + { + "epoch": 0.74, + "grad_norm": 0.5217851400375366, + "learning_rate": 0.0005152521771132993, + "loss": 3.2755, + "step": 15015 + }, + { + "epoch": 0.74, + "grad_norm": 0.517413854598999, + "learning_rate": 0.0005152414525975625, + "loss": 3.0616, + "step": 15016 + }, + { + "epoch": 0.74, + "grad_norm": 0.5072711110115051, + "learning_rate": 0.0005152307275149209, + "loss": 3.0893, + "step": 15017 + }, + { + "epoch": 0.74, + "grad_norm": 0.5205687284469604, + "learning_rate": 0.0005152200018654032, + "loss": 3.1728, + "step": 15018 + }, + { + "epoch": 0.74, + "grad_norm": 0.5183899402618408, + "learning_rate": 0.0005152092756490373, + "loss": 3.3177, + "step": 15019 + }, + { + "epoch": 0.74, + "grad_norm": 0.5392817854881287, + "learning_rate": 0.0005151985488658517, + "loss": 3.2724, + "step": 15020 + }, + { + "epoch": 0.74, + "grad_norm": 0.49955472350120544, + "learning_rate": 0.0005151878215158745, + "loss": 3.1084, + "step": 15021 + }, + { + "epoch": 0.74, + "grad_norm": 0.5460142493247986, + "learning_rate": 0.0005151770935991339, + "loss": 3.2581, + "step": 15022 + }, + { + "epoch": 0.74, + "grad_norm": 0.5164291262626648, + "learning_rate": 0.0005151663651156584, + "loss": 3.2137, + "step": 15023 + }, + { + "epoch": 0.74, + "grad_norm": 0.4972623586654663, + "learning_rate": 0.000515155636065476, + "loss": 3.1696, + "step": 15024 + }, + { + "epoch": 0.74, + "grad_norm": 0.5117320418357849, + "learning_rate": 0.0005151449064486151, + "loss": 2.9165, + "step": 15025 + }, + { + "epoch": 0.74, + "grad_norm": 0.5329243540763855, + "learning_rate": 0.000515134176265104, + "loss": 3.2211, + "step": 15026 + }, + { + "epoch": 0.74, + "grad_norm": 0.548600971698761, + "learning_rate": 0.0005151234455149707, + "loss": 3.4036, + "step": 15027 + }, + { + "epoch": 0.74, + "grad_norm": 0.5310418009757996, + "learning_rate": 0.0005151127141982437, + "loss": 3.1994, + "step": 15028 + }, + { + "epoch": 0.74, + "grad_norm": 0.5383056998252869, + "learning_rate": 0.0005151019823149513, + "loss": 3.2994, + "step": 15029 + }, + { + "epoch": 0.74, + "grad_norm": 0.4978289008140564, + "learning_rate": 0.0005150912498651216, + "loss": 3.2983, + "step": 15030 + }, + { + "epoch": 0.74, + "grad_norm": 0.5403746366500854, + "learning_rate": 0.000515080516848783, + "loss": 3.1833, + "step": 15031 + }, + { + "epoch": 0.74, + "grad_norm": 0.5366250276565552, + "learning_rate": 0.0005150697832659636, + "loss": 3.0881, + "step": 15032 + }, + { + "epoch": 0.74, + "grad_norm": 0.49605393409729004, + "learning_rate": 0.0005150590491166919, + "loss": 3.3317, + "step": 15033 + }, + { + "epoch": 0.74, + "grad_norm": 0.5022369027137756, + "learning_rate": 0.0005150483144009961, + "loss": 3.1123, + "step": 15034 + }, + { + "epoch": 0.74, + "grad_norm": 0.5486195683479309, + "learning_rate": 0.0005150375791189043, + "loss": 3.368, + "step": 15035 + }, + { + "epoch": 0.74, + "grad_norm": 0.511721670627594, + "learning_rate": 0.0005150268432704449, + "loss": 3.1523, + "step": 15036 + }, + { + "epoch": 0.74, + "grad_norm": 0.5076295137405396, + "learning_rate": 0.0005150161068556463, + "loss": 3.0981, + "step": 15037 + }, + { + "epoch": 0.74, + "grad_norm": 0.5491887331008911, + "learning_rate": 0.0005150053698745365, + "loss": 3.1993, + "step": 15038 + }, + { + "epoch": 0.74, + "grad_norm": 0.5024232268333435, + "learning_rate": 0.000514994632327144, + "loss": 3.4245, + "step": 15039 + }, + { + "epoch": 0.74, + "grad_norm": 0.5337858200073242, + "learning_rate": 0.0005149838942134971, + "loss": 3.2517, + "step": 15040 + }, + { + "epoch": 0.74, + "grad_norm": 0.5012475252151489, + "learning_rate": 0.0005149731555336239, + "loss": 3.2314, + "step": 15041 + }, + { + "epoch": 0.74, + "grad_norm": 0.5080512166023254, + "learning_rate": 0.0005149624162875528, + "loss": 3.432, + "step": 15042 + }, + { + "epoch": 0.74, + "grad_norm": 0.5350621938705444, + "learning_rate": 0.0005149516764753121, + "loss": 3.232, + "step": 15043 + }, + { + "epoch": 0.74, + "grad_norm": 0.5167108774185181, + "learning_rate": 0.00051494093609693, + "loss": 3.1149, + "step": 15044 + }, + { + "epoch": 0.74, + "grad_norm": 0.5356556177139282, + "learning_rate": 0.0005149301951524348, + "loss": 3.3853, + "step": 15045 + }, + { + "epoch": 0.74, + "grad_norm": 0.48991358280181885, + "learning_rate": 0.0005149194536418551, + "loss": 3.0879, + "step": 15046 + }, + { + "epoch": 0.74, + "grad_norm": 0.5196684002876282, + "learning_rate": 0.0005149087115652185, + "loss": 2.9087, + "step": 15047 + }, + { + "epoch": 0.74, + "grad_norm": 0.4956427216529846, + "learning_rate": 0.000514897968922554, + "loss": 3.2385, + "step": 15048 + }, + { + "epoch": 0.74, + "grad_norm": 0.48819366097450256, + "learning_rate": 0.0005148872257138895, + "loss": 3.2224, + "step": 15049 + }, + { + "epoch": 0.74, + "grad_norm": 0.5472261905670166, + "learning_rate": 0.0005148764819392535, + "loss": 3.1346, + "step": 15050 + }, + { + "epoch": 0.74, + "grad_norm": 0.5290123820304871, + "learning_rate": 0.0005148657375986741, + "loss": 3.1584, + "step": 15051 + }, + { + "epoch": 0.74, + "grad_norm": 0.5149974226951599, + "learning_rate": 0.0005148549926921798, + "loss": 3.3616, + "step": 15052 + }, + { + "epoch": 0.74, + "grad_norm": 0.5451234579086304, + "learning_rate": 0.0005148442472197986, + "loss": 3.145, + "step": 15053 + }, + { + "epoch": 0.74, + "grad_norm": 0.4977348744869232, + "learning_rate": 0.0005148335011815592, + "loss": 3.1043, + "step": 15054 + }, + { + "epoch": 0.74, + "grad_norm": 0.4974806010723114, + "learning_rate": 0.0005148227545774898, + "loss": 3.2151, + "step": 15055 + }, + { + "epoch": 0.74, + "grad_norm": 0.5258920192718506, + "learning_rate": 0.0005148120074076184, + "loss": 3.2716, + "step": 15056 + }, + { + "epoch": 0.74, + "grad_norm": 0.5211778879165649, + "learning_rate": 0.0005148012596719735, + "loss": 3.1984, + "step": 15057 + }, + { + "epoch": 0.74, + "grad_norm": 0.5030432939529419, + "learning_rate": 0.0005147905113705835, + "loss": 3.1843, + "step": 15058 + }, + { + "epoch": 0.74, + "grad_norm": 0.4971194267272949, + "learning_rate": 0.0005147797625034766, + "loss": 3.1692, + "step": 15059 + }, + { + "epoch": 0.74, + "grad_norm": 0.5721365213394165, + "learning_rate": 0.0005147690130706811, + "loss": 3.0595, + "step": 15060 + }, + { + "epoch": 0.74, + "grad_norm": 0.5956337451934814, + "learning_rate": 0.0005147582630722255, + "loss": 3.2497, + "step": 15061 + }, + { + "epoch": 0.74, + "grad_norm": 0.5455291271209717, + "learning_rate": 0.0005147475125081379, + "loss": 3.286, + "step": 15062 + }, + { + "epoch": 0.74, + "grad_norm": 0.49531859159469604, + "learning_rate": 0.0005147367613784465, + "loss": 3.2845, + "step": 15063 + }, + { + "epoch": 0.74, + "grad_norm": 0.5478885769844055, + "learning_rate": 0.00051472600968318, + "loss": 3.4411, + "step": 15064 + }, + { + "epoch": 0.74, + "grad_norm": 0.5322989821434021, + "learning_rate": 0.0005147152574223665, + "loss": 3.4172, + "step": 15065 + }, + { + "epoch": 0.74, + "grad_norm": 0.5252341628074646, + "learning_rate": 0.0005147045045960344, + "loss": 3.2743, + "step": 15066 + }, + { + "epoch": 0.74, + "grad_norm": 0.5184382796287537, + "learning_rate": 0.0005146937512042118, + "loss": 3.1589, + "step": 15067 + }, + { + "epoch": 0.74, + "grad_norm": 0.5355132818222046, + "learning_rate": 0.0005146829972469272, + "loss": 3.0857, + "step": 15068 + }, + { + "epoch": 0.74, + "grad_norm": 0.5309262275695801, + "learning_rate": 0.000514672242724209, + "loss": 3.3667, + "step": 15069 + }, + { + "epoch": 0.74, + "grad_norm": 0.529289186000824, + "learning_rate": 0.0005146614876360853, + "loss": 3.4135, + "step": 15070 + }, + { + "epoch": 0.74, + "grad_norm": 0.48815011978149414, + "learning_rate": 0.0005146507319825846, + "loss": 3.4187, + "step": 15071 + }, + { + "epoch": 0.74, + "grad_norm": 0.526040256023407, + "learning_rate": 0.0005146399757637352, + "loss": 3.1328, + "step": 15072 + }, + { + "epoch": 0.74, + "grad_norm": 0.5177284479141235, + "learning_rate": 0.0005146292189795654, + "loss": 3.2615, + "step": 15073 + }, + { + "epoch": 0.74, + "grad_norm": 0.4890044033527374, + "learning_rate": 0.0005146184616301036, + "loss": 3.0297, + "step": 15074 + }, + { + "epoch": 0.74, + "grad_norm": 0.511398434638977, + "learning_rate": 0.000514607703715378, + "loss": 3.5007, + "step": 15075 + }, + { + "epoch": 0.74, + "grad_norm": 0.5235434174537659, + "learning_rate": 0.000514596945235417, + "loss": 3.3912, + "step": 15076 + }, + { + "epoch": 0.74, + "grad_norm": 0.5152917504310608, + "learning_rate": 0.0005145861861902491, + "loss": 3.0959, + "step": 15077 + }, + { + "epoch": 0.74, + "grad_norm": 0.555122971534729, + "learning_rate": 0.0005145754265799023, + "loss": 3.1187, + "step": 15078 + }, + { + "epoch": 0.74, + "grad_norm": 0.5248861908912659, + "learning_rate": 0.0005145646664044053, + "loss": 3.3088, + "step": 15079 + }, + { + "epoch": 0.74, + "grad_norm": 0.48319074511528015, + "learning_rate": 0.0005145539056637861, + "loss": 3.0406, + "step": 15080 + }, + { + "epoch": 0.74, + "grad_norm": 0.5107579827308655, + "learning_rate": 0.0005145431443580732, + "loss": 2.9253, + "step": 15081 + }, + { + "epoch": 0.74, + "grad_norm": 0.5582626461982727, + "learning_rate": 0.000514532382487295, + "loss": 3.3489, + "step": 15082 + }, + { + "epoch": 0.74, + "grad_norm": 0.5222228765487671, + "learning_rate": 0.00051452162005148, + "loss": 3.2836, + "step": 15083 + }, + { + "epoch": 0.74, + "grad_norm": 0.49412620067596436, + "learning_rate": 0.0005145108570506561, + "loss": 3.163, + "step": 15084 + }, + { + "epoch": 0.74, + "grad_norm": 0.530225932598114, + "learning_rate": 0.0005145000934848519, + "loss": 3.144, + "step": 15085 + }, + { + "epoch": 0.74, + "grad_norm": 0.5154251456260681, + "learning_rate": 0.0005144893293540957, + "loss": 3.0688, + "step": 15086 + }, + { + "epoch": 0.74, + "grad_norm": 0.5582703948020935, + "learning_rate": 0.0005144785646584159, + "loss": 3.3622, + "step": 15087 + }, + { + "epoch": 0.74, + "grad_norm": 0.5435081720352173, + "learning_rate": 0.000514467799397841, + "loss": 3.1106, + "step": 15088 + }, + { + "epoch": 0.74, + "grad_norm": 0.5430107712745667, + "learning_rate": 0.000514457033572399, + "loss": 3.0035, + "step": 15089 + }, + { + "epoch": 0.74, + "grad_norm": 0.543735682964325, + "learning_rate": 0.0005144462671821186, + "loss": 3.1732, + "step": 15090 + }, + { + "epoch": 0.74, + "grad_norm": 0.5661927461624146, + "learning_rate": 0.0005144355002270278, + "loss": 2.9093, + "step": 15091 + }, + { + "epoch": 0.74, + "grad_norm": 0.5020666122436523, + "learning_rate": 0.0005144247327071553, + "loss": 3.2608, + "step": 15092 + }, + { + "epoch": 0.74, + "grad_norm": 0.5958285331726074, + "learning_rate": 0.0005144139646225293, + "loss": 3.0192, + "step": 15093 + }, + { + "epoch": 0.74, + "grad_norm": 0.5555652976036072, + "learning_rate": 0.0005144031959731783, + "loss": 3.2023, + "step": 15094 + }, + { + "epoch": 0.74, + "grad_norm": 0.5096257925033569, + "learning_rate": 0.0005143924267591304, + "loss": 3.2049, + "step": 15095 + }, + { + "epoch": 0.74, + "grad_norm": 0.5613263249397278, + "learning_rate": 0.0005143816569804141, + "loss": 3.074, + "step": 15096 + }, + { + "epoch": 0.74, + "grad_norm": 0.4865977168083191, + "learning_rate": 0.0005143708866370579, + "loss": 3.0918, + "step": 15097 + }, + { + "epoch": 0.74, + "grad_norm": 0.509009838104248, + "learning_rate": 0.00051436011572909, + "loss": 3.2596, + "step": 15098 + }, + { + "epoch": 0.74, + "grad_norm": 0.5192385315895081, + "learning_rate": 0.0005143493442565387, + "loss": 3.0407, + "step": 15099 + }, + { + "epoch": 0.74, + "grad_norm": 0.5306475758552551, + "learning_rate": 0.0005143385722194326, + "loss": 3.2282, + "step": 15100 + }, + { + "epoch": 0.74, + "grad_norm": 0.5092197060585022, + "learning_rate": 0.0005143277996177998, + "loss": 3.2562, + "step": 15101 + }, + { + "epoch": 0.74, + "grad_norm": 0.49553215503692627, + "learning_rate": 0.000514317026451669, + "loss": 3.114, + "step": 15102 + }, + { + "epoch": 0.74, + "grad_norm": 0.5059915781021118, + "learning_rate": 0.0005143062527210683, + "loss": 3.3094, + "step": 15103 + }, + { + "epoch": 0.74, + "grad_norm": 0.4825346767902374, + "learning_rate": 0.0005142954784260261, + "loss": 3.2117, + "step": 15104 + }, + { + "epoch": 0.74, + "grad_norm": 0.5263687968254089, + "learning_rate": 0.000514284703566571, + "loss": 3.0489, + "step": 15105 + }, + { + "epoch": 0.74, + "grad_norm": 0.50020432472229, + "learning_rate": 0.0005142739281427313, + "loss": 3.2894, + "step": 15106 + }, + { + "epoch": 0.74, + "grad_norm": 0.5188915133476257, + "learning_rate": 0.0005142631521545351, + "loss": 3.2912, + "step": 15107 + }, + { + "epoch": 0.74, + "grad_norm": 0.5241820812225342, + "learning_rate": 0.0005142523756020111, + "loss": 3.1568, + "step": 15108 + }, + { + "epoch": 0.74, + "grad_norm": 0.47457194328308105, + "learning_rate": 0.0005142415984851875, + "loss": 3.2036, + "step": 15109 + }, + { + "epoch": 0.74, + "grad_norm": 0.46626847982406616, + "learning_rate": 0.0005142308208040928, + "loss": 3.0613, + "step": 15110 + }, + { + "epoch": 0.74, + "grad_norm": 0.5026150941848755, + "learning_rate": 0.0005142200425587555, + "loss": 3.0074, + "step": 15111 + }, + { + "epoch": 0.74, + "grad_norm": 0.531330406665802, + "learning_rate": 0.0005142092637492036, + "loss": 3.3118, + "step": 15112 + }, + { + "epoch": 0.74, + "grad_norm": 0.5424619913101196, + "learning_rate": 0.0005141984843754658, + "loss": 3.2401, + "step": 15113 + }, + { + "epoch": 0.74, + "grad_norm": 0.5372337102890015, + "learning_rate": 0.0005141877044375705, + "loss": 3.2051, + "step": 15114 + }, + { + "epoch": 0.74, + "grad_norm": 0.5131272673606873, + "learning_rate": 0.000514176923935546, + "loss": 3.0717, + "step": 15115 + }, + { + "epoch": 0.74, + "grad_norm": 0.4956575334072113, + "learning_rate": 0.0005141661428694206, + "loss": 3.1742, + "step": 15116 + }, + { + "epoch": 0.74, + "grad_norm": 0.5434887409210205, + "learning_rate": 0.0005141553612392229, + "loss": 3.1218, + "step": 15117 + }, + { + "epoch": 0.74, + "grad_norm": 0.5086133480072021, + "learning_rate": 0.0005141445790449811, + "loss": 3.0653, + "step": 15118 + }, + { + "epoch": 0.74, + "grad_norm": 0.5095211863517761, + "learning_rate": 0.0005141337962867238, + "loss": 3.2132, + "step": 15119 + }, + { + "epoch": 0.74, + "grad_norm": 0.5019669532775879, + "learning_rate": 0.0005141230129644792, + "loss": 3.1334, + "step": 15120 + }, + { + "epoch": 0.74, + "grad_norm": 0.5907220244407654, + "learning_rate": 0.0005141122290782758, + "loss": 3.0343, + "step": 15121 + }, + { + "epoch": 0.74, + "grad_norm": 0.531029462814331, + "learning_rate": 0.000514101444628142, + "loss": 3.2226, + "step": 15122 + }, + { + "epoch": 0.74, + "grad_norm": 0.5512033104896545, + "learning_rate": 0.0005140906596141063, + "loss": 3.3203, + "step": 15123 + }, + { + "epoch": 0.74, + "grad_norm": 0.5198848843574524, + "learning_rate": 0.0005140798740361968, + "loss": 3.1245, + "step": 15124 + }, + { + "epoch": 0.74, + "grad_norm": 0.5568894147872925, + "learning_rate": 0.0005140690878944423, + "loss": 3.2085, + "step": 15125 + }, + { + "epoch": 0.74, + "grad_norm": 0.5185664296150208, + "learning_rate": 0.0005140583011888709, + "loss": 3.3906, + "step": 15126 + }, + { + "epoch": 0.74, + "grad_norm": 0.5689828991889954, + "learning_rate": 0.0005140475139195112, + "loss": 3.0878, + "step": 15127 + }, + { + "epoch": 0.74, + "grad_norm": 0.5163256525993347, + "learning_rate": 0.0005140367260863916, + "loss": 3.3418, + "step": 15128 + }, + { + "epoch": 0.74, + "grad_norm": 0.487690269947052, + "learning_rate": 0.0005140259376895404, + "loss": 3.4939, + "step": 15129 + }, + { + "epoch": 0.74, + "grad_norm": 0.5142568349838257, + "learning_rate": 0.000514015148728986, + "loss": 3.1635, + "step": 15130 + }, + { + "epoch": 0.74, + "grad_norm": 0.5546401739120483, + "learning_rate": 0.000514004359204757, + "loss": 3.1504, + "step": 15131 + }, + { + "epoch": 0.74, + "grad_norm": 0.5494998693466187, + "learning_rate": 0.0005139935691168816, + "loss": 3.1133, + "step": 15132 + }, + { + "epoch": 0.74, + "grad_norm": 0.5090645551681519, + "learning_rate": 0.0005139827784653884, + "loss": 3.1497, + "step": 15133 + }, + { + "epoch": 0.74, + "grad_norm": 0.5398714542388916, + "learning_rate": 0.0005139719872503057, + "loss": 3.2604, + "step": 15134 + }, + { + "epoch": 0.74, + "grad_norm": 0.5296069383621216, + "learning_rate": 0.0005139611954716619, + "loss": 3.4238, + "step": 15135 + }, + { + "epoch": 0.74, + "grad_norm": 0.507793664932251, + "learning_rate": 0.0005139504031294855, + "loss": 3.1279, + "step": 15136 + }, + { + "epoch": 0.74, + "grad_norm": 0.5212898850440979, + "learning_rate": 0.000513939610223805, + "loss": 3.2929, + "step": 15137 + }, + { + "epoch": 0.74, + "grad_norm": 0.5241724848747253, + "learning_rate": 0.0005139288167546487, + "loss": 3.2657, + "step": 15138 + }, + { + "epoch": 0.74, + "grad_norm": 0.5247151255607605, + "learning_rate": 0.0005139180227220451, + "loss": 3.2378, + "step": 15139 + }, + { + "epoch": 0.74, + "grad_norm": 0.4941554665565491, + "learning_rate": 0.0005139072281260226, + "loss": 3.0256, + "step": 15140 + }, + { + "epoch": 0.74, + "grad_norm": 0.5236513018608093, + "learning_rate": 0.0005138964329666096, + "loss": 3.1188, + "step": 15141 + }, + { + "epoch": 0.74, + "grad_norm": 0.5333306789398193, + "learning_rate": 0.0005138856372438347, + "loss": 3.3072, + "step": 15142 + }, + { + "epoch": 0.74, + "grad_norm": 0.5609138607978821, + "learning_rate": 0.000513874840957726, + "loss": 3.1602, + "step": 15143 + }, + { + "epoch": 0.74, + "grad_norm": 0.49884551763534546, + "learning_rate": 0.0005138640441083122, + "loss": 3.1501, + "step": 15144 + }, + { + "epoch": 0.74, + "grad_norm": 0.5260482430458069, + "learning_rate": 0.0005138532466956216, + "loss": 3.3145, + "step": 15145 + }, + { + "epoch": 0.74, + "grad_norm": 0.543743908405304, + "learning_rate": 0.0005138424487196829, + "loss": 3.0801, + "step": 15146 + }, + { + "epoch": 0.74, + "grad_norm": 0.5195195078849792, + "learning_rate": 0.0005138316501805242, + "loss": 3.2929, + "step": 15147 + }, + { + "epoch": 0.74, + "grad_norm": 0.5003941655158997, + "learning_rate": 0.0005138208510781741, + "loss": 3.1971, + "step": 15148 + }, + { + "epoch": 0.74, + "grad_norm": 0.5132508873939514, + "learning_rate": 0.000513810051412661, + "loss": 3.271, + "step": 15149 + }, + { + "epoch": 0.74, + "grad_norm": 0.5287436246871948, + "learning_rate": 0.0005137992511840134, + "loss": 3.1374, + "step": 15150 + }, + { + "epoch": 0.74, + "grad_norm": 0.5131794810295105, + "learning_rate": 0.0005137884503922597, + "loss": 3.1865, + "step": 15151 + }, + { + "epoch": 0.74, + "grad_norm": 0.4891071021556854, + "learning_rate": 0.0005137776490374284, + "loss": 3.1813, + "step": 15152 + }, + { + "epoch": 0.74, + "grad_norm": 0.518698513507843, + "learning_rate": 0.0005137668471195478, + "loss": 2.9667, + "step": 15153 + }, + { + "epoch": 0.74, + "grad_norm": 0.5069079995155334, + "learning_rate": 0.0005137560446386466, + "loss": 3.1667, + "step": 15154 + }, + { + "epoch": 0.74, + "grad_norm": 0.5019563436508179, + "learning_rate": 0.0005137452415947531, + "loss": 3.048, + "step": 15155 + }, + { + "epoch": 0.74, + "grad_norm": 0.5977448225021362, + "learning_rate": 0.0005137344379878958, + "loss": 3.0489, + "step": 15156 + }, + { + "epoch": 0.74, + "grad_norm": 0.526675283908844, + "learning_rate": 0.000513723633818103, + "loss": 3.2784, + "step": 15157 + }, + { + "epoch": 0.74, + "grad_norm": 0.5488605499267578, + "learning_rate": 0.0005137128290854035, + "loss": 3.3835, + "step": 15158 + }, + { + "epoch": 0.74, + "grad_norm": 0.5367397665977478, + "learning_rate": 0.0005137020237898254, + "loss": 3.3152, + "step": 15159 + }, + { + "epoch": 0.74, + "grad_norm": 0.5273241400718689, + "learning_rate": 0.0005136912179313973, + "loss": 2.9809, + "step": 15160 + }, + { + "epoch": 0.74, + "grad_norm": 0.555223822593689, + "learning_rate": 0.0005136804115101475, + "loss": 3.1469, + "step": 15161 + }, + { + "epoch": 0.74, + "grad_norm": 0.533078670501709, + "learning_rate": 0.0005136696045261049, + "loss": 3.1378, + "step": 15162 + }, + { + "epoch": 0.74, + "grad_norm": 0.5096789598464966, + "learning_rate": 0.0005136587969792975, + "loss": 3.2697, + "step": 15163 + }, + { + "epoch": 0.74, + "grad_norm": 0.4871804416179657, + "learning_rate": 0.000513647988869754, + "loss": 3.1497, + "step": 15164 + }, + { + "epoch": 0.74, + "grad_norm": 0.47410327196121216, + "learning_rate": 0.0005136371801975028, + "loss": 3.0985, + "step": 15165 + }, + { + "epoch": 0.74, + "grad_norm": 0.5203956961631775, + "learning_rate": 0.0005136263709625724, + "loss": 3.1768, + "step": 15166 + }, + { + "epoch": 0.74, + "grad_norm": 0.5182998180389404, + "learning_rate": 0.0005136155611649912, + "loss": 3.0054, + "step": 15167 + }, + { + "epoch": 0.74, + "grad_norm": 0.5807198882102966, + "learning_rate": 0.0005136047508047879, + "loss": 3.1998, + "step": 15168 + }, + { + "epoch": 0.74, + "grad_norm": 0.49342507123947144, + "learning_rate": 0.0005135939398819906, + "loss": 3.1338, + "step": 15169 + }, + { + "epoch": 0.74, + "grad_norm": 0.5400848984718323, + "learning_rate": 0.000513583128396628, + "loss": 3.1402, + "step": 15170 + }, + { + "epoch": 0.74, + "grad_norm": 0.7351841926574707, + "learning_rate": 0.0005135723163487286, + "loss": 3.0862, + "step": 15171 + }, + { + "epoch": 0.74, + "grad_norm": 0.601331353187561, + "learning_rate": 0.0005135615037383209, + "loss": 3.1345, + "step": 15172 + }, + { + "epoch": 0.74, + "grad_norm": 0.5336458086967468, + "learning_rate": 0.0005135506905654331, + "loss": 3.0461, + "step": 15173 + }, + { + "epoch": 0.74, + "grad_norm": 0.5114117860794067, + "learning_rate": 0.000513539876830094, + "loss": 3.3398, + "step": 15174 + }, + { + "epoch": 0.74, + "grad_norm": 0.521041750907898, + "learning_rate": 0.0005135290625323319, + "loss": 3.2127, + "step": 15175 + }, + { + "epoch": 0.74, + "grad_norm": 0.5434523224830627, + "learning_rate": 0.0005135182476721754, + "loss": 3.2585, + "step": 15176 + }, + { + "epoch": 0.74, + "grad_norm": 0.556186854839325, + "learning_rate": 0.0005135074322496529, + "loss": 3.1948, + "step": 15177 + }, + { + "epoch": 0.74, + "grad_norm": 0.540410041809082, + "learning_rate": 0.0005134966162647929, + "loss": 2.9405, + "step": 15178 + }, + { + "epoch": 0.74, + "grad_norm": 0.540989100933075, + "learning_rate": 0.0005134857997176241, + "loss": 3.3179, + "step": 15179 + }, + { + "epoch": 0.74, + "grad_norm": 0.5502685308456421, + "learning_rate": 0.0005134749826081745, + "loss": 2.9779, + "step": 15180 + }, + { + "epoch": 0.74, + "grad_norm": 0.5441862344741821, + "learning_rate": 0.000513464164936473, + "loss": 3.2497, + "step": 15181 + }, + { + "epoch": 0.74, + "grad_norm": 0.5529954433441162, + "learning_rate": 0.0005134533467025479, + "loss": 3.1787, + "step": 15182 + }, + { + "epoch": 0.74, + "grad_norm": 0.5215994715690613, + "learning_rate": 0.0005134425279064279, + "loss": 3.2406, + "step": 15183 + }, + { + "epoch": 0.74, + "grad_norm": 0.4941001832485199, + "learning_rate": 0.0005134317085481413, + "loss": 2.8891, + "step": 15184 + }, + { + "epoch": 0.74, + "grad_norm": 0.5015299320220947, + "learning_rate": 0.0005134208886277167, + "loss": 3.1398, + "step": 15185 + }, + { + "epoch": 0.74, + "grad_norm": 0.4704887866973877, + "learning_rate": 0.0005134100681451825, + "loss": 3.2913, + "step": 15186 + }, + { + "epoch": 0.74, + "grad_norm": 0.4975797235965729, + "learning_rate": 0.0005133992471005672, + "loss": 3.055, + "step": 15187 + }, + { + "epoch": 0.74, + "grad_norm": 0.5469768047332764, + "learning_rate": 0.0005133884254938994, + "loss": 3.1481, + "step": 15188 + }, + { + "epoch": 0.74, + "grad_norm": 0.5250169634819031, + "learning_rate": 0.0005133776033252076, + "loss": 3.2758, + "step": 15189 + }, + { + "epoch": 0.74, + "grad_norm": 0.5461866855621338, + "learning_rate": 0.0005133667805945202, + "loss": 3.2773, + "step": 15190 + }, + { + "epoch": 0.74, + "grad_norm": 0.501222550868988, + "learning_rate": 0.0005133559573018658, + "loss": 3.1817, + "step": 15191 + }, + { + "epoch": 0.74, + "grad_norm": 0.5163980722427368, + "learning_rate": 0.0005133451334472729, + "loss": 3.3022, + "step": 15192 + }, + { + "epoch": 0.74, + "grad_norm": 0.49753910303115845, + "learning_rate": 0.0005133343090307699, + "loss": 3.3661, + "step": 15193 + }, + { + "epoch": 0.74, + "grad_norm": 0.5362524390220642, + "learning_rate": 0.0005133234840523854, + "loss": 3.2704, + "step": 15194 + }, + { + "epoch": 0.74, + "grad_norm": 0.5319569110870361, + "learning_rate": 0.000513312658512148, + "loss": 3.2043, + "step": 15195 + }, + { + "epoch": 0.74, + "grad_norm": 0.5191274881362915, + "learning_rate": 0.0005133018324100859, + "loss": 3.3014, + "step": 15196 + }, + { + "epoch": 0.74, + "grad_norm": 0.6559003591537476, + "learning_rate": 0.000513291005746228, + "loss": 3.253, + "step": 15197 + }, + { + "epoch": 0.74, + "grad_norm": 0.5224505066871643, + "learning_rate": 0.0005132801785206026, + "loss": 3.2519, + "step": 15198 + }, + { + "epoch": 0.74, + "grad_norm": 0.5321853756904602, + "learning_rate": 0.0005132693507332383, + "loss": 2.9671, + "step": 15199 + }, + { + "epoch": 0.74, + "grad_norm": 0.5000855326652527, + "learning_rate": 0.0005132585223841635, + "loss": 3.1668, + "step": 15200 + }, + { + "epoch": 0.74, + "grad_norm": 0.5224664211273193, + "learning_rate": 0.0005132476934734068, + "loss": 3.2821, + "step": 15201 + }, + { + "epoch": 0.75, + "grad_norm": 0.4886454641819, + "learning_rate": 0.0005132368640009968, + "loss": 3.0804, + "step": 15202 + }, + { + "epoch": 0.75, + "grad_norm": 0.5113134384155273, + "learning_rate": 0.0005132260339669618, + "loss": 3.0912, + "step": 15203 + }, + { + "epoch": 0.75, + "grad_norm": 0.5165402293205261, + "learning_rate": 0.0005132152033713305, + "loss": 3.1487, + "step": 15204 + }, + { + "epoch": 0.75, + "grad_norm": 0.5087444186210632, + "learning_rate": 0.0005132043722141314, + "loss": 3.0623, + "step": 15205 + }, + { + "epoch": 0.75, + "grad_norm": 0.49701055884361267, + "learning_rate": 0.0005131935404953931, + "loss": 3.1004, + "step": 15206 + }, + { + "epoch": 0.75, + "grad_norm": 0.5303621888160706, + "learning_rate": 0.000513182708215144, + "loss": 3.1908, + "step": 15207 + }, + { + "epoch": 0.75, + "grad_norm": 0.48327744007110596, + "learning_rate": 0.0005131718753734127, + "loss": 3.1967, + "step": 15208 + }, + { + "epoch": 0.75, + "grad_norm": 0.5533356666564941, + "learning_rate": 0.0005131610419702276, + "loss": 3.2278, + "step": 15209 + }, + { + "epoch": 0.75, + "grad_norm": 0.5282249450683594, + "learning_rate": 0.0005131502080056174, + "loss": 3.2681, + "step": 15210 + }, + { + "epoch": 0.75, + "grad_norm": 0.5078888535499573, + "learning_rate": 0.0005131393734796106, + "loss": 3.3327, + "step": 15211 + }, + { + "epoch": 0.75, + "grad_norm": 0.49346816539764404, + "learning_rate": 0.0005131285383922357, + "loss": 3.1853, + "step": 15212 + }, + { + "epoch": 0.75, + "grad_norm": 0.5162724256515503, + "learning_rate": 0.0005131177027435213, + "loss": 3.0953, + "step": 15213 + }, + { + "epoch": 0.75, + "grad_norm": 0.5229780673980713, + "learning_rate": 0.0005131068665334957, + "loss": 3.0968, + "step": 15214 + }, + { + "epoch": 0.75, + "grad_norm": 0.5405413508415222, + "learning_rate": 0.0005130960297621877, + "loss": 2.9953, + "step": 15215 + }, + { + "epoch": 0.75, + "grad_norm": 0.5310748219490051, + "learning_rate": 0.000513085192429626, + "loss": 3.1261, + "step": 15216 + }, + { + "epoch": 0.75, + "grad_norm": 0.5561037659645081, + "learning_rate": 0.0005130743545358386, + "loss": 3.3495, + "step": 15217 + }, + { + "epoch": 0.75, + "grad_norm": 0.5247962474822998, + "learning_rate": 0.0005130635160808545, + "loss": 3.3857, + "step": 15218 + }, + { + "epoch": 0.75, + "grad_norm": 0.5569313168525696, + "learning_rate": 0.0005130526770647021, + "loss": 3.169, + "step": 15219 + }, + { + "epoch": 0.75, + "grad_norm": 0.516389012336731, + "learning_rate": 0.0005130418374874099, + "loss": 3.2518, + "step": 15220 + }, + { + "epoch": 0.75, + "grad_norm": 0.5360453128814697, + "learning_rate": 0.0005130309973490066, + "loss": 3.1335, + "step": 15221 + }, + { + "epoch": 0.75, + "grad_norm": 0.4962994158267975, + "learning_rate": 0.0005130201566495206, + "loss": 3.3244, + "step": 15222 + }, + { + "epoch": 0.75, + "grad_norm": 0.5042166113853455, + "learning_rate": 0.0005130093153889805, + "loss": 3.1213, + "step": 15223 + }, + { + "epoch": 0.75, + "grad_norm": 0.504375696182251, + "learning_rate": 0.0005129984735674149, + "loss": 3.1488, + "step": 15224 + }, + { + "epoch": 0.75, + "grad_norm": 0.5094754695892334, + "learning_rate": 0.0005129876311848522, + "loss": 3.4237, + "step": 15225 + }, + { + "epoch": 0.75, + "grad_norm": 0.49719002842903137, + "learning_rate": 0.0005129767882413211, + "loss": 3.2614, + "step": 15226 + }, + { + "epoch": 0.75, + "grad_norm": 0.5632519721984863, + "learning_rate": 0.0005129659447368502, + "loss": 3.1523, + "step": 15227 + }, + { + "epoch": 0.75, + "grad_norm": 0.5078555345535278, + "learning_rate": 0.0005129551006714678, + "loss": 3.2035, + "step": 15228 + }, + { + "epoch": 0.75, + "grad_norm": 0.5147547721862793, + "learning_rate": 0.0005129442560452029, + "loss": 3.3097, + "step": 15229 + }, + { + "epoch": 0.75, + "grad_norm": 0.5100898742675781, + "learning_rate": 0.0005129334108580837, + "loss": 3.2415, + "step": 15230 + }, + { + "epoch": 0.75, + "grad_norm": 0.49058064818382263, + "learning_rate": 0.0005129225651101389, + "loss": 3.1857, + "step": 15231 + }, + { + "epoch": 0.75, + "grad_norm": 0.4862991273403168, + "learning_rate": 0.0005129117188013971, + "loss": 3.4175, + "step": 15232 + }, + { + "epoch": 0.75, + "grad_norm": 0.5109384059906006, + "learning_rate": 0.0005129008719318867, + "loss": 3.3401, + "step": 15233 + }, + { + "epoch": 0.75, + "grad_norm": 0.5978837013244629, + "learning_rate": 0.0005128900245016365, + "loss": 2.9597, + "step": 15234 + }, + { + "epoch": 0.75, + "grad_norm": 0.5241730809211731, + "learning_rate": 0.0005128791765106747, + "loss": 3.3489, + "step": 15235 + }, + { + "epoch": 0.75, + "grad_norm": 0.544468104839325, + "learning_rate": 0.0005128683279590304, + "loss": 3.0905, + "step": 15236 + }, + { + "epoch": 0.75, + "grad_norm": 0.5095611214637756, + "learning_rate": 0.0005128574788467317, + "loss": 3.2927, + "step": 15237 + }, + { + "epoch": 0.75, + "grad_norm": 0.6200950741767883, + "learning_rate": 0.0005128466291738074, + "loss": 3.3109, + "step": 15238 + }, + { + "epoch": 0.75, + "grad_norm": 0.5368150472640991, + "learning_rate": 0.0005128357789402861, + "loss": 3.004, + "step": 15239 + }, + { + "epoch": 0.75, + "grad_norm": 0.49415868520736694, + "learning_rate": 0.0005128249281461963, + "loss": 3.2525, + "step": 15240 + }, + { + "epoch": 0.75, + "grad_norm": 0.5034611821174622, + "learning_rate": 0.0005128140767915666, + "loss": 3.0394, + "step": 15241 + }, + { + "epoch": 0.75, + "grad_norm": 0.5524153709411621, + "learning_rate": 0.0005128032248764256, + "loss": 3.0271, + "step": 15242 + }, + { + "epoch": 0.75, + "grad_norm": 0.537729024887085, + "learning_rate": 0.0005127923724008018, + "loss": 3.3173, + "step": 15243 + }, + { + "epoch": 0.75, + "grad_norm": 0.5131357908248901, + "learning_rate": 0.0005127815193647239, + "loss": 2.9923, + "step": 15244 + }, + { + "epoch": 0.75, + "grad_norm": 0.546363353729248, + "learning_rate": 0.0005127706657682205, + "loss": 2.9208, + "step": 15245 + }, + { + "epoch": 0.75, + "grad_norm": 0.565846860408783, + "learning_rate": 0.00051275981161132, + "loss": 3.3611, + "step": 15246 + }, + { + "epoch": 0.75, + "grad_norm": 0.5199288129806519, + "learning_rate": 0.0005127489568940511, + "loss": 3.185, + "step": 15247 + }, + { + "epoch": 0.75, + "grad_norm": 0.5325568318367004, + "learning_rate": 0.0005127381016164425, + "loss": 3.0524, + "step": 15248 + }, + { + "epoch": 0.75, + "grad_norm": 0.5222455263137817, + "learning_rate": 0.0005127272457785225, + "loss": 3.0541, + "step": 15249 + }, + { + "epoch": 0.75, + "grad_norm": 0.5106666684150696, + "learning_rate": 0.00051271638938032, + "loss": 3.2104, + "step": 15250 + }, + { + "epoch": 0.75, + "grad_norm": 0.5324909090995789, + "learning_rate": 0.0005127055324218635, + "loss": 3.2489, + "step": 15251 + }, + { + "epoch": 0.75, + "grad_norm": 0.5093536376953125, + "learning_rate": 0.0005126946749031814, + "loss": 3.1664, + "step": 15252 + }, + { + "epoch": 0.75, + "grad_norm": 0.5190713405609131, + "learning_rate": 0.0005126838168243026, + "loss": 3.1126, + "step": 15253 + }, + { + "epoch": 0.75, + "grad_norm": 0.6075634360313416, + "learning_rate": 0.0005126729581852556, + "loss": 3.4352, + "step": 15254 + }, + { + "epoch": 0.75, + "grad_norm": 0.5500290393829346, + "learning_rate": 0.0005126620989860688, + "loss": 3.1548, + "step": 15255 + }, + { + "epoch": 0.75, + "grad_norm": 0.5063414573669434, + "learning_rate": 0.000512651239226771, + "loss": 3.2785, + "step": 15256 + }, + { + "epoch": 0.75, + "grad_norm": 0.5023425817489624, + "learning_rate": 0.0005126403789073909, + "loss": 3.2736, + "step": 15257 + }, + { + "epoch": 0.75, + "grad_norm": 0.5160260796546936, + "learning_rate": 0.0005126295180279568, + "loss": 3.3132, + "step": 15258 + }, + { + "epoch": 0.75, + "grad_norm": 0.50048828125, + "learning_rate": 0.0005126186565884975, + "loss": 3.1487, + "step": 15259 + }, + { + "epoch": 0.75, + "grad_norm": 0.49590444564819336, + "learning_rate": 0.0005126077945890417, + "loss": 3.2356, + "step": 15260 + }, + { + "epoch": 0.75, + "grad_norm": 0.5084480047225952, + "learning_rate": 0.0005125969320296178, + "loss": 3.2633, + "step": 15261 + }, + { + "epoch": 0.75, + "grad_norm": 0.5234677195549011, + "learning_rate": 0.0005125860689102544, + "loss": 3.0592, + "step": 15262 + }, + { + "epoch": 0.75, + "grad_norm": 0.5146345496177673, + "learning_rate": 0.0005125752052309805, + "loss": 3.3137, + "step": 15263 + }, + { + "epoch": 0.75, + "grad_norm": 0.5760210752487183, + "learning_rate": 0.0005125643409918242, + "loss": 3.3984, + "step": 15264 + }, + { + "epoch": 0.75, + "grad_norm": 0.5140902996063232, + "learning_rate": 0.0005125534761928144, + "loss": 3.307, + "step": 15265 + }, + { + "epoch": 0.75, + "grad_norm": 0.5068773031234741, + "learning_rate": 0.0005125426108339795, + "loss": 3.0952, + "step": 15266 + }, + { + "epoch": 0.75, + "grad_norm": 0.5212466716766357, + "learning_rate": 0.0005125317449153484, + "loss": 3.1379, + "step": 15267 + }, + { + "epoch": 0.75, + "grad_norm": 0.5364983677864075, + "learning_rate": 0.0005125208784369495, + "loss": 3.2761, + "step": 15268 + }, + { + "epoch": 0.75, + "grad_norm": 0.4984794557094574, + "learning_rate": 0.0005125100113988117, + "loss": 3.1004, + "step": 15269 + }, + { + "epoch": 0.75, + "grad_norm": 0.5525820851325989, + "learning_rate": 0.0005124991438009632, + "loss": 3.2194, + "step": 15270 + }, + { + "epoch": 0.75, + "grad_norm": 0.5691623091697693, + "learning_rate": 0.000512488275643433, + "loss": 3.1136, + "step": 15271 + }, + { + "epoch": 0.75, + "grad_norm": 0.5180755853652954, + "learning_rate": 0.0005124774069262494, + "loss": 3.2401, + "step": 15272 + }, + { + "epoch": 0.75, + "grad_norm": 0.5265724062919617, + "learning_rate": 0.0005124665376494414, + "loss": 3.2386, + "step": 15273 + }, + { + "epoch": 0.75, + "grad_norm": 0.48479267954826355, + "learning_rate": 0.0005124556678130374, + "loss": 3.1576, + "step": 15274 + }, + { + "epoch": 0.75, + "grad_norm": 0.5298726558685303, + "learning_rate": 0.000512444797417066, + "loss": 2.8294, + "step": 15275 + }, + { + "epoch": 0.75, + "grad_norm": 0.509896457195282, + "learning_rate": 0.000512433926461556, + "loss": 3.0053, + "step": 15276 + }, + { + "epoch": 0.75, + "grad_norm": 0.5217825174331665, + "learning_rate": 0.0005124230549465357, + "loss": 3.2738, + "step": 15277 + }, + { + "epoch": 0.75, + "grad_norm": 0.4862534999847412, + "learning_rate": 0.0005124121828720341, + "loss": 3.2049, + "step": 15278 + }, + { + "epoch": 0.75, + "grad_norm": 0.5458556413650513, + "learning_rate": 0.0005124013102380797, + "loss": 3.2185, + "step": 15279 + }, + { + "epoch": 0.75, + "grad_norm": 0.5805124044418335, + "learning_rate": 0.0005123904370447011, + "loss": 3.0137, + "step": 15280 + }, + { + "epoch": 0.75, + "grad_norm": 0.521308958530426, + "learning_rate": 0.000512379563291927, + "loss": 3.1837, + "step": 15281 + }, + { + "epoch": 0.75, + "grad_norm": 0.5270718336105347, + "learning_rate": 0.000512368688979786, + "loss": 3.4552, + "step": 15282 + }, + { + "epoch": 0.75, + "grad_norm": 0.504327654838562, + "learning_rate": 0.0005123578141083067, + "loss": 3.405, + "step": 15283 + }, + { + "epoch": 0.75, + "grad_norm": 0.49246448278427124, + "learning_rate": 0.0005123469386775178, + "loss": 3.1681, + "step": 15284 + }, + { + "epoch": 0.75, + "grad_norm": 0.5137251615524292, + "learning_rate": 0.0005123360626874479, + "loss": 3.4152, + "step": 15285 + }, + { + "epoch": 0.75, + "grad_norm": 0.477211058139801, + "learning_rate": 0.0005123251861381257, + "loss": 2.988, + "step": 15286 + }, + { + "epoch": 0.75, + "grad_norm": 0.48780468106269836, + "learning_rate": 0.0005123143090295799, + "loss": 3.2013, + "step": 15287 + }, + { + "epoch": 0.75, + "grad_norm": 0.535379946231842, + "learning_rate": 0.0005123034313618389, + "loss": 3.1484, + "step": 15288 + }, + { + "epoch": 0.75, + "grad_norm": 0.5203052163124084, + "learning_rate": 0.0005122925531349317, + "loss": 3.1849, + "step": 15289 + }, + { + "epoch": 0.75, + "grad_norm": 0.5167236924171448, + "learning_rate": 0.0005122816743488866, + "loss": 3.0851, + "step": 15290 + }, + { + "epoch": 0.75, + "grad_norm": 0.4870116412639618, + "learning_rate": 0.0005122707950037325, + "loss": 3.3136, + "step": 15291 + }, + { + "epoch": 0.75, + "grad_norm": 0.5424923300743103, + "learning_rate": 0.0005122599150994981, + "loss": 3.1071, + "step": 15292 + }, + { + "epoch": 0.75, + "grad_norm": 0.5110763311386108, + "learning_rate": 0.0005122490346362119, + "loss": 3.0956, + "step": 15293 + }, + { + "epoch": 0.75, + "grad_norm": 0.5034542083740234, + "learning_rate": 0.0005122381536139025, + "loss": 3.2484, + "step": 15294 + }, + { + "epoch": 0.75, + "grad_norm": 0.49909499287605286, + "learning_rate": 0.0005122272720325986, + "loss": 3.314, + "step": 15295 + }, + { + "epoch": 0.75, + "grad_norm": 0.5280368328094482, + "learning_rate": 0.0005122163898923289, + "loss": 3.3027, + "step": 15296 + }, + { + "epoch": 0.75, + "grad_norm": 0.4968806505203247, + "learning_rate": 0.0005122055071931222, + "loss": 3.2456, + "step": 15297 + }, + { + "epoch": 0.75, + "grad_norm": 0.5014216899871826, + "learning_rate": 0.000512194623935007, + "loss": 3.1594, + "step": 15298 + }, + { + "epoch": 0.75, + "grad_norm": 0.535797655582428, + "learning_rate": 0.000512183740118012, + "loss": 3.2467, + "step": 15299 + }, + { + "epoch": 0.75, + "grad_norm": 0.52153080701828, + "learning_rate": 0.0005121728557421658, + "loss": 2.9868, + "step": 15300 + }, + { + "epoch": 0.75, + "grad_norm": 0.5368970632553101, + "learning_rate": 0.0005121619708074972, + "loss": 3.1427, + "step": 15301 + }, + { + "epoch": 0.75, + "grad_norm": 0.5109313726425171, + "learning_rate": 0.0005121510853140348, + "loss": 3.0168, + "step": 15302 + }, + { + "epoch": 0.75, + "grad_norm": 0.5726766586303711, + "learning_rate": 0.0005121401992618073, + "loss": 3.0998, + "step": 15303 + }, + { + "epoch": 0.75, + "grad_norm": 0.5439186692237854, + "learning_rate": 0.0005121293126508432, + "loss": 3.1734, + "step": 15304 + }, + { + "epoch": 0.75, + "grad_norm": 0.5013107061386108, + "learning_rate": 0.0005121184254811714, + "loss": 3.3949, + "step": 15305 + }, + { + "epoch": 0.75, + "grad_norm": 0.5400449633598328, + "learning_rate": 0.0005121075377528205, + "loss": 3.0781, + "step": 15306 + }, + { + "epoch": 0.75, + "grad_norm": 0.5805463790893555, + "learning_rate": 0.0005120966494658192, + "loss": 3.1349, + "step": 15307 + }, + { + "epoch": 0.75, + "grad_norm": 0.5074591636657715, + "learning_rate": 0.0005120857606201961, + "loss": 3.3115, + "step": 15308 + }, + { + "epoch": 0.75, + "grad_norm": 0.5145214200019836, + "learning_rate": 0.00051207487121598, + "loss": 3.1443, + "step": 15309 + }, + { + "epoch": 0.75, + "grad_norm": 0.5054406523704529, + "learning_rate": 0.0005120639812531995, + "loss": 3.4488, + "step": 15310 + }, + { + "epoch": 0.75, + "grad_norm": 0.5463675856590271, + "learning_rate": 0.0005120530907318831, + "loss": 2.9445, + "step": 15311 + }, + { + "epoch": 0.75, + "grad_norm": 0.5270100235939026, + "learning_rate": 0.0005120421996520597, + "loss": 3.2497, + "step": 15312 + }, + { + "epoch": 0.75, + "grad_norm": 0.5352219939231873, + "learning_rate": 0.0005120313080137581, + "loss": 3.1157, + "step": 15313 + }, + { + "epoch": 0.75, + "grad_norm": 0.550274670124054, + "learning_rate": 0.0005120204158170069, + "loss": 3.0731, + "step": 15314 + }, + { + "epoch": 0.75, + "grad_norm": 0.5238516330718994, + "learning_rate": 0.0005120095230618345, + "loss": 3.4092, + "step": 15315 + }, + { + "epoch": 0.75, + "grad_norm": 0.4942419230937958, + "learning_rate": 0.00051199862974827, + "loss": 3.3665, + "step": 15316 + }, + { + "epoch": 0.75, + "grad_norm": 0.489196240901947, + "learning_rate": 0.0005119877358763418, + "loss": 3.0608, + "step": 15317 + }, + { + "epoch": 0.75, + "grad_norm": 0.5123137831687927, + "learning_rate": 0.0005119768414460788, + "loss": 3.3294, + "step": 15318 + }, + { + "epoch": 0.75, + "grad_norm": 0.48939448595046997, + "learning_rate": 0.0005119659464575096, + "loss": 3.3551, + "step": 15319 + }, + { + "epoch": 0.75, + "grad_norm": 0.5240415930747986, + "learning_rate": 0.0005119550509106628, + "loss": 3.3512, + "step": 15320 + }, + { + "epoch": 0.75, + "grad_norm": 0.5232025980949402, + "learning_rate": 0.0005119441548055671, + "loss": 3.0429, + "step": 15321 + }, + { + "epoch": 0.75, + "grad_norm": 0.5226197242736816, + "learning_rate": 0.0005119332581422515, + "loss": 3.2391, + "step": 15322 + }, + { + "epoch": 0.75, + "grad_norm": 0.5628093481063843, + "learning_rate": 0.0005119223609207443, + "loss": 3.3304, + "step": 15323 + }, + { + "epoch": 0.75, + "grad_norm": 0.4734574854373932, + "learning_rate": 0.0005119114631410745, + "loss": 3.4568, + "step": 15324 + }, + { + "epoch": 0.75, + "grad_norm": 0.515416145324707, + "learning_rate": 0.0005119005648032707, + "loss": 3.2631, + "step": 15325 + }, + { + "epoch": 0.75, + "grad_norm": 0.5349523425102234, + "learning_rate": 0.0005118896659073616, + "loss": 3.5373, + "step": 15326 + }, + { + "epoch": 0.75, + "grad_norm": 0.562615156173706, + "learning_rate": 0.0005118787664533757, + "loss": 3.1333, + "step": 15327 + }, + { + "epoch": 0.75, + "grad_norm": 0.49806028604507446, + "learning_rate": 0.0005118678664413421, + "loss": 3.2441, + "step": 15328 + }, + { + "epoch": 0.75, + "grad_norm": 0.49525222182273865, + "learning_rate": 0.0005118569658712893, + "loss": 3.1967, + "step": 15329 + }, + { + "epoch": 0.75, + "grad_norm": 0.548129141330719, + "learning_rate": 0.0005118460647432461, + "loss": 3.159, + "step": 15330 + }, + { + "epoch": 0.75, + "grad_norm": 0.5434116721153259, + "learning_rate": 0.000511835163057241, + "loss": 3.2088, + "step": 15331 + }, + { + "epoch": 0.75, + "grad_norm": 0.5034930109977722, + "learning_rate": 0.0005118242608133028, + "loss": 3.3135, + "step": 15332 + }, + { + "epoch": 0.75, + "grad_norm": 0.5119454264640808, + "learning_rate": 0.0005118133580114604, + "loss": 3.1451, + "step": 15333 + }, + { + "epoch": 0.75, + "grad_norm": 0.5325192213058472, + "learning_rate": 0.0005118024546517424, + "loss": 3.1097, + "step": 15334 + }, + { + "epoch": 0.75, + "grad_norm": 0.5029995441436768, + "learning_rate": 0.0005117915507341774, + "loss": 3.3235, + "step": 15335 + }, + { + "epoch": 0.75, + "grad_norm": 0.5167403221130371, + "learning_rate": 0.0005117806462587942, + "loss": 3.1699, + "step": 15336 + }, + { + "epoch": 0.75, + "grad_norm": 0.5161690711975098, + "learning_rate": 0.0005117697412256216, + "loss": 3.3004, + "step": 15337 + }, + { + "epoch": 0.75, + "grad_norm": 0.5440047383308411, + "learning_rate": 0.0005117588356346881, + "loss": 3.187, + "step": 15338 + }, + { + "epoch": 0.75, + "grad_norm": 0.5046606063842773, + "learning_rate": 0.0005117479294860228, + "loss": 3.1708, + "step": 15339 + }, + { + "epoch": 0.75, + "grad_norm": 0.5875895023345947, + "learning_rate": 0.0005117370227796542, + "loss": 3.21, + "step": 15340 + }, + { + "epoch": 0.75, + "grad_norm": 0.5174129605293274, + "learning_rate": 0.0005117261155156109, + "loss": 3.0799, + "step": 15341 + }, + { + "epoch": 0.75, + "grad_norm": 0.5070399045944214, + "learning_rate": 0.0005117152076939218, + "loss": 3.294, + "step": 15342 + }, + { + "epoch": 0.75, + "grad_norm": 0.5156857967376709, + "learning_rate": 0.0005117042993146156, + "loss": 3.1872, + "step": 15343 + }, + { + "epoch": 0.75, + "grad_norm": 0.5003373026847839, + "learning_rate": 0.000511693390377721, + "loss": 2.9625, + "step": 15344 + }, + { + "epoch": 0.75, + "grad_norm": 0.5173194408416748, + "learning_rate": 0.0005116824808832668, + "loss": 3.0713, + "step": 15345 + }, + { + "epoch": 0.75, + "grad_norm": 0.5497887134552002, + "learning_rate": 0.0005116715708312817, + "loss": 3.2665, + "step": 15346 + }, + { + "epoch": 0.75, + "grad_norm": 0.5368115305900574, + "learning_rate": 0.0005116606602217944, + "loss": 3.3243, + "step": 15347 + }, + { + "epoch": 0.75, + "grad_norm": 0.5352510213851929, + "learning_rate": 0.0005116497490548335, + "loss": 3.0809, + "step": 15348 + }, + { + "epoch": 0.75, + "grad_norm": 0.5086609721183777, + "learning_rate": 0.0005116388373304281, + "loss": 3.2868, + "step": 15349 + }, + { + "epoch": 0.75, + "grad_norm": 0.5148516297340393, + "learning_rate": 0.0005116279250486067, + "loss": 3.1258, + "step": 15350 + }, + { + "epoch": 0.75, + "grad_norm": 0.5165446996688843, + "learning_rate": 0.000511617012209398, + "loss": 3.2221, + "step": 15351 + }, + { + "epoch": 0.75, + "grad_norm": 0.5249726176261902, + "learning_rate": 0.0005116060988128308, + "loss": 3.2277, + "step": 15352 + }, + { + "epoch": 0.75, + "grad_norm": 0.5012349486351013, + "learning_rate": 0.0005115951848589339, + "loss": 3.4042, + "step": 15353 + }, + { + "epoch": 0.75, + "grad_norm": 0.5562968850135803, + "learning_rate": 0.0005115842703477361, + "loss": 3.1529, + "step": 15354 + }, + { + "epoch": 0.75, + "grad_norm": 0.49537578225135803, + "learning_rate": 0.0005115733552792659, + "loss": 3.0784, + "step": 15355 + }, + { + "epoch": 0.75, + "grad_norm": 0.5626443028450012, + "learning_rate": 0.0005115624396535522, + "loss": 3.1834, + "step": 15356 + }, + { + "epoch": 0.75, + "grad_norm": 0.5942702889442444, + "learning_rate": 0.0005115515234706238, + "loss": 2.9109, + "step": 15357 + }, + { + "epoch": 0.75, + "grad_norm": 0.5349305272102356, + "learning_rate": 0.0005115406067305095, + "loss": 3.0626, + "step": 15358 + }, + { + "epoch": 0.75, + "grad_norm": 0.4917951822280884, + "learning_rate": 0.0005115296894332379, + "loss": 3.3152, + "step": 15359 + }, + { + "epoch": 0.75, + "grad_norm": 0.5094746351242065, + "learning_rate": 0.0005115187715788377, + "loss": 3.1875, + "step": 15360 + }, + { + "epoch": 0.75, + "grad_norm": 0.5121936798095703, + "learning_rate": 0.0005115078531673379, + "loss": 3.3627, + "step": 15361 + }, + { + "epoch": 0.75, + "grad_norm": 0.5854499340057373, + "learning_rate": 0.000511496934198767, + "loss": 3.2952, + "step": 15362 + }, + { + "epoch": 0.75, + "grad_norm": 0.5319592952728271, + "learning_rate": 0.000511486014673154, + "loss": 3.2366, + "step": 15363 + }, + { + "epoch": 0.75, + "grad_norm": 0.553443193435669, + "learning_rate": 0.0005114750945905275, + "loss": 3.0643, + "step": 15364 + }, + { + "epoch": 0.75, + "grad_norm": 0.5256295204162598, + "learning_rate": 0.0005114641739509162, + "loss": 3.1867, + "step": 15365 + }, + { + "epoch": 0.75, + "grad_norm": 0.5130583643913269, + "learning_rate": 0.0005114532527543492, + "loss": 3.2345, + "step": 15366 + }, + { + "epoch": 0.75, + "grad_norm": 0.5329893231391907, + "learning_rate": 0.0005114423310008547, + "loss": 3.4116, + "step": 15367 + }, + { + "epoch": 0.75, + "grad_norm": 0.5318312644958496, + "learning_rate": 0.000511431408690462, + "loss": 3.2325, + "step": 15368 + }, + { + "epoch": 0.75, + "grad_norm": 0.5396196842193604, + "learning_rate": 0.0005114204858231997, + "loss": 3.1291, + "step": 15369 + }, + { + "epoch": 0.75, + "grad_norm": 0.4775935113430023, + "learning_rate": 0.0005114095623990964, + "loss": 3.3107, + "step": 15370 + }, + { + "epoch": 0.75, + "grad_norm": 0.596052348613739, + "learning_rate": 0.000511398638418181, + "loss": 3.3655, + "step": 15371 + }, + { + "epoch": 0.75, + "grad_norm": 0.5218362212181091, + "learning_rate": 0.0005113877138804824, + "loss": 3.0283, + "step": 15372 + }, + { + "epoch": 0.75, + "grad_norm": 0.5000458359718323, + "learning_rate": 0.0005113767887860291, + "loss": 3.2683, + "step": 15373 + }, + { + "epoch": 0.75, + "grad_norm": 0.5197030901908875, + "learning_rate": 0.0005113658631348501, + "loss": 3.058, + "step": 15374 + }, + { + "epoch": 0.75, + "grad_norm": 0.5189664363861084, + "learning_rate": 0.000511354936926974, + "loss": 3.3382, + "step": 15375 + }, + { + "epoch": 0.75, + "grad_norm": 0.571043848991394, + "learning_rate": 0.0005113440101624299, + "loss": 3.0839, + "step": 15376 + }, + { + "epoch": 0.75, + "grad_norm": 0.5379744172096252, + "learning_rate": 0.0005113330828412461, + "loss": 3.1519, + "step": 15377 + }, + { + "epoch": 0.75, + "grad_norm": 0.4935210943222046, + "learning_rate": 0.0005113221549634517, + "loss": 3.1605, + "step": 15378 + }, + { + "epoch": 0.75, + "grad_norm": 0.5590223670005798, + "learning_rate": 0.0005113112265290755, + "loss": 3.3283, + "step": 15379 + }, + { + "epoch": 0.75, + "grad_norm": 0.5487520694732666, + "learning_rate": 0.0005113002975381462, + "loss": 3.4678, + "step": 15380 + }, + { + "epoch": 0.75, + "grad_norm": 0.5301699638366699, + "learning_rate": 0.0005112893679906926, + "loss": 3.0123, + "step": 15381 + }, + { + "epoch": 0.75, + "grad_norm": 0.5726925730705261, + "learning_rate": 0.0005112784378867432, + "loss": 3.1856, + "step": 15382 + }, + { + "epoch": 0.75, + "grad_norm": 0.5013753771781921, + "learning_rate": 0.0005112675072263273, + "loss": 3.1004, + "step": 15383 + }, + { + "epoch": 0.75, + "grad_norm": 0.5306413173675537, + "learning_rate": 0.0005112565760094734, + "loss": 3.1682, + "step": 15384 + }, + { + "epoch": 0.75, + "grad_norm": 0.4865894019603729, + "learning_rate": 0.0005112456442362104, + "loss": 3.1465, + "step": 15385 + }, + { + "epoch": 0.75, + "grad_norm": 0.5361476540565491, + "learning_rate": 0.0005112347119065669, + "loss": 3.5116, + "step": 15386 + }, + { + "epoch": 0.75, + "grad_norm": 0.5227615833282471, + "learning_rate": 0.0005112237790205719, + "loss": 3.2517, + "step": 15387 + }, + { + "epoch": 0.75, + "grad_norm": 0.5013512969017029, + "learning_rate": 0.000511212845578254, + "loss": 2.9083, + "step": 15388 + }, + { + "epoch": 0.75, + "grad_norm": 0.5280064940452576, + "learning_rate": 0.0005112019115796424, + "loss": 3.3894, + "step": 15389 + }, + { + "epoch": 0.75, + "grad_norm": 0.5008797645568848, + "learning_rate": 0.0005111909770247653, + "loss": 3.4333, + "step": 15390 + }, + { + "epoch": 0.75, + "grad_norm": 0.5193240642547607, + "learning_rate": 0.000511180041913652, + "loss": 3.1138, + "step": 15391 + }, + { + "epoch": 0.75, + "grad_norm": 0.49930593371391296, + "learning_rate": 0.000511169106246331, + "loss": 3.3978, + "step": 15392 + }, + { + "epoch": 0.75, + "grad_norm": 0.48021283745765686, + "learning_rate": 0.0005111581700228313, + "loss": 3.3122, + "step": 15393 + }, + { + "epoch": 0.75, + "grad_norm": 0.5200793147087097, + "learning_rate": 0.0005111472332431815, + "loss": 3.2217, + "step": 15394 + }, + { + "epoch": 0.75, + "grad_norm": 0.5211922526359558, + "learning_rate": 0.0005111362959074106, + "loss": 3.1195, + "step": 15395 + }, + { + "epoch": 0.75, + "grad_norm": 0.5001904368400574, + "learning_rate": 0.0005111253580155474, + "loss": 3.1291, + "step": 15396 + }, + { + "epoch": 0.75, + "grad_norm": 0.5240308046340942, + "learning_rate": 0.0005111144195676206, + "loss": 3.3654, + "step": 15397 + }, + { + "epoch": 0.75, + "grad_norm": 0.5145796537399292, + "learning_rate": 0.0005111034805636589, + "loss": 3.1215, + "step": 15398 + }, + { + "epoch": 0.75, + "grad_norm": 0.517951488494873, + "learning_rate": 0.0005110925410036914, + "loss": 3.1964, + "step": 15399 + }, + { + "epoch": 0.75, + "grad_norm": 0.5089027285575867, + "learning_rate": 0.0005110816008877468, + "loss": 3.2889, + "step": 15400 + }, + { + "epoch": 0.75, + "grad_norm": 0.5206027626991272, + "learning_rate": 0.0005110706602158539, + "loss": 3.2054, + "step": 15401 + }, + { + "epoch": 0.75, + "grad_norm": 0.522180438041687, + "learning_rate": 0.0005110597189880414, + "loss": 3.2392, + "step": 15402 + }, + { + "epoch": 0.75, + "grad_norm": 0.5154605507850647, + "learning_rate": 0.0005110487772043383, + "loss": 3.1462, + "step": 15403 + }, + { + "epoch": 0.75, + "grad_norm": 0.6345930695533752, + "learning_rate": 0.0005110378348647732, + "loss": 3.4421, + "step": 15404 + }, + { + "epoch": 0.75, + "grad_norm": 0.5428985357284546, + "learning_rate": 0.0005110268919693752, + "loss": 3.1896, + "step": 15405 + }, + { + "epoch": 0.76, + "grad_norm": 0.5568393468856812, + "learning_rate": 0.0005110159485181729, + "loss": 3.1683, + "step": 15406 + }, + { + "epoch": 0.76, + "grad_norm": 0.5200351476669312, + "learning_rate": 0.0005110050045111953, + "loss": 3.0301, + "step": 15407 + }, + { + "epoch": 0.76, + "grad_norm": 0.5830309391021729, + "learning_rate": 0.000510994059948471, + "loss": 3.2447, + "step": 15408 + }, + { + "epoch": 0.76, + "grad_norm": 0.5186257362365723, + "learning_rate": 0.000510983114830029, + "loss": 3.4328, + "step": 15409 + }, + { + "epoch": 0.76, + "grad_norm": 0.5107519626617432, + "learning_rate": 0.0005109721691558981, + "loss": 3.4096, + "step": 15410 + }, + { + "epoch": 0.76, + "grad_norm": 0.5244288444519043, + "learning_rate": 0.0005109612229261073, + "loss": 3.1377, + "step": 15411 + }, + { + "epoch": 0.76, + "grad_norm": 0.47387999296188354, + "learning_rate": 0.000510950276140685, + "loss": 3.3397, + "step": 15412 + }, + { + "epoch": 0.76, + "grad_norm": 0.5340439677238464, + "learning_rate": 0.0005109393287996602, + "loss": 3.1848, + "step": 15413 + }, + { + "epoch": 0.76, + "grad_norm": 0.506712794303894, + "learning_rate": 0.000510928380903062, + "loss": 3.1481, + "step": 15414 + }, + { + "epoch": 0.76, + "grad_norm": 0.5220345854759216, + "learning_rate": 0.0005109174324509189, + "loss": 3.0608, + "step": 15415 + }, + { + "epoch": 0.76, + "grad_norm": 0.48933979868888855, + "learning_rate": 0.00051090648344326, + "loss": 3.0937, + "step": 15416 + }, + { + "epoch": 0.76, + "grad_norm": 0.5643341541290283, + "learning_rate": 0.0005108955338801139, + "loss": 2.9736, + "step": 15417 + }, + { + "epoch": 0.76, + "grad_norm": 0.5401406288146973, + "learning_rate": 0.0005108845837615096, + "loss": 3.1143, + "step": 15418 + }, + { + "epoch": 0.76, + "grad_norm": 0.5696349740028381, + "learning_rate": 0.0005108736330874759, + "loss": 3.4066, + "step": 15419 + }, + { + "epoch": 0.76, + "grad_norm": 0.5899490118026733, + "learning_rate": 0.0005108626818580415, + "loss": 3.0104, + "step": 15420 + }, + { + "epoch": 0.76, + "grad_norm": 0.515295684337616, + "learning_rate": 0.0005108517300732356, + "loss": 3.2919, + "step": 15421 + }, + { + "epoch": 0.76, + "grad_norm": 0.5573397874832153, + "learning_rate": 0.0005108407777330867, + "loss": 3.061, + "step": 15422 + }, + { + "epoch": 0.76, + "grad_norm": 0.5408274531364441, + "learning_rate": 0.0005108298248376238, + "loss": 3.4797, + "step": 15423 + }, + { + "epoch": 0.76, + "grad_norm": 0.5029387474060059, + "learning_rate": 0.0005108188713868758, + "loss": 3.2384, + "step": 15424 + }, + { + "epoch": 0.76, + "grad_norm": 0.4932806193828583, + "learning_rate": 0.0005108079173808713, + "loss": 3.2024, + "step": 15425 + }, + { + "epoch": 0.76, + "grad_norm": 0.530828058719635, + "learning_rate": 0.0005107969628196393, + "loss": 3.1832, + "step": 15426 + }, + { + "epoch": 0.76, + "grad_norm": 0.528161883354187, + "learning_rate": 0.0005107860077032088, + "loss": 3.1924, + "step": 15427 + }, + { + "epoch": 0.76, + "grad_norm": 0.5298112630844116, + "learning_rate": 0.0005107750520316085, + "loss": 3.2096, + "step": 15428 + }, + { + "epoch": 0.76, + "grad_norm": 0.5262047648429871, + "learning_rate": 0.0005107640958048673, + "loss": 3.2419, + "step": 15429 + }, + { + "epoch": 0.76, + "grad_norm": 0.5328362584114075, + "learning_rate": 0.000510753139023014, + "loss": 3.0983, + "step": 15430 + }, + { + "epoch": 0.76, + "grad_norm": 0.5387527346611023, + "learning_rate": 0.0005107421816860774, + "loss": 3.1221, + "step": 15431 + }, + { + "epoch": 0.76, + "grad_norm": 0.5194116234779358, + "learning_rate": 0.0005107312237940866, + "loss": 3.1749, + "step": 15432 + }, + { + "epoch": 0.76, + "grad_norm": 0.506995677947998, + "learning_rate": 0.0005107202653470703, + "loss": 3.1372, + "step": 15433 + }, + { + "epoch": 0.76, + "grad_norm": 0.5215865969657898, + "learning_rate": 0.0005107093063450573, + "loss": 3.1794, + "step": 15434 + }, + { + "epoch": 0.76, + "grad_norm": 0.5082940459251404, + "learning_rate": 0.0005106983467880765, + "loss": 3.0021, + "step": 15435 + }, + { + "epoch": 0.76, + "grad_norm": 0.5554096102714539, + "learning_rate": 0.0005106873866761569, + "loss": 3.2256, + "step": 15436 + }, + { + "epoch": 0.76, + "grad_norm": 0.5048863291740417, + "learning_rate": 0.0005106764260093273, + "loss": 3.3262, + "step": 15437 + }, + { + "epoch": 0.76, + "grad_norm": 0.507644534111023, + "learning_rate": 0.0005106654647876165, + "loss": 3.1868, + "step": 15438 + }, + { + "epoch": 0.76, + "grad_norm": 0.5397703051567078, + "learning_rate": 0.0005106545030110533, + "loss": 3.0611, + "step": 15439 + }, + { + "epoch": 0.76, + "grad_norm": 0.524456262588501, + "learning_rate": 0.0005106435406796668, + "loss": 3.3257, + "step": 15440 + }, + { + "epoch": 0.76, + "grad_norm": 0.49036774039268494, + "learning_rate": 0.0005106325777934857, + "loss": 3.5181, + "step": 15441 + }, + { + "epoch": 0.76, + "grad_norm": 0.5018916726112366, + "learning_rate": 0.000510621614352539, + "loss": 3.1098, + "step": 15442 + }, + { + "epoch": 0.76, + "grad_norm": 0.5243989825248718, + "learning_rate": 0.0005106106503568555, + "loss": 3.1468, + "step": 15443 + }, + { + "epoch": 0.76, + "grad_norm": 0.5382325649261475, + "learning_rate": 0.000510599685806464, + "loss": 3.2289, + "step": 15444 + }, + { + "epoch": 0.76, + "grad_norm": 0.5662506222724915, + "learning_rate": 0.0005105887207013934, + "loss": 3.1284, + "step": 15445 + }, + { + "epoch": 0.76, + "grad_norm": 0.5015741586685181, + "learning_rate": 0.0005105777550416728, + "loss": 3.3105, + "step": 15446 + }, + { + "epoch": 0.76, + "grad_norm": 0.5282851457595825, + "learning_rate": 0.0005105667888273309, + "loss": 3.2448, + "step": 15447 + }, + { + "epoch": 0.76, + "grad_norm": 0.5204810500144958, + "learning_rate": 0.0005105558220583965, + "loss": 3.2382, + "step": 15448 + }, + { + "epoch": 0.76, + "grad_norm": 0.5286573171615601, + "learning_rate": 0.0005105448547348986, + "loss": 3.2456, + "step": 15449 + }, + { + "epoch": 0.76, + "grad_norm": 0.5265070199966431, + "learning_rate": 0.0005105338868568661, + "loss": 3.2492, + "step": 15450 + }, + { + "epoch": 0.76, + "grad_norm": 0.5228598713874817, + "learning_rate": 0.0005105229184243279, + "loss": 3.1497, + "step": 15451 + }, + { + "epoch": 0.76, + "grad_norm": 0.5335772633552551, + "learning_rate": 0.0005105119494373128, + "loss": 3.2094, + "step": 15452 + }, + { + "epoch": 0.76, + "grad_norm": 0.4976807236671448, + "learning_rate": 0.0005105009798958498, + "loss": 3.1789, + "step": 15453 + }, + { + "epoch": 0.76, + "grad_norm": 0.5059366822242737, + "learning_rate": 0.0005104900097999676, + "loss": 3.2996, + "step": 15454 + }, + { + "epoch": 0.76, + "grad_norm": 0.4928973913192749, + "learning_rate": 0.0005104790391496953, + "loss": 3.2344, + "step": 15455 + }, + { + "epoch": 0.76, + "grad_norm": 0.48326247930526733, + "learning_rate": 0.0005104680679450618, + "loss": 3.3739, + "step": 15456 + }, + { + "epoch": 0.76, + "grad_norm": 0.5389878749847412, + "learning_rate": 0.0005104570961860958, + "loss": 3.0607, + "step": 15457 + }, + { + "epoch": 0.76, + "grad_norm": 0.5126422643661499, + "learning_rate": 0.0005104461238728264, + "loss": 3.2553, + "step": 15458 + }, + { + "epoch": 0.76, + "grad_norm": 0.5000913143157959, + "learning_rate": 0.0005104351510052823, + "loss": 3.156, + "step": 15459 + }, + { + "epoch": 0.76, + "grad_norm": 0.4934830665588379, + "learning_rate": 0.0005104241775834926, + "loss": 3.2617, + "step": 15460 + }, + { + "epoch": 0.76, + "grad_norm": 0.5197305083274841, + "learning_rate": 0.0005104132036074862, + "loss": 3.2124, + "step": 15461 + }, + { + "epoch": 0.76, + "grad_norm": 0.4591943025588989, + "learning_rate": 0.0005104022290772918, + "loss": 3.1996, + "step": 15462 + }, + { + "epoch": 0.76, + "grad_norm": 0.5356560349464417, + "learning_rate": 0.0005103912539929383, + "loss": 3.1692, + "step": 15463 + }, + { + "epoch": 0.76, + "grad_norm": 0.5015002489089966, + "learning_rate": 0.0005103802783544549, + "loss": 3.0953, + "step": 15464 + }, + { + "epoch": 0.76, + "grad_norm": 0.5325064063072205, + "learning_rate": 0.0005103693021618704, + "loss": 3.1567, + "step": 15465 + }, + { + "epoch": 0.76, + "grad_norm": 0.5010794401168823, + "learning_rate": 0.0005103583254152134, + "loss": 3.0977, + "step": 15466 + }, + { + "epoch": 0.76, + "grad_norm": 0.644633948802948, + "learning_rate": 0.0005103473481145132, + "loss": 3.4053, + "step": 15467 + }, + { + "epoch": 0.76, + "grad_norm": 0.5222316980361938, + "learning_rate": 0.0005103363702597986, + "loss": 3.3435, + "step": 15468 + }, + { + "epoch": 0.76, + "grad_norm": 0.5279338955879211, + "learning_rate": 0.0005103253918510984, + "loss": 3.2726, + "step": 15469 + }, + { + "epoch": 0.76, + "grad_norm": 0.5039032101631165, + "learning_rate": 0.0005103144128884416, + "loss": 3.1634, + "step": 15470 + }, + { + "epoch": 0.76, + "grad_norm": 0.5194478631019592, + "learning_rate": 0.0005103034333718572, + "loss": 3.2469, + "step": 15471 + }, + { + "epoch": 0.76, + "grad_norm": 0.5506890416145325, + "learning_rate": 0.000510292453301374, + "loss": 3.4793, + "step": 15472 + }, + { + "epoch": 0.76, + "grad_norm": 0.5099464058876038, + "learning_rate": 0.000510281472677021, + "loss": 3.2517, + "step": 15473 + }, + { + "epoch": 0.76, + "grad_norm": 0.5186891555786133, + "learning_rate": 0.0005102704914988269, + "loss": 3.1561, + "step": 15474 + }, + { + "epoch": 0.76, + "grad_norm": 0.4984113276004791, + "learning_rate": 0.0005102595097668209, + "loss": 3.2346, + "step": 15475 + }, + { + "epoch": 0.76, + "grad_norm": 0.5584531426429749, + "learning_rate": 0.0005102485274810319, + "loss": 3.1323, + "step": 15476 + }, + { + "epoch": 0.76, + "grad_norm": 0.5731603503227234, + "learning_rate": 0.0005102375446414886, + "loss": 3.2928, + "step": 15477 + }, + { + "epoch": 0.76, + "grad_norm": 0.4951866865158081, + "learning_rate": 0.0005102265612482201, + "loss": 3.2515, + "step": 15478 + }, + { + "epoch": 0.76, + "grad_norm": 0.49377912282943726, + "learning_rate": 0.0005102155773012553, + "loss": 2.8477, + "step": 15479 + }, + { + "epoch": 0.76, + "grad_norm": 0.5461580157279968, + "learning_rate": 0.0005102045928006231, + "loss": 3.1564, + "step": 15480 + }, + { + "epoch": 0.76, + "grad_norm": 0.6521041393280029, + "learning_rate": 0.0005101936077463525, + "loss": 3.3003, + "step": 15481 + }, + { + "epoch": 0.76, + "grad_norm": 0.48901596665382385, + "learning_rate": 0.0005101826221384724, + "loss": 3.1159, + "step": 15482 + }, + { + "epoch": 0.76, + "grad_norm": 0.5265803933143616, + "learning_rate": 0.0005101716359770117, + "loss": 3.3615, + "step": 15483 + }, + { + "epoch": 0.76, + "grad_norm": 0.5226854681968689, + "learning_rate": 0.0005101606492619994, + "loss": 3.0614, + "step": 15484 + }, + { + "epoch": 0.76, + "grad_norm": 0.5515998601913452, + "learning_rate": 0.0005101496619934644, + "loss": 3.2238, + "step": 15485 + }, + { + "epoch": 0.76, + "grad_norm": 0.487713485956192, + "learning_rate": 0.0005101386741714356, + "loss": 3.2274, + "step": 15486 + }, + { + "epoch": 0.76, + "grad_norm": 0.49414488673210144, + "learning_rate": 0.0005101276857959419, + "loss": 3.247, + "step": 15487 + }, + { + "epoch": 0.76, + "grad_norm": 0.5462265610694885, + "learning_rate": 0.0005101166968670125, + "loss": 3.2225, + "step": 15488 + }, + { + "epoch": 0.76, + "grad_norm": 0.5077669620513916, + "learning_rate": 0.000510105707384676, + "loss": 3.2476, + "step": 15489 + }, + { + "epoch": 0.76, + "grad_norm": 0.5247009992599487, + "learning_rate": 0.0005100947173489615, + "loss": 2.8845, + "step": 15490 + }, + { + "epoch": 0.76, + "grad_norm": 0.4996735751628876, + "learning_rate": 0.000510083726759898, + "loss": 3.262, + "step": 15491 + }, + { + "epoch": 0.76, + "grad_norm": 0.5037980675697327, + "learning_rate": 0.0005100727356175145, + "loss": 3.072, + "step": 15492 + }, + { + "epoch": 0.76, + "grad_norm": 0.515090823173523, + "learning_rate": 0.0005100617439218397, + "loss": 3.2084, + "step": 15493 + }, + { + "epoch": 0.76, + "grad_norm": 0.5135362148284912, + "learning_rate": 0.0005100507516729027, + "loss": 3.0892, + "step": 15494 + }, + { + "epoch": 0.76, + "grad_norm": 0.4950462579727173, + "learning_rate": 0.0005100397588707325, + "loss": 3.0106, + "step": 15495 + }, + { + "epoch": 0.76, + "grad_norm": 0.4800357520580292, + "learning_rate": 0.000510028765515358, + "loss": 3.2204, + "step": 15496 + }, + { + "epoch": 0.76, + "grad_norm": 0.534273087978363, + "learning_rate": 0.000510017771606808, + "loss": 3.1293, + "step": 15497 + }, + { + "epoch": 0.76, + "grad_norm": 0.5002960562705994, + "learning_rate": 0.0005100067771451118, + "loss": 3.2213, + "step": 15498 + }, + { + "epoch": 0.76, + "grad_norm": 0.5246545076370239, + "learning_rate": 0.0005099957821302981, + "loss": 3.2642, + "step": 15499 + }, + { + "epoch": 0.76, + "grad_norm": 0.5540328621864319, + "learning_rate": 0.000509984786562396, + "loss": 3.1108, + "step": 15500 + }, + { + "epoch": 0.76, + "grad_norm": 0.49777376651763916, + "learning_rate": 0.0005099737904414342, + "loss": 3.2185, + "step": 15501 + }, + { + "epoch": 0.76, + "grad_norm": 0.5149266719818115, + "learning_rate": 0.0005099627937674421, + "loss": 3.2102, + "step": 15502 + }, + { + "epoch": 0.76, + "grad_norm": 0.4814930260181427, + "learning_rate": 0.0005099517965404482, + "loss": 3.1635, + "step": 15503 + }, + { + "epoch": 0.76, + "grad_norm": 0.48603329062461853, + "learning_rate": 0.0005099407987604818, + "loss": 3.2968, + "step": 15504 + }, + { + "epoch": 0.76, + "grad_norm": 0.4911821484565735, + "learning_rate": 0.0005099298004275717, + "loss": 3.3795, + "step": 15505 + }, + { + "epoch": 0.76, + "grad_norm": 0.5378304719924927, + "learning_rate": 0.0005099188015417468, + "loss": 3.1863, + "step": 15506 + }, + { + "epoch": 0.76, + "grad_norm": 0.5142403841018677, + "learning_rate": 0.0005099078021030362, + "loss": 3.2434, + "step": 15507 + }, + { + "epoch": 0.76, + "grad_norm": 0.5013147592544556, + "learning_rate": 0.000509896802111469, + "loss": 3.1608, + "step": 15508 + }, + { + "epoch": 0.76, + "grad_norm": 0.4968760907649994, + "learning_rate": 0.000509885801567074, + "loss": 3.195, + "step": 15509 + }, + { + "epoch": 0.76, + "grad_norm": 0.5242990255355835, + "learning_rate": 0.0005098748004698801, + "loss": 3.2617, + "step": 15510 + }, + { + "epoch": 0.76, + "grad_norm": 0.5505073070526123, + "learning_rate": 0.0005098637988199163, + "loss": 3.0103, + "step": 15511 + }, + { + "epoch": 0.76, + "grad_norm": 0.5673673152923584, + "learning_rate": 0.0005098527966172117, + "loss": 3.1611, + "step": 15512 + }, + { + "epoch": 0.76, + "grad_norm": 0.5113774538040161, + "learning_rate": 0.0005098417938617952, + "loss": 3.4058, + "step": 15513 + }, + { + "epoch": 0.76, + "grad_norm": 0.4958935081958771, + "learning_rate": 0.0005098307905536959, + "loss": 3.1639, + "step": 15514 + }, + { + "epoch": 0.76, + "grad_norm": 0.5038467645645142, + "learning_rate": 0.0005098197866929427, + "loss": 3.1384, + "step": 15515 + }, + { + "epoch": 0.76, + "grad_norm": 0.493192195892334, + "learning_rate": 0.0005098087822795645, + "loss": 3.1471, + "step": 15516 + }, + { + "epoch": 0.76, + "grad_norm": 0.5338720679283142, + "learning_rate": 0.0005097977773135903, + "loss": 3.2762, + "step": 15517 + }, + { + "epoch": 0.76, + "grad_norm": 0.5778473615646362, + "learning_rate": 0.0005097867717950492, + "loss": 3.1879, + "step": 15518 + }, + { + "epoch": 0.76, + "grad_norm": 0.5032196044921875, + "learning_rate": 0.0005097757657239701, + "loss": 3.1646, + "step": 15519 + }, + { + "epoch": 0.76, + "grad_norm": 0.5262255668640137, + "learning_rate": 0.000509764759100382, + "loss": 3.2137, + "step": 15520 + }, + { + "epoch": 0.76, + "grad_norm": 0.49957624077796936, + "learning_rate": 0.0005097537519243139, + "loss": 3.111, + "step": 15521 + }, + { + "epoch": 0.76, + "grad_norm": 0.5052130818367004, + "learning_rate": 0.0005097427441957948, + "loss": 3.1133, + "step": 15522 + }, + { + "epoch": 0.76, + "grad_norm": 0.48932185769081116, + "learning_rate": 0.0005097317359148536, + "loss": 3.3346, + "step": 15523 + }, + { + "epoch": 0.76, + "grad_norm": 0.5048071146011353, + "learning_rate": 0.0005097207270815194, + "loss": 3.322, + "step": 15524 + }, + { + "epoch": 0.76, + "grad_norm": 0.5462115406990051, + "learning_rate": 0.0005097097176958212, + "loss": 2.9782, + "step": 15525 + }, + { + "epoch": 0.76, + "grad_norm": 0.5595006942749023, + "learning_rate": 0.000509698707757788, + "loss": 2.9495, + "step": 15526 + }, + { + "epoch": 0.76, + "grad_norm": 0.5353357791900635, + "learning_rate": 0.0005096876972674486, + "loss": 3.2685, + "step": 15527 + }, + { + "epoch": 0.76, + "grad_norm": 0.4759270250797272, + "learning_rate": 0.0005096766862248323, + "loss": 3.1232, + "step": 15528 + }, + { + "epoch": 0.76, + "grad_norm": 0.5706182718276978, + "learning_rate": 0.000509665674629968, + "loss": 3.0231, + "step": 15529 + }, + { + "epoch": 0.76, + "grad_norm": 0.5052098631858826, + "learning_rate": 0.0005096546624828845, + "loss": 3.12, + "step": 15530 + }, + { + "epoch": 0.76, + "grad_norm": 0.5506969690322876, + "learning_rate": 0.0005096436497836111, + "loss": 3.1312, + "step": 15531 + }, + { + "epoch": 0.76, + "grad_norm": 0.5286117792129517, + "learning_rate": 0.0005096326365321767, + "loss": 3.2991, + "step": 15532 + }, + { + "epoch": 0.76, + "grad_norm": 0.5287132859230042, + "learning_rate": 0.0005096216227286102, + "loss": 3.136, + "step": 15533 + }, + { + "epoch": 0.76, + "grad_norm": 0.512438952922821, + "learning_rate": 0.0005096106083729406, + "loss": 3.2366, + "step": 15534 + }, + { + "epoch": 0.76, + "grad_norm": 0.5107895731925964, + "learning_rate": 0.0005095995934651972, + "loss": 3.4381, + "step": 15535 + }, + { + "epoch": 0.76, + "grad_norm": 0.523353636264801, + "learning_rate": 0.0005095885780054087, + "loss": 3.1528, + "step": 15536 + }, + { + "epoch": 0.76, + "grad_norm": 0.5121175646781921, + "learning_rate": 0.0005095775619936042, + "loss": 3.2085, + "step": 15537 + }, + { + "epoch": 0.76, + "grad_norm": 0.5396693348884583, + "learning_rate": 0.0005095665454298129, + "loss": 2.9958, + "step": 15538 + }, + { + "epoch": 0.76, + "grad_norm": 0.49359792470932007, + "learning_rate": 0.0005095555283140635, + "loss": 3.0549, + "step": 15539 + }, + { + "epoch": 0.76, + "grad_norm": 0.584166944026947, + "learning_rate": 0.0005095445106463852, + "loss": 3.0041, + "step": 15540 + }, + { + "epoch": 0.76, + "grad_norm": 0.5615528225898743, + "learning_rate": 0.000509533492426807, + "loss": 3.2767, + "step": 15541 + }, + { + "epoch": 0.76, + "grad_norm": 0.5622826814651489, + "learning_rate": 0.0005095224736553578, + "loss": 3.09, + "step": 15542 + }, + { + "epoch": 0.76, + "grad_norm": 0.5125862956047058, + "learning_rate": 0.0005095114543320668, + "loss": 3.2732, + "step": 15543 + }, + { + "epoch": 0.76, + "grad_norm": 0.507877767086029, + "learning_rate": 0.000509500434456963, + "loss": 2.8684, + "step": 15544 + }, + { + "epoch": 0.76, + "grad_norm": 0.5564316511154175, + "learning_rate": 0.0005094894140300753, + "loss": 3.1338, + "step": 15545 + }, + { + "epoch": 0.76, + "grad_norm": 0.5386070609092712, + "learning_rate": 0.0005094783930514329, + "loss": 3.0938, + "step": 15546 + }, + { + "epoch": 0.76, + "grad_norm": 0.4967600405216217, + "learning_rate": 0.0005094673715210647, + "loss": 3.153, + "step": 15547 + }, + { + "epoch": 0.76, + "grad_norm": 0.4921207129955292, + "learning_rate": 0.0005094563494389997, + "loss": 3.1201, + "step": 15548 + }, + { + "epoch": 0.76, + "grad_norm": 0.5483155250549316, + "learning_rate": 0.0005094453268052671, + "loss": 2.9707, + "step": 15549 + }, + { + "epoch": 0.76, + "grad_norm": 0.5265492796897888, + "learning_rate": 0.0005094343036198956, + "loss": 3.1635, + "step": 15550 + }, + { + "epoch": 0.76, + "grad_norm": 0.5425223708152771, + "learning_rate": 0.0005094232798829146, + "loss": 2.9743, + "step": 15551 + }, + { + "epoch": 0.76, + "grad_norm": 0.5007231831550598, + "learning_rate": 0.0005094122555943529, + "loss": 3.2458, + "step": 15552 + }, + { + "epoch": 0.76, + "grad_norm": 0.5598417520523071, + "learning_rate": 0.0005094012307542398, + "loss": 3.128, + "step": 15553 + }, + { + "epoch": 0.76, + "grad_norm": 0.5178683996200562, + "learning_rate": 0.000509390205362604, + "loss": 3.3222, + "step": 15554 + }, + { + "epoch": 0.76, + "grad_norm": 0.5192438364028931, + "learning_rate": 0.0005093791794194747, + "loss": 3.2078, + "step": 15555 + }, + { + "epoch": 0.76, + "grad_norm": 0.5315037369728088, + "learning_rate": 0.000509368152924881, + "loss": 3.1406, + "step": 15556 + }, + { + "epoch": 0.76, + "grad_norm": 0.5374244451522827, + "learning_rate": 0.0005093571258788518, + "loss": 3.0132, + "step": 15557 + }, + { + "epoch": 0.76, + "grad_norm": 0.5307905077934265, + "learning_rate": 0.0005093460982814164, + "loss": 3.0444, + "step": 15558 + }, + { + "epoch": 0.76, + "grad_norm": 0.5297430157661438, + "learning_rate": 0.0005093350701326035, + "loss": 3.3374, + "step": 15559 + }, + { + "epoch": 0.76, + "grad_norm": 0.5116652846336365, + "learning_rate": 0.0005093240414324423, + "loss": 3.1824, + "step": 15560 + }, + { + "epoch": 0.76, + "grad_norm": 0.5311102271080017, + "learning_rate": 0.000509313012180962, + "loss": 3.1643, + "step": 15561 + }, + { + "epoch": 0.76, + "grad_norm": 0.5218523740768433, + "learning_rate": 0.0005093019823781915, + "loss": 3.2827, + "step": 15562 + }, + { + "epoch": 0.76, + "grad_norm": 0.5624101161956787, + "learning_rate": 0.0005092909520241596, + "loss": 3.2227, + "step": 15563 + }, + { + "epoch": 0.76, + "grad_norm": 0.5099868178367615, + "learning_rate": 0.0005092799211188959, + "loss": 3.1073, + "step": 15564 + }, + { + "epoch": 0.76, + "grad_norm": 0.5127807259559631, + "learning_rate": 0.000509268889662429, + "loss": 3.2646, + "step": 15565 + }, + { + "epoch": 0.76, + "grad_norm": 0.528416097164154, + "learning_rate": 0.0005092578576547882, + "loss": 3.1796, + "step": 15566 + }, + { + "epoch": 0.76, + "grad_norm": 0.49281713366508484, + "learning_rate": 0.0005092468250960025, + "loss": 3.1264, + "step": 15567 + }, + { + "epoch": 0.76, + "grad_norm": 0.5014824271202087, + "learning_rate": 0.0005092357919861009, + "loss": 3.1486, + "step": 15568 + }, + { + "epoch": 0.76, + "grad_norm": 0.4935286045074463, + "learning_rate": 0.0005092247583251124, + "loss": 3.2687, + "step": 15569 + }, + { + "epoch": 0.76, + "grad_norm": 0.520007312297821, + "learning_rate": 0.0005092137241130662, + "loss": 3.1096, + "step": 15570 + }, + { + "epoch": 0.76, + "grad_norm": 0.5099589824676514, + "learning_rate": 0.0005092026893499914, + "loss": 3.2324, + "step": 15571 + }, + { + "epoch": 0.76, + "grad_norm": 0.5131465196609497, + "learning_rate": 0.0005091916540359169, + "loss": 3.0389, + "step": 15572 + }, + { + "epoch": 0.76, + "grad_norm": 0.49344658851623535, + "learning_rate": 0.0005091806181708719, + "loss": 3.3404, + "step": 15573 + }, + { + "epoch": 0.76, + "grad_norm": 0.5237314701080322, + "learning_rate": 0.0005091695817548853, + "loss": 3.1772, + "step": 15574 + }, + { + "epoch": 0.76, + "grad_norm": 0.5355901122093201, + "learning_rate": 0.0005091585447879864, + "loss": 3.2269, + "step": 15575 + }, + { + "epoch": 0.76, + "grad_norm": 0.5290843844413757, + "learning_rate": 0.0005091475072702041, + "loss": 3.0839, + "step": 15576 + }, + { + "epoch": 0.76, + "grad_norm": 0.5169116258621216, + "learning_rate": 0.0005091364692015676, + "loss": 3.2695, + "step": 15577 + }, + { + "epoch": 0.76, + "grad_norm": 0.6524379253387451, + "learning_rate": 0.0005091254305821058, + "loss": 3.4017, + "step": 15578 + }, + { + "epoch": 0.76, + "grad_norm": 0.5682939291000366, + "learning_rate": 0.0005091143914118478, + "loss": 3.2909, + "step": 15579 + }, + { + "epoch": 0.76, + "grad_norm": 0.5173560976982117, + "learning_rate": 0.0005091033516908228, + "loss": 3.1853, + "step": 15580 + }, + { + "epoch": 0.76, + "grad_norm": 0.5009781718254089, + "learning_rate": 0.00050909231141906, + "loss": 3.3297, + "step": 15581 + }, + { + "epoch": 0.76, + "grad_norm": 0.5405966639518738, + "learning_rate": 0.0005090812705965881, + "loss": 3.1403, + "step": 15582 + }, + { + "epoch": 0.76, + "grad_norm": 0.4971311390399933, + "learning_rate": 0.0005090702292234363, + "loss": 3.1587, + "step": 15583 + }, + { + "epoch": 0.76, + "grad_norm": 0.5876317024230957, + "learning_rate": 0.0005090591872996338, + "loss": 3.1248, + "step": 15584 + }, + { + "epoch": 0.76, + "grad_norm": 0.665259599685669, + "learning_rate": 0.0005090481448252098, + "loss": 3.1146, + "step": 15585 + }, + { + "epoch": 0.76, + "grad_norm": 0.5335424542427063, + "learning_rate": 0.000509037101800193, + "loss": 3.1938, + "step": 15586 + }, + { + "epoch": 0.76, + "grad_norm": 0.5514816045761108, + "learning_rate": 0.0005090260582246128, + "loss": 3.3065, + "step": 15587 + }, + { + "epoch": 0.76, + "grad_norm": 0.5073038339614868, + "learning_rate": 0.0005090150140984982, + "loss": 3.2372, + "step": 15588 + }, + { + "epoch": 0.76, + "grad_norm": 0.5548021793365479, + "learning_rate": 0.0005090039694218782, + "loss": 3.1944, + "step": 15589 + }, + { + "epoch": 0.76, + "grad_norm": 0.5426687598228455, + "learning_rate": 0.0005089929241947821, + "loss": 3.3712, + "step": 15590 + }, + { + "epoch": 0.76, + "grad_norm": 0.5318143963813782, + "learning_rate": 0.0005089818784172388, + "loss": 3.1281, + "step": 15591 + }, + { + "epoch": 0.76, + "grad_norm": 0.5172745585441589, + "learning_rate": 0.0005089708320892774, + "loss": 3.1421, + "step": 15592 + }, + { + "epoch": 0.76, + "grad_norm": 0.5684549808502197, + "learning_rate": 0.0005089597852109271, + "loss": 3.0396, + "step": 15593 + }, + { + "epoch": 0.76, + "grad_norm": 0.5054349899291992, + "learning_rate": 0.000508948737782217, + "loss": 3.2081, + "step": 15594 + }, + { + "epoch": 0.76, + "grad_norm": 0.4890936315059662, + "learning_rate": 0.0005089376898031761, + "loss": 3.3387, + "step": 15595 + }, + { + "epoch": 0.76, + "grad_norm": 0.5217352509498596, + "learning_rate": 0.0005089266412738334, + "loss": 3.1863, + "step": 15596 + }, + { + "epoch": 0.76, + "grad_norm": 0.529723048210144, + "learning_rate": 0.0005089155921942183, + "loss": 3.0914, + "step": 15597 + }, + { + "epoch": 0.76, + "grad_norm": 0.5619720816612244, + "learning_rate": 0.0005089045425643596, + "loss": 3.222, + "step": 15598 + }, + { + "epoch": 0.76, + "grad_norm": 0.5275721549987793, + "learning_rate": 0.0005088934923842866, + "loss": 3.2866, + "step": 15599 + }, + { + "epoch": 0.76, + "grad_norm": 0.5122009515762329, + "learning_rate": 0.0005088824416540285, + "loss": 3.3297, + "step": 15600 + }, + { + "epoch": 0.76, + "grad_norm": 0.5561813712120056, + "learning_rate": 0.000508871390373614, + "loss": 3.218, + "step": 15601 + }, + { + "epoch": 0.76, + "grad_norm": 0.5213872194290161, + "learning_rate": 0.0005088603385430727, + "loss": 3.2862, + "step": 15602 + }, + { + "epoch": 0.76, + "grad_norm": 0.532748281955719, + "learning_rate": 0.0005088492861624332, + "loss": 3.2017, + "step": 15603 + }, + { + "epoch": 0.76, + "grad_norm": 0.5274077653884888, + "learning_rate": 0.000508838233231725, + "loss": 3.2662, + "step": 15604 + }, + { + "epoch": 0.76, + "grad_norm": 0.5420587062835693, + "learning_rate": 0.0005088271797509771, + "loss": 3.2567, + "step": 15605 + }, + { + "epoch": 0.76, + "grad_norm": 0.5665300488471985, + "learning_rate": 0.0005088161257202186, + "loss": 3.1568, + "step": 15606 + }, + { + "epoch": 0.76, + "grad_norm": 0.5209993124008179, + "learning_rate": 0.0005088050711394786, + "loss": 3.3423, + "step": 15607 + }, + { + "epoch": 0.76, + "grad_norm": 0.5469440221786499, + "learning_rate": 0.0005087940160087862, + "loss": 3.3472, + "step": 15608 + }, + { + "epoch": 0.76, + "grad_norm": 0.5044681429862976, + "learning_rate": 0.0005087829603281707, + "loss": 3.0458, + "step": 15609 + }, + { + "epoch": 0.77, + "grad_norm": 0.5147929787635803, + "learning_rate": 0.0005087719040976609, + "loss": 3.2094, + "step": 15610 + }, + { + "epoch": 0.77, + "grad_norm": 0.5366151928901672, + "learning_rate": 0.0005087608473172861, + "loss": 3.1899, + "step": 15611 + }, + { + "epoch": 0.77, + "grad_norm": 0.5306175351142883, + "learning_rate": 0.0005087497899870754, + "loss": 3.0799, + "step": 15612 + }, + { + "epoch": 0.77, + "grad_norm": 0.5149161219596863, + "learning_rate": 0.0005087387321070579, + "loss": 3.1933, + "step": 15613 + }, + { + "epoch": 0.77, + "grad_norm": 0.5391685366630554, + "learning_rate": 0.0005087276736772628, + "loss": 3.1901, + "step": 15614 + }, + { + "epoch": 0.77, + "grad_norm": 0.5293787717819214, + "learning_rate": 0.0005087166146977193, + "loss": 2.8526, + "step": 15615 + }, + { + "epoch": 0.77, + "grad_norm": 0.5558761358261108, + "learning_rate": 0.0005087055551684562, + "loss": 3.1358, + "step": 15616 + }, + { + "epoch": 0.77, + "grad_norm": 0.5193555951118469, + "learning_rate": 0.000508694495089503, + "loss": 3.0509, + "step": 15617 + }, + { + "epoch": 0.77, + "grad_norm": 0.49987754225730896, + "learning_rate": 0.0005086834344608885, + "loss": 3.1635, + "step": 15618 + }, + { + "epoch": 0.77, + "grad_norm": 0.4778132736682892, + "learning_rate": 0.000508672373282642, + "loss": 2.9455, + "step": 15619 + }, + { + "epoch": 0.77, + "grad_norm": 0.5623522996902466, + "learning_rate": 0.0005086613115547928, + "loss": 3.2735, + "step": 15620 + }, + { + "epoch": 0.77, + "grad_norm": 0.5368536710739136, + "learning_rate": 0.0005086502492773697, + "loss": 3.0355, + "step": 15621 + }, + { + "epoch": 0.77, + "grad_norm": 0.5360807776451111, + "learning_rate": 0.0005086391864504022, + "loss": 3.2697, + "step": 15622 + }, + { + "epoch": 0.77, + "grad_norm": 0.5049542188644409, + "learning_rate": 0.0005086281230739191, + "loss": 3.2567, + "step": 15623 + }, + { + "epoch": 0.77, + "grad_norm": 0.5606904625892639, + "learning_rate": 0.0005086170591479496, + "loss": 3.185, + "step": 15624 + }, + { + "epoch": 0.77, + "grad_norm": 0.5596327185630798, + "learning_rate": 0.000508605994672523, + "loss": 3.1354, + "step": 15625 + }, + { + "epoch": 0.77, + "grad_norm": 0.536123514175415, + "learning_rate": 0.0005085949296476684, + "loss": 3.1321, + "step": 15626 + }, + { + "epoch": 0.77, + "grad_norm": 0.5361589789390564, + "learning_rate": 0.0005085838640734149, + "loss": 3.2706, + "step": 15627 + }, + { + "epoch": 0.77, + "grad_norm": 0.5314226746559143, + "learning_rate": 0.0005085727979497915, + "loss": 3.1124, + "step": 15628 + }, + { + "epoch": 0.77, + "grad_norm": 0.5757156610488892, + "learning_rate": 0.0005085617312768277, + "loss": 3.1707, + "step": 15629 + }, + { + "epoch": 0.77, + "grad_norm": 0.5240241289138794, + "learning_rate": 0.0005085506640545522, + "loss": 3.105, + "step": 15630 + }, + { + "epoch": 0.77, + "grad_norm": 0.5034173727035522, + "learning_rate": 0.0005085395962829946, + "loss": 3.1758, + "step": 15631 + }, + { + "epoch": 0.77, + "grad_norm": 0.5380710363388062, + "learning_rate": 0.0005085285279621838, + "loss": 3.2823, + "step": 15632 + }, + { + "epoch": 0.77, + "grad_norm": 0.5575569272041321, + "learning_rate": 0.000508517459092149, + "loss": 3.1727, + "step": 15633 + }, + { + "epoch": 0.77, + "grad_norm": 0.5457764863967896, + "learning_rate": 0.0005085063896729192, + "loss": 3.0716, + "step": 15634 + }, + { + "epoch": 0.77, + "grad_norm": 0.5306626558303833, + "learning_rate": 0.0005084953197045238, + "loss": 2.9403, + "step": 15635 + }, + { + "epoch": 0.77, + "grad_norm": 0.5452754497528076, + "learning_rate": 0.0005084842491869918, + "loss": 3.3432, + "step": 15636 + }, + { + "epoch": 0.77, + "grad_norm": 0.5277231931686401, + "learning_rate": 0.0005084731781203525, + "loss": 3.1267, + "step": 15637 + }, + { + "epoch": 0.77, + "grad_norm": 0.5706171989440918, + "learning_rate": 0.0005084621065046349, + "loss": 3.2748, + "step": 15638 + }, + { + "epoch": 0.77, + "grad_norm": 0.5383669137954712, + "learning_rate": 0.0005084510343398682, + "loss": 3.2757, + "step": 15639 + }, + { + "epoch": 0.77, + "grad_norm": 0.5343171954154968, + "learning_rate": 0.0005084399616260818, + "loss": 3.24, + "step": 15640 + }, + { + "epoch": 0.77, + "grad_norm": 0.49760371446609497, + "learning_rate": 0.0005084288883633044, + "loss": 3.049, + "step": 15641 + }, + { + "epoch": 0.77, + "grad_norm": 0.5333790183067322, + "learning_rate": 0.0005084178145515656, + "loss": 3.1123, + "step": 15642 + }, + { + "epoch": 0.77, + "grad_norm": 0.5920032858848572, + "learning_rate": 0.0005084067401908942, + "loss": 2.9831, + "step": 15643 + }, + { + "epoch": 0.77, + "grad_norm": 0.5388069152832031, + "learning_rate": 0.0005083956652813196, + "loss": 3.0262, + "step": 15644 + }, + { + "epoch": 0.77, + "grad_norm": 0.5454514622688293, + "learning_rate": 0.0005083845898228711, + "loss": 3.3217, + "step": 15645 + }, + { + "epoch": 0.77, + "grad_norm": 0.5356780290603638, + "learning_rate": 0.0005083735138155775, + "loss": 3.1691, + "step": 15646 + }, + { + "epoch": 0.77, + "grad_norm": 0.4733395278453827, + "learning_rate": 0.0005083624372594683, + "loss": 3.011, + "step": 15647 + }, + { + "epoch": 0.77, + "grad_norm": 0.5192866921424866, + "learning_rate": 0.0005083513601545725, + "loss": 3.3161, + "step": 15648 + }, + { + "epoch": 0.77, + "grad_norm": 0.4957810640335083, + "learning_rate": 0.0005083402825009193, + "loss": 3.2863, + "step": 15649 + }, + { + "epoch": 0.77, + "grad_norm": 0.4861541986465454, + "learning_rate": 0.0005083292042985379, + "loss": 3.0497, + "step": 15650 + }, + { + "epoch": 0.77, + "grad_norm": 0.5232390761375427, + "learning_rate": 0.0005083181255474575, + "loss": 2.973, + "step": 15651 + }, + { + "epoch": 0.77, + "grad_norm": 0.5298289060592651, + "learning_rate": 0.0005083070462477072, + "loss": 3.3084, + "step": 15652 + }, + { + "epoch": 0.77, + "grad_norm": 0.511946439743042, + "learning_rate": 0.0005082959663993162, + "loss": 3.1297, + "step": 15653 + }, + { + "epoch": 0.77, + "grad_norm": 0.5342845320701599, + "learning_rate": 0.0005082848860023137, + "loss": 3.1233, + "step": 15654 + }, + { + "epoch": 0.77, + "grad_norm": 0.5111925601959229, + "learning_rate": 0.000508273805056729, + "loss": 3.2461, + "step": 15655 + }, + { + "epoch": 0.77, + "grad_norm": 0.553409993648529, + "learning_rate": 0.0005082627235625911, + "loss": 3.2622, + "step": 15656 + }, + { + "epoch": 0.77, + "grad_norm": 0.503697395324707, + "learning_rate": 0.0005082516415199293, + "loss": 3.0538, + "step": 15657 + }, + { + "epoch": 0.77, + "grad_norm": 0.5087273716926575, + "learning_rate": 0.0005082405589287728, + "loss": 3.1954, + "step": 15658 + }, + { + "epoch": 0.77, + "grad_norm": 0.5421915650367737, + "learning_rate": 0.0005082294757891507, + "loss": 3.1112, + "step": 15659 + }, + { + "epoch": 0.77, + "grad_norm": 0.5316805243492126, + "learning_rate": 0.0005082183921010922, + "loss": 3.1643, + "step": 15660 + }, + { + "epoch": 0.77, + "grad_norm": 0.4953380227088928, + "learning_rate": 0.0005082073078646266, + "loss": 3.2955, + "step": 15661 + }, + { + "epoch": 0.77, + "grad_norm": 0.5112192034721375, + "learning_rate": 0.000508196223079783, + "loss": 3.2071, + "step": 15662 + }, + { + "epoch": 0.77, + "grad_norm": 0.5044480562210083, + "learning_rate": 0.0005081851377465907, + "loss": 3.0958, + "step": 15663 + }, + { + "epoch": 0.77, + "grad_norm": 0.6140612363815308, + "learning_rate": 0.0005081740518650787, + "loss": 3.3124, + "step": 15664 + }, + { + "epoch": 0.77, + "grad_norm": 0.5115430951118469, + "learning_rate": 0.0005081629654352763, + "loss": 3.1975, + "step": 15665 + }, + { + "epoch": 0.77, + "grad_norm": 0.5128433108329773, + "learning_rate": 0.0005081518784572127, + "loss": 3.1859, + "step": 15666 + }, + { + "epoch": 0.77, + "grad_norm": 0.49921858310699463, + "learning_rate": 0.0005081407909309171, + "loss": 3.1841, + "step": 15667 + }, + { + "epoch": 0.77, + "grad_norm": 0.5513691306114197, + "learning_rate": 0.0005081297028564189, + "loss": 3.032, + "step": 15668 + }, + { + "epoch": 0.77, + "grad_norm": 0.5237788558006287, + "learning_rate": 0.0005081186142337469, + "loss": 3.164, + "step": 15669 + }, + { + "epoch": 0.77, + "grad_norm": 0.5211585164070129, + "learning_rate": 0.0005081075250629306, + "loss": 3.2778, + "step": 15670 + }, + { + "epoch": 0.77, + "grad_norm": 0.5375601053237915, + "learning_rate": 0.0005080964353439991, + "loss": 3.2492, + "step": 15671 + }, + { + "epoch": 0.77, + "grad_norm": 0.5239969491958618, + "learning_rate": 0.0005080853450769817, + "loss": 2.856, + "step": 15672 + }, + { + "epoch": 0.77, + "grad_norm": 0.5531800389289856, + "learning_rate": 0.0005080742542619075, + "loss": 3.125, + "step": 15673 + }, + { + "epoch": 0.77, + "grad_norm": 0.4881918728351593, + "learning_rate": 0.0005080631628988058, + "loss": 3.3741, + "step": 15674 + }, + { + "epoch": 0.77, + "grad_norm": 0.5813001394271851, + "learning_rate": 0.0005080520709877057, + "loss": 3.2355, + "step": 15675 + }, + { + "epoch": 0.77, + "grad_norm": 0.4898727238178253, + "learning_rate": 0.0005080409785286366, + "loss": 3.0873, + "step": 15676 + }, + { + "epoch": 0.77, + "grad_norm": 0.4959816038608551, + "learning_rate": 0.0005080298855216275, + "loss": 3.1912, + "step": 15677 + }, + { + "epoch": 0.77, + "grad_norm": 0.5205987691879272, + "learning_rate": 0.0005080187919667078, + "loss": 3.3549, + "step": 15678 + }, + { + "epoch": 0.77, + "grad_norm": 0.5168250203132629, + "learning_rate": 0.0005080076978639065, + "loss": 3.2099, + "step": 15679 + }, + { + "epoch": 0.77, + "grad_norm": 0.5724286437034607, + "learning_rate": 0.000507996603213253, + "loss": 3.0972, + "step": 15680 + }, + { + "epoch": 0.77, + "grad_norm": 0.5305948257446289, + "learning_rate": 0.0005079855080147766, + "loss": 3.1634, + "step": 15681 + }, + { + "epoch": 0.77, + "grad_norm": 0.5316970348358154, + "learning_rate": 0.0005079744122685063, + "loss": 3.1846, + "step": 15682 + }, + { + "epoch": 0.77, + "grad_norm": 0.5323175191879272, + "learning_rate": 0.0005079633159744715, + "loss": 3.1129, + "step": 15683 + }, + { + "epoch": 0.77, + "grad_norm": 0.49948516488075256, + "learning_rate": 0.0005079522191327012, + "loss": 3.2589, + "step": 15684 + }, + { + "epoch": 0.77, + "grad_norm": 0.5208277702331543, + "learning_rate": 0.0005079411217432249, + "loss": 3.0243, + "step": 15685 + }, + { + "epoch": 0.77, + "grad_norm": 0.5961699485778809, + "learning_rate": 0.0005079300238060717, + "loss": 2.9893, + "step": 15686 + }, + { + "epoch": 0.77, + "grad_norm": 0.5415461659431458, + "learning_rate": 0.0005079189253212708, + "loss": 3.1732, + "step": 15687 + }, + { + "epoch": 0.77, + "grad_norm": 0.4872366487979889, + "learning_rate": 0.0005079078262888515, + "loss": 3.1972, + "step": 15688 + }, + { + "epoch": 0.77, + "grad_norm": 0.575570285320282, + "learning_rate": 0.0005078967267088429, + "loss": 2.9043, + "step": 15689 + }, + { + "epoch": 0.77, + "grad_norm": 0.520625114440918, + "learning_rate": 0.0005078856265812745, + "loss": 3.0182, + "step": 15690 + }, + { + "epoch": 0.77, + "grad_norm": 0.5354496240615845, + "learning_rate": 0.0005078745259061752, + "loss": 3.2163, + "step": 15691 + }, + { + "epoch": 0.77, + "grad_norm": 0.5106788873672485, + "learning_rate": 0.0005078634246835745, + "loss": 2.838, + "step": 15692 + }, + { + "epoch": 0.77, + "grad_norm": 0.5290341377258301, + "learning_rate": 0.0005078523229135016, + "loss": 3.2877, + "step": 15693 + }, + { + "epoch": 0.77, + "grad_norm": 0.5494564175605774, + "learning_rate": 0.0005078412205959856, + "loss": 3.204, + "step": 15694 + }, + { + "epoch": 0.77, + "grad_norm": 0.5220087766647339, + "learning_rate": 0.0005078301177310557, + "loss": 3.3395, + "step": 15695 + }, + { + "epoch": 0.77, + "grad_norm": 0.573491096496582, + "learning_rate": 0.0005078190143187415, + "loss": 3.2214, + "step": 15696 + }, + { + "epoch": 0.77, + "grad_norm": 0.5294093489646912, + "learning_rate": 0.0005078079103590719, + "loss": 3.1203, + "step": 15697 + }, + { + "epoch": 0.77, + "grad_norm": 0.46979430317878723, + "learning_rate": 0.0005077968058520762, + "loss": 2.9733, + "step": 15698 + }, + { + "epoch": 0.77, + "grad_norm": 0.49335652589797974, + "learning_rate": 0.0005077857007977838, + "loss": 3.4289, + "step": 15699 + }, + { + "epoch": 0.77, + "grad_norm": 0.524200439453125, + "learning_rate": 0.0005077745951962238, + "loss": 3.0115, + "step": 15700 + }, + { + "epoch": 0.77, + "grad_norm": 0.522419810295105, + "learning_rate": 0.0005077634890474255, + "loss": 3.1244, + "step": 15701 + }, + { + "epoch": 0.77, + "grad_norm": 0.5147000551223755, + "learning_rate": 0.0005077523823514183, + "loss": 3.2699, + "step": 15702 + }, + { + "epoch": 0.77, + "grad_norm": 0.5115140676498413, + "learning_rate": 0.0005077412751082311, + "loss": 3.0467, + "step": 15703 + }, + { + "epoch": 0.77, + "grad_norm": 0.5686523914337158, + "learning_rate": 0.0005077301673178934, + "loss": 3.344, + "step": 15704 + }, + { + "epoch": 0.77, + "grad_norm": 0.5637875199317932, + "learning_rate": 0.0005077190589804346, + "loss": 3.1739, + "step": 15705 + }, + { + "epoch": 0.77, + "grad_norm": 0.5621719360351562, + "learning_rate": 0.0005077079500958836, + "loss": 3.2666, + "step": 15706 + }, + { + "epoch": 0.77, + "grad_norm": 0.5126227736473083, + "learning_rate": 0.0005076968406642699, + "loss": 3.3009, + "step": 15707 + }, + { + "epoch": 0.77, + "grad_norm": 0.5188931822776794, + "learning_rate": 0.0005076857306856227, + "loss": 3.0482, + "step": 15708 + }, + { + "epoch": 0.77, + "grad_norm": 0.5244320631027222, + "learning_rate": 0.0005076746201599712, + "loss": 3.2294, + "step": 15709 + }, + { + "epoch": 0.77, + "grad_norm": 0.5187132954597473, + "learning_rate": 0.0005076635090873448, + "loss": 3.364, + "step": 15710 + }, + { + "epoch": 0.77, + "grad_norm": 0.4979041814804077, + "learning_rate": 0.0005076523974677725, + "loss": 3.2371, + "step": 15711 + }, + { + "epoch": 0.77, + "grad_norm": 0.5199201703071594, + "learning_rate": 0.000507641285301284, + "loss": 3.3204, + "step": 15712 + }, + { + "epoch": 0.77, + "grad_norm": 0.5369486212730408, + "learning_rate": 0.0005076301725879082, + "loss": 3.1625, + "step": 15713 + }, + { + "epoch": 0.77, + "grad_norm": 0.5636427998542786, + "learning_rate": 0.0005076190593276743, + "loss": 3.2081, + "step": 15714 + }, + { + "epoch": 0.77, + "grad_norm": 0.5089654922485352, + "learning_rate": 0.0005076079455206121, + "loss": 3.3712, + "step": 15715 + }, + { + "epoch": 0.77, + "grad_norm": 0.48416969180107117, + "learning_rate": 0.0005075968311667502, + "loss": 3.1725, + "step": 15716 + }, + { + "epoch": 0.77, + "grad_norm": 0.5651270151138306, + "learning_rate": 0.0005075857162661184, + "loss": 3.3323, + "step": 15717 + }, + { + "epoch": 0.77, + "grad_norm": 0.5310222506523132, + "learning_rate": 0.0005075746008187458, + "loss": 3.2754, + "step": 15718 + }, + { + "epoch": 0.77, + "grad_norm": 0.5395601987838745, + "learning_rate": 0.0005075634848246616, + "loss": 3.352, + "step": 15719 + }, + { + "epoch": 0.77, + "grad_norm": 0.539057731628418, + "learning_rate": 0.0005075523682838951, + "loss": 3.0264, + "step": 15720 + }, + { + "epoch": 0.77, + "grad_norm": 0.5499597191810608, + "learning_rate": 0.0005075412511964755, + "loss": 3.0386, + "step": 15721 + }, + { + "epoch": 0.77, + "grad_norm": 0.5185397267341614, + "learning_rate": 0.0005075301335624323, + "loss": 3.2835, + "step": 15722 + }, + { + "epoch": 0.77, + "grad_norm": 0.49906614422798157, + "learning_rate": 0.0005075190153817948, + "loss": 3.0069, + "step": 15723 + }, + { + "epoch": 0.77, + "grad_norm": 0.5171295404434204, + "learning_rate": 0.000507507896654592, + "loss": 3.0378, + "step": 15724 + }, + { + "epoch": 0.77, + "grad_norm": 0.5171312689781189, + "learning_rate": 0.0005074967773808534, + "loss": 3.1803, + "step": 15725 + }, + { + "epoch": 0.77, + "grad_norm": 0.5338811874389648, + "learning_rate": 0.0005074856575606082, + "loss": 3.386, + "step": 15726 + }, + { + "epoch": 0.77, + "grad_norm": 0.5010255575180054, + "learning_rate": 0.0005074745371938857, + "loss": 3.3692, + "step": 15727 + }, + { + "epoch": 0.77, + "grad_norm": 0.6022095084190369, + "learning_rate": 0.0005074634162807152, + "loss": 3.1641, + "step": 15728 + }, + { + "epoch": 0.77, + "grad_norm": 0.534349262714386, + "learning_rate": 0.0005074522948211259, + "loss": 3.0681, + "step": 15729 + }, + { + "epoch": 0.77, + "grad_norm": 0.5148655772209167, + "learning_rate": 0.0005074411728151473, + "loss": 3.319, + "step": 15730 + }, + { + "epoch": 0.77, + "grad_norm": 0.49930626153945923, + "learning_rate": 0.0005074300502628085, + "loss": 3.251, + "step": 15731 + }, + { + "epoch": 0.77, + "grad_norm": 0.4870609939098358, + "learning_rate": 0.0005074189271641388, + "loss": 3.2754, + "step": 15732 + }, + { + "epoch": 0.77, + "grad_norm": 0.5289334058761597, + "learning_rate": 0.0005074078035191677, + "loss": 3.0453, + "step": 15733 + }, + { + "epoch": 0.77, + "grad_norm": 0.5236629247665405, + "learning_rate": 0.0005073966793279243, + "loss": 2.9417, + "step": 15734 + }, + { + "epoch": 0.77, + "grad_norm": 0.5354396104812622, + "learning_rate": 0.0005073855545904381, + "loss": 3.1531, + "step": 15735 + }, + { + "epoch": 0.77, + "grad_norm": 0.4817488193511963, + "learning_rate": 0.0005073744293067382, + "loss": 3.1444, + "step": 15736 + }, + { + "epoch": 0.77, + "grad_norm": 0.5554103851318359, + "learning_rate": 0.0005073633034768538, + "loss": 3.0, + "step": 15737 + }, + { + "epoch": 0.77, + "grad_norm": 0.5145366787910461, + "learning_rate": 0.0005073521771008145, + "loss": 3.187, + "step": 15738 + }, + { + "epoch": 0.77, + "grad_norm": 0.48250049352645874, + "learning_rate": 0.0005073410501786495, + "loss": 3.19, + "step": 15739 + }, + { + "epoch": 0.77, + "grad_norm": 0.5110865831375122, + "learning_rate": 0.000507329922710388, + "loss": 3.203, + "step": 15740 + }, + { + "epoch": 0.77, + "grad_norm": 0.5449311137199402, + "learning_rate": 0.0005073187946960594, + "loss": 3.1025, + "step": 15741 + }, + { + "epoch": 0.77, + "grad_norm": 0.5438368916511536, + "learning_rate": 0.000507307666135693, + "loss": 3.3186, + "step": 15742 + }, + { + "epoch": 0.77, + "grad_norm": 0.49835509061813354, + "learning_rate": 0.0005072965370293181, + "loss": 3.3353, + "step": 15743 + }, + { + "epoch": 0.77, + "grad_norm": 0.5272834897041321, + "learning_rate": 0.000507285407376964, + "loss": 3.0211, + "step": 15744 + }, + { + "epoch": 0.77, + "grad_norm": 0.5524513721466064, + "learning_rate": 0.0005072742771786601, + "loss": 3.2202, + "step": 15745 + }, + { + "epoch": 0.77, + "grad_norm": 0.5341085195541382, + "learning_rate": 0.0005072631464344355, + "loss": 3.0149, + "step": 15746 + }, + { + "epoch": 0.77, + "grad_norm": 0.5116642117500305, + "learning_rate": 0.0005072520151443197, + "loss": 3.5498, + "step": 15747 + }, + { + "epoch": 0.77, + "grad_norm": 0.5027918815612793, + "learning_rate": 0.0005072408833083421, + "loss": 3.1924, + "step": 15748 + }, + { + "epoch": 0.77, + "grad_norm": 0.545111358165741, + "learning_rate": 0.0005072297509265319, + "loss": 3.1091, + "step": 15749 + }, + { + "epoch": 0.77, + "grad_norm": 0.5268412828445435, + "learning_rate": 0.0005072186179989184, + "loss": 3.3507, + "step": 15750 + }, + { + "epoch": 0.77, + "grad_norm": 0.49303969740867615, + "learning_rate": 0.0005072074845255309, + "loss": 3.1132, + "step": 15751 + }, + { + "epoch": 0.77, + "grad_norm": 0.5391672849655151, + "learning_rate": 0.0005071963505063988, + "loss": 3.2345, + "step": 15752 + }, + { + "epoch": 0.77, + "grad_norm": 0.4987182915210724, + "learning_rate": 0.0005071852159415513, + "loss": 3.0815, + "step": 15753 + }, + { + "epoch": 0.77, + "grad_norm": 0.538796067237854, + "learning_rate": 0.0005071740808310179, + "loss": 2.9649, + "step": 15754 + }, + { + "epoch": 0.77, + "grad_norm": 0.4982840120792389, + "learning_rate": 0.0005071629451748277, + "loss": 3.2089, + "step": 15755 + }, + { + "epoch": 0.77, + "grad_norm": 0.49565887451171875, + "learning_rate": 0.0005071518089730103, + "loss": 3.3456, + "step": 15756 + }, + { + "epoch": 0.77, + "grad_norm": 0.5028196573257446, + "learning_rate": 0.0005071406722255948, + "loss": 3.1211, + "step": 15757 + }, + { + "epoch": 0.77, + "grad_norm": 0.4904448688030243, + "learning_rate": 0.0005071295349326108, + "loss": 3.2393, + "step": 15758 + }, + { + "epoch": 0.77, + "grad_norm": 0.5175606608390808, + "learning_rate": 0.0005071183970940874, + "loss": 3.4368, + "step": 15759 + }, + { + "epoch": 0.77, + "grad_norm": 0.5216371417045593, + "learning_rate": 0.0005071072587100539, + "loss": 3.0616, + "step": 15760 + }, + { + "epoch": 0.77, + "grad_norm": 0.5647536516189575, + "learning_rate": 0.0005070961197805399, + "loss": 3.3533, + "step": 15761 + }, + { + "epoch": 0.77, + "grad_norm": 0.5448745489120483, + "learning_rate": 0.0005070849803055744, + "loss": 3.2618, + "step": 15762 + }, + { + "epoch": 0.77, + "grad_norm": 0.5875873565673828, + "learning_rate": 0.0005070738402851871, + "loss": 3.2061, + "step": 15763 + }, + { + "epoch": 0.77, + "grad_norm": 0.5121092200279236, + "learning_rate": 0.0005070626997194071, + "loss": 3.2049, + "step": 15764 + }, + { + "epoch": 0.77, + "grad_norm": 0.5112888813018799, + "learning_rate": 0.0005070515586082638, + "loss": 2.9788, + "step": 15765 + }, + { + "epoch": 0.77, + "grad_norm": 0.537285327911377, + "learning_rate": 0.0005070404169517865, + "loss": 3.0783, + "step": 15766 + }, + { + "epoch": 0.77, + "grad_norm": 0.5411680936813354, + "learning_rate": 0.0005070292747500045, + "loss": 2.9802, + "step": 15767 + }, + { + "epoch": 0.77, + "grad_norm": 0.5896318554878235, + "learning_rate": 0.0005070181320029474, + "loss": 3.3174, + "step": 15768 + }, + { + "epoch": 0.77, + "grad_norm": 0.5077767968177795, + "learning_rate": 0.0005070069887106442, + "loss": 3.1943, + "step": 15769 + }, + { + "epoch": 0.77, + "grad_norm": 0.5382665395736694, + "learning_rate": 0.0005069958448731247, + "loss": 3.1708, + "step": 15770 + }, + { + "epoch": 0.77, + "grad_norm": 0.5162548422813416, + "learning_rate": 0.0005069847004904178, + "loss": 3.2341, + "step": 15771 + }, + { + "epoch": 0.77, + "grad_norm": 0.5016313195228577, + "learning_rate": 0.0005069735555625531, + "loss": 3.0787, + "step": 15772 + }, + { + "epoch": 0.77, + "grad_norm": 0.5164852738380432, + "learning_rate": 0.0005069624100895598, + "loss": 3.2173, + "step": 15773 + }, + { + "epoch": 0.77, + "grad_norm": 0.5445564389228821, + "learning_rate": 0.0005069512640714673, + "loss": 3.1206, + "step": 15774 + }, + { + "epoch": 0.77, + "grad_norm": 0.5352805256843567, + "learning_rate": 0.0005069401175083053, + "loss": 3.3894, + "step": 15775 + }, + { + "epoch": 0.77, + "grad_norm": 0.5283602476119995, + "learning_rate": 0.0005069289704001025, + "loss": 3.3639, + "step": 15776 + }, + { + "epoch": 0.77, + "grad_norm": 0.5328353643417358, + "learning_rate": 0.0005069178227468888, + "loss": 3.1275, + "step": 15777 + }, + { + "epoch": 0.77, + "grad_norm": 0.5417640805244446, + "learning_rate": 0.0005069066745486934, + "loss": 3.2893, + "step": 15778 + }, + { + "epoch": 0.77, + "grad_norm": 0.5066238641738892, + "learning_rate": 0.0005068955258055455, + "loss": 3.1368, + "step": 15779 + }, + { + "epoch": 0.77, + "grad_norm": 0.5763447284698486, + "learning_rate": 0.0005068843765174747, + "loss": 2.9799, + "step": 15780 + }, + { + "epoch": 0.77, + "grad_norm": 0.5056501030921936, + "learning_rate": 0.0005068732266845103, + "loss": 3.3112, + "step": 15781 + }, + { + "epoch": 0.77, + "grad_norm": 0.5100968480110168, + "learning_rate": 0.0005068620763066816, + "loss": 3.0971, + "step": 15782 + }, + { + "epoch": 0.77, + "grad_norm": 0.517602264881134, + "learning_rate": 0.000506850925384018, + "loss": 3.315, + "step": 15783 + }, + { + "epoch": 0.77, + "grad_norm": 0.5092670321464539, + "learning_rate": 0.0005068397739165488, + "loss": 3.2111, + "step": 15784 + }, + { + "epoch": 0.77, + "grad_norm": 0.5618929266929626, + "learning_rate": 0.0005068286219043035, + "loss": 3.1347, + "step": 15785 + }, + { + "epoch": 0.77, + "grad_norm": 0.5268994569778442, + "learning_rate": 0.0005068174693473115, + "loss": 3.1182, + "step": 15786 + }, + { + "epoch": 0.77, + "grad_norm": 0.5237292647361755, + "learning_rate": 0.000506806316245602, + "loss": 3.0985, + "step": 15787 + }, + { + "epoch": 0.77, + "grad_norm": 0.5168699622154236, + "learning_rate": 0.0005067951625992044, + "loss": 3.1803, + "step": 15788 + }, + { + "epoch": 0.77, + "grad_norm": 0.47693702578544617, + "learning_rate": 0.0005067840084081482, + "loss": 3.1477, + "step": 15789 + }, + { + "epoch": 0.77, + "grad_norm": 0.5285649299621582, + "learning_rate": 0.0005067728536724627, + "loss": 3.352, + "step": 15790 + }, + { + "epoch": 0.77, + "grad_norm": 0.49356603622436523, + "learning_rate": 0.0005067616983921774, + "loss": 2.9803, + "step": 15791 + }, + { + "epoch": 0.77, + "grad_norm": 0.512496829032898, + "learning_rate": 0.0005067505425673215, + "loss": 3.0556, + "step": 15792 + }, + { + "epoch": 0.77, + "grad_norm": 0.5645531415939331, + "learning_rate": 0.0005067393861979244, + "loss": 3.2463, + "step": 15793 + }, + { + "epoch": 0.77, + "grad_norm": 0.5089481472969055, + "learning_rate": 0.0005067282292840156, + "loss": 3.0876, + "step": 15794 + }, + { + "epoch": 0.77, + "grad_norm": 0.5088877081871033, + "learning_rate": 0.0005067170718256246, + "loss": 3.4672, + "step": 15795 + }, + { + "epoch": 0.77, + "grad_norm": 0.5133219957351685, + "learning_rate": 0.0005067059138227803, + "loss": 3.2869, + "step": 15796 + }, + { + "epoch": 0.77, + "grad_norm": 0.5056898593902588, + "learning_rate": 0.0005066947552755126, + "loss": 3.2511, + "step": 15797 + }, + { + "epoch": 0.77, + "grad_norm": 0.5130456686019897, + "learning_rate": 0.0005066835961838507, + "loss": 3.1219, + "step": 15798 + }, + { + "epoch": 0.77, + "grad_norm": 0.5243682861328125, + "learning_rate": 0.0005066724365478239, + "loss": 3.0821, + "step": 15799 + }, + { + "epoch": 0.77, + "grad_norm": 0.5532961487770081, + "learning_rate": 0.0005066612763674617, + "loss": 3.3234, + "step": 15800 + }, + { + "epoch": 0.77, + "grad_norm": 0.5712582468986511, + "learning_rate": 0.0005066501156427936, + "loss": 3.2391, + "step": 15801 + }, + { + "epoch": 0.77, + "grad_norm": 0.531348705291748, + "learning_rate": 0.0005066389543738487, + "loss": 3.485, + "step": 15802 + }, + { + "epoch": 0.77, + "grad_norm": 0.5381166338920593, + "learning_rate": 0.0005066277925606566, + "loss": 3.2193, + "step": 15803 + }, + { + "epoch": 0.77, + "grad_norm": 0.5312127470970154, + "learning_rate": 0.0005066166302032468, + "loss": 3.097, + "step": 15804 + }, + { + "epoch": 0.77, + "grad_norm": 0.5262129902839661, + "learning_rate": 0.0005066054673016484, + "loss": 3.1406, + "step": 15805 + }, + { + "epoch": 0.77, + "grad_norm": 0.5441365838050842, + "learning_rate": 0.0005065943038558909, + "loss": 3.0766, + "step": 15806 + }, + { + "epoch": 0.77, + "grad_norm": 0.4887438714504242, + "learning_rate": 0.0005065831398660039, + "loss": 3.1556, + "step": 15807 + }, + { + "epoch": 0.77, + "grad_norm": 0.5036865472793579, + "learning_rate": 0.0005065719753320167, + "loss": 3.0322, + "step": 15808 + }, + { + "epoch": 0.77, + "grad_norm": 0.5196355581283569, + "learning_rate": 0.0005065608102539586, + "loss": 3.0775, + "step": 15809 + }, + { + "epoch": 0.77, + "grad_norm": 0.5290484428405762, + "learning_rate": 0.000506549644631859, + "loss": 3.1921, + "step": 15810 + }, + { + "epoch": 0.77, + "grad_norm": 0.5046884417533875, + "learning_rate": 0.0005065384784657476, + "loss": 3.0588, + "step": 15811 + }, + { + "epoch": 0.77, + "grad_norm": 0.5305639505386353, + "learning_rate": 0.0005065273117556534, + "loss": 3.1943, + "step": 15812 + }, + { + "epoch": 0.77, + "grad_norm": 0.5117027163505554, + "learning_rate": 0.000506516144501606, + "loss": 3.1293, + "step": 15813 + }, + { + "epoch": 0.78, + "grad_norm": 0.52192223072052, + "learning_rate": 0.0005065049767036349, + "loss": 3.0501, + "step": 15814 + }, + { + "epoch": 0.78, + "grad_norm": 0.5212222337722778, + "learning_rate": 0.0005064938083617695, + "loss": 3.0072, + "step": 15815 + }, + { + "epoch": 0.78, + "grad_norm": 0.5260198712348938, + "learning_rate": 0.0005064826394760391, + "loss": 3.2323, + "step": 15816 + }, + { + "epoch": 0.78, + "grad_norm": 0.5320043563842773, + "learning_rate": 0.0005064714700464731, + "loss": 3.187, + "step": 15817 + }, + { + "epoch": 0.78, + "grad_norm": 0.5128186941146851, + "learning_rate": 0.000506460300073101, + "loss": 3.4666, + "step": 15818 + }, + { + "epoch": 0.78, + "grad_norm": 0.5312202572822571, + "learning_rate": 0.0005064491295559523, + "loss": 3.3288, + "step": 15819 + }, + { + "epoch": 0.78, + "grad_norm": 0.5054448843002319, + "learning_rate": 0.0005064379584950562, + "loss": 3.2213, + "step": 15820 + }, + { + "epoch": 0.78, + "grad_norm": 0.6765322685241699, + "learning_rate": 0.0005064267868904423, + "loss": 3.3935, + "step": 15821 + }, + { + "epoch": 0.78, + "grad_norm": 0.525917112827301, + "learning_rate": 0.00050641561474214, + "loss": 3.1906, + "step": 15822 + }, + { + "epoch": 0.78, + "grad_norm": 0.5486940741539001, + "learning_rate": 0.0005064044420501787, + "loss": 3.1634, + "step": 15823 + }, + { + "epoch": 0.78, + "grad_norm": 0.5462398529052734, + "learning_rate": 0.0005063932688145877, + "loss": 3.2199, + "step": 15824 + }, + { + "epoch": 0.78, + "grad_norm": 0.5118207335472107, + "learning_rate": 0.0005063820950353966, + "loss": 3.217, + "step": 15825 + }, + { + "epoch": 0.78, + "grad_norm": 0.5313277840614319, + "learning_rate": 0.000506370920712635, + "loss": 3.1615, + "step": 15826 + }, + { + "epoch": 0.78, + "grad_norm": 0.49920400977134705, + "learning_rate": 0.0005063597458463319, + "loss": 3.3053, + "step": 15827 + }, + { + "epoch": 0.78, + "grad_norm": 0.4805811047554016, + "learning_rate": 0.0005063485704365169, + "loss": 3.2122, + "step": 15828 + }, + { + "epoch": 0.78, + "grad_norm": 0.5627545714378357, + "learning_rate": 0.0005063373944832196, + "loss": 3.0398, + "step": 15829 + }, + { + "epoch": 0.78, + "grad_norm": 0.5166324377059937, + "learning_rate": 0.0005063262179864692, + "loss": 3.2995, + "step": 15830 + }, + { + "epoch": 0.78, + "grad_norm": 0.8689070343971252, + "learning_rate": 0.0005063150409462954, + "loss": 3.2035, + "step": 15831 + }, + { + "epoch": 0.78, + "grad_norm": 0.5488486289978027, + "learning_rate": 0.0005063038633627274, + "loss": 3.0854, + "step": 15832 + }, + { + "epoch": 0.78, + "grad_norm": 0.5090166926383972, + "learning_rate": 0.0005062926852357947, + "loss": 3.479, + "step": 15833 + }, + { + "epoch": 0.78, + "grad_norm": 0.515069842338562, + "learning_rate": 0.0005062815065655269, + "loss": 2.9501, + "step": 15834 + }, + { + "epoch": 0.78, + "grad_norm": 0.5386884212493896, + "learning_rate": 0.0005062703273519531, + "loss": 3.3355, + "step": 15835 + }, + { + "epoch": 0.78, + "grad_norm": 0.5605966448783875, + "learning_rate": 0.0005062591475951031, + "loss": 3.2428, + "step": 15836 + }, + { + "epoch": 0.78, + "grad_norm": 0.580772876739502, + "learning_rate": 0.0005062479672950063, + "loss": 3.1849, + "step": 15837 + }, + { + "epoch": 0.78, + "grad_norm": 0.5151135325431824, + "learning_rate": 0.0005062367864516919, + "loss": 3.139, + "step": 15838 + }, + { + "epoch": 0.78, + "grad_norm": 0.5454901456832886, + "learning_rate": 0.0005062256050651895, + "loss": 3.0489, + "step": 15839 + }, + { + "epoch": 0.78, + "grad_norm": 0.5578955411911011, + "learning_rate": 0.0005062144231355285, + "loss": 3.2888, + "step": 15840 + }, + { + "epoch": 0.78, + "grad_norm": 0.5507684350013733, + "learning_rate": 0.0005062032406627384, + "loss": 3.2061, + "step": 15841 + }, + { + "epoch": 0.78, + "grad_norm": 0.5147064924240112, + "learning_rate": 0.0005061920576468488, + "loss": 3.3796, + "step": 15842 + }, + { + "epoch": 0.78, + "grad_norm": 0.5263797044754028, + "learning_rate": 0.0005061808740878889, + "loss": 3.0018, + "step": 15843 + }, + { + "epoch": 0.78, + "grad_norm": 0.5236734747886658, + "learning_rate": 0.0005061696899858883, + "loss": 3.2627, + "step": 15844 + }, + { + "epoch": 0.78, + "grad_norm": 0.5589058995246887, + "learning_rate": 0.0005061585053408764, + "loss": 3.3151, + "step": 15845 + }, + { + "epoch": 0.78, + "grad_norm": 0.5322604179382324, + "learning_rate": 0.0005061473201528826, + "loss": 3.1628, + "step": 15846 + }, + { + "epoch": 0.78, + "grad_norm": 0.5170190334320068, + "learning_rate": 0.0005061361344219365, + "loss": 3.2358, + "step": 15847 + }, + { + "epoch": 0.78, + "grad_norm": 0.5249111652374268, + "learning_rate": 0.0005061249481480675, + "loss": 3.1819, + "step": 15848 + }, + { + "epoch": 0.78, + "grad_norm": 0.5331235527992249, + "learning_rate": 0.000506113761331305, + "loss": 3.2316, + "step": 15849 + }, + { + "epoch": 0.78, + "grad_norm": 0.5436180830001831, + "learning_rate": 0.0005061025739716786, + "loss": 2.9909, + "step": 15850 + }, + { + "epoch": 0.78, + "grad_norm": 0.5154057145118713, + "learning_rate": 0.0005060913860692177, + "loss": 3.0543, + "step": 15851 + }, + { + "epoch": 0.78, + "grad_norm": 0.5232006311416626, + "learning_rate": 0.0005060801976239516, + "loss": 3.1253, + "step": 15852 + }, + { + "epoch": 0.78, + "grad_norm": 0.605620801448822, + "learning_rate": 0.0005060690086359101, + "loss": 3.3018, + "step": 15853 + }, + { + "epoch": 0.78, + "grad_norm": 0.4980825185775757, + "learning_rate": 0.0005060578191051225, + "loss": 3.0301, + "step": 15854 + }, + { + "epoch": 0.78, + "grad_norm": 0.5214237570762634, + "learning_rate": 0.000506046629031618, + "loss": 3.4541, + "step": 15855 + }, + { + "epoch": 0.78, + "grad_norm": 0.5480488538742065, + "learning_rate": 0.0005060354384154265, + "loss": 3.018, + "step": 15856 + }, + { + "epoch": 0.78, + "grad_norm": 0.5750018358230591, + "learning_rate": 0.0005060242472565774, + "loss": 3.3321, + "step": 15857 + }, + { + "epoch": 0.78, + "grad_norm": 0.5459998250007629, + "learning_rate": 0.0005060130555550999, + "loss": 3.1318, + "step": 15858 + }, + { + "epoch": 0.78, + "grad_norm": 0.5523403882980347, + "learning_rate": 0.0005060018633110238, + "loss": 3.311, + "step": 15859 + }, + { + "epoch": 0.78, + "grad_norm": 0.5640004873275757, + "learning_rate": 0.0005059906705243783, + "loss": 3.1297, + "step": 15860 + }, + { + "epoch": 0.78, + "grad_norm": 0.5225830674171448, + "learning_rate": 0.0005059794771951931, + "loss": 3.2224, + "step": 15861 + }, + { + "epoch": 0.78, + "grad_norm": 0.49763786792755127, + "learning_rate": 0.0005059682833234977, + "loss": 3.2197, + "step": 15862 + }, + { + "epoch": 0.78, + "grad_norm": 0.517906665802002, + "learning_rate": 0.0005059570889093214, + "loss": 3.1153, + "step": 15863 + }, + { + "epoch": 0.78, + "grad_norm": 0.5416662096977234, + "learning_rate": 0.0005059458939526937, + "loss": 3.3586, + "step": 15864 + }, + { + "epoch": 0.78, + "grad_norm": 0.48512333631515503, + "learning_rate": 0.0005059346984536442, + "loss": 3.1849, + "step": 15865 + }, + { + "epoch": 0.78, + "grad_norm": 0.5373478531837463, + "learning_rate": 0.0005059235024122024, + "loss": 3.0875, + "step": 15866 + }, + { + "epoch": 0.78, + "grad_norm": 0.4900364577770233, + "learning_rate": 0.0005059123058283976, + "loss": 3.2715, + "step": 15867 + }, + { + "epoch": 0.78, + "grad_norm": 0.5320209860801697, + "learning_rate": 0.0005059011087022595, + "loss": 3.3284, + "step": 15868 + }, + { + "epoch": 0.78, + "grad_norm": 0.5448189973831177, + "learning_rate": 0.0005058899110338175, + "loss": 3.1436, + "step": 15869 + }, + { + "epoch": 0.78, + "grad_norm": 0.5075494647026062, + "learning_rate": 0.0005058787128231011, + "loss": 3.2013, + "step": 15870 + }, + { + "epoch": 0.78, + "grad_norm": 0.5199207663536072, + "learning_rate": 0.0005058675140701398, + "loss": 3.1813, + "step": 15871 + }, + { + "epoch": 0.78, + "grad_norm": 0.49894481897354126, + "learning_rate": 0.000505856314774963, + "loss": 3.1914, + "step": 15872 + }, + { + "epoch": 0.78, + "grad_norm": 0.5516239404678345, + "learning_rate": 0.0005058451149376003, + "loss": 3.2814, + "step": 15873 + }, + { + "epoch": 0.78, + "grad_norm": 0.5630885362625122, + "learning_rate": 0.0005058339145580813, + "loss": 3.219, + "step": 15874 + }, + { + "epoch": 0.78, + "grad_norm": 0.5175797343254089, + "learning_rate": 0.0005058227136364353, + "loss": 3.3844, + "step": 15875 + }, + { + "epoch": 0.78, + "grad_norm": 0.5631835460662842, + "learning_rate": 0.0005058115121726918, + "loss": 3.1561, + "step": 15876 + }, + { + "epoch": 0.78, + "grad_norm": 0.5062498450279236, + "learning_rate": 0.0005058003101668806, + "loss": 3.1483, + "step": 15877 + }, + { + "epoch": 0.78, + "grad_norm": 0.531692385673523, + "learning_rate": 0.0005057891076190309, + "loss": 3.3115, + "step": 15878 + }, + { + "epoch": 0.78, + "grad_norm": 0.491470068693161, + "learning_rate": 0.0005057779045291723, + "loss": 3.1625, + "step": 15879 + }, + { + "epoch": 0.78, + "grad_norm": 0.5266138315200806, + "learning_rate": 0.0005057667008973341, + "loss": 3.2485, + "step": 15880 + }, + { + "epoch": 0.78, + "grad_norm": 0.5649579763412476, + "learning_rate": 0.0005057554967235463, + "loss": 3.2064, + "step": 15881 + }, + { + "epoch": 0.78, + "grad_norm": 0.4945317804813385, + "learning_rate": 0.000505744292007838, + "loss": 3.077, + "step": 15882 + }, + { + "epoch": 0.78, + "grad_norm": 0.5235192775726318, + "learning_rate": 0.0005057330867502389, + "loss": 3.2043, + "step": 15883 + }, + { + "epoch": 0.78, + "grad_norm": 0.4946229159832001, + "learning_rate": 0.0005057218809507782, + "loss": 3.0865, + "step": 15884 + }, + { + "epoch": 0.78, + "grad_norm": 0.5265135169029236, + "learning_rate": 0.0005057106746094859, + "loss": 3.2467, + "step": 15885 + }, + { + "epoch": 0.78, + "grad_norm": 0.4959494173526764, + "learning_rate": 0.0005056994677263913, + "loss": 3.212, + "step": 15886 + }, + { + "epoch": 0.78, + "grad_norm": 0.5413668155670166, + "learning_rate": 0.0005056882603015237, + "loss": 3.0447, + "step": 15887 + }, + { + "epoch": 0.78, + "grad_norm": 0.4981670081615448, + "learning_rate": 0.000505677052334913, + "loss": 3.1375, + "step": 15888 + }, + { + "epoch": 0.78, + "grad_norm": 0.5313765406608582, + "learning_rate": 0.0005056658438265884, + "loss": 3.1866, + "step": 15889 + }, + { + "epoch": 0.78, + "grad_norm": 0.5187064409255981, + "learning_rate": 0.0005056546347765796, + "loss": 3.3122, + "step": 15890 + }, + { + "epoch": 0.78, + "grad_norm": 0.5525301098823547, + "learning_rate": 0.0005056434251849161, + "loss": 3.1409, + "step": 15891 + }, + { + "epoch": 0.78, + "grad_norm": 0.5722075700759888, + "learning_rate": 0.0005056322150516273, + "loss": 3.1696, + "step": 15892 + }, + { + "epoch": 0.78, + "grad_norm": 0.5104582905769348, + "learning_rate": 0.0005056210043767428, + "loss": 3.09, + "step": 15893 + }, + { + "epoch": 0.78, + "grad_norm": 0.5645208358764648, + "learning_rate": 0.0005056097931602923, + "loss": 3.2814, + "step": 15894 + }, + { + "epoch": 0.78, + "grad_norm": 0.513430118560791, + "learning_rate": 0.000505598581402305, + "loss": 3.0936, + "step": 15895 + }, + { + "epoch": 0.78, + "grad_norm": 0.5425407886505127, + "learning_rate": 0.0005055873691028108, + "loss": 3.3683, + "step": 15896 + }, + { + "epoch": 0.78, + "grad_norm": 0.48319244384765625, + "learning_rate": 0.0005055761562618388, + "loss": 3.1959, + "step": 15897 + }, + { + "epoch": 0.78, + "grad_norm": 0.518714964389801, + "learning_rate": 0.000505564942879419, + "loss": 3.0281, + "step": 15898 + }, + { + "epoch": 0.78, + "grad_norm": 0.5663323998451233, + "learning_rate": 0.0005055537289555806, + "loss": 3.3935, + "step": 15899 + }, + { + "epoch": 0.78, + "grad_norm": 0.5486282706260681, + "learning_rate": 0.0005055425144903532, + "loss": 3.3414, + "step": 15900 + }, + { + "epoch": 0.78, + "grad_norm": 0.5473208427429199, + "learning_rate": 0.0005055312994837664, + "loss": 3.3134, + "step": 15901 + }, + { + "epoch": 0.78, + "grad_norm": 0.5325106382369995, + "learning_rate": 0.0005055200839358497, + "loss": 3.1135, + "step": 15902 + }, + { + "epoch": 0.78, + "grad_norm": 0.5353180170059204, + "learning_rate": 0.0005055088678466327, + "loss": 3.1344, + "step": 15903 + }, + { + "epoch": 0.78, + "grad_norm": 0.5123947858810425, + "learning_rate": 0.0005054976512161449, + "loss": 3.0725, + "step": 15904 + }, + { + "epoch": 0.78, + "grad_norm": 0.5428218245506287, + "learning_rate": 0.0005054864340444158, + "loss": 3.244, + "step": 15905 + }, + { + "epoch": 0.78, + "grad_norm": 0.5174368023872375, + "learning_rate": 0.000505475216331475, + "loss": 3.2212, + "step": 15906 + }, + { + "epoch": 0.78, + "grad_norm": 0.5281883478164673, + "learning_rate": 0.0005054639980773519, + "loss": 3.1813, + "step": 15907 + }, + { + "epoch": 0.78, + "grad_norm": 0.564376175403595, + "learning_rate": 0.0005054527792820764, + "loss": 3.0156, + "step": 15908 + }, + { + "epoch": 0.78, + "grad_norm": 0.5363826751708984, + "learning_rate": 0.0005054415599456776, + "loss": 3.3758, + "step": 15909 + }, + { + "epoch": 0.78, + "grad_norm": 0.514251708984375, + "learning_rate": 0.0005054303400681855, + "loss": 3.2013, + "step": 15910 + }, + { + "epoch": 0.78, + "grad_norm": 0.5030025839805603, + "learning_rate": 0.0005054191196496293, + "loss": 3.2198, + "step": 15911 + }, + { + "epoch": 0.78, + "grad_norm": 0.5085748434066772, + "learning_rate": 0.0005054078986900387, + "loss": 3.2473, + "step": 15912 + }, + { + "epoch": 0.78, + "grad_norm": 0.5087880492210388, + "learning_rate": 0.0005053966771894432, + "loss": 3.2511, + "step": 15913 + }, + { + "epoch": 0.78, + "grad_norm": 0.550105631351471, + "learning_rate": 0.0005053854551478723, + "loss": 3.0949, + "step": 15914 + }, + { + "epoch": 0.78, + "grad_norm": 0.5327011942863464, + "learning_rate": 0.0005053742325653557, + "loss": 3.2427, + "step": 15915 + }, + { + "epoch": 0.78, + "grad_norm": 0.5084472894668579, + "learning_rate": 0.0005053630094419228, + "loss": 3.0718, + "step": 15916 + }, + { + "epoch": 0.78, + "grad_norm": 0.5227506160736084, + "learning_rate": 0.0005053517857776035, + "loss": 3.1136, + "step": 15917 + }, + { + "epoch": 0.78, + "grad_norm": 0.5210598707199097, + "learning_rate": 0.000505340561572427, + "loss": 3.266, + "step": 15918 + }, + { + "epoch": 0.78, + "grad_norm": 0.5185865759849548, + "learning_rate": 0.000505329336826423, + "loss": 3.0593, + "step": 15919 + }, + { + "epoch": 0.78, + "grad_norm": 0.5381935834884644, + "learning_rate": 0.0005053181115396209, + "loss": 3.1447, + "step": 15920 + }, + { + "epoch": 0.78, + "grad_norm": 0.5370237231254578, + "learning_rate": 0.0005053068857120505, + "loss": 3.0868, + "step": 15921 + }, + { + "epoch": 0.78, + "grad_norm": 0.5445959568023682, + "learning_rate": 0.0005052956593437413, + "loss": 3.2731, + "step": 15922 + }, + { + "epoch": 0.78, + "grad_norm": 0.5113406181335449, + "learning_rate": 0.0005052844324347228, + "loss": 3.2167, + "step": 15923 + }, + { + "epoch": 0.78, + "grad_norm": 0.5293589234352112, + "learning_rate": 0.0005052732049850246, + "loss": 3.3285, + "step": 15924 + }, + { + "epoch": 0.78, + "grad_norm": 0.523661196231842, + "learning_rate": 0.0005052619769946764, + "loss": 3.2882, + "step": 15925 + }, + { + "epoch": 0.78, + "grad_norm": 0.5041723251342773, + "learning_rate": 0.0005052507484637076, + "loss": 3.0888, + "step": 15926 + }, + { + "epoch": 0.78, + "grad_norm": 0.5257124900817871, + "learning_rate": 0.0005052395193921478, + "loss": 3.2741, + "step": 15927 + }, + { + "epoch": 0.78, + "grad_norm": 0.48543840646743774, + "learning_rate": 0.0005052282897800266, + "loss": 3.3522, + "step": 15928 + }, + { + "epoch": 0.78, + "grad_norm": 0.5463740825653076, + "learning_rate": 0.0005052170596273735, + "loss": 3.3716, + "step": 15929 + }, + { + "epoch": 0.78, + "grad_norm": 0.5675771236419678, + "learning_rate": 0.0005052058289342184, + "loss": 3.0156, + "step": 15930 + }, + { + "epoch": 0.78, + "grad_norm": 0.511083722114563, + "learning_rate": 0.0005051945977005905, + "loss": 3.1242, + "step": 15931 + }, + { + "epoch": 0.78, + "grad_norm": 0.5357405543327332, + "learning_rate": 0.0005051833659265195, + "loss": 3.2083, + "step": 15932 + }, + { + "epoch": 0.78, + "grad_norm": 0.5028762221336365, + "learning_rate": 0.000505172133612035, + "loss": 3.1726, + "step": 15933 + }, + { + "epoch": 0.78, + "grad_norm": 0.5303201675415039, + "learning_rate": 0.0005051609007571666, + "loss": 3.2378, + "step": 15934 + }, + { + "epoch": 0.78, + "grad_norm": 0.5492925047874451, + "learning_rate": 0.0005051496673619439, + "loss": 3.3461, + "step": 15935 + }, + { + "epoch": 0.78, + "grad_norm": 0.5268462896347046, + "learning_rate": 0.0005051384334263965, + "loss": 3.1529, + "step": 15936 + }, + { + "epoch": 0.78, + "grad_norm": 0.5270599722862244, + "learning_rate": 0.0005051271989505538, + "loss": 3.1615, + "step": 15937 + }, + { + "epoch": 0.78, + "grad_norm": 0.5161546468734741, + "learning_rate": 0.0005051159639344456, + "loss": 3.2024, + "step": 15938 + }, + { + "epoch": 0.78, + "grad_norm": 0.5089823603630066, + "learning_rate": 0.0005051047283781015, + "loss": 3.1198, + "step": 15939 + }, + { + "epoch": 0.78, + "grad_norm": 0.48041659593582153, + "learning_rate": 0.000505093492281551, + "loss": 3.3224, + "step": 15940 + }, + { + "epoch": 0.78, + "grad_norm": 0.5333757400512695, + "learning_rate": 0.0005050822556448236, + "loss": 3.1934, + "step": 15941 + }, + { + "epoch": 0.78, + "grad_norm": 0.5357717275619507, + "learning_rate": 0.000505071018467949, + "loss": 3.2549, + "step": 15942 + }, + { + "epoch": 0.78, + "grad_norm": 0.5316967368125916, + "learning_rate": 0.0005050597807509569, + "loss": 3.0993, + "step": 15943 + }, + { + "epoch": 0.78, + "grad_norm": 0.5037146210670471, + "learning_rate": 0.0005050485424938769, + "loss": 3.2657, + "step": 15944 + }, + { + "epoch": 0.78, + "grad_norm": 0.5503755807876587, + "learning_rate": 0.0005050373036967384, + "loss": 3.2414, + "step": 15945 + }, + { + "epoch": 0.78, + "grad_norm": 0.5169128179550171, + "learning_rate": 0.0005050260643595711, + "loss": 2.9953, + "step": 15946 + }, + { + "epoch": 0.78, + "grad_norm": 0.5049517154693604, + "learning_rate": 0.0005050148244824045, + "loss": 3.2635, + "step": 15947 + }, + { + "epoch": 0.78, + "grad_norm": 0.4886893928050995, + "learning_rate": 0.0005050035840652684, + "loss": 3.1339, + "step": 15948 + }, + { + "epoch": 0.78, + "grad_norm": 0.5033433437347412, + "learning_rate": 0.0005049923431081924, + "loss": 3.0855, + "step": 15949 + }, + { + "epoch": 0.78, + "grad_norm": 0.5404435396194458, + "learning_rate": 0.0005049811016112059, + "loss": 3.0479, + "step": 15950 + }, + { + "epoch": 0.78, + "grad_norm": 0.5240478515625, + "learning_rate": 0.0005049698595743387, + "loss": 3.0661, + "step": 15951 + }, + { + "epoch": 0.78, + "grad_norm": 0.5057730674743652, + "learning_rate": 0.0005049586169976202, + "loss": 3.3078, + "step": 15952 + }, + { + "epoch": 0.78, + "grad_norm": 0.5399793982505798, + "learning_rate": 0.0005049473738810803, + "loss": 3.0763, + "step": 15953 + }, + { + "epoch": 0.78, + "grad_norm": 0.4813418984413147, + "learning_rate": 0.0005049361302247485, + "loss": 3.1835, + "step": 15954 + }, + { + "epoch": 0.78, + "grad_norm": 0.5555798411369324, + "learning_rate": 0.0005049248860286542, + "loss": 3.1501, + "step": 15955 + }, + { + "epoch": 0.78, + "grad_norm": 0.5001389980316162, + "learning_rate": 0.0005049136412928273, + "loss": 3.143, + "step": 15956 + }, + { + "epoch": 0.78, + "grad_norm": 0.5168265700340271, + "learning_rate": 0.0005049023960172973, + "loss": 3.2054, + "step": 15957 + }, + { + "epoch": 0.78, + "grad_norm": 0.5343358516693115, + "learning_rate": 0.0005048911502020938, + "loss": 3.292, + "step": 15958 + }, + { + "epoch": 0.78, + "grad_norm": 0.5781263113021851, + "learning_rate": 0.0005048799038472465, + "loss": 3.1776, + "step": 15959 + }, + { + "epoch": 0.78, + "grad_norm": 0.5077539682388306, + "learning_rate": 0.0005048686569527848, + "loss": 3.1976, + "step": 15960 + }, + { + "epoch": 0.78, + "grad_norm": 0.5165067315101624, + "learning_rate": 0.0005048574095187385, + "loss": 3.0778, + "step": 15961 + }, + { + "epoch": 0.78, + "grad_norm": 0.5727941393852234, + "learning_rate": 0.0005048461615451374, + "loss": 3.2127, + "step": 15962 + }, + { + "epoch": 0.78, + "grad_norm": 0.5406970381736755, + "learning_rate": 0.0005048349130320108, + "loss": 3.2311, + "step": 15963 + }, + { + "epoch": 0.78, + "grad_norm": 0.5371522903442383, + "learning_rate": 0.0005048236639793885, + "loss": 3.1551, + "step": 15964 + }, + { + "epoch": 0.78, + "grad_norm": 0.5899889469146729, + "learning_rate": 0.0005048124143873001, + "loss": 3.213, + "step": 15965 + }, + { + "epoch": 0.78, + "grad_norm": 0.5044421553611755, + "learning_rate": 0.0005048011642557751, + "loss": 3.383, + "step": 15966 + }, + { + "epoch": 0.78, + "grad_norm": 0.5018906593322754, + "learning_rate": 0.0005047899135848435, + "loss": 3.4013, + "step": 15967 + }, + { + "epoch": 0.78, + "grad_norm": 0.47671008110046387, + "learning_rate": 0.0005047786623745345, + "loss": 3.3241, + "step": 15968 + }, + { + "epoch": 0.78, + "grad_norm": 0.5087835788726807, + "learning_rate": 0.0005047674106248779, + "loss": 3.223, + "step": 15969 + }, + { + "epoch": 0.78, + "grad_norm": 0.5542645454406738, + "learning_rate": 0.0005047561583359034, + "loss": 3.206, + "step": 15970 + }, + { + "epoch": 0.78, + "grad_norm": 0.4945680499076843, + "learning_rate": 0.0005047449055076407, + "loss": 3.1473, + "step": 15971 + }, + { + "epoch": 0.78, + "grad_norm": 0.4865614175796509, + "learning_rate": 0.0005047336521401191, + "loss": 3.1573, + "step": 15972 + }, + { + "epoch": 0.78, + "grad_norm": 0.4855407476425171, + "learning_rate": 0.0005047223982333685, + "loss": 3.2028, + "step": 15973 + }, + { + "epoch": 0.78, + "grad_norm": 0.5474462509155273, + "learning_rate": 0.0005047111437874186, + "loss": 3.4711, + "step": 15974 + }, + { + "epoch": 0.78, + "grad_norm": 0.5180954933166504, + "learning_rate": 0.0005046998888022988, + "loss": 3.0933, + "step": 15975 + }, + { + "epoch": 0.78, + "grad_norm": 0.5082975625991821, + "learning_rate": 0.0005046886332780392, + "loss": 3.1662, + "step": 15976 + }, + { + "epoch": 0.78, + "grad_norm": 0.6138001680374146, + "learning_rate": 0.0005046773772146688, + "loss": 3.0991, + "step": 15977 + }, + { + "epoch": 0.78, + "grad_norm": 0.5285327434539795, + "learning_rate": 0.0005046661206122178, + "loss": 3.1361, + "step": 15978 + }, + { + "epoch": 0.78, + "grad_norm": 0.5196309685707092, + "learning_rate": 0.0005046548634707155, + "loss": 3.0654, + "step": 15979 + }, + { + "epoch": 0.78, + "grad_norm": 0.5266782641410828, + "learning_rate": 0.0005046436057901917, + "loss": 3.2551, + "step": 15980 + }, + { + "epoch": 0.78, + "grad_norm": 0.5191410183906555, + "learning_rate": 0.0005046323475706761, + "loss": 3.3328, + "step": 15981 + }, + { + "epoch": 0.78, + "grad_norm": 0.5638286471366882, + "learning_rate": 0.0005046210888121982, + "loss": 3.324, + "step": 15982 + }, + { + "epoch": 0.78, + "grad_norm": 0.5119996666908264, + "learning_rate": 0.0005046098295147877, + "loss": 3.0339, + "step": 15983 + }, + { + "epoch": 0.78, + "grad_norm": 0.5803064703941345, + "learning_rate": 0.0005045985696784743, + "loss": 3.3781, + "step": 15984 + }, + { + "epoch": 0.78, + "grad_norm": 0.5395102500915527, + "learning_rate": 0.0005045873093032878, + "loss": 3.4653, + "step": 15985 + }, + { + "epoch": 0.78, + "grad_norm": 0.508126437664032, + "learning_rate": 0.0005045760483892575, + "loss": 3.1083, + "step": 15986 + }, + { + "epoch": 0.78, + "grad_norm": 0.48580315709114075, + "learning_rate": 0.0005045647869364134, + "loss": 3.0414, + "step": 15987 + }, + { + "epoch": 0.78, + "grad_norm": 0.5052012205123901, + "learning_rate": 0.0005045535249447848, + "loss": 2.9321, + "step": 15988 + }, + { + "epoch": 0.78, + "grad_norm": 0.575672447681427, + "learning_rate": 0.0005045422624144019, + "loss": 3.2742, + "step": 15989 + }, + { + "epoch": 0.78, + "grad_norm": 0.5062314867973328, + "learning_rate": 0.0005045309993452939, + "loss": 3.2034, + "step": 15990 + }, + { + "epoch": 0.78, + "grad_norm": 0.5185468792915344, + "learning_rate": 0.0005045197357374906, + "loss": 3.2348, + "step": 15991 + }, + { + "epoch": 0.78, + "grad_norm": 0.5151081681251526, + "learning_rate": 0.0005045084715910216, + "loss": 3.2609, + "step": 15992 + }, + { + "epoch": 0.78, + "grad_norm": 0.4942055642604828, + "learning_rate": 0.0005044972069059167, + "loss": 3.2212, + "step": 15993 + }, + { + "epoch": 0.78, + "grad_norm": 0.4843122065067291, + "learning_rate": 0.0005044859416822056, + "loss": 3.1712, + "step": 15994 + }, + { + "epoch": 0.78, + "grad_norm": 0.5047922134399414, + "learning_rate": 0.0005044746759199178, + "loss": 3.2883, + "step": 15995 + }, + { + "epoch": 0.78, + "grad_norm": 0.5109986066818237, + "learning_rate": 0.0005044634096190831, + "loss": 2.9292, + "step": 15996 + }, + { + "epoch": 0.78, + "grad_norm": 0.5134650468826294, + "learning_rate": 0.0005044521427797311, + "loss": 3.1455, + "step": 15997 + }, + { + "epoch": 0.78, + "grad_norm": 0.5431228280067444, + "learning_rate": 0.0005044408754018915, + "loss": 3.1955, + "step": 15998 + }, + { + "epoch": 0.78, + "grad_norm": 0.5586522817611694, + "learning_rate": 0.000504429607485594, + "loss": 3.2098, + "step": 15999 + }, + { + "epoch": 0.78, + "grad_norm": 0.5322924852371216, + "learning_rate": 0.0005044183390308682, + "loss": 3.3816, + "step": 16000 + }, + { + "epoch": 0.78, + "grad_norm": 0.49801820516586304, + "learning_rate": 0.000504407070037744, + "loss": 3.3149, + "step": 16001 + }, + { + "epoch": 0.78, + "grad_norm": 0.5596066117286682, + "learning_rate": 0.0005043958005062506, + "loss": 3.1123, + "step": 16002 + }, + { + "epoch": 0.78, + "grad_norm": 0.5819718241691589, + "learning_rate": 0.0005043845304364183, + "loss": 3.3048, + "step": 16003 + }, + { + "epoch": 0.78, + "grad_norm": 0.5312378406524658, + "learning_rate": 0.0005043732598282763, + "loss": 3.2235, + "step": 16004 + }, + { + "epoch": 0.78, + "grad_norm": 0.4894445538520813, + "learning_rate": 0.0005043619886818546, + "loss": 3.2897, + "step": 16005 + }, + { + "epoch": 0.78, + "grad_norm": 0.5514426827430725, + "learning_rate": 0.0005043507169971827, + "loss": 3.0689, + "step": 16006 + }, + { + "epoch": 0.78, + "grad_norm": 0.5011432766914368, + "learning_rate": 0.0005043394447742903, + "loss": 3.2763, + "step": 16007 + }, + { + "epoch": 0.78, + "grad_norm": 0.5099719762802124, + "learning_rate": 0.0005043281720132072, + "loss": 3.2354, + "step": 16008 + }, + { + "epoch": 0.78, + "grad_norm": 0.5169076919555664, + "learning_rate": 0.000504316898713963, + "loss": 3.2652, + "step": 16009 + }, + { + "epoch": 0.78, + "grad_norm": 0.5074393153190613, + "learning_rate": 0.0005043056248765875, + "loss": 3.3298, + "step": 16010 + }, + { + "epoch": 0.78, + "grad_norm": 0.5243656635284424, + "learning_rate": 0.0005042943505011101, + "loss": 3.2304, + "step": 16011 + }, + { + "epoch": 0.78, + "grad_norm": 0.5197635293006897, + "learning_rate": 0.0005042830755875607, + "loss": 3.1894, + "step": 16012 + }, + { + "epoch": 0.78, + "grad_norm": 0.5310289263725281, + "learning_rate": 0.0005042718001359691, + "loss": 3.3684, + "step": 16013 + }, + { + "epoch": 0.78, + "grad_norm": 0.532052218914032, + "learning_rate": 0.000504260524146365, + "loss": 3.1898, + "step": 16014 + }, + { + "epoch": 0.78, + "grad_norm": 0.5254867672920227, + "learning_rate": 0.0005042492476187778, + "loss": 3.1261, + "step": 16015 + }, + { + "epoch": 0.78, + "grad_norm": 0.5229167342185974, + "learning_rate": 0.0005042379705532374, + "loss": 3.2582, + "step": 16016 + }, + { + "epoch": 0.78, + "grad_norm": 0.5486670136451721, + "learning_rate": 0.0005042266929497736, + "loss": 3.1984, + "step": 16017 + }, + { + "epoch": 0.79, + "grad_norm": 0.5969434380531311, + "learning_rate": 0.0005042154148084159, + "loss": 3.3827, + "step": 16018 + }, + { + "epoch": 0.79, + "grad_norm": 0.4866141378879547, + "learning_rate": 0.0005042041361291941, + "loss": 3.2698, + "step": 16019 + }, + { + "epoch": 0.79, + "grad_norm": 0.6796237826347351, + "learning_rate": 0.000504192856912138, + "loss": 2.9635, + "step": 16020 + }, + { + "epoch": 0.79, + "grad_norm": 0.5028907656669617, + "learning_rate": 0.0005041815771572772, + "loss": 3.237, + "step": 16021 + }, + { + "epoch": 0.79, + "grad_norm": 0.5195736885070801, + "learning_rate": 0.0005041702968646413, + "loss": 3.3098, + "step": 16022 + }, + { + "epoch": 0.79, + "grad_norm": 0.490695059299469, + "learning_rate": 0.0005041590160342603, + "loss": 3.2567, + "step": 16023 + }, + { + "epoch": 0.79, + "grad_norm": 0.5087558031082153, + "learning_rate": 0.0005041477346661637, + "loss": 3.2146, + "step": 16024 + }, + { + "epoch": 0.79, + "grad_norm": 0.5431877374649048, + "learning_rate": 0.0005041364527603811, + "loss": 3.2565, + "step": 16025 + }, + { + "epoch": 0.79, + "grad_norm": 0.6024651527404785, + "learning_rate": 0.0005041251703169425, + "loss": 3.3531, + "step": 16026 + }, + { + "epoch": 0.79, + "grad_norm": 0.5574399828910828, + "learning_rate": 0.0005041138873358776, + "loss": 3.1318, + "step": 16027 + }, + { + "epoch": 0.79, + "grad_norm": 0.5236434936523438, + "learning_rate": 0.0005041026038172158, + "loss": 3.4343, + "step": 16028 + }, + { + "epoch": 0.79, + "grad_norm": 0.5195509195327759, + "learning_rate": 0.0005040913197609871, + "loss": 3.097, + "step": 16029 + }, + { + "epoch": 0.79, + "grad_norm": 0.5851946473121643, + "learning_rate": 0.0005040800351672211, + "loss": 3.3494, + "step": 16030 + }, + { + "epoch": 0.79, + "grad_norm": 0.5439422130584717, + "learning_rate": 0.0005040687500359476, + "loss": 3.1376, + "step": 16031 + }, + { + "epoch": 0.79, + "grad_norm": 0.5359077453613281, + "learning_rate": 0.0005040574643671963, + "loss": 3.162, + "step": 16032 + }, + { + "epoch": 0.79, + "grad_norm": 0.5320112705230713, + "learning_rate": 0.0005040461781609969, + "loss": 3.3413, + "step": 16033 + }, + { + "epoch": 0.79, + "grad_norm": 0.503227174282074, + "learning_rate": 0.0005040348914173791, + "loss": 3.2933, + "step": 16034 + }, + { + "epoch": 0.79, + "grad_norm": 0.5737831592559814, + "learning_rate": 0.0005040236041363728, + "loss": 3.3733, + "step": 16035 + }, + { + "epoch": 0.79, + "grad_norm": 0.4747275114059448, + "learning_rate": 0.0005040123163180075, + "loss": 3.1553, + "step": 16036 + }, + { + "epoch": 0.79, + "grad_norm": 0.5528791546821594, + "learning_rate": 0.0005040010279623129, + "loss": 3.1915, + "step": 16037 + }, + { + "epoch": 0.79, + "grad_norm": 0.526457667350769, + "learning_rate": 0.000503989739069319, + "loss": 3.3059, + "step": 16038 + }, + { + "epoch": 0.79, + "grad_norm": 0.5216947197914124, + "learning_rate": 0.0005039784496390554, + "loss": 3.1353, + "step": 16039 + }, + { + "epoch": 0.79, + "grad_norm": 0.5180486440658569, + "learning_rate": 0.0005039671596715517, + "loss": 3.0681, + "step": 16040 + }, + { + "epoch": 0.79, + "grad_norm": 0.5172383785247803, + "learning_rate": 0.0005039558691668378, + "loss": 3.0283, + "step": 16041 + }, + { + "epoch": 0.79, + "grad_norm": 0.5042517185211182, + "learning_rate": 0.0005039445781249435, + "loss": 3.1366, + "step": 16042 + }, + { + "epoch": 0.79, + "grad_norm": 0.546276330947876, + "learning_rate": 0.0005039332865458983, + "loss": 3.0346, + "step": 16043 + }, + { + "epoch": 0.79, + "grad_norm": 0.521732747554779, + "learning_rate": 0.0005039219944297321, + "loss": 3.2085, + "step": 16044 + }, + { + "epoch": 0.79, + "grad_norm": 0.46991196274757385, + "learning_rate": 0.0005039107017764747, + "loss": 3.128, + "step": 16045 + }, + { + "epoch": 0.79, + "grad_norm": 0.5190024971961975, + "learning_rate": 0.0005038994085861556, + "loss": 3.1441, + "step": 16046 + }, + { + "epoch": 0.79, + "grad_norm": 0.5141576528549194, + "learning_rate": 0.0005038881148588048, + "loss": 2.9767, + "step": 16047 + }, + { + "epoch": 0.79, + "grad_norm": 0.5428609848022461, + "learning_rate": 0.000503876820594452, + "loss": 3.0003, + "step": 16048 + }, + { + "epoch": 0.79, + "grad_norm": 0.5093159675598145, + "learning_rate": 0.0005038655257931269, + "loss": 3.1081, + "step": 16049 + }, + { + "epoch": 0.79, + "grad_norm": 0.5302721858024597, + "learning_rate": 0.0005038542304548591, + "loss": 3.2037, + "step": 16050 + }, + { + "epoch": 0.79, + "grad_norm": 0.526119589805603, + "learning_rate": 0.0005038429345796785, + "loss": 3.4543, + "step": 16051 + }, + { + "epoch": 0.79, + "grad_norm": 0.5228065848350525, + "learning_rate": 0.0005038316381676149, + "loss": 3.1163, + "step": 16052 + }, + { + "epoch": 0.79, + "grad_norm": 0.5292603969573975, + "learning_rate": 0.000503820341218698, + "loss": 3.2189, + "step": 16053 + }, + { + "epoch": 0.79, + "grad_norm": 0.5633078813552856, + "learning_rate": 0.0005038090437329575, + "loss": 3.1288, + "step": 16054 + }, + { + "epoch": 0.79, + "grad_norm": 0.5860145092010498, + "learning_rate": 0.0005037977457104233, + "loss": 3.2443, + "step": 16055 + }, + { + "epoch": 0.79, + "grad_norm": 0.614590048789978, + "learning_rate": 0.000503786447151125, + "loss": 3.0993, + "step": 16056 + }, + { + "epoch": 0.79, + "grad_norm": 0.502687394618988, + "learning_rate": 0.0005037751480550924, + "loss": 3.015, + "step": 16057 + }, + { + "epoch": 0.79, + "grad_norm": 0.5312213897705078, + "learning_rate": 0.0005037638484223553, + "loss": 3.0935, + "step": 16058 + }, + { + "epoch": 0.79, + "grad_norm": 0.5091737508773804, + "learning_rate": 0.0005037525482529435, + "loss": 3.1461, + "step": 16059 + }, + { + "epoch": 0.79, + "grad_norm": 0.5732948780059814, + "learning_rate": 0.0005037412475468865, + "loss": 3.1839, + "step": 16060 + }, + { + "epoch": 0.79, + "grad_norm": 0.5322034955024719, + "learning_rate": 0.0005037299463042144, + "loss": 3.2584, + "step": 16061 + }, + { + "epoch": 0.79, + "grad_norm": 0.5124638080596924, + "learning_rate": 0.0005037186445249568, + "loss": 3.2116, + "step": 16062 + }, + { + "epoch": 0.79, + "grad_norm": 0.5579886436462402, + "learning_rate": 0.0005037073422091436, + "loss": 3.1975, + "step": 16063 + }, + { + "epoch": 0.79, + "grad_norm": 0.5009481310844421, + "learning_rate": 0.0005036960393568044, + "loss": 3.136, + "step": 16064 + }, + { + "epoch": 0.79, + "grad_norm": 0.5488647818565369, + "learning_rate": 0.000503684735967969, + "loss": 3.0356, + "step": 16065 + }, + { + "epoch": 0.79, + "grad_norm": 0.5193225145339966, + "learning_rate": 0.0005036734320426672, + "loss": 3.2853, + "step": 16066 + }, + { + "epoch": 0.79, + "grad_norm": 0.5488074421882629, + "learning_rate": 0.0005036621275809288, + "loss": 3.1645, + "step": 16067 + }, + { + "epoch": 0.79, + "grad_norm": 0.6234692335128784, + "learning_rate": 0.0005036508225827836, + "loss": 2.9019, + "step": 16068 + }, + { + "epoch": 0.79, + "grad_norm": 0.48300352692604065, + "learning_rate": 0.0005036395170482613, + "loss": 3.0794, + "step": 16069 + }, + { + "epoch": 0.79, + "grad_norm": 0.5437573194503784, + "learning_rate": 0.0005036282109773917, + "loss": 3.2105, + "step": 16070 + }, + { + "epoch": 0.79, + "grad_norm": 0.6625828742980957, + "learning_rate": 0.0005036169043702045, + "loss": 3.2494, + "step": 16071 + }, + { + "epoch": 0.79, + "grad_norm": 0.6107515096664429, + "learning_rate": 0.0005036055972267296, + "loss": 3.2419, + "step": 16072 + }, + { + "epoch": 0.79, + "grad_norm": 0.5302785038948059, + "learning_rate": 0.0005035942895469967, + "loss": 3.1094, + "step": 16073 + }, + { + "epoch": 0.79, + "grad_norm": 0.6154295802116394, + "learning_rate": 0.0005035829813310358, + "loss": 3.1699, + "step": 16074 + }, + { + "epoch": 0.79, + "grad_norm": 0.6206402778625488, + "learning_rate": 0.0005035716725788763, + "loss": 3.1178, + "step": 16075 + }, + { + "epoch": 0.79, + "grad_norm": 0.52013099193573, + "learning_rate": 0.0005035603632905484, + "loss": 3.0002, + "step": 16076 + }, + { + "epoch": 0.79, + "grad_norm": 0.4945783019065857, + "learning_rate": 0.0005035490534660816, + "loss": 3.071, + "step": 16077 + }, + { + "epoch": 0.79, + "grad_norm": 0.509825587272644, + "learning_rate": 0.0005035377431055057, + "loss": 3.2746, + "step": 16078 + }, + { + "epoch": 0.79, + "grad_norm": 0.49139195680618286, + "learning_rate": 0.0005035264322088505, + "loss": 3.2474, + "step": 16079 + }, + { + "epoch": 0.79, + "grad_norm": 0.5229079127311707, + "learning_rate": 0.000503515120776146, + "loss": 3.2264, + "step": 16080 + }, + { + "epoch": 0.79, + "grad_norm": 0.5331903696060181, + "learning_rate": 0.0005035038088074217, + "loss": 3.4112, + "step": 16081 + }, + { + "epoch": 0.79, + "grad_norm": 0.5611699223518372, + "learning_rate": 0.0005034924963027077, + "loss": 3.0743, + "step": 16082 + }, + { + "epoch": 0.79, + "grad_norm": 0.504478394985199, + "learning_rate": 0.0005034811832620335, + "loss": 3.1197, + "step": 16083 + }, + { + "epoch": 0.79, + "grad_norm": 0.5261973142623901, + "learning_rate": 0.0005034698696854291, + "loss": 3.1517, + "step": 16084 + }, + { + "epoch": 0.79, + "grad_norm": 0.5199773907661438, + "learning_rate": 0.0005034585555729243, + "loss": 3.3782, + "step": 16085 + }, + { + "epoch": 0.79, + "grad_norm": 0.4901329576969147, + "learning_rate": 0.0005034472409245485, + "loss": 3.1992, + "step": 16086 + }, + { + "epoch": 0.79, + "grad_norm": 0.5331196784973145, + "learning_rate": 0.0005034359257403322, + "loss": 3.1904, + "step": 16087 + }, + { + "epoch": 0.79, + "grad_norm": 0.5385162830352783, + "learning_rate": 0.0005034246100203046, + "loss": 2.9416, + "step": 16088 + }, + { + "epoch": 0.79, + "grad_norm": 0.5102599263191223, + "learning_rate": 0.0005034132937644958, + "loss": 3.332, + "step": 16089 + }, + { + "epoch": 0.79, + "grad_norm": 0.5430083274841309, + "learning_rate": 0.0005034019769729355, + "loss": 3.0467, + "step": 16090 + }, + { + "epoch": 0.79, + "grad_norm": 0.563103437423706, + "learning_rate": 0.0005033906596456535, + "loss": 3.1664, + "step": 16091 + }, + { + "epoch": 0.79, + "grad_norm": 0.5284902453422546, + "learning_rate": 0.0005033793417826797, + "loss": 3.4442, + "step": 16092 + }, + { + "epoch": 0.79, + "grad_norm": 0.477546364068985, + "learning_rate": 0.0005033680233840439, + "loss": 3.2581, + "step": 16093 + }, + { + "epoch": 0.79, + "grad_norm": 0.5075785517692566, + "learning_rate": 0.0005033567044497758, + "loss": 3.1257, + "step": 16094 + }, + { + "epoch": 0.79, + "grad_norm": 0.5229860544204712, + "learning_rate": 0.0005033453849799053, + "loss": 3.2537, + "step": 16095 + }, + { + "epoch": 0.79, + "grad_norm": 0.5341659188270569, + "learning_rate": 0.0005033340649744621, + "loss": 3.09, + "step": 16096 + }, + { + "epoch": 0.79, + "grad_norm": 0.5053170323371887, + "learning_rate": 0.0005033227444334763, + "loss": 3.4498, + "step": 16097 + }, + { + "epoch": 0.79, + "grad_norm": 0.5055860877037048, + "learning_rate": 0.0005033114233569774, + "loss": 2.9695, + "step": 16098 + }, + { + "epoch": 0.79, + "grad_norm": 0.4993264377117157, + "learning_rate": 0.0005033001017449953, + "loss": 3.0594, + "step": 16099 + }, + { + "epoch": 0.79, + "grad_norm": 0.49798890948295593, + "learning_rate": 0.00050328877959756, + "loss": 3.2009, + "step": 16100 + }, + { + "epoch": 0.79, + "grad_norm": 0.5068756341934204, + "learning_rate": 0.000503277456914701, + "loss": 3.3866, + "step": 16101 + }, + { + "epoch": 0.79, + "grad_norm": 0.5351049900054932, + "learning_rate": 0.0005032661336964484, + "loss": 3.0999, + "step": 16102 + }, + { + "epoch": 0.79, + "grad_norm": 0.49359703063964844, + "learning_rate": 0.000503254809942832, + "loss": 3.1928, + "step": 16103 + }, + { + "epoch": 0.79, + "grad_norm": 0.526689887046814, + "learning_rate": 0.0005032434856538814, + "loss": 3.1952, + "step": 16104 + }, + { + "epoch": 0.79, + "grad_norm": 0.5129401683807373, + "learning_rate": 0.0005032321608296266, + "loss": 2.8986, + "step": 16105 + }, + { + "epoch": 0.79, + "grad_norm": 0.5190664529800415, + "learning_rate": 0.0005032208354700974, + "loss": 3.1921, + "step": 16106 + }, + { + "epoch": 0.79, + "grad_norm": 0.5858680605888367, + "learning_rate": 0.0005032095095753237, + "loss": 3.2601, + "step": 16107 + }, + { + "epoch": 0.79, + "grad_norm": 0.5072818994522095, + "learning_rate": 0.0005031981831453352, + "loss": 3.1682, + "step": 16108 + }, + { + "epoch": 0.79, + "grad_norm": 0.5385854244232178, + "learning_rate": 0.0005031868561801618, + "loss": 3.2302, + "step": 16109 + }, + { + "epoch": 0.79, + "grad_norm": 0.49237823486328125, + "learning_rate": 0.0005031755286798333, + "loss": 3.4822, + "step": 16110 + }, + { + "epoch": 0.79, + "grad_norm": 0.5026249885559082, + "learning_rate": 0.0005031642006443796, + "loss": 3.3075, + "step": 16111 + }, + { + "epoch": 0.79, + "grad_norm": 0.5177112817764282, + "learning_rate": 0.0005031528720738306, + "loss": 3.2019, + "step": 16112 + }, + { + "epoch": 0.79, + "grad_norm": 0.5290037989616394, + "learning_rate": 0.0005031415429682159, + "loss": 3.0193, + "step": 16113 + }, + { + "epoch": 0.79, + "grad_norm": 0.5207205414772034, + "learning_rate": 0.0005031302133275655, + "loss": 2.7481, + "step": 16114 + }, + { + "epoch": 0.79, + "grad_norm": 0.5251289010047913, + "learning_rate": 0.0005031188831519091, + "loss": 3.2789, + "step": 16115 + }, + { + "epoch": 0.79, + "grad_norm": 0.5037059187889099, + "learning_rate": 0.0005031075524412769, + "loss": 3.1427, + "step": 16116 + }, + { + "epoch": 0.79, + "grad_norm": 0.509303629398346, + "learning_rate": 0.0005030962211956982, + "loss": 3.4196, + "step": 16117 + }, + { + "epoch": 0.79, + "grad_norm": 0.47990882396698, + "learning_rate": 0.0005030848894152034, + "loss": 3.1945, + "step": 16118 + }, + { + "epoch": 0.79, + "grad_norm": 0.5095180869102478, + "learning_rate": 0.0005030735570998219, + "loss": 3.3179, + "step": 16119 + }, + { + "epoch": 0.79, + "grad_norm": 0.5448658466339111, + "learning_rate": 0.0005030622242495839, + "loss": 3.3563, + "step": 16120 + }, + { + "epoch": 0.79, + "grad_norm": 0.5198306441307068, + "learning_rate": 0.000503050890864519, + "loss": 3.1658, + "step": 16121 + }, + { + "epoch": 0.79, + "grad_norm": 0.5410298109054565, + "learning_rate": 0.0005030395569446571, + "loss": 3.2453, + "step": 16122 + }, + { + "epoch": 0.79, + "grad_norm": 0.5249944925308228, + "learning_rate": 0.0005030282224900281, + "loss": 3.1404, + "step": 16123 + }, + { + "epoch": 0.79, + "grad_norm": 0.5237621068954468, + "learning_rate": 0.0005030168875006618, + "loss": 3.1242, + "step": 16124 + }, + { + "epoch": 0.79, + "grad_norm": 0.5496729016304016, + "learning_rate": 0.0005030055519765882, + "loss": 3.0199, + "step": 16125 + }, + { + "epoch": 0.79, + "grad_norm": 0.5271469950675964, + "learning_rate": 0.000502994215917837, + "loss": 3.123, + "step": 16126 + }, + { + "epoch": 0.79, + "grad_norm": 0.5577632188796997, + "learning_rate": 0.000502982879324438, + "loss": 3.1257, + "step": 16127 + }, + { + "epoch": 0.79, + "grad_norm": 0.498698353767395, + "learning_rate": 0.0005029715421964213, + "loss": 3.3184, + "step": 16128 + }, + { + "epoch": 0.79, + "grad_norm": 0.544201135635376, + "learning_rate": 0.0005029602045338166, + "loss": 2.9086, + "step": 16129 + }, + { + "epoch": 0.79, + "grad_norm": 0.5301446914672852, + "learning_rate": 0.0005029488663366538, + "loss": 3.4906, + "step": 16130 + }, + { + "epoch": 0.79, + "grad_norm": 0.5524643063545227, + "learning_rate": 0.0005029375276049626, + "loss": 3.4142, + "step": 16131 + }, + { + "epoch": 0.79, + "grad_norm": 0.5376681685447693, + "learning_rate": 0.0005029261883387732, + "loss": 3.254, + "step": 16132 + }, + { + "epoch": 0.79, + "grad_norm": 0.5098219513893127, + "learning_rate": 0.000502914848538115, + "loss": 3.3139, + "step": 16133 + }, + { + "epoch": 0.79, + "grad_norm": 0.5753784775733948, + "learning_rate": 0.0005029035082030184, + "loss": 3.2448, + "step": 16134 + }, + { + "epoch": 0.79, + "grad_norm": 0.49853768944740295, + "learning_rate": 0.0005028921673335129, + "loss": 3.2964, + "step": 16135 + }, + { + "epoch": 0.79, + "grad_norm": 0.5184651017189026, + "learning_rate": 0.0005028808259296285, + "loss": 3.2924, + "step": 16136 + }, + { + "epoch": 0.79, + "grad_norm": 0.5892463326454163, + "learning_rate": 0.000502869483991395, + "loss": 3.0275, + "step": 16137 + }, + { + "epoch": 0.79, + "grad_norm": 0.4991600811481476, + "learning_rate": 0.0005028581415188425, + "loss": 3.1359, + "step": 16138 + }, + { + "epoch": 0.79, + "grad_norm": 0.49838724732398987, + "learning_rate": 0.0005028467985120006, + "loss": 3.1147, + "step": 16139 + }, + { + "epoch": 0.79, + "grad_norm": 0.5089886784553528, + "learning_rate": 0.0005028354549708992, + "loss": 2.9585, + "step": 16140 + }, + { + "epoch": 0.79, + "grad_norm": 0.52250075340271, + "learning_rate": 0.0005028241108955683, + "loss": 3.3139, + "step": 16141 + }, + { + "epoch": 0.79, + "grad_norm": 0.5590442419052124, + "learning_rate": 0.0005028127662860376, + "loss": 3.0677, + "step": 16142 + }, + { + "epoch": 0.79, + "grad_norm": 0.5002887845039368, + "learning_rate": 0.0005028014211423372, + "loss": 3.018, + "step": 16143 + }, + { + "epoch": 0.79, + "grad_norm": 0.5314575433731079, + "learning_rate": 0.000502790075464497, + "loss": 3.2391, + "step": 16144 + }, + { + "epoch": 0.79, + "grad_norm": 0.4909334182739258, + "learning_rate": 0.0005027787292525468, + "loss": 3.0808, + "step": 16145 + }, + { + "epoch": 0.79, + "grad_norm": 0.5140949487686157, + "learning_rate": 0.0005027673825065163, + "loss": 3.1941, + "step": 16146 + }, + { + "epoch": 0.79, + "grad_norm": 0.49293190240859985, + "learning_rate": 0.0005027560352264355, + "loss": 3.0625, + "step": 16147 + }, + { + "epoch": 0.79, + "grad_norm": 0.5335683822631836, + "learning_rate": 0.0005027446874123344, + "loss": 3.1745, + "step": 16148 + }, + { + "epoch": 0.79, + "grad_norm": 0.5320890545845032, + "learning_rate": 0.0005027333390642428, + "loss": 3.0896, + "step": 16149 + }, + { + "epoch": 0.79, + "grad_norm": 0.5180721879005432, + "learning_rate": 0.0005027219901821906, + "loss": 3.2275, + "step": 16150 + }, + { + "epoch": 0.79, + "grad_norm": 0.5077602863311768, + "learning_rate": 0.0005027106407662077, + "loss": 2.9512, + "step": 16151 + }, + { + "epoch": 0.79, + "grad_norm": 0.5526420474052429, + "learning_rate": 0.0005026992908163239, + "loss": 3.0982, + "step": 16152 + }, + { + "epoch": 0.79, + "grad_norm": 0.5293105840682983, + "learning_rate": 0.0005026879403325693, + "loss": 3.2531, + "step": 16153 + }, + { + "epoch": 0.79, + "grad_norm": 0.4970487058162689, + "learning_rate": 0.0005026765893149736, + "loss": 3.2379, + "step": 16154 + }, + { + "epoch": 0.79, + "grad_norm": 0.523263156414032, + "learning_rate": 0.0005026652377635669, + "loss": 3.3632, + "step": 16155 + }, + { + "epoch": 0.79, + "grad_norm": 0.5170415639877319, + "learning_rate": 0.0005026538856783789, + "loss": 3.139, + "step": 16156 + }, + { + "epoch": 0.79, + "grad_norm": 0.48945000767707825, + "learning_rate": 0.0005026425330594394, + "loss": 3.1406, + "step": 16157 + }, + { + "epoch": 0.79, + "grad_norm": 0.5514994859695435, + "learning_rate": 0.0005026311799067786, + "loss": 2.9625, + "step": 16158 + }, + { + "epoch": 0.79, + "grad_norm": 0.5217519402503967, + "learning_rate": 0.0005026198262204263, + "loss": 3.0109, + "step": 16159 + }, + { + "epoch": 0.79, + "grad_norm": 0.5658161044120789, + "learning_rate": 0.0005026084720004125, + "loss": 2.9256, + "step": 16160 + }, + { + "epoch": 0.79, + "grad_norm": 0.5193248391151428, + "learning_rate": 0.0005025971172467667, + "loss": 3.2494, + "step": 16161 + }, + { + "epoch": 0.79, + "grad_norm": 0.5272179841995239, + "learning_rate": 0.0005025857619595193, + "loss": 3.312, + "step": 16162 + }, + { + "epoch": 0.79, + "grad_norm": 0.4956285059452057, + "learning_rate": 0.0005025744061386998, + "loss": 3.2104, + "step": 16163 + }, + { + "epoch": 0.79, + "grad_norm": 0.521141767501831, + "learning_rate": 0.0005025630497843385, + "loss": 2.9773, + "step": 16164 + }, + { + "epoch": 0.79, + "grad_norm": 0.525787353515625, + "learning_rate": 0.000502551692896465, + "loss": 3.2225, + "step": 16165 + }, + { + "epoch": 0.79, + "grad_norm": 0.5088430643081665, + "learning_rate": 0.0005025403354751093, + "loss": 3.1288, + "step": 16166 + }, + { + "epoch": 0.79, + "grad_norm": 0.5418877005577087, + "learning_rate": 0.0005025289775203013, + "loss": 3.2467, + "step": 16167 + }, + { + "epoch": 0.79, + "grad_norm": 0.5093033313751221, + "learning_rate": 0.0005025176190320711, + "loss": 3.4795, + "step": 16168 + }, + { + "epoch": 0.79, + "grad_norm": 0.5372544527053833, + "learning_rate": 0.0005025062600104483, + "loss": 3.1049, + "step": 16169 + }, + { + "epoch": 0.79, + "grad_norm": 0.5223720669746399, + "learning_rate": 0.0005024949004554632, + "loss": 3.1735, + "step": 16170 + }, + { + "epoch": 0.79, + "grad_norm": 0.5225620865821838, + "learning_rate": 0.0005024835403671453, + "loss": 3.1505, + "step": 16171 + }, + { + "epoch": 0.79, + "grad_norm": 0.478676974773407, + "learning_rate": 0.0005024721797455249, + "loss": 3.1069, + "step": 16172 + }, + { + "epoch": 0.79, + "grad_norm": 0.5180579423904419, + "learning_rate": 0.0005024608185906315, + "loss": 3.0396, + "step": 16173 + }, + { + "epoch": 0.79, + "grad_norm": 0.5308378338813782, + "learning_rate": 0.0005024494569024954, + "loss": 3.204, + "step": 16174 + }, + { + "epoch": 0.79, + "grad_norm": 0.5326209664344788, + "learning_rate": 0.0005024380946811464, + "loss": 3.0794, + "step": 16175 + }, + { + "epoch": 0.79, + "grad_norm": 0.5528416037559509, + "learning_rate": 0.0005024267319266144, + "loss": 3.3221, + "step": 16176 + }, + { + "epoch": 0.79, + "grad_norm": 0.5260026454925537, + "learning_rate": 0.0005024153686389295, + "loss": 2.9779, + "step": 16177 + }, + { + "epoch": 0.79, + "grad_norm": 0.5304065346717834, + "learning_rate": 0.0005024040048181213, + "loss": 3.1639, + "step": 16178 + }, + { + "epoch": 0.79, + "grad_norm": 0.5245885252952576, + "learning_rate": 0.0005023926404642199, + "loss": 3.2939, + "step": 16179 + }, + { + "epoch": 0.79, + "grad_norm": 0.47927168011665344, + "learning_rate": 0.0005023812755772553, + "loss": 3.2998, + "step": 16180 + }, + { + "epoch": 0.79, + "grad_norm": 0.5522112250328064, + "learning_rate": 0.0005023699101572574, + "loss": 3.3079, + "step": 16181 + }, + { + "epoch": 0.79, + "grad_norm": 0.47725212574005127, + "learning_rate": 0.000502358544204256, + "loss": 3.0599, + "step": 16182 + }, + { + "epoch": 0.79, + "grad_norm": 0.5390349626541138, + "learning_rate": 0.0005023471777182813, + "loss": 3.2281, + "step": 16183 + }, + { + "epoch": 0.79, + "grad_norm": 0.5163893699645996, + "learning_rate": 0.0005023358106993629, + "loss": 2.8327, + "step": 16184 + }, + { + "epoch": 0.79, + "grad_norm": 0.5547180771827698, + "learning_rate": 0.000502324443147531, + "loss": 3.1906, + "step": 16185 + }, + { + "epoch": 0.79, + "grad_norm": 0.5237986445426941, + "learning_rate": 0.0005023130750628154, + "loss": 3.336, + "step": 16186 + }, + { + "epoch": 0.79, + "grad_norm": 0.493787944316864, + "learning_rate": 0.0005023017064452462, + "loss": 3.0081, + "step": 16187 + }, + { + "epoch": 0.79, + "grad_norm": 0.5115264654159546, + "learning_rate": 0.0005022903372948531, + "loss": 3.181, + "step": 16188 + }, + { + "epoch": 0.79, + "grad_norm": 0.5147324204444885, + "learning_rate": 0.0005022789676116663, + "loss": 2.998, + "step": 16189 + }, + { + "epoch": 0.79, + "grad_norm": 0.5190193057060242, + "learning_rate": 0.0005022675973957156, + "loss": 3.2676, + "step": 16190 + }, + { + "epoch": 0.79, + "grad_norm": 0.522584855556488, + "learning_rate": 0.000502256226647031, + "loss": 3.1966, + "step": 16191 + }, + { + "epoch": 0.79, + "grad_norm": 0.5173296332359314, + "learning_rate": 0.0005022448553656424, + "loss": 3.175, + "step": 16192 + }, + { + "epoch": 0.79, + "grad_norm": 0.5209254622459412, + "learning_rate": 0.0005022334835515797, + "loss": 3.113, + "step": 16193 + }, + { + "epoch": 0.79, + "grad_norm": 0.5002453923225403, + "learning_rate": 0.000502222111204873, + "loss": 3.1706, + "step": 16194 + }, + { + "epoch": 0.79, + "grad_norm": 0.5135509371757507, + "learning_rate": 0.0005022107383255522, + "loss": 3.2399, + "step": 16195 + }, + { + "epoch": 0.79, + "grad_norm": 0.5734914541244507, + "learning_rate": 0.0005021993649136472, + "loss": 2.9791, + "step": 16196 + }, + { + "epoch": 0.79, + "grad_norm": 0.5258508324623108, + "learning_rate": 0.000502187990969188, + "loss": 2.8981, + "step": 16197 + }, + { + "epoch": 0.79, + "grad_norm": 0.528937816619873, + "learning_rate": 0.0005021766164922044, + "loss": 2.9982, + "step": 16198 + }, + { + "epoch": 0.79, + "grad_norm": 0.559468686580658, + "learning_rate": 0.0005021652414827268, + "loss": 3.1467, + "step": 16199 + }, + { + "epoch": 0.79, + "grad_norm": 0.5440381169319153, + "learning_rate": 0.0005021538659407845, + "loss": 3.1731, + "step": 16200 + }, + { + "epoch": 0.79, + "grad_norm": 0.5404337048530579, + "learning_rate": 0.0005021424898664081, + "loss": 3.2169, + "step": 16201 + }, + { + "epoch": 0.79, + "grad_norm": 0.5026121735572815, + "learning_rate": 0.0005021311132596272, + "loss": 3.211, + "step": 16202 + }, + { + "epoch": 0.79, + "grad_norm": 0.5181695818901062, + "learning_rate": 0.0005021197361204719, + "loss": 3.1374, + "step": 16203 + }, + { + "epoch": 0.79, + "grad_norm": 0.5274845361709595, + "learning_rate": 0.0005021083584489719, + "loss": 3.118, + "step": 16204 + }, + { + "epoch": 0.79, + "grad_norm": 0.5087721943855286, + "learning_rate": 0.0005020969802451577, + "loss": 3.1625, + "step": 16205 + }, + { + "epoch": 0.79, + "grad_norm": 0.4713067412376404, + "learning_rate": 0.0005020856015090587, + "loss": 3.0954, + "step": 16206 + }, + { + "epoch": 0.79, + "grad_norm": 0.5224311351776123, + "learning_rate": 0.0005020742222407053, + "loss": 3.1967, + "step": 16207 + }, + { + "epoch": 0.79, + "grad_norm": 0.5140992403030396, + "learning_rate": 0.0005020628424401272, + "loss": 3.3444, + "step": 16208 + }, + { + "epoch": 0.79, + "grad_norm": 0.568848729133606, + "learning_rate": 0.0005020514621073544, + "loss": 3.2186, + "step": 16209 + }, + { + "epoch": 0.79, + "grad_norm": 0.5110242962837219, + "learning_rate": 0.000502040081242417, + "loss": 3.1196, + "step": 16210 + }, + { + "epoch": 0.79, + "grad_norm": 0.5247518420219421, + "learning_rate": 0.0005020286998453449, + "loss": 3.2261, + "step": 16211 + }, + { + "epoch": 0.79, + "grad_norm": 0.4877007007598877, + "learning_rate": 0.0005020173179161681, + "loss": 3.0439, + "step": 16212 + }, + { + "epoch": 0.79, + "grad_norm": 0.6002936363220215, + "learning_rate": 0.0005020059354549166, + "loss": 3.2361, + "step": 16213 + }, + { + "epoch": 0.79, + "grad_norm": 0.5274139642715454, + "learning_rate": 0.0005019945524616203, + "loss": 2.848, + "step": 16214 + }, + { + "epoch": 0.79, + "grad_norm": 0.4768544137477875, + "learning_rate": 0.0005019831689363091, + "loss": 3.186, + "step": 16215 + }, + { + "epoch": 0.79, + "grad_norm": 0.7357099652290344, + "learning_rate": 0.0005019717848790133, + "loss": 3.1149, + "step": 16216 + }, + { + "epoch": 0.79, + "grad_norm": 0.4986879825592041, + "learning_rate": 0.0005019604002897626, + "loss": 3.2408, + "step": 16217 + }, + { + "epoch": 0.79, + "grad_norm": 0.5395023822784424, + "learning_rate": 0.0005019490151685871, + "loss": 3.1948, + "step": 16218 + }, + { + "epoch": 0.79, + "grad_norm": 0.5745142102241516, + "learning_rate": 0.0005019376295155167, + "loss": 3.1383, + "step": 16219 + }, + { + "epoch": 0.79, + "grad_norm": 0.5301345586776733, + "learning_rate": 0.0005019262433305815, + "loss": 3.2496, + "step": 16220 + }, + { + "epoch": 0.79, + "grad_norm": 0.4867192208766937, + "learning_rate": 0.0005019148566138115, + "loss": 3.2222, + "step": 16221 + }, + { + "epoch": 0.8, + "grad_norm": 0.5403074622154236, + "learning_rate": 0.0005019034693652365, + "loss": 3.1839, + "step": 16222 + }, + { + "epoch": 0.8, + "grad_norm": 0.5404332280158997, + "learning_rate": 0.0005018920815848866, + "loss": 3.1374, + "step": 16223 + }, + { + "epoch": 0.8, + "grad_norm": 0.49627020955085754, + "learning_rate": 0.0005018806932727919, + "loss": 3.1589, + "step": 16224 + }, + { + "epoch": 0.8, + "grad_norm": 0.516038179397583, + "learning_rate": 0.0005018693044289823, + "loss": 3.1485, + "step": 16225 + }, + { + "epoch": 0.8, + "grad_norm": 0.49638882279396057, + "learning_rate": 0.0005018579150534878, + "loss": 3.1552, + "step": 16226 + }, + { + "epoch": 0.8, + "grad_norm": 0.4918712079524994, + "learning_rate": 0.0005018465251463384, + "loss": 3.3042, + "step": 16227 + }, + { + "epoch": 0.8, + "grad_norm": 0.4965533912181854, + "learning_rate": 0.0005018351347075641, + "loss": 2.9956, + "step": 16228 + }, + { + "epoch": 0.8, + "grad_norm": 0.49797412753105164, + "learning_rate": 0.0005018237437371949, + "loss": 3.0222, + "step": 16229 + }, + { + "epoch": 0.8, + "grad_norm": 0.5310698747634888, + "learning_rate": 0.0005018123522352607, + "loss": 3.1327, + "step": 16230 + }, + { + "epoch": 0.8, + "grad_norm": 0.4971972107887268, + "learning_rate": 0.0005018009602017918, + "loss": 3.1433, + "step": 16231 + }, + { + "epoch": 0.8, + "grad_norm": 0.5360703468322754, + "learning_rate": 0.0005017895676368179, + "loss": 3.1121, + "step": 16232 + }, + { + "epoch": 0.8, + "grad_norm": 0.5207992196083069, + "learning_rate": 0.0005017781745403692, + "loss": 2.9475, + "step": 16233 + }, + { + "epoch": 0.8, + "grad_norm": 0.49780797958374023, + "learning_rate": 0.0005017667809124755, + "loss": 3.2944, + "step": 16234 + }, + { + "epoch": 0.8, + "grad_norm": 0.5747670531272888, + "learning_rate": 0.0005017553867531669, + "loss": 3.1795, + "step": 16235 + }, + { + "epoch": 0.8, + "grad_norm": 0.5032791495323181, + "learning_rate": 0.0005017439920624735, + "loss": 3.4004, + "step": 16236 + }, + { + "epoch": 0.8, + "grad_norm": 0.5304082036018372, + "learning_rate": 0.0005017325968404252, + "loss": 3.2782, + "step": 16237 + }, + { + "epoch": 0.8, + "grad_norm": 0.5240580439567566, + "learning_rate": 0.0005017212010870522, + "loss": 2.8734, + "step": 16238 + }, + { + "epoch": 0.8, + "grad_norm": 0.4911217987537384, + "learning_rate": 0.0005017098048023843, + "loss": 3.3212, + "step": 16239 + }, + { + "epoch": 0.8, + "grad_norm": 0.5378339290618896, + "learning_rate": 0.0005016984079864516, + "loss": 3.1136, + "step": 16240 + }, + { + "epoch": 0.8, + "grad_norm": 0.5092733502388, + "learning_rate": 0.0005016870106392841, + "loss": 3.0082, + "step": 16241 + }, + { + "epoch": 0.8, + "grad_norm": 0.5283250212669373, + "learning_rate": 0.0005016756127609118, + "loss": 3.0912, + "step": 16242 + }, + { + "epoch": 0.8, + "grad_norm": 0.5703310966491699, + "learning_rate": 0.0005016642143513649, + "loss": 3.0603, + "step": 16243 + }, + { + "epoch": 0.8, + "grad_norm": 0.48841413855552673, + "learning_rate": 0.0005016528154106731, + "loss": 3.4069, + "step": 16244 + }, + { + "epoch": 0.8, + "grad_norm": 0.634024977684021, + "learning_rate": 0.0005016414159388667, + "loss": 3.2679, + "step": 16245 + }, + { + "epoch": 0.8, + "grad_norm": 0.497964084148407, + "learning_rate": 0.0005016300159359755, + "loss": 3.315, + "step": 16246 + }, + { + "epoch": 0.8, + "grad_norm": 0.5449110865592957, + "learning_rate": 0.0005016186154020297, + "loss": 3.36, + "step": 16247 + }, + { + "epoch": 0.8, + "grad_norm": 0.5338404774665833, + "learning_rate": 0.0005016072143370593, + "loss": 3.0915, + "step": 16248 + }, + { + "epoch": 0.8, + "grad_norm": 0.5039170980453491, + "learning_rate": 0.0005015958127410942, + "loss": 3.1784, + "step": 16249 + }, + { + "epoch": 0.8, + "grad_norm": 0.49808257818222046, + "learning_rate": 0.0005015844106141648, + "loss": 3.3481, + "step": 16250 + }, + { + "epoch": 0.8, + "grad_norm": 0.555994987487793, + "learning_rate": 0.0005015730079563006, + "loss": 3.2594, + "step": 16251 + }, + { + "epoch": 0.8, + "grad_norm": 0.5516330003738403, + "learning_rate": 0.0005015616047675319, + "loss": 3.1797, + "step": 16252 + }, + { + "epoch": 0.8, + "grad_norm": 0.523496150970459, + "learning_rate": 0.0005015502010478889, + "loss": 3.1193, + "step": 16253 + }, + { + "epoch": 0.8, + "grad_norm": 0.5190884470939636, + "learning_rate": 0.0005015387967974012, + "loss": 3.1007, + "step": 16254 + }, + { + "epoch": 0.8, + "grad_norm": 0.4918883740901947, + "learning_rate": 0.0005015273920160993, + "loss": 3.194, + "step": 16255 + }, + { + "epoch": 0.8, + "grad_norm": 0.5591009855270386, + "learning_rate": 0.0005015159867040129, + "loss": 3.0611, + "step": 16256 + }, + { + "epoch": 0.8, + "grad_norm": 0.4997139871120453, + "learning_rate": 0.0005015045808611723, + "loss": 3.2017, + "step": 16257 + }, + { + "epoch": 0.8, + "grad_norm": 0.63576740026474, + "learning_rate": 0.0005014931744876073, + "loss": 3.0073, + "step": 16258 + }, + { + "epoch": 0.8, + "grad_norm": 0.5021123290061951, + "learning_rate": 0.0005014817675833481, + "loss": 3.3267, + "step": 16259 + }, + { + "epoch": 0.8, + "grad_norm": 0.5300601720809937, + "learning_rate": 0.0005014703601484248, + "loss": 2.9822, + "step": 16260 + }, + { + "epoch": 0.8, + "grad_norm": 0.5239089131355286, + "learning_rate": 0.0005014589521828673, + "loss": 3.0789, + "step": 16261 + }, + { + "epoch": 0.8, + "grad_norm": 0.5151042938232422, + "learning_rate": 0.0005014475436867056, + "loss": 3.1529, + "step": 16262 + }, + { + "epoch": 0.8, + "grad_norm": 0.5113961696624756, + "learning_rate": 0.0005014361346599699, + "loss": 3.0487, + "step": 16263 + }, + { + "epoch": 0.8, + "grad_norm": 0.5840994119644165, + "learning_rate": 0.0005014247251026901, + "loss": 3.292, + "step": 16264 + }, + { + "epoch": 0.8, + "grad_norm": 0.52402663230896, + "learning_rate": 0.0005014133150148964, + "loss": 3.2253, + "step": 16265 + }, + { + "epoch": 0.8, + "grad_norm": 0.5494585037231445, + "learning_rate": 0.0005014019043966189, + "loss": 2.9822, + "step": 16266 + }, + { + "epoch": 0.8, + "grad_norm": 0.5364586114883423, + "learning_rate": 0.0005013904932478874, + "loss": 3.2012, + "step": 16267 + }, + { + "epoch": 0.8, + "grad_norm": 0.5051799416542053, + "learning_rate": 0.0005013790815687322, + "loss": 3.228, + "step": 16268 + }, + { + "epoch": 0.8, + "grad_norm": 0.5270349979400635, + "learning_rate": 0.0005013676693591832, + "loss": 3.1309, + "step": 16269 + }, + { + "epoch": 0.8, + "grad_norm": 0.523430585861206, + "learning_rate": 0.0005013562566192704, + "loss": 3.2644, + "step": 16270 + }, + { + "epoch": 0.8, + "grad_norm": 0.4938635528087616, + "learning_rate": 0.0005013448433490241, + "loss": 3.3449, + "step": 16271 + }, + { + "epoch": 0.8, + "grad_norm": 0.5350764989852905, + "learning_rate": 0.0005013334295484742, + "loss": 3.3254, + "step": 16272 + }, + { + "epoch": 0.8, + "grad_norm": 0.5360183715820312, + "learning_rate": 0.0005013220152176509, + "loss": 2.9552, + "step": 16273 + }, + { + "epoch": 0.8, + "grad_norm": 0.5047653913497925, + "learning_rate": 0.000501310600356584, + "loss": 3.172, + "step": 16274 + }, + { + "epoch": 0.8, + "grad_norm": 0.5344395041465759, + "learning_rate": 0.0005012991849653038, + "loss": 3.152, + "step": 16275 + }, + { + "epoch": 0.8, + "grad_norm": 0.5071520805358887, + "learning_rate": 0.0005012877690438403, + "loss": 3.2807, + "step": 16276 + }, + { + "epoch": 0.8, + "grad_norm": 0.501033365726471, + "learning_rate": 0.0005012763525922236, + "loss": 3.0155, + "step": 16277 + }, + { + "epoch": 0.8, + "grad_norm": 0.5294574499130249, + "learning_rate": 0.0005012649356104837, + "loss": 3.2542, + "step": 16278 + }, + { + "epoch": 0.8, + "grad_norm": 0.565493106842041, + "learning_rate": 0.0005012535180986507, + "loss": 2.9964, + "step": 16279 + }, + { + "epoch": 0.8, + "grad_norm": 0.5068030953407288, + "learning_rate": 0.0005012421000567545, + "loss": 3.2337, + "step": 16280 + }, + { + "epoch": 0.8, + "grad_norm": 0.5078257918357849, + "learning_rate": 0.0005012306814848255, + "loss": 3.2369, + "step": 16281 + }, + { + "epoch": 0.8, + "grad_norm": 0.5458744168281555, + "learning_rate": 0.0005012192623828935, + "loss": 3.4057, + "step": 16282 + }, + { + "epoch": 0.8, + "grad_norm": 0.5356884002685547, + "learning_rate": 0.0005012078427509888, + "loss": 3.2353, + "step": 16283 + }, + { + "epoch": 0.8, + "grad_norm": 0.5274999141693115, + "learning_rate": 0.0005011964225891414, + "loss": 3.2908, + "step": 16284 + }, + { + "epoch": 0.8, + "grad_norm": 0.5029778480529785, + "learning_rate": 0.0005011850018973813, + "loss": 3.164, + "step": 16285 + }, + { + "epoch": 0.8, + "grad_norm": 0.555999219417572, + "learning_rate": 0.0005011735806757384, + "loss": 3.0256, + "step": 16286 + }, + { + "epoch": 0.8, + "grad_norm": 0.48176196217536926, + "learning_rate": 0.0005011621589242433, + "loss": 3.2014, + "step": 16287 + }, + { + "epoch": 0.8, + "grad_norm": 0.49973785877227783, + "learning_rate": 0.0005011507366429257, + "loss": 3.2146, + "step": 16288 + }, + { + "epoch": 0.8, + "grad_norm": 0.537611722946167, + "learning_rate": 0.0005011393138318157, + "loss": 2.9839, + "step": 16289 + }, + { + "epoch": 0.8, + "grad_norm": 0.52414470911026, + "learning_rate": 0.0005011278904909437, + "loss": 3.2931, + "step": 16290 + }, + { + "epoch": 0.8, + "grad_norm": 0.5327220559120178, + "learning_rate": 0.0005011164666203392, + "loss": 3.276, + "step": 16291 + }, + { + "epoch": 0.8, + "grad_norm": 0.5477252006530762, + "learning_rate": 0.0005011050422200328, + "loss": 3.1397, + "step": 16292 + }, + { + "epoch": 0.8, + "grad_norm": 0.5243980884552002, + "learning_rate": 0.0005010936172900544, + "loss": 3.3518, + "step": 16293 + }, + { + "epoch": 0.8, + "grad_norm": 0.5618069171905518, + "learning_rate": 0.0005010821918304342, + "loss": 2.9984, + "step": 16294 + }, + { + "epoch": 0.8, + "grad_norm": 0.515941321849823, + "learning_rate": 0.0005010707658412021, + "loss": 3.1225, + "step": 16295 + }, + { + "epoch": 0.8, + "grad_norm": 0.5532481670379639, + "learning_rate": 0.0005010593393223883, + "loss": 3.0783, + "step": 16296 + }, + { + "epoch": 0.8, + "grad_norm": 0.4828968644142151, + "learning_rate": 0.0005010479122740229, + "loss": 3.0916, + "step": 16297 + }, + { + "epoch": 0.8, + "grad_norm": 0.5126880407333374, + "learning_rate": 0.000501036484696136, + "loss": 3.2333, + "step": 16298 + }, + { + "epoch": 0.8, + "grad_norm": 0.5703883171081543, + "learning_rate": 0.0005010250565887578, + "loss": 3.1723, + "step": 16299 + }, + { + "epoch": 0.8, + "grad_norm": 0.5786604881286621, + "learning_rate": 0.0005010136279519182, + "loss": 3.0571, + "step": 16300 + }, + { + "epoch": 0.8, + "grad_norm": 0.5473567843437195, + "learning_rate": 0.0005010021987856474, + "loss": 3.1133, + "step": 16301 + }, + { + "epoch": 0.8, + "grad_norm": 0.5086806416511536, + "learning_rate": 0.0005009907690899754, + "loss": 3.2605, + "step": 16302 + }, + { + "epoch": 0.8, + "grad_norm": 0.5256600975990295, + "learning_rate": 0.0005009793388649326, + "loss": 3.1623, + "step": 16303 + }, + { + "epoch": 0.8, + "grad_norm": 0.5034403800964355, + "learning_rate": 0.0005009679081105487, + "loss": 3.3153, + "step": 16304 + }, + { + "epoch": 0.8, + "grad_norm": 0.5190585851669312, + "learning_rate": 0.000500956476826854, + "loss": 3.1154, + "step": 16305 + }, + { + "epoch": 0.8, + "grad_norm": 0.501880407333374, + "learning_rate": 0.0005009450450138787, + "loss": 3.4226, + "step": 16306 + }, + { + "epoch": 0.8, + "grad_norm": 0.5461406707763672, + "learning_rate": 0.000500933612671653, + "loss": 3.1763, + "step": 16307 + }, + { + "epoch": 0.8, + "grad_norm": 0.5586167573928833, + "learning_rate": 0.0005009221798002065, + "loss": 3.1583, + "step": 16308 + }, + { + "epoch": 0.8, + "grad_norm": 0.5345634818077087, + "learning_rate": 0.0005009107463995699, + "loss": 3.35, + "step": 16309 + }, + { + "epoch": 0.8, + "grad_norm": 0.49737313389778137, + "learning_rate": 0.0005008993124697729, + "loss": 3.158, + "step": 16310 + }, + { + "epoch": 0.8, + "grad_norm": 0.5575215816497803, + "learning_rate": 0.0005008878780108459, + "loss": 3.1295, + "step": 16311 + }, + { + "epoch": 0.8, + "grad_norm": 0.5068386197090149, + "learning_rate": 0.0005008764430228188, + "loss": 3.2086, + "step": 16312 + }, + { + "epoch": 0.8, + "grad_norm": 0.5135799050331116, + "learning_rate": 0.0005008650075057218, + "loss": 3.4186, + "step": 16313 + }, + { + "epoch": 0.8, + "grad_norm": 0.50897216796875, + "learning_rate": 0.000500853571459585, + "loss": 3.1799, + "step": 16314 + }, + { + "epoch": 0.8, + "grad_norm": 0.4997579753398895, + "learning_rate": 0.0005008421348844387, + "loss": 3.0874, + "step": 16315 + }, + { + "epoch": 0.8, + "grad_norm": 0.5073021054267883, + "learning_rate": 0.0005008306977803128, + "loss": 3.1663, + "step": 16316 + }, + { + "epoch": 0.8, + "grad_norm": 0.502220630645752, + "learning_rate": 0.0005008192601472374, + "loss": 3.2532, + "step": 16317 + }, + { + "epoch": 0.8, + "grad_norm": 0.5047315359115601, + "learning_rate": 0.0005008078219852429, + "loss": 3.3159, + "step": 16318 + }, + { + "epoch": 0.8, + "grad_norm": 0.5216161012649536, + "learning_rate": 0.0005007963832943591, + "loss": 3.3052, + "step": 16319 + }, + { + "epoch": 0.8, + "grad_norm": 0.5240123867988586, + "learning_rate": 0.0005007849440746163, + "loss": 3.3936, + "step": 16320 + }, + { + "epoch": 0.8, + "grad_norm": 0.4939900040626526, + "learning_rate": 0.0005007735043260446, + "loss": 3.2465, + "step": 16321 + }, + { + "epoch": 0.8, + "grad_norm": 0.5109789967536926, + "learning_rate": 0.0005007620640486741, + "loss": 3.2318, + "step": 16322 + }, + { + "epoch": 0.8, + "grad_norm": 0.5235530138015747, + "learning_rate": 0.0005007506232425351, + "loss": 3.0924, + "step": 16323 + }, + { + "epoch": 0.8, + "grad_norm": 0.5005676746368408, + "learning_rate": 0.0005007391819076574, + "loss": 3.3931, + "step": 16324 + }, + { + "epoch": 0.8, + "grad_norm": 0.4928778409957886, + "learning_rate": 0.0005007277400440715, + "loss": 3.2683, + "step": 16325 + }, + { + "epoch": 0.8, + "grad_norm": 0.5503429770469666, + "learning_rate": 0.0005007162976518073, + "loss": 3.2935, + "step": 16326 + }, + { + "epoch": 0.8, + "grad_norm": 0.526890754699707, + "learning_rate": 0.0005007048547308948, + "loss": 3.0424, + "step": 16327 + }, + { + "epoch": 0.8, + "grad_norm": 0.5440188050270081, + "learning_rate": 0.0005006934112813646, + "loss": 3.2249, + "step": 16328 + }, + { + "epoch": 0.8, + "grad_norm": 0.48897939920425415, + "learning_rate": 0.0005006819673032465, + "loss": 3.0303, + "step": 16329 + }, + { + "epoch": 0.8, + "grad_norm": 0.4976714849472046, + "learning_rate": 0.0005006705227965707, + "loss": 3.2086, + "step": 16330 + }, + { + "epoch": 0.8, + "grad_norm": 0.519629716873169, + "learning_rate": 0.0005006590777613674, + "loss": 3.0235, + "step": 16331 + }, + { + "epoch": 0.8, + "grad_norm": 0.5517348647117615, + "learning_rate": 0.0005006476321976667, + "loss": 3.133, + "step": 16332 + }, + { + "epoch": 0.8, + "grad_norm": 0.5207356810569763, + "learning_rate": 0.0005006361861054988, + "loss": 3.2264, + "step": 16333 + }, + { + "epoch": 0.8, + "grad_norm": 0.521043062210083, + "learning_rate": 0.0005006247394848938, + "loss": 3.0017, + "step": 16334 + }, + { + "epoch": 0.8, + "grad_norm": 0.519502580165863, + "learning_rate": 0.0005006132923358818, + "loss": 3.2157, + "step": 16335 + }, + { + "epoch": 0.8, + "grad_norm": 0.47964170575141907, + "learning_rate": 0.0005006018446584928, + "loss": 3.2735, + "step": 16336 + }, + { + "epoch": 0.8, + "grad_norm": 0.515812337398529, + "learning_rate": 0.0005005903964527573, + "loss": 3.256, + "step": 16337 + }, + { + "epoch": 0.8, + "grad_norm": 0.5045713782310486, + "learning_rate": 0.0005005789477187055, + "loss": 3.1528, + "step": 16338 + }, + { + "epoch": 0.8, + "grad_norm": 0.5392544865608215, + "learning_rate": 0.0005005674984563672, + "loss": 3.0644, + "step": 16339 + }, + { + "epoch": 0.8, + "grad_norm": 0.5179302096366882, + "learning_rate": 0.0005005560486657726, + "loss": 3.1746, + "step": 16340 + }, + { + "epoch": 0.8, + "grad_norm": 0.5171295404434204, + "learning_rate": 0.0005005445983469522, + "loss": 3.0787, + "step": 16341 + }, + { + "epoch": 0.8, + "grad_norm": 0.5172526240348816, + "learning_rate": 0.0005005331474999357, + "loss": 3.0268, + "step": 16342 + }, + { + "epoch": 0.8, + "grad_norm": 0.5140368938446045, + "learning_rate": 0.0005005216961247535, + "loss": 3.5288, + "step": 16343 + }, + { + "epoch": 0.8, + "grad_norm": 0.525821328163147, + "learning_rate": 0.0005005102442214358, + "loss": 3.0389, + "step": 16344 + }, + { + "epoch": 0.8, + "grad_norm": 0.5332549214363098, + "learning_rate": 0.0005004987917900127, + "loss": 3.2078, + "step": 16345 + }, + { + "epoch": 0.8, + "grad_norm": 0.5117021203041077, + "learning_rate": 0.0005004873388305145, + "loss": 3.1699, + "step": 16346 + }, + { + "epoch": 0.8, + "grad_norm": 0.526621401309967, + "learning_rate": 0.000500475885342971, + "loss": 3.0896, + "step": 16347 + }, + { + "epoch": 0.8, + "grad_norm": 0.4984114468097687, + "learning_rate": 0.0005004644313274127, + "loss": 3.1271, + "step": 16348 + }, + { + "epoch": 0.8, + "grad_norm": 0.5194051861763, + "learning_rate": 0.0005004529767838697, + "loss": 3.4425, + "step": 16349 + }, + { + "epoch": 0.8, + "grad_norm": 0.530343234539032, + "learning_rate": 0.0005004415217123722, + "loss": 2.962, + "step": 16350 + }, + { + "epoch": 0.8, + "grad_norm": 0.5183805227279663, + "learning_rate": 0.0005004300661129502, + "loss": 3.2518, + "step": 16351 + }, + { + "epoch": 0.8, + "grad_norm": 0.5521455407142639, + "learning_rate": 0.000500418609985634, + "loss": 3.1005, + "step": 16352 + }, + { + "epoch": 0.8, + "grad_norm": 0.5434244871139526, + "learning_rate": 0.0005004071533304538, + "loss": 3.0559, + "step": 16353 + }, + { + "epoch": 0.8, + "grad_norm": 0.5996847152709961, + "learning_rate": 0.0005003956961474397, + "loss": 3.0903, + "step": 16354 + }, + { + "epoch": 0.8, + "grad_norm": 0.5073933005332947, + "learning_rate": 0.0005003842384366219, + "loss": 3.0769, + "step": 16355 + }, + { + "epoch": 0.8, + "grad_norm": 0.5215885043144226, + "learning_rate": 0.0005003727801980305, + "loss": 3.2447, + "step": 16356 + }, + { + "epoch": 0.8, + "grad_norm": 0.5436731576919556, + "learning_rate": 0.000500361321431696, + "loss": 3.1657, + "step": 16357 + }, + { + "epoch": 0.8, + "grad_norm": 0.5639539361000061, + "learning_rate": 0.000500349862137648, + "loss": 3.0724, + "step": 16358 + }, + { + "epoch": 0.8, + "grad_norm": 0.5657851099967957, + "learning_rate": 0.0005003384023159173, + "loss": 3.2198, + "step": 16359 + }, + { + "epoch": 0.8, + "grad_norm": 0.5688062906265259, + "learning_rate": 0.0005003269419665336, + "loss": 3.0992, + "step": 16360 + }, + { + "epoch": 0.8, + "grad_norm": 0.5081748366355896, + "learning_rate": 0.0005003154810895275, + "loss": 3.1934, + "step": 16361 + }, + { + "epoch": 0.8, + "grad_norm": 0.5793516039848328, + "learning_rate": 0.0005003040196849288, + "loss": 3.1196, + "step": 16362 + }, + { + "epoch": 0.8, + "grad_norm": 0.5692729353904724, + "learning_rate": 0.000500292557752768, + "loss": 3.0594, + "step": 16363 + }, + { + "epoch": 0.8, + "grad_norm": 0.5625926852226257, + "learning_rate": 0.0005002810952930751, + "loss": 3.3912, + "step": 16364 + }, + { + "epoch": 0.8, + "grad_norm": 0.5317186117172241, + "learning_rate": 0.0005002696323058803, + "loss": 3.2598, + "step": 16365 + }, + { + "epoch": 0.8, + "grad_norm": 0.5381388068199158, + "learning_rate": 0.0005002581687912138, + "loss": 3.1774, + "step": 16366 + }, + { + "epoch": 0.8, + "grad_norm": 0.5291975736618042, + "learning_rate": 0.0005002467047491058, + "loss": 3.1837, + "step": 16367 + }, + { + "epoch": 0.8, + "grad_norm": 0.5130248665809631, + "learning_rate": 0.0005002352401795866, + "loss": 3.1678, + "step": 16368 + }, + { + "epoch": 0.8, + "grad_norm": 0.4932388365268707, + "learning_rate": 0.0005002237750826863, + "loss": 3.2429, + "step": 16369 + }, + { + "epoch": 0.8, + "grad_norm": 0.5155643224716187, + "learning_rate": 0.0005002123094584351, + "loss": 3.3085, + "step": 16370 + }, + { + "epoch": 0.8, + "grad_norm": 0.5444445013999939, + "learning_rate": 0.0005002008433068633, + "loss": 3.2845, + "step": 16371 + }, + { + "epoch": 0.8, + "grad_norm": 0.5338315367698669, + "learning_rate": 0.0005001893766280009, + "loss": 3.1683, + "step": 16372 + }, + { + "epoch": 0.8, + "grad_norm": 0.535651445388794, + "learning_rate": 0.0005001779094218782, + "loss": 3.0012, + "step": 16373 + }, + { + "epoch": 0.8, + "grad_norm": 0.5226747989654541, + "learning_rate": 0.0005001664416885254, + "loss": 2.9169, + "step": 16374 + }, + { + "epoch": 0.8, + "grad_norm": 0.526677131652832, + "learning_rate": 0.0005001549734279729, + "loss": 3.1332, + "step": 16375 + }, + { + "epoch": 0.8, + "grad_norm": 0.540988028049469, + "learning_rate": 0.0005001435046402505, + "loss": 3.3839, + "step": 16376 + }, + { + "epoch": 0.8, + "grad_norm": 0.5214574337005615, + "learning_rate": 0.0005001320353253887, + "loss": 3.2017, + "step": 16377 + }, + { + "epoch": 0.8, + "grad_norm": 0.49261197447776794, + "learning_rate": 0.0005001205654834176, + "loss": 3.2731, + "step": 16378 + }, + { + "epoch": 0.8, + "grad_norm": 0.507203996181488, + "learning_rate": 0.0005001090951143675, + "loss": 3.0643, + "step": 16379 + }, + { + "epoch": 0.8, + "grad_norm": 0.5074104070663452, + "learning_rate": 0.0005000976242182687, + "loss": 3.1289, + "step": 16380 + }, + { + "epoch": 0.8, + "grad_norm": 0.533085823059082, + "learning_rate": 0.000500086152795151, + "loss": 3.1851, + "step": 16381 + }, + { + "epoch": 0.8, + "grad_norm": 0.5225232243537903, + "learning_rate": 0.000500074680845045, + "loss": 3.1941, + "step": 16382 + }, + { + "epoch": 0.8, + "grad_norm": 0.5232262015342712, + "learning_rate": 0.0005000632083679808, + "loss": 3.1863, + "step": 16383 + }, + { + "epoch": 0.8, + "grad_norm": 0.499962717294693, + "learning_rate": 0.0005000517353639886, + "loss": 3.2258, + "step": 16384 + }, + { + "epoch": 0.8, + "grad_norm": 0.4960383176803589, + "learning_rate": 0.0005000402618330986, + "loss": 3.4133, + "step": 16385 + }, + { + "epoch": 0.8, + "grad_norm": 0.5370568633079529, + "learning_rate": 0.0005000287877753411, + "loss": 3.0247, + "step": 16386 + }, + { + "epoch": 0.8, + "grad_norm": 0.5205273032188416, + "learning_rate": 0.0005000173131907463, + "loss": 3.028, + "step": 16387 + }, + { + "epoch": 0.8, + "grad_norm": 0.5405469536781311, + "learning_rate": 0.0005000058380793443, + "loss": 3.2769, + "step": 16388 + }, + { + "epoch": 0.8, + "grad_norm": 0.5633376836776733, + "learning_rate": 0.0004999943624411655, + "loss": 3.2323, + "step": 16389 + }, + { + "epoch": 0.8, + "grad_norm": 0.5104237198829651, + "learning_rate": 0.00049998288627624, + "loss": 3.2026, + "step": 16390 + }, + { + "epoch": 0.8, + "grad_norm": 0.5359808206558228, + "learning_rate": 0.000499971409584598, + "loss": 2.988, + "step": 16391 + }, + { + "epoch": 0.8, + "grad_norm": 0.5244869589805603, + "learning_rate": 0.0004999599323662699, + "loss": 3.309, + "step": 16392 + }, + { + "epoch": 0.8, + "grad_norm": 0.5385964512825012, + "learning_rate": 0.0004999484546212858, + "loss": 3.0099, + "step": 16393 + }, + { + "epoch": 0.8, + "grad_norm": 0.5380749702453613, + "learning_rate": 0.0004999369763496759, + "loss": 3.094, + "step": 16394 + }, + { + "epoch": 0.8, + "grad_norm": 0.518419086933136, + "learning_rate": 0.0004999254975514705, + "loss": 3.0281, + "step": 16395 + }, + { + "epoch": 0.8, + "grad_norm": 0.5032293200492859, + "learning_rate": 0.0004999140182266997, + "loss": 3.1709, + "step": 16396 + }, + { + "epoch": 0.8, + "grad_norm": 0.5485716462135315, + "learning_rate": 0.0004999025383753941, + "loss": 3.0291, + "step": 16397 + }, + { + "epoch": 0.8, + "grad_norm": 0.5190295577049255, + "learning_rate": 0.0004998910579975835, + "loss": 3.1124, + "step": 16398 + }, + { + "epoch": 0.8, + "grad_norm": 0.5125435590744019, + "learning_rate": 0.0004998795770932984, + "loss": 3.0552, + "step": 16399 + }, + { + "epoch": 0.8, + "grad_norm": 0.5453558564186096, + "learning_rate": 0.000499868095662569, + "loss": 3.0534, + "step": 16400 + }, + { + "epoch": 0.8, + "grad_norm": 0.49550220370292664, + "learning_rate": 0.0004998566137054255, + "loss": 3.3442, + "step": 16401 + }, + { + "epoch": 0.8, + "grad_norm": 0.4840456247329712, + "learning_rate": 0.000499845131221898, + "loss": 3.1657, + "step": 16402 + }, + { + "epoch": 0.8, + "grad_norm": 0.5119274258613586, + "learning_rate": 0.000499833648212017, + "loss": 3.4076, + "step": 16403 + }, + { + "epoch": 0.8, + "grad_norm": 0.48557212948799133, + "learning_rate": 0.0004998221646758125, + "loss": 3.3897, + "step": 16404 + }, + { + "epoch": 0.8, + "grad_norm": 0.516492486000061, + "learning_rate": 0.000499810680613315, + "loss": 3.04, + "step": 16405 + }, + { + "epoch": 0.8, + "grad_norm": 0.5236262083053589, + "learning_rate": 0.0004997991960245547, + "loss": 3.4405, + "step": 16406 + }, + { + "epoch": 0.8, + "grad_norm": 0.5504629015922546, + "learning_rate": 0.0004997877109095616, + "loss": 3.0354, + "step": 16407 + }, + { + "epoch": 0.8, + "grad_norm": 0.5525025725364685, + "learning_rate": 0.0004997762252683663, + "loss": 3.1878, + "step": 16408 + }, + { + "epoch": 0.8, + "grad_norm": 0.5103166699409485, + "learning_rate": 0.0004997647391009986, + "loss": 3.0423, + "step": 16409 + }, + { + "epoch": 0.8, + "grad_norm": 0.5244056582450867, + "learning_rate": 0.0004997532524074892, + "loss": 3.3401, + "step": 16410 + }, + { + "epoch": 0.8, + "grad_norm": 0.5187060236930847, + "learning_rate": 0.0004997417651878682, + "loss": 3.2776, + "step": 16411 + }, + { + "epoch": 0.8, + "grad_norm": 0.5291786789894104, + "learning_rate": 0.0004997302774421658, + "loss": 3.0764, + "step": 16412 + }, + { + "epoch": 0.8, + "grad_norm": 0.509762704372406, + "learning_rate": 0.0004997187891704123, + "loss": 3.2449, + "step": 16413 + }, + { + "epoch": 0.8, + "grad_norm": 0.5269352793693542, + "learning_rate": 0.0004997073003726378, + "loss": 3.169, + "step": 16414 + }, + { + "epoch": 0.8, + "grad_norm": 0.526393473148346, + "learning_rate": 0.0004996958110488729, + "loss": 3.1764, + "step": 16415 + }, + { + "epoch": 0.8, + "grad_norm": 0.5143744945526123, + "learning_rate": 0.0004996843211991477, + "loss": 3.1803, + "step": 16416 + }, + { + "epoch": 0.8, + "grad_norm": 0.526218056678772, + "learning_rate": 0.0004996728308234923, + "loss": 3.1629, + "step": 16417 + }, + { + "epoch": 0.8, + "grad_norm": 0.46396979689598083, + "learning_rate": 0.000499661339921937, + "loss": 3.2116, + "step": 16418 + }, + { + "epoch": 0.8, + "grad_norm": 0.49755245447158813, + "learning_rate": 0.0004996498484945123, + "loss": 3.2634, + "step": 16419 + }, + { + "epoch": 0.8, + "grad_norm": 0.5120879411697388, + "learning_rate": 0.0004996383565412483, + "loss": 3.1099, + "step": 16420 + }, + { + "epoch": 0.8, + "grad_norm": 0.5071130394935608, + "learning_rate": 0.0004996268640621753, + "loss": 3.1576, + "step": 16421 + }, + { + "epoch": 0.8, + "grad_norm": 0.5468621850013733, + "learning_rate": 0.0004996153710573236, + "loss": 3.1405, + "step": 16422 + }, + { + "epoch": 0.8, + "grad_norm": 0.5439679622650146, + "learning_rate": 0.0004996038775267233, + "loss": 3.0414, + "step": 16423 + }, + { + "epoch": 0.8, + "grad_norm": 0.5470306277275085, + "learning_rate": 0.0004995923834704049, + "loss": 3.2407, + "step": 16424 + }, + { + "epoch": 0.8, + "grad_norm": 0.4964117109775543, + "learning_rate": 0.0004995808888883986, + "loss": 3.0354, + "step": 16425 + }, + { + "epoch": 0.8, + "grad_norm": 0.5191156268119812, + "learning_rate": 0.0004995693937807345, + "loss": 3.2839, + "step": 16426 + }, + { + "epoch": 0.81, + "grad_norm": 0.5142076015472412, + "learning_rate": 0.0004995578981474431, + "loss": 3.212, + "step": 16427 + }, + { + "epoch": 0.81, + "grad_norm": 0.4969421923160553, + "learning_rate": 0.0004995464019885548, + "loss": 2.959, + "step": 16428 + }, + { + "epoch": 0.81, + "grad_norm": 0.5990926623344421, + "learning_rate": 0.0004995349053040993, + "loss": 3.2161, + "step": 16429 + }, + { + "epoch": 0.81, + "grad_norm": 0.5438251495361328, + "learning_rate": 0.0004995234080941075, + "loss": 3.2263, + "step": 16430 + }, + { + "epoch": 0.81, + "grad_norm": 0.5286610722541809, + "learning_rate": 0.0004995119103586095, + "loss": 3.1011, + "step": 16431 + }, + { + "epoch": 0.81, + "grad_norm": 0.5550453662872314, + "learning_rate": 0.0004995004120976354, + "loss": 3.2373, + "step": 16432 + }, + { + "epoch": 0.81, + "grad_norm": 0.5107359290122986, + "learning_rate": 0.0004994889133112156, + "loss": 3.2337, + "step": 16433 + }, + { + "epoch": 0.81, + "grad_norm": 0.5204411149024963, + "learning_rate": 0.0004994774139993805, + "loss": 3.3439, + "step": 16434 + }, + { + "epoch": 0.81, + "grad_norm": 0.5322909355163574, + "learning_rate": 0.0004994659141621602, + "loss": 3.5071, + "step": 16435 + }, + { + "epoch": 0.81, + "grad_norm": 0.5268922448158264, + "learning_rate": 0.0004994544137995851, + "loss": 3.0943, + "step": 16436 + }, + { + "epoch": 0.81, + "grad_norm": 0.5423510074615479, + "learning_rate": 0.0004994429129116854, + "loss": 3.2312, + "step": 16437 + }, + { + "epoch": 0.81, + "grad_norm": 0.5225160121917725, + "learning_rate": 0.0004994314114984915, + "loss": 3.1335, + "step": 16438 + }, + { + "epoch": 0.81, + "grad_norm": 0.5021830797195435, + "learning_rate": 0.0004994199095600337, + "loss": 2.975, + "step": 16439 + }, + { + "epoch": 0.81, + "grad_norm": 0.5276130437850952, + "learning_rate": 0.0004994084070963421, + "loss": 3.2795, + "step": 16440 + }, + { + "epoch": 0.81, + "grad_norm": 0.5111821889877319, + "learning_rate": 0.0004993969041074473, + "loss": 2.9674, + "step": 16441 + }, + { + "epoch": 0.81, + "grad_norm": 0.5199219584465027, + "learning_rate": 0.0004993854005933793, + "loss": 3.1355, + "step": 16442 + }, + { + "epoch": 0.81, + "grad_norm": 0.5186803340911865, + "learning_rate": 0.0004993738965541686, + "loss": 3.2157, + "step": 16443 + }, + { + "epoch": 0.81, + "grad_norm": 0.5261385440826416, + "learning_rate": 0.0004993623919898454, + "loss": 3.2304, + "step": 16444 + }, + { + "epoch": 0.81, + "grad_norm": 0.5131444334983826, + "learning_rate": 0.00049935088690044, + "loss": 3.1116, + "step": 16445 + }, + { + "epoch": 0.81, + "grad_norm": 0.5108278393745422, + "learning_rate": 0.0004993393812859827, + "loss": 3.322, + "step": 16446 + }, + { + "epoch": 0.81, + "grad_norm": 0.5285989046096802, + "learning_rate": 0.0004993278751465039, + "loss": 3.3048, + "step": 16447 + }, + { + "epoch": 0.81, + "grad_norm": 0.5042722225189209, + "learning_rate": 0.0004993163684820338, + "loss": 3.0386, + "step": 16448 + }, + { + "epoch": 0.81, + "grad_norm": 0.5362065434455872, + "learning_rate": 0.0004993048612926028, + "loss": 3.0104, + "step": 16449 + }, + { + "epoch": 0.81, + "grad_norm": 0.49889418482780457, + "learning_rate": 0.0004992933535782411, + "loss": 3.0916, + "step": 16450 + }, + { + "epoch": 0.81, + "grad_norm": 0.521359920501709, + "learning_rate": 0.0004992818453389791, + "loss": 3.1019, + "step": 16451 + }, + { + "epoch": 0.81, + "grad_norm": 0.5698845386505127, + "learning_rate": 0.0004992703365748471, + "loss": 3.1863, + "step": 16452 + }, + { + "epoch": 0.81, + "grad_norm": 0.45893803238868713, + "learning_rate": 0.0004992588272858753, + "loss": 3.2055, + "step": 16453 + }, + { + "epoch": 0.81, + "grad_norm": 0.5086631178855896, + "learning_rate": 0.0004992473174720942, + "loss": 3.2734, + "step": 16454 + }, + { + "epoch": 0.81, + "grad_norm": 0.507537305355072, + "learning_rate": 0.0004992358071335338, + "loss": 3.327, + "step": 16455 + }, + { + "epoch": 0.81, + "grad_norm": 0.5071486830711365, + "learning_rate": 0.0004992242962702248, + "loss": 3.1988, + "step": 16456 + }, + { + "epoch": 0.81, + "grad_norm": 0.5107549428939819, + "learning_rate": 0.0004992127848821973, + "loss": 3.1467, + "step": 16457 + }, + { + "epoch": 0.81, + "grad_norm": 0.5230646729469299, + "learning_rate": 0.0004992012729694817, + "loss": 3.1563, + "step": 16458 + }, + { + "epoch": 0.81, + "grad_norm": 0.5297747850418091, + "learning_rate": 0.0004991897605321082, + "loss": 3.1184, + "step": 16459 + }, + { + "epoch": 0.81, + "grad_norm": 0.5269367098808289, + "learning_rate": 0.0004991782475701073, + "loss": 2.8786, + "step": 16460 + }, + { + "epoch": 0.81, + "grad_norm": 0.4930148720741272, + "learning_rate": 0.0004991667340835093, + "loss": 3.0121, + "step": 16461 + }, + { + "epoch": 0.81, + "grad_norm": 0.5489005446434021, + "learning_rate": 0.0004991552200723443, + "loss": 3.0634, + "step": 16462 + }, + { + "epoch": 0.81, + "grad_norm": 0.5134395360946655, + "learning_rate": 0.0004991437055366428, + "loss": 3.2313, + "step": 16463 + }, + { + "epoch": 0.81, + "grad_norm": 0.5260294079780579, + "learning_rate": 0.0004991321904764352, + "loss": 3.0926, + "step": 16464 + }, + { + "epoch": 0.81, + "grad_norm": 0.5649300217628479, + "learning_rate": 0.0004991206748917517, + "loss": 3.2247, + "step": 16465 + }, + { + "epoch": 0.81, + "grad_norm": 0.5012965798377991, + "learning_rate": 0.0004991091587826225, + "loss": 3.1459, + "step": 16466 + }, + { + "epoch": 0.81, + "grad_norm": 0.49580734968185425, + "learning_rate": 0.0004990976421490783, + "loss": 3.1876, + "step": 16467 + }, + { + "epoch": 0.81, + "grad_norm": 0.54473477602005, + "learning_rate": 0.0004990861249911492, + "loss": 3.3208, + "step": 16468 + }, + { + "epoch": 0.81, + "grad_norm": 0.5577808022499084, + "learning_rate": 0.0004990746073088655, + "loss": 3.0318, + "step": 16469 + }, + { + "epoch": 0.81, + "grad_norm": 0.5220108032226562, + "learning_rate": 0.0004990630891022576, + "loss": 3.07, + "step": 16470 + }, + { + "epoch": 0.81, + "grad_norm": 0.4759686291217804, + "learning_rate": 0.0004990515703713559, + "loss": 3.1017, + "step": 16471 + }, + { + "epoch": 0.81, + "grad_norm": 0.5046111345291138, + "learning_rate": 0.0004990400511161907, + "loss": 3.1224, + "step": 16472 + }, + { + "epoch": 0.81, + "grad_norm": 0.5120763182640076, + "learning_rate": 0.0004990285313367922, + "loss": 3.2141, + "step": 16473 + }, + { + "epoch": 0.81, + "grad_norm": 0.5458747148513794, + "learning_rate": 0.0004990170110331908, + "loss": 3.126, + "step": 16474 + }, + { + "epoch": 0.81, + "grad_norm": 0.540588915348053, + "learning_rate": 0.000499005490205417, + "loss": 3.1728, + "step": 16475 + }, + { + "epoch": 0.81, + "grad_norm": 0.5352221727371216, + "learning_rate": 0.000498993968853501, + "loss": 3.1001, + "step": 16476 + }, + { + "epoch": 0.81, + "grad_norm": 0.5034423470497131, + "learning_rate": 0.0004989824469774732, + "loss": 3.1839, + "step": 16477 + }, + { + "epoch": 0.81, + "grad_norm": 0.5104349255561829, + "learning_rate": 0.0004989709245773639, + "loss": 3.1802, + "step": 16478 + }, + { + "epoch": 0.81, + "grad_norm": 0.5068408846855164, + "learning_rate": 0.0004989594016532036, + "loss": 3.0528, + "step": 16479 + }, + { + "epoch": 0.81, + "grad_norm": 0.5584886074066162, + "learning_rate": 0.0004989478782050224, + "loss": 3.1633, + "step": 16480 + }, + { + "epoch": 0.81, + "grad_norm": 0.4965101480484009, + "learning_rate": 0.0004989363542328508, + "loss": 2.9951, + "step": 16481 + }, + { + "epoch": 0.81, + "grad_norm": 0.5258487462997437, + "learning_rate": 0.0004989248297367191, + "loss": 3.2203, + "step": 16482 + }, + { + "epoch": 0.81, + "grad_norm": 0.49109527468681335, + "learning_rate": 0.0004989133047166577, + "loss": 3.2713, + "step": 16483 + }, + { + "epoch": 0.81, + "grad_norm": 0.49905163049697876, + "learning_rate": 0.0004989017791726971, + "loss": 3.117, + "step": 16484 + }, + { + "epoch": 0.81, + "grad_norm": 0.5777082443237305, + "learning_rate": 0.0004988902531048673, + "loss": 3.1115, + "step": 16485 + }, + { + "epoch": 0.81, + "grad_norm": 0.5189237594604492, + "learning_rate": 0.0004988787265131989, + "loss": 3.0824, + "step": 16486 + }, + { + "epoch": 0.81, + "grad_norm": 0.569271981716156, + "learning_rate": 0.0004988671993977221, + "loss": 2.9983, + "step": 16487 + }, + { + "epoch": 0.81, + "grad_norm": 0.5757372379302979, + "learning_rate": 0.0004988556717584676, + "loss": 3.0703, + "step": 16488 + }, + { + "epoch": 0.81, + "grad_norm": 0.5193915963172913, + "learning_rate": 0.0004988441435954654, + "loss": 3.1659, + "step": 16489 + }, + { + "epoch": 0.81, + "grad_norm": 0.5080418586730957, + "learning_rate": 0.000498832614908746, + "loss": 3.1788, + "step": 16490 + }, + { + "epoch": 0.81, + "grad_norm": 0.5090472102165222, + "learning_rate": 0.0004988210856983398, + "loss": 3.1489, + "step": 16491 + }, + { + "epoch": 0.81, + "grad_norm": 0.466819167137146, + "learning_rate": 0.000498809555964277, + "loss": 3.1436, + "step": 16492 + }, + { + "epoch": 0.81, + "grad_norm": 0.5463626980781555, + "learning_rate": 0.0004987980257065883, + "loss": 3.2058, + "step": 16493 + }, + { + "epoch": 0.81, + "grad_norm": 0.5318609476089478, + "learning_rate": 0.0004987864949253037, + "loss": 3.333, + "step": 16494 + }, + { + "epoch": 0.81, + "grad_norm": 0.5022921562194824, + "learning_rate": 0.0004987749636204537, + "loss": 3.0602, + "step": 16495 + }, + { + "epoch": 0.81, + "grad_norm": 0.49121004343032837, + "learning_rate": 0.0004987634317920688, + "loss": 3.3215, + "step": 16496 + }, + { + "epoch": 0.81, + "grad_norm": 0.5183935761451721, + "learning_rate": 0.0004987518994401792, + "loss": 3.2748, + "step": 16497 + }, + { + "epoch": 0.81, + "grad_norm": 0.4942324757575989, + "learning_rate": 0.0004987403665648155, + "loss": 3.0571, + "step": 16498 + }, + { + "epoch": 0.81, + "grad_norm": 0.5025029182434082, + "learning_rate": 0.0004987288331660078, + "loss": 3.2539, + "step": 16499 + }, + { + "epoch": 0.81, + "grad_norm": 0.5152884125709534, + "learning_rate": 0.0004987172992437866, + "loss": 3.2326, + "step": 16500 + }, + { + "epoch": 0.81, + "grad_norm": 0.5195552706718445, + "learning_rate": 0.0004987057647981824, + "loss": 3.0311, + "step": 16501 + }, + { + "epoch": 0.81, + "grad_norm": 0.5349781513214111, + "learning_rate": 0.0004986942298292253, + "loss": 3.1752, + "step": 16502 + }, + { + "epoch": 0.81, + "grad_norm": 0.5725705027580261, + "learning_rate": 0.0004986826943369459, + "loss": 3.2543, + "step": 16503 + }, + { + "epoch": 0.81, + "grad_norm": 0.6165834069252014, + "learning_rate": 0.0004986711583213745, + "loss": 3.3868, + "step": 16504 + }, + { + "epoch": 0.81, + "grad_norm": 0.5173270106315613, + "learning_rate": 0.0004986596217825415, + "loss": 3.1788, + "step": 16505 + }, + { + "epoch": 0.81, + "grad_norm": 0.5284629464149475, + "learning_rate": 0.0004986480847204772, + "loss": 3.2897, + "step": 16506 + }, + { + "epoch": 0.81, + "grad_norm": 0.5097475647926331, + "learning_rate": 0.0004986365471352122, + "loss": 2.8815, + "step": 16507 + }, + { + "epoch": 0.81, + "grad_norm": 0.5338273048400879, + "learning_rate": 0.0004986250090267768, + "loss": 3.2031, + "step": 16508 + }, + { + "epoch": 0.81, + "grad_norm": 0.5456317663192749, + "learning_rate": 0.0004986134703952013, + "loss": 2.9568, + "step": 16509 + }, + { + "epoch": 0.81, + "grad_norm": 0.55818772315979, + "learning_rate": 0.0004986019312405161, + "loss": 3.1333, + "step": 16510 + }, + { + "epoch": 0.81, + "grad_norm": 0.5008224844932556, + "learning_rate": 0.0004985903915627517, + "loss": 3.2126, + "step": 16511 + }, + { + "epoch": 0.81, + "grad_norm": 0.4851469099521637, + "learning_rate": 0.0004985788513619384, + "loss": 3.0776, + "step": 16512 + }, + { + "epoch": 0.81, + "grad_norm": 0.5189734697341919, + "learning_rate": 0.0004985673106381066, + "loss": 3.0294, + "step": 16513 + }, + { + "epoch": 0.81, + "grad_norm": 0.5031270980834961, + "learning_rate": 0.0004985557693912867, + "loss": 3.4658, + "step": 16514 + }, + { + "epoch": 0.81, + "grad_norm": 0.5297790169715881, + "learning_rate": 0.0004985442276215092, + "loss": 3.2422, + "step": 16515 + }, + { + "epoch": 0.81, + "grad_norm": 0.536777138710022, + "learning_rate": 0.0004985326853288044, + "loss": 3.1539, + "step": 16516 + }, + { + "epoch": 0.81, + "grad_norm": 0.5064894556999207, + "learning_rate": 0.0004985211425132027, + "loss": 3.1133, + "step": 16517 + }, + { + "epoch": 0.81, + "grad_norm": 0.511317789554596, + "learning_rate": 0.0004985095991747345, + "loss": 3.1467, + "step": 16518 + }, + { + "epoch": 0.81, + "grad_norm": 0.49673980474472046, + "learning_rate": 0.0004984980553134302, + "loss": 3.229, + "step": 16519 + }, + { + "epoch": 0.81, + "grad_norm": 0.5076729655265808, + "learning_rate": 0.0004984865109293203, + "loss": 3.3867, + "step": 16520 + }, + { + "epoch": 0.81, + "grad_norm": 0.5597838163375854, + "learning_rate": 0.0004984749660224351, + "loss": 3.2826, + "step": 16521 + }, + { + "epoch": 0.81, + "grad_norm": 0.5371355414390564, + "learning_rate": 0.0004984634205928049, + "loss": 3.2134, + "step": 16522 + }, + { + "epoch": 0.81, + "grad_norm": 0.5134449005126953, + "learning_rate": 0.0004984518746404604, + "loss": 3.2228, + "step": 16523 + }, + { + "epoch": 0.81, + "grad_norm": 0.514105498790741, + "learning_rate": 0.0004984403281654318, + "loss": 2.9934, + "step": 16524 + }, + { + "epoch": 0.81, + "grad_norm": 0.5345868468284607, + "learning_rate": 0.0004984287811677496, + "loss": 3.3375, + "step": 16525 + }, + { + "epoch": 0.81, + "grad_norm": 0.5686771273612976, + "learning_rate": 0.0004984172336474441, + "loss": 3.2227, + "step": 16526 + }, + { + "epoch": 0.81, + "grad_norm": 0.4849824905395508, + "learning_rate": 0.000498405685604546, + "loss": 3.2113, + "step": 16527 + }, + { + "epoch": 0.81, + "grad_norm": 0.5313514471054077, + "learning_rate": 0.0004983941370390853, + "loss": 3.3788, + "step": 16528 + }, + { + "epoch": 0.81, + "grad_norm": 0.5120206475257874, + "learning_rate": 0.0004983825879510927, + "loss": 3.1637, + "step": 16529 + }, + { + "epoch": 0.81, + "grad_norm": 0.5304714441299438, + "learning_rate": 0.0004983710383405984, + "loss": 2.988, + "step": 16530 + }, + { + "epoch": 0.81, + "grad_norm": 0.5290876030921936, + "learning_rate": 0.0004983594882076333, + "loss": 3.3558, + "step": 16531 + }, + { + "epoch": 0.81, + "grad_norm": 0.5857232809066772, + "learning_rate": 0.0004983479375522272, + "loss": 3.102, + "step": 16532 + }, + { + "epoch": 0.81, + "grad_norm": 0.7322335839271545, + "learning_rate": 0.0004983363863744108, + "loss": 3.0765, + "step": 16533 + }, + { + "epoch": 0.81, + "grad_norm": 0.5193672776222229, + "learning_rate": 0.0004983248346742147, + "loss": 3.0546, + "step": 16534 + }, + { + "epoch": 0.81, + "grad_norm": 0.5496091246604919, + "learning_rate": 0.000498313282451669, + "loss": 2.9767, + "step": 16535 + }, + { + "epoch": 0.81, + "grad_norm": 0.5549640655517578, + "learning_rate": 0.0004983017297068044, + "loss": 3.0501, + "step": 16536 + }, + { + "epoch": 0.81, + "grad_norm": 0.49452897906303406, + "learning_rate": 0.0004982901764396511, + "loss": 3.1101, + "step": 16537 + }, + { + "epoch": 0.81, + "grad_norm": 0.5188285112380981, + "learning_rate": 0.0004982786226502396, + "loss": 3.276, + "step": 16538 + }, + { + "epoch": 0.81, + "grad_norm": 0.5108150243759155, + "learning_rate": 0.0004982670683386006, + "loss": 3.1788, + "step": 16539 + }, + { + "epoch": 0.81, + "grad_norm": 0.5602210164070129, + "learning_rate": 0.0004982555135047641, + "loss": 3.1825, + "step": 16540 + }, + { + "epoch": 0.81, + "grad_norm": 0.5105909109115601, + "learning_rate": 0.0004982439581487607, + "loss": 3.3291, + "step": 16541 + }, + { + "epoch": 0.81, + "grad_norm": 0.5331498384475708, + "learning_rate": 0.000498232402270621, + "loss": 3.2307, + "step": 16542 + }, + { + "epoch": 0.81, + "grad_norm": 0.5256636142730713, + "learning_rate": 0.0004982208458703752, + "loss": 2.8307, + "step": 16543 + }, + { + "epoch": 0.81, + "grad_norm": 0.5503387451171875, + "learning_rate": 0.000498209288948054, + "loss": 3.2409, + "step": 16544 + }, + { + "epoch": 0.81, + "grad_norm": 0.5669764280319214, + "learning_rate": 0.0004981977315036876, + "loss": 3.1763, + "step": 16545 + }, + { + "epoch": 0.81, + "grad_norm": 0.5365728139877319, + "learning_rate": 0.0004981861735373064, + "loss": 3.0852, + "step": 16546 + }, + { + "epoch": 0.81, + "grad_norm": 0.5224799513816833, + "learning_rate": 0.000498174615048941, + "loss": 3.1728, + "step": 16547 + }, + { + "epoch": 0.81, + "grad_norm": 0.5562565922737122, + "learning_rate": 0.0004981630560386219, + "loss": 3.1805, + "step": 16548 + }, + { + "epoch": 0.81, + "grad_norm": 0.5232570171356201, + "learning_rate": 0.0004981514965063795, + "loss": 3.1247, + "step": 16549 + }, + { + "epoch": 0.81, + "grad_norm": 0.5539358258247375, + "learning_rate": 0.0004981399364522441, + "loss": 3.2232, + "step": 16550 + }, + { + "epoch": 0.81, + "grad_norm": 0.5199190378189087, + "learning_rate": 0.0004981283758762462, + "loss": 3.1485, + "step": 16551 + }, + { + "epoch": 0.81, + "grad_norm": 0.516223132610321, + "learning_rate": 0.0004981168147784163, + "loss": 3.1427, + "step": 16552 + }, + { + "epoch": 0.81, + "grad_norm": 0.504279613494873, + "learning_rate": 0.0004981052531587849, + "loss": 3.318, + "step": 16553 + }, + { + "epoch": 0.81, + "grad_norm": 0.5051873326301575, + "learning_rate": 0.0004980936910173823, + "loss": 3.2255, + "step": 16554 + }, + { + "epoch": 0.81, + "grad_norm": 0.5611341595649719, + "learning_rate": 0.0004980821283542392, + "loss": 3.0024, + "step": 16555 + }, + { + "epoch": 0.81, + "grad_norm": 0.5161449313163757, + "learning_rate": 0.0004980705651693858, + "loss": 3.0296, + "step": 16556 + }, + { + "epoch": 0.81, + "grad_norm": 0.5371371507644653, + "learning_rate": 0.0004980590014628526, + "loss": 3.273, + "step": 16557 + }, + { + "epoch": 0.81, + "grad_norm": 0.533517599105835, + "learning_rate": 0.0004980474372346701, + "loss": 3.2346, + "step": 16558 + }, + { + "epoch": 0.81, + "grad_norm": 0.5515034794807434, + "learning_rate": 0.0004980358724848688, + "loss": 3.1066, + "step": 16559 + }, + { + "epoch": 0.81, + "grad_norm": 0.5124566555023193, + "learning_rate": 0.0004980243072134792, + "loss": 3.2288, + "step": 16560 + }, + { + "epoch": 0.81, + "grad_norm": 0.4827638864517212, + "learning_rate": 0.0004980127414205317, + "loss": 3.3534, + "step": 16561 + }, + { + "epoch": 0.81, + "grad_norm": 0.5892248749732971, + "learning_rate": 0.0004980011751060566, + "loss": 3.207, + "step": 16562 + }, + { + "epoch": 0.81, + "grad_norm": 0.5171937942504883, + "learning_rate": 0.0004979896082700846, + "loss": 3.2058, + "step": 16563 + }, + { + "epoch": 0.81, + "grad_norm": 0.5337836146354675, + "learning_rate": 0.000497978040912646, + "loss": 3.3217, + "step": 16564 + }, + { + "epoch": 0.81, + "grad_norm": 0.5042610764503479, + "learning_rate": 0.0004979664730337714, + "loss": 3.3613, + "step": 16565 + }, + { + "epoch": 0.81, + "grad_norm": 0.515651285648346, + "learning_rate": 0.0004979549046334913, + "loss": 3.0309, + "step": 16566 + }, + { + "epoch": 0.81, + "grad_norm": 0.5128003358840942, + "learning_rate": 0.000497943335711836, + "loss": 3.4516, + "step": 16567 + }, + { + "epoch": 0.81, + "grad_norm": 0.5067887306213379, + "learning_rate": 0.0004979317662688359, + "loss": 3.1118, + "step": 16568 + }, + { + "epoch": 0.81, + "grad_norm": 0.4763834476470947, + "learning_rate": 0.0004979201963045218, + "loss": 3.3369, + "step": 16569 + }, + { + "epoch": 0.81, + "grad_norm": 0.5007560849189758, + "learning_rate": 0.0004979086258189241, + "loss": 3.1067, + "step": 16570 + }, + { + "epoch": 0.81, + "grad_norm": 0.539618968963623, + "learning_rate": 0.0004978970548120729, + "loss": 3.2345, + "step": 16571 + }, + { + "epoch": 0.81, + "grad_norm": 0.5771459341049194, + "learning_rate": 0.0004978854832839992, + "loss": 3.1452, + "step": 16572 + }, + { + "epoch": 0.81, + "grad_norm": 0.5069820284843445, + "learning_rate": 0.000497873911234733, + "loss": 3.2058, + "step": 16573 + }, + { + "epoch": 0.81, + "grad_norm": 0.5238664746284485, + "learning_rate": 0.000497862338664305, + "loss": 3.0031, + "step": 16574 + }, + { + "epoch": 0.81, + "grad_norm": 0.5101388692855835, + "learning_rate": 0.0004978507655727458, + "loss": 3.123, + "step": 16575 + }, + { + "epoch": 0.81, + "grad_norm": 0.5062222480773926, + "learning_rate": 0.0004978391919600857, + "loss": 3.1039, + "step": 16576 + }, + { + "epoch": 0.81, + "grad_norm": 0.571029007434845, + "learning_rate": 0.0004978276178263553, + "loss": 3.1371, + "step": 16577 + }, + { + "epoch": 0.81, + "grad_norm": 0.519842803478241, + "learning_rate": 0.000497816043171585, + "loss": 3.0524, + "step": 16578 + }, + { + "epoch": 0.81, + "grad_norm": 0.6620784997940063, + "learning_rate": 0.0004978044679958053, + "loss": 3.2173, + "step": 16579 + }, + { + "epoch": 0.81, + "grad_norm": 0.5876750349998474, + "learning_rate": 0.0004977928922990467, + "loss": 3.1815, + "step": 16580 + }, + { + "epoch": 0.81, + "grad_norm": 0.5399341583251953, + "learning_rate": 0.0004977813160813397, + "loss": 3.3868, + "step": 16581 + }, + { + "epoch": 0.81, + "grad_norm": 0.5021479725837708, + "learning_rate": 0.0004977697393427148, + "loss": 3.2335, + "step": 16582 + }, + { + "epoch": 0.81, + "grad_norm": 0.5305050015449524, + "learning_rate": 0.0004977581620832024, + "loss": 3.2448, + "step": 16583 + }, + { + "epoch": 0.81, + "grad_norm": 0.4928508698940277, + "learning_rate": 0.0004977465843028331, + "loss": 3.2395, + "step": 16584 + }, + { + "epoch": 0.81, + "grad_norm": 0.49715203046798706, + "learning_rate": 0.0004977350060016374, + "loss": 3.1318, + "step": 16585 + }, + { + "epoch": 0.81, + "grad_norm": 0.49613332748413086, + "learning_rate": 0.0004977234271796458, + "loss": 3.2116, + "step": 16586 + }, + { + "epoch": 0.81, + "grad_norm": 0.5376517176628113, + "learning_rate": 0.0004977118478368886, + "loss": 3.1922, + "step": 16587 + }, + { + "epoch": 0.81, + "grad_norm": 0.5719401836395264, + "learning_rate": 0.0004977002679733964, + "loss": 3.2298, + "step": 16588 + }, + { + "epoch": 0.81, + "grad_norm": 0.5064448118209839, + "learning_rate": 0.0004976886875891998, + "loss": 3.2184, + "step": 16589 + }, + { + "epoch": 0.81, + "grad_norm": 0.5336682200431824, + "learning_rate": 0.0004976771066843293, + "loss": 3.1029, + "step": 16590 + }, + { + "epoch": 0.81, + "grad_norm": 0.5417354106903076, + "learning_rate": 0.0004976655252588153, + "loss": 3.2497, + "step": 16591 + }, + { + "epoch": 0.81, + "grad_norm": 0.5155006051063538, + "learning_rate": 0.0004976539433126884, + "loss": 3.2171, + "step": 16592 + }, + { + "epoch": 0.81, + "grad_norm": 0.5138181447982788, + "learning_rate": 0.000497642360845979, + "loss": 3.0924, + "step": 16593 + }, + { + "epoch": 0.81, + "grad_norm": 0.5162732601165771, + "learning_rate": 0.0004976307778587176, + "loss": 3.218, + "step": 16594 + }, + { + "epoch": 0.81, + "grad_norm": 0.5005708932876587, + "learning_rate": 0.000497619194350935, + "loss": 3.3464, + "step": 16595 + }, + { + "epoch": 0.81, + "grad_norm": 0.5471556186676025, + "learning_rate": 0.0004976076103226613, + "loss": 3.286, + "step": 16596 + }, + { + "epoch": 0.81, + "grad_norm": 0.5286422371864319, + "learning_rate": 0.0004975960257739272, + "loss": 2.9602, + "step": 16597 + }, + { + "epoch": 0.81, + "grad_norm": 0.5641829967498779, + "learning_rate": 0.0004975844407047632, + "loss": 3.2977, + "step": 16598 + }, + { + "epoch": 0.81, + "grad_norm": 0.5358396172523499, + "learning_rate": 0.0004975728551151998, + "loss": 3.1017, + "step": 16599 + }, + { + "epoch": 0.81, + "grad_norm": 0.5861735939979553, + "learning_rate": 0.0004975612690052675, + "loss": 3.1687, + "step": 16600 + }, + { + "epoch": 0.81, + "grad_norm": 0.5610910654067993, + "learning_rate": 0.0004975496823749969, + "loss": 3.1995, + "step": 16601 + }, + { + "epoch": 0.81, + "grad_norm": 0.5274034738540649, + "learning_rate": 0.0004975380952244185, + "loss": 3.2817, + "step": 16602 + }, + { + "epoch": 0.81, + "grad_norm": 0.5060899257659912, + "learning_rate": 0.0004975265075535626, + "loss": 3.2932, + "step": 16603 + }, + { + "epoch": 0.81, + "grad_norm": 0.5002941489219666, + "learning_rate": 0.0004975149193624601, + "loss": 3.1504, + "step": 16604 + }, + { + "epoch": 0.81, + "grad_norm": 0.48612910509109497, + "learning_rate": 0.0004975033306511412, + "loss": 3.1963, + "step": 16605 + }, + { + "epoch": 0.81, + "grad_norm": 0.5231236815452576, + "learning_rate": 0.0004974917414196365, + "loss": 3.1359, + "step": 16606 + }, + { + "epoch": 0.81, + "grad_norm": 0.5047218203544617, + "learning_rate": 0.0004974801516679766, + "loss": 3.1402, + "step": 16607 + }, + { + "epoch": 0.81, + "grad_norm": 0.4852062165737152, + "learning_rate": 0.000497468561396192, + "loss": 3.2184, + "step": 16608 + }, + { + "epoch": 0.81, + "grad_norm": 0.5307152271270752, + "learning_rate": 0.0004974569706043132, + "loss": 3.1907, + "step": 16609 + }, + { + "epoch": 0.81, + "grad_norm": 0.5003814101219177, + "learning_rate": 0.0004974453792923708, + "loss": 3.0009, + "step": 16610 + }, + { + "epoch": 0.81, + "grad_norm": 0.5295878052711487, + "learning_rate": 0.0004974337874603952, + "loss": 3.1932, + "step": 16611 + }, + { + "epoch": 0.81, + "grad_norm": 0.5771824717521667, + "learning_rate": 0.000497422195108417, + "loss": 3.3406, + "step": 16612 + }, + { + "epoch": 0.81, + "grad_norm": 0.5096525549888611, + "learning_rate": 0.0004974106022364669, + "loss": 3.0476, + "step": 16613 + }, + { + "epoch": 0.81, + "grad_norm": 0.5477951765060425, + "learning_rate": 0.000497399008844575, + "loss": 3.2816, + "step": 16614 + }, + { + "epoch": 0.81, + "grad_norm": 0.5155587196350098, + "learning_rate": 0.0004973874149327722, + "loss": 2.9737, + "step": 16615 + }, + { + "epoch": 0.81, + "grad_norm": 0.5152885317802429, + "learning_rate": 0.0004973758205010889, + "loss": 3.2941, + "step": 16616 + }, + { + "epoch": 0.81, + "grad_norm": 0.5501928925514221, + "learning_rate": 0.0004973642255495558, + "loss": 3.3104, + "step": 16617 + }, + { + "epoch": 0.81, + "grad_norm": 0.5133585333824158, + "learning_rate": 0.0004973526300782032, + "loss": 3.1066, + "step": 16618 + }, + { + "epoch": 0.81, + "grad_norm": 0.5019925832748413, + "learning_rate": 0.0004973410340870618, + "loss": 3.2807, + "step": 16619 + }, + { + "epoch": 0.81, + "grad_norm": 0.5474452376365662, + "learning_rate": 0.0004973294375761621, + "loss": 3.2518, + "step": 16620 + }, + { + "epoch": 0.81, + "grad_norm": 0.5307877063751221, + "learning_rate": 0.0004973178405455347, + "loss": 2.98, + "step": 16621 + }, + { + "epoch": 0.81, + "grad_norm": 0.5195660591125488, + "learning_rate": 0.00049730624299521, + "loss": 3.2952, + "step": 16622 + }, + { + "epoch": 0.81, + "grad_norm": 0.5276235938072205, + "learning_rate": 0.0004972946449252187, + "loss": 3.2442, + "step": 16623 + }, + { + "epoch": 0.81, + "grad_norm": 0.5224148035049438, + "learning_rate": 0.0004972830463355912, + "loss": 3.0754, + "step": 16624 + }, + { + "epoch": 0.81, + "grad_norm": 0.5097349286079407, + "learning_rate": 0.0004972714472263581, + "loss": 3.3027, + "step": 16625 + }, + { + "epoch": 0.81, + "grad_norm": 0.511198878288269, + "learning_rate": 0.00049725984759755, + "loss": 3.229, + "step": 16626 + }, + { + "epoch": 0.81, + "grad_norm": 0.5765299797058105, + "learning_rate": 0.0004972482474491975, + "loss": 3.101, + "step": 16627 + }, + { + "epoch": 0.81, + "grad_norm": 0.5171129703521729, + "learning_rate": 0.0004972366467813309, + "loss": 3.1579, + "step": 16628 + }, + { + "epoch": 0.81, + "grad_norm": 0.5174678564071655, + "learning_rate": 0.0004972250455939811, + "loss": 3.2538, + "step": 16629 + }, + { + "epoch": 0.81, + "grad_norm": 0.5216899514198303, + "learning_rate": 0.0004972134438871786, + "loss": 3.0923, + "step": 16630 + }, + { + "epoch": 0.82, + "grad_norm": 0.5786752104759216, + "learning_rate": 0.0004972018416609536, + "loss": 3.127, + "step": 16631 + }, + { + "epoch": 0.82, + "grad_norm": 0.5211129784584045, + "learning_rate": 0.000497190238915337, + "loss": 3.1011, + "step": 16632 + }, + { + "epoch": 0.82, + "grad_norm": 0.5141299366950989, + "learning_rate": 0.0004971786356503592, + "loss": 2.9091, + "step": 16633 + }, + { + "epoch": 0.82, + "grad_norm": 0.48368707299232483, + "learning_rate": 0.0004971670318660509, + "loss": 3.324, + "step": 16634 + }, + { + "epoch": 0.82, + "grad_norm": 0.47780829668045044, + "learning_rate": 0.0004971554275624425, + "loss": 3.0679, + "step": 16635 + }, + { + "epoch": 0.82, + "grad_norm": 0.5046222805976868, + "learning_rate": 0.0004971438227395648, + "loss": 3.2293, + "step": 16636 + }, + { + "epoch": 0.82, + "grad_norm": 0.5113916993141174, + "learning_rate": 0.000497132217397448, + "loss": 3.0101, + "step": 16637 + }, + { + "epoch": 0.82, + "grad_norm": 0.5229575037956238, + "learning_rate": 0.000497120611536123, + "loss": 3.2457, + "step": 16638 + }, + { + "epoch": 0.82, + "grad_norm": 0.512291669845581, + "learning_rate": 0.0004971090051556202, + "loss": 3.259, + "step": 16639 + }, + { + "epoch": 0.82, + "grad_norm": 0.5554661750793457, + "learning_rate": 0.0004970973982559702, + "loss": 3.1555, + "step": 16640 + }, + { + "epoch": 0.82, + "grad_norm": 0.5595203042030334, + "learning_rate": 0.0004970857908372037, + "loss": 3.1944, + "step": 16641 + }, + { + "epoch": 0.82, + "grad_norm": 0.5318277478218079, + "learning_rate": 0.0004970741828993511, + "loss": 2.9727, + "step": 16642 + }, + { + "epoch": 0.82, + "grad_norm": 0.5298823714256287, + "learning_rate": 0.0004970625744424431, + "loss": 2.9161, + "step": 16643 + }, + { + "epoch": 0.82, + "grad_norm": 0.5190830230712891, + "learning_rate": 0.0004970509654665101, + "loss": 3.2831, + "step": 16644 + }, + { + "epoch": 0.82, + "grad_norm": 0.5548256039619446, + "learning_rate": 0.0004970393559715827, + "loss": 3.3253, + "step": 16645 + }, + { + "epoch": 0.82, + "grad_norm": 0.5534726977348328, + "learning_rate": 0.0004970277459576917, + "loss": 3.0462, + "step": 16646 + }, + { + "epoch": 0.82, + "grad_norm": 0.5380357503890991, + "learning_rate": 0.0004970161354248675, + "loss": 3.2276, + "step": 16647 + }, + { + "epoch": 0.82, + "grad_norm": 0.5422712564468384, + "learning_rate": 0.0004970045243731406, + "loss": 3.0344, + "step": 16648 + }, + { + "epoch": 0.82, + "grad_norm": 0.5209787487983704, + "learning_rate": 0.0004969929128025419, + "loss": 3.4289, + "step": 16649 + }, + { + "epoch": 0.82, + "grad_norm": 0.5309789776802063, + "learning_rate": 0.0004969813007131016, + "loss": 3.2243, + "step": 16650 + }, + { + "epoch": 0.82, + "grad_norm": 0.5202519297599792, + "learning_rate": 0.0004969696881048507, + "loss": 3.3096, + "step": 16651 + }, + { + "epoch": 0.82, + "grad_norm": 0.5129031538963318, + "learning_rate": 0.0004969580749778193, + "loss": 3.2491, + "step": 16652 + }, + { + "epoch": 0.82, + "grad_norm": 0.5805734395980835, + "learning_rate": 0.0004969464613320384, + "loss": 3.1584, + "step": 16653 + }, + { + "epoch": 0.82, + "grad_norm": 0.5051552057266235, + "learning_rate": 0.0004969348471675384, + "loss": 3.1706, + "step": 16654 + }, + { + "epoch": 0.82, + "grad_norm": 0.5367906093597412, + "learning_rate": 0.0004969232324843497, + "loss": 3.2394, + "step": 16655 + }, + { + "epoch": 0.82, + "grad_norm": 0.48859554529190063, + "learning_rate": 0.0004969116172825034, + "loss": 3.1854, + "step": 16656 + }, + { + "epoch": 0.82, + "grad_norm": 0.5127988457679749, + "learning_rate": 0.0004969000015620297, + "loss": 3.1622, + "step": 16657 + }, + { + "epoch": 0.82, + "grad_norm": 0.5549734830856323, + "learning_rate": 0.0004968883853229592, + "loss": 3.1321, + "step": 16658 + }, + { + "epoch": 0.82, + "grad_norm": 0.5598774552345276, + "learning_rate": 0.0004968767685653226, + "loss": 3.4022, + "step": 16659 + }, + { + "epoch": 0.82, + "grad_norm": 0.5506905913352966, + "learning_rate": 0.0004968651512891506, + "loss": 3.3344, + "step": 16660 + }, + { + "epoch": 0.82, + "grad_norm": 0.5095792412757874, + "learning_rate": 0.0004968535334944736, + "loss": 3.1894, + "step": 16661 + }, + { + "epoch": 0.82, + "grad_norm": 0.526882529258728, + "learning_rate": 0.0004968419151813224, + "loss": 3.3501, + "step": 16662 + }, + { + "epoch": 0.82, + "grad_norm": 0.5462402105331421, + "learning_rate": 0.0004968302963497273, + "loss": 2.8678, + "step": 16663 + }, + { + "epoch": 0.82, + "grad_norm": 0.528898298740387, + "learning_rate": 0.0004968186769997191, + "loss": 3.1944, + "step": 16664 + }, + { + "epoch": 0.82, + "grad_norm": 0.5134194493293762, + "learning_rate": 0.0004968070571313285, + "loss": 3.1576, + "step": 16665 + }, + { + "epoch": 0.82, + "grad_norm": 0.5237089395523071, + "learning_rate": 0.0004967954367445859, + "loss": 3.267, + "step": 16666 + }, + { + "epoch": 0.82, + "grad_norm": 0.558527946472168, + "learning_rate": 0.0004967838158395219, + "loss": 3.1597, + "step": 16667 + }, + { + "epoch": 0.82, + "grad_norm": 0.5071585178375244, + "learning_rate": 0.0004967721944161673, + "loss": 3.2747, + "step": 16668 + }, + { + "epoch": 0.82, + "grad_norm": 0.5414650440216064, + "learning_rate": 0.0004967605724745527, + "loss": 3.1734, + "step": 16669 + }, + { + "epoch": 0.82, + "grad_norm": 0.48503783345222473, + "learning_rate": 0.0004967489500147086, + "loss": 3.1875, + "step": 16670 + }, + { + "epoch": 0.82, + "grad_norm": 0.5278720855712891, + "learning_rate": 0.0004967373270366655, + "loss": 3.0154, + "step": 16671 + }, + { + "epoch": 0.82, + "grad_norm": 0.5245896577835083, + "learning_rate": 0.0004967257035404542, + "loss": 2.9901, + "step": 16672 + }, + { + "epoch": 0.82, + "grad_norm": 0.5041775107383728, + "learning_rate": 0.0004967140795261053, + "loss": 3.1008, + "step": 16673 + }, + { + "epoch": 0.82, + "grad_norm": 0.5002780556678772, + "learning_rate": 0.0004967024549936493, + "loss": 3.3013, + "step": 16674 + }, + { + "epoch": 0.82, + "grad_norm": 0.5385328531265259, + "learning_rate": 0.000496690829943117, + "loss": 3.3536, + "step": 16675 + }, + { + "epoch": 0.82, + "grad_norm": 0.47223567962646484, + "learning_rate": 0.0004966792043745389, + "loss": 3.1565, + "step": 16676 + }, + { + "epoch": 0.82, + "grad_norm": 0.5765127539634705, + "learning_rate": 0.0004966675782879455, + "loss": 3.1308, + "step": 16677 + }, + { + "epoch": 0.82, + "grad_norm": 0.5132501125335693, + "learning_rate": 0.0004966559516833677, + "loss": 3.2131, + "step": 16678 + }, + { + "epoch": 0.82, + "grad_norm": 0.5195982456207275, + "learning_rate": 0.0004966443245608359, + "loss": 3.3829, + "step": 16679 + }, + { + "epoch": 0.82, + "grad_norm": 0.4910086989402771, + "learning_rate": 0.0004966326969203807, + "loss": 3.2672, + "step": 16680 + }, + { + "epoch": 0.82, + "grad_norm": 0.5135030746459961, + "learning_rate": 0.0004966210687620329, + "loss": 3.3715, + "step": 16681 + }, + { + "epoch": 0.82, + "grad_norm": 0.5263185501098633, + "learning_rate": 0.0004966094400858231, + "loss": 3.0218, + "step": 16682 + }, + { + "epoch": 0.82, + "grad_norm": 0.5208287239074707, + "learning_rate": 0.0004965978108917818, + "loss": 3.1165, + "step": 16683 + }, + { + "epoch": 0.82, + "grad_norm": 0.5360139012336731, + "learning_rate": 0.0004965861811799397, + "loss": 3.0333, + "step": 16684 + }, + { + "epoch": 0.82, + "grad_norm": 0.47904497385025024, + "learning_rate": 0.0004965745509503275, + "loss": 3.2553, + "step": 16685 + }, + { + "epoch": 0.82, + "grad_norm": 0.4816090166568756, + "learning_rate": 0.0004965629202029758, + "loss": 3.1955, + "step": 16686 + }, + { + "epoch": 0.82, + "grad_norm": 0.5097158551216125, + "learning_rate": 0.0004965512889379151, + "loss": 3.2383, + "step": 16687 + }, + { + "epoch": 0.82, + "grad_norm": 0.509074330329895, + "learning_rate": 0.000496539657155176, + "loss": 3.2118, + "step": 16688 + }, + { + "epoch": 0.82, + "grad_norm": 0.5360403060913086, + "learning_rate": 0.0004965280248547895, + "loss": 2.9973, + "step": 16689 + }, + { + "epoch": 0.82, + "grad_norm": 0.5154137015342712, + "learning_rate": 0.0004965163920367859, + "loss": 3.3851, + "step": 16690 + }, + { + "epoch": 0.82, + "grad_norm": 0.48965588212013245, + "learning_rate": 0.0004965047587011959, + "loss": 3.2216, + "step": 16691 + }, + { + "epoch": 0.82, + "grad_norm": 0.4903731048107147, + "learning_rate": 0.0004964931248480503, + "loss": 3.2034, + "step": 16692 + }, + { + "epoch": 0.82, + "grad_norm": 0.5228937268257141, + "learning_rate": 0.0004964814904773795, + "loss": 3.1598, + "step": 16693 + }, + { + "epoch": 0.82, + "grad_norm": 0.5695022940635681, + "learning_rate": 0.0004964698555892144, + "loss": 3.1179, + "step": 16694 + }, + { + "epoch": 0.82, + "grad_norm": 0.5156790018081665, + "learning_rate": 0.0004964582201835855, + "loss": 3.0333, + "step": 16695 + }, + { + "epoch": 0.82, + "grad_norm": 0.5029388666152954, + "learning_rate": 0.0004964465842605234, + "loss": 3.1303, + "step": 16696 + }, + { + "epoch": 0.82, + "grad_norm": 0.5495452880859375, + "learning_rate": 0.0004964349478200588, + "loss": 3.178, + "step": 16697 + }, + { + "epoch": 0.82, + "grad_norm": 0.4993706941604614, + "learning_rate": 0.0004964233108622224, + "loss": 3.4597, + "step": 16698 + }, + { + "epoch": 0.82, + "grad_norm": 0.5246551632881165, + "learning_rate": 0.0004964116733870448, + "loss": 3.1988, + "step": 16699 + }, + { + "epoch": 0.82, + "grad_norm": 0.5039535164833069, + "learning_rate": 0.0004964000353945566, + "loss": 3.3829, + "step": 16700 + }, + { + "epoch": 0.82, + "grad_norm": 0.5181304812431335, + "learning_rate": 0.0004963883968847884, + "loss": 3.318, + "step": 16701 + }, + { + "epoch": 0.82, + "grad_norm": 0.5846447944641113, + "learning_rate": 0.0004963767578577712, + "loss": 3.1945, + "step": 16702 + }, + { + "epoch": 0.82, + "grad_norm": 0.518142580986023, + "learning_rate": 0.0004963651183135353, + "loss": 3.2293, + "step": 16703 + }, + { + "epoch": 0.82, + "grad_norm": 0.5273866057395935, + "learning_rate": 0.0004963534782521115, + "loss": 3.0665, + "step": 16704 + }, + { + "epoch": 0.82, + "grad_norm": 0.5090498328208923, + "learning_rate": 0.0004963418376735303, + "loss": 3.1259, + "step": 16705 + }, + { + "epoch": 0.82, + "grad_norm": 0.533167839050293, + "learning_rate": 0.0004963301965778226, + "loss": 3.3109, + "step": 16706 + }, + { + "epoch": 0.82, + "grad_norm": 0.5366831421852112, + "learning_rate": 0.0004963185549650189, + "loss": 3.0486, + "step": 16707 + }, + { + "epoch": 0.82, + "grad_norm": 0.5503694415092468, + "learning_rate": 0.00049630691283515, + "loss": 3.4465, + "step": 16708 + }, + { + "epoch": 0.82, + "grad_norm": 0.5301731824874878, + "learning_rate": 0.0004962952701882463, + "loss": 3.219, + "step": 16709 + }, + { + "epoch": 0.82, + "grad_norm": 0.4929310977458954, + "learning_rate": 0.0004962836270243388, + "loss": 3.2798, + "step": 16710 + }, + { + "epoch": 0.82, + "grad_norm": 0.5285437703132629, + "learning_rate": 0.0004962719833434579, + "loss": 3.0686, + "step": 16711 + }, + { + "epoch": 0.82, + "grad_norm": 0.5135146975517273, + "learning_rate": 0.0004962603391456345, + "loss": 3.1963, + "step": 16712 + }, + { + "epoch": 0.82, + "grad_norm": 0.5494550466537476, + "learning_rate": 0.000496248694430899, + "loss": 3.0039, + "step": 16713 + }, + { + "epoch": 0.82, + "grad_norm": 0.5301375389099121, + "learning_rate": 0.0004962370491992823, + "loss": 3.179, + "step": 16714 + }, + { + "epoch": 0.82, + "grad_norm": 0.5424580574035645, + "learning_rate": 0.000496225403450815, + "loss": 3.2411, + "step": 16715 + }, + { + "epoch": 0.82, + "grad_norm": 0.5308148264884949, + "learning_rate": 0.0004962137571855276, + "loss": 3.3935, + "step": 16716 + }, + { + "epoch": 0.82, + "grad_norm": 0.5148587226867676, + "learning_rate": 0.0004962021104034511, + "loss": 3.0353, + "step": 16717 + }, + { + "epoch": 0.82, + "grad_norm": 0.5002628564834595, + "learning_rate": 0.0004961904631046158, + "loss": 3.2585, + "step": 16718 + }, + { + "epoch": 0.82, + "grad_norm": 0.5126210451126099, + "learning_rate": 0.0004961788152890527, + "loss": 3.2791, + "step": 16719 + }, + { + "epoch": 0.82, + "grad_norm": 0.5162221789360046, + "learning_rate": 0.0004961671669567924, + "loss": 3.1034, + "step": 16720 + }, + { + "epoch": 0.82, + "grad_norm": 0.4941016137599945, + "learning_rate": 0.0004961555181078655, + "loss": 3.2295, + "step": 16721 + }, + { + "epoch": 0.82, + "grad_norm": 0.5905763506889343, + "learning_rate": 0.0004961438687423027, + "loss": 3.0365, + "step": 16722 + }, + { + "epoch": 0.82, + "grad_norm": 0.49847185611724854, + "learning_rate": 0.0004961322188601347, + "loss": 3.0049, + "step": 16723 + }, + { + "epoch": 0.82, + "grad_norm": 0.49443936347961426, + "learning_rate": 0.0004961205684613922, + "loss": 3.2913, + "step": 16724 + }, + { + "epoch": 0.82, + "grad_norm": 0.5241524577140808, + "learning_rate": 0.0004961089175461059, + "loss": 2.907, + "step": 16725 + }, + { + "epoch": 0.82, + "grad_norm": 0.5237706303596497, + "learning_rate": 0.0004960972661143064, + "loss": 3.1307, + "step": 16726 + }, + { + "epoch": 0.82, + "grad_norm": 0.5108638405799866, + "learning_rate": 0.0004960856141660244, + "loss": 3.271, + "step": 16727 + }, + { + "epoch": 0.82, + "grad_norm": 0.591118335723877, + "learning_rate": 0.0004960739617012906, + "loss": 3.2943, + "step": 16728 + }, + { + "epoch": 0.82, + "grad_norm": 0.5045537352561951, + "learning_rate": 0.0004960623087201358, + "loss": 3.2844, + "step": 16729 + }, + { + "epoch": 0.82, + "grad_norm": 0.569861650466919, + "learning_rate": 0.0004960506552225906, + "loss": 3.243, + "step": 16730 + }, + { + "epoch": 0.82, + "grad_norm": 0.5104923844337463, + "learning_rate": 0.0004960390012086858, + "loss": 3.2714, + "step": 16731 + }, + { + "epoch": 0.82, + "grad_norm": 0.5149794220924377, + "learning_rate": 0.0004960273466784519, + "loss": 2.9799, + "step": 16732 + }, + { + "epoch": 0.82, + "grad_norm": 0.5185301303863525, + "learning_rate": 0.0004960156916319196, + "loss": 3.1996, + "step": 16733 + }, + { + "epoch": 0.82, + "grad_norm": 0.5345672369003296, + "learning_rate": 0.0004960040360691199, + "loss": 3.1292, + "step": 16734 + }, + { + "epoch": 0.82, + "grad_norm": 0.5602273941040039, + "learning_rate": 0.0004959923799900831, + "loss": 3.1398, + "step": 16735 + }, + { + "epoch": 0.82, + "grad_norm": 0.5504186153411865, + "learning_rate": 0.0004959807233948403, + "loss": 3.2962, + "step": 16736 + }, + { + "epoch": 0.82, + "grad_norm": 0.5613498687744141, + "learning_rate": 0.0004959690662834219, + "loss": 3.1052, + "step": 16737 + }, + { + "epoch": 0.82, + "grad_norm": 0.5954709053039551, + "learning_rate": 0.0004959574086558586, + "loss": 3.1258, + "step": 16738 + }, + { + "epoch": 0.82, + "grad_norm": 0.5291658639907837, + "learning_rate": 0.0004959457505121813, + "loss": 3.2572, + "step": 16739 + }, + { + "epoch": 0.82, + "grad_norm": 0.5105869174003601, + "learning_rate": 0.0004959340918524205, + "loss": 3.157, + "step": 16740 + }, + { + "epoch": 0.82, + "grad_norm": 0.5140098333358765, + "learning_rate": 0.0004959224326766071, + "loss": 3.2509, + "step": 16741 + }, + { + "epoch": 0.82, + "grad_norm": 0.5428385734558105, + "learning_rate": 0.0004959107729847717, + "loss": 2.8749, + "step": 16742 + }, + { + "epoch": 0.82, + "grad_norm": 0.5247282385826111, + "learning_rate": 0.000495899112776945, + "loss": 2.9758, + "step": 16743 + }, + { + "epoch": 0.82, + "grad_norm": 0.5175803303718567, + "learning_rate": 0.0004958874520531578, + "loss": 3.2636, + "step": 16744 + }, + { + "epoch": 0.82, + "grad_norm": 0.5175894498825073, + "learning_rate": 0.0004958757908134407, + "loss": 3.1949, + "step": 16745 + }, + { + "epoch": 0.82, + "grad_norm": 0.49212566018104553, + "learning_rate": 0.0004958641290578245, + "loss": 3.232, + "step": 16746 + }, + { + "epoch": 0.82, + "grad_norm": 0.5174520015716553, + "learning_rate": 0.0004958524667863399, + "loss": 3.2229, + "step": 16747 + }, + { + "epoch": 0.82, + "grad_norm": 0.5180816054344177, + "learning_rate": 0.0004958408039990174, + "loss": 3.1441, + "step": 16748 + }, + { + "epoch": 0.82, + "grad_norm": 0.5078367590904236, + "learning_rate": 0.0004958291406958881, + "loss": 3.1435, + "step": 16749 + }, + { + "epoch": 0.82, + "grad_norm": 0.5164198875427246, + "learning_rate": 0.0004958174768769825, + "loss": 3.0394, + "step": 16750 + }, + { + "epoch": 0.82, + "grad_norm": 0.5266910791397095, + "learning_rate": 0.0004958058125423315, + "loss": 3.048, + "step": 16751 + }, + { + "epoch": 0.82, + "grad_norm": 0.523522675037384, + "learning_rate": 0.0004957941476919654, + "loss": 3.1053, + "step": 16752 + }, + { + "epoch": 0.82, + "grad_norm": 0.5101291537284851, + "learning_rate": 0.0004957824823259154, + "loss": 3.1284, + "step": 16753 + }, + { + "epoch": 0.82, + "grad_norm": 0.5190765261650085, + "learning_rate": 0.0004957708164442119, + "loss": 3.1728, + "step": 16754 + }, + { + "epoch": 0.82, + "grad_norm": 0.5073868632316589, + "learning_rate": 0.0004957591500468856, + "loss": 3.3285, + "step": 16755 + }, + { + "epoch": 0.82, + "grad_norm": 0.5432148575782776, + "learning_rate": 0.0004957474831339678, + "loss": 2.9638, + "step": 16756 + }, + { + "epoch": 0.82, + "grad_norm": 0.5055432319641113, + "learning_rate": 0.0004957358157054885, + "loss": 3.1185, + "step": 16757 + }, + { + "epoch": 0.82, + "grad_norm": 0.5079781413078308, + "learning_rate": 0.0004957241477614787, + "loss": 3.0659, + "step": 16758 + }, + { + "epoch": 0.82, + "grad_norm": 0.5281221866607666, + "learning_rate": 0.0004957124793019694, + "loss": 2.9227, + "step": 16759 + }, + { + "epoch": 0.82, + "grad_norm": 0.5235075950622559, + "learning_rate": 0.0004957008103269908, + "loss": 3.1578, + "step": 16760 + }, + { + "epoch": 0.82, + "grad_norm": 0.49377989768981934, + "learning_rate": 0.0004956891408365741, + "loss": 3.3034, + "step": 16761 + }, + { + "epoch": 0.82, + "grad_norm": 0.5380136370658875, + "learning_rate": 0.0004956774708307499, + "loss": 3.1188, + "step": 16762 + }, + { + "epoch": 0.82, + "grad_norm": 0.5021156072616577, + "learning_rate": 0.0004956658003095488, + "loss": 3.0115, + "step": 16763 + }, + { + "epoch": 0.82, + "grad_norm": 0.5770875811576843, + "learning_rate": 0.0004956541292730017, + "loss": 3.1021, + "step": 16764 + }, + { + "epoch": 0.82, + "grad_norm": 0.5153395533561707, + "learning_rate": 0.0004956424577211392, + "loss": 3.0592, + "step": 16765 + }, + { + "epoch": 0.82, + "grad_norm": 0.5426536798477173, + "learning_rate": 0.0004956307856539922, + "loss": 3.2771, + "step": 16766 + }, + { + "epoch": 0.82, + "grad_norm": 0.5408112406730652, + "learning_rate": 0.0004956191130715915, + "loss": 3.2663, + "step": 16767 + }, + { + "epoch": 0.82, + "grad_norm": 0.5236942768096924, + "learning_rate": 0.0004956074399739674, + "loss": 3.2467, + "step": 16768 + }, + { + "epoch": 0.82, + "grad_norm": 0.523478627204895, + "learning_rate": 0.000495595766361151, + "loss": 3.16, + "step": 16769 + }, + { + "epoch": 0.82, + "grad_norm": 0.5341914296150208, + "learning_rate": 0.0004955840922331732, + "loss": 3.1799, + "step": 16770 + }, + { + "epoch": 0.82, + "grad_norm": 0.5106289982795715, + "learning_rate": 0.0004955724175900644, + "loss": 3.0653, + "step": 16771 + }, + { + "epoch": 0.82, + "grad_norm": 0.5742872357368469, + "learning_rate": 0.0004955607424318555, + "loss": 3.1742, + "step": 16772 + }, + { + "epoch": 0.82, + "grad_norm": 0.5103678107261658, + "learning_rate": 0.0004955490667585773, + "loss": 3.3389, + "step": 16773 + }, + { + "epoch": 0.82, + "grad_norm": 0.5044515132904053, + "learning_rate": 0.0004955373905702604, + "loss": 2.9904, + "step": 16774 + }, + { + "epoch": 0.82, + "grad_norm": 0.5162826776504517, + "learning_rate": 0.0004955257138669357, + "loss": 3.3376, + "step": 16775 + }, + { + "epoch": 0.82, + "grad_norm": 0.545150101184845, + "learning_rate": 0.000495514036648634, + "loss": 3.0011, + "step": 16776 + }, + { + "epoch": 0.82, + "grad_norm": 0.5908412337303162, + "learning_rate": 0.0004955023589153858, + "loss": 3.2328, + "step": 16777 + }, + { + "epoch": 0.82, + "grad_norm": 0.5039973258972168, + "learning_rate": 0.000495490680667222, + "loss": 3.0851, + "step": 16778 + }, + { + "epoch": 0.82, + "grad_norm": 0.5710932016372681, + "learning_rate": 0.0004954790019041735, + "loss": 3.0769, + "step": 16779 + }, + { + "epoch": 0.82, + "grad_norm": 0.48418980836868286, + "learning_rate": 0.0004954673226262708, + "loss": 3.1852, + "step": 16780 + }, + { + "epoch": 0.82, + "grad_norm": 0.540973424911499, + "learning_rate": 0.0004954556428335449, + "loss": 3.1554, + "step": 16781 + }, + { + "epoch": 0.82, + "grad_norm": 0.5249459147453308, + "learning_rate": 0.0004954439625260264, + "loss": 3.1507, + "step": 16782 + }, + { + "epoch": 0.82, + "grad_norm": 0.562673807144165, + "learning_rate": 0.000495432281703746, + "loss": 3.2198, + "step": 16783 + }, + { + "epoch": 0.82, + "grad_norm": 0.49689170718193054, + "learning_rate": 0.0004954206003667347, + "loss": 3.1578, + "step": 16784 + }, + { + "epoch": 0.82, + "grad_norm": 0.5022982954978943, + "learning_rate": 0.0004954089185150233, + "loss": 3.2703, + "step": 16785 + }, + { + "epoch": 0.82, + "grad_norm": 0.5085593461990356, + "learning_rate": 0.0004953972361486423, + "loss": 3.3836, + "step": 16786 + }, + { + "epoch": 0.82, + "grad_norm": 0.5071133375167847, + "learning_rate": 0.0004953855532676225, + "loss": 3.1033, + "step": 16787 + }, + { + "epoch": 0.82, + "grad_norm": 0.5436131358146667, + "learning_rate": 0.0004953738698719948, + "loss": 3.074, + "step": 16788 + }, + { + "epoch": 0.82, + "grad_norm": 0.5148676037788391, + "learning_rate": 0.0004953621859617899, + "loss": 3.1817, + "step": 16789 + }, + { + "epoch": 0.82, + "grad_norm": 0.4939463138580322, + "learning_rate": 0.0004953505015370387, + "loss": 3.221, + "step": 16790 + }, + { + "epoch": 0.82, + "grad_norm": 0.49772271513938904, + "learning_rate": 0.0004953388165977717, + "loss": 3.0617, + "step": 16791 + }, + { + "epoch": 0.82, + "grad_norm": 0.5314618945121765, + "learning_rate": 0.00049532713114402, + "loss": 3.2513, + "step": 16792 + }, + { + "epoch": 0.82, + "grad_norm": 0.5048306584358215, + "learning_rate": 0.0004953154451758143, + "loss": 3.2086, + "step": 16793 + }, + { + "epoch": 0.82, + "grad_norm": 0.5072234272956848, + "learning_rate": 0.0004953037586931851, + "loss": 3.1794, + "step": 16794 + }, + { + "epoch": 0.82, + "grad_norm": 0.5115417838096619, + "learning_rate": 0.0004952920716961635, + "loss": 3.2774, + "step": 16795 + }, + { + "epoch": 0.82, + "grad_norm": 0.5184221267700195, + "learning_rate": 0.0004952803841847802, + "loss": 3.1443, + "step": 16796 + }, + { + "epoch": 0.82, + "grad_norm": 0.5140826106071472, + "learning_rate": 0.0004952686961590658, + "loss": 3.1012, + "step": 16797 + }, + { + "epoch": 0.82, + "grad_norm": 0.5001091957092285, + "learning_rate": 0.0004952570076190514, + "loss": 3.2363, + "step": 16798 + }, + { + "epoch": 0.82, + "grad_norm": 0.5328136682510376, + "learning_rate": 0.0004952453185647674, + "loss": 3.1932, + "step": 16799 + }, + { + "epoch": 0.82, + "grad_norm": 0.5111890435218811, + "learning_rate": 0.000495233628996245, + "loss": 3.1625, + "step": 16800 + }, + { + "epoch": 0.82, + "grad_norm": 0.5230638980865479, + "learning_rate": 0.0004952219389135149, + "loss": 3.1986, + "step": 16801 + }, + { + "epoch": 0.82, + "grad_norm": 0.5159973502159119, + "learning_rate": 0.0004952102483166076, + "loss": 3.3281, + "step": 16802 + }, + { + "epoch": 0.82, + "grad_norm": 0.5228443145751953, + "learning_rate": 0.000495198557205554, + "loss": 3.2278, + "step": 16803 + }, + { + "epoch": 0.82, + "grad_norm": 0.5011665225028992, + "learning_rate": 0.0004951868655803851, + "loss": 2.9619, + "step": 16804 + }, + { + "epoch": 0.82, + "grad_norm": 0.5224314332008362, + "learning_rate": 0.0004951751734411316, + "loss": 3.0738, + "step": 16805 + }, + { + "epoch": 0.82, + "grad_norm": 0.5976347327232361, + "learning_rate": 0.0004951634807878241, + "loss": 3.1095, + "step": 16806 + }, + { + "epoch": 0.82, + "grad_norm": 0.5195080041885376, + "learning_rate": 0.0004951517876204938, + "loss": 2.9196, + "step": 16807 + }, + { + "epoch": 0.82, + "grad_norm": 0.5941303968429565, + "learning_rate": 0.000495140093939171, + "loss": 2.9227, + "step": 16808 + }, + { + "epoch": 0.82, + "grad_norm": 0.48703116178512573, + "learning_rate": 0.0004951283997438869, + "loss": 3.1429, + "step": 16809 + }, + { + "epoch": 0.82, + "grad_norm": 0.4839284420013428, + "learning_rate": 0.0004951167050346721, + "loss": 3.3139, + "step": 16810 + }, + { + "epoch": 0.82, + "grad_norm": 0.5387030243873596, + "learning_rate": 0.0004951050098115574, + "loss": 3.2789, + "step": 16811 + }, + { + "epoch": 0.82, + "grad_norm": 0.5388344526290894, + "learning_rate": 0.0004950933140745737, + "loss": 3.1641, + "step": 16812 + }, + { + "epoch": 0.82, + "grad_norm": 0.5535876154899597, + "learning_rate": 0.0004950816178237518, + "loss": 3.4473, + "step": 16813 + }, + { + "epoch": 0.82, + "grad_norm": 0.49097132682800293, + "learning_rate": 0.0004950699210591223, + "loss": 3.133, + "step": 16814 + }, + { + "epoch": 0.82, + "grad_norm": 0.5270285606384277, + "learning_rate": 0.0004950582237807163, + "loss": 3.2307, + "step": 16815 + }, + { + "epoch": 0.82, + "grad_norm": 0.48164236545562744, + "learning_rate": 0.0004950465259885645, + "loss": 3.1475, + "step": 16816 + }, + { + "epoch": 0.82, + "grad_norm": 0.531959593296051, + "learning_rate": 0.0004950348276826977, + "loss": 2.9178, + "step": 16817 + }, + { + "epoch": 0.82, + "grad_norm": 0.5036345720291138, + "learning_rate": 0.0004950231288631466, + "loss": 3.4169, + "step": 16818 + }, + { + "epoch": 0.82, + "grad_norm": 0.5073093175888062, + "learning_rate": 0.0004950114295299422, + "loss": 3.1668, + "step": 16819 + }, + { + "epoch": 0.82, + "grad_norm": 0.5157398581504822, + "learning_rate": 0.0004949997296831152, + "loss": 3.0857, + "step": 16820 + }, + { + "epoch": 0.82, + "grad_norm": 0.5297724008560181, + "learning_rate": 0.0004949880293226964, + "loss": 3.0957, + "step": 16821 + }, + { + "epoch": 0.82, + "grad_norm": 0.5831009149551392, + "learning_rate": 0.0004949763284487166, + "loss": 3.3712, + "step": 16822 + }, + { + "epoch": 0.82, + "grad_norm": 0.5394142866134644, + "learning_rate": 0.0004949646270612069, + "loss": 3.1525, + "step": 16823 + }, + { + "epoch": 0.82, + "grad_norm": 0.5234691500663757, + "learning_rate": 0.0004949529251601977, + "loss": 3.28, + "step": 16824 + }, + { + "epoch": 0.82, + "grad_norm": 0.5316514372825623, + "learning_rate": 0.00049494122274572, + "loss": 3.252, + "step": 16825 + }, + { + "epoch": 0.82, + "grad_norm": 0.5195567011833191, + "learning_rate": 0.0004949295198178048, + "loss": 3.2669, + "step": 16826 + }, + { + "epoch": 0.82, + "grad_norm": 0.5230311751365662, + "learning_rate": 0.0004949178163764827, + "loss": 3.2243, + "step": 16827 + }, + { + "epoch": 0.82, + "grad_norm": 0.5435062646865845, + "learning_rate": 0.0004949061124217845, + "loss": 3.031, + "step": 16828 + }, + { + "epoch": 0.82, + "grad_norm": 0.5098403096199036, + "learning_rate": 0.0004948944079537413, + "loss": 3.1558, + "step": 16829 + }, + { + "epoch": 0.82, + "grad_norm": 0.5283143520355225, + "learning_rate": 0.0004948827029723834, + "loss": 3.2979, + "step": 16830 + }, + { + "epoch": 0.82, + "grad_norm": 0.5382714867591858, + "learning_rate": 0.0004948709974777422, + "loss": 3.2907, + "step": 16831 + }, + { + "epoch": 0.82, + "grad_norm": 0.5517304539680481, + "learning_rate": 0.0004948592914698483, + "loss": 2.9451, + "step": 16832 + }, + { + "epoch": 0.82, + "grad_norm": 0.5490243434906006, + "learning_rate": 0.0004948475849487325, + "loss": 2.9241, + "step": 16833 + }, + { + "epoch": 0.82, + "grad_norm": 0.5387569665908813, + "learning_rate": 0.0004948358779144256, + "loss": 3.127, + "step": 16834 + }, + { + "epoch": 0.83, + "grad_norm": 0.48919016122817993, + "learning_rate": 0.0004948241703669585, + "loss": 3.278, + "step": 16835 + }, + { + "epoch": 0.83, + "grad_norm": 0.5328850746154785, + "learning_rate": 0.0004948124623063621, + "loss": 3.1679, + "step": 16836 + }, + { + "epoch": 0.83, + "grad_norm": 0.5232701897621155, + "learning_rate": 0.0004948007537326672, + "loss": 3.0376, + "step": 16837 + }, + { + "epoch": 0.83, + "grad_norm": 0.49141281843185425, + "learning_rate": 0.0004947890446459046, + "loss": 3.1381, + "step": 16838 + }, + { + "epoch": 0.83, + "grad_norm": 0.508006751537323, + "learning_rate": 0.0004947773350461051, + "loss": 3.0339, + "step": 16839 + }, + { + "epoch": 0.83, + "grad_norm": 0.5270553827285767, + "learning_rate": 0.0004947656249332995, + "loss": 3.0294, + "step": 16840 + }, + { + "epoch": 0.83, + "grad_norm": 0.5074304342269897, + "learning_rate": 0.0004947539143075188, + "loss": 3.3718, + "step": 16841 + }, + { + "epoch": 0.83, + "grad_norm": 0.5027759075164795, + "learning_rate": 0.0004947422031687938, + "loss": 3.2201, + "step": 16842 + }, + { + "epoch": 0.83, + "grad_norm": 0.5462760329246521, + "learning_rate": 0.0004947304915171553, + "loss": 3.1749, + "step": 16843 + }, + { + "epoch": 0.83, + "grad_norm": 0.5136876106262207, + "learning_rate": 0.0004947187793526341, + "loss": 3.087, + "step": 16844 + }, + { + "epoch": 0.83, + "grad_norm": 0.5102071762084961, + "learning_rate": 0.0004947070666752612, + "loss": 3.119, + "step": 16845 + }, + { + "epoch": 0.83, + "grad_norm": 0.5251736044883728, + "learning_rate": 0.0004946953534850672, + "loss": 3.0766, + "step": 16846 + }, + { + "epoch": 0.83, + "grad_norm": 0.5002880692481995, + "learning_rate": 0.0004946836397820833, + "loss": 3.1961, + "step": 16847 + }, + { + "epoch": 0.83, + "grad_norm": 0.5180115103721619, + "learning_rate": 0.0004946719255663402, + "loss": 3.291, + "step": 16848 + }, + { + "epoch": 0.83, + "grad_norm": 0.5141701102256775, + "learning_rate": 0.0004946602108378685, + "loss": 3.2056, + "step": 16849 + }, + { + "epoch": 0.83, + "grad_norm": 0.5428344011306763, + "learning_rate": 0.0004946484955966994, + "loss": 3.204, + "step": 16850 + }, + { + "epoch": 0.83, + "grad_norm": 0.5158218145370483, + "learning_rate": 0.0004946367798428636, + "loss": 3.2134, + "step": 16851 + }, + { + "epoch": 0.83, + "grad_norm": 0.5351789593696594, + "learning_rate": 0.0004946250635763919, + "loss": 3.0868, + "step": 16852 + }, + { + "epoch": 0.83, + "grad_norm": 0.5360104441642761, + "learning_rate": 0.0004946133467973153, + "loss": 2.9659, + "step": 16853 + }, + { + "epoch": 0.83, + "grad_norm": 0.5146509408950806, + "learning_rate": 0.0004946016295056646, + "loss": 3.2547, + "step": 16854 + }, + { + "epoch": 0.83, + "grad_norm": 0.5186895132064819, + "learning_rate": 0.0004945899117014706, + "loss": 3.1007, + "step": 16855 + }, + { + "epoch": 0.83, + "grad_norm": 0.5219805836677551, + "learning_rate": 0.0004945781933847644, + "loss": 2.9965, + "step": 16856 + }, + { + "epoch": 0.83, + "grad_norm": 0.5439258813858032, + "learning_rate": 0.0004945664745555766, + "loss": 3.1165, + "step": 16857 + }, + { + "epoch": 0.83, + "grad_norm": 0.5208057761192322, + "learning_rate": 0.0004945547552139382, + "loss": 3.1639, + "step": 16858 + }, + { + "epoch": 0.83, + "grad_norm": 0.5104433298110962, + "learning_rate": 0.0004945430353598799, + "loss": 3.1975, + "step": 16859 + }, + { + "epoch": 0.83, + "grad_norm": 0.5061667561531067, + "learning_rate": 0.0004945313149934327, + "loss": 3.0906, + "step": 16860 + }, + { + "epoch": 0.83, + "grad_norm": 0.5064367651939392, + "learning_rate": 0.0004945195941146275, + "loss": 3.2914, + "step": 16861 + }, + { + "epoch": 0.83, + "grad_norm": 0.5224226713180542, + "learning_rate": 0.0004945078727234951, + "loss": 3.2299, + "step": 16862 + }, + { + "epoch": 0.83, + "grad_norm": 0.5368633270263672, + "learning_rate": 0.0004944961508200664, + "loss": 3.2864, + "step": 16863 + }, + { + "epoch": 0.83, + "grad_norm": 0.5402868986129761, + "learning_rate": 0.0004944844284043723, + "loss": 3.0904, + "step": 16864 + }, + { + "epoch": 0.83, + "grad_norm": 0.5218072533607483, + "learning_rate": 0.0004944727054764436, + "loss": 3.2323, + "step": 16865 + }, + { + "epoch": 0.83, + "grad_norm": 0.5438421964645386, + "learning_rate": 0.0004944609820363112, + "loss": 3.0236, + "step": 16866 + }, + { + "epoch": 0.83, + "grad_norm": 0.5093616843223572, + "learning_rate": 0.0004944492580840061, + "loss": 3.2269, + "step": 16867 + }, + { + "epoch": 0.83, + "grad_norm": 0.5139485001564026, + "learning_rate": 0.000494437533619559, + "loss": 3.1943, + "step": 16868 + }, + { + "epoch": 0.83, + "grad_norm": 0.5316404700279236, + "learning_rate": 0.0004944258086430009, + "loss": 3.2521, + "step": 16869 + }, + { + "epoch": 0.83, + "grad_norm": 0.5201852321624756, + "learning_rate": 0.0004944140831543626, + "loss": 3.0447, + "step": 16870 + }, + { + "epoch": 0.83, + "grad_norm": 0.5127416849136353, + "learning_rate": 0.000494402357153675, + "loss": 3.1414, + "step": 16871 + }, + { + "epoch": 0.83, + "grad_norm": 0.5286155343055725, + "learning_rate": 0.0004943906306409691, + "loss": 2.8942, + "step": 16872 + }, + { + "epoch": 0.83, + "grad_norm": 0.4988267421722412, + "learning_rate": 0.0004943789036162756, + "loss": 3.2984, + "step": 16873 + }, + { + "epoch": 0.83, + "grad_norm": 0.5161953568458557, + "learning_rate": 0.0004943671760796255, + "loss": 3.3117, + "step": 16874 + }, + { + "epoch": 0.83, + "grad_norm": 0.5332638025283813, + "learning_rate": 0.0004943554480310497, + "loss": 3.0845, + "step": 16875 + }, + { + "epoch": 0.83, + "grad_norm": 0.5356992483139038, + "learning_rate": 0.000494343719470579, + "loss": 2.9884, + "step": 16876 + }, + { + "epoch": 0.83, + "grad_norm": 0.5340079069137573, + "learning_rate": 0.0004943319903982444, + "loss": 3.1248, + "step": 16877 + }, + { + "epoch": 0.83, + "grad_norm": 0.5949048399925232, + "learning_rate": 0.0004943202608140767, + "loss": 3.074, + "step": 16878 + }, + { + "epoch": 0.83, + "grad_norm": 0.4927256107330322, + "learning_rate": 0.0004943085307181069, + "loss": 3.1862, + "step": 16879 + }, + { + "epoch": 0.83, + "grad_norm": 0.5098362565040588, + "learning_rate": 0.0004942968001103656, + "loss": 3.249, + "step": 16880 + }, + { + "epoch": 0.83, + "grad_norm": 0.559185802936554, + "learning_rate": 0.0004942850689908842, + "loss": 3.3038, + "step": 16881 + }, + { + "epoch": 0.83, + "grad_norm": 0.5128463506698608, + "learning_rate": 0.0004942733373596932, + "loss": 3.3892, + "step": 16882 + }, + { + "epoch": 0.83, + "grad_norm": 0.5219104290008545, + "learning_rate": 0.0004942616052168236, + "loss": 3.3491, + "step": 16883 + }, + { + "epoch": 0.83, + "grad_norm": 0.5195140838623047, + "learning_rate": 0.0004942498725623064, + "loss": 3.1738, + "step": 16884 + }, + { + "epoch": 0.83, + "grad_norm": 0.634857177734375, + "learning_rate": 0.0004942381393961724, + "loss": 3.1203, + "step": 16885 + }, + { + "epoch": 0.83, + "grad_norm": 0.5101731419563293, + "learning_rate": 0.0004942264057184524, + "loss": 3.2297, + "step": 16886 + }, + { + "epoch": 0.83, + "grad_norm": 0.5076460242271423, + "learning_rate": 0.0004942146715291775, + "loss": 3.2373, + "step": 16887 + }, + { + "epoch": 0.83, + "grad_norm": 0.5291589498519897, + "learning_rate": 0.0004942029368283786, + "loss": 2.9861, + "step": 16888 + }, + { + "epoch": 0.83, + "grad_norm": 0.5351511836051941, + "learning_rate": 0.0004941912016160864, + "loss": 3.0871, + "step": 16889 + }, + { + "epoch": 0.83, + "grad_norm": 0.5280094146728516, + "learning_rate": 0.0004941794658923321, + "loss": 3.2685, + "step": 16890 + }, + { + "epoch": 0.83, + "grad_norm": 0.5539615750312805, + "learning_rate": 0.0004941677296571463, + "loss": 3.3623, + "step": 16891 + }, + { + "epoch": 0.83, + "grad_norm": 0.5121122002601624, + "learning_rate": 0.0004941559929105602, + "loss": 3.0476, + "step": 16892 + }, + { + "epoch": 0.83, + "grad_norm": 0.551249623298645, + "learning_rate": 0.0004941442556526045, + "loss": 2.9187, + "step": 16893 + }, + { + "epoch": 0.83, + "grad_norm": 0.5004875659942627, + "learning_rate": 0.0004941325178833102, + "loss": 3.1174, + "step": 16894 + }, + { + "epoch": 0.83, + "grad_norm": 0.5540305972099304, + "learning_rate": 0.0004941207796027084, + "loss": 3.2438, + "step": 16895 + }, + { + "epoch": 0.83, + "grad_norm": 0.5179722905158997, + "learning_rate": 0.0004941090408108296, + "loss": 3.0293, + "step": 16896 + }, + { + "epoch": 0.83, + "grad_norm": 0.5183007717132568, + "learning_rate": 0.000494097301507705, + "loss": 3.0611, + "step": 16897 + }, + { + "epoch": 0.83, + "grad_norm": 0.5226312279701233, + "learning_rate": 0.0004940855616933654, + "loss": 3.2248, + "step": 16898 + }, + { + "epoch": 0.83, + "grad_norm": 0.5023093819618225, + "learning_rate": 0.0004940738213678419, + "loss": 3.1283, + "step": 16899 + }, + { + "epoch": 0.83, + "grad_norm": 0.5120278596878052, + "learning_rate": 0.0004940620805311654, + "loss": 3.4025, + "step": 16900 + }, + { + "epoch": 0.83, + "grad_norm": 0.5045427083969116, + "learning_rate": 0.0004940503391833664, + "loss": 3.2388, + "step": 16901 + }, + { + "epoch": 0.83, + "grad_norm": 0.5513672232627869, + "learning_rate": 0.0004940385973244765, + "loss": 3.2291, + "step": 16902 + }, + { + "epoch": 0.83, + "grad_norm": 0.5159168243408203, + "learning_rate": 0.0004940268549545261, + "loss": 3.0269, + "step": 16903 + }, + { + "epoch": 0.83, + "grad_norm": 0.5583893656730652, + "learning_rate": 0.0004940151120735463, + "loss": 3.2074, + "step": 16904 + }, + { + "epoch": 0.83, + "grad_norm": 0.5445465445518494, + "learning_rate": 0.000494003368681568, + "loss": 3.1999, + "step": 16905 + }, + { + "epoch": 0.83, + "grad_norm": 0.5105122327804565, + "learning_rate": 0.0004939916247786223, + "loss": 3.1474, + "step": 16906 + }, + { + "epoch": 0.83, + "grad_norm": 0.5456762313842773, + "learning_rate": 0.0004939798803647398, + "loss": 3.3013, + "step": 16907 + }, + { + "epoch": 0.83, + "grad_norm": 0.5078533887863159, + "learning_rate": 0.0004939681354399518, + "loss": 3.309, + "step": 16908 + }, + { + "epoch": 0.83, + "grad_norm": 0.5101911425590515, + "learning_rate": 0.000493956390004289, + "loss": 2.935, + "step": 16909 + }, + { + "epoch": 0.83, + "grad_norm": 0.5133745074272156, + "learning_rate": 0.0004939446440577823, + "loss": 3.2223, + "step": 16910 + }, + { + "epoch": 0.83, + "grad_norm": 0.5557057857513428, + "learning_rate": 0.000493932897600463, + "loss": 3.179, + "step": 16911 + }, + { + "epoch": 0.83, + "grad_norm": 0.5115715861320496, + "learning_rate": 0.0004939211506323615, + "loss": 3.0295, + "step": 16912 + }, + { + "epoch": 0.83, + "grad_norm": 0.5192151069641113, + "learning_rate": 0.0004939094031535091, + "loss": 3.2356, + "step": 16913 + }, + { + "epoch": 0.83, + "grad_norm": 0.5089118480682373, + "learning_rate": 0.0004938976551639368, + "loss": 3.5075, + "step": 16914 + }, + { + "epoch": 0.83, + "grad_norm": 0.5137461423873901, + "learning_rate": 0.0004938859066636751, + "loss": 3.2699, + "step": 16915 + }, + { + "epoch": 0.83, + "grad_norm": 0.5064414143562317, + "learning_rate": 0.0004938741576527555, + "loss": 3.0867, + "step": 16916 + }, + { + "epoch": 0.83, + "grad_norm": 0.4723142683506012, + "learning_rate": 0.0004938624081312085, + "loss": 3.1705, + "step": 16917 + }, + { + "epoch": 0.83, + "grad_norm": 0.5553394556045532, + "learning_rate": 0.0004938506580990654, + "loss": 3.4789, + "step": 16918 + }, + { + "epoch": 0.83, + "grad_norm": 0.6030188202857971, + "learning_rate": 0.0004938389075563568, + "loss": 3.1171, + "step": 16919 + }, + { + "epoch": 0.83, + "grad_norm": 0.5029276013374329, + "learning_rate": 0.0004938271565031139, + "loss": 2.9408, + "step": 16920 + }, + { + "epoch": 0.83, + "grad_norm": 0.5751969814300537, + "learning_rate": 0.0004938154049393676, + "loss": 2.8694, + "step": 16921 + }, + { + "epoch": 0.83, + "grad_norm": 0.5256320834159851, + "learning_rate": 0.0004938036528651488, + "loss": 3.1617, + "step": 16922 + }, + { + "epoch": 0.83, + "grad_norm": 0.5677057504653931, + "learning_rate": 0.0004937919002804885, + "loss": 3.2418, + "step": 16923 + }, + { + "epoch": 0.83, + "grad_norm": 0.5156370401382446, + "learning_rate": 0.0004937801471854176, + "loss": 3.0715, + "step": 16924 + }, + { + "epoch": 0.83, + "grad_norm": 0.5339087843894958, + "learning_rate": 0.000493768393579967, + "loss": 3.2977, + "step": 16925 + }, + { + "epoch": 0.83, + "grad_norm": 0.5484094619750977, + "learning_rate": 0.000493756639464168, + "loss": 3.0911, + "step": 16926 + }, + { + "epoch": 0.83, + "grad_norm": 0.6116513013839722, + "learning_rate": 0.0004937448848380511, + "loss": 3.3455, + "step": 16927 + }, + { + "epoch": 0.83, + "grad_norm": 0.503478467464447, + "learning_rate": 0.0004937331297016474, + "loss": 3.2488, + "step": 16928 + }, + { + "epoch": 0.83, + "grad_norm": 0.5339494943618774, + "learning_rate": 0.000493721374054988, + "loss": 3.0493, + "step": 16929 + }, + { + "epoch": 0.83, + "grad_norm": 0.5303932428359985, + "learning_rate": 0.0004937096178981038, + "loss": 3.1785, + "step": 16930 + }, + { + "epoch": 0.83, + "grad_norm": 0.48240697383880615, + "learning_rate": 0.0004936978612310257, + "loss": 3.32, + "step": 16931 + }, + { + "epoch": 0.83, + "grad_norm": 0.5174941420555115, + "learning_rate": 0.0004936861040537848, + "loss": 3.2424, + "step": 16932 + }, + { + "epoch": 0.83, + "grad_norm": 0.5395123362541199, + "learning_rate": 0.0004936743463664119, + "loss": 3.1445, + "step": 16933 + }, + { + "epoch": 0.83, + "grad_norm": 0.5444930791854858, + "learning_rate": 0.0004936625881689382, + "loss": 3.0873, + "step": 16934 + }, + { + "epoch": 0.83, + "grad_norm": 0.4984789490699768, + "learning_rate": 0.0004936508294613944, + "loss": 3.2639, + "step": 16935 + }, + { + "epoch": 0.83, + "grad_norm": 0.5275869965553284, + "learning_rate": 0.0004936390702438115, + "loss": 3.1615, + "step": 16936 + }, + { + "epoch": 0.83, + "grad_norm": 0.5192875266075134, + "learning_rate": 0.0004936273105162205, + "loss": 3.0108, + "step": 16937 + }, + { + "epoch": 0.83, + "grad_norm": 0.5731741189956665, + "learning_rate": 0.0004936155502786527, + "loss": 3.1182, + "step": 16938 + }, + { + "epoch": 0.83, + "grad_norm": 0.48318421840667725, + "learning_rate": 0.0004936037895311386, + "loss": 3.2587, + "step": 16939 + }, + { + "epoch": 0.83, + "grad_norm": 0.49875524640083313, + "learning_rate": 0.0004935920282737095, + "loss": 3.2617, + "step": 16940 + }, + { + "epoch": 0.83, + "grad_norm": 0.5293728709220886, + "learning_rate": 0.0004935802665063962, + "loss": 3.3205, + "step": 16941 + }, + { + "epoch": 0.83, + "grad_norm": 0.4939599335193634, + "learning_rate": 0.0004935685042292297, + "loss": 3.3076, + "step": 16942 + }, + { + "epoch": 0.83, + "grad_norm": 0.5058974027633667, + "learning_rate": 0.0004935567414422411, + "loss": 3.1232, + "step": 16943 + }, + { + "epoch": 0.83, + "grad_norm": 0.517693281173706, + "learning_rate": 0.0004935449781454612, + "loss": 3.3309, + "step": 16944 + }, + { + "epoch": 0.83, + "grad_norm": 0.5269827842712402, + "learning_rate": 0.0004935332143389212, + "loss": 3.2564, + "step": 16945 + }, + { + "epoch": 0.83, + "grad_norm": 0.5228087306022644, + "learning_rate": 0.0004935214500226518, + "loss": 3.284, + "step": 16946 + }, + { + "epoch": 0.83, + "grad_norm": 0.5041084885597229, + "learning_rate": 0.0004935096851966842, + "loss": 3.1787, + "step": 16947 + }, + { + "epoch": 0.83, + "grad_norm": 0.5007483959197998, + "learning_rate": 0.0004934979198610493, + "loss": 3.0264, + "step": 16948 + }, + { + "epoch": 0.83, + "grad_norm": 0.5107980370521545, + "learning_rate": 0.0004934861540157782, + "loss": 3.1986, + "step": 16949 + }, + { + "epoch": 0.83, + "grad_norm": 0.5581706166267395, + "learning_rate": 0.0004934743876609018, + "loss": 3.2615, + "step": 16950 + }, + { + "epoch": 0.83, + "grad_norm": 0.555671215057373, + "learning_rate": 0.000493462620796451, + "loss": 3.0465, + "step": 16951 + }, + { + "epoch": 0.83, + "grad_norm": 0.5258693695068359, + "learning_rate": 0.000493450853422457, + "loss": 3.0284, + "step": 16952 + }, + { + "epoch": 0.83, + "grad_norm": 0.5174445509910583, + "learning_rate": 0.0004934390855389506, + "loss": 3.0346, + "step": 16953 + }, + { + "epoch": 0.83, + "grad_norm": 0.5268626809120178, + "learning_rate": 0.000493427317145963, + "loss": 2.8945, + "step": 16954 + }, + { + "epoch": 0.83, + "grad_norm": 0.510155439376831, + "learning_rate": 0.0004934155482435249, + "loss": 3.211, + "step": 16955 + }, + { + "epoch": 0.83, + "grad_norm": 0.5453447699546814, + "learning_rate": 0.0004934037788316676, + "loss": 3.0665, + "step": 16956 + }, + { + "epoch": 0.83, + "grad_norm": 0.5393405556678772, + "learning_rate": 0.0004933920089104219, + "loss": 3.237, + "step": 16957 + }, + { + "epoch": 0.83, + "grad_norm": 0.5359326004981995, + "learning_rate": 0.000493380238479819, + "loss": 3.0703, + "step": 16958 + }, + { + "epoch": 0.83, + "grad_norm": 0.5360733270645142, + "learning_rate": 0.0004933684675398896, + "loss": 3.2203, + "step": 16959 + }, + { + "epoch": 0.83, + "grad_norm": 0.5361143946647644, + "learning_rate": 0.000493356696090665, + "loss": 3.1254, + "step": 16960 + }, + { + "epoch": 0.83, + "grad_norm": 0.5372664332389832, + "learning_rate": 0.000493344924132176, + "loss": 3.2338, + "step": 16961 + }, + { + "epoch": 0.83, + "grad_norm": 0.49514126777648926, + "learning_rate": 0.0004933331516644537, + "loss": 3.2379, + "step": 16962 + }, + { + "epoch": 0.83, + "grad_norm": 0.5488557815551758, + "learning_rate": 0.0004933213786875289, + "loss": 3.0711, + "step": 16963 + }, + { + "epoch": 0.83, + "grad_norm": 0.533857524394989, + "learning_rate": 0.0004933096052014331, + "loss": 3.0261, + "step": 16964 + }, + { + "epoch": 0.83, + "grad_norm": 0.5344963669776917, + "learning_rate": 0.0004932978312061969, + "loss": 3.234, + "step": 16965 + }, + { + "epoch": 0.83, + "grad_norm": 0.5355231761932373, + "learning_rate": 0.0004932860567018513, + "loss": 3.2416, + "step": 16966 + }, + { + "epoch": 0.83, + "grad_norm": 0.5180875062942505, + "learning_rate": 0.0004932742816884276, + "loss": 3.1729, + "step": 16967 + }, + { + "epoch": 0.83, + "grad_norm": 0.5027824640274048, + "learning_rate": 0.0004932625061659564, + "loss": 3.1361, + "step": 16968 + }, + { + "epoch": 0.83, + "grad_norm": 0.5158348083496094, + "learning_rate": 0.0004932507301344693, + "loss": 3.0643, + "step": 16969 + }, + { + "epoch": 0.83, + "grad_norm": 0.5437960028648376, + "learning_rate": 0.0004932389535939966, + "loss": 3.1696, + "step": 16970 + }, + { + "epoch": 0.83, + "grad_norm": 0.5309322476387024, + "learning_rate": 0.0004932271765445699, + "loss": 3.168, + "step": 16971 + }, + { + "epoch": 0.83, + "grad_norm": 0.5301249027252197, + "learning_rate": 0.00049321539898622, + "loss": 2.9829, + "step": 16972 + }, + { + "epoch": 0.83, + "grad_norm": 0.5049616098403931, + "learning_rate": 0.0004932036209189778, + "loss": 3.0103, + "step": 16973 + }, + { + "epoch": 0.83, + "grad_norm": 0.5803033709526062, + "learning_rate": 0.0004931918423428746, + "loss": 3.1533, + "step": 16974 + }, + { + "epoch": 0.83, + "grad_norm": 0.519544243812561, + "learning_rate": 0.0004931800632579412, + "loss": 3.3152, + "step": 16975 + }, + { + "epoch": 0.83, + "grad_norm": 0.5230675935745239, + "learning_rate": 0.0004931682836642088, + "loss": 3.3468, + "step": 16976 + }, + { + "epoch": 0.83, + "grad_norm": 0.5146074295043945, + "learning_rate": 0.0004931565035617081, + "loss": 3.1737, + "step": 16977 + }, + { + "epoch": 0.83, + "grad_norm": 0.5033275485038757, + "learning_rate": 0.0004931447229504705, + "loss": 3.1468, + "step": 16978 + }, + { + "epoch": 0.83, + "grad_norm": 0.5531938076019287, + "learning_rate": 0.0004931329418305267, + "loss": 3.1175, + "step": 16979 + }, + { + "epoch": 0.83, + "grad_norm": 0.5533403754234314, + "learning_rate": 0.0004931211602019082, + "loss": 3.017, + "step": 16980 + }, + { + "epoch": 0.83, + "grad_norm": 0.5449845194816589, + "learning_rate": 0.0004931093780646455, + "loss": 3.333, + "step": 16981 + }, + { + "epoch": 0.83, + "grad_norm": 0.4969363808631897, + "learning_rate": 0.0004930975954187699, + "loss": 3.1212, + "step": 16982 + }, + { + "epoch": 0.83, + "grad_norm": 0.5029790997505188, + "learning_rate": 0.0004930858122643124, + "loss": 3.006, + "step": 16983 + }, + { + "epoch": 0.83, + "grad_norm": 0.47780704498291016, + "learning_rate": 0.0004930740286013041, + "loss": 3.2491, + "step": 16984 + }, + { + "epoch": 0.83, + "grad_norm": 0.544589102268219, + "learning_rate": 0.0004930622444297758, + "loss": 3.2378, + "step": 16985 + }, + { + "epoch": 0.83, + "grad_norm": 0.5521456003189087, + "learning_rate": 0.0004930504597497589, + "loss": 3.331, + "step": 16986 + }, + { + "epoch": 0.83, + "grad_norm": 0.5297691822052002, + "learning_rate": 0.0004930386745612841, + "loss": 3.2069, + "step": 16987 + }, + { + "epoch": 0.83, + "grad_norm": 0.5114834308624268, + "learning_rate": 0.0004930268888643827, + "loss": 3.2844, + "step": 16988 + }, + { + "epoch": 0.83, + "grad_norm": 0.5340949296951294, + "learning_rate": 0.0004930151026590855, + "loss": 2.9535, + "step": 16989 + }, + { + "epoch": 0.83, + "grad_norm": 0.5658579468727112, + "learning_rate": 0.0004930033159454237, + "loss": 3.1912, + "step": 16990 + }, + { + "epoch": 0.83, + "grad_norm": 0.5223037004470825, + "learning_rate": 0.0004929915287234283, + "loss": 3.1753, + "step": 16991 + }, + { + "epoch": 0.83, + "grad_norm": 0.5227300524711609, + "learning_rate": 0.0004929797409931305, + "loss": 3.1405, + "step": 16992 + }, + { + "epoch": 0.83, + "grad_norm": 0.5639767646789551, + "learning_rate": 0.0004929679527545611, + "loss": 3.0427, + "step": 16993 + }, + { + "epoch": 0.83, + "grad_norm": 0.5049862265586853, + "learning_rate": 0.0004929561640077512, + "loss": 3.2142, + "step": 16994 + }, + { + "epoch": 0.83, + "grad_norm": 0.5709805488586426, + "learning_rate": 0.0004929443747527319, + "loss": 3.0873, + "step": 16995 + }, + { + "epoch": 0.83, + "grad_norm": 0.5158815383911133, + "learning_rate": 0.0004929325849895344, + "loss": 3.2739, + "step": 16996 + }, + { + "epoch": 0.83, + "grad_norm": 0.5977637767791748, + "learning_rate": 0.0004929207947181895, + "loss": 3.2, + "step": 16997 + }, + { + "epoch": 0.83, + "grad_norm": 0.5404507517814636, + "learning_rate": 0.0004929090039387283, + "loss": 3.1116, + "step": 16998 + }, + { + "epoch": 0.83, + "grad_norm": 0.5217320919036865, + "learning_rate": 0.0004928972126511819, + "loss": 3.1959, + "step": 16999 + }, + { + "epoch": 0.83, + "grad_norm": 0.5257030129432678, + "learning_rate": 0.0004928854208555815, + "loss": 3.105, + "step": 17000 + }, + { + "epoch": 0.83, + "grad_norm": 0.526759922504425, + "learning_rate": 0.0004928736285519579, + "loss": 3.2357, + "step": 17001 + }, + { + "epoch": 0.83, + "grad_norm": 0.5135772824287415, + "learning_rate": 0.0004928618357403424, + "loss": 3.1463, + "step": 17002 + }, + { + "epoch": 0.83, + "grad_norm": 0.5537254810333252, + "learning_rate": 0.0004928500424207658, + "loss": 3.2612, + "step": 17003 + }, + { + "epoch": 0.83, + "grad_norm": 0.5435295104980469, + "learning_rate": 0.0004928382485932594, + "loss": 2.9233, + "step": 17004 + }, + { + "epoch": 0.83, + "grad_norm": 0.5425717830657959, + "learning_rate": 0.0004928264542578541, + "loss": 3.1752, + "step": 17005 + }, + { + "epoch": 0.83, + "grad_norm": 0.5152084827423096, + "learning_rate": 0.000492814659414581, + "loss": 3.3909, + "step": 17006 + }, + { + "epoch": 0.83, + "grad_norm": 0.5382000207901001, + "learning_rate": 0.0004928028640634714, + "loss": 3.092, + "step": 17007 + }, + { + "epoch": 0.83, + "grad_norm": 0.5504263043403625, + "learning_rate": 0.000492791068204556, + "loss": 3.1998, + "step": 17008 + }, + { + "epoch": 0.83, + "grad_norm": 0.5138139128684998, + "learning_rate": 0.0004927792718378661, + "loss": 3.2068, + "step": 17009 + }, + { + "epoch": 0.83, + "grad_norm": 0.5308842062950134, + "learning_rate": 0.0004927674749634326, + "loss": 3.3046, + "step": 17010 + }, + { + "epoch": 0.83, + "grad_norm": 0.5355445146560669, + "learning_rate": 0.0004927556775812866, + "loss": 3.2819, + "step": 17011 + }, + { + "epoch": 0.83, + "grad_norm": 0.5167343020439148, + "learning_rate": 0.0004927438796914595, + "loss": 3.0078, + "step": 17012 + }, + { + "epoch": 0.83, + "grad_norm": 0.5470657348632812, + "learning_rate": 0.000492732081293982, + "loss": 3.1608, + "step": 17013 + }, + { + "epoch": 0.83, + "grad_norm": 0.5836805701255798, + "learning_rate": 0.0004927202823888853, + "loss": 3.1161, + "step": 17014 + }, + { + "epoch": 0.83, + "grad_norm": 0.558231770992279, + "learning_rate": 0.0004927084829762004, + "loss": 3.1451, + "step": 17015 + }, + { + "epoch": 0.83, + "grad_norm": 0.9510738849639893, + "learning_rate": 0.0004926966830559585, + "loss": 3.305, + "step": 17016 + }, + { + "epoch": 0.83, + "grad_norm": 0.5469083786010742, + "learning_rate": 0.0004926848826281907, + "loss": 3.3076, + "step": 17017 + }, + { + "epoch": 0.83, + "grad_norm": 0.530071496963501, + "learning_rate": 0.0004926730816929277, + "loss": 3.2547, + "step": 17018 + }, + { + "epoch": 0.83, + "grad_norm": 0.5168865919113159, + "learning_rate": 0.0004926612802502011, + "loss": 3.1445, + "step": 17019 + }, + { + "epoch": 0.83, + "grad_norm": 0.5375450849533081, + "learning_rate": 0.0004926494783000418, + "loss": 3.1627, + "step": 17020 + }, + { + "epoch": 0.83, + "grad_norm": 0.5115398168563843, + "learning_rate": 0.0004926376758424808, + "loss": 3.2687, + "step": 17021 + }, + { + "epoch": 0.83, + "grad_norm": 0.571524441242218, + "learning_rate": 0.0004926258728775492, + "loss": 3.0509, + "step": 17022 + }, + { + "epoch": 0.83, + "grad_norm": 0.5220298767089844, + "learning_rate": 0.0004926140694052782, + "loss": 3.0722, + "step": 17023 + }, + { + "epoch": 0.83, + "grad_norm": 0.5590171217918396, + "learning_rate": 0.0004926022654256989, + "loss": 3.2197, + "step": 17024 + }, + { + "epoch": 0.83, + "grad_norm": 0.5292062163352966, + "learning_rate": 0.0004925904609388421, + "loss": 3.2796, + "step": 17025 + }, + { + "epoch": 0.83, + "grad_norm": 0.4746219515800476, + "learning_rate": 0.0004925786559447391, + "loss": 3.2434, + "step": 17026 + }, + { + "epoch": 0.83, + "grad_norm": 0.5537968277931213, + "learning_rate": 0.0004925668504434211, + "loss": 3.0872, + "step": 17027 + }, + { + "epoch": 0.83, + "grad_norm": 0.5214465260505676, + "learning_rate": 0.0004925550444349191, + "loss": 3.0574, + "step": 17028 + }, + { + "epoch": 0.83, + "grad_norm": 0.532576858997345, + "learning_rate": 0.0004925432379192641, + "loss": 3.3728, + "step": 17029 + }, + { + "epoch": 0.83, + "grad_norm": 0.5130587816238403, + "learning_rate": 0.0004925314308964872, + "loss": 3.3297, + "step": 17030 + }, + { + "epoch": 0.83, + "grad_norm": 0.5362662076950073, + "learning_rate": 0.0004925196233666196, + "loss": 3.3526, + "step": 17031 + }, + { + "epoch": 0.83, + "grad_norm": 0.555117666721344, + "learning_rate": 0.0004925078153296924, + "loss": 3.2804, + "step": 17032 + }, + { + "epoch": 0.83, + "grad_norm": 0.5339804291725159, + "learning_rate": 0.0004924960067857367, + "loss": 3.2155, + "step": 17033 + }, + { + "epoch": 0.83, + "grad_norm": 0.5521584749221802, + "learning_rate": 0.0004924841977347835, + "loss": 3.2564, + "step": 17034 + }, + { + "epoch": 0.83, + "grad_norm": 0.6111563444137573, + "learning_rate": 0.0004924723881768639, + "loss": 2.9852, + "step": 17035 + }, + { + "epoch": 0.83, + "grad_norm": 0.5429837107658386, + "learning_rate": 0.0004924605781120092, + "loss": 3.0712, + "step": 17036 + }, + { + "epoch": 0.83, + "grad_norm": 0.5211032629013062, + "learning_rate": 0.0004924487675402504, + "loss": 3.0123, + "step": 17037 + }, + { + "epoch": 0.83, + "grad_norm": 0.5365507006645203, + "learning_rate": 0.0004924369564616185, + "loss": 3.0002, + "step": 17038 + }, + { + "epoch": 0.84, + "grad_norm": 0.5042828321456909, + "learning_rate": 0.0004924251448761446, + "loss": 3.0771, + "step": 17039 + }, + { + "epoch": 0.84, + "grad_norm": 0.5612789988517761, + "learning_rate": 0.0004924133327838601, + "loss": 3.0335, + "step": 17040 + }, + { + "epoch": 0.84, + "grad_norm": 0.5127493739128113, + "learning_rate": 0.0004924015201847958, + "loss": 3.1882, + "step": 17041 + }, + { + "epoch": 0.84, + "grad_norm": 0.5296911001205444, + "learning_rate": 0.000492389707078983, + "loss": 3.143, + "step": 17042 + }, + { + "epoch": 0.84, + "grad_norm": 0.5228750109672546, + "learning_rate": 0.0004923778934664526, + "loss": 3.1002, + "step": 17043 + }, + { + "epoch": 0.84, + "grad_norm": 0.5364986658096313, + "learning_rate": 0.000492366079347236, + "loss": 3.0465, + "step": 17044 + }, + { + "epoch": 0.84, + "grad_norm": 0.49486231803894043, + "learning_rate": 0.000492354264721364, + "loss": 3.1087, + "step": 17045 + }, + { + "epoch": 0.84, + "grad_norm": 0.5611582398414612, + "learning_rate": 0.0004923424495888681, + "loss": 3.0845, + "step": 17046 + }, + { + "epoch": 0.84, + "grad_norm": 0.6187002658843994, + "learning_rate": 0.0004923306339497791, + "loss": 3.0551, + "step": 17047 + }, + { + "epoch": 0.84, + "grad_norm": 0.5191326141357422, + "learning_rate": 0.0004923188178041282, + "loss": 3.0557, + "step": 17048 + }, + { + "epoch": 0.84, + "grad_norm": 0.5312925577163696, + "learning_rate": 0.0004923070011519466, + "loss": 3.4865, + "step": 17049 + }, + { + "epoch": 0.84, + "grad_norm": 0.5266924500465393, + "learning_rate": 0.0004922951839932653, + "loss": 3.2505, + "step": 17050 + }, + { + "epoch": 0.84, + "grad_norm": 0.5350109934806824, + "learning_rate": 0.0004922833663281156, + "loss": 3.3707, + "step": 17051 + }, + { + "epoch": 0.84, + "grad_norm": 0.4987470507621765, + "learning_rate": 0.0004922715481565284, + "loss": 3.332, + "step": 17052 + }, + { + "epoch": 0.84, + "grad_norm": 0.49026739597320557, + "learning_rate": 0.000492259729478535, + "loss": 3.1229, + "step": 17053 + }, + { + "epoch": 0.84, + "grad_norm": 0.5140583515167236, + "learning_rate": 0.0004922479102941665, + "loss": 3.2348, + "step": 17054 + }, + { + "epoch": 0.84, + "grad_norm": 0.5144451260566711, + "learning_rate": 0.0004922360906034538, + "loss": 3.0013, + "step": 17055 + }, + { + "epoch": 0.84, + "grad_norm": 0.5055959820747375, + "learning_rate": 0.0004922242704064285, + "loss": 3.2636, + "step": 17056 + }, + { + "epoch": 0.84, + "grad_norm": 0.5427704453468323, + "learning_rate": 0.0004922124497031214, + "loss": 3.36, + "step": 17057 + }, + { + "epoch": 0.84, + "grad_norm": 0.5104376673698425, + "learning_rate": 0.0004922006284935637, + "loss": 3.3998, + "step": 17058 + }, + { + "epoch": 0.84, + "grad_norm": 0.5442894697189331, + "learning_rate": 0.0004921888067777865, + "loss": 3.113, + "step": 17059 + }, + { + "epoch": 0.84, + "grad_norm": 0.5200850963592529, + "learning_rate": 0.000492176984555821, + "loss": 3.0532, + "step": 17060 + }, + { + "epoch": 0.84, + "grad_norm": 0.4936150312423706, + "learning_rate": 0.0004921651618276982, + "loss": 3.0992, + "step": 17061 + }, + { + "epoch": 0.84, + "grad_norm": 0.49831634759902954, + "learning_rate": 0.0004921533385934495, + "loss": 3.2988, + "step": 17062 + }, + { + "epoch": 0.84, + "grad_norm": 0.5185843706130981, + "learning_rate": 0.0004921415148531058, + "loss": 3.1793, + "step": 17063 + }, + { + "epoch": 0.84, + "grad_norm": 0.5367192029953003, + "learning_rate": 0.0004921296906066984, + "loss": 3.2338, + "step": 17064 + }, + { + "epoch": 0.84, + "grad_norm": 0.5617893934249878, + "learning_rate": 0.0004921178658542582, + "loss": 3.0399, + "step": 17065 + }, + { + "epoch": 0.84, + "grad_norm": 0.4962318241596222, + "learning_rate": 0.0004921060405958167, + "loss": 3.1821, + "step": 17066 + }, + { + "epoch": 0.84, + "grad_norm": 0.5281431674957275, + "learning_rate": 0.0004920942148314047, + "loss": 3.2091, + "step": 17067 + }, + { + "epoch": 0.84, + "grad_norm": 0.5583265423774719, + "learning_rate": 0.0004920823885610537, + "loss": 3.1087, + "step": 17068 + }, + { + "epoch": 0.84, + "grad_norm": 0.5328853726387024, + "learning_rate": 0.0004920705617847945, + "loss": 3.2278, + "step": 17069 + }, + { + "epoch": 0.84, + "grad_norm": 0.5027572512626648, + "learning_rate": 0.0004920587345026585, + "loss": 3.1542, + "step": 17070 + }, + { + "epoch": 0.84, + "grad_norm": 0.6189442873001099, + "learning_rate": 0.0004920469067146768, + "loss": 3.259, + "step": 17071 + }, + { + "epoch": 0.84, + "grad_norm": 0.5087341666221619, + "learning_rate": 0.0004920350784208803, + "loss": 2.9372, + "step": 17072 + }, + { + "epoch": 0.84, + "grad_norm": 0.5113468170166016, + "learning_rate": 0.0004920232496213005, + "loss": 3.241, + "step": 17073 + }, + { + "epoch": 0.84, + "grad_norm": 0.5053597688674927, + "learning_rate": 0.0004920114203159683, + "loss": 3.2242, + "step": 17074 + }, + { + "epoch": 0.84, + "grad_norm": 0.5626222491264343, + "learning_rate": 0.0004919995905049153, + "loss": 3.0204, + "step": 17075 + }, + { + "epoch": 0.84, + "grad_norm": 0.5104367136955261, + "learning_rate": 0.000491987760188172, + "loss": 3.0962, + "step": 17076 + }, + { + "epoch": 0.84, + "grad_norm": 0.507770299911499, + "learning_rate": 0.00049197592936577, + "loss": 3.3188, + "step": 17077 + }, + { + "epoch": 0.84, + "grad_norm": 0.5115891695022583, + "learning_rate": 0.0004919640980377404, + "loss": 2.9712, + "step": 17078 + }, + { + "epoch": 0.84, + "grad_norm": 0.488614946603775, + "learning_rate": 0.0004919522662041141, + "loss": 3.3266, + "step": 17079 + }, + { + "epoch": 0.84, + "grad_norm": 0.47894060611724854, + "learning_rate": 0.0004919404338649227, + "loss": 3.1626, + "step": 17080 + }, + { + "epoch": 0.84, + "grad_norm": 0.5004280805587769, + "learning_rate": 0.000491928601020197, + "loss": 3.016, + "step": 17081 + }, + { + "epoch": 0.84, + "grad_norm": 0.5228533148765564, + "learning_rate": 0.0004919167676699684, + "loss": 3.4334, + "step": 17082 + }, + { + "epoch": 0.84, + "grad_norm": 0.5518790483474731, + "learning_rate": 0.0004919049338142679, + "loss": 3.1275, + "step": 17083 + }, + { + "epoch": 0.84, + "grad_norm": 0.49410319328308105, + "learning_rate": 0.0004918930994531268, + "loss": 2.9918, + "step": 17084 + }, + { + "epoch": 0.84, + "grad_norm": 0.4980878233909607, + "learning_rate": 0.0004918812645865762, + "loss": 2.9434, + "step": 17085 + }, + { + "epoch": 0.84, + "grad_norm": 0.5068234801292419, + "learning_rate": 0.0004918694292146473, + "loss": 3.3573, + "step": 17086 + }, + { + "epoch": 0.84, + "grad_norm": 0.5471787452697754, + "learning_rate": 0.000491857593337371, + "loss": 3.1763, + "step": 17087 + }, + { + "epoch": 0.84, + "grad_norm": 0.519198477268219, + "learning_rate": 0.000491845756954779, + "loss": 3.2118, + "step": 17088 + }, + { + "epoch": 0.84, + "grad_norm": 0.5100758075714111, + "learning_rate": 0.0004918339200669021, + "loss": 3.3549, + "step": 17089 + }, + { + "epoch": 0.84, + "grad_norm": 0.525219202041626, + "learning_rate": 0.0004918220826737717, + "loss": 3.1919, + "step": 17090 + }, + { + "epoch": 0.84, + "grad_norm": 0.5719212889671326, + "learning_rate": 0.0004918102447754187, + "loss": 3.0999, + "step": 17091 + }, + { + "epoch": 0.84, + "grad_norm": 0.5068230032920837, + "learning_rate": 0.0004917984063718745, + "loss": 3.3169, + "step": 17092 + }, + { + "epoch": 0.84, + "grad_norm": 0.5205427408218384, + "learning_rate": 0.0004917865674631702, + "loss": 3.2123, + "step": 17093 + }, + { + "epoch": 0.84, + "grad_norm": 0.5136102437973022, + "learning_rate": 0.0004917747280493369, + "loss": 3.4344, + "step": 17094 + }, + { + "epoch": 0.84, + "grad_norm": 0.5310423374176025, + "learning_rate": 0.0004917628881304061, + "loss": 3.081, + "step": 17095 + }, + { + "epoch": 0.84, + "grad_norm": 0.5061217546463013, + "learning_rate": 0.0004917510477064085, + "loss": 2.9725, + "step": 17096 + }, + { + "epoch": 0.84, + "grad_norm": 0.5577815175056458, + "learning_rate": 0.0004917392067773756, + "loss": 3.2315, + "step": 17097 + }, + { + "epoch": 0.84, + "grad_norm": 0.5072141289710999, + "learning_rate": 0.0004917273653433388, + "loss": 3.2273, + "step": 17098 + }, + { + "epoch": 0.84, + "grad_norm": 0.5190002918243408, + "learning_rate": 0.0004917155234043287, + "loss": 3.1358, + "step": 17099 + }, + { + "epoch": 0.84, + "grad_norm": 0.49393171072006226, + "learning_rate": 0.000491703680960377, + "loss": 3.1898, + "step": 17100 + }, + { + "epoch": 0.84, + "grad_norm": 0.49661365151405334, + "learning_rate": 0.0004916918380115145, + "loss": 3.1599, + "step": 17101 + }, + { + "epoch": 0.84, + "grad_norm": 0.524917721748352, + "learning_rate": 0.0004916799945577727, + "loss": 3.2517, + "step": 17102 + }, + { + "epoch": 0.84, + "grad_norm": 0.5378203988075256, + "learning_rate": 0.0004916681505991827, + "loss": 3.2473, + "step": 17103 + }, + { + "epoch": 0.84, + "grad_norm": 0.5326823592185974, + "learning_rate": 0.0004916563061357756, + "loss": 3.1216, + "step": 17104 + }, + { + "epoch": 0.84, + "grad_norm": 0.5417553782463074, + "learning_rate": 0.0004916444611675828, + "loss": 3.2, + "step": 17105 + }, + { + "epoch": 0.84, + "grad_norm": 0.5240073204040527, + "learning_rate": 0.0004916326156946353, + "loss": 3.0368, + "step": 17106 + }, + { + "epoch": 0.84, + "grad_norm": 0.5561971664428711, + "learning_rate": 0.0004916207697169643, + "loss": 3.2357, + "step": 17107 + }, + { + "epoch": 0.84, + "grad_norm": 0.5211239457130432, + "learning_rate": 0.0004916089232346012, + "loss": 3.2528, + "step": 17108 + }, + { + "epoch": 0.84, + "grad_norm": 0.4768798053264618, + "learning_rate": 0.000491597076247577, + "loss": 3.3263, + "step": 17109 + }, + { + "epoch": 0.84, + "grad_norm": 0.4900779128074646, + "learning_rate": 0.0004915852287559231, + "loss": 3.2368, + "step": 17110 + }, + { + "epoch": 0.84, + "grad_norm": 0.5443329811096191, + "learning_rate": 0.0004915733807596705, + "loss": 2.9507, + "step": 17111 + }, + { + "epoch": 0.84, + "grad_norm": 0.5072310566902161, + "learning_rate": 0.0004915615322588503, + "loss": 3.0643, + "step": 17112 + }, + { + "epoch": 0.84, + "grad_norm": 0.5106002688407898, + "learning_rate": 0.0004915496832534941, + "loss": 3.0003, + "step": 17113 + }, + { + "epoch": 0.84, + "grad_norm": 0.5489236116409302, + "learning_rate": 0.0004915378337436328, + "loss": 3.0268, + "step": 17114 + }, + { + "epoch": 0.84, + "grad_norm": 0.48166120052337646, + "learning_rate": 0.0004915259837292978, + "loss": 3.3047, + "step": 17115 + }, + { + "epoch": 0.84, + "grad_norm": 0.5533964037895203, + "learning_rate": 0.0004915141332105203, + "loss": 3.2744, + "step": 17116 + }, + { + "epoch": 0.84, + "grad_norm": 0.5538474321365356, + "learning_rate": 0.0004915022821873311, + "loss": 3.2312, + "step": 17117 + }, + { + "epoch": 0.84, + "grad_norm": 0.5147866010665894, + "learning_rate": 0.0004914904306597621, + "loss": 2.9823, + "step": 17118 + }, + { + "epoch": 0.84, + "grad_norm": 0.5750763416290283, + "learning_rate": 0.0004914785786278439, + "loss": 3.129, + "step": 17119 + }, + { + "epoch": 0.84, + "grad_norm": 0.5425081253051758, + "learning_rate": 0.0004914667260916081, + "loss": 3.2427, + "step": 17120 + }, + { + "epoch": 0.84, + "grad_norm": 0.5620341897010803, + "learning_rate": 0.0004914548730510859, + "loss": 3.1747, + "step": 17121 + }, + { + "epoch": 0.84, + "grad_norm": 0.5213549137115479, + "learning_rate": 0.0004914430195063083, + "loss": 3.005, + "step": 17122 + }, + { + "epoch": 0.84, + "grad_norm": 0.5401297807693481, + "learning_rate": 0.0004914311654573066, + "loss": 3.2856, + "step": 17123 + }, + { + "epoch": 0.84, + "grad_norm": 0.5165397524833679, + "learning_rate": 0.000491419310904112, + "loss": 3.1327, + "step": 17124 + }, + { + "epoch": 0.84, + "grad_norm": 0.5052899718284607, + "learning_rate": 0.0004914074558467559, + "loss": 3.3247, + "step": 17125 + }, + { + "epoch": 0.84, + "grad_norm": 0.5615530014038086, + "learning_rate": 0.0004913956002852693, + "loss": 3.2895, + "step": 17126 + }, + { + "epoch": 0.84, + "grad_norm": 0.5121963024139404, + "learning_rate": 0.0004913837442196836, + "loss": 3.0851, + "step": 17127 + }, + { + "epoch": 0.84, + "grad_norm": 0.5730823278427124, + "learning_rate": 0.0004913718876500299, + "loss": 3.1933, + "step": 17128 + }, + { + "epoch": 0.84, + "grad_norm": 0.5356817245483398, + "learning_rate": 0.0004913600305763396, + "loss": 3.3342, + "step": 17129 + }, + { + "epoch": 0.84, + "grad_norm": 0.5603740811347961, + "learning_rate": 0.0004913481729986436, + "loss": 3.223, + "step": 17130 + }, + { + "epoch": 0.84, + "grad_norm": 0.6096175312995911, + "learning_rate": 0.0004913363149169735, + "loss": 3.1724, + "step": 17131 + }, + { + "epoch": 0.84, + "grad_norm": 0.5367498397827148, + "learning_rate": 0.0004913244563313602, + "loss": 3.297, + "step": 17132 + }, + { + "epoch": 0.84, + "grad_norm": 0.4697509706020355, + "learning_rate": 0.0004913125972418352, + "loss": 3.3864, + "step": 17133 + }, + { + "epoch": 0.84, + "grad_norm": 0.5341188907623291, + "learning_rate": 0.0004913007376484297, + "loss": 3.1566, + "step": 17134 + }, + { + "epoch": 0.84, + "grad_norm": 0.5422013401985168, + "learning_rate": 0.0004912888775511748, + "loss": 2.9892, + "step": 17135 + }, + { + "epoch": 0.84, + "grad_norm": 0.5414844155311584, + "learning_rate": 0.0004912770169501018, + "loss": 3.0971, + "step": 17136 + }, + { + "epoch": 0.84, + "grad_norm": 0.5191177129745483, + "learning_rate": 0.000491265155845242, + "loss": 3.2724, + "step": 17137 + }, + { + "epoch": 0.84, + "grad_norm": 0.541873037815094, + "learning_rate": 0.0004912532942366266, + "loss": 3.1036, + "step": 17138 + }, + { + "epoch": 0.84, + "grad_norm": 0.5198567509651184, + "learning_rate": 0.0004912414321242869, + "loss": 3.0269, + "step": 17139 + }, + { + "epoch": 0.84, + "grad_norm": 0.5153297781944275, + "learning_rate": 0.0004912295695082539, + "loss": 2.9057, + "step": 17140 + }, + { + "epoch": 0.84, + "grad_norm": 0.5370156764984131, + "learning_rate": 0.0004912177063885591, + "loss": 3.333, + "step": 17141 + }, + { + "epoch": 0.84, + "grad_norm": 0.5156940817832947, + "learning_rate": 0.0004912058427652337, + "loss": 3.1685, + "step": 17142 + }, + { + "epoch": 0.84, + "grad_norm": 0.558294415473938, + "learning_rate": 0.0004911939786383089, + "loss": 3.0098, + "step": 17143 + }, + { + "epoch": 0.84, + "grad_norm": 0.5128892064094543, + "learning_rate": 0.0004911821140078158, + "loss": 3.2013, + "step": 17144 + }, + { + "epoch": 0.84, + "grad_norm": 0.5227497816085815, + "learning_rate": 0.0004911702488737859, + "loss": 3.3653, + "step": 17145 + }, + { + "epoch": 0.84, + "grad_norm": 0.5522597432136536, + "learning_rate": 0.0004911583832362504, + "loss": 3.0272, + "step": 17146 + }, + { + "epoch": 0.84, + "grad_norm": 0.5611497759819031, + "learning_rate": 0.0004911465170952405, + "loss": 3.3738, + "step": 17147 + }, + { + "epoch": 0.84, + "grad_norm": 0.5275722742080688, + "learning_rate": 0.0004911346504507874, + "loss": 3.2767, + "step": 17148 + }, + { + "epoch": 0.84, + "grad_norm": 0.5308437347412109, + "learning_rate": 0.0004911227833029225, + "loss": 3.2512, + "step": 17149 + }, + { + "epoch": 0.84, + "grad_norm": 0.5144796371459961, + "learning_rate": 0.000491110915651677, + "loss": 3.2331, + "step": 17150 + }, + { + "epoch": 0.84, + "grad_norm": 0.5319437384605408, + "learning_rate": 0.0004910990474970821, + "loss": 3.0634, + "step": 17151 + }, + { + "epoch": 0.84, + "grad_norm": 0.5378063917160034, + "learning_rate": 0.000491087178839169, + "loss": 3.1613, + "step": 17152 + }, + { + "epoch": 0.84, + "grad_norm": 0.4878802001476288, + "learning_rate": 0.000491075309677969, + "loss": 3.1845, + "step": 17153 + }, + { + "epoch": 0.84, + "grad_norm": 0.5802315473556519, + "learning_rate": 0.0004910634400135136, + "loss": 3.2188, + "step": 17154 + }, + { + "epoch": 0.84, + "grad_norm": 0.5172027945518494, + "learning_rate": 0.0004910515698458338, + "loss": 3.0371, + "step": 17155 + }, + { + "epoch": 0.84, + "grad_norm": 0.5362008213996887, + "learning_rate": 0.0004910396991749608, + "loss": 3.1302, + "step": 17156 + }, + { + "epoch": 0.84, + "grad_norm": 0.50388503074646, + "learning_rate": 0.0004910278280009263, + "loss": 3.1382, + "step": 17157 + }, + { + "epoch": 0.84, + "grad_norm": 0.5768753886222839, + "learning_rate": 0.0004910159563237609, + "loss": 3.0569, + "step": 17158 + }, + { + "epoch": 0.84, + "grad_norm": 0.5347177386283875, + "learning_rate": 0.0004910040841434964, + "loss": 3.1319, + "step": 17159 + }, + { + "epoch": 0.84, + "grad_norm": 0.531885027885437, + "learning_rate": 0.0004909922114601639, + "loss": 3.4059, + "step": 17160 + }, + { + "epoch": 0.84, + "grad_norm": 0.5703318119049072, + "learning_rate": 0.0004909803382737947, + "loss": 3.1653, + "step": 17161 + }, + { + "epoch": 0.84, + "grad_norm": 0.5395308136940002, + "learning_rate": 0.0004909684645844201, + "loss": 3.2048, + "step": 17162 + }, + { + "epoch": 0.84, + "grad_norm": 0.5464442372322083, + "learning_rate": 0.0004909565903920711, + "loss": 2.963, + "step": 17163 + }, + { + "epoch": 0.84, + "grad_norm": 0.5010871887207031, + "learning_rate": 0.0004909447156967794, + "loss": 3.3017, + "step": 17164 + }, + { + "epoch": 0.84, + "grad_norm": 0.5098520517349243, + "learning_rate": 0.000490932840498576, + "loss": 3.076, + "step": 17165 + }, + { + "epoch": 0.84, + "grad_norm": 0.6024291515350342, + "learning_rate": 0.0004909209647974923, + "loss": 3.2139, + "step": 17166 + }, + { + "epoch": 0.84, + "grad_norm": 0.521047830581665, + "learning_rate": 0.0004909090885935594, + "loss": 3.1004, + "step": 17167 + }, + { + "epoch": 0.84, + "grad_norm": 0.534773051738739, + "learning_rate": 0.0004908972118868088, + "loss": 3.2295, + "step": 17168 + }, + { + "epoch": 0.84, + "grad_norm": 0.5155094265937805, + "learning_rate": 0.0004908853346772716, + "loss": 3.1153, + "step": 17169 + }, + { + "epoch": 0.84, + "grad_norm": 0.513957142829895, + "learning_rate": 0.0004908734569649793, + "loss": 3.0931, + "step": 17170 + }, + { + "epoch": 0.84, + "grad_norm": 0.5340112447738647, + "learning_rate": 0.000490861578749963, + "loss": 3.0543, + "step": 17171 + }, + { + "epoch": 0.84, + "grad_norm": 0.5084630250930786, + "learning_rate": 0.000490849700032254, + "loss": 3.323, + "step": 17172 + }, + { + "epoch": 0.84, + "grad_norm": 0.5156233310699463, + "learning_rate": 0.0004908378208118836, + "loss": 3.2439, + "step": 17173 + }, + { + "epoch": 0.84, + "grad_norm": 0.5381580591201782, + "learning_rate": 0.0004908259410888831, + "loss": 3.0843, + "step": 17174 + }, + { + "epoch": 0.84, + "grad_norm": 0.5627414584159851, + "learning_rate": 0.0004908140608632838, + "loss": 2.9431, + "step": 17175 + }, + { + "epoch": 0.84, + "grad_norm": 0.5171828866004944, + "learning_rate": 0.000490802180135117, + "loss": 3.2593, + "step": 17176 + }, + { + "epoch": 0.84, + "grad_norm": 0.5454464554786682, + "learning_rate": 0.000490790298904414, + "loss": 3.1044, + "step": 17177 + }, + { + "epoch": 0.84, + "grad_norm": 0.4904737174510956, + "learning_rate": 0.0004907784171712061, + "loss": 3.242, + "step": 17178 + }, + { + "epoch": 0.84, + "grad_norm": 0.5514205694198608, + "learning_rate": 0.0004907665349355245, + "loss": 3.2227, + "step": 17179 + }, + { + "epoch": 0.84, + "grad_norm": 0.49019309878349304, + "learning_rate": 0.0004907546521974006, + "loss": 3.1571, + "step": 17180 + }, + { + "epoch": 0.84, + "grad_norm": 0.552192747592926, + "learning_rate": 0.0004907427689568656, + "loss": 3.09, + "step": 17181 + }, + { + "epoch": 0.84, + "grad_norm": 0.5478540658950806, + "learning_rate": 0.0004907308852139508, + "loss": 3.1038, + "step": 17182 + }, + { + "epoch": 0.84, + "grad_norm": 0.5257436633110046, + "learning_rate": 0.0004907190009686878, + "loss": 3.4212, + "step": 17183 + }, + { + "epoch": 0.84, + "grad_norm": 0.5158893465995789, + "learning_rate": 0.0004907071162211074, + "loss": 3.288, + "step": 17184 + }, + { + "epoch": 0.84, + "grad_norm": 0.4981338083744049, + "learning_rate": 0.0004906952309712413, + "loss": 3.2846, + "step": 17185 + }, + { + "epoch": 0.84, + "grad_norm": 0.5687278509140015, + "learning_rate": 0.0004906833452191207, + "loss": 3.3939, + "step": 17186 + }, + { + "epoch": 0.84, + "grad_norm": 0.5252379179000854, + "learning_rate": 0.0004906714589647767, + "loss": 3.3868, + "step": 17187 + }, + { + "epoch": 0.84, + "grad_norm": 0.5520195364952087, + "learning_rate": 0.0004906595722082409, + "loss": 3.2195, + "step": 17188 + }, + { + "epoch": 0.84, + "grad_norm": 0.5275287628173828, + "learning_rate": 0.0004906476849495444, + "loss": 3.3285, + "step": 17189 + }, + { + "epoch": 0.84, + "grad_norm": 0.5144610404968262, + "learning_rate": 0.0004906357971887187, + "loss": 3.1138, + "step": 17190 + }, + { + "epoch": 0.84, + "grad_norm": 0.5572524070739746, + "learning_rate": 0.0004906239089257949, + "loss": 3.0022, + "step": 17191 + }, + { + "epoch": 0.84, + "grad_norm": 0.5245261192321777, + "learning_rate": 0.0004906120201608044, + "loss": 3.1331, + "step": 17192 + }, + { + "epoch": 0.84, + "grad_norm": 0.506518542766571, + "learning_rate": 0.0004906001308937787, + "loss": 3.3921, + "step": 17193 + }, + { + "epoch": 0.84, + "grad_norm": 0.4986775517463684, + "learning_rate": 0.0004905882411247487, + "loss": 3.1723, + "step": 17194 + }, + { + "epoch": 0.84, + "grad_norm": 0.5022253394126892, + "learning_rate": 0.0004905763508537461, + "loss": 3.1883, + "step": 17195 + }, + { + "epoch": 0.84, + "grad_norm": 0.5980327129364014, + "learning_rate": 0.000490564460080802, + "loss": 3.1204, + "step": 17196 + }, + { + "epoch": 0.84, + "grad_norm": 0.5253124833106995, + "learning_rate": 0.0004905525688059479, + "loss": 3.2769, + "step": 17197 + }, + { + "epoch": 0.84, + "grad_norm": 0.5250275135040283, + "learning_rate": 0.0004905406770292148, + "loss": 3.0089, + "step": 17198 + }, + { + "epoch": 0.84, + "grad_norm": 0.5560047626495361, + "learning_rate": 0.0004905287847506343, + "loss": 3.1203, + "step": 17199 + }, + { + "epoch": 0.84, + "grad_norm": 0.5059837102890015, + "learning_rate": 0.0004905168919702378, + "loss": 3.0903, + "step": 17200 + }, + { + "epoch": 0.84, + "grad_norm": 0.530164361000061, + "learning_rate": 0.0004905049986880563, + "loss": 3.2861, + "step": 17201 + }, + { + "epoch": 0.84, + "grad_norm": 0.5831915736198425, + "learning_rate": 0.0004904931049041214, + "loss": 3.202, + "step": 17202 + }, + { + "epoch": 0.84, + "grad_norm": 0.546757698059082, + "learning_rate": 0.0004904812106184643, + "loss": 3.1316, + "step": 17203 + }, + { + "epoch": 0.84, + "grad_norm": 0.49275583028793335, + "learning_rate": 0.0004904693158311162, + "loss": 3.0704, + "step": 17204 + }, + { + "epoch": 0.84, + "grad_norm": 0.5256943106651306, + "learning_rate": 0.0004904574205421089, + "loss": 3.0593, + "step": 17205 + }, + { + "epoch": 0.84, + "grad_norm": 0.542165219783783, + "learning_rate": 0.0004904455247514731, + "loss": 3.391, + "step": 17206 + }, + { + "epoch": 0.84, + "grad_norm": 0.5653906464576721, + "learning_rate": 0.0004904336284592407, + "loss": 3.2863, + "step": 17207 + }, + { + "epoch": 0.84, + "grad_norm": 0.5180942416191101, + "learning_rate": 0.0004904217316654425, + "loss": 3.1246, + "step": 17208 + }, + { + "epoch": 0.84, + "grad_norm": 0.49898773431777954, + "learning_rate": 0.0004904098343701102, + "loss": 3.1719, + "step": 17209 + }, + { + "epoch": 0.84, + "grad_norm": 0.531066358089447, + "learning_rate": 0.0004903979365732753, + "loss": 3.3398, + "step": 17210 + }, + { + "epoch": 0.84, + "grad_norm": 0.4966711103916168, + "learning_rate": 0.0004903860382749686, + "loss": 3.0826, + "step": 17211 + }, + { + "epoch": 0.84, + "grad_norm": 0.5284520983695984, + "learning_rate": 0.0004903741394752218, + "loss": 3.2555, + "step": 17212 + }, + { + "epoch": 0.84, + "grad_norm": 0.5519339442253113, + "learning_rate": 0.000490362240174066, + "loss": 3.0804, + "step": 17213 + }, + { + "epoch": 0.84, + "grad_norm": 0.5141843557357788, + "learning_rate": 0.0004903503403715329, + "loss": 3.2965, + "step": 17214 + }, + { + "epoch": 0.84, + "grad_norm": 0.5470436215400696, + "learning_rate": 0.0004903384400676535, + "loss": 3.0265, + "step": 17215 + }, + { + "epoch": 0.84, + "grad_norm": 0.5789403319358826, + "learning_rate": 0.0004903265392624594, + "loss": 3.0417, + "step": 17216 + }, + { + "epoch": 0.84, + "grad_norm": 0.515338659286499, + "learning_rate": 0.0004903146379559818, + "loss": 3.0671, + "step": 17217 + }, + { + "epoch": 0.84, + "grad_norm": 0.5383449196815491, + "learning_rate": 0.000490302736148252, + "loss": 3.0563, + "step": 17218 + }, + { + "epoch": 0.84, + "grad_norm": 0.5087341666221619, + "learning_rate": 0.0004902908338393014, + "loss": 3.3356, + "step": 17219 + }, + { + "epoch": 0.84, + "grad_norm": 0.5621129870414734, + "learning_rate": 0.0004902789310291615, + "loss": 3.1718, + "step": 17220 + }, + { + "epoch": 0.84, + "grad_norm": 0.49984487891197205, + "learning_rate": 0.0004902670277178634, + "loss": 3.2502, + "step": 17221 + }, + { + "epoch": 0.84, + "grad_norm": 0.6184155941009521, + "learning_rate": 0.0004902551239054386, + "loss": 3.1425, + "step": 17222 + }, + { + "epoch": 0.84, + "grad_norm": 0.5024511218070984, + "learning_rate": 0.0004902432195919184, + "loss": 3.2221, + "step": 17223 + }, + { + "epoch": 0.84, + "grad_norm": 0.5568563938140869, + "learning_rate": 0.0004902313147773342, + "loss": 2.9693, + "step": 17224 + }, + { + "epoch": 0.84, + "grad_norm": 0.5303897857666016, + "learning_rate": 0.0004902194094617174, + "loss": 3.2374, + "step": 17225 + }, + { + "epoch": 0.84, + "grad_norm": 0.5185246467590332, + "learning_rate": 0.0004902075036450992, + "loss": 3.2034, + "step": 17226 + }, + { + "epoch": 0.84, + "grad_norm": 0.5359919667243958, + "learning_rate": 0.000490195597327511, + "loss": 2.8955, + "step": 17227 + }, + { + "epoch": 0.84, + "grad_norm": 0.530653178691864, + "learning_rate": 0.0004901836905089842, + "loss": 3.2935, + "step": 17228 + }, + { + "epoch": 0.84, + "grad_norm": 0.5164836645126343, + "learning_rate": 0.0004901717831895503, + "loss": 3.229, + "step": 17229 + }, + { + "epoch": 0.84, + "grad_norm": 0.5706772804260254, + "learning_rate": 0.0004901598753692403, + "loss": 3.1432, + "step": 17230 + }, + { + "epoch": 0.84, + "grad_norm": 0.5535086393356323, + "learning_rate": 0.0004901479670480859, + "loss": 3.0893, + "step": 17231 + }, + { + "epoch": 0.84, + "grad_norm": 0.5143651366233826, + "learning_rate": 0.0004901360582261185, + "loss": 3.1783, + "step": 17232 + }, + { + "epoch": 0.84, + "grad_norm": 0.5126096606254578, + "learning_rate": 0.0004901241489033692, + "loss": 3.2156, + "step": 17233 + }, + { + "epoch": 0.84, + "grad_norm": 0.5179157853126526, + "learning_rate": 0.0004901122390798694, + "loss": 3.2843, + "step": 17234 + }, + { + "epoch": 0.84, + "grad_norm": 0.5556790232658386, + "learning_rate": 0.0004901003287556507, + "loss": 3.0277, + "step": 17235 + }, + { + "epoch": 0.84, + "grad_norm": 0.5082927942276001, + "learning_rate": 0.0004900884179307441, + "loss": 3.1995, + "step": 17236 + }, + { + "epoch": 0.84, + "grad_norm": 0.5159091353416443, + "learning_rate": 0.0004900765066051814, + "loss": 3.0237, + "step": 17237 + }, + { + "epoch": 0.84, + "grad_norm": 0.49801966547966003, + "learning_rate": 0.0004900645947789938, + "loss": 3.155, + "step": 17238 + }, + { + "epoch": 0.84, + "grad_norm": 0.5168977379798889, + "learning_rate": 0.0004900526824522125, + "loss": 3.1495, + "step": 17239 + }, + { + "epoch": 0.84, + "grad_norm": 0.5264788269996643, + "learning_rate": 0.000490040769624869, + "loss": 2.9527, + "step": 17240 + }, + { + "epoch": 0.84, + "grad_norm": 0.6261751055717468, + "learning_rate": 0.0004900288562969947, + "loss": 3.2426, + "step": 17241 + }, + { + "epoch": 0.84, + "grad_norm": 0.5559073090553284, + "learning_rate": 0.000490016942468621, + "loss": 3.0032, + "step": 17242 + }, + { + "epoch": 0.85, + "grad_norm": 0.5475751161575317, + "learning_rate": 0.0004900050281397792, + "loss": 3.396, + "step": 17243 + }, + { + "epoch": 0.85, + "grad_norm": 0.5120872259140015, + "learning_rate": 0.0004899931133105007, + "loss": 3.1183, + "step": 17244 + }, + { + "epoch": 0.85, + "grad_norm": 0.5100634694099426, + "learning_rate": 0.000489981197980817, + "loss": 3.1615, + "step": 17245 + }, + { + "epoch": 0.85, + "grad_norm": 0.5490902066230774, + "learning_rate": 0.0004899692821507593, + "loss": 3.0538, + "step": 17246 + }, + { + "epoch": 0.85, + "grad_norm": 0.48027950525283813, + "learning_rate": 0.0004899573658203591, + "loss": 3.1387, + "step": 17247 + }, + { + "epoch": 0.85, + "grad_norm": 0.5834192633628845, + "learning_rate": 0.0004899454489896479, + "loss": 3.1026, + "step": 17248 + }, + { + "epoch": 0.85, + "grad_norm": 0.5443624258041382, + "learning_rate": 0.0004899335316586568, + "loss": 3.1497, + "step": 17249 + }, + { + "epoch": 0.85, + "grad_norm": 0.5452780723571777, + "learning_rate": 0.0004899216138274172, + "loss": 3.2115, + "step": 17250 + }, + { + "epoch": 0.85, + "grad_norm": 0.5227807760238647, + "learning_rate": 0.0004899096954959608, + "loss": 3.2457, + "step": 17251 + }, + { + "epoch": 0.85, + "grad_norm": 0.5504705905914307, + "learning_rate": 0.0004898977766643188, + "loss": 2.9626, + "step": 17252 + }, + { + "epoch": 0.85, + "grad_norm": 0.5524275898933411, + "learning_rate": 0.0004898858573325226, + "loss": 3.2484, + "step": 17253 + }, + { + "epoch": 0.85, + "grad_norm": 0.5162675380706787, + "learning_rate": 0.0004898739375006036, + "loss": 3.2485, + "step": 17254 + }, + { + "epoch": 0.85, + "grad_norm": 0.5104801058769226, + "learning_rate": 0.0004898620171685932, + "loss": 3.2277, + "step": 17255 + }, + { + "epoch": 0.85, + "grad_norm": 0.4923844635486603, + "learning_rate": 0.0004898500963365226, + "loss": 3.1075, + "step": 17256 + }, + { + "epoch": 0.85, + "grad_norm": 0.5191243886947632, + "learning_rate": 0.0004898381750044236, + "loss": 3.007, + "step": 17257 + }, + { + "epoch": 0.85, + "grad_norm": 0.531520426273346, + "learning_rate": 0.0004898262531723273, + "loss": 3.1071, + "step": 17258 + }, + { + "epoch": 0.85, + "grad_norm": 0.5321716070175171, + "learning_rate": 0.0004898143308402652, + "loss": 3.2239, + "step": 17259 + }, + { + "epoch": 0.85, + "grad_norm": 0.533822238445282, + "learning_rate": 0.0004898024080082688, + "loss": 3.0522, + "step": 17260 + }, + { + "epoch": 0.85, + "grad_norm": 0.5310263633728027, + "learning_rate": 0.0004897904846763692, + "loss": 3.0263, + "step": 17261 + }, + { + "epoch": 0.85, + "grad_norm": 0.5534727573394775, + "learning_rate": 0.000489778560844598, + "loss": 3.2554, + "step": 17262 + }, + { + "epoch": 0.85, + "grad_norm": 0.5336495041847229, + "learning_rate": 0.0004897666365129867, + "loss": 3.1274, + "step": 17263 + }, + { + "epoch": 0.85, + "grad_norm": 0.5278199911117554, + "learning_rate": 0.0004897547116815666, + "loss": 3.2348, + "step": 17264 + }, + { + "epoch": 0.85, + "grad_norm": 0.5093748569488525, + "learning_rate": 0.0004897427863503691, + "loss": 3.1455, + "step": 17265 + }, + { + "epoch": 0.85, + "grad_norm": 0.565778374671936, + "learning_rate": 0.0004897308605194255, + "loss": 3.0988, + "step": 17266 + }, + { + "epoch": 0.85, + "grad_norm": 0.5380170941352844, + "learning_rate": 0.0004897189341887673, + "loss": 3.0954, + "step": 17267 + }, + { + "epoch": 0.85, + "grad_norm": 0.4972584843635559, + "learning_rate": 0.0004897070073584262, + "loss": 3.0722, + "step": 17268 + }, + { + "epoch": 0.85, + "grad_norm": 0.533125638961792, + "learning_rate": 0.0004896950800284332, + "loss": 3.1618, + "step": 17269 + }, + { + "epoch": 0.85, + "grad_norm": 0.49823445081710815, + "learning_rate": 0.0004896831521988198, + "loss": 3.1106, + "step": 17270 + }, + { + "epoch": 0.85, + "grad_norm": 0.5134430527687073, + "learning_rate": 0.0004896712238696176, + "loss": 2.9883, + "step": 17271 + }, + { + "epoch": 0.85, + "grad_norm": 0.5448001027107239, + "learning_rate": 0.0004896592950408579, + "loss": 3.3028, + "step": 17272 + }, + { + "epoch": 0.85, + "grad_norm": 0.5064616799354553, + "learning_rate": 0.0004896473657125719, + "loss": 3.4008, + "step": 17273 + }, + { + "epoch": 0.85, + "grad_norm": 0.5209435820579529, + "learning_rate": 0.0004896354358847915, + "loss": 3.3331, + "step": 17274 + }, + { + "epoch": 0.85, + "grad_norm": 0.5029204487800598, + "learning_rate": 0.0004896235055575477, + "loss": 3.2065, + "step": 17275 + }, + { + "epoch": 0.85, + "grad_norm": 0.5218020677566528, + "learning_rate": 0.0004896115747308722, + "loss": 3.2772, + "step": 17276 + }, + { + "epoch": 0.85, + "grad_norm": 0.5261409878730774, + "learning_rate": 0.0004895996434047962, + "loss": 3.357, + "step": 17277 + }, + { + "epoch": 0.85, + "grad_norm": 0.5557669997215271, + "learning_rate": 0.0004895877115793513, + "loss": 3.2209, + "step": 17278 + }, + { + "epoch": 0.85, + "grad_norm": 0.5135267972946167, + "learning_rate": 0.0004895757792545689, + "loss": 3.0166, + "step": 17279 + }, + { + "epoch": 0.85, + "grad_norm": 0.4932655096054077, + "learning_rate": 0.0004895638464304802, + "loss": 3.1704, + "step": 17280 + }, + { + "epoch": 0.85, + "grad_norm": 0.5083035230636597, + "learning_rate": 0.0004895519131071169, + "loss": 3.0698, + "step": 17281 + }, + { + "epoch": 0.85, + "grad_norm": 0.4993206560611725, + "learning_rate": 0.0004895399792845105, + "loss": 3.1426, + "step": 17282 + }, + { + "epoch": 0.85, + "grad_norm": 0.5377535820007324, + "learning_rate": 0.000489528044962692, + "loss": 3.2107, + "step": 17283 + }, + { + "epoch": 0.85, + "grad_norm": 0.5332820415496826, + "learning_rate": 0.0004895161101416932, + "loss": 3.1717, + "step": 17284 + }, + { + "epoch": 0.85, + "grad_norm": 0.6028470993041992, + "learning_rate": 0.0004895041748215456, + "loss": 3.222, + "step": 17285 + }, + { + "epoch": 0.85, + "grad_norm": 0.5052365064620972, + "learning_rate": 0.0004894922390022803, + "loss": 3.1013, + "step": 17286 + }, + { + "epoch": 0.85, + "grad_norm": 0.5114204287528992, + "learning_rate": 0.000489480302683929, + "loss": 3.2889, + "step": 17287 + }, + { + "epoch": 0.85, + "grad_norm": 0.5291855931282043, + "learning_rate": 0.000489468365866523, + "loss": 3.0852, + "step": 17288 + }, + { + "epoch": 0.85, + "grad_norm": 0.5404118895530701, + "learning_rate": 0.0004894564285500938, + "loss": 3.1578, + "step": 17289 + }, + { + "epoch": 0.85, + "grad_norm": 0.5513167977333069, + "learning_rate": 0.0004894444907346729, + "loss": 3.177, + "step": 17290 + }, + { + "epoch": 0.85, + "grad_norm": 0.546855628490448, + "learning_rate": 0.0004894325524202915, + "loss": 3.1454, + "step": 17291 + }, + { + "epoch": 0.85, + "grad_norm": 0.5344764590263367, + "learning_rate": 0.0004894206136069813, + "loss": 3.0274, + "step": 17292 + }, + { + "epoch": 0.85, + "grad_norm": 0.531498908996582, + "learning_rate": 0.0004894086742947737, + "loss": 2.8852, + "step": 17293 + }, + { + "epoch": 0.85, + "grad_norm": 0.4873696565628052, + "learning_rate": 0.0004893967344837, + "loss": 3.1002, + "step": 17294 + }, + { + "epoch": 0.85, + "grad_norm": 0.5418617129325867, + "learning_rate": 0.0004893847941737919, + "loss": 3.2382, + "step": 17295 + }, + { + "epoch": 0.85, + "grad_norm": 0.5241795182228088, + "learning_rate": 0.0004893728533650806, + "loss": 3.2205, + "step": 17296 + }, + { + "epoch": 0.85, + "grad_norm": 0.5475279092788696, + "learning_rate": 0.0004893609120575976, + "loss": 3.2555, + "step": 17297 + }, + { + "epoch": 0.85, + "grad_norm": 0.65479975938797, + "learning_rate": 0.0004893489702513745, + "loss": 3.0701, + "step": 17298 + }, + { + "epoch": 0.85, + "grad_norm": 0.5669695138931274, + "learning_rate": 0.0004893370279464427, + "loss": 3.1972, + "step": 17299 + }, + { + "epoch": 0.85, + "grad_norm": 0.517507791519165, + "learning_rate": 0.0004893250851428335, + "loss": 3.1766, + "step": 17300 + }, + { + "epoch": 0.85, + "grad_norm": 0.5156394839286804, + "learning_rate": 0.0004893131418405786, + "loss": 3.2759, + "step": 17301 + }, + { + "epoch": 0.85, + "grad_norm": 0.48930031061172485, + "learning_rate": 0.0004893011980397091, + "loss": 3.1122, + "step": 17302 + }, + { + "epoch": 0.85, + "grad_norm": 0.518436074256897, + "learning_rate": 0.0004892892537402568, + "loss": 3.1862, + "step": 17303 + }, + { + "epoch": 0.85, + "grad_norm": 0.5759012699127197, + "learning_rate": 0.0004892773089422531, + "loss": 3.1416, + "step": 17304 + }, + { + "epoch": 0.85, + "grad_norm": 0.48443731665611267, + "learning_rate": 0.0004892653636457293, + "loss": 2.9702, + "step": 17305 + }, + { + "epoch": 0.85, + "grad_norm": 0.5327646732330322, + "learning_rate": 0.000489253417850717, + "loss": 3.1856, + "step": 17306 + }, + { + "epoch": 0.85, + "grad_norm": 0.5462185144424438, + "learning_rate": 0.0004892414715572475, + "loss": 3.0689, + "step": 17307 + }, + { + "epoch": 0.85, + "grad_norm": 0.535132110118866, + "learning_rate": 0.0004892295247653526, + "loss": 3.1475, + "step": 17308 + }, + { + "epoch": 0.85, + "grad_norm": 0.5083247423171997, + "learning_rate": 0.0004892175774750633, + "loss": 3.0753, + "step": 17309 + }, + { + "epoch": 0.85, + "grad_norm": 0.5415558815002441, + "learning_rate": 0.0004892056296864116, + "loss": 3.2183, + "step": 17310 + }, + { + "epoch": 0.85, + "grad_norm": 0.5436738729476929, + "learning_rate": 0.0004891936813994285, + "loss": 3.2266, + "step": 17311 + }, + { + "epoch": 0.85, + "grad_norm": 0.5327355265617371, + "learning_rate": 0.0004891817326141457, + "loss": 3.2321, + "step": 17312 + }, + { + "epoch": 0.85, + "grad_norm": 0.5031436681747437, + "learning_rate": 0.0004891697833305946, + "loss": 3.0962, + "step": 17313 + }, + { + "epoch": 0.85, + "grad_norm": 0.5180808305740356, + "learning_rate": 0.0004891578335488066, + "loss": 3.0242, + "step": 17314 + }, + { + "epoch": 0.85, + "grad_norm": 0.5372978448867798, + "learning_rate": 0.0004891458832688135, + "loss": 2.9658, + "step": 17315 + }, + { + "epoch": 0.85, + "grad_norm": 0.5055201053619385, + "learning_rate": 0.0004891339324906464, + "loss": 3.0476, + "step": 17316 + }, + { + "epoch": 0.85, + "grad_norm": 0.5402676463127136, + "learning_rate": 0.0004891219812143369, + "loss": 3.0622, + "step": 17317 + }, + { + "epoch": 0.85, + "grad_norm": 0.4617784023284912, + "learning_rate": 0.0004891100294399166, + "loss": 3.1742, + "step": 17318 + }, + { + "epoch": 0.85, + "grad_norm": 0.5083367228507996, + "learning_rate": 0.0004890980771674169, + "loss": 3.3683, + "step": 17319 + }, + { + "epoch": 0.85, + "grad_norm": 0.4868304431438446, + "learning_rate": 0.0004890861243968691, + "loss": 3.3446, + "step": 17320 + }, + { + "epoch": 0.85, + "grad_norm": 0.5708618760108948, + "learning_rate": 0.000489074171128305, + "loss": 3.1252, + "step": 17321 + }, + { + "epoch": 0.85, + "grad_norm": 0.5267812013626099, + "learning_rate": 0.0004890622173617558, + "loss": 3.1191, + "step": 17322 + }, + { + "epoch": 0.85, + "grad_norm": 0.5371107459068298, + "learning_rate": 0.0004890502630972532, + "loss": 3.1529, + "step": 17323 + }, + { + "epoch": 0.85, + "grad_norm": 0.5120932459831238, + "learning_rate": 0.0004890383083348285, + "loss": 3.0703, + "step": 17324 + }, + { + "epoch": 0.85, + "grad_norm": 0.5467737317085266, + "learning_rate": 0.0004890263530745134, + "loss": 3.2532, + "step": 17325 + }, + { + "epoch": 0.85, + "grad_norm": 0.4981206953525543, + "learning_rate": 0.0004890143973163391, + "loss": 3.1652, + "step": 17326 + }, + { + "epoch": 0.85, + "grad_norm": 0.5158091187477112, + "learning_rate": 0.0004890024410603372, + "loss": 3.0072, + "step": 17327 + }, + { + "epoch": 0.85, + "grad_norm": 0.5790042281150818, + "learning_rate": 0.0004889904843065394, + "loss": 3.2988, + "step": 17328 + }, + { + "epoch": 0.85, + "grad_norm": 0.5327931642532349, + "learning_rate": 0.0004889785270549771, + "loss": 3.0204, + "step": 17329 + }, + { + "epoch": 0.85, + "grad_norm": 0.5174896717071533, + "learning_rate": 0.0004889665693056817, + "loss": 3.1526, + "step": 17330 + }, + { + "epoch": 0.85, + "grad_norm": 0.5148389339447021, + "learning_rate": 0.0004889546110586847, + "loss": 3.1528, + "step": 17331 + }, + { + "epoch": 0.85, + "grad_norm": 0.4947047829627991, + "learning_rate": 0.0004889426523140175, + "loss": 2.9137, + "step": 17332 + }, + { + "epoch": 0.85, + "grad_norm": 0.5458266735076904, + "learning_rate": 0.0004889306930717118, + "loss": 3.3774, + "step": 17333 + }, + { + "epoch": 0.85, + "grad_norm": 0.4884343147277832, + "learning_rate": 0.0004889187333317991, + "loss": 3.116, + "step": 17334 + }, + { + "epoch": 0.85, + "grad_norm": 0.5519797801971436, + "learning_rate": 0.0004889067730943107, + "loss": 3.1202, + "step": 17335 + }, + { + "epoch": 0.85, + "grad_norm": 0.5409486889839172, + "learning_rate": 0.0004888948123592783, + "loss": 3.1933, + "step": 17336 + }, + { + "epoch": 0.85, + "grad_norm": 0.5440702438354492, + "learning_rate": 0.0004888828511267332, + "loss": 3.2795, + "step": 17337 + }, + { + "epoch": 0.85, + "grad_norm": 0.5199924111366272, + "learning_rate": 0.0004888708893967071, + "loss": 2.9574, + "step": 17338 + }, + { + "epoch": 0.85, + "grad_norm": 0.5392219424247742, + "learning_rate": 0.0004888589271692314, + "loss": 3.3653, + "step": 17339 + }, + { + "epoch": 0.85, + "grad_norm": 0.49010539054870605, + "learning_rate": 0.0004888469644443377, + "loss": 3.1307, + "step": 17340 + }, + { + "epoch": 0.85, + "grad_norm": 0.5155744552612305, + "learning_rate": 0.0004888350012220573, + "loss": 3.142, + "step": 17341 + }, + { + "epoch": 0.85, + "grad_norm": 0.5155385732650757, + "learning_rate": 0.000488823037502422, + "loss": 3.3988, + "step": 17342 + }, + { + "epoch": 0.85, + "grad_norm": 0.5531888008117676, + "learning_rate": 0.000488811073285463, + "loss": 2.9271, + "step": 17343 + }, + { + "epoch": 0.85, + "grad_norm": 0.5326036214828491, + "learning_rate": 0.0004887991085712121, + "loss": 3.1794, + "step": 17344 + }, + { + "epoch": 0.85, + "grad_norm": 0.5249530673027039, + "learning_rate": 0.0004887871433597006, + "loss": 3.2291, + "step": 17345 + }, + { + "epoch": 0.85, + "grad_norm": 0.5138980746269226, + "learning_rate": 0.0004887751776509602, + "loss": 3.0757, + "step": 17346 + }, + { + "epoch": 0.85, + "grad_norm": 0.5147232413291931, + "learning_rate": 0.0004887632114450222, + "loss": 3.0998, + "step": 17347 + }, + { + "epoch": 0.85, + "grad_norm": 0.5001515746116638, + "learning_rate": 0.0004887512447419184, + "loss": 3.2291, + "step": 17348 + }, + { + "epoch": 0.85, + "grad_norm": 0.5295124650001526, + "learning_rate": 0.00048873927754168, + "loss": 3.1302, + "step": 17349 + }, + { + "epoch": 0.85, + "grad_norm": 0.5005471706390381, + "learning_rate": 0.0004887273098443388, + "loss": 3.0511, + "step": 17350 + }, + { + "epoch": 0.85, + "grad_norm": 0.5099095702171326, + "learning_rate": 0.000488715341649926, + "loss": 3.1537, + "step": 17351 + }, + { + "epoch": 0.85, + "grad_norm": 0.5452235341072083, + "learning_rate": 0.0004887033729584734, + "loss": 3.231, + "step": 17352 + }, + { + "epoch": 0.85, + "grad_norm": 0.5622381567955017, + "learning_rate": 0.0004886914037700124, + "loss": 3.0763, + "step": 17353 + }, + { + "epoch": 0.85, + "grad_norm": 0.5072722434997559, + "learning_rate": 0.0004886794340845746, + "loss": 3.0325, + "step": 17354 + }, + { + "epoch": 0.85, + "grad_norm": 0.5430617332458496, + "learning_rate": 0.0004886674639021914, + "loss": 3.1989, + "step": 17355 + }, + { + "epoch": 0.85, + "grad_norm": 0.5264281034469604, + "learning_rate": 0.0004886554932228945, + "loss": 3.2848, + "step": 17356 + }, + { + "epoch": 0.85, + "grad_norm": 0.5271099209785461, + "learning_rate": 0.0004886435220467154, + "loss": 3.2634, + "step": 17357 + }, + { + "epoch": 0.85, + "grad_norm": 0.578886866569519, + "learning_rate": 0.0004886315503736854, + "loss": 3.2401, + "step": 17358 + }, + { + "epoch": 0.85, + "grad_norm": 0.48551952838897705, + "learning_rate": 0.0004886195782038364, + "loss": 3.2252, + "step": 17359 + }, + { + "epoch": 0.85, + "grad_norm": 0.49196016788482666, + "learning_rate": 0.0004886076055371995, + "loss": 3.2997, + "step": 17360 + }, + { + "epoch": 0.85, + "grad_norm": 0.5155906677246094, + "learning_rate": 0.0004885956323738066, + "loss": 3.0486, + "step": 17361 + }, + { + "epoch": 0.85, + "grad_norm": 0.5028941631317139, + "learning_rate": 0.0004885836587136892, + "loss": 3.1732, + "step": 17362 + }, + { + "epoch": 0.85, + "grad_norm": 0.49608007073402405, + "learning_rate": 0.0004885716845568786, + "loss": 3.0215, + "step": 17363 + }, + { + "epoch": 0.85, + "grad_norm": 0.5235550999641418, + "learning_rate": 0.0004885597099034064, + "loss": 3.1015, + "step": 17364 + }, + { + "epoch": 0.85, + "grad_norm": 0.5136752128601074, + "learning_rate": 0.0004885477347533044, + "loss": 3.1078, + "step": 17365 + }, + { + "epoch": 0.85, + "grad_norm": 0.49417296051979065, + "learning_rate": 0.0004885357591066038, + "loss": 3.1097, + "step": 17366 + }, + { + "epoch": 0.85, + "grad_norm": 0.4974791705608368, + "learning_rate": 0.0004885237829633363, + "loss": 3.176, + "step": 17367 + }, + { + "epoch": 0.85, + "grad_norm": 0.4924065172672272, + "learning_rate": 0.0004885118063235335, + "loss": 3.2376, + "step": 17368 + }, + { + "epoch": 0.85, + "grad_norm": 0.5056686997413635, + "learning_rate": 0.0004884998291872269, + "loss": 3.3538, + "step": 17369 + }, + { + "epoch": 0.85, + "grad_norm": 0.5538915395736694, + "learning_rate": 0.0004884878515544481, + "loss": 3.1879, + "step": 17370 + }, + { + "epoch": 0.85, + "grad_norm": 0.5767045617103577, + "learning_rate": 0.0004884758734252285, + "loss": 3.1592, + "step": 17371 + }, + { + "epoch": 0.85, + "grad_norm": 0.5208998918533325, + "learning_rate": 0.0004884638947995996, + "loss": 3.1981, + "step": 17372 + }, + { + "epoch": 0.85, + "grad_norm": 0.5236579775810242, + "learning_rate": 0.0004884519156775932, + "loss": 3.0012, + "step": 17373 + }, + { + "epoch": 0.85, + "grad_norm": 0.5991045832633972, + "learning_rate": 0.0004884399360592407, + "loss": 3.1331, + "step": 17374 + }, + { + "epoch": 0.85, + "grad_norm": 0.5242659449577332, + "learning_rate": 0.0004884279559445737, + "loss": 3.3605, + "step": 17375 + }, + { + "epoch": 0.85, + "grad_norm": 0.5292356610298157, + "learning_rate": 0.0004884159753336237, + "loss": 3.2468, + "step": 17376 + }, + { + "epoch": 0.85, + "grad_norm": 0.5420272946357727, + "learning_rate": 0.0004884039942264222, + "loss": 2.9437, + "step": 17377 + }, + { + "epoch": 0.85, + "grad_norm": 0.4963341951370239, + "learning_rate": 0.000488392012623001, + "loss": 3.2331, + "step": 17378 + }, + { + "epoch": 0.85, + "grad_norm": 0.505173921585083, + "learning_rate": 0.0004883800305233914, + "loss": 3.271, + "step": 17379 + }, + { + "epoch": 0.85, + "grad_norm": 0.530937671661377, + "learning_rate": 0.000488368047927625, + "loss": 3.2967, + "step": 17380 + }, + { + "epoch": 0.85, + "grad_norm": 0.5380979776382446, + "learning_rate": 0.0004883560648357335, + "loss": 3.0993, + "step": 17381 + }, + { + "epoch": 0.85, + "grad_norm": 0.5381925702095032, + "learning_rate": 0.0004883440812477484, + "loss": 3.2143, + "step": 17382 + }, + { + "epoch": 0.85, + "grad_norm": 0.517504096031189, + "learning_rate": 0.0004883320971637012, + "loss": 3.2316, + "step": 17383 + }, + { + "epoch": 0.85, + "grad_norm": 0.5057746767997742, + "learning_rate": 0.0004883201125836234, + "loss": 3.1969, + "step": 17384 + }, + { + "epoch": 0.85, + "grad_norm": 0.5114002227783203, + "learning_rate": 0.0004883081275075467, + "loss": 3.135, + "step": 17385 + }, + { + "epoch": 0.85, + "grad_norm": 0.5425366163253784, + "learning_rate": 0.0004882961419355028, + "loss": 3.0397, + "step": 17386 + }, + { + "epoch": 0.85, + "grad_norm": 0.5366644859313965, + "learning_rate": 0.000488284155867523, + "loss": 3.3255, + "step": 17387 + }, + { + "epoch": 0.85, + "grad_norm": 0.5261602401733398, + "learning_rate": 0.0004882721693036389, + "loss": 3.1629, + "step": 17388 + }, + { + "epoch": 0.85, + "grad_norm": 0.5072944760322571, + "learning_rate": 0.0004882601822438823, + "loss": 3.1021, + "step": 17389 + }, + { + "epoch": 0.85, + "grad_norm": 0.6098434329032898, + "learning_rate": 0.0004882481946882845, + "loss": 2.9911, + "step": 17390 + }, + { + "epoch": 0.85, + "grad_norm": 0.535154402256012, + "learning_rate": 0.0004882362066368771, + "loss": 3.2225, + "step": 17391 + }, + { + "epoch": 0.85, + "grad_norm": 0.508673369884491, + "learning_rate": 0.00048822421808969197, + "loss": 3.2243, + "step": 17392 + }, + { + "epoch": 0.85, + "grad_norm": 0.5214126706123352, + "learning_rate": 0.0004882122290467603, + "loss": 3.1554, + "step": 17393 + }, + { + "epoch": 0.85, + "grad_norm": 0.5492194294929504, + "learning_rate": 0.0004882002395081139, + "loss": 3.1623, + "step": 17394 + }, + { + "epoch": 0.85, + "grad_norm": 0.558017909526825, + "learning_rate": 0.0004881882494737843, + "loss": 3.2415, + "step": 17395 + }, + { + "epoch": 0.85, + "grad_norm": 0.5010644793510437, + "learning_rate": 0.0004881762589438031, + "loss": 3.2828, + "step": 17396 + }, + { + "epoch": 0.85, + "grad_norm": 0.5531977415084839, + "learning_rate": 0.00048816426791820175, + "loss": 3.0989, + "step": 17397 + }, + { + "epoch": 0.85, + "grad_norm": 0.5068770051002502, + "learning_rate": 0.00048815227639701205, + "loss": 3.3572, + "step": 17398 + }, + { + "epoch": 0.85, + "grad_norm": 0.6324036121368408, + "learning_rate": 0.00048814028438026535, + "loss": 3.0686, + "step": 17399 + }, + { + "epoch": 0.85, + "grad_norm": 0.5348002910614014, + "learning_rate": 0.00048812829186799337, + "loss": 3.1609, + "step": 17400 + }, + { + "epoch": 0.85, + "grad_norm": 0.5051308274269104, + "learning_rate": 0.00048811629886022777, + "loss": 3.0573, + "step": 17401 + }, + { + "epoch": 0.85, + "grad_norm": 0.5545761585235596, + "learning_rate": 0.00048810430535699994, + "loss": 3.0548, + "step": 17402 + }, + { + "epoch": 0.85, + "grad_norm": 0.692309558391571, + "learning_rate": 0.0004880923113583416, + "loss": 3.1483, + "step": 17403 + }, + { + "epoch": 0.85, + "grad_norm": 0.5532946586608887, + "learning_rate": 0.0004880803168642843, + "loss": 3.1074, + "step": 17404 + }, + { + "epoch": 0.85, + "grad_norm": 0.7708467841148376, + "learning_rate": 0.00048806832187485964, + "loss": 3.3463, + "step": 17405 + }, + { + "epoch": 0.85, + "grad_norm": 0.5050947666168213, + "learning_rate": 0.00048805632639009936, + "loss": 3.0551, + "step": 17406 + }, + { + "epoch": 0.85, + "grad_norm": 0.49402984976768494, + "learning_rate": 0.0004880443304100347, + "loss": 3.1811, + "step": 17407 + }, + { + "epoch": 0.85, + "grad_norm": 0.6107621192932129, + "learning_rate": 0.0004880323339346976, + "loss": 3.1067, + "step": 17408 + }, + { + "epoch": 0.85, + "grad_norm": 0.5168527960777283, + "learning_rate": 0.00048802033696411946, + "loss": 3.1141, + "step": 17409 + }, + { + "epoch": 0.85, + "grad_norm": 0.518570601940155, + "learning_rate": 0.00048800833949833196, + "loss": 2.9312, + "step": 17410 + }, + { + "epoch": 0.85, + "grad_norm": 0.5209082961082458, + "learning_rate": 0.0004879963415373667, + "loss": 2.9536, + "step": 17411 + }, + { + "epoch": 0.85, + "grad_norm": 0.49775710701942444, + "learning_rate": 0.0004879843430812552, + "loss": 3.173, + "step": 17412 + }, + { + "epoch": 0.85, + "grad_norm": 0.5183805823326111, + "learning_rate": 0.00048797234413002914, + "loss": 3.2114, + "step": 17413 + }, + { + "epoch": 0.85, + "grad_norm": 0.5256752967834473, + "learning_rate": 0.00048796034468372016, + "loss": 3.0888, + "step": 17414 + }, + { + "epoch": 0.85, + "grad_norm": 0.47854384779930115, + "learning_rate": 0.0004879483447423597, + "loss": 3.3277, + "step": 17415 + }, + { + "epoch": 0.85, + "grad_norm": 0.5341052412986755, + "learning_rate": 0.00048793634430597946, + "loss": 3.0255, + "step": 17416 + }, + { + "epoch": 0.85, + "grad_norm": 0.5231382846832275, + "learning_rate": 0.00048792434337461107, + "loss": 3.367, + "step": 17417 + }, + { + "epoch": 0.85, + "grad_norm": 0.5256244540214539, + "learning_rate": 0.0004879123419482862, + "loss": 3.3875, + "step": 17418 + }, + { + "epoch": 0.85, + "grad_norm": 0.529234766960144, + "learning_rate": 0.00048790034002703646, + "loss": 3.075, + "step": 17419 + }, + { + "epoch": 0.85, + "grad_norm": 0.6451051831245422, + "learning_rate": 0.0004878883376108932, + "loss": 3.0641, + "step": 17420 + }, + { + "epoch": 0.85, + "grad_norm": 0.5298725366592407, + "learning_rate": 0.00048787633469988834, + "loss": 3.2128, + "step": 17421 + }, + { + "epoch": 0.85, + "grad_norm": 0.5713647603988647, + "learning_rate": 0.00048786433129405335, + "loss": 3.0181, + "step": 17422 + }, + { + "epoch": 0.85, + "grad_norm": 0.48418837785720825, + "learning_rate": 0.0004878523273934198, + "loss": 3.0541, + "step": 17423 + }, + { + "epoch": 0.85, + "grad_norm": 0.5305963754653931, + "learning_rate": 0.00048784032299801946, + "loss": 3.2004, + "step": 17424 + }, + { + "epoch": 0.85, + "grad_norm": 0.5473353266716003, + "learning_rate": 0.00048782831810788383, + "loss": 3.0637, + "step": 17425 + }, + { + "epoch": 0.85, + "grad_norm": 0.5448347926139832, + "learning_rate": 0.00048781631272304453, + "loss": 3.1816, + "step": 17426 + }, + { + "epoch": 0.85, + "grad_norm": 0.522763729095459, + "learning_rate": 0.00048780430684353327, + "loss": 3.0765, + "step": 17427 + }, + { + "epoch": 0.85, + "grad_norm": 0.49944186210632324, + "learning_rate": 0.0004877923004693815, + "loss": 3.2881, + "step": 17428 + }, + { + "epoch": 0.85, + "grad_norm": 0.522628664970398, + "learning_rate": 0.0004877802936006211, + "loss": 3.361, + "step": 17429 + }, + { + "epoch": 0.85, + "grad_norm": 0.5693197250366211, + "learning_rate": 0.00048776828623728346, + "loss": 3.3028, + "step": 17430 + }, + { + "epoch": 0.85, + "grad_norm": 0.5283417701721191, + "learning_rate": 0.00048775627837940026, + "loss": 3.2618, + "step": 17431 + }, + { + "epoch": 0.85, + "grad_norm": 0.5026355385780334, + "learning_rate": 0.0004877442700270032, + "loss": 3.0906, + "step": 17432 + }, + { + "epoch": 0.85, + "grad_norm": 0.5224277973175049, + "learning_rate": 0.0004877322611801239, + "loss": 2.9159, + "step": 17433 + }, + { + "epoch": 0.85, + "grad_norm": 0.5417836308479309, + "learning_rate": 0.0004877202518387939, + "loss": 3.1032, + "step": 17434 + }, + { + "epoch": 0.85, + "grad_norm": 0.5314958691596985, + "learning_rate": 0.00048770824200304494, + "loss": 3.1461, + "step": 17435 + }, + { + "epoch": 0.85, + "grad_norm": 0.5614078640937805, + "learning_rate": 0.0004876962316729085, + "loss": 3.0781, + "step": 17436 + }, + { + "epoch": 0.85, + "grad_norm": 0.5217527747154236, + "learning_rate": 0.00048768422084841643, + "loss": 3.1365, + "step": 17437 + }, + { + "epoch": 0.85, + "grad_norm": 0.5421162247657776, + "learning_rate": 0.00048767220952960013, + "loss": 3.2526, + "step": 17438 + }, + { + "epoch": 0.85, + "grad_norm": 0.647996187210083, + "learning_rate": 0.0004876601977164914, + "loss": 2.8944, + "step": 17439 + }, + { + "epoch": 0.85, + "grad_norm": 0.5372748970985413, + "learning_rate": 0.0004876481854091218, + "loss": 2.9952, + "step": 17440 + }, + { + "epoch": 0.85, + "grad_norm": 0.541601300239563, + "learning_rate": 0.000487636172607523, + "loss": 3.25, + "step": 17441 + }, + { + "epoch": 0.85, + "grad_norm": 0.5672915577888489, + "learning_rate": 0.0004876241593117267, + "loss": 2.9457, + "step": 17442 + }, + { + "epoch": 0.85, + "grad_norm": 0.530311644077301, + "learning_rate": 0.0004876121455217644, + "loss": 2.875, + "step": 17443 + }, + { + "epoch": 0.85, + "grad_norm": 0.5655810236930847, + "learning_rate": 0.0004876001312376679, + "loss": 3.3092, + "step": 17444 + }, + { + "epoch": 0.85, + "grad_norm": 0.5126374363899231, + "learning_rate": 0.0004875881164594686, + "loss": 3.0786, + "step": 17445 + }, + { + "epoch": 0.85, + "grad_norm": 0.5114005208015442, + "learning_rate": 0.0004875761011871984, + "loss": 3.4152, + "step": 17446 + }, + { + "epoch": 0.86, + "grad_norm": 0.5263176560401917, + "learning_rate": 0.00048756408542088885, + "loss": 3.2279, + "step": 17447 + }, + { + "epoch": 0.86, + "grad_norm": 0.5074858069419861, + "learning_rate": 0.0004875520691605716, + "loss": 3.2816, + "step": 17448 + }, + { + "epoch": 0.86, + "grad_norm": 0.5116199851036072, + "learning_rate": 0.00048754005240627833, + "loss": 3.3319, + "step": 17449 + }, + { + "epoch": 0.86, + "grad_norm": 0.5231490135192871, + "learning_rate": 0.00048752803515804063, + "loss": 3.3931, + "step": 17450 + }, + { + "epoch": 0.86, + "grad_norm": 0.6299848556518555, + "learning_rate": 0.00048751601741589014, + "loss": 2.8908, + "step": 17451 + }, + { + "epoch": 0.86, + "grad_norm": 0.5070332288742065, + "learning_rate": 0.00048750399917985856, + "loss": 3.2715, + "step": 17452 + }, + { + "epoch": 0.86, + "grad_norm": 0.6328778862953186, + "learning_rate": 0.0004874919804499775, + "loss": 3.2513, + "step": 17453 + }, + { + "epoch": 0.86, + "grad_norm": 0.52276211977005, + "learning_rate": 0.0004874799612262787, + "loss": 3.0997, + "step": 17454 + }, + { + "epoch": 0.86, + "grad_norm": 0.5386821031570435, + "learning_rate": 0.0004874679415087938, + "loss": 3.2454, + "step": 17455 + }, + { + "epoch": 0.86, + "grad_norm": 0.506618320941925, + "learning_rate": 0.00048745592129755433, + "loss": 3.1048, + "step": 17456 + }, + { + "epoch": 0.86, + "grad_norm": 0.5649548768997192, + "learning_rate": 0.0004874439005925921, + "loss": 2.8524, + "step": 17457 + }, + { + "epoch": 0.86, + "grad_norm": 0.5274940729141235, + "learning_rate": 0.00048743187939393867, + "loss": 3.2162, + "step": 17458 + }, + { + "epoch": 0.86, + "grad_norm": 0.546772301197052, + "learning_rate": 0.0004874198577016258, + "loss": 3.1506, + "step": 17459 + }, + { + "epoch": 0.86, + "grad_norm": 0.5609162449836731, + "learning_rate": 0.00048740783551568504, + "loss": 3.3401, + "step": 17460 + }, + { + "epoch": 0.86, + "grad_norm": 0.5612881183624268, + "learning_rate": 0.0004873958128361481, + "loss": 3.1514, + "step": 17461 + }, + { + "epoch": 0.86, + "grad_norm": 0.5242128968238831, + "learning_rate": 0.0004873837896630467, + "loss": 2.9899, + "step": 17462 + }, + { + "epoch": 0.86, + "grad_norm": 0.5031869411468506, + "learning_rate": 0.00048737176599641246, + "loss": 3.2779, + "step": 17463 + }, + { + "epoch": 0.86, + "grad_norm": 0.5017745494842529, + "learning_rate": 0.00048735974183627707, + "loss": 3.2305, + "step": 17464 + }, + { + "epoch": 0.86, + "grad_norm": 0.5221795439720154, + "learning_rate": 0.0004873477171826721, + "loss": 3.2039, + "step": 17465 + }, + { + "epoch": 0.86, + "grad_norm": 0.5255293846130371, + "learning_rate": 0.0004873356920356293, + "loss": 3.1282, + "step": 17466 + }, + { + "epoch": 0.86, + "grad_norm": 0.5047944784164429, + "learning_rate": 0.00048732366639518037, + "loss": 2.9352, + "step": 17467 + }, + { + "epoch": 0.86, + "grad_norm": 0.5233396291732788, + "learning_rate": 0.000487311640261357, + "loss": 3.2509, + "step": 17468 + }, + { + "epoch": 0.86, + "grad_norm": 0.505703330039978, + "learning_rate": 0.0004872996136341908, + "loss": 3.2049, + "step": 17469 + }, + { + "epoch": 0.86, + "grad_norm": 0.5102468729019165, + "learning_rate": 0.00048728758651371345, + "loss": 3.1221, + "step": 17470 + }, + { + "epoch": 0.86, + "grad_norm": 0.5118353962898254, + "learning_rate": 0.00048727555889995655, + "loss": 3.0826, + "step": 17471 + }, + { + "epoch": 0.86, + "grad_norm": 0.5210556387901306, + "learning_rate": 0.000487263530792952, + "loss": 3.0314, + "step": 17472 + }, + { + "epoch": 0.86, + "grad_norm": 0.5125072598457336, + "learning_rate": 0.0004872515021927313, + "loss": 3.3292, + "step": 17473 + }, + { + "epoch": 0.86, + "grad_norm": 0.5282806754112244, + "learning_rate": 0.00048723947309932625, + "loss": 3.3225, + "step": 17474 + }, + { + "epoch": 0.86, + "grad_norm": 0.4971778392791748, + "learning_rate": 0.00048722744351276836, + "loss": 3.2805, + "step": 17475 + }, + { + "epoch": 0.86, + "grad_norm": 0.5685766339302063, + "learning_rate": 0.00048721541343308946, + "loss": 3.1796, + "step": 17476 + }, + { + "epoch": 0.86, + "grad_norm": 0.5498575568199158, + "learning_rate": 0.0004872033828603212, + "loss": 3.1008, + "step": 17477 + }, + { + "epoch": 0.86, + "grad_norm": 0.5463971495628357, + "learning_rate": 0.0004871913517944952, + "loss": 3.0532, + "step": 17478 + }, + { + "epoch": 0.86, + "grad_norm": 0.5287131667137146, + "learning_rate": 0.0004871793202356432, + "loss": 3.3487, + "step": 17479 + }, + { + "epoch": 0.86, + "grad_norm": 0.5136821866035461, + "learning_rate": 0.000487167288183797, + "loss": 3.2506, + "step": 17480 + }, + { + "epoch": 0.86, + "grad_norm": 0.5033801794052124, + "learning_rate": 0.0004871552556389881, + "loss": 3.2242, + "step": 17481 + }, + { + "epoch": 0.86, + "grad_norm": 0.5138252377510071, + "learning_rate": 0.00048714322260124825, + "loss": 3.0705, + "step": 17482 + }, + { + "epoch": 0.86, + "grad_norm": 0.5219220519065857, + "learning_rate": 0.00048713118907060923, + "loss": 2.9397, + "step": 17483 + }, + { + "epoch": 0.86, + "grad_norm": 0.5228368043899536, + "learning_rate": 0.00048711915504710276, + "loss": 3.2545, + "step": 17484 + }, + { + "epoch": 0.86, + "grad_norm": 0.5889149308204651, + "learning_rate": 0.00048710712053076036, + "loss": 3.1426, + "step": 17485 + }, + { + "epoch": 0.86, + "grad_norm": 0.5183678865432739, + "learning_rate": 0.00048709508552161377, + "loss": 3.1804, + "step": 17486 + }, + { + "epoch": 0.86, + "grad_norm": 0.5020313858985901, + "learning_rate": 0.0004870830500196947, + "loss": 3.4664, + "step": 17487 + }, + { + "epoch": 0.86, + "grad_norm": 0.522677481174469, + "learning_rate": 0.00048707101402503493, + "loss": 3.2824, + "step": 17488 + }, + { + "epoch": 0.86, + "grad_norm": 0.5405004024505615, + "learning_rate": 0.0004870589775376661, + "loss": 3.1882, + "step": 17489 + }, + { + "epoch": 0.86, + "grad_norm": 0.4743952453136444, + "learning_rate": 0.00048704694055762005, + "loss": 3.257, + "step": 17490 + }, + { + "epoch": 0.86, + "grad_norm": 0.49832382798194885, + "learning_rate": 0.00048703490308492825, + "loss": 3.156, + "step": 17491 + }, + { + "epoch": 0.86, + "grad_norm": 0.553101122379303, + "learning_rate": 0.0004870228651196225, + "loss": 3.1769, + "step": 17492 + }, + { + "epoch": 0.86, + "grad_norm": 0.5105452537536621, + "learning_rate": 0.00048701082666173447, + "loss": 3.228, + "step": 17493 + }, + { + "epoch": 0.86, + "grad_norm": 0.4981645345687866, + "learning_rate": 0.0004869987877112961, + "loss": 3.1893, + "step": 17494 + }, + { + "epoch": 0.86, + "grad_norm": 0.5161097645759583, + "learning_rate": 0.00048698674826833875, + "loss": 3.1463, + "step": 17495 + }, + { + "epoch": 0.86, + "grad_norm": 0.5108583569526672, + "learning_rate": 0.00048697470833289434, + "loss": 3.1373, + "step": 17496 + }, + { + "epoch": 0.86, + "grad_norm": 0.5210059881210327, + "learning_rate": 0.00048696266790499457, + "loss": 3.2682, + "step": 17497 + }, + { + "epoch": 0.86, + "grad_norm": 0.5149890184402466, + "learning_rate": 0.000486950626984671, + "loss": 3.1596, + "step": 17498 + }, + { + "epoch": 0.86, + "grad_norm": 0.5131500959396362, + "learning_rate": 0.0004869385855719555, + "loss": 3.0184, + "step": 17499 + }, + { + "epoch": 0.86, + "grad_norm": 0.5088669061660767, + "learning_rate": 0.00048692654366687985, + "loss": 3.1619, + "step": 17500 + }, + { + "epoch": 0.86, + "grad_norm": 0.5249161124229431, + "learning_rate": 0.0004869145012694755, + "loss": 3.087, + "step": 17501 + }, + { + "epoch": 0.86, + "grad_norm": 0.541541337966919, + "learning_rate": 0.00048690245837977445, + "loss": 3.0936, + "step": 17502 + }, + { + "epoch": 0.86, + "grad_norm": 0.5092646479606628, + "learning_rate": 0.0004868904149978082, + "loss": 3.0542, + "step": 17503 + }, + { + "epoch": 0.86, + "grad_norm": 0.5421231389045715, + "learning_rate": 0.0004868783711236087, + "loss": 3.3001, + "step": 17504 + }, + { + "epoch": 0.86, + "grad_norm": 0.5062888860702515, + "learning_rate": 0.00048686632675720736, + "loss": 3.0985, + "step": 17505 + }, + { + "epoch": 0.86, + "grad_norm": 0.5206896066665649, + "learning_rate": 0.00048685428189863616, + "loss": 3.0927, + "step": 17506 + }, + { + "epoch": 0.86, + "grad_norm": 0.5333325862884521, + "learning_rate": 0.0004868422365479268, + "loss": 3.2301, + "step": 17507 + }, + { + "epoch": 0.86, + "grad_norm": 0.5655118227005005, + "learning_rate": 0.0004868301907051109, + "loss": 3.1063, + "step": 17508 + }, + { + "epoch": 0.86, + "grad_norm": 0.5310041904449463, + "learning_rate": 0.0004868181443702202, + "loss": 3.1183, + "step": 17509 + }, + { + "epoch": 0.86, + "grad_norm": 0.5519410967826843, + "learning_rate": 0.00048680609754328644, + "loss": 3.0312, + "step": 17510 + }, + { + "epoch": 0.86, + "grad_norm": 0.6148340106010437, + "learning_rate": 0.0004867940502243415, + "loss": 3.2461, + "step": 17511 + }, + { + "epoch": 0.86, + "grad_norm": 0.512523889541626, + "learning_rate": 0.00048678200241341685, + "loss": 3.268, + "step": 17512 + }, + { + "epoch": 0.86, + "grad_norm": 0.4837605059146881, + "learning_rate": 0.0004867699541105444, + "loss": 2.947, + "step": 17513 + }, + { + "epoch": 0.86, + "grad_norm": 0.5407007336616516, + "learning_rate": 0.00048675790531575583, + "loss": 3.1923, + "step": 17514 + }, + { + "epoch": 0.86, + "grad_norm": 0.495334267616272, + "learning_rate": 0.00048674585602908287, + "loss": 3.2698, + "step": 17515 + }, + { + "epoch": 0.86, + "grad_norm": 0.5168710947036743, + "learning_rate": 0.00048673380625055735, + "loss": 3.2698, + "step": 17516 + }, + { + "epoch": 0.86, + "grad_norm": 0.5113250613212585, + "learning_rate": 0.0004867217559802109, + "loss": 3.401, + "step": 17517 + }, + { + "epoch": 0.86, + "grad_norm": 0.5109712481498718, + "learning_rate": 0.00048670970521807517, + "loss": 3.0884, + "step": 17518 + }, + { + "epoch": 0.86, + "grad_norm": 0.5782906413078308, + "learning_rate": 0.00048669765396418214, + "loss": 3.0593, + "step": 17519 + }, + { + "epoch": 0.86, + "grad_norm": 0.4731515347957611, + "learning_rate": 0.00048668560221856337, + "loss": 3.1659, + "step": 17520 + }, + { + "epoch": 0.86, + "grad_norm": 0.5255733132362366, + "learning_rate": 0.0004866735499812506, + "loss": 3.1536, + "step": 17521 + }, + { + "epoch": 0.86, + "grad_norm": 0.5351244807243347, + "learning_rate": 0.0004866614972522757, + "loss": 3.2712, + "step": 17522 + }, + { + "epoch": 0.86, + "grad_norm": 0.5200554132461548, + "learning_rate": 0.0004866494440316703, + "loss": 3.1256, + "step": 17523 + }, + { + "epoch": 0.86, + "grad_norm": 0.510212242603302, + "learning_rate": 0.0004866373903194661, + "loss": 3.0483, + "step": 17524 + }, + { + "epoch": 0.86, + "grad_norm": 0.5115465521812439, + "learning_rate": 0.0004866253361156951, + "loss": 2.9676, + "step": 17525 + }, + { + "epoch": 0.86, + "grad_norm": 0.5391139984130859, + "learning_rate": 0.0004866132814203888, + "loss": 3.2172, + "step": 17526 + }, + { + "epoch": 0.86, + "grad_norm": 0.5223973393440247, + "learning_rate": 0.00048660122623357904, + "loss": 3.2046, + "step": 17527 + }, + { + "epoch": 0.86, + "grad_norm": 0.5327551364898682, + "learning_rate": 0.00048658917055529753, + "loss": 3.2311, + "step": 17528 + }, + { + "epoch": 0.86, + "grad_norm": 0.5152480602264404, + "learning_rate": 0.0004865771143855761, + "loss": 3.1152, + "step": 17529 + }, + { + "epoch": 0.86, + "grad_norm": 0.5064302086830139, + "learning_rate": 0.0004865650577244464, + "loss": 3.3183, + "step": 17530 + }, + { + "epoch": 0.86, + "grad_norm": 0.5391542911529541, + "learning_rate": 0.00048655300057194036, + "loss": 2.9688, + "step": 17531 + }, + { + "epoch": 0.86, + "grad_norm": 0.49061742424964905, + "learning_rate": 0.00048654094292808955, + "loss": 3.2061, + "step": 17532 + }, + { + "epoch": 0.86, + "grad_norm": 0.5557584762573242, + "learning_rate": 0.00048652888479292574, + "loss": 3.2288, + "step": 17533 + }, + { + "epoch": 0.86, + "grad_norm": 0.5739030241966248, + "learning_rate": 0.0004865168261664808, + "loss": 3.1942, + "step": 17534 + }, + { + "epoch": 0.86, + "grad_norm": 0.5252779126167297, + "learning_rate": 0.00048650476704878643, + "loss": 3.271, + "step": 17535 + }, + { + "epoch": 0.86, + "grad_norm": 0.5234642028808594, + "learning_rate": 0.0004864927074398744, + "loss": 3.3313, + "step": 17536 + }, + { + "epoch": 0.86, + "grad_norm": 0.5235569477081299, + "learning_rate": 0.00048648064733977653, + "loss": 3.38, + "step": 17537 + }, + { + "epoch": 0.86, + "grad_norm": 0.5185587406158447, + "learning_rate": 0.00048646858674852443, + "loss": 3.1555, + "step": 17538 + }, + { + "epoch": 0.86, + "grad_norm": 0.546230673789978, + "learning_rate": 0.00048645652566615, + "loss": 2.8245, + "step": 17539 + }, + { + "epoch": 0.86, + "grad_norm": 0.5275043249130249, + "learning_rate": 0.0004864444640926849, + "loss": 3.2103, + "step": 17540 + }, + { + "epoch": 0.86, + "grad_norm": 0.5376495122909546, + "learning_rate": 0.00048643240202816105, + "loss": 3.0776, + "step": 17541 + }, + { + "epoch": 0.86, + "grad_norm": 0.5582915544509888, + "learning_rate": 0.0004864203394726101, + "loss": 2.9081, + "step": 17542 + }, + { + "epoch": 0.86, + "grad_norm": 0.5006242990493774, + "learning_rate": 0.0004864082764260639, + "loss": 3.2375, + "step": 17543 + }, + { + "epoch": 0.86, + "grad_norm": 0.5330790281295776, + "learning_rate": 0.0004863962128885541, + "loss": 3.1756, + "step": 17544 + }, + { + "epoch": 0.86, + "grad_norm": 0.5119918584823608, + "learning_rate": 0.00048638414886011263, + "loss": 3.119, + "step": 17545 + }, + { + "epoch": 0.86, + "grad_norm": 0.5270196199417114, + "learning_rate": 0.0004863720843407711, + "loss": 3.0816, + "step": 17546 + }, + { + "epoch": 0.86, + "grad_norm": 0.525503396987915, + "learning_rate": 0.0004863600193305614, + "loss": 3.0303, + "step": 17547 + }, + { + "epoch": 0.86, + "grad_norm": 0.528007447719574, + "learning_rate": 0.00048634795382951525, + "loss": 3.2578, + "step": 17548 + }, + { + "epoch": 0.86, + "grad_norm": 0.495852530002594, + "learning_rate": 0.0004863358878376645, + "loss": 3.1074, + "step": 17549 + }, + { + "epoch": 0.86, + "grad_norm": 0.5401389598846436, + "learning_rate": 0.0004863238213550409, + "loss": 3.3623, + "step": 17550 + }, + { + "epoch": 0.86, + "grad_norm": 0.5262596607208252, + "learning_rate": 0.00048631175438167606, + "loss": 3.3607, + "step": 17551 + }, + { + "epoch": 0.86, + "grad_norm": 0.6294898986816406, + "learning_rate": 0.0004862996869176021, + "loss": 3.1021, + "step": 17552 + }, + { + "epoch": 0.86, + "grad_norm": 0.5093241930007935, + "learning_rate": 0.0004862876189628505, + "loss": 3.1727, + "step": 17553 + }, + { + "epoch": 0.86, + "grad_norm": 0.51717609167099, + "learning_rate": 0.0004862755505174533, + "loss": 3.0077, + "step": 17554 + }, + { + "epoch": 0.86, + "grad_norm": 0.5517076849937439, + "learning_rate": 0.00048626348158144206, + "loss": 3.1981, + "step": 17555 + }, + { + "epoch": 0.86, + "grad_norm": 0.5027273893356323, + "learning_rate": 0.0004862514121548486, + "loss": 3.0377, + "step": 17556 + }, + { + "epoch": 0.86, + "grad_norm": 0.5180482268333435, + "learning_rate": 0.0004862393422377048, + "loss": 3.1539, + "step": 17557 + }, + { + "epoch": 0.86, + "grad_norm": 0.5299745202064514, + "learning_rate": 0.00048622727183004246, + "loss": 3.0539, + "step": 17558 + }, + { + "epoch": 0.86, + "grad_norm": 0.49521052837371826, + "learning_rate": 0.0004862152009318933, + "loss": 3.3228, + "step": 17559 + }, + { + "epoch": 0.86, + "grad_norm": 0.5019090175628662, + "learning_rate": 0.0004862031295432892, + "loss": 3.1764, + "step": 17560 + }, + { + "epoch": 0.86, + "grad_norm": 0.5337850451469421, + "learning_rate": 0.0004861910576642618, + "loss": 3.2292, + "step": 17561 + }, + { + "epoch": 0.86, + "grad_norm": 0.5519272089004517, + "learning_rate": 0.000486178985294843, + "loss": 3.2541, + "step": 17562 + }, + { + "epoch": 0.86, + "grad_norm": 0.5433281660079956, + "learning_rate": 0.0004861669124350646, + "loss": 3.2232, + "step": 17563 + }, + { + "epoch": 0.86, + "grad_norm": 0.5310292840003967, + "learning_rate": 0.0004861548390849584, + "loss": 3.0135, + "step": 17564 + }, + { + "epoch": 0.86, + "grad_norm": 0.50967937707901, + "learning_rate": 0.00048614276524455615, + "loss": 3.193, + "step": 17565 + }, + { + "epoch": 0.86, + "grad_norm": 0.5428502559661865, + "learning_rate": 0.0004861306909138897, + "loss": 3.2202, + "step": 17566 + }, + { + "epoch": 0.86, + "grad_norm": 0.5671022534370422, + "learning_rate": 0.00048611861609299087, + "loss": 3.2069, + "step": 17567 + }, + { + "epoch": 0.86, + "grad_norm": 0.517419695854187, + "learning_rate": 0.0004861065407818914, + "loss": 3.1775, + "step": 17568 + }, + { + "epoch": 0.86, + "grad_norm": 0.5266711711883545, + "learning_rate": 0.00048609446498062303, + "loss": 3.0535, + "step": 17569 + }, + { + "epoch": 0.86, + "grad_norm": 0.5126215815544128, + "learning_rate": 0.00048608238868921773, + "loss": 3.2225, + "step": 17570 + }, + { + "epoch": 0.86, + "grad_norm": 0.5277024507522583, + "learning_rate": 0.0004860703119077073, + "loss": 2.9624, + "step": 17571 + }, + { + "epoch": 0.86, + "grad_norm": 0.5014190077781677, + "learning_rate": 0.00048605823463612334, + "loss": 3.1931, + "step": 17572 + }, + { + "epoch": 0.86, + "grad_norm": 0.5363627076148987, + "learning_rate": 0.00048604615687449794, + "loss": 3.3138, + "step": 17573 + }, + { + "epoch": 0.86, + "grad_norm": 0.5196545720100403, + "learning_rate": 0.00048603407862286266, + "loss": 3.0961, + "step": 17574 + }, + { + "epoch": 0.86, + "grad_norm": 0.49793750047683716, + "learning_rate": 0.0004860219998812494, + "loss": 3.2758, + "step": 17575 + }, + { + "epoch": 0.86, + "grad_norm": 0.5129599571228027, + "learning_rate": 0.00048600992064969014, + "loss": 3.2445, + "step": 17576 + }, + { + "epoch": 0.86, + "grad_norm": 0.5278726816177368, + "learning_rate": 0.0004859978409282164, + "loss": 3.0858, + "step": 17577 + }, + { + "epoch": 0.86, + "grad_norm": 0.5560405850410461, + "learning_rate": 0.0004859857607168602, + "loss": 3.0362, + "step": 17578 + }, + { + "epoch": 0.86, + "grad_norm": 0.5135570764541626, + "learning_rate": 0.0004859736800156533, + "loss": 3.0105, + "step": 17579 + }, + { + "epoch": 0.86, + "grad_norm": 0.5374484062194824, + "learning_rate": 0.0004859615988246275, + "loss": 3.118, + "step": 17580 + }, + { + "epoch": 0.86, + "grad_norm": 0.5141043066978455, + "learning_rate": 0.00048594951714381465, + "loss": 2.999, + "step": 17581 + }, + { + "epoch": 0.86, + "grad_norm": 0.5360473990440369, + "learning_rate": 0.0004859374349732466, + "loss": 3.1131, + "step": 17582 + }, + { + "epoch": 0.86, + "grad_norm": 0.5004850029945374, + "learning_rate": 0.000485925352312955, + "loss": 3.1204, + "step": 17583 + }, + { + "epoch": 0.86, + "grad_norm": 0.5214194655418396, + "learning_rate": 0.00048591326916297184, + "loss": 3.3562, + "step": 17584 + }, + { + "epoch": 0.86, + "grad_norm": 0.5029579401016235, + "learning_rate": 0.000485901185523329, + "loss": 3.2461, + "step": 17585 + }, + { + "epoch": 0.86, + "grad_norm": 0.5304317474365234, + "learning_rate": 0.0004858891013940582, + "loss": 3.2182, + "step": 17586 + }, + { + "epoch": 0.86, + "grad_norm": 0.5088167786598206, + "learning_rate": 0.00048587701677519127, + "loss": 3.1036, + "step": 17587 + }, + { + "epoch": 0.86, + "grad_norm": 0.5475550889968872, + "learning_rate": 0.00048586493166676004, + "loss": 3.0157, + "step": 17588 + }, + { + "epoch": 0.86, + "grad_norm": 0.5142987370491028, + "learning_rate": 0.0004858528460687963, + "loss": 2.8891, + "step": 17589 + }, + { + "epoch": 0.86, + "grad_norm": 0.5690994262695312, + "learning_rate": 0.000485840759981332, + "loss": 3.1212, + "step": 17590 + }, + { + "epoch": 0.86, + "grad_norm": 0.5271167159080505, + "learning_rate": 0.00048582867340439897, + "loss": 3.2301, + "step": 17591 + }, + { + "epoch": 0.86, + "grad_norm": 0.512944757938385, + "learning_rate": 0.00048581658633802883, + "loss": 3.3691, + "step": 17592 + }, + { + "epoch": 0.86, + "grad_norm": 0.5304507613182068, + "learning_rate": 0.00048580449878225367, + "loss": 3.3141, + "step": 17593 + }, + { + "epoch": 0.86, + "grad_norm": 0.6154621839523315, + "learning_rate": 0.0004857924107371051, + "loss": 3.1346, + "step": 17594 + }, + { + "epoch": 0.86, + "grad_norm": 0.556638777256012, + "learning_rate": 0.0004857803222026151, + "loss": 3.1249, + "step": 17595 + }, + { + "epoch": 0.86, + "grad_norm": 0.5296390652656555, + "learning_rate": 0.00048576823317881564, + "loss": 2.9605, + "step": 17596 + }, + { + "epoch": 0.86, + "grad_norm": 0.5081044435501099, + "learning_rate": 0.00048575614366573827, + "loss": 3.182, + "step": 17597 + }, + { + "epoch": 0.86, + "grad_norm": 0.5104318261146545, + "learning_rate": 0.0004857440536634151, + "loss": 3.342, + "step": 17598 + }, + { + "epoch": 0.86, + "grad_norm": 0.5356442332267761, + "learning_rate": 0.0004857319631718777, + "loss": 3.0014, + "step": 17599 + }, + { + "epoch": 0.86, + "grad_norm": 0.48045575618743896, + "learning_rate": 0.000485719872191158, + "loss": 3.2048, + "step": 17600 + }, + { + "epoch": 0.86, + "grad_norm": 0.4831188917160034, + "learning_rate": 0.00048570778072128806, + "loss": 3.4564, + "step": 17601 + }, + { + "epoch": 0.86, + "grad_norm": 0.5195653438568115, + "learning_rate": 0.00048569568876229944, + "loss": 3.4224, + "step": 17602 + }, + { + "epoch": 0.86, + "grad_norm": 0.5168091654777527, + "learning_rate": 0.0004856835963142242, + "loss": 3.0934, + "step": 17603 + }, + { + "epoch": 0.86, + "grad_norm": 0.49368128180503845, + "learning_rate": 0.0004856715033770941, + "loss": 2.9334, + "step": 17604 + }, + { + "epoch": 0.86, + "grad_norm": 0.5106991529464722, + "learning_rate": 0.00048565940995094097, + "loss": 3.1075, + "step": 17605 + }, + { + "epoch": 0.86, + "grad_norm": 0.5301958918571472, + "learning_rate": 0.0004856473160357967, + "loss": 2.885, + "step": 17606 + }, + { + "epoch": 0.86, + "grad_norm": 0.5249580144882202, + "learning_rate": 0.00048563522163169314, + "loss": 3.176, + "step": 17607 + }, + { + "epoch": 0.86, + "grad_norm": 0.5821053385734558, + "learning_rate": 0.000485623126738662, + "loss": 3.2309, + "step": 17608 + }, + { + "epoch": 0.86, + "grad_norm": 0.5431340932846069, + "learning_rate": 0.00048561103135673546, + "loss": 3.0735, + "step": 17609 + }, + { + "epoch": 0.86, + "grad_norm": 0.5106194019317627, + "learning_rate": 0.000485598935485945, + "loss": 3.1356, + "step": 17610 + }, + { + "epoch": 0.86, + "grad_norm": 0.5526049733161926, + "learning_rate": 0.00048558683912632277, + "loss": 3.2272, + "step": 17611 + }, + { + "epoch": 0.86, + "grad_norm": 0.5209463238716125, + "learning_rate": 0.00048557474227790056, + "loss": 3.1295, + "step": 17612 + }, + { + "epoch": 0.86, + "grad_norm": 0.5306352376937866, + "learning_rate": 0.00048556264494071014, + "loss": 3.151, + "step": 17613 + }, + { + "epoch": 0.86, + "grad_norm": 0.483786016702652, + "learning_rate": 0.00048555054711478345, + "loss": 3.1528, + "step": 17614 + }, + { + "epoch": 0.86, + "grad_norm": 0.5121860504150391, + "learning_rate": 0.00048553844880015225, + "loss": 3.2255, + "step": 17615 + }, + { + "epoch": 0.86, + "grad_norm": 0.5123046040534973, + "learning_rate": 0.0004855263499968486, + "loss": 3.2583, + "step": 17616 + }, + { + "epoch": 0.86, + "grad_norm": 0.5222660899162292, + "learning_rate": 0.0004855142507049042, + "loss": 3.3446, + "step": 17617 + }, + { + "epoch": 0.86, + "grad_norm": 0.5142706632614136, + "learning_rate": 0.00048550215092435094, + "loss": 3.2527, + "step": 17618 + }, + { + "epoch": 0.86, + "grad_norm": 0.5308032035827637, + "learning_rate": 0.00048549005065522073, + "loss": 3.3778, + "step": 17619 + }, + { + "epoch": 0.86, + "grad_norm": 0.5231295228004456, + "learning_rate": 0.00048547794989754544, + "loss": 3.0498, + "step": 17620 + }, + { + "epoch": 0.86, + "grad_norm": 0.5291686654090881, + "learning_rate": 0.00048546584865135684, + "loss": 3.1601, + "step": 17621 + }, + { + "epoch": 0.86, + "grad_norm": 0.5070790648460388, + "learning_rate": 0.00048545374691668703, + "loss": 3.2343, + "step": 17622 + }, + { + "epoch": 0.86, + "grad_norm": 0.5530633926391602, + "learning_rate": 0.00048544164469356766, + "loss": 3.076, + "step": 17623 + }, + { + "epoch": 0.86, + "grad_norm": 0.4944988489151001, + "learning_rate": 0.0004854295419820307, + "loss": 2.9956, + "step": 17624 + }, + { + "epoch": 0.86, + "grad_norm": 0.5238565802574158, + "learning_rate": 0.000485417438782108, + "loss": 2.9795, + "step": 17625 + }, + { + "epoch": 0.86, + "grad_norm": 0.5411428809165955, + "learning_rate": 0.00048540533509383143, + "loss": 3.3082, + "step": 17626 + }, + { + "epoch": 0.86, + "grad_norm": 0.4969344437122345, + "learning_rate": 0.00048539323091723296, + "loss": 3.0575, + "step": 17627 + }, + { + "epoch": 0.86, + "grad_norm": 0.5270441174507141, + "learning_rate": 0.00048538112625234436, + "loss": 3.0456, + "step": 17628 + }, + { + "epoch": 0.86, + "grad_norm": 0.5126833915710449, + "learning_rate": 0.00048536902109919756, + "loss": 3.183, + "step": 17629 + }, + { + "epoch": 0.86, + "grad_norm": 0.5584427118301392, + "learning_rate": 0.00048535691545782445, + "loss": 3.2503, + "step": 17630 + }, + { + "epoch": 0.86, + "grad_norm": 0.5255547761917114, + "learning_rate": 0.0004853448093282569, + "loss": 3.2817, + "step": 17631 + }, + { + "epoch": 0.86, + "grad_norm": 0.5031391382217407, + "learning_rate": 0.0004853327027105267, + "loss": 3.1273, + "step": 17632 + }, + { + "epoch": 0.86, + "grad_norm": 0.5100876092910767, + "learning_rate": 0.0004853205956046659, + "loss": 3.3041, + "step": 17633 + }, + { + "epoch": 0.86, + "grad_norm": 0.5045557022094727, + "learning_rate": 0.0004853084880107064, + "loss": 3.3764, + "step": 17634 + }, + { + "epoch": 0.86, + "grad_norm": 0.512071430683136, + "learning_rate": 0.00048529637992867985, + "loss": 3.2025, + "step": 17635 + }, + { + "epoch": 0.86, + "grad_norm": 0.505384087562561, + "learning_rate": 0.00048528427135861835, + "loss": 3.1017, + "step": 17636 + }, + { + "epoch": 0.86, + "grad_norm": 0.5387892127037048, + "learning_rate": 0.0004852721623005538, + "loss": 2.9785, + "step": 17637 + }, + { + "epoch": 0.86, + "grad_norm": 0.5180559158325195, + "learning_rate": 0.00048526005275451804, + "loss": 3.3379, + "step": 17638 + }, + { + "epoch": 0.86, + "grad_norm": 0.5065121650695801, + "learning_rate": 0.0004852479427205428, + "loss": 3.1839, + "step": 17639 + }, + { + "epoch": 0.86, + "grad_norm": 0.5370428562164307, + "learning_rate": 0.00048523583219866023, + "loss": 3.4099, + "step": 17640 + }, + { + "epoch": 0.86, + "grad_norm": 0.5340681076049805, + "learning_rate": 0.0004852237211889022, + "loss": 3.2899, + "step": 17641 + }, + { + "epoch": 0.86, + "grad_norm": 0.524978518486023, + "learning_rate": 0.0004852116096913004, + "loss": 3.0903, + "step": 17642 + }, + { + "epoch": 0.86, + "grad_norm": 0.5199117660522461, + "learning_rate": 0.000485199497705887, + "loss": 3.3207, + "step": 17643 + }, + { + "epoch": 0.86, + "grad_norm": 0.5268123149871826, + "learning_rate": 0.00048518738523269366, + "loss": 3.1258, + "step": 17644 + }, + { + "epoch": 0.86, + "grad_norm": 0.5221728086471558, + "learning_rate": 0.0004851752722717524, + "loss": 3.0099, + "step": 17645 + }, + { + "epoch": 0.86, + "grad_norm": 0.5612000226974487, + "learning_rate": 0.00048516315882309513, + "loss": 3.1997, + "step": 17646 + }, + { + "epoch": 0.86, + "grad_norm": 0.4834546148777008, + "learning_rate": 0.00048515104488675373, + "loss": 3.2142, + "step": 17647 + }, + { + "epoch": 0.86, + "grad_norm": 0.5152857899665833, + "learning_rate": 0.0004851389304627601, + "loss": 3.0943, + "step": 17648 + }, + { + "epoch": 0.86, + "grad_norm": 0.6461136937141418, + "learning_rate": 0.0004851268155511462, + "loss": 3.0051, + "step": 17649 + }, + { + "epoch": 0.86, + "grad_norm": 0.5414785146713257, + "learning_rate": 0.00048511470015194394, + "loss": 3.1216, + "step": 17650 + }, + { + "epoch": 0.87, + "grad_norm": 0.5066803693771362, + "learning_rate": 0.000485102584265185, + "loss": 2.9696, + "step": 17651 + }, + { + "epoch": 0.87, + "grad_norm": 0.5823258757591248, + "learning_rate": 0.0004850904678909016, + "loss": 3.2366, + "step": 17652 + }, + { + "epoch": 0.87, + "grad_norm": 0.5320606231689453, + "learning_rate": 0.0004850783510291256, + "loss": 2.8672, + "step": 17653 + }, + { + "epoch": 0.87, + "grad_norm": 0.5367481708526611, + "learning_rate": 0.0004850662336798888, + "loss": 3.3241, + "step": 17654 + }, + { + "epoch": 0.87, + "grad_norm": 0.5208763480186462, + "learning_rate": 0.00048505411584322304, + "loss": 3.123, + "step": 17655 + }, + { + "epoch": 0.87, + "grad_norm": 0.5208843946456909, + "learning_rate": 0.00048504199751916045, + "loss": 3.0314, + "step": 17656 + }, + { + "epoch": 0.87, + "grad_norm": 0.5739818811416626, + "learning_rate": 0.00048502987870773287, + "loss": 3.1522, + "step": 17657 + }, + { + "epoch": 0.87, + "grad_norm": 0.5213282108306885, + "learning_rate": 0.0004850177594089722, + "loss": 3.227, + "step": 17658 + }, + { + "epoch": 0.87, + "grad_norm": 0.4983675479888916, + "learning_rate": 0.00048500563962291024, + "loss": 3.1158, + "step": 17659 + }, + { + "epoch": 0.87, + "grad_norm": 0.5267578363418579, + "learning_rate": 0.00048499351934957915, + "loss": 3.216, + "step": 17660 + }, + { + "epoch": 0.87, + "grad_norm": 0.4949793219566345, + "learning_rate": 0.0004849813985890107, + "loss": 2.8112, + "step": 17661 + }, + { + "epoch": 0.87, + "grad_norm": 0.5270505547523499, + "learning_rate": 0.0004849692773412368, + "loss": 3.1422, + "step": 17662 + }, + { + "epoch": 0.87, + "grad_norm": 0.604992151260376, + "learning_rate": 0.00048495715560628946, + "loss": 3.2086, + "step": 17663 + }, + { + "epoch": 0.87, + "grad_norm": 0.5165683627128601, + "learning_rate": 0.0004849450333842006, + "loss": 3.1152, + "step": 17664 + }, + { + "epoch": 0.87, + "grad_norm": 0.49626970291137695, + "learning_rate": 0.00048493291067500214, + "loss": 3.0413, + "step": 17665 + }, + { + "epoch": 0.87, + "grad_norm": 0.5042181611061096, + "learning_rate": 0.0004849207874787258, + "loss": 3.1962, + "step": 17666 + }, + { + "epoch": 0.87, + "grad_norm": 0.5245406627655029, + "learning_rate": 0.0004849086637954039, + "loss": 3.0042, + "step": 17667 + }, + { + "epoch": 0.87, + "grad_norm": 0.49068549275398254, + "learning_rate": 0.00048489653962506806, + "loss": 3.0559, + "step": 17668 + }, + { + "epoch": 0.87, + "grad_norm": 0.576702892780304, + "learning_rate": 0.0004848844149677504, + "loss": 2.9814, + "step": 17669 + }, + { + "epoch": 0.87, + "grad_norm": 0.48933812975883484, + "learning_rate": 0.0004848722898234827, + "loss": 3.1889, + "step": 17670 + }, + { + "epoch": 0.87, + "grad_norm": 0.5341169834136963, + "learning_rate": 0.00048486016419229696, + "loss": 3.2209, + "step": 17671 + }, + { + "epoch": 0.87, + "grad_norm": 0.5466429591178894, + "learning_rate": 0.0004848480380742252, + "loss": 3.189, + "step": 17672 + }, + { + "epoch": 0.87, + "grad_norm": 0.5925959348678589, + "learning_rate": 0.00048483591146929926, + "loss": 3.4804, + "step": 17673 + }, + { + "epoch": 0.87, + "grad_norm": 0.544042706489563, + "learning_rate": 0.00048482378437755103, + "loss": 3.2635, + "step": 17674 + }, + { + "epoch": 0.87, + "grad_norm": 0.518909752368927, + "learning_rate": 0.0004848116567990126, + "loss": 3.3417, + "step": 17675 + }, + { + "epoch": 0.87, + "grad_norm": 0.5055917501449585, + "learning_rate": 0.0004847995287337158, + "loss": 3.0895, + "step": 17676 + }, + { + "epoch": 0.87, + "grad_norm": 0.5370952486991882, + "learning_rate": 0.0004847874001816926, + "loss": 3.2929, + "step": 17677 + }, + { + "epoch": 0.87, + "grad_norm": 0.5014994144439697, + "learning_rate": 0.00048477527114297494, + "loss": 3.09, + "step": 17678 + }, + { + "epoch": 0.87, + "grad_norm": 0.5466054081916809, + "learning_rate": 0.00048476314161759486, + "loss": 3.032, + "step": 17679 + }, + { + "epoch": 0.87, + "grad_norm": 0.554060161113739, + "learning_rate": 0.0004847510116055842, + "loss": 3.1949, + "step": 17680 + }, + { + "epoch": 0.87, + "grad_norm": 0.5110071897506714, + "learning_rate": 0.00048473888110697484, + "loss": 3.1546, + "step": 17681 + }, + { + "epoch": 0.87, + "grad_norm": 0.5429432392120361, + "learning_rate": 0.00048472675012179887, + "loss": 3.1586, + "step": 17682 + }, + { + "epoch": 0.87, + "grad_norm": 0.48281505703926086, + "learning_rate": 0.00048471461865008816, + "loss": 3.0572, + "step": 17683 + }, + { + "epoch": 0.87, + "grad_norm": 0.5270022749900818, + "learning_rate": 0.00048470248669187484, + "loss": 3.2111, + "step": 17684 + }, + { + "epoch": 0.87, + "grad_norm": 0.4998297691345215, + "learning_rate": 0.0004846903542471906, + "loss": 3.1914, + "step": 17685 + }, + { + "epoch": 0.87, + "grad_norm": 0.5160195231437683, + "learning_rate": 0.00048467822131606747, + "loss": 3.1542, + "step": 17686 + }, + { + "epoch": 0.87, + "grad_norm": 0.5145862698554993, + "learning_rate": 0.0004846660878985375, + "loss": 2.9667, + "step": 17687 + }, + { + "epoch": 0.87, + "grad_norm": 0.5544316172599792, + "learning_rate": 0.00048465395399463257, + "loss": 3.1801, + "step": 17688 + }, + { + "epoch": 0.87, + "grad_norm": 0.4879906177520752, + "learning_rate": 0.00048464181960438475, + "loss": 3.2567, + "step": 17689 + }, + { + "epoch": 0.87, + "grad_norm": 0.4931853711605072, + "learning_rate": 0.00048462968472782586, + "loss": 3.2061, + "step": 17690 + }, + { + "epoch": 0.87, + "grad_norm": 0.5492919683456421, + "learning_rate": 0.00048461754936498787, + "loss": 3.0187, + "step": 17691 + }, + { + "epoch": 0.87, + "grad_norm": 0.5467280149459839, + "learning_rate": 0.00048460541351590277, + "loss": 3.1143, + "step": 17692 + }, + { + "epoch": 0.87, + "grad_norm": 0.49284547567367554, + "learning_rate": 0.0004845932771806026, + "loss": 3.1644, + "step": 17693 + }, + { + "epoch": 0.87, + "grad_norm": 0.5207919478416443, + "learning_rate": 0.0004845811403591193, + "loss": 3.1206, + "step": 17694 + }, + { + "epoch": 0.87, + "grad_norm": 0.5158400535583496, + "learning_rate": 0.00048456900305148475, + "loss": 3.1989, + "step": 17695 + }, + { + "epoch": 0.87, + "grad_norm": 0.4813472330570221, + "learning_rate": 0.00048455686525773094, + "loss": 2.9873, + "step": 17696 + }, + { + "epoch": 0.87, + "grad_norm": 0.49759748578071594, + "learning_rate": 0.0004845447269778898, + "loss": 3.2696, + "step": 17697 + }, + { + "epoch": 0.87, + "grad_norm": 0.5683988332748413, + "learning_rate": 0.00048453258821199347, + "loss": 2.8853, + "step": 17698 + }, + { + "epoch": 0.87, + "grad_norm": 0.5115375518798828, + "learning_rate": 0.0004845204489600738, + "loss": 3.1641, + "step": 17699 + }, + { + "epoch": 0.87, + "grad_norm": 0.5470658540725708, + "learning_rate": 0.0004845083092221629, + "loss": 3.1421, + "step": 17700 + }, + { + "epoch": 0.87, + "grad_norm": 0.5110447406768799, + "learning_rate": 0.0004844961689982924, + "loss": 3.2852, + "step": 17701 + }, + { + "epoch": 0.87, + "grad_norm": 0.5411603450775146, + "learning_rate": 0.0004844840282884947, + "loss": 3.1624, + "step": 17702 + }, + { + "epoch": 0.87, + "grad_norm": 0.5393169522285461, + "learning_rate": 0.00048447188709280144, + "loss": 3.1563, + "step": 17703 + }, + { + "epoch": 0.87, + "grad_norm": 0.480815052986145, + "learning_rate": 0.00048445974541124474, + "loss": 3.3201, + "step": 17704 + }, + { + "epoch": 0.87, + "grad_norm": 0.5160706639289856, + "learning_rate": 0.00048444760324385655, + "loss": 3.3324, + "step": 17705 + }, + { + "epoch": 0.87, + "grad_norm": 0.5325861573219299, + "learning_rate": 0.0004844354605906689, + "loss": 3.0338, + "step": 17706 + }, + { + "epoch": 0.87, + "grad_norm": 0.4856245517730713, + "learning_rate": 0.0004844233174517138, + "loss": 3.1634, + "step": 17707 + }, + { + "epoch": 0.87, + "grad_norm": 0.5197152495384216, + "learning_rate": 0.00048441117382702316, + "loss": 3.1673, + "step": 17708 + }, + { + "epoch": 0.87, + "grad_norm": 0.537200927734375, + "learning_rate": 0.0004843990297166289, + "loss": 3.1794, + "step": 17709 + }, + { + "epoch": 0.87, + "grad_norm": 0.5353251695632935, + "learning_rate": 0.00048438688512056317, + "loss": 3.0948, + "step": 17710 + }, + { + "epoch": 0.87, + "grad_norm": 0.5023453831672668, + "learning_rate": 0.0004843747400388579, + "loss": 3.2727, + "step": 17711 + }, + { + "epoch": 0.87, + "grad_norm": 0.5291193127632141, + "learning_rate": 0.00048436259447154497, + "loss": 2.969, + "step": 17712 + }, + { + "epoch": 0.87, + "grad_norm": 0.5221428871154785, + "learning_rate": 0.00048435044841865646, + "loss": 3.1545, + "step": 17713 + }, + { + "epoch": 0.87, + "grad_norm": 0.5405849814414978, + "learning_rate": 0.00048433830188022433, + "loss": 3.109, + "step": 17714 + }, + { + "epoch": 0.87, + "grad_norm": 0.518534779548645, + "learning_rate": 0.0004843261548562806, + "loss": 3.2215, + "step": 17715 + }, + { + "epoch": 0.87, + "grad_norm": 0.5127702951431274, + "learning_rate": 0.00048431400734685724, + "loss": 3.1477, + "step": 17716 + }, + { + "epoch": 0.87, + "grad_norm": 0.49492380023002625, + "learning_rate": 0.0004843018593519863, + "loss": 3.1372, + "step": 17717 + }, + { + "epoch": 0.87, + "grad_norm": 0.5577251315116882, + "learning_rate": 0.0004842897108716997, + "loss": 3.078, + "step": 17718 + }, + { + "epoch": 0.87, + "grad_norm": 0.5143783092498779, + "learning_rate": 0.0004842775619060295, + "loss": 3.0365, + "step": 17719 + }, + { + "epoch": 0.87, + "grad_norm": 0.5474133491516113, + "learning_rate": 0.0004842654124550077, + "loss": 3.0492, + "step": 17720 + }, + { + "epoch": 0.87, + "grad_norm": 0.5189671516418457, + "learning_rate": 0.0004842532625186661, + "loss": 3.1727, + "step": 17721 + }, + { + "epoch": 0.87, + "grad_norm": 0.5471231341362, + "learning_rate": 0.00048424111209703706, + "loss": 3.169, + "step": 17722 + }, + { + "epoch": 0.87, + "grad_norm": 0.4973917603492737, + "learning_rate": 0.00048422896119015233, + "loss": 3.0737, + "step": 17723 + }, + { + "epoch": 0.87, + "grad_norm": 0.5816762447357178, + "learning_rate": 0.00048421680979804393, + "loss": 3.1443, + "step": 17724 + }, + { + "epoch": 0.87, + "grad_norm": 0.514470100402832, + "learning_rate": 0.0004842046579207439, + "loss": 3.111, + "step": 17725 + }, + { + "epoch": 0.87, + "grad_norm": 0.5202110409736633, + "learning_rate": 0.0004841925055582843, + "loss": 3.2958, + "step": 17726 + }, + { + "epoch": 0.87, + "grad_norm": 0.48983487486839294, + "learning_rate": 0.0004841803527106971, + "loss": 3.0307, + "step": 17727 + }, + { + "epoch": 0.87, + "grad_norm": 0.5123018026351929, + "learning_rate": 0.0004841681993780142, + "loss": 3.0805, + "step": 17728 + }, + { + "epoch": 0.87, + "grad_norm": 0.5432770252227783, + "learning_rate": 0.00048415604556026787, + "loss": 3.2386, + "step": 17729 + }, + { + "epoch": 0.87, + "grad_norm": 0.5174310207366943, + "learning_rate": 0.00048414389125748977, + "loss": 3.2467, + "step": 17730 + }, + { + "epoch": 0.87, + "grad_norm": 0.503591001033783, + "learning_rate": 0.00048413173646971226, + "loss": 3.1232, + "step": 17731 + }, + { + "epoch": 0.87, + "grad_norm": 0.5538753271102905, + "learning_rate": 0.00048411958119696716, + "loss": 2.9704, + "step": 17732 + }, + { + "epoch": 0.87, + "grad_norm": 0.5586168766021729, + "learning_rate": 0.0004841074254392864, + "loss": 3.0653, + "step": 17733 + }, + { + "epoch": 0.87, + "grad_norm": 0.5222452282905579, + "learning_rate": 0.0004840952691967022, + "loss": 3.0688, + "step": 17734 + }, + { + "epoch": 0.87, + "grad_norm": 0.5168008804321289, + "learning_rate": 0.0004840831124692465, + "loss": 3.0731, + "step": 17735 + }, + { + "epoch": 0.87, + "grad_norm": 0.4989527761936188, + "learning_rate": 0.00048407095525695125, + "loss": 3.0839, + "step": 17736 + }, + { + "epoch": 0.87, + "grad_norm": 0.5122324824333191, + "learning_rate": 0.0004840587975598486, + "loss": 3.1072, + "step": 17737 + }, + { + "epoch": 0.87, + "grad_norm": 0.5376427173614502, + "learning_rate": 0.0004840466393779704, + "loss": 3.2577, + "step": 17738 + }, + { + "epoch": 0.87, + "grad_norm": 0.5357086062431335, + "learning_rate": 0.00048403448071134887, + "loss": 3.2627, + "step": 17739 + }, + { + "epoch": 0.87, + "grad_norm": 0.530827522277832, + "learning_rate": 0.000484022321560016, + "loss": 3.1058, + "step": 17740 + }, + { + "epoch": 0.87, + "grad_norm": 0.5042960047721863, + "learning_rate": 0.0004840101619240036, + "loss": 3.2674, + "step": 17741 + }, + { + "epoch": 0.87, + "grad_norm": 0.5273114442825317, + "learning_rate": 0.00048399800180334396, + "loss": 3.2993, + "step": 17742 + }, + { + "epoch": 0.87, + "grad_norm": 0.5568901300430298, + "learning_rate": 0.0004839858411980689, + "loss": 3.0452, + "step": 17743 + }, + { + "epoch": 0.87, + "grad_norm": 0.5302985906600952, + "learning_rate": 0.0004839736801082106, + "loss": 3.0976, + "step": 17744 + }, + { + "epoch": 0.87, + "grad_norm": 0.5017320513725281, + "learning_rate": 0.00048396151853380106, + "loss": 3.0252, + "step": 17745 + }, + { + "epoch": 0.87, + "grad_norm": 0.5245184302330017, + "learning_rate": 0.00048394935647487226, + "loss": 2.9773, + "step": 17746 + }, + { + "epoch": 0.87, + "grad_norm": 0.5124706029891968, + "learning_rate": 0.00048393719393145617, + "loss": 3.0761, + "step": 17747 + }, + { + "epoch": 0.87, + "grad_norm": 0.5464129447937012, + "learning_rate": 0.000483925030903585, + "loss": 3.1286, + "step": 17748 + }, + { + "epoch": 0.87, + "grad_norm": 0.5206067562103271, + "learning_rate": 0.0004839128673912907, + "loss": 3.2381, + "step": 17749 + }, + { + "epoch": 0.87, + "grad_norm": 0.5206215381622314, + "learning_rate": 0.00048390070339460526, + "loss": 3.0297, + "step": 17750 + }, + { + "epoch": 0.87, + "grad_norm": 0.5299220085144043, + "learning_rate": 0.0004838885389135608, + "loss": 3.2162, + "step": 17751 + }, + { + "epoch": 0.87, + "grad_norm": 0.5235497355461121, + "learning_rate": 0.00048387637394818925, + "loss": 3.0847, + "step": 17752 + }, + { + "epoch": 0.87, + "grad_norm": 0.5170539021492004, + "learning_rate": 0.0004838642084985228, + "loss": 3.1975, + "step": 17753 + }, + { + "epoch": 0.87, + "grad_norm": 0.5105252265930176, + "learning_rate": 0.00048385204256459334, + "loss": 3.2036, + "step": 17754 + }, + { + "epoch": 0.87, + "grad_norm": 0.5179150700569153, + "learning_rate": 0.00048383987614643303, + "loss": 3.1076, + "step": 17755 + }, + { + "epoch": 0.87, + "grad_norm": 0.5759047865867615, + "learning_rate": 0.0004838277092440739, + "loss": 3.0669, + "step": 17756 + }, + { + "epoch": 0.87, + "grad_norm": 0.4824429452419281, + "learning_rate": 0.0004838155418575479, + "loss": 3.1084, + "step": 17757 + }, + { + "epoch": 0.87, + "grad_norm": 0.5309851765632629, + "learning_rate": 0.00048380337398688713, + "loss": 3.0484, + "step": 17758 + }, + { + "epoch": 0.87, + "grad_norm": 0.5082951188087463, + "learning_rate": 0.00048379120563212365, + "loss": 3.0945, + "step": 17759 + }, + { + "epoch": 0.87, + "grad_norm": 0.5423614382743835, + "learning_rate": 0.0004837790367932896, + "loss": 3.0284, + "step": 17760 + }, + { + "epoch": 0.87, + "grad_norm": 0.503147304058075, + "learning_rate": 0.00048376686747041684, + "loss": 3.1037, + "step": 17761 + }, + { + "epoch": 0.87, + "grad_norm": 0.5271511077880859, + "learning_rate": 0.00048375469766353754, + "loss": 3.2318, + "step": 17762 + }, + { + "epoch": 0.87, + "grad_norm": 0.5349052548408508, + "learning_rate": 0.0004837425273726838, + "loss": 3.2816, + "step": 17763 + }, + { + "epoch": 0.87, + "grad_norm": 0.5517288446426392, + "learning_rate": 0.0004837303565978875, + "loss": 3.2532, + "step": 17764 + }, + { + "epoch": 0.87, + "grad_norm": 0.5327991843223572, + "learning_rate": 0.00048371818533918075, + "loss": 2.9928, + "step": 17765 + }, + { + "epoch": 0.87, + "grad_norm": 0.5061706304550171, + "learning_rate": 0.0004837060135965958, + "loss": 3.2314, + "step": 17766 + }, + { + "epoch": 0.87, + "grad_norm": 0.5411424040794373, + "learning_rate": 0.00048369384137016456, + "loss": 3.1747, + "step": 17767 + }, + { + "epoch": 0.87, + "grad_norm": 0.4953779876232147, + "learning_rate": 0.000483681668659919, + "loss": 3.2839, + "step": 17768 + }, + { + "epoch": 0.87, + "grad_norm": 0.5827047824859619, + "learning_rate": 0.0004836694954658913, + "loss": 3.2348, + "step": 17769 + }, + { + "epoch": 0.87, + "grad_norm": 0.5480215549468994, + "learning_rate": 0.00048365732178811354, + "loss": 3.1482, + "step": 17770 + }, + { + "epoch": 0.87, + "grad_norm": 0.4860683083534241, + "learning_rate": 0.00048364514762661774, + "loss": 3.1271, + "step": 17771 + }, + { + "epoch": 0.87, + "grad_norm": 0.552074134349823, + "learning_rate": 0.0004836329729814359, + "loss": 3.3587, + "step": 17772 + }, + { + "epoch": 0.87, + "grad_norm": 0.4946533441543579, + "learning_rate": 0.00048362079785260027, + "loss": 3.0403, + "step": 17773 + }, + { + "epoch": 0.87, + "grad_norm": 0.5436502695083618, + "learning_rate": 0.00048360862224014267, + "loss": 3.1044, + "step": 17774 + }, + { + "epoch": 0.87, + "grad_norm": 0.5207213163375854, + "learning_rate": 0.00048359644614409534, + "loss": 3.1488, + "step": 17775 + }, + { + "epoch": 0.87, + "grad_norm": 0.5504875779151917, + "learning_rate": 0.00048358426956449026, + "loss": 3.2646, + "step": 17776 + }, + { + "epoch": 0.87, + "grad_norm": 0.517497718334198, + "learning_rate": 0.00048357209250135964, + "loss": 3.1974, + "step": 17777 + }, + { + "epoch": 0.87, + "grad_norm": 0.5165881514549255, + "learning_rate": 0.00048355991495473545, + "loss": 2.9446, + "step": 17778 + }, + { + "epoch": 0.87, + "grad_norm": 0.5115556716918945, + "learning_rate": 0.0004835477369246497, + "loss": 3.0173, + "step": 17779 + }, + { + "epoch": 0.87, + "grad_norm": 0.5526302456855774, + "learning_rate": 0.00048353555841113455, + "loss": 3.1211, + "step": 17780 + }, + { + "epoch": 0.87, + "grad_norm": 0.5087662935256958, + "learning_rate": 0.00048352337941422207, + "loss": 3.1776, + "step": 17781 + }, + { + "epoch": 0.87, + "grad_norm": 0.5549590587615967, + "learning_rate": 0.0004835111999339444, + "loss": 3.0331, + "step": 17782 + }, + { + "epoch": 0.87, + "grad_norm": 0.5352929830551147, + "learning_rate": 0.00048349901997033347, + "loss": 3.0673, + "step": 17783 + }, + { + "epoch": 0.87, + "grad_norm": 0.5016866326332092, + "learning_rate": 0.00048348683952342136, + "loss": 3.2462, + "step": 17784 + }, + { + "epoch": 0.87, + "grad_norm": 0.4875168800354004, + "learning_rate": 0.0004834746585932404, + "loss": 3.2957, + "step": 17785 + }, + { + "epoch": 0.87, + "grad_norm": 0.5208585858345032, + "learning_rate": 0.0004834624771798224, + "loss": 3.3014, + "step": 17786 + }, + { + "epoch": 0.87, + "grad_norm": 0.5304600596427917, + "learning_rate": 0.00048345029528319954, + "loss": 3.0373, + "step": 17787 + }, + { + "epoch": 0.87, + "grad_norm": 0.5236985683441162, + "learning_rate": 0.00048343811290340395, + "loss": 3.3229, + "step": 17788 + }, + { + "epoch": 0.87, + "grad_norm": 0.5062572360038757, + "learning_rate": 0.0004834259300404676, + "loss": 3.0878, + "step": 17789 + }, + { + "epoch": 0.87, + "grad_norm": 0.5024277567863464, + "learning_rate": 0.00048341374669442274, + "loss": 3.1605, + "step": 17790 + }, + { + "epoch": 0.87, + "grad_norm": 0.5190656781196594, + "learning_rate": 0.0004834015628653013, + "loss": 3.1117, + "step": 17791 + }, + { + "epoch": 0.87, + "grad_norm": 0.514510452747345, + "learning_rate": 0.0004833893785531355, + "loss": 2.8972, + "step": 17792 + }, + { + "epoch": 0.87, + "grad_norm": 0.4951634705066681, + "learning_rate": 0.0004833771937579574, + "loss": 3.242, + "step": 17793 + }, + { + "epoch": 0.87, + "grad_norm": 0.5369077324867249, + "learning_rate": 0.000483365008479799, + "loss": 3.4729, + "step": 17794 + }, + { + "epoch": 0.87, + "grad_norm": 0.5123760104179382, + "learning_rate": 0.0004833528227186925, + "loss": 2.9924, + "step": 17795 + }, + { + "epoch": 0.87, + "grad_norm": 0.5074111223220825, + "learning_rate": 0.00048334063647466986, + "loss": 3.329, + "step": 17796 + }, + { + "epoch": 0.87, + "grad_norm": 0.5059533715248108, + "learning_rate": 0.0004833284497477634, + "loss": 3.0242, + "step": 17797 + }, + { + "epoch": 0.87, + "grad_norm": 0.5089158415794373, + "learning_rate": 0.0004833162625380049, + "loss": 3.2417, + "step": 17798 + }, + { + "epoch": 0.87, + "grad_norm": 0.5231832265853882, + "learning_rate": 0.0004833040748454268, + "loss": 3.1011, + "step": 17799 + }, + { + "epoch": 0.87, + "grad_norm": 0.5030272006988525, + "learning_rate": 0.00048329188667006095, + "loss": 3.251, + "step": 17800 + }, + { + "epoch": 0.87, + "grad_norm": 0.516279399394989, + "learning_rate": 0.0004832796980119396, + "loss": 2.9739, + "step": 17801 + }, + { + "epoch": 0.87, + "grad_norm": 0.4982314705848694, + "learning_rate": 0.0004832675088710948, + "loss": 3.1481, + "step": 17802 + }, + { + "epoch": 0.87, + "grad_norm": 0.5156276822090149, + "learning_rate": 0.00048325531924755865, + "loss": 3.2038, + "step": 17803 + }, + { + "epoch": 0.87, + "grad_norm": 0.5175352692604065, + "learning_rate": 0.0004832431291413633, + "loss": 3.0606, + "step": 17804 + }, + { + "epoch": 0.87, + "grad_norm": 0.5309943556785583, + "learning_rate": 0.0004832309385525407, + "loss": 3.1735, + "step": 17805 + }, + { + "epoch": 0.87, + "grad_norm": 0.5159737467765808, + "learning_rate": 0.00048321874748112316, + "loss": 2.9259, + "step": 17806 + }, + { + "epoch": 0.87, + "grad_norm": 0.5461140275001526, + "learning_rate": 0.00048320655592714267, + "loss": 3.0094, + "step": 17807 + }, + { + "epoch": 0.87, + "grad_norm": 0.5539809465408325, + "learning_rate": 0.00048319436389063144, + "loss": 3.1468, + "step": 17808 + }, + { + "epoch": 0.87, + "grad_norm": 0.5527653694152832, + "learning_rate": 0.00048318217137162145, + "loss": 3.0324, + "step": 17809 + }, + { + "epoch": 0.87, + "grad_norm": 0.5274596810340881, + "learning_rate": 0.00048316997837014486, + "loss": 3.2017, + "step": 17810 + }, + { + "epoch": 0.87, + "grad_norm": 0.5476181507110596, + "learning_rate": 0.00048315778488623376, + "loss": 3.2789, + "step": 17811 + }, + { + "epoch": 0.87, + "grad_norm": 0.5081568956375122, + "learning_rate": 0.0004831455909199204, + "loss": 3.1682, + "step": 17812 + }, + { + "epoch": 0.87, + "grad_norm": 0.5192145109176636, + "learning_rate": 0.00048313339647123677, + "loss": 3.3376, + "step": 17813 + }, + { + "epoch": 0.87, + "grad_norm": 0.5384969115257263, + "learning_rate": 0.00048312120154021495, + "loss": 3.1973, + "step": 17814 + }, + { + "epoch": 0.87, + "grad_norm": 0.543453574180603, + "learning_rate": 0.00048310900612688726, + "loss": 3.2764, + "step": 17815 + }, + { + "epoch": 0.87, + "grad_norm": 0.53953617811203, + "learning_rate": 0.00048309681023128557, + "loss": 2.9113, + "step": 17816 + }, + { + "epoch": 0.87, + "grad_norm": 0.5175008177757263, + "learning_rate": 0.00048308461385344214, + "loss": 3.3165, + "step": 17817 + }, + { + "epoch": 0.87, + "grad_norm": 0.5362492799758911, + "learning_rate": 0.0004830724169933891, + "loss": 3.1817, + "step": 17818 + }, + { + "epoch": 0.87, + "grad_norm": 0.5375868082046509, + "learning_rate": 0.0004830602196511586, + "loss": 3.4691, + "step": 17819 + }, + { + "epoch": 0.87, + "grad_norm": 0.486717164516449, + "learning_rate": 0.0004830480218267826, + "loss": 3.0631, + "step": 17820 + }, + { + "epoch": 0.87, + "grad_norm": 0.5145992040634155, + "learning_rate": 0.00048303582352029345, + "loss": 3.1301, + "step": 17821 + }, + { + "epoch": 0.87, + "grad_norm": 0.5075737833976746, + "learning_rate": 0.00048302362473172307, + "loss": 3.2556, + "step": 17822 + }, + { + "epoch": 0.87, + "grad_norm": 0.5056702494621277, + "learning_rate": 0.0004830114254611037, + "loss": 3.0136, + "step": 17823 + }, + { + "epoch": 0.87, + "grad_norm": 0.5289957523345947, + "learning_rate": 0.00048299922570846756, + "loss": 3.0611, + "step": 17824 + }, + { + "epoch": 0.87, + "grad_norm": 0.5203523635864258, + "learning_rate": 0.00048298702547384655, + "loss": 3.1454, + "step": 17825 + }, + { + "epoch": 0.87, + "grad_norm": 0.5633178949356079, + "learning_rate": 0.00048297482475727295, + "loss": 2.9976, + "step": 17826 + }, + { + "epoch": 0.87, + "grad_norm": 0.5474057197570801, + "learning_rate": 0.00048296262355877897, + "loss": 3.1389, + "step": 17827 + }, + { + "epoch": 0.87, + "grad_norm": 0.5257229208946228, + "learning_rate": 0.0004829504218783966, + "loss": 3.1458, + "step": 17828 + }, + { + "epoch": 0.87, + "grad_norm": 0.519266664981842, + "learning_rate": 0.000482938219716158, + "loss": 3.0224, + "step": 17829 + }, + { + "epoch": 0.87, + "grad_norm": 0.5056769251823425, + "learning_rate": 0.0004829260170720953, + "loss": 3.2842, + "step": 17830 + }, + { + "epoch": 0.87, + "grad_norm": 0.5659570693969727, + "learning_rate": 0.0004829138139462408, + "loss": 2.9025, + "step": 17831 + }, + { + "epoch": 0.87, + "grad_norm": 0.4957529306411743, + "learning_rate": 0.00048290161033862636, + "loss": 3.181, + "step": 17832 + }, + { + "epoch": 0.87, + "grad_norm": 0.5560140609741211, + "learning_rate": 0.0004828894062492844, + "loss": 3.1902, + "step": 17833 + }, + { + "epoch": 0.87, + "grad_norm": 0.5515144467353821, + "learning_rate": 0.00048287720167824696, + "loss": 2.9447, + "step": 17834 + }, + { + "epoch": 0.87, + "grad_norm": 0.5933049917221069, + "learning_rate": 0.00048286499662554604, + "loss": 3.0005, + "step": 17835 + }, + { + "epoch": 0.87, + "grad_norm": 0.5094730854034424, + "learning_rate": 0.000482852791091214, + "loss": 3.3128, + "step": 17836 + }, + { + "epoch": 0.87, + "grad_norm": 0.4971897304058075, + "learning_rate": 0.0004828405850752829, + "loss": 3.2185, + "step": 17837 + }, + { + "epoch": 0.87, + "grad_norm": 0.5200973749160767, + "learning_rate": 0.0004828283785777848, + "loss": 3.1677, + "step": 17838 + }, + { + "epoch": 0.87, + "grad_norm": 0.5161073207855225, + "learning_rate": 0.00048281617159875203, + "loss": 3.3305, + "step": 17839 + }, + { + "epoch": 0.87, + "grad_norm": 0.5352292656898499, + "learning_rate": 0.0004828039641382167, + "loss": 3.2175, + "step": 17840 + }, + { + "epoch": 0.87, + "grad_norm": 0.49615222215652466, + "learning_rate": 0.00048279175619621073, + "loss": 3.2629, + "step": 17841 + }, + { + "epoch": 0.87, + "grad_norm": 0.5312261581420898, + "learning_rate": 0.0004827795477727666, + "loss": 3.0147, + "step": 17842 + }, + { + "epoch": 0.87, + "grad_norm": 0.5360621809959412, + "learning_rate": 0.0004827673388679163, + "loss": 3.0288, + "step": 17843 + }, + { + "epoch": 0.87, + "grad_norm": 0.5203635096549988, + "learning_rate": 0.000482755129481692, + "loss": 3.1102, + "step": 17844 + }, + { + "epoch": 0.87, + "grad_norm": 0.522244930267334, + "learning_rate": 0.0004827429196141259, + "loss": 3.4478, + "step": 17845 + }, + { + "epoch": 0.87, + "grad_norm": 0.5168775916099548, + "learning_rate": 0.00048273070926525, + "loss": 3.2777, + "step": 17846 + }, + { + "epoch": 0.87, + "grad_norm": 0.5495973229408264, + "learning_rate": 0.0004827184984350966, + "loss": 3.2903, + "step": 17847 + }, + { + "epoch": 0.87, + "grad_norm": 0.537128746509552, + "learning_rate": 0.0004827062871236979, + "loss": 3.3552, + "step": 17848 + }, + { + "epoch": 0.87, + "grad_norm": 0.5331090092658997, + "learning_rate": 0.00048269407533108597, + "loss": 3.1168, + "step": 17849 + }, + { + "epoch": 0.87, + "grad_norm": 0.4784494936466217, + "learning_rate": 0.00048268186305729305, + "loss": 3.2552, + "step": 17850 + }, + { + "epoch": 0.87, + "grad_norm": 0.5230741500854492, + "learning_rate": 0.00048266965030235116, + "loss": 3.164, + "step": 17851 + }, + { + "epoch": 0.87, + "grad_norm": 0.4925912320613861, + "learning_rate": 0.0004826574370662927, + "loss": 3.0437, + "step": 17852 + }, + { + "epoch": 0.87, + "grad_norm": 0.48744553327560425, + "learning_rate": 0.00048264522334914964, + "loss": 3.0296, + "step": 17853 + }, + { + "epoch": 0.87, + "grad_norm": 0.4946111738681793, + "learning_rate": 0.0004826330091509542, + "loss": 2.9261, + "step": 17854 + }, + { + "epoch": 0.88, + "grad_norm": 0.523500919342041, + "learning_rate": 0.0004826207944717386, + "loss": 3.0822, + "step": 17855 + }, + { + "epoch": 0.88, + "grad_norm": 0.5203326344490051, + "learning_rate": 0.00048260857931153487, + "loss": 3.2367, + "step": 17856 + }, + { + "epoch": 0.88, + "grad_norm": 0.4974472224712372, + "learning_rate": 0.00048259636367037535, + "loss": 3.3098, + "step": 17857 + }, + { + "epoch": 0.88, + "grad_norm": 0.4856162667274475, + "learning_rate": 0.00048258414754829226, + "loss": 3.4445, + "step": 17858 + }, + { + "epoch": 0.88, + "grad_norm": 0.5150359869003296, + "learning_rate": 0.0004825719309453175, + "loss": 3.2439, + "step": 17859 + }, + { + "epoch": 0.88, + "grad_norm": 0.5459046959877014, + "learning_rate": 0.00048255971386148346, + "loss": 3.2596, + "step": 17860 + }, + { + "epoch": 0.88, + "grad_norm": 0.515584409236908, + "learning_rate": 0.0004825474962968223, + "loss": 3.2124, + "step": 17861 + }, + { + "epoch": 0.88, + "grad_norm": 0.4975724518299103, + "learning_rate": 0.00048253527825136615, + "loss": 3.0945, + "step": 17862 + }, + { + "epoch": 0.88, + "grad_norm": 0.5150234699249268, + "learning_rate": 0.00048252305972514725, + "loss": 2.9796, + "step": 17863 + }, + { + "epoch": 0.88, + "grad_norm": 0.5973613262176514, + "learning_rate": 0.0004825108407181977, + "loss": 3.1476, + "step": 17864 + }, + { + "epoch": 0.88, + "grad_norm": 0.514706015586853, + "learning_rate": 0.0004824986212305497, + "loss": 2.8594, + "step": 17865 + }, + { + "epoch": 0.88, + "grad_norm": 0.5519055128097534, + "learning_rate": 0.0004824864012622355, + "loss": 3.0926, + "step": 17866 + }, + { + "epoch": 0.88, + "grad_norm": 0.5024924278259277, + "learning_rate": 0.00048247418081328724, + "loss": 3.3669, + "step": 17867 + }, + { + "epoch": 0.88, + "grad_norm": 0.5211602449417114, + "learning_rate": 0.000482461959883737, + "loss": 3.0907, + "step": 17868 + }, + { + "epoch": 0.88, + "grad_norm": 0.49125707149505615, + "learning_rate": 0.0004824497384736171, + "loss": 3.233, + "step": 17869 + }, + { + "epoch": 0.88, + "grad_norm": 0.5268430709838867, + "learning_rate": 0.00048243751658295984, + "loss": 2.996, + "step": 17870 + }, + { + "epoch": 0.88, + "grad_norm": 0.5411224961280823, + "learning_rate": 0.00048242529421179715, + "loss": 3.2701, + "step": 17871 + }, + { + "epoch": 0.88, + "grad_norm": 0.534550666809082, + "learning_rate": 0.00048241307136016133, + "loss": 3.0247, + "step": 17872 + }, + { + "epoch": 0.88, + "grad_norm": 0.5058745741844177, + "learning_rate": 0.0004824008480280847, + "loss": 3.3005, + "step": 17873 + }, + { + "epoch": 0.88, + "grad_norm": 0.5441546440124512, + "learning_rate": 0.00048238862421559923, + "loss": 3.0935, + "step": 17874 + }, + { + "epoch": 0.88, + "grad_norm": 0.5148599147796631, + "learning_rate": 0.0004823763999227373, + "loss": 3.2805, + "step": 17875 + }, + { + "epoch": 0.88, + "grad_norm": 0.5039719939231873, + "learning_rate": 0.00048236417514953094, + "loss": 3.2335, + "step": 17876 + }, + { + "epoch": 0.88, + "grad_norm": 0.5283924341201782, + "learning_rate": 0.0004823519498960125, + "loss": 3.0868, + "step": 17877 + }, + { + "epoch": 0.88, + "grad_norm": 0.5511729717254639, + "learning_rate": 0.00048233972416221417, + "loss": 3.0867, + "step": 17878 + }, + { + "epoch": 0.88, + "grad_norm": 0.5116291642189026, + "learning_rate": 0.00048232749794816806, + "loss": 3.2757, + "step": 17879 + }, + { + "epoch": 0.88, + "grad_norm": 0.5327066779136658, + "learning_rate": 0.00048231527125390636, + "loss": 3.0075, + "step": 17880 + }, + { + "epoch": 0.88, + "grad_norm": 0.5154629945755005, + "learning_rate": 0.00048230304407946144, + "loss": 3.2752, + "step": 17881 + }, + { + "epoch": 0.88, + "grad_norm": 0.5452733635902405, + "learning_rate": 0.00048229081642486523, + "loss": 3.2965, + "step": 17882 + }, + { + "epoch": 0.88, + "grad_norm": 0.48976922035217285, + "learning_rate": 0.0004822785882901502, + "loss": 3.1155, + "step": 17883 + }, + { + "epoch": 0.88, + "grad_norm": 0.5699204206466675, + "learning_rate": 0.00048226635967534834, + "loss": 3.2289, + "step": 17884 + }, + { + "epoch": 0.88, + "grad_norm": 0.5284189581871033, + "learning_rate": 0.0004822541305804921, + "loss": 3.3774, + "step": 17885 + }, + { + "epoch": 0.88, + "grad_norm": 0.48988762497901917, + "learning_rate": 0.00048224190100561355, + "loss": 3.0301, + "step": 17886 + }, + { + "epoch": 0.88, + "grad_norm": 0.496111124753952, + "learning_rate": 0.00048222967095074476, + "loss": 2.9562, + "step": 17887 + }, + { + "epoch": 0.88, + "grad_norm": 0.510610044002533, + "learning_rate": 0.0004822174404159182, + "loss": 3.4054, + "step": 17888 + }, + { + "epoch": 0.88, + "grad_norm": 0.5512583255767822, + "learning_rate": 0.00048220520940116593, + "loss": 3.1477, + "step": 17889 + }, + { + "epoch": 0.88, + "grad_norm": 0.4915429651737213, + "learning_rate": 0.00048219297790652024, + "loss": 3.1844, + "step": 17890 + }, + { + "epoch": 0.88, + "grad_norm": 0.5592296719551086, + "learning_rate": 0.0004821807459320134, + "loss": 3.2548, + "step": 17891 + }, + { + "epoch": 0.88, + "grad_norm": 0.49113866686820984, + "learning_rate": 0.0004821685134776773, + "loss": 3.1582, + "step": 17892 + }, + { + "epoch": 0.88, + "grad_norm": 0.5161522626876831, + "learning_rate": 0.0004821562805435446, + "loss": 3.1451, + "step": 17893 + }, + { + "epoch": 0.88, + "grad_norm": 0.5243754982948303, + "learning_rate": 0.00048214404712964713, + "loss": 3.3366, + "step": 17894 + }, + { + "epoch": 0.88, + "grad_norm": 0.5705099701881409, + "learning_rate": 0.00048213181323601754, + "loss": 3.019, + "step": 17895 + }, + { + "epoch": 0.88, + "grad_norm": 0.48698729276657104, + "learning_rate": 0.00048211957886268764, + "loss": 2.9851, + "step": 17896 + }, + { + "epoch": 0.88, + "grad_norm": 0.5281487703323364, + "learning_rate": 0.0004821073440096898, + "loss": 3.1592, + "step": 17897 + }, + { + "epoch": 0.88, + "grad_norm": 0.524651288986206, + "learning_rate": 0.0004820951086770563, + "loss": 3.1948, + "step": 17898 + }, + { + "epoch": 0.88, + "grad_norm": 0.5448842644691467, + "learning_rate": 0.0004820828728648194, + "loss": 3.1414, + "step": 17899 + }, + { + "epoch": 0.88, + "grad_norm": 0.5146068930625916, + "learning_rate": 0.0004820706365730112, + "loss": 3.0062, + "step": 17900 + }, + { + "epoch": 0.88, + "grad_norm": 0.49320217967033386, + "learning_rate": 0.000482058399801664, + "loss": 3.3239, + "step": 17901 + }, + { + "epoch": 0.88, + "grad_norm": 0.5205990076065063, + "learning_rate": 0.00048204616255080997, + "loss": 3.2077, + "step": 17902 + }, + { + "epoch": 0.88, + "grad_norm": 0.5180717706680298, + "learning_rate": 0.00048203392482048136, + "loss": 3.2312, + "step": 17903 + }, + { + "epoch": 0.88, + "grad_norm": 0.5263290405273438, + "learning_rate": 0.0004820216866107105, + "loss": 3.2392, + "step": 17904 + }, + { + "epoch": 0.88, + "grad_norm": 0.5504626631736755, + "learning_rate": 0.00048200944792152955, + "loss": 3.0838, + "step": 17905 + }, + { + "epoch": 0.88, + "grad_norm": 0.5280824303627014, + "learning_rate": 0.0004819972087529707, + "loss": 3.2164, + "step": 17906 + }, + { + "epoch": 0.88, + "grad_norm": 0.5233747959136963, + "learning_rate": 0.00048198496910506624, + "loss": 3.1294, + "step": 17907 + }, + { + "epoch": 0.88, + "grad_norm": 0.5285931825637817, + "learning_rate": 0.00048197272897784835, + "loss": 3.0499, + "step": 17908 + }, + { + "epoch": 0.88, + "grad_norm": 0.5670228600502014, + "learning_rate": 0.0004819604883713494, + "loss": 3.1822, + "step": 17909 + }, + { + "epoch": 0.88, + "grad_norm": 0.5432270765304565, + "learning_rate": 0.0004819482472856015, + "loss": 3.041, + "step": 17910 + }, + { + "epoch": 0.88, + "grad_norm": 0.5179724097251892, + "learning_rate": 0.0004819360057206369, + "loss": 3.327, + "step": 17911 + }, + { + "epoch": 0.88, + "grad_norm": 0.5055398344993591, + "learning_rate": 0.0004819237636764879, + "loss": 3.2491, + "step": 17912 + }, + { + "epoch": 0.88, + "grad_norm": 0.531039834022522, + "learning_rate": 0.0004819115211531867, + "loss": 3.1858, + "step": 17913 + }, + { + "epoch": 0.88, + "grad_norm": 0.5148627161979675, + "learning_rate": 0.00048189927815076565, + "loss": 3.1547, + "step": 17914 + }, + { + "epoch": 0.88, + "grad_norm": 0.5544278621673584, + "learning_rate": 0.0004818870346692569, + "loss": 3.2301, + "step": 17915 + }, + { + "epoch": 0.88, + "grad_norm": 0.564547598361969, + "learning_rate": 0.00048187479070869267, + "loss": 3.0203, + "step": 17916 + }, + { + "epoch": 0.88, + "grad_norm": 0.5364139080047607, + "learning_rate": 0.0004818625462691052, + "loss": 2.8439, + "step": 17917 + }, + { + "epoch": 0.88, + "grad_norm": 0.5530861020088196, + "learning_rate": 0.00048185030135052676, + "loss": 3.1894, + "step": 17918 + }, + { + "epoch": 0.88, + "grad_norm": 0.5109603404998779, + "learning_rate": 0.00048183805595298975, + "loss": 3.2271, + "step": 17919 + }, + { + "epoch": 0.88, + "grad_norm": 0.49450230598449707, + "learning_rate": 0.0004818258100765262, + "loss": 3.1592, + "step": 17920 + }, + { + "epoch": 0.88, + "grad_norm": 0.5180266499519348, + "learning_rate": 0.0004818135637211685, + "loss": 3.0048, + "step": 17921 + }, + { + "epoch": 0.88, + "grad_norm": 0.5050731301307678, + "learning_rate": 0.00048180131688694883, + "loss": 3.2961, + "step": 17922 + }, + { + "epoch": 0.88, + "grad_norm": 0.5055239200592041, + "learning_rate": 0.0004817890695738994, + "loss": 3.1692, + "step": 17923 + }, + { + "epoch": 0.88, + "grad_norm": 0.5030365586280823, + "learning_rate": 0.00048177682178205273, + "loss": 3.1633, + "step": 17924 + }, + { + "epoch": 0.88, + "grad_norm": 0.6658396124839783, + "learning_rate": 0.00048176457351144084, + "loss": 3.2588, + "step": 17925 + }, + { + "epoch": 0.88, + "grad_norm": 0.5380383133888245, + "learning_rate": 0.000481752324762096, + "loss": 3.2116, + "step": 17926 + }, + { + "epoch": 0.88, + "grad_norm": 0.507267951965332, + "learning_rate": 0.00048174007553405056, + "loss": 3.279, + "step": 17927 + }, + { + "epoch": 0.88, + "grad_norm": 0.5685015320777893, + "learning_rate": 0.0004817278258273366, + "loss": 3.0932, + "step": 17928 + }, + { + "epoch": 0.88, + "grad_norm": 0.5036908984184265, + "learning_rate": 0.0004817155756419866, + "loss": 3.1301, + "step": 17929 + }, + { + "epoch": 0.88, + "grad_norm": 0.516451895236969, + "learning_rate": 0.0004817033249780328, + "loss": 2.9769, + "step": 17930 + }, + { + "epoch": 0.88, + "grad_norm": 0.5260326862335205, + "learning_rate": 0.00048169107383550744, + "loss": 3.3973, + "step": 17931 + }, + { + "epoch": 0.88, + "grad_norm": 0.5365659594535828, + "learning_rate": 0.0004816788222144427, + "loss": 3.1838, + "step": 17932 + }, + { + "epoch": 0.88, + "grad_norm": 0.5538583397865295, + "learning_rate": 0.0004816665701148709, + "loss": 3.1503, + "step": 17933 + }, + { + "epoch": 0.88, + "grad_norm": 0.5433288812637329, + "learning_rate": 0.00048165431753682434, + "loss": 3.3311, + "step": 17934 + }, + { + "epoch": 0.88, + "grad_norm": 0.5256428718566895, + "learning_rate": 0.0004816420644803352, + "loss": 3.1285, + "step": 17935 + }, + { + "epoch": 0.88, + "grad_norm": 0.5579695701599121, + "learning_rate": 0.0004816298109454359, + "loss": 3.0131, + "step": 17936 + }, + { + "epoch": 0.88, + "grad_norm": 0.49737823009490967, + "learning_rate": 0.00048161755693215864, + "loss": 3.287, + "step": 17937 + }, + { + "epoch": 0.88, + "grad_norm": 0.5029211640357971, + "learning_rate": 0.00048160530244053564, + "loss": 3.2576, + "step": 17938 + }, + { + "epoch": 0.88, + "grad_norm": 0.5466650724411011, + "learning_rate": 0.0004815930474705992, + "loss": 2.9216, + "step": 17939 + }, + { + "epoch": 0.88, + "grad_norm": 0.5182828307151794, + "learning_rate": 0.0004815807920223816, + "loss": 3.2743, + "step": 17940 + }, + { + "epoch": 0.88, + "grad_norm": 0.5201794505119324, + "learning_rate": 0.00048156853609591525, + "loss": 3.2135, + "step": 17941 + }, + { + "epoch": 0.88, + "grad_norm": 0.5152579545974731, + "learning_rate": 0.0004815562796912323, + "loss": 3.1489, + "step": 17942 + }, + { + "epoch": 0.88, + "grad_norm": 0.5244562029838562, + "learning_rate": 0.00048154402280836504, + "loss": 3.016, + "step": 17943 + }, + { + "epoch": 0.88, + "grad_norm": 0.5533185601234436, + "learning_rate": 0.00048153176544734575, + "loss": 3.2095, + "step": 17944 + }, + { + "epoch": 0.88, + "grad_norm": 0.5288299918174744, + "learning_rate": 0.0004815195076082067, + "loss": 3.0927, + "step": 17945 + }, + { + "epoch": 0.88, + "grad_norm": 0.5070801377296448, + "learning_rate": 0.00048150724929098027, + "loss": 3.1879, + "step": 17946 + }, + { + "epoch": 0.88, + "grad_norm": 0.5345847606658936, + "learning_rate": 0.0004814949904956986, + "loss": 3.2283, + "step": 17947 + }, + { + "epoch": 0.88, + "grad_norm": 0.5151923298835754, + "learning_rate": 0.0004814827312223941, + "loss": 3.1727, + "step": 17948 + }, + { + "epoch": 0.88, + "grad_norm": 0.5357820987701416, + "learning_rate": 0.000481470471471099, + "loss": 3.2602, + "step": 17949 + }, + { + "epoch": 0.88, + "grad_norm": 0.4863514304161072, + "learning_rate": 0.00048145821124184556, + "loss": 3.1904, + "step": 17950 + }, + { + "epoch": 0.88, + "grad_norm": 0.5032357573509216, + "learning_rate": 0.00048144595053466616, + "loss": 3.1701, + "step": 17951 + }, + { + "epoch": 0.88, + "grad_norm": 0.5471490025520325, + "learning_rate": 0.00048143368934959306, + "loss": 3.2434, + "step": 17952 + }, + { + "epoch": 0.88, + "grad_norm": 0.4980376362800598, + "learning_rate": 0.00048142142768665844, + "loss": 3.0547, + "step": 17953 + }, + { + "epoch": 0.88, + "grad_norm": 0.4976464509963989, + "learning_rate": 0.0004814091655458947, + "loss": 3.0471, + "step": 17954 + }, + { + "epoch": 0.88, + "grad_norm": 0.5372846722602844, + "learning_rate": 0.0004813969029273343, + "loss": 3.2855, + "step": 17955 + }, + { + "epoch": 0.88, + "grad_norm": 0.5928505659103394, + "learning_rate": 0.00048138463983100926, + "loss": 3.3892, + "step": 17956 + }, + { + "epoch": 0.88, + "grad_norm": 0.5085715055465698, + "learning_rate": 0.00048137237625695207, + "loss": 3.1189, + "step": 17957 + }, + { + "epoch": 0.88, + "grad_norm": 0.5404731631278992, + "learning_rate": 0.00048136011220519486, + "loss": 3.259, + "step": 17958 + }, + { + "epoch": 0.88, + "grad_norm": 0.5489948987960815, + "learning_rate": 0.00048134784767577, + "loss": 3.0618, + "step": 17959 + }, + { + "epoch": 0.88, + "grad_norm": 0.5055735111236572, + "learning_rate": 0.0004813355826687099, + "loss": 3.149, + "step": 17960 + }, + { + "epoch": 0.88, + "grad_norm": 0.5085407495498657, + "learning_rate": 0.00048132331718404663, + "loss": 3.2623, + "step": 17961 + }, + { + "epoch": 0.88, + "grad_norm": 0.509974479675293, + "learning_rate": 0.0004813110512218127, + "loss": 3.0828, + "step": 17962 + }, + { + "epoch": 0.88, + "grad_norm": 0.5331966876983643, + "learning_rate": 0.00048129878478204047, + "loss": 3.1375, + "step": 17963 + }, + { + "epoch": 0.88, + "grad_norm": 0.4968498945236206, + "learning_rate": 0.000481286517864762, + "loss": 3.2832, + "step": 17964 + }, + { + "epoch": 0.88, + "grad_norm": 0.5317128896713257, + "learning_rate": 0.0004812742504700097, + "loss": 3.1265, + "step": 17965 + }, + { + "epoch": 0.88, + "grad_norm": 0.5396853685379028, + "learning_rate": 0.000481261982597816, + "loss": 3.3518, + "step": 17966 + }, + { + "epoch": 0.88, + "grad_norm": 0.5501961708068848, + "learning_rate": 0.00048124971424821315, + "loss": 3.2126, + "step": 17967 + }, + { + "epoch": 0.88, + "grad_norm": 0.5294440388679504, + "learning_rate": 0.00048123744542123345, + "loss": 3.1612, + "step": 17968 + }, + { + "epoch": 0.88, + "grad_norm": 0.5000289678573608, + "learning_rate": 0.0004812251761169091, + "loss": 3.4135, + "step": 17969 + }, + { + "epoch": 0.88, + "grad_norm": 0.49727967381477356, + "learning_rate": 0.00048121290633527247, + "loss": 2.9709, + "step": 17970 + }, + { + "epoch": 0.88, + "grad_norm": 0.5257933735847473, + "learning_rate": 0.0004812006360763561, + "loss": 3.5203, + "step": 17971 + }, + { + "epoch": 0.88, + "grad_norm": 0.5392364263534546, + "learning_rate": 0.000481188365340192, + "loss": 3.163, + "step": 17972 + }, + { + "epoch": 0.88, + "grad_norm": 0.5661436319351196, + "learning_rate": 0.0004811760941268127, + "loss": 3.0577, + "step": 17973 + }, + { + "epoch": 0.88, + "grad_norm": 0.5415921211242676, + "learning_rate": 0.0004811638224362503, + "loss": 2.9468, + "step": 17974 + }, + { + "epoch": 0.88, + "grad_norm": 0.5405935645103455, + "learning_rate": 0.0004811515502685374, + "loss": 3.1172, + "step": 17975 + }, + { + "epoch": 0.88, + "grad_norm": 0.5455482006072998, + "learning_rate": 0.00048113927762370614, + "loss": 3.1385, + "step": 17976 + }, + { + "epoch": 0.88, + "grad_norm": 0.5725964307785034, + "learning_rate": 0.00048112700450178884, + "loss": 3.0701, + "step": 17977 + }, + { + "epoch": 0.88, + "grad_norm": 0.5180301070213318, + "learning_rate": 0.00048111473090281797, + "loss": 3.0, + "step": 17978 + }, + { + "epoch": 0.88, + "grad_norm": 0.5263218879699707, + "learning_rate": 0.00048110245682682557, + "loss": 3.2949, + "step": 17979 + }, + { + "epoch": 0.88, + "grad_norm": 0.5030066967010498, + "learning_rate": 0.00048109018227384434, + "loss": 3.3081, + "step": 17980 + }, + { + "epoch": 0.88, + "grad_norm": 0.5428774952888489, + "learning_rate": 0.0004810779072439063, + "loss": 3.2514, + "step": 17981 + }, + { + "epoch": 0.88, + "grad_norm": 0.5697459578514099, + "learning_rate": 0.0004810656317370439, + "loss": 3.3227, + "step": 17982 + }, + { + "epoch": 0.88, + "grad_norm": 0.522610068321228, + "learning_rate": 0.00048105335575328957, + "loss": 3.135, + "step": 17983 + }, + { + "epoch": 0.88, + "grad_norm": 0.529964804649353, + "learning_rate": 0.0004810410792926755, + "loss": 3.2755, + "step": 17984 + }, + { + "epoch": 0.88, + "grad_norm": 0.5211076736450195, + "learning_rate": 0.00048102880235523405, + "loss": 3.3943, + "step": 17985 + }, + { + "epoch": 0.88, + "grad_norm": 0.5373325347900391, + "learning_rate": 0.0004810165249409976, + "loss": 3.0734, + "step": 17986 + }, + { + "epoch": 0.88, + "grad_norm": 0.537948727607727, + "learning_rate": 0.00048100424704999845, + "loss": 3.2213, + "step": 17987 + }, + { + "epoch": 0.88, + "grad_norm": 0.49827033281326294, + "learning_rate": 0.00048099196868226895, + "loss": 3.0853, + "step": 17988 + }, + { + "epoch": 0.88, + "grad_norm": 0.6188706755638123, + "learning_rate": 0.0004809796898378414, + "loss": 3.1362, + "step": 17989 + }, + { + "epoch": 0.88, + "grad_norm": 0.5543064475059509, + "learning_rate": 0.00048096741051674826, + "loss": 3.0382, + "step": 17990 + }, + { + "epoch": 0.88, + "grad_norm": 0.49779966473579407, + "learning_rate": 0.00048095513071902174, + "loss": 3.1029, + "step": 17991 + }, + { + "epoch": 0.88, + "grad_norm": 0.5499799251556396, + "learning_rate": 0.00048094285044469415, + "loss": 3.3016, + "step": 17992 + }, + { + "epoch": 0.88, + "grad_norm": 0.5513309836387634, + "learning_rate": 0.00048093056969379807, + "loss": 3.2205, + "step": 17993 + }, + { + "epoch": 0.88, + "grad_norm": 0.49735820293426514, + "learning_rate": 0.0004809182884663656, + "loss": 3.2093, + "step": 17994 + }, + { + "epoch": 0.88, + "grad_norm": 0.5192501544952393, + "learning_rate": 0.00048090600676242923, + "loss": 3.1286, + "step": 17995 + }, + { + "epoch": 0.88, + "grad_norm": 0.5403336882591248, + "learning_rate": 0.00048089372458202115, + "loss": 3.014, + "step": 17996 + }, + { + "epoch": 0.88, + "grad_norm": 0.4965198338031769, + "learning_rate": 0.00048088144192517387, + "loss": 3.115, + "step": 17997 + }, + { + "epoch": 0.88, + "grad_norm": 0.957526683807373, + "learning_rate": 0.0004808691587919197, + "loss": 3.324, + "step": 17998 + }, + { + "epoch": 0.88, + "grad_norm": 0.5187360048294067, + "learning_rate": 0.00048085687518229105, + "loss": 3.2005, + "step": 17999 + }, + { + "epoch": 0.88, + "grad_norm": 0.4975668489933014, + "learning_rate": 0.0004808445910963201, + "loss": 3.3057, + "step": 18000 + }, + { + "epoch": 0.88, + "grad_norm": 0.5819920897483826, + "learning_rate": 0.00048083230653403925, + "loss": 3.0514, + "step": 18001 + }, + { + "epoch": 0.88, + "grad_norm": 0.5546259880065918, + "learning_rate": 0.000480820021495481, + "loss": 2.7754, + "step": 18002 + }, + { + "epoch": 0.88, + "grad_norm": 0.49003690481185913, + "learning_rate": 0.00048080773598067753, + "loss": 3.2775, + "step": 18003 + }, + { + "epoch": 0.88, + "grad_norm": 0.515501081943512, + "learning_rate": 0.00048079544998966137, + "loss": 3.2949, + "step": 18004 + }, + { + "epoch": 0.88, + "grad_norm": 0.5453561544418335, + "learning_rate": 0.0004807831635224647, + "loss": 3.0814, + "step": 18005 + }, + { + "epoch": 0.88, + "grad_norm": 0.5018565058708191, + "learning_rate": 0.00048077087657912005, + "loss": 3.4001, + "step": 18006 + }, + { + "epoch": 0.88, + "grad_norm": 0.5057216882705688, + "learning_rate": 0.00048075858915965966, + "loss": 3.1185, + "step": 18007 + }, + { + "epoch": 0.88, + "grad_norm": 0.5177599191665649, + "learning_rate": 0.00048074630126411597, + "loss": 2.9411, + "step": 18008 + }, + { + "epoch": 0.88, + "grad_norm": 0.5200800895690918, + "learning_rate": 0.00048073401289252133, + "loss": 3.33, + "step": 18009 + }, + { + "epoch": 0.88, + "grad_norm": 0.5048688054084778, + "learning_rate": 0.000480721724044908, + "loss": 3.3512, + "step": 18010 + }, + { + "epoch": 0.88, + "grad_norm": 0.5326023697853088, + "learning_rate": 0.0004807094347213085, + "loss": 3.1505, + "step": 18011 + }, + { + "epoch": 0.88, + "grad_norm": 0.536878228187561, + "learning_rate": 0.0004806971449217551, + "loss": 3.0033, + "step": 18012 + }, + { + "epoch": 0.88, + "grad_norm": 0.5812287926673889, + "learning_rate": 0.0004806848546462802, + "loss": 2.9029, + "step": 18013 + }, + { + "epoch": 0.88, + "grad_norm": 0.5413682460784912, + "learning_rate": 0.00048067256389491613, + "loss": 3.137, + "step": 18014 + }, + { + "epoch": 0.88, + "grad_norm": 0.47277653217315674, + "learning_rate": 0.00048066027266769533, + "loss": 3.1877, + "step": 18015 + }, + { + "epoch": 0.88, + "grad_norm": 0.5533841848373413, + "learning_rate": 0.0004806479809646501, + "loss": 3.2071, + "step": 18016 + }, + { + "epoch": 0.88, + "grad_norm": 0.5669878721237183, + "learning_rate": 0.0004806356887858129, + "loss": 3.3474, + "step": 18017 + }, + { + "epoch": 0.88, + "grad_norm": 0.5068527460098267, + "learning_rate": 0.0004806233961312161, + "loss": 3.3926, + "step": 18018 + }, + { + "epoch": 0.88, + "grad_norm": 0.706555187702179, + "learning_rate": 0.00048061110300089203, + "loss": 3.1562, + "step": 18019 + }, + { + "epoch": 0.88, + "grad_norm": 0.5449730157852173, + "learning_rate": 0.00048059880939487295, + "loss": 3.3523, + "step": 18020 + }, + { + "epoch": 0.88, + "grad_norm": 0.5296577215194702, + "learning_rate": 0.0004805865153131915, + "loss": 3.0907, + "step": 18021 + }, + { + "epoch": 0.88, + "grad_norm": 0.5225890278816223, + "learning_rate": 0.0004805742207558799, + "loss": 3.0886, + "step": 18022 + }, + { + "epoch": 0.88, + "grad_norm": 0.6007885336875916, + "learning_rate": 0.00048056192572297046, + "loss": 3.399, + "step": 18023 + }, + { + "epoch": 0.88, + "grad_norm": 0.5413122177124023, + "learning_rate": 0.00048054963021449575, + "loss": 3.1492, + "step": 18024 + }, + { + "epoch": 0.88, + "grad_norm": 0.5432742238044739, + "learning_rate": 0.00048053733423048797, + "loss": 3.166, + "step": 18025 + }, + { + "epoch": 0.88, + "grad_norm": 0.5184715986251831, + "learning_rate": 0.0004805250377709797, + "loss": 2.9957, + "step": 18026 + }, + { + "epoch": 0.88, + "grad_norm": 0.5569450259208679, + "learning_rate": 0.0004805127408360032, + "loss": 3.1505, + "step": 18027 + }, + { + "epoch": 0.88, + "grad_norm": 0.5036641359329224, + "learning_rate": 0.00048050044342559087, + "loss": 3.0816, + "step": 18028 + }, + { + "epoch": 0.88, + "grad_norm": 0.6360530257225037, + "learning_rate": 0.0004804881455397751, + "loss": 3.154, + "step": 18029 + }, + { + "epoch": 0.88, + "grad_norm": 0.5100707411766052, + "learning_rate": 0.00048047584717858825, + "loss": 3.2289, + "step": 18030 + }, + { + "epoch": 0.88, + "grad_norm": 0.5365529656410217, + "learning_rate": 0.00048046354834206277, + "loss": 3.1688, + "step": 18031 + }, + { + "epoch": 0.88, + "grad_norm": 0.5900627970695496, + "learning_rate": 0.0004804512490302311, + "loss": 3.2375, + "step": 18032 + }, + { + "epoch": 0.88, + "grad_norm": 0.566939651966095, + "learning_rate": 0.0004804389492431255, + "loss": 2.9624, + "step": 18033 + }, + { + "epoch": 0.88, + "grad_norm": 0.5332266688346863, + "learning_rate": 0.0004804266489807785, + "loss": 3.0906, + "step": 18034 + }, + { + "epoch": 0.88, + "grad_norm": 0.5751016139984131, + "learning_rate": 0.0004804143482432224, + "loss": 3.2817, + "step": 18035 + }, + { + "epoch": 0.88, + "grad_norm": 0.6242155432701111, + "learning_rate": 0.0004804020470304896, + "loss": 3.0636, + "step": 18036 + }, + { + "epoch": 0.88, + "grad_norm": 0.5122875571250916, + "learning_rate": 0.00048038974534261256, + "loss": 3.0294, + "step": 18037 + }, + { + "epoch": 0.88, + "grad_norm": 0.5618741512298584, + "learning_rate": 0.00048037744317962357, + "loss": 3.0718, + "step": 18038 + }, + { + "epoch": 0.88, + "grad_norm": 0.5436500310897827, + "learning_rate": 0.0004803651405415553, + "loss": 3.3318, + "step": 18039 + }, + { + "epoch": 0.88, + "grad_norm": 0.5078718662261963, + "learning_rate": 0.0004803528374284398, + "loss": 3.3323, + "step": 18040 + }, + { + "epoch": 0.88, + "grad_norm": 0.5190596580505371, + "learning_rate": 0.00048034053384030963, + "loss": 3.4991, + "step": 18041 + }, + { + "epoch": 0.88, + "grad_norm": 0.5513627529144287, + "learning_rate": 0.0004803282297771973, + "loss": 3.1212, + "step": 18042 + }, + { + "epoch": 0.88, + "grad_norm": 0.5310901403427124, + "learning_rate": 0.000480315925239135, + "loss": 3.0746, + "step": 18043 + }, + { + "epoch": 0.88, + "grad_norm": 0.5999692678451538, + "learning_rate": 0.00048030362022615533, + "loss": 3.2624, + "step": 18044 + }, + { + "epoch": 0.88, + "grad_norm": 0.49754953384399414, + "learning_rate": 0.00048029131473829065, + "loss": 3.2649, + "step": 18045 + }, + { + "epoch": 0.88, + "grad_norm": 0.5372548699378967, + "learning_rate": 0.00048027900877557327, + "loss": 3.2262, + "step": 18046 + }, + { + "epoch": 0.88, + "grad_norm": 0.528826117515564, + "learning_rate": 0.00048026670233803574, + "loss": 3.2068, + "step": 18047 + }, + { + "epoch": 0.88, + "grad_norm": 0.5041863322257996, + "learning_rate": 0.00048025439542571035, + "loss": 3.0409, + "step": 18048 + }, + { + "epoch": 0.88, + "grad_norm": 0.5768441557884216, + "learning_rate": 0.00048024208803862964, + "loss": 3.2841, + "step": 18049 + }, + { + "epoch": 0.88, + "grad_norm": 0.5213032960891724, + "learning_rate": 0.00048022978017682596, + "loss": 3.2703, + "step": 18050 + }, + { + "epoch": 0.88, + "grad_norm": 0.5481829643249512, + "learning_rate": 0.00048021747184033163, + "loss": 3.1394, + "step": 18051 + }, + { + "epoch": 0.88, + "grad_norm": 0.5505427122116089, + "learning_rate": 0.00048020516302917923, + "loss": 3.4499, + "step": 18052 + }, + { + "epoch": 0.88, + "grad_norm": 0.5292855501174927, + "learning_rate": 0.00048019285374340106, + "loss": 3.2116, + "step": 18053 + }, + { + "epoch": 0.88, + "grad_norm": 0.5407589077949524, + "learning_rate": 0.00048018054398302966, + "loss": 3.3331, + "step": 18054 + }, + { + "epoch": 0.88, + "grad_norm": 0.5229291915893555, + "learning_rate": 0.0004801682337480974, + "loss": 3.3401, + "step": 18055 + }, + { + "epoch": 0.88, + "grad_norm": 0.47680187225341797, + "learning_rate": 0.00048015592303863653, + "loss": 2.9878, + "step": 18056 + }, + { + "epoch": 0.88, + "grad_norm": 0.5591782331466675, + "learning_rate": 0.00048014361185467986, + "loss": 3.3692, + "step": 18057 + }, + { + "epoch": 0.88, + "grad_norm": 0.5464686155319214, + "learning_rate": 0.0004801313001962594, + "loss": 3.2056, + "step": 18058 + }, + { + "epoch": 0.89, + "grad_norm": 0.5384355187416077, + "learning_rate": 0.00048011898806340787, + "loss": 3.3496, + "step": 18059 + }, + { + "epoch": 0.89, + "grad_norm": 0.5165776610374451, + "learning_rate": 0.00048010667545615753, + "loss": 3.3195, + "step": 18060 + }, + { + "epoch": 0.89, + "grad_norm": 0.489169716835022, + "learning_rate": 0.00048009436237454083, + "loss": 3.109, + "step": 18061 + }, + { + "epoch": 0.89, + "grad_norm": 0.5217012763023376, + "learning_rate": 0.00048008204881859034, + "loss": 3.3279, + "step": 18062 + }, + { + "epoch": 0.89, + "grad_norm": 0.5407010912895203, + "learning_rate": 0.00048006973478833837, + "loss": 3.1728, + "step": 18063 + }, + { + "epoch": 0.89, + "grad_norm": 0.5479187369346619, + "learning_rate": 0.0004800574202838174, + "loss": 2.9849, + "step": 18064 + }, + { + "epoch": 0.89, + "grad_norm": 0.5429320335388184, + "learning_rate": 0.00048004510530505977, + "loss": 3.1398, + "step": 18065 + }, + { + "epoch": 0.89, + "grad_norm": 0.5046254992485046, + "learning_rate": 0.000480032789852098, + "loss": 3.1535, + "step": 18066 + }, + { + "epoch": 0.89, + "grad_norm": 0.5253599882125854, + "learning_rate": 0.00048002047392496443, + "loss": 3.0859, + "step": 18067 + }, + { + "epoch": 0.89, + "grad_norm": 0.5937367081642151, + "learning_rate": 0.0004800081575236917, + "loss": 3.4202, + "step": 18068 + }, + { + "epoch": 0.89, + "grad_norm": 0.541233479976654, + "learning_rate": 0.0004799958406483121, + "loss": 3.3945, + "step": 18069 + }, + { + "epoch": 0.89, + "grad_norm": 0.5336980819702148, + "learning_rate": 0.00047998352329885815, + "loss": 3.2491, + "step": 18070 + }, + { + "epoch": 0.89, + "grad_norm": 0.5044065713882446, + "learning_rate": 0.00047997120547536214, + "loss": 3.2804, + "step": 18071 + }, + { + "epoch": 0.89, + "grad_norm": 0.4936935305595398, + "learning_rate": 0.0004799588871778566, + "loss": 3.3098, + "step": 18072 + }, + { + "epoch": 0.89, + "grad_norm": 0.55002760887146, + "learning_rate": 0.0004799465684063741, + "loss": 2.9614, + "step": 18073 + }, + { + "epoch": 0.89, + "grad_norm": 0.5033633708953857, + "learning_rate": 0.00047993424916094687, + "loss": 3.0059, + "step": 18074 + }, + { + "epoch": 0.89, + "grad_norm": 0.5659985542297363, + "learning_rate": 0.00047992192944160746, + "loss": 3.0396, + "step": 18075 + }, + { + "epoch": 0.89, + "grad_norm": 0.5208994150161743, + "learning_rate": 0.0004799096092483884, + "loss": 3.1537, + "step": 18076 + }, + { + "epoch": 0.89, + "grad_norm": 0.5257906913757324, + "learning_rate": 0.00047989728858132194, + "loss": 3.0355, + "step": 18077 + }, + { + "epoch": 0.89, + "grad_norm": 0.53220534324646, + "learning_rate": 0.0004798849674404407, + "loss": 3.109, + "step": 18078 + }, + { + "epoch": 0.89, + "grad_norm": 0.49276986718177795, + "learning_rate": 0.0004798726458257771, + "loss": 3.0571, + "step": 18079 + }, + { + "epoch": 0.89, + "grad_norm": 0.5161966681480408, + "learning_rate": 0.0004798603237373636, + "loss": 3.2301, + "step": 18080 + }, + { + "epoch": 0.89, + "grad_norm": 0.5360630750656128, + "learning_rate": 0.0004798480011752325, + "loss": 2.9056, + "step": 18081 + }, + { + "epoch": 0.89, + "grad_norm": 0.5495896339416504, + "learning_rate": 0.00047983567813941644, + "loss": 3.0623, + "step": 18082 + }, + { + "epoch": 0.89, + "grad_norm": 0.5134875178337097, + "learning_rate": 0.00047982335462994785, + "loss": 3.2153, + "step": 18083 + }, + { + "epoch": 0.89, + "grad_norm": 0.49886998534202576, + "learning_rate": 0.00047981103064685904, + "loss": 3.1122, + "step": 18084 + }, + { + "epoch": 0.89, + "grad_norm": 0.5390711426734924, + "learning_rate": 0.00047979870619018275, + "loss": 3.1859, + "step": 18085 + }, + { + "epoch": 0.89, + "grad_norm": 0.5308481454849243, + "learning_rate": 0.00047978638125995113, + "loss": 3.2953, + "step": 18086 + }, + { + "epoch": 0.89, + "grad_norm": 0.5197398662567139, + "learning_rate": 0.0004797740558561968, + "loss": 3.2586, + "step": 18087 + }, + { + "epoch": 0.89, + "grad_norm": 0.5043891668319702, + "learning_rate": 0.0004797617299789522, + "loss": 3.2689, + "step": 18088 + }, + { + "epoch": 0.89, + "grad_norm": 0.573706865310669, + "learning_rate": 0.00047974940362824987, + "loss": 3.2213, + "step": 18089 + }, + { + "epoch": 0.89, + "grad_norm": 0.5655125379562378, + "learning_rate": 0.00047973707680412224, + "loss": 3.0872, + "step": 18090 + }, + { + "epoch": 0.89, + "grad_norm": 0.5690857768058777, + "learning_rate": 0.00047972474950660164, + "loss": 3.0915, + "step": 18091 + }, + { + "epoch": 0.89, + "grad_norm": 0.5151222348213196, + "learning_rate": 0.00047971242173572065, + "loss": 3.0679, + "step": 18092 + }, + { + "epoch": 0.89, + "grad_norm": 0.6133147478103638, + "learning_rate": 0.00047970009349151174, + "loss": 3.2749, + "step": 18093 + }, + { + "epoch": 0.89, + "grad_norm": 0.5890400409698486, + "learning_rate": 0.0004796877647740074, + "loss": 3.0752, + "step": 18094 + }, + { + "epoch": 0.89, + "grad_norm": 0.5378167629241943, + "learning_rate": 0.00047967543558324, + "loss": 3.2817, + "step": 18095 + }, + { + "epoch": 0.89, + "grad_norm": 0.5156680941581726, + "learning_rate": 0.0004796631059192422, + "loss": 3.079, + "step": 18096 + }, + { + "epoch": 0.89, + "grad_norm": 0.5856471657752991, + "learning_rate": 0.0004796507757820462, + "loss": 3.1464, + "step": 18097 + }, + { + "epoch": 0.89, + "grad_norm": 0.5163522362709045, + "learning_rate": 0.00047963844517168473, + "loss": 3.284, + "step": 18098 + }, + { + "epoch": 0.89, + "grad_norm": 0.5546302199363708, + "learning_rate": 0.00047962611408819015, + "loss": 3.0596, + "step": 18099 + }, + { + "epoch": 0.89, + "grad_norm": 0.5673893690109253, + "learning_rate": 0.00047961378253159496, + "loss": 3.1698, + "step": 18100 + }, + { + "epoch": 0.89, + "grad_norm": 0.5053473711013794, + "learning_rate": 0.0004796014505019317, + "loss": 3.0218, + "step": 18101 + }, + { + "epoch": 0.89, + "grad_norm": 0.5111680626869202, + "learning_rate": 0.0004795891179992326, + "loss": 3.1531, + "step": 18102 + }, + { + "epoch": 0.89, + "grad_norm": 0.5182185769081116, + "learning_rate": 0.00047957678502353045, + "loss": 3.0609, + "step": 18103 + }, + { + "epoch": 0.89, + "grad_norm": 0.5161014199256897, + "learning_rate": 0.0004795644515748576, + "loss": 2.9817, + "step": 18104 + }, + { + "epoch": 0.89, + "grad_norm": 0.5604485273361206, + "learning_rate": 0.0004795521176532466, + "loss": 3.2267, + "step": 18105 + }, + { + "epoch": 0.89, + "grad_norm": 0.5659258961677551, + "learning_rate": 0.00047953978325872976, + "loss": 3.256, + "step": 18106 + }, + { + "epoch": 0.89, + "grad_norm": 0.5269966721534729, + "learning_rate": 0.00047952744839133973, + "loss": 3.1339, + "step": 18107 + }, + { + "epoch": 0.89, + "grad_norm": 0.5653190612792969, + "learning_rate": 0.000479515113051109, + "loss": 3.0896, + "step": 18108 + }, + { + "epoch": 0.89, + "grad_norm": 0.5687468647956848, + "learning_rate": 0.00047950277723806994, + "loss": 3.2164, + "step": 18109 + }, + { + "epoch": 0.89, + "grad_norm": 0.48306307196617126, + "learning_rate": 0.00047949044095225524, + "loss": 3.1984, + "step": 18110 + }, + { + "epoch": 0.89, + "grad_norm": 0.5953868627548218, + "learning_rate": 0.00047947810419369716, + "loss": 3.0912, + "step": 18111 + }, + { + "epoch": 0.89, + "grad_norm": 0.5070611238479614, + "learning_rate": 0.0004794657669624283, + "loss": 3.1685, + "step": 18112 + }, + { + "epoch": 0.89, + "grad_norm": 0.5517260432243347, + "learning_rate": 0.00047945342925848116, + "loss": 3.4973, + "step": 18113 + }, + { + "epoch": 0.89, + "grad_norm": 0.5137361884117126, + "learning_rate": 0.00047944109108188817, + "loss": 3.3537, + "step": 18114 + }, + { + "epoch": 0.89, + "grad_norm": 0.5287407636642456, + "learning_rate": 0.00047942875243268187, + "loss": 3.182, + "step": 18115 + }, + { + "epoch": 0.89, + "grad_norm": 0.5217429399490356, + "learning_rate": 0.0004794164133108949, + "loss": 3.2714, + "step": 18116 + }, + { + "epoch": 0.89, + "grad_norm": 0.48636454343795776, + "learning_rate": 0.00047940407371655956, + "loss": 3.1138, + "step": 18117 + }, + { + "epoch": 0.89, + "grad_norm": 0.5578451156616211, + "learning_rate": 0.00047939173364970833, + "loss": 3.1668, + "step": 18118 + }, + { + "epoch": 0.89, + "grad_norm": 0.5369355082511902, + "learning_rate": 0.0004793793931103739, + "loss": 3.0913, + "step": 18119 + }, + { + "epoch": 0.89, + "grad_norm": 0.5531823635101318, + "learning_rate": 0.0004793670520985886, + "loss": 2.9706, + "step": 18120 + }, + { + "epoch": 0.89, + "grad_norm": 0.5396066904067993, + "learning_rate": 0.0004793547106143851, + "loss": 3.2062, + "step": 18121 + }, + { + "epoch": 0.89, + "grad_norm": 0.5657452344894409, + "learning_rate": 0.00047934236865779576, + "loss": 3.1972, + "step": 18122 + }, + { + "epoch": 0.89, + "grad_norm": 0.5130107998847961, + "learning_rate": 0.0004793300262288531, + "loss": 3.2101, + "step": 18123 + }, + { + "epoch": 0.89, + "grad_norm": 0.5585821270942688, + "learning_rate": 0.00047931768332758976, + "loss": 3.2184, + "step": 18124 + }, + { + "epoch": 0.89, + "grad_norm": 0.5192179679870605, + "learning_rate": 0.0004793053399540381, + "loss": 3.1912, + "step": 18125 + }, + { + "epoch": 0.89, + "grad_norm": 0.5495747327804565, + "learning_rate": 0.00047929299610823065, + "loss": 3.2991, + "step": 18126 + }, + { + "epoch": 0.89, + "grad_norm": 0.5496625304222107, + "learning_rate": 0.0004792806517902, + "loss": 3.2042, + "step": 18127 + }, + { + "epoch": 0.89, + "grad_norm": 0.531826913356781, + "learning_rate": 0.00047926830699997853, + "loss": 3.0819, + "step": 18128 + }, + { + "epoch": 0.89, + "grad_norm": 0.5123338103294373, + "learning_rate": 0.00047925596173759895, + "loss": 2.9806, + "step": 18129 + }, + { + "epoch": 0.89, + "grad_norm": 0.561798095703125, + "learning_rate": 0.0004792436160030936, + "loss": 3.012, + "step": 18130 + }, + { + "epoch": 0.89, + "grad_norm": 0.5473758578300476, + "learning_rate": 0.0004792312697964951, + "loss": 3.167, + "step": 18131 + }, + { + "epoch": 0.89, + "grad_norm": 0.5174449682235718, + "learning_rate": 0.000479218923117836, + "loss": 3.0154, + "step": 18132 + }, + { + "epoch": 0.89, + "grad_norm": 0.5553672909736633, + "learning_rate": 0.0004792065759671486, + "loss": 3.0273, + "step": 18133 + }, + { + "epoch": 0.89, + "grad_norm": 0.5386341214179993, + "learning_rate": 0.0004791942283444656, + "loss": 3.0787, + "step": 18134 + }, + { + "epoch": 0.89, + "grad_norm": 0.4873228669166565, + "learning_rate": 0.0004791818802498195, + "loss": 3.0802, + "step": 18135 + }, + { + "epoch": 0.89, + "grad_norm": 0.5535769462585449, + "learning_rate": 0.00047916953168324284, + "loss": 3.0712, + "step": 18136 + }, + { + "epoch": 0.89, + "grad_norm": 0.5669405460357666, + "learning_rate": 0.0004791571826447681, + "loss": 2.7861, + "step": 18137 + }, + { + "epoch": 0.89, + "grad_norm": 0.5906660556793213, + "learning_rate": 0.0004791448331344278, + "loss": 3.2052, + "step": 18138 + }, + { + "epoch": 0.89, + "grad_norm": 0.5243983268737793, + "learning_rate": 0.0004791324831522545, + "loss": 3.0515, + "step": 18139 + }, + { + "epoch": 0.89, + "grad_norm": 0.5486578345298767, + "learning_rate": 0.00047912013269828073, + "loss": 3.2827, + "step": 18140 + }, + { + "epoch": 0.89, + "grad_norm": 0.5488114953041077, + "learning_rate": 0.00047910778177253906, + "loss": 3.2188, + "step": 18141 + }, + { + "epoch": 0.89, + "grad_norm": 0.5279528498649597, + "learning_rate": 0.00047909543037506183, + "loss": 3.2853, + "step": 18142 + }, + { + "epoch": 0.89, + "grad_norm": 0.5320385694503784, + "learning_rate": 0.00047908307850588175, + "loss": 3.0498, + "step": 18143 + }, + { + "epoch": 0.89, + "grad_norm": 0.5765833258628845, + "learning_rate": 0.0004790707261650313, + "loss": 3.1843, + "step": 18144 + }, + { + "epoch": 0.89, + "grad_norm": 0.5396232604980469, + "learning_rate": 0.0004790583733525431, + "loss": 3.358, + "step": 18145 + }, + { + "epoch": 0.89, + "grad_norm": 0.5210521817207336, + "learning_rate": 0.00047904602006844957, + "loss": 3.2112, + "step": 18146 + }, + { + "epoch": 0.89, + "grad_norm": 0.5123770833015442, + "learning_rate": 0.00047903366631278323, + "loss": 2.9842, + "step": 18147 + }, + { + "epoch": 0.89, + "grad_norm": 0.5506312847137451, + "learning_rate": 0.00047902131208557667, + "loss": 3.1214, + "step": 18148 + }, + { + "epoch": 0.89, + "grad_norm": 0.511390745639801, + "learning_rate": 0.00047900895738686245, + "loss": 3.147, + "step": 18149 + }, + { + "epoch": 0.89, + "grad_norm": 0.561761736869812, + "learning_rate": 0.0004789966022166732, + "loss": 3.2444, + "step": 18150 + }, + { + "epoch": 0.89, + "grad_norm": 0.5431965589523315, + "learning_rate": 0.00047898424657504126, + "loss": 3.0698, + "step": 18151 + }, + { + "epoch": 0.89, + "grad_norm": 0.5159698128700256, + "learning_rate": 0.00047897189046199924, + "loss": 3.238, + "step": 18152 + }, + { + "epoch": 0.89, + "grad_norm": 0.5606123208999634, + "learning_rate": 0.0004789595338775797, + "loss": 3.0589, + "step": 18153 + }, + { + "epoch": 0.89, + "grad_norm": 0.5363898873329163, + "learning_rate": 0.0004789471768218152, + "loss": 3.1029, + "step": 18154 + }, + { + "epoch": 0.89, + "grad_norm": 0.5742982029914856, + "learning_rate": 0.00047893481929473826, + "loss": 3.1311, + "step": 18155 + }, + { + "epoch": 0.89, + "grad_norm": 0.5812978744506836, + "learning_rate": 0.00047892246129638147, + "loss": 3.1121, + "step": 18156 + }, + { + "epoch": 0.89, + "grad_norm": 0.5430973768234253, + "learning_rate": 0.00047891010282677735, + "loss": 3.005, + "step": 18157 + }, + { + "epoch": 0.89, + "grad_norm": 0.524570107460022, + "learning_rate": 0.00047889774388595847, + "loss": 3.1939, + "step": 18158 + }, + { + "epoch": 0.89, + "grad_norm": 0.5263261198997498, + "learning_rate": 0.0004788853844739573, + "loss": 3.114, + "step": 18159 + }, + { + "epoch": 0.89, + "grad_norm": 0.5268939137458801, + "learning_rate": 0.00047887302459080644, + "loss": 3.271, + "step": 18160 + }, + { + "epoch": 0.89, + "grad_norm": 0.5581185817718506, + "learning_rate": 0.00047886066423653855, + "loss": 3.1335, + "step": 18161 + }, + { + "epoch": 0.89, + "grad_norm": 0.5777899026870728, + "learning_rate": 0.0004788483034111861, + "loss": 3.2539, + "step": 18162 + }, + { + "epoch": 0.89, + "grad_norm": 0.5424737334251404, + "learning_rate": 0.0004788359421147816, + "loss": 3.352, + "step": 18163 + }, + { + "epoch": 0.89, + "grad_norm": 0.5313791632652283, + "learning_rate": 0.0004788235803473576, + "loss": 3.2132, + "step": 18164 + }, + { + "epoch": 0.89, + "grad_norm": 0.5410112738609314, + "learning_rate": 0.0004788112181089467, + "loss": 3.1526, + "step": 18165 + }, + { + "epoch": 0.89, + "grad_norm": 0.5331111550331116, + "learning_rate": 0.0004787988553995815, + "loss": 3.1902, + "step": 18166 + }, + { + "epoch": 0.89, + "grad_norm": 0.5786098837852478, + "learning_rate": 0.00047878649221929455, + "loss": 3.2107, + "step": 18167 + }, + { + "epoch": 0.89, + "grad_norm": 0.5680441856384277, + "learning_rate": 0.00047877412856811834, + "loss": 3.223, + "step": 18168 + }, + { + "epoch": 0.89, + "grad_norm": 0.5643728971481323, + "learning_rate": 0.0004787617644460855, + "loss": 3.0881, + "step": 18169 + }, + { + "epoch": 0.89, + "grad_norm": 0.526891827583313, + "learning_rate": 0.0004787493998532286, + "loss": 2.929, + "step": 18170 + }, + { + "epoch": 0.89, + "grad_norm": 0.5148626565933228, + "learning_rate": 0.00047873703478958015, + "loss": 3.1915, + "step": 18171 + }, + { + "epoch": 0.89, + "grad_norm": 0.5031622052192688, + "learning_rate": 0.00047872466925517274, + "loss": 3.0245, + "step": 18172 + }, + { + "epoch": 0.89, + "grad_norm": 0.5895242094993591, + "learning_rate": 0.000478712303250039, + "loss": 2.8557, + "step": 18173 + }, + { + "epoch": 0.89, + "grad_norm": 0.4982737898826599, + "learning_rate": 0.0004786999367742114, + "loss": 3.1632, + "step": 18174 + }, + { + "epoch": 0.89, + "grad_norm": 0.5337832570075989, + "learning_rate": 0.00047868756982772265, + "loss": 3.2289, + "step": 18175 + }, + { + "epoch": 0.89, + "grad_norm": 0.5422765612602234, + "learning_rate": 0.0004786752024106051, + "loss": 3.305, + "step": 18176 + }, + { + "epoch": 0.89, + "grad_norm": 0.5369231700897217, + "learning_rate": 0.0004786628345228915, + "loss": 3.1646, + "step": 18177 + }, + { + "epoch": 0.89, + "grad_norm": 0.49044135212898254, + "learning_rate": 0.00047865046616461446, + "loss": 3.1306, + "step": 18178 + }, + { + "epoch": 0.89, + "grad_norm": 0.520637571811676, + "learning_rate": 0.00047863809733580633, + "loss": 3.1229, + "step": 18179 + }, + { + "epoch": 0.89, + "grad_norm": 0.5776728987693787, + "learning_rate": 0.00047862572803649995, + "loss": 3.1429, + "step": 18180 + }, + { + "epoch": 0.89, + "grad_norm": 0.5988162755966187, + "learning_rate": 0.0004786133582667277, + "loss": 3.1521, + "step": 18181 + }, + { + "epoch": 0.89, + "grad_norm": 0.5374999642372131, + "learning_rate": 0.00047860098802652234, + "loss": 3.1968, + "step": 18182 + }, + { + "epoch": 0.89, + "grad_norm": 0.5586443543434143, + "learning_rate": 0.0004785886173159163, + "loss": 3.1005, + "step": 18183 + }, + { + "epoch": 0.89, + "grad_norm": 0.516861081123352, + "learning_rate": 0.00047857624613494216, + "loss": 3.184, + "step": 18184 + }, + { + "epoch": 0.89, + "grad_norm": 0.49465465545654297, + "learning_rate": 0.0004785638744836326, + "loss": 3.3298, + "step": 18185 + }, + { + "epoch": 0.89, + "grad_norm": 0.5254295468330383, + "learning_rate": 0.00047855150236202006, + "loss": 3.2069, + "step": 18186 + }, + { + "epoch": 0.89, + "grad_norm": 0.5072837471961975, + "learning_rate": 0.0004785391297701374, + "loss": 3.2564, + "step": 18187 + }, + { + "epoch": 0.89, + "grad_norm": 0.5069559812545776, + "learning_rate": 0.00047852675670801694, + "loss": 3.0861, + "step": 18188 + }, + { + "epoch": 0.89, + "grad_norm": 0.527495801448822, + "learning_rate": 0.0004785143831756913, + "loss": 3.0807, + "step": 18189 + }, + { + "epoch": 0.89, + "grad_norm": 0.4928951561450958, + "learning_rate": 0.00047850200917319325, + "loss": 3.0653, + "step": 18190 + }, + { + "epoch": 0.89, + "grad_norm": 0.5371759533882141, + "learning_rate": 0.0004784896347005552, + "loss": 3.1952, + "step": 18191 + }, + { + "epoch": 0.89, + "grad_norm": 0.5376279354095459, + "learning_rate": 0.0004784772597578098, + "loss": 3.0298, + "step": 18192 + }, + { + "epoch": 0.89, + "grad_norm": 0.5507842302322388, + "learning_rate": 0.00047846488434498966, + "loss": 3.3163, + "step": 18193 + }, + { + "epoch": 0.89, + "grad_norm": 0.5731067657470703, + "learning_rate": 0.0004784525084621274, + "loss": 3.1126, + "step": 18194 + }, + { + "epoch": 0.89, + "grad_norm": 0.5198032855987549, + "learning_rate": 0.00047844013210925556, + "loss": 3.0856, + "step": 18195 + }, + { + "epoch": 0.89, + "grad_norm": 0.5343044400215149, + "learning_rate": 0.0004784277552864067, + "loss": 3.3659, + "step": 18196 + }, + { + "epoch": 0.89, + "grad_norm": 0.49065908789634705, + "learning_rate": 0.00047841537799361345, + "loss": 3.1321, + "step": 18197 + }, + { + "epoch": 0.89, + "grad_norm": 0.521992027759552, + "learning_rate": 0.0004784030002309085, + "loss": 3.2767, + "step": 18198 + }, + { + "epoch": 0.89, + "grad_norm": 0.5364126563072205, + "learning_rate": 0.0004783906219983244, + "loss": 3.3079, + "step": 18199 + }, + { + "epoch": 0.89, + "grad_norm": 0.5045154690742493, + "learning_rate": 0.0004783782432958937, + "loss": 3.2046, + "step": 18200 + }, + { + "epoch": 0.89, + "grad_norm": 0.5102838277816772, + "learning_rate": 0.000478365864123649, + "loss": 3.0836, + "step": 18201 + }, + { + "epoch": 0.89, + "grad_norm": 0.5909935235977173, + "learning_rate": 0.000478353484481623, + "loss": 2.9596, + "step": 18202 + }, + { + "epoch": 0.89, + "grad_norm": 0.5541250705718994, + "learning_rate": 0.0004783411043698483, + "loss": 3.1793, + "step": 18203 + }, + { + "epoch": 0.89, + "grad_norm": 0.5354369878768921, + "learning_rate": 0.00047832872378835735, + "loss": 3.4151, + "step": 18204 + }, + { + "epoch": 0.89, + "grad_norm": 0.5579158067703247, + "learning_rate": 0.0004783163427371828, + "loss": 3.2288, + "step": 18205 + }, + { + "epoch": 0.89, + "grad_norm": 0.5801526308059692, + "learning_rate": 0.0004783039612163575, + "loss": 3.1398, + "step": 18206 + }, + { + "epoch": 0.89, + "grad_norm": 0.5363543629646301, + "learning_rate": 0.00047829157922591375, + "loss": 3.3956, + "step": 18207 + }, + { + "epoch": 0.89, + "grad_norm": 0.5684135556221008, + "learning_rate": 0.00047827919676588437, + "loss": 3.2425, + "step": 18208 + }, + { + "epoch": 0.89, + "grad_norm": 0.5314901471138, + "learning_rate": 0.0004782668138363019, + "loss": 3.2665, + "step": 18209 + }, + { + "epoch": 0.89, + "grad_norm": 0.5224202871322632, + "learning_rate": 0.0004782544304371988, + "loss": 3.0523, + "step": 18210 + }, + { + "epoch": 0.89, + "grad_norm": 0.5397300124168396, + "learning_rate": 0.00047824204656860794, + "loss": 3.1839, + "step": 18211 + }, + { + "epoch": 0.89, + "grad_norm": 0.5352901220321655, + "learning_rate": 0.0004782296622305619, + "loss": 3.2884, + "step": 18212 + }, + { + "epoch": 0.89, + "grad_norm": 0.5520905256271362, + "learning_rate": 0.0004782172774230932, + "loss": 3.0375, + "step": 18213 + }, + { + "epoch": 0.89, + "grad_norm": 0.5324362516403198, + "learning_rate": 0.0004782048921462344, + "loss": 3.2757, + "step": 18214 + }, + { + "epoch": 0.89, + "grad_norm": 0.5429298281669617, + "learning_rate": 0.00047819250640001833, + "loss": 3.2702, + "step": 18215 + }, + { + "epoch": 0.89, + "grad_norm": 0.5223371386528015, + "learning_rate": 0.00047818012018447737, + "loss": 3.135, + "step": 18216 + }, + { + "epoch": 0.89, + "grad_norm": 0.5844689607620239, + "learning_rate": 0.0004781677334996443, + "loss": 3.1293, + "step": 18217 + }, + { + "epoch": 0.89, + "grad_norm": 0.513873279094696, + "learning_rate": 0.00047815534634555187, + "loss": 3.1109, + "step": 18218 + }, + { + "epoch": 0.89, + "grad_norm": 0.522002637386322, + "learning_rate": 0.00047814295872223233, + "loss": 3.249, + "step": 18219 + }, + { + "epoch": 0.89, + "grad_norm": 0.4845925569534302, + "learning_rate": 0.0004781305706297186, + "loss": 3.1046, + "step": 18220 + }, + { + "epoch": 0.89, + "grad_norm": 0.5436228513717651, + "learning_rate": 0.00047811818206804324, + "loss": 3.1851, + "step": 18221 + }, + { + "epoch": 0.89, + "grad_norm": 0.4941715598106384, + "learning_rate": 0.0004781057930372389, + "loss": 3.3141, + "step": 18222 + }, + { + "epoch": 0.89, + "grad_norm": 0.5171759128570557, + "learning_rate": 0.0004780934035373381, + "loss": 3.3435, + "step": 18223 + }, + { + "epoch": 0.89, + "grad_norm": 0.5393630862236023, + "learning_rate": 0.0004780810135683736, + "loss": 3.1683, + "step": 18224 + }, + { + "epoch": 0.89, + "grad_norm": 0.5018099546432495, + "learning_rate": 0.0004780686231303779, + "loss": 3.2249, + "step": 18225 + }, + { + "epoch": 0.89, + "grad_norm": 0.5250770449638367, + "learning_rate": 0.0004780562322233839, + "loss": 3.3423, + "step": 18226 + }, + { + "epoch": 0.89, + "grad_norm": 0.5291233658790588, + "learning_rate": 0.0004780438408474239, + "loss": 3.191, + "step": 18227 + }, + { + "epoch": 0.89, + "grad_norm": 0.521236002445221, + "learning_rate": 0.00047803144900253076, + "loss": 2.9488, + "step": 18228 + }, + { + "epoch": 0.89, + "grad_norm": 0.5012958645820618, + "learning_rate": 0.000478019056688737, + "loss": 3.3888, + "step": 18229 + }, + { + "epoch": 0.89, + "grad_norm": 0.5145010352134705, + "learning_rate": 0.0004780066639060753, + "loss": 3.2967, + "step": 18230 + }, + { + "epoch": 0.89, + "grad_norm": 0.5077473521232605, + "learning_rate": 0.0004779942706545783, + "loss": 3.1262, + "step": 18231 + }, + { + "epoch": 0.89, + "grad_norm": 0.4998171627521515, + "learning_rate": 0.00047798187693427876, + "loss": 3.258, + "step": 18232 + }, + { + "epoch": 0.89, + "grad_norm": 0.5533021688461304, + "learning_rate": 0.000477969482745209, + "loss": 3.0468, + "step": 18233 + }, + { + "epoch": 0.89, + "grad_norm": 0.5420917868614197, + "learning_rate": 0.0004779570880874021, + "loss": 3.2048, + "step": 18234 + }, + { + "epoch": 0.89, + "grad_norm": 0.5190141201019287, + "learning_rate": 0.0004779446929608904, + "loss": 3.2259, + "step": 18235 + }, + { + "epoch": 0.89, + "grad_norm": 0.5503576993942261, + "learning_rate": 0.00047793229736570647, + "loss": 3.3452, + "step": 18236 + }, + { + "epoch": 0.89, + "grad_norm": 0.4898211359977722, + "learning_rate": 0.0004779199013018833, + "loss": 3.0858, + "step": 18237 + }, + { + "epoch": 0.89, + "grad_norm": 0.5298957824707031, + "learning_rate": 0.0004779075047694533, + "loss": 3.1858, + "step": 18238 + }, + { + "epoch": 0.89, + "grad_norm": 0.5395136475563049, + "learning_rate": 0.00047789510776844916, + "loss": 3.1886, + "step": 18239 + }, + { + "epoch": 0.89, + "grad_norm": 0.5132995247840881, + "learning_rate": 0.0004778827102989035, + "loss": 2.9574, + "step": 18240 + }, + { + "epoch": 0.89, + "grad_norm": 0.5644649267196655, + "learning_rate": 0.0004778703123608491, + "loss": 3.2534, + "step": 18241 + }, + { + "epoch": 0.89, + "grad_norm": 0.5382485389709473, + "learning_rate": 0.0004778579139543184, + "loss": 3.4383, + "step": 18242 + }, + { + "epoch": 0.89, + "grad_norm": 0.485053688287735, + "learning_rate": 0.0004778455150793444, + "loss": 3.2485, + "step": 18243 + }, + { + "epoch": 0.89, + "grad_norm": 0.49913448095321655, + "learning_rate": 0.0004778331157359594, + "loss": 3.0871, + "step": 18244 + }, + { + "epoch": 0.89, + "grad_norm": 0.550160825252533, + "learning_rate": 0.0004778207159241962, + "loss": 3.0416, + "step": 18245 + }, + { + "epoch": 0.89, + "grad_norm": 0.507668137550354, + "learning_rate": 0.0004778083156440874, + "loss": 3.0399, + "step": 18246 + }, + { + "epoch": 0.89, + "grad_norm": 0.5110635757446289, + "learning_rate": 0.00047779591489566583, + "loss": 2.992, + "step": 18247 + }, + { + "epoch": 0.89, + "grad_norm": 0.5091555714607239, + "learning_rate": 0.000477783513678964, + "loss": 3.0346, + "step": 18248 + }, + { + "epoch": 0.89, + "grad_norm": 0.5132473111152649, + "learning_rate": 0.00047777111199401464, + "loss": 3.1061, + "step": 18249 + }, + { + "epoch": 0.89, + "grad_norm": 0.5340614914894104, + "learning_rate": 0.0004777587098408503, + "loss": 3.0863, + "step": 18250 + }, + { + "epoch": 0.89, + "grad_norm": 0.5357064008712769, + "learning_rate": 0.0004777463072195037, + "loss": 3.3543, + "step": 18251 + }, + { + "epoch": 0.89, + "grad_norm": 0.5139802098274231, + "learning_rate": 0.0004777339041300077, + "loss": 3.2554, + "step": 18252 + }, + { + "epoch": 0.89, + "grad_norm": 0.5598965287208557, + "learning_rate": 0.0004777215005723947, + "loss": 2.9029, + "step": 18253 + }, + { + "epoch": 0.89, + "grad_norm": 0.5134537220001221, + "learning_rate": 0.0004777090965466975, + "loss": 3.0175, + "step": 18254 + }, + { + "epoch": 0.89, + "grad_norm": 0.6165127754211426, + "learning_rate": 0.0004776966920529487, + "loss": 3.15, + "step": 18255 + }, + { + "epoch": 0.89, + "grad_norm": 0.5549294352531433, + "learning_rate": 0.00047768428709118094, + "loss": 3.2095, + "step": 18256 + }, + { + "epoch": 0.89, + "grad_norm": 0.5079606771469116, + "learning_rate": 0.0004776718816614271, + "loss": 3.1343, + "step": 18257 + }, + { + "epoch": 0.89, + "grad_norm": 0.5567631125450134, + "learning_rate": 0.0004776594757637197, + "loss": 3.0041, + "step": 18258 + }, + { + "epoch": 0.89, + "grad_norm": 0.528123140335083, + "learning_rate": 0.00047764706939809143, + "loss": 3.1465, + "step": 18259 + }, + { + "epoch": 0.89, + "grad_norm": 0.5787830352783203, + "learning_rate": 0.000477634662564575, + "loss": 3.06, + "step": 18260 + }, + { + "epoch": 0.89, + "grad_norm": 0.5511603355407715, + "learning_rate": 0.00047762225526320297, + "loss": 3.1724, + "step": 18261 + }, + { + "epoch": 0.89, + "grad_norm": 0.5570658445358276, + "learning_rate": 0.00047760984749400806, + "loss": 3.219, + "step": 18262 + }, + { + "epoch": 0.9, + "grad_norm": 0.5181900858879089, + "learning_rate": 0.00047759743925702313, + "loss": 3.1698, + "step": 18263 + }, + { + "epoch": 0.9, + "grad_norm": 0.5297556519508362, + "learning_rate": 0.00047758503055228064, + "loss": 3.0683, + "step": 18264 + }, + { + "epoch": 0.9, + "grad_norm": 0.5381805896759033, + "learning_rate": 0.0004775726213798134, + "loss": 3.1135, + "step": 18265 + }, + { + "epoch": 0.9, + "grad_norm": 0.5791769027709961, + "learning_rate": 0.000477560211739654, + "loss": 3.2524, + "step": 18266 + }, + { + "epoch": 0.9, + "grad_norm": 0.5343177914619446, + "learning_rate": 0.0004775478016318352, + "loss": 3.055, + "step": 18267 + }, + { + "epoch": 0.9, + "grad_norm": 0.5313341021537781, + "learning_rate": 0.00047753539105638965, + "loss": 3.1583, + "step": 18268 + }, + { + "epoch": 0.9, + "grad_norm": 0.5037796497344971, + "learning_rate": 0.00047752298001335007, + "loss": 3.2608, + "step": 18269 + }, + { + "epoch": 0.9, + "grad_norm": 0.5314697027206421, + "learning_rate": 0.0004775105685027491, + "loss": 3.0916, + "step": 18270 + }, + { + "epoch": 0.9, + "grad_norm": 0.5548476576805115, + "learning_rate": 0.0004774981565246195, + "loss": 3.1843, + "step": 18271 + }, + { + "epoch": 0.9, + "grad_norm": 0.5024945735931396, + "learning_rate": 0.00047748574407899394, + "loss": 3.1704, + "step": 18272 + }, + { + "epoch": 0.9, + "grad_norm": 0.5327395796775818, + "learning_rate": 0.000477473331165905, + "loss": 3.1438, + "step": 18273 + }, + { + "epoch": 0.9, + "grad_norm": 0.5295423269271851, + "learning_rate": 0.00047746091778538553, + "loss": 3.0959, + "step": 18274 + }, + { + "epoch": 0.9, + "grad_norm": 0.5354325771331787, + "learning_rate": 0.0004774485039374681, + "loss": 3.1872, + "step": 18275 + }, + { + "epoch": 0.9, + "grad_norm": 0.5765413641929626, + "learning_rate": 0.0004774360896221855, + "loss": 3.0475, + "step": 18276 + }, + { + "epoch": 0.9, + "grad_norm": 0.5822243690490723, + "learning_rate": 0.0004774236748395704, + "loss": 3.0958, + "step": 18277 + }, + { + "epoch": 0.9, + "grad_norm": 0.5113519430160522, + "learning_rate": 0.0004774112595896554, + "loss": 3.2856, + "step": 18278 + }, + { + "epoch": 0.9, + "grad_norm": 0.5153071284294128, + "learning_rate": 0.00047739884387247334, + "loss": 3.3148, + "step": 18279 + }, + { + "epoch": 0.9, + "grad_norm": 0.5023435354232788, + "learning_rate": 0.0004773864276880569, + "loss": 3.1991, + "step": 18280 + }, + { + "epoch": 0.9, + "grad_norm": 0.5883546471595764, + "learning_rate": 0.00047737401103643866, + "loss": 3.0173, + "step": 18281 + }, + { + "epoch": 0.9, + "grad_norm": 0.51036536693573, + "learning_rate": 0.0004773615939176515, + "loss": 3.1579, + "step": 18282 + }, + { + "epoch": 0.9, + "grad_norm": 0.5233753323554993, + "learning_rate": 0.00047734917633172804, + "loss": 3.2266, + "step": 18283 + }, + { + "epoch": 0.9, + "grad_norm": 0.5153245329856873, + "learning_rate": 0.00047733675827870087, + "loss": 3.3655, + "step": 18284 + }, + { + "epoch": 0.9, + "grad_norm": 0.5131426453590393, + "learning_rate": 0.000477324339758603, + "loss": 3.2084, + "step": 18285 + }, + { + "epoch": 0.9, + "grad_norm": 0.532879650592804, + "learning_rate": 0.00047731192077146673, + "loss": 3.2726, + "step": 18286 + }, + { + "epoch": 0.9, + "grad_norm": 0.547607421875, + "learning_rate": 0.00047729950131732515, + "loss": 3.2178, + "step": 18287 + }, + { + "epoch": 0.9, + "grad_norm": 0.5815809965133667, + "learning_rate": 0.0004772870813962107, + "loss": 3.0909, + "step": 18288 + }, + { + "epoch": 0.9, + "grad_norm": 0.5434556603431702, + "learning_rate": 0.00047727466100815617, + "loss": 3.1988, + "step": 18289 + }, + { + "epoch": 0.9, + "grad_norm": 0.5472313761711121, + "learning_rate": 0.0004772622401531944, + "loss": 3.2903, + "step": 18290 + }, + { + "epoch": 0.9, + "grad_norm": 0.5081930160522461, + "learning_rate": 0.000477249818831358, + "loss": 3.1709, + "step": 18291 + }, + { + "epoch": 0.9, + "grad_norm": 0.5110074281692505, + "learning_rate": 0.00047723739704267964, + "loss": 3.4036, + "step": 18292 + }, + { + "epoch": 0.9, + "grad_norm": 0.5390797257423401, + "learning_rate": 0.0004772249747871921, + "loss": 3.2653, + "step": 18293 + }, + { + "epoch": 0.9, + "grad_norm": 0.5095894932746887, + "learning_rate": 0.00047721255206492814, + "loss": 3.3201, + "step": 18294 + }, + { + "epoch": 0.9, + "grad_norm": 0.5310311913490295, + "learning_rate": 0.00047720012887592035, + "loss": 3.1775, + "step": 18295 + }, + { + "epoch": 0.9, + "grad_norm": 0.5089913606643677, + "learning_rate": 0.0004771877052202016, + "loss": 3.095, + "step": 18296 + }, + { + "epoch": 0.9, + "grad_norm": 0.5221785306930542, + "learning_rate": 0.0004771752810978044, + "loss": 2.9333, + "step": 18297 + }, + { + "epoch": 0.9, + "grad_norm": 0.5876374244689941, + "learning_rate": 0.0004771628565087617, + "loss": 3.179, + "step": 18298 + }, + { + "epoch": 0.9, + "grad_norm": 0.5047369599342346, + "learning_rate": 0.0004771504314531061, + "loss": 3.1365, + "step": 18299 + }, + { + "epoch": 0.9, + "grad_norm": 0.5240013003349304, + "learning_rate": 0.0004771380059308705, + "loss": 3.3203, + "step": 18300 + }, + { + "epoch": 0.9, + "grad_norm": 0.5100141763687134, + "learning_rate": 0.0004771255799420873, + "loss": 3.1165, + "step": 18301 + }, + { + "epoch": 0.9, + "grad_norm": 0.5566077828407288, + "learning_rate": 0.0004771131534867894, + "loss": 3.1252, + "step": 18302 + }, + { + "epoch": 0.9, + "grad_norm": 0.5329804420471191, + "learning_rate": 0.00047710072656500965, + "loss": 3.0706, + "step": 18303 + }, + { + "epoch": 0.9, + "grad_norm": 0.5710445642471313, + "learning_rate": 0.00047708829917678065, + "loss": 3.086, + "step": 18304 + }, + { + "epoch": 0.9, + "grad_norm": 0.5329696536064148, + "learning_rate": 0.00047707587132213514, + "loss": 3.0563, + "step": 18305 + }, + { + "epoch": 0.9, + "grad_norm": 0.5192325115203857, + "learning_rate": 0.00047706344300110586, + "loss": 3.245, + "step": 18306 + }, + { + "epoch": 0.9, + "grad_norm": 0.5296958088874817, + "learning_rate": 0.00047705101421372556, + "loss": 3.0319, + "step": 18307 + }, + { + "epoch": 0.9, + "grad_norm": 0.5326006412506104, + "learning_rate": 0.00047703858496002697, + "loss": 3.1762, + "step": 18308 + }, + { + "epoch": 0.9, + "grad_norm": 0.5101357698440552, + "learning_rate": 0.0004770261552400428, + "loss": 3.0583, + "step": 18309 + }, + { + "epoch": 0.9, + "grad_norm": 0.5171705484390259, + "learning_rate": 0.0004770137250538058, + "loss": 3.0426, + "step": 18310 + }, + { + "epoch": 0.9, + "grad_norm": 0.5427629351615906, + "learning_rate": 0.0004770012944013487, + "loss": 3.1091, + "step": 18311 + }, + { + "epoch": 0.9, + "grad_norm": 0.5223405957221985, + "learning_rate": 0.0004769888632827043, + "loss": 3.0479, + "step": 18312 + }, + { + "epoch": 0.9, + "grad_norm": 0.5293853282928467, + "learning_rate": 0.0004769764316979052, + "loss": 3.3125, + "step": 18313 + }, + { + "epoch": 0.9, + "grad_norm": 0.5013025999069214, + "learning_rate": 0.00047696399964698434, + "loss": 3.0253, + "step": 18314 + }, + { + "epoch": 0.9, + "grad_norm": 0.5177825093269348, + "learning_rate": 0.0004769515671299743, + "loss": 3.0815, + "step": 18315 + }, + { + "epoch": 0.9, + "grad_norm": 0.5152139663696289, + "learning_rate": 0.00047693913414690795, + "loss": 3.1761, + "step": 18316 + }, + { + "epoch": 0.9, + "grad_norm": 0.5001190304756165, + "learning_rate": 0.000476926700697818, + "loss": 3.0974, + "step": 18317 + }, + { + "epoch": 0.9, + "grad_norm": 0.5169618129730225, + "learning_rate": 0.00047691426678273706, + "loss": 3.0948, + "step": 18318 + }, + { + "epoch": 0.9, + "grad_norm": 0.49277180433273315, + "learning_rate": 0.00047690183240169803, + "loss": 2.9982, + "step": 18319 + }, + { + "epoch": 0.9, + "grad_norm": 0.536400318145752, + "learning_rate": 0.00047688939755473363, + "loss": 3.2277, + "step": 18320 + }, + { + "epoch": 0.9, + "grad_norm": 0.4991290271282196, + "learning_rate": 0.0004768769622418766, + "loss": 3.0983, + "step": 18321 + }, + { + "epoch": 0.9, + "grad_norm": 0.6011770963668823, + "learning_rate": 0.0004768645264631597, + "loss": 3.1713, + "step": 18322 + }, + { + "epoch": 0.9, + "grad_norm": 0.4987938404083252, + "learning_rate": 0.00047685209021861567, + "loss": 3.1428, + "step": 18323 + }, + { + "epoch": 0.9, + "grad_norm": 0.507752537727356, + "learning_rate": 0.0004768396535082773, + "loss": 3.2872, + "step": 18324 + }, + { + "epoch": 0.9, + "grad_norm": 0.5121263265609741, + "learning_rate": 0.0004768272163321772, + "loss": 2.9893, + "step": 18325 + }, + { + "epoch": 0.9, + "grad_norm": 0.5051730275154114, + "learning_rate": 0.00047681477869034836, + "loss": 3.1656, + "step": 18326 + }, + { + "epoch": 0.9, + "grad_norm": 0.5293095111846924, + "learning_rate": 0.0004768023405828235, + "loss": 2.9227, + "step": 18327 + }, + { + "epoch": 0.9, + "grad_norm": 0.5156005620956421, + "learning_rate": 0.0004767899020096351, + "loss": 3.2264, + "step": 18328 + }, + { + "epoch": 0.9, + "grad_norm": 0.5265857577323914, + "learning_rate": 0.0004767774629708162, + "loss": 3.1721, + "step": 18329 + }, + { + "epoch": 0.9, + "grad_norm": 0.5211267471313477, + "learning_rate": 0.0004767650234663995, + "loss": 3.0617, + "step": 18330 + }, + { + "epoch": 0.9, + "grad_norm": 0.5150598883628845, + "learning_rate": 0.00047675258349641773, + "loss": 3.2196, + "step": 18331 + }, + { + "epoch": 0.9, + "grad_norm": 0.5152638554573059, + "learning_rate": 0.00047674014306090375, + "loss": 3.3045, + "step": 18332 + }, + { + "epoch": 0.9, + "grad_norm": 0.5148764252662659, + "learning_rate": 0.0004767277021598901, + "loss": 3.0967, + "step": 18333 + }, + { + "epoch": 0.9, + "grad_norm": 0.5162348747253418, + "learning_rate": 0.0004767152607934098, + "loss": 2.8442, + "step": 18334 + }, + { + "epoch": 0.9, + "grad_norm": 0.5254623889923096, + "learning_rate": 0.00047670281896149553, + "loss": 3.1644, + "step": 18335 + }, + { + "epoch": 0.9, + "grad_norm": 0.5040985345840454, + "learning_rate": 0.00047669037666418, + "loss": 2.972, + "step": 18336 + }, + { + "epoch": 0.9, + "grad_norm": 0.5320616960525513, + "learning_rate": 0.00047667793390149607, + "loss": 3.3935, + "step": 18337 + }, + { + "epoch": 0.9, + "grad_norm": 0.5272945761680603, + "learning_rate": 0.0004766654906734764, + "loss": 3.1817, + "step": 18338 + }, + { + "epoch": 0.9, + "grad_norm": 0.5453786253929138, + "learning_rate": 0.0004766530469801538, + "loss": 3.1222, + "step": 18339 + }, + { + "epoch": 0.9, + "grad_norm": 0.5060163140296936, + "learning_rate": 0.00047664060282156113, + "loss": 3.1181, + "step": 18340 + }, + { + "epoch": 0.9, + "grad_norm": 0.5192004442214966, + "learning_rate": 0.00047662815819773106, + "loss": 3.4064, + "step": 18341 + }, + { + "epoch": 0.9, + "grad_norm": 0.5070635080337524, + "learning_rate": 0.00047661571310869655, + "loss": 3.1392, + "step": 18342 + }, + { + "epoch": 0.9, + "grad_norm": 0.5369440317153931, + "learning_rate": 0.0004766032675544901, + "loss": 3.0245, + "step": 18343 + }, + { + "epoch": 0.9, + "grad_norm": 0.5121726393699646, + "learning_rate": 0.0004765908215351446, + "loss": 3.2891, + "step": 18344 + }, + { + "epoch": 0.9, + "grad_norm": 0.5277031064033508, + "learning_rate": 0.000476578375050693, + "loss": 3.0794, + "step": 18345 + }, + { + "epoch": 0.9, + "grad_norm": 0.5574108958244324, + "learning_rate": 0.00047656592810116794, + "loss": 2.8832, + "step": 18346 + }, + { + "epoch": 0.9, + "grad_norm": 0.5462555885314941, + "learning_rate": 0.0004765534806866021, + "loss": 3.0332, + "step": 18347 + }, + { + "epoch": 0.9, + "grad_norm": 0.5030328035354614, + "learning_rate": 0.0004765410328070284, + "loss": 3.347, + "step": 18348 + }, + { + "epoch": 0.9, + "grad_norm": 0.5085557103157043, + "learning_rate": 0.00047652858446247954, + "loss": 3.1421, + "step": 18349 + }, + { + "epoch": 0.9, + "grad_norm": 0.5109598636627197, + "learning_rate": 0.00047651613565298853, + "loss": 3.2073, + "step": 18350 + }, + { + "epoch": 0.9, + "grad_norm": 0.5085839033126831, + "learning_rate": 0.00047650368637858784, + "loss": 3.1515, + "step": 18351 + }, + { + "epoch": 0.9, + "grad_norm": 0.524103581905365, + "learning_rate": 0.00047649123663931047, + "loss": 3.1561, + "step": 18352 + }, + { + "epoch": 0.9, + "grad_norm": 0.5291694402694702, + "learning_rate": 0.0004764787864351892, + "loss": 3.1722, + "step": 18353 + }, + { + "epoch": 0.9, + "grad_norm": 0.590645432472229, + "learning_rate": 0.0004764663357662567, + "loss": 3.1482, + "step": 18354 + }, + { + "epoch": 0.9, + "grad_norm": 0.5344750285148621, + "learning_rate": 0.0004764538846325458, + "loss": 3.1071, + "step": 18355 + }, + { + "epoch": 0.9, + "grad_norm": 0.5120199918746948, + "learning_rate": 0.00047644143303408935, + "loss": 3.0998, + "step": 18356 + }, + { + "epoch": 0.9, + "grad_norm": 0.5367302894592285, + "learning_rate": 0.00047642898097092015, + "loss": 3.2283, + "step": 18357 + }, + { + "epoch": 0.9, + "grad_norm": 0.5890613794326782, + "learning_rate": 0.000476416528443071, + "loss": 3.1638, + "step": 18358 + }, + { + "epoch": 0.9, + "grad_norm": 0.5009247660636902, + "learning_rate": 0.0004764040754505746, + "loss": 3.2533, + "step": 18359 + }, + { + "epoch": 0.9, + "grad_norm": 0.528739333152771, + "learning_rate": 0.00047639162199346384, + "loss": 3.0933, + "step": 18360 + }, + { + "epoch": 0.9, + "grad_norm": 0.5243030786514282, + "learning_rate": 0.00047637916807177153, + "loss": 3.1643, + "step": 18361 + }, + { + "epoch": 0.9, + "grad_norm": 0.5276967287063599, + "learning_rate": 0.0004763667136855304, + "loss": 3.2262, + "step": 18362 + }, + { + "epoch": 0.9, + "grad_norm": 0.5526394844055176, + "learning_rate": 0.0004763542588347733, + "loss": 3.0935, + "step": 18363 + }, + { + "epoch": 0.9, + "grad_norm": 0.5326481461524963, + "learning_rate": 0.00047634180351953295, + "loss": 3.1979, + "step": 18364 + }, + { + "epoch": 0.9, + "grad_norm": 0.5490707159042358, + "learning_rate": 0.0004763293477398423, + "loss": 3.137, + "step": 18365 + }, + { + "epoch": 0.9, + "grad_norm": 0.5224721431732178, + "learning_rate": 0.0004763168914957341, + "loss": 3.255, + "step": 18366 + }, + { + "epoch": 0.9, + "grad_norm": 0.5512878894805908, + "learning_rate": 0.0004763044347872412, + "loss": 2.9361, + "step": 18367 + }, + { + "epoch": 0.9, + "grad_norm": 0.5557450652122498, + "learning_rate": 0.00047629197761439617, + "loss": 3.1725, + "step": 18368 + }, + { + "epoch": 0.9, + "grad_norm": 0.5146492719650269, + "learning_rate": 0.0004762795199772321, + "loss": 3.3767, + "step": 18369 + }, + { + "epoch": 0.9, + "grad_norm": 0.6718878746032715, + "learning_rate": 0.00047626706187578173, + "loss": 2.9582, + "step": 18370 + }, + { + "epoch": 0.9, + "grad_norm": 0.5548587441444397, + "learning_rate": 0.0004762546033100778, + "loss": 3.1481, + "step": 18371 + }, + { + "epoch": 0.9, + "grad_norm": 0.5026013255119324, + "learning_rate": 0.0004762421442801532, + "loss": 3.3895, + "step": 18372 + }, + { + "epoch": 0.9, + "grad_norm": 0.5194470286369324, + "learning_rate": 0.00047622968478604064, + "loss": 3.5001, + "step": 18373 + }, + { + "epoch": 0.9, + "grad_norm": 0.5245919823646545, + "learning_rate": 0.00047621722482777307, + "loss": 3.1301, + "step": 18374 + }, + { + "epoch": 0.9, + "grad_norm": 0.5411098599433899, + "learning_rate": 0.00047620476440538316, + "loss": 3.0287, + "step": 18375 + }, + { + "epoch": 0.9, + "grad_norm": 0.5193166732788086, + "learning_rate": 0.000476192303518904, + "loss": 3.1614, + "step": 18376 + }, + { + "epoch": 0.9, + "grad_norm": 0.5470539927482605, + "learning_rate": 0.000476179842168368, + "loss": 3.184, + "step": 18377 + }, + { + "epoch": 0.9, + "grad_norm": 0.5132575631141663, + "learning_rate": 0.0004761673803538084, + "loss": 3.0766, + "step": 18378 + }, + { + "epoch": 0.9, + "grad_norm": 0.5734747648239136, + "learning_rate": 0.00047615491807525764, + "loss": 3.2, + "step": 18379 + }, + { + "epoch": 0.9, + "grad_norm": 0.536043107509613, + "learning_rate": 0.0004761424553327488, + "loss": 3.2023, + "step": 18380 + }, + { + "epoch": 0.9, + "grad_norm": 0.5419288277626038, + "learning_rate": 0.00047612999212631464, + "loss": 3.2887, + "step": 18381 + }, + { + "epoch": 0.9, + "grad_norm": 0.5045163631439209, + "learning_rate": 0.00047611752845598803, + "loss": 3.1443, + "step": 18382 + }, + { + "epoch": 0.9, + "grad_norm": 0.5081565976142883, + "learning_rate": 0.0004761050643218017, + "loss": 3.0771, + "step": 18383 + }, + { + "epoch": 0.9, + "grad_norm": 0.5471091866493225, + "learning_rate": 0.00047609259972378843, + "loss": 3.1064, + "step": 18384 + }, + { + "epoch": 0.9, + "grad_norm": 0.5175535082817078, + "learning_rate": 0.00047608013466198125, + "loss": 3.1124, + "step": 18385 + }, + { + "epoch": 0.9, + "grad_norm": 0.5380799174308777, + "learning_rate": 0.0004760676691364129, + "loss": 2.9918, + "step": 18386 + }, + { + "epoch": 0.9, + "grad_norm": 0.5299926996231079, + "learning_rate": 0.0004760552031471162, + "loss": 3.3698, + "step": 18387 + }, + { + "epoch": 0.9, + "grad_norm": 0.5114349722862244, + "learning_rate": 0.00047604273669412387, + "loss": 2.9142, + "step": 18388 + }, + { + "epoch": 0.9, + "grad_norm": 0.5319020748138428, + "learning_rate": 0.000476030269777469, + "loss": 3.3346, + "step": 18389 + }, + { + "epoch": 0.9, + "grad_norm": 0.5370768904685974, + "learning_rate": 0.0004760178023971841, + "loss": 3.076, + "step": 18390 + }, + { + "epoch": 0.9, + "grad_norm": 0.5257225036621094, + "learning_rate": 0.0004760053345533023, + "loss": 3.0894, + "step": 18391 + }, + { + "epoch": 0.9, + "grad_norm": 0.5191155076026917, + "learning_rate": 0.0004759928662458562, + "loss": 3.0241, + "step": 18392 + }, + { + "epoch": 0.9, + "grad_norm": 0.5163692831993103, + "learning_rate": 0.0004759803974748789, + "loss": 3.0367, + "step": 18393 + }, + { + "epoch": 0.9, + "grad_norm": 0.5202220678329468, + "learning_rate": 0.00047596792824040315, + "loss": 3.1828, + "step": 18394 + }, + { + "epoch": 0.9, + "grad_norm": 0.5226185321807861, + "learning_rate": 0.0004759554585424616, + "loss": 3.1161, + "step": 18395 + }, + { + "epoch": 0.9, + "grad_norm": 0.5085725784301758, + "learning_rate": 0.0004759429883810873, + "loss": 3.1653, + "step": 18396 + }, + { + "epoch": 0.9, + "grad_norm": 0.5269706845283508, + "learning_rate": 0.000475930517756313, + "loss": 3.3381, + "step": 18397 + }, + { + "epoch": 0.9, + "grad_norm": 0.5295675992965698, + "learning_rate": 0.00047591804666817164, + "loss": 3.1354, + "step": 18398 + }, + { + "epoch": 0.9, + "grad_norm": 0.5254941582679749, + "learning_rate": 0.00047590557511669596, + "loss": 3.4621, + "step": 18399 + }, + { + "epoch": 0.9, + "grad_norm": 0.4832818806171417, + "learning_rate": 0.00047589310310191873, + "loss": 3.0591, + "step": 18400 + }, + { + "epoch": 0.9, + "grad_norm": 0.5161293745040894, + "learning_rate": 0.0004758806306238731, + "loss": 3.2809, + "step": 18401 + }, + { + "epoch": 0.9, + "grad_norm": 0.5065767168998718, + "learning_rate": 0.0004758681576825916, + "loss": 3.2523, + "step": 18402 + }, + { + "epoch": 0.9, + "grad_norm": 0.48637884855270386, + "learning_rate": 0.0004758556842781074, + "loss": 3.2953, + "step": 18403 + }, + { + "epoch": 0.9, + "grad_norm": 0.5562406778335571, + "learning_rate": 0.000475843210410453, + "loss": 3.0296, + "step": 18404 + }, + { + "epoch": 0.9, + "grad_norm": 0.553834855556488, + "learning_rate": 0.0004758307360796615, + "loss": 3.3277, + "step": 18405 + }, + { + "epoch": 0.9, + "grad_norm": 0.5108713507652283, + "learning_rate": 0.00047581826128576557, + "loss": 3.1895, + "step": 18406 + }, + { + "epoch": 0.9, + "grad_norm": 0.4815329909324646, + "learning_rate": 0.0004758057860287984, + "loss": 3.2556, + "step": 18407 + }, + { + "epoch": 0.9, + "grad_norm": 0.5623984336853027, + "learning_rate": 0.00047579331030879246, + "loss": 3.0558, + "step": 18408 + }, + { + "epoch": 0.9, + "grad_norm": 0.5089913010597229, + "learning_rate": 0.00047578083412578085, + "loss": 3.1773, + "step": 18409 + }, + { + "epoch": 0.9, + "grad_norm": 0.5331538319587708, + "learning_rate": 0.00047576835747979626, + "loss": 3.1828, + "step": 18410 + }, + { + "epoch": 0.9, + "grad_norm": 0.519395112991333, + "learning_rate": 0.0004757558803708717, + "loss": 3.0746, + "step": 18411 + }, + { + "epoch": 0.9, + "grad_norm": 0.538934051990509, + "learning_rate": 0.00047574340279903993, + "loss": 3.1324, + "step": 18412 + }, + { + "epoch": 0.9, + "grad_norm": 0.48608413338661194, + "learning_rate": 0.0004757309247643339, + "loss": 3.0911, + "step": 18413 + }, + { + "epoch": 0.9, + "grad_norm": 0.5462820529937744, + "learning_rate": 0.0004757184462667865, + "loss": 3.2101, + "step": 18414 + }, + { + "epoch": 0.9, + "grad_norm": 0.5184861421585083, + "learning_rate": 0.00047570596730643036, + "loss": 3.2584, + "step": 18415 + }, + { + "epoch": 0.9, + "grad_norm": 0.5258368253707886, + "learning_rate": 0.0004756934878832987, + "loss": 3.1552, + "step": 18416 + }, + { + "epoch": 0.9, + "grad_norm": 0.5133891701698303, + "learning_rate": 0.0004756810079974241, + "loss": 3.2167, + "step": 18417 + }, + { + "epoch": 0.9, + "grad_norm": 0.49524182081222534, + "learning_rate": 0.0004756685276488396, + "loss": 2.9747, + "step": 18418 + }, + { + "epoch": 0.9, + "grad_norm": 0.5075833797454834, + "learning_rate": 0.000475656046837578, + "loss": 3.0269, + "step": 18419 + }, + { + "epoch": 0.9, + "grad_norm": 0.4994834065437317, + "learning_rate": 0.0004756435655636721, + "loss": 3.2545, + "step": 18420 + }, + { + "epoch": 0.9, + "grad_norm": 0.5248388648033142, + "learning_rate": 0.00047563108382715487, + "loss": 3.1916, + "step": 18421 + }, + { + "epoch": 0.9, + "grad_norm": 0.5069358944892883, + "learning_rate": 0.00047561860162805925, + "loss": 3.0174, + "step": 18422 + }, + { + "epoch": 0.9, + "grad_norm": 0.5556639432907104, + "learning_rate": 0.0004756061189664179, + "loss": 3.0031, + "step": 18423 + }, + { + "epoch": 0.9, + "grad_norm": 0.49170437455177307, + "learning_rate": 0.00047559363584226394, + "loss": 3.2966, + "step": 18424 + }, + { + "epoch": 0.9, + "grad_norm": 0.5303202867507935, + "learning_rate": 0.0004755811522556301, + "loss": 2.9769, + "step": 18425 + }, + { + "epoch": 0.9, + "grad_norm": 0.49278274178504944, + "learning_rate": 0.0004755686682065493, + "loss": 3.1306, + "step": 18426 + }, + { + "epoch": 0.9, + "grad_norm": 0.5985496044158936, + "learning_rate": 0.0004755561836950544, + "loss": 3.0919, + "step": 18427 + }, + { + "epoch": 0.9, + "grad_norm": 0.5184206962585449, + "learning_rate": 0.00047554369872117834, + "loss": 3.1546, + "step": 18428 + }, + { + "epoch": 0.9, + "grad_norm": 0.5128380656242371, + "learning_rate": 0.000475531213284954, + "loss": 3.2689, + "step": 18429 + }, + { + "epoch": 0.9, + "grad_norm": 0.5515463948249817, + "learning_rate": 0.0004755187273864141, + "loss": 3.2732, + "step": 18430 + }, + { + "epoch": 0.9, + "grad_norm": 0.5561820864677429, + "learning_rate": 0.00047550624102559173, + "loss": 3.1093, + "step": 18431 + }, + { + "epoch": 0.9, + "grad_norm": 0.610331654548645, + "learning_rate": 0.0004754937542025197, + "loss": 3.0545, + "step": 18432 + }, + { + "epoch": 0.9, + "grad_norm": 0.5147985219955444, + "learning_rate": 0.0004754812669172308, + "loss": 3.0679, + "step": 18433 + }, + { + "epoch": 0.9, + "grad_norm": 0.553709864616394, + "learning_rate": 0.0004754687791697581, + "loss": 3.4146, + "step": 18434 + }, + { + "epoch": 0.9, + "grad_norm": 0.5066872239112854, + "learning_rate": 0.0004754562909601344, + "loss": 3.041, + "step": 18435 + }, + { + "epoch": 0.9, + "grad_norm": 0.5329242944717407, + "learning_rate": 0.0004754438022883926, + "loss": 3.0274, + "step": 18436 + }, + { + "epoch": 0.9, + "grad_norm": 0.570603609085083, + "learning_rate": 0.00047543131315456566, + "loss": 3.26, + "step": 18437 + }, + { + "epoch": 0.9, + "grad_norm": 0.5356221795082092, + "learning_rate": 0.0004754188235586863, + "loss": 3.054, + "step": 18438 + }, + { + "epoch": 0.9, + "grad_norm": 0.5116856694221497, + "learning_rate": 0.00047540633350078753, + "loss": 3.0638, + "step": 18439 + }, + { + "epoch": 0.9, + "grad_norm": 0.5043549537658691, + "learning_rate": 0.00047539384298090227, + "loss": 3.1825, + "step": 18440 + }, + { + "epoch": 0.9, + "grad_norm": 0.4952833950519562, + "learning_rate": 0.00047538135199906334, + "loss": 3.3498, + "step": 18441 + }, + { + "epoch": 0.9, + "grad_norm": 0.5181792378425598, + "learning_rate": 0.0004753688605553037, + "loss": 3.1791, + "step": 18442 + }, + { + "epoch": 0.9, + "grad_norm": 0.5022110939025879, + "learning_rate": 0.0004753563686496562, + "loss": 3.0999, + "step": 18443 + }, + { + "epoch": 0.9, + "grad_norm": 0.5453217625617981, + "learning_rate": 0.0004753438762821539, + "loss": 3.1889, + "step": 18444 + }, + { + "epoch": 0.9, + "grad_norm": 0.5252522826194763, + "learning_rate": 0.0004753313834528294, + "loss": 3.2477, + "step": 18445 + }, + { + "epoch": 0.9, + "grad_norm": 0.5274051427841187, + "learning_rate": 0.0004753188901617159, + "loss": 3.0772, + "step": 18446 + }, + { + "epoch": 0.9, + "grad_norm": 0.5190703272819519, + "learning_rate": 0.00047530639640884617, + "loss": 3.3103, + "step": 18447 + }, + { + "epoch": 0.9, + "grad_norm": 0.5288125872612, + "learning_rate": 0.0004752939021942531, + "loss": 2.9752, + "step": 18448 + }, + { + "epoch": 0.9, + "grad_norm": 0.5266156792640686, + "learning_rate": 0.0004752814075179696, + "loss": 3.1599, + "step": 18449 + }, + { + "epoch": 0.9, + "grad_norm": 0.506328284740448, + "learning_rate": 0.00047526891238002867, + "loss": 3.2425, + "step": 18450 + }, + { + "epoch": 0.9, + "grad_norm": 0.5088960528373718, + "learning_rate": 0.00047525641678046313, + "loss": 3.3905, + "step": 18451 + }, + { + "epoch": 0.9, + "grad_norm": 0.5209529995918274, + "learning_rate": 0.00047524392071930595, + "loss": 3.1773, + "step": 18452 + }, + { + "epoch": 0.9, + "grad_norm": 0.524338960647583, + "learning_rate": 0.0004752314241965899, + "loss": 3.1845, + "step": 18453 + }, + { + "epoch": 0.9, + "grad_norm": 0.5339444279670715, + "learning_rate": 0.00047521892721234804, + "loss": 2.802, + "step": 18454 + }, + { + "epoch": 0.9, + "grad_norm": 0.5499926805496216, + "learning_rate": 0.0004752064297666134, + "loss": 2.874, + "step": 18455 + }, + { + "epoch": 0.9, + "grad_norm": 0.5348438620567322, + "learning_rate": 0.00047519393185941857, + "loss": 3.202, + "step": 18456 + }, + { + "epoch": 0.9, + "grad_norm": 0.5232212543487549, + "learning_rate": 0.00047518143349079673, + "loss": 3.4002, + "step": 18457 + }, + { + "epoch": 0.9, + "grad_norm": 0.4834090769290924, + "learning_rate": 0.00047516893466078067, + "loss": 3.2367, + "step": 18458 + }, + { + "epoch": 0.9, + "grad_norm": 0.5394415855407715, + "learning_rate": 0.00047515643536940336, + "loss": 2.8562, + "step": 18459 + }, + { + "epoch": 0.9, + "grad_norm": 0.5132725238800049, + "learning_rate": 0.0004751439356166977, + "loss": 3.3076, + "step": 18460 + }, + { + "epoch": 0.9, + "grad_norm": 0.5205427408218384, + "learning_rate": 0.00047513143540269665, + "loss": 3.3082, + "step": 18461 + }, + { + "epoch": 0.9, + "grad_norm": 0.4787410497665405, + "learning_rate": 0.00047511893472743306, + "loss": 3.0763, + "step": 18462 + }, + { + "epoch": 0.9, + "grad_norm": 0.5411839485168457, + "learning_rate": 0.00047510643359093983, + "loss": 3.0592, + "step": 18463 + }, + { + "epoch": 0.9, + "grad_norm": 0.5531505346298218, + "learning_rate": 0.0004750939319932501, + "loss": 3.062, + "step": 18464 + }, + { + "epoch": 0.9, + "grad_norm": 0.52873295545578, + "learning_rate": 0.0004750814299343966, + "loss": 3.3097, + "step": 18465 + }, + { + "epoch": 0.9, + "grad_norm": 0.5306869149208069, + "learning_rate": 0.00047506892741441234, + "loss": 3.046, + "step": 18466 + }, + { + "epoch": 0.91, + "grad_norm": 0.5929498076438904, + "learning_rate": 0.0004750564244333302, + "loss": 2.9961, + "step": 18467 + }, + { + "epoch": 0.91, + "grad_norm": 0.5276499390602112, + "learning_rate": 0.0004750439209911831, + "loss": 3.1753, + "step": 18468 + }, + { + "epoch": 0.91, + "grad_norm": 0.5233253836631775, + "learning_rate": 0.00047503141708800394, + "loss": 3.1066, + "step": 18469 + }, + { + "epoch": 0.91, + "grad_norm": 0.527217447757721, + "learning_rate": 0.00047501891272382584, + "loss": 3.0618, + "step": 18470 + }, + { + "epoch": 0.91, + "grad_norm": 0.5036908388137817, + "learning_rate": 0.00047500640789868156, + "loss": 2.9057, + "step": 18471 + }, + { + "epoch": 0.91, + "grad_norm": 0.5078242421150208, + "learning_rate": 0.000474993902612604, + "loss": 3.4062, + "step": 18472 + }, + { + "epoch": 0.91, + "grad_norm": 0.5183581113815308, + "learning_rate": 0.0004749813968656262, + "loss": 3.1168, + "step": 18473 + }, + { + "epoch": 0.91, + "grad_norm": 0.5693612098693848, + "learning_rate": 0.0004749688906577812, + "loss": 2.7811, + "step": 18474 + }, + { + "epoch": 0.91, + "grad_norm": 0.5193041563034058, + "learning_rate": 0.0004749563839891017, + "loss": 3.0456, + "step": 18475 + }, + { + "epoch": 0.91, + "grad_norm": 0.5137484669685364, + "learning_rate": 0.0004749438768596208, + "loss": 3.1382, + "step": 18476 + }, + { + "epoch": 0.91, + "grad_norm": 0.536248505115509, + "learning_rate": 0.00047493136926937137, + "loss": 2.9656, + "step": 18477 + }, + { + "epoch": 0.91, + "grad_norm": 0.5261755585670471, + "learning_rate": 0.0004749188612183865, + "loss": 3.4115, + "step": 18478 + }, + { + "epoch": 0.91, + "grad_norm": 0.5276913642883301, + "learning_rate": 0.00047490635270669885, + "loss": 3.3476, + "step": 18479 + }, + { + "epoch": 0.91, + "grad_norm": 0.5933910012245178, + "learning_rate": 0.00047489384373434156, + "loss": 3.1409, + "step": 18480 + }, + { + "epoch": 0.91, + "grad_norm": 0.5715004205703735, + "learning_rate": 0.00047488133430134764, + "loss": 3.1825, + "step": 18481 + }, + { + "epoch": 0.91, + "grad_norm": 0.5107777118682861, + "learning_rate": 0.0004748688244077499, + "loss": 3.1458, + "step": 18482 + }, + { + "epoch": 0.91, + "grad_norm": 0.49393430352211, + "learning_rate": 0.00047485631405358127, + "loss": 3.0858, + "step": 18483 + }, + { + "epoch": 0.91, + "grad_norm": 0.514667272567749, + "learning_rate": 0.0004748438032388748, + "loss": 3.2898, + "step": 18484 + }, + { + "epoch": 0.91, + "grad_norm": 0.5319417715072632, + "learning_rate": 0.00047483129196366336, + "loss": 3.3375, + "step": 18485 + }, + { + "epoch": 0.91, + "grad_norm": 0.5158092975616455, + "learning_rate": 0.00047481878022798005, + "loss": 3.162, + "step": 18486 + }, + { + "epoch": 0.91, + "grad_norm": 0.5055559277534485, + "learning_rate": 0.0004748062680318576, + "loss": 3.204, + "step": 18487 + }, + { + "epoch": 0.91, + "grad_norm": 0.5113157033920288, + "learning_rate": 0.0004747937553753291, + "loss": 3.156, + "step": 18488 + }, + { + "epoch": 0.91, + "grad_norm": 0.5367483496665955, + "learning_rate": 0.00047478124225842764, + "loss": 3.0957, + "step": 18489 + }, + { + "epoch": 0.91, + "grad_norm": 0.5241609811782837, + "learning_rate": 0.0004747687286811859, + "loss": 3.2377, + "step": 18490 + }, + { + "epoch": 0.91, + "grad_norm": 0.49339574575424194, + "learning_rate": 0.0004747562146436369, + "loss": 3.1479, + "step": 18491 + }, + { + "epoch": 0.91, + "grad_norm": 0.512546956539154, + "learning_rate": 0.0004747437001458138, + "loss": 3.2149, + "step": 18492 + }, + { + "epoch": 0.91, + "grad_norm": 0.5302311182022095, + "learning_rate": 0.0004747311851877493, + "loss": 3.1386, + "step": 18493 + }, + { + "epoch": 0.91, + "grad_norm": 0.5048165321350098, + "learning_rate": 0.0004747186697694766, + "loss": 3.132, + "step": 18494 + }, + { + "epoch": 0.91, + "grad_norm": 0.5217496156692505, + "learning_rate": 0.0004747061538910285, + "loss": 3.2443, + "step": 18495 + }, + { + "epoch": 0.91, + "grad_norm": 0.5357189774513245, + "learning_rate": 0.00047469363755243797, + "loss": 3.0927, + "step": 18496 + }, + { + "epoch": 0.91, + "grad_norm": 0.5418717265129089, + "learning_rate": 0.00047468112075373797, + "loss": 3.3158, + "step": 18497 + }, + { + "epoch": 0.91, + "grad_norm": 0.5489016771316528, + "learning_rate": 0.0004746686034949617, + "loss": 3.3727, + "step": 18498 + }, + { + "epoch": 0.91, + "grad_norm": 0.5446333885192871, + "learning_rate": 0.0004746560857761418, + "loss": 3.3276, + "step": 18499 + }, + { + "epoch": 0.91, + "grad_norm": 0.5174359679222107, + "learning_rate": 0.0004746435675973115, + "loss": 2.9588, + "step": 18500 + }, + { + "epoch": 0.91, + "grad_norm": 0.5231800079345703, + "learning_rate": 0.0004746310489585036, + "loss": 3.1466, + "step": 18501 + }, + { + "epoch": 0.91, + "grad_norm": 0.5035195350646973, + "learning_rate": 0.000474618529859751, + "loss": 2.9377, + "step": 18502 + }, + { + "epoch": 0.91, + "grad_norm": 0.5154784917831421, + "learning_rate": 0.000474606010301087, + "loss": 3.1582, + "step": 18503 + }, + { + "epoch": 0.91, + "grad_norm": 0.5330885052680969, + "learning_rate": 0.0004745934902825442, + "loss": 3.1804, + "step": 18504 + }, + { + "epoch": 0.91, + "grad_norm": 0.4971103370189667, + "learning_rate": 0.0004745809698041558, + "loss": 3.2581, + "step": 18505 + }, + { + "epoch": 0.91, + "grad_norm": 0.521308422088623, + "learning_rate": 0.00047456844886595485, + "loss": 3.0899, + "step": 18506 + }, + { + "epoch": 0.91, + "grad_norm": 0.5021144151687622, + "learning_rate": 0.00047455592746797404, + "loss": 3.2276, + "step": 18507 + }, + { + "epoch": 0.91, + "grad_norm": 0.5008601546287537, + "learning_rate": 0.0004745434056102466, + "loss": 3.2421, + "step": 18508 + }, + { + "epoch": 0.91, + "grad_norm": 0.5506516695022583, + "learning_rate": 0.0004745308832928054, + "loss": 3.3397, + "step": 18509 + }, + { + "epoch": 0.91, + "grad_norm": 0.5164962410926819, + "learning_rate": 0.0004745183605156835, + "loss": 2.8647, + "step": 18510 + }, + { + "epoch": 0.91, + "grad_norm": 0.6326226592063904, + "learning_rate": 0.00047450583727891375, + "loss": 3.2928, + "step": 18511 + }, + { + "epoch": 0.91, + "grad_norm": 0.507206380367279, + "learning_rate": 0.0004744933135825292, + "loss": 3.1631, + "step": 18512 + }, + { + "epoch": 0.91, + "grad_norm": 0.5537012815475464, + "learning_rate": 0.00047448078942656285, + "loss": 3.2021, + "step": 18513 + }, + { + "epoch": 0.91, + "grad_norm": 0.5219477415084839, + "learning_rate": 0.00047446826481104774, + "loss": 3.1808, + "step": 18514 + }, + { + "epoch": 0.91, + "grad_norm": 0.5431067943572998, + "learning_rate": 0.00047445573973601673, + "loss": 3.2918, + "step": 18515 + }, + { + "epoch": 0.91, + "grad_norm": 0.5809006690979004, + "learning_rate": 0.0004744432142015029, + "loss": 3.0758, + "step": 18516 + }, + { + "epoch": 0.91, + "grad_norm": 0.5215701460838318, + "learning_rate": 0.0004744306882075392, + "loss": 3.1764, + "step": 18517 + }, + { + "epoch": 0.91, + "grad_norm": 0.4926910102367401, + "learning_rate": 0.0004744181617541587, + "loss": 3.2759, + "step": 18518 + }, + { + "epoch": 0.91, + "grad_norm": 0.5390565991401672, + "learning_rate": 0.0004744056348413942, + "loss": 3.1481, + "step": 18519 + }, + { + "epoch": 0.91, + "grad_norm": 0.523172914981842, + "learning_rate": 0.00047439310746927896, + "loss": 3.0277, + "step": 18520 + }, + { + "epoch": 0.91, + "grad_norm": 0.5034345984458923, + "learning_rate": 0.0004743805796378458, + "loss": 3.2691, + "step": 18521 + }, + { + "epoch": 0.91, + "grad_norm": 0.5197492837905884, + "learning_rate": 0.00047436805134712774, + "loss": 3.1503, + "step": 18522 + }, + { + "epoch": 0.91, + "grad_norm": 0.5116248726844788, + "learning_rate": 0.00047435552259715775, + "loss": 3.1957, + "step": 18523 + }, + { + "epoch": 0.91, + "grad_norm": 0.5436865091323853, + "learning_rate": 0.0004743429933879689, + "loss": 3.2398, + "step": 18524 + }, + { + "epoch": 0.91, + "grad_norm": 0.5367627739906311, + "learning_rate": 0.0004743304637195942, + "loss": 3.0797, + "step": 18525 + }, + { + "epoch": 0.91, + "grad_norm": 0.49198198318481445, + "learning_rate": 0.0004743179335920666, + "loss": 3.2193, + "step": 18526 + }, + { + "epoch": 0.91, + "grad_norm": 0.5412742495536804, + "learning_rate": 0.00047430540300541903, + "loss": 3.1761, + "step": 18527 + }, + { + "epoch": 0.91, + "grad_norm": 0.5427865386009216, + "learning_rate": 0.0004742928719596846, + "loss": 3.1722, + "step": 18528 + }, + { + "epoch": 0.91, + "grad_norm": 0.5101169347763062, + "learning_rate": 0.00047428034045489636, + "loss": 3.0649, + "step": 18529 + }, + { + "epoch": 0.91, + "grad_norm": 0.5221566557884216, + "learning_rate": 0.00047426780849108723, + "loss": 3.0671, + "step": 18530 + }, + { + "epoch": 0.91, + "grad_norm": 0.5130900144577026, + "learning_rate": 0.00047425527606829017, + "loss": 3.2507, + "step": 18531 + }, + { + "epoch": 0.91, + "grad_norm": 0.522251307964325, + "learning_rate": 0.0004742427431865383, + "loss": 3.1376, + "step": 18532 + }, + { + "epoch": 0.91, + "grad_norm": 0.5170092582702637, + "learning_rate": 0.00047423020984586455, + "loss": 3.3573, + "step": 18533 + }, + { + "epoch": 0.91, + "grad_norm": 0.5282794833183289, + "learning_rate": 0.0004742176760463019, + "loss": 3.0622, + "step": 18534 + }, + { + "epoch": 0.91, + "grad_norm": 0.5373873114585876, + "learning_rate": 0.0004742051417878836, + "loss": 2.9846, + "step": 18535 + }, + { + "epoch": 0.91, + "grad_norm": 0.5340383052825928, + "learning_rate": 0.0004741926070706423, + "loss": 3.3364, + "step": 18536 + }, + { + "epoch": 0.91, + "grad_norm": 0.5768246650695801, + "learning_rate": 0.0004741800718946113, + "loss": 3.0228, + "step": 18537 + }, + { + "epoch": 0.91, + "grad_norm": 0.508990466594696, + "learning_rate": 0.0004741675362598235, + "loss": 3.1434, + "step": 18538 + }, + { + "epoch": 0.91, + "grad_norm": 0.5513434410095215, + "learning_rate": 0.00047415500016631187, + "loss": 3.1098, + "step": 18539 + }, + { + "epoch": 0.91, + "grad_norm": 0.5139069557189941, + "learning_rate": 0.00047414246361410955, + "loss": 3.2203, + "step": 18540 + }, + { + "epoch": 0.91, + "grad_norm": 0.5511561632156372, + "learning_rate": 0.0004741299266032495, + "loss": 3.1658, + "step": 18541 + }, + { + "epoch": 0.91, + "grad_norm": 0.5869661569595337, + "learning_rate": 0.0004741173891337647, + "loss": 2.964, + "step": 18542 + }, + { + "epoch": 0.91, + "grad_norm": 0.5423870086669922, + "learning_rate": 0.0004741048512056882, + "loss": 3.1912, + "step": 18543 + }, + { + "epoch": 0.91, + "grad_norm": 0.5338352918624878, + "learning_rate": 0.00047409231281905296, + "loss": 2.918, + "step": 18544 + }, + { + "epoch": 0.91, + "grad_norm": 0.5400632619857788, + "learning_rate": 0.0004740797739738922, + "loss": 3.3985, + "step": 18545 + }, + { + "epoch": 0.91, + "grad_norm": 0.515967607498169, + "learning_rate": 0.00047406723467023886, + "loss": 3.3357, + "step": 18546 + }, + { + "epoch": 0.91, + "grad_norm": 0.5285216569900513, + "learning_rate": 0.0004740546949081258, + "loss": 3.2011, + "step": 18547 + }, + { + "epoch": 0.91, + "grad_norm": 0.5364177227020264, + "learning_rate": 0.0004740421546875862, + "loss": 3.1159, + "step": 18548 + }, + { + "epoch": 0.91, + "grad_norm": 0.5312500596046448, + "learning_rate": 0.000474029614008653, + "loss": 3.2883, + "step": 18549 + }, + { + "epoch": 0.91, + "grad_norm": 0.5263493657112122, + "learning_rate": 0.0004740170728713594, + "loss": 3.2674, + "step": 18550 + }, + { + "epoch": 0.91, + "grad_norm": 0.49179357290267944, + "learning_rate": 0.00047400453127573834, + "loss": 3.2257, + "step": 18551 + }, + { + "epoch": 0.91, + "grad_norm": 0.527242124080658, + "learning_rate": 0.0004739919892218228, + "loss": 3.1437, + "step": 18552 + }, + { + "epoch": 0.91, + "grad_norm": 0.5284623503684998, + "learning_rate": 0.0004739794467096458, + "loss": 3.1903, + "step": 18553 + }, + { + "epoch": 0.91, + "grad_norm": 0.5200530886650085, + "learning_rate": 0.0004739669037392404, + "loss": 3.1414, + "step": 18554 + }, + { + "epoch": 0.91, + "grad_norm": 0.5152423977851868, + "learning_rate": 0.0004739543603106397, + "loss": 3.1332, + "step": 18555 + }, + { + "epoch": 0.91, + "grad_norm": 0.5122321844100952, + "learning_rate": 0.0004739418164238766, + "loss": 3.2215, + "step": 18556 + }, + { + "epoch": 0.91, + "grad_norm": 0.5178201198577881, + "learning_rate": 0.0004739292720789844, + "loss": 3.16, + "step": 18557 + }, + { + "epoch": 0.91, + "grad_norm": 0.5422595143318176, + "learning_rate": 0.00047391672727599584, + "loss": 3.1908, + "step": 18558 + }, + { + "epoch": 0.91, + "grad_norm": 0.5113765597343445, + "learning_rate": 0.0004739041820149441, + "loss": 3.1155, + "step": 18559 + }, + { + "epoch": 0.91, + "grad_norm": 0.5253166556358337, + "learning_rate": 0.00047389163629586235, + "loss": 2.9544, + "step": 18560 + }, + { + "epoch": 0.91, + "grad_norm": 0.5218266248703003, + "learning_rate": 0.00047387909011878333, + "loss": 2.8984, + "step": 18561 + }, + { + "epoch": 0.91, + "grad_norm": 0.5512539744377136, + "learning_rate": 0.00047386654348374036, + "loss": 3.0539, + "step": 18562 + }, + { + "epoch": 0.91, + "grad_norm": 0.5330885052680969, + "learning_rate": 0.0004738539963907664, + "loss": 3.091, + "step": 18563 + }, + { + "epoch": 0.91, + "grad_norm": 0.5763328075408936, + "learning_rate": 0.0004738414488398943, + "loss": 3.1968, + "step": 18564 + }, + { + "epoch": 0.91, + "grad_norm": 0.5141280889511108, + "learning_rate": 0.0004738289008311575, + "loss": 3.1497, + "step": 18565 + }, + { + "epoch": 0.91, + "grad_norm": 0.5118435025215149, + "learning_rate": 0.0004738163523645887, + "loss": 2.9679, + "step": 18566 + }, + { + "epoch": 0.91, + "grad_norm": 0.5282021760940552, + "learning_rate": 0.00047380380344022105, + "loss": 3.0492, + "step": 18567 + }, + { + "epoch": 0.91, + "grad_norm": 0.6659740805625916, + "learning_rate": 0.0004737912540580877, + "loss": 3.1675, + "step": 18568 + }, + { + "epoch": 0.91, + "grad_norm": 0.520656168460846, + "learning_rate": 0.0004737787042182216, + "loss": 3.2612, + "step": 18569 + }, + { + "epoch": 0.91, + "grad_norm": 0.550905168056488, + "learning_rate": 0.0004737661539206558, + "loss": 3.0811, + "step": 18570 + }, + { + "epoch": 0.91, + "grad_norm": 0.532606840133667, + "learning_rate": 0.0004737536031654234, + "loss": 3.0482, + "step": 18571 + }, + { + "epoch": 0.91, + "grad_norm": 0.5544597506523132, + "learning_rate": 0.00047374105195255756, + "loss": 3.0428, + "step": 18572 + }, + { + "epoch": 0.91, + "grad_norm": 0.534093976020813, + "learning_rate": 0.0004737285002820912, + "loss": 3.176, + "step": 18573 + }, + { + "epoch": 0.91, + "grad_norm": 0.5330844521522522, + "learning_rate": 0.00047371594815405726, + "loss": 3.0923, + "step": 18574 + }, + { + "epoch": 0.91, + "grad_norm": 0.46944698691368103, + "learning_rate": 0.00047370339556848906, + "loss": 3.0567, + "step": 18575 + }, + { + "epoch": 0.91, + "grad_norm": 0.5329000949859619, + "learning_rate": 0.0004736908425254195, + "loss": 3.0776, + "step": 18576 + }, + { + "epoch": 0.91, + "grad_norm": 0.5113318562507629, + "learning_rate": 0.0004736782890248818, + "loss": 3.0746, + "step": 18577 + }, + { + "epoch": 0.91, + "grad_norm": 0.507371723651886, + "learning_rate": 0.0004736657350669088, + "loss": 3.1737, + "step": 18578 + }, + { + "epoch": 0.91, + "grad_norm": 0.5299968719482422, + "learning_rate": 0.0004736531806515337, + "loss": 3.0233, + "step": 18579 + }, + { + "epoch": 0.91, + "grad_norm": 0.5169973373413086, + "learning_rate": 0.0004736406257787895, + "loss": 3.1592, + "step": 18580 + }, + { + "epoch": 0.91, + "grad_norm": 0.5072317719459534, + "learning_rate": 0.0004736280704487094, + "loss": 3.24, + "step": 18581 + }, + { + "epoch": 0.91, + "grad_norm": 0.592707097530365, + "learning_rate": 0.00047361551466132634, + "loss": 3.1434, + "step": 18582 + }, + { + "epoch": 0.91, + "grad_norm": 0.5373207926750183, + "learning_rate": 0.00047360295841667344, + "loss": 3.1949, + "step": 18583 + }, + { + "epoch": 0.91, + "grad_norm": 0.555252194404602, + "learning_rate": 0.0004735904017147838, + "loss": 3.2368, + "step": 18584 + }, + { + "epoch": 0.91, + "grad_norm": 0.5238771438598633, + "learning_rate": 0.00047357784455569034, + "loss": 3.2168, + "step": 18585 + }, + { + "epoch": 0.91, + "grad_norm": 0.5633851289749146, + "learning_rate": 0.0004735652869394263, + "loss": 3.4173, + "step": 18586 + }, + { + "epoch": 0.91, + "grad_norm": 0.5264574289321899, + "learning_rate": 0.0004735527288660247, + "loss": 3.1726, + "step": 18587 + }, + { + "epoch": 0.91, + "grad_norm": 0.5027996301651001, + "learning_rate": 0.00047354017033551864, + "loss": 3.466, + "step": 18588 + }, + { + "epoch": 0.91, + "grad_norm": 0.5182791948318481, + "learning_rate": 0.00047352761134794114, + "loss": 3.0939, + "step": 18589 + }, + { + "epoch": 0.91, + "grad_norm": 0.4975305497646332, + "learning_rate": 0.00047351505190332526, + "loss": 3.3298, + "step": 18590 + }, + { + "epoch": 0.91, + "grad_norm": 0.5365167856216431, + "learning_rate": 0.0004735024920017042, + "loss": 3.0189, + "step": 18591 + }, + { + "epoch": 0.91, + "grad_norm": 0.5220871567726135, + "learning_rate": 0.00047348993164311086, + "loss": 3.3163, + "step": 18592 + }, + { + "epoch": 0.91, + "grad_norm": 0.5201936960220337, + "learning_rate": 0.00047347737082757853, + "loss": 3.1742, + "step": 18593 + }, + { + "epoch": 0.91, + "grad_norm": 0.5495591163635254, + "learning_rate": 0.0004734648095551402, + "loss": 3.2221, + "step": 18594 + }, + { + "epoch": 0.91, + "grad_norm": 0.504267156124115, + "learning_rate": 0.0004734522478258289, + "loss": 3.4931, + "step": 18595 + }, + { + "epoch": 0.91, + "grad_norm": 0.5782771706581116, + "learning_rate": 0.0004734396856396778, + "loss": 3.1074, + "step": 18596 + }, + { + "epoch": 0.91, + "grad_norm": 0.523160457611084, + "learning_rate": 0.0004734271229967199, + "loss": 3.0729, + "step": 18597 + }, + { + "epoch": 0.91, + "grad_norm": 0.5083732008934021, + "learning_rate": 0.0004734145598969884, + "loss": 3.2266, + "step": 18598 + }, + { + "epoch": 0.91, + "grad_norm": 0.538865864276886, + "learning_rate": 0.00047340199634051625, + "loss": 3.0961, + "step": 18599 + }, + { + "epoch": 0.91, + "grad_norm": 0.528105616569519, + "learning_rate": 0.00047338943232733664, + "loss": 3.0448, + "step": 18600 + }, + { + "epoch": 0.91, + "grad_norm": 0.5887966156005859, + "learning_rate": 0.00047337686785748256, + "loss": 3.0425, + "step": 18601 + }, + { + "epoch": 0.91, + "grad_norm": 0.49562567472457886, + "learning_rate": 0.0004733643029309873, + "loss": 3.3061, + "step": 18602 + }, + { + "epoch": 0.91, + "grad_norm": 0.5125460028648376, + "learning_rate": 0.0004733517375478838, + "loss": 2.8714, + "step": 18603 + }, + { + "epoch": 0.91, + "grad_norm": 0.5145890712738037, + "learning_rate": 0.0004733391717082052, + "loss": 3.0679, + "step": 18604 + }, + { + "epoch": 0.91, + "grad_norm": 0.5396844148635864, + "learning_rate": 0.00047332660541198453, + "loss": 3.1425, + "step": 18605 + }, + { + "epoch": 0.91, + "grad_norm": 0.5691478252410889, + "learning_rate": 0.00047331403865925496, + "loss": 3.3351, + "step": 18606 + }, + { + "epoch": 0.91, + "grad_norm": 0.5248494148254395, + "learning_rate": 0.0004733014714500496, + "loss": 3.2248, + "step": 18607 + }, + { + "epoch": 0.91, + "grad_norm": 0.49655720591545105, + "learning_rate": 0.0004732889037844014, + "loss": 3.0864, + "step": 18608 + }, + { + "epoch": 0.91, + "grad_norm": 0.5340755581855774, + "learning_rate": 0.0004732763356623437, + "loss": 3.1839, + "step": 18609 + }, + { + "epoch": 0.91, + "grad_norm": 0.5292792916297913, + "learning_rate": 0.0004732637670839094, + "loss": 3.3367, + "step": 18610 + }, + { + "epoch": 0.91, + "grad_norm": 0.48931747674942017, + "learning_rate": 0.0004732511980491318, + "loss": 3.4235, + "step": 18611 + }, + { + "epoch": 0.91, + "grad_norm": 0.5485087037086487, + "learning_rate": 0.0004732386285580438, + "loss": 3.0873, + "step": 18612 + }, + { + "epoch": 0.91, + "grad_norm": 0.5549004077911377, + "learning_rate": 0.0004732260586106787, + "loss": 3.109, + "step": 18613 + }, + { + "epoch": 0.91, + "grad_norm": 0.5397618412971497, + "learning_rate": 0.00047321348820706933, + "loss": 3.1704, + "step": 18614 + }, + { + "epoch": 0.91, + "grad_norm": 0.5148725509643555, + "learning_rate": 0.000473200917347249, + "loss": 3.1237, + "step": 18615 + }, + { + "epoch": 0.91, + "grad_norm": 0.49203985929489136, + "learning_rate": 0.00047318834603125084, + "loss": 3.1485, + "step": 18616 + }, + { + "epoch": 0.91, + "grad_norm": 0.5316979289054871, + "learning_rate": 0.00047317577425910794, + "loss": 3.2129, + "step": 18617 + }, + { + "epoch": 0.91, + "grad_norm": 0.5986040234565735, + "learning_rate": 0.00047316320203085333, + "loss": 3.1176, + "step": 18618 + }, + { + "epoch": 0.91, + "grad_norm": 0.5387392044067383, + "learning_rate": 0.0004731506293465202, + "loss": 3.1098, + "step": 18619 + }, + { + "epoch": 0.91, + "grad_norm": 0.5144277811050415, + "learning_rate": 0.0004731380562061416, + "loss": 3.1374, + "step": 18620 + }, + { + "epoch": 0.91, + "grad_norm": 0.5290771126747131, + "learning_rate": 0.0004731254826097507, + "loss": 3.1971, + "step": 18621 + }, + { + "epoch": 0.91, + "grad_norm": 0.5087745785713196, + "learning_rate": 0.00047311290855738057, + "loss": 3.3343, + "step": 18622 + }, + { + "epoch": 0.91, + "grad_norm": 0.5245369076728821, + "learning_rate": 0.00047310033404906435, + "loss": 3.2556, + "step": 18623 + }, + { + "epoch": 0.91, + "grad_norm": 0.5257542729377747, + "learning_rate": 0.00047308775908483526, + "loss": 3.2636, + "step": 18624 + }, + { + "epoch": 0.91, + "grad_norm": 0.50067138671875, + "learning_rate": 0.00047307518366472627, + "loss": 3.2939, + "step": 18625 + }, + { + "epoch": 0.91, + "grad_norm": 0.550284743309021, + "learning_rate": 0.0004730626077887705, + "loss": 3.1894, + "step": 18626 + }, + { + "epoch": 0.91, + "grad_norm": 0.5422661900520325, + "learning_rate": 0.00047305003145700115, + "loss": 3.2083, + "step": 18627 + }, + { + "epoch": 0.91, + "grad_norm": 0.5483496785163879, + "learning_rate": 0.00047303745466945133, + "loss": 3.029, + "step": 18628 + }, + { + "epoch": 0.91, + "grad_norm": 0.5778025388717651, + "learning_rate": 0.0004730248774261542, + "loss": 3.0593, + "step": 18629 + }, + { + "epoch": 0.91, + "grad_norm": 0.5487713813781738, + "learning_rate": 0.0004730122997271428, + "loss": 3.2694, + "step": 18630 + }, + { + "epoch": 0.91, + "grad_norm": 0.5207985043525696, + "learning_rate": 0.00047299972157245027, + "loss": 3.1083, + "step": 18631 + }, + { + "epoch": 0.91, + "grad_norm": 0.5357333421707153, + "learning_rate": 0.00047298714296210983, + "loss": 3.2432, + "step": 18632 + }, + { + "epoch": 0.91, + "grad_norm": 0.5432522296905518, + "learning_rate": 0.00047297456389615446, + "loss": 3.461, + "step": 18633 + }, + { + "epoch": 0.91, + "grad_norm": 0.4886643886566162, + "learning_rate": 0.0004729619843746175, + "loss": 3.2724, + "step": 18634 + }, + { + "epoch": 0.91, + "grad_norm": 0.5570962429046631, + "learning_rate": 0.00047294940439753197, + "loss": 3.1144, + "step": 18635 + }, + { + "epoch": 0.91, + "grad_norm": 0.559655487537384, + "learning_rate": 0.00047293682396493087, + "loss": 3.2365, + "step": 18636 + }, + { + "epoch": 0.91, + "grad_norm": 0.5164363384246826, + "learning_rate": 0.0004729242430768475, + "loss": 3.0816, + "step": 18637 + }, + { + "epoch": 0.91, + "grad_norm": 0.5455911159515381, + "learning_rate": 0.00047291166173331497, + "loss": 3.2254, + "step": 18638 + }, + { + "epoch": 0.91, + "grad_norm": 0.5552606582641602, + "learning_rate": 0.0004728990799343664, + "loss": 3.1659, + "step": 18639 + }, + { + "epoch": 0.91, + "grad_norm": 0.5630795955657959, + "learning_rate": 0.00047288649768003493, + "loss": 3.2478, + "step": 18640 + }, + { + "epoch": 0.91, + "grad_norm": 0.5146662592887878, + "learning_rate": 0.00047287391497035367, + "loss": 3.1473, + "step": 18641 + }, + { + "epoch": 0.91, + "grad_norm": 0.5582475066184998, + "learning_rate": 0.00047286133180535585, + "loss": 3.0386, + "step": 18642 + }, + { + "epoch": 0.91, + "grad_norm": 0.5350977778434753, + "learning_rate": 0.00047284874818507444, + "loss": 3.1054, + "step": 18643 + }, + { + "epoch": 0.91, + "grad_norm": 0.5022348165512085, + "learning_rate": 0.0004728361641095428, + "loss": 3.1659, + "step": 18644 + }, + { + "epoch": 0.91, + "grad_norm": 0.5127667188644409, + "learning_rate": 0.00047282357957879396, + "loss": 3.0755, + "step": 18645 + }, + { + "epoch": 0.91, + "grad_norm": 0.5345514416694641, + "learning_rate": 0.000472810994592861, + "loss": 3.3378, + "step": 18646 + }, + { + "epoch": 0.91, + "grad_norm": 0.5057428479194641, + "learning_rate": 0.0004727984091517772, + "loss": 3.0245, + "step": 18647 + }, + { + "epoch": 0.91, + "grad_norm": 0.5135453343391418, + "learning_rate": 0.0004727858232555756, + "loss": 3.1026, + "step": 18648 + }, + { + "epoch": 0.91, + "grad_norm": 0.539706289768219, + "learning_rate": 0.00047277323690428943, + "loss": 3.1584, + "step": 18649 + }, + { + "epoch": 0.91, + "grad_norm": 0.5512105822563171, + "learning_rate": 0.00047276065009795184, + "loss": 3.2228, + "step": 18650 + }, + { + "epoch": 0.91, + "grad_norm": 0.4987412095069885, + "learning_rate": 0.00047274806283659584, + "loss": 3.3146, + "step": 18651 + }, + { + "epoch": 0.91, + "grad_norm": 0.5345869064331055, + "learning_rate": 0.00047273547512025483, + "loss": 3.2434, + "step": 18652 + }, + { + "epoch": 0.91, + "grad_norm": 0.5088310837745667, + "learning_rate": 0.0004727228869489617, + "loss": 3.2025, + "step": 18653 + }, + { + "epoch": 0.91, + "grad_norm": 0.5335360765457153, + "learning_rate": 0.00047271029832274974, + "loss": 3.2062, + "step": 18654 + }, + { + "epoch": 0.91, + "grad_norm": 0.5184034705162048, + "learning_rate": 0.0004726977092416522, + "loss": 3.2948, + "step": 18655 + }, + { + "epoch": 0.91, + "grad_norm": 0.5256913304328918, + "learning_rate": 0.00047268511970570203, + "loss": 3.186, + "step": 18656 + }, + { + "epoch": 0.91, + "grad_norm": 0.5344150066375732, + "learning_rate": 0.0004726725297149325, + "loss": 3.1916, + "step": 18657 + }, + { + "epoch": 0.91, + "grad_norm": 0.6184183955192566, + "learning_rate": 0.0004726599392693768, + "loss": 3.3629, + "step": 18658 + }, + { + "epoch": 0.91, + "grad_norm": 0.49395012855529785, + "learning_rate": 0.00047264734836906805, + "loss": 3.1883, + "step": 18659 + }, + { + "epoch": 0.91, + "grad_norm": 0.532318651676178, + "learning_rate": 0.00047263475701403933, + "loss": 2.9763, + "step": 18660 + }, + { + "epoch": 0.91, + "grad_norm": 0.6311165690422058, + "learning_rate": 0.000472622165204324, + "loss": 3.2059, + "step": 18661 + }, + { + "epoch": 0.91, + "grad_norm": 0.5295420289039612, + "learning_rate": 0.000472609572939955, + "loss": 3.1236, + "step": 18662 + }, + { + "epoch": 0.91, + "grad_norm": 0.5523192882537842, + "learning_rate": 0.0004725969802209656, + "loss": 3.2315, + "step": 18663 + }, + { + "epoch": 0.91, + "grad_norm": 0.52613365650177, + "learning_rate": 0.0004725843870473891, + "loss": 3.1807, + "step": 18664 + }, + { + "epoch": 0.91, + "grad_norm": 0.5176199078559875, + "learning_rate": 0.0004725717934192584, + "loss": 3.0597, + "step": 18665 + }, + { + "epoch": 0.91, + "grad_norm": 0.5190128684043884, + "learning_rate": 0.00047255919933660695, + "loss": 2.7741, + "step": 18666 + }, + { + "epoch": 0.91, + "grad_norm": 0.5189964175224304, + "learning_rate": 0.0004725466047994676, + "loss": 3.3417, + "step": 18667 + }, + { + "epoch": 0.91, + "grad_norm": 0.5286535024642944, + "learning_rate": 0.0004725340098078738, + "loss": 3.1332, + "step": 18668 + }, + { + "epoch": 0.91, + "grad_norm": 0.5291597247123718, + "learning_rate": 0.00047252141436185857, + "loss": 3.0903, + "step": 18669 + }, + { + "epoch": 0.91, + "grad_norm": 0.5236563682556152, + "learning_rate": 0.00047250881846145525, + "loss": 3.16, + "step": 18670 + }, + { + "epoch": 0.92, + "grad_norm": 0.5021608471870422, + "learning_rate": 0.0004724962221066969, + "loss": 3.2304, + "step": 18671 + }, + { + "epoch": 0.92, + "grad_norm": 0.5602541565895081, + "learning_rate": 0.0004724836252976165, + "loss": 3.2892, + "step": 18672 + }, + { + "epoch": 0.92, + "grad_norm": 0.5403422117233276, + "learning_rate": 0.0004724710280342477, + "loss": 3.4172, + "step": 18673 + }, + { + "epoch": 0.92, + "grad_norm": 0.5357560515403748, + "learning_rate": 0.0004724584303166232, + "loss": 3.4125, + "step": 18674 + }, + { + "epoch": 0.92, + "grad_norm": 0.5377302765846252, + "learning_rate": 0.0004724458321447765, + "loss": 3.1649, + "step": 18675 + }, + { + "epoch": 0.92, + "grad_norm": 0.5543449521064758, + "learning_rate": 0.0004724332335187406, + "loss": 3.0616, + "step": 18676 + }, + { + "epoch": 0.92, + "grad_norm": 0.5366581082344055, + "learning_rate": 0.0004724206344385487, + "loss": 3.1542, + "step": 18677 + }, + { + "epoch": 0.92, + "grad_norm": 0.5297412276268005, + "learning_rate": 0.00047240803490423413, + "loss": 3.3427, + "step": 18678 + }, + { + "epoch": 0.92, + "grad_norm": 0.6091488599777222, + "learning_rate": 0.00047239543491582993, + "loss": 3.0589, + "step": 18679 + }, + { + "epoch": 0.92, + "grad_norm": 0.5463923215866089, + "learning_rate": 0.0004723828344733694, + "loss": 3.1833, + "step": 18680 + }, + { + "epoch": 0.92, + "grad_norm": 0.5743600726127625, + "learning_rate": 0.0004723702335768856, + "loss": 3.0632, + "step": 18681 + }, + { + "epoch": 0.92, + "grad_norm": 0.5193010568618774, + "learning_rate": 0.00047235763222641176, + "loss": 3.3067, + "step": 18682 + }, + { + "epoch": 0.92, + "grad_norm": 0.5466593503952026, + "learning_rate": 0.0004723450304219811, + "loss": 3.1118, + "step": 18683 + }, + { + "epoch": 0.92, + "grad_norm": 0.5027567148208618, + "learning_rate": 0.0004723324281636268, + "loss": 3.3245, + "step": 18684 + }, + { + "epoch": 0.92, + "grad_norm": 0.5612360239028931, + "learning_rate": 0.00047231982545138207, + "loss": 3.0984, + "step": 18685 + }, + { + "epoch": 0.92, + "grad_norm": 0.49265792965888977, + "learning_rate": 0.0004723072222852801, + "loss": 3.2577, + "step": 18686 + }, + { + "epoch": 0.92, + "grad_norm": 0.5681557059288025, + "learning_rate": 0.00047229461866535403, + "loss": 3.1238, + "step": 18687 + }, + { + "epoch": 0.92, + "grad_norm": 0.5673431158065796, + "learning_rate": 0.00047228201459163706, + "loss": 3.1261, + "step": 18688 + }, + { + "epoch": 0.92, + "grad_norm": 0.5525531768798828, + "learning_rate": 0.00047226941006416246, + "loss": 3.3082, + "step": 18689 + }, + { + "epoch": 0.92, + "grad_norm": 0.5107969641685486, + "learning_rate": 0.00047225680508296346, + "loss": 3.0155, + "step": 18690 + }, + { + "epoch": 0.92, + "grad_norm": 0.5525048971176147, + "learning_rate": 0.00047224419964807306, + "loss": 3.0787, + "step": 18691 + }, + { + "epoch": 0.92, + "grad_norm": 0.5003828406333923, + "learning_rate": 0.00047223159375952466, + "loss": 3.253, + "step": 18692 + }, + { + "epoch": 0.92, + "grad_norm": 0.49926337599754333, + "learning_rate": 0.0004722189874173514, + "loss": 3.1447, + "step": 18693 + }, + { + "epoch": 0.92, + "grad_norm": 0.49048912525177, + "learning_rate": 0.00047220638062158647, + "loss": 3.1531, + "step": 18694 + }, + { + "epoch": 0.92, + "grad_norm": 0.5253885984420776, + "learning_rate": 0.000472193773372263, + "loss": 3.2023, + "step": 18695 + }, + { + "epoch": 0.92, + "grad_norm": 0.5272942185401917, + "learning_rate": 0.0004721811656694144, + "loss": 3.1818, + "step": 18696 + }, + { + "epoch": 0.92, + "grad_norm": 0.5017642378807068, + "learning_rate": 0.00047216855751307365, + "loss": 3.174, + "step": 18697 + }, + { + "epoch": 0.92, + "grad_norm": 0.5158599615097046, + "learning_rate": 0.000472155948903274, + "loss": 3.2798, + "step": 18698 + }, + { + "epoch": 0.92, + "grad_norm": 0.48646584153175354, + "learning_rate": 0.00047214333984004885, + "loss": 3.1985, + "step": 18699 + }, + { + "epoch": 0.92, + "grad_norm": 0.5171990990638733, + "learning_rate": 0.0004721307303234312, + "loss": 3.128, + "step": 18700 + }, + { + "epoch": 0.92, + "grad_norm": 0.5293672680854797, + "learning_rate": 0.0004721181203534543, + "loss": 3.1083, + "step": 18701 + }, + { + "epoch": 0.92, + "grad_norm": 0.5768073797225952, + "learning_rate": 0.00047210550993015155, + "loss": 3.1399, + "step": 18702 + }, + { + "epoch": 0.92, + "grad_norm": 0.5409702658653259, + "learning_rate": 0.0004720928990535558, + "loss": 3.3084, + "step": 18703 + }, + { + "epoch": 0.92, + "grad_norm": 0.5259953737258911, + "learning_rate": 0.0004720802877237006, + "loss": 3.1811, + "step": 18704 + }, + { + "epoch": 0.92, + "grad_norm": 0.5130147337913513, + "learning_rate": 0.0004720676759406191, + "loss": 2.9815, + "step": 18705 + }, + { + "epoch": 0.92, + "grad_norm": 0.4972241520881653, + "learning_rate": 0.0004720550637043443, + "loss": 3.1822, + "step": 18706 + }, + { + "epoch": 0.92, + "grad_norm": 0.5117876529693604, + "learning_rate": 0.00047204245101490966, + "loss": 3.1758, + "step": 18707 + }, + { + "epoch": 0.92, + "grad_norm": 0.5374881625175476, + "learning_rate": 0.0004720298378723482, + "loss": 3.3106, + "step": 18708 + }, + { + "epoch": 0.92, + "grad_norm": 0.5357831716537476, + "learning_rate": 0.0004720172242766934, + "loss": 3.1024, + "step": 18709 + }, + { + "epoch": 0.92, + "grad_norm": 0.5341154932975769, + "learning_rate": 0.0004720046102279783, + "loss": 3.0437, + "step": 18710 + }, + { + "epoch": 0.92, + "grad_norm": 0.541704535484314, + "learning_rate": 0.00047199199572623617, + "loss": 3.3368, + "step": 18711 + }, + { + "epoch": 0.92, + "grad_norm": 0.5049653053283691, + "learning_rate": 0.0004719793807715002, + "loss": 3.1156, + "step": 18712 + }, + { + "epoch": 0.92, + "grad_norm": 0.517701268196106, + "learning_rate": 0.0004719667653638036, + "loss": 3.1901, + "step": 18713 + }, + { + "epoch": 0.92, + "grad_norm": 0.547834038734436, + "learning_rate": 0.00047195414950317965, + "loss": 3.1269, + "step": 18714 + }, + { + "epoch": 0.92, + "grad_norm": 0.5490109920501709, + "learning_rate": 0.0004719415331896616, + "loss": 3.1857, + "step": 18715 + }, + { + "epoch": 0.92, + "grad_norm": 0.5507968068122864, + "learning_rate": 0.0004719289164232827, + "loss": 3.3192, + "step": 18716 + }, + { + "epoch": 0.92, + "grad_norm": 0.5235647559165955, + "learning_rate": 0.000471916299204076, + "loss": 3.0702, + "step": 18717 + }, + { + "epoch": 0.92, + "grad_norm": 0.524595320224762, + "learning_rate": 0.0004719036815320749, + "loss": 3.0391, + "step": 18718 + }, + { + "epoch": 0.92, + "grad_norm": 0.5237332582473755, + "learning_rate": 0.0004718910634073126, + "loss": 3.1249, + "step": 18719 + }, + { + "epoch": 0.92, + "grad_norm": 0.541515052318573, + "learning_rate": 0.0004718784448298223, + "loss": 3.0022, + "step": 18720 + }, + { + "epoch": 0.92, + "grad_norm": 0.5793763995170593, + "learning_rate": 0.0004718658257996373, + "loss": 3.1743, + "step": 18721 + }, + { + "epoch": 0.92, + "grad_norm": 0.5258360505104065, + "learning_rate": 0.00047185320631679074, + "loss": 3.2977, + "step": 18722 + }, + { + "epoch": 0.92, + "grad_norm": 0.5656243562698364, + "learning_rate": 0.0004718405863813159, + "loss": 3.304, + "step": 18723 + }, + { + "epoch": 0.92, + "grad_norm": 0.5465549826622009, + "learning_rate": 0.0004718279659932461, + "loss": 3.3521, + "step": 18724 + }, + { + "epoch": 0.92, + "grad_norm": 0.5304923057556152, + "learning_rate": 0.0004718153451526144, + "loss": 3.1136, + "step": 18725 + }, + { + "epoch": 0.92, + "grad_norm": 0.5266689658164978, + "learning_rate": 0.00047180272385945426, + "loss": 3.258, + "step": 18726 + }, + { + "epoch": 0.92, + "grad_norm": 0.5561476945877075, + "learning_rate": 0.0004717901021137988, + "loss": 3.1804, + "step": 18727 + }, + { + "epoch": 0.92, + "grad_norm": 0.5388331413269043, + "learning_rate": 0.0004717774799156812, + "loss": 3.2172, + "step": 18728 + }, + { + "epoch": 0.92, + "grad_norm": 0.5558831691741943, + "learning_rate": 0.00047176485726513486, + "loss": 3.0524, + "step": 18729 + }, + { + "epoch": 0.92, + "grad_norm": 0.5594724416732788, + "learning_rate": 0.00047175223416219287, + "loss": 3.1281, + "step": 18730 + }, + { + "epoch": 0.92, + "grad_norm": 0.5364587306976318, + "learning_rate": 0.00047173961060688856, + "loss": 3.0472, + "step": 18731 + }, + { + "epoch": 0.92, + "grad_norm": 0.4832528531551361, + "learning_rate": 0.0004717269865992552, + "loss": 3.1427, + "step": 18732 + }, + { + "epoch": 0.92, + "grad_norm": 0.5160096287727356, + "learning_rate": 0.00047171436213932594, + "loss": 3.2637, + "step": 18733 + }, + { + "epoch": 0.92, + "grad_norm": 0.5533026456832886, + "learning_rate": 0.0004717017372271341, + "loss": 3.3568, + "step": 18734 + }, + { + "epoch": 0.92, + "grad_norm": 0.6159023642539978, + "learning_rate": 0.000471689111862713, + "loss": 3.4341, + "step": 18735 + }, + { + "epoch": 0.92, + "grad_norm": 0.5422484874725342, + "learning_rate": 0.00047167648604609587, + "loss": 3.0425, + "step": 18736 + }, + { + "epoch": 0.92, + "grad_norm": 0.5426680445671082, + "learning_rate": 0.00047166385977731584, + "loss": 3.0185, + "step": 18737 + }, + { + "epoch": 0.92, + "grad_norm": 0.5291516780853271, + "learning_rate": 0.0004716512330564062, + "loss": 3.0756, + "step": 18738 + }, + { + "epoch": 0.92, + "grad_norm": 0.5165324211120605, + "learning_rate": 0.0004716386058834003, + "loss": 3.1437, + "step": 18739 + }, + { + "epoch": 0.92, + "grad_norm": 0.5441447496414185, + "learning_rate": 0.00047162597825833126, + "loss": 3.1802, + "step": 18740 + }, + { + "epoch": 0.92, + "grad_norm": 0.5122418403625488, + "learning_rate": 0.00047161335018123255, + "loss": 3.0825, + "step": 18741 + }, + { + "epoch": 0.92, + "grad_norm": 0.5164358019828796, + "learning_rate": 0.0004716007216521372, + "loss": 3.1459, + "step": 18742 + }, + { + "epoch": 0.92, + "grad_norm": 0.5052106976509094, + "learning_rate": 0.0004715880926710787, + "loss": 3.1541, + "step": 18743 + }, + { + "epoch": 0.92, + "grad_norm": 0.51518315076828, + "learning_rate": 0.0004715754632380901, + "loss": 3.1712, + "step": 18744 + }, + { + "epoch": 0.92, + "grad_norm": 0.5447625517845154, + "learning_rate": 0.0004715628333532047, + "loss": 3.2673, + "step": 18745 + }, + { + "epoch": 0.92, + "grad_norm": 0.5232028961181641, + "learning_rate": 0.00047155020301645596, + "loss": 3.2926, + "step": 18746 + }, + { + "epoch": 0.92, + "grad_norm": 0.5608052015304565, + "learning_rate": 0.0004715375722278769, + "loss": 3.2625, + "step": 18747 + }, + { + "epoch": 0.92, + "grad_norm": 0.5659303665161133, + "learning_rate": 0.0004715249409875009, + "loss": 3.1281, + "step": 18748 + }, + { + "epoch": 0.92, + "grad_norm": 0.49152451753616333, + "learning_rate": 0.0004715123092953612, + "loss": 3.1931, + "step": 18749 + }, + { + "epoch": 0.92, + "grad_norm": 0.5154252052307129, + "learning_rate": 0.00047149967715149114, + "loss": 3.0923, + "step": 18750 + }, + { + "epoch": 0.92, + "grad_norm": 0.5097779035568237, + "learning_rate": 0.00047148704455592387, + "loss": 2.9026, + "step": 18751 + }, + { + "epoch": 0.92, + "grad_norm": 0.5418404936790466, + "learning_rate": 0.00047147441150869273, + "loss": 2.9388, + "step": 18752 + }, + { + "epoch": 0.92, + "grad_norm": 0.5441518425941467, + "learning_rate": 0.00047146177800983104, + "loss": 3.1708, + "step": 18753 + }, + { + "epoch": 0.92, + "grad_norm": 0.5178873538970947, + "learning_rate": 0.00047144914405937194, + "loss": 3.2744, + "step": 18754 + }, + { + "epoch": 0.92, + "grad_norm": 0.5090039968490601, + "learning_rate": 0.000471436509657349, + "loss": 3.1552, + "step": 18755 + }, + { + "epoch": 0.92, + "grad_norm": 0.5149325728416443, + "learning_rate": 0.000471423874803795, + "loss": 3.2349, + "step": 18756 + }, + { + "epoch": 0.92, + "grad_norm": 0.5354869961738586, + "learning_rate": 0.00047141123949874373, + "loss": 3.3082, + "step": 18757 + }, + { + "epoch": 0.92, + "grad_norm": 0.562247633934021, + "learning_rate": 0.00047139860374222813, + "loss": 3.2232, + "step": 18758 + }, + { + "epoch": 0.92, + "grad_norm": 0.5432979464530945, + "learning_rate": 0.00047138596753428164, + "loss": 3.2849, + "step": 18759 + }, + { + "epoch": 0.92, + "grad_norm": 0.5252031683921814, + "learning_rate": 0.00047137333087493745, + "loss": 3.0496, + "step": 18760 + }, + { + "epoch": 0.92, + "grad_norm": 0.5139029622077942, + "learning_rate": 0.00047136069376422894, + "loss": 3.0417, + "step": 18761 + }, + { + "epoch": 0.92, + "grad_norm": 0.49422696232795715, + "learning_rate": 0.00047134805620218935, + "loss": 3.0737, + "step": 18762 + }, + { + "epoch": 0.92, + "grad_norm": 0.548893928527832, + "learning_rate": 0.0004713354181888519, + "loss": 3.2133, + "step": 18763 + }, + { + "epoch": 0.92, + "grad_norm": 0.534731924533844, + "learning_rate": 0.00047132277972425, + "loss": 3.1941, + "step": 18764 + }, + { + "epoch": 0.92, + "grad_norm": 0.5220165252685547, + "learning_rate": 0.00047131014080841675, + "loss": 3.1918, + "step": 18765 + }, + { + "epoch": 0.92, + "grad_norm": 0.5755146145820618, + "learning_rate": 0.00047129750144138567, + "loss": 3.1908, + "step": 18766 + }, + { + "epoch": 0.92, + "grad_norm": 0.508004903793335, + "learning_rate": 0.0004712848616231899, + "loss": 3.2737, + "step": 18767 + }, + { + "epoch": 0.92, + "grad_norm": 0.5089250802993774, + "learning_rate": 0.0004712722213538628, + "loss": 2.9431, + "step": 18768 + }, + { + "epoch": 0.92, + "grad_norm": 0.573453962802887, + "learning_rate": 0.0004712595806334376, + "loss": 3.064, + "step": 18769 + }, + { + "epoch": 0.92, + "grad_norm": 0.5167307257652283, + "learning_rate": 0.0004712469394619476, + "loss": 3.0751, + "step": 18770 + }, + { + "epoch": 0.92, + "grad_norm": 0.5273975729942322, + "learning_rate": 0.00047123429783942614, + "loss": 3.1842, + "step": 18771 + }, + { + "epoch": 0.92, + "grad_norm": 0.5206418633460999, + "learning_rate": 0.00047122165576590655, + "loss": 3.0887, + "step": 18772 + }, + { + "epoch": 0.92, + "grad_norm": 0.5117706656455994, + "learning_rate": 0.00047120901324142206, + "loss": 3.3396, + "step": 18773 + }, + { + "epoch": 0.92, + "grad_norm": 0.5400931239128113, + "learning_rate": 0.0004711963702660059, + "loss": 3.1191, + "step": 18774 + }, + { + "epoch": 0.92, + "grad_norm": 0.5695016384124756, + "learning_rate": 0.0004711837268396915, + "loss": 3.0463, + "step": 18775 + }, + { + "epoch": 0.92, + "grad_norm": 0.5119388699531555, + "learning_rate": 0.000471171082962512, + "loss": 3.3504, + "step": 18776 + }, + { + "epoch": 0.92, + "grad_norm": 0.5598418712615967, + "learning_rate": 0.000471158438634501, + "loss": 3.1825, + "step": 18777 + }, + { + "epoch": 0.92, + "grad_norm": 0.5664933919906616, + "learning_rate": 0.0004711457938556915, + "loss": 3.1576, + "step": 18778 + }, + { + "epoch": 0.92, + "grad_norm": 0.5691736340522766, + "learning_rate": 0.000471133148626117, + "loss": 3.0413, + "step": 18779 + }, + { + "epoch": 0.92, + "grad_norm": 0.5121724009513855, + "learning_rate": 0.00047112050294581065, + "loss": 3.4054, + "step": 18780 + }, + { + "epoch": 0.92, + "grad_norm": 0.5062749981880188, + "learning_rate": 0.0004711078568148058, + "loss": 3.1947, + "step": 18781 + }, + { + "epoch": 0.92, + "grad_norm": 0.5275105237960815, + "learning_rate": 0.00047109521023313585, + "loss": 3.1643, + "step": 18782 + }, + { + "epoch": 0.92, + "grad_norm": 0.5385158061981201, + "learning_rate": 0.000471082563200834, + "loss": 3.0541, + "step": 18783 + }, + { + "epoch": 0.92, + "grad_norm": 0.5302549600601196, + "learning_rate": 0.00047106991571793357, + "loss": 2.8629, + "step": 18784 + }, + { + "epoch": 0.92, + "grad_norm": 0.5567852258682251, + "learning_rate": 0.0004710572677844679, + "loss": 3.1464, + "step": 18785 + }, + { + "epoch": 0.92, + "grad_norm": 0.5577290058135986, + "learning_rate": 0.0004710446194004704, + "loss": 3.2132, + "step": 18786 + }, + { + "epoch": 0.92, + "grad_norm": 0.516573965549469, + "learning_rate": 0.0004710319705659744, + "loss": 3.3098, + "step": 18787 + }, + { + "epoch": 0.92, + "grad_norm": 0.5409716367721558, + "learning_rate": 0.0004710193212810129, + "loss": 3.0126, + "step": 18788 + }, + { + "epoch": 0.92, + "grad_norm": 0.5368973612785339, + "learning_rate": 0.00047100667154561946, + "loss": 3.2313, + "step": 18789 + }, + { + "epoch": 0.92, + "grad_norm": 0.48436570167541504, + "learning_rate": 0.0004709940213598274, + "loss": 3.1132, + "step": 18790 + }, + { + "epoch": 0.92, + "grad_norm": 0.5413197875022888, + "learning_rate": 0.00047098137072366995, + "loss": 3.1343, + "step": 18791 + }, + { + "epoch": 0.92, + "grad_norm": 0.530865490436554, + "learning_rate": 0.00047096871963718046, + "loss": 3.0911, + "step": 18792 + }, + { + "epoch": 0.92, + "grad_norm": 0.5218861699104309, + "learning_rate": 0.00047095606810039237, + "loss": 3.2357, + "step": 18793 + }, + { + "epoch": 0.92, + "grad_norm": 0.5184247493743896, + "learning_rate": 0.00047094341611333886, + "loss": 3.1619, + "step": 18794 + }, + { + "epoch": 0.92, + "grad_norm": 0.5146009922027588, + "learning_rate": 0.00047093076367605316, + "loss": 3.2411, + "step": 18795 + }, + { + "epoch": 0.92, + "grad_norm": 0.5616743564605713, + "learning_rate": 0.0004709181107885689, + "loss": 3.233, + "step": 18796 + }, + { + "epoch": 0.92, + "grad_norm": 0.5262179970741272, + "learning_rate": 0.0004709054574509192, + "loss": 3.3252, + "step": 18797 + }, + { + "epoch": 0.92, + "grad_norm": 0.5056144595146179, + "learning_rate": 0.0004708928036631374, + "loss": 3.1111, + "step": 18798 + }, + { + "epoch": 0.92, + "grad_norm": 0.5371179580688477, + "learning_rate": 0.00047088014942525675, + "loss": 3.1897, + "step": 18799 + }, + { + "epoch": 0.92, + "grad_norm": 0.5067994594573975, + "learning_rate": 0.0004708674947373108, + "loss": 3.3786, + "step": 18800 + }, + { + "epoch": 0.92, + "grad_norm": 0.5345014929771423, + "learning_rate": 0.0004708548395993326, + "loss": 3.2104, + "step": 18801 + }, + { + "epoch": 0.92, + "grad_norm": 0.5636339783668518, + "learning_rate": 0.00047084218401135576, + "loss": 3.3303, + "step": 18802 + }, + { + "epoch": 0.92, + "grad_norm": 0.4965422749519348, + "learning_rate": 0.0004708295279734135, + "loss": 3.1805, + "step": 18803 + }, + { + "epoch": 0.92, + "grad_norm": 0.533908486366272, + "learning_rate": 0.0004708168714855391, + "loss": 3.1346, + "step": 18804 + }, + { + "epoch": 0.92, + "grad_norm": 0.512013852596283, + "learning_rate": 0.0004708042145477659, + "loss": 3.1826, + "step": 18805 + }, + { + "epoch": 0.92, + "grad_norm": 0.5449269413948059, + "learning_rate": 0.0004707915571601273, + "loss": 3.1899, + "step": 18806 + }, + { + "epoch": 0.92, + "grad_norm": 0.5012300610542297, + "learning_rate": 0.0004707788993226567, + "loss": 3.4398, + "step": 18807 + }, + { + "epoch": 0.92, + "grad_norm": 0.5492137670516968, + "learning_rate": 0.00047076624103538725, + "loss": 3.1645, + "step": 18808 + }, + { + "epoch": 0.92, + "grad_norm": 0.5013618469238281, + "learning_rate": 0.00047075358229835236, + "loss": 3.1924, + "step": 18809 + }, + { + "epoch": 0.92, + "grad_norm": 0.5345459580421448, + "learning_rate": 0.00047074092311158536, + "loss": 3.0099, + "step": 18810 + }, + { + "epoch": 0.92, + "grad_norm": 0.5092979669570923, + "learning_rate": 0.0004707282634751197, + "loss": 3.1141, + "step": 18811 + }, + { + "epoch": 0.92, + "grad_norm": 0.5068957805633545, + "learning_rate": 0.0004707156033889887, + "loss": 3.3472, + "step": 18812 + }, + { + "epoch": 0.92, + "grad_norm": 0.5265266299247742, + "learning_rate": 0.0004707029428532256, + "loss": 2.9948, + "step": 18813 + }, + { + "epoch": 0.92, + "grad_norm": 0.5371838808059692, + "learning_rate": 0.00047069028186786386, + "loss": 3.2557, + "step": 18814 + }, + { + "epoch": 0.92, + "grad_norm": 0.5402524471282959, + "learning_rate": 0.0004706776204329367, + "loss": 3.1617, + "step": 18815 + }, + { + "epoch": 0.92, + "grad_norm": 0.5330641269683838, + "learning_rate": 0.0004706649585484775, + "loss": 3.1771, + "step": 18816 + }, + { + "epoch": 0.92, + "grad_norm": 0.5558748841285706, + "learning_rate": 0.0004706522962145197, + "loss": 3.1897, + "step": 18817 + }, + { + "epoch": 0.92, + "grad_norm": 0.5087716579437256, + "learning_rate": 0.0004706396334310966, + "loss": 3.1177, + "step": 18818 + }, + { + "epoch": 0.92, + "grad_norm": 0.5257171392440796, + "learning_rate": 0.00047062697019824155, + "loss": 3.0529, + "step": 18819 + }, + { + "epoch": 0.92, + "grad_norm": 0.5224782228469849, + "learning_rate": 0.00047061430651598783, + "loss": 3.3477, + "step": 18820 + }, + { + "epoch": 0.92, + "grad_norm": 0.5466150641441345, + "learning_rate": 0.0004706016423843689, + "loss": 2.9722, + "step": 18821 + }, + { + "epoch": 0.92, + "grad_norm": 0.5087209939956665, + "learning_rate": 0.0004705889778034181, + "loss": 3.4086, + "step": 18822 + }, + { + "epoch": 0.92, + "grad_norm": 0.5442378520965576, + "learning_rate": 0.00047057631277316875, + "loss": 3.4454, + "step": 18823 + }, + { + "epoch": 0.92, + "grad_norm": 0.5326034426689148, + "learning_rate": 0.00047056364729365424, + "loss": 3.2417, + "step": 18824 + }, + { + "epoch": 0.92, + "grad_norm": 0.4734261929988861, + "learning_rate": 0.0004705509813649079, + "loss": 3.2332, + "step": 18825 + }, + { + "epoch": 0.92, + "grad_norm": 0.543708324432373, + "learning_rate": 0.0004705383149869631, + "loss": 3.2303, + "step": 18826 + }, + { + "epoch": 0.92, + "grad_norm": 0.5420598983764648, + "learning_rate": 0.00047052564815985317, + "loss": 3.2229, + "step": 18827 + }, + { + "epoch": 0.92, + "grad_norm": 0.543842077255249, + "learning_rate": 0.00047051298088361155, + "loss": 3.2, + "step": 18828 + }, + { + "epoch": 0.92, + "grad_norm": 0.511565089225769, + "learning_rate": 0.00047050031315827156, + "loss": 3.1562, + "step": 18829 + }, + { + "epoch": 0.92, + "grad_norm": 0.5324139595031738, + "learning_rate": 0.0004704876449838665, + "loss": 3.0808, + "step": 18830 + }, + { + "epoch": 0.92, + "grad_norm": 0.5224776268005371, + "learning_rate": 0.00047047497636042976, + "loss": 3.131, + "step": 18831 + }, + { + "epoch": 0.92, + "grad_norm": 0.525739312171936, + "learning_rate": 0.00047046230728799485, + "loss": 3.2305, + "step": 18832 + }, + { + "epoch": 0.92, + "grad_norm": 0.5212183594703674, + "learning_rate": 0.00047044963776659497, + "loss": 3.0373, + "step": 18833 + }, + { + "epoch": 0.92, + "grad_norm": 0.5085528492927551, + "learning_rate": 0.0004704369677962636, + "loss": 3.235, + "step": 18834 + }, + { + "epoch": 0.92, + "grad_norm": 0.5560317635536194, + "learning_rate": 0.0004704242973770339, + "loss": 3.2723, + "step": 18835 + }, + { + "epoch": 0.92, + "grad_norm": 0.48996028304100037, + "learning_rate": 0.00047041162650893957, + "loss": 3.139, + "step": 18836 + }, + { + "epoch": 0.92, + "grad_norm": 0.5113476514816284, + "learning_rate": 0.0004703989551920138, + "loss": 3.1393, + "step": 18837 + }, + { + "epoch": 0.92, + "grad_norm": 0.5235118865966797, + "learning_rate": 0.0004703862834262899, + "loss": 3.1353, + "step": 18838 + }, + { + "epoch": 0.92, + "grad_norm": 0.523135244846344, + "learning_rate": 0.0004703736112118014, + "loss": 3.1062, + "step": 18839 + }, + { + "epoch": 0.92, + "grad_norm": 0.5716493725776672, + "learning_rate": 0.0004703609385485815, + "loss": 3.2806, + "step": 18840 + }, + { + "epoch": 0.92, + "grad_norm": 0.4963917136192322, + "learning_rate": 0.00047034826543666374, + "loss": 3.0589, + "step": 18841 + }, + { + "epoch": 0.92, + "grad_norm": 0.54476398229599, + "learning_rate": 0.0004703355918760814, + "loss": 3.0997, + "step": 18842 + }, + { + "epoch": 0.92, + "grad_norm": 0.48611146211624146, + "learning_rate": 0.0004703229178668679, + "loss": 3.0625, + "step": 18843 + }, + { + "epoch": 0.92, + "grad_norm": 0.5003273487091064, + "learning_rate": 0.0004703102434090566, + "loss": 3.0927, + "step": 18844 + }, + { + "epoch": 0.92, + "grad_norm": 0.5381352305412292, + "learning_rate": 0.00047029756850268097, + "loss": 2.7574, + "step": 18845 + }, + { + "epoch": 0.92, + "grad_norm": 0.5133535265922546, + "learning_rate": 0.00047028489314777423, + "loss": 3.2066, + "step": 18846 + }, + { + "epoch": 0.92, + "grad_norm": 0.4886670708656311, + "learning_rate": 0.0004702722173443699, + "loss": 2.9163, + "step": 18847 + }, + { + "epoch": 0.92, + "grad_norm": 0.513353705406189, + "learning_rate": 0.0004702595410925013, + "loss": 3.2403, + "step": 18848 + }, + { + "epoch": 0.92, + "grad_norm": 0.5126110911369324, + "learning_rate": 0.00047024686439220187, + "loss": 3.1207, + "step": 18849 + }, + { + "epoch": 0.92, + "grad_norm": 0.525732696056366, + "learning_rate": 0.0004702341872435049, + "loss": 3.2541, + "step": 18850 + }, + { + "epoch": 0.92, + "grad_norm": 0.5350430607795715, + "learning_rate": 0.0004702215096464439, + "loss": 3.1009, + "step": 18851 + }, + { + "epoch": 0.92, + "grad_norm": 0.5231349468231201, + "learning_rate": 0.0004702088316010522, + "loss": 2.9764, + "step": 18852 + }, + { + "epoch": 0.92, + "grad_norm": 0.5163970589637756, + "learning_rate": 0.00047019615310736314, + "loss": 2.9218, + "step": 18853 + }, + { + "epoch": 0.92, + "grad_norm": 0.5017314553260803, + "learning_rate": 0.00047018347416541027, + "loss": 3.249, + "step": 18854 + }, + { + "epoch": 0.92, + "grad_norm": 0.5135806798934937, + "learning_rate": 0.00047017079477522676, + "loss": 3.0667, + "step": 18855 + }, + { + "epoch": 0.92, + "grad_norm": 0.4848870635032654, + "learning_rate": 0.0004701581149368463, + "loss": 3.2183, + "step": 18856 + }, + { + "epoch": 0.92, + "grad_norm": 0.5242249369621277, + "learning_rate": 0.0004701454346503019, + "loss": 3.0505, + "step": 18857 + }, + { + "epoch": 0.92, + "grad_norm": 0.5194023847579956, + "learning_rate": 0.00047013275391562736, + "loss": 3.2237, + "step": 18858 + }, + { + "epoch": 0.92, + "grad_norm": 0.5167834162712097, + "learning_rate": 0.00047012007273285574, + "loss": 3.0208, + "step": 18859 + }, + { + "epoch": 0.92, + "grad_norm": 0.5210264921188354, + "learning_rate": 0.0004701073911020207, + "loss": 3.0097, + "step": 18860 + }, + { + "epoch": 0.92, + "grad_norm": 0.5134527087211609, + "learning_rate": 0.0004700947090231555, + "loss": 3.0335, + "step": 18861 + }, + { + "epoch": 0.92, + "grad_norm": 0.5310779213905334, + "learning_rate": 0.00047008202649629347, + "loss": 3.2074, + "step": 18862 + }, + { + "epoch": 0.92, + "grad_norm": 0.5398174524307251, + "learning_rate": 0.0004700693435214682, + "loss": 2.9155, + "step": 18863 + }, + { + "epoch": 0.92, + "grad_norm": 0.5127217173576355, + "learning_rate": 0.000470056660098713, + "loss": 3.0767, + "step": 18864 + }, + { + "epoch": 0.92, + "grad_norm": 0.4852171838283539, + "learning_rate": 0.00047004397622806135, + "loss": 3.2109, + "step": 18865 + }, + { + "epoch": 0.92, + "grad_norm": 0.514201283454895, + "learning_rate": 0.0004700312919095465, + "loss": 3.2189, + "step": 18866 + }, + { + "epoch": 0.92, + "grad_norm": 0.4986250698566437, + "learning_rate": 0.000470018607143202, + "loss": 3.2653, + "step": 18867 + }, + { + "epoch": 0.92, + "grad_norm": 0.531745195388794, + "learning_rate": 0.00047000592192906115, + "loss": 3.1656, + "step": 18868 + }, + { + "epoch": 0.92, + "grad_norm": 0.5156833529472351, + "learning_rate": 0.00046999323626715746, + "loss": 3.1795, + "step": 18869 + }, + { + "epoch": 0.92, + "grad_norm": 0.5258499383926392, + "learning_rate": 0.00046998055015752435, + "loss": 3.031, + "step": 18870 + }, + { + "epoch": 0.92, + "grad_norm": 0.4953664541244507, + "learning_rate": 0.00046996786360019516, + "loss": 3.1802, + "step": 18871 + }, + { + "epoch": 0.92, + "grad_norm": 0.5456046462059021, + "learning_rate": 0.00046995517659520323, + "loss": 3.224, + "step": 18872 + }, + { + "epoch": 0.92, + "grad_norm": 0.5381906628608704, + "learning_rate": 0.00046994248914258217, + "loss": 3.1333, + "step": 18873 + }, + { + "epoch": 0.92, + "grad_norm": 0.5225964784622192, + "learning_rate": 0.00046992980124236527, + "loss": 3.2528, + "step": 18874 + }, + { + "epoch": 0.93, + "grad_norm": 0.492789089679718, + "learning_rate": 0.00046991711289458597, + "loss": 2.9492, + "step": 18875 + }, + { + "epoch": 0.93, + "grad_norm": 0.5383958220481873, + "learning_rate": 0.00046990442409927777, + "loss": 3.2909, + "step": 18876 + }, + { + "epoch": 0.93, + "grad_norm": 0.5192603468894958, + "learning_rate": 0.0004698917348564739, + "loss": 3.021, + "step": 18877 + }, + { + "epoch": 0.93, + "grad_norm": 0.5470327138900757, + "learning_rate": 0.000469879045166208, + "loss": 3.0586, + "step": 18878 + }, + { + "epoch": 0.93, + "grad_norm": 0.5224716067314148, + "learning_rate": 0.0004698663550285133, + "loss": 2.977, + "step": 18879 + }, + { + "epoch": 0.93, + "grad_norm": 0.5188048481941223, + "learning_rate": 0.00046985366444342335, + "loss": 3.1992, + "step": 18880 + }, + { + "epoch": 0.93, + "grad_norm": 0.5498374700546265, + "learning_rate": 0.0004698409734109716, + "loss": 2.9874, + "step": 18881 + }, + { + "epoch": 0.93, + "grad_norm": 0.5041355490684509, + "learning_rate": 0.00046982828193119127, + "loss": 3.0512, + "step": 18882 + }, + { + "epoch": 0.93, + "grad_norm": 0.5342227220535278, + "learning_rate": 0.00046981559000411604, + "loss": 3.1102, + "step": 18883 + }, + { + "epoch": 0.93, + "grad_norm": 0.5024885535240173, + "learning_rate": 0.0004698028976297792, + "loss": 3.1677, + "step": 18884 + }, + { + "epoch": 0.93, + "grad_norm": 0.5300989151000977, + "learning_rate": 0.0004697902048082142, + "loss": 3.1714, + "step": 18885 + }, + { + "epoch": 0.93, + "grad_norm": 0.5051508545875549, + "learning_rate": 0.00046977751153945457, + "loss": 2.9421, + "step": 18886 + }, + { + "epoch": 0.93, + "grad_norm": 0.5393108129501343, + "learning_rate": 0.0004697648178235335, + "loss": 3.0567, + "step": 18887 + }, + { + "epoch": 0.93, + "grad_norm": 0.5067991614341736, + "learning_rate": 0.00046975212366048467, + "loss": 3.2442, + "step": 18888 + }, + { + "epoch": 0.93, + "grad_norm": 0.5369142889976501, + "learning_rate": 0.00046973942905034137, + "loss": 3.0943, + "step": 18889 + }, + { + "epoch": 0.93, + "grad_norm": 0.5115844011306763, + "learning_rate": 0.0004697267339931371, + "loss": 3.1095, + "step": 18890 + }, + { + "epoch": 0.93, + "grad_norm": 0.48783326148986816, + "learning_rate": 0.0004697140384889053, + "loss": 3.1938, + "step": 18891 + }, + { + "epoch": 0.93, + "grad_norm": 0.5423147678375244, + "learning_rate": 0.00046970134253767937, + "loss": 3.1264, + "step": 18892 + }, + { + "epoch": 0.93, + "grad_norm": 0.5428636074066162, + "learning_rate": 0.00046968864613949267, + "loss": 3.3557, + "step": 18893 + }, + { + "epoch": 0.93, + "grad_norm": 0.5630121827125549, + "learning_rate": 0.00046967594929437885, + "loss": 2.9878, + "step": 18894 + }, + { + "epoch": 0.93, + "grad_norm": 0.5453979969024658, + "learning_rate": 0.00046966325200237117, + "loss": 3.0636, + "step": 18895 + }, + { + "epoch": 0.93, + "grad_norm": 0.5532796382904053, + "learning_rate": 0.0004696505542635032, + "loss": 3.1966, + "step": 18896 + }, + { + "epoch": 0.93, + "grad_norm": 0.5100991725921631, + "learning_rate": 0.00046963785607780827, + "loss": 3.0472, + "step": 18897 + }, + { + "epoch": 0.93, + "grad_norm": 0.5884232521057129, + "learning_rate": 0.0004696251574453198, + "loss": 2.9916, + "step": 18898 + }, + { + "epoch": 0.93, + "grad_norm": 0.5338824391365051, + "learning_rate": 0.0004696124583660715, + "loss": 3.2273, + "step": 18899 + }, + { + "epoch": 0.93, + "grad_norm": 0.5424486398696899, + "learning_rate": 0.00046959975884009646, + "loss": 3.3073, + "step": 18900 + }, + { + "epoch": 0.93, + "grad_norm": 0.5141465067863464, + "learning_rate": 0.0004695870588674284, + "loss": 3.2295, + "step": 18901 + }, + { + "epoch": 0.93, + "grad_norm": 0.5177417993545532, + "learning_rate": 0.00046957435844810063, + "loss": 3.2054, + "step": 18902 + }, + { + "epoch": 0.93, + "grad_norm": 0.5106486082077026, + "learning_rate": 0.0004695616575821466, + "loss": 3.1774, + "step": 18903 + }, + { + "epoch": 0.93, + "grad_norm": 0.5100563168525696, + "learning_rate": 0.0004695489562695998, + "loss": 3.0875, + "step": 18904 + }, + { + "epoch": 0.93, + "grad_norm": 0.524345338344574, + "learning_rate": 0.0004695362545104938, + "loss": 3.2862, + "step": 18905 + }, + { + "epoch": 0.93, + "grad_norm": 0.5443475246429443, + "learning_rate": 0.0004695235523048618, + "loss": 2.8944, + "step": 18906 + }, + { + "epoch": 0.93, + "grad_norm": 0.5217635631561279, + "learning_rate": 0.00046951084965273746, + "loss": 3.171, + "step": 18907 + }, + { + "epoch": 0.93, + "grad_norm": 0.5589973330497742, + "learning_rate": 0.00046949814655415406, + "loss": 3.0554, + "step": 18908 + }, + { + "epoch": 0.93, + "grad_norm": 0.49502140283584595, + "learning_rate": 0.0004694854430091452, + "loss": 3.2162, + "step": 18909 + }, + { + "epoch": 0.93, + "grad_norm": 0.5253203511238098, + "learning_rate": 0.00046947273901774443, + "loss": 3.2409, + "step": 18910 + }, + { + "epoch": 0.93, + "grad_norm": 0.508833646774292, + "learning_rate": 0.000469460034579985, + "loss": 3.0954, + "step": 18911 + }, + { + "epoch": 0.93, + "grad_norm": 0.5880780816078186, + "learning_rate": 0.0004694473296959004, + "loss": 3.1979, + "step": 18912 + }, + { + "epoch": 0.93, + "grad_norm": 0.5805985331535339, + "learning_rate": 0.0004694346243655242, + "loss": 3.0149, + "step": 18913 + }, + { + "epoch": 0.93, + "grad_norm": 0.5318664908409119, + "learning_rate": 0.00046942191858888977, + "loss": 3.1204, + "step": 18914 + }, + { + "epoch": 0.93, + "grad_norm": 0.5232014060020447, + "learning_rate": 0.00046940921236603064, + "loss": 3.1569, + "step": 18915 + }, + { + "epoch": 0.93, + "grad_norm": 0.5750177502632141, + "learning_rate": 0.00046939650569698024, + "loss": 2.9964, + "step": 18916 + }, + { + "epoch": 0.93, + "grad_norm": 0.5180473327636719, + "learning_rate": 0.00046938379858177203, + "loss": 3.063, + "step": 18917 + }, + { + "epoch": 0.93, + "grad_norm": 0.5332753658294678, + "learning_rate": 0.0004693710910204395, + "loss": 3.2211, + "step": 18918 + }, + { + "epoch": 0.93, + "grad_norm": 0.5309212803840637, + "learning_rate": 0.0004693583830130162, + "loss": 3.2455, + "step": 18919 + }, + { + "epoch": 0.93, + "grad_norm": 0.530160129070282, + "learning_rate": 0.0004693456745595354, + "loss": 2.9723, + "step": 18920 + }, + { + "epoch": 0.93, + "grad_norm": 0.5082368850708008, + "learning_rate": 0.00046933296566003076, + "loss": 3.102, + "step": 18921 + }, + { + "epoch": 0.93, + "grad_norm": 0.5547425746917725, + "learning_rate": 0.00046932025631453564, + "loss": 2.9199, + "step": 18922 + }, + { + "epoch": 0.93, + "grad_norm": 0.5218966603279114, + "learning_rate": 0.0004693075465230835, + "loss": 3.0292, + "step": 18923 + }, + { + "epoch": 0.93, + "grad_norm": 0.551936149597168, + "learning_rate": 0.00046929483628570793, + "loss": 3.1246, + "step": 18924 + }, + { + "epoch": 0.93, + "grad_norm": 0.516350269317627, + "learning_rate": 0.0004692821256024423, + "loss": 3.2375, + "step": 18925 + }, + { + "epoch": 0.93, + "grad_norm": 0.5745588541030884, + "learning_rate": 0.00046926941447332016, + "loss": 2.9728, + "step": 18926 + }, + { + "epoch": 0.93, + "grad_norm": 0.5034775733947754, + "learning_rate": 0.00046925670289837496, + "loss": 3.3807, + "step": 18927 + }, + { + "epoch": 0.93, + "grad_norm": 0.6255304217338562, + "learning_rate": 0.0004692439908776402, + "loss": 3.2879, + "step": 18928 + }, + { + "epoch": 0.93, + "grad_norm": 0.49725767970085144, + "learning_rate": 0.00046923127841114924, + "loss": 3.0999, + "step": 18929 + }, + { + "epoch": 0.93, + "grad_norm": 0.5189871788024902, + "learning_rate": 0.0004692185654989358, + "loss": 3.1402, + "step": 18930 + }, + { + "epoch": 0.93, + "grad_norm": 0.5304571390151978, + "learning_rate": 0.00046920585214103324, + "loss": 3.2725, + "step": 18931 + }, + { + "epoch": 0.93, + "grad_norm": 0.5378173589706421, + "learning_rate": 0.00046919313833747485, + "loss": 3.1548, + "step": 18932 + }, + { + "epoch": 0.93, + "grad_norm": 0.5394854545593262, + "learning_rate": 0.00046918042408829446, + "loss": 3.1416, + "step": 18933 + }, + { + "epoch": 0.93, + "grad_norm": 0.5187913179397583, + "learning_rate": 0.0004691677093935253, + "loss": 2.9619, + "step": 18934 + }, + { + "epoch": 0.93, + "grad_norm": 0.5153508186340332, + "learning_rate": 0.000469154994253201, + "loss": 3.1343, + "step": 18935 + }, + { + "epoch": 0.93, + "grad_norm": 0.5249007344245911, + "learning_rate": 0.0004691422786673549, + "loss": 3.223, + "step": 18936 + }, + { + "epoch": 0.93, + "grad_norm": 0.514267086982727, + "learning_rate": 0.00046912956263602077, + "loss": 3.1555, + "step": 18937 + }, + { + "epoch": 0.93, + "grad_norm": 0.515212893486023, + "learning_rate": 0.0004691168461592318, + "loss": 3.1127, + "step": 18938 + }, + { + "epoch": 0.93, + "grad_norm": 0.48960253596305847, + "learning_rate": 0.0004691041292370216, + "loss": 3.1147, + "step": 18939 + }, + { + "epoch": 0.93, + "grad_norm": 0.5144261717796326, + "learning_rate": 0.0004690914118694237, + "loss": 3.2216, + "step": 18940 + }, + { + "epoch": 0.93, + "grad_norm": 0.523517906665802, + "learning_rate": 0.0004690786940564716, + "loss": 3.0786, + "step": 18941 + }, + { + "epoch": 0.93, + "grad_norm": 0.5452386140823364, + "learning_rate": 0.00046906597579819865, + "loss": 3.1958, + "step": 18942 + }, + { + "epoch": 0.93, + "grad_norm": 0.5395174026489258, + "learning_rate": 0.0004690532570946386, + "loss": 2.9997, + "step": 18943 + }, + { + "epoch": 0.93, + "grad_norm": 0.5560466051101685, + "learning_rate": 0.00046904053794582465, + "loss": 3.3784, + "step": 18944 + }, + { + "epoch": 0.93, + "grad_norm": 0.525627076625824, + "learning_rate": 0.0004690278183517906, + "loss": 2.9482, + "step": 18945 + }, + { + "epoch": 0.93, + "grad_norm": 0.5005621314048767, + "learning_rate": 0.0004690150983125697, + "loss": 3.0905, + "step": 18946 + }, + { + "epoch": 0.93, + "grad_norm": 0.5510935187339783, + "learning_rate": 0.0004690023778281957, + "loss": 3.3415, + "step": 18947 + }, + { + "epoch": 0.93, + "grad_norm": 0.5250145196914673, + "learning_rate": 0.0004689896568987018, + "loss": 3.1714, + "step": 18948 + }, + { + "epoch": 0.93, + "grad_norm": 0.521665096282959, + "learning_rate": 0.0004689769355241217, + "loss": 3.2226, + "step": 18949 + }, + { + "epoch": 0.93, + "grad_norm": 0.6473313570022583, + "learning_rate": 0.00046896421370448894, + "loss": 3.2003, + "step": 18950 + }, + { + "epoch": 0.93, + "grad_norm": 0.5071868300437927, + "learning_rate": 0.0004689514914398369, + "loss": 2.9599, + "step": 18951 + }, + { + "epoch": 0.93, + "grad_norm": 0.5094021558761597, + "learning_rate": 0.0004689387687301992, + "loss": 2.9249, + "step": 18952 + }, + { + "epoch": 0.93, + "grad_norm": 0.5259853601455688, + "learning_rate": 0.0004689260455756092, + "loss": 3.2138, + "step": 18953 + }, + { + "epoch": 0.93, + "grad_norm": 0.4910939335823059, + "learning_rate": 0.00046891332197610057, + "loss": 3.1469, + "step": 18954 + }, + { + "epoch": 0.93, + "grad_norm": 0.5359907746315002, + "learning_rate": 0.00046890059793170676, + "loss": 3.2667, + "step": 18955 + }, + { + "epoch": 0.93, + "grad_norm": 0.5295614004135132, + "learning_rate": 0.00046888787344246134, + "loss": 2.998, + "step": 18956 + }, + { + "epoch": 0.93, + "grad_norm": 0.5557469129562378, + "learning_rate": 0.0004688751485083977, + "loss": 3.1919, + "step": 18957 + }, + { + "epoch": 0.93, + "grad_norm": 0.5100527405738831, + "learning_rate": 0.0004688624231295494, + "loss": 3.078, + "step": 18958 + }, + { + "epoch": 0.93, + "grad_norm": 0.5492949485778809, + "learning_rate": 0.00046884969730595, + "loss": 3.2749, + "step": 18959 + }, + { + "epoch": 0.93, + "grad_norm": 0.5385236144065857, + "learning_rate": 0.00046883697103763293, + "loss": 3.0871, + "step": 18960 + }, + { + "epoch": 0.93, + "grad_norm": 0.5516214966773987, + "learning_rate": 0.0004688242443246319, + "loss": 2.9884, + "step": 18961 + }, + { + "epoch": 0.93, + "grad_norm": 0.5254327654838562, + "learning_rate": 0.00046881151716698027, + "loss": 3.0269, + "step": 18962 + }, + { + "epoch": 0.93, + "grad_norm": 0.50152587890625, + "learning_rate": 0.0004687987895647115, + "loss": 3.1315, + "step": 18963 + }, + { + "epoch": 0.93, + "grad_norm": 0.5856966376304626, + "learning_rate": 0.0004687860615178593, + "loss": 2.9434, + "step": 18964 + }, + { + "epoch": 0.93, + "grad_norm": 0.5326898694038391, + "learning_rate": 0.00046877333302645707, + "loss": 3.2463, + "step": 18965 + }, + { + "epoch": 0.93, + "grad_norm": 0.5047300457954407, + "learning_rate": 0.00046876060409053826, + "loss": 3.0858, + "step": 18966 + }, + { + "epoch": 0.93, + "grad_norm": 0.601098358631134, + "learning_rate": 0.00046874787471013667, + "loss": 3.0427, + "step": 18967 + }, + { + "epoch": 0.93, + "grad_norm": 0.5478784441947937, + "learning_rate": 0.0004687351448852855, + "loss": 3.1414, + "step": 18968 + }, + { + "epoch": 0.93, + "grad_norm": 0.5420882701873779, + "learning_rate": 0.0004687224146160186, + "loss": 3.1592, + "step": 18969 + }, + { + "epoch": 0.93, + "grad_norm": 0.5226287245750427, + "learning_rate": 0.0004687096839023692, + "loss": 3.1503, + "step": 18970 + }, + { + "epoch": 0.93, + "grad_norm": 0.5345956087112427, + "learning_rate": 0.000468696952744371, + "loss": 3.3743, + "step": 18971 + }, + { + "epoch": 0.93, + "grad_norm": 0.5423299074172974, + "learning_rate": 0.00046868422114205755, + "loss": 3.0927, + "step": 18972 + }, + { + "epoch": 0.93, + "grad_norm": 0.532418966293335, + "learning_rate": 0.00046867148909546225, + "loss": 3.2014, + "step": 18973 + }, + { + "epoch": 0.93, + "grad_norm": 0.5317066311836243, + "learning_rate": 0.0004686587566046187, + "loss": 3.1707, + "step": 18974 + }, + { + "epoch": 0.93, + "grad_norm": 0.5457773208618164, + "learning_rate": 0.00046864602366956037, + "loss": 2.9554, + "step": 18975 + }, + { + "epoch": 0.93, + "grad_norm": 0.5188515186309814, + "learning_rate": 0.00046863329029032095, + "loss": 3.2574, + "step": 18976 + }, + { + "epoch": 0.93, + "grad_norm": 0.5296633839607239, + "learning_rate": 0.000468620556466934, + "loss": 3.1508, + "step": 18977 + }, + { + "epoch": 0.93, + "grad_norm": 0.5568720102310181, + "learning_rate": 0.0004686078221994329, + "loss": 3.2111, + "step": 18978 + }, + { + "epoch": 0.93, + "grad_norm": 0.5664288997650146, + "learning_rate": 0.00046859508748785126, + "loss": 3.0099, + "step": 18979 + }, + { + "epoch": 0.93, + "grad_norm": 0.5301545262336731, + "learning_rate": 0.0004685823523322225, + "loss": 3.2176, + "step": 18980 + }, + { + "epoch": 0.93, + "grad_norm": 0.5368116497993469, + "learning_rate": 0.0004685696167325803, + "loss": 3.2631, + "step": 18981 + }, + { + "epoch": 0.93, + "grad_norm": 0.534150242805481, + "learning_rate": 0.0004685568806889582, + "loss": 3.0123, + "step": 18982 + }, + { + "epoch": 0.93, + "grad_norm": 0.4915403723716736, + "learning_rate": 0.00046854414420138975, + "loss": 3.3539, + "step": 18983 + }, + { + "epoch": 0.93, + "grad_norm": 0.5354322195053101, + "learning_rate": 0.00046853140726990845, + "loss": 3.245, + "step": 18984 + }, + { + "epoch": 0.93, + "grad_norm": 0.4962729811668396, + "learning_rate": 0.0004685186698945478, + "loss": 3.1999, + "step": 18985 + }, + { + "epoch": 0.93, + "grad_norm": 0.5085036158561707, + "learning_rate": 0.0004685059320753414, + "loss": 3.2156, + "step": 18986 + }, + { + "epoch": 0.93, + "grad_norm": 0.5053709745407104, + "learning_rate": 0.00046849319381232285, + "loss": 3.1658, + "step": 18987 + }, + { + "epoch": 0.93, + "grad_norm": 0.5463716983795166, + "learning_rate": 0.0004684804551055257, + "loss": 3.1854, + "step": 18988 + }, + { + "epoch": 0.93, + "grad_norm": 0.5284098386764526, + "learning_rate": 0.0004684677159549834, + "loss": 3.3753, + "step": 18989 + }, + { + "epoch": 0.93, + "grad_norm": 0.5338199734687805, + "learning_rate": 0.0004684549763607296, + "loss": 3.1831, + "step": 18990 + }, + { + "epoch": 0.93, + "grad_norm": 0.5269510746002197, + "learning_rate": 0.0004684422363227977, + "loss": 3.2542, + "step": 18991 + }, + { + "epoch": 0.93, + "grad_norm": 0.5175192356109619, + "learning_rate": 0.0004684294958412215, + "loss": 3.0125, + "step": 18992 + }, + { + "epoch": 0.93, + "grad_norm": 0.5156930088996887, + "learning_rate": 0.0004684167549160344, + "loss": 3.3845, + "step": 18993 + }, + { + "epoch": 0.93, + "grad_norm": 0.5119640231132507, + "learning_rate": 0.0004684040135472699, + "loss": 3.0625, + "step": 18994 + }, + { + "epoch": 0.93, + "grad_norm": 0.5390982031822205, + "learning_rate": 0.00046839127173496176, + "loss": 3.0787, + "step": 18995 + }, + { + "epoch": 0.93, + "grad_norm": 0.5263301134109497, + "learning_rate": 0.00046837852947914324, + "loss": 3.0068, + "step": 18996 + }, + { + "epoch": 0.93, + "grad_norm": 0.5323955416679382, + "learning_rate": 0.00046836578677984824, + "loss": 3.0541, + "step": 18997 + }, + { + "epoch": 0.93, + "grad_norm": 0.5111451745033264, + "learning_rate": 0.00046835304363711006, + "loss": 3.058, + "step": 18998 + }, + { + "epoch": 0.93, + "grad_norm": 0.5534102916717529, + "learning_rate": 0.0004683403000509624, + "loss": 3.2107, + "step": 18999 + }, + { + "epoch": 0.93, + "grad_norm": 0.5770662426948547, + "learning_rate": 0.0004683275560214388, + "loss": 3.053, + "step": 19000 + }, + { + "epoch": 0.93, + "grad_norm": 0.6135122179985046, + "learning_rate": 0.00046831481154857287, + "loss": 3.2861, + "step": 19001 + }, + { + "epoch": 0.93, + "grad_norm": 0.5752828121185303, + "learning_rate": 0.00046830206663239803, + "loss": 2.9293, + "step": 19002 + }, + { + "epoch": 0.93, + "grad_norm": 0.5392802953720093, + "learning_rate": 0.00046828932127294796, + "loss": 3.1803, + "step": 19003 + }, + { + "epoch": 0.93, + "grad_norm": 0.5212385654449463, + "learning_rate": 0.00046827657547025625, + "loss": 3.2022, + "step": 19004 + }, + { + "epoch": 0.93, + "grad_norm": 0.5026683807373047, + "learning_rate": 0.00046826382922435635, + "loss": 3.3249, + "step": 19005 + }, + { + "epoch": 0.93, + "grad_norm": 0.536699652671814, + "learning_rate": 0.000468251082535282, + "loss": 3.1748, + "step": 19006 + }, + { + "epoch": 0.93, + "grad_norm": 0.5301597714424133, + "learning_rate": 0.0004682383354030666, + "loss": 3.1808, + "step": 19007 + }, + { + "epoch": 0.93, + "grad_norm": 0.5335184335708618, + "learning_rate": 0.0004682255878277438, + "loss": 3.2511, + "step": 19008 + }, + { + "epoch": 0.93, + "grad_norm": 0.5118494033813477, + "learning_rate": 0.00046821283980934725, + "loss": 3.0333, + "step": 19009 + }, + { + "epoch": 0.93, + "grad_norm": 0.5088841319084167, + "learning_rate": 0.0004682000913479104, + "loss": 3.1185, + "step": 19010 + }, + { + "epoch": 0.93, + "grad_norm": 0.5405777096748352, + "learning_rate": 0.00046818734244346677, + "loss": 3.4098, + "step": 19011 + }, + { + "epoch": 0.93, + "grad_norm": 0.5063764452934265, + "learning_rate": 0.0004681745930960502, + "loss": 3.2377, + "step": 19012 + }, + { + "epoch": 0.93, + "grad_norm": 0.5210672616958618, + "learning_rate": 0.0004681618433056941, + "loss": 3.2403, + "step": 19013 + }, + { + "epoch": 0.93, + "grad_norm": 0.5515353679656982, + "learning_rate": 0.00046814909307243204, + "loss": 3.2621, + "step": 19014 + }, + { + "epoch": 0.93, + "grad_norm": 0.5045962929725647, + "learning_rate": 0.00046813634239629764, + "loss": 3.2523, + "step": 19015 + }, + { + "epoch": 0.93, + "grad_norm": 0.4997437596321106, + "learning_rate": 0.0004681235912773245, + "loss": 3.0961, + "step": 19016 + }, + { + "epoch": 0.93, + "grad_norm": 0.5317462682723999, + "learning_rate": 0.0004681108397155461, + "loss": 3.061, + "step": 19017 + }, + { + "epoch": 0.93, + "grad_norm": 0.5284067988395691, + "learning_rate": 0.00046809808771099614, + "loss": 3.2071, + "step": 19018 + }, + { + "epoch": 0.93, + "grad_norm": 0.5324392914772034, + "learning_rate": 0.00046808533526370826, + "loss": 3.0611, + "step": 19019 + }, + { + "epoch": 0.93, + "grad_norm": 0.5281850695610046, + "learning_rate": 0.0004680725823737158, + "loss": 3.0396, + "step": 19020 + }, + { + "epoch": 0.93, + "grad_norm": 0.5365699529647827, + "learning_rate": 0.00046805982904105255, + "loss": 3.0534, + "step": 19021 + }, + { + "epoch": 0.93, + "grad_norm": 0.5103819966316223, + "learning_rate": 0.00046804707526575215, + "loss": 2.9962, + "step": 19022 + }, + { + "epoch": 0.93, + "grad_norm": 0.517741322517395, + "learning_rate": 0.000468034321047848, + "loss": 3.0247, + "step": 19023 + }, + { + "epoch": 0.93, + "grad_norm": 0.5281891822814941, + "learning_rate": 0.00046802156638737385, + "loss": 3.0705, + "step": 19024 + }, + { + "epoch": 0.93, + "grad_norm": 0.5412678718566895, + "learning_rate": 0.00046800881128436316, + "loss": 2.9022, + "step": 19025 + }, + { + "epoch": 0.93, + "grad_norm": 0.5153235793113708, + "learning_rate": 0.0004679960557388496, + "loss": 3.1095, + "step": 19026 + }, + { + "epoch": 0.93, + "grad_norm": 0.5097496509552002, + "learning_rate": 0.0004679832997508668, + "loss": 3.0689, + "step": 19027 + }, + { + "epoch": 0.93, + "grad_norm": 0.5528456568717957, + "learning_rate": 0.0004679705433204483, + "loss": 3.1618, + "step": 19028 + }, + { + "epoch": 0.93, + "grad_norm": 0.5313249230384827, + "learning_rate": 0.0004679577864476278, + "loss": 3.1174, + "step": 19029 + }, + { + "epoch": 0.93, + "grad_norm": 0.5198782086372375, + "learning_rate": 0.00046794502913243875, + "loss": 3.1929, + "step": 19030 + }, + { + "epoch": 0.93, + "grad_norm": 0.5220719575881958, + "learning_rate": 0.00046793227137491473, + "loss": 3.0891, + "step": 19031 + }, + { + "epoch": 0.93, + "grad_norm": 0.5011788606643677, + "learning_rate": 0.0004679195131750895, + "loss": 3.106, + "step": 19032 + }, + { + "epoch": 0.93, + "grad_norm": 0.525837242603302, + "learning_rate": 0.00046790675453299666, + "loss": 3.1896, + "step": 19033 + }, + { + "epoch": 0.93, + "grad_norm": 0.5511639714241028, + "learning_rate": 0.0004678939954486696, + "loss": 3.213, + "step": 19034 + }, + { + "epoch": 0.93, + "grad_norm": 0.5156654715538025, + "learning_rate": 0.00046788123592214224, + "loss": 3.1804, + "step": 19035 + }, + { + "epoch": 0.93, + "grad_norm": 0.49573853611946106, + "learning_rate": 0.00046786847595344774, + "loss": 3.1656, + "step": 19036 + }, + { + "epoch": 0.93, + "grad_norm": 0.5024093389511108, + "learning_rate": 0.00046785571554262026, + "loss": 3.1114, + "step": 19037 + }, + { + "epoch": 0.93, + "grad_norm": 0.5238023996353149, + "learning_rate": 0.000467842954689693, + "loss": 3.1527, + "step": 19038 + }, + { + "epoch": 0.93, + "grad_norm": 0.5388356447219849, + "learning_rate": 0.0004678301933946997, + "loss": 3.2028, + "step": 19039 + }, + { + "epoch": 0.93, + "grad_norm": 0.512860119342804, + "learning_rate": 0.00046781743165767405, + "loss": 3.0169, + "step": 19040 + }, + { + "epoch": 0.93, + "grad_norm": 0.5536538362503052, + "learning_rate": 0.0004678046694786495, + "loss": 2.8985, + "step": 19041 + }, + { + "epoch": 0.93, + "grad_norm": 0.4811255931854248, + "learning_rate": 0.0004677919068576597, + "loss": 3.3228, + "step": 19042 + }, + { + "epoch": 0.93, + "grad_norm": 0.5000419616699219, + "learning_rate": 0.00046777914379473847, + "loss": 3.0492, + "step": 19043 + }, + { + "epoch": 0.93, + "grad_norm": 0.5147316455841064, + "learning_rate": 0.0004677663802899192, + "loss": 3.2764, + "step": 19044 + }, + { + "epoch": 0.93, + "grad_norm": 0.5300585031509399, + "learning_rate": 0.0004677536163432355, + "loss": 2.9444, + "step": 19045 + }, + { + "epoch": 0.93, + "grad_norm": 0.515038788318634, + "learning_rate": 0.0004677408519547211, + "loss": 3.0745, + "step": 19046 + }, + { + "epoch": 0.93, + "grad_norm": 0.5364577174186707, + "learning_rate": 0.0004677280871244096, + "loss": 3.026, + "step": 19047 + }, + { + "epoch": 0.93, + "grad_norm": 0.5051068067550659, + "learning_rate": 0.00046771532185233456, + "loss": 3.2398, + "step": 19048 + }, + { + "epoch": 0.93, + "grad_norm": 0.5928849577903748, + "learning_rate": 0.00046770255613852967, + "loss": 3.2081, + "step": 19049 + }, + { + "epoch": 0.93, + "grad_norm": 0.5345249772071838, + "learning_rate": 0.0004676897899830285, + "loss": 3.0596, + "step": 19050 + }, + { + "epoch": 0.93, + "grad_norm": 0.513729453086853, + "learning_rate": 0.00046767702338586475, + "loss": 3.086, + "step": 19051 + }, + { + "epoch": 0.93, + "grad_norm": 0.5252007246017456, + "learning_rate": 0.0004676642563470719, + "loss": 3.015, + "step": 19052 + }, + { + "epoch": 0.93, + "grad_norm": 0.5086860060691833, + "learning_rate": 0.00046765148886668376, + "loss": 3.1461, + "step": 19053 + }, + { + "epoch": 0.93, + "grad_norm": 0.535895824432373, + "learning_rate": 0.0004676387209447338, + "loss": 3.1293, + "step": 19054 + }, + { + "epoch": 0.93, + "grad_norm": 0.5222206711769104, + "learning_rate": 0.0004676259525812558, + "loss": 3.0691, + "step": 19055 + }, + { + "epoch": 0.93, + "grad_norm": 0.5708796381950378, + "learning_rate": 0.0004676131837762832, + "loss": 3.1535, + "step": 19056 + }, + { + "epoch": 0.93, + "grad_norm": 0.49365147948265076, + "learning_rate": 0.0004676004145298497, + "loss": 3.268, + "step": 19057 + }, + { + "epoch": 0.93, + "grad_norm": 0.49543845653533936, + "learning_rate": 0.0004675876448419891, + "loss": 3.2324, + "step": 19058 + }, + { + "epoch": 0.93, + "grad_norm": 0.5274909138679504, + "learning_rate": 0.00046757487471273476, + "loss": 3.0694, + "step": 19059 + }, + { + "epoch": 0.93, + "grad_norm": 0.5357721447944641, + "learning_rate": 0.0004675621041421206, + "loss": 3.2226, + "step": 19060 + }, + { + "epoch": 0.93, + "grad_norm": 0.5113617777824402, + "learning_rate": 0.00046754933313018, + "loss": 3.2211, + "step": 19061 + }, + { + "epoch": 0.93, + "grad_norm": 0.500162661075592, + "learning_rate": 0.0004675365616769467, + "loss": 3.2316, + "step": 19062 + }, + { + "epoch": 0.93, + "grad_norm": 0.5388630628585815, + "learning_rate": 0.00046752378978245435, + "loss": 3.1944, + "step": 19063 + }, + { + "epoch": 0.93, + "grad_norm": 0.5528714656829834, + "learning_rate": 0.00046751101744673654, + "loss": 3.3388, + "step": 19064 + }, + { + "epoch": 0.93, + "grad_norm": 0.5113033056259155, + "learning_rate": 0.0004674982446698271, + "loss": 3.0037, + "step": 19065 + }, + { + "epoch": 0.93, + "grad_norm": 0.5158981084823608, + "learning_rate": 0.00046748547145175943, + "loss": 3.2846, + "step": 19066 + }, + { + "epoch": 0.93, + "grad_norm": 0.5108698010444641, + "learning_rate": 0.0004674726977925672, + "loss": 3.2234, + "step": 19067 + }, + { + "epoch": 0.93, + "grad_norm": 0.5266377329826355, + "learning_rate": 0.00046745992369228416, + "loss": 3.0511, + "step": 19068 + }, + { + "epoch": 0.93, + "grad_norm": 0.5855154991149902, + "learning_rate": 0.00046744714915094394, + "loss": 3.1555, + "step": 19069 + }, + { + "epoch": 0.93, + "grad_norm": 0.5538336038589478, + "learning_rate": 0.0004674343741685801, + "loss": 3.1339, + "step": 19070 + }, + { + "epoch": 0.93, + "grad_norm": 0.5286270976066589, + "learning_rate": 0.0004674215987452264, + "loss": 3.314, + "step": 19071 + }, + { + "epoch": 0.93, + "grad_norm": 0.5068775415420532, + "learning_rate": 0.00046740882288091634, + "loss": 3.113, + "step": 19072 + }, + { + "epoch": 0.93, + "grad_norm": 0.5175677537918091, + "learning_rate": 0.0004673960465756837, + "loss": 3.2558, + "step": 19073 + }, + { + "epoch": 0.93, + "grad_norm": 0.5037120580673218, + "learning_rate": 0.00046738326982956216, + "loss": 3.232, + "step": 19074 + }, + { + "epoch": 0.93, + "grad_norm": 0.5219966769218445, + "learning_rate": 0.00046737049264258525, + "loss": 3.0916, + "step": 19075 + }, + { + "epoch": 0.93, + "grad_norm": 0.5453975796699524, + "learning_rate": 0.00046735771501478675, + "loss": 3.1197, + "step": 19076 + }, + { + "epoch": 0.93, + "grad_norm": 0.5948343873023987, + "learning_rate": 0.00046734493694620006, + "loss": 2.7689, + "step": 19077 + }, + { + "epoch": 0.93, + "grad_norm": 0.5255204439163208, + "learning_rate": 0.0004673321584368591, + "loss": 3.3097, + "step": 19078 + }, + { + "epoch": 0.94, + "grad_norm": 0.508298933506012, + "learning_rate": 0.0004673193794867975, + "loss": 3.3616, + "step": 19079 + }, + { + "epoch": 0.94, + "grad_norm": 0.5323010683059692, + "learning_rate": 0.0004673066000960488, + "loss": 3.034, + "step": 19080 + }, + { + "epoch": 0.94, + "grad_norm": 0.5434712767601013, + "learning_rate": 0.00046729382026464676, + "loss": 3.1091, + "step": 19081 + }, + { + "epoch": 0.94, + "grad_norm": 0.5189756155014038, + "learning_rate": 0.0004672810399926249, + "loss": 3.1893, + "step": 19082 + }, + { + "epoch": 0.94, + "grad_norm": 0.5350856184959412, + "learning_rate": 0.000467268259280017, + "loss": 3.1141, + "step": 19083 + }, + { + "epoch": 0.94, + "grad_norm": 0.6006237864494324, + "learning_rate": 0.0004672554781268568, + "loss": 3.2461, + "step": 19084 + }, + { + "epoch": 0.94, + "grad_norm": 0.5676541328430176, + "learning_rate": 0.00046724269653317774, + "loss": 3.1233, + "step": 19085 + }, + { + "epoch": 0.94, + "grad_norm": 0.5647549629211426, + "learning_rate": 0.00046722991449901373, + "loss": 3.1782, + "step": 19086 + }, + { + "epoch": 0.94, + "grad_norm": 0.5188087821006775, + "learning_rate": 0.00046721713202439816, + "loss": 3.049, + "step": 19087 + }, + { + "epoch": 0.94, + "grad_norm": 0.4982798099517822, + "learning_rate": 0.00046720434910936493, + "loss": 3.1946, + "step": 19088 + }, + { + "epoch": 0.94, + "grad_norm": 0.549485445022583, + "learning_rate": 0.00046719156575394754, + "loss": 3.2946, + "step": 19089 + }, + { + "epoch": 0.94, + "grad_norm": 0.5249114632606506, + "learning_rate": 0.00046717878195817985, + "loss": 3.2452, + "step": 19090 + }, + { + "epoch": 0.94, + "grad_norm": 0.5417079925537109, + "learning_rate": 0.00046716599772209544, + "loss": 3.4411, + "step": 19091 + }, + { + "epoch": 0.94, + "grad_norm": 0.5425086617469788, + "learning_rate": 0.00046715321304572786, + "loss": 3.1668, + "step": 19092 + }, + { + "epoch": 0.94, + "grad_norm": 0.5171887278556824, + "learning_rate": 0.0004671404279291109, + "loss": 3.158, + "step": 19093 + }, + { + "epoch": 0.94, + "grad_norm": 0.5016870498657227, + "learning_rate": 0.00046712764237227827, + "loss": 3.2172, + "step": 19094 + }, + { + "epoch": 0.94, + "grad_norm": 0.4983656406402588, + "learning_rate": 0.0004671148563752636, + "loss": 3.0947, + "step": 19095 + }, + { + "epoch": 0.94, + "grad_norm": 0.5325701832771301, + "learning_rate": 0.0004671020699381005, + "loss": 3.0837, + "step": 19096 + }, + { + "epoch": 0.94, + "grad_norm": 0.5779774785041809, + "learning_rate": 0.0004670892830608228, + "loss": 3.0161, + "step": 19097 + }, + { + "epoch": 0.94, + "grad_norm": 0.53163081407547, + "learning_rate": 0.000467076495743464, + "loss": 3.1027, + "step": 19098 + }, + { + "epoch": 0.94, + "grad_norm": 0.5103086233139038, + "learning_rate": 0.0004670637079860579, + "loss": 3.242, + "step": 19099 + }, + { + "epoch": 0.94, + "grad_norm": 0.5351077914237976, + "learning_rate": 0.00046705091978863815, + "loss": 3.0781, + "step": 19100 + }, + { + "epoch": 0.94, + "grad_norm": 0.5188274383544922, + "learning_rate": 0.0004670381311512384, + "loss": 3.1909, + "step": 19101 + }, + { + "epoch": 0.94, + "grad_norm": 0.514674186706543, + "learning_rate": 0.0004670253420738924, + "loss": 3.1387, + "step": 19102 + }, + { + "epoch": 0.94, + "grad_norm": 0.4893554449081421, + "learning_rate": 0.00046701255255663374, + "loss": 3.322, + "step": 19103 + }, + { + "epoch": 0.94, + "grad_norm": 0.49983033537864685, + "learning_rate": 0.00046699976259949614, + "loss": 3.0131, + "step": 19104 + }, + { + "epoch": 0.94, + "grad_norm": 0.5099627375602722, + "learning_rate": 0.00046698697220251344, + "loss": 2.979, + "step": 19105 + }, + { + "epoch": 0.94, + "grad_norm": 0.5100765228271484, + "learning_rate": 0.0004669741813657191, + "loss": 2.9599, + "step": 19106 + }, + { + "epoch": 0.94, + "grad_norm": 0.5214683413505554, + "learning_rate": 0.00046696139008914697, + "loss": 3.074, + "step": 19107 + }, + { + "epoch": 0.94, + "grad_norm": 0.532818078994751, + "learning_rate": 0.0004669485983728305, + "loss": 3.1915, + "step": 19108 + }, + { + "epoch": 0.94, + "grad_norm": 0.48344460129737854, + "learning_rate": 0.00046693580621680363, + "loss": 3.2483, + "step": 19109 + }, + { + "epoch": 0.94, + "grad_norm": 0.5342273116111755, + "learning_rate": 0.0004669230136211, + "loss": 2.9076, + "step": 19110 + }, + { + "epoch": 0.94, + "grad_norm": 0.5045246481895447, + "learning_rate": 0.0004669102205857533, + "loss": 3.2139, + "step": 19111 + }, + { + "epoch": 0.94, + "grad_norm": 0.5883970856666565, + "learning_rate": 0.0004668974271107972, + "loss": 2.8011, + "step": 19112 + }, + { + "epoch": 0.94, + "grad_norm": 0.49823927879333496, + "learning_rate": 0.0004668846331962654, + "loss": 3.0764, + "step": 19113 + }, + { + "epoch": 0.94, + "grad_norm": 0.5665649175643921, + "learning_rate": 0.00046687183884219156, + "loss": 3.1699, + "step": 19114 + }, + { + "epoch": 0.94, + "grad_norm": 0.5797495245933533, + "learning_rate": 0.0004668590440486094, + "loss": 3.0427, + "step": 19115 + }, + { + "epoch": 0.94, + "grad_norm": 0.5581980347633362, + "learning_rate": 0.0004668462488155527, + "loss": 3.2031, + "step": 19116 + }, + { + "epoch": 0.94, + "grad_norm": 0.5199931263923645, + "learning_rate": 0.00046683345314305503, + "loss": 3.1927, + "step": 19117 + }, + { + "epoch": 0.94, + "grad_norm": 0.5535823702812195, + "learning_rate": 0.00046682065703115014, + "loss": 3.1794, + "step": 19118 + }, + { + "epoch": 0.94, + "grad_norm": 0.539772629737854, + "learning_rate": 0.0004668078604798718, + "loss": 3.3153, + "step": 19119 + }, + { + "epoch": 0.94, + "grad_norm": 0.5513725876808167, + "learning_rate": 0.00046679506348925367, + "loss": 3.0796, + "step": 19120 + }, + { + "epoch": 0.94, + "grad_norm": 0.5442554950714111, + "learning_rate": 0.00046678226605932936, + "loss": 3.2118, + "step": 19121 + }, + { + "epoch": 0.94, + "grad_norm": 0.6911001205444336, + "learning_rate": 0.0004667694681901327, + "loss": 3.2077, + "step": 19122 + }, + { + "epoch": 0.94, + "grad_norm": 0.5045164227485657, + "learning_rate": 0.0004667566698816974, + "loss": 3.2937, + "step": 19123 + }, + { + "epoch": 0.94, + "grad_norm": 0.5363736152648926, + "learning_rate": 0.000466743871134057, + "loss": 3.0034, + "step": 19124 + }, + { + "epoch": 0.94, + "grad_norm": 0.530031681060791, + "learning_rate": 0.0004667310719472455, + "loss": 3.0441, + "step": 19125 + }, + { + "epoch": 0.94, + "grad_norm": 0.5397589206695557, + "learning_rate": 0.00046671827232129634, + "loss": 3.2575, + "step": 19126 + }, + { + "epoch": 0.94, + "grad_norm": 0.5312854051589966, + "learning_rate": 0.0004667054722562433, + "loss": 3.1987, + "step": 19127 + }, + { + "epoch": 0.94, + "grad_norm": 0.5177079439163208, + "learning_rate": 0.0004666926717521203, + "loss": 3.2154, + "step": 19128 + }, + { + "epoch": 0.94, + "grad_norm": 0.525133490562439, + "learning_rate": 0.00046667987080896065, + "loss": 3.2139, + "step": 19129 + }, + { + "epoch": 0.94, + "grad_norm": 0.5109131932258606, + "learning_rate": 0.0004666670694267985, + "loss": 2.9454, + "step": 19130 + }, + { + "epoch": 0.94, + "grad_norm": 0.557773232460022, + "learning_rate": 0.00046665426760566733, + "loss": 3.3336, + "step": 19131 + }, + { + "epoch": 0.94, + "grad_norm": 0.534517228603363, + "learning_rate": 0.00046664146534560076, + "loss": 3.0634, + "step": 19132 + }, + { + "epoch": 0.94, + "grad_norm": 0.5470923781394958, + "learning_rate": 0.0004666286626466328, + "loss": 3.1499, + "step": 19133 + }, + { + "epoch": 0.94, + "grad_norm": 0.5293968319892883, + "learning_rate": 0.0004666158595087969, + "loss": 3.2722, + "step": 19134 + }, + { + "epoch": 0.94, + "grad_norm": 0.5513948202133179, + "learning_rate": 0.00046660305593212694, + "loss": 2.9834, + "step": 19135 + }, + { + "epoch": 0.94, + "grad_norm": 0.5250282883644104, + "learning_rate": 0.00046659025191665655, + "loss": 3.2274, + "step": 19136 + }, + { + "epoch": 0.94, + "grad_norm": 0.5674799680709839, + "learning_rate": 0.0004665774474624196, + "loss": 3.1483, + "step": 19137 + }, + { + "epoch": 0.94, + "grad_norm": 0.49620136618614197, + "learning_rate": 0.0004665646425694496, + "loss": 3.1663, + "step": 19138 + }, + { + "epoch": 0.94, + "grad_norm": 0.5123617053031921, + "learning_rate": 0.0004665518372377804, + "loss": 3.02, + "step": 19139 + }, + { + "epoch": 0.94, + "grad_norm": 0.5374035835266113, + "learning_rate": 0.00046653903146744576, + "loss": 3.227, + "step": 19140 + }, + { + "epoch": 0.94, + "grad_norm": 0.5367292165756226, + "learning_rate": 0.0004665262252584794, + "loss": 3.1528, + "step": 19141 + }, + { + "epoch": 0.94, + "grad_norm": 0.5225591659545898, + "learning_rate": 0.00046651341861091497, + "loss": 3.145, + "step": 19142 + }, + { + "epoch": 0.94, + "grad_norm": 0.5538541674613953, + "learning_rate": 0.00046650061152478617, + "loss": 3.1249, + "step": 19143 + }, + { + "epoch": 0.94, + "grad_norm": 0.5019550323486328, + "learning_rate": 0.00046648780400012686, + "loss": 3.125, + "step": 19144 + }, + { + "epoch": 0.94, + "grad_norm": 0.487470418214798, + "learning_rate": 0.00046647499603697076, + "loss": 3.0214, + "step": 19145 + }, + { + "epoch": 0.94, + "grad_norm": 0.5381044149398804, + "learning_rate": 0.0004664621876353515, + "loss": 3.0006, + "step": 19146 + }, + { + "epoch": 0.94, + "grad_norm": 0.563580334186554, + "learning_rate": 0.0004664493787953029, + "loss": 3.2355, + "step": 19147 + }, + { + "epoch": 0.94, + "grad_norm": 0.5690339207649231, + "learning_rate": 0.00046643656951685867, + "loss": 3.1099, + "step": 19148 + }, + { + "epoch": 0.94, + "grad_norm": 0.5722571015357971, + "learning_rate": 0.0004664237598000525, + "loss": 3.2627, + "step": 19149 + }, + { + "epoch": 0.94, + "grad_norm": 0.5227568745613098, + "learning_rate": 0.00046641094964491826, + "loss": 3.2356, + "step": 19150 + }, + { + "epoch": 0.94, + "grad_norm": 0.5287798047065735, + "learning_rate": 0.00046639813905148954, + "loss": 3.0889, + "step": 19151 + }, + { + "epoch": 0.94, + "grad_norm": 0.5487927198410034, + "learning_rate": 0.00046638532801980017, + "loss": 3.235, + "step": 19152 + }, + { + "epoch": 0.94, + "grad_norm": 0.5198444724082947, + "learning_rate": 0.0004663725165498839, + "loss": 3.1417, + "step": 19153 + }, + { + "epoch": 0.94, + "grad_norm": 0.5517452955245972, + "learning_rate": 0.00046635970464177436, + "loss": 3.3419, + "step": 19154 + }, + { + "epoch": 0.94, + "grad_norm": 0.5759121179580688, + "learning_rate": 0.0004663468922955054, + "loss": 3.3514, + "step": 19155 + }, + { + "epoch": 0.94, + "grad_norm": 0.5133375525474548, + "learning_rate": 0.00046633407951111075, + "loss": 3.1282, + "step": 19156 + }, + { + "epoch": 0.94, + "grad_norm": 0.5205239653587341, + "learning_rate": 0.0004663212662886242, + "loss": 3.0449, + "step": 19157 + }, + { + "epoch": 0.94, + "grad_norm": 0.5108022093772888, + "learning_rate": 0.00046630845262807935, + "loss": 3.2283, + "step": 19158 + }, + { + "epoch": 0.94, + "grad_norm": 0.5807484984397888, + "learning_rate": 0.00046629563852951006, + "loss": 3.1741, + "step": 19159 + }, + { + "epoch": 0.94, + "grad_norm": 0.5120785236358643, + "learning_rate": 0.0004662828239929502, + "loss": 3.148, + "step": 19160 + }, + { + "epoch": 0.94, + "grad_norm": 0.5248308181762695, + "learning_rate": 0.00046627000901843316, + "loss": 3.2335, + "step": 19161 + }, + { + "epoch": 0.94, + "grad_norm": 0.5054056644439697, + "learning_rate": 0.00046625719360599314, + "loss": 3.0538, + "step": 19162 + }, + { + "epoch": 0.94, + "grad_norm": 0.5169258117675781, + "learning_rate": 0.0004662443777556635, + "loss": 3.2335, + "step": 19163 + }, + { + "epoch": 0.94, + "grad_norm": 0.5012264847755432, + "learning_rate": 0.0004662315614674782, + "loss": 3.0468, + "step": 19164 + }, + { + "epoch": 0.94, + "grad_norm": 0.5255052447319031, + "learning_rate": 0.00046621874474147104, + "loss": 3.2484, + "step": 19165 + }, + { + "epoch": 0.94, + "grad_norm": 0.5075822472572327, + "learning_rate": 0.0004662059275776756, + "loss": 3.2547, + "step": 19166 + }, + { + "epoch": 0.94, + "grad_norm": 0.5703867077827454, + "learning_rate": 0.0004661931099761258, + "loss": 3.1126, + "step": 19167 + }, + { + "epoch": 0.94, + "grad_norm": 0.5306580066680908, + "learning_rate": 0.0004661802919368553, + "loss": 3.0482, + "step": 19168 + }, + { + "epoch": 0.94, + "grad_norm": 0.5228316187858582, + "learning_rate": 0.0004661674734598979, + "loss": 3.1605, + "step": 19169 + }, + { + "epoch": 0.94, + "grad_norm": 0.5195432901382446, + "learning_rate": 0.0004661546545452873, + "loss": 3.1001, + "step": 19170 + }, + { + "epoch": 0.94, + "grad_norm": 0.5943834781646729, + "learning_rate": 0.00046614183519305745, + "loss": 3.0419, + "step": 19171 + }, + { + "epoch": 0.94, + "grad_norm": 0.5026077628135681, + "learning_rate": 0.00046612901540324186, + "loss": 3.1679, + "step": 19172 + }, + { + "epoch": 0.94, + "grad_norm": 0.5464365482330322, + "learning_rate": 0.00046611619517587447, + "loss": 3.3324, + "step": 19173 + }, + { + "epoch": 0.94, + "grad_norm": 0.5255415439605713, + "learning_rate": 0.00046610337451098895, + "loss": 3.0416, + "step": 19174 + }, + { + "epoch": 0.94, + "grad_norm": 0.5834014415740967, + "learning_rate": 0.0004660905534086192, + "loss": 3.2447, + "step": 19175 + }, + { + "epoch": 0.94, + "grad_norm": 0.5191861391067505, + "learning_rate": 0.0004660777318687988, + "loss": 3.0245, + "step": 19176 + }, + { + "epoch": 0.94, + "grad_norm": 0.523767352104187, + "learning_rate": 0.00046606490989156165, + "loss": 3.0654, + "step": 19177 + }, + { + "epoch": 0.94, + "grad_norm": 0.5461781620979309, + "learning_rate": 0.00046605208747694155, + "loss": 3.1267, + "step": 19178 + }, + { + "epoch": 0.94, + "grad_norm": 0.507882297039032, + "learning_rate": 0.0004660392646249721, + "loss": 3.1003, + "step": 19179 + }, + { + "epoch": 0.94, + "grad_norm": 0.543165385723114, + "learning_rate": 0.00046602644133568715, + "loss": 3.0974, + "step": 19180 + }, + { + "epoch": 0.94, + "grad_norm": 0.5282258987426758, + "learning_rate": 0.0004660136176091207, + "loss": 3.1831, + "step": 19181 + }, + { + "epoch": 0.94, + "grad_norm": 0.5365543961524963, + "learning_rate": 0.0004660007934453062, + "loss": 3.1649, + "step": 19182 + }, + { + "epoch": 0.94, + "grad_norm": 0.5323196649551392, + "learning_rate": 0.0004659879688442776, + "loss": 3.0073, + "step": 19183 + }, + { + "epoch": 0.94, + "grad_norm": 0.5398780107498169, + "learning_rate": 0.00046597514380606854, + "loss": 3.0684, + "step": 19184 + }, + { + "epoch": 0.94, + "grad_norm": 0.5349022746086121, + "learning_rate": 0.0004659623183307129, + "loss": 3.065, + "step": 19185 + }, + { + "epoch": 0.94, + "grad_norm": 0.5340750217437744, + "learning_rate": 0.0004659494924182446, + "loss": 3.074, + "step": 19186 + }, + { + "epoch": 0.94, + "grad_norm": 0.5252206921577454, + "learning_rate": 0.0004659366660686972, + "loss": 3.1038, + "step": 19187 + }, + { + "epoch": 0.94, + "grad_norm": 0.5100142359733582, + "learning_rate": 0.00046592383928210457, + "loss": 3.0467, + "step": 19188 + }, + { + "epoch": 0.94, + "grad_norm": 0.5399733185768127, + "learning_rate": 0.00046591101205850047, + "loss": 3.0333, + "step": 19189 + }, + { + "epoch": 0.94, + "grad_norm": 0.5943720936775208, + "learning_rate": 0.0004658981843979186, + "loss": 3.072, + "step": 19190 + }, + { + "epoch": 0.94, + "grad_norm": 0.5421499609947205, + "learning_rate": 0.0004658853563003929, + "loss": 3.3646, + "step": 19191 + }, + { + "epoch": 0.94, + "grad_norm": 0.532548189163208, + "learning_rate": 0.00046587252776595717, + "loss": 3.0745, + "step": 19192 + }, + { + "epoch": 0.94, + "grad_norm": 0.5318267345428467, + "learning_rate": 0.0004658596987946451, + "loss": 3.0672, + "step": 19193 + }, + { + "epoch": 0.94, + "grad_norm": 0.5340205430984497, + "learning_rate": 0.00046584686938649044, + "loss": 3.3863, + "step": 19194 + }, + { + "epoch": 0.94, + "grad_norm": 0.5387397408485413, + "learning_rate": 0.00046583403954152705, + "loss": 3.1411, + "step": 19195 + }, + { + "epoch": 0.94, + "grad_norm": 0.5424886345863342, + "learning_rate": 0.0004658212092597888, + "loss": 3.3, + "step": 19196 + }, + { + "epoch": 0.94, + "grad_norm": 0.5942268967628479, + "learning_rate": 0.0004658083785413093, + "loss": 3.0225, + "step": 19197 + }, + { + "epoch": 0.94, + "grad_norm": 0.5251699090003967, + "learning_rate": 0.00046579554738612245, + "loss": 3.1073, + "step": 19198 + }, + { + "epoch": 0.94, + "grad_norm": 0.5253727436065674, + "learning_rate": 0.0004657827157942621, + "loss": 3.3228, + "step": 19199 + }, + { + "epoch": 0.94, + "grad_norm": 0.521644651889801, + "learning_rate": 0.0004657698837657619, + "loss": 3.2844, + "step": 19200 + }, + { + "epoch": 0.94, + "grad_norm": 0.5412613749504089, + "learning_rate": 0.00046575705130065585, + "loss": 3.2638, + "step": 19201 + }, + { + "epoch": 0.94, + "grad_norm": 0.5078155994415283, + "learning_rate": 0.00046574421839897754, + "loss": 3.1013, + "step": 19202 + }, + { + "epoch": 0.94, + "grad_norm": 0.5175430178642273, + "learning_rate": 0.0004657313850607609, + "loss": 2.9286, + "step": 19203 + }, + { + "epoch": 0.94, + "grad_norm": 0.5150618553161621, + "learning_rate": 0.0004657185512860397, + "loss": 3.1024, + "step": 19204 + }, + { + "epoch": 0.94, + "grad_norm": 0.5550238490104675, + "learning_rate": 0.0004657057170748477, + "loss": 2.8348, + "step": 19205 + }, + { + "epoch": 0.94, + "grad_norm": 0.4976348280906677, + "learning_rate": 0.00046569288242721867, + "loss": 3.1287, + "step": 19206 + }, + { + "epoch": 0.94, + "grad_norm": 0.5535556674003601, + "learning_rate": 0.00046568004734318655, + "loss": 3.095, + "step": 19207 + }, + { + "epoch": 0.94, + "grad_norm": 0.525455892086029, + "learning_rate": 0.0004656672118227851, + "loss": 3.2602, + "step": 19208 + }, + { + "epoch": 0.94, + "grad_norm": 0.5400662422180176, + "learning_rate": 0.00046565437586604805, + "loss": 3.2045, + "step": 19209 + }, + { + "epoch": 0.94, + "grad_norm": 0.5572945475578308, + "learning_rate": 0.0004656415394730092, + "loss": 3.3045, + "step": 19210 + }, + { + "epoch": 0.94, + "grad_norm": 0.5140613913536072, + "learning_rate": 0.00046562870264370244, + "loss": 3.2455, + "step": 19211 + }, + { + "epoch": 0.94, + "grad_norm": 0.5592833757400513, + "learning_rate": 0.0004656158653781616, + "loss": 3.1826, + "step": 19212 + }, + { + "epoch": 0.94, + "grad_norm": 0.5442313551902771, + "learning_rate": 0.0004656030276764205, + "loss": 3.1839, + "step": 19213 + }, + { + "epoch": 0.94, + "grad_norm": 0.5157129168510437, + "learning_rate": 0.0004655901895385128, + "loss": 3.1051, + "step": 19214 + }, + { + "epoch": 0.94, + "grad_norm": 0.5172207355499268, + "learning_rate": 0.00046557735096447244, + "loss": 3.331, + "step": 19215 + }, + { + "epoch": 0.94, + "grad_norm": 0.5096434354782104, + "learning_rate": 0.0004655645119543331, + "loss": 2.9974, + "step": 19216 + }, + { + "epoch": 0.94, + "grad_norm": 0.5327327847480774, + "learning_rate": 0.00046555167250812886, + "loss": 3.1229, + "step": 19217 + }, + { + "epoch": 0.94, + "grad_norm": 0.5447360873222351, + "learning_rate": 0.00046553883262589324, + "loss": 3.0939, + "step": 19218 + }, + { + "epoch": 0.94, + "grad_norm": 0.5559058785438538, + "learning_rate": 0.0004655259923076603, + "loss": 3.1901, + "step": 19219 + }, + { + "epoch": 0.94, + "grad_norm": 0.49087318778038025, + "learning_rate": 0.0004655131515534637, + "loss": 3.2312, + "step": 19220 + }, + { + "epoch": 0.94, + "grad_norm": 0.5363554954528809, + "learning_rate": 0.00046550031036333734, + "loss": 3.0807, + "step": 19221 + }, + { + "epoch": 0.94, + "grad_norm": 0.5305533409118652, + "learning_rate": 0.000465487468737315, + "loss": 3.0706, + "step": 19222 + }, + { + "epoch": 0.94, + "grad_norm": 0.5256800651550293, + "learning_rate": 0.0004654746266754306, + "loss": 3.1917, + "step": 19223 + }, + { + "epoch": 0.94, + "grad_norm": 0.5493904948234558, + "learning_rate": 0.00046546178417771774, + "loss": 3.0993, + "step": 19224 + }, + { + "epoch": 0.94, + "grad_norm": 0.5530810356140137, + "learning_rate": 0.0004654489412442105, + "loss": 3.1231, + "step": 19225 + }, + { + "epoch": 0.94, + "grad_norm": 0.5260566473007202, + "learning_rate": 0.0004654360978749424, + "loss": 3.1445, + "step": 19226 + }, + { + "epoch": 0.94, + "grad_norm": 0.5247520804405212, + "learning_rate": 0.0004654232540699476, + "loss": 3.3109, + "step": 19227 + }, + { + "epoch": 0.94, + "grad_norm": 0.5059221386909485, + "learning_rate": 0.0004654104098292598, + "loss": 3.1187, + "step": 19228 + }, + { + "epoch": 0.94, + "grad_norm": 0.5202158093452454, + "learning_rate": 0.00046539756515291285, + "loss": 3.2282, + "step": 19229 + }, + { + "epoch": 0.94, + "grad_norm": 0.542550802230835, + "learning_rate": 0.0004653847200409405, + "loss": 3.134, + "step": 19230 + }, + { + "epoch": 0.94, + "grad_norm": 0.5520852208137512, + "learning_rate": 0.00046537187449337654, + "loss": 3.2093, + "step": 19231 + }, + { + "epoch": 0.94, + "grad_norm": 0.5276000499725342, + "learning_rate": 0.00046535902851025496, + "loss": 3.0294, + "step": 19232 + }, + { + "epoch": 0.94, + "grad_norm": 0.5380484461784363, + "learning_rate": 0.0004653461820916096, + "loss": 3.097, + "step": 19233 + }, + { + "epoch": 0.94, + "grad_norm": 0.5322332382202148, + "learning_rate": 0.0004653333352374741, + "loss": 3.2955, + "step": 19234 + }, + { + "epoch": 0.94, + "grad_norm": 0.5831676125526428, + "learning_rate": 0.00046532048794788243, + "loss": 3.1426, + "step": 19235 + }, + { + "epoch": 0.94, + "grad_norm": 0.5310155153274536, + "learning_rate": 0.00046530764022286835, + "loss": 3.2932, + "step": 19236 + }, + { + "epoch": 0.94, + "grad_norm": 0.48738566040992737, + "learning_rate": 0.00046529479206246585, + "loss": 3.0159, + "step": 19237 + }, + { + "epoch": 0.94, + "grad_norm": 0.5302239060401917, + "learning_rate": 0.0004652819434667088, + "loss": 3.221, + "step": 19238 + }, + { + "epoch": 0.94, + "grad_norm": 0.5241603851318359, + "learning_rate": 0.00046526909443563074, + "loss": 3.0768, + "step": 19239 + }, + { + "epoch": 0.94, + "grad_norm": 0.5549540519714355, + "learning_rate": 0.0004652562449692658, + "loss": 3.2177, + "step": 19240 + }, + { + "epoch": 0.94, + "grad_norm": 0.521250307559967, + "learning_rate": 0.00046524339506764755, + "loss": 3.2072, + "step": 19241 + }, + { + "epoch": 0.94, + "grad_norm": 0.5026115775108337, + "learning_rate": 0.0004652305447308101, + "loss": 3.1384, + "step": 19242 + }, + { + "epoch": 0.94, + "grad_norm": 0.5054687261581421, + "learning_rate": 0.0004652176939587872, + "loss": 2.9898, + "step": 19243 + }, + { + "epoch": 0.94, + "grad_norm": 0.5394466519355774, + "learning_rate": 0.00046520484275161273, + "loss": 3.0466, + "step": 19244 + }, + { + "epoch": 0.94, + "grad_norm": 0.534470796585083, + "learning_rate": 0.0004651919911093204, + "loss": 3.0333, + "step": 19245 + }, + { + "epoch": 0.94, + "grad_norm": 0.5293780565261841, + "learning_rate": 0.0004651791390319443, + "loss": 3.3452, + "step": 19246 + }, + { + "epoch": 0.94, + "grad_norm": 0.5452441573143005, + "learning_rate": 0.00046516628651951806, + "loss": 3.1538, + "step": 19247 + }, + { + "epoch": 0.94, + "grad_norm": 0.49200424551963806, + "learning_rate": 0.00046515343357207554, + "loss": 3.3007, + "step": 19248 + }, + { + "epoch": 0.94, + "grad_norm": 0.5668293833732605, + "learning_rate": 0.0004651405801896507, + "loss": 3.2227, + "step": 19249 + }, + { + "epoch": 0.94, + "grad_norm": 0.5335699915885925, + "learning_rate": 0.00046512772637227745, + "loss": 3.2257, + "step": 19250 + }, + { + "epoch": 0.94, + "grad_norm": 0.5185949206352234, + "learning_rate": 0.00046511487211998954, + "loss": 3.0215, + "step": 19251 + }, + { + "epoch": 0.94, + "grad_norm": 0.5684577822685242, + "learning_rate": 0.0004651020174328207, + "loss": 3.1563, + "step": 19252 + }, + { + "epoch": 0.94, + "grad_norm": 0.5220710635185242, + "learning_rate": 0.000465089162310805, + "loss": 3.2089, + "step": 19253 + }, + { + "epoch": 0.94, + "grad_norm": 0.5233851075172424, + "learning_rate": 0.0004650763067539762, + "loss": 3.2768, + "step": 19254 + }, + { + "epoch": 0.94, + "grad_norm": 0.5065687298774719, + "learning_rate": 0.00046506345076236823, + "loss": 3.1146, + "step": 19255 + }, + { + "epoch": 0.94, + "grad_norm": 0.5225540399551392, + "learning_rate": 0.0004650505943360148, + "loss": 3.1571, + "step": 19256 + }, + { + "epoch": 0.94, + "grad_norm": 0.5644252300262451, + "learning_rate": 0.00046503773747494994, + "loss": 3.2514, + "step": 19257 + }, + { + "epoch": 0.94, + "grad_norm": 0.5171868205070496, + "learning_rate": 0.00046502488017920743, + "loss": 3.1537, + "step": 19258 + }, + { + "epoch": 0.94, + "grad_norm": 0.5381227731704712, + "learning_rate": 0.0004650120224488212, + "loss": 3.3608, + "step": 19259 + }, + { + "epoch": 0.94, + "grad_norm": 0.5154421329498291, + "learning_rate": 0.000464999164283825, + "loss": 3.0479, + "step": 19260 + }, + { + "epoch": 0.94, + "grad_norm": 0.5348089933395386, + "learning_rate": 0.00046498630568425273, + "loss": 3.2822, + "step": 19261 + }, + { + "epoch": 0.94, + "grad_norm": 0.518226683139801, + "learning_rate": 0.0004649734466501383, + "loss": 3.1494, + "step": 19262 + }, + { + "epoch": 0.94, + "grad_norm": 0.5043905973434448, + "learning_rate": 0.0004649605871815156, + "loss": 3.3799, + "step": 19263 + }, + { + "epoch": 0.94, + "grad_norm": 0.5411002039909363, + "learning_rate": 0.0004649477272784184, + "loss": 3.1399, + "step": 19264 + }, + { + "epoch": 0.94, + "grad_norm": 0.55306476354599, + "learning_rate": 0.0004649348669408807, + "loss": 3.0225, + "step": 19265 + }, + { + "epoch": 0.94, + "grad_norm": 0.5231731534004211, + "learning_rate": 0.00046492200616893623, + "loss": 3.0566, + "step": 19266 + }, + { + "epoch": 0.94, + "grad_norm": 0.5047503113746643, + "learning_rate": 0.00046490914496261895, + "loss": 3.0925, + "step": 19267 + }, + { + "epoch": 0.94, + "grad_norm": 0.4931429922580719, + "learning_rate": 0.0004648962833219627, + "loss": 3.3032, + "step": 19268 + }, + { + "epoch": 0.94, + "grad_norm": 0.5375720858573914, + "learning_rate": 0.0004648834212470015, + "loss": 2.9951, + "step": 19269 + }, + { + "epoch": 0.94, + "grad_norm": 0.5181878209114075, + "learning_rate": 0.000464870558737769, + "loss": 3.1347, + "step": 19270 + }, + { + "epoch": 0.94, + "grad_norm": 0.5091689825057983, + "learning_rate": 0.00046485769579429924, + "loss": 3.1832, + "step": 19271 + }, + { + "epoch": 0.94, + "grad_norm": 0.5010989308357239, + "learning_rate": 0.000464844832416626, + "loss": 3.3125, + "step": 19272 + }, + { + "epoch": 0.94, + "grad_norm": 0.5260189771652222, + "learning_rate": 0.0004648319686047831, + "loss": 3.1826, + "step": 19273 + }, + { + "epoch": 0.94, + "grad_norm": 0.5514676570892334, + "learning_rate": 0.0004648191043588046, + "loss": 3.2043, + "step": 19274 + }, + { + "epoch": 0.94, + "grad_norm": 0.5793471932411194, + "learning_rate": 0.0004648062396787243, + "loss": 3.106, + "step": 19275 + }, + { + "epoch": 0.94, + "grad_norm": 0.5154271125793457, + "learning_rate": 0.00046479337456457615, + "loss": 3.215, + "step": 19276 + }, + { + "epoch": 0.94, + "grad_norm": 0.5268782377243042, + "learning_rate": 0.0004647805090163939, + "loss": 3.0782, + "step": 19277 + }, + { + "epoch": 0.94, + "grad_norm": 0.5476327538490295, + "learning_rate": 0.0004647676430342115, + "loss": 3.0688, + "step": 19278 + }, + { + "epoch": 0.94, + "grad_norm": 0.5455402135848999, + "learning_rate": 0.00046475477661806283, + "loss": 3.1699, + "step": 19279 + }, + { + "epoch": 0.94, + "grad_norm": 0.5583153963088989, + "learning_rate": 0.00046474190976798183, + "loss": 3.0606, + "step": 19280 + }, + { + "epoch": 0.94, + "grad_norm": 0.5432180762290955, + "learning_rate": 0.0004647290424840023, + "loss": 3.3071, + "step": 19281 + }, + { + "epoch": 0.94, + "grad_norm": 0.5387990474700928, + "learning_rate": 0.00046471617476615825, + "loss": 3.2044, + "step": 19282 + }, + { + "epoch": 0.95, + "grad_norm": 0.5935296416282654, + "learning_rate": 0.0004647033066144834, + "loss": 3.0468, + "step": 19283 + }, + { + "epoch": 0.95, + "grad_norm": 0.5138740539550781, + "learning_rate": 0.00046469043802901174, + "loss": 2.9805, + "step": 19284 + }, + { + "epoch": 0.95, + "grad_norm": 0.5473163723945618, + "learning_rate": 0.0004646775690097772, + "loss": 3.0915, + "step": 19285 + }, + { + "epoch": 0.95, + "grad_norm": 0.5593499541282654, + "learning_rate": 0.0004646646995568136, + "loss": 3.2594, + "step": 19286 + }, + { + "epoch": 0.95, + "grad_norm": 0.5125278234481812, + "learning_rate": 0.0004646518296701549, + "loss": 2.9646, + "step": 19287 + }, + { + "epoch": 0.95, + "grad_norm": 0.5372989177703857, + "learning_rate": 0.000464638959349835, + "loss": 3.0846, + "step": 19288 + }, + { + "epoch": 0.95, + "grad_norm": 0.5440700650215149, + "learning_rate": 0.0004646260885958877, + "loss": 3.2548, + "step": 19289 + }, + { + "epoch": 0.95, + "grad_norm": 0.5197234749794006, + "learning_rate": 0.000464613217408347, + "loss": 2.9965, + "step": 19290 + }, + { + "epoch": 0.95, + "grad_norm": 0.5141440629959106, + "learning_rate": 0.0004646003457872468, + "loss": 3.0835, + "step": 19291 + }, + { + "epoch": 0.95, + "grad_norm": 0.5057777762413025, + "learning_rate": 0.000464587473732621, + "loss": 3.2426, + "step": 19292 + }, + { + "epoch": 0.95, + "grad_norm": 0.5304477214813232, + "learning_rate": 0.0004645746012445033, + "loss": 3.1578, + "step": 19293 + }, + { + "epoch": 0.95, + "grad_norm": 0.49843043088912964, + "learning_rate": 0.00046456172832292795, + "loss": 3.2066, + "step": 19294 + }, + { + "epoch": 0.95, + "grad_norm": 0.5783942341804504, + "learning_rate": 0.0004645488549679286, + "loss": 3.3959, + "step": 19295 + }, + { + "epoch": 0.95, + "grad_norm": 0.529236376285553, + "learning_rate": 0.0004645359811795393, + "loss": 3.1483, + "step": 19296 + }, + { + "epoch": 0.95, + "grad_norm": 0.5238991379737854, + "learning_rate": 0.0004645231069577938, + "loss": 3.0694, + "step": 19297 + }, + { + "epoch": 0.95, + "grad_norm": 0.5467185974121094, + "learning_rate": 0.00046451023230272606, + "loss": 3.1344, + "step": 19298 + }, + { + "epoch": 0.95, + "grad_norm": 0.5267390608787537, + "learning_rate": 0.00046449735721437014, + "loss": 2.9927, + "step": 19299 + }, + { + "epoch": 0.95, + "grad_norm": 0.5365429520606995, + "learning_rate": 0.0004644844816927598, + "loss": 3.0123, + "step": 19300 + }, + { + "epoch": 0.95, + "grad_norm": 0.5720606446266174, + "learning_rate": 0.000464471605737929, + "loss": 3.1872, + "step": 19301 + }, + { + "epoch": 0.95, + "grad_norm": 0.5571929216384888, + "learning_rate": 0.00046445872934991163, + "loss": 3.2165, + "step": 19302 + }, + { + "epoch": 0.95, + "grad_norm": 0.6748465299606323, + "learning_rate": 0.00046444585252874163, + "loss": 2.8737, + "step": 19303 + }, + { + "epoch": 0.95, + "grad_norm": 0.5558235049247742, + "learning_rate": 0.00046443297527445286, + "loss": 3.1197, + "step": 19304 + }, + { + "epoch": 0.95, + "grad_norm": 0.5244531631469727, + "learning_rate": 0.0004644200975870793, + "loss": 3.2594, + "step": 19305 + }, + { + "epoch": 0.95, + "grad_norm": 0.5120171904563904, + "learning_rate": 0.0004644072194666549, + "loss": 3.2905, + "step": 19306 + }, + { + "epoch": 0.95, + "grad_norm": 0.502840518951416, + "learning_rate": 0.0004643943409132135, + "loss": 2.9992, + "step": 19307 + }, + { + "epoch": 0.95, + "grad_norm": 0.5086363554000854, + "learning_rate": 0.000464381461926789, + "loss": 2.906, + "step": 19308 + }, + { + "epoch": 0.95, + "grad_norm": 0.5207031965255737, + "learning_rate": 0.0004643685825074154, + "loss": 3.3385, + "step": 19309 + }, + { + "epoch": 0.95, + "grad_norm": 0.512380838394165, + "learning_rate": 0.0004643557026551266, + "loss": 3.2487, + "step": 19310 + }, + { + "epoch": 0.95, + "grad_norm": 0.544256865978241, + "learning_rate": 0.00046434282236995655, + "loss": 3.1312, + "step": 19311 + }, + { + "epoch": 0.95, + "grad_norm": 0.5640758872032166, + "learning_rate": 0.000464329941651939, + "loss": 3.1362, + "step": 19312 + }, + { + "epoch": 0.95, + "grad_norm": 0.5477793216705322, + "learning_rate": 0.0004643170605011081, + "loss": 2.9544, + "step": 19313 + }, + { + "epoch": 0.95, + "grad_norm": 0.5070680975914001, + "learning_rate": 0.00046430417891749764, + "loss": 3.1279, + "step": 19314 + }, + { + "epoch": 0.95, + "grad_norm": 0.5301992893218994, + "learning_rate": 0.00046429129690114167, + "loss": 3.2035, + "step": 19315 + }, + { + "epoch": 0.95, + "grad_norm": 0.5810301899909973, + "learning_rate": 0.00046427841445207394, + "loss": 3.1866, + "step": 19316 + }, + { + "epoch": 0.95, + "grad_norm": 0.544684112071991, + "learning_rate": 0.00046426553157032855, + "loss": 3.2024, + "step": 19317 + }, + { + "epoch": 0.95, + "grad_norm": 0.5401524305343628, + "learning_rate": 0.0004642526482559394, + "loss": 2.8806, + "step": 19318 + }, + { + "epoch": 0.95, + "grad_norm": 0.6001310348510742, + "learning_rate": 0.0004642397645089403, + "loss": 3.1245, + "step": 19319 + }, + { + "epoch": 0.95, + "grad_norm": 0.5786098837852478, + "learning_rate": 0.0004642268803293653, + "loss": 3.1757, + "step": 19320 + }, + { + "epoch": 0.95, + "grad_norm": 0.579869270324707, + "learning_rate": 0.00046421399571724834, + "loss": 3.1806, + "step": 19321 + }, + { + "epoch": 0.95, + "grad_norm": 0.5166104435920715, + "learning_rate": 0.0004642011106726233, + "loss": 3.0663, + "step": 19322 + }, + { + "epoch": 0.95, + "grad_norm": 0.5852053165435791, + "learning_rate": 0.00046418822519552416, + "loss": 3.1294, + "step": 19323 + }, + { + "epoch": 0.95, + "grad_norm": 0.4941536784172058, + "learning_rate": 0.0004641753392859847, + "loss": 3.0502, + "step": 19324 + }, + { + "epoch": 0.95, + "grad_norm": 0.5297553539276123, + "learning_rate": 0.00046416245294403916, + "loss": 3.356, + "step": 19325 + }, + { + "epoch": 0.95, + "grad_norm": 0.5243952870368958, + "learning_rate": 0.00046414956616972126, + "loss": 3.1456, + "step": 19326 + }, + { + "epoch": 0.95, + "grad_norm": 0.5288766026496887, + "learning_rate": 0.00046413667896306495, + "loss": 2.9972, + "step": 19327 + }, + { + "epoch": 0.95, + "grad_norm": 0.5393479466438293, + "learning_rate": 0.00046412379132410427, + "loss": 2.8842, + "step": 19328 + }, + { + "epoch": 0.95, + "grad_norm": 0.49066945910453796, + "learning_rate": 0.0004641109032528731, + "loss": 2.9538, + "step": 19329 + }, + { + "epoch": 0.95, + "grad_norm": 0.509368360042572, + "learning_rate": 0.0004640980147494053, + "loss": 3.1996, + "step": 19330 + }, + { + "epoch": 0.95, + "grad_norm": 0.5085881948471069, + "learning_rate": 0.00046408512581373507, + "loss": 3.1171, + "step": 19331 + }, + { + "epoch": 0.95, + "grad_norm": 0.5356560349464417, + "learning_rate": 0.00046407223644589606, + "loss": 3.0997, + "step": 19332 + }, + { + "epoch": 0.95, + "grad_norm": 0.5444095730781555, + "learning_rate": 0.0004640593466459225, + "loss": 2.9612, + "step": 19333 + }, + { + "epoch": 0.95, + "grad_norm": 0.5353986620903015, + "learning_rate": 0.00046404645641384804, + "loss": 2.9857, + "step": 19334 + }, + { + "epoch": 0.95, + "grad_norm": 0.5451989769935608, + "learning_rate": 0.00046403356574970683, + "loss": 3.2518, + "step": 19335 + }, + { + "epoch": 0.95, + "grad_norm": 0.5122106671333313, + "learning_rate": 0.0004640206746535328, + "loss": 2.9274, + "step": 19336 + }, + { + "epoch": 0.95, + "grad_norm": 0.512254536151886, + "learning_rate": 0.00046400778312536, + "loss": 3.2984, + "step": 19337 + }, + { + "epoch": 0.95, + "grad_norm": 0.5183076858520508, + "learning_rate": 0.00046399489116522204, + "loss": 3.2875, + "step": 19338 + }, + { + "epoch": 0.95, + "grad_norm": 0.5587219595909119, + "learning_rate": 0.00046398199877315313, + "loss": 3.0897, + "step": 19339 + }, + { + "epoch": 0.95, + "grad_norm": 0.5293834209442139, + "learning_rate": 0.00046396910594918736, + "loss": 3.1424, + "step": 19340 + }, + { + "epoch": 0.95, + "grad_norm": 0.5190085172653198, + "learning_rate": 0.0004639562126933584, + "loss": 3.1517, + "step": 19341 + }, + { + "epoch": 0.95, + "grad_norm": 0.5054040551185608, + "learning_rate": 0.0004639433190057004, + "loss": 3.1675, + "step": 19342 + }, + { + "epoch": 0.95, + "grad_norm": 0.5302183628082275, + "learning_rate": 0.0004639304248862472, + "loss": 3.322, + "step": 19343 + }, + { + "epoch": 0.95, + "grad_norm": 0.5174694657325745, + "learning_rate": 0.0004639175303350328, + "loss": 3.1216, + "step": 19344 + }, + { + "epoch": 0.95, + "grad_norm": 0.5287314057350159, + "learning_rate": 0.00046390463535209115, + "loss": 2.9587, + "step": 19345 + }, + { + "epoch": 0.95, + "grad_norm": 0.49835264682769775, + "learning_rate": 0.0004638917399374563, + "loss": 3.1747, + "step": 19346 + }, + { + "epoch": 0.95, + "grad_norm": 0.5517767071723938, + "learning_rate": 0.0004638788440911621, + "loss": 3.1357, + "step": 19347 + }, + { + "epoch": 0.95, + "grad_norm": 0.5469534993171692, + "learning_rate": 0.0004638659478132426, + "loss": 3.1799, + "step": 19348 + }, + { + "epoch": 0.95, + "grad_norm": 0.5432090163230896, + "learning_rate": 0.0004638530511037317, + "loss": 3.2207, + "step": 19349 + }, + { + "epoch": 0.95, + "grad_norm": 0.6552304029464722, + "learning_rate": 0.00046384015396266345, + "loss": 3.1993, + "step": 19350 + }, + { + "epoch": 0.95, + "grad_norm": 0.5897645354270935, + "learning_rate": 0.00046382725639007165, + "loss": 3.1047, + "step": 19351 + }, + { + "epoch": 0.95, + "grad_norm": 0.5444059371948242, + "learning_rate": 0.0004638143583859905, + "loss": 3.164, + "step": 19352 + }, + { + "epoch": 0.95, + "grad_norm": 0.5374630093574524, + "learning_rate": 0.0004638014599504539, + "loss": 2.9453, + "step": 19353 + }, + { + "epoch": 0.95, + "grad_norm": 0.5767490267753601, + "learning_rate": 0.0004637885610834956, + "loss": 3.1528, + "step": 19354 + }, + { + "epoch": 0.95, + "grad_norm": 0.5264842510223389, + "learning_rate": 0.0004637756617851499, + "loss": 2.9574, + "step": 19355 + }, + { + "epoch": 0.95, + "grad_norm": 0.5502093434333801, + "learning_rate": 0.00046376276205545053, + "loss": 3.3082, + "step": 19356 + }, + { + "epoch": 0.95, + "grad_norm": 0.5488525629043579, + "learning_rate": 0.00046374986189443165, + "loss": 2.9616, + "step": 19357 + }, + { + "epoch": 0.95, + "grad_norm": 0.5013023018836975, + "learning_rate": 0.0004637369613021271, + "loss": 3.1773, + "step": 19358 + }, + { + "epoch": 0.95, + "grad_norm": 0.5348243713378906, + "learning_rate": 0.0004637240602785709, + "loss": 3.1887, + "step": 19359 + }, + { + "epoch": 0.95, + "grad_norm": 0.5497328042984009, + "learning_rate": 0.00046371115882379706, + "loss": 3.0564, + "step": 19360 + }, + { + "epoch": 0.95, + "grad_norm": 0.545487105846405, + "learning_rate": 0.0004636982569378395, + "loss": 3.0005, + "step": 19361 + }, + { + "epoch": 0.95, + "grad_norm": 0.5591291785240173, + "learning_rate": 0.0004636853546207323, + "loss": 3.0755, + "step": 19362 + }, + { + "epoch": 0.95, + "grad_norm": 0.5495262145996094, + "learning_rate": 0.0004636724518725093, + "loss": 3.4243, + "step": 19363 + }, + { + "epoch": 0.95, + "grad_norm": 0.5503337979316711, + "learning_rate": 0.00046365954869320464, + "loss": 2.9641, + "step": 19364 + }, + { + "epoch": 0.95, + "grad_norm": 0.508242130279541, + "learning_rate": 0.00046364664508285217, + "loss": 3.2054, + "step": 19365 + }, + { + "epoch": 0.95, + "grad_norm": 0.5202712416648865, + "learning_rate": 0.0004636337410414859, + "loss": 2.9891, + "step": 19366 + }, + { + "epoch": 0.95, + "grad_norm": 0.57612144947052, + "learning_rate": 0.0004636208365691399, + "loss": 3.0345, + "step": 19367 + }, + { + "epoch": 0.95, + "grad_norm": 0.5074007511138916, + "learning_rate": 0.0004636079316658481, + "loss": 3.0326, + "step": 19368 + }, + { + "epoch": 0.95, + "grad_norm": 0.5342861413955688, + "learning_rate": 0.0004635950263316445, + "loss": 3.0882, + "step": 19369 + }, + { + "epoch": 0.95, + "grad_norm": 0.531694769859314, + "learning_rate": 0.00046358212056656306, + "loss": 3.249, + "step": 19370 + }, + { + "epoch": 0.95, + "grad_norm": 0.5404502749443054, + "learning_rate": 0.00046356921437063777, + "loss": 3.159, + "step": 19371 + }, + { + "epoch": 0.95, + "grad_norm": 0.5267776250839233, + "learning_rate": 0.00046355630774390274, + "loss": 3.1012, + "step": 19372 + }, + { + "epoch": 0.95, + "grad_norm": 0.5390035510063171, + "learning_rate": 0.0004635434006863919, + "loss": 3.223, + "step": 19373 + }, + { + "epoch": 0.95, + "grad_norm": 0.49688008427619934, + "learning_rate": 0.00046353049319813904, + "loss": 3.1074, + "step": 19374 + }, + { + "epoch": 0.95, + "grad_norm": 0.557471752166748, + "learning_rate": 0.00046351758527917846, + "loss": 2.9832, + "step": 19375 + }, + { + "epoch": 0.95, + "grad_norm": 0.4971594214439392, + "learning_rate": 0.000463504676929544, + "loss": 3.2285, + "step": 19376 + }, + { + "epoch": 0.95, + "grad_norm": 0.5553884506225586, + "learning_rate": 0.0004634917681492697, + "loss": 3.1043, + "step": 19377 + }, + { + "epoch": 0.95, + "grad_norm": 0.5280371308326721, + "learning_rate": 0.00046347885893838957, + "loss": 3.2346, + "step": 19378 + }, + { + "epoch": 0.95, + "grad_norm": 0.5034427046775818, + "learning_rate": 0.0004634659492969376, + "loss": 3.1326, + "step": 19379 + }, + { + "epoch": 0.95, + "grad_norm": 0.5268436670303345, + "learning_rate": 0.0004634530392249476, + "loss": 3.2807, + "step": 19380 + }, + { + "epoch": 0.95, + "grad_norm": 0.5234237909317017, + "learning_rate": 0.000463440128722454, + "loss": 3.1346, + "step": 19381 + }, + { + "epoch": 0.95, + "grad_norm": 0.539435863494873, + "learning_rate": 0.0004634272177894904, + "loss": 3.1197, + "step": 19382 + }, + { + "epoch": 0.95, + "grad_norm": 0.5603162050247192, + "learning_rate": 0.000463414306426091, + "loss": 3.0302, + "step": 19383 + }, + { + "epoch": 0.95, + "grad_norm": 0.5288932919502258, + "learning_rate": 0.0004634013946322898, + "loss": 3.2148, + "step": 19384 + }, + { + "epoch": 0.95, + "grad_norm": 0.5117092728614807, + "learning_rate": 0.00046338848240812063, + "loss": 3.3284, + "step": 19385 + }, + { + "epoch": 0.95, + "grad_norm": 0.5178090929985046, + "learning_rate": 0.0004633755697536178, + "loss": 3.2783, + "step": 19386 + }, + { + "epoch": 0.95, + "grad_norm": 0.5456027388572693, + "learning_rate": 0.0004633626566688152, + "loss": 2.9413, + "step": 19387 + }, + { + "epoch": 0.95, + "grad_norm": 0.4823963940143585, + "learning_rate": 0.0004633497431537467, + "loss": 3.0158, + "step": 19388 + }, + { + "epoch": 0.95, + "grad_norm": 0.5608025193214417, + "learning_rate": 0.0004633368292084464, + "loss": 3.2938, + "step": 19389 + }, + { + "epoch": 0.95, + "grad_norm": 0.5916173458099365, + "learning_rate": 0.0004633239148329483, + "loss": 3.0889, + "step": 19390 + }, + { + "epoch": 0.95, + "grad_norm": 0.5329937934875488, + "learning_rate": 0.0004633110000272866, + "loss": 2.8621, + "step": 19391 + }, + { + "epoch": 0.95, + "grad_norm": 0.5109493732452393, + "learning_rate": 0.00046329808479149496, + "loss": 2.8752, + "step": 19392 + }, + { + "epoch": 0.95, + "grad_norm": 0.5147404074668884, + "learning_rate": 0.0004632851691256078, + "loss": 3.2002, + "step": 19393 + }, + { + "epoch": 0.95, + "grad_norm": 0.5089914798736572, + "learning_rate": 0.00046327225302965873, + "loss": 3.2138, + "step": 19394 + }, + { + "epoch": 0.95, + "grad_norm": 0.5555095672607422, + "learning_rate": 0.00046325933650368206, + "loss": 3.2143, + "step": 19395 + }, + { + "epoch": 0.95, + "grad_norm": 0.6058218479156494, + "learning_rate": 0.0004632464195477118, + "loss": 3.1112, + "step": 19396 + }, + { + "epoch": 0.95, + "grad_norm": 0.5387574434280396, + "learning_rate": 0.00046323350216178174, + "loss": 3.0377, + "step": 19397 + }, + { + "epoch": 0.95, + "grad_norm": 0.5902401804924011, + "learning_rate": 0.00046322058434592617, + "loss": 3.0203, + "step": 19398 + }, + { + "epoch": 0.95, + "grad_norm": 0.5146337747573853, + "learning_rate": 0.0004632076661001789, + "loss": 2.8675, + "step": 19399 + }, + { + "epoch": 0.95, + "grad_norm": 0.508232057094574, + "learning_rate": 0.0004631947474245741, + "loss": 3.2896, + "step": 19400 + }, + { + "epoch": 0.95, + "grad_norm": 0.5544096827507019, + "learning_rate": 0.00046318182831914565, + "loss": 3.0256, + "step": 19401 + }, + { + "epoch": 0.95, + "grad_norm": 0.5086759328842163, + "learning_rate": 0.0004631689087839278, + "loss": 3.1559, + "step": 19402 + }, + { + "epoch": 0.95, + "grad_norm": 0.5778783559799194, + "learning_rate": 0.0004631559888189544, + "loss": 3.1122, + "step": 19403 + }, + { + "epoch": 0.95, + "grad_norm": 0.5708818435668945, + "learning_rate": 0.00046314306842425954, + "loss": 3.1534, + "step": 19404 + }, + { + "epoch": 0.95, + "grad_norm": 0.5399811863899231, + "learning_rate": 0.00046313014759987716, + "loss": 3.0594, + "step": 19405 + }, + { + "epoch": 0.95, + "grad_norm": 0.5436029434204102, + "learning_rate": 0.0004631172263458414, + "loss": 3.0062, + "step": 19406 + }, + { + "epoch": 0.95, + "grad_norm": 0.5369762182235718, + "learning_rate": 0.0004631043046621863, + "loss": 3.0197, + "step": 19407 + }, + { + "epoch": 0.95, + "grad_norm": 0.5159459710121155, + "learning_rate": 0.00046309138254894586, + "loss": 3.0661, + "step": 19408 + }, + { + "epoch": 0.95, + "grad_norm": 0.5201807618141174, + "learning_rate": 0.00046307846000615406, + "loss": 3.1795, + "step": 19409 + }, + { + "epoch": 0.95, + "grad_norm": 0.5022982954978943, + "learning_rate": 0.000463065537033845, + "loss": 3.2928, + "step": 19410 + }, + { + "epoch": 0.95, + "grad_norm": 0.5162597298622131, + "learning_rate": 0.0004630526136320527, + "loss": 3.4038, + "step": 19411 + }, + { + "epoch": 0.95, + "grad_norm": 0.5503665804862976, + "learning_rate": 0.0004630396898008112, + "loss": 3.1428, + "step": 19412 + }, + { + "epoch": 0.95, + "grad_norm": 0.5348483920097351, + "learning_rate": 0.0004630267655401545, + "loss": 2.9746, + "step": 19413 + }, + { + "epoch": 0.95, + "grad_norm": 0.5585376024246216, + "learning_rate": 0.00046301384085011666, + "loss": 3.4144, + "step": 19414 + }, + { + "epoch": 0.95, + "grad_norm": 0.5287220478057861, + "learning_rate": 0.0004630009157307319, + "loss": 3.0886, + "step": 19415 + }, + { + "epoch": 0.95, + "grad_norm": 0.5285464525222778, + "learning_rate": 0.00046298799018203385, + "loss": 3.0322, + "step": 19416 + }, + { + "epoch": 0.95, + "grad_norm": 0.5362486839294434, + "learning_rate": 0.00046297506420405697, + "loss": 3.1066, + "step": 19417 + }, + { + "epoch": 0.95, + "grad_norm": 0.5312342643737793, + "learning_rate": 0.00046296213779683506, + "loss": 3.1128, + "step": 19418 + }, + { + "epoch": 0.95, + "grad_norm": 0.5306200981140137, + "learning_rate": 0.0004629492109604023, + "loss": 3.0913, + "step": 19419 + }, + { + "epoch": 0.95, + "grad_norm": 0.5508071780204773, + "learning_rate": 0.0004629362836947927, + "loss": 3.2167, + "step": 19420 + }, + { + "epoch": 0.95, + "grad_norm": 0.5401845574378967, + "learning_rate": 0.0004629233560000401, + "loss": 2.9779, + "step": 19421 + }, + { + "epoch": 0.95, + "grad_norm": 0.5715287327766418, + "learning_rate": 0.00046291042787617896, + "loss": 3.215, + "step": 19422 + }, + { + "epoch": 0.95, + "grad_norm": 0.5305150747299194, + "learning_rate": 0.000462897499323243, + "loss": 3.1558, + "step": 19423 + }, + { + "epoch": 0.95, + "grad_norm": 0.5180661082267761, + "learning_rate": 0.00046288457034126645, + "loss": 3.02, + "step": 19424 + }, + { + "epoch": 0.95, + "grad_norm": 0.5526053309440613, + "learning_rate": 0.0004628716409302832, + "loss": 3.0368, + "step": 19425 + }, + { + "epoch": 0.95, + "grad_norm": 0.5305153727531433, + "learning_rate": 0.00046285871109032743, + "loss": 3.0802, + "step": 19426 + }, + { + "epoch": 0.95, + "grad_norm": 0.5597370266914368, + "learning_rate": 0.00046284578082143315, + "loss": 3.0115, + "step": 19427 + }, + { + "epoch": 0.95, + "grad_norm": 0.5298964977264404, + "learning_rate": 0.00046283285012363446, + "loss": 3.121, + "step": 19428 + }, + { + "epoch": 0.95, + "grad_norm": 0.4885500371456146, + "learning_rate": 0.00046281991899696533, + "loss": 3.2538, + "step": 19429 + }, + { + "epoch": 0.95, + "grad_norm": 0.5146791934967041, + "learning_rate": 0.00046280698744145994, + "loss": 3.0759, + "step": 19430 + }, + { + "epoch": 0.95, + "grad_norm": 0.5360423922538757, + "learning_rate": 0.0004627940554571522, + "loss": 3.0549, + "step": 19431 + }, + { + "epoch": 0.95, + "grad_norm": 0.5529478192329407, + "learning_rate": 0.0004627811230440763, + "loss": 3.1961, + "step": 19432 + }, + { + "epoch": 0.95, + "grad_norm": 0.5056607723236084, + "learning_rate": 0.00046276819020226617, + "loss": 2.9044, + "step": 19433 + }, + { + "epoch": 0.95, + "grad_norm": 0.5211524367332458, + "learning_rate": 0.000462755256931756, + "loss": 3.2076, + "step": 19434 + }, + { + "epoch": 0.95, + "grad_norm": 0.5306402444839478, + "learning_rate": 0.00046274232323257984, + "loss": 3.2183, + "step": 19435 + }, + { + "epoch": 0.95, + "grad_norm": 0.5615695714950562, + "learning_rate": 0.00046272938910477167, + "loss": 3.0429, + "step": 19436 + }, + { + "epoch": 0.95, + "grad_norm": 0.5487105250358582, + "learning_rate": 0.0004627164545483656, + "loss": 3.0273, + "step": 19437 + }, + { + "epoch": 0.95, + "grad_norm": 0.5336460471153259, + "learning_rate": 0.0004627035195633958, + "loss": 3.3118, + "step": 19438 + }, + { + "epoch": 0.95, + "grad_norm": 0.5271636247634888, + "learning_rate": 0.00046269058414989613, + "loss": 3.1076, + "step": 19439 + }, + { + "epoch": 0.95, + "grad_norm": 0.5462597012519836, + "learning_rate": 0.00046267764830790084, + "loss": 3.1968, + "step": 19440 + }, + { + "epoch": 0.95, + "grad_norm": 0.5113808512687683, + "learning_rate": 0.000462664712037444, + "loss": 3.2844, + "step": 19441 + }, + { + "epoch": 0.95, + "grad_norm": 0.5376443266868591, + "learning_rate": 0.0004626517753385595, + "loss": 3.1548, + "step": 19442 + }, + { + "epoch": 0.95, + "grad_norm": 0.49754318594932556, + "learning_rate": 0.0004626388382112815, + "loss": 3.083, + "step": 19443 + }, + { + "epoch": 0.95, + "grad_norm": 0.4905608594417572, + "learning_rate": 0.0004626259006556441, + "loss": 3.3342, + "step": 19444 + }, + { + "epoch": 0.95, + "grad_norm": 0.5019567012786865, + "learning_rate": 0.00046261296267168146, + "loss": 3.1041, + "step": 19445 + }, + { + "epoch": 0.95, + "grad_norm": 0.5304019451141357, + "learning_rate": 0.00046260002425942755, + "loss": 3.3963, + "step": 19446 + }, + { + "epoch": 0.95, + "grad_norm": 0.5429536700248718, + "learning_rate": 0.0004625870854189165, + "loss": 3.04, + "step": 19447 + }, + { + "epoch": 0.95, + "grad_norm": 0.5398834943771362, + "learning_rate": 0.0004625741461501823, + "loss": 3.1989, + "step": 19448 + }, + { + "epoch": 0.95, + "grad_norm": 0.5834211707115173, + "learning_rate": 0.00046256120645325906, + "loss": 3.161, + "step": 19449 + }, + { + "epoch": 0.95, + "grad_norm": 0.5296370387077332, + "learning_rate": 0.000462548266328181, + "loss": 3.0605, + "step": 19450 + }, + { + "epoch": 0.95, + "grad_norm": 0.5220499634742737, + "learning_rate": 0.000462535325774982, + "loss": 3.2008, + "step": 19451 + }, + { + "epoch": 0.95, + "grad_norm": 0.5129631757736206, + "learning_rate": 0.00046252238479369624, + "loss": 3.1372, + "step": 19452 + }, + { + "epoch": 0.95, + "grad_norm": 0.5435758233070374, + "learning_rate": 0.0004625094433843579, + "loss": 3.0829, + "step": 19453 + }, + { + "epoch": 0.95, + "grad_norm": 0.57452392578125, + "learning_rate": 0.0004624965015470008, + "loss": 3.2124, + "step": 19454 + }, + { + "epoch": 0.95, + "grad_norm": 0.5123562216758728, + "learning_rate": 0.0004624835592816593, + "loss": 3.2627, + "step": 19455 + }, + { + "epoch": 0.95, + "grad_norm": 0.5587196350097656, + "learning_rate": 0.00046247061658836726, + "loss": 3.0844, + "step": 19456 + }, + { + "epoch": 0.95, + "grad_norm": 0.5435320138931274, + "learning_rate": 0.000462457673467159, + "loss": 3.2731, + "step": 19457 + }, + { + "epoch": 0.95, + "grad_norm": 0.5726426243782043, + "learning_rate": 0.0004624447299180685, + "loss": 3.237, + "step": 19458 + }, + { + "epoch": 0.95, + "grad_norm": 0.5328337550163269, + "learning_rate": 0.0004624317859411298, + "loss": 3.0805, + "step": 19459 + }, + { + "epoch": 0.95, + "grad_norm": 0.5529270172119141, + "learning_rate": 0.00046241884153637696, + "loss": 2.9795, + "step": 19460 + }, + { + "epoch": 0.95, + "grad_norm": 0.5545976161956787, + "learning_rate": 0.00046240589670384425, + "loss": 3.0586, + "step": 19461 + }, + { + "epoch": 0.95, + "grad_norm": 0.48107704520225525, + "learning_rate": 0.0004623929514435656, + "loss": 3.1515, + "step": 19462 + }, + { + "epoch": 0.95, + "grad_norm": 0.576178789138794, + "learning_rate": 0.0004623800057555752, + "loss": 3.2103, + "step": 19463 + }, + { + "epoch": 0.95, + "grad_norm": 0.5457341074943542, + "learning_rate": 0.00046236705963990715, + "loss": 3.2126, + "step": 19464 + }, + { + "epoch": 0.95, + "grad_norm": 0.5450583696365356, + "learning_rate": 0.0004623541130965955, + "loss": 3.0782, + "step": 19465 + }, + { + "epoch": 0.95, + "grad_norm": 0.5224042534828186, + "learning_rate": 0.00046234116612567437, + "loss": 3.1618, + "step": 19466 + }, + { + "epoch": 0.95, + "grad_norm": 0.5329810380935669, + "learning_rate": 0.00046232821872717783, + "loss": 3.2908, + "step": 19467 + }, + { + "epoch": 0.95, + "grad_norm": 0.5755662322044373, + "learning_rate": 0.00046231527090113993, + "loss": 3.0585, + "step": 19468 + }, + { + "epoch": 0.95, + "grad_norm": 0.5130234956741333, + "learning_rate": 0.000462302322647595, + "loss": 3.0353, + "step": 19469 + }, + { + "epoch": 0.95, + "grad_norm": 0.5430505275726318, + "learning_rate": 0.0004622893739665769, + "loss": 3.3614, + "step": 19470 + }, + { + "epoch": 0.95, + "grad_norm": 0.5734212398529053, + "learning_rate": 0.0004622764248581198, + "loss": 3.1137, + "step": 19471 + }, + { + "epoch": 0.95, + "grad_norm": 0.5490341782569885, + "learning_rate": 0.00046226347532225777, + "loss": 2.9489, + "step": 19472 + }, + { + "epoch": 0.95, + "grad_norm": 0.506134569644928, + "learning_rate": 0.0004622505253590251, + "loss": 3.2452, + "step": 19473 + }, + { + "epoch": 0.95, + "grad_norm": 0.5133436322212219, + "learning_rate": 0.00046223757496845577, + "loss": 3.1175, + "step": 19474 + }, + { + "epoch": 0.95, + "grad_norm": 0.4948771595954895, + "learning_rate": 0.0004622246241505839, + "loss": 3.2184, + "step": 19475 + }, + { + "epoch": 0.95, + "grad_norm": 0.663033664226532, + "learning_rate": 0.0004622116729054435, + "loss": 3.2785, + "step": 19476 + }, + { + "epoch": 0.95, + "grad_norm": 0.5038350224494934, + "learning_rate": 0.0004621987212330688, + "loss": 3.1118, + "step": 19477 + }, + { + "epoch": 0.95, + "grad_norm": 0.5808413624763489, + "learning_rate": 0.00046218576913349387, + "loss": 3.2685, + "step": 19478 + }, + { + "epoch": 0.95, + "grad_norm": 0.5481523275375366, + "learning_rate": 0.00046217281660675293, + "loss": 3.1188, + "step": 19479 + }, + { + "epoch": 0.95, + "grad_norm": 0.5153646469116211, + "learning_rate": 0.0004621598636528799, + "loss": 3.2296, + "step": 19480 + }, + { + "epoch": 0.95, + "grad_norm": 0.539476215839386, + "learning_rate": 0.00046214691027190904, + "loss": 3.3285, + "step": 19481 + }, + { + "epoch": 0.95, + "grad_norm": 0.4928472638130188, + "learning_rate": 0.0004621339564638744, + "loss": 3.1049, + "step": 19482 + }, + { + "epoch": 0.95, + "grad_norm": 0.4804633557796478, + "learning_rate": 0.00046212100222881014, + "loss": 3.2119, + "step": 19483 + }, + { + "epoch": 0.95, + "grad_norm": 0.524301290512085, + "learning_rate": 0.0004621080475667504, + "loss": 3.103, + "step": 19484 + }, + { + "epoch": 0.95, + "grad_norm": 0.50786954164505, + "learning_rate": 0.0004620950924777292, + "loss": 3.1123, + "step": 19485 + }, + { + "epoch": 0.95, + "grad_norm": 0.5153059363365173, + "learning_rate": 0.0004620821369617808, + "loss": 3.3104, + "step": 19486 + }, + { + "epoch": 0.96, + "grad_norm": 0.5356342196464539, + "learning_rate": 0.0004620691810189392, + "loss": 3.2367, + "step": 19487 + }, + { + "epoch": 0.96, + "grad_norm": 0.5034014582633972, + "learning_rate": 0.00046205622464923847, + "loss": 3.2318, + "step": 19488 + }, + { + "epoch": 0.96, + "grad_norm": 0.5269924402236938, + "learning_rate": 0.0004620432678527129, + "loss": 3.0648, + "step": 19489 + }, + { + "epoch": 0.96, + "grad_norm": 0.5155195593833923, + "learning_rate": 0.00046203031062939666, + "loss": 3.0857, + "step": 19490 + }, + { + "epoch": 0.96, + "grad_norm": 0.48004335165023804, + "learning_rate": 0.00046201735297932356, + "loss": 3.1868, + "step": 19491 + }, + { + "epoch": 0.96, + "grad_norm": 0.5057433843612671, + "learning_rate": 0.0004620043949025281, + "loss": 3.1481, + "step": 19492 + }, + { + "epoch": 0.96, + "grad_norm": 0.5076611638069153, + "learning_rate": 0.00046199143639904416, + "loss": 3.0926, + "step": 19493 + }, + { + "epoch": 0.96, + "grad_norm": 0.5741182565689087, + "learning_rate": 0.000461978477468906, + "loss": 3.1103, + "step": 19494 + }, + { + "epoch": 0.96, + "grad_norm": 0.5225886106491089, + "learning_rate": 0.00046196551811214775, + "loss": 3.1027, + "step": 19495 + }, + { + "epoch": 0.96, + "grad_norm": 0.5760247111320496, + "learning_rate": 0.0004619525583288034, + "loss": 3.2087, + "step": 19496 + }, + { + "epoch": 0.96, + "grad_norm": 0.5268136858940125, + "learning_rate": 0.0004619395981189072, + "loss": 3.1337, + "step": 19497 + }, + { + "epoch": 0.96, + "grad_norm": 0.5018905401229858, + "learning_rate": 0.0004619266374824932, + "loss": 3.2123, + "step": 19498 + }, + { + "epoch": 0.96, + "grad_norm": 0.509712815284729, + "learning_rate": 0.00046191367641959573, + "loss": 3.0303, + "step": 19499 + }, + { + "epoch": 0.96, + "grad_norm": 0.5171404480934143, + "learning_rate": 0.00046190071493024874, + "loss": 3.0558, + "step": 19500 + }, + { + "epoch": 0.96, + "grad_norm": 0.5230562686920166, + "learning_rate": 0.00046188775301448645, + "loss": 3.175, + "step": 19501 + }, + { + "epoch": 0.96, + "grad_norm": 0.5551393032073975, + "learning_rate": 0.00046187479067234295, + "loss": 3.0652, + "step": 19502 + }, + { + "epoch": 0.96, + "grad_norm": 0.5329045653343201, + "learning_rate": 0.0004618618279038524, + "loss": 3.0641, + "step": 19503 + }, + { + "epoch": 0.96, + "grad_norm": 0.4875706732273102, + "learning_rate": 0.00046184886470904894, + "loss": 3.0987, + "step": 19504 + }, + { + "epoch": 0.96, + "grad_norm": 0.5308302640914917, + "learning_rate": 0.00046183590108796673, + "loss": 3.156, + "step": 19505 + }, + { + "epoch": 0.96, + "grad_norm": 0.5178402662277222, + "learning_rate": 0.0004618229370406399, + "loss": 3.0371, + "step": 19506 + }, + { + "epoch": 0.96, + "grad_norm": 0.5128605365753174, + "learning_rate": 0.00046180997256710263, + "loss": 3.2408, + "step": 19507 + }, + { + "epoch": 0.96, + "grad_norm": 0.5387104749679565, + "learning_rate": 0.000461797007667389, + "loss": 3.2067, + "step": 19508 + }, + { + "epoch": 0.96, + "grad_norm": 0.5741128325462341, + "learning_rate": 0.0004617840423415332, + "loss": 3.2154, + "step": 19509 + }, + { + "epoch": 0.96, + "grad_norm": 0.5508107542991638, + "learning_rate": 0.0004617710765895693, + "loss": 3.1757, + "step": 19510 + }, + { + "epoch": 0.96, + "grad_norm": 0.5864234566688538, + "learning_rate": 0.00046175811041153164, + "loss": 3.1171, + "step": 19511 + }, + { + "epoch": 0.96, + "grad_norm": 0.5283697247505188, + "learning_rate": 0.00046174514380745423, + "loss": 3.4572, + "step": 19512 + }, + { + "epoch": 0.96, + "grad_norm": 0.4930495619773865, + "learning_rate": 0.0004617321767773711, + "loss": 2.9718, + "step": 19513 + }, + { + "epoch": 0.96, + "grad_norm": 0.49955621361732483, + "learning_rate": 0.00046171920932131666, + "loss": 3.215, + "step": 19514 + }, + { + "epoch": 0.96, + "grad_norm": 0.5142083764076233, + "learning_rate": 0.00046170624143932495, + "loss": 3.2683, + "step": 19515 + }, + { + "epoch": 0.96, + "grad_norm": 0.4969509243965149, + "learning_rate": 0.0004616932731314301, + "loss": 3.0456, + "step": 19516 + }, + { + "epoch": 0.96, + "grad_norm": 0.5548650622367859, + "learning_rate": 0.00046168030439766626, + "loss": 3.0429, + "step": 19517 + }, + { + "epoch": 0.96, + "grad_norm": 0.5828065276145935, + "learning_rate": 0.00046166733523806774, + "loss": 3.3183, + "step": 19518 + }, + { + "epoch": 0.96, + "grad_norm": 0.5464107990264893, + "learning_rate": 0.0004616543656526684, + "loss": 3.0603, + "step": 19519 + }, + { + "epoch": 0.96, + "grad_norm": 0.5539250373840332, + "learning_rate": 0.0004616413956415026, + "loss": 3.2492, + "step": 19520 + }, + { + "epoch": 0.96, + "grad_norm": 0.5596001744270325, + "learning_rate": 0.00046162842520460455, + "loss": 3.1842, + "step": 19521 + }, + { + "epoch": 0.96, + "grad_norm": 0.5353409647941589, + "learning_rate": 0.00046161545434200837, + "loss": 3.1103, + "step": 19522 + }, + { + "epoch": 0.96, + "grad_norm": 0.5830059051513672, + "learning_rate": 0.0004616024830537481, + "loss": 3.2124, + "step": 19523 + }, + { + "epoch": 0.96, + "grad_norm": 0.542768120765686, + "learning_rate": 0.0004615895113398579, + "loss": 3.1671, + "step": 19524 + }, + { + "epoch": 0.96, + "grad_norm": 0.5151305198669434, + "learning_rate": 0.00046157653920037223, + "loss": 3.0929, + "step": 19525 + }, + { + "epoch": 0.96, + "grad_norm": 0.5473406910896301, + "learning_rate": 0.000461563566635325, + "loss": 3.0294, + "step": 19526 + }, + { + "epoch": 0.96, + "grad_norm": 0.5182105302810669, + "learning_rate": 0.00046155059364475035, + "loss": 2.9818, + "step": 19527 + }, + { + "epoch": 0.96, + "grad_norm": 0.5524483323097229, + "learning_rate": 0.0004615376202286825, + "loss": 3.2716, + "step": 19528 + }, + { + "epoch": 0.96, + "grad_norm": 0.5527673363685608, + "learning_rate": 0.0004615246463871557, + "loss": 3.0887, + "step": 19529 + }, + { + "epoch": 0.96, + "grad_norm": 0.5192067623138428, + "learning_rate": 0.0004615116721202041, + "loss": 3.1964, + "step": 19530 + }, + { + "epoch": 0.96, + "grad_norm": 0.5294162631034851, + "learning_rate": 0.0004614986974278618, + "loss": 3.1753, + "step": 19531 + }, + { + "epoch": 0.96, + "grad_norm": 0.5797945261001587, + "learning_rate": 0.0004614857223101631, + "loss": 3.1273, + "step": 19532 + }, + { + "epoch": 0.96, + "grad_norm": 0.5336905717849731, + "learning_rate": 0.000461472746767142, + "loss": 3.2099, + "step": 19533 + }, + { + "epoch": 0.96, + "grad_norm": 0.5431013703346252, + "learning_rate": 0.0004614597707988327, + "loss": 3.2577, + "step": 19534 + }, + { + "epoch": 0.96, + "grad_norm": 0.5563946962356567, + "learning_rate": 0.0004614467944052695, + "loss": 2.8293, + "step": 19535 + }, + { + "epoch": 0.96, + "grad_norm": 0.5071994662284851, + "learning_rate": 0.00046143381758648666, + "loss": 3.0538, + "step": 19536 + }, + { + "epoch": 0.96, + "grad_norm": 0.5291922092437744, + "learning_rate": 0.000461420840342518, + "loss": 3.0829, + "step": 19537 + }, + { + "epoch": 0.96, + "grad_norm": 0.5239132642745972, + "learning_rate": 0.000461407862673398, + "loss": 3.1648, + "step": 19538 + }, + { + "epoch": 0.96, + "grad_norm": 0.5256131887435913, + "learning_rate": 0.00046139488457916074, + "loss": 3.1761, + "step": 19539 + }, + { + "epoch": 0.96, + "grad_norm": 0.5487732887268066, + "learning_rate": 0.0004613819060598404, + "loss": 3.2024, + "step": 19540 + }, + { + "epoch": 0.96, + "grad_norm": 0.5254712700843811, + "learning_rate": 0.00046136892711547123, + "loss": 3.1627, + "step": 19541 + }, + { + "epoch": 0.96, + "grad_norm": 0.5246246457099915, + "learning_rate": 0.00046135594774608733, + "loss": 3.0529, + "step": 19542 + }, + { + "epoch": 0.96, + "grad_norm": 0.5375425219535828, + "learning_rate": 0.0004613429679517229, + "loss": 3.3443, + "step": 19543 + }, + { + "epoch": 0.96, + "grad_norm": 0.5527751445770264, + "learning_rate": 0.00046132998773241215, + "loss": 2.9413, + "step": 19544 + }, + { + "epoch": 0.96, + "grad_norm": 0.5222285389900208, + "learning_rate": 0.00046131700708818924, + "loss": 3.3199, + "step": 19545 + }, + { + "epoch": 0.96, + "grad_norm": 0.577925443649292, + "learning_rate": 0.0004613040260190884, + "loss": 2.9484, + "step": 19546 + }, + { + "epoch": 0.96, + "grad_norm": 0.5509508848190308, + "learning_rate": 0.0004612910445251439, + "loss": 3.2603, + "step": 19547 + }, + { + "epoch": 0.96, + "grad_norm": 0.5331478714942932, + "learning_rate": 0.0004612780626063897, + "loss": 3.0465, + "step": 19548 + }, + { + "epoch": 0.96, + "grad_norm": 0.5440930128097534, + "learning_rate": 0.0004612650802628602, + "loss": 2.9711, + "step": 19549 + }, + { + "epoch": 0.96, + "grad_norm": 0.5174834728240967, + "learning_rate": 0.0004612520974945895, + "loss": 3.2217, + "step": 19550 + }, + { + "epoch": 0.96, + "grad_norm": 0.5301517844200134, + "learning_rate": 0.00046123911430161175, + "loss": 3.1833, + "step": 19551 + }, + { + "epoch": 0.96, + "grad_norm": 0.556764543056488, + "learning_rate": 0.00046122613068396123, + "loss": 2.9512, + "step": 19552 + }, + { + "epoch": 0.96, + "grad_norm": 0.5274459719657898, + "learning_rate": 0.00046121314664167214, + "loss": 3.0559, + "step": 19553 + }, + { + "epoch": 0.96, + "grad_norm": 0.5233287811279297, + "learning_rate": 0.0004612001621747786, + "loss": 3.1821, + "step": 19554 + }, + { + "epoch": 0.96, + "grad_norm": 0.49667537212371826, + "learning_rate": 0.00046118717728331493, + "loss": 3.1545, + "step": 19555 + }, + { + "epoch": 0.96, + "grad_norm": 0.5035794377326965, + "learning_rate": 0.00046117419196731516, + "loss": 2.9625, + "step": 19556 + }, + { + "epoch": 0.96, + "grad_norm": 0.536008894443512, + "learning_rate": 0.00046116120622681365, + "loss": 3.3597, + "step": 19557 + }, + { + "epoch": 0.96, + "grad_norm": 0.5389508008956909, + "learning_rate": 0.0004611482200618446, + "loss": 3.0717, + "step": 19558 + }, + { + "epoch": 0.96, + "grad_norm": 0.4923076629638672, + "learning_rate": 0.00046113523347244206, + "loss": 3.066, + "step": 19559 + }, + { + "epoch": 0.96, + "grad_norm": 0.4959174394607544, + "learning_rate": 0.00046112224645864023, + "loss": 3.1596, + "step": 19560 + }, + { + "epoch": 0.96, + "grad_norm": 0.5146404504776001, + "learning_rate": 0.0004611092590204736, + "loss": 3.1807, + "step": 19561 + }, + { + "epoch": 0.96, + "grad_norm": 0.5296205282211304, + "learning_rate": 0.0004610962711579761, + "loss": 3.0542, + "step": 19562 + }, + { + "epoch": 0.96, + "grad_norm": 0.5231249928474426, + "learning_rate": 0.00046108328287118203, + "loss": 3.3756, + "step": 19563 + }, + { + "epoch": 0.96, + "grad_norm": 0.5085944533348083, + "learning_rate": 0.0004610702941601256, + "loss": 3.316, + "step": 19564 + }, + { + "epoch": 0.96, + "grad_norm": 0.5056644082069397, + "learning_rate": 0.00046105730502484107, + "loss": 3.0114, + "step": 19565 + }, + { + "epoch": 0.96, + "grad_norm": 0.5329414010047913, + "learning_rate": 0.00046104431546536246, + "loss": 3.3503, + "step": 19566 + }, + { + "epoch": 0.96, + "grad_norm": 0.5404815077781677, + "learning_rate": 0.00046103132548172424, + "loss": 3.1329, + "step": 19567 + }, + { + "epoch": 0.96, + "grad_norm": 0.5342490673065186, + "learning_rate": 0.00046101833507396044, + "loss": 3.1882, + "step": 19568 + }, + { + "epoch": 0.96, + "grad_norm": 0.5149338245391846, + "learning_rate": 0.0004610053442421054, + "loss": 3.2602, + "step": 19569 + }, + { + "epoch": 0.96, + "grad_norm": 0.5321595072746277, + "learning_rate": 0.0004609923529861932, + "loss": 3.0032, + "step": 19570 + }, + { + "epoch": 0.96, + "grad_norm": 0.5394263863563538, + "learning_rate": 0.0004609793613062581, + "loss": 3.2131, + "step": 19571 + }, + { + "epoch": 0.96, + "grad_norm": 0.5631637573242188, + "learning_rate": 0.00046096636920233444, + "loss": 3.0242, + "step": 19572 + }, + { + "epoch": 0.96, + "grad_norm": 0.5770866870880127, + "learning_rate": 0.00046095337667445633, + "loss": 3.1954, + "step": 19573 + }, + { + "epoch": 0.96, + "grad_norm": 0.5115048289299011, + "learning_rate": 0.00046094038372265794, + "loss": 3.2293, + "step": 19574 + }, + { + "epoch": 0.96, + "grad_norm": 0.5524685978889465, + "learning_rate": 0.0004609273903469735, + "loss": 3.208, + "step": 19575 + }, + { + "epoch": 0.96, + "grad_norm": 0.5510011315345764, + "learning_rate": 0.00046091439654743745, + "loss": 3.228, + "step": 19576 + }, + { + "epoch": 0.96, + "grad_norm": 0.5279592275619507, + "learning_rate": 0.00046090140232408377, + "loss": 2.9963, + "step": 19577 + }, + { + "epoch": 0.96, + "grad_norm": 0.5411564111709595, + "learning_rate": 0.00046088840767694674, + "loss": 3.2272, + "step": 19578 + }, + { + "epoch": 0.96, + "grad_norm": 0.5248085260391235, + "learning_rate": 0.00046087541260606056, + "loss": 3.2172, + "step": 19579 + }, + { + "epoch": 0.96, + "grad_norm": 0.5476329922676086, + "learning_rate": 0.0004608624171114596, + "loss": 3.3069, + "step": 19580 + }, + { + "epoch": 0.96, + "grad_norm": 0.5507305264472961, + "learning_rate": 0.000460849421193178, + "loss": 3.096, + "step": 19581 + }, + { + "epoch": 0.96, + "grad_norm": 0.5406851172447205, + "learning_rate": 0.00046083642485124983, + "loss": 3.0172, + "step": 19582 + }, + { + "epoch": 0.96, + "grad_norm": 0.5372514724731445, + "learning_rate": 0.0004608234280857096, + "loss": 3.1018, + "step": 19583 + }, + { + "epoch": 0.96, + "grad_norm": 0.5436272621154785, + "learning_rate": 0.00046081043089659144, + "loss": 3.1314, + "step": 19584 + }, + { + "epoch": 0.96, + "grad_norm": 0.5231220126152039, + "learning_rate": 0.0004607974332839295, + "loss": 3.1906, + "step": 19585 + }, + { + "epoch": 0.96, + "grad_norm": 0.553106963634491, + "learning_rate": 0.000460784435247758, + "loss": 3.1702, + "step": 19586 + }, + { + "epoch": 0.96, + "grad_norm": 0.5628478527069092, + "learning_rate": 0.0004607714367881114, + "loss": 3.1288, + "step": 19587 + }, + { + "epoch": 0.96, + "grad_norm": 0.5031068921089172, + "learning_rate": 0.0004607584379050237, + "loss": 3.193, + "step": 19588 + }, + { + "epoch": 0.96, + "grad_norm": 0.5432695746421814, + "learning_rate": 0.00046074543859852917, + "loss": 3.0016, + "step": 19589 + }, + { + "epoch": 0.96, + "grad_norm": 0.5564255118370056, + "learning_rate": 0.00046073243886866216, + "loss": 3.1017, + "step": 19590 + }, + { + "epoch": 0.96, + "grad_norm": 0.7009369134902954, + "learning_rate": 0.0004607194387154567, + "loss": 3.213, + "step": 19591 + }, + { + "epoch": 0.96, + "grad_norm": 0.5528953671455383, + "learning_rate": 0.0004607064381389473, + "loss": 3.1717, + "step": 19592 + }, + { + "epoch": 0.96, + "grad_norm": 0.5072028636932373, + "learning_rate": 0.00046069343713916805, + "loss": 3.1961, + "step": 19593 + }, + { + "epoch": 0.96, + "grad_norm": 0.5909320712089539, + "learning_rate": 0.0004606804357161532, + "loss": 3.2262, + "step": 19594 + }, + { + "epoch": 0.96, + "grad_norm": 0.5035519003868103, + "learning_rate": 0.0004606674338699371, + "loss": 2.93, + "step": 19595 + }, + { + "epoch": 0.96, + "grad_norm": 0.5433776378631592, + "learning_rate": 0.0004606544316005537, + "loss": 3.5309, + "step": 19596 + }, + { + "epoch": 0.96, + "grad_norm": 0.550793468952179, + "learning_rate": 0.00046064142890803764, + "loss": 3.2649, + "step": 19597 + }, + { + "epoch": 0.96, + "grad_norm": 0.5635651350021362, + "learning_rate": 0.00046062842579242284, + "loss": 3.0402, + "step": 19598 + }, + { + "epoch": 0.96, + "grad_norm": 0.5558289289474487, + "learning_rate": 0.0004606154222537438, + "loss": 3.2313, + "step": 19599 + }, + { + "epoch": 0.96, + "grad_norm": 0.5349392294883728, + "learning_rate": 0.0004606024182920345, + "loss": 3.2041, + "step": 19600 + }, + { + "epoch": 0.96, + "grad_norm": 0.48463213443756104, + "learning_rate": 0.0004605894139073294, + "loss": 3.0712, + "step": 19601 + }, + { + "epoch": 0.96, + "grad_norm": 0.5352935194969177, + "learning_rate": 0.00046057640909966276, + "loss": 3.1331, + "step": 19602 + }, + { + "epoch": 0.96, + "grad_norm": 0.5178282856941223, + "learning_rate": 0.00046056340386906866, + "loss": 3.1748, + "step": 19603 + }, + { + "epoch": 0.96, + "grad_norm": 0.5154528617858887, + "learning_rate": 0.0004605503982155815, + "loss": 3.3167, + "step": 19604 + }, + { + "epoch": 0.96, + "grad_norm": 0.525182843208313, + "learning_rate": 0.0004605373921392355, + "loss": 3.2507, + "step": 19605 + }, + { + "epoch": 0.96, + "grad_norm": 0.549490213394165, + "learning_rate": 0.0004605243856400649, + "loss": 3.0423, + "step": 19606 + }, + { + "epoch": 0.96, + "grad_norm": 0.546172022819519, + "learning_rate": 0.00046051137871810395, + "loss": 3.1048, + "step": 19607 + }, + { + "epoch": 0.96, + "grad_norm": 0.5274287462234497, + "learning_rate": 0.00046049837137338695, + "loss": 3.0897, + "step": 19608 + }, + { + "epoch": 0.96, + "grad_norm": 0.5080083012580872, + "learning_rate": 0.00046048536360594815, + "loss": 3.1981, + "step": 19609 + }, + { + "epoch": 0.96, + "grad_norm": 0.5662698745727539, + "learning_rate": 0.00046047235541582174, + "loss": 3.2204, + "step": 19610 + }, + { + "epoch": 0.96, + "grad_norm": 0.49622294306755066, + "learning_rate": 0.00046045934680304194, + "loss": 2.9424, + "step": 19611 + }, + { + "epoch": 0.96, + "grad_norm": 0.5338951945304871, + "learning_rate": 0.0004604463377676432, + "loss": 3.0564, + "step": 19612 + }, + { + "epoch": 0.96, + "grad_norm": 0.5102505683898926, + "learning_rate": 0.00046043332830965973, + "loss": 3.1175, + "step": 19613 + }, + { + "epoch": 0.96, + "grad_norm": 0.5088198781013489, + "learning_rate": 0.00046042031842912575, + "loss": 3.0823, + "step": 19614 + }, + { + "epoch": 0.96, + "grad_norm": 0.4949013590812683, + "learning_rate": 0.0004604073081260754, + "loss": 3.1718, + "step": 19615 + }, + { + "epoch": 0.96, + "grad_norm": 0.529629647731781, + "learning_rate": 0.00046039429740054314, + "loss": 3.2467, + "step": 19616 + }, + { + "epoch": 0.96, + "grad_norm": 0.5418373942375183, + "learning_rate": 0.0004603812862525632, + "loss": 3.0991, + "step": 19617 + }, + { + "epoch": 0.96, + "grad_norm": 0.5319532155990601, + "learning_rate": 0.00046036827468216976, + "loss": 3.222, + "step": 19618 + }, + { + "epoch": 0.96, + "grad_norm": 0.5246682167053223, + "learning_rate": 0.0004603552626893972, + "loss": 3.1567, + "step": 19619 + }, + { + "epoch": 0.96, + "grad_norm": 0.5561453700065613, + "learning_rate": 0.00046034225027427974, + "loss": 3.1118, + "step": 19620 + }, + { + "epoch": 0.96, + "grad_norm": 0.554530143737793, + "learning_rate": 0.0004603292374368516, + "loss": 3.1151, + "step": 19621 + }, + { + "epoch": 0.96, + "grad_norm": 0.5948189496994019, + "learning_rate": 0.0004603162241771471, + "loss": 3.2595, + "step": 19622 + }, + { + "epoch": 0.96, + "grad_norm": 0.5682088136672974, + "learning_rate": 0.0004603032104952006, + "loss": 3.2056, + "step": 19623 + }, + { + "epoch": 0.96, + "grad_norm": 0.5129702091217041, + "learning_rate": 0.0004602901963910463, + "loss": 3.1712, + "step": 19624 + }, + { + "epoch": 0.96, + "grad_norm": 0.5358684659004211, + "learning_rate": 0.0004602771818647184, + "loss": 3.0202, + "step": 19625 + }, + { + "epoch": 0.96, + "grad_norm": 0.5510245561599731, + "learning_rate": 0.0004602641669162512, + "loss": 3.1402, + "step": 19626 + }, + { + "epoch": 0.96, + "grad_norm": 0.5696225762367249, + "learning_rate": 0.00046025115154567917, + "loss": 3.0832, + "step": 19627 + }, + { + "epoch": 0.96, + "grad_norm": 0.5569515824317932, + "learning_rate": 0.0004602381357530364, + "loss": 2.9884, + "step": 19628 + }, + { + "epoch": 0.96, + "grad_norm": 0.5148624181747437, + "learning_rate": 0.00046022511953835716, + "loss": 3.1615, + "step": 19629 + }, + { + "epoch": 0.96, + "grad_norm": 0.5084801316261292, + "learning_rate": 0.00046021210290167583, + "loss": 3.1071, + "step": 19630 + }, + { + "epoch": 0.96, + "grad_norm": 0.5485509634017944, + "learning_rate": 0.00046019908584302665, + "loss": 3.0196, + "step": 19631 + }, + { + "epoch": 0.96, + "grad_norm": 0.5287085175514221, + "learning_rate": 0.0004601860683624439, + "loss": 3.1482, + "step": 19632 + }, + { + "epoch": 0.96, + "grad_norm": 0.4873697757720947, + "learning_rate": 0.00046017305045996187, + "loss": 3.2477, + "step": 19633 + }, + { + "epoch": 0.96, + "grad_norm": 0.5111532807350159, + "learning_rate": 0.0004601600321356149, + "loss": 3.2722, + "step": 19634 + }, + { + "epoch": 0.96, + "grad_norm": 0.5960693359375, + "learning_rate": 0.00046014701338943723, + "loss": 2.9375, + "step": 19635 + }, + { + "epoch": 0.96, + "grad_norm": 0.5306110382080078, + "learning_rate": 0.00046013399422146307, + "loss": 3.0017, + "step": 19636 + }, + { + "epoch": 0.96, + "grad_norm": 0.5201436877250671, + "learning_rate": 0.0004601209746317268, + "loss": 3.2602, + "step": 19637 + }, + { + "epoch": 0.96, + "grad_norm": 0.5063363909721375, + "learning_rate": 0.0004601079546202628, + "loss": 3.1268, + "step": 19638 + }, + { + "epoch": 0.96, + "grad_norm": 0.5271068215370178, + "learning_rate": 0.00046009493418710514, + "loss": 3.2246, + "step": 19639 + }, + { + "epoch": 0.96, + "grad_norm": 0.527073860168457, + "learning_rate": 0.00046008191333228826, + "loss": 3.0653, + "step": 19640 + }, + { + "epoch": 0.96, + "grad_norm": 0.5272981524467468, + "learning_rate": 0.0004600688920558465, + "loss": 3.215, + "step": 19641 + }, + { + "epoch": 0.96, + "grad_norm": 0.5382295846939087, + "learning_rate": 0.000460055870357814, + "loss": 3.0829, + "step": 19642 + }, + { + "epoch": 0.96, + "grad_norm": 0.5190293192863464, + "learning_rate": 0.0004600428482382252, + "loss": 3.1779, + "step": 19643 + }, + { + "epoch": 0.96, + "grad_norm": 0.508520781993866, + "learning_rate": 0.0004600298256971143, + "loss": 3.2162, + "step": 19644 + }, + { + "epoch": 0.96, + "grad_norm": 0.5364028215408325, + "learning_rate": 0.0004600168027345156, + "loss": 3.2912, + "step": 19645 + }, + { + "epoch": 0.96, + "grad_norm": 0.541970431804657, + "learning_rate": 0.00046000377935046357, + "loss": 3.2242, + "step": 19646 + }, + { + "epoch": 0.96, + "grad_norm": 0.522739052772522, + "learning_rate": 0.0004599907555449922, + "loss": 3.1251, + "step": 19647 + }, + { + "epoch": 0.96, + "grad_norm": 0.5367051959037781, + "learning_rate": 0.00045997773131813607, + "loss": 3.1206, + "step": 19648 + }, + { + "epoch": 0.96, + "grad_norm": 0.5582072734832764, + "learning_rate": 0.0004599647066699295, + "loss": 3.1991, + "step": 19649 + }, + { + "epoch": 0.96, + "grad_norm": 0.5519962310791016, + "learning_rate": 0.00045995168160040654, + "loss": 3.0101, + "step": 19650 + }, + { + "epoch": 0.96, + "grad_norm": 0.529492199420929, + "learning_rate": 0.0004599386561096016, + "loss": 3.188, + "step": 19651 + }, + { + "epoch": 0.96, + "grad_norm": 0.5170547366142273, + "learning_rate": 0.0004599256301975491, + "loss": 3.1679, + "step": 19652 + }, + { + "epoch": 0.96, + "grad_norm": 0.5805275440216064, + "learning_rate": 0.00045991260386428327, + "loss": 3.0967, + "step": 19653 + }, + { + "epoch": 0.96, + "grad_norm": 0.5890424847602844, + "learning_rate": 0.0004598995771098384, + "loss": 3.3168, + "step": 19654 + }, + { + "epoch": 0.96, + "grad_norm": 0.5336859226226807, + "learning_rate": 0.0004598865499342488, + "loss": 3.0317, + "step": 19655 + }, + { + "epoch": 0.96, + "grad_norm": 0.6045253872871399, + "learning_rate": 0.0004598735223375488, + "loss": 3.1161, + "step": 19656 + }, + { + "epoch": 0.96, + "grad_norm": 0.5241043567657471, + "learning_rate": 0.00045986049431977265, + "loss": 3.141, + "step": 19657 + }, + { + "epoch": 0.96, + "grad_norm": 0.515809178352356, + "learning_rate": 0.0004598474658809548, + "loss": 3.1444, + "step": 19658 + }, + { + "epoch": 0.96, + "grad_norm": 0.5346136093139648, + "learning_rate": 0.00045983443702112946, + "loss": 3.181, + "step": 19659 + }, + { + "epoch": 0.96, + "grad_norm": 0.5317142605781555, + "learning_rate": 0.00045982140774033104, + "loss": 3.4307, + "step": 19660 + }, + { + "epoch": 0.96, + "grad_norm": 0.558459997177124, + "learning_rate": 0.0004598083780385938, + "loss": 3.2656, + "step": 19661 + }, + { + "epoch": 0.96, + "grad_norm": 0.5433655381202698, + "learning_rate": 0.0004597953479159519, + "loss": 3.1691, + "step": 19662 + }, + { + "epoch": 0.96, + "grad_norm": 0.5130866765975952, + "learning_rate": 0.0004597823173724399, + "loss": 3.2532, + "step": 19663 + }, + { + "epoch": 0.96, + "grad_norm": 0.5559121966362, + "learning_rate": 0.0004597692864080921, + "loss": 3.3065, + "step": 19664 + }, + { + "epoch": 0.96, + "grad_norm": 0.4829885959625244, + "learning_rate": 0.0004597562550229426, + "loss": 3.2244, + "step": 19665 + }, + { + "epoch": 0.96, + "grad_norm": 0.5515817403793335, + "learning_rate": 0.00045974322321702595, + "loss": 3.0803, + "step": 19666 + }, + { + "epoch": 0.96, + "grad_norm": 0.5156745314598083, + "learning_rate": 0.0004597301909903764, + "loss": 3.254, + "step": 19667 + }, + { + "epoch": 0.96, + "grad_norm": 0.574837863445282, + "learning_rate": 0.0004597171583430282, + "loss": 3.2968, + "step": 19668 + }, + { + "epoch": 0.96, + "grad_norm": 0.5619298219680786, + "learning_rate": 0.0004597041252750158, + "loss": 3.1675, + "step": 19669 + }, + { + "epoch": 0.96, + "grad_norm": 0.5266140103340149, + "learning_rate": 0.00045969109178637345, + "loss": 3.226, + "step": 19670 + }, + { + "epoch": 0.96, + "grad_norm": 0.5088499784469604, + "learning_rate": 0.00045967805787713563, + "loss": 3.0859, + "step": 19671 + }, + { + "epoch": 0.96, + "grad_norm": 0.5350180268287659, + "learning_rate": 0.0004596650235473364, + "loss": 3.1795, + "step": 19672 + }, + { + "epoch": 0.96, + "grad_norm": 0.5337379574775696, + "learning_rate": 0.0004596519887970102, + "loss": 3.135, + "step": 19673 + }, + { + "epoch": 0.96, + "grad_norm": 0.5811973810195923, + "learning_rate": 0.0004596389536261914, + "loss": 3.3238, + "step": 19674 + }, + { + "epoch": 0.96, + "grad_norm": 0.5261113047599792, + "learning_rate": 0.00045962591803491444, + "loss": 3.3253, + "step": 19675 + }, + { + "epoch": 0.96, + "grad_norm": 0.5248528718948364, + "learning_rate": 0.00045961288202321345, + "loss": 3.1994, + "step": 19676 + }, + { + "epoch": 0.96, + "grad_norm": 0.510138750076294, + "learning_rate": 0.0004595998455911228, + "loss": 3.3404, + "step": 19677 + }, + { + "epoch": 0.96, + "grad_norm": 0.5522067546844482, + "learning_rate": 0.0004595868087386769, + "loss": 2.9776, + "step": 19678 + }, + { + "epoch": 0.96, + "grad_norm": 0.7141591906547546, + "learning_rate": 0.0004595737714659101, + "loss": 3.2153, + "step": 19679 + }, + { + "epoch": 0.96, + "grad_norm": 0.49402374029159546, + "learning_rate": 0.00045956073377285663, + "loss": 3.3181, + "step": 19680 + }, + { + "epoch": 0.96, + "grad_norm": 0.5236515402793884, + "learning_rate": 0.00045954769565955094, + "loss": 3.3036, + "step": 19681 + }, + { + "epoch": 0.96, + "grad_norm": 0.525768518447876, + "learning_rate": 0.0004595346571260274, + "loss": 3.1389, + "step": 19682 + }, + { + "epoch": 0.96, + "grad_norm": 0.6157646179199219, + "learning_rate": 0.00045952161817232014, + "loss": 3.1411, + "step": 19683 + }, + { + "epoch": 0.96, + "grad_norm": 0.5027342438697815, + "learning_rate": 0.00045950857879846366, + "loss": 3.3379, + "step": 19684 + }, + { + "epoch": 0.96, + "grad_norm": 0.5160840153694153, + "learning_rate": 0.00045949553900449233, + "loss": 3.1588, + "step": 19685 + }, + { + "epoch": 0.96, + "grad_norm": 0.5430676341056824, + "learning_rate": 0.00045948249879044047, + "loss": 3.1516, + "step": 19686 + }, + { + "epoch": 0.96, + "grad_norm": 0.5123657584190369, + "learning_rate": 0.00045946945815634235, + "loss": 2.9488, + "step": 19687 + }, + { + "epoch": 0.96, + "grad_norm": 0.4994308054447174, + "learning_rate": 0.0004594564171022323, + "loss": 3.1864, + "step": 19688 + }, + { + "epoch": 0.96, + "grad_norm": 0.5245607495307922, + "learning_rate": 0.0004594433756281449, + "loss": 3.1557, + "step": 19689 + }, + { + "epoch": 0.96, + "grad_norm": 0.508391261100769, + "learning_rate": 0.0004594303337341143, + "loss": 3.1624, + "step": 19690 + }, + { + "epoch": 0.97, + "grad_norm": 0.5643361806869507, + "learning_rate": 0.00045941729142017477, + "loss": 3.0144, + "step": 19691 + }, + { + "epoch": 0.97, + "grad_norm": 0.5318419933319092, + "learning_rate": 0.00045940424868636085, + "loss": 2.9979, + "step": 19692 + }, + { + "epoch": 0.97, + "grad_norm": 0.5172003507614136, + "learning_rate": 0.00045939120553270676, + "loss": 3.1654, + "step": 19693 + }, + { + "epoch": 0.97, + "grad_norm": 0.5160081386566162, + "learning_rate": 0.000459378161959247, + "loss": 3.1003, + "step": 19694 + }, + { + "epoch": 0.97, + "grad_norm": 0.5427367687225342, + "learning_rate": 0.0004593651179660158, + "loss": 3.2305, + "step": 19695 + }, + { + "epoch": 0.97, + "grad_norm": 0.523469865322113, + "learning_rate": 0.00045935207355304744, + "loss": 3.2241, + "step": 19696 + }, + { + "epoch": 0.97, + "grad_norm": 0.5474386215209961, + "learning_rate": 0.0004593390287203766, + "loss": 3.1163, + "step": 19697 + }, + { + "epoch": 0.97, + "grad_norm": 0.5134205222129822, + "learning_rate": 0.0004593259834680372, + "loss": 3.2631, + "step": 19698 + }, + { + "epoch": 0.97, + "grad_norm": 0.5353065729141235, + "learning_rate": 0.00045931293779606397, + "loss": 3.2246, + "step": 19699 + }, + { + "epoch": 0.97, + "grad_norm": 0.5378754138946533, + "learning_rate": 0.00045929989170449115, + "loss": 3.3742, + "step": 19700 + }, + { + "epoch": 0.97, + "grad_norm": 0.5078233480453491, + "learning_rate": 0.00045928684519335296, + "loss": 3.0678, + "step": 19701 + }, + { + "epoch": 0.97, + "grad_norm": 0.534570038318634, + "learning_rate": 0.00045927379826268393, + "loss": 3.2471, + "step": 19702 + }, + { + "epoch": 0.97, + "grad_norm": 0.5478757619857788, + "learning_rate": 0.0004592607509125184, + "loss": 2.8799, + "step": 19703 + }, + { + "epoch": 0.97, + "grad_norm": 0.5131859183311462, + "learning_rate": 0.0004592477031428906, + "loss": 3.0923, + "step": 19704 + }, + { + "epoch": 0.97, + "grad_norm": 0.5837509036064148, + "learning_rate": 0.00045923465495383513, + "loss": 3.4114, + "step": 19705 + }, + { + "epoch": 0.97, + "grad_norm": 0.5130158066749573, + "learning_rate": 0.0004592216063453861, + "loss": 3.1598, + "step": 19706 + }, + { + "epoch": 0.97, + "grad_norm": 0.5604761242866516, + "learning_rate": 0.000459208557317578, + "loss": 3.2225, + "step": 19707 + }, + { + "epoch": 0.97, + "grad_norm": 0.5324618220329285, + "learning_rate": 0.00045919550787044527, + "loss": 3.1212, + "step": 19708 + }, + { + "epoch": 0.97, + "grad_norm": 0.5190209150314331, + "learning_rate": 0.0004591824580040222, + "loss": 3.0561, + "step": 19709 + }, + { + "epoch": 0.97, + "grad_norm": 0.5445383191108704, + "learning_rate": 0.0004591694077183432, + "loss": 3.1508, + "step": 19710 + }, + { + "epoch": 0.97, + "grad_norm": 0.5293034315109253, + "learning_rate": 0.0004591563570134427, + "loss": 3.187, + "step": 19711 + }, + { + "epoch": 0.97, + "grad_norm": 0.5551683902740479, + "learning_rate": 0.0004591433058893548, + "loss": 3.1458, + "step": 19712 + }, + { + "epoch": 0.97, + "grad_norm": 0.5138912796974182, + "learning_rate": 0.00045913025434611413, + "loss": 2.9319, + "step": 19713 + }, + { + "epoch": 0.97, + "grad_norm": 0.5497405529022217, + "learning_rate": 0.000459117202383755, + "loss": 3.1789, + "step": 19714 + }, + { + "epoch": 0.97, + "grad_norm": 0.572364091873169, + "learning_rate": 0.00045910415000231173, + "loss": 3.2243, + "step": 19715 + }, + { + "epoch": 0.97, + "grad_norm": 0.5154417753219604, + "learning_rate": 0.0004590910972018188, + "loss": 2.9992, + "step": 19716 + }, + { + "epoch": 0.97, + "grad_norm": 0.5528554320335388, + "learning_rate": 0.00045907804398231055, + "loss": 3.0243, + "step": 19717 + }, + { + "epoch": 0.97, + "grad_norm": 0.5160757303237915, + "learning_rate": 0.0004590649903438213, + "loss": 3.1319, + "step": 19718 + }, + { + "epoch": 0.97, + "grad_norm": 0.4939347803592682, + "learning_rate": 0.0004590519362863855, + "loss": 2.9028, + "step": 19719 + }, + { + "epoch": 0.97, + "grad_norm": 0.5522775650024414, + "learning_rate": 0.0004590388818100375, + "loss": 3.0864, + "step": 19720 + }, + { + "epoch": 0.97, + "grad_norm": 0.5493162870407104, + "learning_rate": 0.00045902582691481175, + "loss": 3.3321, + "step": 19721 + }, + { + "epoch": 0.97, + "grad_norm": 0.5284614562988281, + "learning_rate": 0.00045901277160074257, + "loss": 3.1432, + "step": 19722 + }, + { + "epoch": 0.97, + "grad_norm": 0.5101198554039001, + "learning_rate": 0.00045899971586786436, + "loss": 3.3082, + "step": 19723 + }, + { + "epoch": 0.97, + "grad_norm": 0.5157438516616821, + "learning_rate": 0.00045898665971621133, + "loss": 3.1203, + "step": 19724 + }, + { + "epoch": 0.97, + "grad_norm": 0.538258969783783, + "learning_rate": 0.00045897360314581823, + "loss": 2.9963, + "step": 19725 + }, + { + "epoch": 0.97, + "grad_norm": 0.5072723627090454, + "learning_rate": 0.0004589605461567192, + "loss": 3.2203, + "step": 19726 + }, + { + "epoch": 0.97, + "grad_norm": 0.5226778388023376, + "learning_rate": 0.0004589474887489486, + "loss": 3.2614, + "step": 19727 + }, + { + "epoch": 0.97, + "grad_norm": 0.5579911470413208, + "learning_rate": 0.000458934430922541, + "loss": 3.1676, + "step": 19728 + }, + { + "epoch": 0.97, + "grad_norm": 0.514929473400116, + "learning_rate": 0.0004589213726775307, + "loss": 3.1178, + "step": 19729 + }, + { + "epoch": 0.97, + "grad_norm": 0.5149416923522949, + "learning_rate": 0.000458908314013952, + "loss": 3.1532, + "step": 19730 + }, + { + "epoch": 0.97, + "grad_norm": 0.5170868039131165, + "learning_rate": 0.00045889525493183954, + "loss": 3.1691, + "step": 19731 + }, + { + "epoch": 0.97, + "grad_norm": 0.5064755082130432, + "learning_rate": 0.0004588821954312274, + "loss": 2.9093, + "step": 19732 + }, + { + "epoch": 0.97, + "grad_norm": 0.5192744135856628, + "learning_rate": 0.00045886913551215026, + "loss": 3.0864, + "step": 19733 + }, + { + "epoch": 0.97, + "grad_norm": 0.5007292032241821, + "learning_rate": 0.0004588560751746423, + "loss": 3.2911, + "step": 19734 + }, + { + "epoch": 0.97, + "grad_norm": 0.5587050914764404, + "learning_rate": 0.000458843014418738, + "loss": 3.0655, + "step": 19735 + }, + { + "epoch": 0.97, + "grad_norm": 0.5086398720741272, + "learning_rate": 0.0004588299532444718, + "loss": 3.1727, + "step": 19736 + }, + { + "epoch": 0.97, + "grad_norm": 0.5102616548538208, + "learning_rate": 0.0004588168916518781, + "loss": 3.1426, + "step": 19737 + }, + { + "epoch": 0.97, + "grad_norm": 0.5597113966941833, + "learning_rate": 0.0004588038296409913, + "loss": 3.0143, + "step": 19738 + }, + { + "epoch": 0.97, + "grad_norm": 0.5260500311851501, + "learning_rate": 0.00045879076721184564, + "loss": 2.9833, + "step": 19739 + }, + { + "epoch": 0.97, + "grad_norm": 0.5252350568771362, + "learning_rate": 0.00045877770436447587, + "loss": 3.433, + "step": 19740 + }, + { + "epoch": 0.97, + "grad_norm": 0.5167698264122009, + "learning_rate": 0.00045876464109891605, + "loss": 3.3007, + "step": 19741 + }, + { + "epoch": 0.97, + "grad_norm": 0.512118935585022, + "learning_rate": 0.0004587515774152007, + "loss": 3.1823, + "step": 19742 + }, + { + "epoch": 0.97, + "grad_norm": 0.53267502784729, + "learning_rate": 0.00045873851331336424, + "loss": 3.4128, + "step": 19743 + }, + { + "epoch": 0.97, + "grad_norm": 0.5323781371116638, + "learning_rate": 0.00045872544879344115, + "loss": 3.2691, + "step": 19744 + }, + { + "epoch": 0.97, + "grad_norm": 0.5187110900878906, + "learning_rate": 0.0004587123838554657, + "loss": 2.9825, + "step": 19745 + }, + { + "epoch": 0.97, + "grad_norm": 0.5188029408454895, + "learning_rate": 0.0004586993184994725, + "loss": 3.2929, + "step": 19746 + }, + { + "epoch": 0.97, + "grad_norm": 0.5603750348091125, + "learning_rate": 0.00045868625272549573, + "loss": 3.1024, + "step": 19747 + }, + { + "epoch": 0.97, + "grad_norm": 0.5371494889259338, + "learning_rate": 0.00045867318653356994, + "loss": 3.2192, + "step": 19748 + }, + { + "epoch": 0.97, + "grad_norm": 0.5349953770637512, + "learning_rate": 0.00045866011992372953, + "loss": 3.1364, + "step": 19749 + }, + { + "epoch": 0.97, + "grad_norm": 0.5063318610191345, + "learning_rate": 0.0004586470528960088, + "loss": 3.1449, + "step": 19750 + }, + { + "epoch": 0.97, + "grad_norm": 0.5154860019683838, + "learning_rate": 0.00045863398545044245, + "loss": 3.0492, + "step": 19751 + }, + { + "epoch": 0.97, + "grad_norm": 0.4861948490142822, + "learning_rate": 0.0004586209175870646, + "loss": 3.3389, + "step": 19752 + }, + { + "epoch": 0.97, + "grad_norm": 0.5171583294868469, + "learning_rate": 0.0004586078493059098, + "loss": 3.1478, + "step": 19753 + }, + { + "epoch": 0.97, + "grad_norm": 0.5161516666412354, + "learning_rate": 0.0004585947806070124, + "loss": 2.9258, + "step": 19754 + }, + { + "epoch": 0.97, + "grad_norm": 0.5537519454956055, + "learning_rate": 0.00045858171149040696, + "loss": 3.1637, + "step": 19755 + }, + { + "epoch": 0.97, + "grad_norm": 0.5510892271995544, + "learning_rate": 0.0004585686419561277, + "loss": 3.1517, + "step": 19756 + }, + { + "epoch": 0.97, + "grad_norm": 0.5581437349319458, + "learning_rate": 0.0004585555720042093, + "loss": 3.4651, + "step": 19757 + }, + { + "epoch": 0.97, + "grad_norm": 0.5405652523040771, + "learning_rate": 0.00045854250163468587, + "loss": 3.2142, + "step": 19758 + }, + { + "epoch": 0.97, + "grad_norm": 0.536334753036499, + "learning_rate": 0.00045852943084759214, + "loss": 3.2114, + "step": 19759 + }, + { + "epoch": 0.97, + "grad_norm": 0.5953757166862488, + "learning_rate": 0.00045851635964296234, + "loss": 3.0759, + "step": 19760 + }, + { + "epoch": 0.97, + "grad_norm": 0.4927489161491394, + "learning_rate": 0.00045850328802083095, + "loss": 3.3361, + "step": 19761 + }, + { + "epoch": 0.97, + "grad_norm": 0.5365741848945618, + "learning_rate": 0.00045849021598123245, + "loss": 3.0068, + "step": 19762 + }, + { + "epoch": 0.97, + "grad_norm": 0.5352941751480103, + "learning_rate": 0.0004584771435242012, + "loss": 3.2842, + "step": 19763 + }, + { + "epoch": 0.97, + "grad_norm": 0.5236901640892029, + "learning_rate": 0.00045846407064977163, + "loss": 3.0848, + "step": 19764 + }, + { + "epoch": 0.97, + "grad_norm": 0.5306557416915894, + "learning_rate": 0.0004584509973579782, + "loss": 3.1199, + "step": 19765 + }, + { + "epoch": 0.97, + "grad_norm": 0.5267027616500854, + "learning_rate": 0.00045843792364885533, + "loss": 3.346, + "step": 19766 + }, + { + "epoch": 0.97, + "grad_norm": 0.5062419176101685, + "learning_rate": 0.0004584248495224375, + "loss": 3.1383, + "step": 19767 + }, + { + "epoch": 0.97, + "grad_norm": 0.5400051474571228, + "learning_rate": 0.00045841177497875905, + "loss": 3.0832, + "step": 19768 + }, + { + "epoch": 0.97, + "grad_norm": 0.5341521501541138, + "learning_rate": 0.00045839870001785455, + "loss": 3.1805, + "step": 19769 + }, + { + "epoch": 0.97, + "grad_norm": 0.5922198295593262, + "learning_rate": 0.00045838562463975833, + "loss": 3.1083, + "step": 19770 + }, + { + "epoch": 0.97, + "grad_norm": 0.49147966504096985, + "learning_rate": 0.0004583725488445048, + "loss": 3.2987, + "step": 19771 + }, + { + "epoch": 0.97, + "grad_norm": 0.49468183517456055, + "learning_rate": 0.00045835947263212846, + "loss": 3.3835, + "step": 19772 + }, + { + "epoch": 0.97, + "grad_norm": 0.5233425498008728, + "learning_rate": 0.00045834639600266387, + "loss": 3.1561, + "step": 19773 + }, + { + "epoch": 0.97, + "grad_norm": 0.5618370175361633, + "learning_rate": 0.0004583333189561453, + "loss": 3.0311, + "step": 19774 + }, + { + "epoch": 0.97, + "grad_norm": 0.526765763759613, + "learning_rate": 0.00045832024149260707, + "loss": 3.2129, + "step": 19775 + }, + { + "epoch": 0.97, + "grad_norm": 0.5225813388824463, + "learning_rate": 0.0004583071636120839, + "loss": 3.273, + "step": 19776 + }, + { + "epoch": 0.97, + "grad_norm": 0.514604389667511, + "learning_rate": 0.00045829408531461023, + "loss": 3.0254, + "step": 19777 + }, + { + "epoch": 0.97, + "grad_norm": 0.5545125603675842, + "learning_rate": 0.00045828100660022037, + "loss": 3.1412, + "step": 19778 + }, + { + "epoch": 0.97, + "grad_norm": 0.544165849685669, + "learning_rate": 0.00045826792746894875, + "loss": 3.1968, + "step": 19779 + }, + { + "epoch": 0.97, + "grad_norm": 0.5067391395568848, + "learning_rate": 0.0004582548479208298, + "loss": 3.0627, + "step": 19780 + }, + { + "epoch": 0.97, + "grad_norm": 0.4904179275035858, + "learning_rate": 0.0004582417679558981, + "loss": 3.0005, + "step": 19781 + }, + { + "epoch": 0.97, + "grad_norm": 0.5631465315818787, + "learning_rate": 0.0004582286875741881, + "loss": 3.186, + "step": 19782 + }, + { + "epoch": 0.97, + "grad_norm": 0.5135632753372192, + "learning_rate": 0.00045821560677573414, + "loss": 3.1411, + "step": 19783 + }, + { + "epoch": 0.97, + "grad_norm": 0.5340128540992737, + "learning_rate": 0.0004582025255605708, + "loss": 2.9759, + "step": 19784 + }, + { + "epoch": 0.97, + "grad_norm": 0.5625054836273193, + "learning_rate": 0.0004581894439287322, + "loss": 3.1578, + "step": 19785 + }, + { + "epoch": 0.97, + "grad_norm": 0.5040884017944336, + "learning_rate": 0.00045817636188025333, + "loss": 3.1584, + "step": 19786 + }, + { + "epoch": 0.97, + "grad_norm": 0.5131189823150635, + "learning_rate": 0.00045816327941516823, + "loss": 3.3082, + "step": 19787 + }, + { + "epoch": 0.97, + "grad_norm": 0.5243518948554993, + "learning_rate": 0.0004581501965335115, + "loss": 3.0006, + "step": 19788 + }, + { + "epoch": 0.97, + "grad_norm": 0.5257127285003662, + "learning_rate": 0.0004581371132353176, + "loss": 3.2521, + "step": 19789 + }, + { + "epoch": 0.97, + "grad_norm": 0.5097827911376953, + "learning_rate": 0.00045812402952062105, + "loss": 3.0542, + "step": 19790 + }, + { + "epoch": 0.97, + "grad_norm": 0.572334349155426, + "learning_rate": 0.00045811094538945614, + "loss": 2.9641, + "step": 19791 + }, + { + "epoch": 0.97, + "grad_norm": 0.529404878616333, + "learning_rate": 0.00045809786084185746, + "loss": 3.0158, + "step": 19792 + }, + { + "epoch": 0.97, + "grad_norm": 0.5602056980133057, + "learning_rate": 0.00045808477587785945, + "loss": 3.2361, + "step": 19793 + }, + { + "epoch": 0.97, + "grad_norm": 0.5243390202522278, + "learning_rate": 0.00045807169049749653, + "loss": 3.2635, + "step": 19794 + }, + { + "epoch": 0.97, + "grad_norm": 0.5974134802818298, + "learning_rate": 0.0004580586047008033, + "loss": 3.0509, + "step": 19795 + }, + { + "epoch": 0.97, + "grad_norm": 0.4903809428215027, + "learning_rate": 0.0004580455184878139, + "loss": 3.148, + "step": 19796 + }, + { + "epoch": 0.97, + "grad_norm": 0.5298634767532349, + "learning_rate": 0.00045803243185856327, + "loss": 3.2435, + "step": 19797 + }, + { + "epoch": 0.97, + "grad_norm": 0.5195876955986023, + "learning_rate": 0.00045801934481308547, + "loss": 3.1942, + "step": 19798 + }, + { + "epoch": 0.97, + "grad_norm": 0.581200361251831, + "learning_rate": 0.0004580062573514153, + "loss": 3.017, + "step": 19799 + }, + { + "epoch": 0.97, + "grad_norm": 0.5249642729759216, + "learning_rate": 0.0004579931694735869, + "loss": 3.0202, + "step": 19800 + }, + { + "epoch": 0.97, + "grad_norm": 0.5444178581237793, + "learning_rate": 0.0004579800811796349, + "loss": 3.1979, + "step": 19801 + }, + { + "epoch": 0.97, + "grad_norm": 0.5659769177436829, + "learning_rate": 0.00045796699246959384, + "loss": 3.2618, + "step": 19802 + }, + { + "epoch": 0.97, + "grad_norm": 0.5352531671524048, + "learning_rate": 0.00045795390334349813, + "loss": 3.0874, + "step": 19803 + }, + { + "epoch": 0.97, + "grad_norm": 0.5184085965156555, + "learning_rate": 0.0004579408138013822, + "loss": 3.1453, + "step": 19804 + }, + { + "epoch": 0.97, + "grad_norm": 0.5398754477500916, + "learning_rate": 0.0004579277238432805, + "loss": 3.0704, + "step": 19805 + }, + { + "epoch": 0.97, + "grad_norm": 0.5079792737960815, + "learning_rate": 0.00045791463346922756, + "loss": 3.1852, + "step": 19806 + }, + { + "epoch": 0.97, + "grad_norm": 0.521885871887207, + "learning_rate": 0.00045790154267925795, + "loss": 3.2204, + "step": 19807 + }, + { + "epoch": 0.97, + "grad_norm": 0.5815854072570801, + "learning_rate": 0.00045788845147340607, + "loss": 2.9648, + "step": 19808 + }, + { + "epoch": 0.97, + "grad_norm": 0.5417388081550598, + "learning_rate": 0.00045787535985170643, + "loss": 3.3381, + "step": 19809 + }, + { + "epoch": 0.97, + "grad_norm": 0.5544579029083252, + "learning_rate": 0.00045786226781419347, + "loss": 3.0114, + "step": 19810 + }, + { + "epoch": 0.97, + "grad_norm": 0.5227230191230774, + "learning_rate": 0.00045784917536090155, + "loss": 3.1927, + "step": 19811 + }, + { + "epoch": 0.97, + "grad_norm": 0.5432718992233276, + "learning_rate": 0.0004578360824918653, + "loss": 2.9235, + "step": 19812 + }, + { + "epoch": 0.97, + "grad_norm": 0.5189009308815002, + "learning_rate": 0.0004578229892071193, + "loss": 3.1517, + "step": 19813 + }, + { + "epoch": 0.97, + "grad_norm": 0.5389320850372314, + "learning_rate": 0.000457809895506698, + "loss": 3.2118, + "step": 19814 + }, + { + "epoch": 0.97, + "grad_norm": 0.5334956049919128, + "learning_rate": 0.0004577968013906356, + "loss": 2.9926, + "step": 19815 + }, + { + "epoch": 0.97, + "grad_norm": 0.5629310011863708, + "learning_rate": 0.0004577837068589669, + "loss": 3.139, + "step": 19816 + }, + { + "epoch": 0.97, + "grad_norm": 0.5584556460380554, + "learning_rate": 0.0004577706119117262, + "loss": 3.049, + "step": 19817 + }, + { + "epoch": 0.97, + "grad_norm": 0.4945135712623596, + "learning_rate": 0.0004577575165489482, + "loss": 3.1779, + "step": 19818 + }, + { + "epoch": 0.97, + "grad_norm": 0.5308895111083984, + "learning_rate": 0.0004577444207706671, + "loss": 3.3229, + "step": 19819 + }, + { + "epoch": 0.97, + "grad_norm": 0.5382962226867676, + "learning_rate": 0.00045773132457691766, + "loss": 3.2655, + "step": 19820 + }, + { + "epoch": 0.97, + "grad_norm": 0.5207489728927612, + "learning_rate": 0.0004577182279677342, + "loss": 3.2315, + "step": 19821 + }, + { + "epoch": 0.97, + "grad_norm": 0.5437517166137695, + "learning_rate": 0.0004577051309431513, + "loss": 3.1617, + "step": 19822 + }, + { + "epoch": 0.97, + "grad_norm": 0.5084508657455444, + "learning_rate": 0.0004576920335032035, + "loss": 3.1338, + "step": 19823 + }, + { + "epoch": 0.97, + "grad_norm": 0.48307299613952637, + "learning_rate": 0.0004576789356479253, + "loss": 3.0642, + "step": 19824 + }, + { + "epoch": 0.97, + "grad_norm": 0.520545244216919, + "learning_rate": 0.000457665837377351, + "loss": 2.956, + "step": 19825 + }, + { + "epoch": 0.97, + "grad_norm": 0.5704953074455261, + "learning_rate": 0.00045765273869151514, + "loss": 3.1909, + "step": 19826 + }, + { + "epoch": 0.97, + "grad_norm": 0.5432178974151611, + "learning_rate": 0.00045763963959045246, + "loss": 3.1516, + "step": 19827 + }, + { + "epoch": 0.97, + "grad_norm": 0.5819136500358582, + "learning_rate": 0.0004576265400741973, + "loss": 3.1398, + "step": 19828 + }, + { + "epoch": 0.97, + "grad_norm": 0.51753169298172, + "learning_rate": 0.00045761344014278414, + "loss": 2.8872, + "step": 19829 + }, + { + "epoch": 0.97, + "grad_norm": 0.5441774725914001, + "learning_rate": 0.0004576003397962475, + "loss": 3.1017, + "step": 19830 + }, + { + "epoch": 0.97, + "grad_norm": 0.5453305840492249, + "learning_rate": 0.0004575872390346219, + "loss": 3.1247, + "step": 19831 + }, + { + "epoch": 0.97, + "grad_norm": 0.5290966033935547, + "learning_rate": 0.0004575741378579419, + "loss": 3.0589, + "step": 19832 + }, + { + "epoch": 0.97, + "grad_norm": 0.5535355806350708, + "learning_rate": 0.0004575610362662418, + "loss": 3.0727, + "step": 19833 + }, + { + "epoch": 0.97, + "grad_norm": 0.5289100408554077, + "learning_rate": 0.00045754793425955637, + "loss": 3.0748, + "step": 19834 + }, + { + "epoch": 0.97, + "grad_norm": 0.681056559085846, + "learning_rate": 0.00045753483183792005, + "loss": 3.1107, + "step": 19835 + }, + { + "epoch": 0.97, + "grad_norm": 0.514905571937561, + "learning_rate": 0.0004575217290013672, + "loss": 3.1522, + "step": 19836 + }, + { + "epoch": 0.97, + "grad_norm": 0.5799508094787598, + "learning_rate": 0.00045750862574993236, + "loss": 3.1752, + "step": 19837 + }, + { + "epoch": 0.97, + "grad_norm": 0.5410858988761902, + "learning_rate": 0.0004574955220836503, + "loss": 3.1188, + "step": 19838 + }, + { + "epoch": 0.97, + "grad_norm": 0.541213870048523, + "learning_rate": 0.0004574824180025553, + "loss": 3.1487, + "step": 19839 + }, + { + "epoch": 0.97, + "grad_norm": 0.5377730131149292, + "learning_rate": 0.00045746931350668183, + "loss": 3.2846, + "step": 19840 + }, + { + "epoch": 0.97, + "grad_norm": 0.5022572875022888, + "learning_rate": 0.0004574562085960646, + "loss": 3.1793, + "step": 19841 + }, + { + "epoch": 0.97, + "grad_norm": 0.535351574420929, + "learning_rate": 0.0004574431032707379, + "loss": 2.9624, + "step": 19842 + }, + { + "epoch": 0.97, + "grad_norm": 0.5488033890724182, + "learning_rate": 0.0004574299975307364, + "loss": 3.1208, + "step": 19843 + }, + { + "epoch": 0.97, + "grad_norm": 0.5779278874397278, + "learning_rate": 0.00045741689137609467, + "loss": 3.1375, + "step": 19844 + }, + { + "epoch": 0.97, + "grad_norm": 0.5179200768470764, + "learning_rate": 0.00045740378480684706, + "loss": 2.9844, + "step": 19845 + }, + { + "epoch": 0.97, + "grad_norm": 0.5455824732780457, + "learning_rate": 0.00045739067782302824, + "loss": 2.9835, + "step": 19846 + }, + { + "epoch": 0.97, + "grad_norm": 0.5416254997253418, + "learning_rate": 0.00045737757042467263, + "loss": 3.4106, + "step": 19847 + }, + { + "epoch": 0.97, + "grad_norm": 0.5338101387023926, + "learning_rate": 0.0004573644626118147, + "loss": 3.2009, + "step": 19848 + }, + { + "epoch": 0.97, + "grad_norm": 0.5255735516548157, + "learning_rate": 0.0004573513543844892, + "loss": 3.203, + "step": 19849 + }, + { + "epoch": 0.97, + "grad_norm": 0.5638295412063599, + "learning_rate": 0.0004573382457427305, + "loss": 3.2313, + "step": 19850 + }, + { + "epoch": 0.97, + "grad_norm": 0.5044254660606384, + "learning_rate": 0.0004573251366865731, + "loss": 3.1181, + "step": 19851 + }, + { + "epoch": 0.97, + "grad_norm": 0.675537109375, + "learning_rate": 0.00045731202721605144, + "loss": 3.3099, + "step": 19852 + }, + { + "epoch": 0.97, + "grad_norm": 0.517725944519043, + "learning_rate": 0.0004572989173312004, + "loss": 3.1187, + "step": 19853 + }, + { + "epoch": 0.97, + "grad_norm": 0.5297645330429077, + "learning_rate": 0.0004572858070320542, + "loss": 3.2745, + "step": 19854 + }, + { + "epoch": 0.97, + "grad_norm": 0.5531717538833618, + "learning_rate": 0.00045727269631864743, + "loss": 3.0869, + "step": 19855 + }, + { + "epoch": 0.97, + "grad_norm": 0.5511997938156128, + "learning_rate": 0.0004572595851910147, + "loss": 3.2544, + "step": 19856 + }, + { + "epoch": 0.97, + "grad_norm": 0.5770429968833923, + "learning_rate": 0.0004572464736491904, + "loss": 3.058, + "step": 19857 + }, + { + "epoch": 0.97, + "grad_norm": 0.5271993279457092, + "learning_rate": 0.00045723336169320916, + "loss": 3.221, + "step": 19858 + }, + { + "epoch": 0.97, + "grad_norm": 0.5206511616706848, + "learning_rate": 0.0004572202493231056, + "loss": 3.3138, + "step": 19859 + }, + { + "epoch": 0.97, + "grad_norm": 0.537216067314148, + "learning_rate": 0.0004572071365389141, + "loss": 3.0961, + "step": 19860 + }, + { + "epoch": 0.97, + "grad_norm": 0.5249066948890686, + "learning_rate": 0.00045719402334066926, + "loss": 3.1989, + "step": 19861 + }, + { + "epoch": 0.97, + "grad_norm": 0.49584659934043884, + "learning_rate": 0.0004571809097284055, + "loss": 3.3242, + "step": 19862 + }, + { + "epoch": 0.97, + "grad_norm": 0.4906226694583893, + "learning_rate": 0.00045716779570215767, + "loss": 3.2433, + "step": 19863 + }, + { + "epoch": 0.97, + "grad_norm": 0.5297616720199585, + "learning_rate": 0.00045715468126196, + "loss": 3.0477, + "step": 19864 + }, + { + "epoch": 0.97, + "grad_norm": 0.5272489786148071, + "learning_rate": 0.00045714156640784727, + "loss": 3.1792, + "step": 19865 + }, + { + "epoch": 0.97, + "grad_norm": 0.5244301557540894, + "learning_rate": 0.0004571284511398538, + "loss": 3.3991, + "step": 19866 + }, + { + "epoch": 0.97, + "grad_norm": 0.5065206289291382, + "learning_rate": 0.0004571153354580142, + "loss": 3.0658, + "step": 19867 + }, + { + "epoch": 0.97, + "grad_norm": 0.5413605570793152, + "learning_rate": 0.00045710221936236305, + "loss": 3.2466, + "step": 19868 + }, + { + "epoch": 0.97, + "grad_norm": 0.5623077154159546, + "learning_rate": 0.00045708910285293487, + "loss": 3.2348, + "step": 19869 + }, + { + "epoch": 0.97, + "grad_norm": 0.5285463333129883, + "learning_rate": 0.0004570759859297643, + "loss": 2.9883, + "step": 19870 + }, + { + "epoch": 0.97, + "grad_norm": 0.530189573764801, + "learning_rate": 0.0004570628685928858, + "loss": 3.1729, + "step": 19871 + }, + { + "epoch": 0.97, + "grad_norm": 0.5128258466720581, + "learning_rate": 0.00045704975084233395, + "loss": 3.0372, + "step": 19872 + }, + { + "epoch": 0.97, + "grad_norm": 0.5443863868713379, + "learning_rate": 0.0004570366326781432, + "loss": 3.1384, + "step": 19873 + }, + { + "epoch": 0.97, + "grad_norm": 0.547491729259491, + "learning_rate": 0.0004570235141003482, + "loss": 3.2092, + "step": 19874 + }, + { + "epoch": 0.97, + "grad_norm": 0.5074962377548218, + "learning_rate": 0.0004570103951089836, + "loss": 3.0942, + "step": 19875 + }, + { + "epoch": 0.97, + "grad_norm": 0.574950635433197, + "learning_rate": 0.00045699727570408375, + "loss": 3.1655, + "step": 19876 + }, + { + "epoch": 0.97, + "grad_norm": 0.5343877077102661, + "learning_rate": 0.00045698415588568334, + "loss": 3.1291, + "step": 19877 + }, + { + "epoch": 0.97, + "grad_norm": 0.5411051511764526, + "learning_rate": 0.0004569710356538167, + "loss": 3.2585, + "step": 19878 + }, + { + "epoch": 0.97, + "grad_norm": 0.5204007625579834, + "learning_rate": 0.0004569579150085188, + "loss": 3.2438, + "step": 19879 + }, + { + "epoch": 0.97, + "grad_norm": 0.518437385559082, + "learning_rate": 0.0004569447939498238, + "loss": 3.0511, + "step": 19880 + }, + { + "epoch": 0.97, + "grad_norm": 0.5438820123672485, + "learning_rate": 0.0004569316724777665, + "loss": 3.1059, + "step": 19881 + }, + { + "epoch": 0.97, + "grad_norm": 0.5277252793312073, + "learning_rate": 0.00045691855059238126, + "loss": 3.1655, + "step": 19882 + }, + { + "epoch": 0.97, + "grad_norm": 0.4945885241031647, + "learning_rate": 0.0004569054282937029, + "loss": 3.0627, + "step": 19883 + }, + { + "epoch": 0.97, + "grad_norm": 0.5790326595306396, + "learning_rate": 0.0004568923055817657, + "loss": 3.1567, + "step": 19884 + }, + { + "epoch": 0.97, + "grad_norm": 0.5103229880332947, + "learning_rate": 0.0004568791824566045, + "loss": 3.2317, + "step": 19885 + }, + { + "epoch": 0.97, + "grad_norm": 0.5120726823806763, + "learning_rate": 0.00045686605891825363, + "loss": 3.2693, + "step": 19886 + }, + { + "epoch": 0.97, + "grad_norm": 0.5078148245811462, + "learning_rate": 0.00045685293496674784, + "loss": 3.2933, + "step": 19887 + }, + { + "epoch": 0.97, + "grad_norm": 0.5411629676818848, + "learning_rate": 0.0004568398106021215, + "loss": 2.9652, + "step": 19888 + }, + { + "epoch": 0.97, + "grad_norm": 0.5563063621520996, + "learning_rate": 0.00045682668582440933, + "loss": 3.2037, + "step": 19889 + }, + { + "epoch": 0.97, + "grad_norm": 0.5265916585922241, + "learning_rate": 0.0004568135606336459, + "loss": 3.0914, + "step": 19890 + }, + { + "epoch": 0.97, + "grad_norm": 0.5466210842132568, + "learning_rate": 0.0004568004350298657, + "loss": 3.1264, + "step": 19891 + }, + { + "epoch": 0.97, + "grad_norm": 0.5217623710632324, + "learning_rate": 0.0004567873090131033, + "loss": 3.1154, + "step": 19892 + }, + { + "epoch": 0.97, + "grad_norm": 0.514886736869812, + "learning_rate": 0.0004567741825833933, + "loss": 3.2016, + "step": 19893 + }, + { + "epoch": 0.97, + "grad_norm": 0.6545686721801758, + "learning_rate": 0.00045676105574077026, + "loss": 3.2672, + "step": 19894 + }, + { + "epoch": 0.98, + "grad_norm": 0.527996838092804, + "learning_rate": 0.0004567479284852689, + "loss": 3.0232, + "step": 19895 + }, + { + "epoch": 0.98, + "grad_norm": 0.5357192754745483, + "learning_rate": 0.0004567348008169235, + "loss": 3.1696, + "step": 19896 + }, + { + "epoch": 0.98, + "grad_norm": 0.5141521692276001, + "learning_rate": 0.00045672167273576894, + "loss": 3.205, + "step": 19897 + }, + { + "epoch": 0.98, + "grad_norm": 0.5258877277374268, + "learning_rate": 0.00045670854424183953, + "loss": 3.2861, + "step": 19898 + }, + { + "epoch": 0.98, + "grad_norm": 0.574230432510376, + "learning_rate": 0.00045669541533517, + "loss": 3.245, + "step": 19899 + }, + { + "epoch": 0.98, + "grad_norm": 0.5562745928764343, + "learning_rate": 0.00045668228601579495, + "loss": 2.9874, + "step": 19900 + }, + { + "epoch": 0.98, + "grad_norm": 0.5093985199928284, + "learning_rate": 0.0004566691562837489, + "loss": 3.2173, + "step": 19901 + }, + { + "epoch": 0.98, + "grad_norm": 0.5798713564872742, + "learning_rate": 0.0004566560261390664, + "loss": 3.2883, + "step": 19902 + }, + { + "epoch": 0.98, + "grad_norm": 0.5926944017410278, + "learning_rate": 0.0004566428955817821, + "loss": 3.1588, + "step": 19903 + }, + { + "epoch": 0.98, + "grad_norm": 0.5162215828895569, + "learning_rate": 0.0004566297646119306, + "loss": 3.1708, + "step": 19904 + }, + { + "epoch": 0.98, + "grad_norm": 0.5461186766624451, + "learning_rate": 0.0004566166332295464, + "loss": 2.9684, + "step": 19905 + }, + { + "epoch": 0.98, + "grad_norm": 0.5802727341651917, + "learning_rate": 0.0004566035014346641, + "loss": 3.1052, + "step": 19906 + }, + { + "epoch": 0.98, + "grad_norm": 0.5327712297439575, + "learning_rate": 0.0004565903692273184, + "loss": 3.114, + "step": 19907 + }, + { + "epoch": 0.98, + "grad_norm": 0.5120739936828613, + "learning_rate": 0.00045657723660754384, + "loss": 3.0222, + "step": 19908 + }, + { + "epoch": 0.98, + "grad_norm": 0.5327271819114685, + "learning_rate": 0.0004565641035753748, + "loss": 3.0759, + "step": 19909 + }, + { + "epoch": 0.98, + "grad_norm": 0.524299144744873, + "learning_rate": 0.00045655097013084616, + "loss": 3.2712, + "step": 19910 + }, + { + "epoch": 0.98, + "grad_norm": 0.5683066248893738, + "learning_rate": 0.00045653783627399235, + "loss": 3.2524, + "step": 19911 + }, + { + "epoch": 0.98, + "grad_norm": 0.5321347117424011, + "learning_rate": 0.00045652470200484806, + "loss": 3.205, + "step": 19912 + }, + { + "epoch": 0.98, + "grad_norm": 0.5390219688415527, + "learning_rate": 0.0004565115673234478, + "loss": 3.1628, + "step": 19913 + }, + { + "epoch": 0.98, + "grad_norm": 0.5145443677902222, + "learning_rate": 0.0004564984322298261, + "loss": 3.2744, + "step": 19914 + }, + { + "epoch": 0.98, + "grad_norm": 0.5551137924194336, + "learning_rate": 0.0004564852967240178, + "loss": 3.1102, + "step": 19915 + }, + { + "epoch": 0.98, + "grad_norm": 0.524955153465271, + "learning_rate": 0.00045647216080605725, + "loss": 3.3276, + "step": 19916 + }, + { + "epoch": 0.98, + "grad_norm": 0.5320925712585449, + "learning_rate": 0.00045645902447597923, + "loss": 3.3742, + "step": 19917 + }, + { + "epoch": 0.98, + "grad_norm": 0.518169105052948, + "learning_rate": 0.00045644588773381813, + "loss": 3.1312, + "step": 19918 + }, + { + "epoch": 0.98, + "grad_norm": 0.5469488501548767, + "learning_rate": 0.00045643275057960877, + "loss": 3.137, + "step": 19919 + }, + { + "epoch": 0.98, + "grad_norm": 0.5299994945526123, + "learning_rate": 0.0004564196130133856, + "loss": 3.2838, + "step": 19920 + }, + { + "epoch": 0.98, + "grad_norm": 0.5019270181655884, + "learning_rate": 0.0004564064750351833, + "loss": 3.0652, + "step": 19921 + }, + { + "epoch": 0.98, + "grad_norm": 0.5183849930763245, + "learning_rate": 0.00045639333664503637, + "loss": 2.9966, + "step": 19922 + }, + { + "epoch": 0.98, + "grad_norm": 0.5174096822738647, + "learning_rate": 0.0004563801978429797, + "loss": 2.9913, + "step": 19923 + }, + { + "epoch": 0.98, + "grad_norm": 0.5443447828292847, + "learning_rate": 0.0004563670586290475, + "loss": 3.057, + "step": 19924 + }, + { + "epoch": 0.98, + "grad_norm": 0.564255952835083, + "learning_rate": 0.0004563539190032746, + "loss": 3.0481, + "step": 19925 + }, + { + "epoch": 0.98, + "grad_norm": 0.5157086253166199, + "learning_rate": 0.0004563407789656956, + "loss": 3.0848, + "step": 19926 + }, + { + "epoch": 0.98, + "grad_norm": 0.5265598893165588, + "learning_rate": 0.00045632763851634496, + "loss": 3.1655, + "step": 19927 + }, + { + "epoch": 0.98, + "grad_norm": 0.5157798528671265, + "learning_rate": 0.00045631449765525753, + "loss": 3.2964, + "step": 19928 + }, + { + "epoch": 0.98, + "grad_norm": 0.5361979603767395, + "learning_rate": 0.00045630135638246775, + "loss": 3.1274, + "step": 19929 + }, + { + "epoch": 0.98, + "grad_norm": 0.5458618998527527, + "learning_rate": 0.0004562882146980103, + "loss": 3.1837, + "step": 19930 + }, + { + "epoch": 0.98, + "grad_norm": 0.49934300780296326, + "learning_rate": 0.0004562750726019197, + "loss": 3.0736, + "step": 19931 + }, + { + "epoch": 0.98, + "grad_norm": 0.5094740390777588, + "learning_rate": 0.00045626193009423076, + "loss": 2.8927, + "step": 19932 + }, + { + "epoch": 0.98, + "grad_norm": 0.5375152826309204, + "learning_rate": 0.00045624878717497784, + "loss": 3.2418, + "step": 19933 + }, + { + "epoch": 0.98, + "grad_norm": 0.5470656156539917, + "learning_rate": 0.0004562356438441957, + "loss": 3.2453, + "step": 19934 + }, + { + "epoch": 0.98, + "grad_norm": 0.5325232148170471, + "learning_rate": 0.000456222500101919, + "loss": 3.1305, + "step": 19935 + }, + { + "epoch": 0.98, + "grad_norm": 0.5318748950958252, + "learning_rate": 0.00045620935594818234, + "loss": 3.0646, + "step": 19936 + }, + { + "epoch": 0.98, + "grad_norm": 0.5223355889320374, + "learning_rate": 0.0004561962113830203, + "loss": 3.2699, + "step": 19937 + }, + { + "epoch": 0.98, + "grad_norm": 0.5093883872032166, + "learning_rate": 0.00045618306640646744, + "loss": 3.2268, + "step": 19938 + }, + { + "epoch": 0.98, + "grad_norm": 0.5158838033676147, + "learning_rate": 0.0004561699210185584, + "loss": 3.0998, + "step": 19939 + }, + { + "epoch": 0.98, + "grad_norm": 0.5459488034248352, + "learning_rate": 0.00045615677521932785, + "loss": 3.142, + "step": 19940 + }, + { + "epoch": 0.98, + "grad_norm": 0.574743390083313, + "learning_rate": 0.00045614362900881053, + "loss": 3.1598, + "step": 19941 + }, + { + "epoch": 0.98, + "grad_norm": 0.5598559379577637, + "learning_rate": 0.00045613048238704086, + "loss": 3.0253, + "step": 19942 + }, + { + "epoch": 0.98, + "grad_norm": 0.498945027589798, + "learning_rate": 0.0004561173353540535, + "loss": 3.3936, + "step": 19943 + }, + { + "epoch": 0.98, + "grad_norm": 0.5641855597496033, + "learning_rate": 0.0004561041879098832, + "loss": 3.4009, + "step": 19944 + }, + { + "epoch": 0.98, + "grad_norm": 0.5260211825370789, + "learning_rate": 0.0004560910400545645, + "loss": 3.2477, + "step": 19945 + }, + { + "epoch": 0.98, + "grad_norm": 0.5271660089492798, + "learning_rate": 0.000456077891788132, + "loss": 3.2298, + "step": 19946 + }, + { + "epoch": 0.98, + "grad_norm": 0.499926894903183, + "learning_rate": 0.00045606474311062035, + "loss": 3.2045, + "step": 19947 + }, + { + "epoch": 0.98, + "grad_norm": 0.5305925011634827, + "learning_rate": 0.00045605159402206435, + "loss": 3.1442, + "step": 19948 + }, + { + "epoch": 0.98, + "grad_norm": 0.5455648303031921, + "learning_rate": 0.0004560384445224984, + "loss": 3.0783, + "step": 19949 + }, + { + "epoch": 0.98, + "grad_norm": 0.5389901995658875, + "learning_rate": 0.0004560252946119571, + "loss": 3.148, + "step": 19950 + }, + { + "epoch": 0.98, + "grad_norm": 0.5463103652000427, + "learning_rate": 0.00045601214429047534, + "loss": 2.9844, + "step": 19951 + }, + { + "epoch": 0.98, + "grad_norm": 0.5185086131095886, + "learning_rate": 0.00045599899355808763, + "loss": 3.1557, + "step": 19952 + }, + { + "epoch": 0.98, + "grad_norm": 0.5186192393302917, + "learning_rate": 0.0004559858424148285, + "loss": 3.2037, + "step": 19953 + }, + { + "epoch": 0.98, + "grad_norm": 0.5496533513069153, + "learning_rate": 0.00045597269086073273, + "loss": 3.1717, + "step": 19954 + }, + { + "epoch": 0.98, + "grad_norm": 0.5932478308677673, + "learning_rate": 0.00045595953889583483, + "loss": 2.9012, + "step": 19955 + }, + { + "epoch": 0.98, + "grad_norm": 0.5475051403045654, + "learning_rate": 0.0004559463865201696, + "loss": 3.2653, + "step": 19956 + }, + { + "epoch": 0.98, + "grad_norm": 0.5264762043952942, + "learning_rate": 0.0004559332337337716, + "loss": 3.2941, + "step": 19957 + }, + { + "epoch": 0.98, + "grad_norm": 0.503323495388031, + "learning_rate": 0.0004559200805366755, + "loss": 3.1422, + "step": 19958 + }, + { + "epoch": 0.98, + "grad_norm": 0.5633005499839783, + "learning_rate": 0.0004559069269289159, + "loss": 3.2524, + "step": 19959 + }, + { + "epoch": 0.98, + "grad_norm": 0.5244492888450623, + "learning_rate": 0.0004558937729105274, + "loss": 3.1133, + "step": 19960 + }, + { + "epoch": 0.98, + "grad_norm": 0.5386800765991211, + "learning_rate": 0.00045588061848154477, + "loss": 3.2379, + "step": 19961 + }, + { + "epoch": 0.98, + "grad_norm": 0.5170870423316956, + "learning_rate": 0.00045586746364200255, + "loss": 3.2358, + "step": 19962 + }, + { + "epoch": 0.98, + "grad_norm": 0.561007559299469, + "learning_rate": 0.00045585430839193547, + "loss": 2.9753, + "step": 19963 + }, + { + "epoch": 0.98, + "grad_norm": 0.4810415208339691, + "learning_rate": 0.00045584115273137807, + "loss": 3.1482, + "step": 19964 + }, + { + "epoch": 0.98, + "grad_norm": 0.6551200747489929, + "learning_rate": 0.0004558279966603651, + "loss": 2.9541, + "step": 19965 + }, + { + "epoch": 0.98, + "grad_norm": 0.5145920515060425, + "learning_rate": 0.00045581484017893117, + "loss": 3.125, + "step": 19966 + }, + { + "epoch": 0.98, + "grad_norm": 0.4966809153556824, + "learning_rate": 0.000455801683287111, + "loss": 3.098, + "step": 19967 + }, + { + "epoch": 0.98, + "grad_norm": 0.5068411827087402, + "learning_rate": 0.00045578852598493914, + "loss": 2.9166, + "step": 19968 + }, + { + "epoch": 0.98, + "grad_norm": 0.5475143790245056, + "learning_rate": 0.00045577536827245025, + "loss": 2.9523, + "step": 19969 + }, + { + "epoch": 0.98, + "grad_norm": 0.5503715872764587, + "learning_rate": 0.000455762210149679, + "loss": 3.1146, + "step": 19970 + }, + { + "epoch": 0.98, + "grad_norm": 0.5356963276863098, + "learning_rate": 0.00045574905161666014, + "loss": 2.8835, + "step": 19971 + }, + { + "epoch": 0.98, + "grad_norm": 0.5304535627365112, + "learning_rate": 0.0004557358926734282, + "loss": 3.2342, + "step": 19972 + }, + { + "epoch": 0.98, + "grad_norm": 0.511310875415802, + "learning_rate": 0.0004557227333200179, + "loss": 3.2055, + "step": 19973 + }, + { + "epoch": 0.98, + "grad_norm": 0.4973483085632324, + "learning_rate": 0.000455709573556464, + "loss": 3.2209, + "step": 19974 + }, + { + "epoch": 0.98, + "grad_norm": 0.5432186126708984, + "learning_rate": 0.00045569641338280087, + "loss": 2.9227, + "step": 19975 + }, + { + "epoch": 0.98, + "grad_norm": 0.5128586292266846, + "learning_rate": 0.00045568325279906344, + "loss": 3.4699, + "step": 19976 + }, + { + "epoch": 0.98, + "grad_norm": 0.5524463057518005, + "learning_rate": 0.0004556700918052864, + "loss": 3.1689, + "step": 19977 + }, + { + "epoch": 0.98, + "grad_norm": 0.5236881375312805, + "learning_rate": 0.0004556569304015041, + "loss": 3.2393, + "step": 19978 + }, + { + "epoch": 0.98, + "grad_norm": 0.5629315972328186, + "learning_rate": 0.0004556437685877515, + "loss": 3.2885, + "step": 19979 + }, + { + "epoch": 0.98, + "grad_norm": 0.5505418181419373, + "learning_rate": 0.0004556306063640632, + "loss": 3.1929, + "step": 19980 + }, + { + "epoch": 0.98, + "grad_norm": 0.5632915496826172, + "learning_rate": 0.0004556174437304737, + "loss": 3.0301, + "step": 19981 + }, + { + "epoch": 0.98, + "grad_norm": 0.5991239547729492, + "learning_rate": 0.00045560428068701787, + "loss": 3.2244, + "step": 19982 + }, + { + "epoch": 0.98, + "grad_norm": 0.5407932996749878, + "learning_rate": 0.00045559111723373036, + "loss": 3.239, + "step": 19983 + }, + { + "epoch": 0.98, + "grad_norm": 0.5531562566757202, + "learning_rate": 0.00045557795337064575, + "loss": 3.3026, + "step": 19984 + }, + { + "epoch": 0.98, + "grad_norm": 0.5307109951972961, + "learning_rate": 0.00045556478909779876, + "loss": 3.2586, + "step": 19985 + }, + { + "epoch": 0.98, + "grad_norm": 0.515917181968689, + "learning_rate": 0.000455551624415224, + "loss": 3.0974, + "step": 19986 + }, + { + "epoch": 0.98, + "grad_norm": 0.5192380547523499, + "learning_rate": 0.0004555384593229562, + "loss": 3.2065, + "step": 19987 + }, + { + "epoch": 0.98, + "grad_norm": 0.523467481136322, + "learning_rate": 0.00045552529382103025, + "loss": 3.0622, + "step": 19988 + }, + { + "epoch": 0.98, + "grad_norm": 0.5278224945068359, + "learning_rate": 0.00045551212790948036, + "loss": 3.1597, + "step": 19989 + }, + { + "epoch": 0.98, + "grad_norm": 0.5236921310424805, + "learning_rate": 0.00045549896158834154, + "loss": 3.0244, + "step": 19990 + }, + { + "epoch": 0.98, + "grad_norm": 0.5542269945144653, + "learning_rate": 0.0004554857948576483, + "loss": 3.2861, + "step": 19991 + }, + { + "epoch": 0.98, + "grad_norm": 0.5261695981025696, + "learning_rate": 0.00045547262771743555, + "loss": 3.0632, + "step": 19992 + }, + { + "epoch": 0.98, + "grad_norm": 0.530767023563385, + "learning_rate": 0.00045545946016773775, + "loss": 3.2265, + "step": 19993 + }, + { + "epoch": 0.98, + "grad_norm": 0.5161399841308594, + "learning_rate": 0.00045544629220858966, + "loss": 3.2562, + "step": 19994 + }, + { + "epoch": 0.98, + "grad_norm": 0.4969814717769623, + "learning_rate": 0.00045543312384002595, + "loss": 3.0832, + "step": 19995 + }, + { + "epoch": 0.98, + "grad_norm": 0.5116978287696838, + "learning_rate": 0.0004554199550620812, + "loss": 3.15, + "step": 19996 + }, + { + "epoch": 0.98, + "grad_norm": 0.5162671208381653, + "learning_rate": 0.00045540678587479037, + "loss": 3.2172, + "step": 19997 + }, + { + "epoch": 0.98, + "grad_norm": 0.5306782722473145, + "learning_rate": 0.0004553936162781879, + "loss": 3.1452, + "step": 19998 + }, + { + "epoch": 0.98, + "grad_norm": 0.535821259021759, + "learning_rate": 0.0004553804462723086, + "loss": 3.2178, + "step": 19999 + }, + { + "epoch": 0.98, + "grad_norm": 0.5315077900886536, + "learning_rate": 0.00045536727585718706, + "loss": 3.2123, + "step": 20000 + }, + { + "epoch": 0.98, + "grad_norm": 0.522333025932312, + "learning_rate": 0.0004553541050328579, + "loss": 3.0348, + "step": 20001 + }, + { + "epoch": 0.98, + "grad_norm": 0.5301592350006104, + "learning_rate": 0.000455340933799356, + "loss": 3.1113, + "step": 20002 + }, + { + "epoch": 0.98, + "grad_norm": 0.584934651851654, + "learning_rate": 0.0004553277621567161, + "loss": 3.18, + "step": 20003 + }, + { + "epoch": 0.98, + "grad_norm": 0.6135393977165222, + "learning_rate": 0.0004553145901049727, + "loss": 3.1347, + "step": 20004 + }, + { + "epoch": 0.98, + "grad_norm": 0.5235838890075684, + "learning_rate": 0.0004553014176441605, + "loss": 2.9696, + "step": 20005 + }, + { + "epoch": 0.98, + "grad_norm": 0.5571325421333313, + "learning_rate": 0.0004552882447743143, + "loss": 3.215, + "step": 20006 + }, + { + "epoch": 0.98, + "grad_norm": 0.5705804824829102, + "learning_rate": 0.0004552750714954688, + "loss": 3.2719, + "step": 20007 + }, + { + "epoch": 0.98, + "grad_norm": 0.5331918597221375, + "learning_rate": 0.00045526189780765856, + "loss": 3.2195, + "step": 20008 + }, + { + "epoch": 0.98, + "grad_norm": 0.5457714200019836, + "learning_rate": 0.00045524872371091834, + "loss": 3.2391, + "step": 20009 + }, + { + "epoch": 0.98, + "grad_norm": 0.5504556894302368, + "learning_rate": 0.000455235549205283, + "loss": 3.2419, + "step": 20010 + }, + { + "epoch": 0.98, + "grad_norm": 0.49661940336227417, + "learning_rate": 0.000455222374290787, + "loss": 3.1479, + "step": 20011 + }, + { + "epoch": 0.98, + "grad_norm": 0.5197951197624207, + "learning_rate": 0.00045520919896746516, + "loss": 3.0056, + "step": 20012 + }, + { + "epoch": 0.98, + "grad_norm": 0.5557920932769775, + "learning_rate": 0.00045519602323535206, + "loss": 3.2958, + "step": 20013 + }, + { + "epoch": 0.98, + "grad_norm": 0.53912353515625, + "learning_rate": 0.0004551828470944827, + "loss": 2.8655, + "step": 20014 + }, + { + "epoch": 0.98, + "grad_norm": 0.5061671137809753, + "learning_rate": 0.0004551696705448915, + "loss": 3.2487, + "step": 20015 + }, + { + "epoch": 0.98, + "grad_norm": 0.5309056639671326, + "learning_rate": 0.00045515649358661317, + "loss": 3.1987, + "step": 20016 + }, + { + "epoch": 0.98, + "grad_norm": 0.5490389466285706, + "learning_rate": 0.0004551433162196826, + "loss": 3.084, + "step": 20017 + }, + { + "epoch": 0.98, + "grad_norm": 0.5171396136283875, + "learning_rate": 0.00045513013844413435, + "loss": 3.2162, + "step": 20018 + }, + { + "epoch": 0.98, + "grad_norm": 0.5325402617454529, + "learning_rate": 0.00045511696026000317, + "loss": 3.0539, + "step": 20019 + }, + { + "epoch": 0.98, + "grad_norm": 0.5274950861930847, + "learning_rate": 0.00045510378166732375, + "loss": 3.3672, + "step": 20020 + }, + { + "epoch": 0.98, + "grad_norm": 0.5318315625190735, + "learning_rate": 0.0004550906026661309, + "loss": 3.0185, + "step": 20021 + }, + { + "epoch": 0.98, + "grad_norm": 0.5088624358177185, + "learning_rate": 0.00045507742325645914, + "loss": 3.1083, + "step": 20022 + }, + { + "epoch": 0.98, + "grad_norm": 0.5656629800796509, + "learning_rate": 0.0004550642434383433, + "loss": 2.9348, + "step": 20023 + }, + { + "epoch": 0.98, + "grad_norm": 0.5793680548667908, + "learning_rate": 0.0004550510632118182, + "loss": 3.0635, + "step": 20024 + }, + { + "epoch": 0.98, + "grad_norm": 0.5440924167633057, + "learning_rate": 0.0004550378825769184, + "loss": 2.9255, + "step": 20025 + }, + { + "epoch": 0.98, + "grad_norm": 0.49890273809432983, + "learning_rate": 0.0004550247015336786, + "loss": 3.2322, + "step": 20026 + }, + { + "epoch": 0.98, + "grad_norm": 0.541521430015564, + "learning_rate": 0.0004550115200821335, + "loss": 2.9845, + "step": 20027 + }, + { + "epoch": 0.98, + "grad_norm": 0.5471722483634949, + "learning_rate": 0.00045499833822231807, + "loss": 3.002, + "step": 20028 + }, + { + "epoch": 0.98, + "grad_norm": 0.5366112589836121, + "learning_rate": 0.0004549851559542668, + "loss": 3.1757, + "step": 20029 + }, + { + "epoch": 0.98, + "grad_norm": 0.5042491555213928, + "learning_rate": 0.0004549719732780143, + "loss": 3.3144, + "step": 20030 + }, + { + "epoch": 0.98, + "grad_norm": 0.5647759437561035, + "learning_rate": 0.0004549587901935957, + "loss": 2.917, + "step": 20031 + }, + { + "epoch": 0.98, + "grad_norm": 0.5422675013542175, + "learning_rate": 0.00045494560670104525, + "loss": 3.0393, + "step": 20032 + }, + { + "epoch": 0.98, + "grad_norm": 0.532295823097229, + "learning_rate": 0.000454932422800398, + "loss": 3.0517, + "step": 20033 + }, + { + "epoch": 0.98, + "grad_norm": 0.5531377792358398, + "learning_rate": 0.00045491923849168856, + "loss": 3.1039, + "step": 20034 + }, + { + "epoch": 0.98, + "grad_norm": 0.4993976056575775, + "learning_rate": 0.0004549060537749516, + "loss": 2.9075, + "step": 20035 + }, + { + "epoch": 0.98, + "grad_norm": 0.5766929984092712, + "learning_rate": 0.00045489286865022205, + "loss": 3.448, + "step": 20036 + }, + { + "epoch": 0.98, + "grad_norm": 0.525661289691925, + "learning_rate": 0.00045487968311753427, + "loss": 3.204, + "step": 20037 + }, + { + "epoch": 0.98, + "grad_norm": 0.5298983454704285, + "learning_rate": 0.0004548664971769234, + "loss": 3.1754, + "step": 20038 + }, + { + "epoch": 0.98, + "grad_norm": 0.5623906254768372, + "learning_rate": 0.000454853310828424, + "loss": 3.2299, + "step": 20039 + }, + { + "epoch": 0.98, + "grad_norm": 0.5601446032524109, + "learning_rate": 0.0004548401240720706, + "loss": 3.135, + "step": 20040 + }, + { + "epoch": 0.98, + "grad_norm": 0.5393292307853699, + "learning_rate": 0.0004548269369078982, + "loss": 3.1141, + "step": 20041 + }, + { + "epoch": 0.98, + "grad_norm": 0.5087615847587585, + "learning_rate": 0.0004548137493359414, + "loss": 2.9242, + "step": 20042 + }, + { + "epoch": 0.98, + "grad_norm": 0.5590735673904419, + "learning_rate": 0.00045480056135623515, + "loss": 3.1836, + "step": 20043 + }, + { + "epoch": 0.98, + "grad_norm": 0.5152249932289124, + "learning_rate": 0.00045478737296881383, + "loss": 3.0764, + "step": 20044 + }, + { + "epoch": 0.98, + "grad_norm": 0.551261842250824, + "learning_rate": 0.0004547741841737125, + "loss": 3.1649, + "step": 20045 + }, + { + "epoch": 0.98, + "grad_norm": 0.5181664824485779, + "learning_rate": 0.0004547609949709656, + "loss": 3.231, + "step": 20046 + }, + { + "epoch": 0.98, + "grad_norm": 0.5038676857948303, + "learning_rate": 0.00045474780536060815, + "loss": 3.141, + "step": 20047 + }, + { + "epoch": 0.98, + "grad_norm": 0.5354689359664917, + "learning_rate": 0.00045473461534267475, + "loss": 3.2463, + "step": 20048 + }, + { + "epoch": 0.98, + "grad_norm": 0.5693992972373962, + "learning_rate": 0.00045472142491720006, + "loss": 3.0521, + "step": 20049 + }, + { + "epoch": 0.98, + "grad_norm": 0.532825767993927, + "learning_rate": 0.00045470823408421903, + "loss": 3.1193, + "step": 20050 + }, + { + "epoch": 0.98, + "grad_norm": 0.5376405715942383, + "learning_rate": 0.0004546950428437662, + "loss": 3.2254, + "step": 20051 + }, + { + "epoch": 0.98, + "grad_norm": 0.5095952749252319, + "learning_rate": 0.00045468185119587644, + "loss": 2.8989, + "step": 20052 + }, + { + "epoch": 0.98, + "grad_norm": 0.5750041604042053, + "learning_rate": 0.00045466865914058443, + "loss": 3.045, + "step": 20053 + }, + { + "epoch": 0.98, + "grad_norm": 0.5071499347686768, + "learning_rate": 0.00045465546667792497, + "loss": 3.0452, + "step": 20054 + }, + { + "epoch": 0.98, + "grad_norm": 0.49465328454971313, + "learning_rate": 0.0004546422738079327, + "loss": 3.1946, + "step": 20055 + }, + { + "epoch": 0.98, + "grad_norm": 0.5186102986335754, + "learning_rate": 0.0004546290805306426, + "loss": 3.2024, + "step": 20056 + }, + { + "epoch": 0.98, + "grad_norm": 0.5584270358085632, + "learning_rate": 0.00045461588684608914, + "loss": 3.0319, + "step": 20057 + }, + { + "epoch": 0.98, + "grad_norm": 0.5318349003791809, + "learning_rate": 0.0004546026927543072, + "loss": 2.9715, + "step": 20058 + }, + { + "epoch": 0.98, + "grad_norm": 0.5026640295982361, + "learning_rate": 0.0004545894982553315, + "loss": 3.2427, + "step": 20059 + }, + { + "epoch": 0.98, + "grad_norm": 0.5623165965080261, + "learning_rate": 0.0004545763033491968, + "loss": 3.2415, + "step": 20060 + }, + { + "epoch": 0.98, + "grad_norm": 0.5373814702033997, + "learning_rate": 0.000454563108035938, + "loss": 3.3984, + "step": 20061 + }, + { + "epoch": 0.98, + "grad_norm": 0.5032324194908142, + "learning_rate": 0.00045454991231558967, + "loss": 3.3189, + "step": 20062 + }, + { + "epoch": 0.98, + "grad_norm": 0.5357590913772583, + "learning_rate": 0.00045453671618818646, + "loss": 3.0099, + "step": 20063 + }, + { + "epoch": 0.98, + "grad_norm": 0.5333057641983032, + "learning_rate": 0.00045452351965376335, + "loss": 3.1873, + "step": 20064 + }, + { + "epoch": 0.98, + "grad_norm": 0.5362182855606079, + "learning_rate": 0.0004545103227123552, + "loss": 3.1468, + "step": 20065 + }, + { + "epoch": 0.98, + "grad_norm": 0.5074638724327087, + "learning_rate": 0.0004544971253639964, + "loss": 3.0841, + "step": 20066 + }, + { + "epoch": 0.98, + "grad_norm": 0.5193195343017578, + "learning_rate": 0.000454483927608722, + "loss": 2.9173, + "step": 20067 + }, + { + "epoch": 0.98, + "grad_norm": 0.5239328742027283, + "learning_rate": 0.0004544707294465667, + "loss": 3.2981, + "step": 20068 + }, + { + "epoch": 0.98, + "grad_norm": 0.5404732823371887, + "learning_rate": 0.00045445753087756507, + "loss": 3.056, + "step": 20069 + }, + { + "epoch": 0.98, + "grad_norm": 0.6814656257629395, + "learning_rate": 0.0004544443319017521, + "loss": 3.009, + "step": 20070 + }, + { + "epoch": 0.98, + "grad_norm": 0.5465784668922424, + "learning_rate": 0.0004544311325191625, + "loss": 3.3474, + "step": 20071 + }, + { + "epoch": 0.98, + "grad_norm": 0.5160285234451294, + "learning_rate": 0.00045441793272983107, + "loss": 2.8708, + "step": 20072 + }, + { + "epoch": 0.98, + "grad_norm": 0.4963335692882538, + "learning_rate": 0.00045440473253379246, + "loss": 3.2226, + "step": 20073 + }, + { + "epoch": 0.98, + "grad_norm": 0.5127677917480469, + "learning_rate": 0.00045439153193108155, + "loss": 3.2884, + "step": 20074 + }, + { + "epoch": 0.98, + "grad_norm": 0.5165018439292908, + "learning_rate": 0.000454378330921733, + "loss": 3.0943, + "step": 20075 + }, + { + "epoch": 0.98, + "grad_norm": 0.5334299206733704, + "learning_rate": 0.0004543651295057817, + "loss": 3.1644, + "step": 20076 + }, + { + "epoch": 0.98, + "grad_norm": 0.5283639430999756, + "learning_rate": 0.0004543519276832624, + "loss": 3.1569, + "step": 20077 + }, + { + "epoch": 0.98, + "grad_norm": 0.4838026762008667, + "learning_rate": 0.00045433872545420966, + "loss": 3.2105, + "step": 20078 + }, + { + "epoch": 0.98, + "grad_norm": 0.5402780175209045, + "learning_rate": 0.00045432552281865854, + "loss": 2.9989, + "step": 20079 + }, + { + "epoch": 0.98, + "grad_norm": 0.49882394075393677, + "learning_rate": 0.00045431231977664365, + "loss": 3.1154, + "step": 20080 + }, + { + "epoch": 0.98, + "grad_norm": 0.5234292149543762, + "learning_rate": 0.00045429911632819986, + "loss": 2.9596, + "step": 20081 + }, + { + "epoch": 0.98, + "grad_norm": 0.5079872608184814, + "learning_rate": 0.00045428591247336186, + "loss": 3.0581, + "step": 20082 + }, + { + "epoch": 0.98, + "grad_norm": 0.6135797500610352, + "learning_rate": 0.0004542727082121644, + "loss": 3.2326, + "step": 20083 + }, + { + "epoch": 0.98, + "grad_norm": 0.5245426893234253, + "learning_rate": 0.0004542595035446424, + "loss": 3.0909, + "step": 20084 + }, + { + "epoch": 0.98, + "grad_norm": 0.5490965247154236, + "learning_rate": 0.0004542462984708305, + "loss": 3.227, + "step": 20085 + }, + { + "epoch": 0.98, + "grad_norm": 0.5142471790313721, + "learning_rate": 0.0004542330929907636, + "loss": 3.2628, + "step": 20086 + }, + { + "epoch": 0.98, + "grad_norm": 0.5162678360939026, + "learning_rate": 0.0004542198871044764, + "loss": 2.9272, + "step": 20087 + }, + { + "epoch": 0.98, + "grad_norm": 0.507563591003418, + "learning_rate": 0.00045420668081200364, + "loss": 3.107, + "step": 20088 + }, + { + "epoch": 0.98, + "grad_norm": 0.49381065368652344, + "learning_rate": 0.00045419347411338015, + "loss": 3.141, + "step": 20089 + }, + { + "epoch": 0.98, + "grad_norm": 0.5393127202987671, + "learning_rate": 0.00045418026700864083, + "loss": 3.4015, + "step": 20090 + }, + { + "epoch": 0.98, + "grad_norm": 0.5078210830688477, + "learning_rate": 0.00045416705949782036, + "loss": 3.0461, + "step": 20091 + }, + { + "epoch": 0.98, + "grad_norm": 0.5738405585289001, + "learning_rate": 0.00045415385158095343, + "loss": 3.1799, + "step": 20092 + }, + { + "epoch": 0.98, + "grad_norm": 0.5566117167472839, + "learning_rate": 0.00045414064325807497, + "loss": 3.0548, + "step": 20093 + }, + { + "epoch": 0.98, + "grad_norm": 0.5047556161880493, + "learning_rate": 0.0004541274345292197, + "loss": 3.0273, + "step": 20094 + }, + { + "epoch": 0.98, + "grad_norm": 0.5348913073539734, + "learning_rate": 0.00045411422539442246, + "loss": 3.15, + "step": 20095 + }, + { + "epoch": 0.98, + "grad_norm": 0.5044894218444824, + "learning_rate": 0.0004541010158537179, + "loss": 3.1082, + "step": 20096 + }, + { + "epoch": 0.98, + "grad_norm": 0.5232664942741394, + "learning_rate": 0.00045408780590714103, + "loss": 3.1608, + "step": 20097 + }, + { + "epoch": 0.98, + "grad_norm": 0.5661266446113586, + "learning_rate": 0.00045407459555472647, + "loss": 3.3411, + "step": 20098 + }, + { + "epoch": 0.99, + "grad_norm": 0.5234664082527161, + "learning_rate": 0.0004540613847965092, + "loss": 3.2362, + "step": 20099 + }, + { + "epoch": 0.99, + "grad_norm": 0.5506570935249329, + "learning_rate": 0.0004540481736325238, + "loss": 3.1079, + "step": 20100 + }, + { + "epoch": 0.99, + "grad_norm": 0.5332557559013367, + "learning_rate": 0.00045403496206280515, + "loss": 3.0901, + "step": 20101 + }, + { + "epoch": 0.99, + "grad_norm": 0.5116790533065796, + "learning_rate": 0.0004540217500873881, + "loss": 3.0163, + "step": 20102 + }, + { + "epoch": 0.99, + "grad_norm": 0.4963262677192688, + "learning_rate": 0.00045400853770630737, + "loss": 3.1048, + "step": 20103 + }, + { + "epoch": 0.99, + "grad_norm": 0.5719655752182007, + "learning_rate": 0.0004539953249195978, + "loss": 3.1882, + "step": 20104 + }, + { + "epoch": 0.99, + "grad_norm": 0.6064972877502441, + "learning_rate": 0.0004539821117272942, + "loss": 3.1947, + "step": 20105 + }, + { + "epoch": 0.99, + "grad_norm": 0.5541272163391113, + "learning_rate": 0.00045396889812943136, + "loss": 3.1424, + "step": 20106 + }, + { + "epoch": 0.99, + "grad_norm": 0.4937479496002197, + "learning_rate": 0.000453955684126044, + "loss": 3.3325, + "step": 20107 + }, + { + "epoch": 0.99, + "grad_norm": 0.540358304977417, + "learning_rate": 0.0004539424697171671, + "loss": 3.3724, + "step": 20108 + }, + { + "epoch": 0.99, + "grad_norm": 0.5559720396995544, + "learning_rate": 0.0004539292549028352, + "loss": 2.9808, + "step": 20109 + }, + { + "epoch": 0.99, + "grad_norm": 0.5821501612663269, + "learning_rate": 0.0004539160396830834, + "loss": 3.2358, + "step": 20110 + }, + { + "epoch": 0.99, + "grad_norm": 0.5225769281387329, + "learning_rate": 0.00045390282405794634, + "loss": 3.2651, + "step": 20111 + }, + { + "epoch": 0.99, + "grad_norm": 0.50220787525177, + "learning_rate": 0.0004538896080274589, + "loss": 3.1423, + "step": 20112 + }, + { + "epoch": 0.99, + "grad_norm": 0.5386639833450317, + "learning_rate": 0.0004538763915916559, + "loss": 3.2382, + "step": 20113 + }, + { + "epoch": 0.99, + "grad_norm": 0.5450523495674133, + "learning_rate": 0.0004538631747505719, + "loss": 3.2255, + "step": 20114 + }, + { + "epoch": 0.99, + "grad_norm": 0.52878737449646, + "learning_rate": 0.000453849957504242, + "loss": 2.9539, + "step": 20115 + }, + { + "epoch": 0.99, + "grad_norm": 0.5469775199890137, + "learning_rate": 0.000453836739852701, + "loss": 2.9554, + "step": 20116 + }, + { + "epoch": 0.99, + "grad_norm": 0.5183457136154175, + "learning_rate": 0.0004538235217959836, + "loss": 3.0908, + "step": 20117 + }, + { + "epoch": 0.99, + "grad_norm": 0.5041838884353638, + "learning_rate": 0.0004538103033341246, + "loss": 3.1683, + "step": 20118 + }, + { + "epoch": 0.99, + "grad_norm": 0.5320984125137329, + "learning_rate": 0.00045379708446715894, + "loss": 3.1186, + "step": 20119 + }, + { + "epoch": 0.99, + "grad_norm": 0.5572245717048645, + "learning_rate": 0.00045378386519512125, + "loss": 3.2272, + "step": 20120 + }, + { + "epoch": 0.99, + "grad_norm": 0.6518499255180359, + "learning_rate": 0.0004537706455180465, + "loss": 3.1933, + "step": 20121 + }, + { + "epoch": 0.99, + "grad_norm": 0.5835242867469788, + "learning_rate": 0.0004537574254359695, + "loss": 3.2654, + "step": 20122 + }, + { + "epoch": 0.99, + "grad_norm": 0.5642088055610657, + "learning_rate": 0.0004537442049489251, + "loss": 3.2556, + "step": 20123 + }, + { + "epoch": 0.99, + "grad_norm": 0.5181591510772705, + "learning_rate": 0.00045373098405694786, + "loss": 3.1488, + "step": 20124 + }, + { + "epoch": 0.99, + "grad_norm": 0.5602161884307861, + "learning_rate": 0.0004537177627600729, + "loss": 3.0757, + "step": 20125 + }, + { + "epoch": 0.99, + "grad_norm": 0.5879859328269958, + "learning_rate": 0.00045370454105833494, + "loss": 3.2037, + "step": 20126 + }, + { + "epoch": 0.99, + "grad_norm": 0.5683193802833557, + "learning_rate": 0.00045369131895176885, + "loss": 2.9962, + "step": 20127 + }, + { + "epoch": 0.99, + "grad_norm": 0.5079681873321533, + "learning_rate": 0.0004536780964404093, + "loss": 3.0535, + "step": 20128 + }, + { + "epoch": 0.99, + "grad_norm": 0.5396766662597656, + "learning_rate": 0.0004536648735242912, + "loss": 3.1192, + "step": 20129 + }, + { + "epoch": 0.99, + "grad_norm": 0.5396043658256531, + "learning_rate": 0.0004536516502034495, + "loss": 3.2537, + "step": 20130 + }, + { + "epoch": 0.99, + "grad_norm": 0.5514666438102722, + "learning_rate": 0.0004536384264779189, + "loss": 2.9611, + "step": 20131 + }, + { + "epoch": 0.99, + "grad_norm": 0.5507171750068665, + "learning_rate": 0.0004536252023477343, + "loss": 3.2089, + "step": 20132 + }, + { + "epoch": 0.99, + "grad_norm": 0.5136435627937317, + "learning_rate": 0.0004536119778129304, + "loss": 3.0577, + "step": 20133 + }, + { + "epoch": 0.99, + "grad_norm": 0.5072475671768188, + "learning_rate": 0.00045359875287354214, + "loss": 3.2917, + "step": 20134 + }, + { + "epoch": 0.99, + "grad_norm": 0.5063516497612, + "learning_rate": 0.00045358552752960426, + "loss": 3.2465, + "step": 20135 + }, + { + "epoch": 0.99, + "grad_norm": 0.545147716999054, + "learning_rate": 0.00045357230178115173, + "loss": 3.042, + "step": 20136 + }, + { + "epoch": 0.99, + "grad_norm": 0.5409039855003357, + "learning_rate": 0.0004535590756282193, + "loss": 3.3088, + "step": 20137 + }, + { + "epoch": 0.99, + "grad_norm": 0.5154100656509399, + "learning_rate": 0.0004535458490708419, + "loss": 3.2658, + "step": 20138 + }, + { + "epoch": 0.99, + "grad_norm": 0.5560367703437805, + "learning_rate": 0.0004535326221090542, + "loss": 3.0783, + "step": 20139 + }, + { + "epoch": 0.99, + "grad_norm": 0.5108457207679749, + "learning_rate": 0.0004535193947428911, + "loss": 3.1656, + "step": 20140 + }, + { + "epoch": 0.99, + "grad_norm": 0.5491657257080078, + "learning_rate": 0.0004535061669723875, + "loss": 3.3704, + "step": 20141 + }, + { + "epoch": 0.99, + "grad_norm": 0.5991771221160889, + "learning_rate": 0.00045349293879757814, + "loss": 3.3599, + "step": 20142 + }, + { + "epoch": 0.99, + "grad_norm": 0.530377209186554, + "learning_rate": 0.00045347971021849796, + "loss": 3.3084, + "step": 20143 + }, + { + "epoch": 0.99, + "grad_norm": 0.546180009841919, + "learning_rate": 0.0004534664812351818, + "loss": 3.4132, + "step": 20144 + }, + { + "epoch": 0.99, + "grad_norm": 0.5626497864723206, + "learning_rate": 0.0004534532518476644, + "loss": 3.2105, + "step": 20145 + }, + { + "epoch": 0.99, + "grad_norm": 0.508690357208252, + "learning_rate": 0.00045344002205598074, + "loss": 3.274, + "step": 20146 + }, + { + "epoch": 0.99, + "grad_norm": 0.5522834062576294, + "learning_rate": 0.00045342679186016554, + "loss": 3.0697, + "step": 20147 + }, + { + "epoch": 0.99, + "grad_norm": 0.5134068727493286, + "learning_rate": 0.00045341356126025366, + "loss": 3.2479, + "step": 20148 + }, + { + "epoch": 0.99, + "grad_norm": 0.5544729232788086, + "learning_rate": 0.00045340033025628013, + "loss": 3.0962, + "step": 20149 + }, + { + "epoch": 0.99, + "grad_norm": 0.5248307585716248, + "learning_rate": 0.00045338709884827947, + "loss": 3.262, + "step": 20150 + }, + { + "epoch": 0.99, + "grad_norm": 0.5262455344200134, + "learning_rate": 0.00045337386703628676, + "loss": 3.1155, + "step": 20151 + }, + { + "epoch": 0.99, + "grad_norm": 0.5151076912879944, + "learning_rate": 0.00045336063482033696, + "loss": 3.0329, + "step": 20152 + }, + { + "epoch": 0.99, + "grad_norm": 0.5224588513374329, + "learning_rate": 0.00045334740220046455, + "loss": 3.1338, + "step": 20153 + }, + { + "epoch": 0.99, + "grad_norm": 0.5186708569526672, + "learning_rate": 0.0004533341691767047, + "loss": 3.0215, + "step": 20154 + }, + { + "epoch": 0.99, + "grad_norm": 0.5381702780723572, + "learning_rate": 0.0004533209357490921, + "loss": 3.2761, + "step": 20155 + }, + { + "epoch": 0.99, + "grad_norm": 0.5323552489280701, + "learning_rate": 0.00045330770191766176, + "loss": 3.1807, + "step": 20156 + }, + { + "epoch": 0.99, + "grad_norm": 0.5015830993652344, + "learning_rate": 0.0004532944676824484, + "loss": 3.0101, + "step": 20157 + }, + { + "epoch": 0.99, + "grad_norm": 0.5259556770324707, + "learning_rate": 0.0004532812330434869, + "loss": 3.1135, + "step": 20158 + }, + { + "epoch": 0.99, + "grad_norm": 0.544632613658905, + "learning_rate": 0.00045326799800081213, + "loss": 3.2611, + "step": 20159 + }, + { + "epoch": 0.99, + "grad_norm": 0.5256531834602356, + "learning_rate": 0.00045325476255445886, + "loss": 3.0899, + "step": 20160 + }, + { + "epoch": 0.99, + "grad_norm": 0.5340846180915833, + "learning_rate": 0.00045324152670446217, + "loss": 3.2513, + "step": 20161 + }, + { + "epoch": 0.99, + "grad_norm": 0.5545790791511536, + "learning_rate": 0.00045322829045085674, + "loss": 2.9697, + "step": 20162 + }, + { + "epoch": 0.99, + "grad_norm": 0.5166133642196655, + "learning_rate": 0.00045321505379367755, + "loss": 3.2083, + "step": 20163 + }, + { + "epoch": 0.99, + "grad_norm": 0.5389953851699829, + "learning_rate": 0.00045320181673295934, + "loss": 2.8237, + "step": 20164 + }, + { + "epoch": 0.99, + "grad_norm": 0.5397850871086121, + "learning_rate": 0.00045318857926873697, + "loss": 3.0537, + "step": 20165 + }, + { + "epoch": 0.99, + "grad_norm": 0.5113707780838013, + "learning_rate": 0.0004531753414010454, + "loss": 3.1937, + "step": 20166 + }, + { + "epoch": 0.99, + "grad_norm": 0.5229274034500122, + "learning_rate": 0.00045316210312991954, + "loss": 3.3204, + "step": 20167 + }, + { + "epoch": 0.99, + "grad_norm": 0.5383738279342651, + "learning_rate": 0.00045314886445539415, + "loss": 3.3542, + "step": 20168 + }, + { + "epoch": 0.99, + "grad_norm": 0.5492693185806274, + "learning_rate": 0.00045313562537750403, + "loss": 3.1827, + "step": 20169 + }, + { + "epoch": 0.99, + "grad_norm": 0.5596176385879517, + "learning_rate": 0.00045312238589628425, + "loss": 3.1018, + "step": 20170 + }, + { + "epoch": 0.99, + "grad_norm": 0.5374123454093933, + "learning_rate": 0.00045310914601176956, + "loss": 3.2086, + "step": 20171 + }, + { + "epoch": 0.99, + "grad_norm": 0.5204119086265564, + "learning_rate": 0.0004530959057239947, + "loss": 3.0035, + "step": 20172 + }, + { + "epoch": 0.99, + "grad_norm": 0.5454662442207336, + "learning_rate": 0.0004530826650329948, + "loss": 3.2789, + "step": 20173 + }, + { + "epoch": 0.99, + "grad_norm": 0.5484021902084351, + "learning_rate": 0.00045306942393880475, + "loss": 3.1308, + "step": 20174 + }, + { + "epoch": 0.99, + "grad_norm": 0.5319493412971497, + "learning_rate": 0.00045305618244145915, + "loss": 3.2038, + "step": 20175 + }, + { + "epoch": 0.99, + "grad_norm": 0.529231071472168, + "learning_rate": 0.000453042940540993, + "loss": 3.1151, + "step": 20176 + }, + { + "epoch": 0.99, + "grad_norm": 0.5511825084686279, + "learning_rate": 0.0004530296982374412, + "loss": 3.2654, + "step": 20177 + }, + { + "epoch": 0.99, + "grad_norm": 0.5340495109558105, + "learning_rate": 0.0004530164555308388, + "loss": 3.2052, + "step": 20178 + }, + { + "epoch": 0.99, + "grad_norm": 0.5212454795837402, + "learning_rate": 0.0004530032124212203, + "loss": 3.1776, + "step": 20179 + }, + { + "epoch": 0.99, + "grad_norm": 0.4973335564136505, + "learning_rate": 0.00045298996890862087, + "loss": 3.2903, + "step": 20180 + }, + { + "epoch": 0.99, + "grad_norm": 0.5556290149688721, + "learning_rate": 0.0004529767249930753, + "loss": 3.0608, + "step": 20181 + }, + { + "epoch": 0.99, + "grad_norm": 0.5058223009109497, + "learning_rate": 0.00045296348067461846, + "loss": 3.2073, + "step": 20182 + }, + { + "epoch": 0.99, + "grad_norm": 0.5586454272270203, + "learning_rate": 0.00045295023595328526, + "loss": 3.0764, + "step": 20183 + }, + { + "epoch": 0.99, + "grad_norm": 0.5111760497093201, + "learning_rate": 0.0004529369908291106, + "loss": 2.9447, + "step": 20184 + }, + { + "epoch": 0.99, + "grad_norm": 0.5140455961227417, + "learning_rate": 0.00045292374530212935, + "loss": 3.1685, + "step": 20185 + }, + { + "epoch": 0.99, + "grad_norm": 0.5379027128219604, + "learning_rate": 0.0004529104993723763, + "loss": 3.1918, + "step": 20186 + }, + { + "epoch": 0.99, + "grad_norm": 0.4842080771923065, + "learning_rate": 0.0004528972530398864, + "loss": 3.2543, + "step": 20187 + }, + { + "epoch": 0.99, + "grad_norm": 0.5368223786354065, + "learning_rate": 0.0004528840063046947, + "loss": 3.1263, + "step": 20188 + }, + { + "epoch": 0.99, + "grad_norm": 0.5022522807121277, + "learning_rate": 0.0004528707591668359, + "loss": 3.0206, + "step": 20189 + }, + { + "epoch": 0.99, + "grad_norm": 0.5098612904548645, + "learning_rate": 0.0004528575116263449, + "loss": 2.9806, + "step": 20190 + }, + { + "epoch": 0.99, + "grad_norm": 0.5354186296463013, + "learning_rate": 0.00045284426368325664, + "loss": 3.1973, + "step": 20191 + }, + { + "epoch": 0.99, + "grad_norm": 0.5448538661003113, + "learning_rate": 0.00045283101533760613, + "loss": 3.1923, + "step": 20192 + }, + { + "epoch": 0.99, + "grad_norm": 0.5186355710029602, + "learning_rate": 0.00045281776658942795, + "loss": 3.1251, + "step": 20193 + }, + { + "epoch": 0.99, + "grad_norm": 0.5228528380393982, + "learning_rate": 0.00045280451743875727, + "loss": 3.1935, + "step": 20194 + }, + { + "epoch": 0.99, + "grad_norm": 0.5764790177345276, + "learning_rate": 0.00045279126788562885, + "loss": 3.1752, + "step": 20195 + }, + { + "epoch": 0.99, + "grad_norm": 0.5042545199394226, + "learning_rate": 0.0004527780179300777, + "loss": 3.021, + "step": 20196 + }, + { + "epoch": 0.99, + "grad_norm": 0.5068015456199646, + "learning_rate": 0.00045276476757213864, + "loss": 3.1387, + "step": 20197 + }, + { + "epoch": 0.99, + "grad_norm": 0.5311840176582336, + "learning_rate": 0.00045275151681184656, + "loss": 2.9341, + "step": 20198 + }, + { + "epoch": 0.99, + "grad_norm": 0.57582688331604, + "learning_rate": 0.0004527382656492364, + "loss": 2.6663, + "step": 20199 + }, + { + "epoch": 0.99, + "grad_norm": 0.5275920033454895, + "learning_rate": 0.00045272501408434313, + "loss": 2.8333, + "step": 20200 + }, + { + "epoch": 0.99, + "grad_norm": 0.5140002965927124, + "learning_rate": 0.00045271176211720133, + "loss": 3.1819, + "step": 20201 + }, + { + "epoch": 0.99, + "grad_norm": 0.5207107067108154, + "learning_rate": 0.0004526985097478464, + "loss": 3.1758, + "step": 20202 + }, + { + "epoch": 0.99, + "grad_norm": 0.5530972480773926, + "learning_rate": 0.0004526852569763129, + "loss": 3.134, + "step": 20203 + }, + { + "epoch": 0.99, + "grad_norm": 0.5513622164726257, + "learning_rate": 0.00045267200380263577, + "loss": 3.0947, + "step": 20204 + }, + { + "epoch": 0.99, + "grad_norm": 0.5309062004089355, + "learning_rate": 0.00045265875022685, + "loss": 3.1286, + "step": 20205 + }, + { + "epoch": 0.99, + "grad_norm": 0.5550915002822876, + "learning_rate": 0.0004526454962489904, + "loss": 3.1141, + "step": 20206 + }, + { + "epoch": 0.99, + "grad_norm": 0.5073148608207703, + "learning_rate": 0.000452632241869092, + "loss": 3.1703, + "step": 20207 + }, + { + "epoch": 0.99, + "grad_norm": 0.544578492641449, + "learning_rate": 0.00045261898708718966, + "loss": 3.1061, + "step": 20208 + }, + { + "epoch": 0.99, + "grad_norm": 0.5766841173171997, + "learning_rate": 0.0004526057319033182, + "loss": 3.0463, + "step": 20209 + }, + { + "epoch": 0.99, + "grad_norm": 0.5074905753135681, + "learning_rate": 0.00045259247631751265, + "loss": 2.9534, + "step": 20210 + }, + { + "epoch": 0.99, + "grad_norm": 0.5443254709243774, + "learning_rate": 0.00045257922032980794, + "loss": 3.3291, + "step": 20211 + }, + { + "epoch": 0.99, + "grad_norm": 0.505168080329895, + "learning_rate": 0.0004525659639402388, + "loss": 3.2416, + "step": 20212 + }, + { + "epoch": 0.99, + "grad_norm": 0.5314830541610718, + "learning_rate": 0.00045255270714884035, + "loss": 3.1842, + "step": 20213 + }, + { + "epoch": 0.99, + "grad_norm": 0.5724264979362488, + "learning_rate": 0.00045253944995564746, + "loss": 3.0492, + "step": 20214 + }, + { + "epoch": 0.99, + "grad_norm": 0.5013821721076965, + "learning_rate": 0.0004525261923606951, + "loss": 2.9629, + "step": 20215 + }, + { + "epoch": 0.99, + "grad_norm": 0.5013492703437805, + "learning_rate": 0.0004525129343640179, + "loss": 3.0972, + "step": 20216 + }, + { + "epoch": 0.99, + "grad_norm": 0.5238487124443054, + "learning_rate": 0.00045249967596565105, + "loss": 3.1335, + "step": 20217 + }, + { + "epoch": 0.99, + "grad_norm": 0.5291327238082886, + "learning_rate": 0.0004524864171656295, + "loss": 3.0083, + "step": 20218 + }, + { + "epoch": 0.99, + "grad_norm": 0.5021530985832214, + "learning_rate": 0.000452473157963988, + "loss": 3.1272, + "step": 20219 + }, + { + "epoch": 0.99, + "grad_norm": 0.5439477562904358, + "learning_rate": 0.00045245989836076154, + "loss": 3.0997, + "step": 20220 + }, + { + "epoch": 0.99, + "grad_norm": 0.528767466545105, + "learning_rate": 0.00045244663835598505, + "loss": 3.1914, + "step": 20221 + }, + { + "epoch": 0.99, + "grad_norm": 0.5145512223243713, + "learning_rate": 0.00045243337794969343, + "loss": 3.1779, + "step": 20222 + }, + { + "epoch": 0.99, + "grad_norm": 0.5444108247756958, + "learning_rate": 0.00045242011714192174, + "loss": 3.1191, + "step": 20223 + }, + { + "epoch": 0.99, + "grad_norm": 0.537213146686554, + "learning_rate": 0.00045240685593270474, + "loss": 3.0256, + "step": 20224 + }, + { + "epoch": 0.99, + "grad_norm": 0.4993702471256256, + "learning_rate": 0.00045239359432207733, + "loss": 3.2517, + "step": 20225 + }, + { + "epoch": 0.99, + "grad_norm": 0.547257125377655, + "learning_rate": 0.00045238033231007464, + "loss": 3.1549, + "step": 20226 + }, + { + "epoch": 0.99, + "grad_norm": 0.5551975965499878, + "learning_rate": 0.0004523670698967314, + "loss": 3.1891, + "step": 20227 + }, + { + "epoch": 0.99, + "grad_norm": 0.5113047361373901, + "learning_rate": 0.00045235380708208255, + "loss": 3.192, + "step": 20228 + }, + { + "epoch": 0.99, + "grad_norm": 0.5420333743095398, + "learning_rate": 0.0004523405438661633, + "loss": 3.0101, + "step": 20229 + }, + { + "epoch": 0.99, + "grad_norm": 0.5321565866470337, + "learning_rate": 0.0004523272802490083, + "loss": 2.9095, + "step": 20230 + }, + { + "epoch": 0.99, + "grad_norm": 0.5126756429672241, + "learning_rate": 0.0004523140162306525, + "loss": 3.0235, + "step": 20231 + }, + { + "epoch": 0.99, + "grad_norm": 0.4914495646953583, + "learning_rate": 0.0004523007518111309, + "loss": 3.1374, + "step": 20232 + }, + { + "epoch": 0.99, + "grad_norm": 0.501433253288269, + "learning_rate": 0.00045228748699047844, + "loss": 2.8911, + "step": 20233 + }, + { + "epoch": 0.99, + "grad_norm": 0.5397343635559082, + "learning_rate": 0.00045227422176872997, + "loss": 3.2692, + "step": 20234 + }, + { + "epoch": 0.99, + "grad_norm": 0.5279703736305237, + "learning_rate": 0.0004522609561459207, + "loss": 3.2089, + "step": 20235 + }, + { + "epoch": 0.99, + "grad_norm": 0.533789336681366, + "learning_rate": 0.00045224769012208526, + "loss": 3.0578, + "step": 20236 + }, + { + "epoch": 0.99, + "grad_norm": 0.5652415752410889, + "learning_rate": 0.0004522344236972587, + "loss": 3.2415, + "step": 20237 + }, + { + "epoch": 0.99, + "grad_norm": 0.4909104108810425, + "learning_rate": 0.0004522211568714759, + "loss": 3.1395, + "step": 20238 + }, + { + "epoch": 0.99, + "grad_norm": 0.5707433819770813, + "learning_rate": 0.000452207889644772, + "loss": 3.1975, + "step": 20239 + }, + { + "epoch": 0.99, + "grad_norm": 0.6347876191139221, + "learning_rate": 0.0004521946220171818, + "loss": 3.0497, + "step": 20240 + }, + { + "epoch": 0.99, + "grad_norm": 0.5172315835952759, + "learning_rate": 0.0004521813539887402, + "loss": 3.335, + "step": 20241 + }, + { + "epoch": 0.99, + "grad_norm": 0.563426673412323, + "learning_rate": 0.00045216808555948216, + "loss": 3.099, + "step": 20242 + }, + { + "epoch": 0.99, + "grad_norm": 0.5538248419761658, + "learning_rate": 0.0004521548167294428, + "loss": 3.2189, + "step": 20243 + }, + { + "epoch": 0.99, + "grad_norm": 0.5178148150444031, + "learning_rate": 0.0004521415474986568, + "loss": 3.0827, + "step": 20244 + }, + { + "epoch": 0.99, + "grad_norm": 0.5169044733047485, + "learning_rate": 0.00045212827786715934, + "loss": 3.1631, + "step": 20245 + }, + { + "epoch": 0.99, + "grad_norm": 0.5301280617713928, + "learning_rate": 0.0004521150078349852, + "loss": 3.2562, + "step": 20246 + }, + { + "epoch": 0.99, + "grad_norm": 0.5236168503761292, + "learning_rate": 0.00045210173740216944, + "loss": 3.1808, + "step": 20247 + }, + { + "epoch": 0.99, + "grad_norm": 0.6142783164978027, + "learning_rate": 0.00045208846656874703, + "loss": 3.0495, + "step": 20248 + }, + { + "epoch": 0.99, + "grad_norm": 0.5414308309555054, + "learning_rate": 0.00045207519533475274, + "loss": 2.9324, + "step": 20249 + }, + { + "epoch": 0.99, + "grad_norm": 0.485738068819046, + "learning_rate": 0.0004520619237002218, + "loss": 3.1258, + "step": 20250 + }, + { + "epoch": 0.99, + "grad_norm": 0.5361891388893127, + "learning_rate": 0.0004520486516651889, + "loss": 3.0218, + "step": 20251 + }, + { + "epoch": 0.99, + "grad_norm": 0.5489019155502319, + "learning_rate": 0.00045203537922968915, + "loss": 3.201, + "step": 20252 + }, + { + "epoch": 0.99, + "grad_norm": 0.519248902797699, + "learning_rate": 0.00045202210639375747, + "loss": 3.2805, + "step": 20253 + }, + { + "epoch": 0.99, + "grad_norm": 0.4982036352157593, + "learning_rate": 0.0004520088331574289, + "loss": 3.2192, + "step": 20254 + }, + { + "epoch": 0.99, + "grad_norm": 0.5829838514328003, + "learning_rate": 0.00045199555952073824, + "loss": 3.2538, + "step": 20255 + }, + { + "epoch": 0.99, + "grad_norm": 0.5190660953521729, + "learning_rate": 0.00045198228548372056, + "loss": 3.0256, + "step": 20256 + }, + { + "epoch": 0.99, + "grad_norm": 0.5045883655548096, + "learning_rate": 0.00045196901104641073, + "loss": 3.0908, + "step": 20257 + }, + { + "epoch": 0.99, + "grad_norm": 0.5397174954414368, + "learning_rate": 0.0004519557362088438, + "loss": 3.3517, + "step": 20258 + }, + { + "epoch": 0.99, + "grad_norm": 0.5239180326461792, + "learning_rate": 0.00045194246097105467, + "loss": 3.358, + "step": 20259 + }, + { + "epoch": 0.99, + "grad_norm": 0.52662593126297, + "learning_rate": 0.0004519291853330784, + "loss": 3.168, + "step": 20260 + }, + { + "epoch": 0.99, + "grad_norm": 0.5203768014907837, + "learning_rate": 0.00045191590929494994, + "loss": 3.0684, + "step": 20261 + }, + { + "epoch": 0.99, + "grad_norm": 0.5362476706504822, + "learning_rate": 0.0004519026328567041, + "loss": 3.1779, + "step": 20262 + }, + { + "epoch": 0.99, + "grad_norm": 0.5580528974533081, + "learning_rate": 0.00045188935601837604, + "loss": 3.0697, + "step": 20263 + }, + { + "epoch": 0.99, + "grad_norm": 0.5004633069038391, + "learning_rate": 0.00045187607878000057, + "loss": 3.2735, + "step": 20264 + }, + { + "epoch": 0.99, + "grad_norm": 0.5355501174926758, + "learning_rate": 0.0004518628011416128, + "loss": 3.1514, + "step": 20265 + }, + { + "epoch": 0.99, + "grad_norm": 0.5439483523368835, + "learning_rate": 0.0004518495231032477, + "loss": 3.2839, + "step": 20266 + }, + { + "epoch": 0.99, + "grad_norm": 0.5279213190078735, + "learning_rate": 0.00045183624466494006, + "loss": 3.1213, + "step": 20267 + }, + { + "epoch": 0.99, + "grad_norm": 0.5979596376419067, + "learning_rate": 0.00045182296582672496, + "loss": 3.1, + "step": 20268 + }, + { + "epoch": 0.99, + "grad_norm": 0.5285710096359253, + "learning_rate": 0.0004518096865886375, + "loss": 3.3007, + "step": 20269 + }, + { + "epoch": 0.99, + "grad_norm": 0.571806788444519, + "learning_rate": 0.00045179640695071246, + "loss": 3.225, + "step": 20270 + }, + { + "epoch": 0.99, + "grad_norm": 0.5280594229698181, + "learning_rate": 0.000451783126912985, + "loss": 3.1499, + "step": 20271 + }, + { + "epoch": 0.99, + "grad_norm": 0.5388377904891968, + "learning_rate": 0.00045176984647548994, + "loss": 3.1795, + "step": 20272 + }, + { + "epoch": 0.99, + "grad_norm": 0.5278087258338928, + "learning_rate": 0.00045175656563826224, + "loss": 3.2452, + "step": 20273 + }, + { + "epoch": 0.99, + "grad_norm": 0.5274179577827454, + "learning_rate": 0.000451743284401337, + "loss": 3.1447, + "step": 20274 + }, + { + "epoch": 0.99, + "grad_norm": 0.5059541463851929, + "learning_rate": 0.0004517300027647491, + "loss": 3.0891, + "step": 20275 + }, + { + "epoch": 0.99, + "grad_norm": 0.5273914337158203, + "learning_rate": 0.0004517167207285337, + "loss": 3.1935, + "step": 20276 + }, + { + "epoch": 0.99, + "grad_norm": 0.5308375954627991, + "learning_rate": 0.00045170343829272565, + "loss": 3.1559, + "step": 20277 + }, + { + "epoch": 0.99, + "grad_norm": 0.530507504940033, + "learning_rate": 0.00045169015545735975, + "loss": 2.9621, + "step": 20278 + }, + { + "epoch": 0.99, + "grad_norm": 0.5165071487426758, + "learning_rate": 0.0004516768722224713, + "loss": 3.1421, + "step": 20279 + }, + { + "epoch": 0.99, + "grad_norm": 0.5435473322868347, + "learning_rate": 0.00045166358858809524, + "loss": 3.0609, + "step": 20280 + }, + { + "epoch": 0.99, + "grad_norm": 0.5258364081382751, + "learning_rate": 0.0004516503045542664, + "loss": 2.9946, + "step": 20281 + }, + { + "epoch": 0.99, + "grad_norm": 0.49890580773353577, + "learning_rate": 0.0004516370201210198, + "loss": 3.0339, + "step": 20282 + }, + { + "epoch": 0.99, + "grad_norm": 0.5879883170127869, + "learning_rate": 0.0004516237352883905, + "loss": 3.0634, + "step": 20283 + }, + { + "epoch": 0.99, + "grad_norm": 0.5395330786705017, + "learning_rate": 0.00045161045005641344, + "loss": 3.2598, + "step": 20284 + }, + { + "epoch": 0.99, + "grad_norm": 0.5076286196708679, + "learning_rate": 0.00045159716442512367, + "loss": 3.1879, + "step": 20285 + }, + { + "epoch": 0.99, + "grad_norm": 0.508956253528595, + "learning_rate": 0.0004515838783945562, + "loss": 3.0942, + "step": 20286 + }, + { + "epoch": 0.99, + "grad_norm": 0.5356499552726746, + "learning_rate": 0.00045157059196474593, + "loss": 3.1142, + "step": 20287 + }, + { + "epoch": 0.99, + "grad_norm": 0.535220742225647, + "learning_rate": 0.0004515573051357278, + "loss": 2.9021, + "step": 20288 + }, + { + "epoch": 0.99, + "grad_norm": 0.5434659123420715, + "learning_rate": 0.000451544017907537, + "loss": 3.0314, + "step": 20289 + }, + { + "epoch": 0.99, + "grad_norm": 0.5554776191711426, + "learning_rate": 0.0004515307302802084, + "loss": 2.9867, + "step": 20290 + }, + { + "epoch": 0.99, + "grad_norm": 0.5213914513587952, + "learning_rate": 0.00045151744225377697, + "loss": 3.2232, + "step": 20291 + }, + { + "epoch": 0.99, + "grad_norm": 0.5432573556900024, + "learning_rate": 0.0004515041538282778, + "loss": 3.1003, + "step": 20292 + }, + { + "epoch": 0.99, + "grad_norm": 0.4953993558883667, + "learning_rate": 0.00045149086500374585, + "loss": 3.2513, + "step": 20293 + }, + { + "epoch": 0.99, + "grad_norm": 0.5025012493133545, + "learning_rate": 0.0004514775757802161, + "loss": 3.2112, + "step": 20294 + }, + { + "epoch": 0.99, + "grad_norm": 0.5353289246559143, + "learning_rate": 0.0004514642861577236, + "loss": 3.0389, + "step": 20295 + }, + { + "epoch": 0.99, + "grad_norm": 0.5243365168571472, + "learning_rate": 0.0004514509961363033, + "loss": 3.0729, + "step": 20296 + }, + { + "epoch": 0.99, + "grad_norm": 0.5028917789459229, + "learning_rate": 0.0004514377057159902, + "loss": 3.1696, + "step": 20297 + }, + { + "epoch": 0.99, + "grad_norm": 0.5022268891334534, + "learning_rate": 0.0004514244148968194, + "loss": 2.985, + "step": 20298 + }, + { + "epoch": 0.99, + "grad_norm": 0.5356721878051758, + "learning_rate": 0.00045141112367882573, + "loss": 3.148, + "step": 20299 + }, + { + "epoch": 0.99, + "grad_norm": 0.5337976217269897, + "learning_rate": 0.0004513978320620443, + "loss": 2.9342, + "step": 20300 + }, + { + "epoch": 0.99, + "grad_norm": 0.5110748410224915, + "learning_rate": 0.00045138454004651016, + "loss": 3.2471, + "step": 20301 + }, + { + "epoch": 0.99, + "grad_norm": 0.5441080331802368, + "learning_rate": 0.00045137124763225834, + "loss": 3.0728, + "step": 20302 + }, + { + "epoch": 1.0, + "grad_norm": 0.5328307747840881, + "learning_rate": 0.00045135795481932375, + "loss": 3.1303, + "step": 20303 + }, + { + "epoch": 1.0, + "grad_norm": 0.5662921071052551, + "learning_rate": 0.00045134466160774136, + "loss": 3.2292, + "step": 20304 + }, + { + "epoch": 1.0, + "grad_norm": 0.5180608034133911, + "learning_rate": 0.0004513313679975463, + "loss": 3.303, + "step": 20305 + }, + { + "epoch": 1.0, + "grad_norm": 0.5455940961837769, + "learning_rate": 0.0004513180739887736, + "loss": 3.3352, + "step": 20306 + }, + { + "epoch": 1.0, + "grad_norm": 0.5291252732276917, + "learning_rate": 0.00045130477958145817, + "loss": 3.2022, + "step": 20307 + }, + { + "epoch": 1.0, + "grad_norm": 0.5131491422653198, + "learning_rate": 0.00045129148477563504, + "loss": 3.1474, + "step": 20308 + }, + { + "epoch": 1.0, + "grad_norm": 0.5317510962486267, + "learning_rate": 0.0004512781895713393, + "loss": 3.1086, + "step": 20309 + }, + { + "epoch": 1.0, + "grad_norm": 0.5623073577880859, + "learning_rate": 0.0004512648939686059, + "loss": 3.0699, + "step": 20310 + }, + { + "epoch": 1.0, + "grad_norm": 0.5236507654190063, + "learning_rate": 0.0004512515979674698, + "loss": 2.855, + "step": 20311 + }, + { + "epoch": 1.0, + "grad_norm": 0.5638948082923889, + "learning_rate": 0.00045123830156796626, + "loss": 2.9894, + "step": 20312 + }, + { + "epoch": 1.0, + "grad_norm": 0.515745222568512, + "learning_rate": 0.0004512250047701301, + "loss": 3.3086, + "step": 20313 + }, + { + "epoch": 1.0, + "grad_norm": 0.5643942952156067, + "learning_rate": 0.0004512117075739963, + "loss": 3.1348, + "step": 20314 + }, + { + "epoch": 1.0, + "grad_norm": 0.5207350850105286, + "learning_rate": 0.0004511984099796, + "loss": 3.3207, + "step": 20315 + }, + { + "epoch": 1.0, + "grad_norm": 0.5619992017745972, + "learning_rate": 0.0004511851119869762, + "loss": 3.3062, + "step": 20316 + }, + { + "epoch": 1.0, + "grad_norm": 0.5149810910224915, + "learning_rate": 0.00045117181359615995, + "loss": 3.1298, + "step": 20317 + }, + { + "epoch": 1.0, + "grad_norm": 0.528748095035553, + "learning_rate": 0.00045115851480718625, + "loss": 3.1774, + "step": 20318 + }, + { + "epoch": 1.0, + "grad_norm": 0.5054760575294495, + "learning_rate": 0.00045114521562008994, + "loss": 3.1187, + "step": 20319 + }, + { + "epoch": 1.0, + "grad_norm": 0.5329738259315491, + "learning_rate": 0.00045113191603490636, + "loss": 3.0029, + "step": 20320 + }, + { + "epoch": 1.0, + "grad_norm": 0.5501445531845093, + "learning_rate": 0.0004511186160516704, + "loss": 3.3807, + "step": 20321 + }, + { + "epoch": 1.0, + "grad_norm": 0.5410068035125732, + "learning_rate": 0.00045110531567041715, + "loss": 3.0491, + "step": 20322 + }, + { + "epoch": 1.0, + "grad_norm": 0.5346465110778809, + "learning_rate": 0.0004510920148911814, + "loss": 3.1324, + "step": 20323 + }, + { + "epoch": 1.0, + "grad_norm": 0.5548122525215149, + "learning_rate": 0.00045107871371399854, + "loss": 3.0366, + "step": 20324 + }, + { + "epoch": 1.0, + "grad_norm": 0.5406346917152405, + "learning_rate": 0.0004510654121389033, + "loss": 3.353, + "step": 20325 + }, + { + "epoch": 1.0, + "grad_norm": 0.5303011536598206, + "learning_rate": 0.0004510521101659308, + "loss": 3.1889, + "step": 20326 + }, + { + "epoch": 1.0, + "grad_norm": 0.5354381203651428, + "learning_rate": 0.0004510388077951163, + "loss": 3.0421, + "step": 20327 + }, + { + "epoch": 1.0, + "grad_norm": 0.5239824056625366, + "learning_rate": 0.0004510255050264945, + "loss": 3.2569, + "step": 20328 + }, + { + "epoch": 1.0, + "grad_norm": 0.5124732255935669, + "learning_rate": 0.00045101220186010056, + "loss": 3.2477, + "step": 20329 + }, + { + "epoch": 1.0, + "grad_norm": 0.528676450252533, + "learning_rate": 0.00045099889829596965, + "loss": 3.027, + "step": 20330 + }, + { + "epoch": 1.0, + "grad_norm": 0.5336599349975586, + "learning_rate": 0.00045098559433413675, + "loss": 2.9263, + "step": 20331 + }, + { + "epoch": 1.0, + "grad_norm": 0.5987780094146729, + "learning_rate": 0.00045097228997463676, + "loss": 3.0757, + "step": 20332 + }, + { + "epoch": 1.0, + "grad_norm": 0.5460469722747803, + "learning_rate": 0.00045095898521750475, + "loss": 2.9283, + "step": 20333 + }, + { + "epoch": 1.0, + "grad_norm": 0.5135546326637268, + "learning_rate": 0.0004509456800627759, + "loss": 3.0009, + "step": 20334 + }, + { + "epoch": 1.0, + "grad_norm": 0.5272367596626282, + "learning_rate": 0.0004509323745104852, + "loss": 3.1307, + "step": 20335 + }, + { + "epoch": 1.0, + "grad_norm": 0.5386490225791931, + "learning_rate": 0.00045091906856066765, + "loss": 3.2579, + "step": 20336 + }, + { + "epoch": 1.0, + "grad_norm": 0.54600989818573, + "learning_rate": 0.00045090576221335833, + "loss": 3.3436, + "step": 20337 + }, + { + "epoch": 1.0, + "grad_norm": 0.5243917107582092, + "learning_rate": 0.00045089245546859225, + "loss": 3.1057, + "step": 20338 + }, + { + "epoch": 1.0, + "grad_norm": 0.5199042558670044, + "learning_rate": 0.0004508791483264045, + "loss": 3.1192, + "step": 20339 + }, + { + "epoch": 1.0, + "grad_norm": 0.5423737168312073, + "learning_rate": 0.0004508658407868301, + "loss": 3.2666, + "step": 20340 + }, + { + "epoch": 1.0, + "grad_norm": 0.592741847038269, + "learning_rate": 0.000450852532849904, + "loss": 2.9049, + "step": 20341 + }, + { + "epoch": 1.0, + "grad_norm": 0.5735978484153748, + "learning_rate": 0.0004508392245156615, + "loss": 3.1306, + "step": 20342 + }, + { + "epoch": 1.0, + "grad_norm": 0.5209268927574158, + "learning_rate": 0.0004508259157841375, + "loss": 3.3385, + "step": 20343 + }, + { + "epoch": 1.0, + "grad_norm": 0.5497926473617554, + "learning_rate": 0.0004508126066553671, + "loss": 3.0599, + "step": 20344 + }, + { + "epoch": 1.0, + "grad_norm": 0.5306968092918396, + "learning_rate": 0.0004507992971293852, + "loss": 3.067, + "step": 20345 + }, + { + "epoch": 1.0, + "grad_norm": 0.5316476821899414, + "learning_rate": 0.00045078598720622707, + "loss": 3.1147, + "step": 20346 + }, + { + "epoch": 1.0, + "grad_norm": 0.528810977935791, + "learning_rate": 0.0004507726768859277, + "loss": 3.0775, + "step": 20347 + }, + { + "epoch": 1.0, + "grad_norm": 0.5107991099357605, + "learning_rate": 0.00045075936616852206, + "loss": 3.1226, + "step": 20348 + }, + { + "epoch": 1.0, + "grad_norm": 0.513142466545105, + "learning_rate": 0.0004507460550540452, + "loss": 3.1823, + "step": 20349 + }, + { + "epoch": 1.0, + "grad_norm": 0.6255738735198975, + "learning_rate": 0.0004507327435425323, + "loss": 3.0949, + "step": 20350 + }, + { + "epoch": 1.0, + "grad_norm": 0.5634238719940186, + "learning_rate": 0.00045071943163401833, + "loss": 2.9638, + "step": 20351 + }, + { + "epoch": 1.0, + "grad_norm": 0.5344851016998291, + "learning_rate": 0.0004507061193285384, + "loss": 3.1896, + "step": 20352 + }, + { + "epoch": 1.0, + "grad_norm": 0.5088251233100891, + "learning_rate": 0.00045069280662612764, + "loss": 3.0654, + "step": 20353 + }, + { + "epoch": 1.0, + "grad_norm": 0.5598111748695374, + "learning_rate": 0.00045067949352682097, + "loss": 3.1859, + "step": 20354 + }, + { + "epoch": 1.0, + "grad_norm": 0.5614855289459229, + "learning_rate": 0.00045066618003065343, + "loss": 3.1514, + "step": 20355 + }, + { + "epoch": 1.0, + "grad_norm": 0.5441474318504333, + "learning_rate": 0.0004506528661376603, + "loss": 3.2191, + "step": 20356 + }, + { + "epoch": 1.0, + "grad_norm": 0.5195251107215881, + "learning_rate": 0.0004506395518478765, + "loss": 3.0811, + "step": 20357 + }, + { + "epoch": 1.0, + "grad_norm": 0.5556178092956543, + "learning_rate": 0.00045062623716133704, + "loss": 3.1629, + "step": 20358 + }, + { + "epoch": 1.0, + "grad_norm": 0.5472621917724609, + "learning_rate": 0.00045061292207807706, + "loss": 3.049, + "step": 20359 + }, + { + "epoch": 1.0, + "grad_norm": 0.5794954299926758, + "learning_rate": 0.00045059960659813163, + "loss": 3.0257, + "step": 20360 + }, + { + "epoch": 1.0, + "grad_norm": 0.5246169567108154, + "learning_rate": 0.0004505862907215359, + "loss": 3.2008, + "step": 20361 + }, + { + "epoch": 1.0, + "grad_norm": 0.6028974056243896, + "learning_rate": 0.0004505729744483248, + "loss": 3.1549, + "step": 20362 + }, + { + "epoch": 1.0, + "grad_norm": 0.5344709753990173, + "learning_rate": 0.0004505596577785334, + "loss": 3.1225, + "step": 20363 + }, + { + "epoch": 1.0, + "grad_norm": 0.49651411175727844, + "learning_rate": 0.000450546340712197, + "loss": 3.1713, + "step": 20364 + }, + { + "epoch": 1.0, + "grad_norm": 0.5115389823913574, + "learning_rate": 0.0004505330232493504, + "loss": 3.1379, + "step": 20365 + }, + { + "epoch": 1.0, + "grad_norm": 0.5067347288131714, + "learning_rate": 0.00045051970539002875, + "loss": 3.1478, + "step": 20366 + }, + { + "epoch": 1.0, + "grad_norm": 0.583716630935669, + "learning_rate": 0.00045050638713426723, + "loss": 3.0329, + "step": 20367 + }, + { + "epoch": 1.0, + "grad_norm": 0.5057085156440735, + "learning_rate": 0.0004504930684821009, + "loss": 3.1614, + "step": 20368 + }, + { + "epoch": 1.0, + "grad_norm": 0.5527999401092529, + "learning_rate": 0.00045047974943356473, + "loss": 3.1671, + "step": 20369 + }, + { + "epoch": 1.0, + "grad_norm": 0.549720287322998, + "learning_rate": 0.00045046642998869385, + "loss": 3.1481, + "step": 20370 + }, + { + "epoch": 1.0, + "grad_norm": 0.5233119130134583, + "learning_rate": 0.0004504531101475234, + "loss": 3.2703, + "step": 20371 + }, + { + "epoch": 1.0, + "grad_norm": 0.5394256711006165, + "learning_rate": 0.0004504397899100883, + "loss": 3.0763, + "step": 20372 + }, + { + "epoch": 1.0, + "grad_norm": 0.5158621668815613, + "learning_rate": 0.0004504264692764239, + "loss": 3.3617, + "step": 20373 + }, + { + "epoch": 1.0, + "grad_norm": 0.5600789189338684, + "learning_rate": 0.000450413148246565, + "loss": 3.1524, + "step": 20374 + }, + { + "epoch": 1.0, + "grad_norm": 0.5463529825210571, + "learning_rate": 0.00045039982682054696, + "loss": 3.378, + "step": 20375 + }, + { + "epoch": 1.0, + "grad_norm": 0.48953327536582947, + "learning_rate": 0.0004503865049984045, + "loss": 3.0364, + "step": 20376 + }, + { + "epoch": 1.0, + "grad_norm": 0.5219492316246033, + "learning_rate": 0.0004503731827801731, + "loss": 3.0698, + "step": 20377 + }, + { + "epoch": 1.0, + "grad_norm": 0.557974636554718, + "learning_rate": 0.0004503598601658877, + "loss": 3.2128, + "step": 20378 + }, + { + "epoch": 1.0, + "grad_norm": 0.5604296922683716, + "learning_rate": 0.0004503465371555833, + "loss": 2.9745, + "step": 20379 + }, + { + "epoch": 1.0, + "grad_norm": 0.5162230730056763, + "learning_rate": 0.0004503332137492951, + "loss": 3.0853, + "step": 20380 + }, + { + "epoch": 1.0, + "grad_norm": 0.5778909921646118, + "learning_rate": 0.00045031988994705796, + "loss": 3.2235, + "step": 20381 + }, + { + "epoch": 1.0, + "grad_norm": 0.5140716433525085, + "learning_rate": 0.00045030656574890745, + "loss": 3.2436, + "step": 20382 + }, + { + "epoch": 1.0, + "grad_norm": 0.5335753560066223, + "learning_rate": 0.0004502932411548781, + "loss": 3.1531, + "step": 20383 + }, + { + "epoch": 1.0, + "grad_norm": 0.5693366527557373, + "learning_rate": 0.00045027991616500545, + "loss": 3.3253, + "step": 20384 + }, + { + "epoch": 1.0, + "grad_norm": 0.5139354467391968, + "learning_rate": 0.0004502665907793244, + "loss": 2.9069, + "step": 20385 + }, + { + "epoch": 1.0, + "grad_norm": 0.49556657671928406, + "learning_rate": 0.00045025326499787007, + "loss": 3.2041, + "step": 20386 + }, + { + "epoch": 1.0, + "grad_norm": 0.513113796710968, + "learning_rate": 0.0004502399388206775, + "loss": 3.2202, + "step": 20387 + }, + { + "epoch": 1.0, + "grad_norm": 0.5639035701751709, + "learning_rate": 0.0004502266122477819, + "loss": 3.2123, + "step": 20388 + }, + { + "epoch": 1.0, + "grad_norm": 0.5301312208175659, + "learning_rate": 0.00045021328527921825, + "loss": 3.1671, + "step": 20389 + }, + { + "epoch": 1.0, + "grad_norm": 0.5456419587135315, + "learning_rate": 0.00045019995791502175, + "loss": 3.1206, + "step": 20390 + }, + { + "epoch": 1.0, + "grad_norm": 0.5344980359077454, + "learning_rate": 0.00045018663015522747, + "loss": 3.1698, + "step": 20391 + }, + { + "epoch": 1.0, + "grad_norm": 0.5286158919334412, + "learning_rate": 0.00045017330199987053, + "loss": 3.1031, + "step": 20392 + }, + { + "epoch": 1.0, + "grad_norm": 0.5467382073402405, + "learning_rate": 0.000450159973448986, + "loss": 3.0543, + "step": 20393 + }, + { + "epoch": 1.0, + "grad_norm": 0.5618236064910889, + "learning_rate": 0.000450146644502609, + "loss": 3.0087, + "step": 20394 + }, + { + "epoch": 1.0, + "grad_norm": 0.5205358266830444, + "learning_rate": 0.0004501333151607747, + "loss": 3.2562, + "step": 20395 + }, + { + "epoch": 1.0, + "grad_norm": 0.49830591678619385, + "learning_rate": 0.0004501199854235181, + "loss": 3.0565, + "step": 20396 + }, + { + "epoch": 1.0, + "grad_norm": 0.5011833310127258, + "learning_rate": 0.0004501066552908743, + "loss": 3.174, + "step": 20397 + }, + { + "epoch": 1.0, + "grad_norm": 0.5207628607749939, + "learning_rate": 0.00045009332476287847, + "loss": 3.16, + "step": 20398 + }, + { + "epoch": 1.0, + "grad_norm": 0.5745342969894409, + "learning_rate": 0.00045007999383956564, + "loss": 3.434, + "step": 20399 + }, + { + "epoch": 1.0, + "grad_norm": 0.5692901015281677, + "learning_rate": 0.00045006666252097113, + "loss": 3.283, + "step": 20400 + }, + { + "epoch": 1.0, + "grad_norm": 0.47566521167755127, + "learning_rate": 0.00045005333080712985, + "loss": 3.0213, + "step": 20401 + }, + { + "epoch": 1.0, + "grad_norm": 0.5313223004341125, + "learning_rate": 0.000450039998698077, + "loss": 3.1536, + "step": 20402 + }, + { + "epoch": 1.0, + "grad_norm": 0.5201346278190613, + "learning_rate": 0.0004500266661938476, + "loss": 3.1187, + "step": 20403 + }, + { + "epoch": 1.0, + "grad_norm": 0.5419844388961792, + "learning_rate": 0.000450013333294477, + "loss": 3.0795, + "step": 20404 + }, + { + "epoch": 1.0, + "grad_norm": 0.6561049222946167, + "learning_rate": 0.00045, + "loss": 3.1308, + "step": 20405 + }, + { + "epoch": 1.0, + "grad_norm": 0.5238326191902161, + "learning_rate": 0.00044998666631045184, + "loss": 3.3698, + "step": 20406 + }, + { + "epoch": 1.0, + "grad_norm": 0.5477263927459717, + "learning_rate": 0.0004499733322258678, + "loss": 3.1224, + "step": 20407 + }, + { + "epoch": 1.0, + "grad_norm": 0.5472010374069214, + "learning_rate": 0.0004499599977462828, + "loss": 2.8647, + "step": 20408 + }, + { + "epoch": 1.0, + "grad_norm": 0.5201596021652222, + "learning_rate": 0.00044994666287173196, + "loss": 3.1365, + "step": 20409 + }, + { + "epoch": 1.0, + "grad_norm": 0.5760728716850281, + "learning_rate": 0.0004499333276022506, + "loss": 2.9512, + "step": 20410 + }, + { + "epoch": 1.0, + "grad_norm": 0.5273072719573975, + "learning_rate": 0.0004499199919378736, + "loss": 2.9243, + "step": 20411 + }, + { + "epoch": 1.0, + "grad_norm": 0.5336431860923767, + "learning_rate": 0.0004499066558786362, + "loss": 3.16, + "step": 20412 + }, + { + "epoch": 1.0, + "grad_norm": 0.5312026739120483, + "learning_rate": 0.0004498933194245735, + "loss": 3.1535, + "step": 20413 + }, + { + "epoch": 1.0, + "grad_norm": 0.5205845832824707, + "learning_rate": 0.00044987998257572075, + "loss": 3.1798, + "step": 20414 + }, + { + "epoch": 1.0, + "grad_norm": 0.5186797380447388, + "learning_rate": 0.000449866645332113, + "loss": 2.9815, + "step": 20415 + }, + { + "epoch": 1.0, + "grad_norm": 0.5282081365585327, + "learning_rate": 0.0004498533076937852, + "loss": 3.1629, + "step": 20416 + }, + { + "epoch": 1.0, + "grad_norm": 0.501063346862793, + "learning_rate": 0.00044983996966077263, + "loss": 3.1123, + "step": 20417 + }, + { + "epoch": 1.0, + "grad_norm": 0.5373258590698242, + "learning_rate": 0.0004498266312331106, + "loss": 3.0061, + "step": 20418 + }, + { + "epoch": 1.0, + "grad_norm": 0.5097174048423767, + "learning_rate": 0.0004498132924108339, + "loss": 3.1618, + "step": 20419 + }, + { + "epoch": 1.0, + "grad_norm": 0.4925655126571655, + "learning_rate": 0.00044979995319397787, + "loss": 3.2096, + "step": 20420 + }, + { + "epoch": 1.0, + "grad_norm": 0.5566980242729187, + "learning_rate": 0.0004497866135825776, + "loss": 2.9665, + "step": 20421 + }, + { + "epoch": 1.0, + "grad_norm": 0.62166428565979, + "learning_rate": 0.00044977327357666815, + "loss": 3.0511, + "step": 20422 + }, + { + "epoch": 1.0, + "grad_norm": 0.49404507875442505, + "learning_rate": 0.00044975993317628477, + "loss": 3.1312, + "step": 20423 + }, + { + "epoch": 1.0, + "grad_norm": 0.5426779985427856, + "learning_rate": 0.00044974659238146257, + "loss": 3.3202, + "step": 20424 + }, + { + "epoch": 1.0, + "grad_norm": 0.549683690071106, + "learning_rate": 0.0004497332511922366, + "loss": 3.1595, + "step": 20425 + }, + { + "epoch": 1.0, + "grad_norm": 0.5156567692756653, + "learning_rate": 0.0004497199096086421, + "loss": 3.1343, + "step": 20426 + }, + { + "epoch": 1.0, + "grad_norm": 0.5068504810333252, + "learning_rate": 0.0004497065676307142, + "loss": 3.0144, + "step": 20427 + }, + { + "epoch": 1.0, + "grad_norm": 0.5446821451187134, + "learning_rate": 0.00044969322525848795, + "loss": 3.1761, + "step": 20428 + }, + { + "epoch": 1.0, + "grad_norm": 0.5488018989562988, + "learning_rate": 0.00044967988249199867, + "loss": 3.261, + "step": 20429 + }, + { + "epoch": 1.0, + "grad_norm": 0.513907790184021, + "learning_rate": 0.0004496665393312813, + "loss": 3.1929, + "step": 20430 + }, + { + "epoch": 1.0, + "grad_norm": 0.5646592974662781, + "learning_rate": 0.0004496531957763711, + "loss": 3.0872, + "step": 20431 + }, + { + "epoch": 1.0, + "grad_norm": 0.487352192401886, + "learning_rate": 0.0004496398518273031, + "loss": 3.044, + "step": 20432 + }, + { + "epoch": 1.0, + "grad_norm": 0.48989337682724, + "learning_rate": 0.00044962650748411263, + "loss": 3.0768, + "step": 20433 + }, + { + "epoch": 1.0, + "grad_norm": 0.8140915036201477, + "learning_rate": 0.0004496131627468347, + "loss": 3.0274, + "step": 20434 + }, + { + "epoch": 1.0, + "grad_norm": 0.5698384046554565, + "learning_rate": 0.0004495998176155045, + "loss": 3.1436, + "step": 20435 + }, + { + "epoch": 1.0, + "grad_norm": 0.5074122548103333, + "learning_rate": 0.00044958647209015714, + "loss": 3.1786, + "step": 20436 + }, + { + "epoch": 1.0, + "grad_norm": 0.5354686379432678, + "learning_rate": 0.0004495731261708278, + "loss": 3.1297, + "step": 20437 + }, + { + "epoch": 1.0, + "grad_norm": 0.5391170382499695, + "learning_rate": 0.0004495597798575517, + "loss": 3.0141, + "step": 20438 + }, + { + "epoch": 1.0, + "grad_norm": 0.5479790568351746, + "learning_rate": 0.0004495464331503638, + "loss": 2.9269, + "step": 20439 + }, + { + "epoch": 1.0, + "grad_norm": 0.5683173537254333, + "learning_rate": 0.00044953308604929953, + "loss": 3.09, + "step": 20440 + }, + { + "epoch": 1.0, + "grad_norm": 0.5525175333023071, + "learning_rate": 0.0004495197385543938, + "loss": 3.1034, + "step": 20441 + }, + { + "epoch": 1.0, + "grad_norm": 0.5185175538063049, + "learning_rate": 0.0004495063906656818, + "loss": 3.2782, + "step": 20442 + }, + { + "epoch": 1.0, + "grad_norm": 0.5282908082008362, + "learning_rate": 0.0004494930423831988, + "loss": 3.0233, + "step": 20443 + }, + { + "epoch": 1.0, + "grad_norm": 0.5449439883232117, + "learning_rate": 0.00044947969370698, + "loss": 3.0911, + "step": 20444 + }, + { + "epoch": 1.0, + "grad_norm": 0.5355467796325684, + "learning_rate": 0.00044946634463706035, + "loss": 3.019, + "step": 20445 + }, + { + "epoch": 1.0, + "grad_norm": 0.514240562915802, + "learning_rate": 0.00044945299517347507, + "loss": 3.3083, + "step": 20446 + }, + { + "epoch": 1.0, + "grad_norm": 0.5259093642234802, + "learning_rate": 0.00044943964531625946, + "loss": 3.0219, + "step": 20447 + }, + { + "epoch": 1.0, + "grad_norm": 0.5237060189247131, + "learning_rate": 0.00044942629506544853, + "loss": 3.2387, + "step": 20448 + }, + { + "epoch": 1.0, + "grad_norm": 0.518451988697052, + "learning_rate": 0.00044941294442107753, + "loss": 3.0307, + "step": 20449 + }, + { + "epoch": 1.0, + "grad_norm": 0.518038272857666, + "learning_rate": 0.0004493995933831815, + "loss": 3.093, + "step": 20450 + }, + { + "epoch": 1.0, + "grad_norm": 0.5178418159484863, + "learning_rate": 0.00044938624195179586, + "loss": 3.059, + "step": 20451 + }, + { + "epoch": 1.0, + "grad_norm": 0.5047763586044312, + "learning_rate": 0.00044937289012695545, + "loss": 3.1335, + "step": 20452 + }, + { + "epoch": 1.0, + "grad_norm": 0.5251728892326355, + "learning_rate": 0.0004493595379086956, + "loss": 3.0702, + "step": 20453 + }, + { + "epoch": 1.0, + "grad_norm": 0.5640724301338196, + "learning_rate": 0.0004493461852970515, + "loss": 3.1382, + "step": 20454 + }, + { + "epoch": 1.0, + "grad_norm": 0.5112975835800171, + "learning_rate": 0.0004493328322920584, + "loss": 3.2491, + "step": 20455 + }, + { + "epoch": 1.0, + "grad_norm": 0.5223793983459473, + "learning_rate": 0.00044931947889375126, + "loss": 3.0409, + "step": 20456 + }, + { + "epoch": 1.0, + "grad_norm": 0.5379955172538757, + "learning_rate": 0.00044930612510216535, + "loss": 3.2273, + "step": 20457 + }, + { + "epoch": 1.0, + "grad_norm": 0.49986040592193604, + "learning_rate": 0.0004492927709173359, + "loss": 3.0213, + "step": 20458 + }, + { + "epoch": 1.0, + "grad_norm": 0.5016982555389404, + "learning_rate": 0.00044927941633929796, + "loss": 3.1779, + "step": 20459 + }, + { + "epoch": 1.0, + "grad_norm": 0.5460448265075684, + "learning_rate": 0.00044926606136808675, + "loss": 3.0865, + "step": 20460 + }, + { + "epoch": 1.0, + "grad_norm": 0.5995565056800842, + "learning_rate": 0.00044925270600373754, + "loss": 3.0536, + "step": 20461 + }, + { + "epoch": 1.0, + "grad_norm": 0.5643872618675232, + "learning_rate": 0.0004492393502462854, + "loss": 3.1441, + "step": 20462 + }, + { + "epoch": 1.0, + "grad_norm": 0.5247363448143005, + "learning_rate": 0.00044922599409576543, + "loss": 3.1199, + "step": 20463 + }, + { + "epoch": 1.0, + "grad_norm": 0.5810442566871643, + "learning_rate": 0.000449212637552213, + "loss": 3.1335, + "step": 20464 + }, + { + "epoch": 1.0, + "grad_norm": 0.5455927848815918, + "learning_rate": 0.0004491992806156632, + "loss": 2.9627, + "step": 20465 + }, + { + "epoch": 1.0, + "grad_norm": 0.5521914958953857, + "learning_rate": 0.00044918592328615126, + "loss": 3.1621, + "step": 20466 + }, + { + "epoch": 1.0, + "grad_norm": 0.5023887753486633, + "learning_rate": 0.00044917256556371225, + "loss": 2.9914, + "step": 20467 + }, + { + "epoch": 1.0, + "grad_norm": 0.5143914222717285, + "learning_rate": 0.00044915920744838136, + "loss": 3.181, + "step": 20468 + }, + { + "epoch": 1.0, + "grad_norm": 0.5196571350097656, + "learning_rate": 0.000449145848940194, + "loss": 3.1984, + "step": 20469 + }, + { + "epoch": 1.0, + "grad_norm": 0.5121498107910156, + "learning_rate": 0.00044913249003918505, + "loss": 3.0403, + "step": 20470 + }, + { + "epoch": 1.0, + "grad_norm": 0.5418211817741394, + "learning_rate": 0.00044911913074538977, + "loss": 3.0662, + "step": 20471 + }, + { + "epoch": 1.0, + "grad_norm": 0.5008609890937805, + "learning_rate": 0.00044910577105884345, + "loss": 3.0714, + "step": 20472 + }, + { + "epoch": 1.0, + "grad_norm": 0.4887140393257141, + "learning_rate": 0.00044909241097958126, + "loss": 3.1298, + "step": 20473 + }, + { + "epoch": 1.0, + "grad_norm": 0.5122173428535461, + "learning_rate": 0.00044907905050763834, + "loss": 3.1986, + "step": 20474 + }, + { + "epoch": 1.0, + "grad_norm": 0.5146946907043457, + "learning_rate": 0.0004490656896430498, + "loss": 3.0118, + "step": 20475 + }, + { + "epoch": 1.0, + "grad_norm": 0.5460460782051086, + "learning_rate": 0.00044905232838585103, + "loss": 3.177, + "step": 20476 + }, + { + "epoch": 1.0, + "grad_norm": 0.5463166832923889, + "learning_rate": 0.0004490389667360771, + "loss": 3.2155, + "step": 20477 + }, + { + "epoch": 1.0, + "grad_norm": 0.5182934403419495, + "learning_rate": 0.00044902560469376314, + "loss": 3.1258, + "step": 20478 + }, + { + "epoch": 1.0, + "grad_norm": 0.5243579149246216, + "learning_rate": 0.0004490122422589445, + "loss": 3.1593, + "step": 20479 + }, + { + "epoch": 1.0, + "grad_norm": 0.5343524813652039, + "learning_rate": 0.00044899887943165634, + "loss": 3.1276, + "step": 20480 + }, + { + "epoch": 1.0, + "grad_norm": 0.4999130368232727, + "learning_rate": 0.0004489855162119337, + "loss": 3.2389, + "step": 20481 + }, + { + "epoch": 1.0, + "grad_norm": 0.5230382680892944, + "learning_rate": 0.0004489721525998119, + "loss": 3.0518, + "step": 20482 + }, + { + "epoch": 1.0, + "grad_norm": 0.516069769859314, + "learning_rate": 0.0004489587885953261, + "loss": 3.1985, + "step": 20483 + }, + { + "epoch": 1.0, + "grad_norm": 0.5170885324478149, + "learning_rate": 0.0004489454241985116, + "loss": 3.0926, + "step": 20484 + }, + { + "epoch": 1.0, + "grad_norm": 0.5558387041091919, + "learning_rate": 0.0004489320594094035, + "loss": 3.0827, + "step": 20485 + }, + { + "epoch": 1.0, + "grad_norm": 0.5676001906394958, + "learning_rate": 0.000448918694228037, + "loss": 3.0072, + "step": 20486 + }, + { + "epoch": 1.0, + "grad_norm": 0.552216112613678, + "learning_rate": 0.00044890532865444723, + "loss": 3.1179, + "step": 20487 + }, + { + "epoch": 1.0, + "grad_norm": 0.5196042656898499, + "learning_rate": 0.0004488919626886696, + "loss": 3.1181, + "step": 20488 + }, + { + "epoch": 1.0, + "grad_norm": 0.5262749791145325, + "learning_rate": 0.0004488785963307391, + "loss": 3.0952, + "step": 20489 + }, + { + "epoch": 1.0, + "grad_norm": 0.5152573585510254, + "learning_rate": 0.0004488652295806911, + "loss": 2.9795, + "step": 20490 + }, + { + "epoch": 1.0, + "grad_norm": 0.5612766742706299, + "learning_rate": 0.0004488518624385608, + "loss": 3.1665, + "step": 20491 + }, + { + "epoch": 1.0, + "grad_norm": 0.5625437498092651, + "learning_rate": 0.0004488384949043833, + "loss": 3.3981, + "step": 20492 + }, + { + "epoch": 1.0, + "grad_norm": 0.5344268679618835, + "learning_rate": 0.00044882512697819383, + "loss": 3.0059, + "step": 20493 + }, + { + "epoch": 1.0, + "grad_norm": 0.5187119841575623, + "learning_rate": 0.0004488117586600275, + "loss": 3.1045, + "step": 20494 + }, + { + "epoch": 1.0, + "grad_norm": 0.5119081139564514, + "learning_rate": 0.0004487983899499198, + "loss": 3.1185, + "step": 20495 + }, + { + "epoch": 1.0, + "grad_norm": 0.5100210905075073, + "learning_rate": 0.00044878502084790564, + "loss": 3.0568, + "step": 20496 + }, + { + "epoch": 1.0, + "grad_norm": 0.5492945313453674, + "learning_rate": 0.0004487716513540205, + "loss": 3.2403, + "step": 20497 + }, + { + "epoch": 1.0, + "grad_norm": 0.5489396452903748, + "learning_rate": 0.0004487582814682994, + "loss": 2.8496, + "step": 20498 + }, + { + "epoch": 1.0, + "grad_norm": 0.5205833315849304, + "learning_rate": 0.0004487449111907776, + "loss": 2.8851, + "step": 20499 + }, + { + "epoch": 1.0, + "grad_norm": 0.5621615648269653, + "learning_rate": 0.0004487315405214903, + "loss": 2.9772, + "step": 20500 + }, + { + "epoch": 1.0, + "grad_norm": 0.5375982522964478, + "learning_rate": 0.00044871816946047286, + "loss": 3.1913, + "step": 20501 + }, + { + "epoch": 1.0, + "grad_norm": 0.5279413461685181, + "learning_rate": 0.0004487047980077604, + "loss": 3.0989, + "step": 20502 + }, + { + "epoch": 1.0, + "grad_norm": 0.5290868282318115, + "learning_rate": 0.00044869142616338803, + "loss": 3.0419, + "step": 20503 + }, + { + "epoch": 1.0, + "grad_norm": 0.5295721292495728, + "learning_rate": 0.00044867805392739097, + "loss": 3.1698, + "step": 20504 + }, + { + "epoch": 1.0, + "grad_norm": 0.531470000743866, + "learning_rate": 0.00044866468129980464, + "loss": 2.8806, + "step": 20505 + }, + { + "epoch": 1.0, + "grad_norm": 0.5461409091949463, + "learning_rate": 0.0004486513082806642, + "loss": 3.332, + "step": 20506 + }, + { + "epoch": 1.0, + "grad_norm": 0.5531258583068848, + "learning_rate": 0.00044863793487000475, + "loss": 3.2661, + "step": 20507 + }, + { + "epoch": 1.01, + "grad_norm": 0.5420560240745544, + "learning_rate": 0.00044862456106786166, + "loss": 3.0748, + "step": 20508 + }, + { + "epoch": 1.01, + "grad_norm": 0.5236428380012512, + "learning_rate": 0.00044861118687427, + "loss": 3.0693, + "step": 20509 + }, + { + "epoch": 1.01, + "grad_norm": 0.5320883393287659, + "learning_rate": 0.00044859781228926505, + "loss": 3.2626, + "step": 20510 + }, + { + "epoch": 1.01, + "grad_norm": 0.5048290491104126, + "learning_rate": 0.0004485844373128821, + "loss": 3.0668, + "step": 20511 + }, + { + "epoch": 1.01, + "grad_norm": 0.5460326075553894, + "learning_rate": 0.00044857106194515635, + "loss": 3.1908, + "step": 20512 + }, + { + "epoch": 1.01, + "grad_norm": 0.4952123463153839, + "learning_rate": 0.0004485576861861231, + "loss": 3.0871, + "step": 20513 + }, + { + "epoch": 1.01, + "grad_norm": 0.5370456576347351, + "learning_rate": 0.0004485443100358173, + "loss": 3.0149, + "step": 20514 + }, + { + "epoch": 1.01, + "grad_norm": 0.5347251892089844, + "learning_rate": 0.0004485309334942745, + "loss": 3.1556, + "step": 20515 + }, + { + "epoch": 1.01, + "grad_norm": 0.5180676579475403, + "learning_rate": 0.0004485175565615298, + "loss": 3.0769, + "step": 20516 + }, + { + "epoch": 1.01, + "grad_norm": 0.5209394097328186, + "learning_rate": 0.0004485041792376184, + "loss": 3.2223, + "step": 20517 + }, + { + "epoch": 1.01, + "grad_norm": 0.5076389908790588, + "learning_rate": 0.00044849080152257564, + "loss": 3.1164, + "step": 20518 + }, + { + "epoch": 1.01, + "grad_norm": 0.5367461442947388, + "learning_rate": 0.00044847742341643654, + "loss": 3.1111, + "step": 20519 + }, + { + "epoch": 1.01, + "grad_norm": 0.5419523119926453, + "learning_rate": 0.0004484640449192367, + "loss": 2.9594, + "step": 20520 + }, + { + "epoch": 1.01, + "grad_norm": 0.5312870144844055, + "learning_rate": 0.00044845066603101103, + "loss": 3.0924, + "step": 20521 + }, + { + "epoch": 1.01, + "grad_norm": 0.5425320863723755, + "learning_rate": 0.0004484372867517948, + "loss": 3.1313, + "step": 20522 + }, + { + "epoch": 1.01, + "grad_norm": 0.5226745009422302, + "learning_rate": 0.00044842390708162345, + "loss": 3.1729, + "step": 20523 + }, + { + "epoch": 1.01, + "grad_norm": 0.5791988968849182, + "learning_rate": 0.000448410527020532, + "loss": 2.9848, + "step": 20524 + }, + { + "epoch": 1.01, + "grad_norm": 0.5693748593330383, + "learning_rate": 0.0004483971465685558, + "loss": 3.1733, + "step": 20525 + }, + { + "epoch": 1.01, + "grad_norm": 0.5362666249275208, + "learning_rate": 0.0004483837657257301, + "loss": 3.0081, + "step": 20526 + }, + { + "epoch": 1.01, + "grad_norm": 0.5347479581832886, + "learning_rate": 0.0004483703844920901, + "loss": 3.0189, + "step": 20527 + }, + { + "epoch": 1.01, + "grad_norm": 0.529528796672821, + "learning_rate": 0.00044835700286767114, + "loss": 2.9859, + "step": 20528 + }, + { + "epoch": 1.01, + "grad_norm": 0.5108277797698975, + "learning_rate": 0.0004483436208525083, + "loss": 3.1131, + "step": 20529 + }, + { + "epoch": 1.01, + "grad_norm": 0.5495550632476807, + "learning_rate": 0.00044833023844663693, + "loss": 3.2628, + "step": 20530 + }, + { + "epoch": 1.01, + "grad_norm": 0.5666603446006775, + "learning_rate": 0.0004483168556500922, + "loss": 3.1046, + "step": 20531 + }, + { + "epoch": 1.01, + "grad_norm": 0.5286827683448792, + "learning_rate": 0.00044830347246290956, + "loss": 2.8698, + "step": 20532 + }, + { + "epoch": 1.01, + "grad_norm": 0.5373377203941345, + "learning_rate": 0.000448290088885124, + "loss": 2.9937, + "step": 20533 + }, + { + "epoch": 1.01, + "grad_norm": 0.5299882292747498, + "learning_rate": 0.00044827670491677095, + "loss": 3.0146, + "step": 20534 + }, + { + "epoch": 1.01, + "grad_norm": 0.5432134866714478, + "learning_rate": 0.00044826332055788553, + "loss": 3.3426, + "step": 20535 + }, + { + "epoch": 1.01, + "grad_norm": 0.5193071961402893, + "learning_rate": 0.00044824993580850313, + "loss": 2.8983, + "step": 20536 + }, + { + "epoch": 1.01, + "grad_norm": 0.5308823585510254, + "learning_rate": 0.00044823655066865886, + "loss": 3.3115, + "step": 20537 + }, + { + "epoch": 1.01, + "grad_norm": 0.5253787040710449, + "learning_rate": 0.0004482231651383881, + "loss": 3.2114, + "step": 20538 + }, + { + "epoch": 1.01, + "grad_norm": 0.583215594291687, + "learning_rate": 0.000448209779217726, + "loss": 3.1361, + "step": 20539 + }, + { + "epoch": 1.01, + "grad_norm": 0.5469831824302673, + "learning_rate": 0.0004481963929067078, + "loss": 3.1252, + "step": 20540 + }, + { + "epoch": 1.01, + "grad_norm": 0.5351081490516663, + "learning_rate": 0.000448183006205369, + "loss": 3.0161, + "step": 20541 + }, + { + "epoch": 1.01, + "grad_norm": 0.4953567385673523, + "learning_rate": 0.00044816961911374464, + "loss": 3.3067, + "step": 20542 + }, + { + "epoch": 1.01, + "grad_norm": 0.5496410727500916, + "learning_rate": 0.00044815623163186994, + "loss": 3.1639, + "step": 20543 + }, + { + "epoch": 1.01, + "grad_norm": 0.5172861218452454, + "learning_rate": 0.0004481428437597803, + "loss": 2.8968, + "step": 20544 + }, + { + "epoch": 1.01, + "grad_norm": 0.5500335097312927, + "learning_rate": 0.0004481294554975108, + "loss": 3.1042, + "step": 20545 + }, + { + "epoch": 1.01, + "grad_norm": 0.5287953019142151, + "learning_rate": 0.000448116066845097, + "loss": 3.0327, + "step": 20546 + }, + { + "epoch": 1.01, + "grad_norm": 0.5284257531166077, + "learning_rate": 0.00044810267780257386, + "loss": 3.2541, + "step": 20547 + }, + { + "epoch": 1.01, + "grad_norm": 0.5908827781677246, + "learning_rate": 0.0004480892883699768, + "loss": 3.0347, + "step": 20548 + }, + { + "epoch": 1.01, + "grad_norm": 0.5372423529624939, + "learning_rate": 0.00044807589854734106, + "loss": 2.9296, + "step": 20549 + }, + { + "epoch": 1.01, + "grad_norm": 0.5452413558959961, + "learning_rate": 0.0004480625083347019, + "loss": 2.8282, + "step": 20550 + }, + { + "epoch": 1.01, + "grad_norm": 0.5139778256416321, + "learning_rate": 0.0004480491177320946, + "loss": 2.9481, + "step": 20551 + }, + { + "epoch": 1.01, + "grad_norm": 0.5498486757278442, + "learning_rate": 0.0004480357267395544, + "loss": 2.954, + "step": 20552 + }, + { + "epoch": 1.01, + "grad_norm": 0.550351083278656, + "learning_rate": 0.00044802233535711666, + "loss": 3.1411, + "step": 20553 + }, + { + "epoch": 1.01, + "grad_norm": 0.5372947454452515, + "learning_rate": 0.0004480089435848165, + "loss": 3.0283, + "step": 20554 + }, + { + "epoch": 1.01, + "grad_norm": 0.5474216341972351, + "learning_rate": 0.0004479955514226892, + "loss": 3.0082, + "step": 20555 + }, + { + "epoch": 1.01, + "grad_norm": 0.578901469707489, + "learning_rate": 0.0004479821588707702, + "loss": 3.0129, + "step": 20556 + }, + { + "epoch": 1.01, + "grad_norm": 0.5624405145645142, + "learning_rate": 0.0004479687659290947, + "loss": 2.9875, + "step": 20557 + }, + { + "epoch": 1.01, + "grad_norm": 0.5033628940582275, + "learning_rate": 0.0004479553725976979, + "loss": 3.0755, + "step": 20558 + }, + { + "epoch": 1.01, + "grad_norm": 0.5456181764602661, + "learning_rate": 0.0004479419788766151, + "loss": 2.937, + "step": 20559 + }, + { + "epoch": 1.01, + "grad_norm": 0.595481276512146, + "learning_rate": 0.0004479285847658816, + "loss": 3.1074, + "step": 20560 + }, + { + "epoch": 1.01, + "grad_norm": 0.5183221101760864, + "learning_rate": 0.00044791519026553267, + "loss": 2.8591, + "step": 20561 + }, + { + "epoch": 1.01, + "grad_norm": 0.537140965461731, + "learning_rate": 0.0004479017953756037, + "loss": 3.0136, + "step": 20562 + }, + { + "epoch": 1.01, + "grad_norm": 0.5604956746101379, + "learning_rate": 0.00044788840009612975, + "loss": 3.0379, + "step": 20563 + }, + { + "epoch": 1.01, + "grad_norm": 0.5302656292915344, + "learning_rate": 0.0004478750044271463, + "loss": 3.1423, + "step": 20564 + }, + { + "epoch": 1.01, + "grad_norm": 0.5702511072158813, + "learning_rate": 0.0004478616083686884, + "loss": 3.0888, + "step": 20565 + }, + { + "epoch": 1.01, + "grad_norm": 0.5529466271400452, + "learning_rate": 0.0004478482119207916, + "loss": 3.0283, + "step": 20566 + }, + { + "epoch": 1.01, + "grad_norm": 0.49684593081474304, + "learning_rate": 0.0004478348150834911, + "loss": 3.3656, + "step": 20567 + }, + { + "epoch": 1.01, + "grad_norm": 0.5289061665534973, + "learning_rate": 0.00044782141785682216, + "loss": 3.0752, + "step": 20568 + }, + { + "epoch": 1.01, + "grad_norm": 0.527275025844574, + "learning_rate": 0.00044780802024081993, + "loss": 3.1675, + "step": 20569 + }, + { + "epoch": 1.01, + "grad_norm": 0.5364570617675781, + "learning_rate": 0.00044779462223551995, + "loss": 2.9237, + "step": 20570 + }, + { + "epoch": 1.01, + "grad_norm": 0.5187329053878784, + "learning_rate": 0.00044778122384095723, + "loss": 2.9779, + "step": 20571 + }, + { + "epoch": 1.01, + "grad_norm": 0.5524115562438965, + "learning_rate": 0.0004477678250571673, + "loss": 3.1814, + "step": 20572 + }, + { + "epoch": 1.01, + "grad_norm": 0.5237755179405212, + "learning_rate": 0.00044775442588418536, + "loss": 3.1823, + "step": 20573 + }, + { + "epoch": 1.01, + "grad_norm": 0.5263662338256836, + "learning_rate": 0.0004477410263220467, + "loss": 3.1545, + "step": 20574 + }, + { + "epoch": 1.01, + "grad_norm": 0.5157889127731323, + "learning_rate": 0.00044772762637078665, + "loss": 2.9089, + "step": 20575 + }, + { + "epoch": 1.01, + "grad_norm": 0.5321006178855896, + "learning_rate": 0.0004477142260304403, + "loss": 3.334, + "step": 20576 + }, + { + "epoch": 1.01, + "grad_norm": 0.5617817044258118, + "learning_rate": 0.0004477008253010432, + "loss": 2.9518, + "step": 20577 + }, + { + "epoch": 1.01, + "grad_norm": 0.5136963725090027, + "learning_rate": 0.00044768742418263053, + "loss": 2.9438, + "step": 20578 + }, + { + "epoch": 1.01, + "grad_norm": 0.5390790104866028, + "learning_rate": 0.00044767402267523773, + "loss": 3.1914, + "step": 20579 + }, + { + "epoch": 1.01, + "grad_norm": 0.5621577501296997, + "learning_rate": 0.0004476606207788999, + "loss": 3.0937, + "step": 20580 + }, + { + "epoch": 1.01, + "grad_norm": 0.5234653949737549, + "learning_rate": 0.0004476472184936523, + "loss": 3.087, + "step": 20581 + }, + { + "epoch": 1.01, + "grad_norm": 0.5514974594116211, + "learning_rate": 0.0004476338158195305, + "loss": 2.9886, + "step": 20582 + }, + { + "epoch": 1.01, + "grad_norm": 0.5614942908287048, + "learning_rate": 0.0004476204127565696, + "loss": 3.0167, + "step": 20583 + }, + { + "epoch": 1.01, + "grad_norm": 0.5162169933319092, + "learning_rate": 0.0004476070093048049, + "loss": 3.1614, + "step": 20584 + }, + { + "epoch": 1.01, + "grad_norm": 0.5244741439819336, + "learning_rate": 0.00044759360546427175, + "loss": 3.1057, + "step": 20585 + }, + { + "epoch": 1.01, + "grad_norm": 0.5119442343711853, + "learning_rate": 0.0004475802012350055, + "loss": 2.9263, + "step": 20586 + }, + { + "epoch": 1.01, + "grad_norm": 0.48682332038879395, + "learning_rate": 0.0004475667966170414, + "loss": 3.0508, + "step": 20587 + }, + { + "epoch": 1.01, + "grad_norm": 0.5387108325958252, + "learning_rate": 0.00044755339161041467, + "loss": 2.8952, + "step": 20588 + }, + { + "epoch": 1.01, + "grad_norm": 0.5450228452682495, + "learning_rate": 0.00044753998621516075, + "loss": 3.332, + "step": 20589 + }, + { + "epoch": 1.01, + "grad_norm": 0.5503659844398499, + "learning_rate": 0.000447526580431315, + "loss": 2.9963, + "step": 20590 + }, + { + "epoch": 1.01, + "grad_norm": 0.5000738501548767, + "learning_rate": 0.0004475131742589125, + "loss": 2.9757, + "step": 20591 + }, + { + "epoch": 1.01, + "grad_norm": 0.5334386229515076, + "learning_rate": 0.00044749976769798875, + "loss": 3.3276, + "step": 20592 + }, + { + "epoch": 1.01, + "grad_norm": 0.5666833519935608, + "learning_rate": 0.00044748636074857904, + "loss": 3.1172, + "step": 20593 + }, + { + "epoch": 1.01, + "grad_norm": 0.5799906253814697, + "learning_rate": 0.00044747295341071857, + "loss": 3.0151, + "step": 20594 + }, + { + "epoch": 1.01, + "grad_norm": 0.5235270857810974, + "learning_rate": 0.00044745954568444266, + "loss": 2.9707, + "step": 20595 + }, + { + "epoch": 1.01, + "grad_norm": 0.5744423866271973, + "learning_rate": 0.0004474461375697867, + "loss": 2.9403, + "step": 20596 + }, + { + "epoch": 1.01, + "grad_norm": 0.571638286113739, + "learning_rate": 0.00044743272906678616, + "loss": 3.1176, + "step": 20597 + }, + { + "epoch": 1.01, + "grad_norm": 0.5430690050125122, + "learning_rate": 0.00044741932017547604, + "loss": 3.1179, + "step": 20598 + }, + { + "epoch": 1.01, + "grad_norm": 0.5648912191390991, + "learning_rate": 0.0004474059108958918, + "loss": 3.1133, + "step": 20599 + }, + { + "epoch": 1.01, + "grad_norm": 0.5373355746269226, + "learning_rate": 0.00044739250122806883, + "loss": 3.0403, + "step": 20600 + }, + { + "epoch": 1.01, + "grad_norm": 0.5279290676116943, + "learning_rate": 0.0004473790911720423, + "loss": 3.1451, + "step": 20601 + }, + { + "epoch": 1.01, + "grad_norm": 0.565265953540802, + "learning_rate": 0.0004473656807278477, + "loss": 3.1434, + "step": 20602 + }, + { + "epoch": 1.01, + "grad_norm": 0.5297691226005554, + "learning_rate": 0.00044735226989552014, + "loss": 3.2683, + "step": 20603 + }, + { + "epoch": 1.01, + "grad_norm": 0.5779011249542236, + "learning_rate": 0.0004473388586750952, + "loss": 3.1056, + "step": 20604 + }, + { + "epoch": 1.01, + "grad_norm": 0.5088967084884644, + "learning_rate": 0.000447325447066608, + "loss": 3.2698, + "step": 20605 + }, + { + "epoch": 1.01, + "grad_norm": 0.5209295153617859, + "learning_rate": 0.00044731203507009386, + "loss": 3.1012, + "step": 20606 + }, + { + "epoch": 1.01, + "grad_norm": 0.517296552658081, + "learning_rate": 0.0004472986226855882, + "loss": 3.1231, + "step": 20607 + }, + { + "epoch": 1.01, + "grad_norm": 0.5545491576194763, + "learning_rate": 0.0004472852099131264, + "loss": 3.1827, + "step": 20608 + }, + { + "epoch": 1.01, + "grad_norm": 0.5324487686157227, + "learning_rate": 0.00044727179675274365, + "loss": 2.9018, + "step": 20609 + }, + { + "epoch": 1.01, + "grad_norm": 0.5557654500007629, + "learning_rate": 0.00044725838320447533, + "loss": 2.9244, + "step": 20610 + }, + { + "epoch": 1.01, + "grad_norm": 0.5294561982154846, + "learning_rate": 0.00044724496926835673, + "loss": 3.2031, + "step": 20611 + }, + { + "epoch": 1.01, + "grad_norm": 0.4875947833061218, + "learning_rate": 0.0004472315549444233, + "loss": 2.9372, + "step": 20612 + }, + { + "epoch": 1.01, + "grad_norm": 0.5307518243789673, + "learning_rate": 0.00044721814023271025, + "loss": 3.1983, + "step": 20613 + }, + { + "epoch": 1.01, + "grad_norm": 0.5280675888061523, + "learning_rate": 0.00044720472513325296, + "loss": 3.1898, + "step": 20614 + }, + { + "epoch": 1.01, + "grad_norm": 0.5537136793136597, + "learning_rate": 0.0004471913096460867, + "loss": 3.2588, + "step": 20615 + }, + { + "epoch": 1.01, + "grad_norm": 0.5422837138175964, + "learning_rate": 0.00044717789377124695, + "loss": 3.0336, + "step": 20616 + }, + { + "epoch": 1.01, + "grad_norm": 0.5528018474578857, + "learning_rate": 0.0004471644775087688, + "loss": 3.0397, + "step": 20617 + }, + { + "epoch": 1.01, + "grad_norm": 0.5226984620094299, + "learning_rate": 0.00044715106085868784, + "loss": 3.2272, + "step": 20618 + }, + { + "epoch": 1.01, + "grad_norm": 0.5283844470977783, + "learning_rate": 0.0004471376438210394, + "loss": 3.2111, + "step": 20619 + }, + { + "epoch": 1.01, + "grad_norm": 0.5217439532279968, + "learning_rate": 0.00044712422639585863, + "loss": 3.0164, + "step": 20620 + }, + { + "epoch": 1.01, + "grad_norm": 0.5724291205406189, + "learning_rate": 0.0004471108085831809, + "loss": 3.1398, + "step": 20621 + }, + { + "epoch": 1.01, + "grad_norm": 0.5415415167808533, + "learning_rate": 0.0004470973903830417, + "loss": 3.0994, + "step": 20622 + }, + { + "epoch": 1.01, + "grad_norm": 0.5410470366477966, + "learning_rate": 0.00044708397179547626, + "loss": 3.1288, + "step": 20623 + }, + { + "epoch": 1.01, + "grad_norm": 0.5544204711914062, + "learning_rate": 0.0004470705528205199, + "loss": 3.2814, + "step": 20624 + }, + { + "epoch": 1.01, + "grad_norm": 0.5386510491371155, + "learning_rate": 0.0004470571334582081, + "loss": 3.041, + "step": 20625 + }, + { + "epoch": 1.01, + "grad_norm": 0.5222130417823792, + "learning_rate": 0.00044704371370857607, + "loss": 3.1223, + "step": 20626 + }, + { + "epoch": 1.01, + "grad_norm": 0.5678313374519348, + "learning_rate": 0.0004470302935716591, + "loss": 2.9597, + "step": 20627 + }, + { + "epoch": 1.01, + "grad_norm": 0.5152552127838135, + "learning_rate": 0.00044701687304749276, + "loss": 2.9862, + "step": 20628 + }, + { + "epoch": 1.01, + "grad_norm": 0.5379000902175903, + "learning_rate": 0.0004470034521361122, + "loss": 3.1142, + "step": 20629 + }, + { + "epoch": 1.01, + "grad_norm": 0.5807501673698425, + "learning_rate": 0.0004469900308375529, + "loss": 3.2559, + "step": 20630 + }, + { + "epoch": 1.01, + "grad_norm": 0.5624402165412903, + "learning_rate": 0.0004469766091518502, + "loss": 3.1534, + "step": 20631 + }, + { + "epoch": 1.01, + "grad_norm": 0.5430057048797607, + "learning_rate": 0.0004469631870790392, + "loss": 3.1118, + "step": 20632 + }, + { + "epoch": 1.01, + "grad_norm": 0.5057716369628906, + "learning_rate": 0.0004469497646191556, + "loss": 2.9988, + "step": 20633 + }, + { + "epoch": 1.01, + "grad_norm": 0.5503453612327576, + "learning_rate": 0.0004469363417722346, + "loss": 3.0555, + "step": 20634 + }, + { + "epoch": 1.01, + "grad_norm": 0.5180730223655701, + "learning_rate": 0.0004469229185383115, + "loss": 3.1024, + "step": 20635 + }, + { + "epoch": 1.01, + "grad_norm": 0.5402533411979675, + "learning_rate": 0.0004469094949174217, + "loss": 3.1781, + "step": 20636 + }, + { + "epoch": 1.01, + "grad_norm": 0.6356052160263062, + "learning_rate": 0.0004468960709096006, + "loss": 3.0357, + "step": 20637 + }, + { + "epoch": 1.01, + "grad_norm": 0.5212626457214355, + "learning_rate": 0.0004468826465148835, + "loss": 3.0721, + "step": 20638 + }, + { + "epoch": 1.01, + "grad_norm": 0.5012721419334412, + "learning_rate": 0.00044686922173330584, + "loss": 3.0777, + "step": 20639 + }, + { + "epoch": 1.01, + "grad_norm": 0.5270209908485413, + "learning_rate": 0.00044685579656490287, + "loss": 3.2153, + "step": 20640 + }, + { + "epoch": 1.01, + "grad_norm": 0.5531798005104065, + "learning_rate": 0.00044684237100971, + "loss": 3.1648, + "step": 20641 + }, + { + "epoch": 1.01, + "grad_norm": 0.5050953030586243, + "learning_rate": 0.00044682894506776246, + "loss": 3.2221, + "step": 20642 + }, + { + "epoch": 1.01, + "grad_norm": 0.5169261693954468, + "learning_rate": 0.0004468155187390959, + "loss": 3.1671, + "step": 20643 + }, + { + "epoch": 1.01, + "grad_norm": 0.5264015197753906, + "learning_rate": 0.0004468020920237455, + "loss": 3.0668, + "step": 20644 + }, + { + "epoch": 1.01, + "grad_norm": 0.5398049354553223, + "learning_rate": 0.0004467886649217466, + "loss": 3.1813, + "step": 20645 + }, + { + "epoch": 1.01, + "grad_norm": 0.5570565462112427, + "learning_rate": 0.0004467752374331346, + "loss": 2.8079, + "step": 20646 + }, + { + "epoch": 1.01, + "grad_norm": 0.5235486030578613, + "learning_rate": 0.0004467618095579449, + "loss": 3.1429, + "step": 20647 + }, + { + "epoch": 1.01, + "grad_norm": 0.5539608001708984, + "learning_rate": 0.0004467483812962128, + "loss": 3.1071, + "step": 20648 + }, + { + "epoch": 1.01, + "grad_norm": 0.5413884520530701, + "learning_rate": 0.0004467349526479738, + "loss": 2.7632, + "step": 20649 + }, + { + "epoch": 1.01, + "grad_norm": 0.5679776072502136, + "learning_rate": 0.00044672152361326307, + "loss": 3.0299, + "step": 20650 + }, + { + "epoch": 1.01, + "grad_norm": 0.5429620742797852, + "learning_rate": 0.0004467080941921161, + "loss": 3.1816, + "step": 20651 + }, + { + "epoch": 1.01, + "grad_norm": 0.5164234042167664, + "learning_rate": 0.00044669466438456833, + "loss": 3.2438, + "step": 20652 + }, + { + "epoch": 1.01, + "grad_norm": 0.5403413772583008, + "learning_rate": 0.0004466812341906549, + "loss": 3.1561, + "step": 20653 + }, + { + "epoch": 1.01, + "grad_norm": 0.5405500531196594, + "learning_rate": 0.00044666780361041143, + "loss": 3.229, + "step": 20654 + }, + { + "epoch": 1.01, + "grad_norm": 0.5156397819519043, + "learning_rate": 0.0004466543726438733, + "loss": 3.0178, + "step": 20655 + }, + { + "epoch": 1.01, + "grad_norm": 0.5206366777420044, + "learning_rate": 0.00044664094129107557, + "loss": 2.7869, + "step": 20656 + }, + { + "epoch": 1.01, + "grad_norm": 0.5250067710876465, + "learning_rate": 0.00044662750955205393, + "loss": 3.0707, + "step": 20657 + }, + { + "epoch": 1.01, + "grad_norm": 0.5365163087844849, + "learning_rate": 0.00044661407742684355, + "loss": 2.8563, + "step": 20658 + }, + { + "epoch": 1.01, + "grad_norm": 0.5403412580490112, + "learning_rate": 0.00044660064491548003, + "loss": 3.037, + "step": 20659 + }, + { + "epoch": 1.01, + "grad_norm": 0.5710760354995728, + "learning_rate": 0.00044658721201799856, + "loss": 2.9855, + "step": 20660 + }, + { + "epoch": 1.01, + "grad_norm": 0.5208877921104431, + "learning_rate": 0.00044657377873443454, + "loss": 3.0927, + "step": 20661 + }, + { + "epoch": 1.01, + "grad_norm": 0.5228715538978577, + "learning_rate": 0.00044656034506482354, + "loss": 3.0964, + "step": 20662 + }, + { + "epoch": 1.01, + "grad_norm": 0.5221062302589417, + "learning_rate": 0.00044654691100920067, + "loss": 3.1844, + "step": 20663 + }, + { + "epoch": 1.01, + "grad_norm": 0.5395482778549194, + "learning_rate": 0.00044653347656760145, + "loss": 3.2539, + "step": 20664 + }, + { + "epoch": 1.01, + "grad_norm": 0.5604360103607178, + "learning_rate": 0.00044652004174006133, + "loss": 3.3412, + "step": 20665 + }, + { + "epoch": 1.01, + "grad_norm": 0.5532729029655457, + "learning_rate": 0.0004465066065266156, + "loss": 3.1004, + "step": 20666 + }, + { + "epoch": 1.01, + "grad_norm": 0.5858287811279297, + "learning_rate": 0.0004464931709272996, + "loss": 3.0845, + "step": 20667 + }, + { + "epoch": 1.01, + "grad_norm": 0.506764829158783, + "learning_rate": 0.0004464797349421488, + "loss": 3.3258, + "step": 20668 + }, + { + "epoch": 1.01, + "grad_norm": 0.974445641040802, + "learning_rate": 0.00044646629857119854, + "loss": 2.916, + "step": 20669 + }, + { + "epoch": 1.01, + "grad_norm": 0.5399503111839294, + "learning_rate": 0.0004464528618144843, + "loss": 2.9863, + "step": 20670 + }, + { + "epoch": 1.01, + "grad_norm": 0.5604903101921082, + "learning_rate": 0.0004464394246720415, + "loss": 3.0492, + "step": 20671 + }, + { + "epoch": 1.01, + "grad_norm": 0.5462265014648438, + "learning_rate": 0.00044642598714390527, + "loss": 3.0305, + "step": 20672 + }, + { + "epoch": 1.01, + "grad_norm": 0.512052595615387, + "learning_rate": 0.00044641254923011124, + "loss": 3.0729, + "step": 20673 + }, + { + "epoch": 1.01, + "grad_norm": 0.521513044834137, + "learning_rate": 0.0004463991109306947, + "loss": 3.0554, + "step": 20674 + }, + { + "epoch": 1.01, + "grad_norm": 0.5272539258003235, + "learning_rate": 0.0004463856722456911, + "loss": 3.171, + "step": 20675 + }, + { + "epoch": 1.01, + "grad_norm": 0.5902304649353027, + "learning_rate": 0.00044637223317513583, + "loss": 3.1536, + "step": 20676 + }, + { + "epoch": 1.01, + "grad_norm": 0.5469979643821716, + "learning_rate": 0.00044635879371906427, + "loss": 2.9833, + "step": 20677 + }, + { + "epoch": 1.01, + "grad_norm": 0.5268833041191101, + "learning_rate": 0.0004463453538775118, + "loss": 3.2544, + "step": 20678 + }, + { + "epoch": 1.01, + "grad_norm": 0.5354952812194824, + "learning_rate": 0.0004463319136505138, + "loss": 2.8549, + "step": 20679 + }, + { + "epoch": 1.01, + "grad_norm": 0.6132659316062927, + "learning_rate": 0.0004463184730381057, + "loss": 2.9428, + "step": 20680 + }, + { + "epoch": 1.01, + "grad_norm": 0.5660962462425232, + "learning_rate": 0.000446305032040323, + "loss": 2.9525, + "step": 20681 + }, + { + "epoch": 1.01, + "grad_norm": 0.5585618019104004, + "learning_rate": 0.0004462915906572009, + "loss": 3.1878, + "step": 20682 + }, + { + "epoch": 1.01, + "grad_norm": 0.6061045527458191, + "learning_rate": 0.0004462781488887749, + "loss": 3.2639, + "step": 20683 + }, + { + "epoch": 1.01, + "grad_norm": 0.550226628780365, + "learning_rate": 0.00044626470673508043, + "loss": 3.0404, + "step": 20684 + }, + { + "epoch": 1.01, + "grad_norm": 0.5399933457374573, + "learning_rate": 0.00044625126419615296, + "loss": 3.0385, + "step": 20685 + }, + { + "epoch": 1.01, + "grad_norm": 0.5338366031646729, + "learning_rate": 0.0004462378212720277, + "loss": 3.1391, + "step": 20686 + }, + { + "epoch": 1.01, + "grad_norm": 0.5670849084854126, + "learning_rate": 0.00044622437796274016, + "loss": 3.1828, + "step": 20687 + }, + { + "epoch": 1.01, + "grad_norm": 0.5496315360069275, + "learning_rate": 0.0004462109342683259, + "loss": 2.9966, + "step": 20688 + }, + { + "epoch": 1.01, + "grad_norm": 0.5669832229614258, + "learning_rate": 0.00044619749018881994, + "loss": 2.9435, + "step": 20689 + }, + { + "epoch": 1.01, + "grad_norm": 0.5618574023246765, + "learning_rate": 0.0004461840457242581, + "loss": 3.1487, + "step": 20690 + }, + { + "epoch": 1.01, + "grad_norm": 0.5448206067085266, + "learning_rate": 0.00044617060087467556, + "loss": 3.1906, + "step": 20691 + }, + { + "epoch": 1.01, + "grad_norm": 0.5339007377624512, + "learning_rate": 0.0004461571556401078, + "loss": 3.164, + "step": 20692 + }, + { + "epoch": 1.01, + "grad_norm": 0.5732952952384949, + "learning_rate": 0.00044614371002059026, + "loss": 3.197, + "step": 20693 + }, + { + "epoch": 1.01, + "grad_norm": 0.5226903557777405, + "learning_rate": 0.0004461302640161582, + "loss": 3.1404, + "step": 20694 + }, + { + "epoch": 1.01, + "grad_norm": 0.547990620136261, + "learning_rate": 0.00044611681762684723, + "loss": 3.2876, + "step": 20695 + }, + { + "epoch": 1.01, + "grad_norm": 0.5328800082206726, + "learning_rate": 0.0004461033708526927, + "loss": 3.0255, + "step": 20696 + }, + { + "epoch": 1.01, + "grad_norm": 0.5429891347885132, + "learning_rate": 0.00044608992369372995, + "loss": 3.4003, + "step": 20697 + }, + { + "epoch": 1.01, + "grad_norm": 0.561837375164032, + "learning_rate": 0.00044607647614999454, + "loss": 3.0505, + "step": 20698 + }, + { + "epoch": 1.01, + "grad_norm": 0.5248981714248657, + "learning_rate": 0.00044606302822152176, + "loss": 3.3295, + "step": 20699 + }, + { + "epoch": 1.01, + "grad_norm": 0.5702486038208008, + "learning_rate": 0.000446049579908347, + "loss": 3.072, + "step": 20700 + }, + { + "epoch": 1.01, + "grad_norm": 0.5280510187149048, + "learning_rate": 0.0004460361312105058, + "loss": 3.1817, + "step": 20701 + }, + { + "epoch": 1.01, + "grad_norm": 0.5336136221885681, + "learning_rate": 0.0004460226821280336, + "loss": 3.1157, + "step": 20702 + }, + { + "epoch": 1.01, + "grad_norm": 0.523258626461029, + "learning_rate": 0.0004460092326609658, + "loss": 3.1815, + "step": 20703 + }, + { + "epoch": 1.01, + "grad_norm": 0.5246843099594116, + "learning_rate": 0.00044599578280933756, + "loss": 3.0741, + "step": 20704 + }, + { + "epoch": 1.01, + "grad_norm": 0.5236319303512573, + "learning_rate": 0.00044598233257318474, + "loss": 2.9568, + "step": 20705 + }, + { + "epoch": 1.01, + "grad_norm": 0.5435445308685303, + "learning_rate": 0.0004459688819525425, + "loss": 2.9715, + "step": 20706 + }, + { + "epoch": 1.01, + "grad_norm": 0.5246100425720215, + "learning_rate": 0.00044595543094744623, + "loss": 3.0166, + "step": 20707 + }, + { + "epoch": 1.01, + "grad_norm": 0.5761013627052307, + "learning_rate": 0.00044594197955793156, + "loss": 3.0693, + "step": 20708 + }, + { + "epoch": 1.01, + "grad_norm": 0.5138135552406311, + "learning_rate": 0.00044592852778403366, + "loss": 2.9992, + "step": 20709 + }, + { + "epoch": 1.01, + "grad_norm": 0.5575985312461853, + "learning_rate": 0.00044591507562578825, + "loss": 2.9294, + "step": 20710 + }, + { + "epoch": 1.01, + "grad_norm": 0.5381579995155334, + "learning_rate": 0.0004459016230832306, + "loss": 3.0036, + "step": 20711 + }, + { + "epoch": 1.02, + "grad_norm": 0.5545154213905334, + "learning_rate": 0.00044588817015639605, + "loss": 3.0897, + "step": 20712 + }, + { + "epoch": 1.02, + "grad_norm": 0.507051408290863, + "learning_rate": 0.00044587471684532016, + "loss": 3.0543, + "step": 20713 + }, + { + "epoch": 1.02, + "grad_norm": 0.5791917443275452, + "learning_rate": 0.00044586126315003836, + "loss": 3.2333, + "step": 20714 + }, + { + "epoch": 1.02, + "grad_norm": 0.5103766322135925, + "learning_rate": 0.0004458478090705861, + "loss": 3.193, + "step": 20715 + }, + { + "epoch": 1.02, + "grad_norm": 0.5501724481582642, + "learning_rate": 0.00044583435460699875, + "loss": 3.0267, + "step": 20716 + }, + { + "epoch": 1.02, + "grad_norm": 0.5263850092887878, + "learning_rate": 0.00044582089975931185, + "loss": 3.2196, + "step": 20717 + }, + { + "epoch": 1.02, + "grad_norm": 0.5362185835838318, + "learning_rate": 0.0004458074445275607, + "loss": 2.9511, + "step": 20718 + }, + { + "epoch": 1.02, + "grad_norm": 0.535193920135498, + "learning_rate": 0.0004457939889117807, + "loss": 3.2362, + "step": 20719 + }, + { + "epoch": 1.02, + "grad_norm": 0.5330086350440979, + "learning_rate": 0.0004457805329120075, + "loss": 2.9531, + "step": 20720 + }, + { + "epoch": 1.02, + "grad_norm": 0.5201607942581177, + "learning_rate": 0.00044576707652827646, + "loss": 3.2588, + "step": 20721 + }, + { + "epoch": 1.02, + "grad_norm": 0.5220875144004822, + "learning_rate": 0.000445753619760623, + "loss": 3.0818, + "step": 20722 + }, + { + "epoch": 1.02, + "grad_norm": 0.5611007213592529, + "learning_rate": 0.0004457401626090825, + "loss": 3.1012, + "step": 20723 + }, + { + "epoch": 1.02, + "grad_norm": 0.5594449639320374, + "learning_rate": 0.0004457267050736904, + "loss": 3.3061, + "step": 20724 + }, + { + "epoch": 1.02, + "grad_norm": 0.5162807106971741, + "learning_rate": 0.00044571324715448235, + "loss": 3.0391, + "step": 20725 + }, + { + "epoch": 1.02, + "grad_norm": 0.5748878717422485, + "learning_rate": 0.00044569978885149354, + "loss": 3.1104, + "step": 20726 + }, + { + "epoch": 1.02, + "grad_norm": 0.5533029437065125, + "learning_rate": 0.0004456863301647596, + "loss": 3.0481, + "step": 20727 + }, + { + "epoch": 1.02, + "grad_norm": 0.5461177229881287, + "learning_rate": 0.00044567287109431586, + "loss": 3.0085, + "step": 20728 + }, + { + "epoch": 1.02, + "grad_norm": 0.5752653479576111, + "learning_rate": 0.00044565941164019784, + "loss": 3.1422, + "step": 20729 + }, + { + "epoch": 1.02, + "grad_norm": 0.5168489813804626, + "learning_rate": 0.00044564595180244095, + "loss": 3.0288, + "step": 20730 + }, + { + "epoch": 1.02, + "grad_norm": 0.5368278622627258, + "learning_rate": 0.00044563249158108064, + "loss": 3.215, + "step": 20731 + }, + { + "epoch": 1.02, + "grad_norm": 0.5282604098320007, + "learning_rate": 0.00044561903097615243, + "loss": 3.0692, + "step": 20732 + }, + { + "epoch": 1.02, + "grad_norm": 0.568565309047699, + "learning_rate": 0.00044560556998769166, + "loss": 3.1969, + "step": 20733 + }, + { + "epoch": 1.02, + "grad_norm": 0.5004165172576904, + "learning_rate": 0.0004455921086157338, + "loss": 2.8507, + "step": 20734 + }, + { + "epoch": 1.02, + "grad_norm": 0.5740750432014465, + "learning_rate": 0.0004455786468603144, + "loss": 3.0852, + "step": 20735 + }, + { + "epoch": 1.02, + "grad_norm": 0.5363320112228394, + "learning_rate": 0.0004455651847214689, + "loss": 2.942, + "step": 20736 + }, + { + "epoch": 1.02, + "grad_norm": 0.5179160237312317, + "learning_rate": 0.00044555172219923263, + "loss": 3.0065, + "step": 20737 + }, + { + "epoch": 1.02, + "grad_norm": 0.5659471154212952, + "learning_rate": 0.0004455382592936412, + "loss": 3.1902, + "step": 20738 + }, + { + "epoch": 1.02, + "grad_norm": 0.514021635055542, + "learning_rate": 0.00044552479600473, + "loss": 2.9964, + "step": 20739 + }, + { + "epoch": 1.02, + "grad_norm": 0.5664601922035217, + "learning_rate": 0.00044551133233253443, + "loss": 2.9117, + "step": 20740 + }, + { + "epoch": 1.02, + "grad_norm": 0.5283511877059937, + "learning_rate": 0.0004454978682770901, + "loss": 3.3048, + "step": 20741 + }, + { + "epoch": 1.02, + "grad_norm": 0.5478894710540771, + "learning_rate": 0.0004454844038384323, + "loss": 3.247, + "step": 20742 + }, + { + "epoch": 1.02, + "grad_norm": 0.5319153666496277, + "learning_rate": 0.0004454709390165967, + "loss": 3.2392, + "step": 20743 + }, + { + "epoch": 1.02, + "grad_norm": 0.5409319400787354, + "learning_rate": 0.0004454574738116186, + "loss": 3.0977, + "step": 20744 + }, + { + "epoch": 1.02, + "grad_norm": 0.5745616555213928, + "learning_rate": 0.0004454440082235334, + "loss": 3.072, + "step": 20745 + }, + { + "epoch": 1.02, + "grad_norm": 0.6125838756561279, + "learning_rate": 0.0004454305422523768, + "loss": 3.2891, + "step": 20746 + }, + { + "epoch": 1.02, + "grad_norm": 0.5295475721359253, + "learning_rate": 0.00044541707589818404, + "loss": 3.3818, + "step": 20747 + }, + { + "epoch": 1.02, + "grad_norm": 0.588213324546814, + "learning_rate": 0.0004454036091609908, + "loss": 3.2082, + "step": 20748 + }, + { + "epoch": 1.02, + "grad_norm": 0.5516581535339355, + "learning_rate": 0.0004453901420408324, + "loss": 3.3228, + "step": 20749 + }, + { + "epoch": 1.02, + "grad_norm": 0.5377952456474304, + "learning_rate": 0.0004453766745377442, + "loss": 3.0391, + "step": 20750 + }, + { + "epoch": 1.02, + "grad_norm": 0.5352370738983154, + "learning_rate": 0.000445363206651762, + "loss": 3.2461, + "step": 20751 + }, + { + "epoch": 1.02, + "grad_norm": 0.5557248592376709, + "learning_rate": 0.0004453497383829211, + "loss": 2.8931, + "step": 20752 + }, + { + "epoch": 1.02, + "grad_norm": 0.5509526133537292, + "learning_rate": 0.00044533626973125687, + "loss": 3.1886, + "step": 20753 + }, + { + "epoch": 1.02, + "grad_norm": 0.5012487173080444, + "learning_rate": 0.0004453228006968049, + "loss": 2.9703, + "step": 20754 + }, + { + "epoch": 1.02, + "grad_norm": 0.5457666516304016, + "learning_rate": 0.0004453093312796006, + "loss": 2.8753, + "step": 20755 + }, + { + "epoch": 1.02, + "grad_norm": 0.5392427444458008, + "learning_rate": 0.0004452958614796795, + "loss": 3.1255, + "step": 20756 + }, + { + "epoch": 1.02, + "grad_norm": 0.5236079096794128, + "learning_rate": 0.0004452823912970772, + "loss": 2.8528, + "step": 20757 + }, + { + "epoch": 1.02, + "grad_norm": 0.5426657795906067, + "learning_rate": 0.0004452689207318289, + "loss": 3.0241, + "step": 20758 + }, + { + "epoch": 1.02, + "grad_norm": 0.5343982577323914, + "learning_rate": 0.00044525544978397025, + "loss": 3.1399, + "step": 20759 + }, + { + "epoch": 1.02, + "grad_norm": 0.535350501537323, + "learning_rate": 0.0004452419784535367, + "loss": 3.1378, + "step": 20760 + }, + { + "epoch": 1.02, + "grad_norm": 0.5450648665428162, + "learning_rate": 0.0004452285067405638, + "loss": 3.1517, + "step": 20761 + }, + { + "epoch": 1.02, + "grad_norm": 0.5389657020568848, + "learning_rate": 0.0004452150346450869, + "loss": 3.3121, + "step": 20762 + }, + { + "epoch": 1.02, + "grad_norm": 0.5154426693916321, + "learning_rate": 0.00044520156216714145, + "loss": 3.1708, + "step": 20763 + }, + { + "epoch": 1.02, + "grad_norm": 0.5560336112976074, + "learning_rate": 0.0004451880893067632, + "loss": 3.0882, + "step": 20764 + }, + { + "epoch": 1.02, + "grad_norm": 0.5327739119529724, + "learning_rate": 0.0004451746160639874, + "loss": 3.1679, + "step": 20765 + }, + { + "epoch": 1.02, + "grad_norm": 0.5418607592582703, + "learning_rate": 0.0004451611424388496, + "loss": 3.1208, + "step": 20766 + }, + { + "epoch": 1.02, + "grad_norm": 0.57447749376297, + "learning_rate": 0.0004451476684313852, + "loss": 2.9681, + "step": 20767 + }, + { + "epoch": 1.02, + "grad_norm": 0.5182052850723267, + "learning_rate": 0.0004451341940416299, + "loss": 3.2641, + "step": 20768 + }, + { + "epoch": 1.02, + "grad_norm": 0.5607951283454895, + "learning_rate": 0.00044512071926961904, + "loss": 3.117, + "step": 20769 + }, + { + "epoch": 1.02, + "grad_norm": 0.5186867713928223, + "learning_rate": 0.0004451072441153881, + "loss": 3.0579, + "step": 20770 + }, + { + "epoch": 1.02, + "grad_norm": 0.5786254405975342, + "learning_rate": 0.0004450937685789725, + "loss": 3.1559, + "step": 20771 + }, + { + "epoch": 1.02, + "grad_norm": 0.5199046730995178, + "learning_rate": 0.0004450802926604081, + "loss": 3.0105, + "step": 20772 + }, + { + "epoch": 1.02, + "grad_norm": 0.5281519293785095, + "learning_rate": 0.00044506681635972996, + "loss": 3.0246, + "step": 20773 + }, + { + "epoch": 1.02, + "grad_norm": 0.4995059072971344, + "learning_rate": 0.0004450533396769737, + "loss": 3.2023, + "step": 20774 + }, + { + "epoch": 1.02, + "grad_norm": 0.526586651802063, + "learning_rate": 0.000445039862612175, + "loss": 2.8645, + "step": 20775 + }, + { + "epoch": 1.02, + "grad_norm": 0.5115561485290527, + "learning_rate": 0.00044502638516536906, + "loss": 2.8865, + "step": 20776 + }, + { + "epoch": 1.02, + "grad_norm": 0.5567725896835327, + "learning_rate": 0.00044501290733659166, + "loss": 3.3244, + "step": 20777 + }, + { + "epoch": 1.02, + "grad_norm": 0.5572482943534851, + "learning_rate": 0.00044499942912587813, + "loss": 3.0157, + "step": 20778 + }, + { + "epoch": 1.02, + "grad_norm": 0.5388816595077515, + "learning_rate": 0.00044498595053326403, + "loss": 2.9588, + "step": 20779 + }, + { + "epoch": 1.02, + "grad_norm": 0.5826587677001953, + "learning_rate": 0.0004449724715587849, + "loss": 2.875, + "step": 20780 + }, + { + "epoch": 1.02, + "grad_norm": 0.5563898682594299, + "learning_rate": 0.000444958992202476, + "loss": 2.9278, + "step": 20781 + }, + { + "epoch": 1.02, + "grad_norm": 0.5438665747642517, + "learning_rate": 0.0004449455124643731, + "loss": 3.1942, + "step": 20782 + }, + { + "epoch": 1.02, + "grad_norm": 0.5232053995132446, + "learning_rate": 0.00044493203234451166, + "loss": 3.0504, + "step": 20783 + }, + { + "epoch": 1.02, + "grad_norm": 0.5295712947845459, + "learning_rate": 0.00044491855184292713, + "loss": 3.0075, + "step": 20784 + }, + { + "epoch": 1.02, + "grad_norm": 0.5307983756065369, + "learning_rate": 0.000444905070959655, + "loss": 3.025, + "step": 20785 + }, + { + "epoch": 1.02, + "grad_norm": 0.619365394115448, + "learning_rate": 0.00044489158969473076, + "loss": 2.8898, + "step": 20786 + }, + { + "epoch": 1.02, + "grad_norm": 0.5883886218070984, + "learning_rate": 0.00044487810804819, + "loss": 2.9573, + "step": 20787 + }, + { + "epoch": 1.02, + "grad_norm": 0.5858182907104492, + "learning_rate": 0.0004448646260200682, + "loss": 3.2317, + "step": 20788 + }, + { + "epoch": 1.02, + "grad_norm": 0.5257620215415955, + "learning_rate": 0.0004448511436104008, + "loss": 3.1254, + "step": 20789 + }, + { + "epoch": 1.02, + "grad_norm": 0.5104060173034668, + "learning_rate": 0.00044483766081922346, + "loss": 3.1547, + "step": 20790 + }, + { + "epoch": 1.02, + "grad_norm": 0.5185402035713196, + "learning_rate": 0.00044482417764657147, + "loss": 3.099, + "step": 20791 + }, + { + "epoch": 1.02, + "grad_norm": 0.5819932818412781, + "learning_rate": 0.00044481069409248056, + "loss": 3.0686, + "step": 20792 + }, + { + "epoch": 1.02, + "grad_norm": 0.5629081726074219, + "learning_rate": 0.00044479721015698614, + "loss": 3.1719, + "step": 20793 + }, + { + "epoch": 1.02, + "grad_norm": 0.5439802408218384, + "learning_rate": 0.00044478372584012376, + "loss": 3.1272, + "step": 20794 + }, + { + "epoch": 1.02, + "grad_norm": 0.5107980370521545, + "learning_rate": 0.0004447702411419289, + "loss": 3.1131, + "step": 20795 + }, + { + "epoch": 1.02, + "grad_norm": 0.5272974967956543, + "learning_rate": 0.000444756756062437, + "loss": 3.1204, + "step": 20796 + }, + { + "epoch": 1.02, + "grad_norm": 0.5283223986625671, + "learning_rate": 0.00044474327060168374, + "loss": 3.1007, + "step": 20797 + }, + { + "epoch": 1.02, + "grad_norm": 0.5268027186393738, + "learning_rate": 0.00044472978475970453, + "loss": 3.1367, + "step": 20798 + }, + { + "epoch": 1.02, + "grad_norm": 0.5282880663871765, + "learning_rate": 0.00044471629853653496, + "loss": 3.08, + "step": 20799 + }, + { + "epoch": 1.02, + "grad_norm": 0.6580334901809692, + "learning_rate": 0.0004447028119322105, + "loss": 3.0333, + "step": 20800 + }, + { + "epoch": 1.02, + "grad_norm": 0.5694079995155334, + "learning_rate": 0.0004446893249467666, + "loss": 3.1917, + "step": 20801 + }, + { + "epoch": 1.02, + "grad_norm": 0.5710819959640503, + "learning_rate": 0.00044467583758023895, + "loss": 3.1855, + "step": 20802 + }, + { + "epoch": 1.02, + "grad_norm": 0.5371394157409668, + "learning_rate": 0.000444662349832663, + "loss": 3.1907, + "step": 20803 + }, + { + "epoch": 1.02, + "grad_norm": 0.5172489285469055, + "learning_rate": 0.0004446488617040742, + "loss": 3.2504, + "step": 20804 + }, + { + "epoch": 1.02, + "grad_norm": 0.531115710735321, + "learning_rate": 0.0004446353731945082, + "loss": 2.9735, + "step": 20805 + }, + { + "epoch": 1.02, + "grad_norm": 0.49821171164512634, + "learning_rate": 0.00044462188430400044, + "loss": 3.0562, + "step": 20806 + }, + { + "epoch": 1.02, + "grad_norm": 0.6454533338546753, + "learning_rate": 0.00044460839503258643, + "loss": 3.1137, + "step": 20807 + }, + { + "epoch": 1.02, + "grad_norm": 0.64560866355896, + "learning_rate": 0.0004445949053803018, + "loss": 3.0143, + "step": 20808 + }, + { + "epoch": 1.02, + "grad_norm": 0.6050903797149658, + "learning_rate": 0.000444581415347182, + "loss": 3.0204, + "step": 20809 + }, + { + "epoch": 1.02, + "grad_norm": 0.5002428293228149, + "learning_rate": 0.00044456792493326256, + "loss": 2.9459, + "step": 20810 + }, + { + "epoch": 1.02, + "grad_norm": 0.534598708152771, + "learning_rate": 0.000444554434138579, + "loss": 3.1241, + "step": 20811 + }, + { + "epoch": 1.02, + "grad_norm": 0.6012741923332214, + "learning_rate": 0.0004445409429631669, + "loss": 3.0102, + "step": 20812 + }, + { + "epoch": 1.02, + "grad_norm": 0.5511965155601501, + "learning_rate": 0.0004445274514070618, + "loss": 3.1177, + "step": 20813 + }, + { + "epoch": 1.02, + "grad_norm": 0.5788381099700928, + "learning_rate": 0.0004445139594702992, + "loss": 3.2391, + "step": 20814 + }, + { + "epoch": 1.02, + "grad_norm": 0.6373004913330078, + "learning_rate": 0.0004445004671529147, + "loss": 3.0853, + "step": 20815 + }, + { + "epoch": 1.02, + "grad_norm": 0.5785526633262634, + "learning_rate": 0.00044448697445494367, + "loss": 3.1423, + "step": 20816 + }, + { + "epoch": 1.02, + "grad_norm": 0.5426591634750366, + "learning_rate": 0.00044447348137642177, + "loss": 3.3273, + "step": 20817 + }, + { + "epoch": 1.02, + "grad_norm": 0.5850669145584106, + "learning_rate": 0.00044445998791738453, + "loss": 2.9099, + "step": 20818 + }, + { + "epoch": 1.02, + "grad_norm": 0.5589839816093445, + "learning_rate": 0.0004444464940778676, + "loss": 3.0122, + "step": 20819 + }, + { + "epoch": 1.02, + "grad_norm": 0.5280645489692688, + "learning_rate": 0.0004444329998579063, + "loss": 3.1035, + "step": 20820 + }, + { + "epoch": 1.02, + "grad_norm": 0.593413233757019, + "learning_rate": 0.00044441950525753626, + "loss": 3.1984, + "step": 20821 + }, + { + "epoch": 1.02, + "grad_norm": 0.5713319182395935, + "learning_rate": 0.00044440601027679303, + "loss": 3.2343, + "step": 20822 + }, + { + "epoch": 1.02, + "grad_norm": 0.5799878239631653, + "learning_rate": 0.0004443925149157123, + "loss": 2.8615, + "step": 20823 + }, + { + "epoch": 1.02, + "grad_norm": 0.5746982097625732, + "learning_rate": 0.0004443790191743293, + "loss": 3.1915, + "step": 20824 + }, + { + "epoch": 1.02, + "grad_norm": 0.5620375275611877, + "learning_rate": 0.00044436552305267984, + "loss": 2.9491, + "step": 20825 + }, + { + "epoch": 1.02, + "grad_norm": 0.5882107615470886, + "learning_rate": 0.00044435202655079934, + "loss": 3.0635, + "step": 20826 + }, + { + "epoch": 1.02, + "grad_norm": 0.5356667637825012, + "learning_rate": 0.0004443385296687234, + "loss": 2.9083, + "step": 20827 + }, + { + "epoch": 1.02, + "grad_norm": 0.5147374868392944, + "learning_rate": 0.00044432503240648757, + "loss": 3.1242, + "step": 20828 + }, + { + "epoch": 1.02, + "grad_norm": 0.550109326839447, + "learning_rate": 0.00044431153476412737, + "loss": 2.9514, + "step": 20829 + }, + { + "epoch": 1.02, + "grad_norm": 0.5489082336425781, + "learning_rate": 0.0004442980367416784, + "loss": 3.1057, + "step": 20830 + }, + { + "epoch": 1.02, + "grad_norm": 0.49882984161376953, + "learning_rate": 0.0004442845383391761, + "loss": 3.1469, + "step": 20831 + }, + { + "epoch": 1.02, + "grad_norm": 0.6269406080245972, + "learning_rate": 0.00044427103955665606, + "loss": 3.0736, + "step": 20832 + }, + { + "epoch": 1.02, + "grad_norm": 0.5686376690864563, + "learning_rate": 0.00044425754039415394, + "loss": 3.1878, + "step": 20833 + }, + { + "epoch": 1.02, + "grad_norm": 0.5298441052436829, + "learning_rate": 0.00044424404085170526, + "loss": 3.2007, + "step": 20834 + }, + { + "epoch": 1.02, + "grad_norm": 0.5424293279647827, + "learning_rate": 0.00044423054092934547, + "loss": 3.182, + "step": 20835 + }, + { + "epoch": 1.02, + "grad_norm": 0.5576657652854919, + "learning_rate": 0.0004442170406271102, + "loss": 3.2624, + "step": 20836 + }, + { + "epoch": 1.02, + "grad_norm": 0.5367722511291504, + "learning_rate": 0.00044420353994503503, + "loss": 3.0781, + "step": 20837 + }, + { + "epoch": 1.02, + "grad_norm": 0.57781982421875, + "learning_rate": 0.00044419003888315544, + "loss": 3.0836, + "step": 20838 + }, + { + "epoch": 1.02, + "grad_norm": 0.5427062511444092, + "learning_rate": 0.00044417653744150705, + "loss": 3.1441, + "step": 20839 + }, + { + "epoch": 1.02, + "grad_norm": 0.5554705858230591, + "learning_rate": 0.0004441630356201254, + "loss": 2.918, + "step": 20840 + }, + { + "epoch": 1.02, + "grad_norm": 0.5354554653167725, + "learning_rate": 0.00044414953341904615, + "loss": 2.9882, + "step": 20841 + }, + { + "epoch": 1.02, + "grad_norm": 0.5082659721374512, + "learning_rate": 0.0004441360308383047, + "loss": 3.0436, + "step": 20842 + }, + { + "epoch": 1.02, + "grad_norm": 0.5301640033721924, + "learning_rate": 0.00044412252787793665, + "loss": 2.9847, + "step": 20843 + }, + { + "epoch": 1.02, + "grad_norm": 0.5346314907073975, + "learning_rate": 0.0004441090245379776, + "loss": 3.1013, + "step": 20844 + }, + { + "epoch": 1.02, + "grad_norm": 0.5592143535614014, + "learning_rate": 0.0004440955208184632, + "loss": 3.0265, + "step": 20845 + }, + { + "epoch": 1.02, + "grad_norm": 0.5529156923294067, + "learning_rate": 0.00044408201671942884, + "loss": 2.9318, + "step": 20846 + }, + { + "epoch": 1.02, + "grad_norm": 0.5625426769256592, + "learning_rate": 0.0004440685122409102, + "loss": 3.1834, + "step": 20847 + }, + { + "epoch": 1.02, + "grad_norm": 0.5384853482246399, + "learning_rate": 0.00044405500738294284, + "loss": 3.1567, + "step": 20848 + }, + { + "epoch": 1.02, + "grad_norm": 0.5495803952217102, + "learning_rate": 0.0004440415021455624, + "loss": 3.1289, + "step": 20849 + }, + { + "epoch": 1.02, + "grad_norm": 0.5577335357666016, + "learning_rate": 0.0004440279965288042, + "loss": 3.0094, + "step": 20850 + }, + { + "epoch": 1.02, + "grad_norm": 0.6182305216789246, + "learning_rate": 0.0004440144905327041, + "loss": 3.1001, + "step": 20851 + }, + { + "epoch": 1.02, + "grad_norm": 0.5557514429092407, + "learning_rate": 0.00044400098415729754, + "loss": 3.1761, + "step": 20852 + }, + { + "epoch": 1.02, + "grad_norm": 0.5622000694274902, + "learning_rate": 0.00044398747740261995, + "loss": 2.9326, + "step": 20853 + }, + { + "epoch": 1.02, + "grad_norm": 0.5277696251869202, + "learning_rate": 0.00044397397026870724, + "loss": 3.0355, + "step": 20854 + }, + { + "epoch": 1.02, + "grad_norm": 0.5236798524856567, + "learning_rate": 0.0004439604627555947, + "loss": 3.1484, + "step": 20855 + }, + { + "epoch": 1.02, + "grad_norm": 0.5746625661849976, + "learning_rate": 0.0004439469548633181, + "loss": 3.1309, + "step": 20856 + }, + { + "epoch": 1.02, + "grad_norm": 0.5382353663444519, + "learning_rate": 0.00044393344659191284, + "loss": 3.0546, + "step": 20857 + }, + { + "epoch": 1.02, + "grad_norm": 0.5459259748458862, + "learning_rate": 0.00044391993794141456, + "loss": 3.1935, + "step": 20858 + }, + { + "epoch": 1.02, + "grad_norm": 0.5256115794181824, + "learning_rate": 0.000443906428911859, + "loss": 3.2811, + "step": 20859 + }, + { + "epoch": 1.02, + "grad_norm": 0.5325765013694763, + "learning_rate": 0.00044389291950328144, + "loss": 3.2127, + "step": 20860 + }, + { + "epoch": 1.02, + "grad_norm": 0.5285824537277222, + "learning_rate": 0.00044387940971571773, + "loss": 3.1977, + "step": 20861 + }, + { + "epoch": 1.02, + "grad_norm": 0.5795076489448547, + "learning_rate": 0.00044386589954920324, + "loss": 3.1272, + "step": 20862 + }, + { + "epoch": 1.02, + "grad_norm": 0.522125780582428, + "learning_rate": 0.0004438523890037738, + "loss": 3.1669, + "step": 20863 + }, + { + "epoch": 1.02, + "grad_norm": 0.5119063258171082, + "learning_rate": 0.0004438388780794647, + "loss": 3.2521, + "step": 20864 + }, + { + "epoch": 1.02, + "grad_norm": 0.5453227758407593, + "learning_rate": 0.00044382536677631176, + "loss": 3.1975, + "step": 20865 + }, + { + "epoch": 1.02, + "grad_norm": 0.5346236228942871, + "learning_rate": 0.0004438118550943504, + "loss": 3.1095, + "step": 20866 + }, + { + "epoch": 1.02, + "grad_norm": 0.5401219129562378, + "learning_rate": 0.00044379834303361645, + "loss": 3.1477, + "step": 20867 + }, + { + "epoch": 1.02, + "grad_norm": 0.5600337386131287, + "learning_rate": 0.0004437848305941452, + "loss": 3.135, + "step": 20868 + }, + { + "epoch": 1.02, + "grad_norm": 0.5327202081680298, + "learning_rate": 0.00044377131777597245, + "loss": 2.9688, + "step": 20869 + }, + { + "epoch": 1.02, + "grad_norm": 0.557734489440918, + "learning_rate": 0.0004437578045791338, + "loss": 3.2244, + "step": 20870 + }, + { + "epoch": 1.02, + "grad_norm": 0.5608131885528564, + "learning_rate": 0.0004437442910036646, + "loss": 3.2067, + "step": 20871 + }, + { + "epoch": 1.02, + "grad_norm": 0.5300242304801941, + "learning_rate": 0.0004437307770496007, + "loss": 3.3376, + "step": 20872 + }, + { + "epoch": 1.02, + "grad_norm": 0.5177008509635925, + "learning_rate": 0.00044371726271697745, + "loss": 3.0261, + "step": 20873 + }, + { + "epoch": 1.02, + "grad_norm": 0.5148898959159851, + "learning_rate": 0.0004437037480058308, + "loss": 3.2279, + "step": 20874 + }, + { + "epoch": 1.02, + "grad_norm": 0.5264756679534912, + "learning_rate": 0.000443690232916196, + "loss": 3.0305, + "step": 20875 + }, + { + "epoch": 1.02, + "grad_norm": 0.5689302682876587, + "learning_rate": 0.00044367671744810884, + "loss": 3.263, + "step": 20876 + }, + { + "epoch": 1.02, + "grad_norm": 0.5246378779411316, + "learning_rate": 0.00044366320160160483, + "loss": 3.2429, + "step": 20877 + }, + { + "epoch": 1.02, + "grad_norm": 0.5417928695678711, + "learning_rate": 0.0004436496853767196, + "loss": 3.1085, + "step": 20878 + }, + { + "epoch": 1.02, + "grad_norm": 0.5359458327293396, + "learning_rate": 0.0004436361687734887, + "loss": 3.1174, + "step": 20879 + }, + { + "epoch": 1.02, + "grad_norm": 0.5729898810386658, + "learning_rate": 0.00044362265179194785, + "loss": 2.9681, + "step": 20880 + }, + { + "epoch": 1.02, + "grad_norm": 0.5482082366943359, + "learning_rate": 0.00044360913443213255, + "loss": 3.1622, + "step": 20881 + }, + { + "epoch": 1.02, + "grad_norm": 0.5981318354606628, + "learning_rate": 0.0004435956166940784, + "loss": 3.0729, + "step": 20882 + }, + { + "epoch": 1.02, + "grad_norm": 0.5297590494155884, + "learning_rate": 0.00044358209857782107, + "loss": 2.9377, + "step": 20883 + }, + { + "epoch": 1.02, + "grad_norm": 0.5387971997261047, + "learning_rate": 0.00044356858008339606, + "loss": 3.1172, + "step": 20884 + }, + { + "epoch": 1.02, + "grad_norm": 0.5694130063056946, + "learning_rate": 0.0004435550612108391, + "loss": 3.0498, + "step": 20885 + }, + { + "epoch": 1.02, + "grad_norm": 0.5102159380912781, + "learning_rate": 0.00044354154196018576, + "loss": 3.0899, + "step": 20886 + }, + { + "epoch": 1.02, + "grad_norm": 0.5234137773513794, + "learning_rate": 0.0004435280223314715, + "loss": 3.0329, + "step": 20887 + }, + { + "epoch": 1.02, + "grad_norm": 0.5225757956504822, + "learning_rate": 0.00044351450232473215, + "loss": 3.3125, + "step": 20888 + }, + { + "epoch": 1.02, + "grad_norm": 0.5035845041275024, + "learning_rate": 0.00044350098194000326, + "loss": 3.1396, + "step": 20889 + }, + { + "epoch": 1.02, + "grad_norm": 0.4975343942642212, + "learning_rate": 0.00044348746117732035, + "loss": 3.2713, + "step": 20890 + }, + { + "epoch": 1.02, + "grad_norm": 0.49224379658699036, + "learning_rate": 0.00044347394003671904, + "loss": 3.1967, + "step": 20891 + }, + { + "epoch": 1.02, + "grad_norm": 0.5111834406852722, + "learning_rate": 0.0004434604185182351, + "loss": 3.0417, + "step": 20892 + }, + { + "epoch": 1.02, + "grad_norm": 0.5302641987800598, + "learning_rate": 0.00044344689662190396, + "loss": 2.9933, + "step": 20893 + }, + { + "epoch": 1.02, + "grad_norm": 0.5250119566917419, + "learning_rate": 0.00044343337434776125, + "loss": 3.0811, + "step": 20894 + }, + { + "epoch": 1.02, + "grad_norm": 0.5314276218414307, + "learning_rate": 0.00044341985169584267, + "loss": 3.2476, + "step": 20895 + }, + { + "epoch": 1.02, + "grad_norm": 0.5408056974411011, + "learning_rate": 0.0004434063286661838, + "loss": 3.0161, + "step": 20896 + }, + { + "epoch": 1.02, + "grad_norm": 0.5244573354721069, + "learning_rate": 0.00044339280525882026, + "loss": 3.2421, + "step": 20897 + }, + { + "epoch": 1.02, + "grad_norm": 0.5429604053497314, + "learning_rate": 0.0004433792814737877, + "loss": 3.1625, + "step": 20898 + }, + { + "epoch": 1.02, + "grad_norm": 0.5206112265586853, + "learning_rate": 0.00044336575731112167, + "loss": 3.0023, + "step": 20899 + }, + { + "epoch": 1.02, + "grad_norm": 0.5225122570991516, + "learning_rate": 0.0004433522327708579, + "loss": 3.059, + "step": 20900 + }, + { + "epoch": 1.02, + "grad_norm": 0.5603109002113342, + "learning_rate": 0.0004433387078530318, + "loss": 2.9529, + "step": 20901 + }, + { + "epoch": 1.02, + "grad_norm": 0.536223828792572, + "learning_rate": 0.00044332518255767926, + "loss": 2.9272, + "step": 20902 + }, + { + "epoch": 1.02, + "grad_norm": 0.5303646922111511, + "learning_rate": 0.00044331165688483575, + "loss": 3.1247, + "step": 20903 + }, + { + "epoch": 1.02, + "grad_norm": 0.5381811261177063, + "learning_rate": 0.00044329813083453685, + "loss": 3.1472, + "step": 20904 + }, + { + "epoch": 1.02, + "grad_norm": 0.5109459757804871, + "learning_rate": 0.0004432846044068183, + "loss": 3.186, + "step": 20905 + }, + { + "epoch": 1.02, + "grad_norm": 0.5285735726356506, + "learning_rate": 0.00044327107760171565, + "loss": 3.1352, + "step": 20906 + }, + { + "epoch": 1.02, + "grad_norm": 0.5155451893806458, + "learning_rate": 0.0004432575504192646, + "loss": 2.8308, + "step": 20907 + }, + { + "epoch": 1.02, + "grad_norm": 0.5433021187782288, + "learning_rate": 0.00044324402285950067, + "loss": 3.295, + "step": 20908 + }, + { + "epoch": 1.02, + "grad_norm": 0.5422189235687256, + "learning_rate": 0.00044323049492245954, + "loss": 3.0311, + "step": 20909 + }, + { + "epoch": 1.02, + "grad_norm": 0.5412802696228027, + "learning_rate": 0.000443216966608177, + "loss": 2.9925, + "step": 20910 + }, + { + "epoch": 1.02, + "grad_norm": 0.5609269142150879, + "learning_rate": 0.00044320343791668835, + "loss": 3.1073, + "step": 20911 + }, + { + "epoch": 1.02, + "grad_norm": 0.5714790225028992, + "learning_rate": 0.0004431899088480295, + "loss": 2.9488, + "step": 20912 + }, + { + "epoch": 1.02, + "grad_norm": 0.5238274931907654, + "learning_rate": 0.00044317637940223596, + "loss": 3.0765, + "step": 20913 + }, + { + "epoch": 1.02, + "grad_norm": 0.5142273902893066, + "learning_rate": 0.0004431628495793434, + "loss": 2.9384, + "step": 20914 + }, + { + "epoch": 1.02, + "grad_norm": 0.5264378786087036, + "learning_rate": 0.00044314931937938746, + "loss": 3.1992, + "step": 20915 + }, + { + "epoch": 1.03, + "grad_norm": 0.5493513345718384, + "learning_rate": 0.00044313578880240376, + "loss": 3.225, + "step": 20916 + }, + { + "epoch": 1.03, + "grad_norm": 0.5445050597190857, + "learning_rate": 0.00044312225784842794, + "loss": 3.0925, + "step": 20917 + }, + { + "epoch": 1.03, + "grad_norm": 0.5303078889846802, + "learning_rate": 0.0004431087265174957, + "loss": 3.0849, + "step": 20918 + }, + { + "epoch": 1.03, + "grad_norm": 0.5611590147018433, + "learning_rate": 0.0004430951948096425, + "loss": 3.0506, + "step": 20919 + }, + { + "epoch": 1.03, + "grad_norm": 0.52338707447052, + "learning_rate": 0.0004430816627249041, + "loss": 3.1669, + "step": 20920 + }, + { + "epoch": 1.03, + "grad_norm": 0.5278040170669556, + "learning_rate": 0.00044306813026331627, + "loss": 3.0816, + "step": 20921 + }, + { + "epoch": 1.03, + "grad_norm": 0.5498781800270081, + "learning_rate": 0.00044305459742491435, + "loss": 3.0202, + "step": 20922 + }, + { + "epoch": 1.03, + "grad_norm": 0.5435249209403992, + "learning_rate": 0.00044304106420973424, + "loss": 3.0002, + "step": 20923 + }, + { + "epoch": 1.03, + "grad_norm": 0.5514103174209595, + "learning_rate": 0.00044302753061781154, + "loss": 3.0938, + "step": 20924 + }, + { + "epoch": 1.03, + "grad_norm": 0.5727975368499756, + "learning_rate": 0.0004430139966491817, + "loss": 2.9635, + "step": 20925 + }, + { + "epoch": 1.03, + "grad_norm": 0.4973636567592621, + "learning_rate": 0.00044300046230388065, + "loss": 3.0122, + "step": 20926 + }, + { + "epoch": 1.03, + "grad_norm": 0.5237885117530823, + "learning_rate": 0.0004429869275819439, + "loss": 3.0556, + "step": 20927 + }, + { + "epoch": 1.03, + "grad_norm": 0.5372980237007141, + "learning_rate": 0.000442973392483407, + "loss": 3.109, + "step": 20928 + }, + { + "epoch": 1.03, + "grad_norm": 0.5056158900260925, + "learning_rate": 0.00044295985700830583, + "loss": 2.9878, + "step": 20929 + }, + { + "epoch": 1.03, + "grad_norm": 0.5174768567085266, + "learning_rate": 0.00044294632115667574, + "loss": 3.0221, + "step": 20930 + }, + { + "epoch": 1.03, + "grad_norm": 0.5897946953773499, + "learning_rate": 0.0004429327849285527, + "loss": 3.0459, + "step": 20931 + }, + { + "epoch": 1.03, + "grad_norm": 0.5413453578948975, + "learning_rate": 0.00044291924832397223, + "loss": 3.4315, + "step": 20932 + }, + { + "epoch": 1.03, + "grad_norm": 0.5361354947090149, + "learning_rate": 0.0004429057113429699, + "loss": 3.1492, + "step": 20933 + }, + { + "epoch": 1.03, + "grad_norm": 0.5186129808425903, + "learning_rate": 0.00044289217398558144, + "loss": 3.0495, + "step": 20934 + }, + { + "epoch": 1.03, + "grad_norm": 0.6256024241447449, + "learning_rate": 0.0004428786362518424, + "loss": 3.0102, + "step": 20935 + }, + { + "epoch": 1.03, + "grad_norm": 0.5609905123710632, + "learning_rate": 0.00044286509814178866, + "loss": 3.0142, + "step": 20936 + }, + { + "epoch": 1.03, + "grad_norm": 0.5775830149650574, + "learning_rate": 0.00044285155965545573, + "loss": 3.0254, + "step": 20937 + }, + { + "epoch": 1.03, + "grad_norm": 0.5974602699279785, + "learning_rate": 0.0004428380207928792, + "loss": 3.2062, + "step": 20938 + }, + { + "epoch": 1.03, + "grad_norm": 0.5600636005401611, + "learning_rate": 0.0004428244815540949, + "loss": 3.0143, + "step": 20939 + }, + { + "epoch": 1.03, + "grad_norm": 0.5269203782081604, + "learning_rate": 0.00044281094193913837, + "loss": 3.2125, + "step": 20940 + }, + { + "epoch": 1.03, + "grad_norm": 0.5174791812896729, + "learning_rate": 0.0004427974019480453, + "loss": 3.2041, + "step": 20941 + }, + { + "epoch": 1.03, + "grad_norm": 0.5623819231987, + "learning_rate": 0.00044278386158085135, + "loss": 2.9925, + "step": 20942 + }, + { + "epoch": 1.03, + "grad_norm": 0.5486443638801575, + "learning_rate": 0.00044277032083759227, + "loss": 2.9681, + "step": 20943 + }, + { + "epoch": 1.03, + "grad_norm": 0.536963939666748, + "learning_rate": 0.0004427567797183036, + "loss": 3.0695, + "step": 20944 + }, + { + "epoch": 1.03, + "grad_norm": 0.5224885940551758, + "learning_rate": 0.00044274323822302095, + "loss": 3.2491, + "step": 20945 + }, + { + "epoch": 1.03, + "grad_norm": 0.5229834914207458, + "learning_rate": 0.0004427296963517801, + "loss": 3.227, + "step": 20946 + }, + { + "epoch": 1.03, + "grad_norm": 0.49516236782073975, + "learning_rate": 0.0004427161541046169, + "loss": 3.0456, + "step": 20947 + }, + { + "epoch": 1.03, + "grad_norm": 0.4853455126285553, + "learning_rate": 0.00044270261148156656, + "loss": 3.1123, + "step": 20948 + }, + { + "epoch": 1.03, + "grad_norm": 0.547874927520752, + "learning_rate": 0.00044268906848266514, + "loss": 3.1893, + "step": 20949 + }, + { + "epoch": 1.03, + "grad_norm": 0.5608683228492737, + "learning_rate": 0.00044267552510794813, + "loss": 3.2441, + "step": 20950 + }, + { + "epoch": 1.03, + "grad_norm": 0.5529409646987915, + "learning_rate": 0.00044266198135745126, + "loss": 3.0731, + "step": 20951 + }, + { + "epoch": 1.03, + "grad_norm": 0.5526501536369324, + "learning_rate": 0.0004426484372312102, + "loss": 2.9746, + "step": 20952 + }, + { + "epoch": 1.03, + "grad_norm": 0.5399184823036194, + "learning_rate": 0.0004426348927292606, + "loss": 3.0924, + "step": 20953 + }, + { + "epoch": 1.03, + "grad_norm": 0.5350891351699829, + "learning_rate": 0.00044262134785163815, + "loss": 3.1553, + "step": 20954 + }, + { + "epoch": 1.03, + "grad_norm": 0.523499071598053, + "learning_rate": 0.00044260780259837846, + "loss": 2.9259, + "step": 20955 + }, + { + "epoch": 1.03, + "grad_norm": 0.519354522228241, + "learning_rate": 0.0004425942569695173, + "loss": 3.0434, + "step": 20956 + }, + { + "epoch": 1.03, + "grad_norm": 0.5334795117378235, + "learning_rate": 0.00044258071096509033, + "loss": 3.1886, + "step": 20957 + }, + { + "epoch": 1.03, + "grad_norm": 0.5632496476173401, + "learning_rate": 0.0004425671645851333, + "loss": 2.9461, + "step": 20958 + }, + { + "epoch": 1.03, + "grad_norm": 0.5479925870895386, + "learning_rate": 0.0004425536178296816, + "loss": 3.0682, + "step": 20959 + }, + { + "epoch": 1.03, + "grad_norm": 0.5662195086479187, + "learning_rate": 0.0004425400706987712, + "loss": 3.1047, + "step": 20960 + }, + { + "epoch": 1.03, + "grad_norm": 0.5479673147201538, + "learning_rate": 0.0004425265231924377, + "loss": 3.1159, + "step": 20961 + }, + { + "epoch": 1.03, + "grad_norm": 0.5581120848655701, + "learning_rate": 0.0004425129753107168, + "loss": 3.1118, + "step": 20962 + }, + { + "epoch": 1.03, + "grad_norm": 0.562798261642456, + "learning_rate": 0.00044249942705364403, + "loss": 3.1258, + "step": 20963 + }, + { + "epoch": 1.03, + "grad_norm": 0.5359207987785339, + "learning_rate": 0.0004424858784212553, + "loss": 3.1239, + "step": 20964 + }, + { + "epoch": 1.03, + "grad_norm": 0.502883791923523, + "learning_rate": 0.0004424723294135862, + "loss": 3.0139, + "step": 20965 + }, + { + "epoch": 1.03, + "grad_norm": 0.5590471625328064, + "learning_rate": 0.0004424587800306723, + "loss": 3.0502, + "step": 20966 + }, + { + "epoch": 1.03, + "grad_norm": 0.5840969085693359, + "learning_rate": 0.0004424452302725495, + "loss": 3.2399, + "step": 20967 + }, + { + "epoch": 1.03, + "grad_norm": 0.5094526410102844, + "learning_rate": 0.00044243168013925326, + "loss": 3.0227, + "step": 20968 + }, + { + "epoch": 1.03, + "grad_norm": 0.5850446224212646, + "learning_rate": 0.0004424181296308195, + "loss": 3.0581, + "step": 20969 + }, + { + "epoch": 1.03, + "grad_norm": 0.5337172746658325, + "learning_rate": 0.0004424045787472838, + "loss": 3.1475, + "step": 20970 + }, + { + "epoch": 1.03, + "grad_norm": 0.5515691041946411, + "learning_rate": 0.0004423910274886817, + "loss": 2.8972, + "step": 20971 + }, + { + "epoch": 1.03, + "grad_norm": 0.5271448493003845, + "learning_rate": 0.0004423774758550492, + "loss": 3.1464, + "step": 20972 + }, + { + "epoch": 1.03, + "grad_norm": 0.5401154160499573, + "learning_rate": 0.0004423639238464217, + "loss": 3.0886, + "step": 20973 + }, + { + "epoch": 1.03, + "grad_norm": 0.5540905594825745, + "learning_rate": 0.0004423503714628351, + "loss": 3.0209, + "step": 20974 + }, + { + "epoch": 1.03, + "grad_norm": 0.5254117250442505, + "learning_rate": 0.00044233681870432497, + "loss": 3.0147, + "step": 20975 + }, + { + "epoch": 1.03, + "grad_norm": 0.5366216897964478, + "learning_rate": 0.0004423232655709271, + "loss": 3.191, + "step": 20976 + }, + { + "epoch": 1.03, + "grad_norm": 0.56840580701828, + "learning_rate": 0.00044230971206267716, + "loss": 3.1895, + "step": 20977 + }, + { + "epoch": 1.03, + "grad_norm": 0.5454578995704651, + "learning_rate": 0.0004422961581796108, + "loss": 3.1494, + "step": 20978 + }, + { + "epoch": 1.03, + "grad_norm": 0.5930941104888916, + "learning_rate": 0.0004422826039217637, + "loss": 3.036, + "step": 20979 + }, + { + "epoch": 1.03, + "grad_norm": 0.5133326649665833, + "learning_rate": 0.00044226904928917167, + "loss": 3.2738, + "step": 20980 + }, + { + "epoch": 1.03, + "grad_norm": 0.5534525513648987, + "learning_rate": 0.0004422554942818703, + "loss": 3.0809, + "step": 20981 + }, + { + "epoch": 1.03, + "grad_norm": 0.5347831845283508, + "learning_rate": 0.00044224193889989534, + "loss": 3.062, + "step": 20982 + }, + { + "epoch": 1.03, + "grad_norm": 0.5829472541809082, + "learning_rate": 0.00044222838314328253, + "loss": 3.0576, + "step": 20983 + }, + { + "epoch": 1.03, + "grad_norm": 0.5392614603042603, + "learning_rate": 0.00044221482701206746, + "loss": 3.0753, + "step": 20984 + }, + { + "epoch": 1.03, + "grad_norm": 0.5648199319839478, + "learning_rate": 0.000442201270506286, + "loss": 2.8233, + "step": 20985 + }, + { + "epoch": 1.03, + "grad_norm": 0.5383753776550293, + "learning_rate": 0.00044218771362597366, + "loss": 3.1036, + "step": 20986 + }, + { + "epoch": 1.03, + "grad_norm": 0.5280416011810303, + "learning_rate": 0.0004421741563711663, + "loss": 3.2258, + "step": 20987 + }, + { + "epoch": 1.03, + "grad_norm": 0.5613174438476562, + "learning_rate": 0.0004421605987418996, + "loss": 3.0589, + "step": 20988 + }, + { + "epoch": 1.03, + "grad_norm": 0.551788866519928, + "learning_rate": 0.00044214704073820913, + "loss": 3.3632, + "step": 20989 + }, + { + "epoch": 1.03, + "grad_norm": 0.5428652763366699, + "learning_rate": 0.00044213348236013083, + "loss": 2.9893, + "step": 20990 + }, + { + "epoch": 1.03, + "grad_norm": 0.5227196216583252, + "learning_rate": 0.00044211992360770024, + "loss": 3.0317, + "step": 20991 + }, + { + "epoch": 1.03, + "grad_norm": 0.4978506863117218, + "learning_rate": 0.0004421063644809532, + "loss": 2.8453, + "step": 20992 + }, + { + "epoch": 1.03, + "grad_norm": 0.5322362780570984, + "learning_rate": 0.0004420928049799252, + "loss": 3.1358, + "step": 20993 + }, + { + "epoch": 1.03, + "grad_norm": 0.5089728236198425, + "learning_rate": 0.0004420792451046523, + "loss": 3.2766, + "step": 20994 + }, + { + "epoch": 1.03, + "grad_norm": 0.5534874200820923, + "learning_rate": 0.00044206568485516987, + "loss": 3.0792, + "step": 20995 + }, + { + "epoch": 1.03, + "grad_norm": 0.530881941318512, + "learning_rate": 0.00044205212423151366, + "loss": 3.2128, + "step": 20996 + }, + { + "epoch": 1.03, + "grad_norm": 0.5450581312179565, + "learning_rate": 0.0004420385632337196, + "loss": 3.2734, + "step": 20997 + }, + { + "epoch": 1.03, + "grad_norm": 0.5764946937561035, + "learning_rate": 0.00044202500186182344, + "loss": 2.963, + "step": 20998 + }, + { + "epoch": 1.03, + "grad_norm": 0.5664291381835938, + "learning_rate": 0.0004420114401158606, + "loss": 3.1355, + "step": 20999 + }, + { + "epoch": 1.03, + "grad_norm": 0.5113416314125061, + "learning_rate": 0.00044199787799586695, + "loss": 3.1817, + "step": 21000 + }, + { + "epoch": 1.03, + "grad_norm": 0.5444121360778809, + "learning_rate": 0.0004419843155018783, + "loss": 2.9416, + "step": 21001 + }, + { + "epoch": 1.03, + "grad_norm": 0.5358783602714539, + "learning_rate": 0.00044197075263393025, + "loss": 3.1008, + "step": 21002 + }, + { + "epoch": 1.03, + "grad_norm": 0.5368015170097351, + "learning_rate": 0.0004419571893920586, + "loss": 2.9991, + "step": 21003 + }, + { + "epoch": 1.03, + "grad_norm": 0.5103992819786072, + "learning_rate": 0.000441943625776299, + "loss": 3.1975, + "step": 21004 + }, + { + "epoch": 1.03, + "grad_norm": 0.5424513816833496, + "learning_rate": 0.0004419300617866872, + "loss": 3.0472, + "step": 21005 + }, + { + "epoch": 1.03, + "grad_norm": 0.5450505614280701, + "learning_rate": 0.00044191649742325893, + "loss": 3.0328, + "step": 21006 + }, + { + "epoch": 1.03, + "grad_norm": 0.5383841395378113, + "learning_rate": 0.00044190293268604985, + "loss": 3.1623, + "step": 21007 + }, + { + "epoch": 1.03, + "grad_norm": 0.5024534463882446, + "learning_rate": 0.0004418893675750958, + "loss": 3.2498, + "step": 21008 + }, + { + "epoch": 1.03, + "grad_norm": 0.5342257618904114, + "learning_rate": 0.0004418758020904326, + "loss": 3.198, + "step": 21009 + }, + { + "epoch": 1.03, + "grad_norm": 0.5287760496139526, + "learning_rate": 0.0004418622362320957, + "loss": 3.22, + "step": 21010 + }, + { + "epoch": 1.03, + "grad_norm": 0.5340474247932434, + "learning_rate": 0.00044184867000012103, + "loss": 3.2134, + "step": 21011 + }, + { + "epoch": 1.03, + "grad_norm": 0.6150773167610168, + "learning_rate": 0.00044183510339454424, + "loss": 3.0173, + "step": 21012 + }, + { + "epoch": 1.03, + "grad_norm": 0.5500164031982422, + "learning_rate": 0.00044182153641540104, + "loss": 3.0495, + "step": 21013 + }, + { + "epoch": 1.03, + "grad_norm": 0.5410817861557007, + "learning_rate": 0.00044180796906272726, + "loss": 3.0827, + "step": 21014 + }, + { + "epoch": 1.03, + "grad_norm": 0.5385421514511108, + "learning_rate": 0.00044179440133655857, + "loss": 2.9338, + "step": 21015 + }, + { + "epoch": 1.03, + "grad_norm": 0.5334212779998779, + "learning_rate": 0.0004417808332369307, + "loss": 2.9631, + "step": 21016 + }, + { + "epoch": 1.03, + "grad_norm": 0.5273452401161194, + "learning_rate": 0.0004417672647638794, + "loss": 3.2463, + "step": 21017 + }, + { + "epoch": 1.03, + "grad_norm": 0.5127688050270081, + "learning_rate": 0.00044175369591744045, + "loss": 3.2323, + "step": 21018 + }, + { + "epoch": 1.03, + "grad_norm": 0.5606804490089417, + "learning_rate": 0.00044174012669764953, + "loss": 3.0001, + "step": 21019 + }, + { + "epoch": 1.03, + "grad_norm": 0.5266181230545044, + "learning_rate": 0.00044172655710454244, + "loss": 3.1192, + "step": 21020 + }, + { + "epoch": 1.03, + "grad_norm": 0.5243688821792603, + "learning_rate": 0.0004417129871381548, + "loss": 3.2963, + "step": 21021 + }, + { + "epoch": 1.03, + "grad_norm": 0.5265859365463257, + "learning_rate": 0.0004416994167985224, + "loss": 3.1358, + "step": 21022 + }, + { + "epoch": 1.03, + "grad_norm": 0.5116739273071289, + "learning_rate": 0.0004416858460856811, + "loss": 2.9943, + "step": 21023 + }, + { + "epoch": 1.03, + "grad_norm": 0.543095052242279, + "learning_rate": 0.0004416722749996665, + "loss": 2.9124, + "step": 21024 + }, + { + "epoch": 1.03, + "grad_norm": 0.550938069820404, + "learning_rate": 0.0004416587035405143, + "loss": 3.2173, + "step": 21025 + }, + { + "epoch": 1.03, + "grad_norm": 0.5226496458053589, + "learning_rate": 0.00044164513170826046, + "loss": 3.1445, + "step": 21026 + }, + { + "epoch": 1.03, + "grad_norm": 0.5465294122695923, + "learning_rate": 0.0004416315595029406, + "loss": 3.1399, + "step": 21027 + }, + { + "epoch": 1.03, + "grad_norm": 0.5636183619499207, + "learning_rate": 0.00044161798692459044, + "loss": 3.2226, + "step": 21028 + }, + { + "epoch": 1.03, + "grad_norm": 0.5646752119064331, + "learning_rate": 0.0004416044139732457, + "loss": 3.0557, + "step": 21029 + }, + { + "epoch": 1.03, + "grad_norm": 0.5204924941062927, + "learning_rate": 0.0004415908406489423, + "loss": 3.1327, + "step": 21030 + }, + { + "epoch": 1.03, + "grad_norm": 0.5620864033699036, + "learning_rate": 0.00044157726695171585, + "loss": 3.0268, + "step": 21031 + }, + { + "epoch": 1.03, + "grad_norm": 0.5253093838691711, + "learning_rate": 0.0004415636928816021, + "loss": 3.0451, + "step": 21032 + }, + { + "epoch": 1.03, + "grad_norm": 0.5312303900718689, + "learning_rate": 0.0004415501184386368, + "loss": 3.1457, + "step": 21033 + }, + { + "epoch": 1.03, + "grad_norm": 0.5458439588546753, + "learning_rate": 0.0004415365436228558, + "loss": 3.0072, + "step": 21034 + }, + { + "epoch": 1.03, + "grad_norm": 0.5164788961410522, + "learning_rate": 0.00044152296843429474, + "loss": 3.0088, + "step": 21035 + }, + { + "epoch": 1.03, + "grad_norm": 0.5227516293525696, + "learning_rate": 0.0004415093928729894, + "loss": 2.7882, + "step": 21036 + }, + { + "epoch": 1.03, + "grad_norm": 0.5490872859954834, + "learning_rate": 0.0004414958169389756, + "loss": 3.146, + "step": 21037 + }, + { + "epoch": 1.03, + "grad_norm": 0.5249037742614746, + "learning_rate": 0.000441482240632289, + "loss": 3.1291, + "step": 21038 + }, + { + "epoch": 1.03, + "grad_norm": 0.590631902217865, + "learning_rate": 0.00044146866395296545, + "loss": 3.1117, + "step": 21039 + }, + { + "epoch": 1.03, + "grad_norm": 0.5558299422264099, + "learning_rate": 0.00044145508690104056, + "loss": 2.882, + "step": 21040 + }, + { + "epoch": 1.03, + "grad_norm": 0.5602800846099854, + "learning_rate": 0.00044144150947655035, + "loss": 2.8093, + "step": 21041 + }, + { + "epoch": 1.03, + "grad_norm": 0.5326724648475647, + "learning_rate": 0.00044142793167953033, + "loss": 3.3363, + "step": 21042 + }, + { + "epoch": 1.03, + "grad_norm": 0.5459589958190918, + "learning_rate": 0.0004414143535100164, + "loss": 3.1367, + "step": 21043 + }, + { + "epoch": 1.03, + "grad_norm": 0.5694371461868286, + "learning_rate": 0.00044140077496804427, + "loss": 3.1642, + "step": 21044 + }, + { + "epoch": 1.03, + "grad_norm": 0.566260039806366, + "learning_rate": 0.00044138719605364967, + "loss": 3.2619, + "step": 21045 + }, + { + "epoch": 1.03, + "grad_norm": 0.5829527974128723, + "learning_rate": 0.00044137361676686843, + "loss": 3.019, + "step": 21046 + }, + { + "epoch": 1.03, + "grad_norm": 0.5660526752471924, + "learning_rate": 0.00044136003710773633, + "loss": 3.0725, + "step": 21047 + }, + { + "epoch": 1.03, + "grad_norm": 0.5259793996810913, + "learning_rate": 0.000441346457076289, + "loss": 3.1078, + "step": 21048 + }, + { + "epoch": 1.03, + "grad_norm": 0.5162500143051147, + "learning_rate": 0.0004413328766725624, + "loss": 2.8862, + "step": 21049 + }, + { + "epoch": 1.03, + "grad_norm": 0.560583233833313, + "learning_rate": 0.00044131929589659213, + "loss": 3.1804, + "step": 21050 + }, + { + "epoch": 1.03, + "grad_norm": 0.5528599619865417, + "learning_rate": 0.00044130571474841406, + "loss": 2.9594, + "step": 21051 + }, + { + "epoch": 1.03, + "grad_norm": 0.5608258843421936, + "learning_rate": 0.000441292133228064, + "loss": 3.2056, + "step": 21052 + }, + { + "epoch": 1.03, + "grad_norm": 0.546389102935791, + "learning_rate": 0.0004412785513355776, + "loss": 3.0145, + "step": 21053 + }, + { + "epoch": 1.03, + "grad_norm": 0.5085633397102356, + "learning_rate": 0.0004412649690709907, + "loss": 3.1211, + "step": 21054 + }, + { + "epoch": 1.03, + "grad_norm": 0.543613851070404, + "learning_rate": 0.000441251386434339, + "loss": 2.996, + "step": 21055 + }, + { + "epoch": 1.03, + "grad_norm": 0.5142641067504883, + "learning_rate": 0.00044123780342565844, + "loss": 3.1159, + "step": 21056 + }, + { + "epoch": 1.03, + "grad_norm": 0.5283182859420776, + "learning_rate": 0.00044122422004498466, + "loss": 3.142, + "step": 21057 + }, + { + "epoch": 1.03, + "grad_norm": 0.5592846274375916, + "learning_rate": 0.0004412106362923533, + "loss": 3.3198, + "step": 21058 + }, + { + "epoch": 1.03, + "grad_norm": 0.5456278324127197, + "learning_rate": 0.00044119705216780046, + "loss": 3.1421, + "step": 21059 + }, + { + "epoch": 1.03, + "grad_norm": 0.5718361139297485, + "learning_rate": 0.0004411834676713618, + "loss": 2.7871, + "step": 21060 + }, + { + "epoch": 1.03, + "grad_norm": 0.5030151605606079, + "learning_rate": 0.000441169882803073, + "loss": 3.1338, + "step": 21061 + }, + { + "epoch": 1.03, + "grad_norm": 0.5248968005180359, + "learning_rate": 0.0004411562975629698, + "loss": 3.273, + "step": 21062 + }, + { + "epoch": 1.03, + "grad_norm": 0.5286002159118652, + "learning_rate": 0.00044114271195108814, + "loss": 3.218, + "step": 21063 + }, + { + "epoch": 1.03, + "grad_norm": 0.511260986328125, + "learning_rate": 0.0004411291259674638, + "loss": 3.1222, + "step": 21064 + }, + { + "epoch": 1.03, + "grad_norm": 0.5192838907241821, + "learning_rate": 0.00044111553961213244, + "loss": 3.0487, + "step": 21065 + }, + { + "epoch": 1.03, + "grad_norm": 0.49542543292045593, + "learning_rate": 0.00044110195288512996, + "loss": 2.9655, + "step": 21066 + }, + { + "epoch": 1.03, + "grad_norm": 0.5238027572631836, + "learning_rate": 0.0004410883657864921, + "loss": 3.0834, + "step": 21067 + }, + { + "epoch": 1.03, + "grad_norm": 0.5153197050094604, + "learning_rate": 0.00044107477831625456, + "loss": 2.9757, + "step": 21068 + }, + { + "epoch": 1.03, + "grad_norm": 0.5175606608390808, + "learning_rate": 0.00044106119047445324, + "loss": 3.0309, + "step": 21069 + }, + { + "epoch": 1.03, + "grad_norm": 0.5227696299552917, + "learning_rate": 0.00044104760226112386, + "loss": 3.0677, + "step": 21070 + }, + { + "epoch": 1.03, + "grad_norm": 0.5260235071182251, + "learning_rate": 0.0004410340136763023, + "loss": 3.1779, + "step": 21071 + }, + { + "epoch": 1.03, + "grad_norm": 0.567876398563385, + "learning_rate": 0.0004410204247200243, + "loss": 2.9599, + "step": 21072 + }, + { + "epoch": 1.03, + "grad_norm": 0.5913751721382141, + "learning_rate": 0.0004410068353923255, + "loss": 3.0299, + "step": 21073 + }, + { + "epoch": 1.03, + "grad_norm": 0.5325940847396851, + "learning_rate": 0.000440993245693242, + "loss": 3.0769, + "step": 21074 + }, + { + "epoch": 1.03, + "grad_norm": 0.5309601426124573, + "learning_rate": 0.0004409796556228094, + "loss": 3.043, + "step": 21075 + }, + { + "epoch": 1.03, + "grad_norm": 0.5216854810714722, + "learning_rate": 0.0004409660651810635, + "loss": 3.264, + "step": 21076 + }, + { + "epoch": 1.03, + "grad_norm": 0.5479691624641418, + "learning_rate": 0.00044095247436804006, + "loss": 2.9584, + "step": 21077 + }, + { + "epoch": 1.03, + "grad_norm": 0.5204125642776489, + "learning_rate": 0.00044093888318377507, + "loss": 3.1291, + "step": 21078 + }, + { + "epoch": 1.03, + "grad_norm": 0.5596486330032349, + "learning_rate": 0.00044092529162830397, + "loss": 3.1667, + "step": 21079 + }, + { + "epoch": 1.03, + "grad_norm": 0.5593887567520142, + "learning_rate": 0.0004409116997016629, + "loss": 2.9974, + "step": 21080 + }, + { + "epoch": 1.03, + "grad_norm": 0.5333544611930847, + "learning_rate": 0.00044089810740388755, + "loss": 2.9289, + "step": 21081 + }, + { + "epoch": 1.03, + "grad_norm": 0.5625095367431641, + "learning_rate": 0.0004408845147350137, + "loss": 3.0538, + "step": 21082 + }, + { + "epoch": 1.03, + "grad_norm": 0.5155503153800964, + "learning_rate": 0.0004408709216950771, + "loss": 2.9528, + "step": 21083 + }, + { + "epoch": 1.03, + "grad_norm": 0.5525764226913452, + "learning_rate": 0.00044085732828411355, + "loss": 3.1537, + "step": 21084 + }, + { + "epoch": 1.03, + "grad_norm": 0.5102589726448059, + "learning_rate": 0.000440843734502159, + "loss": 3.3095, + "step": 21085 + }, + { + "epoch": 1.03, + "grad_norm": 0.5470672845840454, + "learning_rate": 0.00044083014034924917, + "loss": 3.2635, + "step": 21086 + }, + { + "epoch": 1.03, + "grad_norm": 0.5264706015586853, + "learning_rate": 0.0004408165458254198, + "loss": 3.5163, + "step": 21087 + }, + { + "epoch": 1.03, + "grad_norm": 0.5716114044189453, + "learning_rate": 0.00044080295093070675, + "loss": 3.1289, + "step": 21088 + }, + { + "epoch": 1.03, + "grad_norm": 0.54709792137146, + "learning_rate": 0.0004407893556651459, + "loss": 3.2208, + "step": 21089 + }, + { + "epoch": 1.03, + "grad_norm": 0.6111564636230469, + "learning_rate": 0.0004407757600287729, + "loss": 3.1601, + "step": 21090 + }, + { + "epoch": 1.03, + "grad_norm": 0.5455335974693298, + "learning_rate": 0.0004407621640216237, + "loss": 3.0098, + "step": 21091 + }, + { + "epoch": 1.03, + "grad_norm": 0.5645591020584106, + "learning_rate": 0.00044074856764373397, + "loss": 3.1535, + "step": 21092 + }, + { + "epoch": 1.03, + "grad_norm": 0.53360915184021, + "learning_rate": 0.00044073497089513976, + "loss": 3.0372, + "step": 21093 + }, + { + "epoch": 1.03, + "grad_norm": 0.5450435876846313, + "learning_rate": 0.00044072137377587655, + "loss": 2.9299, + "step": 21094 + }, + { + "epoch": 1.03, + "grad_norm": 0.5204038023948669, + "learning_rate": 0.0004407077762859804, + "loss": 2.9236, + "step": 21095 + }, + { + "epoch": 1.03, + "grad_norm": 0.5424490571022034, + "learning_rate": 0.00044069417842548705, + "loss": 3.2089, + "step": 21096 + }, + { + "epoch": 1.03, + "grad_norm": 0.5224910378456116, + "learning_rate": 0.0004406805801944323, + "loss": 3.0936, + "step": 21097 + }, + { + "epoch": 1.03, + "grad_norm": 0.5337421894073486, + "learning_rate": 0.00044066698159285196, + "loss": 3.069, + "step": 21098 + }, + { + "epoch": 1.03, + "grad_norm": 0.5657770037651062, + "learning_rate": 0.00044065338262078184, + "loss": 3.0047, + "step": 21099 + }, + { + "epoch": 1.03, + "grad_norm": 0.5155889987945557, + "learning_rate": 0.00044063978327825784, + "loss": 3.1635, + "step": 21100 + }, + { + "epoch": 1.03, + "grad_norm": 0.560525119304657, + "learning_rate": 0.00044062618356531573, + "loss": 3.1346, + "step": 21101 + }, + { + "epoch": 1.03, + "grad_norm": 0.5268765687942505, + "learning_rate": 0.00044061258348199124, + "loss": 2.9914, + "step": 21102 + }, + { + "epoch": 1.03, + "grad_norm": 0.5468574166297913, + "learning_rate": 0.0004405989830283203, + "loss": 3.3028, + "step": 21103 + }, + { + "epoch": 1.03, + "grad_norm": 0.5861000418663025, + "learning_rate": 0.00044058538220433866, + "loss": 3.2922, + "step": 21104 + }, + { + "epoch": 1.03, + "grad_norm": 0.5265964269638062, + "learning_rate": 0.0004405717810100823, + "loss": 3.2175, + "step": 21105 + }, + { + "epoch": 1.03, + "grad_norm": 0.5208206176757812, + "learning_rate": 0.00044055817944558683, + "loss": 3.2776, + "step": 21106 + }, + { + "epoch": 1.03, + "grad_norm": 0.5527623891830444, + "learning_rate": 0.0004405445775108882, + "loss": 3.129, + "step": 21107 + }, + { + "epoch": 1.03, + "grad_norm": 0.5665170550346375, + "learning_rate": 0.00044053097520602223, + "loss": 2.96, + "step": 21108 + }, + { + "epoch": 1.03, + "grad_norm": 0.5453583002090454, + "learning_rate": 0.0004405173725310246, + "loss": 3.0435, + "step": 21109 + }, + { + "epoch": 1.03, + "grad_norm": 0.5542186498641968, + "learning_rate": 0.0004405037694859313, + "loss": 3.1687, + "step": 21110 + }, + { + "epoch": 1.03, + "grad_norm": 0.5576282143592834, + "learning_rate": 0.0004404901660707782, + "loss": 3.0296, + "step": 21111 + }, + { + "epoch": 1.03, + "grad_norm": 0.5522076487541199, + "learning_rate": 0.0004404765622856009, + "loss": 2.9013, + "step": 21112 + }, + { + "epoch": 1.03, + "grad_norm": 0.5171756744384766, + "learning_rate": 0.0004404629581304355, + "loss": 3.1172, + "step": 21113 + }, + { + "epoch": 1.03, + "grad_norm": 0.5567202568054199, + "learning_rate": 0.00044044935360531765, + "loss": 2.9008, + "step": 21114 + }, + { + "epoch": 1.03, + "grad_norm": 0.5733157992362976, + "learning_rate": 0.0004404357487102832, + "loss": 2.8591, + "step": 21115 + }, + { + "epoch": 1.03, + "grad_norm": 0.5684636831283569, + "learning_rate": 0.0004404221434453681, + "loss": 3.2353, + "step": 21116 + }, + { + "epoch": 1.03, + "grad_norm": 0.5617053508758545, + "learning_rate": 0.000440408537810608, + "loss": 3.1757, + "step": 21117 + }, + { + "epoch": 1.03, + "grad_norm": 0.5794061422348022, + "learning_rate": 0.0004403949318060389, + "loss": 3.0882, + "step": 21118 + }, + { + "epoch": 1.03, + "grad_norm": 0.5320546627044678, + "learning_rate": 0.00044038132543169656, + "loss": 3.0382, + "step": 21119 + }, + { + "epoch": 1.04, + "grad_norm": 0.5199233293533325, + "learning_rate": 0.00044036771868761674, + "loss": 2.9563, + "step": 21120 + }, + { + "epoch": 1.04, + "grad_norm": 0.5452735424041748, + "learning_rate": 0.0004403541115738354, + "loss": 3.1861, + "step": 21121 + }, + { + "epoch": 1.04, + "grad_norm": 0.5553098917007446, + "learning_rate": 0.00044034050409038847, + "loss": 3.1264, + "step": 21122 + }, + { + "epoch": 1.04, + "grad_norm": 0.5302225947380066, + "learning_rate": 0.0004403268962373116, + "loss": 3.0457, + "step": 21123 + }, + { + "epoch": 1.04, + "grad_norm": 0.5287278890609741, + "learning_rate": 0.00044031328801464063, + "loss": 3.0917, + "step": 21124 + }, + { + "epoch": 1.04, + "grad_norm": 0.5537627935409546, + "learning_rate": 0.0004402996794224115, + "loss": 3.0715, + "step": 21125 + }, + { + "epoch": 1.04, + "grad_norm": 0.5674768686294556, + "learning_rate": 0.00044028607046066, + "loss": 3.1355, + "step": 21126 + }, + { + "epoch": 1.04, + "grad_norm": 0.505564272403717, + "learning_rate": 0.00044027246112942204, + "loss": 2.8738, + "step": 21127 + }, + { + "epoch": 1.04, + "grad_norm": 0.537092924118042, + "learning_rate": 0.00044025885142873336, + "loss": 3.241, + "step": 21128 + }, + { + "epoch": 1.04, + "grad_norm": 0.514145016670227, + "learning_rate": 0.0004402452413586299, + "loss": 3.0004, + "step": 21129 + }, + { + "epoch": 1.04, + "grad_norm": 0.5343130230903625, + "learning_rate": 0.00044023163091914735, + "loss": 3.1431, + "step": 21130 + }, + { + "epoch": 1.04, + "grad_norm": 0.5057582855224609, + "learning_rate": 0.00044021802011032175, + "loss": 3.0871, + "step": 21131 + }, + { + "epoch": 1.04, + "grad_norm": 0.5197306871414185, + "learning_rate": 0.0004402044089321889, + "loss": 3.0718, + "step": 21132 + }, + { + "epoch": 1.04, + "grad_norm": 0.594575047492981, + "learning_rate": 0.00044019079738478464, + "loss": 3.019, + "step": 21133 + }, + { + "epoch": 1.04, + "grad_norm": 0.5036117434501648, + "learning_rate": 0.00044017718546814473, + "loss": 2.9993, + "step": 21134 + }, + { + "epoch": 1.04, + "grad_norm": 0.5579177737236023, + "learning_rate": 0.0004401635731823051, + "loss": 3.3196, + "step": 21135 + }, + { + "epoch": 1.04, + "grad_norm": 0.5319854021072388, + "learning_rate": 0.0004401499605273016, + "loss": 3.0564, + "step": 21136 + }, + { + "epoch": 1.04, + "grad_norm": 0.5116011500358582, + "learning_rate": 0.0004401363475031701, + "loss": 3.1638, + "step": 21137 + }, + { + "epoch": 1.04, + "grad_norm": 0.5688881278038025, + "learning_rate": 0.00044012273410994643, + "loss": 3.2267, + "step": 21138 + }, + { + "epoch": 1.04, + "grad_norm": 0.5254831314086914, + "learning_rate": 0.0004401091203476664, + "loss": 2.9605, + "step": 21139 + }, + { + "epoch": 1.04, + "grad_norm": 0.5896934270858765, + "learning_rate": 0.00044009550621636597, + "loss": 3.0805, + "step": 21140 + }, + { + "epoch": 1.04, + "grad_norm": 0.5865271091461182, + "learning_rate": 0.0004400818917160809, + "loss": 3.0489, + "step": 21141 + }, + { + "epoch": 1.04, + "grad_norm": 0.5466059446334839, + "learning_rate": 0.0004400682768468471, + "loss": 3.198, + "step": 21142 + }, + { + "epoch": 1.04, + "grad_norm": 0.5474870800971985, + "learning_rate": 0.00044005466160870044, + "loss": 3.1467, + "step": 21143 + }, + { + "epoch": 1.04, + "grad_norm": 0.5172299742698669, + "learning_rate": 0.0004400410460016767, + "loss": 3.1705, + "step": 21144 + }, + { + "epoch": 1.04, + "grad_norm": 0.5260311365127563, + "learning_rate": 0.0004400274300258118, + "loss": 3.1582, + "step": 21145 + }, + { + "epoch": 1.04, + "grad_norm": 0.5976084470748901, + "learning_rate": 0.0004400138136811417, + "loss": 3.1107, + "step": 21146 + }, + { + "epoch": 1.04, + "grad_norm": 0.5335716009140015, + "learning_rate": 0.0004400001969677021, + "loss": 3.1361, + "step": 21147 + }, + { + "epoch": 1.04, + "grad_norm": 0.5297675728797913, + "learning_rate": 0.0004399865798855289, + "loss": 3.0556, + "step": 21148 + }, + { + "epoch": 1.04, + "grad_norm": 0.5369957685470581, + "learning_rate": 0.0004399729624346581, + "loss": 3.162, + "step": 21149 + }, + { + "epoch": 1.04, + "grad_norm": 0.5566016435623169, + "learning_rate": 0.00043995934461512525, + "loss": 3.0682, + "step": 21150 + }, + { + "epoch": 1.04, + "grad_norm": 0.5417798757553101, + "learning_rate": 0.00043994572642696656, + "loss": 3.1494, + "step": 21151 + }, + { + "epoch": 1.04, + "grad_norm": 0.5253806114196777, + "learning_rate": 0.0004399321078702177, + "loss": 2.9937, + "step": 21152 + }, + { + "epoch": 1.04, + "grad_norm": 0.5396881699562073, + "learning_rate": 0.00043991848894491464, + "loss": 2.9225, + "step": 21153 + }, + { + "epoch": 1.04, + "grad_norm": 0.5461945533752441, + "learning_rate": 0.0004399048696510932, + "loss": 3.1803, + "step": 21154 + }, + { + "epoch": 1.04, + "grad_norm": 0.5187393426895142, + "learning_rate": 0.0004398912499887893, + "loss": 3.0976, + "step": 21155 + }, + { + "epoch": 1.04, + "grad_norm": 0.5625729560852051, + "learning_rate": 0.00043987762995803867, + "loss": 2.9497, + "step": 21156 + }, + { + "epoch": 1.04, + "grad_norm": 0.5398131012916565, + "learning_rate": 0.00043986400955887733, + "loss": 3.0063, + "step": 21157 + }, + { + "epoch": 1.04, + "grad_norm": 0.5377815961837769, + "learning_rate": 0.0004398503887913411, + "loss": 3.0048, + "step": 21158 + }, + { + "epoch": 1.04, + "grad_norm": 0.5535577535629272, + "learning_rate": 0.0004398367676554659, + "loss": 3.0225, + "step": 21159 + }, + { + "epoch": 1.04, + "grad_norm": 0.5389450192451477, + "learning_rate": 0.0004398231461512875, + "loss": 3.0952, + "step": 21160 + }, + { + "epoch": 1.04, + "grad_norm": 0.5288699865341187, + "learning_rate": 0.0004398095242788419, + "loss": 3.0028, + "step": 21161 + }, + { + "epoch": 1.04, + "grad_norm": 0.5519115924835205, + "learning_rate": 0.00043979590203816496, + "loss": 3.0892, + "step": 21162 + }, + { + "epoch": 1.04, + "grad_norm": 0.5267183780670166, + "learning_rate": 0.0004397822794292924, + "loss": 3.1845, + "step": 21163 + }, + { + "epoch": 1.04, + "grad_norm": 0.5286177396774292, + "learning_rate": 0.0004397686564522603, + "loss": 3.182, + "step": 21164 + }, + { + "epoch": 1.04, + "grad_norm": 0.5310975909233093, + "learning_rate": 0.00043975503310710436, + "loss": 3.0952, + "step": 21165 + }, + { + "epoch": 1.04, + "grad_norm": 0.5732306838035583, + "learning_rate": 0.0004397414093938607, + "loss": 3.0944, + "step": 21166 + }, + { + "epoch": 1.04, + "grad_norm": 0.5607469081878662, + "learning_rate": 0.000439727785312565, + "loss": 2.9405, + "step": 21167 + }, + { + "epoch": 1.04, + "grad_norm": 0.5709458589553833, + "learning_rate": 0.0004397141608632532, + "loss": 3.1021, + "step": 21168 + }, + { + "epoch": 1.04, + "grad_norm": 0.5287217497825623, + "learning_rate": 0.00043970053604596126, + "loss": 3.0768, + "step": 21169 + }, + { + "epoch": 1.04, + "grad_norm": 0.5781598091125488, + "learning_rate": 0.0004396869108607249, + "loss": 3.214, + "step": 21170 + }, + { + "epoch": 1.04, + "grad_norm": 0.5571904182434082, + "learning_rate": 0.00043967328530758004, + "loss": 3.0581, + "step": 21171 + }, + { + "epoch": 1.04, + "grad_norm": 0.5215210914611816, + "learning_rate": 0.0004396596593865627, + "loss": 2.9378, + "step": 21172 + }, + { + "epoch": 1.04, + "grad_norm": 0.6075809597969055, + "learning_rate": 0.0004396460330977088, + "loss": 2.9941, + "step": 21173 + }, + { + "epoch": 1.04, + "grad_norm": 0.5475901365280151, + "learning_rate": 0.000439632406441054, + "loss": 3.277, + "step": 21174 + }, + { + "epoch": 1.04, + "grad_norm": 0.5757737159729004, + "learning_rate": 0.00043961877941663433, + "loss": 3.2825, + "step": 21175 + }, + { + "epoch": 1.04, + "grad_norm": 0.5772517919540405, + "learning_rate": 0.0004396051520244857, + "loss": 3.1746, + "step": 21176 + }, + { + "epoch": 1.04, + "grad_norm": 0.5674006938934326, + "learning_rate": 0.00043959152426464393, + "loss": 3.1398, + "step": 21177 + }, + { + "epoch": 1.04, + "grad_norm": 0.5191792845726013, + "learning_rate": 0.00043957789613714493, + "loss": 3.1744, + "step": 21178 + }, + { + "epoch": 1.04, + "grad_norm": 0.5092511177062988, + "learning_rate": 0.0004395642676420247, + "loss": 3.0467, + "step": 21179 + }, + { + "epoch": 1.04, + "grad_norm": 0.5251277089118958, + "learning_rate": 0.000439550638779319, + "loss": 2.8873, + "step": 21180 + }, + { + "epoch": 1.04, + "grad_norm": 0.5171329975128174, + "learning_rate": 0.0004395370095490637, + "loss": 2.8804, + "step": 21181 + }, + { + "epoch": 1.04, + "grad_norm": 0.5684592127799988, + "learning_rate": 0.0004395233799512949, + "loss": 3.1189, + "step": 21182 + }, + { + "epoch": 1.04, + "grad_norm": 0.5765049457550049, + "learning_rate": 0.0004395097499860483, + "loss": 3.0344, + "step": 21183 + }, + { + "epoch": 1.04, + "grad_norm": 0.5399144291877747, + "learning_rate": 0.0004394961196533599, + "loss": 3.1513, + "step": 21184 + }, + { + "epoch": 1.04, + "grad_norm": 0.5205578207969666, + "learning_rate": 0.00043948248895326556, + "loss": 3.0598, + "step": 21185 + }, + { + "epoch": 1.04, + "grad_norm": 0.5342963933944702, + "learning_rate": 0.0004394688578858011, + "loss": 3.099, + "step": 21186 + }, + { + "epoch": 1.04, + "grad_norm": 0.5252129435539246, + "learning_rate": 0.00043945522645100264, + "loss": 3.1557, + "step": 21187 + }, + { + "epoch": 1.04, + "grad_norm": 0.5322774052619934, + "learning_rate": 0.0004394415946489059, + "loss": 3.1597, + "step": 21188 + }, + { + "epoch": 1.04, + "grad_norm": 0.5448768138885498, + "learning_rate": 0.00043942796247954685, + "loss": 3.07, + "step": 21189 + }, + { + "epoch": 1.04, + "grad_norm": 0.5391474962234497, + "learning_rate": 0.00043941432994296136, + "loss": 3.2065, + "step": 21190 + }, + { + "epoch": 1.04, + "grad_norm": 0.5174856185913086, + "learning_rate": 0.0004394006970391854, + "loss": 3.3146, + "step": 21191 + }, + { + "epoch": 1.04, + "grad_norm": 0.5522015690803528, + "learning_rate": 0.00043938706376825474, + "loss": 2.9034, + "step": 21192 + }, + { + "epoch": 1.04, + "grad_norm": 0.5845143795013428, + "learning_rate": 0.0004393734301302054, + "loss": 3.2134, + "step": 21193 + }, + { + "epoch": 1.04, + "grad_norm": 0.5457573533058167, + "learning_rate": 0.0004393597961250733, + "loss": 2.9945, + "step": 21194 + }, + { + "epoch": 1.04, + "grad_norm": 0.5486933588981628, + "learning_rate": 0.0004393461617528944, + "loss": 3.1322, + "step": 21195 + }, + { + "epoch": 1.04, + "grad_norm": 0.5403074622154236, + "learning_rate": 0.00043933252701370445, + "loss": 3.1812, + "step": 21196 + }, + { + "epoch": 1.04, + "grad_norm": 0.5348062515258789, + "learning_rate": 0.00043931889190753937, + "loss": 3.1889, + "step": 21197 + }, + { + "epoch": 1.04, + "grad_norm": 0.5630962252616882, + "learning_rate": 0.00043930525643443523, + "loss": 3.1408, + "step": 21198 + }, + { + "epoch": 1.04, + "grad_norm": 0.5177236795425415, + "learning_rate": 0.00043929162059442786, + "loss": 3.0108, + "step": 21199 + }, + { + "epoch": 1.04, + "grad_norm": 0.6250644326210022, + "learning_rate": 0.00043927798438755315, + "loss": 2.906, + "step": 21200 + }, + { + "epoch": 1.04, + "grad_norm": 0.596113920211792, + "learning_rate": 0.00043926434781384703, + "loss": 3.0855, + "step": 21201 + }, + { + "epoch": 1.04, + "grad_norm": 0.5471876859664917, + "learning_rate": 0.0004392507108733454, + "loss": 2.9947, + "step": 21202 + }, + { + "epoch": 1.04, + "grad_norm": 0.529731273651123, + "learning_rate": 0.00043923707356608414, + "loss": 3.0602, + "step": 21203 + }, + { + "epoch": 1.04, + "grad_norm": 0.5773414969444275, + "learning_rate": 0.0004392234358920993, + "loss": 3.1828, + "step": 21204 + }, + { + "epoch": 1.04, + "grad_norm": 0.5588471293449402, + "learning_rate": 0.0004392097978514267, + "loss": 3.2139, + "step": 21205 + }, + { + "epoch": 1.04, + "grad_norm": 0.5582773089408875, + "learning_rate": 0.0004391961594441024, + "loss": 3.0678, + "step": 21206 + }, + { + "epoch": 1.04, + "grad_norm": 0.5097718834877014, + "learning_rate": 0.000439182520670162, + "loss": 3.1309, + "step": 21207 + }, + { + "epoch": 1.04, + "grad_norm": 0.6590524911880493, + "learning_rate": 0.00043916888152964174, + "loss": 2.9753, + "step": 21208 + }, + { + "epoch": 1.04, + "grad_norm": 0.545197606086731, + "learning_rate": 0.0004391552420225775, + "loss": 3.1811, + "step": 21209 + }, + { + "epoch": 1.04, + "grad_norm": 0.557826042175293, + "learning_rate": 0.000439141602149005, + "loss": 3.1237, + "step": 21210 + }, + { + "epoch": 1.04, + "grad_norm": 0.5410374999046326, + "learning_rate": 0.0004391279619089604, + "loss": 3.0101, + "step": 21211 + }, + { + "epoch": 1.04, + "grad_norm": 0.5548087358474731, + "learning_rate": 0.0004391143213024794, + "loss": 3.0225, + "step": 21212 + }, + { + "epoch": 1.04, + "grad_norm": 0.5963765382766724, + "learning_rate": 0.0004391006803295982, + "loss": 3.2314, + "step": 21213 + }, + { + "epoch": 1.04, + "grad_norm": 0.5618867874145508, + "learning_rate": 0.0004390870389903525, + "loss": 3.0364, + "step": 21214 + }, + { + "epoch": 1.04, + "grad_norm": 0.5385682582855225, + "learning_rate": 0.0004390733972847783, + "loss": 3.1931, + "step": 21215 + }, + { + "epoch": 1.04, + "grad_norm": 0.5348148941993713, + "learning_rate": 0.0004390597552129116, + "loss": 3.0025, + "step": 21216 + }, + { + "epoch": 1.04, + "grad_norm": 0.5246903896331787, + "learning_rate": 0.00043904611277478816, + "loss": 3.2069, + "step": 21217 + }, + { + "epoch": 1.04, + "grad_norm": 0.5948673486709595, + "learning_rate": 0.0004390324699704441, + "loss": 2.9454, + "step": 21218 + }, + { + "epoch": 1.04, + "grad_norm": 0.5328468084335327, + "learning_rate": 0.0004390188267999152, + "loss": 3.3611, + "step": 21219 + }, + { + "epoch": 1.04, + "grad_norm": 0.5216394066810608, + "learning_rate": 0.0004390051832632376, + "loss": 3.0009, + "step": 21220 + }, + { + "epoch": 1.04, + "grad_norm": 0.7438657283782959, + "learning_rate": 0.000438991539360447, + "loss": 3.1814, + "step": 21221 + }, + { + "epoch": 1.04, + "grad_norm": 0.5399791598320007, + "learning_rate": 0.0004389778950915794, + "loss": 2.9652, + "step": 21222 + }, + { + "epoch": 1.04, + "grad_norm": 0.6201183795928955, + "learning_rate": 0.00043896425045667083, + "loss": 3.1625, + "step": 21223 + }, + { + "epoch": 1.04, + "grad_norm": 0.6501622200012207, + "learning_rate": 0.00043895060545575716, + "loss": 2.9417, + "step": 21224 + }, + { + "epoch": 1.04, + "grad_norm": 0.5394759178161621, + "learning_rate": 0.00043893696008887436, + "loss": 3.2379, + "step": 21225 + }, + { + "epoch": 1.04, + "grad_norm": 0.5677136182785034, + "learning_rate": 0.0004389233143560583, + "loss": 3.1612, + "step": 21226 + }, + { + "epoch": 1.04, + "grad_norm": 0.5787396430969238, + "learning_rate": 0.00043890966825734504, + "loss": 3.1594, + "step": 21227 + }, + { + "epoch": 1.04, + "grad_norm": 0.5350275635719299, + "learning_rate": 0.00043889602179277037, + "loss": 3.0324, + "step": 21228 + }, + { + "epoch": 1.04, + "grad_norm": 0.5961716175079346, + "learning_rate": 0.0004388823749623704, + "loss": 3.1801, + "step": 21229 + }, + { + "epoch": 1.04, + "grad_norm": 0.5254611372947693, + "learning_rate": 0.00043886872776618084, + "loss": 2.9515, + "step": 21230 + }, + { + "epoch": 1.04, + "grad_norm": 0.5171568393707275, + "learning_rate": 0.0004388550802042379, + "loss": 2.9371, + "step": 21231 + }, + { + "epoch": 1.04, + "grad_norm": 0.5672535300254822, + "learning_rate": 0.0004388414322765773, + "loss": 2.9656, + "step": 21232 + }, + { + "epoch": 1.04, + "grad_norm": 0.551892101764679, + "learning_rate": 0.0004388277839832352, + "loss": 3.0332, + "step": 21233 + }, + { + "epoch": 1.04, + "grad_norm": 0.571571946144104, + "learning_rate": 0.00043881413532424735, + "loss": 3.3032, + "step": 21234 + }, + { + "epoch": 1.04, + "grad_norm": 0.5441715717315674, + "learning_rate": 0.0004388004862996499, + "loss": 3.0702, + "step": 21235 + }, + { + "epoch": 1.04, + "grad_norm": 0.5653895735740662, + "learning_rate": 0.00043878683690947855, + "loss": 3.2735, + "step": 21236 + }, + { + "epoch": 1.04, + "grad_norm": 0.548560380935669, + "learning_rate": 0.00043877318715376937, + "loss": 3.2087, + "step": 21237 + }, + { + "epoch": 1.04, + "grad_norm": 0.5536999702453613, + "learning_rate": 0.00043875953703255844, + "loss": 3.2161, + "step": 21238 + }, + { + "epoch": 1.04, + "grad_norm": 0.5580829977989197, + "learning_rate": 0.00043874588654588145, + "loss": 3.2814, + "step": 21239 + }, + { + "epoch": 1.04, + "grad_norm": 0.5434279441833496, + "learning_rate": 0.00043873223569377456, + "loss": 3.1529, + "step": 21240 + }, + { + "epoch": 1.04, + "grad_norm": 0.5689913630485535, + "learning_rate": 0.00043871858447627375, + "loss": 3.1142, + "step": 21241 + }, + { + "epoch": 1.04, + "grad_norm": 0.554728090763092, + "learning_rate": 0.0004387049328934148, + "loss": 3.0535, + "step": 21242 + }, + { + "epoch": 1.04, + "grad_norm": 0.5019456148147583, + "learning_rate": 0.00043869128094523377, + "loss": 3.2623, + "step": 21243 + }, + { + "epoch": 1.04, + "grad_norm": 0.5831273794174194, + "learning_rate": 0.00043867762863176654, + "loss": 3.2874, + "step": 21244 + }, + { + "epoch": 1.04, + "grad_norm": 0.5366065502166748, + "learning_rate": 0.0004386639759530492, + "loss": 2.943, + "step": 21245 + }, + { + "epoch": 1.04, + "grad_norm": 0.5083133578300476, + "learning_rate": 0.00043865032290911764, + "loss": 3.0425, + "step": 21246 + }, + { + "epoch": 1.04, + "grad_norm": 0.5319649577140808, + "learning_rate": 0.0004386366695000078, + "loss": 3.1767, + "step": 21247 + }, + { + "epoch": 1.04, + "grad_norm": 0.5332556366920471, + "learning_rate": 0.0004386230157257556, + "loss": 3.1384, + "step": 21248 + }, + { + "epoch": 1.04, + "grad_norm": 0.511458694934845, + "learning_rate": 0.0004386093615863972, + "loss": 3.0138, + "step": 21249 + }, + { + "epoch": 1.04, + "grad_norm": 0.5145825147628784, + "learning_rate": 0.0004385957070819683, + "loss": 3.0111, + "step": 21250 + }, + { + "epoch": 1.04, + "grad_norm": 0.5127465128898621, + "learning_rate": 0.00043858205221250496, + "loss": 2.944, + "step": 21251 + }, + { + "epoch": 1.04, + "grad_norm": 0.5706564784049988, + "learning_rate": 0.0004385683969780432, + "loss": 3.2836, + "step": 21252 + }, + { + "epoch": 1.04, + "grad_norm": 0.5455026030540466, + "learning_rate": 0.00043855474137861894, + "loss": 3.0848, + "step": 21253 + }, + { + "epoch": 1.04, + "grad_norm": 0.5656867623329163, + "learning_rate": 0.00043854108541426813, + "loss": 3.077, + "step": 21254 + }, + { + "epoch": 1.04, + "grad_norm": 0.5728113651275635, + "learning_rate": 0.00043852742908502676, + "loss": 3.1265, + "step": 21255 + }, + { + "epoch": 1.04, + "grad_norm": 0.5125572085380554, + "learning_rate": 0.0004385137723909309, + "loss": 3.0643, + "step": 21256 + }, + { + "epoch": 1.04, + "grad_norm": 0.5141593217849731, + "learning_rate": 0.00043850011533201643, + "loss": 2.931, + "step": 21257 + }, + { + "epoch": 1.04, + "grad_norm": 0.5243428945541382, + "learning_rate": 0.0004384864579083192, + "loss": 2.8332, + "step": 21258 + }, + { + "epoch": 1.04, + "grad_norm": 0.5448452234268188, + "learning_rate": 0.0004384728001198753, + "loss": 3.0665, + "step": 21259 + }, + { + "epoch": 1.04, + "grad_norm": 0.5277436375617981, + "learning_rate": 0.0004384591419667208, + "loss": 2.9827, + "step": 21260 + }, + { + "epoch": 1.04, + "grad_norm": 0.5299006700515747, + "learning_rate": 0.00043844548344889147, + "loss": 2.9862, + "step": 21261 + }, + { + "epoch": 1.04, + "grad_norm": 0.5536235570907593, + "learning_rate": 0.00043843182456642336, + "loss": 3.3202, + "step": 21262 + }, + { + "epoch": 1.04, + "grad_norm": 0.5166927576065063, + "learning_rate": 0.0004384181653193525, + "loss": 3.0759, + "step": 21263 + }, + { + "epoch": 1.04, + "grad_norm": 0.5285334587097168, + "learning_rate": 0.0004384045057077149, + "loss": 3.2137, + "step": 21264 + }, + { + "epoch": 1.04, + "grad_norm": 0.5186501145362854, + "learning_rate": 0.00043839084573154635, + "loss": 2.869, + "step": 21265 + }, + { + "epoch": 1.04, + "grad_norm": 0.49396657943725586, + "learning_rate": 0.0004383771853908831, + "loss": 2.9673, + "step": 21266 + }, + { + "epoch": 1.04, + "grad_norm": 0.559752345085144, + "learning_rate": 0.0004383635246857608, + "loss": 3.2458, + "step": 21267 + }, + { + "epoch": 1.04, + "grad_norm": 0.5300299525260925, + "learning_rate": 0.0004383498636162157, + "loss": 3.0761, + "step": 21268 + }, + { + "epoch": 1.04, + "grad_norm": 0.5181698203086853, + "learning_rate": 0.0004383362021822837, + "loss": 3.0975, + "step": 21269 + }, + { + "epoch": 1.04, + "grad_norm": 0.5253779888153076, + "learning_rate": 0.0004383225403840007, + "loss": 3.0754, + "step": 21270 + }, + { + "epoch": 1.04, + "grad_norm": 0.519812822341919, + "learning_rate": 0.0004383088782214029, + "loss": 3.1472, + "step": 21271 + }, + { + "epoch": 1.04, + "grad_norm": 0.5578188300132751, + "learning_rate": 0.000438295215694526, + "loss": 3.1391, + "step": 21272 + }, + { + "epoch": 1.04, + "grad_norm": 0.5534501075744629, + "learning_rate": 0.00043828155280340613, + "loss": 2.9352, + "step": 21273 + }, + { + "epoch": 1.04, + "grad_norm": 0.5486505031585693, + "learning_rate": 0.00043826788954807924, + "loss": 3.2344, + "step": 21274 + }, + { + "epoch": 1.04, + "grad_norm": 0.5528342127799988, + "learning_rate": 0.0004382542259285814, + "loss": 3.014, + "step": 21275 + }, + { + "epoch": 1.04, + "grad_norm": 0.5291114449501038, + "learning_rate": 0.0004382405619449485, + "loss": 2.9962, + "step": 21276 + }, + { + "epoch": 1.04, + "grad_norm": 0.5442724227905273, + "learning_rate": 0.0004382268975972166, + "loss": 2.9954, + "step": 21277 + }, + { + "epoch": 1.04, + "grad_norm": 0.5385951995849609, + "learning_rate": 0.00043821323288542163, + "loss": 3.3394, + "step": 21278 + }, + { + "epoch": 1.04, + "grad_norm": 0.5655089616775513, + "learning_rate": 0.00043819956780959953, + "loss": 3.0762, + "step": 21279 + }, + { + "epoch": 1.04, + "grad_norm": 0.5629735589027405, + "learning_rate": 0.00043818590236978643, + "loss": 3.1988, + "step": 21280 + }, + { + "epoch": 1.04, + "grad_norm": 0.5002853274345398, + "learning_rate": 0.0004381722365660183, + "loss": 3.2544, + "step": 21281 + }, + { + "epoch": 1.04, + "grad_norm": 0.6046683192253113, + "learning_rate": 0.00043815857039833107, + "loss": 3.0964, + "step": 21282 + }, + { + "epoch": 1.04, + "grad_norm": 0.5325875282287598, + "learning_rate": 0.0004381449038667608, + "loss": 3.0506, + "step": 21283 + }, + { + "epoch": 1.04, + "grad_norm": 0.568646252155304, + "learning_rate": 0.0004381312369713433, + "loss": 3.3053, + "step": 21284 + }, + { + "epoch": 1.04, + "grad_norm": 0.5482074022293091, + "learning_rate": 0.0004381175697121148, + "loss": 3.158, + "step": 21285 + }, + { + "epoch": 1.04, + "grad_norm": 0.5522348284721375, + "learning_rate": 0.0004381039020891112, + "loss": 3.0225, + "step": 21286 + }, + { + "epoch": 1.04, + "grad_norm": 0.5628721714019775, + "learning_rate": 0.0004380902341023685, + "loss": 3.2838, + "step": 21287 + }, + { + "epoch": 1.04, + "grad_norm": 0.5041287541389465, + "learning_rate": 0.0004380765657519227, + "loss": 3.1902, + "step": 21288 + }, + { + "epoch": 1.04, + "grad_norm": 0.5522623062133789, + "learning_rate": 0.0004380628970378098, + "loss": 3.1488, + "step": 21289 + }, + { + "epoch": 1.04, + "grad_norm": 0.5240798592567444, + "learning_rate": 0.0004380492279600658, + "loss": 2.9737, + "step": 21290 + }, + { + "epoch": 1.04, + "grad_norm": 0.532329261302948, + "learning_rate": 0.00043803555851872663, + "loss": 3.1439, + "step": 21291 + }, + { + "epoch": 1.04, + "grad_norm": 0.5527257919311523, + "learning_rate": 0.0004380218887138284, + "loss": 3.1124, + "step": 21292 + }, + { + "epoch": 1.04, + "grad_norm": 0.5227271318435669, + "learning_rate": 0.0004380082185454072, + "loss": 3.1057, + "step": 21293 + }, + { + "epoch": 1.04, + "grad_norm": 0.5554690957069397, + "learning_rate": 0.00043799454801349876, + "loss": 3.0577, + "step": 21294 + }, + { + "epoch": 1.04, + "grad_norm": 0.5670045614242554, + "learning_rate": 0.0004379808771181393, + "loss": 3.0609, + "step": 21295 + }, + { + "epoch": 1.04, + "grad_norm": 0.5955886840820312, + "learning_rate": 0.00043796720585936475, + "loss": 3.13, + "step": 21296 + }, + { + "epoch": 1.04, + "grad_norm": 0.546093761920929, + "learning_rate": 0.0004379535342372112, + "loss": 3.1067, + "step": 21297 + }, + { + "epoch": 1.04, + "grad_norm": 0.5156376361846924, + "learning_rate": 0.0004379398622517145, + "loss": 3.1343, + "step": 21298 + }, + { + "epoch": 1.04, + "grad_norm": 0.5549337267875671, + "learning_rate": 0.0004379261899029107, + "loss": 3.1189, + "step": 21299 + }, + { + "epoch": 1.04, + "grad_norm": 0.5297946929931641, + "learning_rate": 0.000437912517190836, + "loss": 2.8991, + "step": 21300 + }, + { + "epoch": 1.04, + "grad_norm": 0.566015362739563, + "learning_rate": 0.0004378988441155262, + "loss": 2.8466, + "step": 21301 + }, + { + "epoch": 1.04, + "grad_norm": 0.5280096530914307, + "learning_rate": 0.0004378851706770174, + "loss": 3.1138, + "step": 21302 + }, + { + "epoch": 1.04, + "grad_norm": 0.5393192768096924, + "learning_rate": 0.00043787149687534555, + "loss": 3.0668, + "step": 21303 + }, + { + "epoch": 1.04, + "grad_norm": 0.5300957560539246, + "learning_rate": 0.00043785782271054676, + "loss": 3.1409, + "step": 21304 + }, + { + "epoch": 1.04, + "grad_norm": 0.5181828737258911, + "learning_rate": 0.000437844148182657, + "loss": 3.191, + "step": 21305 + }, + { + "epoch": 1.04, + "grad_norm": 0.5715096592903137, + "learning_rate": 0.00043783047329171225, + "loss": 3.0793, + "step": 21306 + }, + { + "epoch": 1.04, + "grad_norm": 0.5914278626441956, + "learning_rate": 0.0004378167980377486, + "loss": 3.0991, + "step": 21307 + }, + { + "epoch": 1.04, + "grad_norm": 0.520348310470581, + "learning_rate": 0.0004378031224208021, + "loss": 2.8398, + "step": 21308 + }, + { + "epoch": 1.04, + "grad_norm": 0.5296949744224548, + "learning_rate": 0.00043778944644090855, + "loss": 3.1946, + "step": 21309 + }, + { + "epoch": 1.04, + "grad_norm": 0.5429492592811584, + "learning_rate": 0.00043777577009810416, + "loss": 3.0502, + "step": 21310 + }, + { + "epoch": 1.04, + "grad_norm": 0.524493396282196, + "learning_rate": 0.00043776209339242494, + "loss": 2.9544, + "step": 21311 + }, + { + "epoch": 1.04, + "grad_norm": 0.5563609004020691, + "learning_rate": 0.00043774841632390684, + "loss": 3.0386, + "step": 21312 + }, + { + "epoch": 1.04, + "grad_norm": 0.5135151147842407, + "learning_rate": 0.0004377347388925859, + "loss": 3.131, + "step": 21313 + }, + { + "epoch": 1.04, + "grad_norm": 0.5470807552337646, + "learning_rate": 0.00043772106109849824, + "loss": 3.1214, + "step": 21314 + }, + { + "epoch": 1.04, + "grad_norm": 0.5473372340202332, + "learning_rate": 0.0004377073829416798, + "loss": 3.019, + "step": 21315 + }, + { + "epoch": 1.04, + "grad_norm": 0.5544005632400513, + "learning_rate": 0.00043769370442216655, + "loss": 3.2527, + "step": 21316 + }, + { + "epoch": 1.04, + "grad_norm": 0.5615202188491821, + "learning_rate": 0.0004376800255399946, + "loss": 3.1916, + "step": 21317 + }, + { + "epoch": 1.04, + "grad_norm": 0.5153510570526123, + "learning_rate": 0.0004376663462952, + "loss": 3.1229, + "step": 21318 + }, + { + "epoch": 1.04, + "grad_norm": 0.5381267666816711, + "learning_rate": 0.00043765266668781874, + "loss": 3.0508, + "step": 21319 + }, + { + "epoch": 1.04, + "grad_norm": 0.5629384517669678, + "learning_rate": 0.00043763898671788677, + "loss": 2.9926, + "step": 21320 + }, + { + "epoch": 1.04, + "grad_norm": 0.5090012550354004, + "learning_rate": 0.00043762530638544026, + "loss": 3.157, + "step": 21321 + }, + { + "epoch": 1.04, + "grad_norm": 0.5678680539131165, + "learning_rate": 0.00043761162569051526, + "loss": 2.9729, + "step": 21322 + }, + { + "epoch": 1.04, + "grad_norm": 0.5650219917297363, + "learning_rate": 0.0004375979446331476, + "loss": 3.3654, + "step": 21323 + }, + { + "epoch": 1.05, + "grad_norm": 0.5620492100715637, + "learning_rate": 0.00043758426321337355, + "loss": 3.1908, + "step": 21324 + }, + { + "epoch": 1.05, + "grad_norm": 0.5956307053565979, + "learning_rate": 0.0004375705814312288, + "loss": 2.8768, + "step": 21325 + }, + { + "epoch": 1.05, + "grad_norm": 0.5379623174667358, + "learning_rate": 0.00043755689928674985, + "loss": 3.0727, + "step": 21326 + }, + { + "epoch": 1.05, + "grad_norm": 0.5664665699005127, + "learning_rate": 0.00043754321677997237, + "loss": 2.7609, + "step": 21327 + }, + { + "epoch": 1.05, + "grad_norm": 0.527374267578125, + "learning_rate": 0.0004375295339109326, + "loss": 3.1016, + "step": 21328 + }, + { + "epoch": 1.05, + "grad_norm": 0.5893529653549194, + "learning_rate": 0.0004375158506796665, + "loss": 3.1118, + "step": 21329 + }, + { + "epoch": 1.05, + "grad_norm": 0.5682183504104614, + "learning_rate": 0.0004375021670862101, + "loss": 3.1754, + "step": 21330 + }, + { + "epoch": 1.05, + "grad_norm": 0.5709665417671204, + "learning_rate": 0.0004374884831305995, + "loss": 3.0515, + "step": 21331 + }, + { + "epoch": 1.05, + "grad_norm": 0.5187592506408691, + "learning_rate": 0.00043747479881287054, + "loss": 3.2883, + "step": 21332 + }, + { + "epoch": 1.05, + "grad_norm": 0.5648815035820007, + "learning_rate": 0.0004374611141330596, + "loss": 3.1431, + "step": 21333 + }, + { + "epoch": 1.05, + "grad_norm": 0.500318169593811, + "learning_rate": 0.0004374474290912025, + "loss": 3.0056, + "step": 21334 + }, + { + "epoch": 1.05, + "grad_norm": 0.5123018622398376, + "learning_rate": 0.00043743374368733517, + "loss": 3.0267, + "step": 21335 + }, + { + "epoch": 1.05, + "grad_norm": 0.5368598103523254, + "learning_rate": 0.00043742005792149393, + "loss": 3.1508, + "step": 21336 + }, + { + "epoch": 1.05, + "grad_norm": 0.5632544159889221, + "learning_rate": 0.00043740637179371473, + "loss": 3.1209, + "step": 21337 + }, + { + "epoch": 1.05, + "grad_norm": 0.5279106497764587, + "learning_rate": 0.0004373926853040336, + "loss": 3.3057, + "step": 21338 + }, + { + "epoch": 1.05, + "grad_norm": 0.5026139616966248, + "learning_rate": 0.0004373789984524865, + "loss": 3.1144, + "step": 21339 + }, + { + "epoch": 1.05, + "grad_norm": 0.5417594909667969, + "learning_rate": 0.0004373653112391096, + "loss": 3.1992, + "step": 21340 + }, + { + "epoch": 1.05, + "grad_norm": 0.5107925534248352, + "learning_rate": 0.0004373516236639389, + "loss": 3.1297, + "step": 21341 + }, + { + "epoch": 1.05, + "grad_norm": 0.532360315322876, + "learning_rate": 0.00043733793572701046, + "loss": 3.104, + "step": 21342 + }, + { + "epoch": 1.05, + "grad_norm": 0.5431941151618958, + "learning_rate": 0.0004373242474283603, + "loss": 3.0766, + "step": 21343 + }, + { + "epoch": 1.05, + "grad_norm": 0.5421182513237, + "learning_rate": 0.00043731055876802463, + "loss": 3.2512, + "step": 21344 + }, + { + "epoch": 1.05, + "grad_norm": 0.5257422924041748, + "learning_rate": 0.0004372968697460392, + "loss": 3.1045, + "step": 21345 + }, + { + "epoch": 1.05, + "grad_norm": 0.4982752203941345, + "learning_rate": 0.0004372831803624403, + "loss": 3.182, + "step": 21346 + }, + { + "epoch": 1.05, + "grad_norm": 0.6206594705581665, + "learning_rate": 0.000437269490617264, + "loss": 3.0784, + "step": 21347 + }, + { + "epoch": 1.05, + "grad_norm": 0.5198937058448792, + "learning_rate": 0.00043725580051054625, + "loss": 3.357, + "step": 21348 + }, + { + "epoch": 1.05, + "grad_norm": 0.5259478092193604, + "learning_rate": 0.00043724211004232303, + "loss": 3.0271, + "step": 21349 + }, + { + "epoch": 1.05, + "grad_norm": 0.523259699344635, + "learning_rate": 0.00043722841921263055, + "loss": 3.1932, + "step": 21350 + }, + { + "epoch": 1.05, + "grad_norm": 0.5149825215339661, + "learning_rate": 0.0004372147280215049, + "loss": 3.171, + "step": 21351 + }, + { + "epoch": 1.05, + "grad_norm": 0.5463266968727112, + "learning_rate": 0.00043720103646898204, + "loss": 2.9304, + "step": 21352 + }, + { + "epoch": 1.05, + "grad_norm": 0.5451768636703491, + "learning_rate": 0.000437187344555098, + "loss": 3.0214, + "step": 21353 + }, + { + "epoch": 1.05, + "grad_norm": 0.5087425112724304, + "learning_rate": 0.0004371736522798889, + "loss": 2.9616, + "step": 21354 + }, + { + "epoch": 1.05, + "grad_norm": 0.5477582216262817, + "learning_rate": 0.0004371599596433909, + "loss": 3.0898, + "step": 21355 + }, + { + "epoch": 1.05, + "grad_norm": 0.5459511280059814, + "learning_rate": 0.0004371462666456398, + "loss": 3.2466, + "step": 21356 + }, + { + "epoch": 1.05, + "grad_norm": 0.5397904515266418, + "learning_rate": 0.00043713257328667195, + "loss": 2.9689, + "step": 21357 + }, + { + "epoch": 1.05, + "grad_norm": 0.528396487236023, + "learning_rate": 0.00043711887956652325, + "loss": 2.9729, + "step": 21358 + }, + { + "epoch": 1.05, + "grad_norm": 0.5352534651756287, + "learning_rate": 0.00043710518548522985, + "loss": 3.2941, + "step": 21359 + }, + { + "epoch": 1.05, + "grad_norm": 0.529255747795105, + "learning_rate": 0.0004370914910428278, + "loss": 3.1262, + "step": 21360 + }, + { + "epoch": 1.05, + "grad_norm": 0.5711490511894226, + "learning_rate": 0.000437077796239353, + "loss": 2.9798, + "step": 21361 + }, + { + "epoch": 1.05, + "grad_norm": 0.5521866083145142, + "learning_rate": 0.00043706410107484183, + "loss": 3.0056, + "step": 21362 + }, + { + "epoch": 1.05, + "grad_norm": 0.5383753776550293, + "learning_rate": 0.0004370504055493301, + "loss": 2.9838, + "step": 21363 + }, + { + "epoch": 1.05, + "grad_norm": 0.5527660846710205, + "learning_rate": 0.000437036709662854, + "loss": 3.1455, + "step": 21364 + }, + { + "epoch": 1.05, + "grad_norm": 0.5740596652030945, + "learning_rate": 0.0004370230134154496, + "loss": 3.1023, + "step": 21365 + }, + { + "epoch": 1.05, + "grad_norm": 0.5765985250473022, + "learning_rate": 0.0004370093168071529, + "loss": 2.9812, + "step": 21366 + }, + { + "epoch": 1.05, + "grad_norm": 0.5971035957336426, + "learning_rate": 0.00043699561983800007, + "loss": 3.1561, + "step": 21367 + }, + { + "epoch": 1.05, + "grad_norm": 0.5382588505744934, + "learning_rate": 0.0004369819225080272, + "loss": 3.1575, + "step": 21368 + }, + { + "epoch": 1.05, + "grad_norm": 0.5307689905166626, + "learning_rate": 0.0004369682248172702, + "loss": 3.0418, + "step": 21369 + }, + { + "epoch": 1.05, + "grad_norm": 0.551903486251831, + "learning_rate": 0.00043695452676576537, + "loss": 3.1686, + "step": 21370 + }, + { + "epoch": 1.05, + "grad_norm": 0.5409027934074402, + "learning_rate": 0.00043694082835354854, + "loss": 2.9421, + "step": 21371 + }, + { + "epoch": 1.05, + "grad_norm": 0.5229660868644714, + "learning_rate": 0.000436927129580656, + "loss": 2.9775, + "step": 21372 + }, + { + "epoch": 1.05, + "grad_norm": 0.5019228458404541, + "learning_rate": 0.0004369134304471238, + "loss": 3.0205, + "step": 21373 + }, + { + "epoch": 1.05, + "grad_norm": 0.5376384854316711, + "learning_rate": 0.0004368997309529879, + "loss": 3.1344, + "step": 21374 + }, + { + "epoch": 1.05, + "grad_norm": 0.5281094908714294, + "learning_rate": 0.0004368860310982845, + "loss": 2.9118, + "step": 21375 + }, + { + "epoch": 1.05, + "grad_norm": 0.5503444075584412, + "learning_rate": 0.0004368723308830495, + "loss": 2.9234, + "step": 21376 + }, + { + "epoch": 1.05, + "grad_norm": 0.5356557369232178, + "learning_rate": 0.00043685863030731934, + "loss": 3.2536, + "step": 21377 + }, + { + "epoch": 1.05, + "grad_norm": 0.5426025390625, + "learning_rate": 0.0004368449293711298, + "loss": 3.1201, + "step": 21378 + }, + { + "epoch": 1.05, + "grad_norm": 0.5253592133522034, + "learning_rate": 0.00043683122807451695, + "loss": 3.1486, + "step": 21379 + }, + { + "epoch": 1.05, + "grad_norm": 0.5157790184020996, + "learning_rate": 0.0004368175264175171, + "loss": 2.9994, + "step": 21380 + }, + { + "epoch": 1.05, + "grad_norm": 0.5356464385986328, + "learning_rate": 0.0004368038244001661, + "loss": 2.9935, + "step": 21381 + }, + { + "epoch": 1.05, + "grad_norm": 0.5198023915290833, + "learning_rate": 0.00043679012202250023, + "loss": 2.9333, + "step": 21382 + }, + { + "epoch": 1.05, + "grad_norm": 0.6020377278327942, + "learning_rate": 0.00043677641928455553, + "loss": 3.1164, + "step": 21383 + }, + { + "epoch": 1.05, + "grad_norm": 0.540126383304596, + "learning_rate": 0.0004367627161863681, + "loss": 3.0763, + "step": 21384 + }, + { + "epoch": 1.05, + "grad_norm": 0.5544701814651489, + "learning_rate": 0.00043674901272797395, + "loss": 2.9656, + "step": 21385 + }, + { + "epoch": 1.05, + "grad_norm": 0.57041335105896, + "learning_rate": 0.0004367353089094092, + "loss": 3.096, + "step": 21386 + }, + { + "epoch": 1.05, + "grad_norm": 0.5330620408058167, + "learning_rate": 0.0004367216047307099, + "loss": 3.0942, + "step": 21387 + }, + { + "epoch": 1.05, + "grad_norm": 0.5197447538375854, + "learning_rate": 0.0004367079001919124, + "loss": 2.9982, + "step": 21388 + }, + { + "epoch": 1.05, + "grad_norm": 0.5680919885635376, + "learning_rate": 0.0004366941952930524, + "loss": 3.1648, + "step": 21389 + }, + { + "epoch": 1.05, + "grad_norm": 0.6198605895042419, + "learning_rate": 0.0004366804900341663, + "loss": 3.0439, + "step": 21390 + }, + { + "epoch": 1.05, + "grad_norm": 0.5180275440216064, + "learning_rate": 0.00043666678441529004, + "loss": 2.7934, + "step": 21391 + }, + { + "epoch": 1.05, + "grad_norm": 0.5544524192810059, + "learning_rate": 0.00043665307843645976, + "loss": 2.8057, + "step": 21392 + }, + { + "epoch": 1.05, + "grad_norm": 0.6640998125076294, + "learning_rate": 0.0004366393720977116, + "loss": 3.081, + "step": 21393 + }, + { + "epoch": 1.05, + "grad_norm": 0.5651010870933533, + "learning_rate": 0.00043662566539908165, + "loss": 2.9706, + "step": 21394 + }, + { + "epoch": 1.05, + "grad_norm": 0.5061413049697876, + "learning_rate": 0.00043661195834060603, + "loss": 3.0914, + "step": 21395 + }, + { + "epoch": 1.05, + "grad_norm": 0.5113970637321472, + "learning_rate": 0.00043659825092232075, + "loss": 3.0878, + "step": 21396 + }, + { + "epoch": 1.05, + "grad_norm": 0.521929919719696, + "learning_rate": 0.0004365845431442619, + "loss": 3.1446, + "step": 21397 + }, + { + "epoch": 1.05, + "grad_norm": 0.5344848036766052, + "learning_rate": 0.0004365708350064657, + "loss": 3.2237, + "step": 21398 + }, + { + "epoch": 1.05, + "grad_norm": 0.5206398963928223, + "learning_rate": 0.0004365571265089683, + "loss": 3.0752, + "step": 21399 + }, + { + "epoch": 1.05, + "grad_norm": 0.583916187286377, + "learning_rate": 0.0004365434176518057, + "loss": 3.0648, + "step": 21400 + }, + { + "epoch": 1.05, + "grad_norm": 0.5128787159919739, + "learning_rate": 0.0004365297084350139, + "loss": 3.3052, + "step": 21401 + }, + { + "epoch": 1.05, + "grad_norm": 0.5522475242614746, + "learning_rate": 0.00043651599885862917, + "loss": 3.155, + "step": 21402 + }, + { + "epoch": 1.05, + "grad_norm": 0.5218787789344788, + "learning_rate": 0.00043650228892268755, + "loss": 2.8891, + "step": 21403 + }, + { + "epoch": 1.05, + "grad_norm": 0.5376594662666321, + "learning_rate": 0.0004364885786272252, + "loss": 3.1309, + "step": 21404 + }, + { + "epoch": 1.05, + "grad_norm": 0.5462920069694519, + "learning_rate": 0.0004364748679722782, + "loss": 3.139, + "step": 21405 + }, + { + "epoch": 1.05, + "grad_norm": 0.5011506080627441, + "learning_rate": 0.0004364611569578827, + "loss": 3.2214, + "step": 21406 + }, + { + "epoch": 1.05, + "grad_norm": 0.5488290190696716, + "learning_rate": 0.00043644744558407465, + "loss": 2.852, + "step": 21407 + }, + { + "epoch": 1.05, + "grad_norm": 0.5682697296142578, + "learning_rate": 0.0004364337338508904, + "loss": 3.0328, + "step": 21408 + }, + { + "epoch": 1.05, + "grad_norm": 0.5012774467468262, + "learning_rate": 0.0004364200217583659, + "loss": 2.9092, + "step": 21409 + }, + { + "epoch": 1.05, + "grad_norm": 0.5332022309303284, + "learning_rate": 0.00043640630930653744, + "loss": 3.1763, + "step": 21410 + }, + { + "epoch": 1.05, + "grad_norm": 0.49493035674095154, + "learning_rate": 0.00043639259649544084, + "loss": 3.2316, + "step": 21411 + }, + { + "epoch": 1.05, + "grad_norm": 0.5405813455581665, + "learning_rate": 0.00043637888332511245, + "loss": 3.1027, + "step": 21412 + }, + { + "epoch": 1.05, + "grad_norm": 0.531501829624176, + "learning_rate": 0.0004363651697955884, + "loss": 3.249, + "step": 21413 + }, + { + "epoch": 1.05, + "grad_norm": 0.5270764827728271, + "learning_rate": 0.0004363514559069047, + "loss": 2.9355, + "step": 21414 + }, + { + "epoch": 1.05, + "grad_norm": 0.5820385217666626, + "learning_rate": 0.00043633774165909744, + "loss": 3.1851, + "step": 21415 + }, + { + "epoch": 1.05, + "grad_norm": 0.5525915026664734, + "learning_rate": 0.0004363240270522029, + "loss": 2.9714, + "step": 21416 + }, + { + "epoch": 1.05, + "grad_norm": 0.5313616991043091, + "learning_rate": 0.00043631031208625704, + "loss": 3.1641, + "step": 21417 + }, + { + "epoch": 1.05, + "grad_norm": 0.5130735635757446, + "learning_rate": 0.0004362965967612961, + "loss": 3.1911, + "step": 21418 + }, + { + "epoch": 1.05, + "grad_norm": 0.5332739353179932, + "learning_rate": 0.0004362828810773561, + "loss": 3.2662, + "step": 21419 + }, + { + "epoch": 1.05, + "grad_norm": 0.4927477538585663, + "learning_rate": 0.0004362691650344732, + "loss": 3.204, + "step": 21420 + }, + { + "epoch": 1.05, + "grad_norm": 0.5364248752593994, + "learning_rate": 0.00043625544863268366, + "loss": 3.1127, + "step": 21421 + }, + { + "epoch": 1.05, + "grad_norm": 0.5149003267288208, + "learning_rate": 0.0004362417318720234, + "loss": 3.1497, + "step": 21422 + }, + { + "epoch": 1.05, + "grad_norm": 0.5644574761390686, + "learning_rate": 0.00043622801475252866, + "loss": 3.0328, + "step": 21423 + }, + { + "epoch": 1.05, + "grad_norm": 0.5710899829864502, + "learning_rate": 0.0004362142972742356, + "loss": 2.9797, + "step": 21424 + }, + { + "epoch": 1.05, + "grad_norm": 0.5768548846244812, + "learning_rate": 0.00043620057943718023, + "loss": 3.2249, + "step": 21425 + }, + { + "epoch": 1.05, + "grad_norm": 0.5565029382705688, + "learning_rate": 0.0004361868612413988, + "loss": 3.0481, + "step": 21426 + }, + { + "epoch": 1.05, + "grad_norm": 0.5066186785697937, + "learning_rate": 0.0004361731426869273, + "loss": 3.2739, + "step": 21427 + }, + { + "epoch": 1.05, + "grad_norm": 0.5121492743492126, + "learning_rate": 0.000436159423773802, + "loss": 3.0838, + "step": 21428 + }, + { + "epoch": 1.05, + "grad_norm": 0.5542187094688416, + "learning_rate": 0.000436145704502059, + "loss": 3.2834, + "step": 21429 + }, + { + "epoch": 1.05, + "grad_norm": 0.5797318816184998, + "learning_rate": 0.0004361319848717344, + "loss": 2.9999, + "step": 21430 + }, + { + "epoch": 1.05, + "grad_norm": 0.5291181802749634, + "learning_rate": 0.00043611826488286433, + "loss": 3.0283, + "step": 21431 + }, + { + "epoch": 1.05, + "grad_norm": 0.5589236617088318, + "learning_rate": 0.00043610454453548505, + "loss": 3.2367, + "step": 21432 + }, + { + "epoch": 1.05, + "grad_norm": 0.5176942348480225, + "learning_rate": 0.0004360908238296324, + "loss": 3.0578, + "step": 21433 + }, + { + "epoch": 1.05, + "grad_norm": 0.5708388686180115, + "learning_rate": 0.0004360771027653428, + "loss": 2.9649, + "step": 21434 + }, + { + "epoch": 1.05, + "grad_norm": 0.5405948758125305, + "learning_rate": 0.0004360633813426524, + "loss": 3.1171, + "step": 21435 + }, + { + "epoch": 1.05, + "grad_norm": 0.5287649631500244, + "learning_rate": 0.00043604965956159716, + "loss": 3.1744, + "step": 21436 + }, + { + "epoch": 1.05, + "grad_norm": 0.5235514640808105, + "learning_rate": 0.0004360359374222133, + "loss": 2.9239, + "step": 21437 + }, + { + "epoch": 1.05, + "grad_norm": 0.5278797745704651, + "learning_rate": 0.0004360222149245369, + "loss": 2.9237, + "step": 21438 + }, + { + "epoch": 1.05, + "grad_norm": 0.5604250431060791, + "learning_rate": 0.0004360084920686042, + "loss": 3.0579, + "step": 21439 + }, + { + "epoch": 1.05, + "grad_norm": 0.5143141150474548, + "learning_rate": 0.0004359947688544514, + "loss": 3.105, + "step": 21440 + }, + { + "epoch": 1.05, + "grad_norm": 0.5379894971847534, + "learning_rate": 0.0004359810452821145, + "loss": 2.9211, + "step": 21441 + }, + { + "epoch": 1.05, + "grad_norm": 0.5371367931365967, + "learning_rate": 0.00043596732135162965, + "loss": 3.3108, + "step": 21442 + }, + { + "epoch": 1.05, + "grad_norm": 0.5497921109199524, + "learning_rate": 0.00043595359706303307, + "loss": 3.1547, + "step": 21443 + }, + { + "epoch": 1.05, + "grad_norm": 0.5384805798530579, + "learning_rate": 0.0004359398724163609, + "loss": 3.2494, + "step": 21444 + }, + { + "epoch": 1.05, + "grad_norm": 0.5575272440910339, + "learning_rate": 0.00043592614741164924, + "loss": 3.0403, + "step": 21445 + }, + { + "epoch": 1.05, + "grad_norm": 0.5456324219703674, + "learning_rate": 0.0004359124220489343, + "loss": 3.1509, + "step": 21446 + }, + { + "epoch": 1.05, + "grad_norm": 0.5280311107635498, + "learning_rate": 0.00043589869632825216, + "loss": 3.0561, + "step": 21447 + }, + { + "epoch": 1.05, + "grad_norm": 0.542693555355072, + "learning_rate": 0.000435884970249639, + "loss": 2.9996, + "step": 21448 + }, + { + "epoch": 1.05, + "grad_norm": 0.531836748123169, + "learning_rate": 0.00043587124381313104, + "loss": 3.1081, + "step": 21449 + }, + { + "epoch": 1.05, + "grad_norm": 0.7221289873123169, + "learning_rate": 0.0004358575170187644, + "loss": 2.9935, + "step": 21450 + }, + { + "epoch": 1.05, + "grad_norm": 0.5565702319145203, + "learning_rate": 0.0004358437898665751, + "loss": 2.9451, + "step": 21451 + }, + { + "epoch": 1.05, + "grad_norm": 0.5191698670387268, + "learning_rate": 0.0004358300623565994, + "loss": 2.9686, + "step": 21452 + }, + { + "epoch": 1.05, + "grad_norm": 0.5412079691886902, + "learning_rate": 0.0004358163344888735, + "loss": 3.1743, + "step": 21453 + }, + { + "epoch": 1.05, + "grad_norm": 0.5316622257232666, + "learning_rate": 0.00043580260626343355, + "loss": 3.1001, + "step": 21454 + }, + { + "epoch": 1.05, + "grad_norm": 0.5519059896469116, + "learning_rate": 0.00043578887768031563, + "loss": 3.2956, + "step": 21455 + }, + { + "epoch": 1.05, + "grad_norm": 0.507394552230835, + "learning_rate": 0.0004357751487395559, + "loss": 2.9144, + "step": 21456 + }, + { + "epoch": 1.05, + "grad_norm": 0.5262680649757385, + "learning_rate": 0.00043576141944119066, + "loss": 3.1354, + "step": 21457 + }, + { + "epoch": 1.05, + "grad_norm": 0.5504863858222961, + "learning_rate": 0.0004357476897852558, + "loss": 3.2812, + "step": 21458 + }, + { + "epoch": 1.05, + "grad_norm": 0.5362016558647156, + "learning_rate": 0.0004357339597717878, + "loss": 3.0618, + "step": 21459 + }, + { + "epoch": 1.05, + "grad_norm": 0.5501962900161743, + "learning_rate": 0.0004357202294008227, + "loss": 3.2196, + "step": 21460 + }, + { + "epoch": 1.05, + "grad_norm": 0.5307948589324951, + "learning_rate": 0.00043570649867239657, + "loss": 3.2588, + "step": 21461 + }, + { + "epoch": 1.05, + "grad_norm": 0.5310550332069397, + "learning_rate": 0.00043569276758654565, + "loss": 3.2253, + "step": 21462 + }, + { + "epoch": 1.05, + "grad_norm": 0.6042519211769104, + "learning_rate": 0.00043567903614330603, + "loss": 3.2064, + "step": 21463 + }, + { + "epoch": 1.05, + "grad_norm": 0.5459402799606323, + "learning_rate": 0.00043566530434271407, + "loss": 2.8162, + "step": 21464 + }, + { + "epoch": 1.05, + "grad_norm": 0.5372798442840576, + "learning_rate": 0.0004356515721848057, + "loss": 3.1366, + "step": 21465 + }, + { + "epoch": 1.05, + "grad_norm": 0.5367051959037781, + "learning_rate": 0.0004356378396696172, + "loss": 3.0794, + "step": 21466 + }, + { + "epoch": 1.05, + "grad_norm": 0.5590572953224182, + "learning_rate": 0.0004356241067971848, + "loss": 3.1426, + "step": 21467 + }, + { + "epoch": 1.05, + "grad_norm": 0.5460636615753174, + "learning_rate": 0.0004356103735675446, + "loss": 3.1592, + "step": 21468 + }, + { + "epoch": 1.05, + "grad_norm": 0.5461768507957458, + "learning_rate": 0.00043559663998073264, + "loss": 2.9046, + "step": 21469 + }, + { + "epoch": 1.05, + "grad_norm": 0.5281131267547607, + "learning_rate": 0.00043558290603678537, + "loss": 2.9527, + "step": 21470 + }, + { + "epoch": 1.05, + "grad_norm": 0.5545151829719543, + "learning_rate": 0.00043556917173573877, + "loss": 2.9797, + "step": 21471 + }, + { + "epoch": 1.05, + "grad_norm": 0.5535064935684204, + "learning_rate": 0.00043555543707762915, + "loss": 3.0521, + "step": 21472 + }, + { + "epoch": 1.05, + "grad_norm": 0.5981447100639343, + "learning_rate": 0.0004355417020624925, + "loss": 3.0078, + "step": 21473 + }, + { + "epoch": 1.05, + "grad_norm": 0.5304649472236633, + "learning_rate": 0.000435527966690365, + "loss": 3.0455, + "step": 21474 + }, + { + "epoch": 1.05, + "grad_norm": 0.5150699615478516, + "learning_rate": 0.0004355142309612831, + "loss": 3.1424, + "step": 21475 + }, + { + "epoch": 1.05, + "grad_norm": 0.5696814060211182, + "learning_rate": 0.0004355004948752827, + "loss": 3.0476, + "step": 21476 + }, + { + "epoch": 1.05, + "grad_norm": 0.6048765778541565, + "learning_rate": 0.0004354867584324002, + "loss": 3.0301, + "step": 21477 + }, + { + "epoch": 1.05, + "grad_norm": 0.5578874349594116, + "learning_rate": 0.0004354730216326714, + "loss": 2.9297, + "step": 21478 + }, + { + "epoch": 1.05, + "grad_norm": 0.5642783641815186, + "learning_rate": 0.00043545928447613294, + "loss": 3.1453, + "step": 21479 + }, + { + "epoch": 1.05, + "grad_norm": 0.5326113104820251, + "learning_rate": 0.00043544554696282075, + "loss": 3.2642, + "step": 21480 + }, + { + "epoch": 1.05, + "grad_norm": 0.54203861951828, + "learning_rate": 0.00043543180909277104, + "loss": 2.9685, + "step": 21481 + }, + { + "epoch": 1.05, + "grad_norm": 0.5783830881118774, + "learning_rate": 0.00043541807086602, + "loss": 3.046, + "step": 21482 + }, + { + "epoch": 1.05, + "grad_norm": 0.5656664371490479, + "learning_rate": 0.0004354043322826039, + "loss": 3.1215, + "step": 21483 + }, + { + "epoch": 1.05, + "grad_norm": 0.5210521221160889, + "learning_rate": 0.00043539059334255875, + "loss": 3.3077, + "step": 21484 + }, + { + "epoch": 1.05, + "grad_norm": 0.5449597239494324, + "learning_rate": 0.00043537685404592086, + "loss": 3.0667, + "step": 21485 + }, + { + "epoch": 1.05, + "grad_norm": 0.5515075325965881, + "learning_rate": 0.00043536311439272644, + "loss": 3.0646, + "step": 21486 + }, + { + "epoch": 1.05, + "grad_norm": 0.5687845945358276, + "learning_rate": 0.00043534937438301154, + "loss": 3.2882, + "step": 21487 + }, + { + "epoch": 1.05, + "grad_norm": 0.6585469245910645, + "learning_rate": 0.00043533563401681254, + "loss": 3.0092, + "step": 21488 + }, + { + "epoch": 1.05, + "grad_norm": 0.5366838574409485, + "learning_rate": 0.00043532189329416543, + "loss": 3.0872, + "step": 21489 + }, + { + "epoch": 1.05, + "grad_norm": 0.5649664998054504, + "learning_rate": 0.00043530815221510654, + "loss": 2.9941, + "step": 21490 + }, + { + "epoch": 1.05, + "grad_norm": 0.5187537670135498, + "learning_rate": 0.000435294410779672, + "loss": 3.248, + "step": 21491 + }, + { + "epoch": 1.05, + "grad_norm": 0.5714203119277954, + "learning_rate": 0.0004352806689878981, + "loss": 3.077, + "step": 21492 + }, + { + "epoch": 1.05, + "grad_norm": 0.5206428170204163, + "learning_rate": 0.0004352669268398208, + "loss": 2.8386, + "step": 21493 + }, + { + "epoch": 1.05, + "grad_norm": 0.5927132368087769, + "learning_rate": 0.00043525318433547654, + "loss": 2.9447, + "step": 21494 + }, + { + "epoch": 1.05, + "grad_norm": 0.5436407923698425, + "learning_rate": 0.0004352394414749014, + "loss": 3.2219, + "step": 21495 + }, + { + "epoch": 1.05, + "grad_norm": 0.5328101515769958, + "learning_rate": 0.0004352256982581316, + "loss": 3.0678, + "step": 21496 + }, + { + "epoch": 1.05, + "grad_norm": 0.7768601179122925, + "learning_rate": 0.0004352119546852034, + "loss": 3.0912, + "step": 21497 + }, + { + "epoch": 1.05, + "grad_norm": 0.531445324420929, + "learning_rate": 0.0004351982107561529, + "loss": 3.0633, + "step": 21498 + }, + { + "epoch": 1.05, + "grad_norm": 0.565925657749176, + "learning_rate": 0.00043518446647101625, + "loss": 3.1066, + "step": 21499 + }, + { + "epoch": 1.05, + "grad_norm": 0.5696445107460022, + "learning_rate": 0.0004351707218298298, + "loss": 3.1754, + "step": 21500 + }, + { + "epoch": 1.05, + "grad_norm": 0.5589838624000549, + "learning_rate": 0.0004351569768326298, + "loss": 2.9757, + "step": 21501 + }, + { + "epoch": 1.05, + "grad_norm": 0.539966881275177, + "learning_rate": 0.0004351432314794522, + "loss": 3.2406, + "step": 21502 + }, + { + "epoch": 1.05, + "grad_norm": 0.5462685823440552, + "learning_rate": 0.0004351294857703333, + "loss": 3.1382, + "step": 21503 + }, + { + "epoch": 1.05, + "grad_norm": 0.5470060110092163, + "learning_rate": 0.0004351157397053094, + "loss": 3.0518, + "step": 21504 + }, + { + "epoch": 1.05, + "grad_norm": 0.5174694657325745, + "learning_rate": 0.0004351019932844166, + "loss": 3.1323, + "step": 21505 + }, + { + "epoch": 1.05, + "grad_norm": 0.533613920211792, + "learning_rate": 0.00043508824650769125, + "loss": 3.0013, + "step": 21506 + }, + { + "epoch": 1.05, + "grad_norm": 0.5810818076133728, + "learning_rate": 0.0004350744993751694, + "loss": 3.3575, + "step": 21507 + }, + { + "epoch": 1.05, + "grad_norm": 0.5429716110229492, + "learning_rate": 0.0004350607518868873, + "loss": 3.0456, + "step": 21508 + }, + { + "epoch": 1.05, + "grad_norm": 0.5317225456237793, + "learning_rate": 0.0004350470040428812, + "loss": 3.0374, + "step": 21509 + }, + { + "epoch": 1.05, + "grad_norm": 0.5231348276138306, + "learning_rate": 0.0004350332558431872, + "loss": 3.0947, + "step": 21510 + }, + { + "epoch": 1.05, + "grad_norm": 0.5566467642784119, + "learning_rate": 0.00043501950728784165, + "loss": 3.2049, + "step": 21511 + }, + { + "epoch": 1.05, + "grad_norm": 0.5202820301055908, + "learning_rate": 0.0004350057583768808, + "loss": 3.0884, + "step": 21512 + }, + { + "epoch": 1.05, + "grad_norm": 0.5291932821273804, + "learning_rate": 0.00043499200911034067, + "loss": 2.998, + "step": 21513 + }, + { + "epoch": 1.05, + "grad_norm": 0.5607914924621582, + "learning_rate": 0.0004349782594882576, + "loss": 3.1113, + "step": 21514 + }, + { + "epoch": 1.05, + "grad_norm": 0.557773768901825, + "learning_rate": 0.0004349645095106677, + "loss": 3.0285, + "step": 21515 + }, + { + "epoch": 1.05, + "grad_norm": 0.5368457436561584, + "learning_rate": 0.00043495075917760726, + "loss": 3.0034, + "step": 21516 + }, + { + "epoch": 1.05, + "grad_norm": 0.526913583278656, + "learning_rate": 0.00043493700848911256, + "loss": 3.1358, + "step": 21517 + }, + { + "epoch": 1.05, + "grad_norm": 0.5346729755401611, + "learning_rate": 0.0004349232574452197, + "loss": 3.1478, + "step": 21518 + }, + { + "epoch": 1.05, + "grad_norm": 0.554854154586792, + "learning_rate": 0.00043490950604596504, + "loss": 3.1444, + "step": 21519 + }, + { + "epoch": 1.05, + "grad_norm": 0.5558192133903503, + "learning_rate": 0.00043489575429138454, + "loss": 2.9225, + "step": 21520 + }, + { + "epoch": 1.05, + "grad_norm": 0.5165724158287048, + "learning_rate": 0.00043488200218151467, + "loss": 3.0812, + "step": 21521 + }, + { + "epoch": 1.05, + "grad_norm": 0.5703836679458618, + "learning_rate": 0.0004348682497163916, + "loss": 3.0469, + "step": 21522 + }, + { + "epoch": 1.05, + "grad_norm": 0.5449686050415039, + "learning_rate": 0.0004348544968960515, + "loss": 3.1636, + "step": 21523 + }, + { + "epoch": 1.05, + "grad_norm": 0.6131134033203125, + "learning_rate": 0.0004348407437205306, + "loss": 3.1791, + "step": 21524 + }, + { + "epoch": 1.05, + "grad_norm": 0.563544750213623, + "learning_rate": 0.000434826990189865, + "loss": 3.2026, + "step": 21525 + }, + { + "epoch": 1.05, + "grad_norm": 0.5371457934379578, + "learning_rate": 0.0004348132363040913, + "loss": 3.04, + "step": 21526 + }, + { + "epoch": 1.05, + "grad_norm": 0.5195092558860779, + "learning_rate": 0.00043479948206324537, + "loss": 3.2698, + "step": 21527 + }, + { + "epoch": 1.06, + "grad_norm": 0.5485871434211731, + "learning_rate": 0.0004347857274673635, + "loss": 3.0969, + "step": 21528 + }, + { + "epoch": 1.06, + "grad_norm": 0.5748703479766846, + "learning_rate": 0.000434771972516482, + "loss": 3.1038, + "step": 21529 + }, + { + "epoch": 1.06, + "grad_norm": 0.5171026587486267, + "learning_rate": 0.0004347582172106371, + "loss": 3.1868, + "step": 21530 + }, + { + "epoch": 1.06, + "grad_norm": 0.5326929092407227, + "learning_rate": 0.00043474446154986493, + "loss": 2.8315, + "step": 21531 + }, + { + "epoch": 1.06, + "grad_norm": 0.5850439667701721, + "learning_rate": 0.00043473070553420184, + "loss": 3.2327, + "step": 21532 + }, + { + "epoch": 1.06, + "grad_norm": 0.5027980804443359, + "learning_rate": 0.00043471694916368393, + "loss": 3.0613, + "step": 21533 + }, + { + "epoch": 1.06, + "grad_norm": 0.5234944224357605, + "learning_rate": 0.0004347031924383476, + "loss": 3.2077, + "step": 21534 + }, + { + "epoch": 1.06, + "grad_norm": 0.5578376650810242, + "learning_rate": 0.0004346894353582289, + "loss": 3.0004, + "step": 21535 + }, + { + "epoch": 1.06, + "grad_norm": 0.49966779351234436, + "learning_rate": 0.00043467567792336413, + "loss": 2.9163, + "step": 21536 + }, + { + "epoch": 1.06, + "grad_norm": 0.5220000147819519, + "learning_rate": 0.0004346619201337896, + "loss": 3.1377, + "step": 21537 + }, + { + "epoch": 1.06, + "grad_norm": 0.5169272422790527, + "learning_rate": 0.0004346481619895415, + "loss": 3.0862, + "step": 21538 + }, + { + "epoch": 1.06, + "grad_norm": 0.5571988821029663, + "learning_rate": 0.000434634403490656, + "loss": 2.9343, + "step": 21539 + }, + { + "epoch": 1.06, + "grad_norm": 0.5117565989494324, + "learning_rate": 0.0004346206446371694, + "loss": 3.1412, + "step": 21540 + }, + { + "epoch": 1.06, + "grad_norm": 0.6265973448753357, + "learning_rate": 0.00043460688542911797, + "loss": 3.1845, + "step": 21541 + }, + { + "epoch": 1.06, + "grad_norm": 0.526631772518158, + "learning_rate": 0.00043459312586653794, + "loss": 3.0802, + "step": 21542 + }, + { + "epoch": 1.06, + "grad_norm": 0.6042065620422363, + "learning_rate": 0.00043457936594946543, + "loss": 3.0749, + "step": 21543 + }, + { + "epoch": 1.06, + "grad_norm": 0.5184115171432495, + "learning_rate": 0.0004345656056779368, + "loss": 2.9749, + "step": 21544 + }, + { + "epoch": 1.06, + "grad_norm": 0.560426652431488, + "learning_rate": 0.0004345518450519884, + "loss": 3.2708, + "step": 21545 + }, + { + "epoch": 1.06, + "grad_norm": 0.5563343167304993, + "learning_rate": 0.0004345380840716561, + "loss": 3.0454, + "step": 21546 + }, + { + "epoch": 1.06, + "grad_norm": 0.5319254398345947, + "learning_rate": 0.0004345243227369765, + "loss": 3.0709, + "step": 21547 + }, + { + "epoch": 1.06, + "grad_norm": 0.5544840693473816, + "learning_rate": 0.00043451056104798583, + "loss": 3.2259, + "step": 21548 + }, + { + "epoch": 1.06, + "grad_norm": 0.5176193714141846, + "learning_rate": 0.00043449679900472, + "loss": 3.1498, + "step": 21549 + }, + { + "epoch": 1.06, + "grad_norm": 0.5405734777450562, + "learning_rate": 0.0004344830366072156, + "loss": 2.9556, + "step": 21550 + }, + { + "epoch": 1.06, + "grad_norm": 0.5425923466682434, + "learning_rate": 0.00043446927385550874, + "loss": 3.234, + "step": 21551 + }, + { + "epoch": 1.06, + "grad_norm": 0.5294197201728821, + "learning_rate": 0.00043445551074963577, + "loss": 3.239, + "step": 21552 + }, + { + "epoch": 1.06, + "grad_norm": 0.6730620861053467, + "learning_rate": 0.00043444174728963277, + "loss": 3.1966, + "step": 21553 + }, + { + "epoch": 1.06, + "grad_norm": 0.538125216960907, + "learning_rate": 0.0004344279834755361, + "loss": 3.1703, + "step": 21554 + }, + { + "epoch": 1.06, + "grad_norm": 0.5608587861061096, + "learning_rate": 0.00043441421930738203, + "loss": 2.9973, + "step": 21555 + }, + { + "epoch": 1.06, + "grad_norm": 0.5333779454231262, + "learning_rate": 0.00043440045478520677, + "loss": 3.3017, + "step": 21556 + }, + { + "epoch": 1.06, + "grad_norm": 0.5401734113693237, + "learning_rate": 0.00043438668990904654, + "loss": 3.0712, + "step": 21557 + }, + { + "epoch": 1.06, + "grad_norm": 0.5446183085441589, + "learning_rate": 0.0004343729246789377, + "loss": 3.1535, + "step": 21558 + }, + { + "epoch": 1.06, + "grad_norm": 0.5327209234237671, + "learning_rate": 0.0004343591590949164, + "loss": 2.9274, + "step": 21559 + }, + { + "epoch": 1.06, + "grad_norm": 0.6195545196533203, + "learning_rate": 0.0004343453931570189, + "loss": 3.0559, + "step": 21560 + }, + { + "epoch": 1.06, + "grad_norm": 0.5407031178474426, + "learning_rate": 0.0004343316268652815, + "loss": 3.1942, + "step": 21561 + }, + { + "epoch": 1.06, + "grad_norm": 0.5744057893753052, + "learning_rate": 0.0004343178602197404, + "loss": 3.0987, + "step": 21562 + }, + { + "epoch": 1.06, + "grad_norm": 0.5579788684844971, + "learning_rate": 0.0004343040932204321, + "loss": 3.0874, + "step": 21563 + }, + { + "epoch": 1.06, + "grad_norm": 0.5619020462036133, + "learning_rate": 0.0004342903258673925, + "loss": 3.2928, + "step": 21564 + }, + { + "epoch": 1.06, + "grad_norm": 0.5300469398498535, + "learning_rate": 0.00043427655816065803, + "loss": 3.3128, + "step": 21565 + }, + { + "epoch": 1.06, + "grad_norm": 0.5448369979858398, + "learning_rate": 0.000434262790100265, + "loss": 3.2438, + "step": 21566 + }, + { + "epoch": 1.06, + "grad_norm": 0.5507218241691589, + "learning_rate": 0.00043424902168624963, + "loss": 3.1717, + "step": 21567 + }, + { + "epoch": 1.06, + "grad_norm": 0.5300918221473694, + "learning_rate": 0.0004342352529186481, + "loss": 2.9758, + "step": 21568 + }, + { + "epoch": 1.06, + "grad_norm": 0.537909746170044, + "learning_rate": 0.0004342214837974968, + "loss": 3.0509, + "step": 21569 + }, + { + "epoch": 1.06, + "grad_norm": 0.5577003955841064, + "learning_rate": 0.000434207714322832, + "loss": 3.1697, + "step": 21570 + }, + { + "epoch": 1.06, + "grad_norm": 0.5539290308952332, + "learning_rate": 0.00043419394449468974, + "loss": 2.8774, + "step": 21571 + }, + { + "epoch": 1.06, + "grad_norm": 0.5374821424484253, + "learning_rate": 0.0004341801743131066, + "loss": 2.8361, + "step": 21572 + }, + { + "epoch": 1.06, + "grad_norm": 0.5573104023933411, + "learning_rate": 0.0004341664037781186, + "loss": 3.016, + "step": 21573 + }, + { + "epoch": 1.06, + "grad_norm": 0.5746645331382751, + "learning_rate": 0.00043415263288976223, + "loss": 3.2311, + "step": 21574 + }, + { + "epoch": 1.06, + "grad_norm": 0.524361789226532, + "learning_rate": 0.00043413886164807357, + "loss": 3.2525, + "step": 21575 + }, + { + "epoch": 1.06, + "grad_norm": 0.5610430836677551, + "learning_rate": 0.00043412509005308895, + "loss": 3.2134, + "step": 21576 + }, + { + "epoch": 1.06, + "grad_norm": 0.5629414319992065, + "learning_rate": 0.0004341113181048447, + "loss": 3.0197, + "step": 21577 + }, + { + "epoch": 1.06, + "grad_norm": 0.5457921624183655, + "learning_rate": 0.00043409754580337704, + "loss": 3.1535, + "step": 21578 + }, + { + "epoch": 1.06, + "grad_norm": 0.5242516398429871, + "learning_rate": 0.0004340837731487223, + "loss": 3.0792, + "step": 21579 + }, + { + "epoch": 1.06, + "grad_norm": 0.5492205023765564, + "learning_rate": 0.0004340700001409167, + "loss": 2.9799, + "step": 21580 + }, + { + "epoch": 1.06, + "grad_norm": 0.5539306402206421, + "learning_rate": 0.0004340562267799964, + "loss": 2.9942, + "step": 21581 + }, + { + "epoch": 1.06, + "grad_norm": 0.5252792835235596, + "learning_rate": 0.00043404245306599785, + "loss": 2.9845, + "step": 21582 + }, + { + "epoch": 1.06, + "grad_norm": 0.5377252697944641, + "learning_rate": 0.0004340286789989573, + "loss": 3.0228, + "step": 21583 + }, + { + "epoch": 1.06, + "grad_norm": 0.5324434638023376, + "learning_rate": 0.000434014904578911, + "loss": 3.0127, + "step": 21584 + }, + { + "epoch": 1.06, + "grad_norm": 0.5117992162704468, + "learning_rate": 0.0004340011298058953, + "loss": 3.0829, + "step": 21585 + }, + { + "epoch": 1.06, + "grad_norm": 0.5125967860221863, + "learning_rate": 0.0004339873546799464, + "loss": 3.1856, + "step": 21586 + }, + { + "epoch": 1.06, + "grad_norm": 0.502491295337677, + "learning_rate": 0.0004339735792011004, + "loss": 3.1148, + "step": 21587 + }, + { + "epoch": 1.06, + "grad_norm": 0.5541898608207703, + "learning_rate": 0.000433959803369394, + "loss": 3.0582, + "step": 21588 + }, + { + "epoch": 1.06, + "grad_norm": 0.5374077558517456, + "learning_rate": 0.0004339460271848632, + "loss": 3.0841, + "step": 21589 + }, + { + "epoch": 1.06, + "grad_norm": 0.5276811122894287, + "learning_rate": 0.00043393225064754427, + "loss": 3.1326, + "step": 21590 + }, + { + "epoch": 1.06, + "grad_norm": 0.5653324723243713, + "learning_rate": 0.0004339184737574736, + "loss": 3.1166, + "step": 21591 + }, + { + "epoch": 1.06, + "grad_norm": 0.570012092590332, + "learning_rate": 0.0004339046965146875, + "loss": 2.9614, + "step": 21592 + }, + { + "epoch": 1.06, + "grad_norm": 1.030922770500183, + "learning_rate": 0.00043389091891922216, + "loss": 3.1976, + "step": 21593 + }, + { + "epoch": 1.06, + "grad_norm": 0.5297044515609741, + "learning_rate": 0.0004338771409711139, + "loss": 3.275, + "step": 21594 + }, + { + "epoch": 1.06, + "grad_norm": 0.5701044201850891, + "learning_rate": 0.00043386336267039907, + "loss": 2.9492, + "step": 21595 + }, + { + "epoch": 1.06, + "grad_norm": 0.5452165007591248, + "learning_rate": 0.0004338495840171139, + "loss": 3.2421, + "step": 21596 + }, + { + "epoch": 1.06, + "grad_norm": 0.5350081324577332, + "learning_rate": 0.00043383580501129453, + "loss": 2.9024, + "step": 21597 + }, + { + "epoch": 1.06, + "grad_norm": 0.5603721737861633, + "learning_rate": 0.00043382202565297757, + "loss": 2.9177, + "step": 21598 + }, + { + "epoch": 1.06, + "grad_norm": 0.559532642364502, + "learning_rate": 0.00043380824594219916, + "loss": 3.1842, + "step": 21599 + }, + { + "epoch": 1.06, + "grad_norm": 0.5277261137962341, + "learning_rate": 0.00043379446587899547, + "loss": 3.056, + "step": 21600 + }, + { + "epoch": 1.06, + "grad_norm": 0.5619999766349792, + "learning_rate": 0.000433780685463403, + "loss": 3.2529, + "step": 21601 + }, + { + "epoch": 1.06, + "grad_norm": 0.5646684169769287, + "learning_rate": 0.0004337669046954578, + "loss": 3.2436, + "step": 21602 + }, + { + "epoch": 1.06, + "grad_norm": 0.5427494645118713, + "learning_rate": 0.0004337531235751965, + "loss": 3.1102, + "step": 21603 + }, + { + "epoch": 1.06, + "grad_norm": 0.5302757620811462, + "learning_rate": 0.00043373934210265517, + "loss": 3.152, + "step": 21604 + }, + { + "epoch": 1.06, + "grad_norm": 0.5829156637191772, + "learning_rate": 0.00043372556027787014, + "loss": 2.9137, + "step": 21605 + }, + { + "epoch": 1.06, + "grad_norm": 0.5077518820762634, + "learning_rate": 0.0004337117781008777, + "loss": 2.888, + "step": 21606 + }, + { + "epoch": 1.06, + "grad_norm": 0.5359871983528137, + "learning_rate": 0.00043369799557171413, + "loss": 3.1001, + "step": 21607 + }, + { + "epoch": 1.06, + "grad_norm": 0.5384306311607361, + "learning_rate": 0.0004336842126904159, + "loss": 3.2559, + "step": 21608 + }, + { + "epoch": 1.06, + "grad_norm": 0.5284044742584229, + "learning_rate": 0.00043367042945701903, + "loss": 3.4134, + "step": 21609 + }, + { + "epoch": 1.06, + "grad_norm": 0.5183429718017578, + "learning_rate": 0.0004336566458715601, + "loss": 2.9548, + "step": 21610 + }, + { + "epoch": 1.06, + "grad_norm": 0.5918934345245361, + "learning_rate": 0.0004336428619340752, + "loss": 3.1684, + "step": 21611 + }, + { + "epoch": 1.06, + "grad_norm": 0.555814266204834, + "learning_rate": 0.0004336290776446007, + "loss": 2.91, + "step": 21612 + }, + { + "epoch": 1.06, + "grad_norm": 0.551300048828125, + "learning_rate": 0.000433615293003173, + "loss": 3.1364, + "step": 21613 + }, + { + "epoch": 1.06, + "grad_norm": 0.5664514899253845, + "learning_rate": 0.00043360150800982836, + "loss": 3.1111, + "step": 21614 + }, + { + "epoch": 1.06, + "grad_norm": 0.5319811105728149, + "learning_rate": 0.000433587722664603, + "loss": 2.9241, + "step": 21615 + }, + { + "epoch": 1.06, + "grad_norm": 0.560508131980896, + "learning_rate": 0.00043357393696753327, + "loss": 3.0344, + "step": 21616 + }, + { + "epoch": 1.06, + "grad_norm": 0.5378097891807556, + "learning_rate": 0.00043356015091865555, + "loss": 3.2021, + "step": 21617 + }, + { + "epoch": 1.06, + "grad_norm": 0.5653480887413025, + "learning_rate": 0.00043354636451800604, + "loss": 3.1775, + "step": 21618 + }, + { + "epoch": 1.06, + "grad_norm": 0.5392255187034607, + "learning_rate": 0.00043353257776562114, + "loss": 3.4296, + "step": 21619 + }, + { + "epoch": 1.06, + "grad_norm": 0.5405376553535461, + "learning_rate": 0.00043351879066153714, + "loss": 3.2874, + "step": 21620 + }, + { + "epoch": 1.06, + "grad_norm": 0.5263589024543762, + "learning_rate": 0.0004335050032057904, + "loss": 3.0824, + "step": 21621 + }, + { + "epoch": 1.06, + "grad_norm": 0.5315642356872559, + "learning_rate": 0.0004334912153984171, + "loss": 3.1399, + "step": 21622 + }, + { + "epoch": 1.06, + "grad_norm": 0.5322493314743042, + "learning_rate": 0.0004334774272394536, + "loss": 3.1131, + "step": 21623 + }, + { + "epoch": 1.06, + "grad_norm": 0.5302774310112, + "learning_rate": 0.0004334636387289362, + "loss": 3.124, + "step": 21624 + }, + { + "epoch": 1.06, + "grad_norm": 0.5446813702583313, + "learning_rate": 0.00043344984986690144, + "loss": 3.0245, + "step": 21625 + }, + { + "epoch": 1.06, + "grad_norm": 0.5131939649581909, + "learning_rate": 0.00043343606065338535, + "loss": 2.8756, + "step": 21626 + }, + { + "epoch": 1.06, + "grad_norm": 0.5474516153335571, + "learning_rate": 0.0004334222710884243, + "loss": 3.3199, + "step": 21627 + }, + { + "epoch": 1.06, + "grad_norm": 0.5619019269943237, + "learning_rate": 0.00043340848117205476, + "loss": 3.1003, + "step": 21628 + }, + { + "epoch": 1.06, + "grad_norm": 0.5801265239715576, + "learning_rate": 0.0004333946909043129, + "loss": 2.9848, + "step": 21629 + }, + { + "epoch": 1.06, + "grad_norm": 0.5769397616386414, + "learning_rate": 0.00043338090028523517, + "loss": 3.037, + "step": 21630 + }, + { + "epoch": 1.06, + "grad_norm": 0.5513659119606018, + "learning_rate": 0.0004333671093148577, + "loss": 2.9997, + "step": 21631 + }, + { + "epoch": 1.06, + "grad_norm": 0.5560690760612488, + "learning_rate": 0.0004333533179932171, + "loss": 2.9288, + "step": 21632 + }, + { + "epoch": 1.06, + "grad_norm": 0.5233303308486938, + "learning_rate": 0.00043333952632034937, + "loss": 3.0383, + "step": 21633 + }, + { + "epoch": 1.06, + "grad_norm": 0.5534685850143433, + "learning_rate": 0.00043332573429629106, + "loss": 3.0307, + "step": 21634 + }, + { + "epoch": 1.06, + "grad_norm": 0.5539918541908264, + "learning_rate": 0.00043331194192107843, + "loss": 3.0601, + "step": 21635 + }, + { + "epoch": 1.06, + "grad_norm": 0.5524195432662964, + "learning_rate": 0.0004332981491947478, + "loss": 3.0906, + "step": 21636 + }, + { + "epoch": 1.06, + "grad_norm": 0.5301859974861145, + "learning_rate": 0.00043328435611733547, + "loss": 3.0146, + "step": 21637 + }, + { + "epoch": 1.06, + "grad_norm": 0.5274763107299805, + "learning_rate": 0.0004332705626888778, + "loss": 3.1862, + "step": 21638 + }, + { + "epoch": 1.06, + "grad_norm": 0.6033293604850769, + "learning_rate": 0.0004332567689094111, + "loss": 2.8906, + "step": 21639 + }, + { + "epoch": 1.06, + "grad_norm": 0.5655192732810974, + "learning_rate": 0.00043324297477897176, + "loss": 3.1, + "step": 21640 + }, + { + "epoch": 1.06, + "grad_norm": 0.578748345375061, + "learning_rate": 0.00043322918029759606, + "loss": 3.1724, + "step": 21641 + }, + { + "epoch": 1.06, + "grad_norm": 0.5158365368843079, + "learning_rate": 0.0004332153854653203, + "loss": 3.1176, + "step": 21642 + }, + { + "epoch": 1.06, + "grad_norm": 0.5151905417442322, + "learning_rate": 0.0004332015902821809, + "loss": 3.1752, + "step": 21643 + }, + { + "epoch": 1.06, + "grad_norm": 0.5332306027412415, + "learning_rate": 0.0004331877947482141, + "loss": 3.0743, + "step": 21644 + }, + { + "epoch": 1.06, + "grad_norm": 0.576889157295227, + "learning_rate": 0.0004331739988634563, + "loss": 2.9053, + "step": 21645 + }, + { + "epoch": 1.06, + "grad_norm": 0.5512627363204956, + "learning_rate": 0.00043316020262794386, + "loss": 3.171, + "step": 21646 + }, + { + "epoch": 1.06, + "grad_norm": 0.5933674573898315, + "learning_rate": 0.00043314640604171303, + "loss": 3.1488, + "step": 21647 + }, + { + "epoch": 1.06, + "grad_norm": 0.5496378540992737, + "learning_rate": 0.00043313260910480015, + "loss": 3.0251, + "step": 21648 + }, + { + "epoch": 1.06, + "grad_norm": 0.5148006677627563, + "learning_rate": 0.0004331188118172417, + "loss": 3.2106, + "step": 21649 + }, + { + "epoch": 1.06, + "grad_norm": 0.5798931121826172, + "learning_rate": 0.0004331050141790739, + "loss": 2.9724, + "step": 21650 + }, + { + "epoch": 1.06, + "grad_norm": 0.567121684551239, + "learning_rate": 0.0004330912161903331, + "loss": 3.1009, + "step": 21651 + }, + { + "epoch": 1.06, + "grad_norm": 0.5588269233703613, + "learning_rate": 0.00043307741785105554, + "loss": 2.9866, + "step": 21652 + }, + { + "epoch": 1.06, + "grad_norm": 0.5300847291946411, + "learning_rate": 0.0004330636191612778, + "loss": 2.9866, + "step": 21653 + }, + { + "epoch": 1.06, + "grad_norm": 0.5314716100692749, + "learning_rate": 0.00043304982012103604, + "loss": 3.0428, + "step": 21654 + }, + { + "epoch": 1.06, + "grad_norm": 0.5971154570579529, + "learning_rate": 0.0004330360207303667, + "loss": 3.1492, + "step": 21655 + }, + { + "epoch": 1.06, + "grad_norm": 0.7022563219070435, + "learning_rate": 0.00043302222098930604, + "loss": 3.0283, + "step": 21656 + }, + { + "epoch": 1.06, + "grad_norm": 0.5879517793655396, + "learning_rate": 0.0004330084208978905, + "loss": 3.0705, + "step": 21657 + }, + { + "epoch": 1.06, + "grad_norm": 0.5577943921089172, + "learning_rate": 0.0004329946204561563, + "loss": 3.0612, + "step": 21658 + }, + { + "epoch": 1.06, + "grad_norm": 0.5404565930366516, + "learning_rate": 0.0004329808196641399, + "loss": 3.1377, + "step": 21659 + }, + { + "epoch": 1.06, + "grad_norm": 0.5410740971565247, + "learning_rate": 0.00043296701852187764, + "loss": 3.3028, + "step": 21660 + }, + { + "epoch": 1.06, + "grad_norm": 0.5340285897254944, + "learning_rate": 0.0004329532170294059, + "loss": 3.3272, + "step": 21661 + }, + { + "epoch": 1.06, + "grad_norm": 0.5251579284667969, + "learning_rate": 0.0004329394151867609, + "loss": 3.6786, + "step": 21662 + }, + { + "epoch": 1.06, + "grad_norm": 0.5060845613479614, + "learning_rate": 0.00043292561299397906, + "loss": 3.2364, + "step": 21663 + }, + { + "epoch": 1.06, + "grad_norm": 0.568199872970581, + "learning_rate": 0.0004329118104510967, + "loss": 2.9753, + "step": 21664 + }, + { + "epoch": 1.06, + "grad_norm": 0.5439720749855042, + "learning_rate": 0.0004328980075581503, + "loss": 3.0583, + "step": 21665 + }, + { + "epoch": 1.06, + "grad_norm": 0.5244399309158325, + "learning_rate": 0.00043288420431517613, + "loss": 3.0426, + "step": 21666 + }, + { + "epoch": 1.06, + "grad_norm": 0.5180443525314331, + "learning_rate": 0.0004328704007222105, + "loss": 3.1875, + "step": 21667 + }, + { + "epoch": 1.06, + "grad_norm": 0.48462343215942383, + "learning_rate": 0.00043285659677928974, + "loss": 3.0099, + "step": 21668 + }, + { + "epoch": 1.06, + "grad_norm": 0.5198360681533813, + "learning_rate": 0.00043284279248645034, + "loss": 2.9479, + "step": 21669 + }, + { + "epoch": 1.06, + "grad_norm": 0.5512001514434814, + "learning_rate": 0.0004328289878437286, + "loss": 3.1317, + "step": 21670 + }, + { + "epoch": 1.06, + "grad_norm": 0.5167283415794373, + "learning_rate": 0.00043281518285116083, + "loss": 3.2503, + "step": 21671 + }, + { + "epoch": 1.06, + "grad_norm": 0.5513035655021667, + "learning_rate": 0.0004328013775087835, + "loss": 3.2889, + "step": 21672 + }, + { + "epoch": 1.06, + "grad_norm": 0.5547505617141724, + "learning_rate": 0.0004327875718166328, + "loss": 3.1017, + "step": 21673 + }, + { + "epoch": 1.06, + "grad_norm": 0.5487305521965027, + "learning_rate": 0.0004327737657747452, + "loss": 2.9976, + "step": 21674 + }, + { + "epoch": 1.06, + "grad_norm": 0.5412231087684631, + "learning_rate": 0.00043275995938315713, + "loss": 3.1704, + "step": 21675 + }, + { + "epoch": 1.06, + "grad_norm": 0.5449442863464355, + "learning_rate": 0.0004327461526419049, + "loss": 3.0803, + "step": 21676 + }, + { + "epoch": 1.06, + "grad_norm": 0.566912829875946, + "learning_rate": 0.0004327323455510248, + "loss": 3.2645, + "step": 21677 + }, + { + "epoch": 1.06, + "grad_norm": 0.551240861415863, + "learning_rate": 0.00043271853811055315, + "loss": 2.908, + "step": 21678 + }, + { + "epoch": 1.06, + "grad_norm": 0.5297688245773315, + "learning_rate": 0.00043270473032052656, + "loss": 3.0444, + "step": 21679 + }, + { + "epoch": 1.06, + "grad_norm": 0.5144250392913818, + "learning_rate": 0.00043269092218098116, + "loss": 2.945, + "step": 21680 + }, + { + "epoch": 1.06, + "grad_norm": 0.5448917746543884, + "learning_rate": 0.00043267711369195344, + "loss": 3.1495, + "step": 21681 + }, + { + "epoch": 1.06, + "grad_norm": 0.5812903642654419, + "learning_rate": 0.00043266330485347977, + "loss": 3.1761, + "step": 21682 + }, + { + "epoch": 1.06, + "grad_norm": 0.5192112922668457, + "learning_rate": 0.0004326494956655965, + "loss": 3.2557, + "step": 21683 + }, + { + "epoch": 1.06, + "grad_norm": 0.5966460704803467, + "learning_rate": 0.00043263568612833986, + "loss": 3.0654, + "step": 21684 + }, + { + "epoch": 1.06, + "grad_norm": 0.5527370572090149, + "learning_rate": 0.00043262187624174643, + "loss": 3.0416, + "step": 21685 + }, + { + "epoch": 1.06, + "grad_norm": 0.5254831910133362, + "learning_rate": 0.0004326080660058525, + "loss": 3.1767, + "step": 21686 + }, + { + "epoch": 1.06, + "grad_norm": 0.5413762927055359, + "learning_rate": 0.0004325942554206945, + "loss": 3.2666, + "step": 21687 + }, + { + "epoch": 1.06, + "grad_norm": 0.5316290855407715, + "learning_rate": 0.0004325804444863087, + "loss": 2.9736, + "step": 21688 + }, + { + "epoch": 1.06, + "grad_norm": 0.5323042273521423, + "learning_rate": 0.00043256663320273146, + "loss": 2.9594, + "step": 21689 + }, + { + "epoch": 1.06, + "grad_norm": 0.5886972546577454, + "learning_rate": 0.0004325528215699993, + "loss": 3.1795, + "step": 21690 + }, + { + "epoch": 1.06, + "grad_norm": 0.5144338607788086, + "learning_rate": 0.00043253900958814854, + "loss": 2.8915, + "step": 21691 + }, + { + "epoch": 1.06, + "grad_norm": 0.5302631258964539, + "learning_rate": 0.0004325251972572155, + "loss": 3.2102, + "step": 21692 + }, + { + "epoch": 1.06, + "grad_norm": 0.534359335899353, + "learning_rate": 0.00043251138457723655, + "loss": 3.0445, + "step": 21693 + }, + { + "epoch": 1.06, + "grad_norm": 0.5207918882369995, + "learning_rate": 0.0004324975715482482, + "loss": 3.1168, + "step": 21694 + }, + { + "epoch": 1.06, + "grad_norm": 0.5734944343566895, + "learning_rate": 0.00043248375817028664, + "loss": 2.9265, + "step": 21695 + }, + { + "epoch": 1.06, + "grad_norm": 0.5305697321891785, + "learning_rate": 0.00043246994444338845, + "loss": 3.1291, + "step": 21696 + }, + { + "epoch": 1.06, + "grad_norm": 0.5317648649215698, + "learning_rate": 0.00043245613036758994, + "loss": 2.9974, + "step": 21697 + }, + { + "epoch": 1.06, + "grad_norm": 0.5591028332710266, + "learning_rate": 0.0004324423159429275, + "loss": 3.0407, + "step": 21698 + }, + { + "epoch": 1.06, + "grad_norm": 0.5748549103736877, + "learning_rate": 0.0004324285011694374, + "loss": 3.2274, + "step": 21699 + }, + { + "epoch": 1.06, + "grad_norm": 0.5331709980964661, + "learning_rate": 0.000432414686047156, + "loss": 3.1211, + "step": 21700 + }, + { + "epoch": 1.06, + "grad_norm": 0.5106818079948425, + "learning_rate": 0.00043240087057612003, + "loss": 2.9146, + "step": 21701 + }, + { + "epoch": 1.06, + "grad_norm": 0.5366308689117432, + "learning_rate": 0.0004323870547563655, + "loss": 3.3333, + "step": 21702 + }, + { + "epoch": 1.06, + "grad_norm": 0.5502612590789795, + "learning_rate": 0.000432373238587929, + "loss": 3.2715, + "step": 21703 + }, + { + "epoch": 1.06, + "grad_norm": 0.5137237906455994, + "learning_rate": 0.00043235942207084686, + "loss": 3.1082, + "step": 21704 + }, + { + "epoch": 1.06, + "grad_norm": 0.506195068359375, + "learning_rate": 0.00043234560520515546, + "loss": 2.997, + "step": 21705 + }, + { + "epoch": 1.06, + "grad_norm": 0.5691025257110596, + "learning_rate": 0.0004323317879908913, + "loss": 3.1141, + "step": 21706 + }, + { + "epoch": 1.06, + "grad_norm": 0.5307044386863708, + "learning_rate": 0.0004323179704280905, + "loss": 2.9933, + "step": 21707 + }, + { + "epoch": 1.06, + "grad_norm": 0.5617866516113281, + "learning_rate": 0.0004323041525167898, + "loss": 2.9266, + "step": 21708 + }, + { + "epoch": 1.06, + "grad_norm": 0.5357836484909058, + "learning_rate": 0.0004322903342570254, + "loss": 2.938, + "step": 21709 + }, + { + "epoch": 1.06, + "grad_norm": 0.5504767894744873, + "learning_rate": 0.0004322765156488336, + "loss": 3.1159, + "step": 21710 + }, + { + "epoch": 1.06, + "grad_norm": 0.537348747253418, + "learning_rate": 0.00043226269669225097, + "loss": 3.1031, + "step": 21711 + }, + { + "epoch": 1.06, + "grad_norm": 0.5378528237342834, + "learning_rate": 0.0004322488773873139, + "loss": 3.1137, + "step": 21712 + }, + { + "epoch": 1.06, + "grad_norm": 0.5131245255470276, + "learning_rate": 0.00043223505773405874, + "loss": 3.0508, + "step": 21713 + }, + { + "epoch": 1.06, + "grad_norm": 0.533130943775177, + "learning_rate": 0.00043222123773252187, + "loss": 3.1552, + "step": 21714 + }, + { + "epoch": 1.06, + "grad_norm": 0.5789387822151184, + "learning_rate": 0.0004322074173827396, + "loss": 3.1302, + "step": 21715 + }, + { + "epoch": 1.06, + "grad_norm": 0.5390761494636536, + "learning_rate": 0.0004321935966847486, + "loss": 2.9585, + "step": 21716 + }, + { + "epoch": 1.06, + "grad_norm": 0.5794828534126282, + "learning_rate": 0.00043217977563858503, + "loss": 3.1416, + "step": 21717 + }, + { + "epoch": 1.06, + "grad_norm": 0.5323156118392944, + "learning_rate": 0.00043216595424428535, + "loss": 3.0928, + "step": 21718 + }, + { + "epoch": 1.06, + "grad_norm": 0.5307153463363647, + "learning_rate": 0.00043215213250188603, + "loss": 3.1511, + "step": 21719 + }, + { + "epoch": 1.06, + "grad_norm": 0.5392312407493591, + "learning_rate": 0.00043213831041142344, + "loss": 3.0964, + "step": 21720 + }, + { + "epoch": 1.06, + "grad_norm": 0.5451873540878296, + "learning_rate": 0.0004321244879729339, + "loss": 3.0374, + "step": 21721 + }, + { + "epoch": 1.06, + "grad_norm": 0.5187518000602722, + "learning_rate": 0.0004321106651864539, + "loss": 3.1617, + "step": 21722 + }, + { + "epoch": 1.06, + "grad_norm": 0.5146177411079407, + "learning_rate": 0.0004320968420520199, + "loss": 2.9917, + "step": 21723 + }, + { + "epoch": 1.06, + "grad_norm": 0.5389916300773621, + "learning_rate": 0.00043208301856966825, + "loss": 3.11, + "step": 21724 + }, + { + "epoch": 1.06, + "grad_norm": 0.6462126970291138, + "learning_rate": 0.0004320691947394352, + "loss": 3.0173, + "step": 21725 + }, + { + "epoch": 1.06, + "grad_norm": 0.5558220148086548, + "learning_rate": 0.00043205537056135735, + "loss": 3.0579, + "step": 21726 + }, + { + "epoch": 1.06, + "grad_norm": 0.5146065354347229, + "learning_rate": 0.0004320415460354712, + "loss": 3.0825, + "step": 21727 + }, + { + "epoch": 1.06, + "grad_norm": 0.5261913537979126, + "learning_rate": 0.00043202772116181297, + "loss": 3.142, + "step": 21728 + }, + { + "epoch": 1.06, + "grad_norm": 0.5271784663200378, + "learning_rate": 0.0004320138959404191, + "loss": 3.0003, + "step": 21729 + }, + { + "epoch": 1.06, + "grad_norm": 0.5543608665466309, + "learning_rate": 0.00043200007037132605, + "loss": 3.0188, + "step": 21730 + }, + { + "epoch": 1.06, + "grad_norm": 0.5403302311897278, + "learning_rate": 0.00043198624445457023, + "loss": 3.0804, + "step": 21731 + }, + { + "epoch": 1.07, + "grad_norm": 0.5381479263305664, + "learning_rate": 0.000431972418190188, + "loss": 3.0915, + "step": 21732 + }, + { + "epoch": 1.07, + "grad_norm": 0.5359466075897217, + "learning_rate": 0.0004319585915782159, + "loss": 3.1204, + "step": 21733 + }, + { + "epoch": 1.07, + "grad_norm": 0.5506048202514648, + "learning_rate": 0.0004319447646186902, + "loss": 3.1377, + "step": 21734 + }, + { + "epoch": 1.07, + "grad_norm": 0.5816706418991089, + "learning_rate": 0.00043193093731164737, + "loss": 3.3217, + "step": 21735 + }, + { + "epoch": 1.07, + "grad_norm": 0.543936014175415, + "learning_rate": 0.00043191710965712385, + "loss": 3.3028, + "step": 21736 + }, + { + "epoch": 1.07, + "grad_norm": 0.5475443601608276, + "learning_rate": 0.0004319032816551561, + "loss": 3.1762, + "step": 21737 + }, + { + "epoch": 1.07, + "grad_norm": 0.5085486173629761, + "learning_rate": 0.0004318894533057805, + "loss": 3.2277, + "step": 21738 + }, + { + "epoch": 1.07, + "grad_norm": 0.5390211343765259, + "learning_rate": 0.0004318756246090334, + "loss": 3.0807, + "step": 21739 + }, + { + "epoch": 1.07, + "grad_norm": 0.5398746132850647, + "learning_rate": 0.00043186179556495134, + "loss": 2.9588, + "step": 21740 + }, + { + "epoch": 1.07, + "grad_norm": 0.5575569272041321, + "learning_rate": 0.00043184796617357063, + "loss": 2.9674, + "step": 21741 + }, + { + "epoch": 1.07, + "grad_norm": 0.5330104231834412, + "learning_rate": 0.00043183413643492784, + "loss": 3.0524, + "step": 21742 + }, + { + "epoch": 1.07, + "grad_norm": 0.5363730788230896, + "learning_rate": 0.0004318203063490592, + "loss": 3.0613, + "step": 21743 + }, + { + "epoch": 1.07, + "grad_norm": 0.5895825028419495, + "learning_rate": 0.0004318064759160013, + "loss": 2.9534, + "step": 21744 + }, + { + "epoch": 1.07, + "grad_norm": 0.560642659664154, + "learning_rate": 0.00043179264513579056, + "loss": 2.9325, + "step": 21745 + }, + { + "epoch": 1.07, + "grad_norm": 0.588516354560852, + "learning_rate": 0.0004317788140084632, + "loss": 2.9729, + "step": 21746 + }, + { + "epoch": 1.07, + "grad_norm": 0.5563256144523621, + "learning_rate": 0.0004317649825340559, + "loss": 3.0784, + "step": 21747 + }, + { + "epoch": 1.07, + "grad_norm": 0.51703280210495, + "learning_rate": 0.00043175115071260496, + "loss": 3.132, + "step": 21748 + }, + { + "epoch": 1.07, + "grad_norm": 0.6014596819877625, + "learning_rate": 0.00043173731854414697, + "loss": 3.1507, + "step": 21749 + }, + { + "epoch": 1.07, + "grad_norm": 0.5737568736076355, + "learning_rate": 0.00043172348602871815, + "loss": 3.1788, + "step": 21750 + }, + { + "epoch": 1.07, + "grad_norm": 0.5655453205108643, + "learning_rate": 0.0004317096531663549, + "loss": 3.0277, + "step": 21751 + }, + { + "epoch": 1.07, + "grad_norm": 0.5275116562843323, + "learning_rate": 0.000431695819957094, + "loss": 3.0195, + "step": 21752 + }, + { + "epoch": 1.07, + "grad_norm": 0.5256880521774292, + "learning_rate": 0.00043168198640097154, + "loss": 3.193, + "step": 21753 + }, + { + "epoch": 1.07, + "grad_norm": 0.547167181968689, + "learning_rate": 0.00043166815249802404, + "loss": 2.8198, + "step": 21754 + }, + { + "epoch": 1.07, + "grad_norm": 0.522355854511261, + "learning_rate": 0.000431654318248288, + "loss": 3.2742, + "step": 21755 + }, + { + "epoch": 1.07, + "grad_norm": 0.5362539887428284, + "learning_rate": 0.00043164048365179977, + "loss": 3.1321, + "step": 21756 + }, + { + "epoch": 1.07, + "grad_norm": 0.5473201274871826, + "learning_rate": 0.00043162664870859596, + "loss": 3.2113, + "step": 21757 + }, + { + "epoch": 1.07, + "grad_norm": 0.5578786134719849, + "learning_rate": 0.0004316128134187128, + "loss": 3.3162, + "step": 21758 + }, + { + "epoch": 1.07, + "grad_norm": 0.5824030041694641, + "learning_rate": 0.0004315989777821868, + "loss": 3.1726, + "step": 21759 + }, + { + "epoch": 1.07, + "grad_norm": 0.5133621096611023, + "learning_rate": 0.00043158514179905455, + "loss": 3.0582, + "step": 21760 + }, + { + "epoch": 1.07, + "grad_norm": 0.5308951139450073, + "learning_rate": 0.00043157130546935224, + "loss": 3.3032, + "step": 21761 + }, + { + "epoch": 1.07, + "grad_norm": 0.5566686391830444, + "learning_rate": 0.0004315574687931164, + "loss": 3.3004, + "step": 21762 + }, + { + "epoch": 1.07, + "grad_norm": 0.6115220189094543, + "learning_rate": 0.0004315436317703836, + "loss": 2.962, + "step": 21763 + }, + { + "epoch": 1.07, + "grad_norm": 0.52565997838974, + "learning_rate": 0.0004315297944011902, + "loss": 3.1985, + "step": 21764 + }, + { + "epoch": 1.07, + "grad_norm": 0.5332111716270447, + "learning_rate": 0.0004315159566855726, + "loss": 3.0903, + "step": 21765 + }, + { + "epoch": 1.07, + "grad_norm": 0.5685824155807495, + "learning_rate": 0.00043150211862356724, + "loss": 2.9799, + "step": 21766 + }, + { + "epoch": 1.07, + "grad_norm": 0.5400283932685852, + "learning_rate": 0.0004314882802152107, + "loss": 3.1849, + "step": 21767 + }, + { + "epoch": 1.07, + "grad_norm": 0.5025684833526611, + "learning_rate": 0.00043147444146053924, + "loss": 3.1382, + "step": 21768 + }, + { + "epoch": 1.07, + "grad_norm": 0.5145972371101379, + "learning_rate": 0.0004314606023595894, + "loss": 3.0659, + "step": 21769 + }, + { + "epoch": 1.07, + "grad_norm": 0.5135764479637146, + "learning_rate": 0.0004314467629123977, + "loss": 3.0465, + "step": 21770 + }, + { + "epoch": 1.07, + "grad_norm": 0.5721059441566467, + "learning_rate": 0.00043143292311900055, + "loss": 3.1543, + "step": 21771 + }, + { + "epoch": 1.07, + "grad_norm": 0.5446774959564209, + "learning_rate": 0.00043141908297943425, + "loss": 3.3055, + "step": 21772 + }, + { + "epoch": 1.07, + "grad_norm": 0.5134919285774231, + "learning_rate": 0.0004314052424937355, + "loss": 2.8909, + "step": 21773 + }, + { + "epoch": 1.07, + "grad_norm": 0.518467903137207, + "learning_rate": 0.00043139140166194057, + "loss": 3.2775, + "step": 21774 + }, + { + "epoch": 1.07, + "grad_norm": 0.5316968560218811, + "learning_rate": 0.000431377560484086, + "loss": 3.2102, + "step": 21775 + }, + { + "epoch": 1.07, + "grad_norm": 0.5495380163192749, + "learning_rate": 0.0004313637189602082, + "loss": 3.1133, + "step": 21776 + }, + { + "epoch": 1.07, + "grad_norm": 0.5162259340286255, + "learning_rate": 0.00043134987709034364, + "loss": 3.1323, + "step": 21777 + }, + { + "epoch": 1.07, + "grad_norm": 0.5471155047416687, + "learning_rate": 0.0004313360348745289, + "loss": 3.0562, + "step": 21778 + }, + { + "epoch": 1.07, + "grad_norm": 0.5574952960014343, + "learning_rate": 0.0004313221923128001, + "loss": 3.027, + "step": 21779 + }, + { + "epoch": 1.07, + "grad_norm": 0.5334025025367737, + "learning_rate": 0.00043130834940519405, + "loss": 3.2028, + "step": 21780 + }, + { + "epoch": 1.07, + "grad_norm": 0.5593041181564331, + "learning_rate": 0.0004312945061517471, + "loss": 3.1218, + "step": 21781 + }, + { + "epoch": 1.07, + "grad_norm": 0.5666584968566895, + "learning_rate": 0.00043128066255249565, + "loss": 3.173, + "step": 21782 + }, + { + "epoch": 1.07, + "grad_norm": 0.5271990299224854, + "learning_rate": 0.0004312668186074762, + "loss": 3.0105, + "step": 21783 + }, + { + "epoch": 1.07, + "grad_norm": 0.5458244681358337, + "learning_rate": 0.0004312529743167252, + "loss": 2.9528, + "step": 21784 + }, + { + "epoch": 1.07, + "grad_norm": 0.5140916705131531, + "learning_rate": 0.0004312391296802792, + "loss": 3.1281, + "step": 21785 + }, + { + "epoch": 1.07, + "grad_norm": 0.5619597434997559, + "learning_rate": 0.0004312252846981745, + "loss": 2.9598, + "step": 21786 + }, + { + "epoch": 1.07, + "grad_norm": 0.5158681273460388, + "learning_rate": 0.0004312114393704476, + "loss": 3.0605, + "step": 21787 + }, + { + "epoch": 1.07, + "grad_norm": 0.6134312152862549, + "learning_rate": 0.00043119759369713515, + "loss": 3.008, + "step": 21788 + }, + { + "epoch": 1.07, + "grad_norm": 0.5468149781227112, + "learning_rate": 0.0004311837476782735, + "loss": 2.905, + "step": 21789 + }, + { + "epoch": 1.07, + "grad_norm": 0.5272074341773987, + "learning_rate": 0.000431169901313899, + "loss": 3.1477, + "step": 21790 + }, + { + "epoch": 1.07, + "grad_norm": 0.5446023344993591, + "learning_rate": 0.0004311560546040483, + "loss": 3.1807, + "step": 21791 + }, + { + "epoch": 1.07, + "grad_norm": 0.5380364656448364, + "learning_rate": 0.0004311422075487578, + "loss": 2.9937, + "step": 21792 + }, + { + "epoch": 1.07, + "grad_norm": 0.5410694479942322, + "learning_rate": 0.00043112836014806385, + "loss": 3.0857, + "step": 21793 + }, + { + "epoch": 1.07, + "grad_norm": 0.5312390327453613, + "learning_rate": 0.00043111451240200317, + "loss": 2.923, + "step": 21794 + }, + { + "epoch": 1.07, + "grad_norm": 0.5283505320549011, + "learning_rate": 0.000431100664310612, + "loss": 3.2019, + "step": 21795 + }, + { + "epoch": 1.07, + "grad_norm": 0.5480759739875793, + "learning_rate": 0.000431086815873927, + "loss": 3.091, + "step": 21796 + }, + { + "epoch": 1.07, + "grad_norm": 0.5413346290588379, + "learning_rate": 0.00043107296709198447, + "loss": 3.1228, + "step": 21797 + }, + { + "epoch": 1.07, + "grad_norm": 0.6117550134658813, + "learning_rate": 0.00043105911796482104, + "loss": 3.184, + "step": 21798 + }, + { + "epoch": 1.07, + "grad_norm": 0.514468252658844, + "learning_rate": 0.00043104526849247306, + "loss": 3.165, + "step": 21799 + }, + { + "epoch": 1.07, + "grad_norm": 0.542933464050293, + "learning_rate": 0.00043103141867497713, + "loss": 3.1681, + "step": 21800 + }, + { + "epoch": 1.07, + "grad_norm": 0.5387585163116455, + "learning_rate": 0.0004310175685123696, + "loss": 3.1421, + "step": 21801 + }, + { + "epoch": 1.07, + "grad_norm": 0.5503361821174622, + "learning_rate": 0.00043100371800468696, + "loss": 3.2455, + "step": 21802 + }, + { + "epoch": 1.07, + "grad_norm": 0.5273892879486084, + "learning_rate": 0.00043098986715196596, + "loss": 3.1728, + "step": 21803 + }, + { + "epoch": 1.07, + "grad_norm": 0.5189810395240784, + "learning_rate": 0.0004309760159542426, + "loss": 3.1855, + "step": 21804 + }, + { + "epoch": 1.07, + "grad_norm": 0.5295484066009521, + "learning_rate": 0.0004309621644115538, + "loss": 3.0618, + "step": 21805 + }, + { + "epoch": 1.07, + "grad_norm": 0.5467668175697327, + "learning_rate": 0.0004309483125239358, + "loss": 2.8678, + "step": 21806 + }, + { + "epoch": 1.07, + "grad_norm": 0.5472779273986816, + "learning_rate": 0.00043093446029142515, + "loss": 3.0829, + "step": 21807 + }, + { + "epoch": 1.07, + "grad_norm": 0.5611221194267273, + "learning_rate": 0.0004309206077140583, + "loss": 3.1447, + "step": 21808 + }, + { + "epoch": 1.07, + "grad_norm": 0.5621599555015564, + "learning_rate": 0.0004309067547918718, + "loss": 3.067, + "step": 21809 + }, + { + "epoch": 1.07, + "grad_norm": 0.5594969391822815, + "learning_rate": 0.0004308929015249021, + "loss": 3.1116, + "step": 21810 + }, + { + "epoch": 1.07, + "grad_norm": 0.5223482847213745, + "learning_rate": 0.0004308790479131857, + "loss": 3.0363, + "step": 21811 + }, + { + "epoch": 1.07, + "grad_norm": 0.5609418749809265, + "learning_rate": 0.00043086519395675897, + "loss": 3.0169, + "step": 21812 + }, + { + "epoch": 1.07, + "grad_norm": 0.5535745024681091, + "learning_rate": 0.0004308513396556585, + "loss": 2.7776, + "step": 21813 + }, + { + "epoch": 1.07, + "grad_norm": 0.5150181651115417, + "learning_rate": 0.000430837485009921, + "loss": 2.8866, + "step": 21814 + }, + { + "epoch": 1.07, + "grad_norm": 0.5247113108634949, + "learning_rate": 0.0004308236300195826, + "loss": 3.1063, + "step": 21815 + }, + { + "epoch": 1.07, + "grad_norm": 0.5388785600662231, + "learning_rate": 0.00043080977468467995, + "loss": 3.3039, + "step": 21816 + }, + { + "epoch": 1.07, + "grad_norm": 0.6435933709144592, + "learning_rate": 0.00043079591900524954, + "loss": 2.9531, + "step": 21817 + }, + { + "epoch": 1.07, + "grad_norm": 0.5959275960922241, + "learning_rate": 0.00043078206298132783, + "loss": 3.0319, + "step": 21818 + }, + { + "epoch": 1.07, + "grad_norm": 0.5229818820953369, + "learning_rate": 0.00043076820661295135, + "loss": 3.1529, + "step": 21819 + }, + { + "epoch": 1.07, + "grad_norm": 0.5054709911346436, + "learning_rate": 0.0004307543499001566, + "loss": 3.1894, + "step": 21820 + }, + { + "epoch": 1.07, + "grad_norm": 0.5688246488571167, + "learning_rate": 0.00043074049284297995, + "loss": 3.086, + "step": 21821 + }, + { + "epoch": 1.07, + "grad_norm": 0.598905622959137, + "learning_rate": 0.00043072663544145817, + "loss": 3.2127, + "step": 21822 + }, + { + "epoch": 1.07, + "grad_norm": 0.5313297510147095, + "learning_rate": 0.00043071277769562744, + "loss": 3.0613, + "step": 21823 + }, + { + "epoch": 1.07, + "grad_norm": 0.5411408543586731, + "learning_rate": 0.00043069891960552446, + "loss": 3.1642, + "step": 21824 + }, + { + "epoch": 1.07, + "grad_norm": 0.5270294547080994, + "learning_rate": 0.0004306850611711858, + "loss": 3.2479, + "step": 21825 + }, + { + "epoch": 1.07, + "grad_norm": 0.5298737287521362, + "learning_rate": 0.00043067120239264776, + "loss": 3.2503, + "step": 21826 + }, + { + "epoch": 1.07, + "grad_norm": 0.544009804725647, + "learning_rate": 0.0004306573432699468, + "loss": 3.013, + "step": 21827 + }, + { + "epoch": 1.07, + "grad_norm": 0.51810622215271, + "learning_rate": 0.0004306434838031196, + "loss": 3.04, + "step": 21828 + }, + { + "epoch": 1.07, + "grad_norm": 0.5270453095436096, + "learning_rate": 0.0004306296239922027, + "loss": 2.9395, + "step": 21829 + }, + { + "epoch": 1.07, + "grad_norm": 0.5867906212806702, + "learning_rate": 0.0004306157638372325, + "loss": 3.0299, + "step": 21830 + }, + { + "epoch": 1.07, + "grad_norm": 0.5443738698959351, + "learning_rate": 0.0004306019033382454, + "loss": 3.091, + "step": 21831 + }, + { + "epoch": 1.07, + "grad_norm": 0.5243510007858276, + "learning_rate": 0.0004305880424952781, + "loss": 3.0763, + "step": 21832 + }, + { + "epoch": 1.07, + "grad_norm": 0.5453609824180603, + "learning_rate": 0.00043057418130836706, + "loss": 3.0139, + "step": 21833 + }, + { + "epoch": 1.07, + "grad_norm": 0.5322545766830444, + "learning_rate": 0.0004305603197775487, + "loss": 2.9306, + "step": 21834 + }, + { + "epoch": 1.07, + "grad_norm": 0.5745664834976196, + "learning_rate": 0.00043054645790285964, + "loss": 3.1827, + "step": 21835 + }, + { + "epoch": 1.07, + "grad_norm": 0.5085702538490295, + "learning_rate": 0.0004305325956843363, + "loss": 3.0335, + "step": 21836 + }, + { + "epoch": 1.07, + "grad_norm": 0.5018842816352844, + "learning_rate": 0.00043051873312201526, + "loss": 3.0121, + "step": 21837 + }, + { + "epoch": 1.07, + "grad_norm": 0.5426105260848999, + "learning_rate": 0.00043050487021593285, + "loss": 3.0908, + "step": 21838 + }, + { + "epoch": 1.07, + "grad_norm": 0.529456615447998, + "learning_rate": 0.0004304910069661259, + "loss": 3.108, + "step": 21839 + }, + { + "epoch": 1.07, + "grad_norm": 0.5271927714347839, + "learning_rate": 0.0004304771433726307, + "loss": 3.0775, + "step": 21840 + }, + { + "epoch": 1.07, + "grad_norm": 0.5279380679130554, + "learning_rate": 0.00043046327943548383, + "loss": 3.059, + "step": 21841 + }, + { + "epoch": 1.07, + "grad_norm": 0.5312852263450623, + "learning_rate": 0.0004304494151547217, + "loss": 3.2413, + "step": 21842 + }, + { + "epoch": 1.07, + "grad_norm": 0.525822103023529, + "learning_rate": 0.00043043555053038095, + "loss": 3.0476, + "step": 21843 + }, + { + "epoch": 1.07, + "grad_norm": 0.5275469422340393, + "learning_rate": 0.0004304216855624981, + "loss": 3.1864, + "step": 21844 + }, + { + "epoch": 1.07, + "grad_norm": 0.5422474145889282, + "learning_rate": 0.00043040782025110966, + "loss": 3.1227, + "step": 21845 + }, + { + "epoch": 1.07, + "grad_norm": 0.5537338256835938, + "learning_rate": 0.00043039395459625203, + "loss": 3.2381, + "step": 21846 + }, + { + "epoch": 1.07, + "grad_norm": 0.5338166356086731, + "learning_rate": 0.000430380088597962, + "loss": 3.2902, + "step": 21847 + }, + { + "epoch": 1.07, + "grad_norm": 0.5591501593589783, + "learning_rate": 0.00043036622225627567, + "loss": 3.2828, + "step": 21848 + }, + { + "epoch": 1.07, + "grad_norm": 0.5365943312644958, + "learning_rate": 0.00043035235557122985, + "loss": 2.9454, + "step": 21849 + }, + { + "epoch": 1.07, + "grad_norm": 0.5561801195144653, + "learning_rate": 0.00043033848854286106, + "loss": 3.1468, + "step": 21850 + }, + { + "epoch": 1.07, + "grad_norm": 0.5465043187141418, + "learning_rate": 0.0004303246211712059, + "loss": 3.0577, + "step": 21851 + }, + { + "epoch": 1.07, + "grad_norm": 0.5761997699737549, + "learning_rate": 0.0004303107534563006, + "loss": 2.8597, + "step": 21852 + }, + { + "epoch": 1.07, + "grad_norm": 0.5243141055107117, + "learning_rate": 0.0004302968853981819, + "loss": 3.114, + "step": 21853 + }, + { + "epoch": 1.07, + "grad_norm": 0.5474295020103455, + "learning_rate": 0.0004302830169968863, + "loss": 3.0841, + "step": 21854 + }, + { + "epoch": 1.07, + "grad_norm": 0.5314984321594238, + "learning_rate": 0.0004302691482524503, + "loss": 3.25, + "step": 21855 + }, + { + "epoch": 1.07, + "grad_norm": 0.5473430156707764, + "learning_rate": 0.0004302552791649105, + "loss": 3.2211, + "step": 21856 + }, + { + "epoch": 1.07, + "grad_norm": 0.6076376438140869, + "learning_rate": 0.0004302414097343032, + "loss": 3.1258, + "step": 21857 + }, + { + "epoch": 1.07, + "grad_norm": 0.555774986743927, + "learning_rate": 0.0004302275399606653, + "loss": 3.2512, + "step": 21858 + }, + { + "epoch": 1.07, + "grad_norm": 0.5176228880882263, + "learning_rate": 0.0004302136698440329, + "loss": 3.0976, + "step": 21859 + }, + { + "epoch": 1.07, + "grad_norm": 0.5607179403305054, + "learning_rate": 0.0004301997993844429, + "loss": 3.1035, + "step": 21860 + }, + { + "epoch": 1.07, + "grad_norm": 0.5279338955879211, + "learning_rate": 0.00043018592858193165, + "loss": 3.0925, + "step": 21861 + }, + { + "epoch": 1.07, + "grad_norm": 0.506940484046936, + "learning_rate": 0.00043017205743653577, + "loss": 3.1532, + "step": 21862 + }, + { + "epoch": 1.07, + "grad_norm": 0.5521911978721619, + "learning_rate": 0.0004301581859482917, + "loss": 3.0362, + "step": 21863 + }, + { + "epoch": 1.07, + "grad_norm": 0.5607286095619202, + "learning_rate": 0.0004301443141172359, + "loss": 3.0133, + "step": 21864 + }, + { + "epoch": 1.07, + "grad_norm": 0.5357416272163391, + "learning_rate": 0.00043013044194340513, + "loss": 3.1389, + "step": 21865 + }, + { + "epoch": 1.07, + "grad_norm": 0.5648325681686401, + "learning_rate": 0.0004301165694268359, + "loss": 3.0022, + "step": 21866 + }, + { + "epoch": 1.07, + "grad_norm": 0.5390356779098511, + "learning_rate": 0.0004301026965675646, + "loss": 3.0165, + "step": 21867 + }, + { + "epoch": 1.07, + "grad_norm": 0.5444007515907288, + "learning_rate": 0.00043008882336562776, + "loss": 3.1716, + "step": 21868 + }, + { + "epoch": 1.07, + "grad_norm": 0.5538380742073059, + "learning_rate": 0.00043007494982106204, + "loss": 2.798, + "step": 21869 + }, + { + "epoch": 1.07, + "grad_norm": 0.5302233099937439, + "learning_rate": 0.000430061075933904, + "loss": 3.2093, + "step": 21870 + }, + { + "epoch": 1.07, + "grad_norm": 0.5354451537132263, + "learning_rate": 0.00043004720170419005, + "loss": 3.1233, + "step": 21871 + }, + { + "epoch": 1.07, + "grad_norm": 0.553068995475769, + "learning_rate": 0.00043003332713195685, + "loss": 2.9238, + "step": 21872 + }, + { + "epoch": 1.07, + "grad_norm": 0.5181149840354919, + "learning_rate": 0.0004300194522172409, + "loss": 3.185, + "step": 21873 + }, + { + "epoch": 1.07, + "grad_norm": 0.53992760181427, + "learning_rate": 0.0004300055769600786, + "loss": 3.1029, + "step": 21874 + }, + { + "epoch": 1.07, + "grad_norm": 0.5325241088867188, + "learning_rate": 0.00042999170136050674, + "loss": 3.1176, + "step": 21875 + }, + { + "epoch": 1.07, + "grad_norm": 0.5475149750709534, + "learning_rate": 0.0004299778254185618, + "loss": 3.044, + "step": 21876 + }, + { + "epoch": 1.07, + "grad_norm": 0.5398291945457458, + "learning_rate": 0.00042996394913428023, + "loss": 3.1001, + "step": 21877 + }, + { + "epoch": 1.07, + "grad_norm": 0.5641384124755859, + "learning_rate": 0.0004299500725076986, + "loss": 2.9772, + "step": 21878 + }, + { + "epoch": 1.07, + "grad_norm": 0.5371559262275696, + "learning_rate": 0.00042993619553885346, + "loss": 2.9344, + "step": 21879 + }, + { + "epoch": 1.07, + "grad_norm": 0.5246857404708862, + "learning_rate": 0.00042992231822778154, + "loss": 3.0261, + "step": 21880 + }, + { + "epoch": 1.07, + "grad_norm": 0.5782071948051453, + "learning_rate": 0.0004299084405745191, + "loss": 3.1893, + "step": 21881 + }, + { + "epoch": 1.07, + "grad_norm": 0.5363188982009888, + "learning_rate": 0.0004298945625791028, + "loss": 2.9895, + "step": 21882 + }, + { + "epoch": 1.07, + "grad_norm": 0.5487208366394043, + "learning_rate": 0.0004298806842415693, + "loss": 2.9313, + "step": 21883 + }, + { + "epoch": 1.07, + "grad_norm": 0.5662634968757629, + "learning_rate": 0.0004298668055619551, + "loss": 3.0855, + "step": 21884 + }, + { + "epoch": 1.07, + "grad_norm": 0.5252002477645874, + "learning_rate": 0.0004298529265402967, + "loss": 2.8349, + "step": 21885 + }, + { + "epoch": 1.07, + "grad_norm": 0.7216542363166809, + "learning_rate": 0.00042983904717663066, + "loss": 3.1169, + "step": 21886 + }, + { + "epoch": 1.07, + "grad_norm": 0.5242555141448975, + "learning_rate": 0.0004298251674709936, + "loss": 3.0732, + "step": 21887 + }, + { + "epoch": 1.07, + "grad_norm": 0.5485615730285645, + "learning_rate": 0.00042981128742342197, + "loss": 3.0352, + "step": 21888 + }, + { + "epoch": 1.07, + "grad_norm": 0.5380790829658508, + "learning_rate": 0.00042979740703395243, + "loss": 3.0147, + "step": 21889 + }, + { + "epoch": 1.07, + "grad_norm": 0.5466700792312622, + "learning_rate": 0.0004297835263026215, + "loss": 3.0625, + "step": 21890 + }, + { + "epoch": 1.07, + "grad_norm": 0.6295219659805298, + "learning_rate": 0.00042976964522946576, + "loss": 2.8748, + "step": 21891 + }, + { + "epoch": 1.07, + "grad_norm": 0.5317986011505127, + "learning_rate": 0.0004297557638145217, + "loss": 2.9312, + "step": 21892 + }, + { + "epoch": 1.07, + "grad_norm": 0.5600071549415588, + "learning_rate": 0.000429741882057826, + "loss": 3.1082, + "step": 21893 + }, + { + "epoch": 1.07, + "grad_norm": 0.5254099369049072, + "learning_rate": 0.0004297279999594151, + "loss": 2.9492, + "step": 21894 + }, + { + "epoch": 1.07, + "grad_norm": 0.557667076587677, + "learning_rate": 0.00042971411751932564, + "loss": 2.9501, + "step": 21895 + }, + { + "epoch": 1.07, + "grad_norm": 0.5373759865760803, + "learning_rate": 0.0004297002347375941, + "loss": 2.9849, + "step": 21896 + }, + { + "epoch": 1.07, + "grad_norm": 0.5822132229804993, + "learning_rate": 0.00042968635161425717, + "loss": 3.1294, + "step": 21897 + }, + { + "epoch": 1.07, + "grad_norm": 0.5314226150512695, + "learning_rate": 0.0004296724681493513, + "loss": 3.0397, + "step": 21898 + }, + { + "epoch": 1.07, + "grad_norm": 0.5426365733146667, + "learning_rate": 0.00042965858434291317, + "loss": 2.9715, + "step": 21899 + }, + { + "epoch": 1.07, + "grad_norm": 0.5367540121078491, + "learning_rate": 0.00042964470019497917, + "loss": 3.0716, + "step": 21900 + }, + { + "epoch": 1.07, + "grad_norm": 0.5575674176216125, + "learning_rate": 0.000429630815705586, + "loss": 3.2273, + "step": 21901 + }, + { + "epoch": 1.07, + "grad_norm": 0.5367324948310852, + "learning_rate": 0.0004296169308747704, + "loss": 2.9208, + "step": 21902 + }, + { + "epoch": 1.07, + "grad_norm": 0.5453097224235535, + "learning_rate": 0.00042960304570256855, + "loss": 3.211, + "step": 21903 + }, + { + "epoch": 1.07, + "grad_norm": 0.5521195530891418, + "learning_rate": 0.00042958916018901724, + "loss": 3.0296, + "step": 21904 + }, + { + "epoch": 1.07, + "grad_norm": 0.5742548108100891, + "learning_rate": 0.0004295752743341531, + "loss": 2.9801, + "step": 21905 + }, + { + "epoch": 1.07, + "grad_norm": 0.5651740431785583, + "learning_rate": 0.00042956138813801263, + "loss": 3.216, + "step": 21906 + }, + { + "epoch": 1.07, + "grad_norm": 0.5569771528244019, + "learning_rate": 0.0004295475016006323, + "loss": 3.0902, + "step": 21907 + }, + { + "epoch": 1.07, + "grad_norm": 0.5515370965003967, + "learning_rate": 0.0004295336147220488, + "loss": 3.2262, + "step": 21908 + }, + { + "epoch": 1.07, + "grad_norm": 0.5465577244758606, + "learning_rate": 0.0004295197275022988, + "loss": 3.0305, + "step": 21909 + }, + { + "epoch": 1.07, + "grad_norm": 0.5600085854530334, + "learning_rate": 0.00042950583994141867, + "loss": 3.2192, + "step": 21910 + }, + { + "epoch": 1.07, + "grad_norm": 0.5199048519134521, + "learning_rate": 0.00042949195203944505, + "loss": 3.2271, + "step": 21911 + }, + { + "epoch": 1.07, + "grad_norm": 0.5712819695472717, + "learning_rate": 0.0004294780637964146, + "loss": 3.1676, + "step": 21912 + }, + { + "epoch": 1.07, + "grad_norm": 0.5674663782119751, + "learning_rate": 0.00042946417521236394, + "loss": 3.0938, + "step": 21913 + }, + { + "epoch": 1.07, + "grad_norm": 0.5764085054397583, + "learning_rate": 0.0004294502862873294, + "loss": 3.1466, + "step": 21914 + }, + { + "epoch": 1.07, + "grad_norm": 0.5495882630348206, + "learning_rate": 0.0004294363970213477, + "loss": 3.1355, + "step": 21915 + }, + { + "epoch": 1.07, + "grad_norm": 0.5135552287101746, + "learning_rate": 0.0004294225074144556, + "loss": 2.932, + "step": 21916 + }, + { + "epoch": 1.07, + "grad_norm": 0.5441713929176331, + "learning_rate": 0.0004294086174666895, + "loss": 3.0714, + "step": 21917 + }, + { + "epoch": 1.07, + "grad_norm": 0.539913535118103, + "learning_rate": 0.00042939472717808596, + "loss": 3.0711, + "step": 21918 + }, + { + "epoch": 1.07, + "grad_norm": 0.5363076329231262, + "learning_rate": 0.00042938083654868155, + "loss": 3.0245, + "step": 21919 + }, + { + "epoch": 1.07, + "grad_norm": 0.5468325614929199, + "learning_rate": 0.000429366945578513, + "loss": 2.9802, + "step": 21920 + }, + { + "epoch": 1.07, + "grad_norm": 0.5180367827415466, + "learning_rate": 0.0004293530542676168, + "loss": 3.2203, + "step": 21921 + }, + { + "epoch": 1.07, + "grad_norm": 0.5398200154304504, + "learning_rate": 0.00042933916261602946, + "loss": 2.9909, + "step": 21922 + }, + { + "epoch": 1.07, + "grad_norm": 0.5421903729438782, + "learning_rate": 0.0004293252706237878, + "loss": 2.8939, + "step": 21923 + }, + { + "epoch": 1.07, + "grad_norm": 0.6031635999679565, + "learning_rate": 0.00042931137829092824, + "loss": 2.8989, + "step": 21924 + }, + { + "epoch": 1.07, + "grad_norm": 0.5310101509094238, + "learning_rate": 0.00042929748561748723, + "loss": 2.9027, + "step": 21925 + }, + { + "epoch": 1.07, + "grad_norm": 0.5500092506408691, + "learning_rate": 0.0004292835926035017, + "loss": 3.0154, + "step": 21926 + }, + { + "epoch": 1.07, + "grad_norm": 0.5398533940315247, + "learning_rate": 0.00042926969924900806, + "loss": 3.1928, + "step": 21927 + }, + { + "epoch": 1.07, + "grad_norm": 0.5290772914886475, + "learning_rate": 0.0004292558055540429, + "loss": 3.0907, + "step": 21928 + }, + { + "epoch": 1.07, + "grad_norm": 0.553503692150116, + "learning_rate": 0.00042924191151864276, + "loss": 3.2309, + "step": 21929 + }, + { + "epoch": 1.07, + "grad_norm": 0.5868078470230103, + "learning_rate": 0.00042922801714284425, + "loss": 3.244, + "step": 21930 + }, + { + "epoch": 1.07, + "grad_norm": 0.535077691078186, + "learning_rate": 0.0004292141224266842, + "loss": 3.3127, + "step": 21931 + }, + { + "epoch": 1.07, + "grad_norm": 0.5569257736206055, + "learning_rate": 0.0004292002273701989, + "loss": 3.0853, + "step": 21932 + }, + { + "epoch": 1.07, + "grad_norm": 0.5465898513793945, + "learning_rate": 0.00042918633197342506, + "loss": 3.0265, + "step": 21933 + }, + { + "epoch": 1.07, + "grad_norm": 0.5579783916473389, + "learning_rate": 0.00042917243623639934, + "loss": 3.087, + "step": 21934 + }, + { + "epoch": 1.07, + "grad_norm": 0.5138366222381592, + "learning_rate": 0.00042915854015915825, + "loss": 3.2086, + "step": 21935 + }, + { + "epoch": 1.08, + "grad_norm": 0.5494992136955261, + "learning_rate": 0.0004291446437417383, + "loss": 3.0049, + "step": 21936 + }, + { + "epoch": 1.08, + "grad_norm": 0.5276719927787781, + "learning_rate": 0.00042913074698417645, + "loss": 3.0952, + "step": 21937 + }, + { + "epoch": 1.08, + "grad_norm": 0.6109719276428223, + "learning_rate": 0.00042911684988650894, + "loss": 3.0782, + "step": 21938 + }, + { + "epoch": 1.08, + "grad_norm": 0.5517315864562988, + "learning_rate": 0.0004291029524487725, + "loss": 2.9928, + "step": 21939 + }, + { + "epoch": 1.08, + "grad_norm": 0.5608121752738953, + "learning_rate": 0.0004290890546710038, + "loss": 3.0408, + "step": 21940 + }, + { + "epoch": 1.08, + "grad_norm": 0.5328395366668701, + "learning_rate": 0.0004290751565532392, + "loss": 3.0053, + "step": 21941 + }, + { + "epoch": 1.08, + "grad_norm": 0.519293487071991, + "learning_rate": 0.0004290612580955156, + "loss": 3.1365, + "step": 21942 + }, + { + "epoch": 1.08, + "grad_norm": 0.5312771797180176, + "learning_rate": 0.0004290473592978695, + "loss": 3.1626, + "step": 21943 + }, + { + "epoch": 1.08, + "grad_norm": 0.5550572276115417, + "learning_rate": 0.00042903346016033746, + "loss": 2.9873, + "step": 21944 + }, + { + "epoch": 1.08, + "grad_norm": 0.5349480509757996, + "learning_rate": 0.0004290195606829561, + "loss": 3.0404, + "step": 21945 + }, + { + "epoch": 1.08, + "grad_norm": 0.53115314245224, + "learning_rate": 0.0004290056608657621, + "loss": 3.1882, + "step": 21946 + }, + { + "epoch": 1.08, + "grad_norm": 0.5174546837806702, + "learning_rate": 0.00042899176070879195, + "loss": 3.1557, + "step": 21947 + }, + { + "epoch": 1.08, + "grad_norm": 0.5151444673538208, + "learning_rate": 0.0004289778602120824, + "loss": 2.9821, + "step": 21948 + }, + { + "epoch": 1.08, + "grad_norm": 0.5512619614601135, + "learning_rate": 0.00042896395937567003, + "loss": 3.2606, + "step": 21949 + }, + { + "epoch": 1.08, + "grad_norm": 0.5739850997924805, + "learning_rate": 0.00042895005819959124, + "loss": 3.0911, + "step": 21950 + }, + { + "epoch": 1.08, + "grad_norm": 0.5571601390838623, + "learning_rate": 0.00042893615668388287, + "loss": 3.0645, + "step": 21951 + }, + { + "epoch": 1.08, + "grad_norm": 0.5094681978225708, + "learning_rate": 0.00042892225482858145, + "loss": 3.1872, + "step": 21952 + }, + { + "epoch": 1.08, + "grad_norm": 0.531054675579071, + "learning_rate": 0.00042890835263372374, + "loss": 2.8496, + "step": 21953 + }, + { + "epoch": 1.08, + "grad_norm": 0.5505704283714294, + "learning_rate": 0.0004288944500993462, + "loss": 3.0071, + "step": 21954 + }, + { + "epoch": 1.08, + "grad_norm": 0.5394859313964844, + "learning_rate": 0.0004288805472254855, + "loss": 3.2745, + "step": 21955 + }, + { + "epoch": 1.08, + "grad_norm": 0.5463074445724487, + "learning_rate": 0.0004288666440121782, + "loss": 3.0487, + "step": 21956 + }, + { + "epoch": 1.08, + "grad_norm": 0.5753787159919739, + "learning_rate": 0.00042885274045946096, + "loss": 3.0229, + "step": 21957 + }, + { + "epoch": 1.08, + "grad_norm": 0.5389484763145447, + "learning_rate": 0.0004288388365673704, + "loss": 2.9471, + "step": 21958 + }, + { + "epoch": 1.08, + "grad_norm": 0.5314404964447021, + "learning_rate": 0.00042882493233594316, + "loss": 3.1627, + "step": 21959 + }, + { + "epoch": 1.08, + "grad_norm": 0.5405857563018799, + "learning_rate": 0.0004288110277652159, + "loss": 3.0673, + "step": 21960 + }, + { + "epoch": 1.08, + "grad_norm": 0.5381196141242981, + "learning_rate": 0.0004287971228552251, + "loss": 3.1375, + "step": 21961 + }, + { + "epoch": 1.08, + "grad_norm": 0.5330754518508911, + "learning_rate": 0.0004287832176060074, + "loss": 2.9481, + "step": 21962 + }, + { + "epoch": 1.08, + "grad_norm": 0.5497124791145325, + "learning_rate": 0.0004287693120175996, + "loss": 3.0511, + "step": 21963 + }, + { + "epoch": 1.08, + "grad_norm": 0.5864923596382141, + "learning_rate": 0.0004287554060900382, + "loss": 3.1814, + "step": 21964 + }, + { + "epoch": 1.08, + "grad_norm": 0.5531643629074097, + "learning_rate": 0.0004287414998233598, + "loss": 3.0753, + "step": 21965 + }, + { + "epoch": 1.08, + "grad_norm": 0.5571635961532593, + "learning_rate": 0.0004287275932176011, + "loss": 2.926, + "step": 21966 + }, + { + "epoch": 1.08, + "grad_norm": 0.5513104796409607, + "learning_rate": 0.00042871368627279873, + "loss": 3.1882, + "step": 21967 + }, + { + "epoch": 1.08, + "grad_norm": 0.548090934753418, + "learning_rate": 0.0004286997789889892, + "loss": 3.265, + "step": 21968 + }, + { + "epoch": 1.08, + "grad_norm": 0.5597220063209534, + "learning_rate": 0.00042868587136620927, + "loss": 2.9922, + "step": 21969 + }, + { + "epoch": 1.08, + "grad_norm": 0.5412937998771667, + "learning_rate": 0.0004286719634044955, + "loss": 3.0208, + "step": 21970 + }, + { + "epoch": 1.08, + "grad_norm": 0.5296668410301208, + "learning_rate": 0.00042865805510388456, + "loss": 3.1406, + "step": 21971 + }, + { + "epoch": 1.08, + "grad_norm": 0.5780396461486816, + "learning_rate": 0.000428644146464413, + "loss": 3.0601, + "step": 21972 + }, + { + "epoch": 1.08, + "grad_norm": 0.5567464828491211, + "learning_rate": 0.00042863023748611757, + "loss": 3.2744, + "step": 21973 + }, + { + "epoch": 1.08, + "grad_norm": 0.5450098514556885, + "learning_rate": 0.00042861632816903485, + "loss": 3.0649, + "step": 21974 + }, + { + "epoch": 1.08, + "grad_norm": 0.5599391460418701, + "learning_rate": 0.00042860241851320157, + "loss": 3.1379, + "step": 21975 + }, + { + "epoch": 1.08, + "grad_norm": 0.5682699680328369, + "learning_rate": 0.00042858850851865414, + "loss": 3.0891, + "step": 21976 + }, + { + "epoch": 1.08, + "grad_norm": 0.578801155090332, + "learning_rate": 0.0004285745981854293, + "loss": 2.6889, + "step": 21977 + }, + { + "epoch": 1.08, + "grad_norm": 0.5470324754714966, + "learning_rate": 0.0004285606875135638, + "loss": 3.2862, + "step": 21978 + }, + { + "epoch": 1.08, + "grad_norm": 0.5256801843643188, + "learning_rate": 0.00042854677650309416, + "loss": 3.3307, + "step": 21979 + }, + { + "epoch": 1.08, + "grad_norm": 0.5400407910346985, + "learning_rate": 0.0004285328651540571, + "loss": 2.9132, + "step": 21980 + }, + { + "epoch": 1.08, + "grad_norm": 0.5310783982276917, + "learning_rate": 0.00042851895346648916, + "loss": 3.0173, + "step": 21981 + }, + { + "epoch": 1.08, + "grad_norm": 0.5397918820381165, + "learning_rate": 0.00042850504144042697, + "loss": 3.1176, + "step": 21982 + }, + { + "epoch": 1.08, + "grad_norm": 0.5340597033500671, + "learning_rate": 0.0004284911290759073, + "loss": 3.2694, + "step": 21983 + }, + { + "epoch": 1.08, + "grad_norm": 0.5765253901481628, + "learning_rate": 0.0004284772163729667, + "loss": 2.9625, + "step": 21984 + }, + { + "epoch": 1.08, + "grad_norm": 0.5977330803871155, + "learning_rate": 0.0004284633033316419, + "loss": 2.8798, + "step": 21985 + }, + { + "epoch": 1.08, + "grad_norm": 0.5429593324661255, + "learning_rate": 0.0004284493899519695, + "loss": 3.2659, + "step": 21986 + }, + { + "epoch": 1.08, + "grad_norm": 0.5388059020042419, + "learning_rate": 0.000428435476233986, + "loss": 2.9844, + "step": 21987 + }, + { + "epoch": 1.08, + "grad_norm": 0.5572206377983093, + "learning_rate": 0.0004284215621777282, + "loss": 3.0551, + "step": 21988 + }, + { + "epoch": 1.08, + "grad_norm": 0.5941150784492493, + "learning_rate": 0.0004284076477832328, + "loss": 2.8342, + "step": 21989 + }, + { + "epoch": 1.08, + "grad_norm": 0.5501981973648071, + "learning_rate": 0.00042839373305053635, + "loss": 3.0811, + "step": 21990 + }, + { + "epoch": 1.08, + "grad_norm": 0.5432038903236389, + "learning_rate": 0.00042837981797967553, + "loss": 3.0867, + "step": 21991 + }, + { + "epoch": 1.08, + "grad_norm": 0.5538283586502075, + "learning_rate": 0.00042836590257068686, + "loss": 3.3209, + "step": 21992 + }, + { + "epoch": 1.08, + "grad_norm": 0.5830565094947815, + "learning_rate": 0.00042835198682360725, + "loss": 2.9133, + "step": 21993 + }, + { + "epoch": 1.08, + "grad_norm": 0.5611438155174255, + "learning_rate": 0.00042833807073847313, + "loss": 2.9194, + "step": 21994 + }, + { + "epoch": 1.08, + "grad_norm": 0.5672524571418762, + "learning_rate": 0.0004283241543153212, + "loss": 3.0636, + "step": 21995 + }, + { + "epoch": 1.08, + "grad_norm": 0.5321059226989746, + "learning_rate": 0.00042831023755418823, + "loss": 3.1639, + "step": 21996 + }, + { + "epoch": 1.08, + "grad_norm": 0.574716329574585, + "learning_rate": 0.0004282963204551108, + "loss": 3.1508, + "step": 21997 + }, + { + "epoch": 1.08, + "grad_norm": 0.540168046951294, + "learning_rate": 0.0004282824030181254, + "loss": 2.825, + "step": 21998 + }, + { + "epoch": 1.08, + "grad_norm": 0.5404049158096313, + "learning_rate": 0.00042826848524326906, + "loss": 3.2393, + "step": 21999 + }, + { + "epoch": 1.08, + "grad_norm": 0.5294651389122009, + "learning_rate": 0.0004282545671305781, + "loss": 3.2042, + "step": 22000 + }, + { + "epoch": 1.08, + "grad_norm": 0.5121399760246277, + "learning_rate": 0.0004282406486800893, + "loss": 3.3916, + "step": 22001 + }, + { + "epoch": 1.08, + "grad_norm": 0.5601375699043274, + "learning_rate": 0.0004282267298918392, + "loss": 2.9623, + "step": 22002 + }, + { + "epoch": 1.08, + "grad_norm": 0.5114780068397522, + "learning_rate": 0.0004282128107658647, + "loss": 2.9027, + "step": 22003 + }, + { + "epoch": 1.08, + "grad_norm": 0.5449838638305664, + "learning_rate": 0.0004281988913022024, + "loss": 3.0159, + "step": 22004 + }, + { + "epoch": 1.08, + "grad_norm": 0.5693271160125732, + "learning_rate": 0.0004281849715008888, + "loss": 3.162, + "step": 22005 + }, + { + "epoch": 1.08, + "grad_norm": 0.5240155458450317, + "learning_rate": 0.0004281710513619606, + "loss": 3.1996, + "step": 22006 + }, + { + "epoch": 1.08, + "grad_norm": 0.5236963629722595, + "learning_rate": 0.0004281571308854546, + "loss": 3.3674, + "step": 22007 + }, + { + "epoch": 1.08, + "grad_norm": 0.581671953201294, + "learning_rate": 0.00042814321007140734, + "loss": 3.1306, + "step": 22008 + }, + { + "epoch": 1.08, + "grad_norm": 0.5509883761405945, + "learning_rate": 0.00042812928891985556, + "loss": 3.1849, + "step": 22009 + }, + { + "epoch": 1.08, + "grad_norm": 0.5407490134239197, + "learning_rate": 0.00042811536743083583, + "loss": 3.2232, + "step": 22010 + }, + { + "epoch": 1.08, + "grad_norm": 0.5536962151527405, + "learning_rate": 0.00042810144560438495, + "loss": 2.9887, + "step": 22011 + }, + { + "epoch": 1.08, + "grad_norm": 0.5936238765716553, + "learning_rate": 0.0004280875234405395, + "loss": 2.943, + "step": 22012 + }, + { + "epoch": 1.08, + "grad_norm": 0.5407906174659729, + "learning_rate": 0.0004280736009393361, + "loss": 3.271, + "step": 22013 + }, + { + "epoch": 1.08, + "grad_norm": 0.569972813129425, + "learning_rate": 0.00042805967810081156, + "loss": 3.0759, + "step": 22014 + }, + { + "epoch": 1.08, + "grad_norm": 0.5183060765266418, + "learning_rate": 0.00042804575492500245, + "loss": 3.2847, + "step": 22015 + }, + { + "epoch": 1.08, + "grad_norm": 0.5623323917388916, + "learning_rate": 0.00042803183141194547, + "loss": 3.2323, + "step": 22016 + }, + { + "epoch": 1.08, + "grad_norm": 0.5643849968910217, + "learning_rate": 0.00042801790756167733, + "loss": 3.0499, + "step": 22017 + }, + { + "epoch": 1.08, + "grad_norm": 0.5582993030548096, + "learning_rate": 0.00042800398337423457, + "loss": 3.051, + "step": 22018 + }, + { + "epoch": 1.08, + "grad_norm": 0.5692091584205627, + "learning_rate": 0.000427990058849654, + "loss": 3.0984, + "step": 22019 + }, + { + "epoch": 1.08, + "grad_norm": 0.5723261833190918, + "learning_rate": 0.0004279761339879722, + "loss": 3.0136, + "step": 22020 + }, + { + "epoch": 1.08, + "grad_norm": 0.5091471672058105, + "learning_rate": 0.0004279622087892259, + "loss": 3.2192, + "step": 22021 + }, + { + "epoch": 1.08, + "grad_norm": 0.5590563416481018, + "learning_rate": 0.0004279482832534519, + "loss": 3.0695, + "step": 22022 + }, + { + "epoch": 1.08, + "grad_norm": 0.5491331815719604, + "learning_rate": 0.0004279343573806866, + "loss": 3.1763, + "step": 22023 + }, + { + "epoch": 1.08, + "grad_norm": 0.5999693274497986, + "learning_rate": 0.00042792043117096686, + "loss": 2.8919, + "step": 22024 + }, + { + "epoch": 1.08, + "grad_norm": 0.5291171669960022, + "learning_rate": 0.0004279065046243293, + "loss": 3.404, + "step": 22025 + }, + { + "epoch": 1.08, + "grad_norm": 0.5340095162391663, + "learning_rate": 0.00042789257774081073, + "loss": 3.2515, + "step": 22026 + }, + { + "epoch": 1.08, + "grad_norm": 0.5588778853416443, + "learning_rate": 0.00042787865052044765, + "loss": 3.3579, + "step": 22027 + }, + { + "epoch": 1.08, + "grad_norm": 0.5633630156517029, + "learning_rate": 0.0004278647229632767, + "loss": 3.0803, + "step": 22028 + }, + { + "epoch": 1.08, + "grad_norm": 0.5125625133514404, + "learning_rate": 0.0004278507950693348, + "loss": 2.9785, + "step": 22029 + }, + { + "epoch": 1.08, + "grad_norm": 0.6125033497810364, + "learning_rate": 0.00042783686683865854, + "loss": 3.1076, + "step": 22030 + }, + { + "epoch": 1.08, + "grad_norm": 0.5360808372497559, + "learning_rate": 0.00042782293827128457, + "loss": 3.1738, + "step": 22031 + }, + { + "epoch": 1.08, + "grad_norm": 0.5409636497497559, + "learning_rate": 0.00042780900936724957, + "loss": 3.0977, + "step": 22032 + }, + { + "epoch": 1.08, + "grad_norm": 0.5472274422645569, + "learning_rate": 0.0004277950801265902, + "loss": 2.994, + "step": 22033 + }, + { + "epoch": 1.08, + "grad_norm": 0.5469160079956055, + "learning_rate": 0.00042778115054934314, + "loss": 3.0537, + "step": 22034 + }, + { + "epoch": 1.08, + "grad_norm": 0.5636223554611206, + "learning_rate": 0.0004277672206355452, + "loss": 3.01, + "step": 22035 + }, + { + "epoch": 1.08, + "grad_norm": 0.6423721313476562, + "learning_rate": 0.000427753290385233, + "loss": 2.9419, + "step": 22036 + }, + { + "epoch": 1.08, + "grad_norm": 0.53389573097229, + "learning_rate": 0.00042773935979844327, + "loss": 3.0792, + "step": 22037 + }, + { + "epoch": 1.08, + "grad_norm": 0.5442113280296326, + "learning_rate": 0.00042772542887521247, + "loss": 3.0606, + "step": 22038 + }, + { + "epoch": 1.08, + "grad_norm": 0.5777226686477661, + "learning_rate": 0.0004277114976155776, + "loss": 3.2876, + "step": 22039 + }, + { + "epoch": 1.08, + "grad_norm": 0.5454306602478027, + "learning_rate": 0.0004276975660195752, + "loss": 2.9339, + "step": 22040 + }, + { + "epoch": 1.08, + "grad_norm": 0.544465959072113, + "learning_rate": 0.000427683634087242, + "loss": 3.0272, + "step": 22041 + }, + { + "epoch": 1.08, + "grad_norm": 0.5971094369888306, + "learning_rate": 0.00042766970181861466, + "loss": 3.0422, + "step": 22042 + }, + { + "epoch": 1.08, + "grad_norm": 0.5495122075080872, + "learning_rate": 0.00042765576921372986, + "loss": 3.0553, + "step": 22043 + }, + { + "epoch": 1.08, + "grad_norm": 0.5605582594871521, + "learning_rate": 0.00042764183627262437, + "loss": 3.3398, + "step": 22044 + }, + { + "epoch": 1.08, + "grad_norm": 0.564033031463623, + "learning_rate": 0.00042762790299533484, + "loss": 3.0244, + "step": 22045 + }, + { + "epoch": 1.08, + "grad_norm": 0.5470286011695862, + "learning_rate": 0.000427613969381898, + "loss": 2.9283, + "step": 22046 + }, + { + "epoch": 1.08, + "grad_norm": 0.5006475448608398, + "learning_rate": 0.00042760003543235056, + "loss": 2.9596, + "step": 22047 + }, + { + "epoch": 1.08, + "grad_norm": 0.5973907709121704, + "learning_rate": 0.00042758610114672905, + "loss": 2.888, + "step": 22048 + }, + { + "epoch": 1.08, + "grad_norm": 0.5585212111473083, + "learning_rate": 0.00042757216652507046, + "loss": 3.1172, + "step": 22049 + }, + { + "epoch": 1.08, + "grad_norm": 0.5310104489326477, + "learning_rate": 0.00042755823156741127, + "loss": 3.0854, + "step": 22050 + }, + { + "epoch": 1.08, + "grad_norm": 0.5287449359893799, + "learning_rate": 0.00042754429627378824, + "loss": 3.1736, + "step": 22051 + }, + { + "epoch": 1.08, + "grad_norm": 0.5601946711540222, + "learning_rate": 0.00042753036064423813, + "loss": 3.1034, + "step": 22052 + }, + { + "epoch": 1.08, + "grad_norm": 0.5369007587432861, + "learning_rate": 0.00042751642467879746, + "loss": 2.9668, + "step": 22053 + }, + { + "epoch": 1.08, + "grad_norm": 0.5678867101669312, + "learning_rate": 0.00042750248837750317, + "loss": 3.331, + "step": 22054 + }, + { + "epoch": 1.08, + "grad_norm": 0.5435790419578552, + "learning_rate": 0.0004274885517403919, + "loss": 2.823, + "step": 22055 + }, + { + "epoch": 1.08, + "grad_norm": 0.4990702271461487, + "learning_rate": 0.00042747461476750026, + "loss": 2.8055, + "step": 22056 + }, + { + "epoch": 1.08, + "grad_norm": 0.551609218120575, + "learning_rate": 0.000427460677458865, + "loss": 3.0637, + "step": 22057 + }, + { + "epoch": 1.08, + "grad_norm": 0.5618946552276611, + "learning_rate": 0.00042744673981452285, + "loss": 3.0707, + "step": 22058 + }, + { + "epoch": 1.08, + "grad_norm": 0.5400345921516418, + "learning_rate": 0.0004274328018345105, + "loss": 2.8357, + "step": 22059 + }, + { + "epoch": 1.08, + "grad_norm": 0.5235921740531921, + "learning_rate": 0.0004274188635188647, + "loss": 3.0765, + "step": 22060 + }, + { + "epoch": 1.08, + "grad_norm": 0.5762670636177063, + "learning_rate": 0.00042740492486762215, + "loss": 3.2314, + "step": 22061 + }, + { + "epoch": 1.08, + "grad_norm": 0.5849509239196777, + "learning_rate": 0.0004273909858808196, + "loss": 2.9629, + "step": 22062 + }, + { + "epoch": 1.08, + "grad_norm": 0.5751528143882751, + "learning_rate": 0.0004273770465584937, + "loss": 3.1594, + "step": 22063 + }, + { + "epoch": 1.08, + "grad_norm": 0.5581267476081848, + "learning_rate": 0.00042736310690068106, + "loss": 3.0604, + "step": 22064 + }, + { + "epoch": 1.08, + "grad_norm": 0.6129317283630371, + "learning_rate": 0.00042734916690741854, + "loss": 3.1213, + "step": 22065 + }, + { + "epoch": 1.08, + "grad_norm": 0.5418512225151062, + "learning_rate": 0.00042733522657874295, + "loss": 3.0227, + "step": 22066 + }, + { + "epoch": 1.08, + "grad_norm": 0.5475884079933167, + "learning_rate": 0.0004273212859146908, + "loss": 3.254, + "step": 22067 + }, + { + "epoch": 1.08, + "grad_norm": 0.5630029439926147, + "learning_rate": 0.00042730734491529883, + "loss": 3.1402, + "step": 22068 + }, + { + "epoch": 1.08, + "grad_norm": 0.5417295098304749, + "learning_rate": 0.00042729340358060386, + "loss": 3.2479, + "step": 22069 + }, + { + "epoch": 1.08, + "grad_norm": 0.5166475772857666, + "learning_rate": 0.0004272794619106426, + "loss": 3.123, + "step": 22070 + }, + { + "epoch": 1.08, + "grad_norm": 0.554018497467041, + "learning_rate": 0.00042726551990545167, + "loss": 3.0511, + "step": 22071 + }, + { + "epoch": 1.08, + "grad_norm": 0.520998477935791, + "learning_rate": 0.00042725157756506784, + "loss": 2.9895, + "step": 22072 + }, + { + "epoch": 1.08, + "grad_norm": 0.5566690564155579, + "learning_rate": 0.00042723763488952796, + "loss": 3.1612, + "step": 22073 + }, + { + "epoch": 1.08, + "grad_norm": 0.5547829270362854, + "learning_rate": 0.0004272236918788685, + "loss": 3.1395, + "step": 22074 + }, + { + "epoch": 1.08, + "grad_norm": 0.6837734580039978, + "learning_rate": 0.00042720974853312633, + "loss": 3.0537, + "step": 22075 + }, + { + "epoch": 1.08, + "grad_norm": 0.6061183214187622, + "learning_rate": 0.00042719580485233826, + "loss": 2.9883, + "step": 22076 + }, + { + "epoch": 1.08, + "grad_norm": 0.5670740604400635, + "learning_rate": 0.00042718186083654094, + "loss": 3.0004, + "step": 22077 + }, + { + "epoch": 1.08, + "grad_norm": 0.5454275608062744, + "learning_rate": 0.000427167916485771, + "loss": 3.0537, + "step": 22078 + }, + { + "epoch": 1.08, + "grad_norm": 0.5352046489715576, + "learning_rate": 0.00042715397180006516, + "loss": 3.1883, + "step": 22079 + }, + { + "epoch": 1.08, + "grad_norm": 0.5673153400421143, + "learning_rate": 0.0004271400267794604, + "loss": 3.106, + "step": 22080 + }, + { + "epoch": 1.08, + "grad_norm": 0.5358964800834656, + "learning_rate": 0.0004271260814239932, + "loss": 3.0141, + "step": 22081 + }, + { + "epoch": 1.08, + "grad_norm": 0.6573508977890015, + "learning_rate": 0.0004271121357337003, + "loss": 3.0543, + "step": 22082 + }, + { + "epoch": 1.08, + "grad_norm": 0.5617774128913879, + "learning_rate": 0.0004270981897086186, + "loss": 3.0414, + "step": 22083 + }, + { + "epoch": 1.08, + "grad_norm": 0.5869306921958923, + "learning_rate": 0.0004270842433487847, + "loss": 2.9109, + "step": 22084 + }, + { + "epoch": 1.08, + "grad_norm": 0.5464381575584412, + "learning_rate": 0.00042707029665423535, + "loss": 2.9973, + "step": 22085 + }, + { + "epoch": 1.08, + "grad_norm": 0.5536450147628784, + "learning_rate": 0.00042705634962500726, + "loss": 3.1901, + "step": 22086 + }, + { + "epoch": 1.08, + "grad_norm": 0.5760237574577332, + "learning_rate": 0.00042704240226113725, + "loss": 3.2899, + "step": 22087 + }, + { + "epoch": 1.08, + "grad_norm": 0.5971628427505493, + "learning_rate": 0.00042702845456266207, + "loss": 2.9888, + "step": 22088 + }, + { + "epoch": 1.08, + "grad_norm": 0.5124607682228088, + "learning_rate": 0.0004270145065296183, + "loss": 3.1622, + "step": 22089 + }, + { + "epoch": 1.08, + "grad_norm": 0.5447020530700684, + "learning_rate": 0.0004270005581620427, + "loss": 3.1413, + "step": 22090 + }, + { + "epoch": 1.08, + "grad_norm": 0.5350099802017212, + "learning_rate": 0.00042698660945997215, + "loss": 3.0142, + "step": 22091 + }, + { + "epoch": 1.08, + "grad_norm": 0.5177395343780518, + "learning_rate": 0.0004269726604234433, + "loss": 3.2439, + "step": 22092 + }, + { + "epoch": 1.08, + "grad_norm": 0.5653977990150452, + "learning_rate": 0.0004269587110524929, + "loss": 3.0023, + "step": 22093 + }, + { + "epoch": 1.08, + "grad_norm": 0.5047891736030579, + "learning_rate": 0.0004269447613471577, + "loss": 2.987, + "step": 22094 + }, + { + "epoch": 1.08, + "grad_norm": 0.540993869304657, + "learning_rate": 0.00042693081130747444, + "loss": 3.1142, + "step": 22095 + }, + { + "epoch": 1.08, + "grad_norm": 0.5562851428985596, + "learning_rate": 0.0004269168609334798, + "loss": 2.9607, + "step": 22096 + }, + { + "epoch": 1.08, + "grad_norm": 0.5496192574501038, + "learning_rate": 0.0004269029102252106, + "loss": 3.0992, + "step": 22097 + }, + { + "epoch": 1.08, + "grad_norm": 0.5273379683494568, + "learning_rate": 0.0004268889591827036, + "loss": 2.926, + "step": 22098 + }, + { + "epoch": 1.08, + "grad_norm": 0.5540835857391357, + "learning_rate": 0.00042687500780599544, + "loss": 3.0501, + "step": 22099 + }, + { + "epoch": 1.08, + "grad_norm": 0.5160466432571411, + "learning_rate": 0.0004268610560951229, + "loss": 3.152, + "step": 22100 + }, + { + "epoch": 1.08, + "grad_norm": 0.5677379965782166, + "learning_rate": 0.0004268471040501228, + "loss": 3.0895, + "step": 22101 + }, + { + "epoch": 1.08, + "grad_norm": 0.5393813848495483, + "learning_rate": 0.00042683315167103196, + "loss": 3.0357, + "step": 22102 + }, + { + "epoch": 1.08, + "grad_norm": 0.5417753458023071, + "learning_rate": 0.00042681919895788683, + "loss": 3.1901, + "step": 22103 + }, + { + "epoch": 1.08, + "grad_norm": 0.4963914155960083, + "learning_rate": 0.0004268052459107244, + "loss": 3.3258, + "step": 22104 + }, + { + "epoch": 1.08, + "grad_norm": 0.5311129689216614, + "learning_rate": 0.0004267912925295813, + "loss": 3.1505, + "step": 22105 + }, + { + "epoch": 1.08, + "grad_norm": 0.5351983904838562, + "learning_rate": 0.00042677733881449446, + "loss": 3.0273, + "step": 22106 + }, + { + "epoch": 1.08, + "grad_norm": 0.5361194610595703, + "learning_rate": 0.0004267633847655004, + "loss": 2.9613, + "step": 22107 + }, + { + "epoch": 1.08, + "grad_norm": 0.5353830456733704, + "learning_rate": 0.000426749430382636, + "loss": 3.164, + "step": 22108 + }, + { + "epoch": 1.08, + "grad_norm": 0.5087656378746033, + "learning_rate": 0.0004267354756659381, + "loss": 3.2214, + "step": 22109 + }, + { + "epoch": 1.08, + "grad_norm": 0.5443871021270752, + "learning_rate": 0.00042672152061544314, + "loss": 3.043, + "step": 22110 + }, + { + "epoch": 1.08, + "grad_norm": 0.567270815372467, + "learning_rate": 0.0004267075652311882, + "loss": 3.1497, + "step": 22111 + }, + { + "epoch": 1.08, + "grad_norm": 0.520046055316925, + "learning_rate": 0.00042669360951321, + "loss": 3.1484, + "step": 22112 + }, + { + "epoch": 1.08, + "grad_norm": 0.5559588670730591, + "learning_rate": 0.00042667965346154514, + "loss": 3.0016, + "step": 22113 + }, + { + "epoch": 1.08, + "grad_norm": 0.5984733700752258, + "learning_rate": 0.0004266656970762305, + "loss": 3.0313, + "step": 22114 + }, + { + "epoch": 1.08, + "grad_norm": 0.5269164443016052, + "learning_rate": 0.0004266517403573026, + "loss": 3.2197, + "step": 22115 + }, + { + "epoch": 1.08, + "grad_norm": 0.6935906410217285, + "learning_rate": 0.00042663778330479863, + "loss": 3.2497, + "step": 22116 + }, + { + "epoch": 1.08, + "grad_norm": 0.5183548927307129, + "learning_rate": 0.000426623825918755, + "loss": 3.3231, + "step": 22117 + }, + { + "epoch": 1.08, + "grad_norm": 0.5844008326530457, + "learning_rate": 0.0004266098681992086, + "loss": 3.2325, + "step": 22118 + }, + { + "epoch": 1.08, + "grad_norm": 0.4959857165813446, + "learning_rate": 0.0004265959101461962, + "loss": 3.2061, + "step": 22119 + }, + { + "epoch": 1.08, + "grad_norm": 0.5572559833526611, + "learning_rate": 0.0004265819517597545, + "loss": 2.9519, + "step": 22120 + }, + { + "epoch": 1.08, + "grad_norm": 0.5319798588752747, + "learning_rate": 0.0004265679930399203, + "loss": 3.1005, + "step": 22121 + }, + { + "epoch": 1.08, + "grad_norm": 0.5380662679672241, + "learning_rate": 0.00042655403398673036, + "loss": 3.1476, + "step": 22122 + }, + { + "epoch": 1.08, + "grad_norm": 0.5653159618377686, + "learning_rate": 0.0004265400746002214, + "loss": 3.0577, + "step": 22123 + }, + { + "epoch": 1.08, + "grad_norm": 0.5704825520515442, + "learning_rate": 0.0004265261148804304, + "loss": 3.1806, + "step": 22124 + }, + { + "epoch": 1.08, + "grad_norm": 0.5176539421081543, + "learning_rate": 0.0004265121548273938, + "loss": 3.1776, + "step": 22125 + }, + { + "epoch": 1.08, + "grad_norm": 0.5719426870346069, + "learning_rate": 0.0004264981944411486, + "loss": 3.0773, + "step": 22126 + }, + { + "epoch": 1.08, + "grad_norm": 0.48471060395240784, + "learning_rate": 0.0004264842337217315, + "loss": 3.0145, + "step": 22127 + }, + { + "epoch": 1.08, + "grad_norm": 0.515109121799469, + "learning_rate": 0.00042647027266917926, + "loss": 3.2385, + "step": 22128 + }, + { + "epoch": 1.08, + "grad_norm": 0.5453347563743591, + "learning_rate": 0.0004264563112835287, + "loss": 3.0087, + "step": 22129 + }, + { + "epoch": 1.08, + "grad_norm": 0.5508375763893127, + "learning_rate": 0.00042644234956481646, + "loss": 3.0716, + "step": 22130 + }, + { + "epoch": 1.08, + "grad_norm": 0.5232502818107605, + "learning_rate": 0.00042642838751307953, + "loss": 3.2418, + "step": 22131 + }, + { + "epoch": 1.08, + "grad_norm": 0.5501592755317688, + "learning_rate": 0.00042641442512835446, + "loss": 3.0997, + "step": 22132 + }, + { + "epoch": 1.08, + "grad_norm": 0.54051673412323, + "learning_rate": 0.00042640046241067817, + "loss": 3.012, + "step": 22133 + }, + { + "epoch": 1.08, + "grad_norm": 0.52208411693573, + "learning_rate": 0.00042638649936008736, + "loss": 3.0246, + "step": 22134 + }, + { + "epoch": 1.08, + "grad_norm": 0.6216174364089966, + "learning_rate": 0.0004263725359766189, + "loss": 3.0755, + "step": 22135 + }, + { + "epoch": 1.08, + "grad_norm": 0.5394618511199951, + "learning_rate": 0.00042635857226030934, + "loss": 3.1135, + "step": 22136 + }, + { + "epoch": 1.08, + "grad_norm": 0.5192962288856506, + "learning_rate": 0.00042634460821119576, + "loss": 3.096, + "step": 22137 + }, + { + "epoch": 1.08, + "grad_norm": 0.5286705493927002, + "learning_rate": 0.0004263306438293148, + "loss": 3.0381, + "step": 22138 + }, + { + "epoch": 1.08, + "grad_norm": 0.5518622398376465, + "learning_rate": 0.0004263166791147032, + "loss": 3.0308, + "step": 22139 + }, + { + "epoch": 1.09, + "grad_norm": 0.5467196702957153, + "learning_rate": 0.0004263027140673978, + "loss": 3.172, + "step": 22140 + }, + { + "epoch": 1.09, + "grad_norm": 0.5387907028198242, + "learning_rate": 0.00042628874868743527, + "loss": 3.3111, + "step": 22141 + }, + { + "epoch": 1.09, + "grad_norm": 0.5448318719863892, + "learning_rate": 0.00042627478297485263, + "loss": 2.9398, + "step": 22142 + }, + { + "epoch": 1.09, + "grad_norm": 0.57187819480896, + "learning_rate": 0.0004262608169296864, + "loss": 3.2116, + "step": 22143 + }, + { + "epoch": 1.09, + "grad_norm": 0.6088796257972717, + "learning_rate": 0.00042624685055197355, + "loss": 3.0892, + "step": 22144 + }, + { + "epoch": 1.09, + "grad_norm": 0.5294945240020752, + "learning_rate": 0.00042623288384175073, + "loss": 3.0999, + "step": 22145 + }, + { + "epoch": 1.09, + "grad_norm": 0.5282583236694336, + "learning_rate": 0.00042621891679905477, + "loss": 3.2845, + "step": 22146 + }, + { + "epoch": 1.09, + "grad_norm": 0.556050181388855, + "learning_rate": 0.00042620494942392247, + "loss": 3.1564, + "step": 22147 + }, + { + "epoch": 1.09, + "grad_norm": 0.5315901041030884, + "learning_rate": 0.0004261909817163907, + "loss": 3.0353, + "step": 22148 + }, + { + "epoch": 1.09, + "grad_norm": 0.5163993239402771, + "learning_rate": 0.00042617701367649616, + "loss": 3.0004, + "step": 22149 + }, + { + "epoch": 1.09, + "grad_norm": 0.5579254031181335, + "learning_rate": 0.00042616304530427565, + "loss": 3.0205, + "step": 22150 + }, + { + "epoch": 1.09, + "grad_norm": 0.5503444671630859, + "learning_rate": 0.00042614907659976593, + "loss": 3.187, + "step": 22151 + }, + { + "epoch": 1.09, + "grad_norm": 0.5766018033027649, + "learning_rate": 0.0004261351075630038, + "loss": 3.2087, + "step": 22152 + }, + { + "epoch": 1.09, + "grad_norm": 0.5432386994361877, + "learning_rate": 0.00042612113819402616, + "loss": 3.0861, + "step": 22153 + }, + { + "epoch": 1.09, + "grad_norm": 0.5107462406158447, + "learning_rate": 0.0004261071684928696, + "loss": 2.9946, + "step": 22154 + }, + { + "epoch": 1.09, + "grad_norm": 0.5693354606628418, + "learning_rate": 0.00042609319845957115, + "loss": 2.9644, + "step": 22155 + }, + { + "epoch": 1.09, + "grad_norm": 0.6248636841773987, + "learning_rate": 0.00042607922809416736, + "loss": 3.1401, + "step": 22156 + }, + { + "epoch": 1.09, + "grad_norm": 0.5272868871688843, + "learning_rate": 0.0004260652573966952, + "loss": 2.98, + "step": 22157 + }, + { + "epoch": 1.09, + "grad_norm": 0.5685650706291199, + "learning_rate": 0.00042605128636719145, + "loss": 2.7644, + "step": 22158 + }, + { + "epoch": 1.09, + "grad_norm": 0.5144070982933044, + "learning_rate": 0.0004260373150056928, + "loss": 3.078, + "step": 22159 + }, + { + "epoch": 1.09, + "grad_norm": 0.5530477166175842, + "learning_rate": 0.00042602334331223615, + "loss": 3.1147, + "step": 22160 + }, + { + "epoch": 1.09, + "grad_norm": 0.5493810772895813, + "learning_rate": 0.0004260093712868583, + "loss": 3.2218, + "step": 22161 + }, + { + "epoch": 1.09, + "grad_norm": 0.5624595284461975, + "learning_rate": 0.00042599539892959593, + "loss": 3.3188, + "step": 22162 + }, + { + "epoch": 1.09, + "grad_norm": 0.5464377999305725, + "learning_rate": 0.00042598142624048597, + "loss": 3.1378, + "step": 22163 + }, + { + "epoch": 1.09, + "grad_norm": 0.5302308201789856, + "learning_rate": 0.00042596745321956525, + "loss": 3.4019, + "step": 22164 + }, + { + "epoch": 1.09, + "grad_norm": 0.5592415928840637, + "learning_rate": 0.0004259534798668705, + "loss": 2.9761, + "step": 22165 + }, + { + "epoch": 1.09, + "grad_norm": 0.6183668971061707, + "learning_rate": 0.0004259395061824384, + "loss": 2.8968, + "step": 22166 + }, + { + "epoch": 1.09, + "grad_norm": 0.5616676807403564, + "learning_rate": 0.00042592553216630593, + "loss": 3.13, + "step": 22167 + }, + { + "epoch": 1.09, + "grad_norm": 0.5477641820907593, + "learning_rate": 0.0004259115578185099, + "loss": 3.1877, + "step": 22168 + }, + { + "epoch": 1.09, + "grad_norm": 0.5532886385917664, + "learning_rate": 0.000425897583139087, + "loss": 2.7935, + "step": 22169 + }, + { + "epoch": 1.09, + "grad_norm": 0.5679529309272766, + "learning_rate": 0.00042588360812807416, + "loss": 3.2103, + "step": 22170 + }, + { + "epoch": 1.09, + "grad_norm": 0.5274649262428284, + "learning_rate": 0.00042586963278550807, + "loss": 2.974, + "step": 22171 + }, + { + "epoch": 1.09, + "grad_norm": 0.5478577613830566, + "learning_rate": 0.0004258556571114256, + "loss": 3.0606, + "step": 22172 + }, + { + "epoch": 1.09, + "grad_norm": 0.5519079566001892, + "learning_rate": 0.00042584168110586354, + "loss": 3.1437, + "step": 22173 + }, + { + "epoch": 1.09, + "grad_norm": 0.5189817547798157, + "learning_rate": 0.00042582770476885877, + "loss": 3.0276, + "step": 22174 + }, + { + "epoch": 1.09, + "grad_norm": 0.544884979724884, + "learning_rate": 0.00042581372810044797, + "loss": 3.0585, + "step": 22175 + }, + { + "epoch": 1.09, + "grad_norm": 0.5363135933876038, + "learning_rate": 0.0004257997511006681, + "loss": 3.2155, + "step": 22176 + }, + { + "epoch": 1.09, + "grad_norm": 0.5269807577133179, + "learning_rate": 0.0004257857737695558, + "loss": 3.0673, + "step": 22177 + }, + { + "epoch": 1.09, + "grad_norm": 0.535571813583374, + "learning_rate": 0.00042577179610714795, + "loss": 3.3365, + "step": 22178 + }, + { + "epoch": 1.09, + "grad_norm": 0.6153188943862915, + "learning_rate": 0.00042575781811348155, + "loss": 2.9663, + "step": 22179 + }, + { + "epoch": 1.09, + "grad_norm": 0.5156885385513306, + "learning_rate": 0.0004257438397885932, + "loss": 3.134, + "step": 22180 + }, + { + "epoch": 1.09, + "grad_norm": 0.5604032874107361, + "learning_rate": 0.00042572986113251975, + "loss": 3.2063, + "step": 22181 + }, + { + "epoch": 1.09, + "grad_norm": 0.5438750386238098, + "learning_rate": 0.000425715882145298, + "loss": 3.0467, + "step": 22182 + }, + { + "epoch": 1.09, + "grad_norm": 0.5508151054382324, + "learning_rate": 0.00042570190282696484, + "loss": 3.2381, + "step": 22183 + }, + { + "epoch": 1.09, + "grad_norm": 0.7027482986450195, + "learning_rate": 0.0004256879231775571, + "loss": 3.0696, + "step": 22184 + }, + { + "epoch": 1.09, + "grad_norm": 0.5488677620887756, + "learning_rate": 0.00042567394319711156, + "loss": 2.7966, + "step": 22185 + }, + { + "epoch": 1.09, + "grad_norm": 0.6230414509773254, + "learning_rate": 0.000425659962885665, + "loss": 3.157, + "step": 22186 + }, + { + "epoch": 1.09, + "grad_norm": 0.5432460308074951, + "learning_rate": 0.0004256459822432543, + "loss": 3.2616, + "step": 22187 + }, + { + "epoch": 1.09, + "grad_norm": 0.520793616771698, + "learning_rate": 0.00042563200126991625, + "loss": 3.0267, + "step": 22188 + }, + { + "epoch": 1.09, + "grad_norm": 0.533034086227417, + "learning_rate": 0.0004256180199656877, + "loss": 3.0244, + "step": 22189 + }, + { + "epoch": 1.09, + "grad_norm": 0.5230249762535095, + "learning_rate": 0.0004256040383306055, + "loss": 3.1633, + "step": 22190 + }, + { + "epoch": 1.09, + "grad_norm": 0.5286110639572144, + "learning_rate": 0.00042559005636470636, + "loss": 3.0445, + "step": 22191 + }, + { + "epoch": 1.09, + "grad_norm": 0.541115403175354, + "learning_rate": 0.00042557607406802715, + "loss": 3.1721, + "step": 22192 + }, + { + "epoch": 1.09, + "grad_norm": 0.5243132710456848, + "learning_rate": 0.0004255620914406048, + "loss": 3.2148, + "step": 22193 + }, + { + "epoch": 1.09, + "grad_norm": 0.5529786944389343, + "learning_rate": 0.00042554810848247613, + "loss": 3.1431, + "step": 22194 + }, + { + "epoch": 1.09, + "grad_norm": 0.5576296448707581, + "learning_rate": 0.00042553412519367783, + "loss": 3.0609, + "step": 22195 + }, + { + "epoch": 1.09, + "grad_norm": 0.5397801995277405, + "learning_rate": 0.00042552014157424677, + "loss": 2.8357, + "step": 22196 + }, + { + "epoch": 1.09, + "grad_norm": 0.5517771244049072, + "learning_rate": 0.00042550615762421986, + "loss": 3.21, + "step": 22197 + }, + { + "epoch": 1.09, + "grad_norm": 0.5241683721542358, + "learning_rate": 0.0004254921733436339, + "loss": 3.1942, + "step": 22198 + }, + { + "epoch": 1.09, + "grad_norm": 0.5698009729385376, + "learning_rate": 0.00042547818873252567, + "loss": 3.1291, + "step": 22199 + }, + { + "epoch": 1.09, + "grad_norm": 0.5370516777038574, + "learning_rate": 0.000425464203790932, + "loss": 3.3494, + "step": 22200 + }, + { + "epoch": 1.09, + "grad_norm": 0.6023409962654114, + "learning_rate": 0.0004254502185188899, + "loss": 3.1343, + "step": 22201 + }, + { + "epoch": 1.09, + "grad_norm": 0.5534052848815918, + "learning_rate": 0.000425436232916436, + "loss": 3.1015, + "step": 22202 + }, + { + "epoch": 1.09, + "grad_norm": 0.5142847299575806, + "learning_rate": 0.00042542224698360713, + "loss": 2.9254, + "step": 22203 + }, + { + "epoch": 1.09, + "grad_norm": 0.5432417392730713, + "learning_rate": 0.00042540826072044036, + "loss": 3.1449, + "step": 22204 + }, + { + "epoch": 1.09, + "grad_norm": 0.5595294833183289, + "learning_rate": 0.0004253942741269722, + "loss": 3.3928, + "step": 22205 + }, + { + "epoch": 1.09, + "grad_norm": 0.5585522651672363, + "learning_rate": 0.00042538028720323976, + "loss": 3.2472, + "step": 22206 + }, + { + "epoch": 1.09, + "grad_norm": 0.6130068302154541, + "learning_rate": 0.0004253662999492797, + "loss": 3.0106, + "step": 22207 + }, + { + "epoch": 1.09, + "grad_norm": 0.5520464777946472, + "learning_rate": 0.000425352312365129, + "loss": 3.2156, + "step": 22208 + }, + { + "epoch": 1.09, + "grad_norm": 0.5398738384246826, + "learning_rate": 0.0004253383244508243, + "loss": 3.2174, + "step": 22209 + }, + { + "epoch": 1.09, + "grad_norm": 0.5600001215934753, + "learning_rate": 0.00042532433620640276, + "loss": 3.0729, + "step": 22210 + }, + { + "epoch": 1.09, + "grad_norm": 0.5530575513839722, + "learning_rate": 0.00042531034763190095, + "loss": 3.0908, + "step": 22211 + }, + { + "epoch": 1.09, + "grad_norm": 0.550061047077179, + "learning_rate": 0.0004252963587273558, + "loss": 3.0893, + "step": 22212 + }, + { + "epoch": 1.09, + "grad_norm": 0.5559527277946472, + "learning_rate": 0.0004252823694928041, + "loss": 2.8335, + "step": 22213 + }, + { + "epoch": 1.09, + "grad_norm": 0.7743620276451111, + "learning_rate": 0.00042526837992828287, + "loss": 3.1083, + "step": 22214 + }, + { + "epoch": 1.09, + "grad_norm": 0.5912230610847473, + "learning_rate": 0.0004252543900338288, + "loss": 3.0625, + "step": 22215 + }, + { + "epoch": 1.09, + "grad_norm": 0.5378463864326477, + "learning_rate": 0.0004252403998094787, + "loss": 3.0251, + "step": 22216 + }, + { + "epoch": 1.09, + "grad_norm": 0.563388466835022, + "learning_rate": 0.0004252264092552696, + "loss": 3.153, + "step": 22217 + }, + { + "epoch": 1.09, + "grad_norm": 0.5485290288925171, + "learning_rate": 0.0004252124183712381, + "loss": 3.3472, + "step": 22218 + }, + { + "epoch": 1.09, + "grad_norm": 0.5503448247909546, + "learning_rate": 0.0004251984271574213, + "loss": 3.1022, + "step": 22219 + }, + { + "epoch": 1.09, + "grad_norm": 0.5638839602470398, + "learning_rate": 0.0004251844356138559, + "loss": 3.016, + "step": 22220 + }, + { + "epoch": 1.09, + "grad_norm": 0.5732755064964294, + "learning_rate": 0.0004251704437405788, + "loss": 2.9927, + "step": 22221 + }, + { + "epoch": 1.09, + "grad_norm": 0.5192015171051025, + "learning_rate": 0.00042515645153762683, + "loss": 3.2112, + "step": 22222 + }, + { + "epoch": 1.09, + "grad_norm": 0.49721595644950867, + "learning_rate": 0.0004251424590050368, + "loss": 3.0738, + "step": 22223 + }, + { + "epoch": 1.09, + "grad_norm": 0.5540822148323059, + "learning_rate": 0.00042512846614284566, + "loss": 3.1562, + "step": 22224 + }, + { + "epoch": 1.09, + "grad_norm": 0.5434120893478394, + "learning_rate": 0.0004251144729510903, + "loss": 3.0941, + "step": 22225 + }, + { + "epoch": 1.09, + "grad_norm": 0.5510311722755432, + "learning_rate": 0.0004251004794298074, + "loss": 2.973, + "step": 22226 + }, + { + "epoch": 1.09, + "grad_norm": 0.5258462429046631, + "learning_rate": 0.000425086485579034, + "loss": 2.8038, + "step": 22227 + }, + { + "epoch": 1.09, + "grad_norm": 0.5398142337799072, + "learning_rate": 0.0004250724913988067, + "loss": 3.1084, + "step": 22228 + }, + { + "epoch": 1.09, + "grad_norm": 0.5346091985702515, + "learning_rate": 0.00042505849688916265, + "loss": 2.9255, + "step": 22229 + }, + { + "epoch": 1.09, + "grad_norm": 0.5627778172492981, + "learning_rate": 0.0004250445020501387, + "loss": 3.1123, + "step": 22230 + }, + { + "epoch": 1.09, + "grad_norm": 0.5590901970863342, + "learning_rate": 0.0004250305068817715, + "loss": 3.1776, + "step": 22231 + }, + { + "epoch": 1.09, + "grad_norm": 0.5717828869819641, + "learning_rate": 0.0004250165113840979, + "loss": 3.1254, + "step": 22232 + }, + { + "epoch": 1.09, + "grad_norm": 0.5412927269935608, + "learning_rate": 0.00042500251555715497, + "loss": 3.1597, + "step": 22233 + }, + { + "epoch": 1.09, + "grad_norm": 0.5460436344146729, + "learning_rate": 0.0004249885194009794, + "loss": 3.062, + "step": 22234 + }, + { + "epoch": 1.09, + "grad_norm": 0.5982441306114197, + "learning_rate": 0.0004249745229156082, + "loss": 3.0677, + "step": 22235 + }, + { + "epoch": 1.09, + "grad_norm": 0.6243758201599121, + "learning_rate": 0.00042496052610107817, + "loss": 3.0814, + "step": 22236 + }, + { + "epoch": 1.09, + "grad_norm": 0.5570179224014282, + "learning_rate": 0.00042494652895742614, + "loss": 3.1024, + "step": 22237 + }, + { + "epoch": 1.09, + "grad_norm": 0.5606381297111511, + "learning_rate": 0.00042493253148468893, + "loss": 3.1431, + "step": 22238 + }, + { + "epoch": 1.09, + "grad_norm": 0.5624200701713562, + "learning_rate": 0.0004249185336829035, + "loss": 3.0207, + "step": 22239 + }, + { + "epoch": 1.09, + "grad_norm": 0.5409414768218994, + "learning_rate": 0.00042490453555210677, + "loss": 3.0627, + "step": 22240 + }, + { + "epoch": 1.09, + "grad_norm": 0.5465520024299622, + "learning_rate": 0.00042489053709233554, + "loss": 3.1536, + "step": 22241 + }, + { + "epoch": 1.09, + "grad_norm": 0.5893270373344421, + "learning_rate": 0.00042487653830362664, + "loss": 3.2021, + "step": 22242 + }, + { + "epoch": 1.09, + "grad_norm": 0.5436384081840515, + "learning_rate": 0.0004248625391860168, + "loss": 3.0846, + "step": 22243 + }, + { + "epoch": 1.09, + "grad_norm": 0.5279334187507629, + "learning_rate": 0.00042484853973954334, + "loss": 2.9912, + "step": 22244 + }, + { + "epoch": 1.09, + "grad_norm": 0.5567160844802856, + "learning_rate": 0.00042483453996424266, + "loss": 3.0722, + "step": 22245 + }, + { + "epoch": 1.09, + "grad_norm": 0.5820968151092529, + "learning_rate": 0.00042482053986015193, + "loss": 3.2603, + "step": 22246 + }, + { + "epoch": 1.09, + "grad_norm": 0.5296043753623962, + "learning_rate": 0.00042480653942730785, + "loss": 2.8998, + "step": 22247 + }, + { + "epoch": 1.09, + "grad_norm": 0.5903947353363037, + "learning_rate": 0.0004247925386657475, + "loss": 2.9915, + "step": 22248 + }, + { + "epoch": 1.09, + "grad_norm": 0.5724281072616577, + "learning_rate": 0.00042477853757550744, + "loss": 3.0855, + "step": 22249 + }, + { + "epoch": 1.09, + "grad_norm": 0.5451366305351257, + "learning_rate": 0.0004247645361566248, + "loss": 3.2043, + "step": 22250 + }, + { + "epoch": 1.09, + "grad_norm": 0.5273775458335876, + "learning_rate": 0.0004247505344091364, + "loss": 3.0935, + "step": 22251 + }, + { + "epoch": 1.09, + "grad_norm": 0.5182363390922546, + "learning_rate": 0.00042473653233307914, + "loss": 3.0011, + "step": 22252 + }, + { + "epoch": 1.09, + "grad_norm": 0.5897952914237976, + "learning_rate": 0.0004247225299284898, + "loss": 3.0319, + "step": 22253 + }, + { + "epoch": 1.09, + "grad_norm": 0.5534531474113464, + "learning_rate": 0.00042470852719540526, + "loss": 3.067, + "step": 22254 + }, + { + "epoch": 1.09, + "grad_norm": 0.6196662187576294, + "learning_rate": 0.0004246945241338626, + "loss": 3.1267, + "step": 22255 + }, + { + "epoch": 1.09, + "grad_norm": 0.5049815773963928, + "learning_rate": 0.0004246805207438985, + "loss": 3.0543, + "step": 22256 + }, + { + "epoch": 1.09, + "grad_norm": 0.5456374287605286, + "learning_rate": 0.00042466651702554984, + "loss": 2.9115, + "step": 22257 + }, + { + "epoch": 1.09, + "grad_norm": 0.5522916913032532, + "learning_rate": 0.00042465251297885365, + "loss": 3.1055, + "step": 22258 + }, + { + "epoch": 1.09, + "grad_norm": 0.5380954146385193, + "learning_rate": 0.0004246385086038467, + "loss": 2.957, + "step": 22259 + }, + { + "epoch": 1.09, + "grad_norm": 0.5323505997657776, + "learning_rate": 0.00042462450390056593, + "loss": 3.18, + "step": 22260 + }, + { + "epoch": 1.09, + "grad_norm": 0.5911219120025635, + "learning_rate": 0.0004246104988690481, + "loss": 3.0147, + "step": 22261 + }, + { + "epoch": 1.09, + "grad_norm": 0.5536802411079407, + "learning_rate": 0.00042459649350933026, + "loss": 2.9749, + "step": 22262 + }, + { + "epoch": 1.09, + "grad_norm": 0.5558472275733948, + "learning_rate": 0.00042458248782144937, + "loss": 3.0357, + "step": 22263 + }, + { + "epoch": 1.09, + "grad_norm": 0.5627835988998413, + "learning_rate": 0.00042456848180544196, + "loss": 3.1438, + "step": 22264 + }, + { + "epoch": 1.09, + "grad_norm": 0.5418185591697693, + "learning_rate": 0.00042455447546134526, + "loss": 3.2752, + "step": 22265 + }, + { + "epoch": 1.09, + "grad_norm": 0.5398939251899719, + "learning_rate": 0.0004245404687891961, + "loss": 3.2987, + "step": 22266 + }, + { + "epoch": 1.09, + "grad_norm": 0.5812855362892151, + "learning_rate": 0.0004245264617890312, + "loss": 3.2357, + "step": 22267 + }, + { + "epoch": 1.09, + "grad_norm": 0.5603014826774597, + "learning_rate": 0.00042451245446088764, + "loss": 3.1168, + "step": 22268 + }, + { + "epoch": 1.09, + "grad_norm": 0.5525497794151306, + "learning_rate": 0.00042449844680480213, + "loss": 3.0883, + "step": 22269 + }, + { + "epoch": 1.09, + "grad_norm": 0.5205212235450745, + "learning_rate": 0.0004244844388208118, + "loss": 2.9792, + "step": 22270 + }, + { + "epoch": 1.09, + "grad_norm": 0.5575414896011353, + "learning_rate": 0.00042447043050895334, + "loss": 3.0944, + "step": 22271 + }, + { + "epoch": 1.09, + "grad_norm": 0.5364728569984436, + "learning_rate": 0.00042445642186926373, + "loss": 2.9596, + "step": 22272 + }, + { + "epoch": 1.09, + "grad_norm": 0.5745815634727478, + "learning_rate": 0.0004244424129017799, + "loss": 3.0151, + "step": 22273 + }, + { + "epoch": 1.09, + "grad_norm": 0.5751864314079285, + "learning_rate": 0.0004244284036065387, + "loss": 3.1432, + "step": 22274 + }, + { + "epoch": 1.09, + "grad_norm": 0.5519656538963318, + "learning_rate": 0.00042441439398357696, + "loss": 3.0842, + "step": 22275 + }, + { + "epoch": 1.09, + "grad_norm": 0.5192199945449829, + "learning_rate": 0.0004244003840329317, + "loss": 3.051, + "step": 22276 + }, + { + "epoch": 1.09, + "grad_norm": 0.5178653597831726, + "learning_rate": 0.00042438637375463977, + "loss": 2.9998, + "step": 22277 + }, + { + "epoch": 1.09, + "grad_norm": 0.5251026153564453, + "learning_rate": 0.00042437236314873814, + "loss": 3.2947, + "step": 22278 + }, + { + "epoch": 1.09, + "grad_norm": 0.5604255199432373, + "learning_rate": 0.00042435835221526347, + "loss": 3.0916, + "step": 22279 + }, + { + "epoch": 1.09, + "grad_norm": 0.5459524393081665, + "learning_rate": 0.000424344340954253, + "loss": 2.81, + "step": 22280 + }, + { + "epoch": 1.09, + "grad_norm": 0.5570917725563049, + "learning_rate": 0.0004243303293657434, + "loss": 2.95, + "step": 22281 + }, + { + "epoch": 1.09, + "grad_norm": 0.5491553544998169, + "learning_rate": 0.00042431631744977165, + "loss": 2.8636, + "step": 22282 + }, + { + "epoch": 1.09, + "grad_norm": 0.5401395559310913, + "learning_rate": 0.0004243023052063746, + "loss": 3.0923, + "step": 22283 + }, + { + "epoch": 1.09, + "grad_norm": 0.619458019733429, + "learning_rate": 0.00042428829263558925, + "loss": 3.1343, + "step": 22284 + }, + { + "epoch": 1.09, + "grad_norm": 0.6060642600059509, + "learning_rate": 0.0004242742797374524, + "loss": 2.9652, + "step": 22285 + }, + { + "epoch": 1.09, + "grad_norm": 0.5287819504737854, + "learning_rate": 0.0004242602665120011, + "loss": 2.9648, + "step": 22286 + }, + { + "epoch": 1.09, + "grad_norm": 0.5329878330230713, + "learning_rate": 0.0004242462529592721, + "loss": 3.0809, + "step": 22287 + }, + { + "epoch": 1.09, + "grad_norm": 0.5269094705581665, + "learning_rate": 0.00042423223907930244, + "loss": 3.151, + "step": 22288 + }, + { + "epoch": 1.09, + "grad_norm": 0.5310917496681213, + "learning_rate": 0.00042421822487212897, + "loss": 3.0105, + "step": 22289 + }, + { + "epoch": 1.09, + "grad_norm": 0.5504202246665955, + "learning_rate": 0.0004242042103377885, + "loss": 3.1283, + "step": 22290 + }, + { + "epoch": 1.09, + "grad_norm": 0.5809419751167297, + "learning_rate": 0.00042419019547631815, + "loss": 3.0451, + "step": 22291 + }, + { + "epoch": 1.09, + "grad_norm": 0.5401477217674255, + "learning_rate": 0.0004241761802877547, + "loss": 2.8967, + "step": 22292 + }, + { + "epoch": 1.09, + "grad_norm": 0.5379841327667236, + "learning_rate": 0.00042416216477213507, + "loss": 3.0813, + "step": 22293 + }, + { + "epoch": 1.09, + "grad_norm": 0.5749803781509399, + "learning_rate": 0.0004241481489294962, + "loss": 3.0934, + "step": 22294 + }, + { + "epoch": 1.09, + "grad_norm": 0.5080057978630066, + "learning_rate": 0.000424134132759875, + "loss": 3.249, + "step": 22295 + }, + { + "epoch": 1.09, + "grad_norm": 0.5740684270858765, + "learning_rate": 0.0004241201162633084, + "loss": 3.2469, + "step": 22296 + }, + { + "epoch": 1.09, + "grad_norm": 0.6056188344955444, + "learning_rate": 0.0004241060994398333, + "loss": 3.118, + "step": 22297 + }, + { + "epoch": 1.09, + "grad_norm": 0.5314207673072815, + "learning_rate": 0.0004240920822894866, + "loss": 3.1676, + "step": 22298 + }, + { + "epoch": 1.09, + "grad_norm": 0.5285966992378235, + "learning_rate": 0.00042407806481230525, + "loss": 3.1967, + "step": 22299 + }, + { + "epoch": 1.09, + "grad_norm": 0.5155075192451477, + "learning_rate": 0.0004240640470083261, + "loss": 3.0571, + "step": 22300 + }, + { + "epoch": 1.09, + "grad_norm": 0.5423807501792908, + "learning_rate": 0.0004240500288775861, + "loss": 2.8794, + "step": 22301 + }, + { + "epoch": 1.09, + "grad_norm": 0.5311214327812195, + "learning_rate": 0.00042403601042012237, + "loss": 3.1135, + "step": 22302 + }, + { + "epoch": 1.09, + "grad_norm": 0.530709981918335, + "learning_rate": 0.00042402199163597164, + "loss": 3.0584, + "step": 22303 + }, + { + "epoch": 1.09, + "grad_norm": 0.5539402961730957, + "learning_rate": 0.00042400797252517075, + "loss": 3.2242, + "step": 22304 + }, + { + "epoch": 1.09, + "grad_norm": 0.5307897925376892, + "learning_rate": 0.0004239939530877567, + "loss": 2.8867, + "step": 22305 + }, + { + "epoch": 1.09, + "grad_norm": 0.5629311800003052, + "learning_rate": 0.0004239799333237666, + "loss": 3.2001, + "step": 22306 + }, + { + "epoch": 1.09, + "grad_norm": 0.533023476600647, + "learning_rate": 0.000423965913233237, + "loss": 2.9562, + "step": 22307 + }, + { + "epoch": 1.09, + "grad_norm": 0.5523614287376404, + "learning_rate": 0.0004239518928162052, + "loss": 3.0158, + "step": 22308 + }, + { + "epoch": 1.09, + "grad_norm": 0.5242599844932556, + "learning_rate": 0.00042393787207270793, + "loss": 3.0386, + "step": 22309 + }, + { + "epoch": 1.09, + "grad_norm": 0.5767312049865723, + "learning_rate": 0.0004239238510027821, + "loss": 3.1031, + "step": 22310 + }, + { + "epoch": 1.09, + "grad_norm": 0.5401198863983154, + "learning_rate": 0.0004239098296064648, + "loss": 3.0174, + "step": 22311 + }, + { + "epoch": 1.09, + "grad_norm": 0.5361244082450867, + "learning_rate": 0.0004238958078837927, + "loss": 3.1997, + "step": 22312 + }, + { + "epoch": 1.09, + "grad_norm": 0.5413596630096436, + "learning_rate": 0.000423881785834803, + "loss": 3.0465, + "step": 22313 + }, + { + "epoch": 1.09, + "grad_norm": 0.5148143172264099, + "learning_rate": 0.0004238677634595326, + "loss": 3.1441, + "step": 22314 + }, + { + "epoch": 1.09, + "grad_norm": 0.5160139203071594, + "learning_rate": 0.0004238537407580182, + "loss": 3.0543, + "step": 22315 + }, + { + "epoch": 1.09, + "grad_norm": 0.5179514288902283, + "learning_rate": 0.0004238397177302969, + "loss": 3.2671, + "step": 22316 + }, + { + "epoch": 1.09, + "grad_norm": 0.5649704337120056, + "learning_rate": 0.00042382569437640573, + "loss": 3.1858, + "step": 22317 + }, + { + "epoch": 1.09, + "grad_norm": 0.5431568622589111, + "learning_rate": 0.0004238116706963814, + "loss": 3.3245, + "step": 22318 + }, + { + "epoch": 1.09, + "grad_norm": 0.6058350205421448, + "learning_rate": 0.00042379764669026103, + "loss": 3.1804, + "step": 22319 + }, + { + "epoch": 1.09, + "grad_norm": 0.5485496520996094, + "learning_rate": 0.0004237836223580814, + "loss": 3.1041, + "step": 22320 + }, + { + "epoch": 1.09, + "grad_norm": 0.5472593903541565, + "learning_rate": 0.00042376959769987966, + "loss": 3.1254, + "step": 22321 + }, + { + "epoch": 1.09, + "grad_norm": 0.5176267027854919, + "learning_rate": 0.00042375557271569247, + "loss": 3.2074, + "step": 22322 + }, + { + "epoch": 1.09, + "grad_norm": 0.5529015064239502, + "learning_rate": 0.000423741547405557, + "loss": 3.1134, + "step": 22323 + }, + { + "epoch": 1.09, + "grad_norm": 0.5462213158607483, + "learning_rate": 0.00042372752176951013, + "loss": 3.1059, + "step": 22324 + }, + { + "epoch": 1.09, + "grad_norm": 0.5348901152610779, + "learning_rate": 0.0004237134958075887, + "loss": 3.1225, + "step": 22325 + }, + { + "epoch": 1.09, + "grad_norm": 0.6045992374420166, + "learning_rate": 0.0004236994695198298, + "loss": 3.0944, + "step": 22326 + }, + { + "epoch": 1.09, + "grad_norm": 0.5294902920722961, + "learning_rate": 0.0004236854429062703, + "loss": 3.1684, + "step": 22327 + }, + { + "epoch": 1.09, + "grad_norm": 0.5004157423973083, + "learning_rate": 0.00042367141596694717, + "loss": 2.921, + "step": 22328 + }, + { + "epoch": 1.09, + "grad_norm": 0.5467502474784851, + "learning_rate": 0.00042365738870189725, + "loss": 3.0567, + "step": 22329 + }, + { + "epoch": 1.09, + "grad_norm": 0.5614712238311768, + "learning_rate": 0.0004236433611111576, + "loss": 3.165, + "step": 22330 + }, + { + "epoch": 1.09, + "grad_norm": 0.5636160373687744, + "learning_rate": 0.0004236293331947651, + "loss": 2.931, + "step": 22331 + }, + { + "epoch": 1.09, + "grad_norm": 0.510614275932312, + "learning_rate": 0.0004236153049527568, + "loss": 2.879, + "step": 22332 + }, + { + "epoch": 1.09, + "grad_norm": 0.5611421465873718, + "learning_rate": 0.00042360127638516954, + "loss": 3.0598, + "step": 22333 + }, + { + "epoch": 1.09, + "grad_norm": 0.5256950855255127, + "learning_rate": 0.0004235872474920403, + "loss": 2.9983, + "step": 22334 + }, + { + "epoch": 1.09, + "grad_norm": 0.528252899646759, + "learning_rate": 0.00042357321827340603, + "loss": 3.2655, + "step": 22335 + }, + { + "epoch": 1.09, + "grad_norm": 0.528097927570343, + "learning_rate": 0.0004235591887293037, + "loss": 2.9888, + "step": 22336 + }, + { + "epoch": 1.09, + "grad_norm": 0.4826788008213043, + "learning_rate": 0.0004235451588597702, + "loss": 3.2171, + "step": 22337 + }, + { + "epoch": 1.09, + "grad_norm": 0.5288520455360413, + "learning_rate": 0.00042353112866484266, + "loss": 3.1955, + "step": 22338 + }, + { + "epoch": 1.09, + "grad_norm": 0.5336189866065979, + "learning_rate": 0.0004235170981445579, + "loss": 3.0057, + "step": 22339 + }, + { + "epoch": 1.09, + "grad_norm": 0.5776178240776062, + "learning_rate": 0.0004235030672989527, + "loss": 2.89, + "step": 22340 + }, + { + "epoch": 1.09, + "grad_norm": 0.533409059047699, + "learning_rate": 0.00042348903612806423, + "loss": 3.0783, + "step": 22341 + }, + { + "epoch": 1.09, + "grad_norm": 0.547615647315979, + "learning_rate": 0.0004234750046319294, + "loss": 3.1035, + "step": 22342 + }, + { + "epoch": 1.09, + "grad_norm": 0.5494769215583801, + "learning_rate": 0.00042346097281058536, + "loss": 3.252, + "step": 22343 + }, + { + "epoch": 1.1, + "grad_norm": 0.570833146572113, + "learning_rate": 0.0004234469406640687, + "loss": 3.049, + "step": 22344 + }, + { + "epoch": 1.1, + "grad_norm": 0.5642995834350586, + "learning_rate": 0.0004234329081924166, + "loss": 3.1304, + "step": 22345 + }, + { + "epoch": 1.1, + "grad_norm": 0.5693486332893372, + "learning_rate": 0.00042341887539566595, + "loss": 2.8819, + "step": 22346 + }, + { + "epoch": 1.1, + "grad_norm": 0.5467201471328735, + "learning_rate": 0.0004234048422738538, + "loss": 3.0296, + "step": 22347 + }, + { + "epoch": 1.1, + "grad_norm": 0.5357093811035156, + "learning_rate": 0.00042339080882701697, + "loss": 3.0394, + "step": 22348 + }, + { + "epoch": 1.1, + "grad_norm": 0.5732297301292419, + "learning_rate": 0.00042337677505519254, + "loss": 3.1511, + "step": 22349 + }, + { + "epoch": 1.1, + "grad_norm": 0.5539453029632568, + "learning_rate": 0.00042336274095841744, + "loss": 3.0462, + "step": 22350 + }, + { + "epoch": 1.1, + "grad_norm": 0.5515852570533752, + "learning_rate": 0.00042334870653672863, + "loss": 2.9138, + "step": 22351 + }, + { + "epoch": 1.1, + "grad_norm": 0.5160804986953735, + "learning_rate": 0.00042333467179016303, + "loss": 3.0142, + "step": 22352 + }, + { + "epoch": 1.1, + "grad_norm": 0.5109854936599731, + "learning_rate": 0.0004233206367187576, + "loss": 3.1363, + "step": 22353 + }, + { + "epoch": 1.1, + "grad_norm": 0.5832191705703735, + "learning_rate": 0.0004233066013225495, + "loss": 3.0025, + "step": 22354 + }, + { + "epoch": 1.1, + "grad_norm": 0.5390937924385071, + "learning_rate": 0.00042329256560157537, + "loss": 3.1417, + "step": 22355 + }, + { + "epoch": 1.1, + "grad_norm": 0.5653042793273926, + "learning_rate": 0.0004232785295558724, + "loss": 3.1148, + "step": 22356 + }, + { + "epoch": 1.1, + "grad_norm": 0.5726116895675659, + "learning_rate": 0.0004232644931854775, + "loss": 3.2039, + "step": 22357 + }, + { + "epoch": 1.1, + "grad_norm": 0.5899278521537781, + "learning_rate": 0.0004232504564904278, + "loss": 3.2208, + "step": 22358 + }, + { + "epoch": 1.1, + "grad_norm": 0.5757837295532227, + "learning_rate": 0.00042323641947075995, + "loss": 3.1871, + "step": 22359 + }, + { + "epoch": 1.1, + "grad_norm": 0.5611856579780579, + "learning_rate": 0.0004232223821265111, + "loss": 3.1254, + "step": 22360 + }, + { + "epoch": 1.1, + "grad_norm": 0.568060576915741, + "learning_rate": 0.00042320834445771827, + "loss": 3.2821, + "step": 22361 + }, + { + "epoch": 1.1, + "grad_norm": 0.5462589859962463, + "learning_rate": 0.0004231943064644183, + "loss": 3.0981, + "step": 22362 + }, + { + "epoch": 1.1, + "grad_norm": 0.5303399562835693, + "learning_rate": 0.0004231802681466483, + "loss": 3.2174, + "step": 22363 + }, + { + "epoch": 1.1, + "grad_norm": 0.5866662263870239, + "learning_rate": 0.0004231662295044451, + "loss": 2.9066, + "step": 22364 + }, + { + "epoch": 1.1, + "grad_norm": 0.5583893060684204, + "learning_rate": 0.0004231521905378459, + "loss": 2.9209, + "step": 22365 + }, + { + "epoch": 1.1, + "grad_norm": 0.5403869152069092, + "learning_rate": 0.00042313815124688745, + "loss": 3.0106, + "step": 22366 + }, + { + "epoch": 1.1, + "grad_norm": 0.5992947220802307, + "learning_rate": 0.0004231241116316068, + "loss": 3.2221, + "step": 22367 + }, + { + "epoch": 1.1, + "grad_norm": 0.5342088937759399, + "learning_rate": 0.0004231100716920409, + "loss": 2.932, + "step": 22368 + }, + { + "epoch": 1.1, + "grad_norm": 0.5689389705657959, + "learning_rate": 0.00042309603142822686, + "loss": 3.0559, + "step": 22369 + }, + { + "epoch": 1.1, + "grad_norm": 0.5494974851608276, + "learning_rate": 0.0004230819908402015, + "loss": 3.1754, + "step": 22370 + }, + { + "epoch": 1.1, + "grad_norm": 0.5982123613357544, + "learning_rate": 0.0004230679499280018, + "loss": 3.2388, + "step": 22371 + }, + { + "epoch": 1.1, + "grad_norm": 0.5255215167999268, + "learning_rate": 0.00042305390869166494, + "loss": 3.2487, + "step": 22372 + }, + { + "epoch": 1.1, + "grad_norm": 0.5233165621757507, + "learning_rate": 0.00042303986713122766, + "loss": 3.0138, + "step": 22373 + }, + { + "epoch": 1.1, + "grad_norm": 0.5595539808273315, + "learning_rate": 0.0004230258252467271, + "loss": 3.1675, + "step": 22374 + }, + { + "epoch": 1.1, + "grad_norm": 0.5262620449066162, + "learning_rate": 0.00042301178303820025, + "loss": 3.3422, + "step": 22375 + }, + { + "epoch": 1.1, + "grad_norm": 0.5396249890327454, + "learning_rate": 0.00042299774050568394, + "loss": 3.0504, + "step": 22376 + }, + { + "epoch": 1.1, + "grad_norm": 0.5464133620262146, + "learning_rate": 0.00042298369764921524, + "loss": 3.2645, + "step": 22377 + }, + { + "epoch": 1.1, + "grad_norm": 0.561158299446106, + "learning_rate": 0.0004229696544688312, + "loss": 3.1811, + "step": 22378 + }, + { + "epoch": 1.1, + "grad_norm": 0.5148482918739319, + "learning_rate": 0.0004229556109645688, + "loss": 3.2374, + "step": 22379 + }, + { + "epoch": 1.1, + "grad_norm": 0.5801580548286438, + "learning_rate": 0.00042294156713646494, + "loss": 2.9333, + "step": 22380 + }, + { + "epoch": 1.1, + "grad_norm": 0.5597841739654541, + "learning_rate": 0.0004229275229845567, + "loss": 3.2094, + "step": 22381 + }, + { + "epoch": 1.1, + "grad_norm": 0.5296369791030884, + "learning_rate": 0.00042291347850888087, + "loss": 2.8841, + "step": 22382 + }, + { + "epoch": 1.1, + "grad_norm": 0.553082287311554, + "learning_rate": 0.00042289943370947476, + "loss": 3.1016, + "step": 22383 + }, + { + "epoch": 1.1, + "grad_norm": 0.6039657592773438, + "learning_rate": 0.0004228853885863751, + "loss": 3.0928, + "step": 22384 + }, + { + "epoch": 1.1, + "grad_norm": 0.5544348359107971, + "learning_rate": 0.000422871343139619, + "loss": 3.093, + "step": 22385 + }, + { + "epoch": 1.1, + "grad_norm": 0.5134222507476807, + "learning_rate": 0.00042285729736924344, + "loss": 3.134, + "step": 22386 + }, + { + "epoch": 1.1, + "grad_norm": 0.5257523059844971, + "learning_rate": 0.00042284325127528536, + "loss": 3.1072, + "step": 22387 + }, + { + "epoch": 1.1, + "grad_norm": 0.5168401598930359, + "learning_rate": 0.00042282920485778183, + "loss": 3.0503, + "step": 22388 + }, + { + "epoch": 1.1, + "grad_norm": 0.5573122501373291, + "learning_rate": 0.0004228151581167698, + "loss": 3.0215, + "step": 22389 + }, + { + "epoch": 1.1, + "grad_norm": 0.5416275858879089, + "learning_rate": 0.00042280111105228634, + "loss": 3.1959, + "step": 22390 + }, + { + "epoch": 1.1, + "grad_norm": 0.5634099841117859, + "learning_rate": 0.00042278706366436836, + "loss": 3.1155, + "step": 22391 + }, + { + "epoch": 1.1, + "grad_norm": 0.58818519115448, + "learning_rate": 0.0004227730159530528, + "loss": 3.0849, + "step": 22392 + }, + { + "epoch": 1.1, + "grad_norm": 0.5335586071014404, + "learning_rate": 0.0004227589679183768, + "loss": 2.8114, + "step": 22393 + }, + { + "epoch": 1.1, + "grad_norm": 0.5638708472251892, + "learning_rate": 0.00042274491956037735, + "loss": 2.9018, + "step": 22394 + }, + { + "epoch": 1.1, + "grad_norm": 0.5199167132377625, + "learning_rate": 0.0004227308708790914, + "loss": 2.95, + "step": 22395 + }, + { + "epoch": 1.1, + "grad_norm": 0.5466358661651611, + "learning_rate": 0.00042271682187455594, + "loss": 3.0032, + "step": 22396 + }, + { + "epoch": 1.1, + "grad_norm": 0.5106999278068542, + "learning_rate": 0.000422702772546808, + "loss": 3.1893, + "step": 22397 + }, + { + "epoch": 1.1, + "grad_norm": 0.5572851896286011, + "learning_rate": 0.00042268872289588454, + "loss": 2.9937, + "step": 22398 + }, + { + "epoch": 1.1, + "grad_norm": 0.5566379427909851, + "learning_rate": 0.0004226746729218226, + "loss": 2.8665, + "step": 22399 + }, + { + "epoch": 1.1, + "grad_norm": 0.5310401916503906, + "learning_rate": 0.00042266062262465913, + "loss": 3.1665, + "step": 22400 + }, + { + "epoch": 1.1, + "grad_norm": 0.5580877065658569, + "learning_rate": 0.0004226465720044313, + "loss": 3.0617, + "step": 22401 + }, + { + "epoch": 1.1, + "grad_norm": 0.5510329604148865, + "learning_rate": 0.0004226325210611759, + "loss": 3.1422, + "step": 22402 + }, + { + "epoch": 1.1, + "grad_norm": 0.6322705149650574, + "learning_rate": 0.00042261846979493005, + "loss": 3.0221, + "step": 22403 + }, + { + "epoch": 1.1, + "grad_norm": 0.56231290102005, + "learning_rate": 0.00042260441820573077, + "loss": 3.0854, + "step": 22404 + }, + { + "epoch": 1.1, + "grad_norm": 0.5650330781936646, + "learning_rate": 0.00042259036629361515, + "loss": 3.1679, + "step": 22405 + }, + { + "epoch": 1.1, + "grad_norm": 0.5603417754173279, + "learning_rate": 0.00042257631405862, + "loss": 3.1113, + "step": 22406 + }, + { + "epoch": 1.1, + "grad_norm": 0.5578677654266357, + "learning_rate": 0.0004225622615007824, + "loss": 3.2132, + "step": 22407 + }, + { + "epoch": 1.1, + "grad_norm": 0.5346105694770813, + "learning_rate": 0.00042254820862013947, + "loss": 3.1515, + "step": 22408 + }, + { + "epoch": 1.1, + "grad_norm": 0.5724475979804993, + "learning_rate": 0.0004225341554167281, + "loss": 3.0174, + "step": 22409 + }, + { + "epoch": 1.1, + "grad_norm": 0.6160552501678467, + "learning_rate": 0.00042252010189058534, + "loss": 3.0215, + "step": 22410 + }, + { + "epoch": 1.1, + "grad_norm": 0.5466728806495667, + "learning_rate": 0.0004225060480417482, + "loss": 3.0644, + "step": 22411 + }, + { + "epoch": 1.1, + "grad_norm": 0.5517079830169678, + "learning_rate": 0.00042249199387025383, + "loss": 3.0097, + "step": 22412 + }, + { + "epoch": 1.1, + "grad_norm": 0.5601084232330322, + "learning_rate": 0.00042247793937613893, + "loss": 3.0367, + "step": 22413 + }, + { + "epoch": 1.1, + "grad_norm": 0.5142292976379395, + "learning_rate": 0.00042246388455944076, + "loss": 3.0302, + "step": 22414 + }, + { + "epoch": 1.1, + "grad_norm": 0.5414049625396729, + "learning_rate": 0.0004224498294201964, + "loss": 3.0613, + "step": 22415 + }, + { + "epoch": 1.1, + "grad_norm": 0.5946900844573975, + "learning_rate": 0.0004224357739584427, + "loss": 2.8341, + "step": 22416 + }, + { + "epoch": 1.1, + "grad_norm": 0.5414168238639832, + "learning_rate": 0.0004224217181742167, + "loss": 3.109, + "step": 22417 + }, + { + "epoch": 1.1, + "grad_norm": 0.5585944056510925, + "learning_rate": 0.0004224076620675554, + "loss": 3.1668, + "step": 22418 + }, + { + "epoch": 1.1, + "grad_norm": 0.5433940887451172, + "learning_rate": 0.000422393605638496, + "loss": 2.8695, + "step": 22419 + }, + { + "epoch": 1.1, + "grad_norm": 0.5730898976325989, + "learning_rate": 0.00042237954888707535, + "loss": 3.1879, + "step": 22420 + }, + { + "epoch": 1.1, + "grad_norm": 0.5441948771476746, + "learning_rate": 0.0004223654918133305, + "loss": 3.0039, + "step": 22421 + }, + { + "epoch": 1.1, + "grad_norm": 0.5263099074363708, + "learning_rate": 0.00042235143441729853, + "loss": 3.0273, + "step": 22422 + }, + { + "epoch": 1.1, + "grad_norm": 0.5462498068809509, + "learning_rate": 0.0004223373766990164, + "loss": 3.1222, + "step": 22423 + }, + { + "epoch": 1.1, + "grad_norm": 0.5350505709648132, + "learning_rate": 0.0004223233186585211, + "loss": 3.0815, + "step": 22424 + }, + { + "epoch": 1.1, + "grad_norm": 0.5752918720245361, + "learning_rate": 0.0004223092602958498, + "loss": 3.2427, + "step": 22425 + }, + { + "epoch": 1.1, + "grad_norm": 0.5051820278167725, + "learning_rate": 0.0004222952016110395, + "loss": 3.1282, + "step": 22426 + }, + { + "epoch": 1.1, + "grad_norm": 0.568838894367218, + "learning_rate": 0.0004222811426041271, + "loss": 3.133, + "step": 22427 + }, + { + "epoch": 1.1, + "grad_norm": 0.5199463367462158, + "learning_rate": 0.00042226708327514973, + "loss": 3.1213, + "step": 22428 + }, + { + "epoch": 1.1, + "grad_norm": 0.5748323202133179, + "learning_rate": 0.00042225302362414435, + "loss": 3.2312, + "step": 22429 + }, + { + "epoch": 1.1, + "grad_norm": 0.5747201442718506, + "learning_rate": 0.00042223896365114815, + "loss": 2.9831, + "step": 22430 + }, + { + "epoch": 1.1, + "grad_norm": 0.5636919140815735, + "learning_rate": 0.00042222490335619797, + "loss": 3.0873, + "step": 22431 + }, + { + "epoch": 1.1, + "grad_norm": 0.5946205854415894, + "learning_rate": 0.00042221084273933086, + "loss": 2.8779, + "step": 22432 + }, + { + "epoch": 1.1, + "grad_norm": 0.545413613319397, + "learning_rate": 0.0004221967818005839, + "loss": 3.1714, + "step": 22433 + }, + { + "epoch": 1.1, + "grad_norm": 0.5125501751899719, + "learning_rate": 0.00042218272053999427, + "loss": 3.2075, + "step": 22434 + }, + { + "epoch": 1.1, + "grad_norm": 0.5402535796165466, + "learning_rate": 0.0004221686589575988, + "loss": 3.0146, + "step": 22435 + }, + { + "epoch": 1.1, + "grad_norm": 0.5473110675811768, + "learning_rate": 0.0004221545970534346, + "loss": 3.0171, + "step": 22436 + }, + { + "epoch": 1.1, + "grad_norm": 0.5473494529724121, + "learning_rate": 0.00042214053482753864, + "loss": 2.9442, + "step": 22437 + }, + { + "epoch": 1.1, + "grad_norm": 0.5252751708030701, + "learning_rate": 0.00042212647227994805, + "loss": 3.2078, + "step": 22438 + }, + { + "epoch": 1.1, + "grad_norm": 0.5198621153831482, + "learning_rate": 0.0004221124094106999, + "loss": 2.9712, + "step": 22439 + }, + { + "epoch": 1.1, + "grad_norm": 0.5305339694023132, + "learning_rate": 0.00042209834621983105, + "loss": 3.0418, + "step": 22440 + }, + { + "epoch": 1.1, + "grad_norm": 0.5615828037261963, + "learning_rate": 0.00042208428270737875, + "loss": 3.0449, + "step": 22441 + }, + { + "epoch": 1.1, + "grad_norm": 0.554909884929657, + "learning_rate": 0.00042207021887337995, + "loss": 3.2129, + "step": 22442 + }, + { + "epoch": 1.1, + "grad_norm": 0.578995406627655, + "learning_rate": 0.0004220561547178717, + "loss": 3.0968, + "step": 22443 + }, + { + "epoch": 1.1, + "grad_norm": 0.5198182463645935, + "learning_rate": 0.0004220420902408908, + "loss": 3.035, + "step": 22444 + }, + { + "epoch": 1.1, + "grad_norm": 0.5391382575035095, + "learning_rate": 0.0004220280254424748, + "loss": 3.1567, + "step": 22445 + }, + { + "epoch": 1.1, + "grad_norm": 0.5328619480133057, + "learning_rate": 0.0004220139603226603, + "loss": 3.2176, + "step": 22446 + }, + { + "epoch": 1.1, + "grad_norm": 0.5532649159431458, + "learning_rate": 0.0004219998948814846, + "loss": 3.105, + "step": 22447 + }, + { + "epoch": 1.1, + "grad_norm": 0.5565139055252075, + "learning_rate": 0.00042198582911898467, + "loss": 3.1877, + "step": 22448 + }, + { + "epoch": 1.1, + "grad_norm": 0.522406280040741, + "learning_rate": 0.00042197176303519746, + "loss": 3.0828, + "step": 22449 + }, + { + "epoch": 1.1, + "grad_norm": 0.5588423609733582, + "learning_rate": 0.00042195769663016017, + "loss": 3.1538, + "step": 22450 + }, + { + "epoch": 1.1, + "grad_norm": 0.5381976962089539, + "learning_rate": 0.0004219436299039097, + "loss": 3.2286, + "step": 22451 + }, + { + "epoch": 1.1, + "grad_norm": 0.5260021090507507, + "learning_rate": 0.00042192956285648335, + "loss": 3.2763, + "step": 22452 + }, + { + "epoch": 1.1, + "grad_norm": 0.5152723789215088, + "learning_rate": 0.00042191549548791787, + "loss": 3.0711, + "step": 22453 + }, + { + "epoch": 1.1, + "grad_norm": 0.5516050457954407, + "learning_rate": 0.00042190142779825035, + "loss": 2.9179, + "step": 22454 + }, + { + "epoch": 1.1, + "grad_norm": 0.539625883102417, + "learning_rate": 0.000421887359787518, + "loss": 2.9954, + "step": 22455 + }, + { + "epoch": 1.1, + "grad_norm": 0.5438560843467712, + "learning_rate": 0.0004218732914557579, + "loss": 3.1334, + "step": 22456 + }, + { + "epoch": 1.1, + "grad_norm": 0.5455101132392883, + "learning_rate": 0.00042185922280300696, + "loss": 3.2678, + "step": 22457 + }, + { + "epoch": 1.1, + "grad_norm": 0.5365552306175232, + "learning_rate": 0.0004218451538293023, + "loss": 3.0484, + "step": 22458 + }, + { + "epoch": 1.1, + "grad_norm": 0.549579918384552, + "learning_rate": 0.0004218310845346808, + "loss": 3.3398, + "step": 22459 + }, + { + "epoch": 1.1, + "grad_norm": 0.537279486656189, + "learning_rate": 0.00042181701491917983, + "loss": 3.0408, + "step": 22460 + }, + { + "epoch": 1.1, + "grad_norm": 0.5590521693229675, + "learning_rate": 0.0004218029449828362, + "loss": 3.1195, + "step": 22461 + }, + { + "epoch": 1.1, + "grad_norm": 0.5591493844985962, + "learning_rate": 0.0004217888747256871, + "loss": 3.0398, + "step": 22462 + }, + { + "epoch": 1.1, + "grad_norm": 0.5219199061393738, + "learning_rate": 0.00042177480414776956, + "loss": 2.9367, + "step": 22463 + }, + { + "epoch": 1.1, + "grad_norm": 0.5333996415138245, + "learning_rate": 0.0004217607332491206, + "loss": 3.2085, + "step": 22464 + }, + { + "epoch": 1.1, + "grad_norm": 0.5313884615898132, + "learning_rate": 0.00042174666202977727, + "loss": 2.8805, + "step": 22465 + }, + { + "epoch": 1.1, + "grad_norm": 0.5648304224014282, + "learning_rate": 0.0004217325904897766, + "loss": 3.2723, + "step": 22466 + }, + { + "epoch": 1.1, + "grad_norm": 0.5362172722816467, + "learning_rate": 0.0004217185186291559, + "loss": 3.132, + "step": 22467 + }, + { + "epoch": 1.1, + "grad_norm": 0.5308043956756592, + "learning_rate": 0.00042170444644795197, + "loss": 3.1991, + "step": 22468 + }, + { + "epoch": 1.1, + "grad_norm": 0.5285980701446533, + "learning_rate": 0.0004216903739462018, + "loss": 2.998, + "step": 22469 + }, + { + "epoch": 1.1, + "grad_norm": 0.529820442199707, + "learning_rate": 0.00042167630112394284, + "loss": 2.9558, + "step": 22470 + }, + { + "epoch": 1.1, + "grad_norm": 0.5706838965415955, + "learning_rate": 0.0004216622279812118, + "loss": 3.1321, + "step": 22471 + }, + { + "epoch": 1.1, + "grad_norm": 0.5605747103691101, + "learning_rate": 0.0004216481545180459, + "loss": 3.1693, + "step": 22472 + }, + { + "epoch": 1.1, + "grad_norm": 0.52308189868927, + "learning_rate": 0.00042163408073448214, + "loss": 2.8613, + "step": 22473 + }, + { + "epoch": 1.1, + "grad_norm": 0.5640896558761597, + "learning_rate": 0.0004216200066305576, + "loss": 3.1425, + "step": 22474 + }, + { + "epoch": 1.1, + "grad_norm": 0.5801900029182434, + "learning_rate": 0.0004216059322063094, + "loss": 3.1555, + "step": 22475 + }, + { + "epoch": 1.1, + "grad_norm": 0.5530162453651428, + "learning_rate": 0.0004215918574617746, + "loss": 3.1143, + "step": 22476 + }, + { + "epoch": 1.1, + "grad_norm": 0.5521156191825867, + "learning_rate": 0.0004215777823969902, + "loss": 3.103, + "step": 22477 + }, + { + "epoch": 1.1, + "grad_norm": 0.5479637980461121, + "learning_rate": 0.0004215637070119934, + "loss": 3.0354, + "step": 22478 + }, + { + "epoch": 1.1, + "grad_norm": 0.551988422870636, + "learning_rate": 0.0004215496313068212, + "loss": 3.1629, + "step": 22479 + }, + { + "epoch": 1.1, + "grad_norm": 0.5706598162651062, + "learning_rate": 0.0004215355552815105, + "loss": 3.1129, + "step": 22480 + }, + { + "epoch": 1.1, + "grad_norm": 0.5351764559745789, + "learning_rate": 0.0004215214789360987, + "loss": 3.071, + "step": 22481 + }, + { + "epoch": 1.1, + "grad_norm": 0.5388553738594055, + "learning_rate": 0.00042150740227062263, + "loss": 3.0048, + "step": 22482 + }, + { + "epoch": 1.1, + "grad_norm": 0.5683891773223877, + "learning_rate": 0.0004214933252851195, + "loss": 3.0454, + "step": 22483 + }, + { + "epoch": 1.1, + "grad_norm": 0.5642884969711304, + "learning_rate": 0.00042147924797962625, + "loss": 3.0447, + "step": 22484 + }, + { + "epoch": 1.1, + "grad_norm": 0.5272451043128967, + "learning_rate": 0.0004214651703541801, + "loss": 2.9475, + "step": 22485 + }, + { + "epoch": 1.1, + "grad_norm": 0.5619935989379883, + "learning_rate": 0.00042145109240881805, + "loss": 2.9711, + "step": 22486 + }, + { + "epoch": 1.1, + "grad_norm": 0.6104692220687866, + "learning_rate": 0.0004214370141435772, + "loss": 3.2122, + "step": 22487 + }, + { + "epoch": 1.1, + "grad_norm": 0.5447784066200256, + "learning_rate": 0.0004214229355584946, + "loss": 3.0263, + "step": 22488 + }, + { + "epoch": 1.1, + "grad_norm": 0.5399470329284668, + "learning_rate": 0.0004214088566536075, + "loss": 3.0962, + "step": 22489 + }, + { + "epoch": 1.1, + "grad_norm": 0.5518352389335632, + "learning_rate": 0.0004213947774289526, + "loss": 2.8093, + "step": 22490 + }, + { + "epoch": 1.1, + "grad_norm": 0.5711850523948669, + "learning_rate": 0.00042138069788456736, + "loss": 2.9572, + "step": 22491 + }, + { + "epoch": 1.1, + "grad_norm": 0.5700511932373047, + "learning_rate": 0.00042136661802048874, + "loss": 3.058, + "step": 22492 + }, + { + "epoch": 1.1, + "grad_norm": 0.5496143102645874, + "learning_rate": 0.0004213525378367538, + "loss": 3.0367, + "step": 22493 + }, + { + "epoch": 1.1, + "grad_norm": 0.5528961420059204, + "learning_rate": 0.0004213384573333996, + "loss": 3.0467, + "step": 22494 + }, + { + "epoch": 1.1, + "grad_norm": 0.5584120750427246, + "learning_rate": 0.00042132437651046315, + "loss": 3.1026, + "step": 22495 + }, + { + "epoch": 1.1, + "grad_norm": 0.5595943927764893, + "learning_rate": 0.0004213102953679818, + "loss": 2.9348, + "step": 22496 + }, + { + "epoch": 1.1, + "grad_norm": 0.5819007158279419, + "learning_rate": 0.0004212962139059924, + "loss": 3.1339, + "step": 22497 + }, + { + "epoch": 1.1, + "grad_norm": 0.5432302951812744, + "learning_rate": 0.00042128213212453216, + "loss": 3.1587, + "step": 22498 + }, + { + "epoch": 1.1, + "grad_norm": 0.5066870450973511, + "learning_rate": 0.00042126805002363815, + "loss": 3.1825, + "step": 22499 + }, + { + "epoch": 1.1, + "grad_norm": 0.531493604183197, + "learning_rate": 0.0004212539676033474, + "loss": 3.3026, + "step": 22500 + }, + { + "epoch": 1.1, + "grad_norm": 0.5524630546569824, + "learning_rate": 0.00042123988486369703, + "loss": 3.0143, + "step": 22501 + }, + { + "epoch": 1.1, + "grad_norm": 0.6150621771812439, + "learning_rate": 0.00042122580180472413, + "loss": 2.8606, + "step": 22502 + }, + { + "epoch": 1.1, + "grad_norm": 0.55769282579422, + "learning_rate": 0.00042121171842646584, + "loss": 3.0798, + "step": 22503 + }, + { + "epoch": 1.1, + "grad_norm": 0.5654763579368591, + "learning_rate": 0.00042119763472895917, + "loss": 3.1062, + "step": 22504 + }, + { + "epoch": 1.1, + "grad_norm": 0.5415908098220825, + "learning_rate": 0.00042118355071224116, + "loss": 3.2484, + "step": 22505 + }, + { + "epoch": 1.1, + "grad_norm": 0.5293490290641785, + "learning_rate": 0.00042116946637634915, + "loss": 3.3383, + "step": 22506 + }, + { + "epoch": 1.1, + "grad_norm": 0.5430842041969299, + "learning_rate": 0.00042115538172132007, + "loss": 2.9673, + "step": 22507 + }, + { + "epoch": 1.1, + "grad_norm": 0.9073577523231506, + "learning_rate": 0.000421141296747191, + "loss": 3.183, + "step": 22508 + }, + { + "epoch": 1.1, + "grad_norm": 0.5296067595481873, + "learning_rate": 0.00042112721145399904, + "loss": 3.0694, + "step": 22509 + }, + { + "epoch": 1.1, + "grad_norm": 0.5582892894744873, + "learning_rate": 0.00042111312584178136, + "loss": 2.915, + "step": 22510 + }, + { + "epoch": 1.1, + "grad_norm": 0.5482213497161865, + "learning_rate": 0.000421099039910575, + "loss": 3.0819, + "step": 22511 + }, + { + "epoch": 1.1, + "grad_norm": 0.534598708152771, + "learning_rate": 0.00042108495366041703, + "loss": 3.0956, + "step": 22512 + }, + { + "epoch": 1.1, + "grad_norm": 0.5279784202575684, + "learning_rate": 0.0004210708670913446, + "loss": 3.136, + "step": 22513 + }, + { + "epoch": 1.1, + "grad_norm": 0.5435617566108704, + "learning_rate": 0.00042105678020339495, + "loss": 3.3483, + "step": 22514 + }, + { + "epoch": 1.1, + "grad_norm": 0.5815554857254028, + "learning_rate": 0.00042104269299660487, + "loss": 3.2077, + "step": 22515 + }, + { + "epoch": 1.1, + "grad_norm": 0.5558618903160095, + "learning_rate": 0.00042102860547101165, + "loss": 2.9707, + "step": 22516 + }, + { + "epoch": 1.1, + "grad_norm": 0.5012040734291077, + "learning_rate": 0.00042101451762665247, + "loss": 3.1028, + "step": 22517 + }, + { + "epoch": 1.1, + "grad_norm": 0.5293975472450256, + "learning_rate": 0.00042100042946356436, + "loss": 2.9891, + "step": 22518 + }, + { + "epoch": 1.1, + "grad_norm": 0.5717222690582275, + "learning_rate": 0.0004209863409817843, + "loss": 3.0784, + "step": 22519 + }, + { + "epoch": 1.1, + "grad_norm": 0.5145463347434998, + "learning_rate": 0.0004209722521813495, + "loss": 2.9634, + "step": 22520 + }, + { + "epoch": 1.1, + "grad_norm": 0.5181236267089844, + "learning_rate": 0.0004209581630622971, + "loss": 3.2429, + "step": 22521 + }, + { + "epoch": 1.1, + "grad_norm": 0.5378340482711792, + "learning_rate": 0.0004209440736246642, + "loss": 3.152, + "step": 22522 + }, + { + "epoch": 1.1, + "grad_norm": 0.7082079648971558, + "learning_rate": 0.0004209299838684879, + "loss": 2.9937, + "step": 22523 + }, + { + "epoch": 1.1, + "grad_norm": 0.5510277152061462, + "learning_rate": 0.00042091589379380525, + "loss": 3.4617, + "step": 22524 + }, + { + "epoch": 1.1, + "grad_norm": 0.5319778919219971, + "learning_rate": 0.00042090180340065346, + "loss": 3.0375, + "step": 22525 + }, + { + "epoch": 1.1, + "grad_norm": 0.5088480710983276, + "learning_rate": 0.0004208877126890695, + "loss": 3.2293, + "step": 22526 + }, + { + "epoch": 1.1, + "grad_norm": 0.5296842455863953, + "learning_rate": 0.00042087362165909057, + "loss": 3.0811, + "step": 22527 + }, + { + "epoch": 1.1, + "grad_norm": 0.5537300109863281, + "learning_rate": 0.0004208595303107539, + "loss": 3.1295, + "step": 22528 + }, + { + "epoch": 1.1, + "grad_norm": 0.5461785197257996, + "learning_rate": 0.00042084543864409646, + "loss": 3.2753, + "step": 22529 + }, + { + "epoch": 1.1, + "grad_norm": 0.571995735168457, + "learning_rate": 0.0004208313466591553, + "loss": 3.087, + "step": 22530 + }, + { + "epoch": 1.1, + "grad_norm": 0.5290436148643494, + "learning_rate": 0.00042081725435596764, + "loss": 3.0314, + "step": 22531 + }, + { + "epoch": 1.1, + "grad_norm": 0.5442591905593872, + "learning_rate": 0.00042080316173457064, + "loss": 3.2783, + "step": 22532 + }, + { + "epoch": 1.1, + "grad_norm": 0.5432357788085938, + "learning_rate": 0.00042078906879500135, + "loss": 3.2025, + "step": 22533 + }, + { + "epoch": 1.1, + "grad_norm": 0.53834468126297, + "learning_rate": 0.0004207749755372969, + "loss": 3.2905, + "step": 22534 + }, + { + "epoch": 1.1, + "grad_norm": 0.5716333389282227, + "learning_rate": 0.0004207608819614944, + "loss": 3.1693, + "step": 22535 + }, + { + "epoch": 1.1, + "grad_norm": 0.5577285885810852, + "learning_rate": 0.00042074678806763095, + "loss": 3.0463, + "step": 22536 + }, + { + "epoch": 1.1, + "grad_norm": 0.5382303595542908, + "learning_rate": 0.00042073269385574374, + "loss": 3.0666, + "step": 22537 + }, + { + "epoch": 1.1, + "grad_norm": 0.5647816061973572, + "learning_rate": 0.00042071859932586985, + "loss": 3.0805, + "step": 22538 + }, + { + "epoch": 1.1, + "grad_norm": 0.5214908123016357, + "learning_rate": 0.0004207045044780464, + "loss": 3.1665, + "step": 22539 + }, + { + "epoch": 1.1, + "grad_norm": 0.5525656938552856, + "learning_rate": 0.0004206904093123106, + "loss": 3.0327, + "step": 22540 + }, + { + "epoch": 1.1, + "grad_norm": 0.5515208840370178, + "learning_rate": 0.0004206763138286993, + "loss": 3.1713, + "step": 22541 + }, + { + "epoch": 1.1, + "grad_norm": 0.5453616380691528, + "learning_rate": 0.0004206622180272499, + "loss": 3.0146, + "step": 22542 + }, + { + "epoch": 1.1, + "grad_norm": 0.5157879590988159, + "learning_rate": 0.00042064812190799946, + "loss": 3.1077, + "step": 22543 + }, + { + "epoch": 1.1, + "grad_norm": 0.5753068327903748, + "learning_rate": 0.0004206340254709851, + "loss": 2.9845, + "step": 22544 + }, + { + "epoch": 1.1, + "grad_norm": 0.5232889652252197, + "learning_rate": 0.0004206199287162439, + "loss": 3.1151, + "step": 22545 + }, + { + "epoch": 1.1, + "grad_norm": 0.527743399143219, + "learning_rate": 0.000420605831643813, + "loss": 3.25, + "step": 22546 + }, + { + "epoch": 1.1, + "grad_norm": 0.5670763254165649, + "learning_rate": 0.00042059173425372965, + "loss": 3.3105, + "step": 22547 + }, + { + "epoch": 1.11, + "grad_norm": 0.5618078708648682, + "learning_rate": 0.0004205776365460307, + "loss": 3.1793, + "step": 22548 + }, + { + "epoch": 1.11, + "grad_norm": 0.5352552533149719, + "learning_rate": 0.00042056353852075364, + "loss": 3.1404, + "step": 22549 + }, + { + "epoch": 1.11, + "grad_norm": 0.5104782581329346, + "learning_rate": 0.0004205494401779354, + "loss": 3.1323, + "step": 22550 + }, + { + "epoch": 1.11, + "grad_norm": 0.5447371602058411, + "learning_rate": 0.00042053534151761306, + "loss": 3.3586, + "step": 22551 + }, + { + "epoch": 1.11, + "grad_norm": 0.5591816902160645, + "learning_rate": 0.0004205212425398238, + "loss": 3.0942, + "step": 22552 + }, + { + "epoch": 1.11, + "grad_norm": 0.5504888892173767, + "learning_rate": 0.00042050714324460485, + "loss": 3.0393, + "step": 22553 + }, + { + "epoch": 1.11, + "grad_norm": 0.5453820824623108, + "learning_rate": 0.00042049304363199334, + "loss": 3.0541, + "step": 22554 + }, + { + "epoch": 1.11, + "grad_norm": 0.5488752722740173, + "learning_rate": 0.00042047894370202625, + "loss": 3.0418, + "step": 22555 + }, + { + "epoch": 1.11, + "grad_norm": 0.5417929887771606, + "learning_rate": 0.00042046484345474084, + "loss": 3.0206, + "step": 22556 + }, + { + "epoch": 1.11, + "grad_norm": 0.5546737909317017, + "learning_rate": 0.00042045074289017413, + "loss": 3.0535, + "step": 22557 + }, + { + "epoch": 1.11, + "grad_norm": 0.5946574807167053, + "learning_rate": 0.00042043664200836346, + "loss": 3.1673, + "step": 22558 + }, + { + "epoch": 1.11, + "grad_norm": 0.5759459137916565, + "learning_rate": 0.0004204225408093458, + "loss": 3.0653, + "step": 22559 + }, + { + "epoch": 1.11, + "grad_norm": 0.5452854037284851, + "learning_rate": 0.0004204084392931583, + "loss": 3.092, + "step": 22560 + }, + { + "epoch": 1.11, + "grad_norm": 0.5289528369903564, + "learning_rate": 0.0004203943374598382, + "loss": 3.2014, + "step": 22561 + }, + { + "epoch": 1.11, + "grad_norm": 0.5537521243095398, + "learning_rate": 0.00042038023530942265, + "loss": 3.1951, + "step": 22562 + }, + { + "epoch": 1.11, + "grad_norm": 0.5194694399833679, + "learning_rate": 0.0004203661328419486, + "loss": 2.9367, + "step": 22563 + }, + { + "epoch": 1.11, + "grad_norm": 0.5450275540351868, + "learning_rate": 0.00042035203005745336, + "loss": 2.9492, + "step": 22564 + }, + { + "epoch": 1.11, + "grad_norm": 0.527119517326355, + "learning_rate": 0.0004203379269559741, + "loss": 3.1056, + "step": 22565 + }, + { + "epoch": 1.11, + "grad_norm": 0.5397108197212219, + "learning_rate": 0.0004203238235375479, + "loss": 3.1839, + "step": 22566 + }, + { + "epoch": 1.11, + "grad_norm": 0.5303124189376831, + "learning_rate": 0.0004203097198022118, + "loss": 3.2767, + "step": 22567 + }, + { + "epoch": 1.11, + "grad_norm": 0.5147061347961426, + "learning_rate": 0.0004202956157500031, + "loss": 3.1816, + "step": 22568 + }, + { + "epoch": 1.11, + "grad_norm": 0.5415562391281128, + "learning_rate": 0.00042028151138095895, + "loss": 3.3555, + "step": 22569 + }, + { + "epoch": 1.11, + "grad_norm": 0.5352077484130859, + "learning_rate": 0.0004202674066951165, + "loss": 3.0523, + "step": 22570 + }, + { + "epoch": 1.11, + "grad_norm": 0.5709248781204224, + "learning_rate": 0.0004202533016925127, + "loss": 3.0357, + "step": 22571 + }, + { + "epoch": 1.11, + "grad_norm": 0.5660057663917542, + "learning_rate": 0.0004202391963731849, + "loss": 3.1059, + "step": 22572 + }, + { + "epoch": 1.11, + "grad_norm": 0.5669631958007812, + "learning_rate": 0.0004202250907371702, + "loss": 3.0434, + "step": 22573 + }, + { + "epoch": 1.11, + "grad_norm": 0.5338017344474792, + "learning_rate": 0.0004202109847845057, + "loss": 3.0701, + "step": 22574 + }, + { + "epoch": 1.11, + "grad_norm": 0.5378772020339966, + "learning_rate": 0.00042019687851522873, + "loss": 3.3332, + "step": 22575 + }, + { + "epoch": 1.11, + "grad_norm": 0.5645471215248108, + "learning_rate": 0.0004201827719293762, + "loss": 3.2192, + "step": 22576 + }, + { + "epoch": 1.11, + "grad_norm": 0.5384810566902161, + "learning_rate": 0.00042016866502698536, + "loss": 2.9694, + "step": 22577 + }, + { + "epoch": 1.11, + "grad_norm": 0.5898525714874268, + "learning_rate": 0.00042015455780809345, + "loss": 2.8896, + "step": 22578 + }, + { + "epoch": 1.11, + "grad_norm": 0.5633717179298401, + "learning_rate": 0.00042014045027273755, + "loss": 2.9377, + "step": 22579 + }, + { + "epoch": 1.11, + "grad_norm": 0.551085352897644, + "learning_rate": 0.00042012634242095487, + "loss": 3.0136, + "step": 22580 + }, + { + "epoch": 1.11, + "grad_norm": 0.5334330201148987, + "learning_rate": 0.00042011223425278253, + "loss": 3.0485, + "step": 22581 + }, + { + "epoch": 1.11, + "grad_norm": 0.5643008947372437, + "learning_rate": 0.0004200981257682575, + "loss": 3.142, + "step": 22582 + }, + { + "epoch": 1.11, + "grad_norm": 0.5641648769378662, + "learning_rate": 0.0004200840169674173, + "loss": 3.2212, + "step": 22583 + }, + { + "epoch": 1.11, + "grad_norm": 0.5251689553260803, + "learning_rate": 0.00042006990785029886, + "loss": 3.2124, + "step": 22584 + }, + { + "epoch": 1.11, + "grad_norm": 0.5536843538284302, + "learning_rate": 0.0004200557984169394, + "loss": 3.3359, + "step": 22585 + }, + { + "epoch": 1.11, + "grad_norm": 0.5234001874923706, + "learning_rate": 0.00042004168866737607, + "loss": 3.098, + "step": 22586 + }, + { + "epoch": 1.11, + "grad_norm": 0.5152735114097595, + "learning_rate": 0.000420027578601646, + "loss": 3.1305, + "step": 22587 + }, + { + "epoch": 1.11, + "grad_norm": 0.5633012056350708, + "learning_rate": 0.0004200134682197864, + "loss": 3.0323, + "step": 22588 + }, + { + "epoch": 1.11, + "grad_norm": 0.5583577752113342, + "learning_rate": 0.00041999935752183446, + "loss": 3.1308, + "step": 22589 + }, + { + "epoch": 1.11, + "grad_norm": 0.5771940350532532, + "learning_rate": 0.0004199852465078273, + "loss": 3.1042, + "step": 22590 + }, + { + "epoch": 1.11, + "grad_norm": 0.5792399644851685, + "learning_rate": 0.0004199711351778021, + "loss": 2.8656, + "step": 22591 + }, + { + "epoch": 1.11, + "grad_norm": 0.5672153234481812, + "learning_rate": 0.0004199570235317959, + "loss": 2.8594, + "step": 22592 + }, + { + "epoch": 1.11, + "grad_norm": 0.5597004890441895, + "learning_rate": 0.0004199429115698461, + "loss": 3.1582, + "step": 22593 + }, + { + "epoch": 1.11, + "grad_norm": 0.5668061971664429, + "learning_rate": 0.00041992879929198977, + "loss": 2.9705, + "step": 22594 + }, + { + "epoch": 1.11, + "grad_norm": 0.5375760793685913, + "learning_rate": 0.000419914686698264, + "loss": 2.9946, + "step": 22595 + }, + { + "epoch": 1.11, + "grad_norm": 0.5492685437202454, + "learning_rate": 0.00041990057378870607, + "loss": 2.9779, + "step": 22596 + }, + { + "epoch": 1.11, + "grad_norm": 0.5831897854804993, + "learning_rate": 0.00041988646056335304, + "loss": 3.0992, + "step": 22597 + }, + { + "epoch": 1.11, + "grad_norm": 0.5664513111114502, + "learning_rate": 0.00041987234702224217, + "loss": 3.0218, + "step": 22598 + }, + { + "epoch": 1.11, + "grad_norm": 0.5811390280723572, + "learning_rate": 0.00041985823316541065, + "loss": 2.9748, + "step": 22599 + }, + { + "epoch": 1.11, + "grad_norm": 0.5502529740333557, + "learning_rate": 0.00041984411899289556, + "loss": 3.1049, + "step": 22600 + }, + { + "epoch": 1.11, + "grad_norm": 0.57771897315979, + "learning_rate": 0.0004198300045047342, + "loss": 3.1521, + "step": 22601 + }, + { + "epoch": 1.11, + "grad_norm": 0.5387015342712402, + "learning_rate": 0.00041981588970096365, + "loss": 3.0113, + "step": 22602 + }, + { + "epoch": 1.11, + "grad_norm": 0.551127016544342, + "learning_rate": 0.000419801774581621, + "loss": 3.27, + "step": 22603 + }, + { + "epoch": 1.11, + "grad_norm": 0.5647044777870178, + "learning_rate": 0.0004197876591467436, + "loss": 3.3264, + "step": 22604 + }, + { + "epoch": 1.11, + "grad_norm": 0.5634638071060181, + "learning_rate": 0.0004197735433963686, + "loss": 2.9904, + "step": 22605 + }, + { + "epoch": 1.11, + "grad_norm": 0.5413936972618103, + "learning_rate": 0.0004197594273305332, + "loss": 3.1296, + "step": 22606 + }, + { + "epoch": 1.11, + "grad_norm": 0.526145875453949, + "learning_rate": 0.0004197453109492743, + "loss": 3.0424, + "step": 22607 + }, + { + "epoch": 1.11, + "grad_norm": 0.5350415706634521, + "learning_rate": 0.0004197311942526294, + "loss": 3.137, + "step": 22608 + }, + { + "epoch": 1.11, + "grad_norm": 0.5814176797866821, + "learning_rate": 0.0004197170772406357, + "loss": 3.0361, + "step": 22609 + }, + { + "epoch": 1.11, + "grad_norm": 0.5808823108673096, + "learning_rate": 0.0004197029599133301, + "loss": 3.1731, + "step": 22610 + }, + { + "epoch": 1.11, + "grad_norm": 0.5619063973426819, + "learning_rate": 0.00041968884227075, + "loss": 2.9452, + "step": 22611 + }, + { + "epoch": 1.11, + "grad_norm": 0.5297744870185852, + "learning_rate": 0.0004196747243129325, + "loss": 3.2803, + "step": 22612 + }, + { + "epoch": 1.11, + "grad_norm": 0.527192234992981, + "learning_rate": 0.00041966060603991484, + "loss": 3.133, + "step": 22613 + }, + { + "epoch": 1.11, + "grad_norm": 0.5389567017555237, + "learning_rate": 0.00041964648745173417, + "loss": 3.2495, + "step": 22614 + }, + { + "epoch": 1.11, + "grad_norm": 0.5459379553794861, + "learning_rate": 0.0004196323685484277, + "loss": 3.1859, + "step": 22615 + }, + { + "epoch": 1.11, + "grad_norm": 0.558315634727478, + "learning_rate": 0.0004196182493300326, + "loss": 3.0052, + "step": 22616 + }, + { + "epoch": 1.11, + "grad_norm": 0.5612319707870483, + "learning_rate": 0.00041960412979658604, + "loss": 3.2109, + "step": 22617 + }, + { + "epoch": 1.11, + "grad_norm": 0.5649484992027283, + "learning_rate": 0.00041959000994812513, + "loss": 3.0352, + "step": 22618 + }, + { + "epoch": 1.11, + "grad_norm": 0.5512779951095581, + "learning_rate": 0.00041957588978468717, + "loss": 3.1786, + "step": 22619 + }, + { + "epoch": 1.11, + "grad_norm": 0.5309266448020935, + "learning_rate": 0.00041956176930630946, + "loss": 3.0735, + "step": 22620 + }, + { + "epoch": 1.11, + "grad_norm": 0.5767373442649841, + "learning_rate": 0.0004195476485130289, + "loss": 2.8406, + "step": 22621 + }, + { + "epoch": 1.11, + "grad_norm": 0.5628970861434937, + "learning_rate": 0.00041953352740488293, + "loss": 3.0715, + "step": 22622 + }, + { + "epoch": 1.11, + "grad_norm": 0.5341333150863647, + "learning_rate": 0.00041951940598190866, + "loss": 3.0542, + "step": 22623 + }, + { + "epoch": 1.11, + "grad_norm": 0.5453346967697144, + "learning_rate": 0.0004195052842441432, + "loss": 3.0502, + "step": 22624 + }, + { + "epoch": 1.11, + "grad_norm": 0.5791659951210022, + "learning_rate": 0.00041949116219162393, + "loss": 3.192, + "step": 22625 + }, + { + "epoch": 1.11, + "grad_norm": 0.5513650178909302, + "learning_rate": 0.0004194770398243879, + "loss": 3.1476, + "step": 22626 + }, + { + "epoch": 1.11, + "grad_norm": 0.5383404493331909, + "learning_rate": 0.0004194629171424724, + "loss": 2.8348, + "step": 22627 + }, + { + "epoch": 1.11, + "grad_norm": 0.5772968530654907, + "learning_rate": 0.00041944879414591434, + "loss": 3.2311, + "step": 22628 + }, + { + "epoch": 1.11, + "grad_norm": 0.5837339758872986, + "learning_rate": 0.0004194346708347513, + "loss": 2.9275, + "step": 22629 + }, + { + "epoch": 1.11, + "grad_norm": 0.5292240381240845, + "learning_rate": 0.0004194205472090204, + "loss": 3.0504, + "step": 22630 + }, + { + "epoch": 1.11, + "grad_norm": 0.5393617153167725, + "learning_rate": 0.0004194064232687587, + "loss": 2.9578, + "step": 22631 + }, + { + "epoch": 1.11, + "grad_norm": 0.5465915203094482, + "learning_rate": 0.0004193922990140034, + "loss": 3.2505, + "step": 22632 + }, + { + "epoch": 1.11, + "grad_norm": 0.5606839656829834, + "learning_rate": 0.0004193781744447917, + "loss": 2.9551, + "step": 22633 + }, + { + "epoch": 1.11, + "grad_norm": 0.5263482928276062, + "learning_rate": 0.0004193640495611611, + "loss": 3.2808, + "step": 22634 + }, + { + "epoch": 1.11, + "grad_norm": 0.5450267195701599, + "learning_rate": 0.0004193499243631484, + "loss": 3.1876, + "step": 22635 + }, + { + "epoch": 1.11, + "grad_norm": 0.5732899904251099, + "learning_rate": 0.000419335798850791, + "loss": 2.9856, + "step": 22636 + }, + { + "epoch": 1.11, + "grad_norm": 0.55254727602005, + "learning_rate": 0.0004193216730241261, + "loss": 3.1878, + "step": 22637 + }, + { + "epoch": 1.11, + "grad_norm": 0.5654364824295044, + "learning_rate": 0.00041930754688319086, + "loss": 3.154, + "step": 22638 + }, + { + "epoch": 1.11, + "grad_norm": 0.5142787098884583, + "learning_rate": 0.0004192934204280224, + "loss": 3.4391, + "step": 22639 + }, + { + "epoch": 1.11, + "grad_norm": 0.5899552702903748, + "learning_rate": 0.00041927929365865815, + "loss": 3.126, + "step": 22640 + }, + { + "epoch": 1.11, + "grad_norm": 0.5144187808036804, + "learning_rate": 0.00041926516657513513, + "loss": 3.1848, + "step": 22641 + }, + { + "epoch": 1.11, + "grad_norm": 0.5327566266059875, + "learning_rate": 0.0004192510391774907, + "loss": 3.2201, + "step": 22642 + }, + { + "epoch": 1.11, + "grad_norm": 0.5743151307106018, + "learning_rate": 0.00041923691146576194, + "loss": 3.0937, + "step": 22643 + }, + { + "epoch": 1.11, + "grad_norm": 0.538710355758667, + "learning_rate": 0.000419222783439986, + "loss": 3.1129, + "step": 22644 + }, + { + "epoch": 1.11, + "grad_norm": 0.5390024781227112, + "learning_rate": 0.00041920865510020045, + "loss": 3.0571, + "step": 22645 + }, + { + "epoch": 1.11, + "grad_norm": 0.5429919958114624, + "learning_rate": 0.000419194526446442, + "loss": 3.0848, + "step": 22646 + }, + { + "epoch": 1.11, + "grad_norm": 0.532172679901123, + "learning_rate": 0.0004191803974787482, + "loss": 2.8347, + "step": 22647 + }, + { + "epoch": 1.11, + "grad_norm": 0.5515339970588684, + "learning_rate": 0.00041916626819715617, + "loss": 3.0664, + "step": 22648 + }, + { + "epoch": 1.11, + "grad_norm": 0.5160419940948486, + "learning_rate": 0.0004191521386017031, + "loss": 3.019, + "step": 22649 + }, + { + "epoch": 1.11, + "grad_norm": 0.5607364177703857, + "learning_rate": 0.0004191380086924263, + "loss": 2.895, + "step": 22650 + }, + { + "epoch": 1.11, + "grad_norm": 0.5337927937507629, + "learning_rate": 0.00041912387846936277, + "loss": 3.1408, + "step": 22651 + }, + { + "epoch": 1.11, + "grad_norm": 0.5739155411720276, + "learning_rate": 0.00041910974793255, + "loss": 3.2168, + "step": 22652 + }, + { + "epoch": 1.11, + "grad_norm": 0.579247236251831, + "learning_rate": 0.0004190956170820251, + "loss": 2.9161, + "step": 22653 + }, + { + "epoch": 1.11, + "grad_norm": 0.5730478167533875, + "learning_rate": 0.0004190814859178251, + "loss": 3.0244, + "step": 22654 + }, + { + "epoch": 1.11, + "grad_norm": 0.5330377817153931, + "learning_rate": 0.0004190673544399875, + "loss": 2.9202, + "step": 22655 + }, + { + "epoch": 1.11, + "grad_norm": 0.5503820180892944, + "learning_rate": 0.00041905322264854946, + "loss": 3.1697, + "step": 22656 + }, + { + "epoch": 1.11, + "grad_norm": 0.5626627206802368, + "learning_rate": 0.00041903909054354804, + "loss": 3.1492, + "step": 22657 + }, + { + "epoch": 1.11, + "grad_norm": 0.5176795721054077, + "learning_rate": 0.0004190249581250206, + "loss": 3.0295, + "step": 22658 + }, + { + "epoch": 1.11, + "grad_norm": 0.5228406190872192, + "learning_rate": 0.00041901082539300426, + "loss": 3.0384, + "step": 22659 + }, + { + "epoch": 1.11, + "grad_norm": 0.5561695694923401, + "learning_rate": 0.0004189966923475364, + "loss": 3.0576, + "step": 22660 + }, + { + "epoch": 1.11, + "grad_norm": 0.5149094462394714, + "learning_rate": 0.0004189825589886542, + "loss": 3.0316, + "step": 22661 + }, + { + "epoch": 1.11, + "grad_norm": 0.5652263164520264, + "learning_rate": 0.00041896842531639476, + "loss": 3.0514, + "step": 22662 + }, + { + "epoch": 1.11, + "grad_norm": 0.5679181218147278, + "learning_rate": 0.00041895429133079534, + "loss": 3.0378, + "step": 22663 + }, + { + "epoch": 1.11, + "grad_norm": 0.5677348375320435, + "learning_rate": 0.00041894015703189326, + "loss": 3.1728, + "step": 22664 + }, + { + "epoch": 1.11, + "grad_norm": 0.5508793592453003, + "learning_rate": 0.00041892602241972566, + "loss": 3.1805, + "step": 22665 + }, + { + "epoch": 1.11, + "grad_norm": 0.5380650162696838, + "learning_rate": 0.0004189118874943298, + "loss": 2.9287, + "step": 22666 + }, + { + "epoch": 1.11, + "grad_norm": 0.5400630235671997, + "learning_rate": 0.00041889775225574305, + "loss": 3.0148, + "step": 22667 + }, + { + "epoch": 1.11, + "grad_norm": 0.5459088683128357, + "learning_rate": 0.0004188836167040024, + "loss": 3.2834, + "step": 22668 + }, + { + "epoch": 1.11, + "grad_norm": 0.5284379720687866, + "learning_rate": 0.00041886948083914513, + "loss": 3.0427, + "step": 22669 + }, + { + "epoch": 1.11, + "grad_norm": 0.5413404107093811, + "learning_rate": 0.0004188553446612086, + "loss": 3.0797, + "step": 22670 + }, + { + "epoch": 1.11, + "grad_norm": 0.6029809713363647, + "learning_rate": 0.00041884120817023, + "loss": 3.0449, + "step": 22671 + }, + { + "epoch": 1.11, + "grad_norm": 0.5510432720184326, + "learning_rate": 0.0004188270713662465, + "loss": 3.0182, + "step": 22672 + }, + { + "epoch": 1.11, + "grad_norm": 0.5862864255905151, + "learning_rate": 0.0004188129342492953, + "loss": 3.2882, + "step": 22673 + }, + { + "epoch": 1.11, + "grad_norm": 0.5308249592781067, + "learning_rate": 0.0004187987968194137, + "loss": 3.0142, + "step": 22674 + }, + { + "epoch": 1.11, + "grad_norm": 0.5954753756523132, + "learning_rate": 0.000418784659076639, + "loss": 3.2163, + "step": 22675 + }, + { + "epoch": 1.11, + "grad_norm": 0.5317316651344299, + "learning_rate": 0.00041877052102100835, + "loss": 2.9223, + "step": 22676 + }, + { + "epoch": 1.11, + "grad_norm": 0.5436576008796692, + "learning_rate": 0.00041875638265255894, + "loss": 3.0907, + "step": 22677 + }, + { + "epoch": 1.11, + "grad_norm": 0.5222603678703308, + "learning_rate": 0.0004187422439713282, + "loss": 2.9988, + "step": 22678 + }, + { + "epoch": 1.11, + "grad_norm": 0.5745731592178345, + "learning_rate": 0.0004187281049773532, + "loss": 2.8509, + "step": 22679 + }, + { + "epoch": 1.11, + "grad_norm": 0.5338032841682434, + "learning_rate": 0.0004187139656706711, + "loss": 3.1961, + "step": 22680 + }, + { + "epoch": 1.11, + "grad_norm": 0.5111261010169983, + "learning_rate": 0.00041869982605131933, + "loss": 3.2963, + "step": 22681 + }, + { + "epoch": 1.11, + "grad_norm": 0.5470437407493591, + "learning_rate": 0.0004186856861193352, + "loss": 3.1833, + "step": 22682 + }, + { + "epoch": 1.11, + "grad_norm": 0.5256366729736328, + "learning_rate": 0.0004186715458747557, + "loss": 3.2079, + "step": 22683 + }, + { + "epoch": 1.11, + "grad_norm": 0.5694595575332642, + "learning_rate": 0.00041865740531761813, + "loss": 2.9047, + "step": 22684 + }, + { + "epoch": 1.11, + "grad_norm": 0.5024998188018799, + "learning_rate": 0.0004186432644479599, + "loss": 3.2674, + "step": 22685 + }, + { + "epoch": 1.11, + "grad_norm": 0.5744755268096924, + "learning_rate": 0.000418629123265818, + "loss": 3.1185, + "step": 22686 + }, + { + "epoch": 1.11, + "grad_norm": 0.5288079380989075, + "learning_rate": 0.00041861498177122995, + "loss": 3.1339, + "step": 22687 + }, + { + "epoch": 1.11, + "grad_norm": 0.5845724940299988, + "learning_rate": 0.0004186008399642328, + "loss": 2.8917, + "step": 22688 + }, + { + "epoch": 1.11, + "grad_norm": 0.5915581583976746, + "learning_rate": 0.00041858669784486396, + "loss": 3.0969, + "step": 22689 + }, + { + "epoch": 1.11, + "grad_norm": 0.5309663414955139, + "learning_rate": 0.0004185725554131604, + "loss": 3.0139, + "step": 22690 + }, + { + "epoch": 1.11, + "grad_norm": 0.5378329753875732, + "learning_rate": 0.0004185584126691597, + "loss": 3.2019, + "step": 22691 + }, + { + "epoch": 1.11, + "grad_norm": 0.5664974451065063, + "learning_rate": 0.0004185442696128989, + "loss": 3.0974, + "step": 22692 + }, + { + "epoch": 1.11, + "grad_norm": 0.5150554776191711, + "learning_rate": 0.00041853012624441544, + "loss": 3.0179, + "step": 22693 + }, + { + "epoch": 1.11, + "grad_norm": 0.5422317981719971, + "learning_rate": 0.0004185159825637463, + "loss": 3.0485, + "step": 22694 + }, + { + "epoch": 1.11, + "grad_norm": 0.5424741506576538, + "learning_rate": 0.0004185018385709288, + "loss": 3.1692, + "step": 22695 + }, + { + "epoch": 1.11, + "grad_norm": 0.5382916927337646, + "learning_rate": 0.0004184876942660004, + "loss": 3.027, + "step": 22696 + }, + { + "epoch": 1.11, + "grad_norm": 0.5341999530792236, + "learning_rate": 0.00041847354964899823, + "loss": 3.0156, + "step": 22697 + }, + { + "epoch": 1.11, + "grad_norm": 0.5673853754997253, + "learning_rate": 0.00041845940471995946, + "loss": 3.045, + "step": 22698 + }, + { + "epoch": 1.11, + "grad_norm": 0.5665754079818726, + "learning_rate": 0.0004184452594789214, + "loss": 3.0651, + "step": 22699 + }, + { + "epoch": 1.11, + "grad_norm": 0.5702019333839417, + "learning_rate": 0.0004184311139259214, + "loss": 2.9393, + "step": 22700 + }, + { + "epoch": 1.11, + "grad_norm": 0.5696156620979309, + "learning_rate": 0.00041841696806099656, + "loss": 2.9479, + "step": 22701 + }, + { + "epoch": 1.11, + "grad_norm": 0.577709436416626, + "learning_rate": 0.00041840282188418426, + "loss": 3.0455, + "step": 22702 + }, + { + "epoch": 1.11, + "grad_norm": 0.5778018832206726, + "learning_rate": 0.0004183886753955217, + "loss": 3.0691, + "step": 22703 + }, + { + "epoch": 1.11, + "grad_norm": 0.5543364882469177, + "learning_rate": 0.0004183745285950462, + "loss": 3.0442, + "step": 22704 + }, + { + "epoch": 1.11, + "grad_norm": 0.5635522603988647, + "learning_rate": 0.0004183603814827949, + "loss": 3.0389, + "step": 22705 + }, + { + "epoch": 1.11, + "grad_norm": 0.5350740551948547, + "learning_rate": 0.0004183462340588051, + "loss": 3.1186, + "step": 22706 + }, + { + "epoch": 1.11, + "grad_norm": 0.5741170048713684, + "learning_rate": 0.00041833208632311423, + "loss": 3.017, + "step": 22707 + }, + { + "epoch": 1.11, + "grad_norm": 0.5641077756881714, + "learning_rate": 0.0004183179382757593, + "loss": 3.2578, + "step": 22708 + }, + { + "epoch": 1.11, + "grad_norm": 0.5855833888053894, + "learning_rate": 0.0004183037899167778, + "loss": 2.9378, + "step": 22709 + }, + { + "epoch": 1.11, + "grad_norm": 0.5699617266654968, + "learning_rate": 0.0004182896412462067, + "loss": 3.2279, + "step": 22710 + }, + { + "epoch": 1.11, + "grad_norm": 0.5196274518966675, + "learning_rate": 0.0004182754922640837, + "loss": 3.0284, + "step": 22711 + }, + { + "epoch": 1.11, + "grad_norm": 0.5257676243782043, + "learning_rate": 0.0004182613429704456, + "loss": 3.1445, + "step": 22712 + }, + { + "epoch": 1.11, + "grad_norm": 0.5339217782020569, + "learning_rate": 0.00041824719336533005, + "loss": 3.0042, + "step": 22713 + }, + { + "epoch": 1.11, + "grad_norm": 0.5554669499397278, + "learning_rate": 0.00041823304344877404, + "loss": 3.0607, + "step": 22714 + }, + { + "epoch": 1.11, + "grad_norm": 0.5328848958015442, + "learning_rate": 0.000418218893220815, + "loss": 3.1213, + "step": 22715 + }, + { + "epoch": 1.11, + "grad_norm": 0.5345682501792908, + "learning_rate": 0.0004182047426814901, + "loss": 3.122, + "step": 22716 + }, + { + "epoch": 1.11, + "grad_norm": 0.5637184381484985, + "learning_rate": 0.00041819059183083665, + "loss": 3.186, + "step": 22717 + }, + { + "epoch": 1.11, + "grad_norm": 0.5496988892555237, + "learning_rate": 0.00041817644066889203, + "loss": 3.1549, + "step": 22718 + }, + { + "epoch": 1.11, + "grad_norm": 0.5595161318778992, + "learning_rate": 0.00041816228919569335, + "loss": 3.1797, + "step": 22719 + }, + { + "epoch": 1.11, + "grad_norm": 0.5612072944641113, + "learning_rate": 0.00041814813741127796, + "loss": 3.1245, + "step": 22720 + }, + { + "epoch": 1.11, + "grad_norm": 0.5443363785743713, + "learning_rate": 0.000418133985315683, + "loss": 3.2956, + "step": 22721 + }, + { + "epoch": 1.11, + "grad_norm": 0.5439444780349731, + "learning_rate": 0.000418119832908946, + "loss": 3.0141, + "step": 22722 + }, + { + "epoch": 1.11, + "grad_norm": 0.5160937309265137, + "learning_rate": 0.000418105680191104, + "loss": 3.2615, + "step": 22723 + }, + { + "epoch": 1.11, + "grad_norm": 0.5264231562614441, + "learning_rate": 0.0004180915271621945, + "loss": 3.0696, + "step": 22724 + }, + { + "epoch": 1.11, + "grad_norm": 0.5493642687797546, + "learning_rate": 0.0004180773738222545, + "loss": 3.0094, + "step": 22725 + }, + { + "epoch": 1.11, + "grad_norm": 0.535229504108429, + "learning_rate": 0.0004180632201713215, + "loss": 2.9877, + "step": 22726 + }, + { + "epoch": 1.11, + "grad_norm": 0.5449292063713074, + "learning_rate": 0.0004180490662094326, + "loss": 3.1424, + "step": 22727 + }, + { + "epoch": 1.11, + "grad_norm": 0.5288578271865845, + "learning_rate": 0.00041803491193662524, + "loss": 3.1605, + "step": 22728 + }, + { + "epoch": 1.11, + "grad_norm": 0.5462258458137512, + "learning_rate": 0.0004180207573529367, + "loss": 3.204, + "step": 22729 + }, + { + "epoch": 1.11, + "grad_norm": 0.507749080657959, + "learning_rate": 0.00041800660245840414, + "loss": 3.1402, + "step": 22730 + }, + { + "epoch": 1.11, + "grad_norm": 0.554622232913971, + "learning_rate": 0.00041799244725306483, + "loss": 3.1245, + "step": 22731 + }, + { + "epoch": 1.11, + "grad_norm": 0.5544594526290894, + "learning_rate": 0.0004179782917369562, + "loss": 3.0031, + "step": 22732 + }, + { + "epoch": 1.11, + "grad_norm": 0.5762028098106384, + "learning_rate": 0.00041796413591011557, + "loss": 2.9274, + "step": 22733 + }, + { + "epoch": 1.11, + "grad_norm": 0.5185121893882751, + "learning_rate": 0.00041794997977257994, + "loss": 2.9783, + "step": 22734 + }, + { + "epoch": 1.11, + "grad_norm": 0.5906330943107605, + "learning_rate": 0.0004179358233243868, + "loss": 3.1314, + "step": 22735 + }, + { + "epoch": 1.11, + "grad_norm": 0.5670438408851624, + "learning_rate": 0.00041792166656557343, + "loss": 3.1062, + "step": 22736 + }, + { + "epoch": 1.11, + "grad_norm": 0.6015538573265076, + "learning_rate": 0.00041790750949617704, + "loss": 2.9635, + "step": 22737 + }, + { + "epoch": 1.11, + "grad_norm": 0.5294987559318542, + "learning_rate": 0.000417893352116235, + "loss": 3.0218, + "step": 22738 + }, + { + "epoch": 1.11, + "grad_norm": 0.5639286041259766, + "learning_rate": 0.00041787919442578457, + "loss": 3.1125, + "step": 22739 + }, + { + "epoch": 1.11, + "grad_norm": 0.5568161010742188, + "learning_rate": 0.0004178650364248631, + "loss": 3.2293, + "step": 22740 + }, + { + "epoch": 1.11, + "grad_norm": 0.5709725022315979, + "learning_rate": 0.0004178508781135076, + "loss": 3.1711, + "step": 22741 + }, + { + "epoch": 1.11, + "grad_norm": 0.5765569806098938, + "learning_rate": 0.00041783671949175566, + "loss": 3.2865, + "step": 22742 + }, + { + "epoch": 1.11, + "grad_norm": 0.5151780247688293, + "learning_rate": 0.0004178225605596445, + "loss": 3.2819, + "step": 22743 + }, + { + "epoch": 1.11, + "grad_norm": 0.5453171730041504, + "learning_rate": 0.00041780840131721143, + "loss": 3.089, + "step": 22744 + }, + { + "epoch": 1.11, + "grad_norm": 0.5520200729370117, + "learning_rate": 0.0004177942417644937, + "loss": 3.0535, + "step": 22745 + }, + { + "epoch": 1.11, + "grad_norm": 0.5933725237846375, + "learning_rate": 0.00041778008190152856, + "loss": 3.0728, + "step": 22746 + }, + { + "epoch": 1.11, + "grad_norm": 0.5700779557228088, + "learning_rate": 0.0004177659217283533, + "loss": 3.0971, + "step": 22747 + }, + { + "epoch": 1.11, + "grad_norm": 0.5266256928443909, + "learning_rate": 0.0004177517612450054, + "loss": 3.1489, + "step": 22748 + }, + { + "epoch": 1.11, + "grad_norm": 0.5342414975166321, + "learning_rate": 0.00041773760045152195, + "loss": 3.1828, + "step": 22749 + }, + { + "epoch": 1.11, + "grad_norm": 0.5465877056121826, + "learning_rate": 0.0004177234393479403, + "loss": 3.0301, + "step": 22750 + }, + { + "epoch": 1.11, + "grad_norm": 0.5596908330917358, + "learning_rate": 0.0004177092779342978, + "loss": 2.8841, + "step": 22751 + }, + { + "epoch": 1.12, + "grad_norm": 0.5438765287399292, + "learning_rate": 0.00041769511621063166, + "loss": 3.1112, + "step": 22752 + }, + { + "epoch": 1.12, + "grad_norm": 0.5041543245315552, + "learning_rate": 0.00041768095417697937, + "loss": 3.0129, + "step": 22753 + }, + { + "epoch": 1.12, + "grad_norm": 0.527295708656311, + "learning_rate": 0.00041766679183337797, + "loss": 3.2012, + "step": 22754 + }, + { + "epoch": 1.12, + "grad_norm": 0.5652879476547241, + "learning_rate": 0.00041765262917986497, + "loss": 3.2307, + "step": 22755 + }, + { + "epoch": 1.12, + "grad_norm": 0.5673266053199768, + "learning_rate": 0.00041763846621647755, + "loss": 3.135, + "step": 22756 + }, + { + "epoch": 1.12, + "grad_norm": 0.5326421856880188, + "learning_rate": 0.00041762430294325296, + "loss": 3.0118, + "step": 22757 + }, + { + "epoch": 1.12, + "grad_norm": 0.5760022401809692, + "learning_rate": 0.0004176101393602287, + "loss": 3.2049, + "step": 22758 + }, + { + "epoch": 1.12, + "grad_norm": 0.5360777974128723, + "learning_rate": 0.00041759597546744194, + "loss": 3.1222, + "step": 22759 + }, + { + "epoch": 1.12, + "grad_norm": 0.5167760848999023, + "learning_rate": 0.00041758181126493006, + "loss": 2.9691, + "step": 22760 + }, + { + "epoch": 1.12, + "grad_norm": 0.8578059077262878, + "learning_rate": 0.00041756764675273024, + "loss": 3.2545, + "step": 22761 + }, + { + "epoch": 1.12, + "grad_norm": 0.6276934742927551, + "learning_rate": 0.0004175534819308799, + "loss": 3.0123, + "step": 22762 + }, + { + "epoch": 1.12, + "grad_norm": 0.5215904712677002, + "learning_rate": 0.0004175393167994163, + "loss": 3.1768, + "step": 22763 + }, + { + "epoch": 1.12, + "grad_norm": 0.51545250415802, + "learning_rate": 0.00041752515135837676, + "loss": 2.9006, + "step": 22764 + }, + { + "epoch": 1.12, + "grad_norm": 0.535327672958374, + "learning_rate": 0.00041751098560779856, + "loss": 3.1213, + "step": 22765 + }, + { + "epoch": 1.12, + "grad_norm": 0.54879230260849, + "learning_rate": 0.00041749681954771913, + "loss": 3.2119, + "step": 22766 + }, + { + "epoch": 1.12, + "grad_norm": 0.5459108352661133, + "learning_rate": 0.00041748265317817566, + "loss": 3.0567, + "step": 22767 + }, + { + "epoch": 1.12, + "grad_norm": 0.5210204720497131, + "learning_rate": 0.0004174684864992054, + "loss": 3.2307, + "step": 22768 + }, + { + "epoch": 1.12, + "grad_norm": 0.5359011292457581, + "learning_rate": 0.0004174543195108459, + "loss": 2.9444, + "step": 22769 + }, + { + "epoch": 1.12, + "grad_norm": 0.5488296747207642, + "learning_rate": 0.00041744015221313423, + "loss": 3.263, + "step": 22770 + }, + { + "epoch": 1.12, + "grad_norm": 0.5671960115432739, + "learning_rate": 0.0004174259846061078, + "loss": 2.9674, + "step": 22771 + }, + { + "epoch": 1.12, + "grad_norm": 0.5221547484397888, + "learning_rate": 0.00041741181668980386, + "loss": 2.937, + "step": 22772 + }, + { + "epoch": 1.12, + "grad_norm": 0.5802128911018372, + "learning_rate": 0.00041739764846425995, + "loss": 3.0267, + "step": 22773 + }, + { + "epoch": 1.12, + "grad_norm": 0.5907021760940552, + "learning_rate": 0.0004173834799295132, + "loss": 3.1479, + "step": 22774 + }, + { + "epoch": 1.12, + "grad_norm": 0.5466511845588684, + "learning_rate": 0.0004173693110856009, + "loss": 2.9301, + "step": 22775 + }, + { + "epoch": 1.12, + "grad_norm": 0.5275335311889648, + "learning_rate": 0.00041735514193256044, + "loss": 3.1554, + "step": 22776 + }, + { + "epoch": 1.12, + "grad_norm": 0.5344946980476379, + "learning_rate": 0.0004173409724704291, + "loss": 3.2402, + "step": 22777 + }, + { + "epoch": 1.12, + "grad_norm": 0.5361987948417664, + "learning_rate": 0.0004173268026992442, + "loss": 3.0482, + "step": 22778 + }, + { + "epoch": 1.12, + "grad_norm": 0.5685098171234131, + "learning_rate": 0.0004173126326190431, + "loss": 3.1086, + "step": 22779 + }, + { + "epoch": 1.12, + "grad_norm": 0.5359070897102356, + "learning_rate": 0.0004172984622298632, + "loss": 3.1898, + "step": 22780 + }, + { + "epoch": 1.12, + "grad_norm": 0.5278506875038147, + "learning_rate": 0.0004172842915317417, + "loss": 3.0062, + "step": 22781 + }, + { + "epoch": 1.12, + "grad_norm": 0.5743756294250488, + "learning_rate": 0.00041727012052471577, + "loss": 3.1249, + "step": 22782 + }, + { + "epoch": 1.12, + "grad_norm": 0.5638933777809143, + "learning_rate": 0.00041725594920882304, + "loss": 2.8144, + "step": 22783 + }, + { + "epoch": 1.12, + "grad_norm": 0.5870307683944702, + "learning_rate": 0.00041724177758410073, + "loss": 3.1692, + "step": 22784 + }, + { + "epoch": 1.12, + "grad_norm": 0.530704140663147, + "learning_rate": 0.0004172276056505861, + "loss": 2.8009, + "step": 22785 + }, + { + "epoch": 1.12, + "grad_norm": 0.5861652493476868, + "learning_rate": 0.00041721343340831655, + "loss": 3.2099, + "step": 22786 + }, + { + "epoch": 1.12, + "grad_norm": 0.5355147123336792, + "learning_rate": 0.0004171992608573293, + "loss": 2.9651, + "step": 22787 + }, + { + "epoch": 1.12, + "grad_norm": 0.5698812007904053, + "learning_rate": 0.0004171850879976618, + "loss": 3.0561, + "step": 22788 + }, + { + "epoch": 1.12, + "grad_norm": 0.5511211156845093, + "learning_rate": 0.0004171709148293514, + "loss": 3.2349, + "step": 22789 + }, + { + "epoch": 1.12, + "grad_norm": 0.5315529108047485, + "learning_rate": 0.00041715674135243524, + "loss": 2.962, + "step": 22790 + }, + { + "epoch": 1.12, + "grad_norm": 0.5924056768417358, + "learning_rate": 0.0004171425675669509, + "loss": 3.0684, + "step": 22791 + }, + { + "epoch": 1.12, + "grad_norm": 0.5384601950645447, + "learning_rate": 0.00041712839347293556, + "loss": 3.1687, + "step": 22792 + }, + { + "epoch": 1.12, + "grad_norm": 0.5141586065292358, + "learning_rate": 0.0004171142190704264, + "loss": 3.0498, + "step": 22793 + }, + { + "epoch": 1.12, + "grad_norm": 0.5134872794151306, + "learning_rate": 0.00041710004435946104, + "loss": 3.0604, + "step": 22794 + }, + { + "epoch": 1.12, + "grad_norm": 0.5418192744255066, + "learning_rate": 0.0004170858693400768, + "loss": 3.0924, + "step": 22795 + }, + { + "epoch": 1.12, + "grad_norm": 0.5468102097511292, + "learning_rate": 0.0004170716940123108, + "loss": 2.9778, + "step": 22796 + }, + { + "epoch": 1.12, + "grad_norm": 0.519443929195404, + "learning_rate": 0.00041705751837620055, + "loss": 3.0669, + "step": 22797 + }, + { + "epoch": 1.12, + "grad_norm": 0.534935712814331, + "learning_rate": 0.0004170433424317832, + "loss": 3.0799, + "step": 22798 + }, + { + "epoch": 1.12, + "grad_norm": 0.5599324703216553, + "learning_rate": 0.00041702916617909633, + "loss": 3.2856, + "step": 22799 + }, + { + "epoch": 1.12, + "grad_norm": 0.5286010503768921, + "learning_rate": 0.0004170149896181771, + "loss": 2.9179, + "step": 22800 + }, + { + "epoch": 1.12, + "grad_norm": 0.5498127341270447, + "learning_rate": 0.000417000812749063, + "loss": 2.9982, + "step": 22801 + }, + { + "epoch": 1.12, + "grad_norm": 0.5402305126190186, + "learning_rate": 0.00041698663557179126, + "loss": 2.922, + "step": 22802 + }, + { + "epoch": 1.12, + "grad_norm": 0.5140150785446167, + "learning_rate": 0.0004169724580863991, + "loss": 3.1083, + "step": 22803 + }, + { + "epoch": 1.12, + "grad_norm": 0.518584132194519, + "learning_rate": 0.00041695828029292406, + "loss": 3.1583, + "step": 22804 + }, + { + "epoch": 1.12, + "grad_norm": 0.5516182780265808, + "learning_rate": 0.00041694410219140344, + "loss": 3.2764, + "step": 22805 + }, + { + "epoch": 1.12, + "grad_norm": 0.5740772485733032, + "learning_rate": 0.0004169299237818747, + "loss": 3.0374, + "step": 22806 + }, + { + "epoch": 1.12, + "grad_norm": 0.5997231602668762, + "learning_rate": 0.00041691574506437485, + "loss": 3.056, + "step": 22807 + }, + { + "epoch": 1.12, + "grad_norm": 0.566772997379303, + "learning_rate": 0.0004169015660389414, + "loss": 3.1544, + "step": 22808 + }, + { + "epoch": 1.12, + "grad_norm": 0.5957667827606201, + "learning_rate": 0.0004168873867056119, + "loss": 3.0864, + "step": 22809 + }, + { + "epoch": 1.12, + "grad_norm": 0.5558214783668518, + "learning_rate": 0.00041687320706442345, + "loss": 3.0475, + "step": 22810 + }, + { + "epoch": 1.12, + "grad_norm": 0.5514797568321228, + "learning_rate": 0.0004168590271154134, + "loss": 3.1521, + "step": 22811 + }, + { + "epoch": 1.12, + "grad_norm": 0.5616558194160461, + "learning_rate": 0.0004168448468586192, + "loss": 2.9716, + "step": 22812 + }, + { + "epoch": 1.12, + "grad_norm": 0.555022120475769, + "learning_rate": 0.00041683066629407816, + "loss": 2.985, + "step": 22813 + }, + { + "epoch": 1.12, + "grad_norm": 0.5655428767204285, + "learning_rate": 0.00041681648542182765, + "loss": 3.1157, + "step": 22814 + }, + { + "epoch": 1.12, + "grad_norm": 0.5333276391029358, + "learning_rate": 0.00041680230424190497, + "loss": 3.1813, + "step": 22815 + }, + { + "epoch": 1.12, + "grad_norm": 0.5510712265968323, + "learning_rate": 0.0004167881227543475, + "loss": 2.9741, + "step": 22816 + }, + { + "epoch": 1.12, + "grad_norm": 0.5697930455207825, + "learning_rate": 0.0004167739409591927, + "loss": 3.3855, + "step": 22817 + }, + { + "epoch": 1.12, + "grad_norm": 0.5547995567321777, + "learning_rate": 0.0004167597588564776, + "loss": 3.1625, + "step": 22818 + }, + { + "epoch": 1.12, + "grad_norm": 0.5366219878196716, + "learning_rate": 0.00041674557644623986, + "loss": 3.1218, + "step": 22819 + }, + { + "epoch": 1.12, + "grad_norm": 0.5644329786300659, + "learning_rate": 0.0004167313937285169, + "loss": 3.2227, + "step": 22820 + }, + { + "epoch": 1.12, + "grad_norm": 0.5153218507766724, + "learning_rate": 0.0004167172107033457, + "loss": 3.1479, + "step": 22821 + }, + { + "epoch": 1.12, + "grad_norm": 0.5740698575973511, + "learning_rate": 0.00041670302737076395, + "loss": 3.0745, + "step": 22822 + }, + { + "epoch": 1.12, + "grad_norm": 0.5266560316085815, + "learning_rate": 0.00041668884373080876, + "loss": 3.0566, + "step": 22823 + }, + { + "epoch": 1.12, + "grad_norm": 0.5471863150596619, + "learning_rate": 0.0004166746597835178, + "loss": 3.0059, + "step": 22824 + }, + { + "epoch": 1.12, + "grad_norm": 0.53224116563797, + "learning_rate": 0.00041666047552892815, + "loss": 3.1979, + "step": 22825 + }, + { + "epoch": 1.12, + "grad_norm": 0.5300314426422119, + "learning_rate": 0.00041664629096707716, + "loss": 3.0605, + "step": 22826 + }, + { + "epoch": 1.12, + "grad_norm": 0.536897599697113, + "learning_rate": 0.00041663210609800246, + "loss": 3.075, + "step": 22827 + }, + { + "epoch": 1.12, + "grad_norm": 0.5496629476547241, + "learning_rate": 0.0004166179209217411, + "loss": 3.1034, + "step": 22828 + }, + { + "epoch": 1.12, + "grad_norm": 0.5242640376091003, + "learning_rate": 0.00041660373543833066, + "loss": 3.0961, + "step": 22829 + }, + { + "epoch": 1.12, + "grad_norm": 0.5705289840698242, + "learning_rate": 0.00041658954964780837, + "loss": 3.0197, + "step": 22830 + }, + { + "epoch": 1.12, + "grad_norm": 0.5280457139015198, + "learning_rate": 0.0004165753635502118, + "loss": 3.0366, + "step": 22831 + }, + { + "epoch": 1.12, + "grad_norm": 0.5403161644935608, + "learning_rate": 0.000416561177145578, + "loss": 3.0601, + "step": 22832 + }, + { + "epoch": 1.12, + "grad_norm": 0.5531915426254272, + "learning_rate": 0.0004165469904339445, + "loss": 3.1521, + "step": 22833 + }, + { + "epoch": 1.12, + "grad_norm": 0.5562331080436707, + "learning_rate": 0.0004165328034153486, + "loss": 3.1089, + "step": 22834 + }, + { + "epoch": 1.12, + "grad_norm": 0.5130084156990051, + "learning_rate": 0.00041651861608982786, + "loss": 3.2384, + "step": 22835 + }, + { + "epoch": 1.12, + "grad_norm": 0.5995485186576843, + "learning_rate": 0.00041650442845741944, + "loss": 2.9602, + "step": 22836 + }, + { + "epoch": 1.12, + "grad_norm": 0.5520250201225281, + "learning_rate": 0.00041649024051816077, + "loss": 2.9841, + "step": 22837 + }, + { + "epoch": 1.12, + "grad_norm": 0.5484051704406738, + "learning_rate": 0.0004164760522720892, + "loss": 3.0843, + "step": 22838 + }, + { + "epoch": 1.12, + "grad_norm": 0.5702111124992371, + "learning_rate": 0.0004164618637192422, + "loss": 3.1287, + "step": 22839 + }, + { + "epoch": 1.12, + "grad_norm": 0.5361772179603577, + "learning_rate": 0.00041644767485965705, + "loss": 2.8899, + "step": 22840 + }, + { + "epoch": 1.12, + "grad_norm": 0.5582685470581055, + "learning_rate": 0.0004164334856933711, + "loss": 3.2705, + "step": 22841 + }, + { + "epoch": 1.12, + "grad_norm": 0.5814189910888672, + "learning_rate": 0.00041641929622042185, + "loss": 2.9787, + "step": 22842 + }, + { + "epoch": 1.12, + "grad_norm": 0.543645977973938, + "learning_rate": 0.0004164051064408465, + "loss": 2.9174, + "step": 22843 + }, + { + "epoch": 1.12, + "grad_norm": 0.5395931601524353, + "learning_rate": 0.0004163909163546824, + "loss": 2.9608, + "step": 22844 + }, + { + "epoch": 1.12, + "grad_norm": 0.5268449783325195, + "learning_rate": 0.0004163767259619671, + "loss": 3.1027, + "step": 22845 + }, + { + "epoch": 1.12, + "grad_norm": 0.5362355709075928, + "learning_rate": 0.0004163625352627381, + "loss": 2.9239, + "step": 22846 + }, + { + "epoch": 1.12, + "grad_norm": 0.5752016305923462, + "learning_rate": 0.00041634834425703236, + "loss": 3.0805, + "step": 22847 + }, + { + "epoch": 1.12, + "grad_norm": 0.5360502004623413, + "learning_rate": 0.0004163341529448875, + "loss": 2.909, + "step": 22848 + }, + { + "epoch": 1.12, + "grad_norm": 0.5390860438346863, + "learning_rate": 0.00041631996132634086, + "loss": 3.0375, + "step": 22849 + }, + { + "epoch": 1.12, + "grad_norm": 0.5611980557441711, + "learning_rate": 0.0004163057694014299, + "loss": 3.1001, + "step": 22850 + }, + { + "epoch": 1.12, + "grad_norm": 0.5395224094390869, + "learning_rate": 0.00041629157717019185, + "loss": 3.2519, + "step": 22851 + }, + { + "epoch": 1.12, + "grad_norm": 0.5417009592056274, + "learning_rate": 0.00041627738463266426, + "loss": 2.8745, + "step": 22852 + }, + { + "epoch": 1.12, + "grad_norm": 0.5183812379837036, + "learning_rate": 0.0004162631917888844, + "loss": 3.1916, + "step": 22853 + }, + { + "epoch": 1.12, + "grad_norm": 0.5822927355766296, + "learning_rate": 0.0004162489986388896, + "loss": 3.0131, + "step": 22854 + }, + { + "epoch": 1.12, + "grad_norm": 0.5340526103973389, + "learning_rate": 0.0004162348051827173, + "loss": 3.151, + "step": 22855 + }, + { + "epoch": 1.12, + "grad_norm": 0.5964843034744263, + "learning_rate": 0.00041622061142040494, + "loss": 3.1859, + "step": 22856 + }, + { + "epoch": 1.12, + "grad_norm": 0.5237846970558167, + "learning_rate": 0.0004162064173519899, + "loss": 2.9817, + "step": 22857 + }, + { + "epoch": 1.12, + "grad_norm": 0.5963344573974609, + "learning_rate": 0.0004161922229775094, + "loss": 3.1043, + "step": 22858 + }, + { + "epoch": 1.12, + "grad_norm": 0.5289567708969116, + "learning_rate": 0.00041617802829700104, + "loss": 3.0147, + "step": 22859 + }, + { + "epoch": 1.12, + "grad_norm": 0.545599102973938, + "learning_rate": 0.00041616383331050214, + "loss": 3.104, + "step": 22860 + }, + { + "epoch": 1.12, + "grad_norm": 0.5368244647979736, + "learning_rate": 0.00041614963801804995, + "loss": 2.9397, + "step": 22861 + }, + { + "epoch": 1.12, + "grad_norm": 0.5587390661239624, + "learning_rate": 0.00041613544241968204, + "loss": 3.1038, + "step": 22862 + }, + { + "epoch": 1.12, + "grad_norm": 0.5761477947235107, + "learning_rate": 0.0004161212465154357, + "loss": 3.1405, + "step": 22863 + }, + { + "epoch": 1.12, + "grad_norm": 0.5684610605239868, + "learning_rate": 0.00041610705030534835, + "loss": 3.0149, + "step": 22864 + }, + { + "epoch": 1.12, + "grad_norm": 0.5535710453987122, + "learning_rate": 0.0004160928537894574, + "loss": 3.1474, + "step": 22865 + }, + { + "epoch": 1.12, + "grad_norm": 0.5354270339012146, + "learning_rate": 0.00041607865696780016, + "loss": 2.9943, + "step": 22866 + }, + { + "epoch": 1.12, + "grad_norm": 0.5734537839889526, + "learning_rate": 0.0004160644598404141, + "loss": 3.0333, + "step": 22867 + }, + { + "epoch": 1.12, + "grad_norm": 0.5363245010375977, + "learning_rate": 0.0004160502624073367, + "loss": 3.1273, + "step": 22868 + }, + { + "epoch": 1.12, + "grad_norm": 0.5657275319099426, + "learning_rate": 0.0004160360646686051, + "loss": 3.1475, + "step": 22869 + }, + { + "epoch": 1.12, + "grad_norm": 0.5877406001091003, + "learning_rate": 0.0004160218666242569, + "loss": 3.1525, + "step": 22870 + }, + { + "epoch": 1.12, + "grad_norm": 0.5854254961013794, + "learning_rate": 0.0004160076682743294, + "loss": 3.343, + "step": 22871 + }, + { + "epoch": 1.12, + "grad_norm": 0.4987305700778961, + "learning_rate": 0.00041599346961886005, + "loss": 3.1971, + "step": 22872 + }, + { + "epoch": 1.12, + "grad_norm": 0.5421817302703857, + "learning_rate": 0.0004159792706578862, + "loss": 3.1557, + "step": 22873 + }, + { + "epoch": 1.12, + "grad_norm": 0.5647212862968445, + "learning_rate": 0.0004159650713914453, + "loss": 3.059, + "step": 22874 + }, + { + "epoch": 1.12, + "grad_norm": 0.5287631154060364, + "learning_rate": 0.0004159508718195747, + "loss": 3.1896, + "step": 22875 + }, + { + "epoch": 1.12, + "grad_norm": 0.5936976671218872, + "learning_rate": 0.0004159366719423118, + "loss": 3.188, + "step": 22876 + }, + { + "epoch": 1.12, + "grad_norm": 0.5288457870483398, + "learning_rate": 0.00041592247175969413, + "loss": 3.1293, + "step": 22877 + }, + { + "epoch": 1.12, + "grad_norm": 0.5637390613555908, + "learning_rate": 0.00041590827127175884, + "loss": 3.1525, + "step": 22878 + }, + { + "epoch": 1.12, + "grad_norm": 0.5691234469413757, + "learning_rate": 0.00041589407047854356, + "loss": 3.1165, + "step": 22879 + }, + { + "epoch": 1.12, + "grad_norm": 0.5463374853134155, + "learning_rate": 0.0004158798693800855, + "loss": 3.185, + "step": 22880 + }, + { + "epoch": 1.12, + "grad_norm": 0.5565144419670105, + "learning_rate": 0.0004158656679764223, + "loss": 3.179, + "step": 22881 + }, + { + "epoch": 1.12, + "grad_norm": 0.5509166717529297, + "learning_rate": 0.0004158514662675911, + "loss": 3.1439, + "step": 22882 + }, + { + "epoch": 1.12, + "grad_norm": 0.5458000302314758, + "learning_rate": 0.00041583726425362957, + "loss": 2.9293, + "step": 22883 + }, + { + "epoch": 1.12, + "grad_norm": 0.536400318145752, + "learning_rate": 0.0004158230619345749, + "loss": 2.9197, + "step": 22884 + }, + { + "epoch": 1.12, + "grad_norm": 0.5528376698493958, + "learning_rate": 0.0004158088593104645, + "loss": 3.1646, + "step": 22885 + }, + { + "epoch": 1.12, + "grad_norm": 0.5160115361213684, + "learning_rate": 0.000415794656381336, + "loss": 3.0597, + "step": 22886 + }, + { + "epoch": 1.12, + "grad_norm": 0.5468571782112122, + "learning_rate": 0.0004157804531472266, + "loss": 3.257, + "step": 22887 + }, + { + "epoch": 1.12, + "grad_norm": 0.5390594601631165, + "learning_rate": 0.0004157662496081738, + "loss": 3.2257, + "step": 22888 + }, + { + "epoch": 1.12, + "grad_norm": 0.5498918890953064, + "learning_rate": 0.00041575204576421493, + "loss": 3.1676, + "step": 22889 + }, + { + "epoch": 1.12, + "grad_norm": 0.5376027822494507, + "learning_rate": 0.0004157378416153874, + "loss": 3.087, + "step": 22890 + }, + { + "epoch": 1.12, + "grad_norm": 0.5271984934806824, + "learning_rate": 0.00041572363716172876, + "loss": 3.092, + "step": 22891 + }, + { + "epoch": 1.12, + "grad_norm": 0.5507243275642395, + "learning_rate": 0.00041570943240327627, + "loss": 3.0022, + "step": 22892 + }, + { + "epoch": 1.12, + "grad_norm": 0.5260735154151917, + "learning_rate": 0.0004156952273400675, + "loss": 3.0662, + "step": 22893 + }, + { + "epoch": 1.12, + "grad_norm": 0.5336798429489136, + "learning_rate": 0.0004156810219721397, + "loss": 3.1374, + "step": 22894 + }, + { + "epoch": 1.12, + "grad_norm": 0.5313898324966431, + "learning_rate": 0.0004156668162995303, + "loss": 3.1071, + "step": 22895 + }, + { + "epoch": 1.12, + "grad_norm": 0.5449457764625549, + "learning_rate": 0.0004156526103222768, + "loss": 3.2074, + "step": 22896 + }, + { + "epoch": 1.12, + "grad_norm": 0.5850553512573242, + "learning_rate": 0.00041563840404041665, + "loss": 2.977, + "step": 22897 + }, + { + "epoch": 1.12, + "grad_norm": 0.5299965143203735, + "learning_rate": 0.00041562419745398715, + "loss": 3.154, + "step": 22898 + }, + { + "epoch": 1.12, + "grad_norm": 0.5133044719696045, + "learning_rate": 0.00041560999056302575, + "loss": 3.0494, + "step": 22899 + }, + { + "epoch": 1.12, + "grad_norm": 0.6028819680213928, + "learning_rate": 0.0004155957833675699, + "loss": 2.9978, + "step": 22900 + }, + { + "epoch": 1.12, + "grad_norm": 0.5400010347366333, + "learning_rate": 0.00041558157586765704, + "loss": 3.2693, + "step": 22901 + }, + { + "epoch": 1.12, + "grad_norm": 0.5579959750175476, + "learning_rate": 0.0004155673680633245, + "loss": 3.1617, + "step": 22902 + }, + { + "epoch": 1.12, + "grad_norm": 0.5807639360427856, + "learning_rate": 0.00041555315995460976, + "loss": 3.0796, + "step": 22903 + }, + { + "epoch": 1.12, + "grad_norm": 0.547631025314331, + "learning_rate": 0.0004155389515415503, + "loss": 3.0593, + "step": 22904 + }, + { + "epoch": 1.12, + "grad_norm": 0.5586033463478088, + "learning_rate": 0.00041552474282418336, + "loss": 3.0915, + "step": 22905 + }, + { + "epoch": 1.12, + "grad_norm": 0.5403106212615967, + "learning_rate": 0.00041551053380254656, + "loss": 3.0695, + "step": 22906 + }, + { + "epoch": 1.12, + "grad_norm": 0.5964490175247192, + "learning_rate": 0.0004154963244766772, + "loss": 2.9246, + "step": 22907 + }, + { + "epoch": 1.12, + "grad_norm": 0.6330263018608093, + "learning_rate": 0.00041548211484661286, + "loss": 3.1909, + "step": 22908 + }, + { + "epoch": 1.12, + "grad_norm": 0.5445305109024048, + "learning_rate": 0.0004154679049123908, + "loss": 3.2143, + "step": 22909 + }, + { + "epoch": 1.12, + "grad_norm": 0.49582967162132263, + "learning_rate": 0.0004154536946740484, + "loss": 3.0744, + "step": 22910 + }, + { + "epoch": 1.12, + "grad_norm": 0.6021095514297485, + "learning_rate": 0.0004154394841316233, + "loss": 3.1623, + "step": 22911 + }, + { + "epoch": 1.12, + "grad_norm": 0.5545721650123596, + "learning_rate": 0.00041542527328515273, + "loss": 3.0835, + "step": 22912 + }, + { + "epoch": 1.12, + "grad_norm": 0.5633933544158936, + "learning_rate": 0.0004154110621346743, + "loss": 3.0356, + "step": 22913 + }, + { + "epoch": 1.12, + "grad_norm": 0.5791369676589966, + "learning_rate": 0.00041539685068022527, + "loss": 3.1277, + "step": 22914 + }, + { + "epoch": 1.12, + "grad_norm": 0.588784396648407, + "learning_rate": 0.0004153826389218432, + "loss": 3.1858, + "step": 22915 + }, + { + "epoch": 1.12, + "grad_norm": 0.5584728121757507, + "learning_rate": 0.0004153684268595654, + "loss": 2.9936, + "step": 22916 + }, + { + "epoch": 1.12, + "grad_norm": 0.6015306711196899, + "learning_rate": 0.00041535421449342936, + "loss": 3.0412, + "step": 22917 + }, + { + "epoch": 1.12, + "grad_norm": 0.6056309342384338, + "learning_rate": 0.00041534000182347257, + "loss": 2.8666, + "step": 22918 + }, + { + "epoch": 1.12, + "grad_norm": 0.5673311948776245, + "learning_rate": 0.00041532578884973247, + "loss": 3.0582, + "step": 22919 + }, + { + "epoch": 1.12, + "grad_norm": 0.5334855318069458, + "learning_rate": 0.0004153115755722464, + "loss": 3.0588, + "step": 22920 + }, + { + "epoch": 1.12, + "grad_norm": 0.589484453201294, + "learning_rate": 0.0004152973619910517, + "loss": 3.1469, + "step": 22921 + }, + { + "epoch": 1.12, + "grad_norm": 0.5717774629592896, + "learning_rate": 0.0004152831481061861, + "loss": 2.9427, + "step": 22922 + }, + { + "epoch": 1.12, + "grad_norm": 0.6408734321594238, + "learning_rate": 0.00041526893391768683, + "loss": 3.0572, + "step": 22923 + }, + { + "epoch": 1.12, + "grad_norm": 0.5661062002182007, + "learning_rate": 0.0004152547194255913, + "loss": 2.8823, + "step": 22924 + }, + { + "epoch": 1.12, + "grad_norm": 0.5968921780586243, + "learning_rate": 0.0004152405046299371, + "loss": 3.0135, + "step": 22925 + }, + { + "epoch": 1.12, + "grad_norm": 0.5666815638542175, + "learning_rate": 0.0004152262895307616, + "loss": 3.0244, + "step": 22926 + }, + { + "epoch": 1.12, + "grad_norm": 0.5514220595359802, + "learning_rate": 0.00041521207412810213, + "loss": 2.9396, + "step": 22927 + }, + { + "epoch": 1.12, + "grad_norm": 0.5360330939292908, + "learning_rate": 0.0004151978584219963, + "loss": 3.1101, + "step": 22928 + }, + { + "epoch": 1.12, + "grad_norm": 0.5596000552177429, + "learning_rate": 0.0004151836424124815, + "loss": 3.379, + "step": 22929 + }, + { + "epoch": 1.12, + "grad_norm": 0.5417352318763733, + "learning_rate": 0.0004151694260995952, + "loss": 3.0873, + "step": 22930 + }, + { + "epoch": 1.12, + "grad_norm": 0.4918670952320099, + "learning_rate": 0.0004151552094833746, + "loss": 3.1007, + "step": 22931 + }, + { + "epoch": 1.12, + "grad_norm": 0.5756099224090576, + "learning_rate": 0.00041514099256385753, + "loss": 3.0216, + "step": 22932 + }, + { + "epoch": 1.12, + "grad_norm": 0.5461629033088684, + "learning_rate": 0.0004151267753410812, + "loss": 3.107, + "step": 22933 + }, + { + "epoch": 1.12, + "grad_norm": 0.5471493005752563, + "learning_rate": 0.00041511255781508306, + "loss": 3.1213, + "step": 22934 + }, + { + "epoch": 1.12, + "grad_norm": 0.600498616695404, + "learning_rate": 0.00041509833998590063, + "loss": 3.1274, + "step": 22935 + }, + { + "epoch": 1.12, + "grad_norm": 0.5208067297935486, + "learning_rate": 0.00041508412185357126, + "loss": 3.1673, + "step": 22936 + }, + { + "epoch": 1.12, + "grad_norm": 0.5667810440063477, + "learning_rate": 0.00041506990341813257, + "loss": 3.0739, + "step": 22937 + }, + { + "epoch": 1.12, + "grad_norm": 0.5090360045433044, + "learning_rate": 0.00041505568467962187, + "loss": 3.2307, + "step": 22938 + }, + { + "epoch": 1.12, + "grad_norm": 0.5476797819137573, + "learning_rate": 0.0004150414656380766, + "loss": 3.0299, + "step": 22939 + }, + { + "epoch": 1.12, + "grad_norm": 0.5608437657356262, + "learning_rate": 0.00041502724629353423, + "loss": 3.0124, + "step": 22940 + }, + { + "epoch": 1.12, + "grad_norm": 0.5343563556671143, + "learning_rate": 0.0004150130266460322, + "loss": 3.0477, + "step": 22941 + }, + { + "epoch": 1.12, + "grad_norm": 0.5381127595901489, + "learning_rate": 0.0004149988066956081, + "loss": 3.1718, + "step": 22942 + }, + { + "epoch": 1.12, + "grad_norm": 0.5361109375953674, + "learning_rate": 0.00041498458644229926, + "loss": 3.0919, + "step": 22943 + }, + { + "epoch": 1.12, + "grad_norm": 0.5653386116027832, + "learning_rate": 0.0004149703658861431, + "loss": 3.1964, + "step": 22944 + }, + { + "epoch": 1.12, + "grad_norm": 0.5344790816307068, + "learning_rate": 0.0004149561450271772, + "loss": 2.9937, + "step": 22945 + }, + { + "epoch": 1.12, + "grad_norm": 0.5467535257339478, + "learning_rate": 0.00041494192386543885, + "loss": 3.1078, + "step": 22946 + }, + { + "epoch": 1.12, + "grad_norm": 0.6098953485488892, + "learning_rate": 0.0004149277024009655, + "loss": 3.178, + "step": 22947 + }, + { + "epoch": 1.12, + "grad_norm": 0.5292100310325623, + "learning_rate": 0.00041491348063379496, + "loss": 3.0706, + "step": 22948 + }, + { + "epoch": 1.12, + "grad_norm": 0.5830525159835815, + "learning_rate": 0.0004148992585639643, + "loss": 3.1763, + "step": 22949 + }, + { + "epoch": 1.12, + "grad_norm": 0.5466551184654236, + "learning_rate": 0.00041488503619151103, + "loss": 2.918, + "step": 22950 + }, + { + "epoch": 1.12, + "grad_norm": 0.5626237392425537, + "learning_rate": 0.00041487081351647274, + "loss": 2.8359, + "step": 22951 + }, + { + "epoch": 1.12, + "grad_norm": 0.6049579977989197, + "learning_rate": 0.00041485659053888696, + "loss": 3.1318, + "step": 22952 + }, + { + "epoch": 1.12, + "grad_norm": 0.5381505489349365, + "learning_rate": 0.0004148423672587908, + "loss": 3.1775, + "step": 22953 + }, + { + "epoch": 1.12, + "grad_norm": 0.5498600006103516, + "learning_rate": 0.0004148281436762221, + "loss": 3.1253, + "step": 22954 + }, + { + "epoch": 1.12, + "grad_norm": 0.5618240237236023, + "learning_rate": 0.00041481391979121817, + "loss": 2.8616, + "step": 22955 + }, + { + "epoch": 1.13, + "grad_norm": 0.5557640194892883, + "learning_rate": 0.00041479969560381645, + "loss": 3.1876, + "step": 22956 + }, + { + "epoch": 1.13, + "grad_norm": 0.5765591859817505, + "learning_rate": 0.0004147854711140543, + "loss": 3.2525, + "step": 22957 + }, + { + "epoch": 1.13, + "grad_norm": 0.5476953983306885, + "learning_rate": 0.0004147712463219694, + "loss": 2.9851, + "step": 22958 + }, + { + "epoch": 1.13, + "grad_norm": 0.5657317042350769, + "learning_rate": 0.00041475702122759924, + "loss": 2.9882, + "step": 22959 + }, + { + "epoch": 1.13, + "grad_norm": 0.5439834594726562, + "learning_rate": 0.00041474279583098104, + "loss": 3.3056, + "step": 22960 + }, + { + "epoch": 1.13, + "grad_norm": 0.5226112604141235, + "learning_rate": 0.0004147285701321525, + "loss": 3.0917, + "step": 22961 + }, + { + "epoch": 1.13, + "grad_norm": 0.5537917613983154, + "learning_rate": 0.0004147143441311509, + "loss": 3.0329, + "step": 22962 + }, + { + "epoch": 1.13, + "grad_norm": 0.5705544948577881, + "learning_rate": 0.00041470011782801373, + "loss": 3.3239, + "step": 22963 + }, + { + "epoch": 1.13, + "grad_norm": 0.5459992289543152, + "learning_rate": 0.0004146858912227786, + "loss": 3.0937, + "step": 22964 + }, + { + "epoch": 1.13, + "grad_norm": 0.6103513240814209, + "learning_rate": 0.00041467166431548297, + "loss": 3.2497, + "step": 22965 + }, + { + "epoch": 1.13, + "grad_norm": 0.591571033000946, + "learning_rate": 0.00041465743710616417, + "loss": 3.1043, + "step": 22966 + }, + { + "epoch": 1.13, + "grad_norm": 0.5303111672401428, + "learning_rate": 0.00041464320959485977, + "loss": 3.111, + "step": 22967 + }, + { + "epoch": 1.13, + "grad_norm": 0.5746419429779053, + "learning_rate": 0.00041462898178160713, + "loss": 2.8603, + "step": 22968 + }, + { + "epoch": 1.13, + "grad_norm": 0.5438621044158936, + "learning_rate": 0.00041461475366644387, + "loss": 3.2288, + "step": 22969 + }, + { + "epoch": 1.13, + "grad_norm": 0.6318126320838928, + "learning_rate": 0.00041460052524940755, + "loss": 3.101, + "step": 22970 + }, + { + "epoch": 1.13, + "grad_norm": 0.549228847026825, + "learning_rate": 0.00041458629653053536, + "loss": 3.118, + "step": 22971 + }, + { + "epoch": 1.13, + "grad_norm": 0.5160179138183594, + "learning_rate": 0.0004145720675098649, + "loss": 2.8717, + "step": 22972 + }, + { + "epoch": 1.13, + "grad_norm": 0.5238551497459412, + "learning_rate": 0.0004145578381874337, + "loss": 3.0346, + "step": 22973 + }, + { + "epoch": 1.13, + "grad_norm": 0.5568090677261353, + "learning_rate": 0.00041454360856327927, + "loss": 3.0845, + "step": 22974 + }, + { + "epoch": 1.13, + "grad_norm": 0.5128166079521179, + "learning_rate": 0.0004145293786374389, + "loss": 3.0882, + "step": 22975 + }, + { + "epoch": 1.13, + "grad_norm": 0.5460500717163086, + "learning_rate": 0.00041451514840995025, + "loss": 3.252, + "step": 22976 + }, + { + "epoch": 1.13, + "grad_norm": 0.5138880610466003, + "learning_rate": 0.00041450091788085075, + "loss": 3.0992, + "step": 22977 + }, + { + "epoch": 1.13, + "grad_norm": 0.5311721563339233, + "learning_rate": 0.0004144866870501778, + "loss": 3.0771, + "step": 22978 + }, + { + "epoch": 1.13, + "grad_norm": 0.5450507402420044, + "learning_rate": 0.000414472455917969, + "loss": 3.0126, + "step": 22979 + }, + { + "epoch": 1.13, + "grad_norm": 0.5414109230041504, + "learning_rate": 0.0004144582244842618, + "loss": 3.1002, + "step": 22980 + }, + { + "epoch": 1.13, + "grad_norm": 0.5404634475708008, + "learning_rate": 0.0004144439927490937, + "loss": 2.9594, + "step": 22981 + }, + { + "epoch": 1.13, + "grad_norm": 0.564774751663208, + "learning_rate": 0.000414429760712502, + "loss": 3.2406, + "step": 22982 + }, + { + "epoch": 1.13, + "grad_norm": 0.5353189706802368, + "learning_rate": 0.0004144155283745244, + "loss": 2.8896, + "step": 22983 + }, + { + "epoch": 1.13, + "grad_norm": 0.5611631274223328, + "learning_rate": 0.00041440129573519843, + "loss": 2.9384, + "step": 22984 + }, + { + "epoch": 1.13, + "grad_norm": 0.5437825918197632, + "learning_rate": 0.00041438706279456133, + "loss": 3.0938, + "step": 22985 + }, + { + "epoch": 1.13, + "grad_norm": 0.5270655155181885, + "learning_rate": 0.0004143728295526508, + "loss": 3.125, + "step": 22986 + }, + { + "epoch": 1.13, + "grad_norm": 0.5163417458534241, + "learning_rate": 0.0004143585960095042, + "loss": 3.1141, + "step": 22987 + }, + { + "epoch": 1.13, + "grad_norm": 0.5559436678886414, + "learning_rate": 0.0004143443621651591, + "loss": 3.045, + "step": 22988 + }, + { + "epoch": 1.13, + "grad_norm": 0.5473134517669678, + "learning_rate": 0.00041433012801965296, + "loss": 3.099, + "step": 22989 + }, + { + "epoch": 1.13, + "grad_norm": 0.567561686038971, + "learning_rate": 0.00041431589357302325, + "loss": 3.1221, + "step": 22990 + }, + { + "epoch": 1.13, + "grad_norm": 0.5573049783706665, + "learning_rate": 0.0004143016588253074, + "loss": 3.0608, + "step": 22991 + }, + { + "epoch": 1.13, + "grad_norm": 0.5784956812858582, + "learning_rate": 0.00041428742377654315, + "loss": 3.14, + "step": 22992 + }, + { + "epoch": 1.13, + "grad_norm": 0.5378555655479431, + "learning_rate": 0.0004142731884267676, + "loss": 3.107, + "step": 22993 + }, + { + "epoch": 1.13, + "grad_norm": 0.5544320344924927, + "learning_rate": 0.00041425895277601865, + "loss": 2.9306, + "step": 22994 + }, + { + "epoch": 1.13, + "grad_norm": 0.536886990070343, + "learning_rate": 0.00041424471682433353, + "loss": 3.1313, + "step": 22995 + }, + { + "epoch": 1.13, + "grad_norm": 0.5307275056838989, + "learning_rate": 0.00041423048057174984, + "loss": 2.9803, + "step": 22996 + }, + { + "epoch": 1.13, + "grad_norm": 0.5753107666969299, + "learning_rate": 0.00041421624401830504, + "loss": 3.0398, + "step": 22997 + }, + { + "epoch": 1.13, + "grad_norm": 0.5424846410751343, + "learning_rate": 0.00041420200716403663, + "loss": 3.042, + "step": 22998 + }, + { + "epoch": 1.13, + "grad_norm": 0.549351692199707, + "learning_rate": 0.00041418777000898214, + "loss": 3.035, + "step": 22999 + }, + { + "epoch": 1.13, + "grad_norm": 0.5418702960014343, + "learning_rate": 0.000414173532553179, + "loss": 2.835, + "step": 23000 + }, + { + "epoch": 1.13, + "grad_norm": 0.5061715841293335, + "learning_rate": 0.0004141592947966647, + "loss": 3.0573, + "step": 23001 + }, + { + "epoch": 1.13, + "grad_norm": 0.5275315046310425, + "learning_rate": 0.00041414505673947687, + "loss": 3.2917, + "step": 23002 + }, + { + "epoch": 1.13, + "grad_norm": 0.5400534272193909, + "learning_rate": 0.00041413081838165294, + "loss": 2.8802, + "step": 23003 + }, + { + "epoch": 1.13, + "grad_norm": 0.5462265014648438, + "learning_rate": 0.0004141165797232304, + "loss": 3.0331, + "step": 23004 + }, + { + "epoch": 1.13, + "grad_norm": 0.515167772769928, + "learning_rate": 0.00041410234076424664, + "loss": 3.0803, + "step": 23005 + }, + { + "epoch": 1.13, + "grad_norm": 0.557414174079895, + "learning_rate": 0.0004140881015047394, + "loss": 3.1524, + "step": 23006 + }, + { + "epoch": 1.13, + "grad_norm": 0.5607942342758179, + "learning_rate": 0.000414073861944746, + "loss": 3.1207, + "step": 23007 + }, + { + "epoch": 1.13, + "grad_norm": 0.5383069515228271, + "learning_rate": 0.000414059622084304, + "loss": 3.0531, + "step": 23008 + }, + { + "epoch": 1.13, + "grad_norm": 0.5618544816970825, + "learning_rate": 0.0004140453819234509, + "loss": 3.1155, + "step": 23009 + }, + { + "epoch": 1.13, + "grad_norm": 0.5300397276878357, + "learning_rate": 0.0004140311414622243, + "loss": 3.2335, + "step": 23010 + }, + { + "epoch": 1.13, + "grad_norm": 0.5628443360328674, + "learning_rate": 0.0004140169007006615, + "loss": 3.3326, + "step": 23011 + }, + { + "epoch": 1.13, + "grad_norm": 0.5267999768257141, + "learning_rate": 0.0004140026596388002, + "loss": 3.1844, + "step": 23012 + }, + { + "epoch": 1.13, + "grad_norm": 0.5517027378082275, + "learning_rate": 0.0004139884182766778, + "loss": 3.1886, + "step": 23013 + }, + { + "epoch": 1.13, + "grad_norm": 0.5515194535255432, + "learning_rate": 0.00041397417661433183, + "loss": 3.0972, + "step": 23014 + }, + { + "epoch": 1.13, + "grad_norm": 0.5435203313827515, + "learning_rate": 0.00041395993465179984, + "loss": 3.0578, + "step": 23015 + }, + { + "epoch": 1.13, + "grad_norm": 0.5524570941925049, + "learning_rate": 0.0004139456923891193, + "loss": 3.0469, + "step": 23016 + }, + { + "epoch": 1.13, + "grad_norm": 0.5539202094078064, + "learning_rate": 0.0004139314498263278, + "loss": 3.0198, + "step": 23017 + }, + { + "epoch": 1.13, + "grad_norm": 0.5375218987464905, + "learning_rate": 0.00041391720696346267, + "loss": 3.1131, + "step": 23018 + }, + { + "epoch": 1.13, + "grad_norm": 0.6372843384742737, + "learning_rate": 0.00041390296380056156, + "loss": 3.2209, + "step": 23019 + }, + { + "epoch": 1.13, + "grad_norm": 0.5595908164978027, + "learning_rate": 0.00041388872033766203, + "loss": 3.2019, + "step": 23020 + }, + { + "epoch": 1.13, + "grad_norm": 0.5718174576759338, + "learning_rate": 0.00041387447657480154, + "loss": 3.0502, + "step": 23021 + }, + { + "epoch": 1.13, + "grad_norm": 0.5092378854751587, + "learning_rate": 0.0004138602325120176, + "loss": 3.2092, + "step": 23022 + }, + { + "epoch": 1.13, + "grad_norm": 0.507887601852417, + "learning_rate": 0.0004138459881493476, + "loss": 3.2074, + "step": 23023 + }, + { + "epoch": 1.13, + "grad_norm": 0.502585232257843, + "learning_rate": 0.0004138317434868293, + "loss": 3.1789, + "step": 23024 + }, + { + "epoch": 1.13, + "grad_norm": 0.5779663920402527, + "learning_rate": 0.0004138174985245001, + "loss": 3.0216, + "step": 23025 + }, + { + "epoch": 1.13, + "grad_norm": 0.580353856086731, + "learning_rate": 0.0004138032532623974, + "loss": 3.0584, + "step": 23026 + }, + { + "epoch": 1.13, + "grad_norm": 0.5854467153549194, + "learning_rate": 0.00041378900770055896, + "loss": 3.0347, + "step": 23027 + }, + { + "epoch": 1.13, + "grad_norm": 0.5568118095397949, + "learning_rate": 0.0004137747618390221, + "loss": 2.9267, + "step": 23028 + }, + { + "epoch": 1.13, + "grad_norm": 0.5543918609619141, + "learning_rate": 0.0004137605156778244, + "loss": 3.0005, + "step": 23029 + }, + { + "epoch": 1.13, + "grad_norm": 0.5414887070655823, + "learning_rate": 0.00041374626921700345, + "loss": 3.0285, + "step": 23030 + }, + { + "epoch": 1.13, + "grad_norm": 0.5289766788482666, + "learning_rate": 0.00041373202245659663, + "loss": 2.9147, + "step": 23031 + }, + { + "epoch": 1.13, + "grad_norm": 0.5012800097465515, + "learning_rate": 0.0004137177753966417, + "loss": 2.962, + "step": 23032 + }, + { + "epoch": 1.13, + "grad_norm": 0.5366766452789307, + "learning_rate": 0.0004137035280371759, + "loss": 2.9534, + "step": 23033 + }, + { + "epoch": 1.13, + "grad_norm": 0.524847149848938, + "learning_rate": 0.0004136892803782369, + "loss": 3.2415, + "step": 23034 + }, + { + "epoch": 1.13, + "grad_norm": 0.5566405057907104, + "learning_rate": 0.0004136750324198623, + "loss": 3.0081, + "step": 23035 + }, + { + "epoch": 1.13, + "grad_norm": 0.5404036045074463, + "learning_rate": 0.0004136607841620895, + "loss": 3.1826, + "step": 23036 + }, + { + "epoch": 1.13, + "grad_norm": 0.5708770155906677, + "learning_rate": 0.00041364653560495605, + "loss": 2.9988, + "step": 23037 + }, + { + "epoch": 1.13, + "grad_norm": 0.557307779788971, + "learning_rate": 0.00041363228674849955, + "loss": 2.9934, + "step": 23038 + }, + { + "epoch": 1.13, + "grad_norm": 0.5527191758155823, + "learning_rate": 0.0004136180375927574, + "loss": 3.2225, + "step": 23039 + }, + { + "epoch": 1.13, + "grad_norm": 0.5346764922142029, + "learning_rate": 0.0004136037881377672, + "loss": 3.1241, + "step": 23040 + }, + { + "epoch": 1.13, + "grad_norm": 0.5354068875312805, + "learning_rate": 0.0004135895383835666, + "loss": 3.0392, + "step": 23041 + }, + { + "epoch": 1.13, + "grad_norm": 0.5253552198410034, + "learning_rate": 0.00041357528833019285, + "loss": 3.235, + "step": 23042 + }, + { + "epoch": 1.13, + "grad_norm": 0.5762072205543518, + "learning_rate": 0.0004135610379776838, + "loss": 3.0428, + "step": 23043 + }, + { + "epoch": 1.13, + "grad_norm": 0.554805338382721, + "learning_rate": 0.00041354678732607677, + "loss": 3.0625, + "step": 23044 + }, + { + "epoch": 1.13, + "grad_norm": 0.5384538173675537, + "learning_rate": 0.00041353253637540935, + "loss": 2.9583, + "step": 23045 + }, + { + "epoch": 1.13, + "grad_norm": 0.5378395915031433, + "learning_rate": 0.00041351828512571913, + "loss": 3.2187, + "step": 23046 + }, + { + "epoch": 1.13, + "grad_norm": 0.5567187070846558, + "learning_rate": 0.0004135040335770436, + "loss": 3.2139, + "step": 23047 + }, + { + "epoch": 1.13, + "grad_norm": 0.504665732383728, + "learning_rate": 0.0004134897817294202, + "loss": 3.2191, + "step": 23048 + }, + { + "epoch": 1.13, + "grad_norm": 0.5528210997581482, + "learning_rate": 0.0004134755295828865, + "loss": 3.1749, + "step": 23049 + }, + { + "epoch": 1.13, + "grad_norm": 0.5319205522537231, + "learning_rate": 0.00041346127713748023, + "loss": 3.1229, + "step": 23050 + }, + { + "epoch": 1.13, + "grad_norm": 0.5349425077438354, + "learning_rate": 0.00041344702439323873, + "loss": 2.9538, + "step": 23051 + }, + { + "epoch": 1.13, + "grad_norm": 0.5903375148773193, + "learning_rate": 0.0004134327713501997, + "loss": 3.0624, + "step": 23052 + }, + { + "epoch": 1.13, + "grad_norm": 0.5711473226547241, + "learning_rate": 0.0004134185180084004, + "loss": 3.2639, + "step": 23053 + }, + { + "epoch": 1.13, + "grad_norm": 0.5508195161819458, + "learning_rate": 0.0004134042643678787, + "loss": 2.9987, + "step": 23054 + }, + { + "epoch": 1.13, + "grad_norm": 0.5824445486068726, + "learning_rate": 0.00041339001042867195, + "loss": 3.1495, + "step": 23055 + }, + { + "epoch": 1.13, + "grad_norm": 0.5956960320472717, + "learning_rate": 0.0004133757561908177, + "loss": 3.384, + "step": 23056 + }, + { + "epoch": 1.13, + "grad_norm": 0.5386295318603516, + "learning_rate": 0.0004133615016543536, + "loss": 3.0116, + "step": 23057 + }, + { + "epoch": 1.13, + "grad_norm": 0.5671989321708679, + "learning_rate": 0.0004133472468193171, + "loss": 2.8844, + "step": 23058 + }, + { + "epoch": 1.13, + "grad_norm": 0.5901870727539062, + "learning_rate": 0.00041333299168574565, + "loss": 3.076, + "step": 23059 + }, + { + "epoch": 1.13, + "grad_norm": 0.566315233707428, + "learning_rate": 0.00041331873625367703, + "loss": 3.1643, + "step": 23060 + }, + { + "epoch": 1.13, + "grad_norm": 0.5477898716926575, + "learning_rate": 0.0004133044805231487, + "loss": 3.3274, + "step": 23061 + }, + { + "epoch": 1.13, + "grad_norm": 0.54547119140625, + "learning_rate": 0.0004132902244941981, + "loss": 3.2076, + "step": 23062 + }, + { + "epoch": 1.13, + "grad_norm": 0.5427873730659485, + "learning_rate": 0.0004132759681668629, + "loss": 3.1031, + "step": 23063 + }, + { + "epoch": 1.13, + "grad_norm": 0.6867091655731201, + "learning_rate": 0.0004132617115411805, + "loss": 2.9959, + "step": 23064 + }, + { + "epoch": 1.13, + "grad_norm": 0.6565173864364624, + "learning_rate": 0.00041324745461718867, + "loss": 3.1724, + "step": 23065 + }, + { + "epoch": 1.13, + "grad_norm": 0.5303226709365845, + "learning_rate": 0.0004132331973949248, + "loss": 3.0475, + "step": 23066 + }, + { + "epoch": 1.13, + "grad_norm": 0.537520170211792, + "learning_rate": 0.00041321893987442647, + "loss": 3.1931, + "step": 23067 + }, + { + "epoch": 1.13, + "grad_norm": 0.5558544397354126, + "learning_rate": 0.00041320468205573125, + "loss": 2.9065, + "step": 23068 + }, + { + "epoch": 1.13, + "grad_norm": 0.5704731941223145, + "learning_rate": 0.00041319042393887675, + "loss": 3.0102, + "step": 23069 + }, + { + "epoch": 1.13, + "grad_norm": 0.5413227081298828, + "learning_rate": 0.0004131761655239003, + "loss": 2.8069, + "step": 23070 + }, + { + "epoch": 1.13, + "grad_norm": 0.5648303627967834, + "learning_rate": 0.00041316190681083963, + "loss": 3.0567, + "step": 23071 + }, + { + "epoch": 1.13, + "grad_norm": 0.5538491010665894, + "learning_rate": 0.00041314764779973247, + "loss": 3.1438, + "step": 23072 + }, + { + "epoch": 1.13, + "grad_norm": 0.5491966605186462, + "learning_rate": 0.0004131333884906161, + "loss": 2.889, + "step": 23073 + }, + { + "epoch": 1.13, + "grad_norm": 0.572340190410614, + "learning_rate": 0.000413119128883528, + "loss": 3.1123, + "step": 23074 + }, + { + "epoch": 1.13, + "grad_norm": 0.5277241468429565, + "learning_rate": 0.00041310486897850604, + "loss": 3.0489, + "step": 23075 + }, + { + "epoch": 1.13, + "grad_norm": 0.5485084652900696, + "learning_rate": 0.0004130906087755876, + "loss": 3.134, + "step": 23076 + }, + { + "epoch": 1.13, + "grad_norm": 0.510722815990448, + "learning_rate": 0.0004130763482748102, + "loss": 3.1496, + "step": 23077 + }, + { + "epoch": 1.13, + "grad_norm": 0.5820092558860779, + "learning_rate": 0.0004130620874762115, + "loss": 2.9881, + "step": 23078 + }, + { + "epoch": 1.13, + "grad_norm": 0.5545257329940796, + "learning_rate": 0.00041304782637982903, + "loss": 3.1989, + "step": 23079 + }, + { + "epoch": 1.13, + "grad_norm": 0.5428977012634277, + "learning_rate": 0.00041303356498570036, + "loss": 2.969, + "step": 23080 + }, + { + "epoch": 1.13, + "grad_norm": 0.5307221412658691, + "learning_rate": 0.000413019303293863, + "loss": 3.1404, + "step": 23081 + }, + { + "epoch": 1.13, + "grad_norm": 0.5607012510299683, + "learning_rate": 0.0004130050413043545, + "loss": 3.0869, + "step": 23082 + }, + { + "epoch": 1.13, + "grad_norm": 0.5869443416595459, + "learning_rate": 0.00041299077901721257, + "loss": 3.0999, + "step": 23083 + }, + { + "epoch": 1.13, + "grad_norm": 0.5176264047622681, + "learning_rate": 0.00041297651643247465, + "loss": 2.937, + "step": 23084 + }, + { + "epoch": 1.13, + "grad_norm": 0.5371460318565369, + "learning_rate": 0.00041296225355017826, + "loss": 2.9182, + "step": 23085 + }, + { + "epoch": 1.13, + "grad_norm": 0.5200726985931396, + "learning_rate": 0.00041294799037036107, + "loss": 3.0559, + "step": 23086 + }, + { + "epoch": 1.13, + "grad_norm": 0.5590261220932007, + "learning_rate": 0.0004129337268930607, + "loss": 3.2351, + "step": 23087 + }, + { + "epoch": 1.13, + "grad_norm": 0.5306074023246765, + "learning_rate": 0.00041291946311831446, + "loss": 3.2695, + "step": 23088 + }, + { + "epoch": 1.13, + "grad_norm": 0.5653502345085144, + "learning_rate": 0.0004129051990461602, + "loss": 3.1316, + "step": 23089 + }, + { + "epoch": 1.13, + "grad_norm": 0.5234277248382568, + "learning_rate": 0.00041289093467663536, + "loss": 3.009, + "step": 23090 + }, + { + "epoch": 1.13, + "grad_norm": 0.5736457109451294, + "learning_rate": 0.0004128766700097775, + "loss": 3.1699, + "step": 23091 + }, + { + "epoch": 1.13, + "grad_norm": 0.607955276966095, + "learning_rate": 0.00041286240504562416, + "loss": 2.9498, + "step": 23092 + }, + { + "epoch": 1.13, + "grad_norm": 0.5673606395721436, + "learning_rate": 0.00041284813978421307, + "loss": 3.0839, + "step": 23093 + }, + { + "epoch": 1.13, + "grad_norm": 0.5501077175140381, + "learning_rate": 0.0004128338742255817, + "loss": 3.0898, + "step": 23094 + }, + { + "epoch": 1.13, + "grad_norm": 0.5964276790618896, + "learning_rate": 0.0004128196083697676, + "loss": 3.0892, + "step": 23095 + }, + { + "epoch": 1.13, + "grad_norm": 0.5602797865867615, + "learning_rate": 0.00041280534221680825, + "loss": 3.0006, + "step": 23096 + }, + { + "epoch": 1.13, + "grad_norm": 0.5748618841171265, + "learning_rate": 0.00041279107576674154, + "loss": 2.9523, + "step": 23097 + }, + { + "epoch": 1.13, + "grad_norm": 0.5489750504493713, + "learning_rate": 0.0004127768090196047, + "loss": 3.1644, + "step": 23098 + }, + { + "epoch": 1.13, + "grad_norm": 0.5585864186286926, + "learning_rate": 0.00041276254197543545, + "loss": 3.1808, + "step": 23099 + }, + { + "epoch": 1.13, + "grad_norm": 0.6432881951332092, + "learning_rate": 0.0004127482746342714, + "loss": 2.9174, + "step": 23100 + }, + { + "epoch": 1.13, + "grad_norm": 0.5519260764122009, + "learning_rate": 0.0004127340069961501, + "loss": 3.0454, + "step": 23101 + }, + { + "epoch": 1.13, + "grad_norm": 0.5701799988746643, + "learning_rate": 0.00041271973906110916, + "loss": 3.0101, + "step": 23102 + }, + { + "epoch": 1.13, + "grad_norm": 0.5463408827781677, + "learning_rate": 0.000412705470829186, + "loss": 3.3575, + "step": 23103 + }, + { + "epoch": 1.13, + "grad_norm": 0.5591546297073364, + "learning_rate": 0.0004126912023004184, + "loss": 2.722, + "step": 23104 + }, + { + "epoch": 1.13, + "grad_norm": 0.6106502413749695, + "learning_rate": 0.00041267693347484403, + "loss": 3.2153, + "step": 23105 + }, + { + "epoch": 1.13, + "grad_norm": 0.5445353984832764, + "learning_rate": 0.00041266266435250004, + "loss": 3.1634, + "step": 23106 + }, + { + "epoch": 1.13, + "grad_norm": 0.5435615181922913, + "learning_rate": 0.00041264839493342434, + "loss": 3.0936, + "step": 23107 + }, + { + "epoch": 1.13, + "grad_norm": 0.5400312542915344, + "learning_rate": 0.00041263412521765454, + "loss": 3.1408, + "step": 23108 + }, + { + "epoch": 1.13, + "grad_norm": 0.5756552219390869, + "learning_rate": 0.00041261985520522806, + "loss": 3.2401, + "step": 23109 + }, + { + "epoch": 1.13, + "grad_norm": 0.573711633682251, + "learning_rate": 0.00041260558489618256, + "loss": 3.0456, + "step": 23110 + }, + { + "epoch": 1.13, + "grad_norm": 0.5202263593673706, + "learning_rate": 0.0004125913142905555, + "loss": 2.958, + "step": 23111 + }, + { + "epoch": 1.13, + "grad_norm": 0.5493197441101074, + "learning_rate": 0.0004125770433883848, + "loss": 3.1262, + "step": 23112 + }, + { + "epoch": 1.13, + "grad_norm": 0.5797801613807678, + "learning_rate": 0.00041256277218970774, + "loss": 3.0246, + "step": 23113 + }, + { + "epoch": 1.13, + "grad_norm": 0.5684946179389954, + "learning_rate": 0.000412548500694562, + "loss": 3.1813, + "step": 23114 + }, + { + "epoch": 1.13, + "grad_norm": 0.54192054271698, + "learning_rate": 0.00041253422890298515, + "loss": 2.8924, + "step": 23115 + }, + { + "epoch": 1.13, + "grad_norm": 0.5381801724433899, + "learning_rate": 0.00041251995681501483, + "loss": 3.0613, + "step": 23116 + }, + { + "epoch": 1.13, + "grad_norm": 0.5283708572387695, + "learning_rate": 0.0004125056844306886, + "loss": 3.0999, + "step": 23117 + }, + { + "epoch": 1.13, + "grad_norm": 0.5617417693138123, + "learning_rate": 0.000412491411750044, + "loss": 2.9397, + "step": 23118 + }, + { + "epoch": 1.13, + "grad_norm": 0.6005975604057312, + "learning_rate": 0.0004124771387731188, + "loss": 3.3285, + "step": 23119 + }, + { + "epoch": 1.13, + "grad_norm": 0.5726503729820251, + "learning_rate": 0.00041246286549995035, + "loss": 3.1522, + "step": 23120 + }, + { + "epoch": 1.13, + "grad_norm": 0.5579819679260254, + "learning_rate": 0.0004124485919305763, + "loss": 3.278, + "step": 23121 + }, + { + "epoch": 1.13, + "grad_norm": 0.5398461818695068, + "learning_rate": 0.00041243431806503437, + "loss": 2.8825, + "step": 23122 + }, + { + "epoch": 1.13, + "grad_norm": 0.5483943819999695, + "learning_rate": 0.00041242004390336216, + "loss": 3.1911, + "step": 23123 + }, + { + "epoch": 1.13, + "grad_norm": 0.5343546867370605, + "learning_rate": 0.0004124057694455971, + "loss": 3.0709, + "step": 23124 + }, + { + "epoch": 1.13, + "grad_norm": 0.5518394708633423, + "learning_rate": 0.0004123914946917769, + "loss": 3.088, + "step": 23125 + }, + { + "epoch": 1.13, + "grad_norm": 0.541649580001831, + "learning_rate": 0.00041237721964193914, + "loss": 2.9646, + "step": 23126 + }, + { + "epoch": 1.13, + "grad_norm": 0.5012128949165344, + "learning_rate": 0.00041236294429612143, + "loss": 3.0232, + "step": 23127 + }, + { + "epoch": 1.13, + "grad_norm": 0.5622835755348206, + "learning_rate": 0.0004123486686543613, + "loss": 3.1136, + "step": 23128 + }, + { + "epoch": 1.13, + "grad_norm": 0.5492423176765442, + "learning_rate": 0.00041233439271669644, + "loss": 3.2057, + "step": 23129 + }, + { + "epoch": 1.13, + "grad_norm": 0.5439629554748535, + "learning_rate": 0.0004123201164831644, + "loss": 2.9353, + "step": 23130 + }, + { + "epoch": 1.13, + "grad_norm": 0.5032997727394104, + "learning_rate": 0.0004123058399538027, + "loss": 3.1584, + "step": 23131 + }, + { + "epoch": 1.13, + "grad_norm": 0.5795646905899048, + "learning_rate": 0.0004122915631286492, + "loss": 3.1761, + "step": 23132 + }, + { + "epoch": 1.13, + "grad_norm": 0.5473034381866455, + "learning_rate": 0.0004122772860077412, + "loss": 2.8715, + "step": 23133 + }, + { + "epoch": 1.13, + "grad_norm": 0.5456127524375916, + "learning_rate": 0.0004122630085911165, + "loss": 2.9562, + "step": 23134 + }, + { + "epoch": 1.13, + "grad_norm": 0.5090837478637695, + "learning_rate": 0.00041224873087881265, + "loss": 3.0014, + "step": 23135 + }, + { + "epoch": 1.13, + "grad_norm": 0.5396968126296997, + "learning_rate": 0.00041223445287086715, + "loss": 3.173, + "step": 23136 + }, + { + "epoch": 1.13, + "grad_norm": 0.5504181981086731, + "learning_rate": 0.0004122201745673178, + "loss": 3.0763, + "step": 23137 + }, + { + "epoch": 1.13, + "grad_norm": 0.551391065120697, + "learning_rate": 0.00041220589596820204, + "loss": 3.0412, + "step": 23138 + }, + { + "epoch": 1.13, + "grad_norm": 0.5569359064102173, + "learning_rate": 0.0004121916170735577, + "loss": 3.136, + "step": 23139 + }, + { + "epoch": 1.13, + "grad_norm": 0.5515356063842773, + "learning_rate": 0.00041217733788342204, + "loss": 3.1659, + "step": 23140 + }, + { + "epoch": 1.13, + "grad_norm": 0.5313219428062439, + "learning_rate": 0.0004121630583978329, + "loss": 2.951, + "step": 23141 + }, + { + "epoch": 1.13, + "grad_norm": 0.5136998295783997, + "learning_rate": 0.00041214877861682795, + "loss": 2.9565, + "step": 23142 + }, + { + "epoch": 1.13, + "grad_norm": 0.531518816947937, + "learning_rate": 0.00041213449854044464, + "loss": 3.2081, + "step": 23143 + }, + { + "epoch": 1.13, + "grad_norm": 0.5740218758583069, + "learning_rate": 0.0004121202181687206, + "loss": 3.0908, + "step": 23144 + }, + { + "epoch": 1.13, + "grad_norm": 0.5602694153785706, + "learning_rate": 0.0004121059375016936, + "loss": 3.0052, + "step": 23145 + }, + { + "epoch": 1.13, + "grad_norm": 0.5783454775810242, + "learning_rate": 0.0004120916565394011, + "loss": 3.056, + "step": 23146 + }, + { + "epoch": 1.13, + "grad_norm": 0.5084561705589294, + "learning_rate": 0.00041207737528188056, + "loss": 3.152, + "step": 23147 + }, + { + "epoch": 1.13, + "grad_norm": 0.5585759878158569, + "learning_rate": 0.00041206309372917, + "loss": 3.1573, + "step": 23148 + }, + { + "epoch": 1.13, + "grad_norm": 0.5565611720085144, + "learning_rate": 0.00041204881188130674, + "loss": 3.1032, + "step": 23149 + }, + { + "epoch": 1.13, + "grad_norm": 0.5540475249290466, + "learning_rate": 0.00041203452973832843, + "loss": 2.8712, + "step": 23150 + }, + { + "epoch": 1.13, + "grad_norm": 0.5279893279075623, + "learning_rate": 0.0004120202473002728, + "loss": 3.0523, + "step": 23151 + }, + { + "epoch": 1.13, + "grad_norm": 0.5814106464385986, + "learning_rate": 0.0004120059645671774, + "loss": 3.2728, + "step": 23152 + }, + { + "epoch": 1.13, + "grad_norm": 0.5316987633705139, + "learning_rate": 0.0004119916815390798, + "loss": 3.185, + "step": 23153 + }, + { + "epoch": 1.13, + "grad_norm": 0.5352542996406555, + "learning_rate": 0.00041197739821601767, + "loss": 2.9784, + "step": 23154 + }, + { + "epoch": 1.13, + "grad_norm": 0.587202250957489, + "learning_rate": 0.00041196311459802866, + "loss": 3.1149, + "step": 23155 + }, + { + "epoch": 1.13, + "grad_norm": 0.5219794511795044, + "learning_rate": 0.0004119488306851504, + "loss": 3.1565, + "step": 23156 + }, + { + "epoch": 1.13, + "grad_norm": 0.5088773965835571, + "learning_rate": 0.0004119345464774203, + "loss": 2.8584, + "step": 23157 + }, + { + "epoch": 1.13, + "grad_norm": 0.5774076581001282, + "learning_rate": 0.0004119202619748763, + "loss": 3.02, + "step": 23158 + }, + { + "epoch": 1.13, + "grad_norm": 0.5148767232894897, + "learning_rate": 0.0004119059771775559, + "loss": 3.0098, + "step": 23159 + }, + { + "epoch": 1.14, + "grad_norm": 0.5569112300872803, + "learning_rate": 0.00041189169208549655, + "loss": 3.0286, + "step": 23160 + }, + { + "epoch": 1.14, + "grad_norm": 0.5477631688117981, + "learning_rate": 0.00041187740669873605, + "loss": 3.2834, + "step": 23161 + }, + { + "epoch": 1.14, + "grad_norm": 0.5244240164756775, + "learning_rate": 0.00041186312101731195, + "loss": 3.1129, + "step": 23162 + }, + { + "epoch": 1.14, + "grad_norm": 0.6075512170791626, + "learning_rate": 0.0004118488350412621, + "loss": 3.1451, + "step": 23163 + }, + { + "epoch": 1.14, + "grad_norm": 0.5244618058204651, + "learning_rate": 0.0004118345487706238, + "loss": 2.9186, + "step": 23164 + }, + { + "epoch": 1.14, + "grad_norm": 0.5191154479980469, + "learning_rate": 0.00041182026220543485, + "loss": 3.2372, + "step": 23165 + }, + { + "epoch": 1.14, + "grad_norm": 0.540847897529602, + "learning_rate": 0.00041180597534573284, + "loss": 2.9988, + "step": 23166 + }, + { + "epoch": 1.14, + "grad_norm": 0.5709373950958252, + "learning_rate": 0.0004117916881915554, + "loss": 3.1161, + "step": 23167 + }, + { + "epoch": 1.14, + "grad_norm": 0.6198864579200745, + "learning_rate": 0.0004117774007429402, + "loss": 3.0857, + "step": 23168 + }, + { + "epoch": 1.14, + "grad_norm": 0.5441683530807495, + "learning_rate": 0.00041176311299992484, + "loss": 3.1707, + "step": 23169 + }, + { + "epoch": 1.14, + "grad_norm": 0.5470789670944214, + "learning_rate": 0.000411748824962547, + "loss": 3.0281, + "step": 23170 + }, + { + "epoch": 1.14, + "grad_norm": 0.5256059169769287, + "learning_rate": 0.00041173453663084417, + "loss": 3.274, + "step": 23171 + }, + { + "epoch": 1.14, + "grad_norm": 0.5547487735748291, + "learning_rate": 0.00041172024800485403, + "loss": 3.1233, + "step": 23172 + }, + { + "epoch": 1.14, + "grad_norm": 0.5724942684173584, + "learning_rate": 0.00041170595908461436, + "loss": 3.2013, + "step": 23173 + }, + { + "epoch": 1.14, + "grad_norm": 0.5241184830665588, + "learning_rate": 0.00041169166987016265, + "loss": 3.3218, + "step": 23174 + }, + { + "epoch": 1.14, + "grad_norm": 0.5703988671302795, + "learning_rate": 0.00041167738036153664, + "loss": 3.1718, + "step": 23175 + }, + { + "epoch": 1.14, + "grad_norm": 0.5422783493995667, + "learning_rate": 0.0004116630905587738, + "loss": 3.011, + "step": 23176 + }, + { + "epoch": 1.14, + "grad_norm": 0.581802248954773, + "learning_rate": 0.00041164880046191195, + "loss": 2.9262, + "step": 23177 + }, + { + "epoch": 1.14, + "grad_norm": 0.5663591027259827, + "learning_rate": 0.0004116345100709886, + "loss": 3.3378, + "step": 23178 + }, + { + "epoch": 1.14, + "grad_norm": 0.5546205639839172, + "learning_rate": 0.00041162021938604147, + "loss": 3.0222, + "step": 23179 + }, + { + "epoch": 1.14, + "grad_norm": 0.583781361579895, + "learning_rate": 0.0004116059284071081, + "loss": 3.0932, + "step": 23180 + }, + { + "epoch": 1.14, + "grad_norm": 0.5724231004714966, + "learning_rate": 0.0004115916371342263, + "loss": 3.3044, + "step": 23181 + }, + { + "epoch": 1.14, + "grad_norm": 0.5571467876434326, + "learning_rate": 0.0004115773455674336, + "loss": 3.1352, + "step": 23182 + }, + { + "epoch": 1.14, + "grad_norm": 0.5352392792701721, + "learning_rate": 0.00041156305370676754, + "loss": 3.0393, + "step": 23183 + }, + { + "epoch": 1.14, + "grad_norm": 0.5406389236450195, + "learning_rate": 0.0004115487615522658, + "loss": 3.0062, + "step": 23184 + }, + { + "epoch": 1.14, + "grad_norm": 0.6502759456634521, + "learning_rate": 0.0004115344691039663, + "loss": 2.9669, + "step": 23185 + }, + { + "epoch": 1.14, + "grad_norm": 0.5307773351669312, + "learning_rate": 0.00041152017636190643, + "loss": 3.3796, + "step": 23186 + }, + { + "epoch": 1.14, + "grad_norm": 0.5802614092826843, + "learning_rate": 0.00041150588332612383, + "loss": 3.2169, + "step": 23187 + }, + { + "epoch": 1.14, + "grad_norm": 0.5349951386451721, + "learning_rate": 0.0004114915899966561, + "loss": 3.144, + "step": 23188 + }, + { + "epoch": 1.14, + "grad_norm": 0.5569307804107666, + "learning_rate": 0.0004114772963735411, + "loss": 2.9595, + "step": 23189 + }, + { + "epoch": 1.14, + "grad_norm": 0.5304500460624695, + "learning_rate": 0.0004114630024568163, + "loss": 3.0931, + "step": 23190 + }, + { + "epoch": 1.14, + "grad_norm": 0.5246667861938477, + "learning_rate": 0.00041144870824651945, + "loss": 3.2074, + "step": 23191 + }, + { + "epoch": 1.14, + "grad_norm": 0.524143397808075, + "learning_rate": 0.00041143441374268824, + "loss": 2.7597, + "step": 23192 + }, + { + "epoch": 1.14, + "grad_norm": 0.5903804898262024, + "learning_rate": 0.00041142011894536003, + "loss": 3.0905, + "step": 23193 + }, + { + "epoch": 1.14, + "grad_norm": 0.5838131308555603, + "learning_rate": 0.0004114058238545728, + "loss": 3.0597, + "step": 23194 + }, + { + "epoch": 1.14, + "grad_norm": 0.5308125615119934, + "learning_rate": 0.000411391528470364, + "loss": 3.0877, + "step": 23195 + }, + { + "epoch": 1.14, + "grad_norm": 0.5069236755371094, + "learning_rate": 0.0004113772327927714, + "loss": 3.1319, + "step": 23196 + }, + { + "epoch": 1.14, + "grad_norm": 0.533857524394989, + "learning_rate": 0.0004113629368218326, + "loss": 2.9676, + "step": 23197 + }, + { + "epoch": 1.14, + "grad_norm": 0.5685359835624695, + "learning_rate": 0.00041134864055758513, + "loss": 3.0839, + "step": 23198 + }, + { + "epoch": 1.14, + "grad_norm": 0.6156092286109924, + "learning_rate": 0.00041133434400006695, + "loss": 3.2202, + "step": 23199 + }, + { + "epoch": 1.14, + "grad_norm": 0.5481438636779785, + "learning_rate": 0.0004113200471493155, + "loss": 3.1862, + "step": 23200 + }, + { + "epoch": 1.14, + "grad_norm": 0.5097211599349976, + "learning_rate": 0.0004113057500053684, + "loss": 3.0117, + "step": 23201 + }, + { + "epoch": 1.14, + "grad_norm": 0.5516743063926697, + "learning_rate": 0.00041129145256826345, + "loss": 3.0283, + "step": 23202 + }, + { + "epoch": 1.14, + "grad_norm": 0.5222983360290527, + "learning_rate": 0.0004112771548380382, + "loss": 2.9669, + "step": 23203 + }, + { + "epoch": 1.14, + "grad_norm": 0.5839381217956543, + "learning_rate": 0.0004112628568147303, + "loss": 3.015, + "step": 23204 + }, + { + "epoch": 1.14, + "grad_norm": 0.5553185939788818, + "learning_rate": 0.0004112485584983775, + "loss": 2.915, + "step": 23205 + }, + { + "epoch": 1.14, + "grad_norm": 0.5800415873527527, + "learning_rate": 0.0004112342598890174, + "loss": 3.1186, + "step": 23206 + }, + { + "epoch": 1.14, + "grad_norm": 0.569153904914856, + "learning_rate": 0.0004112199609866877, + "loss": 2.9612, + "step": 23207 + }, + { + "epoch": 1.14, + "grad_norm": 0.5515949130058289, + "learning_rate": 0.00041120566179142596, + "loss": 3.1817, + "step": 23208 + }, + { + "epoch": 1.14, + "grad_norm": 0.5314306020736694, + "learning_rate": 0.00041119136230326984, + "loss": 3.0635, + "step": 23209 + }, + { + "epoch": 1.14, + "grad_norm": 0.5693814754486084, + "learning_rate": 0.00041117706252225727, + "loss": 3.0755, + "step": 23210 + }, + { + "epoch": 1.14, + "grad_norm": 0.5441717505455017, + "learning_rate": 0.0004111627624484256, + "loss": 3.1078, + "step": 23211 + }, + { + "epoch": 1.14, + "grad_norm": 0.5225312113761902, + "learning_rate": 0.0004111484620818126, + "loss": 2.992, + "step": 23212 + }, + { + "epoch": 1.14, + "grad_norm": 0.5491369962692261, + "learning_rate": 0.00041113416142245587, + "loss": 2.9803, + "step": 23213 + }, + { + "epoch": 1.14, + "grad_norm": 0.530925989151001, + "learning_rate": 0.00041111986047039333, + "loss": 3.0763, + "step": 23214 + }, + { + "epoch": 1.14, + "grad_norm": 0.5920950174331665, + "learning_rate": 0.00041110555922566235, + "loss": 3.195, + "step": 23215 + }, + { + "epoch": 1.14, + "grad_norm": 0.544621467590332, + "learning_rate": 0.0004110912576883007, + "loss": 3.1061, + "step": 23216 + }, + { + "epoch": 1.14, + "grad_norm": 0.526262640953064, + "learning_rate": 0.0004110769558583461, + "loss": 3.167, + "step": 23217 + }, + { + "epoch": 1.14, + "grad_norm": 0.5441073179244995, + "learning_rate": 0.00041106265373583615, + "loss": 3.0507, + "step": 23218 + }, + { + "epoch": 1.14, + "grad_norm": 0.529232382774353, + "learning_rate": 0.00041104835132080856, + "loss": 3.1984, + "step": 23219 + }, + { + "epoch": 1.14, + "grad_norm": 0.5716642141342163, + "learning_rate": 0.00041103404861330095, + "loss": 3.1493, + "step": 23220 + }, + { + "epoch": 1.14, + "grad_norm": 0.5743255019187927, + "learning_rate": 0.0004110197456133511, + "loss": 3.0924, + "step": 23221 + }, + { + "epoch": 1.14, + "grad_norm": 0.6008023023605347, + "learning_rate": 0.0004110054423209966, + "loss": 3.0093, + "step": 23222 + }, + { + "epoch": 1.14, + "grad_norm": 0.5539536476135254, + "learning_rate": 0.00041099113873627505, + "loss": 3.0168, + "step": 23223 + }, + { + "epoch": 1.14, + "grad_norm": 0.5438982248306274, + "learning_rate": 0.0004109768348592242, + "loss": 2.9509, + "step": 23224 + }, + { + "epoch": 1.14, + "grad_norm": 0.5372403860092163, + "learning_rate": 0.0004109625306898818, + "loss": 3.1003, + "step": 23225 + }, + { + "epoch": 1.14, + "grad_norm": 0.5633687376976013, + "learning_rate": 0.0004109482262282854, + "loss": 3.1442, + "step": 23226 + }, + { + "epoch": 1.14, + "grad_norm": 0.6077851057052612, + "learning_rate": 0.0004109339214744727, + "loss": 3.112, + "step": 23227 + }, + { + "epoch": 1.14, + "grad_norm": 0.5495371222496033, + "learning_rate": 0.00041091961642848143, + "loss": 3.1602, + "step": 23228 + }, + { + "epoch": 1.14, + "grad_norm": 0.5324305295944214, + "learning_rate": 0.0004109053110903493, + "loss": 3.0703, + "step": 23229 + }, + { + "epoch": 1.14, + "grad_norm": 0.5331787467002869, + "learning_rate": 0.00041089100546011384, + "loss": 3.1745, + "step": 23230 + }, + { + "epoch": 1.14, + "grad_norm": 0.5928345918655396, + "learning_rate": 0.00041087669953781284, + "loss": 3.0949, + "step": 23231 + }, + { + "epoch": 1.14, + "grad_norm": 0.5761058330535889, + "learning_rate": 0.0004108623933234841, + "loss": 3.163, + "step": 23232 + }, + { + "epoch": 1.14, + "grad_norm": 0.5908313989639282, + "learning_rate": 0.0004108480868171649, + "loss": 3.2201, + "step": 23233 + }, + { + "epoch": 1.14, + "grad_norm": 0.5895100235939026, + "learning_rate": 0.00041083378001889327, + "loss": 2.8646, + "step": 23234 + }, + { + "epoch": 1.14, + "grad_norm": 0.5367791652679443, + "learning_rate": 0.0004108194729287068, + "loss": 3.2775, + "step": 23235 + }, + { + "epoch": 1.14, + "grad_norm": 0.5391991138458252, + "learning_rate": 0.00041080516554664327, + "loss": 3.1389, + "step": 23236 + }, + { + "epoch": 1.14, + "grad_norm": 0.5764256119728088, + "learning_rate": 0.00041079085787274015, + "loss": 2.9448, + "step": 23237 + }, + { + "epoch": 1.14, + "grad_norm": 0.5493453145027161, + "learning_rate": 0.0004107765499070352, + "loss": 3.2057, + "step": 23238 + }, + { + "epoch": 1.14, + "grad_norm": 0.5182337760925293, + "learning_rate": 0.00041076224164956614, + "loss": 3.0028, + "step": 23239 + }, + { + "epoch": 1.14, + "grad_norm": 0.5463001132011414, + "learning_rate": 0.0004107479331003707, + "loss": 3.3298, + "step": 23240 + }, + { + "epoch": 1.14, + "grad_norm": 0.5629000067710876, + "learning_rate": 0.0004107336242594865, + "loss": 3.131, + "step": 23241 + }, + { + "epoch": 1.14, + "grad_norm": 0.5420428514480591, + "learning_rate": 0.0004107193151269512, + "loss": 3.1846, + "step": 23242 + }, + { + "epoch": 1.14, + "grad_norm": 0.5327157974243164, + "learning_rate": 0.0004107050057028027, + "loss": 3.1262, + "step": 23243 + }, + { + "epoch": 1.14, + "grad_norm": 0.5222551822662354, + "learning_rate": 0.0004106906959870783, + "loss": 3.1124, + "step": 23244 + }, + { + "epoch": 1.14, + "grad_norm": 0.5623447895050049, + "learning_rate": 0.00041067638597981604, + "loss": 3.1196, + "step": 23245 + }, + { + "epoch": 1.14, + "grad_norm": 0.5250970721244812, + "learning_rate": 0.00041066207568105344, + "loss": 3.1107, + "step": 23246 + }, + { + "epoch": 1.14, + "grad_norm": 0.5632120966911316, + "learning_rate": 0.0004106477650908283, + "loss": 3.0244, + "step": 23247 + }, + { + "epoch": 1.14, + "grad_norm": 0.5749978423118591, + "learning_rate": 0.0004106334542091782, + "loss": 3.0074, + "step": 23248 + }, + { + "epoch": 1.14, + "grad_norm": 0.548491358757019, + "learning_rate": 0.00041061914303614085, + "loss": 2.8617, + "step": 23249 + }, + { + "epoch": 1.14, + "grad_norm": 0.5380204319953918, + "learning_rate": 0.00041060483157175396, + "loss": 3.0328, + "step": 23250 + }, + { + "epoch": 1.14, + "grad_norm": 0.5522347092628479, + "learning_rate": 0.00041059051981605523, + "loss": 3.1683, + "step": 23251 + }, + { + "epoch": 1.14, + "grad_norm": 0.5788419842720032, + "learning_rate": 0.00041057620776908244, + "loss": 3.1031, + "step": 23252 + }, + { + "epoch": 1.14, + "grad_norm": 0.5642889738082886, + "learning_rate": 0.0004105618954308731, + "loss": 2.9625, + "step": 23253 + }, + { + "epoch": 1.14, + "grad_norm": 0.5192211866378784, + "learning_rate": 0.00041054758280146506, + "loss": 3.1176, + "step": 23254 + }, + { + "epoch": 1.14, + "grad_norm": 0.5606915950775146, + "learning_rate": 0.0004105332698808959, + "loss": 2.9903, + "step": 23255 + }, + { + "epoch": 1.14, + "grad_norm": 0.5417342782020569, + "learning_rate": 0.0004105189566692034, + "loss": 2.9431, + "step": 23256 + }, + { + "epoch": 1.14, + "grad_norm": 0.5401815176010132, + "learning_rate": 0.00041050464316642524, + "loss": 2.9124, + "step": 23257 + }, + { + "epoch": 1.14, + "grad_norm": 0.5581971406936646, + "learning_rate": 0.0004104903293725992, + "loss": 3.1402, + "step": 23258 + }, + { + "epoch": 1.14, + "grad_norm": 0.5499171018600464, + "learning_rate": 0.0004104760152877628, + "loss": 3.1808, + "step": 23259 + }, + { + "epoch": 1.14, + "grad_norm": 0.5453315377235413, + "learning_rate": 0.0004104617009119539, + "loss": 3.0836, + "step": 23260 + }, + { + "epoch": 1.14, + "grad_norm": 0.5436408519744873, + "learning_rate": 0.0004104473862452101, + "loss": 3.2032, + "step": 23261 + }, + { + "epoch": 1.14, + "grad_norm": 0.595960795879364, + "learning_rate": 0.00041043307128756914, + "loss": 3.0531, + "step": 23262 + }, + { + "epoch": 1.14, + "grad_norm": 0.5662114024162292, + "learning_rate": 0.0004104187560390687, + "loss": 2.9397, + "step": 23263 + }, + { + "epoch": 1.14, + "grad_norm": 0.548102080821991, + "learning_rate": 0.00041040444049974655, + "loss": 2.9091, + "step": 23264 + }, + { + "epoch": 1.14, + "grad_norm": 0.5476301312446594, + "learning_rate": 0.00041039012466964034, + "loss": 2.8001, + "step": 23265 + }, + { + "epoch": 1.14, + "grad_norm": 0.5653447508811951, + "learning_rate": 0.00041037580854878775, + "loss": 3.2257, + "step": 23266 + }, + { + "epoch": 1.14, + "grad_norm": 0.5923088788986206, + "learning_rate": 0.00041036149213722657, + "loss": 3.1438, + "step": 23267 + }, + { + "epoch": 1.14, + "grad_norm": 0.5683751702308655, + "learning_rate": 0.0004103471754349945, + "loss": 3.0826, + "step": 23268 + }, + { + "epoch": 1.14, + "grad_norm": 0.5276745557785034, + "learning_rate": 0.00041033285844212914, + "loss": 3.1649, + "step": 23269 + }, + { + "epoch": 1.14, + "grad_norm": 0.5372428894042969, + "learning_rate": 0.0004103185411586682, + "loss": 3.1504, + "step": 23270 + }, + { + "epoch": 1.14, + "grad_norm": 0.5391978621482849, + "learning_rate": 0.00041030422358464955, + "loss": 3.1403, + "step": 23271 + }, + { + "epoch": 1.14, + "grad_norm": 0.5556966066360474, + "learning_rate": 0.0004102899057201108, + "loss": 3.1582, + "step": 23272 + }, + { + "epoch": 1.14, + "grad_norm": 0.5490073561668396, + "learning_rate": 0.00041027558756508967, + "loss": 3.1393, + "step": 23273 + }, + { + "epoch": 1.14, + "grad_norm": 0.5323960185050964, + "learning_rate": 0.00041026126911962386, + "loss": 3.0192, + "step": 23274 + }, + { + "epoch": 1.14, + "grad_norm": 0.5622329711914062, + "learning_rate": 0.00041024695038375104, + "loss": 3.0085, + "step": 23275 + }, + { + "epoch": 1.14, + "grad_norm": 0.5242308378219604, + "learning_rate": 0.00041023263135750904, + "loss": 3.1784, + "step": 23276 + }, + { + "epoch": 1.14, + "grad_norm": 0.5954573154449463, + "learning_rate": 0.0004102183120409354, + "loss": 3.0906, + "step": 23277 + }, + { + "epoch": 1.14, + "grad_norm": 0.542190670967102, + "learning_rate": 0.000410203992434068, + "loss": 3.102, + "step": 23278 + }, + { + "epoch": 1.14, + "grad_norm": 0.5081149339675903, + "learning_rate": 0.0004101896725369445, + "loss": 3.1372, + "step": 23279 + }, + { + "epoch": 1.14, + "grad_norm": 0.5064743161201477, + "learning_rate": 0.0004101753523496026, + "loss": 3.1823, + "step": 23280 + }, + { + "epoch": 1.14, + "grad_norm": 0.6618224382400513, + "learning_rate": 0.0004101610318720801, + "loss": 3.0695, + "step": 23281 + }, + { + "epoch": 1.14, + "grad_norm": 0.5303663015365601, + "learning_rate": 0.00041014671110441457, + "loss": 3.1745, + "step": 23282 + }, + { + "epoch": 1.14, + "grad_norm": 0.5263445973396301, + "learning_rate": 0.0004101323900466438, + "loss": 3.022, + "step": 23283 + }, + { + "epoch": 1.14, + "grad_norm": 0.5309288501739502, + "learning_rate": 0.00041011806869880555, + "loss": 2.762, + "step": 23284 + }, + { + "epoch": 1.14, + "grad_norm": 0.5699345469474792, + "learning_rate": 0.00041010374706093735, + "loss": 2.9958, + "step": 23285 + }, + { + "epoch": 1.14, + "grad_norm": 0.5486957430839539, + "learning_rate": 0.0004100894251330772, + "loss": 3.0072, + "step": 23286 + }, + { + "epoch": 1.14, + "grad_norm": 0.5336732864379883, + "learning_rate": 0.00041007510291526277, + "loss": 3.1703, + "step": 23287 + }, + { + "epoch": 1.14, + "grad_norm": 0.5408821702003479, + "learning_rate": 0.00041006078040753163, + "loss": 3.0636, + "step": 23288 + }, + { + "epoch": 1.14, + "grad_norm": 0.5387353897094727, + "learning_rate": 0.0004100464576099215, + "loss": 3.0886, + "step": 23289 + }, + { + "epoch": 1.14, + "grad_norm": 0.5669566988945007, + "learning_rate": 0.00041003213452247026, + "loss": 3.0922, + "step": 23290 + }, + { + "epoch": 1.14, + "grad_norm": 0.5251073241233826, + "learning_rate": 0.0004100178111452155, + "loss": 3.3268, + "step": 23291 + }, + { + "epoch": 1.14, + "grad_norm": 0.536972165107727, + "learning_rate": 0.000410003487478195, + "loss": 2.9341, + "step": 23292 + }, + { + "epoch": 1.14, + "grad_norm": 0.5253141522407532, + "learning_rate": 0.0004099891635214465, + "loss": 3.0662, + "step": 23293 + }, + { + "epoch": 1.14, + "grad_norm": 0.6211135983467102, + "learning_rate": 0.00040997483927500785, + "loss": 3.1949, + "step": 23294 + }, + { + "epoch": 1.14, + "grad_norm": 0.5622311234474182, + "learning_rate": 0.0004099605147389164, + "loss": 3.218, + "step": 23295 + }, + { + "epoch": 1.14, + "grad_norm": 0.5925061702728271, + "learning_rate": 0.0004099461899132102, + "loss": 3.2212, + "step": 23296 + }, + { + "epoch": 1.14, + "grad_norm": 0.5344102382659912, + "learning_rate": 0.00040993186479792694, + "loss": 2.9887, + "step": 23297 + }, + { + "epoch": 1.14, + "grad_norm": 0.6124759316444397, + "learning_rate": 0.0004099175393931044, + "loss": 3.0576, + "step": 23298 + }, + { + "epoch": 1.14, + "grad_norm": 0.5534654259681702, + "learning_rate": 0.00040990321369878006, + "loss": 3.1516, + "step": 23299 + }, + { + "epoch": 1.14, + "grad_norm": 0.5637291073799133, + "learning_rate": 0.0004098888877149918, + "loss": 2.9373, + "step": 23300 + }, + { + "epoch": 1.14, + "grad_norm": 0.5289487242698669, + "learning_rate": 0.00040987456144177747, + "loss": 3.0108, + "step": 23301 + }, + { + "epoch": 1.14, + "grad_norm": 0.5784006118774414, + "learning_rate": 0.0004098602348791746, + "loss": 2.9401, + "step": 23302 + }, + { + "epoch": 1.14, + "grad_norm": 0.5589538216590881, + "learning_rate": 0.00040984590802722106, + "loss": 2.9519, + "step": 23303 + }, + { + "epoch": 1.14, + "grad_norm": 0.573984682559967, + "learning_rate": 0.00040983158088595456, + "loss": 2.9528, + "step": 23304 + }, + { + "epoch": 1.14, + "grad_norm": 0.5732395648956299, + "learning_rate": 0.0004098172534554128, + "loss": 3.081, + "step": 23305 + }, + { + "epoch": 1.14, + "grad_norm": 0.5892529487609863, + "learning_rate": 0.0004098029257356334, + "loss": 3.1453, + "step": 23306 + }, + { + "epoch": 1.14, + "grad_norm": 0.5673011541366577, + "learning_rate": 0.0004097885977266544, + "loss": 3.0574, + "step": 23307 + }, + { + "epoch": 1.14, + "grad_norm": 0.5459287166595459, + "learning_rate": 0.00040977426942851326, + "loss": 3.0539, + "step": 23308 + }, + { + "epoch": 1.14, + "grad_norm": 0.545254647731781, + "learning_rate": 0.0004097599408412479, + "loss": 2.968, + "step": 23309 + }, + { + "epoch": 1.14, + "grad_norm": 0.5447995066642761, + "learning_rate": 0.0004097456119648959, + "loss": 2.9872, + "step": 23310 + }, + { + "epoch": 1.14, + "grad_norm": 0.5695222020149231, + "learning_rate": 0.0004097312827994951, + "loss": 3.0334, + "step": 23311 + }, + { + "epoch": 1.14, + "grad_norm": 0.5799598693847656, + "learning_rate": 0.00040971695334508323, + "loss": 3.1335, + "step": 23312 + }, + { + "epoch": 1.14, + "grad_norm": 0.5451822876930237, + "learning_rate": 0.000409702623601698, + "loss": 3.1662, + "step": 23313 + }, + { + "epoch": 1.14, + "grad_norm": 0.5687471628189087, + "learning_rate": 0.00040968829356937726, + "loss": 3.1632, + "step": 23314 + }, + { + "epoch": 1.14, + "grad_norm": 0.5137524604797363, + "learning_rate": 0.00040967396324815853, + "loss": 3.1517, + "step": 23315 + }, + { + "epoch": 1.14, + "grad_norm": 0.535132110118866, + "learning_rate": 0.00040965963263807977, + "loss": 3.0008, + "step": 23316 + }, + { + "epoch": 1.14, + "grad_norm": 0.5807247161865234, + "learning_rate": 0.0004096453017391786, + "loss": 3.0634, + "step": 23317 + }, + { + "epoch": 1.14, + "grad_norm": 0.585669994354248, + "learning_rate": 0.00040963097055149287, + "loss": 3.0915, + "step": 23318 + }, + { + "epoch": 1.14, + "grad_norm": 0.5763066411018372, + "learning_rate": 0.00040961663907506025, + "loss": 3.1631, + "step": 23319 + }, + { + "epoch": 1.14, + "grad_norm": 0.563368558883667, + "learning_rate": 0.0004096023073099185, + "loss": 3.3913, + "step": 23320 + }, + { + "epoch": 1.14, + "grad_norm": 0.5762029886245728, + "learning_rate": 0.00040958797525610525, + "loss": 3.0807, + "step": 23321 + }, + { + "epoch": 1.14, + "grad_norm": 0.5520102977752686, + "learning_rate": 0.0004095736429136585, + "loss": 3.1066, + "step": 23322 + }, + { + "epoch": 1.14, + "grad_norm": 0.537263035774231, + "learning_rate": 0.0004095593102826159, + "loss": 3.1889, + "step": 23323 + }, + { + "epoch": 1.14, + "grad_norm": 0.5755485892295837, + "learning_rate": 0.00040954497736301503, + "loss": 3.0597, + "step": 23324 + }, + { + "epoch": 1.14, + "grad_norm": 0.5270916819572449, + "learning_rate": 0.00040953064415489383, + "loss": 3.1009, + "step": 23325 + }, + { + "epoch": 1.14, + "grad_norm": 0.5502188801765442, + "learning_rate": 0.00040951631065828996, + "loss": 2.9506, + "step": 23326 + }, + { + "epoch": 1.14, + "grad_norm": 0.5408546328544617, + "learning_rate": 0.00040950197687324126, + "loss": 2.9593, + "step": 23327 + }, + { + "epoch": 1.14, + "grad_norm": 0.5552592873573303, + "learning_rate": 0.0004094876427997854, + "loss": 2.8134, + "step": 23328 + }, + { + "epoch": 1.14, + "grad_norm": 0.5457807779312134, + "learning_rate": 0.00040947330843796016, + "loss": 3.2202, + "step": 23329 + }, + { + "epoch": 1.14, + "grad_norm": 0.6020430326461792, + "learning_rate": 0.0004094589737878033, + "loss": 3.1423, + "step": 23330 + }, + { + "epoch": 1.14, + "grad_norm": 0.5497642159461975, + "learning_rate": 0.00040944463884935256, + "loss": 3.1488, + "step": 23331 + }, + { + "epoch": 1.14, + "grad_norm": 0.5718896389007568, + "learning_rate": 0.00040943030362264565, + "loss": 3.162, + "step": 23332 + }, + { + "epoch": 1.14, + "grad_norm": 0.5680598616600037, + "learning_rate": 0.00040941596810772045, + "loss": 3.1336, + "step": 23333 + }, + { + "epoch": 1.14, + "grad_norm": 0.5549580454826355, + "learning_rate": 0.0004094016323046147, + "loss": 3.2387, + "step": 23334 + }, + { + "epoch": 1.14, + "grad_norm": 0.586732029914856, + "learning_rate": 0.00040938729621336605, + "loss": 3.0624, + "step": 23335 + }, + { + "epoch": 1.14, + "grad_norm": 0.5963259339332581, + "learning_rate": 0.00040937295983401226, + "loss": 2.9214, + "step": 23336 + }, + { + "epoch": 1.14, + "grad_norm": 0.5552912354469299, + "learning_rate": 0.00040935862316659116, + "loss": 3.0501, + "step": 23337 + }, + { + "epoch": 1.14, + "grad_norm": 0.509372889995575, + "learning_rate": 0.0004093442862111406, + "loss": 3.0564, + "step": 23338 + }, + { + "epoch": 1.14, + "grad_norm": 0.5622632503509521, + "learning_rate": 0.00040932994896769815, + "loss": 3.0737, + "step": 23339 + }, + { + "epoch": 1.14, + "grad_norm": 0.5708377361297607, + "learning_rate": 0.0004093156114363016, + "loss": 3.2455, + "step": 23340 + }, + { + "epoch": 1.14, + "grad_norm": 0.5660071969032288, + "learning_rate": 0.00040930127361698887, + "loss": 3.1272, + "step": 23341 + }, + { + "epoch": 1.14, + "grad_norm": 0.5480991005897522, + "learning_rate": 0.00040928693550979753, + "loss": 3.2418, + "step": 23342 + }, + { + "epoch": 1.14, + "grad_norm": 0.560892641544342, + "learning_rate": 0.0004092725971147655, + "loss": 2.9972, + "step": 23343 + }, + { + "epoch": 1.14, + "grad_norm": 0.5563073754310608, + "learning_rate": 0.00040925825843193044, + "loss": 3.082, + "step": 23344 + }, + { + "epoch": 1.14, + "grad_norm": 0.5521158576011658, + "learning_rate": 0.0004092439194613302, + "loss": 3.0532, + "step": 23345 + }, + { + "epoch": 1.14, + "grad_norm": 0.5784248113632202, + "learning_rate": 0.0004092295802030025, + "loss": 2.9597, + "step": 23346 + }, + { + "epoch": 1.14, + "grad_norm": 0.551121175289154, + "learning_rate": 0.00040921524065698505, + "loss": 3.1059, + "step": 23347 + }, + { + "epoch": 1.14, + "grad_norm": 0.5605080723762512, + "learning_rate": 0.00040920090082331565, + "loss": 3.1398, + "step": 23348 + }, + { + "epoch": 1.14, + "grad_norm": 0.5794740319252014, + "learning_rate": 0.00040918656070203224, + "loss": 2.8233, + "step": 23349 + }, + { + "epoch": 1.14, + "grad_norm": 0.5346746444702148, + "learning_rate": 0.00040917222029317234, + "loss": 3.1589, + "step": 23350 + }, + { + "epoch": 1.14, + "grad_norm": 0.540032684803009, + "learning_rate": 0.0004091578795967739, + "loss": 3.0918, + "step": 23351 + }, + { + "epoch": 1.14, + "grad_norm": 0.5193694233894348, + "learning_rate": 0.00040914353861287446, + "loss": 3.3181, + "step": 23352 + }, + { + "epoch": 1.14, + "grad_norm": 0.5316674113273621, + "learning_rate": 0.00040912919734151205, + "loss": 3.1497, + "step": 23353 + }, + { + "epoch": 1.14, + "grad_norm": 0.5241429805755615, + "learning_rate": 0.00040911485578272433, + "loss": 3.0704, + "step": 23354 + }, + { + "epoch": 1.14, + "grad_norm": 0.5313575863838196, + "learning_rate": 0.00040910051393654905, + "loss": 3.2197, + "step": 23355 + }, + { + "epoch": 1.14, + "grad_norm": 0.5644849538803101, + "learning_rate": 0.00040908617180302403, + "loss": 3.0536, + "step": 23356 + }, + { + "epoch": 1.14, + "grad_norm": 0.5238354802131653, + "learning_rate": 0.000409071829382187, + "loss": 3.1429, + "step": 23357 + }, + { + "epoch": 1.14, + "grad_norm": 0.5016031265258789, + "learning_rate": 0.00040905748667407576, + "loss": 3.2018, + "step": 23358 + }, + { + "epoch": 1.14, + "grad_norm": 0.5598127245903015, + "learning_rate": 0.00040904314367872814, + "loss": 3.0267, + "step": 23359 + }, + { + "epoch": 1.14, + "grad_norm": 0.557971715927124, + "learning_rate": 0.0004090288003961819, + "loss": 3.3368, + "step": 23360 + }, + { + "epoch": 1.14, + "grad_norm": 0.5256807208061218, + "learning_rate": 0.00040901445682647473, + "loss": 3.044, + "step": 23361 + }, + { + "epoch": 1.14, + "grad_norm": 0.5249261260032654, + "learning_rate": 0.0004090001129696444, + "loss": 3.1656, + "step": 23362 + }, + { + "epoch": 1.14, + "grad_norm": 0.5444071888923645, + "learning_rate": 0.0004089857688257289, + "loss": 3.0243, + "step": 23363 + }, + { + "epoch": 1.15, + "grad_norm": 0.5587050318717957, + "learning_rate": 0.00040897142439476575, + "loss": 3.3505, + "step": 23364 + }, + { + "epoch": 1.15, + "grad_norm": 0.5856272578239441, + "learning_rate": 0.00040895707967679283, + "loss": 3.0021, + "step": 23365 + }, + { + "epoch": 1.15, + "grad_norm": 0.5790854096412659, + "learning_rate": 0.000408942734671848, + "loss": 3.1403, + "step": 23366 + }, + { + "epoch": 1.15, + "grad_norm": 0.53217613697052, + "learning_rate": 0.00040892838937996894, + "loss": 3.2079, + "step": 23367 + }, + { + "epoch": 1.15, + "grad_norm": 0.5617000460624695, + "learning_rate": 0.0004089140438011934, + "loss": 2.9651, + "step": 23368 + }, + { + "epoch": 1.15, + "grad_norm": 0.5272381901741028, + "learning_rate": 0.0004088996979355593, + "loss": 3.0486, + "step": 23369 + }, + { + "epoch": 1.15, + "grad_norm": 0.543768048286438, + "learning_rate": 0.0004088853517831044, + "loss": 3.0786, + "step": 23370 + }, + { + "epoch": 1.15, + "grad_norm": 0.5788832902908325, + "learning_rate": 0.0004088710053438664, + "loss": 3.0349, + "step": 23371 + }, + { + "epoch": 1.15, + "grad_norm": 0.5837016105651855, + "learning_rate": 0.000408856658617883, + "loss": 3.0429, + "step": 23372 + }, + { + "epoch": 1.15, + "grad_norm": 0.5341154336929321, + "learning_rate": 0.00040884231160519225, + "loss": 2.9318, + "step": 23373 + }, + { + "epoch": 1.15, + "grad_norm": 0.5163646936416626, + "learning_rate": 0.0004088279643058318, + "loss": 3.2756, + "step": 23374 + }, + { + "epoch": 1.15, + "grad_norm": 0.5284332036972046, + "learning_rate": 0.00040881361671983937, + "loss": 3.1314, + "step": 23375 + }, + { + "epoch": 1.15, + "grad_norm": 0.5456615686416626, + "learning_rate": 0.0004087992688472529, + "loss": 2.9555, + "step": 23376 + }, + { + "epoch": 1.15, + "grad_norm": 0.5189909338951111, + "learning_rate": 0.00040878492068811004, + "loss": 2.9664, + "step": 23377 + }, + { + "epoch": 1.15, + "grad_norm": 0.5077756643295288, + "learning_rate": 0.0004087705722424486, + "loss": 3.0454, + "step": 23378 + }, + { + "epoch": 1.15, + "grad_norm": 0.5315677523612976, + "learning_rate": 0.0004087562235103065, + "loss": 3.1714, + "step": 23379 + }, + { + "epoch": 1.15, + "grad_norm": 0.5543997883796692, + "learning_rate": 0.00040874187449172134, + "loss": 3.0133, + "step": 23380 + }, + { + "epoch": 1.15, + "grad_norm": 0.560238242149353, + "learning_rate": 0.0004087275251867311, + "loss": 3.0748, + "step": 23381 + }, + { + "epoch": 1.15, + "grad_norm": 0.5396565198898315, + "learning_rate": 0.0004087131755953734, + "loss": 3.253, + "step": 23382 + }, + { + "epoch": 1.15, + "grad_norm": 0.5551555156707764, + "learning_rate": 0.0004086988257176861, + "loss": 2.9754, + "step": 23383 + }, + { + "epoch": 1.15, + "grad_norm": 0.5715532302856445, + "learning_rate": 0.00040868447555370707, + "loss": 3.0113, + "step": 23384 + }, + { + "epoch": 1.15, + "grad_norm": 0.5877143144607544, + "learning_rate": 0.0004086701251034741, + "loss": 3.0628, + "step": 23385 + }, + { + "epoch": 1.15, + "grad_norm": 0.547389566898346, + "learning_rate": 0.00040865577436702483, + "loss": 3.0458, + "step": 23386 + }, + { + "epoch": 1.15, + "grad_norm": 0.5390537977218628, + "learning_rate": 0.00040864142334439725, + "loss": 3.17, + "step": 23387 + }, + { + "epoch": 1.15, + "grad_norm": 0.5335311889648438, + "learning_rate": 0.000408627072035629, + "loss": 2.9859, + "step": 23388 + }, + { + "epoch": 1.15, + "grad_norm": 0.5414642691612244, + "learning_rate": 0.00040861272044075803, + "loss": 3.0252, + "step": 23389 + }, + { + "epoch": 1.15, + "grad_norm": 0.5587677359580994, + "learning_rate": 0.00040859836855982196, + "loss": 3.0681, + "step": 23390 + }, + { + "epoch": 1.15, + "grad_norm": 0.6065161824226379, + "learning_rate": 0.0004085840163928588, + "loss": 2.8998, + "step": 23391 + }, + { + "epoch": 1.15, + "grad_norm": 0.5559492111206055, + "learning_rate": 0.0004085696639399061, + "loss": 3.1246, + "step": 23392 + }, + { + "epoch": 1.15, + "grad_norm": 0.5463036894798279, + "learning_rate": 0.0004085553112010019, + "loss": 3.0655, + "step": 23393 + }, + { + "epoch": 1.15, + "grad_norm": 0.5573052167892456, + "learning_rate": 0.00040854095817618384, + "loss": 2.826, + "step": 23394 + }, + { + "epoch": 1.15, + "grad_norm": 0.5249746441841125, + "learning_rate": 0.0004085266048654899, + "loss": 3.0882, + "step": 23395 + }, + { + "epoch": 1.15, + "grad_norm": 0.5521349906921387, + "learning_rate": 0.00040851225126895766, + "loss": 2.9564, + "step": 23396 + }, + { + "epoch": 1.15, + "grad_norm": 0.550536572933197, + "learning_rate": 0.00040849789738662504, + "loss": 2.9643, + "step": 23397 + }, + { + "epoch": 1.15, + "grad_norm": 0.5547955632209778, + "learning_rate": 0.00040848354321852985, + "loss": 3.2019, + "step": 23398 + }, + { + "epoch": 1.15, + "grad_norm": 0.5499030351638794, + "learning_rate": 0.00040846918876470987, + "loss": 3.1381, + "step": 23399 + }, + { + "epoch": 1.15, + "grad_norm": 0.548309326171875, + "learning_rate": 0.000408454834025203, + "loss": 3.0595, + "step": 23400 + }, + { + "epoch": 1.15, + "grad_norm": 0.5371249914169312, + "learning_rate": 0.0004084404790000469, + "loss": 2.9364, + "step": 23401 + }, + { + "epoch": 1.15, + "grad_norm": 0.5594949722290039, + "learning_rate": 0.00040842612368927945, + "loss": 3.1443, + "step": 23402 + }, + { + "epoch": 1.15, + "grad_norm": 0.5433090329170227, + "learning_rate": 0.0004084117680929385, + "loss": 2.9709, + "step": 23403 + }, + { + "epoch": 1.15, + "grad_norm": 0.5581390261650085, + "learning_rate": 0.0004083974122110618, + "loss": 2.9891, + "step": 23404 + }, + { + "epoch": 1.15, + "grad_norm": 0.540836751461029, + "learning_rate": 0.0004083830560436871, + "loss": 3.2713, + "step": 23405 + }, + { + "epoch": 1.15, + "grad_norm": 0.5747362971305847, + "learning_rate": 0.00040836869959085236, + "loss": 3.0827, + "step": 23406 + }, + { + "epoch": 1.15, + "grad_norm": 0.5611171126365662, + "learning_rate": 0.0004083543428525954, + "loss": 3.1278, + "step": 23407 + }, + { + "epoch": 1.15, + "grad_norm": 0.5559061765670776, + "learning_rate": 0.0004083399858289538, + "loss": 3.0086, + "step": 23408 + }, + { + "epoch": 1.15, + "grad_norm": 0.5641838908195496, + "learning_rate": 0.0004083256285199656, + "loss": 3.1977, + "step": 23409 + }, + { + "epoch": 1.15, + "grad_norm": 0.5294587016105652, + "learning_rate": 0.0004083112709256685, + "loss": 3.0881, + "step": 23410 + }, + { + "epoch": 1.15, + "grad_norm": 0.5804036259651184, + "learning_rate": 0.0004082969130461005, + "loss": 2.8935, + "step": 23411 + }, + { + "epoch": 1.15, + "grad_norm": 0.5449517369270325, + "learning_rate": 0.00040828255488129916, + "loss": 3.2973, + "step": 23412 + }, + { + "epoch": 1.15, + "grad_norm": 0.5502049326896667, + "learning_rate": 0.00040826819643130236, + "loss": 3.2441, + "step": 23413 + }, + { + "epoch": 1.15, + "grad_norm": 0.5578358769416809, + "learning_rate": 0.0004082538376961481, + "loss": 3.2078, + "step": 23414 + }, + { + "epoch": 1.15, + "grad_norm": 0.5473148822784424, + "learning_rate": 0.00040823947867587397, + "loss": 3.1273, + "step": 23415 + }, + { + "epoch": 1.15, + "grad_norm": 0.6118950843811035, + "learning_rate": 0.00040822511937051793, + "loss": 3.0009, + "step": 23416 + }, + { + "epoch": 1.15, + "grad_norm": 0.5265421867370605, + "learning_rate": 0.0004082107597801177, + "loss": 3.1703, + "step": 23417 + }, + { + "epoch": 1.15, + "grad_norm": 0.5736425518989563, + "learning_rate": 0.0004081963999047112, + "loss": 3.0401, + "step": 23418 + }, + { + "epoch": 1.15, + "grad_norm": 0.572830080986023, + "learning_rate": 0.00040818203974433623, + "loss": 3.2007, + "step": 23419 + }, + { + "epoch": 1.15, + "grad_norm": 0.5175371170043945, + "learning_rate": 0.0004081676792990305, + "loss": 3.3056, + "step": 23420 + }, + { + "epoch": 1.15, + "grad_norm": 0.5525279641151428, + "learning_rate": 0.000408153318568832, + "loss": 2.9136, + "step": 23421 + }, + { + "epoch": 1.15, + "grad_norm": 0.5556487441062927, + "learning_rate": 0.00040813895755377846, + "loss": 3.2804, + "step": 23422 + }, + { + "epoch": 1.15, + "grad_norm": 0.5376157164573669, + "learning_rate": 0.00040812459625390774, + "loss": 3.0081, + "step": 23423 + }, + { + "epoch": 1.15, + "grad_norm": 0.5336998701095581, + "learning_rate": 0.0004081102346692575, + "loss": 3.0859, + "step": 23424 + }, + { + "epoch": 1.15, + "grad_norm": 0.5490345358848572, + "learning_rate": 0.0004080958727998659, + "loss": 3.1922, + "step": 23425 + }, + { + "epoch": 1.15, + "grad_norm": 0.5638200044631958, + "learning_rate": 0.0004080815106457705, + "loss": 3.2863, + "step": 23426 + }, + { + "epoch": 1.15, + "grad_norm": 0.5599801540374756, + "learning_rate": 0.0004080671482070092, + "loss": 3.0831, + "step": 23427 + }, + { + "epoch": 1.15, + "grad_norm": 0.5768230557441711, + "learning_rate": 0.0004080527854836198, + "loss": 3.0359, + "step": 23428 + }, + { + "epoch": 1.15, + "grad_norm": 0.591766357421875, + "learning_rate": 0.00040803842247564014, + "loss": 2.9248, + "step": 23429 + }, + { + "epoch": 1.15, + "grad_norm": 0.5740898847579956, + "learning_rate": 0.0004080240591831081, + "loss": 3.3461, + "step": 23430 + }, + { + "epoch": 1.15, + "grad_norm": 0.6085920929908752, + "learning_rate": 0.0004080096956060615, + "loss": 2.907, + "step": 23431 + }, + { + "epoch": 1.15, + "grad_norm": 0.5598239898681641, + "learning_rate": 0.00040799533174453806, + "loss": 3.2174, + "step": 23432 + }, + { + "epoch": 1.15, + "grad_norm": 0.5733321905136108, + "learning_rate": 0.00040798096759857587, + "loss": 3.2491, + "step": 23433 + }, + { + "epoch": 1.15, + "grad_norm": 0.5727006793022156, + "learning_rate": 0.00040796660316821243, + "loss": 3.1616, + "step": 23434 + }, + { + "epoch": 1.15, + "grad_norm": 0.5333359241485596, + "learning_rate": 0.00040795223845348574, + "loss": 3.3976, + "step": 23435 + }, + { + "epoch": 1.15, + "grad_norm": 0.57224440574646, + "learning_rate": 0.00040793787345443376, + "loss": 3.1264, + "step": 23436 + }, + { + "epoch": 1.15, + "grad_norm": 0.5239753723144531, + "learning_rate": 0.0004079235081710941, + "loss": 2.9847, + "step": 23437 + }, + { + "epoch": 1.15, + "grad_norm": 0.5496913194656372, + "learning_rate": 0.0004079091426035047, + "loss": 2.9158, + "step": 23438 + }, + { + "epoch": 1.15, + "grad_norm": 0.5474773049354553, + "learning_rate": 0.0004078947767517034, + "loss": 2.9988, + "step": 23439 + }, + { + "epoch": 1.15, + "grad_norm": 0.5493378639221191, + "learning_rate": 0.000407880410615728, + "loss": 3.0624, + "step": 23440 + }, + { + "epoch": 1.15, + "grad_norm": 0.5190552473068237, + "learning_rate": 0.0004078660441956164, + "loss": 3.2048, + "step": 23441 + }, + { + "epoch": 1.15, + "grad_norm": 0.5590404272079468, + "learning_rate": 0.0004078516774914064, + "loss": 3.277, + "step": 23442 + }, + { + "epoch": 1.15, + "grad_norm": 0.5510579347610474, + "learning_rate": 0.00040783731050313577, + "loss": 3.0241, + "step": 23443 + }, + { + "epoch": 1.15, + "grad_norm": 0.534829318523407, + "learning_rate": 0.0004078229432308425, + "loss": 3.1232, + "step": 23444 + }, + { + "epoch": 1.15, + "grad_norm": 0.5541685223579407, + "learning_rate": 0.0004078085756745643, + "loss": 3.0511, + "step": 23445 + }, + { + "epoch": 1.15, + "grad_norm": 0.5376542806625366, + "learning_rate": 0.0004077942078343391, + "loss": 3.2308, + "step": 23446 + }, + { + "epoch": 1.15, + "grad_norm": 0.5467217564582825, + "learning_rate": 0.00040777983971020473, + "loss": 3.0171, + "step": 23447 + }, + { + "epoch": 1.15, + "grad_norm": 0.5329676866531372, + "learning_rate": 0.0004077654713021989, + "loss": 3.0905, + "step": 23448 + }, + { + "epoch": 1.15, + "grad_norm": 0.5383985638618469, + "learning_rate": 0.0004077511026103596, + "loss": 2.947, + "step": 23449 + }, + { + "epoch": 1.15, + "grad_norm": 0.5557945966720581, + "learning_rate": 0.00040773673363472465, + "loss": 3.0836, + "step": 23450 + }, + { + "epoch": 1.15, + "grad_norm": 0.5461722612380981, + "learning_rate": 0.0004077223643753319, + "loss": 3.061, + "step": 23451 + }, + { + "epoch": 1.15, + "grad_norm": 0.5956500172615051, + "learning_rate": 0.00040770799483221914, + "loss": 3.1172, + "step": 23452 + }, + { + "epoch": 1.15, + "grad_norm": 0.5400736331939697, + "learning_rate": 0.00040769362500542425, + "loss": 3.1167, + "step": 23453 + }, + { + "epoch": 1.15, + "grad_norm": 0.5344555377960205, + "learning_rate": 0.0004076792548949851, + "loss": 3.1462, + "step": 23454 + }, + { + "epoch": 1.15, + "grad_norm": 0.5609763860702515, + "learning_rate": 0.00040766488450093947, + "loss": 3.0945, + "step": 23455 + }, + { + "epoch": 1.15, + "grad_norm": 0.5006883144378662, + "learning_rate": 0.00040765051382332527, + "loss": 3.1908, + "step": 23456 + }, + { + "epoch": 1.15, + "grad_norm": 0.566444456577301, + "learning_rate": 0.0004076361428621804, + "loss": 3.2382, + "step": 23457 + }, + { + "epoch": 1.15, + "grad_norm": 0.5385657548904419, + "learning_rate": 0.00040762177161754264, + "loss": 2.8872, + "step": 23458 + }, + { + "epoch": 1.15, + "grad_norm": 0.5454058647155762, + "learning_rate": 0.00040760740008944973, + "loss": 2.94, + "step": 23459 + }, + { + "epoch": 1.15, + "grad_norm": 0.5491828322410583, + "learning_rate": 0.00040759302827793965, + "loss": 3.0851, + "step": 23460 + }, + { + "epoch": 1.15, + "grad_norm": 0.5193360447883606, + "learning_rate": 0.0004075786561830503, + "loss": 2.9652, + "step": 23461 + }, + { + "epoch": 1.15, + "grad_norm": 0.534925103187561, + "learning_rate": 0.0004075642838048195, + "loss": 3.0946, + "step": 23462 + }, + { + "epoch": 1.15, + "grad_norm": 0.5688135027885437, + "learning_rate": 0.00040754991114328506, + "loss": 2.9249, + "step": 23463 + }, + { + "epoch": 1.15, + "grad_norm": 0.5361714959144592, + "learning_rate": 0.00040753553819848485, + "loss": 3.122, + "step": 23464 + }, + { + "epoch": 1.15, + "grad_norm": 0.5302151441574097, + "learning_rate": 0.0004075211649704568, + "loss": 2.9271, + "step": 23465 + }, + { + "epoch": 1.15, + "grad_norm": 0.5387076735496521, + "learning_rate": 0.0004075067914592385, + "loss": 3.0661, + "step": 23466 + }, + { + "epoch": 1.15, + "grad_norm": 0.5546697974205017, + "learning_rate": 0.0004074924176648681, + "loss": 3.1147, + "step": 23467 + }, + { + "epoch": 1.15, + "grad_norm": 0.5648727416992188, + "learning_rate": 0.0004074780435873834, + "loss": 2.842, + "step": 23468 + }, + { + "epoch": 1.15, + "grad_norm": 0.5225715637207031, + "learning_rate": 0.00040746366922682223, + "loss": 3.1203, + "step": 23469 + }, + { + "epoch": 1.15, + "grad_norm": 0.5498008728027344, + "learning_rate": 0.00040744929458322234, + "loss": 3.1043, + "step": 23470 + }, + { + "epoch": 1.15, + "grad_norm": 0.5694062113761902, + "learning_rate": 0.0004074349196566217, + "loss": 3.1744, + "step": 23471 + }, + { + "epoch": 1.15, + "grad_norm": 0.5407145023345947, + "learning_rate": 0.0004074205444470582, + "loss": 3.1199, + "step": 23472 + }, + { + "epoch": 1.15, + "grad_norm": 0.5782432556152344, + "learning_rate": 0.00040740616895456976, + "loss": 3.1877, + "step": 23473 + }, + { + "epoch": 1.15, + "grad_norm": 0.5868061184883118, + "learning_rate": 0.00040739179317919404, + "loss": 3.298, + "step": 23474 + }, + { + "epoch": 1.15, + "grad_norm": 0.5635443329811096, + "learning_rate": 0.00040737741712096895, + "loss": 3.4142, + "step": 23475 + }, + { + "epoch": 1.15, + "grad_norm": 0.5290628671646118, + "learning_rate": 0.00040736304077993254, + "loss": 3.034, + "step": 23476 + }, + { + "epoch": 1.15, + "grad_norm": 0.5918185710906982, + "learning_rate": 0.00040734866415612244, + "loss": 3.0493, + "step": 23477 + }, + { + "epoch": 1.15, + "grad_norm": 0.5599088668823242, + "learning_rate": 0.00040733428724957664, + "loss": 3.263, + "step": 23478 + }, + { + "epoch": 1.15, + "grad_norm": 0.5094558596611023, + "learning_rate": 0.000407319910060333, + "loss": 3.1001, + "step": 23479 + }, + { + "epoch": 1.15, + "grad_norm": 0.6165416836738586, + "learning_rate": 0.0004073055325884293, + "loss": 3.0571, + "step": 23480 + }, + { + "epoch": 1.15, + "grad_norm": 0.5464115142822266, + "learning_rate": 0.00040729115483390357, + "loss": 3.1606, + "step": 23481 + }, + { + "epoch": 1.15, + "grad_norm": 0.5529763698577881, + "learning_rate": 0.00040727677679679354, + "loss": 3.0281, + "step": 23482 + }, + { + "epoch": 1.15, + "grad_norm": 0.5393357276916504, + "learning_rate": 0.00040726239847713717, + "loss": 3.0014, + "step": 23483 + }, + { + "epoch": 1.15, + "grad_norm": 0.5649782419204712, + "learning_rate": 0.00040724801987497236, + "loss": 2.9159, + "step": 23484 + }, + { + "epoch": 1.15, + "grad_norm": 0.5705831050872803, + "learning_rate": 0.0004072336409903367, + "loss": 3.0978, + "step": 23485 + }, + { + "epoch": 1.15, + "grad_norm": 0.5222892165184021, + "learning_rate": 0.0004072192618232683, + "loss": 3.0826, + "step": 23486 + }, + { + "epoch": 1.15, + "grad_norm": 0.5631487965583801, + "learning_rate": 0.0004072048823738052, + "loss": 3.0558, + "step": 23487 + }, + { + "epoch": 1.15, + "grad_norm": 0.5665938258171082, + "learning_rate": 0.00040719050264198493, + "loss": 3.2, + "step": 23488 + }, + { + "epoch": 1.15, + "grad_norm": 0.5198308825492859, + "learning_rate": 0.0004071761226278455, + "loss": 2.922, + "step": 23489 + }, + { + "epoch": 1.15, + "grad_norm": 0.5553904175758362, + "learning_rate": 0.00040716174233142475, + "loss": 3.1187, + "step": 23490 + }, + { + "epoch": 1.15, + "grad_norm": 0.5327962636947632, + "learning_rate": 0.00040714736175276073, + "loss": 3.0355, + "step": 23491 + }, + { + "epoch": 1.15, + "grad_norm": 0.5585110187530518, + "learning_rate": 0.0004071329808918911, + "loss": 3.0713, + "step": 23492 + }, + { + "epoch": 1.15, + "grad_norm": 0.5731109976768494, + "learning_rate": 0.0004071185997488538, + "loss": 3.1877, + "step": 23493 + }, + { + "epoch": 1.15, + "grad_norm": 0.5556461215019226, + "learning_rate": 0.0004071042183236867, + "loss": 2.8029, + "step": 23494 + }, + { + "epoch": 1.15, + "grad_norm": 0.5197128653526306, + "learning_rate": 0.00040708983661642785, + "loss": 3.1096, + "step": 23495 + }, + { + "epoch": 1.15, + "grad_norm": 0.5375272035598755, + "learning_rate": 0.00040707545462711483, + "loss": 3.1094, + "step": 23496 + }, + { + "epoch": 1.15, + "grad_norm": 0.5286567807197571, + "learning_rate": 0.00040706107235578565, + "loss": 2.863, + "step": 23497 + }, + { + "epoch": 1.15, + "grad_norm": 0.5974603891372681, + "learning_rate": 0.00040704668980247837, + "loss": 2.7441, + "step": 23498 + }, + { + "epoch": 1.15, + "grad_norm": 0.5531563758850098, + "learning_rate": 0.0004070323069672306, + "loss": 2.9755, + "step": 23499 + }, + { + "epoch": 1.15, + "grad_norm": 0.518351137638092, + "learning_rate": 0.00040701792385008034, + "loss": 3.214, + "step": 23500 + }, + { + "epoch": 1.15, + "grad_norm": 0.5526236891746521, + "learning_rate": 0.00040700354045106543, + "loss": 2.9767, + "step": 23501 + }, + { + "epoch": 1.15, + "grad_norm": 0.5635164976119995, + "learning_rate": 0.0004069891567702238, + "loss": 3.2102, + "step": 23502 + }, + { + "epoch": 1.15, + "grad_norm": 0.5762778520584106, + "learning_rate": 0.00040697477280759336, + "loss": 3.1312, + "step": 23503 + }, + { + "epoch": 1.15, + "grad_norm": 0.5641893148422241, + "learning_rate": 0.000406960388563212, + "loss": 3.2119, + "step": 23504 + }, + { + "epoch": 1.15, + "grad_norm": 0.5612379908561707, + "learning_rate": 0.0004069460040371175, + "loss": 3.1178, + "step": 23505 + }, + { + "epoch": 1.15, + "grad_norm": 0.5747913718223572, + "learning_rate": 0.0004069316192293478, + "loss": 3.3066, + "step": 23506 + }, + { + "epoch": 1.15, + "grad_norm": 0.5277937650680542, + "learning_rate": 0.0004069172341399408, + "loss": 3.2138, + "step": 23507 + }, + { + "epoch": 1.15, + "grad_norm": 0.5456346869468689, + "learning_rate": 0.0004069028487689344, + "loss": 3.0695, + "step": 23508 + }, + { + "epoch": 1.15, + "grad_norm": 0.5213927626609802, + "learning_rate": 0.00040688846311636654, + "loss": 3.2051, + "step": 23509 + }, + { + "epoch": 1.15, + "grad_norm": 0.5669615268707275, + "learning_rate": 0.00040687407718227494, + "loss": 2.951, + "step": 23510 + }, + { + "epoch": 1.15, + "grad_norm": 0.5207639336585999, + "learning_rate": 0.0004068596909666975, + "loss": 3.2213, + "step": 23511 + }, + { + "epoch": 1.15, + "grad_norm": 0.5501463413238525, + "learning_rate": 0.0004068453044696724, + "loss": 3.1305, + "step": 23512 + }, + { + "epoch": 1.15, + "grad_norm": 0.5365391969680786, + "learning_rate": 0.00040683091769123724, + "loss": 2.9962, + "step": 23513 + }, + { + "epoch": 1.15, + "grad_norm": 0.514644980430603, + "learning_rate": 0.00040681653063143, + "loss": 3.0177, + "step": 23514 + }, + { + "epoch": 1.15, + "grad_norm": 0.5442690849304199, + "learning_rate": 0.0004068021432902886, + "loss": 3.1254, + "step": 23515 + }, + { + "epoch": 1.15, + "grad_norm": 0.508587121963501, + "learning_rate": 0.00040678775566785086, + "loss": 3.0467, + "step": 23516 + }, + { + "epoch": 1.15, + "grad_norm": 0.5336624383926392, + "learning_rate": 0.00040677336776415477, + "loss": 3.0867, + "step": 23517 + }, + { + "epoch": 1.15, + "grad_norm": 0.5563424229621887, + "learning_rate": 0.00040675897957923816, + "loss": 3.1045, + "step": 23518 + }, + { + "epoch": 1.15, + "grad_norm": 0.5636367797851562, + "learning_rate": 0.0004067445911131389, + "loss": 3.4081, + "step": 23519 + }, + { + "epoch": 1.15, + "grad_norm": 0.5157569050788879, + "learning_rate": 0.00040673020236589504, + "loss": 3.1305, + "step": 23520 + }, + { + "epoch": 1.15, + "grad_norm": 0.5343283414840698, + "learning_rate": 0.00040671581333754427, + "loss": 3.1285, + "step": 23521 + }, + { + "epoch": 1.15, + "grad_norm": 0.5524337291717529, + "learning_rate": 0.00040670142402812455, + "loss": 3.1464, + "step": 23522 + }, + { + "epoch": 1.15, + "grad_norm": 0.638904333114624, + "learning_rate": 0.0004066870344376739, + "loss": 2.9966, + "step": 23523 + }, + { + "epoch": 1.15, + "grad_norm": 0.5390263199806213, + "learning_rate": 0.00040667264456623016, + "loss": 3.2796, + "step": 23524 + }, + { + "epoch": 1.15, + "grad_norm": 0.588818371295929, + "learning_rate": 0.0004066582544138312, + "loss": 3.1716, + "step": 23525 + }, + { + "epoch": 1.15, + "grad_norm": 0.6020216345787048, + "learning_rate": 0.00040664386398051483, + "loss": 3.2594, + "step": 23526 + }, + { + "epoch": 1.15, + "grad_norm": 0.5486242771148682, + "learning_rate": 0.00040662947326631913, + "loss": 3.0998, + "step": 23527 + }, + { + "epoch": 1.15, + "grad_norm": 0.5452220439910889, + "learning_rate": 0.0004066150822712819, + "loss": 2.853, + "step": 23528 + }, + { + "epoch": 1.15, + "grad_norm": 0.5223864912986755, + "learning_rate": 0.000406600690995441, + "loss": 3.1148, + "step": 23529 + }, + { + "epoch": 1.15, + "grad_norm": 0.5381213426589966, + "learning_rate": 0.00040658629943883447, + "loss": 3.2326, + "step": 23530 + }, + { + "epoch": 1.15, + "grad_norm": 0.5543666481971741, + "learning_rate": 0.00040657190760150013, + "loss": 3.2087, + "step": 23531 + }, + { + "epoch": 1.15, + "grad_norm": 0.5769178867340088, + "learning_rate": 0.00040655751548347583, + "loss": 3.337, + "step": 23532 + }, + { + "epoch": 1.15, + "grad_norm": 0.5365214347839355, + "learning_rate": 0.0004065431230847996, + "loss": 3.0142, + "step": 23533 + }, + { + "epoch": 1.15, + "grad_norm": 0.5313071012496948, + "learning_rate": 0.00040652873040550926, + "loss": 3.0712, + "step": 23534 + }, + { + "epoch": 1.15, + "grad_norm": 0.6116557121276855, + "learning_rate": 0.0004065143374456429, + "loss": 3.0255, + "step": 23535 + }, + { + "epoch": 1.15, + "grad_norm": 0.5586936473846436, + "learning_rate": 0.0004064999442052381, + "loss": 2.9973, + "step": 23536 + }, + { + "epoch": 1.15, + "grad_norm": 0.5303226709365845, + "learning_rate": 0.00040648555068433293, + "loss": 3.2365, + "step": 23537 + }, + { + "epoch": 1.15, + "grad_norm": 0.5624071359634399, + "learning_rate": 0.0004064711568829654, + "loss": 3.0904, + "step": 23538 + }, + { + "epoch": 1.15, + "grad_norm": 0.5486844778060913, + "learning_rate": 0.00040645676280117327, + "loss": 3.0888, + "step": 23539 + }, + { + "epoch": 1.15, + "grad_norm": 0.5096597671508789, + "learning_rate": 0.00040644236843899456, + "loss": 3.0044, + "step": 23540 + }, + { + "epoch": 1.15, + "grad_norm": 0.5384427905082703, + "learning_rate": 0.00040642797379646713, + "loss": 3.0459, + "step": 23541 + }, + { + "epoch": 1.15, + "grad_norm": 0.551108181476593, + "learning_rate": 0.0004064135788736289, + "loss": 3.2747, + "step": 23542 + }, + { + "epoch": 1.15, + "grad_norm": 0.5435267090797424, + "learning_rate": 0.00040639918367051776, + "loss": 3.2177, + "step": 23543 + }, + { + "epoch": 1.15, + "grad_norm": 0.5716750621795654, + "learning_rate": 0.00040638478818717165, + "loss": 3.2905, + "step": 23544 + }, + { + "epoch": 1.15, + "grad_norm": 0.5734542608261108, + "learning_rate": 0.0004063703924236285, + "loss": 3.4277, + "step": 23545 + }, + { + "epoch": 1.15, + "grad_norm": 0.5500821471214294, + "learning_rate": 0.0004063559963799262, + "loss": 3.1241, + "step": 23546 + }, + { + "epoch": 1.15, + "grad_norm": 0.6079295873641968, + "learning_rate": 0.00040634160005610263, + "loss": 2.9869, + "step": 23547 + }, + { + "epoch": 1.15, + "grad_norm": 0.5442874431610107, + "learning_rate": 0.00040632720345219585, + "loss": 3.1278, + "step": 23548 + }, + { + "epoch": 1.15, + "grad_norm": 0.5300388336181641, + "learning_rate": 0.00040631280656824365, + "loss": 3.0279, + "step": 23549 + }, + { + "epoch": 1.15, + "grad_norm": 0.5295524597167969, + "learning_rate": 0.000406298409404284, + "loss": 3.2271, + "step": 23550 + }, + { + "epoch": 1.15, + "grad_norm": 0.5851762890815735, + "learning_rate": 0.0004062840119603547, + "loss": 3.0275, + "step": 23551 + }, + { + "epoch": 1.15, + "grad_norm": 0.5763968825340271, + "learning_rate": 0.00040626961423649376, + "loss": 3.0724, + "step": 23552 + }, + { + "epoch": 1.15, + "grad_norm": 0.534521758556366, + "learning_rate": 0.0004062552162327393, + "loss": 3.0856, + "step": 23553 + }, + { + "epoch": 1.15, + "grad_norm": 0.5817490220069885, + "learning_rate": 0.0004062408179491288, + "loss": 3.1984, + "step": 23554 + }, + { + "epoch": 1.15, + "grad_norm": 0.548794686794281, + "learning_rate": 0.0004062264193857006, + "loss": 3.0361, + "step": 23555 + }, + { + "epoch": 1.15, + "grad_norm": 0.5278934240341187, + "learning_rate": 0.0004062120205424924, + "loss": 2.9397, + "step": 23556 + }, + { + "epoch": 1.15, + "grad_norm": 0.5557737350463867, + "learning_rate": 0.0004061976214195422, + "loss": 3.0761, + "step": 23557 + }, + { + "epoch": 1.15, + "grad_norm": 0.6352978944778442, + "learning_rate": 0.0004061832220168879, + "loss": 3.2706, + "step": 23558 + }, + { + "epoch": 1.15, + "grad_norm": 0.5525439381599426, + "learning_rate": 0.00040616882233456735, + "loss": 3.1981, + "step": 23559 + }, + { + "epoch": 1.15, + "grad_norm": 0.5778921246528625, + "learning_rate": 0.0004061544223726187, + "loss": 3.0622, + "step": 23560 + }, + { + "epoch": 1.15, + "grad_norm": 0.5746831297874451, + "learning_rate": 0.0004061400221310796, + "loss": 3.335, + "step": 23561 + }, + { + "epoch": 1.15, + "grad_norm": 0.5181114077568054, + "learning_rate": 0.00040612562160998817, + "loss": 3.0855, + "step": 23562 + }, + { + "epoch": 1.15, + "grad_norm": 0.5968653559684753, + "learning_rate": 0.00040611122080938224, + "loss": 3.1416, + "step": 23563 + }, + { + "epoch": 1.15, + "grad_norm": 0.5655339956283569, + "learning_rate": 0.0004060968197292999, + "loss": 3.0138, + "step": 23564 + }, + { + "epoch": 1.15, + "grad_norm": 0.5550764203071594, + "learning_rate": 0.00040608241836977886, + "loss": 3.0035, + "step": 23565 + }, + { + "epoch": 1.15, + "grad_norm": 0.5673668384552002, + "learning_rate": 0.0004060680167308571, + "loss": 3.0248, + "step": 23566 + }, + { + "epoch": 1.15, + "grad_norm": 0.5502573251724243, + "learning_rate": 0.0004060536148125726, + "loss": 3.3946, + "step": 23567 + }, + { + "epoch": 1.16, + "grad_norm": 0.5763288140296936, + "learning_rate": 0.00040603921261496336, + "loss": 3.1626, + "step": 23568 + }, + { + "epoch": 1.16, + "grad_norm": 0.5315199494361877, + "learning_rate": 0.0004060248101380672, + "loss": 2.9473, + "step": 23569 + }, + { + "epoch": 1.16, + "grad_norm": 0.5365606546401978, + "learning_rate": 0.00040601040738192214, + "loss": 3.0127, + "step": 23570 + }, + { + "epoch": 1.16, + "grad_norm": 0.5493980050086975, + "learning_rate": 0.000405996004346566, + "loss": 2.8699, + "step": 23571 + }, + { + "epoch": 1.16, + "grad_norm": 0.5779330730438232, + "learning_rate": 0.0004059816010320369, + "loss": 3.0912, + "step": 23572 + }, + { + "epoch": 1.16, + "grad_norm": 0.5333728790283203, + "learning_rate": 0.00040596719743837253, + "loss": 3.2728, + "step": 23573 + }, + { + "epoch": 1.16, + "grad_norm": 0.5707440376281738, + "learning_rate": 0.0004059527935656109, + "loss": 3.0194, + "step": 23574 + }, + { + "epoch": 1.16, + "grad_norm": 0.5592421293258667, + "learning_rate": 0.0004059383894137902, + "loss": 2.8304, + "step": 23575 + }, + { + "epoch": 1.16, + "grad_norm": 0.5238040089607239, + "learning_rate": 0.0004059239849829481, + "loss": 2.9382, + "step": 23576 + }, + { + "epoch": 1.16, + "grad_norm": 0.5463867783546448, + "learning_rate": 0.00040590958027312255, + "loss": 3.3477, + "step": 23577 + }, + { + "epoch": 1.16, + "grad_norm": 0.5499131083488464, + "learning_rate": 0.0004058951752843516, + "loss": 3.274, + "step": 23578 + }, + { + "epoch": 1.16, + "grad_norm": 0.5574745535850525, + "learning_rate": 0.0004058807700166731, + "loss": 2.9782, + "step": 23579 + }, + { + "epoch": 1.16, + "grad_norm": 0.5117610692977905, + "learning_rate": 0.0004058663644701251, + "loss": 3.0166, + "step": 23580 + }, + { + "epoch": 1.16, + "grad_norm": 0.5790355205535889, + "learning_rate": 0.00040585195864474544, + "loss": 3.2326, + "step": 23581 + }, + { + "epoch": 1.16, + "grad_norm": 0.5722032785415649, + "learning_rate": 0.00040583755254057204, + "loss": 3.116, + "step": 23582 + }, + { + "epoch": 1.16, + "grad_norm": 0.5745276212692261, + "learning_rate": 0.00040582314615764293, + "loss": 3.0388, + "step": 23583 + }, + { + "epoch": 1.16, + "grad_norm": 0.5759443640708923, + "learning_rate": 0.000405808739495996, + "loss": 2.9188, + "step": 23584 + }, + { + "epoch": 1.16, + "grad_norm": 0.5403763651847839, + "learning_rate": 0.00040579433255566927, + "loss": 3.0228, + "step": 23585 + }, + { + "epoch": 1.16, + "grad_norm": 0.5597376227378845, + "learning_rate": 0.00040577992533670065, + "loss": 3.3179, + "step": 23586 + }, + { + "epoch": 1.16, + "grad_norm": 0.565265417098999, + "learning_rate": 0.000405765517839128, + "loss": 3.2375, + "step": 23587 + }, + { + "epoch": 1.16, + "grad_norm": 0.584064781665802, + "learning_rate": 0.00040575111006298925, + "loss": 2.8922, + "step": 23588 + }, + { + "epoch": 1.16, + "grad_norm": 0.5477665066719055, + "learning_rate": 0.00040573670200832253, + "loss": 3.0098, + "step": 23589 + }, + { + "epoch": 1.16, + "grad_norm": 0.5629181265830994, + "learning_rate": 0.0004057222936751657, + "loss": 3.0527, + "step": 23590 + }, + { + "epoch": 1.16, + "grad_norm": 0.5321118235588074, + "learning_rate": 0.0004057078850635567, + "loss": 3.1828, + "step": 23591 + }, + { + "epoch": 1.16, + "grad_norm": 0.5479146242141724, + "learning_rate": 0.00040569347617353343, + "loss": 3.1211, + "step": 23592 + }, + { + "epoch": 1.16, + "grad_norm": 0.5773611068725586, + "learning_rate": 0.0004056790670051339, + "loss": 2.9645, + "step": 23593 + }, + { + "epoch": 1.16, + "grad_norm": 0.5333780646324158, + "learning_rate": 0.000405664657558396, + "loss": 3.1218, + "step": 23594 + }, + { + "epoch": 1.16, + "grad_norm": 0.5267741680145264, + "learning_rate": 0.00040565024783335775, + "loss": 3.0897, + "step": 23595 + }, + { + "epoch": 1.16, + "grad_norm": 0.5395077466964722, + "learning_rate": 0.00040563583783005707, + "loss": 3.0424, + "step": 23596 + }, + { + "epoch": 1.16, + "grad_norm": 0.562576949596405, + "learning_rate": 0.00040562142754853206, + "loss": 3.3082, + "step": 23597 + }, + { + "epoch": 1.16, + "grad_norm": 0.5555576682090759, + "learning_rate": 0.00040560701698882034, + "loss": 3.0102, + "step": 23598 + }, + { + "epoch": 1.16, + "grad_norm": 0.5478713512420654, + "learning_rate": 0.0004055926061509602, + "loss": 2.9769, + "step": 23599 + }, + { + "epoch": 1.16, + "grad_norm": 0.5155161619186401, + "learning_rate": 0.00040557819503498943, + "loss": 3.0548, + "step": 23600 + }, + { + "epoch": 1.16, + "grad_norm": 0.563854455947876, + "learning_rate": 0.0004055637836409459, + "loss": 2.9703, + "step": 23601 + }, + { + "epoch": 1.16, + "grad_norm": 0.5164005756378174, + "learning_rate": 0.0004055493719688678, + "loss": 3.032, + "step": 23602 + }, + { + "epoch": 1.16, + "grad_norm": 0.6590232849121094, + "learning_rate": 0.00040553496001879296, + "loss": 3.1402, + "step": 23603 + }, + { + "epoch": 1.16, + "grad_norm": 0.5741829872131348, + "learning_rate": 0.00040552054779075935, + "loss": 2.7843, + "step": 23604 + }, + { + "epoch": 1.16, + "grad_norm": 0.5617377758026123, + "learning_rate": 0.00040550613528480493, + "loss": 3.1812, + "step": 23605 + }, + { + "epoch": 1.16, + "grad_norm": 0.52630615234375, + "learning_rate": 0.0004054917225009676, + "loss": 3.1588, + "step": 23606 + }, + { + "epoch": 1.16, + "grad_norm": 0.5488269329071045, + "learning_rate": 0.0004054773094392854, + "loss": 3.2574, + "step": 23607 + }, + { + "epoch": 1.16, + "grad_norm": 0.7178289294242859, + "learning_rate": 0.00040546289609979624, + "loss": 2.9536, + "step": 23608 + }, + { + "epoch": 1.16, + "grad_norm": 0.5843186974525452, + "learning_rate": 0.0004054484824825382, + "loss": 3.1758, + "step": 23609 + }, + { + "epoch": 1.16, + "grad_norm": 0.6291219592094421, + "learning_rate": 0.00040543406858754903, + "loss": 3.0335, + "step": 23610 + }, + { + "epoch": 1.16, + "grad_norm": 0.5768383741378784, + "learning_rate": 0.0004054196544148669, + "loss": 2.8571, + "step": 23611 + }, + { + "epoch": 1.16, + "grad_norm": 0.5430237054824829, + "learning_rate": 0.0004054052399645297, + "loss": 2.9117, + "step": 23612 + }, + { + "epoch": 1.16, + "grad_norm": 0.5219388008117676, + "learning_rate": 0.00040539082523657536, + "loss": 3.1468, + "step": 23613 + }, + { + "epoch": 1.16, + "grad_norm": 0.547167181968689, + "learning_rate": 0.0004053764102310417, + "loss": 3.237, + "step": 23614 + }, + { + "epoch": 1.16, + "grad_norm": 0.5904198288917542, + "learning_rate": 0.0004053619949479671, + "loss": 3.1433, + "step": 23615 + }, + { + "epoch": 1.16, + "grad_norm": 0.5035914778709412, + "learning_rate": 0.0004053475793873892, + "loss": 2.966, + "step": 23616 + }, + { + "epoch": 1.16, + "grad_norm": 0.5650410652160645, + "learning_rate": 0.00040533316354934607, + "loss": 3.1302, + "step": 23617 + }, + { + "epoch": 1.16, + "grad_norm": 0.5530869960784912, + "learning_rate": 0.0004053187474338756, + "loss": 3.0593, + "step": 23618 + }, + { + "epoch": 1.16, + "grad_norm": 0.560907781124115, + "learning_rate": 0.00040530433104101583, + "loss": 3.1948, + "step": 23619 + }, + { + "epoch": 1.16, + "grad_norm": 0.5566398501396179, + "learning_rate": 0.00040528991437080474, + "loss": 3.2553, + "step": 23620 + }, + { + "epoch": 1.16, + "grad_norm": 0.5609525442123413, + "learning_rate": 0.00040527549742328027, + "loss": 2.9138, + "step": 23621 + }, + { + "epoch": 1.16, + "grad_norm": 0.5385319590568542, + "learning_rate": 0.0004052610801984805, + "loss": 3.1255, + "step": 23622 + }, + { + "epoch": 1.16, + "grad_norm": 0.5298342108726501, + "learning_rate": 0.0004052466626964432, + "loss": 3.3531, + "step": 23623 + }, + { + "epoch": 1.16, + "grad_norm": 0.5590463280677795, + "learning_rate": 0.00040523224491720636, + "loss": 3.0695, + "step": 23624 + }, + { + "epoch": 1.16, + "grad_norm": 0.5449240803718567, + "learning_rate": 0.00040521782686080816, + "loss": 3.0979, + "step": 23625 + }, + { + "epoch": 1.16, + "grad_norm": 0.5203918218612671, + "learning_rate": 0.00040520340852728647, + "loss": 3.023, + "step": 23626 + }, + { + "epoch": 1.16, + "grad_norm": 0.6454471945762634, + "learning_rate": 0.0004051889899166792, + "loss": 3.0371, + "step": 23627 + }, + { + "epoch": 1.16, + "grad_norm": 0.546628475189209, + "learning_rate": 0.0004051745710290244, + "loss": 2.9801, + "step": 23628 + }, + { + "epoch": 1.16, + "grad_norm": 0.5471029877662659, + "learning_rate": 0.00040516015186436, + "loss": 3.0225, + "step": 23629 + }, + { + "epoch": 1.16, + "grad_norm": 0.57635498046875, + "learning_rate": 0.00040514573242272396, + "loss": 3.1095, + "step": 23630 + }, + { + "epoch": 1.16, + "grad_norm": 0.5368325710296631, + "learning_rate": 0.0004051313127041544, + "loss": 3.0904, + "step": 23631 + }, + { + "epoch": 1.16, + "grad_norm": 0.5363153219223022, + "learning_rate": 0.00040511689270868905, + "loss": 2.986, + "step": 23632 + }, + { + "epoch": 1.16, + "grad_norm": 0.5831332802772522, + "learning_rate": 0.00040510247243636614, + "loss": 3.0365, + "step": 23633 + }, + { + "epoch": 1.16, + "grad_norm": 0.5615931749343872, + "learning_rate": 0.0004050880518872235, + "loss": 3.142, + "step": 23634 + }, + { + "epoch": 1.16, + "grad_norm": 0.5161574482917786, + "learning_rate": 0.0004050736310612992, + "loss": 3.2197, + "step": 23635 + }, + { + "epoch": 1.16, + "grad_norm": 0.5491262078285217, + "learning_rate": 0.00040505920995863114, + "loss": 2.9822, + "step": 23636 + }, + { + "epoch": 1.16, + "grad_norm": 0.5797311067581177, + "learning_rate": 0.00040504478857925736, + "loss": 3.1336, + "step": 23637 + }, + { + "epoch": 1.16, + "grad_norm": 0.5718839168548584, + "learning_rate": 0.00040503036692321584, + "loss": 3.0789, + "step": 23638 + }, + { + "epoch": 1.16, + "grad_norm": 0.529486358165741, + "learning_rate": 0.0004050159449905445, + "loss": 3.0697, + "step": 23639 + }, + { + "epoch": 1.16, + "grad_norm": 0.5350091457366943, + "learning_rate": 0.0004050015227812815, + "loss": 3.1539, + "step": 23640 + }, + { + "epoch": 1.16, + "grad_norm": 0.5986202359199524, + "learning_rate": 0.0004049871002954645, + "loss": 3.2035, + "step": 23641 + }, + { + "epoch": 1.16, + "grad_norm": 0.5608084797859192, + "learning_rate": 0.0004049726775331318, + "loss": 3.0752, + "step": 23642 + }, + { + "epoch": 1.16, + "grad_norm": 0.5415424108505249, + "learning_rate": 0.00040495825449432125, + "loss": 3.2252, + "step": 23643 + }, + { + "epoch": 1.16, + "grad_norm": 0.5681462287902832, + "learning_rate": 0.00040494383117907083, + "loss": 3.228, + "step": 23644 + }, + { + "epoch": 1.16, + "grad_norm": 0.5260352492332458, + "learning_rate": 0.0004049294075874186, + "loss": 3.1951, + "step": 23645 + }, + { + "epoch": 1.16, + "grad_norm": 0.5569955706596375, + "learning_rate": 0.0004049149837194024, + "loss": 3.1433, + "step": 23646 + }, + { + "epoch": 1.16, + "grad_norm": 0.5537160634994507, + "learning_rate": 0.00040490055957506047, + "loss": 2.7933, + "step": 23647 + }, + { + "epoch": 1.16, + "grad_norm": 0.5751855373382568, + "learning_rate": 0.00040488613515443064, + "loss": 2.9887, + "step": 23648 + }, + { + "epoch": 1.16, + "grad_norm": 0.6018997430801392, + "learning_rate": 0.00040487171045755087, + "loss": 2.9387, + "step": 23649 + }, + { + "epoch": 1.16, + "grad_norm": 0.5773023366928101, + "learning_rate": 0.0004048572854844591, + "loss": 3.0559, + "step": 23650 + }, + { + "epoch": 1.16, + "grad_norm": 0.5616927146911621, + "learning_rate": 0.0004048428602351936, + "loss": 3.2197, + "step": 23651 + }, + { + "epoch": 1.16, + "grad_norm": 0.5932993292808533, + "learning_rate": 0.00040482843470979207, + "loss": 2.9324, + "step": 23652 + }, + { + "epoch": 1.16, + "grad_norm": 0.5337039232254028, + "learning_rate": 0.0004048140089082927, + "loss": 3.0964, + "step": 23653 + }, + { + "epoch": 1.16, + "grad_norm": 0.5320896506309509, + "learning_rate": 0.00040479958283073334, + "loss": 3.0553, + "step": 23654 + }, + { + "epoch": 1.16, + "grad_norm": 0.5578566193580627, + "learning_rate": 0.0004047851564771521, + "loss": 2.9863, + "step": 23655 + }, + { + "epoch": 1.16, + "grad_norm": 0.5269665718078613, + "learning_rate": 0.00040477072984758687, + "loss": 3.0494, + "step": 23656 + }, + { + "epoch": 1.16, + "grad_norm": 0.6187220811843872, + "learning_rate": 0.0004047563029420757, + "loss": 3.0293, + "step": 23657 + }, + { + "epoch": 1.16, + "grad_norm": 0.5536328554153442, + "learning_rate": 0.00040474187576065656, + "loss": 2.991, + "step": 23658 + }, + { + "epoch": 1.16, + "grad_norm": 0.5319032669067383, + "learning_rate": 0.0004047274483033676, + "loss": 3.0669, + "step": 23659 + }, + { + "epoch": 1.16, + "grad_norm": 0.5195010304450989, + "learning_rate": 0.00040471302057024653, + "loss": 2.8542, + "step": 23660 + }, + { + "epoch": 1.16, + "grad_norm": 0.5610055327415466, + "learning_rate": 0.0004046985925613316, + "loss": 2.9366, + "step": 23661 + }, + { + "epoch": 1.16, + "grad_norm": 0.5877775549888611, + "learning_rate": 0.0004046841642766608, + "loss": 3.2743, + "step": 23662 + }, + { + "epoch": 1.16, + "grad_norm": 0.5556247234344482, + "learning_rate": 0.000404669735716272, + "loss": 3.2213, + "step": 23663 + }, + { + "epoch": 1.16, + "grad_norm": 0.5412140488624573, + "learning_rate": 0.00040465530688020324, + "loss": 3.2489, + "step": 23664 + }, + { + "epoch": 1.16, + "grad_norm": 0.5714347958564758, + "learning_rate": 0.00040464087776849244, + "loss": 3.0755, + "step": 23665 + }, + { + "epoch": 1.16, + "grad_norm": 0.5732872486114502, + "learning_rate": 0.00040462644838117783, + "loss": 3.0111, + "step": 23666 + }, + { + "epoch": 1.16, + "grad_norm": 0.5459180474281311, + "learning_rate": 0.00040461201871829736, + "loss": 2.8323, + "step": 23667 + }, + { + "epoch": 1.16, + "grad_norm": 0.5625903606414795, + "learning_rate": 0.00040459758877988886, + "loss": 3.0188, + "step": 23668 + }, + { + "epoch": 1.16, + "grad_norm": 0.5748194456100464, + "learning_rate": 0.00040458315856599044, + "loss": 2.6947, + "step": 23669 + }, + { + "epoch": 1.16, + "grad_norm": 0.5342017412185669, + "learning_rate": 0.00040456872807664016, + "loss": 3.1381, + "step": 23670 + }, + { + "epoch": 1.16, + "grad_norm": 0.5473812818527222, + "learning_rate": 0.0004045542973118759, + "loss": 3.1439, + "step": 23671 + }, + { + "epoch": 1.16, + "grad_norm": 0.517485499382019, + "learning_rate": 0.00040453986627173586, + "loss": 3.0633, + "step": 23672 + }, + { + "epoch": 1.16, + "grad_norm": 0.5450983047485352, + "learning_rate": 0.0004045254349562579, + "loss": 3.056, + "step": 23673 + }, + { + "epoch": 1.16, + "grad_norm": 0.6088648438453674, + "learning_rate": 0.00040451100336548, + "loss": 3.1054, + "step": 23674 + }, + { + "epoch": 1.16, + "grad_norm": 0.5295372009277344, + "learning_rate": 0.0004044965714994402, + "loss": 3.1122, + "step": 23675 + }, + { + "epoch": 1.16, + "grad_norm": 0.5838629007339478, + "learning_rate": 0.00040448213935817654, + "loss": 3.1979, + "step": 23676 + }, + { + "epoch": 1.16, + "grad_norm": 0.5640658736228943, + "learning_rate": 0.0004044677069417272, + "loss": 3.163, + "step": 23677 + }, + { + "epoch": 1.16, + "grad_norm": 0.5375956296920776, + "learning_rate": 0.00040445327425012986, + "loss": 3.2325, + "step": 23678 + }, + { + "epoch": 1.16, + "grad_norm": 0.5397927761077881, + "learning_rate": 0.0004044388412834228, + "loss": 3.1302, + "step": 23679 + }, + { + "epoch": 1.16, + "grad_norm": 0.5797949433326721, + "learning_rate": 0.00040442440804164384, + "loss": 3.0378, + "step": 23680 + }, + { + "epoch": 1.16, + "grad_norm": 0.5403143763542175, + "learning_rate": 0.00040440997452483116, + "loss": 3.1009, + "step": 23681 + }, + { + "epoch": 1.16, + "grad_norm": 0.5413429141044617, + "learning_rate": 0.00040439554073302264, + "loss": 2.7718, + "step": 23682 + }, + { + "epoch": 1.16, + "grad_norm": 0.5402602553367615, + "learning_rate": 0.0004043811066662564, + "loss": 3.1233, + "step": 23683 + }, + { + "epoch": 1.16, + "grad_norm": 0.5142017006874084, + "learning_rate": 0.00040436667232457044, + "loss": 3.0242, + "step": 23684 + }, + { + "epoch": 1.16, + "grad_norm": 0.5384440422058105, + "learning_rate": 0.0004043522377080026, + "loss": 3.1141, + "step": 23685 + }, + { + "epoch": 1.16, + "grad_norm": 0.539061963558197, + "learning_rate": 0.0004043378028165912, + "loss": 2.9895, + "step": 23686 + }, + { + "epoch": 1.16, + "grad_norm": 0.5121895670890808, + "learning_rate": 0.0004043233676503741, + "loss": 3.0622, + "step": 23687 + }, + { + "epoch": 1.16, + "grad_norm": 0.5728363990783691, + "learning_rate": 0.00040430893220938925, + "loss": 3.0116, + "step": 23688 + }, + { + "epoch": 1.16, + "grad_norm": 0.5647080540657043, + "learning_rate": 0.00040429449649367487, + "loss": 3.0383, + "step": 23689 + }, + { + "epoch": 1.16, + "grad_norm": 0.5658575296401978, + "learning_rate": 0.0004042800605032687, + "loss": 3.0062, + "step": 23690 + }, + { + "epoch": 1.16, + "grad_norm": 0.5728873014450073, + "learning_rate": 0.00040426562423820904, + "loss": 2.9859, + "step": 23691 + }, + { + "epoch": 1.16, + "grad_norm": 0.5607736110687256, + "learning_rate": 0.0004042511876985338, + "loss": 3.1462, + "step": 23692 + }, + { + "epoch": 1.16, + "grad_norm": 0.5391986966133118, + "learning_rate": 0.00040423675088428095, + "loss": 3.165, + "step": 23693 + }, + { + "epoch": 1.16, + "grad_norm": 0.5368701815605164, + "learning_rate": 0.0004042223137954885, + "loss": 3.1621, + "step": 23694 + }, + { + "epoch": 1.16, + "grad_norm": 0.578046441078186, + "learning_rate": 0.0004042078764321945, + "loss": 3.2745, + "step": 23695 + }, + { + "epoch": 1.16, + "grad_norm": 0.5303065776824951, + "learning_rate": 0.0004041934387944372, + "loss": 3.2873, + "step": 23696 + }, + { + "epoch": 1.16, + "grad_norm": 0.5496455430984497, + "learning_rate": 0.00040417900088225435, + "loss": 2.9276, + "step": 23697 + }, + { + "epoch": 1.16, + "grad_norm": 0.6175884008407593, + "learning_rate": 0.00040416456269568404, + "loss": 2.8928, + "step": 23698 + }, + { + "epoch": 1.16, + "grad_norm": 0.5311169028282166, + "learning_rate": 0.0004041501242347644, + "loss": 3.1252, + "step": 23699 + }, + { + "epoch": 1.16, + "grad_norm": 0.5432295203208923, + "learning_rate": 0.0004041356854995332, + "loss": 3.2053, + "step": 23700 + }, + { + "epoch": 1.16, + "grad_norm": 0.5909796357154846, + "learning_rate": 0.00040412124649002876, + "loss": 3.0875, + "step": 23701 + }, + { + "epoch": 1.16, + "grad_norm": 0.5231812000274658, + "learning_rate": 0.000404106807206289, + "loss": 2.9765, + "step": 23702 + }, + { + "epoch": 1.16, + "grad_norm": 0.5160446763038635, + "learning_rate": 0.0004040923676483519, + "loss": 3.0671, + "step": 23703 + }, + { + "epoch": 1.16, + "grad_norm": 0.5727381706237793, + "learning_rate": 0.00040407792781625555, + "loss": 2.9777, + "step": 23704 + }, + { + "epoch": 1.16, + "grad_norm": 0.5669409036636353, + "learning_rate": 0.000404063487710038, + "loss": 3.185, + "step": 23705 + }, + { + "epoch": 1.16, + "grad_norm": 0.5885549783706665, + "learning_rate": 0.0004040490473297372, + "loss": 3.3162, + "step": 23706 + }, + { + "epoch": 1.16, + "grad_norm": 0.5521764159202576, + "learning_rate": 0.0004040346066753913, + "loss": 3.0025, + "step": 23707 + }, + { + "epoch": 1.16, + "grad_norm": 0.5233154892921448, + "learning_rate": 0.0004040201657470382, + "loss": 3.0222, + "step": 23708 + }, + { + "epoch": 1.16, + "grad_norm": 0.6025112867355347, + "learning_rate": 0.00040400572454471603, + "loss": 3.0527, + "step": 23709 + }, + { + "epoch": 1.16, + "grad_norm": 0.5110952258110046, + "learning_rate": 0.00040399128306846285, + "loss": 3.1567, + "step": 23710 + }, + { + "epoch": 1.16, + "grad_norm": 0.5763827562332153, + "learning_rate": 0.00040397684131831656, + "loss": 3.1964, + "step": 23711 + }, + { + "epoch": 1.16, + "grad_norm": 0.5659586191177368, + "learning_rate": 0.00040396239929431534, + "loss": 3.1401, + "step": 23712 + }, + { + "epoch": 1.16, + "grad_norm": 0.5608625411987305, + "learning_rate": 0.0004039479569964971, + "loss": 3.0601, + "step": 23713 + }, + { + "epoch": 1.16, + "grad_norm": 0.5368182063102722, + "learning_rate": 0.00040393351442490007, + "loss": 2.9722, + "step": 23714 + }, + { + "epoch": 1.16, + "grad_norm": 0.5225261449813843, + "learning_rate": 0.00040391907157956214, + "loss": 3.0737, + "step": 23715 + }, + { + "epoch": 1.16, + "grad_norm": 0.5581330060958862, + "learning_rate": 0.0004039046284605212, + "loss": 3.1152, + "step": 23716 + }, + { + "epoch": 1.16, + "grad_norm": 0.5440344214439392, + "learning_rate": 0.00040389018506781563, + "loss": 3.2068, + "step": 23717 + }, + { + "epoch": 1.16, + "grad_norm": 0.5621079802513123, + "learning_rate": 0.0004038757414014833, + "loss": 3.0555, + "step": 23718 + }, + { + "epoch": 1.16, + "grad_norm": 0.566859781742096, + "learning_rate": 0.00040386129746156215, + "loss": 2.9883, + "step": 23719 + }, + { + "epoch": 1.16, + "grad_norm": 0.5255882143974304, + "learning_rate": 0.0004038468532480905, + "loss": 2.8779, + "step": 23720 + }, + { + "epoch": 1.16, + "grad_norm": 0.5612319111824036, + "learning_rate": 0.0004038324087611061, + "loss": 3.115, + "step": 23721 + }, + { + "epoch": 1.16, + "grad_norm": 0.5677599906921387, + "learning_rate": 0.00040381796400064716, + "loss": 3.0859, + "step": 23722 + }, + { + "epoch": 1.16, + "grad_norm": 0.5437223315238953, + "learning_rate": 0.00040380351896675157, + "loss": 3.0952, + "step": 23723 + }, + { + "epoch": 1.16, + "grad_norm": 0.5329674482345581, + "learning_rate": 0.0004037890736594577, + "loss": 3.0249, + "step": 23724 + }, + { + "epoch": 1.16, + "grad_norm": 0.5678834319114685, + "learning_rate": 0.00040377462807880324, + "loss": 3.0096, + "step": 23725 + }, + { + "epoch": 1.16, + "grad_norm": 0.5391994118690491, + "learning_rate": 0.0004037601822248264, + "loss": 3.2036, + "step": 23726 + }, + { + "epoch": 1.16, + "grad_norm": 0.5704066753387451, + "learning_rate": 0.0004037457360975652, + "loss": 3.2307, + "step": 23727 + }, + { + "epoch": 1.16, + "grad_norm": 0.528113842010498, + "learning_rate": 0.00040373128969705774, + "loss": 3.3054, + "step": 23728 + }, + { + "epoch": 1.16, + "grad_norm": 0.5105913877487183, + "learning_rate": 0.00040371684302334203, + "loss": 3.173, + "step": 23729 + }, + { + "epoch": 1.16, + "grad_norm": 0.5552327036857605, + "learning_rate": 0.0004037023960764561, + "loss": 3.1766, + "step": 23730 + }, + { + "epoch": 1.16, + "grad_norm": 0.5556386113166809, + "learning_rate": 0.000403687948856438, + "loss": 2.9731, + "step": 23731 + }, + { + "epoch": 1.16, + "grad_norm": 0.5652289986610413, + "learning_rate": 0.0004036735013633258, + "loss": 3.2652, + "step": 23732 + }, + { + "epoch": 1.16, + "grad_norm": 0.4932308495044708, + "learning_rate": 0.0004036590535971576, + "loss": 3.1269, + "step": 23733 + }, + { + "epoch": 1.16, + "grad_norm": 0.5384199023246765, + "learning_rate": 0.0004036446055579713, + "loss": 3.1946, + "step": 23734 + }, + { + "epoch": 1.16, + "grad_norm": 0.5091381669044495, + "learning_rate": 0.00040363015724580517, + "loss": 2.9825, + "step": 23735 + }, + { + "epoch": 1.16, + "grad_norm": 0.5702914595603943, + "learning_rate": 0.0004036157086606971, + "loss": 3.1606, + "step": 23736 + }, + { + "epoch": 1.16, + "grad_norm": 0.5650493502616882, + "learning_rate": 0.00040360125980268513, + "loss": 3.0586, + "step": 23737 + }, + { + "epoch": 1.16, + "grad_norm": 0.5145954489707947, + "learning_rate": 0.00040358681067180746, + "loss": 3.2997, + "step": 23738 + }, + { + "epoch": 1.16, + "grad_norm": 0.5607908368110657, + "learning_rate": 0.00040357236126810207, + "loss": 3.0453, + "step": 23739 + }, + { + "epoch": 1.16, + "grad_norm": 0.5422621965408325, + "learning_rate": 0.000403557911591607, + "loss": 2.933, + "step": 23740 + }, + { + "epoch": 1.16, + "grad_norm": 0.5554590225219727, + "learning_rate": 0.0004035434616423603, + "loss": 2.9494, + "step": 23741 + }, + { + "epoch": 1.16, + "grad_norm": 0.6001891493797302, + "learning_rate": 0.0004035290114204, + "loss": 3.1366, + "step": 23742 + }, + { + "epoch": 1.16, + "grad_norm": 0.5242499113082886, + "learning_rate": 0.00040351456092576433, + "loss": 3.277, + "step": 23743 + }, + { + "epoch": 1.16, + "grad_norm": 0.5295645594596863, + "learning_rate": 0.00040350011015849116, + "loss": 3.2324, + "step": 23744 + }, + { + "epoch": 1.16, + "grad_norm": 0.5429471135139465, + "learning_rate": 0.00040348565911861866, + "loss": 3.0578, + "step": 23745 + }, + { + "epoch": 1.16, + "grad_norm": 0.5438092350959778, + "learning_rate": 0.0004034712078061849, + "loss": 3.0985, + "step": 23746 + }, + { + "epoch": 1.16, + "grad_norm": 0.5752182006835938, + "learning_rate": 0.0004034567562212277, + "loss": 3.2343, + "step": 23747 + }, + { + "epoch": 1.16, + "grad_norm": 0.5302133560180664, + "learning_rate": 0.0004034423043637854, + "loss": 3.0236, + "step": 23748 + }, + { + "epoch": 1.16, + "grad_norm": 0.5672379732131958, + "learning_rate": 0.000403427852233896, + "loss": 3.2175, + "step": 23749 + }, + { + "epoch": 1.16, + "grad_norm": 0.5564298033714294, + "learning_rate": 0.0004034133998315976, + "loss": 3.2707, + "step": 23750 + }, + { + "epoch": 1.16, + "grad_norm": 0.5528357028961182, + "learning_rate": 0.00040339894715692815, + "loss": 3.1907, + "step": 23751 + }, + { + "epoch": 1.16, + "grad_norm": 0.5306649208068848, + "learning_rate": 0.0004033844942099257, + "loss": 3.0427, + "step": 23752 + }, + { + "epoch": 1.16, + "grad_norm": 0.5104833841323853, + "learning_rate": 0.00040337004099062854, + "loss": 3.1894, + "step": 23753 + }, + { + "epoch": 1.16, + "grad_norm": 0.5718398094177246, + "learning_rate": 0.00040335558749907455, + "loss": 3.0523, + "step": 23754 + }, + { + "epoch": 1.16, + "grad_norm": 0.568389892578125, + "learning_rate": 0.00040334113373530174, + "loss": 3.0214, + "step": 23755 + }, + { + "epoch": 1.16, + "grad_norm": 0.567775547504425, + "learning_rate": 0.00040332667969934837, + "loss": 3.217, + "step": 23756 + }, + { + "epoch": 1.16, + "grad_norm": 0.5069112777709961, + "learning_rate": 0.00040331222539125234, + "loss": 2.9524, + "step": 23757 + }, + { + "epoch": 1.16, + "grad_norm": 0.5586796402931213, + "learning_rate": 0.0004032977708110518, + "loss": 3.1551, + "step": 23758 + }, + { + "epoch": 1.16, + "grad_norm": 0.5643654465675354, + "learning_rate": 0.0004032833159587848, + "loss": 2.9142, + "step": 23759 + }, + { + "epoch": 1.16, + "grad_norm": 0.5675867199897766, + "learning_rate": 0.00040326886083448944, + "loss": 2.9242, + "step": 23760 + }, + { + "epoch": 1.16, + "grad_norm": 0.5585340261459351, + "learning_rate": 0.00040325440543820387, + "loss": 3.1171, + "step": 23761 + }, + { + "epoch": 1.16, + "grad_norm": 0.5579453706741333, + "learning_rate": 0.0004032399497699659, + "loss": 3.082, + "step": 23762 + }, + { + "epoch": 1.16, + "grad_norm": 0.5434767603874207, + "learning_rate": 0.00040322549382981385, + "loss": 2.9839, + "step": 23763 + }, + { + "epoch": 1.16, + "grad_norm": 0.5664116740226746, + "learning_rate": 0.00040321103761778584, + "loss": 2.8925, + "step": 23764 + }, + { + "epoch": 1.16, + "grad_norm": 0.5497961640357971, + "learning_rate": 0.00040319658113391967, + "loss": 3.0979, + "step": 23765 + }, + { + "epoch": 1.16, + "grad_norm": 0.5843026041984558, + "learning_rate": 0.00040318212437825355, + "loss": 3.1466, + "step": 23766 + }, + { + "epoch": 1.16, + "grad_norm": 0.5227053761482239, + "learning_rate": 0.00040316766735082565, + "loss": 2.9814, + "step": 23767 + }, + { + "epoch": 1.16, + "grad_norm": 0.5491824150085449, + "learning_rate": 0.0004031532100516739, + "loss": 3.0808, + "step": 23768 + }, + { + "epoch": 1.16, + "grad_norm": 0.5716967582702637, + "learning_rate": 0.00040313875248083647, + "loss": 3.2243, + "step": 23769 + }, + { + "epoch": 1.16, + "grad_norm": 0.5618823170661926, + "learning_rate": 0.00040312429463835146, + "loss": 3.162, + "step": 23770 + }, + { + "epoch": 1.16, + "grad_norm": 0.5274955630302429, + "learning_rate": 0.00040310983652425695, + "loss": 2.892, + "step": 23771 + }, + { + "epoch": 1.17, + "grad_norm": 0.6322970390319824, + "learning_rate": 0.0004030953781385909, + "loss": 3.0052, + "step": 23772 + }, + { + "epoch": 1.17, + "grad_norm": 0.5498408079147339, + "learning_rate": 0.00040308091948139136, + "loss": 2.9249, + "step": 23773 + }, + { + "epoch": 1.17, + "grad_norm": 0.5550096035003662, + "learning_rate": 0.0004030664605526966, + "loss": 3.0928, + "step": 23774 + }, + { + "epoch": 1.17, + "grad_norm": 0.5641299486160278, + "learning_rate": 0.00040305200135254475, + "loss": 3.1518, + "step": 23775 + }, + { + "epoch": 1.17, + "grad_norm": 0.5485759973526001, + "learning_rate": 0.0004030375418809736, + "loss": 2.9378, + "step": 23776 + }, + { + "epoch": 1.17, + "grad_norm": 0.5412816405296326, + "learning_rate": 0.0004030230821380215, + "loss": 2.9547, + "step": 23777 + }, + { + "epoch": 1.17, + "grad_norm": 0.5825008749961853, + "learning_rate": 0.0004030086221237263, + "loss": 3.1217, + "step": 23778 + }, + { + "epoch": 1.17, + "grad_norm": 0.533901035785675, + "learning_rate": 0.00040299416183812634, + "loss": 3.1133, + "step": 23779 + }, + { + "epoch": 1.17, + "grad_norm": 0.5950905084609985, + "learning_rate": 0.00040297970128125955, + "loss": 3.0327, + "step": 23780 + }, + { + "epoch": 1.17, + "grad_norm": 0.5376090407371521, + "learning_rate": 0.00040296524045316403, + "loss": 3.1263, + "step": 23781 + }, + { + "epoch": 1.17, + "grad_norm": 0.552521288394928, + "learning_rate": 0.00040295077935387783, + "loss": 3.0581, + "step": 23782 + }, + { + "epoch": 1.17, + "grad_norm": 0.5569019913673401, + "learning_rate": 0.0004029363179834391, + "loss": 3.0234, + "step": 23783 + }, + { + "epoch": 1.17, + "grad_norm": 0.5935654044151306, + "learning_rate": 0.000402921856341886, + "loss": 3.1942, + "step": 23784 + }, + { + "epoch": 1.17, + "grad_norm": 0.5477899312973022, + "learning_rate": 0.00040290739442925644, + "loss": 3.1383, + "step": 23785 + }, + { + "epoch": 1.17, + "grad_norm": 0.5384113788604736, + "learning_rate": 0.00040289293224558874, + "loss": 3.1697, + "step": 23786 + }, + { + "epoch": 1.17, + "grad_norm": 0.5377801656723022, + "learning_rate": 0.00040287846979092075, + "loss": 3.1197, + "step": 23787 + }, + { + "epoch": 1.17, + "grad_norm": 0.560335636138916, + "learning_rate": 0.0004028640070652907, + "loss": 2.9627, + "step": 23788 + }, + { + "epoch": 1.17, + "grad_norm": 0.5258608460426331, + "learning_rate": 0.0004028495440687366, + "loss": 3.1526, + "step": 23789 + }, + { + "epoch": 1.17, + "grad_norm": 0.5460545420646667, + "learning_rate": 0.00040283508080129674, + "loss": 3.1596, + "step": 23790 + }, + { + "epoch": 1.17, + "grad_norm": 0.5956621766090393, + "learning_rate": 0.00040282061726300896, + "loss": 3.0147, + "step": 23791 + }, + { + "epoch": 1.17, + "grad_norm": 0.5216905474662781, + "learning_rate": 0.00040280615345391144, + "loss": 2.9739, + "step": 23792 + }, + { + "epoch": 1.17, + "grad_norm": 0.508186399936676, + "learning_rate": 0.00040279168937404233, + "loss": 3.0885, + "step": 23793 + }, + { + "epoch": 1.17, + "grad_norm": 0.5671628713607788, + "learning_rate": 0.0004027772250234397, + "loss": 3.0457, + "step": 23794 + }, + { + "epoch": 1.17, + "grad_norm": 0.582146406173706, + "learning_rate": 0.0004027627604021416, + "loss": 2.9687, + "step": 23795 + }, + { + "epoch": 1.17, + "grad_norm": 0.6142930388450623, + "learning_rate": 0.0004027482955101863, + "loss": 3.0002, + "step": 23796 + }, + { + "epoch": 1.17, + "grad_norm": 0.5688886642456055, + "learning_rate": 0.00040273383034761164, + "loss": 3.0811, + "step": 23797 + }, + { + "epoch": 1.17, + "grad_norm": 0.5424383878707886, + "learning_rate": 0.0004027193649144558, + "loss": 3.1528, + "step": 23798 + }, + { + "epoch": 1.17, + "grad_norm": 0.6007323861122131, + "learning_rate": 0.00040270489921075706, + "loss": 2.7161, + "step": 23799 + }, + { + "epoch": 1.17, + "grad_norm": 0.5330042839050293, + "learning_rate": 0.0004026904332365533, + "loss": 2.7069, + "step": 23800 + }, + { + "epoch": 1.17, + "grad_norm": 0.5643306970596313, + "learning_rate": 0.0004026759669918828, + "loss": 2.8007, + "step": 23801 + }, + { + "epoch": 1.17, + "grad_norm": 0.5669317245483398, + "learning_rate": 0.00040266150047678346, + "loss": 3.0615, + "step": 23802 + }, + { + "epoch": 1.17, + "grad_norm": 0.6755366325378418, + "learning_rate": 0.00040264703369129336, + "loss": 3.0581, + "step": 23803 + }, + { + "epoch": 1.17, + "grad_norm": 0.5715072751045227, + "learning_rate": 0.000402632566635451, + "loss": 3.1604, + "step": 23804 + }, + { + "epoch": 1.17, + "grad_norm": 0.5770458579063416, + "learning_rate": 0.0004026180993092941, + "loss": 2.9243, + "step": 23805 + }, + { + "epoch": 1.17, + "grad_norm": 0.5324617028236389, + "learning_rate": 0.0004026036317128608, + "loss": 3.3259, + "step": 23806 + }, + { + "epoch": 1.17, + "grad_norm": 0.5772245526313782, + "learning_rate": 0.00040258916384618935, + "loss": 3.2526, + "step": 23807 + }, + { + "epoch": 1.17, + "grad_norm": 0.5446373224258423, + "learning_rate": 0.0004025746957093178, + "loss": 3.2472, + "step": 23808 + }, + { + "epoch": 1.17, + "grad_norm": 0.5985844135284424, + "learning_rate": 0.0004025602273022842, + "loss": 3.1847, + "step": 23809 + }, + { + "epoch": 1.17, + "grad_norm": 0.5797475576400757, + "learning_rate": 0.00040254575862512667, + "loss": 2.9985, + "step": 23810 + }, + { + "epoch": 1.17, + "grad_norm": 0.5373989939689636, + "learning_rate": 0.00040253128967788343, + "loss": 2.9025, + "step": 23811 + }, + { + "epoch": 1.17, + "grad_norm": 0.5533153414726257, + "learning_rate": 0.0004025168204605925, + "loss": 3.2097, + "step": 23812 + }, + { + "epoch": 1.17, + "grad_norm": 0.5629292726516724, + "learning_rate": 0.000402502350973292, + "loss": 3.1, + "step": 23813 + }, + { + "epoch": 1.17, + "grad_norm": 0.5873868465423584, + "learning_rate": 0.00040248788121601995, + "loss": 3.1942, + "step": 23814 + }, + { + "epoch": 1.17, + "grad_norm": 0.5775611996650696, + "learning_rate": 0.00040247341118881464, + "loss": 2.9935, + "step": 23815 + }, + { + "epoch": 1.17, + "grad_norm": 0.5191925168037415, + "learning_rate": 0.000402458940891714, + "loss": 3.1487, + "step": 23816 + }, + { + "epoch": 1.17, + "grad_norm": 0.5707910656929016, + "learning_rate": 0.00040244447032475626, + "loss": 3.0536, + "step": 23817 + }, + { + "epoch": 1.17, + "grad_norm": 0.555317223072052, + "learning_rate": 0.0004024299994879795, + "loss": 2.9648, + "step": 23818 + }, + { + "epoch": 1.17, + "grad_norm": 0.5581421852111816, + "learning_rate": 0.0004024155283814219, + "loss": 3.1438, + "step": 23819 + }, + { + "epoch": 1.17, + "grad_norm": 0.607385516166687, + "learning_rate": 0.0004024010570051214, + "loss": 2.9115, + "step": 23820 + }, + { + "epoch": 1.17, + "grad_norm": 0.5479592084884644, + "learning_rate": 0.0004023865853591163, + "loss": 2.9924, + "step": 23821 + }, + { + "epoch": 1.17, + "grad_norm": 0.5671947598457336, + "learning_rate": 0.0004023721134434446, + "loss": 2.9375, + "step": 23822 + }, + { + "epoch": 1.17, + "grad_norm": 0.5418531894683838, + "learning_rate": 0.0004023576412581445, + "loss": 3.1671, + "step": 23823 + }, + { + "epoch": 1.17, + "grad_norm": 0.5851473212242126, + "learning_rate": 0.00040234316880325403, + "loss": 3.1394, + "step": 23824 + }, + { + "epoch": 1.17, + "grad_norm": 0.5450239181518555, + "learning_rate": 0.0004023286960788113, + "loss": 3.149, + "step": 23825 + }, + { + "epoch": 1.17, + "grad_norm": 0.5457543730735779, + "learning_rate": 0.00040231422308485465, + "loss": 2.8019, + "step": 23826 + }, + { + "epoch": 1.17, + "grad_norm": 0.5215305685997009, + "learning_rate": 0.0004022997498214219, + "loss": 3.36, + "step": 23827 + }, + { + "epoch": 1.17, + "grad_norm": 0.5583608746528625, + "learning_rate": 0.0004022852762885513, + "loss": 2.9352, + "step": 23828 + }, + { + "epoch": 1.17, + "grad_norm": 0.5622576475143433, + "learning_rate": 0.0004022708024862809, + "loss": 3.029, + "step": 23829 + }, + { + "epoch": 1.17, + "grad_norm": 0.5386072993278503, + "learning_rate": 0.0004022563284146491, + "loss": 3.2786, + "step": 23830 + }, + { + "epoch": 1.17, + "grad_norm": 0.6399205327033997, + "learning_rate": 0.0004022418540736936, + "loss": 2.9981, + "step": 23831 + }, + { + "epoch": 1.17, + "grad_norm": 0.5961595177650452, + "learning_rate": 0.0004022273794634528, + "loss": 3.1205, + "step": 23832 + }, + { + "epoch": 1.17, + "grad_norm": 0.5393549203872681, + "learning_rate": 0.0004022129045839648, + "loss": 2.9914, + "step": 23833 + }, + { + "epoch": 1.17, + "grad_norm": 0.5417840480804443, + "learning_rate": 0.0004021984294352676, + "loss": 3.0758, + "step": 23834 + }, + { + "epoch": 1.17, + "grad_norm": 0.582919180393219, + "learning_rate": 0.00040218395401739944, + "loss": 3.1598, + "step": 23835 + }, + { + "epoch": 1.17, + "grad_norm": 0.5843506455421448, + "learning_rate": 0.0004021694783303985, + "loss": 2.9717, + "step": 23836 + }, + { + "epoch": 1.17, + "grad_norm": 0.5359125733375549, + "learning_rate": 0.00040215500237430275, + "loss": 3.0764, + "step": 23837 + }, + { + "epoch": 1.17, + "grad_norm": 0.608666181564331, + "learning_rate": 0.00040214052614915045, + "loss": 2.9321, + "step": 23838 + }, + { + "epoch": 1.17, + "grad_norm": 0.5720089673995972, + "learning_rate": 0.0004021260496549795, + "loss": 3.0585, + "step": 23839 + }, + { + "epoch": 1.17, + "grad_norm": 0.528830885887146, + "learning_rate": 0.00040211157289182824, + "loss": 3.2521, + "step": 23840 + }, + { + "epoch": 1.17, + "grad_norm": 0.57254958152771, + "learning_rate": 0.00040209709585973483, + "loss": 3.148, + "step": 23841 + }, + { + "epoch": 1.17, + "grad_norm": 0.529454231262207, + "learning_rate": 0.0004020826185587373, + "loss": 3.0549, + "step": 23842 + }, + { + "epoch": 1.17, + "grad_norm": 0.5671842694282532, + "learning_rate": 0.00040206814098887373, + "loss": 3.1676, + "step": 23843 + }, + { + "epoch": 1.17, + "grad_norm": 0.5969575047492981, + "learning_rate": 0.00040205366315018237, + "loss": 3.3495, + "step": 23844 + }, + { + "epoch": 1.17, + "grad_norm": 0.5233942270278931, + "learning_rate": 0.00040203918504270135, + "loss": 2.9989, + "step": 23845 + }, + { + "epoch": 1.17, + "grad_norm": 0.5281428694725037, + "learning_rate": 0.0004020247066664686, + "loss": 3.1292, + "step": 23846 + }, + { + "epoch": 1.17, + "grad_norm": 0.5774059891700745, + "learning_rate": 0.00040201022802152257, + "loss": 2.922, + "step": 23847 + }, + { + "epoch": 1.17, + "grad_norm": 0.5433593392372131, + "learning_rate": 0.0004019957491079012, + "loss": 3.0951, + "step": 23848 + }, + { + "epoch": 1.17, + "grad_norm": 0.5464013814926147, + "learning_rate": 0.00040198126992564264, + "loss": 3.2079, + "step": 23849 + }, + { + "epoch": 1.17, + "grad_norm": 0.5496659874916077, + "learning_rate": 0.00040196679047478493, + "loss": 3.1016, + "step": 23850 + }, + { + "epoch": 1.17, + "grad_norm": 0.542181134223938, + "learning_rate": 0.00040195231075536644, + "loss": 3.0924, + "step": 23851 + }, + { + "epoch": 1.17, + "grad_norm": 0.596310019493103, + "learning_rate": 0.00040193783076742523, + "loss": 3.1904, + "step": 23852 + }, + { + "epoch": 1.17, + "grad_norm": 0.5427301526069641, + "learning_rate": 0.0004019233505109993, + "loss": 3.1372, + "step": 23853 + }, + { + "epoch": 1.17, + "grad_norm": 0.5772640705108643, + "learning_rate": 0.00040190886998612695, + "loss": 2.9875, + "step": 23854 + }, + { + "epoch": 1.17, + "grad_norm": 0.5249664187431335, + "learning_rate": 0.00040189438919284616, + "loss": 3.1938, + "step": 23855 + }, + { + "epoch": 1.17, + "grad_norm": 0.5812081694602966, + "learning_rate": 0.00040187990813119525, + "loss": 3.1606, + "step": 23856 + }, + { + "epoch": 1.17, + "grad_norm": 0.5547779202461243, + "learning_rate": 0.0004018654268012122, + "loss": 3.2556, + "step": 23857 + }, + { + "epoch": 1.17, + "grad_norm": 0.5426790714263916, + "learning_rate": 0.0004018509452029352, + "loss": 3.1825, + "step": 23858 + }, + { + "epoch": 1.17, + "grad_norm": 0.5476883053779602, + "learning_rate": 0.0004018364633364025, + "loss": 3.0241, + "step": 23859 + }, + { + "epoch": 1.17, + "grad_norm": 0.5173876285552979, + "learning_rate": 0.00040182198120165206, + "loss": 3.0905, + "step": 23860 + }, + { + "epoch": 1.17, + "grad_norm": 0.5504484176635742, + "learning_rate": 0.00040180749879872217, + "loss": 3.212, + "step": 23861 + }, + { + "epoch": 1.17, + "grad_norm": 0.5541347861289978, + "learning_rate": 0.0004017930161276509, + "loss": 2.9228, + "step": 23862 + }, + { + "epoch": 1.17, + "grad_norm": 0.580261766910553, + "learning_rate": 0.0004017785331884765, + "loss": 2.9678, + "step": 23863 + }, + { + "epoch": 1.17, + "grad_norm": 0.6364923715591431, + "learning_rate": 0.000401764049981237, + "loss": 3.1864, + "step": 23864 + }, + { + "epoch": 1.17, + "grad_norm": 0.5156949758529663, + "learning_rate": 0.0004017495665059704, + "loss": 3.3035, + "step": 23865 + }, + { + "epoch": 1.17, + "grad_norm": 0.5521565675735474, + "learning_rate": 0.00040173508276271526, + "loss": 2.9317, + "step": 23866 + }, + { + "epoch": 1.17, + "grad_norm": 0.5523860454559326, + "learning_rate": 0.0004017205987515094, + "loss": 3.1082, + "step": 23867 + }, + { + "epoch": 1.17, + "grad_norm": 0.5502112507820129, + "learning_rate": 0.00040170611447239103, + "loss": 3.0595, + "step": 23868 + }, + { + "epoch": 1.17, + "grad_norm": 0.5495514273643494, + "learning_rate": 0.00040169162992539833, + "loss": 3.05, + "step": 23869 + }, + { + "epoch": 1.17, + "grad_norm": 0.5663180947303772, + "learning_rate": 0.00040167714511056947, + "loss": 3.133, + "step": 23870 + }, + { + "epoch": 1.17, + "grad_norm": 0.5522775650024414, + "learning_rate": 0.0004016626600279426, + "loss": 3.1148, + "step": 23871 + }, + { + "epoch": 1.17, + "grad_norm": 0.5634680390357971, + "learning_rate": 0.00040164817467755584, + "loss": 3.1422, + "step": 23872 + }, + { + "epoch": 1.17, + "grad_norm": 0.5490415096282959, + "learning_rate": 0.0004016336890594473, + "loss": 2.8798, + "step": 23873 + }, + { + "epoch": 1.17, + "grad_norm": 0.5496166348457336, + "learning_rate": 0.00040161920317365534, + "loss": 2.8576, + "step": 23874 + }, + { + "epoch": 1.17, + "grad_norm": 0.5630446076393127, + "learning_rate": 0.00040160471702021773, + "loss": 3.266, + "step": 23875 + }, + { + "epoch": 1.17, + "grad_norm": 0.5701159238815308, + "learning_rate": 0.0004015902305991729, + "loss": 3.0536, + "step": 23876 + }, + { + "epoch": 1.17, + "grad_norm": 0.5700026154518127, + "learning_rate": 0.00040157574391055907, + "loss": 2.979, + "step": 23877 + }, + { + "epoch": 1.17, + "grad_norm": 0.5726680159568787, + "learning_rate": 0.0004015612569544143, + "loss": 3.0092, + "step": 23878 + }, + { + "epoch": 1.17, + "grad_norm": 0.5549166202545166, + "learning_rate": 0.0004015467697307766, + "loss": 3.2318, + "step": 23879 + }, + { + "epoch": 1.17, + "grad_norm": 0.5379598736763, + "learning_rate": 0.0004015322822396842, + "loss": 2.9661, + "step": 23880 + }, + { + "epoch": 1.17, + "grad_norm": 0.5367391705513, + "learning_rate": 0.00040151779448117545, + "loss": 3.1219, + "step": 23881 + }, + { + "epoch": 1.17, + "grad_norm": 0.5859577655792236, + "learning_rate": 0.00040150330645528833, + "loss": 3.0839, + "step": 23882 + }, + { + "epoch": 1.17, + "grad_norm": 0.5608367323875427, + "learning_rate": 0.0004014888181620611, + "loss": 3.1321, + "step": 23883 + }, + { + "epoch": 1.17, + "grad_norm": 0.5417865514755249, + "learning_rate": 0.00040147432960153175, + "loss": 2.9928, + "step": 23884 + }, + { + "epoch": 1.17, + "grad_norm": 0.5564191341400146, + "learning_rate": 0.0004014598407737386, + "loss": 3.0766, + "step": 23885 + }, + { + "epoch": 1.17, + "grad_norm": 0.5441717505455017, + "learning_rate": 0.0004014453516787197, + "loss": 3.1433, + "step": 23886 + }, + { + "epoch": 1.17, + "grad_norm": 0.5383797287940979, + "learning_rate": 0.0004014308623165133, + "loss": 3.194, + "step": 23887 + }, + { + "epoch": 1.17, + "grad_norm": 0.6364131569862366, + "learning_rate": 0.00040141637268715754, + "loss": 3.347, + "step": 23888 + }, + { + "epoch": 1.17, + "grad_norm": 0.565334677696228, + "learning_rate": 0.0004014018827906906, + "loss": 3.252, + "step": 23889 + }, + { + "epoch": 1.17, + "grad_norm": 0.5537137985229492, + "learning_rate": 0.0004013873926271506, + "loss": 3.0982, + "step": 23890 + }, + { + "epoch": 1.17, + "grad_norm": 0.5591994524002075, + "learning_rate": 0.0004013729021965756, + "loss": 3.0882, + "step": 23891 + }, + { + "epoch": 1.17, + "grad_norm": 0.5560016632080078, + "learning_rate": 0.0004013584114990041, + "loss": 3.1173, + "step": 23892 + }, + { + "epoch": 1.17, + "grad_norm": 0.5662415623664856, + "learning_rate": 0.00040134392053447387, + "loss": 2.995, + "step": 23893 + }, + { + "epoch": 1.17, + "grad_norm": 0.5805153846740723, + "learning_rate": 0.0004013294293030233, + "loss": 3.3028, + "step": 23894 + }, + { + "epoch": 1.17, + "grad_norm": 0.5564658045768738, + "learning_rate": 0.0004013149378046905, + "loss": 2.9882, + "step": 23895 + }, + { + "epoch": 1.17, + "grad_norm": 0.5353381633758545, + "learning_rate": 0.0004013004460395137, + "loss": 2.9248, + "step": 23896 + }, + { + "epoch": 1.17, + "grad_norm": 0.5824044346809387, + "learning_rate": 0.000401285954007531, + "loss": 2.8985, + "step": 23897 + }, + { + "epoch": 1.17, + "grad_norm": 0.5630552768707275, + "learning_rate": 0.0004012714617087806, + "loss": 3.1425, + "step": 23898 + }, + { + "epoch": 1.17, + "grad_norm": 0.5532223582267761, + "learning_rate": 0.0004012569691433008, + "loss": 3.1631, + "step": 23899 + }, + { + "epoch": 1.17, + "grad_norm": 0.5480971932411194, + "learning_rate": 0.00040124247631112943, + "loss": 2.8964, + "step": 23900 + }, + { + "epoch": 1.17, + "grad_norm": 0.5849064588546753, + "learning_rate": 0.00040122798321230485, + "loss": 2.9088, + "step": 23901 + }, + { + "epoch": 1.17, + "grad_norm": 0.5372636318206787, + "learning_rate": 0.0004012134898468653, + "loss": 2.7437, + "step": 23902 + }, + { + "epoch": 1.17, + "grad_norm": 0.5332382321357727, + "learning_rate": 0.000401198996214849, + "loss": 3.2017, + "step": 23903 + }, + { + "epoch": 1.17, + "grad_norm": 0.5774133801460266, + "learning_rate": 0.00040118450231629393, + "loss": 3.2153, + "step": 23904 + }, + { + "epoch": 1.17, + "grad_norm": 0.5455067157745361, + "learning_rate": 0.0004011700081512384, + "loss": 3.1119, + "step": 23905 + }, + { + "epoch": 1.17, + "grad_norm": 0.5268059968948364, + "learning_rate": 0.00040115551371972045, + "loss": 3.0404, + "step": 23906 + }, + { + "epoch": 1.17, + "grad_norm": 0.5563852190971375, + "learning_rate": 0.0004011410190217784, + "loss": 2.8807, + "step": 23907 + }, + { + "epoch": 1.17, + "grad_norm": 0.5379848480224609, + "learning_rate": 0.0004011265240574503, + "loss": 3.1135, + "step": 23908 + }, + { + "epoch": 1.17, + "grad_norm": 0.5759731531143188, + "learning_rate": 0.0004011120288267745, + "loss": 3.054, + "step": 23909 + }, + { + "epoch": 1.17, + "grad_norm": 0.571288526058197, + "learning_rate": 0.0004010975333297891, + "loss": 3.1158, + "step": 23910 + }, + { + "epoch": 1.17, + "grad_norm": 0.5381522178649902, + "learning_rate": 0.00040108303756653213, + "loss": 3.1559, + "step": 23911 + }, + { + "epoch": 1.17, + "grad_norm": 0.6180474162101746, + "learning_rate": 0.00040106854153704196, + "loss": 3.0476, + "step": 23912 + }, + { + "epoch": 1.17, + "grad_norm": 0.5325801372528076, + "learning_rate": 0.00040105404524135664, + "loss": 3.0364, + "step": 23913 + }, + { + "epoch": 1.17, + "grad_norm": 0.5483402013778687, + "learning_rate": 0.00040103954867951454, + "loss": 2.9758, + "step": 23914 + }, + { + "epoch": 1.17, + "grad_norm": 0.5332612991333008, + "learning_rate": 0.00040102505185155365, + "loss": 3.0309, + "step": 23915 + }, + { + "epoch": 1.17, + "grad_norm": 0.5399067997932434, + "learning_rate": 0.00040101055475751216, + "loss": 3.2472, + "step": 23916 + }, + { + "epoch": 1.17, + "grad_norm": 0.5286165475845337, + "learning_rate": 0.0004009960573974284, + "loss": 3.3073, + "step": 23917 + }, + { + "epoch": 1.17, + "grad_norm": 0.5932464003562927, + "learning_rate": 0.0004009815597713404, + "loss": 3.0836, + "step": 23918 + }, + { + "epoch": 1.17, + "grad_norm": 0.5477786660194397, + "learning_rate": 0.00040096706187928643, + "loss": 3.1094, + "step": 23919 + }, + { + "epoch": 1.17, + "grad_norm": 0.5101897716522217, + "learning_rate": 0.0004009525637213046, + "loss": 3.043, + "step": 23920 + }, + { + "epoch": 1.17, + "grad_norm": 0.5300383567810059, + "learning_rate": 0.0004009380652974332, + "loss": 3.2401, + "step": 23921 + }, + { + "epoch": 1.17, + "grad_norm": 0.5190649628639221, + "learning_rate": 0.0004009235666077103, + "loss": 3.224, + "step": 23922 + }, + { + "epoch": 1.17, + "grad_norm": 0.5531300902366638, + "learning_rate": 0.00040090906765217425, + "loss": 2.9455, + "step": 23923 + }, + { + "epoch": 1.17, + "grad_norm": 0.5615624189376831, + "learning_rate": 0.00040089456843086304, + "loss": 2.9886, + "step": 23924 + }, + { + "epoch": 1.17, + "grad_norm": 0.562551736831665, + "learning_rate": 0.0004008800689438151, + "loss": 3.0263, + "step": 23925 + }, + { + "epoch": 1.17, + "grad_norm": 0.541335940361023, + "learning_rate": 0.00040086556919106833, + "loss": 3.0767, + "step": 23926 + }, + { + "epoch": 1.17, + "grad_norm": 0.5761546492576599, + "learning_rate": 0.00040085106917266106, + "loss": 2.9538, + "step": 23927 + }, + { + "epoch": 1.17, + "grad_norm": 0.5472856163978577, + "learning_rate": 0.00040083656888863163, + "loss": 2.8641, + "step": 23928 + }, + { + "epoch": 1.17, + "grad_norm": 0.5612488389015198, + "learning_rate": 0.000400822068339018, + "loss": 2.9537, + "step": 23929 + }, + { + "epoch": 1.17, + "grad_norm": 0.5841686725616455, + "learning_rate": 0.0004008075675238584, + "loss": 3.1257, + "step": 23930 + }, + { + "epoch": 1.17, + "grad_norm": 0.5874391794204712, + "learning_rate": 0.0004007930664431911, + "loss": 3.0393, + "step": 23931 + }, + { + "epoch": 1.17, + "grad_norm": 0.5234120488166809, + "learning_rate": 0.00040077856509705424, + "loss": 3.1151, + "step": 23932 + }, + { + "epoch": 1.17, + "grad_norm": 0.5505731105804443, + "learning_rate": 0.00040076406348548603, + "loss": 3.1803, + "step": 23933 + }, + { + "epoch": 1.17, + "grad_norm": 0.5326248407363892, + "learning_rate": 0.00040074956160852476, + "loss": 3.2542, + "step": 23934 + }, + { + "epoch": 1.17, + "grad_norm": 0.5707079768180847, + "learning_rate": 0.00040073505946620837, + "loss": 3.028, + "step": 23935 + }, + { + "epoch": 1.17, + "grad_norm": 0.5336579084396362, + "learning_rate": 0.0004007205570585754, + "loss": 3.2687, + "step": 23936 + }, + { + "epoch": 1.17, + "grad_norm": 0.5472836494445801, + "learning_rate": 0.00040070605438566375, + "loss": 2.8165, + "step": 23937 + }, + { + "epoch": 1.17, + "grad_norm": 0.6249715089797974, + "learning_rate": 0.00040069155144751173, + "loss": 3.1156, + "step": 23938 + }, + { + "epoch": 1.17, + "grad_norm": 0.533698320388794, + "learning_rate": 0.00040067704824415764, + "loss": 2.9911, + "step": 23939 + }, + { + "epoch": 1.17, + "grad_norm": 0.5471363663673401, + "learning_rate": 0.00040066254477563956, + "loss": 2.9763, + "step": 23940 + }, + { + "epoch": 1.17, + "grad_norm": 0.5757012963294983, + "learning_rate": 0.0004006480410419957, + "loss": 3.2284, + "step": 23941 + }, + { + "epoch": 1.17, + "grad_norm": 0.5495263934135437, + "learning_rate": 0.0004006335370432641, + "loss": 2.9926, + "step": 23942 + }, + { + "epoch": 1.17, + "grad_norm": 0.5950400233268738, + "learning_rate": 0.00040061903277948335, + "loss": 2.9854, + "step": 23943 + }, + { + "epoch": 1.17, + "grad_norm": 0.5375027060508728, + "learning_rate": 0.00040060452825069133, + "loss": 3.3018, + "step": 23944 + }, + { + "epoch": 1.17, + "grad_norm": 0.5809231400489807, + "learning_rate": 0.00040059002345692644, + "loss": 3.1755, + "step": 23945 + }, + { + "epoch": 1.17, + "grad_norm": 0.5961464643478394, + "learning_rate": 0.00040057551839822665, + "loss": 3.0865, + "step": 23946 + }, + { + "epoch": 1.17, + "grad_norm": 0.574310839176178, + "learning_rate": 0.0004005610130746303, + "loss": 3.0654, + "step": 23947 + }, + { + "epoch": 1.17, + "grad_norm": 0.5345257520675659, + "learning_rate": 0.00040054650748617575, + "loss": 3.1405, + "step": 23948 + }, + { + "epoch": 1.17, + "grad_norm": 0.565226674079895, + "learning_rate": 0.0004005320016329009, + "loss": 2.7882, + "step": 23949 + }, + { + "epoch": 1.17, + "grad_norm": 0.5397844314575195, + "learning_rate": 0.00040051749551484425, + "loss": 2.9382, + "step": 23950 + }, + { + "epoch": 1.17, + "grad_norm": 0.5760013461112976, + "learning_rate": 0.00040050298913204375, + "loss": 3.017, + "step": 23951 + }, + { + "epoch": 1.17, + "grad_norm": 0.5508855581283569, + "learning_rate": 0.0004004884824845377, + "loss": 3.2093, + "step": 23952 + }, + { + "epoch": 1.17, + "grad_norm": 0.5632848739624023, + "learning_rate": 0.0004004739755723644, + "loss": 3.0376, + "step": 23953 + }, + { + "epoch": 1.17, + "grad_norm": 0.536578893661499, + "learning_rate": 0.00040045946839556196, + "loss": 3.0994, + "step": 23954 + }, + { + "epoch": 1.17, + "grad_norm": 0.5494163632392883, + "learning_rate": 0.00040044496095416863, + "loss": 2.9971, + "step": 23955 + }, + { + "epoch": 1.17, + "grad_norm": 0.5547284483909607, + "learning_rate": 0.0004004304532482226, + "loss": 3.268, + "step": 23956 + }, + { + "epoch": 1.17, + "grad_norm": 0.551442563533783, + "learning_rate": 0.00040041594527776204, + "loss": 3.0469, + "step": 23957 + }, + { + "epoch": 1.17, + "grad_norm": 0.5382516384124756, + "learning_rate": 0.00040040143704282526, + "loss": 3.4146, + "step": 23958 + }, + { + "epoch": 1.17, + "grad_norm": 0.5559409856796265, + "learning_rate": 0.0004003869285434504, + "loss": 3.0229, + "step": 23959 + }, + { + "epoch": 1.17, + "grad_norm": 0.5598931312561035, + "learning_rate": 0.0004003724197796756, + "loss": 3.0695, + "step": 23960 + }, + { + "epoch": 1.17, + "grad_norm": 0.59652179479599, + "learning_rate": 0.00040035791075153935, + "loss": 3.1012, + "step": 23961 + }, + { + "epoch": 1.17, + "grad_norm": 0.5521093606948853, + "learning_rate": 0.0004003434014590796, + "loss": 3.1512, + "step": 23962 + }, + { + "epoch": 1.17, + "grad_norm": 0.5462397933006287, + "learning_rate": 0.0004003288919023346, + "loss": 3.1358, + "step": 23963 + }, + { + "epoch": 1.17, + "grad_norm": 0.5487635731697083, + "learning_rate": 0.0004003143820813425, + "loss": 3.373, + "step": 23964 + }, + { + "epoch": 1.17, + "grad_norm": 0.6183003783226013, + "learning_rate": 0.00040029987199614186, + "loss": 2.8504, + "step": 23965 + }, + { + "epoch": 1.17, + "grad_norm": 0.5639381408691406, + "learning_rate": 0.00040028536164677053, + "loss": 2.9285, + "step": 23966 + }, + { + "epoch": 1.17, + "grad_norm": 0.5435758829116821, + "learning_rate": 0.00040027085103326685, + "loss": 3.1571, + "step": 23967 + }, + { + "epoch": 1.17, + "grad_norm": 0.5409099459648132, + "learning_rate": 0.0004002563401556691, + "loss": 3.0643, + "step": 23968 + }, + { + "epoch": 1.17, + "grad_norm": 0.564572811126709, + "learning_rate": 0.00040024182901401544, + "loss": 3.0944, + "step": 23969 + }, + { + "epoch": 1.17, + "grad_norm": 0.5847697257995605, + "learning_rate": 0.00040022731760834405, + "loss": 3.1069, + "step": 23970 + }, + { + "epoch": 1.17, + "grad_norm": 0.5416810512542725, + "learning_rate": 0.00040021280593869317, + "loss": 3.1393, + "step": 23971 + }, + { + "epoch": 1.17, + "grad_norm": 0.5735564231872559, + "learning_rate": 0.00040019829400510113, + "loss": 3.278, + "step": 23972 + }, + { + "epoch": 1.17, + "grad_norm": 0.5522201657295227, + "learning_rate": 0.000400183781807606, + "loss": 3.1469, + "step": 23973 + }, + { + "epoch": 1.17, + "grad_norm": 0.5339809060096741, + "learning_rate": 0.00040016926934624607, + "loss": 3.2685, + "step": 23974 + }, + { + "epoch": 1.17, + "grad_norm": 0.5736157894134521, + "learning_rate": 0.00040015475662105965, + "loss": 3.1981, + "step": 23975 + }, + { + "epoch": 1.18, + "grad_norm": 0.5715289115905762, + "learning_rate": 0.00040014024363208483, + "loss": 3.1167, + "step": 23976 + }, + { + "epoch": 1.18, + "grad_norm": 0.5391354560852051, + "learning_rate": 0.00040012573037935984, + "loss": 3.2589, + "step": 23977 + }, + { + "epoch": 1.18, + "grad_norm": 0.5813712477684021, + "learning_rate": 0.0004001112168629229, + "loss": 3.0653, + "step": 23978 + }, + { + "epoch": 1.18, + "grad_norm": 0.6210728287696838, + "learning_rate": 0.0004000967030828124, + "loss": 3.1392, + "step": 23979 + }, + { + "epoch": 1.18, + "grad_norm": 0.5406283736228943, + "learning_rate": 0.0004000821890390664, + "loss": 3.2409, + "step": 23980 + }, + { + "epoch": 1.18, + "grad_norm": 0.5507621765136719, + "learning_rate": 0.00040006767473172323, + "loss": 3.1331, + "step": 23981 + }, + { + "epoch": 1.18, + "grad_norm": 0.5478275418281555, + "learning_rate": 0.00040005316016082095, + "loss": 2.9733, + "step": 23982 + }, + { + "epoch": 1.18, + "grad_norm": 0.6610849499702454, + "learning_rate": 0.00040003864532639797, + "loss": 2.9103, + "step": 23983 + }, + { + "epoch": 1.18, + "grad_norm": 0.5360865592956543, + "learning_rate": 0.00040002413022849245, + "loss": 3.118, + "step": 23984 + }, + { + "epoch": 1.18, + "grad_norm": 0.5579555034637451, + "learning_rate": 0.0004000096148671426, + "loss": 3.1183, + "step": 23985 + }, + { + "epoch": 1.18, + "grad_norm": 0.5361294746398926, + "learning_rate": 0.0003999950992423867, + "loss": 3.1518, + "step": 23986 + }, + { + "epoch": 1.18, + "grad_norm": 0.5184491872787476, + "learning_rate": 0.000399980583354263, + "loss": 2.8963, + "step": 23987 + }, + { + "epoch": 1.18, + "grad_norm": 0.551647961139679, + "learning_rate": 0.0003999660672028095, + "loss": 3.1424, + "step": 23988 + }, + { + "epoch": 1.18, + "grad_norm": 0.5427066087722778, + "learning_rate": 0.0003999515507880648, + "loss": 3.0262, + "step": 23989 + }, + { + "epoch": 1.18, + "grad_norm": 0.5327326059341431, + "learning_rate": 0.0003999370341100669, + "loss": 2.976, + "step": 23990 + }, + { + "epoch": 1.18, + "grad_norm": 0.5397793650627136, + "learning_rate": 0.0003999225171688541, + "loss": 2.9345, + "step": 23991 + }, + { + "epoch": 1.18, + "grad_norm": 0.5645992159843445, + "learning_rate": 0.00039990799996446466, + "loss": 2.9815, + "step": 23992 + }, + { + "epoch": 1.18, + "grad_norm": 0.5220346450805664, + "learning_rate": 0.00039989348249693657, + "loss": 3.2895, + "step": 23993 + }, + { + "epoch": 1.18, + "grad_norm": 0.5440986156463623, + "learning_rate": 0.00039987896476630845, + "loss": 2.9243, + "step": 23994 + }, + { + "epoch": 1.18, + "grad_norm": 0.6222359538078308, + "learning_rate": 0.00039986444677261835, + "loss": 3.1331, + "step": 23995 + }, + { + "epoch": 1.18, + "grad_norm": 0.5543932914733887, + "learning_rate": 0.0003998499285159045, + "loss": 3.0349, + "step": 23996 + }, + { + "epoch": 1.18, + "grad_norm": 0.5490472316741943, + "learning_rate": 0.0003998354099962051, + "loss": 3.0956, + "step": 23997 + }, + { + "epoch": 1.18, + "grad_norm": 0.5755308866500854, + "learning_rate": 0.0003998208912135585, + "loss": 3.0513, + "step": 23998 + }, + { + "epoch": 1.18, + "grad_norm": 0.5586222410202026, + "learning_rate": 0.0003998063721680029, + "loss": 3.3977, + "step": 23999 + }, + { + "epoch": 1.18, + "grad_norm": 0.5579812526702881, + "learning_rate": 0.00039979185285957645, + "loss": 2.8466, + "step": 24000 + }, + { + "epoch": 1.18, + "grad_norm": 0.5372399091720581, + "learning_rate": 0.0003997773332883176, + "loss": 3.0433, + "step": 24001 + }, + { + "epoch": 1.18, + "grad_norm": 0.5638691186904907, + "learning_rate": 0.0003997628134542644, + "loss": 2.8779, + "step": 24002 + }, + { + "epoch": 1.18, + "grad_norm": 0.5711449384689331, + "learning_rate": 0.0003997482933574551, + "loss": 3.1942, + "step": 24003 + }, + { + "epoch": 1.18, + "grad_norm": 0.5793060660362244, + "learning_rate": 0.0003997337729979279, + "loss": 2.9375, + "step": 24004 + }, + { + "epoch": 1.18, + "grad_norm": 0.5681477189064026, + "learning_rate": 0.00039971925237572127, + "loss": 3.0421, + "step": 24005 + }, + { + "epoch": 1.18, + "grad_norm": 0.5376327037811279, + "learning_rate": 0.0003997047314908733, + "loss": 3.0035, + "step": 24006 + }, + { + "epoch": 1.18, + "grad_norm": 0.5498843789100647, + "learning_rate": 0.00039969021034342233, + "loss": 3.1599, + "step": 24007 + }, + { + "epoch": 1.18, + "grad_norm": 0.5775700807571411, + "learning_rate": 0.0003996756889334064, + "loss": 2.8919, + "step": 24008 + }, + { + "epoch": 1.18, + "grad_norm": 0.5586049556732178, + "learning_rate": 0.00039966116726086395, + "loss": 3.3297, + "step": 24009 + }, + { + "epoch": 1.18, + "grad_norm": 0.5521610379219055, + "learning_rate": 0.0003996466453258331, + "loss": 2.9728, + "step": 24010 + }, + { + "epoch": 1.18, + "grad_norm": 0.5638512969017029, + "learning_rate": 0.0003996321231283523, + "loss": 3.262, + "step": 24011 + }, + { + "epoch": 1.18, + "grad_norm": 0.7236627340316772, + "learning_rate": 0.0003996176006684596, + "loss": 3.0747, + "step": 24012 + }, + { + "epoch": 1.18, + "grad_norm": 0.555939793586731, + "learning_rate": 0.0003996030779461933, + "loss": 3.1563, + "step": 24013 + }, + { + "epoch": 1.18, + "grad_norm": 0.5715100169181824, + "learning_rate": 0.0003995885549615916, + "loss": 3.1999, + "step": 24014 + }, + { + "epoch": 1.18, + "grad_norm": 0.5524293780326843, + "learning_rate": 0.00039957403171469286, + "loss": 2.9745, + "step": 24015 + }, + { + "epoch": 1.18, + "grad_norm": 0.545418381690979, + "learning_rate": 0.0003995595082055354, + "loss": 2.8698, + "step": 24016 + }, + { + "epoch": 1.18, + "grad_norm": 0.5677865743637085, + "learning_rate": 0.00039954498443415723, + "loss": 2.9115, + "step": 24017 + }, + { + "epoch": 1.18, + "grad_norm": 0.5774267911911011, + "learning_rate": 0.0003995304604005967, + "loss": 3.2582, + "step": 24018 + }, + { + "epoch": 1.18, + "grad_norm": 0.5392197966575623, + "learning_rate": 0.00039951593610489214, + "loss": 3.1144, + "step": 24019 + }, + { + "epoch": 1.18, + "grad_norm": 0.6039441823959351, + "learning_rate": 0.00039950141154708176, + "loss": 2.9836, + "step": 24020 + }, + { + "epoch": 1.18, + "grad_norm": 0.5584951639175415, + "learning_rate": 0.0003994868867272038, + "loss": 3.0626, + "step": 24021 + }, + { + "epoch": 1.18, + "grad_norm": 0.5053369402885437, + "learning_rate": 0.0003994723616452965, + "loss": 2.9783, + "step": 24022 + }, + { + "epoch": 1.18, + "grad_norm": 0.5872316956520081, + "learning_rate": 0.0003994578363013982, + "loss": 3.0212, + "step": 24023 + }, + { + "epoch": 1.18, + "grad_norm": 0.6137145161628723, + "learning_rate": 0.0003994433106955471, + "loss": 3.0148, + "step": 24024 + }, + { + "epoch": 1.18, + "grad_norm": 0.8941653370857239, + "learning_rate": 0.00039942878482778144, + "loss": 2.8822, + "step": 24025 + }, + { + "epoch": 1.18, + "grad_norm": 0.5612619519233704, + "learning_rate": 0.00039941425869813947, + "loss": 3.1952, + "step": 24026 + }, + { + "epoch": 1.18, + "grad_norm": 0.553356409072876, + "learning_rate": 0.0003993997323066596, + "loss": 3.2152, + "step": 24027 + }, + { + "epoch": 1.18, + "grad_norm": 0.5677022337913513, + "learning_rate": 0.00039938520565337984, + "loss": 3.0083, + "step": 24028 + }, + { + "epoch": 1.18, + "grad_norm": 0.5404044985771179, + "learning_rate": 0.0003993706787383385, + "loss": 3.0666, + "step": 24029 + }, + { + "epoch": 1.18, + "grad_norm": 0.5190367698669434, + "learning_rate": 0.00039935615156157407, + "loss": 3.021, + "step": 24030 + }, + { + "epoch": 1.18, + "grad_norm": 0.5348750948905945, + "learning_rate": 0.0003993416241231246, + "loss": 3.0933, + "step": 24031 + }, + { + "epoch": 1.18, + "grad_norm": 0.5640134811401367, + "learning_rate": 0.0003993270964230284, + "loss": 3.0944, + "step": 24032 + }, + { + "epoch": 1.18, + "grad_norm": 0.571087658405304, + "learning_rate": 0.0003993125684613238, + "loss": 2.8386, + "step": 24033 + }, + { + "epoch": 1.18, + "grad_norm": 0.5981314778327942, + "learning_rate": 0.000399298040238049, + "loss": 3.3129, + "step": 24034 + }, + { + "epoch": 1.18, + "grad_norm": 0.5638099908828735, + "learning_rate": 0.00039928351175324216, + "loss": 3.1856, + "step": 24035 + }, + { + "epoch": 1.18, + "grad_norm": 0.5106322169303894, + "learning_rate": 0.00039926898300694173, + "loss": 2.9335, + "step": 24036 + }, + { + "epoch": 1.18, + "grad_norm": 0.545299232006073, + "learning_rate": 0.0003992544539991859, + "loss": 3.3001, + "step": 24037 + }, + { + "epoch": 1.18, + "grad_norm": 0.5997403264045715, + "learning_rate": 0.00039923992473001307, + "loss": 3.2128, + "step": 24038 + }, + { + "epoch": 1.18, + "grad_norm": 0.5346583127975464, + "learning_rate": 0.0003992253951994612, + "loss": 2.8758, + "step": 24039 + }, + { + "epoch": 1.18, + "grad_norm": 0.560383141040802, + "learning_rate": 0.0003992108654075687, + "loss": 2.9834, + "step": 24040 + }, + { + "epoch": 1.18, + "grad_norm": 0.5270285606384277, + "learning_rate": 0.0003991963353543741, + "loss": 3.0306, + "step": 24041 + }, + { + "epoch": 1.18, + "grad_norm": 0.5616056323051453, + "learning_rate": 0.00039918180503991524, + "loss": 3.1098, + "step": 24042 + }, + { + "epoch": 1.18, + "grad_norm": 0.5329706072807312, + "learning_rate": 0.0003991672744642307, + "loss": 3.1355, + "step": 24043 + }, + { + "epoch": 1.18, + "grad_norm": 0.7826784253120422, + "learning_rate": 0.0003991527436273586, + "loss": 3.0251, + "step": 24044 + }, + { + "epoch": 1.18, + "grad_norm": 0.5575398206710815, + "learning_rate": 0.0003991382125293373, + "loss": 3.0598, + "step": 24045 + }, + { + "epoch": 1.18, + "grad_norm": 0.5318834185600281, + "learning_rate": 0.0003991236811702049, + "loss": 3.1486, + "step": 24046 + }, + { + "epoch": 1.18, + "grad_norm": 0.5418036580085754, + "learning_rate": 0.0003991091495499999, + "loss": 2.9798, + "step": 24047 + }, + { + "epoch": 1.18, + "grad_norm": 0.5505343675613403, + "learning_rate": 0.0003990946176687605, + "loss": 3.2627, + "step": 24048 + }, + { + "epoch": 1.18, + "grad_norm": 0.5388018488883972, + "learning_rate": 0.000399080085526525, + "loss": 3.0686, + "step": 24049 + }, + { + "epoch": 1.18, + "grad_norm": 0.6034098267555237, + "learning_rate": 0.0003990655531233314, + "loss": 3.0145, + "step": 24050 + }, + { + "epoch": 1.18, + "grad_norm": 0.5737993717193604, + "learning_rate": 0.0003990510204592184, + "loss": 3.0209, + "step": 24051 + }, + { + "epoch": 1.18, + "grad_norm": 0.7313047051429749, + "learning_rate": 0.000399036487534224, + "loss": 3.1884, + "step": 24052 + }, + { + "epoch": 1.18, + "grad_norm": 0.5465160012245178, + "learning_rate": 0.00039902195434838656, + "loss": 3.2552, + "step": 24053 + }, + { + "epoch": 1.18, + "grad_norm": 0.5565720200538635, + "learning_rate": 0.00039900742090174434, + "loss": 3.2226, + "step": 24054 + }, + { + "epoch": 1.18, + "grad_norm": 0.525080144405365, + "learning_rate": 0.0003989928871943356, + "loss": 3.1065, + "step": 24055 + }, + { + "epoch": 1.18, + "grad_norm": 0.5119069814682007, + "learning_rate": 0.00039897835322619875, + "loss": 3.3655, + "step": 24056 + }, + { + "epoch": 1.18, + "grad_norm": 0.5813723206520081, + "learning_rate": 0.00039896381899737194, + "loss": 2.8918, + "step": 24057 + }, + { + "epoch": 1.18, + "grad_norm": 0.5397539138793945, + "learning_rate": 0.0003989492845078934, + "loss": 3.1229, + "step": 24058 + }, + { + "epoch": 1.18, + "grad_norm": 0.571702778339386, + "learning_rate": 0.0003989347497578015, + "loss": 3.1404, + "step": 24059 + }, + { + "epoch": 1.18, + "grad_norm": 0.5537548065185547, + "learning_rate": 0.00039892021474713455, + "loss": 3.0636, + "step": 24060 + }, + { + "epoch": 1.18, + "grad_norm": 0.5488508343696594, + "learning_rate": 0.0003989056794759307, + "loss": 3.1616, + "step": 24061 + }, + { + "epoch": 1.18, + "grad_norm": 0.5703139901161194, + "learning_rate": 0.00039889114394422846, + "loss": 3.2048, + "step": 24062 + }, + { + "epoch": 1.18, + "grad_norm": 0.5412448644638062, + "learning_rate": 0.000398876608152066, + "loss": 3.161, + "step": 24063 + }, + { + "epoch": 1.18, + "grad_norm": 0.5422495007514954, + "learning_rate": 0.0003988620720994815, + "loss": 3.114, + "step": 24064 + }, + { + "epoch": 1.18, + "grad_norm": 0.5373824834823608, + "learning_rate": 0.00039884753578651327, + "loss": 3.222, + "step": 24065 + }, + { + "epoch": 1.18, + "grad_norm": 0.5398744344711304, + "learning_rate": 0.0003988329992131997, + "loss": 2.9718, + "step": 24066 + }, + { + "epoch": 1.18, + "grad_norm": 0.5511226058006287, + "learning_rate": 0.00039881846237957907, + "loss": 3.0787, + "step": 24067 + }, + { + "epoch": 1.18, + "grad_norm": 0.548173725605011, + "learning_rate": 0.00039880392528568963, + "loss": 2.9258, + "step": 24068 + }, + { + "epoch": 1.18, + "grad_norm": 0.5512843728065491, + "learning_rate": 0.0003987893879315696, + "loss": 2.9631, + "step": 24069 + }, + { + "epoch": 1.18, + "grad_norm": 0.58067387342453, + "learning_rate": 0.0003987748503172574, + "loss": 3.2054, + "step": 24070 + }, + { + "epoch": 1.18, + "grad_norm": 0.5880147814750671, + "learning_rate": 0.00039876031244279127, + "loss": 3.1545, + "step": 24071 + }, + { + "epoch": 1.18, + "grad_norm": 0.5471587777137756, + "learning_rate": 0.0003987457743082094, + "loss": 3.0934, + "step": 24072 + }, + { + "epoch": 1.18, + "grad_norm": 0.5884271264076233, + "learning_rate": 0.00039873123591355024, + "loss": 3.1801, + "step": 24073 + }, + { + "epoch": 1.18, + "grad_norm": 0.5782662630081177, + "learning_rate": 0.00039871669725885197, + "loss": 2.9995, + "step": 24074 + }, + { + "epoch": 1.18, + "grad_norm": 0.5610849261283875, + "learning_rate": 0.0003987021583441529, + "loss": 2.916, + "step": 24075 + }, + { + "epoch": 1.18, + "grad_norm": 0.5222210884094238, + "learning_rate": 0.00039868761916949134, + "loss": 3.0684, + "step": 24076 + }, + { + "epoch": 1.18, + "grad_norm": 0.5449653267860413, + "learning_rate": 0.0003986730797349056, + "loss": 3.2768, + "step": 24077 + }, + { + "epoch": 1.18, + "grad_norm": 0.5474014282226562, + "learning_rate": 0.00039865854004043406, + "loss": 3.3026, + "step": 24078 + }, + { + "epoch": 1.18, + "grad_norm": 0.564146876335144, + "learning_rate": 0.0003986440000861148, + "loss": 3.0649, + "step": 24079 + }, + { + "epoch": 1.18, + "grad_norm": 0.5155508518218994, + "learning_rate": 0.00039862945987198626, + "loss": 2.9618, + "step": 24080 + }, + { + "epoch": 1.18, + "grad_norm": 0.5813475251197815, + "learning_rate": 0.0003986149193980867, + "loss": 3.1608, + "step": 24081 + }, + { + "epoch": 1.18, + "grad_norm": 0.5683935284614563, + "learning_rate": 0.0003986003786644544, + "loss": 3.1103, + "step": 24082 + }, + { + "epoch": 1.18, + "grad_norm": 1.1067701578140259, + "learning_rate": 0.0003985858376711277, + "loss": 3.224, + "step": 24083 + }, + { + "epoch": 1.18, + "grad_norm": 0.5283617377281189, + "learning_rate": 0.0003985712964181448, + "loss": 3.2039, + "step": 24084 + }, + { + "epoch": 1.18, + "grad_norm": 0.5957078337669373, + "learning_rate": 0.00039855675490554416, + "loss": 3.2682, + "step": 24085 + }, + { + "epoch": 1.18, + "grad_norm": 0.549714207649231, + "learning_rate": 0.0003985422131333639, + "loss": 3.0082, + "step": 24086 + }, + { + "epoch": 1.18, + "grad_norm": 0.5549113154411316, + "learning_rate": 0.00039852767110164255, + "loss": 3.0617, + "step": 24087 + }, + { + "epoch": 1.18, + "grad_norm": 0.5381019115447998, + "learning_rate": 0.0003985131288104182, + "loss": 3.1637, + "step": 24088 + }, + { + "epoch": 1.18, + "grad_norm": 0.5498014092445374, + "learning_rate": 0.0003984985862597293, + "loss": 3.1576, + "step": 24089 + }, + { + "epoch": 1.18, + "grad_norm": 0.5414360761642456, + "learning_rate": 0.000398484043449614, + "loss": 3.2084, + "step": 24090 + }, + { + "epoch": 1.18, + "grad_norm": 0.5614427924156189, + "learning_rate": 0.0003984695003801107, + "loss": 3.0681, + "step": 24091 + }, + { + "epoch": 1.18, + "grad_norm": 0.5734679698944092, + "learning_rate": 0.00039845495705125776, + "loss": 3.0153, + "step": 24092 + }, + { + "epoch": 1.18, + "grad_norm": 0.5509762763977051, + "learning_rate": 0.00039844041346309337, + "loss": 2.9354, + "step": 24093 + }, + { + "epoch": 1.18, + "grad_norm": 0.5663211941719055, + "learning_rate": 0.00039842586961565585, + "loss": 3.1139, + "step": 24094 + }, + { + "epoch": 1.18, + "grad_norm": 0.5385051965713501, + "learning_rate": 0.0003984113255089835, + "loss": 3.0458, + "step": 24095 + }, + { + "epoch": 1.18, + "grad_norm": 0.543887197971344, + "learning_rate": 0.0003983967811431147, + "loss": 3.1119, + "step": 24096 + }, + { + "epoch": 1.18, + "grad_norm": 0.5807494521141052, + "learning_rate": 0.0003983822365180877, + "loss": 3.1381, + "step": 24097 + }, + { + "epoch": 1.18, + "grad_norm": 0.5525929927825928, + "learning_rate": 0.0003983676916339408, + "loss": 3.1725, + "step": 24098 + }, + { + "epoch": 1.18, + "grad_norm": 0.5865118503570557, + "learning_rate": 0.00039835314649071235, + "loss": 3.1903, + "step": 24099 + }, + { + "epoch": 1.18, + "grad_norm": 0.5506489276885986, + "learning_rate": 0.0003983386010884408, + "loss": 2.9352, + "step": 24100 + }, + { + "epoch": 1.18, + "grad_norm": 0.561968207359314, + "learning_rate": 0.00039832405542716403, + "loss": 2.9563, + "step": 24101 + }, + { + "epoch": 1.18, + "grad_norm": 0.5727139115333557, + "learning_rate": 0.00039830950950692077, + "loss": 3.2105, + "step": 24102 + }, + { + "epoch": 1.18, + "grad_norm": 0.6121776103973389, + "learning_rate": 0.0003982949633277492, + "loss": 3.0155, + "step": 24103 + }, + { + "epoch": 1.18, + "grad_norm": 0.5447525382041931, + "learning_rate": 0.0003982804168896876, + "loss": 3.2316, + "step": 24104 + }, + { + "epoch": 1.18, + "grad_norm": 0.540285587310791, + "learning_rate": 0.0003982658701927742, + "loss": 3.2454, + "step": 24105 + }, + { + "epoch": 1.18, + "grad_norm": 0.5771687030792236, + "learning_rate": 0.00039825132323704744, + "loss": 3.3046, + "step": 24106 + }, + { + "epoch": 1.18, + "grad_norm": 0.5980179905891418, + "learning_rate": 0.00039823677602254576, + "loss": 3.088, + "step": 24107 + }, + { + "epoch": 1.18, + "grad_norm": 0.5670889019966125, + "learning_rate": 0.0003982222285493072, + "loss": 3.3893, + "step": 24108 + }, + { + "epoch": 1.18, + "grad_norm": 0.5559228658676147, + "learning_rate": 0.0003982076808173701, + "loss": 3.106, + "step": 24109 + }, + { + "epoch": 1.18, + "grad_norm": 0.5641289353370667, + "learning_rate": 0.000398193132826773, + "loss": 3.0903, + "step": 24110 + }, + { + "epoch": 1.18, + "grad_norm": 0.5894806981086731, + "learning_rate": 0.000398178584577554, + "loss": 3.1442, + "step": 24111 + }, + { + "epoch": 1.18, + "grad_norm": 0.557578980922699, + "learning_rate": 0.0003981640360697516, + "loss": 2.9691, + "step": 24112 + }, + { + "epoch": 1.18, + "grad_norm": 0.5747301578521729, + "learning_rate": 0.000398149487303404, + "loss": 2.961, + "step": 24113 + }, + { + "epoch": 1.18, + "grad_norm": 0.5464869737625122, + "learning_rate": 0.00039813493827854955, + "loss": 3.1954, + "step": 24114 + }, + { + "epoch": 1.18, + "grad_norm": 0.5992234945297241, + "learning_rate": 0.0003981203889952265, + "loss": 3.0543, + "step": 24115 + }, + { + "epoch": 1.18, + "grad_norm": 0.5721291899681091, + "learning_rate": 0.00039810583945347326, + "loss": 3.1144, + "step": 24116 + }, + { + "epoch": 1.18, + "grad_norm": 0.6171442270278931, + "learning_rate": 0.000398091289653328, + "loss": 3.1549, + "step": 24117 + }, + { + "epoch": 1.18, + "grad_norm": 0.5718544125556946, + "learning_rate": 0.00039807673959482936, + "loss": 3.1521, + "step": 24118 + }, + { + "epoch": 1.18, + "grad_norm": 0.5414995551109314, + "learning_rate": 0.0003980621892780154, + "loss": 3.1951, + "step": 24119 + }, + { + "epoch": 1.18, + "grad_norm": 0.5666779279708862, + "learning_rate": 0.00039804763870292443, + "loss": 3.0715, + "step": 24120 + }, + { + "epoch": 1.18, + "grad_norm": 0.5580045580863953, + "learning_rate": 0.0003980330878695949, + "loss": 3.2015, + "step": 24121 + }, + { + "epoch": 1.18, + "grad_norm": 0.5530674457550049, + "learning_rate": 0.00039801853677806507, + "loss": 3.3237, + "step": 24122 + }, + { + "epoch": 1.18, + "grad_norm": 0.5504000186920166, + "learning_rate": 0.0003980039854283733, + "loss": 3.0533, + "step": 24123 + }, + { + "epoch": 1.18, + "grad_norm": 0.5976004600524902, + "learning_rate": 0.0003979894338205579, + "loss": 3.1773, + "step": 24124 + }, + { + "epoch": 1.18, + "grad_norm": 0.5514919757843018, + "learning_rate": 0.00039797488195465725, + "loss": 3.1461, + "step": 24125 + }, + { + "epoch": 1.18, + "grad_norm": 0.572727620601654, + "learning_rate": 0.0003979603298307095, + "loss": 3.2894, + "step": 24126 + }, + { + "epoch": 1.18, + "grad_norm": 0.5436162948608398, + "learning_rate": 0.00039794577744875313, + "loss": 3.0208, + "step": 24127 + }, + { + "epoch": 1.18, + "grad_norm": 0.5952852368354797, + "learning_rate": 0.0003979312248088264, + "loss": 3.1083, + "step": 24128 + }, + { + "epoch": 1.18, + "grad_norm": 0.5643694400787354, + "learning_rate": 0.00039791667191096773, + "loss": 3.1304, + "step": 24129 + }, + { + "epoch": 1.18, + "grad_norm": 0.55324786901474, + "learning_rate": 0.0003979021187552154, + "loss": 2.9863, + "step": 24130 + }, + { + "epoch": 1.18, + "grad_norm": 0.6007174253463745, + "learning_rate": 0.0003978875653416077, + "loss": 2.975, + "step": 24131 + }, + { + "epoch": 1.18, + "grad_norm": 0.5391753315925598, + "learning_rate": 0.00039787301167018297, + "loss": 2.843, + "step": 24132 + }, + { + "epoch": 1.18, + "grad_norm": 0.5486133098602295, + "learning_rate": 0.00039785845774097957, + "loss": 2.8428, + "step": 24133 + }, + { + "epoch": 1.18, + "grad_norm": 0.5005714297294617, + "learning_rate": 0.00039784390355403584, + "loss": 3.3195, + "step": 24134 + }, + { + "epoch": 1.18, + "grad_norm": 0.5840128064155579, + "learning_rate": 0.0003978293491093901, + "loss": 3.1903, + "step": 24135 + }, + { + "epoch": 1.18, + "grad_norm": 0.5641036033630371, + "learning_rate": 0.00039781479440708067, + "loss": 3.0371, + "step": 24136 + }, + { + "epoch": 1.18, + "grad_norm": 0.5386980772018433, + "learning_rate": 0.0003978002394471459, + "loss": 3.1271, + "step": 24137 + }, + { + "epoch": 1.18, + "grad_norm": 0.5703660249710083, + "learning_rate": 0.00039778568422962413, + "loss": 3.1415, + "step": 24138 + }, + { + "epoch": 1.18, + "grad_norm": 0.5552119016647339, + "learning_rate": 0.0003977711287545537, + "loss": 3.0826, + "step": 24139 + }, + { + "epoch": 1.18, + "grad_norm": 0.5309668779373169, + "learning_rate": 0.00039775657302197295, + "loss": 3.1237, + "step": 24140 + }, + { + "epoch": 1.18, + "grad_norm": 0.5294352173805237, + "learning_rate": 0.0003977420170319201, + "loss": 3.1383, + "step": 24141 + }, + { + "epoch": 1.18, + "grad_norm": 0.522890031337738, + "learning_rate": 0.0003977274607844337, + "loss": 3.101, + "step": 24142 + }, + { + "epoch": 1.18, + "grad_norm": 0.6287959218025208, + "learning_rate": 0.000397712904279552, + "loss": 3.0709, + "step": 24143 + }, + { + "epoch": 1.18, + "grad_norm": 0.5275660157203674, + "learning_rate": 0.0003976983475173132, + "loss": 3.1218, + "step": 24144 + }, + { + "epoch": 1.18, + "grad_norm": 0.5518277287483215, + "learning_rate": 0.0003976837904977558, + "loss": 3.0445, + "step": 24145 + }, + { + "epoch": 1.18, + "grad_norm": 0.5437619090080261, + "learning_rate": 0.0003976692332209181, + "loss": 3.0587, + "step": 24146 + }, + { + "epoch": 1.18, + "grad_norm": 0.6017915606498718, + "learning_rate": 0.00039765467568683846, + "loss": 3.0058, + "step": 24147 + }, + { + "epoch": 1.18, + "grad_norm": 0.5371351838111877, + "learning_rate": 0.00039764011789555513, + "loss": 2.9967, + "step": 24148 + }, + { + "epoch": 1.18, + "grad_norm": 0.5650933980941772, + "learning_rate": 0.0003976255598471066, + "loss": 2.981, + "step": 24149 + }, + { + "epoch": 1.18, + "grad_norm": 0.5488359332084656, + "learning_rate": 0.00039761100154153103, + "loss": 3.1748, + "step": 24150 + }, + { + "epoch": 1.18, + "grad_norm": 0.5488278269767761, + "learning_rate": 0.00039759644297886703, + "loss": 2.9815, + "step": 24151 + }, + { + "epoch": 1.18, + "grad_norm": 0.562656819820404, + "learning_rate": 0.0003975818841591526, + "loss": 3.2104, + "step": 24152 + }, + { + "epoch": 1.18, + "grad_norm": 0.530597448348999, + "learning_rate": 0.00039756732508242635, + "loss": 3.1563, + "step": 24153 + }, + { + "epoch": 1.18, + "grad_norm": 0.520431399345398, + "learning_rate": 0.00039755276574872665, + "loss": 3.0995, + "step": 24154 + }, + { + "epoch": 1.18, + "grad_norm": 0.5567206740379333, + "learning_rate": 0.0003975382061580916, + "loss": 3.1326, + "step": 24155 + }, + { + "epoch": 1.18, + "grad_norm": 0.5744289755821228, + "learning_rate": 0.0003975236463105597, + "loss": 3.183, + "step": 24156 + }, + { + "epoch": 1.18, + "grad_norm": 0.5486968755722046, + "learning_rate": 0.00039750908620616936, + "loss": 3.2061, + "step": 24157 + }, + { + "epoch": 1.18, + "grad_norm": 0.5852683186531067, + "learning_rate": 0.00039749452584495875, + "loss": 3.0303, + "step": 24158 + }, + { + "epoch": 1.18, + "grad_norm": 0.5728834271430969, + "learning_rate": 0.0003974799652269664, + "loss": 3.0113, + "step": 24159 + }, + { + "epoch": 1.18, + "grad_norm": 0.5454309582710266, + "learning_rate": 0.0003974654043522306, + "loss": 3.1408, + "step": 24160 + }, + { + "epoch": 1.18, + "grad_norm": 0.5599636435508728, + "learning_rate": 0.00039745084322078956, + "loss": 3.2156, + "step": 24161 + }, + { + "epoch": 1.18, + "grad_norm": 0.540389358997345, + "learning_rate": 0.0003974362818326819, + "loss": 3.0243, + "step": 24162 + }, + { + "epoch": 1.18, + "grad_norm": 0.5856047868728638, + "learning_rate": 0.0003974217201879457, + "loss": 2.8006, + "step": 24163 + }, + { + "epoch": 1.18, + "grad_norm": 0.5460880398750305, + "learning_rate": 0.0003974071582866195, + "loss": 3.036, + "step": 24164 + }, + { + "epoch": 1.18, + "grad_norm": 0.5401052832603455, + "learning_rate": 0.00039739259612874163, + "loss": 3.0225, + "step": 24165 + }, + { + "epoch": 1.18, + "grad_norm": 0.6093460917472839, + "learning_rate": 0.0003973780337143504, + "loss": 3.0323, + "step": 24166 + }, + { + "epoch": 1.18, + "grad_norm": 0.5864385366439819, + "learning_rate": 0.0003973634710434841, + "loss": 3.0327, + "step": 24167 + }, + { + "epoch": 1.18, + "grad_norm": 0.5695944428443909, + "learning_rate": 0.00039734890811618116, + "loss": 3.1903, + "step": 24168 + }, + { + "epoch": 1.18, + "grad_norm": 0.5272200107574463, + "learning_rate": 0.00039733434493248, + "loss": 3.1964, + "step": 24169 + }, + { + "epoch": 1.18, + "grad_norm": 0.5584858655929565, + "learning_rate": 0.00039731978149241883, + "loss": 2.8314, + "step": 24170 + }, + { + "epoch": 1.18, + "grad_norm": 0.542620837688446, + "learning_rate": 0.0003973052177960361, + "loss": 3.1823, + "step": 24171 + }, + { + "epoch": 1.18, + "grad_norm": 0.5688046813011169, + "learning_rate": 0.00039729065384337025, + "loss": 2.9456, + "step": 24172 + }, + { + "epoch": 1.18, + "grad_norm": 0.5506514310836792, + "learning_rate": 0.0003972760896344594, + "loss": 3.1059, + "step": 24173 + }, + { + "epoch": 1.18, + "grad_norm": 0.5505015254020691, + "learning_rate": 0.00039726152516934214, + "loss": 3.2437, + "step": 24174 + }, + { + "epoch": 1.18, + "grad_norm": 0.5867189764976501, + "learning_rate": 0.0003972469604480567, + "loss": 2.8984, + "step": 24175 + }, + { + "epoch": 1.18, + "grad_norm": 0.5614295601844788, + "learning_rate": 0.00039723239547064155, + "loss": 3.1634, + "step": 24176 + }, + { + "epoch": 1.18, + "grad_norm": 0.5461905002593994, + "learning_rate": 0.00039721783023713496, + "loss": 3.1337, + "step": 24177 + }, + { + "epoch": 1.18, + "grad_norm": 0.5388670563697815, + "learning_rate": 0.0003972032647475752, + "loss": 3.1324, + "step": 24178 + }, + { + "epoch": 1.18, + "grad_norm": 0.5529229640960693, + "learning_rate": 0.00039718869900200085, + "loss": 3.2731, + "step": 24179 + }, + { + "epoch": 1.19, + "grad_norm": 0.5644211173057556, + "learning_rate": 0.0003971741330004502, + "loss": 3.0189, + "step": 24180 + }, + { + "epoch": 1.19, + "grad_norm": 0.5382275581359863, + "learning_rate": 0.00039715956674296154, + "loss": 3.1831, + "step": 24181 + }, + { + "epoch": 1.19, + "grad_norm": 0.5577836036682129, + "learning_rate": 0.00039714500022957326, + "loss": 3.1967, + "step": 24182 + }, + { + "epoch": 1.19, + "grad_norm": 0.5363103151321411, + "learning_rate": 0.0003971304334603238, + "loss": 2.9104, + "step": 24183 + }, + { + "epoch": 1.19, + "grad_norm": 0.538547694683075, + "learning_rate": 0.0003971158664352514, + "loss": 3.1109, + "step": 24184 + }, + { + "epoch": 1.19, + "grad_norm": 0.5949356555938721, + "learning_rate": 0.0003971012991543945, + "loss": 3.0122, + "step": 24185 + }, + { + "epoch": 1.19, + "grad_norm": 0.5695586204528809, + "learning_rate": 0.0003970867316177915, + "loss": 3.1775, + "step": 24186 + }, + { + "epoch": 1.19, + "grad_norm": 0.6203005313873291, + "learning_rate": 0.0003970721638254808, + "loss": 3.1296, + "step": 24187 + }, + { + "epoch": 1.19, + "grad_norm": 0.5122324228286743, + "learning_rate": 0.0003970575957775006, + "loss": 3.079, + "step": 24188 + }, + { + "epoch": 1.19, + "grad_norm": 0.5441964864730835, + "learning_rate": 0.0003970430274738894, + "loss": 3.109, + "step": 24189 + }, + { + "epoch": 1.19, + "grad_norm": 0.5421319007873535, + "learning_rate": 0.00039702845891468556, + "loss": 3.1183, + "step": 24190 + }, + { + "epoch": 1.19, + "grad_norm": 0.5920052528381348, + "learning_rate": 0.0003970138900999275, + "loss": 3.0434, + "step": 24191 + }, + { + "epoch": 1.19, + "grad_norm": 0.5872803926467896, + "learning_rate": 0.0003969993210296535, + "loss": 2.8057, + "step": 24192 + }, + { + "epoch": 1.19, + "grad_norm": 0.557303786277771, + "learning_rate": 0.0003969847517039018, + "loss": 3.0588, + "step": 24193 + }, + { + "epoch": 1.19, + "grad_norm": 0.5577976107597351, + "learning_rate": 0.0003969701821227111, + "loss": 3.2499, + "step": 24194 + }, + { + "epoch": 1.19, + "grad_norm": 0.5627845525741577, + "learning_rate": 0.00039695561228611953, + "loss": 3.059, + "step": 24195 + }, + { + "epoch": 1.19, + "grad_norm": 0.607268750667572, + "learning_rate": 0.0003969410421941655, + "loss": 3.0055, + "step": 24196 + }, + { + "epoch": 1.19, + "grad_norm": 0.5258342027664185, + "learning_rate": 0.0003969264718468875, + "loss": 2.8618, + "step": 24197 + }, + { + "epoch": 1.19, + "grad_norm": 0.5296414494514465, + "learning_rate": 0.0003969119012443238, + "loss": 3.2552, + "step": 24198 + }, + { + "epoch": 1.19, + "grad_norm": 0.5506784915924072, + "learning_rate": 0.00039689733038651275, + "loss": 2.9617, + "step": 24199 + }, + { + "epoch": 1.19, + "grad_norm": 0.5685448050498962, + "learning_rate": 0.00039688275927349286, + "loss": 3.12, + "step": 24200 + }, + { + "epoch": 1.19, + "grad_norm": 0.5335084199905396, + "learning_rate": 0.0003968681879053024, + "loss": 3.0428, + "step": 24201 + }, + { + "epoch": 1.19, + "grad_norm": 0.6116160750389099, + "learning_rate": 0.0003968536162819798, + "loss": 2.9557, + "step": 24202 + }, + { + "epoch": 1.19, + "grad_norm": 0.5441951751708984, + "learning_rate": 0.00039683904440356336, + "loss": 3.272, + "step": 24203 + }, + { + "epoch": 1.19, + "grad_norm": 0.5496557950973511, + "learning_rate": 0.00039682447227009146, + "loss": 3.1211, + "step": 24204 + }, + { + "epoch": 1.19, + "grad_norm": 0.6027690768241882, + "learning_rate": 0.00039680989988160275, + "loss": 3.2041, + "step": 24205 + }, + { + "epoch": 1.19, + "grad_norm": 0.7103535532951355, + "learning_rate": 0.00039679532723813516, + "loss": 3.384, + "step": 24206 + }, + { + "epoch": 1.19, + "grad_norm": 0.570144772529602, + "learning_rate": 0.00039678075433972746, + "loss": 3.0117, + "step": 24207 + }, + { + "epoch": 1.19, + "grad_norm": 0.5552254915237427, + "learning_rate": 0.0003967661811864178, + "loss": 3.0736, + "step": 24208 + }, + { + "epoch": 1.19, + "grad_norm": 0.5385003685951233, + "learning_rate": 0.00039675160777824465, + "loss": 3.0785, + "step": 24209 + }, + { + "epoch": 1.19, + "grad_norm": 0.5606467127799988, + "learning_rate": 0.00039673703411524647, + "loss": 3.0978, + "step": 24210 + }, + { + "epoch": 1.19, + "grad_norm": 0.5405035614967346, + "learning_rate": 0.0003967224601974615, + "loss": 3.1803, + "step": 24211 + }, + { + "epoch": 1.19, + "grad_norm": 0.5441907048225403, + "learning_rate": 0.0003967078860249281, + "loss": 3.0588, + "step": 24212 + }, + { + "epoch": 1.19, + "grad_norm": 0.5538133382797241, + "learning_rate": 0.0003966933115976849, + "loss": 3.1926, + "step": 24213 + }, + { + "epoch": 1.19, + "grad_norm": 0.528517484664917, + "learning_rate": 0.00039667873691576997, + "loss": 3.4385, + "step": 24214 + }, + { + "epoch": 1.19, + "grad_norm": 0.5590735077857971, + "learning_rate": 0.0003966641619792219, + "loss": 3.067, + "step": 24215 + }, + { + "epoch": 1.19, + "grad_norm": 0.591706395149231, + "learning_rate": 0.0003966495867880791, + "loss": 3.0503, + "step": 24216 + }, + { + "epoch": 1.19, + "grad_norm": 0.5574644207954407, + "learning_rate": 0.00039663501134237983, + "loss": 3.0304, + "step": 24217 + }, + { + "epoch": 1.19, + "grad_norm": 0.5569961667060852, + "learning_rate": 0.00039662043564216257, + "loss": 2.9025, + "step": 24218 + }, + { + "epoch": 1.19, + "grad_norm": 0.5450246930122375, + "learning_rate": 0.00039660585968746554, + "loss": 3.1877, + "step": 24219 + }, + { + "epoch": 1.19, + "grad_norm": 0.562933623790741, + "learning_rate": 0.00039659128347832747, + "loss": 3.0375, + "step": 24220 + }, + { + "epoch": 1.19, + "grad_norm": 0.5538458824157715, + "learning_rate": 0.0003965767070147865, + "loss": 3.0816, + "step": 24221 + }, + { + "epoch": 1.19, + "grad_norm": 0.60169917345047, + "learning_rate": 0.000396562130296881, + "loss": 3.1415, + "step": 24222 + }, + { + "epoch": 1.19, + "grad_norm": 0.5521277785301208, + "learning_rate": 0.00039654755332464947, + "loss": 3.0604, + "step": 24223 + }, + { + "epoch": 1.19, + "grad_norm": 0.5698322653770447, + "learning_rate": 0.0003965329760981303, + "loss": 3.283, + "step": 24224 + }, + { + "epoch": 1.19, + "grad_norm": 0.5327075123786926, + "learning_rate": 0.0003965183986173618, + "loss": 3.0547, + "step": 24225 + }, + { + "epoch": 1.19, + "grad_norm": 0.5273543000221252, + "learning_rate": 0.0003965038208823824, + "loss": 3.2407, + "step": 24226 + }, + { + "epoch": 1.19, + "grad_norm": 0.7036231756210327, + "learning_rate": 0.00039648924289323057, + "loss": 3.1334, + "step": 24227 + }, + { + "epoch": 1.19, + "grad_norm": 0.5902255773544312, + "learning_rate": 0.00039647466464994463, + "loss": 2.9559, + "step": 24228 + }, + { + "epoch": 1.19, + "grad_norm": 0.5623027086257935, + "learning_rate": 0.0003964600861525629, + "loss": 2.899, + "step": 24229 + }, + { + "epoch": 1.19, + "grad_norm": 0.5198655128479004, + "learning_rate": 0.00039644550740112393, + "loss": 3.029, + "step": 24230 + }, + { + "epoch": 1.19, + "grad_norm": 0.5730830430984497, + "learning_rate": 0.00039643092839566614, + "loss": 2.9395, + "step": 24231 + }, + { + "epoch": 1.19, + "grad_norm": 0.5397255420684814, + "learning_rate": 0.00039641634913622776, + "loss": 3.1824, + "step": 24232 + }, + { + "epoch": 1.19, + "grad_norm": 0.5603347420692444, + "learning_rate": 0.0003964017696228473, + "loss": 3.0076, + "step": 24233 + }, + { + "epoch": 1.19, + "grad_norm": 0.556710422039032, + "learning_rate": 0.00039638718985556307, + "loss": 3.1231, + "step": 24234 + }, + { + "epoch": 1.19, + "grad_norm": 0.5215965509414673, + "learning_rate": 0.0003963726098344136, + "loss": 3.0744, + "step": 24235 + }, + { + "epoch": 1.19, + "grad_norm": 0.5522467494010925, + "learning_rate": 0.0003963580295594372, + "loss": 3.2905, + "step": 24236 + }, + { + "epoch": 1.19, + "grad_norm": 0.5642509460449219, + "learning_rate": 0.0003963434490306722, + "loss": 3.1552, + "step": 24237 + }, + { + "epoch": 1.19, + "grad_norm": 0.5451124310493469, + "learning_rate": 0.0003963288682481573, + "loss": 3.0713, + "step": 24238 + }, + { + "epoch": 1.19, + "grad_norm": 0.5440728664398193, + "learning_rate": 0.00039631428721193055, + "loss": 3.1808, + "step": 24239 + }, + { + "epoch": 1.19, + "grad_norm": 0.5344854593276978, + "learning_rate": 0.00039629970592203043, + "loss": 3.118, + "step": 24240 + }, + { + "epoch": 1.19, + "grad_norm": 0.5784305930137634, + "learning_rate": 0.00039628512437849553, + "loss": 3.2412, + "step": 24241 + }, + { + "epoch": 1.19, + "grad_norm": 0.5519080758094788, + "learning_rate": 0.00039627054258136417, + "loss": 3.3491, + "step": 24242 + }, + { + "epoch": 1.19, + "grad_norm": 0.5608418583869934, + "learning_rate": 0.0003962559605306747, + "loss": 3.0723, + "step": 24243 + }, + { + "epoch": 1.19, + "grad_norm": 0.5935826897621155, + "learning_rate": 0.0003962413782264656, + "loss": 3.1175, + "step": 24244 + }, + { + "epoch": 1.19, + "grad_norm": 0.5591660141944885, + "learning_rate": 0.0003962267956687752, + "loss": 3.0419, + "step": 24245 + }, + { + "epoch": 1.19, + "grad_norm": 0.6037750244140625, + "learning_rate": 0.0003962122128576419, + "loss": 3.0272, + "step": 24246 + }, + { + "epoch": 1.19, + "grad_norm": 0.5254233479499817, + "learning_rate": 0.00039619762979310416, + "loss": 3.1549, + "step": 24247 + }, + { + "epoch": 1.19, + "grad_norm": 0.5338966846466064, + "learning_rate": 0.0003961830464752004, + "loss": 3.102, + "step": 24248 + }, + { + "epoch": 1.19, + "grad_norm": 0.5736029744148254, + "learning_rate": 0.000396168462903969, + "loss": 3.2333, + "step": 24249 + }, + { + "epoch": 1.19, + "grad_norm": 0.5698594450950623, + "learning_rate": 0.00039615387907944834, + "loss": 3.2735, + "step": 24250 + }, + { + "epoch": 1.19, + "grad_norm": 0.5339264869689941, + "learning_rate": 0.0003961392950016769, + "loss": 2.9759, + "step": 24251 + }, + { + "epoch": 1.19, + "grad_norm": 0.5492169857025146, + "learning_rate": 0.000396124710670693, + "loss": 3.0123, + "step": 24252 + }, + { + "epoch": 1.19, + "grad_norm": 0.5674329996109009, + "learning_rate": 0.00039611012608653523, + "loss": 3.2794, + "step": 24253 + }, + { + "epoch": 1.19, + "grad_norm": 0.5592032670974731, + "learning_rate": 0.00039609554124924183, + "loss": 3.1204, + "step": 24254 + }, + { + "epoch": 1.19, + "grad_norm": 0.5712992548942566, + "learning_rate": 0.0003960809561588512, + "loss": 2.8761, + "step": 24255 + }, + { + "epoch": 1.19, + "grad_norm": 0.5755642652511597, + "learning_rate": 0.0003960663708154019, + "loss": 2.9666, + "step": 24256 + }, + { + "epoch": 1.19, + "grad_norm": 0.5444374084472656, + "learning_rate": 0.0003960517852189323, + "loss": 3.0622, + "step": 24257 + }, + { + "epoch": 1.19, + "grad_norm": 0.5554080605506897, + "learning_rate": 0.00039603719936948074, + "loss": 3.2019, + "step": 24258 + }, + { + "epoch": 1.19, + "grad_norm": 0.6017959713935852, + "learning_rate": 0.0003960226132670857, + "loss": 2.9427, + "step": 24259 + }, + { + "epoch": 1.19, + "grad_norm": 0.5440652966499329, + "learning_rate": 0.00039600802691178556, + "loss": 3.1756, + "step": 24260 + }, + { + "epoch": 1.19, + "grad_norm": 0.5530151128768921, + "learning_rate": 0.0003959934403036187, + "loss": 2.9358, + "step": 24261 + }, + { + "epoch": 1.19, + "grad_norm": 0.5590417385101318, + "learning_rate": 0.00039597885344262366, + "loss": 3.0388, + "step": 24262 + }, + { + "epoch": 1.19, + "grad_norm": 0.5701658725738525, + "learning_rate": 0.0003959642663288387, + "loss": 3.0352, + "step": 24263 + }, + { + "epoch": 1.19, + "grad_norm": 0.5556230545043945, + "learning_rate": 0.0003959496789623025, + "loss": 3.1208, + "step": 24264 + }, + { + "epoch": 1.19, + "grad_norm": 0.5601057410240173, + "learning_rate": 0.00039593509134305317, + "loss": 3.0421, + "step": 24265 + }, + { + "epoch": 1.19, + "grad_norm": 0.5562484860420227, + "learning_rate": 0.00039592050347112933, + "loss": 3.1872, + "step": 24266 + }, + { + "epoch": 1.19, + "grad_norm": 0.5837891101837158, + "learning_rate": 0.00039590591534656937, + "loss": 3.0225, + "step": 24267 + }, + { + "epoch": 1.19, + "grad_norm": 0.5743361115455627, + "learning_rate": 0.0003958913269694116, + "loss": 3.1041, + "step": 24268 + }, + { + "epoch": 1.19, + "grad_norm": 0.6154342889785767, + "learning_rate": 0.00039587673833969464, + "loss": 2.9143, + "step": 24269 + }, + { + "epoch": 1.19, + "grad_norm": 0.5403562188148499, + "learning_rate": 0.00039586214945745664, + "loss": 2.9925, + "step": 24270 + }, + { + "epoch": 1.19, + "grad_norm": 0.5781658291816711, + "learning_rate": 0.00039584756032273635, + "loss": 3.1171, + "step": 24271 + }, + { + "epoch": 1.19, + "grad_norm": 0.5835286378860474, + "learning_rate": 0.00039583297093557195, + "loss": 3.1155, + "step": 24272 + }, + { + "epoch": 1.19, + "grad_norm": 0.5412124395370483, + "learning_rate": 0.00039581838129600197, + "loss": 3.056, + "step": 24273 + }, + { + "epoch": 1.19, + "grad_norm": 0.5261394381523132, + "learning_rate": 0.00039580379140406475, + "loss": 3.0973, + "step": 24274 + }, + { + "epoch": 1.19, + "grad_norm": 0.5671351552009583, + "learning_rate": 0.00039578920125979893, + "loss": 3.0282, + "step": 24275 + }, + { + "epoch": 1.19, + "grad_norm": 0.5514994263648987, + "learning_rate": 0.0003957746108632427, + "loss": 3.1963, + "step": 24276 + }, + { + "epoch": 1.19, + "grad_norm": 0.576802134513855, + "learning_rate": 0.00039576002021443456, + "loss": 3.0313, + "step": 24277 + }, + { + "epoch": 1.19, + "grad_norm": 0.5313069224357605, + "learning_rate": 0.00039574542931341297, + "loss": 3.2151, + "step": 24278 + }, + { + "epoch": 1.19, + "grad_norm": 0.536454439163208, + "learning_rate": 0.00039573083816021637, + "loss": 3.1187, + "step": 24279 + }, + { + "epoch": 1.19, + "grad_norm": 0.5508918762207031, + "learning_rate": 0.00039571624675488313, + "loss": 2.9256, + "step": 24280 + }, + { + "epoch": 1.19, + "grad_norm": 0.5629120469093323, + "learning_rate": 0.0003957016550974517, + "loss": 3.0875, + "step": 24281 + }, + { + "epoch": 1.19, + "grad_norm": 0.5517382025718689, + "learning_rate": 0.0003956870631879606, + "loss": 2.9645, + "step": 24282 + }, + { + "epoch": 1.19, + "grad_norm": 0.5510847568511963, + "learning_rate": 0.00039567247102644817, + "loss": 3.0028, + "step": 24283 + }, + { + "epoch": 1.19, + "grad_norm": 0.5683895945549011, + "learning_rate": 0.0003956578786129528, + "loss": 3.2715, + "step": 24284 + }, + { + "epoch": 1.19, + "grad_norm": 0.5647633671760559, + "learning_rate": 0.00039564328594751306, + "loss": 2.9625, + "step": 24285 + }, + { + "epoch": 1.19, + "grad_norm": 0.5621564388275146, + "learning_rate": 0.00039562869303016724, + "loss": 3.2011, + "step": 24286 + }, + { + "epoch": 1.19, + "grad_norm": 0.5985468029975891, + "learning_rate": 0.0003956140998609539, + "loss": 3.3408, + "step": 24287 + }, + { + "epoch": 1.19, + "grad_norm": 0.5399235486984253, + "learning_rate": 0.00039559950643991144, + "loss": 3.1029, + "step": 24288 + }, + { + "epoch": 1.19, + "grad_norm": 0.5459802150726318, + "learning_rate": 0.0003955849127670783, + "loss": 2.9857, + "step": 24289 + }, + { + "epoch": 1.19, + "grad_norm": 0.5875890254974365, + "learning_rate": 0.00039557031884249286, + "loss": 2.938, + "step": 24290 + }, + { + "epoch": 1.19, + "grad_norm": 0.5480020642280579, + "learning_rate": 0.0003955557246661935, + "loss": 3.3471, + "step": 24291 + }, + { + "epoch": 1.19, + "grad_norm": 0.5252436995506287, + "learning_rate": 0.0003955411302382189, + "loss": 3.2947, + "step": 24292 + }, + { + "epoch": 1.19, + "grad_norm": 0.5304093956947327, + "learning_rate": 0.0003955265355586073, + "loss": 3.3092, + "step": 24293 + }, + { + "epoch": 1.19, + "grad_norm": 0.5480414628982544, + "learning_rate": 0.00039551194062739713, + "loss": 3.0931, + "step": 24294 + }, + { + "epoch": 1.19, + "grad_norm": 0.5469361543655396, + "learning_rate": 0.0003954973454446269, + "loss": 3.1805, + "step": 24295 + }, + { + "epoch": 1.19, + "grad_norm": 0.5798367857933044, + "learning_rate": 0.0003954827500103351, + "loss": 3.0148, + "step": 24296 + }, + { + "epoch": 1.19, + "grad_norm": 0.5936307907104492, + "learning_rate": 0.0003954681543245601, + "loss": 2.8564, + "step": 24297 + }, + { + "epoch": 1.19, + "grad_norm": 0.5227268934249878, + "learning_rate": 0.0003954535583873403, + "loss": 3.0408, + "step": 24298 + }, + { + "epoch": 1.19, + "grad_norm": 0.5441100001335144, + "learning_rate": 0.0003954389621987143, + "loss": 3.0124, + "step": 24299 + }, + { + "epoch": 1.19, + "grad_norm": 0.5319622159004211, + "learning_rate": 0.0003954243657587203, + "loss": 2.8448, + "step": 24300 + }, + { + "epoch": 1.19, + "grad_norm": 0.640273928642273, + "learning_rate": 0.000395409769067397, + "loss": 3.1077, + "step": 24301 + }, + { + "epoch": 1.19, + "grad_norm": 0.5414076447486877, + "learning_rate": 0.0003953951721247826, + "loss": 3.1144, + "step": 24302 + }, + { + "epoch": 1.19, + "grad_norm": 0.5242058038711548, + "learning_rate": 0.00039538057493091584, + "loss": 2.9945, + "step": 24303 + }, + { + "epoch": 1.19, + "grad_norm": 0.5769972205162048, + "learning_rate": 0.00039536597748583496, + "loss": 2.9874, + "step": 24304 + }, + { + "epoch": 1.19, + "grad_norm": 0.5243396759033203, + "learning_rate": 0.0003953513797895784, + "loss": 3.0878, + "step": 24305 + }, + { + "epoch": 1.19, + "grad_norm": 0.5818905234336853, + "learning_rate": 0.0003953367818421847, + "loss": 3.1227, + "step": 24306 + }, + { + "epoch": 1.19, + "grad_norm": 0.5517255067825317, + "learning_rate": 0.00039532218364369225, + "loss": 3.006, + "step": 24307 + }, + { + "epoch": 1.19, + "grad_norm": 0.5664587020874023, + "learning_rate": 0.0003953075851941395, + "loss": 3.0768, + "step": 24308 + }, + { + "epoch": 1.19, + "grad_norm": 0.547131359577179, + "learning_rate": 0.0003952929864935649, + "loss": 3.054, + "step": 24309 + }, + { + "epoch": 1.19, + "grad_norm": 0.5453808307647705, + "learning_rate": 0.00039527838754200694, + "loss": 3.1298, + "step": 24310 + }, + { + "epoch": 1.19, + "grad_norm": 0.549038827419281, + "learning_rate": 0.00039526378833950406, + "loss": 2.9323, + "step": 24311 + }, + { + "epoch": 1.19, + "grad_norm": 0.5720995664596558, + "learning_rate": 0.00039524918888609467, + "loss": 3.0093, + "step": 24312 + }, + { + "epoch": 1.19, + "grad_norm": 0.5207635164260864, + "learning_rate": 0.0003952345891818172, + "loss": 2.853, + "step": 24313 + }, + { + "epoch": 1.19, + "grad_norm": 0.5525292754173279, + "learning_rate": 0.0003952199892267102, + "loss": 3.2314, + "step": 24314 + }, + { + "epoch": 1.19, + "grad_norm": 0.5358814001083374, + "learning_rate": 0.0003952053890208121, + "loss": 2.8513, + "step": 24315 + }, + { + "epoch": 1.19, + "grad_norm": 0.5728432536125183, + "learning_rate": 0.00039519078856416136, + "loss": 3.1612, + "step": 24316 + }, + { + "epoch": 1.19, + "grad_norm": 0.5785890817642212, + "learning_rate": 0.0003951761878567962, + "loss": 3.1774, + "step": 24317 + }, + { + "epoch": 1.19, + "grad_norm": 0.6545047760009766, + "learning_rate": 0.00039516158689875555, + "loss": 2.9229, + "step": 24318 + }, + { + "epoch": 1.19, + "grad_norm": 0.5325422883033752, + "learning_rate": 0.0003951469856900775, + "loss": 2.9676, + "step": 24319 + }, + { + "epoch": 1.19, + "grad_norm": 0.5706045627593994, + "learning_rate": 0.0003951323842308005, + "loss": 3.0287, + "step": 24320 + }, + { + "epoch": 1.19, + "grad_norm": 0.5675446391105652, + "learning_rate": 0.00039511778252096323, + "loss": 3.1198, + "step": 24321 + }, + { + "epoch": 1.19, + "grad_norm": 0.5522854924201965, + "learning_rate": 0.00039510318056060396, + "loss": 2.9583, + "step": 24322 + }, + { + "epoch": 1.19, + "grad_norm": 0.5343487858772278, + "learning_rate": 0.00039508857834976124, + "loss": 3.3867, + "step": 24323 + }, + { + "epoch": 1.19, + "grad_norm": 0.5691496133804321, + "learning_rate": 0.00039507397588847346, + "loss": 3.257, + "step": 24324 + }, + { + "epoch": 1.19, + "grad_norm": 0.5750447511672974, + "learning_rate": 0.00039505937317677915, + "loss": 2.8305, + "step": 24325 + }, + { + "epoch": 1.19, + "grad_norm": 0.6205912828445435, + "learning_rate": 0.0003950447702147168, + "loss": 2.6935, + "step": 24326 + }, + { + "epoch": 1.19, + "grad_norm": 0.5219376087188721, + "learning_rate": 0.0003950301670023247, + "loss": 3.142, + "step": 24327 + }, + { + "epoch": 1.19, + "grad_norm": 0.5368987917900085, + "learning_rate": 0.00039501556353964146, + "loss": 3.1002, + "step": 24328 + }, + { + "epoch": 1.19, + "grad_norm": 0.5906136631965637, + "learning_rate": 0.0003950009598267057, + "loss": 2.9932, + "step": 24329 + }, + { + "epoch": 1.19, + "grad_norm": 0.5572717785835266, + "learning_rate": 0.0003949863558635555, + "loss": 3.0393, + "step": 24330 + }, + { + "epoch": 1.19, + "grad_norm": 0.5366165041923523, + "learning_rate": 0.00039497175165022956, + "loss": 2.838, + "step": 24331 + }, + { + "epoch": 1.19, + "grad_norm": 0.5345458388328552, + "learning_rate": 0.0003949571471867663, + "loss": 2.8754, + "step": 24332 + }, + { + "epoch": 1.19, + "grad_norm": 0.5713475942611694, + "learning_rate": 0.00039494254247320423, + "loss": 2.9894, + "step": 24333 + }, + { + "epoch": 1.19, + "grad_norm": 0.5291685461997986, + "learning_rate": 0.00039492793750958173, + "loss": 3.1748, + "step": 24334 + }, + { + "epoch": 1.19, + "grad_norm": 0.5592206120491028, + "learning_rate": 0.0003949133322959373, + "loss": 3.2341, + "step": 24335 + }, + { + "epoch": 1.19, + "grad_norm": 0.5599371194839478, + "learning_rate": 0.0003948987268323094, + "loss": 3.2312, + "step": 24336 + }, + { + "epoch": 1.19, + "grad_norm": 0.5585187077522278, + "learning_rate": 0.0003948841211187366, + "loss": 3.0695, + "step": 24337 + }, + { + "epoch": 1.19, + "grad_norm": 0.5566004514694214, + "learning_rate": 0.0003948695151552572, + "loss": 3.202, + "step": 24338 + }, + { + "epoch": 1.19, + "grad_norm": 0.5448996424674988, + "learning_rate": 0.0003948549089419098, + "loss": 3.0852, + "step": 24339 + }, + { + "epoch": 1.19, + "grad_norm": 0.5520145893096924, + "learning_rate": 0.0003948403024787329, + "loss": 3.1257, + "step": 24340 + }, + { + "epoch": 1.19, + "grad_norm": 0.5336360931396484, + "learning_rate": 0.00039482569576576484, + "loss": 3.1598, + "step": 24341 + }, + { + "epoch": 1.19, + "grad_norm": 0.5652227997779846, + "learning_rate": 0.00039481108880304403, + "loss": 3.0888, + "step": 24342 + }, + { + "epoch": 1.19, + "grad_norm": 0.5527975559234619, + "learning_rate": 0.0003947964815906091, + "loss": 3.1309, + "step": 24343 + }, + { + "epoch": 1.19, + "grad_norm": 0.5577079057693481, + "learning_rate": 0.0003947818741284987, + "loss": 3.1479, + "step": 24344 + }, + { + "epoch": 1.19, + "grad_norm": 0.5621340274810791, + "learning_rate": 0.0003947672664167509, + "loss": 3.0016, + "step": 24345 + }, + { + "epoch": 1.19, + "grad_norm": 0.5487366914749146, + "learning_rate": 0.0003947526584554044, + "loss": 3.0614, + "step": 24346 + }, + { + "epoch": 1.19, + "grad_norm": 0.5261529088020325, + "learning_rate": 0.0003947380502444976, + "loss": 3.2059, + "step": 24347 + }, + { + "epoch": 1.19, + "grad_norm": 0.5683314204216003, + "learning_rate": 0.000394723441784069, + "loss": 3.2642, + "step": 24348 + }, + { + "epoch": 1.19, + "grad_norm": 0.5974188446998596, + "learning_rate": 0.0003947088330741571, + "loss": 3.0305, + "step": 24349 + }, + { + "epoch": 1.19, + "grad_norm": 0.5211678147315979, + "learning_rate": 0.0003946942241148004, + "loss": 3.1625, + "step": 24350 + }, + { + "epoch": 1.19, + "grad_norm": 0.553632915019989, + "learning_rate": 0.0003946796149060373, + "loss": 3.1093, + "step": 24351 + }, + { + "epoch": 1.19, + "grad_norm": 0.5715633034706116, + "learning_rate": 0.00039466500544790633, + "loss": 3.187, + "step": 24352 + }, + { + "epoch": 1.19, + "grad_norm": 0.5198462009429932, + "learning_rate": 0.00039465039574044597, + "loss": 3.118, + "step": 24353 + }, + { + "epoch": 1.19, + "grad_norm": 0.5220559239387512, + "learning_rate": 0.00039463578578369463, + "loss": 3.1978, + "step": 24354 + }, + { + "epoch": 1.19, + "grad_norm": 0.5109367370605469, + "learning_rate": 0.00039462117557769095, + "loss": 3.1694, + "step": 24355 + }, + { + "epoch": 1.19, + "grad_norm": 0.5752742886543274, + "learning_rate": 0.0003946065651224732, + "loss": 3.0348, + "step": 24356 + }, + { + "epoch": 1.19, + "grad_norm": 0.5721966028213501, + "learning_rate": 0.00039459195441807996, + "loss": 3.1092, + "step": 24357 + }, + { + "epoch": 1.19, + "grad_norm": 0.5311843156814575, + "learning_rate": 0.0003945773434645498, + "loss": 3.1743, + "step": 24358 + }, + { + "epoch": 1.19, + "grad_norm": 0.5497158765792847, + "learning_rate": 0.00039456273226192107, + "loss": 2.7434, + "step": 24359 + }, + { + "epoch": 1.19, + "grad_norm": 0.5891440510749817, + "learning_rate": 0.0003945481208102323, + "loss": 3.17, + "step": 24360 + }, + { + "epoch": 1.19, + "grad_norm": 0.5738333463668823, + "learning_rate": 0.0003945335091095219, + "loss": 3.0605, + "step": 24361 + }, + { + "epoch": 1.19, + "grad_norm": 0.5178869366645813, + "learning_rate": 0.0003945188971598285, + "loss": 3.113, + "step": 24362 + }, + { + "epoch": 1.19, + "grad_norm": 0.5586382746696472, + "learning_rate": 0.00039450428496119055, + "loss": 3.1943, + "step": 24363 + }, + { + "epoch": 1.19, + "grad_norm": 0.5722424983978271, + "learning_rate": 0.0003944896725136464, + "loss": 2.919, + "step": 24364 + }, + { + "epoch": 1.19, + "grad_norm": 0.547747015953064, + "learning_rate": 0.0003944750598172348, + "loss": 3.0477, + "step": 24365 + }, + { + "epoch": 1.19, + "grad_norm": 0.5872486233711243, + "learning_rate": 0.000394460446871994, + "loss": 3.2328, + "step": 24366 + }, + { + "epoch": 1.19, + "grad_norm": 0.5272241234779358, + "learning_rate": 0.0003944458336779626, + "loss": 3.1448, + "step": 24367 + }, + { + "epoch": 1.19, + "grad_norm": 0.5975600481033325, + "learning_rate": 0.00039443122023517894, + "loss": 3.0531, + "step": 24368 + }, + { + "epoch": 1.19, + "grad_norm": 0.5917435884475708, + "learning_rate": 0.0003944166065436817, + "loss": 3.1291, + "step": 24369 + }, + { + "epoch": 1.19, + "grad_norm": 0.5495269894599915, + "learning_rate": 0.00039440199260350927, + "loss": 3.056, + "step": 24370 + }, + { + "epoch": 1.19, + "grad_norm": 0.5593695640563965, + "learning_rate": 0.00039438737841470013, + "loss": 2.9955, + "step": 24371 + }, + { + "epoch": 1.19, + "grad_norm": 0.5663504600524902, + "learning_rate": 0.0003943727639772928, + "loss": 3.0223, + "step": 24372 + }, + { + "epoch": 1.19, + "grad_norm": 0.5404443144798279, + "learning_rate": 0.00039435814929132586, + "loss": 3.2137, + "step": 24373 + }, + { + "epoch": 1.19, + "grad_norm": 0.52837735414505, + "learning_rate": 0.00039434353435683766, + "loss": 3.2715, + "step": 24374 + }, + { + "epoch": 1.19, + "grad_norm": 0.5688294768333435, + "learning_rate": 0.0003943289191738667, + "loss": 3.1562, + "step": 24375 + }, + { + "epoch": 1.19, + "grad_norm": 0.5669829249382019, + "learning_rate": 0.00039431430374245154, + "loss": 3.2254, + "step": 24376 + }, + { + "epoch": 1.19, + "grad_norm": 0.5809857249259949, + "learning_rate": 0.00039429968806263077, + "loss": 3.1399, + "step": 24377 + }, + { + "epoch": 1.19, + "grad_norm": 0.5638022422790527, + "learning_rate": 0.00039428507213444254, + "loss": 3.0601, + "step": 24378 + }, + { + "epoch": 1.19, + "grad_norm": 0.5520922541618347, + "learning_rate": 0.0003942704559579257, + "loss": 3.0555, + "step": 24379 + }, + { + "epoch": 1.19, + "grad_norm": 0.5883176922798157, + "learning_rate": 0.0003942558395331188, + "loss": 3.0359, + "step": 24380 + }, + { + "epoch": 1.19, + "grad_norm": 0.5609521269798279, + "learning_rate": 0.0003942412228600599, + "loss": 3.101, + "step": 24381 + }, + { + "epoch": 1.19, + "grad_norm": 0.5642240047454834, + "learning_rate": 0.0003942266059387879, + "loss": 2.9432, + "step": 24382 + }, + { + "epoch": 1.19, + "grad_norm": 0.5660053491592407, + "learning_rate": 0.000394211988769341, + "loss": 3.0687, + "step": 24383 + }, + { + "epoch": 1.2, + "grad_norm": 0.5571756362915039, + "learning_rate": 0.00039419737135175804, + "loss": 2.9455, + "step": 24384 + }, + { + "epoch": 1.2, + "grad_norm": 0.6327832937240601, + "learning_rate": 0.0003941827536860773, + "loss": 3.3466, + "step": 24385 + }, + { + "epoch": 1.2, + "grad_norm": 0.5416423082351685, + "learning_rate": 0.0003941681357723372, + "loss": 3.1423, + "step": 24386 + }, + { + "epoch": 1.2, + "grad_norm": 0.56650310754776, + "learning_rate": 0.0003941535176105764, + "loss": 3.2203, + "step": 24387 + }, + { + "epoch": 1.2, + "grad_norm": 0.516886830329895, + "learning_rate": 0.00039413889920083335, + "loss": 2.9785, + "step": 24388 + }, + { + "epoch": 1.2, + "grad_norm": 0.5625539422035217, + "learning_rate": 0.0003941242805431466, + "loss": 3.0304, + "step": 24389 + }, + { + "epoch": 1.2, + "grad_norm": 0.5716773867607117, + "learning_rate": 0.0003941096616375546, + "loss": 3.2899, + "step": 24390 + }, + { + "epoch": 1.2, + "grad_norm": 0.580208420753479, + "learning_rate": 0.00039409504248409595, + "loss": 3.1046, + "step": 24391 + }, + { + "epoch": 1.2, + "grad_norm": 0.5867831707000732, + "learning_rate": 0.0003940804230828089, + "loss": 3.0618, + "step": 24392 + }, + { + "epoch": 1.2, + "grad_norm": 0.5907902717590332, + "learning_rate": 0.00039406580343373217, + "loss": 3.3336, + "step": 24393 + }, + { + "epoch": 1.2, + "grad_norm": 0.5453541278839111, + "learning_rate": 0.0003940511835369041, + "loss": 2.8692, + "step": 24394 + }, + { + "epoch": 1.2, + "grad_norm": 0.5520537495613098, + "learning_rate": 0.0003940365633923636, + "loss": 3.0061, + "step": 24395 + }, + { + "epoch": 1.2, + "grad_norm": 0.5857133865356445, + "learning_rate": 0.0003940219430001487, + "loss": 3.0555, + "step": 24396 + }, + { + "epoch": 1.2, + "grad_norm": 0.5415214896202087, + "learning_rate": 0.00039400732236029816, + "loss": 3.0087, + "step": 24397 + }, + { + "epoch": 1.2, + "grad_norm": 0.5039085745811462, + "learning_rate": 0.0003939927014728504, + "loss": 3.37, + "step": 24398 + }, + { + "epoch": 1.2, + "grad_norm": 0.5987560153007507, + "learning_rate": 0.000393978080337844, + "loss": 3.3081, + "step": 24399 + }, + { + "epoch": 1.2, + "grad_norm": 0.5762282013893127, + "learning_rate": 0.0003939634589553173, + "loss": 3.048, + "step": 24400 + }, + { + "epoch": 1.2, + "grad_norm": 0.5720338225364685, + "learning_rate": 0.000393948837325309, + "loss": 3.0585, + "step": 24401 + }, + { + "epoch": 1.2, + "grad_norm": 0.5712810754776001, + "learning_rate": 0.00039393421544785766, + "loss": 3.0319, + "step": 24402 + }, + { + "epoch": 1.2, + "grad_norm": 0.5783328413963318, + "learning_rate": 0.0003939195933230015, + "loss": 3.0169, + "step": 24403 + }, + { + "epoch": 1.2, + "grad_norm": 0.5550515055656433, + "learning_rate": 0.00039390497095077924, + "loss": 3.2278, + "step": 24404 + }, + { + "epoch": 1.2, + "grad_norm": 0.5710436105728149, + "learning_rate": 0.0003938903483312294, + "loss": 3.0629, + "step": 24405 + }, + { + "epoch": 1.2, + "grad_norm": 1.2239809036254883, + "learning_rate": 0.00039387572546439046, + "loss": 3.3184, + "step": 24406 + }, + { + "epoch": 1.2, + "grad_norm": 0.5772742033004761, + "learning_rate": 0.00039386110235030094, + "loss": 3.2588, + "step": 24407 + }, + { + "epoch": 1.2, + "grad_norm": 0.576033353805542, + "learning_rate": 0.00039384647898899934, + "loss": 2.8866, + "step": 24408 + }, + { + "epoch": 1.2, + "grad_norm": 0.5748345851898193, + "learning_rate": 0.0003938318553805241, + "loss": 3.0798, + "step": 24409 + }, + { + "epoch": 1.2, + "grad_norm": 0.5759305953979492, + "learning_rate": 0.00039381723152491385, + "loss": 3.0736, + "step": 24410 + }, + { + "epoch": 1.2, + "grad_norm": 0.5629940032958984, + "learning_rate": 0.0003938026074222071, + "loss": 3.3334, + "step": 24411 + }, + { + "epoch": 1.2, + "grad_norm": 0.5594444274902344, + "learning_rate": 0.00039378798307244234, + "loss": 2.9158, + "step": 24412 + }, + { + "epoch": 1.2, + "grad_norm": 0.5800905823707581, + "learning_rate": 0.000393773358475658, + "loss": 2.9424, + "step": 24413 + }, + { + "epoch": 1.2, + "grad_norm": 0.5819612741470337, + "learning_rate": 0.0003937587336318927, + "loss": 3.0931, + "step": 24414 + }, + { + "epoch": 1.2, + "grad_norm": 0.5524073243141174, + "learning_rate": 0.00039374410854118495, + "loss": 3.1325, + "step": 24415 + }, + { + "epoch": 1.2, + "grad_norm": 0.5730950832366943, + "learning_rate": 0.0003937294832035733, + "loss": 2.9164, + "step": 24416 + }, + { + "epoch": 1.2, + "grad_norm": 0.5506508350372314, + "learning_rate": 0.00039371485761909627, + "loss": 3.0187, + "step": 24417 + }, + { + "epoch": 1.2, + "grad_norm": 0.5478151440620422, + "learning_rate": 0.00039370023178779233, + "loss": 2.788, + "step": 24418 + }, + { + "epoch": 1.2, + "grad_norm": 0.5452743768692017, + "learning_rate": 0.00039368560570969985, + "loss": 2.9242, + "step": 24419 + }, + { + "epoch": 1.2, + "grad_norm": 0.49868834018707275, + "learning_rate": 0.0003936709793848577, + "loss": 2.9301, + "step": 24420 + }, + { + "epoch": 1.2, + "grad_norm": 0.597121000289917, + "learning_rate": 0.00039365635281330417, + "loss": 2.7868, + "step": 24421 + }, + { + "epoch": 1.2, + "grad_norm": 0.5634990334510803, + "learning_rate": 0.00039364172599507777, + "loss": 2.8906, + "step": 24422 + }, + { + "epoch": 1.2, + "grad_norm": 0.5244052410125732, + "learning_rate": 0.0003936270989302171, + "loss": 3.1885, + "step": 24423 + }, + { + "epoch": 1.2, + "grad_norm": 0.5397709012031555, + "learning_rate": 0.0003936124716187607, + "loss": 3.1282, + "step": 24424 + }, + { + "epoch": 1.2, + "grad_norm": 0.5438342690467834, + "learning_rate": 0.00039359784406074706, + "loss": 3.1376, + "step": 24425 + }, + { + "epoch": 1.2, + "grad_norm": 0.5476157665252686, + "learning_rate": 0.00039358321625621473, + "loss": 3.1603, + "step": 24426 + }, + { + "epoch": 1.2, + "grad_norm": 0.5249218940734863, + "learning_rate": 0.00039356858820520217, + "loss": 3.1193, + "step": 24427 + }, + { + "epoch": 1.2, + "grad_norm": 0.548941969871521, + "learning_rate": 0.0003935539599077481, + "loss": 3.2686, + "step": 24428 + }, + { + "epoch": 1.2, + "grad_norm": 0.5758370757102966, + "learning_rate": 0.0003935393313638908, + "loss": 3.1205, + "step": 24429 + }, + { + "epoch": 1.2, + "grad_norm": 0.5937607884407043, + "learning_rate": 0.0003935247025736688, + "loss": 2.9228, + "step": 24430 + }, + { + "epoch": 1.2, + "grad_norm": 0.5786687731742859, + "learning_rate": 0.00039351007353712086, + "loss": 2.9846, + "step": 24431 + }, + { + "epoch": 1.2, + "grad_norm": 0.5394309163093567, + "learning_rate": 0.0003934954442542854, + "loss": 3.2228, + "step": 24432 + }, + { + "epoch": 1.2, + "grad_norm": 0.5585785508155823, + "learning_rate": 0.00039348081472520086, + "loss": 3.1961, + "step": 24433 + }, + { + "epoch": 1.2, + "grad_norm": 0.5906318426132202, + "learning_rate": 0.00039346618494990584, + "loss": 3.2208, + "step": 24434 + }, + { + "epoch": 1.2, + "grad_norm": 0.5899059176445007, + "learning_rate": 0.00039345155492843893, + "loss": 3.0491, + "step": 24435 + }, + { + "epoch": 1.2, + "grad_norm": 0.5428707003593445, + "learning_rate": 0.00039343692466083856, + "loss": 2.9778, + "step": 24436 + }, + { + "epoch": 1.2, + "grad_norm": 0.5404731631278992, + "learning_rate": 0.0003934222941471433, + "loss": 2.9239, + "step": 24437 + }, + { + "epoch": 1.2, + "grad_norm": 0.5213099718093872, + "learning_rate": 0.00039340766338739175, + "loss": 3.2136, + "step": 24438 + }, + { + "epoch": 1.2, + "grad_norm": 0.5822370052337646, + "learning_rate": 0.00039339303238162247, + "loss": 3.0366, + "step": 24439 + }, + { + "epoch": 1.2, + "grad_norm": 0.581151008605957, + "learning_rate": 0.00039337840112987373, + "loss": 3.1906, + "step": 24440 + }, + { + "epoch": 1.2, + "grad_norm": 0.5343998670578003, + "learning_rate": 0.00039336376963218435, + "loss": 3.202, + "step": 24441 + }, + { + "epoch": 1.2, + "grad_norm": 0.5333302617073059, + "learning_rate": 0.00039334913788859283, + "loss": 3.2351, + "step": 24442 + }, + { + "epoch": 1.2, + "grad_norm": 0.5405131578445435, + "learning_rate": 0.00039333450589913754, + "loss": 3.0949, + "step": 24443 + }, + { + "epoch": 1.2, + "grad_norm": 0.5513492822647095, + "learning_rate": 0.0003933198736638572, + "loss": 3.023, + "step": 24444 + }, + { + "epoch": 1.2, + "grad_norm": 0.587897539138794, + "learning_rate": 0.00039330524118279015, + "loss": 3.1852, + "step": 24445 + }, + { + "epoch": 1.2, + "grad_norm": 0.5661784410476685, + "learning_rate": 0.00039329060845597524, + "loss": 2.9803, + "step": 24446 + }, + { + "epoch": 1.2, + "grad_norm": 0.5629352927207947, + "learning_rate": 0.0003932759754834507, + "loss": 3.0325, + "step": 24447 + }, + { + "epoch": 1.2, + "grad_norm": 0.5622941851615906, + "learning_rate": 0.00039326134226525515, + "loss": 2.8962, + "step": 24448 + }, + { + "epoch": 1.2, + "grad_norm": 0.5289006233215332, + "learning_rate": 0.00039324670880142726, + "loss": 3.1545, + "step": 24449 + }, + { + "epoch": 1.2, + "grad_norm": 0.5608207583427429, + "learning_rate": 0.00039323207509200545, + "loss": 3.1841, + "step": 24450 + }, + { + "epoch": 1.2, + "grad_norm": 0.5729897618293762, + "learning_rate": 0.0003932174411370283, + "loss": 3.1255, + "step": 24451 + }, + { + "epoch": 1.2, + "grad_norm": 0.5416507720947266, + "learning_rate": 0.00039320280693653435, + "loss": 3.0346, + "step": 24452 + }, + { + "epoch": 1.2, + "grad_norm": 0.5522789359092712, + "learning_rate": 0.00039318817249056224, + "loss": 3.1717, + "step": 24453 + }, + { + "epoch": 1.2, + "grad_norm": 0.6044777631759644, + "learning_rate": 0.00039317353779915034, + "loss": 3.0349, + "step": 24454 + }, + { + "epoch": 1.2, + "grad_norm": 0.5705912709236145, + "learning_rate": 0.0003931589028623371, + "loss": 3.0584, + "step": 24455 + }, + { + "epoch": 1.2, + "grad_norm": 0.5330730676651001, + "learning_rate": 0.0003931442676801614, + "loss": 3.2936, + "step": 24456 + }, + { + "epoch": 1.2, + "grad_norm": 0.526887059211731, + "learning_rate": 0.0003931296322526617, + "loss": 2.9784, + "step": 24457 + }, + { + "epoch": 1.2, + "grad_norm": 0.5354217886924744, + "learning_rate": 0.0003931149965798764, + "loss": 3.1643, + "step": 24458 + }, + { + "epoch": 1.2, + "grad_norm": 0.5469911098480225, + "learning_rate": 0.0003931003606618441, + "loss": 3.0493, + "step": 24459 + }, + { + "epoch": 1.2, + "grad_norm": 0.5541418790817261, + "learning_rate": 0.00039308572449860336, + "loss": 2.8854, + "step": 24460 + }, + { + "epoch": 1.2, + "grad_norm": 0.568221390247345, + "learning_rate": 0.0003930710880901928, + "loss": 2.958, + "step": 24461 + }, + { + "epoch": 1.2, + "grad_norm": 0.5707933902740479, + "learning_rate": 0.0003930564514366509, + "loss": 3.1631, + "step": 24462 + }, + { + "epoch": 1.2, + "grad_norm": 0.5753080248832703, + "learning_rate": 0.00039304181453801614, + "loss": 3.1559, + "step": 24463 + }, + { + "epoch": 1.2, + "grad_norm": 0.5365299582481384, + "learning_rate": 0.0003930271773943272, + "loss": 3.1203, + "step": 24464 + }, + { + "epoch": 1.2, + "grad_norm": 0.5476408004760742, + "learning_rate": 0.00039301254000562256, + "loss": 3.0281, + "step": 24465 + }, + { + "epoch": 1.2, + "grad_norm": 0.5769860744476318, + "learning_rate": 0.0003929979023719408, + "loss": 3.121, + "step": 24466 + }, + { + "epoch": 1.2, + "grad_norm": 0.6829390525817871, + "learning_rate": 0.00039298326449332044, + "loss": 2.9079, + "step": 24467 + }, + { + "epoch": 1.2, + "grad_norm": 0.5319774746894836, + "learning_rate": 0.00039296862636980015, + "loss": 2.9785, + "step": 24468 + }, + { + "epoch": 1.2, + "grad_norm": 0.5541231632232666, + "learning_rate": 0.0003929539880014183, + "loss": 3.0751, + "step": 24469 + }, + { + "epoch": 1.2, + "grad_norm": 0.5718669295310974, + "learning_rate": 0.00039293934938821354, + "loss": 2.9471, + "step": 24470 + }, + { + "epoch": 1.2, + "grad_norm": 0.5447272658348083, + "learning_rate": 0.0003929247105302244, + "loss": 2.8957, + "step": 24471 + }, + { + "epoch": 1.2, + "grad_norm": 0.5534474849700928, + "learning_rate": 0.0003929100714274895, + "loss": 3.0906, + "step": 24472 + }, + { + "epoch": 1.2, + "grad_norm": 0.9372397661209106, + "learning_rate": 0.00039289543208004734, + "loss": 3.2436, + "step": 24473 + }, + { + "epoch": 1.2, + "grad_norm": 0.5349345207214355, + "learning_rate": 0.00039288079248793646, + "loss": 3.0896, + "step": 24474 + }, + { + "epoch": 1.2, + "grad_norm": 0.5702376365661621, + "learning_rate": 0.0003928661526511955, + "loss": 2.9394, + "step": 24475 + }, + { + "epoch": 1.2, + "grad_norm": 0.5644256472587585, + "learning_rate": 0.00039285151256986296, + "loss": 3.2093, + "step": 24476 + }, + { + "epoch": 1.2, + "grad_norm": 0.5325481295585632, + "learning_rate": 0.00039283687224397744, + "loss": 2.844, + "step": 24477 + }, + { + "epoch": 1.2, + "grad_norm": 0.5378096103668213, + "learning_rate": 0.0003928222316735773, + "loss": 3.0989, + "step": 24478 + }, + { + "epoch": 1.2, + "grad_norm": 0.5697956085205078, + "learning_rate": 0.0003928075908587015, + "loss": 3.1788, + "step": 24479 + }, + { + "epoch": 1.2, + "grad_norm": 0.5447378754615784, + "learning_rate": 0.0003927929497993882, + "loss": 3.0596, + "step": 24480 + }, + { + "epoch": 1.2, + "grad_norm": 0.6714164614677429, + "learning_rate": 0.00039277830849567615, + "loss": 2.8876, + "step": 24481 + }, + { + "epoch": 1.2, + "grad_norm": 0.5856902003288269, + "learning_rate": 0.000392763666947604, + "loss": 2.8871, + "step": 24482 + }, + { + "epoch": 1.2, + "grad_norm": 0.58744215965271, + "learning_rate": 0.0003927490251552101, + "loss": 3.1451, + "step": 24483 + }, + { + "epoch": 1.2, + "grad_norm": 0.563101053237915, + "learning_rate": 0.00039273438311853315, + "loss": 3.0435, + "step": 24484 + }, + { + "epoch": 1.2, + "grad_norm": 0.5481135845184326, + "learning_rate": 0.00039271974083761167, + "loss": 3.0859, + "step": 24485 + }, + { + "epoch": 1.2, + "grad_norm": 0.5843150019645691, + "learning_rate": 0.0003927050983124842, + "loss": 3.0686, + "step": 24486 + }, + { + "epoch": 1.2, + "grad_norm": 0.5351256728172302, + "learning_rate": 0.0003926904555431894, + "loss": 3.0086, + "step": 24487 + }, + { + "epoch": 1.2, + "grad_norm": 0.6021181344985962, + "learning_rate": 0.00039267581252976574, + "loss": 2.9542, + "step": 24488 + }, + { + "epoch": 1.2, + "grad_norm": 0.5292304754257202, + "learning_rate": 0.00039266116927225186, + "loss": 3.0605, + "step": 24489 + }, + { + "epoch": 1.2, + "grad_norm": 0.5263028144836426, + "learning_rate": 0.00039264652577068634, + "loss": 2.9425, + "step": 24490 + }, + { + "epoch": 1.2, + "grad_norm": 0.5306416749954224, + "learning_rate": 0.0003926318820251076, + "loss": 3.0063, + "step": 24491 + }, + { + "epoch": 1.2, + "grad_norm": 0.5751937031745911, + "learning_rate": 0.00039261723803555427, + "loss": 3.029, + "step": 24492 + }, + { + "epoch": 1.2, + "grad_norm": 0.5904483795166016, + "learning_rate": 0.00039260259380206517, + "loss": 3.3765, + "step": 24493 + }, + { + "epoch": 1.2, + "grad_norm": 0.5914291739463806, + "learning_rate": 0.00039258794932467845, + "loss": 2.9694, + "step": 24494 + }, + { + "epoch": 1.2, + "grad_norm": 0.5873374938964844, + "learning_rate": 0.000392573304603433, + "loss": 3.0701, + "step": 24495 + }, + { + "epoch": 1.2, + "grad_norm": 0.6778043508529663, + "learning_rate": 0.00039255865963836714, + "loss": 3.0645, + "step": 24496 + }, + { + "epoch": 1.2, + "grad_norm": 0.5494279265403748, + "learning_rate": 0.0003925440144295198, + "loss": 3.1383, + "step": 24497 + }, + { + "epoch": 1.2, + "grad_norm": 0.5600928068161011, + "learning_rate": 0.0003925293689769292, + "loss": 3.2732, + "step": 24498 + }, + { + "epoch": 1.2, + "grad_norm": 0.5523821711540222, + "learning_rate": 0.0003925147232806341, + "loss": 2.9552, + "step": 24499 + }, + { + "epoch": 1.2, + "grad_norm": 0.5339621305465698, + "learning_rate": 0.000392500077340673, + "loss": 3.0629, + "step": 24500 + }, + { + "epoch": 1.2, + "grad_norm": 0.5230332016944885, + "learning_rate": 0.0003924854311570845, + "loss": 3.3095, + "step": 24501 + }, + { + "epoch": 1.2, + "grad_norm": 0.5600953698158264, + "learning_rate": 0.00039247078472990716, + "loss": 3.1273, + "step": 24502 + }, + { + "epoch": 1.2, + "grad_norm": 0.5858113765716553, + "learning_rate": 0.00039245613805917955, + "loss": 2.9782, + "step": 24503 + }, + { + "epoch": 1.2, + "grad_norm": 0.5632593631744385, + "learning_rate": 0.00039244149114494036, + "loss": 3.1121, + "step": 24504 + }, + { + "epoch": 1.2, + "grad_norm": 0.5598071813583374, + "learning_rate": 0.00039242684398722806, + "loss": 3.083, + "step": 24505 + }, + { + "epoch": 1.2, + "grad_norm": 0.598629891872406, + "learning_rate": 0.00039241219658608115, + "loss": 3.0239, + "step": 24506 + }, + { + "epoch": 1.2, + "grad_norm": 0.6259543895721436, + "learning_rate": 0.00039239754894153833, + "loss": 3.2252, + "step": 24507 + }, + { + "epoch": 1.2, + "grad_norm": 0.5557901263237, + "learning_rate": 0.00039238290105363815, + "loss": 3.1079, + "step": 24508 + }, + { + "epoch": 1.2, + "grad_norm": 0.5629055500030518, + "learning_rate": 0.00039236825292241926, + "loss": 3.1228, + "step": 24509 + }, + { + "epoch": 1.2, + "grad_norm": 0.578502357006073, + "learning_rate": 0.00039235360454792015, + "loss": 3.042, + "step": 24510 + }, + { + "epoch": 1.2, + "grad_norm": 0.591256320476532, + "learning_rate": 0.0003923389559301793, + "loss": 3.0652, + "step": 24511 + }, + { + "epoch": 1.2, + "grad_norm": 0.566704273223877, + "learning_rate": 0.00039232430706923554, + "loss": 3.0541, + "step": 24512 + }, + { + "epoch": 1.2, + "grad_norm": 0.5784484148025513, + "learning_rate": 0.00039230965796512723, + "loss": 2.751, + "step": 24513 + }, + { + "epoch": 1.2, + "grad_norm": 0.5382969975471497, + "learning_rate": 0.0003922950086178931, + "loss": 2.9646, + "step": 24514 + }, + { + "epoch": 1.2, + "grad_norm": 0.5786687731742859, + "learning_rate": 0.00039228035902757173, + "loss": 3.1632, + "step": 24515 + }, + { + "epoch": 1.2, + "grad_norm": 0.6045399308204651, + "learning_rate": 0.0003922657091942016, + "loss": 3.0142, + "step": 24516 + }, + { + "epoch": 1.2, + "grad_norm": 0.5659675598144531, + "learning_rate": 0.00039225105911782124, + "loss": 3.183, + "step": 24517 + }, + { + "epoch": 1.2, + "grad_norm": 0.5671629905700684, + "learning_rate": 0.00039223640879846943, + "loss": 3.1611, + "step": 24518 + }, + { + "epoch": 1.2, + "grad_norm": 0.5662316083908081, + "learning_rate": 0.0003922217582361848, + "loss": 3.2395, + "step": 24519 + }, + { + "epoch": 1.2, + "grad_norm": 0.5527083873748779, + "learning_rate": 0.00039220710743100565, + "loss": 3.312, + "step": 24520 + }, + { + "epoch": 1.2, + "grad_norm": 0.553294837474823, + "learning_rate": 0.0003921924563829708, + "loss": 3.0211, + "step": 24521 + }, + { + "epoch": 1.2, + "grad_norm": 0.658713161945343, + "learning_rate": 0.0003921778050921187, + "loss": 3.2227, + "step": 24522 + }, + { + "epoch": 1.2, + "grad_norm": 0.6088311076164246, + "learning_rate": 0.00039216315355848803, + "loss": 3.2464, + "step": 24523 + }, + { + "epoch": 1.2, + "grad_norm": 0.6018931269645691, + "learning_rate": 0.0003921485017821173, + "loss": 3.1967, + "step": 24524 + }, + { + "epoch": 1.2, + "grad_norm": 0.5140994787216187, + "learning_rate": 0.00039213384976304527, + "loss": 3.0007, + "step": 24525 + }, + { + "epoch": 1.2, + "grad_norm": 0.5533731579780579, + "learning_rate": 0.0003921191975013103, + "loss": 2.992, + "step": 24526 + }, + { + "epoch": 1.2, + "grad_norm": 0.5808982849121094, + "learning_rate": 0.00039210454499695116, + "loss": 3.2767, + "step": 24527 + }, + { + "epoch": 1.2, + "grad_norm": 0.5596871972084045, + "learning_rate": 0.00039208989225000633, + "loss": 3.1751, + "step": 24528 + }, + { + "epoch": 1.2, + "grad_norm": 0.542251467704773, + "learning_rate": 0.00039207523926051453, + "loss": 3.0595, + "step": 24529 + }, + { + "epoch": 1.2, + "grad_norm": 0.6116825342178345, + "learning_rate": 0.0003920605860285142, + "loss": 3.1456, + "step": 24530 + }, + { + "epoch": 1.2, + "grad_norm": 0.5442792177200317, + "learning_rate": 0.000392045932554044, + "loss": 3.2253, + "step": 24531 + }, + { + "epoch": 1.2, + "grad_norm": 0.5724294781684875, + "learning_rate": 0.0003920312788371425, + "loss": 3.3303, + "step": 24532 + }, + { + "epoch": 1.2, + "grad_norm": 0.5326899290084839, + "learning_rate": 0.00039201662487784844, + "loss": 2.9463, + "step": 24533 + }, + { + "epoch": 1.2, + "grad_norm": 0.5549022555351257, + "learning_rate": 0.0003920019706762002, + "loss": 3.0103, + "step": 24534 + }, + { + "epoch": 1.2, + "grad_norm": 0.533281147480011, + "learning_rate": 0.0003919873162322365, + "loss": 3.0965, + "step": 24535 + }, + { + "epoch": 1.2, + "grad_norm": 0.5557292699813843, + "learning_rate": 0.0003919726615459959, + "loss": 3.0286, + "step": 24536 + }, + { + "epoch": 1.2, + "grad_norm": 0.558367133140564, + "learning_rate": 0.000391958006617517, + "loss": 3.2902, + "step": 24537 + }, + { + "epoch": 1.2, + "grad_norm": 0.5451139211654663, + "learning_rate": 0.00039194335144683844, + "loss": 3.2517, + "step": 24538 + }, + { + "epoch": 1.2, + "grad_norm": 0.5590549111366272, + "learning_rate": 0.00039192869603399877, + "loss": 2.977, + "step": 24539 + }, + { + "epoch": 1.2, + "grad_norm": 0.5913295745849609, + "learning_rate": 0.00039191404037903664, + "loss": 2.984, + "step": 24540 + }, + { + "epoch": 1.2, + "grad_norm": 0.5299440622329712, + "learning_rate": 0.0003918993844819906, + "loss": 3.129, + "step": 24541 + }, + { + "epoch": 1.2, + "grad_norm": 0.5578473210334778, + "learning_rate": 0.0003918847283428992, + "loss": 2.9886, + "step": 24542 + }, + { + "epoch": 1.2, + "grad_norm": 0.5618113875389099, + "learning_rate": 0.0003918700719618012, + "loss": 3.1438, + "step": 24543 + }, + { + "epoch": 1.2, + "grad_norm": 0.5523414015769958, + "learning_rate": 0.0003918554153387351, + "loss": 3.1398, + "step": 24544 + }, + { + "epoch": 1.2, + "grad_norm": 0.5433165431022644, + "learning_rate": 0.0003918407584737395, + "loss": 3.0228, + "step": 24545 + }, + { + "epoch": 1.2, + "grad_norm": 0.5516573786735535, + "learning_rate": 0.0003918261013668531, + "loss": 3.2176, + "step": 24546 + }, + { + "epoch": 1.2, + "grad_norm": 0.548460841178894, + "learning_rate": 0.00039181144401811426, + "loss": 3.0904, + "step": 24547 + }, + { + "epoch": 1.2, + "grad_norm": 0.57688307762146, + "learning_rate": 0.0003917967864275618, + "loss": 2.8368, + "step": 24548 + }, + { + "epoch": 1.2, + "grad_norm": 0.5394658446311951, + "learning_rate": 0.0003917821285952343, + "loss": 3.1838, + "step": 24549 + }, + { + "epoch": 1.2, + "grad_norm": 0.5557076334953308, + "learning_rate": 0.0003917674705211703, + "loss": 3.0597, + "step": 24550 + }, + { + "epoch": 1.2, + "grad_norm": 0.5342375636100769, + "learning_rate": 0.00039175281220540844, + "loss": 2.9482, + "step": 24551 + }, + { + "epoch": 1.2, + "grad_norm": 0.5965703725814819, + "learning_rate": 0.00039173815364798744, + "loss": 2.9069, + "step": 24552 + }, + { + "epoch": 1.2, + "grad_norm": 0.5493952631950378, + "learning_rate": 0.0003917234948489457, + "loss": 3.0136, + "step": 24553 + }, + { + "epoch": 1.2, + "grad_norm": 0.563545286655426, + "learning_rate": 0.0003917088358083219, + "loss": 2.9065, + "step": 24554 + }, + { + "epoch": 1.2, + "grad_norm": 0.575928807258606, + "learning_rate": 0.0003916941765261548, + "loss": 3.1358, + "step": 24555 + }, + { + "epoch": 1.2, + "grad_norm": 0.5435425043106079, + "learning_rate": 0.0003916795170024828, + "loss": 2.9169, + "step": 24556 + }, + { + "epoch": 1.2, + "grad_norm": 0.5062286853790283, + "learning_rate": 0.0003916648572373446, + "loss": 2.9257, + "step": 24557 + }, + { + "epoch": 1.2, + "grad_norm": 0.5492591857910156, + "learning_rate": 0.0003916501972307787, + "loss": 3.0671, + "step": 24558 + }, + { + "epoch": 1.2, + "grad_norm": 0.5551416873931885, + "learning_rate": 0.000391635536982824, + "loss": 3.117, + "step": 24559 + }, + { + "epoch": 1.2, + "grad_norm": 0.5714171528816223, + "learning_rate": 0.0003916208764935189, + "loss": 3.0568, + "step": 24560 + }, + { + "epoch": 1.2, + "grad_norm": 0.5315268635749817, + "learning_rate": 0.000391606215762902, + "loss": 3.2503, + "step": 24561 + }, + { + "epoch": 1.2, + "grad_norm": 0.5530765056610107, + "learning_rate": 0.00039159155479101196, + "loss": 3.0153, + "step": 24562 + }, + { + "epoch": 1.2, + "grad_norm": 0.5797913074493408, + "learning_rate": 0.0003915768935778874, + "loss": 3.2337, + "step": 24563 + }, + { + "epoch": 1.2, + "grad_norm": 0.5785074830055237, + "learning_rate": 0.0003915622321235669, + "loss": 3.169, + "step": 24564 + }, + { + "epoch": 1.2, + "grad_norm": 0.5695971846580505, + "learning_rate": 0.0003915475704280891, + "loss": 3.0674, + "step": 24565 + }, + { + "epoch": 1.2, + "grad_norm": 0.5791469216346741, + "learning_rate": 0.00039153290849149275, + "loss": 3.1011, + "step": 24566 + }, + { + "epoch": 1.2, + "grad_norm": 0.5918307304382324, + "learning_rate": 0.0003915182463138161, + "loss": 3.1235, + "step": 24567 + }, + { + "epoch": 1.2, + "grad_norm": 0.5434072613716125, + "learning_rate": 0.00039150358389509815, + "loss": 2.9738, + "step": 24568 + }, + { + "epoch": 1.2, + "grad_norm": 0.6053929328918457, + "learning_rate": 0.0003914889212353773, + "loss": 3.0853, + "step": 24569 + }, + { + "epoch": 1.2, + "grad_norm": 0.5387932062149048, + "learning_rate": 0.0003914742583346924, + "loss": 3.0236, + "step": 24570 + }, + { + "epoch": 1.2, + "grad_norm": 0.572083592414856, + "learning_rate": 0.00039145959519308176, + "loss": 2.891, + "step": 24571 + }, + { + "epoch": 1.2, + "grad_norm": 0.5593575835227966, + "learning_rate": 0.0003914449318105842, + "loss": 3.3136, + "step": 24572 + }, + { + "epoch": 1.2, + "grad_norm": 0.5914036631584167, + "learning_rate": 0.0003914302681872382, + "loss": 3.0979, + "step": 24573 + }, + { + "epoch": 1.2, + "grad_norm": 0.5585025548934937, + "learning_rate": 0.0003914156043230826, + "loss": 3.1267, + "step": 24574 + }, + { + "epoch": 1.2, + "grad_norm": 0.5593438148498535, + "learning_rate": 0.0003914009402181558, + "loss": 2.9214, + "step": 24575 + }, + { + "epoch": 1.2, + "grad_norm": 0.5880452394485474, + "learning_rate": 0.0003913862758724966, + "loss": 3.0177, + "step": 24576 + }, + { + "epoch": 1.2, + "grad_norm": 0.5511958003044128, + "learning_rate": 0.00039137161128614345, + "loss": 3.0525, + "step": 24577 + }, + { + "epoch": 1.2, + "grad_norm": 0.5612953305244446, + "learning_rate": 0.00039135694645913506, + "loss": 3.189, + "step": 24578 + }, + { + "epoch": 1.2, + "grad_norm": 0.5851930379867554, + "learning_rate": 0.0003913422813915101, + "loss": 2.9018, + "step": 24579 + }, + { + "epoch": 1.2, + "grad_norm": 0.5985074043273926, + "learning_rate": 0.00039132761608330716, + "loss": 3.0104, + "step": 24580 + }, + { + "epoch": 1.2, + "grad_norm": 0.545453667640686, + "learning_rate": 0.0003913129505345649, + "loss": 3.1573, + "step": 24581 + }, + { + "epoch": 1.2, + "grad_norm": 0.5423109531402588, + "learning_rate": 0.0003912982847453218, + "loss": 2.9684, + "step": 24582 + }, + { + "epoch": 1.2, + "grad_norm": 0.5626406073570251, + "learning_rate": 0.0003912836187156166, + "loss": 3.0915, + "step": 24583 + }, + { + "epoch": 1.2, + "grad_norm": 0.5251243710517883, + "learning_rate": 0.000391268952445488, + "loss": 2.8087, + "step": 24584 + }, + { + "epoch": 1.2, + "grad_norm": 0.5345422625541687, + "learning_rate": 0.0003912542859349745, + "loss": 3.0146, + "step": 24585 + }, + { + "epoch": 1.2, + "grad_norm": 0.563607394695282, + "learning_rate": 0.0003912396191841148, + "loss": 3.1094, + "step": 24586 + }, + { + "epoch": 1.2, + "grad_norm": 0.5774620175361633, + "learning_rate": 0.00039122495219294734, + "loss": 3.1227, + "step": 24587 + }, + { + "epoch": 1.2, + "grad_norm": 0.6058965921401978, + "learning_rate": 0.0003912102849615111, + "loss": 3.0073, + "step": 24588 + }, + { + "epoch": 1.21, + "grad_norm": 0.5583047866821289, + "learning_rate": 0.00039119561748984446, + "loss": 3.2368, + "step": 24589 + }, + { + "epoch": 1.21, + "grad_norm": 0.5507198572158813, + "learning_rate": 0.00039118094977798605, + "loss": 3.062, + "step": 24590 + }, + { + "epoch": 1.21, + "grad_norm": 0.5293483734130859, + "learning_rate": 0.00039116628182597464, + "loss": 3.1221, + "step": 24591 + }, + { + "epoch": 1.21, + "grad_norm": 0.5906999111175537, + "learning_rate": 0.0003911516136338489, + "loss": 3.1421, + "step": 24592 + }, + { + "epoch": 1.21, + "grad_norm": 0.5505334734916687, + "learning_rate": 0.0003911369452016472, + "loss": 3.2336, + "step": 24593 + }, + { + "epoch": 1.21, + "grad_norm": 0.5691468119621277, + "learning_rate": 0.00039112227652940825, + "loss": 2.9671, + "step": 24594 + }, + { + "epoch": 1.21, + "grad_norm": 0.6026682257652283, + "learning_rate": 0.00039110760761717095, + "loss": 3.0389, + "step": 24595 + }, + { + "epoch": 1.21, + "grad_norm": 0.5420871376991272, + "learning_rate": 0.00039109293846497365, + "loss": 3.0556, + "step": 24596 + }, + { + "epoch": 1.21, + "grad_norm": 0.5781927704811096, + "learning_rate": 0.0003910782690728551, + "loss": 3.016, + "step": 24597 + }, + { + "epoch": 1.21, + "grad_norm": 0.5528817772865295, + "learning_rate": 0.0003910635994408539, + "loss": 3.1946, + "step": 24598 + }, + { + "epoch": 1.21, + "grad_norm": 0.5814532041549683, + "learning_rate": 0.0003910489295690087, + "loss": 3.0579, + "step": 24599 + }, + { + "epoch": 1.21, + "grad_norm": 0.6129553914070129, + "learning_rate": 0.00039103425945735817, + "loss": 3.1454, + "step": 24600 + }, + { + "epoch": 1.21, + "grad_norm": 0.5644974708557129, + "learning_rate": 0.0003910195891059409, + "loss": 2.8995, + "step": 24601 + }, + { + "epoch": 1.21, + "grad_norm": 0.5495239496231079, + "learning_rate": 0.0003910049185147955, + "loss": 3.2219, + "step": 24602 + }, + { + "epoch": 1.21, + "grad_norm": 0.5390620231628418, + "learning_rate": 0.0003909902476839608, + "loss": 3.3411, + "step": 24603 + }, + { + "epoch": 1.21, + "grad_norm": 0.5480315685272217, + "learning_rate": 0.0003909755766134751, + "loss": 3.2259, + "step": 24604 + }, + { + "epoch": 1.21, + "grad_norm": 0.5507662892341614, + "learning_rate": 0.0003909609053033774, + "loss": 3.0374, + "step": 24605 + }, + { + "epoch": 1.21, + "grad_norm": 0.5539723038673401, + "learning_rate": 0.00039094623375370626, + "loss": 3.1604, + "step": 24606 + }, + { + "epoch": 1.21, + "grad_norm": 0.5907084345817566, + "learning_rate": 0.00039093156196450007, + "loss": 3.0563, + "step": 24607 + }, + { + "epoch": 1.21, + "grad_norm": 0.569409191608429, + "learning_rate": 0.0003909168899357977, + "loss": 3.0649, + "step": 24608 + }, + { + "epoch": 1.21, + "grad_norm": 0.529839277267456, + "learning_rate": 0.00039090221766763765, + "loss": 3.0733, + "step": 24609 + }, + { + "epoch": 1.21, + "grad_norm": 0.5536385774612427, + "learning_rate": 0.00039088754516005883, + "loss": 3.1285, + "step": 24610 + }, + { + "epoch": 1.21, + "grad_norm": 0.5590646266937256, + "learning_rate": 0.0003908728724130996, + "loss": 2.8481, + "step": 24611 + }, + { + "epoch": 1.21, + "grad_norm": 0.5736700296401978, + "learning_rate": 0.00039085819942679876, + "loss": 3.0309, + "step": 24612 + }, + { + "epoch": 1.21, + "grad_norm": 0.5361476540565491, + "learning_rate": 0.00039084352620119485, + "loss": 3.094, + "step": 24613 + }, + { + "epoch": 1.21, + "grad_norm": 0.5522602796554565, + "learning_rate": 0.0003908288527363266, + "loss": 3.1455, + "step": 24614 + }, + { + "epoch": 1.21, + "grad_norm": 0.5784001350402832, + "learning_rate": 0.00039081417903223263, + "loss": 3.0122, + "step": 24615 + }, + { + "epoch": 1.21, + "grad_norm": 0.5663528442382812, + "learning_rate": 0.00039079950508895156, + "loss": 2.9354, + "step": 24616 + }, + { + "epoch": 1.21, + "grad_norm": 0.5776238441467285, + "learning_rate": 0.00039078483090652217, + "loss": 3.0843, + "step": 24617 + }, + { + "epoch": 1.21, + "grad_norm": 0.5722895860671997, + "learning_rate": 0.00039077015648498293, + "loss": 3.0982, + "step": 24618 + }, + { + "epoch": 1.21, + "grad_norm": 0.5736603140830994, + "learning_rate": 0.0003907554818243725, + "loss": 2.9995, + "step": 24619 + }, + { + "epoch": 1.21, + "grad_norm": 0.5491389632225037, + "learning_rate": 0.00039074080692472966, + "loss": 2.9911, + "step": 24620 + }, + { + "epoch": 1.21, + "grad_norm": 0.5545860528945923, + "learning_rate": 0.0003907261317860931, + "loss": 3.074, + "step": 24621 + }, + { + "epoch": 1.21, + "grad_norm": 0.6800806522369385, + "learning_rate": 0.0003907114564085013, + "loss": 3.2136, + "step": 24622 + }, + { + "epoch": 1.21, + "grad_norm": 0.5676558017730713, + "learning_rate": 0.0003906967807919929, + "loss": 3.1701, + "step": 24623 + }, + { + "epoch": 1.21, + "grad_norm": 0.540444552898407, + "learning_rate": 0.0003906821049366067, + "loss": 3.037, + "step": 24624 + }, + { + "epoch": 1.21, + "grad_norm": 0.583018958568573, + "learning_rate": 0.00039066742884238134, + "loss": 3.1316, + "step": 24625 + }, + { + "epoch": 1.21, + "grad_norm": 0.5733615159988403, + "learning_rate": 0.00039065275250935535, + "loss": 2.998, + "step": 24626 + }, + { + "epoch": 1.21, + "grad_norm": 0.5599751472473145, + "learning_rate": 0.00039063807593756744, + "loss": 3.0973, + "step": 24627 + }, + { + "epoch": 1.21, + "grad_norm": 0.5724146962165833, + "learning_rate": 0.0003906233991270563, + "loss": 2.935, + "step": 24628 + }, + { + "epoch": 1.21, + "grad_norm": 0.5427190065383911, + "learning_rate": 0.0003906087220778607, + "loss": 3.0934, + "step": 24629 + }, + { + "epoch": 1.21, + "grad_norm": 0.5561226010322571, + "learning_rate": 0.000390594044790019, + "loss": 3.2373, + "step": 24630 + }, + { + "epoch": 1.21, + "grad_norm": 0.5783647894859314, + "learning_rate": 0.0003905793672635701, + "loss": 3.0405, + "step": 24631 + }, + { + "epoch": 1.21, + "grad_norm": 0.5754805207252502, + "learning_rate": 0.00039056468949855253, + "loss": 3.2403, + "step": 24632 + }, + { + "epoch": 1.21, + "grad_norm": 0.558912992477417, + "learning_rate": 0.000390550011495005, + "loss": 3.0081, + "step": 24633 + }, + { + "epoch": 1.21, + "grad_norm": 0.5601596236228943, + "learning_rate": 0.0003905353332529662, + "loss": 3.0755, + "step": 24634 + }, + { + "epoch": 1.21, + "grad_norm": 0.5683655142784119, + "learning_rate": 0.0003905206547724748, + "loss": 3.0046, + "step": 24635 + }, + { + "epoch": 1.21, + "grad_norm": 0.556339681148529, + "learning_rate": 0.00039050597605356927, + "loss": 3.1934, + "step": 24636 + }, + { + "epoch": 1.21, + "grad_norm": 0.5865704417228699, + "learning_rate": 0.0003904912970962885, + "loss": 2.9044, + "step": 24637 + }, + { + "epoch": 1.21, + "grad_norm": 0.5483426451683044, + "learning_rate": 0.00039047661790067107, + "loss": 2.9773, + "step": 24638 + }, + { + "epoch": 1.21, + "grad_norm": 0.5963693857192993, + "learning_rate": 0.00039046193846675563, + "loss": 3.1899, + "step": 24639 + }, + { + "epoch": 1.21, + "grad_norm": 0.5514311790466309, + "learning_rate": 0.00039044725879458086, + "loss": 3.0625, + "step": 24640 + }, + { + "epoch": 1.21, + "grad_norm": 0.5385047793388367, + "learning_rate": 0.0003904325788841854, + "loss": 3.1098, + "step": 24641 + }, + { + "epoch": 1.21, + "grad_norm": 0.5517145991325378, + "learning_rate": 0.00039041789873560796, + "loss": 2.91, + "step": 24642 + }, + { + "epoch": 1.21, + "grad_norm": 0.5656097531318665, + "learning_rate": 0.00039040321834888725, + "loss": 3.1927, + "step": 24643 + }, + { + "epoch": 1.21, + "grad_norm": 0.5733511447906494, + "learning_rate": 0.00039038853772406176, + "loss": 3.0975, + "step": 24644 + }, + { + "epoch": 1.21, + "grad_norm": 0.5602204203605652, + "learning_rate": 0.00039037385686117024, + "loss": 3.1281, + "step": 24645 + }, + { + "epoch": 1.21, + "grad_norm": 0.551032304763794, + "learning_rate": 0.0003903591757602514, + "loss": 3.3527, + "step": 24646 + }, + { + "epoch": 1.21, + "grad_norm": 0.5796739459037781, + "learning_rate": 0.0003903444944213439, + "loss": 2.9157, + "step": 24647 + }, + { + "epoch": 1.21, + "grad_norm": 0.5496836304664612, + "learning_rate": 0.0003903298128444864, + "loss": 3.0158, + "step": 24648 + }, + { + "epoch": 1.21, + "grad_norm": 0.5698907375335693, + "learning_rate": 0.0003903151310297176, + "loss": 3.2225, + "step": 24649 + }, + { + "epoch": 1.21, + "grad_norm": 0.5394782423973083, + "learning_rate": 0.000390300448977076, + "loss": 3.0648, + "step": 24650 + }, + { + "epoch": 1.21, + "grad_norm": 0.5592007040977478, + "learning_rate": 0.00039028576668660047, + "loss": 3.0059, + "step": 24651 + }, + { + "epoch": 1.21, + "grad_norm": 0.5389769077301025, + "learning_rate": 0.0003902710841583296, + "loss": 3.0591, + "step": 24652 + }, + { + "epoch": 1.21, + "grad_norm": 0.5288125872612, + "learning_rate": 0.00039025640139230206, + "loss": 3.061, + "step": 24653 + }, + { + "epoch": 1.21, + "grad_norm": 0.5310102105140686, + "learning_rate": 0.0003902417183885566, + "loss": 3.0722, + "step": 24654 + }, + { + "epoch": 1.21, + "grad_norm": 0.5603654980659485, + "learning_rate": 0.00039022703514713167, + "loss": 2.9501, + "step": 24655 + }, + { + "epoch": 1.21, + "grad_norm": 0.6040814518928528, + "learning_rate": 0.00039021235166806617, + "loss": 2.9324, + "step": 24656 + }, + { + "epoch": 1.21, + "grad_norm": 0.5468594431877136, + "learning_rate": 0.00039019766795139873, + "loss": 3.1048, + "step": 24657 + }, + { + "epoch": 1.21, + "grad_norm": 0.5571081042289734, + "learning_rate": 0.00039018298399716797, + "loss": 3.1354, + "step": 24658 + }, + { + "epoch": 1.21, + "grad_norm": 0.5484535694122314, + "learning_rate": 0.0003901682998054126, + "loss": 3.2044, + "step": 24659 + }, + { + "epoch": 1.21, + "grad_norm": 0.5328761339187622, + "learning_rate": 0.0003901536153761712, + "loss": 2.9363, + "step": 24660 + }, + { + "epoch": 1.21, + "grad_norm": 0.5752051472663879, + "learning_rate": 0.0003901389307094827, + "loss": 3.0491, + "step": 24661 + }, + { + "epoch": 1.21, + "grad_norm": 0.7036685943603516, + "learning_rate": 0.0003901242458053854, + "loss": 2.9426, + "step": 24662 + }, + { + "epoch": 1.21, + "grad_norm": 0.5637723803520203, + "learning_rate": 0.0003901095606639183, + "loss": 2.9939, + "step": 24663 + }, + { + "epoch": 1.21, + "grad_norm": 0.5546841025352478, + "learning_rate": 0.00039009487528512, + "loss": 2.8504, + "step": 24664 + }, + { + "epoch": 1.21, + "grad_norm": 0.5602949857711792, + "learning_rate": 0.00039008018966902913, + "loss": 3.3448, + "step": 24665 + }, + { + "epoch": 1.21, + "grad_norm": 0.5492205023765564, + "learning_rate": 0.00039006550381568425, + "loss": 3.0376, + "step": 24666 + }, + { + "epoch": 1.21, + "grad_norm": 0.6059777140617371, + "learning_rate": 0.0003900508177251242, + "loss": 3.0945, + "step": 24667 + }, + { + "epoch": 1.21, + "grad_norm": 0.5843932628631592, + "learning_rate": 0.0003900361313973877, + "loss": 3.0452, + "step": 24668 + }, + { + "epoch": 1.21, + "grad_norm": 0.59035325050354, + "learning_rate": 0.0003900214448325133, + "loss": 3.145, + "step": 24669 + }, + { + "epoch": 1.21, + "grad_norm": 0.5399641394615173, + "learning_rate": 0.00039000675803053985, + "loss": 3.2487, + "step": 24670 + }, + { + "epoch": 1.21, + "grad_norm": 0.6116369366645813, + "learning_rate": 0.0003899920709915057, + "loss": 3.2259, + "step": 24671 + }, + { + "epoch": 1.21, + "grad_norm": 0.5800071954727173, + "learning_rate": 0.00038997738371545, + "loss": 2.9216, + "step": 24672 + }, + { + "epoch": 1.21, + "grad_norm": 0.608817458152771, + "learning_rate": 0.00038996269620241103, + "loss": 3.0501, + "step": 24673 + }, + { + "epoch": 1.21, + "grad_norm": 0.5628402829170227, + "learning_rate": 0.00038994800845242766, + "loss": 3.0112, + "step": 24674 + }, + { + "epoch": 1.21, + "grad_norm": 0.5566816329956055, + "learning_rate": 0.00038993332046553857, + "loss": 3.2272, + "step": 24675 + }, + { + "epoch": 1.21, + "grad_norm": 0.5373614430427551, + "learning_rate": 0.0003899186322417824, + "loss": 2.8742, + "step": 24676 + }, + { + "epoch": 1.21, + "grad_norm": 0.5301047563552856, + "learning_rate": 0.0003899039437811979, + "loss": 3.0577, + "step": 24677 + }, + { + "epoch": 1.21, + "grad_norm": 0.5556225776672363, + "learning_rate": 0.00038988925508382367, + "loss": 3.0731, + "step": 24678 + }, + { + "epoch": 1.21, + "grad_norm": 0.5261009931564331, + "learning_rate": 0.00038987456614969853, + "loss": 2.9431, + "step": 24679 + }, + { + "epoch": 1.21, + "grad_norm": 0.6140367388725281, + "learning_rate": 0.000389859876978861, + "loss": 3.1981, + "step": 24680 + }, + { + "epoch": 1.21, + "grad_norm": 0.5956748127937317, + "learning_rate": 0.00038984518757134983, + "loss": 3.1423, + "step": 24681 + }, + { + "epoch": 1.21, + "grad_norm": 0.5351552963256836, + "learning_rate": 0.00038983049792720373, + "loss": 3.0685, + "step": 24682 + }, + { + "epoch": 1.21, + "grad_norm": 0.5489239692687988, + "learning_rate": 0.00038981580804646146, + "loss": 3.2112, + "step": 24683 + }, + { + "epoch": 1.21, + "grad_norm": 0.5213435292243958, + "learning_rate": 0.00038980111792916156, + "loss": 3.0596, + "step": 24684 + }, + { + "epoch": 1.21, + "grad_norm": 0.5946751236915588, + "learning_rate": 0.00038978642757534285, + "loss": 3.0055, + "step": 24685 + }, + { + "epoch": 1.21, + "grad_norm": 0.5385335087776184, + "learning_rate": 0.0003897717369850439, + "loss": 3.146, + "step": 24686 + }, + { + "epoch": 1.21, + "grad_norm": 0.5397436022758484, + "learning_rate": 0.00038975704615830354, + "loss": 3.0557, + "step": 24687 + }, + { + "epoch": 1.21, + "grad_norm": 0.5667961239814758, + "learning_rate": 0.00038974235509516036, + "loss": 3.1658, + "step": 24688 + }, + { + "epoch": 1.21, + "grad_norm": 0.5461593866348267, + "learning_rate": 0.0003897276637956531, + "loss": 3.0923, + "step": 24689 + }, + { + "epoch": 1.21, + "grad_norm": 0.5522091388702393, + "learning_rate": 0.0003897129722598204, + "loss": 3.0271, + "step": 24690 + }, + { + "epoch": 1.21, + "grad_norm": 0.5801872611045837, + "learning_rate": 0.00038969828048770105, + "loss": 3.3063, + "step": 24691 + }, + { + "epoch": 1.21, + "grad_norm": 0.5313283801078796, + "learning_rate": 0.0003896835884793337, + "loss": 2.9629, + "step": 24692 + }, + { + "epoch": 1.21, + "grad_norm": 0.5575485229492188, + "learning_rate": 0.000389668896234757, + "loss": 3.1138, + "step": 24693 + }, + { + "epoch": 1.21, + "grad_norm": 0.5732911825180054, + "learning_rate": 0.0003896542037540098, + "loss": 3.1065, + "step": 24694 + }, + { + "epoch": 1.21, + "grad_norm": 0.5623870491981506, + "learning_rate": 0.0003896395110371306, + "loss": 3.2062, + "step": 24695 + }, + { + "epoch": 1.21, + "grad_norm": 0.5679678320884705, + "learning_rate": 0.00038962481808415807, + "loss": 3.0646, + "step": 24696 + }, + { + "epoch": 1.21, + "grad_norm": 0.578198254108429, + "learning_rate": 0.0003896101248951312, + "loss": 3.3072, + "step": 24697 + }, + { + "epoch": 1.21, + "grad_norm": 0.5689905881881714, + "learning_rate": 0.0003895954314700884, + "loss": 3.1302, + "step": 24698 + }, + { + "epoch": 1.21, + "grad_norm": 0.5264217853546143, + "learning_rate": 0.00038958073780906845, + "loss": 3.0872, + "step": 24699 + }, + { + "epoch": 1.21, + "grad_norm": 0.5618980526924133, + "learning_rate": 0.00038956604391211016, + "loss": 3.2415, + "step": 24700 + }, + { + "epoch": 1.21, + "grad_norm": 0.5655535459518433, + "learning_rate": 0.0003895513497792521, + "loss": 3.0696, + "step": 24701 + }, + { + "epoch": 1.21, + "grad_norm": 0.5163285136222839, + "learning_rate": 0.000389536655410533, + "loss": 3.1711, + "step": 24702 + }, + { + "epoch": 1.21, + "grad_norm": 0.7728413343429565, + "learning_rate": 0.0003895219608059916, + "loss": 3.008, + "step": 24703 + }, + { + "epoch": 1.21, + "grad_norm": 0.5822486877441406, + "learning_rate": 0.0003895072659656666, + "loss": 3.1006, + "step": 24704 + }, + { + "epoch": 1.21, + "grad_norm": 0.5410815477371216, + "learning_rate": 0.00038949257088959674, + "loss": 2.9528, + "step": 24705 + }, + { + "epoch": 1.21, + "grad_norm": 0.5558587312698364, + "learning_rate": 0.0003894778755778206, + "loss": 3.0161, + "step": 24706 + }, + { + "epoch": 1.21, + "grad_norm": 0.5925754904747009, + "learning_rate": 0.0003894631800303769, + "loss": 3.1687, + "step": 24707 + }, + { + "epoch": 1.21, + "grad_norm": 0.5678203701972961, + "learning_rate": 0.00038944848424730456, + "loss": 3.1855, + "step": 24708 + }, + { + "epoch": 1.21, + "grad_norm": 0.5488570928573608, + "learning_rate": 0.00038943378822864203, + "loss": 3.2203, + "step": 24709 + }, + { + "epoch": 1.21, + "grad_norm": 0.5649476051330566, + "learning_rate": 0.00038941909197442813, + "loss": 3.1933, + "step": 24710 + }, + { + "epoch": 1.21, + "grad_norm": 0.599998950958252, + "learning_rate": 0.0003894043954847015, + "loss": 3.1783, + "step": 24711 + }, + { + "epoch": 1.21, + "grad_norm": 0.5275146961212158, + "learning_rate": 0.00038938969875950097, + "loss": 3.058, + "step": 24712 + }, + { + "epoch": 1.21, + "grad_norm": 0.5559340119361877, + "learning_rate": 0.0003893750017988651, + "loss": 3.2087, + "step": 24713 + }, + { + "epoch": 1.21, + "grad_norm": 0.5753278732299805, + "learning_rate": 0.00038936030460283276, + "loss": 3.2299, + "step": 24714 + }, + { + "epoch": 1.21, + "grad_norm": 0.5743871927261353, + "learning_rate": 0.0003893456071714425, + "loss": 3.1301, + "step": 24715 + }, + { + "epoch": 1.21, + "grad_norm": 0.5973168611526489, + "learning_rate": 0.0003893309095047332, + "loss": 2.9648, + "step": 24716 + }, + { + "epoch": 1.21, + "grad_norm": 0.5473564863204956, + "learning_rate": 0.00038931621160274337, + "loss": 3.1843, + "step": 24717 + }, + { + "epoch": 1.21, + "grad_norm": 0.5533649325370789, + "learning_rate": 0.00038930151346551185, + "loss": 2.9377, + "step": 24718 + }, + { + "epoch": 1.21, + "grad_norm": 0.551533043384552, + "learning_rate": 0.0003892868150930775, + "loss": 3.2214, + "step": 24719 + }, + { + "epoch": 1.21, + "grad_norm": 0.5771843791007996, + "learning_rate": 0.0003892721164854787, + "loss": 3.0803, + "step": 24720 + }, + { + "epoch": 1.21, + "grad_norm": 0.5571311712265015, + "learning_rate": 0.0003892574176427543, + "loss": 2.997, + "step": 24721 + }, + { + "epoch": 1.21, + "grad_norm": 0.5600820183753967, + "learning_rate": 0.00038924271856494305, + "loss": 3.082, + "step": 24722 + }, + { + "epoch": 1.21, + "grad_norm": 0.5592990517616272, + "learning_rate": 0.0003892280192520837, + "loss": 3.0899, + "step": 24723 + }, + { + "epoch": 1.21, + "grad_norm": 0.5576978921890259, + "learning_rate": 0.00038921331970421494, + "loss": 3.3335, + "step": 24724 + }, + { + "epoch": 1.21, + "grad_norm": 0.5841004848480225, + "learning_rate": 0.0003891986199213754, + "loss": 3.0562, + "step": 24725 + }, + { + "epoch": 1.21, + "grad_norm": 0.5277174115180969, + "learning_rate": 0.0003891839199036039, + "loss": 2.9582, + "step": 24726 + }, + { + "epoch": 1.21, + "grad_norm": 0.5508502125740051, + "learning_rate": 0.0003891692196509391, + "loss": 3.1022, + "step": 24727 + }, + { + "epoch": 1.21, + "grad_norm": 0.5699372887611389, + "learning_rate": 0.00038915451916341973, + "loss": 3.0745, + "step": 24728 + }, + { + "epoch": 1.21, + "grad_norm": 0.5580192804336548, + "learning_rate": 0.0003891398184410846, + "loss": 2.8848, + "step": 24729 + }, + { + "epoch": 1.21, + "grad_norm": 0.6069357395172119, + "learning_rate": 0.00038912511748397235, + "loss": 2.7991, + "step": 24730 + }, + { + "epoch": 1.21, + "grad_norm": 0.5295454859733582, + "learning_rate": 0.00038911041629212156, + "loss": 2.9931, + "step": 24731 + }, + { + "epoch": 1.21, + "grad_norm": 0.5806009769439697, + "learning_rate": 0.0003890957148655711, + "loss": 3.0751, + "step": 24732 + }, + { + "epoch": 1.21, + "grad_norm": 0.5367207527160645, + "learning_rate": 0.0003890810132043597, + "loss": 2.8636, + "step": 24733 + }, + { + "epoch": 1.21, + "grad_norm": 0.5449971556663513, + "learning_rate": 0.00038906631130852614, + "loss": 2.8036, + "step": 24734 + }, + { + "epoch": 1.21, + "grad_norm": 0.5441697835922241, + "learning_rate": 0.00038905160917810896, + "loss": 3.0157, + "step": 24735 + }, + { + "epoch": 1.21, + "grad_norm": 0.5573702454566956, + "learning_rate": 0.000389036906813147, + "loss": 3.2127, + "step": 24736 + }, + { + "epoch": 1.21, + "grad_norm": 0.5617653131484985, + "learning_rate": 0.000389022204213679, + "loss": 3.0465, + "step": 24737 + }, + { + "epoch": 1.21, + "grad_norm": 0.5416889786720276, + "learning_rate": 0.0003890075013797437, + "loss": 3.1156, + "step": 24738 + }, + { + "epoch": 1.21, + "grad_norm": 0.5474164485931396, + "learning_rate": 0.0003889927983113796, + "loss": 3.0115, + "step": 24739 + }, + { + "epoch": 1.21, + "grad_norm": 0.5465503931045532, + "learning_rate": 0.0003889780950086257, + "loss": 3.2573, + "step": 24740 + }, + { + "epoch": 1.21, + "grad_norm": 0.5438640117645264, + "learning_rate": 0.00038896339147152066, + "loss": 3.0766, + "step": 24741 + }, + { + "epoch": 1.21, + "grad_norm": 0.5305720567703247, + "learning_rate": 0.00038894868770010315, + "loss": 2.8584, + "step": 24742 + }, + { + "epoch": 1.21, + "grad_norm": 0.5442739129066467, + "learning_rate": 0.0003889339836944118, + "loss": 3.0769, + "step": 24743 + }, + { + "epoch": 1.21, + "grad_norm": 0.6273128390312195, + "learning_rate": 0.0003889192794544856, + "loss": 2.9629, + "step": 24744 + }, + { + "epoch": 1.21, + "grad_norm": 0.5607988834381104, + "learning_rate": 0.0003889045749803631, + "loss": 3.2894, + "step": 24745 + }, + { + "epoch": 1.21, + "grad_norm": 0.5226300954818726, + "learning_rate": 0.00038888987027208303, + "loss": 3.0109, + "step": 24746 + }, + { + "epoch": 1.21, + "grad_norm": 0.5920791625976562, + "learning_rate": 0.0003888751653296841, + "loss": 3.0861, + "step": 24747 + }, + { + "epoch": 1.21, + "grad_norm": 0.552906334400177, + "learning_rate": 0.0003888604601532052, + "loss": 3.0653, + "step": 24748 + }, + { + "epoch": 1.21, + "grad_norm": 0.565069317817688, + "learning_rate": 0.0003888457547426848, + "loss": 3.0598, + "step": 24749 + }, + { + "epoch": 1.21, + "grad_norm": 0.5915144681930542, + "learning_rate": 0.0003888310490981619, + "loss": 3.0202, + "step": 24750 + }, + { + "epoch": 1.21, + "grad_norm": 0.5616911053657532, + "learning_rate": 0.0003888163432196751, + "loss": 2.9658, + "step": 24751 + }, + { + "epoch": 1.21, + "grad_norm": 0.5911539793014526, + "learning_rate": 0.0003888016371072631, + "loss": 2.8024, + "step": 24752 + }, + { + "epoch": 1.21, + "grad_norm": 0.5526560544967651, + "learning_rate": 0.0003887869307609647, + "loss": 3.0992, + "step": 24753 + }, + { + "epoch": 1.21, + "grad_norm": 0.5728417634963989, + "learning_rate": 0.00038877222418081856, + "loss": 3.0105, + "step": 24754 + }, + { + "epoch": 1.21, + "grad_norm": 0.5776088833808899, + "learning_rate": 0.00038875751736686353, + "loss": 3.1478, + "step": 24755 + }, + { + "epoch": 1.21, + "grad_norm": 0.5825873613357544, + "learning_rate": 0.00038874281031913837, + "loss": 3.2437, + "step": 24756 + }, + { + "epoch": 1.21, + "grad_norm": 0.5393372774124146, + "learning_rate": 0.0003887281030376816, + "loss": 3.0675, + "step": 24757 + }, + { + "epoch": 1.21, + "grad_norm": 0.5632210969924927, + "learning_rate": 0.00038871339552253205, + "loss": 2.968, + "step": 24758 + }, + { + "epoch": 1.21, + "grad_norm": 0.553977370262146, + "learning_rate": 0.00038869868777372863, + "loss": 3.0479, + "step": 24759 + }, + { + "epoch": 1.21, + "grad_norm": 0.588313639163971, + "learning_rate": 0.00038868397979130984, + "loss": 3.0204, + "step": 24760 + }, + { + "epoch": 1.21, + "grad_norm": 0.5613222718238831, + "learning_rate": 0.0003886692715753145, + "loss": 2.8621, + "step": 24761 + }, + { + "epoch": 1.21, + "grad_norm": 0.5414893627166748, + "learning_rate": 0.0003886545631257814, + "loss": 2.8851, + "step": 24762 + }, + { + "epoch": 1.21, + "grad_norm": 0.5642477869987488, + "learning_rate": 0.0003886398544427493, + "loss": 2.9733, + "step": 24763 + }, + { + "epoch": 1.21, + "grad_norm": 0.5491669178009033, + "learning_rate": 0.00038862514552625676, + "loss": 3.1974, + "step": 24764 + }, + { + "epoch": 1.21, + "grad_norm": 0.5662064552307129, + "learning_rate": 0.0003886104363763427, + "loss": 2.9075, + "step": 24765 + }, + { + "epoch": 1.21, + "grad_norm": 0.5613273978233337, + "learning_rate": 0.0003885957269930458, + "loss": 3.1875, + "step": 24766 + }, + { + "epoch": 1.21, + "grad_norm": 0.5412261486053467, + "learning_rate": 0.0003885810173764049, + "loss": 3.193, + "step": 24767 + }, + { + "epoch": 1.21, + "grad_norm": 0.5381949543952942, + "learning_rate": 0.0003885663075264585, + "loss": 3.0033, + "step": 24768 + }, + { + "epoch": 1.21, + "grad_norm": 0.5455308556556702, + "learning_rate": 0.0003885515974432455, + "loss": 3.0217, + "step": 24769 + }, + { + "epoch": 1.21, + "grad_norm": 0.5200797915458679, + "learning_rate": 0.0003885368871268047, + "loss": 2.9717, + "step": 24770 + }, + { + "epoch": 1.21, + "grad_norm": 0.5431541800498962, + "learning_rate": 0.00038852217657717484, + "loss": 2.9984, + "step": 24771 + }, + { + "epoch": 1.21, + "grad_norm": 0.5343523621559143, + "learning_rate": 0.0003885074657943945, + "loss": 3.1071, + "step": 24772 + }, + { + "epoch": 1.21, + "grad_norm": 0.5573854446411133, + "learning_rate": 0.00038849275477850255, + "loss": 3.0734, + "step": 24773 + }, + { + "epoch": 1.21, + "grad_norm": 0.5465708374977112, + "learning_rate": 0.0003884780435295377, + "loss": 3.0892, + "step": 24774 + }, + { + "epoch": 1.21, + "grad_norm": 0.5360824465751648, + "learning_rate": 0.00038846333204753874, + "loss": 2.935, + "step": 24775 + }, + { + "epoch": 1.21, + "grad_norm": 0.5567554831504822, + "learning_rate": 0.0003884486203325444, + "loss": 3.0637, + "step": 24776 + }, + { + "epoch": 1.21, + "grad_norm": 0.5666375756263733, + "learning_rate": 0.0003884339083845934, + "loss": 2.9533, + "step": 24777 + }, + { + "epoch": 1.21, + "grad_norm": 0.5433690547943115, + "learning_rate": 0.0003884191962037245, + "loss": 3.0508, + "step": 24778 + }, + { + "epoch": 1.21, + "grad_norm": 0.5279322266578674, + "learning_rate": 0.0003884044837899764, + "loss": 3.0479, + "step": 24779 + }, + { + "epoch": 1.21, + "grad_norm": 0.524019718170166, + "learning_rate": 0.00038838977114338797, + "loss": 3.0864, + "step": 24780 + }, + { + "epoch": 1.21, + "grad_norm": 0.5386550426483154, + "learning_rate": 0.00038837505826399795, + "loss": 3.109, + "step": 24781 + }, + { + "epoch": 1.21, + "grad_norm": 0.5483960509300232, + "learning_rate": 0.0003883603451518449, + "loss": 2.943, + "step": 24782 + }, + { + "epoch": 1.21, + "grad_norm": 0.5979861617088318, + "learning_rate": 0.0003883456318069678, + "loss": 3.1003, + "step": 24783 + }, + { + "epoch": 1.21, + "grad_norm": 0.5268682837486267, + "learning_rate": 0.0003883309182294052, + "loss": 3.2098, + "step": 24784 + }, + { + "epoch": 1.21, + "grad_norm": 0.5503677129745483, + "learning_rate": 0.0003883162044191962, + "loss": 3.1854, + "step": 24785 + }, + { + "epoch": 1.21, + "grad_norm": 0.5744519829750061, + "learning_rate": 0.00038830149037637906, + "loss": 3.1521, + "step": 24786 + }, + { + "epoch": 1.21, + "grad_norm": 0.5381206274032593, + "learning_rate": 0.00038828677610099294, + "loss": 3.1749, + "step": 24787 + }, + { + "epoch": 1.21, + "grad_norm": 0.569991946220398, + "learning_rate": 0.00038827206159307634, + "loss": 3.2375, + "step": 24788 + }, + { + "epoch": 1.21, + "grad_norm": 0.5447730422019958, + "learning_rate": 0.0003882573468526682, + "loss": 3.1548, + "step": 24789 + }, + { + "epoch": 1.21, + "grad_norm": 0.5430207252502441, + "learning_rate": 0.0003882426318798071, + "loss": 3.0477, + "step": 24790 + }, + { + "epoch": 1.21, + "grad_norm": 0.5270898342132568, + "learning_rate": 0.00038822791667453196, + "loss": 3.1821, + "step": 24791 + }, + { + "epoch": 1.21, + "grad_norm": 0.5762162804603577, + "learning_rate": 0.0003882132012368815, + "loss": 2.8535, + "step": 24792 + }, + { + "epoch": 1.22, + "grad_norm": 0.5520655512809753, + "learning_rate": 0.00038819848556689444, + "loss": 3.1805, + "step": 24793 + }, + { + "epoch": 1.22, + "grad_norm": 0.557744026184082, + "learning_rate": 0.00038818376966460945, + "loss": 3.0716, + "step": 24794 + }, + { + "epoch": 1.22, + "grad_norm": 0.5885498523712158, + "learning_rate": 0.0003881690535300654, + "loss": 3.1626, + "step": 24795 + }, + { + "epoch": 1.22, + "grad_norm": 0.559755802154541, + "learning_rate": 0.00038815433716330117, + "loss": 3.0249, + "step": 24796 + }, + { + "epoch": 1.22, + "grad_norm": 0.5420626401901245, + "learning_rate": 0.0003881396205643553, + "loss": 3.1339, + "step": 24797 + }, + { + "epoch": 1.22, + "grad_norm": 0.5564322471618652, + "learning_rate": 0.00038812490373326655, + "loss": 2.9808, + "step": 24798 + }, + { + "epoch": 1.22, + "grad_norm": 0.5733435750007629, + "learning_rate": 0.00038811018667007387, + "loss": 2.9819, + "step": 24799 + }, + { + "epoch": 1.22, + "grad_norm": 0.5582762956619263, + "learning_rate": 0.00038809546937481585, + "loss": 2.9805, + "step": 24800 + }, + { + "epoch": 1.22, + "grad_norm": 0.577571451663971, + "learning_rate": 0.00038808075184753135, + "loss": 3.1034, + "step": 24801 + }, + { + "epoch": 1.22, + "grad_norm": 0.5833460688591003, + "learning_rate": 0.000388066034088259, + "loss": 3.1709, + "step": 24802 + }, + { + "epoch": 1.22, + "grad_norm": 0.5358278155326843, + "learning_rate": 0.0003880513160970378, + "loss": 2.8111, + "step": 24803 + }, + { + "epoch": 1.22, + "grad_norm": 0.5528193116188049, + "learning_rate": 0.00038803659787390633, + "loss": 3.4981, + "step": 24804 + }, + { + "epoch": 1.22, + "grad_norm": 0.5586181879043579, + "learning_rate": 0.0003880218794189034, + "loss": 2.8715, + "step": 24805 + }, + { + "epoch": 1.22, + "grad_norm": 0.5519729256629944, + "learning_rate": 0.00038800716073206785, + "loss": 3.1437, + "step": 24806 + }, + { + "epoch": 1.22, + "grad_norm": 0.5765265226364136, + "learning_rate": 0.0003879924418134383, + "loss": 3.204, + "step": 24807 + }, + { + "epoch": 1.22, + "grad_norm": 0.5296533107757568, + "learning_rate": 0.00038797772266305365, + "loss": 3.1702, + "step": 24808 + }, + { + "epoch": 1.22, + "grad_norm": 0.5647825598716736, + "learning_rate": 0.00038796300328095245, + "loss": 3.1688, + "step": 24809 + }, + { + "epoch": 1.22, + "grad_norm": 0.5361009836196899, + "learning_rate": 0.00038794828366717384, + "loss": 3.139, + "step": 24810 + }, + { + "epoch": 1.22, + "grad_norm": 0.5457804799079895, + "learning_rate": 0.0003879335638217563, + "loss": 3.1071, + "step": 24811 + }, + { + "epoch": 1.22, + "grad_norm": 0.5646703839302063, + "learning_rate": 0.0003879188437447386, + "loss": 3.1663, + "step": 24812 + }, + { + "epoch": 1.22, + "grad_norm": 0.5620137453079224, + "learning_rate": 0.0003879041234361597, + "loss": 3.2054, + "step": 24813 + }, + { + "epoch": 1.22, + "grad_norm": 0.5515795350074768, + "learning_rate": 0.0003878894028960582, + "loss": 3.1787, + "step": 24814 + }, + { + "epoch": 1.22, + "grad_norm": 0.5928753614425659, + "learning_rate": 0.00038787468212447287, + "loss": 3.2087, + "step": 24815 + }, + { + "epoch": 1.22, + "grad_norm": 0.5143396854400635, + "learning_rate": 0.00038785996112144263, + "loss": 3.0081, + "step": 24816 + }, + { + "epoch": 1.22, + "grad_norm": 0.5518531799316406, + "learning_rate": 0.0003878452398870061, + "loss": 3.0035, + "step": 24817 + }, + { + "epoch": 1.22, + "grad_norm": 0.5611719489097595, + "learning_rate": 0.00038783051842120227, + "loss": 3.1042, + "step": 24818 + }, + { + "epoch": 1.22, + "grad_norm": 0.5301064252853394, + "learning_rate": 0.0003878157967240696, + "loss": 3.0097, + "step": 24819 + }, + { + "epoch": 1.22, + "grad_norm": 0.575943648815155, + "learning_rate": 0.00038780107479564695, + "loss": 2.8759, + "step": 24820 + }, + { + "epoch": 1.22, + "grad_norm": 0.5593007802963257, + "learning_rate": 0.00038778635263597336, + "loss": 3.2868, + "step": 24821 + }, + { + "epoch": 1.22, + "grad_norm": 0.5593023300170898, + "learning_rate": 0.00038777163024508726, + "loss": 3.0437, + "step": 24822 + }, + { + "epoch": 1.22, + "grad_norm": 0.5502452254295349, + "learning_rate": 0.0003877569076230277, + "loss": 3.179, + "step": 24823 + }, + { + "epoch": 1.22, + "grad_norm": 0.5816333889961243, + "learning_rate": 0.0003877421847698333, + "loss": 2.8658, + "step": 24824 + }, + { + "epoch": 1.22, + "grad_norm": 0.5567662119865417, + "learning_rate": 0.0003877274616855428, + "loss": 2.9821, + "step": 24825 + }, + { + "epoch": 1.22, + "grad_norm": 0.5488287210464478, + "learning_rate": 0.0003877127383701951, + "loss": 3.1091, + "step": 24826 + }, + { + "epoch": 1.22, + "grad_norm": 0.534261167049408, + "learning_rate": 0.0003876980148238289, + "loss": 2.8979, + "step": 24827 + }, + { + "epoch": 1.22, + "grad_norm": 0.5407263040542603, + "learning_rate": 0.000387683291046483, + "loss": 3.2039, + "step": 24828 + }, + { + "epoch": 1.22, + "grad_norm": 0.5979218482971191, + "learning_rate": 0.0003876685670381964, + "loss": 3.146, + "step": 24829 + }, + { + "epoch": 1.22, + "grad_norm": 0.5862070322036743, + "learning_rate": 0.0003876538427990073, + "loss": 3.0431, + "step": 24830 + }, + { + "epoch": 1.22, + "grad_norm": 0.5690160393714905, + "learning_rate": 0.0003876391183289551, + "loss": 3.1891, + "step": 24831 + }, + { + "epoch": 1.22, + "grad_norm": 0.5591949224472046, + "learning_rate": 0.00038762439362807825, + "loss": 3.1933, + "step": 24832 + }, + { + "epoch": 1.22, + "grad_norm": 0.5995550751686096, + "learning_rate": 0.0003876096686964157, + "loss": 3.0018, + "step": 24833 + }, + { + "epoch": 1.22, + "grad_norm": 0.5738639235496521, + "learning_rate": 0.000387594943534006, + "loss": 3.0424, + "step": 24834 + }, + { + "epoch": 1.22, + "grad_norm": 0.5617707371711731, + "learning_rate": 0.00038758021814088806, + "loss": 2.9083, + "step": 24835 + }, + { + "epoch": 1.22, + "grad_norm": 0.5648507475852966, + "learning_rate": 0.00038756549251710087, + "loss": 2.8961, + "step": 24836 + }, + { + "epoch": 1.22, + "grad_norm": 0.6076490879058838, + "learning_rate": 0.00038755076666268284, + "loss": 3.1156, + "step": 24837 + }, + { + "epoch": 1.22, + "grad_norm": 0.5720618963241577, + "learning_rate": 0.00038753604057767307, + "loss": 2.8891, + "step": 24838 + }, + { + "epoch": 1.22, + "grad_norm": 0.5429185628890991, + "learning_rate": 0.00038752131426211014, + "loss": 3.1625, + "step": 24839 + }, + { + "epoch": 1.22, + "grad_norm": 0.5565420389175415, + "learning_rate": 0.00038750658771603293, + "loss": 3.0722, + "step": 24840 + }, + { + "epoch": 1.22, + "grad_norm": 0.5758388042449951, + "learning_rate": 0.00038749186093948025, + "loss": 3.2304, + "step": 24841 + }, + { + "epoch": 1.22, + "grad_norm": 0.5630769729614258, + "learning_rate": 0.00038747713393249075, + "loss": 3.1224, + "step": 24842 + }, + { + "epoch": 1.22, + "grad_norm": 0.5871720910072327, + "learning_rate": 0.0003874624066951034, + "loss": 2.9868, + "step": 24843 + }, + { + "epoch": 1.22, + "grad_norm": 0.5743727684020996, + "learning_rate": 0.00038744767922735694, + "loss": 3.1502, + "step": 24844 + }, + { + "epoch": 1.22, + "grad_norm": 0.5455135107040405, + "learning_rate": 0.00038743295152929, + "loss": 3.2048, + "step": 24845 + }, + { + "epoch": 1.22, + "grad_norm": 0.5863217115402222, + "learning_rate": 0.0003874182236009415, + "loss": 3.1062, + "step": 24846 + }, + { + "epoch": 1.22, + "grad_norm": 0.5657460689544678, + "learning_rate": 0.00038740349544235037, + "loss": 3.026, + "step": 24847 + }, + { + "epoch": 1.22, + "grad_norm": 0.5640048980712891, + "learning_rate": 0.0003873887670535551, + "loss": 2.9006, + "step": 24848 + }, + { + "epoch": 1.22, + "grad_norm": 0.5563377737998962, + "learning_rate": 0.00038737403843459476, + "loss": 3.159, + "step": 24849 + }, + { + "epoch": 1.22, + "grad_norm": 0.5812902450561523, + "learning_rate": 0.0003873593095855079, + "loss": 3.2999, + "step": 24850 + }, + { + "epoch": 1.22, + "grad_norm": 0.5502186417579651, + "learning_rate": 0.0003873445805063335, + "loss": 3.1485, + "step": 24851 + }, + { + "epoch": 1.22, + "grad_norm": 0.6045628786087036, + "learning_rate": 0.00038732985119711025, + "loss": 3.1333, + "step": 24852 + }, + { + "epoch": 1.22, + "grad_norm": 0.5175624489784241, + "learning_rate": 0.000387315121657877, + "loss": 3.1418, + "step": 24853 + }, + { + "epoch": 1.22, + "grad_norm": 0.5742688179016113, + "learning_rate": 0.00038730039188867254, + "loss": 3.2371, + "step": 24854 + }, + { + "epoch": 1.22, + "grad_norm": 0.5657257437705994, + "learning_rate": 0.0003872856618895356, + "loss": 3.2323, + "step": 24855 + }, + { + "epoch": 1.22, + "grad_norm": 0.5566824674606323, + "learning_rate": 0.00038727093166050503, + "loss": 2.8398, + "step": 24856 + }, + { + "epoch": 1.22, + "grad_norm": 0.5399627089500427, + "learning_rate": 0.0003872562012016197, + "loss": 2.7679, + "step": 24857 + }, + { + "epoch": 1.22, + "grad_norm": 0.5554389357566833, + "learning_rate": 0.0003872414705129183, + "loss": 2.9823, + "step": 24858 + }, + { + "epoch": 1.22, + "grad_norm": 0.5519767999649048, + "learning_rate": 0.00038722673959443963, + "loss": 3.1616, + "step": 24859 + }, + { + "epoch": 1.22, + "grad_norm": 0.5414425134658813, + "learning_rate": 0.00038721200844622257, + "loss": 3.1378, + "step": 24860 + }, + { + "epoch": 1.22, + "grad_norm": 0.5529886484146118, + "learning_rate": 0.0003871972770683058, + "loss": 3.2881, + "step": 24861 + }, + { + "epoch": 1.22, + "grad_norm": 0.5492392778396606, + "learning_rate": 0.00038718254546072815, + "loss": 3.008, + "step": 24862 + }, + { + "epoch": 1.22, + "grad_norm": 0.5687651634216309, + "learning_rate": 0.00038716781362352847, + "loss": 3.2633, + "step": 24863 + }, + { + "epoch": 1.22, + "grad_norm": 0.5575495958328247, + "learning_rate": 0.0003871530815567456, + "loss": 2.9893, + "step": 24864 + }, + { + "epoch": 1.22, + "grad_norm": 0.6896212100982666, + "learning_rate": 0.00038713834926041825, + "loss": 3.0161, + "step": 24865 + }, + { + "epoch": 1.22, + "grad_norm": 0.5157473087310791, + "learning_rate": 0.00038712361673458524, + "loss": 3.2208, + "step": 24866 + }, + { + "epoch": 1.22, + "grad_norm": 0.5381258130073547, + "learning_rate": 0.00038710888397928544, + "loss": 2.9892, + "step": 24867 + }, + { + "epoch": 1.22, + "grad_norm": 0.577904999256134, + "learning_rate": 0.00038709415099455756, + "loss": 3.0439, + "step": 24868 + }, + { + "epoch": 1.22, + "grad_norm": 0.5449947118759155, + "learning_rate": 0.00038707941778044053, + "loss": 2.9669, + "step": 24869 + }, + { + "epoch": 1.22, + "grad_norm": 0.557723879814148, + "learning_rate": 0.000387064684336973, + "loss": 3.1256, + "step": 24870 + }, + { + "epoch": 1.22, + "grad_norm": 0.557279109954834, + "learning_rate": 0.0003870499506641938, + "loss": 3.046, + "step": 24871 + }, + { + "epoch": 1.22, + "grad_norm": 0.5531541705131531, + "learning_rate": 0.0003870352167621419, + "loss": 2.9408, + "step": 24872 + }, + { + "epoch": 1.22, + "grad_norm": 0.5873280167579651, + "learning_rate": 0.0003870204826308559, + "loss": 3.2446, + "step": 24873 + }, + { + "epoch": 1.22, + "grad_norm": 0.5433573722839355, + "learning_rate": 0.0003870057482703748, + "loss": 3.1398, + "step": 24874 + }, + { + "epoch": 1.22, + "grad_norm": 0.574711263179779, + "learning_rate": 0.0003869910136807372, + "loss": 3.261, + "step": 24875 + }, + { + "epoch": 1.22, + "grad_norm": 0.5614281296730042, + "learning_rate": 0.00038697627886198204, + "loss": 3.2698, + "step": 24876 + }, + { + "epoch": 1.22, + "grad_norm": 0.5823600888252258, + "learning_rate": 0.0003869615438141481, + "loss": 2.9666, + "step": 24877 + }, + { + "epoch": 1.22, + "grad_norm": 0.5654398798942566, + "learning_rate": 0.00038694680853727423, + "loss": 3.2506, + "step": 24878 + }, + { + "epoch": 1.22, + "grad_norm": 0.6435055136680603, + "learning_rate": 0.0003869320730313991, + "loss": 3.0697, + "step": 24879 + }, + { + "epoch": 1.22, + "grad_norm": 0.5971924662590027, + "learning_rate": 0.0003869173372965618, + "loss": 3.1637, + "step": 24880 + }, + { + "epoch": 1.22, + "grad_norm": 0.5890087485313416, + "learning_rate": 0.0003869026013328008, + "loss": 3.1743, + "step": 24881 + }, + { + "epoch": 1.22, + "grad_norm": 0.5819125771522522, + "learning_rate": 0.0003868878651401551, + "loss": 3.0368, + "step": 24882 + }, + { + "epoch": 1.22, + "grad_norm": 0.5607448816299438, + "learning_rate": 0.00038687312871866353, + "loss": 3.2866, + "step": 24883 + }, + { + "epoch": 1.22, + "grad_norm": 0.5538012981414795, + "learning_rate": 0.0003868583920683648, + "loss": 2.9722, + "step": 24884 + }, + { + "epoch": 1.22, + "grad_norm": 0.5519068241119385, + "learning_rate": 0.00038684365518929783, + "loss": 3.1197, + "step": 24885 + }, + { + "epoch": 1.22, + "grad_norm": 0.5657366514205933, + "learning_rate": 0.00038682891808150126, + "loss": 3.3442, + "step": 24886 + }, + { + "epoch": 1.22, + "grad_norm": 0.5565481781959534, + "learning_rate": 0.00038681418074501425, + "loss": 3.1246, + "step": 24887 + }, + { + "epoch": 1.22, + "grad_norm": 0.5680059194564819, + "learning_rate": 0.00038679944317987524, + "loss": 2.9612, + "step": 24888 + }, + { + "epoch": 1.22, + "grad_norm": 0.5660066604614258, + "learning_rate": 0.00038678470538612325, + "loss": 3.0581, + "step": 24889 + }, + { + "epoch": 1.22, + "grad_norm": 0.575242280960083, + "learning_rate": 0.00038676996736379705, + "loss": 3.0885, + "step": 24890 + }, + { + "epoch": 1.22, + "grad_norm": 0.5943603515625, + "learning_rate": 0.00038675522911293546, + "loss": 3.0541, + "step": 24891 + }, + { + "epoch": 1.22, + "grad_norm": 0.5383366942405701, + "learning_rate": 0.0003867404906335773, + "loss": 2.9924, + "step": 24892 + }, + { + "epoch": 1.22, + "grad_norm": 0.5978488326072693, + "learning_rate": 0.0003867257519257614, + "loss": 3.0037, + "step": 24893 + }, + { + "epoch": 1.22, + "grad_norm": 0.5685155987739563, + "learning_rate": 0.00038671101298952653, + "loss": 3.1197, + "step": 24894 + }, + { + "epoch": 1.22, + "grad_norm": 0.5263768434524536, + "learning_rate": 0.00038669627382491154, + "loss": 3.3945, + "step": 24895 + }, + { + "epoch": 1.22, + "grad_norm": 0.5680824518203735, + "learning_rate": 0.0003866815344319552, + "loss": 3.2136, + "step": 24896 + }, + { + "epoch": 1.22, + "grad_norm": 0.5891380906105042, + "learning_rate": 0.0003866667948106964, + "loss": 3.1585, + "step": 24897 + }, + { + "epoch": 1.22, + "grad_norm": 0.5633382797241211, + "learning_rate": 0.000386652054961174, + "loss": 3.1851, + "step": 24898 + }, + { + "epoch": 1.22, + "grad_norm": 0.5629472732543945, + "learning_rate": 0.00038663731488342674, + "loss": 3.1504, + "step": 24899 + }, + { + "epoch": 1.22, + "grad_norm": 0.5522317290306091, + "learning_rate": 0.0003866225745774935, + "loss": 3.1356, + "step": 24900 + }, + { + "epoch": 1.22, + "grad_norm": 0.5409548282623291, + "learning_rate": 0.0003866078340434129, + "loss": 3.1164, + "step": 24901 + }, + { + "epoch": 1.22, + "grad_norm": 0.5608346462249756, + "learning_rate": 0.0003865930932812241, + "loss": 2.968, + "step": 24902 + }, + { + "epoch": 1.22, + "grad_norm": 0.5439483523368835, + "learning_rate": 0.00038657835229096577, + "loss": 3.13, + "step": 24903 + }, + { + "epoch": 1.22, + "grad_norm": 0.5601171255111694, + "learning_rate": 0.00038656361107267665, + "loss": 3.0996, + "step": 24904 + }, + { + "epoch": 1.22, + "grad_norm": 0.5870618224143982, + "learning_rate": 0.00038654886962639565, + "loss": 3.0531, + "step": 24905 + }, + { + "epoch": 1.22, + "grad_norm": 0.6319909691810608, + "learning_rate": 0.0003865341279521617, + "loss": 3.1479, + "step": 24906 + }, + { + "epoch": 1.22, + "grad_norm": 0.6008051037788391, + "learning_rate": 0.00038651938605001324, + "loss": 2.8563, + "step": 24907 + }, + { + "epoch": 1.22, + "grad_norm": 0.549253523349762, + "learning_rate": 0.0003865046439199895, + "loss": 3.1851, + "step": 24908 + }, + { + "epoch": 1.22, + "grad_norm": 0.5713079571723938, + "learning_rate": 0.0003864899015621293, + "loss": 3.2438, + "step": 24909 + }, + { + "epoch": 1.22, + "grad_norm": 0.55234694480896, + "learning_rate": 0.00038647515897647126, + "loss": 3.1086, + "step": 24910 + }, + { + "epoch": 1.22, + "grad_norm": 0.5803124308586121, + "learning_rate": 0.00038646041616305425, + "loss": 2.9661, + "step": 24911 + }, + { + "epoch": 1.22, + "grad_norm": 0.5373254418373108, + "learning_rate": 0.0003864456731219172, + "loss": 3.077, + "step": 24912 + }, + { + "epoch": 1.22, + "grad_norm": 0.5681374669075012, + "learning_rate": 0.00038643092985309887, + "loss": 3.1995, + "step": 24913 + }, + { + "epoch": 1.22, + "grad_norm": 0.586463451385498, + "learning_rate": 0.00038641618635663807, + "loss": 2.939, + "step": 24914 + }, + { + "epoch": 1.22, + "grad_norm": 0.5377286076545715, + "learning_rate": 0.0003864014426325737, + "loss": 3.2162, + "step": 24915 + }, + { + "epoch": 1.22, + "grad_norm": 0.583235502243042, + "learning_rate": 0.00038638669868094454, + "loss": 3.0064, + "step": 24916 + }, + { + "epoch": 1.22, + "grad_norm": 0.5170654654502869, + "learning_rate": 0.00038637195450178945, + "loss": 3.0976, + "step": 24917 + }, + { + "epoch": 1.22, + "grad_norm": 0.5743283033370972, + "learning_rate": 0.00038635721009514727, + "loss": 3.1247, + "step": 24918 + }, + { + "epoch": 1.22, + "grad_norm": 0.5659081935882568, + "learning_rate": 0.00038634246546105676, + "loss": 3.0991, + "step": 24919 + }, + { + "epoch": 1.22, + "grad_norm": 0.5626477599143982, + "learning_rate": 0.0003863277205995569, + "loss": 2.9735, + "step": 24920 + }, + { + "epoch": 1.22, + "grad_norm": 0.525495707988739, + "learning_rate": 0.00038631297551068643, + "loss": 2.8968, + "step": 24921 + }, + { + "epoch": 1.22, + "grad_norm": 0.5388118624687195, + "learning_rate": 0.00038629823019448406, + "loss": 2.9915, + "step": 24922 + }, + { + "epoch": 1.22, + "grad_norm": 0.5696624517440796, + "learning_rate": 0.00038628348465098895, + "loss": 3.2016, + "step": 24923 + }, + { + "epoch": 1.22, + "grad_norm": 0.5766279697418213, + "learning_rate": 0.00038626873888023965, + "loss": 3.1201, + "step": 24924 + }, + { + "epoch": 1.22, + "grad_norm": 0.5821237564086914, + "learning_rate": 0.00038625399288227507, + "loss": 2.9969, + "step": 24925 + }, + { + "epoch": 1.22, + "grad_norm": 0.570102870464325, + "learning_rate": 0.0003862392466571341, + "loss": 2.9724, + "step": 24926 + }, + { + "epoch": 1.22, + "grad_norm": 0.5588696599006653, + "learning_rate": 0.00038622450020485557, + "loss": 2.8742, + "step": 24927 + }, + { + "epoch": 1.22, + "grad_norm": 0.5280874967575073, + "learning_rate": 0.0003862097535254783, + "loss": 3.2865, + "step": 24928 + }, + { + "epoch": 1.22, + "grad_norm": 0.5691091418266296, + "learning_rate": 0.0003861950066190411, + "loss": 3.1516, + "step": 24929 + }, + { + "epoch": 1.22, + "grad_norm": 0.5672391057014465, + "learning_rate": 0.00038618025948558283, + "loss": 3.0566, + "step": 24930 + }, + { + "epoch": 1.22, + "grad_norm": 0.5959036350250244, + "learning_rate": 0.0003861655121251425, + "loss": 2.8303, + "step": 24931 + }, + { + "epoch": 1.22, + "grad_norm": 0.55399090051651, + "learning_rate": 0.00038615076453775855, + "loss": 3.0734, + "step": 24932 + }, + { + "epoch": 1.22, + "grad_norm": 0.5398240089416504, + "learning_rate": 0.0003861360167234702, + "loss": 2.8201, + "step": 24933 + }, + { + "epoch": 1.22, + "grad_norm": 0.5640655159950256, + "learning_rate": 0.00038612126868231615, + "loss": 3.159, + "step": 24934 + }, + { + "epoch": 1.22, + "grad_norm": 0.5160124897956848, + "learning_rate": 0.00038610652041433526, + "loss": 3.1671, + "step": 24935 + }, + { + "epoch": 1.22, + "grad_norm": 0.5614923238754272, + "learning_rate": 0.0003860917719195663, + "loss": 3.0478, + "step": 24936 + }, + { + "epoch": 1.22, + "grad_norm": 0.5773776173591614, + "learning_rate": 0.00038607702319804827, + "loss": 3.1768, + "step": 24937 + }, + { + "epoch": 1.22, + "grad_norm": 0.5507655143737793, + "learning_rate": 0.0003860622742498199, + "loss": 3.0052, + "step": 24938 + }, + { + "epoch": 1.22, + "grad_norm": 0.6294071078300476, + "learning_rate": 0.00038604752507492, + "loss": 2.8901, + "step": 24939 + }, + { + "epoch": 1.22, + "grad_norm": 0.5447244644165039, + "learning_rate": 0.0003860327756733876, + "loss": 3.0518, + "step": 24940 + }, + { + "epoch": 1.22, + "grad_norm": 0.5702586770057678, + "learning_rate": 0.0003860180260452613, + "loss": 2.9634, + "step": 24941 + }, + { + "epoch": 1.22, + "grad_norm": 0.5664628148078918, + "learning_rate": 0.00038600327619058017, + "loss": 2.7702, + "step": 24942 + }, + { + "epoch": 1.22, + "grad_norm": 0.6090885400772095, + "learning_rate": 0.00038598852610938287, + "loss": 2.9708, + "step": 24943 + }, + { + "epoch": 1.22, + "grad_norm": 0.5620157718658447, + "learning_rate": 0.00038597377580170837, + "loss": 3.2652, + "step": 24944 + }, + { + "epoch": 1.22, + "grad_norm": 0.5198007225990295, + "learning_rate": 0.0003859590252675956, + "loss": 3.0837, + "step": 24945 + }, + { + "epoch": 1.22, + "grad_norm": 0.5496712327003479, + "learning_rate": 0.0003859442745070832, + "loss": 3.1429, + "step": 24946 + }, + { + "epoch": 1.22, + "grad_norm": 0.5467877984046936, + "learning_rate": 0.0003859295235202101, + "loss": 3.1486, + "step": 24947 + }, + { + "epoch": 1.22, + "grad_norm": 0.7828726172447205, + "learning_rate": 0.0003859147723070151, + "loss": 3.0038, + "step": 24948 + }, + { + "epoch": 1.22, + "grad_norm": 0.5720941424369812, + "learning_rate": 0.00038590002086753735, + "loss": 3.0948, + "step": 24949 + }, + { + "epoch": 1.22, + "grad_norm": 0.6058550477027893, + "learning_rate": 0.0003858852692018154, + "loss": 3.1602, + "step": 24950 + }, + { + "epoch": 1.22, + "grad_norm": 0.5809235572814941, + "learning_rate": 0.0003858705173098881, + "loss": 3.0933, + "step": 24951 + }, + { + "epoch": 1.22, + "grad_norm": 0.5285111665725708, + "learning_rate": 0.0003858557651917944, + "loss": 3.0971, + "step": 24952 + }, + { + "epoch": 1.22, + "grad_norm": 0.5728558301925659, + "learning_rate": 0.0003858410128475731, + "loss": 3.2456, + "step": 24953 + }, + { + "epoch": 1.22, + "grad_norm": 0.55547696352005, + "learning_rate": 0.0003858262602772632, + "loss": 3.1905, + "step": 24954 + }, + { + "epoch": 1.22, + "grad_norm": 0.5570668578147888, + "learning_rate": 0.0003858115074809034, + "loss": 3.0247, + "step": 24955 + }, + { + "epoch": 1.22, + "grad_norm": 0.5741897225379944, + "learning_rate": 0.00038579675445853264, + "loss": 3.1321, + "step": 24956 + }, + { + "epoch": 1.22, + "grad_norm": 0.538557231426239, + "learning_rate": 0.00038578200121018976, + "loss": 3.133, + "step": 24957 + }, + { + "epoch": 1.22, + "grad_norm": 0.576298713684082, + "learning_rate": 0.0003857672477359134, + "loss": 3.1654, + "step": 24958 + }, + { + "epoch": 1.22, + "grad_norm": 0.5457455515861511, + "learning_rate": 0.00038575249403574276, + "loss": 3.1689, + "step": 24959 + }, + { + "epoch": 1.22, + "grad_norm": 0.5845022201538086, + "learning_rate": 0.00038573774010971666, + "loss": 2.9617, + "step": 24960 + }, + { + "epoch": 1.22, + "grad_norm": 0.5689767599105835, + "learning_rate": 0.00038572298595787374, + "loss": 2.9591, + "step": 24961 + }, + { + "epoch": 1.22, + "grad_norm": 0.5592098832130432, + "learning_rate": 0.0003857082315802529, + "loss": 3.0949, + "step": 24962 + }, + { + "epoch": 1.22, + "grad_norm": 0.5083266496658325, + "learning_rate": 0.0003856934769768932, + "loss": 2.9243, + "step": 24963 + }, + { + "epoch": 1.22, + "grad_norm": 0.5481277108192444, + "learning_rate": 0.0003856787221478333, + "loss": 3.1659, + "step": 24964 + }, + { + "epoch": 1.22, + "grad_norm": 0.5553989410400391, + "learning_rate": 0.00038566396709311215, + "loss": 3.1719, + "step": 24965 + }, + { + "epoch": 1.22, + "grad_norm": 0.5781741738319397, + "learning_rate": 0.00038564921181276867, + "loss": 3.1653, + "step": 24966 + }, + { + "epoch": 1.22, + "grad_norm": 0.5346400141716003, + "learning_rate": 0.00038563445630684153, + "loss": 2.9946, + "step": 24967 + }, + { + "epoch": 1.22, + "grad_norm": 0.5499003529548645, + "learning_rate": 0.00038561970057536977, + "loss": 2.9757, + "step": 24968 + }, + { + "epoch": 1.22, + "grad_norm": 0.5649663209915161, + "learning_rate": 0.0003856049446183922, + "loss": 3.0766, + "step": 24969 + }, + { + "epoch": 1.22, + "grad_norm": 0.5463762283325195, + "learning_rate": 0.0003855901884359476, + "loss": 2.9893, + "step": 24970 + }, + { + "epoch": 1.22, + "grad_norm": 0.5486137270927429, + "learning_rate": 0.00038557543202807513, + "loss": 3.1587, + "step": 24971 + }, + { + "epoch": 1.22, + "grad_norm": 0.5812304019927979, + "learning_rate": 0.0003855606753948133, + "loss": 3.0608, + "step": 24972 + }, + { + "epoch": 1.22, + "grad_norm": 0.6184636354446411, + "learning_rate": 0.000385545918536201, + "loss": 3.0295, + "step": 24973 + }, + { + "epoch": 1.22, + "grad_norm": 0.5484544038772583, + "learning_rate": 0.00038553116145227743, + "loss": 3.0903, + "step": 24974 + }, + { + "epoch": 1.22, + "grad_norm": 0.5901756882667542, + "learning_rate": 0.0003855164041430811, + "loss": 3.1048, + "step": 24975 + }, + { + "epoch": 1.22, + "grad_norm": 0.5762054920196533, + "learning_rate": 0.0003855016466086511, + "loss": 3.1637, + "step": 24976 + }, + { + "epoch": 1.22, + "grad_norm": 0.5577733516693115, + "learning_rate": 0.00038548688884902616, + "loss": 3.1275, + "step": 24977 + }, + { + "epoch": 1.22, + "grad_norm": 0.5851019620895386, + "learning_rate": 0.00038547213086424526, + "loss": 3.3804, + "step": 24978 + }, + { + "epoch": 1.22, + "grad_norm": 0.5264205932617188, + "learning_rate": 0.0003854573726543472, + "loss": 2.9034, + "step": 24979 + }, + { + "epoch": 1.22, + "grad_norm": 0.529139518737793, + "learning_rate": 0.0003854426142193708, + "loss": 3.13, + "step": 24980 + }, + { + "epoch": 1.22, + "grad_norm": 0.5476992726325989, + "learning_rate": 0.000385427855559355, + "loss": 3.1176, + "step": 24981 + }, + { + "epoch": 1.22, + "grad_norm": 0.5486899614334106, + "learning_rate": 0.0003854130966743388, + "loss": 3.0411, + "step": 24982 + }, + { + "epoch": 1.22, + "grad_norm": 0.5495012998580933, + "learning_rate": 0.00038539833756436086, + "loss": 3.0098, + "step": 24983 + }, + { + "epoch": 1.22, + "grad_norm": 0.5303811430931091, + "learning_rate": 0.0003853835782294601, + "loss": 3.1033, + "step": 24984 + }, + { + "epoch": 1.22, + "grad_norm": 0.5448635220527649, + "learning_rate": 0.00038536881866967554, + "loss": 2.9309, + "step": 24985 + }, + { + "epoch": 1.22, + "grad_norm": 0.5653622150421143, + "learning_rate": 0.0003853540588850459, + "loss": 3.0957, + "step": 24986 + }, + { + "epoch": 1.22, + "grad_norm": 0.6244167685508728, + "learning_rate": 0.00038533929887561003, + "loss": 3.134, + "step": 24987 + }, + { + "epoch": 1.22, + "grad_norm": 0.5666537284851074, + "learning_rate": 0.00038532453864140685, + "loss": 3.1218, + "step": 24988 + }, + { + "epoch": 1.22, + "grad_norm": 0.5365763306617737, + "learning_rate": 0.00038530977818247535, + "loss": 2.8336, + "step": 24989 + }, + { + "epoch": 1.22, + "grad_norm": 0.5451518297195435, + "learning_rate": 0.0003852950174988542, + "loss": 2.8832, + "step": 24990 + }, + { + "epoch": 1.22, + "grad_norm": 0.5833927989006042, + "learning_rate": 0.00038528025659058255, + "loss": 2.9466, + "step": 24991 + }, + { + "epoch": 1.22, + "grad_norm": 0.5745109915733337, + "learning_rate": 0.000385265495457699, + "loss": 3.0929, + "step": 24992 + }, + { + "epoch": 1.22, + "grad_norm": 0.5827248096466064, + "learning_rate": 0.00038525073410024263, + "loss": 3.0634, + "step": 24993 + }, + { + "epoch": 1.22, + "grad_norm": 0.544745922088623, + "learning_rate": 0.00038523597251825207, + "loss": 3.1738, + "step": 24994 + }, + { + "epoch": 1.22, + "grad_norm": 0.5738770961761475, + "learning_rate": 0.0003852212107117665, + "loss": 2.9558, + "step": 24995 + }, + { + "epoch": 1.22, + "grad_norm": 0.631645143032074, + "learning_rate": 0.0003852064486808247, + "loss": 2.8622, + "step": 24996 + }, + { + "epoch": 1.23, + "grad_norm": 0.5620419979095459, + "learning_rate": 0.0003851916864254654, + "loss": 2.6171, + "step": 24997 + }, + { + "epoch": 1.23, + "grad_norm": 0.5576983094215393, + "learning_rate": 0.0003851769239457277, + "loss": 3.1946, + "step": 24998 + }, + { + "epoch": 1.23, + "grad_norm": 0.6105443835258484, + "learning_rate": 0.0003851621612416502, + "loss": 3.0174, + "step": 24999 + }, + { + "epoch": 1.23, + "grad_norm": 0.5353231430053711, + "learning_rate": 0.00038514739831327214, + "loss": 2.901, + "step": 25000 + }, + { + "epoch": 1.23, + "grad_norm": 0.600609540939331, + "learning_rate": 0.0003851326351606321, + "loss": 3.0875, + "step": 25001 + }, + { + "epoch": 1.23, + "grad_norm": 0.6092653274536133, + "learning_rate": 0.0003851178717837691, + "loss": 3.0929, + "step": 25002 + }, + { + "epoch": 1.23, + "grad_norm": 0.5666999816894531, + "learning_rate": 0.00038510310818272203, + "loss": 2.9879, + "step": 25003 + }, + { + "epoch": 1.23, + "grad_norm": 0.5675250291824341, + "learning_rate": 0.0003850883443575298, + "loss": 3.2092, + "step": 25004 + }, + { + "epoch": 1.23, + "grad_norm": 0.5591527223587036, + "learning_rate": 0.0003850735803082312, + "loss": 3.0273, + "step": 25005 + }, + { + "epoch": 1.23, + "grad_norm": 0.5454131364822388, + "learning_rate": 0.0003850588160348652, + "loss": 3.1456, + "step": 25006 + }, + { + "epoch": 1.23, + "grad_norm": 0.5496153235435486, + "learning_rate": 0.00038504405153747066, + "loss": 2.9497, + "step": 25007 + }, + { + "epoch": 1.23, + "grad_norm": 0.5893572568893433, + "learning_rate": 0.0003850292868160864, + "loss": 3.2013, + "step": 25008 + }, + { + "epoch": 1.23, + "grad_norm": 0.6619960069656372, + "learning_rate": 0.0003850145218707513, + "loss": 3.0674, + "step": 25009 + }, + { + "epoch": 1.23, + "grad_norm": 0.5621336102485657, + "learning_rate": 0.00038499975670150437, + "loss": 3.1975, + "step": 25010 + }, + { + "epoch": 1.23, + "grad_norm": 0.5568299889564514, + "learning_rate": 0.00038498499130838455, + "loss": 3.166, + "step": 25011 + }, + { + "epoch": 1.23, + "grad_norm": 0.5435270071029663, + "learning_rate": 0.0003849702256914305, + "loss": 3.067, + "step": 25012 + }, + { + "epoch": 1.23, + "grad_norm": 0.5628631114959717, + "learning_rate": 0.0003849554598506812, + "loss": 3.017, + "step": 25013 + }, + { + "epoch": 1.23, + "grad_norm": 0.5889310240745544, + "learning_rate": 0.00038494069378617566, + "loss": 3.1861, + "step": 25014 + }, + { + "epoch": 1.23, + "grad_norm": 0.5779412984848022, + "learning_rate": 0.0003849259274979526, + "loss": 3.0177, + "step": 25015 + }, + { + "epoch": 1.23, + "grad_norm": 0.5566220283508301, + "learning_rate": 0.00038491116098605103, + "loss": 3.0443, + "step": 25016 + }, + { + "epoch": 1.23, + "grad_norm": 0.5883911848068237, + "learning_rate": 0.0003848963942505098, + "loss": 2.8931, + "step": 25017 + }, + { + "epoch": 1.23, + "grad_norm": 0.5884630084037781, + "learning_rate": 0.0003848816272913678, + "loss": 2.978, + "step": 25018 + }, + { + "epoch": 1.23, + "grad_norm": 0.5841085910797119, + "learning_rate": 0.000384866860108664, + "loss": 2.8837, + "step": 25019 + }, + { + "epoch": 1.23, + "grad_norm": 0.5884820818901062, + "learning_rate": 0.00038485209270243707, + "loss": 3.0159, + "step": 25020 + }, + { + "epoch": 1.23, + "grad_norm": 0.6222401857376099, + "learning_rate": 0.0003848373250727262, + "loss": 3.1475, + "step": 25021 + }, + { + "epoch": 1.23, + "grad_norm": 0.6234578490257263, + "learning_rate": 0.0003848225572195702, + "loss": 3.2096, + "step": 25022 + }, + { + "epoch": 1.23, + "grad_norm": 0.6008965373039246, + "learning_rate": 0.00038480778914300775, + "loss": 3.2338, + "step": 25023 + }, + { + "epoch": 1.23, + "grad_norm": 0.5736437439918518, + "learning_rate": 0.000384793020843078, + "loss": 3.1751, + "step": 25024 + }, + { + "epoch": 1.23, + "grad_norm": 0.5647448301315308, + "learning_rate": 0.00038477825231981967, + "loss": 3.1197, + "step": 25025 + }, + { + "epoch": 1.23, + "grad_norm": 0.5257458686828613, + "learning_rate": 0.00038476348357327175, + "loss": 3.2591, + "step": 25026 + }, + { + "epoch": 1.23, + "grad_norm": 0.553209662437439, + "learning_rate": 0.00038474871460347315, + "loss": 3.3708, + "step": 25027 + }, + { + "epoch": 1.23, + "grad_norm": 0.6161184310913086, + "learning_rate": 0.00038473394541046276, + "loss": 2.9489, + "step": 25028 + }, + { + "epoch": 1.23, + "grad_norm": 0.5613665580749512, + "learning_rate": 0.0003847191759942794, + "loss": 2.9919, + "step": 25029 + }, + { + "epoch": 1.23, + "grad_norm": 0.5895677804946899, + "learning_rate": 0.00038470440635496214, + "loss": 2.8271, + "step": 25030 + }, + { + "epoch": 1.23, + "grad_norm": 0.5605174899101257, + "learning_rate": 0.00038468963649254976, + "loss": 3.0974, + "step": 25031 + }, + { + "epoch": 1.23, + "grad_norm": 0.5734592080116272, + "learning_rate": 0.00038467486640708106, + "loss": 2.9672, + "step": 25032 + }, + { + "epoch": 1.23, + "grad_norm": 0.6092544794082642, + "learning_rate": 0.0003846600960985953, + "loss": 3.1621, + "step": 25033 + }, + { + "epoch": 1.23, + "grad_norm": 0.5776329636573792, + "learning_rate": 0.0003846453255671309, + "loss": 3.0484, + "step": 25034 + }, + { + "epoch": 1.23, + "grad_norm": 0.555668294429779, + "learning_rate": 0.000384630554812727, + "loss": 2.8431, + "step": 25035 + }, + { + "epoch": 1.23, + "grad_norm": 0.5371507406234741, + "learning_rate": 0.00038461578383542266, + "loss": 3.2272, + "step": 25036 + }, + { + "epoch": 1.23, + "grad_norm": 0.5646740198135376, + "learning_rate": 0.00038460101263525654, + "loss": 3.1364, + "step": 25037 + }, + { + "epoch": 1.23, + "grad_norm": 0.5981430411338806, + "learning_rate": 0.0003845862412122677, + "loss": 3.0595, + "step": 25038 + }, + { + "epoch": 1.23, + "grad_norm": 0.5709407329559326, + "learning_rate": 0.00038457146956649496, + "loss": 3.3635, + "step": 25039 + }, + { + "epoch": 1.23, + "grad_norm": 0.5592368245124817, + "learning_rate": 0.00038455669769797723, + "loss": 3.0549, + "step": 25040 + }, + { + "epoch": 1.23, + "grad_norm": 0.5950730443000793, + "learning_rate": 0.00038454192560675337, + "loss": 3.0904, + "step": 25041 + }, + { + "epoch": 1.23, + "grad_norm": 0.5664187073707581, + "learning_rate": 0.00038452715329286236, + "loss": 3.2247, + "step": 25042 + }, + { + "epoch": 1.23, + "grad_norm": 0.5902990102767944, + "learning_rate": 0.00038451238075634317, + "loss": 2.9901, + "step": 25043 + }, + { + "epoch": 1.23, + "grad_norm": 0.5837813019752502, + "learning_rate": 0.00038449760799723467, + "loss": 3.281, + "step": 25044 + }, + { + "epoch": 1.23, + "grad_norm": 0.5957124829292297, + "learning_rate": 0.0003844828350155756, + "loss": 3.1995, + "step": 25045 + }, + { + "epoch": 1.23, + "grad_norm": 0.5741605758666992, + "learning_rate": 0.000384468061811405, + "loss": 3.0371, + "step": 25046 + }, + { + "epoch": 1.23, + "grad_norm": 0.5534799098968506, + "learning_rate": 0.00038445328838476197, + "loss": 3.2169, + "step": 25047 + }, + { + "epoch": 1.23, + "grad_norm": 0.5551855564117432, + "learning_rate": 0.00038443851473568503, + "loss": 2.9418, + "step": 25048 + }, + { + "epoch": 1.23, + "grad_norm": 0.5379355549812317, + "learning_rate": 0.0003844237408642134, + "loss": 3.0535, + "step": 25049 + }, + { + "epoch": 1.23, + "grad_norm": 0.5475854873657227, + "learning_rate": 0.0003844089667703858, + "loss": 2.9308, + "step": 25050 + }, + { + "epoch": 1.23, + "grad_norm": 0.5689952969551086, + "learning_rate": 0.0003843941924542413, + "loss": 2.939, + "step": 25051 + }, + { + "epoch": 1.23, + "grad_norm": 0.5918108224868774, + "learning_rate": 0.0003843794179158187, + "loss": 2.8464, + "step": 25052 + }, + { + "epoch": 1.23, + "grad_norm": 0.5874655842781067, + "learning_rate": 0.000384364643155157, + "loss": 3.1436, + "step": 25053 + }, + { + "epoch": 1.23, + "grad_norm": 0.535483181476593, + "learning_rate": 0.00038434986817229504, + "loss": 2.867, + "step": 25054 + }, + { + "epoch": 1.23, + "grad_norm": 0.6092949509620667, + "learning_rate": 0.0003843350929672719, + "loss": 3.1436, + "step": 25055 + }, + { + "epoch": 1.23, + "grad_norm": 0.5654955506324768, + "learning_rate": 0.0003843203175401261, + "loss": 3.0727, + "step": 25056 + }, + { + "epoch": 1.23, + "grad_norm": 0.5241906642913818, + "learning_rate": 0.0003843055418908969, + "loss": 2.7895, + "step": 25057 + }, + { + "epoch": 1.23, + "grad_norm": 0.577303409576416, + "learning_rate": 0.0003842907660196232, + "loss": 2.9957, + "step": 25058 + }, + { + "epoch": 1.23, + "grad_norm": 0.5571273565292358, + "learning_rate": 0.0003842759899263438, + "loss": 3.1266, + "step": 25059 + }, + { + "epoch": 1.23, + "grad_norm": 0.5683521628379822, + "learning_rate": 0.0003842612136110976, + "loss": 2.9504, + "step": 25060 + }, + { + "epoch": 1.23, + "grad_norm": 0.5134350657463074, + "learning_rate": 0.00038424643707392364, + "loss": 3.295, + "step": 25061 + }, + { + "epoch": 1.23, + "grad_norm": 0.6152227520942688, + "learning_rate": 0.0003842316603148608, + "loss": 2.764, + "step": 25062 + }, + { + "epoch": 1.23, + "grad_norm": 0.5723081231117249, + "learning_rate": 0.000384216883333948, + "loss": 3.0838, + "step": 25063 + }, + { + "epoch": 1.23, + "grad_norm": 0.5563600659370422, + "learning_rate": 0.000384202106131224, + "loss": 3.2504, + "step": 25064 + }, + { + "epoch": 1.23, + "grad_norm": 0.5456048250198364, + "learning_rate": 0.00038418732870672804, + "loss": 2.9962, + "step": 25065 + }, + { + "epoch": 1.23, + "grad_norm": 0.5585772395133972, + "learning_rate": 0.0003841725510604987, + "loss": 2.8871, + "step": 25066 + }, + { + "epoch": 1.23, + "grad_norm": 0.5854873061180115, + "learning_rate": 0.00038415777319257517, + "loss": 2.9557, + "step": 25067 + }, + { + "epoch": 1.23, + "grad_norm": 0.5794108510017395, + "learning_rate": 0.0003841429951029962, + "loss": 3.1969, + "step": 25068 + }, + { + "epoch": 1.23, + "grad_norm": 0.688174307346344, + "learning_rate": 0.0003841282167918008, + "loss": 3.1212, + "step": 25069 + }, + { + "epoch": 1.23, + "grad_norm": 0.5816777944564819, + "learning_rate": 0.0003841134382590279, + "loss": 3.0208, + "step": 25070 + }, + { + "epoch": 1.23, + "grad_norm": 0.5570030808448792, + "learning_rate": 0.00038409865950471634, + "loss": 3.1805, + "step": 25071 + }, + { + "epoch": 1.23, + "grad_norm": 0.5498480796813965, + "learning_rate": 0.00038408388052890507, + "loss": 3.0436, + "step": 25072 + }, + { + "epoch": 1.23, + "grad_norm": 0.5754635334014893, + "learning_rate": 0.0003840691013316331, + "loss": 3.1856, + "step": 25073 + }, + { + "epoch": 1.23, + "grad_norm": 0.5430830121040344, + "learning_rate": 0.0003840543219129393, + "loss": 2.9776, + "step": 25074 + }, + { + "epoch": 1.23, + "grad_norm": 0.5669942498207092, + "learning_rate": 0.0003840395422728626, + "loss": 3.1792, + "step": 25075 + }, + { + "epoch": 1.23, + "grad_norm": 0.5448963046073914, + "learning_rate": 0.0003840247624114418, + "loss": 3.0217, + "step": 25076 + }, + { + "epoch": 1.23, + "grad_norm": 0.5780636072158813, + "learning_rate": 0.0003840099823287161, + "loss": 3.0021, + "step": 25077 + }, + { + "epoch": 1.23, + "grad_norm": 0.5799130201339722, + "learning_rate": 0.00038399520202472414, + "loss": 2.9277, + "step": 25078 + }, + { + "epoch": 1.23, + "grad_norm": 0.5543425679206848, + "learning_rate": 0.0003839804214995051, + "loss": 3.0757, + "step": 25079 + }, + { + "epoch": 1.23, + "grad_norm": 0.5854793787002563, + "learning_rate": 0.0003839656407530977, + "loss": 3.0283, + "step": 25080 + }, + { + "epoch": 1.23, + "grad_norm": 0.5901440978050232, + "learning_rate": 0.00038395085978554103, + "loss": 3.0345, + "step": 25081 + }, + { + "epoch": 1.23, + "grad_norm": 0.6366936564445496, + "learning_rate": 0.00038393607859687394, + "loss": 3.1309, + "step": 25082 + }, + { + "epoch": 1.23, + "grad_norm": 0.554716169834137, + "learning_rate": 0.0003839212971871354, + "loss": 3.1066, + "step": 25083 + }, + { + "epoch": 1.23, + "grad_norm": 0.5704500675201416, + "learning_rate": 0.0003839065155563643, + "loss": 2.9534, + "step": 25084 + }, + { + "epoch": 1.23, + "grad_norm": 0.5622331500053406, + "learning_rate": 0.00038389173370459955, + "loss": 3.0545, + "step": 25085 + }, + { + "epoch": 1.23, + "grad_norm": 0.5637291073799133, + "learning_rate": 0.0003838769516318801, + "loss": 3.018, + "step": 25086 + }, + { + "epoch": 1.23, + "grad_norm": 0.5698617696762085, + "learning_rate": 0.000383862169338245, + "loss": 3.1951, + "step": 25087 + }, + { + "epoch": 1.23, + "grad_norm": 0.5883769392967224, + "learning_rate": 0.00038384738682373297, + "loss": 3.2073, + "step": 25088 + }, + { + "epoch": 1.23, + "grad_norm": 0.54131680727005, + "learning_rate": 0.0003838326040883831, + "loss": 3.1775, + "step": 25089 + }, + { + "epoch": 1.23, + "grad_norm": 0.5682424306869507, + "learning_rate": 0.00038381782113223426, + "loss": 3.1067, + "step": 25090 + }, + { + "epoch": 1.23, + "grad_norm": 0.5473423004150391, + "learning_rate": 0.0003838030379553255, + "loss": 2.8472, + "step": 25091 + }, + { + "epoch": 1.23, + "grad_norm": 0.5303641557693481, + "learning_rate": 0.0003837882545576956, + "loss": 3.1275, + "step": 25092 + }, + { + "epoch": 1.23, + "grad_norm": 0.5696659684181213, + "learning_rate": 0.00038377347093938353, + "loss": 3.0899, + "step": 25093 + }, + { + "epoch": 1.23, + "grad_norm": 0.5956259965896606, + "learning_rate": 0.00038375868710042836, + "loss": 3.0071, + "step": 25094 + }, + { + "epoch": 1.23, + "grad_norm": 0.5872520208358765, + "learning_rate": 0.00038374390304086884, + "loss": 3.1197, + "step": 25095 + }, + { + "epoch": 1.23, + "grad_norm": 0.5226383805274963, + "learning_rate": 0.00038372911876074405, + "loss": 3.1719, + "step": 25096 + }, + { + "epoch": 1.23, + "grad_norm": 0.5661742687225342, + "learning_rate": 0.00038371433426009276, + "loss": 2.9626, + "step": 25097 + }, + { + "epoch": 1.23, + "grad_norm": 0.5806180834770203, + "learning_rate": 0.0003836995495389542, + "loss": 3.1586, + "step": 25098 + }, + { + "epoch": 1.23, + "grad_norm": 0.5616850256919861, + "learning_rate": 0.00038368476459736705, + "loss": 3.0818, + "step": 25099 + }, + { + "epoch": 1.23, + "grad_norm": 0.5661494731903076, + "learning_rate": 0.0003836699794353703, + "loss": 3.0206, + "step": 25100 + }, + { + "epoch": 1.23, + "grad_norm": 0.5832769274711609, + "learning_rate": 0.000383655194053003, + "loss": 2.8287, + "step": 25101 + }, + { + "epoch": 1.23, + "grad_norm": 0.5530614256858826, + "learning_rate": 0.000383640408450304, + "loss": 3.0402, + "step": 25102 + }, + { + "epoch": 1.23, + "grad_norm": 0.5663602352142334, + "learning_rate": 0.00038362562262731224, + "loss": 3.2647, + "step": 25103 + }, + { + "epoch": 1.23, + "grad_norm": 0.5610358119010925, + "learning_rate": 0.00038361083658406666, + "loss": 3.0342, + "step": 25104 + }, + { + "epoch": 1.23, + "grad_norm": 0.5962334275245667, + "learning_rate": 0.0003835960503206063, + "loss": 3.039, + "step": 25105 + }, + { + "epoch": 1.23, + "grad_norm": 0.5851008296012878, + "learning_rate": 0.00038358126383697, + "loss": 2.7736, + "step": 25106 + }, + { + "epoch": 1.23, + "grad_norm": 0.6189122796058655, + "learning_rate": 0.0003835664771331967, + "loss": 2.9924, + "step": 25107 + }, + { + "epoch": 1.23, + "grad_norm": 0.5865572094917297, + "learning_rate": 0.00038355169020932534, + "loss": 3.1454, + "step": 25108 + }, + { + "epoch": 1.23, + "grad_norm": 0.5793099403381348, + "learning_rate": 0.00038353690306539506, + "loss": 3.0783, + "step": 25109 + }, + { + "epoch": 1.23, + "grad_norm": 0.577268660068512, + "learning_rate": 0.00038352211570144454, + "loss": 3.1842, + "step": 25110 + }, + { + "epoch": 1.23, + "grad_norm": 0.524253249168396, + "learning_rate": 0.0003835073281175129, + "loss": 3.1753, + "step": 25111 + }, + { + "epoch": 1.23, + "grad_norm": 0.553202748298645, + "learning_rate": 0.0003834925403136389, + "loss": 2.9092, + "step": 25112 + }, + { + "epoch": 1.23, + "grad_norm": 0.5760665535926819, + "learning_rate": 0.0003834777522898618, + "loss": 3.1152, + "step": 25113 + }, + { + "epoch": 1.23, + "grad_norm": 0.5376695990562439, + "learning_rate": 0.00038346296404622024, + "loss": 3.0326, + "step": 25114 + }, + { + "epoch": 1.23, + "grad_norm": 0.5973801612854004, + "learning_rate": 0.00038344817558275335, + "loss": 3.1555, + "step": 25115 + }, + { + "epoch": 1.23, + "grad_norm": 0.5529061555862427, + "learning_rate": 0.00038343338689950004, + "loss": 3.1566, + "step": 25116 + }, + { + "epoch": 1.23, + "grad_norm": 0.6026521325111389, + "learning_rate": 0.0003834185979964992, + "loss": 3.1871, + "step": 25117 + }, + { + "epoch": 1.23, + "grad_norm": 0.5608975887298584, + "learning_rate": 0.0003834038088737898, + "loss": 2.9394, + "step": 25118 + }, + { + "epoch": 1.23, + "grad_norm": 0.530158519744873, + "learning_rate": 0.00038338901953141087, + "loss": 3.0237, + "step": 25119 + }, + { + "epoch": 1.23, + "grad_norm": 0.5642078518867493, + "learning_rate": 0.0003833742299694014, + "loss": 3.2089, + "step": 25120 + }, + { + "epoch": 1.23, + "grad_norm": 0.526268720626831, + "learning_rate": 0.0003833594401878001, + "loss": 3.1074, + "step": 25121 + }, + { + "epoch": 1.23, + "grad_norm": 0.5271223187446594, + "learning_rate": 0.00038334465018664615, + "loss": 3.1286, + "step": 25122 + }, + { + "epoch": 1.23, + "grad_norm": 0.5779740214347839, + "learning_rate": 0.0003833298599659783, + "loss": 3.0723, + "step": 25123 + }, + { + "epoch": 1.23, + "grad_norm": 0.5797901749610901, + "learning_rate": 0.00038331506952583584, + "loss": 3.1721, + "step": 25124 + }, + { + "epoch": 1.23, + "grad_norm": 0.5432953834533691, + "learning_rate": 0.00038330027886625744, + "loss": 2.9935, + "step": 25125 + }, + { + "epoch": 1.23, + "grad_norm": 0.5544005632400513, + "learning_rate": 0.0003832854879872821, + "loss": 2.8895, + "step": 25126 + }, + { + "epoch": 1.23, + "grad_norm": 0.5620940327644348, + "learning_rate": 0.00038327069688894885, + "loss": 3.053, + "step": 25127 + }, + { + "epoch": 1.23, + "grad_norm": 0.5733068585395813, + "learning_rate": 0.00038325590557129665, + "loss": 2.9839, + "step": 25128 + }, + { + "epoch": 1.23, + "grad_norm": 0.5551736354827881, + "learning_rate": 0.00038324111403436435, + "loss": 3.268, + "step": 25129 + }, + { + "epoch": 1.23, + "grad_norm": 0.584686279296875, + "learning_rate": 0.00038322632227819097, + "loss": 3.2248, + "step": 25130 + }, + { + "epoch": 1.23, + "grad_norm": 0.6471843123435974, + "learning_rate": 0.00038321153030281555, + "loss": 3.0796, + "step": 25131 + }, + { + "epoch": 1.23, + "grad_norm": 0.5832377672195435, + "learning_rate": 0.000383196738108277, + "loss": 3.0214, + "step": 25132 + }, + { + "epoch": 1.23, + "grad_norm": 0.5637081861495972, + "learning_rate": 0.0003831819456946141, + "loss": 3.1815, + "step": 25133 + }, + { + "epoch": 1.23, + "grad_norm": 0.5314174890518188, + "learning_rate": 0.00038316715306186604, + "loss": 2.947, + "step": 25134 + }, + { + "epoch": 1.23, + "grad_norm": 0.564139187335968, + "learning_rate": 0.0003831523602100717, + "loss": 3.0108, + "step": 25135 + }, + { + "epoch": 1.23, + "grad_norm": 0.5853064656257629, + "learning_rate": 0.00038313756713927004, + "loss": 2.963, + "step": 25136 + }, + { + "epoch": 1.23, + "grad_norm": 0.5824783444404602, + "learning_rate": 0.0003831227738495001, + "loss": 3.0401, + "step": 25137 + }, + { + "epoch": 1.23, + "grad_norm": 0.590248703956604, + "learning_rate": 0.0003831079803408007, + "loss": 3.1766, + "step": 25138 + }, + { + "epoch": 1.23, + "grad_norm": 0.5577486157417297, + "learning_rate": 0.0003830931866132109, + "loss": 2.9757, + "step": 25139 + }, + { + "epoch": 1.23, + "grad_norm": 0.5475689768791199, + "learning_rate": 0.0003830783926667696, + "loss": 3.2508, + "step": 25140 + }, + { + "epoch": 1.23, + "grad_norm": 0.5604707598686218, + "learning_rate": 0.0003830635985015158, + "loss": 2.9826, + "step": 25141 + }, + { + "epoch": 1.23, + "grad_norm": 0.5635944604873657, + "learning_rate": 0.0003830488041174885, + "loss": 3.0229, + "step": 25142 + }, + { + "epoch": 1.23, + "grad_norm": 0.5407227873802185, + "learning_rate": 0.00038303400951472666, + "loss": 2.8838, + "step": 25143 + }, + { + "epoch": 1.23, + "grad_norm": 0.5590699315071106, + "learning_rate": 0.0003830192146932692, + "loss": 3.2534, + "step": 25144 + }, + { + "epoch": 1.23, + "grad_norm": 0.5566086769104004, + "learning_rate": 0.00038300441965315505, + "loss": 2.9712, + "step": 25145 + }, + { + "epoch": 1.23, + "grad_norm": 0.5474108457565308, + "learning_rate": 0.0003829896243944234, + "loss": 3.1384, + "step": 25146 + }, + { + "epoch": 1.23, + "grad_norm": 0.5965452790260315, + "learning_rate": 0.00038297482891711286, + "loss": 3.0701, + "step": 25147 + }, + { + "epoch": 1.23, + "grad_norm": 0.5965765714645386, + "learning_rate": 0.0003829600332212626, + "loss": 3.038, + "step": 25148 + }, + { + "epoch": 1.23, + "grad_norm": 0.547390878200531, + "learning_rate": 0.00038294523730691174, + "loss": 2.8804, + "step": 25149 + }, + { + "epoch": 1.23, + "grad_norm": 0.5904244184494019, + "learning_rate": 0.00038293044117409896, + "loss": 2.9532, + "step": 25150 + }, + { + "epoch": 1.23, + "grad_norm": 0.5980238914489746, + "learning_rate": 0.00038291564482286335, + "loss": 3.1317, + "step": 25151 + }, + { + "epoch": 1.23, + "grad_norm": 0.6068405508995056, + "learning_rate": 0.00038290084825324394, + "loss": 3.1838, + "step": 25152 + }, + { + "epoch": 1.23, + "grad_norm": 0.5584993362426758, + "learning_rate": 0.00038288605146527966, + "loss": 3.1354, + "step": 25153 + }, + { + "epoch": 1.23, + "grad_norm": 0.5200023651123047, + "learning_rate": 0.00038287125445900943, + "loss": 3.1367, + "step": 25154 + }, + { + "epoch": 1.23, + "grad_norm": 0.5400229096412659, + "learning_rate": 0.00038285645723447223, + "loss": 3.1147, + "step": 25155 + }, + { + "epoch": 1.23, + "grad_norm": 0.6204379200935364, + "learning_rate": 0.0003828416597917071, + "loss": 3.0594, + "step": 25156 + }, + { + "epoch": 1.23, + "grad_norm": 0.5439574122428894, + "learning_rate": 0.0003828268621307531, + "loss": 3.294, + "step": 25157 + }, + { + "epoch": 1.23, + "grad_norm": 0.5771875381469727, + "learning_rate": 0.00038281206425164893, + "loss": 3.033, + "step": 25158 + }, + { + "epoch": 1.23, + "grad_norm": 0.5748353004455566, + "learning_rate": 0.0003827972661544337, + "loss": 3.3069, + "step": 25159 + }, + { + "epoch": 1.23, + "grad_norm": 0.5352070331573486, + "learning_rate": 0.00038278246783914657, + "loss": 3.2755, + "step": 25160 + }, + { + "epoch": 1.23, + "grad_norm": 0.533769965171814, + "learning_rate": 0.00038276766930582626, + "loss": 3.2693, + "step": 25161 + }, + { + "epoch": 1.23, + "grad_norm": 0.5342288017272949, + "learning_rate": 0.0003827528705545118, + "loss": 3.1622, + "step": 25162 + }, + { + "epoch": 1.23, + "grad_norm": 0.5443651676177979, + "learning_rate": 0.00038273807158524216, + "loss": 3.1329, + "step": 25163 + }, + { + "epoch": 1.23, + "grad_norm": 0.5513403415679932, + "learning_rate": 0.0003827232723980566, + "loss": 3.0414, + "step": 25164 + }, + { + "epoch": 1.23, + "grad_norm": 0.5552654266357422, + "learning_rate": 0.00038270847299299367, + "loss": 3.0813, + "step": 25165 + }, + { + "epoch": 1.23, + "grad_norm": 0.5734013319015503, + "learning_rate": 0.00038269367337009257, + "loss": 3.0957, + "step": 25166 + }, + { + "epoch": 1.23, + "grad_norm": 0.6037915349006653, + "learning_rate": 0.0003826788735293923, + "loss": 3.2297, + "step": 25167 + }, + { + "epoch": 1.23, + "grad_norm": 0.5523087382316589, + "learning_rate": 0.0003826640734709317, + "loss": 2.9195, + "step": 25168 + }, + { + "epoch": 1.23, + "grad_norm": 0.5460664629936218, + "learning_rate": 0.00038264927319474985, + "loss": 3.202, + "step": 25169 + }, + { + "epoch": 1.23, + "grad_norm": 0.5905874967575073, + "learning_rate": 0.0003826344727008858, + "loss": 2.9142, + "step": 25170 + }, + { + "epoch": 1.23, + "grad_norm": 0.5918222665786743, + "learning_rate": 0.0003826196719893785, + "loss": 3.0206, + "step": 25171 + }, + { + "epoch": 1.23, + "grad_norm": 0.539842426776886, + "learning_rate": 0.00038260487106026676, + "loss": 3.134, + "step": 25172 + }, + { + "epoch": 1.23, + "grad_norm": 0.5324265956878662, + "learning_rate": 0.00038259006991358966, + "loss": 3.1243, + "step": 25173 + }, + { + "epoch": 1.23, + "grad_norm": 0.5232551097869873, + "learning_rate": 0.00038257526854938625, + "loss": 3.4041, + "step": 25174 + }, + { + "epoch": 1.23, + "grad_norm": 0.5780461430549622, + "learning_rate": 0.0003825604669676955, + "loss": 3.1711, + "step": 25175 + }, + { + "epoch": 1.23, + "grad_norm": 0.5784001350402832, + "learning_rate": 0.00038254566516855633, + "loss": 3.0586, + "step": 25176 + }, + { + "epoch": 1.23, + "grad_norm": 0.5395734310150146, + "learning_rate": 0.0003825308631520079, + "loss": 3.1729, + "step": 25177 + }, + { + "epoch": 1.23, + "grad_norm": 0.57415372133255, + "learning_rate": 0.0003825160609180889, + "loss": 2.9213, + "step": 25178 + }, + { + "epoch": 1.23, + "grad_norm": 0.5726572275161743, + "learning_rate": 0.0003825012584668385, + "loss": 3.1291, + "step": 25179 + }, + { + "epoch": 1.23, + "grad_norm": 0.571429431438446, + "learning_rate": 0.00038248645579829567, + "loss": 2.9315, + "step": 25180 + }, + { + "epoch": 1.23, + "grad_norm": 0.5924422144889832, + "learning_rate": 0.0003824716529124994, + "loss": 3.3614, + "step": 25181 + }, + { + "epoch": 1.23, + "grad_norm": 0.5415179133415222, + "learning_rate": 0.0003824568498094887, + "loss": 3.1108, + "step": 25182 + }, + { + "epoch": 1.23, + "grad_norm": 0.5803284645080566, + "learning_rate": 0.0003824420464893026, + "loss": 3.2433, + "step": 25183 + }, + { + "epoch": 1.23, + "grad_norm": 0.5539078712463379, + "learning_rate": 0.00038242724295197984, + "loss": 2.8731, + "step": 25184 + }, + { + "epoch": 1.23, + "grad_norm": 0.549834668636322, + "learning_rate": 0.00038241243919755967, + "loss": 3.0925, + "step": 25185 + }, + { + "epoch": 1.23, + "grad_norm": 0.5797627568244934, + "learning_rate": 0.0003823976352260811, + "loss": 3.2575, + "step": 25186 + }, + { + "epoch": 1.23, + "grad_norm": 0.57608562707901, + "learning_rate": 0.0003823828310375829, + "loss": 2.9751, + "step": 25187 + }, + { + "epoch": 1.23, + "grad_norm": 0.587173342704773, + "learning_rate": 0.0003823680266321042, + "loss": 3.1131, + "step": 25188 + }, + { + "epoch": 1.23, + "grad_norm": 0.5533391833305359, + "learning_rate": 0.00038235322200968396, + "loss": 3.2133, + "step": 25189 + }, + { + "epoch": 1.23, + "grad_norm": 0.5643752813339233, + "learning_rate": 0.00038233841717036116, + "loss": 3.2175, + "step": 25190 + }, + { + "epoch": 1.23, + "grad_norm": 0.5208031535148621, + "learning_rate": 0.00038232361211417484, + "loss": 3.1639, + "step": 25191 + }, + { + "epoch": 1.23, + "grad_norm": 0.5685301423072815, + "learning_rate": 0.00038230880684116396, + "loss": 3.0885, + "step": 25192 + }, + { + "epoch": 1.23, + "grad_norm": 0.5917596817016602, + "learning_rate": 0.0003822940013513675, + "loss": 2.8533, + "step": 25193 + }, + { + "epoch": 1.23, + "grad_norm": 0.5756223797798157, + "learning_rate": 0.0003822791956448246, + "loss": 3.0704, + "step": 25194 + }, + { + "epoch": 1.23, + "grad_norm": 0.5527355074882507, + "learning_rate": 0.0003822643897215739, + "loss": 2.929, + "step": 25195 + }, + { + "epoch": 1.23, + "grad_norm": 0.6033982634544373, + "learning_rate": 0.0003822495835816548, + "loss": 3.028, + "step": 25196 + }, + { + "epoch": 1.23, + "grad_norm": 0.5483729243278503, + "learning_rate": 0.00038223477722510623, + "loss": 3.1289, + "step": 25197 + }, + { + "epoch": 1.23, + "grad_norm": 0.5902836918830872, + "learning_rate": 0.0003822199706519669, + "loss": 3.0741, + "step": 25198 + }, + { + "epoch": 1.23, + "grad_norm": 0.5841925740242004, + "learning_rate": 0.000382205163862276, + "loss": 3.0422, + "step": 25199 + }, + { + "epoch": 1.23, + "grad_norm": 0.5686092972755432, + "learning_rate": 0.0003821903568560726, + "loss": 3.0314, + "step": 25200 + }, + { + "epoch": 1.24, + "grad_norm": 0.5432037115097046, + "learning_rate": 0.0003821755496333956, + "loss": 3.2488, + "step": 25201 + }, + { + "epoch": 1.24, + "grad_norm": 0.546054482460022, + "learning_rate": 0.000382160742194284, + "loss": 2.9305, + "step": 25202 + }, + { + "epoch": 1.24, + "grad_norm": 0.5392909049987793, + "learning_rate": 0.0003821459345387768, + "loss": 3.0723, + "step": 25203 + }, + { + "epoch": 1.24, + "grad_norm": 0.5378334522247314, + "learning_rate": 0.00038213112666691303, + "loss": 2.9682, + "step": 25204 + }, + { + "epoch": 1.24, + "grad_norm": 0.5470960736274719, + "learning_rate": 0.0003821163185787317, + "loss": 3.1647, + "step": 25205 + }, + { + "epoch": 1.24, + "grad_norm": 0.5794514417648315, + "learning_rate": 0.00038210151027427176, + "loss": 3.0332, + "step": 25206 + }, + { + "epoch": 1.24, + "grad_norm": 0.611207127571106, + "learning_rate": 0.00038208670175357226, + "loss": 3.2109, + "step": 25207 + }, + { + "epoch": 1.24, + "grad_norm": 0.5396631956100464, + "learning_rate": 0.00038207189301667227, + "loss": 3.1659, + "step": 25208 + }, + { + "epoch": 1.24, + "grad_norm": 0.5904266238212585, + "learning_rate": 0.00038205708406361056, + "loss": 3.0525, + "step": 25209 + }, + { + "epoch": 1.24, + "grad_norm": 0.5775328278541565, + "learning_rate": 0.0003820422748944263, + "loss": 3.0498, + "step": 25210 + }, + { + "epoch": 1.24, + "grad_norm": 0.5564374327659607, + "learning_rate": 0.00038202746550915857, + "loss": 3.1151, + "step": 25211 + }, + { + "epoch": 1.24, + "grad_norm": 0.5537264943122864, + "learning_rate": 0.00038201265590784625, + "loss": 3.2185, + "step": 25212 + }, + { + "epoch": 1.24, + "grad_norm": 0.5900477170944214, + "learning_rate": 0.00038199784609052834, + "loss": 3.1797, + "step": 25213 + }, + { + "epoch": 1.24, + "grad_norm": 0.5341221690177917, + "learning_rate": 0.00038198303605724385, + "loss": 3.1518, + "step": 25214 + }, + { + "epoch": 1.24, + "grad_norm": 0.5363330245018005, + "learning_rate": 0.00038196822580803194, + "loss": 2.8538, + "step": 25215 + }, + { + "epoch": 1.24, + "grad_norm": 0.5329729914665222, + "learning_rate": 0.0003819534153429314, + "loss": 3.1315, + "step": 25216 + }, + { + "epoch": 1.24, + "grad_norm": 0.5687403082847595, + "learning_rate": 0.0003819386046619814, + "loss": 3.1472, + "step": 25217 + }, + { + "epoch": 1.24, + "grad_norm": 0.5683347582817078, + "learning_rate": 0.00038192379376522087, + "loss": 3.2782, + "step": 25218 + }, + { + "epoch": 1.24, + "grad_norm": 0.5241825580596924, + "learning_rate": 0.0003819089826526889, + "loss": 2.9968, + "step": 25219 + }, + { + "epoch": 1.24, + "grad_norm": 0.5936827659606934, + "learning_rate": 0.00038189417132442424, + "loss": 2.9825, + "step": 25220 + }, + { + "epoch": 1.24, + "grad_norm": 0.5642827749252319, + "learning_rate": 0.0003818793597804662, + "loss": 2.9358, + "step": 25221 + }, + { + "epoch": 1.24, + "grad_norm": 0.5551975965499878, + "learning_rate": 0.00038186454802085373, + "loss": 3.0648, + "step": 25222 + }, + { + "epoch": 1.24, + "grad_norm": 0.5875920057296753, + "learning_rate": 0.00038184973604562573, + "loss": 3.0493, + "step": 25223 + }, + { + "epoch": 1.24, + "grad_norm": 0.5492988228797913, + "learning_rate": 0.00038183492385482137, + "loss": 3.1779, + "step": 25224 + }, + { + "epoch": 1.24, + "grad_norm": 0.5400311350822449, + "learning_rate": 0.0003818201114484794, + "loss": 3.2273, + "step": 25225 + }, + { + "epoch": 1.24, + "grad_norm": 0.5957015156745911, + "learning_rate": 0.00038180529882663914, + "loss": 3.1921, + "step": 25226 + }, + { + "epoch": 1.24, + "grad_norm": 0.5321521162986755, + "learning_rate": 0.00038179048598933944, + "loss": 3.0101, + "step": 25227 + }, + { + "epoch": 1.24, + "grad_norm": 0.5768882036209106, + "learning_rate": 0.00038177567293661935, + "loss": 2.9978, + "step": 25228 + }, + { + "epoch": 1.24, + "grad_norm": 0.5608060359954834, + "learning_rate": 0.0003817608596685179, + "loss": 3.0694, + "step": 25229 + }, + { + "epoch": 1.24, + "grad_norm": 0.5481438040733337, + "learning_rate": 0.00038174604618507397, + "loss": 3.0578, + "step": 25230 + }, + { + "epoch": 1.24, + "grad_norm": 0.570463240146637, + "learning_rate": 0.00038173123248632675, + "loss": 3.1923, + "step": 25231 + }, + { + "epoch": 1.24, + "grad_norm": 0.6016967296600342, + "learning_rate": 0.0003817164185723152, + "loss": 3.0681, + "step": 25232 + }, + { + "epoch": 1.24, + "grad_norm": 0.5828872919082642, + "learning_rate": 0.00038170160444307835, + "loss": 3.0874, + "step": 25233 + }, + { + "epoch": 1.24, + "grad_norm": 0.5865387916564941, + "learning_rate": 0.00038168679009865523, + "loss": 3.0659, + "step": 25234 + }, + { + "epoch": 1.24, + "grad_norm": 0.5488494038581848, + "learning_rate": 0.00038167197553908467, + "loss": 2.983, + "step": 25235 + }, + { + "epoch": 1.24, + "grad_norm": 0.5348222255706787, + "learning_rate": 0.00038165716076440595, + "loss": 3.0415, + "step": 25236 + }, + { + "epoch": 1.24, + "grad_norm": 0.5851247906684875, + "learning_rate": 0.0003816423457746581, + "loss": 3.0879, + "step": 25237 + }, + { + "epoch": 1.24, + "grad_norm": 0.6187082529067993, + "learning_rate": 0.00038162753056987986, + "loss": 3.1523, + "step": 25238 + }, + { + "epoch": 1.24, + "grad_norm": 0.5365039110183716, + "learning_rate": 0.00038161271515011044, + "loss": 3.1376, + "step": 25239 + }, + { + "epoch": 1.24, + "grad_norm": 0.5412645936012268, + "learning_rate": 0.00038159789951538884, + "loss": 2.9421, + "step": 25240 + }, + { + "epoch": 1.24, + "grad_norm": 0.5671802759170532, + "learning_rate": 0.00038158308366575413, + "loss": 3.0663, + "step": 25241 + }, + { + "epoch": 1.24, + "grad_norm": 0.5550203919410706, + "learning_rate": 0.00038156826760124524, + "loss": 2.914, + "step": 25242 + }, + { + "epoch": 1.24, + "grad_norm": 0.5574257969856262, + "learning_rate": 0.0003815534513219012, + "loss": 2.9547, + "step": 25243 + }, + { + "epoch": 1.24, + "grad_norm": 0.5445705652236938, + "learning_rate": 0.0003815386348277611, + "loss": 3.0675, + "step": 25244 + }, + { + "epoch": 1.24, + "grad_norm": 0.5782180428504944, + "learning_rate": 0.00038152381811886393, + "loss": 3.0991, + "step": 25245 + }, + { + "epoch": 1.24, + "grad_norm": 0.5452681183815002, + "learning_rate": 0.0003815090011952487, + "loss": 2.943, + "step": 25246 + }, + { + "epoch": 1.24, + "grad_norm": 0.5488862991333008, + "learning_rate": 0.00038149418405695443, + "loss": 2.8228, + "step": 25247 + }, + { + "epoch": 1.24, + "grad_norm": 0.5082352161407471, + "learning_rate": 0.0003814793667040203, + "loss": 3.0863, + "step": 25248 + }, + { + "epoch": 1.24, + "grad_norm": 0.5326624512672424, + "learning_rate": 0.000381464549136485, + "loss": 3.1076, + "step": 25249 + }, + { + "epoch": 1.24, + "grad_norm": 0.568250834941864, + "learning_rate": 0.0003814497313543879, + "loss": 3.1315, + "step": 25250 + }, + { + "epoch": 1.24, + "grad_norm": 0.5677963495254517, + "learning_rate": 0.00038143491335776783, + "loss": 3.137, + "step": 25251 + }, + { + "epoch": 1.24, + "grad_norm": 0.5616076588630676, + "learning_rate": 0.0003814200951466638, + "loss": 3.0442, + "step": 25252 + }, + { + "epoch": 1.24, + "grad_norm": 0.5570437908172607, + "learning_rate": 0.00038140527672111496, + "loss": 3.0987, + "step": 25253 + }, + { + "epoch": 1.24, + "grad_norm": 0.5529975891113281, + "learning_rate": 0.00038139045808116036, + "loss": 3.2037, + "step": 25254 + }, + { + "epoch": 1.24, + "grad_norm": 0.6022285223007202, + "learning_rate": 0.0003813756392268389, + "loss": 3.0878, + "step": 25255 + }, + { + "epoch": 1.24, + "grad_norm": 0.5654017925262451, + "learning_rate": 0.00038136082015818965, + "loss": 3.1599, + "step": 25256 + }, + { + "epoch": 1.24, + "grad_norm": 0.5939410924911499, + "learning_rate": 0.00038134600087525166, + "loss": 3.2432, + "step": 25257 + }, + { + "epoch": 1.24, + "grad_norm": 0.5679937601089478, + "learning_rate": 0.000381331181378064, + "loss": 2.9138, + "step": 25258 + }, + { + "epoch": 1.24, + "grad_norm": 0.584583580493927, + "learning_rate": 0.00038131636166666567, + "loss": 3.2667, + "step": 25259 + }, + { + "epoch": 1.24, + "grad_norm": 0.5996798872947693, + "learning_rate": 0.0003813015417410957, + "loss": 3.2236, + "step": 25260 + }, + { + "epoch": 1.24, + "grad_norm": 0.5446107387542725, + "learning_rate": 0.00038128672160139295, + "loss": 3.1285, + "step": 25261 + }, + { + "epoch": 1.24, + "grad_norm": 0.5301439762115479, + "learning_rate": 0.0003812719012475968, + "loss": 3.0362, + "step": 25262 + }, + { + "epoch": 1.24, + "grad_norm": 0.5626558661460876, + "learning_rate": 0.000381257080679746, + "loss": 3.0798, + "step": 25263 + }, + { + "epoch": 1.24, + "grad_norm": 0.5594674944877625, + "learning_rate": 0.00038124225989787975, + "loss": 2.9337, + "step": 25264 + }, + { + "epoch": 1.24, + "grad_norm": 0.5297481417655945, + "learning_rate": 0.00038122743890203704, + "loss": 3.1576, + "step": 25265 + }, + { + "epoch": 1.24, + "grad_norm": 0.5789366960525513, + "learning_rate": 0.00038121261769225685, + "loss": 3.0484, + "step": 25266 + }, + { + "epoch": 1.24, + "grad_norm": 0.5611054301261902, + "learning_rate": 0.0003811977962685782, + "loss": 3.1415, + "step": 25267 + }, + { + "epoch": 1.24, + "grad_norm": 0.5735259056091309, + "learning_rate": 0.0003811829746310403, + "loss": 3.061, + "step": 25268 + }, + { + "epoch": 1.24, + "grad_norm": 0.5358028411865234, + "learning_rate": 0.00038116815277968195, + "loss": 3.0565, + "step": 25269 + }, + { + "epoch": 1.24, + "grad_norm": 0.5914501547813416, + "learning_rate": 0.00038115333071454246, + "loss": 3.1501, + "step": 25270 + }, + { + "epoch": 1.24, + "grad_norm": 0.5669057369232178, + "learning_rate": 0.0003811385084356606, + "loss": 3.2603, + "step": 25271 + }, + { + "epoch": 1.24, + "grad_norm": 0.549128532409668, + "learning_rate": 0.00038112368594307554, + "loss": 3.0044, + "step": 25272 + }, + { + "epoch": 1.24, + "grad_norm": 0.5490759015083313, + "learning_rate": 0.0003811088632368263, + "loss": 3.2027, + "step": 25273 + }, + { + "epoch": 1.24, + "grad_norm": 0.5359374284744263, + "learning_rate": 0.000381094040316952, + "loss": 3.2197, + "step": 25274 + }, + { + "epoch": 1.24, + "grad_norm": 0.5579591393470764, + "learning_rate": 0.0003810792171834915, + "loss": 3.1138, + "step": 25275 + }, + { + "epoch": 1.24, + "grad_norm": 0.5682498216629028, + "learning_rate": 0.00038106439383648396, + "loss": 3.0713, + "step": 25276 + }, + { + "epoch": 1.24, + "grad_norm": 0.5187976360321045, + "learning_rate": 0.0003810495702759685, + "loss": 2.9462, + "step": 25277 + }, + { + "epoch": 1.24, + "grad_norm": 0.5464273691177368, + "learning_rate": 0.00038103474650198396, + "loss": 2.9093, + "step": 25278 + }, + { + "epoch": 1.24, + "grad_norm": 0.5833659768104553, + "learning_rate": 0.0003810199225145695, + "loss": 3.0675, + "step": 25279 + }, + { + "epoch": 1.24, + "grad_norm": 0.5375162959098816, + "learning_rate": 0.0003810050983137642, + "loss": 2.9478, + "step": 25280 + }, + { + "epoch": 1.24, + "grad_norm": 0.5666972994804382, + "learning_rate": 0.00038099027389960704, + "loss": 2.9796, + "step": 25281 + }, + { + "epoch": 1.24, + "grad_norm": 0.526598334312439, + "learning_rate": 0.0003809754492721371, + "loss": 3.0346, + "step": 25282 + }, + { + "epoch": 1.24, + "grad_norm": 0.5676760077476501, + "learning_rate": 0.0003809606244313934, + "loss": 3.2404, + "step": 25283 + }, + { + "epoch": 1.24, + "grad_norm": 0.5630086064338684, + "learning_rate": 0.00038094579937741506, + "loss": 2.936, + "step": 25284 + }, + { + "epoch": 1.24, + "grad_norm": 0.5473533272743225, + "learning_rate": 0.00038093097411024106, + "loss": 3.0267, + "step": 25285 + }, + { + "epoch": 1.24, + "grad_norm": 0.6100202202796936, + "learning_rate": 0.0003809161486299103, + "loss": 3.1214, + "step": 25286 + }, + { + "epoch": 1.24, + "grad_norm": 0.5748114585876465, + "learning_rate": 0.0003809013229364621, + "loss": 3.1075, + "step": 25287 + }, + { + "epoch": 1.24, + "grad_norm": 0.567639172077179, + "learning_rate": 0.00038088649702993537, + "loss": 3.0915, + "step": 25288 + }, + { + "epoch": 1.24, + "grad_norm": 0.5674895644187927, + "learning_rate": 0.00038087167091036916, + "loss": 3.0159, + "step": 25289 + }, + { + "epoch": 1.24, + "grad_norm": 0.5389357209205627, + "learning_rate": 0.0003808568445778025, + "loss": 3.1525, + "step": 25290 + }, + { + "epoch": 1.24, + "grad_norm": 0.5887845754623413, + "learning_rate": 0.00038084201803227456, + "loss": 3.0113, + "step": 25291 + }, + { + "epoch": 1.24, + "grad_norm": 0.5666884183883667, + "learning_rate": 0.00038082719127382423, + "loss": 3.0803, + "step": 25292 + }, + { + "epoch": 1.24, + "grad_norm": 0.5322736501693726, + "learning_rate": 0.00038081236430249064, + "loss": 3.0178, + "step": 25293 + }, + { + "epoch": 1.24, + "grad_norm": 0.519856333732605, + "learning_rate": 0.0003807975371183129, + "loss": 3.0541, + "step": 25294 + }, + { + "epoch": 1.24, + "grad_norm": 0.5431399345397949, + "learning_rate": 0.0003807827097213299, + "loss": 3.0365, + "step": 25295 + }, + { + "epoch": 1.24, + "grad_norm": 0.5920628905296326, + "learning_rate": 0.0003807678821115809, + "loss": 3.0214, + "step": 25296 + }, + { + "epoch": 1.24, + "grad_norm": 0.5433076024055481, + "learning_rate": 0.0003807530542891047, + "loss": 2.8857, + "step": 25297 + }, + { + "epoch": 1.24, + "grad_norm": 0.5601280331611633, + "learning_rate": 0.0003807382262539406, + "loss": 2.8714, + "step": 25298 + }, + { + "epoch": 1.24, + "grad_norm": 0.5662012696266174, + "learning_rate": 0.00038072339800612753, + "loss": 2.8085, + "step": 25299 + }, + { + "epoch": 1.24, + "grad_norm": 0.5706889629364014, + "learning_rate": 0.0003807085695457046, + "loss": 3.1877, + "step": 25300 + }, + { + "epoch": 1.24, + "grad_norm": 0.5297551155090332, + "learning_rate": 0.0003806937408727108, + "loss": 3.0017, + "step": 25301 + }, + { + "epoch": 1.24, + "grad_norm": 0.5498740673065186, + "learning_rate": 0.0003806789119871852, + "loss": 3.1668, + "step": 25302 + }, + { + "epoch": 1.24, + "grad_norm": 0.6031630635261536, + "learning_rate": 0.0003806640828891669, + "loss": 3.1125, + "step": 25303 + }, + { + "epoch": 1.24, + "grad_norm": 0.5248180627822876, + "learning_rate": 0.00038064925357869493, + "loss": 2.9471, + "step": 25304 + }, + { + "epoch": 1.24, + "grad_norm": 0.5380234718322754, + "learning_rate": 0.0003806344240558083, + "loss": 3.0616, + "step": 25305 + }, + { + "epoch": 1.24, + "grad_norm": 0.544638991355896, + "learning_rate": 0.00038061959432054613, + "loss": 3.1263, + "step": 25306 + }, + { + "epoch": 1.24, + "grad_norm": 0.5599265694618225, + "learning_rate": 0.00038060476437294747, + "loss": 3.083, + "step": 25307 + }, + { + "epoch": 1.24, + "grad_norm": 0.5477656126022339, + "learning_rate": 0.0003805899342130514, + "loss": 3.1702, + "step": 25308 + }, + { + "epoch": 1.24, + "grad_norm": 0.5713855028152466, + "learning_rate": 0.00038057510384089695, + "loss": 3.1395, + "step": 25309 + }, + { + "epoch": 1.24, + "grad_norm": 0.5825148820877075, + "learning_rate": 0.00038056027325652315, + "loss": 3.1251, + "step": 25310 + }, + { + "epoch": 1.24, + "grad_norm": 0.5400784015655518, + "learning_rate": 0.00038054544245996915, + "loss": 3.0086, + "step": 25311 + }, + { + "epoch": 1.24, + "grad_norm": 0.5549777150154114, + "learning_rate": 0.00038053061145127385, + "loss": 3.0571, + "step": 25312 + }, + { + "epoch": 1.24, + "grad_norm": 0.5992767810821533, + "learning_rate": 0.0003805157802304766, + "loss": 3.0949, + "step": 25313 + }, + { + "epoch": 1.24, + "grad_norm": 0.5606101155281067, + "learning_rate": 0.0003805009487976161, + "loss": 3.0668, + "step": 25314 + }, + { + "epoch": 1.24, + "grad_norm": 0.6076589822769165, + "learning_rate": 0.00038048611715273166, + "loss": 3.0241, + "step": 25315 + }, + { + "epoch": 1.24, + "grad_norm": 0.5783843994140625, + "learning_rate": 0.00038047128529586225, + "loss": 3.0586, + "step": 25316 + }, + { + "epoch": 1.24, + "grad_norm": 0.5715208053588867, + "learning_rate": 0.000380456453227047, + "loss": 3.1446, + "step": 25317 + }, + { + "epoch": 1.24, + "grad_norm": 0.5512316823005676, + "learning_rate": 0.00038044162094632493, + "loss": 3.0362, + "step": 25318 + }, + { + "epoch": 1.24, + "grad_norm": 0.539901852607727, + "learning_rate": 0.0003804267884537351, + "loss": 3.0684, + "step": 25319 + }, + { + "epoch": 1.24, + "grad_norm": 0.5319052338600159, + "learning_rate": 0.0003804119557493166, + "loss": 3.0007, + "step": 25320 + }, + { + "epoch": 1.24, + "grad_norm": 0.5402718782424927, + "learning_rate": 0.0003803971228331085, + "loss": 2.9589, + "step": 25321 + }, + { + "epoch": 1.24, + "grad_norm": 0.6263967156410217, + "learning_rate": 0.0003803822897051498, + "loss": 3.0552, + "step": 25322 + }, + { + "epoch": 1.24, + "grad_norm": 0.5595197081565857, + "learning_rate": 0.00038036745636547964, + "loss": 2.8282, + "step": 25323 + }, + { + "epoch": 1.24, + "grad_norm": 0.5648815035820007, + "learning_rate": 0.0003803526228141371, + "loss": 2.9941, + "step": 25324 + }, + { + "epoch": 1.24, + "grad_norm": 0.6085686683654785, + "learning_rate": 0.00038033778905116123, + "loss": 3.0553, + "step": 25325 + }, + { + "epoch": 1.24, + "grad_norm": 0.5713744163513184, + "learning_rate": 0.000380322955076591, + "loss": 2.9707, + "step": 25326 + }, + { + "epoch": 1.24, + "grad_norm": 0.5762509107589722, + "learning_rate": 0.0003803081208904656, + "loss": 3.0925, + "step": 25327 + }, + { + "epoch": 1.24, + "grad_norm": 0.5664671063423157, + "learning_rate": 0.0003802932864928241, + "loss": 2.9792, + "step": 25328 + }, + { + "epoch": 1.24, + "grad_norm": 0.5856859087944031, + "learning_rate": 0.00038027845188370554, + "loss": 2.8393, + "step": 25329 + }, + { + "epoch": 1.24, + "grad_norm": 0.5801443457603455, + "learning_rate": 0.00038026361706314894, + "loss": 3.1843, + "step": 25330 + }, + { + "epoch": 1.24, + "grad_norm": 0.5778241157531738, + "learning_rate": 0.00038024878203119346, + "loss": 3.1812, + "step": 25331 + }, + { + "epoch": 1.24, + "grad_norm": 0.5924252271652222, + "learning_rate": 0.0003802339467878781, + "loss": 3.0418, + "step": 25332 + }, + { + "epoch": 1.24, + "grad_norm": 0.5770880579948425, + "learning_rate": 0.00038021911133324194, + "loss": 2.8706, + "step": 25333 + }, + { + "epoch": 1.24, + "grad_norm": 0.547100841999054, + "learning_rate": 0.0003802042756673241, + "loss": 3.2091, + "step": 25334 + }, + { + "epoch": 1.24, + "grad_norm": 0.6044512391090393, + "learning_rate": 0.0003801894397901637, + "loss": 3.153, + "step": 25335 + }, + { + "epoch": 1.24, + "grad_norm": 0.5438991189002991, + "learning_rate": 0.00038017460370179976, + "loss": 3.1416, + "step": 25336 + }, + { + "epoch": 1.24, + "grad_norm": 0.5397824645042419, + "learning_rate": 0.00038015976740227126, + "loss": 3.065, + "step": 25337 + }, + { + "epoch": 1.24, + "grad_norm": 0.6003246903419495, + "learning_rate": 0.0003801449308916173, + "loss": 2.9869, + "step": 25338 + }, + { + "epoch": 1.24, + "grad_norm": 0.5817223191261292, + "learning_rate": 0.00038013009416987714, + "loss": 3.0956, + "step": 25339 + }, + { + "epoch": 1.24, + "grad_norm": 0.5436729192733765, + "learning_rate": 0.00038011525723708974, + "loss": 3.1657, + "step": 25340 + }, + { + "epoch": 1.24, + "grad_norm": 0.5461451411247253, + "learning_rate": 0.0003801004200932941, + "loss": 2.9516, + "step": 25341 + }, + { + "epoch": 1.24, + "grad_norm": 0.5592874884605408, + "learning_rate": 0.0003800855827385294, + "loss": 3.01, + "step": 25342 + }, + { + "epoch": 1.24, + "grad_norm": 0.5277195572853088, + "learning_rate": 0.00038007074517283466, + "loss": 2.9353, + "step": 25343 + }, + { + "epoch": 1.24, + "grad_norm": 0.5816713571548462, + "learning_rate": 0.000380055907396249, + "loss": 3.2137, + "step": 25344 + }, + { + "epoch": 1.24, + "grad_norm": 0.5725088715553284, + "learning_rate": 0.00038004106940881145, + "loss": 2.9513, + "step": 25345 + }, + { + "epoch": 1.24, + "grad_norm": 0.5226502418518066, + "learning_rate": 0.0003800262312105612, + "loss": 2.9282, + "step": 25346 + }, + { + "epoch": 1.24, + "grad_norm": 0.5807681679725647, + "learning_rate": 0.0003800113928015373, + "loss": 3.0696, + "step": 25347 + }, + { + "epoch": 1.24, + "grad_norm": 0.5598049163818359, + "learning_rate": 0.00037999655418177866, + "loss": 3.0249, + "step": 25348 + }, + { + "epoch": 1.24, + "grad_norm": 0.5281652212142944, + "learning_rate": 0.00037998171535132456, + "loss": 3.0612, + "step": 25349 + }, + { + "epoch": 1.24, + "grad_norm": 0.5705523490905762, + "learning_rate": 0.00037996687631021404, + "loss": 3.0364, + "step": 25350 + }, + { + "epoch": 1.24, + "grad_norm": 0.599563717842102, + "learning_rate": 0.00037995203705848613, + "loss": 3.0559, + "step": 25351 + }, + { + "epoch": 1.24, + "grad_norm": 0.5675721168518066, + "learning_rate": 0.0003799371975961799, + "loss": 2.8777, + "step": 25352 + }, + { + "epoch": 1.24, + "grad_norm": 0.5328501462936401, + "learning_rate": 0.00037992235792333457, + "loss": 3.0203, + "step": 25353 + }, + { + "epoch": 1.24, + "grad_norm": 0.554772138595581, + "learning_rate": 0.000379907518039989, + "loss": 3.1119, + "step": 25354 + }, + { + "epoch": 1.24, + "grad_norm": 0.6034314632415771, + "learning_rate": 0.0003798926779461825, + "loss": 2.9382, + "step": 25355 + }, + { + "epoch": 1.24, + "grad_norm": 0.582343578338623, + "learning_rate": 0.000379877837641954, + "loss": 3.0815, + "step": 25356 + }, + { + "epoch": 1.24, + "grad_norm": 0.5476296544075012, + "learning_rate": 0.00037986299712734273, + "loss": 3.1573, + "step": 25357 + }, + { + "epoch": 1.24, + "grad_norm": 0.5945506691932678, + "learning_rate": 0.00037984815640238764, + "loss": 3.1786, + "step": 25358 + }, + { + "epoch": 1.24, + "grad_norm": 0.5869128704071045, + "learning_rate": 0.0003798333154671279, + "loss": 3.2192, + "step": 25359 + }, + { + "epoch": 1.24, + "grad_norm": 0.585900604724884, + "learning_rate": 0.00037981847432160253, + "loss": 2.7552, + "step": 25360 + }, + { + "epoch": 1.24, + "grad_norm": 0.6065345406532288, + "learning_rate": 0.0003798036329658508, + "loss": 3.0549, + "step": 25361 + }, + { + "epoch": 1.24, + "grad_norm": 0.6202573180198669, + "learning_rate": 0.0003797887913999115, + "loss": 3.2176, + "step": 25362 + }, + { + "epoch": 1.24, + "grad_norm": 0.5414901971817017, + "learning_rate": 0.00037977394962382383, + "loss": 3.2003, + "step": 25363 + }, + { + "epoch": 1.24, + "grad_norm": 0.5944116115570068, + "learning_rate": 0.00037975910763762713, + "loss": 3.0485, + "step": 25364 + }, + { + "epoch": 1.24, + "grad_norm": 0.5542477369308472, + "learning_rate": 0.00037974426544136014, + "loss": 3.0894, + "step": 25365 + }, + { + "epoch": 1.24, + "grad_norm": 0.5856495499610901, + "learning_rate": 0.00037972942303506215, + "loss": 3.2326, + "step": 25366 + }, + { + "epoch": 1.24, + "grad_norm": 0.5559060573577881, + "learning_rate": 0.0003797145804187722, + "loss": 3.1288, + "step": 25367 + }, + { + "epoch": 1.24, + "grad_norm": 0.6178712248802185, + "learning_rate": 0.0003796997375925294, + "loss": 2.901, + "step": 25368 + }, + { + "epoch": 1.24, + "grad_norm": 0.534890353679657, + "learning_rate": 0.00037968489455637286, + "loss": 3.0637, + "step": 25369 + }, + { + "epoch": 1.24, + "grad_norm": 0.5386385917663574, + "learning_rate": 0.0003796700513103416, + "loss": 3.1557, + "step": 25370 + }, + { + "epoch": 1.24, + "grad_norm": 0.574946403503418, + "learning_rate": 0.00037965520785447474, + "loss": 3.0145, + "step": 25371 + }, + { + "epoch": 1.24, + "grad_norm": 0.5754439830780029, + "learning_rate": 0.0003796403641888114, + "loss": 3.0277, + "step": 25372 + }, + { + "epoch": 1.24, + "grad_norm": 0.6202484369277954, + "learning_rate": 0.0003796255203133907, + "loss": 3.1161, + "step": 25373 + }, + { + "epoch": 1.24, + "grad_norm": 0.5231525897979736, + "learning_rate": 0.0003796106762282517, + "loss": 3.1765, + "step": 25374 + }, + { + "epoch": 1.24, + "grad_norm": 0.5547069311141968, + "learning_rate": 0.0003795958319334334, + "loss": 3.0762, + "step": 25375 + }, + { + "epoch": 1.24, + "grad_norm": 0.5507140159606934, + "learning_rate": 0.0003795809874289751, + "loss": 2.9072, + "step": 25376 + }, + { + "epoch": 1.24, + "grad_norm": 0.5551223754882812, + "learning_rate": 0.0003795661427149158, + "loss": 2.9539, + "step": 25377 + }, + { + "epoch": 1.24, + "grad_norm": 0.5440731048583984, + "learning_rate": 0.0003795512977912946, + "loss": 3.1145, + "step": 25378 + }, + { + "epoch": 1.24, + "grad_norm": 0.5543565154075623, + "learning_rate": 0.0003795364526581505, + "loss": 3.1151, + "step": 25379 + }, + { + "epoch": 1.24, + "grad_norm": 0.5475043654441833, + "learning_rate": 0.0003795216073155228, + "loss": 3.1024, + "step": 25380 + }, + { + "epoch": 1.24, + "grad_norm": 0.5892181396484375, + "learning_rate": 0.00037950676176345044, + "loss": 2.9669, + "step": 25381 + }, + { + "epoch": 1.24, + "grad_norm": 0.5114701390266418, + "learning_rate": 0.00037949191600197254, + "loss": 2.8436, + "step": 25382 + }, + { + "epoch": 1.24, + "grad_norm": 0.5685053467750549, + "learning_rate": 0.00037947707003112833, + "loss": 3.0272, + "step": 25383 + }, + { + "epoch": 1.24, + "grad_norm": 0.579219400882721, + "learning_rate": 0.0003794622238509567, + "loss": 3.0499, + "step": 25384 + }, + { + "epoch": 1.24, + "grad_norm": 0.5205180644989014, + "learning_rate": 0.00037944737746149694, + "loss": 3.0484, + "step": 25385 + }, + { + "epoch": 1.24, + "grad_norm": 0.561149001121521, + "learning_rate": 0.0003794325308627881, + "loss": 3.0581, + "step": 25386 + }, + { + "epoch": 1.24, + "grad_norm": 0.5951744318008423, + "learning_rate": 0.0003794176840548691, + "loss": 3.1024, + "step": 25387 + }, + { + "epoch": 1.24, + "grad_norm": 0.6370974779129028, + "learning_rate": 0.0003794028370377794, + "loss": 3.1705, + "step": 25388 + }, + { + "epoch": 1.24, + "grad_norm": 0.5966722965240479, + "learning_rate": 0.0003793879898115577, + "loss": 2.9605, + "step": 25389 + }, + { + "epoch": 1.24, + "grad_norm": 0.5954158306121826, + "learning_rate": 0.00037937314237624357, + "loss": 3.0356, + "step": 25390 + }, + { + "epoch": 1.24, + "grad_norm": 0.5606188774108887, + "learning_rate": 0.00037935829473187573, + "loss": 2.9174, + "step": 25391 + }, + { + "epoch": 1.24, + "grad_norm": 0.5629304051399231, + "learning_rate": 0.0003793434468784934, + "loss": 2.9253, + "step": 25392 + }, + { + "epoch": 1.24, + "grad_norm": 0.5472045540809631, + "learning_rate": 0.0003793285988161357, + "loss": 3.3048, + "step": 25393 + }, + { + "epoch": 1.24, + "grad_norm": 0.5473329424858093, + "learning_rate": 0.0003793137505448417, + "loss": 3.1923, + "step": 25394 + }, + { + "epoch": 1.24, + "grad_norm": 0.5582367777824402, + "learning_rate": 0.0003792989020646506, + "loss": 3.0302, + "step": 25395 + }, + { + "epoch": 1.24, + "grad_norm": 1.2111254930496216, + "learning_rate": 0.00037928405337560143, + "loss": 2.9217, + "step": 25396 + }, + { + "epoch": 1.24, + "grad_norm": 0.5781081318855286, + "learning_rate": 0.0003792692044777334, + "loss": 3.1969, + "step": 25397 + }, + { + "epoch": 1.24, + "grad_norm": 0.5536330938339233, + "learning_rate": 0.00037925435537108546, + "loss": 3.2146, + "step": 25398 + }, + { + "epoch": 1.24, + "grad_norm": 0.5391330122947693, + "learning_rate": 0.00037923950605569677, + "loss": 2.8489, + "step": 25399 + }, + { + "epoch": 1.24, + "grad_norm": 0.5480332374572754, + "learning_rate": 0.00037922465653160653, + "loss": 2.9888, + "step": 25400 + }, + { + "epoch": 1.24, + "grad_norm": 0.5818240642547607, + "learning_rate": 0.00037920980679885383, + "loss": 2.974, + "step": 25401 + }, + { + "epoch": 1.24, + "grad_norm": 0.5594255924224854, + "learning_rate": 0.0003791949568574777, + "loss": 3.0717, + "step": 25402 + }, + { + "epoch": 1.24, + "grad_norm": 0.5563430786132812, + "learning_rate": 0.00037918010670751735, + "loss": 3.3801, + "step": 25403 + }, + { + "epoch": 1.24, + "grad_norm": 0.5488848090171814, + "learning_rate": 0.0003791652563490118, + "loss": 2.8365, + "step": 25404 + }, + { + "epoch": 1.25, + "grad_norm": 0.5589650869369507, + "learning_rate": 0.0003791504057820001, + "loss": 2.8259, + "step": 25405 + }, + { + "epoch": 1.25, + "grad_norm": 0.5604705214500427, + "learning_rate": 0.00037913555500652154, + "loss": 3.1208, + "step": 25406 + }, + { + "epoch": 1.25, + "grad_norm": 0.5571016073226929, + "learning_rate": 0.0003791207040226152, + "loss": 3.1, + "step": 25407 + }, + { + "epoch": 1.25, + "grad_norm": 0.559099555015564, + "learning_rate": 0.00037910585283032006, + "loss": 3.0902, + "step": 25408 + }, + { + "epoch": 1.25, + "grad_norm": 0.5342800617218018, + "learning_rate": 0.0003790910014296754, + "loss": 3.0075, + "step": 25409 + }, + { + "epoch": 1.25, + "grad_norm": 0.555751621723175, + "learning_rate": 0.00037907614982072023, + "loss": 3.2053, + "step": 25410 + }, + { + "epoch": 1.25, + "grad_norm": 0.6236014366149902, + "learning_rate": 0.0003790612980034937, + "loss": 3.0104, + "step": 25411 + }, + { + "epoch": 1.25, + "grad_norm": 0.5512464642524719, + "learning_rate": 0.000379046445978035, + "loss": 3.1207, + "step": 25412 + }, + { + "epoch": 1.25, + "grad_norm": 0.5613242387771606, + "learning_rate": 0.00037903159374438314, + "loss": 3.1059, + "step": 25413 + }, + { + "epoch": 1.25, + "grad_norm": 0.5795272588729858, + "learning_rate": 0.00037901674130257724, + "loss": 3.1214, + "step": 25414 + }, + { + "epoch": 1.25, + "grad_norm": 0.5552756190299988, + "learning_rate": 0.00037900188865265646, + "loss": 3.2082, + "step": 25415 + }, + { + "epoch": 1.25, + "grad_norm": 0.5399115681648254, + "learning_rate": 0.0003789870357946599, + "loss": 3.035, + "step": 25416 + }, + { + "epoch": 1.25, + "grad_norm": 0.5368036031723022, + "learning_rate": 0.0003789721827286267, + "loss": 2.8561, + "step": 25417 + }, + { + "epoch": 1.25, + "grad_norm": 0.5200400352478027, + "learning_rate": 0.000378957329454596, + "loss": 2.9214, + "step": 25418 + }, + { + "epoch": 1.25, + "grad_norm": 0.569953978061676, + "learning_rate": 0.0003789424759726068, + "loss": 3.2006, + "step": 25419 + }, + { + "epoch": 1.25, + "grad_norm": 0.5677704215049744, + "learning_rate": 0.0003789276222826984, + "loss": 2.8954, + "step": 25420 + }, + { + "epoch": 1.25, + "grad_norm": 0.5435131788253784, + "learning_rate": 0.00037891276838490976, + "loss": 3.0026, + "step": 25421 + }, + { + "epoch": 1.25, + "grad_norm": 0.5513231754302979, + "learning_rate": 0.00037889791427928016, + "loss": 3.173, + "step": 25422 + }, + { + "epoch": 1.25, + "grad_norm": 0.5995567440986633, + "learning_rate": 0.00037888305996584864, + "loss": 2.996, + "step": 25423 + }, + { + "epoch": 1.25, + "grad_norm": 0.5517016649246216, + "learning_rate": 0.00037886820544465427, + "loss": 2.964, + "step": 25424 + }, + { + "epoch": 1.25, + "grad_norm": 0.572759747505188, + "learning_rate": 0.00037885335071573614, + "loss": 2.8945, + "step": 25425 + }, + { + "epoch": 1.25, + "grad_norm": 0.6202085018157959, + "learning_rate": 0.0003788384957791336, + "loss": 3.1124, + "step": 25426 + }, + { + "epoch": 1.25, + "grad_norm": 0.5376129746437073, + "learning_rate": 0.00037882364063488566, + "loss": 3.0313, + "step": 25427 + }, + { + "epoch": 1.25, + "grad_norm": 0.5526015758514404, + "learning_rate": 0.00037880878528303126, + "loss": 3.0498, + "step": 25428 + }, + { + "epoch": 1.25, + "grad_norm": 0.553801417350769, + "learning_rate": 0.0003787939297236098, + "loss": 3.0979, + "step": 25429 + }, + { + "epoch": 1.25, + "grad_norm": 0.5524164438247681, + "learning_rate": 0.0003787790739566603, + "loss": 3.1177, + "step": 25430 + }, + { + "epoch": 1.25, + "grad_norm": 0.5112387537956238, + "learning_rate": 0.00037876421798222176, + "loss": 3.2164, + "step": 25431 + }, + { + "epoch": 1.25, + "grad_norm": 0.5780650973320007, + "learning_rate": 0.00037874936180033356, + "loss": 3.0476, + "step": 25432 + }, + { + "epoch": 1.25, + "grad_norm": 0.5552143454551697, + "learning_rate": 0.0003787345054110347, + "loss": 3.1861, + "step": 25433 + }, + { + "epoch": 1.25, + "grad_norm": 0.5469130277633667, + "learning_rate": 0.00037871964881436423, + "loss": 2.8781, + "step": 25434 + }, + { + "epoch": 1.25, + "grad_norm": 0.5303400158882141, + "learning_rate": 0.00037870479201036137, + "loss": 2.987, + "step": 25435 + }, + { + "epoch": 1.25, + "grad_norm": 0.5632544755935669, + "learning_rate": 0.00037868993499906524, + "loss": 3.0197, + "step": 25436 + }, + { + "epoch": 1.25, + "grad_norm": 0.5879452228546143, + "learning_rate": 0.000378675077780515, + "loss": 3.1763, + "step": 25437 + }, + { + "epoch": 1.25, + "grad_norm": 0.6507343649864197, + "learning_rate": 0.00037866022035474976, + "loss": 2.9622, + "step": 25438 + }, + { + "epoch": 1.25, + "grad_norm": 0.5414554476737976, + "learning_rate": 0.0003786453627218086, + "loss": 3.0865, + "step": 25439 + }, + { + "epoch": 1.25, + "grad_norm": 0.564536988735199, + "learning_rate": 0.0003786305048817306, + "loss": 3.1138, + "step": 25440 + }, + { + "epoch": 1.25, + "grad_norm": 0.5658540725708008, + "learning_rate": 0.0003786156468345551, + "loss": 2.9164, + "step": 25441 + }, + { + "epoch": 1.25, + "grad_norm": 0.5722022652626038, + "learning_rate": 0.0003786007885803211, + "loss": 3.1499, + "step": 25442 + }, + { + "epoch": 1.25, + "grad_norm": 0.5444322228431702, + "learning_rate": 0.0003785859301190677, + "loss": 3.0966, + "step": 25443 + }, + { + "epoch": 1.25, + "grad_norm": 0.5430464148521423, + "learning_rate": 0.00037857107145083415, + "loss": 2.9632, + "step": 25444 + }, + { + "epoch": 1.25, + "grad_norm": 0.5783938765525818, + "learning_rate": 0.00037855621257565955, + "loss": 2.8965, + "step": 25445 + }, + { + "epoch": 1.25, + "grad_norm": 0.5244552493095398, + "learning_rate": 0.00037854135349358285, + "loss": 3.1105, + "step": 25446 + }, + { + "epoch": 1.25, + "grad_norm": 0.5750003457069397, + "learning_rate": 0.0003785264942046434, + "loss": 3.1679, + "step": 25447 + }, + { + "epoch": 1.25, + "grad_norm": 0.5536020398139954, + "learning_rate": 0.00037851163470888045, + "loss": 3.4111, + "step": 25448 + }, + { + "epoch": 1.25, + "grad_norm": 0.7663441896438599, + "learning_rate": 0.0003784967750063328, + "loss": 2.8687, + "step": 25449 + }, + { + "epoch": 1.25, + "grad_norm": 0.5692669153213501, + "learning_rate": 0.0003784819150970397, + "loss": 2.9943, + "step": 25450 + }, + { + "epoch": 1.25, + "grad_norm": 0.5337068438529968, + "learning_rate": 0.0003784670549810404, + "loss": 3.1486, + "step": 25451 + }, + { + "epoch": 1.25, + "grad_norm": 0.5633428692817688, + "learning_rate": 0.0003784521946583741, + "loss": 3.135, + "step": 25452 + }, + { + "epoch": 1.25, + "grad_norm": 0.5698487758636475, + "learning_rate": 0.0003784373341290797, + "loss": 3.0194, + "step": 25453 + }, + { + "epoch": 1.25, + "grad_norm": 0.5451970100402832, + "learning_rate": 0.00037842247339319645, + "loss": 3.0151, + "step": 25454 + }, + { + "epoch": 1.25, + "grad_norm": 0.5770736336708069, + "learning_rate": 0.0003784076124507635, + "loss": 3.2033, + "step": 25455 + }, + { + "epoch": 1.25, + "grad_norm": 0.5762639045715332, + "learning_rate": 0.00037839275130182004, + "loss": 2.977, + "step": 25456 + }, + { + "epoch": 1.25, + "grad_norm": 0.5343278646469116, + "learning_rate": 0.0003783778899464051, + "loss": 3.0204, + "step": 25457 + }, + { + "epoch": 1.25, + "grad_norm": 0.5425517559051514, + "learning_rate": 0.00037836302838455794, + "loss": 3.0833, + "step": 25458 + }, + { + "epoch": 1.25, + "grad_norm": 0.5324981212615967, + "learning_rate": 0.0003783481666163176, + "loss": 3.2069, + "step": 25459 + }, + { + "epoch": 1.25, + "grad_norm": 0.5692557096481323, + "learning_rate": 0.0003783333046417233, + "loss": 3.0091, + "step": 25460 + }, + { + "epoch": 1.25, + "grad_norm": 0.5556652545928955, + "learning_rate": 0.00037831844246081413, + "loss": 2.867, + "step": 25461 + }, + { + "epoch": 1.25, + "grad_norm": 0.5014756321907043, + "learning_rate": 0.00037830358007362924, + "loss": 3.1228, + "step": 25462 + }, + { + "epoch": 1.25, + "grad_norm": 0.5904489159584045, + "learning_rate": 0.00037828871748020785, + "loss": 3.0725, + "step": 25463 + }, + { + "epoch": 1.25, + "grad_norm": 0.5578276515007019, + "learning_rate": 0.000378273854680589, + "loss": 3.1744, + "step": 25464 + }, + { + "epoch": 1.25, + "grad_norm": 0.5439172387123108, + "learning_rate": 0.00037825899167481184, + "loss": 3.1216, + "step": 25465 + }, + { + "epoch": 1.25, + "grad_norm": 0.5510421395301819, + "learning_rate": 0.0003782441284629156, + "loss": 3.1522, + "step": 25466 + }, + { + "epoch": 1.25, + "grad_norm": 0.581859827041626, + "learning_rate": 0.0003782292650449394, + "loss": 3.0195, + "step": 25467 + }, + { + "epoch": 1.25, + "grad_norm": 0.5343931913375854, + "learning_rate": 0.00037821440142092236, + "loss": 3.1396, + "step": 25468 + }, + { + "epoch": 1.25, + "grad_norm": 0.5974150896072388, + "learning_rate": 0.0003781995375909036, + "loss": 3.2128, + "step": 25469 + }, + { + "epoch": 1.25, + "grad_norm": 0.5799618363380432, + "learning_rate": 0.00037818467355492234, + "loss": 3.0855, + "step": 25470 + }, + { + "epoch": 1.25, + "grad_norm": 0.5386278629302979, + "learning_rate": 0.00037816980931301773, + "loss": 3.1758, + "step": 25471 + }, + { + "epoch": 1.25, + "grad_norm": 0.6230940818786621, + "learning_rate": 0.0003781549448652288, + "loss": 3.2251, + "step": 25472 + }, + { + "epoch": 1.25, + "grad_norm": 0.5379871129989624, + "learning_rate": 0.00037814008021159486, + "loss": 3.0945, + "step": 25473 + }, + { + "epoch": 1.25, + "grad_norm": 0.5654126405715942, + "learning_rate": 0.00037812521535215504, + "loss": 3.1067, + "step": 25474 + }, + { + "epoch": 1.25, + "grad_norm": 0.5493576526641846, + "learning_rate": 0.00037811035028694834, + "loss": 3.1932, + "step": 25475 + }, + { + "epoch": 1.25, + "grad_norm": 0.5735651850700378, + "learning_rate": 0.000378095485016014, + "loss": 3.2, + "step": 25476 + }, + { + "epoch": 1.25, + "grad_norm": 0.5522319674491882, + "learning_rate": 0.0003780806195393913, + "loss": 3.0353, + "step": 25477 + }, + { + "epoch": 1.25, + "grad_norm": 0.575622022151947, + "learning_rate": 0.00037806575385711916, + "loss": 2.8894, + "step": 25478 + }, + { + "epoch": 1.25, + "grad_norm": 0.6093813180923462, + "learning_rate": 0.0003780508879692369, + "loss": 2.8872, + "step": 25479 + }, + { + "epoch": 1.25, + "grad_norm": 0.5739896297454834, + "learning_rate": 0.00037803602187578357, + "loss": 2.9564, + "step": 25480 + }, + { + "epoch": 1.25, + "grad_norm": 0.560309648513794, + "learning_rate": 0.00037802115557679844, + "loss": 3.3268, + "step": 25481 + }, + { + "epoch": 1.25, + "grad_norm": 0.5892646908760071, + "learning_rate": 0.00037800628907232056, + "loss": 3.1495, + "step": 25482 + }, + { + "epoch": 1.25, + "grad_norm": 0.5857096910476685, + "learning_rate": 0.0003779914223623892, + "loss": 2.9523, + "step": 25483 + }, + { + "epoch": 1.25, + "grad_norm": 0.5739219784736633, + "learning_rate": 0.0003779765554470433, + "loss": 2.9484, + "step": 25484 + }, + { + "epoch": 1.25, + "grad_norm": 0.5851458311080933, + "learning_rate": 0.0003779616883263223, + "loss": 3.0733, + "step": 25485 + }, + { + "epoch": 1.25, + "grad_norm": 0.5476492047309875, + "learning_rate": 0.0003779468210002652, + "loss": 3.0483, + "step": 25486 + }, + { + "epoch": 1.25, + "grad_norm": 0.5560165047645569, + "learning_rate": 0.000377931953468911, + "loss": 3.0811, + "step": 25487 + }, + { + "epoch": 1.25, + "grad_norm": 0.5552588105201721, + "learning_rate": 0.00037791708573229926, + "loss": 3.3325, + "step": 25488 + }, + { + "epoch": 1.25, + "grad_norm": 0.5610097646713257, + "learning_rate": 0.0003779022177904689, + "loss": 3.1599, + "step": 25489 + }, + { + "epoch": 1.25, + "grad_norm": 0.6263189911842346, + "learning_rate": 0.00037788734964345897, + "loss": 3.2049, + "step": 25490 + }, + { + "epoch": 1.25, + "grad_norm": 0.5741820931434631, + "learning_rate": 0.0003778724812913088, + "loss": 3.0006, + "step": 25491 + }, + { + "epoch": 1.25, + "grad_norm": 0.579188346862793, + "learning_rate": 0.00037785761273405745, + "loss": 3.0864, + "step": 25492 + }, + { + "epoch": 1.25, + "grad_norm": 0.5346486568450928, + "learning_rate": 0.00037784274397174416, + "loss": 2.8912, + "step": 25493 + }, + { + "epoch": 1.25, + "grad_norm": 0.5457817912101746, + "learning_rate": 0.0003778278750044081, + "loss": 2.8513, + "step": 25494 + }, + { + "epoch": 1.25, + "grad_norm": 0.5900013446807861, + "learning_rate": 0.00037781300583208835, + "loss": 3.0625, + "step": 25495 + }, + { + "epoch": 1.25, + "grad_norm": 0.5467310547828674, + "learning_rate": 0.0003777981364548242, + "loss": 3.1045, + "step": 25496 + }, + { + "epoch": 1.25, + "grad_norm": 0.5489881038665771, + "learning_rate": 0.0003777832668726546, + "loss": 2.9645, + "step": 25497 + }, + { + "epoch": 1.25, + "grad_norm": 0.5877793431282043, + "learning_rate": 0.0003777683970856189, + "loss": 3.1978, + "step": 25498 + }, + { + "epoch": 1.25, + "grad_norm": 0.5987591743469238, + "learning_rate": 0.0003777535270937563, + "loss": 2.9118, + "step": 25499 + }, + { + "epoch": 1.25, + "grad_norm": 0.5459617972373962, + "learning_rate": 0.00037773865689710585, + "loss": 3.2444, + "step": 25500 + }, + { + "epoch": 1.25, + "grad_norm": 0.5510004162788391, + "learning_rate": 0.00037772378649570667, + "loss": 3.0586, + "step": 25501 + }, + { + "epoch": 1.25, + "grad_norm": 0.5648934245109558, + "learning_rate": 0.000377708915889598, + "loss": 3.0798, + "step": 25502 + }, + { + "epoch": 1.25, + "grad_norm": 0.5991185307502747, + "learning_rate": 0.00037769404507881897, + "loss": 3.095, + "step": 25503 + }, + { + "epoch": 1.25, + "grad_norm": 0.5326690673828125, + "learning_rate": 0.00037767917406340883, + "loss": 3.0806, + "step": 25504 + }, + { + "epoch": 1.25, + "grad_norm": 0.5778882503509521, + "learning_rate": 0.0003776643028434067, + "loss": 3.0523, + "step": 25505 + }, + { + "epoch": 1.25, + "grad_norm": 0.5468420386314392, + "learning_rate": 0.00037764943141885174, + "loss": 3.038, + "step": 25506 + }, + { + "epoch": 1.25, + "grad_norm": 0.617231547832489, + "learning_rate": 0.0003776345597897831, + "loss": 2.9654, + "step": 25507 + }, + { + "epoch": 1.25, + "grad_norm": 0.5568265914916992, + "learning_rate": 0.00037761968795624, + "loss": 2.8395, + "step": 25508 + }, + { + "epoch": 1.25, + "grad_norm": 0.5809617638587952, + "learning_rate": 0.00037760481591826153, + "loss": 3.1054, + "step": 25509 + }, + { + "epoch": 1.25, + "grad_norm": 0.5677468180656433, + "learning_rate": 0.00037758994367588695, + "loss": 2.9851, + "step": 25510 + }, + { + "epoch": 1.25, + "grad_norm": 0.5191762447357178, + "learning_rate": 0.00037757507122915544, + "loss": 3.2637, + "step": 25511 + }, + { + "epoch": 1.25, + "grad_norm": 0.5888081192970276, + "learning_rate": 0.00037756019857810604, + "loss": 3.06, + "step": 25512 + }, + { + "epoch": 1.25, + "grad_norm": 0.5667383074760437, + "learning_rate": 0.000377545325722778, + "loss": 2.9374, + "step": 25513 + }, + { + "epoch": 1.25, + "grad_norm": 0.5688818693161011, + "learning_rate": 0.0003775304526632106, + "loss": 3.2734, + "step": 25514 + }, + { + "epoch": 1.25, + "grad_norm": 0.5567358136177063, + "learning_rate": 0.0003775155793994428, + "loss": 3.0426, + "step": 25515 + }, + { + "epoch": 1.25, + "grad_norm": 0.5471105575561523, + "learning_rate": 0.00037750070593151394, + "loss": 3.25, + "step": 25516 + }, + { + "epoch": 1.25, + "grad_norm": 0.5713381767272949, + "learning_rate": 0.00037748583225946306, + "loss": 3.0582, + "step": 25517 + }, + { + "epoch": 1.25, + "grad_norm": 0.5663639903068542, + "learning_rate": 0.00037747095838332954, + "loss": 3.2855, + "step": 25518 + }, + { + "epoch": 1.25, + "grad_norm": 0.5420747399330139, + "learning_rate": 0.00037745608430315227, + "loss": 3.3252, + "step": 25519 + }, + { + "epoch": 1.25, + "grad_norm": 0.5536601543426514, + "learning_rate": 0.0003774412100189707, + "loss": 3.0771, + "step": 25520 + }, + { + "epoch": 1.25, + "grad_norm": 0.5411950945854187, + "learning_rate": 0.0003774263355308238, + "loss": 3.0431, + "step": 25521 + }, + { + "epoch": 1.25, + "grad_norm": 0.571185827255249, + "learning_rate": 0.0003774114608387509, + "loss": 3.2206, + "step": 25522 + }, + { + "epoch": 1.25, + "grad_norm": 0.5684292316436768, + "learning_rate": 0.0003773965859427911, + "loss": 3.0063, + "step": 25523 + }, + { + "epoch": 1.25, + "grad_norm": 0.5304699540138245, + "learning_rate": 0.0003773817108429836, + "loss": 3.0202, + "step": 25524 + }, + { + "epoch": 1.25, + "grad_norm": 0.5694625973701477, + "learning_rate": 0.0003773668355393676, + "loss": 3.0962, + "step": 25525 + }, + { + "epoch": 1.25, + "grad_norm": 0.545238733291626, + "learning_rate": 0.00037735196003198213, + "loss": 2.9854, + "step": 25526 + }, + { + "epoch": 1.25, + "grad_norm": 0.578179657459259, + "learning_rate": 0.0003773370843208666, + "loss": 3.029, + "step": 25527 + }, + { + "epoch": 1.25, + "grad_norm": 0.5803759694099426, + "learning_rate": 0.00037732220840606, + "loss": 3.2408, + "step": 25528 + }, + { + "epoch": 1.25, + "grad_norm": 0.5503095388412476, + "learning_rate": 0.0003773073322876016, + "loss": 3.4273, + "step": 25529 + }, + { + "epoch": 1.25, + "grad_norm": 0.532310426235199, + "learning_rate": 0.0003772924559655305, + "loss": 3.0394, + "step": 25530 + }, + { + "epoch": 1.25, + "grad_norm": 0.5836457014083862, + "learning_rate": 0.000377277579439886, + "loss": 2.9752, + "step": 25531 + }, + { + "epoch": 1.25, + "grad_norm": 0.5723191499710083, + "learning_rate": 0.0003772627027107073, + "loss": 3.0069, + "step": 25532 + }, + { + "epoch": 1.25, + "grad_norm": 0.5557374358177185, + "learning_rate": 0.00037724782577803345, + "loss": 3.2241, + "step": 25533 + }, + { + "epoch": 1.25, + "grad_norm": 0.5663606524467468, + "learning_rate": 0.0003772329486419036, + "loss": 3.2456, + "step": 25534 + }, + { + "epoch": 1.25, + "grad_norm": 0.5517666339874268, + "learning_rate": 0.0003772180713023572, + "loss": 2.9475, + "step": 25535 + }, + { + "epoch": 1.25, + "grad_norm": 0.5494801998138428, + "learning_rate": 0.0003772031937594332, + "loss": 3.1934, + "step": 25536 + }, + { + "epoch": 1.25, + "grad_norm": 0.5216752886772156, + "learning_rate": 0.0003771883160131708, + "loss": 3.0643, + "step": 25537 + }, + { + "epoch": 1.25, + "grad_norm": 0.5742051005363464, + "learning_rate": 0.0003771734380636092, + "loss": 2.9488, + "step": 25538 + }, + { + "epoch": 1.25, + "grad_norm": 0.53158038854599, + "learning_rate": 0.00037715855991078776, + "loss": 2.9859, + "step": 25539 + }, + { + "epoch": 1.25, + "grad_norm": 0.5515921711921692, + "learning_rate": 0.00037714368155474545, + "loss": 3.0378, + "step": 25540 + }, + { + "epoch": 1.25, + "grad_norm": 0.5819881558418274, + "learning_rate": 0.0003771288029955215, + "loss": 3.0099, + "step": 25541 + }, + { + "epoch": 1.25, + "grad_norm": 0.5356078743934631, + "learning_rate": 0.00037711392423315517, + "loss": 3.1849, + "step": 25542 + }, + { + "epoch": 1.25, + "grad_norm": 0.5517314076423645, + "learning_rate": 0.00037709904526768553, + "loss": 3.141, + "step": 25543 + }, + { + "epoch": 1.25, + "grad_norm": 0.5887091159820557, + "learning_rate": 0.0003770841660991519, + "loss": 3.2121, + "step": 25544 + }, + { + "epoch": 1.25, + "grad_norm": 0.6048860549926758, + "learning_rate": 0.0003770692867275934, + "loss": 3.0597, + "step": 25545 + }, + { + "epoch": 1.25, + "grad_norm": 0.5582966804504395, + "learning_rate": 0.0003770544071530492, + "loss": 2.9708, + "step": 25546 + }, + { + "epoch": 1.25, + "grad_norm": 0.588004469871521, + "learning_rate": 0.0003770395273755586, + "loss": 3.0733, + "step": 25547 + }, + { + "epoch": 1.25, + "grad_norm": 0.6418083906173706, + "learning_rate": 0.00037702464739516065, + "loss": 2.9607, + "step": 25548 + }, + { + "epoch": 1.25, + "grad_norm": 0.5486046075820923, + "learning_rate": 0.00037700976721189464, + "loss": 3.0641, + "step": 25549 + }, + { + "epoch": 1.25, + "grad_norm": 0.5865910649299622, + "learning_rate": 0.0003769948868257997, + "loss": 3.2268, + "step": 25550 + }, + { + "epoch": 1.25, + "grad_norm": 0.571828305721283, + "learning_rate": 0.0003769800062369151, + "loss": 3.1411, + "step": 25551 + }, + { + "epoch": 1.25, + "grad_norm": 0.6386631727218628, + "learning_rate": 0.00037696512544527986, + "loss": 3.2847, + "step": 25552 + }, + { + "epoch": 1.25, + "grad_norm": 0.5370537042617798, + "learning_rate": 0.0003769502444509333, + "loss": 2.9321, + "step": 25553 + }, + { + "epoch": 1.25, + "grad_norm": 0.5921683311462402, + "learning_rate": 0.00037693536325391475, + "loss": 3.0641, + "step": 25554 + }, + { + "epoch": 1.25, + "grad_norm": 0.533803403377533, + "learning_rate": 0.0003769204818542632, + "loss": 2.8298, + "step": 25555 + }, + { + "epoch": 1.25, + "grad_norm": 0.5238431096076965, + "learning_rate": 0.0003769056002520178, + "loss": 3.0571, + "step": 25556 + }, + { + "epoch": 1.25, + "grad_norm": 0.6707302331924438, + "learning_rate": 0.00037689071844721796, + "loss": 3.0363, + "step": 25557 + }, + { + "epoch": 1.25, + "grad_norm": 0.5460362434387207, + "learning_rate": 0.00037687583643990273, + "loss": 3.3561, + "step": 25558 + }, + { + "epoch": 1.25, + "grad_norm": 0.5882094502449036, + "learning_rate": 0.00037686095423011134, + "loss": 3.268, + "step": 25559 + }, + { + "epoch": 1.25, + "grad_norm": 0.5774903297424316, + "learning_rate": 0.000376846071817883, + "loss": 3.0655, + "step": 25560 + }, + { + "epoch": 1.25, + "grad_norm": 0.5424748659133911, + "learning_rate": 0.0003768311892032569, + "loss": 3.0053, + "step": 25561 + }, + { + "epoch": 1.25, + "grad_norm": 0.5767691731452942, + "learning_rate": 0.00037681630638627223, + "loss": 2.964, + "step": 25562 + }, + { + "epoch": 1.25, + "grad_norm": 0.5361145734786987, + "learning_rate": 0.0003768014233669682, + "loss": 3.1929, + "step": 25563 + }, + { + "epoch": 1.25, + "grad_norm": 0.582098126411438, + "learning_rate": 0.000376786540145384, + "loss": 2.9265, + "step": 25564 + }, + { + "epoch": 1.25, + "grad_norm": 0.5639384984970093, + "learning_rate": 0.00037677165672155884, + "loss": 2.9726, + "step": 25565 + }, + { + "epoch": 1.25, + "grad_norm": 0.608085036277771, + "learning_rate": 0.0003767567730955319, + "loss": 3.1035, + "step": 25566 + }, + { + "epoch": 1.25, + "grad_norm": 0.5574251413345337, + "learning_rate": 0.00037674188926734237, + "loss": 2.9313, + "step": 25567 + }, + { + "epoch": 1.25, + "grad_norm": 0.5575839281082153, + "learning_rate": 0.00037672700523702944, + "loss": 3.0273, + "step": 25568 + }, + { + "epoch": 1.25, + "grad_norm": 0.6037725806236267, + "learning_rate": 0.0003767121210046324, + "loss": 3.2235, + "step": 25569 + }, + { + "epoch": 1.25, + "grad_norm": 0.5843100547790527, + "learning_rate": 0.0003766972365701904, + "loss": 3.0442, + "step": 25570 + }, + { + "epoch": 1.25, + "grad_norm": 0.5654159188270569, + "learning_rate": 0.0003766823519337426, + "loss": 3.1427, + "step": 25571 + }, + { + "epoch": 1.25, + "grad_norm": 0.5915213227272034, + "learning_rate": 0.00037666746709532825, + "loss": 3.533, + "step": 25572 + }, + { + "epoch": 1.25, + "grad_norm": 0.5641651153564453, + "learning_rate": 0.0003766525820549866, + "loss": 3.0753, + "step": 25573 + }, + { + "epoch": 1.25, + "grad_norm": 0.5694716572761536, + "learning_rate": 0.0003766376968127567, + "loss": 3.1201, + "step": 25574 + }, + { + "epoch": 1.25, + "grad_norm": 0.5624406933784485, + "learning_rate": 0.00037662281136867793, + "loss": 3.0598, + "step": 25575 + }, + { + "epoch": 1.25, + "grad_norm": 0.560565710067749, + "learning_rate": 0.00037660792572278943, + "loss": 3.111, + "step": 25576 + }, + { + "epoch": 1.25, + "grad_norm": 0.5786372423171997, + "learning_rate": 0.0003765930398751304, + "loss": 3.0341, + "step": 25577 + }, + { + "epoch": 1.25, + "grad_norm": 0.535211980342865, + "learning_rate": 0.00037657815382574004, + "loss": 2.8081, + "step": 25578 + }, + { + "epoch": 1.25, + "grad_norm": 0.5386492609977722, + "learning_rate": 0.0003765632675746576, + "loss": 3.2728, + "step": 25579 + }, + { + "epoch": 1.25, + "grad_norm": 0.5663537979125977, + "learning_rate": 0.00037654838112192215, + "loss": 3.2322, + "step": 25580 + }, + { + "epoch": 1.25, + "grad_norm": 0.6106393337249756, + "learning_rate": 0.00037653349446757303, + "loss": 2.9779, + "step": 25581 + }, + { + "epoch": 1.25, + "grad_norm": 0.5462380051612854, + "learning_rate": 0.0003765186076116495, + "loss": 3.2093, + "step": 25582 + }, + { + "epoch": 1.25, + "grad_norm": 0.5549441576004028, + "learning_rate": 0.0003765037205541905, + "loss": 3.0289, + "step": 25583 + }, + { + "epoch": 1.25, + "grad_norm": 0.5625503063201904, + "learning_rate": 0.0003764888332952355, + "loss": 3.1362, + "step": 25584 + }, + { + "epoch": 1.25, + "grad_norm": 0.582071840763092, + "learning_rate": 0.0003764739458348237, + "loss": 3.035, + "step": 25585 + }, + { + "epoch": 1.25, + "grad_norm": 0.5360458493232727, + "learning_rate": 0.0003764590581729942, + "loss": 2.8962, + "step": 25586 + }, + { + "epoch": 1.25, + "grad_norm": 0.5536467432975769, + "learning_rate": 0.0003764441703097864, + "loss": 3.0696, + "step": 25587 + }, + { + "epoch": 1.25, + "grad_norm": 0.5753385424613953, + "learning_rate": 0.0003764292822452392, + "loss": 3.1238, + "step": 25588 + }, + { + "epoch": 1.25, + "grad_norm": 0.5996004343032837, + "learning_rate": 0.00037641439397939196, + "loss": 3.0104, + "step": 25589 + }, + { + "epoch": 1.25, + "grad_norm": 0.5788471698760986, + "learning_rate": 0.00037639950551228397, + "loss": 3.1478, + "step": 25590 + }, + { + "epoch": 1.25, + "grad_norm": 0.5919115543365479, + "learning_rate": 0.00037638461684395445, + "loss": 3.2666, + "step": 25591 + }, + { + "epoch": 1.25, + "grad_norm": 0.5502909421920776, + "learning_rate": 0.0003763697279744424, + "loss": 3.3325, + "step": 25592 + }, + { + "epoch": 1.25, + "grad_norm": 0.5541229844093323, + "learning_rate": 0.0003763548389037873, + "loss": 3.0699, + "step": 25593 + }, + { + "epoch": 1.25, + "grad_norm": 0.5316629409790039, + "learning_rate": 0.0003763399496320282, + "loss": 3.2613, + "step": 25594 + }, + { + "epoch": 1.25, + "grad_norm": 0.5380682349205017, + "learning_rate": 0.00037632506015920433, + "loss": 3.2379, + "step": 25595 + }, + { + "epoch": 1.25, + "grad_norm": 0.6165295243263245, + "learning_rate": 0.00037631017048535503, + "loss": 3.3058, + "step": 25596 + }, + { + "epoch": 1.25, + "grad_norm": 0.5723467469215393, + "learning_rate": 0.0003762952806105193, + "loss": 3.1968, + "step": 25597 + }, + { + "epoch": 1.25, + "grad_norm": 0.5385229587554932, + "learning_rate": 0.0003762803905347367, + "loss": 3.1022, + "step": 25598 + }, + { + "epoch": 1.25, + "grad_norm": 0.5821444392204285, + "learning_rate": 0.0003762655002580461, + "loss": 3.036, + "step": 25599 + }, + { + "epoch": 1.25, + "grad_norm": 0.5753803253173828, + "learning_rate": 0.0003762506097804868, + "loss": 3.0051, + "step": 25600 + }, + { + "epoch": 1.25, + "grad_norm": 0.5555531978607178, + "learning_rate": 0.00037623571910209817, + "loss": 3.2474, + "step": 25601 + }, + { + "epoch": 1.25, + "grad_norm": 0.5928922295570374, + "learning_rate": 0.00037622082822291926, + "loss": 3.2314, + "step": 25602 + }, + { + "epoch": 1.25, + "grad_norm": 0.5480112433433533, + "learning_rate": 0.00037620593714298936, + "loss": 3.2276, + "step": 25603 + }, + { + "epoch": 1.25, + "grad_norm": 0.5796836018562317, + "learning_rate": 0.00037619104586234767, + "loss": 2.9367, + "step": 25604 + }, + { + "epoch": 1.25, + "grad_norm": 0.535346508026123, + "learning_rate": 0.0003761761543810335, + "loss": 3.2006, + "step": 25605 + }, + { + "epoch": 1.25, + "grad_norm": 0.6117957830429077, + "learning_rate": 0.0003761612626990859, + "loss": 3.0288, + "step": 25606 + }, + { + "epoch": 1.25, + "grad_norm": 0.5398089289665222, + "learning_rate": 0.0003761463708165443, + "loss": 3.1357, + "step": 25607 + }, + { + "epoch": 1.25, + "grad_norm": 0.5904059410095215, + "learning_rate": 0.0003761314787334477, + "loss": 3.1091, + "step": 25608 + }, + { + "epoch": 1.26, + "grad_norm": 0.5292810797691345, + "learning_rate": 0.00037611658644983565, + "loss": 2.9714, + "step": 25609 + }, + { + "epoch": 1.26, + "grad_norm": 0.5337926149368286, + "learning_rate": 0.0003761016939657469, + "loss": 3.1214, + "step": 25610 + }, + { + "epoch": 1.26, + "grad_norm": 0.5683236122131348, + "learning_rate": 0.00037608680128122104, + "loss": 2.8445, + "step": 25611 + }, + { + "epoch": 1.26, + "grad_norm": 0.5837772488594055, + "learning_rate": 0.0003760719083962973, + "loss": 3.0071, + "step": 25612 + }, + { + "epoch": 1.26, + "grad_norm": 0.5842117071151733, + "learning_rate": 0.00037605701531101467, + "loss": 2.9642, + "step": 25613 + }, + { + "epoch": 1.26, + "grad_norm": 0.5550433993339539, + "learning_rate": 0.0003760421220254125, + "loss": 3.0106, + "step": 25614 + }, + { + "epoch": 1.26, + "grad_norm": 0.5435743927955627, + "learning_rate": 0.00037602722853953, + "loss": 3.0388, + "step": 25615 + }, + { + "epoch": 1.26, + "grad_norm": 0.6093657612800598, + "learning_rate": 0.00037601233485340645, + "loss": 3.0343, + "step": 25616 + }, + { + "epoch": 1.26, + "grad_norm": 0.5637142062187195, + "learning_rate": 0.0003759974409670811, + "loss": 3.1616, + "step": 25617 + }, + { + "epoch": 1.26, + "grad_norm": 0.5223868489265442, + "learning_rate": 0.000375982546880593, + "loss": 3.189, + "step": 25618 + }, + { + "epoch": 1.26, + "grad_norm": 0.578644871711731, + "learning_rate": 0.00037596765259398157, + "loss": 2.9092, + "step": 25619 + }, + { + "epoch": 1.26, + "grad_norm": 0.5757935643196106, + "learning_rate": 0.00037595275810728597, + "loss": 3.1162, + "step": 25620 + }, + { + "epoch": 1.26, + "grad_norm": 0.6163303852081299, + "learning_rate": 0.00037593786342054536, + "loss": 3.0835, + "step": 25621 + }, + { + "epoch": 1.26, + "grad_norm": 0.5282571315765381, + "learning_rate": 0.00037592296853379906, + "loss": 3.0402, + "step": 25622 + }, + { + "epoch": 1.26, + "grad_norm": 0.5777333378791809, + "learning_rate": 0.00037590807344708624, + "loss": 3.1, + "step": 25623 + }, + { + "epoch": 1.26, + "grad_norm": 0.5467130541801453, + "learning_rate": 0.0003758931781604463, + "loss": 3.2504, + "step": 25624 + }, + { + "epoch": 1.26, + "grad_norm": 0.5582188367843628, + "learning_rate": 0.00037587828267391814, + "loss": 3.0181, + "step": 25625 + }, + { + "epoch": 1.26, + "grad_norm": 0.5501718521118164, + "learning_rate": 0.0003758633869875413, + "loss": 2.9322, + "step": 25626 + }, + { + "epoch": 1.26, + "grad_norm": 0.5445879101753235, + "learning_rate": 0.0003758484911013549, + "loss": 3.2152, + "step": 25627 + }, + { + "epoch": 1.26, + "grad_norm": 0.5596219301223755, + "learning_rate": 0.0003758335950153982, + "loss": 3.0867, + "step": 25628 + }, + { + "epoch": 1.26, + "grad_norm": 0.5973536968231201, + "learning_rate": 0.00037581869872971036, + "loss": 2.9504, + "step": 25629 + }, + { + "epoch": 1.26, + "grad_norm": 0.5466140508651733, + "learning_rate": 0.0003758038022443306, + "loss": 2.9407, + "step": 25630 + }, + { + "epoch": 1.26, + "grad_norm": 0.6324753165245056, + "learning_rate": 0.0003757889055592983, + "loss": 3.0794, + "step": 25631 + }, + { + "epoch": 1.26, + "grad_norm": 0.6060929298400879, + "learning_rate": 0.0003757740086746526, + "loss": 3.026, + "step": 25632 + }, + { + "epoch": 1.26, + "grad_norm": 0.5800380110740662, + "learning_rate": 0.00037575911159043273, + "loss": 3.027, + "step": 25633 + }, + { + "epoch": 1.26, + "grad_norm": 0.5752260684967041, + "learning_rate": 0.00037574421430667794, + "loss": 3.1041, + "step": 25634 + }, + { + "epoch": 1.26, + "grad_norm": 0.5729960799217224, + "learning_rate": 0.00037572931682342735, + "loss": 3.0167, + "step": 25635 + }, + { + "epoch": 1.26, + "grad_norm": 0.5333654284477234, + "learning_rate": 0.0003757144191407204, + "loss": 3.3717, + "step": 25636 + }, + { + "epoch": 1.26, + "grad_norm": 0.5531308650970459, + "learning_rate": 0.0003756995212585963, + "loss": 3.0053, + "step": 25637 + }, + { + "epoch": 1.26, + "grad_norm": 0.563683271408081, + "learning_rate": 0.00037568462317709423, + "loss": 3.0348, + "step": 25638 + }, + { + "epoch": 1.26, + "grad_norm": 0.5568251609802246, + "learning_rate": 0.00037566972489625345, + "loss": 2.9534, + "step": 25639 + }, + { + "epoch": 1.26, + "grad_norm": 0.5349379181861877, + "learning_rate": 0.0003756548264161131, + "loss": 3.0158, + "step": 25640 + }, + { + "epoch": 1.26, + "grad_norm": 0.5677779316902161, + "learning_rate": 0.0003756399277367125, + "loss": 3.0896, + "step": 25641 + }, + { + "epoch": 1.26, + "grad_norm": 0.5582015514373779, + "learning_rate": 0.0003756250288580909, + "loss": 3.0478, + "step": 25642 + }, + { + "epoch": 1.26, + "grad_norm": 0.539047360420227, + "learning_rate": 0.00037561012978028755, + "loss": 2.9591, + "step": 25643 + }, + { + "epoch": 1.26, + "grad_norm": 0.5575973391532898, + "learning_rate": 0.0003755952305033416, + "loss": 3.1518, + "step": 25644 + }, + { + "epoch": 1.26, + "grad_norm": 0.5632666349411011, + "learning_rate": 0.00037558033102729247, + "loss": 3.1166, + "step": 25645 + }, + { + "epoch": 1.26, + "grad_norm": 0.5250673890113831, + "learning_rate": 0.00037556543135217925, + "loss": 3.0891, + "step": 25646 + }, + { + "epoch": 1.26, + "grad_norm": 0.571751594543457, + "learning_rate": 0.00037555053147804115, + "loss": 2.9595, + "step": 25647 + }, + { + "epoch": 1.26, + "grad_norm": 0.5709319114685059, + "learning_rate": 0.0003755356314049176, + "loss": 2.9572, + "step": 25648 + }, + { + "epoch": 1.26, + "grad_norm": 0.6075392961502075, + "learning_rate": 0.0003755207311328478, + "loss": 3.1889, + "step": 25649 + }, + { + "epoch": 1.26, + "grad_norm": 0.5540075898170471, + "learning_rate": 0.00037550583066187077, + "loss": 3.046, + "step": 25650 + }, + { + "epoch": 1.26, + "grad_norm": 0.5717571973800659, + "learning_rate": 0.0003754909299920259, + "loss": 3.1389, + "step": 25651 + }, + { + "epoch": 1.26, + "grad_norm": 0.5485336780548096, + "learning_rate": 0.00037547602912335257, + "loss": 2.9172, + "step": 25652 + }, + { + "epoch": 1.26, + "grad_norm": 0.6030722260475159, + "learning_rate": 0.00037546112805588984, + "loss": 2.8894, + "step": 25653 + }, + { + "epoch": 1.26, + "grad_norm": 0.5882648825645447, + "learning_rate": 0.000375446226789677, + "loss": 2.9703, + "step": 25654 + }, + { + "epoch": 1.26, + "grad_norm": 0.5773414969444275, + "learning_rate": 0.0003754313253247533, + "loss": 3.2545, + "step": 25655 + }, + { + "epoch": 1.26, + "grad_norm": 0.5464721322059631, + "learning_rate": 0.00037541642366115813, + "loss": 3.2103, + "step": 25656 + }, + { + "epoch": 1.26, + "grad_norm": 0.59247225522995, + "learning_rate": 0.00037540152179893054, + "loss": 3.0535, + "step": 25657 + }, + { + "epoch": 1.26, + "grad_norm": 0.5408132076263428, + "learning_rate": 0.0003753866197381098, + "loss": 2.9782, + "step": 25658 + }, + { + "epoch": 1.26, + "grad_norm": 0.5488524436950684, + "learning_rate": 0.0003753717174787352, + "loss": 2.9827, + "step": 25659 + }, + { + "epoch": 1.26, + "grad_norm": 0.5907993316650391, + "learning_rate": 0.0003753568150208462, + "loss": 3.2103, + "step": 25660 + }, + { + "epoch": 1.26, + "grad_norm": 0.568075954914093, + "learning_rate": 0.0003753419123644816, + "loss": 2.89, + "step": 25661 + }, + { + "epoch": 1.26, + "grad_norm": 0.6084296703338623, + "learning_rate": 0.000375327009509681, + "loss": 3.1351, + "step": 25662 + }, + { + "epoch": 1.26, + "grad_norm": 0.5559454560279846, + "learning_rate": 0.00037531210645648364, + "loss": 3.2423, + "step": 25663 + }, + { + "epoch": 1.26, + "grad_norm": 0.5514500141143799, + "learning_rate": 0.00037529720320492865, + "loss": 3.065, + "step": 25664 + }, + { + "epoch": 1.26, + "grad_norm": 0.5497595071792603, + "learning_rate": 0.00037528229975505525, + "loss": 3.0918, + "step": 25665 + }, + { + "epoch": 1.26, + "grad_norm": 0.5739080905914307, + "learning_rate": 0.00037526739610690274, + "loss": 3.0684, + "step": 25666 + }, + { + "epoch": 1.26, + "grad_norm": 0.5830264687538147, + "learning_rate": 0.00037525249226051053, + "loss": 3.192, + "step": 25667 + }, + { + "epoch": 1.26, + "grad_norm": 0.5546814203262329, + "learning_rate": 0.00037523758821591765, + "loss": 3.0591, + "step": 25668 + }, + { + "epoch": 1.26, + "grad_norm": 0.5485287308692932, + "learning_rate": 0.00037522268397316346, + "loss": 3.1005, + "step": 25669 + }, + { + "epoch": 1.26, + "grad_norm": 0.5739904642105103, + "learning_rate": 0.0003752077795322872, + "loss": 3.0651, + "step": 25670 + }, + { + "epoch": 1.26, + "grad_norm": 0.6011346578598022, + "learning_rate": 0.0003751928748933281, + "loss": 3.1442, + "step": 25671 + }, + { + "epoch": 1.26, + "grad_norm": 0.5489895939826965, + "learning_rate": 0.0003751779700563254, + "loss": 2.8407, + "step": 25672 + }, + { + "epoch": 1.26, + "grad_norm": 0.5272802710533142, + "learning_rate": 0.00037516306502131843, + "loss": 3.075, + "step": 25673 + }, + { + "epoch": 1.26, + "grad_norm": 0.6106321811676025, + "learning_rate": 0.0003751481597883464, + "loss": 3.0871, + "step": 25674 + }, + { + "epoch": 1.26, + "grad_norm": 0.5586559772491455, + "learning_rate": 0.0003751332543574487, + "loss": 2.8499, + "step": 25675 + }, + { + "epoch": 1.26, + "grad_norm": 0.5668689012527466, + "learning_rate": 0.0003751183487286644, + "loss": 3.2104, + "step": 25676 + }, + { + "epoch": 1.26, + "grad_norm": 0.5656947493553162, + "learning_rate": 0.0003751034429020327, + "loss": 3.0308, + "step": 25677 + }, + { + "epoch": 1.26, + "grad_norm": 0.5509669184684753, + "learning_rate": 0.0003750885368775932, + "loss": 3.2247, + "step": 25678 + }, + { + "epoch": 1.26, + "grad_norm": 0.581092894077301, + "learning_rate": 0.00037507363065538476, + "loss": 3.0334, + "step": 25679 + }, + { + "epoch": 1.26, + "grad_norm": 0.5440135598182678, + "learning_rate": 0.0003750587242354469, + "loss": 3.162, + "step": 25680 + }, + { + "epoch": 1.26, + "grad_norm": 0.5443293452262878, + "learning_rate": 0.0003750438176178188, + "loss": 3.1321, + "step": 25681 + }, + { + "epoch": 1.26, + "grad_norm": 0.5595262050628662, + "learning_rate": 0.00037502891080253974, + "loss": 3.1387, + "step": 25682 + }, + { + "epoch": 1.26, + "grad_norm": 0.5496951937675476, + "learning_rate": 0.00037501400378964894, + "loss": 2.9888, + "step": 25683 + }, + { + "epoch": 1.26, + "grad_norm": 0.5384321212768555, + "learning_rate": 0.00037499909657918567, + "loss": 3.1049, + "step": 25684 + }, + { + "epoch": 1.26, + "grad_norm": 0.6079148650169373, + "learning_rate": 0.0003749841891711893, + "loss": 3.1243, + "step": 25685 + }, + { + "epoch": 1.26, + "grad_norm": 0.5618904829025269, + "learning_rate": 0.000374969281565699, + "loss": 2.9805, + "step": 25686 + }, + { + "epoch": 1.26, + "grad_norm": 0.6069911122322083, + "learning_rate": 0.00037495437376275397, + "loss": 3.343, + "step": 25687 + }, + { + "epoch": 1.26, + "grad_norm": 0.7043254971504211, + "learning_rate": 0.0003749394657623936, + "loss": 3.1197, + "step": 25688 + }, + { + "epoch": 1.26, + "grad_norm": 0.5782732367515564, + "learning_rate": 0.00037492455756465705, + "loss": 2.9482, + "step": 25689 + }, + { + "epoch": 1.26, + "grad_norm": 0.5924282670021057, + "learning_rate": 0.00037490964916958363, + "loss": 3.2733, + "step": 25690 + }, + { + "epoch": 1.26, + "grad_norm": 0.5641574859619141, + "learning_rate": 0.0003748947405772127, + "loss": 3.0016, + "step": 25691 + }, + { + "epoch": 1.26, + "grad_norm": 0.596458375453949, + "learning_rate": 0.0003748798317875833, + "loss": 3.0319, + "step": 25692 + }, + { + "epoch": 1.26, + "grad_norm": 0.6097240447998047, + "learning_rate": 0.00037486492280073495, + "loss": 3.3717, + "step": 25693 + }, + { + "epoch": 1.26, + "grad_norm": 0.5604461431503296, + "learning_rate": 0.00037485001361670675, + "loss": 2.8643, + "step": 25694 + }, + { + "epoch": 1.26, + "grad_norm": 0.5882611274719238, + "learning_rate": 0.000374835104235538, + "loss": 2.9761, + "step": 25695 + }, + { + "epoch": 1.26, + "grad_norm": 0.5667465925216675, + "learning_rate": 0.000374820194657268, + "loss": 3.1456, + "step": 25696 + }, + { + "epoch": 1.26, + "grad_norm": 0.5910031199455261, + "learning_rate": 0.00037480528488193605, + "loss": 2.9162, + "step": 25697 + }, + { + "epoch": 1.26, + "grad_norm": 0.5716198682785034, + "learning_rate": 0.00037479037490958136, + "loss": 3.0564, + "step": 25698 + }, + { + "epoch": 1.26, + "grad_norm": 0.5668255090713501, + "learning_rate": 0.0003747754647402432, + "loss": 3.2102, + "step": 25699 + }, + { + "epoch": 1.26, + "grad_norm": 0.5733356475830078, + "learning_rate": 0.00037476055437396093, + "loss": 3.3008, + "step": 25700 + }, + { + "epoch": 1.26, + "grad_norm": 0.5971511006355286, + "learning_rate": 0.00037474564381077363, + "loss": 3.0688, + "step": 25701 + }, + { + "epoch": 1.26, + "grad_norm": 0.5802672505378723, + "learning_rate": 0.0003747307330507207, + "loss": 2.9328, + "step": 25702 + }, + { + "epoch": 1.26, + "grad_norm": 0.6406810879707336, + "learning_rate": 0.0003747158220938415, + "loss": 3.0136, + "step": 25703 + }, + { + "epoch": 1.26, + "grad_norm": 0.5840638279914856, + "learning_rate": 0.0003747009109401752, + "loss": 3.0113, + "step": 25704 + }, + { + "epoch": 1.26, + "grad_norm": 0.55064457654953, + "learning_rate": 0.000374685999589761, + "loss": 3.0928, + "step": 25705 + }, + { + "epoch": 1.26, + "grad_norm": 0.5626662373542786, + "learning_rate": 0.00037467108804263826, + "loss": 2.9046, + "step": 25706 + }, + { + "epoch": 1.26, + "grad_norm": 0.6290438175201416, + "learning_rate": 0.00037465617629884625, + "loss": 3.0341, + "step": 25707 + }, + { + "epoch": 1.26, + "grad_norm": 0.5759810209274292, + "learning_rate": 0.00037464126435842425, + "loss": 3.1417, + "step": 25708 + }, + { + "epoch": 1.26, + "grad_norm": 0.6134321689605713, + "learning_rate": 0.00037462635222141146, + "loss": 3.2309, + "step": 25709 + }, + { + "epoch": 1.26, + "grad_norm": 0.578365683555603, + "learning_rate": 0.0003746114398878473, + "loss": 3.1481, + "step": 25710 + }, + { + "epoch": 1.26, + "grad_norm": 0.5462852716445923, + "learning_rate": 0.00037459652735777105, + "loss": 3.1731, + "step": 25711 + }, + { + "epoch": 1.26, + "grad_norm": 0.636318027973175, + "learning_rate": 0.00037458161463122165, + "loss": 2.9947, + "step": 25712 + }, + { + "epoch": 1.26, + "grad_norm": 0.5679839849472046, + "learning_rate": 0.00037456670170823883, + "loss": 3.122, + "step": 25713 + }, + { + "epoch": 1.26, + "grad_norm": 0.5561462640762329, + "learning_rate": 0.0003745517885888617, + "loss": 3.2713, + "step": 25714 + }, + { + "epoch": 1.26, + "grad_norm": 0.6204407811164856, + "learning_rate": 0.00037453687527312944, + "loss": 3.0994, + "step": 25715 + }, + { + "epoch": 1.26, + "grad_norm": 0.52779620885849, + "learning_rate": 0.0003745219617610813, + "loss": 3.1557, + "step": 25716 + }, + { + "epoch": 1.26, + "grad_norm": 0.5464897155761719, + "learning_rate": 0.00037450704805275676, + "loss": 3.1137, + "step": 25717 + }, + { + "epoch": 1.26, + "grad_norm": 0.6134060621261597, + "learning_rate": 0.00037449213414819497, + "loss": 2.9496, + "step": 25718 + }, + { + "epoch": 1.26, + "grad_norm": 0.5954126119613647, + "learning_rate": 0.00037447722004743526, + "loss": 3.1065, + "step": 25719 + }, + { + "epoch": 1.26, + "grad_norm": 0.5473684668540955, + "learning_rate": 0.00037446230575051686, + "loss": 3.0049, + "step": 25720 + }, + { + "epoch": 1.26, + "grad_norm": 0.5591050386428833, + "learning_rate": 0.0003744473912574791, + "loss": 3.114, + "step": 25721 + }, + { + "epoch": 1.26, + "grad_norm": 0.5836315751075745, + "learning_rate": 0.0003744324765683613, + "loss": 3.046, + "step": 25722 + }, + { + "epoch": 1.26, + "grad_norm": 0.6122407913208008, + "learning_rate": 0.0003744175616832025, + "loss": 3.0378, + "step": 25723 + }, + { + "epoch": 1.26, + "grad_norm": 0.570861279964447, + "learning_rate": 0.0003744026466020422, + "loss": 3.1949, + "step": 25724 + }, + { + "epoch": 1.26, + "grad_norm": 0.534397542476654, + "learning_rate": 0.00037438773132491984, + "loss": 3.0312, + "step": 25725 + }, + { + "epoch": 1.26, + "grad_norm": 0.536069929599762, + "learning_rate": 0.0003743728158518744, + "loss": 3.0838, + "step": 25726 + }, + { + "epoch": 1.26, + "grad_norm": 0.5722454190254211, + "learning_rate": 0.0003743579001829452, + "loss": 3.2043, + "step": 25727 + }, + { + "epoch": 1.26, + "grad_norm": 0.5977911949157715, + "learning_rate": 0.0003743429843181716, + "loss": 3.0963, + "step": 25728 + }, + { + "epoch": 1.26, + "grad_norm": 0.566826343536377, + "learning_rate": 0.000374328068257593, + "loss": 2.9931, + "step": 25729 + }, + { + "epoch": 1.26, + "grad_norm": 0.5475797057151794, + "learning_rate": 0.00037431315200124853, + "loss": 3.1475, + "step": 25730 + }, + { + "epoch": 1.26, + "grad_norm": 0.5495274662971497, + "learning_rate": 0.0003742982355491775, + "loss": 3.1989, + "step": 25731 + }, + { + "epoch": 1.26, + "grad_norm": 0.559955358505249, + "learning_rate": 0.00037428331890141926, + "loss": 3.015, + "step": 25732 + }, + { + "epoch": 1.26, + "grad_norm": 0.6134682893753052, + "learning_rate": 0.000374268402058013, + "loss": 2.9699, + "step": 25733 + }, + { + "epoch": 1.26, + "grad_norm": 0.5740057826042175, + "learning_rate": 0.0003742534850189981, + "loss": 3.2211, + "step": 25734 + }, + { + "epoch": 1.26, + "grad_norm": 0.610024094581604, + "learning_rate": 0.00037423856778441377, + "loss": 2.9174, + "step": 25735 + }, + { + "epoch": 1.26, + "grad_norm": 0.6208667755126953, + "learning_rate": 0.00037422365035429936, + "loss": 3.155, + "step": 25736 + }, + { + "epoch": 1.26, + "grad_norm": 0.5483558773994446, + "learning_rate": 0.00037420873272869424, + "loss": 3.1108, + "step": 25737 + }, + { + "epoch": 1.26, + "grad_norm": 0.5884615182876587, + "learning_rate": 0.00037419381490763745, + "loss": 3.1444, + "step": 25738 + }, + { + "epoch": 1.26, + "grad_norm": 0.5598413348197937, + "learning_rate": 0.00037417889689116845, + "loss": 3.1624, + "step": 25739 + }, + { + "epoch": 1.26, + "grad_norm": 0.5523557066917419, + "learning_rate": 0.0003741639786793266, + "loss": 3.1961, + "step": 25740 + }, + { + "epoch": 1.26, + "grad_norm": 0.6840699911117554, + "learning_rate": 0.00037414906027215104, + "loss": 3.0235, + "step": 25741 + }, + { + "epoch": 1.26, + "grad_norm": 0.5679307579994202, + "learning_rate": 0.0003741341416696811, + "loss": 3.0551, + "step": 25742 + }, + { + "epoch": 1.26, + "grad_norm": 0.5635999441146851, + "learning_rate": 0.00037411922287195615, + "loss": 3.016, + "step": 25743 + }, + { + "epoch": 1.26, + "grad_norm": 0.561026394367218, + "learning_rate": 0.0003741043038790154, + "loss": 2.9945, + "step": 25744 + }, + { + "epoch": 1.26, + "grad_norm": 0.5478890538215637, + "learning_rate": 0.0003740893846908982, + "loss": 3.12, + "step": 25745 + }, + { + "epoch": 1.26, + "grad_norm": 0.5661895871162415, + "learning_rate": 0.0003740744653076438, + "loss": 3.1501, + "step": 25746 + }, + { + "epoch": 1.26, + "grad_norm": 0.5741684436798096, + "learning_rate": 0.00037405954572929155, + "loss": 2.9514, + "step": 25747 + }, + { + "epoch": 1.26, + "grad_norm": 0.5624836087226868, + "learning_rate": 0.0003740446259558807, + "loss": 3.0239, + "step": 25748 + }, + { + "epoch": 1.26, + "grad_norm": 0.5481595993041992, + "learning_rate": 0.0003740297059874505, + "loss": 3.1957, + "step": 25749 + }, + { + "epoch": 1.26, + "grad_norm": 0.5887608528137207, + "learning_rate": 0.0003740147858240403, + "loss": 3.2924, + "step": 25750 + }, + { + "epoch": 1.26, + "grad_norm": 0.5847222805023193, + "learning_rate": 0.0003739998654656896, + "loss": 3.0791, + "step": 25751 + }, + { + "epoch": 1.26, + "grad_norm": 0.6101052761077881, + "learning_rate": 0.00037398494491243736, + "loss": 2.9752, + "step": 25752 + }, + { + "epoch": 1.26, + "grad_norm": 0.5880925059318542, + "learning_rate": 0.00037397002416432283, + "loss": 2.9945, + "step": 25753 + }, + { + "epoch": 1.26, + "grad_norm": 0.5594073534011841, + "learning_rate": 0.00037395510322138574, + "loss": 3.0858, + "step": 25754 + }, + { + "epoch": 1.26, + "grad_norm": 0.5806262493133545, + "learning_rate": 0.00037394018208366505, + "loss": 3.1636, + "step": 25755 + }, + { + "epoch": 1.26, + "grad_norm": 0.6017138361930847, + "learning_rate": 0.00037392526075120014, + "loss": 3.1412, + "step": 25756 + }, + { + "epoch": 1.26, + "grad_norm": 0.5765290260314941, + "learning_rate": 0.00037391033922403035, + "loss": 3.1801, + "step": 25757 + }, + { + "epoch": 1.26, + "grad_norm": 0.6031667590141296, + "learning_rate": 0.00037389541750219494, + "loss": 3.0756, + "step": 25758 + }, + { + "epoch": 1.26, + "grad_norm": 0.5250658988952637, + "learning_rate": 0.0003738804955857332, + "loss": 3.0219, + "step": 25759 + }, + { + "epoch": 1.26, + "grad_norm": 0.5491588115692139, + "learning_rate": 0.0003738655734746845, + "loss": 2.9849, + "step": 25760 + }, + { + "epoch": 1.26, + "grad_norm": 0.5519552826881409, + "learning_rate": 0.000373850651169088, + "loss": 3.1823, + "step": 25761 + }, + { + "epoch": 1.26, + "grad_norm": 0.5483062267303467, + "learning_rate": 0.0003738357286689832, + "loss": 3.0518, + "step": 25762 + }, + { + "epoch": 1.26, + "grad_norm": 0.5620872378349304, + "learning_rate": 0.0003738208059744093, + "loss": 3.1545, + "step": 25763 + }, + { + "epoch": 1.26, + "grad_norm": 0.5591562986373901, + "learning_rate": 0.00037380588308540553, + "loss": 3.0748, + "step": 25764 + }, + { + "epoch": 1.26, + "grad_norm": 0.6279521584510803, + "learning_rate": 0.0003737909600020113, + "loss": 3.0854, + "step": 25765 + }, + { + "epoch": 1.26, + "grad_norm": 0.6239844560623169, + "learning_rate": 0.0003737760367242659, + "loss": 3.1755, + "step": 25766 + }, + { + "epoch": 1.26, + "grad_norm": 0.536332905292511, + "learning_rate": 0.0003737611132522087, + "loss": 3.1022, + "step": 25767 + }, + { + "epoch": 1.26, + "grad_norm": 0.5568017363548279, + "learning_rate": 0.00037374618958587873, + "loss": 3.2276, + "step": 25768 + }, + { + "epoch": 1.26, + "grad_norm": 0.546877920627594, + "learning_rate": 0.0003737312657253156, + "loss": 2.9514, + "step": 25769 + }, + { + "epoch": 1.26, + "grad_norm": 0.6245750784873962, + "learning_rate": 0.0003737163416705585, + "loss": 3.0925, + "step": 25770 + }, + { + "epoch": 1.26, + "grad_norm": 0.609423816204071, + "learning_rate": 0.0003737014174216467, + "loss": 3.0713, + "step": 25771 + }, + { + "epoch": 1.26, + "grad_norm": 0.5926220417022705, + "learning_rate": 0.0003736864929786196, + "loss": 3.1992, + "step": 25772 + }, + { + "epoch": 1.26, + "grad_norm": 0.5451133251190186, + "learning_rate": 0.0003736715683415165, + "loss": 3.0744, + "step": 25773 + }, + { + "epoch": 1.26, + "grad_norm": 0.5710421204566956, + "learning_rate": 0.00037365664351037654, + "loss": 3.0953, + "step": 25774 + }, + { + "epoch": 1.26, + "grad_norm": 0.5600199699401855, + "learning_rate": 0.0003736417184852393, + "loss": 3.1269, + "step": 25775 + }, + { + "epoch": 1.26, + "grad_norm": 0.5701009035110474, + "learning_rate": 0.00037362679326614386, + "loss": 3.1154, + "step": 25776 + }, + { + "epoch": 1.26, + "grad_norm": 0.6089168787002563, + "learning_rate": 0.0003736118678531297, + "loss": 3.0837, + "step": 25777 + }, + { + "epoch": 1.26, + "grad_norm": 0.5491774082183838, + "learning_rate": 0.0003735969422462359, + "loss": 3.3128, + "step": 25778 + }, + { + "epoch": 1.26, + "grad_norm": 0.5762932300567627, + "learning_rate": 0.000373582016445502, + "loss": 3.0711, + "step": 25779 + }, + { + "epoch": 1.26, + "grad_norm": 0.5476071834564209, + "learning_rate": 0.0003735670904509673, + "loss": 2.887, + "step": 25780 + }, + { + "epoch": 1.26, + "grad_norm": 0.6149681806564331, + "learning_rate": 0.00037355216426267093, + "loss": 3.0442, + "step": 25781 + }, + { + "epoch": 1.26, + "grad_norm": 0.5968160033226013, + "learning_rate": 0.00037353723788065236, + "loss": 2.8897, + "step": 25782 + }, + { + "epoch": 1.26, + "grad_norm": 0.5533415675163269, + "learning_rate": 0.00037352231130495083, + "loss": 3.1161, + "step": 25783 + }, + { + "epoch": 1.26, + "grad_norm": 0.5641621351242065, + "learning_rate": 0.00037350738453560575, + "loss": 3.1067, + "step": 25784 + }, + { + "epoch": 1.26, + "grad_norm": 0.5823823809623718, + "learning_rate": 0.00037349245757265626, + "loss": 3.1385, + "step": 25785 + }, + { + "epoch": 1.26, + "grad_norm": 0.6093672513961792, + "learning_rate": 0.0003734775304161419, + "loss": 3.1447, + "step": 25786 + }, + { + "epoch": 1.26, + "grad_norm": 0.5846396684646606, + "learning_rate": 0.00037346260306610175, + "loss": 2.9545, + "step": 25787 + }, + { + "epoch": 1.26, + "grad_norm": 0.5794227719306946, + "learning_rate": 0.0003734476755225754, + "loss": 2.8661, + "step": 25788 + }, + { + "epoch": 1.26, + "grad_norm": 0.566815972328186, + "learning_rate": 0.0003734327477856018, + "loss": 3.0093, + "step": 25789 + }, + { + "epoch": 1.26, + "grad_norm": 0.5866063833236694, + "learning_rate": 0.00037341781985522057, + "loss": 2.9509, + "step": 25790 + }, + { + "epoch": 1.26, + "grad_norm": 0.552588939666748, + "learning_rate": 0.000373402891731471, + "loss": 3.1778, + "step": 25791 + }, + { + "epoch": 1.26, + "grad_norm": 0.5920195579528809, + "learning_rate": 0.0003733879634143923, + "loss": 3.0382, + "step": 25792 + }, + { + "epoch": 1.26, + "grad_norm": 0.5480687618255615, + "learning_rate": 0.0003733730349040238, + "loss": 3.0309, + "step": 25793 + }, + { + "epoch": 1.26, + "grad_norm": 0.5250555276870728, + "learning_rate": 0.0003733581062004049, + "loss": 3.2171, + "step": 25794 + }, + { + "epoch": 1.26, + "grad_norm": 0.6039732694625854, + "learning_rate": 0.0003733431773035748, + "loss": 3.1888, + "step": 25795 + }, + { + "epoch": 1.26, + "grad_norm": 0.5604811310768127, + "learning_rate": 0.0003733282482135729, + "loss": 3.1069, + "step": 25796 + }, + { + "epoch": 1.26, + "grad_norm": 0.597496509552002, + "learning_rate": 0.0003733133189304385, + "loss": 3.1567, + "step": 25797 + }, + { + "epoch": 1.26, + "grad_norm": 0.5957289338111877, + "learning_rate": 0.00037329838945421095, + "loss": 3.0694, + "step": 25798 + }, + { + "epoch": 1.26, + "grad_norm": 0.5216500163078308, + "learning_rate": 0.00037328345978492966, + "loss": 3.0539, + "step": 25799 + }, + { + "epoch": 1.26, + "grad_norm": 0.5505014061927795, + "learning_rate": 0.0003732685299226336, + "loss": 3.1195, + "step": 25800 + }, + { + "epoch": 1.26, + "grad_norm": 0.5500777363777161, + "learning_rate": 0.0003732535998673624, + "loss": 3.007, + "step": 25801 + }, + { + "epoch": 1.26, + "grad_norm": 0.6095644235610962, + "learning_rate": 0.00037323866961915545, + "loss": 2.8892, + "step": 25802 + }, + { + "epoch": 1.26, + "grad_norm": 0.5533205270767212, + "learning_rate": 0.0003732237391780519, + "loss": 3.0392, + "step": 25803 + }, + { + "epoch": 1.26, + "grad_norm": 0.576029360294342, + "learning_rate": 0.0003732088085440911, + "loss": 2.8924, + "step": 25804 + }, + { + "epoch": 1.26, + "grad_norm": 0.5615506172180176, + "learning_rate": 0.00037319387771731237, + "loss": 3.0801, + "step": 25805 + }, + { + "epoch": 1.26, + "grad_norm": 0.5568882822990417, + "learning_rate": 0.000373178946697755, + "loss": 3.1054, + "step": 25806 + }, + { + "epoch": 1.26, + "grad_norm": 0.554295003414154, + "learning_rate": 0.00037316401548545845, + "loss": 3.1786, + "step": 25807 + }, + { + "epoch": 1.26, + "grad_norm": 0.5566840767860413, + "learning_rate": 0.0003731490840804619, + "loss": 3.0353, + "step": 25808 + }, + { + "epoch": 1.26, + "grad_norm": 0.5562843084335327, + "learning_rate": 0.00037313415248280477, + "loss": 3.3003, + "step": 25809 + }, + { + "epoch": 1.26, + "grad_norm": 0.569489061832428, + "learning_rate": 0.00037311922069252636, + "loss": 3.155, + "step": 25810 + }, + { + "epoch": 1.26, + "grad_norm": 0.537324070930481, + "learning_rate": 0.00037310428870966595, + "loss": 3.0376, + "step": 25811 + }, + { + "epoch": 1.26, + "grad_norm": 0.5831682085990906, + "learning_rate": 0.00037308935653426295, + "loss": 3.1946, + "step": 25812 + }, + { + "epoch": 1.27, + "grad_norm": 0.5841866135597229, + "learning_rate": 0.0003730744241663567, + "loss": 3.2204, + "step": 25813 + }, + { + "epoch": 1.27, + "grad_norm": 0.5594460368156433, + "learning_rate": 0.0003730594916059864, + "loss": 3.1202, + "step": 25814 + }, + { + "epoch": 1.27, + "grad_norm": 0.635952353477478, + "learning_rate": 0.0003730445588531915, + "loss": 3.0469, + "step": 25815 + }, + { + "epoch": 1.27, + "grad_norm": 0.5496728420257568, + "learning_rate": 0.00037302962590801133, + "loss": 3.0797, + "step": 25816 + }, + { + "epoch": 1.27, + "grad_norm": 0.5622497797012329, + "learning_rate": 0.00037301469277048515, + "loss": 3.0121, + "step": 25817 + }, + { + "epoch": 1.27, + "grad_norm": 0.5610560774803162, + "learning_rate": 0.0003729997594406523, + "loss": 2.9417, + "step": 25818 + }, + { + "epoch": 1.27, + "grad_norm": 0.5938122272491455, + "learning_rate": 0.0003729848259185521, + "loss": 2.8371, + "step": 25819 + }, + { + "epoch": 1.27, + "grad_norm": 0.5797512531280518, + "learning_rate": 0.0003729698922042239, + "loss": 2.8959, + "step": 25820 + }, + { + "epoch": 1.27, + "grad_norm": 0.5663214921951294, + "learning_rate": 0.00037295495829770713, + "loss": 3.0954, + "step": 25821 + }, + { + "epoch": 1.27, + "grad_norm": 0.5849252343177795, + "learning_rate": 0.00037294002419904104, + "loss": 2.9451, + "step": 25822 + }, + { + "epoch": 1.27, + "grad_norm": 0.5354498028755188, + "learning_rate": 0.00037292508990826494, + "loss": 3.0579, + "step": 25823 + }, + { + "epoch": 1.27, + "grad_norm": 0.5267759561538696, + "learning_rate": 0.0003729101554254182, + "loss": 3.1529, + "step": 25824 + }, + { + "epoch": 1.27, + "grad_norm": 0.5743757486343384, + "learning_rate": 0.00037289522075054007, + "loss": 2.9971, + "step": 25825 + }, + { + "epoch": 1.27, + "grad_norm": 0.5666912198066711, + "learning_rate": 0.00037288028588367, + "loss": 3.1979, + "step": 25826 + }, + { + "epoch": 1.27, + "grad_norm": 0.5688309669494629, + "learning_rate": 0.0003728653508248474, + "loss": 3.0861, + "step": 25827 + }, + { + "epoch": 1.27, + "grad_norm": 0.5575060248374939, + "learning_rate": 0.00037285041557411135, + "loss": 3.1098, + "step": 25828 + }, + { + "epoch": 1.27, + "grad_norm": 0.5437538623809814, + "learning_rate": 0.0003728354801315014, + "loss": 3.1594, + "step": 25829 + }, + { + "epoch": 1.27, + "grad_norm": 0.6106091141700745, + "learning_rate": 0.00037282054449705665, + "loss": 3.0998, + "step": 25830 + }, + { + "epoch": 1.27, + "grad_norm": 0.6044281125068665, + "learning_rate": 0.0003728056086708168, + "loss": 3.0176, + "step": 25831 + }, + { + "epoch": 1.27, + "grad_norm": 0.5630094408988953, + "learning_rate": 0.00037279067265282094, + "loss": 3.1904, + "step": 25832 + }, + { + "epoch": 1.27, + "grad_norm": 0.5635753273963928, + "learning_rate": 0.0003727757364431084, + "loss": 3.1233, + "step": 25833 + }, + { + "epoch": 1.27, + "grad_norm": 0.5579873919487, + "learning_rate": 0.0003727608000417186, + "loss": 2.8207, + "step": 25834 + }, + { + "epoch": 1.27, + "grad_norm": 0.5992753505706787, + "learning_rate": 0.0003727458634486909, + "loss": 3.0802, + "step": 25835 + }, + { + "epoch": 1.27, + "grad_norm": 0.5542998313903809, + "learning_rate": 0.0003727309266640645, + "loss": 3.1656, + "step": 25836 + }, + { + "epoch": 1.27, + "grad_norm": 0.5979121327400208, + "learning_rate": 0.0003727159896878789, + "loss": 3.2221, + "step": 25837 + }, + { + "epoch": 1.27, + "grad_norm": 0.5464156270027161, + "learning_rate": 0.00037270105252017335, + "loss": 3.0004, + "step": 25838 + }, + { + "epoch": 1.27, + "grad_norm": 0.5646561980247498, + "learning_rate": 0.00037268611516098725, + "loss": 3.3144, + "step": 25839 + }, + { + "epoch": 1.27, + "grad_norm": 0.60554438829422, + "learning_rate": 0.00037267117761036, + "loss": 3.0871, + "step": 25840 + }, + { + "epoch": 1.27, + "grad_norm": 0.5654262900352478, + "learning_rate": 0.0003726562398683307, + "loss": 2.938, + "step": 25841 + }, + { + "epoch": 1.27, + "grad_norm": 0.5625606179237366, + "learning_rate": 0.0003726413019349389, + "loss": 3.161, + "step": 25842 + }, + { + "epoch": 1.27, + "grad_norm": 0.5684981346130371, + "learning_rate": 0.0003726263638102239, + "loss": 3.0923, + "step": 25843 + }, + { + "epoch": 1.27, + "grad_norm": 0.5840631723403931, + "learning_rate": 0.00037261142549422505, + "loss": 3.1465, + "step": 25844 + }, + { + "epoch": 1.27, + "grad_norm": 0.5702567100524902, + "learning_rate": 0.0003725964869869816, + "loss": 3.1471, + "step": 25845 + }, + { + "epoch": 1.27, + "grad_norm": 0.61046302318573, + "learning_rate": 0.0003725815482885331, + "loss": 3.1633, + "step": 25846 + }, + { + "epoch": 1.27, + "grad_norm": 0.557167649269104, + "learning_rate": 0.00037256660939891865, + "loss": 3.0677, + "step": 25847 + }, + { + "epoch": 1.27, + "grad_norm": 0.5500303506851196, + "learning_rate": 0.00037255167031817775, + "loss": 3.3199, + "step": 25848 + }, + { + "epoch": 1.27, + "grad_norm": 0.5514801144599915, + "learning_rate": 0.00037253673104634973, + "loss": 3.1673, + "step": 25849 + }, + { + "epoch": 1.27, + "grad_norm": 0.5589605569839478, + "learning_rate": 0.000372521791583474, + "loss": 3.0712, + "step": 25850 + }, + { + "epoch": 1.27, + "grad_norm": 0.5684011578559875, + "learning_rate": 0.0003725068519295897, + "loss": 3.0508, + "step": 25851 + }, + { + "epoch": 1.27, + "grad_norm": 0.559701144695282, + "learning_rate": 0.0003724919120847363, + "loss": 3.0181, + "step": 25852 + }, + { + "epoch": 1.27, + "grad_norm": 0.5717242360115051, + "learning_rate": 0.00037247697204895327, + "loss": 2.9951, + "step": 25853 + }, + { + "epoch": 1.27, + "grad_norm": 0.5555175542831421, + "learning_rate": 0.0003724620318222798, + "loss": 2.9823, + "step": 25854 + }, + { + "epoch": 1.27, + "grad_norm": 0.5793154239654541, + "learning_rate": 0.00037244709140475527, + "loss": 3.1666, + "step": 25855 + }, + { + "epoch": 1.27, + "grad_norm": 0.552769660949707, + "learning_rate": 0.0003724321507964191, + "loss": 3.2035, + "step": 25856 + }, + { + "epoch": 1.27, + "grad_norm": 0.5471254587173462, + "learning_rate": 0.0003724172099973105, + "loss": 3.1123, + "step": 25857 + }, + { + "epoch": 1.27, + "grad_norm": 0.5685672760009766, + "learning_rate": 0.0003724022690074689, + "loss": 3.0484, + "step": 25858 + }, + { + "epoch": 1.27, + "grad_norm": 0.5822762250900269, + "learning_rate": 0.0003723873278269337, + "loss": 3.0311, + "step": 25859 + }, + { + "epoch": 1.27, + "grad_norm": 0.5946375131607056, + "learning_rate": 0.0003723723864557442, + "loss": 3.2406, + "step": 25860 + }, + { + "epoch": 1.27, + "grad_norm": 0.5624514818191528, + "learning_rate": 0.0003723574448939398, + "loss": 3.0958, + "step": 25861 + }, + { + "epoch": 1.27, + "grad_norm": 0.6144577264785767, + "learning_rate": 0.0003723425031415598, + "loss": 3.117, + "step": 25862 + }, + { + "epoch": 1.27, + "grad_norm": 0.5530250668525696, + "learning_rate": 0.0003723275611986435, + "loss": 3.0762, + "step": 25863 + }, + { + "epoch": 1.27, + "grad_norm": 0.5647308230400085, + "learning_rate": 0.0003723126190652304, + "loss": 3.2921, + "step": 25864 + }, + { + "epoch": 1.27, + "grad_norm": 0.5888532996177673, + "learning_rate": 0.0003722976767413598, + "loss": 3.1137, + "step": 25865 + }, + { + "epoch": 1.27, + "grad_norm": 0.5185219645500183, + "learning_rate": 0.0003722827342270709, + "loss": 3.0245, + "step": 25866 + }, + { + "epoch": 1.27, + "grad_norm": 0.5749824643135071, + "learning_rate": 0.00037226779152240336, + "loss": 3.186, + "step": 25867 + }, + { + "epoch": 1.27, + "grad_norm": 0.5528282523155212, + "learning_rate": 0.0003722528486273962, + "loss": 3.329, + "step": 25868 + }, + { + "epoch": 1.27, + "grad_norm": 0.5749999284744263, + "learning_rate": 0.00037223790554208905, + "loss": 3.1513, + "step": 25869 + }, + { + "epoch": 1.27, + "grad_norm": 0.5693772435188293, + "learning_rate": 0.00037222296226652115, + "loss": 3.1707, + "step": 25870 + }, + { + "epoch": 1.27, + "grad_norm": 0.5503202080726624, + "learning_rate": 0.0003722080188007318, + "loss": 3.2248, + "step": 25871 + }, + { + "epoch": 1.27, + "grad_norm": 0.639018714427948, + "learning_rate": 0.0003721930751447605, + "loss": 3.225, + "step": 25872 + }, + { + "epoch": 1.27, + "grad_norm": 0.5662673711776733, + "learning_rate": 0.00037217813129864646, + "loss": 3.1343, + "step": 25873 + }, + { + "epoch": 1.27, + "grad_norm": 0.6173918843269348, + "learning_rate": 0.0003721631872624292, + "loss": 3.1776, + "step": 25874 + }, + { + "epoch": 1.27, + "grad_norm": 0.5484563112258911, + "learning_rate": 0.000372148243036148, + "loss": 3.0044, + "step": 25875 + }, + { + "epoch": 1.27, + "grad_norm": 0.55157870054245, + "learning_rate": 0.00037213329861984215, + "loss": 3.0107, + "step": 25876 + }, + { + "epoch": 1.27, + "grad_norm": 0.5684902667999268, + "learning_rate": 0.000372118354013551, + "loss": 2.8688, + "step": 25877 + }, + { + "epoch": 1.27, + "grad_norm": 0.5629575252532959, + "learning_rate": 0.00037210340921731415, + "loss": 3.3597, + "step": 25878 + }, + { + "epoch": 1.27, + "grad_norm": 0.5630050897598267, + "learning_rate": 0.0003720884642311707, + "loss": 2.9911, + "step": 25879 + }, + { + "epoch": 1.27, + "grad_norm": 0.5761358141899109, + "learning_rate": 0.0003720735190551602, + "loss": 3.1811, + "step": 25880 + }, + { + "epoch": 1.27, + "grad_norm": 0.5436105132102966, + "learning_rate": 0.0003720585736893218, + "loss": 2.9532, + "step": 25881 + }, + { + "epoch": 1.27, + "grad_norm": 0.6045496463775635, + "learning_rate": 0.00037204362813369503, + "loss": 2.8204, + "step": 25882 + }, + { + "epoch": 1.27, + "grad_norm": 0.5377562642097473, + "learning_rate": 0.0003720286823883192, + "loss": 3.1401, + "step": 25883 + }, + { + "epoch": 1.27, + "grad_norm": 0.5733555555343628, + "learning_rate": 0.0003720137364532337, + "loss": 3.1384, + "step": 25884 + }, + { + "epoch": 1.27, + "grad_norm": 0.5425565242767334, + "learning_rate": 0.00037199879032847786, + "loss": 2.9473, + "step": 25885 + }, + { + "epoch": 1.27, + "grad_norm": 0.5795601606369019, + "learning_rate": 0.00037198384401409114, + "loss": 2.8055, + "step": 25886 + }, + { + "epoch": 1.27, + "grad_norm": 0.5669113993644714, + "learning_rate": 0.0003719688975101127, + "loss": 3.1459, + "step": 25887 + }, + { + "epoch": 1.27, + "grad_norm": 0.5572779178619385, + "learning_rate": 0.0003719539508165821, + "loss": 3.1748, + "step": 25888 + }, + { + "epoch": 1.27, + "grad_norm": 0.5794333219528198, + "learning_rate": 0.0003719390039335387, + "loss": 3.1461, + "step": 25889 + }, + { + "epoch": 1.27, + "grad_norm": 0.5872516632080078, + "learning_rate": 0.00037192405686102174, + "loss": 2.8606, + "step": 25890 + }, + { + "epoch": 1.27, + "grad_norm": 0.5817949175834656, + "learning_rate": 0.0003719091095990707, + "loss": 3.0884, + "step": 25891 + }, + { + "epoch": 1.27, + "grad_norm": 0.590277910232544, + "learning_rate": 0.00037189416214772477, + "loss": 3.0184, + "step": 25892 + }, + { + "epoch": 1.27, + "grad_norm": 0.5216969847679138, + "learning_rate": 0.00037187921450702367, + "loss": 3.1245, + "step": 25893 + }, + { + "epoch": 1.27, + "grad_norm": 0.545471727848053, + "learning_rate": 0.0003718642666770064, + "loss": 3.1712, + "step": 25894 + }, + { + "epoch": 1.27, + "grad_norm": 0.5600536465644836, + "learning_rate": 0.0003718493186577125, + "loss": 2.9244, + "step": 25895 + }, + { + "epoch": 1.27, + "grad_norm": 0.5645468831062317, + "learning_rate": 0.00037183437044918134, + "loss": 3.2514, + "step": 25896 + }, + { + "epoch": 1.27, + "grad_norm": 0.5973244309425354, + "learning_rate": 0.0003718194220514523, + "loss": 2.9894, + "step": 25897 + }, + { + "epoch": 1.27, + "grad_norm": 0.5746738314628601, + "learning_rate": 0.0003718044734645647, + "loss": 2.8643, + "step": 25898 + }, + { + "epoch": 1.27, + "grad_norm": 0.5265095829963684, + "learning_rate": 0.00037178952468855793, + "loss": 3.2349, + "step": 25899 + }, + { + "epoch": 1.27, + "grad_norm": 0.5884544849395752, + "learning_rate": 0.0003717745757234714, + "loss": 3.1012, + "step": 25900 + }, + { + "epoch": 1.27, + "grad_norm": 0.5789063572883606, + "learning_rate": 0.00037175962656934447, + "loss": 3.0908, + "step": 25901 + }, + { + "epoch": 1.27, + "grad_norm": 0.568000316619873, + "learning_rate": 0.0003717446772262163, + "loss": 2.9355, + "step": 25902 + }, + { + "epoch": 1.27, + "grad_norm": 0.5491638779640198, + "learning_rate": 0.0003717297276941267, + "loss": 3.2674, + "step": 25903 + }, + { + "epoch": 1.27, + "grad_norm": 0.5411843061447144, + "learning_rate": 0.00037171477797311473, + "loss": 3.0928, + "step": 25904 + }, + { + "epoch": 1.27, + "grad_norm": 0.6108878254890442, + "learning_rate": 0.0003716998280632198, + "loss": 3.2653, + "step": 25905 + }, + { + "epoch": 1.27, + "grad_norm": 0.5373321771621704, + "learning_rate": 0.00037168487796448134, + "loss": 3.0299, + "step": 25906 + }, + { + "epoch": 1.27, + "grad_norm": 0.5621278882026672, + "learning_rate": 0.0003716699276769387, + "loss": 3.019, + "step": 25907 + }, + { + "epoch": 1.27, + "grad_norm": 0.5679013133049011, + "learning_rate": 0.00037165497720063125, + "loss": 3.3508, + "step": 25908 + }, + { + "epoch": 1.27, + "grad_norm": 0.5926734209060669, + "learning_rate": 0.0003716400265355984, + "loss": 3.0921, + "step": 25909 + }, + { + "epoch": 1.27, + "grad_norm": 0.5435253381729126, + "learning_rate": 0.00037162507568187954, + "loss": 2.8687, + "step": 25910 + }, + { + "epoch": 1.27, + "grad_norm": 0.5271333456039429, + "learning_rate": 0.0003716101246395139, + "loss": 2.9884, + "step": 25911 + }, + { + "epoch": 1.27, + "grad_norm": 0.5629345774650574, + "learning_rate": 0.00037159517340854117, + "loss": 2.8821, + "step": 25912 + }, + { + "epoch": 1.27, + "grad_norm": 0.5234237313270569, + "learning_rate": 0.0003715802219890003, + "loss": 3.2772, + "step": 25913 + }, + { + "epoch": 1.27, + "grad_norm": 0.5610054135322571, + "learning_rate": 0.00037156527038093105, + "loss": 3.1372, + "step": 25914 + }, + { + "epoch": 1.27, + "grad_norm": 0.5486628413200378, + "learning_rate": 0.0003715503185843727, + "loss": 3.1335, + "step": 25915 + }, + { + "epoch": 1.27, + "grad_norm": 0.5305410027503967, + "learning_rate": 0.00037153536659936446, + "loss": 3.053, + "step": 25916 + }, + { + "epoch": 1.27, + "grad_norm": 0.6576041579246521, + "learning_rate": 0.00037152041442594584, + "loss": 3.1786, + "step": 25917 + }, + { + "epoch": 1.27, + "grad_norm": 0.5794118046760559, + "learning_rate": 0.00037150546206415625, + "loss": 3.1182, + "step": 25918 + }, + { + "epoch": 1.27, + "grad_norm": 0.5535377264022827, + "learning_rate": 0.00037149050951403506, + "loss": 2.9718, + "step": 25919 + }, + { + "epoch": 1.27, + "grad_norm": 0.5769936442375183, + "learning_rate": 0.0003714755567756216, + "loss": 2.987, + "step": 25920 + }, + { + "epoch": 1.27, + "grad_norm": 0.59618079662323, + "learning_rate": 0.00037146060384895527, + "loss": 3.2961, + "step": 25921 + }, + { + "epoch": 1.27, + "grad_norm": 0.5642500519752502, + "learning_rate": 0.00037144565073407544, + "loss": 2.9942, + "step": 25922 + }, + { + "epoch": 1.27, + "grad_norm": 0.5690189003944397, + "learning_rate": 0.0003714306974310215, + "loss": 3.0254, + "step": 25923 + }, + { + "epoch": 1.27, + "grad_norm": 0.5622614026069641, + "learning_rate": 0.0003714157439398329, + "loss": 3.1844, + "step": 25924 + }, + { + "epoch": 1.27, + "grad_norm": 0.5633580684661865, + "learning_rate": 0.000371400790260549, + "loss": 3.1334, + "step": 25925 + }, + { + "epoch": 1.27, + "grad_norm": 0.5582607984542847, + "learning_rate": 0.00037138583639320915, + "loss": 3.026, + "step": 25926 + }, + { + "epoch": 1.27, + "grad_norm": 0.5949597954750061, + "learning_rate": 0.00037137088233785273, + "loss": 3.076, + "step": 25927 + }, + { + "epoch": 1.27, + "grad_norm": 0.5441601872444153, + "learning_rate": 0.000371355928094519, + "loss": 3.0003, + "step": 25928 + }, + { + "epoch": 1.27, + "grad_norm": 0.606289803981781, + "learning_rate": 0.00037134097366324774, + "loss": 3.1652, + "step": 25929 + }, + { + "epoch": 1.27, + "grad_norm": 0.551216185092926, + "learning_rate": 0.000371326019044078, + "loss": 3.121, + "step": 25930 + }, + { + "epoch": 1.27, + "grad_norm": 0.5833144783973694, + "learning_rate": 0.0003713110642370492, + "loss": 3.4873, + "step": 25931 + }, + { + "epoch": 1.27, + "grad_norm": 0.5816163420677185, + "learning_rate": 0.00037129610924220075, + "loss": 3.1136, + "step": 25932 + }, + { + "epoch": 1.27, + "grad_norm": 0.5476531386375427, + "learning_rate": 0.00037128115405957217, + "loss": 3.0812, + "step": 25933 + }, + { + "epoch": 1.27, + "grad_norm": 0.5457040071487427, + "learning_rate": 0.0003712661986892027, + "loss": 3.187, + "step": 25934 + }, + { + "epoch": 1.27, + "grad_norm": 0.5884016156196594, + "learning_rate": 0.0003712512431311317, + "loss": 3.158, + "step": 25935 + }, + { + "epoch": 1.27, + "grad_norm": 0.5727925300598145, + "learning_rate": 0.0003712362873853988, + "loss": 3.2241, + "step": 25936 + }, + { + "epoch": 1.27, + "grad_norm": 0.5885441303253174, + "learning_rate": 0.00037122133145204314, + "loss": 3.1813, + "step": 25937 + }, + { + "epoch": 1.27, + "grad_norm": 0.6078060865402222, + "learning_rate": 0.00037120637533110415, + "loss": 3.0894, + "step": 25938 + }, + { + "epoch": 1.27, + "grad_norm": 0.51329106092453, + "learning_rate": 0.00037119141902262133, + "loss": 3.0433, + "step": 25939 + }, + { + "epoch": 1.27, + "grad_norm": 0.6121600866317749, + "learning_rate": 0.0003711764625266341, + "loss": 2.934, + "step": 25940 + }, + { + "epoch": 1.27, + "grad_norm": 0.5866429805755615, + "learning_rate": 0.0003711615058431816, + "loss": 2.975, + "step": 25941 + }, + { + "epoch": 1.27, + "grad_norm": 0.5531361103057861, + "learning_rate": 0.00037114654897230353, + "loss": 3.0505, + "step": 25942 + }, + { + "epoch": 1.27, + "grad_norm": 0.7733150124549866, + "learning_rate": 0.00037113159191403896, + "loss": 3.2246, + "step": 25943 + }, + { + "epoch": 1.27, + "grad_norm": 0.5927426218986511, + "learning_rate": 0.0003711166346684276, + "loss": 2.9347, + "step": 25944 + }, + { + "epoch": 1.27, + "grad_norm": 0.5862705707550049, + "learning_rate": 0.00037110167723550876, + "loss": 2.817, + "step": 25945 + }, + { + "epoch": 1.27, + "grad_norm": 0.6004965901374817, + "learning_rate": 0.0003710867196153217, + "loss": 3.1906, + "step": 25946 + }, + { + "epoch": 1.27, + "grad_norm": 0.563130259513855, + "learning_rate": 0.0003710717618079059, + "loss": 3.124, + "step": 25947 + }, + { + "epoch": 1.27, + "grad_norm": 0.5575785636901855, + "learning_rate": 0.0003710568038133008, + "loss": 3.0767, + "step": 25948 + }, + { + "epoch": 1.27, + "grad_norm": 0.5379173159599304, + "learning_rate": 0.0003710418456315457, + "loss": 3.207, + "step": 25949 + }, + { + "epoch": 1.27, + "grad_norm": 0.6190086603164673, + "learning_rate": 0.0003710268872626801, + "loss": 2.9019, + "step": 25950 + }, + { + "epoch": 1.27, + "grad_norm": 0.5316742062568665, + "learning_rate": 0.0003710119287067433, + "loss": 2.996, + "step": 25951 + }, + { + "epoch": 1.27, + "grad_norm": 0.5887004137039185, + "learning_rate": 0.0003709969699637749, + "loss": 2.9726, + "step": 25952 + }, + { + "epoch": 1.27, + "grad_norm": 0.560197651386261, + "learning_rate": 0.00037098201103381395, + "loss": 3.1608, + "step": 25953 + }, + { + "epoch": 1.27, + "grad_norm": 0.5498967170715332, + "learning_rate": 0.0003709670519169, + "loss": 3.1144, + "step": 25954 + }, + { + "epoch": 1.27, + "grad_norm": 0.571072518825531, + "learning_rate": 0.0003709520926130727, + "loss": 3.1361, + "step": 25955 + }, + { + "epoch": 1.27, + "grad_norm": 0.5782363414764404, + "learning_rate": 0.0003709371331223712, + "loss": 2.9036, + "step": 25956 + }, + { + "epoch": 1.27, + "grad_norm": 0.5756257772445679, + "learning_rate": 0.00037092217344483487, + "loss": 2.8669, + "step": 25957 + }, + { + "epoch": 1.27, + "grad_norm": 0.5726673603057861, + "learning_rate": 0.00037090721358050324, + "loss": 2.84, + "step": 25958 + }, + { + "epoch": 1.27, + "grad_norm": 0.5259421467781067, + "learning_rate": 0.00037089225352941556, + "loss": 3.0836, + "step": 25959 + }, + { + "epoch": 1.27, + "grad_norm": 0.5834009051322937, + "learning_rate": 0.0003708772932916114, + "loss": 2.9831, + "step": 25960 + }, + { + "epoch": 1.27, + "grad_norm": 0.5248730778694153, + "learning_rate": 0.0003708623328671301, + "loss": 3.1426, + "step": 25961 + }, + { + "epoch": 1.27, + "grad_norm": 0.5834124088287354, + "learning_rate": 0.000370847372256011, + "loss": 3.0896, + "step": 25962 + }, + { + "epoch": 1.27, + "grad_norm": 0.5581058859825134, + "learning_rate": 0.0003708324114582937, + "loss": 2.8448, + "step": 25963 + }, + { + "epoch": 1.27, + "grad_norm": 0.6077955961227417, + "learning_rate": 0.0003708174504740173, + "loss": 2.7767, + "step": 25964 + }, + { + "epoch": 1.27, + "grad_norm": 0.5812661647796631, + "learning_rate": 0.00037080248930322136, + "loss": 3.1861, + "step": 25965 + }, + { + "epoch": 1.27, + "grad_norm": 0.5405663847923279, + "learning_rate": 0.00037078752794594545, + "loss": 3.198, + "step": 25966 + }, + { + "epoch": 1.27, + "grad_norm": 0.5473437905311584, + "learning_rate": 0.0003707725664022287, + "loss": 2.858, + "step": 25967 + }, + { + "epoch": 1.27, + "grad_norm": 0.539084255695343, + "learning_rate": 0.0003707576046721106, + "loss": 3.262, + "step": 25968 + }, + { + "epoch": 1.27, + "grad_norm": 0.5603315830230713, + "learning_rate": 0.00037074264275563064, + "loss": 3.0577, + "step": 25969 + }, + { + "epoch": 1.27, + "grad_norm": 0.5609519481658936, + "learning_rate": 0.0003707276806528282, + "loss": 3.1609, + "step": 25970 + }, + { + "epoch": 1.27, + "grad_norm": 0.5762900710105896, + "learning_rate": 0.0003707127183637426, + "loss": 3.103, + "step": 25971 + }, + { + "epoch": 1.27, + "grad_norm": 0.5924959778785706, + "learning_rate": 0.0003706977558884133, + "loss": 2.9045, + "step": 25972 + }, + { + "epoch": 1.27, + "grad_norm": 0.5308851003646851, + "learning_rate": 0.0003706827932268798, + "loss": 2.995, + "step": 25973 + }, + { + "epoch": 1.27, + "grad_norm": 0.605690062046051, + "learning_rate": 0.0003706678303791813, + "loss": 3.0956, + "step": 25974 + }, + { + "epoch": 1.27, + "grad_norm": 0.5422868728637695, + "learning_rate": 0.0003706528673453574, + "loss": 2.8659, + "step": 25975 + }, + { + "epoch": 1.27, + "grad_norm": 0.5357705950737, + "learning_rate": 0.00037063790412544747, + "loss": 2.9898, + "step": 25976 + }, + { + "epoch": 1.27, + "grad_norm": 0.5521494150161743, + "learning_rate": 0.00037062294071949094, + "loss": 2.9851, + "step": 25977 + }, + { + "epoch": 1.27, + "grad_norm": 0.5768835544586182, + "learning_rate": 0.0003706079771275271, + "loss": 3.1727, + "step": 25978 + }, + { + "epoch": 1.27, + "grad_norm": 0.5737690329551697, + "learning_rate": 0.00037059301334959536, + "loss": 2.8939, + "step": 25979 + }, + { + "epoch": 1.27, + "grad_norm": 0.5862882733345032, + "learning_rate": 0.0003705780493857354, + "loss": 2.9907, + "step": 25980 + }, + { + "epoch": 1.27, + "grad_norm": 0.5231944918632507, + "learning_rate": 0.00037056308523598637, + "loss": 2.9625, + "step": 25981 + }, + { + "epoch": 1.27, + "grad_norm": 0.5753133893013, + "learning_rate": 0.0003705481209003877, + "loss": 2.9762, + "step": 25982 + }, + { + "epoch": 1.27, + "grad_norm": 0.5374119281768799, + "learning_rate": 0.00037053315637897887, + "loss": 3.159, + "step": 25983 + }, + { + "epoch": 1.27, + "grad_norm": 0.5538880228996277, + "learning_rate": 0.0003705181916717993, + "loss": 2.9437, + "step": 25984 + }, + { + "epoch": 1.27, + "grad_norm": 0.5667445063591003, + "learning_rate": 0.00037050322677888837, + "loss": 3.2595, + "step": 25985 + }, + { + "epoch": 1.27, + "grad_norm": 0.5335412621498108, + "learning_rate": 0.0003704882617002855, + "loss": 3.1344, + "step": 25986 + }, + { + "epoch": 1.27, + "grad_norm": 0.5830770134925842, + "learning_rate": 0.00037047329643603014, + "loss": 3.1744, + "step": 25987 + }, + { + "epoch": 1.27, + "grad_norm": 0.549442708492279, + "learning_rate": 0.00037045833098616176, + "loss": 2.9406, + "step": 25988 + }, + { + "epoch": 1.27, + "grad_norm": 0.5874332785606384, + "learning_rate": 0.00037044336535071954, + "loss": 3.1014, + "step": 25989 + }, + { + "epoch": 1.27, + "grad_norm": 0.5202533006668091, + "learning_rate": 0.00037042839952974307, + "loss": 3.0192, + "step": 25990 + }, + { + "epoch": 1.27, + "grad_norm": 0.5560673475265503, + "learning_rate": 0.00037041343352327185, + "loss": 3.1054, + "step": 25991 + }, + { + "epoch": 1.27, + "grad_norm": 0.5724155306816101, + "learning_rate": 0.00037039846733134514, + "loss": 3.2477, + "step": 25992 + }, + { + "epoch": 1.27, + "grad_norm": 0.528751790523529, + "learning_rate": 0.00037038350095400246, + "loss": 3.2029, + "step": 25993 + }, + { + "epoch": 1.27, + "grad_norm": 0.55881667137146, + "learning_rate": 0.0003703685343912831, + "loss": 3.1877, + "step": 25994 + }, + { + "epoch": 1.27, + "grad_norm": 0.5411615967750549, + "learning_rate": 0.0003703535676432266, + "loss": 3.1792, + "step": 25995 + }, + { + "epoch": 1.27, + "grad_norm": 0.5560112595558167, + "learning_rate": 0.00037033860070987237, + "loss": 3.1054, + "step": 25996 + }, + { + "epoch": 1.27, + "grad_norm": 0.5627215504646301, + "learning_rate": 0.00037032363359125974, + "loss": 3.0732, + "step": 25997 + }, + { + "epoch": 1.27, + "grad_norm": 0.600377082824707, + "learning_rate": 0.0003703086662874283, + "loss": 3.0793, + "step": 25998 + }, + { + "epoch": 1.27, + "grad_norm": 0.5853224992752075, + "learning_rate": 0.00037029369879841735, + "loss": 2.9752, + "step": 25999 + }, + { + "epoch": 1.27, + "grad_norm": 0.5395614504814148, + "learning_rate": 0.0003702787311242662, + "loss": 3.0602, + "step": 26000 + }, + { + "epoch": 1.27, + "grad_norm": 0.555023729801178, + "learning_rate": 0.0003702637632650145, + "loss": 3.2359, + "step": 26001 + }, + { + "epoch": 1.27, + "grad_norm": 0.5492096543312073, + "learning_rate": 0.00037024879522070157, + "loss": 2.8831, + "step": 26002 + }, + { + "epoch": 1.27, + "grad_norm": 0.5381262302398682, + "learning_rate": 0.00037023382699136686, + "loss": 3.0899, + "step": 26003 + }, + { + "epoch": 1.27, + "grad_norm": 0.5479117631912231, + "learning_rate": 0.0003702188585770497, + "loss": 3.2155, + "step": 26004 + }, + { + "epoch": 1.27, + "grad_norm": 0.5779570937156677, + "learning_rate": 0.00037020388997778954, + "loss": 3.0202, + "step": 26005 + }, + { + "epoch": 1.27, + "grad_norm": 0.5237216949462891, + "learning_rate": 0.0003701889211936259, + "loss": 3.0036, + "step": 26006 + }, + { + "epoch": 1.27, + "grad_norm": 0.5618526339530945, + "learning_rate": 0.00037017395222459817, + "loss": 3.1664, + "step": 26007 + }, + { + "epoch": 1.27, + "grad_norm": 0.5620414614677429, + "learning_rate": 0.0003701589830707457, + "loss": 3.166, + "step": 26008 + }, + { + "epoch": 1.27, + "grad_norm": 0.5533866286277771, + "learning_rate": 0.00037014401373210805, + "loss": 3.2417, + "step": 26009 + }, + { + "epoch": 1.27, + "grad_norm": 0.6008995771408081, + "learning_rate": 0.00037012904420872454, + "loss": 2.9938, + "step": 26010 + }, + { + "epoch": 1.27, + "grad_norm": 0.5710824728012085, + "learning_rate": 0.0003701140745006345, + "loss": 3.0668, + "step": 26011 + }, + { + "epoch": 1.27, + "grad_norm": 0.5625746846199036, + "learning_rate": 0.0003700991046078777, + "loss": 3.1091, + "step": 26012 + }, + { + "epoch": 1.27, + "grad_norm": 0.5556397438049316, + "learning_rate": 0.00037008413453049315, + "loss": 3.1491, + "step": 26013 + }, + { + "epoch": 1.27, + "grad_norm": 0.5745502710342407, + "learning_rate": 0.0003700691642685207, + "loss": 3.0732, + "step": 26014 + }, + { + "epoch": 1.27, + "grad_norm": 0.5311285257339478, + "learning_rate": 0.00037005419382199933, + "loss": 2.9755, + "step": 26015 + }, + { + "epoch": 1.27, + "grad_norm": 0.573462963104248, + "learning_rate": 0.00037003922319096876, + "loss": 3.2097, + "step": 26016 + }, + { + "epoch": 1.28, + "grad_norm": 0.6004371047019958, + "learning_rate": 0.00037002425237546844, + "loss": 2.9479, + "step": 26017 + }, + { + "epoch": 1.28, + "grad_norm": 0.5266828536987305, + "learning_rate": 0.00037000928137553764, + "loss": 3.1431, + "step": 26018 + }, + { + "epoch": 1.28, + "grad_norm": 0.5740906000137329, + "learning_rate": 0.00036999431019121593, + "loss": 2.9836, + "step": 26019 + }, + { + "epoch": 1.28, + "grad_norm": 0.5576544404029846, + "learning_rate": 0.0003699793388225426, + "loss": 2.9814, + "step": 26020 + }, + { + "epoch": 1.28, + "grad_norm": 0.568863570690155, + "learning_rate": 0.0003699643672695572, + "loss": 3.1698, + "step": 26021 + }, + { + "epoch": 1.28, + "grad_norm": 0.5414212942123413, + "learning_rate": 0.0003699493955322991, + "loss": 2.9529, + "step": 26022 + }, + { + "epoch": 1.28, + "grad_norm": 0.5581563711166382, + "learning_rate": 0.00036993442361080777, + "loss": 3.2253, + "step": 26023 + }, + { + "epoch": 1.28, + "grad_norm": 0.578456461429596, + "learning_rate": 0.0003699194515051226, + "loss": 3.0586, + "step": 26024 + }, + { + "epoch": 1.28, + "grad_norm": 0.5558168888092041, + "learning_rate": 0.00036990447921528307, + "loss": 2.9423, + "step": 26025 + }, + { + "epoch": 1.28, + "grad_norm": 0.5423651337623596, + "learning_rate": 0.0003698895067413286, + "loss": 3.0314, + "step": 26026 + }, + { + "epoch": 1.28, + "grad_norm": 0.6390382647514343, + "learning_rate": 0.0003698745340832986, + "loss": 3.0495, + "step": 26027 + }, + { + "epoch": 1.28, + "grad_norm": 0.5630806684494019, + "learning_rate": 0.0003698595612412327, + "loss": 3.0398, + "step": 26028 + }, + { + "epoch": 1.28, + "grad_norm": 0.5756039023399353, + "learning_rate": 0.0003698445882151699, + "loss": 3.2904, + "step": 26029 + }, + { + "epoch": 1.28, + "grad_norm": 0.5660070180892944, + "learning_rate": 0.00036982961500515006, + "loss": 3.0634, + "step": 26030 + }, + { + "epoch": 1.28, + "grad_norm": 0.6267983913421631, + "learning_rate": 0.0003698146416112124, + "loss": 2.9549, + "step": 26031 + }, + { + "epoch": 1.28, + "grad_norm": 0.5327684283256531, + "learning_rate": 0.0003697996680333964, + "loss": 2.9134, + "step": 26032 + }, + { + "epoch": 1.28, + "grad_norm": 0.543506383895874, + "learning_rate": 0.00036978469427174145, + "loss": 3.131, + "step": 26033 + }, + { + "epoch": 1.28, + "grad_norm": 0.5573419332504272, + "learning_rate": 0.00036976972032628716, + "loss": 3.2857, + "step": 26034 + }, + { + "epoch": 1.28, + "grad_norm": 0.5342174768447876, + "learning_rate": 0.0003697547461970728, + "loss": 3.2678, + "step": 26035 + }, + { + "epoch": 1.28, + "grad_norm": 0.5931220650672913, + "learning_rate": 0.0003697397718841379, + "loss": 3.0568, + "step": 26036 + }, + { + "epoch": 1.28, + "grad_norm": 0.5429893136024475, + "learning_rate": 0.0003697247973875218, + "loss": 3.1089, + "step": 26037 + }, + { + "epoch": 1.28, + "grad_norm": 0.5769212245941162, + "learning_rate": 0.000369709822707264, + "loss": 3.0458, + "step": 26038 + }, + { + "epoch": 1.28, + "grad_norm": 0.549676239490509, + "learning_rate": 0.00036969484784340405, + "loss": 3.0389, + "step": 26039 + }, + { + "epoch": 1.28, + "grad_norm": 0.6053200364112854, + "learning_rate": 0.00036967987279598125, + "loss": 3.218, + "step": 26040 + }, + { + "epoch": 1.28, + "grad_norm": 0.6391343474388123, + "learning_rate": 0.00036966489756503495, + "loss": 3.1639, + "step": 26041 + }, + { + "epoch": 1.28, + "grad_norm": 0.632569432258606, + "learning_rate": 0.00036964992215060484, + "loss": 3.2523, + "step": 26042 + }, + { + "epoch": 1.28, + "grad_norm": 0.5485760569572449, + "learning_rate": 0.00036963494655273016, + "loss": 3.1035, + "step": 26043 + }, + { + "epoch": 1.28, + "grad_norm": 0.6080588698387146, + "learning_rate": 0.0003696199707714505, + "loss": 3.2853, + "step": 26044 + }, + { + "epoch": 1.28, + "grad_norm": 0.5590080618858337, + "learning_rate": 0.0003696049948068052, + "loss": 2.9434, + "step": 26045 + }, + { + "epoch": 1.28, + "grad_norm": 0.5305129885673523, + "learning_rate": 0.0003695900186588338, + "loss": 3.0217, + "step": 26046 + }, + { + "epoch": 1.28, + "grad_norm": 0.5904659032821655, + "learning_rate": 0.00036957504232757555, + "loss": 3.0828, + "step": 26047 + }, + { + "epoch": 1.28, + "grad_norm": 0.5809376239776611, + "learning_rate": 0.00036956006581307017, + "loss": 3.1483, + "step": 26048 + }, + { + "epoch": 1.28, + "grad_norm": 0.5855849385261536, + "learning_rate": 0.0003695450891153569, + "loss": 3.1708, + "step": 26049 + }, + { + "epoch": 1.28, + "grad_norm": 0.55824214220047, + "learning_rate": 0.00036953011223447526, + "loss": 3.0215, + "step": 26050 + }, + { + "epoch": 1.28, + "grad_norm": 0.5497469305992126, + "learning_rate": 0.0003695151351704646, + "loss": 3.1189, + "step": 26051 + }, + { + "epoch": 1.28, + "grad_norm": 0.558712899684906, + "learning_rate": 0.0003695001579233646, + "loss": 3.0199, + "step": 26052 + }, + { + "epoch": 1.28, + "grad_norm": 0.5559713840484619, + "learning_rate": 0.0003694851804932145, + "loss": 3.1461, + "step": 26053 + }, + { + "epoch": 1.28, + "grad_norm": 0.5822705030441284, + "learning_rate": 0.0003694702028800538, + "loss": 3.0182, + "step": 26054 + }, + { + "epoch": 1.28, + "grad_norm": 0.579784631729126, + "learning_rate": 0.00036945522508392197, + "loss": 3.0663, + "step": 26055 + }, + { + "epoch": 1.28, + "grad_norm": 0.5558045506477356, + "learning_rate": 0.00036944024710485834, + "loss": 3.012, + "step": 26056 + }, + { + "epoch": 1.28, + "grad_norm": 0.5622739195823669, + "learning_rate": 0.0003694252689429026, + "loss": 3.1491, + "step": 26057 + }, + { + "epoch": 1.28, + "grad_norm": 0.5217506289482117, + "learning_rate": 0.00036941029059809407, + "loss": 3.0809, + "step": 26058 + }, + { + "epoch": 1.28, + "grad_norm": 0.5513173341751099, + "learning_rate": 0.00036939531207047207, + "loss": 2.9512, + "step": 26059 + }, + { + "epoch": 1.28, + "grad_norm": 0.5600497722625732, + "learning_rate": 0.00036938033336007624, + "loss": 3.2733, + "step": 26060 + }, + { + "epoch": 1.28, + "grad_norm": 0.5448271036148071, + "learning_rate": 0.00036936535446694595, + "loss": 2.8906, + "step": 26061 + }, + { + "epoch": 1.28, + "grad_norm": 0.564911961555481, + "learning_rate": 0.0003693503753911207, + "loss": 2.9206, + "step": 26062 + }, + { + "epoch": 1.28, + "grad_norm": 0.5715450644493103, + "learning_rate": 0.0003693353961326399, + "loss": 2.9661, + "step": 26063 + }, + { + "epoch": 1.28, + "grad_norm": 0.5593858361244202, + "learning_rate": 0.000369320416691543, + "loss": 3.2893, + "step": 26064 + }, + { + "epoch": 1.28, + "grad_norm": 0.5523359179496765, + "learning_rate": 0.00036930543706786953, + "loss": 3.0601, + "step": 26065 + }, + { + "epoch": 1.28, + "grad_norm": 0.5331974029541016, + "learning_rate": 0.0003692904572616588, + "loss": 3.0984, + "step": 26066 + }, + { + "epoch": 1.28, + "grad_norm": 0.5936278700828552, + "learning_rate": 0.00036927547727295024, + "loss": 2.9378, + "step": 26067 + }, + { + "epoch": 1.28, + "grad_norm": 0.5834710001945496, + "learning_rate": 0.0003692604971017836, + "loss": 3.0566, + "step": 26068 + }, + { + "epoch": 1.28, + "grad_norm": 0.6169381737709045, + "learning_rate": 0.0003692455167481981, + "loss": 3.1942, + "step": 26069 + }, + { + "epoch": 1.28, + "grad_norm": 0.5709614753723145, + "learning_rate": 0.00036923053621223316, + "loss": 3.2825, + "step": 26070 + }, + { + "epoch": 1.28, + "grad_norm": 0.5453691482543945, + "learning_rate": 0.0003692155554939283, + "loss": 3.0132, + "step": 26071 + }, + { + "epoch": 1.28, + "grad_norm": 0.5543729662895203, + "learning_rate": 0.00036920057459332304, + "loss": 3.0603, + "step": 26072 + }, + { + "epoch": 1.28, + "grad_norm": 0.572906494140625, + "learning_rate": 0.0003691855935104568, + "loss": 3.2762, + "step": 26073 + }, + { + "epoch": 1.28, + "grad_norm": 0.5718940496444702, + "learning_rate": 0.0003691706122453689, + "loss": 3.1996, + "step": 26074 + }, + { + "epoch": 1.28, + "grad_norm": 0.534610390663147, + "learning_rate": 0.00036915563079809906, + "loss": 3.0727, + "step": 26075 + }, + { + "epoch": 1.28, + "grad_norm": 0.6224848628044128, + "learning_rate": 0.00036914064916868664, + "loss": 2.8755, + "step": 26076 + }, + { + "epoch": 1.28, + "grad_norm": 0.5702244639396667, + "learning_rate": 0.0003691256673571709, + "loss": 3.2086, + "step": 26077 + }, + { + "epoch": 1.28, + "grad_norm": 0.567895770072937, + "learning_rate": 0.0003691106853635915, + "loss": 3.1703, + "step": 26078 + }, + { + "epoch": 1.28, + "grad_norm": 0.5513347387313843, + "learning_rate": 0.00036909570318798793, + "loss": 2.9534, + "step": 26079 + }, + { + "epoch": 1.28, + "grad_norm": 0.5436404943466187, + "learning_rate": 0.00036908072083039953, + "loss": 2.9507, + "step": 26080 + }, + { + "epoch": 1.28, + "grad_norm": 0.5403690338134766, + "learning_rate": 0.00036906573829086585, + "loss": 3.1926, + "step": 26081 + }, + { + "epoch": 1.28, + "grad_norm": 0.5600476264953613, + "learning_rate": 0.00036905075556942625, + "loss": 3.1376, + "step": 26082 + }, + { + "epoch": 1.28, + "grad_norm": 0.6057079434394836, + "learning_rate": 0.0003690357726661202, + "loss": 3.2067, + "step": 26083 + }, + { + "epoch": 1.28, + "grad_norm": 0.575212299823761, + "learning_rate": 0.0003690207895809873, + "loss": 3.1779, + "step": 26084 + }, + { + "epoch": 1.28, + "grad_norm": 0.5557694435119629, + "learning_rate": 0.00036900580631406683, + "loss": 2.9421, + "step": 26085 + }, + { + "epoch": 1.28, + "grad_norm": 0.5446481108665466, + "learning_rate": 0.00036899082286539846, + "loss": 3.1631, + "step": 26086 + }, + { + "epoch": 1.28, + "grad_norm": 0.5733548998832703, + "learning_rate": 0.0003689758392350215, + "loss": 3.1699, + "step": 26087 + }, + { + "epoch": 1.28, + "grad_norm": 0.5587981343269348, + "learning_rate": 0.00036896085542297545, + "loss": 3.245, + "step": 26088 + }, + { + "epoch": 1.28, + "grad_norm": 0.5554731488227844, + "learning_rate": 0.00036894587142929973, + "loss": 3.1064, + "step": 26089 + }, + { + "epoch": 1.28, + "grad_norm": 0.583678662776947, + "learning_rate": 0.0003689308872540339, + "loss": 3.456, + "step": 26090 + }, + { + "epoch": 1.28, + "grad_norm": 0.5885918140411377, + "learning_rate": 0.0003689159028972175, + "loss": 3.3753, + "step": 26091 + }, + { + "epoch": 1.28, + "grad_norm": 0.5499343872070312, + "learning_rate": 0.00036890091835888966, + "loss": 3.1146, + "step": 26092 + }, + { + "epoch": 1.28, + "grad_norm": 0.5878946781158447, + "learning_rate": 0.0003688859336390902, + "loss": 2.9873, + "step": 26093 + }, + { + "epoch": 1.28, + "grad_norm": 0.5701496601104736, + "learning_rate": 0.0003688709487378584, + "loss": 3.1183, + "step": 26094 + }, + { + "epoch": 1.28, + "grad_norm": 0.5646578669548035, + "learning_rate": 0.00036885596365523375, + "loss": 3.3045, + "step": 26095 + }, + { + "epoch": 1.28, + "grad_norm": 0.5535494089126587, + "learning_rate": 0.0003688409783912557, + "loss": 3.0198, + "step": 26096 + }, + { + "epoch": 1.28, + "grad_norm": 0.5509173274040222, + "learning_rate": 0.0003688259929459639, + "loss": 3.1636, + "step": 26097 + }, + { + "epoch": 1.28, + "grad_norm": 0.5773724317550659, + "learning_rate": 0.00036881100731939756, + "loss": 3.0848, + "step": 26098 + }, + { + "epoch": 1.28, + "grad_norm": 0.5771932601928711, + "learning_rate": 0.00036879602151159633, + "loss": 2.8706, + "step": 26099 + }, + { + "epoch": 1.28, + "grad_norm": 0.5937799215316772, + "learning_rate": 0.0003687810355225996, + "loss": 3.1804, + "step": 26100 + }, + { + "epoch": 1.28, + "grad_norm": 0.5623434782028198, + "learning_rate": 0.00036876604935244696, + "loss": 3.0081, + "step": 26101 + }, + { + "epoch": 1.28, + "grad_norm": 0.5697646737098694, + "learning_rate": 0.0003687510630011776, + "loss": 3.1045, + "step": 26102 + }, + { + "epoch": 1.28, + "grad_norm": 0.562432050704956, + "learning_rate": 0.00036873607646883125, + "loss": 3.0006, + "step": 26103 + }, + { + "epoch": 1.28, + "grad_norm": 0.5757910013198853, + "learning_rate": 0.0003687210897554474, + "loss": 2.8835, + "step": 26104 + }, + { + "epoch": 1.28, + "grad_norm": 0.5502630472183228, + "learning_rate": 0.0003687061028610653, + "loss": 3.1017, + "step": 26105 + }, + { + "epoch": 1.28, + "grad_norm": 0.5945281386375427, + "learning_rate": 0.00036869111578572463, + "loss": 2.9427, + "step": 26106 + }, + { + "epoch": 1.28, + "grad_norm": 0.562147855758667, + "learning_rate": 0.0003686761285294647, + "loss": 2.8959, + "step": 26107 + }, + { + "epoch": 1.28, + "grad_norm": 0.5568060874938965, + "learning_rate": 0.0003686611410923251, + "loss": 3.1515, + "step": 26108 + }, + { + "epoch": 1.28, + "grad_norm": 0.5851083993911743, + "learning_rate": 0.0003686461534743453, + "loss": 2.9902, + "step": 26109 + }, + { + "epoch": 1.28, + "grad_norm": 0.5441685318946838, + "learning_rate": 0.0003686311656755647, + "loss": 3.1402, + "step": 26110 + }, + { + "epoch": 1.28, + "grad_norm": 0.5639269351959229, + "learning_rate": 0.0003686161776960228, + "loss": 2.9541, + "step": 26111 + }, + { + "epoch": 1.28, + "grad_norm": 0.5413896441459656, + "learning_rate": 0.0003686011895357592, + "loss": 3.0014, + "step": 26112 + }, + { + "epoch": 1.28, + "grad_norm": 0.5627785921096802, + "learning_rate": 0.0003685862011948131, + "loss": 3.0785, + "step": 26113 + }, + { + "epoch": 1.28, + "grad_norm": 0.6175768375396729, + "learning_rate": 0.00036857121267322424, + "loss": 3.2395, + "step": 26114 + }, + { + "epoch": 1.28, + "grad_norm": 0.5553895235061646, + "learning_rate": 0.000368556223971032, + "loss": 3.1911, + "step": 26115 + }, + { + "epoch": 1.28, + "grad_norm": 0.5802327394485474, + "learning_rate": 0.00036854123508827595, + "loss": 3.1287, + "step": 26116 + }, + { + "epoch": 1.28, + "grad_norm": 0.561752200126648, + "learning_rate": 0.00036852624602499534, + "loss": 3.2061, + "step": 26117 + }, + { + "epoch": 1.28, + "grad_norm": 0.6642087697982788, + "learning_rate": 0.0003685112567812298, + "loss": 3.0804, + "step": 26118 + }, + { + "epoch": 1.28, + "grad_norm": 0.5541870594024658, + "learning_rate": 0.0003684962673570189, + "loss": 3.0745, + "step": 26119 + }, + { + "epoch": 1.28, + "grad_norm": 0.5653018355369568, + "learning_rate": 0.0003684812777524019, + "loss": 3.1752, + "step": 26120 + }, + { + "epoch": 1.28, + "grad_norm": 0.5748471021652222, + "learning_rate": 0.0003684662879674185, + "loss": 3.1672, + "step": 26121 + }, + { + "epoch": 1.28, + "grad_norm": 0.6177536845207214, + "learning_rate": 0.00036845129800210796, + "loss": 3.0762, + "step": 26122 + }, + { + "epoch": 1.28, + "grad_norm": 0.5813118815422058, + "learning_rate": 0.00036843630785650996, + "loss": 3.0871, + "step": 26123 + }, + { + "epoch": 1.28, + "grad_norm": 0.5994959473609924, + "learning_rate": 0.0003684213175306639, + "loss": 2.9619, + "step": 26124 + }, + { + "epoch": 1.28, + "grad_norm": 0.6316370368003845, + "learning_rate": 0.0003684063270246092, + "loss": 3.1457, + "step": 26125 + }, + { + "epoch": 1.28, + "grad_norm": 0.5947257876396179, + "learning_rate": 0.0003683913363383854, + "loss": 3.196, + "step": 26126 + }, + { + "epoch": 1.28, + "grad_norm": 0.5939607620239258, + "learning_rate": 0.00036837634547203207, + "loss": 3.2313, + "step": 26127 + }, + { + "epoch": 1.28, + "grad_norm": 0.5786470770835876, + "learning_rate": 0.0003683613544255884, + "loss": 3.1698, + "step": 26128 + }, + { + "epoch": 1.28, + "grad_norm": 0.5446995496749878, + "learning_rate": 0.00036834636319909425, + "loss": 3.4354, + "step": 26129 + }, + { + "epoch": 1.28, + "grad_norm": 0.5662088394165039, + "learning_rate": 0.00036833137179258897, + "loss": 3.104, + "step": 26130 + }, + { + "epoch": 1.28, + "grad_norm": 0.5539671182632446, + "learning_rate": 0.0003683163802061119, + "loss": 3.2375, + "step": 26131 + }, + { + "epoch": 1.28, + "grad_norm": 0.576744019985199, + "learning_rate": 0.00036830138843970274, + "loss": 3.222, + "step": 26132 + }, + { + "epoch": 1.28, + "grad_norm": 0.5656651258468628, + "learning_rate": 0.0003682863964934007, + "loss": 3.09, + "step": 26133 + }, + { + "epoch": 1.28, + "grad_norm": 0.5607897043228149, + "learning_rate": 0.0003682714043672456, + "loss": 3.0891, + "step": 26134 + }, + { + "epoch": 1.28, + "grad_norm": 0.5817605257034302, + "learning_rate": 0.00036825641206127667, + "loss": 3.1378, + "step": 26135 + }, + { + "epoch": 1.28, + "grad_norm": 0.5782478451728821, + "learning_rate": 0.0003682414195755335, + "loss": 3.0419, + "step": 26136 + }, + { + "epoch": 1.28, + "grad_norm": 0.575994610786438, + "learning_rate": 0.0003682264269100556, + "loss": 3.3339, + "step": 26137 + }, + { + "epoch": 1.28, + "grad_norm": 0.6501098871231079, + "learning_rate": 0.00036821143406488236, + "loss": 3.114, + "step": 26138 + }, + { + "epoch": 1.28, + "grad_norm": 0.5928285121917725, + "learning_rate": 0.0003681964410400534, + "loss": 3.1105, + "step": 26139 + }, + { + "epoch": 1.28, + "grad_norm": 0.5721243619918823, + "learning_rate": 0.00036818144783560804, + "loss": 2.8842, + "step": 26140 + }, + { + "epoch": 1.28, + "grad_norm": 0.5766890645027161, + "learning_rate": 0.0003681664544515861, + "loss": 2.9824, + "step": 26141 + }, + { + "epoch": 1.28, + "grad_norm": 0.5364489555358887, + "learning_rate": 0.00036815146088802663, + "loss": 3.034, + "step": 26142 + }, + { + "epoch": 1.28, + "grad_norm": 0.5811851620674133, + "learning_rate": 0.0003681364671449693, + "loss": 3.1483, + "step": 26143 + }, + { + "epoch": 1.28, + "grad_norm": 0.6257297992706299, + "learning_rate": 0.0003681214732224537, + "loss": 3.1882, + "step": 26144 + }, + { + "epoch": 1.28, + "grad_norm": 0.5299636721611023, + "learning_rate": 0.00036810647912051924, + "loss": 3.008, + "step": 26145 + }, + { + "epoch": 1.28, + "grad_norm": 0.5441427826881409, + "learning_rate": 0.00036809148483920543, + "loss": 3.0218, + "step": 26146 + }, + { + "epoch": 1.28, + "grad_norm": 0.573083758354187, + "learning_rate": 0.00036807649037855173, + "loss": 3.064, + "step": 26147 + }, + { + "epoch": 1.28, + "grad_norm": 0.5455029010772705, + "learning_rate": 0.0003680614957385977, + "loss": 3.1727, + "step": 26148 + }, + { + "epoch": 1.28, + "grad_norm": 0.5636841654777527, + "learning_rate": 0.00036804650091938276, + "loss": 3.0886, + "step": 26149 + }, + { + "epoch": 1.28, + "grad_norm": 0.5653840899467468, + "learning_rate": 0.0003680315059209464, + "loss": 3.1209, + "step": 26150 + }, + { + "epoch": 1.28, + "grad_norm": 0.5494506359100342, + "learning_rate": 0.0003680165107433282, + "loss": 2.749, + "step": 26151 + }, + { + "epoch": 1.28, + "grad_norm": 0.6142790913581848, + "learning_rate": 0.00036800151538656764, + "loss": 3.0748, + "step": 26152 + }, + { + "epoch": 1.28, + "grad_norm": 0.5461570620536804, + "learning_rate": 0.00036798651985070415, + "loss": 3.0659, + "step": 26153 + }, + { + "epoch": 1.28, + "grad_norm": 0.5468064546585083, + "learning_rate": 0.00036797152413577713, + "loss": 2.7883, + "step": 26154 + }, + { + "epoch": 1.28, + "grad_norm": 0.5630825757980347, + "learning_rate": 0.0003679565282418263, + "loss": 3.0646, + "step": 26155 + }, + { + "epoch": 1.28, + "grad_norm": 0.5849050879478455, + "learning_rate": 0.0003679415321688911, + "loss": 3.1669, + "step": 26156 + }, + { + "epoch": 1.28, + "grad_norm": 0.5444782972335815, + "learning_rate": 0.0003679265359170109, + "loss": 3.1053, + "step": 26157 + }, + { + "epoch": 1.28, + "grad_norm": 0.5830182433128357, + "learning_rate": 0.00036791153948622534, + "loss": 2.9533, + "step": 26158 + }, + { + "epoch": 1.28, + "grad_norm": 0.5592882633209229, + "learning_rate": 0.00036789654287657376, + "loss": 3.0239, + "step": 26159 + }, + { + "epoch": 1.28, + "grad_norm": 0.5635514855384827, + "learning_rate": 0.00036788154608809583, + "loss": 3.0102, + "step": 26160 + }, + { + "epoch": 1.28, + "grad_norm": 0.5419289469718933, + "learning_rate": 0.00036786654912083095, + "loss": 3.061, + "step": 26161 + }, + { + "epoch": 1.28, + "grad_norm": 0.5628889799118042, + "learning_rate": 0.0003678515519748186, + "loss": 2.9982, + "step": 26162 + }, + { + "epoch": 1.28, + "grad_norm": 0.5445288419723511, + "learning_rate": 0.00036783655465009844, + "loss": 3.154, + "step": 26163 + }, + { + "epoch": 1.28, + "grad_norm": 0.5867927074432373, + "learning_rate": 0.00036782155714670974, + "loss": 2.873, + "step": 26164 + }, + { + "epoch": 1.28, + "grad_norm": 0.5903654098510742, + "learning_rate": 0.0003678065594646921, + "loss": 3.2046, + "step": 26165 + }, + { + "epoch": 1.28, + "grad_norm": 0.5572836995124817, + "learning_rate": 0.00036779156160408514, + "loss": 3.1028, + "step": 26166 + }, + { + "epoch": 1.28, + "grad_norm": 0.5365197658538818, + "learning_rate": 0.0003677765635649282, + "loss": 2.9957, + "step": 26167 + }, + { + "epoch": 1.28, + "grad_norm": 0.567305862903595, + "learning_rate": 0.0003677615653472608, + "loss": 3.0076, + "step": 26168 + }, + { + "epoch": 1.28, + "grad_norm": 0.5919384956359863, + "learning_rate": 0.00036774656695112246, + "loss": 2.7972, + "step": 26169 + }, + { + "epoch": 1.28, + "grad_norm": 0.5833246111869812, + "learning_rate": 0.0003677315683765528, + "loss": 2.9785, + "step": 26170 + }, + { + "epoch": 1.28, + "grad_norm": 0.5576356053352356, + "learning_rate": 0.00036771656962359116, + "loss": 3.0201, + "step": 26171 + }, + { + "epoch": 1.28, + "grad_norm": 0.5311302542686462, + "learning_rate": 0.0003677015706922771, + "loss": 3.1483, + "step": 26172 + }, + { + "epoch": 1.28, + "grad_norm": 0.6967976689338684, + "learning_rate": 0.0003676865715826502, + "loss": 3.0607, + "step": 26173 + }, + { + "epoch": 1.28, + "grad_norm": 0.5725787281990051, + "learning_rate": 0.00036767157229474975, + "loss": 3.0772, + "step": 26174 + }, + { + "epoch": 1.28, + "grad_norm": 0.5482276082038879, + "learning_rate": 0.0003676565728286155, + "loss": 3.0737, + "step": 26175 + }, + { + "epoch": 1.28, + "grad_norm": 0.5845776796340942, + "learning_rate": 0.0003676415731842868, + "loss": 3.0155, + "step": 26176 + }, + { + "epoch": 1.28, + "grad_norm": 0.56215900182724, + "learning_rate": 0.0003676265733618033, + "loss": 3.1768, + "step": 26177 + }, + { + "epoch": 1.28, + "grad_norm": 0.638214111328125, + "learning_rate": 0.00036761157336120444, + "loss": 3.0996, + "step": 26178 + }, + { + "epoch": 1.28, + "grad_norm": 0.5645318627357483, + "learning_rate": 0.0003675965731825296, + "loss": 3.1095, + "step": 26179 + }, + { + "epoch": 1.28, + "grad_norm": 0.5821544528007507, + "learning_rate": 0.0003675815728258184, + "loss": 3.1556, + "step": 26180 + }, + { + "epoch": 1.28, + "grad_norm": 0.564754843711853, + "learning_rate": 0.00036756657229111055, + "loss": 3.1484, + "step": 26181 + }, + { + "epoch": 1.28, + "grad_norm": 0.559941291809082, + "learning_rate": 0.0003675515715784452, + "loss": 3.2373, + "step": 26182 + }, + { + "epoch": 1.28, + "grad_norm": 0.5572832226753235, + "learning_rate": 0.00036753657068786194, + "loss": 3.0946, + "step": 26183 + }, + { + "epoch": 1.28, + "grad_norm": 0.5751796364784241, + "learning_rate": 0.00036752156961940047, + "loss": 3.1057, + "step": 26184 + }, + { + "epoch": 1.28, + "grad_norm": 0.7486311793327332, + "learning_rate": 0.0003675065683731001, + "loss": 2.961, + "step": 26185 + }, + { + "epoch": 1.28, + "grad_norm": 0.593322217464447, + "learning_rate": 0.0003674915669490004, + "loss": 3.2956, + "step": 26186 + }, + { + "epoch": 1.28, + "grad_norm": 0.5414199233055115, + "learning_rate": 0.00036747656534714094, + "loss": 2.9403, + "step": 26187 + }, + { + "epoch": 1.28, + "grad_norm": 0.613577127456665, + "learning_rate": 0.00036746156356756124, + "loss": 3.0939, + "step": 26188 + }, + { + "epoch": 1.28, + "grad_norm": 0.6109552979469299, + "learning_rate": 0.00036744656161030075, + "loss": 3.0483, + "step": 26189 + }, + { + "epoch": 1.28, + "grad_norm": 0.6177679300308228, + "learning_rate": 0.00036743155947539894, + "loss": 3.0387, + "step": 26190 + }, + { + "epoch": 1.28, + "grad_norm": 0.557323157787323, + "learning_rate": 0.0003674165571628954, + "loss": 3.1379, + "step": 26191 + }, + { + "epoch": 1.28, + "grad_norm": 0.6148905754089355, + "learning_rate": 0.0003674015546728297, + "loss": 3.0182, + "step": 26192 + }, + { + "epoch": 1.28, + "grad_norm": 0.5857757329940796, + "learning_rate": 0.00036738655200524125, + "loss": 3.0732, + "step": 26193 + }, + { + "epoch": 1.28, + "grad_norm": 0.5747817754745483, + "learning_rate": 0.00036737154916016956, + "loss": 2.8798, + "step": 26194 + }, + { + "epoch": 1.28, + "grad_norm": 0.5815746188163757, + "learning_rate": 0.00036735654613765414, + "loss": 3.2067, + "step": 26195 + }, + { + "epoch": 1.28, + "grad_norm": 0.5818893313407898, + "learning_rate": 0.0003673415429377345, + "loss": 3.1208, + "step": 26196 + }, + { + "epoch": 1.28, + "grad_norm": 0.5754139423370361, + "learning_rate": 0.0003673265395604503, + "loss": 3.2264, + "step": 26197 + }, + { + "epoch": 1.28, + "grad_norm": 0.5612207055091858, + "learning_rate": 0.00036731153600584094, + "loss": 3.0593, + "step": 26198 + }, + { + "epoch": 1.28, + "grad_norm": 0.5592952966690063, + "learning_rate": 0.0003672965322739459, + "loss": 3.3381, + "step": 26199 + }, + { + "epoch": 1.28, + "grad_norm": 0.5539454817771912, + "learning_rate": 0.0003672815283648048, + "loss": 3.2051, + "step": 26200 + }, + { + "epoch": 1.28, + "grad_norm": 0.5505897998809814, + "learning_rate": 0.00036726652427845706, + "loss": 3.0047, + "step": 26201 + }, + { + "epoch": 1.28, + "grad_norm": 0.6153740286827087, + "learning_rate": 0.00036725152001494226, + "loss": 3.0584, + "step": 26202 + }, + { + "epoch": 1.28, + "grad_norm": 0.5606221556663513, + "learning_rate": 0.0003672365155742999, + "loss": 3.0653, + "step": 26203 + }, + { + "epoch": 1.28, + "grad_norm": 0.5304719805717468, + "learning_rate": 0.0003672215109565696, + "loss": 3.1395, + "step": 26204 + }, + { + "epoch": 1.28, + "grad_norm": 0.5453681349754333, + "learning_rate": 0.00036720650616179056, + "loss": 3.0008, + "step": 26205 + }, + { + "epoch": 1.28, + "grad_norm": 0.5311553478240967, + "learning_rate": 0.00036719150119000264, + "loss": 3.022, + "step": 26206 + }, + { + "epoch": 1.28, + "grad_norm": 0.5572003126144409, + "learning_rate": 0.0003671764960412453, + "loss": 3.121, + "step": 26207 + }, + { + "epoch": 1.28, + "grad_norm": 0.6088648438453674, + "learning_rate": 0.0003671614907155579, + "loss": 3.1107, + "step": 26208 + }, + { + "epoch": 1.28, + "grad_norm": 0.6267878413200378, + "learning_rate": 0.0003671464852129801, + "loss": 3.201, + "step": 26209 + }, + { + "epoch": 1.28, + "grad_norm": 0.5654227137565613, + "learning_rate": 0.0003671314795335513, + "loss": 3.1772, + "step": 26210 + }, + { + "epoch": 1.28, + "grad_norm": 0.5888288617134094, + "learning_rate": 0.0003671164736773112, + "loss": 2.9714, + "step": 26211 + }, + { + "epoch": 1.28, + "grad_norm": 0.542107343673706, + "learning_rate": 0.00036710146764429915, + "loss": 3.125, + "step": 26212 + }, + { + "epoch": 1.28, + "grad_norm": 0.5171507596969604, + "learning_rate": 0.00036708646143455477, + "loss": 3.0741, + "step": 26213 + }, + { + "epoch": 1.28, + "grad_norm": 0.5658581852912903, + "learning_rate": 0.00036707145504811763, + "loss": 3.1359, + "step": 26214 + }, + { + "epoch": 1.28, + "grad_norm": 0.583036482334137, + "learning_rate": 0.00036705644848502716, + "loss": 3.0781, + "step": 26215 + }, + { + "epoch": 1.28, + "grad_norm": 0.553354024887085, + "learning_rate": 0.00036704144174532286, + "loss": 3.1118, + "step": 26216 + }, + { + "epoch": 1.28, + "grad_norm": 0.5677482485771179, + "learning_rate": 0.0003670264348290443, + "loss": 3.0024, + "step": 26217 + }, + { + "epoch": 1.28, + "grad_norm": 0.6651977896690369, + "learning_rate": 0.0003670114277362311, + "loss": 2.9855, + "step": 26218 + }, + { + "epoch": 1.28, + "grad_norm": 0.6104764342308044, + "learning_rate": 0.00036699642046692264, + "loss": 2.775, + "step": 26219 + }, + { + "epoch": 1.28, + "grad_norm": 0.5855001211166382, + "learning_rate": 0.0003669814130211584, + "loss": 2.8599, + "step": 26220 + }, + { + "epoch": 1.29, + "grad_norm": 0.5708563923835754, + "learning_rate": 0.0003669664053989782, + "loss": 3.1819, + "step": 26221 + }, + { + "epoch": 1.29, + "grad_norm": 0.5730376243591309, + "learning_rate": 0.00036695139760042125, + "loss": 3.2359, + "step": 26222 + }, + { + "epoch": 1.29, + "grad_norm": 0.5522173643112183, + "learning_rate": 0.0003669363896255272, + "loss": 2.9391, + "step": 26223 + }, + { + "epoch": 1.29, + "grad_norm": 0.6628093123435974, + "learning_rate": 0.00036692138147433566, + "loss": 2.9406, + "step": 26224 + }, + { + "epoch": 1.29, + "grad_norm": 0.5647956132888794, + "learning_rate": 0.00036690637314688607, + "loss": 3.0561, + "step": 26225 + }, + { + "epoch": 1.29, + "grad_norm": 0.541031002998352, + "learning_rate": 0.0003668913646432179, + "loss": 3.0519, + "step": 26226 + }, + { + "epoch": 1.29, + "grad_norm": 0.5448402166366577, + "learning_rate": 0.00036687635596337084, + "loss": 3.3572, + "step": 26227 + }, + { + "epoch": 1.29, + "grad_norm": 0.5486478209495544, + "learning_rate": 0.00036686134710738424, + "loss": 2.9945, + "step": 26228 + }, + { + "epoch": 1.29, + "grad_norm": 0.5686172842979431, + "learning_rate": 0.0003668463380752978, + "loss": 2.9606, + "step": 26229 + }, + { + "epoch": 1.29, + "grad_norm": 0.54524165391922, + "learning_rate": 0.00036683132886715093, + "loss": 2.9665, + "step": 26230 + }, + { + "epoch": 1.29, + "grad_norm": 0.5635949969291687, + "learning_rate": 0.0003668163194829831, + "loss": 2.8544, + "step": 26231 + }, + { + "epoch": 1.29, + "grad_norm": 0.6484821438789368, + "learning_rate": 0.00036680130992283413, + "loss": 3.0592, + "step": 26232 + }, + { + "epoch": 1.29, + "grad_norm": 0.5802894830703735, + "learning_rate": 0.00036678630018674327, + "loss": 3.1788, + "step": 26233 + }, + { + "epoch": 1.29, + "grad_norm": 0.5594797730445862, + "learning_rate": 0.0003667712902747501, + "loss": 3.0831, + "step": 26234 + }, + { + "epoch": 1.29, + "grad_norm": 0.5902174115180969, + "learning_rate": 0.0003667562801868943, + "loss": 3.0271, + "step": 26235 + }, + { + "epoch": 1.29, + "grad_norm": 0.5711631178855896, + "learning_rate": 0.00036674126992321525, + "loss": 2.6302, + "step": 26236 + }, + { + "epoch": 1.29, + "grad_norm": 0.527863621711731, + "learning_rate": 0.0003667262594837525, + "loss": 3.146, + "step": 26237 + }, + { + "epoch": 1.29, + "grad_norm": 0.5474887490272522, + "learning_rate": 0.0003667112488685457, + "loss": 3.1675, + "step": 26238 + }, + { + "epoch": 1.29, + "grad_norm": 0.570087730884552, + "learning_rate": 0.00036669623807763434, + "loss": 3.0728, + "step": 26239 + }, + { + "epoch": 1.29, + "grad_norm": 0.582175612449646, + "learning_rate": 0.00036668122711105793, + "loss": 3.0308, + "step": 26240 + }, + { + "epoch": 1.29, + "grad_norm": 0.5625713467597961, + "learning_rate": 0.00036666621596885584, + "loss": 3.1616, + "step": 26241 + }, + { + "epoch": 1.29, + "grad_norm": 0.5727894306182861, + "learning_rate": 0.00036665120465106786, + "loss": 3.1067, + "step": 26242 + }, + { + "epoch": 1.29, + "grad_norm": 0.5584968328475952, + "learning_rate": 0.00036663619315773356, + "loss": 2.8448, + "step": 26243 + }, + { + "epoch": 1.29, + "grad_norm": 0.5649468898773193, + "learning_rate": 0.0003666211814888922, + "loss": 3.0906, + "step": 26244 + }, + { + "epoch": 1.29, + "grad_norm": 0.5201927423477173, + "learning_rate": 0.0003666061696445835, + "loss": 3.1189, + "step": 26245 + }, + { + "epoch": 1.29, + "grad_norm": 0.5626629590988159, + "learning_rate": 0.0003665911576248469, + "loss": 3.1336, + "step": 26246 + }, + { + "epoch": 1.29, + "grad_norm": 0.580909013748169, + "learning_rate": 0.00036657614542972216, + "loss": 2.9194, + "step": 26247 + }, + { + "epoch": 1.29, + "grad_norm": 0.5470664501190186, + "learning_rate": 0.00036656113305924857, + "loss": 2.9906, + "step": 26248 + }, + { + "epoch": 1.29, + "grad_norm": 0.5362159013748169, + "learning_rate": 0.0003665461205134657, + "loss": 3.0446, + "step": 26249 + }, + { + "epoch": 1.29, + "grad_norm": 0.5313224792480469, + "learning_rate": 0.0003665311077924133, + "loss": 3.1602, + "step": 26250 + }, + { + "epoch": 1.29, + "grad_norm": 0.5694699287414551, + "learning_rate": 0.0003665160948961307, + "loss": 2.9436, + "step": 26251 + }, + { + "epoch": 1.29, + "grad_norm": 0.5497816205024719, + "learning_rate": 0.0003665010818246575, + "loss": 3.0351, + "step": 26252 + }, + { + "epoch": 1.29, + "grad_norm": 0.5596659779548645, + "learning_rate": 0.00036648606857803324, + "loss": 3.0668, + "step": 26253 + }, + { + "epoch": 1.29, + "grad_norm": 0.6068937182426453, + "learning_rate": 0.0003664710551562976, + "loss": 3.0934, + "step": 26254 + }, + { + "epoch": 1.29, + "grad_norm": 0.5531221032142639, + "learning_rate": 0.0003664560415594899, + "loss": 2.95, + "step": 26255 + }, + { + "epoch": 1.29, + "grad_norm": 0.5915762782096863, + "learning_rate": 0.00036644102778764963, + "loss": 2.8952, + "step": 26256 + }, + { + "epoch": 1.29, + "grad_norm": 0.5725966095924377, + "learning_rate": 0.0003664260138408167, + "loss": 3.1125, + "step": 26257 + }, + { + "epoch": 1.29, + "grad_norm": 0.5639798045158386, + "learning_rate": 0.00036641099971903033, + "loss": 3.226, + "step": 26258 + }, + { + "epoch": 1.29, + "grad_norm": 0.5750287771224976, + "learning_rate": 0.0003663959854223302, + "loss": 3.0015, + "step": 26259 + }, + { + "epoch": 1.29, + "grad_norm": 0.5510179400444031, + "learning_rate": 0.0003663809709507558, + "loss": 2.9119, + "step": 26260 + }, + { + "epoch": 1.29, + "grad_norm": 0.6045522093772888, + "learning_rate": 0.00036636595630434664, + "loss": 3.2426, + "step": 26261 + }, + { + "epoch": 1.29, + "grad_norm": 0.5941025018692017, + "learning_rate": 0.0003663509414831424, + "loss": 3.1572, + "step": 26262 + }, + { + "epoch": 1.29, + "grad_norm": 0.5708783268928528, + "learning_rate": 0.0003663359264871825, + "loss": 3.0711, + "step": 26263 + }, + { + "epoch": 1.29, + "grad_norm": 0.5933129191398621, + "learning_rate": 0.00036632091131650664, + "loss": 2.9989, + "step": 26264 + }, + { + "epoch": 1.29, + "grad_norm": 0.5684006214141846, + "learning_rate": 0.00036630589597115424, + "loss": 3.0895, + "step": 26265 + }, + { + "epoch": 1.29, + "grad_norm": 0.555566132068634, + "learning_rate": 0.00036629088045116485, + "loss": 3.0108, + "step": 26266 + }, + { + "epoch": 1.29, + "grad_norm": 0.5590725541114807, + "learning_rate": 0.0003662758647565779, + "loss": 3.1664, + "step": 26267 + }, + { + "epoch": 1.29, + "grad_norm": 0.5853378176689148, + "learning_rate": 0.00036626084888743325, + "loss": 3.2277, + "step": 26268 + }, + { + "epoch": 1.29, + "grad_norm": 0.5699875354766846, + "learning_rate": 0.00036624583284377025, + "loss": 2.9595, + "step": 26269 + }, + { + "epoch": 1.29, + "grad_norm": 0.5524995923042297, + "learning_rate": 0.0003662308166256284, + "loss": 3.1834, + "step": 26270 + }, + { + "epoch": 1.29, + "grad_norm": 0.5816988945007324, + "learning_rate": 0.0003662158002330474, + "loss": 3.0219, + "step": 26271 + }, + { + "epoch": 1.29, + "grad_norm": 0.5661150813102722, + "learning_rate": 0.0003662007836660666, + "loss": 2.9393, + "step": 26272 + }, + { + "epoch": 1.29, + "grad_norm": 0.5578607320785522, + "learning_rate": 0.0003661857669247258, + "loss": 2.9993, + "step": 26273 + }, + { + "epoch": 1.29, + "grad_norm": 0.5460517406463623, + "learning_rate": 0.0003661707500090644, + "loss": 2.9927, + "step": 26274 + }, + { + "epoch": 1.29, + "grad_norm": 0.5846410393714905, + "learning_rate": 0.000366155732919122, + "loss": 3.0969, + "step": 26275 + }, + { + "epoch": 1.29, + "grad_norm": 0.5628123879432678, + "learning_rate": 0.0003661407156549382, + "loss": 3.1571, + "step": 26276 + }, + { + "epoch": 1.29, + "grad_norm": 0.5363184809684753, + "learning_rate": 0.0003661256982165523, + "loss": 2.9979, + "step": 26277 + }, + { + "epoch": 1.29, + "grad_norm": 0.5130451321601868, + "learning_rate": 0.0003661106806040041, + "loss": 3.1275, + "step": 26278 + }, + { + "epoch": 1.29, + "grad_norm": 0.5658857226371765, + "learning_rate": 0.0003660956628173331, + "loss": 3.0147, + "step": 26279 + }, + { + "epoch": 1.29, + "grad_norm": 0.56007319688797, + "learning_rate": 0.00036608064485657896, + "loss": 3.1505, + "step": 26280 + }, + { + "epoch": 1.29, + "grad_norm": 0.6172868013381958, + "learning_rate": 0.00036606562672178097, + "loss": 3.1135, + "step": 26281 + }, + { + "epoch": 1.29, + "grad_norm": 0.5785728693008423, + "learning_rate": 0.0003660506084129789, + "loss": 3.201, + "step": 26282 + }, + { + "epoch": 1.29, + "grad_norm": 0.60005784034729, + "learning_rate": 0.0003660355899302123, + "loss": 3.0202, + "step": 26283 + }, + { + "epoch": 1.29, + "grad_norm": 0.5991455912590027, + "learning_rate": 0.0003660205712735205, + "loss": 3.0963, + "step": 26284 + }, + { + "epoch": 1.29, + "grad_norm": 0.6216191649436951, + "learning_rate": 0.0003660055524429433, + "loss": 3.1045, + "step": 26285 + }, + { + "epoch": 1.29, + "grad_norm": 0.5496450662612915, + "learning_rate": 0.00036599053343852026, + "loss": 2.9522, + "step": 26286 + }, + { + "epoch": 1.29, + "grad_norm": 0.5841752886772156, + "learning_rate": 0.00036597551426029076, + "loss": 3.2331, + "step": 26287 + }, + { + "epoch": 1.29, + "grad_norm": 0.5961534380912781, + "learning_rate": 0.00036596049490829444, + "loss": 3.0853, + "step": 26288 + }, + { + "epoch": 1.29, + "grad_norm": 0.5837684273719788, + "learning_rate": 0.0003659454753825709, + "loss": 2.9664, + "step": 26289 + }, + { + "epoch": 1.29, + "grad_norm": 0.5820373892784119, + "learning_rate": 0.0003659304556831597, + "loss": 2.9441, + "step": 26290 + }, + { + "epoch": 1.29, + "grad_norm": 0.5966834425926208, + "learning_rate": 0.00036591543581010043, + "loss": 2.9539, + "step": 26291 + }, + { + "epoch": 1.29, + "grad_norm": 0.5738745331764221, + "learning_rate": 0.00036590041576343237, + "loss": 3.2159, + "step": 26292 + }, + { + "epoch": 1.29, + "grad_norm": 0.5458084344863892, + "learning_rate": 0.00036588539554319546, + "loss": 3.111, + "step": 26293 + }, + { + "epoch": 1.29, + "grad_norm": 0.5425732731819153, + "learning_rate": 0.00036587037514942917, + "loss": 3.3392, + "step": 26294 + }, + { + "epoch": 1.29, + "grad_norm": 0.6192220449447632, + "learning_rate": 0.0003658553545821728, + "loss": 3.0393, + "step": 26295 + }, + { + "epoch": 1.29, + "grad_norm": 0.5837422013282776, + "learning_rate": 0.0003658403338414662, + "loss": 3.2242, + "step": 26296 + }, + { + "epoch": 1.29, + "grad_norm": 0.5665894746780396, + "learning_rate": 0.00036582531292734887, + "loss": 3.3003, + "step": 26297 + }, + { + "epoch": 1.29, + "grad_norm": 0.5463784337043762, + "learning_rate": 0.00036581029183986027, + "loss": 3.0505, + "step": 26298 + }, + { + "epoch": 1.29, + "grad_norm": 0.5693387389183044, + "learning_rate": 0.00036579527057904005, + "loss": 3.14, + "step": 26299 + }, + { + "epoch": 1.29, + "grad_norm": 0.5675256848335266, + "learning_rate": 0.00036578024914492774, + "loss": 3.2636, + "step": 26300 + }, + { + "epoch": 1.29, + "grad_norm": 0.5448147654533386, + "learning_rate": 0.0003657652275375629, + "loss": 3.1507, + "step": 26301 + }, + { + "epoch": 1.29, + "grad_norm": 0.5830021500587463, + "learning_rate": 0.00036575020575698515, + "loss": 3.1082, + "step": 26302 + }, + { + "epoch": 1.29, + "grad_norm": 0.5744504332542419, + "learning_rate": 0.0003657351838032339, + "loss": 3.057, + "step": 26303 + }, + { + "epoch": 1.29, + "grad_norm": 0.5693845152854919, + "learning_rate": 0.00036572016167634895, + "loss": 2.9967, + "step": 26304 + }, + { + "epoch": 1.29, + "grad_norm": 0.5699017643928528, + "learning_rate": 0.00036570513937636974, + "loss": 3.131, + "step": 26305 + }, + { + "epoch": 1.29, + "grad_norm": 0.6183264851570129, + "learning_rate": 0.0003656901169033358, + "loss": 3.0087, + "step": 26306 + }, + { + "epoch": 1.29, + "grad_norm": 0.5810492634773254, + "learning_rate": 0.00036567509425728675, + "loss": 3.1105, + "step": 26307 + }, + { + "epoch": 1.29, + "grad_norm": 0.5565503239631653, + "learning_rate": 0.0003656600714382621, + "loss": 2.939, + "step": 26308 + }, + { + "epoch": 1.29, + "grad_norm": 0.5506647825241089, + "learning_rate": 0.00036564504844630155, + "loss": 3.0549, + "step": 26309 + }, + { + "epoch": 1.29, + "grad_norm": 0.53928142786026, + "learning_rate": 0.00036563002528144445, + "loss": 3.0072, + "step": 26310 + }, + { + "epoch": 1.29, + "grad_norm": 0.5917065143585205, + "learning_rate": 0.00036561500194373057, + "loss": 3.0097, + "step": 26311 + }, + { + "epoch": 1.29, + "grad_norm": 0.584202766418457, + "learning_rate": 0.0003655999784331994, + "loss": 3.2125, + "step": 26312 + }, + { + "epoch": 1.29, + "grad_norm": 0.5942773222923279, + "learning_rate": 0.00036558495474989044, + "loss": 2.9767, + "step": 26313 + }, + { + "epoch": 1.29, + "grad_norm": 0.5491356253623962, + "learning_rate": 0.0003655699308938434, + "loss": 3.1626, + "step": 26314 + }, + { + "epoch": 1.29, + "grad_norm": 0.5383910536766052, + "learning_rate": 0.0003655549068650978, + "loss": 3.2015, + "step": 26315 + }, + { + "epoch": 1.29, + "grad_norm": 0.5347186326980591, + "learning_rate": 0.00036553988266369323, + "loss": 3.1515, + "step": 26316 + }, + { + "epoch": 1.29, + "grad_norm": 0.5822970867156982, + "learning_rate": 0.0003655248582896692, + "loss": 3.1044, + "step": 26317 + }, + { + "epoch": 1.29, + "grad_norm": 0.5540775060653687, + "learning_rate": 0.0003655098337430651, + "loss": 3.1069, + "step": 26318 + }, + { + "epoch": 1.29, + "grad_norm": 0.5978838205337524, + "learning_rate": 0.00036549480902392094, + "loss": 2.8344, + "step": 26319 + }, + { + "epoch": 1.29, + "grad_norm": 0.546452522277832, + "learning_rate": 0.00036547978413227597, + "loss": 2.8552, + "step": 26320 + }, + { + "epoch": 1.29, + "grad_norm": 0.5403779149055481, + "learning_rate": 0.0003654647590681699, + "loss": 3.1634, + "step": 26321 + }, + { + "epoch": 1.29, + "grad_norm": 0.5638641119003296, + "learning_rate": 0.0003654497338316422, + "loss": 3.0284, + "step": 26322 + }, + { + "epoch": 1.29, + "grad_norm": 0.5681201219558716, + "learning_rate": 0.0003654347084227325, + "loss": 3.1705, + "step": 26323 + }, + { + "epoch": 1.29, + "grad_norm": 0.5366926193237305, + "learning_rate": 0.0003654196828414804, + "loss": 3.0462, + "step": 26324 + }, + { + "epoch": 1.29, + "grad_norm": 0.5784863829612732, + "learning_rate": 0.0003654046570879254, + "loss": 3.0464, + "step": 26325 + }, + { + "epoch": 1.29, + "grad_norm": 0.5681374073028564, + "learning_rate": 0.0003653896311621072, + "loss": 2.7868, + "step": 26326 + }, + { + "epoch": 1.29, + "grad_norm": 0.5848317742347717, + "learning_rate": 0.00036537460506406533, + "loss": 3.2213, + "step": 26327 + }, + { + "epoch": 1.29, + "grad_norm": 0.5783429741859436, + "learning_rate": 0.00036535957879383913, + "loss": 2.9188, + "step": 26328 + }, + { + "epoch": 1.29, + "grad_norm": 0.5719509124755859, + "learning_rate": 0.00036534455235146847, + "loss": 3.0323, + "step": 26329 + }, + { + "epoch": 1.29, + "grad_norm": 0.5757635235786438, + "learning_rate": 0.00036532952573699296, + "loss": 3.2186, + "step": 26330 + }, + { + "epoch": 1.29, + "grad_norm": 0.56982421875, + "learning_rate": 0.00036531449895045197, + "loss": 2.9804, + "step": 26331 + }, + { + "epoch": 1.29, + "grad_norm": 0.6023058295249939, + "learning_rate": 0.00036529947199188516, + "loss": 2.9169, + "step": 26332 + }, + { + "epoch": 1.29, + "grad_norm": 0.5599899888038635, + "learning_rate": 0.00036528444486133197, + "loss": 3.1561, + "step": 26333 + }, + { + "epoch": 1.29, + "grad_norm": 0.5620229840278625, + "learning_rate": 0.00036526941755883233, + "loss": 2.9379, + "step": 26334 + }, + { + "epoch": 1.29, + "grad_norm": 0.6028508543968201, + "learning_rate": 0.0003652543900844255, + "loss": 3.1183, + "step": 26335 + }, + { + "epoch": 1.29, + "grad_norm": 0.5630659461021423, + "learning_rate": 0.0003652393624381512, + "loss": 3.0555, + "step": 26336 + }, + { + "epoch": 1.29, + "grad_norm": 0.5554866790771484, + "learning_rate": 0.00036522433462004894, + "loss": 3.0993, + "step": 26337 + }, + { + "epoch": 1.29, + "grad_norm": 0.595605194568634, + "learning_rate": 0.0003652093066301583, + "loss": 2.8999, + "step": 26338 + }, + { + "epoch": 1.29, + "grad_norm": 0.5355490446090698, + "learning_rate": 0.00036519427846851897, + "loss": 2.9579, + "step": 26339 + }, + { + "epoch": 1.29, + "grad_norm": 0.5914220213890076, + "learning_rate": 0.0003651792501351704, + "loss": 2.9879, + "step": 26340 + }, + { + "epoch": 1.29, + "grad_norm": 0.5520111322402954, + "learning_rate": 0.0003651642216301523, + "loss": 2.8358, + "step": 26341 + }, + { + "epoch": 1.29, + "grad_norm": 0.5735923051834106, + "learning_rate": 0.00036514919295350414, + "loss": 3.1905, + "step": 26342 + }, + { + "epoch": 1.29, + "grad_norm": 0.583451509475708, + "learning_rate": 0.0003651341641052656, + "loss": 3.2077, + "step": 26343 + }, + { + "epoch": 1.29, + "grad_norm": 0.5587623715400696, + "learning_rate": 0.0003651191350854761, + "loss": 3.1237, + "step": 26344 + }, + { + "epoch": 1.29, + "grad_norm": 0.5999316573143005, + "learning_rate": 0.0003651041058941755, + "loss": 2.9136, + "step": 26345 + }, + { + "epoch": 1.29, + "grad_norm": 0.5469268560409546, + "learning_rate": 0.000365089076531403, + "loss": 3.1107, + "step": 26346 + }, + { + "epoch": 1.29, + "grad_norm": 0.5756465792655945, + "learning_rate": 0.0003650740469971985, + "loss": 3.0659, + "step": 26347 + }, + { + "epoch": 1.29, + "grad_norm": 0.5584861040115356, + "learning_rate": 0.00036505901729160146, + "loss": 3.095, + "step": 26348 + }, + { + "epoch": 1.29, + "grad_norm": 0.5867658853530884, + "learning_rate": 0.0003650439874146515, + "loss": 2.9141, + "step": 26349 + }, + { + "epoch": 1.29, + "grad_norm": 0.5676002502441406, + "learning_rate": 0.0003650289573663882, + "loss": 3.2514, + "step": 26350 + }, + { + "epoch": 1.29, + "grad_norm": 0.5888804793357849, + "learning_rate": 0.0003650139271468512, + "loss": 3.1704, + "step": 26351 + }, + { + "epoch": 1.29, + "grad_norm": 0.5709172487258911, + "learning_rate": 0.00036499889675608, + "loss": 3.2181, + "step": 26352 + }, + { + "epoch": 1.29, + "grad_norm": 0.560498058795929, + "learning_rate": 0.0003649838661941142, + "loss": 3.1361, + "step": 26353 + }, + { + "epoch": 1.29, + "grad_norm": 0.5983741283416748, + "learning_rate": 0.0003649688354609934, + "loss": 2.8439, + "step": 26354 + }, + { + "epoch": 1.29, + "grad_norm": 0.5880367159843445, + "learning_rate": 0.0003649538045567571, + "loss": 2.862, + "step": 26355 + }, + { + "epoch": 1.29, + "grad_norm": 0.5435540080070496, + "learning_rate": 0.00036493877348144516, + "loss": 3.0035, + "step": 26356 + }, + { + "epoch": 1.29, + "grad_norm": 0.5692511796951294, + "learning_rate": 0.00036492374223509686, + "loss": 3.3313, + "step": 26357 + }, + { + "epoch": 1.29, + "grad_norm": 0.5610879063606262, + "learning_rate": 0.000364908710817752, + "loss": 3.1846, + "step": 26358 + }, + { + "epoch": 1.29, + "grad_norm": 0.5607772469520569, + "learning_rate": 0.00036489367922945, + "loss": 3.1864, + "step": 26359 + }, + { + "epoch": 1.29, + "grad_norm": 0.5416234731674194, + "learning_rate": 0.00036487864747023055, + "loss": 3.1087, + "step": 26360 + }, + { + "epoch": 1.29, + "grad_norm": 0.5746535062789917, + "learning_rate": 0.0003648636155401333, + "loss": 2.9298, + "step": 26361 + }, + { + "epoch": 1.29, + "grad_norm": 0.5644704699516296, + "learning_rate": 0.0003648485834391977, + "loss": 3.1022, + "step": 26362 + }, + { + "epoch": 1.29, + "grad_norm": 0.5640776753425598, + "learning_rate": 0.00036483355116746346, + "loss": 3.1836, + "step": 26363 + }, + { + "epoch": 1.29, + "grad_norm": 0.6316478848457336, + "learning_rate": 0.00036481851872497003, + "loss": 2.929, + "step": 26364 + }, + { + "epoch": 1.29, + "grad_norm": 0.5855584740638733, + "learning_rate": 0.0003648034861117572, + "loss": 2.9929, + "step": 26365 + }, + { + "epoch": 1.29, + "grad_norm": 0.5345319509506226, + "learning_rate": 0.00036478845332786446, + "loss": 3.098, + "step": 26366 + }, + { + "epoch": 1.29, + "grad_norm": 0.6064936518669128, + "learning_rate": 0.0003647734203733314, + "loss": 3.1827, + "step": 26367 + }, + { + "epoch": 1.29, + "grad_norm": 0.5909245610237122, + "learning_rate": 0.0003647583872481976, + "loss": 2.9642, + "step": 26368 + }, + { + "epoch": 1.29, + "grad_norm": 0.5904161334037781, + "learning_rate": 0.00036474335395250256, + "loss": 2.8956, + "step": 26369 + }, + { + "epoch": 1.29, + "grad_norm": 0.6184599995613098, + "learning_rate": 0.00036472832048628615, + "loss": 3.0058, + "step": 26370 + }, + { + "epoch": 1.29, + "grad_norm": 0.5821686387062073, + "learning_rate": 0.0003647132868495877, + "loss": 3.1638, + "step": 26371 + }, + { + "epoch": 1.29, + "grad_norm": 0.5509762167930603, + "learning_rate": 0.00036469825304244696, + "loss": 3.0594, + "step": 26372 + }, + { + "epoch": 1.29, + "grad_norm": 0.5704058408737183, + "learning_rate": 0.00036468321906490346, + "loss": 3.077, + "step": 26373 + }, + { + "epoch": 1.29, + "grad_norm": 0.5785650014877319, + "learning_rate": 0.0003646681849169968, + "loss": 3.288, + "step": 26374 + }, + { + "epoch": 1.29, + "grad_norm": 0.5538581609725952, + "learning_rate": 0.0003646531505987666, + "loss": 3.2065, + "step": 26375 + }, + { + "epoch": 1.29, + "grad_norm": 0.5718187093734741, + "learning_rate": 0.00036463811611025246, + "loss": 3.0738, + "step": 26376 + }, + { + "epoch": 1.29, + "grad_norm": 0.5524046421051025, + "learning_rate": 0.0003646230814514939, + "loss": 2.9332, + "step": 26377 + }, + { + "epoch": 1.29, + "grad_norm": 0.528167188167572, + "learning_rate": 0.0003646080466225307, + "loss": 3.0779, + "step": 26378 + }, + { + "epoch": 1.29, + "grad_norm": 0.5965219736099243, + "learning_rate": 0.00036459301162340224, + "loss": 3.0625, + "step": 26379 + }, + { + "epoch": 1.29, + "grad_norm": 0.5643555521965027, + "learning_rate": 0.00036457797645414816, + "loss": 3.1285, + "step": 26380 + }, + { + "epoch": 1.29, + "grad_norm": 0.5772526264190674, + "learning_rate": 0.00036456294111480823, + "loss": 2.9713, + "step": 26381 + }, + { + "epoch": 1.29, + "grad_norm": 0.5830170512199402, + "learning_rate": 0.0003645479056054219, + "loss": 3.3178, + "step": 26382 + }, + { + "epoch": 1.29, + "grad_norm": 0.5554019808769226, + "learning_rate": 0.0003645328699260288, + "loss": 3.0084, + "step": 26383 + }, + { + "epoch": 1.29, + "grad_norm": 0.6931495070457458, + "learning_rate": 0.0003645178340766686, + "loss": 3.0225, + "step": 26384 + }, + { + "epoch": 1.29, + "grad_norm": 0.5578175187110901, + "learning_rate": 0.0003645027980573807, + "loss": 2.7523, + "step": 26385 + }, + { + "epoch": 1.29, + "grad_norm": 0.5534636378288269, + "learning_rate": 0.00036448776186820495, + "loss": 3.299, + "step": 26386 + }, + { + "epoch": 1.29, + "grad_norm": 0.5296128392219543, + "learning_rate": 0.00036447272550918085, + "loss": 3.2003, + "step": 26387 + }, + { + "epoch": 1.29, + "grad_norm": 0.550498902797699, + "learning_rate": 0.00036445768898034803, + "loss": 2.945, + "step": 26388 + }, + { + "epoch": 1.29, + "grad_norm": 0.5512216687202454, + "learning_rate": 0.000364442652281746, + "loss": 3.1243, + "step": 26389 + }, + { + "epoch": 1.29, + "grad_norm": 0.552977979183197, + "learning_rate": 0.0003644276154134144, + "loss": 3.0263, + "step": 26390 + }, + { + "epoch": 1.29, + "grad_norm": 0.5711373090744019, + "learning_rate": 0.00036441257837539293, + "loss": 3.0337, + "step": 26391 + }, + { + "epoch": 1.29, + "grad_norm": 0.6151379942893982, + "learning_rate": 0.00036439754116772107, + "loss": 3.0109, + "step": 26392 + }, + { + "epoch": 1.29, + "grad_norm": 0.5529595017433167, + "learning_rate": 0.0003643825037904385, + "loss": 3.1506, + "step": 26393 + }, + { + "epoch": 1.29, + "grad_norm": 0.5650982856750488, + "learning_rate": 0.00036436746624358485, + "loss": 3.193, + "step": 26394 + }, + { + "epoch": 1.29, + "grad_norm": 0.5503296852111816, + "learning_rate": 0.0003643524285271996, + "loss": 3.1614, + "step": 26395 + }, + { + "epoch": 1.29, + "grad_norm": 0.5685902237892151, + "learning_rate": 0.00036433739064132257, + "loss": 3.0464, + "step": 26396 + }, + { + "epoch": 1.29, + "grad_norm": 0.5520438551902771, + "learning_rate": 0.0003643223525859931, + "loss": 3.0988, + "step": 26397 + }, + { + "epoch": 1.29, + "grad_norm": 0.5292802453041077, + "learning_rate": 0.00036430731436125097, + "loss": 3.1851, + "step": 26398 + }, + { + "epoch": 1.29, + "grad_norm": 0.5346623659133911, + "learning_rate": 0.00036429227596713575, + "loss": 3.0767, + "step": 26399 + }, + { + "epoch": 1.29, + "grad_norm": 0.5210029482841492, + "learning_rate": 0.000364277237403687, + "loss": 3.0894, + "step": 26400 + }, + { + "epoch": 1.29, + "grad_norm": 0.5822991728782654, + "learning_rate": 0.00036426219867094446, + "loss": 3.1194, + "step": 26401 + }, + { + "epoch": 1.29, + "grad_norm": 0.5511487126350403, + "learning_rate": 0.0003642471597689477, + "loss": 2.9108, + "step": 26402 + }, + { + "epoch": 1.29, + "grad_norm": 0.575802743434906, + "learning_rate": 0.0003642321206977362, + "loss": 3.0977, + "step": 26403 + }, + { + "epoch": 1.29, + "grad_norm": 0.5361688137054443, + "learning_rate": 0.00036421708145734967, + "loss": 3.2185, + "step": 26404 + }, + { + "epoch": 1.29, + "grad_norm": 0.5705873966217041, + "learning_rate": 0.00036420204204782767, + "loss": 3.0483, + "step": 26405 + }, + { + "epoch": 1.29, + "grad_norm": 0.5684397220611572, + "learning_rate": 0.00036418700246920987, + "loss": 2.9966, + "step": 26406 + }, + { + "epoch": 1.29, + "grad_norm": 0.6203626394271851, + "learning_rate": 0.00036417196272153595, + "loss": 3.0679, + "step": 26407 + }, + { + "epoch": 1.29, + "grad_norm": 0.55103999376297, + "learning_rate": 0.0003641569228048453, + "loss": 3.128, + "step": 26408 + }, + { + "epoch": 1.29, + "grad_norm": 0.5689854621887207, + "learning_rate": 0.0003641418827191777, + "loss": 2.9817, + "step": 26409 + }, + { + "epoch": 1.29, + "grad_norm": 0.5948728322982788, + "learning_rate": 0.00036412684246457276, + "loss": 3.2106, + "step": 26410 + }, + { + "epoch": 1.29, + "grad_norm": 0.5416500568389893, + "learning_rate": 0.0003641118020410701, + "loss": 2.9918, + "step": 26411 + }, + { + "epoch": 1.29, + "grad_norm": 0.5819432139396667, + "learning_rate": 0.00036409676144870916, + "loss": 2.9693, + "step": 26412 + }, + { + "epoch": 1.29, + "grad_norm": 0.5323217511177063, + "learning_rate": 0.00036408172068752977, + "loss": 3.0868, + "step": 26413 + }, + { + "epoch": 1.29, + "grad_norm": 0.5703701972961426, + "learning_rate": 0.00036406667975757143, + "loss": 3.0828, + "step": 26414 + }, + { + "epoch": 1.29, + "grad_norm": 0.5979413390159607, + "learning_rate": 0.00036405163865887383, + "loss": 2.9349, + "step": 26415 + }, + { + "epoch": 1.29, + "grad_norm": 0.6079196929931641, + "learning_rate": 0.0003640365973914765, + "loss": 3.2151, + "step": 26416 + }, + { + "epoch": 1.29, + "grad_norm": 0.5648472905158997, + "learning_rate": 0.0003640215559554191, + "loss": 3.2673, + "step": 26417 + }, + { + "epoch": 1.29, + "grad_norm": 0.576112687587738, + "learning_rate": 0.00036400651435074136, + "loss": 3.2403, + "step": 26418 + }, + { + "epoch": 1.29, + "grad_norm": 0.6245038509368896, + "learning_rate": 0.0003639914725774826, + "loss": 3.04, + "step": 26419 + }, + { + "epoch": 1.29, + "grad_norm": 0.5649415254592896, + "learning_rate": 0.0003639764306356827, + "loss": 2.7593, + "step": 26420 + }, + { + "epoch": 1.29, + "grad_norm": 0.6045398116111755, + "learning_rate": 0.0003639613885253812, + "loss": 2.9559, + "step": 26421 + }, + { + "epoch": 1.29, + "grad_norm": 0.5652473568916321, + "learning_rate": 0.00036394634624661767, + "loss": 2.9299, + "step": 26422 + }, + { + "epoch": 1.29, + "grad_norm": 0.5572347640991211, + "learning_rate": 0.0003639313037994318, + "loss": 3.1491, + "step": 26423 + }, + { + "epoch": 1.29, + "grad_norm": 0.7249118685722351, + "learning_rate": 0.00036391626118386316, + "loss": 3.1411, + "step": 26424 + }, + { + "epoch": 1.3, + "grad_norm": 0.581577718257904, + "learning_rate": 0.00036390121839995147, + "loss": 3.0565, + "step": 26425 + }, + { + "epoch": 1.3, + "grad_norm": 0.5638243556022644, + "learning_rate": 0.00036388617544773615, + "loss": 3.0026, + "step": 26426 + }, + { + "epoch": 1.3, + "grad_norm": 0.585462749004364, + "learning_rate": 0.000363871132327257, + "loss": 3.1258, + "step": 26427 + }, + { + "epoch": 1.3, + "grad_norm": 0.6039126515388489, + "learning_rate": 0.00036385608903855357, + "loss": 3.2572, + "step": 26428 + }, + { + "epoch": 1.3, + "grad_norm": 0.5659081339836121, + "learning_rate": 0.0003638410455816655, + "loss": 3.0501, + "step": 26429 + }, + { + "epoch": 1.3, + "grad_norm": 0.610337495803833, + "learning_rate": 0.00036382600195663243, + "loss": 2.9333, + "step": 26430 + }, + { + "epoch": 1.3, + "grad_norm": 0.5635059475898743, + "learning_rate": 0.00036381095816349386, + "loss": 3.2617, + "step": 26431 + }, + { + "epoch": 1.3, + "grad_norm": 0.5821727514266968, + "learning_rate": 0.00036379591420228964, + "loss": 3.0316, + "step": 26432 + }, + { + "epoch": 1.3, + "grad_norm": 0.6468302607536316, + "learning_rate": 0.0003637808700730592, + "loss": 3.1508, + "step": 26433 + }, + { + "epoch": 1.3, + "grad_norm": 0.6181358098983765, + "learning_rate": 0.0003637658257758421, + "loss": 3.0418, + "step": 26434 + }, + { + "epoch": 1.3, + "grad_norm": 0.5608850717544556, + "learning_rate": 0.0003637507813106783, + "loss": 3.0204, + "step": 26435 + }, + { + "epoch": 1.3, + "grad_norm": 0.584475576877594, + "learning_rate": 0.0003637357366776071, + "loss": 3.158, + "step": 26436 + }, + { + "epoch": 1.3, + "grad_norm": 0.5593780875205994, + "learning_rate": 0.00036372069187666826, + "loss": 3.0006, + "step": 26437 + }, + { + "epoch": 1.3, + "grad_norm": 0.5664603114128113, + "learning_rate": 0.0003637056469079013, + "loss": 3.2513, + "step": 26438 + }, + { + "epoch": 1.3, + "grad_norm": 0.5719146132469177, + "learning_rate": 0.00036369060177134603, + "loss": 3.076, + "step": 26439 + }, + { + "epoch": 1.3, + "grad_norm": 0.5447546243667603, + "learning_rate": 0.000363675556467042, + "loss": 2.7886, + "step": 26440 + }, + { + "epoch": 1.3, + "grad_norm": 0.607576310634613, + "learning_rate": 0.00036366051099502865, + "loss": 3.0216, + "step": 26441 + }, + { + "epoch": 1.3, + "grad_norm": 0.5661342740058899, + "learning_rate": 0.00036364546535534584, + "loss": 3.0811, + "step": 26442 + }, + { + "epoch": 1.3, + "grad_norm": 0.5804358124732971, + "learning_rate": 0.0003636304195480332, + "loss": 3.0749, + "step": 26443 + }, + { + "epoch": 1.3, + "grad_norm": 0.5507537126541138, + "learning_rate": 0.00036361537357313024, + "loss": 3.028, + "step": 26444 + }, + { + "epoch": 1.3, + "grad_norm": 0.6021802425384521, + "learning_rate": 0.0003636003274306766, + "loss": 3.0707, + "step": 26445 + }, + { + "epoch": 1.3, + "grad_norm": 0.6064923405647278, + "learning_rate": 0.0003635852811207119, + "loss": 3.1915, + "step": 26446 + }, + { + "epoch": 1.3, + "grad_norm": 0.5652244687080383, + "learning_rate": 0.0003635702346432759, + "loss": 2.9867, + "step": 26447 + }, + { + "epoch": 1.3, + "grad_norm": 0.5297040939331055, + "learning_rate": 0.0003635551879984081, + "loss": 3.2154, + "step": 26448 + }, + { + "epoch": 1.3, + "grad_norm": 0.6315213441848755, + "learning_rate": 0.00036354014118614815, + "loss": 2.971, + "step": 26449 + }, + { + "epoch": 1.3, + "grad_norm": 0.5433605313301086, + "learning_rate": 0.0003635250942065357, + "loss": 3.0269, + "step": 26450 + }, + { + "epoch": 1.3, + "grad_norm": 0.5568627119064331, + "learning_rate": 0.0003635100470596104, + "loss": 3.047, + "step": 26451 + }, + { + "epoch": 1.3, + "grad_norm": 0.563654363155365, + "learning_rate": 0.0003634949997454118, + "loss": 2.9285, + "step": 26452 + }, + { + "epoch": 1.3, + "grad_norm": 0.5413941740989685, + "learning_rate": 0.00036347995226397963, + "loss": 3.1223, + "step": 26453 + }, + { + "epoch": 1.3, + "grad_norm": 0.5784571766853333, + "learning_rate": 0.00036346490461535347, + "loss": 3.0467, + "step": 26454 + }, + { + "epoch": 1.3, + "grad_norm": 0.5411424040794373, + "learning_rate": 0.00036344985679957303, + "loss": 3.1296, + "step": 26455 + }, + { + "epoch": 1.3, + "grad_norm": 0.5692468285560608, + "learning_rate": 0.00036343480881667785, + "loss": 2.8744, + "step": 26456 + }, + { + "epoch": 1.3, + "grad_norm": 0.6368603706359863, + "learning_rate": 0.00036341976066670754, + "loss": 3.0441, + "step": 26457 + }, + { + "epoch": 1.3, + "grad_norm": 0.546215832233429, + "learning_rate": 0.00036340471234970186, + "loss": 3.0811, + "step": 26458 + }, + { + "epoch": 1.3, + "grad_norm": 0.582275390625, + "learning_rate": 0.0003633896638657003, + "loss": 3.0124, + "step": 26459 + }, + { + "epoch": 1.3, + "grad_norm": 0.5756465792655945, + "learning_rate": 0.00036337461521474256, + "loss": 2.997, + "step": 26460 + }, + { + "epoch": 1.3, + "grad_norm": 0.6010496020317078, + "learning_rate": 0.0003633595663968683, + "loss": 3.0752, + "step": 26461 + }, + { + "epoch": 1.3, + "grad_norm": 0.5619287490844727, + "learning_rate": 0.00036334451741211715, + "loss": 2.9749, + "step": 26462 + }, + { + "epoch": 1.3, + "grad_norm": 0.5364715456962585, + "learning_rate": 0.00036332946826052873, + "loss": 3.3402, + "step": 26463 + }, + { + "epoch": 1.3, + "grad_norm": 0.5843341946601868, + "learning_rate": 0.00036331441894214275, + "loss": 3.1867, + "step": 26464 + }, + { + "epoch": 1.3, + "grad_norm": 0.5982407331466675, + "learning_rate": 0.00036329936945699866, + "loss": 3.0285, + "step": 26465 + }, + { + "epoch": 1.3, + "grad_norm": 0.5931727290153503, + "learning_rate": 0.0003632843198051363, + "loss": 2.9798, + "step": 26466 + }, + { + "epoch": 1.3, + "grad_norm": 0.5525808930397034, + "learning_rate": 0.00036326926998659514, + "loss": 3.1638, + "step": 26467 + }, + { + "epoch": 1.3, + "grad_norm": 0.5755184888839722, + "learning_rate": 0.0003632542200014149, + "loss": 3.1532, + "step": 26468 + }, + { + "epoch": 1.3, + "grad_norm": 0.5768822431564331, + "learning_rate": 0.00036323916984963534, + "loss": 3.1317, + "step": 26469 + }, + { + "epoch": 1.3, + "grad_norm": 0.5717555284500122, + "learning_rate": 0.0003632241195312959, + "loss": 3.1596, + "step": 26470 + }, + { + "epoch": 1.3, + "grad_norm": 0.5730190873146057, + "learning_rate": 0.00036320906904643624, + "loss": 2.9991, + "step": 26471 + }, + { + "epoch": 1.3, + "grad_norm": 0.5641260147094727, + "learning_rate": 0.00036319401839509616, + "loss": 3.223, + "step": 26472 + }, + { + "epoch": 1.3, + "grad_norm": 0.5625524520874023, + "learning_rate": 0.0003631789675773151, + "loss": 3.0021, + "step": 26473 + }, + { + "epoch": 1.3, + "grad_norm": 0.5481633543968201, + "learning_rate": 0.00036316391659313283, + "loss": 3.018, + "step": 26474 + }, + { + "epoch": 1.3, + "grad_norm": 0.6047376990318298, + "learning_rate": 0.000363148865442589, + "loss": 2.9506, + "step": 26475 + }, + { + "epoch": 1.3, + "grad_norm": 0.5548542737960815, + "learning_rate": 0.00036313381412572313, + "loss": 3.0886, + "step": 26476 + }, + { + "epoch": 1.3, + "grad_norm": 0.5693079829216003, + "learning_rate": 0.000363118762642575, + "loss": 3.0023, + "step": 26477 + }, + { + "epoch": 1.3, + "grad_norm": 0.5523504614830017, + "learning_rate": 0.00036310371099318423, + "loss": 3.0461, + "step": 26478 + }, + { + "epoch": 1.3, + "grad_norm": 0.5627286434173584, + "learning_rate": 0.0003630886591775904, + "loss": 3.0872, + "step": 26479 + }, + { + "epoch": 1.3, + "grad_norm": 0.5450271368026733, + "learning_rate": 0.00036307360719583316, + "loss": 2.9879, + "step": 26480 + }, + { + "epoch": 1.3, + "grad_norm": 0.5695874094963074, + "learning_rate": 0.0003630585550479522, + "loss": 2.8817, + "step": 26481 + }, + { + "epoch": 1.3, + "grad_norm": 0.5733633041381836, + "learning_rate": 0.0003630435027339871, + "loss": 3.1564, + "step": 26482 + }, + { + "epoch": 1.3, + "grad_norm": 0.6155542731285095, + "learning_rate": 0.00036302845025397765, + "loss": 3.1808, + "step": 26483 + }, + { + "epoch": 1.3, + "grad_norm": 0.5391807556152344, + "learning_rate": 0.00036301339760796335, + "loss": 3.1328, + "step": 26484 + }, + { + "epoch": 1.3, + "grad_norm": 0.5551477670669556, + "learning_rate": 0.0003629983447959838, + "loss": 3.0805, + "step": 26485 + }, + { + "epoch": 1.3, + "grad_norm": 0.5700846910476685, + "learning_rate": 0.0003629832918180788, + "loss": 3.0817, + "step": 26486 + }, + { + "epoch": 1.3, + "grad_norm": 0.5457755327224731, + "learning_rate": 0.00036296823867428793, + "loss": 3.0501, + "step": 26487 + }, + { + "epoch": 1.3, + "grad_norm": 0.5421336889266968, + "learning_rate": 0.0003629531853646508, + "loss": 3.0883, + "step": 26488 + }, + { + "epoch": 1.3, + "grad_norm": 0.5504266023635864, + "learning_rate": 0.00036293813188920717, + "loss": 3.1587, + "step": 26489 + }, + { + "epoch": 1.3, + "grad_norm": 0.5366515517234802, + "learning_rate": 0.00036292307824799654, + "loss": 3.1722, + "step": 26490 + }, + { + "epoch": 1.3, + "grad_norm": 0.5298593044281006, + "learning_rate": 0.0003629080244410587, + "loss": 3.1069, + "step": 26491 + }, + { + "epoch": 1.3, + "grad_norm": 0.5564733147621155, + "learning_rate": 0.0003628929704684332, + "loss": 3.0798, + "step": 26492 + }, + { + "epoch": 1.3, + "grad_norm": 0.5588745474815369, + "learning_rate": 0.0003628779163301597, + "loss": 3.2701, + "step": 26493 + }, + { + "epoch": 1.3, + "grad_norm": 0.6034541130065918, + "learning_rate": 0.0003628628620262779, + "loss": 2.9638, + "step": 26494 + }, + { + "epoch": 1.3, + "grad_norm": 0.54830402135849, + "learning_rate": 0.0003628478075568274, + "loss": 3.1212, + "step": 26495 + }, + { + "epoch": 1.3, + "grad_norm": 0.6392161250114441, + "learning_rate": 0.0003628327529218479, + "loss": 3.3524, + "step": 26496 + }, + { + "epoch": 1.3, + "grad_norm": 0.5647044777870178, + "learning_rate": 0.000362817698121379, + "loss": 3.0019, + "step": 26497 + }, + { + "epoch": 1.3, + "grad_norm": 0.5975145697593689, + "learning_rate": 0.0003628026431554603, + "loss": 3.0822, + "step": 26498 + }, + { + "epoch": 1.3, + "grad_norm": 0.5746291279792786, + "learning_rate": 0.00036278758802413166, + "loss": 3.2055, + "step": 26499 + }, + { + "epoch": 1.3, + "grad_norm": 0.6121372580528259, + "learning_rate": 0.0003627725327274325, + "loss": 3.0877, + "step": 26500 + }, + { + "epoch": 1.3, + "grad_norm": 0.550165593624115, + "learning_rate": 0.0003627574772654026, + "loss": 3.1267, + "step": 26501 + }, + { + "epoch": 1.3, + "grad_norm": 0.6445377469062805, + "learning_rate": 0.00036274242163808164, + "loss": 3.2958, + "step": 26502 + }, + { + "epoch": 1.3, + "grad_norm": 0.5646840333938599, + "learning_rate": 0.00036272736584550913, + "loss": 2.9063, + "step": 26503 + }, + { + "epoch": 1.3, + "grad_norm": 0.6248049736022949, + "learning_rate": 0.0003627123098877248, + "loss": 3.142, + "step": 26504 + }, + { + "epoch": 1.3, + "grad_norm": 0.5855923295021057, + "learning_rate": 0.00036269725376476835, + "loss": 2.976, + "step": 26505 + }, + { + "epoch": 1.3, + "grad_norm": 0.5521533489227295, + "learning_rate": 0.0003626821974766795, + "loss": 3.2744, + "step": 26506 + }, + { + "epoch": 1.3, + "grad_norm": 0.6442424058914185, + "learning_rate": 0.00036266714102349773, + "loss": 3.0419, + "step": 26507 + }, + { + "epoch": 1.3, + "grad_norm": 0.5635976791381836, + "learning_rate": 0.00036265208440526266, + "loss": 3.0997, + "step": 26508 + }, + { + "epoch": 1.3, + "grad_norm": 0.671798825263977, + "learning_rate": 0.0003626370276220142, + "loss": 3.037, + "step": 26509 + }, + { + "epoch": 1.3, + "grad_norm": 0.5717962384223938, + "learning_rate": 0.0003626219706737918, + "loss": 3.3547, + "step": 26510 + }, + { + "epoch": 1.3, + "grad_norm": 0.5673326253890991, + "learning_rate": 0.0003626069135606352, + "loss": 3.074, + "step": 26511 + }, + { + "epoch": 1.3, + "grad_norm": 0.5579215884208679, + "learning_rate": 0.00036259185628258406, + "loss": 3.1001, + "step": 26512 + }, + { + "epoch": 1.3, + "grad_norm": 0.6117144227027893, + "learning_rate": 0.000362576798839678, + "loss": 2.9077, + "step": 26513 + }, + { + "epoch": 1.3, + "grad_norm": 0.5998569130897522, + "learning_rate": 0.00036256174123195663, + "loss": 3.0331, + "step": 26514 + }, + { + "epoch": 1.3, + "grad_norm": 0.618823230266571, + "learning_rate": 0.00036254668345945976, + "loss": 3.2402, + "step": 26515 + }, + { + "epoch": 1.3, + "grad_norm": 0.6104410290718079, + "learning_rate": 0.0003625316255222269, + "loss": 3.0506, + "step": 26516 + }, + { + "epoch": 1.3, + "grad_norm": 0.6050528883934021, + "learning_rate": 0.0003625165674202979, + "loss": 3.0046, + "step": 26517 + }, + { + "epoch": 1.3, + "grad_norm": 0.5506466627120972, + "learning_rate": 0.0003625015091537121, + "loss": 2.9101, + "step": 26518 + }, + { + "epoch": 1.3, + "grad_norm": 0.5498501658439636, + "learning_rate": 0.0003624864507225094, + "loss": 3.0459, + "step": 26519 + }, + { + "epoch": 1.3, + "grad_norm": 0.5689669251441956, + "learning_rate": 0.0003624713921267296, + "loss": 3.1916, + "step": 26520 + }, + { + "epoch": 1.3, + "grad_norm": 0.5426681637763977, + "learning_rate": 0.000362456333366412, + "loss": 2.9685, + "step": 26521 + }, + { + "epoch": 1.3, + "grad_norm": 0.604785680770874, + "learning_rate": 0.0003624412744415965, + "loss": 3.0663, + "step": 26522 + }, + { + "epoch": 1.3, + "grad_norm": 0.5710480213165283, + "learning_rate": 0.0003624262153523226, + "loss": 2.9204, + "step": 26523 + }, + { + "epoch": 1.3, + "grad_norm": 0.6017956733703613, + "learning_rate": 0.0003624111560986302, + "loss": 3.011, + "step": 26524 + }, + { + "epoch": 1.3, + "grad_norm": 0.5829985737800598, + "learning_rate": 0.00036239609668055876, + "loss": 2.9643, + "step": 26525 + }, + { + "epoch": 1.3, + "grad_norm": 0.6299350261688232, + "learning_rate": 0.000362381037098148, + "loss": 2.9682, + "step": 26526 + }, + { + "epoch": 1.3, + "grad_norm": 0.613353967666626, + "learning_rate": 0.0003623659773514376, + "loss": 3.0985, + "step": 26527 + }, + { + "epoch": 1.3, + "grad_norm": 0.5568817257881165, + "learning_rate": 0.00036235091744046724, + "loss": 2.8491, + "step": 26528 + }, + { + "epoch": 1.3, + "grad_norm": 0.5802782773971558, + "learning_rate": 0.0003623358573652765, + "loss": 3.1953, + "step": 26529 + }, + { + "epoch": 1.3, + "grad_norm": 0.5671431422233582, + "learning_rate": 0.0003623207971259052, + "loss": 2.9758, + "step": 26530 + }, + { + "epoch": 1.3, + "grad_norm": 0.5735278725624084, + "learning_rate": 0.0003623057367223929, + "loss": 3.119, + "step": 26531 + }, + { + "epoch": 1.3, + "grad_norm": 0.6418932676315308, + "learning_rate": 0.0003622906761547793, + "loss": 2.8465, + "step": 26532 + }, + { + "epoch": 1.3, + "grad_norm": 0.6021503210067749, + "learning_rate": 0.0003622756154231039, + "loss": 2.9617, + "step": 26533 + }, + { + "epoch": 1.3, + "grad_norm": 0.5770435929298401, + "learning_rate": 0.0003622605545274067, + "loss": 3.1281, + "step": 26534 + }, + { + "epoch": 1.3, + "grad_norm": 0.6095218062400818, + "learning_rate": 0.0003622454934677271, + "loss": 2.9292, + "step": 26535 + }, + { + "epoch": 1.3, + "grad_norm": 0.6107041239738464, + "learning_rate": 0.00036223043224410483, + "loss": 3.124, + "step": 26536 + }, + { + "epoch": 1.3, + "grad_norm": 0.5658743977546692, + "learning_rate": 0.0003622153708565796, + "loss": 3.0487, + "step": 26537 + }, + { + "epoch": 1.3, + "grad_norm": 0.5829092264175415, + "learning_rate": 0.000362200309305191, + "loss": 3.0127, + "step": 26538 + }, + { + "epoch": 1.3, + "grad_norm": 0.5712616443634033, + "learning_rate": 0.00036218524758997885, + "loss": 3.0086, + "step": 26539 + }, + { + "epoch": 1.3, + "grad_norm": 0.5344133377075195, + "learning_rate": 0.0003621701857109827, + "loss": 3.1907, + "step": 26540 + }, + { + "epoch": 1.3, + "grad_norm": 0.5803573727607727, + "learning_rate": 0.0003621551236682422, + "loss": 3.1347, + "step": 26541 + }, + { + "epoch": 1.3, + "grad_norm": 0.566353976726532, + "learning_rate": 0.0003621400614617972, + "loss": 3.1542, + "step": 26542 + }, + { + "epoch": 1.3, + "grad_norm": 0.5749666094779968, + "learning_rate": 0.00036212499909168706, + "loss": 3.1458, + "step": 26543 + }, + { + "epoch": 1.3, + "grad_norm": 0.6262149214744568, + "learning_rate": 0.00036210993655795163, + "loss": 3.1817, + "step": 26544 + }, + { + "epoch": 1.3, + "grad_norm": 0.629041314125061, + "learning_rate": 0.00036209487386063075, + "loss": 3.0455, + "step": 26545 + }, + { + "epoch": 1.3, + "grad_norm": 0.5451223850250244, + "learning_rate": 0.0003620798109997638, + "loss": 3.1538, + "step": 26546 + }, + { + "epoch": 1.3, + "grad_norm": 0.5455161929130554, + "learning_rate": 0.00036206474797539066, + "loss": 3.1579, + "step": 26547 + }, + { + "epoch": 1.3, + "grad_norm": 0.5440385341644287, + "learning_rate": 0.0003620496847875508, + "loss": 3.1674, + "step": 26548 + }, + { + "epoch": 1.3, + "grad_norm": 0.5767881274223328, + "learning_rate": 0.00036203462143628406, + "loss": 3.1339, + "step": 26549 + }, + { + "epoch": 1.3, + "grad_norm": 0.5350778698921204, + "learning_rate": 0.0003620195579216301, + "loss": 3.0594, + "step": 26550 + }, + { + "epoch": 1.3, + "grad_norm": 0.5580967664718628, + "learning_rate": 0.0003620044942436285, + "loss": 3.04, + "step": 26551 + }, + { + "epoch": 1.3, + "grad_norm": 0.5478173494338989, + "learning_rate": 0.00036198943040231904, + "loss": 3.1905, + "step": 26552 + }, + { + "epoch": 1.3, + "grad_norm": 0.6238259077072144, + "learning_rate": 0.00036197436639774145, + "loss": 3.0206, + "step": 26553 + }, + { + "epoch": 1.3, + "grad_norm": 0.6044278144836426, + "learning_rate": 0.00036195930222993506, + "loss": 3.0768, + "step": 26554 + }, + { + "epoch": 1.3, + "grad_norm": 0.5655761957168579, + "learning_rate": 0.00036194423789893997, + "loss": 3.1671, + "step": 26555 + }, + { + "epoch": 1.3, + "grad_norm": 0.5538684725761414, + "learning_rate": 0.00036192917340479566, + "loss": 3.0836, + "step": 26556 + }, + { + "epoch": 1.3, + "grad_norm": 0.5838094353675842, + "learning_rate": 0.0003619141087475418, + "loss": 3.0374, + "step": 26557 + }, + { + "epoch": 1.3, + "grad_norm": 0.5795071721076965, + "learning_rate": 0.00036189904392721814, + "loss": 2.9839, + "step": 26558 + }, + { + "epoch": 1.3, + "grad_norm": 0.5634998083114624, + "learning_rate": 0.0003618839789438642, + "loss": 3.0211, + "step": 26559 + }, + { + "epoch": 1.3, + "grad_norm": 0.5954675078392029, + "learning_rate": 0.0003618689137975199, + "loss": 3.102, + "step": 26560 + }, + { + "epoch": 1.3, + "grad_norm": 0.5686203241348267, + "learning_rate": 0.0003618538484882247, + "loss": 3.0318, + "step": 26561 + }, + { + "epoch": 1.3, + "grad_norm": 0.5589979887008667, + "learning_rate": 0.0003618387830160184, + "loss": 2.9281, + "step": 26562 + }, + { + "epoch": 1.3, + "grad_norm": 0.5522468686103821, + "learning_rate": 0.0003618237173809406, + "loss": 2.823, + "step": 26563 + }, + { + "epoch": 1.3, + "grad_norm": 0.5559152364730835, + "learning_rate": 0.00036180865158303116, + "loss": 3.1592, + "step": 26564 + }, + { + "epoch": 1.3, + "grad_norm": 0.5371105074882507, + "learning_rate": 0.00036179358562232945, + "loss": 3.0656, + "step": 26565 + }, + { + "epoch": 1.3, + "grad_norm": 0.5746232271194458, + "learning_rate": 0.00036177851949887544, + "loss": 3.042, + "step": 26566 + }, + { + "epoch": 1.3, + "grad_norm": 0.5857375264167786, + "learning_rate": 0.00036176345321270865, + "loss": 3.1788, + "step": 26567 + }, + { + "epoch": 1.3, + "grad_norm": 0.5602015852928162, + "learning_rate": 0.00036174838676386893, + "loss": 2.8983, + "step": 26568 + }, + { + "epoch": 1.3, + "grad_norm": 0.5846152305603027, + "learning_rate": 0.00036173332015239563, + "loss": 2.7467, + "step": 26569 + }, + { + "epoch": 1.3, + "grad_norm": 0.541037917137146, + "learning_rate": 0.0003617182533783288, + "loss": 2.9718, + "step": 26570 + }, + { + "epoch": 1.3, + "grad_norm": 0.5495973825454712, + "learning_rate": 0.000361703186441708, + "loss": 3.0222, + "step": 26571 + }, + { + "epoch": 1.3, + "grad_norm": 0.5735442042350769, + "learning_rate": 0.00036168811934257275, + "loss": 3.2686, + "step": 26572 + }, + { + "epoch": 1.3, + "grad_norm": 0.5584540367126465, + "learning_rate": 0.00036167305208096293, + "loss": 2.8147, + "step": 26573 + }, + { + "epoch": 1.3, + "grad_norm": 0.5874664187431335, + "learning_rate": 0.00036165798465691816, + "loss": 3.1831, + "step": 26574 + }, + { + "epoch": 1.3, + "grad_norm": 0.5908390879631042, + "learning_rate": 0.00036164291707047817, + "loss": 2.9889, + "step": 26575 + }, + { + "epoch": 1.3, + "grad_norm": 0.5895627737045288, + "learning_rate": 0.00036162784932168254, + "loss": 2.9451, + "step": 26576 + }, + { + "epoch": 1.3, + "grad_norm": 0.5433921813964844, + "learning_rate": 0.000361612781410571, + "loss": 3.2065, + "step": 26577 + }, + { + "epoch": 1.3, + "grad_norm": 0.5549830794334412, + "learning_rate": 0.0003615977133371833, + "loss": 3.1892, + "step": 26578 + }, + { + "epoch": 1.3, + "grad_norm": 0.6039674282073975, + "learning_rate": 0.0003615826451015591, + "loss": 2.8504, + "step": 26579 + }, + { + "epoch": 1.3, + "grad_norm": 0.526862382888794, + "learning_rate": 0.0003615675767037379, + "loss": 3.1544, + "step": 26580 + }, + { + "epoch": 1.3, + "grad_norm": 0.5542131662368774, + "learning_rate": 0.0003615525081437597, + "loss": 3.1678, + "step": 26581 + }, + { + "epoch": 1.3, + "grad_norm": 0.5980327129364014, + "learning_rate": 0.0003615374394216641, + "loss": 3.1879, + "step": 26582 + }, + { + "epoch": 1.3, + "grad_norm": 0.5924147367477417, + "learning_rate": 0.00036152237053749064, + "loss": 3.0792, + "step": 26583 + }, + { + "epoch": 1.3, + "grad_norm": 0.5593456029891968, + "learning_rate": 0.00036150730149127905, + "loss": 2.8554, + "step": 26584 + }, + { + "epoch": 1.3, + "grad_norm": 0.5941464304924011, + "learning_rate": 0.0003614922322830691, + "loss": 3.0402, + "step": 26585 + }, + { + "epoch": 1.3, + "grad_norm": 0.5819092988967896, + "learning_rate": 0.0003614771629129005, + "loss": 3.1216, + "step": 26586 + }, + { + "epoch": 1.3, + "grad_norm": 0.5705400705337524, + "learning_rate": 0.0003614620933808128, + "loss": 3.1876, + "step": 26587 + }, + { + "epoch": 1.3, + "grad_norm": 0.5921162366867065, + "learning_rate": 0.00036144702368684587, + "loss": 2.9728, + "step": 26588 + }, + { + "epoch": 1.3, + "grad_norm": 0.5853647589683533, + "learning_rate": 0.0003614319538310392, + "loss": 3.2357, + "step": 26589 + }, + { + "epoch": 1.3, + "grad_norm": 0.5218216180801392, + "learning_rate": 0.0003614168838134327, + "loss": 3.1811, + "step": 26590 + }, + { + "epoch": 1.3, + "grad_norm": 0.5547449588775635, + "learning_rate": 0.00036140181363406585, + "loss": 3.2543, + "step": 26591 + }, + { + "epoch": 1.3, + "grad_norm": 0.5389057993888855, + "learning_rate": 0.0003613867432929785, + "loss": 3.062, + "step": 26592 + }, + { + "epoch": 1.3, + "grad_norm": 0.5760816931724548, + "learning_rate": 0.0003613716727902103, + "loss": 3.0541, + "step": 26593 + }, + { + "epoch": 1.3, + "grad_norm": 0.5420430302619934, + "learning_rate": 0.00036135660212580084, + "loss": 3.1731, + "step": 26594 + }, + { + "epoch": 1.3, + "grad_norm": 0.5819205045700073, + "learning_rate": 0.0003613415312997899, + "loss": 3.0585, + "step": 26595 + }, + { + "epoch": 1.3, + "grad_norm": 0.5555254817008972, + "learning_rate": 0.00036132646031221725, + "loss": 3.1077, + "step": 26596 + }, + { + "epoch": 1.3, + "grad_norm": 0.580630362033844, + "learning_rate": 0.0003613113891631225, + "loss": 2.9911, + "step": 26597 + }, + { + "epoch": 1.3, + "grad_norm": 0.5484400391578674, + "learning_rate": 0.0003612963178525453, + "loss": 3.0661, + "step": 26598 + }, + { + "epoch": 1.3, + "grad_norm": 0.5431928634643555, + "learning_rate": 0.0003612812463805254, + "loss": 3.32, + "step": 26599 + }, + { + "epoch": 1.3, + "grad_norm": 0.5618100166320801, + "learning_rate": 0.00036126617474710254, + "loss": 3.0773, + "step": 26600 + }, + { + "epoch": 1.3, + "grad_norm": 0.5543720722198486, + "learning_rate": 0.0003612511029523163, + "loss": 2.8697, + "step": 26601 + }, + { + "epoch": 1.3, + "grad_norm": 0.6032953858375549, + "learning_rate": 0.00036123603099620644, + "loss": 3.1716, + "step": 26602 + }, + { + "epoch": 1.3, + "grad_norm": 0.5478854179382324, + "learning_rate": 0.0003612209588788127, + "loss": 3.1367, + "step": 26603 + }, + { + "epoch": 1.3, + "grad_norm": 0.5998310446739197, + "learning_rate": 0.0003612058866001748, + "loss": 3.1668, + "step": 26604 + }, + { + "epoch": 1.3, + "grad_norm": 0.5461510419845581, + "learning_rate": 0.00036119081416033225, + "loss": 3.213, + "step": 26605 + }, + { + "epoch": 1.3, + "grad_norm": 0.5615707635879517, + "learning_rate": 0.0003611757415593249, + "loss": 3.0274, + "step": 26606 + }, + { + "epoch": 1.3, + "grad_norm": 0.5446403622627258, + "learning_rate": 0.0003611606687971925, + "loss": 3.1591, + "step": 26607 + }, + { + "epoch": 1.3, + "grad_norm": 0.5432937145233154, + "learning_rate": 0.0003611455958739746, + "loss": 2.9783, + "step": 26608 + }, + { + "epoch": 1.3, + "grad_norm": 0.5421168208122253, + "learning_rate": 0.000361130522789711, + "loss": 3.0148, + "step": 26609 + }, + { + "epoch": 1.3, + "grad_norm": 0.5991039872169495, + "learning_rate": 0.0003611154495444413, + "loss": 2.9486, + "step": 26610 + }, + { + "epoch": 1.3, + "grad_norm": 0.5641329884529114, + "learning_rate": 0.0003611003761382054, + "loss": 2.8076, + "step": 26611 + }, + { + "epoch": 1.3, + "grad_norm": 0.5584830045700073, + "learning_rate": 0.00036108530257104274, + "loss": 2.9863, + "step": 26612 + }, + { + "epoch": 1.3, + "grad_norm": 0.591867983341217, + "learning_rate": 0.0003610702288429932, + "loss": 2.9895, + "step": 26613 + }, + { + "epoch": 1.3, + "grad_norm": 0.5379605293273926, + "learning_rate": 0.0003610551549540964, + "loss": 3.1134, + "step": 26614 + }, + { + "epoch": 1.3, + "grad_norm": 0.570203959941864, + "learning_rate": 0.00036104008090439215, + "loss": 3.0738, + "step": 26615 + }, + { + "epoch": 1.3, + "grad_norm": 0.531862735748291, + "learning_rate": 0.00036102500669392, + "loss": 2.9935, + "step": 26616 + }, + { + "epoch": 1.3, + "grad_norm": 0.5866312980651855, + "learning_rate": 0.0003610099323227197, + "loss": 3.0712, + "step": 26617 + }, + { + "epoch": 1.3, + "grad_norm": 0.6026278734207153, + "learning_rate": 0.00036099485779083105, + "loss": 3.1248, + "step": 26618 + }, + { + "epoch": 1.3, + "grad_norm": 0.5626994967460632, + "learning_rate": 0.0003609797830982937, + "loss": 3.2064, + "step": 26619 + }, + { + "epoch": 1.3, + "grad_norm": 0.5667855143547058, + "learning_rate": 0.00036096470824514734, + "loss": 3.1069, + "step": 26620 + }, + { + "epoch": 1.3, + "grad_norm": 0.5892822742462158, + "learning_rate": 0.00036094963323143154, + "loss": 3.106, + "step": 26621 + }, + { + "epoch": 1.3, + "grad_norm": 0.5754657983779907, + "learning_rate": 0.00036093455805718633, + "loss": 2.9988, + "step": 26622 + }, + { + "epoch": 1.3, + "grad_norm": 0.5949243903160095, + "learning_rate": 0.0003609194827224511, + "loss": 2.8595, + "step": 26623 + }, + { + "epoch": 1.3, + "grad_norm": 0.5688416957855225, + "learning_rate": 0.00036090440722726574, + "loss": 3.1287, + "step": 26624 + }, + { + "epoch": 1.3, + "grad_norm": 0.563495934009552, + "learning_rate": 0.0003608893315716698, + "loss": 3.0331, + "step": 26625 + }, + { + "epoch": 1.3, + "grad_norm": 0.5768406987190247, + "learning_rate": 0.0003608742557557031, + "loss": 3.1165, + "step": 26626 + }, + { + "epoch": 1.3, + "grad_norm": 0.6202779412269592, + "learning_rate": 0.00036085917977940533, + "loss": 3.0652, + "step": 26627 + }, + { + "epoch": 1.3, + "grad_norm": 0.556969940662384, + "learning_rate": 0.0003608441036428163, + "loss": 3.163, + "step": 26628 + }, + { + "epoch": 1.31, + "grad_norm": 0.5989406704902649, + "learning_rate": 0.0003608290273459755, + "loss": 2.933, + "step": 26629 + }, + { + "epoch": 1.31, + "grad_norm": 0.5388544201850891, + "learning_rate": 0.0003608139508889228, + "loss": 3.2661, + "step": 26630 + }, + { + "epoch": 1.31, + "grad_norm": 0.5508213639259338, + "learning_rate": 0.00036079887427169775, + "loss": 3.278, + "step": 26631 + }, + { + "epoch": 1.31, + "grad_norm": 0.5620518326759338, + "learning_rate": 0.00036078379749434025, + "loss": 2.8503, + "step": 26632 + }, + { + "epoch": 1.31, + "grad_norm": 0.5819762349128723, + "learning_rate": 0.00036076872055689, + "loss": 3.1962, + "step": 26633 + }, + { + "epoch": 1.31, + "grad_norm": 0.5653870701789856, + "learning_rate": 0.0003607536434593866, + "loss": 3.0897, + "step": 26634 + }, + { + "epoch": 1.31, + "grad_norm": 0.6262655258178711, + "learning_rate": 0.0003607385662018697, + "loss": 3.0569, + "step": 26635 + }, + { + "epoch": 1.31, + "grad_norm": 0.5709400773048401, + "learning_rate": 0.0003607234887843792, + "loss": 3.0048, + "step": 26636 + }, + { + "epoch": 1.31, + "grad_norm": 0.5970860719680786, + "learning_rate": 0.00036070841120695467, + "loss": 3.1965, + "step": 26637 + }, + { + "epoch": 1.31, + "grad_norm": 0.5762056112289429, + "learning_rate": 0.0003606933334696359, + "loss": 3.0631, + "step": 26638 + }, + { + "epoch": 1.31, + "grad_norm": 0.5508270263671875, + "learning_rate": 0.0003606782555724625, + "loss": 3.2962, + "step": 26639 + }, + { + "epoch": 1.31, + "grad_norm": 0.5481507778167725, + "learning_rate": 0.00036066317751547427, + "loss": 3.0597, + "step": 26640 + }, + { + "epoch": 1.31, + "grad_norm": 0.5892797708511353, + "learning_rate": 0.000360648099298711, + "loss": 3.0084, + "step": 26641 + }, + { + "epoch": 1.31, + "grad_norm": 0.5414859056472778, + "learning_rate": 0.0003606330209222122, + "loss": 3.1328, + "step": 26642 + }, + { + "epoch": 1.31, + "grad_norm": 0.5723474621772766, + "learning_rate": 0.0003606179423860178, + "loss": 3.0681, + "step": 26643 + }, + { + "epoch": 1.31, + "grad_norm": 0.5581324100494385, + "learning_rate": 0.00036060286369016736, + "loss": 3.0845, + "step": 26644 + }, + { + "epoch": 1.31, + "grad_norm": 0.5598316788673401, + "learning_rate": 0.0003605877848347006, + "loss": 3.1011, + "step": 26645 + }, + { + "epoch": 1.31, + "grad_norm": 0.55232173204422, + "learning_rate": 0.0003605727058196572, + "loss": 3.3967, + "step": 26646 + }, + { + "epoch": 1.31, + "grad_norm": 0.6095204949378967, + "learning_rate": 0.0003605576266450771, + "loss": 3.0187, + "step": 26647 + }, + { + "epoch": 1.31, + "grad_norm": 0.6039015650749207, + "learning_rate": 0.00036054254731099984, + "loss": 2.9391, + "step": 26648 + }, + { + "epoch": 1.31, + "grad_norm": 0.5620260238647461, + "learning_rate": 0.00036052746781746513, + "loss": 3.067, + "step": 26649 + }, + { + "epoch": 1.31, + "grad_norm": 0.5486672520637512, + "learning_rate": 0.0003605123881645127, + "loss": 3.137, + "step": 26650 + }, + { + "epoch": 1.31, + "grad_norm": 0.5771086812019348, + "learning_rate": 0.0003604973083521823, + "loss": 3.0855, + "step": 26651 + }, + { + "epoch": 1.31, + "grad_norm": 0.5530979037284851, + "learning_rate": 0.0003604822283805137, + "loss": 3.3067, + "step": 26652 + }, + { + "epoch": 1.31, + "grad_norm": 0.5996308922767639, + "learning_rate": 0.00036046714824954643, + "loss": 2.8549, + "step": 26653 + }, + { + "epoch": 1.31, + "grad_norm": 0.5549845099449158, + "learning_rate": 0.00036045206795932045, + "loss": 2.9425, + "step": 26654 + }, + { + "epoch": 1.31, + "grad_norm": 0.5953495502471924, + "learning_rate": 0.00036043698750987536, + "loss": 3.1898, + "step": 26655 + }, + { + "epoch": 1.31, + "grad_norm": 0.5502009987831116, + "learning_rate": 0.00036042190690125075, + "loss": 3.0692, + "step": 26656 + }, + { + "epoch": 1.31, + "grad_norm": 0.5558758974075317, + "learning_rate": 0.0003604068261334865, + "loss": 3.0813, + "step": 26657 + }, + { + "epoch": 1.31, + "grad_norm": 0.5885339975357056, + "learning_rate": 0.0003603917452066224, + "loss": 3.0579, + "step": 26658 + }, + { + "epoch": 1.31, + "grad_norm": 0.5545154809951782, + "learning_rate": 0.000360376664120698, + "loss": 3.0188, + "step": 26659 + }, + { + "epoch": 1.31, + "grad_norm": 0.6164775490760803, + "learning_rate": 0.00036036158287575305, + "loss": 3.0655, + "step": 26660 + }, + { + "epoch": 1.31, + "grad_norm": 0.6128799915313721, + "learning_rate": 0.0003603465014718273, + "loss": 3.0116, + "step": 26661 + }, + { + "epoch": 1.31, + "grad_norm": 0.5506448149681091, + "learning_rate": 0.00036033141990896055, + "loss": 3.1085, + "step": 26662 + }, + { + "epoch": 1.31, + "grad_norm": 0.5668236613273621, + "learning_rate": 0.0003603163381871924, + "loss": 3.0768, + "step": 26663 + }, + { + "epoch": 1.31, + "grad_norm": 0.5560147166252136, + "learning_rate": 0.00036030125630656266, + "loss": 3.0992, + "step": 26664 + }, + { + "epoch": 1.31, + "grad_norm": 0.5709168910980225, + "learning_rate": 0.00036028617426711097, + "loss": 3.2251, + "step": 26665 + }, + { + "epoch": 1.31, + "grad_norm": 0.5365732908248901, + "learning_rate": 0.00036027109206887716, + "loss": 2.8935, + "step": 26666 + }, + { + "epoch": 1.31, + "grad_norm": 0.559626042842865, + "learning_rate": 0.00036025600971190076, + "loss": 3.0365, + "step": 26667 + }, + { + "epoch": 1.31, + "grad_norm": 0.6036146879196167, + "learning_rate": 0.00036024092719622177, + "loss": 2.8695, + "step": 26668 + }, + { + "epoch": 1.31, + "grad_norm": 0.5651452541351318, + "learning_rate": 0.0003602258445218797, + "loss": 3.1308, + "step": 26669 + }, + { + "epoch": 1.31, + "grad_norm": 0.5566524267196655, + "learning_rate": 0.0003602107616889145, + "loss": 2.8533, + "step": 26670 + }, + { + "epoch": 1.31, + "grad_norm": 0.5793358683586121, + "learning_rate": 0.00036019567869736556, + "loss": 3.0315, + "step": 26671 + }, + { + "epoch": 1.31, + "grad_norm": 0.5406820774078369, + "learning_rate": 0.0003601805955472728, + "loss": 3.1241, + "step": 26672 + }, + { + "epoch": 1.31, + "grad_norm": 0.590872585773468, + "learning_rate": 0.000360165512238676, + "loss": 3.0732, + "step": 26673 + }, + { + "epoch": 1.31, + "grad_norm": 0.6023873090744019, + "learning_rate": 0.00036015042877161484, + "loss": 3.323, + "step": 26674 + }, + { + "epoch": 1.31, + "grad_norm": 0.575550377368927, + "learning_rate": 0.00036013534514612894, + "loss": 3.0127, + "step": 26675 + }, + { + "epoch": 1.31, + "grad_norm": 0.583389163017273, + "learning_rate": 0.00036012026136225813, + "loss": 3.1352, + "step": 26676 + }, + { + "epoch": 1.31, + "grad_norm": 0.5580568909645081, + "learning_rate": 0.0003601051774200422, + "loss": 3.1552, + "step": 26677 + }, + { + "epoch": 1.31, + "grad_norm": 0.5758119225502014, + "learning_rate": 0.0003600900933195207, + "loss": 2.7847, + "step": 26678 + }, + { + "epoch": 1.31, + "grad_norm": 0.5357880592346191, + "learning_rate": 0.00036007500906073346, + "loss": 3.0774, + "step": 26679 + }, + { + "epoch": 1.31, + "grad_norm": 0.5659300684928894, + "learning_rate": 0.0003600599246437203, + "loss": 3.1311, + "step": 26680 + }, + { + "epoch": 1.31, + "grad_norm": 0.5540302395820618, + "learning_rate": 0.00036004484006852083, + "loss": 3.2338, + "step": 26681 + }, + { + "epoch": 1.31, + "grad_norm": 0.5465559363365173, + "learning_rate": 0.00036002975533517477, + "loss": 3.0121, + "step": 26682 + }, + { + "epoch": 1.31, + "grad_norm": 0.6347934007644653, + "learning_rate": 0.00036001467044372183, + "loss": 3.2469, + "step": 26683 + }, + { + "epoch": 1.31, + "grad_norm": 0.6012324690818787, + "learning_rate": 0.000359999585394202, + "loss": 2.9777, + "step": 26684 + }, + { + "epoch": 1.31, + "grad_norm": 0.5452451705932617, + "learning_rate": 0.00035998450018665467, + "loss": 3.0931, + "step": 26685 + }, + { + "epoch": 1.31, + "grad_norm": 0.6085728406906128, + "learning_rate": 0.0003599694148211197, + "loss": 3.0797, + "step": 26686 + }, + { + "epoch": 1.31, + "grad_norm": 0.5629120469093323, + "learning_rate": 0.0003599543292976369, + "loss": 3.1281, + "step": 26687 + }, + { + "epoch": 1.31, + "grad_norm": 0.5537635087966919, + "learning_rate": 0.00035993924361624587, + "loss": 3.176, + "step": 26688 + }, + { + "epoch": 1.31, + "grad_norm": 0.5594296455383301, + "learning_rate": 0.0003599241577769864, + "loss": 3.1268, + "step": 26689 + }, + { + "epoch": 1.31, + "grad_norm": 0.5878404974937439, + "learning_rate": 0.00035990907177989827, + "loss": 2.9992, + "step": 26690 + }, + { + "epoch": 1.31, + "grad_norm": 0.5586697459220886, + "learning_rate": 0.0003598939856250212, + "loss": 3.011, + "step": 26691 + }, + { + "epoch": 1.31, + "grad_norm": 0.5551723837852478, + "learning_rate": 0.00035987889931239484, + "loss": 2.9541, + "step": 26692 + }, + { + "epoch": 1.31, + "grad_norm": 0.5686328411102295, + "learning_rate": 0.000359863812842059, + "loss": 3.3019, + "step": 26693 + }, + { + "epoch": 1.31, + "grad_norm": 0.5753780603408813, + "learning_rate": 0.00035984872621405337, + "loss": 3.2523, + "step": 26694 + }, + { + "epoch": 1.31, + "grad_norm": 0.5661664009094238, + "learning_rate": 0.0003598336394284178, + "loss": 3.0049, + "step": 26695 + }, + { + "epoch": 1.31, + "grad_norm": 0.5784562826156616, + "learning_rate": 0.0003598185524851919, + "loss": 2.8481, + "step": 26696 + }, + { + "epoch": 1.31, + "grad_norm": 0.6021720767021179, + "learning_rate": 0.0003598034653844154, + "loss": 3.2881, + "step": 26697 + }, + { + "epoch": 1.31, + "grad_norm": 0.5414631366729736, + "learning_rate": 0.0003597883781261281, + "loss": 3.1237, + "step": 26698 + }, + { + "epoch": 1.31, + "grad_norm": 0.5559752583503723, + "learning_rate": 0.00035977329071036976, + "loss": 3.177, + "step": 26699 + }, + { + "epoch": 1.31, + "grad_norm": 0.6183595061302185, + "learning_rate": 0.00035975820313718004, + "loss": 2.9357, + "step": 26700 + }, + { + "epoch": 1.31, + "grad_norm": 0.5673224925994873, + "learning_rate": 0.00035974311540659876, + "loss": 3.0607, + "step": 26701 + }, + { + "epoch": 1.31, + "grad_norm": 0.6092291474342346, + "learning_rate": 0.00035972802751866554, + "loss": 2.9099, + "step": 26702 + }, + { + "epoch": 1.31, + "grad_norm": 0.541416347026825, + "learning_rate": 0.0003597129394734202, + "loss": 3.0918, + "step": 26703 + }, + { + "epoch": 1.31, + "grad_norm": 0.6167910695075989, + "learning_rate": 0.0003596978512709025, + "loss": 2.8992, + "step": 26704 + }, + { + "epoch": 1.31, + "grad_norm": 0.6436028480529785, + "learning_rate": 0.0003596827629111521, + "loss": 2.8388, + "step": 26705 + }, + { + "epoch": 1.31, + "grad_norm": 0.5907847285270691, + "learning_rate": 0.0003596676743942089, + "loss": 3.1507, + "step": 26706 + }, + { + "epoch": 1.31, + "grad_norm": 0.5741157531738281, + "learning_rate": 0.00035965258572011246, + "loss": 3.16, + "step": 26707 + }, + { + "epoch": 1.31, + "grad_norm": 0.545328676700592, + "learning_rate": 0.00035963749688890253, + "loss": 3.1713, + "step": 26708 + }, + { + "epoch": 1.31, + "grad_norm": 0.5556525588035583, + "learning_rate": 0.000359622407900619, + "loss": 3.1528, + "step": 26709 + }, + { + "epoch": 1.31, + "grad_norm": 0.5546360015869141, + "learning_rate": 0.0003596073187553015, + "loss": 2.8478, + "step": 26710 + }, + { + "epoch": 1.31, + "grad_norm": 0.6153631806373596, + "learning_rate": 0.0003595922294529898, + "loss": 3.0284, + "step": 26711 + }, + { + "epoch": 1.31, + "grad_norm": 0.5715863704681396, + "learning_rate": 0.0003595771399937236, + "loss": 3.2872, + "step": 26712 + }, + { + "epoch": 1.31, + "grad_norm": 0.5800357460975647, + "learning_rate": 0.00035956205037754275, + "loss": 3.2561, + "step": 26713 + }, + { + "epoch": 1.31, + "grad_norm": 0.557467520236969, + "learning_rate": 0.0003595469606044869, + "loss": 3.136, + "step": 26714 + }, + { + "epoch": 1.31, + "grad_norm": 0.5892956852912903, + "learning_rate": 0.00035953187067459575, + "loss": 3.1928, + "step": 26715 + }, + { + "epoch": 1.31, + "grad_norm": 0.5511610507965088, + "learning_rate": 0.00035951678058790913, + "loss": 2.9516, + "step": 26716 + }, + { + "epoch": 1.31, + "grad_norm": 0.5597865581512451, + "learning_rate": 0.0003595016903444669, + "loss": 3.0769, + "step": 26717 + }, + { + "epoch": 1.31, + "grad_norm": 0.5834679007530212, + "learning_rate": 0.00035948659994430843, + "loss": 3.0175, + "step": 26718 + }, + { + "epoch": 1.31, + "grad_norm": 0.5704622268676758, + "learning_rate": 0.0003594715093874738, + "loss": 3.0997, + "step": 26719 + }, + { + "epoch": 1.31, + "grad_norm": 0.5578005909919739, + "learning_rate": 0.0003594564186740027, + "loss": 3.0305, + "step": 26720 + }, + { + "epoch": 1.31, + "grad_norm": 0.6064382195472717, + "learning_rate": 0.0003594413278039349, + "loss": 3.0124, + "step": 26721 + }, + { + "epoch": 1.31, + "grad_norm": 0.5383819341659546, + "learning_rate": 0.00035942623677731004, + "loss": 3.1193, + "step": 26722 + }, + { + "epoch": 1.31, + "grad_norm": 0.5984358191490173, + "learning_rate": 0.00035941114559416776, + "loss": 3.0482, + "step": 26723 + }, + { + "epoch": 1.31, + "grad_norm": 0.5805661082267761, + "learning_rate": 0.0003593960542545481, + "loss": 2.9751, + "step": 26724 + }, + { + "epoch": 1.31, + "grad_norm": 0.5724626779556274, + "learning_rate": 0.0003593809627584906, + "loss": 3.0252, + "step": 26725 + }, + { + "epoch": 1.31, + "grad_norm": 0.545838475227356, + "learning_rate": 0.0003593658711060352, + "loss": 3.1077, + "step": 26726 + }, + { + "epoch": 1.31, + "grad_norm": 0.5773608088493347, + "learning_rate": 0.0003593507792972214, + "loss": 3.4112, + "step": 26727 + }, + { + "epoch": 1.31, + "grad_norm": 0.5865770578384399, + "learning_rate": 0.0003593356873320891, + "loss": 3.1387, + "step": 26728 + }, + { + "epoch": 1.31, + "grad_norm": 0.5342745184898376, + "learning_rate": 0.000359320595210678, + "loss": 2.9596, + "step": 26729 + }, + { + "epoch": 1.31, + "grad_norm": 0.567023754119873, + "learning_rate": 0.0003593055029330279, + "loss": 3.2155, + "step": 26730 + }, + { + "epoch": 1.31, + "grad_norm": 0.5480973124504089, + "learning_rate": 0.0003592904104991785, + "loss": 3.1711, + "step": 26731 + }, + { + "epoch": 1.31, + "grad_norm": 0.5474660396575928, + "learning_rate": 0.0003592753179091696, + "loss": 2.9138, + "step": 26732 + }, + { + "epoch": 1.31, + "grad_norm": 0.5918500423431396, + "learning_rate": 0.00035926022516304085, + "loss": 3.0106, + "step": 26733 + }, + { + "epoch": 1.31, + "grad_norm": 0.5381353497505188, + "learning_rate": 0.00035924513226083207, + "loss": 2.9064, + "step": 26734 + }, + { + "epoch": 1.31, + "grad_norm": 0.5770012140274048, + "learning_rate": 0.00035923003920258306, + "loss": 2.8773, + "step": 26735 + }, + { + "epoch": 1.31, + "grad_norm": 0.6201854348182678, + "learning_rate": 0.0003592149459883336, + "loss": 3.1339, + "step": 26736 + }, + { + "epoch": 1.31, + "grad_norm": 0.559162437915802, + "learning_rate": 0.0003591998526181233, + "loss": 3.0548, + "step": 26737 + }, + { + "epoch": 1.31, + "grad_norm": 0.5843203067779541, + "learning_rate": 0.0003591847590919919, + "loss": 3.2486, + "step": 26738 + }, + { + "epoch": 1.31, + "grad_norm": 0.6074045896530151, + "learning_rate": 0.00035916966540997934, + "loss": 3.2939, + "step": 26739 + }, + { + "epoch": 1.31, + "grad_norm": 0.605433464050293, + "learning_rate": 0.0003591545715721252, + "loss": 3.2019, + "step": 26740 + }, + { + "epoch": 1.31, + "grad_norm": 0.6092764735221863, + "learning_rate": 0.00035913947757846936, + "loss": 3.0387, + "step": 26741 + }, + { + "epoch": 1.31, + "grad_norm": 0.5730652213096619, + "learning_rate": 0.00035912438342905153, + "loss": 3.214, + "step": 26742 + }, + { + "epoch": 1.31, + "grad_norm": 0.5732026696205139, + "learning_rate": 0.0003591092891239115, + "loss": 2.8984, + "step": 26743 + }, + { + "epoch": 1.31, + "grad_norm": 0.5797938704490662, + "learning_rate": 0.0003590941946630888, + "loss": 2.9541, + "step": 26744 + }, + { + "epoch": 1.31, + "grad_norm": 0.555505096912384, + "learning_rate": 0.00035907910004662343, + "loss": 2.9514, + "step": 26745 + }, + { + "epoch": 1.31, + "grad_norm": 0.6078484058380127, + "learning_rate": 0.00035906400527455524, + "loss": 3.2675, + "step": 26746 + }, + { + "epoch": 1.31, + "grad_norm": 0.5532565116882324, + "learning_rate": 0.0003590489103469236, + "loss": 3.0698, + "step": 26747 + }, + { + "epoch": 1.31, + "grad_norm": 0.5619004964828491, + "learning_rate": 0.00035903381526376863, + "loss": 2.9572, + "step": 26748 + }, + { + "epoch": 1.31, + "grad_norm": 0.5572454929351807, + "learning_rate": 0.0003590187200251299, + "loss": 3.2266, + "step": 26749 + }, + { + "epoch": 1.31, + "grad_norm": 0.5924436450004578, + "learning_rate": 0.00035900362463104724, + "loss": 2.8406, + "step": 26750 + }, + { + "epoch": 1.31, + "grad_norm": 0.6133246421813965, + "learning_rate": 0.00035898852908156044, + "loss": 2.9873, + "step": 26751 + }, + { + "epoch": 1.31, + "grad_norm": 0.5816695094108582, + "learning_rate": 0.00035897343337670915, + "loss": 2.9221, + "step": 26752 + }, + { + "epoch": 1.31, + "grad_norm": 0.5877933502197266, + "learning_rate": 0.0003589583375165331, + "loss": 3.0776, + "step": 26753 + }, + { + "epoch": 1.31, + "grad_norm": 0.5489791631698608, + "learning_rate": 0.00035894324150107225, + "loss": 2.8286, + "step": 26754 + }, + { + "epoch": 1.31, + "grad_norm": 0.5626378059387207, + "learning_rate": 0.0003589281453303662, + "loss": 3.1682, + "step": 26755 + }, + { + "epoch": 1.31, + "grad_norm": 0.5878295302391052, + "learning_rate": 0.0003589130490044548, + "loss": 3.0382, + "step": 26756 + }, + { + "epoch": 1.31, + "grad_norm": 0.5876728892326355, + "learning_rate": 0.00035889795252337783, + "loss": 2.8998, + "step": 26757 + }, + { + "epoch": 1.31, + "grad_norm": 0.5746318101882935, + "learning_rate": 0.00035888285588717486, + "loss": 2.9987, + "step": 26758 + }, + { + "epoch": 1.31, + "grad_norm": 0.5840819478034973, + "learning_rate": 0.0003588677590958857, + "loss": 3.0352, + "step": 26759 + }, + { + "epoch": 1.31, + "grad_norm": 0.5611565709114075, + "learning_rate": 0.0003588526621495504, + "loss": 3.0672, + "step": 26760 + }, + { + "epoch": 1.31, + "grad_norm": 0.595608115196228, + "learning_rate": 0.0003588375650482085, + "loss": 2.868, + "step": 26761 + }, + { + "epoch": 1.31, + "grad_norm": 0.5854583978652954, + "learning_rate": 0.00035882246779189963, + "loss": 3.467, + "step": 26762 + }, + { + "epoch": 1.31, + "grad_norm": 0.5326132774353027, + "learning_rate": 0.0003588073703806638, + "loss": 3.0428, + "step": 26763 + }, + { + "epoch": 1.31, + "grad_norm": 0.5872798562049866, + "learning_rate": 0.0003587922728145407, + "loss": 2.8781, + "step": 26764 + }, + { + "epoch": 1.31, + "grad_norm": 0.5464463829994202, + "learning_rate": 0.00035877717509356997, + "loss": 3.3402, + "step": 26765 + }, + { + "epoch": 1.31, + "grad_norm": 0.6281166672706604, + "learning_rate": 0.0003587620772177915, + "loss": 3.1979, + "step": 26766 + }, + { + "epoch": 1.31, + "grad_norm": 0.5617860555648804, + "learning_rate": 0.000358746979187245, + "loss": 3.0359, + "step": 26767 + }, + { + "epoch": 1.31, + "grad_norm": 0.6079573035240173, + "learning_rate": 0.0003587318810019704, + "loss": 3.0718, + "step": 26768 + }, + { + "epoch": 1.31, + "grad_norm": 0.5587426424026489, + "learning_rate": 0.0003587167826620073, + "loss": 2.9656, + "step": 26769 + }, + { + "epoch": 1.31, + "grad_norm": 0.5827581882476807, + "learning_rate": 0.0003587016841673953, + "loss": 3.1422, + "step": 26770 + }, + { + "epoch": 1.31, + "grad_norm": 0.6292572021484375, + "learning_rate": 0.00035868658551817457, + "loss": 3.1551, + "step": 26771 + }, + { + "epoch": 1.31, + "grad_norm": 0.6260858774185181, + "learning_rate": 0.0003586714867143846, + "loss": 3.1601, + "step": 26772 + }, + { + "epoch": 1.31, + "grad_norm": 0.5667866468429565, + "learning_rate": 0.0003586563877560652, + "loss": 3.1435, + "step": 26773 + }, + { + "epoch": 1.31, + "grad_norm": 0.6008673310279846, + "learning_rate": 0.0003586412886432562, + "loss": 2.9808, + "step": 26774 + }, + { + "epoch": 1.31, + "grad_norm": 0.5359012484550476, + "learning_rate": 0.0003586261893759973, + "loss": 3.068, + "step": 26775 + }, + { + "epoch": 1.31, + "grad_norm": 0.5466083288192749, + "learning_rate": 0.00035861108995432833, + "loss": 3.0155, + "step": 26776 + }, + { + "epoch": 1.31, + "grad_norm": 0.5517134666442871, + "learning_rate": 0.000358595990378289, + "loss": 2.9651, + "step": 26777 + }, + { + "epoch": 1.31, + "grad_norm": 0.55260169506073, + "learning_rate": 0.00035858089064791913, + "loss": 2.9182, + "step": 26778 + }, + { + "epoch": 1.31, + "grad_norm": 0.5760247111320496, + "learning_rate": 0.0003585657907632585, + "loss": 2.8463, + "step": 26779 + }, + { + "epoch": 1.31, + "grad_norm": 0.596767246723175, + "learning_rate": 0.0003585506907243467, + "loss": 3.3759, + "step": 26780 + }, + { + "epoch": 1.31, + "grad_norm": 0.5780153274536133, + "learning_rate": 0.0003585355905312238, + "loss": 3.0084, + "step": 26781 + }, + { + "epoch": 1.31, + "grad_norm": 0.5638839602470398, + "learning_rate": 0.00035852049018392934, + "loss": 3.1286, + "step": 26782 + }, + { + "epoch": 1.31, + "grad_norm": 0.5510829091072083, + "learning_rate": 0.0003585053896825033, + "loss": 2.9903, + "step": 26783 + }, + { + "epoch": 1.31, + "grad_norm": 0.6048457026481628, + "learning_rate": 0.0003584902890269852, + "loss": 3.044, + "step": 26784 + }, + { + "epoch": 1.31, + "grad_norm": 0.5909347534179688, + "learning_rate": 0.0003584751882174149, + "loss": 3.0262, + "step": 26785 + }, + { + "epoch": 1.31, + "grad_norm": 0.5883007645606995, + "learning_rate": 0.00035846008725383237, + "loss": 3.2037, + "step": 26786 + }, + { + "epoch": 1.31, + "grad_norm": 0.6115594506263733, + "learning_rate": 0.0003584449861362771, + "loss": 3.1592, + "step": 26787 + }, + { + "epoch": 1.31, + "grad_norm": 0.5681421756744385, + "learning_rate": 0.00035842988486478903, + "loss": 2.844, + "step": 26788 + }, + { + "epoch": 1.31, + "grad_norm": 0.6011251211166382, + "learning_rate": 0.00035841478343940784, + "loss": 3.0141, + "step": 26789 + }, + { + "epoch": 1.31, + "grad_norm": 0.5426762700080872, + "learning_rate": 0.0003583996818601734, + "loss": 3.2084, + "step": 26790 + }, + { + "epoch": 1.31, + "grad_norm": 0.6285281181335449, + "learning_rate": 0.0003583845801271254, + "loss": 3.0548, + "step": 26791 + }, + { + "epoch": 1.31, + "grad_norm": 0.6543646454811096, + "learning_rate": 0.0003583694782403036, + "loss": 2.9277, + "step": 26792 + }, + { + "epoch": 1.31, + "grad_norm": 0.5835058093070984, + "learning_rate": 0.0003583543761997479, + "loss": 2.9471, + "step": 26793 + }, + { + "epoch": 1.31, + "grad_norm": 0.5639581084251404, + "learning_rate": 0.0003583392740054981, + "loss": 2.9748, + "step": 26794 + }, + { + "epoch": 1.31, + "grad_norm": 0.5835383534431458, + "learning_rate": 0.0003583241716575937, + "loss": 3.1496, + "step": 26795 + }, + { + "epoch": 1.31, + "grad_norm": 0.5585667490959167, + "learning_rate": 0.00035830906915607476, + "loss": 2.8306, + "step": 26796 + }, + { + "epoch": 1.31, + "grad_norm": 0.6079902052879333, + "learning_rate": 0.000358293966500981, + "loss": 3.0118, + "step": 26797 + }, + { + "epoch": 1.31, + "grad_norm": 0.6015511155128479, + "learning_rate": 0.00035827886369235203, + "loss": 3.0294, + "step": 26798 + }, + { + "epoch": 1.31, + "grad_norm": 0.5599113702774048, + "learning_rate": 0.00035826376073022783, + "loss": 3.0174, + "step": 26799 + }, + { + "epoch": 1.31, + "grad_norm": 0.5897526741027832, + "learning_rate": 0.0003582486576146481, + "loss": 3.1255, + "step": 26800 + }, + { + "epoch": 1.31, + "grad_norm": 0.570527195930481, + "learning_rate": 0.00035823355434565256, + "loss": 2.9353, + "step": 26801 + }, + { + "epoch": 1.31, + "grad_norm": 0.5881776809692383, + "learning_rate": 0.0003582184509232811, + "loss": 3.1936, + "step": 26802 + }, + { + "epoch": 1.31, + "grad_norm": 0.5746738314628601, + "learning_rate": 0.00035820334734757336, + "loss": 2.9701, + "step": 26803 + }, + { + "epoch": 1.31, + "grad_norm": 0.5680601000785828, + "learning_rate": 0.0003581882436185693, + "loss": 3.3507, + "step": 26804 + }, + { + "epoch": 1.31, + "grad_norm": 0.5403523445129395, + "learning_rate": 0.0003581731397363085, + "loss": 3.082, + "step": 26805 + }, + { + "epoch": 1.31, + "grad_norm": 0.5722833275794983, + "learning_rate": 0.0003581580357008309, + "loss": 3.041, + "step": 26806 + }, + { + "epoch": 1.31, + "grad_norm": 0.5738394260406494, + "learning_rate": 0.00035814293151217625, + "loss": 2.9287, + "step": 26807 + }, + { + "epoch": 1.31, + "grad_norm": 0.6539166569709778, + "learning_rate": 0.00035812782717038435, + "loss": 3.1007, + "step": 26808 + }, + { + "epoch": 1.31, + "grad_norm": 0.5289335250854492, + "learning_rate": 0.0003581127226754949, + "loss": 2.8963, + "step": 26809 + }, + { + "epoch": 1.31, + "grad_norm": 0.5261499285697937, + "learning_rate": 0.0003580976180275477, + "loss": 3.2443, + "step": 26810 + }, + { + "epoch": 1.31, + "grad_norm": 0.5962910652160645, + "learning_rate": 0.00035808251322658256, + "loss": 2.9977, + "step": 26811 + }, + { + "epoch": 1.31, + "grad_norm": 0.6628442406654358, + "learning_rate": 0.0003580674082726392, + "loss": 3.1296, + "step": 26812 + }, + { + "epoch": 1.31, + "grad_norm": 0.5591052770614624, + "learning_rate": 0.00035805230316575756, + "loss": 3.0394, + "step": 26813 + }, + { + "epoch": 1.31, + "grad_norm": 0.5713579058647156, + "learning_rate": 0.00035803719790597727, + "loss": 3.0677, + "step": 26814 + }, + { + "epoch": 1.31, + "grad_norm": 0.5537461042404175, + "learning_rate": 0.0003580220924933382, + "loss": 2.9197, + "step": 26815 + }, + { + "epoch": 1.31, + "grad_norm": 0.5648530721664429, + "learning_rate": 0.0003580069869278801, + "loss": 3.0549, + "step": 26816 + }, + { + "epoch": 1.31, + "grad_norm": 0.595908522605896, + "learning_rate": 0.0003579918812096428, + "loss": 2.9516, + "step": 26817 + }, + { + "epoch": 1.31, + "grad_norm": 0.5620924830436707, + "learning_rate": 0.00035797677533866597, + "loss": 2.9171, + "step": 26818 + }, + { + "epoch": 1.31, + "grad_norm": 0.5861213803291321, + "learning_rate": 0.00035796166931498956, + "loss": 2.938, + "step": 26819 + }, + { + "epoch": 1.31, + "grad_norm": 0.6200598478317261, + "learning_rate": 0.0003579465631386532, + "loss": 3.1466, + "step": 26820 + }, + { + "epoch": 1.31, + "grad_norm": 0.5952265858650208, + "learning_rate": 0.00035793145680969665, + "loss": 3.194, + "step": 26821 + }, + { + "epoch": 1.31, + "grad_norm": 0.5975573062896729, + "learning_rate": 0.00035791635032816, + "loss": 3.1285, + "step": 26822 + }, + { + "epoch": 1.31, + "grad_norm": 0.5594826340675354, + "learning_rate": 0.0003579012436940827, + "loss": 3.0251, + "step": 26823 + }, + { + "epoch": 1.31, + "grad_norm": 0.5513445734977722, + "learning_rate": 0.00035788613690750467, + "loss": 3.175, + "step": 26824 + }, + { + "epoch": 1.31, + "grad_norm": 0.5082058310508728, + "learning_rate": 0.0003578710299684657, + "loss": 3.244, + "step": 26825 + }, + { + "epoch": 1.31, + "grad_norm": 0.560768723487854, + "learning_rate": 0.00035785592287700563, + "loss": 3.1634, + "step": 26826 + }, + { + "epoch": 1.31, + "grad_norm": 0.6477553844451904, + "learning_rate": 0.00035784081563316415, + "loss": 2.7827, + "step": 26827 + }, + { + "epoch": 1.31, + "grad_norm": 0.5783835053443909, + "learning_rate": 0.0003578257082369811, + "loss": 3.0997, + "step": 26828 + }, + { + "epoch": 1.31, + "grad_norm": 0.6416663527488708, + "learning_rate": 0.0003578106006884962, + "loss": 3.2326, + "step": 26829 + }, + { + "epoch": 1.31, + "grad_norm": 0.5797775983810425, + "learning_rate": 0.00035779549298774947, + "loss": 3.1772, + "step": 26830 + }, + { + "epoch": 1.31, + "grad_norm": 0.5553704500198364, + "learning_rate": 0.00035778038513478033, + "loss": 2.9769, + "step": 26831 + }, + { + "epoch": 1.31, + "grad_norm": 0.5943940281867981, + "learning_rate": 0.0003577652771296289, + "loss": 3.226, + "step": 26832 + }, + { + "epoch": 1.32, + "grad_norm": 0.5958367586135864, + "learning_rate": 0.00035775016897233487, + "loss": 3.1038, + "step": 26833 + }, + { + "epoch": 1.32, + "grad_norm": 0.5723438858985901, + "learning_rate": 0.000357735060662938, + "loss": 3.147, + "step": 26834 + }, + { + "epoch": 1.32, + "grad_norm": 0.5667865872383118, + "learning_rate": 0.0003577199522014781, + "loss": 3.0796, + "step": 26835 + }, + { + "epoch": 1.32, + "grad_norm": 0.585544228553772, + "learning_rate": 0.00035770484358799474, + "loss": 3.0049, + "step": 26836 + }, + { + "epoch": 1.32, + "grad_norm": 0.586850106716156, + "learning_rate": 0.00035768973482252814, + "loss": 3.0851, + "step": 26837 + }, + { + "epoch": 1.32, + "grad_norm": 0.6466086506843567, + "learning_rate": 0.00035767462590511793, + "loss": 2.9241, + "step": 26838 + }, + { + "epoch": 1.32, + "grad_norm": 0.5506917834281921, + "learning_rate": 0.00035765951683580367, + "loss": 3.1222, + "step": 26839 + }, + { + "epoch": 1.32, + "grad_norm": 0.545690655708313, + "learning_rate": 0.0003576444076146255, + "loss": 3.0191, + "step": 26840 + }, + { + "epoch": 1.32, + "grad_norm": 0.6054341793060303, + "learning_rate": 0.00035762929824162295, + "loss": 2.9853, + "step": 26841 + }, + { + "epoch": 1.32, + "grad_norm": 0.5546567440032959, + "learning_rate": 0.00035761418871683596, + "loss": 2.9322, + "step": 26842 + }, + { + "epoch": 1.32, + "grad_norm": 0.5472365617752075, + "learning_rate": 0.00035759907904030425, + "loss": 3.009, + "step": 26843 + }, + { + "epoch": 1.32, + "grad_norm": 0.5797613263130188, + "learning_rate": 0.0003575839692120677, + "loss": 3.1111, + "step": 26844 + }, + { + "epoch": 1.32, + "grad_norm": 0.5437989234924316, + "learning_rate": 0.0003575688592321661, + "loss": 3.1502, + "step": 26845 + }, + { + "epoch": 1.32, + "grad_norm": 0.568385124206543, + "learning_rate": 0.00035755374910063914, + "loss": 2.8418, + "step": 26846 + }, + { + "epoch": 1.32, + "grad_norm": 0.5577401518821716, + "learning_rate": 0.0003575386388175266, + "loss": 2.9497, + "step": 26847 + }, + { + "epoch": 1.32, + "grad_norm": 0.5725609064102173, + "learning_rate": 0.0003575235283828685, + "loss": 3.0207, + "step": 26848 + }, + { + "epoch": 1.32, + "grad_norm": 0.6471617817878723, + "learning_rate": 0.00035750841779670444, + "loss": 2.9642, + "step": 26849 + }, + { + "epoch": 1.32, + "grad_norm": 0.5361818075180054, + "learning_rate": 0.0003574933070590743, + "loss": 2.933, + "step": 26850 + }, + { + "epoch": 1.32, + "grad_norm": 0.5480607748031616, + "learning_rate": 0.00035747819617001783, + "loss": 3.1263, + "step": 26851 + }, + { + "epoch": 1.32, + "grad_norm": 0.5243863463401794, + "learning_rate": 0.00035746308512957486, + "loss": 3.0354, + "step": 26852 + }, + { + "epoch": 1.32, + "grad_norm": 0.5858286023139954, + "learning_rate": 0.0003574479739377852, + "loss": 3.1605, + "step": 26853 + }, + { + "epoch": 1.32, + "grad_norm": 0.5662160515785217, + "learning_rate": 0.0003574328625946885, + "loss": 3.2539, + "step": 26854 + }, + { + "epoch": 1.32, + "grad_norm": 0.5930647253990173, + "learning_rate": 0.0003574177511003248, + "loss": 2.7858, + "step": 26855 + }, + { + "epoch": 1.32, + "grad_norm": 0.5709060430526733, + "learning_rate": 0.0003574026394547339, + "loss": 3.0939, + "step": 26856 + }, + { + "epoch": 1.32, + "grad_norm": 0.6525852680206299, + "learning_rate": 0.00035738752765795533, + "loss": 3.0615, + "step": 26857 + }, + { + "epoch": 1.32, + "grad_norm": 0.5705323815345764, + "learning_rate": 0.0003573724157100291, + "loss": 2.8539, + "step": 26858 + }, + { + "epoch": 1.32, + "grad_norm": 0.5645226836204529, + "learning_rate": 0.00035735730361099506, + "loss": 3.1802, + "step": 26859 + }, + { + "epoch": 1.32, + "grad_norm": 0.5563384294509888, + "learning_rate": 0.00035734219136089287, + "loss": 3.1764, + "step": 26860 + }, + { + "epoch": 1.32, + "grad_norm": 0.5454398989677429, + "learning_rate": 0.0003573270789597623, + "loss": 3.2813, + "step": 26861 + }, + { + "epoch": 1.32, + "grad_norm": 0.5893054008483887, + "learning_rate": 0.0003573119664076433, + "loss": 2.9479, + "step": 26862 + }, + { + "epoch": 1.32, + "grad_norm": 0.6017407178878784, + "learning_rate": 0.00035729685370457557, + "loss": 2.981, + "step": 26863 + }, + { + "epoch": 1.32, + "grad_norm": 0.5848926901817322, + "learning_rate": 0.00035728174085059906, + "loss": 2.9967, + "step": 26864 + }, + { + "epoch": 1.32, + "grad_norm": 0.5659344792366028, + "learning_rate": 0.0003572666278457533, + "loss": 3.0336, + "step": 26865 + }, + { + "epoch": 1.32, + "grad_norm": 0.5689499378204346, + "learning_rate": 0.0003572515146900784, + "loss": 3.1079, + "step": 26866 + }, + { + "epoch": 1.32, + "grad_norm": 0.5527841448783875, + "learning_rate": 0.00035723640138361404, + "loss": 2.9584, + "step": 26867 + }, + { + "epoch": 1.32, + "grad_norm": 0.5790805220603943, + "learning_rate": 0.0003572212879263999, + "loss": 3.1476, + "step": 26868 + }, + { + "epoch": 1.32, + "grad_norm": 0.5489739775657654, + "learning_rate": 0.00035720617431847604, + "loss": 3.0897, + "step": 26869 + }, + { + "epoch": 1.32, + "grad_norm": 0.5729896426200867, + "learning_rate": 0.00035719106055988206, + "loss": 3.0103, + "step": 26870 + }, + { + "epoch": 1.32, + "grad_norm": 0.6073732376098633, + "learning_rate": 0.0003571759466506578, + "loss": 2.9835, + "step": 26871 + }, + { + "epoch": 1.32, + "grad_norm": 0.63726407289505, + "learning_rate": 0.0003571608325908431, + "loss": 2.8864, + "step": 26872 + }, + { + "epoch": 1.32, + "grad_norm": 0.5688294172286987, + "learning_rate": 0.0003571457183804779, + "loss": 2.7645, + "step": 26873 + }, + { + "epoch": 1.32, + "grad_norm": 0.5426622033119202, + "learning_rate": 0.00035713060401960174, + "loss": 3.0762, + "step": 26874 + }, + { + "epoch": 1.32, + "grad_norm": 0.57330322265625, + "learning_rate": 0.0003571154895082546, + "loss": 3.0234, + "step": 26875 + }, + { + "epoch": 1.32, + "grad_norm": 0.5708470940589905, + "learning_rate": 0.0003571003748464762, + "loss": 2.8983, + "step": 26876 + }, + { + "epoch": 1.32, + "grad_norm": 0.5946699976921082, + "learning_rate": 0.00035708526003430647, + "loss": 3.0277, + "step": 26877 + }, + { + "epoch": 1.32, + "grad_norm": 0.5799338817596436, + "learning_rate": 0.0003570701450717851, + "loss": 3.0412, + "step": 26878 + }, + { + "epoch": 1.32, + "grad_norm": 0.558404803276062, + "learning_rate": 0.000357055029958952, + "loss": 3.0555, + "step": 26879 + }, + { + "epoch": 1.32, + "grad_norm": 0.7721039056777954, + "learning_rate": 0.00035703991469584686, + "loss": 3.0843, + "step": 26880 + }, + { + "epoch": 1.32, + "grad_norm": 0.5947189927101135, + "learning_rate": 0.0003570247992825097, + "loss": 3.2052, + "step": 26881 + }, + { + "epoch": 1.32, + "grad_norm": 0.5546451807022095, + "learning_rate": 0.00035700968371898007, + "loss": 3.1527, + "step": 26882 + }, + { + "epoch": 1.32, + "grad_norm": 0.5192341208457947, + "learning_rate": 0.0003569945680052979, + "loss": 3.0922, + "step": 26883 + }, + { + "epoch": 1.32, + "grad_norm": 0.570080041885376, + "learning_rate": 0.00035697945214150306, + "loss": 3.1406, + "step": 26884 + }, + { + "epoch": 1.32, + "grad_norm": 0.5520020723342896, + "learning_rate": 0.00035696433612763536, + "loss": 3.2305, + "step": 26885 + }, + { + "epoch": 1.32, + "grad_norm": 0.5862802267074585, + "learning_rate": 0.0003569492199637345, + "loss": 3.0221, + "step": 26886 + }, + { + "epoch": 1.32, + "grad_norm": 0.5665404200553894, + "learning_rate": 0.0003569341036498404, + "loss": 3.0495, + "step": 26887 + }, + { + "epoch": 1.32, + "grad_norm": 0.5680643320083618, + "learning_rate": 0.00035691898718599274, + "loss": 3.2575, + "step": 26888 + }, + { + "epoch": 1.32, + "grad_norm": 0.5598427057266235, + "learning_rate": 0.0003569038705722315, + "loss": 2.9607, + "step": 26889 + }, + { + "epoch": 1.32, + "grad_norm": 0.5373253226280212, + "learning_rate": 0.00035688875380859636, + "loss": 2.9572, + "step": 26890 + }, + { + "epoch": 1.32, + "grad_norm": 0.5592238306999207, + "learning_rate": 0.00035687363689512714, + "loss": 2.8467, + "step": 26891 + }, + { + "epoch": 1.32, + "grad_norm": 0.7021776437759399, + "learning_rate": 0.0003568585198318639, + "loss": 2.9761, + "step": 26892 + }, + { + "epoch": 1.32, + "grad_norm": 0.6009718775749207, + "learning_rate": 0.00035684340261884606, + "loss": 3.3808, + "step": 26893 + }, + { + "epoch": 1.32, + "grad_norm": 0.6137123107910156, + "learning_rate": 0.0003568282852561137, + "loss": 3.1077, + "step": 26894 + }, + { + "epoch": 1.32, + "grad_norm": 0.5503768920898438, + "learning_rate": 0.0003568131677437066, + "loss": 3.0737, + "step": 26895 + }, + { + "epoch": 1.32, + "grad_norm": 0.6017237901687622, + "learning_rate": 0.0003567980500816646, + "loss": 3.045, + "step": 26896 + }, + { + "epoch": 1.32, + "grad_norm": 0.5975255966186523, + "learning_rate": 0.00035678293227002744, + "loss": 3.138, + "step": 26897 + }, + { + "epoch": 1.32, + "grad_norm": 0.5789278745651245, + "learning_rate": 0.0003567678143088348, + "loss": 2.9976, + "step": 26898 + }, + { + "epoch": 1.32, + "grad_norm": 0.6184183359146118, + "learning_rate": 0.0003567526961981269, + "loss": 2.8672, + "step": 26899 + }, + { + "epoch": 1.32, + "grad_norm": 0.5539001822471619, + "learning_rate": 0.0003567375779379432, + "loss": 3.2786, + "step": 26900 + }, + { + "epoch": 1.32, + "grad_norm": 0.5661340951919556, + "learning_rate": 0.00035672245952832367, + "loss": 3.013, + "step": 26901 + }, + { + "epoch": 1.32, + "grad_norm": 0.5806612968444824, + "learning_rate": 0.000356707340969308, + "loss": 2.8103, + "step": 26902 + }, + { + "epoch": 1.32, + "grad_norm": 0.6162651181221008, + "learning_rate": 0.00035669222226093625, + "loss": 3.3411, + "step": 26903 + }, + { + "epoch": 1.32, + "grad_norm": 0.615796685218811, + "learning_rate": 0.000356677103403248, + "loss": 2.9888, + "step": 26904 + }, + { + "epoch": 1.32, + "grad_norm": 0.5535275340080261, + "learning_rate": 0.0003566619843962832, + "loss": 3.151, + "step": 26905 + }, + { + "epoch": 1.32, + "grad_norm": 0.5614963173866272, + "learning_rate": 0.0003566468652400817, + "loss": 3.1728, + "step": 26906 + }, + { + "epoch": 1.32, + "grad_norm": 0.5885602235794067, + "learning_rate": 0.0003566317459346832, + "loss": 2.9835, + "step": 26907 + }, + { + "epoch": 1.32, + "grad_norm": 0.5614680647850037, + "learning_rate": 0.00035661662648012757, + "loss": 3.2124, + "step": 26908 + }, + { + "epoch": 1.32, + "grad_norm": 0.6076555848121643, + "learning_rate": 0.0003566015068764546, + "loss": 2.9445, + "step": 26909 + }, + { + "epoch": 1.32, + "grad_norm": 0.5833714604377747, + "learning_rate": 0.0003565863871237043, + "loss": 3.1111, + "step": 26910 + }, + { + "epoch": 1.32, + "grad_norm": 0.5717360973358154, + "learning_rate": 0.0003565712672219162, + "loss": 2.9425, + "step": 26911 + }, + { + "epoch": 1.32, + "grad_norm": 0.5546928644180298, + "learning_rate": 0.0003565561471711303, + "loss": 2.9382, + "step": 26912 + }, + { + "epoch": 1.32, + "grad_norm": 0.6168055534362793, + "learning_rate": 0.0003565410269713865, + "loss": 2.9419, + "step": 26913 + }, + { + "epoch": 1.32, + "grad_norm": 0.6003530025482178, + "learning_rate": 0.0003565259066227244, + "loss": 3.2024, + "step": 26914 + }, + { + "epoch": 1.32, + "grad_norm": 0.5346034169197083, + "learning_rate": 0.000356510786125184, + "loss": 3.1215, + "step": 26915 + }, + { + "epoch": 1.32, + "grad_norm": 0.5668526291847229, + "learning_rate": 0.00035649566547880495, + "loss": 3.1087, + "step": 26916 + }, + { + "epoch": 1.32, + "grad_norm": 0.7049530148506165, + "learning_rate": 0.0003564805446836273, + "loss": 3.0214, + "step": 26917 + }, + { + "epoch": 1.32, + "grad_norm": 0.5999130010604858, + "learning_rate": 0.0003564654237396907, + "loss": 2.9735, + "step": 26918 + }, + { + "epoch": 1.32, + "grad_norm": 0.5857048034667969, + "learning_rate": 0.0003564503026470351, + "loss": 2.8567, + "step": 26919 + }, + { + "epoch": 1.32, + "grad_norm": 0.5745254158973694, + "learning_rate": 0.0003564351814057003, + "loss": 3.0944, + "step": 26920 + }, + { + "epoch": 1.32, + "grad_norm": 0.5591810941696167, + "learning_rate": 0.0003564200600157261, + "loss": 3.124, + "step": 26921 + }, + { + "epoch": 1.32, + "grad_norm": 0.5647045373916626, + "learning_rate": 0.0003564049384771522, + "loss": 2.9827, + "step": 26922 + }, + { + "epoch": 1.32, + "grad_norm": 0.5440300107002258, + "learning_rate": 0.00035638981679001863, + "loss": 3.0062, + "step": 26923 + }, + { + "epoch": 1.32, + "grad_norm": 0.5504894852638245, + "learning_rate": 0.0003563746949543651, + "loss": 3.1177, + "step": 26924 + }, + { + "epoch": 1.32, + "grad_norm": 0.5864576697349548, + "learning_rate": 0.00035635957297023153, + "loss": 3.037, + "step": 26925 + }, + { + "epoch": 1.32, + "grad_norm": 0.5730857849121094, + "learning_rate": 0.00035634445083765765, + "loss": 2.8243, + "step": 26926 + }, + { + "epoch": 1.32, + "grad_norm": 0.5582221746444702, + "learning_rate": 0.0003563293285566833, + "loss": 3.1537, + "step": 26927 + }, + { + "epoch": 1.32, + "grad_norm": 0.5400102138519287, + "learning_rate": 0.00035631420612734836, + "loss": 2.955, + "step": 26928 + }, + { + "epoch": 1.32, + "grad_norm": 0.5986092686653137, + "learning_rate": 0.00035629908354969263, + "loss": 3.1307, + "step": 26929 + }, + { + "epoch": 1.32, + "grad_norm": 0.5586465001106262, + "learning_rate": 0.000356283960823756, + "loss": 3.022, + "step": 26930 + }, + { + "epoch": 1.32, + "grad_norm": 0.6682041883468628, + "learning_rate": 0.0003562688379495782, + "loss": 2.9636, + "step": 26931 + }, + { + "epoch": 1.32, + "grad_norm": 0.5432518124580383, + "learning_rate": 0.0003562537149271992, + "loss": 3.2366, + "step": 26932 + }, + { + "epoch": 1.32, + "grad_norm": 0.6639944314956665, + "learning_rate": 0.0003562385917566587, + "loss": 3.053, + "step": 26933 + }, + { + "epoch": 1.32, + "grad_norm": 0.5308532118797302, + "learning_rate": 0.0003562234684379965, + "loss": 3.0926, + "step": 26934 + }, + { + "epoch": 1.32, + "grad_norm": 0.5658736824989319, + "learning_rate": 0.0003562083449712525, + "loss": 3.069, + "step": 26935 + }, + { + "epoch": 1.32, + "grad_norm": 0.5990812182426453, + "learning_rate": 0.0003561932213564666, + "loss": 3.0671, + "step": 26936 + }, + { + "epoch": 1.32, + "grad_norm": 0.5363149046897888, + "learning_rate": 0.00035617809759367854, + "loss": 2.8551, + "step": 26937 + }, + { + "epoch": 1.32, + "grad_norm": 0.5616311430931091, + "learning_rate": 0.0003561629736829282, + "loss": 3.2784, + "step": 26938 + }, + { + "epoch": 1.32, + "grad_norm": 0.5642018914222717, + "learning_rate": 0.00035614784962425546, + "loss": 3.0111, + "step": 26939 + }, + { + "epoch": 1.32, + "grad_norm": 0.558562695980072, + "learning_rate": 0.0003561327254177, + "loss": 3.0165, + "step": 26940 + }, + { + "epoch": 1.32, + "grad_norm": 0.5792576670646667, + "learning_rate": 0.00035611760106330176, + "loss": 3.0326, + "step": 26941 + }, + { + "epoch": 1.32, + "grad_norm": 0.5646781921386719, + "learning_rate": 0.0003561024765611005, + "loss": 3.0448, + "step": 26942 + }, + { + "epoch": 1.32, + "grad_norm": 0.5456598997116089, + "learning_rate": 0.00035608735191113624, + "loss": 2.9734, + "step": 26943 + }, + { + "epoch": 1.32, + "grad_norm": 0.601516842842102, + "learning_rate": 0.0003560722271134486, + "loss": 3.1814, + "step": 26944 + }, + { + "epoch": 1.32, + "grad_norm": 0.5825067758560181, + "learning_rate": 0.0003560571021680775, + "loss": 2.9143, + "step": 26945 + }, + { + "epoch": 1.32, + "grad_norm": 0.5741273164749146, + "learning_rate": 0.00035604197707506287, + "loss": 2.8824, + "step": 26946 + }, + { + "epoch": 1.32, + "grad_norm": 0.6200869083404541, + "learning_rate": 0.0003560268518344444, + "loss": 3.2152, + "step": 26947 + }, + { + "epoch": 1.32, + "grad_norm": 0.6041608452796936, + "learning_rate": 0.000356011726446262, + "loss": 3.0224, + "step": 26948 + }, + { + "epoch": 1.32, + "grad_norm": 0.5746281147003174, + "learning_rate": 0.00035599660091055544, + "loss": 3.0115, + "step": 26949 + }, + { + "epoch": 1.32, + "grad_norm": 0.5583726763725281, + "learning_rate": 0.0003559814752273647, + "loss": 3.2274, + "step": 26950 + }, + { + "epoch": 1.32, + "grad_norm": 0.5739026665687561, + "learning_rate": 0.00035596634939672947, + "loss": 3.0633, + "step": 26951 + }, + { + "epoch": 1.32, + "grad_norm": 0.551236629486084, + "learning_rate": 0.0003559512234186896, + "loss": 3.0881, + "step": 26952 + }, + { + "epoch": 1.32, + "grad_norm": 0.5410869717597961, + "learning_rate": 0.0003559360972932851, + "loss": 3.0438, + "step": 26953 + }, + { + "epoch": 1.32, + "grad_norm": 0.6108107566833496, + "learning_rate": 0.00035592097102055554, + "loss": 3.1604, + "step": 26954 + }, + { + "epoch": 1.32, + "grad_norm": 0.5781041979789734, + "learning_rate": 0.00035590584460054104, + "loss": 3.1562, + "step": 26955 + }, + { + "epoch": 1.32, + "grad_norm": 0.5774297118186951, + "learning_rate": 0.00035589071803328125, + "loss": 2.885, + "step": 26956 + }, + { + "epoch": 1.32, + "grad_norm": 0.5566204786300659, + "learning_rate": 0.00035587559131881603, + "loss": 3.1957, + "step": 26957 + }, + { + "epoch": 1.32, + "grad_norm": 0.5503354668617249, + "learning_rate": 0.00035586046445718533, + "loss": 3.0237, + "step": 26958 + }, + { + "epoch": 1.32, + "grad_norm": 0.5675086975097656, + "learning_rate": 0.00035584533744842877, + "loss": 3.0587, + "step": 26959 + }, + { + "epoch": 1.32, + "grad_norm": 0.5769832730293274, + "learning_rate": 0.00035583021029258647, + "loss": 3.1677, + "step": 26960 + }, + { + "epoch": 1.32, + "grad_norm": 0.614290714263916, + "learning_rate": 0.00035581508298969817, + "loss": 2.9065, + "step": 26961 + }, + { + "epoch": 1.32, + "grad_norm": 0.5537709593772888, + "learning_rate": 0.00035579995553980365, + "loss": 3.1548, + "step": 26962 + }, + { + "epoch": 1.32, + "grad_norm": 0.6154881119728088, + "learning_rate": 0.0003557848279429427, + "loss": 3.3156, + "step": 26963 + }, + { + "epoch": 1.32, + "grad_norm": 0.5546302795410156, + "learning_rate": 0.00035576970019915536, + "loss": 2.939, + "step": 26964 + }, + { + "epoch": 1.32, + "grad_norm": 0.6049430966377258, + "learning_rate": 0.00035575457230848127, + "loss": 2.7861, + "step": 26965 + }, + { + "epoch": 1.32, + "grad_norm": 0.5868650078773499, + "learning_rate": 0.00035573944427096045, + "loss": 3.0257, + "step": 26966 + }, + { + "epoch": 1.32, + "grad_norm": 0.5958408713340759, + "learning_rate": 0.0003557243160866326, + "loss": 2.7296, + "step": 26967 + }, + { + "epoch": 1.32, + "grad_norm": 0.5856796503067017, + "learning_rate": 0.00035570918775553763, + "loss": 3.0672, + "step": 26968 + }, + { + "epoch": 1.32, + "grad_norm": 0.5468848347663879, + "learning_rate": 0.0003556940592777155, + "loss": 2.9349, + "step": 26969 + }, + { + "epoch": 1.32, + "grad_norm": 0.5882099270820618, + "learning_rate": 0.0003556789306532057, + "loss": 3.1575, + "step": 26970 + }, + { + "epoch": 1.32, + "grad_norm": 0.5558639168739319, + "learning_rate": 0.0003556638018820485, + "loss": 2.948, + "step": 26971 + }, + { + "epoch": 1.32, + "grad_norm": 0.582410454750061, + "learning_rate": 0.0003556486729642835, + "loss": 3.0795, + "step": 26972 + }, + { + "epoch": 1.32, + "grad_norm": 0.5897334218025208, + "learning_rate": 0.0003556335438999506, + "loss": 2.867, + "step": 26973 + }, + { + "epoch": 1.32, + "grad_norm": 0.5801978707313538, + "learning_rate": 0.0003556184146890897, + "loss": 3.112, + "step": 26974 + }, + { + "epoch": 1.32, + "grad_norm": 0.5620837807655334, + "learning_rate": 0.00035560328533174057, + "loss": 3.1498, + "step": 26975 + }, + { + "epoch": 1.32, + "grad_norm": 0.5551138520240784, + "learning_rate": 0.00035558815582794305, + "loss": 3.1267, + "step": 26976 + }, + { + "epoch": 1.32, + "grad_norm": 0.5612112283706665, + "learning_rate": 0.00035557302617773704, + "loss": 3.1203, + "step": 26977 + }, + { + "epoch": 1.32, + "grad_norm": 0.6193846464157104, + "learning_rate": 0.0003555578963811624, + "loss": 3.0384, + "step": 26978 + }, + { + "epoch": 1.32, + "grad_norm": 0.6214466691017151, + "learning_rate": 0.0003555427664382589, + "loss": 3.0276, + "step": 26979 + }, + { + "epoch": 1.32, + "grad_norm": 0.5702659487724304, + "learning_rate": 0.00035552763634906655, + "loss": 3.2472, + "step": 26980 + }, + { + "epoch": 1.32, + "grad_norm": 0.5576087832450867, + "learning_rate": 0.000355512506113625, + "loss": 2.9014, + "step": 26981 + }, + { + "epoch": 1.32, + "grad_norm": 0.578071653842926, + "learning_rate": 0.00035549737573197415, + "loss": 3.1932, + "step": 26982 + }, + { + "epoch": 1.32, + "grad_norm": 0.5625930428504944, + "learning_rate": 0.00035548224520415403, + "loss": 3.0508, + "step": 26983 + }, + { + "epoch": 1.32, + "grad_norm": 0.6400315761566162, + "learning_rate": 0.0003554671145302044, + "loss": 3.0855, + "step": 26984 + }, + { + "epoch": 1.32, + "grad_norm": 0.5485404133796692, + "learning_rate": 0.00035545198371016477, + "loss": 3.1335, + "step": 26985 + }, + { + "epoch": 1.32, + "grad_norm": 0.5850667357444763, + "learning_rate": 0.0003554368527440756, + "loss": 2.8681, + "step": 26986 + }, + { + "epoch": 1.32, + "grad_norm": 0.5740143656730652, + "learning_rate": 0.00035542172163197626, + "loss": 3.0152, + "step": 26987 + }, + { + "epoch": 1.32, + "grad_norm": 0.5631483793258667, + "learning_rate": 0.0003554065903739068, + "loss": 3.1364, + "step": 26988 + }, + { + "epoch": 1.32, + "grad_norm": 0.604009211063385, + "learning_rate": 0.0003553914589699071, + "loss": 3.162, + "step": 26989 + }, + { + "epoch": 1.32, + "grad_norm": 0.5691906213760376, + "learning_rate": 0.00035537632742001685, + "loss": 3.1606, + "step": 26990 + }, + { + "epoch": 1.32, + "grad_norm": 0.629859983921051, + "learning_rate": 0.00035536119572427603, + "loss": 2.9259, + "step": 26991 + }, + { + "epoch": 1.32, + "grad_norm": 0.5835464000701904, + "learning_rate": 0.00035534606388272456, + "loss": 3.2523, + "step": 26992 + }, + { + "epoch": 1.32, + "grad_norm": 0.5969851613044739, + "learning_rate": 0.00035533093189540216, + "loss": 3.1687, + "step": 26993 + }, + { + "epoch": 1.32, + "grad_norm": 0.541480302810669, + "learning_rate": 0.0003553157997623488, + "loss": 2.9884, + "step": 26994 + }, + { + "epoch": 1.32, + "grad_norm": 0.5837870240211487, + "learning_rate": 0.0003553006674836041, + "loss": 2.7295, + "step": 26995 + }, + { + "epoch": 1.32, + "grad_norm": 0.609303891658783, + "learning_rate": 0.00035528553505920825, + "loss": 3.0732, + "step": 26996 + }, + { + "epoch": 1.32, + "grad_norm": 0.5507553815841675, + "learning_rate": 0.00035527040248920085, + "loss": 3.0481, + "step": 26997 + }, + { + "epoch": 1.32, + "grad_norm": 0.5484434366226196, + "learning_rate": 0.00035525526977362197, + "loss": 3.0371, + "step": 26998 + }, + { + "epoch": 1.32, + "grad_norm": 0.5981265902519226, + "learning_rate": 0.0003552401369125112, + "loss": 3.0613, + "step": 26999 + }, + { + "epoch": 1.32, + "grad_norm": 0.5397427678108215, + "learning_rate": 0.00035522500390590856, + "loss": 3.1179, + "step": 27000 + }, + { + "epoch": 1.32, + "grad_norm": 0.5383358001708984, + "learning_rate": 0.00035520987075385394, + "loss": 3.0821, + "step": 27001 + }, + { + "epoch": 1.32, + "grad_norm": 0.564812183380127, + "learning_rate": 0.0003551947374563871, + "loss": 2.9963, + "step": 27002 + }, + { + "epoch": 1.32, + "grad_norm": 0.5476768016815186, + "learning_rate": 0.00035517960401354806, + "loss": 3.0847, + "step": 27003 + }, + { + "epoch": 1.32, + "grad_norm": 0.5901896357536316, + "learning_rate": 0.00035516447042537643, + "loss": 3.2513, + "step": 27004 + }, + { + "epoch": 1.32, + "grad_norm": 0.5683808326721191, + "learning_rate": 0.0003551493366919123, + "loss": 3.2693, + "step": 27005 + }, + { + "epoch": 1.32, + "grad_norm": 0.563849151134491, + "learning_rate": 0.00035513420281319533, + "loss": 3.1826, + "step": 27006 + }, + { + "epoch": 1.32, + "grad_norm": 0.5702254772186279, + "learning_rate": 0.0003551190687892655, + "loss": 3.1978, + "step": 27007 + }, + { + "epoch": 1.32, + "grad_norm": 0.5525578856468201, + "learning_rate": 0.00035510393462016273, + "loss": 3.1693, + "step": 27008 + }, + { + "epoch": 1.32, + "grad_norm": 0.5483326315879822, + "learning_rate": 0.0003550888003059268, + "loss": 3.0161, + "step": 27009 + }, + { + "epoch": 1.32, + "grad_norm": 0.6483874320983887, + "learning_rate": 0.0003550736658465975, + "loss": 3.1485, + "step": 27010 + }, + { + "epoch": 1.32, + "grad_norm": 0.5594171285629272, + "learning_rate": 0.0003550585312422147, + "loss": 2.9653, + "step": 27011 + }, + { + "epoch": 1.32, + "grad_norm": 0.5625665187835693, + "learning_rate": 0.0003550433964928185, + "loss": 3.1067, + "step": 27012 + }, + { + "epoch": 1.32, + "grad_norm": 0.5742067694664001, + "learning_rate": 0.0003550282615984485, + "loss": 3.0806, + "step": 27013 + }, + { + "epoch": 1.32, + "grad_norm": 0.6136051416397095, + "learning_rate": 0.0003550131265591446, + "loss": 3.2865, + "step": 27014 + }, + { + "epoch": 1.32, + "grad_norm": 0.7125194072723389, + "learning_rate": 0.0003549979913749468, + "loss": 3.0679, + "step": 27015 + }, + { + "epoch": 1.32, + "grad_norm": 0.5937182307243347, + "learning_rate": 0.0003549828560458948, + "loss": 2.9668, + "step": 27016 + }, + { + "epoch": 1.32, + "grad_norm": 0.5569949150085449, + "learning_rate": 0.0003549677205720286, + "loss": 3.0069, + "step": 27017 + }, + { + "epoch": 1.32, + "grad_norm": 0.5426252484321594, + "learning_rate": 0.000354952584953388, + "loss": 3.1013, + "step": 27018 + }, + { + "epoch": 1.32, + "grad_norm": 0.5773874521255493, + "learning_rate": 0.0003549374491900128, + "loss": 2.9514, + "step": 27019 + }, + { + "epoch": 1.32, + "grad_norm": 0.5283383131027222, + "learning_rate": 0.00035492231328194306, + "loss": 2.845, + "step": 27020 + }, + { + "epoch": 1.32, + "grad_norm": 0.5627959966659546, + "learning_rate": 0.00035490717722921834, + "loss": 3.0731, + "step": 27021 + }, + { + "epoch": 1.32, + "grad_norm": 0.584900975227356, + "learning_rate": 0.0003548920410318788, + "loss": 2.9836, + "step": 27022 + }, + { + "epoch": 1.32, + "grad_norm": 0.5437769889831543, + "learning_rate": 0.0003548769046899642, + "loss": 3.0244, + "step": 27023 + }, + { + "epoch": 1.32, + "grad_norm": 0.5971786975860596, + "learning_rate": 0.00035486176820351435, + "loss": 3.2019, + "step": 27024 + }, + { + "epoch": 1.32, + "grad_norm": 0.5955939888954163, + "learning_rate": 0.0003548466315725691, + "loss": 2.9108, + "step": 27025 + }, + { + "epoch": 1.32, + "grad_norm": 0.5769616961479187, + "learning_rate": 0.0003548314947971685, + "loss": 2.9552, + "step": 27026 + }, + { + "epoch": 1.32, + "grad_norm": 0.5936218500137329, + "learning_rate": 0.0003548163578773522, + "loss": 3.0934, + "step": 27027 + }, + { + "epoch": 1.32, + "grad_norm": 0.5732731819152832, + "learning_rate": 0.0003548012208131602, + "loss": 3.1205, + "step": 27028 + }, + { + "epoch": 1.32, + "grad_norm": 0.5782310962677002, + "learning_rate": 0.00035478608360463233, + "loss": 2.8778, + "step": 27029 + }, + { + "epoch": 1.32, + "grad_norm": 0.5631444454193115, + "learning_rate": 0.0003547709462518085, + "loss": 2.9783, + "step": 27030 + }, + { + "epoch": 1.32, + "grad_norm": 0.5475609302520752, + "learning_rate": 0.0003547558087547285, + "loss": 2.9648, + "step": 27031 + }, + { + "epoch": 1.32, + "grad_norm": 0.5806057453155518, + "learning_rate": 0.0003547406711134322, + "loss": 2.8895, + "step": 27032 + }, + { + "epoch": 1.32, + "grad_norm": 0.5406964421272278, + "learning_rate": 0.00035472553332795956, + "loss": 3.2604, + "step": 27033 + }, + { + "epoch": 1.32, + "grad_norm": 0.5376538634300232, + "learning_rate": 0.00035471039539835045, + "loss": 3.0621, + "step": 27034 + }, + { + "epoch": 1.32, + "grad_norm": 0.5387920141220093, + "learning_rate": 0.00035469525732464464, + "loss": 3.0724, + "step": 27035 + }, + { + "epoch": 1.32, + "grad_norm": 0.6255925297737122, + "learning_rate": 0.0003546801191068819, + "loss": 3.0135, + "step": 27036 + }, + { + "epoch": 1.33, + "grad_norm": 0.5752521753311157, + "learning_rate": 0.00035466498074510246, + "loss": 3.1237, + "step": 27037 + }, + { + "epoch": 1.33, + "grad_norm": 0.5635229349136353, + "learning_rate": 0.0003546498422393459, + "loss": 3.0906, + "step": 27038 + }, + { + "epoch": 1.33, + "grad_norm": 0.5960707664489746, + "learning_rate": 0.0003546347035896522, + "loss": 2.9693, + "step": 27039 + }, + { + "epoch": 1.33, + "grad_norm": 0.5936806201934814, + "learning_rate": 0.00035461956479606113, + "loss": 3.1782, + "step": 27040 + }, + { + "epoch": 1.33, + "grad_norm": 0.5770202875137329, + "learning_rate": 0.0003546044258586127, + "loss": 3.1503, + "step": 27041 + }, + { + "epoch": 1.33, + "grad_norm": 0.607388973236084, + "learning_rate": 0.0003545892867773467, + "loss": 3.1449, + "step": 27042 + }, + { + "epoch": 1.33, + "grad_norm": 0.5307580232620239, + "learning_rate": 0.00035457414755230303, + "loss": 3.0214, + "step": 27043 + }, + { + "epoch": 1.33, + "grad_norm": 0.5784158110618591, + "learning_rate": 0.0003545590081835215, + "loss": 3.0766, + "step": 27044 + }, + { + "epoch": 1.33, + "grad_norm": 0.5791460275650024, + "learning_rate": 0.0003545438686710422, + "loss": 2.999, + "step": 27045 + }, + { + "epoch": 1.33, + "grad_norm": 0.5575026869773865, + "learning_rate": 0.00035452872901490476, + "loss": 3.0514, + "step": 27046 + }, + { + "epoch": 1.33, + "grad_norm": 0.54404217004776, + "learning_rate": 0.0003545135892151491, + "loss": 2.9649, + "step": 27047 + }, + { + "epoch": 1.33, + "grad_norm": 0.5947194695472717, + "learning_rate": 0.00035449844927181525, + "loss": 3.0767, + "step": 27048 + }, + { + "epoch": 1.33, + "grad_norm": 0.5407034158706665, + "learning_rate": 0.00035448330918494284, + "loss": 2.9749, + "step": 27049 + }, + { + "epoch": 1.33, + "grad_norm": 0.5737541317939758, + "learning_rate": 0.00035446816895457194, + "loss": 3.2021, + "step": 27050 + }, + { + "epoch": 1.33, + "grad_norm": 0.5305442214012146, + "learning_rate": 0.00035445302858074234, + "loss": 3.2367, + "step": 27051 + }, + { + "epoch": 1.33, + "grad_norm": 0.5993185639381409, + "learning_rate": 0.000354437888063494, + "loss": 3.2942, + "step": 27052 + }, + { + "epoch": 1.33, + "grad_norm": 0.5892103314399719, + "learning_rate": 0.00035442274740286666, + "loss": 3.1939, + "step": 27053 + }, + { + "epoch": 1.33, + "grad_norm": 0.6093845963478088, + "learning_rate": 0.00035440760659890034, + "loss": 2.9334, + "step": 27054 + }, + { + "epoch": 1.33, + "grad_norm": 0.5915141701698303, + "learning_rate": 0.00035439246565163485, + "loss": 3.1818, + "step": 27055 + }, + { + "epoch": 1.33, + "grad_norm": 0.5777159333229065, + "learning_rate": 0.0003543773245611101, + "loss": 3.1014, + "step": 27056 + }, + { + "epoch": 1.33, + "grad_norm": 0.6174264550209045, + "learning_rate": 0.0003543621833273658, + "loss": 3.036, + "step": 27057 + }, + { + "epoch": 1.33, + "grad_norm": 0.584246814250946, + "learning_rate": 0.00035434704195044206, + "loss": 3.0767, + "step": 27058 + }, + { + "epoch": 1.33, + "grad_norm": 0.5757298469543457, + "learning_rate": 0.0003543319004303787, + "loss": 3.1766, + "step": 27059 + }, + { + "epoch": 1.33, + "grad_norm": 0.5471251606941223, + "learning_rate": 0.0003543167587672156, + "loss": 3.087, + "step": 27060 + }, + { + "epoch": 1.33, + "grad_norm": 0.6035965085029602, + "learning_rate": 0.00035430161696099256, + "loss": 3.0348, + "step": 27061 + }, + { + "epoch": 1.33, + "grad_norm": 0.5603316426277161, + "learning_rate": 0.00035428647501174943, + "loss": 3.1404, + "step": 27062 + }, + { + "epoch": 1.33, + "grad_norm": 0.6384663581848145, + "learning_rate": 0.00035427133291952633, + "loss": 2.9896, + "step": 27063 + }, + { + "epoch": 1.33, + "grad_norm": 0.5779756903648376, + "learning_rate": 0.00035425619068436294, + "loss": 3.0965, + "step": 27064 + }, + { + "epoch": 1.33, + "grad_norm": 0.5718583464622498, + "learning_rate": 0.00035424104830629916, + "loss": 3.1002, + "step": 27065 + }, + { + "epoch": 1.33, + "grad_norm": 0.5677472352981567, + "learning_rate": 0.0003542259057853749, + "loss": 3.1221, + "step": 27066 + }, + { + "epoch": 1.33, + "grad_norm": 0.575522243976593, + "learning_rate": 0.00035421076312163, + "loss": 3.0776, + "step": 27067 + }, + { + "epoch": 1.33, + "grad_norm": 0.5618709325790405, + "learning_rate": 0.0003541956203151044, + "loss": 2.8956, + "step": 27068 + }, + { + "epoch": 1.33, + "grad_norm": 0.5451009273529053, + "learning_rate": 0.000354180477365838, + "loss": 3.0641, + "step": 27069 + }, + { + "epoch": 1.33, + "grad_norm": 0.618395984172821, + "learning_rate": 0.0003541653342738706, + "loss": 2.8972, + "step": 27070 + }, + { + "epoch": 1.33, + "grad_norm": 0.5467285513877869, + "learning_rate": 0.0003541501910392422, + "loss": 3.0502, + "step": 27071 + }, + { + "epoch": 1.33, + "grad_norm": 0.6058321595191956, + "learning_rate": 0.00035413504766199257, + "loss": 2.9549, + "step": 27072 + }, + { + "epoch": 1.33, + "grad_norm": 0.5389471054077148, + "learning_rate": 0.00035411990414216165, + "loss": 3.2793, + "step": 27073 + }, + { + "epoch": 1.33, + "grad_norm": 0.6167011857032776, + "learning_rate": 0.0003541047604797894, + "loss": 3.0513, + "step": 27074 + }, + { + "epoch": 1.33, + "grad_norm": 0.6449909210205078, + "learning_rate": 0.00035408961667491554, + "loss": 2.965, + "step": 27075 + }, + { + "epoch": 1.33, + "grad_norm": 0.5643405914306641, + "learning_rate": 0.0003540744727275801, + "loss": 2.8066, + "step": 27076 + }, + { + "epoch": 1.33, + "grad_norm": 0.5791833400726318, + "learning_rate": 0.0003540593286378228, + "loss": 3.0858, + "step": 27077 + }, + { + "epoch": 1.33, + "grad_norm": 0.5517277717590332, + "learning_rate": 0.0003540441844056837, + "loss": 3.2508, + "step": 27078 + }, + { + "epoch": 1.33, + "grad_norm": 0.5787827372550964, + "learning_rate": 0.00035402904003120254, + "loss": 3.0538, + "step": 27079 + }, + { + "epoch": 1.33, + "grad_norm": 0.5583292841911316, + "learning_rate": 0.0003540138955144194, + "loss": 3.1336, + "step": 27080 + }, + { + "epoch": 1.33, + "grad_norm": 0.5447515249252319, + "learning_rate": 0.00035399875085537403, + "loss": 2.9675, + "step": 27081 + }, + { + "epoch": 1.33, + "grad_norm": 0.5473519563674927, + "learning_rate": 0.00035398360605410636, + "loss": 2.6685, + "step": 27082 + }, + { + "epoch": 1.33, + "grad_norm": 0.6127763986587524, + "learning_rate": 0.00035396846111065614, + "loss": 2.9615, + "step": 27083 + }, + { + "epoch": 1.33, + "grad_norm": 0.5656737685203552, + "learning_rate": 0.00035395331602506344, + "loss": 3.0083, + "step": 27084 + }, + { + "epoch": 1.33, + "grad_norm": 0.6185324192047119, + "learning_rate": 0.0003539381707973682, + "loss": 3.1667, + "step": 27085 + }, + { + "epoch": 1.33, + "grad_norm": 0.5483812689781189, + "learning_rate": 0.00035392302542761007, + "loss": 3.0815, + "step": 27086 + }, + { + "epoch": 1.33, + "grad_norm": 0.5831971764564514, + "learning_rate": 0.0003539078799158291, + "loss": 3.103, + "step": 27087 + }, + { + "epoch": 1.33, + "grad_norm": 0.5833414196968079, + "learning_rate": 0.0003538927342620651, + "loss": 2.9695, + "step": 27088 + }, + { + "epoch": 1.33, + "grad_norm": 0.543148934841156, + "learning_rate": 0.0003538775884663581, + "loss": 3.0304, + "step": 27089 + }, + { + "epoch": 1.33, + "grad_norm": 0.6388452649116516, + "learning_rate": 0.0003538624425287479, + "loss": 3.0587, + "step": 27090 + }, + { + "epoch": 1.33, + "grad_norm": 0.601694643497467, + "learning_rate": 0.0003538472964492743, + "loss": 2.9898, + "step": 27091 + }, + { + "epoch": 1.33, + "grad_norm": 0.6530476212501526, + "learning_rate": 0.00035383215022797735, + "loss": 3.0919, + "step": 27092 + }, + { + "epoch": 1.33, + "grad_norm": 0.5525394678115845, + "learning_rate": 0.0003538170038648969, + "loss": 3.1727, + "step": 27093 + }, + { + "epoch": 1.33, + "grad_norm": 0.562701404094696, + "learning_rate": 0.0003538018573600728, + "loss": 3.1354, + "step": 27094 + }, + { + "epoch": 1.33, + "grad_norm": 0.6188597679138184, + "learning_rate": 0.0003537867107135449, + "loss": 3.1935, + "step": 27095 + }, + { + "epoch": 1.33, + "grad_norm": 0.5348824858665466, + "learning_rate": 0.0003537715639253533, + "loss": 2.9653, + "step": 27096 + }, + { + "epoch": 1.33, + "grad_norm": 0.5821238160133362, + "learning_rate": 0.00035375641699553765, + "loss": 2.973, + "step": 27097 + }, + { + "epoch": 1.33, + "grad_norm": 0.5549263954162598, + "learning_rate": 0.0003537412699241379, + "loss": 3.0837, + "step": 27098 + }, + { + "epoch": 1.33, + "grad_norm": 0.5607396960258484, + "learning_rate": 0.0003537261227111941, + "loss": 3.1077, + "step": 27099 + }, + { + "epoch": 1.33, + "grad_norm": 0.5585238337516785, + "learning_rate": 0.00035371097535674597, + "loss": 2.7311, + "step": 27100 + }, + { + "epoch": 1.33, + "grad_norm": 0.5680395364761353, + "learning_rate": 0.00035369582786083344, + "loss": 2.9498, + "step": 27101 + }, + { + "epoch": 1.33, + "grad_norm": 0.582237184047699, + "learning_rate": 0.00035368068022349644, + "loss": 2.9487, + "step": 27102 + }, + { + "epoch": 1.33, + "grad_norm": 0.6273513436317444, + "learning_rate": 0.00035366553244477487, + "loss": 3.2227, + "step": 27103 + }, + { + "epoch": 1.33, + "grad_norm": 0.5687019228935242, + "learning_rate": 0.0003536503845247087, + "loss": 3.0682, + "step": 27104 + }, + { + "epoch": 1.33, + "grad_norm": 0.5843102931976318, + "learning_rate": 0.0003536352364633376, + "loss": 3.2752, + "step": 27105 + }, + { + "epoch": 1.33, + "grad_norm": 0.5721175670623779, + "learning_rate": 0.00035362008826070166, + "loss": 3.2501, + "step": 27106 + }, + { + "epoch": 1.33, + "grad_norm": 0.5818900465965271, + "learning_rate": 0.00035360493991684085, + "loss": 3.0691, + "step": 27107 + }, + { + "epoch": 1.33, + "grad_norm": 0.5466283559799194, + "learning_rate": 0.00035358979143179474, + "loss": 2.9073, + "step": 27108 + }, + { + "epoch": 1.33, + "grad_norm": 0.5692821741104126, + "learning_rate": 0.00035357464280560356, + "loss": 2.9878, + "step": 27109 + }, + { + "epoch": 1.33, + "grad_norm": 0.6057048439979553, + "learning_rate": 0.000353559494038307, + "loss": 3.1523, + "step": 27110 + }, + { + "epoch": 1.33, + "grad_norm": 0.7662555575370789, + "learning_rate": 0.0003535443451299452, + "loss": 3.0899, + "step": 27111 + }, + { + "epoch": 1.33, + "grad_norm": 0.6070552468299866, + "learning_rate": 0.00035352919608055777, + "loss": 3.0291, + "step": 27112 + }, + { + "epoch": 1.33, + "grad_norm": 0.5470950603485107, + "learning_rate": 0.0003535140468901847, + "loss": 2.9255, + "step": 27113 + }, + { + "epoch": 1.33, + "grad_norm": 0.5878978967666626, + "learning_rate": 0.00035349889755886604, + "loss": 2.828, + "step": 27114 + }, + { + "epoch": 1.33, + "grad_norm": 0.5929617881774902, + "learning_rate": 0.00035348374808664155, + "loss": 2.9915, + "step": 27115 + }, + { + "epoch": 1.33, + "grad_norm": 0.5788769125938416, + "learning_rate": 0.0003534685984735511, + "loss": 3.2226, + "step": 27116 + }, + { + "epoch": 1.33, + "grad_norm": 0.5542086362838745, + "learning_rate": 0.0003534534487196347, + "loss": 3.269, + "step": 27117 + }, + { + "epoch": 1.33, + "grad_norm": 0.5548458099365234, + "learning_rate": 0.0003534382988249323, + "loss": 3.044, + "step": 27118 + }, + { + "epoch": 1.33, + "grad_norm": 0.5794603228569031, + "learning_rate": 0.0003534231487894835, + "loss": 3.151, + "step": 27119 + }, + { + "epoch": 1.33, + "grad_norm": 0.6125290989875793, + "learning_rate": 0.0003534079986133285, + "loss": 3.0768, + "step": 27120 + }, + { + "epoch": 1.33, + "grad_norm": 0.5400205254554749, + "learning_rate": 0.00035339284829650716, + "loss": 3.0074, + "step": 27121 + }, + { + "epoch": 1.33, + "grad_norm": 0.5447748303413391, + "learning_rate": 0.0003533776978390593, + "loss": 3.031, + "step": 27122 + }, + { + "epoch": 1.33, + "grad_norm": 0.5757708549499512, + "learning_rate": 0.00035336254724102487, + "loss": 3.0801, + "step": 27123 + }, + { + "epoch": 1.33, + "grad_norm": 0.5604739189147949, + "learning_rate": 0.0003533473965024437, + "loss": 3.0351, + "step": 27124 + }, + { + "epoch": 1.33, + "grad_norm": 0.5703689455986023, + "learning_rate": 0.00035333224562335584, + "loss": 3.2484, + "step": 27125 + }, + { + "epoch": 1.33, + "grad_norm": 0.5888387560844421, + "learning_rate": 0.00035331709460380105, + "loss": 2.9558, + "step": 27126 + }, + { + "epoch": 1.33, + "grad_norm": 0.5701423287391663, + "learning_rate": 0.0003533019434438193, + "loss": 3.1372, + "step": 27127 + }, + { + "epoch": 1.33, + "grad_norm": 0.5726101398468018, + "learning_rate": 0.0003532867921434505, + "loss": 2.931, + "step": 27128 + }, + { + "epoch": 1.33, + "grad_norm": 0.5894249677658081, + "learning_rate": 0.00035327164070273457, + "loss": 3.1105, + "step": 27129 + }, + { + "epoch": 1.33, + "grad_norm": 0.5488255620002747, + "learning_rate": 0.0003532564891217113, + "loss": 3.0723, + "step": 27130 + }, + { + "epoch": 1.33, + "grad_norm": 0.5957520604133606, + "learning_rate": 0.00035324133740042077, + "loss": 3.1297, + "step": 27131 + }, + { + "epoch": 1.33, + "grad_norm": 0.5768669843673706, + "learning_rate": 0.00035322618553890276, + "loss": 2.9614, + "step": 27132 + }, + { + "epoch": 1.33, + "grad_norm": 0.5305699110031128, + "learning_rate": 0.00035321103353719736, + "loss": 2.8638, + "step": 27133 + }, + { + "epoch": 1.33, + "grad_norm": 0.5721878409385681, + "learning_rate": 0.00035319588139534413, + "loss": 3.203, + "step": 27134 + }, + { + "epoch": 1.33, + "grad_norm": 0.5690099596977234, + "learning_rate": 0.0003531807291133833, + "loss": 3.0033, + "step": 27135 + }, + { + "epoch": 1.33, + "grad_norm": 0.6488524079322815, + "learning_rate": 0.0003531655766913547, + "loss": 3.1853, + "step": 27136 + }, + { + "epoch": 1.33, + "grad_norm": 0.6288285255432129, + "learning_rate": 0.00035315042412929813, + "loss": 3.1391, + "step": 27137 + }, + { + "epoch": 1.33, + "grad_norm": 0.586223840713501, + "learning_rate": 0.00035313527142725364, + "loss": 2.9698, + "step": 27138 + }, + { + "epoch": 1.33, + "grad_norm": 0.5799707174301147, + "learning_rate": 0.000353120118585261, + "loss": 3.1249, + "step": 27139 + }, + { + "epoch": 1.33, + "grad_norm": 0.555907666683197, + "learning_rate": 0.0003531049656033602, + "loss": 3.1875, + "step": 27140 + }, + { + "epoch": 1.33, + "grad_norm": 0.5725418329238892, + "learning_rate": 0.00035308981248159113, + "loss": 3.1464, + "step": 27141 + }, + { + "epoch": 1.33, + "grad_norm": 0.542593240737915, + "learning_rate": 0.00035307465921999374, + "loss": 3.1674, + "step": 27142 + }, + { + "epoch": 1.33, + "grad_norm": 0.5610252022743225, + "learning_rate": 0.0003530595058186079, + "loss": 3.0766, + "step": 27143 + }, + { + "epoch": 1.33, + "grad_norm": 0.6028214693069458, + "learning_rate": 0.0003530443522774735, + "loss": 3.3676, + "step": 27144 + }, + { + "epoch": 1.33, + "grad_norm": 0.5492804646492004, + "learning_rate": 0.0003530291985966305, + "loss": 3.0786, + "step": 27145 + }, + { + "epoch": 1.33, + "grad_norm": 0.5824100375175476, + "learning_rate": 0.0003530140447761188, + "loss": 3.1429, + "step": 27146 + }, + { + "epoch": 1.33, + "grad_norm": 0.5860824584960938, + "learning_rate": 0.0003529988908159784, + "loss": 3.1449, + "step": 27147 + }, + { + "epoch": 1.33, + "grad_norm": 0.6098974347114563, + "learning_rate": 0.0003529837367162491, + "loss": 3.0493, + "step": 27148 + }, + { + "epoch": 1.33, + "grad_norm": 0.5788512825965881, + "learning_rate": 0.0003529685824769707, + "loss": 2.9447, + "step": 27149 + }, + { + "epoch": 1.33, + "grad_norm": 0.5958915948867798, + "learning_rate": 0.0003529534280981834, + "loss": 3.1485, + "step": 27150 + }, + { + "epoch": 1.33, + "grad_norm": 0.552696704864502, + "learning_rate": 0.0003529382735799269, + "loss": 3.0101, + "step": 27151 + }, + { + "epoch": 1.33, + "grad_norm": 0.571831226348877, + "learning_rate": 0.00035292311892224114, + "loss": 2.9421, + "step": 27152 + }, + { + "epoch": 1.33, + "grad_norm": 0.5760900378227234, + "learning_rate": 0.0003529079641251661, + "loss": 3.0002, + "step": 27153 + }, + { + "epoch": 1.33, + "grad_norm": 0.6153770089149475, + "learning_rate": 0.0003528928091887417, + "loss": 2.9689, + "step": 27154 + }, + { + "epoch": 1.33, + "grad_norm": 0.5628465414047241, + "learning_rate": 0.00035287765411300777, + "loss": 2.8312, + "step": 27155 + }, + { + "epoch": 1.33, + "grad_norm": 0.5893851518630981, + "learning_rate": 0.00035286249889800425, + "loss": 2.9367, + "step": 27156 + }, + { + "epoch": 1.33, + "grad_norm": 0.5659940242767334, + "learning_rate": 0.00035284734354377117, + "loss": 3.078, + "step": 27157 + }, + { + "epoch": 1.33, + "grad_norm": 0.5788155198097229, + "learning_rate": 0.0003528321880503483, + "loss": 2.9413, + "step": 27158 + }, + { + "epoch": 1.33, + "grad_norm": 0.6268181204795837, + "learning_rate": 0.00035281703241777566, + "loss": 3.0774, + "step": 27159 + }, + { + "epoch": 1.33, + "grad_norm": 0.5367437601089478, + "learning_rate": 0.000352801876646093, + "loss": 3.0378, + "step": 27160 + }, + { + "epoch": 1.33, + "grad_norm": 0.5380226969718933, + "learning_rate": 0.00035278672073534045, + "loss": 2.9433, + "step": 27161 + }, + { + "epoch": 1.33, + "grad_norm": 0.5570831894874573, + "learning_rate": 0.00035277156468555785, + "loss": 3.0755, + "step": 27162 + }, + { + "epoch": 1.33, + "grad_norm": 0.5446562767028809, + "learning_rate": 0.00035275640849678514, + "loss": 2.9051, + "step": 27163 + }, + { + "epoch": 1.33, + "grad_norm": 0.5816878080368042, + "learning_rate": 0.0003527412521690621, + "loss": 3.0586, + "step": 27164 + }, + { + "epoch": 1.33, + "grad_norm": 0.6048847436904907, + "learning_rate": 0.0003527260957024288, + "loss": 3.2571, + "step": 27165 + }, + { + "epoch": 1.33, + "grad_norm": 0.5430023670196533, + "learning_rate": 0.0003527109390969251, + "loss": 3.1818, + "step": 27166 + }, + { + "epoch": 1.33, + "grad_norm": 0.5993807911872864, + "learning_rate": 0.00035269578235259095, + "loss": 3.0313, + "step": 27167 + }, + { + "epoch": 1.33, + "grad_norm": 0.638937771320343, + "learning_rate": 0.0003526806254694662, + "loss": 3.304, + "step": 27168 + }, + { + "epoch": 1.33, + "grad_norm": 0.5941781997680664, + "learning_rate": 0.000352665468447591, + "loss": 2.8478, + "step": 27169 + }, + { + "epoch": 1.33, + "grad_norm": 0.5695825815200806, + "learning_rate": 0.0003526503112870049, + "loss": 3.1401, + "step": 27170 + }, + { + "epoch": 1.33, + "grad_norm": 0.563214898109436, + "learning_rate": 0.000352635153987748, + "loss": 3.2235, + "step": 27171 + }, + { + "epoch": 1.33, + "grad_norm": 0.5385993719100952, + "learning_rate": 0.00035261999654986035, + "loss": 3.0607, + "step": 27172 + }, + { + "epoch": 1.33, + "grad_norm": 0.612762987613678, + "learning_rate": 0.0003526048389733818, + "loss": 3.0266, + "step": 27173 + }, + { + "epoch": 1.33, + "grad_norm": 0.5819412469863892, + "learning_rate": 0.0003525896812583521, + "loss": 2.93, + "step": 27174 + }, + { + "epoch": 1.33, + "grad_norm": 0.5829300880432129, + "learning_rate": 0.0003525745234048112, + "loss": 2.9441, + "step": 27175 + }, + { + "epoch": 1.33, + "grad_norm": 0.5720100998878479, + "learning_rate": 0.00035255936541279937, + "loss": 3.02, + "step": 27176 + }, + { + "epoch": 1.33, + "grad_norm": 0.5454930663108826, + "learning_rate": 0.0003525442072823562, + "loss": 2.9635, + "step": 27177 + }, + { + "epoch": 1.33, + "grad_norm": 0.5697073936462402, + "learning_rate": 0.0003525290490135217, + "loss": 3.1289, + "step": 27178 + }, + { + "epoch": 1.33, + "grad_norm": 0.5639901161193848, + "learning_rate": 0.00035251389060633574, + "loss": 3.077, + "step": 27179 + }, + { + "epoch": 1.33, + "grad_norm": 0.5947508811950684, + "learning_rate": 0.0003524987320608383, + "loss": 3.0138, + "step": 27180 + }, + { + "epoch": 1.33, + "grad_norm": 0.555055558681488, + "learning_rate": 0.00035248357337706935, + "loss": 3.1569, + "step": 27181 + }, + { + "epoch": 1.33, + "grad_norm": 0.5873993039131165, + "learning_rate": 0.00035246841455506875, + "loss": 3.0096, + "step": 27182 + }, + { + "epoch": 1.33, + "grad_norm": 0.5661149621009827, + "learning_rate": 0.00035245325559487645, + "loss": 2.9492, + "step": 27183 + }, + { + "epoch": 1.33, + "grad_norm": 0.55522221326828, + "learning_rate": 0.0003524380964965324, + "loss": 3.1046, + "step": 27184 + }, + { + "epoch": 1.33, + "grad_norm": 0.5756954550743103, + "learning_rate": 0.00035242293726007643, + "loss": 3.1467, + "step": 27185 + }, + { + "epoch": 1.33, + "grad_norm": 0.5577954649925232, + "learning_rate": 0.00035240777788554857, + "loss": 3.1245, + "step": 27186 + }, + { + "epoch": 1.33, + "grad_norm": 0.5636711120605469, + "learning_rate": 0.00035239261837298876, + "loss": 3.1186, + "step": 27187 + }, + { + "epoch": 1.33, + "grad_norm": 0.5825818181037903, + "learning_rate": 0.0003523774587224368, + "loss": 3.1649, + "step": 27188 + }, + { + "epoch": 1.33, + "grad_norm": 0.570296585559845, + "learning_rate": 0.0003523622989339327, + "loss": 2.9089, + "step": 27189 + }, + { + "epoch": 1.33, + "grad_norm": 0.5361173152923584, + "learning_rate": 0.0003523471390075164, + "loss": 2.944, + "step": 27190 + }, + { + "epoch": 1.33, + "grad_norm": 0.5616334080696106, + "learning_rate": 0.0003523319789432278, + "loss": 3.2198, + "step": 27191 + }, + { + "epoch": 1.33, + "grad_norm": 0.6106418371200562, + "learning_rate": 0.00035231681874110685, + "loss": 3.0541, + "step": 27192 + }, + { + "epoch": 1.33, + "grad_norm": 0.6105589270591736, + "learning_rate": 0.00035230165840119344, + "loss": 3.1066, + "step": 27193 + }, + { + "epoch": 1.33, + "grad_norm": 0.5929547548294067, + "learning_rate": 0.0003522864979235276, + "loss": 2.9546, + "step": 27194 + }, + { + "epoch": 1.33, + "grad_norm": 0.5807821154594421, + "learning_rate": 0.0003522713373081491, + "loss": 3.1256, + "step": 27195 + }, + { + "epoch": 1.33, + "grad_norm": 0.5757489204406738, + "learning_rate": 0.00035225617655509797, + "loss": 3.0397, + "step": 27196 + }, + { + "epoch": 1.33, + "grad_norm": 0.5699480772018433, + "learning_rate": 0.0003522410156644142, + "loss": 3.3366, + "step": 27197 + }, + { + "epoch": 1.33, + "grad_norm": 0.6023076176643372, + "learning_rate": 0.00035222585463613764, + "loss": 3.103, + "step": 27198 + }, + { + "epoch": 1.33, + "grad_norm": 0.5710323452949524, + "learning_rate": 0.0003522106934703082, + "loss": 3.1119, + "step": 27199 + }, + { + "epoch": 1.33, + "grad_norm": 0.5762554407119751, + "learning_rate": 0.0003521955321669658, + "loss": 3.0356, + "step": 27200 + }, + { + "epoch": 1.33, + "grad_norm": 0.5505509972572327, + "learning_rate": 0.00035218037072615047, + "loss": 3.0826, + "step": 27201 + }, + { + "epoch": 1.33, + "grad_norm": 0.5784319639205933, + "learning_rate": 0.00035216520914790205, + "loss": 3.0583, + "step": 27202 + }, + { + "epoch": 1.33, + "grad_norm": 0.5438815951347351, + "learning_rate": 0.00035215004743226056, + "loss": 2.8998, + "step": 27203 + }, + { + "epoch": 1.33, + "grad_norm": 0.5637693405151367, + "learning_rate": 0.0003521348855792658, + "loss": 3.0585, + "step": 27204 + }, + { + "epoch": 1.33, + "grad_norm": 0.5552592277526855, + "learning_rate": 0.00035211972358895776, + "loss": 2.8598, + "step": 27205 + }, + { + "epoch": 1.33, + "grad_norm": 0.6333317756652832, + "learning_rate": 0.0003521045614613765, + "loss": 3.0529, + "step": 27206 + }, + { + "epoch": 1.33, + "grad_norm": 0.6301619410514832, + "learning_rate": 0.0003520893991965618, + "loss": 2.963, + "step": 27207 + }, + { + "epoch": 1.33, + "grad_norm": 0.5590476989746094, + "learning_rate": 0.0003520742367945537, + "loss": 2.9396, + "step": 27208 + }, + { + "epoch": 1.33, + "grad_norm": 0.5794091820716858, + "learning_rate": 0.00035205907425539214, + "loss": 3.0028, + "step": 27209 + }, + { + "epoch": 1.33, + "grad_norm": 0.5639303922653198, + "learning_rate": 0.00035204391157911684, + "loss": 3.0661, + "step": 27210 + }, + { + "epoch": 1.33, + "grad_norm": 0.5517282485961914, + "learning_rate": 0.0003520287487657679, + "loss": 2.9745, + "step": 27211 + }, + { + "epoch": 1.33, + "grad_norm": 0.6066323518753052, + "learning_rate": 0.0003520135858153854, + "loss": 3.1238, + "step": 27212 + }, + { + "epoch": 1.33, + "grad_norm": 0.5425803661346436, + "learning_rate": 0.000351998422728009, + "loss": 3.1991, + "step": 27213 + }, + { + "epoch": 1.33, + "grad_norm": 0.5908896923065186, + "learning_rate": 0.0003519832595036788, + "loss": 3.1234, + "step": 27214 + }, + { + "epoch": 1.33, + "grad_norm": 0.5824942588806152, + "learning_rate": 0.0003519680961424347, + "loss": 3.0476, + "step": 27215 + }, + { + "epoch": 1.33, + "grad_norm": 0.5757591724395752, + "learning_rate": 0.00035195293264431664, + "loss": 3.0747, + "step": 27216 + }, + { + "epoch": 1.33, + "grad_norm": 0.5482543110847473, + "learning_rate": 0.00035193776900936457, + "loss": 3.2661, + "step": 27217 + }, + { + "epoch": 1.33, + "grad_norm": 0.6184133887290955, + "learning_rate": 0.00035192260523761833, + "loss": 3.3704, + "step": 27218 + }, + { + "epoch": 1.33, + "grad_norm": 0.5994088053703308, + "learning_rate": 0.000351907441329118, + "loss": 2.8227, + "step": 27219 + }, + { + "epoch": 1.33, + "grad_norm": 0.5714132785797119, + "learning_rate": 0.0003518922772839035, + "loss": 3.102, + "step": 27220 + }, + { + "epoch": 1.33, + "grad_norm": 0.5737739205360413, + "learning_rate": 0.00035187711310201463, + "loss": 3.0617, + "step": 27221 + }, + { + "epoch": 1.33, + "grad_norm": 0.5780137777328491, + "learning_rate": 0.0003518619487834915, + "loss": 3.0814, + "step": 27222 + }, + { + "epoch": 1.33, + "grad_norm": 0.5979353785514832, + "learning_rate": 0.00035184678432837396, + "loss": 3.1507, + "step": 27223 + }, + { + "epoch": 1.33, + "grad_norm": 0.5943918824195862, + "learning_rate": 0.000351831619736702, + "loss": 2.9943, + "step": 27224 + }, + { + "epoch": 1.33, + "grad_norm": 0.5988668203353882, + "learning_rate": 0.0003518164550085155, + "loss": 3.1677, + "step": 27225 + }, + { + "epoch": 1.33, + "grad_norm": 0.5408164262771606, + "learning_rate": 0.00035180129014385437, + "loss": 3.1091, + "step": 27226 + }, + { + "epoch": 1.33, + "grad_norm": 0.5693389177322388, + "learning_rate": 0.0003517861251427587, + "loss": 3.2419, + "step": 27227 + }, + { + "epoch": 1.33, + "grad_norm": 0.5514744520187378, + "learning_rate": 0.00035177096000526837, + "loss": 3.0412, + "step": 27228 + }, + { + "epoch": 1.33, + "grad_norm": 0.5758994221687317, + "learning_rate": 0.0003517557947314232, + "loss": 3.0927, + "step": 27229 + }, + { + "epoch": 1.33, + "grad_norm": 0.5742641687393188, + "learning_rate": 0.00035174062932126325, + "loss": 3.0443, + "step": 27230 + }, + { + "epoch": 1.33, + "grad_norm": 0.587285041809082, + "learning_rate": 0.00035172546377482843, + "loss": 3.2015, + "step": 27231 + }, + { + "epoch": 1.33, + "grad_norm": 0.5484117865562439, + "learning_rate": 0.0003517102980921587, + "loss": 3.2877, + "step": 27232 + }, + { + "epoch": 1.33, + "grad_norm": 0.5856320261955261, + "learning_rate": 0.000351695132273294, + "loss": 3.1225, + "step": 27233 + }, + { + "epoch": 1.33, + "grad_norm": 0.5759357810020447, + "learning_rate": 0.00035167996631827435, + "loss": 3.2482, + "step": 27234 + }, + { + "epoch": 1.33, + "grad_norm": 0.556387722492218, + "learning_rate": 0.0003516648002271396, + "loss": 2.9464, + "step": 27235 + }, + { + "epoch": 1.33, + "grad_norm": 0.5765631198883057, + "learning_rate": 0.00035164963399992964, + "loss": 3.0678, + "step": 27236 + }, + { + "epoch": 1.33, + "grad_norm": 0.5664901733398438, + "learning_rate": 0.0003516344676366844, + "loss": 3.1126, + "step": 27237 + }, + { + "epoch": 1.33, + "grad_norm": 0.5461036562919617, + "learning_rate": 0.00035161930113744404, + "loss": 3.123, + "step": 27238 + }, + { + "epoch": 1.33, + "grad_norm": 0.5736780762672424, + "learning_rate": 0.00035160413450224836, + "loss": 3.1065, + "step": 27239 + }, + { + "epoch": 1.33, + "grad_norm": 0.5623251795768738, + "learning_rate": 0.00035158896773113737, + "loss": 3.1294, + "step": 27240 + }, + { + "epoch": 1.34, + "grad_norm": 0.5517589449882507, + "learning_rate": 0.00035157380082415085, + "loss": 3.1211, + "step": 27241 + }, + { + "epoch": 1.34, + "grad_norm": 0.6113983988761902, + "learning_rate": 0.000351558633781329, + "loss": 2.9765, + "step": 27242 + }, + { + "epoch": 1.34, + "grad_norm": 0.5473376512527466, + "learning_rate": 0.00035154346660271144, + "loss": 3.0213, + "step": 27243 + }, + { + "epoch": 1.34, + "grad_norm": 0.5686004161834717, + "learning_rate": 0.00035152829928833846, + "loss": 3.1543, + "step": 27244 + }, + { + "epoch": 1.34, + "grad_norm": 0.5545783042907715, + "learning_rate": 0.0003515131318382498, + "loss": 3.1051, + "step": 27245 + }, + { + "epoch": 1.34, + "grad_norm": 0.5631744861602783, + "learning_rate": 0.0003514979642524855, + "loss": 3.134, + "step": 27246 + }, + { + "epoch": 1.34, + "grad_norm": 0.5703157186508179, + "learning_rate": 0.0003514827965310854, + "loss": 3.1174, + "step": 27247 + }, + { + "epoch": 1.34, + "grad_norm": 0.5550886392593384, + "learning_rate": 0.0003514676286740896, + "loss": 3.1043, + "step": 27248 + }, + { + "epoch": 1.34, + "grad_norm": 0.5608128309249878, + "learning_rate": 0.00035145246068153804, + "loss": 3.0714, + "step": 27249 + }, + { + "epoch": 1.34, + "grad_norm": 0.6270610690116882, + "learning_rate": 0.00035143729255347045, + "loss": 3.0144, + "step": 27250 + }, + { + "epoch": 1.34, + "grad_norm": 0.6169301867485046, + "learning_rate": 0.000351422124289927, + "loss": 2.9064, + "step": 27251 + }, + { + "epoch": 1.34, + "grad_norm": 0.6073354482650757, + "learning_rate": 0.0003514069558909475, + "loss": 3.1812, + "step": 27252 + }, + { + "epoch": 1.34, + "grad_norm": 0.5473766326904297, + "learning_rate": 0.00035139178735657214, + "loss": 3.0509, + "step": 27253 + }, + { + "epoch": 1.34, + "grad_norm": 0.5443681478500366, + "learning_rate": 0.00035137661868684056, + "loss": 3.1905, + "step": 27254 + }, + { + "epoch": 1.34, + "grad_norm": 0.5778075456619263, + "learning_rate": 0.00035136144988179286, + "loss": 2.8105, + "step": 27255 + }, + { + "epoch": 1.34, + "grad_norm": 0.581585168838501, + "learning_rate": 0.00035134628094146903, + "loss": 3.0035, + "step": 27256 + }, + { + "epoch": 1.34, + "grad_norm": 0.6521691679954529, + "learning_rate": 0.0003513311118659089, + "loss": 2.892, + "step": 27257 + }, + { + "epoch": 1.34, + "grad_norm": 0.6202303171157837, + "learning_rate": 0.0003513159426551526, + "loss": 2.8636, + "step": 27258 + }, + { + "epoch": 1.34, + "grad_norm": 0.573544979095459, + "learning_rate": 0.00035130077330923997, + "loss": 3.2428, + "step": 27259 + }, + { + "epoch": 1.34, + "grad_norm": 0.6818689107894897, + "learning_rate": 0.00035128560382821097, + "loss": 2.8362, + "step": 27260 + }, + { + "epoch": 1.34, + "grad_norm": 0.5987098813056946, + "learning_rate": 0.0003512704342121055, + "loss": 2.9345, + "step": 27261 + }, + { + "epoch": 1.34, + "grad_norm": 0.6563383936882019, + "learning_rate": 0.0003512552644609636, + "loss": 2.9929, + "step": 27262 + }, + { + "epoch": 1.34, + "grad_norm": 0.5845065712928772, + "learning_rate": 0.00035124009457482524, + "loss": 3.0538, + "step": 27263 + }, + { + "epoch": 1.34, + "grad_norm": 0.5674929618835449, + "learning_rate": 0.0003512249245537303, + "loss": 3.1348, + "step": 27264 + }, + { + "epoch": 1.34, + "grad_norm": 0.5974243879318237, + "learning_rate": 0.00035120975439771883, + "loss": 3.188, + "step": 27265 + }, + { + "epoch": 1.34, + "grad_norm": 0.5698199272155762, + "learning_rate": 0.0003511945841068306, + "loss": 2.9841, + "step": 27266 + }, + { + "epoch": 1.34, + "grad_norm": 0.5573409795761108, + "learning_rate": 0.0003511794136811058, + "loss": 2.9835, + "step": 27267 + }, + { + "epoch": 1.34, + "grad_norm": 0.5393723249435425, + "learning_rate": 0.00035116424312058416, + "loss": 3.1279, + "step": 27268 + }, + { + "epoch": 1.34, + "grad_norm": 0.5727754235267639, + "learning_rate": 0.0003511490724253058, + "loss": 3.0733, + "step": 27269 + }, + { + "epoch": 1.34, + "grad_norm": 0.5624639391899109, + "learning_rate": 0.00035113390159531067, + "loss": 3.0563, + "step": 27270 + }, + { + "epoch": 1.34, + "grad_norm": 0.5651065707206726, + "learning_rate": 0.0003511187306306387, + "loss": 2.9617, + "step": 27271 + }, + { + "epoch": 1.34, + "grad_norm": 0.5551247000694275, + "learning_rate": 0.00035110355953132976, + "loss": 3.096, + "step": 27272 + }, + { + "epoch": 1.34, + "grad_norm": 0.5362761616706848, + "learning_rate": 0.0003510883882974239, + "loss": 3.133, + "step": 27273 + }, + { + "epoch": 1.34, + "grad_norm": 0.6459551453590393, + "learning_rate": 0.00035107321692896105, + "loss": 2.9986, + "step": 27274 + }, + { + "epoch": 1.34, + "grad_norm": 0.5925512313842773, + "learning_rate": 0.00035105804542598124, + "loss": 3.1903, + "step": 27275 + }, + { + "epoch": 1.34, + "grad_norm": 0.5428948998451233, + "learning_rate": 0.0003510428737885243, + "loss": 2.8865, + "step": 27276 + }, + { + "epoch": 1.34, + "grad_norm": 0.5987960696220398, + "learning_rate": 0.0003510277020166303, + "loss": 3.1261, + "step": 27277 + }, + { + "epoch": 1.34, + "grad_norm": 0.5686396956443787, + "learning_rate": 0.00035101253011033914, + "loss": 2.9944, + "step": 27278 + }, + { + "epoch": 1.34, + "grad_norm": 0.5981069207191467, + "learning_rate": 0.00035099735806969073, + "loss": 3.1263, + "step": 27279 + }, + { + "epoch": 1.34, + "grad_norm": 0.5610902309417725, + "learning_rate": 0.0003509821858947252, + "loss": 2.8438, + "step": 27280 + }, + { + "epoch": 1.34, + "grad_norm": 0.5656661987304688, + "learning_rate": 0.0003509670135854823, + "loss": 3.1531, + "step": 27281 + }, + { + "epoch": 1.34, + "grad_norm": 0.5572855472564697, + "learning_rate": 0.0003509518411420022, + "loss": 3.1005, + "step": 27282 + }, + { + "epoch": 1.34, + "grad_norm": 0.5681395530700684, + "learning_rate": 0.0003509366685643246, + "loss": 3.1331, + "step": 27283 + }, + { + "epoch": 1.34, + "grad_norm": 0.603127121925354, + "learning_rate": 0.0003509214958524897, + "loss": 3.265, + "step": 27284 + }, + { + "epoch": 1.34, + "grad_norm": 0.5976519584655762, + "learning_rate": 0.0003509063230065374, + "loss": 2.9344, + "step": 27285 + }, + { + "epoch": 1.34, + "grad_norm": 0.5389209985733032, + "learning_rate": 0.0003508911500265077, + "loss": 3.0831, + "step": 27286 + }, + { + "epoch": 1.34, + "grad_norm": 0.5420101881027222, + "learning_rate": 0.0003508759769124405, + "loss": 3.0395, + "step": 27287 + }, + { + "epoch": 1.34, + "grad_norm": 0.5472182035446167, + "learning_rate": 0.00035086080366437554, + "loss": 2.9825, + "step": 27288 + }, + { + "epoch": 1.34, + "grad_norm": 0.5901409983634949, + "learning_rate": 0.0003508456302823533, + "loss": 3.0206, + "step": 27289 + }, + { + "epoch": 1.34, + "grad_norm": 0.5679577589035034, + "learning_rate": 0.0003508304567664133, + "loss": 3.061, + "step": 27290 + }, + { + "epoch": 1.34, + "grad_norm": 0.5943844318389893, + "learning_rate": 0.0003508152831165957, + "loss": 2.9039, + "step": 27291 + }, + { + "epoch": 1.34, + "grad_norm": 0.619647741317749, + "learning_rate": 0.0003508001093329404, + "loss": 3.2114, + "step": 27292 + }, + { + "epoch": 1.34, + "grad_norm": 0.5737244486808777, + "learning_rate": 0.0003507849354154874, + "loss": 2.9268, + "step": 27293 + }, + { + "epoch": 1.34, + "grad_norm": 0.574532687664032, + "learning_rate": 0.00035076976136427665, + "loss": 3.3231, + "step": 27294 + }, + { + "epoch": 1.34, + "grad_norm": 0.6363462805747986, + "learning_rate": 0.00035075458717934816, + "loss": 3.227, + "step": 27295 + }, + { + "epoch": 1.34, + "grad_norm": 0.5762098431587219, + "learning_rate": 0.00035073941286074183, + "loss": 3.2721, + "step": 27296 + }, + { + "epoch": 1.34, + "grad_norm": 0.6113587021827698, + "learning_rate": 0.0003507242384084977, + "loss": 2.9124, + "step": 27297 + }, + { + "epoch": 1.34, + "grad_norm": 0.5779880881309509, + "learning_rate": 0.00035070906382265554, + "loss": 3.2056, + "step": 27298 + }, + { + "epoch": 1.34, + "grad_norm": 0.5307174921035767, + "learning_rate": 0.00035069388910325555, + "loss": 2.8949, + "step": 27299 + }, + { + "epoch": 1.34, + "grad_norm": 0.5478222966194153, + "learning_rate": 0.0003506787142503377, + "loss": 3.1332, + "step": 27300 + }, + { + "epoch": 1.34, + "grad_norm": 0.5192190408706665, + "learning_rate": 0.0003506635392639418, + "loss": 3.0535, + "step": 27301 + }, + { + "epoch": 1.34, + "grad_norm": 0.5842856168746948, + "learning_rate": 0.00035064836414410783, + "loss": 2.8508, + "step": 27302 + }, + { + "epoch": 1.34, + "grad_norm": 0.5742753148078918, + "learning_rate": 0.00035063318889087586, + "loss": 3.1244, + "step": 27303 + }, + { + "epoch": 1.34, + "grad_norm": 0.576598048210144, + "learning_rate": 0.0003506180135042859, + "loss": 3.0379, + "step": 27304 + }, + { + "epoch": 1.34, + "grad_norm": 0.5685365796089172, + "learning_rate": 0.0003506028379843777, + "loss": 3.1901, + "step": 27305 + }, + { + "epoch": 1.34, + "grad_norm": 0.6361297965049744, + "learning_rate": 0.0003505876623311914, + "loss": 3.3425, + "step": 27306 + }, + { + "epoch": 1.34, + "grad_norm": 0.6099052429199219, + "learning_rate": 0.00035057248654476694, + "loss": 3.1613, + "step": 27307 + }, + { + "epoch": 1.34, + "grad_norm": 0.5758441686630249, + "learning_rate": 0.0003505573106251443, + "loss": 3.2276, + "step": 27308 + }, + { + "epoch": 1.34, + "grad_norm": 0.5512416958808899, + "learning_rate": 0.00035054213457236347, + "loss": 3.0823, + "step": 27309 + }, + { + "epoch": 1.34, + "grad_norm": 0.5952394008636475, + "learning_rate": 0.0003505269583864643, + "loss": 3.3958, + "step": 27310 + }, + { + "epoch": 1.34, + "grad_norm": 0.569972813129425, + "learning_rate": 0.00035051178206748696, + "loss": 3.049, + "step": 27311 + }, + { + "epoch": 1.34, + "grad_norm": 0.5571011304855347, + "learning_rate": 0.00035049660561547124, + "loss": 3.2249, + "step": 27312 + }, + { + "epoch": 1.34, + "grad_norm": 0.6160045862197876, + "learning_rate": 0.0003504814290304572, + "loss": 3.3971, + "step": 27313 + }, + { + "epoch": 1.34, + "grad_norm": 0.6157644987106323, + "learning_rate": 0.0003504662523124848, + "loss": 2.9801, + "step": 27314 + }, + { + "epoch": 1.34, + "grad_norm": 0.5774762034416199, + "learning_rate": 0.00035045107546159395, + "loss": 3.0074, + "step": 27315 + }, + { + "epoch": 1.34, + "grad_norm": 0.6000568270683289, + "learning_rate": 0.00035043589847782465, + "loss": 2.9885, + "step": 27316 + }, + { + "epoch": 1.34, + "grad_norm": 0.5656048655509949, + "learning_rate": 0.00035042072136121696, + "loss": 3.1387, + "step": 27317 + }, + { + "epoch": 1.34, + "grad_norm": 0.613006055355072, + "learning_rate": 0.0003504055441118108, + "loss": 3.0847, + "step": 27318 + }, + { + "epoch": 1.34, + "grad_norm": 0.5949397683143616, + "learning_rate": 0.00035039036672964607, + "loss": 3.1484, + "step": 27319 + }, + { + "epoch": 1.34, + "grad_norm": 0.5421934127807617, + "learning_rate": 0.00035037518921476283, + "loss": 3.214, + "step": 27320 + }, + { + "epoch": 1.34, + "grad_norm": 0.574578046798706, + "learning_rate": 0.0003503600115672011, + "loss": 3.0972, + "step": 27321 + }, + { + "epoch": 1.34, + "grad_norm": 0.5909102559089661, + "learning_rate": 0.0003503448337870008, + "loss": 3.0916, + "step": 27322 + }, + { + "epoch": 1.34, + "grad_norm": 0.5599302649497986, + "learning_rate": 0.00035032965587420187, + "loss": 2.9846, + "step": 27323 + }, + { + "epoch": 1.34, + "grad_norm": 0.558342695236206, + "learning_rate": 0.0003503144778288442, + "loss": 3.1657, + "step": 27324 + }, + { + "epoch": 1.34, + "grad_norm": 0.6576507091522217, + "learning_rate": 0.000350299299650968, + "loss": 2.9349, + "step": 27325 + }, + { + "epoch": 1.34, + "grad_norm": 0.5906968712806702, + "learning_rate": 0.00035028412134061314, + "loss": 2.8856, + "step": 27326 + }, + { + "epoch": 1.34, + "grad_norm": 0.53647381067276, + "learning_rate": 0.0003502689428978196, + "loss": 3.2723, + "step": 27327 + }, + { + "epoch": 1.34, + "grad_norm": 0.8482022285461426, + "learning_rate": 0.00035025376432262724, + "loss": 3.0158, + "step": 27328 + }, + { + "epoch": 1.34, + "grad_norm": 0.603201687335968, + "learning_rate": 0.0003502385856150762, + "loss": 3.1107, + "step": 27329 + }, + { + "epoch": 1.34, + "grad_norm": 0.551382303237915, + "learning_rate": 0.0003502234067752064, + "loss": 3.1786, + "step": 27330 + }, + { + "epoch": 1.34, + "grad_norm": 0.5987643003463745, + "learning_rate": 0.0003502082278030577, + "loss": 3.0718, + "step": 27331 + }, + { + "epoch": 1.34, + "grad_norm": 0.5653188228607178, + "learning_rate": 0.0003501930486986703, + "loss": 2.962, + "step": 27332 + }, + { + "epoch": 1.34, + "grad_norm": 0.5514883399009705, + "learning_rate": 0.0003501778694620841, + "loss": 3.1602, + "step": 27333 + }, + { + "epoch": 1.34, + "grad_norm": 0.571922242641449, + "learning_rate": 0.0003501626900933389, + "loss": 3.0968, + "step": 27334 + }, + { + "epoch": 1.34, + "grad_norm": 0.5911169648170471, + "learning_rate": 0.0003501475105924749, + "loss": 3.297, + "step": 27335 + }, + { + "epoch": 1.34, + "grad_norm": 0.5552850365638733, + "learning_rate": 0.0003501323309595321, + "loss": 3.193, + "step": 27336 + }, + { + "epoch": 1.34, + "grad_norm": 0.582349419593811, + "learning_rate": 0.00035011715119455036, + "loss": 2.9565, + "step": 27337 + }, + { + "epoch": 1.34, + "grad_norm": 0.5713590383529663, + "learning_rate": 0.0003501019712975696, + "loss": 3.1794, + "step": 27338 + }, + { + "epoch": 1.34, + "grad_norm": 0.5552675127983093, + "learning_rate": 0.00035008679126862993, + "loss": 2.889, + "step": 27339 + }, + { + "epoch": 1.34, + "grad_norm": 0.548956573009491, + "learning_rate": 0.00035007161110777135, + "loss": 3.2242, + "step": 27340 + }, + { + "epoch": 1.34, + "grad_norm": 0.5811518430709839, + "learning_rate": 0.0003500564308150337, + "loss": 3.1934, + "step": 27341 + }, + { + "epoch": 1.34, + "grad_norm": 0.5775765776634216, + "learning_rate": 0.0003500412503904571, + "loss": 3.0469, + "step": 27342 + }, + { + "epoch": 1.34, + "grad_norm": 0.5719749927520752, + "learning_rate": 0.00035002606983408147, + "loss": 2.9931, + "step": 27343 + }, + { + "epoch": 1.34, + "grad_norm": 0.5796716809272766, + "learning_rate": 0.00035001088914594675, + "loss": 3.1023, + "step": 27344 + }, + { + "epoch": 1.34, + "grad_norm": 0.5906006693840027, + "learning_rate": 0.000349995708326093, + "loss": 3.2135, + "step": 27345 + }, + { + "epoch": 1.34, + "grad_norm": 0.58286052942276, + "learning_rate": 0.0003499805273745602, + "loss": 3.2402, + "step": 27346 + }, + { + "epoch": 1.34, + "grad_norm": 0.5934612154960632, + "learning_rate": 0.00034996534629138824, + "loss": 3.0281, + "step": 27347 + }, + { + "epoch": 1.34, + "grad_norm": 0.5473476648330688, + "learning_rate": 0.00034995016507661733, + "loss": 3.0496, + "step": 27348 + }, + { + "epoch": 1.34, + "grad_norm": 0.5469577312469482, + "learning_rate": 0.0003499349837302871, + "loss": 3.2524, + "step": 27349 + }, + { + "epoch": 1.34, + "grad_norm": 0.6002890467643738, + "learning_rate": 0.0003499198022524379, + "loss": 3.1881, + "step": 27350 + }, + { + "epoch": 1.34, + "grad_norm": 0.564316987991333, + "learning_rate": 0.00034990462064310945, + "loss": 3.2093, + "step": 27351 + }, + { + "epoch": 1.34, + "grad_norm": 0.5971184968948364, + "learning_rate": 0.00034988943890234186, + "loss": 2.937, + "step": 27352 + }, + { + "epoch": 1.34, + "grad_norm": 0.5803214311599731, + "learning_rate": 0.0003498742570301751, + "loss": 2.9276, + "step": 27353 + }, + { + "epoch": 1.34, + "grad_norm": 0.5375666618347168, + "learning_rate": 0.00034985907502664917, + "loss": 3.1482, + "step": 27354 + }, + { + "epoch": 1.34, + "grad_norm": 0.60368412733078, + "learning_rate": 0.00034984389289180397, + "loss": 3.1235, + "step": 27355 + }, + { + "epoch": 1.34, + "grad_norm": 0.6041290760040283, + "learning_rate": 0.00034982871062567956, + "loss": 3.1815, + "step": 27356 + }, + { + "epoch": 1.34, + "grad_norm": 0.5556178092956543, + "learning_rate": 0.00034981352822831594, + "loss": 3.0318, + "step": 27357 + }, + { + "epoch": 1.34, + "grad_norm": 0.5745914578437805, + "learning_rate": 0.000349798345699753, + "loss": 2.7708, + "step": 27358 + }, + { + "epoch": 1.34, + "grad_norm": 0.5874989628791809, + "learning_rate": 0.00034978316304003096, + "loss": 2.9932, + "step": 27359 + }, + { + "epoch": 1.34, + "grad_norm": 0.5912497043609619, + "learning_rate": 0.0003497679802491895, + "loss": 2.8349, + "step": 27360 + }, + { + "epoch": 1.34, + "grad_norm": 0.5921602845191956, + "learning_rate": 0.0003497527973272688, + "loss": 2.8376, + "step": 27361 + }, + { + "epoch": 1.34, + "grad_norm": 0.5861170887947083, + "learning_rate": 0.0003497376142743089, + "loss": 2.9823, + "step": 27362 + }, + { + "epoch": 1.34, + "grad_norm": 0.6155903339385986, + "learning_rate": 0.00034972243109034957, + "loss": 3.0171, + "step": 27363 + }, + { + "epoch": 1.34, + "grad_norm": 0.5605053901672363, + "learning_rate": 0.00034970724777543097, + "loss": 3.1852, + "step": 27364 + }, + { + "epoch": 1.34, + "grad_norm": 0.5607673525810242, + "learning_rate": 0.0003496920643295931, + "loss": 3.17, + "step": 27365 + }, + { + "epoch": 1.34, + "grad_norm": 0.5722825527191162, + "learning_rate": 0.00034967688075287584, + "loss": 3.1404, + "step": 27366 + }, + { + "epoch": 1.34, + "grad_norm": 0.5873673558235168, + "learning_rate": 0.0003496616970453192, + "loss": 3.0477, + "step": 27367 + }, + { + "epoch": 1.34, + "grad_norm": 0.6069886088371277, + "learning_rate": 0.00034964651320696326, + "loss": 2.9818, + "step": 27368 + }, + { + "epoch": 1.34, + "grad_norm": 0.5635610222816467, + "learning_rate": 0.00034963132923784796, + "loss": 2.8535, + "step": 27369 + }, + { + "epoch": 1.34, + "grad_norm": 0.5699170231819153, + "learning_rate": 0.0003496161451380133, + "loss": 3.075, + "step": 27370 + }, + { + "epoch": 1.34, + "grad_norm": 0.5741016268730164, + "learning_rate": 0.00034960096090749923, + "loss": 3.0363, + "step": 27371 + }, + { + "epoch": 1.34, + "grad_norm": 0.5453298687934875, + "learning_rate": 0.00034958577654634575, + "loss": 2.9578, + "step": 27372 + }, + { + "epoch": 1.34, + "grad_norm": 0.577882707118988, + "learning_rate": 0.000349570592054593, + "loss": 2.9079, + "step": 27373 + }, + { + "epoch": 1.34, + "grad_norm": 0.5363011360168457, + "learning_rate": 0.0003495554074322807, + "loss": 3.054, + "step": 27374 + }, + { + "epoch": 1.34, + "grad_norm": 0.5371752381324768, + "learning_rate": 0.000349540222679449, + "loss": 3.0363, + "step": 27375 + }, + { + "epoch": 1.34, + "grad_norm": 0.5992576479911804, + "learning_rate": 0.000349525037796138, + "loss": 3.2143, + "step": 27376 + }, + { + "epoch": 1.34, + "grad_norm": 0.5904133319854736, + "learning_rate": 0.00034950985278238753, + "loss": 3.2084, + "step": 27377 + }, + { + "epoch": 1.34, + "grad_norm": 0.6310132145881653, + "learning_rate": 0.00034949466763823766, + "loss": 2.872, + "step": 27378 + }, + { + "epoch": 1.34, + "grad_norm": 0.5347175002098083, + "learning_rate": 0.0003494794823637283, + "loss": 2.8966, + "step": 27379 + }, + { + "epoch": 1.34, + "grad_norm": 0.5774542689323425, + "learning_rate": 0.00034946429695889954, + "loss": 3.0197, + "step": 27380 + }, + { + "epoch": 1.34, + "grad_norm": 0.6394713521003723, + "learning_rate": 0.00034944911142379136, + "loss": 3.0879, + "step": 27381 + }, + { + "epoch": 1.34, + "grad_norm": 0.5754245519638062, + "learning_rate": 0.0003494339257584437, + "loss": 3.056, + "step": 27382 + }, + { + "epoch": 1.34, + "grad_norm": 0.5918018817901611, + "learning_rate": 0.0003494187399628966, + "loss": 3.0525, + "step": 27383 + }, + { + "epoch": 1.34, + "grad_norm": 0.5479419231414795, + "learning_rate": 0.0003494035540371901, + "loss": 3.0767, + "step": 27384 + }, + { + "epoch": 1.34, + "grad_norm": 0.5651912093162537, + "learning_rate": 0.0003493883679813641, + "loss": 3.0674, + "step": 27385 + }, + { + "epoch": 1.34, + "grad_norm": 0.610517680644989, + "learning_rate": 0.0003493731817954586, + "loss": 2.9939, + "step": 27386 + }, + { + "epoch": 1.34, + "grad_norm": 0.5542714595794678, + "learning_rate": 0.0003493579954795137, + "loss": 2.9815, + "step": 27387 + }, + { + "epoch": 1.34, + "grad_norm": 0.5359601974487305, + "learning_rate": 0.0003493428090335694, + "loss": 3.1892, + "step": 27388 + }, + { + "epoch": 1.34, + "grad_norm": 0.5919166207313538, + "learning_rate": 0.00034932762245766557, + "loss": 3.1598, + "step": 27389 + }, + { + "epoch": 1.34, + "grad_norm": 0.5871334075927734, + "learning_rate": 0.0003493124357518422, + "loss": 3.1153, + "step": 27390 + }, + { + "epoch": 1.34, + "grad_norm": 0.5805013179779053, + "learning_rate": 0.0003492972489161395, + "loss": 3.0343, + "step": 27391 + }, + { + "epoch": 1.34, + "grad_norm": 0.6670655012130737, + "learning_rate": 0.0003492820619505973, + "loss": 3.1204, + "step": 27392 + }, + { + "epoch": 1.34, + "grad_norm": 0.5655217170715332, + "learning_rate": 0.00034926687485525555, + "loss": 2.9918, + "step": 27393 + }, + { + "epoch": 1.34, + "grad_norm": 0.5714253187179565, + "learning_rate": 0.00034925168763015444, + "loss": 3.0133, + "step": 27394 + }, + { + "epoch": 1.34, + "grad_norm": 0.6281166076660156, + "learning_rate": 0.0003492365002753338, + "loss": 3.1404, + "step": 27395 + }, + { + "epoch": 1.34, + "grad_norm": 0.6088977456092834, + "learning_rate": 0.00034922131279083364, + "loss": 2.8393, + "step": 27396 + }, + { + "epoch": 1.34, + "grad_norm": 0.6364918351173401, + "learning_rate": 0.0003492061251766941, + "loss": 3.0798, + "step": 27397 + }, + { + "epoch": 1.34, + "grad_norm": 0.5799409747123718, + "learning_rate": 0.0003491909374329551, + "loss": 2.9797, + "step": 27398 + }, + { + "epoch": 1.34, + "grad_norm": 0.5414788126945496, + "learning_rate": 0.00034917574955965666, + "loss": 3.2742, + "step": 27399 + }, + { + "epoch": 1.34, + "grad_norm": 0.5648382902145386, + "learning_rate": 0.00034916056155683873, + "loss": 3.2028, + "step": 27400 + }, + { + "epoch": 1.34, + "grad_norm": 0.5464253425598145, + "learning_rate": 0.00034914537342454127, + "loss": 3.0764, + "step": 27401 + }, + { + "epoch": 1.34, + "grad_norm": 0.5706402063369751, + "learning_rate": 0.00034913018516280443, + "loss": 3.0302, + "step": 27402 + }, + { + "epoch": 1.34, + "grad_norm": 0.5839251279830933, + "learning_rate": 0.0003491149967716681, + "loss": 3.1927, + "step": 27403 + }, + { + "epoch": 1.34, + "grad_norm": 0.5891869068145752, + "learning_rate": 0.0003490998082511724, + "loss": 3.1018, + "step": 27404 + }, + { + "epoch": 1.34, + "grad_norm": 0.6145084500312805, + "learning_rate": 0.00034908461960135714, + "loss": 3.0066, + "step": 27405 + }, + { + "epoch": 1.34, + "grad_norm": 0.6099284887313843, + "learning_rate": 0.0003490694308222625, + "loss": 2.9118, + "step": 27406 + }, + { + "epoch": 1.34, + "grad_norm": 0.5804110765457153, + "learning_rate": 0.0003490542419139284, + "loss": 3.0253, + "step": 27407 + }, + { + "epoch": 1.34, + "grad_norm": 0.6241898536682129, + "learning_rate": 0.0003490390528763949, + "loss": 3.0083, + "step": 27408 + }, + { + "epoch": 1.34, + "grad_norm": 0.5708284974098206, + "learning_rate": 0.00034902386370970197, + "loss": 3.1177, + "step": 27409 + }, + { + "epoch": 1.34, + "grad_norm": 0.5617969036102295, + "learning_rate": 0.00034900867441388963, + "loss": 3.1507, + "step": 27410 + }, + { + "epoch": 1.34, + "grad_norm": 0.5512824058532715, + "learning_rate": 0.0003489934849889978, + "loss": 3.2521, + "step": 27411 + }, + { + "epoch": 1.34, + "grad_norm": 0.5705128908157349, + "learning_rate": 0.00034897829543506666, + "loss": 2.9359, + "step": 27412 + }, + { + "epoch": 1.34, + "grad_norm": 0.5484633445739746, + "learning_rate": 0.00034896310575213604, + "loss": 3.2642, + "step": 27413 + }, + { + "epoch": 1.34, + "grad_norm": 0.6320682764053345, + "learning_rate": 0.00034894791594024607, + "loss": 3.0843, + "step": 27414 + }, + { + "epoch": 1.34, + "grad_norm": 0.5714908242225647, + "learning_rate": 0.0003489327259994367, + "loss": 3.2425, + "step": 27415 + }, + { + "epoch": 1.34, + "grad_norm": 0.5834794640541077, + "learning_rate": 0.0003489175359297479, + "loss": 3.0362, + "step": 27416 + }, + { + "epoch": 1.34, + "grad_norm": 0.580422580242157, + "learning_rate": 0.0003489023457312198, + "loss": 3.0221, + "step": 27417 + }, + { + "epoch": 1.34, + "grad_norm": 0.5791642665863037, + "learning_rate": 0.00034888715540389226, + "loss": 3.1245, + "step": 27418 + }, + { + "epoch": 1.34, + "grad_norm": 0.5940735936164856, + "learning_rate": 0.00034887196494780536, + "loss": 3.1225, + "step": 27419 + }, + { + "epoch": 1.34, + "grad_norm": 0.5686174035072327, + "learning_rate": 0.00034885677436299916, + "loss": 2.943, + "step": 27420 + }, + { + "epoch": 1.34, + "grad_norm": 0.5724006295204163, + "learning_rate": 0.00034884158364951356, + "loss": 3.0081, + "step": 27421 + }, + { + "epoch": 1.34, + "grad_norm": 0.646140456199646, + "learning_rate": 0.0003488263928073887, + "loss": 3.0739, + "step": 27422 + }, + { + "epoch": 1.34, + "grad_norm": 0.6348581314086914, + "learning_rate": 0.0003488112018366645, + "loss": 3.2155, + "step": 27423 + }, + { + "epoch": 1.34, + "grad_norm": 0.5576120018959045, + "learning_rate": 0.000348796010737381, + "loss": 3.326, + "step": 27424 + }, + { + "epoch": 1.34, + "grad_norm": 0.5346035361289978, + "learning_rate": 0.0003487808195095782, + "loss": 2.9394, + "step": 27425 + }, + { + "epoch": 1.34, + "grad_norm": 0.6104670763015747, + "learning_rate": 0.000348765628153296, + "loss": 3.2695, + "step": 27426 + }, + { + "epoch": 1.34, + "grad_norm": 0.5855081081390381, + "learning_rate": 0.0003487504366685747, + "loss": 3.1842, + "step": 27427 + }, + { + "epoch": 1.34, + "grad_norm": 0.604956865310669, + "learning_rate": 0.00034873524505545404, + "loss": 2.8405, + "step": 27428 + }, + { + "epoch": 1.34, + "grad_norm": 0.5948700904846191, + "learning_rate": 0.0003487200533139741, + "loss": 3.2639, + "step": 27429 + }, + { + "epoch": 1.34, + "grad_norm": 0.5722827911376953, + "learning_rate": 0.00034870486144417486, + "loss": 3.0473, + "step": 27430 + }, + { + "epoch": 1.34, + "grad_norm": 0.5703803300857544, + "learning_rate": 0.0003486896694460965, + "loss": 3.1484, + "step": 27431 + }, + { + "epoch": 1.34, + "grad_norm": 0.5681774616241455, + "learning_rate": 0.00034867447731977885, + "loss": 2.9344, + "step": 27432 + }, + { + "epoch": 1.34, + "grad_norm": 0.5943686962127686, + "learning_rate": 0.00034865928506526206, + "loss": 3.2241, + "step": 27433 + }, + { + "epoch": 1.34, + "grad_norm": 0.5889038443565369, + "learning_rate": 0.00034864409268258606, + "loss": 3.0752, + "step": 27434 + }, + { + "epoch": 1.34, + "grad_norm": 0.551078736782074, + "learning_rate": 0.0003486289001717909, + "loss": 3.0876, + "step": 27435 + }, + { + "epoch": 1.34, + "grad_norm": 0.5845509767532349, + "learning_rate": 0.00034861370753291654, + "loss": 3.1042, + "step": 27436 + }, + { + "epoch": 1.34, + "grad_norm": 0.5808750987052917, + "learning_rate": 0.000348598514766003, + "loss": 3.0015, + "step": 27437 + }, + { + "epoch": 1.34, + "grad_norm": 0.5626875758171082, + "learning_rate": 0.00034858332187109035, + "loss": 3.1874, + "step": 27438 + }, + { + "epoch": 1.34, + "grad_norm": 0.5855140089988708, + "learning_rate": 0.0003485681288482186, + "loss": 3.0787, + "step": 27439 + }, + { + "epoch": 1.34, + "grad_norm": 0.5767703056335449, + "learning_rate": 0.00034855293569742776, + "loss": 3.1796, + "step": 27440 + }, + { + "epoch": 1.34, + "grad_norm": 0.5492141246795654, + "learning_rate": 0.0003485377424187578, + "loss": 2.7963, + "step": 27441 + }, + { + "epoch": 1.34, + "grad_norm": 0.5425765514373779, + "learning_rate": 0.0003485225490122488, + "loss": 3.1762, + "step": 27442 + }, + { + "epoch": 1.34, + "grad_norm": 0.5987265706062317, + "learning_rate": 0.00034850735547794066, + "loss": 3.1511, + "step": 27443 + }, + { + "epoch": 1.34, + "grad_norm": 0.5857033729553223, + "learning_rate": 0.00034849216181587363, + "loss": 3.1447, + "step": 27444 + }, + { + "epoch": 1.35, + "grad_norm": 0.6146132946014404, + "learning_rate": 0.0003484769680260874, + "loss": 3.1278, + "step": 27445 + }, + { + "epoch": 1.35, + "grad_norm": 0.5451048016548157, + "learning_rate": 0.00034846177410862236, + "loss": 3.1896, + "step": 27446 + }, + { + "epoch": 1.35, + "grad_norm": 0.6035172343254089, + "learning_rate": 0.00034844658006351814, + "loss": 3.1108, + "step": 27447 + }, + { + "epoch": 1.35, + "grad_norm": 0.5181143283843994, + "learning_rate": 0.0003484313858908151, + "loss": 3.073, + "step": 27448 + }, + { + "epoch": 1.35, + "grad_norm": 0.5882478356361389, + "learning_rate": 0.000348416191590553, + "loss": 3.1133, + "step": 27449 + }, + { + "epoch": 1.35, + "grad_norm": 0.5757107138633728, + "learning_rate": 0.00034840099716277207, + "loss": 3.1586, + "step": 27450 + }, + { + "epoch": 1.35, + "grad_norm": 0.5847183465957642, + "learning_rate": 0.0003483858026075122, + "loss": 3.0846, + "step": 27451 + }, + { + "epoch": 1.35, + "grad_norm": 0.6361482739448547, + "learning_rate": 0.00034837060792481333, + "loss": 3.0394, + "step": 27452 + }, + { + "epoch": 1.35, + "grad_norm": 0.5674738883972168, + "learning_rate": 0.00034835541311471573, + "loss": 2.8753, + "step": 27453 + }, + { + "epoch": 1.35, + "grad_norm": 0.5853537321090698, + "learning_rate": 0.00034834021817725923, + "loss": 2.982, + "step": 27454 + }, + { + "epoch": 1.35, + "grad_norm": 0.6073134541511536, + "learning_rate": 0.00034832502311248385, + "loss": 3.1412, + "step": 27455 + }, + { + "epoch": 1.35, + "grad_norm": 0.5832349061965942, + "learning_rate": 0.00034830982792042974, + "loss": 3.0898, + "step": 27456 + }, + { + "epoch": 1.35, + "grad_norm": 0.5568851828575134, + "learning_rate": 0.00034829463260113675, + "loss": 2.9267, + "step": 27457 + }, + { + "epoch": 1.35, + "grad_norm": 0.5909299850463867, + "learning_rate": 0.00034827943715464506, + "loss": 2.9748, + "step": 27458 + }, + { + "epoch": 1.35, + "grad_norm": 0.5588631629943848, + "learning_rate": 0.0003482642415809946, + "loss": 2.9251, + "step": 27459 + }, + { + "epoch": 1.35, + "grad_norm": 0.5427162051200867, + "learning_rate": 0.0003482490458802254, + "loss": 3.0714, + "step": 27460 + }, + { + "epoch": 1.35, + "grad_norm": 0.5868051052093506, + "learning_rate": 0.00034823385005237765, + "loss": 3.2121, + "step": 27461 + }, + { + "epoch": 1.35, + "grad_norm": 0.5954635143280029, + "learning_rate": 0.00034821865409749103, + "loss": 3.2533, + "step": 27462 + }, + { + "epoch": 1.35, + "grad_norm": 0.5590815544128418, + "learning_rate": 0.0003482034580156058, + "loss": 3.1354, + "step": 27463 + }, + { + "epoch": 1.35, + "grad_norm": 0.5484957695007324, + "learning_rate": 0.00034818826180676196, + "loss": 2.9512, + "step": 27464 + }, + { + "epoch": 1.35, + "grad_norm": 0.6143427491188049, + "learning_rate": 0.0003481730654709995, + "loss": 2.9583, + "step": 27465 + }, + { + "epoch": 1.35, + "grad_norm": 0.602432131767273, + "learning_rate": 0.0003481578690083585, + "loss": 3.1544, + "step": 27466 + }, + { + "epoch": 1.35, + "grad_norm": 0.5481232404708862, + "learning_rate": 0.0003481426724188789, + "loss": 3.1557, + "step": 27467 + }, + { + "epoch": 1.35, + "grad_norm": 0.5532791018486023, + "learning_rate": 0.00034812747570260073, + "loss": 3.0697, + "step": 27468 + }, + { + "epoch": 1.35, + "grad_norm": 0.5800402164459229, + "learning_rate": 0.0003481122788595641, + "loss": 2.9307, + "step": 27469 + }, + { + "epoch": 1.35, + "grad_norm": 0.6530680656433105, + "learning_rate": 0.00034809708188980896, + "loss": 3.2462, + "step": 27470 + }, + { + "epoch": 1.35, + "grad_norm": 0.5887762308120728, + "learning_rate": 0.0003480818847933754, + "loss": 3.0021, + "step": 27471 + }, + { + "epoch": 1.35, + "grad_norm": 0.5353226661682129, + "learning_rate": 0.0003480666875703035, + "loss": 3.0513, + "step": 27472 + }, + { + "epoch": 1.35, + "grad_norm": 0.5689337253570557, + "learning_rate": 0.00034805149022063296, + "loss": 3.1072, + "step": 27473 + }, + { + "epoch": 1.35, + "grad_norm": 0.5485574007034302, + "learning_rate": 0.0003480362927444042, + "loss": 3.1626, + "step": 27474 + }, + { + "epoch": 1.35, + "grad_norm": 0.6580893993377686, + "learning_rate": 0.0003480210951416571, + "loss": 3.0229, + "step": 27475 + }, + { + "epoch": 1.35, + "grad_norm": 0.6442924737930298, + "learning_rate": 0.00034800589741243167, + "loss": 3.077, + "step": 27476 + }, + { + "epoch": 1.35, + "grad_norm": 0.5990310311317444, + "learning_rate": 0.00034799069955676794, + "loss": 3.1217, + "step": 27477 + }, + { + "epoch": 1.35, + "grad_norm": 0.5739229321479797, + "learning_rate": 0.0003479755015747059, + "loss": 3.1821, + "step": 27478 + }, + { + "epoch": 1.35, + "grad_norm": 0.6192485690116882, + "learning_rate": 0.0003479603034662856, + "loss": 3.071, + "step": 27479 + }, + { + "epoch": 1.35, + "grad_norm": 0.5898162722587585, + "learning_rate": 0.0003479451052315471, + "loss": 3.1133, + "step": 27480 + }, + { + "epoch": 1.35, + "grad_norm": 0.5484324097633362, + "learning_rate": 0.00034792990687053046, + "loss": 2.8319, + "step": 27481 + }, + { + "epoch": 1.35, + "grad_norm": 0.5545966625213623, + "learning_rate": 0.0003479147083832757, + "loss": 3.0854, + "step": 27482 + }, + { + "epoch": 1.35, + "grad_norm": 0.5734389424324036, + "learning_rate": 0.00034789950976982275, + "loss": 3.2265, + "step": 27483 + }, + { + "epoch": 1.35, + "grad_norm": 0.6044603586196899, + "learning_rate": 0.00034788431103021175, + "loss": 2.8291, + "step": 27484 + }, + { + "epoch": 1.35, + "grad_norm": 0.5482417345046997, + "learning_rate": 0.00034786911216448267, + "loss": 3.1032, + "step": 27485 + }, + { + "epoch": 1.35, + "grad_norm": 0.5807044506072998, + "learning_rate": 0.0003478539131726757, + "loss": 3.2418, + "step": 27486 + }, + { + "epoch": 1.35, + "grad_norm": 0.5764628648757935, + "learning_rate": 0.00034783871405483056, + "loss": 3.1398, + "step": 27487 + }, + { + "epoch": 1.35, + "grad_norm": 0.5609407424926758, + "learning_rate": 0.00034782351481098744, + "loss": 3.0961, + "step": 27488 + }, + { + "epoch": 1.35, + "grad_norm": 0.5483243465423584, + "learning_rate": 0.0003478083154411865, + "loss": 3.3225, + "step": 27489 + }, + { + "epoch": 1.35, + "grad_norm": 0.5866134166717529, + "learning_rate": 0.0003477931159454675, + "loss": 3.1778, + "step": 27490 + }, + { + "epoch": 1.35, + "grad_norm": 0.5812581181526184, + "learning_rate": 0.0003477779163238708, + "loss": 2.9979, + "step": 27491 + }, + { + "epoch": 1.35, + "grad_norm": 0.5664998292922974, + "learning_rate": 0.00034776271657643615, + "loss": 3.0774, + "step": 27492 + }, + { + "epoch": 1.35, + "grad_norm": 0.6252369284629822, + "learning_rate": 0.0003477475167032038, + "loss": 2.8475, + "step": 27493 + }, + { + "epoch": 1.35, + "grad_norm": 0.647500216960907, + "learning_rate": 0.0003477323167042136, + "loss": 2.954, + "step": 27494 + }, + { + "epoch": 1.35, + "grad_norm": 0.5603214502334595, + "learning_rate": 0.0003477171165795057, + "loss": 2.9279, + "step": 27495 + }, + { + "epoch": 1.35, + "grad_norm": 0.5812236070632935, + "learning_rate": 0.00034770191632912, + "loss": 3.0352, + "step": 27496 + }, + { + "epoch": 1.35, + "grad_norm": 0.5692586898803711, + "learning_rate": 0.0003476867159530968, + "loss": 3.1431, + "step": 27497 + }, + { + "epoch": 1.35, + "grad_norm": 0.5523695945739746, + "learning_rate": 0.0003476715154514758, + "loss": 3.2377, + "step": 27498 + }, + { + "epoch": 1.35, + "grad_norm": 0.5964672565460205, + "learning_rate": 0.00034765631482429733, + "loss": 3.2893, + "step": 27499 + }, + { + "epoch": 1.35, + "grad_norm": 0.601300835609436, + "learning_rate": 0.0003476411140716012, + "loss": 2.9938, + "step": 27500 + }, + { + "epoch": 1.35, + "grad_norm": 0.5985939502716064, + "learning_rate": 0.0003476259131934277, + "loss": 3.0204, + "step": 27501 + }, + { + "epoch": 1.35, + "grad_norm": 0.5791720151901245, + "learning_rate": 0.0003476107121898166, + "loss": 3.0052, + "step": 27502 + }, + { + "epoch": 1.35, + "grad_norm": 0.6213023662567139, + "learning_rate": 0.00034759551106080795, + "loss": 3.0617, + "step": 27503 + }, + { + "epoch": 1.35, + "grad_norm": 0.5528647899627686, + "learning_rate": 0.0003475803098064421, + "loss": 3.024, + "step": 27504 + }, + { + "epoch": 1.35, + "grad_norm": 0.5452709197998047, + "learning_rate": 0.0003475651084267587, + "loss": 3.1488, + "step": 27505 + }, + { + "epoch": 1.35, + "grad_norm": 0.5762589573860168, + "learning_rate": 0.000347549906921798, + "loss": 3.0249, + "step": 27506 + }, + { + "epoch": 1.35, + "grad_norm": 0.6029790043830872, + "learning_rate": 0.00034753470529160006, + "loss": 3.1336, + "step": 27507 + }, + { + "epoch": 1.35, + "grad_norm": 0.5660719871520996, + "learning_rate": 0.00034751950353620494, + "loss": 3.1616, + "step": 27508 + }, + { + "epoch": 1.35, + "grad_norm": 0.5759086012840271, + "learning_rate": 0.00034750430165565233, + "loss": 3.1031, + "step": 27509 + }, + { + "epoch": 1.35, + "grad_norm": 0.5439847111701965, + "learning_rate": 0.00034748909964998264, + "loss": 3.1131, + "step": 27510 + }, + { + "epoch": 1.35, + "grad_norm": 0.6624140739440918, + "learning_rate": 0.0003474738975192359, + "loss": 3.0788, + "step": 27511 + }, + { + "epoch": 1.35, + "grad_norm": 0.5809428095817566, + "learning_rate": 0.000347458695263452, + "loss": 3.0249, + "step": 27512 + }, + { + "epoch": 1.35, + "grad_norm": 0.5749208927154541, + "learning_rate": 0.00034744349288267105, + "loss": 2.9156, + "step": 27513 + }, + { + "epoch": 1.35, + "grad_norm": 0.5683353543281555, + "learning_rate": 0.0003474282903769329, + "loss": 3.1445, + "step": 27514 + }, + { + "epoch": 1.35, + "grad_norm": 0.5920277833938599, + "learning_rate": 0.00034741308774627794, + "loss": 3.0885, + "step": 27515 + }, + { + "epoch": 1.35, + "grad_norm": 0.561812698841095, + "learning_rate": 0.000347397884990746, + "loss": 3.0101, + "step": 27516 + }, + { + "epoch": 1.35, + "grad_norm": 0.5506225228309631, + "learning_rate": 0.00034738268211037716, + "loss": 3.156, + "step": 27517 + }, + { + "epoch": 1.35, + "grad_norm": 0.5584924817085266, + "learning_rate": 0.00034736747910521135, + "loss": 2.9141, + "step": 27518 + }, + { + "epoch": 1.35, + "grad_norm": 0.6194260716438293, + "learning_rate": 0.00034735227597528884, + "loss": 3.0649, + "step": 27519 + }, + { + "epoch": 1.35, + "grad_norm": 0.5642139315605164, + "learning_rate": 0.0003473370727206495, + "loss": 2.6767, + "step": 27520 + }, + { + "epoch": 1.35, + "grad_norm": 0.6075909733772278, + "learning_rate": 0.0003473218693413334, + "loss": 2.9534, + "step": 27521 + }, + { + "epoch": 1.35, + "grad_norm": 0.5577836632728577, + "learning_rate": 0.00034730666583738064, + "loss": 2.9497, + "step": 27522 + }, + { + "epoch": 1.35, + "grad_norm": 0.565424382686615, + "learning_rate": 0.0003472914622088312, + "loss": 3.0355, + "step": 27523 + }, + { + "epoch": 1.35, + "grad_norm": 0.5719602704048157, + "learning_rate": 0.0003472762584557252, + "loss": 3.1099, + "step": 27524 + }, + { + "epoch": 1.35, + "grad_norm": 0.5585300922393799, + "learning_rate": 0.00034726105457810253, + "loss": 3.027, + "step": 27525 + }, + { + "epoch": 1.35, + "grad_norm": 0.5087852478027344, + "learning_rate": 0.0003472458505760035, + "loss": 3.1373, + "step": 27526 + }, + { + "epoch": 1.35, + "grad_norm": 0.5587421655654907, + "learning_rate": 0.00034723064644946787, + "loss": 3.1473, + "step": 27527 + }, + { + "epoch": 1.35, + "grad_norm": 0.5711315274238586, + "learning_rate": 0.0003472154421985359, + "loss": 3.0316, + "step": 27528 + }, + { + "epoch": 1.35, + "grad_norm": 0.5235508680343628, + "learning_rate": 0.0003472002378232474, + "loss": 3.0433, + "step": 27529 + }, + { + "epoch": 1.35, + "grad_norm": 0.5291568040847778, + "learning_rate": 0.00034718503332364264, + "loss": 3.2164, + "step": 27530 + }, + { + "epoch": 1.35, + "grad_norm": 0.5746374130249023, + "learning_rate": 0.00034716982869976157, + "loss": 3.1122, + "step": 27531 + }, + { + "epoch": 1.35, + "grad_norm": 0.5441637635231018, + "learning_rate": 0.0003471546239516443, + "loss": 2.9419, + "step": 27532 + }, + { + "epoch": 1.35, + "grad_norm": 0.5394623279571533, + "learning_rate": 0.00034713941907933075, + "loss": 3.0706, + "step": 27533 + }, + { + "epoch": 1.35, + "grad_norm": 0.5609070062637329, + "learning_rate": 0.00034712421408286106, + "loss": 2.9203, + "step": 27534 + }, + { + "epoch": 1.35, + "grad_norm": 0.5641400218009949, + "learning_rate": 0.0003471090089622752, + "loss": 3.1299, + "step": 27535 + }, + { + "epoch": 1.35, + "grad_norm": 0.5568336248397827, + "learning_rate": 0.0003470938037176134, + "loss": 3.0579, + "step": 27536 + }, + { + "epoch": 1.35, + "grad_norm": 0.5555543899536133, + "learning_rate": 0.00034707859834891557, + "loss": 2.9768, + "step": 27537 + }, + { + "epoch": 1.35, + "grad_norm": 0.5764153599739075, + "learning_rate": 0.0003470633928562218, + "loss": 2.9253, + "step": 27538 + }, + { + "epoch": 1.35, + "grad_norm": 0.5446388125419617, + "learning_rate": 0.00034704818723957197, + "loss": 3.1424, + "step": 27539 + }, + { + "epoch": 1.35, + "grad_norm": 0.5743789076805115, + "learning_rate": 0.0003470329814990064, + "loss": 3.0394, + "step": 27540 + }, + { + "epoch": 1.35, + "grad_norm": 0.5995753407478333, + "learning_rate": 0.00034701777563456496, + "loss": 3.1474, + "step": 27541 + }, + { + "epoch": 1.35, + "grad_norm": 0.5293822288513184, + "learning_rate": 0.00034700256964628767, + "loss": 3.0878, + "step": 27542 + }, + { + "epoch": 1.35, + "grad_norm": 0.5851097702980042, + "learning_rate": 0.00034698736353421477, + "loss": 3.1295, + "step": 27543 + }, + { + "epoch": 1.35, + "grad_norm": 0.5822820067405701, + "learning_rate": 0.00034697215729838615, + "loss": 3.1411, + "step": 27544 + }, + { + "epoch": 1.35, + "grad_norm": 0.5606449842453003, + "learning_rate": 0.00034695695093884193, + "loss": 3.1297, + "step": 27545 + }, + { + "epoch": 1.35, + "grad_norm": 0.5498665571212769, + "learning_rate": 0.00034694174445562206, + "loss": 3.0238, + "step": 27546 + }, + { + "epoch": 1.35, + "grad_norm": 0.5421035289764404, + "learning_rate": 0.0003469265378487668, + "loss": 3.0742, + "step": 27547 + }, + { + "epoch": 1.35, + "grad_norm": 0.5203042030334473, + "learning_rate": 0.000346911331118316, + "loss": 3.0389, + "step": 27548 + }, + { + "epoch": 1.35, + "grad_norm": 0.5701861381530762, + "learning_rate": 0.0003468961242643098, + "loss": 3.2245, + "step": 27549 + }, + { + "epoch": 1.35, + "grad_norm": 0.5597485303878784, + "learning_rate": 0.0003468809172867881, + "loss": 3.0925, + "step": 27550 + }, + { + "epoch": 1.35, + "grad_norm": 0.5598206520080566, + "learning_rate": 0.00034686571018579127, + "loss": 2.9806, + "step": 27551 + }, + { + "epoch": 1.35, + "grad_norm": 0.5926447510719299, + "learning_rate": 0.00034685050296135914, + "loss": 2.9184, + "step": 27552 + }, + { + "epoch": 1.35, + "grad_norm": 0.5669886469841003, + "learning_rate": 0.00034683529561353174, + "loss": 3.074, + "step": 27553 + }, + { + "epoch": 1.35, + "grad_norm": 0.5824142098426819, + "learning_rate": 0.0003468200881423493, + "loss": 3.116, + "step": 27554 + }, + { + "epoch": 1.35, + "grad_norm": 0.5819292068481445, + "learning_rate": 0.00034680488054785163, + "loss": 3.0986, + "step": 27555 + }, + { + "epoch": 1.35, + "grad_norm": 0.5521724820137024, + "learning_rate": 0.000346789672830079, + "loss": 2.9404, + "step": 27556 + }, + { + "epoch": 1.35, + "grad_norm": 0.5370104908943176, + "learning_rate": 0.0003467744649890713, + "loss": 3.0389, + "step": 27557 + }, + { + "epoch": 1.35, + "grad_norm": 0.5862271189689636, + "learning_rate": 0.0003467592570248687, + "loss": 3.1758, + "step": 27558 + }, + { + "epoch": 1.35, + "grad_norm": 0.5811551809310913, + "learning_rate": 0.0003467440489375113, + "loss": 2.9348, + "step": 27559 + }, + { + "epoch": 1.35, + "grad_norm": 0.523688554763794, + "learning_rate": 0.00034672884072703885, + "loss": 2.9629, + "step": 27560 + }, + { + "epoch": 1.35, + "grad_norm": 0.5698621273040771, + "learning_rate": 0.0003467136323934918, + "loss": 3.0013, + "step": 27561 + }, + { + "epoch": 1.35, + "grad_norm": 0.5459543466567993, + "learning_rate": 0.00034669842393691, + "loss": 3.0861, + "step": 27562 + }, + { + "epoch": 1.35, + "grad_norm": 0.6095524430274963, + "learning_rate": 0.0003466832153573336, + "loss": 3.0588, + "step": 27563 + }, + { + "epoch": 1.35, + "grad_norm": 0.5736280679702759, + "learning_rate": 0.00034666800665480253, + "loss": 3.1733, + "step": 27564 + }, + { + "epoch": 1.35, + "grad_norm": 0.579998254776001, + "learning_rate": 0.0003466527978293568, + "loss": 3.0893, + "step": 27565 + }, + { + "epoch": 1.35, + "grad_norm": 0.5939657688140869, + "learning_rate": 0.00034663758888103677, + "loss": 3.0966, + "step": 27566 + }, + { + "epoch": 1.35, + "grad_norm": 0.5807148814201355, + "learning_rate": 0.00034662237980988226, + "loss": 3.03, + "step": 27567 + }, + { + "epoch": 1.35, + "grad_norm": 0.57145756483078, + "learning_rate": 0.00034660717061593335, + "loss": 3.0573, + "step": 27568 + }, + { + "epoch": 1.35, + "grad_norm": 0.5279229283332825, + "learning_rate": 0.00034659196129923004, + "loss": 2.9326, + "step": 27569 + }, + { + "epoch": 1.35, + "grad_norm": 0.5835081338882446, + "learning_rate": 0.00034657675185981255, + "loss": 2.9735, + "step": 27570 + }, + { + "epoch": 1.35, + "grad_norm": 0.5817593932151794, + "learning_rate": 0.00034656154229772084, + "loss": 3.1158, + "step": 27571 + }, + { + "epoch": 1.35, + "grad_norm": 0.5388606786727905, + "learning_rate": 0.000346546332612995, + "loss": 2.8509, + "step": 27572 + }, + { + "epoch": 1.35, + "grad_norm": 0.622467041015625, + "learning_rate": 0.0003465311228056751, + "loss": 3.0875, + "step": 27573 + }, + { + "epoch": 1.35, + "grad_norm": 0.5687299966812134, + "learning_rate": 0.00034651591287580125, + "loss": 2.9959, + "step": 27574 + }, + { + "epoch": 1.35, + "grad_norm": 0.5633476972579956, + "learning_rate": 0.00034650070282341326, + "loss": 2.9332, + "step": 27575 + }, + { + "epoch": 1.35, + "grad_norm": 0.5782656073570251, + "learning_rate": 0.00034648549264855146, + "loss": 3.0858, + "step": 27576 + }, + { + "epoch": 1.35, + "grad_norm": 0.577627956867218, + "learning_rate": 0.0003464702823512558, + "loss": 3.012, + "step": 27577 + }, + { + "epoch": 1.35, + "grad_norm": 0.559639573097229, + "learning_rate": 0.00034645507193156646, + "loss": 3.1584, + "step": 27578 + }, + { + "epoch": 1.35, + "grad_norm": 0.5748494863510132, + "learning_rate": 0.0003464398613895233, + "loss": 3.0548, + "step": 27579 + }, + { + "epoch": 1.35, + "grad_norm": 0.5871093273162842, + "learning_rate": 0.0003464246507251664, + "loss": 3.1131, + "step": 27580 + }, + { + "epoch": 1.35, + "grad_norm": 0.5574132204055786, + "learning_rate": 0.00034640943993853606, + "loss": 3.1498, + "step": 27581 + }, + { + "epoch": 1.35, + "grad_norm": 0.5610949397087097, + "learning_rate": 0.0003463942290296721, + "loss": 3.0497, + "step": 27582 + }, + { + "epoch": 1.35, + "grad_norm": 0.5792922377586365, + "learning_rate": 0.0003463790179986147, + "loss": 3.3694, + "step": 27583 + }, + { + "epoch": 1.35, + "grad_norm": 0.5634068250656128, + "learning_rate": 0.0003463638068454039, + "loss": 3.0192, + "step": 27584 + }, + { + "epoch": 1.35, + "grad_norm": 0.5394881367683411, + "learning_rate": 0.00034634859557007976, + "loss": 3.1326, + "step": 27585 + }, + { + "epoch": 1.35, + "grad_norm": 0.6030181050300598, + "learning_rate": 0.00034633338417268227, + "loss": 3.0469, + "step": 27586 + }, + { + "epoch": 1.35, + "grad_norm": 0.6113521456718445, + "learning_rate": 0.0003463181726532516, + "loss": 3.0221, + "step": 27587 + }, + { + "epoch": 1.35, + "grad_norm": 0.5368742346763611, + "learning_rate": 0.00034630296101182794, + "loss": 3.1202, + "step": 27588 + }, + { + "epoch": 1.35, + "grad_norm": 0.588915228843689, + "learning_rate": 0.000346287749248451, + "loss": 3.0284, + "step": 27589 + }, + { + "epoch": 1.35, + "grad_norm": 0.5892957448959351, + "learning_rate": 0.00034627253736316103, + "loss": 3.1462, + "step": 27590 + }, + { + "epoch": 1.35, + "grad_norm": 0.5666577816009521, + "learning_rate": 0.0003462573253559982, + "loss": 3.0818, + "step": 27591 + }, + { + "epoch": 1.35, + "grad_norm": 0.5563762187957764, + "learning_rate": 0.0003462421132270024, + "loss": 3.2716, + "step": 27592 + }, + { + "epoch": 1.35, + "grad_norm": 0.615742564201355, + "learning_rate": 0.0003462269009762138, + "loss": 3.0507, + "step": 27593 + }, + { + "epoch": 1.35, + "grad_norm": 0.5579602122306824, + "learning_rate": 0.0003462116886036725, + "loss": 3.0305, + "step": 27594 + }, + { + "epoch": 1.35, + "grad_norm": 0.5372714996337891, + "learning_rate": 0.0003461964761094184, + "loss": 2.8777, + "step": 27595 + }, + { + "epoch": 1.35, + "grad_norm": 0.5734666585922241, + "learning_rate": 0.0003461812634934917, + "loss": 2.9552, + "step": 27596 + }, + { + "epoch": 1.35, + "grad_norm": 0.5839183330535889, + "learning_rate": 0.0003461660507559325, + "loss": 3.0588, + "step": 27597 + }, + { + "epoch": 1.35, + "grad_norm": 0.6317874193191528, + "learning_rate": 0.00034615083789678075, + "loss": 2.8828, + "step": 27598 + }, + { + "epoch": 1.35, + "grad_norm": 0.572235643863678, + "learning_rate": 0.0003461356249160767, + "loss": 3.0354, + "step": 27599 + }, + { + "epoch": 1.35, + "grad_norm": 0.5442335605621338, + "learning_rate": 0.00034612041181386014, + "loss": 3.099, + "step": 27600 + }, + { + "epoch": 1.35, + "grad_norm": 0.5586719512939453, + "learning_rate": 0.0003461051985901713, + "loss": 3.1294, + "step": 27601 + }, + { + "epoch": 1.35, + "grad_norm": 0.5442153811454773, + "learning_rate": 0.0003460899852450502, + "loss": 2.9258, + "step": 27602 + }, + { + "epoch": 1.35, + "grad_norm": 0.5317257642745972, + "learning_rate": 0.0003460747717785371, + "loss": 3.2071, + "step": 27603 + }, + { + "epoch": 1.35, + "grad_norm": 0.5560556054115295, + "learning_rate": 0.0003460595581906719, + "loss": 3.2065, + "step": 27604 + }, + { + "epoch": 1.35, + "grad_norm": 0.6010359525680542, + "learning_rate": 0.0003460443444814946, + "loss": 2.8649, + "step": 27605 + }, + { + "epoch": 1.35, + "grad_norm": 0.5659478306770325, + "learning_rate": 0.0003460291306510454, + "loss": 3.0582, + "step": 27606 + }, + { + "epoch": 1.35, + "grad_norm": 0.5736438035964966, + "learning_rate": 0.00034601391669936436, + "loss": 3.0973, + "step": 27607 + }, + { + "epoch": 1.35, + "grad_norm": 0.5653098821640015, + "learning_rate": 0.0003459987026264914, + "loss": 3.3044, + "step": 27608 + }, + { + "epoch": 1.35, + "grad_norm": 0.5595373511314392, + "learning_rate": 0.0003459834884324668, + "loss": 3.1815, + "step": 27609 + }, + { + "epoch": 1.35, + "grad_norm": 0.5338704586029053, + "learning_rate": 0.0003459682741173306, + "loss": 3.2657, + "step": 27610 + }, + { + "epoch": 1.35, + "grad_norm": 0.595348060131073, + "learning_rate": 0.0003459530596811227, + "loss": 3.0412, + "step": 27611 + }, + { + "epoch": 1.35, + "grad_norm": 0.5767225623130798, + "learning_rate": 0.0003459378451238833, + "loss": 3.1244, + "step": 27612 + }, + { + "epoch": 1.35, + "grad_norm": 0.5606244206428528, + "learning_rate": 0.00034592263044565247, + "loss": 2.9589, + "step": 27613 + }, + { + "epoch": 1.35, + "grad_norm": 0.581843376159668, + "learning_rate": 0.0003459074156464704, + "loss": 3.1054, + "step": 27614 + }, + { + "epoch": 1.35, + "grad_norm": 0.5597598552703857, + "learning_rate": 0.0003458922007263769, + "loss": 3.1613, + "step": 27615 + }, + { + "epoch": 1.35, + "grad_norm": 0.6114240884780884, + "learning_rate": 0.0003458769856854121, + "loss": 2.9326, + "step": 27616 + }, + { + "epoch": 1.35, + "grad_norm": 0.6092174649238586, + "learning_rate": 0.0003458617705236163, + "loss": 3.1666, + "step": 27617 + }, + { + "epoch": 1.35, + "grad_norm": 0.5693092346191406, + "learning_rate": 0.0003458465552410294, + "loss": 3.1785, + "step": 27618 + }, + { + "epoch": 1.35, + "grad_norm": 0.5575876235961914, + "learning_rate": 0.00034583133983769146, + "loss": 3.0092, + "step": 27619 + }, + { + "epoch": 1.35, + "grad_norm": 0.5795199871063232, + "learning_rate": 0.00034581612431364253, + "loss": 3.2021, + "step": 27620 + }, + { + "epoch": 1.35, + "grad_norm": 0.5466296076774597, + "learning_rate": 0.0003458009086689228, + "loss": 3.271, + "step": 27621 + }, + { + "epoch": 1.35, + "grad_norm": 0.5569012761116028, + "learning_rate": 0.00034578569290357234, + "loss": 3.0389, + "step": 27622 + }, + { + "epoch": 1.35, + "grad_norm": 0.554438591003418, + "learning_rate": 0.00034577047701763114, + "loss": 3.284, + "step": 27623 + }, + { + "epoch": 1.35, + "grad_norm": 0.540482759475708, + "learning_rate": 0.00034575526101113924, + "loss": 3.0049, + "step": 27624 + }, + { + "epoch": 1.35, + "grad_norm": 0.5931518077850342, + "learning_rate": 0.0003457400448841369, + "loss": 3.0533, + "step": 27625 + }, + { + "epoch": 1.35, + "grad_norm": 0.5511003136634827, + "learning_rate": 0.0003457248286366641, + "loss": 2.9812, + "step": 27626 + }, + { + "epoch": 1.35, + "grad_norm": 0.5712304711341858, + "learning_rate": 0.00034570961226876074, + "loss": 3.1466, + "step": 27627 + }, + { + "epoch": 1.35, + "grad_norm": 0.5965542197227478, + "learning_rate": 0.0003456943957804672, + "loss": 3.0022, + "step": 27628 + }, + { + "epoch": 1.35, + "grad_norm": 0.5600318312644958, + "learning_rate": 0.0003456791791718234, + "loss": 3.0214, + "step": 27629 + }, + { + "epoch": 1.35, + "grad_norm": 0.5907925367355347, + "learning_rate": 0.00034566396244286945, + "loss": 3.0262, + "step": 27630 + }, + { + "epoch": 1.35, + "grad_norm": 0.6217458844184875, + "learning_rate": 0.00034564874559364536, + "loss": 3.1207, + "step": 27631 + }, + { + "epoch": 1.35, + "grad_norm": 0.5708328485488892, + "learning_rate": 0.00034563352862419127, + "loss": 3.0498, + "step": 27632 + }, + { + "epoch": 1.35, + "grad_norm": 0.5754064321517944, + "learning_rate": 0.00034561831153454725, + "loss": 3.1127, + "step": 27633 + }, + { + "epoch": 1.35, + "grad_norm": 0.5828617215156555, + "learning_rate": 0.00034560309432475335, + "loss": 3.1008, + "step": 27634 + }, + { + "epoch": 1.35, + "grad_norm": 0.553674042224884, + "learning_rate": 0.0003455878769948497, + "loss": 3.0445, + "step": 27635 + }, + { + "epoch": 1.35, + "grad_norm": 0.5497589707374573, + "learning_rate": 0.0003455726595448764, + "loss": 3.0791, + "step": 27636 + }, + { + "epoch": 1.35, + "grad_norm": 0.5885263681411743, + "learning_rate": 0.00034555744197487334, + "loss": 2.9445, + "step": 27637 + }, + { + "epoch": 1.35, + "grad_norm": 0.5797986388206482, + "learning_rate": 0.0003455422242848809, + "loss": 3.3291, + "step": 27638 + }, + { + "epoch": 1.35, + "grad_norm": 0.5659964680671692, + "learning_rate": 0.000345527006474939, + "loss": 2.9028, + "step": 27639 + }, + { + "epoch": 1.35, + "grad_norm": 0.5410609841346741, + "learning_rate": 0.00034551178854508763, + "loss": 3.1878, + "step": 27640 + }, + { + "epoch": 1.35, + "grad_norm": 0.6087222099304199, + "learning_rate": 0.0003454965704953671, + "loss": 3.0399, + "step": 27641 + }, + { + "epoch": 1.35, + "grad_norm": 0.5863704085350037, + "learning_rate": 0.0003454813523258172, + "loss": 2.9568, + "step": 27642 + }, + { + "epoch": 1.35, + "grad_norm": 0.5730543732643127, + "learning_rate": 0.00034546613403647826, + "loss": 2.7912, + "step": 27643 + }, + { + "epoch": 1.35, + "grad_norm": 0.5473147630691528, + "learning_rate": 0.0003454509156273903, + "loss": 3.1532, + "step": 27644 + }, + { + "epoch": 1.35, + "grad_norm": 0.6057653427124023, + "learning_rate": 0.0003454356970985933, + "loss": 3.098, + "step": 27645 + }, + { + "epoch": 1.35, + "grad_norm": 0.5998139381408691, + "learning_rate": 0.0003454204784501275, + "loss": 3.0748, + "step": 27646 + }, + { + "epoch": 1.35, + "grad_norm": 0.6057741641998291, + "learning_rate": 0.0003454052596820328, + "loss": 3.0626, + "step": 27647 + }, + { + "epoch": 1.35, + "grad_norm": 0.5644229650497437, + "learning_rate": 0.00034539004079434945, + "loss": 3.1077, + "step": 27648 + }, + { + "epoch": 1.36, + "grad_norm": 0.5536103248596191, + "learning_rate": 0.00034537482178711743, + "loss": 2.9196, + "step": 27649 + }, + { + "epoch": 1.36, + "grad_norm": 0.5765447616577148, + "learning_rate": 0.00034535960266037695, + "loss": 3.158, + "step": 27650 + }, + { + "epoch": 1.36, + "grad_norm": 0.551717221736908, + "learning_rate": 0.00034534438341416796, + "loss": 3.047, + "step": 27651 + }, + { + "epoch": 1.36, + "grad_norm": 0.55844646692276, + "learning_rate": 0.0003453291640485305, + "loss": 3.0137, + "step": 27652 + }, + { + "epoch": 1.36, + "grad_norm": 0.5359190702438354, + "learning_rate": 0.00034531394456350486, + "loss": 3.0382, + "step": 27653 + }, + { + "epoch": 1.36, + "grad_norm": 0.5825155377388, + "learning_rate": 0.00034529872495913104, + "loss": 3.2114, + "step": 27654 + }, + { + "epoch": 1.36, + "grad_norm": 0.5790165662765503, + "learning_rate": 0.000345283505235449, + "loss": 2.943, + "step": 27655 + }, + { + "epoch": 1.36, + "grad_norm": 0.6118806004524231, + "learning_rate": 0.00034526828539249894, + "loss": 2.9642, + "step": 27656 + }, + { + "epoch": 1.36, + "grad_norm": 0.5818917751312256, + "learning_rate": 0.00034525306543032095, + "loss": 2.8687, + "step": 27657 + }, + { + "epoch": 1.36, + "grad_norm": 0.5818273425102234, + "learning_rate": 0.0003452378453489551, + "loss": 2.9843, + "step": 27658 + }, + { + "epoch": 1.36, + "grad_norm": 0.5957201719284058, + "learning_rate": 0.00034522262514844143, + "loss": 2.9917, + "step": 27659 + }, + { + "epoch": 1.36, + "grad_norm": 0.5800155997276306, + "learning_rate": 0.0003452074048288201, + "loss": 2.8889, + "step": 27660 + }, + { + "epoch": 1.36, + "grad_norm": 0.5779448747634888, + "learning_rate": 0.0003451921843901312, + "loss": 3.0047, + "step": 27661 + }, + { + "epoch": 1.36, + "grad_norm": 0.6519986391067505, + "learning_rate": 0.0003451769638324147, + "loss": 3.1293, + "step": 27662 + }, + { + "epoch": 1.36, + "grad_norm": 0.577875554561615, + "learning_rate": 0.00034516174315571077, + "loss": 3.0927, + "step": 27663 + }, + { + "epoch": 1.36, + "grad_norm": 0.5626934766769409, + "learning_rate": 0.00034514652236005956, + "loss": 2.9672, + "step": 27664 + }, + { + "epoch": 1.36, + "grad_norm": 0.5806666612625122, + "learning_rate": 0.00034513130144550114, + "loss": 2.8056, + "step": 27665 + }, + { + "epoch": 1.36, + "grad_norm": 0.5470517873764038, + "learning_rate": 0.0003451160804120755, + "loss": 2.9702, + "step": 27666 + }, + { + "epoch": 1.36, + "grad_norm": 0.560828685760498, + "learning_rate": 0.00034510085925982284, + "loss": 3.1832, + "step": 27667 + }, + { + "epoch": 1.36, + "grad_norm": 0.5910745859146118, + "learning_rate": 0.0003450856379887831, + "loss": 2.9672, + "step": 27668 + }, + { + "epoch": 1.36, + "grad_norm": 0.5761743187904358, + "learning_rate": 0.0003450704165989965, + "loss": 2.8815, + "step": 27669 + }, + { + "epoch": 1.36, + "grad_norm": 0.5565659403800964, + "learning_rate": 0.00034505519509050314, + "loss": 3.0549, + "step": 27670 + }, + { + "epoch": 1.36, + "grad_norm": 0.5785242319107056, + "learning_rate": 0.000345039973463343, + "loss": 2.9993, + "step": 27671 + }, + { + "epoch": 1.36, + "grad_norm": 0.5992151498794556, + "learning_rate": 0.00034502475171755635, + "loss": 2.9655, + "step": 27672 + }, + { + "epoch": 1.36, + "grad_norm": 0.5523272752761841, + "learning_rate": 0.00034500952985318305, + "loss": 3.1217, + "step": 27673 + }, + { + "epoch": 1.36, + "grad_norm": 0.543624222278595, + "learning_rate": 0.0003449943078702633, + "loss": 3.0863, + "step": 27674 + }, + { + "epoch": 1.36, + "grad_norm": 0.5931307673454285, + "learning_rate": 0.0003449790857688373, + "loss": 3.2511, + "step": 27675 + }, + { + "epoch": 1.36, + "grad_norm": 0.5931395888328552, + "learning_rate": 0.0003449638635489451, + "loss": 2.8998, + "step": 27676 + }, + { + "epoch": 1.36, + "grad_norm": 0.5682350993156433, + "learning_rate": 0.0003449486412106266, + "loss": 3.0556, + "step": 27677 + }, + { + "epoch": 1.36, + "grad_norm": 0.590588390827179, + "learning_rate": 0.00034493341875392196, + "loss": 3.0171, + "step": 27678 + }, + { + "epoch": 1.36, + "grad_norm": 0.6109996438026428, + "learning_rate": 0.00034491819617887154, + "loss": 3.0134, + "step": 27679 + }, + { + "epoch": 1.36, + "grad_norm": 0.6124204397201538, + "learning_rate": 0.0003449029734855152, + "loss": 3.1012, + "step": 27680 + }, + { + "epoch": 1.36, + "grad_norm": 0.5548977255821228, + "learning_rate": 0.00034488775067389294, + "loss": 3.282, + "step": 27681 + }, + { + "epoch": 1.36, + "grad_norm": 0.6532643437385559, + "learning_rate": 0.00034487252774404505, + "loss": 2.9508, + "step": 27682 + }, + { + "epoch": 1.36, + "grad_norm": 0.6053386330604553, + "learning_rate": 0.00034485730469601154, + "loss": 3.2382, + "step": 27683 + }, + { + "epoch": 1.36, + "grad_norm": 0.567348837852478, + "learning_rate": 0.00034484208152983257, + "loss": 2.7893, + "step": 27684 + }, + { + "epoch": 1.36, + "grad_norm": 0.5835475325584412, + "learning_rate": 0.00034482685824554815, + "loss": 3.0216, + "step": 27685 + }, + { + "epoch": 1.36, + "grad_norm": 0.5753740668296814, + "learning_rate": 0.00034481163484319845, + "loss": 3.3452, + "step": 27686 + }, + { + "epoch": 1.36, + "grad_norm": 0.6069841980934143, + "learning_rate": 0.0003447964113228236, + "loss": 3.1379, + "step": 27687 + }, + { + "epoch": 1.36, + "grad_norm": 0.5838615894317627, + "learning_rate": 0.0003447811876844634, + "loss": 2.9669, + "step": 27688 + }, + { + "epoch": 1.36, + "grad_norm": 0.547076940536499, + "learning_rate": 0.00034476596392815835, + "loss": 3.0967, + "step": 27689 + }, + { + "epoch": 1.36, + "grad_norm": 0.6205630898475647, + "learning_rate": 0.00034475074005394836, + "loss": 3.1115, + "step": 27690 + }, + { + "epoch": 1.36, + "grad_norm": 0.5432915091514587, + "learning_rate": 0.0003447355160618735, + "loss": 3.1954, + "step": 27691 + }, + { + "epoch": 1.36, + "grad_norm": 0.5629213452339172, + "learning_rate": 0.0003447202919519739, + "loss": 3.134, + "step": 27692 + }, + { + "epoch": 1.36, + "grad_norm": 0.6095461249351501, + "learning_rate": 0.00034470506772428966, + "loss": 2.9366, + "step": 27693 + }, + { + "epoch": 1.36, + "grad_norm": 0.5309693813323975, + "learning_rate": 0.00034468984337886085, + "loss": 3.2188, + "step": 27694 + }, + { + "epoch": 1.36, + "grad_norm": 0.5798507928848267, + "learning_rate": 0.0003446746189157276, + "loss": 2.9499, + "step": 27695 + }, + { + "epoch": 1.36, + "grad_norm": 0.5637386441230774, + "learning_rate": 0.00034465939433493003, + "loss": 2.9753, + "step": 27696 + }, + { + "epoch": 1.36, + "grad_norm": 0.5978463888168335, + "learning_rate": 0.0003446441696365082, + "loss": 2.9246, + "step": 27697 + }, + { + "epoch": 1.36, + "grad_norm": 0.5650119781494141, + "learning_rate": 0.00034462894482050214, + "loss": 3.1587, + "step": 27698 + }, + { + "epoch": 1.36, + "grad_norm": 0.6245388984680176, + "learning_rate": 0.00034461371988695215, + "loss": 3.1073, + "step": 27699 + }, + { + "epoch": 1.36, + "grad_norm": 0.6453614234924316, + "learning_rate": 0.0003445984948358981, + "loss": 3.1702, + "step": 27700 + }, + { + "epoch": 1.36, + "grad_norm": 0.5954031944274902, + "learning_rate": 0.0003445832696673803, + "loss": 3.2765, + "step": 27701 + }, + { + "epoch": 1.36, + "grad_norm": 0.5462033152580261, + "learning_rate": 0.0003445680443814387, + "loss": 2.9853, + "step": 27702 + }, + { + "epoch": 1.36, + "grad_norm": 0.5822536945343018, + "learning_rate": 0.00034455281897811344, + "loss": 3.147, + "step": 27703 + }, + { + "epoch": 1.36, + "grad_norm": 0.5529893040657043, + "learning_rate": 0.0003445375934574446, + "loss": 3.1181, + "step": 27704 + }, + { + "epoch": 1.36, + "grad_norm": 0.5892257690429688, + "learning_rate": 0.0003445223678194724, + "loss": 3.0272, + "step": 27705 + }, + { + "epoch": 1.36, + "grad_norm": 0.5971611738204956, + "learning_rate": 0.0003445071420642368, + "loss": 3.0081, + "step": 27706 + }, + { + "epoch": 1.36, + "grad_norm": 0.5674407482147217, + "learning_rate": 0.000344491916191778, + "loss": 3.1847, + "step": 27707 + }, + { + "epoch": 1.36, + "grad_norm": 0.6164575219154358, + "learning_rate": 0.000344476690202136, + "loss": 3.0107, + "step": 27708 + }, + { + "epoch": 1.36, + "grad_norm": 0.5708376169204712, + "learning_rate": 0.0003444614640953509, + "loss": 3.1381, + "step": 27709 + }, + { + "epoch": 1.36, + "grad_norm": 0.5596605539321899, + "learning_rate": 0.00034444623787146297, + "loss": 3.0325, + "step": 27710 + }, + { + "epoch": 1.36, + "grad_norm": 0.5818103551864624, + "learning_rate": 0.0003444310115305121, + "loss": 3.1746, + "step": 27711 + }, + { + "epoch": 1.36, + "grad_norm": 0.5505349040031433, + "learning_rate": 0.0003444157850725386, + "loss": 3.128, + "step": 27712 + }, + { + "epoch": 1.36, + "grad_norm": 0.5452769994735718, + "learning_rate": 0.00034440055849758246, + "loss": 3.2109, + "step": 27713 + }, + { + "epoch": 1.36, + "grad_norm": 0.5605431199073792, + "learning_rate": 0.0003443853318056837, + "loss": 2.9834, + "step": 27714 + }, + { + "epoch": 1.36, + "grad_norm": 0.5733485221862793, + "learning_rate": 0.0003443701049968826, + "loss": 3.1925, + "step": 27715 + }, + { + "epoch": 1.36, + "grad_norm": 0.5892324447631836, + "learning_rate": 0.0003443548780712192, + "loss": 2.9853, + "step": 27716 + }, + { + "epoch": 1.36, + "grad_norm": 0.6256640553474426, + "learning_rate": 0.0003443396510287335, + "loss": 3.0881, + "step": 27717 + }, + { + "epoch": 1.36, + "grad_norm": 0.6237658858299255, + "learning_rate": 0.00034432442386946575, + "loss": 3.0615, + "step": 27718 + }, + { + "epoch": 1.36, + "grad_norm": 0.6183185577392578, + "learning_rate": 0.000344309196593456, + "loss": 2.9544, + "step": 27719 + }, + { + "epoch": 1.36, + "grad_norm": 0.5464622974395752, + "learning_rate": 0.00034429396920074436, + "loss": 2.9632, + "step": 27720 + }, + { + "epoch": 1.36, + "grad_norm": 0.5681126117706299, + "learning_rate": 0.0003442787416913709, + "loss": 3.0697, + "step": 27721 + }, + { + "epoch": 1.36, + "grad_norm": 0.5842775106430054, + "learning_rate": 0.0003442635140653758, + "loss": 3.0525, + "step": 27722 + }, + { + "epoch": 1.36, + "grad_norm": 0.5486373901367188, + "learning_rate": 0.00034424828632279914, + "loss": 3.0561, + "step": 27723 + }, + { + "epoch": 1.36, + "grad_norm": 0.5385405421257019, + "learning_rate": 0.0003442330584636809, + "loss": 3.1888, + "step": 27724 + }, + { + "epoch": 1.36, + "grad_norm": 0.5733704566955566, + "learning_rate": 0.0003442178304880613, + "loss": 2.9091, + "step": 27725 + }, + { + "epoch": 1.36, + "grad_norm": 0.5765992999076843, + "learning_rate": 0.00034420260239598053, + "loss": 3.0318, + "step": 27726 + }, + { + "epoch": 1.36, + "grad_norm": 0.5930641889572144, + "learning_rate": 0.0003441873741874786, + "loss": 3.0233, + "step": 27727 + }, + { + "epoch": 1.36, + "grad_norm": 0.5822330117225647, + "learning_rate": 0.00034417214586259567, + "loss": 3.1026, + "step": 27728 + }, + { + "epoch": 1.36, + "grad_norm": 0.6270058751106262, + "learning_rate": 0.0003441569174213717, + "loss": 3.1726, + "step": 27729 + }, + { + "epoch": 1.36, + "grad_norm": 0.5660275220870972, + "learning_rate": 0.000344141688863847, + "loss": 2.8885, + "step": 27730 + }, + { + "epoch": 1.36, + "grad_norm": 0.6260101199150085, + "learning_rate": 0.00034412646019006156, + "loss": 3.075, + "step": 27731 + }, + { + "epoch": 1.36, + "grad_norm": 0.577351450920105, + "learning_rate": 0.0003441112314000555, + "loss": 3.1064, + "step": 27732 + }, + { + "epoch": 1.36, + "grad_norm": 0.5536080598831177, + "learning_rate": 0.00034409600249386894, + "loss": 3.1454, + "step": 27733 + }, + { + "epoch": 1.36, + "grad_norm": 0.5863784551620483, + "learning_rate": 0.000344080773471542, + "loss": 3.0272, + "step": 27734 + }, + { + "epoch": 1.36, + "grad_norm": 0.6073148846626282, + "learning_rate": 0.0003440655443331148, + "loss": 3.2349, + "step": 27735 + }, + { + "epoch": 1.36, + "grad_norm": 0.5578989386558533, + "learning_rate": 0.00034405031507862746, + "loss": 3.078, + "step": 27736 + }, + { + "epoch": 1.36, + "grad_norm": 0.5783877372741699, + "learning_rate": 0.00034403508570811993, + "loss": 3.208, + "step": 27737 + }, + { + "epoch": 1.36, + "grad_norm": 0.5361840128898621, + "learning_rate": 0.0003440198562216327, + "loss": 3.0135, + "step": 27738 + }, + { + "epoch": 1.36, + "grad_norm": 0.5612678527832031, + "learning_rate": 0.0003440046266192054, + "loss": 2.8991, + "step": 27739 + }, + { + "epoch": 1.36, + "grad_norm": 0.575139045715332, + "learning_rate": 0.0003439893969008785, + "loss": 2.9018, + "step": 27740 + }, + { + "epoch": 1.36, + "grad_norm": 0.5466126799583435, + "learning_rate": 0.000343974167066692, + "loss": 2.9975, + "step": 27741 + }, + { + "epoch": 1.36, + "grad_norm": 0.5556744337081909, + "learning_rate": 0.000343958937116686, + "loss": 2.9647, + "step": 27742 + }, + { + "epoch": 1.36, + "grad_norm": 0.6033949255943298, + "learning_rate": 0.00034394370705090067, + "loss": 2.8281, + "step": 27743 + }, + { + "epoch": 1.36, + "grad_norm": 0.5831649303436279, + "learning_rate": 0.000343928476869376, + "loss": 3.053, + "step": 27744 + }, + { + "epoch": 1.36, + "grad_norm": 0.5626701712608337, + "learning_rate": 0.0003439132465721522, + "loss": 3.0697, + "step": 27745 + }, + { + "epoch": 1.36, + "grad_norm": 0.5939829349517822, + "learning_rate": 0.0003438980161592693, + "loss": 3.2721, + "step": 27746 + }, + { + "epoch": 1.36, + "grad_norm": 0.564018964767456, + "learning_rate": 0.0003438827856307675, + "loss": 2.8827, + "step": 27747 + }, + { + "epoch": 1.36, + "grad_norm": 0.5679821968078613, + "learning_rate": 0.0003438675549866869, + "loss": 3.1603, + "step": 27748 + }, + { + "epoch": 1.36, + "grad_norm": 0.56532222032547, + "learning_rate": 0.0003438523242270677, + "loss": 3.0086, + "step": 27749 + }, + { + "epoch": 1.36, + "grad_norm": 0.5731778144836426, + "learning_rate": 0.00034383709335194975, + "loss": 3.0396, + "step": 27750 + }, + { + "epoch": 1.36, + "grad_norm": 0.5794793963432312, + "learning_rate": 0.00034382186236137346, + "loss": 2.9388, + "step": 27751 + }, + { + "epoch": 1.36, + "grad_norm": 0.5715826153755188, + "learning_rate": 0.0003438066312553788, + "loss": 3.0948, + "step": 27752 + }, + { + "epoch": 1.36, + "grad_norm": 0.6393271088600159, + "learning_rate": 0.0003437914000340059, + "loss": 2.9898, + "step": 27753 + }, + { + "epoch": 1.36, + "grad_norm": 0.5703928470611572, + "learning_rate": 0.00034377616869729486, + "loss": 3.2764, + "step": 27754 + }, + { + "epoch": 1.36, + "grad_norm": 0.5397366285324097, + "learning_rate": 0.0003437609372452858, + "loss": 3.1067, + "step": 27755 + }, + { + "epoch": 1.36, + "grad_norm": 0.5772479772567749, + "learning_rate": 0.0003437457056780188, + "loss": 3.0379, + "step": 27756 + }, + { + "epoch": 1.36, + "grad_norm": 0.5816461443901062, + "learning_rate": 0.0003437304739955341, + "loss": 3.0917, + "step": 27757 + }, + { + "epoch": 1.36, + "grad_norm": 0.5699825882911682, + "learning_rate": 0.00034371524219787176, + "loss": 2.9859, + "step": 27758 + }, + { + "epoch": 1.36, + "grad_norm": 0.6363911032676697, + "learning_rate": 0.0003437000102850719, + "loss": 2.9362, + "step": 27759 + }, + { + "epoch": 1.36, + "grad_norm": 0.6590486168861389, + "learning_rate": 0.00034368477825717455, + "loss": 3.0905, + "step": 27760 + }, + { + "epoch": 1.36, + "grad_norm": 0.5966184139251709, + "learning_rate": 0.00034366954611421993, + "loss": 2.9768, + "step": 27761 + }, + { + "epoch": 1.36, + "grad_norm": 0.6021097302436829, + "learning_rate": 0.00034365431385624816, + "loss": 3.0687, + "step": 27762 + }, + { + "epoch": 1.36, + "grad_norm": 0.5527408719062805, + "learning_rate": 0.0003436390814832994, + "loss": 3.1802, + "step": 27763 + }, + { + "epoch": 1.36, + "grad_norm": 0.5687882900238037, + "learning_rate": 0.00034362384899541356, + "loss": 3.1737, + "step": 27764 + }, + { + "epoch": 1.36, + "grad_norm": 0.5569482445716858, + "learning_rate": 0.00034360861639263086, + "loss": 3.1958, + "step": 27765 + }, + { + "epoch": 1.36, + "grad_norm": 0.5489016771316528, + "learning_rate": 0.0003435933836749916, + "loss": 2.8973, + "step": 27766 + }, + { + "epoch": 1.36, + "grad_norm": 0.5932565927505493, + "learning_rate": 0.00034357815084253576, + "loss": 2.9053, + "step": 27767 + }, + { + "epoch": 1.36, + "grad_norm": 0.5763736963272095, + "learning_rate": 0.00034356291789530333, + "loss": 3.1687, + "step": 27768 + }, + { + "epoch": 1.36, + "grad_norm": 0.6087867021560669, + "learning_rate": 0.00034354768483333466, + "loss": 2.9277, + "step": 27769 + }, + { + "epoch": 1.36, + "grad_norm": 0.5887653231620789, + "learning_rate": 0.0003435324516566697, + "loss": 2.8634, + "step": 27770 + }, + { + "epoch": 1.36, + "grad_norm": 0.5681719183921814, + "learning_rate": 0.0003435172183653487, + "loss": 2.9306, + "step": 27771 + }, + { + "epoch": 1.36, + "grad_norm": 0.6251999735832214, + "learning_rate": 0.0003435019849594117, + "loss": 2.9546, + "step": 27772 + }, + { + "epoch": 1.36, + "grad_norm": 0.5517759919166565, + "learning_rate": 0.00034348675143889884, + "loss": 3.1553, + "step": 27773 + }, + { + "epoch": 1.36, + "grad_norm": 0.5633260607719421, + "learning_rate": 0.00034347151780385034, + "loss": 3.0941, + "step": 27774 + }, + { + "epoch": 1.36, + "grad_norm": 0.5699298977851868, + "learning_rate": 0.000343456284054306, + "loss": 3.0928, + "step": 27775 + }, + { + "epoch": 1.36, + "grad_norm": 0.5600057244300842, + "learning_rate": 0.0003434410501903063, + "loss": 3.3317, + "step": 27776 + }, + { + "epoch": 1.36, + "grad_norm": 0.5598301291465759, + "learning_rate": 0.0003434258162118913, + "loss": 3.1272, + "step": 27777 + }, + { + "epoch": 1.36, + "grad_norm": 0.6225312352180481, + "learning_rate": 0.000343410582119101, + "loss": 2.9665, + "step": 27778 + }, + { + "epoch": 1.36, + "grad_norm": 0.5822789072990417, + "learning_rate": 0.00034339534791197557, + "loss": 2.927, + "step": 27779 + }, + { + "epoch": 1.36, + "grad_norm": 0.6133136749267578, + "learning_rate": 0.00034338011359055516, + "loss": 3.1081, + "step": 27780 + }, + { + "epoch": 1.36, + "grad_norm": 0.597339391708374, + "learning_rate": 0.0003433648791548799, + "loss": 3.1425, + "step": 27781 + }, + { + "epoch": 1.36, + "grad_norm": 0.562778115272522, + "learning_rate": 0.00034334964460498984, + "loss": 3.2396, + "step": 27782 + }, + { + "epoch": 1.36, + "grad_norm": 0.5441209673881531, + "learning_rate": 0.0003433344099409252, + "loss": 3.2741, + "step": 27783 + }, + { + "epoch": 1.36, + "grad_norm": 0.5804208517074585, + "learning_rate": 0.00034331917516272604, + "loss": 3.0572, + "step": 27784 + }, + { + "epoch": 1.36, + "grad_norm": 0.6229684352874756, + "learning_rate": 0.0003433039402704326, + "loss": 2.9544, + "step": 27785 + }, + { + "epoch": 1.36, + "grad_norm": 0.5673180818557739, + "learning_rate": 0.0003432887052640848, + "loss": 2.9424, + "step": 27786 + }, + { + "epoch": 1.36, + "grad_norm": 0.5669776797294617, + "learning_rate": 0.0003432734701437229, + "loss": 3.0883, + "step": 27787 + }, + { + "epoch": 1.36, + "grad_norm": 0.5755397081375122, + "learning_rate": 0.000343258234909387, + "loss": 3.2678, + "step": 27788 + }, + { + "epoch": 1.36, + "grad_norm": 0.8632124662399292, + "learning_rate": 0.00034324299956111737, + "loss": 3.0894, + "step": 27789 + }, + { + "epoch": 1.36, + "grad_norm": 0.5921272039413452, + "learning_rate": 0.00034322776409895385, + "loss": 2.7846, + "step": 27790 + }, + { + "epoch": 1.36, + "grad_norm": 0.5782287120819092, + "learning_rate": 0.0003432125285229367, + "loss": 2.951, + "step": 27791 + }, + { + "epoch": 1.36, + "grad_norm": 0.5763784646987915, + "learning_rate": 0.0003431972928331061, + "loss": 2.9963, + "step": 27792 + }, + { + "epoch": 1.36, + "grad_norm": 0.6159095764160156, + "learning_rate": 0.0003431820570295022, + "loss": 2.9511, + "step": 27793 + }, + { + "epoch": 1.36, + "grad_norm": 0.5671179890632629, + "learning_rate": 0.00034316682111216513, + "loss": 2.9702, + "step": 27794 + }, + { + "epoch": 1.36, + "grad_norm": 0.6086155772209167, + "learning_rate": 0.00034315158508113483, + "loss": 2.9524, + "step": 27795 + }, + { + "epoch": 1.36, + "grad_norm": 0.6278063654899597, + "learning_rate": 0.0003431363489364516, + "loss": 2.9647, + "step": 27796 + }, + { + "epoch": 1.36, + "grad_norm": 0.5405550003051758, + "learning_rate": 0.0003431211126781555, + "loss": 3.1255, + "step": 27797 + }, + { + "epoch": 1.36, + "grad_norm": 0.5947132110595703, + "learning_rate": 0.0003431058763062867, + "loss": 3.0813, + "step": 27798 + }, + { + "epoch": 1.36, + "grad_norm": 0.6197540760040283, + "learning_rate": 0.00034309063982088534, + "loss": 3.1518, + "step": 27799 + }, + { + "epoch": 1.36, + "grad_norm": 0.5950043201446533, + "learning_rate": 0.00034307540322199163, + "loss": 3.0943, + "step": 27800 + }, + { + "epoch": 1.36, + "grad_norm": 0.5821655988693237, + "learning_rate": 0.00034306016650964544, + "loss": 3.0351, + "step": 27801 + }, + { + "epoch": 1.36, + "grad_norm": 0.6075488328933716, + "learning_rate": 0.00034304492968388703, + "loss": 3.1314, + "step": 27802 + }, + { + "epoch": 1.36, + "grad_norm": 0.5653616786003113, + "learning_rate": 0.00034302969274475675, + "loss": 2.9401, + "step": 27803 + }, + { + "epoch": 1.36, + "grad_norm": 0.556991696357727, + "learning_rate": 0.00034301445569229444, + "loss": 2.8252, + "step": 27804 + }, + { + "epoch": 1.36, + "grad_norm": 0.5913997888565063, + "learning_rate": 0.00034299921852654037, + "loss": 3.0624, + "step": 27805 + }, + { + "epoch": 1.36, + "grad_norm": 1.0726978778839111, + "learning_rate": 0.00034298398124753455, + "loss": 3.1772, + "step": 27806 + }, + { + "epoch": 1.36, + "grad_norm": 0.5975397825241089, + "learning_rate": 0.0003429687438553172, + "loss": 2.8712, + "step": 27807 + }, + { + "epoch": 1.36, + "grad_norm": 0.5815263390541077, + "learning_rate": 0.0003429535063499285, + "loss": 3.0278, + "step": 27808 + }, + { + "epoch": 1.36, + "grad_norm": 0.563348114490509, + "learning_rate": 0.0003429382687314085, + "loss": 2.9911, + "step": 27809 + }, + { + "epoch": 1.36, + "grad_norm": 0.6096580028533936, + "learning_rate": 0.00034292303099979737, + "loss": 2.9992, + "step": 27810 + }, + { + "epoch": 1.36, + "grad_norm": 0.5842347145080566, + "learning_rate": 0.00034290779315513525, + "loss": 2.8416, + "step": 27811 + }, + { + "epoch": 1.36, + "grad_norm": 0.558156430721283, + "learning_rate": 0.00034289255519746225, + "loss": 2.9634, + "step": 27812 + }, + { + "epoch": 1.36, + "grad_norm": 0.6108639240264893, + "learning_rate": 0.0003428773171268185, + "loss": 3.1804, + "step": 27813 + }, + { + "epoch": 1.36, + "grad_norm": 0.6039309501647949, + "learning_rate": 0.0003428620789432443, + "loss": 3.0395, + "step": 27814 + }, + { + "epoch": 1.36, + "grad_norm": 0.5984408855438232, + "learning_rate": 0.0003428468406467794, + "loss": 3.1099, + "step": 27815 + }, + { + "epoch": 1.36, + "grad_norm": 0.5448001027107239, + "learning_rate": 0.0003428316022374642, + "loss": 3.2093, + "step": 27816 + }, + { + "epoch": 1.36, + "grad_norm": 0.577616274356842, + "learning_rate": 0.00034281636371533895, + "loss": 3.1489, + "step": 27817 + }, + { + "epoch": 1.36, + "grad_norm": 0.5480098128318787, + "learning_rate": 0.0003428011250804436, + "loss": 3.0566, + "step": 27818 + }, + { + "epoch": 1.36, + "grad_norm": 0.5774540305137634, + "learning_rate": 0.00034278588633281825, + "loss": 3.0605, + "step": 27819 + }, + { + "epoch": 1.36, + "grad_norm": 0.5654169917106628, + "learning_rate": 0.00034277064747250313, + "loss": 2.8954, + "step": 27820 + }, + { + "epoch": 1.36, + "grad_norm": 0.6514195799827576, + "learning_rate": 0.0003427554084995384, + "loss": 3.0725, + "step": 27821 + }, + { + "epoch": 1.36, + "grad_norm": 0.5787948369979858, + "learning_rate": 0.0003427401694139641, + "loss": 3.034, + "step": 27822 + }, + { + "epoch": 1.36, + "grad_norm": 0.59248948097229, + "learning_rate": 0.0003427249302158205, + "loss": 2.8765, + "step": 27823 + }, + { + "epoch": 1.36, + "grad_norm": 0.5247685313224792, + "learning_rate": 0.0003427096909051475, + "loss": 3.0719, + "step": 27824 + }, + { + "epoch": 1.36, + "grad_norm": 0.5795184969902039, + "learning_rate": 0.00034269445148198553, + "loss": 3.2857, + "step": 27825 + }, + { + "epoch": 1.36, + "grad_norm": 0.5434640049934387, + "learning_rate": 0.0003426792119463746, + "loss": 3.0622, + "step": 27826 + }, + { + "epoch": 1.36, + "grad_norm": 0.5672294497489929, + "learning_rate": 0.0003426639722983547, + "loss": 2.9796, + "step": 27827 + }, + { + "epoch": 1.36, + "grad_norm": 0.6290318369865417, + "learning_rate": 0.0003426487325379662, + "loss": 2.8113, + "step": 27828 + }, + { + "epoch": 1.36, + "grad_norm": 0.5443248152732849, + "learning_rate": 0.0003426334926652492, + "loss": 3.0352, + "step": 27829 + }, + { + "epoch": 1.36, + "grad_norm": 0.5471267700195312, + "learning_rate": 0.00034261825268024374, + "loss": 3.1233, + "step": 27830 + }, + { + "epoch": 1.36, + "grad_norm": 0.5816265940666199, + "learning_rate": 0.00034260301258299, + "loss": 3.1235, + "step": 27831 + }, + { + "epoch": 1.36, + "grad_norm": 0.5702730417251587, + "learning_rate": 0.00034258777237352814, + "loss": 3.0953, + "step": 27832 + }, + { + "epoch": 1.36, + "grad_norm": 0.5764862895011902, + "learning_rate": 0.0003425725320518982, + "loss": 3.0093, + "step": 27833 + }, + { + "epoch": 1.36, + "grad_norm": 0.5817887187004089, + "learning_rate": 0.0003425572916181405, + "loss": 3.0378, + "step": 27834 + }, + { + "epoch": 1.36, + "grad_norm": 0.5505737066268921, + "learning_rate": 0.00034254205107229504, + "loss": 3.0721, + "step": 27835 + }, + { + "epoch": 1.36, + "grad_norm": 0.6135760545730591, + "learning_rate": 0.0003425268104144021, + "loss": 2.9754, + "step": 27836 + }, + { + "epoch": 1.36, + "grad_norm": 0.5726092457771301, + "learning_rate": 0.00034251156964450153, + "loss": 3.2546, + "step": 27837 + }, + { + "epoch": 1.36, + "grad_norm": 0.5802931785583496, + "learning_rate": 0.0003424963287626338, + "loss": 2.9391, + "step": 27838 + }, + { + "epoch": 1.36, + "grad_norm": 0.5456535816192627, + "learning_rate": 0.0003424810877688389, + "loss": 3.1814, + "step": 27839 + }, + { + "epoch": 1.36, + "grad_norm": 0.5868333578109741, + "learning_rate": 0.00034246584666315703, + "loss": 3.0369, + "step": 27840 + }, + { + "epoch": 1.36, + "grad_norm": 0.6372731924057007, + "learning_rate": 0.00034245060544562826, + "loss": 2.9961, + "step": 27841 + }, + { + "epoch": 1.36, + "grad_norm": 0.5847151279449463, + "learning_rate": 0.00034243536411629266, + "loss": 3.073, + "step": 27842 + }, + { + "epoch": 1.36, + "grad_norm": 0.5791596174240112, + "learning_rate": 0.0003424201226751906, + "loss": 3.0307, + "step": 27843 + }, + { + "epoch": 1.36, + "grad_norm": 0.5760388970375061, + "learning_rate": 0.0003424048811223621, + "loss": 2.9906, + "step": 27844 + }, + { + "epoch": 1.36, + "grad_norm": 0.5581043362617493, + "learning_rate": 0.0003423896394578473, + "loss": 3.1661, + "step": 27845 + }, + { + "epoch": 1.36, + "grad_norm": 0.6014453172683716, + "learning_rate": 0.0003423743976816863, + "loss": 3.0195, + "step": 27846 + }, + { + "epoch": 1.36, + "grad_norm": 0.6490236520767212, + "learning_rate": 0.0003423591557939193, + "loss": 2.8944, + "step": 27847 + }, + { + "epoch": 1.36, + "grad_norm": 0.5952842831611633, + "learning_rate": 0.00034234391379458655, + "loss": 3.0426, + "step": 27848 + }, + { + "epoch": 1.36, + "grad_norm": 0.5552687644958496, + "learning_rate": 0.00034232867168372793, + "loss": 2.9578, + "step": 27849 + }, + { + "epoch": 1.36, + "grad_norm": 0.5637187361717224, + "learning_rate": 0.0003423134294613838, + "loss": 3.1498, + "step": 27850 + }, + { + "epoch": 1.36, + "grad_norm": 0.5724777579307556, + "learning_rate": 0.00034229818712759423, + "loss": 3.0703, + "step": 27851 + }, + { + "epoch": 1.36, + "grad_norm": 0.6271558403968811, + "learning_rate": 0.0003422829446823994, + "loss": 3.046, + "step": 27852 + }, + { + "epoch": 1.37, + "grad_norm": 0.5826039910316467, + "learning_rate": 0.00034226770212583934, + "loss": 2.9899, + "step": 27853 + }, + { + "epoch": 1.37, + "grad_norm": 0.5479075312614441, + "learning_rate": 0.0003422524594579544, + "loss": 2.9437, + "step": 27854 + }, + { + "epoch": 1.37, + "grad_norm": 0.572819709777832, + "learning_rate": 0.0003422372166787846, + "loss": 3.0236, + "step": 27855 + }, + { + "epoch": 1.37, + "grad_norm": 0.5536037683486938, + "learning_rate": 0.00034222197378837006, + "loss": 3.1475, + "step": 27856 + }, + { + "epoch": 1.37, + "grad_norm": 0.6179131865501404, + "learning_rate": 0.00034220673078675095, + "loss": 2.9968, + "step": 27857 + }, + { + "epoch": 1.37, + "grad_norm": 0.562525749206543, + "learning_rate": 0.00034219148767396744, + "loss": 3.0093, + "step": 27858 + }, + { + "epoch": 1.37, + "grad_norm": 0.5559420585632324, + "learning_rate": 0.00034217624445005967, + "loss": 3.0252, + "step": 27859 + }, + { + "epoch": 1.37, + "grad_norm": 0.5643912553787231, + "learning_rate": 0.00034216100111506784, + "loss": 3.1213, + "step": 27860 + }, + { + "epoch": 1.37, + "grad_norm": 0.5565373301506042, + "learning_rate": 0.00034214575766903203, + "loss": 3.0101, + "step": 27861 + }, + { + "epoch": 1.37, + "grad_norm": 0.5774194002151489, + "learning_rate": 0.00034213051411199246, + "loss": 2.9123, + "step": 27862 + }, + { + "epoch": 1.37, + "grad_norm": 0.5667964220046997, + "learning_rate": 0.000342115270443989, + "loss": 3.2832, + "step": 27863 + }, + { + "epoch": 1.37, + "grad_norm": 0.5750254988670349, + "learning_rate": 0.0003421000266650622, + "loss": 3.1505, + "step": 27864 + }, + { + "epoch": 1.37, + "grad_norm": 0.6452105641365051, + "learning_rate": 0.0003420847827752521, + "loss": 3.0935, + "step": 27865 + }, + { + "epoch": 1.37, + "grad_norm": 0.5923091173171997, + "learning_rate": 0.0003420695387745987, + "loss": 2.9655, + "step": 27866 + }, + { + "epoch": 1.37, + "grad_norm": 0.5900555849075317, + "learning_rate": 0.0003420542946631422, + "loss": 2.9454, + "step": 27867 + }, + { + "epoch": 1.37, + "grad_norm": 0.5830941200256348, + "learning_rate": 0.00034203905044092274, + "loss": 3.0925, + "step": 27868 + }, + { + "epoch": 1.37, + "grad_norm": 0.5814031362533569, + "learning_rate": 0.00034202380610798066, + "loss": 3.0646, + "step": 27869 + }, + { + "epoch": 1.37, + "grad_norm": 0.5986719131469727, + "learning_rate": 0.0003420085616643558, + "loss": 2.8448, + "step": 27870 + }, + { + "epoch": 1.37, + "grad_norm": 0.5420025587081909, + "learning_rate": 0.00034199331711008864, + "loss": 2.9547, + "step": 27871 + }, + { + "epoch": 1.37, + "grad_norm": 0.5797125101089478, + "learning_rate": 0.00034197807244521904, + "loss": 3.1777, + "step": 27872 + }, + { + "epoch": 1.37, + "grad_norm": 0.5667112469673157, + "learning_rate": 0.00034196282766978727, + "loss": 3.1301, + "step": 27873 + }, + { + "epoch": 1.37, + "grad_norm": 0.5843361616134644, + "learning_rate": 0.0003419475827838335, + "loss": 2.8819, + "step": 27874 + }, + { + "epoch": 1.37, + "grad_norm": 0.5419958829879761, + "learning_rate": 0.00034193233778739797, + "loss": 2.8901, + "step": 27875 + }, + { + "epoch": 1.37, + "grad_norm": 0.5321311354637146, + "learning_rate": 0.0003419170926805207, + "loss": 2.8699, + "step": 27876 + }, + { + "epoch": 1.37, + "grad_norm": 0.6508222818374634, + "learning_rate": 0.0003419018474632418, + "loss": 3.2209, + "step": 27877 + }, + { + "epoch": 1.37, + "grad_norm": 0.6090697646141052, + "learning_rate": 0.0003418866021356015, + "loss": 3.0021, + "step": 27878 + }, + { + "epoch": 1.37, + "grad_norm": 0.5967793464660645, + "learning_rate": 0.00034187135669764, + "loss": 2.9383, + "step": 27879 + }, + { + "epoch": 1.37, + "grad_norm": 0.5427808165550232, + "learning_rate": 0.00034185611114939744, + "loss": 3.1093, + "step": 27880 + }, + { + "epoch": 1.37, + "grad_norm": 0.601786732673645, + "learning_rate": 0.0003418408654909139, + "loss": 3.0333, + "step": 27881 + }, + { + "epoch": 1.37, + "grad_norm": 0.6144266724586487, + "learning_rate": 0.0003418256197222295, + "loss": 3.0911, + "step": 27882 + }, + { + "epoch": 1.37, + "grad_norm": 0.5672413110733032, + "learning_rate": 0.00034181037384338453, + "loss": 3.0872, + "step": 27883 + }, + { + "epoch": 1.37, + "grad_norm": 0.6033337712287903, + "learning_rate": 0.0003417951278544191, + "loss": 3.2807, + "step": 27884 + }, + { + "epoch": 1.37, + "grad_norm": 0.5832936763763428, + "learning_rate": 0.0003417798817553733, + "loss": 2.9746, + "step": 27885 + }, + { + "epoch": 1.37, + "grad_norm": 0.5509626269340515, + "learning_rate": 0.0003417646355462874, + "loss": 3.0896, + "step": 27886 + }, + { + "epoch": 1.37, + "grad_norm": 0.6345809698104858, + "learning_rate": 0.0003417493892272015, + "loss": 2.9941, + "step": 27887 + }, + { + "epoch": 1.37, + "grad_norm": 0.5827943682670593, + "learning_rate": 0.00034173414279815563, + "loss": 3.0285, + "step": 27888 + }, + { + "epoch": 1.37, + "grad_norm": 0.5544496774673462, + "learning_rate": 0.0003417188962591901, + "loss": 3.0064, + "step": 27889 + }, + { + "epoch": 1.37, + "grad_norm": 0.5938286185264587, + "learning_rate": 0.00034170364961034507, + "loss": 2.9717, + "step": 27890 + }, + { + "epoch": 1.37, + "grad_norm": 0.5810260772705078, + "learning_rate": 0.00034168840285166065, + "loss": 2.9629, + "step": 27891 + }, + { + "epoch": 1.37, + "grad_norm": 0.6095471978187561, + "learning_rate": 0.00034167315598317697, + "loss": 3.1627, + "step": 27892 + }, + { + "epoch": 1.37, + "grad_norm": 0.5932841897010803, + "learning_rate": 0.00034165790900493413, + "loss": 2.9353, + "step": 27893 + }, + { + "epoch": 1.37, + "grad_norm": 0.6483215093612671, + "learning_rate": 0.00034164266191697253, + "loss": 2.9442, + "step": 27894 + }, + { + "epoch": 1.37, + "grad_norm": 0.5468027591705322, + "learning_rate": 0.00034162741471933217, + "loss": 3.1029, + "step": 27895 + }, + { + "epoch": 1.37, + "grad_norm": 0.5780773758888245, + "learning_rate": 0.00034161216741205317, + "loss": 3.1393, + "step": 27896 + }, + { + "epoch": 1.37, + "grad_norm": 0.6147868037223816, + "learning_rate": 0.0003415969199951757, + "loss": 2.9525, + "step": 27897 + }, + { + "epoch": 1.37, + "grad_norm": 0.6018847227096558, + "learning_rate": 0.00034158167246874, + "loss": 3.2364, + "step": 27898 + }, + { + "epoch": 1.37, + "grad_norm": 0.6293785572052002, + "learning_rate": 0.0003415664248327861, + "loss": 3.0502, + "step": 27899 + }, + { + "epoch": 1.37, + "grad_norm": 0.5687551498413086, + "learning_rate": 0.0003415511770873542, + "loss": 2.9601, + "step": 27900 + }, + { + "epoch": 1.37, + "grad_norm": 0.5848996639251709, + "learning_rate": 0.00034153592923248465, + "loss": 2.9939, + "step": 27901 + }, + { + "epoch": 1.37, + "grad_norm": 0.5531403422355652, + "learning_rate": 0.00034152068126821745, + "loss": 2.9455, + "step": 27902 + }, + { + "epoch": 1.37, + "grad_norm": 0.6372696757316589, + "learning_rate": 0.0003415054331945927, + "loss": 3.0152, + "step": 27903 + }, + { + "epoch": 1.37, + "grad_norm": 0.6175010204315186, + "learning_rate": 0.00034149018501165054, + "loss": 3.0507, + "step": 27904 + }, + { + "epoch": 1.37, + "grad_norm": 0.5934361815452576, + "learning_rate": 0.00034147493671943133, + "loss": 3.14, + "step": 27905 + }, + { + "epoch": 1.37, + "grad_norm": 0.5523131489753723, + "learning_rate": 0.00034145968831797506, + "loss": 3.2342, + "step": 27906 + }, + { + "epoch": 1.37, + "grad_norm": 0.595689594745636, + "learning_rate": 0.000341444439807322, + "loss": 2.9376, + "step": 27907 + }, + { + "epoch": 1.37, + "grad_norm": 0.5672516822814941, + "learning_rate": 0.0003414291911875122, + "loss": 3.2794, + "step": 27908 + }, + { + "epoch": 1.37, + "grad_norm": 0.6107340455055237, + "learning_rate": 0.000341413942458586, + "loss": 3.1038, + "step": 27909 + }, + { + "epoch": 1.37, + "grad_norm": 0.6264364719390869, + "learning_rate": 0.0003413986936205833, + "loss": 2.9496, + "step": 27910 + }, + { + "epoch": 1.37, + "grad_norm": 0.6131812334060669, + "learning_rate": 0.00034138344467354447, + "loss": 2.895, + "step": 27911 + }, + { + "epoch": 1.37, + "grad_norm": 0.5950072407722473, + "learning_rate": 0.00034136819561750965, + "loss": 3.083, + "step": 27912 + }, + { + "epoch": 1.37, + "grad_norm": 0.5509093403816223, + "learning_rate": 0.00034135294645251896, + "loss": 3.0271, + "step": 27913 + }, + { + "epoch": 1.37, + "grad_norm": 0.5661293864250183, + "learning_rate": 0.0003413376971786125, + "loss": 3.1161, + "step": 27914 + }, + { + "epoch": 1.37, + "grad_norm": 0.5739542841911316, + "learning_rate": 0.0003413224477958306, + "loss": 3.0398, + "step": 27915 + }, + { + "epoch": 1.37, + "grad_norm": 0.5414571166038513, + "learning_rate": 0.00034130719830421333, + "loss": 3.0356, + "step": 27916 + }, + { + "epoch": 1.37, + "grad_norm": 0.5896730422973633, + "learning_rate": 0.00034129194870380076, + "loss": 3.0442, + "step": 27917 + }, + { + "epoch": 1.37, + "grad_norm": 0.606563925743103, + "learning_rate": 0.0003412766989946332, + "loss": 3.0527, + "step": 27918 + }, + { + "epoch": 1.37, + "grad_norm": 0.5917063355445862, + "learning_rate": 0.0003412614491767507, + "loss": 3.0547, + "step": 27919 + }, + { + "epoch": 1.37, + "grad_norm": 0.5469411015510559, + "learning_rate": 0.00034124619925019346, + "loss": 3.4306, + "step": 27920 + }, + { + "epoch": 1.37, + "grad_norm": 0.5653911828994751, + "learning_rate": 0.0003412309492150018, + "loss": 3.1284, + "step": 27921 + }, + { + "epoch": 1.37, + "grad_norm": 0.5671815872192383, + "learning_rate": 0.00034121569907121565, + "loss": 3.1207, + "step": 27922 + }, + { + "epoch": 1.37, + "grad_norm": 0.6383528709411621, + "learning_rate": 0.0003412004488188753, + "loss": 2.8917, + "step": 27923 + }, + { + "epoch": 1.37, + "grad_norm": 0.5371748805046082, + "learning_rate": 0.0003411851984580209, + "loss": 3.0595, + "step": 27924 + }, + { + "epoch": 1.37, + "grad_norm": 0.5818402767181396, + "learning_rate": 0.0003411699479886926, + "loss": 3.001, + "step": 27925 + }, + { + "epoch": 1.37, + "grad_norm": 0.6108590960502625, + "learning_rate": 0.0003411546974109306, + "loss": 3.0168, + "step": 27926 + }, + { + "epoch": 1.37, + "grad_norm": 0.5887522101402283, + "learning_rate": 0.0003411394467247751, + "loss": 3.0405, + "step": 27927 + }, + { + "epoch": 1.37, + "grad_norm": 0.5705615878105164, + "learning_rate": 0.00034112419593026615, + "loss": 3.0193, + "step": 27928 + }, + { + "epoch": 1.37, + "grad_norm": 0.5551228523254395, + "learning_rate": 0.0003411089450274439, + "loss": 3.1167, + "step": 27929 + }, + { + "epoch": 1.37, + "grad_norm": 0.567327082157135, + "learning_rate": 0.0003410936940163487, + "loss": 2.9874, + "step": 27930 + }, + { + "epoch": 1.37, + "grad_norm": 0.5707133412361145, + "learning_rate": 0.0003410784428970207, + "loss": 3.0144, + "step": 27931 + }, + { + "epoch": 1.37, + "grad_norm": 0.5892231464385986, + "learning_rate": 0.0003410631916694999, + "loss": 3.1033, + "step": 27932 + }, + { + "epoch": 1.37, + "grad_norm": 0.6045179963111877, + "learning_rate": 0.00034104794033382657, + "loss": 2.9812, + "step": 27933 + }, + { + "epoch": 1.37, + "grad_norm": 0.5541436076164246, + "learning_rate": 0.0003410326888900408, + "loss": 3.1712, + "step": 27934 + }, + { + "epoch": 1.37, + "grad_norm": 0.5599698424339294, + "learning_rate": 0.0003410174373381829, + "loss": 3.0917, + "step": 27935 + }, + { + "epoch": 1.37, + "grad_norm": 0.5694020986557007, + "learning_rate": 0.0003410021856782929, + "loss": 3.1889, + "step": 27936 + }, + { + "epoch": 1.37, + "grad_norm": 0.5983900427818298, + "learning_rate": 0.00034098693391041107, + "loss": 2.9114, + "step": 27937 + }, + { + "epoch": 1.37, + "grad_norm": 0.6155137419700623, + "learning_rate": 0.0003409716820345776, + "loss": 3.0607, + "step": 27938 + }, + { + "epoch": 1.37, + "grad_norm": 0.6034281849861145, + "learning_rate": 0.00034095643005083257, + "loss": 2.961, + "step": 27939 + }, + { + "epoch": 1.37, + "grad_norm": 0.5892189145088196, + "learning_rate": 0.0003409411779592161, + "loss": 3.244, + "step": 27940 + }, + { + "epoch": 1.37, + "grad_norm": 0.5782514214515686, + "learning_rate": 0.00034092592575976855, + "loss": 3.2606, + "step": 27941 + }, + { + "epoch": 1.37, + "grad_norm": 0.5579574108123779, + "learning_rate": 0.00034091067345253, + "loss": 2.9648, + "step": 27942 + }, + { + "epoch": 1.37, + "grad_norm": 0.5963599681854248, + "learning_rate": 0.0003408954210375405, + "loss": 3.0709, + "step": 27943 + }, + { + "epoch": 1.37, + "grad_norm": 0.6034666299819946, + "learning_rate": 0.00034088016851484043, + "loss": 2.8616, + "step": 27944 + }, + { + "epoch": 1.37, + "grad_norm": 0.6475614309310913, + "learning_rate": 0.00034086491588446985, + "loss": 3.0576, + "step": 27945 + }, + { + "epoch": 1.37, + "grad_norm": 0.5820233821868896, + "learning_rate": 0.0003408496631464689, + "loss": 2.9848, + "step": 27946 + }, + { + "epoch": 1.37, + "grad_norm": 0.5455297827720642, + "learning_rate": 0.00034083441030087784, + "loss": 3.0637, + "step": 27947 + }, + { + "epoch": 1.37, + "grad_norm": 0.568981945514679, + "learning_rate": 0.00034081915734773675, + "loss": 2.9338, + "step": 27948 + }, + { + "epoch": 1.37, + "grad_norm": 0.5847499370574951, + "learning_rate": 0.00034080390428708595, + "loss": 2.8865, + "step": 27949 + }, + { + "epoch": 1.37, + "grad_norm": 0.6259151697158813, + "learning_rate": 0.0003407886511189654, + "loss": 3.0548, + "step": 27950 + }, + { + "epoch": 1.37, + "grad_norm": 0.5772634744644165, + "learning_rate": 0.00034077339784341544, + "loss": 3.0336, + "step": 27951 + }, + { + "epoch": 1.37, + "grad_norm": 0.5600981712341309, + "learning_rate": 0.00034075814446047617, + "loss": 2.9458, + "step": 27952 + }, + { + "epoch": 1.37, + "grad_norm": 0.5539212226867676, + "learning_rate": 0.0003407428909701879, + "loss": 3.0916, + "step": 27953 + }, + { + "epoch": 1.37, + "grad_norm": 0.598659336566925, + "learning_rate": 0.0003407276373725906, + "loss": 3.18, + "step": 27954 + }, + { + "epoch": 1.37, + "grad_norm": 0.5764479637145996, + "learning_rate": 0.00034071238366772456, + "loss": 3.0754, + "step": 27955 + }, + { + "epoch": 1.37, + "grad_norm": 0.5627712607383728, + "learning_rate": 0.00034069712985563, + "loss": 2.9324, + "step": 27956 + }, + { + "epoch": 1.37, + "grad_norm": 0.6134984493255615, + "learning_rate": 0.00034068187593634696, + "loss": 2.9614, + "step": 27957 + }, + { + "epoch": 1.37, + "grad_norm": 0.581798255443573, + "learning_rate": 0.0003406666219099157, + "loss": 2.8332, + "step": 27958 + }, + { + "epoch": 1.37, + "grad_norm": 0.5963696837425232, + "learning_rate": 0.0003406513677763764, + "loss": 2.9447, + "step": 27959 + }, + { + "epoch": 1.37, + "grad_norm": 0.5944687128067017, + "learning_rate": 0.0003406361135357692, + "loss": 2.9688, + "step": 27960 + }, + { + "epoch": 1.37, + "grad_norm": 0.6353680491447449, + "learning_rate": 0.00034062085918813435, + "loss": 2.8498, + "step": 27961 + }, + { + "epoch": 1.37, + "grad_norm": 0.5559459328651428, + "learning_rate": 0.00034060560473351187, + "loss": 2.8492, + "step": 27962 + }, + { + "epoch": 1.37, + "grad_norm": 0.5717218518257141, + "learning_rate": 0.00034059035017194216, + "loss": 2.782, + "step": 27963 + }, + { + "epoch": 1.37, + "grad_norm": 0.5911750197410583, + "learning_rate": 0.00034057509550346524, + "loss": 3.1497, + "step": 27964 + }, + { + "epoch": 1.37, + "grad_norm": 0.5874834060668945, + "learning_rate": 0.0003405598407281212, + "loss": 2.923, + "step": 27965 + }, + { + "epoch": 1.37, + "grad_norm": 0.5998968482017517, + "learning_rate": 0.0003405445858459505, + "loss": 2.8489, + "step": 27966 + }, + { + "epoch": 1.37, + "grad_norm": 0.576008677482605, + "learning_rate": 0.0003405293308569932, + "loss": 2.9175, + "step": 27967 + }, + { + "epoch": 1.37, + "grad_norm": 0.5811209082603455, + "learning_rate": 0.0003405140757612893, + "loss": 3.0114, + "step": 27968 + }, + { + "epoch": 1.37, + "grad_norm": 0.6185311675071716, + "learning_rate": 0.0003404988205588792, + "loss": 3.1087, + "step": 27969 + }, + { + "epoch": 1.37, + "grad_norm": 0.6029673218727112, + "learning_rate": 0.000340483565249803, + "loss": 3.0329, + "step": 27970 + }, + { + "epoch": 1.37, + "grad_norm": 0.6005308628082275, + "learning_rate": 0.0003404683098341009, + "loss": 3.1651, + "step": 27971 + }, + { + "epoch": 1.37, + "grad_norm": 0.5758666396141052, + "learning_rate": 0.00034045305431181297, + "loss": 3.0992, + "step": 27972 + }, + { + "epoch": 1.37, + "grad_norm": 0.546139657497406, + "learning_rate": 0.00034043779868297953, + "loss": 2.9213, + "step": 27973 + }, + { + "epoch": 1.37, + "grad_norm": 0.5436400771141052, + "learning_rate": 0.00034042254294764073, + "loss": 3.0611, + "step": 27974 + }, + { + "epoch": 1.37, + "grad_norm": 0.5803444385528564, + "learning_rate": 0.00034040728710583665, + "loss": 3.0152, + "step": 27975 + }, + { + "epoch": 1.37, + "grad_norm": 0.5593194961547852, + "learning_rate": 0.0003403920311576077, + "loss": 3.0609, + "step": 27976 + }, + { + "epoch": 1.37, + "grad_norm": 0.5657615661621094, + "learning_rate": 0.0003403767751029938, + "loss": 3.1276, + "step": 27977 + }, + { + "epoch": 1.37, + "grad_norm": 0.5875985622406006, + "learning_rate": 0.00034036151894203536, + "loss": 3.0625, + "step": 27978 + }, + { + "epoch": 1.37, + "grad_norm": 0.55743807554245, + "learning_rate": 0.00034034626267477234, + "loss": 3.1621, + "step": 27979 + }, + { + "epoch": 1.37, + "grad_norm": 0.6265092492103577, + "learning_rate": 0.00034033100630124504, + "loss": 2.9605, + "step": 27980 + }, + { + "epoch": 1.37, + "grad_norm": 0.5701452493667603, + "learning_rate": 0.00034031574982149363, + "loss": 3.0356, + "step": 27981 + }, + { + "epoch": 1.37, + "grad_norm": 0.5367501974105835, + "learning_rate": 0.00034030049323555833, + "loss": 3.0456, + "step": 27982 + }, + { + "epoch": 1.37, + "grad_norm": 0.5916096568107605, + "learning_rate": 0.00034028523654347926, + "loss": 3.2683, + "step": 27983 + }, + { + "epoch": 1.37, + "grad_norm": 0.5602060556411743, + "learning_rate": 0.00034026997974529664, + "loss": 3.2333, + "step": 27984 + }, + { + "epoch": 1.37, + "grad_norm": 0.542546272277832, + "learning_rate": 0.00034025472284105065, + "loss": 2.9991, + "step": 27985 + }, + { + "epoch": 1.37, + "grad_norm": 0.5879301428794861, + "learning_rate": 0.0003402394658307814, + "loss": 2.8912, + "step": 27986 + }, + { + "epoch": 1.37, + "grad_norm": 0.5856759548187256, + "learning_rate": 0.00034022420871452925, + "loss": 2.9014, + "step": 27987 + }, + { + "epoch": 1.37, + "grad_norm": 0.5833997130393982, + "learning_rate": 0.0003402089514923342, + "loss": 2.9192, + "step": 27988 + }, + { + "epoch": 1.37, + "grad_norm": 0.5771424770355225, + "learning_rate": 0.0003401936941642366, + "loss": 3.1536, + "step": 27989 + }, + { + "epoch": 1.37, + "grad_norm": 0.5892386436462402, + "learning_rate": 0.00034017843673027646, + "loss": 2.9945, + "step": 27990 + }, + { + "epoch": 1.37, + "grad_norm": 0.6291250586509705, + "learning_rate": 0.00034016317919049403, + "loss": 2.9909, + "step": 27991 + }, + { + "epoch": 1.37, + "grad_norm": 0.5596926212310791, + "learning_rate": 0.00034014792154492957, + "loss": 3.1103, + "step": 27992 + }, + { + "epoch": 1.37, + "grad_norm": 0.5595067739486694, + "learning_rate": 0.00034013266379362325, + "loss": 3.0245, + "step": 27993 + }, + { + "epoch": 1.37, + "grad_norm": 0.5994234085083008, + "learning_rate": 0.00034011740593661513, + "loss": 3.1417, + "step": 27994 + }, + { + "epoch": 1.37, + "grad_norm": 0.578930675983429, + "learning_rate": 0.00034010214797394555, + "loss": 3.1635, + "step": 27995 + }, + { + "epoch": 1.37, + "grad_norm": 0.6194568872451782, + "learning_rate": 0.00034008688990565456, + "loss": 2.962, + "step": 27996 + }, + { + "epoch": 1.37, + "grad_norm": 0.5864658951759338, + "learning_rate": 0.00034007163173178245, + "loss": 3.2179, + "step": 27997 + }, + { + "epoch": 1.37, + "grad_norm": 0.5644214153289795, + "learning_rate": 0.0003400563734523694, + "loss": 2.9034, + "step": 27998 + }, + { + "epoch": 1.37, + "grad_norm": 0.5568000674247742, + "learning_rate": 0.00034004111506745553, + "loss": 3.0852, + "step": 27999 + }, + { + "epoch": 1.37, + "grad_norm": 0.5540627241134644, + "learning_rate": 0.0003400258565770811, + "loss": 3.0086, + "step": 28000 + }, + { + "epoch": 1.37, + "grad_norm": 0.5743499994277954, + "learning_rate": 0.0003400105979812862, + "loss": 2.6773, + "step": 28001 + }, + { + "epoch": 1.37, + "grad_norm": 0.5981676578521729, + "learning_rate": 0.0003399953392801111, + "loss": 3.0462, + "step": 28002 + }, + { + "epoch": 1.37, + "grad_norm": 0.5800593495368958, + "learning_rate": 0.00033998008047359603, + "loss": 3.116, + "step": 28003 + }, + { + "epoch": 1.37, + "grad_norm": 0.590241014957428, + "learning_rate": 0.0003399648215617812, + "loss": 3.228, + "step": 28004 + }, + { + "epoch": 1.37, + "grad_norm": 0.5530853271484375, + "learning_rate": 0.0003399495625447066, + "loss": 3.315, + "step": 28005 + }, + { + "epoch": 1.37, + "grad_norm": 0.5522137880325317, + "learning_rate": 0.00033993430342241254, + "loss": 3.0778, + "step": 28006 + }, + { + "epoch": 1.37, + "grad_norm": 0.5803789496421814, + "learning_rate": 0.00033991904419493926, + "loss": 3.1161, + "step": 28007 + }, + { + "epoch": 1.37, + "grad_norm": 0.5703111886978149, + "learning_rate": 0.00033990378486232684, + "loss": 3.0318, + "step": 28008 + }, + { + "epoch": 1.37, + "grad_norm": 0.5972906351089478, + "learning_rate": 0.0003398885254246156, + "loss": 3.0836, + "step": 28009 + }, + { + "epoch": 1.37, + "grad_norm": 0.6131359338760376, + "learning_rate": 0.00033987326588184565, + "loss": 3.1311, + "step": 28010 + }, + { + "epoch": 1.37, + "grad_norm": 0.5489694476127625, + "learning_rate": 0.00033985800623405717, + "loss": 3.1116, + "step": 28011 + }, + { + "epoch": 1.37, + "grad_norm": 0.552985668182373, + "learning_rate": 0.0003398427464812904, + "loss": 3.0374, + "step": 28012 + }, + { + "epoch": 1.37, + "grad_norm": 0.581065833568573, + "learning_rate": 0.00033982748662358546, + "loss": 2.8298, + "step": 28013 + }, + { + "epoch": 1.37, + "grad_norm": 0.5935637354850769, + "learning_rate": 0.00033981222666098256, + "loss": 3.1071, + "step": 28014 + }, + { + "epoch": 1.37, + "grad_norm": 0.5556374788284302, + "learning_rate": 0.00033979696659352203, + "loss": 3.204, + "step": 28015 + }, + { + "epoch": 1.37, + "grad_norm": 0.5717475414276123, + "learning_rate": 0.00033978170642124387, + "loss": 2.7786, + "step": 28016 + }, + { + "epoch": 1.37, + "grad_norm": 0.5912850499153137, + "learning_rate": 0.0003397664461441883, + "loss": 3.1273, + "step": 28017 + }, + { + "epoch": 1.37, + "grad_norm": 0.5778431296348572, + "learning_rate": 0.00033975118576239565, + "loss": 2.9896, + "step": 28018 + }, + { + "epoch": 1.37, + "grad_norm": 0.5879780054092407, + "learning_rate": 0.000339735925275906, + "loss": 3.0575, + "step": 28019 + }, + { + "epoch": 1.37, + "grad_norm": 0.5398730635643005, + "learning_rate": 0.00033972066468475955, + "loss": 3.0526, + "step": 28020 + }, + { + "epoch": 1.37, + "grad_norm": 0.6177681088447571, + "learning_rate": 0.00033970540398899654, + "loss": 3.0964, + "step": 28021 + }, + { + "epoch": 1.37, + "grad_norm": 0.5534880757331848, + "learning_rate": 0.00033969014318865713, + "loss": 3.0763, + "step": 28022 + }, + { + "epoch": 1.37, + "grad_norm": 0.5374436378479004, + "learning_rate": 0.0003396748822837815, + "loss": 3.2786, + "step": 28023 + }, + { + "epoch": 1.37, + "grad_norm": 0.5656604170799255, + "learning_rate": 0.0003396596212744099, + "loss": 3.1471, + "step": 28024 + }, + { + "epoch": 1.37, + "grad_norm": 0.5550830960273743, + "learning_rate": 0.0003396443601605825, + "loss": 3.0216, + "step": 28025 + }, + { + "epoch": 1.37, + "grad_norm": 0.5797797441482544, + "learning_rate": 0.0003396290989423395, + "loss": 2.9703, + "step": 28026 + }, + { + "epoch": 1.37, + "grad_norm": 0.5749794244766235, + "learning_rate": 0.000339613837619721, + "loss": 3.1812, + "step": 28027 + }, + { + "epoch": 1.37, + "grad_norm": 0.6089553833007812, + "learning_rate": 0.0003395985761927673, + "loss": 3.0538, + "step": 28028 + }, + { + "epoch": 1.37, + "grad_norm": 0.5771617889404297, + "learning_rate": 0.00033958331466151865, + "loss": 3.1415, + "step": 28029 + }, + { + "epoch": 1.37, + "grad_norm": 0.602817952632904, + "learning_rate": 0.0003395680530260151, + "loss": 3.1378, + "step": 28030 + }, + { + "epoch": 1.37, + "grad_norm": 0.597654402256012, + "learning_rate": 0.000339552791286297, + "loss": 3.1961, + "step": 28031 + }, + { + "epoch": 1.37, + "grad_norm": 0.6101335287094116, + "learning_rate": 0.0003395375294424043, + "loss": 2.981, + "step": 28032 + }, + { + "epoch": 1.37, + "grad_norm": 0.5998036861419678, + "learning_rate": 0.0003395222674943775, + "loss": 3.0004, + "step": 28033 + }, + { + "epoch": 1.37, + "grad_norm": 0.9481212496757507, + "learning_rate": 0.0003395070054422566, + "loss": 3.0037, + "step": 28034 + }, + { + "epoch": 1.37, + "grad_norm": 0.5774849653244019, + "learning_rate": 0.00033949174328608175, + "loss": 3.4101, + "step": 28035 + }, + { + "epoch": 1.37, + "grad_norm": 0.608545184135437, + "learning_rate": 0.0003394764810258934, + "loss": 3.1483, + "step": 28036 + }, + { + "epoch": 1.37, + "grad_norm": 0.5682400465011597, + "learning_rate": 0.00033946121866173146, + "loss": 3.0725, + "step": 28037 + }, + { + "epoch": 1.37, + "grad_norm": 0.634087085723877, + "learning_rate": 0.0003394459561936364, + "loss": 3.0956, + "step": 28038 + }, + { + "epoch": 1.37, + "grad_norm": 0.5982432961463928, + "learning_rate": 0.00033943069362164823, + "loss": 3.0107, + "step": 28039 + }, + { + "epoch": 1.37, + "grad_norm": 0.5551794171333313, + "learning_rate": 0.0003394154309458072, + "loss": 2.9166, + "step": 28040 + }, + { + "epoch": 1.37, + "grad_norm": 0.594632625579834, + "learning_rate": 0.00033940016816615354, + "loss": 2.9562, + "step": 28041 + }, + { + "epoch": 1.37, + "grad_norm": 0.5509760975837708, + "learning_rate": 0.0003393849052827273, + "loss": 3.1103, + "step": 28042 + }, + { + "epoch": 1.37, + "grad_norm": 0.5657804012298584, + "learning_rate": 0.0003393696422955689, + "loss": 3.1675, + "step": 28043 + }, + { + "epoch": 1.37, + "grad_norm": 0.5348253846168518, + "learning_rate": 0.0003393543792047185, + "loss": 3.0305, + "step": 28044 + }, + { + "epoch": 1.37, + "grad_norm": 0.572263240814209, + "learning_rate": 0.0003393391160102161, + "loss": 3.1031, + "step": 28045 + }, + { + "epoch": 1.37, + "grad_norm": 0.581788182258606, + "learning_rate": 0.00033932385271210214, + "loss": 2.9799, + "step": 28046 + }, + { + "epoch": 1.37, + "grad_norm": 0.5556021928787231, + "learning_rate": 0.0003393085893104167, + "loss": 3.1716, + "step": 28047 + }, + { + "epoch": 1.37, + "grad_norm": 0.5477098226547241, + "learning_rate": 0.00033929332580519994, + "loss": 3.0578, + "step": 28048 + }, + { + "epoch": 1.37, + "grad_norm": 0.6444635987281799, + "learning_rate": 0.0003392780621964922, + "loss": 3.1512, + "step": 28049 + }, + { + "epoch": 1.37, + "grad_norm": 0.5834984183311462, + "learning_rate": 0.0003392627984843335, + "loss": 3.3345, + "step": 28050 + }, + { + "epoch": 1.37, + "grad_norm": 0.5475103259086609, + "learning_rate": 0.00033924753466876427, + "loss": 3.094, + "step": 28051 + }, + { + "epoch": 1.37, + "grad_norm": 0.5777970552444458, + "learning_rate": 0.0003392322707498245, + "loss": 3.1841, + "step": 28052 + }, + { + "epoch": 1.37, + "grad_norm": 0.5672211647033691, + "learning_rate": 0.00033921700672755455, + "loss": 3.0541, + "step": 28053 + }, + { + "epoch": 1.37, + "grad_norm": 0.7378207445144653, + "learning_rate": 0.0003392017426019945, + "loss": 3.0522, + "step": 28054 + }, + { + "epoch": 1.37, + "grad_norm": 0.5990445017814636, + "learning_rate": 0.0003391864783731846, + "loss": 3.0336, + "step": 28055 + }, + { + "epoch": 1.37, + "grad_norm": 0.5967203974723816, + "learning_rate": 0.00033917121404116515, + "loss": 2.9228, + "step": 28056 + }, + { + "epoch": 1.38, + "grad_norm": 0.5567073225975037, + "learning_rate": 0.00033915594960597616, + "loss": 3.0333, + "step": 28057 + }, + { + "epoch": 1.38, + "grad_norm": 0.5595899820327759, + "learning_rate": 0.0003391406850676579, + "loss": 3.093, + "step": 28058 + }, + { + "epoch": 1.38, + "grad_norm": 0.5615085363388062, + "learning_rate": 0.0003391254204262507, + "loss": 2.9224, + "step": 28059 + }, + { + "epoch": 1.38, + "grad_norm": 0.5954691767692566, + "learning_rate": 0.00033911015568179466, + "loss": 2.9878, + "step": 28060 + }, + { + "epoch": 1.38, + "grad_norm": 0.557086169719696, + "learning_rate": 0.00033909489083433, + "loss": 3.1065, + "step": 28061 + }, + { + "epoch": 1.38, + "grad_norm": 0.5938393473625183, + "learning_rate": 0.0003390796258838969, + "loss": 2.859, + "step": 28062 + }, + { + "epoch": 1.38, + "grad_norm": 0.5960376262664795, + "learning_rate": 0.00033906436083053557, + "loss": 3.1894, + "step": 28063 + }, + { + "epoch": 1.38, + "grad_norm": 0.5832287669181824, + "learning_rate": 0.0003390490956742862, + "loss": 2.9972, + "step": 28064 + }, + { + "epoch": 1.38, + "grad_norm": 0.8595984578132629, + "learning_rate": 0.0003390338304151891, + "loss": 3.2039, + "step": 28065 + }, + { + "epoch": 1.38, + "grad_norm": 0.5543466806411743, + "learning_rate": 0.0003390185650532844, + "loss": 3.0077, + "step": 28066 + }, + { + "epoch": 1.38, + "grad_norm": 0.6111711263656616, + "learning_rate": 0.00033900329958861227, + "loss": 2.9772, + "step": 28067 + }, + { + "epoch": 1.38, + "grad_norm": 0.5975467562675476, + "learning_rate": 0.0003389880340212129, + "loss": 3.1056, + "step": 28068 + }, + { + "epoch": 1.38, + "grad_norm": 0.5642098784446716, + "learning_rate": 0.00033897276835112675, + "loss": 2.9942, + "step": 28069 + }, + { + "epoch": 1.38, + "grad_norm": 0.5549335479736328, + "learning_rate": 0.0003389575025783937, + "loss": 3.0474, + "step": 28070 + }, + { + "epoch": 1.38, + "grad_norm": 0.6323215961456299, + "learning_rate": 0.00033894223670305403, + "loss": 2.9807, + "step": 28071 + }, + { + "epoch": 1.38, + "grad_norm": 0.5654608607292175, + "learning_rate": 0.000338926970725148, + "loss": 3.0737, + "step": 28072 + }, + { + "epoch": 1.38, + "grad_norm": 0.574693500995636, + "learning_rate": 0.00033891170464471596, + "loss": 2.8669, + "step": 28073 + }, + { + "epoch": 1.38, + "grad_norm": 0.5713275671005249, + "learning_rate": 0.00033889643846179784, + "loss": 3.2703, + "step": 28074 + }, + { + "epoch": 1.38, + "grad_norm": 0.6364814639091492, + "learning_rate": 0.00033888117217643405, + "loss": 3.1299, + "step": 28075 + }, + { + "epoch": 1.38, + "grad_norm": 0.5909932255744934, + "learning_rate": 0.0003388659057886648, + "loss": 3.0662, + "step": 28076 + }, + { + "epoch": 1.38, + "grad_norm": 0.5821861624717712, + "learning_rate": 0.0003388506392985302, + "loss": 3.0567, + "step": 28077 + }, + { + "epoch": 1.38, + "grad_norm": 0.5835837721824646, + "learning_rate": 0.0003388353727060704, + "loss": 3.1389, + "step": 28078 + }, + { + "epoch": 1.38, + "grad_norm": 0.5665825605392456, + "learning_rate": 0.00033882010601132573, + "loss": 3.3765, + "step": 28079 + }, + { + "epoch": 1.38, + "grad_norm": 0.5594505071640015, + "learning_rate": 0.0003388048392143365, + "loss": 3.0203, + "step": 28080 + }, + { + "epoch": 1.38, + "grad_norm": 0.5456429123878479, + "learning_rate": 0.0003387895723151427, + "loss": 3.107, + "step": 28081 + }, + { + "epoch": 1.38, + "grad_norm": 0.571052610874176, + "learning_rate": 0.0003387743053137846, + "loss": 3.1842, + "step": 28082 + }, + { + "epoch": 1.38, + "grad_norm": 0.6176466941833496, + "learning_rate": 0.0003387590382103025, + "loss": 2.9045, + "step": 28083 + }, + { + "epoch": 1.38, + "grad_norm": 0.5826286673545837, + "learning_rate": 0.00033874377100473653, + "loss": 2.9922, + "step": 28084 + }, + { + "epoch": 1.38, + "grad_norm": 0.5976090431213379, + "learning_rate": 0.0003387285036971268, + "loss": 2.9568, + "step": 28085 + }, + { + "epoch": 1.38, + "grad_norm": 0.5784058570861816, + "learning_rate": 0.0003387132362875138, + "loss": 2.9194, + "step": 28086 + }, + { + "epoch": 1.38, + "grad_norm": 0.5768011212348938, + "learning_rate": 0.00033869796877593756, + "loss": 3.0978, + "step": 28087 + }, + { + "epoch": 1.38, + "grad_norm": 0.563029944896698, + "learning_rate": 0.0003386827011624383, + "loss": 3.0959, + "step": 28088 + }, + { + "epoch": 1.38, + "grad_norm": 0.5373599529266357, + "learning_rate": 0.00033866743344705626, + "loss": 3.0315, + "step": 28089 + }, + { + "epoch": 1.38, + "grad_norm": 0.5780729055404663, + "learning_rate": 0.0003386521656298316, + "loss": 3.013, + "step": 28090 + }, + { + "epoch": 1.38, + "grad_norm": 0.5696938037872314, + "learning_rate": 0.0003386368977108047, + "loss": 2.8036, + "step": 28091 + }, + { + "epoch": 1.38, + "grad_norm": 0.55682373046875, + "learning_rate": 0.0003386216296900155, + "loss": 3.0036, + "step": 28092 + }, + { + "epoch": 1.38, + "grad_norm": 0.6155198812484741, + "learning_rate": 0.0003386063615675045, + "loss": 2.979, + "step": 28093 + }, + { + "epoch": 1.38, + "grad_norm": 0.5788460373878479, + "learning_rate": 0.00033859109334331166, + "loss": 3.1754, + "step": 28094 + }, + { + "epoch": 1.38, + "grad_norm": 0.6345138549804688, + "learning_rate": 0.0003385758250174774, + "loss": 3.0151, + "step": 28095 + }, + { + "epoch": 1.38, + "grad_norm": 0.5743170380592346, + "learning_rate": 0.0003385605565900417, + "loss": 3.0853, + "step": 28096 + }, + { + "epoch": 1.38, + "grad_norm": 0.6293991208076477, + "learning_rate": 0.00033854528806104495, + "loss": 2.999, + "step": 28097 + }, + { + "epoch": 1.38, + "grad_norm": 0.6683945655822754, + "learning_rate": 0.00033853001943052736, + "loss": 3.2133, + "step": 28098 + }, + { + "epoch": 1.38, + "grad_norm": 0.5892733335494995, + "learning_rate": 0.00033851475069852916, + "loss": 2.9365, + "step": 28099 + }, + { + "epoch": 1.38, + "grad_norm": 0.5764167904853821, + "learning_rate": 0.0003384994818650905, + "loss": 3.0822, + "step": 28100 + }, + { + "epoch": 1.38, + "grad_norm": 0.5932957530021667, + "learning_rate": 0.00033848421293025156, + "loss": 3.2123, + "step": 28101 + }, + { + "epoch": 1.38, + "grad_norm": 0.5980968475341797, + "learning_rate": 0.0003384689438940528, + "loss": 2.9073, + "step": 28102 + }, + { + "epoch": 1.38, + "grad_norm": 0.5964227318763733, + "learning_rate": 0.0003384536747565341, + "loss": 2.7695, + "step": 28103 + }, + { + "epoch": 1.38, + "grad_norm": 0.5735576152801514, + "learning_rate": 0.00033843840551773564, + "loss": 3.0268, + "step": 28104 + }, + { + "epoch": 1.38, + "grad_norm": 0.6148794889450073, + "learning_rate": 0.000338423136177698, + "loss": 2.8976, + "step": 28105 + }, + { + "epoch": 1.38, + "grad_norm": 0.6786152124404907, + "learning_rate": 0.00033840786673646134, + "loss": 3.1084, + "step": 28106 + }, + { + "epoch": 1.38, + "grad_norm": 0.748644232749939, + "learning_rate": 0.00033839259719406554, + "loss": 3.0318, + "step": 28107 + }, + { + "epoch": 1.38, + "grad_norm": 0.5767213106155396, + "learning_rate": 0.00033837732755055114, + "loss": 2.9741, + "step": 28108 + }, + { + "epoch": 1.38, + "grad_norm": 0.6026296019554138, + "learning_rate": 0.0003383620578059582, + "loss": 3.0951, + "step": 28109 + }, + { + "epoch": 1.38, + "grad_norm": 0.5640915036201477, + "learning_rate": 0.000338346787960327, + "loss": 3.0876, + "step": 28110 + }, + { + "epoch": 1.38, + "grad_norm": 0.5978099703788757, + "learning_rate": 0.00033833151801369773, + "loss": 3.0631, + "step": 28111 + }, + { + "epoch": 1.38, + "grad_norm": 0.5457133054733276, + "learning_rate": 0.0003383162479661106, + "loss": 3.207, + "step": 28112 + }, + { + "epoch": 1.38, + "grad_norm": 0.5326697826385498, + "learning_rate": 0.00033830097781760595, + "loss": 3.086, + "step": 28113 + }, + { + "epoch": 1.38, + "grad_norm": 0.5941699147224426, + "learning_rate": 0.0003382857075682237, + "loss": 3.1537, + "step": 28114 + }, + { + "epoch": 1.38, + "grad_norm": 0.5777954459190369, + "learning_rate": 0.0003382704372180044, + "loss": 3.0681, + "step": 28115 + }, + { + "epoch": 1.38, + "grad_norm": 0.5638835430145264, + "learning_rate": 0.00033825516676698816, + "loss": 3.0887, + "step": 28116 + }, + { + "epoch": 1.38, + "grad_norm": 0.5932229161262512, + "learning_rate": 0.0003382398962152152, + "loss": 2.9348, + "step": 28117 + }, + { + "epoch": 1.38, + "grad_norm": 0.566215991973877, + "learning_rate": 0.0003382246255627256, + "loss": 3.0488, + "step": 28118 + }, + { + "epoch": 1.38, + "grad_norm": 0.5923061370849609, + "learning_rate": 0.0003382093548095597, + "loss": 3.053, + "step": 28119 + }, + { + "epoch": 1.38, + "grad_norm": 0.5887033939361572, + "learning_rate": 0.0003381940839557578, + "loss": 3.1721, + "step": 28120 + }, + { + "epoch": 1.38, + "grad_norm": 0.6126934289932251, + "learning_rate": 0.00033817881300136, + "loss": 3.0277, + "step": 28121 + }, + { + "epoch": 1.38, + "grad_norm": 0.5494149923324585, + "learning_rate": 0.0003381635419464065, + "loss": 3.209, + "step": 28122 + }, + { + "epoch": 1.38, + "grad_norm": 0.5944859385490417, + "learning_rate": 0.00033814827079093755, + "loss": 3.0693, + "step": 28123 + }, + { + "epoch": 1.38, + "grad_norm": 0.5820398926734924, + "learning_rate": 0.0003381329995349935, + "loss": 3.2692, + "step": 28124 + }, + { + "epoch": 1.38, + "grad_norm": 0.580773651599884, + "learning_rate": 0.00033811772817861444, + "loss": 3.0431, + "step": 28125 + }, + { + "epoch": 1.38, + "grad_norm": 0.5761692523956299, + "learning_rate": 0.00033810245672184053, + "loss": 3.0927, + "step": 28126 + }, + { + "epoch": 1.38, + "grad_norm": 0.5915241837501526, + "learning_rate": 0.00033808718516471217, + "loss": 3.1483, + "step": 28127 + }, + { + "epoch": 1.38, + "grad_norm": 0.5700193047523499, + "learning_rate": 0.00033807191350726957, + "loss": 3.0007, + "step": 28128 + }, + { + "epoch": 1.38, + "grad_norm": 0.596190333366394, + "learning_rate": 0.00033805664174955274, + "loss": 2.9628, + "step": 28129 + }, + { + "epoch": 1.38, + "grad_norm": 0.5701046586036682, + "learning_rate": 0.0003380413698916021, + "loss": 3.1288, + "step": 28130 + }, + { + "epoch": 1.38, + "grad_norm": 0.5389936566352844, + "learning_rate": 0.00033802609793345784, + "loss": 3.0459, + "step": 28131 + }, + { + "epoch": 1.38, + "grad_norm": 0.5747068524360657, + "learning_rate": 0.0003380108258751601, + "loss": 2.9633, + "step": 28132 + }, + { + "epoch": 1.38, + "grad_norm": 0.606648862361908, + "learning_rate": 0.00033799555371674916, + "loss": 2.9476, + "step": 28133 + }, + { + "epoch": 1.38, + "grad_norm": 0.6043571829795837, + "learning_rate": 0.00033798028145826526, + "loss": 3.1849, + "step": 28134 + }, + { + "epoch": 1.38, + "grad_norm": 0.5670210719108582, + "learning_rate": 0.0003379650090997486, + "loss": 2.9245, + "step": 28135 + }, + { + "epoch": 1.38, + "grad_norm": 0.5730489492416382, + "learning_rate": 0.00033794973664123937, + "loss": 3.2632, + "step": 28136 + }, + { + "epoch": 1.38, + "grad_norm": 0.6139273643493652, + "learning_rate": 0.000337934464082778, + "loss": 2.8662, + "step": 28137 + }, + { + "epoch": 1.38, + "grad_norm": 0.5544430613517761, + "learning_rate": 0.0003379191914244044, + "loss": 2.9654, + "step": 28138 + }, + { + "epoch": 1.38, + "grad_norm": 0.5451558232307434, + "learning_rate": 0.000337903918666159, + "loss": 3.077, + "step": 28139 + }, + { + "epoch": 1.38, + "grad_norm": 0.5961140990257263, + "learning_rate": 0.0003378886458080819, + "loss": 2.9309, + "step": 28140 + }, + { + "epoch": 1.38, + "grad_norm": 0.5541216135025024, + "learning_rate": 0.00033787337285021347, + "loss": 3.2097, + "step": 28141 + }, + { + "epoch": 1.38, + "grad_norm": 0.6020674705505371, + "learning_rate": 0.0003378580997925939, + "loss": 3.0919, + "step": 28142 + }, + { + "epoch": 1.38, + "grad_norm": 0.5713543891906738, + "learning_rate": 0.0003378428266352633, + "loss": 2.98, + "step": 28143 + }, + { + "epoch": 1.38, + "grad_norm": 0.5623284578323364, + "learning_rate": 0.00033782755337826207, + "loss": 3.1572, + "step": 28144 + }, + { + "epoch": 1.38, + "grad_norm": 0.616407036781311, + "learning_rate": 0.00033781228002163023, + "loss": 3.0148, + "step": 28145 + }, + { + "epoch": 1.38, + "grad_norm": 0.6024094820022583, + "learning_rate": 0.0003377970065654082, + "loss": 3.2478, + "step": 28146 + }, + { + "epoch": 1.38, + "grad_norm": 0.5678855180740356, + "learning_rate": 0.0003377817330096361, + "loss": 3.0946, + "step": 28147 + }, + { + "epoch": 1.38, + "grad_norm": 0.5765829682350159, + "learning_rate": 0.00033776645935435423, + "loss": 3.0297, + "step": 28148 + }, + { + "epoch": 1.38, + "grad_norm": 0.5960372090339661, + "learning_rate": 0.0003377511855996027, + "loss": 3.2084, + "step": 28149 + }, + { + "epoch": 1.38, + "grad_norm": 0.5560441017150879, + "learning_rate": 0.00033773591174542187, + "loss": 3.2332, + "step": 28150 + }, + { + "epoch": 1.38, + "grad_norm": 0.5740705132484436, + "learning_rate": 0.0003377206377918519, + "loss": 3.1825, + "step": 28151 + }, + { + "epoch": 1.38, + "grad_norm": 0.5770044922828674, + "learning_rate": 0.00033770536373893296, + "loss": 2.9729, + "step": 28152 + }, + { + "epoch": 1.38, + "grad_norm": 0.5786614418029785, + "learning_rate": 0.0003376900895867055, + "loss": 2.9429, + "step": 28153 + }, + { + "epoch": 1.38, + "grad_norm": 0.5744767189025879, + "learning_rate": 0.00033767481533520955, + "loss": 3.047, + "step": 28154 + }, + { + "epoch": 1.38, + "grad_norm": 0.5745210647583008, + "learning_rate": 0.0003376595409844852, + "loss": 3.0093, + "step": 28155 + }, + { + "epoch": 1.38, + "grad_norm": 0.5670796632766724, + "learning_rate": 0.000337644266534573, + "loss": 3.3227, + "step": 28156 + }, + { + "epoch": 1.38, + "grad_norm": 0.5535241365432739, + "learning_rate": 0.00033762899198551313, + "loss": 2.8861, + "step": 28157 + }, + { + "epoch": 1.38, + "grad_norm": 0.5871549248695374, + "learning_rate": 0.00033761371733734573, + "loss": 3.0057, + "step": 28158 + }, + { + "epoch": 1.38, + "grad_norm": 0.5588001012802124, + "learning_rate": 0.0003375984425901109, + "loss": 3.0635, + "step": 28159 + }, + { + "epoch": 1.38, + "grad_norm": 0.5769440531730652, + "learning_rate": 0.00033758316774384905, + "loss": 3.0683, + "step": 28160 + }, + { + "epoch": 1.38, + "grad_norm": 0.5671083331108093, + "learning_rate": 0.00033756789279860045, + "loss": 3.2814, + "step": 28161 + }, + { + "epoch": 1.38, + "grad_norm": 0.5821115970611572, + "learning_rate": 0.00033755261775440516, + "loss": 3.0085, + "step": 28162 + }, + { + "epoch": 1.38, + "grad_norm": 0.5854321122169495, + "learning_rate": 0.00033753734261130354, + "loss": 3.0935, + "step": 28163 + }, + { + "epoch": 1.38, + "grad_norm": 0.6202100515365601, + "learning_rate": 0.0003375220673693359, + "loss": 2.9791, + "step": 28164 + }, + { + "epoch": 1.38, + "grad_norm": 0.5914233326911926, + "learning_rate": 0.00033750679202854215, + "loss": 3.1086, + "step": 28165 + }, + { + "epoch": 1.38, + "grad_norm": 0.6070291996002197, + "learning_rate": 0.0003374915165889628, + "loss": 2.9375, + "step": 28166 + }, + { + "epoch": 1.38, + "grad_norm": 0.5866225361824036, + "learning_rate": 0.000337476241050638, + "loss": 2.9277, + "step": 28167 + }, + { + "epoch": 1.38, + "grad_norm": 0.5692453980445862, + "learning_rate": 0.0003374609654136081, + "loss": 3.2167, + "step": 28168 + }, + { + "epoch": 1.38, + "grad_norm": 0.5685532689094543, + "learning_rate": 0.0003374456896779132, + "loss": 3.0796, + "step": 28169 + }, + { + "epoch": 1.38, + "grad_norm": 0.5635455846786499, + "learning_rate": 0.00033743041384359335, + "loss": 2.8701, + "step": 28170 + }, + { + "epoch": 1.38, + "grad_norm": 0.6098818182945251, + "learning_rate": 0.0003374151379106893, + "loss": 2.901, + "step": 28171 + }, + { + "epoch": 1.38, + "grad_norm": 0.579856812953949, + "learning_rate": 0.00033739986187924083, + "loss": 3.2155, + "step": 28172 + }, + { + "epoch": 1.38, + "grad_norm": 0.596610963344574, + "learning_rate": 0.0003373845857492883, + "loss": 2.9086, + "step": 28173 + }, + { + "epoch": 1.38, + "grad_norm": 0.5602899789810181, + "learning_rate": 0.000337369309520872, + "loss": 3.1105, + "step": 28174 + }, + { + "epoch": 1.38, + "grad_norm": 0.5983844995498657, + "learning_rate": 0.00033735403319403215, + "loss": 3.0502, + "step": 28175 + }, + { + "epoch": 1.38, + "grad_norm": 0.5782126784324646, + "learning_rate": 0.0003373387567688089, + "loss": 3.148, + "step": 28176 + }, + { + "epoch": 1.38, + "grad_norm": 0.5897262692451477, + "learning_rate": 0.0003373234802452425, + "loss": 2.9129, + "step": 28177 + }, + { + "epoch": 1.38, + "grad_norm": 0.5535157918930054, + "learning_rate": 0.00033730820362337335, + "loss": 3.263, + "step": 28178 + }, + { + "epoch": 1.38, + "grad_norm": 0.5690072178840637, + "learning_rate": 0.00033729292690324166, + "loss": 3.0606, + "step": 28179 + }, + { + "epoch": 1.38, + "grad_norm": 0.5716756582260132, + "learning_rate": 0.00033727765008488743, + "loss": 3.0934, + "step": 28180 + }, + { + "epoch": 1.38, + "grad_norm": 0.6298738718032837, + "learning_rate": 0.00033726237316835094, + "loss": 2.8377, + "step": 28181 + }, + { + "epoch": 1.38, + "grad_norm": 0.5696219205856323, + "learning_rate": 0.0003372470961536728, + "loss": 3.2661, + "step": 28182 + }, + { + "epoch": 1.38, + "grad_norm": 0.5914949178695679, + "learning_rate": 0.00033723181904089284, + "loss": 3.0636, + "step": 28183 + }, + { + "epoch": 1.38, + "grad_norm": 0.5469275116920471, + "learning_rate": 0.0003372165418300514, + "loss": 3.2017, + "step": 28184 + }, + { + "epoch": 1.38, + "grad_norm": 0.543024480342865, + "learning_rate": 0.00033720126452118883, + "loss": 2.9891, + "step": 28185 + }, + { + "epoch": 1.38, + "grad_norm": 0.5579934120178223, + "learning_rate": 0.0003371859871143452, + "loss": 2.9713, + "step": 28186 + }, + { + "epoch": 1.38, + "grad_norm": 0.5843299627304077, + "learning_rate": 0.0003371707096095609, + "loss": 2.8568, + "step": 28187 + }, + { + "epoch": 1.38, + "grad_norm": 0.5556427240371704, + "learning_rate": 0.000337155432006876, + "loss": 3.2779, + "step": 28188 + }, + { + "epoch": 1.38, + "grad_norm": 0.5713661313056946, + "learning_rate": 0.000337140154306331, + "loss": 2.8248, + "step": 28189 + }, + { + "epoch": 1.38, + "grad_norm": 0.5614469647407532, + "learning_rate": 0.000337124876507966, + "loss": 3.1321, + "step": 28190 + }, + { + "epoch": 1.38, + "grad_norm": 0.5511723160743713, + "learning_rate": 0.00033710959861182107, + "loss": 3.0381, + "step": 28191 + }, + { + "epoch": 1.38, + "grad_norm": 0.5804082751274109, + "learning_rate": 0.00033709432061793663, + "loss": 3.0038, + "step": 28192 + }, + { + "epoch": 1.38, + "grad_norm": 0.5728508234024048, + "learning_rate": 0.000337079042526353, + "loss": 3.1805, + "step": 28193 + }, + { + "epoch": 1.38, + "grad_norm": 0.5317544341087341, + "learning_rate": 0.0003370637643371102, + "loss": 3.1494, + "step": 28194 + }, + { + "epoch": 1.38, + "grad_norm": 0.5826964378356934, + "learning_rate": 0.0003370484860502486, + "loss": 3.038, + "step": 28195 + }, + { + "epoch": 1.38, + "grad_norm": 0.5762989521026611, + "learning_rate": 0.0003370332076658084, + "loss": 3.0305, + "step": 28196 + }, + { + "epoch": 1.38, + "grad_norm": 0.55995774269104, + "learning_rate": 0.00033701792918382987, + "loss": 3.0058, + "step": 28197 + }, + { + "epoch": 1.38, + "grad_norm": 0.6025938987731934, + "learning_rate": 0.0003370026506043533, + "loss": 3.2507, + "step": 28198 + }, + { + "epoch": 1.38, + "grad_norm": 0.5528343915939331, + "learning_rate": 0.0003369873719274188, + "loss": 3.0487, + "step": 28199 + }, + { + "epoch": 1.38, + "grad_norm": 0.5329409241676331, + "learning_rate": 0.0003369720931530667, + "loss": 2.9969, + "step": 28200 + }, + { + "epoch": 1.38, + "grad_norm": 0.5734084248542786, + "learning_rate": 0.00033695681428133725, + "loss": 3.2161, + "step": 28201 + }, + { + "epoch": 1.38, + "grad_norm": 0.562954306602478, + "learning_rate": 0.00033694153531227053, + "loss": 3.1746, + "step": 28202 + }, + { + "epoch": 1.38, + "grad_norm": 0.5255847573280334, + "learning_rate": 0.0003369262562459071, + "loss": 2.9532, + "step": 28203 + }, + { + "epoch": 1.38, + "grad_norm": 0.5966204404830933, + "learning_rate": 0.000336910977082287, + "loss": 3.195, + "step": 28204 + }, + { + "epoch": 1.38, + "grad_norm": 0.549161434173584, + "learning_rate": 0.00033689569782145045, + "loss": 3.1496, + "step": 28205 + }, + { + "epoch": 1.38, + "grad_norm": 0.5623190999031067, + "learning_rate": 0.0003368804184634376, + "loss": 3.1611, + "step": 28206 + }, + { + "epoch": 1.38, + "grad_norm": 0.5726973414421082, + "learning_rate": 0.000336865139008289, + "loss": 3.0249, + "step": 28207 + }, + { + "epoch": 1.38, + "grad_norm": 0.5972573161125183, + "learning_rate": 0.00033684985945604465, + "loss": 3.0751, + "step": 28208 + }, + { + "epoch": 1.38, + "grad_norm": 0.5603774785995483, + "learning_rate": 0.0003368345798067449, + "loss": 3.1158, + "step": 28209 + }, + { + "epoch": 1.38, + "grad_norm": 0.5560487508773804, + "learning_rate": 0.0003368193000604299, + "loss": 3.1079, + "step": 28210 + }, + { + "epoch": 1.38, + "grad_norm": 0.5797421336174011, + "learning_rate": 0.00033680402021714, + "loss": 3.0904, + "step": 28211 + }, + { + "epoch": 1.38, + "grad_norm": 0.5546010136604309, + "learning_rate": 0.0003367887402769153, + "loss": 3.1467, + "step": 28212 + }, + { + "epoch": 1.38, + "grad_norm": 0.6668386459350586, + "learning_rate": 0.0003367734602397962, + "loss": 3.0496, + "step": 28213 + }, + { + "epoch": 1.38, + "grad_norm": 0.5610592365264893, + "learning_rate": 0.0003367581801058229, + "loss": 3.0401, + "step": 28214 + }, + { + "epoch": 1.38, + "grad_norm": 0.5730311870574951, + "learning_rate": 0.00033674289987503566, + "loss": 3.0605, + "step": 28215 + }, + { + "epoch": 1.38, + "grad_norm": 0.5653447508811951, + "learning_rate": 0.0003367276195474747, + "loss": 3.0281, + "step": 28216 + }, + { + "epoch": 1.38, + "grad_norm": 0.5662187933921814, + "learning_rate": 0.0003367123391231801, + "loss": 2.9896, + "step": 28217 + }, + { + "epoch": 1.38, + "grad_norm": 0.5825369954109192, + "learning_rate": 0.00033669705860219233, + "loss": 3.0219, + "step": 28218 + }, + { + "epoch": 1.38, + "grad_norm": 0.5653578042984009, + "learning_rate": 0.0003366817779845517, + "loss": 3.1838, + "step": 28219 + }, + { + "epoch": 1.38, + "grad_norm": 0.5515113472938538, + "learning_rate": 0.00033666649727029824, + "loss": 2.9519, + "step": 28220 + }, + { + "epoch": 1.38, + "grad_norm": 0.5668362975120544, + "learning_rate": 0.0003366512164594723, + "loss": 3.0131, + "step": 28221 + }, + { + "epoch": 1.38, + "grad_norm": 0.5645515322685242, + "learning_rate": 0.0003366359355521141, + "loss": 3.1945, + "step": 28222 + }, + { + "epoch": 1.38, + "grad_norm": 0.5728771686553955, + "learning_rate": 0.0003366206545482639, + "loss": 3.2453, + "step": 28223 + }, + { + "epoch": 1.38, + "grad_norm": 0.5764223337173462, + "learning_rate": 0.00033660537344796187, + "loss": 3.0845, + "step": 28224 + }, + { + "epoch": 1.38, + "grad_norm": 0.5650098323822021, + "learning_rate": 0.00033659009225124836, + "loss": 2.9356, + "step": 28225 + }, + { + "epoch": 1.38, + "grad_norm": 0.5907217860221863, + "learning_rate": 0.0003365748109581637, + "loss": 3.1849, + "step": 28226 + }, + { + "epoch": 1.38, + "grad_norm": 0.5731244683265686, + "learning_rate": 0.0003365595295687479, + "loss": 3.1806, + "step": 28227 + }, + { + "epoch": 1.38, + "grad_norm": 0.5921067595481873, + "learning_rate": 0.0003365442480830414, + "loss": 3.1239, + "step": 28228 + }, + { + "epoch": 1.38, + "grad_norm": 0.5963445901870728, + "learning_rate": 0.00033652896650108435, + "loss": 2.944, + "step": 28229 + }, + { + "epoch": 1.38, + "grad_norm": 0.565613329410553, + "learning_rate": 0.00033651368482291705, + "loss": 2.9348, + "step": 28230 + }, + { + "epoch": 1.38, + "grad_norm": 0.6107422709465027, + "learning_rate": 0.00033649840304857977, + "loss": 2.9047, + "step": 28231 + }, + { + "epoch": 1.38, + "grad_norm": 0.5712419748306274, + "learning_rate": 0.0003364831211781126, + "loss": 3.086, + "step": 28232 + }, + { + "epoch": 1.38, + "grad_norm": 0.5632501840591431, + "learning_rate": 0.0003364678392115561, + "loss": 3.0059, + "step": 28233 + }, + { + "epoch": 1.38, + "grad_norm": 0.5802181363105774, + "learning_rate": 0.0003364525571489502, + "loss": 3.0808, + "step": 28234 + }, + { + "epoch": 1.38, + "grad_norm": 0.5678035616874695, + "learning_rate": 0.00033643727499033533, + "loss": 3.0125, + "step": 28235 + }, + { + "epoch": 1.38, + "grad_norm": 0.5598823428153992, + "learning_rate": 0.0003364219927357516, + "loss": 3.0799, + "step": 28236 + }, + { + "epoch": 1.38, + "grad_norm": 0.5781440138816833, + "learning_rate": 0.00033640671038523943, + "loss": 3.0678, + "step": 28237 + }, + { + "epoch": 1.38, + "grad_norm": 0.5706907510757446, + "learning_rate": 0.000336391427938839, + "loss": 3.1238, + "step": 28238 + }, + { + "epoch": 1.38, + "grad_norm": 0.5403823852539062, + "learning_rate": 0.0003363761453965905, + "loss": 3.0808, + "step": 28239 + }, + { + "epoch": 1.38, + "grad_norm": 0.5869365334510803, + "learning_rate": 0.0003363608627585342, + "loss": 3.202, + "step": 28240 + }, + { + "epoch": 1.38, + "grad_norm": 0.590049684047699, + "learning_rate": 0.00033634558002471055, + "loss": 2.983, + "step": 28241 + }, + { + "epoch": 1.38, + "grad_norm": 0.6182698607444763, + "learning_rate": 0.00033633029719515946, + "loss": 2.9491, + "step": 28242 + }, + { + "epoch": 1.38, + "grad_norm": 0.5630959868431091, + "learning_rate": 0.00033631501426992146, + "loss": 3.1776, + "step": 28243 + }, + { + "epoch": 1.38, + "grad_norm": 0.5790128111839294, + "learning_rate": 0.00033629973124903666, + "loss": 3.2561, + "step": 28244 + }, + { + "epoch": 1.38, + "grad_norm": 0.612781286239624, + "learning_rate": 0.0003362844481325454, + "loss": 3.0595, + "step": 28245 + }, + { + "epoch": 1.38, + "grad_norm": 0.5653966069221497, + "learning_rate": 0.00033626916492048784, + "loss": 3.0165, + "step": 28246 + }, + { + "epoch": 1.38, + "grad_norm": 0.5676288604736328, + "learning_rate": 0.0003362538816129043, + "loss": 3.1517, + "step": 28247 + }, + { + "epoch": 1.38, + "grad_norm": 0.5745164752006531, + "learning_rate": 0.000336238598209835, + "loss": 3.2458, + "step": 28248 + }, + { + "epoch": 1.38, + "grad_norm": 0.624270498752594, + "learning_rate": 0.0003362233147113202, + "loss": 2.9842, + "step": 28249 + }, + { + "epoch": 1.38, + "grad_norm": 0.5752471089363098, + "learning_rate": 0.0003362080311174002, + "loss": 3.08, + "step": 28250 + }, + { + "epoch": 1.38, + "grad_norm": 0.6258599758148193, + "learning_rate": 0.0003361927474281152, + "loss": 2.9069, + "step": 28251 + }, + { + "epoch": 1.38, + "grad_norm": 0.5874921083450317, + "learning_rate": 0.0003361774636435055, + "loss": 3.077, + "step": 28252 + }, + { + "epoch": 1.38, + "grad_norm": 0.5625190734863281, + "learning_rate": 0.0003361621797636112, + "loss": 2.7603, + "step": 28253 + }, + { + "epoch": 1.38, + "grad_norm": 0.5910776257514954, + "learning_rate": 0.00033614689578847275, + "loss": 2.8325, + "step": 28254 + }, + { + "epoch": 1.38, + "grad_norm": 0.5543468594551086, + "learning_rate": 0.0003361316117181304, + "loss": 3.0568, + "step": 28255 + }, + { + "epoch": 1.38, + "grad_norm": 0.6161579489707947, + "learning_rate": 0.0003361163275526243, + "loss": 3.1239, + "step": 28256 + }, + { + "epoch": 1.38, + "grad_norm": 0.5997530817985535, + "learning_rate": 0.0003361010432919947, + "loss": 3.1178, + "step": 28257 + }, + { + "epoch": 1.38, + "grad_norm": 0.6007872223854065, + "learning_rate": 0.0003360857589362819, + "loss": 2.8373, + "step": 28258 + }, + { + "epoch": 1.38, + "grad_norm": 0.5524115562438965, + "learning_rate": 0.00033607047448552617, + "loss": 3.1908, + "step": 28259 + }, + { + "epoch": 1.38, + "grad_norm": 0.5868061184883118, + "learning_rate": 0.00033605518993976783, + "loss": 3.091, + "step": 28260 + }, + { + "epoch": 1.39, + "grad_norm": 0.5432960391044617, + "learning_rate": 0.00033603990529904694, + "loss": 3.1476, + "step": 28261 + }, + { + "epoch": 1.39, + "grad_norm": 0.5888095498085022, + "learning_rate": 0.00033602462056340387, + "loss": 3.0807, + "step": 28262 + }, + { + "epoch": 1.39, + "grad_norm": 0.5865523219108582, + "learning_rate": 0.00033600933573287896, + "loss": 2.9347, + "step": 28263 + }, + { + "epoch": 1.39, + "grad_norm": 0.5949804186820984, + "learning_rate": 0.0003359940508075124, + "loss": 3.151, + "step": 28264 + }, + { + "epoch": 1.39, + "grad_norm": 0.5891909599304199, + "learning_rate": 0.0003359787657873444, + "loss": 2.8826, + "step": 28265 + }, + { + "epoch": 1.39, + "grad_norm": 0.5554693937301636, + "learning_rate": 0.0003359634806724153, + "loss": 3.0168, + "step": 28266 + }, + { + "epoch": 1.39, + "grad_norm": 0.583011269569397, + "learning_rate": 0.0003359481954627653, + "loss": 2.8308, + "step": 28267 + }, + { + "epoch": 1.39, + "grad_norm": 0.5493490695953369, + "learning_rate": 0.00033593291015843456, + "loss": 2.6961, + "step": 28268 + }, + { + "epoch": 1.39, + "grad_norm": 0.5663343667984009, + "learning_rate": 0.0003359176247594635, + "loss": 3.1074, + "step": 28269 + }, + { + "epoch": 1.39, + "grad_norm": 0.6228228807449341, + "learning_rate": 0.00033590233926589246, + "loss": 3.2481, + "step": 28270 + }, + { + "epoch": 1.39, + "grad_norm": 0.6101515293121338, + "learning_rate": 0.00033588705367776145, + "loss": 2.8834, + "step": 28271 + }, + { + "epoch": 1.39, + "grad_norm": 0.5485780239105225, + "learning_rate": 0.0003358717679951109, + "loss": 3.0186, + "step": 28272 + }, + { + "epoch": 1.39, + "grad_norm": 0.5793207287788391, + "learning_rate": 0.0003358564822179809, + "loss": 3.0909, + "step": 28273 + }, + { + "epoch": 1.39, + "grad_norm": 0.6276209354400635, + "learning_rate": 0.000335841196346412, + "loss": 3.0662, + "step": 28274 + }, + { + "epoch": 1.39, + "grad_norm": 0.5886393785476685, + "learning_rate": 0.00033582591038044414, + "loss": 3.1548, + "step": 28275 + }, + { + "epoch": 1.39, + "grad_norm": 0.5739596486091614, + "learning_rate": 0.0003358106243201178, + "loss": 2.8666, + "step": 28276 + }, + { + "epoch": 1.39, + "grad_norm": 0.583791196346283, + "learning_rate": 0.0003357953381654732, + "loss": 3.0231, + "step": 28277 + }, + { + "epoch": 1.39, + "grad_norm": 0.584082305431366, + "learning_rate": 0.00033578005191655047, + "loss": 3.1848, + "step": 28278 + }, + { + "epoch": 1.39, + "grad_norm": 0.5657137632369995, + "learning_rate": 0.00033576476557339, + "loss": 2.9686, + "step": 28279 + }, + { + "epoch": 1.39, + "grad_norm": 0.5658915042877197, + "learning_rate": 0.00033574947913603205, + "loss": 3.1709, + "step": 28280 + }, + { + "epoch": 1.39, + "grad_norm": 0.5720155835151672, + "learning_rate": 0.00033573419260451694, + "loss": 3.1229, + "step": 28281 + }, + { + "epoch": 1.39, + "grad_norm": 0.5654585361480713, + "learning_rate": 0.00033571890597888473, + "loss": 2.9455, + "step": 28282 + }, + { + "epoch": 1.39, + "grad_norm": 0.5792554616928101, + "learning_rate": 0.00033570361925917575, + "loss": 3.1386, + "step": 28283 + }, + { + "epoch": 1.39, + "grad_norm": 0.5770618915557861, + "learning_rate": 0.0003356883324454304, + "loss": 3.1544, + "step": 28284 + }, + { + "epoch": 1.39, + "grad_norm": 0.5782315731048584, + "learning_rate": 0.00033567304553768884, + "loss": 3.1645, + "step": 28285 + }, + { + "epoch": 1.39, + "grad_norm": 0.570188581943512, + "learning_rate": 0.0003356577585359913, + "loss": 3.1487, + "step": 28286 + }, + { + "epoch": 1.39, + "grad_norm": 0.5855571031570435, + "learning_rate": 0.00033564247144037815, + "loss": 2.8481, + "step": 28287 + }, + { + "epoch": 1.39, + "grad_norm": 0.5529902577400208, + "learning_rate": 0.0003356271842508896, + "loss": 2.9648, + "step": 28288 + }, + { + "epoch": 1.39, + "grad_norm": 0.5426622629165649, + "learning_rate": 0.00033561189696756574, + "loss": 3.077, + "step": 28289 + }, + { + "epoch": 1.39, + "grad_norm": 0.5838942527770996, + "learning_rate": 0.00033559660959044715, + "loss": 3.0563, + "step": 28290 + }, + { + "epoch": 1.39, + "grad_norm": 0.6251130700111389, + "learning_rate": 0.00033558132211957385, + "loss": 2.962, + "step": 28291 + }, + { + "epoch": 1.39, + "grad_norm": 0.5622515678405762, + "learning_rate": 0.0003355660345549863, + "loss": 3.0919, + "step": 28292 + }, + { + "epoch": 1.39, + "grad_norm": 0.5667559504508972, + "learning_rate": 0.00033555074689672464, + "loss": 2.9387, + "step": 28293 + }, + { + "epoch": 1.39, + "grad_norm": 0.5650799870491028, + "learning_rate": 0.00033553545914482907, + "loss": 3.1512, + "step": 28294 + }, + { + "epoch": 1.39, + "grad_norm": 0.5878683924674988, + "learning_rate": 0.00033552017129934, + "loss": 2.8555, + "step": 28295 + }, + { + "epoch": 1.39, + "grad_norm": 0.61265629529953, + "learning_rate": 0.00033550488336029765, + "loss": 3.0915, + "step": 28296 + }, + { + "epoch": 1.39, + "grad_norm": 0.5722835659980774, + "learning_rate": 0.00033548959532774225, + "loss": 2.8942, + "step": 28297 + }, + { + "epoch": 1.39, + "grad_norm": 0.5835713744163513, + "learning_rate": 0.0003354743072017141, + "loss": 2.9738, + "step": 28298 + }, + { + "epoch": 1.39, + "grad_norm": 0.6345725655555725, + "learning_rate": 0.0003354590189822534, + "loss": 3.3849, + "step": 28299 + }, + { + "epoch": 1.39, + "grad_norm": 0.5987057685852051, + "learning_rate": 0.0003354437306694005, + "loss": 2.9575, + "step": 28300 + }, + { + "epoch": 1.39, + "grad_norm": 0.5893240571022034, + "learning_rate": 0.00033542844226319566, + "loss": 3.0071, + "step": 28301 + }, + { + "epoch": 1.39, + "grad_norm": 0.595427930355072, + "learning_rate": 0.0003354131537636791, + "loss": 3.1305, + "step": 28302 + }, + { + "epoch": 1.39, + "grad_norm": 0.6372551321983337, + "learning_rate": 0.0003353978651708911, + "loss": 2.9446, + "step": 28303 + }, + { + "epoch": 1.39, + "grad_norm": 0.6060798168182373, + "learning_rate": 0.00033538257648487195, + "loss": 3.0027, + "step": 28304 + }, + { + "epoch": 1.39, + "grad_norm": 0.5530107021331787, + "learning_rate": 0.00033536728770566186, + "loss": 3.0878, + "step": 28305 + }, + { + "epoch": 1.39, + "grad_norm": 0.6247791051864624, + "learning_rate": 0.00033535199883330123, + "loss": 3.2343, + "step": 28306 + }, + { + "epoch": 1.39, + "grad_norm": 0.5921459794044495, + "learning_rate": 0.00033533670986783014, + "loss": 3.1694, + "step": 28307 + }, + { + "epoch": 1.39, + "grad_norm": 0.5848678946495056, + "learning_rate": 0.000335321420809289, + "loss": 3.0928, + "step": 28308 + }, + { + "epoch": 1.39, + "grad_norm": 0.6024354100227356, + "learning_rate": 0.00033530613165771804, + "loss": 2.881, + "step": 28309 + }, + { + "epoch": 1.39, + "grad_norm": 0.5735805034637451, + "learning_rate": 0.0003352908424131576, + "loss": 2.9333, + "step": 28310 + }, + { + "epoch": 1.39, + "grad_norm": 0.5353273153305054, + "learning_rate": 0.00033527555307564773, + "loss": 3.0901, + "step": 28311 + }, + { + "epoch": 1.39, + "grad_norm": 0.5976811647415161, + "learning_rate": 0.00033526026364522895, + "loss": 3.2398, + "step": 28312 + }, + { + "epoch": 1.39, + "grad_norm": 0.5682657361030579, + "learning_rate": 0.00033524497412194137, + "loss": 3.137, + "step": 28313 + }, + { + "epoch": 1.39, + "grad_norm": 0.5598923563957214, + "learning_rate": 0.0003352296845058253, + "loss": 2.9647, + "step": 28314 + }, + { + "epoch": 1.39, + "grad_norm": 0.5730248093605042, + "learning_rate": 0.000335214394796921, + "loss": 3.0873, + "step": 28315 + }, + { + "epoch": 1.39, + "grad_norm": 0.5653427839279175, + "learning_rate": 0.00033519910499526883, + "loss": 2.9421, + "step": 28316 + }, + { + "epoch": 1.39, + "grad_norm": 0.5537395477294922, + "learning_rate": 0.000335183815100909, + "loss": 2.9381, + "step": 28317 + }, + { + "epoch": 1.39, + "grad_norm": 0.5599444508552551, + "learning_rate": 0.00033516852511388174, + "loss": 2.903, + "step": 28318 + }, + { + "epoch": 1.39, + "grad_norm": 0.552375853061676, + "learning_rate": 0.00033515323503422725, + "loss": 3.0835, + "step": 28319 + }, + { + "epoch": 1.39, + "grad_norm": 0.6051978468894958, + "learning_rate": 0.00033513794486198607, + "loss": 3.013, + "step": 28320 + }, + { + "epoch": 1.39, + "grad_norm": 0.5906317234039307, + "learning_rate": 0.0003351226545971983, + "loss": 3.0689, + "step": 28321 + }, + { + "epoch": 1.39, + "grad_norm": 0.5584150552749634, + "learning_rate": 0.0003351073642399041, + "loss": 3.0572, + "step": 28322 + }, + { + "epoch": 1.39, + "grad_norm": 0.5732271075248718, + "learning_rate": 0.0003350920737901439, + "loss": 3.0508, + "step": 28323 + }, + { + "epoch": 1.39, + "grad_norm": 0.6018269658088684, + "learning_rate": 0.000335076783247958, + "loss": 2.9892, + "step": 28324 + }, + { + "epoch": 1.39, + "grad_norm": 0.5475950837135315, + "learning_rate": 0.00033506149261338655, + "loss": 3.0967, + "step": 28325 + }, + { + "epoch": 1.39, + "grad_norm": 0.5918049812316895, + "learning_rate": 0.0003350462018864698, + "loss": 2.9518, + "step": 28326 + }, + { + "epoch": 1.39, + "grad_norm": 0.6317940950393677, + "learning_rate": 0.0003350309110672482, + "loss": 3.0599, + "step": 28327 + }, + { + "epoch": 1.39, + "grad_norm": 0.5397853255271912, + "learning_rate": 0.00033501562015576195, + "loss": 3.1353, + "step": 28328 + }, + { + "epoch": 1.39, + "grad_norm": 0.5789517760276794, + "learning_rate": 0.0003350003291520513, + "loss": 3.1184, + "step": 28329 + }, + { + "epoch": 1.39, + "grad_norm": 0.6313763856887817, + "learning_rate": 0.00033498503805615636, + "loss": 2.9678, + "step": 28330 + }, + { + "epoch": 1.39, + "grad_norm": 0.5650973916053772, + "learning_rate": 0.00033496974686811766, + "loss": 3.1104, + "step": 28331 + }, + { + "epoch": 1.39, + "grad_norm": 0.5666373372077942, + "learning_rate": 0.00033495445558797543, + "loss": 2.8425, + "step": 28332 + }, + { + "epoch": 1.39, + "grad_norm": 0.5651002526283264, + "learning_rate": 0.0003349391642157698, + "loss": 2.909, + "step": 28333 + }, + { + "epoch": 1.39, + "grad_norm": 0.5783306360244751, + "learning_rate": 0.0003349238727515412, + "loss": 3.1748, + "step": 28334 + }, + { + "epoch": 1.39, + "grad_norm": 0.5995091795921326, + "learning_rate": 0.00033490858119532985, + "loss": 2.9953, + "step": 28335 + }, + { + "epoch": 1.39, + "grad_norm": 0.5513354539871216, + "learning_rate": 0.00033489328954717596, + "loss": 2.9552, + "step": 28336 + }, + { + "epoch": 1.39, + "grad_norm": 0.56820148229599, + "learning_rate": 0.0003348779978071199, + "loss": 3.0048, + "step": 28337 + }, + { + "epoch": 1.39, + "grad_norm": 0.5699313282966614, + "learning_rate": 0.0003348627059752019, + "loss": 3.1077, + "step": 28338 + }, + { + "epoch": 1.39, + "grad_norm": 0.5874160528182983, + "learning_rate": 0.00033484741405146227, + "loss": 3.1624, + "step": 28339 + }, + { + "epoch": 1.39, + "grad_norm": 0.5903382301330566, + "learning_rate": 0.00033483212203594116, + "loss": 3.124, + "step": 28340 + }, + { + "epoch": 1.39, + "grad_norm": 0.5418932437896729, + "learning_rate": 0.00033481682992867904, + "loss": 3.1153, + "step": 28341 + }, + { + "epoch": 1.39, + "grad_norm": 0.6493607759475708, + "learning_rate": 0.00033480153772971603, + "loss": 2.9999, + "step": 28342 + }, + { + "epoch": 1.39, + "grad_norm": 0.5959764122962952, + "learning_rate": 0.00033478624543909256, + "loss": 3.0586, + "step": 28343 + }, + { + "epoch": 1.39, + "grad_norm": 0.5970855355262756, + "learning_rate": 0.0003347709530568487, + "loss": 3.1342, + "step": 28344 + }, + { + "epoch": 1.39, + "grad_norm": 0.5649682879447937, + "learning_rate": 0.00033475566058302486, + "loss": 3.1413, + "step": 28345 + }, + { + "epoch": 1.39, + "grad_norm": 0.6004948616027832, + "learning_rate": 0.00033474036801766136, + "loss": 3.0267, + "step": 28346 + }, + { + "epoch": 1.39, + "grad_norm": 0.6019605398178101, + "learning_rate": 0.0003347250753607984, + "loss": 2.954, + "step": 28347 + }, + { + "epoch": 1.39, + "grad_norm": 0.5807358026504517, + "learning_rate": 0.0003347097826124762, + "loss": 3.1699, + "step": 28348 + }, + { + "epoch": 1.39, + "grad_norm": 0.5428721308708191, + "learning_rate": 0.0003346944897727352, + "loss": 3.1219, + "step": 28349 + }, + { + "epoch": 1.39, + "grad_norm": 0.6022670269012451, + "learning_rate": 0.00033467919684161554, + "loss": 3.1146, + "step": 28350 + }, + { + "epoch": 1.39, + "grad_norm": 0.5750691294670105, + "learning_rate": 0.0003346639038191576, + "loss": 3.06, + "step": 28351 + }, + { + "epoch": 1.39, + "grad_norm": 0.5875911712646484, + "learning_rate": 0.0003346486107054016, + "loss": 3.0479, + "step": 28352 + }, + { + "epoch": 1.39, + "grad_norm": 0.588360071182251, + "learning_rate": 0.0003346333175003878, + "loss": 3.0508, + "step": 28353 + }, + { + "epoch": 1.39, + "grad_norm": 0.5815553665161133, + "learning_rate": 0.00033461802420415654, + "loss": 3.2218, + "step": 28354 + }, + { + "epoch": 1.39, + "grad_norm": 0.5743089914321899, + "learning_rate": 0.00033460273081674797, + "loss": 3.039, + "step": 28355 + }, + { + "epoch": 1.39, + "grad_norm": 0.5957072377204895, + "learning_rate": 0.0003345874373382025, + "loss": 3.0255, + "step": 28356 + }, + { + "epoch": 1.39, + "grad_norm": 0.5829094648361206, + "learning_rate": 0.0003345721437685605, + "loss": 3.2637, + "step": 28357 + }, + { + "epoch": 1.39, + "grad_norm": 0.5787622928619385, + "learning_rate": 0.00033455685010786205, + "loss": 3.0796, + "step": 28358 + }, + { + "epoch": 1.39, + "grad_norm": 0.6022943258285522, + "learning_rate": 0.0003345415563561475, + "loss": 3.2131, + "step": 28359 + }, + { + "epoch": 1.39, + "grad_norm": 0.59725421667099, + "learning_rate": 0.00033452626251345713, + "loss": 3.1263, + "step": 28360 + }, + { + "epoch": 1.39, + "grad_norm": 0.5658643245697021, + "learning_rate": 0.00033451096857983124, + "loss": 2.9285, + "step": 28361 + }, + { + "epoch": 1.39, + "grad_norm": 0.5705099105834961, + "learning_rate": 0.0003344956745553101, + "loss": 2.9658, + "step": 28362 + }, + { + "epoch": 1.39, + "grad_norm": 0.5385516881942749, + "learning_rate": 0.000334480380439934, + "loss": 3.1083, + "step": 28363 + }, + { + "epoch": 1.39, + "grad_norm": 0.5449607372283936, + "learning_rate": 0.00033446508623374317, + "loss": 3.1891, + "step": 28364 + }, + { + "epoch": 1.39, + "grad_norm": 0.6035528779029846, + "learning_rate": 0.000334449791936778, + "loss": 3.2337, + "step": 28365 + }, + { + "epoch": 1.39, + "grad_norm": 0.5781618356704712, + "learning_rate": 0.0003344344975490786, + "loss": 3.1241, + "step": 28366 + }, + { + "epoch": 1.39, + "grad_norm": 0.561470091342926, + "learning_rate": 0.0003344192030706855, + "loss": 3.0613, + "step": 28367 + }, + { + "epoch": 1.39, + "grad_norm": 0.5787182450294495, + "learning_rate": 0.00033440390850163876, + "loss": 3.063, + "step": 28368 + }, + { + "epoch": 1.39, + "grad_norm": 0.5583317875862122, + "learning_rate": 0.0003343886138419788, + "loss": 3.0326, + "step": 28369 + }, + { + "epoch": 1.39, + "grad_norm": 0.5830174088478088, + "learning_rate": 0.0003343733190917458, + "loss": 3.1601, + "step": 28370 + }, + { + "epoch": 1.39, + "grad_norm": 0.5853716731071472, + "learning_rate": 0.00033435802425098006, + "loss": 2.985, + "step": 28371 + }, + { + "epoch": 1.39, + "grad_norm": 0.5587119460105896, + "learning_rate": 0.000334342729319722, + "loss": 3.1248, + "step": 28372 + }, + { + "epoch": 1.39, + "grad_norm": 0.5844123363494873, + "learning_rate": 0.00033432743429801176, + "loss": 3.1114, + "step": 28373 + }, + { + "epoch": 1.39, + "grad_norm": 0.620872974395752, + "learning_rate": 0.0003343121391858896, + "loss": 3.1804, + "step": 28374 + }, + { + "epoch": 1.39, + "grad_norm": 0.5749686360359192, + "learning_rate": 0.00033429684398339596, + "loss": 2.8822, + "step": 28375 + }, + { + "epoch": 1.39, + "grad_norm": 0.5744134783744812, + "learning_rate": 0.00033428154869057097, + "loss": 3.0772, + "step": 28376 + }, + { + "epoch": 1.39, + "grad_norm": 0.6181778311729431, + "learning_rate": 0.000334266253307455, + "loss": 3.1124, + "step": 28377 + }, + { + "epoch": 1.39, + "grad_norm": 0.5899921655654907, + "learning_rate": 0.0003342509578340883, + "loss": 3.2172, + "step": 28378 + }, + { + "epoch": 1.39, + "grad_norm": 0.5781901478767395, + "learning_rate": 0.00033423566227051127, + "loss": 3.12, + "step": 28379 + }, + { + "epoch": 1.39, + "grad_norm": 0.5640218257904053, + "learning_rate": 0.000334220366616764, + "loss": 2.9047, + "step": 28380 + }, + { + "epoch": 1.39, + "grad_norm": 0.6122222542762756, + "learning_rate": 0.0003342050708728868, + "loss": 3.0602, + "step": 28381 + }, + { + "epoch": 1.39, + "grad_norm": 0.5799720883369446, + "learning_rate": 0.0003341897750389201, + "loss": 3.148, + "step": 28382 + }, + { + "epoch": 1.39, + "grad_norm": 0.57460618019104, + "learning_rate": 0.0003341744791149042, + "loss": 3.3129, + "step": 28383 + }, + { + "epoch": 1.39, + "grad_norm": 0.5734221339225769, + "learning_rate": 0.0003341591831008792, + "loss": 3.0301, + "step": 28384 + }, + { + "epoch": 1.39, + "grad_norm": 0.5949209928512573, + "learning_rate": 0.0003341438869968855, + "loss": 3.1525, + "step": 28385 + }, + { + "epoch": 1.39, + "grad_norm": 0.5924066305160522, + "learning_rate": 0.00033412859080296335, + "loss": 3.2143, + "step": 28386 + }, + { + "epoch": 1.39, + "grad_norm": 0.5933117270469666, + "learning_rate": 0.0003341132945191531, + "loss": 3.2277, + "step": 28387 + }, + { + "epoch": 1.39, + "grad_norm": 0.5546301007270813, + "learning_rate": 0.00033409799814549496, + "loss": 2.9102, + "step": 28388 + }, + { + "epoch": 1.39, + "grad_norm": 0.5834781527519226, + "learning_rate": 0.0003340827016820293, + "loss": 2.8856, + "step": 28389 + }, + { + "epoch": 1.39, + "grad_norm": 0.5664399266242981, + "learning_rate": 0.00033406740512879635, + "loss": 3.0256, + "step": 28390 + }, + { + "epoch": 1.39, + "grad_norm": 0.5528917908668518, + "learning_rate": 0.00033405210848583636, + "loss": 2.9815, + "step": 28391 + }, + { + "epoch": 1.39, + "grad_norm": 0.5898980498313904, + "learning_rate": 0.00033403681175318974, + "loss": 2.9924, + "step": 28392 + }, + { + "epoch": 1.39, + "grad_norm": 0.6130119562149048, + "learning_rate": 0.00033402151493089663, + "loss": 2.812, + "step": 28393 + }, + { + "epoch": 1.39, + "grad_norm": 0.5696910619735718, + "learning_rate": 0.0003340062180189975, + "loss": 3.2702, + "step": 28394 + }, + { + "epoch": 1.39, + "grad_norm": 0.5849608778953552, + "learning_rate": 0.0003339909210175325, + "loss": 3.0146, + "step": 28395 + }, + { + "epoch": 1.39, + "grad_norm": 0.596095621585846, + "learning_rate": 0.0003339756239265419, + "loss": 2.9181, + "step": 28396 + }, + { + "epoch": 1.39, + "grad_norm": 0.5665485262870789, + "learning_rate": 0.0003339603267460661, + "loss": 3.069, + "step": 28397 + }, + { + "epoch": 1.39, + "grad_norm": 0.5528839826583862, + "learning_rate": 0.0003339450294761453, + "loss": 3.0766, + "step": 28398 + }, + { + "epoch": 1.39, + "grad_norm": 0.5885125398635864, + "learning_rate": 0.00033392973211681987, + "loss": 2.9717, + "step": 28399 + }, + { + "epoch": 1.39, + "grad_norm": 0.6001590490341187, + "learning_rate": 0.00033391443466813004, + "loss": 2.8122, + "step": 28400 + }, + { + "epoch": 1.39, + "grad_norm": 0.5653418898582458, + "learning_rate": 0.0003338991371301161, + "loss": 3.1272, + "step": 28401 + }, + { + "epoch": 1.39, + "grad_norm": 0.577953577041626, + "learning_rate": 0.0003338838395028183, + "loss": 3.0803, + "step": 28402 + }, + { + "epoch": 1.39, + "grad_norm": 0.5523266196250916, + "learning_rate": 0.00033386854178627705, + "loss": 3.0247, + "step": 28403 + }, + { + "epoch": 1.39, + "grad_norm": 0.5539414882659912, + "learning_rate": 0.0003338532439805326, + "loss": 3.1748, + "step": 28404 + }, + { + "epoch": 1.39, + "grad_norm": 0.5848209857940674, + "learning_rate": 0.0003338379460856253, + "loss": 2.9997, + "step": 28405 + }, + { + "epoch": 1.39, + "grad_norm": 0.573849618434906, + "learning_rate": 0.00033382264810159523, + "loss": 3.0206, + "step": 28406 + }, + { + "epoch": 1.39, + "grad_norm": 0.575800895690918, + "learning_rate": 0.00033380735002848274, + "loss": 3.0534, + "step": 28407 + }, + { + "epoch": 1.39, + "grad_norm": 0.5938015580177307, + "learning_rate": 0.0003337920518663284, + "loss": 3.1933, + "step": 28408 + }, + { + "epoch": 1.39, + "grad_norm": 0.5658040642738342, + "learning_rate": 0.0003337767536151722, + "loss": 2.8285, + "step": 28409 + }, + { + "epoch": 1.39, + "grad_norm": 0.5854364633560181, + "learning_rate": 0.0003337614552750545, + "loss": 2.8454, + "step": 28410 + }, + { + "epoch": 1.39, + "grad_norm": 0.5785530209541321, + "learning_rate": 0.00033374615684601567, + "loss": 3.0105, + "step": 28411 + }, + { + "epoch": 1.39, + "grad_norm": 0.5590497851371765, + "learning_rate": 0.000333730858328096, + "loss": 2.8793, + "step": 28412 + }, + { + "epoch": 1.39, + "grad_norm": 0.5437723994255066, + "learning_rate": 0.00033371555972133563, + "loss": 2.9129, + "step": 28413 + }, + { + "epoch": 1.39, + "grad_norm": 0.6012238264083862, + "learning_rate": 0.00033370026102577503, + "loss": 3.1785, + "step": 28414 + }, + { + "epoch": 1.39, + "grad_norm": 0.5660067200660706, + "learning_rate": 0.0003336849622414544, + "loss": 3.0921, + "step": 28415 + }, + { + "epoch": 1.39, + "grad_norm": 0.5316462516784668, + "learning_rate": 0.0003336696633684141, + "loss": 3.1858, + "step": 28416 + }, + { + "epoch": 1.39, + "grad_norm": 0.6181156039237976, + "learning_rate": 0.0003336543644066944, + "loss": 3.0513, + "step": 28417 + }, + { + "epoch": 1.39, + "grad_norm": 0.5790272951126099, + "learning_rate": 0.00033363906535633546, + "loss": 3.0921, + "step": 28418 + }, + { + "epoch": 1.39, + "grad_norm": 0.553527295589447, + "learning_rate": 0.0003336237662173779, + "loss": 2.9811, + "step": 28419 + }, + { + "epoch": 1.39, + "grad_norm": 0.5918067097663879, + "learning_rate": 0.00033360846698986165, + "loss": 2.9584, + "step": 28420 + }, + { + "epoch": 1.39, + "grad_norm": 0.5768551826477051, + "learning_rate": 0.0003335931676738272, + "loss": 3.1019, + "step": 28421 + }, + { + "epoch": 1.39, + "grad_norm": 0.573397696018219, + "learning_rate": 0.00033357786826931484, + "loss": 3.0213, + "step": 28422 + }, + { + "epoch": 1.39, + "grad_norm": 0.5723202228546143, + "learning_rate": 0.00033356256877636485, + "loss": 2.847, + "step": 28423 + }, + { + "epoch": 1.39, + "grad_norm": 0.5666738748550415, + "learning_rate": 0.00033354726919501747, + "loss": 2.8922, + "step": 28424 + }, + { + "epoch": 1.39, + "grad_norm": 0.5918300747871399, + "learning_rate": 0.00033353196952531304, + "loss": 3.0617, + "step": 28425 + }, + { + "epoch": 1.39, + "grad_norm": 0.5779464840888977, + "learning_rate": 0.0003335166697672919, + "loss": 3.0998, + "step": 28426 + }, + { + "epoch": 1.39, + "grad_norm": 0.5665059685707092, + "learning_rate": 0.00033350136992099416, + "loss": 3.207, + "step": 28427 + }, + { + "epoch": 1.39, + "grad_norm": 0.5882079601287842, + "learning_rate": 0.0003334860699864604, + "loss": 3.0049, + "step": 28428 + }, + { + "epoch": 1.39, + "grad_norm": 0.5963062047958374, + "learning_rate": 0.00033347076996373073, + "loss": 3.2008, + "step": 28429 + }, + { + "epoch": 1.39, + "grad_norm": 0.5421083569526672, + "learning_rate": 0.0003334554698528456, + "loss": 2.9144, + "step": 28430 + }, + { + "epoch": 1.39, + "grad_norm": 0.5771951675415039, + "learning_rate": 0.0003334401696538451, + "loss": 3.1604, + "step": 28431 + }, + { + "epoch": 1.39, + "grad_norm": 0.6018733978271484, + "learning_rate": 0.00033342486936676957, + "loss": 3.2034, + "step": 28432 + }, + { + "epoch": 1.39, + "grad_norm": 0.5767068862915039, + "learning_rate": 0.0003334095689916594, + "loss": 3.2174, + "step": 28433 + }, + { + "epoch": 1.39, + "grad_norm": 0.5713096857070923, + "learning_rate": 0.00033339426852855496, + "loss": 3.0759, + "step": 28434 + }, + { + "epoch": 1.39, + "grad_norm": 0.5891026258468628, + "learning_rate": 0.0003333789679774964, + "loss": 3.3295, + "step": 28435 + }, + { + "epoch": 1.39, + "grad_norm": 0.5787071585655212, + "learning_rate": 0.000333363667338524, + "loss": 3.164, + "step": 28436 + }, + { + "epoch": 1.39, + "grad_norm": 0.6013773679733276, + "learning_rate": 0.00033334836661167816, + "loss": 2.8607, + "step": 28437 + }, + { + "epoch": 1.39, + "grad_norm": 0.5678498148918152, + "learning_rate": 0.00033333306579699914, + "loss": 3.0511, + "step": 28438 + }, + { + "epoch": 1.39, + "grad_norm": 0.6372431516647339, + "learning_rate": 0.00033331776489452724, + "loss": 2.7539, + "step": 28439 + }, + { + "epoch": 1.39, + "grad_norm": 0.5957581400871277, + "learning_rate": 0.0003333024639043028, + "loss": 3.117, + "step": 28440 + }, + { + "epoch": 1.39, + "grad_norm": 0.5882934927940369, + "learning_rate": 0.0003332871628263661, + "loss": 3.0676, + "step": 28441 + }, + { + "epoch": 1.39, + "grad_norm": 0.5675431489944458, + "learning_rate": 0.00033327186166075723, + "loss": 3.2111, + "step": 28442 + }, + { + "epoch": 1.39, + "grad_norm": 0.5941580533981323, + "learning_rate": 0.0003332565604075168, + "loss": 3.0244, + "step": 28443 + }, + { + "epoch": 1.39, + "grad_norm": 0.6287396550178528, + "learning_rate": 0.000333241259066685, + "loss": 3.1406, + "step": 28444 + }, + { + "epoch": 1.39, + "grad_norm": 0.5600050091743469, + "learning_rate": 0.0003332259576383022, + "loss": 2.9896, + "step": 28445 + }, + { + "epoch": 1.39, + "grad_norm": 0.5782938599586487, + "learning_rate": 0.00033321065612240854, + "loss": 3.0307, + "step": 28446 + }, + { + "epoch": 1.39, + "grad_norm": 0.5861037969589233, + "learning_rate": 0.0003331953545190444, + "loss": 2.9698, + "step": 28447 + }, + { + "epoch": 1.39, + "grad_norm": 0.5864157676696777, + "learning_rate": 0.00033318005282825, + "loss": 2.9486, + "step": 28448 + }, + { + "epoch": 1.39, + "grad_norm": 0.5786110162734985, + "learning_rate": 0.0003331647510500659, + "loss": 2.9526, + "step": 28449 + }, + { + "epoch": 1.39, + "grad_norm": 0.5792027711868286, + "learning_rate": 0.0003331494491845321, + "loss": 3.2205, + "step": 28450 + }, + { + "epoch": 1.39, + "grad_norm": 0.546311616897583, + "learning_rate": 0.00033313414723168904, + "loss": 2.8147, + "step": 28451 + }, + { + "epoch": 1.39, + "grad_norm": 0.5609930753707886, + "learning_rate": 0.0003331188451915771, + "loss": 3.0984, + "step": 28452 + }, + { + "epoch": 1.39, + "grad_norm": 0.6086905598640442, + "learning_rate": 0.0003331035430642364, + "loss": 3.138, + "step": 28453 + }, + { + "epoch": 1.39, + "grad_norm": 0.5497440099716187, + "learning_rate": 0.00033308824084970736, + "loss": 3.0029, + "step": 28454 + }, + { + "epoch": 1.39, + "grad_norm": 0.5903970003128052, + "learning_rate": 0.0003330729385480303, + "loss": 2.9264, + "step": 28455 + }, + { + "epoch": 1.39, + "grad_norm": 0.6043407320976257, + "learning_rate": 0.0003330576361592455, + "loss": 2.8884, + "step": 28456 + }, + { + "epoch": 1.39, + "grad_norm": 0.559085488319397, + "learning_rate": 0.0003330423336833933, + "loss": 3.0239, + "step": 28457 + }, + { + "epoch": 1.39, + "grad_norm": 0.5664832592010498, + "learning_rate": 0.0003330270311205137, + "loss": 3.116, + "step": 28458 + }, + { + "epoch": 1.39, + "grad_norm": 0.5917896032333374, + "learning_rate": 0.0003330117284706475, + "loss": 3.139, + "step": 28459 + }, + { + "epoch": 1.39, + "grad_norm": 0.588679850101471, + "learning_rate": 0.0003329964257338347, + "loss": 3.0135, + "step": 28460 + }, + { + "epoch": 1.39, + "grad_norm": 0.5861005187034607, + "learning_rate": 0.0003329811229101156, + "loss": 2.8976, + "step": 28461 + }, + { + "epoch": 1.39, + "grad_norm": 0.5760902166366577, + "learning_rate": 0.00033296581999953063, + "loss": 3.094, + "step": 28462 + }, + { + "epoch": 1.39, + "grad_norm": 0.5762536525726318, + "learning_rate": 0.00033295051700212003, + "loss": 3.2222, + "step": 28463 + }, + { + "epoch": 1.39, + "grad_norm": 0.5697655081748962, + "learning_rate": 0.0003329352139179241, + "loss": 3.0622, + "step": 28464 + }, + { + "epoch": 1.4, + "grad_norm": 0.6054670810699463, + "learning_rate": 0.0003329199107469832, + "loss": 2.8424, + "step": 28465 + }, + { + "epoch": 1.4, + "grad_norm": 0.5558887720108032, + "learning_rate": 0.00033290460748933745, + "loss": 3.2195, + "step": 28466 + }, + { + "epoch": 1.4, + "grad_norm": 0.6065505743026733, + "learning_rate": 0.0003328893041450275, + "loss": 3.1301, + "step": 28467 + }, + { + "epoch": 1.4, + "grad_norm": 0.6185204982757568, + "learning_rate": 0.00033287400071409327, + "loss": 3.1541, + "step": 28468 + }, + { + "epoch": 1.4, + "grad_norm": 0.5224096179008484, + "learning_rate": 0.0003328586971965753, + "loss": 2.9278, + "step": 28469 + }, + { + "epoch": 1.4, + "grad_norm": 0.5885241627693176, + "learning_rate": 0.0003328433935925139, + "loss": 3.2153, + "step": 28470 + }, + { + "epoch": 1.4, + "grad_norm": 0.5575811266899109, + "learning_rate": 0.00033282808990194935, + "loss": 3.1026, + "step": 28471 + }, + { + "epoch": 1.4, + "grad_norm": 0.6058753132820129, + "learning_rate": 0.00033281278612492187, + "loss": 2.8013, + "step": 28472 + }, + { + "epoch": 1.4, + "grad_norm": 0.585891842842102, + "learning_rate": 0.0003327974822614718, + "loss": 3.0083, + "step": 28473 + }, + { + "epoch": 1.4, + "grad_norm": 0.5971677303314209, + "learning_rate": 0.0003327821783116395, + "loss": 3.1401, + "step": 28474 + }, + { + "epoch": 1.4, + "grad_norm": 0.5844119191169739, + "learning_rate": 0.0003327668742754653, + "loss": 2.9653, + "step": 28475 + }, + { + "epoch": 1.4, + "grad_norm": 0.6075958609580994, + "learning_rate": 0.0003327515701529894, + "loss": 3.0223, + "step": 28476 + }, + { + "epoch": 1.4, + "grad_norm": 0.5761697888374329, + "learning_rate": 0.00033273626594425217, + "loss": 3.2987, + "step": 28477 + }, + { + "epoch": 1.4, + "grad_norm": 0.7378432750701904, + "learning_rate": 0.0003327209616492939, + "loss": 3.1888, + "step": 28478 + }, + { + "epoch": 1.4, + "grad_norm": 0.5896404981613159, + "learning_rate": 0.0003327056572681549, + "loss": 3.1417, + "step": 28479 + }, + { + "epoch": 1.4, + "grad_norm": 0.581295371055603, + "learning_rate": 0.00033269035280087555, + "loss": 2.915, + "step": 28480 + }, + { + "epoch": 1.4, + "grad_norm": 0.5805588960647583, + "learning_rate": 0.0003326750482474961, + "loss": 3.0804, + "step": 28481 + }, + { + "epoch": 1.4, + "grad_norm": 0.631949782371521, + "learning_rate": 0.00033265974360805687, + "loss": 2.8173, + "step": 28482 + }, + { + "epoch": 1.4, + "grad_norm": 0.5704339742660522, + "learning_rate": 0.0003326444388825981, + "loss": 2.9781, + "step": 28483 + }, + { + "epoch": 1.4, + "grad_norm": 0.5693856477737427, + "learning_rate": 0.0003326291340711602, + "loss": 2.923, + "step": 28484 + }, + { + "epoch": 1.4, + "grad_norm": 0.5859050750732422, + "learning_rate": 0.0003326138291737835, + "loss": 2.9329, + "step": 28485 + }, + { + "epoch": 1.4, + "grad_norm": 0.5935141444206238, + "learning_rate": 0.00033259852419050814, + "loss": 3.1226, + "step": 28486 + }, + { + "epoch": 1.4, + "grad_norm": 0.6061211228370667, + "learning_rate": 0.00033258321912137465, + "loss": 3.1309, + "step": 28487 + }, + { + "epoch": 1.4, + "grad_norm": 0.6038804650306702, + "learning_rate": 0.0003325679139664231, + "loss": 2.8745, + "step": 28488 + }, + { + "epoch": 1.4, + "grad_norm": 0.6384372711181641, + "learning_rate": 0.00033255260872569405, + "loss": 2.9845, + "step": 28489 + }, + { + "epoch": 1.4, + "grad_norm": 0.6087737679481506, + "learning_rate": 0.00033253730339922766, + "loss": 3.0332, + "step": 28490 + }, + { + "epoch": 1.4, + "grad_norm": 0.544606626033783, + "learning_rate": 0.00033252199798706426, + "loss": 3.0271, + "step": 28491 + }, + { + "epoch": 1.4, + "grad_norm": 0.6075654625892639, + "learning_rate": 0.00033250669248924416, + "loss": 3.1083, + "step": 28492 + }, + { + "epoch": 1.4, + "grad_norm": 0.5745288729667664, + "learning_rate": 0.0003324913869058077, + "loss": 3.118, + "step": 28493 + }, + { + "epoch": 1.4, + "grad_norm": 0.585982620716095, + "learning_rate": 0.0003324760812367952, + "loss": 2.8791, + "step": 28494 + }, + { + "epoch": 1.4, + "grad_norm": 0.6062518358230591, + "learning_rate": 0.00033246077548224686, + "loss": 3.0527, + "step": 28495 + }, + { + "epoch": 1.4, + "grad_norm": 0.5591403841972351, + "learning_rate": 0.0003324454696422033, + "loss": 3.087, + "step": 28496 + }, + { + "epoch": 1.4, + "grad_norm": 0.6129733324050903, + "learning_rate": 0.00033243016371670446, + "loss": 3.1021, + "step": 28497 + }, + { + "epoch": 1.4, + "grad_norm": 0.5932202935218811, + "learning_rate": 0.00033241485770579084, + "loss": 2.89, + "step": 28498 + }, + { + "epoch": 1.4, + "grad_norm": 0.5794448256492615, + "learning_rate": 0.0003323995516095027, + "loss": 3.1646, + "step": 28499 + }, + { + "epoch": 1.4, + "grad_norm": 0.6294394135475159, + "learning_rate": 0.0003323842454278804, + "loss": 3.0625, + "step": 28500 + }, + { + "epoch": 1.4, + "grad_norm": 0.5836816430091858, + "learning_rate": 0.0003323689391609642, + "loss": 2.9569, + "step": 28501 + }, + { + "epoch": 1.4, + "grad_norm": 0.5736550688743591, + "learning_rate": 0.0003323536328087945, + "loss": 3.0808, + "step": 28502 + }, + { + "epoch": 1.4, + "grad_norm": 0.6020394563674927, + "learning_rate": 0.0003323383263714115, + "loss": 3.1046, + "step": 28503 + }, + { + "epoch": 1.4, + "grad_norm": 0.566116213798523, + "learning_rate": 0.0003323230198488555, + "loss": 2.8113, + "step": 28504 + }, + { + "epoch": 1.4, + "grad_norm": 0.5596758723258972, + "learning_rate": 0.000332307713241167, + "loss": 3.1021, + "step": 28505 + }, + { + "epoch": 1.4, + "grad_norm": 0.5862348675727844, + "learning_rate": 0.0003322924065483862, + "loss": 2.9094, + "step": 28506 + }, + { + "epoch": 1.4, + "grad_norm": 0.5914575457572937, + "learning_rate": 0.00033227709977055344, + "loss": 3.1915, + "step": 28507 + }, + { + "epoch": 1.4, + "grad_norm": 0.6266211867332458, + "learning_rate": 0.000332261792907709, + "loss": 3.0326, + "step": 28508 + }, + { + "epoch": 1.4, + "grad_norm": 0.6111003756523132, + "learning_rate": 0.00033224648595989306, + "loss": 3.3007, + "step": 28509 + }, + { + "epoch": 1.4, + "grad_norm": 0.611010730266571, + "learning_rate": 0.00033223117892714626, + "loss": 2.8517, + "step": 28510 + }, + { + "epoch": 1.4, + "grad_norm": 0.6048029065132141, + "learning_rate": 0.0003322158718095087, + "loss": 3.065, + "step": 28511 + }, + { + "epoch": 1.4, + "grad_norm": 0.6111939549446106, + "learning_rate": 0.0003322005646070207, + "loss": 2.9375, + "step": 28512 + }, + { + "epoch": 1.4, + "grad_norm": 0.5618122816085815, + "learning_rate": 0.0003321852573197226, + "loss": 3.0391, + "step": 28513 + }, + { + "epoch": 1.4, + "grad_norm": 0.6132274866104126, + "learning_rate": 0.00033216994994765477, + "loss": 3.0492, + "step": 28514 + }, + { + "epoch": 1.4, + "grad_norm": 0.6032343506813049, + "learning_rate": 0.0003321546424908574, + "loss": 2.9005, + "step": 28515 + }, + { + "epoch": 1.4, + "grad_norm": 0.6772813200950623, + "learning_rate": 0.00033213933494937093, + "loss": 3.089, + "step": 28516 + }, + { + "epoch": 1.4, + "grad_norm": 0.544093668460846, + "learning_rate": 0.0003321240273232357, + "loss": 3.0564, + "step": 28517 + }, + { + "epoch": 1.4, + "grad_norm": 0.6309990286827087, + "learning_rate": 0.00033210871961249195, + "loss": 2.9296, + "step": 28518 + }, + { + "epoch": 1.4, + "grad_norm": 0.5908814668655396, + "learning_rate": 0.00033209341181717996, + "loss": 3.1371, + "step": 28519 + }, + { + "epoch": 1.4, + "grad_norm": 0.55782151222229, + "learning_rate": 0.0003320781039373401, + "loss": 2.9583, + "step": 28520 + }, + { + "epoch": 1.4, + "grad_norm": 0.5843961834907532, + "learning_rate": 0.00033206279597301276, + "loss": 3.2184, + "step": 28521 + }, + { + "epoch": 1.4, + "grad_norm": 0.5808646082878113, + "learning_rate": 0.0003320474879242381, + "loss": 2.8148, + "step": 28522 + }, + { + "epoch": 1.4, + "grad_norm": 0.6347132325172424, + "learning_rate": 0.00033203217979105656, + "loss": 3.2595, + "step": 28523 + }, + { + "epoch": 1.4, + "grad_norm": 0.5245764851570129, + "learning_rate": 0.00033201687157350843, + "loss": 3.0635, + "step": 28524 + }, + { + "epoch": 1.4, + "grad_norm": 0.6027230024337769, + "learning_rate": 0.000332001563271634, + "loss": 3.0783, + "step": 28525 + }, + { + "epoch": 1.4, + "grad_norm": 0.5538972616195679, + "learning_rate": 0.0003319862548854736, + "loss": 2.8988, + "step": 28526 + }, + { + "epoch": 1.4, + "grad_norm": 0.6136893630027771, + "learning_rate": 0.00033197094641506763, + "loss": 2.9789, + "step": 28527 + }, + { + "epoch": 1.4, + "grad_norm": 0.5786117911338806, + "learning_rate": 0.0003319556378604563, + "loss": 2.8044, + "step": 28528 + }, + { + "epoch": 1.4, + "grad_norm": 0.6115828156471252, + "learning_rate": 0.00033194032922167994, + "loss": 3.114, + "step": 28529 + }, + { + "epoch": 1.4, + "grad_norm": 0.5768436193466187, + "learning_rate": 0.0003319250204987789, + "loss": 2.6763, + "step": 28530 + }, + { + "epoch": 1.4, + "grad_norm": 0.5673786401748657, + "learning_rate": 0.0003319097116917935, + "loss": 2.8227, + "step": 28531 + }, + { + "epoch": 1.4, + "grad_norm": 0.6050379276275635, + "learning_rate": 0.00033189440280076416, + "loss": 2.9611, + "step": 28532 + }, + { + "epoch": 1.4, + "grad_norm": 0.5654118061065674, + "learning_rate": 0.000331879093825731, + "loss": 3.0181, + "step": 28533 + }, + { + "epoch": 1.4, + "grad_norm": 0.5509759783744812, + "learning_rate": 0.00033186378476673446, + "loss": 3.2072, + "step": 28534 + }, + { + "epoch": 1.4, + "grad_norm": 0.5696454644203186, + "learning_rate": 0.0003318484756238149, + "loss": 3.1229, + "step": 28535 + }, + { + "epoch": 1.4, + "grad_norm": 0.641002357006073, + "learning_rate": 0.0003318331663970125, + "loss": 3.2484, + "step": 28536 + }, + { + "epoch": 1.4, + "grad_norm": 0.6124339699745178, + "learning_rate": 0.00033181785708636774, + "loss": 3.158, + "step": 28537 + }, + { + "epoch": 1.4, + "grad_norm": 0.5635184049606323, + "learning_rate": 0.0003318025476919208, + "loss": 2.9605, + "step": 28538 + }, + { + "epoch": 1.4, + "grad_norm": 0.6408677697181702, + "learning_rate": 0.00033178723821371216, + "loss": 2.8396, + "step": 28539 + }, + { + "epoch": 1.4, + "grad_norm": 0.5962166786193848, + "learning_rate": 0.00033177192865178196, + "loss": 3.1288, + "step": 28540 + }, + { + "epoch": 1.4, + "grad_norm": 0.5760708451271057, + "learning_rate": 0.00033175661900617065, + "loss": 3.0704, + "step": 28541 + }, + { + "epoch": 1.4, + "grad_norm": 0.667230486869812, + "learning_rate": 0.0003317413092769185, + "loss": 2.9527, + "step": 28542 + }, + { + "epoch": 1.4, + "grad_norm": 0.5681466460227966, + "learning_rate": 0.00033172599946406587, + "loss": 3.1464, + "step": 28543 + }, + { + "epoch": 1.4, + "grad_norm": 0.6880142092704773, + "learning_rate": 0.0003317106895676531, + "loss": 3.2, + "step": 28544 + }, + { + "epoch": 1.4, + "grad_norm": 0.6255538463592529, + "learning_rate": 0.0003316953795877204, + "loss": 3.1399, + "step": 28545 + }, + { + "epoch": 1.4, + "grad_norm": 0.5599697232246399, + "learning_rate": 0.0003316800695243082, + "loss": 3.1353, + "step": 28546 + }, + { + "epoch": 1.4, + "grad_norm": 0.5526019930839539, + "learning_rate": 0.0003316647593774568, + "loss": 2.8375, + "step": 28547 + }, + { + "epoch": 1.4, + "grad_norm": 0.6557570695877075, + "learning_rate": 0.0003316494491472065, + "loss": 2.9793, + "step": 28548 + }, + { + "epoch": 1.4, + "grad_norm": 0.5730993747711182, + "learning_rate": 0.00033163413883359764, + "loss": 3.0206, + "step": 28549 + }, + { + "epoch": 1.4, + "grad_norm": 1.1626458168029785, + "learning_rate": 0.0003316188284366706, + "loss": 3.0879, + "step": 28550 + }, + { + "epoch": 1.4, + "grad_norm": 0.5795100927352905, + "learning_rate": 0.0003316035179564656, + "loss": 2.9078, + "step": 28551 + }, + { + "epoch": 1.4, + "grad_norm": 0.5477898120880127, + "learning_rate": 0.000331588207393023, + "loss": 2.9622, + "step": 28552 + }, + { + "epoch": 1.4, + "grad_norm": 0.5724733471870422, + "learning_rate": 0.0003315728967463832, + "loss": 3.2156, + "step": 28553 + }, + { + "epoch": 1.4, + "grad_norm": 0.5982617139816284, + "learning_rate": 0.0003315575860165865, + "loss": 2.8647, + "step": 28554 + }, + { + "epoch": 1.4, + "grad_norm": 0.6391007900238037, + "learning_rate": 0.0003315422752036731, + "loss": 3.0067, + "step": 28555 + }, + { + "epoch": 1.4, + "grad_norm": 0.6046311855316162, + "learning_rate": 0.00033152696430768347, + "loss": 2.9357, + "step": 28556 + }, + { + "epoch": 1.4, + "grad_norm": 0.5996942520141602, + "learning_rate": 0.0003315116533286578, + "loss": 3.0517, + "step": 28557 + }, + { + "epoch": 1.4, + "grad_norm": 0.5585609078407288, + "learning_rate": 0.00033149634226663666, + "loss": 3.0389, + "step": 28558 + }, + { + "epoch": 1.4, + "grad_norm": 0.6330249309539795, + "learning_rate": 0.0003314810311216601, + "loss": 3.0515, + "step": 28559 + }, + { + "epoch": 1.4, + "grad_norm": 0.574908971786499, + "learning_rate": 0.0003314657198937685, + "loss": 3.255, + "step": 28560 + }, + { + "epoch": 1.4, + "grad_norm": 0.597728431224823, + "learning_rate": 0.0003314504085830024, + "loss": 3.0646, + "step": 28561 + }, + { + "epoch": 1.4, + "grad_norm": 0.6056450009346008, + "learning_rate": 0.00033143509718940186, + "loss": 3.1887, + "step": 28562 + }, + { + "epoch": 1.4, + "grad_norm": 0.5824889540672302, + "learning_rate": 0.0003314197857130074, + "loss": 3.1014, + "step": 28563 + }, + { + "epoch": 1.4, + "grad_norm": 0.5638570189476013, + "learning_rate": 0.00033140447415385923, + "loss": 3.0617, + "step": 28564 + }, + { + "epoch": 1.4, + "grad_norm": 0.5699633359909058, + "learning_rate": 0.00033138916251199775, + "loss": 3.1922, + "step": 28565 + }, + { + "epoch": 1.4, + "grad_norm": 0.5394684672355652, + "learning_rate": 0.0003313738507874632, + "loss": 3.1004, + "step": 28566 + }, + { + "epoch": 1.4, + "grad_norm": 0.5617507100105286, + "learning_rate": 0.000331358538980296, + "loss": 3.1166, + "step": 28567 + }, + { + "epoch": 1.4, + "grad_norm": 0.5685690641403198, + "learning_rate": 0.00033134322709053643, + "loss": 3.0646, + "step": 28568 + }, + { + "epoch": 1.4, + "grad_norm": 0.5826153755187988, + "learning_rate": 0.000331327915118225, + "loss": 3.073, + "step": 28569 + }, + { + "epoch": 1.4, + "grad_norm": 0.5799400210380554, + "learning_rate": 0.0003313126030634016, + "loss": 3.1073, + "step": 28570 + }, + { + "epoch": 1.4, + "grad_norm": 0.6147586703300476, + "learning_rate": 0.0003312972909261069, + "loss": 3.0393, + "step": 28571 + }, + { + "epoch": 1.4, + "grad_norm": 0.5650718212127686, + "learning_rate": 0.0003312819787063813, + "loss": 2.536, + "step": 28572 + }, + { + "epoch": 1.4, + "grad_norm": 0.6030925512313843, + "learning_rate": 0.00033126666640426487, + "loss": 2.9619, + "step": 28573 + }, + { + "epoch": 1.4, + "grad_norm": 0.573800802230835, + "learning_rate": 0.0003312513540197981, + "loss": 3.1022, + "step": 28574 + }, + { + "epoch": 1.4, + "grad_norm": 0.5658773183822632, + "learning_rate": 0.0003312360415530213, + "loss": 3.0579, + "step": 28575 + }, + { + "epoch": 1.4, + "grad_norm": 0.5464009642601013, + "learning_rate": 0.00033122072900397466, + "loss": 3.1756, + "step": 28576 + }, + { + "epoch": 1.4, + "grad_norm": 0.5734522342681885, + "learning_rate": 0.00033120541637269874, + "loss": 3.1855, + "step": 28577 + }, + { + "epoch": 1.4, + "grad_norm": 0.6184408068656921, + "learning_rate": 0.00033119010365923374, + "loss": 2.8647, + "step": 28578 + }, + { + "epoch": 1.4, + "grad_norm": 0.6169735789299011, + "learning_rate": 0.00033117479086362, + "loss": 3.0088, + "step": 28579 + }, + { + "epoch": 1.4, + "grad_norm": 0.5645252466201782, + "learning_rate": 0.00033115947798589786, + "loss": 3.0772, + "step": 28580 + }, + { + "epoch": 1.4, + "grad_norm": 0.587335467338562, + "learning_rate": 0.0003311441650261076, + "loss": 3.1952, + "step": 28581 + }, + { + "epoch": 1.4, + "grad_norm": 0.6208239197731018, + "learning_rate": 0.00033112885198428963, + "loss": 3.0525, + "step": 28582 + }, + { + "epoch": 1.4, + "grad_norm": 0.5548790097236633, + "learning_rate": 0.00033111353886048435, + "loss": 3.0921, + "step": 28583 + }, + { + "epoch": 1.4, + "grad_norm": 0.5708760619163513, + "learning_rate": 0.00033109822565473187, + "loss": 3.0512, + "step": 28584 + }, + { + "epoch": 1.4, + "grad_norm": 0.5541423559188843, + "learning_rate": 0.0003310829123670728, + "loss": 3.001, + "step": 28585 + }, + { + "epoch": 1.4, + "grad_norm": 0.6086994409561157, + "learning_rate": 0.00033106759899754714, + "loss": 2.9911, + "step": 28586 + }, + { + "epoch": 1.4, + "grad_norm": 0.5725777745246887, + "learning_rate": 0.0003310522855461955, + "loss": 3.1316, + "step": 28587 + }, + { + "epoch": 1.4, + "grad_norm": 0.5796346664428711, + "learning_rate": 0.0003310369720130581, + "loss": 3.2368, + "step": 28588 + }, + { + "epoch": 1.4, + "grad_norm": 0.5890423655509949, + "learning_rate": 0.00033102165839817525, + "loss": 3.0038, + "step": 28589 + }, + { + "epoch": 1.4, + "grad_norm": 0.5724145770072937, + "learning_rate": 0.0003310063447015874, + "loss": 3.2116, + "step": 28590 + }, + { + "epoch": 1.4, + "grad_norm": 0.560804009437561, + "learning_rate": 0.0003309910309233348, + "loss": 2.8441, + "step": 28591 + }, + { + "epoch": 1.4, + "grad_norm": 0.5579610466957092, + "learning_rate": 0.0003309757170634577, + "loss": 3.0832, + "step": 28592 + }, + { + "epoch": 1.4, + "grad_norm": 0.5677865147590637, + "learning_rate": 0.0003309604031219966, + "loss": 3.1131, + "step": 28593 + }, + { + "epoch": 1.4, + "grad_norm": 0.5703995227813721, + "learning_rate": 0.00033094508909899177, + "loss": 3.0792, + "step": 28594 + }, + { + "epoch": 1.4, + "grad_norm": 0.5485500693321228, + "learning_rate": 0.00033092977499448356, + "loss": 3.1622, + "step": 28595 + }, + { + "epoch": 1.4, + "grad_norm": 0.5834635496139526, + "learning_rate": 0.0003309144608085121, + "loss": 3.1295, + "step": 28596 + }, + { + "epoch": 1.4, + "grad_norm": 0.5779139995574951, + "learning_rate": 0.000330899146541118, + "loss": 3.1709, + "step": 28597 + }, + { + "epoch": 1.4, + "grad_norm": 0.6009939908981323, + "learning_rate": 0.00033088383219234154, + "loss": 2.9788, + "step": 28598 + }, + { + "epoch": 1.4, + "grad_norm": 0.6160380840301514, + "learning_rate": 0.00033086851776222297, + "loss": 2.8536, + "step": 28599 + }, + { + "epoch": 1.4, + "grad_norm": 0.5951359868049622, + "learning_rate": 0.00033085320325080273, + "loss": 3.1106, + "step": 28600 + }, + { + "epoch": 1.4, + "grad_norm": 0.6209423542022705, + "learning_rate": 0.000330837888658121, + "loss": 3.0604, + "step": 28601 + }, + { + "epoch": 1.4, + "grad_norm": 0.5782553553581238, + "learning_rate": 0.0003308225739842182, + "loss": 3.2548, + "step": 28602 + }, + { + "epoch": 1.4, + "grad_norm": 0.5685989260673523, + "learning_rate": 0.00033080725922913476, + "loss": 3.0094, + "step": 28603 + }, + { + "epoch": 1.4, + "grad_norm": 0.5853109955787659, + "learning_rate": 0.00033079194439291086, + "loss": 3.1369, + "step": 28604 + }, + { + "epoch": 1.4, + "grad_norm": 0.5716216564178467, + "learning_rate": 0.000330776629475587, + "loss": 3.156, + "step": 28605 + }, + { + "epoch": 1.4, + "grad_norm": 0.6147785186767578, + "learning_rate": 0.00033076131447720334, + "loss": 2.942, + "step": 28606 + }, + { + "epoch": 1.4, + "grad_norm": 0.5691528916358948, + "learning_rate": 0.0003307459993978002, + "loss": 3.1663, + "step": 28607 + }, + { + "epoch": 1.4, + "grad_norm": 0.6025173664093018, + "learning_rate": 0.00033073068423741814, + "loss": 3.1461, + "step": 28608 + }, + { + "epoch": 1.4, + "grad_norm": 0.59565669298172, + "learning_rate": 0.00033071536899609744, + "loss": 2.7866, + "step": 28609 + }, + { + "epoch": 1.4, + "grad_norm": 0.543670654296875, + "learning_rate": 0.00033070005367387824, + "loss": 3.0595, + "step": 28610 + }, + { + "epoch": 1.4, + "grad_norm": 0.6227362751960754, + "learning_rate": 0.000330684738270801, + "loss": 3.0906, + "step": 28611 + }, + { + "epoch": 1.4, + "grad_norm": 0.5895864367485046, + "learning_rate": 0.00033066942278690614, + "loss": 3.08, + "step": 28612 + }, + { + "epoch": 1.4, + "grad_norm": 0.5541119575500488, + "learning_rate": 0.0003306541072222339, + "loss": 3.106, + "step": 28613 + }, + { + "epoch": 1.4, + "grad_norm": 0.5648583769798279, + "learning_rate": 0.0003306387915768246, + "loss": 3.2744, + "step": 28614 + }, + { + "epoch": 1.4, + "grad_norm": 0.6211113929748535, + "learning_rate": 0.00033062347585071863, + "loss": 2.9499, + "step": 28615 + }, + { + "epoch": 1.4, + "grad_norm": 0.6163092255592346, + "learning_rate": 0.00033060816004395647, + "loss": 3.1507, + "step": 28616 + }, + { + "epoch": 1.4, + "grad_norm": 0.6097185015678406, + "learning_rate": 0.0003305928441565781, + "loss": 3.0956, + "step": 28617 + }, + { + "epoch": 1.4, + "grad_norm": 0.6374078989028931, + "learning_rate": 0.0003305775281886241, + "loss": 3.0742, + "step": 28618 + }, + { + "epoch": 1.4, + "grad_norm": 0.6147667169570923, + "learning_rate": 0.0003305622121401348, + "loss": 3.029, + "step": 28619 + }, + { + "epoch": 1.4, + "grad_norm": 0.5697017312049866, + "learning_rate": 0.00033054689601115067, + "loss": 3.0943, + "step": 28620 + }, + { + "epoch": 1.4, + "grad_norm": 0.563630998134613, + "learning_rate": 0.0003305315798017117, + "loss": 2.928, + "step": 28621 + }, + { + "epoch": 1.4, + "grad_norm": 0.5571799278259277, + "learning_rate": 0.00033051626351185843, + "loss": 3.0234, + "step": 28622 + }, + { + "epoch": 1.4, + "grad_norm": 0.5704235434532166, + "learning_rate": 0.00033050094714163136, + "loss": 3.1792, + "step": 28623 + }, + { + "epoch": 1.4, + "grad_norm": 0.5932474732398987, + "learning_rate": 0.0003304856306910705, + "loss": 3.0762, + "step": 28624 + }, + { + "epoch": 1.4, + "grad_norm": 0.6049175262451172, + "learning_rate": 0.00033047031416021637, + "loss": 3.0379, + "step": 28625 + }, + { + "epoch": 1.4, + "grad_norm": 0.5922101736068726, + "learning_rate": 0.00033045499754910933, + "loss": 3.1122, + "step": 28626 + }, + { + "epoch": 1.4, + "grad_norm": 0.5743208527565002, + "learning_rate": 0.00033043968085778977, + "loss": 3.0287, + "step": 28627 + }, + { + "epoch": 1.4, + "grad_norm": 0.6035799384117126, + "learning_rate": 0.00033042436408629785, + "loss": 2.9865, + "step": 28628 + }, + { + "epoch": 1.4, + "grad_norm": 0.5719898343086243, + "learning_rate": 0.000330409047234674, + "loss": 3.0642, + "step": 28629 + }, + { + "epoch": 1.4, + "grad_norm": 0.6188897490501404, + "learning_rate": 0.00033039373030295863, + "loss": 3.123, + "step": 28630 + }, + { + "epoch": 1.4, + "grad_norm": 0.5590984225273132, + "learning_rate": 0.00033037841329119205, + "loss": 3.3236, + "step": 28631 + }, + { + "epoch": 1.4, + "grad_norm": 0.5841843485832214, + "learning_rate": 0.0003303630961994145, + "loss": 3.2134, + "step": 28632 + }, + { + "epoch": 1.4, + "grad_norm": 0.5968827605247498, + "learning_rate": 0.00033034777902766636, + "loss": 2.9945, + "step": 28633 + }, + { + "epoch": 1.4, + "grad_norm": 0.5639210939407349, + "learning_rate": 0.00033033246177598813, + "loss": 3.0465, + "step": 28634 + }, + { + "epoch": 1.4, + "grad_norm": 0.5694261193275452, + "learning_rate": 0.00033031714444441997, + "loss": 3.1198, + "step": 28635 + }, + { + "epoch": 1.4, + "grad_norm": 0.5945818424224854, + "learning_rate": 0.0003303018270330023, + "loss": 3.1175, + "step": 28636 + }, + { + "epoch": 1.4, + "grad_norm": 0.6064791679382324, + "learning_rate": 0.0003302865095417754, + "loss": 3.0226, + "step": 28637 + }, + { + "epoch": 1.4, + "grad_norm": 0.5732207298278809, + "learning_rate": 0.0003302711919707798, + "loss": 3.1993, + "step": 28638 + }, + { + "epoch": 1.4, + "grad_norm": 0.604217529296875, + "learning_rate": 0.00033025587432005554, + "loss": 2.9412, + "step": 28639 + }, + { + "epoch": 1.4, + "grad_norm": 0.589924156665802, + "learning_rate": 0.0003302405565896432, + "loss": 3.0485, + "step": 28640 + }, + { + "epoch": 1.4, + "grad_norm": 0.5883309841156006, + "learning_rate": 0.00033022523877958304, + "loss": 2.848, + "step": 28641 + }, + { + "epoch": 1.4, + "grad_norm": 0.641121506690979, + "learning_rate": 0.0003302099208899156, + "loss": 3.0409, + "step": 28642 + }, + { + "epoch": 1.4, + "grad_norm": 0.5524410605430603, + "learning_rate": 0.00033019460292068074, + "loss": 3.0698, + "step": 28643 + }, + { + "epoch": 1.4, + "grad_norm": 0.5567286610603333, + "learning_rate": 0.00033017928487191924, + "loss": 3.1788, + "step": 28644 + }, + { + "epoch": 1.4, + "grad_norm": 0.5724384784698486, + "learning_rate": 0.0003301639667436714, + "loss": 3.3344, + "step": 28645 + }, + { + "epoch": 1.4, + "grad_norm": 0.6331698298454285, + "learning_rate": 0.00033014864853597736, + "loss": 3.0563, + "step": 28646 + }, + { + "epoch": 1.4, + "grad_norm": 0.6174417734146118, + "learning_rate": 0.0003301333302488776, + "loss": 3.0488, + "step": 28647 + }, + { + "epoch": 1.4, + "grad_norm": 0.5746307373046875, + "learning_rate": 0.00033011801188241244, + "loss": 3.0419, + "step": 28648 + }, + { + "epoch": 1.4, + "grad_norm": 0.6337699294090271, + "learning_rate": 0.0003301026934366223, + "loss": 3.0054, + "step": 28649 + }, + { + "epoch": 1.4, + "grad_norm": 0.5989909768104553, + "learning_rate": 0.00033008737491154735, + "loss": 3.1279, + "step": 28650 + }, + { + "epoch": 1.4, + "grad_norm": 0.6146669387817383, + "learning_rate": 0.00033007205630722813, + "loss": 2.9037, + "step": 28651 + }, + { + "epoch": 1.4, + "grad_norm": 0.5555586218833923, + "learning_rate": 0.00033005673762370486, + "loss": 2.9902, + "step": 28652 + }, + { + "epoch": 1.4, + "grad_norm": 0.5725548267364502, + "learning_rate": 0.0003300414188610179, + "loss": 3.2456, + "step": 28653 + }, + { + "epoch": 1.4, + "grad_norm": 0.5860716700553894, + "learning_rate": 0.00033002610001920773, + "loss": 3.106, + "step": 28654 + }, + { + "epoch": 1.4, + "grad_norm": 0.5727003812789917, + "learning_rate": 0.0003300107810983145, + "loss": 3.0971, + "step": 28655 + }, + { + "epoch": 1.4, + "grad_norm": 0.5849184989929199, + "learning_rate": 0.0003299954620983788, + "loss": 2.8865, + "step": 28656 + }, + { + "epoch": 1.4, + "grad_norm": 0.5811796188354492, + "learning_rate": 0.0003299801430194407, + "loss": 2.9816, + "step": 28657 + }, + { + "epoch": 1.4, + "grad_norm": 0.5917805433273315, + "learning_rate": 0.00032996482386154054, + "loss": 3.1353, + "step": 28658 + }, + { + "epoch": 1.4, + "grad_norm": 0.5500307083129883, + "learning_rate": 0.0003299495046247189, + "loss": 3.0833, + "step": 28659 + }, + { + "epoch": 1.4, + "grad_norm": 0.5789609551429749, + "learning_rate": 0.0003299341853090162, + "loss": 2.9942, + "step": 28660 + }, + { + "epoch": 1.4, + "grad_norm": 0.539592981338501, + "learning_rate": 0.0003299188659144725, + "loss": 3.0309, + "step": 28661 + }, + { + "epoch": 1.4, + "grad_norm": 0.601234495639801, + "learning_rate": 0.00032990354644112823, + "loss": 2.9512, + "step": 28662 + }, + { + "epoch": 1.4, + "grad_norm": 0.5849595665931702, + "learning_rate": 0.00032988822688902376, + "loss": 3.0492, + "step": 28663 + }, + { + "epoch": 1.4, + "grad_norm": 0.609544038772583, + "learning_rate": 0.00032987290725819955, + "loss": 3.0828, + "step": 28664 + }, + { + "epoch": 1.4, + "grad_norm": 0.5832148194313049, + "learning_rate": 0.00032985758754869575, + "loss": 2.9258, + "step": 28665 + }, + { + "epoch": 1.4, + "grad_norm": 0.5789518356323242, + "learning_rate": 0.00032984226776055283, + "loss": 3.1485, + "step": 28666 + }, + { + "epoch": 1.4, + "grad_norm": 0.5975507497787476, + "learning_rate": 0.0003298269478938112, + "loss": 3.1122, + "step": 28667 + }, + { + "epoch": 1.4, + "grad_norm": 0.5649939179420471, + "learning_rate": 0.000329811627948511, + "loss": 3.0364, + "step": 28668 + }, + { + "epoch": 1.4, + "grad_norm": 0.6013187170028687, + "learning_rate": 0.0003297963079246928, + "loss": 3.1851, + "step": 28669 + }, + { + "epoch": 1.41, + "grad_norm": 0.6145163774490356, + "learning_rate": 0.00032978098782239686, + "loss": 3.0284, + "step": 28670 + }, + { + "epoch": 1.41, + "grad_norm": 0.5722653865814209, + "learning_rate": 0.00032976566764166356, + "loss": 3.088, + "step": 28671 + }, + { + "epoch": 1.41, + "grad_norm": 0.597767174243927, + "learning_rate": 0.0003297503473825331, + "loss": 3.2594, + "step": 28672 + }, + { + "epoch": 1.41, + "grad_norm": 0.5960759520530701, + "learning_rate": 0.000329735027045046, + "loss": 3.149, + "step": 28673 + }, + { + "epoch": 1.41, + "grad_norm": 0.629086434841156, + "learning_rate": 0.0003297197066292426, + "loss": 3.1178, + "step": 28674 + }, + { + "epoch": 1.41, + "grad_norm": 0.5683742165565491, + "learning_rate": 0.00032970438613516323, + "loss": 3.1392, + "step": 28675 + }, + { + "epoch": 1.41, + "grad_norm": 0.5865838527679443, + "learning_rate": 0.00032968906556284815, + "loss": 3.1683, + "step": 28676 + }, + { + "epoch": 1.41, + "grad_norm": 0.5845195651054382, + "learning_rate": 0.0003296737449123378, + "loss": 2.9377, + "step": 28677 + }, + { + "epoch": 1.41, + "grad_norm": 0.5908820033073425, + "learning_rate": 0.0003296584241836726, + "loss": 3.0637, + "step": 28678 + }, + { + "epoch": 1.41, + "grad_norm": 0.5620695352554321, + "learning_rate": 0.00032964310337689273, + "loss": 3.1156, + "step": 28679 + }, + { + "epoch": 1.41, + "grad_norm": 0.5610857009887695, + "learning_rate": 0.00032962778249203865, + "loss": 3.1601, + "step": 28680 + }, + { + "epoch": 1.41, + "grad_norm": 0.5518953800201416, + "learning_rate": 0.0003296124615291507, + "loss": 3.0757, + "step": 28681 + }, + { + "epoch": 1.41, + "grad_norm": 0.5811406373977661, + "learning_rate": 0.00032959714048826923, + "loss": 3.0635, + "step": 28682 + }, + { + "epoch": 1.41, + "grad_norm": 0.5600650310516357, + "learning_rate": 0.0003295818193694346, + "loss": 2.9543, + "step": 28683 + }, + { + "epoch": 1.41, + "grad_norm": 0.6001834869384766, + "learning_rate": 0.00032956649817268707, + "loss": 3.0855, + "step": 28684 + }, + { + "epoch": 1.41, + "grad_norm": 0.5461838841438293, + "learning_rate": 0.0003295511768980672, + "loss": 3.0432, + "step": 28685 + }, + { + "epoch": 1.41, + "grad_norm": 0.5799773335456848, + "learning_rate": 0.00032953585554561514, + "loss": 2.9155, + "step": 28686 + }, + { + "epoch": 1.41, + "grad_norm": 0.589601993560791, + "learning_rate": 0.0003295205341153713, + "loss": 3.1836, + "step": 28687 + }, + { + "epoch": 1.41, + "grad_norm": 0.5646551251411438, + "learning_rate": 0.0003295052126073761, + "loss": 3.2984, + "step": 28688 + }, + { + "epoch": 1.41, + "grad_norm": 0.5855870246887207, + "learning_rate": 0.00032948989102166983, + "loss": 3.1553, + "step": 28689 + }, + { + "epoch": 1.41, + "grad_norm": 0.5871852040290833, + "learning_rate": 0.00032947456935829284, + "loss": 3.1598, + "step": 28690 + }, + { + "epoch": 1.41, + "grad_norm": 0.6054069995880127, + "learning_rate": 0.0003294592476172856, + "loss": 2.9861, + "step": 28691 + }, + { + "epoch": 1.41, + "grad_norm": 0.600292980670929, + "learning_rate": 0.00032944392579868826, + "loss": 3.0694, + "step": 28692 + }, + { + "epoch": 1.41, + "grad_norm": 0.570586621761322, + "learning_rate": 0.0003294286039025414, + "loss": 2.9997, + "step": 28693 + }, + { + "epoch": 1.41, + "grad_norm": 0.5904337167739868, + "learning_rate": 0.0003294132819288851, + "loss": 2.9074, + "step": 28694 + }, + { + "epoch": 1.41, + "grad_norm": 0.5638096332550049, + "learning_rate": 0.00032939795987776, + "loss": 2.9564, + "step": 28695 + }, + { + "epoch": 1.41, + "grad_norm": 0.5465283393859863, + "learning_rate": 0.0003293826377492064, + "loss": 3.1155, + "step": 28696 + }, + { + "epoch": 1.41, + "grad_norm": 0.5867735147476196, + "learning_rate": 0.00032936731554326443, + "loss": 3.1527, + "step": 28697 + }, + { + "epoch": 1.41, + "grad_norm": 0.5899142026901245, + "learning_rate": 0.0003293519932599747, + "loss": 3.0795, + "step": 28698 + }, + { + "epoch": 1.41, + "grad_norm": 0.5677284002304077, + "learning_rate": 0.00032933667089937745, + "loss": 2.8966, + "step": 28699 + }, + { + "epoch": 1.41, + "grad_norm": 0.5423473715782166, + "learning_rate": 0.000329321348461513, + "loss": 3.2039, + "step": 28700 + }, + { + "epoch": 1.41, + "grad_norm": 0.6081904768943787, + "learning_rate": 0.0003293060259464218, + "loss": 3.1396, + "step": 28701 + }, + { + "epoch": 1.41, + "grad_norm": 0.5386385321617126, + "learning_rate": 0.0003292907033541441, + "loss": 3.0957, + "step": 28702 + }, + { + "epoch": 1.41, + "grad_norm": 0.5593324303627014, + "learning_rate": 0.0003292753806847205, + "loss": 3.0017, + "step": 28703 + }, + { + "epoch": 1.41, + "grad_norm": 0.5604330897331238, + "learning_rate": 0.000329260057938191, + "loss": 2.9814, + "step": 28704 + }, + { + "epoch": 1.41, + "grad_norm": 0.6002910733222961, + "learning_rate": 0.0003292447351145962, + "loss": 2.9692, + "step": 28705 + }, + { + "epoch": 1.41, + "grad_norm": 0.5873032808303833, + "learning_rate": 0.00032922941221397644, + "loss": 3.2579, + "step": 28706 + }, + { + "epoch": 1.41, + "grad_norm": 0.5412537455558777, + "learning_rate": 0.000329214089236372, + "loss": 2.7752, + "step": 28707 + }, + { + "epoch": 1.41, + "grad_norm": 0.58359295129776, + "learning_rate": 0.0003291987661818233, + "loss": 3.2543, + "step": 28708 + }, + { + "epoch": 1.41, + "grad_norm": 0.5666883587837219, + "learning_rate": 0.00032918344305037057, + "loss": 3.2266, + "step": 28709 + }, + { + "epoch": 1.41, + "grad_norm": 0.5886451005935669, + "learning_rate": 0.00032916811984205427, + "loss": 3.0358, + "step": 28710 + }, + { + "epoch": 1.41, + "grad_norm": 0.5730712413787842, + "learning_rate": 0.0003291527965569149, + "loss": 3.3266, + "step": 28711 + }, + { + "epoch": 1.41, + "grad_norm": 0.6448611617088318, + "learning_rate": 0.00032913747319499265, + "loss": 2.9833, + "step": 28712 + }, + { + "epoch": 1.41, + "grad_norm": 0.5740808844566345, + "learning_rate": 0.00032912214975632783, + "loss": 3.1631, + "step": 28713 + }, + { + "epoch": 1.41, + "grad_norm": 0.5787137746810913, + "learning_rate": 0.00032910682624096087, + "loss": 3.0614, + "step": 28714 + }, + { + "epoch": 1.41, + "grad_norm": 0.6143020391464233, + "learning_rate": 0.0003290915026489322, + "loss": 2.9691, + "step": 28715 + }, + { + "epoch": 1.41, + "grad_norm": 0.5791074633598328, + "learning_rate": 0.000329076178980282, + "loss": 2.8866, + "step": 28716 + }, + { + "epoch": 1.41, + "grad_norm": 0.5702418088912964, + "learning_rate": 0.0003290608552350508, + "loss": 2.997, + "step": 28717 + }, + { + "epoch": 1.41, + "grad_norm": 0.5896664261817932, + "learning_rate": 0.000329045531413279, + "loss": 2.9138, + "step": 28718 + }, + { + "epoch": 1.41, + "grad_norm": 0.6688372492790222, + "learning_rate": 0.00032903020751500676, + "loss": 3.1648, + "step": 28719 + }, + { + "epoch": 1.41, + "grad_norm": 0.5435354113578796, + "learning_rate": 0.00032901488354027445, + "loss": 3.0354, + "step": 28720 + }, + { + "epoch": 1.41, + "grad_norm": 0.5935372710227966, + "learning_rate": 0.0003289995594891227, + "loss": 3.0646, + "step": 28721 + }, + { + "epoch": 1.41, + "grad_norm": 0.5761470198631287, + "learning_rate": 0.00032898423536159165, + "loss": 3.2373, + "step": 28722 + }, + { + "epoch": 1.41, + "grad_norm": 0.6226154565811157, + "learning_rate": 0.00032896891115772166, + "loss": 2.8922, + "step": 28723 + }, + { + "epoch": 1.41, + "grad_norm": 0.5801342129707336, + "learning_rate": 0.0003289535868775531, + "loss": 3.0508, + "step": 28724 + }, + { + "epoch": 1.41, + "grad_norm": 0.5750948190689087, + "learning_rate": 0.0003289382625211264, + "loss": 3.152, + "step": 28725 + }, + { + "epoch": 1.41, + "grad_norm": 0.612395167350769, + "learning_rate": 0.00032892293808848183, + "loss": 2.9434, + "step": 28726 + }, + { + "epoch": 1.41, + "grad_norm": 0.607111930847168, + "learning_rate": 0.0003289076135796599, + "loss": 3.1274, + "step": 28727 + }, + { + "epoch": 1.41, + "grad_norm": 0.5647713541984558, + "learning_rate": 0.00032889228899470085, + "loss": 3.1467, + "step": 28728 + }, + { + "epoch": 1.41, + "grad_norm": 0.5546459555625916, + "learning_rate": 0.0003288769643336451, + "loss": 3.2392, + "step": 28729 + }, + { + "epoch": 1.41, + "grad_norm": 0.6006819605827332, + "learning_rate": 0.00032886163959653286, + "loss": 3.1325, + "step": 28730 + }, + { + "epoch": 1.41, + "grad_norm": 0.593948245048523, + "learning_rate": 0.0003288463147834047, + "loss": 3.1686, + "step": 28731 + }, + { + "epoch": 1.41, + "grad_norm": 0.6291419267654419, + "learning_rate": 0.0003288309898943009, + "loss": 3.0538, + "step": 28732 + }, + { + "epoch": 1.41, + "grad_norm": 0.5162724256515503, + "learning_rate": 0.00032881566492926187, + "loss": 3.0556, + "step": 28733 + }, + { + "epoch": 1.41, + "grad_norm": 0.6125902533531189, + "learning_rate": 0.0003288003398883279, + "loss": 3.1117, + "step": 28734 + }, + { + "epoch": 1.41, + "grad_norm": 0.5817069411277771, + "learning_rate": 0.0003287850147715393, + "loss": 3.0981, + "step": 28735 + }, + { + "epoch": 1.41, + "grad_norm": 0.5820333361625671, + "learning_rate": 0.0003287696895789366, + "loss": 3.18, + "step": 28736 + }, + { + "epoch": 1.41, + "grad_norm": 0.6199052929878235, + "learning_rate": 0.00032875436431056, + "loss": 2.9355, + "step": 28737 + }, + { + "epoch": 1.41, + "grad_norm": 0.6050741672515869, + "learning_rate": 0.00032873903896645, + "loss": 3.0017, + "step": 28738 + }, + { + "epoch": 1.41, + "grad_norm": 0.5911166667938232, + "learning_rate": 0.00032872371354664685, + "loss": 2.894, + "step": 28739 + }, + { + "epoch": 1.41, + "grad_norm": 0.5618557929992676, + "learning_rate": 0.000328708388051191, + "loss": 2.8401, + "step": 28740 + }, + { + "epoch": 1.41, + "grad_norm": 0.5915781259536743, + "learning_rate": 0.00032869306248012277, + "loss": 3.02, + "step": 28741 + }, + { + "epoch": 1.41, + "grad_norm": 0.5654686689376831, + "learning_rate": 0.00032867773683348256, + "loss": 2.9968, + "step": 28742 + }, + { + "epoch": 1.41, + "grad_norm": 0.584214448928833, + "learning_rate": 0.0003286624111113107, + "loss": 3.1967, + "step": 28743 + }, + { + "epoch": 1.41, + "grad_norm": 0.5594285726547241, + "learning_rate": 0.0003286470853136476, + "loss": 3.0001, + "step": 28744 + }, + { + "epoch": 1.41, + "grad_norm": 0.5778558254241943, + "learning_rate": 0.0003286317594405335, + "loss": 3.0812, + "step": 28745 + }, + { + "epoch": 1.41, + "grad_norm": 0.6112017631530762, + "learning_rate": 0.0003286164334920089, + "loss": 2.981, + "step": 28746 + }, + { + "epoch": 1.41, + "grad_norm": 0.5561625361442566, + "learning_rate": 0.00032860110746811414, + "loss": 2.9896, + "step": 28747 + }, + { + "epoch": 1.41, + "grad_norm": 0.57763671875, + "learning_rate": 0.00032858578136888957, + "loss": 2.7896, + "step": 28748 + }, + { + "epoch": 1.41, + "grad_norm": 0.5785109996795654, + "learning_rate": 0.00032857045519437555, + "loss": 2.9054, + "step": 28749 + }, + { + "epoch": 1.41, + "grad_norm": 0.5656970143318176, + "learning_rate": 0.0003285551289446124, + "loss": 3.0454, + "step": 28750 + }, + { + "epoch": 1.41, + "grad_norm": 0.5791580080986023, + "learning_rate": 0.00032853980261964054, + "loss": 2.8888, + "step": 28751 + }, + { + "epoch": 1.41, + "grad_norm": 0.586801290512085, + "learning_rate": 0.0003285244762195004, + "loss": 2.8627, + "step": 28752 + }, + { + "epoch": 1.41, + "grad_norm": 0.5646578073501587, + "learning_rate": 0.00032850914974423225, + "loss": 3.1138, + "step": 28753 + }, + { + "epoch": 1.41, + "grad_norm": 0.5531555414199829, + "learning_rate": 0.00032849382319387647, + "loss": 3.193, + "step": 28754 + }, + { + "epoch": 1.41, + "grad_norm": 0.5732769966125488, + "learning_rate": 0.0003284784965684734, + "loss": 3.1193, + "step": 28755 + }, + { + "epoch": 1.41, + "grad_norm": 0.5827401876449585, + "learning_rate": 0.00032846316986806355, + "loss": 3.1374, + "step": 28756 + }, + { + "epoch": 1.41, + "grad_norm": 0.5628218650817871, + "learning_rate": 0.00032844784309268713, + "loss": 3.2275, + "step": 28757 + }, + { + "epoch": 1.41, + "grad_norm": 0.5630583167076111, + "learning_rate": 0.00032843251624238464, + "loss": 3.212, + "step": 28758 + }, + { + "epoch": 1.41, + "grad_norm": 0.5518205165863037, + "learning_rate": 0.00032841718931719626, + "loss": 2.8222, + "step": 28759 + }, + { + "epoch": 1.41, + "grad_norm": 0.576407253742218, + "learning_rate": 0.00032840186231716253, + "loss": 3.1868, + "step": 28760 + }, + { + "epoch": 1.41, + "grad_norm": 0.5976382493972778, + "learning_rate": 0.00032838653524232367, + "loss": 3.1501, + "step": 28761 + }, + { + "epoch": 1.41, + "grad_norm": 0.5755053758621216, + "learning_rate": 0.0003283712080927203, + "loss": 2.9968, + "step": 28762 + }, + { + "epoch": 1.41, + "grad_norm": 0.5521357655525208, + "learning_rate": 0.00032835588086839256, + "loss": 3.211, + "step": 28763 + }, + { + "epoch": 1.41, + "grad_norm": 0.5395427346229553, + "learning_rate": 0.0003283405535693809, + "loss": 2.959, + "step": 28764 + }, + { + "epoch": 1.41, + "grad_norm": 0.5700335502624512, + "learning_rate": 0.00032832522619572564, + "loss": 3.0619, + "step": 28765 + }, + { + "epoch": 1.41, + "grad_norm": 0.5535926222801208, + "learning_rate": 0.0003283098987474672, + "loss": 3.1751, + "step": 28766 + }, + { + "epoch": 1.41, + "grad_norm": 0.6124323606491089, + "learning_rate": 0.00032829457122464595, + "loss": 3.1244, + "step": 28767 + }, + { + "epoch": 1.41, + "grad_norm": 0.6224178671836853, + "learning_rate": 0.0003282792436273022, + "loss": 2.879, + "step": 28768 + }, + { + "epoch": 1.41, + "grad_norm": 0.5655115842819214, + "learning_rate": 0.0003282639159554765, + "loss": 3.1099, + "step": 28769 + }, + { + "epoch": 1.41, + "grad_norm": 0.6045935153961182, + "learning_rate": 0.00032824858820920894, + "loss": 2.9087, + "step": 28770 + }, + { + "epoch": 1.41, + "grad_norm": 0.5513472557067871, + "learning_rate": 0.00032823326038854003, + "loss": 2.8912, + "step": 28771 + }, + { + "epoch": 1.41, + "grad_norm": 0.6038599610328674, + "learning_rate": 0.0003282179324935102, + "loss": 2.9113, + "step": 28772 + }, + { + "epoch": 1.41, + "grad_norm": 0.5618096590042114, + "learning_rate": 0.00032820260452415983, + "loss": 3.2247, + "step": 28773 + }, + { + "epoch": 1.41, + "grad_norm": 0.6248725652694702, + "learning_rate": 0.00032818727648052914, + "loss": 3.141, + "step": 28774 + }, + { + "epoch": 1.41, + "grad_norm": 0.5756538510322571, + "learning_rate": 0.0003281719483626585, + "loss": 3.1785, + "step": 28775 + }, + { + "epoch": 1.41, + "grad_norm": 0.5904281735420227, + "learning_rate": 0.0003281566201705885, + "loss": 2.9667, + "step": 28776 + }, + { + "epoch": 1.41, + "grad_norm": 0.5620636940002441, + "learning_rate": 0.00032814129190435936, + "loss": 3.1892, + "step": 28777 + }, + { + "epoch": 1.41, + "grad_norm": 0.5535551905632019, + "learning_rate": 0.0003281259635640115, + "loss": 3.2308, + "step": 28778 + }, + { + "epoch": 1.41, + "grad_norm": 0.602520227432251, + "learning_rate": 0.0003281106351495852, + "loss": 2.8856, + "step": 28779 + }, + { + "epoch": 1.41, + "grad_norm": 0.5640451908111572, + "learning_rate": 0.000328095306661121, + "loss": 3.0938, + "step": 28780 + }, + { + "epoch": 1.41, + "grad_norm": 0.6214268207550049, + "learning_rate": 0.000328079978098659, + "loss": 2.7807, + "step": 28781 + }, + { + "epoch": 1.41, + "grad_norm": 0.5931678414344788, + "learning_rate": 0.0003280646494622399, + "loss": 3.0696, + "step": 28782 + }, + { + "epoch": 1.41, + "grad_norm": 0.5738914012908936, + "learning_rate": 0.00032804932075190384, + "loss": 3.1592, + "step": 28783 + }, + { + "epoch": 1.41, + "grad_norm": 0.5861164331436157, + "learning_rate": 0.00032803399196769135, + "loss": 3.0072, + "step": 28784 + }, + { + "epoch": 1.41, + "grad_norm": 0.5898017883300781, + "learning_rate": 0.0003280186631096426, + "loss": 3.1016, + "step": 28785 + }, + { + "epoch": 1.41, + "grad_norm": 0.5697299242019653, + "learning_rate": 0.00032800333417779806, + "loss": 3.2083, + "step": 28786 + }, + { + "epoch": 1.41, + "grad_norm": 0.5507596135139465, + "learning_rate": 0.00032798800517219826, + "loss": 2.8548, + "step": 28787 + }, + { + "epoch": 1.41, + "grad_norm": 0.5449158549308777, + "learning_rate": 0.00032797267609288344, + "loss": 3.2197, + "step": 28788 + }, + { + "epoch": 1.41, + "grad_norm": 0.5956053733825684, + "learning_rate": 0.0003279573469398939, + "loss": 3.011, + "step": 28789 + }, + { + "epoch": 1.41, + "grad_norm": 0.6707125902175903, + "learning_rate": 0.0003279420177132701, + "loss": 3.1171, + "step": 28790 + }, + { + "epoch": 1.41, + "grad_norm": 0.5548729300498962, + "learning_rate": 0.00032792668841305245, + "loss": 2.96, + "step": 28791 + }, + { + "epoch": 1.41, + "grad_norm": 0.5525285601615906, + "learning_rate": 0.00032791135903928123, + "loss": 3.0747, + "step": 28792 + }, + { + "epoch": 1.41, + "grad_norm": 0.5493025183677673, + "learning_rate": 0.0003278960295919968, + "loss": 3.1805, + "step": 28793 + }, + { + "epoch": 1.41, + "grad_norm": 0.5797526836395264, + "learning_rate": 0.00032788070007123965, + "loss": 2.8174, + "step": 28794 + }, + { + "epoch": 1.41, + "grad_norm": 0.5741456151008606, + "learning_rate": 0.00032786537047705023, + "loss": 3.1977, + "step": 28795 + }, + { + "epoch": 1.41, + "grad_norm": 0.5887008905410767, + "learning_rate": 0.0003278500408094686, + "loss": 2.9331, + "step": 28796 + }, + { + "epoch": 1.41, + "grad_norm": 0.6311918497085571, + "learning_rate": 0.00032783471106853537, + "loss": 2.8529, + "step": 28797 + }, + { + "epoch": 1.41, + "grad_norm": 0.5639904141426086, + "learning_rate": 0.00032781938125429097, + "loss": 2.9302, + "step": 28798 + }, + { + "epoch": 1.41, + "grad_norm": 0.5747509002685547, + "learning_rate": 0.0003278040513667756, + "loss": 3.0155, + "step": 28799 + }, + { + "epoch": 1.41, + "grad_norm": 0.5683767795562744, + "learning_rate": 0.00032778872140602967, + "loss": 3.1515, + "step": 28800 + }, + { + "epoch": 1.41, + "grad_norm": 0.5533831119537354, + "learning_rate": 0.00032777339137209365, + "loss": 2.9977, + "step": 28801 + }, + { + "epoch": 1.41, + "grad_norm": 0.5720163583755493, + "learning_rate": 0.0003277580612650078, + "loss": 3.1106, + "step": 28802 + }, + { + "epoch": 1.41, + "grad_norm": 0.605758011341095, + "learning_rate": 0.0003277427310848127, + "loss": 3.0415, + "step": 28803 + }, + { + "epoch": 1.41, + "grad_norm": 0.5644210577011108, + "learning_rate": 0.00032772740083154844, + "loss": 2.9584, + "step": 28804 + }, + { + "epoch": 1.41, + "grad_norm": 0.603422999382019, + "learning_rate": 0.0003277120705052556, + "loss": 2.9925, + "step": 28805 + }, + { + "epoch": 1.41, + "grad_norm": 0.5727534890174866, + "learning_rate": 0.00032769674010597454, + "loss": 3.1375, + "step": 28806 + }, + { + "epoch": 1.41, + "grad_norm": 0.5674635171890259, + "learning_rate": 0.0003276814096337455, + "loss": 3.0833, + "step": 28807 + }, + { + "epoch": 1.41, + "grad_norm": 0.5521325469017029, + "learning_rate": 0.00032766607908860904, + "loss": 2.9992, + "step": 28808 + }, + { + "epoch": 1.41, + "grad_norm": 0.5518462657928467, + "learning_rate": 0.0003276507484706054, + "loss": 3.1441, + "step": 28809 + }, + { + "epoch": 1.41, + "grad_norm": 0.5592528581619263, + "learning_rate": 0.0003276354177797751, + "loss": 3.0381, + "step": 28810 + }, + { + "epoch": 1.41, + "grad_norm": 0.5564046502113342, + "learning_rate": 0.00032762008701615826, + "loss": 3.1174, + "step": 28811 + }, + { + "epoch": 1.41, + "grad_norm": 0.5931382775306702, + "learning_rate": 0.0003276047561797955, + "loss": 3.051, + "step": 28812 + }, + { + "epoch": 1.41, + "grad_norm": 0.5513250231742859, + "learning_rate": 0.0003275894252707272, + "loss": 2.845, + "step": 28813 + }, + { + "epoch": 1.41, + "grad_norm": 0.5776057839393616, + "learning_rate": 0.0003275740942889936, + "loss": 3.111, + "step": 28814 + }, + { + "epoch": 1.41, + "grad_norm": 0.5935764908790588, + "learning_rate": 0.00032755876323463514, + "loss": 3.1766, + "step": 28815 + }, + { + "epoch": 1.41, + "grad_norm": 0.6411653757095337, + "learning_rate": 0.00032754343210769226, + "loss": 3.037, + "step": 28816 + }, + { + "epoch": 1.41, + "grad_norm": 0.5873118042945862, + "learning_rate": 0.0003275281009082052, + "loss": 3.1031, + "step": 28817 + }, + { + "epoch": 1.41, + "grad_norm": 0.57716965675354, + "learning_rate": 0.0003275127696362144, + "loss": 3.1625, + "step": 28818 + }, + { + "epoch": 1.41, + "grad_norm": 0.5541166067123413, + "learning_rate": 0.00032749743829176036, + "loss": 2.9424, + "step": 28819 + }, + { + "epoch": 1.41, + "grad_norm": 0.5837197303771973, + "learning_rate": 0.0003274821068748834, + "loss": 3.1427, + "step": 28820 + }, + { + "epoch": 1.41, + "grad_norm": 0.6180403828620911, + "learning_rate": 0.00032746677538562376, + "loss": 3.0543, + "step": 28821 + }, + { + "epoch": 1.41, + "grad_norm": 0.5609276294708252, + "learning_rate": 0.00032745144382402184, + "loss": 3.2852, + "step": 28822 + }, + { + "epoch": 1.41, + "grad_norm": 0.5260721445083618, + "learning_rate": 0.00032743611219011816, + "loss": 3.1377, + "step": 28823 + }, + { + "epoch": 1.41, + "grad_norm": 0.568089485168457, + "learning_rate": 0.00032742078048395316, + "loss": 3.2754, + "step": 28824 + }, + { + "epoch": 1.41, + "grad_norm": 0.5611562728881836, + "learning_rate": 0.000327405448705567, + "loss": 2.9644, + "step": 28825 + }, + { + "epoch": 1.41, + "grad_norm": 0.5755119919776917, + "learning_rate": 0.00032739011685500017, + "loss": 3.0372, + "step": 28826 + }, + { + "epoch": 1.41, + "grad_norm": 0.579472541809082, + "learning_rate": 0.00032737478493229306, + "loss": 3.2508, + "step": 28827 + }, + { + "epoch": 1.41, + "grad_norm": 0.5705549716949463, + "learning_rate": 0.00032735945293748607, + "loss": 3.0324, + "step": 28828 + }, + { + "epoch": 1.41, + "grad_norm": 0.582402229309082, + "learning_rate": 0.0003273441208706194, + "loss": 2.9384, + "step": 28829 + }, + { + "epoch": 1.41, + "grad_norm": 0.5975726842880249, + "learning_rate": 0.00032732878873173373, + "loss": 2.7791, + "step": 28830 + }, + { + "epoch": 1.41, + "grad_norm": 0.546567440032959, + "learning_rate": 0.0003273134565208693, + "loss": 3.1615, + "step": 28831 + }, + { + "epoch": 1.41, + "grad_norm": 0.5662059783935547, + "learning_rate": 0.0003272981242380664, + "loss": 3.037, + "step": 28832 + }, + { + "epoch": 1.41, + "grad_norm": 0.567099392414093, + "learning_rate": 0.00032728279188336544, + "loss": 3.0003, + "step": 28833 + }, + { + "epoch": 1.41, + "grad_norm": 0.584934651851654, + "learning_rate": 0.0003272674594568069, + "loss": 2.908, + "step": 28834 + }, + { + "epoch": 1.41, + "grad_norm": 0.5936591029167175, + "learning_rate": 0.0003272521269584313, + "loss": 3.0655, + "step": 28835 + }, + { + "epoch": 1.41, + "grad_norm": 0.5734385251998901, + "learning_rate": 0.0003272367943882787, + "loss": 2.9705, + "step": 28836 + }, + { + "epoch": 1.41, + "grad_norm": 0.5487362742424011, + "learning_rate": 0.0003272214617463896, + "loss": 3.0344, + "step": 28837 + }, + { + "epoch": 1.41, + "grad_norm": 0.5989526510238647, + "learning_rate": 0.0003272061290328044, + "loss": 3.11, + "step": 28838 + }, + { + "epoch": 1.41, + "grad_norm": 0.5572174191474915, + "learning_rate": 0.00032719079624756353, + "loss": 2.9015, + "step": 28839 + }, + { + "epoch": 1.41, + "grad_norm": 0.5623894929885864, + "learning_rate": 0.00032717546339070734, + "loss": 2.982, + "step": 28840 + }, + { + "epoch": 1.41, + "grad_norm": 0.6148263216018677, + "learning_rate": 0.0003271601304622762, + "loss": 2.9311, + "step": 28841 + }, + { + "epoch": 1.41, + "grad_norm": 0.5677447319030762, + "learning_rate": 0.0003271447974623105, + "loss": 3.0023, + "step": 28842 + }, + { + "epoch": 1.41, + "grad_norm": 0.5840166807174683, + "learning_rate": 0.0003271294643908506, + "loss": 3.0108, + "step": 28843 + }, + { + "epoch": 1.41, + "grad_norm": 0.5517377853393555, + "learning_rate": 0.00032711413124793693, + "loss": 3.1285, + "step": 28844 + }, + { + "epoch": 1.41, + "grad_norm": 0.5746965408325195, + "learning_rate": 0.0003270987980336099, + "loss": 3.0579, + "step": 28845 + }, + { + "epoch": 1.41, + "grad_norm": 0.5635104775428772, + "learning_rate": 0.00032708346474790985, + "loss": 2.8986, + "step": 28846 + }, + { + "epoch": 1.41, + "grad_norm": 0.5739995241165161, + "learning_rate": 0.00032706813139087715, + "loss": 3.0193, + "step": 28847 + }, + { + "epoch": 1.41, + "grad_norm": 0.5986478328704834, + "learning_rate": 0.0003270527979625521, + "loss": 3.2475, + "step": 28848 + }, + { + "epoch": 1.41, + "grad_norm": 0.5273615121841431, + "learning_rate": 0.0003270374644629753, + "loss": 3.2175, + "step": 28849 + }, + { + "epoch": 1.41, + "grad_norm": 0.5814329981803894, + "learning_rate": 0.000327022130892187, + "loss": 2.9465, + "step": 28850 + }, + { + "epoch": 1.41, + "grad_norm": 0.5327739715576172, + "learning_rate": 0.0003270067972502276, + "loss": 3.0869, + "step": 28851 + }, + { + "epoch": 1.41, + "grad_norm": 0.5583226680755615, + "learning_rate": 0.0003269914635371375, + "loss": 3.0564, + "step": 28852 + }, + { + "epoch": 1.41, + "grad_norm": 0.5671470761299133, + "learning_rate": 0.00032697612975295706, + "loss": 3.3398, + "step": 28853 + }, + { + "epoch": 1.41, + "grad_norm": 0.6003071665763855, + "learning_rate": 0.00032696079589772674, + "loss": 2.9829, + "step": 28854 + }, + { + "epoch": 1.41, + "grad_norm": 0.5671707391738892, + "learning_rate": 0.00032694546197148676, + "loss": 3.1985, + "step": 28855 + }, + { + "epoch": 1.41, + "grad_norm": 0.5648956298828125, + "learning_rate": 0.00032693012797427773, + "loss": 3.1373, + "step": 28856 + }, + { + "epoch": 1.41, + "grad_norm": 0.5903165936470032, + "learning_rate": 0.0003269147939061399, + "loss": 2.9278, + "step": 28857 + }, + { + "epoch": 1.41, + "grad_norm": 0.5882886648178101, + "learning_rate": 0.00032689945976711366, + "loss": 3.102, + "step": 28858 + }, + { + "epoch": 1.41, + "grad_norm": 0.6090853810310364, + "learning_rate": 0.00032688412555723946, + "loss": 3.0155, + "step": 28859 + }, + { + "epoch": 1.41, + "grad_norm": 0.5981293320655823, + "learning_rate": 0.0003268687912765576, + "loss": 2.9891, + "step": 28860 + }, + { + "epoch": 1.41, + "grad_norm": 0.6331175565719604, + "learning_rate": 0.0003268534569251086, + "loss": 2.9693, + "step": 28861 + }, + { + "epoch": 1.41, + "grad_norm": 0.5615060925483704, + "learning_rate": 0.0003268381225029327, + "loss": 3.0143, + "step": 28862 + }, + { + "epoch": 1.41, + "grad_norm": 0.5723454356193542, + "learning_rate": 0.0003268227880100703, + "loss": 3.0968, + "step": 28863 + }, + { + "epoch": 1.41, + "grad_norm": 0.5850404500961304, + "learning_rate": 0.0003268074534465619, + "loss": 2.9922, + "step": 28864 + }, + { + "epoch": 1.41, + "grad_norm": 0.5858272910118103, + "learning_rate": 0.0003267921188124478, + "loss": 3.0211, + "step": 28865 + }, + { + "epoch": 1.41, + "grad_norm": 0.5558182597160339, + "learning_rate": 0.0003267767841077684, + "loss": 2.8762, + "step": 28866 + }, + { + "epoch": 1.41, + "grad_norm": 0.570468544960022, + "learning_rate": 0.0003267614493325641, + "loss": 3.2, + "step": 28867 + }, + { + "epoch": 1.41, + "grad_norm": 0.5738118886947632, + "learning_rate": 0.0003267461144868753, + "loss": 2.9635, + "step": 28868 + }, + { + "epoch": 1.41, + "grad_norm": 0.8501331806182861, + "learning_rate": 0.00032673077957074244, + "loss": 3.1468, + "step": 28869 + }, + { + "epoch": 1.41, + "grad_norm": 0.582685112953186, + "learning_rate": 0.00032671544458420587, + "loss": 2.9663, + "step": 28870 + }, + { + "epoch": 1.41, + "grad_norm": 0.5798792839050293, + "learning_rate": 0.0003267001095273059, + "loss": 3.1031, + "step": 28871 + }, + { + "epoch": 1.41, + "grad_norm": 0.9409517645835876, + "learning_rate": 0.000326684774400083, + "loss": 3.17, + "step": 28872 + }, + { + "epoch": 1.41, + "grad_norm": 0.5941800475120544, + "learning_rate": 0.00032666943920257754, + "loss": 3.1101, + "step": 28873 + }, + { + "epoch": 1.42, + "grad_norm": 0.5913896560668945, + "learning_rate": 0.0003266541039348298, + "loss": 3.1029, + "step": 28874 + }, + { + "epoch": 1.42, + "grad_norm": 0.5694225430488586, + "learning_rate": 0.00032663876859688045, + "loss": 3.1596, + "step": 28875 + }, + { + "epoch": 1.42, + "grad_norm": 0.600014865398407, + "learning_rate": 0.00032662343318876964, + "loss": 3.0626, + "step": 28876 + }, + { + "epoch": 1.42, + "grad_norm": 0.5729812383651733, + "learning_rate": 0.00032660809771053784, + "loss": 2.9556, + "step": 28877 + }, + { + "epoch": 1.42, + "grad_norm": 0.5758359432220459, + "learning_rate": 0.0003265927621622254, + "loss": 3.1092, + "step": 28878 + }, + { + "epoch": 1.42, + "grad_norm": 0.5663775205612183, + "learning_rate": 0.0003265774265438727, + "loss": 3.0048, + "step": 28879 + }, + { + "epoch": 1.42, + "grad_norm": 0.5517311692237854, + "learning_rate": 0.0003265620908555203, + "loss": 2.9829, + "step": 28880 + }, + { + "epoch": 1.42, + "grad_norm": 0.550212025642395, + "learning_rate": 0.0003265467550972084, + "loss": 3.1944, + "step": 28881 + }, + { + "epoch": 1.42, + "grad_norm": 0.5680566430091858, + "learning_rate": 0.00032653141926897747, + "loss": 3.0897, + "step": 28882 + }, + { + "epoch": 1.42, + "grad_norm": 0.5950158834457397, + "learning_rate": 0.0003265160833708679, + "loss": 3.0992, + "step": 28883 + }, + { + "epoch": 1.42, + "grad_norm": 1.2692245244979858, + "learning_rate": 0.00032650074740291996, + "loss": 3.2001, + "step": 28884 + }, + { + "epoch": 1.42, + "grad_norm": 0.560718297958374, + "learning_rate": 0.0003264854113651742, + "loss": 3.074, + "step": 28885 + }, + { + "epoch": 1.42, + "grad_norm": 0.6030946373939514, + "learning_rate": 0.00032647007525767106, + "loss": 3.1256, + "step": 28886 + }, + { + "epoch": 1.42, + "grad_norm": 0.580146312713623, + "learning_rate": 0.0003264547390804508, + "loss": 3.1419, + "step": 28887 + }, + { + "epoch": 1.42, + "grad_norm": 0.6022080779075623, + "learning_rate": 0.0003264394028335538, + "loss": 3.111, + "step": 28888 + }, + { + "epoch": 1.42, + "grad_norm": 0.6084839105606079, + "learning_rate": 0.00032642406651702053, + "loss": 3.088, + "step": 28889 + }, + { + "epoch": 1.42, + "grad_norm": 0.767863392829895, + "learning_rate": 0.00032640873013089133, + "loss": 3.033, + "step": 28890 + }, + { + "epoch": 1.42, + "grad_norm": 0.5766077041625977, + "learning_rate": 0.00032639339367520665, + "loss": 2.9984, + "step": 28891 + }, + { + "epoch": 1.42, + "grad_norm": 0.5633019804954529, + "learning_rate": 0.00032637805715000684, + "loss": 3.1288, + "step": 28892 + }, + { + "epoch": 1.42, + "grad_norm": 0.5685615539550781, + "learning_rate": 0.0003263627205553323, + "loss": 3.0734, + "step": 28893 + }, + { + "epoch": 1.42, + "grad_norm": 0.6013651490211487, + "learning_rate": 0.0003263473838912234, + "loss": 2.937, + "step": 28894 + }, + { + "epoch": 1.42, + "grad_norm": 0.6018322706222534, + "learning_rate": 0.0003263320471577205, + "loss": 3.0378, + "step": 28895 + }, + { + "epoch": 1.42, + "grad_norm": 0.6196945905685425, + "learning_rate": 0.00032631671035486423, + "loss": 3.0361, + "step": 28896 + }, + { + "epoch": 1.42, + "grad_norm": 0.5838954448699951, + "learning_rate": 0.0003263013734826948, + "loss": 3.1028, + "step": 28897 + }, + { + "epoch": 1.42, + "grad_norm": 0.5709912776947021, + "learning_rate": 0.00032628603654125246, + "loss": 3.1711, + "step": 28898 + }, + { + "epoch": 1.42, + "grad_norm": 0.5789138674736023, + "learning_rate": 0.0003262706995305778, + "loss": 3.2224, + "step": 28899 + }, + { + "epoch": 1.42, + "grad_norm": 0.6325254440307617, + "learning_rate": 0.0003262553624507113, + "loss": 3.0815, + "step": 28900 + }, + { + "epoch": 1.42, + "grad_norm": 0.5830736756324768, + "learning_rate": 0.0003262400253016931, + "loss": 3.0138, + "step": 28901 + }, + { + "epoch": 1.42, + "grad_norm": 0.6032917499542236, + "learning_rate": 0.00032622468808356377, + "loss": 3.0851, + "step": 28902 + }, + { + "epoch": 1.42, + "grad_norm": 0.6263208985328674, + "learning_rate": 0.00032620935079636363, + "loss": 3.0902, + "step": 28903 + }, + { + "epoch": 1.42, + "grad_norm": 0.6103036403656006, + "learning_rate": 0.0003261940134401331, + "loss": 2.7757, + "step": 28904 + }, + { + "epoch": 1.42, + "grad_norm": 0.5601415038108826, + "learning_rate": 0.00032617867601491266, + "loss": 3.0745, + "step": 28905 + }, + { + "epoch": 1.42, + "grad_norm": 0.5642362833023071, + "learning_rate": 0.00032616333852074255, + "loss": 3.1547, + "step": 28906 + }, + { + "epoch": 1.42, + "grad_norm": 0.6236977577209473, + "learning_rate": 0.0003261480009576632, + "loss": 2.9319, + "step": 28907 + }, + { + "epoch": 1.42, + "grad_norm": 0.6301894187927246, + "learning_rate": 0.0003261326633257152, + "loss": 3.04, + "step": 28908 + }, + { + "epoch": 1.42, + "grad_norm": 0.5724765658378601, + "learning_rate": 0.00032611732562493864, + "loss": 3.1385, + "step": 28909 + }, + { + "epoch": 1.42, + "grad_norm": 0.5792955756187439, + "learning_rate": 0.00032610198785537414, + "loss": 3.0763, + "step": 28910 + }, + { + "epoch": 1.42, + "grad_norm": 0.5876222848892212, + "learning_rate": 0.0003260866500170621, + "loss": 2.9057, + "step": 28911 + }, + { + "epoch": 1.42, + "grad_norm": 0.5676042437553406, + "learning_rate": 0.0003260713121100427, + "loss": 3.0592, + "step": 28912 + }, + { + "epoch": 1.42, + "grad_norm": 0.6063807010650635, + "learning_rate": 0.0003260559741343566, + "loss": 3.0202, + "step": 28913 + }, + { + "epoch": 1.42, + "grad_norm": 0.5811011791229248, + "learning_rate": 0.000326040636090044, + "loss": 3.1869, + "step": 28914 + }, + { + "epoch": 1.42, + "grad_norm": 0.5812034606933594, + "learning_rate": 0.0003260252979771454, + "loss": 2.9437, + "step": 28915 + }, + { + "epoch": 1.42, + "grad_norm": 0.57767254114151, + "learning_rate": 0.00032600995979570113, + "loss": 3.1279, + "step": 28916 + }, + { + "epoch": 1.42, + "grad_norm": 0.5982522368431091, + "learning_rate": 0.00032599462154575174, + "loss": 3.0572, + "step": 28917 + }, + { + "epoch": 1.42, + "grad_norm": 0.6943778395652771, + "learning_rate": 0.0003259792832273374, + "loss": 3.1348, + "step": 28918 + }, + { + "epoch": 1.42, + "grad_norm": 0.5726862549781799, + "learning_rate": 0.0003259639448404988, + "loss": 3.3887, + "step": 28919 + }, + { + "epoch": 1.42, + "grad_norm": 0.5609963536262512, + "learning_rate": 0.000325948606385276, + "loss": 3.0386, + "step": 28920 + }, + { + "epoch": 1.42, + "grad_norm": 0.573071300983429, + "learning_rate": 0.0003259332678617095, + "loss": 3.2499, + "step": 28921 + }, + { + "epoch": 1.42, + "grad_norm": 0.5977573990821838, + "learning_rate": 0.00032591792926983995, + "loss": 3.0938, + "step": 28922 + }, + { + "epoch": 1.42, + "grad_norm": 0.5285585522651672, + "learning_rate": 0.00032590259060970755, + "loss": 3.3141, + "step": 28923 + }, + { + "epoch": 1.42, + "grad_norm": 0.6297756433486938, + "learning_rate": 0.0003258872518813526, + "loss": 3.0633, + "step": 28924 + }, + { + "epoch": 1.42, + "grad_norm": 0.5830036401748657, + "learning_rate": 0.00032587191308481564, + "loss": 3.0276, + "step": 28925 + }, + { + "epoch": 1.42, + "grad_norm": 0.6116616725921631, + "learning_rate": 0.0003258565742201371, + "loss": 2.7825, + "step": 28926 + }, + { + "epoch": 1.42, + "grad_norm": 0.5653083920478821, + "learning_rate": 0.00032584123528735725, + "loss": 3.0379, + "step": 28927 + }, + { + "epoch": 1.42, + "grad_norm": 0.6250095963478088, + "learning_rate": 0.00032582589628651665, + "loss": 3.0878, + "step": 28928 + }, + { + "epoch": 1.42, + "grad_norm": 0.5555509924888611, + "learning_rate": 0.00032581055721765553, + "loss": 2.9807, + "step": 28929 + }, + { + "epoch": 1.42, + "grad_norm": 0.5599590539932251, + "learning_rate": 0.00032579521808081435, + "loss": 2.8723, + "step": 28930 + }, + { + "epoch": 1.42, + "grad_norm": 0.589675784111023, + "learning_rate": 0.0003257798788760336, + "loss": 3.0035, + "step": 28931 + }, + { + "epoch": 1.42, + "grad_norm": 0.5757635831832886, + "learning_rate": 0.00032576453960335357, + "loss": 3.2037, + "step": 28932 + }, + { + "epoch": 1.42, + "grad_norm": 0.5922831892967224, + "learning_rate": 0.0003257492002628148, + "loss": 3.1709, + "step": 28933 + }, + { + "epoch": 1.42, + "grad_norm": 0.5696724653244019, + "learning_rate": 0.00032573386085445747, + "loss": 3.0498, + "step": 28934 + }, + { + "epoch": 1.42, + "grad_norm": 0.6184026002883911, + "learning_rate": 0.0003257185213783221, + "loss": 2.9888, + "step": 28935 + }, + { + "epoch": 1.42, + "grad_norm": 0.552787184715271, + "learning_rate": 0.00032570318183444913, + "loss": 3.1177, + "step": 28936 + }, + { + "epoch": 1.42, + "grad_norm": 0.6209615468978882, + "learning_rate": 0.00032568784222287896, + "loss": 3.0414, + "step": 28937 + }, + { + "epoch": 1.42, + "grad_norm": 0.5979040265083313, + "learning_rate": 0.0003256725025436519, + "loss": 2.9614, + "step": 28938 + }, + { + "epoch": 1.42, + "grad_norm": 0.6105141043663025, + "learning_rate": 0.0003256571627968084, + "loss": 2.9355, + "step": 28939 + }, + { + "epoch": 1.42, + "grad_norm": 0.564461350440979, + "learning_rate": 0.000325641822982389, + "loss": 2.9066, + "step": 28940 + }, + { + "epoch": 1.42, + "grad_norm": 0.5573233962059021, + "learning_rate": 0.0003256264831004338, + "loss": 2.9489, + "step": 28941 + }, + { + "epoch": 1.42, + "grad_norm": 0.5542727112770081, + "learning_rate": 0.0003256111431509835, + "loss": 2.9582, + "step": 28942 + }, + { + "epoch": 1.42, + "grad_norm": 0.5618546605110168, + "learning_rate": 0.0003255958031340783, + "loss": 3.084, + "step": 28943 + }, + { + "epoch": 1.42, + "grad_norm": 0.5816484093666077, + "learning_rate": 0.00032558046304975876, + "loss": 2.9806, + "step": 28944 + }, + { + "epoch": 1.42, + "grad_norm": 0.6100296974182129, + "learning_rate": 0.0003255651228980651, + "loss": 3.202, + "step": 28945 + }, + { + "epoch": 1.42, + "grad_norm": 0.5640974640846252, + "learning_rate": 0.00032554978267903786, + "loss": 3.1282, + "step": 28946 + }, + { + "epoch": 1.42, + "grad_norm": 0.5738323330879211, + "learning_rate": 0.00032553444239271745, + "loss": 3.0857, + "step": 28947 + }, + { + "epoch": 1.42, + "grad_norm": 0.6203998923301697, + "learning_rate": 0.0003255191020391443, + "loss": 3.0114, + "step": 28948 + }, + { + "epoch": 1.42, + "grad_norm": 0.570510983467102, + "learning_rate": 0.00032550376161835865, + "loss": 3.0182, + "step": 28949 + }, + { + "epoch": 1.42, + "grad_norm": 0.5366271138191223, + "learning_rate": 0.0003254884211304009, + "loss": 2.9167, + "step": 28950 + }, + { + "epoch": 1.42, + "grad_norm": 0.5703023672103882, + "learning_rate": 0.00032547308057531173, + "loss": 3.073, + "step": 28951 + }, + { + "epoch": 1.42, + "grad_norm": 0.57989501953125, + "learning_rate": 0.0003254577399531313, + "loss": 2.995, + "step": 28952 + }, + { + "epoch": 1.42, + "grad_norm": 0.5939070582389832, + "learning_rate": 0.0003254423992639001, + "loss": 3.1934, + "step": 28953 + }, + { + "epoch": 1.42, + "grad_norm": 0.5694959759712219, + "learning_rate": 0.00032542705850765845, + "loss": 3.1665, + "step": 28954 + }, + { + "epoch": 1.42, + "grad_norm": 0.5695184469223022, + "learning_rate": 0.000325411717684447, + "loss": 2.9085, + "step": 28955 + }, + { + "epoch": 1.42, + "grad_norm": 0.6038377285003662, + "learning_rate": 0.00032539637679430576, + "loss": 2.978, + "step": 28956 + }, + { + "epoch": 1.42, + "grad_norm": 0.5637387037277222, + "learning_rate": 0.00032538103583727544, + "loss": 3.1269, + "step": 28957 + }, + { + "epoch": 1.42, + "grad_norm": 0.6748733520507812, + "learning_rate": 0.00032536569481339637, + "loss": 3.1292, + "step": 28958 + }, + { + "epoch": 1.42, + "grad_norm": 0.5864824652671814, + "learning_rate": 0.00032535035372270897, + "loss": 3.1735, + "step": 28959 + }, + { + "epoch": 1.42, + "grad_norm": 0.6003410220146179, + "learning_rate": 0.0003253350125652536, + "loss": 3.1991, + "step": 28960 + }, + { + "epoch": 1.42, + "grad_norm": 0.5306596159934998, + "learning_rate": 0.0003253196713410706, + "loss": 2.9204, + "step": 28961 + }, + { + "epoch": 1.42, + "grad_norm": 0.5750020742416382, + "learning_rate": 0.0003253043300502006, + "loss": 3.019, + "step": 28962 + }, + { + "epoch": 1.42, + "grad_norm": 0.5885328054428101, + "learning_rate": 0.0003252889886926838, + "loss": 2.971, + "step": 28963 + }, + { + "epoch": 1.42, + "grad_norm": 0.6015452146530151, + "learning_rate": 0.0003252736472685606, + "loss": 3.061, + "step": 28964 + }, + { + "epoch": 1.42, + "grad_norm": 0.5836628675460815, + "learning_rate": 0.00032525830577787154, + "loss": 3.0505, + "step": 28965 + }, + { + "epoch": 1.42, + "grad_norm": 0.5610659718513489, + "learning_rate": 0.00032524296422065693, + "loss": 3.1, + "step": 28966 + }, + { + "epoch": 1.42, + "grad_norm": 0.5679178237915039, + "learning_rate": 0.0003252276225969572, + "loss": 3.0292, + "step": 28967 + }, + { + "epoch": 1.42, + "grad_norm": 0.541135311126709, + "learning_rate": 0.0003252122809068128, + "loss": 2.9467, + "step": 28968 + }, + { + "epoch": 1.42, + "grad_norm": 0.6564010977745056, + "learning_rate": 0.0003251969391502641, + "loss": 2.9938, + "step": 28969 + }, + { + "epoch": 1.42, + "grad_norm": 0.5690862536430359, + "learning_rate": 0.0003251815973273516, + "loss": 3.131, + "step": 28970 + }, + { + "epoch": 1.42, + "grad_norm": 0.56672203540802, + "learning_rate": 0.0003251662554381154, + "loss": 3.0208, + "step": 28971 + }, + { + "epoch": 1.42, + "grad_norm": 0.6128435730934143, + "learning_rate": 0.0003251509134825963, + "loss": 2.9169, + "step": 28972 + }, + { + "epoch": 1.42, + "grad_norm": 0.5664635896682739, + "learning_rate": 0.00032513557146083454, + "loss": 3.0566, + "step": 28973 + }, + { + "epoch": 1.42, + "grad_norm": 0.5436082482337952, + "learning_rate": 0.0003251202293728705, + "loss": 3.0333, + "step": 28974 + }, + { + "epoch": 1.42, + "grad_norm": 0.6189156770706177, + "learning_rate": 0.0003251048872187446, + "loss": 2.9735, + "step": 28975 + }, + { + "epoch": 1.42, + "grad_norm": 0.5798361897468567, + "learning_rate": 0.0003250895449984972, + "loss": 3.1293, + "step": 28976 + }, + { + "epoch": 1.42, + "grad_norm": 0.584924042224884, + "learning_rate": 0.00032507420271216884, + "loss": 2.9706, + "step": 28977 + }, + { + "epoch": 1.42, + "grad_norm": 0.597545862197876, + "learning_rate": 0.0003250588603597998, + "loss": 3.2138, + "step": 28978 + }, + { + "epoch": 1.42, + "grad_norm": 0.5903199315071106, + "learning_rate": 0.0003250435179414306, + "loss": 3.1368, + "step": 28979 + }, + { + "epoch": 1.42, + "grad_norm": 0.6203091144561768, + "learning_rate": 0.0003250281754571016, + "loss": 3.1918, + "step": 28980 + }, + { + "epoch": 1.42, + "grad_norm": 0.6048563718795776, + "learning_rate": 0.0003250128329068531, + "loss": 3.1939, + "step": 28981 + }, + { + "epoch": 1.42, + "grad_norm": 0.5847277641296387, + "learning_rate": 0.0003249974902907257, + "loss": 3.2304, + "step": 28982 + }, + { + "epoch": 1.42, + "grad_norm": 0.6675156354904175, + "learning_rate": 0.0003249821476087597, + "loss": 2.9924, + "step": 28983 + }, + { + "epoch": 1.42, + "grad_norm": 0.5942085385322571, + "learning_rate": 0.00032496680486099566, + "loss": 2.9132, + "step": 28984 + }, + { + "epoch": 1.42, + "grad_norm": 0.6033164858818054, + "learning_rate": 0.0003249514620474737, + "loss": 3.0188, + "step": 28985 + }, + { + "epoch": 1.42, + "grad_norm": 0.5678269863128662, + "learning_rate": 0.00032493611916823437, + "loss": 3.0577, + "step": 28986 + }, + { + "epoch": 1.42, + "grad_norm": 0.5534139275550842, + "learning_rate": 0.0003249207762233181, + "loss": 2.9875, + "step": 28987 + }, + { + "epoch": 1.42, + "grad_norm": 0.5515173673629761, + "learning_rate": 0.00032490543321276547, + "loss": 3.0418, + "step": 28988 + }, + { + "epoch": 1.42, + "grad_norm": 0.585387110710144, + "learning_rate": 0.0003248900901366166, + "loss": 3.2985, + "step": 28989 + }, + { + "epoch": 1.42, + "grad_norm": 0.5718480944633484, + "learning_rate": 0.00032487474699491206, + "loss": 2.9427, + "step": 28990 + }, + { + "epoch": 1.42, + "grad_norm": 0.5717833638191223, + "learning_rate": 0.0003248594037876922, + "loss": 2.9248, + "step": 28991 + }, + { + "epoch": 1.42, + "grad_norm": 0.5640171766281128, + "learning_rate": 0.0003248440605149974, + "loss": 3.0837, + "step": 28992 + }, + { + "epoch": 1.42, + "grad_norm": 0.5515601634979248, + "learning_rate": 0.00032482871717686817, + "loss": 3.1191, + "step": 28993 + }, + { + "epoch": 1.42, + "grad_norm": 0.5653744339942932, + "learning_rate": 0.00032481337377334487, + "loss": 3.1291, + "step": 28994 + }, + { + "epoch": 1.42, + "grad_norm": 0.5796131491661072, + "learning_rate": 0.00032479803030446803, + "loss": 3.0333, + "step": 28995 + }, + { + "epoch": 1.42, + "grad_norm": 0.5645726919174194, + "learning_rate": 0.0003247826867702778, + "loss": 3.1165, + "step": 28996 + }, + { + "epoch": 1.42, + "grad_norm": 0.5933886170387268, + "learning_rate": 0.0003247673431708148, + "loss": 3.0208, + "step": 28997 + }, + { + "epoch": 1.42, + "grad_norm": 0.5697206258773804, + "learning_rate": 0.00032475199950611933, + "loss": 2.9842, + "step": 28998 + }, + { + "epoch": 1.42, + "grad_norm": 0.5712945461273193, + "learning_rate": 0.000324736655776232, + "loss": 3.1261, + "step": 28999 + }, + { + "epoch": 1.42, + "grad_norm": 0.689601480960846, + "learning_rate": 0.000324721311981193, + "loss": 3.2871, + "step": 29000 + }, + { + "epoch": 1.42, + "grad_norm": 0.5841153264045715, + "learning_rate": 0.00032470596812104275, + "loss": 3.1395, + "step": 29001 + }, + { + "epoch": 1.42, + "grad_norm": 0.6104511618614197, + "learning_rate": 0.0003246906241958218, + "loss": 3.1197, + "step": 29002 + }, + { + "epoch": 1.42, + "grad_norm": 0.575042724609375, + "learning_rate": 0.0003246752802055705, + "loss": 2.9056, + "step": 29003 + }, + { + "epoch": 1.42, + "grad_norm": 0.6196618676185608, + "learning_rate": 0.0003246599361503292, + "loss": 2.9237, + "step": 29004 + }, + { + "epoch": 1.42, + "grad_norm": 0.607953667640686, + "learning_rate": 0.00032464459203013847, + "loss": 2.9469, + "step": 29005 + }, + { + "epoch": 1.42, + "grad_norm": 0.574377179145813, + "learning_rate": 0.00032462924784503865, + "loss": 2.9129, + "step": 29006 + }, + { + "epoch": 1.42, + "grad_norm": 0.5855792760848999, + "learning_rate": 0.00032461390359507, + "loss": 3.2642, + "step": 29007 + }, + { + "epoch": 1.42, + "grad_norm": 0.5718584656715393, + "learning_rate": 0.00032459855928027313, + "loss": 2.9375, + "step": 29008 + }, + { + "epoch": 1.42, + "grad_norm": 0.6099596619606018, + "learning_rate": 0.00032458321490068837, + "loss": 3.0389, + "step": 29009 + }, + { + "epoch": 1.42, + "grad_norm": 0.5883126854896545, + "learning_rate": 0.00032456787045635624, + "loss": 2.9312, + "step": 29010 + }, + { + "epoch": 1.42, + "grad_norm": 0.5781348943710327, + "learning_rate": 0.00032455252594731704, + "loss": 2.8841, + "step": 29011 + }, + { + "epoch": 1.42, + "grad_norm": 0.5610764026641846, + "learning_rate": 0.00032453718137361113, + "loss": 2.9505, + "step": 29012 + }, + { + "epoch": 1.42, + "grad_norm": 0.5973708033561707, + "learning_rate": 0.0003245218367352791, + "loss": 2.8758, + "step": 29013 + }, + { + "epoch": 1.42, + "grad_norm": 0.6375737190246582, + "learning_rate": 0.00032450649203236116, + "loss": 2.9811, + "step": 29014 + }, + { + "epoch": 1.42, + "grad_norm": 0.6129302382469177, + "learning_rate": 0.00032449114726489786, + "loss": 3.2167, + "step": 29015 + }, + { + "epoch": 1.42, + "grad_norm": 0.5988703966140747, + "learning_rate": 0.0003244758024329297, + "loss": 3.0494, + "step": 29016 + }, + { + "epoch": 1.42, + "grad_norm": 0.6101521253585815, + "learning_rate": 0.0003244604575364969, + "loss": 3.0984, + "step": 29017 + }, + { + "epoch": 1.42, + "grad_norm": 0.5767313838005066, + "learning_rate": 0.00032444511257564, + "loss": 3.0744, + "step": 29018 + }, + { + "epoch": 1.42, + "grad_norm": 0.6167458891868591, + "learning_rate": 0.0003244297675503993, + "loss": 3.1639, + "step": 29019 + }, + { + "epoch": 1.42, + "grad_norm": 0.5684388875961304, + "learning_rate": 0.0003244144224608154, + "loss": 2.9517, + "step": 29020 + }, + { + "epoch": 1.42, + "grad_norm": 0.5900117754936218, + "learning_rate": 0.0003243990773069286, + "loss": 3.0036, + "step": 29021 + }, + { + "epoch": 1.42, + "grad_norm": 0.72846919298172, + "learning_rate": 0.0003243837320887792, + "loss": 3.1484, + "step": 29022 + }, + { + "epoch": 1.42, + "grad_norm": 0.5999704003334045, + "learning_rate": 0.0003243683868064078, + "loss": 3.0344, + "step": 29023 + }, + { + "epoch": 1.42, + "grad_norm": 0.5632615089416504, + "learning_rate": 0.0003243530414598549, + "loss": 3.179, + "step": 29024 + }, + { + "epoch": 1.42, + "grad_norm": 0.645645022392273, + "learning_rate": 0.00032433769604916065, + "loss": 3.1495, + "step": 29025 + }, + { + "epoch": 1.42, + "grad_norm": 0.6163223385810852, + "learning_rate": 0.0003243223505743656, + "loss": 2.983, + "step": 29026 + }, + { + "epoch": 1.42, + "grad_norm": 0.5711262226104736, + "learning_rate": 0.0003243070050355102, + "loss": 3.2788, + "step": 29027 + }, + { + "epoch": 1.42, + "grad_norm": 0.5460343956947327, + "learning_rate": 0.00032429165943263473, + "loss": 2.9364, + "step": 29028 + }, + { + "epoch": 1.42, + "grad_norm": 0.5538520216941833, + "learning_rate": 0.00032427631376577977, + "loss": 2.8617, + "step": 29029 + }, + { + "epoch": 1.42, + "grad_norm": 0.5613263845443726, + "learning_rate": 0.00032426096803498566, + "loss": 3.1502, + "step": 29030 + }, + { + "epoch": 1.42, + "grad_norm": 0.5788880586624146, + "learning_rate": 0.00032424562224029284, + "loss": 3.0286, + "step": 29031 + }, + { + "epoch": 1.42, + "grad_norm": 0.6063810586929321, + "learning_rate": 0.00032423027638174177, + "loss": 2.8803, + "step": 29032 + }, + { + "epoch": 1.42, + "grad_norm": 0.5913493633270264, + "learning_rate": 0.00032421493045937266, + "loss": 2.9329, + "step": 29033 + }, + { + "epoch": 1.42, + "grad_norm": 0.6406772136688232, + "learning_rate": 0.0003241995844732262, + "loss": 3.2342, + "step": 29034 + }, + { + "epoch": 1.42, + "grad_norm": 0.5965743064880371, + "learning_rate": 0.00032418423842334274, + "loss": 2.9745, + "step": 29035 + }, + { + "epoch": 1.42, + "grad_norm": 0.5714675188064575, + "learning_rate": 0.00032416889230976265, + "loss": 2.8127, + "step": 29036 + }, + { + "epoch": 1.42, + "grad_norm": 0.5749967694282532, + "learning_rate": 0.0003241535461325263, + "loss": 3.0176, + "step": 29037 + }, + { + "epoch": 1.42, + "grad_norm": 0.5987011194229126, + "learning_rate": 0.000324138199891674, + "loss": 3.1697, + "step": 29038 + }, + { + "epoch": 1.42, + "grad_norm": 0.5846742391586304, + "learning_rate": 0.00032412285358724655, + "loss": 3.1089, + "step": 29039 + }, + { + "epoch": 1.42, + "grad_norm": 0.6523012518882751, + "learning_rate": 0.00032410750721928406, + "loss": 2.933, + "step": 29040 + }, + { + "epoch": 1.42, + "grad_norm": 0.5596588850021362, + "learning_rate": 0.00032409216078782706, + "loss": 3.1106, + "step": 29041 + }, + { + "epoch": 1.42, + "grad_norm": 0.5636228322982788, + "learning_rate": 0.00032407681429291594, + "loss": 3.2038, + "step": 29042 + }, + { + "epoch": 1.42, + "grad_norm": 0.5773536562919617, + "learning_rate": 0.0003240614677345911, + "loss": 3.2443, + "step": 29043 + }, + { + "epoch": 1.42, + "grad_norm": 0.6011802554130554, + "learning_rate": 0.0003240461211128929, + "loss": 3.0798, + "step": 29044 + }, + { + "epoch": 1.42, + "grad_norm": 0.5544044971466064, + "learning_rate": 0.00032403077442786195, + "loss": 3.0196, + "step": 29045 + }, + { + "epoch": 1.42, + "grad_norm": 0.6246320605278015, + "learning_rate": 0.0003240154276795387, + "loss": 2.8156, + "step": 29046 + }, + { + "epoch": 1.42, + "grad_norm": 0.5959727764129639, + "learning_rate": 0.0003240000808679633, + "loss": 3.1569, + "step": 29047 + }, + { + "epoch": 1.42, + "grad_norm": 0.579071581363678, + "learning_rate": 0.00032398473399317617, + "loss": 3.1073, + "step": 29048 + }, + { + "epoch": 1.42, + "grad_norm": 0.5742684006690979, + "learning_rate": 0.000323969387055218, + "loss": 3.3636, + "step": 29049 + }, + { + "epoch": 1.42, + "grad_norm": 0.5994040966033936, + "learning_rate": 0.0003239540400541291, + "loss": 3.1828, + "step": 29050 + }, + { + "epoch": 1.42, + "grad_norm": 0.5804879069328308, + "learning_rate": 0.0003239386929899499, + "loss": 2.8866, + "step": 29051 + }, + { + "epoch": 1.42, + "grad_norm": 0.5392642021179199, + "learning_rate": 0.0003239233458627207, + "loss": 3.1973, + "step": 29052 + }, + { + "epoch": 1.42, + "grad_norm": 0.5686118602752686, + "learning_rate": 0.000323907998672482, + "loss": 2.977, + "step": 29053 + }, + { + "epoch": 1.42, + "grad_norm": 0.5497790575027466, + "learning_rate": 0.00032389265141927426, + "loss": 2.9656, + "step": 29054 + }, + { + "epoch": 1.42, + "grad_norm": 0.5894394516944885, + "learning_rate": 0.00032387730410313794, + "loss": 2.9002, + "step": 29055 + }, + { + "epoch": 1.42, + "grad_norm": 0.5542726516723633, + "learning_rate": 0.0003238619567241133, + "loss": 3.0072, + "step": 29056 + }, + { + "epoch": 1.42, + "grad_norm": 0.5575839877128601, + "learning_rate": 0.00032384660928224095, + "loss": 3.0584, + "step": 29057 + }, + { + "epoch": 1.42, + "grad_norm": 0.5948037505149841, + "learning_rate": 0.0003238312617775611, + "loss": 3.1864, + "step": 29058 + }, + { + "epoch": 1.42, + "grad_norm": 0.5639663934707642, + "learning_rate": 0.0003238159142101144, + "loss": 3.0449, + "step": 29059 + }, + { + "epoch": 1.42, + "grad_norm": 0.5659995079040527, + "learning_rate": 0.0003238005665799411, + "loss": 3.1618, + "step": 29060 + }, + { + "epoch": 1.42, + "grad_norm": 0.5502591133117676, + "learning_rate": 0.0003237852188870818, + "loss": 3.0585, + "step": 29061 + }, + { + "epoch": 1.42, + "grad_norm": 0.6027945280075073, + "learning_rate": 0.00032376987113157666, + "loss": 3.142, + "step": 29062 + }, + { + "epoch": 1.42, + "grad_norm": 0.5965318083763123, + "learning_rate": 0.00032375452331346634, + "loss": 3.0064, + "step": 29063 + }, + { + "epoch": 1.42, + "grad_norm": 0.5846608877182007, + "learning_rate": 0.00032373917543279115, + "loss": 3.1892, + "step": 29064 + }, + { + "epoch": 1.42, + "grad_norm": 0.6004025340080261, + "learning_rate": 0.00032372382748959165, + "loss": 3.1346, + "step": 29065 + }, + { + "epoch": 1.42, + "grad_norm": 0.581133246421814, + "learning_rate": 0.000323708479483908, + "loss": 2.9457, + "step": 29066 + }, + { + "epoch": 1.42, + "grad_norm": 0.5575944781303406, + "learning_rate": 0.0003236931314157808, + "loss": 3.0139, + "step": 29067 + }, + { + "epoch": 1.42, + "grad_norm": 0.589728832244873, + "learning_rate": 0.0003236777832852506, + "loss": 3.127, + "step": 29068 + }, + { + "epoch": 1.42, + "grad_norm": 0.5565273761749268, + "learning_rate": 0.00032366243509235744, + "loss": 3.1569, + "step": 29069 + }, + { + "epoch": 1.42, + "grad_norm": 0.5574138164520264, + "learning_rate": 0.00032364708683714206, + "loss": 3.0355, + "step": 29070 + }, + { + "epoch": 1.42, + "grad_norm": 0.5988021492958069, + "learning_rate": 0.00032363173851964486, + "loss": 2.9168, + "step": 29071 + }, + { + "epoch": 1.42, + "grad_norm": 0.5684751272201538, + "learning_rate": 0.0003236163901399063, + "loss": 3.1833, + "step": 29072 + }, + { + "epoch": 1.42, + "grad_norm": 0.619386613368988, + "learning_rate": 0.0003236010416979666, + "loss": 2.9159, + "step": 29073 + }, + { + "epoch": 1.42, + "grad_norm": 0.5681483149528503, + "learning_rate": 0.0003235856931938662, + "loss": 2.9759, + "step": 29074 + }, + { + "epoch": 1.42, + "grad_norm": 0.5597151517868042, + "learning_rate": 0.0003235703446276459, + "loss": 3.2504, + "step": 29075 + }, + { + "epoch": 1.42, + "grad_norm": 0.5716343522071838, + "learning_rate": 0.0003235549959993456, + "loss": 3.0314, + "step": 29076 + }, + { + "epoch": 1.42, + "grad_norm": 0.6016743779182434, + "learning_rate": 0.00032353964730900606, + "loss": 3.0251, + "step": 29077 + }, + { + "epoch": 1.43, + "grad_norm": 0.6275364756584167, + "learning_rate": 0.0003235242985566676, + "loss": 2.8835, + "step": 29078 + }, + { + "epoch": 1.43, + "grad_norm": 0.5814191699028015, + "learning_rate": 0.00032350894974237065, + "loss": 3.0817, + "step": 29079 + }, + { + "epoch": 1.43, + "grad_norm": 0.5964180827140808, + "learning_rate": 0.0003234936008661557, + "loss": 2.8892, + "step": 29080 + }, + { + "epoch": 1.43, + "grad_norm": 0.5740237236022949, + "learning_rate": 0.00032347825192806314, + "loss": 2.9775, + "step": 29081 + }, + { + "epoch": 1.43, + "grad_norm": 0.5742438435554504, + "learning_rate": 0.00032346290292813325, + "loss": 2.7709, + "step": 29082 + }, + { + "epoch": 1.43, + "grad_norm": 0.5780929327011108, + "learning_rate": 0.00032344755386640677, + "loss": 3.0928, + "step": 29083 + }, + { + "epoch": 1.43, + "grad_norm": 0.5740398168563843, + "learning_rate": 0.0003234322047429238, + "loss": 3.1639, + "step": 29084 + }, + { + "epoch": 1.43, + "grad_norm": 0.6118278503417969, + "learning_rate": 0.00032341685555772496, + "loss": 3.0158, + "step": 29085 + }, + { + "epoch": 1.43, + "grad_norm": 0.5554642677307129, + "learning_rate": 0.00032340150631085074, + "loss": 3.0543, + "step": 29086 + }, + { + "epoch": 1.43, + "grad_norm": 0.5830543041229248, + "learning_rate": 0.00032338615700234135, + "loss": 3.0372, + "step": 29087 + }, + { + "epoch": 1.43, + "grad_norm": 0.5947121381759644, + "learning_rate": 0.00032337080763223736, + "loss": 2.7584, + "step": 29088 + }, + { + "epoch": 1.43, + "grad_norm": 0.5922091007232666, + "learning_rate": 0.00032335545820057904, + "loss": 3.1258, + "step": 29089 + }, + { + "epoch": 1.43, + "grad_norm": 0.5475114583969116, + "learning_rate": 0.0003233401087074071, + "loss": 2.8411, + "step": 29090 + }, + { + "epoch": 1.43, + "grad_norm": 0.5651057362556458, + "learning_rate": 0.0003233247591527617, + "loss": 3.0353, + "step": 29091 + }, + { + "epoch": 1.43, + "grad_norm": 0.5793414115905762, + "learning_rate": 0.0003233094095366834, + "loss": 2.8918, + "step": 29092 + }, + { + "epoch": 1.43, + "grad_norm": 0.5808545351028442, + "learning_rate": 0.0003232940598592126, + "loss": 3.0612, + "step": 29093 + }, + { + "epoch": 1.43, + "grad_norm": 0.5640668272972107, + "learning_rate": 0.00032327871012038977, + "loss": 3.0701, + "step": 29094 + }, + { + "epoch": 1.43, + "grad_norm": 0.5635700225830078, + "learning_rate": 0.00032326336032025525, + "loss": 3.1003, + "step": 29095 + }, + { + "epoch": 1.43, + "grad_norm": 0.5963321328163147, + "learning_rate": 0.0003232480104588495, + "loss": 2.8877, + "step": 29096 + }, + { + "epoch": 1.43, + "grad_norm": 0.5794896483421326, + "learning_rate": 0.0003232326605362131, + "loss": 2.9719, + "step": 29097 + }, + { + "epoch": 1.43, + "grad_norm": 0.5805581212043762, + "learning_rate": 0.0003232173105523862, + "loss": 3.0281, + "step": 29098 + }, + { + "epoch": 1.43, + "grad_norm": 0.600281834602356, + "learning_rate": 0.00032320196050740935, + "loss": 3.3348, + "step": 29099 + }, + { + "epoch": 1.43, + "grad_norm": 0.5603992938995361, + "learning_rate": 0.00032318661040132307, + "loss": 3.0012, + "step": 29100 + }, + { + "epoch": 1.43, + "grad_norm": 0.6025208830833435, + "learning_rate": 0.0003231712602341678, + "loss": 2.9332, + "step": 29101 + }, + { + "epoch": 1.43, + "grad_norm": 0.5629807710647583, + "learning_rate": 0.00032315591000598375, + "loss": 3.1528, + "step": 29102 + }, + { + "epoch": 1.43, + "grad_norm": 0.5991537570953369, + "learning_rate": 0.0003231405597168116, + "loss": 3.1373, + "step": 29103 + }, + { + "epoch": 1.43, + "grad_norm": 0.5493240356445312, + "learning_rate": 0.0003231252093666916, + "loss": 2.9525, + "step": 29104 + }, + { + "epoch": 1.43, + "grad_norm": 0.5737966299057007, + "learning_rate": 0.0003231098589556642, + "loss": 2.9983, + "step": 29105 + }, + { + "epoch": 1.43, + "grad_norm": 0.6087252497673035, + "learning_rate": 0.00032309450848377, + "loss": 3.0698, + "step": 29106 + }, + { + "epoch": 1.43, + "grad_norm": 0.6365086436271667, + "learning_rate": 0.00032307915795104925, + "loss": 2.9886, + "step": 29107 + }, + { + "epoch": 1.43, + "grad_norm": 0.5757536292076111, + "learning_rate": 0.0003230638073575425, + "loss": 2.9974, + "step": 29108 + }, + { + "epoch": 1.43, + "grad_norm": 0.5831839442253113, + "learning_rate": 0.00032304845670329003, + "loss": 3.0675, + "step": 29109 + }, + { + "epoch": 1.43, + "grad_norm": 0.5890743136405945, + "learning_rate": 0.0003230331059883323, + "loss": 2.9943, + "step": 29110 + }, + { + "epoch": 1.43, + "grad_norm": 0.6159026622772217, + "learning_rate": 0.0003230177552127099, + "loss": 3.2641, + "step": 29111 + }, + { + "epoch": 1.43, + "grad_norm": 0.6064269542694092, + "learning_rate": 0.0003230024043764632, + "loss": 3.031, + "step": 29112 + }, + { + "epoch": 1.43, + "grad_norm": 0.6121566891670227, + "learning_rate": 0.0003229870534796326, + "loss": 3.0392, + "step": 29113 + }, + { + "epoch": 1.43, + "grad_norm": 0.6181187629699707, + "learning_rate": 0.0003229717025222585, + "loss": 3.1578, + "step": 29114 + }, + { + "epoch": 1.43, + "grad_norm": 0.6169598698616028, + "learning_rate": 0.00032295635150438127, + "loss": 3.1092, + "step": 29115 + }, + { + "epoch": 1.43, + "grad_norm": 0.5787153840065002, + "learning_rate": 0.0003229410004260415, + "loss": 3.0131, + "step": 29116 + }, + { + "epoch": 1.43, + "grad_norm": 0.5475699305534363, + "learning_rate": 0.00032292564928727954, + "loss": 3.3638, + "step": 29117 + }, + { + "epoch": 1.43, + "grad_norm": 0.5485643744468689, + "learning_rate": 0.0003229102980881358, + "loss": 2.7888, + "step": 29118 + }, + { + "epoch": 1.43, + "grad_norm": 0.5797244310379028, + "learning_rate": 0.0003228949468286509, + "loss": 3.1413, + "step": 29119 + }, + { + "epoch": 1.43, + "grad_norm": 0.5789276957511902, + "learning_rate": 0.00032287959550886486, + "loss": 2.7676, + "step": 29120 + }, + { + "epoch": 1.43, + "grad_norm": 0.5958558917045593, + "learning_rate": 0.0003228642441288185, + "loss": 3.1023, + "step": 29121 + }, + { + "epoch": 1.43, + "grad_norm": 0.5825849175453186, + "learning_rate": 0.00032284889268855213, + "loss": 3.2999, + "step": 29122 + }, + { + "epoch": 1.43, + "grad_norm": 0.5534700751304626, + "learning_rate": 0.0003228335411881063, + "loss": 3.1667, + "step": 29123 + }, + { + "epoch": 1.43, + "grad_norm": 0.581171989440918, + "learning_rate": 0.00032281818962752115, + "loss": 2.8703, + "step": 29124 + }, + { + "epoch": 1.43, + "grad_norm": 0.5967993140220642, + "learning_rate": 0.00032280283800683717, + "loss": 3.2153, + "step": 29125 + }, + { + "epoch": 1.43, + "grad_norm": 0.5790849924087524, + "learning_rate": 0.00032278748632609517, + "loss": 2.9798, + "step": 29126 + }, + { + "epoch": 1.43, + "grad_norm": 0.5897535681724548, + "learning_rate": 0.00032277213458533515, + "loss": 3.0861, + "step": 29127 + }, + { + "epoch": 1.43, + "grad_norm": 0.5886898636817932, + "learning_rate": 0.0003227567827845978, + "loss": 2.9372, + "step": 29128 + }, + { + "epoch": 1.43, + "grad_norm": 0.5656288862228394, + "learning_rate": 0.00032274143092392337, + "loss": 3.1163, + "step": 29129 + }, + { + "epoch": 1.43, + "grad_norm": 0.5811963081359863, + "learning_rate": 0.00032272607900335243, + "loss": 3.0505, + "step": 29130 + }, + { + "epoch": 1.43, + "grad_norm": 0.5721598267555237, + "learning_rate": 0.0003227107270229254, + "loss": 3.1477, + "step": 29131 + }, + { + "epoch": 1.43, + "grad_norm": 0.5497804880142212, + "learning_rate": 0.0003226953749826826, + "loss": 3.1137, + "step": 29132 + }, + { + "epoch": 1.43, + "grad_norm": 0.6076148748397827, + "learning_rate": 0.0003226800228826646, + "loss": 3.0502, + "step": 29133 + }, + { + "epoch": 1.43, + "grad_norm": 0.5574772357940674, + "learning_rate": 0.0003226646707229119, + "loss": 2.9401, + "step": 29134 + }, + { + "epoch": 1.43, + "grad_norm": 0.5898085236549377, + "learning_rate": 0.00032264931850346464, + "loss": 3.1227, + "step": 29135 + }, + { + "epoch": 1.43, + "grad_norm": 0.5748165845870972, + "learning_rate": 0.0003226339662243635, + "loss": 3.0951, + "step": 29136 + }, + { + "epoch": 1.43, + "grad_norm": 0.6368452310562134, + "learning_rate": 0.0003226186138856489, + "loss": 3.0673, + "step": 29137 + }, + { + "epoch": 1.43, + "grad_norm": 0.5354744791984558, + "learning_rate": 0.0003226032614873612, + "loss": 3.025, + "step": 29138 + }, + { + "epoch": 1.43, + "grad_norm": 0.5715710520744324, + "learning_rate": 0.0003225879090295408, + "loss": 3.2214, + "step": 29139 + }, + { + "epoch": 1.43, + "grad_norm": 0.6316717863082886, + "learning_rate": 0.00032257255651222826, + "loss": 3.273, + "step": 29140 + }, + { + "epoch": 1.43, + "grad_norm": 0.5766654014587402, + "learning_rate": 0.0003225572039354639, + "loss": 3.1074, + "step": 29141 + }, + { + "epoch": 1.43, + "grad_norm": 0.6002153754234314, + "learning_rate": 0.0003225418512992883, + "loss": 3.0406, + "step": 29142 + }, + { + "epoch": 1.43, + "grad_norm": 0.6038157343864441, + "learning_rate": 0.0003225264986037417, + "loss": 3.014, + "step": 29143 + }, + { + "epoch": 1.43, + "grad_norm": 0.5729610323905945, + "learning_rate": 0.00032251114584886466, + "loss": 2.8022, + "step": 29144 + }, + { + "epoch": 1.43, + "grad_norm": 0.5932610034942627, + "learning_rate": 0.00032249579303469765, + "loss": 3.0532, + "step": 29145 + }, + { + "epoch": 1.43, + "grad_norm": 0.5910505056381226, + "learning_rate": 0.000322480440161281, + "loss": 2.8465, + "step": 29146 + }, + { + "epoch": 1.43, + "grad_norm": 0.5964004397392273, + "learning_rate": 0.00032246508722865515, + "loss": 3.0927, + "step": 29147 + }, + { + "epoch": 1.43, + "grad_norm": 0.5830546617507935, + "learning_rate": 0.00032244973423686075, + "loss": 3.0487, + "step": 29148 + }, + { + "epoch": 1.43, + "grad_norm": 0.5955746173858643, + "learning_rate": 0.0003224343811859379, + "loss": 3.1017, + "step": 29149 + }, + { + "epoch": 1.43, + "grad_norm": 0.6120107173919678, + "learning_rate": 0.0003224190280759273, + "loss": 3.0038, + "step": 29150 + }, + { + "epoch": 1.43, + "grad_norm": 0.6444991827011108, + "learning_rate": 0.0003224036749068692, + "loss": 3.089, + "step": 29151 + }, + { + "epoch": 1.43, + "grad_norm": 0.5521337389945984, + "learning_rate": 0.0003223883216788042, + "loss": 3.0886, + "step": 29152 + }, + { + "epoch": 1.43, + "grad_norm": 0.6167043447494507, + "learning_rate": 0.00032237296839177266, + "loss": 2.9639, + "step": 29153 + }, + { + "epoch": 1.43, + "grad_norm": 0.6153653264045715, + "learning_rate": 0.000322357615045815, + "loss": 3.2767, + "step": 29154 + }, + { + "epoch": 1.43, + "grad_norm": 0.6091169118881226, + "learning_rate": 0.00032234226164097173, + "loss": 3.0682, + "step": 29155 + }, + { + "epoch": 1.43, + "grad_norm": 0.577284574508667, + "learning_rate": 0.00032232690817728327, + "loss": 3.0631, + "step": 29156 + }, + { + "epoch": 1.43, + "grad_norm": 0.6353074908256531, + "learning_rate": 0.00032231155465478996, + "loss": 3.0545, + "step": 29157 + }, + { + "epoch": 1.43, + "grad_norm": 0.6882631778717041, + "learning_rate": 0.00032229620107353236, + "loss": 3.0973, + "step": 29158 + }, + { + "epoch": 1.43, + "grad_norm": 0.6139711141586304, + "learning_rate": 0.0003222808474335509, + "loss": 2.9405, + "step": 29159 + }, + { + "epoch": 1.43, + "grad_norm": 0.6546551585197449, + "learning_rate": 0.00032226549373488584, + "loss": 3.1871, + "step": 29160 + }, + { + "epoch": 1.43, + "grad_norm": 0.5913887619972229, + "learning_rate": 0.00032225013997757776, + "loss": 3.0151, + "step": 29161 + }, + { + "epoch": 1.43, + "grad_norm": 0.5905306935310364, + "learning_rate": 0.0003222347861616672, + "loss": 2.8942, + "step": 29162 + }, + { + "epoch": 1.43, + "grad_norm": 0.5535931587219238, + "learning_rate": 0.0003222194322871945, + "loss": 3.1094, + "step": 29163 + }, + { + "epoch": 1.43, + "grad_norm": 0.6039665341377258, + "learning_rate": 0.00032220407835420006, + "loss": 2.9505, + "step": 29164 + }, + { + "epoch": 1.43, + "grad_norm": 0.5726522207260132, + "learning_rate": 0.0003221887243627243, + "loss": 2.9469, + "step": 29165 + }, + { + "epoch": 1.43, + "grad_norm": 0.6165740489959717, + "learning_rate": 0.00032217337031280774, + "loss": 2.9994, + "step": 29166 + }, + { + "epoch": 1.43, + "grad_norm": 0.6031109690666199, + "learning_rate": 0.0003221580162044908, + "loss": 3.1434, + "step": 29167 + }, + { + "epoch": 1.43, + "grad_norm": 0.6150469779968262, + "learning_rate": 0.0003221426620378139, + "loss": 2.9249, + "step": 29168 + }, + { + "epoch": 1.43, + "grad_norm": 0.5885416865348816, + "learning_rate": 0.00032212730781281746, + "loss": 3.2877, + "step": 29169 + }, + { + "epoch": 1.43, + "grad_norm": 0.5557782649993896, + "learning_rate": 0.0003221119535295421, + "loss": 3.0221, + "step": 29170 + }, + { + "epoch": 1.43, + "grad_norm": 0.6185104846954346, + "learning_rate": 0.00032209659918802793, + "loss": 3.0152, + "step": 29171 + }, + { + "epoch": 1.43, + "grad_norm": 0.5505481958389282, + "learning_rate": 0.00032208124478831564, + "loss": 3.0942, + "step": 29172 + }, + { + "epoch": 1.43, + "grad_norm": 0.5830556154251099, + "learning_rate": 0.00032206589033044556, + "loss": 3.1301, + "step": 29173 + }, + { + "epoch": 1.43, + "grad_norm": 0.5747976303100586, + "learning_rate": 0.0003220505358144583, + "loss": 3.0054, + "step": 29174 + }, + { + "epoch": 1.43, + "grad_norm": 0.6036010980606079, + "learning_rate": 0.000322035181240394, + "loss": 2.9831, + "step": 29175 + }, + { + "epoch": 1.43, + "grad_norm": 0.6000482439994812, + "learning_rate": 0.0003220198266082933, + "loss": 2.9121, + "step": 29176 + }, + { + "epoch": 1.43, + "grad_norm": 0.5834548473358154, + "learning_rate": 0.00032200447191819677, + "loss": 3.0241, + "step": 29177 + }, + { + "epoch": 1.43, + "grad_norm": 0.5785706639289856, + "learning_rate": 0.00032198911717014453, + "loss": 3.1835, + "step": 29178 + }, + { + "epoch": 1.43, + "grad_norm": 0.5782999992370605, + "learning_rate": 0.0003219737623641773, + "loss": 3.0318, + "step": 29179 + }, + { + "epoch": 1.43, + "grad_norm": 0.5533021688461304, + "learning_rate": 0.00032195840750033535, + "loss": 3.1158, + "step": 29180 + }, + { + "epoch": 1.43, + "grad_norm": 0.5466267466545105, + "learning_rate": 0.00032194305257865914, + "loss": 3.1926, + "step": 29181 + }, + { + "epoch": 1.43, + "grad_norm": 0.624919056892395, + "learning_rate": 0.0003219276975991892, + "loss": 2.9148, + "step": 29182 + }, + { + "epoch": 1.43, + "grad_norm": 0.5868450403213501, + "learning_rate": 0.00032191234256196587, + "loss": 2.9879, + "step": 29183 + }, + { + "epoch": 1.43, + "grad_norm": 0.5854912400245667, + "learning_rate": 0.00032189698746702976, + "loss": 2.9871, + "step": 29184 + }, + { + "epoch": 1.43, + "grad_norm": 0.5936477780342102, + "learning_rate": 0.0003218816323144212, + "loss": 3.0536, + "step": 29185 + }, + { + "epoch": 1.43, + "grad_norm": 0.5781201720237732, + "learning_rate": 0.00032186627710418056, + "loss": 3.1815, + "step": 29186 + }, + { + "epoch": 1.43, + "grad_norm": 0.5902300477027893, + "learning_rate": 0.0003218509218363483, + "loss": 3.0949, + "step": 29187 + }, + { + "epoch": 1.43, + "grad_norm": 0.5788503289222717, + "learning_rate": 0.00032183556651096506, + "loss": 2.8732, + "step": 29188 + }, + { + "epoch": 1.43, + "grad_norm": 0.5969659090042114, + "learning_rate": 0.0003218202111280711, + "loss": 2.9742, + "step": 29189 + }, + { + "epoch": 1.43, + "grad_norm": 0.603798508644104, + "learning_rate": 0.0003218048556877068, + "loss": 3.2926, + "step": 29190 + }, + { + "epoch": 1.43, + "grad_norm": 0.5414276123046875, + "learning_rate": 0.00032178950018991277, + "loss": 3.0918, + "step": 29191 + }, + { + "epoch": 1.43, + "grad_norm": 0.5900503993034363, + "learning_rate": 0.0003217741446347294, + "loss": 3.2083, + "step": 29192 + }, + { + "epoch": 1.43, + "grad_norm": 0.6011453866958618, + "learning_rate": 0.00032175878902219714, + "loss": 3.215, + "step": 29193 + }, + { + "epoch": 1.43, + "grad_norm": 0.6004211902618408, + "learning_rate": 0.00032174343335235637, + "loss": 3.0323, + "step": 29194 + }, + { + "epoch": 1.43, + "grad_norm": 0.5935797095298767, + "learning_rate": 0.0003217280776252476, + "loss": 2.909, + "step": 29195 + }, + { + "epoch": 1.43, + "grad_norm": 0.6498414278030396, + "learning_rate": 0.0003217127218409113, + "loss": 3.0649, + "step": 29196 + }, + { + "epoch": 1.43, + "grad_norm": 0.5944969058036804, + "learning_rate": 0.00032169736599938777, + "loss": 3.234, + "step": 29197 + }, + { + "epoch": 1.43, + "grad_norm": 0.6110669374465942, + "learning_rate": 0.0003216820101007177, + "loss": 2.709, + "step": 29198 + }, + { + "epoch": 1.43, + "grad_norm": 0.5599201321601868, + "learning_rate": 0.0003216666541449413, + "loss": 3.0135, + "step": 29199 + }, + { + "epoch": 1.43, + "grad_norm": 0.5829009413719177, + "learning_rate": 0.0003216512981320991, + "loss": 2.8439, + "step": 29200 + }, + { + "epoch": 1.43, + "grad_norm": 0.6302081942558289, + "learning_rate": 0.0003216359420622315, + "loss": 2.9406, + "step": 29201 + }, + { + "epoch": 1.43, + "grad_norm": 0.6054470539093018, + "learning_rate": 0.000321620585935379, + "loss": 3.1103, + "step": 29202 + }, + { + "epoch": 1.43, + "grad_norm": 0.6377239227294922, + "learning_rate": 0.00032160522975158216, + "loss": 2.9263, + "step": 29203 + }, + { + "epoch": 1.43, + "grad_norm": 0.5627120733261108, + "learning_rate": 0.0003215898735108812, + "loss": 2.8536, + "step": 29204 + }, + { + "epoch": 1.43, + "grad_norm": 0.5820385217666626, + "learning_rate": 0.00032157451721331675, + "loss": 2.9532, + "step": 29205 + }, + { + "epoch": 1.43, + "grad_norm": 0.593466579914093, + "learning_rate": 0.0003215591608589291, + "loss": 3.0497, + "step": 29206 + }, + { + "epoch": 1.43, + "grad_norm": 0.5768133401870728, + "learning_rate": 0.00032154380444775877, + "loss": 3.1535, + "step": 29207 + }, + { + "epoch": 1.43, + "grad_norm": 0.6087096929550171, + "learning_rate": 0.0003215284479798462, + "loss": 3.0309, + "step": 29208 + }, + { + "epoch": 1.43, + "grad_norm": 0.6114547848701477, + "learning_rate": 0.0003215130914552319, + "loss": 3.0905, + "step": 29209 + }, + { + "epoch": 1.43, + "grad_norm": 0.6147521734237671, + "learning_rate": 0.00032149773487395624, + "loss": 2.9946, + "step": 29210 + }, + { + "epoch": 1.43, + "grad_norm": 0.5623575448989868, + "learning_rate": 0.0003214823782360597, + "loss": 2.9643, + "step": 29211 + }, + { + "epoch": 1.43, + "grad_norm": 0.5678356289863586, + "learning_rate": 0.00032146702154158257, + "loss": 3.0765, + "step": 29212 + }, + { + "epoch": 1.43, + "grad_norm": 0.5572715401649475, + "learning_rate": 0.00032145166479056553, + "loss": 2.6901, + "step": 29213 + }, + { + "epoch": 1.43, + "grad_norm": 0.6046434640884399, + "learning_rate": 0.00032143630798304903, + "loss": 2.9514, + "step": 29214 + }, + { + "epoch": 1.43, + "grad_norm": 0.6713277101516724, + "learning_rate": 0.0003214209511190733, + "loss": 2.9896, + "step": 29215 + }, + { + "epoch": 1.43, + "grad_norm": 0.6149976253509521, + "learning_rate": 0.00032140559419867895, + "loss": 3.0438, + "step": 29216 + }, + { + "epoch": 1.43, + "grad_norm": 0.603386640548706, + "learning_rate": 0.0003213902372219064, + "loss": 3.2325, + "step": 29217 + }, + { + "epoch": 1.43, + "grad_norm": 0.5698647499084473, + "learning_rate": 0.000321374880188796, + "loss": 2.9808, + "step": 29218 + }, + { + "epoch": 1.43, + "grad_norm": 0.588376522064209, + "learning_rate": 0.00032135952309938836, + "loss": 3.0994, + "step": 29219 + }, + { + "epoch": 1.43, + "grad_norm": 0.5656581521034241, + "learning_rate": 0.0003213441659537238, + "loss": 3.2486, + "step": 29220 + }, + { + "epoch": 1.43, + "grad_norm": 0.56963711977005, + "learning_rate": 0.0003213288087518429, + "loss": 3.1403, + "step": 29221 + }, + { + "epoch": 1.43, + "grad_norm": 0.5951929092407227, + "learning_rate": 0.0003213134514937859, + "loss": 2.8392, + "step": 29222 + }, + { + "epoch": 1.43, + "grad_norm": 0.5948726534843445, + "learning_rate": 0.00032129809417959346, + "loss": 3.0278, + "step": 29223 + }, + { + "epoch": 1.43, + "grad_norm": 0.632287323474884, + "learning_rate": 0.0003212827368093059, + "loss": 3.0306, + "step": 29224 + }, + { + "epoch": 1.43, + "grad_norm": 0.5774150490760803, + "learning_rate": 0.00032126737938296376, + "loss": 3.0625, + "step": 29225 + }, + { + "epoch": 1.43, + "grad_norm": 0.687145471572876, + "learning_rate": 0.0003212520219006074, + "loss": 3.0098, + "step": 29226 + }, + { + "epoch": 1.43, + "grad_norm": 0.615565299987793, + "learning_rate": 0.00032123666436227735, + "loss": 3.0459, + "step": 29227 + }, + { + "epoch": 1.43, + "grad_norm": 0.6112501621246338, + "learning_rate": 0.0003212213067680139, + "loss": 3.0546, + "step": 29228 + }, + { + "epoch": 1.43, + "grad_norm": 0.5874766707420349, + "learning_rate": 0.0003212059491178577, + "loss": 2.7821, + "step": 29229 + }, + { + "epoch": 1.43, + "grad_norm": 0.64829021692276, + "learning_rate": 0.0003211905914118491, + "loss": 3.0968, + "step": 29230 + }, + { + "epoch": 1.43, + "grad_norm": 0.6121979355812073, + "learning_rate": 0.0003211752336500285, + "loss": 3.0179, + "step": 29231 + }, + { + "epoch": 1.43, + "grad_norm": 0.6101468801498413, + "learning_rate": 0.00032115987583243653, + "loss": 2.9941, + "step": 29232 + }, + { + "epoch": 1.43, + "grad_norm": 0.5940099358558655, + "learning_rate": 0.0003211445179591134, + "loss": 3.0411, + "step": 29233 + }, + { + "epoch": 1.43, + "grad_norm": 0.5766475796699524, + "learning_rate": 0.00032112916003009965, + "loss": 2.939, + "step": 29234 + }, + { + "epoch": 1.43, + "grad_norm": 0.5749085545539856, + "learning_rate": 0.00032111380204543586, + "loss": 3.0331, + "step": 29235 + }, + { + "epoch": 1.43, + "grad_norm": 0.5809697508811951, + "learning_rate": 0.0003210984440051624, + "loss": 3.1916, + "step": 29236 + }, + { + "epoch": 1.43, + "grad_norm": 0.5791935920715332, + "learning_rate": 0.00032108308590931965, + "loss": 2.8306, + "step": 29237 + }, + { + "epoch": 1.43, + "grad_norm": 0.6096273064613342, + "learning_rate": 0.00032106772775794806, + "loss": 3.0382, + "step": 29238 + }, + { + "epoch": 1.43, + "grad_norm": 0.5717841386795044, + "learning_rate": 0.00032105236955108825, + "loss": 2.9114, + "step": 29239 + }, + { + "epoch": 1.43, + "grad_norm": 0.589238166809082, + "learning_rate": 0.0003210370112887805, + "loss": 3.0616, + "step": 29240 + }, + { + "epoch": 1.43, + "grad_norm": 0.6342265009880066, + "learning_rate": 0.0003210216529710653, + "loss": 3.1, + "step": 29241 + }, + { + "epoch": 1.43, + "grad_norm": 0.609419047832489, + "learning_rate": 0.00032100629459798303, + "loss": 3.0961, + "step": 29242 + }, + { + "epoch": 1.43, + "grad_norm": 0.5756333470344543, + "learning_rate": 0.0003209909361695743, + "loss": 2.8789, + "step": 29243 + }, + { + "epoch": 1.43, + "grad_norm": 0.5944945812225342, + "learning_rate": 0.0003209755776858795, + "loss": 2.9332, + "step": 29244 + }, + { + "epoch": 1.43, + "grad_norm": 0.612972617149353, + "learning_rate": 0.00032096021914693906, + "loss": 3.2249, + "step": 29245 + }, + { + "epoch": 1.43, + "grad_norm": 0.6202994585037231, + "learning_rate": 0.0003209448605527934, + "loss": 2.9364, + "step": 29246 + }, + { + "epoch": 1.43, + "grad_norm": 0.6110982298851013, + "learning_rate": 0.0003209295019034831, + "loss": 3.1885, + "step": 29247 + }, + { + "epoch": 1.43, + "grad_norm": 0.5498843193054199, + "learning_rate": 0.0003209141431990484, + "loss": 3.1529, + "step": 29248 + }, + { + "epoch": 1.43, + "grad_norm": 0.5546060800552368, + "learning_rate": 0.00032089878443952984, + "loss": 3.0771, + "step": 29249 + }, + { + "epoch": 1.43, + "grad_norm": 0.5500269532203674, + "learning_rate": 0.0003208834256249681, + "loss": 3.0521, + "step": 29250 + }, + { + "epoch": 1.43, + "grad_norm": 0.5937256217002869, + "learning_rate": 0.0003208680667554033, + "loss": 3.1337, + "step": 29251 + }, + { + "epoch": 1.43, + "grad_norm": 0.5819010734558105, + "learning_rate": 0.00032085270783087605, + "loss": 3.4204, + "step": 29252 + }, + { + "epoch": 1.43, + "grad_norm": 0.5727535486221313, + "learning_rate": 0.00032083734885142673, + "loss": 2.879, + "step": 29253 + }, + { + "epoch": 1.43, + "grad_norm": 0.6039817333221436, + "learning_rate": 0.0003208219898170959, + "loss": 2.9596, + "step": 29254 + }, + { + "epoch": 1.43, + "grad_norm": 0.5760222673416138, + "learning_rate": 0.000320806630727924, + "loss": 3.2594, + "step": 29255 + }, + { + "epoch": 1.43, + "grad_norm": 0.5537810921669006, + "learning_rate": 0.00032079127158395136, + "loss": 3.0045, + "step": 29256 + }, + { + "epoch": 1.43, + "grad_norm": 0.5716868042945862, + "learning_rate": 0.00032077591238521857, + "loss": 2.921, + "step": 29257 + }, + { + "epoch": 1.43, + "grad_norm": 0.5600032210350037, + "learning_rate": 0.00032076055313176593, + "loss": 3.0156, + "step": 29258 + }, + { + "epoch": 1.43, + "grad_norm": 0.5719262361526489, + "learning_rate": 0.00032074519382363406, + "loss": 3.1698, + "step": 29259 + }, + { + "epoch": 1.43, + "grad_norm": 0.5441270470619202, + "learning_rate": 0.0003207298344608634, + "loss": 3.0605, + "step": 29260 + }, + { + "epoch": 1.43, + "grad_norm": 0.5567929744720459, + "learning_rate": 0.00032071447504349433, + "loss": 3.0992, + "step": 29261 + }, + { + "epoch": 1.43, + "grad_norm": 0.5503496527671814, + "learning_rate": 0.0003206991155715672, + "loss": 2.9937, + "step": 29262 + }, + { + "epoch": 1.43, + "grad_norm": 0.576248049736023, + "learning_rate": 0.0003206837560451227, + "loss": 2.9549, + "step": 29263 + }, + { + "epoch": 1.43, + "grad_norm": 0.5835015773773193, + "learning_rate": 0.000320668396464201, + "loss": 3.1543, + "step": 29264 + }, + { + "epoch": 1.43, + "grad_norm": 0.5682410597801208, + "learning_rate": 0.00032065303682884293, + "loss": 3.2322, + "step": 29265 + }, + { + "epoch": 1.43, + "grad_norm": 0.5933088660240173, + "learning_rate": 0.0003206376771390887, + "loss": 3.2144, + "step": 29266 + }, + { + "epoch": 1.43, + "grad_norm": 0.5696515440940857, + "learning_rate": 0.0003206223173949787, + "loss": 2.8856, + "step": 29267 + }, + { + "epoch": 1.43, + "grad_norm": 0.6060404181480408, + "learning_rate": 0.0003206069575965535, + "loss": 3.1012, + "step": 29268 + }, + { + "epoch": 1.43, + "grad_norm": 0.5765473246574402, + "learning_rate": 0.0003205915977438536, + "loss": 3.0763, + "step": 29269 + }, + { + "epoch": 1.43, + "grad_norm": 0.5708712339401245, + "learning_rate": 0.00032057623783691936, + "loss": 3.1127, + "step": 29270 + }, + { + "epoch": 1.43, + "grad_norm": 0.5930168032646179, + "learning_rate": 0.0003205608778757912, + "loss": 2.9539, + "step": 29271 + }, + { + "epoch": 1.43, + "grad_norm": 0.5647690892219543, + "learning_rate": 0.0003205455178605099, + "loss": 2.9147, + "step": 29272 + }, + { + "epoch": 1.43, + "grad_norm": 0.6091307401657104, + "learning_rate": 0.00032053015779111543, + "loss": 3.0897, + "step": 29273 + }, + { + "epoch": 1.43, + "grad_norm": 0.5425414443016052, + "learning_rate": 0.00032051479766764845, + "loss": 2.9467, + "step": 29274 + }, + { + "epoch": 1.43, + "grad_norm": 0.589598536491394, + "learning_rate": 0.00032049943749014954, + "loss": 3.1705, + "step": 29275 + }, + { + "epoch": 1.43, + "grad_norm": 0.6278284192085266, + "learning_rate": 0.00032048407725865905, + "loss": 2.9084, + "step": 29276 + }, + { + "epoch": 1.43, + "grad_norm": 0.5885589718818665, + "learning_rate": 0.0003204687169732174, + "loss": 3.0185, + "step": 29277 + }, + { + "epoch": 1.43, + "grad_norm": 0.589991569519043, + "learning_rate": 0.0003204533566338652, + "loss": 3.0392, + "step": 29278 + }, + { + "epoch": 1.43, + "grad_norm": 0.5731289982795715, + "learning_rate": 0.0003204379962406427, + "loss": 2.9205, + "step": 29279 + }, + { + "epoch": 1.43, + "grad_norm": 0.5898691415786743, + "learning_rate": 0.00032042263579359047, + "loss": 3.116, + "step": 29280 + }, + { + "epoch": 1.43, + "grad_norm": 0.5861981511116028, + "learning_rate": 0.0003204072752927489, + "loss": 3.2275, + "step": 29281 + }, + { + "epoch": 1.44, + "grad_norm": 0.5974080562591553, + "learning_rate": 0.00032039191473815854, + "loss": 2.8365, + "step": 29282 + }, + { + "epoch": 1.44, + "grad_norm": 0.569999635219574, + "learning_rate": 0.0003203765541298598, + "loss": 2.9994, + "step": 29283 + }, + { + "epoch": 1.44, + "grad_norm": 0.5541989207267761, + "learning_rate": 0.0003203611934678931, + "loss": 2.6546, + "step": 29284 + }, + { + "epoch": 1.44, + "grad_norm": 0.5642061233520508, + "learning_rate": 0.00032034583275229896, + "loss": 3.0176, + "step": 29285 + }, + { + "epoch": 1.44, + "grad_norm": 0.5687254667282104, + "learning_rate": 0.00032033047198311786, + "loss": 3.0303, + "step": 29286 + }, + { + "epoch": 1.44, + "grad_norm": 0.5605220198631287, + "learning_rate": 0.0003203151111603902, + "loss": 3.0205, + "step": 29287 + }, + { + "epoch": 1.44, + "grad_norm": 0.5649875998497009, + "learning_rate": 0.0003202997502841564, + "loss": 2.9837, + "step": 29288 + }, + { + "epoch": 1.44, + "grad_norm": 0.5587196350097656, + "learning_rate": 0.00032028438935445693, + "loss": 3.0814, + "step": 29289 + }, + { + "epoch": 1.44, + "grad_norm": 0.5571485161781311, + "learning_rate": 0.0003202690283713324, + "loss": 3.0345, + "step": 29290 + }, + { + "epoch": 1.44, + "grad_norm": 0.6157510876655579, + "learning_rate": 0.0003202536673348231, + "loss": 3.1435, + "step": 29291 + }, + { + "epoch": 1.44, + "grad_norm": 0.5682488083839417, + "learning_rate": 0.0003202383062449695, + "loss": 3.1119, + "step": 29292 + }, + { + "epoch": 1.44, + "grad_norm": 0.5798583626747131, + "learning_rate": 0.0003202229451018121, + "loss": 2.833, + "step": 29293 + }, + { + "epoch": 1.44, + "grad_norm": 0.5703584551811218, + "learning_rate": 0.00032020758390539136, + "loss": 3.1149, + "step": 29294 + }, + { + "epoch": 1.44, + "grad_norm": 0.5614786148071289, + "learning_rate": 0.00032019222265574775, + "loss": 3.0893, + "step": 29295 + }, + { + "epoch": 1.44, + "grad_norm": 0.541699230670929, + "learning_rate": 0.0003201768613529217, + "loss": 3.0179, + "step": 29296 + }, + { + "epoch": 1.44, + "grad_norm": 0.615832507610321, + "learning_rate": 0.00032016149999695374, + "loss": 3.193, + "step": 29297 + }, + { + "epoch": 1.44, + "grad_norm": 0.5494009852409363, + "learning_rate": 0.0003201461385878842, + "loss": 3.1504, + "step": 29298 + }, + { + "epoch": 1.44, + "grad_norm": 0.5631309747695923, + "learning_rate": 0.00032013077712575357, + "loss": 3.1214, + "step": 29299 + }, + { + "epoch": 1.44, + "grad_norm": 0.5645419359207153, + "learning_rate": 0.00032011541561060246, + "loss": 3.1237, + "step": 29300 + }, + { + "epoch": 1.44, + "grad_norm": 0.577457070350647, + "learning_rate": 0.0003201000540424712, + "loss": 2.8725, + "step": 29301 + }, + { + "epoch": 1.44, + "grad_norm": 0.6075976490974426, + "learning_rate": 0.0003200846924214002, + "loss": 3.0414, + "step": 29302 + }, + { + "epoch": 1.44, + "grad_norm": 0.5680819749832153, + "learning_rate": 0.00032006933074743, + "loss": 3.1522, + "step": 29303 + }, + { + "epoch": 1.44, + "grad_norm": 0.573302149772644, + "learning_rate": 0.00032005396902060106, + "loss": 3.2003, + "step": 29304 + }, + { + "epoch": 1.44, + "grad_norm": 0.572464108467102, + "learning_rate": 0.0003200386072409538, + "loss": 2.9017, + "step": 29305 + }, + { + "epoch": 1.44, + "grad_norm": 0.562468945980072, + "learning_rate": 0.00032002324540852874, + "loss": 3.0256, + "step": 29306 + }, + { + "epoch": 1.44, + "grad_norm": 0.615037202835083, + "learning_rate": 0.00032000788352336626, + "loss": 2.9762, + "step": 29307 + }, + { + "epoch": 1.44, + "grad_norm": 0.5618579387664795, + "learning_rate": 0.0003199925215855069, + "loss": 3.0673, + "step": 29308 + }, + { + "epoch": 1.44, + "grad_norm": 0.5961542725563049, + "learning_rate": 0.0003199771595949912, + "loss": 3.2632, + "step": 29309 + }, + { + "epoch": 1.44, + "grad_norm": 0.6055513620376587, + "learning_rate": 0.00031996179755185925, + "loss": 2.8112, + "step": 29310 + }, + { + "epoch": 1.44, + "grad_norm": 0.5619701147079468, + "learning_rate": 0.000319946435456152, + "loss": 2.9445, + "step": 29311 + }, + { + "epoch": 1.44, + "grad_norm": 0.5580838322639465, + "learning_rate": 0.00031993107330790957, + "loss": 2.8617, + "step": 29312 + }, + { + "epoch": 1.44, + "grad_norm": 0.5237293839454651, + "learning_rate": 0.00031991571110717254, + "loss": 2.8157, + "step": 29313 + }, + { + "epoch": 1.44, + "grad_norm": 0.545191764831543, + "learning_rate": 0.00031990034885398137, + "loss": 3.2033, + "step": 29314 + }, + { + "epoch": 1.44, + "grad_norm": 0.5724726319313049, + "learning_rate": 0.00031988498654837645, + "loss": 3.0922, + "step": 29315 + }, + { + "epoch": 1.44, + "grad_norm": 0.6132506728172302, + "learning_rate": 0.0003198696241903984, + "loss": 2.8161, + "step": 29316 + }, + { + "epoch": 1.44, + "grad_norm": 0.5981416702270508, + "learning_rate": 0.0003198542617800876, + "loss": 3.0212, + "step": 29317 + }, + { + "epoch": 1.44, + "grad_norm": 0.6016727685928345, + "learning_rate": 0.0003198388993174844, + "loss": 3.171, + "step": 29318 + }, + { + "epoch": 1.44, + "grad_norm": 0.5826240181922913, + "learning_rate": 0.00031982353680262937, + "loss": 2.9993, + "step": 29319 + }, + { + "epoch": 1.44, + "grad_norm": 0.5482321381568909, + "learning_rate": 0.000319808174235563, + "loss": 3.193, + "step": 29320 + }, + { + "epoch": 1.44, + "grad_norm": 0.5854382514953613, + "learning_rate": 0.0003197928116163257, + "loss": 3.4126, + "step": 29321 + }, + { + "epoch": 1.44, + "grad_norm": 0.5981870293617249, + "learning_rate": 0.0003197774489449579, + "loss": 3.1969, + "step": 29322 + }, + { + "epoch": 1.44, + "grad_norm": 0.6282458305358887, + "learning_rate": 0.00031976208622150023, + "loss": 2.8896, + "step": 29323 + }, + { + "epoch": 1.44, + "grad_norm": 0.5802420377731323, + "learning_rate": 0.0003197467234459929, + "loss": 3.0195, + "step": 29324 + }, + { + "epoch": 1.44, + "grad_norm": 0.6020224690437317, + "learning_rate": 0.0003197313606184765, + "loss": 3.0872, + "step": 29325 + }, + { + "epoch": 1.44, + "grad_norm": 0.542820394039154, + "learning_rate": 0.0003197159977389915, + "loss": 3.0749, + "step": 29326 + }, + { + "epoch": 1.44, + "grad_norm": 0.5725154280662537, + "learning_rate": 0.00031970063480757843, + "loss": 3.0649, + "step": 29327 + }, + { + "epoch": 1.44, + "grad_norm": 0.602067232131958, + "learning_rate": 0.00031968527182427766, + "loss": 2.877, + "step": 29328 + }, + { + "epoch": 1.44, + "grad_norm": 0.6186937689781189, + "learning_rate": 0.0003196699087891297, + "loss": 3.1819, + "step": 29329 + }, + { + "epoch": 1.44, + "grad_norm": 0.6243546009063721, + "learning_rate": 0.00031965454570217485, + "loss": 3.1825, + "step": 29330 + }, + { + "epoch": 1.44, + "grad_norm": 0.586065948009491, + "learning_rate": 0.0003196391825634538, + "loss": 2.9946, + "step": 29331 + }, + { + "epoch": 1.44, + "grad_norm": 0.6126989722251892, + "learning_rate": 0.00031962381937300694, + "loss": 3.0819, + "step": 29332 + }, + { + "epoch": 1.44, + "grad_norm": 0.5854355692863464, + "learning_rate": 0.00031960845613087464, + "loss": 3.2089, + "step": 29333 + }, + { + "epoch": 1.44, + "grad_norm": 0.5818975567817688, + "learning_rate": 0.0003195930928370976, + "loss": 3.1475, + "step": 29334 + }, + { + "epoch": 1.44, + "grad_norm": 0.5570355653762817, + "learning_rate": 0.000319577729491716, + "loss": 3.0968, + "step": 29335 + }, + { + "epoch": 1.44, + "grad_norm": 0.5599480867385864, + "learning_rate": 0.0003195623660947704, + "loss": 2.9759, + "step": 29336 + }, + { + "epoch": 1.44, + "grad_norm": 0.5848993062973022, + "learning_rate": 0.00031954700264630136, + "loss": 3.2071, + "step": 29337 + }, + { + "epoch": 1.44, + "grad_norm": 0.5863507390022278, + "learning_rate": 0.00031953163914634936, + "loss": 3.3054, + "step": 29338 + }, + { + "epoch": 1.44, + "grad_norm": 0.5722285509109497, + "learning_rate": 0.0003195162755949547, + "loss": 3.1209, + "step": 29339 + }, + { + "epoch": 1.44, + "grad_norm": 0.5752810835838318, + "learning_rate": 0.00031950091199215775, + "loss": 3.1011, + "step": 29340 + }, + { + "epoch": 1.44, + "grad_norm": 0.6314206123352051, + "learning_rate": 0.0003194855483379994, + "loss": 3.1192, + "step": 29341 + }, + { + "epoch": 1.44, + "grad_norm": 0.5731692910194397, + "learning_rate": 0.00031947018463251975, + "loss": 2.9817, + "step": 29342 + }, + { + "epoch": 1.44, + "grad_norm": 0.5550987124443054, + "learning_rate": 0.0003194548208757594, + "loss": 3.079, + "step": 29343 + }, + { + "epoch": 1.44, + "grad_norm": 0.5599222183227539, + "learning_rate": 0.0003194394570677588, + "loss": 3.0314, + "step": 29344 + }, + { + "epoch": 1.44, + "grad_norm": 0.5602944493293762, + "learning_rate": 0.00031942409320855845, + "loss": 3.015, + "step": 29345 + }, + { + "epoch": 1.44, + "grad_norm": 0.5581718683242798, + "learning_rate": 0.00031940872929819874, + "loss": 3.0226, + "step": 29346 + }, + { + "epoch": 1.44, + "grad_norm": 0.6010013818740845, + "learning_rate": 0.00031939336533672014, + "loss": 3.1017, + "step": 29347 + }, + { + "epoch": 1.44, + "grad_norm": 0.5561316013336182, + "learning_rate": 0.00031937800132416315, + "loss": 3.0053, + "step": 29348 + }, + { + "epoch": 1.44, + "grad_norm": 0.620624840259552, + "learning_rate": 0.0003193626372605683, + "loss": 3.0979, + "step": 29349 + }, + { + "epoch": 1.44, + "grad_norm": 0.5798450708389282, + "learning_rate": 0.000319347273145976, + "loss": 3.1279, + "step": 29350 + }, + { + "epoch": 1.44, + "grad_norm": 0.6187215447425842, + "learning_rate": 0.0003193319089804266, + "loss": 3.1367, + "step": 29351 + }, + { + "epoch": 1.44, + "grad_norm": 0.6134929656982422, + "learning_rate": 0.00031931654476396077, + "loss": 3.0664, + "step": 29352 + }, + { + "epoch": 1.44, + "grad_norm": 0.5657615661621094, + "learning_rate": 0.0003193011804966188, + "loss": 3.0826, + "step": 29353 + }, + { + "epoch": 1.44, + "grad_norm": 0.5925348401069641, + "learning_rate": 0.00031928581617844134, + "loss": 2.892, + "step": 29354 + }, + { + "epoch": 1.44, + "grad_norm": 0.5548837780952454, + "learning_rate": 0.0003192704518094686, + "loss": 2.9854, + "step": 29355 + }, + { + "epoch": 1.44, + "grad_norm": 0.5412815809249878, + "learning_rate": 0.00031925508738974134, + "loss": 2.9617, + "step": 29356 + }, + { + "epoch": 1.44, + "grad_norm": 0.5807236433029175, + "learning_rate": 0.0003192397229192998, + "loss": 3.1411, + "step": 29357 + }, + { + "epoch": 1.44, + "grad_norm": 0.5956936478614807, + "learning_rate": 0.00031922435839818456, + "loss": 3.1444, + "step": 29358 + }, + { + "epoch": 1.44, + "grad_norm": 0.5684614181518555, + "learning_rate": 0.00031920899382643605, + "loss": 2.9935, + "step": 29359 + }, + { + "epoch": 1.44, + "grad_norm": 0.6149534583091736, + "learning_rate": 0.0003191936292040948, + "loss": 2.8515, + "step": 29360 + }, + { + "epoch": 1.44, + "grad_norm": 0.5479702353477478, + "learning_rate": 0.00031917826453120113, + "loss": 3.1102, + "step": 29361 + }, + { + "epoch": 1.44, + "grad_norm": 0.5614345073699951, + "learning_rate": 0.00031916289980779566, + "loss": 3.0526, + "step": 29362 + }, + { + "epoch": 1.44, + "grad_norm": 0.6308231353759766, + "learning_rate": 0.0003191475350339189, + "loss": 3.0099, + "step": 29363 + }, + { + "epoch": 1.44, + "grad_norm": 0.5764047503471375, + "learning_rate": 0.0003191321702096111, + "loss": 3.1026, + "step": 29364 + }, + { + "epoch": 1.44, + "grad_norm": 0.5685030221939087, + "learning_rate": 0.00031911680533491284, + "loss": 2.9923, + "step": 29365 + }, + { + "epoch": 1.44, + "grad_norm": 0.5890762805938721, + "learning_rate": 0.0003191014404098645, + "loss": 2.9788, + "step": 29366 + }, + { + "epoch": 1.44, + "grad_norm": 0.5421032309532166, + "learning_rate": 0.00031908607543450684, + "loss": 3.1369, + "step": 29367 + }, + { + "epoch": 1.44, + "grad_norm": 0.6088888049125671, + "learning_rate": 0.00031907071040888007, + "loss": 3.1285, + "step": 29368 + }, + { + "epoch": 1.44, + "grad_norm": 0.585292637348175, + "learning_rate": 0.00031905534533302465, + "loss": 3.0663, + "step": 29369 + }, + { + "epoch": 1.44, + "grad_norm": 0.5948856472969055, + "learning_rate": 0.0003190399802069812, + "loss": 3.0343, + "step": 29370 + }, + { + "epoch": 1.44, + "grad_norm": 0.6100438833236694, + "learning_rate": 0.0003190246150307901, + "loss": 2.9583, + "step": 29371 + }, + { + "epoch": 1.44, + "grad_norm": 0.5696083307266235, + "learning_rate": 0.00031900924980449174, + "loss": 3.1466, + "step": 29372 + }, + { + "epoch": 1.44, + "grad_norm": 0.5895338654518127, + "learning_rate": 0.0003189938845281267, + "loss": 3.001, + "step": 29373 + }, + { + "epoch": 1.44, + "grad_norm": 0.5648195147514343, + "learning_rate": 0.0003189785192017355, + "loss": 3.0352, + "step": 29374 + }, + { + "epoch": 1.44, + "grad_norm": 0.5830442905426025, + "learning_rate": 0.0003189631538253585, + "loss": 3.0942, + "step": 29375 + }, + { + "epoch": 1.44, + "grad_norm": 0.5957970023155212, + "learning_rate": 0.00031894778839903614, + "loss": 2.8905, + "step": 29376 + }, + { + "epoch": 1.44, + "grad_norm": 0.6936584115028381, + "learning_rate": 0.000318932422922809, + "loss": 3.0502, + "step": 29377 + }, + { + "epoch": 1.44, + "grad_norm": 0.6041005849838257, + "learning_rate": 0.0003189170573967176, + "loss": 3.1924, + "step": 29378 + }, + { + "epoch": 1.44, + "grad_norm": 0.5614645481109619, + "learning_rate": 0.00031890169182080216, + "loss": 3.0978, + "step": 29379 + }, + { + "epoch": 1.44, + "grad_norm": 0.7858589291572571, + "learning_rate": 0.0003188863261951033, + "loss": 3.061, + "step": 29380 + }, + { + "epoch": 1.44, + "grad_norm": 0.6027712225914001, + "learning_rate": 0.00031887096051966154, + "loss": 3.1522, + "step": 29381 + }, + { + "epoch": 1.44, + "grad_norm": 0.6058451533317566, + "learning_rate": 0.00031885559479451735, + "loss": 3.0498, + "step": 29382 + }, + { + "epoch": 1.44, + "grad_norm": 0.6025016903877258, + "learning_rate": 0.0003188402290197111, + "loss": 3.1248, + "step": 29383 + }, + { + "epoch": 1.44, + "grad_norm": 0.6525845527648926, + "learning_rate": 0.0003188248631952833, + "loss": 2.9143, + "step": 29384 + }, + { + "epoch": 1.44, + "grad_norm": 0.5875680446624756, + "learning_rate": 0.0003188094973212745, + "loss": 2.9911, + "step": 29385 + }, + { + "epoch": 1.44, + "grad_norm": 0.5825726985931396, + "learning_rate": 0.00031879413139772507, + "loss": 3.0715, + "step": 29386 + }, + { + "epoch": 1.44, + "grad_norm": 0.5770814418792725, + "learning_rate": 0.00031877876542467544, + "loss": 2.9507, + "step": 29387 + }, + { + "epoch": 1.44, + "grad_norm": 0.5879018306732178, + "learning_rate": 0.00031876339940216624, + "loss": 2.9064, + "step": 29388 + }, + { + "epoch": 1.44, + "grad_norm": 0.5837126970291138, + "learning_rate": 0.0003187480333302379, + "loss": 2.9365, + "step": 29389 + }, + { + "epoch": 1.44, + "grad_norm": 0.5690444111824036, + "learning_rate": 0.00031873266720893075, + "loss": 3.1901, + "step": 29390 + }, + { + "epoch": 1.44, + "grad_norm": 0.5976502299308777, + "learning_rate": 0.00031871730103828544, + "loss": 3.0219, + "step": 29391 + }, + { + "epoch": 1.44, + "grad_norm": 0.6792353987693787, + "learning_rate": 0.0003187019348183423, + "loss": 2.9063, + "step": 29392 + }, + { + "epoch": 1.44, + "grad_norm": 0.5715599656105042, + "learning_rate": 0.00031868656854914184, + "loss": 3.1014, + "step": 29393 + }, + { + "epoch": 1.44, + "grad_norm": 0.5927602648735046, + "learning_rate": 0.0003186712022307246, + "loss": 3.0256, + "step": 29394 + }, + { + "epoch": 1.44, + "grad_norm": 0.583453893661499, + "learning_rate": 0.00031865583586313103, + "loss": 3.0197, + "step": 29395 + }, + { + "epoch": 1.44, + "grad_norm": 0.5917752385139465, + "learning_rate": 0.0003186404694464016, + "loss": 3.064, + "step": 29396 + }, + { + "epoch": 1.44, + "grad_norm": 0.5617307424545288, + "learning_rate": 0.0003186251029805767, + "loss": 3.1082, + "step": 29397 + }, + { + "epoch": 1.44, + "grad_norm": 0.5545293688774109, + "learning_rate": 0.00031860973646569683, + "loss": 2.9116, + "step": 29398 + }, + { + "epoch": 1.44, + "grad_norm": 0.5984163880348206, + "learning_rate": 0.00031859436990180255, + "loss": 3.0933, + "step": 29399 + }, + { + "epoch": 1.44, + "grad_norm": 0.6275175213813782, + "learning_rate": 0.0003185790032889343, + "loss": 3.1102, + "step": 29400 + }, + { + "epoch": 1.44, + "grad_norm": 0.5929582715034485, + "learning_rate": 0.0003185636366271325, + "loss": 3.0094, + "step": 29401 + }, + { + "epoch": 1.44, + "grad_norm": 0.5918399095535278, + "learning_rate": 0.0003185482699164376, + "loss": 2.9954, + "step": 29402 + }, + { + "epoch": 1.44, + "grad_norm": 0.6211867928504944, + "learning_rate": 0.0003185329031568902, + "loss": 3.0263, + "step": 29403 + }, + { + "epoch": 1.44, + "grad_norm": 0.615907609462738, + "learning_rate": 0.0003185175363485307, + "loss": 3.3152, + "step": 29404 + }, + { + "epoch": 1.44, + "grad_norm": 0.594133734703064, + "learning_rate": 0.0003185021694913995, + "loss": 3.0418, + "step": 29405 + }, + { + "epoch": 1.44, + "grad_norm": 0.5710883140563965, + "learning_rate": 0.0003184868025855373, + "loss": 3.1399, + "step": 29406 + }, + { + "epoch": 1.44, + "grad_norm": 0.544840395450592, + "learning_rate": 0.00031847143563098427, + "loss": 2.8117, + "step": 29407 + }, + { + "epoch": 1.44, + "grad_norm": 0.6463403701782227, + "learning_rate": 0.0003184560686277811, + "loss": 3.0027, + "step": 29408 + }, + { + "epoch": 1.44, + "grad_norm": 0.6022986173629761, + "learning_rate": 0.00031844070157596813, + "loss": 2.9204, + "step": 29409 + }, + { + "epoch": 1.44, + "grad_norm": 0.568325400352478, + "learning_rate": 0.00031842533447558596, + "loss": 3.0302, + "step": 29410 + }, + { + "epoch": 1.44, + "grad_norm": 0.573235809803009, + "learning_rate": 0.00031840996732667505, + "loss": 3.1247, + "step": 29411 + }, + { + "epoch": 1.44, + "grad_norm": 0.5684030055999756, + "learning_rate": 0.0003183946001292757, + "loss": 3.1886, + "step": 29412 + }, + { + "epoch": 1.44, + "grad_norm": 0.5798808336257935, + "learning_rate": 0.0003183792328834286, + "loss": 3.2167, + "step": 29413 + }, + { + "epoch": 1.44, + "grad_norm": 0.5618143677711487, + "learning_rate": 0.0003183638655891742, + "loss": 3.0885, + "step": 29414 + }, + { + "epoch": 1.44, + "grad_norm": 0.5687463879585266, + "learning_rate": 0.00031834849824655283, + "loss": 2.8543, + "step": 29415 + }, + { + "epoch": 1.44, + "grad_norm": 0.5497147440910339, + "learning_rate": 0.00031833313085560507, + "loss": 3.104, + "step": 29416 + }, + { + "epoch": 1.44, + "grad_norm": 0.5957204103469849, + "learning_rate": 0.0003183177634163713, + "loss": 3.0252, + "step": 29417 + }, + { + "epoch": 1.44, + "grad_norm": 0.6223195195198059, + "learning_rate": 0.0003183023959288922, + "loss": 2.9171, + "step": 29418 + }, + { + "epoch": 1.44, + "grad_norm": 0.5928622484207153, + "learning_rate": 0.000318287028393208, + "loss": 3.1098, + "step": 29419 + }, + { + "epoch": 1.44, + "grad_norm": 0.567246675491333, + "learning_rate": 0.0003182716608093593, + "loss": 3.0079, + "step": 29420 + }, + { + "epoch": 1.44, + "grad_norm": 0.6125072836875916, + "learning_rate": 0.00031825629317738655, + "loss": 2.9814, + "step": 29421 + }, + { + "epoch": 1.44, + "grad_norm": 0.5972563624382019, + "learning_rate": 0.0003182409254973303, + "loss": 3.1398, + "step": 29422 + }, + { + "epoch": 1.44, + "grad_norm": 0.5955824255943298, + "learning_rate": 0.0003182255577692309, + "loss": 2.8948, + "step": 29423 + }, + { + "epoch": 1.44, + "grad_norm": 0.5571625828742981, + "learning_rate": 0.00031821018999312895, + "loss": 3.1142, + "step": 29424 + }, + { + "epoch": 1.44, + "grad_norm": 0.6010100841522217, + "learning_rate": 0.00031819482216906487, + "loss": 3.1236, + "step": 29425 + }, + { + "epoch": 1.44, + "grad_norm": 0.5668064951896667, + "learning_rate": 0.0003181794542970791, + "loss": 2.9231, + "step": 29426 + }, + { + "epoch": 1.44, + "grad_norm": 0.5944365859031677, + "learning_rate": 0.0003181640863772121, + "loss": 3.2458, + "step": 29427 + }, + { + "epoch": 1.44, + "grad_norm": 0.5827503800392151, + "learning_rate": 0.0003181487184095044, + "loss": 3.2371, + "step": 29428 + }, + { + "epoch": 1.44, + "grad_norm": 0.5670848488807678, + "learning_rate": 0.00031813335039399654, + "loss": 3.0366, + "step": 29429 + }, + { + "epoch": 1.44, + "grad_norm": 0.5577699542045593, + "learning_rate": 0.0003181179823307289, + "loss": 3.1032, + "step": 29430 + }, + { + "epoch": 1.44, + "grad_norm": 0.5776100754737854, + "learning_rate": 0.000318102614219742, + "loss": 3.0566, + "step": 29431 + }, + { + "epoch": 1.44, + "grad_norm": 0.6043997406959534, + "learning_rate": 0.0003180872460610762, + "loss": 2.892, + "step": 29432 + }, + { + "epoch": 1.44, + "grad_norm": 0.584821343421936, + "learning_rate": 0.0003180718778547722, + "loss": 3.0156, + "step": 29433 + }, + { + "epoch": 1.44, + "grad_norm": 0.591511070728302, + "learning_rate": 0.0003180565096008703, + "loss": 3.0376, + "step": 29434 + }, + { + "epoch": 1.44, + "grad_norm": 0.5555319786071777, + "learning_rate": 0.00031804114129941095, + "loss": 3.0716, + "step": 29435 + }, + { + "epoch": 1.44, + "grad_norm": 0.6005762815475464, + "learning_rate": 0.0003180257729504349, + "loss": 3.0269, + "step": 29436 + }, + { + "epoch": 1.44, + "grad_norm": 0.5544703602790833, + "learning_rate": 0.0003180104045539823, + "loss": 3.006, + "step": 29437 + }, + { + "epoch": 1.44, + "grad_norm": 0.5543263554573059, + "learning_rate": 0.00031799503611009366, + "loss": 2.9448, + "step": 29438 + }, + { + "epoch": 1.44, + "grad_norm": 0.5978908538818359, + "learning_rate": 0.0003179796676188097, + "loss": 3.1896, + "step": 29439 + }, + { + "epoch": 1.44, + "grad_norm": 0.5939019322395325, + "learning_rate": 0.0003179642990801708, + "loss": 3.0236, + "step": 29440 + }, + { + "epoch": 1.44, + "grad_norm": 0.5759717226028442, + "learning_rate": 0.00031794893049421724, + "loss": 3.0154, + "step": 29441 + }, + { + "epoch": 1.44, + "grad_norm": 0.5859683156013489, + "learning_rate": 0.00031793356186098976, + "loss": 3.1299, + "step": 29442 + }, + { + "epoch": 1.44, + "grad_norm": 0.6028755307197571, + "learning_rate": 0.0003179181931805287, + "loss": 3.0498, + "step": 29443 + }, + { + "epoch": 1.44, + "grad_norm": 0.5671459436416626, + "learning_rate": 0.0003179028244528746, + "loss": 3.1373, + "step": 29444 + }, + { + "epoch": 1.44, + "grad_norm": 0.5760989785194397, + "learning_rate": 0.00031788745567806786, + "loss": 3.0959, + "step": 29445 + }, + { + "epoch": 1.44, + "grad_norm": 0.5798001885414124, + "learning_rate": 0.00031787208685614903, + "loss": 3.223, + "step": 29446 + }, + { + "epoch": 1.44, + "grad_norm": 0.5691527724266052, + "learning_rate": 0.0003178567179871586, + "loss": 2.9604, + "step": 29447 + }, + { + "epoch": 1.44, + "grad_norm": 0.5820776224136353, + "learning_rate": 0.00031784134907113687, + "loss": 2.9364, + "step": 29448 + }, + { + "epoch": 1.44, + "grad_norm": 0.576815664768219, + "learning_rate": 0.0003178259801081246, + "loss": 2.951, + "step": 29449 + }, + { + "epoch": 1.44, + "grad_norm": 0.5452558994293213, + "learning_rate": 0.000317810611098162, + "loss": 3.0218, + "step": 29450 + }, + { + "epoch": 1.44, + "grad_norm": 0.6160311102867126, + "learning_rate": 0.00031779524204128985, + "loss": 3.0444, + "step": 29451 + }, + { + "epoch": 1.44, + "grad_norm": 0.6155081391334534, + "learning_rate": 0.0003177798729375484, + "loss": 3.1733, + "step": 29452 + }, + { + "epoch": 1.44, + "grad_norm": 0.5370228886604309, + "learning_rate": 0.00031776450378697807, + "loss": 2.9489, + "step": 29453 + }, + { + "epoch": 1.44, + "grad_norm": 0.5774576663970947, + "learning_rate": 0.0003177491345896196, + "loss": 3.1492, + "step": 29454 + }, + { + "epoch": 1.44, + "grad_norm": 0.580348789691925, + "learning_rate": 0.0003177337653455132, + "loss": 3.0479, + "step": 29455 + }, + { + "epoch": 1.44, + "grad_norm": 0.554613471031189, + "learning_rate": 0.0003177183960546996, + "loss": 2.9147, + "step": 29456 + }, + { + "epoch": 1.44, + "grad_norm": 0.5800740122795105, + "learning_rate": 0.00031770302671721906, + "loss": 2.9172, + "step": 29457 + }, + { + "epoch": 1.44, + "grad_norm": 0.5789035558700562, + "learning_rate": 0.0003176876573331123, + "loss": 2.9797, + "step": 29458 + }, + { + "epoch": 1.44, + "grad_norm": 0.6241493821144104, + "learning_rate": 0.0003176722879024194, + "loss": 2.9661, + "step": 29459 + }, + { + "epoch": 1.44, + "grad_norm": 0.6004448533058167, + "learning_rate": 0.00031765691842518127, + "loss": 3.0091, + "step": 29460 + }, + { + "epoch": 1.44, + "grad_norm": 0.6146655678749084, + "learning_rate": 0.00031764154890143815, + "loss": 2.7731, + "step": 29461 + }, + { + "epoch": 1.44, + "grad_norm": 0.5921909213066101, + "learning_rate": 0.0003176261793312307, + "loss": 3.0267, + "step": 29462 + }, + { + "epoch": 1.44, + "grad_norm": 0.6653693914413452, + "learning_rate": 0.0003176108097145992, + "loss": 2.9743, + "step": 29463 + }, + { + "epoch": 1.44, + "grad_norm": 0.575783371925354, + "learning_rate": 0.00031759544005158415, + "loss": 3.0894, + "step": 29464 + }, + { + "epoch": 1.44, + "grad_norm": 0.5989192128181458, + "learning_rate": 0.0003175800703422262, + "loss": 2.8907, + "step": 29465 + }, + { + "epoch": 1.44, + "grad_norm": 0.5665627121925354, + "learning_rate": 0.0003175647005865657, + "loss": 3.0642, + "step": 29466 + }, + { + "epoch": 1.44, + "grad_norm": 0.595417320728302, + "learning_rate": 0.0003175493307846432, + "loss": 3.1442, + "step": 29467 + }, + { + "epoch": 1.44, + "grad_norm": 0.5861067771911621, + "learning_rate": 0.00031753396093649904, + "loss": 2.803, + "step": 29468 + }, + { + "epoch": 1.44, + "grad_norm": 0.5747168064117432, + "learning_rate": 0.0003175185910421739, + "loss": 2.9859, + "step": 29469 + }, + { + "epoch": 1.44, + "grad_norm": 0.5734466314315796, + "learning_rate": 0.0003175032211017081, + "loss": 3.217, + "step": 29470 + }, + { + "epoch": 1.44, + "grad_norm": 0.5525097846984863, + "learning_rate": 0.0003174878511151422, + "loss": 3.0217, + "step": 29471 + }, + { + "epoch": 1.44, + "grad_norm": 0.5869901776313782, + "learning_rate": 0.0003174724810825166, + "loss": 2.9827, + "step": 29472 + }, + { + "epoch": 1.44, + "grad_norm": 0.5891143679618835, + "learning_rate": 0.00031745711100387205, + "loss": 3.0641, + "step": 29473 + }, + { + "epoch": 1.44, + "grad_norm": 0.5963634252548218, + "learning_rate": 0.0003174417408792486, + "loss": 2.9668, + "step": 29474 + }, + { + "epoch": 1.44, + "grad_norm": 0.5769605040550232, + "learning_rate": 0.00031742637070868705, + "loss": 3.2163, + "step": 29475 + }, + { + "epoch": 1.44, + "grad_norm": 0.5539005398750305, + "learning_rate": 0.00031741100049222787, + "loss": 3.0295, + "step": 29476 + }, + { + "epoch": 1.44, + "grad_norm": 0.5648486018180847, + "learning_rate": 0.00031739563022991144, + "loss": 3.2033, + "step": 29477 + }, + { + "epoch": 1.44, + "grad_norm": 0.6375812292098999, + "learning_rate": 0.00031738025992177814, + "loss": 3.0786, + "step": 29478 + }, + { + "epoch": 1.44, + "grad_norm": 0.5986570119857788, + "learning_rate": 0.00031736488956786863, + "loss": 3.2335, + "step": 29479 + }, + { + "epoch": 1.44, + "grad_norm": 0.5965058207511902, + "learning_rate": 0.00031734951916822347, + "loss": 3.0896, + "step": 29480 + }, + { + "epoch": 1.44, + "grad_norm": 0.5741937756538391, + "learning_rate": 0.0003173341487228829, + "loss": 3.1851, + "step": 29481 + }, + { + "epoch": 1.44, + "grad_norm": 0.5639917254447937, + "learning_rate": 0.00031731877823188754, + "loss": 3.0339, + "step": 29482 + }, + { + "epoch": 1.44, + "grad_norm": 0.5802062153816223, + "learning_rate": 0.00031730340769527783, + "loss": 3.1775, + "step": 29483 + }, + { + "epoch": 1.44, + "grad_norm": 0.5636395812034607, + "learning_rate": 0.0003172880371130943, + "loss": 3.0593, + "step": 29484 + }, + { + "epoch": 1.44, + "grad_norm": 0.57658851146698, + "learning_rate": 0.00031727266648537737, + "loss": 3.1828, + "step": 29485 + }, + { + "epoch": 1.45, + "grad_norm": 0.5845613479614258, + "learning_rate": 0.0003172572958121676, + "loss": 2.8095, + "step": 29486 + }, + { + "epoch": 1.45, + "grad_norm": 0.5840170979499817, + "learning_rate": 0.0003172419250935055, + "loss": 2.8715, + "step": 29487 + }, + { + "epoch": 1.45, + "grad_norm": 0.5798373222351074, + "learning_rate": 0.0003172265543294314, + "loss": 2.9696, + "step": 29488 + }, + { + "epoch": 1.45, + "grad_norm": 0.5842916965484619, + "learning_rate": 0.0003172111835199858, + "loss": 3.2531, + "step": 29489 + }, + { + "epoch": 1.45, + "grad_norm": 0.57811439037323, + "learning_rate": 0.00031719581266520937, + "loss": 3.1352, + "step": 29490 + }, + { + "epoch": 1.45, + "grad_norm": 0.6172042489051819, + "learning_rate": 0.0003171804417651425, + "loss": 3.3323, + "step": 29491 + }, + { + "epoch": 1.45, + "grad_norm": 0.6265311241149902, + "learning_rate": 0.0003171650708198256, + "loss": 2.9154, + "step": 29492 + }, + { + "epoch": 1.45, + "grad_norm": 0.5730522274971008, + "learning_rate": 0.0003171496998292992, + "loss": 3.0178, + "step": 29493 + }, + { + "epoch": 1.45, + "grad_norm": 0.5934553146362305, + "learning_rate": 0.0003171343287936038, + "loss": 3.076, + "step": 29494 + }, + { + "epoch": 1.45, + "grad_norm": 0.6215176582336426, + "learning_rate": 0.00031711895771277987, + "loss": 3.1317, + "step": 29495 + }, + { + "epoch": 1.45, + "grad_norm": 0.5459911227226257, + "learning_rate": 0.0003171035865868679, + "loss": 3.2385, + "step": 29496 + }, + { + "epoch": 1.45, + "grad_norm": 0.5924990177154541, + "learning_rate": 0.0003170882154159084, + "loss": 2.8411, + "step": 29497 + }, + { + "epoch": 1.45, + "grad_norm": 0.5878446102142334, + "learning_rate": 0.00031707284419994183, + "loss": 2.9754, + "step": 29498 + }, + { + "epoch": 1.45, + "grad_norm": 0.6080270409584045, + "learning_rate": 0.0003170574729390087, + "loss": 3.1571, + "step": 29499 + }, + { + "epoch": 1.45, + "grad_norm": 0.5498298406600952, + "learning_rate": 0.00031704210163314934, + "loss": 3.1642, + "step": 29500 + }, + { + "epoch": 1.45, + "grad_norm": 0.5822784900665283, + "learning_rate": 0.0003170267302824044, + "loss": 3.0001, + "step": 29501 + }, + { + "epoch": 1.45, + "grad_norm": 0.5680847764015198, + "learning_rate": 0.0003170113588868145, + "loss": 2.9522, + "step": 29502 + }, + { + "epoch": 1.45, + "grad_norm": 0.589328944683075, + "learning_rate": 0.0003169959874464198, + "loss": 3.1955, + "step": 29503 + }, + { + "epoch": 1.45, + "grad_norm": 0.6632125973701477, + "learning_rate": 0.00031698061596126096, + "loss": 3.0567, + "step": 29504 + }, + { + "epoch": 1.45, + "grad_norm": 0.5721989274024963, + "learning_rate": 0.00031696524443137846, + "loss": 2.9241, + "step": 29505 + }, + { + "epoch": 1.45, + "grad_norm": 0.6211953163146973, + "learning_rate": 0.0003169498728568128, + "loss": 3.0061, + "step": 29506 + }, + { + "epoch": 1.45, + "grad_norm": 0.5938788056373596, + "learning_rate": 0.00031693450123760437, + "loss": 3.0157, + "step": 29507 + }, + { + "epoch": 1.45, + "grad_norm": 0.5666404366493225, + "learning_rate": 0.0003169191295737938, + "loss": 3.0017, + "step": 29508 + }, + { + "epoch": 1.45, + "grad_norm": 0.5512176752090454, + "learning_rate": 0.00031690375786542146, + "loss": 3.1269, + "step": 29509 + }, + { + "epoch": 1.45, + "grad_norm": 0.5751276612281799, + "learning_rate": 0.00031688838611252783, + "loss": 2.9614, + "step": 29510 + }, + { + "epoch": 1.45, + "grad_norm": 0.6172812581062317, + "learning_rate": 0.00031687301431515346, + "loss": 2.8939, + "step": 29511 + }, + { + "epoch": 1.45, + "grad_norm": 0.5911383628845215, + "learning_rate": 0.0003168576424733389, + "loss": 3.2453, + "step": 29512 + }, + { + "epoch": 1.45, + "grad_norm": 0.5967884659767151, + "learning_rate": 0.0003168422705871245, + "loss": 3.016, + "step": 29513 + }, + { + "epoch": 1.45, + "grad_norm": 0.5799013376235962, + "learning_rate": 0.0003168268986565508, + "loss": 3.1217, + "step": 29514 + }, + { + "epoch": 1.45, + "grad_norm": 0.6159305572509766, + "learning_rate": 0.0003168115266816582, + "loss": 2.9896, + "step": 29515 + }, + { + "epoch": 1.45, + "grad_norm": 0.5831592679023743, + "learning_rate": 0.00031679615466248746, + "loss": 3.1185, + "step": 29516 + }, + { + "epoch": 1.45, + "grad_norm": 0.5765199065208435, + "learning_rate": 0.0003167807825990787, + "loss": 3.116, + "step": 29517 + }, + { + "epoch": 1.45, + "grad_norm": 0.6035337448120117, + "learning_rate": 0.00031676541049147273, + "loss": 3.001, + "step": 29518 + }, + { + "epoch": 1.45, + "grad_norm": 0.6357006430625916, + "learning_rate": 0.0003167500383397098, + "loss": 2.9424, + "step": 29519 + }, + { + "epoch": 1.45, + "grad_norm": 0.595038115978241, + "learning_rate": 0.0003167346661438305, + "loss": 3.0421, + "step": 29520 + }, + { + "epoch": 1.45, + "grad_norm": 0.6188342571258545, + "learning_rate": 0.0003167192939038754, + "loss": 3.2593, + "step": 29521 + }, + { + "epoch": 1.45, + "grad_norm": 0.6148853302001953, + "learning_rate": 0.00031670392161988484, + "loss": 2.9146, + "step": 29522 + }, + { + "epoch": 1.45, + "grad_norm": 0.6067038774490356, + "learning_rate": 0.00031668854929189927, + "loss": 2.9484, + "step": 29523 + }, + { + "epoch": 1.45, + "grad_norm": 0.5747334361076355, + "learning_rate": 0.00031667317691995943, + "loss": 3.1063, + "step": 29524 + }, + { + "epoch": 1.45, + "grad_norm": 0.6633566617965698, + "learning_rate": 0.0003166578045041055, + "loss": 3.0578, + "step": 29525 + }, + { + "epoch": 1.45, + "grad_norm": 0.6091784238815308, + "learning_rate": 0.0003166424320443782, + "loss": 2.8992, + "step": 29526 + }, + { + "epoch": 1.45, + "grad_norm": 0.5521370768547058, + "learning_rate": 0.00031662705954081807, + "loss": 3.3263, + "step": 29527 + }, + { + "epoch": 1.45, + "grad_norm": 0.5992710590362549, + "learning_rate": 0.0003166116869934653, + "loss": 2.9598, + "step": 29528 + }, + { + "epoch": 1.45, + "grad_norm": 0.5992154479026794, + "learning_rate": 0.0003165963144023606, + "loss": 3.0288, + "step": 29529 + }, + { + "epoch": 1.45, + "grad_norm": 0.563355028629303, + "learning_rate": 0.0003165809417675443, + "loss": 3.1072, + "step": 29530 + }, + { + "epoch": 1.45, + "grad_norm": 0.575053334236145, + "learning_rate": 0.0003165655690890571, + "loss": 3.067, + "step": 29531 + }, + { + "epoch": 1.45, + "grad_norm": 0.5506818890571594, + "learning_rate": 0.00031655019636693933, + "loss": 3.2032, + "step": 29532 + }, + { + "epoch": 1.45, + "grad_norm": 0.5816770195960999, + "learning_rate": 0.0003165348236012316, + "loss": 3.2077, + "step": 29533 + }, + { + "epoch": 1.45, + "grad_norm": 0.5859194993972778, + "learning_rate": 0.00031651945079197425, + "loss": 3.084, + "step": 29534 + }, + { + "epoch": 1.45, + "grad_norm": 0.6002169847488403, + "learning_rate": 0.0003165040779392079, + "loss": 3.0688, + "step": 29535 + }, + { + "epoch": 1.45, + "grad_norm": 0.5905258655548096, + "learning_rate": 0.00031648870504297287, + "loss": 2.9794, + "step": 29536 + }, + { + "epoch": 1.45, + "grad_norm": 0.5949984788894653, + "learning_rate": 0.0003164733321033099, + "loss": 2.9201, + "step": 29537 + }, + { + "epoch": 1.45, + "grad_norm": 0.6448434591293335, + "learning_rate": 0.00031645795912025937, + "loss": 2.964, + "step": 29538 + }, + { + "epoch": 1.45, + "grad_norm": 0.5883697867393494, + "learning_rate": 0.0003164425860938616, + "loss": 3.0997, + "step": 29539 + }, + { + "epoch": 1.45, + "grad_norm": 0.5910248160362244, + "learning_rate": 0.0003164272130241573, + "loss": 3.3129, + "step": 29540 + }, + { + "epoch": 1.45, + "grad_norm": 0.5907135009765625, + "learning_rate": 0.0003164118399111868, + "loss": 2.9919, + "step": 29541 + }, + { + "epoch": 1.45, + "grad_norm": 0.5622631907463074, + "learning_rate": 0.0003163964667549908, + "loss": 2.835, + "step": 29542 + }, + { + "epoch": 1.45, + "grad_norm": 0.609990656375885, + "learning_rate": 0.00031638109355560957, + "loss": 3.2596, + "step": 29543 + }, + { + "epoch": 1.45, + "grad_norm": 0.5881011486053467, + "learning_rate": 0.00031636572031308374, + "loss": 3.0279, + "step": 29544 + }, + { + "epoch": 1.45, + "grad_norm": 0.6357592940330505, + "learning_rate": 0.0003163503470274537, + "loss": 3.2961, + "step": 29545 + }, + { + "epoch": 1.45, + "grad_norm": 0.5936046242713928, + "learning_rate": 0.00031633497369876006, + "loss": 2.9672, + "step": 29546 + }, + { + "epoch": 1.45, + "grad_norm": 0.6165637969970703, + "learning_rate": 0.00031631960032704325, + "loss": 2.9565, + "step": 29547 + }, + { + "epoch": 1.45, + "grad_norm": 0.6325369477272034, + "learning_rate": 0.0003163042269123436, + "loss": 2.9206, + "step": 29548 + }, + { + "epoch": 1.45, + "grad_norm": 0.6465588212013245, + "learning_rate": 0.000316288853454702, + "loss": 2.9257, + "step": 29549 + }, + { + "epoch": 1.45, + "grad_norm": 0.585541307926178, + "learning_rate": 0.00031627347995415856, + "loss": 2.8962, + "step": 29550 + }, + { + "epoch": 1.45, + "grad_norm": 0.5976788997650146, + "learning_rate": 0.00031625810641075383, + "loss": 2.9734, + "step": 29551 + }, + { + "epoch": 1.45, + "grad_norm": 0.5807003378868103, + "learning_rate": 0.00031624273282452844, + "loss": 3.1595, + "step": 29552 + }, + { + "epoch": 1.45, + "grad_norm": 0.586675763130188, + "learning_rate": 0.00031622735919552297, + "loss": 2.9426, + "step": 29553 + }, + { + "epoch": 1.45, + "grad_norm": 0.583739161491394, + "learning_rate": 0.0003162119855237776, + "loss": 2.9561, + "step": 29554 + }, + { + "epoch": 1.45, + "grad_norm": 0.5614110231399536, + "learning_rate": 0.00031619661180933294, + "loss": 2.9349, + "step": 29555 + }, + { + "epoch": 1.45, + "grad_norm": 0.5753343105316162, + "learning_rate": 0.00031618123805222964, + "loss": 3.2066, + "step": 29556 + }, + { + "epoch": 1.45, + "grad_norm": 0.6021926403045654, + "learning_rate": 0.00031616586425250797, + "loss": 3.0151, + "step": 29557 + }, + { + "epoch": 1.45, + "grad_norm": 0.6058591604232788, + "learning_rate": 0.00031615049041020855, + "loss": 3.2783, + "step": 29558 + }, + { + "epoch": 1.45, + "grad_norm": 0.608304500579834, + "learning_rate": 0.0003161351165253719, + "loss": 2.7891, + "step": 29559 + }, + { + "epoch": 1.45, + "grad_norm": 0.5610315203666687, + "learning_rate": 0.0003161197425980385, + "loss": 3.0305, + "step": 29560 + }, + { + "epoch": 1.45, + "grad_norm": 0.5982028245925903, + "learning_rate": 0.0003161043686282487, + "loss": 2.8197, + "step": 29561 + }, + { + "epoch": 1.45, + "grad_norm": 0.5895063877105713, + "learning_rate": 0.00031608899461604313, + "loss": 3.0352, + "step": 29562 + }, + { + "epoch": 1.45, + "grad_norm": 0.6164365410804749, + "learning_rate": 0.0003160736205614622, + "loss": 3.0017, + "step": 29563 + }, + { + "epoch": 1.45, + "grad_norm": 0.5825421214103699, + "learning_rate": 0.00031605824646454655, + "loss": 2.9548, + "step": 29564 + }, + { + "epoch": 1.45, + "grad_norm": 0.58476722240448, + "learning_rate": 0.00031604287232533647, + "loss": 3.157, + "step": 29565 + }, + { + "epoch": 1.45, + "grad_norm": 0.5678737163543701, + "learning_rate": 0.00031602749814387253, + "loss": 2.9115, + "step": 29566 + }, + { + "epoch": 1.45, + "grad_norm": 0.5935869812965393, + "learning_rate": 0.00031601212392019535, + "loss": 2.9762, + "step": 29567 + }, + { + "epoch": 1.45, + "grad_norm": 0.5967699885368347, + "learning_rate": 0.00031599674965434525, + "loss": 3.1632, + "step": 29568 + }, + { + "epoch": 1.45, + "grad_norm": 0.6144185066223145, + "learning_rate": 0.00031598137534636283, + "loss": 2.9605, + "step": 29569 + }, + { + "epoch": 1.45, + "grad_norm": 0.5704265236854553, + "learning_rate": 0.0003159660009962885, + "loss": 3.0765, + "step": 29570 + }, + { + "epoch": 1.45, + "grad_norm": 0.5627557635307312, + "learning_rate": 0.0003159506266041628, + "loss": 3.1848, + "step": 29571 + }, + { + "epoch": 1.45, + "grad_norm": 0.5492954254150391, + "learning_rate": 0.0003159352521700262, + "loss": 2.9258, + "step": 29572 + }, + { + "epoch": 1.45, + "grad_norm": 0.5822631120681763, + "learning_rate": 0.0003159198776939193, + "loss": 2.9043, + "step": 29573 + }, + { + "epoch": 1.45, + "grad_norm": 0.6288532614707947, + "learning_rate": 0.0003159045031758824, + "loss": 2.9592, + "step": 29574 + }, + { + "epoch": 1.45, + "grad_norm": 0.5812193751335144, + "learning_rate": 0.00031588912861595623, + "loss": 3.0854, + "step": 29575 + }, + { + "epoch": 1.45, + "grad_norm": 0.6251932382583618, + "learning_rate": 0.00031587375401418105, + "loss": 2.8665, + "step": 29576 + }, + { + "epoch": 1.45, + "grad_norm": 0.5608442425727844, + "learning_rate": 0.00031585837937059746, + "loss": 3.2142, + "step": 29577 + }, + { + "epoch": 1.45, + "grad_norm": 0.6160196661949158, + "learning_rate": 0.000315843004685246, + "loss": 3.0009, + "step": 29578 + }, + { + "epoch": 1.45, + "grad_norm": 0.5881204009056091, + "learning_rate": 0.0003158276299581671, + "loss": 2.8357, + "step": 29579 + }, + { + "epoch": 1.45, + "grad_norm": 0.5906282067298889, + "learning_rate": 0.00031581225518940124, + "loss": 3.1925, + "step": 29580 + }, + { + "epoch": 1.45, + "grad_norm": 0.5734764337539673, + "learning_rate": 0.00031579688037898897, + "loss": 3.1714, + "step": 29581 + }, + { + "epoch": 1.45, + "grad_norm": 0.5693907141685486, + "learning_rate": 0.00031578150552697077, + "loss": 3.1624, + "step": 29582 + }, + { + "epoch": 1.45, + "grad_norm": 0.6162329316139221, + "learning_rate": 0.00031576613063338707, + "loss": 2.9136, + "step": 29583 + }, + { + "epoch": 1.45, + "grad_norm": 0.6050886511802673, + "learning_rate": 0.0003157507556982785, + "loss": 2.9093, + "step": 29584 + }, + { + "epoch": 1.45, + "grad_norm": 0.5759333968162537, + "learning_rate": 0.0003157353807216854, + "loss": 3.1633, + "step": 29585 + }, + { + "epoch": 1.45, + "grad_norm": 0.6068954467773438, + "learning_rate": 0.0003157200057036484, + "loss": 3.0777, + "step": 29586 + }, + { + "epoch": 1.45, + "grad_norm": 0.5611181259155273, + "learning_rate": 0.0003157046306442078, + "loss": 3.1995, + "step": 29587 + }, + { + "epoch": 1.45, + "grad_norm": 0.5561584830284119, + "learning_rate": 0.0003156892555434043, + "loss": 3.1371, + "step": 29588 + }, + { + "epoch": 1.45, + "grad_norm": 0.5815036296844482, + "learning_rate": 0.0003156738804012784, + "loss": 3.0672, + "step": 29589 + }, + { + "epoch": 1.45, + "grad_norm": 0.5883771777153015, + "learning_rate": 0.00031565850521787043, + "loss": 3.0313, + "step": 29590 + }, + { + "epoch": 1.45, + "grad_norm": 0.5724140405654907, + "learning_rate": 0.000315643129993221, + "loss": 3.099, + "step": 29591 + }, + { + "epoch": 1.45, + "grad_norm": 0.5555368065834045, + "learning_rate": 0.0003156277547273704, + "loss": 3.1058, + "step": 29592 + }, + { + "epoch": 1.45, + "grad_norm": 0.5770111083984375, + "learning_rate": 0.0003156123794203596, + "loss": 3.0728, + "step": 29593 + }, + { + "epoch": 1.45, + "grad_norm": 0.6504952311515808, + "learning_rate": 0.00031559700407222865, + "loss": 3.0687, + "step": 29594 + }, + { + "epoch": 1.45, + "grad_norm": 0.6245805025100708, + "learning_rate": 0.0003155816286830182, + "loss": 3.0529, + "step": 29595 + }, + { + "epoch": 1.45, + "grad_norm": 0.6823046207427979, + "learning_rate": 0.0003155662532527687, + "loss": 3.0419, + "step": 29596 + }, + { + "epoch": 1.45, + "grad_norm": 0.5609052181243896, + "learning_rate": 0.00031555087778152075, + "loss": 2.8952, + "step": 29597 + }, + { + "epoch": 1.45, + "grad_norm": 0.6359115839004517, + "learning_rate": 0.00031553550226931477, + "loss": 2.9609, + "step": 29598 + }, + { + "epoch": 1.45, + "grad_norm": 0.5841073989868164, + "learning_rate": 0.0003155201267161912, + "loss": 2.9918, + "step": 29599 + }, + { + "epoch": 1.45, + "grad_norm": 0.5604556202888489, + "learning_rate": 0.00031550475112219074, + "loss": 2.8732, + "step": 29600 + }, + { + "epoch": 1.45, + "grad_norm": 0.6146168112754822, + "learning_rate": 0.0003154893754873537, + "loss": 2.8485, + "step": 29601 + }, + { + "epoch": 1.45, + "grad_norm": 0.5645619034767151, + "learning_rate": 0.0003154739998117205, + "loss": 3.0853, + "step": 29602 + }, + { + "epoch": 1.45, + "grad_norm": 0.6000223159790039, + "learning_rate": 0.0003154586240953318, + "loss": 3.0441, + "step": 29603 + }, + { + "epoch": 1.45, + "grad_norm": 0.5900564193725586, + "learning_rate": 0.00031544324833822823, + "loss": 3.1783, + "step": 29604 + }, + { + "epoch": 1.45, + "grad_norm": 0.6112537384033203, + "learning_rate": 0.00031542787254044995, + "loss": 3.0663, + "step": 29605 + }, + { + "epoch": 1.45, + "grad_norm": 0.6015599966049194, + "learning_rate": 0.00031541249670203773, + "loss": 2.9529, + "step": 29606 + }, + { + "epoch": 1.45, + "grad_norm": 0.562324583530426, + "learning_rate": 0.00031539712082303184, + "loss": 3.2351, + "step": 29607 + }, + { + "epoch": 1.45, + "grad_norm": 0.6208454370498657, + "learning_rate": 0.000315381744903473, + "loss": 3.4744, + "step": 29608 + }, + { + "epoch": 1.45, + "grad_norm": 0.5760998725891113, + "learning_rate": 0.0003153663689434016, + "loss": 3.0427, + "step": 29609 + }, + { + "epoch": 1.45, + "grad_norm": 0.5720778107643127, + "learning_rate": 0.00031535099294285803, + "loss": 2.9921, + "step": 29610 + }, + { + "epoch": 1.45, + "grad_norm": 0.5598829388618469, + "learning_rate": 0.000315335616901883, + "loss": 3.1476, + "step": 29611 + }, + { + "epoch": 1.45, + "grad_norm": 0.5947191119194031, + "learning_rate": 0.00031532024082051686, + "loss": 3.2268, + "step": 29612 + }, + { + "epoch": 1.45, + "grad_norm": 0.5800713300704956, + "learning_rate": 0.00031530486469880017, + "loss": 2.9083, + "step": 29613 + }, + { + "epoch": 1.45, + "grad_norm": 0.5774427652359009, + "learning_rate": 0.0003152894885367734, + "loss": 2.8321, + "step": 29614 + }, + { + "epoch": 1.45, + "grad_norm": 0.5743990540504456, + "learning_rate": 0.0003152741123344771, + "loss": 3.0104, + "step": 29615 + }, + { + "epoch": 1.45, + "grad_norm": 0.557272732257843, + "learning_rate": 0.0003152587360919517, + "loss": 3.0485, + "step": 29616 + }, + { + "epoch": 1.45, + "grad_norm": 0.5970534682273865, + "learning_rate": 0.0003152433598092378, + "loss": 3.011, + "step": 29617 + }, + { + "epoch": 1.45, + "grad_norm": 0.5622311234474182, + "learning_rate": 0.0003152279834863757, + "loss": 3.1436, + "step": 29618 + }, + { + "epoch": 1.45, + "grad_norm": 0.5816734433174133, + "learning_rate": 0.00031521260712340606, + "loss": 3.0806, + "step": 29619 + }, + { + "epoch": 1.45, + "grad_norm": 0.5890375375747681, + "learning_rate": 0.0003151972307203694, + "loss": 3.2086, + "step": 29620 + }, + { + "epoch": 1.45, + "grad_norm": 0.6203916072845459, + "learning_rate": 0.00031518185427730606, + "loss": 3.126, + "step": 29621 + }, + { + "epoch": 1.45, + "grad_norm": 0.5634053945541382, + "learning_rate": 0.00031516647779425677, + "loss": 3.3819, + "step": 29622 + }, + { + "epoch": 1.45, + "grad_norm": 0.5932179689407349, + "learning_rate": 0.0003151511012712617, + "loss": 3.0579, + "step": 29623 + }, + { + "epoch": 1.45, + "grad_norm": 0.5462847948074341, + "learning_rate": 0.0003151357247083617, + "loss": 3.0788, + "step": 29624 + }, + { + "epoch": 1.45, + "grad_norm": 0.6403987407684326, + "learning_rate": 0.00031512034810559703, + "loss": 3.0505, + "step": 29625 + }, + { + "epoch": 1.45, + "grad_norm": 0.6161810755729675, + "learning_rate": 0.0003151049714630084, + "loss": 3.0068, + "step": 29626 + }, + { + "epoch": 1.45, + "grad_norm": 0.5708884596824646, + "learning_rate": 0.0003150895947806361, + "loss": 2.9304, + "step": 29627 + }, + { + "epoch": 1.45, + "grad_norm": 0.5830209851264954, + "learning_rate": 0.0003150742180585207, + "loss": 3.0144, + "step": 29628 + }, + { + "epoch": 1.45, + "grad_norm": 0.6132103204727173, + "learning_rate": 0.0003150588412967028, + "loss": 3.1479, + "step": 29629 + }, + { + "epoch": 1.45, + "grad_norm": 0.5505216717720032, + "learning_rate": 0.0003150434644952227, + "loss": 3.3665, + "step": 29630 + }, + { + "epoch": 1.45, + "grad_norm": 0.5873746275901794, + "learning_rate": 0.00031502808765412107, + "loss": 3.2011, + "step": 29631 + }, + { + "epoch": 1.45, + "grad_norm": 0.6037224531173706, + "learning_rate": 0.0003150127107734383, + "loss": 3.2015, + "step": 29632 + }, + { + "epoch": 1.45, + "grad_norm": 0.6088622212409973, + "learning_rate": 0.00031499733385321495, + "loss": 3.0153, + "step": 29633 + }, + { + "epoch": 1.45, + "grad_norm": 0.6723757386207581, + "learning_rate": 0.0003149819568934915, + "loss": 3.1598, + "step": 29634 + }, + { + "epoch": 1.45, + "grad_norm": 0.5801018476486206, + "learning_rate": 0.0003149665798943085, + "loss": 2.9169, + "step": 29635 + }, + { + "epoch": 1.45, + "grad_norm": 0.5815666913986206, + "learning_rate": 0.00031495120285570635, + "loss": 3.0191, + "step": 29636 + }, + { + "epoch": 1.45, + "grad_norm": 0.5963479280471802, + "learning_rate": 0.00031493582577772573, + "loss": 3.0861, + "step": 29637 + }, + { + "epoch": 1.45, + "grad_norm": 0.5587741732597351, + "learning_rate": 0.0003149204486604069, + "loss": 3.1379, + "step": 29638 + }, + { + "epoch": 1.45, + "grad_norm": 0.574137806892395, + "learning_rate": 0.0003149050715037905, + "loss": 2.9059, + "step": 29639 + }, + { + "epoch": 1.45, + "grad_norm": 0.6238990426063538, + "learning_rate": 0.0003148896943079171, + "loss": 3.1322, + "step": 29640 + }, + { + "epoch": 1.45, + "grad_norm": 0.6018304824829102, + "learning_rate": 0.000314874317072827, + "loss": 3.0981, + "step": 29641 + }, + { + "epoch": 1.45, + "grad_norm": 0.5812610983848572, + "learning_rate": 0.0003148589397985609, + "loss": 2.9599, + "step": 29642 + }, + { + "epoch": 1.45, + "grad_norm": 0.6216371655464172, + "learning_rate": 0.00031484356248515907, + "loss": 3.1933, + "step": 29643 + }, + { + "epoch": 1.45, + "grad_norm": 0.621842086315155, + "learning_rate": 0.0003148281851326623, + "loss": 2.7123, + "step": 29644 + }, + { + "epoch": 1.45, + "grad_norm": 0.5874238014221191, + "learning_rate": 0.0003148128077411109, + "loss": 3.2495, + "step": 29645 + }, + { + "epoch": 1.45, + "grad_norm": 0.63470059633255, + "learning_rate": 0.0003147974303105454, + "loss": 2.9006, + "step": 29646 + }, + { + "epoch": 1.45, + "grad_norm": 0.5815867781639099, + "learning_rate": 0.0003147820528410063, + "loss": 3.078, + "step": 29647 + }, + { + "epoch": 1.45, + "grad_norm": 0.5960510969161987, + "learning_rate": 0.00031476667533253416, + "loss": 3.0143, + "step": 29648 + }, + { + "epoch": 1.45, + "grad_norm": 0.5853951573371887, + "learning_rate": 0.0003147512977851694, + "loss": 3.179, + "step": 29649 + }, + { + "epoch": 1.45, + "grad_norm": 0.5538525581359863, + "learning_rate": 0.00031473592019895253, + "loss": 2.9866, + "step": 29650 + }, + { + "epoch": 1.45, + "grad_norm": 0.6282745599746704, + "learning_rate": 0.0003147205425739242, + "loss": 3.2866, + "step": 29651 + }, + { + "epoch": 1.45, + "grad_norm": 0.6330800652503967, + "learning_rate": 0.00031470516491012476, + "loss": 2.8979, + "step": 29652 + }, + { + "epoch": 1.45, + "grad_norm": 0.6331319808959961, + "learning_rate": 0.00031468978720759473, + "loss": 2.9076, + "step": 29653 + }, + { + "epoch": 1.45, + "grad_norm": 1.587941288948059, + "learning_rate": 0.0003146744094663745, + "loss": 3.2152, + "step": 29654 + }, + { + "epoch": 1.45, + "grad_norm": 0.5891446471214294, + "learning_rate": 0.0003146590316865049, + "loss": 3.0108, + "step": 29655 + }, + { + "epoch": 1.45, + "grad_norm": 0.5783140063285828, + "learning_rate": 0.0003146436538680261, + "loss": 2.781, + "step": 29656 + }, + { + "epoch": 1.45, + "grad_norm": 0.5963090658187866, + "learning_rate": 0.00031462827601097873, + "loss": 3.212, + "step": 29657 + }, + { + "epoch": 1.45, + "grad_norm": 0.6018965840339661, + "learning_rate": 0.0003146128981154034, + "loss": 3.201, + "step": 29658 + }, + { + "epoch": 1.45, + "grad_norm": 0.5825430750846863, + "learning_rate": 0.0003145975201813404, + "loss": 3.1323, + "step": 29659 + }, + { + "epoch": 1.45, + "grad_norm": 0.58023601770401, + "learning_rate": 0.0003145821422088304, + "loss": 2.8852, + "step": 29660 + }, + { + "epoch": 1.45, + "grad_norm": 0.6018133759498596, + "learning_rate": 0.00031456676419791384, + "loss": 3.2422, + "step": 29661 + }, + { + "epoch": 1.45, + "grad_norm": 0.6052143573760986, + "learning_rate": 0.0003145513861486313, + "loss": 3.0118, + "step": 29662 + }, + { + "epoch": 1.45, + "grad_norm": 0.5762540102005005, + "learning_rate": 0.0003145360080610231, + "loss": 3.1484, + "step": 29663 + }, + { + "epoch": 1.45, + "grad_norm": 0.5601885914802551, + "learning_rate": 0.0003145206299351298, + "loss": 3.2171, + "step": 29664 + }, + { + "epoch": 1.45, + "grad_norm": 0.6563633680343628, + "learning_rate": 0.00031450525177099205, + "loss": 2.9215, + "step": 29665 + }, + { + "epoch": 1.45, + "grad_norm": 0.5905854105949402, + "learning_rate": 0.0003144898735686503, + "loss": 3.0631, + "step": 29666 + }, + { + "epoch": 1.45, + "grad_norm": 0.5945887565612793, + "learning_rate": 0.00031447449532814495, + "loss": 2.9145, + "step": 29667 + }, + { + "epoch": 1.45, + "grad_norm": 0.6269522309303284, + "learning_rate": 0.00031445911704951657, + "loss": 3.1563, + "step": 29668 + }, + { + "epoch": 1.45, + "grad_norm": 0.577925980091095, + "learning_rate": 0.00031444373873280564, + "loss": 2.9717, + "step": 29669 + }, + { + "epoch": 1.45, + "grad_norm": 0.572482705116272, + "learning_rate": 0.00031442836037805276, + "loss": 3.1004, + "step": 29670 + }, + { + "epoch": 1.45, + "grad_norm": 0.6007272005081177, + "learning_rate": 0.0003144129819852983, + "loss": 3.157, + "step": 29671 + }, + { + "epoch": 1.45, + "grad_norm": 0.5722142457962036, + "learning_rate": 0.00031439760355458284, + "loss": 2.966, + "step": 29672 + }, + { + "epoch": 1.45, + "grad_norm": 0.5537738800048828, + "learning_rate": 0.00031438222508594683, + "loss": 3.095, + "step": 29673 + }, + { + "epoch": 1.45, + "grad_norm": 0.5694095492362976, + "learning_rate": 0.0003143668465794308, + "loss": 3.0124, + "step": 29674 + }, + { + "epoch": 1.45, + "grad_norm": 0.6016697287559509, + "learning_rate": 0.0003143514680350752, + "loss": 3.0182, + "step": 29675 + }, + { + "epoch": 1.45, + "grad_norm": 0.5762494206428528, + "learning_rate": 0.0003143360894529207, + "loss": 2.9215, + "step": 29676 + }, + { + "epoch": 1.45, + "grad_norm": 0.5735913515090942, + "learning_rate": 0.00031432071083300773, + "loss": 3.148, + "step": 29677 + }, + { + "epoch": 1.45, + "grad_norm": 0.5936062932014465, + "learning_rate": 0.0003143053321753767, + "loss": 2.9584, + "step": 29678 + }, + { + "epoch": 1.45, + "grad_norm": 0.5691812634468079, + "learning_rate": 0.00031428995348006816, + "loss": 3.23, + "step": 29679 + }, + { + "epoch": 1.45, + "grad_norm": 0.5691925883293152, + "learning_rate": 0.00031427457474712274, + "loss": 3.1767, + "step": 29680 + }, + { + "epoch": 1.45, + "grad_norm": 0.6052525043487549, + "learning_rate": 0.00031425919597658067, + "loss": 2.9384, + "step": 29681 + }, + { + "epoch": 1.45, + "grad_norm": 0.6288476586341858, + "learning_rate": 0.00031424381716848275, + "loss": 2.9858, + "step": 29682 + }, + { + "epoch": 1.45, + "grad_norm": 0.5947161316871643, + "learning_rate": 0.0003142284383228693, + "loss": 3.2128, + "step": 29683 + }, + { + "epoch": 1.45, + "grad_norm": 0.7295497059822083, + "learning_rate": 0.00031421305943978093, + "loss": 3.237, + "step": 29684 + }, + { + "epoch": 1.45, + "grad_norm": 0.5625701546669006, + "learning_rate": 0.000314197680519258, + "loss": 2.8254, + "step": 29685 + }, + { + "epoch": 1.45, + "grad_norm": 0.6221656203269958, + "learning_rate": 0.0003141823015613412, + "loss": 2.8866, + "step": 29686 + }, + { + "epoch": 1.45, + "grad_norm": 0.6753302812576294, + "learning_rate": 0.0003141669225660709, + "loss": 3.0037, + "step": 29687 + }, + { + "epoch": 1.45, + "grad_norm": 0.579978883266449, + "learning_rate": 0.0003141515435334878, + "loss": 2.7757, + "step": 29688 + }, + { + "epoch": 1.45, + "grad_norm": 0.5823621153831482, + "learning_rate": 0.000314136164463632, + "loss": 3.2779, + "step": 29689 + }, + { + "epoch": 1.46, + "grad_norm": 0.6142686605453491, + "learning_rate": 0.0003141207853565444, + "loss": 2.8307, + "step": 29690 + }, + { + "epoch": 1.46, + "grad_norm": 0.5641099810600281, + "learning_rate": 0.0003141054062122654, + "loss": 2.9068, + "step": 29691 + }, + { + "epoch": 1.46, + "grad_norm": 0.6020845770835876, + "learning_rate": 0.00031409002703083547, + "loss": 3.0636, + "step": 29692 + }, + { + "epoch": 1.46, + "grad_norm": 0.5882956981658936, + "learning_rate": 0.00031407464781229503, + "loss": 3.0913, + "step": 29693 + }, + { + "epoch": 1.46, + "grad_norm": 0.593789279460907, + "learning_rate": 0.00031405926855668475, + "loss": 3.1935, + "step": 29694 + }, + { + "epoch": 1.46, + "grad_norm": 0.5638978481292725, + "learning_rate": 0.00031404388926404507, + "loss": 3.1643, + "step": 29695 + }, + { + "epoch": 1.46, + "grad_norm": 0.595602810382843, + "learning_rate": 0.00031402850993441647, + "loss": 3.1038, + "step": 29696 + }, + { + "epoch": 1.46, + "grad_norm": 0.6187957525253296, + "learning_rate": 0.00031401313056783945, + "loss": 2.9744, + "step": 29697 + }, + { + "epoch": 1.46, + "grad_norm": 0.5919390916824341, + "learning_rate": 0.0003139977511643546, + "loss": 3.2703, + "step": 29698 + }, + { + "epoch": 1.46, + "grad_norm": 0.566708505153656, + "learning_rate": 0.00031398237172400236, + "loss": 3.0879, + "step": 29699 + }, + { + "epoch": 1.46, + "grad_norm": 0.5841004848480225, + "learning_rate": 0.0003139669922468231, + "loss": 3.036, + "step": 29700 + }, + { + "epoch": 1.46, + "grad_norm": 0.5895936489105225, + "learning_rate": 0.00031395161273285765, + "loss": 2.9205, + "step": 29701 + }, + { + "epoch": 1.46, + "grad_norm": 0.5551169514656067, + "learning_rate": 0.00031393623318214635, + "loss": 2.9754, + "step": 29702 + }, + { + "epoch": 1.46, + "grad_norm": 0.5898941159248352, + "learning_rate": 0.00031392085359472954, + "loss": 2.9792, + "step": 29703 + }, + { + "epoch": 1.46, + "grad_norm": 0.6206042170524597, + "learning_rate": 0.000313905473970648, + "loss": 3.1062, + "step": 29704 + }, + { + "epoch": 1.46, + "grad_norm": 0.6914759278297424, + "learning_rate": 0.00031389009430994195, + "loss": 3.0206, + "step": 29705 + }, + { + "epoch": 1.46, + "grad_norm": 0.619304358959198, + "learning_rate": 0.00031387471461265227, + "loss": 2.9323, + "step": 29706 + }, + { + "epoch": 1.46, + "grad_norm": 0.573853075504303, + "learning_rate": 0.00031385933487881916, + "loss": 3.3458, + "step": 29707 + }, + { + "epoch": 1.46, + "grad_norm": 0.5841895341873169, + "learning_rate": 0.0003138439551084833, + "loss": 2.78, + "step": 29708 + }, + { + "epoch": 1.46, + "grad_norm": 0.5773389339447021, + "learning_rate": 0.0003138285753016851, + "loss": 3.123, + "step": 29709 + }, + { + "epoch": 1.46, + "grad_norm": 0.5712064504623413, + "learning_rate": 0.00031381319545846503, + "loss": 3.2139, + "step": 29710 + }, + { + "epoch": 1.46, + "grad_norm": 0.6082553267478943, + "learning_rate": 0.00031379781557886366, + "loss": 3.19, + "step": 29711 + }, + { + "epoch": 1.46, + "grad_norm": 0.6170501112937927, + "learning_rate": 0.00031378243566292156, + "loss": 3.0305, + "step": 29712 + }, + { + "epoch": 1.46, + "grad_norm": 0.5564154982566833, + "learning_rate": 0.00031376705571067924, + "loss": 3.0066, + "step": 29713 + }, + { + "epoch": 1.46, + "grad_norm": 0.6043330430984497, + "learning_rate": 0.000313751675722177, + "loss": 3.0141, + "step": 29714 + }, + { + "epoch": 1.46, + "grad_norm": 0.633105456829071, + "learning_rate": 0.00031373629569745546, + "loss": 2.8554, + "step": 29715 + }, + { + "epoch": 1.46, + "grad_norm": 0.5764904022216797, + "learning_rate": 0.0003137209156365553, + "loss": 2.9793, + "step": 29716 + }, + { + "epoch": 1.46, + "grad_norm": 0.6007531881332397, + "learning_rate": 0.00031370553553951687, + "loss": 3.0194, + "step": 29717 + }, + { + "epoch": 1.46, + "grad_norm": 0.597282350063324, + "learning_rate": 0.0003136901554063807, + "loss": 3.086, + "step": 29718 + }, + { + "epoch": 1.46, + "grad_norm": 0.6131802797317505, + "learning_rate": 0.0003136747752371873, + "loss": 3.1779, + "step": 29719 + }, + { + "epoch": 1.46, + "grad_norm": 0.5542619824409485, + "learning_rate": 0.0003136593950319771, + "loss": 3.0502, + "step": 29720 + }, + { + "epoch": 1.46, + "grad_norm": 0.662255048751831, + "learning_rate": 0.00031364401479079074, + "loss": 3.0517, + "step": 29721 + }, + { + "epoch": 1.46, + "grad_norm": 0.5674471259117126, + "learning_rate": 0.00031362863451366864, + "loss": 2.9217, + "step": 29722 + }, + { + "epoch": 1.46, + "grad_norm": 0.5355050563812256, + "learning_rate": 0.0003136132542006514, + "loss": 3.1893, + "step": 29723 + }, + { + "epoch": 1.46, + "grad_norm": 0.5745908617973328, + "learning_rate": 0.00031359787385177944, + "loss": 3.2364, + "step": 29724 + }, + { + "epoch": 1.46, + "grad_norm": 0.5746610760688782, + "learning_rate": 0.0003135824934670932, + "loss": 3.1486, + "step": 29725 + }, + { + "epoch": 1.46, + "grad_norm": 0.5605559945106506, + "learning_rate": 0.00031356711304663336, + "loss": 3.0998, + "step": 29726 + }, + { + "epoch": 1.46, + "grad_norm": 0.5893903970718384, + "learning_rate": 0.0003135517325904403, + "loss": 3.0817, + "step": 29727 + }, + { + "epoch": 1.46, + "grad_norm": 0.583816647529602, + "learning_rate": 0.00031353635209855474, + "loss": 3.0743, + "step": 29728 + }, + { + "epoch": 1.46, + "grad_norm": 0.616067111492157, + "learning_rate": 0.00031352097157101696, + "loss": 3.1714, + "step": 29729 + }, + { + "epoch": 1.46, + "grad_norm": 0.6139724850654602, + "learning_rate": 0.00031350559100786743, + "loss": 3.0468, + "step": 29730 + }, + { + "epoch": 1.46, + "grad_norm": 0.5720040202140808, + "learning_rate": 0.0003134902104091469, + "loss": 3.2753, + "step": 29731 + }, + { + "epoch": 1.46, + "grad_norm": 0.6097339391708374, + "learning_rate": 0.00031347482977489577, + "loss": 3.0575, + "step": 29732 + }, + { + "epoch": 1.46, + "grad_norm": 0.6279599666595459, + "learning_rate": 0.00031345944910515447, + "loss": 3.0261, + "step": 29733 + }, + { + "epoch": 1.46, + "grad_norm": 0.5827269554138184, + "learning_rate": 0.00031344406839996355, + "loss": 3.2295, + "step": 29734 + }, + { + "epoch": 1.46, + "grad_norm": 0.5751627683639526, + "learning_rate": 0.0003134286876593637, + "loss": 2.9174, + "step": 29735 + }, + { + "epoch": 1.46, + "grad_norm": 0.5884239077568054, + "learning_rate": 0.000313413306883395, + "loss": 3.0503, + "step": 29736 + }, + { + "epoch": 1.46, + "grad_norm": 0.5882185697555542, + "learning_rate": 0.00031339792607209844, + "loss": 2.9154, + "step": 29737 + }, + { + "epoch": 1.46, + "grad_norm": 0.5342128276824951, + "learning_rate": 0.00031338254522551417, + "loss": 3.0905, + "step": 29738 + }, + { + "epoch": 1.46, + "grad_norm": 0.566861093044281, + "learning_rate": 0.00031336716434368303, + "loss": 3.0281, + "step": 29739 + }, + { + "epoch": 1.46, + "grad_norm": 0.5697751045227051, + "learning_rate": 0.00031335178342664524, + "loss": 2.994, + "step": 29740 + }, + { + "epoch": 1.46, + "grad_norm": 0.6492906212806702, + "learning_rate": 0.00031333640247444136, + "loss": 3.1775, + "step": 29741 + }, + { + "epoch": 1.46, + "grad_norm": 0.6085817813873291, + "learning_rate": 0.00031332102148711214, + "loss": 3.1993, + "step": 29742 + }, + { + "epoch": 1.46, + "grad_norm": 0.5619418621063232, + "learning_rate": 0.0003133056404646978, + "loss": 2.982, + "step": 29743 + }, + { + "epoch": 1.46, + "grad_norm": 0.5778747200965881, + "learning_rate": 0.000313290259407239, + "loss": 2.8579, + "step": 29744 + }, + { + "epoch": 1.46, + "grad_norm": 0.597019374370575, + "learning_rate": 0.00031327487831477616, + "loss": 2.9197, + "step": 29745 + }, + { + "epoch": 1.46, + "grad_norm": 0.5481294989585876, + "learning_rate": 0.00031325949718734986, + "loss": 3.0998, + "step": 29746 + }, + { + "epoch": 1.46, + "grad_norm": 0.5638428926467896, + "learning_rate": 0.0003132441160250006, + "loss": 3.1415, + "step": 29747 + }, + { + "epoch": 1.46, + "grad_norm": 0.5789369940757751, + "learning_rate": 0.0003132287348277689, + "loss": 3.1912, + "step": 29748 + }, + { + "epoch": 1.46, + "grad_norm": 0.5999630689620972, + "learning_rate": 0.00031321335359569527, + "loss": 3.0397, + "step": 29749 + }, + { + "epoch": 1.46, + "grad_norm": 0.5871257185935974, + "learning_rate": 0.0003131979723288203, + "loss": 2.9909, + "step": 29750 + }, + { + "epoch": 1.46, + "grad_norm": 0.6134441494941711, + "learning_rate": 0.0003131825910271842, + "loss": 3.1867, + "step": 29751 + }, + { + "epoch": 1.46, + "grad_norm": 0.622126579284668, + "learning_rate": 0.0003131672096908278, + "loss": 3.1849, + "step": 29752 + }, + { + "epoch": 1.46, + "grad_norm": 0.5536166429519653, + "learning_rate": 0.00031315182831979157, + "loss": 3.1054, + "step": 29753 + }, + { + "epoch": 1.46, + "grad_norm": 0.5597190856933594, + "learning_rate": 0.0003131364469141159, + "loss": 3.1826, + "step": 29754 + }, + { + "epoch": 1.46, + "grad_norm": 0.5869278311729431, + "learning_rate": 0.00031312106547384133, + "loss": 3.029, + "step": 29755 + }, + { + "epoch": 1.46, + "grad_norm": 0.6084804534912109, + "learning_rate": 0.0003131056839990084, + "loss": 2.6037, + "step": 29756 + }, + { + "epoch": 1.46, + "grad_norm": 0.5875492691993713, + "learning_rate": 0.00031309030248965764, + "loss": 3.1481, + "step": 29757 + }, + { + "epoch": 1.46, + "grad_norm": 0.6422194242477417, + "learning_rate": 0.00031307492094582957, + "loss": 3.0253, + "step": 29758 + }, + { + "epoch": 1.46, + "grad_norm": 0.5797473788261414, + "learning_rate": 0.0003130595393675646, + "loss": 3.0883, + "step": 29759 + }, + { + "epoch": 1.46, + "grad_norm": 0.6029950976371765, + "learning_rate": 0.0003130441577549034, + "loss": 2.8537, + "step": 29760 + }, + { + "epoch": 1.46, + "grad_norm": 0.5672838687896729, + "learning_rate": 0.00031302877610788637, + "loss": 2.8068, + "step": 29761 + }, + { + "epoch": 1.46, + "grad_norm": 0.5959682464599609, + "learning_rate": 0.00031301339442655403, + "loss": 3.1368, + "step": 29762 + }, + { + "epoch": 1.46, + "grad_norm": 0.6413449645042419, + "learning_rate": 0.00031299801271094687, + "loss": 2.7994, + "step": 29763 + }, + { + "epoch": 1.46, + "grad_norm": 0.5850305557250977, + "learning_rate": 0.0003129826309611056, + "loss": 3.1345, + "step": 29764 + }, + { + "epoch": 1.46, + "grad_norm": 0.6164899468421936, + "learning_rate": 0.0003129672491770705, + "loss": 3.048, + "step": 29765 + }, + { + "epoch": 1.46, + "grad_norm": 0.6072206497192383, + "learning_rate": 0.00031295186735888204, + "loss": 3.0531, + "step": 29766 + }, + { + "epoch": 1.46, + "grad_norm": 0.5849358439445496, + "learning_rate": 0.000312936485506581, + "loss": 3.0533, + "step": 29767 + }, + { + "epoch": 1.46, + "grad_norm": 0.5805505514144897, + "learning_rate": 0.0003129211036202077, + "loss": 2.9414, + "step": 29768 + }, + { + "epoch": 1.46, + "grad_norm": 0.5903421640396118, + "learning_rate": 0.0003129057216998027, + "loss": 3.0021, + "step": 29769 + }, + { + "epoch": 1.46, + "grad_norm": 0.5872484445571899, + "learning_rate": 0.0003128903397454066, + "loss": 2.9317, + "step": 29770 + }, + { + "epoch": 1.46, + "grad_norm": 0.6200918555259705, + "learning_rate": 0.00031287495775705963, + "loss": 3.0879, + "step": 29771 + }, + { + "epoch": 1.46, + "grad_norm": 0.5617815256118774, + "learning_rate": 0.00031285957573480263, + "loss": 3.0599, + "step": 29772 + }, + { + "epoch": 1.46, + "grad_norm": 0.5852945446968079, + "learning_rate": 0.00031284419367867596, + "loss": 2.9581, + "step": 29773 + }, + { + "epoch": 1.46, + "grad_norm": 0.6024762392044067, + "learning_rate": 0.0003128288115887202, + "loss": 2.8992, + "step": 29774 + }, + { + "epoch": 1.46, + "grad_norm": 0.6069262623786926, + "learning_rate": 0.00031281342946497584, + "loss": 3.1949, + "step": 29775 + }, + { + "epoch": 1.46, + "grad_norm": 0.6076410412788391, + "learning_rate": 0.00031279804730748327, + "loss": 2.93, + "step": 29776 + }, + { + "epoch": 1.46, + "grad_norm": 0.599779486656189, + "learning_rate": 0.00031278266511628307, + "loss": 3.0456, + "step": 29777 + }, + { + "epoch": 1.46, + "grad_norm": 0.6234508156776428, + "learning_rate": 0.0003127672828914159, + "loss": 3.0092, + "step": 29778 + }, + { + "epoch": 1.46, + "grad_norm": 0.5925348997116089, + "learning_rate": 0.00031275190063292214, + "loss": 3.2223, + "step": 29779 + }, + { + "epoch": 1.46, + "grad_norm": 0.6192617416381836, + "learning_rate": 0.00031273651834084234, + "loss": 2.9969, + "step": 29780 + }, + { + "epoch": 1.46, + "grad_norm": 0.6267848014831543, + "learning_rate": 0.000312721136015217, + "loss": 2.8548, + "step": 29781 + }, + { + "epoch": 1.46, + "grad_norm": 0.5980367064476013, + "learning_rate": 0.00031270575365608663, + "loss": 2.9697, + "step": 29782 + }, + { + "epoch": 1.46, + "grad_norm": 0.5981336236000061, + "learning_rate": 0.0003126903712634917, + "loss": 3.051, + "step": 29783 + }, + { + "epoch": 1.46, + "grad_norm": 0.6255171298980713, + "learning_rate": 0.00031267498883747277, + "loss": 3.1955, + "step": 29784 + }, + { + "epoch": 1.46, + "grad_norm": 0.5721108913421631, + "learning_rate": 0.00031265960637807044, + "loss": 3.077, + "step": 29785 + }, + { + "epoch": 1.46, + "grad_norm": 0.5838683843612671, + "learning_rate": 0.00031264422388532515, + "loss": 3.2894, + "step": 29786 + }, + { + "epoch": 1.46, + "grad_norm": 0.5979468822479248, + "learning_rate": 0.0003126288413592773, + "loss": 2.9987, + "step": 29787 + }, + { + "epoch": 1.46, + "grad_norm": 0.601974368095398, + "learning_rate": 0.0003126134587999676, + "loss": 2.9889, + "step": 29788 + }, + { + "epoch": 1.46, + "grad_norm": 0.5563658475875854, + "learning_rate": 0.0003125980762074364, + "loss": 2.9098, + "step": 29789 + }, + { + "epoch": 1.46, + "grad_norm": 0.628157913684845, + "learning_rate": 0.0003125826935817244, + "loss": 3.2493, + "step": 29790 + }, + { + "epoch": 1.46, + "grad_norm": 0.5461869239807129, + "learning_rate": 0.00031256731092287195, + "loss": 3.1618, + "step": 29791 + }, + { + "epoch": 1.46, + "grad_norm": 0.6051981449127197, + "learning_rate": 0.0003125519282309196, + "loss": 3.0727, + "step": 29792 + }, + { + "epoch": 1.46, + "grad_norm": 0.5399933457374573, + "learning_rate": 0.00031253654550590795, + "loss": 2.8477, + "step": 29793 + }, + { + "epoch": 1.46, + "grad_norm": 0.5897714495658875, + "learning_rate": 0.00031252116274787736, + "loss": 3.0971, + "step": 29794 + }, + { + "epoch": 1.46, + "grad_norm": 0.5866346955299377, + "learning_rate": 0.00031250577995686854, + "loss": 3.0104, + "step": 29795 + }, + { + "epoch": 1.46, + "grad_norm": 0.6055521368980408, + "learning_rate": 0.00031249039713292183, + "loss": 3.2093, + "step": 29796 + }, + { + "epoch": 1.46, + "grad_norm": 0.581646203994751, + "learning_rate": 0.0003124750142760778, + "loss": 3.2742, + "step": 29797 + }, + { + "epoch": 1.46, + "grad_norm": 0.5622509717941284, + "learning_rate": 0.00031245963138637706, + "loss": 3.1615, + "step": 29798 + }, + { + "epoch": 1.46, + "grad_norm": 0.5802165269851685, + "learning_rate": 0.00031244424846386, + "loss": 2.7669, + "step": 29799 + }, + { + "epoch": 1.46, + "grad_norm": 0.578447699546814, + "learning_rate": 0.0003124288655085671, + "loss": 3.0275, + "step": 29800 + }, + { + "epoch": 1.46, + "grad_norm": 0.6063286066055298, + "learning_rate": 0.00031241348252053914, + "loss": 3.052, + "step": 29801 + }, + { + "epoch": 1.46, + "grad_norm": 0.5866938829421997, + "learning_rate": 0.0003123980994998163, + "loss": 2.9231, + "step": 29802 + }, + { + "epoch": 1.46, + "grad_norm": 0.5843689441680908, + "learning_rate": 0.00031238271644643933, + "loss": 3.0812, + "step": 29803 + }, + { + "epoch": 1.46, + "grad_norm": 0.6158820986747742, + "learning_rate": 0.00031236733336044876, + "loss": 2.9661, + "step": 29804 + }, + { + "epoch": 1.46, + "grad_norm": 0.6195018887519836, + "learning_rate": 0.00031235195024188484, + "loss": 2.955, + "step": 29805 + }, + { + "epoch": 1.46, + "grad_norm": 0.5677433013916016, + "learning_rate": 0.00031233656709078836, + "loss": 3.0007, + "step": 29806 + }, + { + "epoch": 1.46, + "grad_norm": 0.5664675831794739, + "learning_rate": 0.0003123211839071997, + "loss": 2.8784, + "step": 29807 + }, + { + "epoch": 1.46, + "grad_norm": 0.5745792984962463, + "learning_rate": 0.00031230580069115945, + "loss": 2.9201, + "step": 29808 + }, + { + "epoch": 1.46, + "grad_norm": 0.5681601762771606, + "learning_rate": 0.000312290417442708, + "loss": 2.9938, + "step": 29809 + }, + { + "epoch": 1.46, + "grad_norm": 0.580299437046051, + "learning_rate": 0.00031227503416188604, + "loss": 2.8414, + "step": 29810 + }, + { + "epoch": 1.46, + "grad_norm": 0.5805426239967346, + "learning_rate": 0.00031225965084873396, + "loss": 3.0765, + "step": 29811 + }, + { + "epoch": 1.46, + "grad_norm": 0.5639891624450684, + "learning_rate": 0.00031224426750329245, + "loss": 3.0892, + "step": 29812 + }, + { + "epoch": 1.46, + "grad_norm": 0.5976626873016357, + "learning_rate": 0.0003122288841256017, + "loss": 3.1597, + "step": 29813 + }, + { + "epoch": 1.46, + "grad_norm": 0.6398038864135742, + "learning_rate": 0.0003122135007157025, + "loss": 2.881, + "step": 29814 + }, + { + "epoch": 1.46, + "grad_norm": 0.5923072695732117, + "learning_rate": 0.0003121981172736354, + "loss": 3.1807, + "step": 29815 + }, + { + "epoch": 1.46, + "grad_norm": 0.5970144271850586, + "learning_rate": 0.00031218273379944063, + "loss": 3.1799, + "step": 29816 + }, + { + "epoch": 1.46, + "grad_norm": 0.6289509534835815, + "learning_rate": 0.00031216735029315903, + "loss": 2.7998, + "step": 29817 + }, + { + "epoch": 1.46, + "grad_norm": 0.609671413898468, + "learning_rate": 0.00031215196675483074, + "loss": 2.9472, + "step": 29818 + }, + { + "epoch": 1.46, + "grad_norm": 0.5885813236236572, + "learning_rate": 0.00031213658318449676, + "loss": 3.249, + "step": 29819 + }, + { + "epoch": 1.46, + "grad_norm": 0.5870927572250366, + "learning_rate": 0.0003121211995821973, + "loss": 2.9601, + "step": 29820 + }, + { + "epoch": 1.46, + "grad_norm": 0.5752508044242859, + "learning_rate": 0.00031210581594797284, + "loss": 3.1265, + "step": 29821 + }, + { + "epoch": 1.46, + "grad_norm": 0.5618848204612732, + "learning_rate": 0.00031209043228186406, + "loss": 2.8466, + "step": 29822 + }, + { + "epoch": 1.46, + "grad_norm": 0.5932725667953491, + "learning_rate": 0.00031207504858391145, + "loss": 3.1074, + "step": 29823 + }, + { + "epoch": 1.46, + "grad_norm": 0.567245602607727, + "learning_rate": 0.00031205966485415535, + "loss": 2.9608, + "step": 29824 + }, + { + "epoch": 1.46, + "grad_norm": 0.5531851649284363, + "learning_rate": 0.0003120442810926365, + "loss": 3.1427, + "step": 29825 + }, + { + "epoch": 1.46, + "grad_norm": 0.5592467188835144, + "learning_rate": 0.00031202889729939543, + "loss": 3.1028, + "step": 29826 + }, + { + "epoch": 1.46, + "grad_norm": 0.6010973453521729, + "learning_rate": 0.0003120135134744724, + "loss": 3.1718, + "step": 29827 + }, + { + "epoch": 1.46, + "grad_norm": 0.587733805179596, + "learning_rate": 0.0003119981296179081, + "loss": 2.8077, + "step": 29828 + }, + { + "epoch": 1.46, + "grad_norm": 0.5497106313705444, + "learning_rate": 0.0003119827457297431, + "loss": 3.0282, + "step": 29829 + }, + { + "epoch": 1.46, + "grad_norm": 0.5502404570579529, + "learning_rate": 0.00031196736181001784, + "loss": 2.9867, + "step": 29830 + }, + { + "epoch": 1.46, + "grad_norm": 0.5896203517913818, + "learning_rate": 0.0003119519778587729, + "loss": 2.9452, + "step": 29831 + }, + { + "epoch": 1.46, + "grad_norm": 0.5907154679298401, + "learning_rate": 0.0003119365938760486, + "loss": 2.919, + "step": 29832 + }, + { + "epoch": 1.46, + "grad_norm": 0.5724136829376221, + "learning_rate": 0.00031192120986188573, + "loss": 3.052, + "step": 29833 + }, + { + "epoch": 1.46, + "grad_norm": 0.5927634835243225, + "learning_rate": 0.0003119058258163247, + "loss": 3.0744, + "step": 29834 + }, + { + "epoch": 1.46, + "grad_norm": 0.5511701107025146, + "learning_rate": 0.00031189044173940596, + "loss": 2.9463, + "step": 29835 + }, + { + "epoch": 1.46, + "grad_norm": 0.6173757314682007, + "learning_rate": 0.00031187505763117016, + "loss": 2.9852, + "step": 29836 + }, + { + "epoch": 1.46, + "grad_norm": 0.6480523347854614, + "learning_rate": 0.0003118596734916577, + "loss": 3.0302, + "step": 29837 + }, + { + "epoch": 1.46, + "grad_norm": 0.6319320201873779, + "learning_rate": 0.00031184428932090906, + "loss": 3.1108, + "step": 29838 + }, + { + "epoch": 1.46, + "grad_norm": 0.5854665040969849, + "learning_rate": 0.0003118289051189649, + "loss": 3.1069, + "step": 29839 + }, + { + "epoch": 1.46, + "grad_norm": 0.5656838417053223, + "learning_rate": 0.00031181352088586567, + "loss": 3.0943, + "step": 29840 + }, + { + "epoch": 1.46, + "grad_norm": 0.5562570691108704, + "learning_rate": 0.000311798136621652, + "loss": 2.9549, + "step": 29841 + }, + { + "epoch": 1.46, + "grad_norm": 0.635686457157135, + "learning_rate": 0.0003117827523263642, + "loss": 3.1737, + "step": 29842 + }, + { + "epoch": 1.46, + "grad_norm": 0.5831470489501953, + "learning_rate": 0.0003117673680000429, + "loss": 2.9611, + "step": 29843 + }, + { + "epoch": 1.46, + "grad_norm": 0.5976853966712952, + "learning_rate": 0.00031175198364272866, + "loss": 2.9907, + "step": 29844 + }, + { + "epoch": 1.46, + "grad_norm": 0.6116719841957092, + "learning_rate": 0.000311736599254462, + "loss": 2.9301, + "step": 29845 + }, + { + "epoch": 1.46, + "grad_norm": 0.6009196639060974, + "learning_rate": 0.0003117212148352833, + "loss": 3.0093, + "step": 29846 + }, + { + "epoch": 1.46, + "grad_norm": 0.5840451717376709, + "learning_rate": 0.00031170583038523324, + "loss": 3.1293, + "step": 29847 + }, + { + "epoch": 1.46, + "grad_norm": 0.5879883766174316, + "learning_rate": 0.0003116904459043523, + "loss": 2.8642, + "step": 29848 + }, + { + "epoch": 1.46, + "grad_norm": 0.5799393057823181, + "learning_rate": 0.00031167506139268084, + "loss": 3.0245, + "step": 29849 + }, + { + "epoch": 1.46, + "grad_norm": 0.5922977328300476, + "learning_rate": 0.0003116596768502596, + "loss": 3.0836, + "step": 29850 + }, + { + "epoch": 1.46, + "grad_norm": 0.5858464241027832, + "learning_rate": 0.000311644292277129, + "loss": 3.0924, + "step": 29851 + }, + { + "epoch": 1.46, + "grad_norm": 0.5689262747764587, + "learning_rate": 0.00031162890767332966, + "loss": 2.9704, + "step": 29852 + }, + { + "epoch": 1.46, + "grad_norm": 0.6013664603233337, + "learning_rate": 0.0003116135230389019, + "loss": 2.994, + "step": 29853 + }, + { + "epoch": 1.46, + "grad_norm": 0.5736306309700012, + "learning_rate": 0.00031159813837388636, + "loss": 2.8971, + "step": 29854 + }, + { + "epoch": 1.46, + "grad_norm": 0.6058791279792786, + "learning_rate": 0.00031158275367832366, + "loss": 3.0407, + "step": 29855 + }, + { + "epoch": 1.46, + "grad_norm": 0.563359797000885, + "learning_rate": 0.00031156736895225417, + "loss": 3.3032, + "step": 29856 + }, + { + "epoch": 1.46, + "grad_norm": 0.6421247720718384, + "learning_rate": 0.00031155198419571844, + "loss": 3.0148, + "step": 29857 + }, + { + "epoch": 1.46, + "grad_norm": 0.5899890065193176, + "learning_rate": 0.00031153659940875696, + "loss": 2.8782, + "step": 29858 + }, + { + "epoch": 1.46, + "grad_norm": 0.5498698353767395, + "learning_rate": 0.00031152121459141033, + "loss": 3.1737, + "step": 29859 + }, + { + "epoch": 1.46, + "grad_norm": 0.564635694026947, + "learning_rate": 0.00031150582974371905, + "loss": 3.1709, + "step": 29860 + }, + { + "epoch": 1.46, + "grad_norm": 0.5672764778137207, + "learning_rate": 0.0003114904448657236, + "loss": 3.0535, + "step": 29861 + }, + { + "epoch": 1.46, + "grad_norm": 0.6894760727882385, + "learning_rate": 0.0003114750599574646, + "loss": 3.0999, + "step": 29862 + }, + { + "epoch": 1.46, + "grad_norm": 0.6003525257110596, + "learning_rate": 0.0003114596750189825, + "loss": 3.137, + "step": 29863 + }, + { + "epoch": 1.46, + "grad_norm": 0.5983224511146545, + "learning_rate": 0.0003114442900503177, + "loss": 3.0206, + "step": 29864 + }, + { + "epoch": 1.46, + "grad_norm": 0.5564965009689331, + "learning_rate": 0.000311428905051511, + "loss": 2.9906, + "step": 29865 + }, + { + "epoch": 1.46, + "grad_norm": 0.5878624320030212, + "learning_rate": 0.0003114135200226027, + "loss": 2.8516, + "step": 29866 + }, + { + "epoch": 1.46, + "grad_norm": 0.5519507527351379, + "learning_rate": 0.00031139813496363337, + "loss": 3.2566, + "step": 29867 + }, + { + "epoch": 1.46, + "grad_norm": 0.5717609524726868, + "learning_rate": 0.0003113827498746435, + "loss": 3.0263, + "step": 29868 + }, + { + "epoch": 1.46, + "grad_norm": 0.5674951076507568, + "learning_rate": 0.00031136736475567364, + "loss": 3.1298, + "step": 29869 + }, + { + "epoch": 1.46, + "grad_norm": 0.6507361531257629, + "learning_rate": 0.0003113519796067645, + "loss": 3.1109, + "step": 29870 + }, + { + "epoch": 1.46, + "grad_norm": 0.6233397722244263, + "learning_rate": 0.0003113365944279563, + "loss": 2.8556, + "step": 29871 + }, + { + "epoch": 1.46, + "grad_norm": 0.5794210433959961, + "learning_rate": 0.0003113212092192897, + "loss": 2.9724, + "step": 29872 + }, + { + "epoch": 1.46, + "grad_norm": 0.5545445084571838, + "learning_rate": 0.0003113058239808052, + "loss": 2.9606, + "step": 29873 + }, + { + "epoch": 1.46, + "grad_norm": 0.5938491821289062, + "learning_rate": 0.00031129043871254344, + "loss": 3.1588, + "step": 29874 + }, + { + "epoch": 1.46, + "grad_norm": 0.5351107716560364, + "learning_rate": 0.0003112750534145448, + "loss": 3.0698, + "step": 29875 + }, + { + "epoch": 1.46, + "grad_norm": 0.5751978754997253, + "learning_rate": 0.0003112596680868497, + "loss": 3.298, + "step": 29876 + }, + { + "epoch": 1.46, + "grad_norm": 0.6225106120109558, + "learning_rate": 0.00031124428272949896, + "loss": 3.1264, + "step": 29877 + }, + { + "epoch": 1.46, + "grad_norm": 0.5818681716918945, + "learning_rate": 0.00031122889734253294, + "loss": 3.1766, + "step": 29878 + }, + { + "epoch": 1.46, + "grad_norm": 0.5759578943252563, + "learning_rate": 0.000311213511925992, + "loss": 3.0989, + "step": 29879 + }, + { + "epoch": 1.46, + "grad_norm": 0.5782442688941956, + "learning_rate": 0.0003111981264799169, + "loss": 3.1062, + "step": 29880 + }, + { + "epoch": 1.46, + "grad_norm": 0.606756865978241, + "learning_rate": 0.00031118274100434823, + "loss": 2.9649, + "step": 29881 + }, + { + "epoch": 1.46, + "grad_norm": 0.5722266435623169, + "learning_rate": 0.0003111673554993263, + "loss": 2.9258, + "step": 29882 + }, + { + "epoch": 1.46, + "grad_norm": 0.5965888500213623, + "learning_rate": 0.0003111519699648916, + "loss": 2.7842, + "step": 29883 + }, + { + "epoch": 1.46, + "grad_norm": 0.5483811497688293, + "learning_rate": 0.0003111365844010849, + "loss": 2.9596, + "step": 29884 + }, + { + "epoch": 1.46, + "grad_norm": 0.5529528856277466, + "learning_rate": 0.00031112119880794654, + "loss": 3.1932, + "step": 29885 + }, + { + "epoch": 1.46, + "grad_norm": 0.624657392501831, + "learning_rate": 0.000311105813185517, + "loss": 2.9514, + "step": 29886 + }, + { + "epoch": 1.46, + "grad_norm": 0.5750808119773865, + "learning_rate": 0.0003110904275338369, + "loss": 3.1854, + "step": 29887 + }, + { + "epoch": 1.46, + "grad_norm": 0.5993642807006836, + "learning_rate": 0.0003110750418529469, + "loss": 2.9088, + "step": 29888 + }, + { + "epoch": 1.46, + "grad_norm": 0.6091515421867371, + "learning_rate": 0.00031105965614288723, + "loss": 3.1339, + "step": 29889 + }, + { + "epoch": 1.46, + "grad_norm": 0.6110482811927795, + "learning_rate": 0.0003110442704036985, + "loss": 3.286, + "step": 29890 + }, + { + "epoch": 1.46, + "grad_norm": 0.6325213313102722, + "learning_rate": 0.00031102888463542137, + "loss": 3.0859, + "step": 29891 + }, + { + "epoch": 1.46, + "grad_norm": 0.586706280708313, + "learning_rate": 0.0003110134988380964, + "loss": 2.8581, + "step": 29892 + }, + { + "epoch": 1.46, + "grad_norm": 0.5791521668434143, + "learning_rate": 0.0003109981130117639, + "loss": 3.0277, + "step": 29893 + }, + { + "epoch": 1.47, + "grad_norm": 0.6084620952606201, + "learning_rate": 0.0003109827271564644, + "loss": 3.1767, + "step": 29894 + }, + { + "epoch": 1.47, + "grad_norm": 0.6085389852523804, + "learning_rate": 0.0003109673412722386, + "loss": 3.0384, + "step": 29895 + }, + { + "epoch": 1.47, + "grad_norm": 0.6390563249588013, + "learning_rate": 0.0003109519553591269, + "loss": 2.8968, + "step": 29896 + }, + { + "epoch": 1.47, + "grad_norm": 0.5879514813423157, + "learning_rate": 0.0003109365694171699, + "loss": 3.0246, + "step": 29897 + }, + { + "epoch": 1.47, + "grad_norm": 0.6184220314025879, + "learning_rate": 0.000310921183446408, + "loss": 2.9875, + "step": 29898 + }, + { + "epoch": 1.47, + "grad_norm": 0.629254162311554, + "learning_rate": 0.00031090579744688196, + "loss": 3.1362, + "step": 29899 + }, + { + "epoch": 1.47, + "grad_norm": 0.6382981538772583, + "learning_rate": 0.00031089041141863194, + "loss": 3.0567, + "step": 29900 + }, + { + "epoch": 1.47, + "grad_norm": 0.5889188647270203, + "learning_rate": 0.00031087502536169875, + "loss": 3.0674, + "step": 29901 + }, + { + "epoch": 1.47, + "grad_norm": 0.6274064183235168, + "learning_rate": 0.0003108596392761229, + "loss": 3.0911, + "step": 29902 + }, + { + "epoch": 1.47, + "grad_norm": 0.6702287197113037, + "learning_rate": 0.0003108442531619448, + "loss": 2.8349, + "step": 29903 + }, + { + "epoch": 1.47, + "grad_norm": 0.5988737940788269, + "learning_rate": 0.00031082886701920505, + "loss": 3.0651, + "step": 29904 + }, + { + "epoch": 1.47, + "grad_norm": 0.6140710115432739, + "learning_rate": 0.00031081348084794404, + "loss": 2.8459, + "step": 29905 + }, + { + "epoch": 1.47, + "grad_norm": 0.5804564356803894, + "learning_rate": 0.00031079809464820257, + "loss": 3.0062, + "step": 29906 + }, + { + "epoch": 1.47, + "grad_norm": 0.5837451219558716, + "learning_rate": 0.0003107827084200209, + "loss": 3.1248, + "step": 29907 + }, + { + "epoch": 1.47, + "grad_norm": 0.6358888149261475, + "learning_rate": 0.0003107673221634397, + "loss": 3.0773, + "step": 29908 + }, + { + "epoch": 1.47, + "grad_norm": 0.6011119484901428, + "learning_rate": 0.00031075193587849943, + "loss": 3.0292, + "step": 29909 + }, + { + "epoch": 1.47, + "grad_norm": 0.6402463316917419, + "learning_rate": 0.00031073654956524063, + "loss": 2.7842, + "step": 29910 + }, + { + "epoch": 1.47, + "grad_norm": 0.5692670345306396, + "learning_rate": 0.0003107211632237037, + "loss": 3.114, + "step": 29911 + }, + { + "epoch": 1.47, + "grad_norm": 0.5846226215362549, + "learning_rate": 0.0003107057768539295, + "loss": 3.1666, + "step": 29912 + }, + { + "epoch": 1.47, + "grad_norm": 0.5675181150436401, + "learning_rate": 0.0003106903904559582, + "loss": 2.7647, + "step": 29913 + }, + { + "epoch": 1.47, + "grad_norm": 0.6056368350982666, + "learning_rate": 0.00031067500402983056, + "loss": 3.0578, + "step": 29914 + }, + { + "epoch": 1.47, + "grad_norm": 0.5673414468765259, + "learning_rate": 0.00031065961757558694, + "loss": 2.9745, + "step": 29915 + }, + { + "epoch": 1.47, + "grad_norm": 0.6073873043060303, + "learning_rate": 0.00031064423109326795, + "loss": 3.1665, + "step": 29916 + }, + { + "epoch": 1.47, + "grad_norm": 0.5638627409934998, + "learning_rate": 0.0003106288445829142, + "loss": 3.1064, + "step": 29917 + }, + { + "epoch": 1.47, + "grad_norm": 0.5839649438858032, + "learning_rate": 0.00031061345804456604, + "loss": 2.7305, + "step": 29918 + }, + { + "epoch": 1.47, + "grad_norm": 0.5768024325370789, + "learning_rate": 0.00031059807147826405, + "loss": 2.9832, + "step": 29919 + }, + { + "epoch": 1.47, + "grad_norm": 0.6130768656730652, + "learning_rate": 0.0003105826848840488, + "loss": 2.9809, + "step": 29920 + }, + { + "epoch": 1.47, + "grad_norm": 0.5641641616821289, + "learning_rate": 0.0003105672982619608, + "loss": 2.9487, + "step": 29921 + }, + { + "epoch": 1.47, + "grad_norm": 0.5932754278182983, + "learning_rate": 0.0003105519116120405, + "loss": 3.1855, + "step": 29922 + }, + { + "epoch": 1.47, + "grad_norm": 0.6482731699943542, + "learning_rate": 0.00031053652493432857, + "loss": 3.0039, + "step": 29923 + }, + { + "epoch": 1.47, + "grad_norm": 0.586216926574707, + "learning_rate": 0.0003105211382288655, + "loss": 3.0655, + "step": 29924 + }, + { + "epoch": 1.47, + "grad_norm": 0.5854665637016296, + "learning_rate": 0.0003105057514956917, + "loss": 3.1271, + "step": 29925 + }, + { + "epoch": 1.47, + "grad_norm": 0.592591404914856, + "learning_rate": 0.0003104903647348478, + "loss": 2.8566, + "step": 29926 + }, + { + "epoch": 1.47, + "grad_norm": 0.5938833951950073, + "learning_rate": 0.0003104749779463743, + "loss": 2.9205, + "step": 29927 + }, + { + "epoch": 1.47, + "grad_norm": 0.5724154114723206, + "learning_rate": 0.00031045959113031187, + "loss": 2.8484, + "step": 29928 + }, + { + "epoch": 1.47, + "grad_norm": 0.5788401961326599, + "learning_rate": 0.00031044420428670067, + "loss": 3.0497, + "step": 29929 + }, + { + "epoch": 1.47, + "grad_norm": 0.5472137928009033, + "learning_rate": 0.0003104288174155816, + "loss": 3.1811, + "step": 29930 + }, + { + "epoch": 1.47, + "grad_norm": 0.6054922938346863, + "learning_rate": 0.0003104134305169948, + "loss": 3.059, + "step": 29931 + }, + { + "epoch": 1.47, + "grad_norm": 0.6112335920333862, + "learning_rate": 0.0003103980435909812, + "loss": 2.9871, + "step": 29932 + }, + { + "epoch": 1.47, + "grad_norm": 0.5911417007446289, + "learning_rate": 0.0003103826566375812, + "loss": 3.0777, + "step": 29933 + }, + { + "epoch": 1.47, + "grad_norm": 0.724625825881958, + "learning_rate": 0.0003103672696568352, + "loss": 3.1604, + "step": 29934 + }, + { + "epoch": 1.47, + "grad_norm": 0.5843529105186462, + "learning_rate": 0.0003103518826487838, + "loss": 2.8306, + "step": 29935 + }, + { + "epoch": 1.47, + "grad_norm": 0.5531800389289856, + "learning_rate": 0.0003103364956134676, + "loss": 2.9553, + "step": 29936 + }, + { + "epoch": 1.47, + "grad_norm": 0.6143542528152466, + "learning_rate": 0.00031032110855092705, + "loss": 2.8877, + "step": 29937 + }, + { + "epoch": 1.47, + "grad_norm": 0.5682981014251709, + "learning_rate": 0.0003103057214612026, + "loss": 3.1894, + "step": 29938 + }, + { + "epoch": 1.47, + "grad_norm": 0.6502469182014465, + "learning_rate": 0.000310290334344335, + "loss": 3.2924, + "step": 29939 + }, + { + "epoch": 1.47, + "grad_norm": 0.5962908267974854, + "learning_rate": 0.00031027494720036456, + "loss": 2.8696, + "step": 29940 + }, + { + "epoch": 1.47, + "grad_norm": 0.6846643090248108, + "learning_rate": 0.0003102595600293318, + "loss": 2.8676, + "step": 29941 + }, + { + "epoch": 1.47, + "grad_norm": 0.5837406516075134, + "learning_rate": 0.00031024417283127744, + "loss": 3.0501, + "step": 29942 + }, + { + "epoch": 1.47, + "grad_norm": 0.6407919526100159, + "learning_rate": 0.0003102287856062419, + "loss": 3.2333, + "step": 29943 + }, + { + "epoch": 1.47, + "grad_norm": 0.615726888179779, + "learning_rate": 0.0003102133983542657, + "loss": 3.1496, + "step": 29944 + }, + { + "epoch": 1.47, + "grad_norm": 0.5835103392601013, + "learning_rate": 0.00031019801107538933, + "loss": 3.0538, + "step": 29945 + }, + { + "epoch": 1.47, + "grad_norm": 0.5882119536399841, + "learning_rate": 0.0003101826237696534, + "loss": 2.8205, + "step": 29946 + }, + { + "epoch": 1.47, + "grad_norm": 0.5969010591506958, + "learning_rate": 0.0003101672364370984, + "loss": 2.8979, + "step": 29947 + }, + { + "epoch": 1.47, + "grad_norm": 0.561356782913208, + "learning_rate": 0.00031015184907776485, + "loss": 2.9959, + "step": 29948 + }, + { + "epoch": 1.47, + "grad_norm": 0.5852020978927612, + "learning_rate": 0.00031013646169169326, + "loss": 3.0055, + "step": 29949 + }, + { + "epoch": 1.47, + "grad_norm": 0.613553524017334, + "learning_rate": 0.0003101210742789242, + "loss": 2.9737, + "step": 29950 + }, + { + "epoch": 1.47, + "grad_norm": 0.5886765122413635, + "learning_rate": 0.00031010568683949817, + "loss": 2.8426, + "step": 29951 + }, + { + "epoch": 1.47, + "grad_norm": 0.6295921206474304, + "learning_rate": 0.00031009029937345567, + "loss": 3.2008, + "step": 29952 + }, + { + "epoch": 1.47, + "grad_norm": 0.5718304514884949, + "learning_rate": 0.0003100749118808373, + "loss": 3.2872, + "step": 29953 + }, + { + "epoch": 1.47, + "grad_norm": 0.5818148851394653, + "learning_rate": 0.0003100595243616836, + "loss": 3.315, + "step": 29954 + }, + { + "epoch": 1.47, + "grad_norm": 0.5743296146392822, + "learning_rate": 0.00031004413681603505, + "loss": 3.1359, + "step": 29955 + }, + { + "epoch": 1.47, + "grad_norm": 0.6252809166908264, + "learning_rate": 0.00031002874924393204, + "loss": 2.96, + "step": 29956 + }, + { + "epoch": 1.47, + "grad_norm": 0.5608223676681519, + "learning_rate": 0.00031001336164541537, + "loss": 3.1832, + "step": 29957 + }, + { + "epoch": 1.47, + "grad_norm": 0.5842495560646057, + "learning_rate": 0.00030999797402052543, + "loss": 3.2502, + "step": 29958 + }, + { + "epoch": 1.47, + "grad_norm": 0.569668710231781, + "learning_rate": 0.00030998258636930266, + "loss": 3.2578, + "step": 29959 + }, + { + "epoch": 1.47, + "grad_norm": 0.5936859250068665, + "learning_rate": 0.00030996719869178776, + "loss": 3.2201, + "step": 29960 + }, + { + "epoch": 1.47, + "grad_norm": 0.5749362111091614, + "learning_rate": 0.00030995181098802114, + "loss": 3.1107, + "step": 29961 + }, + { + "epoch": 1.47, + "grad_norm": 0.6007623076438904, + "learning_rate": 0.00030993642325804333, + "loss": 3.0569, + "step": 29962 + }, + { + "epoch": 1.47, + "grad_norm": 0.6313218474388123, + "learning_rate": 0.000309921035501895, + "loss": 3.0859, + "step": 29963 + }, + { + "epoch": 1.47, + "grad_norm": 0.6418139934539795, + "learning_rate": 0.00030990564771961646, + "loss": 3.1744, + "step": 29964 + }, + { + "epoch": 1.47, + "grad_norm": 0.5875469446182251, + "learning_rate": 0.00030989025991124845, + "loss": 2.8325, + "step": 29965 + }, + { + "epoch": 1.47, + "grad_norm": 0.5943783521652222, + "learning_rate": 0.00030987487207683134, + "loss": 3.1794, + "step": 29966 + }, + { + "epoch": 1.47, + "grad_norm": 0.5599361658096313, + "learning_rate": 0.00030985948421640564, + "loss": 3.1811, + "step": 29967 + }, + { + "epoch": 1.47, + "grad_norm": 0.605446994304657, + "learning_rate": 0.00030984409633001215, + "loss": 3.1107, + "step": 29968 + }, + { + "epoch": 1.47, + "grad_norm": 0.609178900718689, + "learning_rate": 0.00030982870841769105, + "loss": 2.9717, + "step": 29969 + }, + { + "epoch": 1.47, + "grad_norm": 0.5545887351036072, + "learning_rate": 0.00030981332047948305, + "loss": 2.9905, + "step": 29970 + }, + { + "epoch": 1.47, + "grad_norm": 0.5839653611183167, + "learning_rate": 0.0003097979325154287, + "loss": 2.9023, + "step": 29971 + }, + { + "epoch": 1.47, + "grad_norm": 0.5799376964569092, + "learning_rate": 0.0003097825445255684, + "loss": 3.0182, + "step": 29972 + }, + { + "epoch": 1.47, + "grad_norm": 0.560981273651123, + "learning_rate": 0.00030976715650994283, + "loss": 3.1762, + "step": 29973 + }, + { + "epoch": 1.47, + "grad_norm": 0.6042661666870117, + "learning_rate": 0.00030975176846859245, + "loss": 3.1572, + "step": 29974 + }, + { + "epoch": 1.47, + "grad_norm": 0.6044111847877502, + "learning_rate": 0.0003097363804015577, + "loss": 3.0491, + "step": 29975 + }, + { + "epoch": 1.47, + "grad_norm": 0.5796766877174377, + "learning_rate": 0.00030972099230887936, + "loss": 3.0295, + "step": 29976 + }, + { + "epoch": 1.47, + "grad_norm": 0.7335373163223267, + "learning_rate": 0.0003097056041905977, + "loss": 2.8727, + "step": 29977 + }, + { + "epoch": 1.47, + "grad_norm": 0.6482856273651123, + "learning_rate": 0.00030969021604675335, + "loss": 3.0558, + "step": 29978 + }, + { + "epoch": 1.47, + "grad_norm": 0.5912559628486633, + "learning_rate": 0.00030967482787738683, + "loss": 2.9217, + "step": 29979 + }, + { + "epoch": 1.47, + "grad_norm": 0.577346920967102, + "learning_rate": 0.0003096594396825387, + "loss": 3.1919, + "step": 29980 + }, + { + "epoch": 1.47, + "grad_norm": 0.586104154586792, + "learning_rate": 0.0003096440514622495, + "loss": 3.0131, + "step": 29981 + }, + { + "epoch": 1.47, + "grad_norm": 0.5916348695755005, + "learning_rate": 0.0003096286632165596, + "loss": 3.0654, + "step": 29982 + }, + { + "epoch": 1.47, + "grad_norm": 0.6343048810958862, + "learning_rate": 0.00030961327494550975, + "loss": 2.8386, + "step": 29983 + }, + { + "epoch": 1.47, + "grad_norm": 0.60175621509552, + "learning_rate": 0.00030959788664914034, + "loss": 2.8354, + "step": 29984 + }, + { + "epoch": 1.47, + "grad_norm": 0.5782569050788879, + "learning_rate": 0.00030958249832749193, + "loss": 3.1649, + "step": 29985 + }, + { + "epoch": 1.47, + "grad_norm": 0.5811314582824707, + "learning_rate": 0.00030956710998060515, + "loss": 2.8642, + "step": 29986 + }, + { + "epoch": 1.47, + "grad_norm": 0.6273580193519592, + "learning_rate": 0.0003095517216085203, + "loss": 3.082, + "step": 29987 + }, + { + "epoch": 1.47, + "grad_norm": 0.595939576625824, + "learning_rate": 0.0003095363332112782, + "loss": 3.0947, + "step": 29988 + }, + { + "epoch": 1.47, + "grad_norm": 0.6219014525413513, + "learning_rate": 0.00030952094478891917, + "loss": 2.9882, + "step": 29989 + }, + { + "epoch": 1.47, + "grad_norm": 0.6124245524406433, + "learning_rate": 0.0003095055563414839, + "loss": 2.9679, + "step": 29990 + }, + { + "epoch": 1.47, + "grad_norm": 0.5397445559501648, + "learning_rate": 0.0003094901678690127, + "loss": 3.1045, + "step": 29991 + }, + { + "epoch": 1.47, + "grad_norm": 0.6168830990791321, + "learning_rate": 0.0003094747793715462, + "loss": 3.0952, + "step": 29992 + }, + { + "epoch": 1.47, + "grad_norm": 0.6066709756851196, + "learning_rate": 0.000309459390849125, + "loss": 2.8169, + "step": 29993 + }, + { + "epoch": 1.47, + "grad_norm": 0.6061710119247437, + "learning_rate": 0.00030944400230178965, + "loss": 3.1075, + "step": 29994 + }, + { + "epoch": 1.47, + "grad_norm": 0.5859954953193665, + "learning_rate": 0.00030942861372958055, + "loss": 3.0391, + "step": 29995 + }, + { + "epoch": 1.47, + "grad_norm": 0.6048535704612732, + "learning_rate": 0.0003094132251325383, + "loss": 3.171, + "step": 29996 + }, + { + "epoch": 1.47, + "grad_norm": 0.6408707499504089, + "learning_rate": 0.00030939783651070347, + "loss": 3.0741, + "step": 29997 + }, + { + "epoch": 1.47, + "grad_norm": 0.5734691023826599, + "learning_rate": 0.0003093824478641165, + "loss": 2.9984, + "step": 29998 + }, + { + "epoch": 1.47, + "grad_norm": 0.5938532948493958, + "learning_rate": 0.00030936705919281797, + "loss": 2.9143, + "step": 29999 + }, + { + "epoch": 1.47, + "grad_norm": 0.5862475037574768, + "learning_rate": 0.00030935167049684845, + "loss": 2.9718, + "step": 30000 + }, + { + "epoch": 1.47, + "grad_norm": 0.5853474140167236, + "learning_rate": 0.00030933628177624845, + "loss": 2.987, + "step": 30001 + }, + { + "epoch": 1.47, + "grad_norm": 0.5849444270133972, + "learning_rate": 0.00030932089303105836, + "loss": 3.0514, + "step": 30002 + }, + { + "epoch": 1.47, + "grad_norm": 0.5930509567260742, + "learning_rate": 0.0003093055042613189, + "loss": 3.1043, + "step": 30003 + }, + { + "epoch": 1.47, + "grad_norm": 0.5608189702033997, + "learning_rate": 0.0003092901154670705, + "loss": 3.0665, + "step": 30004 + }, + { + "epoch": 1.47, + "grad_norm": 0.5968251824378967, + "learning_rate": 0.00030927472664835385, + "loss": 2.9385, + "step": 30005 + }, + { + "epoch": 1.47, + "grad_norm": 0.6607335805892944, + "learning_rate": 0.0003092593378052093, + "loss": 2.8233, + "step": 30006 + }, + { + "epoch": 1.47, + "grad_norm": 0.5770511031150818, + "learning_rate": 0.00030924394893767736, + "loss": 3.1056, + "step": 30007 + }, + { + "epoch": 1.47, + "grad_norm": 0.671971321105957, + "learning_rate": 0.0003092285600457987, + "loss": 2.833, + "step": 30008 + }, + { + "epoch": 1.47, + "grad_norm": 0.5936998724937439, + "learning_rate": 0.0003092131711296137, + "loss": 3.1278, + "step": 30009 + }, + { + "epoch": 1.47, + "grad_norm": 0.6061517596244812, + "learning_rate": 0.00030919778218916304, + "loss": 3.2403, + "step": 30010 + }, + { + "epoch": 1.47, + "grad_norm": 0.6386778354644775, + "learning_rate": 0.00030918239322448723, + "loss": 3.0888, + "step": 30011 + }, + { + "epoch": 1.47, + "grad_norm": 0.5995616316795349, + "learning_rate": 0.00030916700423562677, + "loss": 2.953, + "step": 30012 + }, + { + "epoch": 1.47, + "grad_norm": 0.6046136021614075, + "learning_rate": 0.0003091516152226221, + "loss": 3.1343, + "step": 30013 + }, + { + "epoch": 1.47, + "grad_norm": 0.6061043739318848, + "learning_rate": 0.00030913622618551384, + "loss": 3.0659, + "step": 30014 + }, + { + "epoch": 1.47, + "grad_norm": 0.6259770393371582, + "learning_rate": 0.00030912083712434263, + "loss": 3.0636, + "step": 30015 + }, + { + "epoch": 1.47, + "grad_norm": 0.5781980156898499, + "learning_rate": 0.00030910544803914886, + "loss": 3.0077, + "step": 30016 + }, + { + "epoch": 1.47, + "grad_norm": 0.6106329560279846, + "learning_rate": 0.0003090900589299731, + "loss": 2.8722, + "step": 30017 + }, + { + "epoch": 1.47, + "grad_norm": 0.6144434213638306, + "learning_rate": 0.00030907466979685573, + "loss": 2.9317, + "step": 30018 + }, + { + "epoch": 1.47, + "grad_norm": 0.5561577081680298, + "learning_rate": 0.0003090592806398376, + "loss": 3.058, + "step": 30019 + }, + { + "epoch": 1.47, + "grad_norm": 0.5754068493843079, + "learning_rate": 0.00030904389145895895, + "loss": 2.7952, + "step": 30020 + }, + { + "epoch": 1.47, + "grad_norm": 0.566835343837738, + "learning_rate": 0.00030902850225426046, + "loss": 3.3087, + "step": 30021 + }, + { + "epoch": 1.47, + "grad_norm": 0.6028580069541931, + "learning_rate": 0.00030901311302578265, + "loss": 2.8337, + "step": 30022 + }, + { + "epoch": 1.47, + "grad_norm": 0.6136353611946106, + "learning_rate": 0.000308997723773566, + "loss": 2.9233, + "step": 30023 + }, + { + "epoch": 1.47, + "grad_norm": 0.5877104997634888, + "learning_rate": 0.00030898233449765117, + "loss": 3.2272, + "step": 30024 + }, + { + "epoch": 1.47, + "grad_norm": 0.5864350199699402, + "learning_rate": 0.0003089669451980785, + "loss": 3.0262, + "step": 30025 + }, + { + "epoch": 1.47, + "grad_norm": 0.5969464182853699, + "learning_rate": 0.00030895155587488867, + "loss": 3.0745, + "step": 30026 + }, + { + "epoch": 1.47, + "grad_norm": 0.6056321859359741, + "learning_rate": 0.00030893616652812224, + "loss": 3.0233, + "step": 30027 + }, + { + "epoch": 1.47, + "grad_norm": 0.5771879553794861, + "learning_rate": 0.0003089207771578195, + "loss": 3.1142, + "step": 30028 + }, + { + "epoch": 1.47, + "grad_norm": 0.5768468976020813, + "learning_rate": 0.00030890538776402125, + "loss": 3.2566, + "step": 30029 + }, + { + "epoch": 1.47, + "grad_norm": 0.6096814870834351, + "learning_rate": 0.0003088899983467679, + "loss": 3.0353, + "step": 30030 + }, + { + "epoch": 1.47, + "grad_norm": 0.5729770660400391, + "learning_rate": 0.0003088746089061, + "loss": 3.0693, + "step": 30031 + }, + { + "epoch": 1.47, + "grad_norm": 0.6186831593513489, + "learning_rate": 0.0003088592194420581, + "loss": 3.2713, + "step": 30032 + }, + { + "epoch": 1.47, + "grad_norm": 0.5830119848251343, + "learning_rate": 0.00030884382995468267, + "loss": 3.2221, + "step": 30033 + }, + { + "epoch": 1.47, + "grad_norm": 0.5844249129295349, + "learning_rate": 0.00030882844044401446, + "loss": 2.9112, + "step": 30034 + }, + { + "epoch": 1.47, + "grad_norm": 0.6245279312133789, + "learning_rate": 0.0003088130509100937, + "loss": 3.1312, + "step": 30035 + }, + { + "epoch": 1.47, + "grad_norm": 0.6162157654762268, + "learning_rate": 0.00030879766135296103, + "loss": 3.0903, + "step": 30036 + }, + { + "epoch": 1.47, + "grad_norm": 0.6323968172073364, + "learning_rate": 0.00030878227177265707, + "loss": 3.0378, + "step": 30037 + }, + { + "epoch": 1.47, + "grad_norm": 0.5921278595924377, + "learning_rate": 0.0003087668821692223, + "loss": 3.1487, + "step": 30038 + }, + { + "epoch": 1.47, + "grad_norm": 0.5776242613792419, + "learning_rate": 0.00030875149254269723, + "loss": 2.8284, + "step": 30039 + }, + { + "epoch": 1.47, + "grad_norm": 0.5871045589447021, + "learning_rate": 0.0003087361028931224, + "loss": 3.0864, + "step": 30040 + }, + { + "epoch": 1.47, + "grad_norm": 0.6193739175796509, + "learning_rate": 0.0003087207132205384, + "loss": 3.0912, + "step": 30041 + }, + { + "epoch": 1.47, + "grad_norm": 0.5853241086006165, + "learning_rate": 0.00030870532352498577, + "loss": 3.095, + "step": 30042 + }, + { + "epoch": 1.47, + "grad_norm": 0.5713692903518677, + "learning_rate": 0.00030868993380650495, + "loss": 2.9933, + "step": 30043 + }, + { + "epoch": 1.47, + "grad_norm": 0.589024543762207, + "learning_rate": 0.00030867454406513636, + "loss": 3.1075, + "step": 30044 + }, + { + "epoch": 1.47, + "grad_norm": 0.638248085975647, + "learning_rate": 0.0003086591543009209, + "loss": 3.1879, + "step": 30045 + }, + { + "epoch": 1.47, + "grad_norm": 0.5836499333381653, + "learning_rate": 0.00030864376451389885, + "loss": 3.2182, + "step": 30046 + }, + { + "epoch": 1.47, + "grad_norm": 0.6031222343444824, + "learning_rate": 0.0003086283747041107, + "loss": 3.1655, + "step": 30047 + }, + { + "epoch": 1.47, + "grad_norm": 0.5775892734527588, + "learning_rate": 0.00030861298487159714, + "loss": 3.1287, + "step": 30048 + }, + { + "epoch": 1.47, + "grad_norm": 0.5872842669487, + "learning_rate": 0.00030859759501639865, + "loss": 2.8683, + "step": 30049 + }, + { + "epoch": 1.47, + "grad_norm": 0.6183463931083679, + "learning_rate": 0.00030858220513855567, + "loss": 3.0174, + "step": 30050 + }, + { + "epoch": 1.47, + "grad_norm": 0.56663578748703, + "learning_rate": 0.0003085668152381088, + "loss": 3.0062, + "step": 30051 + }, + { + "epoch": 1.47, + "grad_norm": 0.610784649848938, + "learning_rate": 0.0003085514253150988, + "loss": 3.0837, + "step": 30052 + }, + { + "epoch": 1.47, + "grad_norm": 0.5991683602333069, + "learning_rate": 0.0003085360353695658, + "loss": 3.1117, + "step": 30053 + }, + { + "epoch": 1.47, + "grad_norm": 0.5896287560462952, + "learning_rate": 0.0003085206454015505, + "loss": 3.2123, + "step": 30054 + }, + { + "epoch": 1.47, + "grad_norm": 0.5315856337547302, + "learning_rate": 0.00030850525541109347, + "loss": 3.1274, + "step": 30055 + }, + { + "epoch": 1.47, + "grad_norm": 0.600836992263794, + "learning_rate": 0.0003084898653982354, + "loss": 3.0838, + "step": 30056 + }, + { + "epoch": 1.47, + "grad_norm": 0.6084719300270081, + "learning_rate": 0.00030847447536301657, + "loss": 2.7858, + "step": 30057 + }, + { + "epoch": 1.47, + "grad_norm": 0.597867488861084, + "learning_rate": 0.0003084590853054775, + "loss": 3.0085, + "step": 30058 + }, + { + "epoch": 1.47, + "grad_norm": 0.6317042112350464, + "learning_rate": 0.000308443695225659, + "loss": 2.9331, + "step": 30059 + }, + { + "epoch": 1.47, + "grad_norm": 0.6137262582778931, + "learning_rate": 0.00030842830512360127, + "loss": 3.237, + "step": 30060 + }, + { + "epoch": 1.47, + "grad_norm": 0.6111647486686707, + "learning_rate": 0.0003084129149993451, + "loss": 3.0709, + "step": 30061 + }, + { + "epoch": 1.47, + "grad_norm": 0.6254605650901794, + "learning_rate": 0.0003083975248529309, + "loss": 3.1905, + "step": 30062 + }, + { + "epoch": 1.47, + "grad_norm": 0.5825523138046265, + "learning_rate": 0.00030838213468439927, + "loss": 3.2801, + "step": 30063 + }, + { + "epoch": 1.47, + "grad_norm": 0.6181331276893616, + "learning_rate": 0.00030836674449379064, + "loss": 2.8734, + "step": 30064 + }, + { + "epoch": 1.47, + "grad_norm": 0.6186519861221313, + "learning_rate": 0.0003083513542811456, + "loss": 3.1002, + "step": 30065 + }, + { + "epoch": 1.47, + "grad_norm": 0.5745614767074585, + "learning_rate": 0.00030833596404650476, + "loss": 3.0784, + "step": 30066 + }, + { + "epoch": 1.47, + "grad_norm": 0.5756679773330688, + "learning_rate": 0.0003083205737899087, + "loss": 2.988, + "step": 30067 + }, + { + "epoch": 1.47, + "grad_norm": 0.6421597003936768, + "learning_rate": 0.0003083051835113977, + "loss": 3.2194, + "step": 30068 + }, + { + "epoch": 1.47, + "grad_norm": 0.610306978225708, + "learning_rate": 0.00030828979321101245, + "loss": 3.0658, + "step": 30069 + }, + { + "epoch": 1.47, + "grad_norm": 0.5943095088005066, + "learning_rate": 0.0003082744028887936, + "loss": 3.0469, + "step": 30070 + }, + { + "epoch": 1.47, + "grad_norm": 0.5513453483581543, + "learning_rate": 0.0003082590125447815, + "loss": 3.1213, + "step": 30071 + }, + { + "epoch": 1.47, + "grad_norm": 0.577889084815979, + "learning_rate": 0.0003082436221790167, + "loss": 2.8904, + "step": 30072 + }, + { + "epoch": 1.47, + "grad_norm": 0.5584579706192017, + "learning_rate": 0.0003082282317915398, + "loss": 2.9921, + "step": 30073 + }, + { + "epoch": 1.47, + "grad_norm": 0.5914309620857239, + "learning_rate": 0.00030821284138239137, + "loss": 3.0308, + "step": 30074 + }, + { + "epoch": 1.47, + "grad_norm": 0.5481913089752197, + "learning_rate": 0.00030819745095161184, + "loss": 3.0113, + "step": 30075 + }, + { + "epoch": 1.47, + "grad_norm": 0.6211130619049072, + "learning_rate": 0.00030818206049924184, + "loss": 3.1022, + "step": 30076 + }, + { + "epoch": 1.47, + "grad_norm": 0.5855414271354675, + "learning_rate": 0.0003081666700253218, + "loss": 2.867, + "step": 30077 + }, + { + "epoch": 1.47, + "grad_norm": 0.626336395740509, + "learning_rate": 0.00030815127952989245, + "loss": 3.2556, + "step": 30078 + }, + { + "epoch": 1.47, + "grad_norm": 0.5970550179481506, + "learning_rate": 0.00030813588901299404, + "loss": 3.2407, + "step": 30079 + }, + { + "epoch": 1.47, + "grad_norm": 0.6191651821136475, + "learning_rate": 0.00030812049847466736, + "loss": 3.1078, + "step": 30080 + }, + { + "epoch": 1.47, + "grad_norm": 0.6230210661888123, + "learning_rate": 0.00030810510791495294, + "loss": 3.3527, + "step": 30081 + }, + { + "epoch": 1.47, + "grad_norm": 0.6455768942832947, + "learning_rate": 0.00030808971733389107, + "loss": 2.8692, + "step": 30082 + }, + { + "epoch": 1.47, + "grad_norm": 0.6259360909461975, + "learning_rate": 0.00030807432673152257, + "loss": 3.0794, + "step": 30083 + }, + { + "epoch": 1.47, + "grad_norm": 0.5726274251937866, + "learning_rate": 0.00030805893610788776, + "loss": 3.0452, + "step": 30084 + }, + { + "epoch": 1.47, + "grad_norm": 0.5579006671905518, + "learning_rate": 0.00030804354546302726, + "loss": 2.9137, + "step": 30085 + }, + { + "epoch": 1.47, + "grad_norm": 0.582209050655365, + "learning_rate": 0.0003080281547969816, + "loss": 2.8087, + "step": 30086 + }, + { + "epoch": 1.47, + "grad_norm": 0.5763527750968933, + "learning_rate": 0.00030801276410979137, + "loss": 3.0132, + "step": 30087 + }, + { + "epoch": 1.47, + "grad_norm": 0.606497585773468, + "learning_rate": 0.00030799737340149707, + "loss": 2.8572, + "step": 30088 + }, + { + "epoch": 1.47, + "grad_norm": 0.6698089241981506, + "learning_rate": 0.0003079819826721392, + "loss": 2.9752, + "step": 30089 + }, + { + "epoch": 1.47, + "grad_norm": 0.6636598110198975, + "learning_rate": 0.00030796659192175826, + "loss": 3.3192, + "step": 30090 + }, + { + "epoch": 1.47, + "grad_norm": 0.5712068676948547, + "learning_rate": 0.0003079512011503949, + "loss": 2.9899, + "step": 30091 + }, + { + "epoch": 1.47, + "grad_norm": 0.5981823205947876, + "learning_rate": 0.00030793581035808967, + "loss": 3.0397, + "step": 30092 + }, + { + "epoch": 1.47, + "grad_norm": 0.5917591452598572, + "learning_rate": 0.0003079204195448831, + "loss": 3.2434, + "step": 30093 + }, + { + "epoch": 1.47, + "grad_norm": 0.616977870464325, + "learning_rate": 0.00030790502871081555, + "loss": 3.1531, + "step": 30094 + }, + { + "epoch": 1.47, + "grad_norm": 0.5743931531906128, + "learning_rate": 0.0003078896378559276, + "loss": 2.8923, + "step": 30095 + }, + { + "epoch": 1.47, + "grad_norm": 0.6095641851425171, + "learning_rate": 0.00030787424698026, + "loss": 3.196, + "step": 30096 + }, + { + "epoch": 1.47, + "grad_norm": 0.5719312429428101, + "learning_rate": 0.0003078588560838531, + "loss": 2.9232, + "step": 30097 + }, + { + "epoch": 1.48, + "grad_norm": 0.5918893218040466, + "learning_rate": 0.00030784346516674743, + "loss": 3.1492, + "step": 30098 + }, + { + "epoch": 1.48, + "grad_norm": 0.6037421822547913, + "learning_rate": 0.00030782807422898364, + "loss": 2.899, + "step": 30099 + }, + { + "epoch": 1.48, + "grad_norm": 0.5701082944869995, + "learning_rate": 0.00030781268327060223, + "loss": 2.8979, + "step": 30100 + }, + { + "epoch": 1.48, + "grad_norm": 0.5683816075325012, + "learning_rate": 0.0003077972922916437, + "loss": 2.8652, + "step": 30101 + }, + { + "epoch": 1.48, + "grad_norm": 0.5692213177680969, + "learning_rate": 0.0003077819012921486, + "loss": 3.1262, + "step": 30102 + }, + { + "epoch": 1.48, + "grad_norm": 0.5966072082519531, + "learning_rate": 0.00030776651027215754, + "loss": 3.1198, + "step": 30103 + }, + { + "epoch": 1.48, + "grad_norm": 0.59067302942276, + "learning_rate": 0.0003077511192317109, + "loss": 3.0482, + "step": 30104 + }, + { + "epoch": 1.48, + "grad_norm": 0.6178147196769714, + "learning_rate": 0.00030773572817084925, + "loss": 3.0714, + "step": 30105 + }, + { + "epoch": 1.48, + "grad_norm": 0.5838093161582947, + "learning_rate": 0.0003077203370896132, + "loss": 3.0316, + "step": 30106 + }, + { + "epoch": 1.48, + "grad_norm": 0.5537781715393066, + "learning_rate": 0.00030770494598804344, + "loss": 3.1259, + "step": 30107 + }, + { + "epoch": 1.48, + "grad_norm": 0.5612012147903442, + "learning_rate": 0.00030768955486618017, + "loss": 3.0059, + "step": 30108 + }, + { + "epoch": 1.48, + "grad_norm": 0.5880160927772522, + "learning_rate": 0.0003076741637240641, + "loss": 2.9501, + "step": 30109 + }, + { + "epoch": 1.48, + "grad_norm": 0.5850368142127991, + "learning_rate": 0.0003076587725617358, + "loss": 2.9588, + "step": 30110 + }, + { + "epoch": 1.48, + "grad_norm": 0.5933541655540466, + "learning_rate": 0.00030764338137923583, + "loss": 3.0509, + "step": 30111 + }, + { + "epoch": 1.48, + "grad_norm": 0.6294256448745728, + "learning_rate": 0.00030762799017660454, + "loss": 3.0801, + "step": 30112 + }, + { + "epoch": 1.48, + "grad_norm": 0.5729942917823792, + "learning_rate": 0.00030761259895388266, + "loss": 3.0384, + "step": 30113 + }, + { + "epoch": 1.48, + "grad_norm": 0.575340986251831, + "learning_rate": 0.0003075972077111107, + "loss": 3.067, + "step": 30114 + }, + { + "epoch": 1.48, + "grad_norm": 0.5830142498016357, + "learning_rate": 0.000307581816448329, + "loss": 3.1674, + "step": 30115 + }, + { + "epoch": 1.48, + "grad_norm": 0.5919670462608337, + "learning_rate": 0.00030756642516557836, + "loss": 3.2883, + "step": 30116 + }, + { + "epoch": 1.48, + "grad_norm": 0.5591514110565186, + "learning_rate": 0.00030755103386289923, + "loss": 2.8813, + "step": 30117 + }, + { + "epoch": 1.48, + "grad_norm": 0.6111984252929688, + "learning_rate": 0.0003075356425403322, + "loss": 3.2448, + "step": 30118 + }, + { + "epoch": 1.48, + "grad_norm": 0.5747086405754089, + "learning_rate": 0.0003075202511979176, + "loss": 3.2162, + "step": 30119 + }, + { + "epoch": 1.48, + "grad_norm": 0.6217588782310486, + "learning_rate": 0.0003075048598356961, + "loss": 3.0186, + "step": 30120 + }, + { + "epoch": 1.48, + "grad_norm": 0.5869501829147339, + "learning_rate": 0.00030748946845370837, + "loss": 3.1229, + "step": 30121 + }, + { + "epoch": 1.48, + "grad_norm": 0.6082846522331238, + "learning_rate": 0.00030747407705199474, + "loss": 3.0859, + "step": 30122 + }, + { + "epoch": 1.48, + "grad_norm": 0.6041231751441956, + "learning_rate": 0.00030745868563059585, + "loss": 3.1107, + "step": 30123 + }, + { + "epoch": 1.48, + "grad_norm": 0.5768772959709167, + "learning_rate": 0.0003074432941895522, + "loss": 3.1982, + "step": 30124 + }, + { + "epoch": 1.48, + "grad_norm": 0.5537108182907104, + "learning_rate": 0.00030742790272890436, + "loss": 2.9797, + "step": 30125 + }, + { + "epoch": 1.48, + "grad_norm": 0.582661509513855, + "learning_rate": 0.0003074125112486928, + "loss": 3.0316, + "step": 30126 + }, + { + "epoch": 1.48, + "grad_norm": 0.5950396656990051, + "learning_rate": 0.00030739711974895814, + "loss": 2.7202, + "step": 30127 + }, + { + "epoch": 1.48, + "grad_norm": 0.5830130577087402, + "learning_rate": 0.00030738172822974096, + "loss": 2.9846, + "step": 30128 + }, + { + "epoch": 1.48, + "grad_norm": 0.5635205507278442, + "learning_rate": 0.0003073663366910817, + "loss": 2.9789, + "step": 30129 + }, + { + "epoch": 1.48, + "grad_norm": 0.577469527721405, + "learning_rate": 0.00030735094513302085, + "loss": 2.9118, + "step": 30130 + }, + { + "epoch": 1.48, + "grad_norm": 0.603352963924408, + "learning_rate": 0.00030733555355559903, + "loss": 2.9981, + "step": 30131 + }, + { + "epoch": 1.48, + "grad_norm": 0.5949218273162842, + "learning_rate": 0.0003073201619588569, + "loss": 2.8709, + "step": 30132 + }, + { + "epoch": 1.48, + "grad_norm": 0.5829793214797974, + "learning_rate": 0.00030730477034283477, + "loss": 2.947, + "step": 30133 + }, + { + "epoch": 1.48, + "grad_norm": 0.5699021220207214, + "learning_rate": 0.0003072893787075733, + "loss": 3.1005, + "step": 30134 + }, + { + "epoch": 1.48, + "grad_norm": 0.5696962475776672, + "learning_rate": 0.000307273987053113, + "loss": 3.1134, + "step": 30135 + }, + { + "epoch": 1.48, + "grad_norm": 0.5930047631263733, + "learning_rate": 0.0003072585953794945, + "loss": 2.9946, + "step": 30136 + }, + { + "epoch": 1.48, + "grad_norm": 0.6077128052711487, + "learning_rate": 0.0003072432036867581, + "loss": 3.1896, + "step": 30137 + }, + { + "epoch": 1.48, + "grad_norm": 0.6007574796676636, + "learning_rate": 0.0003072278119749446, + "loss": 3.0898, + "step": 30138 + }, + { + "epoch": 1.48, + "grad_norm": 0.613781750202179, + "learning_rate": 0.00030721242024409447, + "loss": 2.9471, + "step": 30139 + }, + { + "epoch": 1.48, + "grad_norm": 0.5998162031173706, + "learning_rate": 0.0003071970284942482, + "loss": 3.1317, + "step": 30140 + }, + { + "epoch": 1.48, + "grad_norm": 0.5879024863243103, + "learning_rate": 0.0003071816367254462, + "loss": 3.2085, + "step": 30141 + }, + { + "epoch": 1.48, + "grad_norm": 0.5877510905265808, + "learning_rate": 0.00030716624493772927, + "loss": 3.1028, + "step": 30142 + }, + { + "epoch": 1.48, + "grad_norm": 0.6133305430412292, + "learning_rate": 0.00030715085313113785, + "loss": 2.9439, + "step": 30143 + }, + { + "epoch": 1.48, + "grad_norm": 0.5921503901481628, + "learning_rate": 0.0003071354613057124, + "loss": 2.9244, + "step": 30144 + }, + { + "epoch": 1.48, + "grad_norm": 0.5787282586097717, + "learning_rate": 0.0003071200694614936, + "loss": 2.927, + "step": 30145 + }, + { + "epoch": 1.48, + "grad_norm": 0.5823509097099304, + "learning_rate": 0.00030710467759852177, + "loss": 3.1512, + "step": 30146 + }, + { + "epoch": 1.48, + "grad_norm": 0.6131330132484436, + "learning_rate": 0.0003070892857168377, + "loss": 3.0768, + "step": 30147 + }, + { + "epoch": 1.48, + "grad_norm": 0.6605939865112305, + "learning_rate": 0.00030707389381648176, + "loss": 3.1089, + "step": 30148 + }, + { + "epoch": 1.48, + "grad_norm": 0.5737425088882446, + "learning_rate": 0.0003070585018974946, + "loss": 3.2414, + "step": 30149 + }, + { + "epoch": 1.48, + "grad_norm": 0.5738173127174377, + "learning_rate": 0.00030704310995991667, + "loss": 3.024, + "step": 30150 + }, + { + "epoch": 1.48, + "grad_norm": 0.6123209595680237, + "learning_rate": 0.00030702771800378856, + "loss": 3.0543, + "step": 30151 + }, + { + "epoch": 1.48, + "grad_norm": 0.5468517541885376, + "learning_rate": 0.00030701232602915073, + "loss": 3.029, + "step": 30152 + }, + { + "epoch": 1.48, + "grad_norm": 0.5879467725753784, + "learning_rate": 0.0003069969340360438, + "loss": 3.1501, + "step": 30153 + }, + { + "epoch": 1.48, + "grad_norm": 0.609234631061554, + "learning_rate": 0.0003069815420245084, + "loss": 3.1284, + "step": 30154 + }, + { + "epoch": 1.48, + "grad_norm": 0.5802608132362366, + "learning_rate": 0.00030696614999458494, + "loss": 3.3857, + "step": 30155 + }, + { + "epoch": 1.48, + "grad_norm": 0.5865260362625122, + "learning_rate": 0.00030695075794631385, + "loss": 2.9436, + "step": 30156 + }, + { + "epoch": 1.48, + "grad_norm": 0.5951496362686157, + "learning_rate": 0.00030693536587973584, + "loss": 3.2445, + "step": 30157 + }, + { + "epoch": 1.48, + "grad_norm": 0.59162837266922, + "learning_rate": 0.0003069199737948915, + "loss": 3.0387, + "step": 30158 + }, + { + "epoch": 1.48, + "grad_norm": 0.574717104434967, + "learning_rate": 0.0003069045816918212, + "loss": 2.8136, + "step": 30159 + }, + { + "epoch": 1.48, + "grad_norm": 0.6385117769241333, + "learning_rate": 0.00030688918957056565, + "loss": 2.971, + "step": 30160 + }, + { + "epoch": 1.48, + "grad_norm": 0.6385549902915955, + "learning_rate": 0.0003068737974311652, + "loss": 3.1969, + "step": 30161 + }, + { + "epoch": 1.48, + "grad_norm": 0.5969220399856567, + "learning_rate": 0.0003068584052736606, + "loss": 3.2271, + "step": 30162 + }, + { + "epoch": 1.48, + "grad_norm": 0.5814893841743469, + "learning_rate": 0.0003068430130980921, + "loss": 2.9764, + "step": 30163 + }, + { + "epoch": 1.48, + "grad_norm": 0.5654752850532532, + "learning_rate": 0.0003068276209045005, + "loss": 3.1884, + "step": 30164 + }, + { + "epoch": 1.48, + "grad_norm": 0.5874380469322205, + "learning_rate": 0.0003068122286929264, + "loss": 3.2886, + "step": 30165 + }, + { + "epoch": 1.48, + "grad_norm": 0.5981268286705017, + "learning_rate": 0.0003067968364634101, + "loss": 2.7295, + "step": 30166 + }, + { + "epoch": 1.48, + "grad_norm": 0.6147415041923523, + "learning_rate": 0.00030678144421599217, + "loss": 3.2141, + "step": 30167 + }, + { + "epoch": 1.48, + "grad_norm": 0.5960661768913269, + "learning_rate": 0.00030676605195071325, + "loss": 3.0085, + "step": 30168 + }, + { + "epoch": 1.48, + "grad_norm": 0.5469895601272583, + "learning_rate": 0.00030675065966761394, + "loss": 3.0473, + "step": 30169 + }, + { + "epoch": 1.48, + "grad_norm": 0.6334496736526489, + "learning_rate": 0.0003067352673667347, + "loss": 2.9559, + "step": 30170 + }, + { + "epoch": 1.48, + "grad_norm": 0.6375976800918579, + "learning_rate": 0.000306719875048116, + "loss": 3.0748, + "step": 30171 + }, + { + "epoch": 1.48, + "grad_norm": 0.5987905859947205, + "learning_rate": 0.0003067044827117984, + "loss": 2.7641, + "step": 30172 + }, + { + "epoch": 1.48, + "grad_norm": 0.6431529521942139, + "learning_rate": 0.00030668909035782253, + "loss": 2.9902, + "step": 30173 + }, + { + "epoch": 1.48, + "grad_norm": 0.5719122290611267, + "learning_rate": 0.00030667369798622886, + "loss": 3.1214, + "step": 30174 + }, + { + "epoch": 1.48, + "grad_norm": 0.5882480144500732, + "learning_rate": 0.00030665830559705804, + "loss": 3.3428, + "step": 30175 + }, + { + "epoch": 1.48, + "grad_norm": 0.5951686501502991, + "learning_rate": 0.00030664291319035047, + "loss": 3.058, + "step": 30176 + }, + { + "epoch": 1.48, + "grad_norm": 0.594463050365448, + "learning_rate": 0.00030662752076614664, + "loss": 2.9867, + "step": 30177 + }, + { + "epoch": 1.48, + "grad_norm": 0.6419702768325806, + "learning_rate": 0.0003066121283244873, + "loss": 3.0913, + "step": 30178 + }, + { + "epoch": 1.48, + "grad_norm": 0.6004655361175537, + "learning_rate": 0.0003065967358654129, + "loss": 3.0112, + "step": 30179 + }, + { + "epoch": 1.48, + "grad_norm": 0.6011263728141785, + "learning_rate": 0.00030658134338896396, + "loss": 2.9962, + "step": 30180 + }, + { + "epoch": 1.48, + "grad_norm": 0.5436926484107971, + "learning_rate": 0.000306565950895181, + "loss": 3.1637, + "step": 30181 + }, + { + "epoch": 1.48, + "grad_norm": 0.6261427998542786, + "learning_rate": 0.0003065505583841045, + "loss": 2.9735, + "step": 30182 + }, + { + "epoch": 1.48, + "grad_norm": 0.6169971227645874, + "learning_rate": 0.00030653516585577527, + "loss": 2.9467, + "step": 30183 + }, + { + "epoch": 1.48, + "grad_norm": 0.6584175825119019, + "learning_rate": 0.0003065197733102336, + "loss": 3.1038, + "step": 30184 + }, + { + "epoch": 1.48, + "grad_norm": 0.5873148441314697, + "learning_rate": 0.00030650438074752013, + "loss": 2.959, + "step": 30185 + }, + { + "epoch": 1.48, + "grad_norm": 0.5871749520301819, + "learning_rate": 0.0003064889881676753, + "loss": 2.9619, + "step": 30186 + }, + { + "epoch": 1.48, + "grad_norm": 0.6506713628768921, + "learning_rate": 0.00030647359557073973, + "loss": 3.1184, + "step": 30187 + }, + { + "epoch": 1.48, + "grad_norm": 0.5763834118843079, + "learning_rate": 0.000306458202956754, + "loss": 3.2246, + "step": 30188 + }, + { + "epoch": 1.48, + "grad_norm": 0.5880481600761414, + "learning_rate": 0.00030644281032575857, + "loss": 2.8984, + "step": 30189 + }, + { + "epoch": 1.48, + "grad_norm": 0.5734893679618835, + "learning_rate": 0.00030642741767779406, + "loss": 3.1565, + "step": 30190 + }, + { + "epoch": 1.48, + "grad_norm": 0.6054985523223877, + "learning_rate": 0.000306412025012901, + "loss": 2.9848, + "step": 30191 + }, + { + "epoch": 1.48, + "grad_norm": 0.5837637186050415, + "learning_rate": 0.00030639663233111977, + "loss": 3.0006, + "step": 30192 + }, + { + "epoch": 1.48, + "grad_norm": 0.5740242004394531, + "learning_rate": 0.00030638123963249114, + "loss": 2.8531, + "step": 30193 + }, + { + "epoch": 1.48, + "grad_norm": 0.5803605318069458, + "learning_rate": 0.0003063658469170556, + "loss": 2.9359, + "step": 30194 + }, + { + "epoch": 1.48, + "grad_norm": 0.6074859499931335, + "learning_rate": 0.0003063504541848535, + "loss": 3.085, + "step": 30195 + }, + { + "epoch": 1.48, + "grad_norm": 0.5865538120269775, + "learning_rate": 0.00030633506143592566, + "loss": 3.0822, + "step": 30196 + }, + { + "epoch": 1.48, + "grad_norm": 0.589298665523529, + "learning_rate": 0.0003063196686703123, + "loss": 3.2545, + "step": 30197 + }, + { + "epoch": 1.48, + "grad_norm": 0.5684748291969299, + "learning_rate": 0.0003063042758880544, + "loss": 2.933, + "step": 30198 + }, + { + "epoch": 1.48, + "grad_norm": 0.5615243911743164, + "learning_rate": 0.0003062888830891921, + "loss": 3.0079, + "step": 30199 + }, + { + "epoch": 1.48, + "grad_norm": 0.6056434512138367, + "learning_rate": 0.0003062734902737661, + "loss": 2.8702, + "step": 30200 + }, + { + "epoch": 1.48, + "grad_norm": 0.61896151304245, + "learning_rate": 0.00030625809744181694, + "loss": 3.1829, + "step": 30201 + }, + { + "epoch": 1.48, + "grad_norm": 0.5565640330314636, + "learning_rate": 0.0003062427045933853, + "loss": 3.0956, + "step": 30202 + }, + { + "epoch": 1.48, + "grad_norm": 0.634736180305481, + "learning_rate": 0.00030622731172851134, + "loss": 3.0891, + "step": 30203 + }, + { + "epoch": 1.48, + "grad_norm": 0.5769646763801575, + "learning_rate": 0.0003062119188472359, + "loss": 3.0662, + "step": 30204 + }, + { + "epoch": 1.48, + "grad_norm": 0.5608484148979187, + "learning_rate": 0.0003061965259495996, + "loss": 3.0664, + "step": 30205 + }, + { + "epoch": 1.48, + "grad_norm": 0.5996660590171814, + "learning_rate": 0.0003061811330356427, + "loss": 3.0626, + "step": 30206 + }, + { + "epoch": 1.48, + "grad_norm": 0.606673538684845, + "learning_rate": 0.00030616574010540595, + "loss": 3.0254, + "step": 30207 + }, + { + "epoch": 1.48, + "grad_norm": 0.6172423958778381, + "learning_rate": 0.0003061503471589298, + "loss": 2.9156, + "step": 30208 + }, + { + "epoch": 1.48, + "grad_norm": 0.5741183757781982, + "learning_rate": 0.00030613495419625484, + "loss": 3.0138, + "step": 30209 + }, + { + "epoch": 1.48, + "grad_norm": 0.5955791473388672, + "learning_rate": 0.00030611956121742164, + "loss": 3.0918, + "step": 30210 + }, + { + "epoch": 1.48, + "grad_norm": 0.5800381898880005, + "learning_rate": 0.0003061041682224706, + "loss": 3.1426, + "step": 30211 + }, + { + "epoch": 1.48, + "grad_norm": 0.6998795866966248, + "learning_rate": 0.0003060887752114424, + "loss": 3.0431, + "step": 30212 + }, + { + "epoch": 1.48, + "grad_norm": 0.5997923016548157, + "learning_rate": 0.00030607338218437754, + "loss": 2.8662, + "step": 30213 + }, + { + "epoch": 1.48, + "grad_norm": 0.5872188210487366, + "learning_rate": 0.00030605798914131653, + "loss": 2.9839, + "step": 30214 + }, + { + "epoch": 1.48, + "grad_norm": 0.6395425200462341, + "learning_rate": 0.00030604259608229997, + "loss": 2.9624, + "step": 30215 + }, + { + "epoch": 1.48, + "grad_norm": 0.5830799341201782, + "learning_rate": 0.0003060272030073684, + "loss": 3.0724, + "step": 30216 + }, + { + "epoch": 1.48, + "grad_norm": 0.5782269835472107, + "learning_rate": 0.00030601180991656233, + "loss": 3.0847, + "step": 30217 + }, + { + "epoch": 1.48, + "grad_norm": 0.5734887719154358, + "learning_rate": 0.0003059964168099222, + "loss": 3.0501, + "step": 30218 + }, + { + "epoch": 1.48, + "grad_norm": 0.6264581680297852, + "learning_rate": 0.00030598102368748875, + "loss": 3.0401, + "step": 30219 + }, + { + "epoch": 1.48, + "grad_norm": 0.6159687042236328, + "learning_rate": 0.00030596563054930244, + "loss": 3.3187, + "step": 30220 + }, + { + "epoch": 1.48, + "grad_norm": 0.601514458656311, + "learning_rate": 0.00030595023739540383, + "loss": 2.9992, + "step": 30221 + }, + { + "epoch": 1.48, + "grad_norm": 0.6012780070304871, + "learning_rate": 0.0003059348442258334, + "loss": 2.8195, + "step": 30222 + }, + { + "epoch": 1.48, + "grad_norm": 0.6135539412498474, + "learning_rate": 0.00030591945104063175, + "loss": 3.0922, + "step": 30223 + }, + { + "epoch": 1.48, + "grad_norm": 0.6214767098426819, + "learning_rate": 0.0003059040578398394, + "loss": 3.2023, + "step": 30224 + }, + { + "epoch": 1.48, + "grad_norm": 0.5887531042098999, + "learning_rate": 0.0003058886646234969, + "loss": 2.8366, + "step": 30225 + }, + { + "epoch": 1.48, + "grad_norm": 0.5788924694061279, + "learning_rate": 0.00030587327139164474, + "loss": 3.0441, + "step": 30226 + }, + { + "epoch": 1.48, + "grad_norm": 0.6139992475509644, + "learning_rate": 0.0003058578781443236, + "loss": 3.1177, + "step": 30227 + }, + { + "epoch": 1.48, + "grad_norm": 0.5603842735290527, + "learning_rate": 0.00030584248488157393, + "loss": 2.965, + "step": 30228 + }, + { + "epoch": 1.48, + "grad_norm": 0.5821728110313416, + "learning_rate": 0.00030582709160343615, + "loss": 2.9993, + "step": 30229 + }, + { + "epoch": 1.48, + "grad_norm": 0.5948326587677002, + "learning_rate": 0.00030581169830995106, + "loss": 2.8071, + "step": 30230 + }, + { + "epoch": 1.48, + "grad_norm": 0.6164664030075073, + "learning_rate": 0.0003057963050011591, + "loss": 3.2478, + "step": 30231 + }, + { + "epoch": 1.48, + "grad_norm": 0.6196146607398987, + "learning_rate": 0.00030578091167710075, + "loss": 2.7503, + "step": 30232 + }, + { + "epoch": 1.48, + "grad_norm": 0.6732343435287476, + "learning_rate": 0.00030576551833781646, + "loss": 3.0339, + "step": 30233 + }, + { + "epoch": 1.48, + "grad_norm": 0.5696006417274475, + "learning_rate": 0.00030575012498334705, + "loss": 2.8955, + "step": 30234 + }, + { + "epoch": 1.48, + "grad_norm": 0.5879353284835815, + "learning_rate": 0.0003057347316137329, + "loss": 3.0795, + "step": 30235 + }, + { + "epoch": 1.48, + "grad_norm": 0.5767134428024292, + "learning_rate": 0.0003057193382290146, + "loss": 2.9579, + "step": 30236 + }, + { + "epoch": 1.48, + "grad_norm": 0.5511177182197571, + "learning_rate": 0.00030570394482923254, + "loss": 3.0081, + "step": 30237 + }, + { + "epoch": 1.48, + "grad_norm": 0.5799663662910461, + "learning_rate": 0.0003056885514144276, + "loss": 3.1636, + "step": 30238 + }, + { + "epoch": 1.48, + "grad_norm": 0.582383394241333, + "learning_rate": 0.00030567315798463987, + "loss": 2.8857, + "step": 30239 + }, + { + "epoch": 1.48, + "grad_norm": 0.554225742816925, + "learning_rate": 0.00030565776453991025, + "loss": 2.9747, + "step": 30240 + }, + { + "epoch": 1.48, + "grad_norm": 0.6220203638076782, + "learning_rate": 0.00030564237108027915, + "loss": 2.9467, + "step": 30241 + }, + { + "epoch": 1.48, + "grad_norm": 0.5915405750274658, + "learning_rate": 0.0003056269776057872, + "loss": 2.7895, + "step": 30242 + }, + { + "epoch": 1.48, + "grad_norm": 0.5988735556602478, + "learning_rate": 0.0003056115841164748, + "loss": 2.9361, + "step": 30243 + }, + { + "epoch": 1.48, + "grad_norm": 0.6032905578613281, + "learning_rate": 0.0003055961906123825, + "loss": 3.1007, + "step": 30244 + }, + { + "epoch": 1.48, + "grad_norm": 0.6156014204025269, + "learning_rate": 0.00030558079709355106, + "loss": 2.9049, + "step": 30245 + }, + { + "epoch": 1.48, + "grad_norm": 0.6102397441864014, + "learning_rate": 0.0003055654035600208, + "loss": 2.9683, + "step": 30246 + }, + { + "epoch": 1.48, + "grad_norm": 0.6204525232315063, + "learning_rate": 0.0003055500100118323, + "loss": 3.0618, + "step": 30247 + }, + { + "epoch": 1.48, + "grad_norm": 0.6160491108894348, + "learning_rate": 0.0003055346164490262, + "loss": 3.1209, + "step": 30248 + }, + { + "epoch": 1.48, + "grad_norm": 0.5589972734451294, + "learning_rate": 0.000305519222871643, + "loss": 3.1358, + "step": 30249 + }, + { + "epoch": 1.48, + "grad_norm": 0.5976723432540894, + "learning_rate": 0.00030550382927972317, + "loss": 2.9208, + "step": 30250 + }, + { + "epoch": 1.48, + "grad_norm": 0.5512217283248901, + "learning_rate": 0.0003054884356733073, + "loss": 3.0062, + "step": 30251 + }, + { + "epoch": 1.48, + "grad_norm": 0.5562359690666199, + "learning_rate": 0.000305473042052436, + "loss": 3.0616, + "step": 30252 + }, + { + "epoch": 1.48, + "grad_norm": 0.6103971600532532, + "learning_rate": 0.0003054576484171498, + "loss": 2.8764, + "step": 30253 + }, + { + "epoch": 1.48, + "grad_norm": 0.5834851861000061, + "learning_rate": 0.00030544225476748907, + "loss": 3.1122, + "step": 30254 + }, + { + "epoch": 1.48, + "grad_norm": 0.5874277949333191, + "learning_rate": 0.0003054268611034946, + "loss": 3.0748, + "step": 30255 + }, + { + "epoch": 1.48, + "grad_norm": 0.5750007629394531, + "learning_rate": 0.00030541146742520684, + "loss": 3.0289, + "step": 30256 + }, + { + "epoch": 1.48, + "grad_norm": 0.5605846047401428, + "learning_rate": 0.0003053960737326663, + "loss": 2.9117, + "step": 30257 + }, + { + "epoch": 1.48, + "grad_norm": 0.577288031578064, + "learning_rate": 0.0003053806800259135, + "loss": 2.9049, + "step": 30258 + }, + { + "epoch": 1.48, + "grad_norm": 0.5720159411430359, + "learning_rate": 0.00030536528630498893, + "loss": 3.2772, + "step": 30259 + }, + { + "epoch": 1.48, + "grad_norm": 0.5890551805496216, + "learning_rate": 0.0003053498925699334, + "loss": 2.8408, + "step": 30260 + }, + { + "epoch": 1.48, + "grad_norm": 0.6078507900238037, + "learning_rate": 0.00030533449882078723, + "loss": 3.1933, + "step": 30261 + }, + { + "epoch": 1.48, + "grad_norm": 0.5361073017120361, + "learning_rate": 0.000305319105057591, + "loss": 3.0317, + "step": 30262 + }, + { + "epoch": 1.48, + "grad_norm": 0.6214322447776794, + "learning_rate": 0.00030530371128038527, + "loss": 2.9402, + "step": 30263 + }, + { + "epoch": 1.48, + "grad_norm": 0.5904223322868347, + "learning_rate": 0.00030528831748921056, + "loss": 3.1021, + "step": 30264 + }, + { + "epoch": 1.48, + "grad_norm": 0.6872094869613647, + "learning_rate": 0.0003052729236841075, + "loss": 3.0022, + "step": 30265 + }, + { + "epoch": 1.48, + "grad_norm": 0.6223731637001038, + "learning_rate": 0.0003052575298651165, + "loss": 2.7194, + "step": 30266 + }, + { + "epoch": 1.48, + "grad_norm": 0.592032253742218, + "learning_rate": 0.00030524213603227827, + "loss": 3.336, + "step": 30267 + }, + { + "epoch": 1.48, + "grad_norm": 0.5875898599624634, + "learning_rate": 0.0003052267421856332, + "loss": 3.031, + "step": 30268 + }, + { + "epoch": 1.48, + "grad_norm": 0.6270338892936707, + "learning_rate": 0.00030521134832522186, + "loss": 2.9393, + "step": 30269 + }, + { + "epoch": 1.48, + "grad_norm": 0.5827121734619141, + "learning_rate": 0.00030519595445108487, + "loss": 3.0373, + "step": 30270 + }, + { + "epoch": 1.48, + "grad_norm": 0.5908574461936951, + "learning_rate": 0.0003051805605632628, + "loss": 2.9094, + "step": 30271 + }, + { + "epoch": 1.48, + "grad_norm": 0.6578699946403503, + "learning_rate": 0.00030516516666179605, + "loss": 3.124, + "step": 30272 + }, + { + "epoch": 1.48, + "grad_norm": 0.6305190324783325, + "learning_rate": 0.0003051497727467253, + "loss": 3.0748, + "step": 30273 + }, + { + "epoch": 1.48, + "grad_norm": 0.5935174226760864, + "learning_rate": 0.000305134378818091, + "loss": 2.9313, + "step": 30274 + }, + { + "epoch": 1.48, + "grad_norm": 0.5942251682281494, + "learning_rate": 0.00030511898487593374, + "loss": 2.8511, + "step": 30275 + }, + { + "epoch": 1.48, + "grad_norm": 0.5874195694923401, + "learning_rate": 0.00030510359092029407, + "loss": 3.1529, + "step": 30276 + }, + { + "epoch": 1.48, + "grad_norm": 0.5620760917663574, + "learning_rate": 0.00030508819695121246, + "loss": 3.1332, + "step": 30277 + }, + { + "epoch": 1.48, + "grad_norm": 0.573728084564209, + "learning_rate": 0.0003050728029687297, + "loss": 3.0645, + "step": 30278 + }, + { + "epoch": 1.48, + "grad_norm": 0.5932513475418091, + "learning_rate": 0.000305057408972886, + "loss": 3.1181, + "step": 30279 + }, + { + "epoch": 1.48, + "grad_norm": 0.6048024296760559, + "learning_rate": 0.00030504201496372205, + "loss": 3.0854, + "step": 30280 + }, + { + "epoch": 1.48, + "grad_norm": 0.5609991550445557, + "learning_rate": 0.0003050266209412784, + "loss": 3.0611, + "step": 30281 + }, + { + "epoch": 1.48, + "grad_norm": 0.598209023475647, + "learning_rate": 0.00030501122690559576, + "loss": 3.0283, + "step": 30282 + }, + { + "epoch": 1.48, + "grad_norm": 0.5940065979957581, + "learning_rate": 0.00030499583285671433, + "loss": 3.0705, + "step": 30283 + }, + { + "epoch": 1.48, + "grad_norm": 0.6010187268257141, + "learning_rate": 0.0003049804387946749, + "loss": 3.1678, + "step": 30284 + }, + { + "epoch": 1.48, + "grad_norm": 0.6073846220970154, + "learning_rate": 0.00030496504471951794, + "loss": 3.0441, + "step": 30285 + }, + { + "epoch": 1.48, + "grad_norm": 0.6591005325317383, + "learning_rate": 0.000304949650631284, + "loss": 2.9177, + "step": 30286 + }, + { + "epoch": 1.48, + "grad_norm": 0.6189901828765869, + "learning_rate": 0.0003049342565300137, + "loss": 3.0777, + "step": 30287 + }, + { + "epoch": 1.48, + "grad_norm": 0.6542283892631531, + "learning_rate": 0.00030491886241574743, + "loss": 2.9349, + "step": 30288 + }, + { + "epoch": 1.48, + "grad_norm": 0.5866391062736511, + "learning_rate": 0.00030490346828852594, + "loss": 3.0771, + "step": 30289 + }, + { + "epoch": 1.48, + "grad_norm": 0.6572050452232361, + "learning_rate": 0.00030488807414838953, + "loss": 3.0787, + "step": 30290 + }, + { + "epoch": 1.48, + "grad_norm": 0.6037701368331909, + "learning_rate": 0.00030487267999537894, + "loss": 3.1791, + "step": 30291 + }, + { + "epoch": 1.48, + "grad_norm": 0.6054988503456116, + "learning_rate": 0.0003048572858295346, + "loss": 2.9688, + "step": 30292 + }, + { + "epoch": 1.48, + "grad_norm": 0.6137593984603882, + "learning_rate": 0.00030484189165089714, + "loss": 3.1096, + "step": 30293 + }, + { + "epoch": 1.48, + "grad_norm": 0.5879693031311035, + "learning_rate": 0.00030482649745950714, + "loss": 3.1275, + "step": 30294 + }, + { + "epoch": 1.48, + "grad_norm": 0.588402271270752, + "learning_rate": 0.00030481110325540487, + "loss": 3.0677, + "step": 30295 + }, + { + "epoch": 1.48, + "grad_norm": 0.5765928030014038, + "learning_rate": 0.00030479570903863126, + "loss": 3.2167, + "step": 30296 + }, + { + "epoch": 1.48, + "grad_norm": 0.6110025644302368, + "learning_rate": 0.0003047803148092267, + "loss": 3.1337, + "step": 30297 + }, + { + "epoch": 1.48, + "grad_norm": 0.6048384308815002, + "learning_rate": 0.00030476492056723156, + "loss": 2.8711, + "step": 30298 + }, + { + "epoch": 1.48, + "grad_norm": 0.6859794855117798, + "learning_rate": 0.0003047495263126867, + "loss": 3.0686, + "step": 30299 + }, + { + "epoch": 1.48, + "grad_norm": 0.6263118982315063, + "learning_rate": 0.0003047341320456324, + "loss": 3.052, + "step": 30300 + }, + { + "epoch": 1.48, + "grad_norm": 0.5739185214042664, + "learning_rate": 0.00030471873776610926, + "loss": 3.1724, + "step": 30301 + }, + { + "epoch": 1.49, + "grad_norm": 0.6050714254379272, + "learning_rate": 0.00030470334347415794, + "loss": 2.7828, + "step": 30302 + }, + { + "epoch": 1.49, + "grad_norm": 0.5985298156738281, + "learning_rate": 0.0003046879491698189, + "loss": 3.2557, + "step": 30303 + }, + { + "epoch": 1.49, + "grad_norm": 0.6600600481033325, + "learning_rate": 0.00030467255485313276, + "loss": 3.1668, + "step": 30304 + }, + { + "epoch": 1.49, + "grad_norm": 0.6146441698074341, + "learning_rate": 0.00030465716052413994, + "loss": 2.9902, + "step": 30305 + }, + { + "epoch": 1.49, + "grad_norm": 0.5583391785621643, + "learning_rate": 0.00030464176618288106, + "loss": 3.1554, + "step": 30306 + }, + { + "epoch": 1.49, + "grad_norm": 0.5961558222770691, + "learning_rate": 0.00030462637182939676, + "loss": 3.1391, + "step": 30307 + }, + { + "epoch": 1.49, + "grad_norm": 0.5696814656257629, + "learning_rate": 0.00030461097746372744, + "loss": 2.9831, + "step": 30308 + }, + { + "epoch": 1.49, + "grad_norm": 0.5873986482620239, + "learning_rate": 0.00030459558308591365, + "loss": 3.1334, + "step": 30309 + }, + { + "epoch": 1.49, + "grad_norm": 0.5988091230392456, + "learning_rate": 0.0003045801886959959, + "loss": 2.9959, + "step": 30310 + }, + { + "epoch": 1.49, + "grad_norm": 0.5809959173202515, + "learning_rate": 0.000304564794294015, + "loss": 3.0092, + "step": 30311 + }, + { + "epoch": 1.49, + "grad_norm": 0.647441565990448, + "learning_rate": 0.00030454939988001114, + "loss": 3.0226, + "step": 30312 + }, + { + "epoch": 1.49, + "grad_norm": 0.5705923438072205, + "learning_rate": 0.0003045340054540251, + "loss": 3.0666, + "step": 30313 + }, + { + "epoch": 1.49, + "grad_norm": 0.5672298669815063, + "learning_rate": 0.00030451861101609746, + "loss": 2.9993, + "step": 30314 + }, + { + "epoch": 1.49, + "grad_norm": 0.6036275029182434, + "learning_rate": 0.0003045032165662686, + "loss": 3.2952, + "step": 30315 + }, + { + "epoch": 1.49, + "grad_norm": 0.5786696672439575, + "learning_rate": 0.00030448782210457906, + "loss": 3.0459, + "step": 30316 + }, + { + "epoch": 1.49, + "grad_norm": 0.5882992148399353, + "learning_rate": 0.0003044724276310695, + "loss": 3.1442, + "step": 30317 + }, + { + "epoch": 1.49, + "grad_norm": 0.5856705904006958, + "learning_rate": 0.0003044570331457805, + "loss": 3.0808, + "step": 30318 + }, + { + "epoch": 1.49, + "grad_norm": 0.5906365513801575, + "learning_rate": 0.0003044416386487525, + "loss": 3.0573, + "step": 30319 + }, + { + "epoch": 1.49, + "grad_norm": 0.587424635887146, + "learning_rate": 0.00030442624414002607, + "loss": 2.9874, + "step": 30320 + }, + { + "epoch": 1.49, + "grad_norm": 0.5522072315216064, + "learning_rate": 0.00030441084961964164, + "loss": 3.0988, + "step": 30321 + }, + { + "epoch": 1.49, + "grad_norm": 0.617243230342865, + "learning_rate": 0.0003043954550876401, + "loss": 2.9937, + "step": 30322 + }, + { + "epoch": 1.49, + "grad_norm": 0.5649192929267883, + "learning_rate": 0.00030438006054406166, + "loss": 3.0672, + "step": 30323 + }, + { + "epoch": 1.49, + "grad_norm": 0.5680512189865112, + "learning_rate": 0.000304364665988947, + "loss": 2.9709, + "step": 30324 + }, + { + "epoch": 1.49, + "grad_norm": 0.560331404209137, + "learning_rate": 0.00030434927142233663, + "loss": 2.9678, + "step": 30325 + }, + { + "epoch": 1.49, + "grad_norm": 0.652197539806366, + "learning_rate": 0.00030433387684427113, + "loss": 2.8437, + "step": 30326 + }, + { + "epoch": 1.49, + "grad_norm": 0.5792146921157837, + "learning_rate": 0.000304318482254791, + "loss": 3.1141, + "step": 30327 + }, + { + "epoch": 1.49, + "grad_norm": 0.589543342590332, + "learning_rate": 0.0003043030876539368, + "loss": 3.0839, + "step": 30328 + }, + { + "epoch": 1.49, + "grad_norm": 0.613621175289154, + "learning_rate": 0.0003042876930417493, + "loss": 3.0055, + "step": 30329 + }, + { + "epoch": 1.49, + "grad_norm": 0.5933493971824646, + "learning_rate": 0.00030427229841826863, + "loss": 2.9089, + "step": 30330 + }, + { + "epoch": 1.49, + "grad_norm": 0.5833031535148621, + "learning_rate": 0.00030425690378353557, + "loss": 3.0106, + "step": 30331 + }, + { + "epoch": 1.49, + "grad_norm": 0.6349642872810364, + "learning_rate": 0.00030424150913759067, + "loss": 3.0396, + "step": 30332 + }, + { + "epoch": 1.49, + "grad_norm": 0.6486304998397827, + "learning_rate": 0.0003042261144804745, + "loss": 3.0428, + "step": 30333 + }, + { + "epoch": 1.49, + "grad_norm": 0.6241731643676758, + "learning_rate": 0.0003042107198122275, + "loss": 3.0911, + "step": 30334 + }, + { + "epoch": 1.49, + "grad_norm": 0.6061176657676697, + "learning_rate": 0.0003041953251328903, + "loss": 2.8672, + "step": 30335 + }, + { + "epoch": 1.49, + "grad_norm": 0.5616305470466614, + "learning_rate": 0.00030417993044250344, + "loss": 3.0808, + "step": 30336 + }, + { + "epoch": 1.49, + "grad_norm": 0.5880496501922607, + "learning_rate": 0.00030416453574110734, + "loss": 3.1441, + "step": 30337 + }, + { + "epoch": 1.49, + "grad_norm": 0.5873252749443054, + "learning_rate": 0.0003041491410287428, + "loss": 3.2033, + "step": 30338 + }, + { + "epoch": 1.49, + "grad_norm": 0.5877861380577087, + "learning_rate": 0.0003041337463054501, + "loss": 3.1364, + "step": 30339 + }, + { + "epoch": 1.49, + "grad_norm": 0.6224821209907532, + "learning_rate": 0.00030411835157127007, + "loss": 3.01, + "step": 30340 + }, + { + "epoch": 1.49, + "grad_norm": 0.5856956839561462, + "learning_rate": 0.0003041029568262429, + "loss": 3.1922, + "step": 30341 + }, + { + "epoch": 1.49, + "grad_norm": 0.6419745683670044, + "learning_rate": 0.0003040875620704094, + "loss": 3.0602, + "step": 30342 + }, + { + "epoch": 1.49, + "grad_norm": 0.7789932489395142, + "learning_rate": 0.0003040721673038101, + "loss": 2.9055, + "step": 30343 + }, + { + "epoch": 1.49, + "grad_norm": 0.5722334384918213, + "learning_rate": 0.00030405677252648547, + "loss": 3.3164, + "step": 30344 + }, + { + "epoch": 1.49, + "grad_norm": 0.6217427849769592, + "learning_rate": 0.0003040413777384761, + "loss": 3.1159, + "step": 30345 + }, + { + "epoch": 1.49, + "grad_norm": 0.6086733937263489, + "learning_rate": 0.00030402598293982245, + "loss": 2.9625, + "step": 30346 + }, + { + "epoch": 1.49, + "grad_norm": 0.5802361965179443, + "learning_rate": 0.00030401058813056526, + "loss": 3.132, + "step": 30347 + }, + { + "epoch": 1.49, + "grad_norm": 0.617330014705658, + "learning_rate": 0.00030399519331074485, + "loss": 2.8512, + "step": 30348 + }, + { + "epoch": 1.49, + "grad_norm": 0.5500723123550415, + "learning_rate": 0.00030397979848040187, + "loss": 3.1054, + "step": 30349 + }, + { + "epoch": 1.49, + "grad_norm": 0.5912415981292725, + "learning_rate": 0.00030396440363957683, + "loss": 2.6677, + "step": 30350 + }, + { + "epoch": 1.49, + "grad_norm": 0.5975785255432129, + "learning_rate": 0.0003039490087883104, + "loss": 3.1374, + "step": 30351 + }, + { + "epoch": 1.49, + "grad_norm": 0.6210824251174927, + "learning_rate": 0.000303933613926643, + "loss": 3.0865, + "step": 30352 + }, + { + "epoch": 1.49, + "grad_norm": 0.5919585824012756, + "learning_rate": 0.00030391821905461524, + "loss": 2.9867, + "step": 30353 + }, + { + "epoch": 1.49, + "grad_norm": 0.6664869785308838, + "learning_rate": 0.00030390282417226764, + "loss": 3.0154, + "step": 30354 + }, + { + "epoch": 1.49, + "grad_norm": 0.6210564970970154, + "learning_rate": 0.00030388742927964083, + "loss": 3.0155, + "step": 30355 + }, + { + "epoch": 1.49, + "grad_norm": 0.6251997947692871, + "learning_rate": 0.0003038720343767752, + "loss": 3.2516, + "step": 30356 + }, + { + "epoch": 1.49, + "grad_norm": 0.6180994510650635, + "learning_rate": 0.0003038566394637113, + "loss": 2.9035, + "step": 30357 + }, + { + "epoch": 1.49, + "grad_norm": 0.590447187423706, + "learning_rate": 0.00030384124454048986, + "loss": 3.1161, + "step": 30358 + }, + { + "epoch": 1.49, + "grad_norm": 0.5581066608428955, + "learning_rate": 0.00030382584960715137, + "loss": 2.945, + "step": 30359 + }, + { + "epoch": 1.49, + "grad_norm": 0.6092578768730164, + "learning_rate": 0.0003038104546637362, + "loss": 3.1729, + "step": 30360 + }, + { + "epoch": 1.49, + "grad_norm": 0.5907177329063416, + "learning_rate": 0.00030379505971028504, + "loss": 2.8266, + "step": 30361 + }, + { + "epoch": 1.49, + "grad_norm": 0.5585633516311646, + "learning_rate": 0.0003037796647468385, + "loss": 3.0163, + "step": 30362 + }, + { + "epoch": 1.49, + "grad_norm": 0.5564131736755371, + "learning_rate": 0.000303764269773437, + "loss": 3.1012, + "step": 30363 + }, + { + "epoch": 1.49, + "grad_norm": 0.578472375869751, + "learning_rate": 0.00030374887479012115, + "loss": 2.98, + "step": 30364 + }, + { + "epoch": 1.49, + "grad_norm": 0.5936715602874756, + "learning_rate": 0.00030373347979693145, + "loss": 3.014, + "step": 30365 + }, + { + "epoch": 1.49, + "grad_norm": 0.6830310821533203, + "learning_rate": 0.00030371808479390857, + "loss": 3.0181, + "step": 30366 + }, + { + "epoch": 1.49, + "grad_norm": 0.5550583600997925, + "learning_rate": 0.0003037026897810928, + "loss": 3.1111, + "step": 30367 + }, + { + "epoch": 1.49, + "grad_norm": 0.5920956134796143, + "learning_rate": 0.000303687294758525, + "loss": 3.0784, + "step": 30368 + }, + { + "epoch": 1.49, + "grad_norm": 0.6229501366615295, + "learning_rate": 0.00030367189972624564, + "loss": 3.1064, + "step": 30369 + }, + { + "epoch": 1.49, + "grad_norm": 0.6036462783813477, + "learning_rate": 0.0003036565046842951, + "loss": 2.8816, + "step": 30370 + }, + { + "epoch": 1.49, + "grad_norm": 0.5995955467224121, + "learning_rate": 0.00030364110963271404, + "loss": 3.0448, + "step": 30371 + }, + { + "epoch": 1.49, + "grad_norm": 0.5678886771202087, + "learning_rate": 0.00030362571457154296, + "loss": 3.1554, + "step": 30372 + }, + { + "epoch": 1.49, + "grad_norm": 0.5802719593048096, + "learning_rate": 0.0003036103195008225, + "loss": 3.1493, + "step": 30373 + }, + { + "epoch": 1.49, + "grad_norm": 0.5747050642967224, + "learning_rate": 0.00030359492442059315, + "loss": 3.1413, + "step": 30374 + }, + { + "epoch": 1.49, + "grad_norm": 0.6112601161003113, + "learning_rate": 0.0003035795293308954, + "loss": 3.0674, + "step": 30375 + }, + { + "epoch": 1.49, + "grad_norm": 0.5865578055381775, + "learning_rate": 0.0003035641342317699, + "loss": 2.955, + "step": 30376 + }, + { + "epoch": 1.49, + "grad_norm": 0.7048220038414001, + "learning_rate": 0.0003035487391232572, + "loss": 3.0602, + "step": 30377 + }, + { + "epoch": 1.49, + "grad_norm": 0.585225522518158, + "learning_rate": 0.00030353334400539777, + "loss": 2.9674, + "step": 30378 + }, + { + "epoch": 1.49, + "grad_norm": 0.6342913508415222, + "learning_rate": 0.0003035179488782322, + "loss": 3.0666, + "step": 30379 + }, + { + "epoch": 1.49, + "grad_norm": 0.5397252440452576, + "learning_rate": 0.0003035025537418011, + "loss": 3.041, + "step": 30380 + }, + { + "epoch": 1.49, + "grad_norm": 0.5967416167259216, + "learning_rate": 0.0003034871585961448, + "loss": 3.2197, + "step": 30381 + }, + { + "epoch": 1.49, + "grad_norm": 0.5769542455673218, + "learning_rate": 0.000303471763441304, + "loss": 3.0792, + "step": 30382 + }, + { + "epoch": 1.49, + "grad_norm": 0.5737739205360413, + "learning_rate": 0.00030345636827731936, + "loss": 3.0399, + "step": 30383 + }, + { + "epoch": 1.49, + "grad_norm": 0.5969375967979431, + "learning_rate": 0.00030344097310423124, + "loss": 3.2229, + "step": 30384 + }, + { + "epoch": 1.49, + "grad_norm": 0.5473321676254272, + "learning_rate": 0.00030342557792208027, + "loss": 3.1632, + "step": 30385 + }, + { + "epoch": 1.49, + "grad_norm": 0.5815901756286621, + "learning_rate": 0.000303410182730907, + "loss": 3.0973, + "step": 30386 + }, + { + "epoch": 1.49, + "grad_norm": 0.5692906975746155, + "learning_rate": 0.00030339478753075194, + "loss": 2.9992, + "step": 30387 + }, + { + "epoch": 1.49, + "grad_norm": 0.5809762477874756, + "learning_rate": 0.0003033793923216557, + "loss": 3.1701, + "step": 30388 + }, + { + "epoch": 1.49, + "grad_norm": 0.5709482431411743, + "learning_rate": 0.00030336399710365877, + "loss": 3.102, + "step": 30389 + }, + { + "epoch": 1.49, + "grad_norm": 0.578090250492096, + "learning_rate": 0.0003033486018768017, + "loss": 3.0336, + "step": 30390 + }, + { + "epoch": 1.49, + "grad_norm": 0.6145226359367371, + "learning_rate": 0.0003033332066411252, + "loss": 2.9575, + "step": 30391 + }, + { + "epoch": 1.49, + "grad_norm": 0.5879982113838196, + "learning_rate": 0.00030331781139666947, + "loss": 2.979, + "step": 30392 + }, + { + "epoch": 1.49, + "grad_norm": 0.5679128766059875, + "learning_rate": 0.0003033024161434753, + "loss": 3.4305, + "step": 30393 + }, + { + "epoch": 1.49, + "grad_norm": 0.6054519414901733, + "learning_rate": 0.00030328702088158326, + "loss": 3.1972, + "step": 30394 + }, + { + "epoch": 1.49, + "grad_norm": 0.5590806603431702, + "learning_rate": 0.0003032716256110339, + "loss": 3.0287, + "step": 30395 + }, + { + "epoch": 1.49, + "grad_norm": 0.660683274269104, + "learning_rate": 0.00030325623033186763, + "loss": 3.0268, + "step": 30396 + }, + { + "epoch": 1.49, + "grad_norm": 0.5682603716850281, + "learning_rate": 0.0003032408350441251, + "loss": 3.1007, + "step": 30397 + }, + { + "epoch": 1.49, + "grad_norm": 0.5996838808059692, + "learning_rate": 0.00030322543974784676, + "loss": 2.8001, + "step": 30398 + }, + { + "epoch": 1.49, + "grad_norm": 0.5741966962814331, + "learning_rate": 0.0003032100444430733, + "loss": 3.041, + "step": 30399 + }, + { + "epoch": 1.49, + "grad_norm": 0.604943573474884, + "learning_rate": 0.0003031946491298452, + "loss": 3.1721, + "step": 30400 + }, + { + "epoch": 1.49, + "grad_norm": 0.5835703015327454, + "learning_rate": 0.000303179253808203, + "loss": 2.8629, + "step": 30401 + }, + { + "epoch": 1.49, + "grad_norm": 0.5605635046958923, + "learning_rate": 0.0003031638584781873, + "loss": 2.9483, + "step": 30402 + }, + { + "epoch": 1.49, + "grad_norm": 0.5778949856758118, + "learning_rate": 0.0003031484631398385, + "loss": 2.8903, + "step": 30403 + }, + { + "epoch": 1.49, + "grad_norm": 0.5777247548103333, + "learning_rate": 0.0003031330677931973, + "loss": 3.0132, + "step": 30404 + }, + { + "epoch": 1.49, + "grad_norm": 0.6003771424293518, + "learning_rate": 0.0003031176724383042, + "loss": 2.9525, + "step": 30405 + }, + { + "epoch": 1.49, + "grad_norm": 0.6186116337776184, + "learning_rate": 0.00030310227707519983, + "loss": 2.8372, + "step": 30406 + }, + { + "epoch": 1.49, + "grad_norm": 0.7651210427284241, + "learning_rate": 0.0003030868817039246, + "loss": 3.118, + "step": 30407 + }, + { + "epoch": 1.49, + "grad_norm": 0.5799486637115479, + "learning_rate": 0.00030307148632451904, + "loss": 2.9309, + "step": 30408 + }, + { + "epoch": 1.49, + "grad_norm": 0.5795913934707642, + "learning_rate": 0.0003030560909370239, + "loss": 2.9225, + "step": 30409 + }, + { + "epoch": 1.49, + "grad_norm": 0.6140474081039429, + "learning_rate": 0.0003030406955414796, + "loss": 2.9041, + "step": 30410 + }, + { + "epoch": 1.49, + "grad_norm": 0.5724575519561768, + "learning_rate": 0.00030302530013792656, + "loss": 3.0642, + "step": 30411 + }, + { + "epoch": 1.49, + "grad_norm": 0.5742631554603577, + "learning_rate": 0.0003030099047264056, + "loss": 2.8497, + "step": 30412 + }, + { + "epoch": 1.49, + "grad_norm": 0.6531141996383667, + "learning_rate": 0.0003029945093069571, + "loss": 3.0945, + "step": 30413 + }, + { + "epoch": 1.49, + "grad_norm": 0.5915018916130066, + "learning_rate": 0.0003029791138796216, + "loss": 3.1048, + "step": 30414 + }, + { + "epoch": 1.49, + "grad_norm": 0.5942657589912415, + "learning_rate": 0.0003029637184444397, + "loss": 2.9497, + "step": 30415 + }, + { + "epoch": 1.49, + "grad_norm": 0.5846244096755981, + "learning_rate": 0.00030294832300145196, + "loss": 2.9391, + "step": 30416 + }, + { + "epoch": 1.49, + "grad_norm": 0.5968058109283447, + "learning_rate": 0.0003029329275506989, + "loss": 2.9962, + "step": 30417 + }, + { + "epoch": 1.49, + "grad_norm": 0.5921707153320312, + "learning_rate": 0.00030291753209222097, + "loss": 3.0783, + "step": 30418 + }, + { + "epoch": 1.49, + "grad_norm": 0.6028823852539062, + "learning_rate": 0.00030290213662605896, + "loss": 3.063, + "step": 30419 + }, + { + "epoch": 1.49, + "grad_norm": 0.577496349811554, + "learning_rate": 0.00030288674115225327, + "loss": 3.1577, + "step": 30420 + }, + { + "epoch": 1.49, + "grad_norm": 0.5886926054954529, + "learning_rate": 0.0003028713456708444, + "loss": 3.0918, + "step": 30421 + }, + { + "epoch": 1.49, + "grad_norm": 0.6270083785057068, + "learning_rate": 0.000302855950181873, + "loss": 3.2039, + "step": 30422 + }, + { + "epoch": 1.49, + "grad_norm": 0.5588559508323669, + "learning_rate": 0.00030284055468537946, + "loss": 2.9458, + "step": 30423 + }, + { + "epoch": 1.49, + "grad_norm": 0.5802051424980164, + "learning_rate": 0.0003028251591814046, + "loss": 3.1478, + "step": 30424 + }, + { + "epoch": 1.49, + "grad_norm": 0.5683912038803101, + "learning_rate": 0.00030280976366998876, + "loss": 2.8673, + "step": 30425 + }, + { + "epoch": 1.49, + "grad_norm": 0.5958796143531799, + "learning_rate": 0.0003027943681511726, + "loss": 3.0611, + "step": 30426 + }, + { + "epoch": 1.49, + "grad_norm": 0.5795177817344666, + "learning_rate": 0.0003027789726249965, + "loss": 2.9482, + "step": 30427 + }, + { + "epoch": 1.49, + "grad_norm": 0.6204610466957092, + "learning_rate": 0.0003027635770915012, + "loss": 3.1761, + "step": 30428 + }, + { + "epoch": 1.49, + "grad_norm": 0.5817151069641113, + "learning_rate": 0.0003027481815507271, + "loss": 3.0828, + "step": 30429 + }, + { + "epoch": 1.49, + "grad_norm": 0.5747313499450684, + "learning_rate": 0.00030273278600271485, + "loss": 2.9711, + "step": 30430 + }, + { + "epoch": 1.49, + "grad_norm": 0.6377949714660645, + "learning_rate": 0.00030271739044750504, + "loss": 2.9626, + "step": 30431 + }, + { + "epoch": 1.49, + "grad_norm": 0.599992036819458, + "learning_rate": 0.0003027019948851381, + "loss": 3.3038, + "step": 30432 + }, + { + "epoch": 1.49, + "grad_norm": 0.5743573307991028, + "learning_rate": 0.00030268659931565465, + "loss": 3.0086, + "step": 30433 + }, + { + "epoch": 1.49, + "grad_norm": 0.5959254503250122, + "learning_rate": 0.00030267120373909506, + "loss": 3.1205, + "step": 30434 + }, + { + "epoch": 1.49, + "grad_norm": 0.5806088447570801, + "learning_rate": 0.00030265580815550024, + "loss": 2.9793, + "step": 30435 + }, + { + "epoch": 1.49, + "grad_norm": 0.6778531074523926, + "learning_rate": 0.0003026404125649105, + "loss": 2.9428, + "step": 30436 + }, + { + "epoch": 1.49, + "grad_norm": 0.5825092792510986, + "learning_rate": 0.0003026250169673663, + "loss": 2.9743, + "step": 30437 + }, + { + "epoch": 1.49, + "grad_norm": 0.5714017748832703, + "learning_rate": 0.00030260962136290835, + "loss": 2.9662, + "step": 30438 + }, + { + "epoch": 1.49, + "grad_norm": 0.5609602928161621, + "learning_rate": 0.0003025942257515772, + "loss": 2.913, + "step": 30439 + }, + { + "epoch": 1.49, + "grad_norm": 0.5940423011779785, + "learning_rate": 0.00030257883013341336, + "loss": 3.082, + "step": 30440 + }, + { + "epoch": 1.49, + "grad_norm": 0.5933805704116821, + "learning_rate": 0.0003025634345084573, + "loss": 3.3069, + "step": 30441 + }, + { + "epoch": 1.49, + "grad_norm": 0.6016649603843689, + "learning_rate": 0.0003025480388767498, + "loss": 3.1425, + "step": 30442 + }, + { + "epoch": 1.49, + "grad_norm": 0.5964870452880859, + "learning_rate": 0.0003025326432383312, + "loss": 3.0939, + "step": 30443 + }, + { + "epoch": 1.49, + "grad_norm": 0.6471694111824036, + "learning_rate": 0.0003025172475932419, + "loss": 3.2347, + "step": 30444 + }, + { + "epoch": 1.49, + "grad_norm": 0.5884523987770081, + "learning_rate": 0.00030250185194152286, + "loss": 2.8428, + "step": 30445 + }, + { + "epoch": 1.49, + "grad_norm": 0.6636972427368164, + "learning_rate": 0.0003024864562832144, + "loss": 3.0002, + "step": 30446 + }, + { + "epoch": 1.49, + "grad_norm": 0.6032358407974243, + "learning_rate": 0.00030247106061835713, + "loss": 3.0426, + "step": 30447 + }, + { + "epoch": 1.49, + "grad_norm": 0.6321635842323303, + "learning_rate": 0.0003024556649469915, + "loss": 2.8345, + "step": 30448 + }, + { + "epoch": 1.49, + "grad_norm": 0.5757449865341187, + "learning_rate": 0.00030244026926915814, + "loss": 2.9545, + "step": 30449 + }, + { + "epoch": 1.49, + "grad_norm": 0.5811001658439636, + "learning_rate": 0.00030242487358489754, + "loss": 3.0403, + "step": 30450 + }, + { + "epoch": 1.49, + "grad_norm": 0.582600474357605, + "learning_rate": 0.00030240947789425033, + "loss": 2.9595, + "step": 30451 + }, + { + "epoch": 1.49, + "grad_norm": 0.6360583305358887, + "learning_rate": 0.000302394082197257, + "loss": 2.8013, + "step": 30452 + }, + { + "epoch": 1.49, + "grad_norm": 0.6164677739143372, + "learning_rate": 0.00030237868649395815, + "loss": 3.1337, + "step": 30453 + }, + { + "epoch": 1.49, + "grad_norm": 0.5845510363578796, + "learning_rate": 0.0003023632907843942, + "loss": 3.1204, + "step": 30454 + }, + { + "epoch": 1.49, + "grad_norm": 0.5880382657051086, + "learning_rate": 0.00030234789506860594, + "loss": 3.1311, + "step": 30455 + }, + { + "epoch": 1.49, + "grad_norm": 0.6087186932563782, + "learning_rate": 0.0003023324993466337, + "loss": 3.1319, + "step": 30456 + }, + { + "epoch": 1.49, + "grad_norm": 0.6022242307662964, + "learning_rate": 0.00030231710361851814, + "loss": 3.0978, + "step": 30457 + }, + { + "epoch": 1.49, + "grad_norm": 0.5847160816192627, + "learning_rate": 0.0003023017078842998, + "loss": 2.8099, + "step": 30458 + }, + { + "epoch": 1.49, + "grad_norm": 0.5767192840576172, + "learning_rate": 0.00030228631214401905, + "loss": 3.2496, + "step": 30459 + }, + { + "epoch": 1.49, + "grad_norm": 0.5725535154342651, + "learning_rate": 0.00030227091639771676, + "loss": 3.0633, + "step": 30460 + }, + { + "epoch": 1.49, + "grad_norm": 0.6081965565681458, + "learning_rate": 0.00030225552064543326, + "loss": 3.1437, + "step": 30461 + }, + { + "epoch": 1.49, + "grad_norm": 0.6389308571815491, + "learning_rate": 0.0003022401248872091, + "loss": 3.1606, + "step": 30462 + }, + { + "epoch": 1.49, + "grad_norm": 0.5722255110740662, + "learning_rate": 0.0003022247291230849, + "loss": 3.1019, + "step": 30463 + }, + { + "epoch": 1.49, + "grad_norm": 0.6214210987091064, + "learning_rate": 0.00030220933335310126, + "loss": 3.0384, + "step": 30464 + }, + { + "epoch": 1.49, + "grad_norm": 0.5633954405784607, + "learning_rate": 0.0003021939375772986, + "loss": 2.9655, + "step": 30465 + }, + { + "epoch": 1.49, + "grad_norm": 0.583755373954773, + "learning_rate": 0.00030217854179571756, + "loss": 3.0302, + "step": 30466 + }, + { + "epoch": 1.49, + "grad_norm": 0.5567439794540405, + "learning_rate": 0.00030216314600839866, + "loss": 3.1762, + "step": 30467 + }, + { + "epoch": 1.49, + "grad_norm": 0.6298270225524902, + "learning_rate": 0.00030214775021538244, + "loss": 2.9571, + "step": 30468 + }, + { + "epoch": 1.49, + "grad_norm": 0.5982662439346313, + "learning_rate": 0.00030213235441670945, + "loss": 2.9339, + "step": 30469 + }, + { + "epoch": 1.49, + "grad_norm": 0.5737664103507996, + "learning_rate": 0.00030211695861242024, + "loss": 3.0476, + "step": 30470 + }, + { + "epoch": 1.49, + "grad_norm": 0.5396885275840759, + "learning_rate": 0.00030210156280255547, + "loss": 3.04, + "step": 30471 + }, + { + "epoch": 1.49, + "grad_norm": 0.6123176217079163, + "learning_rate": 0.00030208616698715546, + "loss": 2.9829, + "step": 30472 + }, + { + "epoch": 1.49, + "grad_norm": 0.6010289788246155, + "learning_rate": 0.00030207077116626093, + "loss": 3.0169, + "step": 30473 + }, + { + "epoch": 1.49, + "grad_norm": 0.5837356448173523, + "learning_rate": 0.0003020553753399124, + "loss": 3.0827, + "step": 30474 + }, + { + "epoch": 1.49, + "grad_norm": 0.6117029190063477, + "learning_rate": 0.0003020399795081504, + "loss": 2.9524, + "step": 30475 + }, + { + "epoch": 1.49, + "grad_norm": 0.595624566078186, + "learning_rate": 0.0003020245836710155, + "loss": 3.2267, + "step": 30476 + }, + { + "epoch": 1.49, + "grad_norm": 0.5812867879867554, + "learning_rate": 0.0003020091878285482, + "loss": 3.0827, + "step": 30477 + }, + { + "epoch": 1.49, + "grad_norm": 0.6493427157402039, + "learning_rate": 0.0003019937919807891, + "loss": 3.1577, + "step": 30478 + }, + { + "epoch": 1.49, + "grad_norm": 0.5776815414428711, + "learning_rate": 0.0003019783961277788, + "loss": 2.8263, + "step": 30479 + }, + { + "epoch": 1.49, + "grad_norm": 0.5874110460281372, + "learning_rate": 0.00030196300026955763, + "loss": 3.2444, + "step": 30480 + }, + { + "epoch": 1.49, + "grad_norm": 0.5695233345031738, + "learning_rate": 0.0003019476044061664, + "loss": 2.9263, + "step": 30481 + }, + { + "epoch": 1.49, + "grad_norm": 0.5816549062728882, + "learning_rate": 0.0003019322085376456, + "loss": 3.0367, + "step": 30482 + }, + { + "epoch": 1.49, + "grad_norm": 0.5536554455757141, + "learning_rate": 0.0003019168126640357, + "loss": 2.8394, + "step": 30483 + }, + { + "epoch": 1.49, + "grad_norm": 0.5806564688682556, + "learning_rate": 0.00030190141678537724, + "loss": 3.1847, + "step": 30484 + }, + { + "epoch": 1.49, + "grad_norm": 0.5865193605422974, + "learning_rate": 0.00030188602090171077, + "loss": 3.1498, + "step": 30485 + }, + { + "epoch": 1.49, + "grad_norm": 0.5693684220314026, + "learning_rate": 0.000301870625013077, + "loss": 3.0875, + "step": 30486 + }, + { + "epoch": 1.49, + "grad_norm": 0.6434977650642395, + "learning_rate": 0.00030185522911951634, + "loss": 2.9599, + "step": 30487 + }, + { + "epoch": 1.49, + "grad_norm": 0.6276722550392151, + "learning_rate": 0.0003018398332210693, + "loss": 3.3054, + "step": 30488 + }, + { + "epoch": 1.49, + "grad_norm": 0.6278494000434875, + "learning_rate": 0.0003018244373177765, + "loss": 3.1279, + "step": 30489 + }, + { + "epoch": 1.49, + "grad_norm": 0.639779806137085, + "learning_rate": 0.0003018090414096786, + "loss": 2.9098, + "step": 30490 + }, + { + "epoch": 1.49, + "grad_norm": 0.58705735206604, + "learning_rate": 0.00030179364549681595, + "loss": 3.1877, + "step": 30491 + }, + { + "epoch": 1.49, + "grad_norm": 0.6035036444664001, + "learning_rate": 0.00030177824957922913, + "loss": 3.1528, + "step": 30492 + }, + { + "epoch": 1.49, + "grad_norm": 0.6042265295982361, + "learning_rate": 0.00030176285365695883, + "loss": 2.8595, + "step": 30493 + }, + { + "epoch": 1.49, + "grad_norm": 0.5860368013381958, + "learning_rate": 0.0003017474577300455, + "loss": 3.2413, + "step": 30494 + }, + { + "epoch": 1.49, + "grad_norm": 0.580884575843811, + "learning_rate": 0.00030173206179852965, + "loss": 2.9413, + "step": 30495 + }, + { + "epoch": 1.49, + "grad_norm": 0.6143849492073059, + "learning_rate": 0.00030171666586245187, + "loss": 2.9255, + "step": 30496 + }, + { + "epoch": 1.49, + "grad_norm": 0.5543144941329956, + "learning_rate": 0.00030170126992185286, + "loss": 3.1471, + "step": 30497 + }, + { + "epoch": 1.49, + "grad_norm": 0.5888075232505798, + "learning_rate": 0.0003016858739767729, + "loss": 2.8926, + "step": 30498 + }, + { + "epoch": 1.49, + "grad_norm": 0.6028613448143005, + "learning_rate": 0.0003016704780272527, + "loss": 3.1541, + "step": 30499 + }, + { + "epoch": 1.49, + "grad_norm": 0.5923327207565308, + "learning_rate": 0.00030165508207333284, + "loss": 2.9867, + "step": 30500 + }, + { + "epoch": 1.49, + "grad_norm": 0.5775189399719238, + "learning_rate": 0.00030163968611505375, + "loss": 3.1233, + "step": 30501 + }, + { + "epoch": 1.49, + "grad_norm": 0.6112174391746521, + "learning_rate": 0.000301624290152456, + "loss": 3.0625, + "step": 30502 + }, + { + "epoch": 1.49, + "grad_norm": 0.562967836856842, + "learning_rate": 0.0003016088941855803, + "loss": 2.9785, + "step": 30503 + }, + { + "epoch": 1.49, + "grad_norm": 0.6123626828193665, + "learning_rate": 0.00030159349821446703, + "loss": 2.9996, + "step": 30504 + }, + { + "epoch": 1.49, + "grad_norm": 0.578737199306488, + "learning_rate": 0.00030157810223915675, + "loss": 2.9256, + "step": 30505 + }, + { + "epoch": 1.5, + "grad_norm": 0.6503991484642029, + "learning_rate": 0.00030156270625969006, + "loss": 3.0334, + "step": 30506 + }, + { + "epoch": 1.5, + "grad_norm": 0.5757296085357666, + "learning_rate": 0.00030154731027610753, + "loss": 2.9777, + "step": 30507 + }, + { + "epoch": 1.5, + "grad_norm": 0.5990521907806396, + "learning_rate": 0.00030153191428844976, + "loss": 3.0282, + "step": 30508 + }, + { + "epoch": 1.5, + "grad_norm": 0.5953813791275024, + "learning_rate": 0.0003015165182967572, + "loss": 3.1037, + "step": 30509 + }, + { + "epoch": 1.5, + "grad_norm": 0.5794540047645569, + "learning_rate": 0.00030150112230107023, + "loss": 3.0526, + "step": 30510 + }, + { + "epoch": 1.5, + "grad_norm": 0.564243495464325, + "learning_rate": 0.0003014857263014298, + "loss": 2.8519, + "step": 30511 + }, + { + "epoch": 1.5, + "grad_norm": 0.5954510569572449, + "learning_rate": 0.0003014703302978762, + "loss": 3.1894, + "step": 30512 + }, + { + "epoch": 1.5, + "grad_norm": 0.6306339502334595, + "learning_rate": 0.00030145493429044995, + "loss": 3.0282, + "step": 30513 + }, + { + "epoch": 1.5, + "grad_norm": 0.5806563496589661, + "learning_rate": 0.00030143953827919175, + "loss": 2.8584, + "step": 30514 + }, + { + "epoch": 1.5, + "grad_norm": 0.6290732026100159, + "learning_rate": 0.00030142414226414215, + "loss": 3.2384, + "step": 30515 + }, + { + "epoch": 1.5, + "grad_norm": 0.5588606595993042, + "learning_rate": 0.00030140874624534153, + "loss": 2.962, + "step": 30516 + }, + { + "epoch": 1.5, + "grad_norm": 0.5796079635620117, + "learning_rate": 0.00030139335022283056, + "loss": 2.8573, + "step": 30517 + }, + { + "epoch": 1.5, + "grad_norm": 0.5561217069625854, + "learning_rate": 0.0003013779541966498, + "loss": 2.8819, + "step": 30518 + }, + { + "epoch": 1.5, + "grad_norm": 0.5866872072219849, + "learning_rate": 0.00030136255816683985, + "loss": 3.0356, + "step": 30519 + }, + { + "epoch": 1.5, + "grad_norm": 0.5967084765434265, + "learning_rate": 0.0003013471621334411, + "loss": 2.724, + "step": 30520 + }, + { + "epoch": 1.5, + "grad_norm": 0.5654155611991882, + "learning_rate": 0.0003013317660964941, + "loss": 3.045, + "step": 30521 + }, + { + "epoch": 1.5, + "grad_norm": 0.5480029582977295, + "learning_rate": 0.0003013163700560396, + "loss": 3.0384, + "step": 30522 + }, + { + "epoch": 1.5, + "grad_norm": 0.5529598593711853, + "learning_rate": 0.000301300974012118, + "loss": 2.8905, + "step": 30523 + }, + { + "epoch": 1.5, + "grad_norm": 0.6232104301452637, + "learning_rate": 0.0003012855779647699, + "loss": 3.1526, + "step": 30524 + }, + { + "epoch": 1.5, + "grad_norm": 0.5751103758811951, + "learning_rate": 0.0003012701819140358, + "loss": 3.1856, + "step": 30525 + }, + { + "epoch": 1.5, + "grad_norm": 0.5563510060310364, + "learning_rate": 0.0003012547858599563, + "loss": 2.8839, + "step": 30526 + }, + { + "epoch": 1.5, + "grad_norm": 0.597868800163269, + "learning_rate": 0.00030123938980257197, + "loss": 2.8558, + "step": 30527 + }, + { + "epoch": 1.5, + "grad_norm": 0.5711178779602051, + "learning_rate": 0.00030122399374192326, + "loss": 3.1168, + "step": 30528 + }, + { + "epoch": 1.5, + "grad_norm": 0.5784346461296082, + "learning_rate": 0.0003012085976780509, + "loss": 3.1221, + "step": 30529 + }, + { + "epoch": 1.5, + "grad_norm": 0.5460003614425659, + "learning_rate": 0.0003011932016109953, + "loss": 2.9339, + "step": 30530 + }, + { + "epoch": 1.5, + "grad_norm": 0.5536607503890991, + "learning_rate": 0.0003011778055407969, + "loss": 2.7398, + "step": 30531 + }, + { + "epoch": 1.5, + "grad_norm": 0.5944807529449463, + "learning_rate": 0.0003011624094674964, + "loss": 2.9107, + "step": 30532 + }, + { + "epoch": 1.5, + "grad_norm": 0.5808088183403015, + "learning_rate": 0.00030114701339113454, + "loss": 3.1568, + "step": 30533 + }, + { + "epoch": 1.5, + "grad_norm": 0.6210165619850159, + "learning_rate": 0.0003011316173117515, + "loss": 3.0295, + "step": 30534 + }, + { + "epoch": 1.5, + "grad_norm": 0.5955290794372559, + "learning_rate": 0.00030111622122938803, + "loss": 3.1209, + "step": 30535 + }, + { + "epoch": 1.5, + "grad_norm": 0.608069121837616, + "learning_rate": 0.0003011008251440846, + "loss": 3.093, + "step": 30536 + }, + { + "epoch": 1.5, + "grad_norm": 0.5967391133308411, + "learning_rate": 0.00030108542905588195, + "loss": 3.195, + "step": 30537 + }, + { + "epoch": 1.5, + "grad_norm": 0.5866439938545227, + "learning_rate": 0.0003010700329648204, + "loss": 3.0273, + "step": 30538 + }, + { + "epoch": 1.5, + "grad_norm": 0.6006977558135986, + "learning_rate": 0.0003010546368709406, + "loss": 3.0791, + "step": 30539 + }, + { + "epoch": 1.5, + "grad_norm": 0.5997205972671509, + "learning_rate": 0.000301039240774283, + "loss": 2.8495, + "step": 30540 + }, + { + "epoch": 1.5, + "grad_norm": 0.5799188613891602, + "learning_rate": 0.0003010238446748884, + "loss": 3.0779, + "step": 30541 + }, + { + "epoch": 1.5, + "grad_norm": 0.6030257940292358, + "learning_rate": 0.0003010084485727971, + "loss": 2.7769, + "step": 30542 + }, + { + "epoch": 1.5, + "grad_norm": 0.6359684467315674, + "learning_rate": 0.0003009930524680497, + "loss": 3.0085, + "step": 30543 + }, + { + "epoch": 1.5, + "grad_norm": 0.5771874189376831, + "learning_rate": 0.0003009776563606869, + "loss": 2.8918, + "step": 30544 + }, + { + "epoch": 1.5, + "grad_norm": 0.5872456431388855, + "learning_rate": 0.0003009622602507491, + "loss": 3.0065, + "step": 30545 + }, + { + "epoch": 1.5, + "grad_norm": 0.6064155697822571, + "learning_rate": 0.00030094686413827685, + "loss": 3.1301, + "step": 30546 + }, + { + "epoch": 1.5, + "grad_norm": 0.6292130351066589, + "learning_rate": 0.0003009314680233107, + "loss": 3.0003, + "step": 30547 + }, + { + "epoch": 1.5, + "grad_norm": 0.6809849739074707, + "learning_rate": 0.00030091607190589143, + "loss": 3.2596, + "step": 30548 + }, + { + "epoch": 1.5, + "grad_norm": 0.6155319213867188, + "learning_rate": 0.00030090067578605927, + "loss": 2.9799, + "step": 30549 + }, + { + "epoch": 1.5, + "grad_norm": 0.5968411564826965, + "learning_rate": 0.00030088527966385493, + "loss": 3.0207, + "step": 30550 + }, + { + "epoch": 1.5, + "grad_norm": 0.5945718288421631, + "learning_rate": 0.0003008698835393189, + "loss": 3.0553, + "step": 30551 + }, + { + "epoch": 1.5, + "grad_norm": 0.654512882232666, + "learning_rate": 0.0003008544874124918, + "loss": 3.2307, + "step": 30552 + }, + { + "epoch": 1.5, + "grad_norm": 0.5625406503677368, + "learning_rate": 0.0003008390912834141, + "loss": 3.1419, + "step": 30553 + }, + { + "epoch": 1.5, + "grad_norm": 0.6100373864173889, + "learning_rate": 0.00030082369515212643, + "loss": 3.0421, + "step": 30554 + }, + { + "epoch": 1.5, + "grad_norm": 0.6178447008132935, + "learning_rate": 0.0003008082990186694, + "loss": 3.1594, + "step": 30555 + }, + { + "epoch": 1.5, + "grad_norm": 0.5651780962944031, + "learning_rate": 0.0003007929028830833, + "loss": 3.0438, + "step": 30556 + }, + { + "epoch": 1.5, + "grad_norm": 0.5665957927703857, + "learning_rate": 0.00030077750674540883, + "loss": 2.9222, + "step": 30557 + }, + { + "epoch": 1.5, + "grad_norm": 0.5908525586128235, + "learning_rate": 0.00030076211060568666, + "loss": 2.9867, + "step": 30558 + }, + { + "epoch": 1.5, + "grad_norm": 0.5936393141746521, + "learning_rate": 0.00030074671446395725, + "loss": 2.962, + "step": 30559 + }, + { + "epoch": 1.5, + "grad_norm": 0.6061809659004211, + "learning_rate": 0.0003007313183202611, + "loss": 3.1466, + "step": 30560 + }, + { + "epoch": 1.5, + "grad_norm": 0.5717028975486755, + "learning_rate": 0.00030071592217463885, + "loss": 2.985, + "step": 30561 + }, + { + "epoch": 1.5, + "grad_norm": 0.596230149269104, + "learning_rate": 0.00030070052602713097, + "loss": 3.0271, + "step": 30562 + }, + { + "epoch": 1.5, + "grad_norm": 0.6109982132911682, + "learning_rate": 0.000300685129877778, + "loss": 3.0328, + "step": 30563 + }, + { + "epoch": 1.5, + "grad_norm": 0.5894909501075745, + "learning_rate": 0.0003006697337266206, + "loss": 2.791, + "step": 30564 + }, + { + "epoch": 1.5, + "grad_norm": 0.5837761163711548, + "learning_rate": 0.0003006543375736991, + "loss": 2.8944, + "step": 30565 + }, + { + "epoch": 1.5, + "grad_norm": 0.5935973525047302, + "learning_rate": 0.0003006389414190544, + "loss": 2.9046, + "step": 30566 + }, + { + "epoch": 1.5, + "grad_norm": 0.6078287959098816, + "learning_rate": 0.0003006235452627267, + "loss": 3.2071, + "step": 30567 + }, + { + "epoch": 1.5, + "grad_norm": 0.6492429971694946, + "learning_rate": 0.0003006081491047567, + "loss": 3.0934, + "step": 30568 + }, + { + "epoch": 1.5, + "grad_norm": 0.5882315635681152, + "learning_rate": 0.00030059275294518507, + "loss": 2.9978, + "step": 30569 + }, + { + "epoch": 1.5, + "grad_norm": 0.6093411445617676, + "learning_rate": 0.0003005773567840522, + "loss": 3.0642, + "step": 30570 + }, + { + "epoch": 1.5, + "grad_norm": 0.5847349762916565, + "learning_rate": 0.0003005619606213987, + "loss": 2.9372, + "step": 30571 + }, + { + "epoch": 1.5, + "grad_norm": 0.5897664427757263, + "learning_rate": 0.00030054656445726495, + "loss": 3.059, + "step": 30572 + }, + { + "epoch": 1.5, + "grad_norm": 0.5819639563560486, + "learning_rate": 0.0003005311682916918, + "loss": 2.9334, + "step": 30573 + }, + { + "epoch": 1.5, + "grad_norm": 0.5720083713531494, + "learning_rate": 0.00030051577212471964, + "loss": 2.8956, + "step": 30574 + }, + { + "epoch": 1.5, + "grad_norm": 0.6046839356422424, + "learning_rate": 0.00030050037595638905, + "loss": 3.0251, + "step": 30575 + }, + { + "epoch": 1.5, + "grad_norm": 0.618550717830658, + "learning_rate": 0.00030048497978674057, + "loss": 2.8597, + "step": 30576 + }, + { + "epoch": 1.5, + "grad_norm": 0.5971773266792297, + "learning_rate": 0.0003004695836158147, + "loss": 2.9743, + "step": 30577 + }, + { + "epoch": 1.5, + "grad_norm": 0.6202574968338013, + "learning_rate": 0.00030045418744365204, + "loss": 3.063, + "step": 30578 + }, + { + "epoch": 1.5, + "grad_norm": 0.6273716688156128, + "learning_rate": 0.00030043879127029314, + "loss": 3.1348, + "step": 30579 + }, + { + "epoch": 1.5, + "grad_norm": 0.6152206063270569, + "learning_rate": 0.0003004233950957785, + "loss": 3.1494, + "step": 30580 + }, + { + "epoch": 1.5, + "grad_norm": 0.5668372511863708, + "learning_rate": 0.0003004079989201489, + "loss": 3.1972, + "step": 30581 + }, + { + "epoch": 1.5, + "grad_norm": 0.5945584177970886, + "learning_rate": 0.0003003926027434445, + "loss": 2.9756, + "step": 30582 + }, + { + "epoch": 1.5, + "grad_norm": 0.68867427110672, + "learning_rate": 0.0003003772065657061, + "loss": 2.9829, + "step": 30583 + }, + { + "epoch": 1.5, + "grad_norm": 0.6505774259567261, + "learning_rate": 0.00030036181038697434, + "loss": 3.1015, + "step": 30584 + }, + { + "epoch": 1.5, + "grad_norm": 0.5589194297790527, + "learning_rate": 0.00030034641420728956, + "loss": 2.9296, + "step": 30585 + }, + { + "epoch": 1.5, + "grad_norm": 0.5537344217300415, + "learning_rate": 0.0003003310180266924, + "loss": 2.9285, + "step": 30586 + }, + { + "epoch": 1.5, + "grad_norm": 0.6337488889694214, + "learning_rate": 0.00030031562184522326, + "loss": 3.0206, + "step": 30587 + }, + { + "epoch": 1.5, + "grad_norm": 0.6181750297546387, + "learning_rate": 0.00030030022566292306, + "loss": 2.8851, + "step": 30588 + }, + { + "epoch": 1.5, + "grad_norm": 0.5827994346618652, + "learning_rate": 0.00030028482947983204, + "loss": 3.1003, + "step": 30589 + }, + { + "epoch": 1.5, + "grad_norm": 0.6145060658454895, + "learning_rate": 0.0003002694332959908, + "loss": 3.0419, + "step": 30590 + }, + { + "epoch": 1.5, + "grad_norm": 0.6019697189331055, + "learning_rate": 0.00030025403711143997, + "loss": 2.9749, + "step": 30591 + }, + { + "epoch": 1.5, + "grad_norm": 0.625700056552887, + "learning_rate": 0.0003002386409262201, + "loss": 3.0995, + "step": 30592 + }, + { + "epoch": 1.5, + "grad_norm": 0.5388084650039673, + "learning_rate": 0.00030022324474037154, + "loss": 2.9934, + "step": 30593 + }, + { + "epoch": 1.5, + "grad_norm": 0.626770555973053, + "learning_rate": 0.0003002078485539351, + "loss": 3.071, + "step": 30594 + }, + { + "epoch": 1.5, + "grad_norm": 0.5812084078788757, + "learning_rate": 0.00030019245236695126, + "loss": 2.9377, + "step": 30595 + }, + { + "epoch": 1.5, + "grad_norm": 0.6054795384407043, + "learning_rate": 0.0003001770561794605, + "loss": 2.7962, + "step": 30596 + }, + { + "epoch": 1.5, + "grad_norm": 0.5243318676948547, + "learning_rate": 0.00030016165999150336, + "loss": 3.0612, + "step": 30597 + }, + { + "epoch": 1.5, + "grad_norm": 0.562089741230011, + "learning_rate": 0.0003001462638031204, + "loss": 3.0272, + "step": 30598 + }, + { + "epoch": 1.5, + "grad_norm": 0.5881873369216919, + "learning_rate": 0.0003001308676143524, + "loss": 3.2251, + "step": 30599 + }, + { + "epoch": 1.5, + "grad_norm": 0.5781797170639038, + "learning_rate": 0.00030011547142523955, + "loss": 2.8792, + "step": 30600 + }, + { + "epoch": 1.5, + "grad_norm": 0.6735518574714661, + "learning_rate": 0.0003001000752358226, + "loss": 3.1419, + "step": 30601 + }, + { + "epoch": 1.5, + "grad_norm": 0.5663983821868896, + "learning_rate": 0.0003000846790461421, + "loss": 2.9512, + "step": 30602 + }, + { + "epoch": 1.5, + "grad_norm": 0.6189948320388794, + "learning_rate": 0.00030006928285623857, + "loss": 2.9013, + "step": 30603 + }, + { + "epoch": 1.5, + "grad_norm": 0.6442484855651855, + "learning_rate": 0.00030005388666615255, + "loss": 3.1847, + "step": 30604 + }, + { + "epoch": 1.5, + "grad_norm": 0.629422128200531, + "learning_rate": 0.0003000384904759246, + "loss": 3.0761, + "step": 30605 + }, + { + "epoch": 1.5, + "grad_norm": 0.5894017219543457, + "learning_rate": 0.0003000230942855954, + "loss": 3.1667, + "step": 30606 + }, + { + "epoch": 1.5, + "grad_norm": 0.5657310485839844, + "learning_rate": 0.00030000769809520526, + "loss": 3.1474, + "step": 30607 + }, + { + "epoch": 1.5, + "grad_norm": 0.664206862449646, + "learning_rate": 0.0002999923019047948, + "loss": 2.9338, + "step": 30608 + }, + { + "epoch": 1.5, + "grad_norm": 0.5959284901618958, + "learning_rate": 0.0002999769057144046, + "loss": 3.0428, + "step": 30609 + }, + { + "epoch": 1.5, + "grad_norm": 0.5982975959777832, + "learning_rate": 0.00029996150952407534, + "loss": 2.9871, + "step": 30610 + }, + { + "epoch": 1.5, + "grad_norm": 0.6166161894798279, + "learning_rate": 0.00029994611333384745, + "loss": 3.0232, + "step": 30611 + }, + { + "epoch": 1.5, + "grad_norm": 0.5803634524345398, + "learning_rate": 0.0002999307171437613, + "loss": 3.2022, + "step": 30612 + }, + { + "epoch": 1.5, + "grad_norm": 0.5776461958885193, + "learning_rate": 0.0002999153209538579, + "loss": 2.9408, + "step": 30613 + }, + { + "epoch": 1.5, + "grad_norm": 0.5784315466880798, + "learning_rate": 0.00029989992476417735, + "loss": 3.2184, + "step": 30614 + }, + { + "epoch": 1.5, + "grad_norm": 0.635517418384552, + "learning_rate": 0.00029988452857476045, + "loss": 3.1963, + "step": 30615 + }, + { + "epoch": 1.5, + "grad_norm": 0.6005640625953674, + "learning_rate": 0.0002998691323856476, + "loss": 3.0256, + "step": 30616 + }, + { + "epoch": 1.5, + "grad_norm": 0.5672207474708557, + "learning_rate": 0.0002998537361968795, + "loss": 3.0099, + "step": 30617 + }, + { + "epoch": 1.5, + "grad_norm": 0.6029596328735352, + "learning_rate": 0.00029983834000849664, + "loss": 3.125, + "step": 30618 + }, + { + "epoch": 1.5, + "grad_norm": 0.5798458456993103, + "learning_rate": 0.0002998229438205395, + "loss": 3.0502, + "step": 30619 + }, + { + "epoch": 1.5, + "grad_norm": 0.5979757308959961, + "learning_rate": 0.0002998075476330488, + "loss": 3.0172, + "step": 30620 + }, + { + "epoch": 1.5, + "grad_norm": 0.6162039041519165, + "learning_rate": 0.0002997921514460649, + "loss": 3.0594, + "step": 30621 + }, + { + "epoch": 1.5, + "grad_norm": 0.5752620100975037, + "learning_rate": 0.00029977675525962835, + "loss": 3.0725, + "step": 30622 + }, + { + "epoch": 1.5, + "grad_norm": 0.6150903701782227, + "learning_rate": 0.00029976135907377994, + "loss": 2.9637, + "step": 30623 + }, + { + "epoch": 1.5, + "grad_norm": 0.607782781124115, + "learning_rate": 0.00029974596288856003, + "loss": 2.8398, + "step": 30624 + }, + { + "epoch": 1.5, + "grad_norm": 0.5732407569885254, + "learning_rate": 0.00029973056670400925, + "loss": 2.9802, + "step": 30625 + }, + { + "epoch": 1.5, + "grad_norm": 0.5856612324714661, + "learning_rate": 0.0002997151705201679, + "loss": 2.9968, + "step": 30626 + }, + { + "epoch": 1.5, + "grad_norm": 0.5715699791908264, + "learning_rate": 0.00029969977433707683, + "loss": 3.0805, + "step": 30627 + }, + { + "epoch": 1.5, + "grad_norm": 0.6128286123275757, + "learning_rate": 0.0002996843781547767, + "loss": 2.8269, + "step": 30628 + }, + { + "epoch": 1.5, + "grad_norm": 0.5702658891677856, + "learning_rate": 0.0002996689819733076, + "loss": 2.8871, + "step": 30629 + }, + { + "epoch": 1.5, + "grad_norm": 0.5865205526351929, + "learning_rate": 0.0002996535857927105, + "loss": 2.8866, + "step": 30630 + }, + { + "epoch": 1.5, + "grad_norm": 0.5985194444656372, + "learning_rate": 0.0002996381896130256, + "loss": 3.1501, + "step": 30631 + }, + { + "epoch": 1.5, + "grad_norm": 0.6110131144523621, + "learning_rate": 0.0002996227934342938, + "loss": 2.9409, + "step": 30632 + }, + { + "epoch": 1.5, + "grad_norm": 0.5706114768981934, + "learning_rate": 0.00029960739725655547, + "loss": 3.0904, + "step": 30633 + }, + { + "epoch": 1.5, + "grad_norm": 0.6214529275894165, + "learning_rate": 0.0002995920010798511, + "loss": 3.0926, + "step": 30634 + }, + { + "epoch": 1.5, + "grad_norm": 0.6152591109275818, + "learning_rate": 0.00029957660490422146, + "loss": 3.1173, + "step": 30635 + }, + { + "epoch": 1.5, + "grad_norm": 0.6281127333641052, + "learning_rate": 0.00029956120872970686, + "loss": 3.0511, + "step": 30636 + }, + { + "epoch": 1.5, + "grad_norm": 0.6203930377960205, + "learning_rate": 0.0002995458125563479, + "loss": 3.079, + "step": 30637 + }, + { + "epoch": 1.5, + "grad_norm": 0.603507399559021, + "learning_rate": 0.0002995304163841853, + "loss": 3.2659, + "step": 30638 + }, + { + "epoch": 1.5, + "grad_norm": 0.5846010446548462, + "learning_rate": 0.0002995150202132595, + "loss": 3.1515, + "step": 30639 + }, + { + "epoch": 1.5, + "grad_norm": 0.6017332673072815, + "learning_rate": 0.000299499624043611, + "loss": 2.8583, + "step": 30640 + }, + { + "epoch": 1.5, + "grad_norm": 0.6173145771026611, + "learning_rate": 0.0002994842278752803, + "loss": 2.9607, + "step": 30641 + }, + { + "epoch": 1.5, + "grad_norm": 0.6321989297866821, + "learning_rate": 0.0002994688317083081, + "loss": 3.1546, + "step": 30642 + }, + { + "epoch": 1.5, + "grad_norm": 0.6010352373123169, + "learning_rate": 0.00029945343554273505, + "loss": 3.1106, + "step": 30643 + }, + { + "epoch": 1.5, + "grad_norm": 0.6526492238044739, + "learning_rate": 0.00029943803937860136, + "loss": 3.2372, + "step": 30644 + }, + { + "epoch": 1.5, + "grad_norm": 0.5786515474319458, + "learning_rate": 0.0002994226432159479, + "loss": 2.9052, + "step": 30645 + }, + { + "epoch": 1.5, + "grad_norm": 0.5871065258979797, + "learning_rate": 0.00029940724705481493, + "loss": 3.2204, + "step": 30646 + }, + { + "epoch": 1.5, + "grad_norm": 0.5986695289611816, + "learning_rate": 0.0002993918508952432, + "loss": 3.0846, + "step": 30647 + }, + { + "epoch": 1.5, + "grad_norm": 0.5756219625473022, + "learning_rate": 0.0002993764547372733, + "loss": 2.9358, + "step": 30648 + }, + { + "epoch": 1.5, + "grad_norm": 0.5979821681976318, + "learning_rate": 0.0002993610585809456, + "loss": 3.1195, + "step": 30649 + }, + { + "epoch": 1.5, + "grad_norm": 0.6420942544937134, + "learning_rate": 0.00029934566242630084, + "loss": 3.1625, + "step": 30650 + }, + { + "epoch": 1.5, + "grad_norm": 0.7682424783706665, + "learning_rate": 0.0002993302662733795, + "loss": 3.1441, + "step": 30651 + }, + { + "epoch": 1.5, + "grad_norm": 0.5857537984848022, + "learning_rate": 0.0002993148701222219, + "loss": 2.9752, + "step": 30652 + }, + { + "epoch": 1.5, + "grad_norm": 0.6025992035865784, + "learning_rate": 0.00029929947397286903, + "loss": 3.0435, + "step": 30653 + }, + { + "epoch": 1.5, + "grad_norm": 0.6120580434799194, + "learning_rate": 0.00029928407782536115, + "loss": 3.14, + "step": 30654 + }, + { + "epoch": 1.5, + "grad_norm": 0.5636025667190552, + "learning_rate": 0.0002992686816797389, + "loss": 3.1197, + "step": 30655 + }, + { + "epoch": 1.5, + "grad_norm": 0.5800879001617432, + "learning_rate": 0.0002992532855360427, + "loss": 2.947, + "step": 30656 + }, + { + "epoch": 1.5, + "grad_norm": 0.6197709441184998, + "learning_rate": 0.00029923788939431324, + "loss": 3.0448, + "step": 30657 + }, + { + "epoch": 1.5, + "grad_norm": 0.6334044337272644, + "learning_rate": 0.0002992224932545911, + "loss": 2.8652, + "step": 30658 + }, + { + "epoch": 1.5, + "grad_norm": 0.6068458557128906, + "learning_rate": 0.0002992070971169166, + "loss": 2.781, + "step": 30659 + }, + { + "epoch": 1.5, + "grad_norm": 0.6140934824943542, + "learning_rate": 0.0002991917009813307, + "loss": 2.8921, + "step": 30660 + }, + { + "epoch": 1.5, + "grad_norm": 0.593880295753479, + "learning_rate": 0.00029917630484787357, + "loss": 3.1028, + "step": 30661 + }, + { + "epoch": 1.5, + "grad_norm": 0.5681495666503906, + "learning_rate": 0.0002991609087165858, + "loss": 3.0147, + "step": 30662 + }, + { + "epoch": 1.5, + "grad_norm": 0.6396278738975525, + "learning_rate": 0.0002991455125875082, + "loss": 3.0758, + "step": 30663 + }, + { + "epoch": 1.5, + "grad_norm": 0.6014585494995117, + "learning_rate": 0.00029913011646068104, + "loss": 3.1103, + "step": 30664 + }, + { + "epoch": 1.5, + "grad_norm": 0.6441971659660339, + "learning_rate": 0.0002991147203361451, + "loss": 2.9616, + "step": 30665 + }, + { + "epoch": 1.5, + "grad_norm": 0.5746193528175354, + "learning_rate": 0.00029909932421394073, + "loss": 2.9842, + "step": 30666 + }, + { + "epoch": 1.5, + "grad_norm": 0.6001632213592529, + "learning_rate": 0.0002990839280941086, + "loss": 3.1733, + "step": 30667 + }, + { + "epoch": 1.5, + "grad_norm": 0.5855227708816528, + "learning_rate": 0.00029906853197668925, + "loss": 3.0555, + "step": 30668 + }, + { + "epoch": 1.5, + "grad_norm": 0.6073086857795715, + "learning_rate": 0.0002990531358617231, + "loss": 2.9114, + "step": 30669 + }, + { + "epoch": 1.5, + "grad_norm": 0.6142853498458862, + "learning_rate": 0.00029903773974925094, + "loss": 3.2542, + "step": 30670 + }, + { + "epoch": 1.5, + "grad_norm": 0.6134935617446899, + "learning_rate": 0.00029902234363931305, + "loss": 3.0456, + "step": 30671 + }, + { + "epoch": 1.5, + "grad_norm": 0.6078312993049622, + "learning_rate": 0.0002990069475319502, + "loss": 3.1121, + "step": 30672 + }, + { + "epoch": 1.5, + "grad_norm": 0.5812007784843445, + "learning_rate": 0.0002989915514272029, + "loss": 2.9169, + "step": 30673 + }, + { + "epoch": 1.5, + "grad_norm": 0.5966639518737793, + "learning_rate": 0.00029897615532511156, + "loss": 3.0635, + "step": 30674 + }, + { + "epoch": 1.5, + "grad_norm": 0.6440772414207458, + "learning_rate": 0.00029896075922571693, + "loss": 3.085, + "step": 30675 + }, + { + "epoch": 1.5, + "grad_norm": 0.6082709431648254, + "learning_rate": 0.00029894536312905943, + "loss": 3.0807, + "step": 30676 + }, + { + "epoch": 1.5, + "grad_norm": 0.585754930973053, + "learning_rate": 0.0002989299670351795, + "loss": 3.2426, + "step": 30677 + }, + { + "epoch": 1.5, + "grad_norm": 0.5902261734008789, + "learning_rate": 0.00029891457094411805, + "loss": 2.7597, + "step": 30678 + }, + { + "epoch": 1.5, + "grad_norm": 0.6010871529579163, + "learning_rate": 0.0002988991748559153, + "loss": 2.9683, + "step": 30679 + }, + { + "epoch": 1.5, + "grad_norm": 0.6037760972976685, + "learning_rate": 0.00029888377877061197, + "loss": 3.1568, + "step": 30680 + }, + { + "epoch": 1.5, + "grad_norm": 0.6149284839630127, + "learning_rate": 0.0002988683826882484, + "loss": 3.2423, + "step": 30681 + }, + { + "epoch": 1.5, + "grad_norm": 0.897918164730072, + "learning_rate": 0.00029885298660886546, + "loss": 2.9964, + "step": 30682 + }, + { + "epoch": 1.5, + "grad_norm": 0.5681318044662476, + "learning_rate": 0.0002988375905325035, + "loss": 3.1145, + "step": 30683 + }, + { + "epoch": 1.5, + "grad_norm": 0.6103178262710571, + "learning_rate": 0.00029882219445920305, + "loss": 2.9291, + "step": 30684 + }, + { + "epoch": 1.5, + "grad_norm": 0.5969374179840088, + "learning_rate": 0.0002988067983890048, + "loss": 3.1072, + "step": 30685 + }, + { + "epoch": 1.5, + "grad_norm": 0.5936666131019592, + "learning_rate": 0.00029879140232194916, + "loss": 3.129, + "step": 30686 + }, + { + "epoch": 1.5, + "grad_norm": 0.6052805185317993, + "learning_rate": 0.0002987760062580768, + "loss": 2.9546, + "step": 30687 + }, + { + "epoch": 1.5, + "grad_norm": 0.5850530862808228, + "learning_rate": 0.00029876061019742803, + "loss": 2.9499, + "step": 30688 + }, + { + "epoch": 1.5, + "grad_norm": 0.5777554512023926, + "learning_rate": 0.0002987452141400436, + "loss": 2.9389, + "step": 30689 + }, + { + "epoch": 1.5, + "grad_norm": 0.5716623067855835, + "learning_rate": 0.0002987298180859642, + "loss": 2.9837, + "step": 30690 + }, + { + "epoch": 1.5, + "grad_norm": 0.5977976322174072, + "learning_rate": 0.0002987144220352301, + "loss": 3.116, + "step": 30691 + }, + { + "epoch": 1.5, + "grad_norm": 0.6488404273986816, + "learning_rate": 0.000298699025987882, + "loss": 3.0038, + "step": 30692 + }, + { + "epoch": 1.5, + "grad_norm": 0.6428252458572388, + "learning_rate": 0.0002986836299439604, + "loss": 3.0782, + "step": 30693 + }, + { + "epoch": 1.5, + "grad_norm": 0.6638103127479553, + "learning_rate": 0.0002986682339035058, + "loss": 3.1157, + "step": 30694 + }, + { + "epoch": 1.5, + "grad_norm": 0.5968785285949707, + "learning_rate": 0.00029865283786655896, + "loss": 3.2548, + "step": 30695 + }, + { + "epoch": 1.5, + "grad_norm": 0.6197184324264526, + "learning_rate": 0.00029863744183316015, + "loss": 3.1877, + "step": 30696 + }, + { + "epoch": 1.5, + "grad_norm": 0.6367472410202026, + "learning_rate": 0.00029862204580335017, + "loss": 3.0486, + "step": 30697 + }, + { + "epoch": 1.5, + "grad_norm": 0.644905149936676, + "learning_rate": 0.00029860664977716944, + "loss": 2.8002, + "step": 30698 + }, + { + "epoch": 1.5, + "grad_norm": 0.6608884334564209, + "learning_rate": 0.0002985912537546584, + "loss": 3.0254, + "step": 30699 + }, + { + "epoch": 1.5, + "grad_norm": 0.6148144006729126, + "learning_rate": 0.00029857585773585785, + "loss": 3.0313, + "step": 30700 + }, + { + "epoch": 1.5, + "grad_norm": 0.6347461342811584, + "learning_rate": 0.0002985604617208082, + "loss": 2.7207, + "step": 30701 + }, + { + "epoch": 1.5, + "grad_norm": 0.6112677454948425, + "learning_rate": 0.00029854506570955005, + "loss": 3.0828, + "step": 30702 + }, + { + "epoch": 1.5, + "grad_norm": 0.5936033129692078, + "learning_rate": 0.0002985296697021238, + "loss": 3.1319, + "step": 30703 + }, + { + "epoch": 1.5, + "grad_norm": 0.6078819632530212, + "learning_rate": 0.0002985142736985702, + "loss": 3.0869, + "step": 30704 + }, + { + "epoch": 1.5, + "grad_norm": 0.6138762831687927, + "learning_rate": 0.00029849887769892977, + "loss": 2.7921, + "step": 30705 + }, + { + "epoch": 1.5, + "grad_norm": 0.6070495247840881, + "learning_rate": 0.0002984834817032429, + "loss": 3.1173, + "step": 30706 + }, + { + "epoch": 1.5, + "grad_norm": 0.6134839653968811, + "learning_rate": 0.0002984680857115503, + "loss": 2.9879, + "step": 30707 + }, + { + "epoch": 1.5, + "grad_norm": 0.5913065671920776, + "learning_rate": 0.00029845268972389247, + "loss": 3.2981, + "step": 30708 + }, + { + "epoch": 1.5, + "grad_norm": 0.5801486372947693, + "learning_rate": 0.00029843729374030983, + "loss": 3.3145, + "step": 30709 + }, + { + "epoch": 1.51, + "grad_norm": 0.6102262139320374, + "learning_rate": 0.00029842189776084325, + "loss": 3.0449, + "step": 30710 + }, + { + "epoch": 1.51, + "grad_norm": 0.5686670541763306, + "learning_rate": 0.0002984065017855329, + "loss": 2.991, + "step": 30711 + }, + { + "epoch": 1.51, + "grad_norm": 0.6398444175720215, + "learning_rate": 0.00029839110581441974, + "loss": 3.1699, + "step": 30712 + }, + { + "epoch": 1.51, + "grad_norm": 0.5702043175697327, + "learning_rate": 0.00029837570984754395, + "loss": 2.7375, + "step": 30713 + }, + { + "epoch": 1.51, + "grad_norm": 0.5870870351791382, + "learning_rate": 0.0002983603138849462, + "loss": 3.0903, + "step": 30714 + }, + { + "epoch": 1.51, + "grad_norm": 0.5614882707595825, + "learning_rate": 0.00029834491792666716, + "loss": 3.1141, + "step": 30715 + }, + { + "epoch": 1.51, + "grad_norm": 0.56441730260849, + "learning_rate": 0.00029832952197274726, + "loss": 3.1426, + "step": 30716 + }, + { + "epoch": 1.51, + "grad_norm": 0.5936001539230347, + "learning_rate": 0.00029831412602322716, + "loss": 3.0108, + "step": 30717 + }, + { + "epoch": 1.51, + "grad_norm": 0.5969215035438538, + "learning_rate": 0.00029829873007814714, + "loss": 3.0834, + "step": 30718 + }, + { + "epoch": 1.51, + "grad_norm": 0.6016257405281067, + "learning_rate": 0.000298283334137548, + "loss": 2.9115, + "step": 30719 + }, + { + "epoch": 1.51, + "grad_norm": 0.6029444336891174, + "learning_rate": 0.00029826793820147035, + "loss": 3.0821, + "step": 30720 + }, + { + "epoch": 1.51, + "grad_norm": 0.5698739886283875, + "learning_rate": 0.00029825254226995446, + "loss": 3.0591, + "step": 30721 + }, + { + "epoch": 1.51, + "grad_norm": 0.595145046710968, + "learning_rate": 0.00029823714634304117, + "loss": 3.0994, + "step": 30722 + }, + { + "epoch": 1.51, + "grad_norm": 0.6198235750198364, + "learning_rate": 0.0002982217504207708, + "loss": 2.9106, + "step": 30723 + }, + { + "epoch": 1.51, + "grad_norm": 0.5928401350975037, + "learning_rate": 0.000298206354503184, + "loss": 2.9303, + "step": 30724 + }, + { + "epoch": 1.51, + "grad_norm": 0.5822376608848572, + "learning_rate": 0.0002981909585903214, + "loss": 3.0786, + "step": 30725 + }, + { + "epoch": 1.51, + "grad_norm": 0.5875365138053894, + "learning_rate": 0.00029817556268222343, + "loss": 2.81, + "step": 30726 + }, + { + "epoch": 1.51, + "grad_norm": 0.6183528900146484, + "learning_rate": 0.00029816016677893073, + "loss": 3.1806, + "step": 30727 + }, + { + "epoch": 1.51, + "grad_norm": 0.5537392497062683, + "learning_rate": 0.00029814477088048366, + "loss": 3.1093, + "step": 30728 + }, + { + "epoch": 1.51, + "grad_norm": 0.5720838904380798, + "learning_rate": 0.000298129374986923, + "loss": 3.1148, + "step": 30729 + }, + { + "epoch": 1.51, + "grad_norm": 0.6122817397117615, + "learning_rate": 0.00029811397909828923, + "loss": 3.0031, + "step": 30730 + }, + { + "epoch": 1.51, + "grad_norm": 0.5819237232208252, + "learning_rate": 0.00029809858321462276, + "loss": 3.1225, + "step": 30731 + }, + { + "epoch": 1.51, + "grad_norm": 0.6177088618278503, + "learning_rate": 0.0002980831873359644, + "loss": 2.9787, + "step": 30732 + }, + { + "epoch": 1.51, + "grad_norm": 0.6123387813568115, + "learning_rate": 0.00029806779146235437, + "loss": 3.1079, + "step": 30733 + }, + { + "epoch": 1.51, + "grad_norm": 0.6111491918563843, + "learning_rate": 0.0002980523955938335, + "loss": 2.9765, + "step": 30734 + }, + { + "epoch": 1.51, + "grad_norm": 0.566396176815033, + "learning_rate": 0.0002980369997304423, + "loss": 2.815, + "step": 30735 + }, + { + "epoch": 1.51, + "grad_norm": 0.588200032711029, + "learning_rate": 0.0002980216038722212, + "loss": 3.112, + "step": 30736 + }, + { + "epoch": 1.51, + "grad_norm": 0.5717114806175232, + "learning_rate": 0.00029800620801921087, + "loss": 3.0413, + "step": 30737 + }, + { + "epoch": 1.51, + "grad_norm": 0.6023069620132446, + "learning_rate": 0.00029799081217145177, + "loss": 3.1242, + "step": 30738 + }, + { + "epoch": 1.51, + "grad_norm": 0.5799679756164551, + "learning_rate": 0.00029797541632898444, + "loss": 2.8477, + "step": 30739 + }, + { + "epoch": 1.51, + "grad_norm": 0.5831611752510071, + "learning_rate": 0.0002979600204918496, + "loss": 3.0047, + "step": 30740 + }, + { + "epoch": 1.51, + "grad_norm": 0.6182281374931335, + "learning_rate": 0.00029794462466008756, + "loss": 3.0421, + "step": 30741 + }, + { + "epoch": 1.51, + "grad_norm": 0.5878432989120483, + "learning_rate": 0.00029792922883373907, + "loss": 2.879, + "step": 30742 + }, + { + "epoch": 1.51, + "grad_norm": 0.5995323061943054, + "learning_rate": 0.0002979138330128445, + "loss": 2.9612, + "step": 30743 + }, + { + "epoch": 1.51, + "grad_norm": 0.5848367810249329, + "learning_rate": 0.00029789843719744453, + "loss": 3.1774, + "step": 30744 + }, + { + "epoch": 1.51, + "grad_norm": 0.5988871455192566, + "learning_rate": 0.00029788304138757976, + "loss": 3.0458, + "step": 30745 + }, + { + "epoch": 1.51, + "grad_norm": 0.6023797988891602, + "learning_rate": 0.0002978676455832905, + "loss": 3.1648, + "step": 30746 + }, + { + "epoch": 1.51, + "grad_norm": 0.6229612827301025, + "learning_rate": 0.00029785224978461756, + "loss": 3.0046, + "step": 30747 + }, + { + "epoch": 1.51, + "grad_norm": 0.6516324877738953, + "learning_rate": 0.00029783685399160134, + "loss": 3.0324, + "step": 30748 + }, + { + "epoch": 1.51, + "grad_norm": 0.6502434611320496, + "learning_rate": 0.0002978214582042824, + "loss": 2.9143, + "step": 30749 + }, + { + "epoch": 1.51, + "grad_norm": 0.5838092565536499, + "learning_rate": 0.00029780606242270135, + "loss": 2.9889, + "step": 30750 + }, + { + "epoch": 1.51, + "grad_norm": 0.5811021327972412, + "learning_rate": 0.0002977906666468987, + "loss": 3.0722, + "step": 30751 + }, + { + "epoch": 1.51, + "grad_norm": 0.5942781567573547, + "learning_rate": 0.0002977752708769151, + "loss": 2.9056, + "step": 30752 + }, + { + "epoch": 1.51, + "grad_norm": 0.5923861861228943, + "learning_rate": 0.00029775987511279086, + "loss": 3.1602, + "step": 30753 + }, + { + "epoch": 1.51, + "grad_norm": 0.6492545008659363, + "learning_rate": 0.00029774447935456663, + "loss": 3.1609, + "step": 30754 + }, + { + "epoch": 1.51, + "grad_norm": 0.6185020804405212, + "learning_rate": 0.00029772908360228324, + "loss": 2.8854, + "step": 30755 + }, + { + "epoch": 1.51, + "grad_norm": 0.6205618381500244, + "learning_rate": 0.00029771368785598085, + "loss": 2.8574, + "step": 30756 + }, + { + "epoch": 1.51, + "grad_norm": 0.590815544128418, + "learning_rate": 0.0002976982921157003, + "loss": 2.9544, + "step": 30757 + }, + { + "epoch": 1.51, + "grad_norm": 0.6202199459075928, + "learning_rate": 0.0002976828963814818, + "loss": 3.0796, + "step": 30758 + }, + { + "epoch": 1.51, + "grad_norm": 0.6064924597740173, + "learning_rate": 0.00029766750065336625, + "loss": 2.9535, + "step": 30759 + }, + { + "epoch": 1.51, + "grad_norm": 0.6110441088676453, + "learning_rate": 0.00029765210493139406, + "loss": 3.1612, + "step": 30760 + }, + { + "epoch": 1.51, + "grad_norm": 0.5866239666938782, + "learning_rate": 0.00029763670921560567, + "loss": 2.9289, + "step": 30761 + }, + { + "epoch": 1.51, + "grad_norm": 0.5617367625236511, + "learning_rate": 0.00029762131350604185, + "loss": 3.1119, + "step": 30762 + }, + { + "epoch": 1.51, + "grad_norm": 0.551780104637146, + "learning_rate": 0.000297605917802743, + "loss": 3.0988, + "step": 30763 + }, + { + "epoch": 1.51, + "grad_norm": 0.5915526747703552, + "learning_rate": 0.0002975905221057497, + "loss": 3.0443, + "step": 30764 + }, + { + "epoch": 1.51, + "grad_norm": 0.5596851110458374, + "learning_rate": 0.0002975751264151024, + "loss": 2.8931, + "step": 30765 + }, + { + "epoch": 1.51, + "grad_norm": 0.6421846151351929, + "learning_rate": 0.00029755973073084186, + "loss": 2.9407, + "step": 30766 + }, + { + "epoch": 1.51, + "grad_norm": 0.6003358364105225, + "learning_rate": 0.00029754433505300857, + "loss": 3.0634, + "step": 30767 + }, + { + "epoch": 1.51, + "grad_norm": 0.5622581839561462, + "learning_rate": 0.00029752893938164287, + "loss": 3.1398, + "step": 30768 + }, + { + "epoch": 1.51, + "grad_norm": 0.6357739567756653, + "learning_rate": 0.0002975135437167856, + "loss": 3.0894, + "step": 30769 + }, + { + "epoch": 1.51, + "grad_norm": 0.6039325594902039, + "learning_rate": 0.0002974981480584771, + "loss": 3.1304, + "step": 30770 + }, + { + "epoch": 1.51, + "grad_norm": 0.5737800002098083, + "learning_rate": 0.00029748275240675795, + "loss": 2.8875, + "step": 30771 + }, + { + "epoch": 1.51, + "grad_norm": 0.6089869141578674, + "learning_rate": 0.0002974673567616689, + "loss": 2.9146, + "step": 30772 + }, + { + "epoch": 1.51, + "grad_norm": 0.6089136600494385, + "learning_rate": 0.0002974519611232502, + "loss": 3.2106, + "step": 30773 + }, + { + "epoch": 1.51, + "grad_norm": 0.5950151681900024, + "learning_rate": 0.00029743656549154263, + "loss": 2.8026, + "step": 30774 + }, + { + "epoch": 1.51, + "grad_norm": 0.6248009204864502, + "learning_rate": 0.00029742116986658664, + "loss": 3.1683, + "step": 30775 + }, + { + "epoch": 1.51, + "grad_norm": 0.6114397048950195, + "learning_rate": 0.00029740577424842274, + "loss": 2.9792, + "step": 30776 + }, + { + "epoch": 1.51, + "grad_norm": 0.6554459929466248, + "learning_rate": 0.00029739037863709165, + "loss": 3.2027, + "step": 30777 + }, + { + "epoch": 1.51, + "grad_norm": 0.6299651265144348, + "learning_rate": 0.00029737498303263374, + "loss": 3.015, + "step": 30778 + }, + { + "epoch": 1.51, + "grad_norm": 0.6014512777328491, + "learning_rate": 0.00029735958743508964, + "loss": 3.058, + "step": 30779 + }, + { + "epoch": 1.51, + "grad_norm": 0.6071848273277283, + "learning_rate": 0.00029734419184449976, + "loss": 3.0521, + "step": 30780 + }, + { + "epoch": 1.51, + "grad_norm": 0.6110485792160034, + "learning_rate": 0.00029732879626090484, + "loss": 2.9316, + "step": 30781 + }, + { + "epoch": 1.51, + "grad_norm": 0.5651352405548096, + "learning_rate": 0.0002973134006843454, + "loss": 3.0355, + "step": 30782 + }, + { + "epoch": 1.51, + "grad_norm": 0.6445112824440002, + "learning_rate": 0.00029729800511486186, + "loss": 3.1326, + "step": 30783 + }, + { + "epoch": 1.51, + "grad_norm": 0.582943320274353, + "learning_rate": 0.00029728260955249496, + "loss": 3.223, + "step": 30784 + }, + { + "epoch": 1.51, + "grad_norm": 0.611379861831665, + "learning_rate": 0.0002972672139972851, + "loss": 2.8036, + "step": 30785 + }, + { + "epoch": 1.51, + "grad_norm": 0.6126112937927246, + "learning_rate": 0.0002972518184492728, + "loss": 2.797, + "step": 30786 + }, + { + "epoch": 1.51, + "grad_norm": 0.5877750515937805, + "learning_rate": 0.0002972364229084988, + "loss": 2.9347, + "step": 30787 + }, + { + "epoch": 1.51, + "grad_norm": 0.6406726837158203, + "learning_rate": 0.0002972210273750035, + "loss": 3.1474, + "step": 30788 + }, + { + "epoch": 1.51, + "grad_norm": 0.6040596961975098, + "learning_rate": 0.0002972056318488275, + "loss": 3.1666, + "step": 30789 + }, + { + "epoch": 1.51, + "grad_norm": 0.6338227987289429, + "learning_rate": 0.0002971902363300112, + "loss": 3.1337, + "step": 30790 + }, + { + "epoch": 1.51, + "grad_norm": 0.6211509704589844, + "learning_rate": 0.00029717484081859534, + "loss": 2.8265, + "step": 30791 + }, + { + "epoch": 1.51, + "grad_norm": 0.5889843106269836, + "learning_rate": 0.0002971594453146205, + "loss": 3.1715, + "step": 30792 + }, + { + "epoch": 1.51, + "grad_norm": 0.5887772440910339, + "learning_rate": 0.00029714404981812695, + "loss": 2.9016, + "step": 30793 + }, + { + "epoch": 1.51, + "grad_norm": 0.5816041231155396, + "learning_rate": 0.00029712865432915566, + "loss": 2.9766, + "step": 30794 + }, + { + "epoch": 1.51, + "grad_norm": 0.5636284947395325, + "learning_rate": 0.00029711325884774673, + "loss": 3.1463, + "step": 30795 + }, + { + "epoch": 1.51, + "grad_norm": 0.5685577392578125, + "learning_rate": 0.000297097863373941, + "loss": 2.8435, + "step": 30796 + }, + { + "epoch": 1.51, + "grad_norm": 0.6058424115180969, + "learning_rate": 0.000297082467907779, + "loss": 3.2246, + "step": 30797 + }, + { + "epoch": 1.51, + "grad_norm": 0.6104329824447632, + "learning_rate": 0.0002970670724493011, + "loss": 3.0745, + "step": 30798 + }, + { + "epoch": 1.51, + "grad_norm": 0.5837059020996094, + "learning_rate": 0.00029705167699854804, + "loss": 2.9483, + "step": 30799 + }, + { + "epoch": 1.51, + "grad_norm": 0.6254569888114929, + "learning_rate": 0.0002970362815555603, + "loss": 2.8121, + "step": 30800 + }, + { + "epoch": 1.51, + "grad_norm": 0.5854609608650208, + "learning_rate": 0.00029702088612037835, + "loss": 3.0961, + "step": 30801 + }, + { + "epoch": 1.51, + "grad_norm": 0.5687035322189331, + "learning_rate": 0.0002970054906930429, + "loss": 2.8367, + "step": 30802 + }, + { + "epoch": 1.51, + "grad_norm": 0.5669450759887695, + "learning_rate": 0.00029699009527359437, + "loss": 3.0268, + "step": 30803 + }, + { + "epoch": 1.51, + "grad_norm": 0.5802821516990662, + "learning_rate": 0.00029697469986207344, + "loss": 3.1936, + "step": 30804 + }, + { + "epoch": 1.51, + "grad_norm": 0.6279524564743042, + "learning_rate": 0.0002969593044585204, + "loss": 3.0683, + "step": 30805 + }, + { + "epoch": 1.51, + "grad_norm": 0.5692258477210999, + "learning_rate": 0.0002969439090629761, + "loss": 3.2225, + "step": 30806 + }, + { + "epoch": 1.51, + "grad_norm": 0.5868411064147949, + "learning_rate": 0.00029692851367548096, + "loss": 2.9523, + "step": 30807 + }, + { + "epoch": 1.51, + "grad_norm": 0.5972650647163391, + "learning_rate": 0.0002969131182960754, + "loss": 2.8724, + "step": 30808 + }, + { + "epoch": 1.51, + "grad_norm": 0.6030793786048889, + "learning_rate": 0.0002968977229248002, + "loss": 3.2454, + "step": 30809 + }, + { + "epoch": 1.51, + "grad_norm": 0.6051518321037292, + "learning_rate": 0.00029688232756169575, + "loss": 3.1048, + "step": 30810 + }, + { + "epoch": 1.51, + "grad_norm": 0.5919243097305298, + "learning_rate": 0.0002968669322068026, + "loss": 3.0596, + "step": 30811 + }, + { + "epoch": 1.51, + "grad_norm": 0.6317167282104492, + "learning_rate": 0.00029685153686016146, + "loss": 2.9332, + "step": 30812 + }, + { + "epoch": 1.51, + "grad_norm": 0.6132222414016724, + "learning_rate": 0.0002968361415218127, + "loss": 3.0974, + "step": 30813 + }, + { + "epoch": 1.51, + "grad_norm": 0.621911346912384, + "learning_rate": 0.000296820746191797, + "loss": 3.1756, + "step": 30814 + }, + { + "epoch": 1.51, + "grad_norm": 0.6402205228805542, + "learning_rate": 0.0002968053508701548, + "loss": 3.0177, + "step": 30815 + }, + { + "epoch": 1.51, + "grad_norm": 0.5846377015113831, + "learning_rate": 0.0002967899555569266, + "loss": 3.0623, + "step": 30816 + }, + { + "epoch": 1.51, + "grad_norm": 0.6023883819580078, + "learning_rate": 0.00029677456025215324, + "loss": 2.9456, + "step": 30817 + }, + { + "epoch": 1.51, + "grad_norm": 0.6518478393554688, + "learning_rate": 0.0002967591649558749, + "loss": 3.2577, + "step": 30818 + }, + { + "epoch": 1.51, + "grad_norm": 0.5860528945922852, + "learning_rate": 0.0002967437696681324, + "loss": 2.9268, + "step": 30819 + }, + { + "epoch": 1.51, + "grad_norm": 0.5669897198677063, + "learning_rate": 0.0002967283743889661, + "loss": 2.9486, + "step": 30820 + }, + { + "epoch": 1.51, + "grad_norm": 0.5880569815635681, + "learning_rate": 0.00029671297911841663, + "loss": 3.3732, + "step": 30821 + }, + { + "epoch": 1.51, + "grad_norm": 0.588849663734436, + "learning_rate": 0.00029669758385652466, + "loss": 3.0296, + "step": 30822 + }, + { + "epoch": 1.51, + "grad_norm": 0.5880824327468872, + "learning_rate": 0.0002966821886033305, + "loss": 3.1179, + "step": 30823 + }, + { + "epoch": 1.51, + "grad_norm": 0.5957514047622681, + "learning_rate": 0.00029666679335887486, + "loss": 3.0971, + "step": 30824 + }, + { + "epoch": 1.51, + "grad_norm": 0.646440327167511, + "learning_rate": 0.00029665139812319824, + "loss": 3.0703, + "step": 30825 + }, + { + "epoch": 1.51, + "grad_norm": 0.6188963055610657, + "learning_rate": 0.0002966360028963412, + "loss": 2.8634, + "step": 30826 + }, + { + "epoch": 1.51, + "grad_norm": 0.5998916625976562, + "learning_rate": 0.00029662060767834427, + "loss": 3.0819, + "step": 30827 + }, + { + "epoch": 1.51, + "grad_norm": 0.5960819721221924, + "learning_rate": 0.000296605212469248, + "loss": 3.1007, + "step": 30828 + }, + { + "epoch": 1.51, + "grad_norm": 0.6161120533943176, + "learning_rate": 0.00029658981726909305, + "loss": 2.9573, + "step": 30829 + }, + { + "epoch": 1.51, + "grad_norm": 0.6249703168869019, + "learning_rate": 0.0002965744220779197, + "loss": 3.0573, + "step": 30830 + }, + { + "epoch": 1.51, + "grad_norm": 0.6068581342697144, + "learning_rate": 0.0002965590268957687, + "loss": 3.1307, + "step": 30831 + }, + { + "epoch": 1.51, + "grad_norm": 0.5972431302070618, + "learning_rate": 0.0002965436317226807, + "loss": 3.2302, + "step": 30832 + }, + { + "epoch": 1.51, + "grad_norm": 0.5628926753997803, + "learning_rate": 0.0002965282365586959, + "loss": 3.204, + "step": 30833 + }, + { + "epoch": 1.51, + "grad_norm": 0.6214185357093811, + "learning_rate": 0.00029651284140385525, + "loss": 3.0294, + "step": 30834 + }, + { + "epoch": 1.51, + "grad_norm": 0.6001722812652588, + "learning_rate": 0.0002964974462581989, + "loss": 2.8884, + "step": 30835 + }, + { + "epoch": 1.51, + "grad_norm": 0.6169463396072388, + "learning_rate": 0.0002964820511217678, + "loss": 2.9654, + "step": 30836 + }, + { + "epoch": 1.51, + "grad_norm": 0.5828376412391663, + "learning_rate": 0.00029646665599460223, + "loss": 3.0695, + "step": 30837 + }, + { + "epoch": 1.51, + "grad_norm": 0.6448942422866821, + "learning_rate": 0.0002964512608767427, + "loss": 3.1534, + "step": 30838 + }, + { + "epoch": 1.51, + "grad_norm": 0.5935970544815063, + "learning_rate": 0.00029643586576823006, + "loss": 2.8783, + "step": 30839 + }, + { + "epoch": 1.51, + "grad_norm": 0.6004209518432617, + "learning_rate": 0.0002964204706691046, + "loss": 2.9735, + "step": 30840 + }, + { + "epoch": 1.51, + "grad_norm": 0.5771257877349854, + "learning_rate": 0.0002964050755794068, + "loss": 2.7251, + "step": 30841 + }, + { + "epoch": 1.51, + "grad_norm": 0.5918455123901367, + "learning_rate": 0.0002963896804991775, + "loss": 2.9045, + "step": 30842 + }, + { + "epoch": 1.51, + "grad_norm": 0.5754095911979675, + "learning_rate": 0.000296374285428457, + "loss": 3.2429, + "step": 30843 + }, + { + "epoch": 1.51, + "grad_norm": 0.5826681852340698, + "learning_rate": 0.000296358890367286, + "loss": 3.0659, + "step": 30844 + }, + { + "epoch": 1.51, + "grad_norm": 0.610785722732544, + "learning_rate": 0.00029634349531570487, + "loss": 3.0146, + "step": 30845 + }, + { + "epoch": 1.51, + "grad_norm": 0.5650796890258789, + "learning_rate": 0.0002963281002737544, + "loss": 2.9735, + "step": 30846 + }, + { + "epoch": 1.51, + "grad_norm": 0.6261953711509705, + "learning_rate": 0.000296312705241475, + "loss": 2.9293, + "step": 30847 + }, + { + "epoch": 1.51, + "grad_norm": 0.5803135633468628, + "learning_rate": 0.00029629731021890704, + "loss": 3.1329, + "step": 30848 + }, + { + "epoch": 1.51, + "grad_norm": 0.611264169216156, + "learning_rate": 0.0002962819152060915, + "loss": 3.0638, + "step": 30849 + }, + { + "epoch": 1.51, + "grad_norm": 0.587598979473114, + "learning_rate": 0.00029626652020306855, + "loss": 3.1472, + "step": 30850 + }, + { + "epoch": 1.51, + "grad_norm": 0.6119747161865234, + "learning_rate": 0.0002962511252098789, + "loss": 3.0915, + "step": 30851 + }, + { + "epoch": 1.51, + "grad_norm": 0.6030398011207581, + "learning_rate": 0.000296235730226563, + "loss": 2.7529, + "step": 30852 + }, + { + "epoch": 1.51, + "grad_norm": 0.6091843843460083, + "learning_rate": 0.0002962203352531615, + "loss": 3.1295, + "step": 30853 + }, + { + "epoch": 1.51, + "grad_norm": 0.5616978406906128, + "learning_rate": 0.00029620494028971496, + "loss": 3.268, + "step": 30854 + }, + { + "epoch": 1.51, + "grad_norm": 0.6114065647125244, + "learning_rate": 0.0002961895453362638, + "loss": 3.12, + "step": 30855 + }, + { + "epoch": 1.51, + "grad_norm": 0.6025994420051575, + "learning_rate": 0.00029617415039284874, + "loss": 2.9599, + "step": 30856 + }, + { + "epoch": 1.51, + "grad_norm": 0.5990439057350159, + "learning_rate": 0.0002961587554595101, + "loss": 3.1287, + "step": 30857 + }, + { + "epoch": 1.51, + "grad_norm": 0.5876929759979248, + "learning_rate": 0.0002961433605362886, + "loss": 3.0938, + "step": 30858 + }, + { + "epoch": 1.51, + "grad_norm": 0.6120889186859131, + "learning_rate": 0.00029612796562322486, + "loss": 3.1546, + "step": 30859 + }, + { + "epoch": 1.51, + "grad_norm": 0.5933310985565186, + "learning_rate": 0.00029611257072035917, + "loss": 3.0761, + "step": 30860 + }, + { + "epoch": 1.51, + "grad_norm": 0.646134614944458, + "learning_rate": 0.0002960971758277323, + "loss": 3.1372, + "step": 30861 + }, + { + "epoch": 1.51, + "grad_norm": 0.5914332866668701, + "learning_rate": 0.0002960817809453847, + "loss": 2.8682, + "step": 30862 + }, + { + "epoch": 1.51, + "grad_norm": 0.6064649820327759, + "learning_rate": 0.00029606638607335687, + "loss": 3.0682, + "step": 30863 + }, + { + "epoch": 1.51, + "grad_norm": 0.5667405128479004, + "learning_rate": 0.00029605099121168956, + "loss": 3.1389, + "step": 30864 + }, + { + "epoch": 1.51, + "grad_norm": 0.5688351988792419, + "learning_rate": 0.0002960355963604231, + "loss": 3.0559, + "step": 30865 + }, + { + "epoch": 1.51, + "grad_norm": 0.5366978645324707, + "learning_rate": 0.0002960202015195982, + "loss": 3.0342, + "step": 30866 + }, + { + "epoch": 1.51, + "grad_norm": 0.6322748064994812, + "learning_rate": 0.00029600480668925516, + "loss": 3.4253, + "step": 30867 + }, + { + "epoch": 1.51, + "grad_norm": 0.6188519597053528, + "learning_rate": 0.00029598941186943475, + "loss": 3.2374, + "step": 30868 + }, + { + "epoch": 1.51, + "grad_norm": 0.567204475402832, + "learning_rate": 0.00029597401706017755, + "loss": 3.1257, + "step": 30869 + }, + { + "epoch": 1.51, + "grad_norm": 0.6143216490745544, + "learning_rate": 0.00029595862226152386, + "loss": 2.9782, + "step": 30870 + }, + { + "epoch": 1.51, + "grad_norm": 0.6275702118873596, + "learning_rate": 0.00029594322747351453, + "loss": 3.0836, + "step": 30871 + }, + { + "epoch": 1.51, + "grad_norm": 0.7560949325561523, + "learning_rate": 0.0002959278326961899, + "loss": 3.0159, + "step": 30872 + }, + { + "epoch": 1.51, + "grad_norm": 0.6060622334480286, + "learning_rate": 0.0002959124379295905, + "loss": 3.2309, + "step": 30873 + }, + { + "epoch": 1.51, + "grad_norm": 0.5544068813323975, + "learning_rate": 0.0002958970431737571, + "loss": 2.9914, + "step": 30874 + }, + { + "epoch": 1.51, + "grad_norm": 0.5971145629882812, + "learning_rate": 0.00029588164842872994, + "loss": 3.0419, + "step": 30875 + }, + { + "epoch": 1.51, + "grad_norm": 0.6468105912208557, + "learning_rate": 0.0002958662536945499, + "loss": 2.7897, + "step": 30876 + }, + { + "epoch": 1.51, + "grad_norm": 0.6096239686012268, + "learning_rate": 0.0002958508589712572, + "loss": 3.1458, + "step": 30877 + }, + { + "epoch": 1.51, + "grad_norm": 0.6860781908035278, + "learning_rate": 0.00029583546425889255, + "loss": 2.8722, + "step": 30878 + }, + { + "epoch": 1.51, + "grad_norm": 0.5959381461143494, + "learning_rate": 0.0002958200695574966, + "loss": 3.0762, + "step": 30879 + }, + { + "epoch": 1.51, + "grad_norm": 0.587128221988678, + "learning_rate": 0.00029580467486710967, + "loss": 3.1168, + "step": 30880 + }, + { + "epoch": 1.51, + "grad_norm": 0.6013617515563965, + "learning_rate": 0.00029578928018777253, + "loss": 3.0795, + "step": 30881 + }, + { + "epoch": 1.51, + "grad_norm": 0.6354458928108215, + "learning_rate": 0.0002957738855195255, + "loss": 3.2363, + "step": 30882 + }, + { + "epoch": 1.51, + "grad_norm": 0.6061522960662842, + "learning_rate": 0.0002957584908624093, + "loss": 3.1117, + "step": 30883 + }, + { + "epoch": 1.51, + "grad_norm": 0.6070138216018677, + "learning_rate": 0.00029574309621646443, + "loss": 2.9919, + "step": 30884 + }, + { + "epoch": 1.51, + "grad_norm": 0.6107861399650574, + "learning_rate": 0.0002957277015817314, + "loss": 3.265, + "step": 30885 + }, + { + "epoch": 1.51, + "grad_norm": 0.5738034844398499, + "learning_rate": 0.0002957123069582508, + "loss": 3.1544, + "step": 30886 + }, + { + "epoch": 1.51, + "grad_norm": 0.6216245889663696, + "learning_rate": 0.00029569691234606314, + "loss": 2.9596, + "step": 30887 + }, + { + "epoch": 1.51, + "grad_norm": 0.6213082671165466, + "learning_rate": 0.0002956815177452089, + "loss": 2.9898, + "step": 30888 + }, + { + "epoch": 1.51, + "grad_norm": 0.6034766435623169, + "learning_rate": 0.0002956661231557289, + "loss": 3.1961, + "step": 30889 + }, + { + "epoch": 1.51, + "grad_norm": 0.5714133977890015, + "learning_rate": 0.0002956507285776634, + "loss": 3.2041, + "step": 30890 + }, + { + "epoch": 1.51, + "grad_norm": 0.5667616724967957, + "learning_rate": 0.0002956353340110531, + "loss": 3.1601, + "step": 30891 + }, + { + "epoch": 1.51, + "grad_norm": 0.5944485068321228, + "learning_rate": 0.00029561993945593834, + "loss": 3.0033, + "step": 30892 + }, + { + "epoch": 1.51, + "grad_norm": 0.5806108117103577, + "learning_rate": 0.0002956045449123599, + "loss": 2.8649, + "step": 30893 + }, + { + "epoch": 1.51, + "grad_norm": 0.621354877948761, + "learning_rate": 0.0002955891503803583, + "loss": 3.0581, + "step": 30894 + }, + { + "epoch": 1.51, + "grad_norm": 0.6162626147270203, + "learning_rate": 0.00029557375585997393, + "loss": 2.9909, + "step": 30895 + }, + { + "epoch": 1.51, + "grad_norm": 0.5693962574005127, + "learning_rate": 0.00029555836135124755, + "loss": 3.117, + "step": 30896 + }, + { + "epoch": 1.51, + "grad_norm": 0.7049458622932434, + "learning_rate": 0.00029554296685421945, + "loss": 3.0019, + "step": 30897 + }, + { + "epoch": 1.51, + "grad_norm": 0.5936644077301025, + "learning_rate": 0.0002955275723689304, + "loss": 3.3479, + "step": 30898 + }, + { + "epoch": 1.51, + "grad_norm": 0.5873288512229919, + "learning_rate": 0.00029551217789542094, + "loss": 2.8661, + "step": 30899 + }, + { + "epoch": 1.51, + "grad_norm": 0.5874931812286377, + "learning_rate": 0.00029549678343373137, + "loss": 3.2489, + "step": 30900 + }, + { + "epoch": 1.51, + "grad_norm": 0.6320956945419312, + "learning_rate": 0.00029548138898390254, + "loss": 2.8875, + "step": 30901 + }, + { + "epoch": 1.51, + "grad_norm": 0.5752156972885132, + "learning_rate": 0.00029546599454597484, + "loss": 2.9791, + "step": 30902 + }, + { + "epoch": 1.51, + "grad_norm": 0.5847448110580444, + "learning_rate": 0.00029545060011998875, + "loss": 2.9069, + "step": 30903 + }, + { + "epoch": 1.51, + "grad_norm": 0.5820603966712952, + "learning_rate": 0.00029543520570598505, + "loss": 3.1561, + "step": 30904 + }, + { + "epoch": 1.51, + "grad_norm": 0.6131413578987122, + "learning_rate": 0.000295419811304004, + "loss": 3.1054, + "step": 30905 + }, + { + "epoch": 1.51, + "grad_norm": 0.6036339402198792, + "learning_rate": 0.0002954044169140864, + "loss": 2.9931, + "step": 30906 + }, + { + "epoch": 1.51, + "grad_norm": 0.6479763984680176, + "learning_rate": 0.0002953890225362726, + "loss": 3.1477, + "step": 30907 + }, + { + "epoch": 1.51, + "grad_norm": 0.5721249580383301, + "learning_rate": 0.00029537362817060324, + "loss": 3.0113, + "step": 30908 + }, + { + "epoch": 1.51, + "grad_norm": 0.6328310370445251, + "learning_rate": 0.0002953582338171189, + "loss": 2.9072, + "step": 30909 + }, + { + "epoch": 1.51, + "grad_norm": 0.6006953716278076, + "learning_rate": 0.00029534283947586, + "loss": 2.95, + "step": 30910 + }, + { + "epoch": 1.51, + "grad_norm": 0.6052888631820679, + "learning_rate": 0.00029532744514686724, + "loss": 2.8012, + "step": 30911 + }, + { + "epoch": 1.51, + "grad_norm": 0.6413729190826416, + "learning_rate": 0.00029531205083018104, + "loss": 3.1825, + "step": 30912 + }, + { + "epoch": 1.51, + "grad_norm": 0.5984901785850525, + "learning_rate": 0.00029529665652584195, + "loss": 2.9248, + "step": 30913 + }, + { + "epoch": 1.52, + "grad_norm": 0.5955290198326111, + "learning_rate": 0.0002952812622338907, + "loss": 2.9168, + "step": 30914 + }, + { + "epoch": 1.52, + "grad_norm": 0.6276412606239319, + "learning_rate": 0.00029526586795436757, + "loss": 3.1651, + "step": 30915 + }, + { + "epoch": 1.52, + "grad_norm": 0.5960054397583008, + "learning_rate": 0.0002952504736873134, + "loss": 3.1858, + "step": 30916 + }, + { + "epoch": 1.52, + "grad_norm": 0.5850929021835327, + "learning_rate": 0.0002952350794327684, + "loss": 3.048, + "step": 30917 + }, + { + "epoch": 1.52, + "grad_norm": 0.616568386554718, + "learning_rate": 0.00029521968519077325, + "loss": 2.8989, + "step": 30918 + }, + { + "epoch": 1.52, + "grad_norm": 0.623950183391571, + "learning_rate": 0.0002952042909613687, + "loss": 3.1168, + "step": 30919 + }, + { + "epoch": 1.52, + "grad_norm": 0.5852279663085938, + "learning_rate": 0.000295188896744595, + "loss": 3.0668, + "step": 30920 + }, + { + "epoch": 1.52, + "grad_norm": 0.5998043417930603, + "learning_rate": 0.00029517350254049297, + "loss": 3.0312, + "step": 30921 + }, + { + "epoch": 1.52, + "grad_norm": 0.6115825772285461, + "learning_rate": 0.0002951581083491028, + "loss": 2.8682, + "step": 30922 + }, + { + "epoch": 1.52, + "grad_norm": 0.6442579627037048, + "learning_rate": 0.0002951427141704654, + "loss": 3.1506, + "step": 30923 + }, + { + "epoch": 1.52, + "grad_norm": 0.6034064292907715, + "learning_rate": 0.0002951273200046211, + "loss": 3.1786, + "step": 30924 + }, + { + "epoch": 1.52, + "grad_norm": 0.5963617563247681, + "learning_rate": 0.0002951119258516104, + "loss": 2.975, + "step": 30925 + }, + { + "epoch": 1.52, + "grad_norm": 0.6269310712814331, + "learning_rate": 0.0002950965317114741, + "loss": 3.1549, + "step": 30926 + }, + { + "epoch": 1.52, + "grad_norm": 0.5554681420326233, + "learning_rate": 0.0002950811375842526, + "loss": 3.099, + "step": 30927 + }, + { + "epoch": 1.52, + "grad_norm": 0.6449235677719116, + "learning_rate": 0.0002950657434699864, + "loss": 3.0467, + "step": 30928 + }, + { + "epoch": 1.52, + "grad_norm": 0.633747398853302, + "learning_rate": 0.00029505034936871597, + "loss": 2.9924, + "step": 30929 + }, + { + "epoch": 1.52, + "grad_norm": 0.5843096971511841, + "learning_rate": 0.00029503495528048206, + "loss": 3.2848, + "step": 30930 + }, + { + "epoch": 1.52, + "grad_norm": 0.6081466674804688, + "learning_rate": 0.00029501956120532515, + "loss": 3.054, + "step": 30931 + }, + { + "epoch": 1.52, + "grad_norm": 0.5923753976821899, + "learning_rate": 0.0002950041671432856, + "loss": 2.893, + "step": 30932 + }, + { + "epoch": 1.52, + "grad_norm": 0.638497531414032, + "learning_rate": 0.00029498877309440435, + "loss": 3.1434, + "step": 30933 + }, + { + "epoch": 1.52, + "grad_norm": 0.5726613998413086, + "learning_rate": 0.00029497337905872155, + "loss": 3.0945, + "step": 30934 + }, + { + "epoch": 1.52, + "grad_norm": 0.6029412746429443, + "learning_rate": 0.0002949579850362779, + "loss": 3.0213, + "step": 30935 + }, + { + "epoch": 1.52, + "grad_norm": 0.6141619086265564, + "learning_rate": 0.00029494259102711405, + "loss": 2.8401, + "step": 30936 + }, + { + "epoch": 1.52, + "grad_norm": 0.5874308347702026, + "learning_rate": 0.0002949271970312703, + "loss": 3.1897, + "step": 30937 + }, + { + "epoch": 1.52, + "grad_norm": 0.5788091421127319, + "learning_rate": 0.0002949118030487875, + "loss": 3.0942, + "step": 30938 + }, + { + "epoch": 1.52, + "grad_norm": 0.6286618709564209, + "learning_rate": 0.00029489640907970593, + "loss": 2.9367, + "step": 30939 + }, + { + "epoch": 1.52, + "grad_norm": 0.588944137096405, + "learning_rate": 0.00029488101512406615, + "loss": 3.0126, + "step": 30940 + }, + { + "epoch": 1.52, + "grad_norm": 0.5936257243156433, + "learning_rate": 0.000294865621181909, + "loss": 2.9992, + "step": 30941 + }, + { + "epoch": 1.52, + "grad_norm": 0.5838150382041931, + "learning_rate": 0.0002948502272532747, + "loss": 3.0763, + "step": 30942 + }, + { + "epoch": 1.52, + "grad_norm": 0.5851708650588989, + "learning_rate": 0.000294834833338204, + "loss": 3.0234, + "step": 30943 + }, + { + "epoch": 1.52, + "grad_norm": 0.6120582818984985, + "learning_rate": 0.00029481943943673715, + "loss": 2.7278, + "step": 30944 + }, + { + "epoch": 1.52, + "grad_norm": 0.6074814796447754, + "learning_rate": 0.00029480404554891503, + "loss": 3.1052, + "step": 30945 + }, + { + "epoch": 1.52, + "grad_norm": 0.5912714004516602, + "learning_rate": 0.00029478865167477814, + "loss": 2.8922, + "step": 30946 + }, + { + "epoch": 1.52, + "grad_norm": 0.5891293287277222, + "learning_rate": 0.00029477325781436677, + "loss": 3.0284, + "step": 30947 + }, + { + "epoch": 1.52, + "grad_norm": 0.714016854763031, + "learning_rate": 0.0002947578639677218, + "loss": 2.9723, + "step": 30948 + }, + { + "epoch": 1.52, + "grad_norm": 0.5967224836349487, + "learning_rate": 0.0002947424701348835, + "loss": 3.0212, + "step": 30949 + }, + { + "epoch": 1.52, + "grad_norm": 0.5975646376609802, + "learning_rate": 0.00029472707631589246, + "loss": 3.0859, + "step": 30950 + }, + { + "epoch": 1.52, + "grad_norm": 0.6335617899894714, + "learning_rate": 0.00029471168251078944, + "loss": 2.9693, + "step": 30951 + }, + { + "epoch": 1.52, + "grad_norm": 0.6031022071838379, + "learning_rate": 0.00029469628871961473, + "loss": 2.8384, + "step": 30952 + }, + { + "epoch": 1.52, + "grad_norm": 0.5797258019447327, + "learning_rate": 0.0002946808949424091, + "loss": 2.9484, + "step": 30953 + }, + { + "epoch": 1.52, + "grad_norm": 0.5635722875595093, + "learning_rate": 0.00029466550117921277, + "loss": 3.0065, + "step": 30954 + }, + { + "epoch": 1.52, + "grad_norm": 0.6008172631263733, + "learning_rate": 0.00029465010743006656, + "loss": 3.1646, + "step": 30955 + }, + { + "epoch": 1.52, + "grad_norm": 0.5772174596786499, + "learning_rate": 0.0002946347136950111, + "loss": 2.8605, + "step": 30956 + }, + { + "epoch": 1.52, + "grad_norm": 0.6074007749557495, + "learning_rate": 0.00029461931997408654, + "loss": 3.1638, + "step": 30957 + }, + { + "epoch": 1.52, + "grad_norm": 0.5578022003173828, + "learning_rate": 0.0002946039262673338, + "loss": 2.9493, + "step": 30958 + }, + { + "epoch": 1.52, + "grad_norm": 0.6266377568244934, + "learning_rate": 0.0002945885325747931, + "loss": 2.9573, + "step": 30959 + }, + { + "epoch": 1.52, + "grad_norm": 0.6051025986671448, + "learning_rate": 0.0002945731388965053, + "loss": 2.8036, + "step": 30960 + }, + { + "epoch": 1.52, + "grad_norm": 0.6597091555595398, + "learning_rate": 0.0002945577452325109, + "loss": 3.0133, + "step": 30961 + }, + { + "epoch": 1.52, + "grad_norm": 0.5791193842887878, + "learning_rate": 0.00029454235158285014, + "loss": 3.1355, + "step": 30962 + }, + { + "epoch": 1.52, + "grad_norm": 0.6521401405334473, + "learning_rate": 0.00029452695794756397, + "loss": 3.2595, + "step": 30963 + }, + { + "epoch": 1.52, + "grad_norm": 0.614771842956543, + "learning_rate": 0.00029451156432669265, + "loss": 3.0447, + "step": 30964 + }, + { + "epoch": 1.52, + "grad_norm": 0.6045995354652405, + "learning_rate": 0.0002944961707202768, + "loss": 3.0624, + "step": 30965 + }, + { + "epoch": 1.52, + "grad_norm": 0.5819627642631531, + "learning_rate": 0.000294480777128357, + "loss": 3.2695, + "step": 30966 + }, + { + "epoch": 1.52, + "grad_norm": 0.5987125039100647, + "learning_rate": 0.0002944653835509738, + "loss": 3.1373, + "step": 30967 + }, + { + "epoch": 1.52, + "grad_norm": 0.5703858137130737, + "learning_rate": 0.00029444998998816774, + "loss": 3.2609, + "step": 30968 + }, + { + "epoch": 1.52, + "grad_norm": 0.6514813303947449, + "learning_rate": 0.0002944345964399792, + "loss": 3.0562, + "step": 30969 + }, + { + "epoch": 1.52, + "grad_norm": 0.6097198724746704, + "learning_rate": 0.00029441920290644894, + "loss": 3.1765, + "step": 30970 + }, + { + "epoch": 1.52, + "grad_norm": 0.5842579007148743, + "learning_rate": 0.00029440380938761747, + "loss": 3.0327, + "step": 30971 + }, + { + "epoch": 1.52, + "grad_norm": 0.5636705756187439, + "learning_rate": 0.0002943884158835252, + "loss": 2.9826, + "step": 30972 + }, + { + "epoch": 1.52, + "grad_norm": 0.5914285778999329, + "learning_rate": 0.00029437302239421285, + "loss": 2.9421, + "step": 30973 + }, + { + "epoch": 1.52, + "grad_norm": 0.6153814792633057, + "learning_rate": 0.00029435762891972085, + "loss": 3.1329, + "step": 30974 + }, + { + "epoch": 1.52, + "grad_norm": 0.5827009081840515, + "learning_rate": 0.0002943422354600897, + "loss": 3.1252, + "step": 30975 + }, + { + "epoch": 1.52, + "grad_norm": 0.5662691593170166, + "learning_rate": 0.00029432684201536013, + "loss": 3.2731, + "step": 30976 + }, + { + "epoch": 1.52, + "grad_norm": 0.6179322600364685, + "learning_rate": 0.00029431144858557245, + "loss": 3.048, + "step": 30977 + }, + { + "epoch": 1.52, + "grad_norm": 0.6341006755828857, + "learning_rate": 0.0002942960551707674, + "loss": 3.1041, + "step": 30978 + }, + { + "epoch": 1.52, + "grad_norm": 0.626679539680481, + "learning_rate": 0.0002942806617709854, + "loss": 3.1037, + "step": 30979 + }, + { + "epoch": 1.52, + "grad_norm": 0.5652845501899719, + "learning_rate": 0.000294265268386267, + "loss": 2.7815, + "step": 30980 + }, + { + "epoch": 1.52, + "grad_norm": 0.6046680212020874, + "learning_rate": 0.0002942498750166529, + "loss": 2.9942, + "step": 30981 + }, + { + "epoch": 1.52, + "grad_norm": 0.5820021629333496, + "learning_rate": 0.00029423448166218343, + "loss": 3.126, + "step": 30982 + }, + { + "epoch": 1.52, + "grad_norm": 0.6065924167633057, + "learning_rate": 0.0002942190883228993, + "loss": 2.8712, + "step": 30983 + }, + { + "epoch": 1.52, + "grad_norm": 0.6059271693229675, + "learning_rate": 0.00029420369499884086, + "loss": 3.0145, + "step": 30984 + }, + { + "epoch": 1.52, + "grad_norm": 0.6043378710746765, + "learning_rate": 0.0002941883016900489, + "loss": 3.1985, + "step": 30985 + }, + { + "epoch": 1.52, + "grad_norm": 0.5847151875495911, + "learning_rate": 0.0002941729083965638, + "loss": 3.1297, + "step": 30986 + }, + { + "epoch": 1.52, + "grad_norm": 0.6242455244064331, + "learning_rate": 0.000294157515118426, + "loss": 3.034, + "step": 30987 + }, + { + "epoch": 1.52, + "grad_norm": 0.6258992552757263, + "learning_rate": 0.00029414212185567643, + "loss": 2.9636, + "step": 30988 + }, + { + "epoch": 1.52, + "grad_norm": 0.6174604892730713, + "learning_rate": 0.0002941267286083552, + "loss": 3.1873, + "step": 30989 + }, + { + "epoch": 1.52, + "grad_norm": 0.6053414940834045, + "learning_rate": 0.00029411133537650305, + "loss": 3.0621, + "step": 30990 + }, + { + "epoch": 1.52, + "grad_norm": 0.6238487362861633, + "learning_rate": 0.00029409594216016057, + "loss": 3.3516, + "step": 30991 + }, + { + "epoch": 1.52, + "grad_norm": 0.6055889129638672, + "learning_rate": 0.0002940805489593682, + "loss": 2.9447, + "step": 30992 + }, + { + "epoch": 1.52, + "grad_norm": 0.6507366895675659, + "learning_rate": 0.0002940651557741666, + "loss": 3.007, + "step": 30993 + }, + { + "epoch": 1.52, + "grad_norm": 0.5801891684532166, + "learning_rate": 0.0002940497626045962, + "loss": 3.0388, + "step": 30994 + }, + { + "epoch": 1.52, + "grad_norm": 0.5716410875320435, + "learning_rate": 0.0002940343694506975, + "loss": 2.8096, + "step": 30995 + }, + { + "epoch": 1.52, + "grad_norm": 0.5872518420219421, + "learning_rate": 0.00029401897631251125, + "loss": 3.0487, + "step": 30996 + }, + { + "epoch": 1.52, + "grad_norm": 0.6373841762542725, + "learning_rate": 0.0002940035831900777, + "loss": 3.104, + "step": 30997 + }, + { + "epoch": 1.52, + "grad_norm": 0.6203256845474243, + "learning_rate": 0.0002939881900834377, + "loss": 2.8221, + "step": 30998 + }, + { + "epoch": 1.52, + "grad_norm": 0.5925654768943787, + "learning_rate": 0.0002939727969926316, + "loss": 3.0561, + "step": 30999 + }, + { + "epoch": 1.52, + "grad_norm": 0.58648282289505, + "learning_rate": 0.00029395740391770003, + "loss": 3.1059, + "step": 31000 + }, + { + "epoch": 1.52, + "grad_norm": 0.5882959365844727, + "learning_rate": 0.00029394201085868347, + "loss": 3.2673, + "step": 31001 + }, + { + "epoch": 1.52, + "grad_norm": 0.6323480606079102, + "learning_rate": 0.0002939266178156224, + "loss": 3.2371, + "step": 31002 + }, + { + "epoch": 1.52, + "grad_norm": 0.5646398067474365, + "learning_rate": 0.0002939112247885576, + "loss": 3.3075, + "step": 31003 + }, + { + "epoch": 1.52, + "grad_norm": 0.9151919484138489, + "learning_rate": 0.0002938958317775294, + "loss": 3.2684, + "step": 31004 + }, + { + "epoch": 1.52, + "grad_norm": 0.5926310420036316, + "learning_rate": 0.00029388043878257847, + "loss": 3.0728, + "step": 31005 + }, + { + "epoch": 1.52, + "grad_norm": 0.580113410949707, + "learning_rate": 0.0002938650458037451, + "loss": 3.203, + "step": 31006 + }, + { + "epoch": 1.52, + "grad_norm": 0.5675933361053467, + "learning_rate": 0.00029384965284107016, + "loss": 3.0887, + "step": 31007 + }, + { + "epoch": 1.52, + "grad_norm": 0.6494491696357727, + "learning_rate": 0.00029383425989459405, + "loss": 3.0097, + "step": 31008 + }, + { + "epoch": 1.52, + "grad_norm": 0.5970069169998169, + "learning_rate": 0.00029381886696435724, + "loss": 2.7491, + "step": 31009 + }, + { + "epoch": 1.52, + "grad_norm": 0.5872333645820618, + "learning_rate": 0.0002938034740504004, + "loss": 3.293, + "step": 31010 + }, + { + "epoch": 1.52, + "grad_norm": 0.5623251795768738, + "learning_rate": 0.00029378808115276405, + "loss": 3.0746, + "step": 31011 + }, + { + "epoch": 1.52, + "grad_norm": 0.5938485860824585, + "learning_rate": 0.00029377268827148855, + "loss": 3.1148, + "step": 31012 + }, + { + "epoch": 1.52, + "grad_norm": 0.5834391713142395, + "learning_rate": 0.0002937572954066148, + "loss": 2.8762, + "step": 31013 + }, + { + "epoch": 1.52, + "grad_norm": 0.5958360433578491, + "learning_rate": 0.000293741902558183, + "loss": 2.9429, + "step": 31014 + }, + { + "epoch": 1.52, + "grad_norm": 0.5586824417114258, + "learning_rate": 0.00029372650972623395, + "loss": 3.0833, + "step": 31015 + }, + { + "epoch": 1.52, + "grad_norm": 0.6077871322631836, + "learning_rate": 0.00029371111691080785, + "loss": 2.9211, + "step": 31016 + }, + { + "epoch": 1.52, + "grad_norm": 0.566399097442627, + "learning_rate": 0.00029369572411194555, + "loss": 3.0479, + "step": 31017 + }, + { + "epoch": 1.52, + "grad_norm": 0.5618554949760437, + "learning_rate": 0.00029368033132968764, + "loss": 2.9492, + "step": 31018 + }, + { + "epoch": 1.52, + "grad_norm": 0.6119033694267273, + "learning_rate": 0.00029366493856407435, + "loss": 3.0998, + "step": 31019 + }, + { + "epoch": 1.52, + "grad_norm": 0.5961689352989197, + "learning_rate": 0.0002936495458151465, + "loss": 2.8835, + "step": 31020 + }, + { + "epoch": 1.52, + "grad_norm": 0.563315212726593, + "learning_rate": 0.0002936341530829444, + "loss": 3.0595, + "step": 31021 + }, + { + "epoch": 1.52, + "grad_norm": 0.5843584537506104, + "learning_rate": 0.0002936187603675088, + "loss": 3.117, + "step": 31022 + }, + { + "epoch": 1.52, + "grad_norm": 0.6035363674163818, + "learning_rate": 0.00029360336766888023, + "loss": 2.9555, + "step": 31023 + }, + { + "epoch": 1.52, + "grad_norm": 0.6223370432853699, + "learning_rate": 0.00029358797498709897, + "loss": 3.3564, + "step": 31024 + }, + { + "epoch": 1.52, + "grad_norm": 0.5917815566062927, + "learning_rate": 0.00029357258232220595, + "loss": 3.1168, + "step": 31025 + }, + { + "epoch": 1.52, + "grad_norm": 0.567468523979187, + "learning_rate": 0.00029355718967424144, + "loss": 3.0815, + "step": 31026 + }, + { + "epoch": 1.52, + "grad_norm": 0.6053411364555359, + "learning_rate": 0.00029354179704324594, + "loss": 2.9032, + "step": 31027 + }, + { + "epoch": 1.52, + "grad_norm": 0.6136942505836487, + "learning_rate": 0.0002935264044292602, + "loss": 2.9913, + "step": 31028 + }, + { + "epoch": 1.52, + "grad_norm": 0.5973330736160278, + "learning_rate": 0.00029351101183232466, + "loss": 3.091, + "step": 31029 + }, + { + "epoch": 1.52, + "grad_norm": 0.6054880619049072, + "learning_rate": 0.00029349561925248, + "loss": 2.8339, + "step": 31030 + }, + { + "epoch": 1.52, + "grad_norm": 0.5846839547157288, + "learning_rate": 0.0002934802266897664, + "loss": 2.973, + "step": 31031 + }, + { + "epoch": 1.52, + "grad_norm": 0.5589364767074585, + "learning_rate": 0.0002934648341442247, + "loss": 3.0979, + "step": 31032 + }, + { + "epoch": 1.52, + "grad_norm": 0.5708386898040771, + "learning_rate": 0.0002934494416158954, + "loss": 3.19, + "step": 31033 + }, + { + "epoch": 1.52, + "grad_norm": 0.5712014436721802, + "learning_rate": 0.000293434049104819, + "loss": 3.1109, + "step": 31034 + }, + { + "epoch": 1.52, + "grad_norm": 0.6233195662498474, + "learning_rate": 0.00029341865661103604, + "loss": 3.1072, + "step": 31035 + }, + { + "epoch": 1.52, + "grad_norm": 0.5828282833099365, + "learning_rate": 0.0002934032641345871, + "loss": 3.1385, + "step": 31036 + }, + { + "epoch": 1.52, + "grad_norm": 0.6146054863929749, + "learning_rate": 0.00029338787167551263, + "loss": 2.9546, + "step": 31037 + }, + { + "epoch": 1.52, + "grad_norm": 0.5709480047225952, + "learning_rate": 0.0002933724792338533, + "loss": 2.9423, + "step": 31038 + }, + { + "epoch": 1.52, + "grad_norm": 0.5619285702705383, + "learning_rate": 0.00029335708680964953, + "loss": 2.9497, + "step": 31039 + }, + { + "epoch": 1.52, + "grad_norm": 0.6475139856338501, + "learning_rate": 0.000293341694402942, + "loss": 2.9842, + "step": 31040 + }, + { + "epoch": 1.52, + "grad_norm": 0.5895599722862244, + "learning_rate": 0.0002933263020137711, + "loss": 2.8115, + "step": 31041 + }, + { + "epoch": 1.52, + "grad_norm": 0.5933480858802795, + "learning_rate": 0.0002933109096421774, + "loss": 3.0456, + "step": 31042 + }, + { + "epoch": 1.52, + "grad_norm": 0.6113942861557007, + "learning_rate": 0.00029329551728820153, + "loss": 3.0089, + "step": 31043 + }, + { + "epoch": 1.52, + "grad_norm": 0.5870497226715088, + "learning_rate": 0.000293280124951884, + "loss": 3.2366, + "step": 31044 + }, + { + "epoch": 1.52, + "grad_norm": 0.6341589689254761, + "learning_rate": 0.00029326473263326536, + "loss": 3.2206, + "step": 31045 + }, + { + "epoch": 1.52, + "grad_norm": 0.6039684414863586, + "learning_rate": 0.000293249340332386, + "loss": 2.9294, + "step": 31046 + }, + { + "epoch": 1.52, + "grad_norm": 0.609302282333374, + "learning_rate": 0.0002932339480492866, + "loss": 3.1502, + "step": 31047 + }, + { + "epoch": 1.52, + "grad_norm": 0.5908909440040588, + "learning_rate": 0.0002932185557840078, + "loss": 3.0183, + "step": 31048 + }, + { + "epoch": 1.52, + "grad_norm": 0.6392471790313721, + "learning_rate": 0.0002932031635365899, + "loss": 2.9455, + "step": 31049 + }, + { + "epoch": 1.52, + "grad_norm": 0.6417034864425659, + "learning_rate": 0.00029318777130707364, + "loss": 2.9563, + "step": 31050 + }, + { + "epoch": 1.52, + "grad_norm": 0.637570858001709, + "learning_rate": 0.0002931723790954994, + "loss": 2.9481, + "step": 31051 + }, + { + "epoch": 1.52, + "grad_norm": 0.5919541716575623, + "learning_rate": 0.0002931569869019078, + "loss": 3.021, + "step": 31052 + }, + { + "epoch": 1.52, + "grad_norm": 0.6471995115280151, + "learning_rate": 0.00029314159472633943, + "loss": 2.9314, + "step": 31053 + }, + { + "epoch": 1.52, + "grad_norm": 0.5962557196617126, + "learning_rate": 0.00029312620256883475, + "loss": 2.9913, + "step": 31054 + }, + { + "epoch": 1.52, + "grad_norm": 0.57488614320755, + "learning_rate": 0.0002931108104294344, + "loss": 3.0118, + "step": 31055 + }, + { + "epoch": 1.52, + "grad_norm": 0.6746479272842407, + "learning_rate": 0.00029309541830817873, + "loss": 2.932, + "step": 31056 + }, + { + "epoch": 1.52, + "grad_norm": 0.6285524964332581, + "learning_rate": 0.0002930800262051085, + "loss": 3.079, + "step": 31057 + }, + { + "epoch": 1.52, + "grad_norm": 0.6768153309822083, + "learning_rate": 0.0002930646341202641, + "loss": 3.0896, + "step": 31058 + }, + { + "epoch": 1.52, + "grad_norm": 0.6034467220306396, + "learning_rate": 0.0002930492420536861, + "loss": 3.1119, + "step": 31059 + }, + { + "epoch": 1.52, + "grad_norm": 0.5936095714569092, + "learning_rate": 0.00029303385000541517, + "loss": 3.1263, + "step": 31060 + }, + { + "epoch": 1.52, + "grad_norm": 0.6120097041130066, + "learning_rate": 0.00029301845797549155, + "loss": 2.9978, + "step": 31061 + }, + { + "epoch": 1.52, + "grad_norm": 0.6264371871948242, + "learning_rate": 0.00029300306596395604, + "loss": 3.0816, + "step": 31062 + }, + { + "epoch": 1.52, + "grad_norm": 0.6259018182754517, + "learning_rate": 0.00029298767397084927, + "loss": 2.8979, + "step": 31063 + }, + { + "epoch": 1.52, + "grad_norm": 0.6216327548027039, + "learning_rate": 0.0002929722819962114, + "loss": 3.02, + "step": 31064 + }, + { + "epoch": 1.52, + "grad_norm": 0.6106691956520081, + "learning_rate": 0.00029295689004008333, + "loss": 3.1225, + "step": 31065 + }, + { + "epoch": 1.52, + "grad_norm": 0.6300420761108398, + "learning_rate": 0.0002929414981025054, + "loss": 3.0543, + "step": 31066 + }, + { + "epoch": 1.52, + "grad_norm": 0.6276101469993591, + "learning_rate": 0.00029292610618351814, + "loss": 3.0879, + "step": 31067 + }, + { + "epoch": 1.52, + "grad_norm": 0.5739606618881226, + "learning_rate": 0.00029291071428316225, + "loss": 2.9895, + "step": 31068 + }, + { + "epoch": 1.52, + "grad_norm": 0.6423936486244202, + "learning_rate": 0.0002928953224014782, + "loss": 2.9886, + "step": 31069 + }, + { + "epoch": 1.52, + "grad_norm": 0.563739001750946, + "learning_rate": 0.00029287993053850645, + "loss": 2.9262, + "step": 31070 + }, + { + "epoch": 1.52, + "grad_norm": 0.6203590631484985, + "learning_rate": 0.0002928645386942875, + "loss": 3.1504, + "step": 31071 + }, + { + "epoch": 1.52, + "grad_norm": 0.6195458173751831, + "learning_rate": 0.0002928491468688621, + "loss": 3.0775, + "step": 31072 + }, + { + "epoch": 1.52, + "grad_norm": 0.6083816289901733, + "learning_rate": 0.00029283375506227073, + "loss": 3.0681, + "step": 31073 + }, + { + "epoch": 1.52, + "grad_norm": 0.567173182964325, + "learning_rate": 0.0002928183632745537, + "loss": 2.9375, + "step": 31074 + }, + { + "epoch": 1.52, + "grad_norm": 0.5811203122138977, + "learning_rate": 0.00029280297150575187, + "loss": 2.9456, + "step": 31075 + }, + { + "epoch": 1.52, + "grad_norm": 0.5663540363311768, + "learning_rate": 0.0002927875797559056, + "loss": 2.8892, + "step": 31076 + }, + { + "epoch": 1.52, + "grad_norm": 0.5827478766441345, + "learning_rate": 0.00029277218802505544, + "loss": 2.8407, + "step": 31077 + }, + { + "epoch": 1.52, + "grad_norm": 0.6202985048294067, + "learning_rate": 0.0002927567963132419, + "loss": 3.1736, + "step": 31078 + }, + { + "epoch": 1.52, + "grad_norm": 0.601956307888031, + "learning_rate": 0.00029274140462050546, + "loss": 3.0033, + "step": 31079 + }, + { + "epoch": 1.52, + "grad_norm": 0.6013195514678955, + "learning_rate": 0.000292726012946887, + "loss": 2.8182, + "step": 31080 + }, + { + "epoch": 1.52, + "grad_norm": 0.5847362279891968, + "learning_rate": 0.00029271062129242664, + "loss": 3.0162, + "step": 31081 + }, + { + "epoch": 1.52, + "grad_norm": 0.5564871430397034, + "learning_rate": 0.0002926952296571653, + "loss": 3.1941, + "step": 31082 + }, + { + "epoch": 1.52, + "grad_norm": 0.5841847062110901, + "learning_rate": 0.0002926798380411431, + "loss": 3.0432, + "step": 31083 + }, + { + "epoch": 1.52, + "grad_norm": 0.6471392512321472, + "learning_rate": 0.00029266444644440086, + "loss": 2.9476, + "step": 31084 + }, + { + "epoch": 1.52, + "grad_norm": 0.5753255486488342, + "learning_rate": 0.00029264905486697915, + "loss": 2.876, + "step": 31085 + }, + { + "epoch": 1.52, + "grad_norm": 0.6308822631835938, + "learning_rate": 0.0002926336633089183, + "loss": 2.907, + "step": 31086 + }, + { + "epoch": 1.52, + "grad_norm": 0.6044859886169434, + "learning_rate": 0.0002926182717702591, + "loss": 3.1926, + "step": 31087 + }, + { + "epoch": 1.52, + "grad_norm": 0.610822319984436, + "learning_rate": 0.0002926028802510418, + "loss": 2.9054, + "step": 31088 + }, + { + "epoch": 1.52, + "grad_norm": 0.6061519384384155, + "learning_rate": 0.00029258748875130714, + "loss": 3.0566, + "step": 31089 + }, + { + "epoch": 1.52, + "grad_norm": 0.6459476351737976, + "learning_rate": 0.00029257209727109564, + "loss": 2.9014, + "step": 31090 + }, + { + "epoch": 1.52, + "grad_norm": 0.6128569841384888, + "learning_rate": 0.0002925567058104478, + "loss": 2.6935, + "step": 31091 + }, + { + "epoch": 1.52, + "grad_norm": 0.6115437746047974, + "learning_rate": 0.0002925413143694042, + "loss": 3.3244, + "step": 31092 + }, + { + "epoch": 1.52, + "grad_norm": 0.6120768785476685, + "learning_rate": 0.00029252592294800526, + "loss": 2.8478, + "step": 31093 + }, + { + "epoch": 1.52, + "grad_norm": 0.5760208368301392, + "learning_rate": 0.00029251053154629163, + "loss": 2.9368, + "step": 31094 + }, + { + "epoch": 1.52, + "grad_norm": 0.5910771489143372, + "learning_rate": 0.0002924951401643039, + "loss": 2.9577, + "step": 31095 + }, + { + "epoch": 1.52, + "grad_norm": 0.6013192534446716, + "learning_rate": 0.00029247974880208235, + "loss": 3.07, + "step": 31096 + }, + { + "epoch": 1.52, + "grad_norm": 0.6419551372528076, + "learning_rate": 0.00029246435745966786, + "loss": 2.997, + "step": 31097 + }, + { + "epoch": 1.52, + "grad_norm": 0.6099100708961487, + "learning_rate": 0.00029244896613710077, + "loss": 3.0675, + "step": 31098 + }, + { + "epoch": 1.52, + "grad_norm": 0.5890765190124512, + "learning_rate": 0.00029243357483442153, + "loss": 2.9062, + "step": 31099 + }, + { + "epoch": 1.52, + "grad_norm": 0.6350886821746826, + "learning_rate": 0.00029241818355167095, + "loss": 3.1908, + "step": 31100 + }, + { + "epoch": 1.52, + "grad_norm": 0.6403512954711914, + "learning_rate": 0.00029240279228888926, + "loss": 3.0824, + "step": 31101 + }, + { + "epoch": 1.52, + "grad_norm": 0.5865958333015442, + "learning_rate": 0.00029238740104611734, + "loss": 3.1828, + "step": 31102 + }, + { + "epoch": 1.52, + "grad_norm": 0.6323754191398621, + "learning_rate": 0.0002923720098233954, + "loss": 3.1062, + "step": 31103 + }, + { + "epoch": 1.52, + "grad_norm": 0.6078318953514099, + "learning_rate": 0.0002923566186207641, + "loss": 3.2757, + "step": 31104 + }, + { + "epoch": 1.52, + "grad_norm": 0.5803014636039734, + "learning_rate": 0.00029234122743826413, + "loss": 3.2241, + "step": 31105 + }, + { + "epoch": 1.52, + "grad_norm": 0.5942589044570923, + "learning_rate": 0.00029232583627593583, + "loss": 3.378, + "step": 31106 + }, + { + "epoch": 1.52, + "grad_norm": 0.6468483209609985, + "learning_rate": 0.0002923104451338199, + "loss": 3.0655, + "step": 31107 + }, + { + "epoch": 1.52, + "grad_norm": 0.6610134243965149, + "learning_rate": 0.00029229505401195656, + "loss": 2.9371, + "step": 31108 + }, + { + "epoch": 1.52, + "grad_norm": 0.5979439616203308, + "learning_rate": 0.0002922796629103867, + "loss": 3.087, + "step": 31109 + }, + { + "epoch": 1.52, + "grad_norm": 0.5915126204490662, + "learning_rate": 0.00029226427182915075, + "loss": 3.043, + "step": 31110 + }, + { + "epoch": 1.52, + "grad_norm": 0.5519029498100281, + "learning_rate": 0.0002922488807682891, + "loss": 3.1751, + "step": 31111 + }, + { + "epoch": 1.52, + "grad_norm": 0.5895189642906189, + "learning_rate": 0.0002922334897278425, + "loss": 3.1257, + "step": 31112 + }, + { + "epoch": 1.52, + "grad_norm": 0.627914547920227, + "learning_rate": 0.00029221809870785137, + "loss": 3.2162, + "step": 31113 + }, + { + "epoch": 1.52, + "grad_norm": 0.5837568044662476, + "learning_rate": 0.0002922027077083562, + "loss": 3.0215, + "step": 31114 + }, + { + "epoch": 1.52, + "grad_norm": 0.5984426736831665, + "learning_rate": 0.0002921873167293977, + "loss": 2.9769, + "step": 31115 + }, + { + "epoch": 1.52, + "grad_norm": 0.600309431552887, + "learning_rate": 0.0002921719257710163, + "loss": 3.3203, + "step": 31116 + }, + { + "epoch": 1.52, + "grad_norm": 0.5865994095802307, + "learning_rate": 0.00029215653483325257, + "loss": 3.0811, + "step": 31117 + }, + { + "epoch": 1.53, + "grad_norm": 0.5973957777023315, + "learning_rate": 0.0002921411439161469, + "loss": 3.1921, + "step": 31118 + }, + { + "epoch": 1.53, + "grad_norm": 0.6090832352638245, + "learning_rate": 0.00029212575301974, + "loss": 3.0701, + "step": 31119 + }, + { + "epoch": 1.53, + "grad_norm": 0.5993704795837402, + "learning_rate": 0.00029211036214407243, + "loss": 2.9587, + "step": 31120 + }, + { + "epoch": 1.53, + "grad_norm": 0.5656693577766418, + "learning_rate": 0.0002920949712891845, + "loss": 2.8517, + "step": 31121 + }, + { + "epoch": 1.53, + "grad_norm": 0.6290675401687622, + "learning_rate": 0.000292079580455117, + "loss": 2.9327, + "step": 31122 + }, + { + "epoch": 1.53, + "grad_norm": 0.6300106048583984, + "learning_rate": 0.0002920641896419103, + "loss": 3.0199, + "step": 31123 + }, + { + "epoch": 1.53, + "grad_norm": 0.5924240946769714, + "learning_rate": 0.00029204879884960495, + "loss": 3.042, + "step": 31124 + }, + { + "epoch": 1.53, + "grad_norm": 0.6742143630981445, + "learning_rate": 0.0002920334080782417, + "loss": 3.1415, + "step": 31125 + }, + { + "epoch": 1.53, + "grad_norm": 0.6075363159179688, + "learning_rate": 0.0002920180173278608, + "loss": 3.1435, + "step": 31126 + }, + { + "epoch": 1.53, + "grad_norm": 0.6138061881065369, + "learning_rate": 0.00029200262659850293, + "loss": 2.8322, + "step": 31127 + }, + { + "epoch": 1.53, + "grad_norm": 0.6019613742828369, + "learning_rate": 0.0002919872358902086, + "loss": 2.9297, + "step": 31128 + }, + { + "epoch": 1.53, + "grad_norm": 0.584140956401825, + "learning_rate": 0.00029197184520301833, + "loss": 3.0311, + "step": 31129 + }, + { + "epoch": 1.53, + "grad_norm": 0.6052551865577698, + "learning_rate": 0.00029195645453697274, + "loss": 3.276, + "step": 31130 + }, + { + "epoch": 1.53, + "grad_norm": 0.5705941915512085, + "learning_rate": 0.00029194106389211224, + "loss": 3.1659, + "step": 31131 + }, + { + "epoch": 1.53, + "grad_norm": 0.6407089233398438, + "learning_rate": 0.0002919256732684775, + "loss": 2.7949, + "step": 31132 + }, + { + "epoch": 1.53, + "grad_norm": 0.6144587397575378, + "learning_rate": 0.0002919102826661089, + "loss": 3.1167, + "step": 31133 + }, + { + "epoch": 1.53, + "grad_norm": 0.5976653099060059, + "learning_rate": 0.00029189489208504706, + "loss": 2.9931, + "step": 31134 + }, + { + "epoch": 1.53, + "grad_norm": 0.6328117251396179, + "learning_rate": 0.0002918795015253326, + "loss": 3.169, + "step": 31135 + }, + { + "epoch": 1.53, + "grad_norm": 0.5923229455947876, + "learning_rate": 0.00029186411098700585, + "loss": 3.1009, + "step": 31136 + }, + { + "epoch": 1.53, + "grad_norm": 0.5932513475418091, + "learning_rate": 0.00029184872047010755, + "loss": 3.17, + "step": 31137 + }, + { + "epoch": 1.53, + "grad_norm": 0.6198069453239441, + "learning_rate": 0.00029183332997467814, + "loss": 2.9827, + "step": 31138 + }, + { + "epoch": 1.53, + "grad_norm": 0.6290854215621948, + "learning_rate": 0.0002918179395007581, + "loss": 2.9822, + "step": 31139 + }, + { + "epoch": 1.53, + "grad_norm": 0.58577960729599, + "learning_rate": 0.00029180254904838816, + "loss": 3.062, + "step": 31140 + }, + { + "epoch": 1.53, + "grad_norm": 0.5880772471427917, + "learning_rate": 0.0002917871586176086, + "loss": 3.0266, + "step": 31141 + }, + { + "epoch": 1.53, + "grad_norm": 0.6108989715576172, + "learning_rate": 0.0002917717682084602, + "loss": 3.0569, + "step": 31142 + }, + { + "epoch": 1.53, + "grad_norm": 0.5859941840171814, + "learning_rate": 0.00029175637782098327, + "loss": 3.1125, + "step": 31143 + }, + { + "epoch": 1.53, + "grad_norm": 0.5857529044151306, + "learning_rate": 0.0002917409874552184, + "loss": 3.0528, + "step": 31144 + }, + { + "epoch": 1.53, + "grad_norm": 0.6427134871482849, + "learning_rate": 0.0002917255971112064, + "loss": 3.147, + "step": 31145 + }, + { + "epoch": 1.53, + "grad_norm": 0.5859655737876892, + "learning_rate": 0.00029171020678898745, + "loss": 3.154, + "step": 31146 + }, + { + "epoch": 1.53, + "grad_norm": 0.5824823379516602, + "learning_rate": 0.0002916948164886023, + "loss": 3.174, + "step": 31147 + }, + { + "epoch": 1.53, + "grad_norm": 0.6048821210861206, + "learning_rate": 0.0002916794262100913, + "loss": 3.0062, + "step": 31148 + }, + { + "epoch": 1.53, + "grad_norm": 0.5758355855941772, + "learning_rate": 0.00029166403595349514, + "loss": 2.8761, + "step": 31149 + }, + { + "epoch": 1.53, + "grad_norm": 0.5995056629180908, + "learning_rate": 0.00029164864571885435, + "loss": 2.9244, + "step": 31150 + }, + { + "epoch": 1.53, + "grad_norm": 0.6515876650810242, + "learning_rate": 0.00029163325550620925, + "loss": 2.7845, + "step": 31151 + }, + { + "epoch": 1.53, + "grad_norm": 0.5933794975280762, + "learning_rate": 0.00029161786531560073, + "loss": 3.0369, + "step": 31152 + }, + { + "epoch": 1.53, + "grad_norm": 0.6130641102790833, + "learning_rate": 0.0002916024751470691, + "loss": 2.8529, + "step": 31153 + }, + { + "epoch": 1.53, + "grad_norm": 0.6391645669937134, + "learning_rate": 0.00029158708500065495, + "loss": 3.2121, + "step": 31154 + }, + { + "epoch": 1.53, + "grad_norm": 0.6190594434738159, + "learning_rate": 0.0002915716948763987, + "loss": 2.8578, + "step": 31155 + }, + { + "epoch": 1.53, + "grad_norm": 0.6114823818206787, + "learning_rate": 0.000291556304774341, + "loss": 3.2051, + "step": 31156 + }, + { + "epoch": 1.53, + "grad_norm": 0.5992326736450195, + "learning_rate": 0.0002915409146945225, + "loss": 2.9415, + "step": 31157 + }, + { + "epoch": 1.53, + "grad_norm": 0.6097759008407593, + "learning_rate": 0.0002915255246369835, + "loss": 2.8587, + "step": 31158 + }, + { + "epoch": 1.53, + "grad_norm": 0.5706651210784912, + "learning_rate": 0.00029151013460176466, + "loss": 3.1365, + "step": 31159 + }, + { + "epoch": 1.53, + "grad_norm": 0.5817975997924805, + "learning_rate": 0.0002914947445889065, + "loss": 2.8617, + "step": 31160 + }, + { + "epoch": 1.53, + "grad_norm": 0.5653377771377563, + "learning_rate": 0.00029147935459844943, + "loss": 2.9423, + "step": 31161 + }, + { + "epoch": 1.53, + "grad_norm": 0.6266032457351685, + "learning_rate": 0.0002914639646304343, + "loss": 2.8249, + "step": 31162 + }, + { + "epoch": 1.53, + "grad_norm": 0.6154088973999023, + "learning_rate": 0.00029144857468490126, + "loss": 3.3106, + "step": 31163 + }, + { + "epoch": 1.53, + "grad_norm": 0.5928178429603577, + "learning_rate": 0.00029143318476189113, + "loss": 3.1093, + "step": 31164 + }, + { + "epoch": 1.53, + "grad_norm": 0.6203148365020752, + "learning_rate": 0.00029141779486144433, + "loss": 3.0661, + "step": 31165 + }, + { + "epoch": 1.53, + "grad_norm": 0.5982689261436462, + "learning_rate": 0.0002914024049836013, + "loss": 2.9455, + "step": 31166 + }, + { + "epoch": 1.53, + "grad_norm": 0.5757526755332947, + "learning_rate": 0.00029138701512840286, + "loss": 3.1113, + "step": 31167 + }, + { + "epoch": 1.53, + "grad_norm": 0.5914579033851624, + "learning_rate": 0.00029137162529588924, + "loss": 2.9128, + "step": 31168 + }, + { + "epoch": 1.53, + "grad_norm": 0.6134317517280579, + "learning_rate": 0.00029135623548610126, + "loss": 2.9946, + "step": 31169 + }, + { + "epoch": 1.53, + "grad_norm": 0.5648379921913147, + "learning_rate": 0.0002913408456990791, + "loss": 3.4179, + "step": 31170 + }, + { + "epoch": 1.53, + "grad_norm": 0.6595600247383118, + "learning_rate": 0.00029132545593486353, + "loss": 3.0651, + "step": 31171 + }, + { + "epoch": 1.53, + "grad_norm": 0.5977067351341248, + "learning_rate": 0.00029131006619349516, + "loss": 2.9647, + "step": 31172 + }, + { + "epoch": 1.53, + "grad_norm": 0.5756570100784302, + "learning_rate": 0.00029129467647501424, + "loss": 3.0136, + "step": 31173 + }, + { + "epoch": 1.53, + "grad_norm": 0.5817214250564575, + "learning_rate": 0.0002912792867794616, + "loss": 3.013, + "step": 31174 + }, + { + "epoch": 1.53, + "grad_norm": 0.628970742225647, + "learning_rate": 0.00029126389710687755, + "loss": 2.8144, + "step": 31175 + }, + { + "epoch": 1.53, + "grad_norm": 0.5823667049407959, + "learning_rate": 0.00029124850745730266, + "loss": 3.2412, + "step": 31176 + }, + { + "epoch": 1.53, + "grad_norm": 0.6125685572624207, + "learning_rate": 0.0002912331178307777, + "loss": 3.0372, + "step": 31177 + }, + { + "epoch": 1.53, + "grad_norm": 0.6078442335128784, + "learning_rate": 0.0002912177282273429, + "loss": 3.1155, + "step": 31178 + }, + { + "epoch": 1.53, + "grad_norm": 0.5867834091186523, + "learning_rate": 0.000291202338647039, + "loss": 2.7659, + "step": 31179 + }, + { + "epoch": 1.53, + "grad_norm": 0.605891227722168, + "learning_rate": 0.0002911869490899063, + "loss": 3.1373, + "step": 31180 + }, + { + "epoch": 1.53, + "grad_norm": 0.6134265065193176, + "learning_rate": 0.0002911715595559855, + "loss": 3.0259, + "step": 31181 + }, + { + "epoch": 1.53, + "grad_norm": 0.6155909299850464, + "learning_rate": 0.0002911561700453173, + "loss": 3.0505, + "step": 31182 + }, + { + "epoch": 1.53, + "grad_norm": 0.6217151284217834, + "learning_rate": 0.00029114078055794186, + "loss": 3.1017, + "step": 31183 + }, + { + "epoch": 1.53, + "grad_norm": 0.6171595454216003, + "learning_rate": 0.00029112539109390004, + "loss": 3.1822, + "step": 31184 + }, + { + "epoch": 1.53, + "grad_norm": 0.5772479176521301, + "learning_rate": 0.00029111000165323205, + "loss": 2.9001, + "step": 31185 + }, + { + "epoch": 1.53, + "grad_norm": 0.5680612921714783, + "learning_rate": 0.0002910946122359787, + "loss": 3.0623, + "step": 31186 + }, + { + "epoch": 1.53, + "grad_norm": 0.5654654502868652, + "learning_rate": 0.0002910792228421805, + "loss": 3.1616, + "step": 31187 + }, + { + "epoch": 1.53, + "grad_norm": 0.5805103182792664, + "learning_rate": 0.00029106383347187777, + "loss": 3.1398, + "step": 31188 + }, + { + "epoch": 1.53, + "grad_norm": 0.5948848724365234, + "learning_rate": 0.0002910484441251113, + "loss": 2.8847, + "step": 31189 + }, + { + "epoch": 1.53, + "grad_norm": 0.6186020970344543, + "learning_rate": 0.00029103305480192147, + "loss": 3.1687, + "step": 31190 + }, + { + "epoch": 1.53, + "grad_norm": 0.5708625316619873, + "learning_rate": 0.0002910176655023488, + "loss": 3.048, + "step": 31191 + }, + { + "epoch": 1.53, + "grad_norm": 0.5748517513275146, + "learning_rate": 0.00029100227622643393, + "loss": 3.2699, + "step": 31192 + }, + { + "epoch": 1.53, + "grad_norm": 0.6108618378639221, + "learning_rate": 0.0002909868869742173, + "loss": 3.1266, + "step": 31193 + }, + { + "epoch": 1.53, + "grad_norm": 0.5868430733680725, + "learning_rate": 0.0002909714977457396, + "loss": 2.9538, + "step": 31194 + }, + { + "epoch": 1.53, + "grad_norm": 0.5879116058349609, + "learning_rate": 0.00029095610854104105, + "loss": 3.023, + "step": 31195 + }, + { + "epoch": 1.53, + "grad_norm": 0.6066574454307556, + "learning_rate": 0.0002909407193601624, + "loss": 3.202, + "step": 31196 + }, + { + "epoch": 1.53, + "grad_norm": 0.5830428004264832, + "learning_rate": 0.00029092533020314427, + "loss": 3.1226, + "step": 31197 + }, + { + "epoch": 1.53, + "grad_norm": 0.6408023238182068, + "learning_rate": 0.0002909099410700269, + "loss": 3.0443, + "step": 31198 + }, + { + "epoch": 1.53, + "grad_norm": 0.6012759804725647, + "learning_rate": 0.0002908945519608512, + "loss": 3.0593, + "step": 31199 + }, + { + "epoch": 1.53, + "grad_norm": 0.6043107509613037, + "learning_rate": 0.00029087916287565737, + "loss": 3.059, + "step": 31200 + }, + { + "epoch": 1.53, + "grad_norm": 0.590762197971344, + "learning_rate": 0.00029086377381448605, + "loss": 3.1015, + "step": 31201 + }, + { + "epoch": 1.53, + "grad_norm": 0.5998045206069946, + "learning_rate": 0.00029084838477737784, + "loss": 2.8935, + "step": 31202 + }, + { + "epoch": 1.53, + "grad_norm": 0.6675084233283997, + "learning_rate": 0.0002908329957643732, + "loss": 3.0359, + "step": 31203 + }, + { + "epoch": 1.53, + "grad_norm": 0.5763409733772278, + "learning_rate": 0.0002908176067755127, + "loss": 2.9924, + "step": 31204 + }, + { + "epoch": 1.53, + "grad_norm": 0.5802990794181824, + "learning_rate": 0.0002908022178108369, + "loss": 3.0601, + "step": 31205 + }, + { + "epoch": 1.53, + "grad_norm": 0.592325747013092, + "learning_rate": 0.0002907868288703862, + "loss": 3.0388, + "step": 31206 + }, + { + "epoch": 1.53, + "grad_norm": 0.5971692800521851, + "learning_rate": 0.0002907714399542013, + "loss": 2.8794, + "step": 31207 + }, + { + "epoch": 1.53, + "grad_norm": 0.6179129481315613, + "learning_rate": 0.00029075605106232264, + "loss": 2.9758, + "step": 31208 + }, + { + "epoch": 1.53, + "grad_norm": 0.648285984992981, + "learning_rate": 0.0002907406621947908, + "loss": 2.772, + "step": 31209 + }, + { + "epoch": 1.53, + "grad_norm": 0.6502874493598938, + "learning_rate": 0.00029072527335164615, + "loss": 2.8967, + "step": 31210 + }, + { + "epoch": 1.53, + "grad_norm": 0.6211252212524414, + "learning_rate": 0.0002907098845329294, + "loss": 2.8207, + "step": 31211 + }, + { + "epoch": 1.53, + "grad_norm": 0.6525001525878906, + "learning_rate": 0.0002906944957386811, + "loss": 2.9625, + "step": 31212 + }, + { + "epoch": 1.53, + "grad_norm": 0.6617982983589172, + "learning_rate": 0.0002906791069689416, + "loss": 3.0828, + "step": 31213 + }, + { + "epoch": 1.53, + "grad_norm": 0.6012643575668335, + "learning_rate": 0.0002906637182237516, + "loss": 2.9603, + "step": 31214 + }, + { + "epoch": 1.53, + "grad_norm": 0.5715975165367126, + "learning_rate": 0.00029064832950315156, + "loss": 3.043, + "step": 31215 + }, + { + "epoch": 1.53, + "grad_norm": 0.6091911196708679, + "learning_rate": 0.0002906329408071819, + "loss": 2.9976, + "step": 31216 + }, + { + "epoch": 1.53, + "grad_norm": 0.6170886158943176, + "learning_rate": 0.00029061755213588347, + "loss": 2.9807, + "step": 31217 + }, + { + "epoch": 1.53, + "grad_norm": 0.5898032188415527, + "learning_rate": 0.00029060216348929653, + "loss": 3.1002, + "step": 31218 + }, + { + "epoch": 1.53, + "grad_norm": 0.6269931197166443, + "learning_rate": 0.0002905867748674617, + "loss": 3.2008, + "step": 31219 + }, + { + "epoch": 1.53, + "grad_norm": 0.6010162830352783, + "learning_rate": 0.0002905713862704194, + "loss": 2.7986, + "step": 31220 + }, + { + "epoch": 1.53, + "grad_norm": 0.664634644985199, + "learning_rate": 0.00029055599769821036, + "loss": 3.1664, + "step": 31221 + }, + { + "epoch": 1.53, + "grad_norm": 0.5801214575767517, + "learning_rate": 0.00029054060915087497, + "loss": 2.9423, + "step": 31222 + }, + { + "epoch": 1.53, + "grad_norm": 0.5644407272338867, + "learning_rate": 0.00029052522062845374, + "loss": 3.0543, + "step": 31223 + }, + { + "epoch": 1.53, + "grad_norm": 0.5935441255569458, + "learning_rate": 0.0002905098321309873, + "loss": 3.0877, + "step": 31224 + }, + { + "epoch": 1.53, + "grad_norm": 0.6010169982910156, + "learning_rate": 0.0002904944436585161, + "loss": 3.1078, + "step": 31225 + }, + { + "epoch": 1.53, + "grad_norm": 0.6892803311347961, + "learning_rate": 0.0002904790552110808, + "loss": 3.0326, + "step": 31226 + }, + { + "epoch": 1.53, + "grad_norm": 0.5705881118774414, + "learning_rate": 0.00029046366678872176, + "loss": 2.9728, + "step": 31227 + }, + { + "epoch": 1.53, + "grad_norm": 0.6575391292572021, + "learning_rate": 0.00029044827839147953, + "loss": 3.0386, + "step": 31228 + }, + { + "epoch": 1.53, + "grad_norm": 0.5752348303794861, + "learning_rate": 0.00029043289001939485, + "loss": 3.0842, + "step": 31229 + }, + { + "epoch": 1.53, + "grad_norm": 0.5966557860374451, + "learning_rate": 0.000290417501672508, + "loss": 3.0958, + "step": 31230 + }, + { + "epoch": 1.53, + "grad_norm": 0.6041060090065002, + "learning_rate": 0.00029040211335085956, + "loss": 2.974, + "step": 31231 + }, + { + "epoch": 1.53, + "grad_norm": 0.590144693851471, + "learning_rate": 0.00029038672505449025, + "loss": 3.0159, + "step": 31232 + }, + { + "epoch": 1.53, + "grad_norm": 0.6653158068656921, + "learning_rate": 0.00029037133678344037, + "loss": 2.6909, + "step": 31233 + }, + { + "epoch": 1.53, + "grad_norm": 0.6356053948402405, + "learning_rate": 0.00029035594853775063, + "loss": 2.886, + "step": 31234 + }, + { + "epoch": 1.53, + "grad_norm": 0.616893470287323, + "learning_rate": 0.00029034056031746125, + "loss": 3.085, + "step": 31235 + }, + { + "epoch": 1.53, + "grad_norm": 0.5892876386642456, + "learning_rate": 0.0002903251721226132, + "loss": 3.0894, + "step": 31236 + }, + { + "epoch": 1.53, + "grad_norm": 0.5598612427711487, + "learning_rate": 0.0002903097839532467, + "loss": 3.0063, + "step": 31237 + }, + { + "epoch": 1.53, + "grad_norm": 0.6560896039009094, + "learning_rate": 0.00029029439580940226, + "loss": 3.0115, + "step": 31238 + }, + { + "epoch": 1.53, + "grad_norm": 0.5958468317985535, + "learning_rate": 0.00029027900769112064, + "loss": 3.0396, + "step": 31239 + }, + { + "epoch": 1.53, + "grad_norm": 0.5762074589729309, + "learning_rate": 0.00029026361959844224, + "loss": 3.3083, + "step": 31240 + }, + { + "epoch": 1.53, + "grad_norm": 0.5689447522163391, + "learning_rate": 0.0002902482315314076, + "loss": 3.2403, + "step": 31241 + }, + { + "epoch": 1.53, + "grad_norm": 0.5754348039627075, + "learning_rate": 0.0002902328434900571, + "loss": 3.07, + "step": 31242 + }, + { + "epoch": 1.53, + "grad_norm": 0.6313903331756592, + "learning_rate": 0.00029021745547443145, + "loss": 3.122, + "step": 31243 + }, + { + "epoch": 1.53, + "grad_norm": 0.6057738065719604, + "learning_rate": 0.00029020206748457135, + "loss": 3.025, + "step": 31244 + }, + { + "epoch": 1.53, + "grad_norm": 0.6703782677650452, + "learning_rate": 0.0002901866795205169, + "loss": 3.1737, + "step": 31245 + }, + { + "epoch": 1.53, + "grad_norm": 0.5860815048217773, + "learning_rate": 0.000290171291582309, + "loss": 3.0211, + "step": 31246 + }, + { + "epoch": 1.53, + "grad_norm": 0.5635547041893005, + "learning_rate": 0.00029015590366998785, + "loss": 3.2277, + "step": 31247 + }, + { + "epoch": 1.53, + "grad_norm": 0.598173201084137, + "learning_rate": 0.00029014051578359426, + "loss": 2.9847, + "step": 31248 + }, + { + "epoch": 1.53, + "grad_norm": 0.6276450157165527, + "learning_rate": 0.0002901251279231687, + "loss": 3.1291, + "step": 31249 + }, + { + "epoch": 1.53, + "grad_norm": 0.5984366536140442, + "learning_rate": 0.00029010974008875155, + "loss": 3.23, + "step": 31250 + }, + { + "epoch": 1.53, + "grad_norm": 0.5806760191917419, + "learning_rate": 0.00029009435228038354, + "loss": 2.9694, + "step": 31251 + }, + { + "epoch": 1.53, + "grad_norm": 0.6110997796058655, + "learning_rate": 0.00029007896449810506, + "loss": 3.054, + "step": 31252 + }, + { + "epoch": 1.53, + "grad_norm": 0.5997822880744934, + "learning_rate": 0.00029006357674195657, + "loss": 3.1742, + "step": 31253 + }, + { + "epoch": 1.53, + "grad_norm": 0.6100072860717773, + "learning_rate": 0.00029004818901197886, + "loss": 2.9635, + "step": 31254 + }, + { + "epoch": 1.53, + "grad_norm": 0.5825597047805786, + "learning_rate": 0.00029003280130821224, + "loss": 3.119, + "step": 31255 + }, + { + "epoch": 1.53, + "grad_norm": 0.636887788772583, + "learning_rate": 0.00029001741363069735, + "loss": 3.1825, + "step": 31256 + }, + { + "epoch": 1.53, + "grad_norm": 0.5912345051765442, + "learning_rate": 0.00029000202597947457, + "loss": 3.078, + "step": 31257 + }, + { + "epoch": 1.53, + "grad_norm": 0.5856207609176636, + "learning_rate": 0.0002899866383545846, + "loss": 3.2083, + "step": 31258 + }, + { + "epoch": 1.53, + "grad_norm": 0.6235297322273254, + "learning_rate": 0.0002899712507560679, + "loss": 3.106, + "step": 31259 + }, + { + "epoch": 1.53, + "grad_norm": 0.6140345335006714, + "learning_rate": 0.00028995586318396495, + "loss": 3.2502, + "step": 31260 + }, + { + "epoch": 1.53, + "grad_norm": 0.5801708698272705, + "learning_rate": 0.0002899404756383164, + "loss": 3.1254, + "step": 31261 + }, + { + "epoch": 1.53, + "grad_norm": 0.5877318382263184, + "learning_rate": 0.0002899250881191627, + "loss": 2.9534, + "step": 31262 + }, + { + "epoch": 1.53, + "grad_norm": 0.5814765095710754, + "learning_rate": 0.0002899097006265442, + "loss": 2.9297, + "step": 31263 + }, + { + "epoch": 1.53, + "grad_norm": 0.6114090085029602, + "learning_rate": 0.00028989431316050183, + "loss": 2.9979, + "step": 31264 + }, + { + "epoch": 1.53, + "grad_norm": 0.601153552532196, + "learning_rate": 0.00028987892572107573, + "loss": 2.9141, + "step": 31265 + }, + { + "epoch": 1.53, + "grad_norm": 0.6009148955345154, + "learning_rate": 0.00028986353830830674, + "loss": 2.9535, + "step": 31266 + }, + { + "epoch": 1.53, + "grad_norm": 0.5960407257080078, + "learning_rate": 0.0002898481509222352, + "loss": 3.208, + "step": 31267 + }, + { + "epoch": 1.53, + "grad_norm": 0.6321334838867188, + "learning_rate": 0.00028983276356290156, + "loss": 2.9113, + "step": 31268 + }, + { + "epoch": 1.53, + "grad_norm": 0.6519079804420471, + "learning_rate": 0.00028981737623034657, + "loss": 2.9775, + "step": 31269 + }, + { + "epoch": 1.53, + "grad_norm": 0.652167797088623, + "learning_rate": 0.0002898019889246106, + "loss": 3.239, + "step": 31270 + }, + { + "epoch": 1.53, + "grad_norm": 0.5852517485618591, + "learning_rate": 0.00028978660164573437, + "loss": 2.943, + "step": 31271 + }, + { + "epoch": 1.53, + "grad_norm": 0.5924550294876099, + "learning_rate": 0.0002897712143937581, + "loss": 3.0424, + "step": 31272 + }, + { + "epoch": 1.53, + "grad_norm": 0.5765801072120667, + "learning_rate": 0.0002897558271687225, + "loss": 3.1681, + "step": 31273 + }, + { + "epoch": 1.53, + "grad_norm": 0.6032983660697937, + "learning_rate": 0.0002897404399706682, + "loss": 3.0438, + "step": 31274 + }, + { + "epoch": 1.53, + "grad_norm": 0.6318213939666748, + "learning_rate": 0.0002897250527996355, + "loss": 3.053, + "step": 31275 + }, + { + "epoch": 1.53, + "grad_norm": 0.6157848238945007, + "learning_rate": 0.0002897096656556651, + "loss": 2.8284, + "step": 31276 + }, + { + "epoch": 1.53, + "grad_norm": 0.6649987697601318, + "learning_rate": 0.0002896942785387974, + "loss": 3.1921, + "step": 31277 + }, + { + "epoch": 1.53, + "grad_norm": 0.5770165920257568, + "learning_rate": 0.0002896788914490729, + "loss": 3.0402, + "step": 31278 + }, + { + "epoch": 1.53, + "grad_norm": 0.5739706158638, + "learning_rate": 0.0002896635043865324, + "loss": 3.0012, + "step": 31279 + }, + { + "epoch": 1.53, + "grad_norm": 0.6319332122802734, + "learning_rate": 0.00028964811735121614, + "loss": 3.2183, + "step": 31280 + }, + { + "epoch": 1.53, + "grad_norm": 0.6746277809143066, + "learning_rate": 0.00028963273034316486, + "loss": 3.0346, + "step": 31281 + }, + { + "epoch": 1.53, + "grad_norm": 0.5536617636680603, + "learning_rate": 0.0002896173433624188, + "loss": 3.1045, + "step": 31282 + }, + { + "epoch": 1.53, + "grad_norm": 0.6022813320159912, + "learning_rate": 0.0002896019564090187, + "loss": 2.8612, + "step": 31283 + }, + { + "epoch": 1.53, + "grad_norm": 0.6644525527954102, + "learning_rate": 0.0002895865694830052, + "loss": 2.758, + "step": 31284 + }, + { + "epoch": 1.53, + "grad_norm": 0.6138014197349548, + "learning_rate": 0.0002895711825844185, + "loss": 3.1694, + "step": 31285 + }, + { + "epoch": 1.53, + "grad_norm": 0.5913397669792175, + "learning_rate": 0.0002895557957132994, + "loss": 2.8191, + "step": 31286 + }, + { + "epoch": 1.53, + "grad_norm": 0.5768195390701294, + "learning_rate": 0.0002895404088696882, + "loss": 2.8873, + "step": 31287 + }, + { + "epoch": 1.53, + "grad_norm": 0.6131641268730164, + "learning_rate": 0.0002895250220536256, + "loss": 3.1928, + "step": 31288 + }, + { + "epoch": 1.53, + "grad_norm": 0.5838510394096375, + "learning_rate": 0.00028950963526515214, + "loss": 3.0279, + "step": 31289 + }, + { + "epoch": 1.53, + "grad_norm": 0.6200519800186157, + "learning_rate": 0.00028949424850430817, + "loss": 3.0139, + "step": 31290 + }, + { + "epoch": 1.53, + "grad_norm": 0.60508131980896, + "learning_rate": 0.00028947886177113446, + "loss": 3.1849, + "step": 31291 + }, + { + "epoch": 1.53, + "grad_norm": 0.6125930547714233, + "learning_rate": 0.0002894634750656714, + "loss": 3.0983, + "step": 31292 + }, + { + "epoch": 1.53, + "grad_norm": 0.5791587829589844, + "learning_rate": 0.0002894480883879593, + "loss": 3.1379, + "step": 31293 + }, + { + "epoch": 1.53, + "grad_norm": 0.6140015721321106, + "learning_rate": 0.0002894327017380392, + "loss": 3.2646, + "step": 31294 + }, + { + "epoch": 1.53, + "grad_norm": 0.7361003756523132, + "learning_rate": 0.00028941731511595117, + "loss": 3.1597, + "step": 31295 + }, + { + "epoch": 1.53, + "grad_norm": 0.5974676012992859, + "learning_rate": 0.00028940192852173596, + "loss": 3.1636, + "step": 31296 + }, + { + "epoch": 1.53, + "grad_norm": 0.6096514463424683, + "learning_rate": 0.00028938654195543396, + "loss": 2.9766, + "step": 31297 + }, + { + "epoch": 1.53, + "grad_norm": 0.6190111637115479, + "learning_rate": 0.0002893711554170858, + "loss": 2.8579, + "step": 31298 + }, + { + "epoch": 1.53, + "grad_norm": 0.6046838164329529, + "learning_rate": 0.00028935576890673205, + "loss": 3.1151, + "step": 31299 + }, + { + "epoch": 1.53, + "grad_norm": 0.6145317554473877, + "learning_rate": 0.000289340382424413, + "loss": 3.1992, + "step": 31300 + }, + { + "epoch": 1.53, + "grad_norm": 0.6103838086128235, + "learning_rate": 0.00028932499597016944, + "loss": 3.0945, + "step": 31301 + }, + { + "epoch": 1.53, + "grad_norm": 0.600480854511261, + "learning_rate": 0.0002893096095440418, + "loss": 2.9785, + "step": 31302 + }, + { + "epoch": 1.53, + "grad_norm": 0.5839743614196777, + "learning_rate": 0.0002892942231460704, + "loss": 3.1328, + "step": 31303 + }, + { + "epoch": 1.53, + "grad_norm": 0.6166554689407349, + "learning_rate": 0.0002892788367762962, + "loss": 3.0722, + "step": 31304 + }, + { + "epoch": 1.53, + "grad_norm": 0.6772105693817139, + "learning_rate": 0.0002892634504347593, + "loss": 2.8744, + "step": 31305 + }, + { + "epoch": 1.53, + "grad_norm": 0.6089929938316345, + "learning_rate": 0.00028924806412150057, + "loss": 3.1004, + "step": 31306 + }, + { + "epoch": 1.53, + "grad_norm": 0.5962395668029785, + "learning_rate": 0.00028923267783656026, + "loss": 2.9843, + "step": 31307 + }, + { + "epoch": 1.53, + "grad_norm": 0.6941888928413391, + "learning_rate": 0.00028921729157997894, + "loss": 3.0849, + "step": 31308 + }, + { + "epoch": 1.53, + "grad_norm": 0.607742965221405, + "learning_rate": 0.0002892019053517974, + "loss": 3.2729, + "step": 31309 + }, + { + "epoch": 1.53, + "grad_norm": 0.5975240468978882, + "learning_rate": 0.00028918651915205585, + "loss": 3.1408, + "step": 31310 + }, + { + "epoch": 1.53, + "grad_norm": 0.5710057616233826, + "learning_rate": 0.000289171132980795, + "loss": 3.3382, + "step": 31311 + }, + { + "epoch": 1.53, + "grad_norm": 0.6463126540184021, + "learning_rate": 0.00028915574683805514, + "loss": 2.967, + "step": 31312 + }, + { + "epoch": 1.53, + "grad_norm": 0.7494412064552307, + "learning_rate": 0.00028914036072387716, + "loss": 2.9294, + "step": 31313 + }, + { + "epoch": 1.53, + "grad_norm": 0.5747896432876587, + "learning_rate": 0.00028912497463830125, + "loss": 3.2201, + "step": 31314 + }, + { + "epoch": 1.53, + "grad_norm": 0.5977765917778015, + "learning_rate": 0.00028910958858136796, + "loss": 3.0647, + "step": 31315 + }, + { + "epoch": 1.53, + "grad_norm": 0.6861127614974976, + "learning_rate": 0.00028909420255311815, + "loss": 3.0568, + "step": 31316 + }, + { + "epoch": 1.53, + "grad_norm": 0.6028975248336792, + "learning_rate": 0.00028907881655359195, + "loss": 3.0533, + "step": 31317 + }, + { + "epoch": 1.53, + "grad_norm": 0.5990371108055115, + "learning_rate": 0.0002890634305828302, + "loss": 3.0781, + "step": 31318 + }, + { + "epoch": 1.53, + "grad_norm": 0.6500790119171143, + "learning_rate": 0.0002890480446408731, + "loss": 2.9856, + "step": 31319 + }, + { + "epoch": 1.53, + "grad_norm": 0.6449432969093323, + "learning_rate": 0.0002890326587277614, + "loss": 2.9427, + "step": 31320 + }, + { + "epoch": 1.53, + "grad_norm": 0.6211156845092773, + "learning_rate": 0.0002890172728435356, + "loss": 2.9741, + "step": 31321 + }, + { + "epoch": 1.54, + "grad_norm": 0.6242359280586243, + "learning_rate": 0.0002890018869882361, + "loss": 3.1796, + "step": 31322 + }, + { + "epoch": 1.54, + "grad_norm": 0.6206004619598389, + "learning_rate": 0.00028898650116190367, + "loss": 3.1543, + "step": 31323 + }, + { + "epoch": 1.54, + "grad_norm": 0.5952155590057373, + "learning_rate": 0.0002889711153645786, + "loss": 2.88, + "step": 31324 + }, + { + "epoch": 1.54, + "grad_norm": 0.6566175222396851, + "learning_rate": 0.00028895572959630137, + "loss": 3.3069, + "step": 31325 + }, + { + "epoch": 1.54, + "grad_norm": 0.5943531394004822, + "learning_rate": 0.00028894034385711277, + "loss": 3.0009, + "step": 31326 + }, + { + "epoch": 1.54, + "grad_norm": 0.6611933708190918, + "learning_rate": 0.0002889249581470531, + "loss": 3.1812, + "step": 31327 + }, + { + "epoch": 1.54, + "grad_norm": 0.6006651520729065, + "learning_rate": 0.000288909572466163, + "loss": 3.2205, + "step": 31328 + }, + { + "epoch": 1.54, + "grad_norm": 0.6034921407699585, + "learning_rate": 0.000288894186814483, + "loss": 3.079, + "step": 31329 + }, + { + "epoch": 1.54, + "grad_norm": 0.5508182644844055, + "learning_rate": 0.0002888788011920534, + "loss": 2.9034, + "step": 31330 + }, + { + "epoch": 1.54, + "grad_norm": 0.596584141254425, + "learning_rate": 0.0002888634155989151, + "loss": 3.1913, + "step": 31331 + }, + { + "epoch": 1.54, + "grad_norm": 0.5692295432090759, + "learning_rate": 0.0002888480300351083, + "loss": 3.1871, + "step": 31332 + }, + { + "epoch": 1.54, + "grad_norm": 0.6114552617073059, + "learning_rate": 0.0002888326445006738, + "loss": 2.9601, + "step": 31333 + }, + { + "epoch": 1.54, + "grad_norm": 0.6093324422836304, + "learning_rate": 0.00028881725899565177, + "loss": 3.0075, + "step": 31334 + }, + { + "epoch": 1.54, + "grad_norm": 0.608741044998169, + "learning_rate": 0.00028880187352008297, + "loss": 3.0645, + "step": 31335 + }, + { + "epoch": 1.54, + "grad_norm": 0.598630428314209, + "learning_rate": 0.00028878648807400794, + "loss": 3.1638, + "step": 31336 + }, + { + "epoch": 1.54, + "grad_norm": 0.6114171743392944, + "learning_rate": 0.00028877110265746706, + "loss": 2.9811, + "step": 31337 + }, + { + "epoch": 1.54, + "grad_norm": 0.6134542226791382, + "learning_rate": 0.0002887557172705011, + "loss": 3.0926, + "step": 31338 + }, + { + "epoch": 1.54, + "grad_norm": 0.61723792552948, + "learning_rate": 0.00028874033191315023, + "loss": 3.0028, + "step": 31339 + }, + { + "epoch": 1.54, + "grad_norm": 0.5689911842346191, + "learning_rate": 0.00028872494658545517, + "loss": 3.1265, + "step": 31340 + }, + { + "epoch": 1.54, + "grad_norm": 0.6355405449867249, + "learning_rate": 0.00028870956128745657, + "loss": 3.0033, + "step": 31341 + }, + { + "epoch": 1.54, + "grad_norm": 0.6212851405143738, + "learning_rate": 0.00028869417601919475, + "loss": 3.1319, + "step": 31342 + }, + { + "epoch": 1.54, + "grad_norm": 0.600463330745697, + "learning_rate": 0.00028867879078071034, + "loss": 3.0882, + "step": 31343 + }, + { + "epoch": 1.54, + "grad_norm": 0.6733449101448059, + "learning_rate": 0.0002886634055720437, + "loss": 2.8782, + "step": 31344 + }, + { + "epoch": 1.54, + "grad_norm": 0.6149190068244934, + "learning_rate": 0.00028864802039323546, + "loss": 3.1114, + "step": 31345 + }, + { + "epoch": 1.54, + "grad_norm": 0.6464491486549377, + "learning_rate": 0.0002886326352443263, + "loss": 2.9551, + "step": 31346 + }, + { + "epoch": 1.54, + "grad_norm": 0.5949641466140747, + "learning_rate": 0.0002886172501253564, + "loss": 3.2199, + "step": 31347 + }, + { + "epoch": 1.54, + "grad_norm": 0.6497898697853088, + "learning_rate": 0.0002886018650363667, + "loss": 2.9258, + "step": 31348 + }, + { + "epoch": 1.54, + "grad_norm": 0.6072144508361816, + "learning_rate": 0.0002885864799773973, + "loss": 2.9873, + "step": 31349 + }, + { + "epoch": 1.54, + "grad_norm": 0.5734956860542297, + "learning_rate": 0.00028857109494848896, + "loss": 2.9834, + "step": 31350 + }, + { + "epoch": 1.54, + "grad_norm": 0.6253546476364136, + "learning_rate": 0.00028855570994968223, + "loss": 2.9745, + "step": 31351 + }, + { + "epoch": 1.54, + "grad_norm": 0.5898392796516418, + "learning_rate": 0.00028854032498101746, + "loss": 2.8968, + "step": 31352 + }, + { + "epoch": 1.54, + "grad_norm": 0.6153894662857056, + "learning_rate": 0.00028852494004253537, + "loss": 2.9332, + "step": 31353 + }, + { + "epoch": 1.54, + "grad_norm": 0.6599181890487671, + "learning_rate": 0.00028850955513427633, + "loss": 3.1384, + "step": 31354 + }, + { + "epoch": 1.54, + "grad_norm": 0.5734416842460632, + "learning_rate": 0.00028849417025628084, + "loss": 3.0424, + "step": 31355 + }, + { + "epoch": 1.54, + "grad_norm": 0.5493830442428589, + "learning_rate": 0.0002884787854085896, + "loss": 3.0597, + "step": 31356 + }, + { + "epoch": 1.54, + "grad_norm": 0.6523565649986267, + "learning_rate": 0.000288463400591243, + "loss": 3.3818, + "step": 31357 + }, + { + "epoch": 1.54, + "grad_norm": 0.6114045977592468, + "learning_rate": 0.00028844801580428167, + "loss": 2.8229, + "step": 31358 + }, + { + "epoch": 1.54, + "grad_norm": 0.5856912136077881, + "learning_rate": 0.00028843263104774583, + "loss": 3.0191, + "step": 31359 + }, + { + "epoch": 1.54, + "grad_norm": 0.6086015105247498, + "learning_rate": 0.00028841724632167634, + "loss": 3.0111, + "step": 31360 + }, + { + "epoch": 1.54, + "grad_norm": 0.5889554619789124, + "learning_rate": 0.00028840186162611364, + "loss": 3.1478, + "step": 31361 + }, + { + "epoch": 1.54, + "grad_norm": 0.6440091133117676, + "learning_rate": 0.00028838647696109807, + "loss": 3.1964, + "step": 31362 + }, + { + "epoch": 1.54, + "grad_norm": 0.6442698240280151, + "learning_rate": 0.0002883710923266704, + "loss": 3.0283, + "step": 31363 + }, + { + "epoch": 1.54, + "grad_norm": 0.6014328002929688, + "learning_rate": 0.000288355707722871, + "loss": 3.1843, + "step": 31364 + }, + { + "epoch": 1.54, + "grad_norm": 0.5934261679649353, + "learning_rate": 0.0002883403231497403, + "loss": 2.9702, + "step": 31365 + }, + { + "epoch": 1.54, + "grad_norm": 0.6464361548423767, + "learning_rate": 0.00028832493860731916, + "loss": 3.0837, + "step": 31366 + }, + { + "epoch": 1.54, + "grad_norm": 0.5907967686653137, + "learning_rate": 0.0002883095540956477, + "loss": 2.9704, + "step": 31367 + }, + { + "epoch": 1.54, + "grad_norm": 0.5807782411575317, + "learning_rate": 0.0002882941696147668, + "loss": 3.0911, + "step": 31368 + }, + { + "epoch": 1.54, + "grad_norm": 0.5743398666381836, + "learning_rate": 0.0002882787851647167, + "loss": 2.9713, + "step": 31369 + }, + { + "epoch": 1.54, + "grad_norm": 0.5994200110435486, + "learning_rate": 0.00028826340074553797, + "loss": 2.9487, + "step": 31370 + }, + { + "epoch": 1.54, + "grad_norm": 0.6094851493835449, + "learning_rate": 0.0002882480163572713, + "loss": 2.9808, + "step": 31371 + }, + { + "epoch": 1.54, + "grad_norm": 0.6090477108955383, + "learning_rate": 0.00028823263199995706, + "loss": 3.0441, + "step": 31372 + }, + { + "epoch": 1.54, + "grad_norm": 0.5785759091377258, + "learning_rate": 0.00028821724767363584, + "loss": 3.0751, + "step": 31373 + }, + { + "epoch": 1.54, + "grad_norm": 0.5603683590888977, + "learning_rate": 0.000288201863378348, + "loss": 3.0937, + "step": 31374 + }, + { + "epoch": 1.54, + "grad_norm": 0.5782989263534546, + "learning_rate": 0.0002881864791141342, + "loss": 2.9813, + "step": 31375 + }, + { + "epoch": 1.54, + "grad_norm": 0.6115820407867432, + "learning_rate": 0.0002881710948810351, + "loss": 2.9611, + "step": 31376 + }, + { + "epoch": 1.54, + "grad_norm": 0.6622043251991272, + "learning_rate": 0.00028815571067909083, + "loss": 3.142, + "step": 31377 + }, + { + "epoch": 1.54, + "grad_norm": 0.6727275252342224, + "learning_rate": 0.0002881403265083423, + "loss": 3.1263, + "step": 31378 + }, + { + "epoch": 1.54, + "grad_norm": 0.6339889764785767, + "learning_rate": 0.0002881249423688299, + "loss": 3.0685, + "step": 31379 + }, + { + "epoch": 1.54, + "grad_norm": 0.6273970007896423, + "learning_rate": 0.000288109558260594, + "loss": 3.2247, + "step": 31380 + }, + { + "epoch": 1.54, + "grad_norm": 0.6286791563034058, + "learning_rate": 0.0002880941741836753, + "loss": 3.0735, + "step": 31381 + }, + { + "epoch": 1.54, + "grad_norm": 0.5971516370773315, + "learning_rate": 0.0002880787901381142, + "loss": 3.1196, + "step": 31382 + }, + { + "epoch": 1.54, + "grad_norm": 0.5588302612304688, + "learning_rate": 0.0002880634061239514, + "loss": 2.7368, + "step": 31383 + }, + { + "epoch": 1.54, + "grad_norm": 0.6372582912445068, + "learning_rate": 0.0002880480221412271, + "loss": 3.0035, + "step": 31384 + }, + { + "epoch": 1.54, + "grad_norm": 0.6087130904197693, + "learning_rate": 0.0002880326381899821, + "loss": 3.1687, + "step": 31385 + }, + { + "epoch": 1.54, + "grad_norm": 0.6057910919189453, + "learning_rate": 0.00028801725427025687, + "loss": 2.9205, + "step": 31386 + }, + { + "epoch": 1.54, + "grad_norm": 0.5742743015289307, + "learning_rate": 0.0002880018703820918, + "loss": 3.0471, + "step": 31387 + }, + { + "epoch": 1.54, + "grad_norm": 0.5914288759231567, + "learning_rate": 0.0002879864865255276, + "loss": 3.14, + "step": 31388 + }, + { + "epoch": 1.54, + "grad_norm": 0.5800380706787109, + "learning_rate": 0.00028797110270060457, + "loss": 2.8114, + "step": 31389 + }, + { + "epoch": 1.54, + "grad_norm": 0.5977968573570251, + "learning_rate": 0.0002879557189073635, + "loss": 3.0108, + "step": 31390 + }, + { + "epoch": 1.54, + "grad_norm": 0.6388533711433411, + "learning_rate": 0.0002879403351458446, + "loss": 2.8719, + "step": 31391 + }, + { + "epoch": 1.54, + "grad_norm": 0.5507797002792358, + "learning_rate": 0.0002879249514160885, + "loss": 3.0939, + "step": 31392 + }, + { + "epoch": 1.54, + "grad_norm": 0.5805448293685913, + "learning_rate": 0.0002879095677181359, + "loss": 2.9328, + "step": 31393 + }, + { + "epoch": 1.54, + "grad_norm": 0.5536087155342102, + "learning_rate": 0.0002878941840520271, + "loss": 3.1754, + "step": 31394 + }, + { + "epoch": 1.54, + "grad_norm": 0.5905557870864868, + "learning_rate": 0.00028787880041780277, + "loss": 3.0814, + "step": 31395 + }, + { + "epoch": 1.54, + "grad_norm": 0.6131752729415894, + "learning_rate": 0.0002878634168155032, + "loss": 2.9661, + "step": 31396 + }, + { + "epoch": 1.54, + "grad_norm": 0.6142122149467468, + "learning_rate": 0.0002878480332451691, + "loss": 3.0028, + "step": 31397 + }, + { + "epoch": 1.54, + "grad_norm": 0.6468976140022278, + "learning_rate": 0.0002878326497068411, + "loss": 2.9483, + "step": 31398 + }, + { + "epoch": 1.54, + "grad_norm": 0.5881075859069824, + "learning_rate": 0.0002878172662005593, + "loss": 2.9752, + "step": 31399 + }, + { + "epoch": 1.54, + "grad_norm": 0.5991531610488892, + "learning_rate": 0.0002878018827263647, + "loss": 2.8782, + "step": 31400 + }, + { + "epoch": 1.54, + "grad_norm": 0.589621365070343, + "learning_rate": 0.00028778649928429744, + "loss": 3.1235, + "step": 31401 + }, + { + "epoch": 1.54, + "grad_norm": 0.5604541301727295, + "learning_rate": 0.0002877711158743982, + "loss": 3.1901, + "step": 31402 + }, + { + "epoch": 1.54, + "grad_norm": 0.6096484661102295, + "learning_rate": 0.0002877557324967076, + "loss": 3.0457, + "step": 31403 + }, + { + "epoch": 1.54, + "grad_norm": 0.645102858543396, + "learning_rate": 0.000287740349151266, + "loss": 2.9014, + "step": 31404 + }, + { + "epoch": 1.54, + "grad_norm": 0.6197732090950012, + "learning_rate": 0.00028772496583811397, + "loss": 2.9379, + "step": 31405 + }, + { + "epoch": 1.54, + "grad_norm": 0.5692452788352966, + "learning_rate": 0.00028770958255729194, + "loss": 3.0261, + "step": 31406 + }, + { + "epoch": 1.54, + "grad_norm": 0.6242355704307556, + "learning_rate": 0.00028769419930884044, + "loss": 2.9257, + "step": 31407 + }, + { + "epoch": 1.54, + "grad_norm": 0.578525722026825, + "learning_rate": 0.0002876788160928003, + "loss": 3.1085, + "step": 31408 + }, + { + "epoch": 1.54, + "grad_norm": 0.6010854840278625, + "learning_rate": 0.0002876634329092116, + "loss": 3.0628, + "step": 31409 + }, + { + "epoch": 1.54, + "grad_norm": 0.5785937309265137, + "learning_rate": 0.00028764804975811516, + "loss": 2.8512, + "step": 31410 + }, + { + "epoch": 1.54, + "grad_norm": 0.6024236083030701, + "learning_rate": 0.00028763266663955125, + "loss": 2.9083, + "step": 31411 + }, + { + "epoch": 1.54, + "grad_norm": 0.5748933553695679, + "learning_rate": 0.00028761728355356056, + "loss": 2.9888, + "step": 31412 + }, + { + "epoch": 1.54, + "grad_norm": 0.5996086597442627, + "learning_rate": 0.00028760190050018366, + "loss": 3.0, + "step": 31413 + }, + { + "epoch": 1.54, + "grad_norm": 0.6583248376846313, + "learning_rate": 0.0002875865174794608, + "loss": 2.8246, + "step": 31414 + }, + { + "epoch": 1.54, + "grad_norm": 0.64971524477005, + "learning_rate": 0.00028757113449143283, + "loss": 2.9814, + "step": 31415 + }, + { + "epoch": 1.54, + "grad_norm": 0.6212257146835327, + "learning_rate": 0.00028755575153614, + "loss": 2.9499, + "step": 31416 + }, + { + "epoch": 1.54, + "grad_norm": 0.6192092895507812, + "learning_rate": 0.0002875403686136229, + "loss": 3.067, + "step": 31417 + }, + { + "epoch": 1.54, + "grad_norm": 0.588834285736084, + "learning_rate": 0.0002875249857239222, + "loss": 3.134, + "step": 31418 + }, + { + "epoch": 1.54, + "grad_norm": 0.5852153897285461, + "learning_rate": 0.0002875096028670782, + "loss": 3.1103, + "step": 31419 + }, + { + "epoch": 1.54, + "grad_norm": 0.6158280372619629, + "learning_rate": 0.00028749422004313157, + "loss": 2.9628, + "step": 31420 + }, + { + "epoch": 1.54, + "grad_norm": 0.6489527225494385, + "learning_rate": 0.0002874788372521226, + "loss": 3.1299, + "step": 31421 + }, + { + "epoch": 1.54, + "grad_norm": 0.5668579339981079, + "learning_rate": 0.00028746345449409206, + "loss": 2.8412, + "step": 31422 + }, + { + "epoch": 1.54, + "grad_norm": 0.6310492753982544, + "learning_rate": 0.0002874480717690804, + "loss": 3.1859, + "step": 31423 + }, + { + "epoch": 1.54, + "grad_norm": 0.5989445447921753, + "learning_rate": 0.00028743268907712805, + "loss": 3.0767, + "step": 31424 + }, + { + "epoch": 1.54, + "grad_norm": 0.5920668244361877, + "learning_rate": 0.0002874173064182756, + "loss": 2.8711, + "step": 31425 + }, + { + "epoch": 1.54, + "grad_norm": 0.5631740093231201, + "learning_rate": 0.00028740192379256356, + "loss": 3.1315, + "step": 31426 + }, + { + "epoch": 1.54, + "grad_norm": 0.613795816898346, + "learning_rate": 0.00028738654120003237, + "loss": 2.8111, + "step": 31427 + }, + { + "epoch": 1.54, + "grad_norm": 0.6500447988510132, + "learning_rate": 0.0002873711586407227, + "loss": 3.0035, + "step": 31428 + }, + { + "epoch": 1.54, + "grad_norm": 0.595923662185669, + "learning_rate": 0.00028735577611467485, + "loss": 3.0004, + "step": 31429 + }, + { + "epoch": 1.54, + "grad_norm": 0.5796763896942139, + "learning_rate": 0.00028734039362192956, + "loss": 3.0754, + "step": 31430 + }, + { + "epoch": 1.54, + "grad_norm": 0.6116151213645935, + "learning_rate": 0.0002873250111625272, + "loss": 3.1244, + "step": 31431 + }, + { + "epoch": 1.54, + "grad_norm": 0.6706094741821289, + "learning_rate": 0.00028730962873650825, + "loss": 3.3772, + "step": 31432 + }, + { + "epoch": 1.54, + "grad_norm": 0.5975269079208374, + "learning_rate": 0.0002872942463439134, + "loss": 3.1309, + "step": 31433 + }, + { + "epoch": 1.54, + "grad_norm": 0.5869545340538025, + "learning_rate": 0.000287278863984783, + "loss": 3.2301, + "step": 31434 + }, + { + "epoch": 1.54, + "grad_norm": 0.5768601298332214, + "learning_rate": 0.0002872634816591577, + "loss": 2.9476, + "step": 31435 + }, + { + "epoch": 1.54, + "grad_norm": 0.614723265171051, + "learning_rate": 0.0002872480993670778, + "loss": 3.1834, + "step": 31436 + }, + { + "epoch": 1.54, + "grad_norm": 0.5859989523887634, + "learning_rate": 0.00028723271710858405, + "loss": 3.0171, + "step": 31437 + }, + { + "epoch": 1.54, + "grad_norm": 0.6272194981575012, + "learning_rate": 0.0002872173348837169, + "loss": 3.0156, + "step": 31438 + }, + { + "epoch": 1.54, + "grad_norm": 0.6279314756393433, + "learning_rate": 0.0002872019526925167, + "loss": 2.7363, + "step": 31439 + }, + { + "epoch": 1.54, + "grad_norm": 0.5947081446647644, + "learning_rate": 0.00028718657053502427, + "loss": 3.1848, + "step": 31440 + }, + { + "epoch": 1.54, + "grad_norm": 0.619102954864502, + "learning_rate": 0.0002871711884112798, + "loss": 2.929, + "step": 31441 + }, + { + "epoch": 1.54, + "grad_norm": 0.6086522936820984, + "learning_rate": 0.00028715580632132393, + "loss": 3.2284, + "step": 31442 + }, + { + "epoch": 1.54, + "grad_norm": 0.6687970757484436, + "learning_rate": 0.0002871404242651973, + "loss": 2.9145, + "step": 31443 + }, + { + "epoch": 1.54, + "grad_norm": 0.5713397860527039, + "learning_rate": 0.0002871250422429403, + "loss": 2.8666, + "step": 31444 + }, + { + "epoch": 1.54, + "grad_norm": 0.5800654292106628, + "learning_rate": 0.00028710966025459353, + "loss": 3.0257, + "step": 31445 + }, + { + "epoch": 1.54, + "grad_norm": 0.6090438365936279, + "learning_rate": 0.00028709427830019723, + "loss": 2.9861, + "step": 31446 + }, + { + "epoch": 1.54, + "grad_norm": 0.5880951285362244, + "learning_rate": 0.00028707889637979224, + "loss": 3.0089, + "step": 31447 + }, + { + "epoch": 1.54, + "grad_norm": 0.6175966858863831, + "learning_rate": 0.000287063514493419, + "loss": 3.0327, + "step": 31448 + }, + { + "epoch": 1.54, + "grad_norm": 0.602347195148468, + "learning_rate": 0.0002870481326411179, + "loss": 3.0266, + "step": 31449 + }, + { + "epoch": 1.54, + "grad_norm": 0.6162763833999634, + "learning_rate": 0.0002870327508229296, + "loss": 3.1335, + "step": 31450 + }, + { + "epoch": 1.54, + "grad_norm": 0.6215873956680298, + "learning_rate": 0.0002870173690388944, + "loss": 3.0356, + "step": 31451 + }, + { + "epoch": 1.54, + "grad_norm": 0.6079941391944885, + "learning_rate": 0.000287001987289053, + "loss": 3.0005, + "step": 31452 + }, + { + "epoch": 1.54, + "grad_norm": 0.5799838304519653, + "learning_rate": 0.00028698660557344597, + "loss": 3.1835, + "step": 31453 + }, + { + "epoch": 1.54, + "grad_norm": 0.5891263484954834, + "learning_rate": 0.0002869712238921136, + "loss": 2.8755, + "step": 31454 + }, + { + "epoch": 1.54, + "grad_norm": 0.5852018594741821, + "learning_rate": 0.00028695584224509656, + "loss": 3.0743, + "step": 31455 + }, + { + "epoch": 1.54, + "grad_norm": 0.6360331177711487, + "learning_rate": 0.00028694046063243537, + "loss": 3.1777, + "step": 31456 + }, + { + "epoch": 1.54, + "grad_norm": 0.5894647240638733, + "learning_rate": 0.0002869250790541704, + "loss": 3.2482, + "step": 31457 + }, + { + "epoch": 1.54, + "grad_norm": 0.5866122841835022, + "learning_rate": 0.0002869096975103423, + "loss": 2.9541, + "step": 31458 + }, + { + "epoch": 1.54, + "grad_norm": 0.5750760436058044, + "learning_rate": 0.00028689431600099154, + "loss": 3.1921, + "step": 31459 + }, + { + "epoch": 1.54, + "grad_norm": 0.6584492325782776, + "learning_rate": 0.0002868789345261587, + "loss": 2.9085, + "step": 31460 + }, + { + "epoch": 1.54, + "grad_norm": 0.6034347414970398, + "learning_rate": 0.00028686355308588405, + "loss": 3.332, + "step": 31461 + }, + { + "epoch": 1.54, + "grad_norm": 0.581174910068512, + "learning_rate": 0.0002868481716802084, + "loss": 2.9919, + "step": 31462 + }, + { + "epoch": 1.54, + "grad_norm": 0.616607129573822, + "learning_rate": 0.00028683279030917216, + "loss": 2.9166, + "step": 31463 + }, + { + "epoch": 1.54, + "grad_norm": 0.6416230201721191, + "learning_rate": 0.0002868174089728157, + "loss": 2.944, + "step": 31464 + }, + { + "epoch": 1.54, + "grad_norm": 0.616322934627533, + "learning_rate": 0.0002868020276711798, + "loss": 2.9355, + "step": 31465 + }, + { + "epoch": 1.54, + "grad_norm": 0.5935969948768616, + "learning_rate": 0.0002867866464043047, + "loss": 3.1532, + "step": 31466 + }, + { + "epoch": 1.54, + "grad_norm": 0.6162576079368591, + "learning_rate": 0.00028677126517223113, + "loss": 2.867, + "step": 31467 + }, + { + "epoch": 1.54, + "grad_norm": 0.6141955852508545, + "learning_rate": 0.00028675588397499935, + "loss": 2.8846, + "step": 31468 + }, + { + "epoch": 1.54, + "grad_norm": 0.5820909142494202, + "learning_rate": 0.00028674050281265003, + "loss": 3.1472, + "step": 31469 + }, + { + "epoch": 1.54, + "grad_norm": 0.6148683428764343, + "learning_rate": 0.00028672512168522384, + "loss": 2.9993, + "step": 31470 + }, + { + "epoch": 1.54, + "grad_norm": 0.5928391218185425, + "learning_rate": 0.000286709740592761, + "loss": 2.9203, + "step": 31471 + }, + { + "epoch": 1.54, + "grad_norm": 0.5849189758300781, + "learning_rate": 0.0002866943595353023, + "loss": 2.995, + "step": 31472 + }, + { + "epoch": 1.54, + "grad_norm": 0.6054073572158813, + "learning_rate": 0.00028667897851288786, + "loss": 3.1151, + "step": 31473 + }, + { + "epoch": 1.54, + "grad_norm": 0.5830089449882507, + "learning_rate": 0.00028666359752555854, + "loss": 3.001, + "step": 31474 + }, + { + "epoch": 1.54, + "grad_norm": 0.565097451210022, + "learning_rate": 0.00028664821657335476, + "loss": 3.1232, + "step": 31475 + }, + { + "epoch": 1.54, + "grad_norm": 0.7242405414581299, + "learning_rate": 0.0002866328356563169, + "loss": 2.8098, + "step": 31476 + }, + { + "epoch": 1.54, + "grad_norm": 0.5993044376373291, + "learning_rate": 0.0002866174547744858, + "loss": 3.0741, + "step": 31477 + }, + { + "epoch": 1.54, + "grad_norm": 0.5751321911811829, + "learning_rate": 0.0002866020739279016, + "loss": 2.8981, + "step": 31478 + }, + { + "epoch": 1.54, + "grad_norm": 0.6080617308616638, + "learning_rate": 0.0002865866931166049, + "loss": 3.0646, + "step": 31479 + }, + { + "epoch": 1.54, + "grad_norm": 0.6069605350494385, + "learning_rate": 0.0002865713123406364, + "loss": 2.967, + "step": 31480 + }, + { + "epoch": 1.54, + "grad_norm": 0.6173204183578491, + "learning_rate": 0.0002865559316000364, + "loss": 2.9911, + "step": 31481 + }, + { + "epoch": 1.54, + "grad_norm": 0.5786520838737488, + "learning_rate": 0.0002865405508948456, + "loss": 2.7597, + "step": 31482 + }, + { + "epoch": 1.54, + "grad_norm": 0.6026302576065063, + "learning_rate": 0.0002865251702251042, + "loss": 3.1053, + "step": 31483 + }, + { + "epoch": 1.54, + "grad_norm": 0.5935983061790466, + "learning_rate": 0.00028650978959085305, + "loss": 3.092, + "step": 31484 + }, + { + "epoch": 1.54, + "grad_norm": 0.6370457410812378, + "learning_rate": 0.0002864944089921325, + "loss": 3.0275, + "step": 31485 + }, + { + "epoch": 1.54, + "grad_norm": 0.6145215630531311, + "learning_rate": 0.00028647902842898304, + "loss": 3.1223, + "step": 31486 + }, + { + "epoch": 1.54, + "grad_norm": 0.6142358779907227, + "learning_rate": 0.00028646364790144526, + "loss": 3.0543, + "step": 31487 + }, + { + "epoch": 1.54, + "grad_norm": 0.5921164155006409, + "learning_rate": 0.00028644826740955963, + "loss": 2.9785, + "step": 31488 + }, + { + "epoch": 1.54, + "grad_norm": 0.6127771139144897, + "learning_rate": 0.0002864328869533666, + "loss": 3.1838, + "step": 31489 + }, + { + "epoch": 1.54, + "grad_norm": 0.6095576882362366, + "learning_rate": 0.0002864175065329068, + "loss": 2.9457, + "step": 31490 + }, + { + "epoch": 1.54, + "grad_norm": 0.5726305842399597, + "learning_rate": 0.00028640212614822056, + "loss": 2.9453, + "step": 31491 + }, + { + "epoch": 1.54, + "grad_norm": 0.5958991050720215, + "learning_rate": 0.0002863867457993486, + "loss": 2.8155, + "step": 31492 + }, + { + "epoch": 1.54, + "grad_norm": 0.5724688768386841, + "learning_rate": 0.00028637136548633136, + "loss": 2.9964, + "step": 31493 + }, + { + "epoch": 1.54, + "grad_norm": 0.6108876466751099, + "learning_rate": 0.0002863559852092092, + "loss": 3.0757, + "step": 31494 + }, + { + "epoch": 1.54, + "grad_norm": 0.5869536399841309, + "learning_rate": 0.00028634060496802283, + "loss": 2.9439, + "step": 31495 + }, + { + "epoch": 1.54, + "grad_norm": 0.6103752255439758, + "learning_rate": 0.0002863252247628127, + "loss": 3.0208, + "step": 31496 + }, + { + "epoch": 1.54, + "grad_norm": 0.6240681409835815, + "learning_rate": 0.0002863098445936194, + "loss": 2.8288, + "step": 31497 + }, + { + "epoch": 1.54, + "grad_norm": 0.5823042988777161, + "learning_rate": 0.0002862944644604831, + "loss": 2.9939, + "step": 31498 + }, + { + "epoch": 1.54, + "grad_norm": 0.5619244575500488, + "learning_rate": 0.00028627908436344463, + "loss": 3.0009, + "step": 31499 + }, + { + "epoch": 1.54, + "grad_norm": 0.652056097984314, + "learning_rate": 0.0002862637043025445, + "loss": 3.3113, + "step": 31500 + }, + { + "epoch": 1.54, + "grad_norm": 0.5911815166473389, + "learning_rate": 0.0002862483242778229, + "loss": 3.1164, + "step": 31501 + }, + { + "epoch": 1.54, + "grad_norm": 0.6071313619613647, + "learning_rate": 0.0002862329442893208, + "loss": 3.1199, + "step": 31502 + }, + { + "epoch": 1.54, + "grad_norm": 0.6353355646133423, + "learning_rate": 0.00028621756433707844, + "loss": 2.8222, + "step": 31503 + }, + { + "epoch": 1.54, + "grad_norm": 0.6127616167068481, + "learning_rate": 0.00028620218442113623, + "loss": 3.1173, + "step": 31504 + }, + { + "epoch": 1.54, + "grad_norm": 0.6205554008483887, + "learning_rate": 0.00028618680454153497, + "loss": 3.0187, + "step": 31505 + }, + { + "epoch": 1.54, + "grad_norm": 0.5830715298652649, + "learning_rate": 0.000286171424698315, + "loss": 2.8732, + "step": 31506 + }, + { + "epoch": 1.54, + "grad_norm": 0.6005604267120361, + "learning_rate": 0.0002861560448915168, + "loss": 3.1406, + "step": 31507 + }, + { + "epoch": 1.54, + "grad_norm": 0.5887455344200134, + "learning_rate": 0.0002861406651211808, + "loss": 3.1733, + "step": 31508 + }, + { + "epoch": 1.54, + "grad_norm": 0.5852056741714478, + "learning_rate": 0.0002861252853873476, + "loss": 3.0398, + "step": 31509 + }, + { + "epoch": 1.54, + "grad_norm": 0.6041200160980225, + "learning_rate": 0.000286109905690058, + "loss": 2.9279, + "step": 31510 + }, + { + "epoch": 1.54, + "grad_norm": 0.590787410736084, + "learning_rate": 0.000286094526029352, + "loss": 2.9335, + "step": 31511 + }, + { + "epoch": 1.54, + "grad_norm": 0.5768125653266907, + "learning_rate": 0.00028607914640527046, + "loss": 3.1014, + "step": 31512 + }, + { + "epoch": 1.54, + "grad_norm": 0.6092056035995483, + "learning_rate": 0.00028606376681785366, + "loss": 3.0408, + "step": 31513 + }, + { + "epoch": 1.54, + "grad_norm": 0.6362078189849854, + "learning_rate": 0.0002860483872671423, + "loss": 2.8638, + "step": 31514 + }, + { + "epoch": 1.54, + "grad_norm": 0.5419045686721802, + "learning_rate": 0.00028603300775317683, + "loss": 3.0463, + "step": 31515 + }, + { + "epoch": 1.54, + "grad_norm": 0.5694605112075806, + "learning_rate": 0.00028601762827599764, + "loss": 2.8361, + "step": 31516 + }, + { + "epoch": 1.54, + "grad_norm": 0.6117928624153137, + "learning_rate": 0.00028600224883564544, + "loss": 3.0969, + "step": 31517 + }, + { + "epoch": 1.54, + "grad_norm": 0.5907307863235474, + "learning_rate": 0.00028598686943216055, + "loss": 3.1432, + "step": 31518 + }, + { + "epoch": 1.54, + "grad_norm": 0.5980222821235657, + "learning_rate": 0.00028597149006558343, + "loss": 2.9284, + "step": 31519 + }, + { + "epoch": 1.54, + "grad_norm": 0.6280714869499207, + "learning_rate": 0.0002859561107359549, + "loss": 2.811, + "step": 31520 + }, + { + "epoch": 1.54, + "grad_norm": 0.5964347720146179, + "learning_rate": 0.0002859407314433152, + "loss": 2.8092, + "step": 31521 + }, + { + "epoch": 1.54, + "grad_norm": 0.5739207863807678, + "learning_rate": 0.00028592535218770497, + "loss": 3.0099, + "step": 31522 + }, + { + "epoch": 1.54, + "grad_norm": 0.5974461436271667, + "learning_rate": 0.00028590997296916454, + "loss": 2.9818, + "step": 31523 + }, + { + "epoch": 1.54, + "grad_norm": 0.6324328184127808, + "learning_rate": 0.00028589459378773456, + "loss": 3.3268, + "step": 31524 + }, + { + "epoch": 1.54, + "grad_norm": 0.5549293756484985, + "learning_rate": 0.0002858792146434556, + "loss": 3.1289, + "step": 31525 + }, + { + "epoch": 1.55, + "grad_norm": 0.5910172462463379, + "learning_rate": 0.00028586383553636793, + "loss": 3.0805, + "step": 31526 + }, + { + "epoch": 1.55, + "grad_norm": 0.5912405848503113, + "learning_rate": 0.0002858484564665123, + "loss": 3.2192, + "step": 31527 + }, + { + "epoch": 1.55, + "grad_norm": 0.6898330450057983, + "learning_rate": 0.0002858330774339291, + "loss": 2.9639, + "step": 31528 + }, + { + "epoch": 1.55, + "grad_norm": 0.6082789897918701, + "learning_rate": 0.00028581769843865875, + "loss": 2.9877, + "step": 31529 + }, + { + "epoch": 1.55, + "grad_norm": 0.6206566691398621, + "learning_rate": 0.000285802319480742, + "loss": 3.1703, + "step": 31530 + }, + { + "epoch": 1.55, + "grad_norm": 0.5829499363899231, + "learning_rate": 0.000285786940560219, + "loss": 3.1095, + "step": 31531 + }, + { + "epoch": 1.55, + "grad_norm": 0.5791610479354858, + "learning_rate": 0.0002857715616771307, + "loss": 2.9349, + "step": 31532 + }, + { + "epoch": 1.55, + "grad_norm": 0.6252768635749817, + "learning_rate": 0.00028575618283151725, + "loss": 3.0773, + "step": 31533 + }, + { + "epoch": 1.55, + "grad_norm": 0.5908172130584717, + "learning_rate": 0.0002857408040234192, + "loss": 2.9155, + "step": 31534 + }, + { + "epoch": 1.55, + "grad_norm": 0.5993767976760864, + "learning_rate": 0.0002857254252528773, + "loss": 3.1915, + "step": 31535 + }, + { + "epoch": 1.55, + "grad_norm": 0.615296483039856, + "learning_rate": 0.0002857100465199318, + "loss": 2.9795, + "step": 31536 + }, + { + "epoch": 1.55, + "grad_norm": 0.57916659116745, + "learning_rate": 0.00028569466782462337, + "loss": 3.1722, + "step": 31537 + }, + { + "epoch": 1.55, + "grad_norm": 0.597743570804596, + "learning_rate": 0.0002856792891669922, + "loss": 2.8156, + "step": 31538 + }, + { + "epoch": 1.55, + "grad_norm": 0.5725197792053223, + "learning_rate": 0.0002856639105470792, + "loss": 2.9045, + "step": 31539 + }, + { + "epoch": 1.55, + "grad_norm": 0.5700836777687073, + "learning_rate": 0.00028564853196492475, + "loss": 3.203, + "step": 31540 + }, + { + "epoch": 1.55, + "grad_norm": 0.6204779148101807, + "learning_rate": 0.00028563315342056914, + "loss": 3.0341, + "step": 31541 + }, + { + "epoch": 1.55, + "grad_norm": 0.6638866066932678, + "learning_rate": 0.00028561777491405317, + "loss": 2.8039, + "step": 31542 + }, + { + "epoch": 1.55, + "grad_norm": 0.6386014819145203, + "learning_rate": 0.0002856023964454172, + "loss": 3.016, + "step": 31543 + }, + { + "epoch": 1.55, + "grad_norm": 0.5822014212608337, + "learning_rate": 0.0002855870180147018, + "loss": 2.9548, + "step": 31544 + }, + { + "epoch": 1.55, + "grad_norm": 0.586982786655426, + "learning_rate": 0.00028557163962194724, + "loss": 2.9768, + "step": 31545 + }, + { + "epoch": 1.55, + "grad_norm": 0.5915098786354065, + "learning_rate": 0.0002855562612671943, + "loss": 3.1178, + "step": 31546 + }, + { + "epoch": 1.55, + "grad_norm": 0.5712721943855286, + "learning_rate": 0.0002855408829504835, + "loss": 2.9357, + "step": 31547 + }, + { + "epoch": 1.55, + "grad_norm": 0.643669605255127, + "learning_rate": 0.000285525504671855, + "loss": 2.9061, + "step": 31548 + }, + { + "epoch": 1.55, + "grad_norm": 0.5977221727371216, + "learning_rate": 0.0002855101264313497, + "loss": 2.989, + "step": 31549 + }, + { + "epoch": 1.55, + "grad_norm": 0.6182084679603577, + "learning_rate": 0.0002854947482290079, + "loss": 3.1102, + "step": 31550 + }, + { + "epoch": 1.55, + "grad_norm": 0.6026540994644165, + "learning_rate": 0.0002854793700648701, + "loss": 3.0948, + "step": 31551 + }, + { + "epoch": 1.55, + "grad_norm": 0.6409699320793152, + "learning_rate": 0.00028546399193897694, + "loss": 2.8929, + "step": 31552 + }, + { + "epoch": 1.55, + "grad_norm": 0.5884620547294617, + "learning_rate": 0.0002854486138513687, + "loss": 3.3798, + "step": 31553 + }, + { + "epoch": 1.55, + "grad_norm": 0.5860359072685242, + "learning_rate": 0.0002854332358020861, + "loss": 3.1201, + "step": 31554 + }, + { + "epoch": 1.55, + "grad_norm": 0.6279225945472717, + "learning_rate": 0.0002854178577911696, + "loss": 3.0676, + "step": 31555 + }, + { + "epoch": 1.55, + "grad_norm": 0.6590830087661743, + "learning_rate": 0.00028540247981865944, + "loss": 3.0733, + "step": 31556 + }, + { + "epoch": 1.55, + "grad_norm": 0.6351860165596008, + "learning_rate": 0.0002853871018845966, + "loss": 2.8937, + "step": 31557 + }, + { + "epoch": 1.55, + "grad_norm": 0.6308092474937439, + "learning_rate": 0.0002853717239890212, + "loss": 2.96, + "step": 31558 + }, + { + "epoch": 1.55, + "grad_norm": 0.6065652370452881, + "learning_rate": 0.00028535634613197395, + "loss": 3.285, + "step": 31559 + }, + { + "epoch": 1.55, + "grad_norm": 0.5910200476646423, + "learning_rate": 0.0002853409683134951, + "loss": 2.7821, + "step": 31560 + }, + { + "epoch": 1.55, + "grad_norm": 0.624097466468811, + "learning_rate": 0.0002853255905336254, + "loss": 3.0966, + "step": 31561 + }, + { + "epoch": 1.55, + "grad_norm": 0.6241322159767151, + "learning_rate": 0.0002853102127924054, + "loss": 3.0899, + "step": 31562 + }, + { + "epoch": 1.55, + "grad_norm": 0.6398972272872925, + "learning_rate": 0.00028529483508987524, + "loss": 3.0438, + "step": 31563 + }, + { + "epoch": 1.55, + "grad_norm": 0.601036548614502, + "learning_rate": 0.0002852794574260758, + "loss": 3.2424, + "step": 31564 + }, + { + "epoch": 1.55, + "grad_norm": 0.610298216342926, + "learning_rate": 0.0002852640798010474, + "loss": 3.0497, + "step": 31565 + }, + { + "epoch": 1.55, + "grad_norm": 0.5706086754798889, + "learning_rate": 0.00028524870221483054, + "loss": 2.9957, + "step": 31566 + }, + { + "epoch": 1.55, + "grad_norm": 0.5881780385971069, + "learning_rate": 0.00028523332466746584, + "loss": 3.1404, + "step": 31567 + }, + { + "epoch": 1.55, + "grad_norm": 0.558418333530426, + "learning_rate": 0.0002852179471589937, + "loss": 2.9538, + "step": 31568 + }, + { + "epoch": 1.55, + "grad_norm": 0.5936317443847656, + "learning_rate": 0.00028520256968945465, + "loss": 2.7039, + "step": 31569 + }, + { + "epoch": 1.55, + "grad_norm": 0.6009182929992676, + "learning_rate": 0.0002851871922588891, + "loss": 3.1115, + "step": 31570 + }, + { + "epoch": 1.55, + "grad_norm": 0.6169144511222839, + "learning_rate": 0.00028517181486733764, + "loss": 3.127, + "step": 31571 + }, + { + "epoch": 1.55, + "grad_norm": 0.6200720071792603, + "learning_rate": 0.00028515643751484093, + "loss": 3.0668, + "step": 31572 + }, + { + "epoch": 1.55, + "grad_norm": 0.6260064244270325, + "learning_rate": 0.00028514106020143914, + "loss": 3.2649, + "step": 31573 + }, + { + "epoch": 1.55, + "grad_norm": 0.5613835453987122, + "learning_rate": 0.00028512568292717304, + "loss": 3.1029, + "step": 31574 + }, + { + "epoch": 1.55, + "grad_norm": 0.6062586903572083, + "learning_rate": 0.0002851103056920829, + "loss": 2.9356, + "step": 31575 + }, + { + "epoch": 1.55, + "grad_norm": 0.6101064085960388, + "learning_rate": 0.00028509492849620945, + "loss": 2.9583, + "step": 31576 + }, + { + "epoch": 1.55, + "grad_norm": 0.5925574898719788, + "learning_rate": 0.0002850795513395931, + "loss": 3.092, + "step": 31577 + }, + { + "epoch": 1.55, + "grad_norm": 0.5718997120857239, + "learning_rate": 0.00028506417422227427, + "loss": 3.0322, + "step": 31578 + }, + { + "epoch": 1.55, + "grad_norm": 0.6322290897369385, + "learning_rate": 0.0002850487971442936, + "loss": 2.9443, + "step": 31579 + }, + { + "epoch": 1.55, + "grad_norm": 0.585167646408081, + "learning_rate": 0.00028503342010569147, + "loss": 2.9556, + "step": 31580 + }, + { + "epoch": 1.55, + "grad_norm": 0.5464421510696411, + "learning_rate": 0.00028501804310650844, + "loss": 2.9251, + "step": 31581 + }, + { + "epoch": 1.55, + "grad_norm": 0.5760757327079773, + "learning_rate": 0.00028500266614678505, + "loss": 3.1459, + "step": 31582 + }, + { + "epoch": 1.55, + "grad_norm": 0.5659793615341187, + "learning_rate": 0.0002849872892265617, + "loss": 3.1723, + "step": 31583 + }, + { + "epoch": 1.55, + "grad_norm": 0.64796382188797, + "learning_rate": 0.00028497191234587904, + "loss": 2.8646, + "step": 31584 + }, + { + "epoch": 1.55, + "grad_norm": 0.7371762990951538, + "learning_rate": 0.0002849565355047773, + "loss": 3.0312, + "step": 31585 + }, + { + "epoch": 1.55, + "grad_norm": 0.5944815278053284, + "learning_rate": 0.00028494115870329723, + "loss": 2.9846, + "step": 31586 + }, + { + "epoch": 1.55, + "grad_norm": 0.5763345956802368, + "learning_rate": 0.0002849257819414793, + "loss": 2.9265, + "step": 31587 + }, + { + "epoch": 1.55, + "grad_norm": 0.5970070958137512, + "learning_rate": 0.00028491040521936385, + "loss": 3.0161, + "step": 31588 + }, + { + "epoch": 1.55, + "grad_norm": 0.594948410987854, + "learning_rate": 0.00028489502853699163, + "loss": 3.1838, + "step": 31589 + }, + { + "epoch": 1.55, + "grad_norm": 0.6008180975914001, + "learning_rate": 0.0002848796518944029, + "loss": 3.0902, + "step": 31590 + }, + { + "epoch": 1.55, + "grad_norm": 0.5951430201530457, + "learning_rate": 0.00028486427529163824, + "loss": 3.0587, + "step": 31591 + }, + { + "epoch": 1.55, + "grad_norm": 0.607059121131897, + "learning_rate": 0.0002848488987287382, + "loss": 3.0581, + "step": 31592 + }, + { + "epoch": 1.55, + "grad_norm": 0.600946307182312, + "learning_rate": 0.00028483352220574324, + "loss": 3.1581, + "step": 31593 + }, + { + "epoch": 1.55, + "grad_norm": 0.5530339479446411, + "learning_rate": 0.0002848181457226939, + "loss": 3.1023, + "step": 31594 + }, + { + "epoch": 1.55, + "grad_norm": 0.6456699371337891, + "learning_rate": 0.0002848027692796306, + "loss": 3.1343, + "step": 31595 + }, + { + "epoch": 1.55, + "grad_norm": 0.5949581265449524, + "learning_rate": 0.00028478739287659383, + "loss": 3.0981, + "step": 31596 + }, + { + "epoch": 1.55, + "grad_norm": 0.6152155995368958, + "learning_rate": 0.00028477201651362427, + "loss": 2.957, + "step": 31597 + }, + { + "epoch": 1.55, + "grad_norm": 0.6344307065010071, + "learning_rate": 0.0002847566401907622, + "loss": 2.9609, + "step": 31598 + }, + { + "epoch": 1.55, + "grad_norm": 0.5822964906692505, + "learning_rate": 0.00028474126390804833, + "loss": 3.0524, + "step": 31599 + }, + { + "epoch": 1.55, + "grad_norm": 0.5847934484481812, + "learning_rate": 0.00028472588766552284, + "loss": 2.9667, + "step": 31600 + }, + { + "epoch": 1.55, + "grad_norm": 0.5926794409751892, + "learning_rate": 0.0002847105114632265, + "loss": 3.0349, + "step": 31601 + }, + { + "epoch": 1.55, + "grad_norm": 0.5681343078613281, + "learning_rate": 0.00028469513530119983, + "loss": 3.2188, + "step": 31602 + }, + { + "epoch": 1.55, + "grad_norm": 0.5752440690994263, + "learning_rate": 0.0002846797591794831, + "loss": 3.0376, + "step": 31603 + }, + { + "epoch": 1.55, + "grad_norm": 0.6142379641532898, + "learning_rate": 0.00028466438309811704, + "loss": 3.1508, + "step": 31604 + }, + { + "epoch": 1.55, + "grad_norm": 0.6223300099372864, + "learning_rate": 0.00028464900705714197, + "loss": 2.9159, + "step": 31605 + }, + { + "epoch": 1.55, + "grad_norm": 0.6309249401092529, + "learning_rate": 0.00028463363105659837, + "loss": 2.8823, + "step": 31606 + }, + { + "epoch": 1.55, + "grad_norm": 0.5628587603569031, + "learning_rate": 0.000284618255096527, + "loss": 3.1089, + "step": 31607 + }, + { + "epoch": 1.55, + "grad_norm": 0.6215999126434326, + "learning_rate": 0.0002846028791769681, + "loss": 3.1475, + "step": 31608 + }, + { + "epoch": 1.55, + "grad_norm": 0.5789870619773865, + "learning_rate": 0.0002845875032979624, + "loss": 3.0857, + "step": 31609 + }, + { + "epoch": 1.55, + "grad_norm": 0.5938981771469116, + "learning_rate": 0.00028457212745955, + "loss": 2.958, + "step": 31610 + }, + { + "epoch": 1.55, + "grad_norm": 0.5996585488319397, + "learning_rate": 0.00028455675166177177, + "loss": 2.9466, + "step": 31611 + }, + { + "epoch": 1.55, + "grad_norm": 0.5623268485069275, + "learning_rate": 0.00028454137590466815, + "loss": 2.8726, + "step": 31612 + }, + { + "epoch": 1.55, + "grad_norm": 0.5604333281517029, + "learning_rate": 0.00028452600018827947, + "loss": 3.3238, + "step": 31613 + }, + { + "epoch": 1.55, + "grad_norm": 0.6165540218353271, + "learning_rate": 0.0002845106245126464, + "loss": 2.9483, + "step": 31614 + }, + { + "epoch": 1.55, + "grad_norm": 0.5510979890823364, + "learning_rate": 0.00028449524887780926, + "loss": 2.9379, + "step": 31615 + }, + { + "epoch": 1.55, + "grad_norm": 0.5766331553459167, + "learning_rate": 0.0002844798732838087, + "loss": 3.0955, + "step": 31616 + }, + { + "epoch": 1.55, + "grad_norm": 0.6156991124153137, + "learning_rate": 0.00028446449773068523, + "loss": 2.9406, + "step": 31617 + }, + { + "epoch": 1.55, + "grad_norm": 0.6004645824432373, + "learning_rate": 0.0002844491222184792, + "loss": 3.0694, + "step": 31618 + }, + { + "epoch": 1.55, + "grad_norm": 0.6063745021820068, + "learning_rate": 0.0002844337467472313, + "loss": 2.8684, + "step": 31619 + }, + { + "epoch": 1.55, + "grad_norm": 0.5919575691223145, + "learning_rate": 0.0002844183713169818, + "loss": 3.1631, + "step": 31620 + }, + { + "epoch": 1.55, + "grad_norm": 0.5818456411361694, + "learning_rate": 0.0002844029959277713, + "loss": 2.9513, + "step": 31621 + }, + { + "epoch": 1.55, + "grad_norm": 0.5850279927253723, + "learning_rate": 0.0002843876205796404, + "loss": 3.0365, + "step": 31622 + }, + { + "epoch": 1.55, + "grad_norm": 0.590082049369812, + "learning_rate": 0.00028437224527262943, + "loss": 2.9559, + "step": 31623 + }, + { + "epoch": 1.55, + "grad_norm": 0.8791807889938354, + "learning_rate": 0.0002843568700067791, + "loss": 2.9216, + "step": 31624 + }, + { + "epoch": 1.55, + "grad_norm": 0.593088686466217, + "learning_rate": 0.00028434149478212957, + "loss": 3.1052, + "step": 31625 + }, + { + "epoch": 1.55, + "grad_norm": 0.6907350420951843, + "learning_rate": 0.00028432611959872166, + "loss": 3.1177, + "step": 31626 + }, + { + "epoch": 1.55, + "grad_norm": 0.5803176164627075, + "learning_rate": 0.00028431074445659573, + "loss": 3.0874, + "step": 31627 + }, + { + "epoch": 1.55, + "grad_norm": 0.627086877822876, + "learning_rate": 0.0002842953693557921, + "loss": 3.0477, + "step": 31628 + }, + { + "epoch": 1.55, + "grad_norm": 0.6301358938217163, + "learning_rate": 0.0002842799942963517, + "loss": 3.1221, + "step": 31629 + }, + { + "epoch": 1.55, + "grad_norm": 0.5863131284713745, + "learning_rate": 0.0002842646192783146, + "loss": 3.176, + "step": 31630 + }, + { + "epoch": 1.55, + "grad_norm": 0.6010187864303589, + "learning_rate": 0.0002842492443017216, + "loss": 3.2, + "step": 31631 + }, + { + "epoch": 1.55, + "grad_norm": 0.5959247946739197, + "learning_rate": 0.0002842338693666129, + "loss": 2.8586, + "step": 31632 + }, + { + "epoch": 1.55, + "grad_norm": 0.5971164107322693, + "learning_rate": 0.0002842184944730292, + "loss": 2.8364, + "step": 31633 + }, + { + "epoch": 1.55, + "grad_norm": 0.6169106960296631, + "learning_rate": 0.000284203119621011, + "loss": 2.9412, + "step": 31634 + }, + { + "epoch": 1.55, + "grad_norm": 0.6330366730690002, + "learning_rate": 0.0002841877448105987, + "loss": 3.2115, + "step": 31635 + }, + { + "epoch": 1.55, + "grad_norm": 0.6117235422134399, + "learning_rate": 0.00028417237004183297, + "loss": 2.8728, + "step": 31636 + }, + { + "epoch": 1.55, + "grad_norm": 0.5777655839920044, + "learning_rate": 0.00028415699531475394, + "loss": 2.9479, + "step": 31637 + }, + { + "epoch": 1.55, + "grad_norm": 0.5683594346046448, + "learning_rate": 0.0002841416206294025, + "loss": 3.1955, + "step": 31638 + }, + { + "epoch": 1.55, + "grad_norm": 0.6049903631210327, + "learning_rate": 0.00028412624598581895, + "loss": 3.1881, + "step": 31639 + }, + { + "epoch": 1.55, + "grad_norm": 0.5954413414001465, + "learning_rate": 0.0002841108713840437, + "loss": 3.0535, + "step": 31640 + }, + { + "epoch": 1.55, + "grad_norm": 0.6370765566825867, + "learning_rate": 0.00028409549682411754, + "loss": 3.0142, + "step": 31641 + }, + { + "epoch": 1.55, + "grad_norm": 0.5675783753395081, + "learning_rate": 0.0002840801223060807, + "loss": 3.1618, + "step": 31642 + }, + { + "epoch": 1.55, + "grad_norm": 0.6547769904136658, + "learning_rate": 0.00028406474782997367, + "loss": 3.2298, + "step": 31643 + }, + { + "epoch": 1.55, + "grad_norm": 0.6375356316566467, + "learning_rate": 0.0002840493733958372, + "loss": 3.0107, + "step": 31644 + }, + { + "epoch": 1.55, + "grad_norm": 0.6045966744422913, + "learning_rate": 0.00028403399900371146, + "loss": 3.0618, + "step": 31645 + }, + { + "epoch": 1.55, + "grad_norm": 0.5673590302467346, + "learning_rate": 0.0002840186246536372, + "loss": 2.9589, + "step": 31646 + }, + { + "epoch": 1.55, + "grad_norm": 0.6037384867668152, + "learning_rate": 0.0002840032503456547, + "loss": 3.1438, + "step": 31647 + }, + { + "epoch": 1.55, + "grad_norm": 0.6107524633407593, + "learning_rate": 0.0002839878760798046, + "loss": 3.062, + "step": 31648 + }, + { + "epoch": 1.55, + "grad_norm": 0.5911772847175598, + "learning_rate": 0.00028397250185612747, + "loss": 2.9119, + "step": 31649 + }, + { + "epoch": 1.55, + "grad_norm": 0.6252596974372864, + "learning_rate": 0.0002839571276746635, + "loss": 2.9791, + "step": 31650 + }, + { + "epoch": 1.55, + "grad_norm": 0.5904194116592407, + "learning_rate": 0.0002839417535354535, + "loss": 2.805, + "step": 31651 + }, + { + "epoch": 1.55, + "grad_norm": 0.5900318026542664, + "learning_rate": 0.0002839263794385378, + "loss": 3.1029, + "step": 31652 + }, + { + "epoch": 1.55, + "grad_norm": 0.6207619309425354, + "learning_rate": 0.0002839110053839568, + "loss": 2.9533, + "step": 31653 + }, + { + "epoch": 1.55, + "grad_norm": 0.7129519581794739, + "learning_rate": 0.0002838956313717513, + "loss": 2.9724, + "step": 31654 + }, + { + "epoch": 1.55, + "grad_norm": 0.5762568712234497, + "learning_rate": 0.0002838802574019615, + "loss": 3.0598, + "step": 31655 + }, + { + "epoch": 1.55, + "grad_norm": 0.6000192761421204, + "learning_rate": 0.0002838648834746281, + "loss": 3.0808, + "step": 31656 + }, + { + "epoch": 1.55, + "grad_norm": 0.6309391260147095, + "learning_rate": 0.0002838495095897914, + "loss": 3.1001, + "step": 31657 + }, + { + "epoch": 1.55, + "grad_norm": 0.5648094415664673, + "learning_rate": 0.0002838341357474919, + "loss": 2.8382, + "step": 31658 + }, + { + "epoch": 1.55, + "grad_norm": 0.601072371006012, + "learning_rate": 0.00028381876194777037, + "loss": 2.9228, + "step": 31659 + }, + { + "epoch": 1.55, + "grad_norm": 0.6006704568862915, + "learning_rate": 0.000283803388190667, + "loss": 3.1873, + "step": 31660 + }, + { + "epoch": 1.55, + "grad_norm": 0.5812244415283203, + "learning_rate": 0.00028378801447622245, + "loss": 2.9829, + "step": 31661 + }, + { + "epoch": 1.55, + "grad_norm": 0.5904305577278137, + "learning_rate": 0.00028377264080447703, + "loss": 2.9529, + "step": 31662 + }, + { + "epoch": 1.55, + "grad_norm": 0.5870997905731201, + "learning_rate": 0.0002837572671754714, + "loss": 2.9428, + "step": 31663 + }, + { + "epoch": 1.55, + "grad_norm": 0.6143147945404053, + "learning_rate": 0.0002837418935892461, + "loss": 3.1222, + "step": 31664 + }, + { + "epoch": 1.55, + "grad_norm": 0.8274781107902527, + "learning_rate": 0.0002837265200458414, + "loss": 3.1928, + "step": 31665 + }, + { + "epoch": 1.55, + "grad_norm": 0.5731198787689209, + "learning_rate": 0.00028371114654529805, + "loss": 3.1566, + "step": 31666 + }, + { + "epoch": 1.55, + "grad_norm": 0.582653820514679, + "learning_rate": 0.0002836957730876563, + "loss": 3.0699, + "step": 31667 + }, + { + "epoch": 1.55, + "grad_norm": 0.6127033829689026, + "learning_rate": 0.0002836803996729567, + "loss": 3.074, + "step": 31668 + }, + { + "epoch": 1.55, + "grad_norm": 0.578920304775238, + "learning_rate": 0.00028366502630123994, + "loss": 3.1217, + "step": 31669 + }, + { + "epoch": 1.55, + "grad_norm": 0.574611246585846, + "learning_rate": 0.0002836496529725462, + "loss": 3.0978, + "step": 31670 + }, + { + "epoch": 1.55, + "grad_norm": 0.6003169417381287, + "learning_rate": 0.0002836342796869163, + "loss": 2.872, + "step": 31671 + }, + { + "epoch": 1.55, + "grad_norm": 0.6231915950775146, + "learning_rate": 0.0002836189064443904, + "loss": 2.9799, + "step": 31672 + }, + { + "epoch": 1.55, + "grad_norm": 0.6026855111122131, + "learning_rate": 0.00028360353324500913, + "loss": 2.9663, + "step": 31673 + }, + { + "epoch": 1.55, + "grad_norm": 0.639599621295929, + "learning_rate": 0.0002835881600888132, + "loss": 3.0197, + "step": 31674 + }, + { + "epoch": 1.55, + "grad_norm": 0.6096246242523193, + "learning_rate": 0.0002835727869758427, + "loss": 3.1729, + "step": 31675 + }, + { + "epoch": 1.55, + "grad_norm": 0.585129976272583, + "learning_rate": 0.00028355741390613846, + "loss": 2.8763, + "step": 31676 + }, + { + "epoch": 1.55, + "grad_norm": 0.5901762843132019, + "learning_rate": 0.00028354204087974064, + "loss": 3.2103, + "step": 31677 + }, + { + "epoch": 1.55, + "grad_norm": 0.6148892641067505, + "learning_rate": 0.00028352666789669003, + "loss": 2.9013, + "step": 31678 + }, + { + "epoch": 1.55, + "grad_norm": 0.5568849444389343, + "learning_rate": 0.0002835112949570271, + "loss": 2.9905, + "step": 31679 + }, + { + "epoch": 1.55, + "grad_norm": 0.5767827033996582, + "learning_rate": 0.00028349592206079207, + "loss": 3.0622, + "step": 31680 + }, + { + "epoch": 1.55, + "grad_norm": 0.6218153834342957, + "learning_rate": 0.00028348054920802575, + "loss": 2.9914, + "step": 31681 + }, + { + "epoch": 1.55, + "grad_norm": 0.6101388335227966, + "learning_rate": 0.0002834651763987684, + "loss": 3.0335, + "step": 31682 + }, + { + "epoch": 1.55, + "grad_norm": 0.5884225368499756, + "learning_rate": 0.00028344980363306056, + "loss": 2.9399, + "step": 31683 + }, + { + "epoch": 1.55, + "grad_norm": 0.6422539353370667, + "learning_rate": 0.0002834344309109429, + "loss": 2.9614, + "step": 31684 + }, + { + "epoch": 1.55, + "grad_norm": 0.5843330025672913, + "learning_rate": 0.0002834190582324557, + "loss": 3.2347, + "step": 31685 + }, + { + "epoch": 1.55, + "grad_norm": 0.5898882746696472, + "learning_rate": 0.0002834036855976395, + "loss": 3.0284, + "step": 31686 + }, + { + "epoch": 1.55, + "grad_norm": 0.6142812967300415, + "learning_rate": 0.0002833883130065347, + "loss": 3.0773, + "step": 31687 + }, + { + "epoch": 1.55, + "grad_norm": 0.591521143913269, + "learning_rate": 0.00028337294045918194, + "loss": 3.2177, + "step": 31688 + }, + { + "epoch": 1.55, + "grad_norm": 0.6057526469230652, + "learning_rate": 0.00028335756795562175, + "loss": 2.7478, + "step": 31689 + }, + { + "epoch": 1.55, + "grad_norm": 0.5793746113777161, + "learning_rate": 0.0002833421954958944, + "loss": 3.3766, + "step": 31690 + }, + { + "epoch": 1.55, + "grad_norm": 0.6439757943153381, + "learning_rate": 0.00028332682308004057, + "loss": 3.1779, + "step": 31691 + }, + { + "epoch": 1.55, + "grad_norm": 0.6085169911384583, + "learning_rate": 0.0002833114507081007, + "loss": 3.1525, + "step": 31692 + }, + { + "epoch": 1.55, + "grad_norm": 0.6145389080047607, + "learning_rate": 0.0002832960783801151, + "loss": 3.0357, + "step": 31693 + }, + { + "epoch": 1.55, + "grad_norm": 0.5741784572601318, + "learning_rate": 0.0002832807060961246, + "loss": 3.3173, + "step": 31694 + }, + { + "epoch": 1.55, + "grad_norm": 0.6480720639228821, + "learning_rate": 0.0002832653338561694, + "loss": 2.9654, + "step": 31695 + }, + { + "epoch": 1.55, + "grad_norm": 0.600227415561676, + "learning_rate": 0.00028324996166029015, + "loss": 2.8275, + "step": 31696 + }, + { + "epoch": 1.55, + "grad_norm": 0.6081393957138062, + "learning_rate": 0.00028323458950852727, + "loss": 3.164, + "step": 31697 + }, + { + "epoch": 1.55, + "grad_norm": 0.5734052658081055, + "learning_rate": 0.00028321921740092113, + "loss": 3.0086, + "step": 31698 + }, + { + "epoch": 1.55, + "grad_norm": 0.6059851050376892, + "learning_rate": 0.00028320384533751255, + "loss": 2.9567, + "step": 31699 + }, + { + "epoch": 1.55, + "grad_norm": 0.5766007304191589, + "learning_rate": 0.0002831884733183417, + "loss": 3.1765, + "step": 31700 + }, + { + "epoch": 1.55, + "grad_norm": 0.5918262600898743, + "learning_rate": 0.00028317310134344927, + "loss": 3.1488, + "step": 31701 + }, + { + "epoch": 1.55, + "grad_norm": 0.5856273770332336, + "learning_rate": 0.0002831577294128755, + "loss": 3.2332, + "step": 31702 + }, + { + "epoch": 1.55, + "grad_norm": 0.6248118877410889, + "learning_rate": 0.0002831423575266611, + "loss": 2.9965, + "step": 31703 + }, + { + "epoch": 1.55, + "grad_norm": 0.6196810603141785, + "learning_rate": 0.0002831269856848465, + "loss": 3.1695, + "step": 31704 + }, + { + "epoch": 1.55, + "grad_norm": 0.693333089351654, + "learning_rate": 0.0002831116138874721, + "loss": 3.2062, + "step": 31705 + }, + { + "epoch": 1.55, + "grad_norm": 0.6046402454376221, + "learning_rate": 0.00028309624213457854, + "loss": 3.0393, + "step": 31706 + }, + { + "epoch": 1.55, + "grad_norm": 0.6249449253082275, + "learning_rate": 0.0002830808704262062, + "loss": 2.9533, + "step": 31707 + }, + { + "epoch": 1.55, + "grad_norm": 0.5609068870544434, + "learning_rate": 0.0002830654987623957, + "loss": 2.9595, + "step": 31708 + }, + { + "epoch": 1.55, + "grad_norm": 0.6460155844688416, + "learning_rate": 0.00028305012714318717, + "loss": 2.9995, + "step": 31709 + }, + { + "epoch": 1.55, + "grad_norm": 0.6091744303703308, + "learning_rate": 0.0002830347555686215, + "loss": 3.2087, + "step": 31710 + }, + { + "epoch": 1.55, + "grad_norm": 0.5952534079551697, + "learning_rate": 0.00028301938403873904, + "loss": 2.9537, + "step": 31711 + }, + { + "epoch": 1.55, + "grad_norm": 0.5830929279327393, + "learning_rate": 0.0002830040125535802, + "loss": 2.989, + "step": 31712 + }, + { + "epoch": 1.55, + "grad_norm": 0.598037838935852, + "learning_rate": 0.00028298864111318557, + "loss": 3.0506, + "step": 31713 + }, + { + "epoch": 1.55, + "grad_norm": 0.5895772576332092, + "learning_rate": 0.00028297326971759554, + "loss": 3.0416, + "step": 31714 + }, + { + "epoch": 1.55, + "grad_norm": 0.6120174527168274, + "learning_rate": 0.00028295789836685055, + "loss": 3.1145, + "step": 31715 + }, + { + "epoch": 1.55, + "grad_norm": 0.5766130089759827, + "learning_rate": 0.00028294252706099137, + "loss": 3.0686, + "step": 31716 + }, + { + "epoch": 1.55, + "grad_norm": 0.6399678587913513, + "learning_rate": 0.0002829271558000581, + "loss": 2.9891, + "step": 31717 + }, + { + "epoch": 1.55, + "grad_norm": 0.6096529364585876, + "learning_rate": 0.0002829117845840916, + "loss": 2.9512, + "step": 31718 + }, + { + "epoch": 1.55, + "grad_norm": 0.632032036781311, + "learning_rate": 0.00028289641341313205, + "loss": 3.1313, + "step": 31719 + }, + { + "epoch": 1.55, + "grad_norm": 0.6017841696739197, + "learning_rate": 0.00028288104228722, + "loss": 3.0703, + "step": 31720 + }, + { + "epoch": 1.55, + "grad_norm": 0.6441850066184998, + "learning_rate": 0.00028286567120639616, + "loss": 2.9992, + "step": 31721 + }, + { + "epoch": 1.55, + "grad_norm": 0.5980612635612488, + "learning_rate": 0.00028285030017070076, + "loss": 3.1623, + "step": 31722 + }, + { + "epoch": 1.55, + "grad_norm": 0.5885056257247925, + "learning_rate": 0.0002828349291801744, + "loss": 2.9903, + "step": 31723 + }, + { + "epoch": 1.55, + "grad_norm": 0.5749340057373047, + "learning_rate": 0.00028281955823485745, + "loss": 2.9712, + "step": 31724 + }, + { + "epoch": 1.55, + "grad_norm": 0.5880852341651917, + "learning_rate": 0.00028280418733479053, + "loss": 3.0399, + "step": 31725 + }, + { + "epoch": 1.55, + "grad_norm": 0.6071673631668091, + "learning_rate": 0.00028278881648001415, + "loss": 2.9693, + "step": 31726 + }, + { + "epoch": 1.55, + "grad_norm": 0.6033803820610046, + "learning_rate": 0.0002827734456705686, + "loss": 3.0939, + "step": 31727 + }, + { + "epoch": 1.55, + "grad_norm": 0.5925212502479553, + "learning_rate": 0.00028275807490649456, + "loss": 3.0581, + "step": 31728 + }, + { + "epoch": 1.55, + "grad_norm": 0.6496951580047607, + "learning_rate": 0.0002827427041878324, + "loss": 2.8309, + "step": 31729 + }, + { + "epoch": 1.56, + "grad_norm": 0.568051815032959, + "learning_rate": 0.00028272733351462253, + "loss": 3.1127, + "step": 31730 + }, + { + "epoch": 1.56, + "grad_norm": 0.5681890249252319, + "learning_rate": 0.0002827119628869057, + "loss": 2.8544, + "step": 31731 + }, + { + "epoch": 1.56, + "grad_norm": 0.5864232182502747, + "learning_rate": 0.0002826965923047222, + "loss": 2.8054, + "step": 31732 + }, + { + "epoch": 1.56, + "grad_norm": 0.6019498705863953, + "learning_rate": 0.0002826812217681125, + "loss": 2.9317, + "step": 31733 + }, + { + "epoch": 1.56, + "grad_norm": 0.637272298336029, + "learning_rate": 0.00028266585127711713, + "loss": 3.1232, + "step": 31734 + }, + { + "epoch": 1.56, + "grad_norm": 0.6020846366882324, + "learning_rate": 0.0002826504808317765, + "loss": 3.0031, + "step": 31735 + }, + { + "epoch": 1.56, + "grad_norm": 0.6123052835464478, + "learning_rate": 0.00028263511043213137, + "loss": 3.0354, + "step": 31736 + }, + { + "epoch": 1.56, + "grad_norm": 0.5840319395065308, + "learning_rate": 0.0002826197400782218, + "loss": 2.7779, + "step": 31737 + }, + { + "epoch": 1.56, + "grad_norm": 0.6461364030838013, + "learning_rate": 0.0002826043697700887, + "loss": 3.3123, + "step": 31738 + }, + { + "epoch": 1.56, + "grad_norm": 0.6259379386901855, + "learning_rate": 0.00028258899950777213, + "loss": 3.0375, + "step": 31739 + }, + { + "epoch": 1.56, + "grad_norm": 0.5921148657798767, + "learning_rate": 0.0002825736292913129, + "loss": 2.9987, + "step": 31740 + }, + { + "epoch": 1.56, + "grad_norm": 0.5672594904899597, + "learning_rate": 0.00028255825912075136, + "loss": 2.9653, + "step": 31741 + }, + { + "epoch": 1.56, + "grad_norm": 0.5888809561729431, + "learning_rate": 0.00028254288899612795, + "loss": 3.078, + "step": 31742 + }, + { + "epoch": 1.56, + "grad_norm": 0.5958986282348633, + "learning_rate": 0.00028252751891748334, + "loss": 3.1048, + "step": 31743 + }, + { + "epoch": 1.56, + "grad_norm": 0.6289757490158081, + "learning_rate": 0.0002825121488848578, + "loss": 3.0964, + "step": 31744 + }, + { + "epoch": 1.56, + "grad_norm": 0.5901963710784912, + "learning_rate": 0.00028249677889829187, + "loss": 2.9512, + "step": 31745 + }, + { + "epoch": 1.56, + "grad_norm": 0.6648480296134949, + "learning_rate": 0.0002824814089578261, + "loss": 3.0944, + "step": 31746 + }, + { + "epoch": 1.56, + "grad_norm": 0.6112911105155945, + "learning_rate": 0.0002824660390635009, + "loss": 2.8058, + "step": 31747 + }, + { + "epoch": 1.56, + "grad_norm": 0.6173609495162964, + "learning_rate": 0.00028245066921535686, + "loss": 3.1509, + "step": 31748 + }, + { + "epoch": 1.56, + "grad_norm": 0.6812626123428345, + "learning_rate": 0.00028243529941343425, + "loss": 2.8451, + "step": 31749 + }, + { + "epoch": 1.56, + "grad_norm": 0.5810101628303528, + "learning_rate": 0.0002824199296577738, + "loss": 2.7511, + "step": 31750 + }, + { + "epoch": 1.56, + "grad_norm": 0.5572994947433472, + "learning_rate": 0.00028240455994841586, + "loss": 3.047, + "step": 31751 + }, + { + "epoch": 1.56, + "grad_norm": 0.6092203259468079, + "learning_rate": 0.0002823891902854008, + "loss": 3.0467, + "step": 31752 + }, + { + "epoch": 1.56, + "grad_norm": 0.6097322106361389, + "learning_rate": 0.00028237382066876937, + "loss": 3.1722, + "step": 31753 + }, + { + "epoch": 1.56, + "grad_norm": 0.5815500020980835, + "learning_rate": 0.0002823584510985618, + "loss": 3.1401, + "step": 31754 + }, + { + "epoch": 1.56, + "grad_norm": 0.6111709475517273, + "learning_rate": 0.0002823430815748187, + "loss": 3.1151, + "step": 31755 + }, + { + "epoch": 1.56, + "grad_norm": 0.6202290058135986, + "learning_rate": 0.0002823277120975806, + "loss": 2.981, + "step": 31756 + }, + { + "epoch": 1.56, + "grad_norm": 0.6156234741210938, + "learning_rate": 0.00028231234266688776, + "loss": 3.0407, + "step": 31757 + }, + { + "epoch": 1.56, + "grad_norm": 0.5927169322967529, + "learning_rate": 0.00028229697328278094, + "loss": 2.9775, + "step": 31758 + }, + { + "epoch": 1.56, + "grad_norm": 0.5997295379638672, + "learning_rate": 0.0002822816039453004, + "loss": 3.031, + "step": 31759 + }, + { + "epoch": 1.56, + "grad_norm": 0.5959268808364868, + "learning_rate": 0.0002822662346544867, + "loss": 3.1206, + "step": 31760 + }, + { + "epoch": 1.56, + "grad_norm": 0.6445311307907104, + "learning_rate": 0.0002822508654103804, + "loss": 2.9471, + "step": 31761 + }, + { + "epoch": 1.56, + "grad_norm": 0.6009385585784912, + "learning_rate": 0.0002822354962130219, + "loss": 2.9307, + "step": 31762 + }, + { + "epoch": 1.56, + "grad_norm": 0.5991648435592651, + "learning_rate": 0.00028222012706245167, + "loss": 3.112, + "step": 31763 + }, + { + "epoch": 1.56, + "grad_norm": 0.5902231335639954, + "learning_rate": 0.0002822047579587101, + "loss": 2.8237, + "step": 31764 + }, + { + "epoch": 1.56, + "grad_norm": 0.8094449639320374, + "learning_rate": 0.0002821893889018379, + "loss": 3.1565, + "step": 31765 + }, + { + "epoch": 1.56, + "grad_norm": 0.6150186657905579, + "learning_rate": 0.00028217401989187547, + "loss": 2.9342, + "step": 31766 + }, + { + "epoch": 1.56, + "grad_norm": 0.5921657681465149, + "learning_rate": 0.000282158650928863, + "loss": 2.9887, + "step": 31767 + }, + { + "epoch": 1.56, + "grad_norm": 0.5835986137390137, + "learning_rate": 0.00028214328201284146, + "loss": 3.1866, + "step": 31768 + }, + { + "epoch": 1.56, + "grad_norm": 0.5715051889419556, + "learning_rate": 0.00028212791314385097, + "loss": 2.8587, + "step": 31769 + }, + { + "epoch": 1.56, + "grad_norm": 0.6232491135597229, + "learning_rate": 0.0002821125443219321, + "loss": 3.0507, + "step": 31770 + }, + { + "epoch": 1.56, + "grad_norm": 0.5849910974502563, + "learning_rate": 0.0002820971755471254, + "loss": 2.9724, + "step": 31771 + }, + { + "epoch": 1.56, + "grad_norm": 0.6116089820861816, + "learning_rate": 0.0002820818068194713, + "loss": 3.2148, + "step": 31772 + }, + { + "epoch": 1.56, + "grad_norm": 0.6128947138786316, + "learning_rate": 0.00028206643813901024, + "loss": 3.0349, + "step": 31773 + }, + { + "epoch": 1.56, + "grad_norm": 0.591354250907898, + "learning_rate": 0.0002820510695057827, + "loss": 3.119, + "step": 31774 + }, + { + "epoch": 1.56, + "grad_norm": 0.6188297867774963, + "learning_rate": 0.0002820357009198292, + "loss": 3.2393, + "step": 31775 + }, + { + "epoch": 1.56, + "grad_norm": 0.6261090636253357, + "learning_rate": 0.0002820203323811903, + "loss": 2.9624, + "step": 31776 + }, + { + "epoch": 1.56, + "grad_norm": 0.56405109167099, + "learning_rate": 0.00028200496388990624, + "loss": 3.1006, + "step": 31777 + }, + { + "epoch": 1.56, + "grad_norm": 0.6306495666503906, + "learning_rate": 0.00028198959544601776, + "loss": 3.1529, + "step": 31778 + }, + { + "epoch": 1.56, + "grad_norm": 0.6113765239715576, + "learning_rate": 0.0002819742270495651, + "loss": 2.888, + "step": 31779 + }, + { + "epoch": 1.56, + "grad_norm": 0.5723241567611694, + "learning_rate": 0.000281958858700589, + "loss": 2.8904, + "step": 31780 + }, + { + "epoch": 1.56, + "grad_norm": 0.5704621076583862, + "learning_rate": 0.0002819434903991297, + "loss": 2.879, + "step": 31781 + }, + { + "epoch": 1.56, + "grad_norm": 0.5865147113800049, + "learning_rate": 0.00028192812214522777, + "loss": 3.0125, + "step": 31782 + }, + { + "epoch": 1.56, + "grad_norm": 0.6170030832290649, + "learning_rate": 0.00028191275393892376, + "loss": 2.9818, + "step": 31783 + }, + { + "epoch": 1.56, + "grad_norm": 0.6150692701339722, + "learning_rate": 0.00028189738578025803, + "loss": 3.019, + "step": 31784 + }, + { + "epoch": 1.56, + "grad_norm": 0.6316766142845154, + "learning_rate": 0.0002818820176692711, + "loss": 2.8793, + "step": 31785 + }, + { + "epoch": 1.56, + "grad_norm": 0.5430260300636292, + "learning_rate": 0.00028186664960600346, + "loss": 2.9567, + "step": 31786 + }, + { + "epoch": 1.56, + "grad_norm": 0.5936475992202759, + "learning_rate": 0.0002818512815904955, + "loss": 3.1525, + "step": 31787 + }, + { + "epoch": 1.56, + "grad_norm": 0.6307163834571838, + "learning_rate": 0.0002818359136227879, + "loss": 3.0512, + "step": 31788 + }, + { + "epoch": 1.56, + "grad_norm": 0.6203303933143616, + "learning_rate": 0.0002818205457029209, + "loss": 3.0299, + "step": 31789 + }, + { + "epoch": 1.56, + "grad_norm": 0.5848477482795715, + "learning_rate": 0.0002818051778309352, + "loss": 3.0923, + "step": 31790 + }, + { + "epoch": 1.56, + "grad_norm": 0.611884355545044, + "learning_rate": 0.00028178981000687105, + "loss": 3.0312, + "step": 31791 + }, + { + "epoch": 1.56, + "grad_norm": 0.585080623626709, + "learning_rate": 0.00028177444223076903, + "loss": 3.14, + "step": 31792 + }, + { + "epoch": 1.56, + "grad_norm": 0.5883607268333435, + "learning_rate": 0.00028175907450266974, + "loss": 3.2011, + "step": 31793 + }, + { + "epoch": 1.56, + "grad_norm": 0.6267507672309875, + "learning_rate": 0.0002817437068226134, + "loss": 3.17, + "step": 31794 + }, + { + "epoch": 1.56, + "grad_norm": 0.6080348491668701, + "learning_rate": 0.00028172833919064077, + "loss": 2.996, + "step": 31795 + }, + { + "epoch": 1.56, + "grad_norm": 0.5978765487670898, + "learning_rate": 0.000281712971606792, + "loss": 2.9237, + "step": 31796 + }, + { + "epoch": 1.56, + "grad_norm": 0.6047960519790649, + "learning_rate": 0.0002816976040711078, + "loss": 2.8924, + "step": 31797 + }, + { + "epoch": 1.56, + "grad_norm": 0.6580313444137573, + "learning_rate": 0.0002816822365836287, + "loss": 2.9914, + "step": 31798 + }, + { + "epoch": 1.56, + "grad_norm": 0.620141863822937, + "learning_rate": 0.00028166686914439493, + "loss": 3.1461, + "step": 31799 + }, + { + "epoch": 1.56, + "grad_norm": 0.604900062084198, + "learning_rate": 0.0002816515017534472, + "loss": 3.0193, + "step": 31800 + }, + { + "epoch": 1.56, + "grad_norm": 0.6572511196136475, + "learning_rate": 0.0002816361344108258, + "loss": 2.9982, + "step": 31801 + }, + { + "epoch": 1.56, + "grad_norm": 0.6069661974906921, + "learning_rate": 0.00028162076711657133, + "loss": 3.0559, + "step": 31802 + }, + { + "epoch": 1.56, + "grad_norm": 0.6206227540969849, + "learning_rate": 0.00028160539987072425, + "loss": 3.0348, + "step": 31803 + }, + { + "epoch": 1.56, + "grad_norm": 0.5691961050033569, + "learning_rate": 0.0002815900326733249, + "loss": 2.9718, + "step": 31804 + }, + { + "epoch": 1.56, + "grad_norm": 0.5769830942153931, + "learning_rate": 0.000281574665524414, + "loss": 2.8864, + "step": 31805 + }, + { + "epoch": 1.56, + "grad_norm": 0.6015021800994873, + "learning_rate": 0.0002815592984240318, + "loss": 3.0994, + "step": 31806 + }, + { + "epoch": 1.56, + "grad_norm": 0.6178338527679443, + "learning_rate": 0.00028154393137221886, + "loss": 3.1458, + "step": 31807 + }, + { + "epoch": 1.56, + "grad_norm": 0.5842515826225281, + "learning_rate": 0.00028152856436901573, + "loss": 3.1004, + "step": 31808 + }, + { + "epoch": 1.56, + "grad_norm": 0.5896745920181274, + "learning_rate": 0.00028151319741446276, + "loss": 3.0969, + "step": 31809 + }, + { + "epoch": 1.56, + "grad_norm": 0.6141567230224609, + "learning_rate": 0.0002814978305086005, + "loss": 2.9886, + "step": 31810 + }, + { + "epoch": 1.56, + "grad_norm": 0.6013665795326233, + "learning_rate": 0.00028148246365146927, + "loss": 3.0342, + "step": 31811 + }, + { + "epoch": 1.56, + "grad_norm": 0.6272392868995667, + "learning_rate": 0.00028146709684310974, + "loss": 3.2857, + "step": 31812 + }, + { + "epoch": 1.56, + "grad_norm": 0.5829760432243347, + "learning_rate": 0.0002814517300835624, + "loss": 3.1099, + "step": 31813 + }, + { + "epoch": 1.56, + "grad_norm": 0.629869282245636, + "learning_rate": 0.0002814363633728675, + "loss": 2.8432, + "step": 31814 + }, + { + "epoch": 1.56, + "grad_norm": 0.6304134726524353, + "learning_rate": 0.00028142099671106573, + "loss": 3.305, + "step": 31815 + }, + { + "epoch": 1.56, + "grad_norm": 0.5929440855979919, + "learning_rate": 0.00028140563009819745, + "loss": 3.1774, + "step": 31816 + }, + { + "epoch": 1.56, + "grad_norm": 0.6079155206680298, + "learning_rate": 0.0002813902635343031, + "loss": 3.167, + "step": 31817 + }, + { + "epoch": 1.56, + "grad_norm": 0.6250888705253601, + "learning_rate": 0.00028137489701942335, + "loss": 2.9714, + "step": 31818 + }, + { + "epoch": 1.56, + "grad_norm": 0.6257724165916443, + "learning_rate": 0.00028135953055359836, + "loss": 2.7605, + "step": 31819 + }, + { + "epoch": 1.56, + "grad_norm": 0.6094701290130615, + "learning_rate": 0.00028134416413686897, + "loss": 3.1054, + "step": 31820 + }, + { + "epoch": 1.56, + "grad_norm": 0.6528363823890686, + "learning_rate": 0.00028132879776927535, + "loss": 2.792, + "step": 31821 + }, + { + "epoch": 1.56, + "grad_norm": 0.647331953048706, + "learning_rate": 0.00028131343145085805, + "loss": 2.9551, + "step": 31822 + }, + { + "epoch": 1.56, + "grad_norm": 0.5757825374603271, + "learning_rate": 0.0002812980651816577, + "loss": 3.011, + "step": 31823 + }, + { + "epoch": 1.56, + "grad_norm": 0.5802479982376099, + "learning_rate": 0.00028128269896171457, + "loss": 2.9968, + "step": 31824 + }, + { + "epoch": 1.56, + "grad_norm": 0.583878755569458, + "learning_rate": 0.00028126733279106925, + "loss": 3.0743, + "step": 31825 + }, + { + "epoch": 1.56, + "grad_norm": 0.5796141624450684, + "learning_rate": 0.0002812519666697621, + "loss": 3.1808, + "step": 31826 + }, + { + "epoch": 1.56, + "grad_norm": 0.6166408658027649, + "learning_rate": 0.0002812366005978337, + "loss": 3.1588, + "step": 31827 + }, + { + "epoch": 1.56, + "grad_norm": 0.5788041353225708, + "learning_rate": 0.0002812212345753245, + "loss": 2.872, + "step": 31828 + }, + { + "epoch": 1.56, + "grad_norm": 0.5892295837402344, + "learning_rate": 0.0002812058686022749, + "loss": 3.0678, + "step": 31829 + }, + { + "epoch": 1.56, + "grad_norm": 0.6605625748634338, + "learning_rate": 0.0002811905026787255, + "loss": 2.7785, + "step": 31830 + }, + { + "epoch": 1.56, + "grad_norm": 0.6186345219612122, + "learning_rate": 0.00028117513680471663, + "loss": 3.0413, + "step": 31831 + }, + { + "epoch": 1.56, + "grad_norm": 0.561852216720581, + "learning_rate": 0.0002811597709802888, + "loss": 2.7898, + "step": 31832 + }, + { + "epoch": 1.56, + "grad_norm": 0.5842719674110413, + "learning_rate": 0.0002811444052054826, + "loss": 3.0802, + "step": 31833 + }, + { + "epoch": 1.56, + "grad_norm": 0.6352782249450684, + "learning_rate": 0.0002811290394803384, + "loss": 2.9058, + "step": 31834 + }, + { + "epoch": 1.56, + "grad_norm": 0.6168009638786316, + "learning_rate": 0.0002811136738048967, + "loss": 2.9086, + "step": 31835 + }, + { + "epoch": 1.56, + "grad_norm": 0.6582854986190796, + "learning_rate": 0.0002810983081791978, + "loss": 3.0012, + "step": 31836 + }, + { + "epoch": 1.56, + "grad_norm": 0.5972253084182739, + "learning_rate": 0.0002810829426032824, + "loss": 3.2943, + "step": 31837 + }, + { + "epoch": 1.56, + "grad_norm": 0.6005479693412781, + "learning_rate": 0.000281067577077191, + "loss": 2.9417, + "step": 31838 + }, + { + "epoch": 1.56, + "grad_norm": 0.5901066660881042, + "learning_rate": 0.0002810522116009638, + "loss": 2.9913, + "step": 31839 + }, + { + "epoch": 1.56, + "grad_norm": 0.6512740850448608, + "learning_rate": 0.0002810368461746415, + "loss": 3.1729, + "step": 31840 + }, + { + "epoch": 1.56, + "grad_norm": 0.6124058961868286, + "learning_rate": 0.00028102148079826443, + "loss": 2.9708, + "step": 31841 + }, + { + "epoch": 1.56, + "grad_norm": 0.5982481241226196, + "learning_rate": 0.0002810061154718732, + "loss": 3.1866, + "step": 31842 + }, + { + "epoch": 1.56, + "grad_norm": 0.5993276238441467, + "learning_rate": 0.00028099075019550826, + "loss": 2.8918, + "step": 31843 + }, + { + "epoch": 1.56, + "grad_norm": 0.5819422602653503, + "learning_rate": 0.00028097538496920987, + "loss": 3.1729, + "step": 31844 + }, + { + "epoch": 1.56, + "grad_norm": 0.6041077375411987, + "learning_rate": 0.0002809600197930188, + "loss": 2.9457, + "step": 31845 + }, + { + "epoch": 1.56, + "grad_norm": 0.6543545126914978, + "learning_rate": 0.0002809446546669753, + "loss": 3.07, + "step": 31846 + }, + { + "epoch": 1.56, + "grad_norm": 0.5843205451965332, + "learning_rate": 0.0002809292895911199, + "loss": 3.1067, + "step": 31847 + }, + { + "epoch": 1.56, + "grad_norm": 0.5955379605293274, + "learning_rate": 0.0002809139245654931, + "loss": 3.0711, + "step": 31848 + }, + { + "epoch": 1.56, + "grad_norm": 0.6512821912765503, + "learning_rate": 0.00028089855959013537, + "loss": 2.8107, + "step": 31849 + }, + { + "epoch": 1.56, + "grad_norm": 0.6019611954689026, + "learning_rate": 0.0002808831946650872, + "loss": 3.0046, + "step": 31850 + }, + { + "epoch": 1.56, + "grad_norm": 0.5904426574707031, + "learning_rate": 0.0002808678297903889, + "loss": 2.968, + "step": 31851 + }, + { + "epoch": 1.56, + "grad_norm": 0.6119365096092224, + "learning_rate": 0.00028085246496608115, + "loss": 3.2099, + "step": 31852 + }, + { + "epoch": 1.56, + "grad_norm": 0.6144260168075562, + "learning_rate": 0.00028083710019220434, + "loss": 3.1469, + "step": 31853 + }, + { + "epoch": 1.56, + "grad_norm": 0.5753419399261475, + "learning_rate": 0.00028082173546879876, + "loss": 3.1069, + "step": 31854 + }, + { + "epoch": 1.56, + "grad_norm": 0.5890007019042969, + "learning_rate": 0.0002808063707959052, + "loss": 2.9953, + "step": 31855 + }, + { + "epoch": 1.56, + "grad_norm": 0.6050313711166382, + "learning_rate": 0.0002807910061735639, + "loss": 3.1714, + "step": 31856 + }, + { + "epoch": 1.56, + "grad_norm": 0.6816890835762024, + "learning_rate": 0.0002807756416018155, + "loss": 3.093, + "step": 31857 + }, + { + "epoch": 1.56, + "grad_norm": 0.6396083831787109, + "learning_rate": 0.0002807602770807002, + "loss": 3.1671, + "step": 31858 + }, + { + "epoch": 1.56, + "grad_norm": 0.5927896499633789, + "learning_rate": 0.0002807449126102586, + "loss": 2.9865, + "step": 31859 + }, + { + "epoch": 1.56, + "grad_norm": 0.6155912280082703, + "learning_rate": 0.00028072954819053134, + "loss": 2.914, + "step": 31860 + }, + { + "epoch": 1.56, + "grad_norm": 0.5896044373512268, + "learning_rate": 0.0002807141838215587, + "loss": 3.0633, + "step": 31861 + }, + { + "epoch": 1.56, + "grad_norm": 0.6200414896011353, + "learning_rate": 0.00028069881950338124, + "loss": 2.9665, + "step": 31862 + }, + { + "epoch": 1.56, + "grad_norm": 0.5940514206886292, + "learning_rate": 0.00028068345523603924, + "loss": 3.2252, + "step": 31863 + }, + { + "epoch": 1.56, + "grad_norm": 0.6118853092193604, + "learning_rate": 0.00028066809101957337, + "loss": 2.9935, + "step": 31864 + }, + { + "epoch": 1.56, + "grad_norm": 0.628420889377594, + "learning_rate": 0.00028065272685402407, + "loss": 2.6308, + "step": 31865 + }, + { + "epoch": 1.56, + "grad_norm": 0.6051673293113708, + "learning_rate": 0.0002806373627394316, + "loss": 3.186, + "step": 31866 + }, + { + "epoch": 1.56, + "grad_norm": 0.6414685845375061, + "learning_rate": 0.0002806219986758368, + "loss": 2.9328, + "step": 31867 + }, + { + "epoch": 1.56, + "grad_norm": 0.6532812118530273, + "learning_rate": 0.00028060663466327986, + "loss": 3.215, + "step": 31868 + }, + { + "epoch": 1.56, + "grad_norm": 0.6058521270751953, + "learning_rate": 0.0002805912707018012, + "loss": 3.1998, + "step": 31869 + }, + { + "epoch": 1.56, + "grad_norm": 0.5776923298835754, + "learning_rate": 0.0002805759067914416, + "loss": 2.9845, + "step": 31870 + }, + { + "epoch": 1.56, + "grad_norm": 0.6667451858520508, + "learning_rate": 0.00028056054293224117, + "loss": 3.1901, + "step": 31871 + }, + { + "epoch": 1.56, + "grad_norm": 0.6361920833587646, + "learning_rate": 0.00028054517912424066, + "loss": 3.0641, + "step": 31872 + }, + { + "epoch": 1.56, + "grad_norm": 0.587245762348175, + "learning_rate": 0.0002805298153674802, + "loss": 3.1569, + "step": 31873 + }, + { + "epoch": 1.56, + "grad_norm": 0.6225107908248901, + "learning_rate": 0.0002805144516620006, + "loss": 3.0355, + "step": 31874 + }, + { + "epoch": 1.56, + "grad_norm": 0.6272902488708496, + "learning_rate": 0.0002804990880078422, + "loss": 3.0836, + "step": 31875 + }, + { + "epoch": 1.56, + "grad_norm": 0.6180784106254578, + "learning_rate": 0.0002804837244050454, + "loss": 3.0903, + "step": 31876 + }, + { + "epoch": 1.56, + "grad_norm": 0.6579588651657104, + "learning_rate": 0.00028046836085365075, + "loss": 3.1705, + "step": 31877 + }, + { + "epoch": 1.56, + "grad_norm": 0.6163791418075562, + "learning_rate": 0.00028045299735369864, + "loss": 3.1334, + "step": 31878 + }, + { + "epoch": 1.56, + "grad_norm": 0.5973280668258667, + "learning_rate": 0.0002804376339052295, + "loss": 2.9392, + "step": 31879 + }, + { + "epoch": 1.56, + "grad_norm": 0.5973444581031799, + "learning_rate": 0.000280422270508284, + "loss": 3.068, + "step": 31880 + }, + { + "epoch": 1.56, + "grad_norm": 0.5980919599533081, + "learning_rate": 0.0002804069071629024, + "loss": 3.0488, + "step": 31881 + }, + { + "epoch": 1.56, + "grad_norm": 0.5866699814796448, + "learning_rate": 0.0002803915438691253, + "loss": 2.9993, + "step": 31882 + }, + { + "epoch": 1.56, + "grad_norm": 0.6229212880134583, + "learning_rate": 0.00028037618062699306, + "loss": 3.168, + "step": 31883 + }, + { + "epoch": 1.56, + "grad_norm": 0.629422664642334, + "learning_rate": 0.0002803608174365461, + "loss": 2.9593, + "step": 31884 + }, + { + "epoch": 1.56, + "grad_norm": 0.5796070694923401, + "learning_rate": 0.0002803454542978251, + "loss": 3.0037, + "step": 31885 + }, + { + "epoch": 1.56, + "grad_norm": 0.6071597933769226, + "learning_rate": 0.0002803300912108703, + "loss": 2.9927, + "step": 31886 + }, + { + "epoch": 1.56, + "grad_norm": 0.6303873658180237, + "learning_rate": 0.0002803147281757224, + "loss": 3.144, + "step": 31887 + }, + { + "epoch": 1.56, + "grad_norm": 0.5881187915802002, + "learning_rate": 0.0002802993651924215, + "loss": 3.1171, + "step": 31888 + }, + { + "epoch": 1.56, + "grad_norm": 0.5947704315185547, + "learning_rate": 0.00028028400226100844, + "loss": 3.233, + "step": 31889 + }, + { + "epoch": 1.56, + "grad_norm": 0.6180113554000854, + "learning_rate": 0.00028026863938152355, + "loss": 3.0989, + "step": 31890 + }, + { + "epoch": 1.56, + "grad_norm": 0.6293803453445435, + "learning_rate": 0.00028025327655400707, + "loss": 3.1379, + "step": 31891 + }, + { + "epoch": 1.56, + "grad_norm": 0.6309683322906494, + "learning_rate": 0.0002802379137784999, + "loss": 3.0034, + "step": 31892 + }, + { + "epoch": 1.56, + "grad_norm": 0.5890076160430908, + "learning_rate": 0.0002802225510550421, + "loss": 3.1913, + "step": 31893 + }, + { + "epoch": 1.56, + "grad_norm": 0.5986996293067932, + "learning_rate": 0.00028020718838367426, + "loss": 3.1027, + "step": 31894 + }, + { + "epoch": 1.56, + "grad_norm": 0.6149293184280396, + "learning_rate": 0.000280191825764437, + "loss": 2.8922, + "step": 31895 + }, + { + "epoch": 1.56, + "grad_norm": 0.5837023258209229, + "learning_rate": 0.00028017646319737063, + "loss": 2.8853, + "step": 31896 + }, + { + "epoch": 1.56, + "grad_norm": 0.5797613859176636, + "learning_rate": 0.00028016110068251567, + "loss": 3.1853, + "step": 31897 + }, + { + "epoch": 1.56, + "grad_norm": 0.6410925984382629, + "learning_rate": 0.00028014573821991243, + "loss": 3.1645, + "step": 31898 + }, + { + "epoch": 1.56, + "grad_norm": 0.5977853536605835, + "learning_rate": 0.0002801303758096015, + "loss": 3.1972, + "step": 31899 + }, + { + "epoch": 1.56, + "grad_norm": 0.6081350445747375, + "learning_rate": 0.00028011501345162355, + "loss": 3.0046, + "step": 31900 + }, + { + "epoch": 1.56, + "grad_norm": 0.598984956741333, + "learning_rate": 0.0002800996511460186, + "loss": 2.9773, + "step": 31901 + }, + { + "epoch": 1.56, + "grad_norm": 0.5773735046386719, + "learning_rate": 0.0002800842888928275, + "loss": 2.9064, + "step": 31902 + }, + { + "epoch": 1.56, + "grad_norm": 0.5690639615058899, + "learning_rate": 0.0002800689266920904, + "loss": 2.9076, + "step": 31903 + }, + { + "epoch": 1.56, + "grad_norm": 0.6134076118469238, + "learning_rate": 0.00028005356454384797, + "loss": 2.9833, + "step": 31904 + }, + { + "epoch": 1.56, + "grad_norm": 0.5963901281356812, + "learning_rate": 0.0002800382024481407, + "loss": 2.8316, + "step": 31905 + }, + { + "epoch": 1.56, + "grad_norm": 0.5689248442649841, + "learning_rate": 0.0002800228404050088, + "loss": 3.0716, + "step": 31906 + }, + { + "epoch": 1.56, + "grad_norm": 0.6227253079414368, + "learning_rate": 0.00028000747841449306, + "loss": 2.9693, + "step": 31907 + }, + { + "epoch": 1.56, + "grad_norm": 0.6223208904266357, + "learning_rate": 0.0002799921164766337, + "loss": 3.0232, + "step": 31908 + }, + { + "epoch": 1.56, + "grad_norm": 0.5878385305404663, + "learning_rate": 0.00027997675459147115, + "loss": 2.9983, + "step": 31909 + }, + { + "epoch": 1.56, + "grad_norm": 0.6404999494552612, + "learning_rate": 0.00027996139275904616, + "loss": 3.1131, + "step": 31910 + }, + { + "epoch": 1.56, + "grad_norm": 0.6232870817184448, + "learning_rate": 0.0002799460309793989, + "loss": 3.0633, + "step": 31911 + }, + { + "epoch": 1.56, + "grad_norm": 0.6137735843658447, + "learning_rate": 0.00027993066925257004, + "loss": 2.8579, + "step": 31912 + }, + { + "epoch": 1.56, + "grad_norm": 0.5834466218948364, + "learning_rate": 0.0002799153075785998, + "loss": 3.0556, + "step": 31913 + }, + { + "epoch": 1.56, + "grad_norm": 0.6220276951789856, + "learning_rate": 0.0002798999459575288, + "loss": 3.0881, + "step": 31914 + }, + { + "epoch": 1.56, + "grad_norm": 0.5749549269676208, + "learning_rate": 0.00027988458438939754, + "loss": 3.0783, + "step": 31915 + }, + { + "epoch": 1.56, + "grad_norm": 0.5834282636642456, + "learning_rate": 0.0002798692228742463, + "loss": 3.078, + "step": 31916 + }, + { + "epoch": 1.56, + "grad_norm": 0.59247225522995, + "learning_rate": 0.00027985386141211584, + "loss": 2.9249, + "step": 31917 + }, + { + "epoch": 1.56, + "grad_norm": 0.5814173221588135, + "learning_rate": 0.00027983850000304626, + "loss": 2.897, + "step": 31918 + }, + { + "epoch": 1.56, + "grad_norm": 0.6004323959350586, + "learning_rate": 0.0002798231386470782, + "loss": 3.0616, + "step": 31919 + }, + { + "epoch": 1.56, + "grad_norm": 0.5897133350372314, + "learning_rate": 0.0002798077773442522, + "loss": 3.1485, + "step": 31920 + }, + { + "epoch": 1.56, + "grad_norm": 0.6207877397537231, + "learning_rate": 0.00027979241609460854, + "loss": 2.8907, + "step": 31921 + }, + { + "epoch": 1.56, + "grad_norm": 0.5839530229568481, + "learning_rate": 0.00027977705489818787, + "loss": 3.0955, + "step": 31922 + }, + { + "epoch": 1.56, + "grad_norm": 0.6286572813987732, + "learning_rate": 0.0002797616937550305, + "loss": 2.9629, + "step": 31923 + }, + { + "epoch": 1.56, + "grad_norm": 0.5850244760513306, + "learning_rate": 0.00027974633266517686, + "loss": 3.3621, + "step": 31924 + }, + { + "epoch": 1.56, + "grad_norm": 0.5959276556968689, + "learning_rate": 0.0002797309716286676, + "loss": 3.0628, + "step": 31925 + }, + { + "epoch": 1.56, + "grad_norm": 0.6070095896720886, + "learning_rate": 0.00027971561064554296, + "loss": 2.8452, + "step": 31926 + }, + { + "epoch": 1.56, + "grad_norm": 0.6149983406066895, + "learning_rate": 0.0002797002497158436, + "loss": 2.8937, + "step": 31927 + }, + { + "epoch": 1.56, + "grad_norm": 0.594487726688385, + "learning_rate": 0.0002796848888396098, + "loss": 2.9891, + "step": 31928 + }, + { + "epoch": 1.56, + "grad_norm": 0.623718798160553, + "learning_rate": 0.0002796695280168821, + "loss": 3.0839, + "step": 31929 + }, + { + "epoch": 1.56, + "grad_norm": 0.6588249206542969, + "learning_rate": 0.000279654167247701, + "loss": 2.8436, + "step": 31930 + }, + { + "epoch": 1.56, + "grad_norm": 0.6797537803649902, + "learning_rate": 0.0002796388065321068, + "loss": 3.0175, + "step": 31931 + }, + { + "epoch": 1.56, + "grad_norm": 0.6062451601028442, + "learning_rate": 0.00027962344587014024, + "loss": 3.0375, + "step": 31932 + }, + { + "epoch": 1.56, + "grad_norm": 0.5665577054023743, + "learning_rate": 0.00027960808526184146, + "loss": 3.332, + "step": 31933 + }, + { + "epoch": 1.57, + "grad_norm": 0.5775216817855835, + "learning_rate": 0.00027959272470725113, + "loss": 3.1504, + "step": 31934 + }, + { + "epoch": 1.57, + "grad_norm": 0.6084179282188416, + "learning_rate": 0.0002795773642064096, + "loss": 3.3357, + "step": 31935 + }, + { + "epoch": 1.57, + "grad_norm": 0.6588486433029175, + "learning_rate": 0.0002795620037593573, + "loss": 2.8731, + "step": 31936 + }, + { + "epoch": 1.57, + "grad_norm": 0.6479736566543579, + "learning_rate": 0.0002795466433661349, + "loss": 3.0273, + "step": 31937 + }, + { + "epoch": 1.57, + "grad_norm": 0.5970213413238525, + "learning_rate": 0.00027953128302678253, + "loss": 2.9507, + "step": 31938 + }, + { + "epoch": 1.57, + "grad_norm": 0.5876474380493164, + "learning_rate": 0.00027951592274134095, + "loss": 2.9572, + "step": 31939 + }, + { + "epoch": 1.57, + "grad_norm": 0.5981870293617249, + "learning_rate": 0.00027950056250985046, + "loss": 3.128, + "step": 31940 + }, + { + "epoch": 1.57, + "grad_norm": 0.6277065277099609, + "learning_rate": 0.0002794852023323515, + "loss": 2.9634, + "step": 31941 + }, + { + "epoch": 1.57, + "grad_norm": 0.6169352531433105, + "learning_rate": 0.0002794698422088846, + "loss": 2.9605, + "step": 31942 + }, + { + "epoch": 1.57, + "grad_norm": 0.6255897283554077, + "learning_rate": 0.0002794544821394901, + "loss": 3.017, + "step": 31943 + }, + { + "epoch": 1.57, + "grad_norm": 0.584186851978302, + "learning_rate": 0.00027943912212420873, + "loss": 3.0064, + "step": 31944 + }, + { + "epoch": 1.57, + "grad_norm": 0.5788871049880981, + "learning_rate": 0.00027942376216308064, + "loss": 2.9854, + "step": 31945 + }, + { + "epoch": 1.57, + "grad_norm": 0.601641833782196, + "learning_rate": 0.00027940840225614634, + "loss": 2.8917, + "step": 31946 + }, + { + "epoch": 1.57, + "grad_norm": 0.5883685946464539, + "learning_rate": 0.00027939304240344644, + "loss": 3.2907, + "step": 31947 + }, + { + "epoch": 1.57, + "grad_norm": 0.5894083380699158, + "learning_rate": 0.0002793776826050213, + "loss": 3.0337, + "step": 31948 + }, + { + "epoch": 1.57, + "grad_norm": 0.5871928930282593, + "learning_rate": 0.0002793623228609114, + "loss": 3.1671, + "step": 31949 + }, + { + "epoch": 1.57, + "grad_norm": 0.599830150604248, + "learning_rate": 0.00027934696317115707, + "loss": 3.0923, + "step": 31950 + }, + { + "epoch": 1.57, + "grad_norm": 0.5850623846054077, + "learning_rate": 0.0002793316035357989, + "loss": 2.9835, + "step": 31951 + }, + { + "epoch": 1.57, + "grad_norm": 0.6004115343093872, + "learning_rate": 0.00027931624395487734, + "loss": 3.0782, + "step": 31952 + }, + { + "epoch": 1.57, + "grad_norm": 0.6150354743003845, + "learning_rate": 0.00027930088442843274, + "loss": 3.1377, + "step": 31953 + }, + { + "epoch": 1.57, + "grad_norm": 0.5694727897644043, + "learning_rate": 0.0002792855249565058, + "loss": 2.9826, + "step": 31954 + }, + { + "epoch": 1.57, + "grad_norm": 0.6149030327796936, + "learning_rate": 0.0002792701655391366, + "loss": 3.1122, + "step": 31955 + }, + { + "epoch": 1.57, + "grad_norm": 0.5870358347892761, + "learning_rate": 0.00027925480617636584, + "loss": 2.9392, + "step": 31956 + }, + { + "epoch": 1.57, + "grad_norm": 0.576116681098938, + "learning_rate": 0.000279239446868234, + "loss": 2.9264, + "step": 31957 + }, + { + "epoch": 1.57, + "grad_norm": 0.586956262588501, + "learning_rate": 0.00027922408761478143, + "loss": 2.9759, + "step": 31958 + }, + { + "epoch": 1.57, + "grad_norm": 0.5969283580780029, + "learning_rate": 0.0002792087284160487, + "loss": 3.1103, + "step": 31959 + }, + { + "epoch": 1.57, + "grad_norm": 0.5975672602653503, + "learning_rate": 0.000279193369272076, + "loss": 2.7399, + "step": 31960 + }, + { + "epoch": 1.57, + "grad_norm": 0.5579419732093811, + "learning_rate": 0.000279178010182904, + "loss": 3.1493, + "step": 31961 + }, + { + "epoch": 1.57, + "grad_norm": 0.5934404134750366, + "learning_rate": 0.0002791626511485733, + "loss": 3.0314, + "step": 31962 + }, + { + "epoch": 1.57, + "grad_norm": 0.6547814607620239, + "learning_rate": 0.00027914729216912395, + "loss": 3.1367, + "step": 31963 + }, + { + "epoch": 1.57, + "grad_norm": 0.6256775856018066, + "learning_rate": 0.00027913193324459674, + "loss": 3.0097, + "step": 31964 + }, + { + "epoch": 1.57, + "grad_norm": 0.6281748414039612, + "learning_rate": 0.0002791165743750319, + "loss": 3.119, + "step": 31965 + }, + { + "epoch": 1.57, + "grad_norm": 0.584899365901947, + "learning_rate": 0.00027910121556047005, + "loss": 3.0945, + "step": 31966 + }, + { + "epoch": 1.57, + "grad_norm": 0.5488184690475464, + "learning_rate": 0.00027908585680095163, + "loss": 2.9363, + "step": 31967 + }, + { + "epoch": 1.57, + "grad_norm": 0.6100363731384277, + "learning_rate": 0.0002790704980965169, + "loss": 3.102, + "step": 31968 + }, + { + "epoch": 1.57, + "grad_norm": 0.5827188491821289, + "learning_rate": 0.0002790551394472066, + "loss": 2.9794, + "step": 31969 + }, + { + "epoch": 1.57, + "grad_norm": 0.592393696308136, + "learning_rate": 0.00027903978085306095, + "loss": 3.0996, + "step": 31970 + }, + { + "epoch": 1.57, + "grad_norm": 0.5915204882621765, + "learning_rate": 0.0002790244223141204, + "loss": 3.125, + "step": 31971 + }, + { + "epoch": 1.57, + "grad_norm": 0.6166325211524963, + "learning_rate": 0.0002790090638304257, + "loss": 3.1128, + "step": 31972 + }, + { + "epoch": 1.57, + "grad_norm": 0.6143838167190552, + "learning_rate": 0.00027899370540201697, + "loss": 2.8731, + "step": 31973 + }, + { + "epoch": 1.57, + "grad_norm": 0.5882230997085571, + "learning_rate": 0.0002789783470289348, + "loss": 3.0723, + "step": 31974 + }, + { + "epoch": 1.57, + "grad_norm": 0.5999899506568909, + "learning_rate": 0.0002789629887112195, + "loss": 3.0853, + "step": 31975 + }, + { + "epoch": 1.57, + "grad_norm": 0.6208217740058899, + "learning_rate": 0.00027894763044891176, + "loss": 2.8768, + "step": 31976 + }, + { + "epoch": 1.57, + "grad_norm": 0.6412845253944397, + "learning_rate": 0.0002789322722420519, + "loss": 3.1244, + "step": 31977 + }, + { + "epoch": 1.57, + "grad_norm": 0.5920975208282471, + "learning_rate": 0.0002789169140906803, + "loss": 3.2523, + "step": 31978 + }, + { + "epoch": 1.57, + "grad_norm": 0.576759934425354, + "learning_rate": 0.0002789015559948376, + "loss": 3.0485, + "step": 31979 + }, + { + "epoch": 1.57, + "grad_norm": 0.6126590967178345, + "learning_rate": 0.0002788861979545641, + "loss": 2.9728, + "step": 31980 + }, + { + "epoch": 1.57, + "grad_norm": 0.6673457026481628, + "learning_rate": 0.0002788708399699002, + "loss": 2.889, + "step": 31981 + }, + { + "epoch": 1.57, + "grad_norm": 0.6051563620567322, + "learning_rate": 0.0002788554820408866, + "loss": 3.1627, + "step": 31982 + }, + { + "epoch": 1.57, + "grad_norm": 0.5865967273712158, + "learning_rate": 0.00027884012416756347, + "loss": 3.1364, + "step": 31983 + }, + { + "epoch": 1.57, + "grad_norm": 0.6102021336555481, + "learning_rate": 0.0002788247663499715, + "loss": 3.0828, + "step": 31984 + }, + { + "epoch": 1.57, + "grad_norm": 0.5698949694633484, + "learning_rate": 0.0002788094085881509, + "loss": 3.0331, + "step": 31985 + }, + { + "epoch": 1.57, + "grad_norm": 0.5987421274185181, + "learning_rate": 0.00027879405088214226, + "loss": 3.156, + "step": 31986 + }, + { + "epoch": 1.57, + "grad_norm": 0.6104761362075806, + "learning_rate": 0.00027877869323198606, + "loss": 3.0174, + "step": 31987 + }, + { + "epoch": 1.57, + "grad_norm": 0.5552363395690918, + "learning_rate": 0.0002787633356377227, + "loss": 3.0127, + "step": 31988 + }, + { + "epoch": 1.57, + "grad_norm": 0.5822331309318542, + "learning_rate": 0.00027874797809939264, + "loss": 3.1789, + "step": 31989 + }, + { + "epoch": 1.57, + "grad_norm": 0.5854426622390747, + "learning_rate": 0.0002787326206170362, + "loss": 3.1524, + "step": 31990 + }, + { + "epoch": 1.57, + "grad_norm": 0.6006827354431152, + "learning_rate": 0.000278717263190694, + "loss": 2.9734, + "step": 31991 + }, + { + "epoch": 1.57, + "grad_norm": 0.5953511595726013, + "learning_rate": 0.0002787019058204065, + "loss": 2.9979, + "step": 31992 + }, + { + "epoch": 1.57, + "grad_norm": 0.5749074220657349, + "learning_rate": 0.00027868654850621397, + "loss": 2.9802, + "step": 31993 + }, + { + "epoch": 1.57, + "grad_norm": 0.638322114944458, + "learning_rate": 0.0002786711912481571, + "loss": 3.0781, + "step": 31994 + }, + { + "epoch": 1.57, + "grad_norm": 0.5966150760650635, + "learning_rate": 0.00027865583404627616, + "loss": 2.9011, + "step": 31995 + }, + { + "epoch": 1.57, + "grad_norm": 0.5970824956893921, + "learning_rate": 0.00027864047690061153, + "loss": 3.1345, + "step": 31996 + }, + { + "epoch": 1.57, + "grad_norm": 0.6031424403190613, + "learning_rate": 0.00027862511981120394, + "loss": 3.2617, + "step": 31997 + }, + { + "epoch": 1.57, + "grad_norm": 0.5995854735374451, + "learning_rate": 0.0002786097627780936, + "loss": 2.9957, + "step": 31998 + }, + { + "epoch": 1.57, + "grad_norm": 0.6159862279891968, + "learning_rate": 0.00027859440580132105, + "loss": 2.9234, + "step": 31999 + }, + { + "epoch": 1.57, + "grad_norm": 0.6073964238166809, + "learning_rate": 0.0002785790488809267, + "loss": 2.867, + "step": 32000 + }, + { + "epoch": 1.57, + "grad_norm": 1.2414467334747314, + "learning_rate": 0.00027856369201695097, + "loss": 3.1922, + "step": 32001 + }, + { + "epoch": 1.57, + "grad_norm": 0.616241455078125, + "learning_rate": 0.0002785483352094344, + "loss": 2.9149, + "step": 32002 + }, + { + "epoch": 1.57, + "grad_norm": 0.6222730875015259, + "learning_rate": 0.0002785329784584173, + "loss": 3.1566, + "step": 32003 + }, + { + "epoch": 1.57, + "grad_norm": 0.5811864137649536, + "learning_rate": 0.0002785176217639404, + "loss": 3.0071, + "step": 32004 + }, + { + "epoch": 1.57, + "grad_norm": 0.7227095365524292, + "learning_rate": 0.00027850226512604376, + "loss": 2.9084, + "step": 32005 + }, + { + "epoch": 1.57, + "grad_norm": 0.5832785964012146, + "learning_rate": 0.00027848690854476805, + "loss": 3.0637, + "step": 32006 + }, + { + "epoch": 1.57, + "grad_norm": 0.6182896494865417, + "learning_rate": 0.0002784715520201538, + "loss": 3.1451, + "step": 32007 + }, + { + "epoch": 1.57, + "grad_norm": 0.6133847832679749, + "learning_rate": 0.0002784561955522412, + "loss": 2.8646, + "step": 32008 + }, + { + "epoch": 1.57, + "grad_norm": 0.6060160994529724, + "learning_rate": 0.0002784408391410709, + "loss": 3.0987, + "step": 32009 + }, + { + "epoch": 1.57, + "grad_norm": 0.6133733987808228, + "learning_rate": 0.0002784254827866833, + "loss": 2.9392, + "step": 32010 + }, + { + "epoch": 1.57, + "grad_norm": 0.5962675213813782, + "learning_rate": 0.0002784101264891187, + "loss": 3.1465, + "step": 32011 + }, + { + "epoch": 1.57, + "grad_norm": 0.5825144648551941, + "learning_rate": 0.00027839477024841784, + "loss": 3.1008, + "step": 32012 + }, + { + "epoch": 1.57, + "grad_norm": 0.5747712254524231, + "learning_rate": 0.0002783794140646209, + "loss": 3.088, + "step": 32013 + }, + { + "epoch": 1.57, + "grad_norm": 0.624232292175293, + "learning_rate": 0.0002783640579377685, + "loss": 3.1401, + "step": 32014 + }, + { + "epoch": 1.57, + "grad_norm": 0.6441041231155396, + "learning_rate": 0.00027834870186790086, + "loss": 2.9997, + "step": 32015 + }, + { + "epoch": 1.57, + "grad_norm": 0.6096225380897522, + "learning_rate": 0.00027833334585505875, + "loss": 2.6779, + "step": 32016 + }, + { + "epoch": 1.57, + "grad_norm": 0.6041115522384644, + "learning_rate": 0.0002783179898992824, + "loss": 3.0651, + "step": 32017 + }, + { + "epoch": 1.57, + "grad_norm": 0.6201046705245972, + "learning_rate": 0.0002783026340006121, + "loss": 2.9325, + "step": 32018 + }, + { + "epoch": 1.57, + "grad_norm": 0.5794186592102051, + "learning_rate": 0.0002782872781590887, + "loss": 3.0974, + "step": 32019 + }, + { + "epoch": 1.57, + "grad_norm": 0.5904180407524109, + "learning_rate": 0.0002782719223747524, + "loss": 3.1501, + "step": 32020 + }, + { + "epoch": 1.57, + "grad_norm": 0.5724608898162842, + "learning_rate": 0.0002782565666476437, + "loss": 3.0647, + "step": 32021 + }, + { + "epoch": 1.57, + "grad_norm": 0.6130523681640625, + "learning_rate": 0.00027824121097780286, + "loss": 2.9517, + "step": 32022 + }, + { + "epoch": 1.57, + "grad_norm": 0.5855075716972351, + "learning_rate": 0.00027822585536527053, + "loss": 2.8407, + "step": 32023 + }, + { + "epoch": 1.57, + "grad_norm": 0.5709105730056763, + "learning_rate": 0.0002782104998100872, + "loss": 3.1079, + "step": 32024 + }, + { + "epoch": 1.57, + "grad_norm": 0.6038665175437927, + "learning_rate": 0.00027819514431229314, + "loss": 2.9033, + "step": 32025 + }, + { + "epoch": 1.57, + "grad_norm": 0.6230751872062683, + "learning_rate": 0.00027817978887192897, + "loss": 2.9427, + "step": 32026 + }, + { + "epoch": 1.57, + "grad_norm": 0.5995590090751648, + "learning_rate": 0.00027816443348903494, + "loss": 3.0794, + "step": 32027 + }, + { + "epoch": 1.57, + "grad_norm": 0.5960817337036133, + "learning_rate": 0.0002781490781636516, + "loss": 3.079, + "step": 32028 + }, + { + "epoch": 1.57, + "grad_norm": 0.5772078633308411, + "learning_rate": 0.0002781337228958195, + "loss": 2.9819, + "step": 32029 + }, + { + "epoch": 1.57, + "grad_norm": 0.5906760692596436, + "learning_rate": 0.0002781183676855788, + "loss": 3.1876, + "step": 32030 + }, + { + "epoch": 1.57, + "grad_norm": 0.6208398938179016, + "learning_rate": 0.00027810301253297024, + "loss": 2.9969, + "step": 32031 + }, + { + "epoch": 1.57, + "grad_norm": 0.6172988414764404, + "learning_rate": 0.0002780876574380341, + "loss": 3.1743, + "step": 32032 + }, + { + "epoch": 1.57, + "grad_norm": 0.6380127668380737, + "learning_rate": 0.0002780723024008107, + "loss": 3.0405, + "step": 32033 + }, + { + "epoch": 1.57, + "grad_norm": 0.639022946357727, + "learning_rate": 0.0002780569474213408, + "loss": 2.8838, + "step": 32034 + }, + { + "epoch": 1.57, + "grad_norm": 0.6695070862770081, + "learning_rate": 0.00027804159249966465, + "loss": 2.9628, + "step": 32035 + }, + { + "epoch": 1.57, + "grad_norm": 0.6148467063903809, + "learning_rate": 0.0002780262376358228, + "loss": 2.9927, + "step": 32036 + }, + { + "epoch": 1.57, + "grad_norm": 0.6534361839294434, + "learning_rate": 0.0002780108828298554, + "loss": 3.1393, + "step": 32037 + }, + { + "epoch": 1.57, + "grad_norm": 0.5975500345230103, + "learning_rate": 0.00027799552808180323, + "loss": 3.2256, + "step": 32038 + }, + { + "epoch": 1.57, + "grad_norm": 0.6091464161872864, + "learning_rate": 0.00027798017339170664, + "loss": 2.9941, + "step": 32039 + }, + { + "epoch": 1.57, + "grad_norm": 0.5968720316886902, + "learning_rate": 0.00027796481875960597, + "loss": 3.15, + "step": 32040 + }, + { + "epoch": 1.57, + "grad_norm": 0.5805733799934387, + "learning_rate": 0.00027794946418554183, + "loss": 3.0558, + "step": 32041 + }, + { + "epoch": 1.57, + "grad_norm": 0.638577401638031, + "learning_rate": 0.00027793410966955444, + "loss": 3.0992, + "step": 32042 + }, + { + "epoch": 1.57, + "grad_norm": 0.5825785398483276, + "learning_rate": 0.0002779187552116843, + "loss": 3.1721, + "step": 32043 + }, + { + "epoch": 1.57, + "grad_norm": 0.6435587406158447, + "learning_rate": 0.00027790340081197207, + "loss": 3.0042, + "step": 32044 + }, + { + "epoch": 1.57, + "grad_norm": 0.5820013880729675, + "learning_rate": 0.0002778880464704579, + "loss": 2.998, + "step": 32045 + }, + { + "epoch": 1.57, + "grad_norm": 0.5931047797203064, + "learning_rate": 0.0002778726921871825, + "loss": 3.1897, + "step": 32046 + }, + { + "epoch": 1.57, + "grad_norm": 0.6294622421264648, + "learning_rate": 0.0002778573379621861, + "loss": 2.9959, + "step": 32047 + }, + { + "epoch": 1.57, + "grad_norm": 0.5972573161125183, + "learning_rate": 0.0002778419837955091, + "loss": 3.0585, + "step": 32048 + }, + { + "epoch": 1.57, + "grad_norm": 0.6380653381347656, + "learning_rate": 0.0002778266296871922, + "loss": 3.0712, + "step": 32049 + }, + { + "epoch": 1.57, + "grad_norm": 0.6002389192581177, + "learning_rate": 0.0002778112756372757, + "loss": 3.1067, + "step": 32050 + }, + { + "epoch": 1.57, + "grad_norm": 0.6016014218330383, + "learning_rate": 0.0002777959216458, + "loss": 3.0874, + "step": 32051 + }, + { + "epoch": 1.57, + "grad_norm": 0.5852669477462769, + "learning_rate": 0.00027778056771280546, + "loss": 3.1815, + "step": 32052 + }, + { + "epoch": 1.57, + "grad_norm": 0.6085726022720337, + "learning_rate": 0.0002777652138383327, + "loss": 3.1229, + "step": 32053 + }, + { + "epoch": 1.57, + "grad_norm": 0.6045544147491455, + "learning_rate": 0.0002777498600224222, + "loss": 2.7204, + "step": 32054 + }, + { + "epoch": 1.57, + "grad_norm": 0.5995080471038818, + "learning_rate": 0.0002777345062651141, + "loss": 3.1933, + "step": 32055 + }, + { + "epoch": 1.57, + "grad_norm": 0.6304450035095215, + "learning_rate": 0.00027771915256644917, + "loss": 2.8745, + "step": 32056 + }, + { + "epoch": 1.57, + "grad_norm": 0.5954857468605042, + "learning_rate": 0.00027770379892646765, + "loss": 2.9642, + "step": 32057 + }, + { + "epoch": 1.57, + "grad_norm": 0.6180627346038818, + "learning_rate": 0.00027768844534520993, + "loss": 2.9956, + "step": 32058 + }, + { + "epoch": 1.57, + "grad_norm": 0.5869137644767761, + "learning_rate": 0.00027767309182271673, + "loss": 2.8287, + "step": 32059 + }, + { + "epoch": 1.57, + "grad_norm": 0.5612866282463074, + "learning_rate": 0.0002776577383590282, + "loss": 2.8453, + "step": 32060 + }, + { + "epoch": 1.57, + "grad_norm": 0.5565618872642517, + "learning_rate": 0.000277642384954185, + "loss": 3.1548, + "step": 32061 + }, + { + "epoch": 1.57, + "grad_norm": 0.6152032017707825, + "learning_rate": 0.0002776270316082273, + "loss": 2.8864, + "step": 32062 + }, + { + "epoch": 1.57, + "grad_norm": 0.5819271206855774, + "learning_rate": 0.0002776116783211957, + "loss": 3.0542, + "step": 32063 + }, + { + "epoch": 1.57, + "grad_norm": 0.5854024887084961, + "learning_rate": 0.0002775963250931308, + "loss": 2.9896, + "step": 32064 + }, + { + "epoch": 1.57, + "grad_norm": 0.5745155215263367, + "learning_rate": 0.0002775809719240727, + "loss": 2.971, + "step": 32065 + }, + { + "epoch": 1.57, + "grad_norm": 0.5686827898025513, + "learning_rate": 0.00027756561881406213, + "loss": 3.1338, + "step": 32066 + }, + { + "epoch": 1.57, + "grad_norm": 0.5938876867294312, + "learning_rate": 0.0002775502657631393, + "loss": 3.1614, + "step": 32067 + }, + { + "epoch": 1.57, + "grad_norm": 0.5606462359428406, + "learning_rate": 0.00027753491277134474, + "loss": 3.1713, + "step": 32068 + }, + { + "epoch": 1.57, + "grad_norm": 0.6324983239173889, + "learning_rate": 0.00027751955983871905, + "loss": 3.2225, + "step": 32069 + }, + { + "epoch": 1.57, + "grad_norm": 0.5873457193374634, + "learning_rate": 0.0002775042069653023, + "loss": 2.8439, + "step": 32070 + }, + { + "epoch": 1.57, + "grad_norm": 0.5858466625213623, + "learning_rate": 0.0002774888541511353, + "loss": 3.1174, + "step": 32071 + }, + { + "epoch": 1.57, + "grad_norm": 0.6003888249397278, + "learning_rate": 0.0002774735013962583, + "loss": 3.1971, + "step": 32072 + }, + { + "epoch": 1.57, + "grad_norm": 0.574512243270874, + "learning_rate": 0.00027745814870071164, + "loss": 2.901, + "step": 32073 + }, + { + "epoch": 1.57, + "grad_norm": 0.5840027928352356, + "learning_rate": 0.00027744279606453605, + "loss": 2.9237, + "step": 32074 + }, + { + "epoch": 1.57, + "grad_norm": 0.5972057580947876, + "learning_rate": 0.0002774274434877717, + "loss": 3.0789, + "step": 32075 + }, + { + "epoch": 1.57, + "grad_norm": 0.6341174840927124, + "learning_rate": 0.0002774120909704592, + "loss": 3.0405, + "step": 32076 + }, + { + "epoch": 1.57, + "grad_norm": 0.6080915331840515, + "learning_rate": 0.0002773967385126388, + "loss": 3.1676, + "step": 32077 + }, + { + "epoch": 1.57, + "grad_norm": 0.5747969150543213, + "learning_rate": 0.00027738138611435104, + "loss": 3.1673, + "step": 32078 + }, + { + "epoch": 1.57, + "grad_norm": 0.6224098205566406, + "learning_rate": 0.00027736603377563645, + "loss": 3.1983, + "step": 32079 + }, + { + "epoch": 1.57, + "grad_norm": 0.6277689337730408, + "learning_rate": 0.0002773506814965353, + "loss": 2.974, + "step": 32080 + }, + { + "epoch": 1.57, + "grad_norm": 0.6136027574539185, + "learning_rate": 0.00027733532927708815, + "loss": 3.1981, + "step": 32081 + }, + { + "epoch": 1.57, + "grad_norm": 0.6523263454437256, + "learning_rate": 0.00027731997711733533, + "loss": 3.1751, + "step": 32082 + }, + { + "epoch": 1.57, + "grad_norm": 0.6148524880409241, + "learning_rate": 0.0002773046250173173, + "loss": 2.9798, + "step": 32083 + }, + { + "epoch": 1.57, + "grad_norm": 0.5856233835220337, + "learning_rate": 0.0002772892729770746, + "loss": 2.8006, + "step": 32084 + }, + { + "epoch": 1.57, + "grad_norm": 0.5549863576889038, + "learning_rate": 0.00027727392099664746, + "loss": 3.0683, + "step": 32085 + }, + { + "epoch": 1.57, + "grad_norm": 0.6235625743865967, + "learning_rate": 0.0002772585690760766, + "loss": 3.0228, + "step": 32086 + }, + { + "epoch": 1.57, + "grad_norm": 0.6109411716461182, + "learning_rate": 0.00027724321721540223, + "loss": 3.031, + "step": 32087 + }, + { + "epoch": 1.57, + "grad_norm": 0.641261637210846, + "learning_rate": 0.00027722786541466474, + "loss": 3.1146, + "step": 32088 + }, + { + "epoch": 1.57, + "grad_norm": 0.6297993063926697, + "learning_rate": 0.00027721251367390483, + "loss": 2.9968, + "step": 32089 + }, + { + "epoch": 1.57, + "grad_norm": 0.6300150156021118, + "learning_rate": 0.0002771971619931627, + "loss": 3.0939, + "step": 32090 + }, + { + "epoch": 1.57, + "grad_norm": 0.6364719271659851, + "learning_rate": 0.00027718181037247896, + "loss": 3.026, + "step": 32091 + }, + { + "epoch": 1.57, + "grad_norm": 0.613513171672821, + "learning_rate": 0.0002771664588118937, + "loss": 2.8987, + "step": 32092 + }, + { + "epoch": 1.57, + "grad_norm": 0.6249509453773499, + "learning_rate": 0.0002771511073114478, + "loss": 2.9277, + "step": 32093 + }, + { + "epoch": 1.57, + "grad_norm": 0.605344295501709, + "learning_rate": 0.0002771357558711815, + "loss": 2.9627, + "step": 32094 + }, + { + "epoch": 1.57, + "grad_norm": 0.7609037160873413, + "learning_rate": 0.00027712040449113503, + "loss": 2.957, + "step": 32095 + }, + { + "epoch": 1.57, + "grad_norm": 0.5966310501098633, + "learning_rate": 0.0002771050531713492, + "loss": 3.1181, + "step": 32096 + }, + { + "epoch": 1.57, + "grad_norm": 0.6008804440498352, + "learning_rate": 0.00027708970191186414, + "loss": 2.9581, + "step": 32097 + }, + { + "epoch": 1.57, + "grad_norm": 0.5897909998893738, + "learning_rate": 0.0002770743507127205, + "loss": 3.3047, + "step": 32098 + }, + { + "epoch": 1.57, + "grad_norm": 0.590558648109436, + "learning_rate": 0.0002770589995739585, + "loss": 3.0585, + "step": 32099 + }, + { + "epoch": 1.57, + "grad_norm": 0.5807520747184753, + "learning_rate": 0.0002770436484956187, + "loss": 2.9507, + "step": 32100 + }, + { + "epoch": 1.57, + "grad_norm": 0.6075061559677124, + "learning_rate": 0.0002770282974777416, + "loss": 2.8376, + "step": 32101 + }, + { + "epoch": 1.57, + "grad_norm": 0.6187633275985718, + "learning_rate": 0.0002770129465203674, + "loss": 3.172, + "step": 32102 + }, + { + "epoch": 1.57, + "grad_norm": 0.5896881222724915, + "learning_rate": 0.00027699759562353684, + "loss": 3.0375, + "step": 32103 + }, + { + "epoch": 1.57, + "grad_norm": 0.5933699011802673, + "learning_rate": 0.00027698224478729005, + "loss": 3.0564, + "step": 32104 + }, + { + "epoch": 1.57, + "grad_norm": 0.5940541625022888, + "learning_rate": 0.0002769668940116676, + "loss": 2.9197, + "step": 32105 + }, + { + "epoch": 1.57, + "grad_norm": 0.6250037550926208, + "learning_rate": 0.00027695154329671003, + "loss": 3.0014, + "step": 32106 + }, + { + "epoch": 1.57, + "grad_norm": 0.6533704400062561, + "learning_rate": 0.0002769361926424575, + "loss": 2.8502, + "step": 32107 + }, + { + "epoch": 1.57, + "grad_norm": 0.5797675848007202, + "learning_rate": 0.00027692084204895075, + "loss": 3.0491, + "step": 32108 + }, + { + "epoch": 1.57, + "grad_norm": 0.5860679745674133, + "learning_rate": 0.00027690549151623, + "loss": 3.176, + "step": 32109 + }, + { + "epoch": 1.57, + "grad_norm": 0.6044549345970154, + "learning_rate": 0.0002768901410443357, + "loss": 2.8871, + "step": 32110 + }, + { + "epoch": 1.57, + "grad_norm": 0.6011185646057129, + "learning_rate": 0.0002768747906333084, + "loss": 3.0566, + "step": 32111 + }, + { + "epoch": 1.57, + "grad_norm": 0.5532136559486389, + "learning_rate": 0.00027685944028318845, + "loss": 2.9295, + "step": 32112 + }, + { + "epoch": 1.57, + "grad_norm": 0.5941985249519348, + "learning_rate": 0.00027684408999401625, + "loss": 2.9165, + "step": 32113 + }, + { + "epoch": 1.57, + "grad_norm": 0.599427342414856, + "learning_rate": 0.0002768287397658322, + "loss": 3.1695, + "step": 32114 + }, + { + "epoch": 1.57, + "grad_norm": 0.6106255650520325, + "learning_rate": 0.00027681338959867683, + "loss": 3.0471, + "step": 32115 + }, + { + "epoch": 1.57, + "grad_norm": 0.6107386946678162, + "learning_rate": 0.0002767980394925906, + "loss": 3.2058, + "step": 32116 + }, + { + "epoch": 1.57, + "grad_norm": 0.6064965128898621, + "learning_rate": 0.00027678268944761374, + "loss": 2.885, + "step": 32117 + }, + { + "epoch": 1.57, + "grad_norm": 0.6419435143470764, + "learning_rate": 0.00027676733946378697, + "loss": 3.086, + "step": 32118 + }, + { + "epoch": 1.57, + "grad_norm": 0.6065117716789246, + "learning_rate": 0.00027675198954115046, + "loss": 3.0474, + "step": 32119 + }, + { + "epoch": 1.57, + "grad_norm": 0.7930900454521179, + "learning_rate": 0.0002767366396797447, + "loss": 2.9981, + "step": 32120 + }, + { + "epoch": 1.57, + "grad_norm": 0.6305885910987854, + "learning_rate": 0.00027672128987961023, + "loss": 3.0314, + "step": 32121 + }, + { + "epoch": 1.57, + "grad_norm": 0.599500834941864, + "learning_rate": 0.00027670594014078735, + "loss": 3.3042, + "step": 32122 + }, + { + "epoch": 1.57, + "grad_norm": 0.6495477557182312, + "learning_rate": 0.00027669059046331665, + "loss": 3.0577, + "step": 32123 + }, + { + "epoch": 1.57, + "grad_norm": 0.5801239013671875, + "learning_rate": 0.00027667524084723825, + "loss": 2.8206, + "step": 32124 + }, + { + "epoch": 1.57, + "grad_norm": 0.6565015316009521, + "learning_rate": 0.00027665989129259285, + "loss": 2.8886, + "step": 32125 + }, + { + "epoch": 1.57, + "grad_norm": 0.612785279750824, + "learning_rate": 0.00027664454179942096, + "loss": 3.0586, + "step": 32126 + }, + { + "epoch": 1.57, + "grad_norm": 0.6405092477798462, + "learning_rate": 0.00027662919236776264, + "loss": 3.2097, + "step": 32127 + }, + { + "epoch": 1.57, + "grad_norm": 0.6418353319168091, + "learning_rate": 0.0002766138429976587, + "loss": 2.8991, + "step": 32128 + }, + { + "epoch": 1.57, + "grad_norm": 0.598755419254303, + "learning_rate": 0.00027659849368914926, + "loss": 3.0289, + "step": 32129 + }, + { + "epoch": 1.57, + "grad_norm": 0.6165720820426941, + "learning_rate": 0.00027658314444227493, + "loss": 3.2701, + "step": 32130 + }, + { + "epoch": 1.57, + "grad_norm": 0.6199347972869873, + "learning_rate": 0.00027656779525707614, + "loss": 2.834, + "step": 32131 + }, + { + "epoch": 1.57, + "grad_norm": 0.6760289669036865, + "learning_rate": 0.0002765524461335932, + "loss": 3.128, + "step": 32132 + }, + { + "epoch": 1.57, + "grad_norm": 0.6090681552886963, + "learning_rate": 0.0002765370970718667, + "loss": 2.799, + "step": 32133 + }, + { + "epoch": 1.57, + "grad_norm": 0.6263375282287598, + "learning_rate": 0.0002765217480719369, + "loss": 2.939, + "step": 32134 + }, + { + "epoch": 1.57, + "grad_norm": 0.5948789715766907, + "learning_rate": 0.00027650639913384423, + "loss": 2.971, + "step": 32135 + }, + { + "epoch": 1.57, + "grad_norm": 0.6212787628173828, + "learning_rate": 0.0002764910502576293, + "loss": 3.0933, + "step": 32136 + }, + { + "epoch": 1.57, + "grad_norm": 0.6157195568084717, + "learning_rate": 0.0002764757014433324, + "loss": 2.8863, + "step": 32137 + }, + { + "epoch": 1.58, + "grad_norm": 0.6237986087799072, + "learning_rate": 0.000276460352690994, + "loss": 3.1061, + "step": 32138 + }, + { + "epoch": 1.58, + "grad_norm": 0.6217524409294128, + "learning_rate": 0.00027644500400065435, + "loss": 3.0781, + "step": 32139 + }, + { + "epoch": 1.58, + "grad_norm": 0.6218914985656738, + "learning_rate": 0.00027642965537235417, + "loss": 2.946, + "step": 32140 + }, + { + "epoch": 1.58, + "grad_norm": 0.6007027626037598, + "learning_rate": 0.0002764143068061337, + "loss": 3.0428, + "step": 32141 + }, + { + "epoch": 1.58, + "grad_norm": 0.5837012529373169, + "learning_rate": 0.0002763989583020334, + "loss": 2.8888, + "step": 32142 + }, + { + "epoch": 1.58, + "grad_norm": 0.5839048624038696, + "learning_rate": 0.00027638360986009374, + "loss": 3.089, + "step": 32143 + }, + { + "epoch": 1.58, + "grad_norm": 0.6786932945251465, + "learning_rate": 0.0002763682614803551, + "loss": 2.9294, + "step": 32144 + }, + { + "epoch": 1.58, + "grad_norm": 0.6156800985336304, + "learning_rate": 0.0002763529131628578, + "loss": 3.2307, + "step": 32145 + }, + { + "epoch": 1.58, + "grad_norm": 0.6580122709274292, + "learning_rate": 0.0002763375649076425, + "loss": 2.9384, + "step": 32146 + }, + { + "epoch": 1.58, + "grad_norm": 0.6277601718902588, + "learning_rate": 0.0002763222167147494, + "loss": 2.9617, + "step": 32147 + }, + { + "epoch": 1.58, + "grad_norm": 0.6300899386405945, + "learning_rate": 0.0002763068685842191, + "loss": 3.0255, + "step": 32148 + }, + { + "epoch": 1.58, + "grad_norm": 0.5806252956390381, + "learning_rate": 0.000276291520516092, + "loss": 2.7774, + "step": 32149 + }, + { + "epoch": 1.58, + "grad_norm": 0.5649163126945496, + "learning_rate": 0.0002762761725104083, + "loss": 3.0464, + "step": 32150 + }, + { + "epoch": 1.58, + "grad_norm": 0.6552037000656128, + "learning_rate": 0.0002762608245672088, + "loss": 2.938, + "step": 32151 + }, + { + "epoch": 1.58, + "grad_norm": 0.5685431957244873, + "learning_rate": 0.0002762454766865336, + "loss": 3.0988, + "step": 32152 + }, + { + "epoch": 1.58, + "grad_norm": 0.5887003540992737, + "learning_rate": 0.00027623012886842334, + "loss": 2.9508, + "step": 32153 + }, + { + "epoch": 1.58, + "grad_norm": 0.6270775198936462, + "learning_rate": 0.0002762147811129182, + "loss": 3.3599, + "step": 32154 + }, + { + "epoch": 1.58, + "grad_norm": 0.5925376415252686, + "learning_rate": 0.0002761994334200588, + "loss": 2.8112, + "step": 32155 + }, + { + "epoch": 1.58, + "grad_norm": 0.6008676290512085, + "learning_rate": 0.00027618408578988557, + "loss": 3.0296, + "step": 32156 + }, + { + "epoch": 1.58, + "grad_norm": 0.6249087452888489, + "learning_rate": 0.0002761687382224388, + "loss": 3.0094, + "step": 32157 + }, + { + "epoch": 1.58, + "grad_norm": 0.6997944116592407, + "learning_rate": 0.00027615339071775905, + "loss": 2.9996, + "step": 32158 + }, + { + "epoch": 1.58, + "grad_norm": 0.6397016644477844, + "learning_rate": 0.00027613804327588666, + "loss": 3.184, + "step": 32159 + }, + { + "epoch": 1.58, + "grad_norm": 0.6241958141326904, + "learning_rate": 0.000276122695896862, + "loss": 3.1826, + "step": 32160 + }, + { + "epoch": 1.58, + "grad_norm": 0.5984312295913696, + "learning_rate": 0.0002761073485807257, + "loss": 2.841, + "step": 32161 + }, + { + "epoch": 1.58, + "grad_norm": 0.6074608564376831, + "learning_rate": 0.00027609200132751793, + "loss": 2.9895, + "step": 32162 + }, + { + "epoch": 1.58, + "grad_norm": 0.6068947315216064, + "learning_rate": 0.0002760766541372794, + "loss": 2.7853, + "step": 32163 + }, + { + "epoch": 1.58, + "grad_norm": 0.6099889278411865, + "learning_rate": 0.0002760613070100501, + "loss": 3.0638, + "step": 32164 + }, + { + "epoch": 1.58, + "grad_norm": 0.640886127948761, + "learning_rate": 0.0002760459599458709, + "loss": 3.0396, + "step": 32165 + }, + { + "epoch": 1.58, + "grad_norm": 0.6190171241760254, + "learning_rate": 0.00027603061294478203, + "loss": 3.0839, + "step": 32166 + }, + { + "epoch": 1.58, + "grad_norm": 0.5623420476913452, + "learning_rate": 0.0002760152660068238, + "loss": 2.9209, + "step": 32167 + }, + { + "epoch": 1.58, + "grad_norm": 0.588102400302887, + "learning_rate": 0.0002759999191320368, + "loss": 3.0652, + "step": 32168 + }, + { + "epoch": 1.58, + "grad_norm": 0.5879389047622681, + "learning_rate": 0.0002759845723204613, + "loss": 2.8914, + "step": 32169 + }, + { + "epoch": 1.58, + "grad_norm": 0.6012025475502014, + "learning_rate": 0.000275969225572138, + "loss": 3.1915, + "step": 32170 + }, + { + "epoch": 1.58, + "grad_norm": 0.6043398380279541, + "learning_rate": 0.00027595387888710703, + "loss": 3.0019, + "step": 32171 + }, + { + "epoch": 1.58, + "grad_norm": 0.6456342935562134, + "learning_rate": 0.00027593853226540887, + "loss": 2.8778, + "step": 32172 + }, + { + "epoch": 1.58, + "grad_norm": 0.5889996886253357, + "learning_rate": 0.0002759231857070841, + "loss": 2.9956, + "step": 32173 + }, + { + "epoch": 1.58, + "grad_norm": 0.697205126285553, + "learning_rate": 0.00027590783921217294, + "loss": 3.0563, + "step": 32174 + }, + { + "epoch": 1.58, + "grad_norm": 0.5691419243812561, + "learning_rate": 0.000275892492780716, + "loss": 3.1269, + "step": 32175 + }, + { + "epoch": 1.58, + "grad_norm": 0.6309320330619812, + "learning_rate": 0.00027587714641275345, + "loss": 2.7833, + "step": 32176 + }, + { + "epoch": 1.58, + "grad_norm": 0.5982531905174255, + "learning_rate": 0.0002758618001083259, + "loss": 2.9703, + "step": 32177 + }, + { + "epoch": 1.58, + "grad_norm": 0.5961381793022156, + "learning_rate": 0.0002758464538674738, + "loss": 3.017, + "step": 32178 + }, + { + "epoch": 1.58, + "grad_norm": 0.6061530113220215, + "learning_rate": 0.00027583110769023735, + "loss": 3.2634, + "step": 32179 + }, + { + "epoch": 1.58, + "grad_norm": 0.5718122124671936, + "learning_rate": 0.00027581576157665726, + "loss": 3.1522, + "step": 32180 + }, + { + "epoch": 1.58, + "grad_norm": 0.632178544998169, + "learning_rate": 0.0002758004155267738, + "loss": 2.8639, + "step": 32181 + }, + { + "epoch": 1.58, + "grad_norm": 0.6122092008590698, + "learning_rate": 0.0002757850695406272, + "loss": 2.8233, + "step": 32182 + }, + { + "epoch": 1.58, + "grad_norm": 0.6195982694625854, + "learning_rate": 0.00027576972361825823, + "loss": 3.092, + "step": 32183 + }, + { + "epoch": 1.58, + "grad_norm": 0.6117836833000183, + "learning_rate": 0.0002757543777597071, + "loss": 3.2046, + "step": 32184 + }, + { + "epoch": 1.58, + "grad_norm": 0.6399086117744446, + "learning_rate": 0.0002757390319650144, + "loss": 3.0408, + "step": 32185 + }, + { + "epoch": 1.58, + "grad_norm": 0.6001808047294617, + "learning_rate": 0.0002757236862342202, + "loss": 2.9358, + "step": 32186 + }, + { + "epoch": 1.58, + "grad_norm": 0.5729873776435852, + "learning_rate": 0.00027570834056736516, + "loss": 3.1652, + "step": 32187 + }, + { + "epoch": 1.58, + "grad_norm": 0.5765482187271118, + "learning_rate": 0.00027569299496448983, + "loss": 3.0379, + "step": 32188 + }, + { + "epoch": 1.58, + "grad_norm": 0.5675467848777771, + "learning_rate": 0.00027567764942563436, + "loss": 3.0612, + "step": 32189 + }, + { + "epoch": 1.58, + "grad_norm": 0.5885112285614014, + "learning_rate": 0.0002756623039508394, + "loss": 2.9927, + "step": 32190 + }, + { + "epoch": 1.58, + "grad_norm": 0.6263838410377502, + "learning_rate": 0.0002756469585401451, + "loss": 3.1661, + "step": 32191 + }, + { + "epoch": 1.58, + "grad_norm": 0.580174446105957, + "learning_rate": 0.00027563161319359207, + "loss": 3.057, + "step": 32192 + }, + { + "epoch": 1.58, + "grad_norm": 0.5984945297241211, + "learning_rate": 0.0002756162679112208, + "loss": 3.1215, + "step": 32193 + }, + { + "epoch": 1.58, + "grad_norm": 0.588066041469574, + "learning_rate": 0.0002756009226930714, + "loss": 3.1108, + "step": 32194 + }, + { + "epoch": 1.58, + "grad_norm": 0.6735916137695312, + "learning_rate": 0.0002755855775391846, + "loss": 2.7924, + "step": 32195 + }, + { + "epoch": 1.58, + "grad_norm": 0.6410754919052124, + "learning_rate": 0.0002755702324496007, + "loss": 2.906, + "step": 32196 + }, + { + "epoch": 1.58, + "grad_norm": 0.5920937061309814, + "learning_rate": 0.00027555488742436, + "loss": 3.1199, + "step": 32197 + }, + { + "epoch": 1.58, + "grad_norm": 0.5957187414169312, + "learning_rate": 0.0002755395424635031, + "loss": 2.8848, + "step": 32198 + }, + { + "epoch": 1.58, + "grad_norm": 0.5822932124137878, + "learning_rate": 0.0002755241975670703, + "loss": 2.7998, + "step": 32199 + }, + { + "epoch": 1.58, + "grad_norm": 0.6525944471359253, + "learning_rate": 0.00027550885273510214, + "loss": 3.3601, + "step": 32200 + }, + { + "epoch": 1.58, + "grad_norm": 0.6136645674705505, + "learning_rate": 0.0002754935079676388, + "loss": 2.8207, + "step": 32201 + }, + { + "epoch": 1.58, + "grad_norm": 0.5625642538070679, + "learning_rate": 0.0002754781632647209, + "loss": 2.9618, + "step": 32202 + }, + { + "epoch": 1.58, + "grad_norm": 0.6007310748100281, + "learning_rate": 0.0002754628186263889, + "loss": 3.0686, + "step": 32203 + }, + { + "epoch": 1.58, + "grad_norm": 0.5860627889633179, + "learning_rate": 0.00027544747405268297, + "loss": 2.9696, + "step": 32204 + }, + { + "epoch": 1.58, + "grad_norm": 0.6014703512191772, + "learning_rate": 0.00027543212954364376, + "loss": 3.1983, + "step": 32205 + }, + { + "epoch": 1.58, + "grad_norm": 0.6181617975234985, + "learning_rate": 0.0002754167850993116, + "loss": 3.234, + "step": 32206 + }, + { + "epoch": 1.58, + "grad_norm": 0.6260382533073425, + "learning_rate": 0.00027540144071972676, + "loss": 2.9967, + "step": 32207 + }, + { + "epoch": 1.58, + "grad_norm": 0.5667868256568909, + "learning_rate": 0.00027538609640492996, + "loss": 2.902, + "step": 32208 + }, + { + "epoch": 1.58, + "grad_norm": 0.5965689420700073, + "learning_rate": 0.0002753707521549613, + "loss": 3.1618, + "step": 32209 + }, + { + "epoch": 1.58, + "grad_norm": 0.6163270473480225, + "learning_rate": 0.0002753554079698615, + "loss": 2.993, + "step": 32210 + }, + { + "epoch": 1.58, + "grad_norm": 0.6091551780700684, + "learning_rate": 0.0002753400638496707, + "loss": 3.1026, + "step": 32211 + }, + { + "epoch": 1.58, + "grad_norm": 0.615360677242279, + "learning_rate": 0.00027532471979442943, + "loss": 3.3256, + "step": 32212 + }, + { + "epoch": 1.58, + "grad_norm": 0.58626389503479, + "learning_rate": 0.00027530937580417814, + "loss": 3.1256, + "step": 32213 + }, + { + "epoch": 1.58, + "grad_norm": 0.6162660717964172, + "learning_rate": 0.0002752940318789572, + "loss": 2.9119, + "step": 32214 + }, + { + "epoch": 1.58, + "grad_norm": 0.630847692489624, + "learning_rate": 0.0002752786880188071, + "loss": 3.0364, + "step": 32215 + }, + { + "epoch": 1.58, + "grad_norm": 0.5829058289527893, + "learning_rate": 0.000275263344223768, + "loss": 3.141, + "step": 32216 + }, + { + "epoch": 1.58, + "grad_norm": 0.6056093573570251, + "learning_rate": 0.00027524800049388056, + "loss": 3.0728, + "step": 32217 + }, + { + "epoch": 1.58, + "grad_norm": 0.6000507473945618, + "learning_rate": 0.0002752326568291852, + "loss": 2.8014, + "step": 32218 + }, + { + "epoch": 1.58, + "grad_norm": 0.5783975124359131, + "learning_rate": 0.0002752173132297221, + "loss": 2.9118, + "step": 32219 + }, + { + "epoch": 1.58, + "grad_norm": 0.5872189402580261, + "learning_rate": 0.0002752019696955321, + "loss": 2.8498, + "step": 32220 + }, + { + "epoch": 1.58, + "grad_norm": 0.6652278900146484, + "learning_rate": 0.0002751866262266551, + "loss": 3.0035, + "step": 32221 + }, + { + "epoch": 1.58, + "grad_norm": 0.5831838846206665, + "learning_rate": 0.0002751712828231317, + "loss": 3.2491, + "step": 32222 + }, + { + "epoch": 1.58, + "grad_norm": 0.6078551411628723, + "learning_rate": 0.00027515593948500256, + "loss": 3.0388, + "step": 32223 + }, + { + "epoch": 1.58, + "grad_norm": 0.5786187648773193, + "learning_rate": 0.0002751405962123078, + "loss": 2.9939, + "step": 32224 + }, + { + "epoch": 1.58, + "grad_norm": 0.5971993207931519, + "learning_rate": 0.00027512525300508805, + "loss": 3.2239, + "step": 32225 + }, + { + "epoch": 1.58, + "grad_norm": 0.5976489782333374, + "learning_rate": 0.0002751099098633834, + "loss": 3.0328, + "step": 32226 + }, + { + "epoch": 1.58, + "grad_norm": 0.5616191029548645, + "learning_rate": 0.0002750945667872345, + "loss": 2.9874, + "step": 32227 + }, + { + "epoch": 1.58, + "grad_norm": 0.6311202049255371, + "learning_rate": 0.0002750792237766819, + "loss": 2.9954, + "step": 32228 + }, + { + "epoch": 1.58, + "grad_norm": 0.5888649821281433, + "learning_rate": 0.0002750638808317656, + "loss": 3.1133, + "step": 32229 + }, + { + "epoch": 1.58, + "grad_norm": 0.6184988617897034, + "learning_rate": 0.0002750485379525264, + "loss": 3.1047, + "step": 32230 + }, + { + "epoch": 1.58, + "grad_norm": 0.6103339791297913, + "learning_rate": 0.0002750331951390044, + "loss": 3.2664, + "step": 32231 + }, + { + "epoch": 1.58, + "grad_norm": 0.5738555788993835, + "learning_rate": 0.0002750178523912402, + "loss": 3.1224, + "step": 32232 + }, + { + "epoch": 1.58, + "grad_norm": 0.6099984049797058, + "learning_rate": 0.00027500250970927424, + "loss": 3.1296, + "step": 32233 + }, + { + "epoch": 1.58, + "grad_norm": 0.6913991570472717, + "learning_rate": 0.0002749871670931468, + "loss": 2.9424, + "step": 32234 + }, + { + "epoch": 1.58, + "grad_norm": 0.5921345353126526, + "learning_rate": 0.0002749718245428984, + "loss": 3.1274, + "step": 32235 + }, + { + "epoch": 1.58, + "grad_norm": 0.5641476511955261, + "learning_rate": 0.00027495648205856936, + "loss": 3.1257, + "step": 32236 + }, + { + "epoch": 1.58, + "grad_norm": 0.6268711090087891, + "learning_rate": 0.0002749411396402001, + "loss": 3.0799, + "step": 32237 + }, + { + "epoch": 1.58, + "grad_norm": 0.6031218767166138, + "learning_rate": 0.0002749257972878311, + "loss": 2.9939, + "step": 32238 + }, + { + "epoch": 1.58, + "grad_norm": 0.5958530902862549, + "learning_rate": 0.00027491045500150274, + "loss": 3.1693, + "step": 32239 + }, + { + "epoch": 1.58, + "grad_norm": 0.5875588059425354, + "learning_rate": 0.00027489511278125546, + "loss": 3.2846, + "step": 32240 + }, + { + "epoch": 1.58, + "grad_norm": 0.5789769887924194, + "learning_rate": 0.0002748797706271295, + "loss": 2.9361, + "step": 32241 + }, + { + "epoch": 1.58, + "grad_norm": 0.5652769207954407, + "learning_rate": 0.0002748644285391654, + "loss": 3.0612, + "step": 32242 + }, + { + "epoch": 1.58, + "grad_norm": 0.57442307472229, + "learning_rate": 0.00027484908651740367, + "loss": 3.177, + "step": 32243 + }, + { + "epoch": 1.58, + "grad_norm": 0.6496897339820862, + "learning_rate": 0.00027483374456188447, + "loss": 2.985, + "step": 32244 + }, + { + "epoch": 1.58, + "grad_norm": 0.5900055170059204, + "learning_rate": 0.00027481840267264845, + "loss": 2.9613, + "step": 32245 + }, + { + "epoch": 1.58, + "grad_norm": 0.6131547093391418, + "learning_rate": 0.0002748030608497359, + "loss": 3.0876, + "step": 32246 + }, + { + "epoch": 1.58, + "grad_norm": 0.6449847221374512, + "learning_rate": 0.00027478771909318725, + "loss": 2.7354, + "step": 32247 + }, + { + "epoch": 1.58, + "grad_norm": 0.5974999666213989, + "learning_rate": 0.0002747723774030428, + "loss": 3.2546, + "step": 32248 + }, + { + "epoch": 1.58, + "grad_norm": 0.6547728776931763, + "learning_rate": 0.000274757035779343, + "loss": 3.1246, + "step": 32249 + }, + { + "epoch": 1.58, + "grad_norm": 0.6210682988166809, + "learning_rate": 0.00027474169422212846, + "loss": 3.1677, + "step": 32250 + }, + { + "epoch": 1.58, + "grad_norm": 0.924889326095581, + "learning_rate": 0.00027472635273143937, + "loss": 3.046, + "step": 32251 + }, + { + "epoch": 1.58, + "grad_norm": 0.5950158834457397, + "learning_rate": 0.0002747110113073163, + "loss": 3.2527, + "step": 32252 + }, + { + "epoch": 1.58, + "grad_norm": 0.6571494340896606, + "learning_rate": 0.00027469566994979945, + "loss": 3.0866, + "step": 32253 + }, + { + "epoch": 1.58, + "grad_norm": 0.6341148614883423, + "learning_rate": 0.00027468032865892936, + "loss": 2.8808, + "step": 32254 + }, + { + "epoch": 1.58, + "grad_norm": 0.5878071188926697, + "learning_rate": 0.0002746649874347465, + "loss": 2.9232, + "step": 32255 + }, + { + "epoch": 1.58, + "grad_norm": 0.6338520646095276, + "learning_rate": 0.00027464964627729103, + "loss": 3.1164, + "step": 32256 + }, + { + "epoch": 1.58, + "grad_norm": 0.5606010556221008, + "learning_rate": 0.00027463430518660364, + "loss": 3.1759, + "step": 32257 + }, + { + "epoch": 1.58, + "grad_norm": 0.6088926196098328, + "learning_rate": 0.00027461896416272456, + "loss": 3.0883, + "step": 32258 + }, + { + "epoch": 1.58, + "grad_norm": 0.580753743648529, + "learning_rate": 0.00027460362320569414, + "loss": 2.9957, + "step": 32259 + }, + { + "epoch": 1.58, + "grad_norm": 0.5840525031089783, + "learning_rate": 0.0002745882823155531, + "loss": 3.1201, + "step": 32260 + }, + { + "epoch": 1.58, + "grad_norm": 0.5990431904792786, + "learning_rate": 0.0002745729414923415, + "loss": 3.1314, + "step": 32261 + }, + { + "epoch": 1.58, + "grad_norm": 0.6257085204124451, + "learning_rate": 0.00027455760073609995, + "loss": 3.0842, + "step": 32262 + }, + { + "epoch": 1.58, + "grad_norm": 0.5750504732131958, + "learning_rate": 0.00027454226004686866, + "loss": 3.148, + "step": 32263 + }, + { + "epoch": 1.58, + "grad_norm": 0.6597176790237427, + "learning_rate": 0.0002745269194246882, + "loss": 3.263, + "step": 32264 + }, + { + "epoch": 1.58, + "grad_norm": 0.5871362686157227, + "learning_rate": 0.000274511578869599, + "loss": 3.0359, + "step": 32265 + }, + { + "epoch": 1.58, + "grad_norm": 0.5987416505813599, + "learning_rate": 0.00027449623838164135, + "loss": 2.9311, + "step": 32266 + }, + { + "epoch": 1.58, + "grad_norm": 0.6555812358856201, + "learning_rate": 0.00027448089796085575, + "loss": 3.0914, + "step": 32267 + }, + { + "epoch": 1.58, + "grad_norm": 0.5952374339103699, + "learning_rate": 0.00027446555760728255, + "loss": 2.9107, + "step": 32268 + }, + { + "epoch": 1.58, + "grad_norm": 0.5956671237945557, + "learning_rate": 0.00027445021732096204, + "loss": 3.183, + "step": 32269 + }, + { + "epoch": 1.58, + "grad_norm": 0.5921088457107544, + "learning_rate": 0.00027443487710193486, + "loss": 3.1328, + "step": 32270 + }, + { + "epoch": 1.58, + "grad_norm": 0.5908499360084534, + "learning_rate": 0.0002744195369502412, + "loss": 3.1227, + "step": 32271 + }, + { + "epoch": 1.58, + "grad_norm": 0.6202715635299683, + "learning_rate": 0.0002744041968659217, + "loss": 2.9904, + "step": 32272 + }, + { + "epoch": 1.58, + "grad_norm": 0.7672480344772339, + "learning_rate": 0.00027438885684901647, + "loss": 2.8866, + "step": 32273 + }, + { + "epoch": 1.58, + "grad_norm": 0.6302787065505981, + "learning_rate": 0.00027437351689956613, + "loss": 3.0427, + "step": 32274 + }, + { + "epoch": 1.58, + "grad_norm": 0.5923793911933899, + "learning_rate": 0.000274358177017611, + "loss": 2.9583, + "step": 32275 + }, + { + "epoch": 1.58, + "grad_norm": 0.584662139415741, + "learning_rate": 0.00027434283720319155, + "loss": 2.9949, + "step": 32276 + }, + { + "epoch": 1.58, + "grad_norm": 0.5973477959632874, + "learning_rate": 0.00027432749745634813, + "loss": 3.0155, + "step": 32277 + }, + { + "epoch": 1.58, + "grad_norm": 0.5955763459205627, + "learning_rate": 0.00027431215777712104, + "loss": 3.1771, + "step": 32278 + }, + { + "epoch": 1.58, + "grad_norm": 0.5979658365249634, + "learning_rate": 0.0002742968181655508, + "loss": 3.0354, + "step": 32279 + }, + { + "epoch": 1.58, + "grad_norm": 0.6078369617462158, + "learning_rate": 0.0002742814786216779, + "loss": 3.127, + "step": 32280 + }, + { + "epoch": 1.58, + "grad_norm": 0.6542069315910339, + "learning_rate": 0.00027426613914554253, + "loss": 2.921, + "step": 32281 + }, + { + "epoch": 1.58, + "grad_norm": 0.6545595526695251, + "learning_rate": 0.0002742507997371853, + "loss": 3.0068, + "step": 32282 + }, + { + "epoch": 1.58, + "grad_norm": 0.5855773091316223, + "learning_rate": 0.00027423546039664643, + "loss": 3.1546, + "step": 32283 + }, + { + "epoch": 1.58, + "grad_norm": 0.6614902019500732, + "learning_rate": 0.0002742201211239663, + "loss": 3.1364, + "step": 32284 + }, + { + "epoch": 1.58, + "grad_norm": 0.6412147879600525, + "learning_rate": 0.0002742047819191856, + "loss": 2.7693, + "step": 32285 + }, + { + "epoch": 1.58, + "grad_norm": 0.6186434030532837, + "learning_rate": 0.00027418944278234447, + "loss": 2.9929, + "step": 32286 + }, + { + "epoch": 1.58, + "grad_norm": 0.6143002510070801, + "learning_rate": 0.0002741741037134834, + "loss": 2.8651, + "step": 32287 + }, + { + "epoch": 1.58, + "grad_norm": 0.6143453121185303, + "learning_rate": 0.0002741587647126427, + "loss": 3.2461, + "step": 32288 + }, + { + "epoch": 1.58, + "grad_norm": 0.6308348774909973, + "learning_rate": 0.00027414342577986283, + "loss": 2.9028, + "step": 32289 + }, + { + "epoch": 1.58, + "grad_norm": 0.588534414768219, + "learning_rate": 0.00027412808691518436, + "loss": 3.1194, + "step": 32290 + }, + { + "epoch": 1.58, + "grad_norm": 0.6186642646789551, + "learning_rate": 0.00027411274811864735, + "loss": 3.0384, + "step": 32291 + }, + { + "epoch": 1.58, + "grad_norm": 0.5939307808876038, + "learning_rate": 0.0002740974093902925, + "loss": 3.1793, + "step": 32292 + }, + { + "epoch": 1.58, + "grad_norm": 0.6168650984764099, + "learning_rate": 0.00027408207073016, + "loss": 3.1562, + "step": 32293 + }, + { + "epoch": 1.58, + "grad_norm": 0.6111552119255066, + "learning_rate": 0.00027406673213829037, + "loss": 3.2366, + "step": 32294 + }, + { + "epoch": 1.58, + "grad_norm": 0.612614631652832, + "learning_rate": 0.000274051393614724, + "loss": 2.9828, + "step": 32295 + }, + { + "epoch": 1.58, + "grad_norm": 0.5782878398895264, + "learning_rate": 0.0002740360551595012, + "loss": 3.1741, + "step": 32296 + }, + { + "epoch": 1.58, + "grad_norm": 0.6033691167831421, + "learning_rate": 0.0002740207167726626, + "loss": 2.99, + "step": 32297 + }, + { + "epoch": 1.58, + "grad_norm": 0.5962322950363159, + "learning_rate": 0.00027400537845424826, + "loss": 3.293, + "step": 32298 + }, + { + "epoch": 1.58, + "grad_norm": 0.6039497256278992, + "learning_rate": 0.0002739900402042987, + "loss": 3.2375, + "step": 32299 + }, + { + "epoch": 1.58, + "grad_norm": 0.6100921630859375, + "learning_rate": 0.0002739747020228546, + "loss": 3.096, + "step": 32300 + }, + { + "epoch": 1.58, + "grad_norm": 0.5819284319877625, + "learning_rate": 0.000273959363909956, + "loss": 3.0614, + "step": 32301 + }, + { + "epoch": 1.58, + "grad_norm": 0.6027130484580994, + "learning_rate": 0.0002739440258656435, + "loss": 2.9864, + "step": 32302 + }, + { + "epoch": 1.58, + "grad_norm": 0.5925067663192749, + "learning_rate": 0.00027392868788995723, + "loss": 3.2436, + "step": 32303 + }, + { + "epoch": 1.58, + "grad_norm": 0.6477892398834229, + "learning_rate": 0.0002739133499829379, + "loss": 3.0252, + "step": 32304 + }, + { + "epoch": 1.58, + "grad_norm": 0.6437166929244995, + "learning_rate": 0.00027389801214462587, + "loss": 3.0984, + "step": 32305 + }, + { + "epoch": 1.58, + "grad_norm": 0.60708087682724, + "learning_rate": 0.00027388267437506125, + "loss": 2.945, + "step": 32306 + }, + { + "epoch": 1.58, + "grad_norm": 0.6095942258834839, + "learning_rate": 0.0002738673366742848, + "loss": 3.1028, + "step": 32307 + }, + { + "epoch": 1.58, + "grad_norm": 0.5710780024528503, + "learning_rate": 0.00027385199904233674, + "loss": 2.9135, + "step": 32308 + }, + { + "epoch": 1.58, + "grad_norm": 0.5993046164512634, + "learning_rate": 0.0002738366614792574, + "loss": 2.9611, + "step": 32309 + }, + { + "epoch": 1.58, + "grad_norm": 0.6444818377494812, + "learning_rate": 0.00027382132398508734, + "loss": 3.0658, + "step": 32310 + }, + { + "epoch": 1.58, + "grad_norm": 0.6341570615768433, + "learning_rate": 0.0002738059865598668, + "loss": 3.1248, + "step": 32311 + }, + { + "epoch": 1.58, + "grad_norm": 0.6394447684288025, + "learning_rate": 0.00027379064920363637, + "loss": 2.9208, + "step": 32312 + }, + { + "epoch": 1.58, + "grad_norm": 0.613353431224823, + "learning_rate": 0.00027377531191643623, + "loss": 3.16, + "step": 32313 + }, + { + "epoch": 1.58, + "grad_norm": 0.5999470949172974, + "learning_rate": 0.0002737599746983068, + "loss": 2.9717, + "step": 32314 + }, + { + "epoch": 1.58, + "grad_norm": 0.6254415512084961, + "learning_rate": 0.00027374463754928867, + "loss": 3.171, + "step": 32315 + }, + { + "epoch": 1.58, + "grad_norm": 0.6086492538452148, + "learning_rate": 0.0002737293004694221, + "loss": 3.0757, + "step": 32316 + }, + { + "epoch": 1.58, + "grad_norm": 0.6170065402984619, + "learning_rate": 0.00027371396345874754, + "loss": 2.8206, + "step": 32317 + }, + { + "epoch": 1.58, + "grad_norm": 0.6064412593841553, + "learning_rate": 0.0002736986265173052, + "loss": 3.0026, + "step": 32318 + }, + { + "epoch": 1.58, + "grad_norm": 0.6228976249694824, + "learning_rate": 0.0002736832896451357, + "loss": 3.1017, + "step": 32319 + }, + { + "epoch": 1.58, + "grad_norm": 0.6091843247413635, + "learning_rate": 0.0002736679528422794, + "loss": 2.9123, + "step": 32320 + }, + { + "epoch": 1.58, + "grad_norm": 0.5790774822235107, + "learning_rate": 0.0002736526161087765, + "loss": 2.8931, + "step": 32321 + }, + { + "epoch": 1.58, + "grad_norm": 0.6052638292312622, + "learning_rate": 0.00027363727944466773, + "loss": 3.2133, + "step": 32322 + }, + { + "epoch": 1.58, + "grad_norm": 0.6797242164611816, + "learning_rate": 0.00027362194284999316, + "loss": 3.1776, + "step": 32323 + }, + { + "epoch": 1.58, + "grad_norm": 0.5846360921859741, + "learning_rate": 0.00027360660632479346, + "loss": 2.9818, + "step": 32324 + }, + { + "epoch": 1.58, + "grad_norm": 0.5824291706085205, + "learning_rate": 0.00027359126986910867, + "loss": 2.9715, + "step": 32325 + }, + { + "epoch": 1.58, + "grad_norm": 0.5969865322113037, + "learning_rate": 0.00027357593348297947, + "loss": 3.0331, + "step": 32326 + }, + { + "epoch": 1.58, + "grad_norm": 0.6006813049316406, + "learning_rate": 0.00027356059716644623, + "loss": 3.1214, + "step": 32327 + }, + { + "epoch": 1.58, + "grad_norm": 0.6340086460113525, + "learning_rate": 0.0002735452609195492, + "loss": 3.1599, + "step": 32328 + }, + { + "epoch": 1.58, + "grad_norm": 0.610417902469635, + "learning_rate": 0.00027352992474232894, + "loss": 3.0844, + "step": 32329 + }, + { + "epoch": 1.58, + "grad_norm": 0.6211175322532654, + "learning_rate": 0.00027351458863482576, + "loss": 2.9915, + "step": 32330 + }, + { + "epoch": 1.58, + "grad_norm": 0.6344559192657471, + "learning_rate": 0.00027349925259708, + "loss": 2.8198, + "step": 32331 + }, + { + "epoch": 1.58, + "grad_norm": 0.5910095572471619, + "learning_rate": 0.00027348391662913217, + "loss": 3.0135, + "step": 32332 + }, + { + "epoch": 1.58, + "grad_norm": 0.6326538920402527, + "learning_rate": 0.0002734685807310225, + "loss": 3.0308, + "step": 32333 + }, + { + "epoch": 1.58, + "grad_norm": 0.6006748676300049, + "learning_rate": 0.0002734532449027916, + "loss": 2.9874, + "step": 32334 + }, + { + "epoch": 1.58, + "grad_norm": 0.6194648742675781, + "learning_rate": 0.0002734379091444797, + "loss": 3.0995, + "step": 32335 + }, + { + "epoch": 1.58, + "grad_norm": 0.6255232095718384, + "learning_rate": 0.00027342257345612715, + "loss": 2.9889, + "step": 32336 + }, + { + "epoch": 1.58, + "grad_norm": 0.6315654516220093, + "learning_rate": 0.0002734072378377746, + "loss": 3.1515, + "step": 32337 + }, + { + "epoch": 1.58, + "grad_norm": 0.6415723562240601, + "learning_rate": 0.0002733919022894622, + "loss": 2.9541, + "step": 32338 + }, + { + "epoch": 1.58, + "grad_norm": 0.6148661375045776, + "learning_rate": 0.0002733765668112304, + "loss": 3.0006, + "step": 32339 + }, + { + "epoch": 1.58, + "grad_norm": 0.7170533537864685, + "learning_rate": 0.00027336123140311955, + "loss": 2.9005, + "step": 32340 + }, + { + "epoch": 1.58, + "grad_norm": 0.5991891622543335, + "learning_rate": 0.0002733458960651701, + "loss": 2.9749, + "step": 32341 + }, + { + "epoch": 1.59, + "grad_norm": 0.5969253182411194, + "learning_rate": 0.00027333056079742257, + "loss": 2.8458, + "step": 32342 + }, + { + "epoch": 1.59, + "grad_norm": 0.6092698574066162, + "learning_rate": 0.000273315225599917, + "loss": 3.0701, + "step": 32343 + }, + { + "epoch": 1.59, + "grad_norm": 0.6403960585594177, + "learning_rate": 0.00027329989047269413, + "loss": 2.9416, + "step": 32344 + }, + { + "epoch": 1.59, + "grad_norm": 0.5866044759750366, + "learning_rate": 0.0002732845554157942, + "loss": 3.0471, + "step": 32345 + }, + { + "epoch": 1.59, + "grad_norm": 0.6154597997665405, + "learning_rate": 0.00027326922042925745, + "loss": 3.141, + "step": 32346 + }, + { + "epoch": 1.59, + "grad_norm": 0.6291300058364868, + "learning_rate": 0.0002732538855131246, + "loss": 2.9708, + "step": 32347 + }, + { + "epoch": 1.59, + "grad_norm": 0.615016520023346, + "learning_rate": 0.0002732385506674359, + "loss": 2.903, + "step": 32348 + }, + { + "epoch": 1.59, + "grad_norm": 0.6233264803886414, + "learning_rate": 0.0002732232158922316, + "loss": 3.0574, + "step": 32349 + }, + { + "epoch": 1.59, + "grad_norm": 0.5890025496482849, + "learning_rate": 0.0002732078811875522, + "loss": 3.0299, + "step": 32350 + }, + { + "epoch": 1.59, + "grad_norm": 0.5658774971961975, + "learning_rate": 0.00027319254655343804, + "loss": 2.972, + "step": 32351 + }, + { + "epoch": 1.59, + "grad_norm": 0.6112178564071655, + "learning_rate": 0.00027317721198992967, + "loss": 2.9405, + "step": 32352 + }, + { + "epoch": 1.59, + "grad_norm": 0.6809776425361633, + "learning_rate": 0.00027316187749706737, + "loss": 3.1299, + "step": 32353 + }, + { + "epoch": 1.59, + "grad_norm": 0.6142784953117371, + "learning_rate": 0.0002731465430748915, + "loss": 3.2225, + "step": 32354 + }, + { + "epoch": 1.59, + "grad_norm": 0.5717936158180237, + "learning_rate": 0.00027313120872344236, + "loss": 3.1395, + "step": 32355 + }, + { + "epoch": 1.59, + "grad_norm": 0.6221408843994141, + "learning_rate": 0.0002731158744427605, + "loss": 3.1424, + "step": 32356 + }, + { + "epoch": 1.59, + "grad_norm": 0.6313767433166504, + "learning_rate": 0.00027310054023288634, + "loss": 3.0057, + "step": 32357 + }, + { + "epoch": 1.59, + "grad_norm": 0.577363908290863, + "learning_rate": 0.00027308520609386003, + "loss": 3.0129, + "step": 32358 + }, + { + "epoch": 1.59, + "grad_norm": 0.6358572840690613, + "learning_rate": 0.00027306987202572227, + "loss": 3.0684, + "step": 32359 + }, + { + "epoch": 1.59, + "grad_norm": 0.6030645370483398, + "learning_rate": 0.0002730545380285132, + "loss": 3.105, + "step": 32360 + }, + { + "epoch": 1.59, + "grad_norm": 0.574252188205719, + "learning_rate": 0.0002730392041022732, + "loss": 3.2618, + "step": 32361 + }, + { + "epoch": 1.59, + "grad_norm": 0.5889049172401428, + "learning_rate": 0.0002730238702470429, + "loss": 3.1371, + "step": 32362 + }, + { + "epoch": 1.59, + "grad_norm": 0.5928773283958435, + "learning_rate": 0.00027300853646286245, + "loss": 2.8395, + "step": 32363 + }, + { + "epoch": 1.59, + "grad_norm": 0.6077459454536438, + "learning_rate": 0.00027299320274977245, + "loss": 2.9993, + "step": 32364 + }, + { + "epoch": 1.59, + "grad_norm": 0.6264618635177612, + "learning_rate": 0.000272977869107813, + "loss": 3.1145, + "step": 32365 + }, + { + "epoch": 1.59, + "grad_norm": 0.6416333913803101, + "learning_rate": 0.0002729625355370247, + "loss": 3.0723, + "step": 32366 + }, + { + "epoch": 1.59, + "grad_norm": 0.6093899607658386, + "learning_rate": 0.0002729472020374479, + "loss": 2.8618, + "step": 32367 + }, + { + "epoch": 1.59, + "grad_norm": 0.6002346277236938, + "learning_rate": 0.00027293186860912285, + "loss": 3.0874, + "step": 32368 + }, + { + "epoch": 1.59, + "grad_norm": 0.5929070115089417, + "learning_rate": 0.0002729165352520902, + "loss": 3.2543, + "step": 32369 + }, + { + "epoch": 1.59, + "grad_norm": 0.6260755062103271, + "learning_rate": 0.0002729012019663901, + "loss": 3.0492, + "step": 32370 + }, + { + "epoch": 1.59, + "grad_norm": 0.6383898258209229, + "learning_rate": 0.00027288586875206296, + "loss": 2.941, + "step": 32371 + }, + { + "epoch": 1.59, + "grad_norm": 0.6028842926025391, + "learning_rate": 0.0002728705356091494, + "loss": 3.1908, + "step": 32372 + }, + { + "epoch": 1.59, + "grad_norm": 0.6282774209976196, + "learning_rate": 0.00027285520253768944, + "loss": 3.0125, + "step": 32373 + }, + { + "epoch": 1.59, + "grad_norm": 0.5710699558258057, + "learning_rate": 0.00027283986953772376, + "loss": 3.0275, + "step": 32374 + }, + { + "epoch": 1.59, + "grad_norm": 0.6519010066986084, + "learning_rate": 0.00027282453660929266, + "loss": 2.6907, + "step": 32375 + }, + { + "epoch": 1.59, + "grad_norm": 0.5964446067810059, + "learning_rate": 0.0002728092037524364, + "loss": 3.0842, + "step": 32376 + }, + { + "epoch": 1.59, + "grad_norm": 0.564836323261261, + "learning_rate": 0.00027279387096719557, + "loss": 3.0647, + "step": 32377 + }, + { + "epoch": 1.59, + "grad_norm": 0.5880719423294067, + "learning_rate": 0.00027277853825361035, + "loss": 3.0947, + "step": 32378 + }, + { + "epoch": 1.59, + "grad_norm": 0.6163837909698486, + "learning_rate": 0.00027276320561172136, + "loss": 3.095, + "step": 32379 + }, + { + "epoch": 1.59, + "grad_norm": 0.6298701763153076, + "learning_rate": 0.0002727478730415687, + "loss": 2.9519, + "step": 32380 + }, + { + "epoch": 1.59, + "grad_norm": 0.6387132406234741, + "learning_rate": 0.000272732540543193, + "loss": 3.0831, + "step": 32381 + }, + { + "epoch": 1.59, + "grad_norm": 0.5862566232681274, + "learning_rate": 0.0002727172081166345, + "loss": 3.0494, + "step": 32382 + }, + { + "epoch": 1.59, + "grad_norm": 0.5777621269226074, + "learning_rate": 0.00027270187576193354, + "loss": 3.0085, + "step": 32383 + }, + { + "epoch": 1.59, + "grad_norm": 0.5978364944458008, + "learning_rate": 0.00027268654347913074, + "loss": 2.8669, + "step": 32384 + }, + { + "epoch": 1.59, + "grad_norm": 0.6363843679428101, + "learning_rate": 0.0002726712112682662, + "loss": 2.9042, + "step": 32385 + }, + { + "epoch": 1.59, + "grad_norm": 0.5844437479972839, + "learning_rate": 0.00027265587912938047, + "loss": 3.1127, + "step": 32386 + }, + { + "epoch": 1.59, + "grad_norm": 0.5706456899642944, + "learning_rate": 0.00027264054706251393, + "loss": 2.8488, + "step": 32387 + }, + { + "epoch": 1.59, + "grad_norm": 0.5758098363876343, + "learning_rate": 0.0002726252150677069, + "loss": 3.4247, + "step": 32388 + }, + { + "epoch": 1.59, + "grad_norm": 0.647951602935791, + "learning_rate": 0.00027260988314499983, + "loss": 3.069, + "step": 32389 + }, + { + "epoch": 1.59, + "grad_norm": 0.6163773536682129, + "learning_rate": 0.000272594551294433, + "loss": 3.0409, + "step": 32390 + }, + { + "epoch": 1.59, + "grad_norm": 0.5810932517051697, + "learning_rate": 0.00027257921951604673, + "loss": 3.0328, + "step": 32391 + }, + { + "epoch": 1.59, + "grad_norm": 0.6113194823265076, + "learning_rate": 0.0002725638878098818, + "loss": 3.0184, + "step": 32392 + }, + { + "epoch": 1.59, + "grad_norm": 0.6727303266525269, + "learning_rate": 0.00027254855617597806, + "loss": 3.1082, + "step": 32393 + }, + { + "epoch": 1.59, + "grad_norm": 0.5940728783607483, + "learning_rate": 0.00027253322461437635, + "loss": 3.158, + "step": 32394 + }, + { + "epoch": 1.59, + "grad_norm": 0.6131221652030945, + "learning_rate": 0.0002725178931251166, + "loss": 3.0268, + "step": 32395 + }, + { + "epoch": 1.59, + "grad_norm": 0.591697633266449, + "learning_rate": 0.0002725025617082396, + "loss": 3.1224, + "step": 32396 + }, + { + "epoch": 1.59, + "grad_norm": 0.5704348683357239, + "learning_rate": 0.0002724872303637855, + "loss": 2.9679, + "step": 32397 + }, + { + "epoch": 1.59, + "grad_norm": 0.6032789349555969, + "learning_rate": 0.0002724718990917947, + "loss": 2.959, + "step": 32398 + }, + { + "epoch": 1.59, + "grad_norm": 0.5963385701179504, + "learning_rate": 0.00027245656789230775, + "loss": 3.1237, + "step": 32399 + }, + { + "epoch": 1.59, + "grad_norm": 0.625329315662384, + "learning_rate": 0.00027244123676536486, + "loss": 2.9193, + "step": 32400 + }, + { + "epoch": 1.59, + "grad_norm": 0.5919100046157837, + "learning_rate": 0.0002724259057110063, + "loss": 3.141, + "step": 32401 + }, + { + "epoch": 1.59, + "grad_norm": 0.6092190146446228, + "learning_rate": 0.0002724105747292728, + "loss": 2.9224, + "step": 32402 + }, + { + "epoch": 1.59, + "grad_norm": 0.5872280597686768, + "learning_rate": 0.0002723952438202044, + "loss": 3.3013, + "step": 32403 + }, + { + "epoch": 1.59, + "grad_norm": 0.5837467908859253, + "learning_rate": 0.00027237991298384174, + "loss": 2.8526, + "step": 32404 + }, + { + "epoch": 1.59, + "grad_norm": 0.6181141138076782, + "learning_rate": 0.0002723645822202249, + "loss": 2.9058, + "step": 32405 + }, + { + "epoch": 1.59, + "grad_norm": 0.6013671159744263, + "learning_rate": 0.0002723492515293946, + "loss": 3.0388, + "step": 32406 + }, + { + "epoch": 1.59, + "grad_norm": 0.5945082306861877, + "learning_rate": 0.00027233392091139096, + "loss": 2.9561, + "step": 32407 + }, + { + "epoch": 1.59, + "grad_norm": 0.565525233745575, + "learning_rate": 0.00027231859036625444, + "loss": 3.0511, + "step": 32408 + }, + { + "epoch": 1.59, + "grad_norm": 0.6196669936180115, + "learning_rate": 0.0002723032598940255, + "loss": 3.0336, + "step": 32409 + }, + { + "epoch": 1.59, + "grad_norm": 0.6390196681022644, + "learning_rate": 0.0002722879294947444, + "loss": 2.7013, + "step": 32410 + }, + { + "epoch": 1.59, + "grad_norm": 0.6164066791534424, + "learning_rate": 0.0002722725991684516, + "loss": 3.1329, + "step": 32411 + }, + { + "epoch": 1.59, + "grad_norm": 0.570892870426178, + "learning_rate": 0.00027225726891518735, + "loss": 3.1695, + "step": 32412 + }, + { + "epoch": 1.59, + "grad_norm": 0.6367611289024353, + "learning_rate": 0.000272241938734992, + "loss": 3.1406, + "step": 32413 + }, + { + "epoch": 1.59, + "grad_norm": 0.6313506960868835, + "learning_rate": 0.0002722266086279063, + "loss": 3.0672, + "step": 32414 + }, + { + "epoch": 1.59, + "grad_norm": 0.5836746096611023, + "learning_rate": 0.0002722112785939703, + "loss": 3.2579, + "step": 32415 + }, + { + "epoch": 1.59, + "grad_norm": 0.5823087692260742, + "learning_rate": 0.00027219594863322447, + "loss": 2.9865, + "step": 32416 + }, + { + "epoch": 1.59, + "grad_norm": 0.6018955111503601, + "learning_rate": 0.00027218061874570903, + "loss": 2.9428, + "step": 32417 + }, + { + "epoch": 1.59, + "grad_norm": 0.6570754647254944, + "learning_rate": 0.0002721652889314645, + "loss": 3.081, + "step": 32418 + }, + { + "epoch": 1.59, + "grad_norm": 0.6093233227729797, + "learning_rate": 0.00027214995919053143, + "loss": 2.591, + "step": 32419 + }, + { + "epoch": 1.59, + "grad_norm": 0.5643780827522278, + "learning_rate": 0.00027213462952294977, + "loss": 3.0753, + "step": 32420 + }, + { + "epoch": 1.59, + "grad_norm": 0.5750892162322998, + "learning_rate": 0.0002721192999287603, + "loss": 2.9966, + "step": 32421 + }, + { + "epoch": 1.59, + "grad_norm": 0.6011072993278503, + "learning_rate": 0.00027210397040800313, + "loss": 3.0969, + "step": 32422 + }, + { + "epoch": 1.59, + "grad_norm": 0.5999905467033386, + "learning_rate": 0.0002720886409607187, + "loss": 3.0539, + "step": 32423 + }, + { + "epoch": 1.59, + "grad_norm": 0.6333760023117065, + "learning_rate": 0.0002720733115869476, + "loss": 3.1335, + "step": 32424 + }, + { + "epoch": 1.59, + "grad_norm": 0.6337881684303284, + "learning_rate": 0.0002720579822867299, + "loss": 2.828, + "step": 32425 + }, + { + "epoch": 1.59, + "grad_norm": 0.5925475358963013, + "learning_rate": 0.00027204265306010616, + "loss": 2.9094, + "step": 32426 + }, + { + "epoch": 1.59, + "grad_norm": 0.6454971432685852, + "learning_rate": 0.00027202732390711656, + "loss": 3.0657, + "step": 32427 + }, + { + "epoch": 1.59, + "grad_norm": 0.6801785826683044, + "learning_rate": 0.0002720119948278017, + "loss": 2.9696, + "step": 32428 + }, + { + "epoch": 1.59, + "grad_norm": 0.6122978329658508, + "learning_rate": 0.0002719966658222019, + "loss": 2.901, + "step": 32429 + }, + { + "epoch": 1.59, + "grad_norm": 0.6025977730751038, + "learning_rate": 0.0002719813368903573, + "loss": 3.0501, + "step": 32430 + }, + { + "epoch": 1.59, + "grad_norm": 0.5820662975311279, + "learning_rate": 0.0002719660080323087, + "loss": 3.1117, + "step": 32431 + }, + { + "epoch": 1.59, + "grad_norm": 0.6449814438819885, + "learning_rate": 0.00027195067924809616, + "loss": 3.0745, + "step": 32432 + }, + { + "epoch": 1.59, + "grad_norm": 0.6674616932868958, + "learning_rate": 0.00027193535053776006, + "loss": 2.9594, + "step": 32433 + }, + { + "epoch": 1.59, + "grad_norm": 0.6257567405700684, + "learning_rate": 0.00027192002190134097, + "loss": 3.1974, + "step": 32434 + }, + { + "epoch": 1.59, + "grad_norm": 0.6641944050788879, + "learning_rate": 0.000271904693338879, + "loss": 3.2397, + "step": 32435 + }, + { + "epoch": 1.59, + "grad_norm": 0.6264591813087463, + "learning_rate": 0.0002718893648504148, + "loss": 3.1784, + "step": 32436 + }, + { + "epoch": 1.59, + "grad_norm": 0.6466692686080933, + "learning_rate": 0.00027187403643598847, + "loss": 2.7668, + "step": 32437 + }, + { + "epoch": 1.59, + "grad_norm": 0.6122263073921204, + "learning_rate": 0.00027185870809564054, + "loss": 2.9252, + "step": 32438 + }, + { + "epoch": 1.59, + "grad_norm": 0.5788053274154663, + "learning_rate": 0.00027184337982941143, + "loss": 2.9313, + "step": 32439 + }, + { + "epoch": 1.59, + "grad_norm": 0.6403984427452087, + "learning_rate": 0.00027182805163734143, + "loss": 3.004, + "step": 32440 + }, + { + "epoch": 1.59, + "grad_norm": 0.6190902590751648, + "learning_rate": 0.00027181272351947097, + "loss": 3.2076, + "step": 32441 + }, + { + "epoch": 1.59, + "grad_norm": 0.6131787300109863, + "learning_rate": 0.0002717973954758402, + "loss": 3.0188, + "step": 32442 + }, + { + "epoch": 1.59, + "grad_norm": 0.574184000492096, + "learning_rate": 0.00027178206750648975, + "loss": 3.0804, + "step": 32443 + }, + { + "epoch": 1.59, + "grad_norm": 0.6264970302581787, + "learning_rate": 0.00027176673961145997, + "loss": 2.8913, + "step": 32444 + }, + { + "epoch": 1.59, + "grad_norm": 0.613540768623352, + "learning_rate": 0.000271751411790791, + "loss": 3.0355, + "step": 32445 + }, + { + "epoch": 1.59, + "grad_norm": 0.5924250483512878, + "learning_rate": 0.0002717360840445236, + "loss": 2.7862, + "step": 32446 + }, + { + "epoch": 1.59, + "grad_norm": 0.5855326652526855, + "learning_rate": 0.00027172075637269774, + "loss": 2.8646, + "step": 32447 + }, + { + "epoch": 1.59, + "grad_norm": 0.6014474034309387, + "learning_rate": 0.000271705428775354, + "loss": 3.0086, + "step": 32448 + }, + { + "epoch": 1.59, + "grad_norm": 0.6462697386741638, + "learning_rate": 0.0002716901012525328, + "loss": 3.0801, + "step": 32449 + }, + { + "epoch": 1.59, + "grad_norm": 0.582455039024353, + "learning_rate": 0.00027167477380427436, + "loss": 3.0355, + "step": 32450 + }, + { + "epoch": 1.59, + "grad_norm": 0.5739898085594177, + "learning_rate": 0.00027165944643061914, + "loss": 3.1304, + "step": 32451 + }, + { + "epoch": 1.59, + "grad_norm": 0.6203562617301941, + "learning_rate": 0.0002716441191316074, + "loss": 3.2051, + "step": 32452 + }, + { + "epoch": 1.59, + "grad_norm": 0.6219992637634277, + "learning_rate": 0.0002716287919072796, + "loss": 2.9347, + "step": 32453 + }, + { + "epoch": 1.59, + "grad_norm": 0.6303226351737976, + "learning_rate": 0.0002716134647576763, + "loss": 3.0828, + "step": 32454 + }, + { + "epoch": 1.59, + "grad_norm": 0.6271916627883911, + "learning_rate": 0.0002715981376828374, + "loss": 3.0765, + "step": 32455 + }, + { + "epoch": 1.59, + "grad_norm": 0.6190890073776245, + "learning_rate": 0.00027158281068280374, + "loss": 2.999, + "step": 32456 + }, + { + "epoch": 1.59, + "grad_norm": 0.6210778951644897, + "learning_rate": 0.00027156748375761536, + "loss": 3.0662, + "step": 32457 + }, + { + "epoch": 1.59, + "grad_norm": 0.5740706324577332, + "learning_rate": 0.0002715521569073128, + "loss": 2.9903, + "step": 32458 + }, + { + "epoch": 1.59, + "grad_norm": 0.5983421802520752, + "learning_rate": 0.00027153683013193645, + "loss": 2.9476, + "step": 32459 + }, + { + "epoch": 1.59, + "grad_norm": 0.5819885730743408, + "learning_rate": 0.0002715215034315265, + "loss": 3.1873, + "step": 32460 + }, + { + "epoch": 1.59, + "grad_norm": 0.6093148589134216, + "learning_rate": 0.00027150617680612353, + "loss": 2.9995, + "step": 32461 + }, + { + "epoch": 1.59, + "grad_norm": 0.607557475566864, + "learning_rate": 0.00027149085025576775, + "loss": 3.0563, + "step": 32462 + }, + { + "epoch": 1.59, + "grad_norm": 0.6053733825683594, + "learning_rate": 0.0002714755237804995, + "loss": 3.2042, + "step": 32463 + }, + { + "epoch": 1.59, + "grad_norm": 0.6200517416000366, + "learning_rate": 0.0002714601973803594, + "loss": 3.1293, + "step": 32464 + }, + { + "epoch": 1.59, + "grad_norm": 0.6223552823066711, + "learning_rate": 0.00027144487105538757, + "loss": 2.9668, + "step": 32465 + }, + { + "epoch": 1.59, + "grad_norm": 0.6040474772453308, + "learning_rate": 0.00027142954480562456, + "loss": 2.9209, + "step": 32466 + }, + { + "epoch": 1.59, + "grad_norm": 0.5464975237846375, + "learning_rate": 0.00027141421863111044, + "loss": 3.1782, + "step": 32467 + }, + { + "epoch": 1.59, + "grad_norm": 0.6203432679176331, + "learning_rate": 0.00027139889253188586, + "loss": 2.926, + "step": 32468 + }, + { + "epoch": 1.59, + "grad_norm": 0.5865224599838257, + "learning_rate": 0.0002713835665079911, + "loss": 3.2632, + "step": 32469 + }, + { + "epoch": 1.59, + "grad_norm": 0.577903687953949, + "learning_rate": 0.00027136824055946647, + "loss": 3.0893, + "step": 32470 + }, + { + "epoch": 1.59, + "grad_norm": 0.6584243178367615, + "learning_rate": 0.00027135291468635247, + "loss": 2.8712, + "step": 32471 + }, + { + "epoch": 1.59, + "grad_norm": 1.597705364227295, + "learning_rate": 0.00027133758888868935, + "loss": 3.0972, + "step": 32472 + }, + { + "epoch": 1.59, + "grad_norm": 0.5774487853050232, + "learning_rate": 0.0002713222631665174, + "loss": 2.9983, + "step": 32473 + }, + { + "epoch": 1.59, + "grad_norm": 0.6193967461585999, + "learning_rate": 0.00027130693751987724, + "loss": 3.0678, + "step": 32474 + }, + { + "epoch": 1.59, + "grad_norm": 0.692612886428833, + "learning_rate": 0.0002712916119488089, + "loss": 3.0259, + "step": 32475 + }, + { + "epoch": 1.59, + "grad_norm": 0.611100971698761, + "learning_rate": 0.0002712762864533531, + "loss": 2.943, + "step": 32476 + }, + { + "epoch": 1.59, + "grad_norm": 0.6383076310157776, + "learning_rate": 0.00027126096103355, + "loss": 3.1925, + "step": 32477 + }, + { + "epoch": 1.59, + "grad_norm": 0.5976969003677368, + "learning_rate": 0.0002712456356894399, + "loss": 2.983, + "step": 32478 + }, + { + "epoch": 1.59, + "grad_norm": 0.6111800670623779, + "learning_rate": 0.0002712303104210634, + "loss": 3.1147, + "step": 32479 + }, + { + "epoch": 1.59, + "grad_norm": 0.5945995450019836, + "learning_rate": 0.00027121498522846065, + "loss": 3.1164, + "step": 32480 + }, + { + "epoch": 1.59, + "grad_norm": 0.5979011654853821, + "learning_rate": 0.00027119966011167215, + "loss": 3.0233, + "step": 32481 + }, + { + "epoch": 1.59, + "grad_norm": 0.6658178567886353, + "learning_rate": 0.0002711843350707381, + "loss": 3.0965, + "step": 32482 + }, + { + "epoch": 1.59, + "grad_norm": 0.5945828557014465, + "learning_rate": 0.0002711690101056991, + "loss": 3.0912, + "step": 32483 + }, + { + "epoch": 1.59, + "grad_norm": 0.6201061606407166, + "learning_rate": 0.00027115368521659527, + "loss": 2.9176, + "step": 32484 + }, + { + "epoch": 1.59, + "grad_norm": 0.6284053325653076, + "learning_rate": 0.00027113836040346703, + "loss": 2.9548, + "step": 32485 + }, + { + "epoch": 1.59, + "grad_norm": 0.5955159068107605, + "learning_rate": 0.000271123035666355, + "loss": 3.2301, + "step": 32486 + }, + { + "epoch": 1.59, + "grad_norm": 0.6049878597259521, + "learning_rate": 0.00027110771100529916, + "loss": 2.9669, + "step": 32487 + }, + { + "epoch": 1.59, + "grad_norm": 0.5916286706924438, + "learning_rate": 0.0002710923864203402, + "loss": 3.0258, + "step": 32488 + }, + { + "epoch": 1.59, + "grad_norm": 0.5768219232559204, + "learning_rate": 0.0002710770619115181, + "loss": 3.0384, + "step": 32489 + }, + { + "epoch": 1.59, + "grad_norm": 0.6287925839424133, + "learning_rate": 0.0002710617374788736, + "loss": 3.0563, + "step": 32490 + }, + { + "epoch": 1.59, + "grad_norm": 0.659156858921051, + "learning_rate": 0.00027104641312244694, + "loss": 3.0919, + "step": 32491 + }, + { + "epoch": 1.59, + "grad_norm": 0.5746557712554932, + "learning_rate": 0.0002710310888422784, + "loss": 3.063, + "step": 32492 + }, + { + "epoch": 1.59, + "grad_norm": 0.613263726234436, + "learning_rate": 0.00027101576463840846, + "loss": 2.9219, + "step": 32493 + }, + { + "epoch": 1.59, + "grad_norm": 0.610640287399292, + "learning_rate": 0.00027100044051087737, + "loss": 3.182, + "step": 32494 + }, + { + "epoch": 1.59, + "grad_norm": 0.6279901266098022, + "learning_rate": 0.00027098511645972544, + "loss": 2.7512, + "step": 32495 + }, + { + "epoch": 1.59, + "grad_norm": 0.6687607169151306, + "learning_rate": 0.0002709697924849933, + "loss": 3.1327, + "step": 32496 + }, + { + "epoch": 1.59, + "grad_norm": 0.6255040764808655, + "learning_rate": 0.000270954468586721, + "loss": 2.8758, + "step": 32497 + }, + { + "epoch": 1.59, + "grad_norm": 0.5970877408981323, + "learning_rate": 0.00027093914476494915, + "loss": 3.0503, + "step": 32498 + }, + { + "epoch": 1.59, + "grad_norm": 0.5644693970680237, + "learning_rate": 0.00027092382101971793, + "loss": 2.8474, + "step": 32499 + }, + { + "epoch": 1.59, + "grad_norm": 0.591464638710022, + "learning_rate": 0.00027090849735106776, + "loss": 2.9457, + "step": 32500 + }, + { + "epoch": 1.59, + "grad_norm": 0.6773257255554199, + "learning_rate": 0.0002708931737590391, + "loss": 3.0178, + "step": 32501 + }, + { + "epoch": 1.59, + "grad_norm": 0.59250807762146, + "learning_rate": 0.00027087785024367217, + "loss": 3.1202, + "step": 32502 + }, + { + "epoch": 1.59, + "grad_norm": 0.6139160990715027, + "learning_rate": 0.0002708625268050074, + "loss": 2.8928, + "step": 32503 + }, + { + "epoch": 1.59, + "grad_norm": 0.6274259686470032, + "learning_rate": 0.00027084720344308503, + "loss": 3.168, + "step": 32504 + }, + { + "epoch": 1.59, + "grad_norm": 0.5896390080451965, + "learning_rate": 0.0002708318801579456, + "loss": 2.9981, + "step": 32505 + }, + { + "epoch": 1.59, + "grad_norm": 0.6155339479446411, + "learning_rate": 0.00027081655694962943, + "loss": 2.9269, + "step": 32506 + }, + { + "epoch": 1.59, + "grad_norm": 0.5823910236358643, + "learning_rate": 0.00027080123381817666, + "loss": 2.9131, + "step": 32507 + }, + { + "epoch": 1.59, + "grad_norm": 0.6796182990074158, + "learning_rate": 0.00027078591076362804, + "loss": 3.0235, + "step": 32508 + }, + { + "epoch": 1.59, + "grad_norm": 0.5854375958442688, + "learning_rate": 0.00027077058778602356, + "loss": 2.883, + "step": 32509 + }, + { + "epoch": 1.59, + "grad_norm": 0.5948371291160583, + "learning_rate": 0.0002707552648854037, + "loss": 2.9561, + "step": 32510 + }, + { + "epoch": 1.59, + "grad_norm": 0.6129918694496155, + "learning_rate": 0.00027073994206180897, + "loss": 3.2477, + "step": 32511 + }, + { + "epoch": 1.59, + "grad_norm": 0.6146805882453918, + "learning_rate": 0.0002707246193152796, + "loss": 3.0606, + "step": 32512 + }, + { + "epoch": 1.59, + "grad_norm": 0.5845103859901428, + "learning_rate": 0.00027070929664585594, + "loss": 3.1949, + "step": 32513 + }, + { + "epoch": 1.59, + "grad_norm": 0.606810450553894, + "learning_rate": 0.0002706939740535782, + "loss": 2.9231, + "step": 32514 + }, + { + "epoch": 1.59, + "grad_norm": 0.5757008194923401, + "learning_rate": 0.0002706786515384869, + "loss": 2.9504, + "step": 32515 + }, + { + "epoch": 1.59, + "grad_norm": 0.620858371257782, + "learning_rate": 0.0002706633291006226, + "loss": 2.9856, + "step": 32516 + }, + { + "epoch": 1.59, + "grad_norm": 0.6243740320205688, + "learning_rate": 0.00027064800674002533, + "loss": 3.1278, + "step": 32517 + }, + { + "epoch": 1.59, + "grad_norm": 0.5853642225265503, + "learning_rate": 0.00027063268445673557, + "loss": 3.1047, + "step": 32518 + }, + { + "epoch": 1.59, + "grad_norm": 0.6242586374282837, + "learning_rate": 0.0002706173622507936, + "loss": 3.009, + "step": 32519 + }, + { + "epoch": 1.59, + "grad_norm": 0.636432409286499, + "learning_rate": 0.0002706020401222399, + "loss": 2.9736, + "step": 32520 + }, + { + "epoch": 1.59, + "grad_norm": 0.6194183826446533, + "learning_rate": 0.00027058671807111484, + "loss": 3.1121, + "step": 32521 + }, + { + "epoch": 1.59, + "grad_norm": 0.6781464219093323, + "learning_rate": 0.00027057139609745854, + "loss": 2.7746, + "step": 32522 + }, + { + "epoch": 1.59, + "grad_norm": 0.5913337469100952, + "learning_rate": 0.0002705560742013117, + "loss": 3.0744, + "step": 32523 + }, + { + "epoch": 1.59, + "grad_norm": 0.5661170482635498, + "learning_rate": 0.0002705407523827144, + "loss": 3.1307, + "step": 32524 + }, + { + "epoch": 1.59, + "grad_norm": 0.6512724161148071, + "learning_rate": 0.000270525430641707, + "loss": 3.1466, + "step": 32525 + }, + { + "epoch": 1.59, + "grad_norm": 0.631696879863739, + "learning_rate": 0.0002705101089783301, + "loss": 3.0165, + "step": 32526 + }, + { + "epoch": 1.59, + "grad_norm": 0.6301171183586121, + "learning_rate": 0.00027049478739262384, + "loss": 2.9184, + "step": 32527 + }, + { + "epoch": 1.59, + "grad_norm": 0.6100221276283264, + "learning_rate": 0.00027047946588462874, + "loss": 3.1662, + "step": 32528 + }, + { + "epoch": 1.59, + "grad_norm": 0.6037270426750183, + "learning_rate": 0.00027046414445438486, + "loss": 2.9379, + "step": 32529 + }, + { + "epoch": 1.59, + "grad_norm": 0.6437107920646667, + "learning_rate": 0.0002704488231019328, + "loss": 3.0219, + "step": 32530 + }, + { + "epoch": 1.59, + "grad_norm": 0.5775290131568909, + "learning_rate": 0.00027043350182731293, + "loss": 3.1123, + "step": 32531 + }, + { + "epoch": 1.59, + "grad_norm": 0.5918175578117371, + "learning_rate": 0.0002704181806305654, + "loss": 3.0319, + "step": 32532 + }, + { + "epoch": 1.59, + "grad_norm": 0.567966103553772, + "learning_rate": 0.0002704028595117308, + "loss": 3.1738, + "step": 32533 + }, + { + "epoch": 1.59, + "grad_norm": 0.6273672580718994, + "learning_rate": 0.0002703875384708493, + "loss": 3.2175, + "step": 32534 + }, + { + "epoch": 1.59, + "grad_norm": 0.6246945858001709, + "learning_rate": 0.0002703722175079613, + "loss": 2.9896, + "step": 32535 + }, + { + "epoch": 1.59, + "grad_norm": 0.5851449370384216, + "learning_rate": 0.0002703568966231073, + "loss": 3.0777, + "step": 32536 + }, + { + "epoch": 1.59, + "grad_norm": 0.6178532838821411, + "learning_rate": 0.00027034157581632735, + "loss": 3.3369, + "step": 32537 + }, + { + "epoch": 1.59, + "grad_norm": 0.626719057559967, + "learning_rate": 0.0002703262550876622, + "loss": 3.2271, + "step": 32538 + }, + { + "epoch": 1.59, + "grad_norm": 0.5996720194816589, + "learning_rate": 0.00027031093443715185, + "loss": 3.2784, + "step": 32539 + }, + { + "epoch": 1.59, + "grad_norm": 0.5899909138679504, + "learning_rate": 0.0002702956138648367, + "loss": 3.2182, + "step": 32540 + }, + { + "epoch": 1.59, + "grad_norm": 0.5896705985069275, + "learning_rate": 0.0002702802933707574, + "loss": 2.862, + "step": 32541 + }, + { + "epoch": 1.59, + "grad_norm": 0.6266672015190125, + "learning_rate": 0.00027026497295495395, + "loss": 3.039, + "step": 32542 + }, + { + "epoch": 1.59, + "grad_norm": 0.615408718585968, + "learning_rate": 0.0002702496526174669, + "loss": 3.0288, + "step": 32543 + }, + { + "epoch": 1.59, + "grad_norm": 0.6363191604614258, + "learning_rate": 0.0002702343323583365, + "loss": 3.1345, + "step": 32544 + }, + { + "epoch": 1.59, + "grad_norm": 0.6553589701652527, + "learning_rate": 0.0002702190121776031, + "loss": 3.1664, + "step": 32545 + }, + { + "epoch": 1.6, + "grad_norm": 0.6160081624984741, + "learning_rate": 0.0002702036920753072, + "loss": 2.9866, + "step": 32546 + }, + { + "epoch": 1.6, + "grad_norm": 0.6299121975898743, + "learning_rate": 0.0002701883720514889, + "loss": 3.0313, + "step": 32547 + }, + { + "epoch": 1.6, + "grad_norm": 0.5947949290275574, + "learning_rate": 0.00027017305210618885, + "loss": 3.2526, + "step": 32548 + }, + { + "epoch": 1.6, + "grad_norm": 0.5934600830078125, + "learning_rate": 0.00027015773223944717, + "loss": 2.9821, + "step": 32549 + }, + { + "epoch": 1.6, + "grad_norm": 0.6083028316497803, + "learning_rate": 0.00027014241245130414, + "loss": 2.9853, + "step": 32550 + }, + { + "epoch": 1.6, + "grad_norm": 0.5787101984024048, + "learning_rate": 0.0002701270927418005, + "loss": 2.9115, + "step": 32551 + }, + { + "epoch": 1.6, + "grad_norm": 0.580090343952179, + "learning_rate": 0.0002701117731109762, + "loss": 3.02, + "step": 32552 + }, + { + "epoch": 1.6, + "grad_norm": 0.6392739415168762, + "learning_rate": 0.0002700964535588718, + "loss": 3.2771, + "step": 32553 + }, + { + "epoch": 1.6, + "grad_norm": 0.607865035533905, + "learning_rate": 0.0002700811340855275, + "loss": 2.8881, + "step": 32554 + }, + { + "epoch": 1.6, + "grad_norm": 0.6404851078987122, + "learning_rate": 0.0002700658146909837, + "loss": 3.1971, + "step": 32555 + }, + { + "epoch": 1.6, + "grad_norm": 0.6085619330406189, + "learning_rate": 0.00027005049537528103, + "loss": 3.0046, + "step": 32556 + }, + { + "epoch": 1.6, + "grad_norm": 0.5817078351974487, + "learning_rate": 0.00027003517613845935, + "loss": 2.956, + "step": 32557 + }, + { + "epoch": 1.6, + "grad_norm": 0.6575085520744324, + "learning_rate": 0.0002700198569805594, + "loss": 3.2202, + "step": 32558 + }, + { + "epoch": 1.6, + "grad_norm": 0.6530008912086487, + "learning_rate": 0.0002700045379016212, + "loss": 2.9829, + "step": 32559 + }, + { + "epoch": 1.6, + "grad_norm": 0.5922717452049255, + "learning_rate": 0.00026998921890168545, + "loss": 3.005, + "step": 32560 + }, + { + "epoch": 1.6, + "grad_norm": 0.5945848822593689, + "learning_rate": 0.00026997389998079227, + "loss": 2.9596, + "step": 32561 + }, + { + "epoch": 1.6, + "grad_norm": 0.5837536454200745, + "learning_rate": 0.0002699585811389819, + "loss": 2.9112, + "step": 32562 + }, + { + "epoch": 1.6, + "grad_norm": 0.6117374897003174, + "learning_rate": 0.0002699432623762951, + "loss": 2.9097, + "step": 32563 + }, + { + "epoch": 1.6, + "grad_norm": 0.5952165126800537, + "learning_rate": 0.00026992794369277187, + "loss": 3.0588, + "step": 32564 + }, + { + "epoch": 1.6, + "grad_norm": 0.6353278756141663, + "learning_rate": 0.00026991262508845265, + "loss": 3.1253, + "step": 32565 + }, + { + "epoch": 1.6, + "grad_norm": 0.6072468161582947, + "learning_rate": 0.0002698973065633777, + "loss": 2.8889, + "step": 32566 + }, + { + "epoch": 1.6, + "grad_norm": 0.6333851218223572, + "learning_rate": 0.0002698819881175875, + "loss": 3.0873, + "step": 32567 + }, + { + "epoch": 1.6, + "grad_norm": 0.5800299644470215, + "learning_rate": 0.0002698666697511224, + "loss": 3.1187, + "step": 32568 + }, + { + "epoch": 1.6, + "grad_norm": 0.6245841979980469, + "learning_rate": 0.00026985135146402264, + "loss": 3.1307, + "step": 32569 + }, + { + "epoch": 1.6, + "grad_norm": 0.6200968623161316, + "learning_rate": 0.0002698360332563287, + "loss": 3.0176, + "step": 32570 + }, + { + "epoch": 1.6, + "grad_norm": 0.5942978858947754, + "learning_rate": 0.00026982071512808076, + "loss": 3.2657, + "step": 32571 + }, + { + "epoch": 1.6, + "grad_norm": 0.6148001551628113, + "learning_rate": 0.0002698053970793192, + "loss": 2.9059, + "step": 32572 + }, + { + "epoch": 1.6, + "grad_norm": 0.6103355288505554, + "learning_rate": 0.00026979007911008454, + "loss": 3.1997, + "step": 32573 + }, + { + "epoch": 1.6, + "grad_norm": 0.5524942278862, + "learning_rate": 0.0002697747612204169, + "loss": 3.0229, + "step": 32574 + }, + { + "epoch": 1.6, + "grad_norm": 0.590096652507782, + "learning_rate": 0.00026975944341035683, + "loss": 3.2664, + "step": 32575 + }, + { + "epoch": 1.6, + "grad_norm": 0.6268036961555481, + "learning_rate": 0.0002697441256799444, + "loss": 2.976, + "step": 32576 + }, + { + "epoch": 1.6, + "grad_norm": 0.6254855394363403, + "learning_rate": 0.00026972880802922014, + "loss": 3.2555, + "step": 32577 + }, + { + "epoch": 1.6, + "grad_norm": 0.6256861090660095, + "learning_rate": 0.00026971349045822453, + "loss": 3.0236, + "step": 32578 + }, + { + "epoch": 1.6, + "grad_norm": 0.6017988324165344, + "learning_rate": 0.0002696981729669977, + "loss": 3.025, + "step": 32579 + }, + { + "epoch": 1.6, + "grad_norm": 0.6067514419555664, + "learning_rate": 0.00026968285555558003, + "loss": 3.0145, + "step": 32580 + }, + { + "epoch": 1.6, + "grad_norm": 0.6188127398490906, + "learning_rate": 0.0002696675382240118, + "loss": 3.2419, + "step": 32581 + }, + { + "epoch": 1.6, + "grad_norm": 0.7204647064208984, + "learning_rate": 0.00026965222097233353, + "loss": 3.1838, + "step": 32582 + }, + { + "epoch": 1.6, + "grad_norm": 0.6136758923530579, + "learning_rate": 0.00026963690380058555, + "loss": 2.9887, + "step": 32583 + }, + { + "epoch": 1.6, + "grad_norm": 0.6154049634933472, + "learning_rate": 0.00026962158670880795, + "loss": 3.1223, + "step": 32584 + }, + { + "epoch": 1.6, + "grad_norm": 0.6027714014053345, + "learning_rate": 0.00026960626969704137, + "loss": 3.0375, + "step": 32585 + }, + { + "epoch": 1.6, + "grad_norm": 0.5715473890304565, + "learning_rate": 0.00026959095276532594, + "loss": 2.9966, + "step": 32586 + }, + { + "epoch": 1.6, + "grad_norm": 0.6251134872436523, + "learning_rate": 0.0002695756359137021, + "loss": 2.9965, + "step": 32587 + }, + { + "epoch": 1.6, + "grad_norm": 0.6239544153213501, + "learning_rate": 0.00026956031914221023, + "loss": 3.2285, + "step": 32588 + }, + { + "epoch": 1.6, + "grad_norm": 0.6197227239608765, + "learning_rate": 0.0002695450024508906, + "loss": 2.9335, + "step": 32589 + }, + { + "epoch": 1.6, + "grad_norm": 0.6238695979118347, + "learning_rate": 0.00026952968583978363, + "loss": 3.1362, + "step": 32590 + }, + { + "epoch": 1.6, + "grad_norm": 0.6636531949043274, + "learning_rate": 0.00026951436930892946, + "loss": 2.9115, + "step": 32591 + }, + { + "epoch": 1.6, + "grad_norm": 0.5675901770591736, + "learning_rate": 0.0002694990528583687, + "loss": 2.8736, + "step": 32592 + }, + { + "epoch": 1.6, + "grad_norm": 0.6764063239097595, + "learning_rate": 0.00026948373648814157, + "loss": 3.0807, + "step": 32593 + }, + { + "epoch": 1.6, + "grad_norm": 0.6123232841491699, + "learning_rate": 0.00026946842019828825, + "loss": 2.9812, + "step": 32594 + }, + { + "epoch": 1.6, + "grad_norm": 0.7344564199447632, + "learning_rate": 0.00026945310398884944, + "loss": 2.9918, + "step": 32595 + }, + { + "epoch": 1.6, + "grad_norm": 0.6294760704040527, + "learning_rate": 0.00026943778785986516, + "loss": 3.0817, + "step": 32596 + }, + { + "epoch": 1.6, + "grad_norm": 0.6110149621963501, + "learning_rate": 0.0002694224718113758, + "loss": 3.1679, + "step": 32597 + }, + { + "epoch": 1.6, + "grad_norm": 0.6438841819763184, + "learning_rate": 0.00026940715584342187, + "loss": 3.1391, + "step": 32598 + }, + { + "epoch": 1.6, + "grad_norm": 0.6176292896270752, + "learning_rate": 0.00026939183995604353, + "loss": 2.913, + "step": 32599 + }, + { + "epoch": 1.6, + "grad_norm": 0.5883298516273499, + "learning_rate": 0.0002693765241492813, + "loss": 3.054, + "step": 32600 + }, + { + "epoch": 1.6, + "grad_norm": 0.6791853308677673, + "learning_rate": 0.0002693612084231754, + "loss": 3.0085, + "step": 32601 + }, + { + "epoch": 1.6, + "grad_norm": 0.6615046262741089, + "learning_rate": 0.00026934589277776604, + "loss": 3.0181, + "step": 32602 + }, + { + "epoch": 1.6, + "grad_norm": 0.6171573996543884, + "learning_rate": 0.00026933057721309386, + "loss": 2.9615, + "step": 32603 + }, + { + "epoch": 1.6, + "grad_norm": 0.6239962577819824, + "learning_rate": 0.0002693152617291989, + "loss": 2.9545, + "step": 32604 + }, + { + "epoch": 1.6, + "grad_norm": 0.61008620262146, + "learning_rate": 0.0002692999463261218, + "loss": 3.1183, + "step": 32605 + }, + { + "epoch": 1.6, + "grad_norm": 0.6085340976715088, + "learning_rate": 0.00026928463100390256, + "loss": 3.1372, + "step": 32606 + }, + { + "epoch": 1.6, + "grad_norm": 0.6348270177841187, + "learning_rate": 0.00026926931576258176, + "loss": 2.9547, + "step": 32607 + }, + { + "epoch": 1.6, + "grad_norm": 0.5983766913414001, + "learning_rate": 0.00026925400060219973, + "loss": 3.3116, + "step": 32608 + }, + { + "epoch": 1.6, + "grad_norm": 0.6149789690971375, + "learning_rate": 0.00026923868552279666, + "loss": 3.0122, + "step": 32609 + }, + { + "epoch": 1.6, + "grad_norm": 0.5611794590950012, + "learning_rate": 0.0002692233705244131, + "loss": 3.1514, + "step": 32610 + }, + { + "epoch": 1.6, + "grad_norm": 0.6104076504707336, + "learning_rate": 0.00026920805560708914, + "loss": 3.0857, + "step": 32611 + }, + { + "epoch": 1.6, + "grad_norm": 0.6021207571029663, + "learning_rate": 0.0002691927407708652, + "loss": 2.9064, + "step": 32612 + }, + { + "epoch": 1.6, + "grad_norm": 0.58101487159729, + "learning_rate": 0.00026917742601578173, + "loss": 3.0131, + "step": 32613 + }, + { + "epoch": 1.6, + "grad_norm": 0.5953050851821899, + "learning_rate": 0.00026916211134187895, + "loss": 2.9169, + "step": 32614 + }, + { + "epoch": 1.6, + "grad_norm": 0.6245440244674683, + "learning_rate": 0.0002691467967491973, + "loss": 3.0768, + "step": 32615 + }, + { + "epoch": 1.6, + "grad_norm": 0.5973414182662964, + "learning_rate": 0.000269131482237777, + "loss": 2.9519, + "step": 32616 + }, + { + "epoch": 1.6, + "grad_norm": 0.583087146282196, + "learning_rate": 0.0002691161678076583, + "loss": 3.0333, + "step": 32617 + }, + { + "epoch": 1.6, + "grad_norm": 0.6332690715789795, + "learning_rate": 0.000269100853458882, + "loss": 3.0362, + "step": 32618 + }, + { + "epoch": 1.6, + "grad_norm": 0.6058811545372009, + "learning_rate": 0.0002690855391914878, + "loss": 2.9249, + "step": 32619 + }, + { + "epoch": 1.6, + "grad_norm": 0.6081041693687439, + "learning_rate": 0.00026907022500551655, + "loss": 3.0316, + "step": 32620 + }, + { + "epoch": 1.6, + "grad_norm": 0.6076980829238892, + "learning_rate": 0.0002690549109010082, + "loss": 2.9194, + "step": 32621 + }, + { + "epoch": 1.6, + "grad_norm": 0.6190997362136841, + "learning_rate": 0.0002690395968780033, + "loss": 3.1308, + "step": 32622 + }, + { + "epoch": 1.6, + "grad_norm": 0.6132203936576843, + "learning_rate": 0.0002690242829365423, + "loss": 3.0724, + "step": 32623 + }, + { + "epoch": 1.6, + "grad_norm": 0.5711997151374817, + "learning_rate": 0.00026900896907666516, + "loss": 2.984, + "step": 32624 + }, + { + "epoch": 1.6, + "grad_norm": 0.6215646862983704, + "learning_rate": 0.00026899365529841257, + "loss": 3.1405, + "step": 32625 + }, + { + "epoch": 1.6, + "grad_norm": 0.5729935765266418, + "learning_rate": 0.0002689783416018247, + "loss": 3.1276, + "step": 32626 + }, + { + "epoch": 1.6, + "grad_norm": 0.5910037755966187, + "learning_rate": 0.0002689630279869418, + "loss": 2.9777, + "step": 32627 + }, + { + "epoch": 1.6, + "grad_norm": 0.6306336522102356, + "learning_rate": 0.00026894771445380445, + "loss": 2.9572, + "step": 32628 + }, + { + "epoch": 1.6, + "grad_norm": 0.6902565360069275, + "learning_rate": 0.0002689324010024528, + "loss": 2.8442, + "step": 32629 + }, + { + "epoch": 1.6, + "grad_norm": 0.5936403870582581, + "learning_rate": 0.0002689170876329273, + "loss": 3.0951, + "step": 32630 + }, + { + "epoch": 1.6, + "grad_norm": 0.6323724389076233, + "learning_rate": 0.0002689017743452681, + "loss": 2.9881, + "step": 32631 + }, + { + "epoch": 1.6, + "grad_norm": 0.5704877972602844, + "learning_rate": 0.0002688864611395156, + "loss": 2.9077, + "step": 32632 + }, + { + "epoch": 1.6, + "grad_norm": 0.643515944480896, + "learning_rate": 0.0002688711480157103, + "loss": 3.0715, + "step": 32633 + }, + { + "epoch": 1.6, + "grad_norm": 0.6297488212585449, + "learning_rate": 0.00026885583497389235, + "loss": 3.0259, + "step": 32634 + }, + { + "epoch": 1.6, + "grad_norm": 0.5987085103988647, + "learning_rate": 0.00026884052201410215, + "loss": 3.0065, + "step": 32635 + }, + { + "epoch": 1.6, + "grad_norm": 0.5853610634803772, + "learning_rate": 0.00026882520913638, + "loss": 2.9778, + "step": 32636 + }, + { + "epoch": 1.6, + "grad_norm": 0.5946998000144958, + "learning_rate": 0.0002688098963407664, + "loss": 2.9844, + "step": 32637 + }, + { + "epoch": 1.6, + "grad_norm": 0.6260942220687866, + "learning_rate": 0.00026879458362730126, + "loss": 2.9159, + "step": 32638 + }, + { + "epoch": 1.6, + "grad_norm": 0.7542493343353271, + "learning_rate": 0.00026877927099602523, + "loss": 3.1824, + "step": 32639 + }, + { + "epoch": 1.6, + "grad_norm": 0.5645418763160706, + "learning_rate": 0.0002687639584469787, + "loss": 2.9704, + "step": 32640 + }, + { + "epoch": 1.6, + "grad_norm": 0.6517764329910278, + "learning_rate": 0.0002687486459802019, + "loss": 3.099, + "step": 32641 + }, + { + "epoch": 1.6, + "grad_norm": 0.6776633262634277, + "learning_rate": 0.0002687333335957352, + "loss": 3.0951, + "step": 32642 + }, + { + "epoch": 1.6, + "grad_norm": 0.6206831336021423, + "learning_rate": 0.00026871802129361865, + "loss": 3.1086, + "step": 32643 + }, + { + "epoch": 1.6, + "grad_norm": 0.633886992931366, + "learning_rate": 0.00026870270907389303, + "loss": 3.0899, + "step": 32644 + }, + { + "epoch": 1.6, + "grad_norm": 0.6100537776947021, + "learning_rate": 0.0002686873969365984, + "loss": 2.9998, + "step": 32645 + }, + { + "epoch": 1.6, + "grad_norm": 0.588862419128418, + "learning_rate": 0.0002686720848817751, + "loss": 3.1437, + "step": 32646 + }, + { + "epoch": 1.6, + "grad_norm": 0.6336972713470459, + "learning_rate": 0.0002686567729094635, + "loss": 3.0714, + "step": 32647 + }, + { + "epoch": 1.6, + "grad_norm": 0.6299551129341125, + "learning_rate": 0.000268641461019704, + "loss": 3.0617, + "step": 32648 + }, + { + "epoch": 1.6, + "grad_norm": 0.5796019434928894, + "learning_rate": 0.0002686261492125367, + "loss": 2.8452, + "step": 32649 + }, + { + "epoch": 1.6, + "grad_norm": 0.6125264763832092, + "learning_rate": 0.00026861083748800225, + "loss": 3.037, + "step": 32650 + }, + { + "epoch": 1.6, + "grad_norm": 0.6127404570579529, + "learning_rate": 0.00026859552584614077, + "loss": 3.182, + "step": 32651 + }, + { + "epoch": 1.6, + "grad_norm": 0.6474936604499817, + "learning_rate": 0.00026858021428699264, + "loss": 3.2452, + "step": 32652 + }, + { + "epoch": 1.6, + "grad_norm": 0.5996273159980774, + "learning_rate": 0.0002685649028105981, + "loss": 3.1117, + "step": 32653 + }, + { + "epoch": 1.6, + "grad_norm": 0.6106273531913757, + "learning_rate": 0.0002685495914169976, + "loss": 3.2262, + "step": 32654 + }, + { + "epoch": 1.6, + "grad_norm": 0.6102750897407532, + "learning_rate": 0.00026853428010623146, + "loss": 3.1025, + "step": 32655 + }, + { + "epoch": 1.6, + "grad_norm": 0.6115225553512573, + "learning_rate": 0.0002685189688783399, + "loss": 3.0833, + "step": 32656 + }, + { + "epoch": 1.6, + "grad_norm": 0.6346925497055054, + "learning_rate": 0.0002685036577333634, + "loss": 3.0, + "step": 32657 + }, + { + "epoch": 1.6, + "grad_norm": 0.6187523603439331, + "learning_rate": 0.0002684883466713422, + "loss": 3.0215, + "step": 32658 + }, + { + "epoch": 1.6, + "grad_norm": 0.7155755758285522, + "learning_rate": 0.0002684730356923165, + "loss": 2.8853, + "step": 32659 + }, + { + "epoch": 1.6, + "grad_norm": 0.6200131177902222, + "learning_rate": 0.0002684577247963269, + "loss": 3.0541, + "step": 32660 + }, + { + "epoch": 1.6, + "grad_norm": 0.6308820843696594, + "learning_rate": 0.0002684424139834135, + "loss": 3.0811, + "step": 32661 + }, + { + "epoch": 1.6, + "grad_norm": 0.5879965424537659, + "learning_rate": 0.00026842710325361677, + "loss": 2.9076, + "step": 32662 + }, + { + "epoch": 1.6, + "grad_norm": 0.629178524017334, + "learning_rate": 0.00026841179260697693, + "loss": 2.9654, + "step": 32663 + }, + { + "epoch": 1.6, + "grad_norm": 0.621136486530304, + "learning_rate": 0.0002683964820435343, + "loss": 3.0303, + "step": 32664 + }, + { + "epoch": 1.6, + "grad_norm": 0.6378344297409058, + "learning_rate": 0.0002683811715633294, + "loss": 2.8025, + "step": 32665 + }, + { + "epoch": 1.6, + "grad_norm": 0.642932116985321, + "learning_rate": 0.0002683658611664023, + "loss": 2.7823, + "step": 32666 + }, + { + "epoch": 1.6, + "grad_norm": 0.6393907070159912, + "learning_rate": 0.0002683505508527935, + "loss": 3.1127, + "step": 32667 + }, + { + "epoch": 1.6, + "grad_norm": 0.6391983032226562, + "learning_rate": 0.0002683352406225432, + "loss": 2.9901, + "step": 32668 + }, + { + "epoch": 1.6, + "grad_norm": 0.6028316020965576, + "learning_rate": 0.00026831993047569177, + "loss": 3.0726, + "step": 32669 + }, + { + "epoch": 1.6, + "grad_norm": 0.655157208442688, + "learning_rate": 0.0002683046204122796, + "loss": 2.786, + "step": 32670 + }, + { + "epoch": 1.6, + "grad_norm": 0.61063551902771, + "learning_rate": 0.0002682893104323469, + "loss": 3.0121, + "step": 32671 + }, + { + "epoch": 1.6, + "grad_norm": 0.6077100038528442, + "learning_rate": 0.00026827400053593413, + "loss": 3.0462, + "step": 32672 + }, + { + "epoch": 1.6, + "grad_norm": 0.6022846698760986, + "learning_rate": 0.0002682586907230815, + "loss": 3.0828, + "step": 32673 + }, + { + "epoch": 1.6, + "grad_norm": 0.6264395117759705, + "learning_rate": 0.0002682433809938293, + "loss": 3.1968, + "step": 32674 + }, + { + "epoch": 1.6, + "grad_norm": 0.6148096323013306, + "learning_rate": 0.00026822807134821804, + "loss": 3.2081, + "step": 32675 + }, + { + "epoch": 1.6, + "grad_norm": 0.6186606287956238, + "learning_rate": 0.0002682127617862879, + "loss": 2.9582, + "step": 32676 + }, + { + "epoch": 1.6, + "grad_norm": 0.6134831309318542, + "learning_rate": 0.00026819745230807925, + "loss": 3.0577, + "step": 32677 + }, + { + "epoch": 1.6, + "grad_norm": 0.5858891010284424, + "learning_rate": 0.00026818214291363226, + "loss": 2.9476, + "step": 32678 + }, + { + "epoch": 1.6, + "grad_norm": 0.6068850755691528, + "learning_rate": 0.0002681668336029874, + "loss": 3.0758, + "step": 32679 + }, + { + "epoch": 1.6, + "grad_norm": 0.5995755195617676, + "learning_rate": 0.0002681515243761851, + "loss": 2.9556, + "step": 32680 + }, + { + "epoch": 1.6, + "grad_norm": 0.5753273367881775, + "learning_rate": 0.0002681362152332655, + "loss": 2.7898, + "step": 32681 + }, + { + "epoch": 1.6, + "grad_norm": 0.6415500044822693, + "learning_rate": 0.00026812090617426903, + "loss": 3.2757, + "step": 32682 + }, + { + "epoch": 1.6, + "grad_norm": 0.5945836305618286, + "learning_rate": 0.0002681055971992358, + "loss": 3.1363, + "step": 32683 + }, + { + "epoch": 1.6, + "grad_norm": 0.6280573010444641, + "learning_rate": 0.0002680902883082064, + "loss": 2.8068, + "step": 32684 + }, + { + "epoch": 1.6, + "grad_norm": 0.6142799258232117, + "learning_rate": 0.00026807497950122105, + "loss": 3.0977, + "step": 32685 + }, + { + "epoch": 1.6, + "grad_norm": 0.6052511930465698, + "learning_rate": 0.00026805967077832, + "loss": 2.9415, + "step": 32686 + }, + { + "epoch": 1.6, + "grad_norm": 0.5932520031929016, + "learning_rate": 0.0002680443621395437, + "loss": 2.9637, + "step": 32687 + }, + { + "epoch": 1.6, + "grad_norm": 0.5896964073181152, + "learning_rate": 0.00026802905358493237, + "loss": 2.875, + "step": 32688 + }, + { + "epoch": 1.6, + "grad_norm": 0.6037048697471619, + "learning_rate": 0.0002680137451145263, + "loss": 3.1561, + "step": 32689 + }, + { + "epoch": 1.6, + "grad_norm": 0.6128067374229431, + "learning_rate": 0.00026799843672836596, + "loss": 2.8895, + "step": 32690 + }, + { + "epoch": 1.6, + "grad_norm": 0.5946832299232483, + "learning_rate": 0.00026798312842649157, + "loss": 2.9309, + "step": 32691 + }, + { + "epoch": 1.6, + "grad_norm": 0.6187651753425598, + "learning_rate": 0.0002679678202089435, + "loss": 2.8649, + "step": 32692 + }, + { + "epoch": 1.6, + "grad_norm": 0.5798802971839905, + "learning_rate": 0.00026795251207576185, + "loss": 3.0354, + "step": 32693 + }, + { + "epoch": 1.6, + "grad_norm": 0.5939930081367493, + "learning_rate": 0.00026793720402698725, + "loss": 3.1686, + "step": 32694 + }, + { + "epoch": 1.6, + "grad_norm": 0.5916917324066162, + "learning_rate": 0.0002679218960626599, + "loss": 3.1373, + "step": 32695 + }, + { + "epoch": 1.6, + "grad_norm": 0.6304068565368652, + "learning_rate": 0.00026790658818282, + "loss": 3.0483, + "step": 32696 + }, + { + "epoch": 1.6, + "grad_norm": 0.5936771035194397, + "learning_rate": 0.0002678912803875081, + "loss": 2.9578, + "step": 32697 + }, + { + "epoch": 1.6, + "grad_norm": 0.5950177907943726, + "learning_rate": 0.0002678759726767643, + "loss": 3.0496, + "step": 32698 + }, + { + "epoch": 1.6, + "grad_norm": 0.6186109781265259, + "learning_rate": 0.00026786066505062896, + "loss": 3.0393, + "step": 32699 + }, + { + "epoch": 1.6, + "grad_norm": 0.6025801301002502, + "learning_rate": 0.0002678453575091426, + "loss": 3.2257, + "step": 32700 + }, + { + "epoch": 1.6, + "grad_norm": 0.5880355834960938, + "learning_rate": 0.0002678300500523452, + "loss": 2.7253, + "step": 32701 + }, + { + "epoch": 1.6, + "grad_norm": 0.6294435858726501, + "learning_rate": 0.0002678147426802774, + "loss": 3.0772, + "step": 32702 + }, + { + "epoch": 1.6, + "grad_norm": 0.6276559233665466, + "learning_rate": 0.0002677994353929793, + "loss": 2.8216, + "step": 32703 + }, + { + "epoch": 1.6, + "grad_norm": 0.6946702003479004, + "learning_rate": 0.00026778412819049123, + "loss": 3.0334, + "step": 32704 + }, + { + "epoch": 1.6, + "grad_norm": 0.5705452561378479, + "learning_rate": 0.00026776882107285374, + "loss": 2.9975, + "step": 32705 + }, + { + "epoch": 1.6, + "grad_norm": 0.5945636034011841, + "learning_rate": 0.00026775351404010684, + "loss": 2.9102, + "step": 32706 + }, + { + "epoch": 1.6, + "grad_norm": 0.5809672474861145, + "learning_rate": 0.0002677382070922911, + "loss": 2.9264, + "step": 32707 + }, + { + "epoch": 1.6, + "grad_norm": 0.5987460613250732, + "learning_rate": 0.0002677229002294465, + "loss": 3.0214, + "step": 32708 + }, + { + "epoch": 1.6, + "grad_norm": 0.6070874929428101, + "learning_rate": 0.00026770759345161375, + "loss": 3.1131, + "step": 32709 + }, + { + "epoch": 1.6, + "grad_norm": 0.632290780544281, + "learning_rate": 0.00026769228675883297, + "loss": 2.8364, + "step": 32710 + }, + { + "epoch": 1.6, + "grad_norm": 0.6187470555305481, + "learning_rate": 0.0002676769801511444, + "loss": 2.9534, + "step": 32711 + }, + { + "epoch": 1.6, + "grad_norm": 0.5995469689369202, + "learning_rate": 0.00026766167362858855, + "loss": 2.9997, + "step": 32712 + }, + { + "epoch": 1.6, + "grad_norm": 0.6334401965141296, + "learning_rate": 0.0002676463671912056, + "loss": 3.0416, + "step": 32713 + }, + { + "epoch": 1.6, + "grad_norm": 0.6241161227226257, + "learning_rate": 0.0002676310608390359, + "loss": 2.9658, + "step": 32714 + }, + { + "epoch": 1.6, + "grad_norm": 0.6149295568466187, + "learning_rate": 0.00026761575457211965, + "loss": 3.071, + "step": 32715 + }, + { + "epoch": 1.6, + "grad_norm": 0.6074439287185669, + "learning_rate": 0.0002676004483904973, + "loss": 3.0599, + "step": 32716 + }, + { + "epoch": 1.6, + "grad_norm": 0.5981096625328064, + "learning_rate": 0.0002675851422942093, + "loss": 3.1352, + "step": 32717 + }, + { + "epoch": 1.6, + "grad_norm": 0.5963551998138428, + "learning_rate": 0.00026756983628329554, + "loss": 2.8888, + "step": 32718 + }, + { + "epoch": 1.6, + "grad_norm": 0.6065682768821716, + "learning_rate": 0.0002675545303577968, + "loss": 3.0682, + "step": 32719 + }, + { + "epoch": 1.6, + "grad_norm": 0.6360631585121155, + "learning_rate": 0.0002675392245177531, + "loss": 3.0089, + "step": 32720 + }, + { + "epoch": 1.6, + "grad_norm": 0.5898069143295288, + "learning_rate": 0.00026752391876320476, + "loss": 2.876, + "step": 32721 + }, + { + "epoch": 1.6, + "grad_norm": 0.6404922604560852, + "learning_rate": 0.0002675086130941923, + "loss": 3.1414, + "step": 32722 + }, + { + "epoch": 1.6, + "grad_norm": 0.6144474744796753, + "learning_rate": 0.0002674933075107558, + "loss": 2.9401, + "step": 32723 + }, + { + "epoch": 1.6, + "grad_norm": 0.6159098744392395, + "learning_rate": 0.00026747800201293575, + "loss": 3.0381, + "step": 32724 + }, + { + "epoch": 1.6, + "grad_norm": 0.6114851236343384, + "learning_rate": 0.0002674626966007724, + "loss": 3.0802, + "step": 32725 + }, + { + "epoch": 1.6, + "grad_norm": 0.6240290999412537, + "learning_rate": 0.0002674473912743059, + "loss": 3.0702, + "step": 32726 + }, + { + "epoch": 1.6, + "grad_norm": 0.598355770111084, + "learning_rate": 0.00026743208603357687, + "loss": 2.8799, + "step": 32727 + }, + { + "epoch": 1.6, + "grad_norm": 0.6011687517166138, + "learning_rate": 0.00026741678087862535, + "loss": 3.171, + "step": 32728 + }, + { + "epoch": 1.6, + "grad_norm": 0.6154776811599731, + "learning_rate": 0.00026740147580949186, + "loss": 3.1596, + "step": 32729 + }, + { + "epoch": 1.6, + "grad_norm": 0.5840276479721069, + "learning_rate": 0.0002673861708262165, + "loss": 3.1403, + "step": 32730 + }, + { + "epoch": 1.6, + "grad_norm": 0.6225008368492126, + "learning_rate": 0.00026737086592883976, + "loss": 2.9208, + "step": 32731 + }, + { + "epoch": 1.6, + "grad_norm": 0.5753346681594849, + "learning_rate": 0.0002673555611174019, + "loss": 3.0474, + "step": 32732 + }, + { + "epoch": 1.6, + "grad_norm": 0.6378356218338013, + "learning_rate": 0.0002673402563919431, + "loss": 3.0054, + "step": 32733 + }, + { + "epoch": 1.6, + "grad_norm": 0.5706433653831482, + "learning_rate": 0.0002673249517525039, + "loss": 3.058, + "step": 32734 + }, + { + "epoch": 1.6, + "grad_norm": 0.6072514057159424, + "learning_rate": 0.00026730964719912445, + "loss": 2.9545, + "step": 32735 + }, + { + "epoch": 1.6, + "grad_norm": 0.6374166011810303, + "learning_rate": 0.00026729434273184497, + "loss": 3.1019, + "step": 32736 + }, + { + "epoch": 1.6, + "grad_norm": 0.6123475432395935, + "learning_rate": 0.00026727903835070606, + "loss": 3.0819, + "step": 32737 + }, + { + "epoch": 1.6, + "grad_norm": 0.6655570268630981, + "learning_rate": 0.00026726373405574783, + "loss": 3.1967, + "step": 32738 + }, + { + "epoch": 1.6, + "grad_norm": 0.5814079642295837, + "learning_rate": 0.0002672484298470107, + "loss": 3.0927, + "step": 32739 + }, + { + "epoch": 1.6, + "grad_norm": 0.7113948464393616, + "learning_rate": 0.0002672331257245347, + "loss": 3.0828, + "step": 32740 + }, + { + "epoch": 1.6, + "grad_norm": 0.6144413352012634, + "learning_rate": 0.0002672178216883604, + "loss": 2.8819, + "step": 32741 + }, + { + "epoch": 1.6, + "grad_norm": 0.5857863426208496, + "learning_rate": 0.0002672025177385282, + "loss": 3.2243, + "step": 32742 + }, + { + "epoch": 1.6, + "grad_norm": 0.5955747365951538, + "learning_rate": 0.00026718721387507813, + "loss": 3.1915, + "step": 32743 + }, + { + "epoch": 1.6, + "grad_norm": 0.6116932034492493, + "learning_rate": 0.0002671719100980507, + "loss": 2.8832, + "step": 32744 + }, + { + "epoch": 1.6, + "grad_norm": 0.5967369079589844, + "learning_rate": 0.000267156606407486, + "loss": 2.9476, + "step": 32745 + }, + { + "epoch": 1.6, + "grad_norm": 0.6008356213569641, + "learning_rate": 0.0002671413028034246, + "loss": 3.111, + "step": 32746 + }, + { + "epoch": 1.6, + "grad_norm": 0.5891445279121399, + "learning_rate": 0.00026712599928590674, + "loss": 3.1365, + "step": 32747 + }, + { + "epoch": 1.6, + "grad_norm": 0.5794476270675659, + "learning_rate": 0.0002671106958549725, + "loss": 3.1771, + "step": 32748 + }, + { + "epoch": 1.6, + "grad_norm": 0.6033952236175537, + "learning_rate": 0.0002670953925106625, + "loss": 3.1056, + "step": 32749 + }, + { + "epoch": 1.6, + "grad_norm": 0.6501690745353699, + "learning_rate": 0.00026708008925301687, + "loss": 2.7864, + "step": 32750 + }, + { + "epoch": 1.61, + "grad_norm": 0.6086935997009277, + "learning_rate": 0.00026706478608207586, + "loss": 3.0754, + "step": 32751 + }, + { + "epoch": 1.61, + "grad_norm": 0.6224468350410461, + "learning_rate": 0.00026704948299787997, + "loss": 3.0004, + "step": 32752 + }, + { + "epoch": 1.61, + "grad_norm": 0.5442426800727844, + "learning_rate": 0.0002670341800004693, + "loss": 3.0873, + "step": 32753 + }, + { + "epoch": 1.61, + "grad_norm": 0.5749726891517639, + "learning_rate": 0.00026701887708988444, + "loss": 2.8662, + "step": 32754 + }, + { + "epoch": 1.61, + "grad_norm": 0.6050938367843628, + "learning_rate": 0.0002670035742661653, + "loss": 2.9914, + "step": 32755 + }, + { + "epoch": 1.61, + "grad_norm": 0.5932197570800781, + "learning_rate": 0.00026698827152935245, + "loss": 3.0756, + "step": 32756 + }, + { + "epoch": 1.61, + "grad_norm": 0.592960000038147, + "learning_rate": 0.0002669729688794863, + "loss": 3.055, + "step": 32757 + }, + { + "epoch": 1.61, + "grad_norm": 0.5769248008728027, + "learning_rate": 0.0002669576663166068, + "loss": 3.3109, + "step": 32758 + }, + { + "epoch": 1.61, + "grad_norm": 0.6258298754692078, + "learning_rate": 0.0002669423638407545, + "loss": 3.139, + "step": 32759 + }, + { + "epoch": 1.61, + "grad_norm": 0.5944938659667969, + "learning_rate": 0.0002669270614519697, + "loss": 3.1027, + "step": 32760 + }, + { + "epoch": 1.61, + "grad_norm": 0.6634683012962341, + "learning_rate": 0.00026691175915029253, + "loss": 3.0298, + "step": 32761 + }, + { + "epoch": 1.61, + "grad_norm": 0.6014466881752014, + "learning_rate": 0.0002668964569357636, + "loss": 2.9555, + "step": 32762 + }, + { + "epoch": 1.61, + "grad_norm": 0.620914876461029, + "learning_rate": 0.00026688115480842287, + "loss": 3.0418, + "step": 32763 + }, + { + "epoch": 1.61, + "grad_norm": 0.5882852673530579, + "learning_rate": 0.0002668658527683109, + "loss": 2.9286, + "step": 32764 + }, + { + "epoch": 1.61, + "grad_norm": 0.610764741897583, + "learning_rate": 0.0002668505508154679, + "loss": 3.0676, + "step": 32765 + }, + { + "epoch": 1.61, + "grad_norm": 0.6545993685722351, + "learning_rate": 0.00026683524894993406, + "loss": 2.8785, + "step": 32766 + }, + { + "epoch": 1.61, + "grad_norm": 0.6135849356651306, + "learning_rate": 0.00026681994717174995, + "loss": 3.0813, + "step": 32767 + }, + { + "epoch": 1.61, + "grad_norm": 0.596288800239563, + "learning_rate": 0.0002668046454809556, + "loss": 3.1225, + "step": 32768 + }, + { + "epoch": 1.61, + "grad_norm": 0.6006448864936829, + "learning_rate": 0.0002667893438775915, + "loss": 2.8525, + "step": 32769 + }, + { + "epoch": 1.61, + "grad_norm": 0.5774670243263245, + "learning_rate": 0.0002667740423616978, + "loss": 3.2201, + "step": 32770 + }, + { + "epoch": 1.61, + "grad_norm": 0.6070205569267273, + "learning_rate": 0.00026675874093331494, + "loss": 2.946, + "step": 32771 + }, + { + "epoch": 1.61, + "grad_norm": 0.5863068103790283, + "learning_rate": 0.0002667434395924832, + "loss": 3.2808, + "step": 32772 + }, + { + "epoch": 1.61, + "grad_norm": 0.6191710233688354, + "learning_rate": 0.0002667281383392427, + "loss": 2.894, + "step": 32773 + }, + { + "epoch": 1.61, + "grad_norm": 0.621767520904541, + "learning_rate": 0.000266712837173634, + "loss": 3.1097, + "step": 32774 + }, + { + "epoch": 1.61, + "grad_norm": 0.5857933163642883, + "learning_rate": 0.0002666975360956972, + "loss": 2.9612, + "step": 32775 + }, + { + "epoch": 1.61, + "grad_norm": 0.6357319355010986, + "learning_rate": 0.0002666822351054727, + "loss": 3.0118, + "step": 32776 + }, + { + "epoch": 1.61, + "grad_norm": 0.5903388857841492, + "learning_rate": 0.00026666693420300086, + "loss": 2.7496, + "step": 32777 + }, + { + "epoch": 1.61, + "grad_norm": 0.6077666282653809, + "learning_rate": 0.0002666516333883218, + "loss": 3.0988, + "step": 32778 + }, + { + "epoch": 1.61, + "grad_norm": 0.6337267756462097, + "learning_rate": 0.000266636332661476, + "loss": 3.1458, + "step": 32779 + }, + { + "epoch": 1.61, + "grad_norm": 0.567110538482666, + "learning_rate": 0.00026662103202250355, + "loss": 2.8474, + "step": 32780 + }, + { + "epoch": 1.61, + "grad_norm": 0.6637780666351318, + "learning_rate": 0.00026660573147144494, + "loss": 3.2734, + "step": 32781 + }, + { + "epoch": 1.61, + "grad_norm": 0.6085167527198792, + "learning_rate": 0.00026659043100834054, + "loss": 3.1563, + "step": 32782 + }, + { + "epoch": 1.61, + "grad_norm": 0.6335734128952026, + "learning_rate": 0.0002665751306332304, + "loss": 3.1045, + "step": 32783 + }, + { + "epoch": 1.61, + "grad_norm": 0.6378978490829468, + "learning_rate": 0.00026655983034615493, + "loss": 3.1083, + "step": 32784 + }, + { + "epoch": 1.61, + "grad_norm": 0.5712326765060425, + "learning_rate": 0.0002665445301471544, + "loss": 3.1253, + "step": 32785 + }, + { + "epoch": 1.61, + "grad_norm": 0.5776566863059998, + "learning_rate": 0.00026652923003626916, + "loss": 3.0857, + "step": 32786 + }, + { + "epoch": 1.61, + "grad_norm": 0.5935328602790833, + "learning_rate": 0.00026651393001353956, + "loss": 2.9172, + "step": 32787 + }, + { + "epoch": 1.61, + "grad_norm": 0.5658502578735352, + "learning_rate": 0.0002664986300790057, + "loss": 2.8991, + "step": 32788 + }, + { + "epoch": 1.61, + "grad_norm": 0.6034933924674988, + "learning_rate": 0.0002664833302327081, + "loss": 3.1756, + "step": 32789 + }, + { + "epoch": 1.61, + "grad_norm": 0.5854072570800781, + "learning_rate": 0.00026646803047468697, + "loss": 2.8792, + "step": 32790 + }, + { + "epoch": 1.61, + "grad_norm": 0.5877964496612549, + "learning_rate": 0.00026645273080498264, + "loss": 2.9641, + "step": 32791 + }, + { + "epoch": 1.61, + "grad_norm": 0.6241223216056824, + "learning_rate": 0.00026643743122363515, + "loss": 2.9318, + "step": 32792 + }, + { + "epoch": 1.61, + "grad_norm": 0.5755746364593506, + "learning_rate": 0.00026642213173068516, + "loss": 2.9953, + "step": 32793 + }, + { + "epoch": 1.61, + "grad_norm": 0.6470869779586792, + "learning_rate": 0.00026640683232617284, + "loss": 2.8269, + "step": 32794 + }, + { + "epoch": 1.61, + "grad_norm": 0.6277267932891846, + "learning_rate": 0.0002663915330101383, + "loss": 3.0639, + "step": 32795 + }, + { + "epoch": 1.61, + "grad_norm": 0.6223757266998291, + "learning_rate": 0.0002663762337826222, + "loss": 2.9395, + "step": 32796 + }, + { + "epoch": 1.61, + "grad_norm": 0.5803819894790649, + "learning_rate": 0.0002663609346436645, + "loss": 2.8632, + "step": 32797 + }, + { + "epoch": 1.61, + "grad_norm": 0.659708559513092, + "learning_rate": 0.00026634563559330555, + "loss": 2.9963, + "step": 32798 + }, + { + "epoch": 1.61, + "grad_norm": 0.6480075716972351, + "learning_rate": 0.0002663303366315859, + "loss": 3.0471, + "step": 32799 + }, + { + "epoch": 1.61, + "grad_norm": 0.5975663661956787, + "learning_rate": 0.00026631503775854557, + "loss": 2.7081, + "step": 32800 + }, + { + "epoch": 1.61, + "grad_norm": 0.5907651782035828, + "learning_rate": 0.000266299738974225, + "loss": 3.1879, + "step": 32801 + }, + { + "epoch": 1.61, + "grad_norm": 0.6050690412521362, + "learning_rate": 0.00026628444027866437, + "loss": 3.0749, + "step": 32802 + }, + { + "epoch": 1.61, + "grad_norm": 0.638195276260376, + "learning_rate": 0.00026626914167190394, + "loss": 3.0082, + "step": 32803 + }, + { + "epoch": 1.61, + "grad_norm": 0.5860411524772644, + "learning_rate": 0.0002662538431539843, + "loss": 2.9251, + "step": 32804 + }, + { + "epoch": 1.61, + "grad_norm": 0.5866068005561829, + "learning_rate": 0.00026623854472494546, + "loss": 2.9912, + "step": 32805 + }, + { + "epoch": 1.61, + "grad_norm": 0.6038371324539185, + "learning_rate": 0.00026622324638482785, + "loss": 2.9756, + "step": 32806 + }, + { + "epoch": 1.61, + "grad_norm": 0.5844926834106445, + "learning_rate": 0.0002662079481336716, + "loss": 3.011, + "step": 32807 + }, + { + "epoch": 1.61, + "grad_norm": 0.5914148092269897, + "learning_rate": 0.00026619264997151715, + "loss": 3.2044, + "step": 32808 + }, + { + "epoch": 1.61, + "grad_norm": 0.5837469100952148, + "learning_rate": 0.0002661773518984048, + "loss": 2.9881, + "step": 32809 + }, + { + "epoch": 1.61, + "grad_norm": 0.60676109790802, + "learning_rate": 0.0002661620539143747, + "loss": 3.1325, + "step": 32810 + }, + { + "epoch": 1.61, + "grad_norm": 0.6246742010116577, + "learning_rate": 0.0002661467560194674, + "loss": 2.9884, + "step": 32811 + }, + { + "epoch": 1.61, + "grad_norm": 0.5762494206428528, + "learning_rate": 0.0002661314582137229, + "loss": 3.31, + "step": 32812 + }, + { + "epoch": 1.61, + "grad_norm": 0.5754257440567017, + "learning_rate": 0.00026611616049718157, + "loss": 3.0474, + "step": 32813 + }, + { + "epoch": 1.61, + "grad_norm": 0.6173507571220398, + "learning_rate": 0.0002661008628698839, + "loss": 3.0469, + "step": 32814 + }, + { + "epoch": 1.61, + "grad_norm": 0.5958468914031982, + "learning_rate": 0.00026608556533186997, + "loss": 2.8676, + "step": 32815 + }, + { + "epoch": 1.61, + "grad_norm": 0.6056309342384338, + "learning_rate": 0.0002660702678831802, + "loss": 3.2593, + "step": 32816 + }, + { + "epoch": 1.61, + "grad_norm": 0.6290296316146851, + "learning_rate": 0.00026605497052385464, + "loss": 3.0761, + "step": 32817 + }, + { + "epoch": 1.61, + "grad_norm": 0.6019764542579651, + "learning_rate": 0.00026603967325393386, + "loss": 2.9699, + "step": 32818 + }, + { + "epoch": 1.61, + "grad_norm": 0.6389729976654053, + "learning_rate": 0.00026602437607345814, + "loss": 3.1099, + "step": 32819 + }, + { + "epoch": 1.61, + "grad_norm": 0.6280241012573242, + "learning_rate": 0.0002660090789824675, + "loss": 2.9096, + "step": 32820 + }, + { + "epoch": 1.61, + "grad_norm": 0.6099143028259277, + "learning_rate": 0.00026599378198100253, + "loss": 3.0422, + "step": 32821 + }, + { + "epoch": 1.61, + "grad_norm": 0.6680817604064941, + "learning_rate": 0.0002659784850691033, + "loss": 3.2611, + "step": 32822 + }, + { + "epoch": 1.61, + "grad_norm": 0.7214831113815308, + "learning_rate": 0.0002659631882468102, + "loss": 3.0891, + "step": 32823 + }, + { + "epoch": 1.61, + "grad_norm": 0.6249724626541138, + "learning_rate": 0.0002659478915141636, + "loss": 2.9744, + "step": 32824 + }, + { + "epoch": 1.61, + "grad_norm": 0.6301910877227783, + "learning_rate": 0.0002659325948712036, + "loss": 2.9372, + "step": 32825 + }, + { + "epoch": 1.61, + "grad_norm": 0.6053838133811951, + "learning_rate": 0.0002659172983179707, + "loss": 3.2256, + "step": 32826 + }, + { + "epoch": 1.61, + "grad_norm": 0.6509674191474915, + "learning_rate": 0.00026590200185450504, + "loss": 3.0197, + "step": 32827 + }, + { + "epoch": 1.61, + "grad_norm": 0.6163961887359619, + "learning_rate": 0.0002658867054808468, + "loss": 3.0188, + "step": 32828 + }, + { + "epoch": 1.61, + "grad_norm": 0.5931441783905029, + "learning_rate": 0.00026587140919703665, + "loss": 3.0105, + "step": 32829 + }, + { + "epoch": 1.61, + "grad_norm": 0.6105912923812866, + "learning_rate": 0.0002658561130031145, + "loss": 2.8814, + "step": 32830 + }, + { + "epoch": 1.61, + "grad_norm": 0.6719006896018982, + "learning_rate": 0.0002658408168991209, + "loss": 2.9579, + "step": 32831 + }, + { + "epoch": 1.61, + "grad_norm": 0.5960902571678162, + "learning_rate": 0.0002658255208850958, + "loss": 3.1361, + "step": 32832 + }, + { + "epoch": 1.61, + "grad_norm": 0.5977107882499695, + "learning_rate": 0.0002658102249610798, + "loss": 3.2039, + "step": 32833 + }, + { + "epoch": 1.61, + "grad_norm": 0.6679555177688599, + "learning_rate": 0.00026579492912711316, + "loss": 3.3237, + "step": 32834 + }, + { + "epoch": 1.61, + "grad_norm": 0.6147051453590393, + "learning_rate": 0.000265779633383236, + "loss": 3.0583, + "step": 32835 + }, + { + "epoch": 1.61, + "grad_norm": 0.6416993141174316, + "learning_rate": 0.0002657643377294888, + "loss": 3.0023, + "step": 32836 + }, + { + "epoch": 1.61, + "grad_norm": 0.5568658113479614, + "learning_rate": 0.0002657490421659117, + "loss": 2.8273, + "step": 32837 + }, + { + "epoch": 1.61, + "grad_norm": 0.6182296872138977, + "learning_rate": 0.0002657337466925449, + "loss": 3.1808, + "step": 32838 + }, + { + "epoch": 1.61, + "grad_norm": 0.61872398853302, + "learning_rate": 0.00026571845130942903, + "loss": 2.9039, + "step": 32839 + }, + { + "epoch": 1.61, + "grad_norm": 0.5836578011512756, + "learning_rate": 0.000265703156016604, + "loss": 2.8411, + "step": 32840 + }, + { + "epoch": 1.61, + "grad_norm": 0.6261762976646423, + "learning_rate": 0.0002656878608141104, + "loss": 3.0058, + "step": 32841 + }, + { + "epoch": 1.61, + "grad_norm": 0.7778844237327576, + "learning_rate": 0.00026567256570198824, + "loss": 2.8564, + "step": 32842 + }, + { + "epoch": 1.61, + "grad_norm": 0.6407623887062073, + "learning_rate": 0.0002656572706802779, + "loss": 2.9384, + "step": 32843 + }, + { + "epoch": 1.61, + "grad_norm": 0.5836843252182007, + "learning_rate": 0.00026564197574901994, + "loss": 3.0409, + "step": 32844 + }, + { + "epoch": 1.61, + "grad_norm": 0.6171208024024963, + "learning_rate": 0.00026562668090825413, + "loss": 2.9327, + "step": 32845 + }, + { + "epoch": 1.61, + "grad_norm": 0.6322222352027893, + "learning_rate": 0.0002656113861580212, + "loss": 3.12, + "step": 32846 + }, + { + "epoch": 1.61, + "grad_norm": 2.0868051052093506, + "learning_rate": 0.0002655960914983612, + "loss": 3.0806, + "step": 32847 + }, + { + "epoch": 1.61, + "grad_norm": 0.6040574908256531, + "learning_rate": 0.00026558079692931444, + "loss": 3.019, + "step": 32848 + }, + { + "epoch": 1.61, + "grad_norm": 0.693903386592865, + "learning_rate": 0.00026556550245092136, + "loss": 2.8479, + "step": 32849 + }, + { + "epoch": 1.61, + "grad_norm": 0.6436097025871277, + "learning_rate": 0.00026555020806322196, + "loss": 3.0285, + "step": 32850 + }, + { + "epoch": 1.61, + "grad_norm": 0.6326521039009094, + "learning_rate": 0.0002655349137662568, + "loss": 3.076, + "step": 32851 + }, + { + "epoch": 1.61, + "grad_norm": 0.6029640436172485, + "learning_rate": 0.00026551961956006604, + "loss": 2.957, + "step": 32852 + }, + { + "epoch": 1.61, + "grad_norm": 0.5923823118209839, + "learning_rate": 0.0002655043254446898, + "loss": 2.9685, + "step": 32853 + }, + { + "epoch": 1.61, + "grad_norm": 0.6047025322914124, + "learning_rate": 0.00026548903142016876, + "loss": 3.0945, + "step": 32854 + }, + { + "epoch": 1.61, + "grad_norm": 0.6205102205276489, + "learning_rate": 0.0002654737374865428, + "loss": 2.7298, + "step": 32855 + }, + { + "epoch": 1.61, + "grad_norm": 0.636955976486206, + "learning_rate": 0.00026545844364385255, + "loss": 3.0996, + "step": 32856 + }, + { + "epoch": 1.61, + "grad_norm": 0.6210900545120239, + "learning_rate": 0.00026544314989213795, + "loss": 2.9344, + "step": 32857 + }, + { + "epoch": 1.61, + "grad_norm": 0.5878751873970032, + "learning_rate": 0.0002654278562314395, + "loss": 2.929, + "step": 32858 + }, + { + "epoch": 1.61, + "grad_norm": 0.6176540851593018, + "learning_rate": 0.00026541256266179744, + "loss": 3.2169, + "step": 32859 + }, + { + "epoch": 1.61, + "grad_norm": 0.7335445284843445, + "learning_rate": 0.0002653972691832519, + "loss": 3.117, + "step": 32860 + }, + { + "epoch": 1.61, + "grad_norm": 0.6168895363807678, + "learning_rate": 0.0002653819757958435, + "loss": 3.0956, + "step": 32861 + }, + { + "epoch": 1.61, + "grad_norm": 0.6440188884735107, + "learning_rate": 0.0002653666824996122, + "loss": 3.0447, + "step": 32862 + }, + { + "epoch": 1.61, + "grad_norm": 0.5899813771247864, + "learning_rate": 0.00026535138929459834, + "loss": 2.9709, + "step": 32863 + }, + { + "epoch": 1.61, + "grad_norm": 0.6249847412109375, + "learning_rate": 0.0002653360961808424, + "loss": 3.1298, + "step": 32864 + }, + { + "epoch": 1.61, + "grad_norm": 0.5660920143127441, + "learning_rate": 0.00026532080315838435, + "loss": 3.0273, + "step": 32865 + }, + { + "epoch": 1.61, + "grad_norm": 0.6059000492095947, + "learning_rate": 0.00026530551022726476, + "loss": 3.014, + "step": 32866 + }, + { + "epoch": 1.61, + "grad_norm": 0.6058610081672668, + "learning_rate": 0.00026529021738752373, + "loss": 3.0652, + "step": 32867 + }, + { + "epoch": 1.61, + "grad_norm": 0.6035959124565125, + "learning_rate": 0.00026527492463920154, + "loss": 3.1049, + "step": 32868 + }, + { + "epoch": 1.61, + "grad_norm": 0.5923382639884949, + "learning_rate": 0.00026525963198233864, + "loss": 3.1006, + "step": 32869 + }, + { + "epoch": 1.61, + "grad_norm": 0.6309588551521301, + "learning_rate": 0.0002652443394169751, + "loss": 3.1369, + "step": 32870 + }, + { + "epoch": 1.61, + "grad_norm": 0.5777470469474792, + "learning_rate": 0.0002652290469431513, + "loss": 2.9681, + "step": 32871 + }, + { + "epoch": 1.61, + "grad_norm": 0.604954183101654, + "learning_rate": 0.00026521375456090745, + "loss": 3.0512, + "step": 32872 + }, + { + "epoch": 1.61, + "grad_norm": 0.618948757648468, + "learning_rate": 0.000265198462270284, + "loss": 3.152, + "step": 32873 + }, + { + "epoch": 1.61, + "grad_norm": 0.5669094920158386, + "learning_rate": 0.000265183170071321, + "loss": 3.0005, + "step": 32874 + }, + { + "epoch": 1.61, + "grad_norm": 0.6252792477607727, + "learning_rate": 0.0002651678779640588, + "loss": 3.0027, + "step": 32875 + }, + { + "epoch": 1.61, + "grad_norm": 0.6201028823852539, + "learning_rate": 0.0002651525859485378, + "loss": 2.932, + "step": 32876 + }, + { + "epoch": 1.61, + "grad_norm": 0.5937361717224121, + "learning_rate": 0.0002651372940247981, + "loss": 3.0645, + "step": 32877 + }, + { + "epoch": 1.61, + "grad_norm": 0.6492998003959656, + "learning_rate": 0.00026512200219288017, + "loss": 3.1435, + "step": 32878 + }, + { + "epoch": 1.61, + "grad_norm": 0.625295877456665, + "learning_rate": 0.000265106710452824, + "loss": 2.965, + "step": 32879 + }, + { + "epoch": 1.61, + "grad_norm": 0.5870344042778015, + "learning_rate": 0.00026509141880467016, + "loss": 2.9514, + "step": 32880 + }, + { + "epoch": 1.61, + "grad_norm": 0.5917270183563232, + "learning_rate": 0.00026507612724845883, + "loss": 2.9134, + "step": 32881 + }, + { + "epoch": 1.61, + "grad_norm": 0.6095778346061707, + "learning_rate": 0.00026506083578423014, + "loss": 3.1039, + "step": 32882 + }, + { + "epoch": 1.61, + "grad_norm": 0.66627436876297, + "learning_rate": 0.00026504554441202457, + "loss": 2.996, + "step": 32883 + }, + { + "epoch": 1.61, + "grad_norm": 0.6477625370025635, + "learning_rate": 0.00026503025313188235, + "loss": 2.9929, + "step": 32884 + }, + { + "epoch": 1.61, + "grad_norm": 0.5901625156402588, + "learning_rate": 0.0002650149619438436, + "loss": 3.2471, + "step": 32885 + }, + { + "epoch": 1.61, + "grad_norm": 0.6187769174575806, + "learning_rate": 0.0002649996708479488, + "loss": 3.0292, + "step": 32886 + }, + { + "epoch": 1.61, + "grad_norm": 0.6475508809089661, + "learning_rate": 0.000264984379844238, + "loss": 3.0723, + "step": 32887 + }, + { + "epoch": 1.61, + "grad_norm": 0.5832803845405579, + "learning_rate": 0.0002649690889327518, + "loss": 2.8318, + "step": 32888 + }, + { + "epoch": 1.61, + "grad_norm": 0.6353322863578796, + "learning_rate": 0.00026495379811353014, + "loss": 3.0975, + "step": 32889 + }, + { + "epoch": 1.61, + "grad_norm": 0.5817992091178894, + "learning_rate": 0.0002649385073866134, + "loss": 3.0773, + "step": 32890 + }, + { + "epoch": 1.61, + "grad_norm": 0.596846342086792, + "learning_rate": 0.000264923216752042, + "loss": 3.1376, + "step": 32891 + }, + { + "epoch": 1.61, + "grad_norm": 0.6275048851966858, + "learning_rate": 0.00026490792620985603, + "loss": 3.2158, + "step": 32892 + }, + { + "epoch": 1.61, + "grad_norm": 0.5930907130241394, + "learning_rate": 0.0002648926357600959, + "loss": 3.2069, + "step": 32893 + }, + { + "epoch": 1.61, + "grad_norm": 0.597496747970581, + "learning_rate": 0.00026487734540280174, + "loss": 2.8155, + "step": 32894 + }, + { + "epoch": 1.61, + "grad_norm": 0.6274101734161377, + "learning_rate": 0.0002648620551380139, + "loss": 3.1146, + "step": 32895 + }, + { + "epoch": 1.61, + "grad_norm": 0.6199111342430115, + "learning_rate": 0.0002648467649657727, + "loss": 3.0811, + "step": 32896 + }, + { + "epoch": 1.61, + "grad_norm": 0.6213255524635315, + "learning_rate": 0.0002648314748861182, + "loss": 3.0629, + "step": 32897 + }, + { + "epoch": 1.61, + "grad_norm": 0.6072750091552734, + "learning_rate": 0.000264816184899091, + "loss": 3.0835, + "step": 32898 + }, + { + "epoch": 1.61, + "grad_norm": 0.7357712388038635, + "learning_rate": 0.0002648008950047312, + "loss": 2.8921, + "step": 32899 + }, + { + "epoch": 1.61, + "grad_norm": 0.637231707572937, + "learning_rate": 0.00026478560520307886, + "loss": 3.1188, + "step": 32900 + }, + { + "epoch": 1.61, + "grad_norm": 0.5956443548202515, + "learning_rate": 0.00026477031549417473, + "loss": 3.0897, + "step": 32901 + }, + { + "epoch": 1.61, + "grad_norm": 0.5935696959495544, + "learning_rate": 0.0002647550258780587, + "loss": 2.942, + "step": 32902 + }, + { + "epoch": 1.61, + "grad_norm": 0.659156322479248, + "learning_rate": 0.00026473973635477116, + "loss": 3.1397, + "step": 32903 + }, + { + "epoch": 1.61, + "grad_norm": 0.6243013739585876, + "learning_rate": 0.00026472444692435227, + "loss": 2.8611, + "step": 32904 + }, + { + "epoch": 1.61, + "grad_norm": 0.5944880843162537, + "learning_rate": 0.00026470915758684235, + "loss": 2.9702, + "step": 32905 + }, + { + "epoch": 1.61, + "grad_norm": 0.6213318705558777, + "learning_rate": 0.0002646938683422819, + "loss": 3.0618, + "step": 32906 + }, + { + "epoch": 1.61, + "grad_norm": 0.6070886254310608, + "learning_rate": 0.0002646785791907109, + "loss": 3.1214, + "step": 32907 + }, + { + "epoch": 1.61, + "grad_norm": 0.6019971966743469, + "learning_rate": 0.00026466329013216986, + "loss": 3.0693, + "step": 32908 + }, + { + "epoch": 1.61, + "grad_norm": 0.5632584691047668, + "learning_rate": 0.00026464800116669877, + "loss": 2.8446, + "step": 32909 + }, + { + "epoch": 1.61, + "grad_norm": 0.6107988953590393, + "learning_rate": 0.0002646327122943381, + "loss": 2.9715, + "step": 32910 + }, + { + "epoch": 1.61, + "grad_norm": 0.6390516757965088, + "learning_rate": 0.0002646174235151281, + "loss": 3.0538, + "step": 32911 + }, + { + "epoch": 1.61, + "grad_norm": 0.5974604487419128, + "learning_rate": 0.00026460213482910883, + "loss": 2.9292, + "step": 32912 + }, + { + "epoch": 1.61, + "grad_norm": 0.603056788444519, + "learning_rate": 0.0002645868462363209, + "loss": 3.1131, + "step": 32913 + }, + { + "epoch": 1.61, + "grad_norm": 0.6022782921791077, + "learning_rate": 0.00026457155773680434, + "loss": 3.1211, + "step": 32914 + }, + { + "epoch": 1.61, + "grad_norm": 0.6246730089187622, + "learning_rate": 0.00026455626933059943, + "loss": 3.1058, + "step": 32915 + }, + { + "epoch": 1.61, + "grad_norm": 0.6355071067810059, + "learning_rate": 0.0002645409810177466, + "loss": 3.1005, + "step": 32916 + }, + { + "epoch": 1.61, + "grad_norm": 0.5840960741043091, + "learning_rate": 0.0002645256927982859, + "loss": 3.2979, + "step": 32917 + }, + { + "epoch": 1.61, + "grad_norm": 0.614286482334137, + "learning_rate": 0.0002645104046722578, + "loss": 2.919, + "step": 32918 + }, + { + "epoch": 1.61, + "grad_norm": 0.6425104141235352, + "learning_rate": 0.00026449511663970235, + "loss": 3.1202, + "step": 32919 + }, + { + "epoch": 1.61, + "grad_norm": 0.617282509803772, + "learning_rate": 0.00026447982870065993, + "loss": 3.072, + "step": 32920 + }, + { + "epoch": 1.61, + "grad_norm": 0.5912017822265625, + "learning_rate": 0.00026446454085517093, + "loss": 2.8702, + "step": 32921 + }, + { + "epoch": 1.61, + "grad_norm": 0.6111149787902832, + "learning_rate": 0.00026444925310327536, + "loss": 2.9778, + "step": 32922 + }, + { + "epoch": 1.61, + "grad_norm": 0.6221730709075928, + "learning_rate": 0.0002644339654450137, + "loss": 3.289, + "step": 32923 + }, + { + "epoch": 1.61, + "grad_norm": 0.6156650185585022, + "learning_rate": 0.0002644186778804261, + "loss": 3.1326, + "step": 32924 + }, + { + "epoch": 1.61, + "grad_norm": 0.6163793802261353, + "learning_rate": 0.0002644033904095528, + "loss": 3.0494, + "step": 32925 + }, + { + "epoch": 1.61, + "grad_norm": 0.6287680268287659, + "learning_rate": 0.00026438810303243426, + "loss": 3.0893, + "step": 32926 + }, + { + "epoch": 1.61, + "grad_norm": 0.6106224060058594, + "learning_rate": 0.00026437281574911043, + "loss": 2.9948, + "step": 32927 + }, + { + "epoch": 1.61, + "grad_norm": 0.6386653184890747, + "learning_rate": 0.0002643575285596219, + "loss": 3.023, + "step": 32928 + }, + { + "epoch": 1.61, + "grad_norm": 0.6105353236198425, + "learning_rate": 0.00026434224146400866, + "loss": 3.0687, + "step": 32929 + }, + { + "epoch": 1.61, + "grad_norm": 0.5927025079727173, + "learning_rate": 0.0002643269544623111, + "loss": 2.9087, + "step": 32930 + }, + { + "epoch": 1.61, + "grad_norm": 0.6284357309341431, + "learning_rate": 0.00026431166755456955, + "loss": 2.934, + "step": 32931 + }, + { + "epoch": 1.61, + "grad_norm": 0.6147943139076233, + "learning_rate": 0.0002642963807408242, + "loss": 2.9622, + "step": 32932 + }, + { + "epoch": 1.61, + "grad_norm": 0.6074808239936829, + "learning_rate": 0.00026428109402111533, + "loss": 2.9748, + "step": 32933 + }, + { + "epoch": 1.61, + "grad_norm": 0.6237158179283142, + "learning_rate": 0.00026426580739548306, + "loss": 3.109, + "step": 32934 + }, + { + "epoch": 1.61, + "grad_norm": 0.594880223274231, + "learning_rate": 0.0002642505208639679, + "loss": 3.1491, + "step": 32935 + }, + { + "epoch": 1.61, + "grad_norm": 0.5945537686347961, + "learning_rate": 0.00026423523442661, + "loss": 2.9643, + "step": 32936 + }, + { + "epoch": 1.61, + "grad_norm": 0.6391527652740479, + "learning_rate": 0.0002642199480834495, + "loss": 2.9662, + "step": 32937 + }, + { + "epoch": 1.61, + "grad_norm": 0.6021575927734375, + "learning_rate": 0.00026420466183452683, + "loss": 3.1065, + "step": 32938 + }, + { + "epoch": 1.61, + "grad_norm": 0.6353442072868347, + "learning_rate": 0.0002641893756798822, + "loss": 2.9833, + "step": 32939 + }, + { + "epoch": 1.61, + "grad_norm": 0.6768826246261597, + "learning_rate": 0.00026417408961955575, + "loss": 2.8308, + "step": 32940 + }, + { + "epoch": 1.61, + "grad_norm": 0.5737194418907166, + "learning_rate": 0.00026415880365358803, + "loss": 3.0519, + "step": 32941 + }, + { + "epoch": 1.61, + "grad_norm": 0.5978490710258484, + "learning_rate": 0.000264143517782019, + "loss": 2.8875, + "step": 32942 + }, + { + "epoch": 1.61, + "grad_norm": 0.6324152946472168, + "learning_rate": 0.0002641282320048892, + "loss": 3.2518, + "step": 32943 + }, + { + "epoch": 1.61, + "grad_norm": 0.6049727201461792, + "learning_rate": 0.0002641129463222385, + "loss": 2.9631, + "step": 32944 + }, + { + "epoch": 1.61, + "grad_norm": 0.584157407283783, + "learning_rate": 0.0002640976607341075, + "loss": 3.0125, + "step": 32945 + }, + { + "epoch": 1.61, + "grad_norm": 0.603410005569458, + "learning_rate": 0.00026408237524053646, + "loss": 2.7752, + "step": 32946 + }, + { + "epoch": 1.61, + "grad_norm": 0.6187100410461426, + "learning_rate": 0.0002640670898415654, + "loss": 3.0289, + "step": 32947 + }, + { + "epoch": 1.61, + "grad_norm": 0.5807082653045654, + "learning_rate": 0.0002640518045372348, + "loss": 3.1831, + "step": 32948 + }, + { + "epoch": 1.61, + "grad_norm": 0.6443330645561218, + "learning_rate": 0.00026403651932758467, + "loss": 3.1691, + "step": 32949 + }, + { + "epoch": 1.61, + "grad_norm": 0.6212769746780396, + "learning_rate": 0.0002640212342126556, + "loss": 2.9774, + "step": 32950 + }, + { + "epoch": 1.61, + "grad_norm": 0.5869172811508179, + "learning_rate": 0.00026400594919248757, + "loss": 2.8276, + "step": 32951 + }, + { + "epoch": 1.61, + "grad_norm": 0.6382247805595398, + "learning_rate": 0.00026399066426712094, + "loss": 3.0276, + "step": 32952 + }, + { + "epoch": 1.61, + "grad_norm": 0.6578730344772339, + "learning_rate": 0.0002639753794365961, + "loss": 3.0336, + "step": 32953 + }, + { + "epoch": 1.61, + "grad_norm": 0.6229750514030457, + "learning_rate": 0.000263960094700953, + "loss": 3.0683, + "step": 32954 + }, + { + "epoch": 1.62, + "grad_norm": 0.6478546261787415, + "learning_rate": 0.0002639448100602323, + "loss": 2.9494, + "step": 32955 + }, + { + "epoch": 1.62, + "grad_norm": 0.607498288154602, + "learning_rate": 0.0002639295255144738, + "loss": 3.0408, + "step": 32956 + }, + { + "epoch": 1.62, + "grad_norm": 0.6057003736495972, + "learning_rate": 0.00026391424106371805, + "loss": 3.2572, + "step": 32957 + }, + { + "epoch": 1.62, + "grad_norm": 0.5550916194915771, + "learning_rate": 0.00026389895670800534, + "loss": 2.9359, + "step": 32958 + }, + { + "epoch": 1.62, + "grad_norm": 0.6296790242195129, + "learning_rate": 0.0002638836724473757, + "loss": 3.0151, + "step": 32959 + }, + { + "epoch": 1.62, + "grad_norm": 0.5951831936836243, + "learning_rate": 0.00026386838828186966, + "loss": 2.8441, + "step": 32960 + }, + { + "epoch": 1.62, + "grad_norm": 0.6196625828742981, + "learning_rate": 0.00026385310421152725, + "loss": 2.8942, + "step": 32961 + }, + { + "epoch": 1.62, + "grad_norm": 0.6182433366775513, + "learning_rate": 0.00026383782023638874, + "loss": 2.9938, + "step": 32962 + }, + { + "epoch": 1.62, + "grad_norm": 0.6165900230407715, + "learning_rate": 0.0002638225363564946, + "loss": 2.8868, + "step": 32963 + }, + { + "epoch": 1.62, + "grad_norm": 0.6178959012031555, + "learning_rate": 0.0002638072525718848, + "loss": 2.8964, + "step": 32964 + }, + { + "epoch": 1.62, + "grad_norm": 0.6362462639808655, + "learning_rate": 0.00026379196888259984, + "loss": 2.8124, + "step": 32965 + }, + { + "epoch": 1.62, + "grad_norm": 0.6361078023910522, + "learning_rate": 0.00026377668528867977, + "loss": 3.0088, + "step": 32966 + }, + { + "epoch": 1.62, + "grad_norm": 0.7130830883979797, + "learning_rate": 0.00026376140179016494, + "loss": 3.1082, + "step": 32967 + }, + { + "epoch": 1.62, + "grad_norm": 0.6127350926399231, + "learning_rate": 0.0002637461183870957, + "loss": 3.1208, + "step": 32968 + }, + { + "epoch": 1.62, + "grad_norm": 0.5895925164222717, + "learning_rate": 0.0002637308350795121, + "loss": 2.8528, + "step": 32969 + }, + { + "epoch": 1.62, + "grad_norm": 0.6005771160125732, + "learning_rate": 0.00026371555186745465, + "loss": 3.0934, + "step": 32970 + }, + { + "epoch": 1.62, + "grad_norm": 0.6108828783035278, + "learning_rate": 0.0002637002687509633, + "loss": 2.9517, + "step": 32971 + }, + { + "epoch": 1.62, + "grad_norm": 0.5996699333190918, + "learning_rate": 0.0002636849857300785, + "loss": 2.9753, + "step": 32972 + }, + { + "epoch": 1.62, + "grad_norm": 0.5930362939834595, + "learning_rate": 0.0002636697028048405, + "loss": 2.9364, + "step": 32973 + }, + { + "epoch": 1.62, + "grad_norm": 0.6767230033874512, + "learning_rate": 0.0002636544199752894, + "loss": 3.1138, + "step": 32974 + }, + { + "epoch": 1.62, + "grad_norm": 0.6063745021820068, + "learning_rate": 0.0002636391372414657, + "loss": 3.1365, + "step": 32975 + }, + { + "epoch": 1.62, + "grad_norm": 0.6611247658729553, + "learning_rate": 0.00026362385460340946, + "loss": 3.0819, + "step": 32976 + }, + { + "epoch": 1.62, + "grad_norm": 0.6121769547462463, + "learning_rate": 0.00026360857206116095, + "loss": 2.8333, + "step": 32977 + }, + { + "epoch": 1.62, + "grad_norm": 0.6047820448875427, + "learning_rate": 0.00026359328961476057, + "loss": 3.1222, + "step": 32978 + }, + { + "epoch": 1.62, + "grad_norm": 0.6275504231452942, + "learning_rate": 0.00026357800726424833, + "loss": 3.3318, + "step": 32979 + }, + { + "epoch": 1.62, + "grad_norm": 0.6080473065376282, + "learning_rate": 0.0002635627250096647, + "loss": 2.9658, + "step": 32980 + }, + { + "epoch": 1.62, + "grad_norm": 0.6527457237243652, + "learning_rate": 0.00026354744285104977, + "loss": 3.0866, + "step": 32981 + }, + { + "epoch": 1.62, + "grad_norm": 0.6376957297325134, + "learning_rate": 0.0002635321607884439, + "loss": 3.1455, + "step": 32982 + }, + { + "epoch": 1.62, + "grad_norm": 0.6095510125160217, + "learning_rate": 0.0002635168788218874, + "loss": 2.9285, + "step": 32983 + }, + { + "epoch": 1.62, + "grad_norm": 0.6063658595085144, + "learning_rate": 0.0002635015969514202, + "loss": 3.2578, + "step": 32984 + }, + { + "epoch": 1.62, + "grad_norm": 0.5850301384925842, + "learning_rate": 0.00026348631517708295, + "loss": 3.193, + "step": 32985 + }, + { + "epoch": 1.62, + "grad_norm": 0.6624311804771423, + "learning_rate": 0.00026347103349891565, + "loss": 3.1238, + "step": 32986 + }, + { + "epoch": 1.62, + "grad_norm": 0.5807081460952759, + "learning_rate": 0.00026345575191695855, + "loss": 3.0628, + "step": 32987 + }, + { + "epoch": 1.62, + "grad_norm": 0.5957835912704468, + "learning_rate": 0.0002634404704312521, + "loss": 2.947, + "step": 32988 + }, + { + "epoch": 1.62, + "grad_norm": 0.6118351817131042, + "learning_rate": 0.00026342518904183624, + "loss": 3.0652, + "step": 32989 + }, + { + "epoch": 1.62, + "grad_norm": 0.5782414674758911, + "learning_rate": 0.0002634099077487516, + "loss": 2.9368, + "step": 32990 + }, + { + "epoch": 1.62, + "grad_norm": 0.6376646161079407, + "learning_rate": 0.0002633946265520381, + "loss": 2.9095, + "step": 32991 + }, + { + "epoch": 1.62, + "grad_norm": 0.7035831809043884, + "learning_rate": 0.00026337934545173606, + "loss": 2.9032, + "step": 32992 + }, + { + "epoch": 1.62, + "grad_norm": 0.6374669075012207, + "learning_rate": 0.0002633640644478859, + "loss": 3.1269, + "step": 32993 + }, + { + "epoch": 1.62, + "grad_norm": 0.6482071280479431, + "learning_rate": 0.0002633487835405277, + "loss": 2.9167, + "step": 32994 + }, + { + "epoch": 1.62, + "grad_norm": 0.5882799029350281, + "learning_rate": 0.0002633335027297018, + "loss": 2.8802, + "step": 32995 + }, + { + "epoch": 1.62, + "grad_norm": 0.6047265529632568, + "learning_rate": 0.00026331822201544825, + "loss": 3.0355, + "step": 32996 + }, + { + "epoch": 1.62, + "grad_norm": 0.5921814441680908, + "learning_rate": 0.00026330294139780756, + "loss": 2.9725, + "step": 32997 + }, + { + "epoch": 1.62, + "grad_norm": 0.6499700546264648, + "learning_rate": 0.00026328766087681986, + "loss": 2.9638, + "step": 32998 + }, + { + "epoch": 1.62, + "grad_norm": 0.6229745149612427, + "learning_rate": 0.00026327238045252533, + "loss": 3.066, + "step": 32999 + }, + { + "epoch": 1.62, + "grad_norm": 0.6632835268974304, + "learning_rate": 0.00026325710012496434, + "loss": 2.9409, + "step": 33000 + }, + { + "epoch": 1.62, + "grad_norm": 0.5706971287727356, + "learning_rate": 0.00026324181989417707, + "loss": 2.9932, + "step": 33001 + }, + { + "epoch": 1.62, + "grad_norm": 0.6048433184623718, + "learning_rate": 0.0002632265397602037, + "loss": 2.9595, + "step": 33002 + }, + { + "epoch": 1.62, + "grad_norm": 0.6280288100242615, + "learning_rate": 0.0002632112597230847, + "loss": 3.1984, + "step": 33003 + }, + { + "epoch": 1.62, + "grad_norm": 0.6192353963851929, + "learning_rate": 0.00026319597978286, + "loss": 2.9718, + "step": 33004 + }, + { + "epoch": 1.62, + "grad_norm": 0.6082794070243835, + "learning_rate": 0.0002631806999395702, + "loss": 3.0911, + "step": 33005 + }, + { + "epoch": 1.62, + "grad_norm": 0.6182049512863159, + "learning_rate": 0.00026316542019325513, + "loss": 3.064, + "step": 33006 + }, + { + "epoch": 1.62, + "grad_norm": 0.6134260296821594, + "learning_rate": 0.0002631501405439553, + "loss": 3.0449, + "step": 33007 + }, + { + "epoch": 1.62, + "grad_norm": 0.6304604411125183, + "learning_rate": 0.0002631348609917111, + "loss": 2.9606, + "step": 33008 + }, + { + "epoch": 1.62, + "grad_norm": 0.5884554386138916, + "learning_rate": 0.00026311958153656237, + "loss": 3.0864, + "step": 33009 + }, + { + "epoch": 1.62, + "grad_norm": 0.5836405158042908, + "learning_rate": 0.00026310430217854966, + "loss": 3.4003, + "step": 33010 + }, + { + "epoch": 1.62, + "grad_norm": 0.6103641986846924, + "learning_rate": 0.000263089022917713, + "loss": 3.1367, + "step": 33011 + }, + { + "epoch": 1.62, + "grad_norm": 0.6257080435752869, + "learning_rate": 0.0002630737437540928, + "loss": 3.2796, + "step": 33012 + }, + { + "epoch": 1.62, + "grad_norm": 0.6084847450256348, + "learning_rate": 0.0002630584646877294, + "loss": 3.099, + "step": 33013 + }, + { + "epoch": 1.62, + "grad_norm": 0.6533360481262207, + "learning_rate": 0.0002630431857186627, + "loss": 3.201, + "step": 33014 + }, + { + "epoch": 1.62, + "grad_norm": 0.65522700548172, + "learning_rate": 0.0002630279068469333, + "loss": 2.9477, + "step": 33015 + }, + { + "epoch": 1.62, + "grad_norm": 0.6127893328666687, + "learning_rate": 0.0002630126280725812, + "loss": 2.9227, + "step": 33016 + }, + { + "epoch": 1.62, + "grad_norm": 0.6066474318504333, + "learning_rate": 0.00026299734939564664, + "loss": 2.9635, + "step": 33017 + }, + { + "epoch": 1.62, + "grad_norm": 0.6110500693321228, + "learning_rate": 0.0002629820708161701, + "loss": 3.1665, + "step": 33018 + }, + { + "epoch": 1.62, + "grad_norm": 0.6653189063072205, + "learning_rate": 0.00026296679233419153, + "loss": 3.1245, + "step": 33019 + }, + { + "epoch": 1.62, + "grad_norm": 0.5800127983093262, + "learning_rate": 0.00026295151394975145, + "loss": 2.8873, + "step": 33020 + }, + { + "epoch": 1.62, + "grad_norm": 0.6234315037727356, + "learning_rate": 0.0002629362356628898, + "loss": 2.9345, + "step": 33021 + }, + { + "epoch": 1.62, + "grad_norm": 0.6198316216468811, + "learning_rate": 0.000262920957473647, + "loss": 2.8847, + "step": 33022 + }, + { + "epoch": 1.62, + "grad_norm": 0.6274871230125427, + "learning_rate": 0.0002629056793820634, + "loss": 3.0915, + "step": 33023 + }, + { + "epoch": 1.62, + "grad_norm": 0.6605402231216431, + "learning_rate": 0.0002628904013881789, + "loss": 2.9215, + "step": 33024 + }, + { + "epoch": 1.62, + "grad_norm": 0.634557843208313, + "learning_rate": 0.00026287512349203407, + "loss": 2.955, + "step": 33025 + }, + { + "epoch": 1.62, + "grad_norm": 0.6039037108421326, + "learning_rate": 0.000262859845693669, + "loss": 3.1472, + "step": 33026 + }, + { + "epoch": 1.62, + "grad_norm": 0.6004440784454346, + "learning_rate": 0.000262844567993124, + "loss": 3.1694, + "step": 33027 + }, + { + "epoch": 1.62, + "grad_norm": 0.5967251658439636, + "learning_rate": 0.0002628292903904391, + "loss": 2.922, + "step": 33028 + }, + { + "epoch": 1.62, + "grad_norm": 0.5802043676376343, + "learning_rate": 0.0002628140128856547, + "loss": 3.0168, + "step": 33029 + }, + { + "epoch": 1.62, + "grad_norm": 0.5677957534790039, + "learning_rate": 0.00026279873547881117, + "loss": 2.8979, + "step": 33030 + }, + { + "epoch": 1.62, + "grad_norm": 0.6296228170394897, + "learning_rate": 0.0002627834581699486, + "loss": 2.9727, + "step": 33031 + }, + { + "epoch": 1.62, + "grad_norm": 0.6011860966682434, + "learning_rate": 0.0002627681809591072, + "loss": 3.1614, + "step": 33032 + }, + { + "epoch": 1.62, + "grad_norm": 0.6390479803085327, + "learning_rate": 0.0002627529038463272, + "loss": 3.0619, + "step": 33033 + }, + { + "epoch": 1.62, + "grad_norm": 0.7037544846534729, + "learning_rate": 0.0002627376268316489, + "loss": 3.0301, + "step": 33034 + }, + { + "epoch": 1.62, + "grad_norm": 0.6357463598251343, + "learning_rate": 0.00026272234991511263, + "loss": 3.0546, + "step": 33035 + }, + { + "epoch": 1.62, + "grad_norm": 0.5906470417976379, + "learning_rate": 0.0002627070730967584, + "loss": 3.1207, + "step": 33036 + }, + { + "epoch": 1.62, + "grad_norm": 0.6273382306098938, + "learning_rate": 0.0002626917963766266, + "loss": 3.0675, + "step": 33037 + }, + { + "epoch": 1.62, + "grad_norm": 0.5891216993331909, + "learning_rate": 0.00026267651975475745, + "loss": 3.158, + "step": 33038 + }, + { + "epoch": 1.62, + "grad_norm": 0.6417419910430908, + "learning_rate": 0.00026266124323119107, + "loss": 3.1118, + "step": 33039 + }, + { + "epoch": 1.62, + "grad_norm": 0.5798816680908203, + "learning_rate": 0.0002626459668059679, + "loss": 3.1448, + "step": 33040 + }, + { + "epoch": 1.62, + "grad_norm": 0.6085364818572998, + "learning_rate": 0.000262630690479128, + "loss": 3.1527, + "step": 33041 + }, + { + "epoch": 1.62, + "grad_norm": 0.6986152529716492, + "learning_rate": 0.00026261541425071176, + "loss": 2.9744, + "step": 33042 + }, + { + "epoch": 1.62, + "grad_norm": 0.5539711713790894, + "learning_rate": 0.0002626001381207592, + "loss": 3.1058, + "step": 33043 + }, + { + "epoch": 1.62, + "grad_norm": 0.6884754300117493, + "learning_rate": 0.00026258486208931075, + "loss": 3.0357, + "step": 33044 + }, + { + "epoch": 1.62, + "grad_norm": 0.6211069226264954, + "learning_rate": 0.0002625695861564066, + "loss": 3.2542, + "step": 33045 + }, + { + "epoch": 1.62, + "grad_norm": 0.6055322885513306, + "learning_rate": 0.00026255431032208684, + "loss": 3.275, + "step": 33046 + }, + { + "epoch": 1.62, + "grad_norm": 0.581346333026886, + "learning_rate": 0.00026253903458639197, + "loss": 2.9475, + "step": 33047 + }, + { + "epoch": 1.62, + "grad_norm": 0.5834121704101562, + "learning_rate": 0.00026252375894936193, + "loss": 3.3297, + "step": 33048 + }, + { + "epoch": 1.62, + "grad_norm": 0.610347330570221, + "learning_rate": 0.0002625084834110371, + "loss": 2.9988, + "step": 33049 + }, + { + "epoch": 1.62, + "grad_norm": 0.6127068400382996, + "learning_rate": 0.00026249320797145785, + "loss": 2.843, + "step": 33050 + }, + { + "epoch": 1.62, + "grad_norm": 0.5858662128448486, + "learning_rate": 0.0002624779326306641, + "loss": 3.0937, + "step": 33051 + }, + { + "epoch": 1.62, + "grad_norm": 0.5876694917678833, + "learning_rate": 0.00026246265738869647, + "loss": 2.9783, + "step": 33052 + }, + { + "epoch": 1.62, + "grad_norm": 0.622711718082428, + "learning_rate": 0.0002624473822455948, + "loss": 2.9802, + "step": 33053 + }, + { + "epoch": 1.62, + "grad_norm": 0.5976312160491943, + "learning_rate": 0.0002624321072013995, + "loss": 3.0595, + "step": 33054 + }, + { + "epoch": 1.62, + "grad_norm": 0.5904166102409363, + "learning_rate": 0.0002624168322561509, + "loss": 2.9977, + "step": 33055 + }, + { + "epoch": 1.62, + "grad_norm": 0.7579708099365234, + "learning_rate": 0.0002624015574098891, + "loss": 2.9153, + "step": 33056 + }, + { + "epoch": 1.62, + "grad_norm": 0.6306504011154175, + "learning_rate": 0.0002623862826626544, + "loss": 3.1642, + "step": 33057 + }, + { + "epoch": 1.62, + "grad_norm": 0.6161608695983887, + "learning_rate": 0.0002623710080144868, + "loss": 3.1045, + "step": 33058 + }, + { + "epoch": 1.62, + "grad_norm": 0.6324542760848999, + "learning_rate": 0.0002623557334654269, + "loss": 3.1015, + "step": 33059 + }, + { + "epoch": 1.62, + "grad_norm": 0.6091515421867371, + "learning_rate": 0.00026234045901551474, + "loss": 3.0746, + "step": 33060 + }, + { + "epoch": 1.62, + "grad_norm": 0.6115736961364746, + "learning_rate": 0.0002623251846647905, + "loss": 3.0863, + "step": 33061 + }, + { + "epoch": 1.62, + "grad_norm": 0.6223326325416565, + "learning_rate": 0.00026230991041329457, + "loss": 2.998, + "step": 33062 + }, + { + "epoch": 1.62, + "grad_norm": 0.5899375677108765, + "learning_rate": 0.000262294636261067, + "loss": 3.0513, + "step": 33063 + }, + { + "epoch": 1.62, + "grad_norm": 0.577978789806366, + "learning_rate": 0.00026227936220814805, + "loss": 3.0171, + "step": 33064 + }, + { + "epoch": 1.62, + "grad_norm": 0.5958923697471619, + "learning_rate": 0.00026226408825457813, + "loss": 2.9265, + "step": 33065 + }, + { + "epoch": 1.62, + "grad_norm": 0.6276746988296509, + "learning_rate": 0.0002622488144003973, + "loss": 2.8662, + "step": 33066 + }, + { + "epoch": 1.62, + "grad_norm": 0.5911863446235657, + "learning_rate": 0.0002622335406456459, + "loss": 3.0632, + "step": 33067 + }, + { + "epoch": 1.62, + "grad_norm": 0.6043195724487305, + "learning_rate": 0.0002622182669903639, + "loss": 2.8693, + "step": 33068 + }, + { + "epoch": 1.62, + "grad_norm": 0.6209897994995117, + "learning_rate": 0.0002622029934345917, + "loss": 2.8144, + "step": 33069 + }, + { + "epoch": 1.62, + "grad_norm": 0.5737490653991699, + "learning_rate": 0.0002621877199783697, + "loss": 3.1074, + "step": 33070 + }, + { + "epoch": 1.62, + "grad_norm": 0.5962395668029785, + "learning_rate": 0.00026217244662173793, + "loss": 2.9208, + "step": 33071 + }, + { + "epoch": 1.62, + "grad_norm": 0.6508395671844482, + "learning_rate": 0.00026215717336473674, + "loss": 2.7527, + "step": 33072 + }, + { + "epoch": 1.62, + "grad_norm": 0.6254932284355164, + "learning_rate": 0.0002621419002074061, + "loss": 2.7828, + "step": 33073 + }, + { + "epoch": 1.62, + "grad_norm": 0.6272798180580139, + "learning_rate": 0.0002621266271497865, + "loss": 3.0308, + "step": 33074 + }, + { + "epoch": 1.62, + "grad_norm": 0.5974328517913818, + "learning_rate": 0.0002621113541919181, + "loss": 2.8549, + "step": 33075 + }, + { + "epoch": 1.62, + "grad_norm": 0.6224279403686523, + "learning_rate": 0.000262096081333841, + "loss": 2.9043, + "step": 33076 + }, + { + "epoch": 1.62, + "grad_norm": 0.6068066358566284, + "learning_rate": 0.0002620808085755956, + "loss": 3.1066, + "step": 33077 + }, + { + "epoch": 1.62, + "grad_norm": 0.6328063011169434, + "learning_rate": 0.0002620655359172221, + "loss": 3.0899, + "step": 33078 + }, + { + "epoch": 1.62, + "grad_norm": 0.6107156276702881, + "learning_rate": 0.00026205026335876047, + "loss": 3.0569, + "step": 33079 + }, + { + "epoch": 1.62, + "grad_norm": 0.6129263639450073, + "learning_rate": 0.00026203499090025136, + "loss": 3.0112, + "step": 33080 + }, + { + "epoch": 1.62, + "grad_norm": 0.6082353591918945, + "learning_rate": 0.00026201971854173474, + "loss": 3.0659, + "step": 33081 + }, + { + "epoch": 1.62, + "grad_norm": 0.6048126816749573, + "learning_rate": 0.0002620044462832509, + "loss": 2.9859, + "step": 33082 + }, + { + "epoch": 1.62, + "grad_norm": 0.5699526071548462, + "learning_rate": 0.0002619891741248399, + "loss": 3.1299, + "step": 33083 + }, + { + "epoch": 1.62, + "grad_norm": 0.5666524767875671, + "learning_rate": 0.0002619739020665422, + "loss": 3.215, + "step": 33084 + }, + { + "epoch": 1.62, + "grad_norm": 0.5848353505134583, + "learning_rate": 0.00026195863010839793, + "loss": 3.0975, + "step": 33085 + }, + { + "epoch": 1.62, + "grad_norm": 0.5911352634429932, + "learning_rate": 0.0002619433582504472, + "loss": 3.1146, + "step": 33086 + }, + { + "epoch": 1.62, + "grad_norm": 0.6226699352264404, + "learning_rate": 0.0002619280864927305, + "loss": 2.8967, + "step": 33087 + }, + { + "epoch": 1.62, + "grad_norm": 0.5983354449272156, + "learning_rate": 0.0002619128148352878, + "loss": 3.0173, + "step": 33088 + }, + { + "epoch": 1.62, + "grad_norm": 0.643765389919281, + "learning_rate": 0.00026189754327815936, + "loss": 3.1155, + "step": 33089 + }, + { + "epoch": 1.62, + "grad_norm": 0.5957856774330139, + "learning_rate": 0.0002618822718213856, + "loss": 2.9854, + "step": 33090 + }, + { + "epoch": 1.62, + "grad_norm": 0.565482497215271, + "learning_rate": 0.00026186700046500644, + "loss": 2.9022, + "step": 33091 + }, + { + "epoch": 1.62, + "grad_norm": 0.5913927555084229, + "learning_rate": 0.0002618517292090624, + "loss": 3.1603, + "step": 33092 + }, + { + "epoch": 1.62, + "grad_norm": 0.6006325483322144, + "learning_rate": 0.00026183645805359353, + "loss": 3.2807, + "step": 33093 + }, + { + "epoch": 1.62, + "grad_norm": 0.6765154004096985, + "learning_rate": 0.00026182118699863996, + "loss": 3.0531, + "step": 33094 + }, + { + "epoch": 1.62, + "grad_norm": 0.6251957416534424, + "learning_rate": 0.0002618059160442422, + "loss": 2.9957, + "step": 33095 + }, + { + "epoch": 1.62, + "grad_norm": 0.6182376742362976, + "learning_rate": 0.00026179064519044027, + "loss": 3.0766, + "step": 33096 + }, + { + "epoch": 1.62, + "grad_norm": 0.5697817206382751, + "learning_rate": 0.00026177537443727443, + "loss": 2.8416, + "step": 33097 + }, + { + "epoch": 1.62, + "grad_norm": 0.6249179244041443, + "learning_rate": 0.00026176010378478484, + "loss": 3.007, + "step": 33098 + }, + { + "epoch": 1.62, + "grad_norm": 0.6290808320045471, + "learning_rate": 0.0002617448332330118, + "loss": 3.235, + "step": 33099 + }, + { + "epoch": 1.62, + "grad_norm": 0.6982139348983765, + "learning_rate": 0.0002617295627819956, + "loss": 3.0333, + "step": 33100 + }, + { + "epoch": 1.62, + "grad_norm": 0.6556278467178345, + "learning_rate": 0.00026171429243177614, + "loss": 3.0093, + "step": 33101 + }, + { + "epoch": 1.62, + "grad_norm": 0.6192794442176819, + "learning_rate": 0.0002616990221823941, + "loss": 3.0252, + "step": 33102 + }, + { + "epoch": 1.62, + "grad_norm": 0.6167252659797668, + "learning_rate": 0.0002616837520338894, + "loss": 2.9509, + "step": 33103 + }, + { + "epoch": 1.62, + "grad_norm": 0.651763379573822, + "learning_rate": 0.0002616684819863023, + "loss": 3.0517, + "step": 33104 + }, + { + "epoch": 1.62, + "grad_norm": 0.6661128997802734, + "learning_rate": 0.00026165321203967297, + "loss": 2.9474, + "step": 33105 + }, + { + "epoch": 1.62, + "grad_norm": 0.6066205501556396, + "learning_rate": 0.00026163794219404176, + "loss": 3.1624, + "step": 33106 + }, + { + "epoch": 1.62, + "grad_norm": 0.6290202736854553, + "learning_rate": 0.0002616226724494489, + "loss": 2.8767, + "step": 33107 + }, + { + "epoch": 1.62, + "grad_norm": 0.6024856567382812, + "learning_rate": 0.0002616074028059344, + "loss": 2.9091, + "step": 33108 + }, + { + "epoch": 1.62, + "grad_norm": 0.5915390253067017, + "learning_rate": 0.00026159213326353877, + "loss": 2.9278, + "step": 33109 + }, + { + "epoch": 1.62, + "grad_norm": 0.6625373363494873, + "learning_rate": 0.00026157686382230195, + "loss": 3.3122, + "step": 33110 + }, + { + "epoch": 1.62, + "grad_norm": 0.7008400559425354, + "learning_rate": 0.00026156159448226425, + "loss": 3.002, + "step": 33111 + }, + { + "epoch": 1.62, + "grad_norm": 0.7132464051246643, + "learning_rate": 0.00026154632524346604, + "loss": 3.1318, + "step": 33112 + }, + { + "epoch": 1.62, + "grad_norm": 0.6390926837921143, + "learning_rate": 0.00026153105610594727, + "loss": 2.9645, + "step": 33113 + }, + { + "epoch": 1.62, + "grad_norm": 0.6167672872543335, + "learning_rate": 0.0002615157870697484, + "loss": 2.9288, + "step": 33114 + }, + { + "epoch": 1.62, + "grad_norm": 0.5881397724151611, + "learning_rate": 0.0002615005181349095, + "loss": 2.8726, + "step": 33115 + }, + { + "epoch": 1.62, + "grad_norm": 0.6101930737495422, + "learning_rate": 0.00026148524930147074, + "loss": 3.1134, + "step": 33116 + }, + { + "epoch": 1.62, + "grad_norm": 0.602695643901825, + "learning_rate": 0.0002614699805694726, + "loss": 2.9318, + "step": 33117 + }, + { + "epoch": 1.62, + "grad_norm": 0.6428423523902893, + "learning_rate": 0.000261454711938955, + "loss": 3.018, + "step": 33118 + }, + { + "epoch": 1.62, + "grad_norm": 0.62602299451828, + "learning_rate": 0.00026143944340995836, + "loss": 2.8108, + "step": 33119 + }, + { + "epoch": 1.62, + "grad_norm": 0.6125110387802124, + "learning_rate": 0.0002614241749825227, + "loss": 3.1053, + "step": 33120 + }, + { + "epoch": 1.62, + "grad_norm": 0.5930248498916626, + "learning_rate": 0.00026140890665668834, + "loss": 2.7055, + "step": 33121 + }, + { + "epoch": 1.62, + "grad_norm": 0.6099435091018677, + "learning_rate": 0.0002613936384324956, + "loss": 3.0313, + "step": 33122 + }, + { + "epoch": 1.62, + "grad_norm": 0.599867045879364, + "learning_rate": 0.00026137837030998444, + "loss": 3.0244, + "step": 33123 + }, + { + "epoch": 1.62, + "grad_norm": 0.6972360014915466, + "learning_rate": 0.00026136310228919537, + "loss": 3.1866, + "step": 33124 + }, + { + "epoch": 1.62, + "grad_norm": 0.6721414923667908, + "learning_rate": 0.0002613478343701684, + "loss": 3.0925, + "step": 33125 + }, + { + "epoch": 1.62, + "grad_norm": 0.6666560769081116, + "learning_rate": 0.0002613325665529437, + "loss": 3.2204, + "step": 33126 + }, + { + "epoch": 1.62, + "grad_norm": 0.577688992023468, + "learning_rate": 0.0002613172988375617, + "loss": 3.1363, + "step": 33127 + }, + { + "epoch": 1.62, + "grad_norm": 0.60993492603302, + "learning_rate": 0.00026130203122406244, + "loss": 3.2141, + "step": 33128 + }, + { + "epoch": 1.62, + "grad_norm": 0.601895809173584, + "learning_rate": 0.0002612867637124862, + "loss": 2.8998, + "step": 33129 + }, + { + "epoch": 1.62, + "grad_norm": 0.5851340889930725, + "learning_rate": 0.00026127149630287313, + "loss": 3.0916, + "step": 33130 + }, + { + "epoch": 1.62, + "grad_norm": 0.5805398225784302, + "learning_rate": 0.0002612562289952634, + "loss": 2.9505, + "step": 33131 + }, + { + "epoch": 1.62, + "grad_norm": 0.6383261680603027, + "learning_rate": 0.0002612409617896975, + "loss": 2.8642, + "step": 33132 + }, + { + "epoch": 1.62, + "grad_norm": 0.6343017816543579, + "learning_rate": 0.00026122569468621537, + "loss": 3.29, + "step": 33133 + }, + { + "epoch": 1.62, + "grad_norm": 0.6100292801856995, + "learning_rate": 0.00026121042768485737, + "loss": 2.9963, + "step": 33134 + }, + { + "epoch": 1.62, + "grad_norm": 0.5947556495666504, + "learning_rate": 0.0002611951607856635, + "loss": 2.851, + "step": 33135 + }, + { + "epoch": 1.62, + "grad_norm": 0.6661785244941711, + "learning_rate": 0.00026117989398867416, + "loss": 3.2415, + "step": 33136 + }, + { + "epoch": 1.62, + "grad_norm": 0.6562435030937195, + "learning_rate": 0.00026116462729392956, + "loss": 3.1879, + "step": 33137 + }, + { + "epoch": 1.62, + "grad_norm": 0.6270297765731812, + "learning_rate": 0.0002611493607014698, + "loss": 3.1056, + "step": 33138 + }, + { + "epoch": 1.62, + "grad_norm": 0.6291563510894775, + "learning_rate": 0.00026113409421133524, + "loss": 3.0792, + "step": 33139 + }, + { + "epoch": 1.62, + "grad_norm": 0.6328475475311279, + "learning_rate": 0.0002611188278235659, + "loss": 2.9278, + "step": 33140 + }, + { + "epoch": 1.62, + "grad_norm": 0.5976963639259338, + "learning_rate": 0.000261103561538202, + "loss": 3.1471, + "step": 33141 + }, + { + "epoch": 1.62, + "grad_norm": 0.5877504348754883, + "learning_rate": 0.000261088295355284, + "loss": 2.8909, + "step": 33142 + }, + { + "epoch": 1.62, + "grad_norm": 0.6378092169761658, + "learning_rate": 0.0002610730292748519, + "loss": 2.7738, + "step": 33143 + }, + { + "epoch": 1.62, + "grad_norm": 0.686267614364624, + "learning_rate": 0.00026105776329694597, + "loss": 3.0721, + "step": 33144 + }, + { + "epoch": 1.62, + "grad_norm": 0.6485110521316528, + "learning_rate": 0.0002610424974216063, + "loss": 3.1614, + "step": 33145 + }, + { + "epoch": 1.62, + "grad_norm": 0.6028688549995422, + "learning_rate": 0.00026102723164887325, + "loss": 3.1251, + "step": 33146 + }, + { + "epoch": 1.62, + "grad_norm": 0.5872344374656677, + "learning_rate": 0.00026101196597878704, + "loss": 3.2345, + "step": 33147 + }, + { + "epoch": 1.62, + "grad_norm": 0.661635160446167, + "learning_rate": 0.0002609967004113877, + "loss": 3.0462, + "step": 33148 + }, + { + "epoch": 1.62, + "grad_norm": 0.6713125705718994, + "learning_rate": 0.0002609814349467156, + "loss": 2.9479, + "step": 33149 + }, + { + "epoch": 1.62, + "grad_norm": 0.5891671776771545, + "learning_rate": 0.00026096616958481094, + "loss": 3.0252, + "step": 33150 + }, + { + "epoch": 1.62, + "grad_norm": 0.6562174558639526, + "learning_rate": 0.0002609509043257137, + "loss": 3.0676, + "step": 33151 + }, + { + "epoch": 1.62, + "grad_norm": 0.6034083366394043, + "learning_rate": 0.00026093563916946443, + "loss": 3.2758, + "step": 33152 + }, + { + "epoch": 1.62, + "grad_norm": 0.5874406099319458, + "learning_rate": 0.00026092037411610305, + "loss": 2.7679, + "step": 33153 + }, + { + "epoch": 1.62, + "grad_norm": 0.583278238773346, + "learning_rate": 0.00026090510916567, + "loss": 3.2504, + "step": 33154 + }, + { + "epoch": 1.62, + "grad_norm": 0.6118584871292114, + "learning_rate": 0.00026088984431820535, + "loss": 3.032, + "step": 33155 + }, + { + "epoch": 1.62, + "grad_norm": 0.6118886470794678, + "learning_rate": 0.0002608745795737492, + "loss": 2.9379, + "step": 33156 + }, + { + "epoch": 1.62, + "grad_norm": 0.5924785137176514, + "learning_rate": 0.00026085931493234204, + "loss": 3.0218, + "step": 33157 + }, + { + "epoch": 1.62, + "grad_norm": 0.6517102718353271, + "learning_rate": 0.00026084405039402385, + "loss": 2.8742, + "step": 33158 + }, + { + "epoch": 1.63, + "grad_norm": 0.6144333481788635, + "learning_rate": 0.0002608287859588349, + "loss": 2.952, + "step": 33159 + }, + { + "epoch": 1.63, + "grad_norm": 0.5906076431274414, + "learning_rate": 0.00026081352162681535, + "loss": 3.0309, + "step": 33160 + }, + { + "epoch": 1.63, + "grad_norm": 0.6224218606948853, + "learning_rate": 0.00026079825739800544, + "loss": 3.0958, + "step": 33161 + }, + { + "epoch": 1.63, + "grad_norm": 0.6227795481681824, + "learning_rate": 0.00026078299327244545, + "loss": 2.8246, + "step": 33162 + }, + { + "epoch": 1.63, + "grad_norm": 0.5719353556632996, + "learning_rate": 0.0002607677292501754, + "loss": 2.9469, + "step": 33163 + }, + { + "epoch": 1.63, + "grad_norm": 0.6005598902702332, + "learning_rate": 0.00026075246533123573, + "loss": 3.0891, + "step": 33164 + }, + { + "epoch": 1.63, + "grad_norm": 0.6221149563789368, + "learning_rate": 0.00026073720151566644, + "loss": 2.8739, + "step": 33165 + }, + { + "epoch": 1.63, + "grad_norm": 0.6196457147598267, + "learning_rate": 0.00026072193780350776, + "loss": 2.9828, + "step": 33166 + }, + { + "epoch": 1.63, + "grad_norm": 0.6265254616737366, + "learning_rate": 0.0002607066741948, + "loss": 2.8732, + "step": 33167 + }, + { + "epoch": 1.63, + "grad_norm": 0.653234601020813, + "learning_rate": 0.0002606914106895833, + "loss": 2.9117, + "step": 33168 + }, + { + "epoch": 1.63, + "grad_norm": 0.6232619285583496, + "learning_rate": 0.0002606761472878979, + "loss": 3.2086, + "step": 33169 + }, + { + "epoch": 1.63, + "grad_norm": 0.5829308032989502, + "learning_rate": 0.00026066088398978384, + "loss": 3.2108, + "step": 33170 + }, + { + "epoch": 1.63, + "grad_norm": 0.5806586742401123, + "learning_rate": 0.00026064562079528144, + "loss": 3.133, + "step": 33171 + }, + { + "epoch": 1.63, + "grad_norm": 0.5950442552566528, + "learning_rate": 0.0002606303577044311, + "loss": 3.0696, + "step": 33172 + }, + { + "epoch": 1.63, + "grad_norm": 0.6098839640617371, + "learning_rate": 0.0002606150947172726, + "loss": 3.2242, + "step": 33173 + }, + { + "epoch": 1.63, + "grad_norm": 0.6669129729270935, + "learning_rate": 0.0002605998318338465, + "loss": 2.9275, + "step": 33174 + }, + { + "epoch": 1.63, + "grad_norm": 0.5990313291549683, + "learning_rate": 0.0002605845690541928, + "loss": 3.1519, + "step": 33175 + }, + { + "epoch": 1.63, + "grad_norm": 0.614076554775238, + "learning_rate": 0.0002605693063783517, + "loss": 2.8938, + "step": 33176 + }, + { + "epoch": 1.63, + "grad_norm": 0.6242024302482605, + "learning_rate": 0.0002605540438063636, + "loss": 3.0608, + "step": 33177 + }, + { + "epoch": 1.63, + "grad_norm": 0.5861707329750061, + "learning_rate": 0.00026053878133826844, + "loss": 3.0707, + "step": 33178 + }, + { + "epoch": 1.63, + "grad_norm": 0.6213533282279968, + "learning_rate": 0.00026052351897410666, + "loss": 2.9031, + "step": 33179 + }, + { + "epoch": 1.63, + "grad_norm": 0.560062825679779, + "learning_rate": 0.0002605082567139182, + "loss": 3.173, + "step": 33180 + }, + { + "epoch": 1.63, + "grad_norm": 0.6374415159225464, + "learning_rate": 0.00026049299455774353, + "loss": 2.9135, + "step": 33181 + }, + { + "epoch": 1.63, + "grad_norm": 0.6181164979934692, + "learning_rate": 0.0002604777325056225, + "loss": 3.0568, + "step": 33182 + }, + { + "epoch": 1.63, + "grad_norm": 0.6000651121139526, + "learning_rate": 0.0002604624705575957, + "loss": 3.0938, + "step": 33183 + }, + { + "epoch": 1.63, + "grad_norm": 0.6053292155265808, + "learning_rate": 0.0002604472087137031, + "loss": 3.0198, + "step": 33184 + }, + { + "epoch": 1.63, + "grad_norm": 0.6192270517349243, + "learning_rate": 0.0002604319469739849, + "loss": 2.9611, + "step": 33185 + }, + { + "epoch": 1.63, + "grad_norm": 0.599314272403717, + "learning_rate": 0.00026041668533848135, + "loss": 3.0997, + "step": 33186 + }, + { + "epoch": 1.63, + "grad_norm": 0.613339364528656, + "learning_rate": 0.00026040142380723264, + "loss": 3.1241, + "step": 33187 + }, + { + "epoch": 1.63, + "grad_norm": 0.6284672021865845, + "learning_rate": 0.0002603861623802789, + "loss": 2.9876, + "step": 33188 + }, + { + "epoch": 1.63, + "grad_norm": 0.6124971508979797, + "learning_rate": 0.0002603709010576605, + "loss": 2.9233, + "step": 33189 + }, + { + "epoch": 1.63, + "grad_norm": 0.6334354281425476, + "learning_rate": 0.0002603556398394175, + "loss": 3.187, + "step": 33190 + }, + { + "epoch": 1.63, + "grad_norm": 0.5786687135696411, + "learning_rate": 0.0002603403787255901, + "loss": 2.8804, + "step": 33191 + }, + { + "epoch": 1.63, + "grad_norm": 0.6184731721878052, + "learning_rate": 0.00026032511771621844, + "loss": 2.9722, + "step": 33192 + }, + { + "epoch": 1.63, + "grad_norm": 0.591331422328949, + "learning_rate": 0.00026030985681134276, + "loss": 2.9679, + "step": 33193 + }, + { + "epoch": 1.63, + "grad_norm": 0.6321749091148376, + "learning_rate": 0.0002602945960110034, + "loss": 3.0021, + "step": 33194 + }, + { + "epoch": 1.63, + "grad_norm": 0.6103703379631042, + "learning_rate": 0.0002602793353152404, + "loss": 3.1078, + "step": 33195 + }, + { + "epoch": 1.63, + "grad_norm": 0.5984880924224854, + "learning_rate": 0.000260264074724094, + "loss": 3.0928, + "step": 33196 + }, + { + "epoch": 1.63, + "grad_norm": 0.5951915979385376, + "learning_rate": 0.0002602488142376043, + "loss": 2.8636, + "step": 33197 + }, + { + "epoch": 1.63, + "grad_norm": 0.656234622001648, + "learning_rate": 0.00026023355385581165, + "loss": 3.2596, + "step": 33198 + }, + { + "epoch": 1.63, + "grad_norm": 0.6170872449874878, + "learning_rate": 0.0002602182935787562, + "loss": 3.0438, + "step": 33199 + }, + { + "epoch": 1.63, + "grad_norm": 0.6340166926383972, + "learning_rate": 0.00026020303340647797, + "loss": 3.1691, + "step": 33200 + }, + { + "epoch": 1.63, + "grad_norm": 0.6261439323425293, + "learning_rate": 0.00026018777333901744, + "loss": 2.905, + "step": 33201 + }, + { + "epoch": 1.63, + "grad_norm": 0.5833534598350525, + "learning_rate": 0.00026017251337641454, + "loss": 3.0735, + "step": 33202 + }, + { + "epoch": 1.63, + "grad_norm": 0.6300815343856812, + "learning_rate": 0.00026015725351870956, + "loss": 3.054, + "step": 33203 + }, + { + "epoch": 1.63, + "grad_norm": 0.6257278919219971, + "learning_rate": 0.00026014199376594283, + "loss": 3.032, + "step": 33204 + }, + { + "epoch": 1.63, + "grad_norm": 0.607824444770813, + "learning_rate": 0.00026012673411815435, + "loss": 2.8231, + "step": 33205 + }, + { + "epoch": 1.63, + "grad_norm": 0.6163040399551392, + "learning_rate": 0.00026011147457538446, + "loss": 3.0407, + "step": 33206 + }, + { + "epoch": 1.63, + "grad_norm": 0.6370983719825745, + "learning_rate": 0.0002600962151376731, + "loss": 2.9238, + "step": 33207 + }, + { + "epoch": 1.63, + "grad_norm": 0.5754092335700989, + "learning_rate": 0.0002600809558050607, + "loss": 2.9017, + "step": 33208 + }, + { + "epoch": 1.63, + "grad_norm": 0.6198747754096985, + "learning_rate": 0.00026006569657758747, + "loss": 3.0832, + "step": 33209 + }, + { + "epoch": 1.63, + "grad_norm": 0.6094009876251221, + "learning_rate": 0.00026005043745529335, + "loss": 3.0943, + "step": 33210 + }, + { + "epoch": 1.63, + "grad_norm": 0.6221636533737183, + "learning_rate": 0.00026003517843821883, + "loss": 2.969, + "step": 33211 + }, + { + "epoch": 1.63, + "grad_norm": 0.6165490746498108, + "learning_rate": 0.0002600199195264039, + "loss": 3.0619, + "step": 33212 + }, + { + "epoch": 1.63, + "grad_norm": 0.6335442066192627, + "learning_rate": 0.0002600046607198888, + "loss": 3.0032, + "step": 33213 + }, + { + "epoch": 1.63, + "grad_norm": 0.608069658279419, + "learning_rate": 0.00025998940201871375, + "loss": 2.9305, + "step": 33214 + }, + { + "epoch": 1.63, + "grad_norm": 0.5837422609329224, + "learning_rate": 0.00025997414342291883, + "loss": 3.1584, + "step": 33215 + }, + { + "epoch": 1.63, + "grad_norm": 0.6601833701133728, + "learning_rate": 0.00025995888493254447, + "loss": 3.0164, + "step": 33216 + }, + { + "epoch": 1.63, + "grad_norm": 0.6254077553749084, + "learning_rate": 0.0002599436265476306, + "loss": 2.9673, + "step": 33217 + }, + { + "epoch": 1.63, + "grad_norm": 0.6094150543212891, + "learning_rate": 0.0002599283682682175, + "loss": 3.217, + "step": 33218 + }, + { + "epoch": 1.63, + "grad_norm": 0.6002028584480286, + "learning_rate": 0.00025991311009434544, + "loss": 2.9381, + "step": 33219 + }, + { + "epoch": 1.63, + "grad_norm": 0.6100465655326843, + "learning_rate": 0.0002598978520260545, + "loss": 3.05, + "step": 33220 + }, + { + "epoch": 1.63, + "grad_norm": 0.6161542534828186, + "learning_rate": 0.0002598825940633849, + "loss": 3.0822, + "step": 33221 + }, + { + "epoch": 1.63, + "grad_norm": 0.6322788000106812, + "learning_rate": 0.0002598673362063768, + "loss": 3.0368, + "step": 33222 + }, + { + "epoch": 1.63, + "grad_norm": 0.617686927318573, + "learning_rate": 0.0002598520784550704, + "loss": 3.1602, + "step": 33223 + }, + { + "epoch": 1.63, + "grad_norm": 0.6013792753219604, + "learning_rate": 0.00025983682080950597, + "loss": 3.0759, + "step": 33224 + }, + { + "epoch": 1.63, + "grad_norm": 0.5862300992012024, + "learning_rate": 0.0002598215632697235, + "loss": 3.2734, + "step": 33225 + }, + { + "epoch": 1.63, + "grad_norm": 0.5786439776420593, + "learning_rate": 0.00025980630583576347, + "loss": 2.8391, + "step": 33226 + }, + { + "epoch": 1.63, + "grad_norm": 0.6323323249816895, + "learning_rate": 0.0002597910485076658, + "loss": 3.1474, + "step": 33227 + }, + { + "epoch": 1.63, + "grad_norm": 0.5810719728469849, + "learning_rate": 0.0002597757912854707, + "loss": 3.1847, + "step": 33228 + }, + { + "epoch": 1.63, + "grad_norm": 0.6033740639686584, + "learning_rate": 0.00025976053416921856, + "loss": 3.0101, + "step": 33229 + }, + { + "epoch": 1.63, + "grad_norm": 0.6337068676948547, + "learning_rate": 0.00025974527715894936, + "loss": 2.9007, + "step": 33230 + }, + { + "epoch": 1.63, + "grad_norm": 0.6159489750862122, + "learning_rate": 0.0002597300202547034, + "loss": 3.0461, + "step": 33231 + }, + { + "epoch": 1.63, + "grad_norm": 0.5909284949302673, + "learning_rate": 0.0002597147634565207, + "loss": 3.0588, + "step": 33232 + }, + { + "epoch": 1.63, + "grad_norm": 0.5779708623886108, + "learning_rate": 0.0002596995067644416, + "loss": 3.0699, + "step": 33233 + }, + { + "epoch": 1.63, + "grad_norm": 0.618440568447113, + "learning_rate": 0.0002596842501785063, + "loss": 3.1235, + "step": 33234 + }, + { + "epoch": 1.63, + "grad_norm": 0.6726293563842773, + "learning_rate": 0.0002596689936987549, + "loss": 2.8195, + "step": 33235 + }, + { + "epoch": 1.63, + "grad_norm": 0.6066452264785767, + "learning_rate": 0.0002596537373252277, + "loss": 3.019, + "step": 33236 + }, + { + "epoch": 1.63, + "grad_norm": 0.6022939085960388, + "learning_rate": 0.00025963848105796465, + "loss": 2.9119, + "step": 33237 + }, + { + "epoch": 1.63, + "grad_norm": 0.590732991695404, + "learning_rate": 0.00025962322489700613, + "loss": 2.8292, + "step": 33238 + }, + { + "epoch": 1.63, + "grad_norm": 0.5825474262237549, + "learning_rate": 0.0002596079688423923, + "loss": 2.9209, + "step": 33239 + }, + { + "epoch": 1.63, + "grad_norm": 0.6330576539039612, + "learning_rate": 0.0002595927128941632, + "loss": 2.9113, + "step": 33240 + }, + { + "epoch": 1.63, + "grad_norm": 0.5879871845245361, + "learning_rate": 0.00025957745705235927, + "loss": 2.9726, + "step": 33241 + }, + { + "epoch": 1.63, + "grad_norm": 0.6738741993904114, + "learning_rate": 0.0002595622013170204, + "loss": 2.8329, + "step": 33242 + }, + { + "epoch": 1.63, + "grad_norm": 0.6530206203460693, + "learning_rate": 0.0002595469456881869, + "loss": 3.0572, + "step": 33243 + }, + { + "epoch": 1.63, + "grad_norm": 0.6073220372200012, + "learning_rate": 0.00025953169016589917, + "loss": 2.9772, + "step": 33244 + }, + { + "epoch": 1.63, + "grad_norm": 0.6424898505210876, + "learning_rate": 0.000259516434750197, + "loss": 3.0354, + "step": 33245 + }, + { + "epoch": 1.63, + "grad_norm": 0.6124472618103027, + "learning_rate": 0.00025950117944112085, + "loss": 3.0896, + "step": 33246 + }, + { + "epoch": 1.63, + "grad_norm": 0.6103495955467224, + "learning_rate": 0.00025948592423871063, + "loss": 3.0801, + "step": 33247 + }, + { + "epoch": 1.63, + "grad_norm": 0.6181695461273193, + "learning_rate": 0.0002594706691430068, + "loss": 3.0102, + "step": 33248 + }, + { + "epoch": 1.63, + "grad_norm": 0.6263618469238281, + "learning_rate": 0.0002594554141540495, + "loss": 2.9957, + "step": 33249 + }, + { + "epoch": 1.63, + "grad_norm": 0.6513804793357849, + "learning_rate": 0.00025944015927187867, + "loss": 3.1224, + "step": 33250 + }, + { + "epoch": 1.63, + "grad_norm": 0.6265279054641724, + "learning_rate": 0.0002594249044965348, + "loss": 2.7665, + "step": 33251 + }, + { + "epoch": 1.63, + "grad_norm": 0.651552140712738, + "learning_rate": 0.0002594096498280579, + "loss": 2.906, + "step": 33252 + }, + { + "epoch": 1.63, + "grad_norm": 0.6316579580307007, + "learning_rate": 0.00025939439526648803, + "loss": 2.9716, + "step": 33253 + }, + { + "epoch": 1.63, + "grad_norm": 0.5931977033615112, + "learning_rate": 0.0002593791408118657, + "loss": 3.1162, + "step": 33254 + }, + { + "epoch": 1.63, + "grad_norm": 0.6142486333847046, + "learning_rate": 0.0002593638864642307, + "loss": 3.0778, + "step": 33255 + }, + { + "epoch": 1.63, + "grad_norm": 0.5705798268318176, + "learning_rate": 0.0002593486322236236, + "loss": 3.136, + "step": 33256 + }, + { + "epoch": 1.63, + "grad_norm": 0.6078622341156006, + "learning_rate": 0.0002593333780900843, + "loss": 3.0166, + "step": 33257 + }, + { + "epoch": 1.63, + "grad_norm": 0.5794697999954224, + "learning_rate": 0.00025931812406365293, + "loss": 2.999, + "step": 33258 + }, + { + "epoch": 1.63, + "grad_norm": 0.6560611724853516, + "learning_rate": 0.00025930287014437, + "loss": 2.8884, + "step": 33259 + }, + { + "epoch": 1.63, + "grad_norm": 0.6472424268722534, + "learning_rate": 0.00025928761633227534, + "loss": 3.0264, + "step": 33260 + }, + { + "epoch": 1.63, + "grad_norm": 0.6192428469657898, + "learning_rate": 0.0002592723626274094, + "loss": 2.9969, + "step": 33261 + }, + { + "epoch": 1.63, + "grad_norm": 0.6398415565490723, + "learning_rate": 0.0002592571090298121, + "loss": 3.0716, + "step": 33262 + }, + { + "epoch": 1.63, + "grad_norm": 0.5866214036941528, + "learning_rate": 0.0002592418555395238, + "loss": 3.1818, + "step": 33263 + }, + { + "epoch": 1.63, + "grad_norm": 0.6058687567710876, + "learning_rate": 0.00025922660215658457, + "loss": 3.0925, + "step": 33264 + }, + { + "epoch": 1.63, + "grad_norm": 0.5742605924606323, + "learning_rate": 0.00025921134888103454, + "loss": 2.9712, + "step": 33265 + }, + { + "epoch": 1.63, + "grad_norm": 0.6119689345359802, + "learning_rate": 0.0002591960957129141, + "loss": 2.8314, + "step": 33266 + }, + { + "epoch": 1.63, + "grad_norm": 0.645915150642395, + "learning_rate": 0.00025918084265226326, + "loss": 2.8669, + "step": 33267 + }, + { + "epoch": 1.63, + "grad_norm": 0.630989134311676, + "learning_rate": 0.0002591655896991222, + "loss": 3.1193, + "step": 33268 + }, + { + "epoch": 1.63, + "grad_norm": 0.5954846739768982, + "learning_rate": 0.00025915033685353106, + "loss": 2.9961, + "step": 33269 + }, + { + "epoch": 1.63, + "grad_norm": 0.5985564589500427, + "learning_rate": 0.00025913508411553015, + "loss": 3.0826, + "step": 33270 + }, + { + "epoch": 1.63, + "grad_norm": 0.623395562171936, + "learning_rate": 0.0002591198314851596, + "loss": 3.1322, + "step": 33271 + }, + { + "epoch": 1.63, + "grad_norm": 0.5947928428649902, + "learning_rate": 0.00025910457896245943, + "loss": 3.0461, + "step": 33272 + }, + { + "epoch": 1.63, + "grad_norm": 0.594084620475769, + "learning_rate": 0.00025908932654747005, + "loss": 2.9222, + "step": 33273 + }, + { + "epoch": 1.63, + "grad_norm": 0.6043190956115723, + "learning_rate": 0.00025907407424023146, + "loss": 3.0105, + "step": 33274 + }, + { + "epoch": 1.63, + "grad_norm": 0.6263400912284851, + "learning_rate": 0.0002590588220407838, + "loss": 3.1322, + "step": 33275 + }, + { + "epoch": 1.63, + "grad_norm": 0.5728640556335449, + "learning_rate": 0.0002590435699491675, + "loss": 2.9339, + "step": 33276 + }, + { + "epoch": 1.63, + "grad_norm": 0.6283183097839355, + "learning_rate": 0.00025902831796542234, + "loss": 3.0203, + "step": 33277 + }, + { + "epoch": 1.63, + "grad_norm": 0.6112415790557861, + "learning_rate": 0.0002590130660895889, + "loss": 2.7254, + "step": 33278 + }, + { + "epoch": 1.63, + "grad_norm": 0.5928776860237122, + "learning_rate": 0.0002589978143217071, + "loss": 3.0446, + "step": 33279 + }, + { + "epoch": 1.63, + "grad_norm": 0.6818798780441284, + "learning_rate": 0.00025898256266181704, + "loss": 2.9687, + "step": 33280 + }, + { + "epoch": 1.63, + "grad_norm": 0.5973649621009827, + "learning_rate": 0.00025896731110995917, + "loss": 3.2809, + "step": 33281 + }, + { + "epoch": 1.63, + "grad_norm": 0.6226446628570557, + "learning_rate": 0.00025895205966617343, + "loss": 3.058, + "step": 33282 + }, + { + "epoch": 1.63, + "grad_norm": 0.6349427103996277, + "learning_rate": 0.00025893680833050017, + "loss": 3.167, + "step": 33283 + }, + { + "epoch": 1.63, + "grad_norm": 0.6562497019767761, + "learning_rate": 0.0002589215571029793, + "loss": 3.0447, + "step": 33284 + }, + { + "epoch": 1.63, + "grad_norm": 0.5976384878158569, + "learning_rate": 0.0002589063059836512, + "loss": 3.0494, + "step": 33285 + }, + { + "epoch": 1.63, + "grad_norm": 0.62012779712677, + "learning_rate": 0.00025889105497255605, + "loss": 3.164, + "step": 33286 + }, + { + "epoch": 1.63, + "grad_norm": 0.5701225996017456, + "learning_rate": 0.00025887580406973385, + "loss": 3.0588, + "step": 33287 + }, + { + "epoch": 1.63, + "grad_norm": 0.6170254945755005, + "learning_rate": 0.0002588605532752249, + "loss": 2.9552, + "step": 33288 + }, + { + "epoch": 1.63, + "grad_norm": 0.6369534730911255, + "learning_rate": 0.0002588453025890694, + "loss": 3.1661, + "step": 33289 + }, + { + "epoch": 1.63, + "grad_norm": 0.6616371273994446, + "learning_rate": 0.00025883005201130736, + "loss": 3.0857, + "step": 33290 + }, + { + "epoch": 1.63, + "grad_norm": 0.6203917264938354, + "learning_rate": 0.0002588148015419791, + "loss": 3.0843, + "step": 33291 + }, + { + "epoch": 1.63, + "grad_norm": 0.6321138739585876, + "learning_rate": 0.0002587995511811247, + "loss": 2.8944, + "step": 33292 + }, + { + "epoch": 1.63, + "grad_norm": 0.6112622618675232, + "learning_rate": 0.00025878430092878446, + "loss": 3.1176, + "step": 33293 + }, + { + "epoch": 1.63, + "grad_norm": 0.6042934060096741, + "learning_rate": 0.0002587690507849983, + "loss": 2.9723, + "step": 33294 + }, + { + "epoch": 1.63, + "grad_norm": 0.6640942692756653, + "learning_rate": 0.00025875380074980644, + "loss": 2.8202, + "step": 33295 + }, + { + "epoch": 1.63, + "grad_norm": 0.5991891622543335, + "learning_rate": 0.0002587385508232493, + "loss": 2.9682, + "step": 33296 + }, + { + "epoch": 1.63, + "grad_norm": 0.6276793479919434, + "learning_rate": 0.00025872330100536683, + "loss": 2.91, + "step": 33297 + }, + { + "epoch": 1.63, + "grad_norm": 0.5799841284751892, + "learning_rate": 0.0002587080512961993, + "loss": 2.9584, + "step": 33298 + }, + { + "epoch": 1.63, + "grad_norm": 0.6178253293037415, + "learning_rate": 0.0002586928016957867, + "loss": 3.0259, + "step": 33299 + }, + { + "epoch": 1.63, + "grad_norm": 0.6273776292800903, + "learning_rate": 0.00025867755220416934, + "loss": 3.0089, + "step": 33300 + }, + { + "epoch": 1.63, + "grad_norm": 0.6046527624130249, + "learning_rate": 0.0002586623028213875, + "loss": 3.0268, + "step": 33301 + }, + { + "epoch": 1.63, + "grad_norm": 0.6330516934394836, + "learning_rate": 0.000258647053547481, + "loss": 3.0706, + "step": 33302 + }, + { + "epoch": 1.63, + "grad_norm": 0.6148566603660583, + "learning_rate": 0.0002586318043824903, + "loss": 3.0584, + "step": 33303 + }, + { + "epoch": 1.63, + "grad_norm": 0.6204240918159485, + "learning_rate": 0.0002586165553264555, + "loss": 3.0304, + "step": 33304 + }, + { + "epoch": 1.63, + "grad_norm": 0.6241191029548645, + "learning_rate": 0.0002586013063794166, + "loss": 2.7981, + "step": 33305 + }, + { + "epoch": 1.63, + "grad_norm": 0.6470986604690552, + "learning_rate": 0.000258586057541414, + "loss": 3.0272, + "step": 33306 + }, + { + "epoch": 1.63, + "grad_norm": 0.6138694286346436, + "learning_rate": 0.00025857080881248776, + "loss": 2.8922, + "step": 33307 + }, + { + "epoch": 1.63, + "grad_norm": 0.5973373055458069, + "learning_rate": 0.00025855556019267805, + "loss": 2.907, + "step": 33308 + }, + { + "epoch": 1.63, + "grad_norm": 0.599695086479187, + "learning_rate": 0.0002585403116820249, + "loss": 3.1989, + "step": 33309 + }, + { + "epoch": 1.63, + "grad_norm": 0.6345245838165283, + "learning_rate": 0.0002585250632805686, + "loss": 3.1087, + "step": 33310 + }, + { + "epoch": 1.63, + "grad_norm": 0.6359992623329163, + "learning_rate": 0.00025850981498834946, + "loss": 2.968, + "step": 33311 + }, + { + "epoch": 1.63, + "grad_norm": 0.5912989377975464, + "learning_rate": 0.0002584945668054073, + "loss": 2.9463, + "step": 33312 + }, + { + "epoch": 1.63, + "grad_norm": 0.6323924660682678, + "learning_rate": 0.00025847931873178266, + "loss": 3.3634, + "step": 33313 + }, + { + "epoch": 1.63, + "grad_norm": 0.6180119514465332, + "learning_rate": 0.00025846407076751535, + "loss": 3.0597, + "step": 33314 + }, + { + "epoch": 1.63, + "grad_norm": 0.5933579802513123, + "learning_rate": 0.0002584488229126457, + "loss": 3.1696, + "step": 33315 + }, + { + "epoch": 1.63, + "grad_norm": 0.6219573020935059, + "learning_rate": 0.0002584335751672139, + "loss": 2.9026, + "step": 33316 + }, + { + "epoch": 1.63, + "grad_norm": 0.6562425494194031, + "learning_rate": 0.00025841832753126, + "loss": 3.0916, + "step": 33317 + }, + { + "epoch": 1.63, + "grad_norm": 0.6309749484062195, + "learning_rate": 0.00025840308000482426, + "loss": 3.0988, + "step": 33318 + }, + { + "epoch": 1.63, + "grad_norm": 0.6028389930725098, + "learning_rate": 0.00025838783258794683, + "loss": 2.8867, + "step": 33319 + }, + { + "epoch": 1.63, + "grad_norm": 0.5963541269302368, + "learning_rate": 0.0002583725852806678, + "loss": 2.9588, + "step": 33320 + }, + { + "epoch": 1.63, + "grad_norm": 0.6047212481498718, + "learning_rate": 0.0002583573380830274, + "loss": 2.9557, + "step": 33321 + }, + { + "epoch": 1.63, + "grad_norm": 0.5715580582618713, + "learning_rate": 0.00025834209099506576, + "loss": 3.0006, + "step": 33322 + }, + { + "epoch": 1.63, + "grad_norm": 0.6066077351570129, + "learning_rate": 0.00025832684401682303, + "loss": 3.1405, + "step": 33323 + }, + { + "epoch": 1.63, + "grad_norm": 0.6018175482749939, + "learning_rate": 0.00025831159714833935, + "loss": 2.902, + "step": 33324 + }, + { + "epoch": 1.63, + "grad_norm": 0.6172803044319153, + "learning_rate": 0.0002582963503896549, + "loss": 2.888, + "step": 33325 + }, + { + "epoch": 1.63, + "grad_norm": 0.6167096495628357, + "learning_rate": 0.0002582811037408099, + "loss": 2.9556, + "step": 33326 + }, + { + "epoch": 1.63, + "grad_norm": 0.6224731802940369, + "learning_rate": 0.0002582658572018443, + "loss": 2.9202, + "step": 33327 + }, + { + "epoch": 1.63, + "grad_norm": 0.6047587990760803, + "learning_rate": 0.00025825061077279856, + "loss": 2.9544, + "step": 33328 + }, + { + "epoch": 1.63, + "grad_norm": 0.6331458687782288, + "learning_rate": 0.00025823536445371265, + "loss": 2.9418, + "step": 33329 + }, + { + "epoch": 1.63, + "grad_norm": 0.6286932826042175, + "learning_rate": 0.00025822011824462663, + "loss": 3.2196, + "step": 33330 + }, + { + "epoch": 1.63, + "grad_norm": 0.6198918223381042, + "learning_rate": 0.0002582048721455809, + "loss": 2.9849, + "step": 33331 + }, + { + "epoch": 1.63, + "grad_norm": 0.6311217546463013, + "learning_rate": 0.0002581896261566154, + "loss": 2.9469, + "step": 33332 + }, + { + "epoch": 1.63, + "grad_norm": 0.6329818964004517, + "learning_rate": 0.0002581743802777705, + "loss": 2.7676, + "step": 33333 + }, + { + "epoch": 1.63, + "grad_norm": 0.6387625932693481, + "learning_rate": 0.0002581591345090861, + "loss": 2.994, + "step": 33334 + }, + { + "epoch": 1.63, + "grad_norm": 0.6207517385482788, + "learning_rate": 0.0002581438888506025, + "loss": 3.1672, + "step": 33335 + }, + { + "epoch": 1.63, + "grad_norm": 0.5961693525314331, + "learning_rate": 0.00025812864330236005, + "loss": 2.8354, + "step": 33336 + }, + { + "epoch": 1.63, + "grad_norm": 0.6982114911079407, + "learning_rate": 0.00025811339786439846, + "loss": 3.005, + "step": 33337 + }, + { + "epoch": 1.63, + "grad_norm": 0.6450947523117065, + "learning_rate": 0.0002580981525367582, + "loss": 2.9924, + "step": 33338 + }, + { + "epoch": 1.63, + "grad_norm": 0.6073309779167175, + "learning_rate": 0.0002580829073194793, + "loss": 2.9336, + "step": 33339 + }, + { + "epoch": 1.63, + "grad_norm": 0.6175057291984558, + "learning_rate": 0.00025806766221260204, + "loss": 2.8445, + "step": 33340 + }, + { + "epoch": 1.63, + "grad_norm": 0.6069908738136292, + "learning_rate": 0.00025805241721616644, + "loss": 3.0606, + "step": 33341 + }, + { + "epoch": 1.63, + "grad_norm": 0.6620157957077026, + "learning_rate": 0.0002580371723302126, + "loss": 3.2177, + "step": 33342 + }, + { + "epoch": 1.63, + "grad_norm": 0.6427522897720337, + "learning_rate": 0.00025802192755478096, + "loss": 3.0998, + "step": 33343 + }, + { + "epoch": 1.63, + "grad_norm": 0.6400549411773682, + "learning_rate": 0.0002580066828899114, + "loss": 2.9967, + "step": 33344 + }, + { + "epoch": 1.63, + "grad_norm": 0.5848243832588196, + "learning_rate": 0.0002579914383356442, + "loss": 3.0598, + "step": 33345 + }, + { + "epoch": 1.63, + "grad_norm": 0.5872244238853455, + "learning_rate": 0.00025797619389201934, + "loss": 3.002, + "step": 33346 + }, + { + "epoch": 1.63, + "grad_norm": 0.5892220735549927, + "learning_rate": 0.0002579609495590772, + "loss": 3.0065, + "step": 33347 + }, + { + "epoch": 1.63, + "grad_norm": 0.5997381210327148, + "learning_rate": 0.0002579457053368578, + "loss": 2.9662, + "step": 33348 + }, + { + "epoch": 1.63, + "grad_norm": 0.645177960395813, + "learning_rate": 0.00025793046122540126, + "loss": 3.0608, + "step": 33349 + }, + { + "epoch": 1.63, + "grad_norm": 0.6021146178245544, + "learning_rate": 0.00025791521722474795, + "loss": 2.8478, + "step": 33350 + }, + { + "epoch": 1.63, + "grad_norm": 0.6015944480895996, + "learning_rate": 0.0002578999733349378, + "loss": 3.0736, + "step": 33351 + }, + { + "epoch": 1.63, + "grad_norm": 0.6046991944313049, + "learning_rate": 0.00025788472955601087, + "loss": 3.1848, + "step": 33352 + }, + { + "epoch": 1.63, + "grad_norm": 0.6653998494148254, + "learning_rate": 0.00025786948588800765, + "loss": 3.1648, + "step": 33353 + }, + { + "epoch": 1.63, + "grad_norm": 0.599899172782898, + "learning_rate": 0.00025785424233096797, + "loss": 3.2521, + "step": 33354 + }, + { + "epoch": 1.63, + "grad_norm": 0.6322702169418335, + "learning_rate": 0.0002578389988849322, + "loss": 2.8027, + "step": 33355 + }, + { + "epoch": 1.63, + "grad_norm": 0.5862153768539429, + "learning_rate": 0.0002578237555499403, + "loss": 2.9012, + "step": 33356 + }, + { + "epoch": 1.63, + "grad_norm": 0.6028646230697632, + "learning_rate": 0.0002578085123260325, + "loss": 3.1227, + "step": 33357 + }, + { + "epoch": 1.63, + "grad_norm": 0.6330054402351379, + "learning_rate": 0.00025779326921324905, + "loss": 2.8262, + "step": 33358 + }, + { + "epoch": 1.63, + "grad_norm": 0.609877347946167, + "learning_rate": 0.00025777802621162994, + "loss": 2.9177, + "step": 33359 + }, + { + "epoch": 1.63, + "grad_norm": 0.5922818183898926, + "learning_rate": 0.0002577627833212155, + "loss": 2.9613, + "step": 33360 + }, + { + "epoch": 1.63, + "grad_norm": 0.5810437798500061, + "learning_rate": 0.0002577475405420456, + "loss": 3.0444, + "step": 33361 + }, + { + "epoch": 1.63, + "grad_norm": 0.6439691781997681, + "learning_rate": 0.0002577322978741606, + "loss": 2.9434, + "step": 33362 + }, + { + "epoch": 1.64, + "grad_norm": 0.6181412935256958, + "learning_rate": 0.0002577170553176007, + "loss": 3.0512, + "step": 33363 + }, + { + "epoch": 1.64, + "grad_norm": 0.5877893567085266, + "learning_rate": 0.0002577018128724057, + "loss": 3.1475, + "step": 33364 + }, + { + "epoch": 1.64, + "grad_norm": 0.6160455942153931, + "learning_rate": 0.0002576865705386162, + "loss": 2.9531, + "step": 33365 + }, + { + "epoch": 1.64, + "grad_norm": 0.5825440883636475, + "learning_rate": 0.00025767132831627207, + "loss": 3.0221, + "step": 33366 + }, + { + "epoch": 1.64, + "grad_norm": 0.5844451189041138, + "learning_rate": 0.0002576560862054134, + "loss": 3.0393, + "step": 33367 + }, + { + "epoch": 1.64, + "grad_norm": 0.6060708165168762, + "learning_rate": 0.00025764084420608064, + "loss": 3.1585, + "step": 33368 + }, + { + "epoch": 1.64, + "grad_norm": 0.641826868057251, + "learning_rate": 0.00025762560231831365, + "loss": 3.0958, + "step": 33369 + }, + { + "epoch": 1.64, + "grad_norm": 0.6380789279937744, + "learning_rate": 0.00025761036054215274, + "loss": 2.9916, + "step": 33370 + }, + { + "epoch": 1.64, + "grad_norm": 0.624642014503479, + "learning_rate": 0.0002575951188776378, + "loss": 2.8842, + "step": 33371 + }, + { + "epoch": 1.64, + "grad_norm": 0.5973908305168152, + "learning_rate": 0.00025757987732480934, + "loss": 2.6799, + "step": 33372 + }, + { + "epoch": 1.64, + "grad_norm": 0.6174383163452148, + "learning_rate": 0.0002575646358837073, + "loss": 3.2208, + "step": 33373 + }, + { + "epoch": 1.64, + "grad_norm": 0.5949251055717468, + "learning_rate": 0.00025754939455437174, + "loss": 2.959, + "step": 33374 + }, + { + "epoch": 1.64, + "grad_norm": 0.6241964101791382, + "learning_rate": 0.000257534153336843, + "loss": 3.0481, + "step": 33375 + }, + { + "epoch": 1.64, + "grad_norm": 0.6426224112510681, + "learning_rate": 0.0002575189122311611, + "loss": 2.8685, + "step": 33376 + }, + { + "epoch": 1.64, + "grad_norm": 0.6326531171798706, + "learning_rate": 0.0002575036712373661, + "loss": 2.9726, + "step": 33377 + }, + { + "epoch": 1.64, + "grad_norm": 0.6101643443107605, + "learning_rate": 0.0002574884303554984, + "loss": 2.9489, + "step": 33378 + }, + { + "epoch": 1.64, + "grad_norm": 0.6922885179519653, + "learning_rate": 0.0002574731895855979, + "loss": 2.8549, + "step": 33379 + }, + { + "epoch": 1.64, + "grad_norm": 0.5953141450881958, + "learning_rate": 0.0002574579489277049, + "loss": 3.1971, + "step": 33380 + }, + { + "epoch": 1.64, + "grad_norm": 0.6089001893997192, + "learning_rate": 0.0002574427083818595, + "loss": 3.1966, + "step": 33381 + }, + { + "epoch": 1.64, + "grad_norm": 0.6315808296203613, + "learning_rate": 0.00025742746794810167, + "loss": 2.9393, + "step": 33382 + }, + { + "epoch": 1.64, + "grad_norm": 0.6140810251235962, + "learning_rate": 0.00025741222762647187, + "loss": 3.0597, + "step": 33383 + }, + { + "epoch": 1.64, + "grad_norm": 0.6030791997909546, + "learning_rate": 0.00025739698741701, + "loss": 2.9677, + "step": 33384 + }, + { + "epoch": 1.64, + "grad_norm": 0.6273488998413086, + "learning_rate": 0.0002573817473197563, + "loss": 3.0209, + "step": 33385 + }, + { + "epoch": 1.64, + "grad_norm": 0.5986915230751038, + "learning_rate": 0.00025736650733475074, + "loss": 3.0809, + "step": 33386 + }, + { + "epoch": 1.64, + "grad_norm": 0.6623584032058716, + "learning_rate": 0.0002573512674620337, + "loss": 3.1532, + "step": 33387 + }, + { + "epoch": 1.64, + "grad_norm": 0.5963658690452576, + "learning_rate": 0.0002573360277016453, + "loss": 3.0506, + "step": 33388 + }, + { + "epoch": 1.64, + "grad_norm": 0.6195214986801147, + "learning_rate": 0.0002573207880536254, + "loss": 3.316, + "step": 33389 + }, + { + "epoch": 1.64, + "grad_norm": 0.6641665697097778, + "learning_rate": 0.0002573055485180145, + "loss": 3.044, + "step": 33390 + }, + { + "epoch": 1.64, + "grad_norm": 0.6132287383079529, + "learning_rate": 0.00025729030909485244, + "loss": 2.805, + "step": 33391 + }, + { + "epoch": 1.64, + "grad_norm": 0.6133543252944946, + "learning_rate": 0.00025727506978417947, + "loss": 2.8478, + "step": 33392 + }, + { + "epoch": 1.64, + "grad_norm": 0.652361273765564, + "learning_rate": 0.0002572598305860359, + "loss": 3.0461, + "step": 33393 + }, + { + "epoch": 1.64, + "grad_norm": 0.5797291398048401, + "learning_rate": 0.00025724459150046157, + "loss": 3.3286, + "step": 33394 + }, + { + "epoch": 1.64, + "grad_norm": 0.6221191883087158, + "learning_rate": 0.0002572293525274969, + "loss": 2.8018, + "step": 33395 + }, + { + "epoch": 1.64, + "grad_norm": 0.6194936633110046, + "learning_rate": 0.0002572141136671817, + "loss": 2.847, + "step": 33396 + }, + { + "epoch": 1.64, + "grad_norm": 0.6495451331138611, + "learning_rate": 0.00025719887491955633, + "loss": 3.1063, + "step": 33397 + }, + { + "epoch": 1.64, + "grad_norm": 0.6331799030303955, + "learning_rate": 0.000257183636284661, + "loss": 3.1819, + "step": 33398 + }, + { + "epoch": 1.64, + "grad_norm": 0.5986855626106262, + "learning_rate": 0.0002571683977625357, + "loss": 3.0035, + "step": 33399 + }, + { + "epoch": 1.64, + "grad_norm": 0.6081706881523132, + "learning_rate": 0.0002571531593532206, + "loss": 3.0621, + "step": 33400 + }, + { + "epoch": 1.64, + "grad_norm": 0.638114333152771, + "learning_rate": 0.00025713792105675577, + "loss": 2.9936, + "step": 33401 + }, + { + "epoch": 1.64, + "grad_norm": 0.6606442928314209, + "learning_rate": 0.00025712268287318143, + "loss": 3.0516, + "step": 33402 + }, + { + "epoch": 1.64, + "grad_norm": 0.642713189125061, + "learning_rate": 0.00025710744480253776, + "loss": 2.943, + "step": 33403 + }, + { + "epoch": 1.64, + "grad_norm": 0.6082857847213745, + "learning_rate": 0.0002570922068448647, + "loss": 3.0018, + "step": 33404 + }, + { + "epoch": 1.64, + "grad_norm": 0.5871419310569763, + "learning_rate": 0.00025707696900020263, + "loss": 3.0449, + "step": 33405 + }, + { + "epoch": 1.64, + "grad_norm": 0.6455428600311279, + "learning_rate": 0.0002570617312685915, + "loss": 2.959, + "step": 33406 + }, + { + "epoch": 1.64, + "grad_norm": 0.5815387964248657, + "learning_rate": 0.00025704649365007143, + "loss": 2.7549, + "step": 33407 + }, + { + "epoch": 1.64, + "grad_norm": 0.6647642254829407, + "learning_rate": 0.0002570312561446828, + "loss": 3.2245, + "step": 33408 + }, + { + "epoch": 1.64, + "grad_norm": 0.6292085647583008, + "learning_rate": 0.00025701601875246545, + "loss": 2.8913, + "step": 33409 + }, + { + "epoch": 1.64, + "grad_norm": 0.6244023442268372, + "learning_rate": 0.00025700078147345974, + "loss": 2.9362, + "step": 33410 + }, + { + "epoch": 1.64, + "grad_norm": 0.620573878288269, + "learning_rate": 0.0002569855443077055, + "loss": 2.8535, + "step": 33411 + }, + { + "epoch": 1.64, + "grad_norm": 0.612984299659729, + "learning_rate": 0.00025697030725524325, + "loss": 2.9519, + "step": 33412 + }, + { + "epoch": 1.64, + "grad_norm": 0.6438848376274109, + "learning_rate": 0.0002569550703161129, + "loss": 2.8849, + "step": 33413 + }, + { + "epoch": 1.64, + "grad_norm": 0.5936183929443359, + "learning_rate": 0.0002569398334903545, + "loss": 2.7897, + "step": 33414 + }, + { + "epoch": 1.64, + "grad_norm": 0.614172101020813, + "learning_rate": 0.0002569245967780084, + "loss": 3.0311, + "step": 33415 + }, + { + "epoch": 1.64, + "grad_norm": 0.6163328289985657, + "learning_rate": 0.0002569093601791146, + "loss": 3.0904, + "step": 33416 + }, + { + "epoch": 1.64, + "grad_norm": 0.6245632767677307, + "learning_rate": 0.0002568941236937133, + "loss": 2.9677, + "step": 33417 + }, + { + "epoch": 1.64, + "grad_norm": 0.6340919733047485, + "learning_rate": 0.00025687888732184446, + "loss": 3.2047, + "step": 33418 + }, + { + "epoch": 1.64, + "grad_norm": 0.58950275182724, + "learning_rate": 0.0002568636510635483, + "loss": 2.9382, + "step": 33419 + }, + { + "epoch": 1.64, + "grad_norm": 0.5915112495422363, + "learning_rate": 0.00025684841491886517, + "loss": 2.7972, + "step": 33420 + }, + { + "epoch": 1.64, + "grad_norm": 0.6251772046089172, + "learning_rate": 0.0002568331788878349, + "loss": 2.9719, + "step": 33421 + }, + { + "epoch": 1.64, + "grad_norm": 0.5975916385650635, + "learning_rate": 0.0002568179429704978, + "loss": 2.9929, + "step": 33422 + }, + { + "epoch": 1.64, + "grad_norm": 0.6500588655471802, + "learning_rate": 0.0002568027071668938, + "loss": 2.9747, + "step": 33423 + }, + { + "epoch": 1.64, + "grad_norm": 0.6431196331977844, + "learning_rate": 0.00025678747147706325, + "loss": 2.9508, + "step": 33424 + }, + { + "epoch": 1.64, + "grad_norm": 0.6361487507820129, + "learning_rate": 0.0002567722359010462, + "loss": 2.9058, + "step": 33425 + }, + { + "epoch": 1.64, + "grad_norm": 0.6039143204689026, + "learning_rate": 0.00025675700043888264, + "loss": 2.9206, + "step": 33426 + }, + { + "epoch": 1.64, + "grad_norm": 0.6220351457595825, + "learning_rate": 0.000256741765090613, + "loss": 3.2211, + "step": 33427 + }, + { + "epoch": 1.64, + "grad_norm": 0.6292129158973694, + "learning_rate": 0.0002567265298562771, + "loss": 3.0292, + "step": 33428 + }, + { + "epoch": 1.64, + "grad_norm": 0.6125423312187195, + "learning_rate": 0.00025671129473591516, + "loss": 2.9991, + "step": 33429 + }, + { + "epoch": 1.64, + "grad_norm": 0.6337186098098755, + "learning_rate": 0.0002566960597295674, + "loss": 3.1525, + "step": 33430 + }, + { + "epoch": 1.64, + "grad_norm": 0.6597618460655212, + "learning_rate": 0.0002566808248372739, + "loss": 2.9801, + "step": 33431 + }, + { + "epoch": 1.64, + "grad_norm": 0.6206303238868713, + "learning_rate": 0.00025666559005907484, + "loss": 3.1365, + "step": 33432 + }, + { + "epoch": 1.64, + "grad_norm": 0.5949422717094421, + "learning_rate": 0.0002566503553950101, + "loss": 2.8709, + "step": 33433 + }, + { + "epoch": 1.64, + "grad_norm": 0.5526446104049683, + "learning_rate": 0.0002566351208451201, + "loss": 2.9796, + "step": 33434 + }, + { + "epoch": 1.64, + "grad_norm": 0.6217344999313354, + "learning_rate": 0.00025661988640944484, + "loss": 2.7793, + "step": 33435 + }, + { + "epoch": 1.64, + "grad_norm": 0.5836462378501892, + "learning_rate": 0.0002566046520880244, + "loss": 2.9644, + "step": 33436 + }, + { + "epoch": 1.64, + "grad_norm": 0.6020750999450684, + "learning_rate": 0.000256589417880899, + "loss": 3.1975, + "step": 33437 + }, + { + "epoch": 1.64, + "grad_norm": 0.6435813903808594, + "learning_rate": 0.00025657418378810873, + "loss": 2.9718, + "step": 33438 + }, + { + "epoch": 1.64, + "grad_norm": 0.6334481835365295, + "learning_rate": 0.00025655894980969356, + "loss": 2.8116, + "step": 33439 + }, + { + "epoch": 1.64, + "grad_norm": 0.6242516040802002, + "learning_rate": 0.0002565437159456939, + "loss": 2.8935, + "step": 33440 + }, + { + "epoch": 1.64, + "grad_norm": 0.6350675821304321, + "learning_rate": 0.0002565284821961497, + "loss": 2.9426, + "step": 33441 + }, + { + "epoch": 1.64, + "grad_norm": 0.6530657410621643, + "learning_rate": 0.00025651324856110116, + "loss": 3.0214, + "step": 33442 + }, + { + "epoch": 1.64, + "grad_norm": 0.6366276144981384, + "learning_rate": 0.00025649801504058826, + "loss": 2.9178, + "step": 33443 + }, + { + "epoch": 1.64, + "grad_norm": 0.629357635974884, + "learning_rate": 0.0002564827816346512, + "loss": 3.052, + "step": 33444 + }, + { + "epoch": 1.64, + "grad_norm": 0.5792384743690491, + "learning_rate": 0.00025646754834333024, + "loss": 2.9482, + "step": 33445 + }, + { + "epoch": 1.64, + "grad_norm": 0.615355908870697, + "learning_rate": 0.00025645231516666534, + "loss": 3.1283, + "step": 33446 + }, + { + "epoch": 1.64, + "grad_norm": 0.6263376474380493, + "learning_rate": 0.00025643708210469667, + "loss": 3.1832, + "step": 33447 + }, + { + "epoch": 1.64, + "grad_norm": 0.6375722289085388, + "learning_rate": 0.0002564218491574643, + "loss": 2.9832, + "step": 33448 + }, + { + "epoch": 1.64, + "grad_norm": 0.6010499596595764, + "learning_rate": 0.0002564066163250084, + "loss": 3.1208, + "step": 33449 + }, + { + "epoch": 1.64, + "grad_norm": 0.6063414216041565, + "learning_rate": 0.0002563913836073691, + "loss": 3.1275, + "step": 33450 + }, + { + "epoch": 1.64, + "grad_norm": 0.6149500012397766, + "learning_rate": 0.0002563761510045864, + "loss": 3.0182, + "step": 33451 + }, + { + "epoch": 1.64, + "grad_norm": 0.6116110682487488, + "learning_rate": 0.0002563609185167007, + "loss": 3.1028, + "step": 33452 + }, + { + "epoch": 1.64, + "grad_norm": 0.6712843775749207, + "learning_rate": 0.00025634568614375184, + "loss": 3.0418, + "step": 33453 + }, + { + "epoch": 1.64, + "grad_norm": 0.597224235534668, + "learning_rate": 0.00025633045388577996, + "loss": 3.0967, + "step": 33454 + }, + { + "epoch": 1.64, + "grad_norm": 0.609315037727356, + "learning_rate": 0.00025631522174282545, + "loss": 3.0672, + "step": 33455 + }, + { + "epoch": 1.64, + "grad_norm": 0.6235337853431702, + "learning_rate": 0.00025629998971492816, + "loss": 2.9888, + "step": 33456 + }, + { + "epoch": 1.64, + "grad_norm": 0.5854429602622986, + "learning_rate": 0.00025628475780212825, + "loss": 3.0874, + "step": 33457 + }, + { + "epoch": 1.64, + "grad_norm": 0.6504743695259094, + "learning_rate": 0.0002562695260044659, + "loss": 3.1094, + "step": 33458 + }, + { + "epoch": 1.64, + "grad_norm": 0.6945144534111023, + "learning_rate": 0.0002562542943219811, + "loss": 2.8553, + "step": 33459 + }, + { + "epoch": 1.64, + "grad_norm": 0.5794476866722107, + "learning_rate": 0.0002562390627547142, + "loss": 2.9821, + "step": 33460 + }, + { + "epoch": 1.64, + "grad_norm": 0.5813378691673279, + "learning_rate": 0.00025622383130270514, + "loss": 2.9139, + "step": 33461 + }, + { + "epoch": 1.64, + "grad_norm": 0.6076094508171082, + "learning_rate": 0.00025620859996599416, + "loss": 2.9726, + "step": 33462 + }, + { + "epoch": 1.64, + "grad_norm": 0.6358339190483093, + "learning_rate": 0.0002561933687446212, + "loss": 2.9508, + "step": 33463 + }, + { + "epoch": 1.64, + "grad_norm": 0.6296307444572449, + "learning_rate": 0.0002561781376386265, + "loss": 3.04, + "step": 33464 + }, + { + "epoch": 1.64, + "grad_norm": 0.6401169896125793, + "learning_rate": 0.00025616290664805025, + "loss": 3.2167, + "step": 33465 + }, + { + "epoch": 1.64, + "grad_norm": 0.6119227409362793, + "learning_rate": 0.00025614767577293227, + "loss": 3.2228, + "step": 33466 + }, + { + "epoch": 1.64, + "grad_norm": 0.6023973226547241, + "learning_rate": 0.00025613244501331304, + "loss": 3.0254, + "step": 33467 + }, + { + "epoch": 1.64, + "grad_norm": 0.5672397613525391, + "learning_rate": 0.0002561172143692325, + "loss": 3.1091, + "step": 33468 + }, + { + "epoch": 1.64, + "grad_norm": 0.6069930195808411, + "learning_rate": 0.0002561019838407306, + "loss": 2.9391, + "step": 33469 + }, + { + "epoch": 1.64, + "grad_norm": 0.6205309629440308, + "learning_rate": 0.0002560867534278478, + "loss": 2.9794, + "step": 33470 + }, + { + "epoch": 1.64, + "grad_norm": 0.6706638336181641, + "learning_rate": 0.000256071523130624, + "loss": 2.9495, + "step": 33471 + }, + { + "epoch": 1.64, + "grad_norm": 0.6044138073921204, + "learning_rate": 0.00025605629294909944, + "loss": 3.0051, + "step": 33472 + }, + { + "epoch": 1.64, + "grad_norm": 0.6162787079811096, + "learning_rate": 0.00025604106288331394, + "loss": 2.7831, + "step": 33473 + }, + { + "epoch": 1.64, + "grad_norm": 0.6222827434539795, + "learning_rate": 0.00025602583293330794, + "loss": 2.9045, + "step": 33474 + }, + { + "epoch": 1.64, + "grad_norm": 0.6912936568260193, + "learning_rate": 0.0002560106030991215, + "loss": 3.111, + "step": 33475 + }, + { + "epoch": 1.64, + "grad_norm": 0.5932526588439941, + "learning_rate": 0.0002559953733807945, + "loss": 2.973, + "step": 33476 + }, + { + "epoch": 1.64, + "grad_norm": 0.6283918619155884, + "learning_rate": 0.00025598014377836736, + "loss": 2.8764, + "step": 33477 + }, + { + "epoch": 1.64, + "grad_norm": 0.6525346636772156, + "learning_rate": 0.00025596491429188, + "loss": 2.9557, + "step": 33478 + }, + { + "epoch": 1.64, + "grad_norm": 0.595056414604187, + "learning_rate": 0.0002559496849213725, + "loss": 3.0048, + "step": 33479 + }, + { + "epoch": 1.64, + "grad_norm": 0.6286141276359558, + "learning_rate": 0.0002559344556668852, + "loss": 2.9229, + "step": 33480 + }, + { + "epoch": 1.64, + "grad_norm": 0.6390392184257507, + "learning_rate": 0.0002559192265284579, + "loss": 3.024, + "step": 33481 + }, + { + "epoch": 1.64, + "grad_norm": 0.6058946251869202, + "learning_rate": 0.00025590399750613106, + "loss": 2.8562, + "step": 33482 + }, + { + "epoch": 1.64, + "grad_norm": 0.7172413468360901, + "learning_rate": 0.0002558887685999445, + "loss": 2.993, + "step": 33483 + }, + { + "epoch": 1.64, + "grad_norm": 0.6925371289253235, + "learning_rate": 0.0002558735398099384, + "loss": 2.83, + "step": 33484 + }, + { + "epoch": 1.64, + "grad_norm": 0.6435196995735168, + "learning_rate": 0.000255858311136153, + "loss": 3.1041, + "step": 33485 + }, + { + "epoch": 1.64, + "grad_norm": 0.6237314939498901, + "learning_rate": 0.00025584308257862824, + "loss": 2.8744, + "step": 33486 + }, + { + "epoch": 1.64, + "grad_norm": 0.5538667440414429, + "learning_rate": 0.0002558278541374044, + "loss": 2.9134, + "step": 33487 + }, + { + "epoch": 1.64, + "grad_norm": 0.6051622629165649, + "learning_rate": 0.00025581262581252133, + "loss": 2.8513, + "step": 33488 + }, + { + "epoch": 1.64, + "grad_norm": 0.61176997423172, + "learning_rate": 0.0002557973976040194, + "loss": 3.0499, + "step": 33489 + }, + { + "epoch": 1.64, + "grad_norm": 0.6022197604179382, + "learning_rate": 0.00025578216951193864, + "loss": 3.1049, + "step": 33490 + }, + { + "epoch": 1.64, + "grad_norm": 0.6228072047233582, + "learning_rate": 0.00025576694153631906, + "loss": 3.0617, + "step": 33491 + }, + { + "epoch": 1.64, + "grad_norm": 0.6194964051246643, + "learning_rate": 0.0002557517136772009, + "loss": 2.8614, + "step": 33492 + }, + { + "epoch": 1.64, + "grad_norm": 0.6054251790046692, + "learning_rate": 0.00025573648593462425, + "loss": 3.0247, + "step": 33493 + }, + { + "epoch": 1.64, + "grad_norm": 0.6623701453208923, + "learning_rate": 0.00025572125830862914, + "loss": 3.0102, + "step": 33494 + }, + { + "epoch": 1.64, + "grad_norm": 0.6244812607765198, + "learning_rate": 0.00025570603079925564, + "loss": 2.8992, + "step": 33495 + }, + { + "epoch": 1.64, + "grad_norm": 0.6170223951339722, + "learning_rate": 0.00025569080340654397, + "loss": 3.1205, + "step": 33496 + }, + { + "epoch": 1.64, + "grad_norm": 0.5932803750038147, + "learning_rate": 0.00025567557613053425, + "loss": 3.1165, + "step": 33497 + }, + { + "epoch": 1.64, + "grad_norm": 0.6121468544006348, + "learning_rate": 0.00025566034897126644, + "loss": 3.0744, + "step": 33498 + }, + { + "epoch": 1.64, + "grad_norm": 0.6298878788948059, + "learning_rate": 0.00025564512192878086, + "loss": 2.8936, + "step": 33499 + }, + { + "epoch": 1.64, + "grad_norm": 0.6140074729919434, + "learning_rate": 0.0002556298950031174, + "loss": 2.9988, + "step": 33500 + }, + { + "epoch": 1.64, + "grad_norm": 0.5965002179145813, + "learning_rate": 0.00025561466819431627, + "loss": 3.168, + "step": 33501 + }, + { + "epoch": 1.64, + "grad_norm": 0.637391209602356, + "learning_rate": 0.0002555994415024176, + "loss": 3.2728, + "step": 33502 + }, + { + "epoch": 1.64, + "grad_norm": 0.6739944219589233, + "learning_rate": 0.00025558421492746135, + "loss": 3.0144, + "step": 33503 + }, + { + "epoch": 1.64, + "grad_norm": 0.6153771877288818, + "learning_rate": 0.00025556898846948786, + "loss": 3.0694, + "step": 33504 + }, + { + "epoch": 1.64, + "grad_norm": 0.5968106985092163, + "learning_rate": 0.00025555376212853704, + "loss": 2.9543, + "step": 33505 + }, + { + "epoch": 1.64, + "grad_norm": 0.610713541507721, + "learning_rate": 0.000255538535904649, + "loss": 3.0342, + "step": 33506 + }, + { + "epoch": 1.64, + "grad_norm": 0.6176581382751465, + "learning_rate": 0.000255523309797864, + "loss": 3.1373, + "step": 33507 + }, + { + "epoch": 1.64, + "grad_norm": 0.6341189742088318, + "learning_rate": 0.000255508083808222, + "loss": 2.9842, + "step": 33508 + }, + { + "epoch": 1.64, + "grad_norm": 0.5881686806678772, + "learning_rate": 0.00025549285793576324, + "loss": 3.0403, + "step": 33509 + }, + { + "epoch": 1.64, + "grad_norm": 0.6181926727294922, + "learning_rate": 0.0002554776321805276, + "loss": 3.124, + "step": 33510 + }, + { + "epoch": 1.64, + "grad_norm": 0.6077942848205566, + "learning_rate": 0.00025546240654255535, + "loss": 2.9548, + "step": 33511 + }, + { + "epoch": 1.64, + "grad_norm": 0.6073379516601562, + "learning_rate": 0.00025544718102188656, + "loss": 2.8528, + "step": 33512 + }, + { + "epoch": 1.64, + "grad_norm": 0.6286836266517639, + "learning_rate": 0.00025543195561856125, + "loss": 2.9042, + "step": 33513 + }, + { + "epoch": 1.64, + "grad_norm": 0.6575766801834106, + "learning_rate": 0.0002554167303326197, + "loss": 3.1573, + "step": 33514 + }, + { + "epoch": 1.64, + "grad_norm": 0.5965009331703186, + "learning_rate": 0.0002554015051641019, + "loss": 3.0997, + "step": 33515 + }, + { + "epoch": 1.64, + "grad_norm": 0.6113033890724182, + "learning_rate": 0.0002553862801130478, + "loss": 2.9108, + "step": 33516 + }, + { + "epoch": 1.64, + "grad_norm": 0.6287041306495667, + "learning_rate": 0.00025537105517949786, + "loss": 2.875, + "step": 33517 + }, + { + "epoch": 1.64, + "grad_norm": 0.6469335556030273, + "learning_rate": 0.00025535583036349186, + "loss": 3.1125, + "step": 33518 + }, + { + "epoch": 1.64, + "grad_norm": 0.6197258830070496, + "learning_rate": 0.0002553406056650701, + "loss": 3.0977, + "step": 33519 + }, + { + "epoch": 1.64, + "grad_norm": 0.6109700202941895, + "learning_rate": 0.0002553253810842724, + "loss": 2.9279, + "step": 33520 + }, + { + "epoch": 1.64, + "grad_norm": 0.6416023373603821, + "learning_rate": 0.0002553101566211391, + "loss": 2.9842, + "step": 33521 + }, + { + "epoch": 1.64, + "grad_norm": 0.5842431783676147, + "learning_rate": 0.00025529493227571034, + "loss": 3.105, + "step": 33522 + }, + { + "epoch": 1.64, + "grad_norm": 0.6129705905914307, + "learning_rate": 0.0002552797080480261, + "loss": 2.9826, + "step": 33523 + }, + { + "epoch": 1.64, + "grad_norm": 0.5900617837905884, + "learning_rate": 0.00025526448393812655, + "loss": 3.2003, + "step": 33524 + }, + { + "epoch": 1.64, + "grad_norm": 0.5936287641525269, + "learning_rate": 0.00025524925994605164, + "loss": 3.2126, + "step": 33525 + }, + { + "epoch": 1.64, + "grad_norm": 0.5882431864738464, + "learning_rate": 0.0002552340360718416, + "loss": 2.9639, + "step": 33526 + }, + { + "epoch": 1.64, + "grad_norm": 0.5971298217773438, + "learning_rate": 0.00025521881231553654, + "loss": 3.096, + "step": 33527 + }, + { + "epoch": 1.64, + "grad_norm": 0.6381330490112305, + "learning_rate": 0.0002552035886771764, + "loss": 2.9462, + "step": 33528 + }, + { + "epoch": 1.64, + "grad_norm": 0.5903002023696899, + "learning_rate": 0.00025518836515680155, + "loss": 2.8188, + "step": 33529 + }, + { + "epoch": 1.64, + "grad_norm": 0.5994799733161926, + "learning_rate": 0.0002551731417544518, + "loss": 2.9601, + "step": 33530 + }, + { + "epoch": 1.64, + "grad_norm": 0.5971114039421082, + "learning_rate": 0.0002551579184701673, + "loss": 2.9885, + "step": 33531 + }, + { + "epoch": 1.64, + "grad_norm": 0.6384589076042175, + "learning_rate": 0.0002551426953039884, + "loss": 3.1067, + "step": 33532 + }, + { + "epoch": 1.64, + "grad_norm": 0.591693639755249, + "learning_rate": 0.0002551274722559549, + "loss": 3.0607, + "step": 33533 + }, + { + "epoch": 1.64, + "grad_norm": 0.6068736910820007, + "learning_rate": 0.0002551122493261071, + "loss": 3.1471, + "step": 33534 + }, + { + "epoch": 1.64, + "grad_norm": 0.6241335868835449, + "learning_rate": 0.00025509702651448486, + "loss": 3.0201, + "step": 33535 + }, + { + "epoch": 1.64, + "grad_norm": 0.6447053551673889, + "learning_rate": 0.00025508180382112847, + "loss": 2.9881, + "step": 33536 + }, + { + "epoch": 1.64, + "grad_norm": 0.600275456905365, + "learning_rate": 0.000255066581246078, + "loss": 3.0984, + "step": 33537 + }, + { + "epoch": 1.64, + "grad_norm": 0.5895094275474548, + "learning_rate": 0.00025505135878937344, + "loss": 3.1595, + "step": 33538 + }, + { + "epoch": 1.64, + "grad_norm": 0.6206028461456299, + "learning_rate": 0.000255036136451055, + "loss": 3.1758, + "step": 33539 + }, + { + "epoch": 1.64, + "grad_norm": 0.6393868923187256, + "learning_rate": 0.00025502091423116273, + "loss": 2.9527, + "step": 33540 + }, + { + "epoch": 1.64, + "grad_norm": 0.672612190246582, + "learning_rate": 0.0002550056921297366, + "loss": 2.9975, + "step": 33541 + }, + { + "epoch": 1.64, + "grad_norm": 0.6368157863616943, + "learning_rate": 0.00025499047014681695, + "loss": 3.0863, + "step": 33542 + }, + { + "epoch": 1.64, + "grad_norm": 0.621232271194458, + "learning_rate": 0.00025497524828244365, + "loss": 3.052, + "step": 33543 + }, + { + "epoch": 1.64, + "grad_norm": 0.6463088989257812, + "learning_rate": 0.00025496002653665697, + "loss": 2.6657, + "step": 33544 + }, + { + "epoch": 1.64, + "grad_norm": 0.6010912656784058, + "learning_rate": 0.00025494480490949686, + "loss": 2.9976, + "step": 33545 + }, + { + "epoch": 1.64, + "grad_norm": 0.6122451424598694, + "learning_rate": 0.00025492958340100344, + "loss": 2.9951, + "step": 33546 + }, + { + "epoch": 1.64, + "grad_norm": 0.6520957350730896, + "learning_rate": 0.00025491436201121687, + "loss": 2.9447, + "step": 33547 + }, + { + "epoch": 1.64, + "grad_norm": 0.6786126494407654, + "learning_rate": 0.00025489914074017716, + "loss": 3.0369, + "step": 33548 + }, + { + "epoch": 1.64, + "grad_norm": 0.5904258489608765, + "learning_rate": 0.0002548839195879245, + "loss": 2.9875, + "step": 33549 + }, + { + "epoch": 1.64, + "grad_norm": 0.6132463216781616, + "learning_rate": 0.0002548686985544988, + "loss": 2.8802, + "step": 33550 + }, + { + "epoch": 1.64, + "grad_norm": 0.6229579448699951, + "learning_rate": 0.00025485347763994033, + "loss": 3.1264, + "step": 33551 + }, + { + "epoch": 1.64, + "grad_norm": 0.5817505717277527, + "learning_rate": 0.0002548382568442892, + "loss": 2.837, + "step": 33552 + }, + { + "epoch": 1.64, + "grad_norm": 0.6309376955032349, + "learning_rate": 0.00025482303616758525, + "loss": 2.9213, + "step": 33553 + }, + { + "epoch": 1.64, + "grad_norm": 0.6208295822143555, + "learning_rate": 0.00025480781560986883, + "loss": 2.9553, + "step": 33554 + }, + { + "epoch": 1.64, + "grad_norm": 0.6129673719406128, + "learning_rate": 0.0002547925951711799, + "loss": 2.7496, + "step": 33555 + }, + { + "epoch": 1.64, + "grad_norm": 0.6032195091247559, + "learning_rate": 0.0002547773748515585, + "loss": 2.9992, + "step": 33556 + }, + { + "epoch": 1.64, + "grad_norm": 0.6073378920555115, + "learning_rate": 0.0002547621546510449, + "loss": 2.8163, + "step": 33557 + }, + { + "epoch": 1.64, + "grad_norm": 0.6146911382675171, + "learning_rate": 0.00025474693456967905, + "loss": 2.9754, + "step": 33558 + }, + { + "epoch": 1.64, + "grad_norm": 0.6286951899528503, + "learning_rate": 0.0002547317146075011, + "loss": 2.9883, + "step": 33559 + }, + { + "epoch": 1.64, + "grad_norm": 0.6064934134483337, + "learning_rate": 0.000254716494764551, + "loss": 3.0692, + "step": 33560 + }, + { + "epoch": 1.64, + "grad_norm": 0.6015488505363464, + "learning_rate": 0.0002547012750408689, + "loss": 3.0608, + "step": 33561 + }, + { + "epoch": 1.64, + "grad_norm": 0.6412321925163269, + "learning_rate": 0.0002546860554364951, + "loss": 3.3523, + "step": 33562 + }, + { + "epoch": 1.64, + "grad_norm": 0.6053003072738647, + "learning_rate": 0.0002546708359514694, + "loss": 2.8756, + "step": 33563 + }, + { + "epoch": 1.64, + "grad_norm": 0.6287083029747009, + "learning_rate": 0.00025465561658583204, + "loss": 3.047, + "step": 33564 + }, + { + "epoch": 1.64, + "grad_norm": 0.6067085266113281, + "learning_rate": 0.000254640397339623, + "loss": 3.1418, + "step": 33565 + }, + { + "epoch": 1.64, + "grad_norm": 0.6111128330230713, + "learning_rate": 0.00025462517821288246, + "loss": 3.0786, + "step": 33566 + }, + { + "epoch": 1.65, + "grad_norm": 0.6282280087471008, + "learning_rate": 0.00025460995920565055, + "loss": 3.0406, + "step": 33567 + }, + { + "epoch": 1.65, + "grad_norm": 0.6765866875648499, + "learning_rate": 0.0002545947403179671, + "loss": 3.1285, + "step": 33568 + }, + { + "epoch": 1.65, + "grad_norm": 0.6518777012825012, + "learning_rate": 0.0002545795215498725, + "loss": 3.0133, + "step": 33569 + }, + { + "epoch": 1.65, + "grad_norm": 0.6171415448188782, + "learning_rate": 0.0002545643029014067, + "loss": 3.1624, + "step": 33570 + }, + { + "epoch": 1.65, + "grad_norm": 0.6353912949562073, + "learning_rate": 0.0002545490843726098, + "loss": 2.9545, + "step": 33571 + }, + { + "epoch": 1.65, + "grad_norm": 0.6414846181869507, + "learning_rate": 0.0002545338659635217, + "loss": 3.1739, + "step": 33572 + }, + { + "epoch": 1.65, + "grad_norm": 0.6102538704872131, + "learning_rate": 0.00025451864767418275, + "loss": 3.1158, + "step": 33573 + }, + { + "epoch": 1.65, + "grad_norm": 0.6371604204177856, + "learning_rate": 0.00025450342950463303, + "loss": 3.0714, + "step": 33574 + }, + { + "epoch": 1.65, + "grad_norm": 0.5931726694107056, + "learning_rate": 0.0002544882114549123, + "loss": 3.1613, + "step": 33575 + }, + { + "epoch": 1.65, + "grad_norm": 0.614202618598938, + "learning_rate": 0.00025447299352506105, + "loss": 2.8824, + "step": 33576 + }, + { + "epoch": 1.65, + "grad_norm": 0.6412155628204346, + "learning_rate": 0.0002544577757151191, + "loss": 2.6015, + "step": 33577 + }, + { + "epoch": 1.65, + "grad_norm": 0.6213472485542297, + "learning_rate": 0.0002544425580251265, + "loss": 3.0039, + "step": 33578 + }, + { + "epoch": 1.65, + "grad_norm": 0.6479688286781311, + "learning_rate": 0.0002544273404551236, + "loss": 2.9294, + "step": 33579 + }, + { + "epoch": 1.65, + "grad_norm": 0.6218825578689575, + "learning_rate": 0.00025441212300515026, + "loss": 3.1985, + "step": 33580 + }, + { + "epoch": 1.65, + "grad_norm": 0.6288421750068665, + "learning_rate": 0.0002543969056752467, + "loss": 2.974, + "step": 33581 + }, + { + "epoch": 1.65, + "grad_norm": 0.6092920303344727, + "learning_rate": 0.00025438168846545275, + "loss": 3.1613, + "step": 33582 + }, + { + "epoch": 1.65, + "grad_norm": 0.6967260241508484, + "learning_rate": 0.0002543664713758087, + "loss": 3.1719, + "step": 33583 + }, + { + "epoch": 1.65, + "grad_norm": 0.6147881150245667, + "learning_rate": 0.00025435125440635464, + "loss": 2.9597, + "step": 33584 + }, + { + "epoch": 1.65, + "grad_norm": 0.6140986680984497, + "learning_rate": 0.00025433603755713055, + "loss": 2.8277, + "step": 33585 + }, + { + "epoch": 1.65, + "grad_norm": 0.6095908880233765, + "learning_rate": 0.00025432082082817663, + "loss": 2.9332, + "step": 33586 + }, + { + "epoch": 1.65, + "grad_norm": 0.6159641146659851, + "learning_rate": 0.0002543056042195327, + "loss": 2.96, + "step": 33587 + }, + { + "epoch": 1.65, + "grad_norm": 0.5917350053787231, + "learning_rate": 0.00025429038773123916, + "loss": 2.9545, + "step": 33588 + }, + { + "epoch": 1.65, + "grad_norm": 0.6371214985847473, + "learning_rate": 0.000254275171363336, + "loss": 2.9997, + "step": 33589 + }, + { + "epoch": 1.65, + "grad_norm": 0.609025239944458, + "learning_rate": 0.00025425995511586303, + "loss": 3.2217, + "step": 33590 + }, + { + "epoch": 1.65, + "grad_norm": 0.6314291954040527, + "learning_rate": 0.0002542447389888607, + "loss": 3.135, + "step": 33591 + }, + { + "epoch": 1.65, + "grad_norm": 0.6252784729003906, + "learning_rate": 0.00025422952298236886, + "loss": 3.072, + "step": 33592 + }, + { + "epoch": 1.65, + "grad_norm": 0.5745939612388611, + "learning_rate": 0.0002542143070964276, + "loss": 2.9007, + "step": 33593 + }, + { + "epoch": 1.65, + "grad_norm": 0.6394028663635254, + "learning_rate": 0.0002541990913310772, + "loss": 3.1977, + "step": 33594 + }, + { + "epoch": 1.65, + "grad_norm": 0.5794762969017029, + "learning_rate": 0.00025418387568635747, + "loss": 3.063, + "step": 33595 + }, + { + "epoch": 1.65, + "grad_norm": 0.6263272762298584, + "learning_rate": 0.00025416866016230865, + "loss": 2.9177, + "step": 33596 + }, + { + "epoch": 1.65, + "grad_norm": 0.6222317218780518, + "learning_rate": 0.0002541534447589706, + "loss": 2.9814, + "step": 33597 + }, + { + "epoch": 1.65, + "grad_norm": 0.6260561347007751, + "learning_rate": 0.0002541382294763837, + "loss": 3.0318, + "step": 33598 + }, + { + "epoch": 1.65, + "grad_norm": 0.6080615520477295, + "learning_rate": 0.00025412301431458785, + "loss": 3.0894, + "step": 33599 + }, + { + "epoch": 1.65, + "grad_norm": 0.617087721824646, + "learning_rate": 0.0002541077992736231, + "loss": 2.9801, + "step": 33600 + }, + { + "epoch": 1.65, + "grad_norm": 0.6065132021903992, + "learning_rate": 0.0002540925843535297, + "loss": 3.0377, + "step": 33601 + }, + { + "epoch": 1.65, + "grad_norm": 0.5966345071792603, + "learning_rate": 0.0002540773695543475, + "loss": 2.8676, + "step": 33602 + }, + { + "epoch": 1.65, + "grad_norm": 0.5771703720092773, + "learning_rate": 0.0002540621548761166, + "loss": 2.9713, + "step": 33603 + }, + { + "epoch": 1.65, + "grad_norm": 0.6102816462516785, + "learning_rate": 0.0002540469403188773, + "loss": 3.0297, + "step": 33604 + }, + { + "epoch": 1.65, + "grad_norm": 0.5968355536460876, + "learning_rate": 0.00025403172588266934, + "loss": 2.9866, + "step": 33605 + }, + { + "epoch": 1.65, + "grad_norm": 0.6273283958435059, + "learning_rate": 0.00025401651156753316, + "loss": 3.0427, + "step": 33606 + }, + { + "epoch": 1.65, + "grad_norm": 0.6076398491859436, + "learning_rate": 0.00025400129737350853, + "loss": 2.7761, + "step": 33607 + }, + { + "epoch": 1.65, + "grad_norm": 0.6098259091377258, + "learning_rate": 0.0002539860833006356, + "loss": 3.0196, + "step": 33608 + }, + { + "epoch": 1.65, + "grad_norm": 0.5994309186935425, + "learning_rate": 0.0002539708693489546, + "loss": 3.0615, + "step": 33609 + }, + { + "epoch": 1.65, + "grad_norm": 0.6185238361358643, + "learning_rate": 0.00025395565551850535, + "loss": 2.9607, + "step": 33610 + }, + { + "epoch": 1.65, + "grad_norm": 0.6246052384376526, + "learning_rate": 0.0002539404418093282, + "loss": 2.9347, + "step": 33611 + }, + { + "epoch": 1.65, + "grad_norm": 0.616183876991272, + "learning_rate": 0.00025392522822146284, + "loss": 2.8803, + "step": 33612 + }, + { + "epoch": 1.65, + "grad_norm": 0.5878469944000244, + "learning_rate": 0.00025391001475494964, + "loss": 3.1297, + "step": 33613 + }, + { + "epoch": 1.65, + "grad_norm": 0.6638484597206116, + "learning_rate": 0.0002538948014098287, + "loss": 2.9255, + "step": 33614 + }, + { + "epoch": 1.65, + "grad_norm": 0.5929738879203796, + "learning_rate": 0.00025387958818613986, + "loss": 3.0306, + "step": 33615 + }, + { + "epoch": 1.65, + "grad_norm": 0.6322368383407593, + "learning_rate": 0.00025386437508392336, + "loss": 3.0476, + "step": 33616 + }, + { + "epoch": 1.65, + "grad_norm": 0.6140685081481934, + "learning_rate": 0.00025384916210321925, + "loss": 3.4163, + "step": 33617 + }, + { + "epoch": 1.65, + "grad_norm": 0.685403048992157, + "learning_rate": 0.00025383394924406743, + "loss": 2.907, + "step": 33618 + }, + { + "epoch": 1.65, + "grad_norm": 0.6040143966674805, + "learning_rate": 0.00025381873650650823, + "loss": 3.2015, + "step": 33619 + }, + { + "epoch": 1.65, + "grad_norm": 0.6561526656150818, + "learning_rate": 0.00025380352389058156, + "loss": 2.9761, + "step": 33620 + }, + { + "epoch": 1.65, + "grad_norm": 0.5854026675224304, + "learning_rate": 0.0002537883113963276, + "loss": 3.077, + "step": 33621 + }, + { + "epoch": 1.65, + "grad_norm": 0.5917167067527771, + "learning_rate": 0.0002537730990237862, + "loss": 2.972, + "step": 33622 + }, + { + "epoch": 1.65, + "grad_norm": 0.6419681906700134, + "learning_rate": 0.0002537578867729975, + "loss": 3.0175, + "step": 33623 + }, + { + "epoch": 1.65, + "grad_norm": 0.6021273732185364, + "learning_rate": 0.0002537426746440018, + "loss": 3.1386, + "step": 33624 + }, + { + "epoch": 1.65, + "grad_norm": 0.6152304410934448, + "learning_rate": 0.0002537274626368389, + "loss": 2.7386, + "step": 33625 + }, + { + "epoch": 1.65, + "grad_norm": 0.6640774011611938, + "learning_rate": 0.00025371225075154904, + "loss": 2.9014, + "step": 33626 + }, + { + "epoch": 1.65, + "grad_norm": 0.6067616939544678, + "learning_rate": 0.00025369703898817206, + "loss": 2.9653, + "step": 33627 + }, + { + "epoch": 1.65, + "grad_norm": 0.5870032906532288, + "learning_rate": 0.00025368182734674825, + "loss": 3.0218, + "step": 33628 + }, + { + "epoch": 1.65, + "grad_norm": 0.5886638164520264, + "learning_rate": 0.0002536666158273177, + "loss": 3.0415, + "step": 33629 + }, + { + "epoch": 1.65, + "grad_norm": 0.640203595161438, + "learning_rate": 0.0002536514044299202, + "loss": 3.0191, + "step": 33630 + }, + { + "epoch": 1.65, + "grad_norm": 0.6461527943611145, + "learning_rate": 0.00025363619315459606, + "loss": 2.9528, + "step": 33631 + }, + { + "epoch": 1.65, + "grad_norm": 0.6292715668678284, + "learning_rate": 0.00025362098200138524, + "loss": 2.9719, + "step": 33632 + }, + { + "epoch": 1.65, + "grad_norm": 0.6264772415161133, + "learning_rate": 0.0002536057709703278, + "loss": 3.0541, + "step": 33633 + }, + { + "epoch": 1.65, + "grad_norm": 0.6099303960800171, + "learning_rate": 0.00025359056006146394, + "loss": 2.9152, + "step": 33634 + }, + { + "epoch": 1.65, + "grad_norm": 0.6216869950294495, + "learning_rate": 0.0002535753492748335, + "loss": 3.1676, + "step": 33635 + }, + { + "epoch": 1.65, + "grad_norm": 0.6083691716194153, + "learning_rate": 0.0002535601386104768, + "loss": 3.0394, + "step": 33636 + }, + { + "epoch": 1.65, + "grad_norm": 0.5741828083992004, + "learning_rate": 0.00025354492806843354, + "loss": 3.0786, + "step": 33637 + }, + { + "epoch": 1.65, + "grad_norm": 0.634376049041748, + "learning_rate": 0.00025352971764874415, + "loss": 3.148, + "step": 33638 + }, + { + "epoch": 1.65, + "grad_norm": 0.6192229390144348, + "learning_rate": 0.00025351450735144854, + "loss": 3.0148, + "step": 33639 + }, + { + "epoch": 1.65, + "grad_norm": 0.6180740594863892, + "learning_rate": 0.0002534992971765867, + "loss": 3.1087, + "step": 33640 + }, + { + "epoch": 1.65, + "grad_norm": 0.5931870937347412, + "learning_rate": 0.0002534840871241988, + "loss": 3.1503, + "step": 33641 + }, + { + "epoch": 1.65, + "grad_norm": 0.5776154398918152, + "learning_rate": 0.00025346887719432485, + "loss": 3.0434, + "step": 33642 + }, + { + "epoch": 1.65, + "grad_norm": 0.61803138256073, + "learning_rate": 0.0002534536673870049, + "loss": 3.0197, + "step": 33643 + }, + { + "epoch": 1.65, + "grad_norm": 0.6241032481193542, + "learning_rate": 0.0002534384577022791, + "loss": 3.0501, + "step": 33644 + }, + { + "epoch": 1.65, + "grad_norm": 0.6268068552017212, + "learning_rate": 0.00025342324814018734, + "loss": 2.8237, + "step": 33645 + }, + { + "epoch": 1.65, + "grad_norm": 0.5747461318969727, + "learning_rate": 0.0002534080387007699, + "loss": 3.3158, + "step": 33646 + }, + { + "epoch": 1.65, + "grad_norm": 0.6452773213386536, + "learning_rate": 0.0002533928293840667, + "loss": 3.1601, + "step": 33647 + }, + { + "epoch": 1.65, + "grad_norm": 0.6670741438865662, + "learning_rate": 0.0002533776201901177, + "loss": 3.0166, + "step": 33648 + }, + { + "epoch": 1.65, + "grad_norm": 0.585372269153595, + "learning_rate": 0.0002533624111189632, + "loss": 3.0088, + "step": 33649 + }, + { + "epoch": 1.65, + "grad_norm": 0.5669026970863342, + "learning_rate": 0.0002533472021706431, + "loss": 2.8939, + "step": 33650 + }, + { + "epoch": 1.65, + "grad_norm": 0.6074293851852417, + "learning_rate": 0.0002533319933451975, + "loss": 3.2963, + "step": 33651 + }, + { + "epoch": 1.65, + "grad_norm": 0.5969696640968323, + "learning_rate": 0.0002533167846426664, + "loss": 2.9618, + "step": 33652 + }, + { + "epoch": 1.65, + "grad_norm": 0.6585623621940613, + "learning_rate": 0.00025330157606309, + "loss": 2.9734, + "step": 33653 + }, + { + "epoch": 1.65, + "grad_norm": 0.5954491496086121, + "learning_rate": 0.0002532863676065082, + "loss": 2.9167, + "step": 33654 + }, + { + "epoch": 1.65, + "grad_norm": 0.6330291628837585, + "learning_rate": 0.00025327115927296104, + "loss": 3.005, + "step": 33655 + }, + { + "epoch": 1.65, + "grad_norm": 0.6088626980781555, + "learning_rate": 0.0002532559510624888, + "loss": 3.2036, + "step": 33656 + }, + { + "epoch": 1.65, + "grad_norm": 0.6002334356307983, + "learning_rate": 0.0002532407429751313, + "loss": 3.0143, + "step": 33657 + }, + { + "epoch": 1.65, + "grad_norm": 0.6352154612541199, + "learning_rate": 0.0002532255350109287, + "loss": 3.0687, + "step": 33658 + }, + { + "epoch": 1.65, + "grad_norm": 0.5860254764556885, + "learning_rate": 0.000253210327169921, + "loss": 3.0877, + "step": 33659 + }, + { + "epoch": 1.65, + "grad_norm": 0.6036262512207031, + "learning_rate": 0.0002531951194521483, + "loss": 2.8906, + "step": 33660 + }, + { + "epoch": 1.65, + "grad_norm": 0.5720845460891724, + "learning_rate": 0.00025317991185765077, + "loss": 2.8707, + "step": 33661 + }, + { + "epoch": 1.65, + "grad_norm": 0.5837738513946533, + "learning_rate": 0.0002531647043864682, + "loss": 3.0003, + "step": 33662 + }, + { + "epoch": 1.65, + "grad_norm": 0.5987973213195801, + "learning_rate": 0.00025314949703864086, + "loss": 3.1969, + "step": 33663 + }, + { + "epoch": 1.65, + "grad_norm": 0.6149620413780212, + "learning_rate": 0.00025313428981420873, + "loss": 2.9412, + "step": 33664 + }, + { + "epoch": 1.65, + "grad_norm": 0.6203813552856445, + "learning_rate": 0.00025311908271321177, + "loss": 3.0063, + "step": 33665 + }, + { + "epoch": 1.65, + "grad_norm": 0.6189368367195129, + "learning_rate": 0.00025310387573569026, + "loss": 3.0654, + "step": 33666 + }, + { + "epoch": 1.65, + "grad_norm": 0.6118205189704895, + "learning_rate": 0.000253088668881684, + "loss": 2.9288, + "step": 33667 + }, + { + "epoch": 1.65, + "grad_norm": 0.6222481727600098, + "learning_rate": 0.0002530734621512332, + "loss": 3.0051, + "step": 33668 + }, + { + "epoch": 1.65, + "grad_norm": 0.631308913230896, + "learning_rate": 0.0002530582555443779, + "loss": 3.1259, + "step": 33669 + }, + { + "epoch": 1.65, + "grad_norm": 0.6014788150787354, + "learning_rate": 0.000253043049061158, + "loss": 2.9009, + "step": 33670 + }, + { + "epoch": 1.65, + "grad_norm": 0.5971441268920898, + "learning_rate": 0.0002530278427016138, + "loss": 3.2945, + "step": 33671 + }, + { + "epoch": 1.65, + "grad_norm": 0.606205940246582, + "learning_rate": 0.00025301263646578524, + "loss": 2.8732, + "step": 33672 + }, + { + "epoch": 1.65, + "grad_norm": 0.5801162123680115, + "learning_rate": 0.00025299743035371233, + "loss": 2.9203, + "step": 33673 + }, + { + "epoch": 1.65, + "grad_norm": 0.6619035005569458, + "learning_rate": 0.0002529822243654351, + "loss": 2.8384, + "step": 33674 + }, + { + "epoch": 1.65, + "grad_norm": 0.614162027835846, + "learning_rate": 0.0002529670185009936, + "loss": 2.9881, + "step": 33675 + }, + { + "epoch": 1.65, + "grad_norm": 0.6145156621932983, + "learning_rate": 0.0002529518127604281, + "loss": 3.2938, + "step": 33676 + }, + { + "epoch": 1.65, + "grad_norm": 0.6185023784637451, + "learning_rate": 0.0002529366071437782, + "loss": 2.9915, + "step": 33677 + }, + { + "epoch": 1.65, + "grad_norm": 0.640781581401825, + "learning_rate": 0.0002529214016510845, + "loss": 2.879, + "step": 33678 + }, + { + "epoch": 1.65, + "grad_norm": 0.616585910320282, + "learning_rate": 0.00025290619628238656, + "loss": 2.9472, + "step": 33679 + }, + { + "epoch": 1.65, + "grad_norm": 0.6268201470375061, + "learning_rate": 0.0002528909910377247, + "loss": 3.0141, + "step": 33680 + }, + { + "epoch": 1.65, + "grad_norm": 0.6437908411026001, + "learning_rate": 0.00025287578591713894, + "loss": 2.9349, + "step": 33681 + }, + { + "epoch": 1.65, + "grad_norm": 0.6435546875, + "learning_rate": 0.00025286058092066925, + "loss": 2.9744, + "step": 33682 + }, + { + "epoch": 1.65, + "grad_norm": 0.6060557961463928, + "learning_rate": 0.0002528453760483558, + "loss": 3.0914, + "step": 33683 + }, + { + "epoch": 1.65, + "grad_norm": 0.5863337516784668, + "learning_rate": 0.00025283017130023843, + "loss": 3.1313, + "step": 33684 + }, + { + "epoch": 1.65, + "grad_norm": 0.6209391951560974, + "learning_rate": 0.0002528149666763573, + "loss": 3.1179, + "step": 33685 + }, + { + "epoch": 1.65, + "grad_norm": 0.6263371706008911, + "learning_rate": 0.0002527997621767526, + "loss": 3.0119, + "step": 33686 + }, + { + "epoch": 1.65, + "grad_norm": 0.6627563238143921, + "learning_rate": 0.0002527845578014642, + "loss": 2.9974, + "step": 33687 + }, + { + "epoch": 1.65, + "grad_norm": 0.7270352244377136, + "learning_rate": 0.0002527693535505322, + "loss": 3.0584, + "step": 33688 + }, + { + "epoch": 1.65, + "grad_norm": 0.5933760404586792, + "learning_rate": 0.0002527541494239965, + "loss": 2.9056, + "step": 33689 + }, + { + "epoch": 1.65, + "grad_norm": 0.5804953575134277, + "learning_rate": 0.00025273894542189736, + "loss": 3.1185, + "step": 33690 + }, + { + "epoch": 1.65, + "grad_norm": 0.6337752938270569, + "learning_rate": 0.0002527237415442748, + "loss": 2.8029, + "step": 33691 + }, + { + "epoch": 1.65, + "grad_norm": 0.5905575156211853, + "learning_rate": 0.00025270853779116873, + "loss": 2.9804, + "step": 33692 + }, + { + "epoch": 1.65, + "grad_norm": 0.8519971370697021, + "learning_rate": 0.00025269333416261936, + "loss": 3.0931, + "step": 33693 + }, + { + "epoch": 1.65, + "grad_norm": 0.5886369347572327, + "learning_rate": 0.00025267813065866655, + "loss": 3.0487, + "step": 33694 + }, + { + "epoch": 1.65, + "grad_norm": 0.5819816589355469, + "learning_rate": 0.00025266292727935043, + "loss": 2.9745, + "step": 33695 + }, + { + "epoch": 1.65, + "grad_norm": 0.5780503749847412, + "learning_rate": 0.00025264772402471117, + "loss": 3.2257, + "step": 33696 + }, + { + "epoch": 1.65, + "grad_norm": 0.6502557396888733, + "learning_rate": 0.0002526325208947886, + "loss": 3.067, + "step": 33697 + }, + { + "epoch": 1.65, + "grad_norm": 0.6687465310096741, + "learning_rate": 0.00025261731788962295, + "loss": 3.1206, + "step": 33698 + }, + { + "epoch": 1.65, + "grad_norm": 0.6016365885734558, + "learning_rate": 0.000252602115009254, + "loss": 3.0287, + "step": 33699 + }, + { + "epoch": 1.65, + "grad_norm": 0.6240675449371338, + "learning_rate": 0.000252586912253722, + "loss": 2.9064, + "step": 33700 + }, + { + "epoch": 1.65, + "grad_norm": 0.582499086856842, + "learning_rate": 0.00025257170962306706, + "loss": 3.2111, + "step": 33701 + }, + { + "epoch": 1.65, + "grad_norm": 0.6195961236953735, + "learning_rate": 0.00025255650711732895, + "loss": 3.008, + "step": 33702 + }, + { + "epoch": 1.65, + "grad_norm": 0.5937759280204773, + "learning_rate": 0.00025254130473654807, + "loss": 2.8966, + "step": 33703 + }, + { + "epoch": 1.65, + "grad_norm": 0.6716198921203613, + "learning_rate": 0.00025252610248076415, + "loss": 2.8721, + "step": 33704 + }, + { + "epoch": 1.65, + "grad_norm": 0.6009177565574646, + "learning_rate": 0.0002525109003500172, + "loss": 3.0408, + "step": 33705 + }, + { + "epoch": 1.65, + "grad_norm": 0.6476081013679504, + "learning_rate": 0.0002524956983443476, + "loss": 2.9311, + "step": 33706 + }, + { + "epoch": 1.65, + "grad_norm": 0.6230241060256958, + "learning_rate": 0.00025248049646379506, + "loss": 2.6999, + "step": 33707 + }, + { + "epoch": 1.65, + "grad_norm": 0.6477628946304321, + "learning_rate": 0.0002524652947083999, + "loss": 3.2673, + "step": 33708 + }, + { + "epoch": 1.65, + "grad_norm": 0.6434798240661621, + "learning_rate": 0.0002524500930782019, + "loss": 2.8493, + "step": 33709 + }, + { + "epoch": 1.65, + "grad_norm": 0.6150158047676086, + "learning_rate": 0.00025243489157324113, + "loss": 2.9382, + "step": 33710 + }, + { + "epoch": 1.65, + "grad_norm": 0.6013590693473816, + "learning_rate": 0.0002524196901935579, + "loss": 2.9773, + "step": 33711 + }, + { + "epoch": 1.65, + "grad_norm": 0.6192983388900757, + "learning_rate": 0.00025240448893919194, + "loss": 3.2487, + "step": 33712 + }, + { + "epoch": 1.65, + "grad_norm": 0.6279726624488831, + "learning_rate": 0.00025238928781018345, + "loss": 2.9022, + "step": 33713 + }, + { + "epoch": 1.65, + "grad_norm": 0.6031069755554199, + "learning_rate": 0.00025237408680657227, + "loss": 3.0851, + "step": 33714 + }, + { + "epoch": 1.65, + "grad_norm": 0.6207053065299988, + "learning_rate": 0.0002523588859283987, + "loss": 3.0118, + "step": 33715 + }, + { + "epoch": 1.65, + "grad_norm": 0.5942187309265137, + "learning_rate": 0.00025234368517570267, + "loss": 3.1582, + "step": 33716 + }, + { + "epoch": 1.65, + "grad_norm": 0.6146659851074219, + "learning_rate": 0.0002523284845485241, + "loss": 3.0741, + "step": 33717 + }, + { + "epoch": 1.65, + "grad_norm": 0.6310451030731201, + "learning_rate": 0.00025231328404690323, + "loss": 3.027, + "step": 33718 + }, + { + "epoch": 1.65, + "grad_norm": 0.6505426168441772, + "learning_rate": 0.00025229808367087993, + "loss": 2.864, + "step": 33719 + }, + { + "epoch": 1.65, + "grad_norm": 0.6088349223136902, + "learning_rate": 0.00025228288342049425, + "loss": 2.9772, + "step": 33720 + }, + { + "epoch": 1.65, + "grad_norm": 0.6298375725746155, + "learning_rate": 0.0002522676832957864, + "loss": 3.1535, + "step": 33721 + }, + { + "epoch": 1.65, + "grad_norm": 0.6122135519981384, + "learning_rate": 0.00025225248329679626, + "loss": 3.1195, + "step": 33722 + }, + { + "epoch": 1.65, + "grad_norm": 0.5897640585899353, + "learning_rate": 0.00025223728342356385, + "loss": 3.0419, + "step": 33723 + }, + { + "epoch": 1.65, + "grad_norm": 0.6169995069503784, + "learning_rate": 0.0002522220836761292, + "loss": 3.0293, + "step": 33724 + }, + { + "epoch": 1.65, + "grad_norm": 0.6491032838821411, + "learning_rate": 0.00025220688405453237, + "loss": 2.9347, + "step": 33725 + }, + { + "epoch": 1.65, + "grad_norm": 0.5842862725257874, + "learning_rate": 0.0002521916845588135, + "loss": 3.1327, + "step": 33726 + }, + { + "epoch": 1.65, + "grad_norm": 0.6430948376655579, + "learning_rate": 0.0002521764851890125, + "loss": 2.9445, + "step": 33727 + }, + { + "epoch": 1.65, + "grad_norm": 0.6189959645271301, + "learning_rate": 0.00025216128594516955, + "loss": 3.0916, + "step": 33728 + }, + { + "epoch": 1.65, + "grad_norm": 0.608040452003479, + "learning_rate": 0.0002521460868273244, + "loss": 3.0271, + "step": 33729 + }, + { + "epoch": 1.65, + "grad_norm": 0.6393671035766602, + "learning_rate": 0.00025213088783551733, + "loss": 2.8933, + "step": 33730 + }, + { + "epoch": 1.65, + "grad_norm": 0.6282805800437927, + "learning_rate": 0.00025211568896978825, + "loss": 2.9764, + "step": 33731 + }, + { + "epoch": 1.65, + "grad_norm": 0.5948448181152344, + "learning_rate": 0.0002521004902301772, + "loss": 2.9167, + "step": 33732 + }, + { + "epoch": 1.65, + "grad_norm": 0.6147938370704651, + "learning_rate": 0.0002520852916167243, + "loss": 3.0297, + "step": 33733 + }, + { + "epoch": 1.65, + "grad_norm": 0.6409828662872314, + "learning_rate": 0.0002520700931294695, + "loss": 2.8486, + "step": 33734 + }, + { + "epoch": 1.65, + "grad_norm": 0.718755841255188, + "learning_rate": 0.0002520548947684529, + "loss": 2.9996, + "step": 33735 + }, + { + "epoch": 1.65, + "grad_norm": 0.6078330874443054, + "learning_rate": 0.00025203969653371435, + "loss": 2.9222, + "step": 33736 + }, + { + "epoch": 1.65, + "grad_norm": 0.6095114350318909, + "learning_rate": 0.0002520244984252941, + "loss": 2.8218, + "step": 33737 + }, + { + "epoch": 1.65, + "grad_norm": 0.670408308506012, + "learning_rate": 0.0002520093004432322, + "loss": 2.9352, + "step": 33738 + }, + { + "epoch": 1.65, + "grad_norm": 0.6077500581741333, + "learning_rate": 0.0002519941025875683, + "loss": 3.225, + "step": 33739 + }, + { + "epoch": 1.65, + "grad_norm": 0.6481585502624512, + "learning_rate": 0.0002519789048583429, + "loss": 2.9603, + "step": 33740 + }, + { + "epoch": 1.65, + "grad_norm": 0.5873180627822876, + "learning_rate": 0.0002519637072555958, + "loss": 3.0317, + "step": 33741 + }, + { + "epoch": 1.65, + "grad_norm": 0.6104382872581482, + "learning_rate": 0.00025194850977936693, + "loss": 3.0169, + "step": 33742 + }, + { + "epoch": 1.65, + "grad_norm": 0.5977993607521057, + "learning_rate": 0.00025193331242969656, + "loss": 3.1041, + "step": 33743 + }, + { + "epoch": 1.65, + "grad_norm": 0.6217960119247437, + "learning_rate": 0.0002519181152066246, + "loss": 2.8854, + "step": 33744 + }, + { + "epoch": 1.65, + "grad_norm": 0.6322939991950989, + "learning_rate": 0.00025190291811019104, + "loss": 2.9709, + "step": 33745 + }, + { + "epoch": 1.65, + "grad_norm": 0.5825021862983704, + "learning_rate": 0.00025188772114043584, + "loss": 3.1036, + "step": 33746 + }, + { + "epoch": 1.65, + "grad_norm": 0.6149255633354187, + "learning_rate": 0.00025187252429739916, + "loss": 3.0629, + "step": 33747 + }, + { + "epoch": 1.65, + "grad_norm": 0.5964076519012451, + "learning_rate": 0.0002518573275811211, + "loss": 2.949, + "step": 33748 + }, + { + "epoch": 1.65, + "grad_norm": 0.5930870175361633, + "learning_rate": 0.0002518421309916415, + "loss": 3.2019, + "step": 33749 + }, + { + "epoch": 1.65, + "grad_norm": 0.6285166144371033, + "learning_rate": 0.0002518269345290005, + "loss": 3.1833, + "step": 33750 + }, + { + "epoch": 1.65, + "grad_norm": 0.5890181064605713, + "learning_rate": 0.000251811738193238, + "loss": 3.284, + "step": 33751 + }, + { + "epoch": 1.65, + "grad_norm": 0.5849533677101135, + "learning_rate": 0.00025179654198439415, + "loss": 3.0216, + "step": 33752 + }, + { + "epoch": 1.65, + "grad_norm": 0.6618516445159912, + "learning_rate": 0.000251781345902509, + "loss": 3.1006, + "step": 33753 + }, + { + "epoch": 1.65, + "grad_norm": 0.6029493808746338, + "learning_rate": 0.0002517661499476224, + "loss": 2.8438, + "step": 33754 + }, + { + "epoch": 1.65, + "grad_norm": 0.628272294998169, + "learning_rate": 0.00025175095411977453, + "loss": 3.0567, + "step": 33755 + }, + { + "epoch": 1.65, + "grad_norm": 0.6236722469329834, + "learning_rate": 0.0002517357584190054, + "loss": 3.056, + "step": 33756 + }, + { + "epoch": 1.65, + "grad_norm": 0.5944138169288635, + "learning_rate": 0.00025172056284535483, + "loss": 3.1059, + "step": 33757 + }, + { + "epoch": 1.65, + "grad_norm": 0.5836665034294128, + "learning_rate": 0.0002517053673988632, + "loss": 3.1147, + "step": 33758 + }, + { + "epoch": 1.65, + "grad_norm": 0.6636527180671692, + "learning_rate": 0.00025169017207957026, + "loss": 2.7778, + "step": 33759 + }, + { + "epoch": 1.65, + "grad_norm": 0.6323641538619995, + "learning_rate": 0.00025167497688751615, + "loss": 3.0488, + "step": 33760 + }, + { + "epoch": 1.65, + "grad_norm": 0.6302624940872192, + "learning_rate": 0.00025165978182274077, + "loss": 3.1198, + "step": 33761 + }, + { + "epoch": 1.65, + "grad_norm": 0.6282402873039246, + "learning_rate": 0.0002516445868852842, + "loss": 3.0372, + "step": 33762 + }, + { + "epoch": 1.65, + "grad_norm": 0.5997104048728943, + "learning_rate": 0.0002516293920751867, + "loss": 2.7741, + "step": 33763 + }, + { + "epoch": 1.65, + "grad_norm": 0.6254650950431824, + "learning_rate": 0.0002516141973924878, + "loss": 3.0331, + "step": 33764 + }, + { + "epoch": 1.65, + "grad_norm": 0.6326666474342346, + "learning_rate": 0.000251599002837228, + "loss": 2.9852, + "step": 33765 + }, + { + "epoch": 1.65, + "grad_norm": 0.5990265011787415, + "learning_rate": 0.00025158380840944696, + "loss": 2.9702, + "step": 33766 + }, + { + "epoch": 1.65, + "grad_norm": 0.6204058527946472, + "learning_rate": 0.00025156861410918487, + "loss": 3.0526, + "step": 33767 + }, + { + "epoch": 1.65, + "grad_norm": 0.747573971748352, + "learning_rate": 0.0002515534199364818, + "loss": 2.9046, + "step": 33768 + }, + { + "epoch": 1.65, + "grad_norm": 0.6106170415878296, + "learning_rate": 0.00025153822589137764, + "loss": 3.1094, + "step": 33769 + }, + { + "epoch": 1.65, + "grad_norm": 0.6426050066947937, + "learning_rate": 0.0002515230319739126, + "loss": 2.9108, + "step": 33770 + }, + { + "epoch": 1.66, + "grad_norm": 0.6021739840507507, + "learning_rate": 0.0002515078381841264, + "loss": 3.1094, + "step": 33771 + }, + { + "epoch": 1.66, + "grad_norm": 0.5746902823448181, + "learning_rate": 0.00025149264452205923, + "loss": 3.0454, + "step": 33772 + }, + { + "epoch": 1.66, + "grad_norm": 0.5948019027709961, + "learning_rate": 0.00025147745098775116, + "loss": 2.9203, + "step": 33773 + }, + { + "epoch": 1.66, + "grad_norm": 0.6490957736968994, + "learning_rate": 0.0002514622575812422, + "loss": 2.8963, + "step": 33774 + }, + { + "epoch": 1.66, + "grad_norm": 0.6122204065322876, + "learning_rate": 0.0002514470643025723, + "loss": 2.8563, + "step": 33775 + }, + { + "epoch": 1.66, + "grad_norm": 0.5842893123626709, + "learning_rate": 0.00025143187115178134, + "loss": 2.8726, + "step": 33776 + }, + { + "epoch": 1.66, + "grad_norm": 0.62315434217453, + "learning_rate": 0.0002514166781289096, + "loss": 2.9784, + "step": 33777 + }, + { + "epoch": 1.66, + "grad_norm": 0.6309679746627808, + "learning_rate": 0.00025140148523399697, + "loss": 3.1414, + "step": 33778 + }, + { + "epoch": 1.66, + "grad_norm": 0.6170636415481567, + "learning_rate": 0.0002513862924670834, + "loss": 3.0576, + "step": 33779 + }, + { + "epoch": 1.66, + "grad_norm": 0.6519901156425476, + "learning_rate": 0.00025137109982820914, + "loss": 3.0581, + "step": 33780 + }, + { + "epoch": 1.66, + "grad_norm": 0.6126569509506226, + "learning_rate": 0.00025135590731741394, + "loss": 3.0446, + "step": 33781 + }, + { + "epoch": 1.66, + "grad_norm": 0.5987186431884766, + "learning_rate": 0.00025134071493473784, + "loss": 2.9068, + "step": 33782 + }, + { + "epoch": 1.66, + "grad_norm": 0.6122428178787231, + "learning_rate": 0.0002513255226802211, + "loss": 2.9485, + "step": 33783 + }, + { + "epoch": 1.66, + "grad_norm": 0.6258891224861145, + "learning_rate": 0.00025131033055390345, + "loss": 3.0306, + "step": 33784 + }, + { + "epoch": 1.66, + "grad_norm": 0.6125542521476746, + "learning_rate": 0.00025129513855582514, + "loss": 3.1461, + "step": 33785 + }, + { + "epoch": 1.66, + "grad_norm": 0.610899806022644, + "learning_rate": 0.00025127994668602593, + "loss": 2.9816, + "step": 33786 + }, + { + "epoch": 1.66, + "grad_norm": 0.6190662384033203, + "learning_rate": 0.0002512647549445459, + "loss": 2.9831, + "step": 33787 + }, + { + "epoch": 1.66, + "grad_norm": 0.6101117134094238, + "learning_rate": 0.00025124956333142534, + "loss": 3.1565, + "step": 33788 + }, + { + "epoch": 1.66, + "grad_norm": 0.6318680047988892, + "learning_rate": 0.0002512343718467039, + "loss": 3.1456, + "step": 33789 + }, + { + "epoch": 1.66, + "grad_norm": 0.6346719264984131, + "learning_rate": 0.0002512191804904219, + "loss": 3.0291, + "step": 33790 + }, + { + "epoch": 1.66, + "grad_norm": 0.6252193450927734, + "learning_rate": 0.00025120398926261895, + "loss": 2.9251, + "step": 33791 + }, + { + "epoch": 1.66, + "grad_norm": 0.6464138627052307, + "learning_rate": 0.00025118879816333545, + "loss": 3.0129, + "step": 33792 + }, + { + "epoch": 1.66, + "grad_norm": 0.6083313226699829, + "learning_rate": 0.0002511736071926113, + "loss": 2.9965, + "step": 33793 + }, + { + "epoch": 1.66, + "grad_norm": 0.5951755046844482, + "learning_rate": 0.0002511584163504863, + "loss": 2.9157, + "step": 33794 + }, + { + "epoch": 1.66, + "grad_norm": 0.604433000087738, + "learning_rate": 0.00025114322563700084, + "loss": 3.1928, + "step": 33795 + }, + { + "epoch": 1.66, + "grad_norm": 0.6072720289230347, + "learning_rate": 0.0002511280350521946, + "loss": 2.8213, + "step": 33796 + }, + { + "epoch": 1.66, + "grad_norm": 0.611731231212616, + "learning_rate": 0.0002511128445961077, + "loss": 3.159, + "step": 33797 + }, + { + "epoch": 1.66, + "grad_norm": 0.5745161175727844, + "learning_rate": 0.00025109765426878025, + "loss": 2.951, + "step": 33798 + }, + { + "epoch": 1.66, + "grad_norm": 0.5982853770256042, + "learning_rate": 0.0002510824640702521, + "loss": 3.1052, + "step": 33799 + }, + { + "epoch": 1.66, + "grad_norm": 0.6100951433181763, + "learning_rate": 0.00025106727400056334, + "loss": 3.0418, + "step": 33800 + }, + { + "epoch": 1.66, + "grad_norm": 0.6522631645202637, + "learning_rate": 0.00025105208405975393, + "loss": 2.8211, + "step": 33801 + }, + { + "epoch": 1.66, + "grad_norm": 0.6171745657920837, + "learning_rate": 0.0002510368942478639, + "loss": 2.8845, + "step": 33802 + }, + { + "epoch": 1.66, + "grad_norm": 0.6560838222503662, + "learning_rate": 0.0002510217045649334, + "loss": 3.0927, + "step": 33803 + }, + { + "epoch": 1.66, + "grad_norm": 0.6423225998878479, + "learning_rate": 0.0002510065150110021, + "loss": 2.9911, + "step": 33804 + }, + { + "epoch": 1.66, + "grad_norm": 0.6459085941314697, + "learning_rate": 0.00025099132558611037, + "loss": 2.8735, + "step": 33805 + }, + { + "epoch": 1.66, + "grad_norm": 0.6427082419395447, + "learning_rate": 0.000250976136290298, + "loss": 3.0119, + "step": 33806 + }, + { + "epoch": 1.66, + "grad_norm": 0.6143639087677002, + "learning_rate": 0.0002509609471236051, + "loss": 3.0391, + "step": 33807 + }, + { + "epoch": 1.66, + "grad_norm": 0.6215494871139526, + "learning_rate": 0.00025094575808607154, + "loss": 2.9713, + "step": 33808 + }, + { + "epoch": 1.66, + "grad_norm": 0.7413198351860046, + "learning_rate": 0.0002509305691777374, + "loss": 2.8757, + "step": 33809 + }, + { + "epoch": 1.66, + "grad_norm": 0.5840345621109009, + "learning_rate": 0.0002509153803986428, + "loss": 2.9908, + "step": 33810 + }, + { + "epoch": 1.66, + "grad_norm": 0.5827749967575073, + "learning_rate": 0.0002509001917488276, + "loss": 3.0006, + "step": 33811 + }, + { + "epoch": 1.66, + "grad_norm": 0.6079306602478027, + "learning_rate": 0.0002508850032283319, + "loss": 3.0942, + "step": 33812 + }, + { + "epoch": 1.66, + "grad_norm": 0.5841607451438904, + "learning_rate": 0.0002508698148371955, + "loss": 2.917, + "step": 33813 + }, + { + "epoch": 1.66, + "grad_norm": 0.6027677059173584, + "learning_rate": 0.0002508546265754587, + "loss": 3.0351, + "step": 33814 + }, + { + "epoch": 1.66, + "grad_norm": 0.5869815349578857, + "learning_rate": 0.0002508394384431613, + "loss": 2.9288, + "step": 33815 + }, + { + "epoch": 1.66, + "grad_norm": 0.6005563139915466, + "learning_rate": 0.00025082425044034335, + "loss": 3.2591, + "step": 33816 + }, + { + "epoch": 1.66, + "grad_norm": 0.6811476945877075, + "learning_rate": 0.00025080906256704486, + "loss": 2.9065, + "step": 33817 + }, + { + "epoch": 1.66, + "grad_norm": 0.5969555377960205, + "learning_rate": 0.0002507938748233059, + "loss": 3.1142, + "step": 33818 + }, + { + "epoch": 1.66, + "grad_norm": 0.6238395571708679, + "learning_rate": 0.0002507786872091662, + "loss": 2.9812, + "step": 33819 + }, + { + "epoch": 1.66, + "grad_norm": 0.5949825644493103, + "learning_rate": 0.0002507634997246662, + "loss": 2.9406, + "step": 33820 + }, + { + "epoch": 1.66, + "grad_norm": 0.5956006646156311, + "learning_rate": 0.00025074831236984556, + "loss": 3.0653, + "step": 33821 + }, + { + "epoch": 1.66, + "grad_norm": 0.6946335434913635, + "learning_rate": 0.00025073312514474445, + "loss": 2.8979, + "step": 33822 + }, + { + "epoch": 1.66, + "grad_norm": 0.6324100494384766, + "learning_rate": 0.0002507179380494027, + "loss": 3.1288, + "step": 33823 + }, + { + "epoch": 1.66, + "grad_norm": 0.6476609706878662, + "learning_rate": 0.00025070275108386047, + "loss": 3.1207, + "step": 33824 + }, + { + "epoch": 1.66, + "grad_norm": 0.6044153571128845, + "learning_rate": 0.0002506875642481578, + "loss": 3.0552, + "step": 33825 + }, + { + "epoch": 1.66, + "grad_norm": 0.6101298332214355, + "learning_rate": 0.00025067237754233443, + "loss": 3.069, + "step": 33826 + }, + { + "epoch": 1.66, + "grad_norm": 0.5707805752754211, + "learning_rate": 0.00025065719096643065, + "loss": 3.0449, + "step": 33827 + }, + { + "epoch": 1.66, + "grad_norm": 0.5993731617927551, + "learning_rate": 0.0002506420045204863, + "loss": 2.9649, + "step": 33828 + }, + { + "epoch": 1.66, + "grad_norm": 0.6095276474952698, + "learning_rate": 0.00025062681820454133, + "loss": 3.2008, + "step": 33829 + }, + { + "epoch": 1.66, + "grad_norm": 0.6098636984825134, + "learning_rate": 0.0002506116320186359, + "loss": 3.1267, + "step": 33830 + }, + { + "epoch": 1.66, + "grad_norm": 0.5973874926567078, + "learning_rate": 0.0002505964459628098, + "loss": 2.8534, + "step": 33831 + }, + { + "epoch": 1.66, + "grad_norm": 0.657274067401886, + "learning_rate": 0.0002505812600371034, + "loss": 2.9407, + "step": 33832 + }, + { + "epoch": 1.66, + "grad_norm": 0.6024130582809448, + "learning_rate": 0.0002505660742415563, + "loss": 3.1447, + "step": 33833 + }, + { + "epoch": 1.66, + "grad_norm": 0.6005028486251831, + "learning_rate": 0.0002505508885762086, + "loss": 3.1241, + "step": 33834 + }, + { + "epoch": 1.66, + "grad_norm": 0.6737073063850403, + "learning_rate": 0.0002505357030411004, + "loss": 3.0029, + "step": 33835 + }, + { + "epoch": 1.66, + "grad_norm": 0.603809118270874, + "learning_rate": 0.0002505205176362717, + "loss": 3.0558, + "step": 33836 + }, + { + "epoch": 1.66, + "grad_norm": 0.593603789806366, + "learning_rate": 0.0002505053323617624, + "loss": 3.203, + "step": 33837 + }, + { + "epoch": 1.66, + "grad_norm": 0.639643132686615, + "learning_rate": 0.0002504901472176124, + "loss": 3.0018, + "step": 33838 + }, + { + "epoch": 1.66, + "grad_norm": 0.6236515045166016, + "learning_rate": 0.00025047496220386193, + "loss": 3.0459, + "step": 33839 + }, + { + "epoch": 1.66, + "grad_norm": 0.6432270407676697, + "learning_rate": 0.00025045977732055094, + "loss": 2.9573, + "step": 33840 + }, + { + "epoch": 1.66, + "grad_norm": 0.6288147568702698, + "learning_rate": 0.00025044459256771923, + "loss": 2.9896, + "step": 33841 + }, + { + "epoch": 1.66, + "grad_norm": 0.6194936037063599, + "learning_rate": 0.0002504294079454071, + "loss": 3.1635, + "step": 33842 + }, + { + "epoch": 1.66, + "grad_norm": 0.6322032809257507, + "learning_rate": 0.00025041422345365425, + "loss": 3.0478, + "step": 33843 + }, + { + "epoch": 1.66, + "grad_norm": 0.6737121939659119, + "learning_rate": 0.0002503990390925007, + "loss": 3.3406, + "step": 33844 + }, + { + "epoch": 1.66, + "grad_norm": 0.6102167963981628, + "learning_rate": 0.0002503838548619867, + "loss": 3.1013, + "step": 33845 + }, + { + "epoch": 1.66, + "grad_norm": 0.6064083576202393, + "learning_rate": 0.00025036867076215204, + "loss": 3.0313, + "step": 33846 + }, + { + "epoch": 1.66, + "grad_norm": 0.6235889196395874, + "learning_rate": 0.0002503534867930368, + "loss": 3.0489, + "step": 33847 + }, + { + "epoch": 1.66, + "grad_norm": 0.6064871549606323, + "learning_rate": 0.0002503383029546808, + "loss": 2.9426, + "step": 33848 + }, + { + "epoch": 1.66, + "grad_norm": 0.6244485378265381, + "learning_rate": 0.0002503231192471241, + "loss": 2.994, + "step": 33849 + }, + { + "epoch": 1.66, + "grad_norm": 0.6445832252502441, + "learning_rate": 0.0002503079356704069, + "loss": 3.1693, + "step": 33850 + }, + { + "epoch": 1.66, + "grad_norm": 0.6180044412612915, + "learning_rate": 0.000250292752224569, + "loss": 3.1618, + "step": 33851 + }, + { + "epoch": 1.66, + "grad_norm": 0.6148596405982971, + "learning_rate": 0.00025027756890965043, + "loss": 2.8679, + "step": 33852 + }, + { + "epoch": 1.66, + "grad_norm": 0.6558221578598022, + "learning_rate": 0.00025026238572569104, + "loss": 3.1456, + "step": 33853 + }, + { + "epoch": 1.66, + "grad_norm": 0.5984779596328735, + "learning_rate": 0.00025024720267273113, + "loss": 3.0931, + "step": 33854 + }, + { + "epoch": 1.66, + "grad_norm": 0.5910894274711609, + "learning_rate": 0.00025023201975081044, + "loss": 2.8309, + "step": 33855 + }, + { + "epoch": 1.66, + "grad_norm": 0.6671299338340759, + "learning_rate": 0.000250216836959969, + "loss": 2.9913, + "step": 33856 + }, + { + "epoch": 1.66, + "grad_norm": 0.617927610874176, + "learning_rate": 0.00025020165430024693, + "loss": 3.0802, + "step": 33857 + }, + { + "epoch": 1.66, + "grad_norm": 0.6495857238769531, + "learning_rate": 0.000250186471771684, + "loss": 2.8715, + "step": 33858 + }, + { + "epoch": 1.66, + "grad_norm": 0.6269500851631165, + "learning_rate": 0.00025017128937432034, + "loss": 2.7541, + "step": 33859 + }, + { + "epoch": 1.66, + "grad_norm": 0.6183517575263977, + "learning_rate": 0.00025015610710819603, + "loss": 3.0808, + "step": 33860 + }, + { + "epoch": 1.66, + "grad_norm": 0.6176064014434814, + "learning_rate": 0.00025014092497335083, + "loss": 3.178, + "step": 33861 + }, + { + "epoch": 1.66, + "grad_norm": 0.609695315361023, + "learning_rate": 0.00025012574296982495, + "loss": 3.0251, + "step": 33862 + }, + { + "epoch": 1.66, + "grad_norm": 0.6329081654548645, + "learning_rate": 0.0002501105610976581, + "loss": 3.1642, + "step": 33863 + }, + { + "epoch": 1.66, + "grad_norm": 0.6028696894645691, + "learning_rate": 0.0002500953793568905, + "loss": 2.8275, + "step": 33864 + }, + { + "epoch": 1.66, + "grad_norm": 0.6294543147087097, + "learning_rate": 0.00025008019774756216, + "loss": 3.264, + "step": 33865 + }, + { + "epoch": 1.66, + "grad_norm": 0.5953629016876221, + "learning_rate": 0.0002500650162697128, + "loss": 3.0756, + "step": 33866 + }, + { + "epoch": 1.66, + "grad_norm": 0.6343550682067871, + "learning_rate": 0.00025004983492338273, + "loss": 3.1477, + "step": 33867 + }, + { + "epoch": 1.66, + "grad_norm": 0.581670343875885, + "learning_rate": 0.0002500346537086117, + "loss": 3.1717, + "step": 33868 + }, + { + "epoch": 1.66, + "grad_norm": 0.6213530898094177, + "learning_rate": 0.00025001947262543974, + "loss": 2.9365, + "step": 33869 + }, + { + "epoch": 1.66, + "grad_norm": 0.6029473543167114, + "learning_rate": 0.000250004291673907, + "loss": 3.0754, + "step": 33870 + }, + { + "epoch": 1.66, + "grad_norm": 0.6134632229804993, + "learning_rate": 0.00024998911085405315, + "loss": 2.8977, + "step": 33871 + }, + { + "epoch": 1.66, + "grad_norm": 0.6177032589912415, + "learning_rate": 0.00024997393016591853, + "loss": 3.1839, + "step": 33872 + }, + { + "epoch": 1.66, + "grad_norm": 0.6571273803710938, + "learning_rate": 0.0002499587496095429, + "loss": 3.0477, + "step": 33873 + }, + { + "epoch": 1.66, + "grad_norm": 0.6338697075843811, + "learning_rate": 0.0002499435691849662, + "loss": 2.9806, + "step": 33874 + }, + { + "epoch": 1.66, + "grad_norm": 0.6008161902427673, + "learning_rate": 0.00024992838889222865, + "loss": 2.9013, + "step": 33875 + }, + { + "epoch": 1.66, + "grad_norm": 0.6442448496818542, + "learning_rate": 0.00024991320873137, + "loss": 2.9811, + "step": 33876 + }, + { + "epoch": 1.66, + "grad_norm": 0.6157088875770569, + "learning_rate": 0.0002498980287024304, + "loss": 3.0265, + "step": 33877 + }, + { + "epoch": 1.66, + "grad_norm": 0.5938071012496948, + "learning_rate": 0.0002498828488054496, + "loss": 2.9337, + "step": 33878 + }, + { + "epoch": 1.66, + "grad_norm": 0.6022875905036926, + "learning_rate": 0.00024986766904046785, + "loss": 3.1691, + "step": 33879 + }, + { + "epoch": 1.66, + "grad_norm": 0.6283571720123291, + "learning_rate": 0.00024985248940752504, + "loss": 3.0208, + "step": 33880 + }, + { + "epoch": 1.66, + "grad_norm": 0.6032127141952515, + "learning_rate": 0.000249837309906661, + "loss": 2.9129, + "step": 33881 + }, + { + "epoch": 1.66, + "grad_norm": 0.6097046136856079, + "learning_rate": 0.00024982213053791595, + "loss": 3.0705, + "step": 33882 + }, + { + "epoch": 1.66, + "grad_norm": 0.5926503539085388, + "learning_rate": 0.00024980695130132963, + "loss": 3.0113, + "step": 33883 + }, + { + "epoch": 1.66, + "grad_norm": 0.6158053874969482, + "learning_rate": 0.0002497917721969423, + "loss": 3.1628, + "step": 33884 + }, + { + "epoch": 1.66, + "grad_norm": 0.6430631279945374, + "learning_rate": 0.0002497765932247936, + "loss": 3.0387, + "step": 33885 + }, + { + "epoch": 1.66, + "grad_norm": 0.5978126525878906, + "learning_rate": 0.0002497614143849238, + "loss": 2.8056, + "step": 33886 + }, + { + "epoch": 1.66, + "grad_norm": 0.5920930504798889, + "learning_rate": 0.0002497462356773728, + "loss": 2.8876, + "step": 33887 + }, + { + "epoch": 1.66, + "grad_norm": 0.5528014302253723, + "learning_rate": 0.0002497310571021804, + "loss": 2.97, + "step": 33888 + }, + { + "epoch": 1.66, + "grad_norm": 0.5907645225524902, + "learning_rate": 0.00024971587865938686, + "loss": 3.0249, + "step": 33889 + }, + { + "epoch": 1.66, + "grad_norm": 0.5735505223274231, + "learning_rate": 0.00024970070034903197, + "loss": 2.9847, + "step": 33890 + }, + { + "epoch": 1.66, + "grad_norm": 0.6211549639701843, + "learning_rate": 0.0002496855221711557, + "loss": 3.1012, + "step": 33891 + }, + { + "epoch": 1.66, + "grad_norm": 0.6353756785392761, + "learning_rate": 0.0002496703441257982, + "loss": 3.062, + "step": 33892 + }, + { + "epoch": 1.66, + "grad_norm": 0.5956559181213379, + "learning_rate": 0.00024965516621299914, + "loss": 2.9391, + "step": 33893 + }, + { + "epoch": 1.66, + "grad_norm": 0.598713755607605, + "learning_rate": 0.00024963998843279884, + "loss": 2.964, + "step": 33894 + }, + { + "epoch": 1.66, + "grad_norm": 0.6306696534156799, + "learning_rate": 0.0002496248107852371, + "loss": 2.8183, + "step": 33895 + }, + { + "epoch": 1.66, + "grad_norm": 0.6647626161575317, + "learning_rate": 0.00024960963327035383, + "loss": 2.9461, + "step": 33896 + }, + { + "epoch": 1.66, + "grad_norm": 0.6233382821083069, + "learning_rate": 0.00024959445588818924, + "loss": 3.1564, + "step": 33897 + }, + { + "epoch": 1.66, + "grad_norm": 0.6287595629692078, + "learning_rate": 0.00024957927863878304, + "loss": 2.9366, + "step": 33898 + }, + { + "epoch": 1.66, + "grad_norm": 0.6304962635040283, + "learning_rate": 0.0002495641015221754, + "loss": 3.1073, + "step": 33899 + }, + { + "epoch": 1.66, + "grad_norm": 0.6396434307098389, + "learning_rate": 0.00024954892453840606, + "loss": 2.9138, + "step": 33900 + }, + { + "epoch": 1.66, + "grad_norm": 0.5777924656867981, + "learning_rate": 0.0002495337476875152, + "loss": 2.9275, + "step": 33901 + }, + { + "epoch": 1.66, + "grad_norm": 0.6460705995559692, + "learning_rate": 0.00024951857096954286, + "loss": 2.9709, + "step": 33902 + }, + { + "epoch": 1.66, + "grad_norm": 0.607254683971405, + "learning_rate": 0.0002495033943845287, + "loss": 3.1602, + "step": 33903 + }, + { + "epoch": 1.66, + "grad_norm": 0.5625812411308289, + "learning_rate": 0.0002494882179325131, + "loss": 3.0044, + "step": 33904 + }, + { + "epoch": 1.66, + "grad_norm": 0.597536563873291, + "learning_rate": 0.00024947304161353564, + "loss": 3.281, + "step": 33905 + }, + { + "epoch": 1.66, + "grad_norm": 0.7313137650489807, + "learning_rate": 0.0002494578654276365, + "loss": 3.086, + "step": 33906 + }, + { + "epoch": 1.66, + "grad_norm": 0.6183013319969177, + "learning_rate": 0.00024944268937485565, + "loss": 2.8716, + "step": 33907 + }, + { + "epoch": 1.66, + "grad_norm": 0.6373917460441589, + "learning_rate": 0.000249427513455233, + "loss": 3.0631, + "step": 33908 + }, + { + "epoch": 1.66, + "grad_norm": 0.6329418420791626, + "learning_rate": 0.0002494123376688086, + "loss": 3.0995, + "step": 33909 + }, + { + "epoch": 1.66, + "grad_norm": 0.6428967118263245, + "learning_rate": 0.00024939716201562227, + "loss": 3.1429, + "step": 33910 + }, + { + "epoch": 1.66, + "grad_norm": 0.5830432772636414, + "learning_rate": 0.00024938198649571407, + "loss": 2.8414, + "step": 33911 + }, + { + "epoch": 1.66, + "grad_norm": 0.6218639612197876, + "learning_rate": 0.0002493668111091241, + "loss": 3.0054, + "step": 33912 + }, + { + "epoch": 1.66, + "grad_norm": 0.6176791787147522, + "learning_rate": 0.0002493516358558921, + "loss": 2.9076, + "step": 33913 + }, + { + "epoch": 1.66, + "grad_norm": 0.6287698745727539, + "learning_rate": 0.00024933646073605826, + "loss": 3.1468, + "step": 33914 + }, + { + "epoch": 1.66, + "grad_norm": 0.6077466011047363, + "learning_rate": 0.00024932128574966226, + "loss": 3.0103, + "step": 33915 + }, + { + "epoch": 1.66, + "grad_norm": 0.6049577593803406, + "learning_rate": 0.00024930611089674434, + "loss": 2.6801, + "step": 33916 + }, + { + "epoch": 1.66, + "grad_norm": 0.5947195887565613, + "learning_rate": 0.0002492909361773444, + "loss": 3.1092, + "step": 33917 + }, + { + "epoch": 1.66, + "grad_norm": 0.6179590225219727, + "learning_rate": 0.0002492757615915023, + "loss": 3.0234, + "step": 33918 + }, + { + "epoch": 1.66, + "grad_norm": 0.6137421727180481, + "learning_rate": 0.0002492605871392582, + "loss": 3.0543, + "step": 33919 + }, + { + "epoch": 1.66, + "grad_norm": 0.6110509037971497, + "learning_rate": 0.00024924541282065184, + "loss": 3.0235, + "step": 33920 + }, + { + "epoch": 1.66, + "grad_norm": 0.604406476020813, + "learning_rate": 0.00024923023863572324, + "loss": 2.8201, + "step": 33921 + }, + { + "epoch": 1.66, + "grad_norm": 0.5563648343086243, + "learning_rate": 0.00024921506458451255, + "loss": 3.0609, + "step": 33922 + }, + { + "epoch": 1.66, + "grad_norm": 0.6176638007164001, + "learning_rate": 0.00024919989066705955, + "loss": 2.9723, + "step": 33923 + }, + { + "epoch": 1.66, + "grad_norm": 0.5700177550315857, + "learning_rate": 0.00024918471688340437, + "loss": 3.0213, + "step": 33924 + }, + { + "epoch": 1.66, + "grad_norm": 0.6631049513816833, + "learning_rate": 0.0002491695432335867, + "loss": 2.908, + "step": 33925 + }, + { + "epoch": 1.66, + "grad_norm": 0.6572071313858032, + "learning_rate": 0.0002491543697176467, + "loss": 3.0808, + "step": 33926 + }, + { + "epoch": 1.66, + "grad_norm": 0.5589262843132019, + "learning_rate": 0.0002491391963356244, + "loss": 2.9899, + "step": 33927 + }, + { + "epoch": 1.66, + "grad_norm": 0.6265024542808533, + "learning_rate": 0.00024912402308755957, + "loss": 2.9103, + "step": 33928 + }, + { + "epoch": 1.66, + "grad_norm": 0.5696996450424194, + "learning_rate": 0.00024910884997349236, + "loss": 3.0077, + "step": 33929 + }, + { + "epoch": 1.66, + "grad_norm": 0.6142954230308533, + "learning_rate": 0.0002490936769934626, + "loss": 3.0136, + "step": 33930 + }, + { + "epoch": 1.66, + "grad_norm": 0.5850833654403687, + "learning_rate": 0.00024907850414751016, + "loss": 3.0001, + "step": 33931 + }, + { + "epoch": 1.66, + "grad_norm": 0.6385449171066284, + "learning_rate": 0.0002490633314356754, + "loss": 3.005, + "step": 33932 + }, + { + "epoch": 1.66, + "grad_norm": 0.6238567233085632, + "learning_rate": 0.0002490481588579978, + "loss": 3.1686, + "step": 33933 + }, + { + "epoch": 1.66, + "grad_norm": 0.6315261721611023, + "learning_rate": 0.0002490329864145177, + "loss": 3.2475, + "step": 33934 + }, + { + "epoch": 1.66, + "grad_norm": 0.6120766997337341, + "learning_rate": 0.0002490178141052748, + "loss": 2.8656, + "step": 33935 + }, + { + "epoch": 1.66, + "grad_norm": 0.6094756126403809, + "learning_rate": 0.0002490026419303092, + "loss": 2.949, + "step": 33936 + }, + { + "epoch": 1.66, + "grad_norm": 0.6236438751220703, + "learning_rate": 0.0002489874698896609, + "loss": 3.0454, + "step": 33937 + }, + { + "epoch": 1.66, + "grad_norm": 0.7023098468780518, + "learning_rate": 0.00024897229798336967, + "loss": 2.8311, + "step": 33938 + }, + { + "epoch": 1.66, + "grad_norm": 0.609567403793335, + "learning_rate": 0.00024895712621147574, + "loss": 3.0063, + "step": 33939 + }, + { + "epoch": 1.66, + "grad_norm": 0.6077160835266113, + "learning_rate": 0.00024894195457401876, + "loss": 3.0588, + "step": 33940 + }, + { + "epoch": 1.66, + "grad_norm": 0.6205511093139648, + "learning_rate": 0.00024892678307103885, + "loss": 3.0441, + "step": 33941 + }, + { + "epoch": 1.66, + "grad_norm": 0.626202404499054, + "learning_rate": 0.00024891161170257606, + "loss": 3.137, + "step": 33942 + }, + { + "epoch": 1.66, + "grad_norm": 0.5801575779914856, + "learning_rate": 0.0002488964404686702, + "loss": 2.9521, + "step": 33943 + }, + { + "epoch": 1.66, + "grad_norm": 0.6224533319473267, + "learning_rate": 0.00024888126936936135, + "loss": 3.0051, + "step": 33944 + }, + { + "epoch": 1.66, + "grad_norm": 0.6122703552246094, + "learning_rate": 0.00024886609840468933, + "loss": 2.7792, + "step": 33945 + }, + { + "epoch": 1.66, + "grad_norm": 0.6122666001319885, + "learning_rate": 0.0002488509275746941, + "loss": 2.8985, + "step": 33946 + }, + { + "epoch": 1.66, + "grad_norm": 0.5584424734115601, + "learning_rate": 0.0002488357568794158, + "loss": 2.9583, + "step": 33947 + }, + { + "epoch": 1.66, + "grad_norm": 0.601530909538269, + "learning_rate": 0.0002488205863188942, + "loss": 2.8557, + "step": 33948 + }, + { + "epoch": 1.66, + "grad_norm": 0.6443337798118591, + "learning_rate": 0.0002488054158931694, + "loss": 2.9457, + "step": 33949 + }, + { + "epoch": 1.66, + "grad_norm": 0.5796291828155518, + "learning_rate": 0.0002487902456022812, + "loss": 2.9709, + "step": 33950 + }, + { + "epoch": 1.66, + "grad_norm": 0.5982805490493774, + "learning_rate": 0.0002487750754462696, + "loss": 3.193, + "step": 33951 + }, + { + "epoch": 1.66, + "grad_norm": 0.5929266810417175, + "learning_rate": 0.0002487599054251747, + "loss": 3.1062, + "step": 33952 + }, + { + "epoch": 1.66, + "grad_norm": 0.6260059475898743, + "learning_rate": 0.0002487447355390363, + "loss": 2.9633, + "step": 33953 + }, + { + "epoch": 1.66, + "grad_norm": 0.6307040452957153, + "learning_rate": 0.0002487295657878945, + "loss": 3.0025, + "step": 33954 + }, + { + "epoch": 1.66, + "grad_norm": 0.6263796091079712, + "learning_rate": 0.000248714396171789, + "loss": 2.8704, + "step": 33955 + }, + { + "epoch": 1.66, + "grad_norm": 0.5983632206916809, + "learning_rate": 0.00024869922669076, + "loss": 3.1199, + "step": 33956 + }, + { + "epoch": 1.66, + "grad_norm": 0.6320955157279968, + "learning_rate": 0.0002486840573448474, + "loss": 3.0689, + "step": 33957 + }, + { + "epoch": 1.66, + "grad_norm": 0.6292237639427185, + "learning_rate": 0.00024866888813409097, + "loss": 2.9389, + "step": 33958 + }, + { + "epoch": 1.66, + "grad_norm": 0.6104917526245117, + "learning_rate": 0.00024865371905853097, + "loss": 3.0677, + "step": 33959 + }, + { + "epoch": 1.66, + "grad_norm": 0.6262634992599487, + "learning_rate": 0.0002486385501182071, + "loss": 2.9518, + "step": 33960 + }, + { + "epoch": 1.66, + "grad_norm": 0.6107706427574158, + "learning_rate": 0.0002486233813131595, + "loss": 2.9798, + "step": 33961 + }, + { + "epoch": 1.66, + "grad_norm": 0.6529039144515991, + "learning_rate": 0.0002486082126434279, + "loss": 3.1314, + "step": 33962 + }, + { + "epoch": 1.66, + "grad_norm": 0.6255234479904175, + "learning_rate": 0.0002485930441090524, + "loss": 3.112, + "step": 33963 + }, + { + "epoch": 1.66, + "grad_norm": 0.5849068760871887, + "learning_rate": 0.0002485778757100731, + "loss": 3.0832, + "step": 33964 + }, + { + "epoch": 1.66, + "grad_norm": 0.6300506591796875, + "learning_rate": 0.0002485627074465295, + "loss": 2.8947, + "step": 33965 + }, + { + "epoch": 1.66, + "grad_norm": 0.6072397828102112, + "learning_rate": 0.00024854753931846207, + "loss": 2.9836, + "step": 33966 + }, + { + "epoch": 1.66, + "grad_norm": 0.6096535325050354, + "learning_rate": 0.0002485323713259104, + "loss": 2.876, + "step": 33967 + }, + { + "epoch": 1.66, + "grad_norm": 0.6128242611885071, + "learning_rate": 0.0002485172034689145, + "loss": 3.1772, + "step": 33968 + }, + { + "epoch": 1.66, + "grad_norm": 0.6279151439666748, + "learning_rate": 0.00024850203574751454, + "loss": 2.9902, + "step": 33969 + }, + { + "epoch": 1.66, + "grad_norm": 0.7040567398071289, + "learning_rate": 0.0002484868681617502, + "loss": 3.0256, + "step": 33970 + }, + { + "epoch": 1.66, + "grad_norm": 0.6096146702766418, + "learning_rate": 0.0002484717007116616, + "loss": 2.8562, + "step": 33971 + }, + { + "epoch": 1.66, + "grad_norm": 0.5939738750457764, + "learning_rate": 0.0002484565333972885, + "loss": 3.1068, + "step": 33972 + }, + { + "epoch": 1.66, + "grad_norm": 0.6173325181007385, + "learning_rate": 0.00024844136621867096, + "loss": 3.0958, + "step": 33973 + }, + { + "epoch": 1.66, + "grad_norm": 0.6269416213035583, + "learning_rate": 0.0002484261991758491, + "loss": 3.0309, + "step": 33974 + }, + { + "epoch": 1.67, + "grad_norm": 0.6126830577850342, + "learning_rate": 0.00024841103226886263, + "loss": 3.0065, + "step": 33975 + }, + { + "epoch": 1.67, + "grad_norm": 0.6426466107368469, + "learning_rate": 0.00024839586549775164, + "loss": 3.0531, + "step": 33976 + }, + { + "epoch": 1.67, + "grad_norm": 0.6027688384056091, + "learning_rate": 0.0002483806988625559, + "loss": 3.0285, + "step": 33977 + }, + { + "epoch": 1.67, + "grad_norm": 0.6397801637649536, + "learning_rate": 0.0002483655323633155, + "loss": 3.0949, + "step": 33978 + }, + { + "epoch": 1.67, + "grad_norm": 0.591986894607544, + "learning_rate": 0.0002483503660000704, + "loss": 3.1487, + "step": 33979 + }, + { + "epoch": 1.67, + "grad_norm": 0.6391310095787048, + "learning_rate": 0.00024833519977286044, + "loss": 3.0401, + "step": 33980 + }, + { + "epoch": 1.67, + "grad_norm": 0.6004000306129456, + "learning_rate": 0.00024832003368172566, + "loss": 3.0843, + "step": 33981 + }, + { + "epoch": 1.67, + "grad_norm": 0.5886471271514893, + "learning_rate": 0.00024830486772670595, + "loss": 3.0734, + "step": 33982 + }, + { + "epoch": 1.67, + "grad_norm": 0.616348385810852, + "learning_rate": 0.0002482897019078412, + "loss": 2.848, + "step": 33983 + }, + { + "epoch": 1.67, + "grad_norm": 0.6533381342887878, + "learning_rate": 0.0002482745362251715, + "loss": 3.0357, + "step": 33984 + }, + { + "epoch": 1.67, + "grad_norm": 0.6193398237228394, + "learning_rate": 0.00024825937067873675, + "loss": 3.0845, + "step": 33985 + }, + { + "epoch": 1.67, + "grad_norm": 0.604232668876648, + "learning_rate": 0.0002482442052685769, + "loss": 3.1351, + "step": 33986 + }, + { + "epoch": 1.67, + "grad_norm": 0.6174710392951965, + "learning_rate": 0.0002482290399947317, + "loss": 3.1375, + "step": 33987 + }, + { + "epoch": 1.67, + "grad_norm": 0.5886451005935669, + "learning_rate": 0.0002482138748572413, + "loss": 3.0255, + "step": 33988 + }, + { + "epoch": 1.67, + "grad_norm": 0.6079821586608887, + "learning_rate": 0.00024819870985614564, + "loss": 2.9344, + "step": 33989 + }, + { + "epoch": 1.67, + "grad_norm": 0.669161856174469, + "learning_rate": 0.00024818354499148447, + "loss": 3.0288, + "step": 33990 + }, + { + "epoch": 1.67, + "grad_norm": 0.6441835761070251, + "learning_rate": 0.00024816838026329806, + "loss": 2.9344, + "step": 33991 + }, + { + "epoch": 1.67, + "grad_norm": 0.6017826795578003, + "learning_rate": 0.00024815321567162605, + "loss": 2.8813, + "step": 33992 + }, + { + "epoch": 1.67, + "grad_norm": 0.6216018199920654, + "learning_rate": 0.0002481380512165084, + "loss": 2.8692, + "step": 33993 + }, + { + "epoch": 1.67, + "grad_norm": 0.6077483892440796, + "learning_rate": 0.00024812288689798537, + "loss": 3.0153, + "step": 33994 + }, + { + "epoch": 1.67, + "grad_norm": 0.8035667538642883, + "learning_rate": 0.00024810772271609646, + "loss": 3.3342, + "step": 33995 + }, + { + "epoch": 1.67, + "grad_norm": 0.6364055275917053, + "learning_rate": 0.000248092558670882, + "loss": 2.9934, + "step": 33996 + }, + { + "epoch": 1.67, + "grad_norm": 0.5845009684562683, + "learning_rate": 0.00024807739476238167, + "loss": 3.0476, + "step": 33997 + }, + { + "epoch": 1.67, + "grad_norm": 0.6098141670227051, + "learning_rate": 0.0002480622309906354, + "loss": 2.9357, + "step": 33998 + }, + { + "epoch": 1.67, + "grad_norm": 0.6033996343612671, + "learning_rate": 0.00024804706735568336, + "loss": 3.0799, + "step": 33999 + }, + { + "epoch": 1.67, + "grad_norm": 0.6313714385032654, + "learning_rate": 0.0002480319038575653, + "loss": 3.0056, + "step": 34000 + }, + { + "epoch": 1.67, + "grad_norm": 0.6305382251739502, + "learning_rate": 0.00024801674049632126, + "loss": 2.9569, + "step": 34001 + }, + { + "epoch": 1.67, + "grad_norm": 0.6073910593986511, + "learning_rate": 0.000248001577271991, + "loss": 2.9318, + "step": 34002 + }, + { + "epoch": 1.67, + "grad_norm": 0.594529926776886, + "learning_rate": 0.0002479864141846146, + "loss": 2.9888, + "step": 34003 + }, + { + "epoch": 1.67, + "grad_norm": 0.6212973594665527, + "learning_rate": 0.0002479712512342321, + "loss": 2.9173, + "step": 34004 + }, + { + "epoch": 1.67, + "grad_norm": 0.6293397545814514, + "learning_rate": 0.0002479560884208831, + "loss": 2.8407, + "step": 34005 + }, + { + "epoch": 1.67, + "grad_norm": 0.6491313576698303, + "learning_rate": 0.00024794092574460797, + "loss": 2.8996, + "step": 34006 + }, + { + "epoch": 1.67, + "grad_norm": 0.583442747592926, + "learning_rate": 0.0002479257632054463, + "loss": 2.928, + "step": 34007 + }, + { + "epoch": 1.67, + "grad_norm": 0.6074749827384949, + "learning_rate": 0.0002479106008034381, + "loss": 2.8657, + "step": 34008 + }, + { + "epoch": 1.67, + "grad_norm": 0.5930288434028625, + "learning_rate": 0.0002478954385386235, + "loss": 3.0198, + "step": 34009 + }, + { + "epoch": 1.67, + "grad_norm": 0.6107326745986938, + "learning_rate": 0.0002478802764110422, + "loss": 2.9917, + "step": 34010 + }, + { + "epoch": 1.67, + "grad_norm": 0.5948980450630188, + "learning_rate": 0.0002478651144207342, + "loss": 3.0944, + "step": 34011 + }, + { + "epoch": 1.67, + "grad_norm": 0.6396520137786865, + "learning_rate": 0.0002478499525677395, + "loss": 2.9329, + "step": 34012 + }, + { + "epoch": 1.67, + "grad_norm": 0.6055231094360352, + "learning_rate": 0.0002478347908520979, + "loss": 3.2016, + "step": 34013 + }, + { + "epoch": 1.67, + "grad_norm": 0.6053661108016968, + "learning_rate": 0.00024781962927384953, + "loss": 2.8238, + "step": 34014 + }, + { + "epoch": 1.67, + "grad_norm": 0.6582406163215637, + "learning_rate": 0.00024780446783303415, + "loss": 2.8245, + "step": 34015 + }, + { + "epoch": 1.67, + "grad_norm": 0.6152645349502563, + "learning_rate": 0.00024778930652969187, + "loss": 3.0896, + "step": 34016 + }, + { + "epoch": 1.67, + "grad_norm": 0.6121913194656372, + "learning_rate": 0.00024777414536386236, + "loss": 3.1472, + "step": 34017 + }, + { + "epoch": 1.67, + "grad_norm": 0.6189271807670593, + "learning_rate": 0.00024775898433558575, + "loss": 3.2943, + "step": 34018 + }, + { + "epoch": 1.67, + "grad_norm": 0.6674231886863708, + "learning_rate": 0.000247743823444902, + "loss": 2.8963, + "step": 34019 + }, + { + "epoch": 1.67, + "grad_norm": 0.6085709929466248, + "learning_rate": 0.00024772866269185084, + "loss": 2.9968, + "step": 34020 + }, + { + "epoch": 1.67, + "grad_norm": 0.6044761538505554, + "learning_rate": 0.0002477135020764724, + "loss": 2.9718, + "step": 34021 + }, + { + "epoch": 1.67, + "grad_norm": 0.6364777088165283, + "learning_rate": 0.00024769834159880656, + "loss": 3.092, + "step": 34022 + }, + { + "epoch": 1.67, + "grad_norm": 0.6292357444763184, + "learning_rate": 0.00024768318125889305, + "loss": 2.865, + "step": 34023 + }, + { + "epoch": 1.67, + "grad_norm": 0.6410455703735352, + "learning_rate": 0.0002476680210567722, + "loss": 2.8619, + "step": 34024 + }, + { + "epoch": 1.67, + "grad_norm": 0.6323256492614746, + "learning_rate": 0.0002476528609924836, + "loss": 2.9672, + "step": 34025 + }, + { + "epoch": 1.67, + "grad_norm": 0.6153978705406189, + "learning_rate": 0.00024763770106606733, + "loss": 3.0129, + "step": 34026 + }, + { + "epoch": 1.67, + "grad_norm": 0.6172310709953308, + "learning_rate": 0.0002476225412775632, + "loss": 3.1617, + "step": 34027 + }, + { + "epoch": 1.67, + "grad_norm": 0.6111814379692078, + "learning_rate": 0.00024760738162701125, + "loss": 2.8775, + "step": 34028 + }, + { + "epoch": 1.67, + "grad_norm": 0.6040524840354919, + "learning_rate": 0.00024759222211445144, + "loss": 3.0851, + "step": 34029 + }, + { + "epoch": 1.67, + "grad_norm": 0.6083324551582336, + "learning_rate": 0.0002475770627399235, + "loss": 3.2039, + "step": 34030 + }, + { + "epoch": 1.67, + "grad_norm": 0.6631869673728943, + "learning_rate": 0.0002475619035034676, + "loss": 3.0856, + "step": 34031 + }, + { + "epoch": 1.67, + "grad_norm": 0.605511486530304, + "learning_rate": 0.00024754674440512355, + "loss": 3.0132, + "step": 34032 + }, + { + "epoch": 1.67, + "grad_norm": 0.6278396248817444, + "learning_rate": 0.0002475315854449312, + "loss": 2.935, + "step": 34033 + }, + { + "epoch": 1.67, + "grad_norm": 0.6231174468994141, + "learning_rate": 0.00024751642662293065, + "loss": 2.9998, + "step": 34034 + }, + { + "epoch": 1.67, + "grad_norm": 0.6460698246955872, + "learning_rate": 0.0002475012679391616, + "loss": 3.1249, + "step": 34035 + }, + { + "epoch": 1.67, + "grad_norm": 0.6108884811401367, + "learning_rate": 0.00024748610939366426, + "loss": 3.1124, + "step": 34036 + }, + { + "epoch": 1.67, + "grad_norm": 0.67753005027771, + "learning_rate": 0.00024747095098647833, + "loss": 3.0618, + "step": 34037 + }, + { + "epoch": 1.67, + "grad_norm": 0.6046687364578247, + "learning_rate": 0.00024745579271764375, + "loss": 3.1446, + "step": 34038 + }, + { + "epoch": 1.67, + "grad_norm": 0.701672375202179, + "learning_rate": 0.0002474406345872006, + "loss": 2.9236, + "step": 34039 + }, + { + "epoch": 1.67, + "grad_norm": 0.60172438621521, + "learning_rate": 0.00024742547659518867, + "loss": 3.0169, + "step": 34040 + }, + { + "epoch": 1.67, + "grad_norm": 0.5958285927772522, + "learning_rate": 0.000247410318741648, + "loss": 3.0876, + "step": 34041 + }, + { + "epoch": 1.67, + "grad_norm": 0.6142698526382446, + "learning_rate": 0.0002473951610266182, + "loss": 3.2667, + "step": 34042 + }, + { + "epoch": 1.67, + "grad_norm": 0.5789704322814941, + "learning_rate": 0.00024738000345013965, + "loss": 2.9274, + "step": 34043 + }, + { + "epoch": 1.67, + "grad_norm": 0.6565642356872559, + "learning_rate": 0.0002473648460122519, + "loss": 3.2293, + "step": 34044 + }, + { + "epoch": 1.67, + "grad_norm": 0.6168193221092224, + "learning_rate": 0.00024734968871299506, + "loss": 3.166, + "step": 34045 + }, + { + "epoch": 1.67, + "grad_norm": 0.6158468127250671, + "learning_rate": 0.00024733453155240906, + "loss": 3.1144, + "step": 34046 + }, + { + "epoch": 1.67, + "grad_norm": 0.6314502954483032, + "learning_rate": 0.0002473193745305337, + "loss": 2.9168, + "step": 34047 + }, + { + "epoch": 1.67, + "grad_norm": 0.6529750823974609, + "learning_rate": 0.00024730421764740905, + "loss": 3.116, + "step": 34048 + }, + { + "epoch": 1.67, + "grad_norm": 0.6014294028282166, + "learning_rate": 0.00024728906090307483, + "loss": 3.053, + "step": 34049 + }, + { + "epoch": 1.67, + "grad_norm": 0.6091155409812927, + "learning_rate": 0.0002472739042975712, + "loss": 2.8216, + "step": 34050 + }, + { + "epoch": 1.67, + "grad_norm": 0.641144871711731, + "learning_rate": 0.00024725874783093795, + "loss": 3.0557, + "step": 34051 + }, + { + "epoch": 1.67, + "grad_norm": 0.6270335912704468, + "learning_rate": 0.00024724359150321486, + "loss": 3.1368, + "step": 34052 + }, + { + "epoch": 1.67, + "grad_norm": 0.5929427146911621, + "learning_rate": 0.00024722843531444215, + "loss": 3.031, + "step": 34053 + }, + { + "epoch": 1.67, + "grad_norm": 0.6024312376976013, + "learning_rate": 0.00024721327926465955, + "loss": 2.8996, + "step": 34054 + }, + { + "epoch": 1.67, + "grad_norm": 0.6244577765464783, + "learning_rate": 0.0002471981233539069, + "loss": 2.9659, + "step": 34055 + }, + { + "epoch": 1.67, + "grad_norm": 0.6653500199317932, + "learning_rate": 0.0002471829675822244, + "loss": 2.9293, + "step": 34056 + }, + { + "epoch": 1.67, + "grad_norm": 0.583678662776947, + "learning_rate": 0.0002471678119496516, + "loss": 3.0174, + "step": 34057 + }, + { + "epoch": 1.67, + "grad_norm": 0.6141014099121094, + "learning_rate": 0.00024715265645622883, + "loss": 3.1254, + "step": 34058 + }, + { + "epoch": 1.67, + "grad_norm": 0.6307711601257324, + "learning_rate": 0.0002471375011019957, + "loss": 2.8053, + "step": 34059 + }, + { + "epoch": 1.67, + "grad_norm": 0.5954844951629639, + "learning_rate": 0.0002471223458869922, + "loss": 2.8284, + "step": 34060 + }, + { + "epoch": 1.67, + "grad_norm": 0.6285943388938904, + "learning_rate": 0.00024710719081125833, + "loss": 2.9632, + "step": 34061 + }, + { + "epoch": 1.67, + "grad_norm": 0.6757904887199402, + "learning_rate": 0.00024709203587483384, + "loss": 2.9003, + "step": 34062 + }, + { + "epoch": 1.67, + "grad_norm": 0.5749251842498779, + "learning_rate": 0.0002470768810777589, + "loss": 3.0031, + "step": 34063 + }, + { + "epoch": 1.67, + "grad_norm": 0.6063864231109619, + "learning_rate": 0.0002470617264200731, + "loss": 2.9874, + "step": 34064 + }, + { + "epoch": 1.67, + "grad_norm": 0.6300905346870422, + "learning_rate": 0.0002470465719018166, + "loss": 2.8773, + "step": 34065 + }, + { + "epoch": 1.67, + "grad_norm": 0.6623172760009766, + "learning_rate": 0.00024703141752302927, + "loss": 2.9534, + "step": 34066 + }, + { + "epoch": 1.67, + "grad_norm": 0.6001306772232056, + "learning_rate": 0.0002470162632837509, + "loss": 2.909, + "step": 34067 + }, + { + "epoch": 1.67, + "grad_norm": 0.6355345249176025, + "learning_rate": 0.0002470011091840216, + "loss": 3.036, + "step": 34068 + }, + { + "epoch": 1.67, + "grad_norm": 0.6132904887199402, + "learning_rate": 0.00024698595522388114, + "loss": 3.0769, + "step": 34069 + }, + { + "epoch": 1.67, + "grad_norm": 0.6177525520324707, + "learning_rate": 0.0002469708014033694, + "loss": 2.792, + "step": 34070 + }, + { + "epoch": 1.67, + "grad_norm": 0.6004658937454224, + "learning_rate": 0.0002469556477225265, + "loss": 3.1412, + "step": 34071 + }, + { + "epoch": 1.67, + "grad_norm": 0.6170461177825928, + "learning_rate": 0.0002469404941813921, + "loss": 3.1152, + "step": 34072 + }, + { + "epoch": 1.67, + "grad_norm": 0.5980595350265503, + "learning_rate": 0.0002469253407800063, + "loss": 3.123, + "step": 34073 + }, + { + "epoch": 1.67, + "grad_norm": 0.6601492762565613, + "learning_rate": 0.0002469101875184089, + "loss": 2.977, + "step": 34074 + }, + { + "epoch": 1.67, + "grad_norm": 0.6366380453109741, + "learning_rate": 0.00024689503439663974, + "loss": 2.9476, + "step": 34075 + }, + { + "epoch": 1.67, + "grad_norm": 0.656537652015686, + "learning_rate": 0.000246879881414739, + "loss": 3.1397, + "step": 34076 + }, + { + "epoch": 1.67, + "grad_norm": 0.629987359046936, + "learning_rate": 0.0002468647285727464, + "loss": 2.9363, + "step": 34077 + }, + { + "epoch": 1.67, + "grad_norm": 0.6459150314331055, + "learning_rate": 0.0002468495758707019, + "loss": 3.0619, + "step": 34078 + }, + { + "epoch": 1.67, + "grad_norm": 0.6273544430732727, + "learning_rate": 0.0002468344233086453, + "loss": 2.9486, + "step": 34079 + }, + { + "epoch": 1.67, + "grad_norm": 0.6106494665145874, + "learning_rate": 0.00024681927088661664, + "loss": 3.2629, + "step": 34080 + }, + { + "epoch": 1.67, + "grad_norm": 0.6219441890716553, + "learning_rate": 0.0002468041186046558, + "loss": 3.2484, + "step": 34081 + }, + { + "epoch": 1.67, + "grad_norm": 0.6537150144577026, + "learning_rate": 0.0002467889664628026, + "loss": 2.9886, + "step": 34082 + }, + { + "epoch": 1.67, + "grad_norm": 0.6156889200210571, + "learning_rate": 0.0002467738144610972, + "loss": 3.1849, + "step": 34083 + }, + { + "epoch": 1.67, + "grad_norm": 0.6050704717636108, + "learning_rate": 0.0002467586625995792, + "loss": 2.9982, + "step": 34084 + }, + { + "epoch": 1.67, + "grad_norm": 0.6336327195167542, + "learning_rate": 0.00024674351087828857, + "loss": 3.1285, + "step": 34085 + }, + { + "epoch": 1.67, + "grad_norm": 0.6327105164527893, + "learning_rate": 0.00024672835929726543, + "loss": 2.9639, + "step": 34086 + }, + { + "epoch": 1.67, + "grad_norm": 0.6417514681816101, + "learning_rate": 0.0002467132078565495, + "loss": 3.111, + "step": 34087 + }, + { + "epoch": 1.67, + "grad_norm": 0.6045467257499695, + "learning_rate": 0.0002466980565561807, + "loss": 3.2154, + "step": 34088 + }, + { + "epoch": 1.67, + "grad_norm": 0.9114331007003784, + "learning_rate": 0.0002466829053961989, + "loss": 3.1789, + "step": 34089 + }, + { + "epoch": 1.67, + "grad_norm": 0.5843703746795654, + "learning_rate": 0.0002466677543766441, + "loss": 3.1093, + "step": 34090 + }, + { + "epoch": 1.67, + "grad_norm": 0.6175855398178101, + "learning_rate": 0.0002466526034975563, + "loss": 3.1485, + "step": 34091 + }, + { + "epoch": 1.67, + "grad_norm": 0.6193850636482239, + "learning_rate": 0.0002466374527589751, + "loss": 3.0645, + "step": 34092 + }, + { + "epoch": 1.67, + "grad_norm": 0.6025353670120239, + "learning_rate": 0.0002466223021609407, + "loss": 2.9797, + "step": 34093 + }, + { + "epoch": 1.67, + "grad_norm": 0.6327268481254578, + "learning_rate": 0.00024660715170349285, + "loss": 3.2005, + "step": 34094 + }, + { + "epoch": 1.67, + "grad_norm": 0.6247721910476685, + "learning_rate": 0.0002465920013866714, + "loss": 3.0138, + "step": 34095 + }, + { + "epoch": 1.67, + "grad_norm": 0.6387418508529663, + "learning_rate": 0.0002465768512105165, + "loss": 3.1151, + "step": 34096 + }, + { + "epoch": 1.67, + "grad_norm": 0.6263685822486877, + "learning_rate": 0.00024656170117506773, + "loss": 2.9747, + "step": 34097 + }, + { + "epoch": 1.67, + "grad_norm": 0.6204245686531067, + "learning_rate": 0.00024654655128036526, + "loss": 2.9732, + "step": 34098 + }, + { + "epoch": 1.67, + "grad_norm": 0.5845150947570801, + "learning_rate": 0.0002465314015264489, + "loss": 3.0868, + "step": 34099 + }, + { + "epoch": 1.67, + "grad_norm": 0.6110867857933044, + "learning_rate": 0.0002465162519133584, + "loss": 2.8015, + "step": 34100 + }, + { + "epoch": 1.67, + "grad_norm": 0.6433117389678955, + "learning_rate": 0.0002465011024411339, + "loss": 2.9421, + "step": 34101 + }, + { + "epoch": 1.67, + "grad_norm": 0.6174062490463257, + "learning_rate": 0.0002464859531098152, + "loss": 2.854, + "step": 34102 + }, + { + "epoch": 1.67, + "grad_norm": 0.5987910032272339, + "learning_rate": 0.0002464708039194423, + "loss": 3.1167, + "step": 34103 + }, + { + "epoch": 1.67, + "grad_norm": 0.6083313226699829, + "learning_rate": 0.0002464556548700548, + "loss": 3.0495, + "step": 34104 + }, + { + "epoch": 1.67, + "grad_norm": 0.633455753326416, + "learning_rate": 0.0002464405059616929, + "loss": 2.8616, + "step": 34105 + }, + { + "epoch": 1.67, + "grad_norm": 0.6219099760055542, + "learning_rate": 0.00024642535719439644, + "loss": 3.1337, + "step": 34106 + }, + { + "epoch": 1.67, + "grad_norm": 0.6289411783218384, + "learning_rate": 0.0002464102085682052, + "loss": 3.1779, + "step": 34107 + }, + { + "epoch": 1.67, + "grad_norm": 0.6306152939796448, + "learning_rate": 0.00024639506008315926, + "loss": 3.1277, + "step": 34108 + }, + { + "epoch": 1.67, + "grad_norm": 0.6876551508903503, + "learning_rate": 0.0002463799117392983, + "loss": 3.0214, + "step": 34109 + }, + { + "epoch": 1.67, + "grad_norm": 0.598209023475647, + "learning_rate": 0.00024636476353666234, + "loss": 3.0505, + "step": 34110 + }, + { + "epoch": 1.67, + "grad_norm": 0.645162045955658, + "learning_rate": 0.0002463496154752914, + "loss": 3.0546, + "step": 34111 + }, + { + "epoch": 1.67, + "grad_norm": 0.6210871934890747, + "learning_rate": 0.0002463344675552251, + "loss": 3.1667, + "step": 34112 + }, + { + "epoch": 1.67, + "grad_norm": 0.5721740126609802, + "learning_rate": 0.0002463193197765036, + "loss": 2.9053, + "step": 34113 + }, + { + "epoch": 1.67, + "grad_norm": 0.5755153298377991, + "learning_rate": 0.00024630417213916656, + "loss": 2.8611, + "step": 34114 + }, + { + "epoch": 1.67, + "grad_norm": 0.615121066570282, + "learning_rate": 0.000246289024643254, + "loss": 2.9583, + "step": 34115 + }, + { + "epoch": 1.67, + "grad_norm": 0.641639232635498, + "learning_rate": 0.0002462738772888059, + "loss": 3.1712, + "step": 34116 + }, + { + "epoch": 1.67, + "grad_norm": 0.5973453521728516, + "learning_rate": 0.00024625873007586203, + "loss": 3.1632, + "step": 34117 + }, + { + "epoch": 1.67, + "grad_norm": 0.6583412289619446, + "learning_rate": 0.0002462435830044624, + "loss": 2.9733, + "step": 34118 + }, + { + "epoch": 1.67, + "grad_norm": 0.6712580323219299, + "learning_rate": 0.0002462284360746467, + "loss": 3.1107, + "step": 34119 + }, + { + "epoch": 1.67, + "grad_norm": 0.6605498194694519, + "learning_rate": 0.00024621328928645504, + "loss": 3.0804, + "step": 34120 + }, + { + "epoch": 1.67, + "grad_norm": 0.6001577377319336, + "learning_rate": 0.0002461981426399272, + "loss": 2.8523, + "step": 34121 + }, + { + "epoch": 1.67, + "grad_norm": 0.6218976974487305, + "learning_rate": 0.000246182996135103, + "loss": 3.1601, + "step": 34122 + }, + { + "epoch": 1.67, + "grad_norm": 0.6491854190826416, + "learning_rate": 0.0002461678497720226, + "loss": 2.9465, + "step": 34123 + }, + { + "epoch": 1.67, + "grad_norm": 0.6104328036308289, + "learning_rate": 0.0002461527035507256, + "loss": 3.0921, + "step": 34124 + }, + { + "epoch": 1.67, + "grad_norm": 0.622700035572052, + "learning_rate": 0.00024613755747125216, + "loss": 2.9134, + "step": 34125 + }, + { + "epoch": 1.67, + "grad_norm": 0.6061636805534363, + "learning_rate": 0.00024612241153364184, + "loss": 3.047, + "step": 34126 + }, + { + "epoch": 1.67, + "grad_norm": 0.6331983804702759, + "learning_rate": 0.0002461072657379348, + "loss": 2.7637, + "step": 34127 + }, + { + "epoch": 1.67, + "grad_norm": 0.6341940760612488, + "learning_rate": 0.00024609212008417096, + "loss": 3.0107, + "step": 34128 + }, + { + "epoch": 1.67, + "grad_norm": 0.6093514561653137, + "learning_rate": 0.00024607697457238993, + "loss": 2.9378, + "step": 34129 + }, + { + "epoch": 1.67, + "grad_norm": 0.6078907251358032, + "learning_rate": 0.0002460618292026319, + "loss": 2.928, + "step": 34130 + }, + { + "epoch": 1.67, + "grad_norm": 0.6402571201324463, + "learning_rate": 0.00024604668397493657, + "loss": 3.0684, + "step": 34131 + }, + { + "epoch": 1.67, + "grad_norm": 0.6203433871269226, + "learning_rate": 0.0002460315388893438, + "loss": 2.8733, + "step": 34132 + }, + { + "epoch": 1.67, + "grad_norm": 0.6072234511375427, + "learning_rate": 0.0002460163939458937, + "loss": 3.0711, + "step": 34133 + }, + { + "epoch": 1.67, + "grad_norm": 0.6317029595375061, + "learning_rate": 0.000246001249144626, + "loss": 2.9225, + "step": 34134 + }, + { + "epoch": 1.67, + "grad_norm": 0.6035718321800232, + "learning_rate": 0.0002459861044855807, + "loss": 3.0206, + "step": 34135 + }, + { + "epoch": 1.67, + "grad_norm": 0.6024402976036072, + "learning_rate": 0.0002459709599687974, + "loss": 2.8322, + "step": 34136 + }, + { + "epoch": 1.67, + "grad_norm": 0.7386209964752197, + "learning_rate": 0.00024595581559431627, + "loss": 3.1102, + "step": 34137 + }, + { + "epoch": 1.67, + "grad_norm": 0.646673321723938, + "learning_rate": 0.00024594067136217716, + "loss": 3.1028, + "step": 34138 + }, + { + "epoch": 1.67, + "grad_norm": 0.6153397560119629, + "learning_rate": 0.00024592552727242, + "loss": 3.1711, + "step": 34139 + }, + { + "epoch": 1.67, + "grad_norm": 0.6744325160980225, + "learning_rate": 0.0002459103833250845, + "loss": 2.8508, + "step": 34140 + }, + { + "epoch": 1.67, + "grad_norm": 0.5932391881942749, + "learning_rate": 0.0002458952395202106, + "loss": 2.9682, + "step": 34141 + }, + { + "epoch": 1.67, + "grad_norm": 0.570640504360199, + "learning_rate": 0.0002458800958578383, + "loss": 3.0059, + "step": 34142 + }, + { + "epoch": 1.67, + "grad_norm": 0.6285528540611267, + "learning_rate": 0.00024586495233800743, + "loss": 3.0278, + "step": 34143 + }, + { + "epoch": 1.67, + "grad_norm": 0.6127492189407349, + "learning_rate": 0.0002458498089607577, + "loss": 3.117, + "step": 34144 + }, + { + "epoch": 1.67, + "grad_norm": 0.6140125393867493, + "learning_rate": 0.00024583466572612934, + "loss": 3.2226, + "step": 34145 + }, + { + "epoch": 1.67, + "grad_norm": 0.6389678716659546, + "learning_rate": 0.000245819522634162, + "loss": 3.1822, + "step": 34146 + }, + { + "epoch": 1.67, + "grad_norm": 0.6139385104179382, + "learning_rate": 0.0002458043796848955, + "loss": 2.8201, + "step": 34147 + }, + { + "epoch": 1.67, + "grad_norm": 0.6733065843582153, + "learning_rate": 0.00024578923687837, + "loss": 3.0222, + "step": 34148 + }, + { + "epoch": 1.67, + "grad_norm": 0.6101844906806946, + "learning_rate": 0.0002457740942146251, + "loss": 2.8743, + "step": 34149 + }, + { + "epoch": 1.67, + "grad_norm": 0.5869734883308411, + "learning_rate": 0.0002457589516937009, + "loss": 3.0927, + "step": 34150 + }, + { + "epoch": 1.67, + "grad_norm": 0.6247618198394775, + "learning_rate": 0.00024574380931563706, + "loss": 3.2443, + "step": 34151 + }, + { + "epoch": 1.67, + "grad_norm": 0.6441221833229065, + "learning_rate": 0.00024572866708047367, + "loss": 2.9533, + "step": 34152 + }, + { + "epoch": 1.67, + "grad_norm": 0.6270751357078552, + "learning_rate": 0.0002457135249882505, + "loss": 3.0383, + "step": 34153 + }, + { + "epoch": 1.67, + "grad_norm": 0.6131313443183899, + "learning_rate": 0.0002456983830390074, + "loss": 3.2256, + "step": 34154 + }, + { + "epoch": 1.67, + "grad_norm": 0.6170026063919067, + "learning_rate": 0.0002456832412327844, + "loss": 2.9657, + "step": 34155 + }, + { + "epoch": 1.67, + "grad_norm": 0.6400575637817383, + "learning_rate": 0.00024566809956962126, + "loss": 3.0925, + "step": 34156 + }, + { + "epoch": 1.67, + "grad_norm": 0.618373453617096, + "learning_rate": 0.00024565295804955783, + "loss": 2.9742, + "step": 34157 + }, + { + "epoch": 1.67, + "grad_norm": 0.67595374584198, + "learning_rate": 0.0002456378166726342, + "loss": 3.2046, + "step": 34158 + }, + { + "epoch": 1.67, + "grad_norm": 0.6886033415794373, + "learning_rate": 0.0002456226754388899, + "loss": 2.941, + "step": 34159 + }, + { + "epoch": 1.67, + "grad_norm": 0.62932288646698, + "learning_rate": 0.00024560753434836515, + "loss": 2.9249, + "step": 34160 + }, + { + "epoch": 1.67, + "grad_norm": 0.6120386123657227, + "learning_rate": 0.00024559239340109966, + "loss": 3.2161, + "step": 34161 + }, + { + "epoch": 1.67, + "grad_norm": 0.6127865314483643, + "learning_rate": 0.00024557725259713323, + "loss": 2.8264, + "step": 34162 + }, + { + "epoch": 1.67, + "grad_norm": 0.6054245829582214, + "learning_rate": 0.00024556211193650597, + "loss": 3.1627, + "step": 34163 + }, + { + "epoch": 1.67, + "grad_norm": 0.6147023439407349, + "learning_rate": 0.0002455469714192576, + "loss": 3.0502, + "step": 34164 + }, + { + "epoch": 1.67, + "grad_norm": 0.6629253029823303, + "learning_rate": 0.0002455318310454281, + "loss": 2.9995, + "step": 34165 + }, + { + "epoch": 1.67, + "grad_norm": 0.6122319102287292, + "learning_rate": 0.0002455166908150571, + "loss": 3.0473, + "step": 34166 + }, + { + "epoch": 1.67, + "grad_norm": 0.6370877027511597, + "learning_rate": 0.00024550155072818476, + "loss": 2.8365, + "step": 34167 + }, + { + "epoch": 1.67, + "grad_norm": 0.6131877899169922, + "learning_rate": 0.0002454864107848509, + "loss": 2.9348, + "step": 34168 + }, + { + "epoch": 1.67, + "grad_norm": 0.6444746255874634, + "learning_rate": 0.0002454712709850952, + "loss": 3.127, + "step": 34169 + }, + { + "epoch": 1.67, + "grad_norm": 0.6460678577423096, + "learning_rate": 0.0002454561313289578, + "loss": 2.8952, + "step": 34170 + }, + { + "epoch": 1.67, + "grad_norm": 0.6373785734176636, + "learning_rate": 0.00024544099181647843, + "loss": 3.0078, + "step": 34171 + }, + { + "epoch": 1.67, + "grad_norm": 0.6953406929969788, + "learning_rate": 0.00024542585244769687, + "loss": 3.0514, + "step": 34172 + }, + { + "epoch": 1.67, + "grad_norm": 0.6235325336456299, + "learning_rate": 0.00024541071322265324, + "loss": 3.0022, + "step": 34173 + }, + { + "epoch": 1.67, + "grad_norm": 0.6241675019264221, + "learning_rate": 0.00024539557414138724, + "loss": 3.1864, + "step": 34174 + }, + { + "epoch": 1.67, + "grad_norm": 0.6402691602706909, + "learning_rate": 0.00024538043520393887, + "loss": 2.88, + "step": 34175 + }, + { + "epoch": 1.67, + "grad_norm": 0.6314650774002075, + "learning_rate": 0.0002453652964103478, + "loss": 3.0372, + "step": 34176 + }, + { + "epoch": 1.67, + "grad_norm": 0.6837327480316162, + "learning_rate": 0.000245350157760654, + "loss": 2.8986, + "step": 34177 + }, + { + "epoch": 1.67, + "grad_norm": 0.6558158993721008, + "learning_rate": 0.0002453350192548975, + "loss": 3.008, + "step": 34178 + }, + { + "epoch": 1.68, + "grad_norm": 0.6023430228233337, + "learning_rate": 0.00024531988089311797, + "loss": 2.9874, + "step": 34179 + }, + { + "epoch": 1.68, + "grad_norm": 0.6416671276092529, + "learning_rate": 0.00024530474267535547, + "loss": 2.7857, + "step": 34180 + }, + { + "epoch": 1.68, + "grad_norm": 0.6319637298583984, + "learning_rate": 0.00024528960460164955, + "loss": 2.914, + "step": 34181 + }, + { + "epoch": 1.68, + "grad_norm": 0.6269749999046326, + "learning_rate": 0.0002452744666720404, + "loss": 3.0919, + "step": 34182 + }, + { + "epoch": 1.68, + "grad_norm": 0.6353631615638733, + "learning_rate": 0.0002452593288865678, + "loss": 2.9123, + "step": 34183 + }, + { + "epoch": 1.68, + "grad_norm": 0.6360670924186707, + "learning_rate": 0.00024524419124527145, + "loss": 2.8882, + "step": 34184 + }, + { + "epoch": 1.68, + "grad_norm": 0.6149998307228088, + "learning_rate": 0.0002452290537481915, + "loss": 3.2487, + "step": 34185 + }, + { + "epoch": 1.68, + "grad_norm": 0.5942299365997314, + "learning_rate": 0.0002452139163953676, + "loss": 3.0568, + "step": 34186 + }, + { + "epoch": 1.68, + "grad_norm": 0.5962042808532715, + "learning_rate": 0.00024519877918683974, + "loss": 3.0002, + "step": 34187 + }, + { + "epoch": 1.68, + "grad_norm": 0.6600490212440491, + "learning_rate": 0.00024518364212264775, + "loss": 3.2226, + "step": 34188 + }, + { + "epoch": 1.68, + "grad_norm": 0.6743106842041016, + "learning_rate": 0.00024516850520283154, + "loss": 3.1296, + "step": 34189 + }, + { + "epoch": 1.68, + "grad_norm": 0.6211302876472473, + "learning_rate": 0.0002451533684274309, + "loss": 3.1217, + "step": 34190 + }, + { + "epoch": 1.68, + "grad_norm": 0.7367480397224426, + "learning_rate": 0.00024513823179648565, + "loss": 2.9123, + "step": 34191 + }, + { + "epoch": 1.68, + "grad_norm": 0.6232231855392456, + "learning_rate": 0.00024512309531003583, + "loss": 3.0276, + "step": 34192 + }, + { + "epoch": 1.68, + "grad_norm": 0.6443117260932922, + "learning_rate": 0.0002451079589681212, + "loss": 3.2087, + "step": 34193 + }, + { + "epoch": 1.68, + "grad_norm": 0.6081936955451965, + "learning_rate": 0.0002450928227707816, + "loss": 2.993, + "step": 34194 + }, + { + "epoch": 1.68, + "grad_norm": 0.600371241569519, + "learning_rate": 0.00024507768671805705, + "loss": 3.051, + "step": 34195 + }, + { + "epoch": 1.68, + "grad_norm": 0.6255161762237549, + "learning_rate": 0.0002450625508099872, + "loss": 3.0649, + "step": 34196 + }, + { + "epoch": 1.68, + "grad_norm": 0.632082462310791, + "learning_rate": 0.00024504741504661207, + "loss": 2.9378, + "step": 34197 + }, + { + "epoch": 1.68, + "grad_norm": 0.5944221615791321, + "learning_rate": 0.00024503227942797144, + "loss": 3.1465, + "step": 34198 + }, + { + "epoch": 1.68, + "grad_norm": 0.5801829695701599, + "learning_rate": 0.00024501714395410514, + "loss": 2.8567, + "step": 34199 + }, + { + "epoch": 1.68, + "grad_norm": 0.5647958517074585, + "learning_rate": 0.00024500200862505317, + "loss": 3.0342, + "step": 34200 + }, + { + "epoch": 1.68, + "grad_norm": 0.6428757309913635, + "learning_rate": 0.0002449868734408554, + "loss": 3.0151, + "step": 34201 + }, + { + "epoch": 1.68, + "grad_norm": 0.5983733534812927, + "learning_rate": 0.00024497173840155157, + "loss": 2.9189, + "step": 34202 + }, + { + "epoch": 1.68, + "grad_norm": 0.6872225999832153, + "learning_rate": 0.0002449566035071815, + "loss": 2.9386, + "step": 34203 + }, + { + "epoch": 1.68, + "grad_norm": 0.6514347195625305, + "learning_rate": 0.00024494146875778523, + "loss": 2.9179, + "step": 34204 + }, + { + "epoch": 1.68, + "grad_norm": 0.6127699613571167, + "learning_rate": 0.00024492633415340256, + "loss": 2.8151, + "step": 34205 + }, + { + "epoch": 1.68, + "grad_norm": 0.6513962149620056, + "learning_rate": 0.0002449111996940732, + "loss": 2.9692, + "step": 34206 + }, + { + "epoch": 1.68, + "grad_norm": 0.6389971971511841, + "learning_rate": 0.00024489606537983727, + "loss": 2.9922, + "step": 34207 + }, + { + "epoch": 1.68, + "grad_norm": 0.6056563854217529, + "learning_rate": 0.00024488093121073445, + "loss": 2.9475, + "step": 34208 + }, + { + "epoch": 1.68, + "grad_norm": 0.6097168326377869, + "learning_rate": 0.0002448657971868046, + "loss": 2.9191, + "step": 34209 + }, + { + "epoch": 1.68, + "grad_norm": 0.640527069568634, + "learning_rate": 0.0002448506633080877, + "loss": 3.0529, + "step": 34210 + }, + { + "epoch": 1.68, + "grad_norm": 0.6365799307823181, + "learning_rate": 0.00024483552957462357, + "loss": 2.9739, + "step": 34211 + }, + { + "epoch": 1.68, + "grad_norm": 0.6156507730484009, + "learning_rate": 0.00024482039598645205, + "loss": 2.9809, + "step": 34212 + }, + { + "epoch": 1.68, + "grad_norm": 0.6057137846946716, + "learning_rate": 0.00024480526254361284, + "loss": 3.0669, + "step": 34213 + }, + { + "epoch": 1.68, + "grad_norm": 0.6164252758026123, + "learning_rate": 0.000244790129246146, + "loss": 3.0025, + "step": 34214 + }, + { + "epoch": 1.68, + "grad_norm": 0.6119879484176636, + "learning_rate": 0.00024477499609409144, + "loss": 3.0289, + "step": 34215 + }, + { + "epoch": 1.68, + "grad_norm": 0.6054874062538147, + "learning_rate": 0.00024475986308748873, + "loss": 3.1761, + "step": 34216 + }, + { + "epoch": 1.68, + "grad_norm": 0.6626514196395874, + "learning_rate": 0.0002447447302263781, + "loss": 3.0249, + "step": 34217 + }, + { + "epoch": 1.68, + "grad_norm": 0.6329224109649658, + "learning_rate": 0.0002447295975107991, + "loss": 2.9464, + "step": 34218 + }, + { + "epoch": 1.68, + "grad_norm": 0.6414870619773865, + "learning_rate": 0.0002447144649407917, + "loss": 3.1747, + "step": 34219 + }, + { + "epoch": 1.68, + "grad_norm": 0.6235871315002441, + "learning_rate": 0.00024469933251639584, + "loss": 3.0888, + "step": 34220 + }, + { + "epoch": 1.68, + "grad_norm": 0.6273425221443176, + "learning_rate": 0.00024468420023765117, + "loss": 3.0888, + "step": 34221 + }, + { + "epoch": 1.68, + "grad_norm": 0.6260068416595459, + "learning_rate": 0.00024466906810459785, + "loss": 3.174, + "step": 34222 + }, + { + "epoch": 1.68, + "grad_norm": 0.721221923828125, + "learning_rate": 0.00024465393611727544, + "loss": 2.8587, + "step": 34223 + }, + { + "epoch": 1.68, + "grad_norm": 0.5994982719421387, + "learning_rate": 0.0002446388042757238, + "loss": 2.8307, + "step": 34224 + }, + { + "epoch": 1.68, + "grad_norm": 0.5987682342529297, + "learning_rate": 0.0002446236725799831, + "loss": 3.0374, + "step": 34225 + }, + { + "epoch": 1.68, + "grad_norm": 0.6004258990287781, + "learning_rate": 0.0002446085410300929, + "loss": 2.8079, + "step": 34226 + }, + { + "epoch": 1.68, + "grad_norm": 0.5990030169487, + "learning_rate": 0.0002445934096260932, + "loss": 3.0825, + "step": 34227 + }, + { + "epoch": 1.68, + "grad_norm": 0.7027409076690674, + "learning_rate": 0.0002445782783680237, + "loss": 2.9353, + "step": 34228 + }, + { + "epoch": 1.68, + "grad_norm": 0.6076624989509583, + "learning_rate": 0.0002445631472559244, + "loss": 3.0427, + "step": 34229 + }, + { + "epoch": 1.68, + "grad_norm": 0.6400550603866577, + "learning_rate": 0.0002445480162898352, + "loss": 3.0594, + "step": 34230 + }, + { + "epoch": 1.68, + "grad_norm": 0.5895517468452454, + "learning_rate": 0.0002445328854697957, + "loss": 3.1673, + "step": 34231 + }, + { + "epoch": 1.68, + "grad_norm": 0.6176828742027283, + "learning_rate": 0.000244517754795846, + "loss": 3.1844, + "step": 34232 + }, + { + "epoch": 1.68, + "grad_norm": 0.6322025060653687, + "learning_rate": 0.0002445026242680258, + "loss": 3.1415, + "step": 34233 + }, + { + "epoch": 1.68, + "grad_norm": 0.6357572078704834, + "learning_rate": 0.00024448749388637493, + "loss": 2.9114, + "step": 34234 + }, + { + "epoch": 1.68, + "grad_norm": 0.6500570178031921, + "learning_rate": 0.0002444723636509335, + "loss": 2.9372, + "step": 34235 + }, + { + "epoch": 1.68, + "grad_norm": 0.6248914003372192, + "learning_rate": 0.0002444572335617411, + "loss": 2.9285, + "step": 34236 + }, + { + "epoch": 1.68, + "grad_norm": 0.5908624529838562, + "learning_rate": 0.0002444421036188377, + "loss": 3.1703, + "step": 34237 + }, + { + "epoch": 1.68, + "grad_norm": 0.6131687760353088, + "learning_rate": 0.00024442697382226296, + "loss": 2.8722, + "step": 34238 + }, + { + "epoch": 1.68, + "grad_norm": 0.6797118186950684, + "learning_rate": 0.00024441184417205684, + "loss": 3.0294, + "step": 34239 + }, + { + "epoch": 1.68, + "grad_norm": 0.6466017365455627, + "learning_rate": 0.00024439671466825944, + "loss": 2.9447, + "step": 34240 + }, + { + "epoch": 1.68, + "grad_norm": 0.6415280699729919, + "learning_rate": 0.0002443815853109103, + "loss": 3.1305, + "step": 34241 + }, + { + "epoch": 1.68, + "grad_norm": 0.613725483417511, + "learning_rate": 0.0002443664561000494, + "loss": 2.773, + "step": 34242 + }, + { + "epoch": 1.68, + "grad_norm": 0.6166124939918518, + "learning_rate": 0.00024435132703571645, + "loss": 2.8056, + "step": 34243 + }, + { + "epoch": 1.68, + "grad_norm": 0.6828124523162842, + "learning_rate": 0.00024433619811795146, + "loss": 2.9771, + "step": 34244 + }, + { + "epoch": 1.68, + "grad_norm": 0.6250672936439514, + "learning_rate": 0.00024432106934679424, + "loss": 3.0102, + "step": 34245 + }, + { + "epoch": 1.68, + "grad_norm": 0.6510005593299866, + "learning_rate": 0.0002443059407222845, + "loss": 2.931, + "step": 34246 + }, + { + "epoch": 1.68, + "grad_norm": 0.6244876980781555, + "learning_rate": 0.0002442908122444623, + "loss": 2.9292, + "step": 34247 + }, + { + "epoch": 1.68, + "grad_norm": 0.631175696849823, + "learning_rate": 0.0002442756839133674, + "loss": 3.1205, + "step": 34248 + }, + { + "epoch": 1.68, + "grad_norm": 0.6444908380508423, + "learning_rate": 0.0002442605557290395, + "loss": 3.013, + "step": 34249 + }, + { + "epoch": 1.68, + "grad_norm": 0.6660930514335632, + "learning_rate": 0.0002442454276915187, + "loss": 3.1253, + "step": 34250 + }, + { + "epoch": 1.68, + "grad_norm": 0.6830661296844482, + "learning_rate": 0.00024423029980084464, + "loss": 3.1829, + "step": 34251 + }, + { + "epoch": 1.68, + "grad_norm": 0.6448426246643066, + "learning_rate": 0.00024421517205705734, + "loss": 3.0845, + "step": 34252 + }, + { + "epoch": 1.68, + "grad_norm": 0.6062400341033936, + "learning_rate": 0.00024420004446019635, + "loss": 3.12, + "step": 34253 + }, + { + "epoch": 1.68, + "grad_norm": 0.5658940672874451, + "learning_rate": 0.00024418491701030183, + "loss": 2.9393, + "step": 34254 + }, + { + "epoch": 1.68, + "grad_norm": 0.6128112077713013, + "learning_rate": 0.00024416978970741353, + "loss": 2.9671, + "step": 34255 + }, + { + "epoch": 1.68, + "grad_norm": 0.5911213159561157, + "learning_rate": 0.0002441546625515711, + "loss": 3.0101, + "step": 34256 + }, + { + "epoch": 1.68, + "grad_norm": 0.6012565493583679, + "learning_rate": 0.00024413953554281473, + "loss": 2.9874, + "step": 34257 + }, + { + "epoch": 1.68, + "grad_norm": 0.6286384463310242, + "learning_rate": 0.00024412440868118397, + "loss": 2.9793, + "step": 34258 + }, + { + "epoch": 1.68, + "grad_norm": 0.6148403286933899, + "learning_rate": 0.0002441092819667187, + "loss": 2.9738, + "step": 34259 + }, + { + "epoch": 1.68, + "grad_norm": 0.616051435470581, + "learning_rate": 0.00024409415539945896, + "loss": 3.0531, + "step": 34260 + }, + { + "epoch": 1.68, + "grad_norm": 0.5852324962615967, + "learning_rate": 0.00024407902897944435, + "loss": 3.0542, + "step": 34261 + }, + { + "epoch": 1.68, + "grad_norm": 0.6138484477996826, + "learning_rate": 0.00024406390270671492, + "loss": 3.1578, + "step": 34262 + }, + { + "epoch": 1.68, + "grad_norm": 0.6208502054214478, + "learning_rate": 0.00024404877658131032, + "loss": 3.067, + "step": 34263 + }, + { + "epoch": 1.68, + "grad_norm": 0.6173650026321411, + "learning_rate": 0.00024403365060327045, + "loss": 2.8049, + "step": 34264 + }, + { + "epoch": 1.68, + "grad_norm": 0.6221855878829956, + "learning_rate": 0.0002440185247726353, + "loss": 3.171, + "step": 34265 + }, + { + "epoch": 1.68, + "grad_norm": 0.6358213424682617, + "learning_rate": 0.0002440033990894445, + "loss": 3.2996, + "step": 34266 + }, + { + "epoch": 1.68, + "grad_norm": 0.5805011987686157, + "learning_rate": 0.00024398827355373803, + "loss": 3.1994, + "step": 34267 + }, + { + "epoch": 1.68, + "grad_norm": 0.6460946798324585, + "learning_rate": 0.00024397314816555557, + "loss": 2.9848, + "step": 34268 + }, + { + "epoch": 1.68, + "grad_norm": 0.6628650426864624, + "learning_rate": 0.00024395802292493708, + "loss": 2.9467, + "step": 34269 + }, + { + "epoch": 1.68, + "grad_norm": 0.6098015308380127, + "learning_rate": 0.00024394289783192245, + "loss": 3.0061, + "step": 34270 + }, + { + "epoch": 1.68, + "grad_norm": 0.6058465242385864, + "learning_rate": 0.00024392777288655132, + "loss": 2.85, + "step": 34271 + }, + { + "epoch": 1.68, + "grad_norm": 0.6222949624061584, + "learning_rate": 0.0002439126480888638, + "loss": 3.2935, + "step": 34272 + }, + { + "epoch": 1.68, + "grad_norm": 0.6029030084609985, + "learning_rate": 0.00024389752343889943, + "loss": 3.1337, + "step": 34273 + }, + { + "epoch": 1.68, + "grad_norm": 0.5769012570381165, + "learning_rate": 0.00024388239893669833, + "loss": 2.9523, + "step": 34274 + }, + { + "epoch": 1.68, + "grad_norm": 0.6386813521385193, + "learning_rate": 0.0002438672745823, + "loss": 3.0349, + "step": 34275 + }, + { + "epoch": 1.68, + "grad_norm": 0.6211322546005249, + "learning_rate": 0.00024385215037574457, + "loss": 3.046, + "step": 34276 + }, + { + "epoch": 1.68, + "grad_norm": 0.6359053254127502, + "learning_rate": 0.00024383702631707182, + "loss": 2.9339, + "step": 34277 + }, + { + "epoch": 1.68, + "grad_norm": 0.6204125285148621, + "learning_rate": 0.0002438219024063214, + "loss": 3.1282, + "step": 34278 + }, + { + "epoch": 1.68, + "grad_norm": 0.6863940358161926, + "learning_rate": 0.00024380677864353344, + "loss": 3.1766, + "step": 34279 + }, + { + "epoch": 1.68, + "grad_norm": 0.6086069345474243, + "learning_rate": 0.00024379165502874743, + "loss": 2.8825, + "step": 34280 + }, + { + "epoch": 1.68, + "grad_norm": 0.6797551512718201, + "learning_rate": 0.00024377653156200347, + "loss": 2.948, + "step": 34281 + }, + { + "epoch": 1.68, + "grad_norm": 0.6163539290428162, + "learning_rate": 0.0002437614082433414, + "loss": 3.045, + "step": 34282 + }, + { + "epoch": 1.68, + "grad_norm": 0.6408080458641052, + "learning_rate": 0.0002437462850728008, + "loss": 3.0271, + "step": 34283 + }, + { + "epoch": 1.68, + "grad_norm": 0.6323572993278503, + "learning_rate": 0.00024373116205042176, + "loss": 2.9313, + "step": 34284 + }, + { + "epoch": 1.68, + "grad_norm": 0.611010730266571, + "learning_rate": 0.00024371603917624398, + "loss": 3.0656, + "step": 34285 + }, + { + "epoch": 1.68, + "grad_norm": 0.615911602973938, + "learning_rate": 0.00024370091645030726, + "loss": 3.2463, + "step": 34286 + }, + { + "epoch": 1.68, + "grad_norm": 0.6529790163040161, + "learning_rate": 0.0002436857938726516, + "loss": 2.9683, + "step": 34287 + }, + { + "epoch": 1.68, + "grad_norm": 0.64628005027771, + "learning_rate": 0.00024367067144331667, + "loss": 2.9319, + "step": 34288 + }, + { + "epoch": 1.68, + "grad_norm": 0.5937123894691467, + "learning_rate": 0.00024365554916234243, + "loss": 3.0921, + "step": 34289 + }, + { + "epoch": 1.68, + "grad_norm": 0.6053999066352844, + "learning_rate": 0.00024364042702976847, + "loss": 2.9979, + "step": 34290 + }, + { + "epoch": 1.68, + "grad_norm": 0.635191023349762, + "learning_rate": 0.00024362530504563486, + "loss": 2.9769, + "step": 34291 + }, + { + "epoch": 1.68, + "grad_norm": 0.6321050524711609, + "learning_rate": 0.00024361018320998137, + "loss": 2.9325, + "step": 34292 + }, + { + "epoch": 1.68, + "grad_norm": 0.6223572492599487, + "learning_rate": 0.00024359506152284776, + "loss": 3.0409, + "step": 34293 + }, + { + "epoch": 1.68, + "grad_norm": 0.6281905770301819, + "learning_rate": 0.00024357993998427396, + "loss": 3.1466, + "step": 34294 + }, + { + "epoch": 1.68, + "grad_norm": 0.5953301191329956, + "learning_rate": 0.0002435648185942997, + "loss": 3.0765, + "step": 34295 + }, + { + "epoch": 1.68, + "grad_norm": 0.6165432333946228, + "learning_rate": 0.0002435496973529648, + "loss": 3.0896, + "step": 34296 + }, + { + "epoch": 1.68, + "grad_norm": 0.5897827744483948, + "learning_rate": 0.00024353457626030922, + "loss": 2.9357, + "step": 34297 + }, + { + "epoch": 1.68, + "grad_norm": 0.6467866897583008, + "learning_rate": 0.00024351945531637267, + "loss": 2.8904, + "step": 34298 + }, + { + "epoch": 1.68, + "grad_norm": 0.6166879534721375, + "learning_rate": 0.00024350433452119505, + "loss": 3.0529, + "step": 34299 + }, + { + "epoch": 1.68, + "grad_norm": 0.6443753242492676, + "learning_rate": 0.00024348921387481602, + "loss": 2.7317, + "step": 34300 + }, + { + "epoch": 1.68, + "grad_norm": 0.755827009677887, + "learning_rate": 0.00024347409337727554, + "loss": 3.1424, + "step": 34301 + }, + { + "epoch": 1.68, + "grad_norm": 0.5986933708190918, + "learning_rate": 0.00024345897302861355, + "loss": 3.1745, + "step": 34302 + }, + { + "epoch": 1.68, + "grad_norm": 0.6184709668159485, + "learning_rate": 0.00024344385282886962, + "loss": 3.0542, + "step": 34303 + }, + { + "epoch": 1.68, + "grad_norm": 0.9942894577980042, + "learning_rate": 0.0002434287327780838, + "loss": 3.0583, + "step": 34304 + }, + { + "epoch": 1.68, + "grad_norm": 0.6022435426712036, + "learning_rate": 0.00024341361287629572, + "loss": 3.154, + "step": 34305 + }, + { + "epoch": 1.68, + "grad_norm": 0.6629198789596558, + "learning_rate": 0.0002433984931235453, + "loss": 3.0552, + "step": 34306 + }, + { + "epoch": 1.68, + "grad_norm": 0.5723194479942322, + "learning_rate": 0.00024338337351987246, + "loss": 3.1065, + "step": 34307 + }, + { + "epoch": 1.68, + "grad_norm": 0.6211422085762024, + "learning_rate": 0.00024336825406531676, + "loss": 3.1316, + "step": 34308 + }, + { + "epoch": 1.68, + "grad_norm": 0.6164812445640564, + "learning_rate": 0.00024335313475991834, + "loss": 2.9869, + "step": 34309 + }, + { + "epoch": 1.68, + "grad_norm": 1.0711008310317993, + "learning_rate": 0.00024333801560371676, + "loss": 2.9982, + "step": 34310 + }, + { + "epoch": 1.68, + "grad_norm": 0.6362112760543823, + "learning_rate": 0.00024332289659675192, + "loss": 3.0657, + "step": 34311 + }, + { + "epoch": 1.68, + "grad_norm": 0.5985422134399414, + "learning_rate": 0.00024330777773906375, + "loss": 3.1279, + "step": 34312 + }, + { + "epoch": 1.68, + "grad_norm": 0.6208683848381042, + "learning_rate": 0.00024329265903069193, + "loss": 2.8673, + "step": 34313 + }, + { + "epoch": 1.68, + "grad_norm": 0.6138306856155396, + "learning_rate": 0.00024327754047167641, + "loss": 3.024, + "step": 34314 + }, + { + "epoch": 1.68, + "grad_norm": 0.5941352844238281, + "learning_rate": 0.00024326242206205678, + "loss": 2.8803, + "step": 34315 + }, + { + "epoch": 1.68, + "grad_norm": 0.6221591830253601, + "learning_rate": 0.0002432473038018731, + "loss": 3.1186, + "step": 34316 + }, + { + "epoch": 1.68, + "grad_norm": 0.6871387958526611, + "learning_rate": 0.00024323218569116515, + "loss": 3.0352, + "step": 34317 + }, + { + "epoch": 1.68, + "grad_norm": 0.5821581482887268, + "learning_rate": 0.0002432170677299726, + "loss": 3.2781, + "step": 34318 + }, + { + "epoch": 1.68, + "grad_norm": 0.6137875914573669, + "learning_rate": 0.00024320194991833544, + "loss": 2.8125, + "step": 34319 + }, + { + "epoch": 1.68, + "grad_norm": 0.6038451194763184, + "learning_rate": 0.0002431868322562934, + "loss": 3.1545, + "step": 34320 + }, + { + "epoch": 1.68, + "grad_norm": 0.6101359128952026, + "learning_rate": 0.00024317171474388618, + "loss": 3.026, + "step": 34321 + }, + { + "epoch": 1.68, + "grad_norm": 0.7233001589775085, + "learning_rate": 0.00024315659738115392, + "loss": 2.9277, + "step": 34322 + }, + { + "epoch": 1.68, + "grad_norm": 0.6210648417472839, + "learning_rate": 0.0002431414801681361, + "loss": 3.0142, + "step": 34323 + }, + { + "epoch": 1.68, + "grad_norm": 0.598692774772644, + "learning_rate": 0.00024312636310487278, + "loss": 2.9345, + "step": 34324 + }, + { + "epoch": 1.68, + "grad_norm": 0.5950759053230286, + "learning_rate": 0.00024311124619140365, + "loss": 3.062, + "step": 34325 + }, + { + "epoch": 1.68, + "grad_norm": 0.5895894765853882, + "learning_rate": 0.00024309612942776846, + "loss": 3.2376, + "step": 34326 + }, + { + "epoch": 1.68, + "grad_norm": 0.638976514339447, + "learning_rate": 0.00024308101281400726, + "loss": 2.917, + "step": 34327 + }, + { + "epoch": 1.68, + "grad_norm": 0.6100997924804688, + "learning_rate": 0.00024306589635015961, + "loss": 3.0652, + "step": 34328 + }, + { + "epoch": 1.68, + "grad_norm": 0.6079884767532349, + "learning_rate": 0.00024305078003626553, + "loss": 3.0837, + "step": 34329 + }, + { + "epoch": 1.68, + "grad_norm": 0.5833554267883301, + "learning_rate": 0.0002430356638723646, + "loss": 3.0955, + "step": 34330 + }, + { + "epoch": 1.68, + "grad_norm": 0.5608810782432556, + "learning_rate": 0.00024302054785849684, + "loss": 3.0168, + "step": 34331 + }, + { + "epoch": 1.68, + "grad_norm": 0.6720162630081177, + "learning_rate": 0.00024300543199470208, + "loss": 3.0969, + "step": 34332 + }, + { + "epoch": 1.68, + "grad_norm": 0.6119675636291504, + "learning_rate": 0.00024299031628101988, + "loss": 3.1501, + "step": 34333 + }, + { + "epoch": 1.68, + "grad_norm": 0.6166000366210938, + "learning_rate": 0.00024297520071749033, + "loss": 3.1214, + "step": 34334 + }, + { + "epoch": 1.68, + "grad_norm": 0.7879331707954407, + "learning_rate": 0.00024296008530415308, + "loss": 3.0078, + "step": 34335 + }, + { + "epoch": 1.68, + "grad_norm": 0.630114734172821, + "learning_rate": 0.0002429449700410479, + "loss": 3.1951, + "step": 34336 + }, + { + "epoch": 1.68, + "grad_norm": 0.6173840761184692, + "learning_rate": 0.00024292985492821488, + "loss": 3.1601, + "step": 34337 + }, + { + "epoch": 1.68, + "grad_norm": 0.6316830515861511, + "learning_rate": 0.0002429147399656935, + "loss": 3.086, + "step": 34338 + }, + { + "epoch": 1.68, + "grad_norm": 0.6645943522453308, + "learning_rate": 0.00024289962515352384, + "loss": 2.9076, + "step": 34339 + }, + { + "epoch": 1.68, + "grad_norm": 0.6618261933326721, + "learning_rate": 0.0002428845104917454, + "loss": 3.1904, + "step": 34340 + }, + { + "epoch": 1.68, + "grad_norm": 0.6228432655334473, + "learning_rate": 0.0002428693959803982, + "loss": 3.0376, + "step": 34341 + }, + { + "epoch": 1.68, + "grad_norm": 0.6474012732505798, + "learning_rate": 0.00024285428161952214, + "loss": 3.1362, + "step": 34342 + }, + { + "epoch": 1.68, + "grad_norm": 0.5851452350616455, + "learning_rate": 0.0002428391674091568, + "loss": 3.0867, + "step": 34343 + }, + { + "epoch": 1.68, + "grad_norm": 0.6338858604431152, + "learning_rate": 0.00024282405334934221, + "loss": 3.013, + "step": 34344 + }, + { + "epoch": 1.68, + "grad_norm": 0.6032165288925171, + "learning_rate": 0.0002428089394401179, + "loss": 3.1117, + "step": 34345 + }, + { + "epoch": 1.68, + "grad_norm": 0.6296061873435974, + "learning_rate": 0.00024279382568152393, + "loss": 2.954, + "step": 34346 + }, + { + "epoch": 1.68, + "grad_norm": 0.5732358694076538, + "learning_rate": 0.00024277871207360003, + "loss": 2.9257, + "step": 34347 + }, + { + "epoch": 1.68, + "grad_norm": 0.6399105787277222, + "learning_rate": 0.0002427635986163859, + "loss": 2.9714, + "step": 34348 + }, + { + "epoch": 1.68, + "grad_norm": 0.6738459467887878, + "learning_rate": 0.00024274848530992155, + "loss": 2.9373, + "step": 34349 + }, + { + "epoch": 1.68, + "grad_norm": 0.6233891844749451, + "learning_rate": 0.0002427333721542466, + "loss": 3.1315, + "step": 34350 + }, + { + "epoch": 1.68, + "grad_norm": 0.6309840083122253, + "learning_rate": 0.00024271825914940102, + "loss": 3.0287, + "step": 34351 + }, + { + "epoch": 1.68, + "grad_norm": 0.6268077492713928, + "learning_rate": 0.00024270314629542435, + "loss": 3.0762, + "step": 34352 + }, + { + "epoch": 1.68, + "grad_norm": 0.5958818793296814, + "learning_rate": 0.00024268803359235667, + "loss": 2.9662, + "step": 34353 + }, + { + "epoch": 1.68, + "grad_norm": 0.6950878500938416, + "learning_rate": 0.0002426729210402377, + "loss": 2.8479, + "step": 34354 + }, + { + "epoch": 1.68, + "grad_norm": 0.6276915073394775, + "learning_rate": 0.00024265780863910716, + "loss": 3.1364, + "step": 34355 + }, + { + "epoch": 1.68, + "grad_norm": 0.647860050201416, + "learning_rate": 0.00024264269638900497, + "loss": 2.5724, + "step": 34356 + }, + { + "epoch": 1.68, + "grad_norm": 0.599606454372406, + "learning_rate": 0.00024262758428997088, + "loss": 2.9408, + "step": 34357 + }, + { + "epoch": 1.68, + "grad_norm": 0.6251691579818726, + "learning_rate": 0.0002426124723420446, + "loss": 3.0815, + "step": 34358 + }, + { + "epoch": 1.68, + "grad_norm": 0.597493052482605, + "learning_rate": 0.00024259736054526614, + "loss": 2.9994, + "step": 34359 + }, + { + "epoch": 1.68, + "grad_norm": 0.6401041150093079, + "learning_rate": 0.00024258224889967512, + "loss": 3.0447, + "step": 34360 + }, + { + "epoch": 1.68, + "grad_norm": 0.6328083276748657, + "learning_rate": 0.00024256713740531148, + "loss": 2.9255, + "step": 34361 + }, + { + "epoch": 1.68, + "grad_norm": 0.6329466700553894, + "learning_rate": 0.00024255202606221485, + "loss": 2.923, + "step": 34362 + }, + { + "epoch": 1.68, + "grad_norm": 0.6171144247055054, + "learning_rate": 0.00024253691487042511, + "loss": 2.8842, + "step": 34363 + }, + { + "epoch": 1.68, + "grad_norm": 0.6171637773513794, + "learning_rate": 0.00024252180382998217, + "loss": 3.1929, + "step": 34364 + }, + { + "epoch": 1.68, + "grad_norm": 0.6588241457939148, + "learning_rate": 0.0002425066929409257, + "loss": 2.9894, + "step": 34365 + }, + { + "epoch": 1.68, + "grad_norm": 0.6128795742988586, + "learning_rate": 0.0002424915822032956, + "loss": 2.8934, + "step": 34366 + }, + { + "epoch": 1.68, + "grad_norm": 0.584743082523346, + "learning_rate": 0.00024247647161713148, + "loss": 3.0518, + "step": 34367 + }, + { + "epoch": 1.68, + "grad_norm": 0.6354223489761353, + "learning_rate": 0.00024246136118247333, + "loss": 2.8951, + "step": 34368 + }, + { + "epoch": 1.68, + "grad_norm": 0.6299620866775513, + "learning_rate": 0.00024244625089936091, + "loss": 2.9908, + "step": 34369 + }, + { + "epoch": 1.68, + "grad_norm": 0.6883636713027954, + "learning_rate": 0.0002424311407678339, + "loss": 2.9329, + "step": 34370 + }, + { + "epoch": 1.68, + "grad_norm": 0.5922103524208069, + "learning_rate": 0.0002424160307879323, + "loss": 3.091, + "step": 34371 + }, + { + "epoch": 1.68, + "grad_norm": 0.6945319175720215, + "learning_rate": 0.0002424009209596957, + "loss": 3.1215, + "step": 34372 + }, + { + "epoch": 1.68, + "grad_norm": 0.6298499703407288, + "learning_rate": 0.00024238581128316396, + "loss": 3.0817, + "step": 34373 + }, + { + "epoch": 1.68, + "grad_norm": 0.632078230381012, + "learning_rate": 0.00024237070175837705, + "loss": 3.1141, + "step": 34374 + }, + { + "epoch": 1.68, + "grad_norm": 0.633488118648529, + "learning_rate": 0.0002423555923853745, + "loss": 2.97, + "step": 34375 + }, + { + "epoch": 1.68, + "grad_norm": 0.6549461483955383, + "learning_rate": 0.00024234048316419633, + "loss": 3.0173, + "step": 34376 + }, + { + "epoch": 1.68, + "grad_norm": 0.6033638715744019, + "learning_rate": 0.0002423253740948821, + "loss": 2.9334, + "step": 34377 + }, + { + "epoch": 1.68, + "grad_norm": 0.6423242688179016, + "learning_rate": 0.00024231026517747178, + "loss": 3.1711, + "step": 34378 + }, + { + "epoch": 1.68, + "grad_norm": 0.586613118648529, + "learning_rate": 0.0002422951564120052, + "loss": 2.9276, + "step": 34379 + }, + { + "epoch": 1.68, + "grad_norm": 0.6340028643608093, + "learning_rate": 0.00024228004779852194, + "loss": 3.0458, + "step": 34380 + }, + { + "epoch": 1.68, + "grad_norm": 0.6134786009788513, + "learning_rate": 0.00024226493933706208, + "loss": 3.0783, + "step": 34381 + }, + { + "epoch": 1.68, + "grad_norm": 0.641250491142273, + "learning_rate": 0.00024224983102766516, + "loss": 3.0496, + "step": 34382 + }, + { + "epoch": 1.69, + "grad_norm": 0.6067681908607483, + "learning_rate": 0.000242234722870371, + "loss": 2.8943, + "step": 34383 + }, + { + "epoch": 1.69, + "grad_norm": 0.5972951650619507, + "learning_rate": 0.00024221961486521962, + "loss": 3.0228, + "step": 34384 + }, + { + "epoch": 1.69, + "grad_norm": 0.6391837000846863, + "learning_rate": 0.0002422045070122505, + "loss": 2.9397, + "step": 34385 + }, + { + "epoch": 1.69, + "grad_norm": 0.5825820565223694, + "learning_rate": 0.00024218939931150375, + "loss": 3.1374, + "step": 34386 + }, + { + "epoch": 1.69, + "grad_norm": 0.6349732875823975, + "learning_rate": 0.00024217429176301889, + "loss": 2.9957, + "step": 34387 + }, + { + "epoch": 1.69, + "grad_norm": 0.6446109414100647, + "learning_rate": 0.00024215918436683577, + "loss": 3.1967, + "step": 34388 + }, + { + "epoch": 1.69, + "grad_norm": 0.636913001537323, + "learning_rate": 0.00024214407712299437, + "loss": 3.1058, + "step": 34389 + }, + { + "epoch": 1.69, + "grad_norm": 0.5909577012062073, + "learning_rate": 0.00024212897003153425, + "loss": 3.0518, + "step": 34390 + }, + { + "epoch": 1.69, + "grad_norm": 0.6335484981536865, + "learning_rate": 0.00024211386309249539, + "loss": 3.0988, + "step": 34391 + }, + { + "epoch": 1.69, + "grad_norm": 0.6210086941719055, + "learning_rate": 0.0002420987563059173, + "loss": 2.9179, + "step": 34392 + }, + { + "epoch": 1.69, + "grad_norm": 0.6080520153045654, + "learning_rate": 0.00024208364967184, + "loss": 3.227, + "step": 34393 + }, + { + "epoch": 1.69, + "grad_norm": 0.6113629341125488, + "learning_rate": 0.00024206854319030333, + "loss": 2.9451, + "step": 34394 + }, + { + "epoch": 1.69, + "grad_norm": 0.6378176808357239, + "learning_rate": 0.00024205343686134682, + "loss": 3.1103, + "step": 34395 + }, + { + "epoch": 1.69, + "grad_norm": 0.6006256341934204, + "learning_rate": 0.00024203833068501052, + "loss": 2.9884, + "step": 34396 + }, + { + "epoch": 1.69, + "grad_norm": 0.6118775010108948, + "learning_rate": 0.00024202322466133406, + "loss": 2.7407, + "step": 34397 + }, + { + "epoch": 1.69, + "grad_norm": 0.6467645168304443, + "learning_rate": 0.00024200811879035718, + "loss": 2.7859, + "step": 34398 + }, + { + "epoch": 1.69, + "grad_norm": 0.6246896386146545, + "learning_rate": 0.00024199301307211988, + "loss": 3.0706, + "step": 34399 + }, + { + "epoch": 1.69, + "grad_norm": 0.6289390325546265, + "learning_rate": 0.00024197790750666177, + "loss": 3.0411, + "step": 34400 + }, + { + "epoch": 1.69, + "grad_norm": 0.6309362649917603, + "learning_rate": 0.00024196280209402276, + "loss": 2.9929, + "step": 34401 + }, + { + "epoch": 1.69, + "grad_norm": 0.6463305950164795, + "learning_rate": 0.0002419476968342424, + "loss": 3.0134, + "step": 34402 + }, + { + "epoch": 1.69, + "grad_norm": 0.6127621531486511, + "learning_rate": 0.00024193259172736067, + "loss": 3.0831, + "step": 34403 + }, + { + "epoch": 1.69, + "grad_norm": 0.6621184945106506, + "learning_rate": 0.00024191748677341744, + "loss": 2.9493, + "step": 34404 + }, + { + "epoch": 1.69, + "grad_norm": 0.6097039580345154, + "learning_rate": 0.0002419023819724523, + "loss": 3.0993, + "step": 34405 + }, + { + "epoch": 1.69, + "grad_norm": 0.660682201385498, + "learning_rate": 0.00024188727732450517, + "loss": 3.2129, + "step": 34406 + }, + { + "epoch": 1.69, + "grad_norm": 0.6187965869903564, + "learning_rate": 0.00024187217282961563, + "loss": 2.8815, + "step": 34407 + }, + { + "epoch": 1.69, + "grad_norm": 0.5884891152381897, + "learning_rate": 0.00024185706848782365, + "loss": 3.0827, + "step": 34408 + }, + { + "epoch": 1.69, + "grad_norm": 0.5681439638137817, + "learning_rate": 0.00024184196429916904, + "loss": 3.1114, + "step": 34409 + }, + { + "epoch": 1.69, + "grad_norm": 0.6478064060211182, + "learning_rate": 0.00024182686026369142, + "loss": 3.0638, + "step": 34410 + }, + { + "epoch": 1.69, + "grad_norm": 0.6609353423118591, + "learning_rate": 0.00024181175638143072, + "loss": 3.0618, + "step": 34411 + }, + { + "epoch": 1.69, + "grad_norm": 0.6224457025527954, + "learning_rate": 0.0002417966526524266, + "loss": 2.9672, + "step": 34412 + }, + { + "epoch": 1.69, + "grad_norm": 0.6122366189956665, + "learning_rate": 0.00024178154907671887, + "loss": 2.8498, + "step": 34413 + }, + { + "epoch": 1.69, + "grad_norm": 0.6315528154373169, + "learning_rate": 0.00024176644565434744, + "loss": 2.8217, + "step": 34414 + }, + { + "epoch": 1.69, + "grad_norm": 0.714515209197998, + "learning_rate": 0.0002417513423853519, + "loss": 2.774, + "step": 34415 + }, + { + "epoch": 1.69, + "grad_norm": 0.6689561009407043, + "learning_rate": 0.00024173623926977223, + "loss": 2.9304, + "step": 34416 + }, + { + "epoch": 1.69, + "grad_norm": 0.6164539456367493, + "learning_rate": 0.00024172113630764792, + "loss": 3.1242, + "step": 34417 + }, + { + "epoch": 1.69, + "grad_norm": 0.6706748604774475, + "learning_rate": 0.000241706033499019, + "loss": 3.3955, + "step": 34418 + }, + { + "epoch": 1.69, + "grad_norm": 0.5987676382064819, + "learning_rate": 0.00024169093084392524, + "loss": 3.03, + "step": 34419 + }, + { + "epoch": 1.69, + "grad_norm": 0.6364895105361938, + "learning_rate": 0.0002416758283424062, + "loss": 3.2174, + "step": 34420 + }, + { + "epoch": 1.69, + "grad_norm": 0.755660891532898, + "learning_rate": 0.00024166072599450192, + "loss": 2.9115, + "step": 34421 + }, + { + "epoch": 1.69, + "grad_norm": 0.6371123194694519, + "learning_rate": 0.00024164562380025203, + "loss": 3.0867, + "step": 34422 + }, + { + "epoch": 1.69, + "grad_norm": 0.6260061264038086, + "learning_rate": 0.00024163052175969627, + "loss": 2.9373, + "step": 34423 + }, + { + "epoch": 1.69, + "grad_norm": 0.601723849773407, + "learning_rate": 0.00024161541987287457, + "loss": 3.0002, + "step": 34424 + }, + { + "epoch": 1.69, + "grad_norm": 0.6575174331665039, + "learning_rate": 0.00024160031813982657, + "loss": 3.0804, + "step": 34425 + }, + { + "epoch": 1.69, + "grad_norm": 0.6266413927078247, + "learning_rate": 0.00024158521656059213, + "loss": 3.0032, + "step": 34426 + }, + { + "epoch": 1.69, + "grad_norm": 0.6671347618103027, + "learning_rate": 0.000241570115135211, + "loss": 2.9415, + "step": 34427 + }, + { + "epoch": 1.69, + "grad_norm": 0.6078870296478271, + "learning_rate": 0.00024155501386372282, + "loss": 2.9793, + "step": 34428 + }, + { + "epoch": 1.69, + "grad_norm": 0.6798886060714722, + "learning_rate": 0.00024153991274616763, + "loss": 2.7276, + "step": 34429 + }, + { + "epoch": 1.69, + "grad_norm": 0.6024205088615417, + "learning_rate": 0.00024152481178258501, + "loss": 3.1548, + "step": 34430 + }, + { + "epoch": 1.69, + "grad_norm": 0.6181690692901611, + "learning_rate": 0.00024150971097301486, + "loss": 2.8255, + "step": 34431 + }, + { + "epoch": 1.69, + "grad_norm": 0.6173415780067444, + "learning_rate": 0.00024149461031749668, + "loss": 3.0516, + "step": 34432 + }, + { + "epoch": 1.69, + "grad_norm": 0.6175852417945862, + "learning_rate": 0.00024147950981607064, + "loss": 2.9044, + "step": 34433 + }, + { + "epoch": 1.69, + "grad_norm": 0.6466128826141357, + "learning_rate": 0.0002414644094687762, + "loss": 2.893, + "step": 34434 + }, + { + "epoch": 1.69, + "grad_norm": 0.626565158367157, + "learning_rate": 0.00024144930927565318, + "loss": 3.1384, + "step": 34435 + }, + { + "epoch": 1.69, + "grad_norm": 0.6032750606536865, + "learning_rate": 0.00024143420923674157, + "loss": 2.773, + "step": 34436 + }, + { + "epoch": 1.69, + "grad_norm": 0.6452335715293884, + "learning_rate": 0.0002414191093520809, + "loss": 2.8955, + "step": 34437 + }, + { + "epoch": 1.69, + "grad_norm": 0.6146382093429565, + "learning_rate": 0.00024140400962171106, + "loss": 2.935, + "step": 34438 + }, + { + "epoch": 1.69, + "grad_norm": 0.5858275294303894, + "learning_rate": 0.00024138891004567167, + "loss": 3.0345, + "step": 34439 + }, + { + "epoch": 1.69, + "grad_norm": 0.6086834669113159, + "learning_rate": 0.00024137381062400268, + "loss": 2.9822, + "step": 34440 + }, + { + "epoch": 1.69, + "grad_norm": 0.6375246047973633, + "learning_rate": 0.00024135871135674388, + "loss": 3.1297, + "step": 34441 + }, + { + "epoch": 1.69, + "grad_norm": 0.6939441561698914, + "learning_rate": 0.00024134361224393479, + "loss": 3.0535, + "step": 34442 + }, + { + "epoch": 1.69, + "grad_norm": 0.6073477864265442, + "learning_rate": 0.00024132851328561544, + "loss": 2.944, + "step": 34443 + }, + { + "epoch": 1.69, + "grad_norm": 0.6158416271209717, + "learning_rate": 0.0002413134144818254, + "loss": 3.151, + "step": 34444 + }, + { + "epoch": 1.69, + "grad_norm": 0.5689690709114075, + "learning_rate": 0.0002412983158326046, + "loss": 2.9117, + "step": 34445 + }, + { + "epoch": 1.69, + "grad_norm": 0.6501169800758362, + "learning_rate": 0.0002412832173379928, + "loss": 3.1277, + "step": 34446 + }, + { + "epoch": 1.69, + "grad_norm": 0.6253321766853333, + "learning_rate": 0.00024126811899802958, + "loss": 3.1249, + "step": 34447 + }, + { + "epoch": 1.69, + "grad_norm": 0.6375982761383057, + "learning_rate": 0.00024125302081275492, + "loss": 3.1011, + "step": 34448 + }, + { + "epoch": 1.69, + "grad_norm": 0.6295775175094604, + "learning_rate": 0.00024123792278220847, + "loss": 3.1153, + "step": 34449 + }, + { + "epoch": 1.69, + "grad_norm": 0.6375122666358948, + "learning_rate": 0.00024122282490642998, + "loss": 2.9863, + "step": 34450 + }, + { + "epoch": 1.69, + "grad_norm": 0.6656721234321594, + "learning_rate": 0.00024120772718545935, + "loss": 3.0008, + "step": 34451 + }, + { + "epoch": 1.69, + "grad_norm": 0.584492564201355, + "learning_rate": 0.00024119262961933618, + "loss": 2.9974, + "step": 34452 + }, + { + "epoch": 1.69, + "grad_norm": 0.5878311395645142, + "learning_rate": 0.0002411775322081004, + "loss": 3.0679, + "step": 34453 + }, + { + "epoch": 1.69, + "grad_norm": 0.6173829436302185, + "learning_rate": 0.00024116243495179152, + "loss": 2.82, + "step": 34454 + }, + { + "epoch": 1.69, + "grad_norm": 0.6128963828086853, + "learning_rate": 0.00024114733785044953, + "loss": 2.8603, + "step": 34455 + }, + { + "epoch": 1.69, + "grad_norm": 0.6082571744918823, + "learning_rate": 0.0002411322409041142, + "loss": 2.9488, + "step": 34456 + }, + { + "epoch": 1.69, + "grad_norm": 0.6463648080825806, + "learning_rate": 0.0002411171441128251, + "loss": 3.0709, + "step": 34457 + }, + { + "epoch": 1.69, + "grad_norm": 0.6087504625320435, + "learning_rate": 0.00024110204747662225, + "loss": 3.1574, + "step": 34458 + }, + { + "epoch": 1.69, + "grad_norm": 0.6248615980148315, + "learning_rate": 0.0002410869509955452, + "loss": 3.0775, + "step": 34459 + }, + { + "epoch": 1.69, + "grad_norm": 0.6408868432044983, + "learning_rate": 0.00024107185466963369, + "loss": 3.1373, + "step": 34460 + }, + { + "epoch": 1.69, + "grad_norm": 0.6176990866661072, + "learning_rate": 0.00024105675849892772, + "loss": 3.0042, + "step": 34461 + }, + { + "epoch": 1.69, + "grad_norm": 0.6030988693237305, + "learning_rate": 0.00024104166248346685, + "loss": 2.9704, + "step": 34462 + }, + { + "epoch": 1.69, + "grad_norm": 0.6126129031181335, + "learning_rate": 0.00024102656662329094, + "loss": 2.9439, + "step": 34463 + }, + { + "epoch": 1.69, + "grad_norm": 0.620477020740509, + "learning_rate": 0.0002410114709184396, + "loss": 3.2136, + "step": 34464 + }, + { + "epoch": 1.69, + "grad_norm": 0.6435030102729797, + "learning_rate": 0.00024099637536895263, + "loss": 2.8834, + "step": 34465 + }, + { + "epoch": 1.69, + "grad_norm": 0.6259873509407043, + "learning_rate": 0.00024098127997487006, + "loss": 3.2122, + "step": 34466 + }, + { + "epoch": 1.69, + "grad_norm": 0.630929708480835, + "learning_rate": 0.00024096618473623134, + "loss": 3.1022, + "step": 34467 + }, + { + "epoch": 1.69, + "grad_norm": 0.6018049120903015, + "learning_rate": 0.0002409510896530764, + "loss": 2.6932, + "step": 34468 + }, + { + "epoch": 1.69, + "grad_norm": 0.6094369292259216, + "learning_rate": 0.0002409359947254448, + "loss": 2.9726, + "step": 34469 + }, + { + "epoch": 1.69, + "grad_norm": 0.6517297625541687, + "learning_rate": 0.0002409208999533765, + "loss": 3.0395, + "step": 34470 + }, + { + "epoch": 1.69, + "grad_norm": 0.6332142949104309, + "learning_rate": 0.0002409058053369112, + "loss": 2.9317, + "step": 34471 + }, + { + "epoch": 1.69, + "grad_norm": 0.6489332914352417, + "learning_rate": 0.00024089071087608852, + "loss": 2.8955, + "step": 34472 + }, + { + "epoch": 1.69, + "grad_norm": 0.6442323327064514, + "learning_rate": 0.0002408756165709485, + "loss": 3.0071, + "step": 34473 + }, + { + "epoch": 1.69, + "grad_norm": 0.6622292995452881, + "learning_rate": 0.0002408605224215306, + "loss": 2.5943, + "step": 34474 + }, + { + "epoch": 1.69, + "grad_norm": 0.6429809927940369, + "learning_rate": 0.00024084542842787467, + "loss": 2.9143, + "step": 34475 + }, + { + "epoch": 1.69, + "grad_norm": 0.658168375492096, + "learning_rate": 0.00024083033459002066, + "loss": 3.0773, + "step": 34476 + }, + { + "epoch": 1.69, + "grad_norm": 0.6099553108215332, + "learning_rate": 0.00024081524090800805, + "loss": 3.1176, + "step": 34477 + }, + { + "epoch": 1.69, + "grad_norm": 0.6159991025924683, + "learning_rate": 0.00024080014738187677, + "loss": 2.9891, + "step": 34478 + }, + { + "epoch": 1.69, + "grad_norm": 0.589513897895813, + "learning_rate": 0.0002407850540116664, + "loss": 2.9638, + "step": 34479 + }, + { + "epoch": 1.69, + "grad_norm": 0.6264899969100952, + "learning_rate": 0.00024076996079741686, + "loss": 2.898, + "step": 34480 + }, + { + "epoch": 1.69, + "grad_norm": 0.7650187611579895, + "learning_rate": 0.00024075486773916793, + "loss": 3.0772, + "step": 34481 + }, + { + "epoch": 1.69, + "grad_norm": 0.6351287364959717, + "learning_rate": 0.0002407397748369591, + "loss": 3.1892, + "step": 34482 + }, + { + "epoch": 1.69, + "grad_norm": 0.6214819550514221, + "learning_rate": 0.00024072468209083045, + "loss": 2.9347, + "step": 34483 + }, + { + "epoch": 1.69, + "grad_norm": 0.6406108736991882, + "learning_rate": 0.00024070958950082153, + "loss": 2.9502, + "step": 34484 + }, + { + "epoch": 1.69, + "grad_norm": 0.6591122150421143, + "learning_rate": 0.00024069449706697207, + "loss": 3.3276, + "step": 34485 + }, + { + "epoch": 1.69, + "grad_norm": 0.6212307214736938, + "learning_rate": 0.000240679404789322, + "loss": 3.2392, + "step": 34486 + }, + { + "epoch": 1.69, + "grad_norm": 0.7033385634422302, + "learning_rate": 0.00024066431266791085, + "loss": 2.9312, + "step": 34487 + }, + { + "epoch": 1.69, + "grad_norm": 0.5959001779556274, + "learning_rate": 0.0002406492207027786, + "loss": 2.7623, + "step": 34488 + }, + { + "epoch": 1.69, + "grad_norm": 0.5990002155303955, + "learning_rate": 0.00024063412889396485, + "loss": 2.971, + "step": 34489 + }, + { + "epoch": 1.69, + "grad_norm": 0.6115400195121765, + "learning_rate": 0.00024061903724150927, + "loss": 2.8386, + "step": 34490 + }, + { + "epoch": 1.69, + "grad_norm": 0.6301840543746948, + "learning_rate": 0.00024060394574545186, + "loss": 2.8966, + "step": 34491 + }, + { + "epoch": 1.69, + "grad_norm": 0.6668326258659363, + "learning_rate": 0.00024058885440583216, + "loss": 2.8776, + "step": 34492 + }, + { + "epoch": 1.69, + "grad_norm": 0.6199162602424622, + "learning_rate": 0.00024057376322269007, + "loss": 3.1312, + "step": 34493 + }, + { + "epoch": 1.69, + "grad_norm": 0.6100919842720032, + "learning_rate": 0.0002405586721960651, + "loss": 3.1269, + "step": 34494 + }, + { + "epoch": 1.69, + "grad_norm": 0.6069178581237793, + "learning_rate": 0.00024054358132599723, + "loss": 2.9928, + "step": 34495 + }, + { + "epoch": 1.69, + "grad_norm": 0.6124541163444519, + "learning_rate": 0.00024052849061252616, + "loss": 3.018, + "step": 34496 + }, + { + "epoch": 1.69, + "grad_norm": 0.5998200178146362, + "learning_rate": 0.0002405134000556915, + "loss": 2.8799, + "step": 34497 + }, + { + "epoch": 1.69, + "grad_norm": 0.6153830885887146, + "learning_rate": 0.00024049830965553322, + "loss": 3.1137, + "step": 34498 + }, + { + "epoch": 1.69, + "grad_norm": 0.6186378002166748, + "learning_rate": 0.00024048321941209087, + "loss": 3.0032, + "step": 34499 + }, + { + "epoch": 1.69, + "grad_norm": 0.625507652759552, + "learning_rate": 0.00024046812932540417, + "loss": 3.2128, + "step": 34500 + }, + { + "epoch": 1.69, + "grad_norm": 0.628273069858551, + "learning_rate": 0.00024045303939551315, + "loss": 3.0745, + "step": 34501 + }, + { + "epoch": 1.69, + "grad_norm": 0.6160310506820679, + "learning_rate": 0.00024043794962245725, + "loss": 3.0487, + "step": 34502 + }, + { + "epoch": 1.69, + "grad_norm": 0.6147609353065491, + "learning_rate": 0.0002404228600062764, + "loss": 3.0751, + "step": 34503 + }, + { + "epoch": 1.69, + "grad_norm": 0.6561556458473206, + "learning_rate": 0.0002404077705470102, + "loss": 3.1311, + "step": 34504 + }, + { + "epoch": 1.69, + "grad_norm": 0.6268799901008606, + "learning_rate": 0.00024039268124469837, + "loss": 2.981, + "step": 34505 + }, + { + "epoch": 1.69, + "grad_norm": 0.6238470077514648, + "learning_rate": 0.00024037759209938092, + "loss": 2.9657, + "step": 34506 + }, + { + "epoch": 1.69, + "grad_norm": 0.5943543314933777, + "learning_rate": 0.0002403625031110974, + "loss": 2.8509, + "step": 34507 + }, + { + "epoch": 1.69, + "grad_norm": 0.6060153841972351, + "learning_rate": 0.00024034741427988757, + "loss": 3.0004, + "step": 34508 + }, + { + "epoch": 1.69, + "grad_norm": 0.6190601587295532, + "learning_rate": 0.00024033232560579106, + "loss": 3.0331, + "step": 34509 + }, + { + "epoch": 1.69, + "grad_norm": 0.6295119524002075, + "learning_rate": 0.00024031723708884786, + "loss": 2.895, + "step": 34510 + }, + { + "epoch": 1.69, + "grad_norm": 0.6272006034851074, + "learning_rate": 0.00024030214872909747, + "loss": 3.0444, + "step": 34511 + }, + { + "epoch": 1.69, + "grad_norm": 0.6380195021629333, + "learning_rate": 0.00024028706052657973, + "loss": 3.0712, + "step": 34512 + }, + { + "epoch": 1.69, + "grad_norm": 0.6345843076705933, + "learning_rate": 0.00024027197248133446, + "loss": 3.0436, + "step": 34513 + }, + { + "epoch": 1.69, + "grad_norm": 0.586628258228302, + "learning_rate": 0.00024025688459340127, + "loss": 3.2935, + "step": 34514 + }, + { + "epoch": 1.69, + "grad_norm": 0.6063928008079529, + "learning_rate": 0.00024024179686282, + "loss": 2.8827, + "step": 34515 + }, + { + "epoch": 1.69, + "grad_norm": 0.5976722836494446, + "learning_rate": 0.00024022670928963024, + "loss": 3.0413, + "step": 34516 + }, + { + "epoch": 1.69, + "grad_norm": 0.6519937515258789, + "learning_rate": 0.00024021162187387184, + "loss": 2.9673, + "step": 34517 + }, + { + "epoch": 1.69, + "grad_norm": 0.6437779068946838, + "learning_rate": 0.0002401965346155846, + "loss": 3.0316, + "step": 34518 + }, + { + "epoch": 1.69, + "grad_norm": 0.5971271395683289, + "learning_rate": 0.0002401814475148081, + "loss": 3.1481, + "step": 34519 + }, + { + "epoch": 1.69, + "grad_norm": 0.6342564225196838, + "learning_rate": 0.00024016636057158224, + "loss": 3.0502, + "step": 34520 + }, + { + "epoch": 1.69, + "grad_norm": 0.5975090265274048, + "learning_rate": 0.00024015127378594663, + "loss": 2.9394, + "step": 34521 + }, + { + "epoch": 1.69, + "grad_norm": 0.6094161868095398, + "learning_rate": 0.00024013618715794095, + "loss": 2.9791, + "step": 34522 + }, + { + "epoch": 1.69, + "grad_norm": 0.6090801954269409, + "learning_rate": 0.00024012110068760519, + "loss": 3.0233, + "step": 34523 + }, + { + "epoch": 1.69, + "grad_norm": 0.6072431802749634, + "learning_rate": 0.00024010601437497882, + "loss": 2.9527, + "step": 34524 + }, + { + "epoch": 1.69, + "grad_norm": 0.7141648530960083, + "learning_rate": 0.0002400909282201018, + "loss": 2.8589, + "step": 34525 + }, + { + "epoch": 1.69, + "grad_norm": 0.6420031785964966, + "learning_rate": 0.00024007584222301357, + "loss": 3.0201, + "step": 34526 + }, + { + "epoch": 1.69, + "grad_norm": 0.6151692867279053, + "learning_rate": 0.00024006075638375408, + "loss": 3.039, + "step": 34527 + }, + { + "epoch": 1.69, + "grad_norm": 0.6226093173027039, + "learning_rate": 0.00024004567070236312, + "loss": 3.0654, + "step": 34528 + }, + { + "epoch": 1.69, + "grad_norm": 0.6651445031166077, + "learning_rate": 0.00024003058517888028, + "loss": 2.8437, + "step": 34529 + }, + { + "epoch": 1.69, + "grad_norm": 0.5924610495567322, + "learning_rate": 0.00024001549981334541, + "loss": 2.7907, + "step": 34530 + }, + { + "epoch": 1.69, + "grad_norm": 0.6144448518753052, + "learning_rate": 0.00024000041460579802, + "loss": 3.0333, + "step": 34531 + }, + { + "epoch": 1.69, + "grad_norm": 0.6835182905197144, + "learning_rate": 0.00023998532955627803, + "loss": 2.8505, + "step": 34532 + }, + { + "epoch": 1.69, + "grad_norm": 0.6623280048370361, + "learning_rate": 0.00023997024466482523, + "loss": 3.0159, + "step": 34533 + }, + { + "epoch": 1.69, + "grad_norm": 0.6474083065986633, + "learning_rate": 0.00023995515993147911, + "loss": 2.9979, + "step": 34534 + }, + { + "epoch": 1.69, + "grad_norm": 0.6150049567222595, + "learning_rate": 0.00023994007535627968, + "loss": 3.1431, + "step": 34535 + }, + { + "epoch": 1.69, + "grad_norm": 0.6174440383911133, + "learning_rate": 0.00023992499093926646, + "loss": 2.996, + "step": 34536 + }, + { + "epoch": 1.69, + "grad_norm": 0.6341604590415955, + "learning_rate": 0.00023990990668047921, + "loss": 3.0105, + "step": 34537 + }, + { + "epoch": 1.69, + "grad_norm": 0.624151349067688, + "learning_rate": 0.00023989482257995783, + "loss": 3.1166, + "step": 34538 + }, + { + "epoch": 1.69, + "grad_norm": 0.5957610011100769, + "learning_rate": 0.00023987973863774184, + "loss": 2.9692, + "step": 34539 + }, + { + "epoch": 1.69, + "grad_norm": 0.655441403388977, + "learning_rate": 0.00023986465485387114, + "loss": 2.9397, + "step": 34540 + }, + { + "epoch": 1.69, + "grad_norm": 0.6130520701408386, + "learning_rate": 0.0002398495712283852, + "loss": 3.0796, + "step": 34541 + }, + { + "epoch": 1.69, + "grad_norm": 0.6145035624504089, + "learning_rate": 0.00023983448776132397, + "loss": 3.0219, + "step": 34542 + }, + { + "epoch": 1.69, + "grad_norm": 0.6132924556732178, + "learning_rate": 0.00023981940445272723, + "loss": 2.9801, + "step": 34543 + }, + { + "epoch": 1.69, + "grad_norm": 0.6345343589782715, + "learning_rate": 0.00023980432130263444, + "loss": 2.9226, + "step": 34544 + }, + { + "epoch": 1.69, + "grad_norm": 0.6521178483963013, + "learning_rate": 0.00023978923831108562, + "loss": 3.0685, + "step": 34545 + }, + { + "epoch": 1.69, + "grad_norm": 0.6504637598991394, + "learning_rate": 0.00023977415547812029, + "loss": 2.8594, + "step": 34546 + }, + { + "epoch": 1.69, + "grad_norm": 0.6152077913284302, + "learning_rate": 0.00023975907280377815, + "loss": 3.1033, + "step": 34547 + }, + { + "epoch": 1.69, + "grad_norm": 0.6426700949668884, + "learning_rate": 0.00023974399028809916, + "loss": 2.9754, + "step": 34548 + }, + { + "epoch": 1.69, + "grad_norm": 0.6243044137954712, + "learning_rate": 0.00023972890793112281, + "loss": 2.958, + "step": 34549 + }, + { + "epoch": 1.69, + "grad_norm": 0.6287975311279297, + "learning_rate": 0.00023971382573288903, + "loss": 3.0594, + "step": 34550 + }, + { + "epoch": 1.69, + "grad_norm": 0.6270124912261963, + "learning_rate": 0.00023969874369343734, + "loss": 2.976, + "step": 34551 + }, + { + "epoch": 1.69, + "grad_norm": 0.6438772678375244, + "learning_rate": 0.00023968366181280755, + "loss": 3.0046, + "step": 34552 + }, + { + "epoch": 1.69, + "grad_norm": 0.6129069328308105, + "learning_rate": 0.00023966858009103945, + "loss": 2.9007, + "step": 34553 + }, + { + "epoch": 1.69, + "grad_norm": 0.6193562150001526, + "learning_rate": 0.00023965349852817267, + "loss": 2.9481, + "step": 34554 + }, + { + "epoch": 1.69, + "grad_norm": 0.6159753203392029, + "learning_rate": 0.00023963841712424703, + "loss": 2.9394, + "step": 34555 + }, + { + "epoch": 1.69, + "grad_norm": 0.5913207530975342, + "learning_rate": 0.00023962333587930203, + "loss": 2.9432, + "step": 34556 + }, + { + "epoch": 1.69, + "grad_norm": 0.6216997504234314, + "learning_rate": 0.00023960825479337763, + "loss": 3.0619, + "step": 34557 + }, + { + "epoch": 1.69, + "grad_norm": 0.6246969699859619, + "learning_rate": 0.00023959317386651352, + "loss": 2.8386, + "step": 34558 + }, + { + "epoch": 1.69, + "grad_norm": 0.6128319501876831, + "learning_rate": 0.0002395780930987492, + "loss": 3.0849, + "step": 34559 + }, + { + "epoch": 1.69, + "grad_norm": 0.6397287249565125, + "learning_rate": 0.00023956301249012472, + "loss": 3.0055, + "step": 34560 + }, + { + "epoch": 1.69, + "grad_norm": 0.5970715880393982, + "learning_rate": 0.00023954793204067956, + "loss": 2.909, + "step": 34561 + }, + { + "epoch": 1.69, + "grad_norm": 0.6784390807151794, + "learning_rate": 0.00023953285175045346, + "loss": 3.0605, + "step": 34562 + }, + { + "epoch": 1.69, + "grad_norm": 0.611601710319519, + "learning_rate": 0.00023951777161948634, + "loss": 3.0419, + "step": 34563 + }, + { + "epoch": 1.69, + "grad_norm": 0.6383347511291504, + "learning_rate": 0.00023950269164781766, + "loss": 2.9873, + "step": 34564 + }, + { + "epoch": 1.69, + "grad_norm": 0.5747080445289612, + "learning_rate": 0.00023948761183548734, + "loss": 2.8731, + "step": 34565 + }, + { + "epoch": 1.69, + "grad_norm": 0.611871600151062, + "learning_rate": 0.00023947253218253484, + "loss": 3.0842, + "step": 34566 + }, + { + "epoch": 1.69, + "grad_norm": 0.6260141730308533, + "learning_rate": 0.00023945745268900008, + "loss": 3.1116, + "step": 34567 + }, + { + "epoch": 1.69, + "grad_norm": 0.5983566641807556, + "learning_rate": 0.00023944237335492285, + "loss": 3.0337, + "step": 34568 + }, + { + "epoch": 1.69, + "grad_norm": 0.599165678024292, + "learning_rate": 0.0002394272941803427, + "loss": 2.9508, + "step": 34569 + }, + { + "epoch": 1.69, + "grad_norm": 0.6618895530700684, + "learning_rate": 0.00023941221516529944, + "loss": 3.1141, + "step": 34570 + }, + { + "epoch": 1.69, + "grad_norm": 0.5907045006752014, + "learning_rate": 0.00023939713630983267, + "loss": 2.8549, + "step": 34571 + }, + { + "epoch": 1.69, + "grad_norm": 0.6474612951278687, + "learning_rate": 0.0002393820576139822, + "loss": 2.9113, + "step": 34572 + }, + { + "epoch": 1.69, + "grad_norm": 0.5951142311096191, + "learning_rate": 0.00023936697907778778, + "loss": 2.9788, + "step": 34573 + }, + { + "epoch": 1.69, + "grad_norm": 0.6441212296485901, + "learning_rate": 0.00023935190070128895, + "loss": 2.8805, + "step": 34574 + }, + { + "epoch": 1.69, + "grad_norm": 0.6195458173751831, + "learning_rate": 0.0002393368224845257, + "loss": 3.1319, + "step": 34575 + }, + { + "epoch": 1.69, + "grad_norm": 0.602984607219696, + "learning_rate": 0.00023932174442753748, + "loss": 3.1744, + "step": 34576 + }, + { + "epoch": 1.69, + "grad_norm": 0.616973340511322, + "learning_rate": 0.00023930666653036408, + "loss": 3.1541, + "step": 34577 + }, + { + "epoch": 1.69, + "grad_norm": 0.6357001066207886, + "learning_rate": 0.00023929158879304534, + "loss": 3.112, + "step": 34578 + }, + { + "epoch": 1.69, + "grad_norm": 0.6247024536132812, + "learning_rate": 0.00023927651121562082, + "loss": 3.2426, + "step": 34579 + }, + { + "epoch": 1.69, + "grad_norm": 0.6100210547447205, + "learning_rate": 0.00023926143379813034, + "loss": 3.0513, + "step": 34580 + }, + { + "epoch": 1.69, + "grad_norm": 0.5985271334648132, + "learning_rate": 0.00023924635654061342, + "loss": 2.9466, + "step": 34581 + }, + { + "epoch": 1.69, + "grad_norm": 0.6380882859230042, + "learning_rate": 0.00023923127944310997, + "loss": 3.0692, + "step": 34582 + }, + { + "epoch": 1.69, + "grad_norm": 0.6195142269134521, + "learning_rate": 0.0002392162025056597, + "loss": 3.1293, + "step": 34583 + }, + { + "epoch": 1.69, + "grad_norm": 0.6390209197998047, + "learning_rate": 0.00023920112572830212, + "loss": 2.8975, + "step": 34584 + }, + { + "epoch": 1.69, + "grad_norm": 0.6426082849502563, + "learning_rate": 0.00023918604911107722, + "loss": 2.9478, + "step": 34585 + }, + { + "epoch": 1.69, + "grad_norm": 0.6332377195358276, + "learning_rate": 0.00023917097265402452, + "loss": 2.9731, + "step": 34586 + }, + { + "epoch": 1.7, + "grad_norm": 0.6898009181022644, + "learning_rate": 0.0002391558963571838, + "loss": 3.0346, + "step": 34587 + }, + { + "epoch": 1.7, + "grad_norm": 0.6284445524215698, + "learning_rate": 0.0002391408202205946, + "loss": 3.1084, + "step": 34588 + }, + { + "epoch": 1.7, + "grad_norm": 0.5925804376602173, + "learning_rate": 0.00023912574424429678, + "loss": 3.0499, + "step": 34589 + }, + { + "epoch": 1.7, + "grad_norm": 0.6358410716056824, + "learning_rate": 0.0002391106684283302, + "loss": 2.8512, + "step": 34590 + }, + { + "epoch": 1.7, + "grad_norm": 0.6737104654312134, + "learning_rate": 0.0002390955927727343, + "loss": 2.946, + "step": 34591 + }, + { + "epoch": 1.7, + "grad_norm": 0.604070782661438, + "learning_rate": 0.00023908051727754897, + "loss": 3.0541, + "step": 34592 + }, + { + "epoch": 1.7, + "grad_norm": 0.5812761187553406, + "learning_rate": 0.00023906544194281367, + "loss": 3.1825, + "step": 34593 + }, + { + "epoch": 1.7, + "grad_norm": 0.5896308422088623, + "learning_rate": 0.00023905036676856835, + "loss": 3.1379, + "step": 34594 + }, + { + "epoch": 1.7, + "grad_norm": 0.6765037775039673, + "learning_rate": 0.00023903529175485272, + "loss": 2.9756, + "step": 34595 + }, + { + "epoch": 1.7, + "grad_norm": 0.6317283511161804, + "learning_rate": 0.00023902021690170623, + "loss": 3.07, + "step": 34596 + }, + { + "epoch": 1.7, + "grad_norm": 0.6281628012657166, + "learning_rate": 0.0002390051422091689, + "loss": 3.049, + "step": 34597 + }, + { + "epoch": 1.7, + "grad_norm": 0.6032910943031311, + "learning_rate": 0.00023899006767728024, + "loss": 2.9015, + "step": 34598 + }, + { + "epoch": 1.7, + "grad_norm": 0.6200457811355591, + "learning_rate": 0.00023897499330607992, + "loss": 3.0544, + "step": 34599 + }, + { + "epoch": 1.7, + "grad_norm": 0.5972832441329956, + "learning_rate": 0.00023895991909560785, + "loss": 3.0304, + "step": 34600 + }, + { + "epoch": 1.7, + "grad_norm": 0.6621426343917847, + "learning_rate": 0.00023894484504590355, + "loss": 3.1561, + "step": 34601 + }, + { + "epoch": 1.7, + "grad_norm": 0.6243399977684021, + "learning_rate": 0.00023892977115700685, + "loss": 2.9101, + "step": 34602 + }, + { + "epoch": 1.7, + "grad_norm": 0.6167004108428955, + "learning_rate": 0.00023891469742895723, + "loss": 2.9986, + "step": 34603 + }, + { + "epoch": 1.7, + "grad_norm": 0.7089727520942688, + "learning_rate": 0.0002388996238617946, + "loss": 3.242, + "step": 34604 + }, + { + "epoch": 1.7, + "grad_norm": 0.5827861428260803, + "learning_rate": 0.00023888455045555868, + "loss": 2.9402, + "step": 34605 + }, + { + "epoch": 1.7, + "grad_norm": 0.6445075273513794, + "learning_rate": 0.000238869477210289, + "loss": 3.0979, + "step": 34606 + }, + { + "epoch": 1.7, + "grad_norm": 0.6177965402603149, + "learning_rate": 0.0002388544041260254, + "loss": 2.9209, + "step": 34607 + }, + { + "epoch": 1.7, + "grad_norm": 0.6588315963745117, + "learning_rate": 0.00023883933120280747, + "loss": 2.9099, + "step": 34608 + }, + { + "epoch": 1.7, + "grad_norm": 0.6694788336753845, + "learning_rate": 0.000238824258440675, + "loss": 2.9249, + "step": 34609 + }, + { + "epoch": 1.7, + "grad_norm": 0.6350728869438171, + "learning_rate": 0.00023880918583966777, + "loss": 3.1211, + "step": 34610 + }, + { + "epoch": 1.7, + "grad_norm": 0.6018306016921997, + "learning_rate": 0.0002387941133998252, + "loss": 3.068, + "step": 34611 + }, + { + "epoch": 1.7, + "grad_norm": 0.6151270866394043, + "learning_rate": 0.00023877904112118726, + "loss": 2.7378, + "step": 34612 + }, + { + "epoch": 1.7, + "grad_norm": 0.6024587154388428, + "learning_rate": 0.0002387639690037935, + "loss": 3.111, + "step": 34613 + }, + { + "epoch": 1.7, + "grad_norm": 0.6354588866233826, + "learning_rate": 0.00023874889704768366, + "loss": 3.0932, + "step": 34614 + }, + { + "epoch": 1.7, + "grad_norm": 0.6052812337875366, + "learning_rate": 0.0002387338252528975, + "loss": 3.0522, + "step": 34615 + }, + { + "epoch": 1.7, + "grad_norm": 0.6550926566123962, + "learning_rate": 0.00023871875361947462, + "loss": 3.0193, + "step": 34616 + }, + { + "epoch": 1.7, + "grad_norm": 0.6020631790161133, + "learning_rate": 0.00023870368214745478, + "loss": 2.9907, + "step": 34617 + }, + { + "epoch": 1.7, + "grad_norm": 0.6404764652252197, + "learning_rate": 0.00023868861083687753, + "loss": 3.0878, + "step": 34618 + }, + { + "epoch": 1.7, + "grad_norm": 0.6281757950782776, + "learning_rate": 0.00023867353968778273, + "loss": 2.9839, + "step": 34619 + }, + { + "epoch": 1.7, + "grad_norm": 0.6446701884269714, + "learning_rate": 0.0002386584687002101, + "loss": 3.0934, + "step": 34620 + }, + { + "epoch": 1.7, + "grad_norm": 0.6888076663017273, + "learning_rate": 0.00023864339787419916, + "loss": 3.2198, + "step": 34621 + }, + { + "epoch": 1.7, + "grad_norm": 0.7113660573959351, + "learning_rate": 0.0002386283272097898, + "loss": 3.125, + "step": 34622 + }, + { + "epoch": 1.7, + "grad_norm": 0.6033316254615784, + "learning_rate": 0.00023861325670702154, + "loss": 2.9347, + "step": 34623 + }, + { + "epoch": 1.7, + "grad_norm": 0.6461472511291504, + "learning_rate": 0.00023859818636593407, + "loss": 2.9211, + "step": 34624 + }, + { + "epoch": 1.7, + "grad_norm": 0.6217671632766724, + "learning_rate": 0.00023858311618656734, + "loss": 2.9407, + "step": 34625 + }, + { + "epoch": 1.7, + "grad_norm": 0.7843379378318787, + "learning_rate": 0.00023856804616896075, + "loss": 3.0633, + "step": 34626 + }, + { + "epoch": 1.7, + "grad_norm": 0.6176000237464905, + "learning_rate": 0.0002385529763131542, + "loss": 3.0751, + "step": 34627 + }, + { + "epoch": 1.7, + "grad_norm": 0.6316340565681458, + "learning_rate": 0.00023853790661918715, + "loss": 2.9783, + "step": 34628 + }, + { + "epoch": 1.7, + "grad_norm": 0.63956218957901, + "learning_rate": 0.00023852283708709944, + "loss": 3.0346, + "step": 34629 + }, + { + "epoch": 1.7, + "grad_norm": 0.6014463901519775, + "learning_rate": 0.00023850776771693082, + "loss": 3.088, + "step": 34630 + }, + { + "epoch": 1.7, + "grad_norm": 0.6172289252281189, + "learning_rate": 0.0002384926985087209, + "loss": 2.9347, + "step": 34631 + }, + { + "epoch": 1.7, + "grad_norm": 0.6280187964439392, + "learning_rate": 0.00023847762946250942, + "loss": 2.9345, + "step": 34632 + }, + { + "epoch": 1.7, + "grad_norm": 0.5740066170692444, + "learning_rate": 0.0002384625605783359, + "loss": 2.9651, + "step": 34633 + }, + { + "epoch": 1.7, + "grad_norm": 0.6673811078071594, + "learning_rate": 0.0002384474918562402, + "loss": 3.2161, + "step": 34634 + }, + { + "epoch": 1.7, + "grad_norm": 0.6165456175804138, + "learning_rate": 0.000238432423296262, + "loss": 3.0175, + "step": 34635 + }, + { + "epoch": 1.7, + "grad_norm": 0.6148802042007446, + "learning_rate": 0.0002384173548984409, + "loss": 2.9729, + "step": 34636 + }, + { + "epoch": 1.7, + "grad_norm": 0.6252344846725464, + "learning_rate": 0.0002384022866628167, + "loss": 3.0979, + "step": 34637 + }, + { + "epoch": 1.7, + "grad_norm": 0.6333229541778564, + "learning_rate": 0.00023838721858942898, + "loss": 2.8437, + "step": 34638 + }, + { + "epoch": 1.7, + "grad_norm": 0.6194198131561279, + "learning_rate": 0.0002383721506783174, + "loss": 2.8774, + "step": 34639 + }, + { + "epoch": 1.7, + "grad_norm": 0.6095027327537537, + "learning_rate": 0.00023835708292952183, + "loss": 2.9816, + "step": 34640 + }, + { + "epoch": 1.7, + "grad_norm": 0.613484799861908, + "learning_rate": 0.0002383420153430818, + "loss": 3.0787, + "step": 34641 + }, + { + "epoch": 1.7, + "grad_norm": 0.5842860341072083, + "learning_rate": 0.00023832694791903707, + "loss": 3.0523, + "step": 34642 + }, + { + "epoch": 1.7, + "grad_norm": 0.6341204047203064, + "learning_rate": 0.0002383118806574272, + "loss": 3.1984, + "step": 34643 + }, + { + "epoch": 1.7, + "grad_norm": 0.6389890313148499, + "learning_rate": 0.00023829681355829203, + "loss": 2.8034, + "step": 34644 + }, + { + "epoch": 1.7, + "grad_norm": 0.5947014689445496, + "learning_rate": 0.00023828174662167122, + "loss": 3.1272, + "step": 34645 + }, + { + "epoch": 1.7, + "grad_norm": 0.6640393733978271, + "learning_rate": 0.00023826667984760426, + "loss": 2.9868, + "step": 34646 + }, + { + "epoch": 1.7, + "grad_norm": 0.6023179292678833, + "learning_rate": 0.00023825161323613113, + "loss": 3.1548, + "step": 34647 + }, + { + "epoch": 1.7, + "grad_norm": 0.6089648604393005, + "learning_rate": 0.0002382365467872913, + "loss": 2.9102, + "step": 34648 + }, + { + "epoch": 1.7, + "grad_norm": 0.6076211333274841, + "learning_rate": 0.00023822148050112448, + "loss": 3.0588, + "step": 34649 + }, + { + "epoch": 1.7, + "grad_norm": 0.7229902744293213, + "learning_rate": 0.0002382064143776705, + "loss": 2.927, + "step": 34650 + }, + { + "epoch": 1.7, + "grad_norm": 0.5934354662895203, + "learning_rate": 0.00023819134841696882, + "loss": 2.9372, + "step": 34651 + }, + { + "epoch": 1.7, + "grad_norm": 0.6203776597976685, + "learning_rate": 0.00023817628261905933, + "loss": 3.0984, + "step": 34652 + }, + { + "epoch": 1.7, + "grad_norm": 0.6217381358146667, + "learning_rate": 0.0002381612169839816, + "loss": 3.1926, + "step": 34653 + }, + { + "epoch": 1.7, + "grad_norm": 0.5950883030891418, + "learning_rate": 0.0002381461515117752, + "loss": 2.8172, + "step": 34654 + }, + { + "epoch": 1.7, + "grad_norm": 0.622291624546051, + "learning_rate": 0.0002381310862024801, + "loss": 2.9426, + "step": 34655 + }, + { + "epoch": 1.7, + "grad_norm": 0.6245651841163635, + "learning_rate": 0.00023811602105613575, + "loss": 3.0114, + "step": 34656 + }, + { + "epoch": 1.7, + "grad_norm": 0.6118585467338562, + "learning_rate": 0.0002381009560727819, + "loss": 3.003, + "step": 34657 + }, + { + "epoch": 1.7, + "grad_norm": 0.7004932761192322, + "learning_rate": 0.00023808589125245814, + "loss": 2.9431, + "step": 34658 + }, + { + "epoch": 1.7, + "grad_norm": 0.6721332669258118, + "learning_rate": 0.0002380708265952043, + "loss": 2.9977, + "step": 34659 + }, + { + "epoch": 1.7, + "grad_norm": 0.6560796499252319, + "learning_rate": 0.00023805576210106003, + "loss": 2.9513, + "step": 34660 + }, + { + "epoch": 1.7, + "grad_norm": 0.5827532410621643, + "learning_rate": 0.0002380406977700648, + "loss": 2.8129, + "step": 34661 + }, + { + "epoch": 1.7, + "grad_norm": 0.6369756460189819, + "learning_rate": 0.00023802563360225863, + "loss": 3.1372, + "step": 34662 + }, + { + "epoch": 1.7, + "grad_norm": 0.6369935870170593, + "learning_rate": 0.00023801056959768093, + "loss": 2.876, + "step": 34663 + }, + { + "epoch": 1.7, + "grad_norm": 0.6207792162895203, + "learning_rate": 0.00023799550575637153, + "loss": 3.0598, + "step": 34664 + }, + { + "epoch": 1.7, + "grad_norm": 0.5954633355140686, + "learning_rate": 0.0002379804420783699, + "loss": 2.9627, + "step": 34665 + }, + { + "epoch": 1.7, + "grad_norm": 0.6295991539955139, + "learning_rate": 0.00023796537856371589, + "loss": 3.0739, + "step": 34666 + }, + { + "epoch": 1.7, + "grad_norm": 0.6417800188064575, + "learning_rate": 0.00023795031521244922, + "loss": 3.0968, + "step": 34667 + }, + { + "epoch": 1.7, + "grad_norm": 0.6085421442985535, + "learning_rate": 0.00023793525202460937, + "loss": 2.824, + "step": 34668 + }, + { + "epoch": 1.7, + "grad_norm": 0.6392890214920044, + "learning_rate": 0.00023792018900023622, + "loss": 2.9634, + "step": 34669 + }, + { + "epoch": 1.7, + "grad_norm": 0.6218932271003723, + "learning_rate": 0.00023790512613936925, + "loss": 3.0132, + "step": 34670 + }, + { + "epoch": 1.7, + "grad_norm": 0.642927885055542, + "learning_rate": 0.00023789006344204823, + "loss": 2.981, + "step": 34671 + }, + { + "epoch": 1.7, + "grad_norm": 0.6467803120613098, + "learning_rate": 0.00023787500090831294, + "loss": 3.0479, + "step": 34672 + }, + { + "epoch": 1.7, + "grad_norm": 0.6575911045074463, + "learning_rate": 0.00023785993853820282, + "loss": 2.999, + "step": 34673 + }, + { + "epoch": 1.7, + "grad_norm": 0.6105745434761047, + "learning_rate": 0.00023784487633175778, + "loss": 3.017, + "step": 34674 + }, + { + "epoch": 1.7, + "grad_norm": 0.6278857588768005, + "learning_rate": 0.0002378298142890173, + "loss": 3.0142, + "step": 34675 + }, + { + "epoch": 1.7, + "grad_norm": 0.7186394929885864, + "learning_rate": 0.00023781475241002107, + "loss": 3.0599, + "step": 34676 + }, + { + "epoch": 1.7, + "grad_norm": 0.6153616309165955, + "learning_rate": 0.00023779969069480896, + "loss": 3.1681, + "step": 34677 + }, + { + "epoch": 1.7, + "grad_norm": 0.6550021171569824, + "learning_rate": 0.00023778462914342038, + "loss": 3.1913, + "step": 34678 + }, + { + "epoch": 1.7, + "grad_norm": 0.6431772112846375, + "learning_rate": 0.0002377695677558952, + "loss": 3.0687, + "step": 34679 + }, + { + "epoch": 1.7, + "grad_norm": 0.5983546376228333, + "learning_rate": 0.0002377545065322729, + "loss": 2.9564, + "step": 34680 + }, + { + "epoch": 1.7, + "grad_norm": 0.6056777238845825, + "learning_rate": 0.00023773944547259327, + "loss": 3.1077, + "step": 34681 + }, + { + "epoch": 1.7, + "grad_norm": 0.6222465634346008, + "learning_rate": 0.00023772438457689607, + "loss": 3.2957, + "step": 34682 + }, + { + "epoch": 1.7, + "grad_norm": 0.5904668569564819, + "learning_rate": 0.00023770932384522072, + "loss": 3.1094, + "step": 34683 + }, + { + "epoch": 1.7, + "grad_norm": 0.6390523910522461, + "learning_rate": 0.00023769426327760712, + "loss": 3.1585, + "step": 34684 + }, + { + "epoch": 1.7, + "grad_norm": 0.6436138153076172, + "learning_rate": 0.0002376792028740948, + "loss": 2.9761, + "step": 34685 + }, + { + "epoch": 1.7, + "grad_norm": 0.6362774968147278, + "learning_rate": 0.00023766414263472338, + "loss": 3.1662, + "step": 34686 + }, + { + "epoch": 1.7, + "grad_norm": 0.6165968775749207, + "learning_rate": 0.00023764908255953277, + "loss": 2.9115, + "step": 34687 + }, + { + "epoch": 1.7, + "grad_norm": 0.6462124586105347, + "learning_rate": 0.00023763402264856238, + "loss": 3.1004, + "step": 34688 + }, + { + "epoch": 1.7, + "grad_norm": 0.6038845777511597, + "learning_rate": 0.00023761896290185207, + "loss": 3.167, + "step": 34689 + }, + { + "epoch": 1.7, + "grad_norm": 0.6276283860206604, + "learning_rate": 0.00023760390331944127, + "loss": 3.0901, + "step": 34690 + }, + { + "epoch": 1.7, + "grad_norm": 0.6404023766517639, + "learning_rate": 0.00023758884390136974, + "loss": 3.031, + "step": 34691 + }, + { + "epoch": 1.7, + "grad_norm": 0.5949774384498596, + "learning_rate": 0.00023757378464767735, + "loss": 3.1607, + "step": 34692 + }, + { + "epoch": 1.7, + "grad_norm": 0.5838576555252075, + "learning_rate": 0.00023755872555840355, + "loss": 2.8934, + "step": 34693 + }, + { + "epoch": 1.7, + "grad_norm": 0.6016432046890259, + "learning_rate": 0.00023754366663358804, + "loss": 2.9678, + "step": 34694 + }, + { + "epoch": 1.7, + "grad_norm": 0.6535770297050476, + "learning_rate": 0.00023752860787327044, + "loss": 3.088, + "step": 34695 + }, + { + "epoch": 1.7, + "grad_norm": 0.5867074131965637, + "learning_rate": 0.0002375135492774905, + "loss": 3.1053, + "step": 34696 + }, + { + "epoch": 1.7, + "grad_norm": 0.6075217127799988, + "learning_rate": 0.0002374984908462879, + "loss": 2.8846, + "step": 34697 + }, + { + "epoch": 1.7, + "grad_norm": 0.5963104367256165, + "learning_rate": 0.00023748343257970215, + "loss": 3.1219, + "step": 34698 + }, + { + "epoch": 1.7, + "grad_norm": 0.5905624032020569, + "learning_rate": 0.00023746837447777313, + "loss": 3.1523, + "step": 34699 + }, + { + "epoch": 1.7, + "grad_norm": 0.6578564643859863, + "learning_rate": 0.00023745331654054027, + "loss": 2.8213, + "step": 34700 + }, + { + "epoch": 1.7, + "grad_norm": 0.609445333480835, + "learning_rate": 0.0002374382587680433, + "loss": 3.1463, + "step": 34701 + }, + { + "epoch": 1.7, + "grad_norm": 0.6245207190513611, + "learning_rate": 0.00023742320116032202, + "loss": 3.0789, + "step": 34702 + }, + { + "epoch": 1.7, + "grad_norm": 0.6319349408149719, + "learning_rate": 0.00023740814371741597, + "loss": 2.9953, + "step": 34703 + }, + { + "epoch": 1.7, + "grad_norm": 0.6266147494316101, + "learning_rate": 0.00023739308643936486, + "loss": 2.9907, + "step": 34704 + }, + { + "epoch": 1.7, + "grad_norm": 0.6235140562057495, + "learning_rate": 0.00023737802932620819, + "loss": 3.0847, + "step": 34705 + }, + { + "epoch": 1.7, + "grad_norm": 0.6102908253669739, + "learning_rate": 0.0002373629723779858, + "loss": 3.0529, + "step": 34706 + }, + { + "epoch": 1.7, + "grad_norm": 0.5892670750617981, + "learning_rate": 0.00023734791559473732, + "loss": 2.8094, + "step": 34707 + }, + { + "epoch": 1.7, + "grad_norm": 0.658932089805603, + "learning_rate": 0.0002373328589765023, + "loss": 3.0447, + "step": 34708 + }, + { + "epoch": 1.7, + "grad_norm": 0.6005896925926208, + "learning_rate": 0.00023731780252332058, + "loss": 3.0984, + "step": 34709 + }, + { + "epoch": 1.7, + "grad_norm": 0.6177623867988586, + "learning_rate": 0.00023730274623523162, + "loss": 3.033, + "step": 34710 + }, + { + "epoch": 1.7, + "grad_norm": 0.6345233917236328, + "learning_rate": 0.00023728769011227509, + "loss": 3.0754, + "step": 34711 + }, + { + "epoch": 1.7, + "grad_norm": 0.7004926800727844, + "learning_rate": 0.00023727263415449087, + "loss": 3.1004, + "step": 34712 + }, + { + "epoch": 1.7, + "grad_norm": 0.6280829906463623, + "learning_rate": 0.0002372575783619183, + "loss": 2.9972, + "step": 34713 + }, + { + "epoch": 1.7, + "grad_norm": 0.661109983921051, + "learning_rate": 0.00023724252273459736, + "loss": 2.9467, + "step": 34714 + }, + { + "epoch": 1.7, + "grad_norm": 0.6094497442245483, + "learning_rate": 0.00023722746727256744, + "loss": 2.869, + "step": 34715 + }, + { + "epoch": 1.7, + "grad_norm": 0.6951985359191895, + "learning_rate": 0.00023721241197586826, + "loss": 3.0284, + "step": 34716 + }, + { + "epoch": 1.7, + "grad_norm": 0.6294052004814148, + "learning_rate": 0.0002371973568445396, + "loss": 2.898, + "step": 34717 + }, + { + "epoch": 1.7, + "grad_norm": 0.6243933439254761, + "learning_rate": 0.00023718230187862098, + "loss": 2.8577, + "step": 34718 + }, + { + "epoch": 1.7, + "grad_norm": 0.6630285978317261, + "learning_rate": 0.00023716724707815217, + "loss": 2.9685, + "step": 34719 + }, + { + "epoch": 1.7, + "grad_norm": 0.6620771288871765, + "learning_rate": 0.00023715219244317256, + "loss": 3.0159, + "step": 34720 + }, + { + "epoch": 1.7, + "grad_norm": 0.6197815537452698, + "learning_rate": 0.00023713713797372202, + "loss": 2.9542, + "step": 34721 + }, + { + "epoch": 1.7, + "grad_norm": 0.6559625864028931, + "learning_rate": 0.0002371220836698403, + "loss": 2.9549, + "step": 34722 + }, + { + "epoch": 1.7, + "grad_norm": 0.6281477212905884, + "learning_rate": 0.00023710702953156675, + "loss": 2.9301, + "step": 34723 + }, + { + "epoch": 1.7, + "grad_norm": 0.6305189728736877, + "learning_rate": 0.00023709197555894128, + "loss": 2.9736, + "step": 34724 + }, + { + "epoch": 1.7, + "grad_norm": 0.6746921539306641, + "learning_rate": 0.0002370769217520034, + "loss": 2.9676, + "step": 34725 + }, + { + "epoch": 1.7, + "grad_norm": 0.6075299382209778, + "learning_rate": 0.00023706186811079278, + "loss": 2.7907, + "step": 34726 + }, + { + "epoch": 1.7, + "grad_norm": 0.61752849817276, + "learning_rate": 0.00023704681463534915, + "loss": 3.1077, + "step": 34727 + }, + { + "epoch": 1.7, + "grad_norm": 0.5963026285171509, + "learning_rate": 0.00023703176132571207, + "loss": 2.9804, + "step": 34728 + }, + { + "epoch": 1.7, + "grad_norm": 0.6269551515579224, + "learning_rate": 0.00023701670818192122, + "loss": 2.8909, + "step": 34729 + }, + { + "epoch": 1.7, + "grad_norm": 0.6345133781433105, + "learning_rate": 0.00023700165520401618, + "loss": 3.0245, + "step": 34730 + }, + { + "epoch": 1.7, + "grad_norm": 0.6552383899688721, + "learning_rate": 0.00023698660239203663, + "loss": 2.9909, + "step": 34731 + }, + { + "epoch": 1.7, + "grad_norm": 0.6076620817184448, + "learning_rate": 0.00023697154974602235, + "loss": 3.0518, + "step": 34732 + }, + { + "epoch": 1.7, + "grad_norm": 0.6243478655815125, + "learning_rate": 0.00023695649726601284, + "loss": 2.9917, + "step": 34733 + }, + { + "epoch": 1.7, + "grad_norm": 0.6108534336090088, + "learning_rate": 0.00023694144495204782, + "loss": 2.8576, + "step": 34734 + }, + { + "epoch": 1.7, + "grad_norm": 0.658390998840332, + "learning_rate": 0.00023692639280416679, + "loss": 3.1793, + "step": 34735 + }, + { + "epoch": 1.7, + "grad_norm": 0.649669349193573, + "learning_rate": 0.00023691134082240957, + "loss": 3.0141, + "step": 34736 + }, + { + "epoch": 1.7, + "grad_norm": 0.6026470065116882, + "learning_rate": 0.0002368962890068158, + "loss": 2.8894, + "step": 34737 + }, + { + "epoch": 1.7, + "grad_norm": 0.5986788868904114, + "learning_rate": 0.00023688123735742493, + "loss": 2.9405, + "step": 34738 + }, + { + "epoch": 1.7, + "grad_norm": 0.6287503838539124, + "learning_rate": 0.00023686618587427685, + "loss": 2.9873, + "step": 34739 + }, + { + "epoch": 1.7, + "grad_norm": 0.617247998714447, + "learning_rate": 0.00023685113455741103, + "loss": 2.9822, + "step": 34740 + }, + { + "epoch": 1.7, + "grad_norm": 0.6538761258125305, + "learning_rate": 0.00023683608340686722, + "loss": 3.2603, + "step": 34741 + }, + { + "epoch": 1.7, + "grad_norm": 0.5831221342086792, + "learning_rate": 0.00023682103242268489, + "loss": 3.0979, + "step": 34742 + }, + { + "epoch": 1.7, + "grad_norm": 0.6267368197441101, + "learning_rate": 0.00023680598160490387, + "loss": 2.7693, + "step": 34743 + }, + { + "epoch": 1.7, + "grad_norm": 0.6173529624938965, + "learning_rate": 0.0002367909309535638, + "loss": 2.883, + "step": 34744 + }, + { + "epoch": 1.7, + "grad_norm": 0.6151677370071411, + "learning_rate": 0.00023677588046870413, + "loss": 3.2401, + "step": 34745 + }, + { + "epoch": 1.7, + "grad_norm": 0.6226587891578674, + "learning_rate": 0.00023676083015036474, + "loss": 2.8953, + "step": 34746 + }, + { + "epoch": 1.7, + "grad_norm": 0.5849825739860535, + "learning_rate": 0.00023674577999858507, + "loss": 2.8515, + "step": 34747 + }, + { + "epoch": 1.7, + "grad_norm": 0.5755380988121033, + "learning_rate": 0.0002367307300134048, + "loss": 3.012, + "step": 34748 + }, + { + "epoch": 1.7, + "grad_norm": 0.6167582273483276, + "learning_rate": 0.00023671568019486376, + "loss": 2.9798, + "step": 34749 + }, + { + "epoch": 1.7, + "grad_norm": 0.5916376709938049, + "learning_rate": 0.00023670063054300134, + "loss": 3.0067, + "step": 34750 + }, + { + "epoch": 1.7, + "grad_norm": 0.639107882976532, + "learning_rate": 0.00023668558105785736, + "loss": 3.067, + "step": 34751 + }, + { + "epoch": 1.7, + "grad_norm": 0.610746443271637, + "learning_rate": 0.00023667053173947122, + "loss": 2.9683, + "step": 34752 + }, + { + "epoch": 1.7, + "grad_norm": 0.6915435194969177, + "learning_rate": 0.00023665548258788274, + "loss": 3.0443, + "step": 34753 + }, + { + "epoch": 1.7, + "grad_norm": 0.6022907495498657, + "learning_rate": 0.00023664043360313166, + "loss": 3.0091, + "step": 34754 + }, + { + "epoch": 1.7, + "grad_norm": 0.6308808326721191, + "learning_rate": 0.0002366253847852574, + "loss": 2.9871, + "step": 34755 + }, + { + "epoch": 1.7, + "grad_norm": 0.6034700870513916, + "learning_rate": 0.00023661033613429976, + "loss": 3.1442, + "step": 34756 + }, + { + "epoch": 1.7, + "grad_norm": 0.5891356468200684, + "learning_rate": 0.00023659528765029815, + "loss": 3.0425, + "step": 34757 + }, + { + "epoch": 1.7, + "grad_norm": 0.6369116902351379, + "learning_rate": 0.00023658023933329243, + "loss": 3.1113, + "step": 34758 + }, + { + "epoch": 1.7, + "grad_norm": 0.605215311050415, + "learning_rate": 0.0002365651911833222, + "loss": 3.1997, + "step": 34759 + }, + { + "epoch": 1.7, + "grad_norm": 0.6010726094245911, + "learning_rate": 0.00023655014320042692, + "loss": 2.907, + "step": 34760 + }, + { + "epoch": 1.7, + "grad_norm": 0.615598201751709, + "learning_rate": 0.00023653509538464648, + "loss": 2.9991, + "step": 34761 + }, + { + "epoch": 1.7, + "grad_norm": 0.6162781119346619, + "learning_rate": 0.00023652004773602034, + "loss": 3.1683, + "step": 34762 + }, + { + "epoch": 1.7, + "grad_norm": 0.6095041036605835, + "learning_rate": 0.0002365050002545881, + "loss": 2.9159, + "step": 34763 + }, + { + "epoch": 1.7, + "grad_norm": 0.5748782753944397, + "learning_rate": 0.0002364899529403896, + "loss": 2.7686, + "step": 34764 + }, + { + "epoch": 1.7, + "grad_norm": 0.6566309332847595, + "learning_rate": 0.00023647490579346432, + "loss": 2.8777, + "step": 34765 + }, + { + "epoch": 1.7, + "grad_norm": 0.5985787510871887, + "learning_rate": 0.0002364598588138519, + "loss": 2.9995, + "step": 34766 + }, + { + "epoch": 1.7, + "grad_norm": 0.6187407374382019, + "learning_rate": 0.0002364448120015919, + "loss": 2.8916, + "step": 34767 + }, + { + "epoch": 1.7, + "grad_norm": 0.6135745644569397, + "learning_rate": 0.00023642976535672407, + "loss": 3.0603, + "step": 34768 + }, + { + "epoch": 1.7, + "grad_norm": 0.6574584245681763, + "learning_rate": 0.0002364147188792881, + "loss": 2.9804, + "step": 34769 + }, + { + "epoch": 1.7, + "grad_norm": 0.6308838725090027, + "learning_rate": 0.00023639967256932338, + "loss": 2.9685, + "step": 34770 + }, + { + "epoch": 1.7, + "grad_norm": 0.6589849591255188, + "learning_rate": 0.00023638462642686984, + "loss": 3.1744, + "step": 34771 + }, + { + "epoch": 1.7, + "grad_norm": 0.6437431573867798, + "learning_rate": 0.00023636958045196676, + "loss": 2.9364, + "step": 34772 + }, + { + "epoch": 1.7, + "grad_norm": 0.6591246128082275, + "learning_rate": 0.00023635453464465408, + "loss": 3.0042, + "step": 34773 + }, + { + "epoch": 1.7, + "grad_norm": 0.6104686260223389, + "learning_rate": 0.00023633948900497135, + "loss": 2.9775, + "step": 34774 + }, + { + "epoch": 1.7, + "grad_norm": 0.6379290819168091, + "learning_rate": 0.00023632444353295802, + "loss": 2.9928, + "step": 34775 + }, + { + "epoch": 1.7, + "grad_norm": 0.6709439158439636, + "learning_rate": 0.00023630939822865397, + "loss": 2.8491, + "step": 34776 + }, + { + "epoch": 1.7, + "grad_norm": 0.6231077909469604, + "learning_rate": 0.00023629435309209865, + "loss": 2.9117, + "step": 34777 + }, + { + "epoch": 1.7, + "grad_norm": 0.6265411376953125, + "learning_rate": 0.00023627930812333172, + "loss": 3.1632, + "step": 34778 + }, + { + "epoch": 1.7, + "grad_norm": 0.6154355406761169, + "learning_rate": 0.00023626426332239292, + "loss": 3.1184, + "step": 34779 + }, + { + "epoch": 1.7, + "grad_norm": 0.6437824964523315, + "learning_rate": 0.0002362492186893217, + "loss": 2.844, + "step": 34780 + }, + { + "epoch": 1.7, + "grad_norm": 0.6491339206695557, + "learning_rate": 0.00023623417422415788, + "loss": 2.8048, + "step": 34781 + }, + { + "epoch": 1.7, + "grad_norm": 0.6449874639511108, + "learning_rate": 0.0002362191299269408, + "loss": 3.0313, + "step": 34782 + }, + { + "epoch": 1.7, + "grad_norm": 0.6093539595603943, + "learning_rate": 0.00023620408579771038, + "loss": 3.1829, + "step": 34783 + }, + { + "epoch": 1.7, + "grad_norm": 0.6419656872749329, + "learning_rate": 0.00023618904183650614, + "loss": 3.3079, + "step": 34784 + }, + { + "epoch": 1.7, + "grad_norm": 0.6378693580627441, + "learning_rate": 0.00023617399804336755, + "loss": 2.9831, + "step": 34785 + }, + { + "epoch": 1.7, + "grad_norm": 0.6107136607170105, + "learning_rate": 0.0002361589544183345, + "loss": 2.9476, + "step": 34786 + }, + { + "epoch": 1.7, + "grad_norm": 0.6500281691551208, + "learning_rate": 0.0002361439109614464, + "loss": 3.0999, + "step": 34787 + }, + { + "epoch": 1.7, + "grad_norm": 0.5922091007232666, + "learning_rate": 0.00023612886767274293, + "loss": 3.0714, + "step": 34788 + }, + { + "epoch": 1.7, + "grad_norm": 0.6356111764907837, + "learning_rate": 0.00023611382455226385, + "loss": 2.8917, + "step": 34789 + }, + { + "epoch": 1.7, + "grad_norm": 0.6688171029090881, + "learning_rate": 0.00023609878160004856, + "loss": 3.0792, + "step": 34790 + }, + { + "epoch": 1.71, + "grad_norm": 0.6286024451255798, + "learning_rate": 0.00023608373881613685, + "loss": 3.1178, + "step": 34791 + }, + { + "epoch": 1.71, + "grad_norm": 0.6304290294647217, + "learning_rate": 0.00023606869620056817, + "loss": 2.9926, + "step": 34792 + }, + { + "epoch": 1.71, + "grad_norm": 0.6616683006286621, + "learning_rate": 0.0002360536537533822, + "loss": 2.761, + "step": 34793 + }, + { + "epoch": 1.71, + "grad_norm": 0.5931280851364136, + "learning_rate": 0.0002360386114746188, + "loss": 2.9022, + "step": 34794 + }, + { + "epoch": 1.71, + "grad_norm": 0.6022833585739136, + "learning_rate": 0.00023602356936431728, + "loss": 3.0123, + "step": 34795 + }, + { + "epoch": 1.71, + "grad_norm": 0.6116443276405334, + "learning_rate": 0.00023600852742251742, + "loss": 2.9503, + "step": 34796 + }, + { + "epoch": 1.71, + "grad_norm": 0.651864230632782, + "learning_rate": 0.00023599348564925867, + "loss": 2.966, + "step": 34797 + }, + { + "epoch": 1.71, + "grad_norm": 0.6232309341430664, + "learning_rate": 0.0002359784440445808, + "loss": 2.9551, + "step": 34798 + }, + { + "epoch": 1.71, + "grad_norm": 0.5941501259803772, + "learning_rate": 0.00023596340260852348, + "loss": 3.1523, + "step": 34799 + }, + { + "epoch": 1.71, + "grad_norm": 0.6085199117660522, + "learning_rate": 0.0002359483613411261, + "loss": 2.8691, + "step": 34800 + }, + { + "epoch": 1.71, + "grad_norm": 0.6391234397888184, + "learning_rate": 0.00023593332024242854, + "loss": 2.9873, + "step": 34801 + }, + { + "epoch": 1.71, + "grad_norm": 0.6489654183387756, + "learning_rate": 0.0002359182793124702, + "loss": 2.9056, + "step": 34802 + }, + { + "epoch": 1.71, + "grad_norm": 0.6384288668632507, + "learning_rate": 0.0002359032385512907, + "loss": 3.1661, + "step": 34803 + }, + { + "epoch": 1.71, + "grad_norm": 0.660001814365387, + "learning_rate": 0.00023588819795892994, + "loss": 3.021, + "step": 34804 + }, + { + "epoch": 1.71, + "grad_norm": 0.6211850047111511, + "learning_rate": 0.0002358731575354272, + "loss": 3.0002, + "step": 34805 + }, + { + "epoch": 1.71, + "grad_norm": 0.5897373557090759, + "learning_rate": 0.0002358581172808223, + "loss": 3.1594, + "step": 34806 + }, + { + "epoch": 1.71, + "grad_norm": 0.5892007350921631, + "learning_rate": 0.00023584307719515469, + "loss": 2.7936, + "step": 34807 + }, + { + "epoch": 1.71, + "grad_norm": 0.5857629776000977, + "learning_rate": 0.00023582803727846405, + "loss": 2.9798, + "step": 34808 + }, + { + "epoch": 1.71, + "grad_norm": 0.6502203941345215, + "learning_rate": 0.0002358129975307901, + "loss": 3.3007, + "step": 34809 + }, + { + "epoch": 1.71, + "grad_norm": 0.6255640983581543, + "learning_rate": 0.00023579795795217225, + "loss": 2.9249, + "step": 34810 + }, + { + "epoch": 1.71, + "grad_norm": 0.6112305521965027, + "learning_rate": 0.00023578291854265033, + "loss": 3.198, + "step": 34811 + }, + { + "epoch": 1.71, + "grad_norm": 0.6151379346847534, + "learning_rate": 0.00023576787930226385, + "loss": 2.7124, + "step": 34812 + }, + { + "epoch": 1.71, + "grad_norm": 0.6541122794151306, + "learning_rate": 0.00023575284023105229, + "loss": 2.8483, + "step": 34813 + }, + { + "epoch": 1.71, + "grad_norm": 0.6102315187454224, + "learning_rate": 0.0002357378013290555, + "loss": 2.942, + "step": 34814 + }, + { + "epoch": 1.71, + "grad_norm": 0.6251721382141113, + "learning_rate": 0.00023572276259631287, + "loss": 3.1909, + "step": 34815 + }, + { + "epoch": 1.71, + "grad_norm": 0.5804009437561035, + "learning_rate": 0.00023570772403286425, + "loss": 3.049, + "step": 34816 + }, + { + "epoch": 1.71, + "grad_norm": 0.6197915077209473, + "learning_rate": 0.00023569268563874903, + "loss": 3.0851, + "step": 34817 + }, + { + "epoch": 1.71, + "grad_norm": 0.605315089225769, + "learning_rate": 0.00023567764741400682, + "loss": 3.1499, + "step": 34818 + }, + { + "epoch": 1.71, + "grad_norm": 0.6538598537445068, + "learning_rate": 0.00023566260935867746, + "loss": 3.0767, + "step": 34819 + }, + { + "epoch": 1.71, + "grad_norm": 0.6209130883216858, + "learning_rate": 0.00023564757147280033, + "loss": 3.1515, + "step": 34820 + }, + { + "epoch": 1.71, + "grad_norm": 0.6308000087738037, + "learning_rate": 0.00023563253375641517, + "loss": 3.0422, + "step": 34821 + }, + { + "epoch": 1.71, + "grad_norm": 0.5922633409500122, + "learning_rate": 0.00023561749620956143, + "loss": 3.0058, + "step": 34822 + }, + { + "epoch": 1.71, + "grad_norm": 0.6279627084732056, + "learning_rate": 0.00023560245883227893, + "loss": 2.9634, + "step": 34823 + }, + { + "epoch": 1.71, + "grad_norm": 0.611582338809967, + "learning_rate": 0.00023558742162460707, + "loss": 2.9571, + "step": 34824 + }, + { + "epoch": 1.71, + "grad_norm": 0.5956346392631531, + "learning_rate": 0.00023557238458658553, + "loss": 3.0411, + "step": 34825 + }, + { + "epoch": 1.71, + "grad_norm": 0.6095432639122009, + "learning_rate": 0.00023555734771825402, + "loss": 3.1169, + "step": 34826 + }, + { + "epoch": 1.71, + "grad_norm": 0.5974093079566956, + "learning_rate": 0.000235542311019652, + "loss": 2.9938, + "step": 34827 + }, + { + "epoch": 1.71, + "grad_norm": 0.696919322013855, + "learning_rate": 0.0002355272744908192, + "loss": 3.144, + "step": 34828 + }, + { + "epoch": 1.71, + "grad_norm": 0.6090503931045532, + "learning_rate": 0.00023551223813179502, + "loss": 3.1853, + "step": 34829 + }, + { + "epoch": 1.71, + "grad_norm": 0.6434307098388672, + "learning_rate": 0.00023549720194261924, + "loss": 3.0533, + "step": 34830 + }, + { + "epoch": 1.71, + "grad_norm": 0.6265668272972107, + "learning_rate": 0.00023548216592333152, + "loss": 2.8724, + "step": 34831 + }, + { + "epoch": 1.71, + "grad_norm": 0.6203542947769165, + "learning_rate": 0.00023546713007397115, + "loss": 3.1396, + "step": 34832 + }, + { + "epoch": 1.71, + "grad_norm": 0.6344203948974609, + "learning_rate": 0.00023545209439457813, + "loss": 3.0088, + "step": 34833 + }, + { + "epoch": 1.71, + "grad_norm": 0.6461747288703918, + "learning_rate": 0.00023543705888519174, + "loss": 2.848, + "step": 34834 + }, + { + "epoch": 1.71, + "grad_norm": 0.6251636743545532, + "learning_rate": 0.0002354220235458518, + "loss": 2.8499, + "step": 34835 + }, + { + "epoch": 1.71, + "grad_norm": 0.6171122789382935, + "learning_rate": 0.00023540698837659784, + "loss": 2.9701, + "step": 34836 + }, + { + "epoch": 1.71, + "grad_norm": 0.6390722990036011, + "learning_rate": 0.0002353919533774693, + "loss": 3.0639, + "step": 34837 + }, + { + "epoch": 1.71, + "grad_norm": 0.6113898158073425, + "learning_rate": 0.0002353769185485061, + "loss": 3.0786, + "step": 34838 + }, + { + "epoch": 1.71, + "grad_norm": 0.640236496925354, + "learning_rate": 0.00023536188388974757, + "loss": 2.9315, + "step": 34839 + }, + { + "epoch": 1.71, + "grad_norm": 0.6200087666511536, + "learning_rate": 0.00023534684940123332, + "loss": 3.1699, + "step": 34840 + }, + { + "epoch": 1.71, + "grad_norm": 0.6289457678794861, + "learning_rate": 0.00023533181508300318, + "loss": 3.1179, + "step": 34841 + }, + { + "epoch": 1.71, + "grad_norm": 0.6203035116195679, + "learning_rate": 0.00023531678093509655, + "loss": 3.1601, + "step": 34842 + }, + { + "epoch": 1.71, + "grad_norm": 0.6237162351608276, + "learning_rate": 0.00023530174695755306, + "loss": 3.0658, + "step": 34843 + }, + { + "epoch": 1.71, + "grad_norm": 0.6613100171089172, + "learning_rate": 0.00023528671315041226, + "loss": 2.8084, + "step": 34844 + }, + { + "epoch": 1.71, + "grad_norm": 0.6259462833404541, + "learning_rate": 0.00023527167951371383, + "loss": 3.0736, + "step": 34845 + }, + { + "epoch": 1.71, + "grad_norm": 0.6277626156806946, + "learning_rate": 0.0002352566460474974, + "loss": 3.0055, + "step": 34846 + }, + { + "epoch": 1.71, + "grad_norm": 0.6393147110939026, + "learning_rate": 0.0002352416127518024, + "loss": 3.144, + "step": 34847 + }, + { + "epoch": 1.71, + "grad_norm": 0.6422019600868225, + "learning_rate": 0.00023522657962666866, + "loss": 3.0748, + "step": 34848 + }, + { + "epoch": 1.71, + "grad_norm": 0.6111181378364563, + "learning_rate": 0.00023521154667213555, + "loss": 3.024, + "step": 34849 + }, + { + "epoch": 1.71, + "grad_norm": 0.6628186702728271, + "learning_rate": 0.0002351965138882427, + "loss": 3.0101, + "step": 34850 + }, + { + "epoch": 1.71, + "grad_norm": 0.6172040700912476, + "learning_rate": 0.0002351814812750299, + "loss": 3.1652, + "step": 34851 + }, + { + "epoch": 1.71, + "grad_norm": 0.6279569268226624, + "learning_rate": 0.00023516644883253654, + "loss": 3.0395, + "step": 34852 + }, + { + "epoch": 1.71, + "grad_norm": 0.6078111529350281, + "learning_rate": 0.00023515141656080236, + "loss": 2.9895, + "step": 34853 + }, + { + "epoch": 1.71, + "grad_norm": 0.6228066682815552, + "learning_rate": 0.0002351363844598667, + "loss": 2.9287, + "step": 34854 + }, + { + "epoch": 1.71, + "grad_norm": 0.7036370038986206, + "learning_rate": 0.00023512135252976935, + "loss": 3.1124, + "step": 34855 + }, + { + "epoch": 1.71, + "grad_norm": 0.5945392847061157, + "learning_rate": 0.00023510632077054997, + "loss": 2.9471, + "step": 34856 + }, + { + "epoch": 1.71, + "grad_norm": 0.6402318477630615, + "learning_rate": 0.00023509128918224802, + "loss": 3.1473, + "step": 34857 + }, + { + "epoch": 1.71, + "grad_norm": 0.6327133774757385, + "learning_rate": 0.00023507625776490317, + "loss": 2.855, + "step": 34858 + }, + { + "epoch": 1.71, + "grad_norm": 0.6053986549377441, + "learning_rate": 0.0002350612265185548, + "loss": 3.0087, + "step": 34859 + }, + { + "epoch": 1.71, + "grad_norm": 0.5792701244354248, + "learning_rate": 0.00023504619544324277, + "loss": 2.8177, + "step": 34860 + }, + { + "epoch": 1.71, + "grad_norm": 0.6614009141921997, + "learning_rate": 0.00023503116453900663, + "loss": 3.218, + "step": 34861 + }, + { + "epoch": 1.71, + "grad_norm": 0.6756806969642639, + "learning_rate": 0.00023501613380588575, + "loss": 3.1788, + "step": 34862 + }, + { + "epoch": 1.71, + "grad_norm": 0.6036829352378845, + "learning_rate": 0.00023500110324392004, + "loss": 3.0215, + "step": 34863 + }, + { + "epoch": 1.71, + "grad_norm": 0.6031563878059387, + "learning_rate": 0.0002349860728531488, + "loss": 3.146, + "step": 34864 + }, + { + "epoch": 1.71, + "grad_norm": 0.6657361388206482, + "learning_rate": 0.00023497104263361167, + "loss": 3.0158, + "step": 34865 + }, + { + "epoch": 1.71, + "grad_norm": 0.6230410933494568, + "learning_rate": 0.00023495601258534843, + "loss": 2.758, + "step": 34866 + }, + { + "epoch": 1.71, + "grad_norm": 0.598953127861023, + "learning_rate": 0.00023494098270839849, + "loss": 3.1044, + "step": 34867 + }, + { + "epoch": 1.71, + "grad_norm": 0.6193811893463135, + "learning_rate": 0.00023492595300280152, + "loss": 2.843, + "step": 34868 + }, + { + "epoch": 1.71, + "grad_norm": 0.6279089450836182, + "learning_rate": 0.00023491092346859693, + "loss": 3.1108, + "step": 34869 + }, + { + "epoch": 1.71, + "grad_norm": 0.6103789806365967, + "learning_rate": 0.00023489589410582455, + "loss": 3.1877, + "step": 34870 + }, + { + "epoch": 1.71, + "grad_norm": 0.6290521621704102, + "learning_rate": 0.0002348808649145239, + "loss": 3.0293, + "step": 34871 + }, + { + "epoch": 1.71, + "grad_norm": 0.5811523795127869, + "learning_rate": 0.00023486583589473438, + "loss": 3.0623, + "step": 34872 + }, + { + "epoch": 1.71, + "grad_norm": 0.6363834738731384, + "learning_rate": 0.00023485080704649586, + "loss": 2.9667, + "step": 34873 + }, + { + "epoch": 1.71, + "grad_norm": 0.6560509204864502, + "learning_rate": 0.0002348357783698477, + "loss": 2.975, + "step": 34874 + }, + { + "epoch": 1.71, + "grad_norm": 0.6376746892929077, + "learning_rate": 0.0002348207498648295, + "loss": 3.156, + "step": 34875 + }, + { + "epoch": 1.71, + "grad_norm": 0.6634820699691772, + "learning_rate": 0.00023480572153148103, + "loss": 2.9336, + "step": 34876 + }, + { + "epoch": 1.71, + "grad_norm": 0.6263517141342163, + "learning_rate": 0.00023479069336984157, + "loss": 3.0352, + "step": 34877 + }, + { + "epoch": 1.71, + "grad_norm": 0.6027346253395081, + "learning_rate": 0.00023477566537995106, + "loss": 2.884, + "step": 34878 + }, + { + "epoch": 1.71, + "grad_norm": 0.6649355292320251, + "learning_rate": 0.0002347606375618488, + "loss": 2.9949, + "step": 34879 + }, + { + "epoch": 1.71, + "grad_norm": 0.6449239253997803, + "learning_rate": 0.00023474560991557442, + "loss": 3.0944, + "step": 34880 + }, + { + "epoch": 1.71, + "grad_norm": 0.6204206943511963, + "learning_rate": 0.00023473058244116767, + "loss": 2.8473, + "step": 34881 + }, + { + "epoch": 1.71, + "grad_norm": 0.6074206829071045, + "learning_rate": 0.00023471555513866795, + "loss": 2.8826, + "step": 34882 + }, + { + "epoch": 1.71, + "grad_norm": 0.607448935508728, + "learning_rate": 0.00023470052800811495, + "loss": 2.9217, + "step": 34883 + }, + { + "epoch": 1.71, + "grad_norm": 0.6031710505485535, + "learning_rate": 0.00023468550104954804, + "loss": 2.9217, + "step": 34884 + }, + { + "epoch": 1.71, + "grad_norm": 0.5931732654571533, + "learning_rate": 0.00023467047426300704, + "loss": 2.8402, + "step": 34885 + }, + { + "epoch": 1.71, + "grad_norm": 0.6108601689338684, + "learning_rate": 0.00023465544764853148, + "loss": 3.0791, + "step": 34886 + }, + { + "epoch": 1.71, + "grad_norm": 0.6241343021392822, + "learning_rate": 0.0002346404212061608, + "loss": 3.0006, + "step": 34887 + }, + { + "epoch": 1.71, + "grad_norm": 0.6183168292045593, + "learning_rate": 0.00023462539493593478, + "loss": 3.1661, + "step": 34888 + }, + { + "epoch": 1.71, + "grad_norm": 0.6017194986343384, + "learning_rate": 0.00023461036883789282, + "loss": 3.0034, + "step": 34889 + }, + { + "epoch": 1.71, + "grad_norm": 0.659395694732666, + "learning_rate": 0.00023459534291207448, + "loss": 3.0346, + "step": 34890 + }, + { + "epoch": 1.71, + "grad_norm": 0.6340118646621704, + "learning_rate": 0.00023458031715851957, + "loss": 3.1279, + "step": 34891 + }, + { + "epoch": 1.71, + "grad_norm": 0.6464155912399292, + "learning_rate": 0.00023456529157726746, + "loss": 2.861, + "step": 34892 + }, + { + "epoch": 1.71, + "grad_norm": 0.6099604368209839, + "learning_rate": 0.00023455026616835784, + "loss": 3.066, + "step": 34893 + }, + { + "epoch": 1.71, + "grad_norm": 0.6244305968284607, + "learning_rate": 0.00023453524093183009, + "loss": 2.9749, + "step": 34894 + }, + { + "epoch": 1.71, + "grad_norm": 0.5913376212120056, + "learning_rate": 0.00023452021586772392, + "loss": 3.099, + "step": 34895 + }, + { + "epoch": 1.71, + "grad_norm": 0.6622759699821472, + "learning_rate": 0.000234505190976079, + "loss": 3.0457, + "step": 34896 + }, + { + "epoch": 1.71, + "grad_norm": 0.6322427988052368, + "learning_rate": 0.00023449016625693477, + "loss": 3.0076, + "step": 34897 + }, + { + "epoch": 1.71, + "grad_norm": 0.6200683116912842, + "learning_rate": 0.0002344751417103309, + "loss": 2.9269, + "step": 34898 + }, + { + "epoch": 1.71, + "grad_norm": 0.6351677775382996, + "learning_rate": 0.00023446011733630674, + "loss": 2.9341, + "step": 34899 + }, + { + "epoch": 1.71, + "grad_norm": 0.6051552891731262, + "learning_rate": 0.0002344450931349022, + "loss": 3.1266, + "step": 34900 + }, + { + "epoch": 1.71, + "grad_norm": 0.657870352268219, + "learning_rate": 0.00023443006910615652, + "loss": 3.2018, + "step": 34901 + }, + { + "epoch": 1.71, + "grad_norm": 0.6787803173065186, + "learning_rate": 0.00023441504525010943, + "loss": 3.0304, + "step": 34902 + }, + { + "epoch": 1.71, + "grad_norm": 0.6219592690467834, + "learning_rate": 0.0002344000215668006, + "loss": 3.2, + "step": 34903 + }, + { + "epoch": 1.71, + "grad_norm": 0.6439167857170105, + "learning_rate": 0.0002343849980562694, + "loss": 3.0774, + "step": 34904 + }, + { + "epoch": 1.71, + "grad_norm": 0.6104590892791748, + "learning_rate": 0.00023436997471855558, + "loss": 3.0582, + "step": 34905 + }, + { + "epoch": 1.71, + "grad_norm": 0.6476345062255859, + "learning_rate": 0.00023435495155369848, + "loss": 3.014, + "step": 34906 + }, + { + "epoch": 1.71, + "grad_norm": 0.6276230812072754, + "learning_rate": 0.00023433992856173786, + "loss": 3.0314, + "step": 34907 + }, + { + "epoch": 1.71, + "grad_norm": 0.6397703886032104, + "learning_rate": 0.00023432490574271328, + "loss": 2.9595, + "step": 34908 + }, + { + "epoch": 1.71, + "grad_norm": 0.6316772699356079, + "learning_rate": 0.00023430988309666414, + "loss": 2.9005, + "step": 34909 + }, + { + "epoch": 1.71, + "grad_norm": 0.6077426671981812, + "learning_rate": 0.00023429486062363027, + "loss": 2.9963, + "step": 34910 + }, + { + "epoch": 1.71, + "grad_norm": 0.6244196891784668, + "learning_rate": 0.00023427983832365106, + "loss": 3.0356, + "step": 34911 + }, + { + "epoch": 1.71, + "grad_norm": 0.6547126770019531, + "learning_rate": 0.00023426481619676597, + "loss": 2.9158, + "step": 34912 + }, + { + "epoch": 1.71, + "grad_norm": 0.5975217223167419, + "learning_rate": 0.00023424979424301488, + "loss": 3.005, + "step": 34913 + }, + { + "epoch": 1.71, + "grad_norm": 0.6610994935035706, + "learning_rate": 0.00023423477246243708, + "loss": 2.8519, + "step": 34914 + }, + { + "epoch": 1.71, + "grad_norm": 0.6371542811393738, + "learning_rate": 0.00023421975085507232, + "loss": 2.9664, + "step": 34915 + }, + { + "epoch": 1.71, + "grad_norm": 0.6088597178459167, + "learning_rate": 0.00023420472942095995, + "loss": 2.9404, + "step": 34916 + }, + { + "epoch": 1.71, + "grad_norm": 0.654114305973053, + "learning_rate": 0.00023418970816013965, + "loss": 2.9306, + "step": 34917 + }, + { + "epoch": 1.71, + "grad_norm": 0.637015700340271, + "learning_rate": 0.0002341746870726511, + "loss": 2.8428, + "step": 34918 + }, + { + "epoch": 1.71, + "grad_norm": 0.6335122585296631, + "learning_rate": 0.00023415966615853372, + "loss": 2.9795, + "step": 34919 + }, + { + "epoch": 1.71, + "grad_norm": 0.6221902966499329, + "learning_rate": 0.0002341446454178272, + "loss": 3.0404, + "step": 34920 + }, + { + "epoch": 1.71, + "grad_norm": 0.6287366151809692, + "learning_rate": 0.00023412962485057086, + "loss": 3.1862, + "step": 34921 + }, + { + "epoch": 1.71, + "grad_norm": 0.6433752775192261, + "learning_rate": 0.00023411460445680444, + "loss": 3.119, + "step": 34922 + }, + { + "epoch": 1.71, + "grad_norm": 0.6408416032791138, + "learning_rate": 0.00023409958423656755, + "loss": 3.0431, + "step": 34923 + }, + { + "epoch": 1.71, + "grad_norm": 0.6352258920669556, + "learning_rate": 0.00023408456418989957, + "loss": 2.9922, + "step": 34924 + }, + { + "epoch": 1.71, + "grad_norm": 0.6453670263290405, + "learning_rate": 0.00023406954431684027, + "loss": 3.1621, + "step": 34925 + }, + { + "epoch": 1.71, + "grad_norm": 0.5970926284790039, + "learning_rate": 0.00023405452461742904, + "loss": 2.9972, + "step": 34926 + }, + { + "epoch": 1.71, + "grad_norm": 0.6512788534164429, + "learning_rate": 0.00023403950509170543, + "loss": 3.1177, + "step": 34927 + }, + { + "epoch": 1.71, + "grad_norm": 0.6225276589393616, + "learning_rate": 0.00023402448573970925, + "loss": 2.9748, + "step": 34928 + }, + { + "epoch": 1.71, + "grad_norm": 0.6059454083442688, + "learning_rate": 0.00023400946656147977, + "loss": 3.0446, + "step": 34929 + }, + { + "epoch": 1.71, + "grad_norm": 0.6157578825950623, + "learning_rate": 0.0002339944475570567, + "loss": 3.1555, + "step": 34930 + }, + { + "epoch": 1.71, + "grad_norm": 0.6617079973220825, + "learning_rate": 0.00023397942872647944, + "loss": 2.9416, + "step": 34931 + }, + { + "epoch": 1.71, + "grad_norm": 0.6348146200180054, + "learning_rate": 0.00023396441006978775, + "loss": 3.1802, + "step": 34932 + }, + { + "epoch": 1.71, + "grad_norm": 0.6032415628433228, + "learning_rate": 0.00023394939158702112, + "loss": 2.9522, + "step": 34933 + }, + { + "epoch": 1.71, + "grad_norm": 0.6263419985771179, + "learning_rate": 0.00023393437327821895, + "loss": 2.8259, + "step": 34934 + }, + { + "epoch": 1.71, + "grad_norm": 0.6232619285583496, + "learning_rate": 0.0002339193551434211, + "loss": 2.8977, + "step": 34935 + }, + { + "epoch": 1.71, + "grad_norm": 0.6191720962524414, + "learning_rate": 0.00023390433718266685, + "loss": 2.9033, + "step": 34936 + }, + { + "epoch": 1.71, + "grad_norm": 0.7007677555084229, + "learning_rate": 0.0002338893193959958, + "loss": 3.1077, + "step": 34937 + }, + { + "epoch": 1.71, + "grad_norm": 0.6107507944107056, + "learning_rate": 0.0002338743017834477, + "loss": 3.1321, + "step": 34938 + }, + { + "epoch": 1.71, + "grad_norm": 0.6792271137237549, + "learning_rate": 0.00023385928434506182, + "loss": 2.9655, + "step": 34939 + }, + { + "epoch": 1.71, + "grad_norm": 0.645459771156311, + "learning_rate": 0.00023384426708087798, + "loss": 3.0854, + "step": 34940 + }, + { + "epoch": 1.71, + "grad_norm": 0.6554004549980164, + "learning_rate": 0.00023382924999093555, + "loss": 2.7691, + "step": 34941 + }, + { + "epoch": 1.71, + "grad_norm": 0.6682091355323792, + "learning_rate": 0.00023381423307527408, + "loss": 2.9499, + "step": 34942 + }, + { + "epoch": 1.71, + "grad_norm": 0.653188169002533, + "learning_rate": 0.0002337992163339333, + "loss": 3.0563, + "step": 34943 + }, + { + "epoch": 1.71, + "grad_norm": 0.6153813600540161, + "learning_rate": 0.0002337841997669526, + "loss": 2.972, + "step": 34944 + }, + { + "epoch": 1.71, + "grad_norm": 0.6357757449150085, + "learning_rate": 0.00023376918337437164, + "loss": 2.9628, + "step": 34945 + }, + { + "epoch": 1.71, + "grad_norm": 0.6298278570175171, + "learning_rate": 0.00023375416715622977, + "loss": 2.7598, + "step": 34946 + }, + { + "epoch": 1.71, + "grad_norm": 0.581403911113739, + "learning_rate": 0.00023373915111256675, + "loss": 2.8313, + "step": 34947 + }, + { + "epoch": 1.71, + "grad_norm": 0.6330929398536682, + "learning_rate": 0.00023372413524342208, + "loss": 2.9305, + "step": 34948 + }, + { + "epoch": 1.71, + "grad_norm": 0.6617899537086487, + "learning_rate": 0.00023370911954883518, + "loss": 2.8998, + "step": 34949 + }, + { + "epoch": 1.71, + "grad_norm": 0.6223428249359131, + "learning_rate": 0.00023369410402884582, + "loss": 3.0054, + "step": 34950 + }, + { + "epoch": 1.71, + "grad_norm": 0.6279831528663635, + "learning_rate": 0.00023367908868349337, + "loss": 3.0671, + "step": 34951 + }, + { + "epoch": 1.71, + "grad_norm": 0.5640294551849365, + "learning_rate": 0.00023366407351281735, + "loss": 3.0893, + "step": 34952 + }, + { + "epoch": 1.71, + "grad_norm": 0.5992966294288635, + "learning_rate": 0.0002336490585168576, + "loss": 3.0811, + "step": 34953 + }, + { + "epoch": 1.71, + "grad_norm": 0.6405746340751648, + "learning_rate": 0.00023363404369565328, + "loss": 3.0204, + "step": 34954 + }, + { + "epoch": 1.71, + "grad_norm": 0.651674747467041, + "learning_rate": 0.00023361902904924428, + "loss": 2.7941, + "step": 34955 + }, + { + "epoch": 1.71, + "grad_norm": 0.6315301656723022, + "learning_rate": 0.0002336040145776698, + "loss": 2.9018, + "step": 34956 + }, + { + "epoch": 1.71, + "grad_norm": 0.6295512914657593, + "learning_rate": 0.00023358900028096959, + "loss": 3.013, + "step": 34957 + }, + { + "epoch": 1.71, + "grad_norm": 0.5874469876289368, + "learning_rate": 0.0002335739861591833, + "loss": 3.203, + "step": 34958 + }, + { + "epoch": 1.71, + "grad_norm": 0.6032001376152039, + "learning_rate": 0.00023355897221235026, + "loss": 3.0261, + "step": 34959 + }, + { + "epoch": 1.71, + "grad_norm": 0.6361725330352783, + "learning_rate": 0.00023354395844051018, + "loss": 3.0568, + "step": 34960 + }, + { + "epoch": 1.71, + "grad_norm": 0.6287243962287903, + "learning_rate": 0.0002335289448437024, + "loss": 3.1561, + "step": 34961 + }, + { + "epoch": 1.71, + "grad_norm": 0.6232472062110901, + "learning_rate": 0.00023351393142196665, + "loss": 3.0336, + "step": 34962 + }, + { + "epoch": 1.71, + "grad_norm": 0.6193660497665405, + "learning_rate": 0.00023349891817534246, + "loss": 2.7832, + "step": 34963 + }, + { + "epoch": 1.71, + "grad_norm": 0.6325574517250061, + "learning_rate": 0.00023348390510386917, + "loss": 2.9592, + "step": 34964 + }, + { + "epoch": 1.71, + "grad_norm": 0.6106416583061218, + "learning_rate": 0.00023346889220758667, + "loss": 3.0528, + "step": 34965 + }, + { + "epoch": 1.71, + "grad_norm": 0.6092174053192139, + "learning_rate": 0.0002334538794865342, + "loss": 2.9849, + "step": 34966 + }, + { + "epoch": 1.71, + "grad_norm": 0.6205219030380249, + "learning_rate": 0.00023343886694075135, + "loss": 2.9281, + "step": 34967 + }, + { + "epoch": 1.71, + "grad_norm": 0.6163828372955322, + "learning_rate": 0.00023342385457027781, + "loss": 2.7753, + "step": 34968 + }, + { + "epoch": 1.71, + "grad_norm": 0.6253390312194824, + "learning_rate": 0.000233408842375153, + "loss": 2.8628, + "step": 34969 + }, + { + "epoch": 1.71, + "grad_norm": 0.650109589099884, + "learning_rate": 0.00023339383035541654, + "loss": 2.9486, + "step": 34970 + }, + { + "epoch": 1.71, + "grad_norm": 0.6107782125473022, + "learning_rate": 0.00023337881851110776, + "loss": 2.8765, + "step": 34971 + }, + { + "epoch": 1.71, + "grad_norm": 0.6146960854530334, + "learning_rate": 0.00023336380684226644, + "loss": 3.0767, + "step": 34972 + }, + { + "epoch": 1.71, + "grad_norm": 0.6434600949287415, + "learning_rate": 0.0002333487953489321, + "loss": 3.0933, + "step": 34973 + }, + { + "epoch": 1.71, + "grad_norm": 0.5741561055183411, + "learning_rate": 0.00023333378403114405, + "loss": 3.2363, + "step": 34974 + }, + { + "epoch": 1.71, + "grad_norm": 0.6086795330047607, + "learning_rate": 0.00023331877288894213, + "loss": 2.7989, + "step": 34975 + }, + { + "epoch": 1.71, + "grad_norm": 0.5960971713066101, + "learning_rate": 0.00023330376192236564, + "loss": 3.1249, + "step": 34976 + }, + { + "epoch": 1.71, + "grad_norm": 0.6542201638221741, + "learning_rate": 0.00023328875113145432, + "loss": 2.8669, + "step": 34977 + }, + { + "epoch": 1.71, + "grad_norm": 0.6497222185134888, + "learning_rate": 0.0002332737405162474, + "loss": 3.1174, + "step": 34978 + }, + { + "epoch": 1.71, + "grad_norm": 0.6587827205657959, + "learning_rate": 0.00023325873007678464, + "loss": 2.9742, + "step": 34979 + }, + { + "epoch": 1.71, + "grad_norm": 0.6213930249214172, + "learning_rate": 0.0002332437198131057, + "loss": 3.1319, + "step": 34980 + }, + { + "epoch": 1.71, + "grad_norm": 0.6171470880508423, + "learning_rate": 0.00023322870972524983, + "loss": 2.9525, + "step": 34981 + }, + { + "epoch": 1.71, + "grad_norm": 0.6292840242385864, + "learning_rate": 0.00023321369981325676, + "loss": 3.0539, + "step": 34982 + }, + { + "epoch": 1.71, + "grad_norm": 0.6087939143180847, + "learning_rate": 0.00023319869007716587, + "loss": 3.0538, + "step": 34983 + }, + { + "epoch": 1.71, + "grad_norm": 0.6223199367523193, + "learning_rate": 0.0002331836805170168, + "loss": 2.8537, + "step": 34984 + }, + { + "epoch": 1.71, + "grad_norm": 0.6605082154273987, + "learning_rate": 0.00023316867113284913, + "loss": 3.0738, + "step": 34985 + }, + { + "epoch": 1.71, + "grad_norm": 0.6185541749000549, + "learning_rate": 0.0002331536619247022, + "loss": 2.9984, + "step": 34986 + }, + { + "epoch": 1.71, + "grad_norm": 0.631881594657898, + "learning_rate": 0.00023313865289261576, + "loss": 2.9922, + "step": 34987 + }, + { + "epoch": 1.71, + "grad_norm": 0.618061363697052, + "learning_rate": 0.0002331236440366292, + "loss": 2.996, + "step": 34988 + }, + { + "epoch": 1.71, + "grad_norm": 0.609032928943634, + "learning_rate": 0.00023310863535678202, + "loss": 2.6868, + "step": 34989 + }, + { + "epoch": 1.71, + "grad_norm": 0.6071302890777588, + "learning_rate": 0.00023309362685311396, + "loss": 3.0175, + "step": 34990 + }, + { + "epoch": 1.71, + "grad_norm": 0.6098959445953369, + "learning_rate": 0.00023307861852566434, + "loss": 2.9696, + "step": 34991 + }, + { + "epoch": 1.71, + "grad_norm": 0.638105034828186, + "learning_rate": 0.00023306361037447282, + "loss": 2.9288, + "step": 34992 + }, + { + "epoch": 1.71, + "grad_norm": 0.6155096292495728, + "learning_rate": 0.00023304860239957873, + "loss": 3.1519, + "step": 34993 + }, + { + "epoch": 1.71, + "grad_norm": 0.628525972366333, + "learning_rate": 0.0002330335946010218, + "loss": 2.9343, + "step": 34994 + }, + { + "epoch": 1.72, + "grad_norm": 0.7091739177703857, + "learning_rate": 0.00023301858697884157, + "loss": 3.1016, + "step": 34995 + }, + { + "epoch": 1.72, + "grad_norm": 0.62142014503479, + "learning_rate": 0.0002330035795330774, + "loss": 2.8768, + "step": 34996 + }, + { + "epoch": 1.72, + "grad_norm": 0.6097868084907532, + "learning_rate": 0.00023298857226376895, + "loss": 3.0771, + "step": 34997 + }, + { + "epoch": 1.72, + "grad_norm": 0.6286457777023315, + "learning_rate": 0.00023297356517095562, + "loss": 3.0466, + "step": 34998 + }, + { + "epoch": 1.72, + "grad_norm": 0.6783491373062134, + "learning_rate": 0.00023295855825467712, + "loss": 3.2109, + "step": 34999 + }, + { + "epoch": 1.72, + "grad_norm": 0.7100011110305786, + "learning_rate": 0.00023294355151497287, + "loss": 2.953, + "step": 35000 + }, + { + "epoch": 1.72, + "grad_norm": 0.5850964188575745, + "learning_rate": 0.00023292854495188232, + "loss": 2.9883, + "step": 35001 + }, + { + "epoch": 1.72, + "grad_norm": 0.6173315048217773, + "learning_rate": 0.0002329135385654452, + "loss": 3.0854, + "step": 35002 + }, + { + "epoch": 1.72, + "grad_norm": 0.6593319773674011, + "learning_rate": 0.00023289853235570083, + "loss": 2.7224, + "step": 35003 + }, + { + "epoch": 1.72, + "grad_norm": 0.6051262617111206, + "learning_rate": 0.00023288352632268877, + "loss": 3.0075, + "step": 35004 + }, + { + "epoch": 1.72, + "grad_norm": 0.6009809970855713, + "learning_rate": 0.00023286852046644867, + "loss": 2.7894, + "step": 35005 + }, + { + "epoch": 1.72, + "grad_norm": 0.5898076295852661, + "learning_rate": 0.00023285351478701994, + "loss": 3.0432, + "step": 35006 + }, + { + "epoch": 1.72, + "grad_norm": 0.6174237728118896, + "learning_rate": 0.0002328385092844422, + "loss": 2.9945, + "step": 35007 + }, + { + "epoch": 1.72, + "grad_norm": 0.6520997881889343, + "learning_rate": 0.00023282350395875472, + "loss": 2.9187, + "step": 35008 + }, + { + "epoch": 1.72, + "grad_norm": 0.6250781416893005, + "learning_rate": 0.0002328084988099973, + "loss": 3.1153, + "step": 35009 + }, + { + "epoch": 1.72, + "grad_norm": 0.6104256510734558, + "learning_rate": 0.00023279349383820944, + "loss": 3.1432, + "step": 35010 + }, + { + "epoch": 1.72, + "grad_norm": 0.6256521344184875, + "learning_rate": 0.00023277848904343045, + "loss": 2.8728, + "step": 35011 + }, + { + "epoch": 1.72, + "grad_norm": 0.5990914702415466, + "learning_rate": 0.0002327634844257001, + "loss": 3.0632, + "step": 35012 + }, + { + "epoch": 1.72, + "grad_norm": 0.6210713982582092, + "learning_rate": 0.00023274847998505774, + "loss": 2.9762, + "step": 35013 + }, + { + "epoch": 1.72, + "grad_norm": 0.6181674003601074, + "learning_rate": 0.00023273347572154288, + "loss": 3.0125, + "step": 35014 + }, + { + "epoch": 1.72, + "grad_norm": 0.8895490169525146, + "learning_rate": 0.0002327184716351952, + "loss": 2.9884, + "step": 35015 + }, + { + "epoch": 1.72, + "grad_norm": 0.6275051236152649, + "learning_rate": 0.00023270346772605408, + "loss": 2.9187, + "step": 35016 + }, + { + "epoch": 1.72, + "grad_norm": 0.5979534387588501, + "learning_rate": 0.0002326884639941591, + "loss": 2.9628, + "step": 35017 + }, + { + "epoch": 1.72, + "grad_norm": 0.6114262938499451, + "learning_rate": 0.0002326734604395497, + "loss": 2.9154, + "step": 35018 + }, + { + "epoch": 1.72, + "grad_norm": 0.6187745332717896, + "learning_rate": 0.00023265845706226535, + "loss": 2.9844, + "step": 35019 + }, + { + "epoch": 1.72, + "grad_norm": 0.5852953791618347, + "learning_rate": 0.00023264345386234586, + "loss": 3.0157, + "step": 35020 + }, + { + "epoch": 1.72, + "grad_norm": 0.5987752676010132, + "learning_rate": 0.00023262845083983047, + "loss": 3.0215, + "step": 35021 + }, + { + "epoch": 1.72, + "grad_norm": 0.6052468419075012, + "learning_rate": 0.00023261344799475886, + "loss": 2.8387, + "step": 35022 + }, + { + "epoch": 1.72, + "grad_norm": 0.653300940990448, + "learning_rate": 0.00023259844532717028, + "loss": 3.0387, + "step": 35023 + }, + { + "epoch": 1.72, + "grad_norm": 0.645679235458374, + "learning_rate": 0.00023258344283710452, + "loss": 2.861, + "step": 35024 + }, + { + "epoch": 1.72, + "grad_norm": 0.6186267137527466, + "learning_rate": 0.00023256844052460106, + "loss": 2.9967, + "step": 35025 + }, + { + "epoch": 1.72, + "grad_norm": 0.6053884029388428, + "learning_rate": 0.0002325534383896992, + "loss": 3.0525, + "step": 35026 + }, + { + "epoch": 1.72, + "grad_norm": 0.6017190217971802, + "learning_rate": 0.00023253843643243876, + "loss": 2.8209, + "step": 35027 + }, + { + "epoch": 1.72, + "grad_norm": 0.6449417471885681, + "learning_rate": 0.000232523434652859, + "loss": 3.0937, + "step": 35028 + }, + { + "epoch": 1.72, + "grad_norm": 0.6535047888755798, + "learning_rate": 0.0002325084330509995, + "loss": 3.1532, + "step": 35029 + }, + { + "epoch": 1.72, + "grad_norm": 0.6600343585014343, + "learning_rate": 0.0002324934316268999, + "loss": 3.0011, + "step": 35030 + }, + { + "epoch": 1.72, + "grad_norm": 0.6052358150482178, + "learning_rate": 0.00023247843038059956, + "loss": 3.2281, + "step": 35031 + }, + { + "epoch": 1.72, + "grad_norm": 0.6027965545654297, + "learning_rate": 0.00023246342931213809, + "loss": 2.9464, + "step": 35032 + }, + { + "epoch": 1.72, + "grad_norm": 0.6339533925056458, + "learning_rate": 0.00023244842842155483, + "loss": 3.0856, + "step": 35033 + }, + { + "epoch": 1.72, + "grad_norm": 0.6081569194793701, + "learning_rate": 0.00023243342770888948, + "loss": 2.8734, + "step": 35034 + }, + { + "epoch": 1.72, + "grad_norm": 0.6404123902320862, + "learning_rate": 0.00023241842717418152, + "loss": 3.113, + "step": 35035 + }, + { + "epoch": 1.72, + "grad_norm": 0.6517124176025391, + "learning_rate": 0.00023240342681747031, + "loss": 3.0599, + "step": 35036 + }, + { + "epoch": 1.72, + "grad_norm": 0.6136491894721985, + "learning_rate": 0.00023238842663879557, + "loss": 2.9569, + "step": 35037 + }, + { + "epoch": 1.72, + "grad_norm": 0.585811972618103, + "learning_rate": 0.00023237342663819666, + "loss": 2.9896, + "step": 35038 + }, + { + "epoch": 1.72, + "grad_norm": 0.6353388428688049, + "learning_rate": 0.00023235842681571306, + "loss": 2.8659, + "step": 35039 + }, + { + "epoch": 1.72, + "grad_norm": 0.625332236289978, + "learning_rate": 0.00023234342717138448, + "loss": 3.2114, + "step": 35040 + }, + { + "epoch": 1.72, + "grad_norm": 0.6419073939323425, + "learning_rate": 0.00023232842770525014, + "loss": 2.8107, + "step": 35041 + }, + { + "epoch": 1.72, + "grad_norm": 0.6179407835006714, + "learning_rate": 0.00023231342841734985, + "loss": 2.9328, + "step": 35042 + }, + { + "epoch": 1.72, + "grad_norm": 0.6118654608726501, + "learning_rate": 0.0002322984293077229, + "loss": 2.9489, + "step": 35043 + }, + { + "epoch": 1.72, + "grad_norm": 0.6000199913978577, + "learning_rate": 0.00023228343037640879, + "loss": 3.0466, + "step": 35044 + }, + { + "epoch": 1.72, + "grad_norm": 0.6850727796554565, + "learning_rate": 0.0002322684316234472, + "loss": 3.1185, + "step": 35045 + }, + { + "epoch": 1.72, + "grad_norm": 0.6070858240127563, + "learning_rate": 0.0002322534330488775, + "loss": 3.0039, + "step": 35046 + }, + { + "epoch": 1.72, + "grad_norm": 0.7080966830253601, + "learning_rate": 0.00023223843465273924, + "loss": 3.0643, + "step": 35047 + }, + { + "epoch": 1.72, + "grad_norm": 0.6055739521980286, + "learning_rate": 0.00023222343643507177, + "loss": 2.9921, + "step": 35048 + }, + { + "epoch": 1.72, + "grad_norm": 0.6497558355331421, + "learning_rate": 0.00023220843839591484, + "loss": 2.9583, + "step": 35049 + }, + { + "epoch": 1.72, + "grad_norm": 0.6094196438789368, + "learning_rate": 0.00023219344053530786, + "loss": 2.7776, + "step": 35050 + }, + { + "epoch": 1.72, + "grad_norm": 0.6469390988349915, + "learning_rate": 0.0002321784428532902, + "loss": 3.0497, + "step": 35051 + }, + { + "epoch": 1.72, + "grad_norm": 0.6989441514015198, + "learning_rate": 0.0002321634453499016, + "loss": 3.1574, + "step": 35052 + }, + { + "epoch": 1.72, + "grad_norm": 0.6314229965209961, + "learning_rate": 0.00023214844802518135, + "loss": 2.9443, + "step": 35053 + }, + { + "epoch": 1.72, + "grad_norm": 0.6381677389144897, + "learning_rate": 0.0002321334508791691, + "loss": 2.9694, + "step": 35054 + }, + { + "epoch": 1.72, + "grad_norm": 0.6836425065994263, + "learning_rate": 0.00023211845391190415, + "loss": 2.8559, + "step": 35055 + }, + { + "epoch": 1.72, + "grad_norm": 0.6240857243537903, + "learning_rate": 0.00023210345712342619, + "loss": 2.7767, + "step": 35056 + }, + { + "epoch": 1.72, + "grad_norm": 0.6270095705986023, + "learning_rate": 0.00023208846051377474, + "loss": 2.999, + "step": 35057 + }, + { + "epoch": 1.72, + "grad_norm": 0.6575677990913391, + "learning_rate": 0.00023207346408298906, + "loss": 2.943, + "step": 35058 + }, + { + "epoch": 1.72, + "grad_norm": 0.6204233765602112, + "learning_rate": 0.00023205846783110894, + "loss": 3.0342, + "step": 35059 + }, + { + "epoch": 1.72, + "grad_norm": 0.6046788692474365, + "learning_rate": 0.00023204347175817362, + "loss": 2.9394, + "step": 35060 + }, + { + "epoch": 1.72, + "grad_norm": 0.6025890707969666, + "learning_rate": 0.0002320284758642228, + "loss": 2.9541, + "step": 35061 + }, + { + "epoch": 1.72, + "grad_norm": 0.5898043513298035, + "learning_rate": 0.0002320134801492959, + "loss": 3.1216, + "step": 35062 + }, + { + "epoch": 1.72, + "grad_norm": 0.6248551607131958, + "learning_rate": 0.0002319984846134323, + "loss": 2.9211, + "step": 35063 + }, + { + "epoch": 1.72, + "grad_norm": 0.677790641784668, + "learning_rate": 0.00023198348925667177, + "loss": 2.8475, + "step": 35064 + }, + { + "epoch": 1.72, + "grad_norm": 0.6180427074432373, + "learning_rate": 0.00023196849407905353, + "loss": 3.1241, + "step": 35065 + }, + { + "epoch": 1.72, + "grad_norm": 0.6461980938911438, + "learning_rate": 0.00023195349908061716, + "loss": 3.1005, + "step": 35066 + }, + { + "epoch": 1.72, + "grad_norm": 0.6379979848861694, + "learning_rate": 0.0002319385042614023, + "loss": 2.9287, + "step": 35067 + }, + { + "epoch": 1.72, + "grad_norm": 0.631192684173584, + "learning_rate": 0.0002319235096214482, + "loss": 3.0211, + "step": 35068 + }, + { + "epoch": 1.72, + "grad_norm": 0.6260473728179932, + "learning_rate": 0.00023190851516079463, + "loss": 2.9484, + "step": 35069 + }, + { + "epoch": 1.72, + "grad_norm": 0.6406779885292053, + "learning_rate": 0.0002318935208794807, + "loss": 2.9191, + "step": 35070 + }, + { + "epoch": 1.72, + "grad_norm": 0.6329416036605835, + "learning_rate": 0.00023187852677754626, + "loss": 2.8363, + "step": 35071 + }, + { + "epoch": 1.72, + "grad_norm": 0.6531370878219604, + "learning_rate": 0.00023186353285503076, + "loss": 2.9056, + "step": 35072 + }, + { + "epoch": 1.72, + "grad_norm": 0.6264121532440186, + "learning_rate": 0.0002318485391119734, + "loss": 3.0358, + "step": 35073 + }, + { + "epoch": 1.72, + "grad_norm": 0.6566066145896912, + "learning_rate": 0.00023183354554841403, + "loss": 2.8215, + "step": 35074 + }, + { + "epoch": 1.72, + "grad_norm": 0.652664840221405, + "learning_rate": 0.00023181855216439194, + "loss": 3.1829, + "step": 35075 + }, + { + "epoch": 1.72, + "grad_norm": 0.6445363163948059, + "learning_rate": 0.00023180355895994654, + "loss": 3.0111, + "step": 35076 + }, + { + "epoch": 1.72, + "grad_norm": 0.5922958254814148, + "learning_rate": 0.00023178856593511761, + "loss": 3.0851, + "step": 35077 + }, + { + "epoch": 1.72, + "grad_norm": 0.6356951594352722, + "learning_rate": 0.00023177357308994443, + "loss": 3.0073, + "step": 35078 + }, + { + "epoch": 1.72, + "grad_norm": 0.6576932072639465, + "learning_rate": 0.00023175858042446653, + "loss": 3.1066, + "step": 35079 + }, + { + "epoch": 1.72, + "grad_norm": 0.6770666837692261, + "learning_rate": 0.0002317435879387233, + "loss": 2.8589, + "step": 35080 + }, + { + "epoch": 1.72, + "grad_norm": 0.6689457297325134, + "learning_rate": 0.00023172859563275436, + "loss": 2.9456, + "step": 35081 + }, + { + "epoch": 1.72, + "grad_norm": 0.6123393774032593, + "learning_rate": 0.00023171360350659922, + "loss": 3.3388, + "step": 35082 + }, + { + "epoch": 1.72, + "grad_norm": 0.6080477833747864, + "learning_rate": 0.0002316986115602973, + "loss": 2.8955, + "step": 35083 + }, + { + "epoch": 1.72, + "grad_norm": 0.5717896819114685, + "learning_rate": 0.0002316836197938881, + "loss": 2.9882, + "step": 35084 + }, + { + "epoch": 1.72, + "grad_norm": 0.5806421637535095, + "learning_rate": 0.000231668628207411, + "loss": 3.0781, + "step": 35085 + }, + { + "epoch": 1.72, + "grad_norm": 0.6210070252418518, + "learning_rate": 0.00023165363680090567, + "loss": 3.1186, + "step": 35086 + }, + { + "epoch": 1.72, + "grad_norm": 0.6110129356384277, + "learning_rate": 0.00023163864557441152, + "loss": 3.156, + "step": 35087 + }, + { + "epoch": 1.72, + "grad_norm": 0.6130040884017944, + "learning_rate": 0.00023162365452796793, + "loss": 2.936, + "step": 35088 + }, + { + "epoch": 1.72, + "grad_norm": 0.6246054768562317, + "learning_rate": 0.0002316086636616146, + "loss": 2.9497, + "step": 35089 + }, + { + "epoch": 1.72, + "grad_norm": 0.6145926117897034, + "learning_rate": 0.00023159367297539082, + "loss": 3.0549, + "step": 35090 + }, + { + "epoch": 1.72, + "grad_norm": 0.5754942893981934, + "learning_rate": 0.0002315786824693361, + "loss": 3.034, + "step": 35091 + }, + { + "epoch": 1.72, + "grad_norm": 0.639498770236969, + "learning_rate": 0.00023156369214349004, + "loss": 3.0611, + "step": 35092 + }, + { + "epoch": 1.72, + "grad_norm": 0.6060043573379517, + "learning_rate": 0.00023154870199789199, + "loss": 3.1878, + "step": 35093 + }, + { + "epoch": 1.72, + "grad_norm": 0.6273714900016785, + "learning_rate": 0.00023153371203258157, + "loss": 3.0544, + "step": 35094 + }, + { + "epoch": 1.72, + "grad_norm": 0.6444366574287415, + "learning_rate": 0.00023151872224759805, + "loss": 3.2422, + "step": 35095 + }, + { + "epoch": 1.72, + "grad_norm": 0.6600435376167297, + "learning_rate": 0.0002315037326429811, + "loss": 3.1418, + "step": 35096 + }, + { + "epoch": 1.72, + "grad_norm": 0.605183482170105, + "learning_rate": 0.00023148874321877018, + "loss": 2.9719, + "step": 35097 + }, + { + "epoch": 1.72, + "grad_norm": 0.6069551110267639, + "learning_rate": 0.0002314737539750046, + "loss": 2.6923, + "step": 35098 + }, + { + "epoch": 1.72, + "grad_norm": 0.5764786005020142, + "learning_rate": 0.0002314587649117241, + "loss": 2.9678, + "step": 35099 + }, + { + "epoch": 1.72, + "grad_norm": 0.6361105442047119, + "learning_rate": 0.000231443776028968, + "loss": 3.2081, + "step": 35100 + }, + { + "epoch": 1.72, + "grad_norm": 0.5919163227081299, + "learning_rate": 0.00023142878732677566, + "loss": 3.0231, + "step": 35101 + }, + { + "epoch": 1.72, + "grad_norm": 0.6047539114952087, + "learning_rate": 0.00023141379880518685, + "loss": 3.0358, + "step": 35102 + }, + { + "epoch": 1.72, + "grad_norm": 0.6241350173950195, + "learning_rate": 0.00023139881046424079, + "loss": 2.9605, + "step": 35103 + }, + { + "epoch": 1.72, + "grad_norm": 0.6277053356170654, + "learning_rate": 0.00023138382230397718, + "loss": 3.0339, + "step": 35104 + }, + { + "epoch": 1.72, + "grad_norm": 0.6247433423995972, + "learning_rate": 0.0002313688343244353, + "loss": 2.979, + "step": 35105 + }, + { + "epoch": 1.72, + "grad_norm": 0.5854073762893677, + "learning_rate": 0.00023135384652565465, + "loss": 2.9685, + "step": 35106 + }, + { + "epoch": 1.72, + "grad_norm": 0.6047874093055725, + "learning_rate": 0.0002313388589076749, + "loss": 3.2763, + "step": 35107 + }, + { + "epoch": 1.72, + "grad_norm": 0.5953376293182373, + "learning_rate": 0.00023132387147053527, + "loss": 2.8974, + "step": 35108 + }, + { + "epoch": 1.72, + "grad_norm": 0.5756540894508362, + "learning_rate": 0.00023130888421427545, + "loss": 3.0288, + "step": 35109 + }, + { + "epoch": 1.72, + "grad_norm": 0.6189519166946411, + "learning_rate": 0.00023129389713893466, + "loss": 2.7638, + "step": 35110 + }, + { + "epoch": 1.72, + "grad_norm": 0.5780520439147949, + "learning_rate": 0.0002312789102445526, + "loss": 3.0979, + "step": 35111 + }, + { + "epoch": 1.72, + "grad_norm": 0.6746885180473328, + "learning_rate": 0.00023126392353116872, + "loss": 3.0076, + "step": 35112 + }, + { + "epoch": 1.72, + "grad_norm": 0.6331716179847717, + "learning_rate": 0.00023124893699882231, + "loss": 3.2743, + "step": 35113 + }, + { + "epoch": 1.72, + "grad_norm": 0.6570529937744141, + "learning_rate": 0.00023123395064755312, + "loss": 3.0482, + "step": 35114 + }, + { + "epoch": 1.72, + "grad_norm": 0.6318064332008362, + "learning_rate": 0.00023121896447740036, + "loss": 3.0843, + "step": 35115 + }, + { + "epoch": 1.72, + "grad_norm": 0.6018270254135132, + "learning_rate": 0.00023120397848840359, + "loss": 2.9797, + "step": 35116 + }, + { + "epoch": 1.72, + "grad_norm": 0.5887584090232849, + "learning_rate": 0.00023118899268060241, + "loss": 3.1461, + "step": 35117 + }, + { + "epoch": 1.72, + "grad_norm": 0.6415103077888489, + "learning_rate": 0.00023117400705403613, + "loss": 3.0801, + "step": 35118 + }, + { + "epoch": 1.72, + "grad_norm": 0.6064682006835938, + "learning_rate": 0.0002311590216087443, + "loss": 2.779, + "step": 35119 + }, + { + "epoch": 1.72, + "grad_norm": 0.6046452522277832, + "learning_rate": 0.00023114403634476625, + "loss": 2.99, + "step": 35120 + }, + { + "epoch": 1.72, + "grad_norm": 0.6241592764854431, + "learning_rate": 0.00023112905126214156, + "loss": 2.903, + "step": 35121 + }, + { + "epoch": 1.72, + "grad_norm": 0.6440034508705139, + "learning_rate": 0.00023111406636090983, + "loss": 2.928, + "step": 35122 + }, + { + "epoch": 1.72, + "grad_norm": 0.6371393799781799, + "learning_rate": 0.0002310990816411103, + "loss": 3.198, + "step": 35123 + }, + { + "epoch": 1.72, + "grad_norm": 0.635123610496521, + "learning_rate": 0.00023108409710278263, + "loss": 2.8926, + "step": 35124 + }, + { + "epoch": 1.72, + "grad_norm": 0.6079972386360168, + "learning_rate": 0.000231069112745966, + "loss": 2.9344, + "step": 35125 + }, + { + "epoch": 1.72, + "grad_norm": 0.5995076298713684, + "learning_rate": 0.0002310541285707002, + "loss": 2.9177, + "step": 35126 + }, + { + "epoch": 1.72, + "grad_norm": 0.6506484746932983, + "learning_rate": 0.00023103914457702455, + "loss": 3.03, + "step": 35127 + }, + { + "epoch": 1.72, + "grad_norm": 0.6203939914703369, + "learning_rate": 0.00023102416076497843, + "loss": 3.0106, + "step": 35128 + }, + { + "epoch": 1.72, + "grad_norm": 0.6446170210838318, + "learning_rate": 0.00023100917713460154, + "loss": 3.1954, + "step": 35129 + }, + { + "epoch": 1.72, + "grad_norm": 0.676665723323822, + "learning_rate": 0.00023099419368593311, + "loss": 2.9198, + "step": 35130 + }, + { + "epoch": 1.72, + "grad_norm": 0.6401715278625488, + "learning_rate": 0.00023097921041901275, + "loss": 3.1917, + "step": 35131 + }, + { + "epoch": 1.72, + "grad_norm": 0.6062941551208496, + "learning_rate": 0.00023096422733387978, + "loss": 2.7867, + "step": 35132 + }, + { + "epoch": 1.72, + "grad_norm": 0.574828565120697, + "learning_rate": 0.00023094924443057378, + "loss": 2.9809, + "step": 35133 + }, + { + "epoch": 1.72, + "grad_norm": 0.6718885898590088, + "learning_rate": 0.00023093426170913426, + "loss": 3.095, + "step": 35134 + }, + { + "epoch": 1.72, + "grad_norm": 0.595146656036377, + "learning_rate": 0.00023091927916960042, + "loss": 2.8781, + "step": 35135 + }, + { + "epoch": 1.72, + "grad_norm": 0.6414282321929932, + "learning_rate": 0.0002309042968120121, + "loss": 2.8138, + "step": 35136 + }, + { + "epoch": 1.72, + "grad_norm": 0.6145898699760437, + "learning_rate": 0.00023088931463640847, + "loss": 2.856, + "step": 35137 + }, + { + "epoch": 1.72, + "grad_norm": 0.6160130500793457, + "learning_rate": 0.00023087433264282905, + "loss": 2.7365, + "step": 35138 + }, + { + "epoch": 1.72, + "grad_norm": 0.6244746446609497, + "learning_rate": 0.00023085935083131342, + "loss": 2.838, + "step": 35139 + }, + { + "epoch": 1.72, + "grad_norm": 0.6224129796028137, + "learning_rate": 0.00023084436920190092, + "loss": 2.8034, + "step": 35140 + }, + { + "epoch": 1.72, + "grad_norm": 0.6752472519874573, + "learning_rate": 0.0002308293877546311, + "loss": 2.9116, + "step": 35141 + }, + { + "epoch": 1.72, + "grad_norm": 0.6341972947120667, + "learning_rate": 0.00023081440648954323, + "loss": 2.8963, + "step": 35142 + }, + { + "epoch": 1.72, + "grad_norm": 0.6429397463798523, + "learning_rate": 0.00023079942540667688, + "loss": 3.1747, + "step": 35143 + }, + { + "epoch": 1.72, + "grad_norm": 0.6000608205795288, + "learning_rate": 0.00023078444450607164, + "loss": 2.8118, + "step": 35144 + }, + { + "epoch": 1.72, + "grad_norm": 0.6079022884368896, + "learning_rate": 0.00023076946378776687, + "loss": 2.9218, + "step": 35145 + }, + { + "epoch": 1.72, + "grad_norm": 0.6264728307723999, + "learning_rate": 0.00023075448325180203, + "loss": 2.8652, + "step": 35146 + }, + { + "epoch": 1.72, + "grad_norm": 0.6049911379814148, + "learning_rate": 0.00023073950289821638, + "loss": 3.0077, + "step": 35147 + }, + { + "epoch": 1.72, + "grad_norm": 0.6240792870521545, + "learning_rate": 0.00023072452272704966, + "loss": 3.0171, + "step": 35148 + }, + { + "epoch": 1.72, + "grad_norm": 0.6277632117271423, + "learning_rate": 0.00023070954273834128, + "loss": 2.9082, + "step": 35149 + }, + { + "epoch": 1.72, + "grad_norm": 0.5884895324707031, + "learning_rate": 0.0002306945629321305, + "loss": 3.139, + "step": 35150 + }, + { + "epoch": 1.72, + "grad_norm": 0.6733899712562561, + "learning_rate": 0.00023067958330845703, + "loss": 3.0002, + "step": 35151 + }, + { + "epoch": 1.72, + "grad_norm": 0.6554712057113647, + "learning_rate": 0.0002306646038673601, + "loss": 3.1105, + "step": 35152 + }, + { + "epoch": 1.72, + "grad_norm": 0.6073846817016602, + "learning_rate": 0.00023064962460887925, + "loss": 2.9152, + "step": 35153 + }, + { + "epoch": 1.72, + "grad_norm": 0.5951540470123291, + "learning_rate": 0.000230634645533054, + "loss": 3.0983, + "step": 35154 + }, + { + "epoch": 1.72, + "grad_norm": 0.6558448672294617, + "learning_rate": 0.00023061966663992376, + "loss": 2.9839, + "step": 35155 + }, + { + "epoch": 1.72, + "grad_norm": 0.632959246635437, + "learning_rate": 0.00023060468792952796, + "loss": 3.2383, + "step": 35156 + }, + { + "epoch": 1.72, + "grad_norm": 0.6814397573471069, + "learning_rate": 0.00023058970940190596, + "loss": 2.7762, + "step": 35157 + }, + { + "epoch": 1.72, + "grad_norm": 0.62791907787323, + "learning_rate": 0.0002305747310570974, + "loss": 3.0783, + "step": 35158 + }, + { + "epoch": 1.72, + "grad_norm": 0.6006518602371216, + "learning_rate": 0.00023055975289514166, + "loss": 3.2219, + "step": 35159 + }, + { + "epoch": 1.72, + "grad_norm": 0.6083630919456482, + "learning_rate": 0.00023054477491607804, + "loss": 3.1753, + "step": 35160 + }, + { + "epoch": 1.72, + "grad_norm": 0.5891106128692627, + "learning_rate": 0.00023052979711994624, + "loss": 2.9886, + "step": 35161 + }, + { + "epoch": 1.72, + "grad_norm": 0.6096766591072083, + "learning_rate": 0.00023051481950678545, + "loss": 3.0412, + "step": 35162 + }, + { + "epoch": 1.72, + "grad_norm": 0.64244544506073, + "learning_rate": 0.00023049984207663537, + "loss": 2.8953, + "step": 35163 + }, + { + "epoch": 1.72, + "grad_norm": 0.6248708963394165, + "learning_rate": 0.00023048486482953536, + "loss": 3.0307, + "step": 35164 + }, + { + "epoch": 1.72, + "grad_norm": 0.6246823072433472, + "learning_rate": 0.00023046988776552469, + "loss": 3.14, + "step": 35165 + }, + { + "epoch": 1.72, + "grad_norm": 0.6114916205406189, + "learning_rate": 0.00023045491088464313, + "loss": 3.003, + "step": 35166 + }, + { + "epoch": 1.72, + "grad_norm": 0.6664708256721497, + "learning_rate": 0.00023043993418692986, + "loss": 2.9394, + "step": 35167 + }, + { + "epoch": 1.72, + "grad_norm": 0.6526961326599121, + "learning_rate": 0.00023042495767242434, + "loss": 3.0523, + "step": 35168 + }, + { + "epoch": 1.72, + "grad_norm": 0.637554943561554, + "learning_rate": 0.00023040998134116623, + "loss": 3.157, + "step": 35169 + }, + { + "epoch": 1.72, + "grad_norm": 0.6036433577537537, + "learning_rate": 0.00023039500519319477, + "loss": 3.2725, + "step": 35170 + }, + { + "epoch": 1.72, + "grad_norm": 0.6289345622062683, + "learning_rate": 0.00023038002922854957, + "loss": 3.088, + "step": 35171 + }, + { + "epoch": 1.72, + "grad_norm": 0.6497759222984314, + "learning_rate": 0.0002303650534472698, + "loss": 3.1527, + "step": 35172 + }, + { + "epoch": 1.72, + "grad_norm": 0.6652274131774902, + "learning_rate": 0.00023035007784939516, + "loss": 2.8838, + "step": 35173 + }, + { + "epoch": 1.72, + "grad_norm": 0.6045557260513306, + "learning_rate": 0.00023033510243496507, + "loss": 2.9615, + "step": 35174 + }, + { + "epoch": 1.72, + "grad_norm": 0.6381337642669678, + "learning_rate": 0.00023032012720401878, + "loss": 3.0682, + "step": 35175 + }, + { + "epoch": 1.72, + "grad_norm": 0.6169887781143188, + "learning_rate": 0.000230305152156596, + "loss": 3.2183, + "step": 35176 + }, + { + "epoch": 1.72, + "grad_norm": 0.6736177206039429, + "learning_rate": 0.00023029017729273596, + "loss": 2.9374, + "step": 35177 + }, + { + "epoch": 1.72, + "grad_norm": 0.6414783596992493, + "learning_rate": 0.00023027520261247812, + "loss": 3.0381, + "step": 35178 + }, + { + "epoch": 1.72, + "grad_norm": 0.6434562802314758, + "learning_rate": 0.00023026022811586215, + "loss": 2.8393, + "step": 35179 + }, + { + "epoch": 1.72, + "grad_norm": 0.6093327403068542, + "learning_rate": 0.00023024525380292718, + "loss": 3.0741, + "step": 35180 + }, + { + "epoch": 1.72, + "grad_norm": 0.5809537172317505, + "learning_rate": 0.0002302302796737129, + "loss": 3.0564, + "step": 35181 + }, + { + "epoch": 1.72, + "grad_norm": 0.635099470615387, + "learning_rate": 0.00023021530572825847, + "loss": 3.1763, + "step": 35182 + }, + { + "epoch": 1.72, + "grad_norm": 0.6042500734329224, + "learning_rate": 0.00023020033196660355, + "loss": 3.0027, + "step": 35183 + }, + { + "epoch": 1.72, + "grad_norm": 0.6080394387245178, + "learning_rate": 0.00023018535838878758, + "loss": 3.0439, + "step": 35184 + }, + { + "epoch": 1.72, + "grad_norm": 0.666522204875946, + "learning_rate": 0.00023017038499484994, + "loss": 3.0731, + "step": 35185 + }, + { + "epoch": 1.72, + "grad_norm": 0.6572619080543518, + "learning_rate": 0.0002301554117848301, + "loss": 3.0514, + "step": 35186 + }, + { + "epoch": 1.72, + "grad_norm": 0.6002278327941895, + "learning_rate": 0.00023014043875876734, + "loss": 2.9164, + "step": 35187 + }, + { + "epoch": 1.72, + "grad_norm": 0.6185051202774048, + "learning_rate": 0.00023012546591670127, + "loss": 2.9961, + "step": 35188 + }, + { + "epoch": 1.72, + "grad_norm": 0.61963951587677, + "learning_rate": 0.00023011049325867135, + "loss": 2.9879, + "step": 35189 + }, + { + "epoch": 1.72, + "grad_norm": 0.5980280041694641, + "learning_rate": 0.0002300955207847168, + "loss": 3.2725, + "step": 35190 + }, + { + "epoch": 1.72, + "grad_norm": 0.5959213972091675, + "learning_rate": 0.00023008054849487733, + "loss": 3.1077, + "step": 35191 + }, + { + "epoch": 1.72, + "grad_norm": 0.5841280221939087, + "learning_rate": 0.00023006557638919223, + "loss": 3.0087, + "step": 35192 + }, + { + "epoch": 1.72, + "grad_norm": 0.618198573589325, + "learning_rate": 0.0002300506044677008, + "loss": 2.9676, + "step": 35193 + }, + { + "epoch": 1.72, + "grad_norm": 0.6227092742919922, + "learning_rate": 0.00023003563273044276, + "loss": 3.1052, + "step": 35194 + }, + { + "epoch": 1.72, + "grad_norm": 0.6252568364143372, + "learning_rate": 0.00023002066117745737, + "loss": 3.0911, + "step": 35195 + }, + { + "epoch": 1.72, + "grad_norm": 0.623429536819458, + "learning_rate": 0.00023000568980878415, + "loss": 2.9064, + "step": 35196 + }, + { + "epoch": 1.72, + "grad_norm": 0.6618231534957886, + "learning_rate": 0.0002299907186244623, + "loss": 3.1137, + "step": 35197 + }, + { + "epoch": 1.72, + "grad_norm": 0.6143259406089783, + "learning_rate": 0.00022997574762453156, + "loss": 3.0053, + "step": 35198 + }, + { + "epoch": 1.73, + "grad_norm": 0.6106548309326172, + "learning_rate": 0.0002299607768090312, + "loss": 3.0564, + "step": 35199 + }, + { + "epoch": 1.73, + "grad_norm": 0.595967710018158, + "learning_rate": 0.0002299458061780006, + "loss": 2.9528, + "step": 35200 + }, + { + "epoch": 1.73, + "grad_norm": 0.6516256928443909, + "learning_rate": 0.0002299308357314794, + "loss": 3.0523, + "step": 35201 + }, + { + "epoch": 1.73, + "grad_norm": 0.6277573704719543, + "learning_rate": 0.0002299158654695068, + "loss": 3.1288, + "step": 35202 + }, + { + "epoch": 1.73, + "grad_norm": 0.6570903062820435, + "learning_rate": 0.0002299008953921223, + "loss": 2.9566, + "step": 35203 + }, + { + "epoch": 1.73, + "grad_norm": 0.638201892375946, + "learning_rate": 0.00022988592549936542, + "loss": 2.756, + "step": 35204 + }, + { + "epoch": 1.73, + "grad_norm": 0.6191340088844299, + "learning_rate": 0.00022987095579127544, + "loss": 3.0523, + "step": 35205 + }, + { + "epoch": 1.73, + "grad_norm": 0.6047921776771545, + "learning_rate": 0.00022985598626789196, + "loss": 3.0704, + "step": 35206 + }, + { + "epoch": 1.73, + "grad_norm": 0.6013028025627136, + "learning_rate": 0.00022984101692925425, + "loss": 3.1367, + "step": 35207 + }, + { + "epoch": 1.73, + "grad_norm": 0.681874692440033, + "learning_rate": 0.00022982604777540175, + "loss": 3.125, + "step": 35208 + }, + { + "epoch": 1.73, + "grad_norm": 0.6835533976554871, + "learning_rate": 0.0002298110788063741, + "loss": 2.946, + "step": 35209 + }, + { + "epoch": 1.73, + "grad_norm": 0.6232330799102783, + "learning_rate": 0.00022979611002221044, + "loss": 2.9882, + "step": 35210 + }, + { + "epoch": 1.73, + "grad_norm": 0.6557362675666809, + "learning_rate": 0.00022978114142295036, + "loss": 2.8999, + "step": 35211 + }, + { + "epoch": 1.73, + "grad_norm": 0.6331151127815247, + "learning_rate": 0.00022976617300863317, + "loss": 3.0894, + "step": 35212 + }, + { + "epoch": 1.73, + "grad_norm": 0.6033350825309753, + "learning_rate": 0.00022975120477929846, + "loss": 2.892, + "step": 35213 + }, + { + "epoch": 1.73, + "grad_norm": 0.64558345079422, + "learning_rate": 0.0002297362367349855, + "loss": 2.8062, + "step": 35214 + }, + { + "epoch": 1.73, + "grad_norm": 0.6405138373374939, + "learning_rate": 0.00022972126887573372, + "loss": 2.9401, + "step": 35215 + }, + { + "epoch": 1.73, + "grad_norm": 0.6379008293151855, + "learning_rate": 0.0002297063012015827, + "loss": 3.1785, + "step": 35216 + }, + { + "epoch": 1.73, + "grad_norm": 0.6051523089408875, + "learning_rate": 0.0002296913337125717, + "loss": 3.099, + "step": 35217 + }, + { + "epoch": 1.73, + "grad_norm": 0.6115164160728455, + "learning_rate": 0.00022967636640874026, + "loss": 2.9392, + "step": 35218 + }, + { + "epoch": 1.73, + "grad_norm": 0.6115735769271851, + "learning_rate": 0.0002296613992901276, + "loss": 3.0127, + "step": 35219 + }, + { + "epoch": 1.73, + "grad_norm": 0.6742574572563171, + "learning_rate": 0.00022964643235677337, + "loss": 3.2632, + "step": 35220 + }, + { + "epoch": 1.73, + "grad_norm": 0.6841800212860107, + "learning_rate": 0.00022963146560871692, + "loss": 2.976, + "step": 35221 + }, + { + "epoch": 1.73, + "grad_norm": 0.6270367503166199, + "learning_rate": 0.00022961649904599754, + "loss": 3.1518, + "step": 35222 + }, + { + "epoch": 1.73, + "grad_norm": 0.6001424193382263, + "learning_rate": 0.00022960153266865488, + "loss": 3.0636, + "step": 35223 + }, + { + "epoch": 1.73, + "grad_norm": 0.5948289632797241, + "learning_rate": 0.0002295865664767281, + "loss": 3.0079, + "step": 35224 + }, + { + "epoch": 1.73, + "grad_norm": 0.6431679725646973, + "learning_rate": 0.00022957160047025683, + "loss": 2.9789, + "step": 35225 + }, + { + "epoch": 1.73, + "grad_norm": 0.6434341669082642, + "learning_rate": 0.00022955663464928048, + "loss": 3.2493, + "step": 35226 + }, + { + "epoch": 1.73, + "grad_norm": 0.6969561576843262, + "learning_rate": 0.00022954166901383827, + "loss": 2.903, + "step": 35227 + }, + { + "epoch": 1.73, + "grad_norm": 0.7470771074295044, + "learning_rate": 0.00022952670356396986, + "loss": 3.0102, + "step": 35228 + }, + { + "epoch": 1.73, + "grad_norm": 0.6371145844459534, + "learning_rate": 0.00022951173829971447, + "loss": 2.9863, + "step": 35229 + }, + { + "epoch": 1.73, + "grad_norm": 0.6292941570281982, + "learning_rate": 0.00022949677322111155, + "loss": 3.1331, + "step": 35230 + }, + { + "epoch": 1.73, + "grad_norm": 0.6096308827400208, + "learning_rate": 0.0002294818083282007, + "loss": 3.084, + "step": 35231 + }, + { + "epoch": 1.73, + "grad_norm": 0.6575873494148254, + "learning_rate": 0.0002294668436210211, + "loss": 3.0223, + "step": 35232 + }, + { + "epoch": 1.73, + "grad_norm": 0.6215528845787048, + "learning_rate": 0.00022945187909961237, + "loss": 2.9783, + "step": 35233 + }, + { + "epoch": 1.73, + "grad_norm": 0.6451046466827393, + "learning_rate": 0.00022943691476401363, + "loss": 2.8736, + "step": 35234 + }, + { + "epoch": 1.73, + "grad_norm": 0.5992505550384521, + "learning_rate": 0.0002294219506142646, + "loss": 3.1373, + "step": 35235 + }, + { + "epoch": 1.73, + "grad_norm": 0.6010803580284119, + "learning_rate": 0.00022940698665040459, + "loss": 2.8851, + "step": 35236 + }, + { + "epoch": 1.73, + "grad_norm": 0.6390721201896667, + "learning_rate": 0.00022939202287247285, + "loss": 3.0608, + "step": 35237 + }, + { + "epoch": 1.73, + "grad_norm": 0.6065561771392822, + "learning_rate": 0.00022937705928050911, + "loss": 2.9655, + "step": 35238 + }, + { + "epoch": 1.73, + "grad_norm": 0.6406224370002747, + "learning_rate": 0.0002293620958745525, + "loss": 3.0668, + "step": 35239 + }, + { + "epoch": 1.73, + "grad_norm": 0.6230671405792236, + "learning_rate": 0.00022934713265464248, + "loss": 2.97, + "step": 35240 + }, + { + "epoch": 1.73, + "grad_norm": 0.6472780704498291, + "learning_rate": 0.00022933216962081867, + "loss": 3.097, + "step": 35241 + }, + { + "epoch": 1.73, + "grad_norm": 0.6151770949363708, + "learning_rate": 0.00022931720677312023, + "loss": 2.9176, + "step": 35242 + }, + { + "epoch": 1.73, + "grad_norm": 0.6394203901290894, + "learning_rate": 0.00022930224411158673, + "loss": 2.904, + "step": 35243 + }, + { + "epoch": 1.73, + "grad_norm": 0.5917284488677979, + "learning_rate": 0.0002292872816362574, + "loss": 3.172, + "step": 35244 + }, + { + "epoch": 1.73, + "grad_norm": 0.6539353728294373, + "learning_rate": 0.00022927231934717176, + "loss": 3.0997, + "step": 35245 + }, + { + "epoch": 1.73, + "grad_norm": 0.592643678188324, + "learning_rate": 0.00022925735724436933, + "loss": 3.1776, + "step": 35246 + }, + { + "epoch": 1.73, + "grad_norm": 0.6221964955329895, + "learning_rate": 0.00022924239532788935, + "loss": 3.0839, + "step": 35247 + }, + { + "epoch": 1.73, + "grad_norm": 0.6389697194099426, + "learning_rate": 0.00022922743359777138, + "loss": 3.1729, + "step": 35248 + }, + { + "epoch": 1.73, + "grad_norm": 0.6145962476730347, + "learning_rate": 0.00022921247205405458, + "loss": 3.2908, + "step": 35249 + }, + { + "epoch": 1.73, + "grad_norm": 0.6351203322410583, + "learning_rate": 0.00022919751069677856, + "loss": 3.214, + "step": 35250 + }, + { + "epoch": 1.73, + "grad_norm": 0.6602262258529663, + "learning_rate": 0.00022918254952598272, + "loss": 2.904, + "step": 35251 + }, + { + "epoch": 1.73, + "grad_norm": 0.6813586354255676, + "learning_rate": 0.0002291675885417063, + "loss": 2.9873, + "step": 35252 + }, + { + "epoch": 1.73, + "grad_norm": 0.6100876331329346, + "learning_rate": 0.00022915262774398897, + "loss": 3.0853, + "step": 35253 + }, + { + "epoch": 1.73, + "grad_norm": 0.614741861820221, + "learning_rate": 0.0002291376671328699, + "loss": 3.0462, + "step": 35254 + }, + { + "epoch": 1.73, + "grad_norm": 0.629533588886261, + "learning_rate": 0.0002291227067083885, + "loss": 2.9814, + "step": 35255 + }, + { + "epoch": 1.73, + "grad_norm": 0.6625435948371887, + "learning_rate": 0.0002291077464705844, + "loss": 3.1346, + "step": 35256 + }, + { + "epoch": 1.73, + "grad_norm": 0.6223627924919128, + "learning_rate": 0.0002290927864194968, + "loss": 2.9611, + "step": 35257 + }, + { + "epoch": 1.73, + "grad_norm": 0.5756930112838745, + "learning_rate": 0.0002290778265551652, + "loss": 2.9672, + "step": 35258 + }, + { + "epoch": 1.73, + "grad_norm": 0.6262645125389099, + "learning_rate": 0.0002290628668776288, + "loss": 2.8283, + "step": 35259 + }, + { + "epoch": 1.73, + "grad_norm": 0.6052729487419128, + "learning_rate": 0.00022904790738692725, + "loss": 2.8735, + "step": 35260 + }, + { + "epoch": 1.73, + "grad_norm": 0.6487675309181213, + "learning_rate": 0.0002290329480830999, + "loss": 2.8281, + "step": 35261 + }, + { + "epoch": 1.73, + "grad_norm": 0.6117686629295349, + "learning_rate": 0.00022901798896618602, + "loss": 3.0395, + "step": 35262 + }, + { + "epoch": 1.73, + "grad_norm": 0.602601170539856, + "learning_rate": 0.0002290030300362252, + "loss": 2.974, + "step": 35263 + }, + { + "epoch": 1.73, + "grad_norm": 0.6140736937522888, + "learning_rate": 0.00022898807129325665, + "loss": 2.8676, + "step": 35264 + }, + { + "epoch": 1.73, + "grad_norm": 0.6174287796020508, + "learning_rate": 0.00022897311273731985, + "loss": 3.0077, + "step": 35265 + }, + { + "epoch": 1.73, + "grad_norm": 0.6516817212104797, + "learning_rate": 0.00022895815436845427, + "loss": 3.1008, + "step": 35266 + }, + { + "epoch": 1.73, + "grad_norm": 0.6091328859329224, + "learning_rate": 0.0002289431961866991, + "loss": 2.8788, + "step": 35267 + }, + { + "epoch": 1.73, + "grad_norm": 0.6192858815193176, + "learning_rate": 0.00022892823819209407, + "loss": 2.9493, + "step": 35268 + }, + { + "epoch": 1.73, + "grad_norm": 0.641791820526123, + "learning_rate": 0.0002289132803846783, + "loss": 3.0263, + "step": 35269 + }, + { + "epoch": 1.73, + "grad_norm": 0.6411734223365784, + "learning_rate": 0.00022889832276449116, + "loss": 3.1047, + "step": 35270 + }, + { + "epoch": 1.73, + "grad_norm": 0.6926196217536926, + "learning_rate": 0.00022888336533157232, + "loss": 3.1334, + "step": 35271 + }, + { + "epoch": 1.73, + "grad_norm": 0.622738778591156, + "learning_rate": 0.00022886840808596096, + "loss": 3.1372, + "step": 35272 + }, + { + "epoch": 1.73, + "grad_norm": 0.5840880870819092, + "learning_rate": 0.00022885345102769655, + "loss": 3.0984, + "step": 35273 + }, + { + "epoch": 1.73, + "grad_norm": 0.60886549949646, + "learning_rate": 0.00022883849415681834, + "loss": 2.9715, + "step": 35274 + }, + { + "epoch": 1.73, + "grad_norm": 0.6631279587745667, + "learning_rate": 0.00022882353747336593, + "loss": 2.959, + "step": 35275 + }, + { + "epoch": 1.73, + "grad_norm": 0.6146132946014404, + "learning_rate": 0.00022880858097737864, + "loss": 2.9739, + "step": 35276 + }, + { + "epoch": 1.73, + "grad_norm": 0.6303625702857971, + "learning_rate": 0.00022879362466889577, + "loss": 3.0501, + "step": 35277 + }, + { + "epoch": 1.73, + "grad_norm": 0.6738470792770386, + "learning_rate": 0.0002287786685479569, + "loss": 2.9648, + "step": 35278 + }, + { + "epoch": 1.73, + "grad_norm": 0.6016108989715576, + "learning_rate": 0.00022876371261460122, + "loss": 3.1268, + "step": 35279 + }, + { + "epoch": 1.73, + "grad_norm": 0.6048367023468018, + "learning_rate": 0.00022874875686886817, + "loss": 3.1259, + "step": 35280 + }, + { + "epoch": 1.73, + "grad_norm": 0.6452575922012329, + "learning_rate": 0.0002287338013107973, + "loss": 3.1736, + "step": 35281 + }, + { + "epoch": 1.73, + "grad_norm": 0.6293025612831116, + "learning_rate": 0.00022871884594042783, + "loss": 2.9434, + "step": 35282 + }, + { + "epoch": 1.73, + "grad_norm": 0.6301412582397461, + "learning_rate": 0.00022870389075779925, + "loss": 2.8099, + "step": 35283 + }, + { + "epoch": 1.73, + "grad_norm": 0.6559537053108215, + "learning_rate": 0.0002286889357629508, + "loss": 2.8507, + "step": 35284 + }, + { + "epoch": 1.73, + "grad_norm": 0.6444092392921448, + "learning_rate": 0.00022867398095592194, + "loss": 2.8865, + "step": 35285 + }, + { + "epoch": 1.73, + "grad_norm": 0.5924097895622253, + "learning_rate": 0.0002286590263367522, + "loss": 3.3732, + "step": 35286 + }, + { + "epoch": 1.73, + "grad_norm": 0.6571818590164185, + "learning_rate": 0.00022864407190548085, + "loss": 3.0194, + "step": 35287 + }, + { + "epoch": 1.73, + "grad_norm": 0.6414764523506165, + "learning_rate": 0.00022862911766214733, + "loss": 2.895, + "step": 35288 + }, + { + "epoch": 1.73, + "grad_norm": 0.5858622789382935, + "learning_rate": 0.00022861416360679082, + "loss": 2.973, + "step": 35289 + }, + { + "epoch": 1.73, + "grad_norm": 0.6514439582824707, + "learning_rate": 0.00022859920973945097, + "loss": 2.8787, + "step": 35290 + }, + { + "epoch": 1.73, + "grad_norm": 0.6287581324577332, + "learning_rate": 0.0002285842560601671, + "loss": 3.1014, + "step": 35291 + }, + { + "epoch": 1.73, + "grad_norm": 0.6182932257652283, + "learning_rate": 0.0002285693025689784, + "loss": 3.1868, + "step": 35292 + }, + { + "epoch": 1.73, + "grad_norm": 0.739643394947052, + "learning_rate": 0.00022855434926592454, + "loss": 2.7995, + "step": 35293 + }, + { + "epoch": 1.73, + "grad_norm": 0.6869654655456543, + "learning_rate": 0.00022853939615104476, + "loss": 2.9529, + "step": 35294 + }, + { + "epoch": 1.73, + "grad_norm": 0.6125555038452148, + "learning_rate": 0.00022852444322437847, + "loss": 2.8912, + "step": 35295 + }, + { + "epoch": 1.73, + "grad_norm": 0.5996557474136353, + "learning_rate": 0.00022850949048596492, + "loss": 3.0872, + "step": 35296 + }, + { + "epoch": 1.73, + "grad_norm": 0.6288072466850281, + "learning_rate": 0.00022849453793584372, + "loss": 2.9651, + "step": 35297 + }, + { + "epoch": 1.73, + "grad_norm": 0.6631354093551636, + "learning_rate": 0.00022847958557405416, + "loss": 2.9076, + "step": 35298 + }, + { + "epoch": 1.73, + "grad_norm": 0.6144710779190063, + "learning_rate": 0.0002284646334006355, + "loss": 2.9343, + "step": 35299 + }, + { + "epoch": 1.73, + "grad_norm": 0.6347219347953796, + "learning_rate": 0.00022844968141562738, + "loss": 3.1248, + "step": 35300 + }, + { + "epoch": 1.73, + "grad_norm": 0.6205817461013794, + "learning_rate": 0.00022843472961906893, + "loss": 2.8253, + "step": 35301 + }, + { + "epoch": 1.73, + "grad_norm": 0.6230130791664124, + "learning_rate": 0.0002284197780109996, + "loss": 2.9309, + "step": 35302 + }, + { + "epoch": 1.73, + "grad_norm": 0.673992931842804, + "learning_rate": 0.0002284048265914589, + "loss": 3.2319, + "step": 35303 + }, + { + "epoch": 1.73, + "grad_norm": 0.7464253902435303, + "learning_rate": 0.00022838987536048604, + "loss": 3.1167, + "step": 35304 + }, + { + "epoch": 1.73, + "grad_norm": 0.6007848381996155, + "learning_rate": 0.00022837492431812054, + "loss": 2.8541, + "step": 35305 + }, + { + "epoch": 1.73, + "grad_norm": 0.6575406789779663, + "learning_rate": 0.0002283599734644016, + "loss": 3.1411, + "step": 35306 + }, + { + "epoch": 1.73, + "grad_norm": 0.5982678532600403, + "learning_rate": 0.00022834502279936867, + "loss": 2.9991, + "step": 35307 + }, + { + "epoch": 1.73, + "grad_norm": 0.6275074481964111, + "learning_rate": 0.00022833007232306128, + "loss": 2.88, + "step": 35308 + }, + { + "epoch": 1.73, + "grad_norm": 0.6013560891151428, + "learning_rate": 0.00022831512203551864, + "loss": 2.8328, + "step": 35309 + }, + { + "epoch": 1.73, + "grad_norm": 0.6640160083770752, + "learning_rate": 0.00022830017193678024, + "loss": 2.9587, + "step": 35310 + }, + { + "epoch": 1.73, + "grad_norm": 0.6699552536010742, + "learning_rate": 0.00022828522202688527, + "loss": 3.1212, + "step": 35311 + }, + { + "epoch": 1.73, + "grad_norm": 0.6032391786575317, + "learning_rate": 0.00022827027230587328, + "loss": 2.9259, + "step": 35312 + }, + { + "epoch": 1.73, + "grad_norm": 0.6313595175743103, + "learning_rate": 0.00022825532277378363, + "loss": 3.02, + "step": 35313 + }, + { + "epoch": 1.73, + "grad_norm": 0.6690878868103027, + "learning_rate": 0.00022824037343065556, + "loss": 3.0762, + "step": 35314 + }, + { + "epoch": 1.73, + "grad_norm": 0.5943594574928284, + "learning_rate": 0.00022822542427652863, + "loss": 2.8547, + "step": 35315 + }, + { + "epoch": 1.73, + "grad_norm": 0.6819391846656799, + "learning_rate": 0.00022821047531144207, + "loss": 3.2038, + "step": 35316 + }, + { + "epoch": 1.73, + "grad_norm": 0.6711174845695496, + "learning_rate": 0.00022819552653543525, + "loss": 2.8025, + "step": 35317 + }, + { + "epoch": 1.73, + "grad_norm": 0.667477011680603, + "learning_rate": 0.0002281805779485477, + "loss": 3.1893, + "step": 35318 + }, + { + "epoch": 1.73, + "grad_norm": 0.6368096470832825, + "learning_rate": 0.0002281656295508186, + "loss": 3.1114, + "step": 35319 + }, + { + "epoch": 1.73, + "grad_norm": 0.6909788846969604, + "learning_rate": 0.0002281506813422875, + "loss": 2.9798, + "step": 35320 + }, + { + "epoch": 1.73, + "grad_norm": 0.6262080073356628, + "learning_rate": 0.0002281357333229936, + "loss": 3.0459, + "step": 35321 + }, + { + "epoch": 1.73, + "grad_norm": 0.6632434725761414, + "learning_rate": 0.00022812078549297633, + "loss": 3.0869, + "step": 35322 + }, + { + "epoch": 1.73, + "grad_norm": 0.6851413249969482, + "learning_rate": 0.0002281058378522752, + "loss": 2.9552, + "step": 35323 + }, + { + "epoch": 1.73, + "grad_norm": 0.6063692569732666, + "learning_rate": 0.0002280908904009293, + "loss": 2.8897, + "step": 35324 + }, + { + "epoch": 1.73, + "grad_norm": 0.6380870342254639, + "learning_rate": 0.00022807594313897827, + "loss": 3.108, + "step": 35325 + }, + { + "epoch": 1.73, + "grad_norm": 0.6056703925132751, + "learning_rate": 0.00022806099606646125, + "loss": 3.0071, + "step": 35326 + }, + { + "epoch": 1.73, + "grad_norm": 0.6386041045188904, + "learning_rate": 0.0002280460491834178, + "loss": 3.0436, + "step": 35327 + }, + { + "epoch": 1.73, + "grad_norm": 0.6316136717796326, + "learning_rate": 0.00022803110248988725, + "loss": 2.9004, + "step": 35328 + }, + { + "epoch": 1.73, + "grad_norm": 0.624875545501709, + "learning_rate": 0.0002280161559859088, + "loss": 3.0124, + "step": 35329 + }, + { + "epoch": 1.73, + "grad_norm": 0.6380939483642578, + "learning_rate": 0.0002280012096715221, + "loss": 3.0695, + "step": 35330 + }, + { + "epoch": 1.73, + "grad_norm": 0.6007676720619202, + "learning_rate": 0.00022798626354676626, + "loss": 3.1734, + "step": 35331 + }, + { + "epoch": 1.73, + "grad_norm": 0.6163882613182068, + "learning_rate": 0.00022797131761168068, + "loss": 2.8966, + "step": 35332 + }, + { + "epoch": 1.73, + "grad_norm": 0.6753647327423096, + "learning_rate": 0.00022795637186630494, + "loss": 2.9465, + "step": 35333 + }, + { + "epoch": 1.73, + "grad_norm": 0.5936952233314514, + "learning_rate": 0.00022794142631067814, + "loss": 2.8909, + "step": 35334 + }, + { + "epoch": 1.73, + "grad_norm": 0.6279860734939575, + "learning_rate": 0.00022792648094483987, + "loss": 3.0901, + "step": 35335 + }, + { + "epoch": 1.73, + "grad_norm": 0.6184460520744324, + "learning_rate": 0.00022791153576882922, + "loss": 2.9562, + "step": 35336 + }, + { + "epoch": 1.73, + "grad_norm": 0.6539486646652222, + "learning_rate": 0.0002278965907826858, + "loss": 2.9344, + "step": 35337 + }, + { + "epoch": 1.73, + "grad_norm": 0.6066672205924988, + "learning_rate": 0.00022788164598644894, + "loss": 3.0856, + "step": 35338 + }, + { + "epoch": 1.73, + "grad_norm": 0.613047182559967, + "learning_rate": 0.00022786670138015783, + "loss": 3.1087, + "step": 35339 + }, + { + "epoch": 1.73, + "grad_norm": 0.6032065749168396, + "learning_rate": 0.00022785175696385203, + "loss": 3.0789, + "step": 35340 + }, + { + "epoch": 1.73, + "grad_norm": 0.6134180426597595, + "learning_rate": 0.00022783681273757077, + "loss": 3.1098, + "step": 35341 + }, + { + "epoch": 1.73, + "grad_norm": 0.6650903820991516, + "learning_rate": 0.00022782186870135343, + "loss": 2.9518, + "step": 35342 + }, + { + "epoch": 1.73, + "grad_norm": 0.6557191610336304, + "learning_rate": 0.0002278069248552395, + "loss": 2.8869, + "step": 35343 + }, + { + "epoch": 1.73, + "grad_norm": 0.6179054975509644, + "learning_rate": 0.00022779198119926819, + "loss": 3.0837, + "step": 35344 + }, + { + "epoch": 1.73, + "grad_norm": 0.6232442855834961, + "learning_rate": 0.00022777703773347894, + "loss": 3.1992, + "step": 35345 + }, + { + "epoch": 1.73, + "grad_norm": 0.6513820290565491, + "learning_rate": 0.00022776209445791093, + "loss": 3.0261, + "step": 35346 + }, + { + "epoch": 1.73, + "grad_norm": 0.6319857239723206, + "learning_rate": 0.00022774715137260367, + "loss": 2.9248, + "step": 35347 + }, + { + "epoch": 1.73, + "grad_norm": 0.6437803506851196, + "learning_rate": 0.00022773220847759667, + "loss": 3.0184, + "step": 35348 + }, + { + "epoch": 1.73, + "grad_norm": 0.645805299282074, + "learning_rate": 0.00022771726577292903, + "loss": 2.9881, + "step": 35349 + }, + { + "epoch": 1.73, + "grad_norm": 0.6178550124168396, + "learning_rate": 0.0002277023232586403, + "loss": 3.097, + "step": 35350 + }, + { + "epoch": 1.73, + "grad_norm": 0.6673634052276611, + "learning_rate": 0.00022768738093476956, + "loss": 3.2377, + "step": 35351 + }, + { + "epoch": 1.73, + "grad_norm": 0.601243257522583, + "learning_rate": 0.00022767243880135641, + "loss": 2.6505, + "step": 35352 + }, + { + "epoch": 1.73, + "grad_norm": 0.6067811846733093, + "learning_rate": 0.00022765749685844025, + "loss": 2.8767, + "step": 35353 + }, + { + "epoch": 1.73, + "grad_norm": 0.5918362736701965, + "learning_rate": 0.00022764255510606014, + "loss": 2.9484, + "step": 35354 + }, + { + "epoch": 1.73, + "grad_norm": 0.6136022806167603, + "learning_rate": 0.00022762761354425576, + "loss": 3.0941, + "step": 35355 + }, + { + "epoch": 1.73, + "grad_norm": 0.6263452768325806, + "learning_rate": 0.0002276126721730663, + "loss": 2.9776, + "step": 35356 + }, + { + "epoch": 1.73, + "grad_norm": 0.6397753953933716, + "learning_rate": 0.000227597730992531, + "loss": 2.8435, + "step": 35357 + }, + { + "epoch": 1.73, + "grad_norm": 0.6426587700843811, + "learning_rate": 0.0002275827900026895, + "loss": 2.9557, + "step": 35358 + }, + { + "epoch": 1.73, + "grad_norm": 0.6584495902061462, + "learning_rate": 0.00022756784920358093, + "loss": 2.8092, + "step": 35359 + }, + { + "epoch": 1.73, + "grad_norm": 0.6029062271118164, + "learning_rate": 0.00022755290859524476, + "loss": 3.1722, + "step": 35360 + }, + { + "epoch": 1.73, + "grad_norm": 0.5749930739402771, + "learning_rate": 0.00022753796817772014, + "loss": 2.9569, + "step": 35361 + }, + { + "epoch": 1.73, + "grad_norm": 0.7005466222763062, + "learning_rate": 0.00022752302795104665, + "loss": 3.0455, + "step": 35362 + }, + { + "epoch": 1.73, + "grad_norm": 0.6490772366523743, + "learning_rate": 0.00022750808791526363, + "loss": 2.8489, + "step": 35363 + }, + { + "epoch": 1.73, + "grad_norm": 0.6193538904190063, + "learning_rate": 0.00022749314807041023, + "loss": 2.97, + "step": 35364 + }, + { + "epoch": 1.73, + "grad_norm": 0.6467298865318298, + "learning_rate": 0.00022747820841652607, + "loss": 3.0202, + "step": 35365 + }, + { + "epoch": 1.73, + "grad_norm": 0.5905973315238953, + "learning_rate": 0.00022746326895365025, + "loss": 3.1219, + "step": 35366 + }, + { + "epoch": 1.73, + "grad_norm": 0.6227057576179504, + "learning_rate": 0.00022744832968182228, + "loss": 2.9582, + "step": 35367 + }, + { + "epoch": 1.73, + "grad_norm": 0.6363484263420105, + "learning_rate": 0.00022743339060108135, + "loss": 3.2402, + "step": 35368 + }, + { + "epoch": 1.73, + "grad_norm": 0.6553448438644409, + "learning_rate": 0.00022741845171146685, + "loss": 3.0608, + "step": 35369 + }, + { + "epoch": 1.73, + "grad_norm": 0.6466527581214905, + "learning_rate": 0.00022740351301301837, + "loss": 3.0484, + "step": 35370 + }, + { + "epoch": 1.73, + "grad_norm": 0.5950348377227783, + "learning_rate": 0.00022738857450577496, + "loss": 3.2043, + "step": 35371 + }, + { + "epoch": 1.73, + "grad_norm": 0.6238282918930054, + "learning_rate": 0.00022737363618977613, + "loss": 2.9925, + "step": 35372 + }, + { + "epoch": 1.73, + "grad_norm": 0.6248958110809326, + "learning_rate": 0.00022735869806506106, + "loss": 3.1858, + "step": 35373 + }, + { + "epoch": 1.73, + "grad_norm": 0.7205170392990112, + "learning_rate": 0.00022734376013166927, + "loss": 2.9612, + "step": 35374 + }, + { + "epoch": 1.73, + "grad_norm": 0.6162315607070923, + "learning_rate": 0.0002273288223896401, + "loss": 2.8959, + "step": 35375 + }, + { + "epoch": 1.73, + "grad_norm": 0.6196463704109192, + "learning_rate": 0.00022731388483901267, + "loss": 3.0026, + "step": 35376 + }, + { + "epoch": 1.73, + "grad_norm": 0.6567784547805786, + "learning_rate": 0.00022729894747982663, + "loss": 2.9925, + "step": 35377 + }, + { + "epoch": 1.73, + "grad_norm": 0.6151658892631531, + "learning_rate": 0.00022728401031212107, + "loss": 3.1042, + "step": 35378 + }, + { + "epoch": 1.73, + "grad_norm": 0.5919417142868042, + "learning_rate": 0.0002272690733359354, + "loss": 3.0079, + "step": 35379 + }, + { + "epoch": 1.73, + "grad_norm": 0.6030619740486145, + "learning_rate": 0.00022725413655130916, + "loss": 2.9471, + "step": 35380 + }, + { + "epoch": 1.73, + "grad_norm": 0.6577292084693909, + "learning_rate": 0.0002272391999582814, + "loss": 2.8635, + "step": 35381 + }, + { + "epoch": 1.73, + "grad_norm": 0.6286402940750122, + "learning_rate": 0.00022722426355689166, + "loss": 2.8672, + "step": 35382 + }, + { + "epoch": 1.73, + "grad_norm": 0.6577459573745728, + "learning_rate": 0.0002272093273471791, + "loss": 3.0737, + "step": 35383 + }, + { + "epoch": 1.73, + "grad_norm": 0.692613959312439, + "learning_rate": 0.00022719439132918318, + "loss": 3.2409, + "step": 35384 + }, + { + "epoch": 1.73, + "grad_norm": 0.628135085105896, + "learning_rate": 0.0002271794555029433, + "loss": 2.9021, + "step": 35385 + }, + { + "epoch": 1.73, + "grad_norm": 0.629897952079773, + "learning_rate": 0.00022716451986849861, + "loss": 2.9762, + "step": 35386 + }, + { + "epoch": 1.73, + "grad_norm": 0.6259167790412903, + "learning_rate": 0.00022714958442588868, + "loss": 2.949, + "step": 35387 + }, + { + "epoch": 1.73, + "grad_norm": 0.6222914457321167, + "learning_rate": 0.0002271346491751526, + "loss": 2.9188, + "step": 35388 + }, + { + "epoch": 1.73, + "grad_norm": 0.6155588030815125, + "learning_rate": 0.00022711971411632992, + "loss": 3.0117, + "step": 35389 + }, + { + "epoch": 1.73, + "grad_norm": 0.6510941982269287, + "learning_rate": 0.0002271047792494599, + "loss": 3.1024, + "step": 35390 + }, + { + "epoch": 1.73, + "grad_norm": 0.6198724508285522, + "learning_rate": 0.00022708984457458175, + "loss": 2.7966, + "step": 35391 + }, + { + "epoch": 1.73, + "grad_norm": 0.6015543341636658, + "learning_rate": 0.00022707491009173506, + "loss": 3.0348, + "step": 35392 + }, + { + "epoch": 1.73, + "grad_norm": 0.6300435662269592, + "learning_rate": 0.00022705997580095894, + "loss": 3.0702, + "step": 35393 + }, + { + "epoch": 1.73, + "grad_norm": 0.6055490970611572, + "learning_rate": 0.00022704504170229276, + "loss": 3.1463, + "step": 35394 + }, + { + "epoch": 1.73, + "grad_norm": 0.5714443922042847, + "learning_rate": 0.00022703010779577602, + "loss": 3.1352, + "step": 35395 + }, + { + "epoch": 1.73, + "grad_norm": 0.6332013607025146, + "learning_rate": 0.00022701517408144789, + "loss": 3.0562, + "step": 35396 + }, + { + "epoch": 1.73, + "grad_norm": 0.5947373509407043, + "learning_rate": 0.00022700024055934777, + "loss": 3.167, + "step": 35397 + }, + { + "epoch": 1.73, + "grad_norm": 0.6268903613090515, + "learning_rate": 0.00022698530722951486, + "loss": 2.9227, + "step": 35398 + }, + { + "epoch": 1.73, + "grad_norm": 0.6999599933624268, + "learning_rate": 0.00022697037409198865, + "loss": 2.8985, + "step": 35399 + }, + { + "epoch": 1.73, + "grad_norm": 0.6626995205879211, + "learning_rate": 0.00022695544114680852, + "loss": 3.0245, + "step": 35400 + }, + { + "epoch": 1.73, + "grad_norm": 0.6301445960998535, + "learning_rate": 0.0002269405083940135, + "loss": 3.0983, + "step": 35401 + }, + { + "epoch": 1.73, + "grad_norm": 0.6395618915557861, + "learning_rate": 0.00022692557583364333, + "loss": 2.9343, + "step": 35402 + }, + { + "epoch": 1.74, + "grad_norm": 0.5879999995231628, + "learning_rate": 0.00022691064346573703, + "loss": 2.7305, + "step": 35403 + }, + { + "epoch": 1.74, + "grad_norm": 0.6151583194732666, + "learning_rate": 0.00022689571129033397, + "loss": 3.0668, + "step": 35404 + }, + { + "epoch": 1.74, + "grad_norm": 0.6331624984741211, + "learning_rate": 0.00022688077930747367, + "loss": 3.0581, + "step": 35405 + }, + { + "epoch": 1.74, + "grad_norm": 0.6111859083175659, + "learning_rate": 0.00022686584751719524, + "loss": 3.1264, + "step": 35406 + }, + { + "epoch": 1.74, + "grad_norm": 0.6400036811828613, + "learning_rate": 0.00022685091591953817, + "loss": 3.1299, + "step": 35407 + }, + { + "epoch": 1.74, + "grad_norm": 0.6397266983985901, + "learning_rate": 0.00022683598451454158, + "loss": 3.2165, + "step": 35408 + }, + { + "epoch": 1.74, + "grad_norm": 0.6920884847640991, + "learning_rate": 0.0002268210533022449, + "loss": 3.1075, + "step": 35409 + }, + { + "epoch": 1.74, + "grad_norm": 0.6338069438934326, + "learning_rate": 0.00022680612228268766, + "loss": 3.0068, + "step": 35410 + }, + { + "epoch": 1.74, + "grad_norm": 0.6007820963859558, + "learning_rate": 0.0002267911914559089, + "loss": 3.0907, + "step": 35411 + }, + { + "epoch": 1.74, + "grad_norm": 0.6300089955329895, + "learning_rate": 0.00022677626082194812, + "loss": 3.1833, + "step": 35412 + }, + { + "epoch": 1.74, + "grad_norm": 0.6192349791526794, + "learning_rate": 0.00022676133038084447, + "loss": 3.0577, + "step": 35413 + }, + { + "epoch": 1.74, + "grad_norm": 0.6377847790718079, + "learning_rate": 0.00022674640013263745, + "loss": 3.0741, + "step": 35414 + }, + { + "epoch": 1.74, + "grad_norm": 0.6192604303359985, + "learning_rate": 0.00022673147007736636, + "loss": 3.0948, + "step": 35415 + }, + { + "epoch": 1.74, + "grad_norm": 0.605172336101532, + "learning_rate": 0.00022671654021507037, + "loss": 3.0666, + "step": 35416 + }, + { + "epoch": 1.74, + "grad_norm": 0.6593872904777527, + "learning_rate": 0.00022670161054578903, + "loss": 3.0207, + "step": 35417 + }, + { + "epoch": 1.74, + "grad_norm": 0.6469155550003052, + "learning_rate": 0.00022668668106956148, + "loss": 2.9885, + "step": 35418 + }, + { + "epoch": 1.74, + "grad_norm": 0.6012063026428223, + "learning_rate": 0.00022667175178642701, + "loss": 3.0121, + "step": 35419 + }, + { + "epoch": 1.74, + "grad_norm": 0.6011958718299866, + "learning_rate": 0.0002266568226964252, + "loss": 2.9255, + "step": 35420 + }, + { + "epoch": 1.74, + "grad_norm": 0.6600900888442993, + "learning_rate": 0.0002266418937995951, + "loss": 3.0529, + "step": 35421 + }, + { + "epoch": 1.74, + "grad_norm": 0.6183411478996277, + "learning_rate": 0.00022662696509597623, + "loss": 2.9691, + "step": 35422 + }, + { + "epoch": 1.74, + "grad_norm": 0.6118205189704895, + "learning_rate": 0.0002266120365856077, + "loss": 3.101, + "step": 35423 + }, + { + "epoch": 1.74, + "grad_norm": 0.6012549996376038, + "learning_rate": 0.00022659710826852895, + "loss": 3.0369, + "step": 35424 + }, + { + "epoch": 1.74, + "grad_norm": 0.7324210405349731, + "learning_rate": 0.00022658218014477938, + "loss": 3.0671, + "step": 35425 + }, + { + "epoch": 1.74, + "grad_norm": 0.616727888584137, + "learning_rate": 0.00022656725221439808, + "loss": 2.9368, + "step": 35426 + }, + { + "epoch": 1.74, + "grad_norm": 0.5840345621109009, + "learning_rate": 0.00022655232447742467, + "loss": 3.1202, + "step": 35427 + }, + { + "epoch": 1.74, + "grad_norm": 0.6287633180618286, + "learning_rate": 0.0002265373969338982, + "loss": 2.9494, + "step": 35428 + }, + { + "epoch": 1.74, + "grad_norm": 0.6012067198753357, + "learning_rate": 0.00022652246958385807, + "loss": 3.0535, + "step": 35429 + }, + { + "epoch": 1.74, + "grad_norm": 0.6259651184082031, + "learning_rate": 0.00022650754242734371, + "loss": 3.2078, + "step": 35430 + }, + { + "epoch": 1.74, + "grad_norm": 0.6013410687446594, + "learning_rate": 0.00022649261546439422, + "loss": 3.0495, + "step": 35431 + }, + { + "epoch": 1.74, + "grad_norm": 0.6384536623954773, + "learning_rate": 0.00022647768869504912, + "loss": 2.9705, + "step": 35432 + }, + { + "epoch": 1.74, + "grad_norm": 0.6175307631492615, + "learning_rate": 0.00022646276211934764, + "loss": 3.1426, + "step": 35433 + }, + { + "epoch": 1.74, + "grad_norm": 0.630964457988739, + "learning_rate": 0.000226447835737329, + "loss": 3.0491, + "step": 35434 + }, + { + "epoch": 1.74, + "grad_norm": 0.6299197673797607, + "learning_rate": 0.00022643290954903273, + "loss": 2.9989, + "step": 35435 + }, + { + "epoch": 1.74, + "grad_norm": 0.6441041231155396, + "learning_rate": 0.00022641798355449796, + "loss": 3.2179, + "step": 35436 + }, + { + "epoch": 1.74, + "grad_norm": 0.6124367117881775, + "learning_rate": 0.00022640305775376409, + "loss": 2.9684, + "step": 35437 + }, + { + "epoch": 1.74, + "grad_norm": 0.5946165323257446, + "learning_rate": 0.00022638813214687033, + "loss": 2.781, + "step": 35438 + }, + { + "epoch": 1.74, + "grad_norm": 0.6153435707092285, + "learning_rate": 0.0002263732067338561, + "loss": 3.2042, + "step": 35439 + }, + { + "epoch": 1.74, + "grad_norm": 0.614129364490509, + "learning_rate": 0.0002263582815147607, + "loss": 3.0863, + "step": 35440 + }, + { + "epoch": 1.74, + "grad_norm": 0.6298364400863647, + "learning_rate": 0.00022634335648962333, + "loss": 2.8166, + "step": 35441 + }, + { + "epoch": 1.74, + "grad_norm": 0.6218454241752625, + "learning_rate": 0.0002263284316584835, + "loss": 3.0104, + "step": 35442 + }, + { + "epoch": 1.74, + "grad_norm": 0.63499915599823, + "learning_rate": 0.00022631350702138038, + "loss": 3.0578, + "step": 35443 + }, + { + "epoch": 1.74, + "grad_norm": 0.5916718244552612, + "learning_rate": 0.0002262985825783533, + "loss": 3.2383, + "step": 35444 + }, + { + "epoch": 1.74, + "grad_norm": 0.6168555617332458, + "learning_rate": 0.0002262836583294415, + "loss": 3.2438, + "step": 35445 + }, + { + "epoch": 1.74, + "grad_norm": 0.6397154927253723, + "learning_rate": 0.00022626873427468439, + "loss": 2.8624, + "step": 35446 + }, + { + "epoch": 1.74, + "grad_norm": 0.594692587852478, + "learning_rate": 0.00022625381041412127, + "loss": 3.0968, + "step": 35447 + }, + { + "epoch": 1.74, + "grad_norm": 0.6055973172187805, + "learning_rate": 0.00022623888674779135, + "loss": 2.9747, + "step": 35448 + }, + { + "epoch": 1.74, + "grad_norm": 0.6591005921363831, + "learning_rate": 0.0002262239632757341, + "loss": 2.9765, + "step": 35449 + }, + { + "epoch": 1.74, + "grad_norm": 0.6846657991409302, + "learning_rate": 0.00022620903999798862, + "loss": 2.827, + "step": 35450 + }, + { + "epoch": 1.74, + "grad_norm": 0.6533744931221008, + "learning_rate": 0.00022619411691459442, + "loss": 3.1813, + "step": 35451 + }, + { + "epoch": 1.74, + "grad_norm": 0.6433231830596924, + "learning_rate": 0.00022617919402559075, + "loss": 2.9903, + "step": 35452 + }, + { + "epoch": 1.74, + "grad_norm": 0.6263206005096436, + "learning_rate": 0.00022616427133101673, + "loss": 2.9961, + "step": 35453 + }, + { + "epoch": 1.74, + "grad_norm": 0.6297306418418884, + "learning_rate": 0.00022614934883091197, + "loss": 3.0246, + "step": 35454 + }, + { + "epoch": 1.74, + "grad_norm": 0.6420456171035767, + "learning_rate": 0.00022613442652531551, + "loss": 3.0221, + "step": 35455 + }, + { + "epoch": 1.74, + "grad_norm": 0.6900204420089722, + "learning_rate": 0.00022611950441426675, + "loss": 3.232, + "step": 35456 + }, + { + "epoch": 1.74, + "grad_norm": 0.6068527698516846, + "learning_rate": 0.00022610458249780507, + "loss": 2.8889, + "step": 35457 + }, + { + "epoch": 1.74, + "grad_norm": 0.6444588303565979, + "learning_rate": 0.00022608966077596965, + "loss": 2.9091, + "step": 35458 + }, + { + "epoch": 1.74, + "grad_norm": 0.6167253255844116, + "learning_rate": 0.0002260747392487999, + "loss": 2.8771, + "step": 35459 + }, + { + "epoch": 1.74, + "grad_norm": 0.6610934138298035, + "learning_rate": 0.00022605981791633492, + "loss": 2.8356, + "step": 35460 + }, + { + "epoch": 1.74, + "grad_norm": 0.6797847151756287, + "learning_rate": 0.00022604489677861426, + "loss": 3.1294, + "step": 35461 + }, + { + "epoch": 1.74, + "grad_norm": 0.6337746381759644, + "learning_rate": 0.00022602997583567712, + "loss": 3.0702, + "step": 35462 + }, + { + "epoch": 1.74, + "grad_norm": 0.619875967502594, + "learning_rate": 0.00022601505508756267, + "loss": 2.879, + "step": 35463 + }, + { + "epoch": 1.74, + "grad_norm": 0.6586485505104065, + "learning_rate": 0.0002260001345343105, + "loss": 3.1996, + "step": 35464 + }, + { + "epoch": 1.74, + "grad_norm": 0.6685471534729004, + "learning_rate": 0.0002259852141759596, + "loss": 3.0143, + "step": 35465 + }, + { + "epoch": 1.74, + "grad_norm": 0.6211610436439514, + "learning_rate": 0.00022597029401254937, + "loss": 3.2222, + "step": 35466 + }, + { + "epoch": 1.74, + "grad_norm": 0.6223245859146118, + "learning_rate": 0.0002259553740441193, + "loss": 2.8696, + "step": 35467 + }, + { + "epoch": 1.74, + "grad_norm": 0.603645384311676, + "learning_rate": 0.00022594045427070842, + "loss": 2.9405, + "step": 35468 + }, + { + "epoch": 1.74, + "grad_norm": 0.6659462451934814, + "learning_rate": 0.0002259255346923562, + "loss": 3.0429, + "step": 35469 + }, + { + "epoch": 1.74, + "grad_norm": 0.6123594045639038, + "learning_rate": 0.00022591061530910175, + "loss": 2.9071, + "step": 35470 + }, + { + "epoch": 1.74, + "grad_norm": 0.6161990165710449, + "learning_rate": 0.00022589569612098447, + "loss": 3.1323, + "step": 35471 + }, + { + "epoch": 1.74, + "grad_norm": 0.6867458820343018, + "learning_rate": 0.0002258807771280438, + "loss": 2.9695, + "step": 35472 + }, + { + "epoch": 1.74, + "grad_norm": 0.5981655716896057, + "learning_rate": 0.00022586585833031883, + "loss": 2.9782, + "step": 35473 + }, + { + "epoch": 1.74, + "grad_norm": 0.6146122217178345, + "learning_rate": 0.000225850939727849, + "loss": 3.0785, + "step": 35474 + }, + { + "epoch": 1.74, + "grad_norm": 0.6437488794326782, + "learning_rate": 0.00022583602132067335, + "loss": 2.8745, + "step": 35475 + }, + { + "epoch": 1.74, + "grad_norm": 0.6147376894950867, + "learning_rate": 0.00022582110310883147, + "loss": 2.9838, + "step": 35476 + }, + { + "epoch": 1.74, + "grad_norm": 0.5768126845359802, + "learning_rate": 0.00022580618509236258, + "loss": 3.0706, + "step": 35477 + }, + { + "epoch": 1.74, + "grad_norm": 0.6257051825523376, + "learning_rate": 0.00022579126727130576, + "loss": 2.857, + "step": 35478 + }, + { + "epoch": 1.74, + "grad_norm": 0.6463055610656738, + "learning_rate": 0.0002257763496457006, + "loss": 3.0198, + "step": 35479 + }, + { + "epoch": 1.74, + "grad_norm": 0.6310402154922485, + "learning_rate": 0.00022576143221558618, + "loss": 2.7418, + "step": 35480 + }, + { + "epoch": 1.74, + "grad_norm": 0.6200835704803467, + "learning_rate": 0.00022574651498100186, + "loss": 3.0547, + "step": 35481 + }, + { + "epoch": 1.74, + "grad_norm": 0.6173551082611084, + "learning_rate": 0.00022573159794198695, + "loss": 2.8957, + "step": 35482 + }, + { + "epoch": 1.74, + "grad_norm": 0.6188896894454956, + "learning_rate": 0.00022571668109858074, + "loss": 2.9646, + "step": 35483 + }, + { + "epoch": 1.74, + "grad_norm": 0.6149911880493164, + "learning_rate": 0.00022570176445082252, + "loss": 2.963, + "step": 35484 + }, + { + "epoch": 1.74, + "grad_norm": 0.568781316280365, + "learning_rate": 0.00022568684799875144, + "loss": 3.0014, + "step": 35485 + }, + { + "epoch": 1.74, + "grad_norm": 0.7318620681762695, + "learning_rate": 0.00022567193174240693, + "loss": 3.0284, + "step": 35486 + }, + { + "epoch": 1.74, + "grad_norm": 0.6288902759552002, + "learning_rate": 0.00022565701568182835, + "loss": 2.8994, + "step": 35487 + }, + { + "epoch": 1.74, + "grad_norm": 0.6177136301994324, + "learning_rate": 0.00022564209981705473, + "loss": 3.065, + "step": 35488 + }, + { + "epoch": 1.74, + "grad_norm": 0.6629291772842407, + "learning_rate": 0.00022562718414812568, + "loss": 3.1049, + "step": 35489 + }, + { + "epoch": 1.74, + "grad_norm": 0.6288484334945679, + "learning_rate": 0.00022561226867508018, + "loss": 2.9863, + "step": 35490 + }, + { + "epoch": 1.74, + "grad_norm": 0.611689031124115, + "learning_rate": 0.00022559735339795767, + "loss": 2.8179, + "step": 35491 + }, + { + "epoch": 1.74, + "grad_norm": 0.6365360617637634, + "learning_rate": 0.00022558243831679749, + "loss": 3.0323, + "step": 35492 + }, + { + "epoch": 1.74, + "grad_norm": 0.586676299571991, + "learning_rate": 0.00022556752343163872, + "loss": 2.9651, + "step": 35493 + }, + { + "epoch": 1.74, + "grad_norm": 0.6502875685691833, + "learning_rate": 0.00022555260874252092, + "loss": 3.2247, + "step": 35494 + }, + { + "epoch": 1.74, + "grad_norm": 0.6236476302146912, + "learning_rate": 0.00022553769424948314, + "loss": 3.2573, + "step": 35495 + }, + { + "epoch": 1.74, + "grad_norm": 0.622161865234375, + "learning_rate": 0.00022552277995256466, + "loss": 3.0476, + "step": 35496 + }, + { + "epoch": 1.74, + "grad_norm": 0.5968731045722961, + "learning_rate": 0.000225507865851805, + "loss": 3.1644, + "step": 35497 + }, + { + "epoch": 1.74, + "grad_norm": 0.5831536054611206, + "learning_rate": 0.00022549295194724322, + "loss": 2.9254, + "step": 35498 + }, + { + "epoch": 1.74, + "grad_norm": 0.6409755945205688, + "learning_rate": 0.0002254780382389187, + "loss": 3.0804, + "step": 35499 + }, + { + "epoch": 1.74, + "grad_norm": 0.6048734188079834, + "learning_rate": 0.0002254631247268706, + "loss": 2.9089, + "step": 35500 + }, + { + "epoch": 1.74, + "grad_norm": 0.6509040594100952, + "learning_rate": 0.00022544821141113832, + "loss": 2.6318, + "step": 35501 + }, + { + "epoch": 1.74, + "grad_norm": 0.637199342250824, + "learning_rate": 0.00022543329829176115, + "loss": 3.145, + "step": 35502 + }, + { + "epoch": 1.74, + "grad_norm": 0.6597538590431213, + "learning_rate": 0.00022541838536877822, + "loss": 2.7563, + "step": 35503 + }, + { + "epoch": 1.74, + "grad_norm": 0.6191924214363098, + "learning_rate": 0.00022540347264222904, + "loss": 3.2077, + "step": 35504 + }, + { + "epoch": 1.74, + "grad_norm": 0.6519606113433838, + "learning_rate": 0.00022538856011215265, + "loss": 2.8337, + "step": 35505 + }, + { + "epoch": 1.74, + "grad_norm": 0.6172239184379578, + "learning_rate": 0.00022537364777858843, + "loss": 3.2083, + "step": 35506 + }, + { + "epoch": 1.74, + "grad_norm": 0.6710629463195801, + "learning_rate": 0.00022535873564157575, + "loss": 2.7566, + "step": 35507 + }, + { + "epoch": 1.74, + "grad_norm": 0.6321614980697632, + "learning_rate": 0.00022534382370115375, + "loss": 2.8906, + "step": 35508 + }, + { + "epoch": 1.74, + "grad_norm": 0.6277992129325867, + "learning_rate": 0.00022532891195736182, + "loss": 3.0721, + "step": 35509 + }, + { + "epoch": 1.74, + "grad_norm": 0.6334667205810547, + "learning_rate": 0.000225314000410239, + "loss": 2.9217, + "step": 35510 + }, + { + "epoch": 1.74, + "grad_norm": 0.6273242831230164, + "learning_rate": 0.00022529908905982477, + "loss": 2.844, + "step": 35511 + }, + { + "epoch": 1.74, + "grad_norm": 0.6677801012992859, + "learning_rate": 0.0002252841779061585, + "loss": 3.0954, + "step": 35512 + }, + { + "epoch": 1.74, + "grad_norm": 0.6611440181732178, + "learning_rate": 0.00022526926694927922, + "loss": 3.0319, + "step": 35513 + }, + { + "epoch": 1.74, + "grad_norm": 0.6573300957679749, + "learning_rate": 0.00022525435618922637, + "loss": 3.0643, + "step": 35514 + }, + { + "epoch": 1.74, + "grad_norm": 0.6609066724777222, + "learning_rate": 0.00022523944562603904, + "loss": 3.2395, + "step": 35515 + }, + { + "epoch": 1.74, + "grad_norm": 0.6033406853675842, + "learning_rate": 0.0002252245352597567, + "loss": 3.2297, + "step": 35516 + }, + { + "epoch": 1.74, + "grad_norm": 0.6859046220779419, + "learning_rate": 0.00022520962509041864, + "loss": 2.9935, + "step": 35517 + }, + { + "epoch": 1.74, + "grad_norm": 0.6412832140922546, + "learning_rate": 0.00022519471511806384, + "loss": 2.8779, + "step": 35518 + }, + { + "epoch": 1.74, + "grad_norm": 0.6439999341964722, + "learning_rate": 0.00022517980534273192, + "loss": 2.958, + "step": 35519 + }, + { + "epoch": 1.74, + "grad_norm": 0.6545829772949219, + "learning_rate": 0.00022516489576446196, + "loss": 3.0343, + "step": 35520 + }, + { + "epoch": 1.74, + "grad_norm": 0.6581568121910095, + "learning_rate": 0.0002251499863832933, + "loss": 3.0944, + "step": 35521 + }, + { + "epoch": 1.74, + "grad_norm": 0.5977330207824707, + "learning_rate": 0.00022513507719926505, + "loss": 3.0737, + "step": 35522 + }, + { + "epoch": 1.74, + "grad_norm": 0.611193835735321, + "learning_rate": 0.00022512016821241664, + "loss": 3.0715, + "step": 35523 + }, + { + "epoch": 1.74, + "grad_norm": 0.6156790256500244, + "learning_rate": 0.0002251052594227874, + "loss": 2.9858, + "step": 35524 + }, + { + "epoch": 1.74, + "grad_norm": 0.6435781717300415, + "learning_rate": 0.00022509035083041631, + "loss": 3.1172, + "step": 35525 + }, + { + "epoch": 1.74, + "grad_norm": 0.6316915154457092, + "learning_rate": 0.00022507544243534296, + "loss": 3.0121, + "step": 35526 + }, + { + "epoch": 1.74, + "grad_norm": 0.6427628397941589, + "learning_rate": 0.00022506053423760645, + "loss": 3.1036, + "step": 35527 + }, + { + "epoch": 1.74, + "grad_norm": 0.7132937908172607, + "learning_rate": 0.000225045626237246, + "loss": 3.1381, + "step": 35528 + }, + { + "epoch": 1.74, + "grad_norm": 0.6076539754867554, + "learning_rate": 0.00022503071843430102, + "loss": 2.9185, + "step": 35529 + }, + { + "epoch": 1.74, + "grad_norm": 0.59321129322052, + "learning_rate": 0.0002250158108288107, + "loss": 3.2645, + "step": 35530 + }, + { + "epoch": 1.74, + "grad_norm": 0.6133169531822205, + "learning_rate": 0.00022500090342081433, + "loss": 2.9951, + "step": 35531 + }, + { + "epoch": 1.74, + "grad_norm": 0.5968568325042725, + "learning_rate": 0.00022498599621035103, + "loss": 2.9241, + "step": 35532 + }, + { + "epoch": 1.74, + "grad_norm": 0.6295343637466431, + "learning_rate": 0.00022497108919746018, + "loss": 2.9182, + "step": 35533 + }, + { + "epoch": 1.74, + "grad_norm": 0.6084388494491577, + "learning_rate": 0.0002249561823821812, + "loss": 3.0067, + "step": 35534 + }, + { + "epoch": 1.74, + "grad_norm": 0.6127825379371643, + "learning_rate": 0.00022494127576455305, + "loss": 2.998, + "step": 35535 + }, + { + "epoch": 1.74, + "grad_norm": 0.6642194986343384, + "learning_rate": 0.00022492636934461524, + "loss": 2.9918, + "step": 35536 + }, + { + "epoch": 1.74, + "grad_norm": 0.5972631573677063, + "learning_rate": 0.00022491146312240683, + "loss": 2.8956, + "step": 35537 + }, + { + "epoch": 1.74, + "grad_norm": 0.6442238092422485, + "learning_rate": 0.0002248965570979672, + "loss": 2.8439, + "step": 35538 + }, + { + "epoch": 1.74, + "grad_norm": 0.6403670907020569, + "learning_rate": 0.00022488165127133565, + "loss": 2.867, + "step": 35539 + }, + { + "epoch": 1.74, + "grad_norm": 0.6831574440002441, + "learning_rate": 0.00022486674564255128, + "loss": 3.0751, + "step": 35540 + }, + { + "epoch": 1.74, + "grad_norm": 0.6002147793769836, + "learning_rate": 0.0002248518402116535, + "loss": 3.0445, + "step": 35541 + }, + { + "epoch": 1.74, + "grad_norm": 0.6270040273666382, + "learning_rate": 0.0002248369349786815, + "loss": 3.0538, + "step": 35542 + }, + { + "epoch": 1.74, + "grad_norm": 0.6381674408912659, + "learning_rate": 0.00022482202994367448, + "loss": 3.0805, + "step": 35543 + }, + { + "epoch": 1.74, + "grad_norm": 0.6307418942451477, + "learning_rate": 0.00022480712510667195, + "loss": 3.0251, + "step": 35544 + }, + { + "epoch": 1.74, + "grad_norm": 0.6338752508163452, + "learning_rate": 0.00022479222046771282, + "loss": 3.2437, + "step": 35545 + }, + { + "epoch": 1.74, + "grad_norm": 0.5932478904724121, + "learning_rate": 0.00022477731602683662, + "loss": 2.7564, + "step": 35546 + }, + { + "epoch": 1.74, + "grad_norm": 0.6139539480209351, + "learning_rate": 0.00022476241178408232, + "loss": 3.0832, + "step": 35547 + }, + { + "epoch": 1.74, + "grad_norm": 0.6189956068992615, + "learning_rate": 0.00022474750773948944, + "loss": 3.0366, + "step": 35548 + }, + { + "epoch": 1.74, + "grad_norm": 0.6055632829666138, + "learning_rate": 0.0002247326038930972, + "loss": 3.0145, + "step": 35549 + }, + { + "epoch": 1.74, + "grad_norm": 0.6140079498291016, + "learning_rate": 0.0002247177002449447, + "loss": 3.0551, + "step": 35550 + }, + { + "epoch": 1.74, + "grad_norm": 0.656378984451294, + "learning_rate": 0.00022470279679507143, + "loss": 3.0311, + "step": 35551 + }, + { + "epoch": 1.74, + "grad_norm": 0.668050229549408, + "learning_rate": 0.0002246878935435163, + "loss": 3.0796, + "step": 35552 + }, + { + "epoch": 1.74, + "grad_norm": 0.6154969930648804, + "learning_rate": 0.00022467299049031887, + "loss": 3.0686, + "step": 35553 + }, + { + "epoch": 1.74, + "grad_norm": 0.6256099343299866, + "learning_rate": 0.0002246580876355184, + "loss": 2.9475, + "step": 35554 + }, + { + "epoch": 1.74, + "grad_norm": 0.6148830056190491, + "learning_rate": 0.0002246431849791538, + "loss": 2.8056, + "step": 35555 + }, + { + "epoch": 1.74, + "grad_norm": 0.6625372171401978, + "learning_rate": 0.00022462828252126473, + "loss": 2.8291, + "step": 35556 + }, + { + "epoch": 1.74, + "grad_norm": 0.6332937479019165, + "learning_rate": 0.0002246133802618902, + "loss": 3.0003, + "step": 35557 + }, + { + "epoch": 1.74, + "grad_norm": 0.5910854339599609, + "learning_rate": 0.0002245984782010694, + "loss": 3.0787, + "step": 35558 + }, + { + "epoch": 1.74, + "grad_norm": 0.6351834535598755, + "learning_rate": 0.0002245835763388419, + "loss": 3.2896, + "step": 35559 + }, + { + "epoch": 1.74, + "grad_norm": 0.6503569483757019, + "learning_rate": 0.00022456867467524663, + "loss": 3.0756, + "step": 35560 + }, + { + "epoch": 1.74, + "grad_norm": 0.686381995677948, + "learning_rate": 0.00022455377321032306, + "loss": 2.7848, + "step": 35561 + }, + { + "epoch": 1.74, + "grad_norm": 0.6213240027427673, + "learning_rate": 0.00022453887194411016, + "loss": 3.0433, + "step": 35562 + }, + { + "epoch": 1.74, + "grad_norm": 0.6782155632972717, + "learning_rate": 0.00022452397087664743, + "loss": 3.0367, + "step": 35563 + }, + { + "epoch": 1.74, + "grad_norm": 0.8070650100708008, + "learning_rate": 0.00022450907000797412, + "loss": 3.0191, + "step": 35564 + }, + { + "epoch": 1.74, + "grad_norm": 0.6163588166236877, + "learning_rate": 0.00022449416933812924, + "loss": 2.9667, + "step": 35565 + }, + { + "epoch": 1.74, + "grad_norm": 0.6353696584701538, + "learning_rate": 0.00022447926886715231, + "loss": 3.0485, + "step": 35566 + }, + { + "epoch": 1.74, + "grad_norm": 0.6798037886619568, + "learning_rate": 0.0002244643685950824, + "loss": 2.9272, + "step": 35567 + }, + { + "epoch": 1.74, + "grad_norm": 0.5956858992576599, + "learning_rate": 0.00022444946852195874, + "loss": 3.0098, + "step": 35568 + }, + { + "epoch": 1.74, + "grad_norm": 0.6191090941429138, + "learning_rate": 0.00022443456864782078, + "loss": 2.8499, + "step": 35569 + }, + { + "epoch": 1.74, + "grad_norm": 0.6584221720695496, + "learning_rate": 0.00022441966897270753, + "loss": 3.2436, + "step": 35570 + }, + { + "epoch": 1.74, + "grad_norm": 0.6226842403411865, + "learning_rate": 0.0002244047694966584, + "loss": 3.0886, + "step": 35571 + }, + { + "epoch": 1.74, + "grad_norm": 0.6515241265296936, + "learning_rate": 0.00022438987021971242, + "loss": 3.0962, + "step": 35572 + }, + { + "epoch": 1.74, + "grad_norm": 0.6351691484451294, + "learning_rate": 0.000224374971141909, + "loss": 2.9709, + "step": 35573 + }, + { + "epoch": 1.74, + "grad_norm": 0.6209082007408142, + "learning_rate": 0.00022436007226328748, + "loss": 2.9444, + "step": 35574 + }, + { + "epoch": 1.74, + "grad_norm": 0.5917410850524902, + "learning_rate": 0.00022434517358388688, + "loss": 3.1332, + "step": 35575 + }, + { + "epoch": 1.74, + "grad_norm": 0.6047651171684265, + "learning_rate": 0.0002243302751037466, + "loss": 2.9769, + "step": 35576 + }, + { + "epoch": 1.74, + "grad_norm": 0.5979304909706116, + "learning_rate": 0.0002243153768229057, + "loss": 3.0874, + "step": 35577 + }, + { + "epoch": 1.74, + "grad_norm": 0.6439438462257385, + "learning_rate": 0.00022430047874140357, + "loss": 2.9494, + "step": 35578 + }, + { + "epoch": 1.74, + "grad_norm": 0.590167760848999, + "learning_rate": 0.0002242855808592795, + "loss": 2.9652, + "step": 35579 + }, + { + "epoch": 1.74, + "grad_norm": 0.6090808510780334, + "learning_rate": 0.00022427068317657252, + "loss": 2.9947, + "step": 35580 + }, + { + "epoch": 1.74, + "grad_norm": 0.607502281665802, + "learning_rate": 0.0002242557856933221, + "loss": 2.9917, + "step": 35581 + }, + { + "epoch": 1.74, + "grad_norm": 0.6465458273887634, + "learning_rate": 0.0002242408884095673, + "loss": 2.9978, + "step": 35582 + }, + { + "epoch": 1.74, + "grad_norm": 0.6103089451789856, + "learning_rate": 0.00022422599132534737, + "loss": 2.8718, + "step": 35583 + }, + { + "epoch": 1.74, + "grad_norm": 0.6132879853248596, + "learning_rate": 0.0002242110944407017, + "loss": 2.7641, + "step": 35584 + }, + { + "epoch": 1.74, + "grad_norm": 0.6607589721679688, + "learning_rate": 0.00022419619775566934, + "loss": 3.0388, + "step": 35585 + }, + { + "epoch": 1.74, + "grad_norm": 0.6323291063308716, + "learning_rate": 0.00022418130127028972, + "loss": 3.0716, + "step": 35586 + }, + { + "epoch": 1.74, + "grad_norm": 0.6502161622047424, + "learning_rate": 0.0002241664049846018, + "loss": 3.0507, + "step": 35587 + }, + { + "epoch": 1.74, + "grad_norm": 0.6421080231666565, + "learning_rate": 0.00022415150889864503, + "loss": 3.0403, + "step": 35588 + }, + { + "epoch": 1.74, + "grad_norm": 0.6806257367134094, + "learning_rate": 0.00022413661301245872, + "loss": 2.9999, + "step": 35589 + }, + { + "epoch": 1.74, + "grad_norm": 0.6115500330924988, + "learning_rate": 0.00022412171732608176, + "loss": 2.9546, + "step": 35590 + }, + { + "epoch": 1.74, + "grad_norm": 0.6313839554786682, + "learning_rate": 0.00022410682183955375, + "loss": 3.0004, + "step": 35591 + }, + { + "epoch": 1.74, + "grad_norm": 0.6472813487052917, + "learning_rate": 0.0002240919265529137, + "loss": 3.1098, + "step": 35592 + }, + { + "epoch": 1.74, + "grad_norm": 0.6413887143135071, + "learning_rate": 0.00022407703146620086, + "loss": 3.1327, + "step": 35593 + }, + { + "epoch": 1.74, + "grad_norm": 0.5937476754188538, + "learning_rate": 0.00022406213657945464, + "loss": 2.833, + "step": 35594 + }, + { + "epoch": 1.74, + "grad_norm": 0.6520037055015564, + "learning_rate": 0.00022404724189271398, + "loss": 3.0171, + "step": 35595 + }, + { + "epoch": 1.74, + "grad_norm": 0.6196516156196594, + "learning_rate": 0.0002240323474060184, + "loss": 2.9223, + "step": 35596 + }, + { + "epoch": 1.74, + "grad_norm": 0.6525871753692627, + "learning_rate": 0.00022401745311940695, + "loss": 3.1616, + "step": 35597 + }, + { + "epoch": 1.74, + "grad_norm": 0.617184042930603, + "learning_rate": 0.00022400255903291886, + "loss": 3.071, + "step": 35598 + }, + { + "epoch": 1.74, + "grad_norm": 0.6290184259414673, + "learning_rate": 0.00022398766514659347, + "loss": 3.1684, + "step": 35599 + }, + { + "epoch": 1.74, + "grad_norm": 0.6642170548439026, + "learning_rate": 0.00022397277146046996, + "loss": 2.8508, + "step": 35600 + }, + { + "epoch": 1.74, + "grad_norm": 0.6329320073127747, + "learning_rate": 0.0002239578779745875, + "loss": 2.8697, + "step": 35601 + }, + { + "epoch": 1.74, + "grad_norm": 0.6439346671104431, + "learning_rate": 0.00022394298468898533, + "loss": 3.0158, + "step": 35602 + }, + { + "epoch": 1.74, + "grad_norm": 0.6102849245071411, + "learning_rate": 0.0002239280916037028, + "loss": 3.0351, + "step": 35603 + }, + { + "epoch": 1.74, + "grad_norm": 0.6272664070129395, + "learning_rate": 0.00022391319871877888, + "loss": 2.9033, + "step": 35604 + }, + { + "epoch": 1.74, + "grad_norm": 0.6411396265029907, + "learning_rate": 0.000223898306034253, + "loss": 2.9436, + "step": 35605 + }, + { + "epoch": 1.74, + "grad_norm": 0.6268124580383301, + "learning_rate": 0.00022388341355016443, + "loss": 2.9714, + "step": 35606 + }, + { + "epoch": 1.75, + "grad_norm": 0.608563244342804, + "learning_rate": 0.00022386852126655223, + "loss": 3.1994, + "step": 35607 + }, + { + "epoch": 1.75, + "grad_norm": 0.6222233176231384, + "learning_rate": 0.00022385362918345573, + "loss": 2.9368, + "step": 35608 + }, + { + "epoch": 1.75, + "grad_norm": 0.6342154145240784, + "learning_rate": 0.00022383873730091403, + "loss": 2.9475, + "step": 35609 + }, + { + "epoch": 1.75, + "grad_norm": 0.6370996832847595, + "learning_rate": 0.00022382384561896648, + "loss": 2.7946, + "step": 35610 + }, + { + "epoch": 1.75, + "grad_norm": 0.6122251749038696, + "learning_rate": 0.0002238089541376523, + "loss": 2.809, + "step": 35611 + }, + { + "epoch": 1.75, + "grad_norm": 0.6048957109451294, + "learning_rate": 0.0002237940628570106, + "loss": 2.9704, + "step": 35612 + }, + { + "epoch": 1.75, + "grad_norm": 0.619551420211792, + "learning_rate": 0.0002237791717770808, + "loss": 2.8918, + "step": 35613 + }, + { + "epoch": 1.75, + "grad_norm": 0.6651395559310913, + "learning_rate": 0.0002237642808979018, + "loss": 3.0585, + "step": 35614 + }, + { + "epoch": 1.75, + "grad_norm": 0.6424996852874756, + "learning_rate": 0.00022374939021951314, + "loss": 2.9588, + "step": 35615 + }, + { + "epoch": 1.75, + "grad_norm": 0.678436815738678, + "learning_rate": 0.00022373449974195396, + "loss": 2.9305, + "step": 35616 + }, + { + "epoch": 1.75, + "grad_norm": 0.6074570417404175, + "learning_rate": 0.0002237196094652633, + "loss": 2.7319, + "step": 35617 + }, + { + "epoch": 1.75, + "grad_norm": 0.6088811755180359, + "learning_rate": 0.0002237047193894806, + "loss": 2.9016, + "step": 35618 + }, + { + "epoch": 1.75, + "grad_norm": 0.6111059188842773, + "learning_rate": 0.00022368982951464497, + "loss": 3.071, + "step": 35619 + }, + { + "epoch": 1.75, + "grad_norm": 0.594634473323822, + "learning_rate": 0.00022367493984079553, + "loss": 3.0388, + "step": 35620 + }, + { + "epoch": 1.75, + "grad_norm": 0.5961778163909912, + "learning_rate": 0.0002236600503679718, + "loss": 3.0913, + "step": 35621 + }, + { + "epoch": 1.75, + "grad_norm": 0.6522762179374695, + "learning_rate": 0.0002236451610962127, + "loss": 2.9231, + "step": 35622 + }, + { + "epoch": 1.75, + "grad_norm": 0.6687374711036682, + "learning_rate": 0.00022363027202555762, + "loss": 3.0651, + "step": 35623 + }, + { + "epoch": 1.75, + "grad_norm": 0.6435642838478088, + "learning_rate": 0.00022361538315604558, + "loss": 2.9751, + "step": 35624 + }, + { + "epoch": 1.75, + "grad_norm": 0.667826235294342, + "learning_rate": 0.00022360049448771598, + "loss": 3.1817, + "step": 35625 + }, + { + "epoch": 1.75, + "grad_norm": 0.6378101706504822, + "learning_rate": 0.00022358560602060807, + "loss": 2.9707, + "step": 35626 + }, + { + "epoch": 1.75, + "grad_norm": 0.620212197303772, + "learning_rate": 0.0002235707177547608, + "loss": 3.1727, + "step": 35627 + }, + { + "epoch": 1.75, + "grad_norm": 0.6127507090568542, + "learning_rate": 0.0002235558296902137, + "loss": 3.027, + "step": 35628 + }, + { + "epoch": 1.75, + "grad_norm": 0.6060869693756104, + "learning_rate": 0.00022354094182700575, + "loss": 3.2523, + "step": 35629 + }, + { + "epoch": 1.75, + "grad_norm": 0.6330663561820984, + "learning_rate": 0.00022352605416517618, + "loss": 2.9645, + "step": 35630 + }, + { + "epoch": 1.75, + "grad_norm": 0.6215522289276123, + "learning_rate": 0.0002235111667047644, + "loss": 2.9589, + "step": 35631 + }, + { + "epoch": 1.75, + "grad_norm": 0.6048828363418579, + "learning_rate": 0.00022349627944580946, + "loss": 2.9766, + "step": 35632 + }, + { + "epoch": 1.75, + "grad_norm": 0.607565701007843, + "learning_rate": 0.00022348139238835064, + "loss": 3.177, + "step": 35633 + }, + { + "epoch": 1.75, + "grad_norm": 0.6367427110671997, + "learning_rate": 0.00022346650553242695, + "loss": 3.0012, + "step": 35634 + }, + { + "epoch": 1.75, + "grad_norm": 0.6365235447883606, + "learning_rate": 0.00022345161887807774, + "loss": 3.0022, + "step": 35635 + }, + { + "epoch": 1.75, + "grad_norm": 0.6165895462036133, + "learning_rate": 0.00022343673242534244, + "loss": 2.8973, + "step": 35636 + }, + { + "epoch": 1.75, + "grad_norm": 0.6315513849258423, + "learning_rate": 0.00022342184617425993, + "loss": 2.887, + "step": 35637 + }, + { + "epoch": 1.75, + "grad_norm": 0.6446321606636047, + "learning_rate": 0.0002234069601248696, + "loss": 3.0301, + "step": 35638 + }, + { + "epoch": 1.75, + "grad_norm": 0.6023744940757751, + "learning_rate": 0.00022339207427721049, + "loss": 2.8683, + "step": 35639 + }, + { + "epoch": 1.75, + "grad_norm": 0.6446340680122375, + "learning_rate": 0.00022337718863132196, + "loss": 3.1663, + "step": 35640 + }, + { + "epoch": 1.75, + "grad_norm": 0.6004554629325867, + "learning_rate": 0.00022336230318724325, + "loss": 3.0679, + "step": 35641 + }, + { + "epoch": 1.75, + "grad_norm": 0.6370178461074829, + "learning_rate": 0.00022334741794501333, + "loss": 2.9213, + "step": 35642 + }, + { + "epoch": 1.75, + "grad_norm": 0.6428967118263245, + "learning_rate": 0.0002233325329046717, + "loss": 2.9016, + "step": 35643 + }, + { + "epoch": 1.75, + "grad_norm": 0.685459554195404, + "learning_rate": 0.0002233176480662574, + "loss": 2.7608, + "step": 35644 + }, + { + "epoch": 1.75, + "grad_norm": 0.6399477124214172, + "learning_rate": 0.00022330276342980956, + "loss": 2.9918, + "step": 35645 + }, + { + "epoch": 1.75, + "grad_norm": 0.64689040184021, + "learning_rate": 0.0002232878789953676, + "loss": 3.0101, + "step": 35646 + }, + { + "epoch": 1.75, + "grad_norm": 0.6764118671417236, + "learning_rate": 0.00022327299476297054, + "loss": 3.169, + "step": 35647 + }, + { + "epoch": 1.75, + "grad_norm": 0.6111961603164673, + "learning_rate": 0.00022325811073265774, + "loss": 2.777, + "step": 35648 + }, + { + "epoch": 1.75, + "grad_norm": 0.6330588459968567, + "learning_rate": 0.00022324322690446815, + "loss": 2.8875, + "step": 35649 + }, + { + "epoch": 1.75, + "grad_norm": 0.6471860408782959, + "learning_rate": 0.00022322834327844117, + "loss": 3.055, + "step": 35650 + }, + { + "epoch": 1.75, + "grad_norm": 0.6193934679031372, + "learning_rate": 0.00022321345985461605, + "loss": 2.9988, + "step": 35651 + }, + { + "epoch": 1.75, + "grad_norm": 0.627863883972168, + "learning_rate": 0.00022319857663303177, + "loss": 3.1429, + "step": 35652 + }, + { + "epoch": 1.75, + "grad_norm": 0.6523374915122986, + "learning_rate": 0.0002231836936137278, + "loss": 3.1287, + "step": 35653 + }, + { + "epoch": 1.75, + "grad_norm": 0.6731512546539307, + "learning_rate": 0.00022316881079674305, + "loss": 2.9187, + "step": 35654 + }, + { + "epoch": 1.75, + "grad_norm": 0.6700525879859924, + "learning_rate": 0.00022315392818211694, + "loss": 2.8942, + "step": 35655 + }, + { + "epoch": 1.75, + "grad_norm": 0.6407337784767151, + "learning_rate": 0.00022313904576988866, + "loss": 3.2872, + "step": 35656 + }, + { + "epoch": 1.75, + "grad_norm": 0.6617950797080994, + "learning_rate": 0.00022312416356009722, + "loss": 3.0562, + "step": 35657 + }, + { + "epoch": 1.75, + "grad_norm": 0.6472514867782593, + "learning_rate": 0.000223109281552782, + "loss": 3.138, + "step": 35658 + }, + { + "epoch": 1.75, + "grad_norm": 0.669858992099762, + "learning_rate": 0.00022309439974798213, + "loss": 2.8543, + "step": 35659 + }, + { + "epoch": 1.75, + "grad_norm": 0.5877886414527893, + "learning_rate": 0.00022307951814573676, + "loss": 3.2045, + "step": 35660 + }, + { + "epoch": 1.75, + "grad_norm": 0.5913372039794922, + "learning_rate": 0.00022306463674608525, + "loss": 3.2414, + "step": 35661 + }, + { + "epoch": 1.75, + "grad_norm": 0.6737897992134094, + "learning_rate": 0.0002230497555490666, + "loss": 3.0833, + "step": 35662 + }, + { + "epoch": 1.75, + "grad_norm": 0.6321114897727966, + "learning_rate": 0.00022303487455472017, + "loss": 2.9484, + "step": 35663 + }, + { + "epoch": 1.75, + "grad_norm": 0.6263748407363892, + "learning_rate": 0.00022301999376308492, + "loss": 3.1179, + "step": 35664 + }, + { + "epoch": 1.75, + "grad_norm": 0.7027498483657837, + "learning_rate": 0.00022300511317420023, + "loss": 2.8271, + "step": 35665 + }, + { + "epoch": 1.75, + "grad_norm": 0.6153057813644409, + "learning_rate": 0.0002229902327881054, + "loss": 3.2247, + "step": 35666 + }, + { + "epoch": 1.75, + "grad_norm": 0.6329478621482849, + "learning_rate": 0.0002229753526048393, + "loss": 2.8801, + "step": 35667 + }, + { + "epoch": 1.75, + "grad_norm": 0.6193848848342896, + "learning_rate": 0.0002229604726244414, + "loss": 2.9691, + "step": 35668 + }, + { + "epoch": 1.75, + "grad_norm": 0.6393882036209106, + "learning_rate": 0.00022294559284695076, + "loss": 2.9422, + "step": 35669 + }, + { + "epoch": 1.75, + "grad_norm": 0.6423365473747253, + "learning_rate": 0.00022293071327240656, + "loss": 3.0597, + "step": 35670 + }, + { + "epoch": 1.75, + "grad_norm": 0.6023625135421753, + "learning_rate": 0.00022291583390084808, + "loss": 2.9063, + "step": 35671 + }, + { + "epoch": 1.75, + "grad_norm": 0.6385714411735535, + "learning_rate": 0.00022290095473231444, + "loss": 2.9283, + "step": 35672 + }, + { + "epoch": 1.75, + "grad_norm": 0.6258054971694946, + "learning_rate": 0.0002228860757668449, + "loss": 3.0131, + "step": 35673 + }, + { + "epoch": 1.75, + "grad_norm": 0.6036873459815979, + "learning_rate": 0.0002228711970044785, + "loss": 3.2195, + "step": 35674 + }, + { + "epoch": 1.75, + "grad_norm": 0.6359200477600098, + "learning_rate": 0.0002228563184452545, + "loss": 3.214, + "step": 35675 + }, + { + "epoch": 1.75, + "grad_norm": 0.6024333834648132, + "learning_rate": 0.00022284144008921224, + "loss": 2.9977, + "step": 35676 + }, + { + "epoch": 1.75, + "grad_norm": 0.6681496500968933, + "learning_rate": 0.00022282656193639067, + "loss": 3.1888, + "step": 35677 + }, + { + "epoch": 1.75, + "grad_norm": 0.6251303553581238, + "learning_rate": 0.0002228116839868292, + "loss": 2.8912, + "step": 35678 + }, + { + "epoch": 1.75, + "grad_norm": 0.6392276287078857, + "learning_rate": 0.00022279680624056678, + "loss": 3.1644, + "step": 35679 + }, + { + "epoch": 1.75, + "grad_norm": 0.6303515434265137, + "learning_rate": 0.00022278192869764282, + "loss": 3.0278, + "step": 35680 + }, + { + "epoch": 1.75, + "grad_norm": 0.6725468635559082, + "learning_rate": 0.00022276705135809631, + "loss": 3.1027, + "step": 35681 + }, + { + "epoch": 1.75, + "grad_norm": 0.6221615672111511, + "learning_rate": 0.0002227521742219665, + "loss": 3.0331, + "step": 35682 + }, + { + "epoch": 1.75, + "grad_norm": 0.6534598469734192, + "learning_rate": 0.00022273729728929272, + "loss": 3.0542, + "step": 35683 + }, + { + "epoch": 1.75, + "grad_norm": 0.6483638286590576, + "learning_rate": 0.00022272242056011396, + "loss": 2.9716, + "step": 35684 + }, + { + "epoch": 1.75, + "grad_norm": 0.636474072933197, + "learning_rate": 0.0002227075440344695, + "loss": 3.0775, + "step": 35685 + }, + { + "epoch": 1.75, + "grad_norm": 0.6173744201660156, + "learning_rate": 0.00022269266771239842, + "loss": 2.879, + "step": 35686 + }, + { + "epoch": 1.75, + "grad_norm": 0.6293776631355286, + "learning_rate": 0.00022267779159394, + "loss": 3.0981, + "step": 35687 + }, + { + "epoch": 1.75, + "grad_norm": 0.624398946762085, + "learning_rate": 0.0002226629156791335, + "loss": 2.7682, + "step": 35688 + }, + { + "epoch": 1.75, + "grad_norm": 0.647941529750824, + "learning_rate": 0.00022264803996801787, + "loss": 3.103, + "step": 35689 + }, + { + "epoch": 1.75, + "grad_norm": 0.6048708558082581, + "learning_rate": 0.00022263316446063247, + "loss": 2.9964, + "step": 35690 + }, + { + "epoch": 1.75, + "grad_norm": 0.6444445848464966, + "learning_rate": 0.00022261828915701642, + "loss": 3.1067, + "step": 35691 + }, + { + "epoch": 1.75, + "grad_norm": 0.6575992703437805, + "learning_rate": 0.00022260341405720886, + "loss": 3.024, + "step": 35692 + }, + { + "epoch": 1.75, + "grad_norm": 0.6296408772468567, + "learning_rate": 0.00022258853916124908, + "loss": 3.1385, + "step": 35693 + }, + { + "epoch": 1.75, + "grad_norm": 0.6155877709388733, + "learning_rate": 0.00022257366446917616, + "loss": 3.0299, + "step": 35694 + }, + { + "epoch": 1.75, + "grad_norm": 0.6629244089126587, + "learning_rate": 0.00022255878998102936, + "loss": 2.886, + "step": 35695 + }, + { + "epoch": 1.75, + "grad_norm": 0.6616221070289612, + "learning_rate": 0.00022254391569684768, + "loss": 3.066, + "step": 35696 + }, + { + "epoch": 1.75, + "grad_norm": 0.630438506603241, + "learning_rate": 0.00022252904161667043, + "loss": 3.0545, + "step": 35697 + }, + { + "epoch": 1.75, + "grad_norm": 0.6666058301925659, + "learning_rate": 0.00022251416774053686, + "loss": 2.9689, + "step": 35698 + }, + { + "epoch": 1.75, + "grad_norm": 0.6253411173820496, + "learning_rate": 0.00022249929406848609, + "loss": 2.8768, + "step": 35699 + }, + { + "epoch": 1.75, + "grad_norm": 0.6282232403755188, + "learning_rate": 0.00022248442060055724, + "loss": 2.9593, + "step": 35700 + }, + { + "epoch": 1.75, + "grad_norm": 0.5989751219749451, + "learning_rate": 0.0002224695473367894, + "loss": 2.8903, + "step": 35701 + }, + { + "epoch": 1.75, + "grad_norm": 0.6522663831710815, + "learning_rate": 0.00022245467427722194, + "loss": 2.9982, + "step": 35702 + }, + { + "epoch": 1.75, + "grad_norm": 0.6247538328170776, + "learning_rate": 0.00022243980142189397, + "loss": 2.8845, + "step": 35703 + }, + { + "epoch": 1.75, + "grad_norm": 0.6098034977912903, + "learning_rate": 0.00022242492877084453, + "loss": 2.941, + "step": 35704 + }, + { + "epoch": 1.75, + "grad_norm": 0.6762757301330566, + "learning_rate": 0.00022241005632411305, + "loss": 3.0948, + "step": 35705 + }, + { + "epoch": 1.75, + "grad_norm": 0.6396958827972412, + "learning_rate": 0.00022239518408173842, + "loss": 3.0555, + "step": 35706 + }, + { + "epoch": 1.75, + "grad_norm": 0.6262756586074829, + "learning_rate": 0.00022238031204375994, + "loss": 3.0633, + "step": 35707 + }, + { + "epoch": 1.75, + "grad_norm": 0.6355347037315369, + "learning_rate": 0.0002223654402102169, + "loss": 3.2732, + "step": 35708 + }, + { + "epoch": 1.75, + "grad_norm": 0.6595379114151001, + "learning_rate": 0.00022235056858114824, + "loss": 2.9131, + "step": 35709 + }, + { + "epoch": 1.75, + "grad_norm": 0.5768058896064758, + "learning_rate": 0.00022233569715659335, + "loss": 3.0761, + "step": 35710 + }, + { + "epoch": 1.75, + "grad_norm": 0.6388674974441528, + "learning_rate": 0.00022232082593659112, + "loss": 3.2021, + "step": 35711 + }, + { + "epoch": 1.75, + "grad_norm": 0.6107366681098938, + "learning_rate": 0.00022230595492118098, + "loss": 2.9706, + "step": 35712 + }, + { + "epoch": 1.75, + "grad_norm": 0.6297752857208252, + "learning_rate": 0.00022229108411040205, + "loss": 3.1894, + "step": 35713 + }, + { + "epoch": 1.75, + "grad_norm": 0.6354250311851501, + "learning_rate": 0.00022227621350429333, + "loss": 3.2159, + "step": 35714 + }, + { + "epoch": 1.75, + "grad_norm": 0.6811100244522095, + "learning_rate": 0.00022226134310289426, + "loss": 3.0646, + "step": 35715 + }, + { + "epoch": 1.75, + "grad_norm": 0.6282296776771545, + "learning_rate": 0.00022224647290624365, + "loss": 3.1562, + "step": 35716 + }, + { + "epoch": 1.75, + "grad_norm": 0.6105203628540039, + "learning_rate": 0.00022223160291438097, + "loss": 3.07, + "step": 35717 + }, + { + "epoch": 1.75, + "grad_norm": 0.5900524258613586, + "learning_rate": 0.00022221673312734536, + "loss": 3.1058, + "step": 35718 + }, + { + "epoch": 1.75, + "grad_norm": 0.6298518180847168, + "learning_rate": 0.00022220186354517576, + "loss": 3.0704, + "step": 35719 + }, + { + "epoch": 1.75, + "grad_norm": 0.6291689872741699, + "learning_rate": 0.00022218699416791163, + "loss": 2.9188, + "step": 35720 + }, + { + "epoch": 1.75, + "grad_norm": 0.5981619358062744, + "learning_rate": 0.00022217212499559192, + "loss": 3.0848, + "step": 35721 + }, + { + "epoch": 1.75, + "grad_norm": 0.6223424673080444, + "learning_rate": 0.00022215725602825573, + "loss": 2.9289, + "step": 35722 + }, + { + "epoch": 1.75, + "grad_norm": 0.6437315344810486, + "learning_rate": 0.00022214238726594253, + "loss": 3.2998, + "step": 35723 + }, + { + "epoch": 1.75, + "grad_norm": 0.6619381904602051, + "learning_rate": 0.0002221275187086912, + "loss": 2.8725, + "step": 35724 + }, + { + "epoch": 1.75, + "grad_norm": 0.624908983707428, + "learning_rate": 0.0002221126503565411, + "loss": 3.1948, + "step": 35725 + }, + { + "epoch": 1.75, + "grad_norm": 0.6466329097747803, + "learning_rate": 0.00022209778220953115, + "loss": 2.9206, + "step": 35726 + }, + { + "epoch": 1.75, + "grad_norm": 0.6283143162727356, + "learning_rate": 0.00022208291426770068, + "loss": 3.0594, + "step": 35727 + }, + { + "epoch": 1.75, + "grad_norm": 0.6625270843505859, + "learning_rate": 0.00022206804653108895, + "loss": 2.9291, + "step": 35728 + }, + { + "epoch": 1.75, + "grad_norm": 0.6456392407417297, + "learning_rate": 0.0002220531789997348, + "loss": 2.9168, + "step": 35729 + }, + { + "epoch": 1.75, + "grad_norm": 2.799959659576416, + "learning_rate": 0.00022203831167367773, + "loss": 3.0285, + "step": 35730 + }, + { + "epoch": 1.75, + "grad_norm": 0.6306584477424622, + "learning_rate": 0.00022202344455295663, + "loss": 2.7562, + "step": 35731 + }, + { + "epoch": 1.75, + "grad_norm": 0.7043753266334534, + "learning_rate": 0.00022200857763761078, + "loss": 2.8602, + "step": 35732 + }, + { + "epoch": 1.75, + "grad_norm": 0.656856119632721, + "learning_rate": 0.00022199371092767944, + "loss": 2.9566, + "step": 35733 + }, + { + "epoch": 1.75, + "grad_norm": 0.638849139213562, + "learning_rate": 0.00022197884442320154, + "loss": 2.9435, + "step": 35734 + }, + { + "epoch": 1.75, + "grad_norm": 0.6164271831512451, + "learning_rate": 0.00022196397812421646, + "loss": 2.8935, + "step": 35735 + }, + { + "epoch": 1.75, + "grad_norm": 0.6180526614189148, + "learning_rate": 0.0002219491120307631, + "loss": 3.05, + "step": 35736 + }, + { + "epoch": 1.75, + "grad_norm": 0.6781109571456909, + "learning_rate": 0.00022193424614288076, + "loss": 3.0915, + "step": 35737 + }, + { + "epoch": 1.75, + "grad_norm": 0.6297280788421631, + "learning_rate": 0.0002219193804606087, + "loss": 2.8888, + "step": 35738 + }, + { + "epoch": 1.75, + "grad_norm": 0.6124048829078674, + "learning_rate": 0.00022190451498398594, + "loss": 3.0255, + "step": 35739 + }, + { + "epoch": 1.75, + "grad_norm": 0.6476908922195435, + "learning_rate": 0.00022188964971305169, + "loss": 3.111, + "step": 35740 + }, + { + "epoch": 1.75, + "grad_norm": 0.602753221988678, + "learning_rate": 0.0002218747846478449, + "loss": 3.2071, + "step": 35741 + }, + { + "epoch": 1.75, + "grad_norm": 0.6379673480987549, + "learning_rate": 0.00022185991978840503, + "loss": 2.9443, + "step": 35742 + }, + { + "epoch": 1.75, + "grad_norm": 0.675123393535614, + "learning_rate": 0.00022184505513477115, + "loss": 2.9927, + "step": 35743 + }, + { + "epoch": 1.75, + "grad_norm": 0.615736186504364, + "learning_rate": 0.0002218301906869822, + "loss": 2.928, + "step": 35744 + }, + { + "epoch": 1.75, + "grad_norm": 0.6254475116729736, + "learning_rate": 0.0002218153264450776, + "loss": 3.0137, + "step": 35745 + }, + { + "epoch": 1.75, + "grad_norm": 0.5977519750595093, + "learning_rate": 0.00022180046240909634, + "loss": 2.7181, + "step": 35746 + }, + { + "epoch": 1.75, + "grad_norm": 0.659858763217926, + "learning_rate": 0.00022178559857907759, + "loss": 3.1556, + "step": 35747 + }, + { + "epoch": 1.75, + "grad_norm": 0.634911060333252, + "learning_rate": 0.00022177073495506058, + "loss": 3.1182, + "step": 35748 + }, + { + "epoch": 1.75, + "grad_norm": 0.6552413105964661, + "learning_rate": 0.00022175587153708438, + "loss": 3.0402, + "step": 35749 + }, + { + "epoch": 1.75, + "grad_norm": 0.5962386131286621, + "learning_rate": 0.00022174100832518819, + "loss": 3.092, + "step": 35750 + }, + { + "epoch": 1.75, + "grad_norm": 0.6224254965782166, + "learning_rate": 0.00022172614531941102, + "loss": 2.9382, + "step": 35751 + }, + { + "epoch": 1.75, + "grad_norm": 0.6392555236816406, + "learning_rate": 0.00022171128251979218, + "loss": 3.0492, + "step": 35752 + }, + { + "epoch": 1.75, + "grad_norm": 0.6384632587432861, + "learning_rate": 0.0002216964199263708, + "loss": 3.043, + "step": 35753 + }, + { + "epoch": 1.75, + "grad_norm": 0.6350359320640564, + "learning_rate": 0.00022168155753918585, + "loss": 3.0483, + "step": 35754 + }, + { + "epoch": 1.75, + "grad_norm": 0.6259121298789978, + "learning_rate": 0.00022166669535827674, + "loss": 3.1861, + "step": 35755 + }, + { + "epoch": 1.75, + "grad_norm": 0.6385367512702942, + "learning_rate": 0.0002216518333836824, + "loss": 2.9332, + "step": 35756 + }, + { + "epoch": 1.75, + "grad_norm": 0.7472400069236755, + "learning_rate": 0.00022163697161544217, + "loss": 2.9411, + "step": 35757 + }, + { + "epoch": 1.75, + "grad_norm": 0.6142936944961548, + "learning_rate": 0.00022162211005359486, + "loss": 3.0595, + "step": 35758 + }, + { + "epoch": 1.75, + "grad_norm": 0.6409468054771423, + "learning_rate": 0.0002216072486981799, + "loss": 3.0586, + "step": 35759 + }, + { + "epoch": 1.75, + "grad_norm": 0.6137709617614746, + "learning_rate": 0.00022159238754923647, + "loss": 2.9876, + "step": 35760 + }, + { + "epoch": 1.75, + "grad_norm": 0.6863341331481934, + "learning_rate": 0.00022157752660680353, + "loss": 3.1946, + "step": 35761 + }, + { + "epoch": 1.75, + "grad_norm": 0.615785539150238, + "learning_rate": 0.00022156266587092038, + "loss": 2.8858, + "step": 35762 + }, + { + "epoch": 1.75, + "grad_norm": 0.6471198201179504, + "learning_rate": 0.00022154780534162593, + "loss": 3.07, + "step": 35763 + }, + { + "epoch": 1.75, + "grad_norm": 0.6607215404510498, + "learning_rate": 0.0002215329450189595, + "loss": 3.1074, + "step": 35764 + }, + { + "epoch": 1.75, + "grad_norm": 0.6380542516708374, + "learning_rate": 0.0002215180849029603, + "loss": 3.0175, + "step": 35765 + }, + { + "epoch": 1.75, + "grad_norm": 0.5926660895347595, + "learning_rate": 0.00022150322499366718, + "loss": 3.1769, + "step": 35766 + }, + { + "epoch": 1.75, + "grad_norm": 0.6166107058525085, + "learning_rate": 0.00022148836529111964, + "loss": 2.812, + "step": 35767 + }, + { + "epoch": 1.75, + "grad_norm": 0.6104620695114136, + "learning_rate": 0.00022147350579535652, + "loss": 3.1454, + "step": 35768 + }, + { + "epoch": 1.75, + "grad_norm": 0.6072012186050415, + "learning_rate": 0.00022145864650641704, + "loss": 3.0505, + "step": 35769 + }, + { + "epoch": 1.75, + "grad_norm": 0.6567019820213318, + "learning_rate": 0.00022144378742434054, + "loss": 2.9977, + "step": 35770 + }, + { + "epoch": 1.75, + "grad_norm": 0.6445237398147583, + "learning_rate": 0.00022142892854916586, + "loss": 2.9995, + "step": 35771 + }, + { + "epoch": 1.75, + "grad_norm": 0.6039271354675293, + "learning_rate": 0.0002214140698809323, + "loss": 3.1029, + "step": 35772 + }, + { + "epoch": 1.75, + "grad_norm": 0.6350820660591125, + "learning_rate": 0.00022139921141967885, + "loss": 2.942, + "step": 35773 + }, + { + "epoch": 1.75, + "grad_norm": 0.6840111017227173, + "learning_rate": 0.00022138435316544487, + "loss": 2.9071, + "step": 35774 + }, + { + "epoch": 1.75, + "grad_norm": 0.7613235712051392, + "learning_rate": 0.00022136949511826942, + "loss": 2.7632, + "step": 35775 + }, + { + "epoch": 1.75, + "grad_norm": 0.651785671710968, + "learning_rate": 0.00022135463727819142, + "loss": 2.9729, + "step": 35776 + }, + { + "epoch": 1.75, + "grad_norm": 0.62009596824646, + "learning_rate": 0.00022133977964525033, + "loss": 3.1663, + "step": 35777 + }, + { + "epoch": 1.75, + "grad_norm": 0.6254829168319702, + "learning_rate": 0.00022132492221948498, + "loss": 2.9613, + "step": 35778 + }, + { + "epoch": 1.75, + "grad_norm": 0.6415801048278809, + "learning_rate": 0.00022131006500093468, + "loss": 3.0607, + "step": 35779 + }, + { + "epoch": 1.75, + "grad_norm": 0.6973337531089783, + "learning_rate": 0.00022129520798963863, + "loss": 2.6692, + "step": 35780 + }, + { + "epoch": 1.75, + "grad_norm": 0.614383339881897, + "learning_rate": 0.0002212803511856357, + "loss": 2.8457, + "step": 35781 + }, + { + "epoch": 1.75, + "grad_norm": 0.6090778708457947, + "learning_rate": 0.00022126549458896535, + "loss": 3.0376, + "step": 35782 + }, + { + "epoch": 1.75, + "grad_norm": 0.6137256622314453, + "learning_rate": 0.00022125063819966644, + "loss": 2.8509, + "step": 35783 + }, + { + "epoch": 1.75, + "grad_norm": 0.6132293939590454, + "learning_rate": 0.0002212357820177781, + "loss": 3.2059, + "step": 35784 + }, + { + "epoch": 1.75, + "grad_norm": 0.6477642059326172, + "learning_rate": 0.00022122092604333975, + "loss": 3.0761, + "step": 35785 + }, + { + "epoch": 1.75, + "grad_norm": 0.6460610032081604, + "learning_rate": 0.00022120607027639017, + "loss": 3.0912, + "step": 35786 + }, + { + "epoch": 1.75, + "grad_norm": 0.628158450126648, + "learning_rate": 0.00022119121471696874, + "loss": 3.2221, + "step": 35787 + }, + { + "epoch": 1.75, + "grad_norm": 0.6395934820175171, + "learning_rate": 0.00022117635936511437, + "loss": 2.687, + "step": 35788 + }, + { + "epoch": 1.75, + "grad_norm": 0.6341500282287598, + "learning_rate": 0.00022116150422086633, + "loss": 2.9476, + "step": 35789 + }, + { + "epoch": 1.75, + "grad_norm": 0.5902708172798157, + "learning_rate": 0.0002211466492842638, + "loss": 3.0437, + "step": 35790 + }, + { + "epoch": 1.75, + "grad_norm": 0.6335721611976624, + "learning_rate": 0.00022113179455534573, + "loss": 2.9241, + "step": 35791 + }, + { + "epoch": 1.75, + "grad_norm": 0.6263195872306824, + "learning_rate": 0.00022111694003415145, + "loss": 2.9259, + "step": 35792 + }, + { + "epoch": 1.75, + "grad_norm": 0.6031926870346069, + "learning_rate": 0.00022110208572071984, + "loss": 2.9516, + "step": 35793 + }, + { + "epoch": 1.75, + "grad_norm": 0.6146538257598877, + "learning_rate": 0.00022108723161509014, + "loss": 2.9278, + "step": 35794 + }, + { + "epoch": 1.75, + "grad_norm": 0.6239001154899597, + "learning_rate": 0.0002210723777173016, + "loss": 2.8726, + "step": 35795 + }, + { + "epoch": 1.75, + "grad_norm": 0.6321032047271729, + "learning_rate": 0.00022105752402739313, + "loss": 3.0842, + "step": 35796 + }, + { + "epoch": 1.75, + "grad_norm": 0.6525657773017883, + "learning_rate": 0.0002210426705454041, + "loss": 2.8471, + "step": 35797 + }, + { + "epoch": 1.75, + "grad_norm": 0.6774393320083618, + "learning_rate": 0.00022102781727137327, + "loss": 3.2696, + "step": 35798 + }, + { + "epoch": 1.75, + "grad_norm": 0.7013652324676514, + "learning_rate": 0.00022101296420534, + "loss": 2.9296, + "step": 35799 + }, + { + "epoch": 1.75, + "grad_norm": 0.6332024931907654, + "learning_rate": 0.0002209981113473435, + "loss": 2.9423, + "step": 35800 + }, + { + "epoch": 1.75, + "grad_norm": 0.64900803565979, + "learning_rate": 0.00022098325869742273, + "loss": 2.9368, + "step": 35801 + }, + { + "epoch": 1.75, + "grad_norm": 0.6335112452507019, + "learning_rate": 0.0002209684062556169, + "loss": 3.258, + "step": 35802 + }, + { + "epoch": 1.75, + "grad_norm": 0.5994589328765869, + "learning_rate": 0.00022095355402196494, + "loss": 3.1889, + "step": 35803 + }, + { + "epoch": 1.75, + "grad_norm": 0.6108216643333435, + "learning_rate": 0.0002209387019965062, + "loss": 3.0253, + "step": 35804 + }, + { + "epoch": 1.75, + "grad_norm": 0.6158538460731506, + "learning_rate": 0.00022092385017927977, + "loss": 2.9247, + "step": 35805 + }, + { + "epoch": 1.75, + "grad_norm": 0.6153451204299927, + "learning_rate": 0.0002209089985703245, + "loss": 3.0619, + "step": 35806 + }, + { + "epoch": 1.75, + "grad_norm": 0.6863804459571838, + "learning_rate": 0.00022089414716967992, + "loss": 2.8898, + "step": 35807 + }, + { + "epoch": 1.75, + "grad_norm": 0.6681492924690247, + "learning_rate": 0.00022087929597738482, + "loss": 2.9785, + "step": 35808 + }, + { + "epoch": 1.75, + "grad_norm": 0.6478404998779297, + "learning_rate": 0.00022086444499347835, + "loss": 3.0399, + "step": 35809 + }, + { + "epoch": 1.75, + "grad_norm": 0.6117250919342041, + "learning_rate": 0.00022084959421799988, + "loss": 3.1075, + "step": 35810 + }, + { + "epoch": 1.76, + "grad_norm": 0.6180821657180786, + "learning_rate": 0.00022083474365098825, + "loss": 3.2999, + "step": 35811 + }, + { + "epoch": 1.76, + "grad_norm": 0.6459303498268127, + "learning_rate": 0.00022081989329248276, + "loss": 2.9466, + "step": 35812 + }, + { + "epoch": 1.76, + "grad_norm": 0.6436235904693604, + "learning_rate": 0.00022080504314252227, + "loss": 3.0516, + "step": 35813 + }, + { + "epoch": 1.76, + "grad_norm": 0.627779483795166, + "learning_rate": 0.0002207901932011461, + "loss": 3.0361, + "step": 35814 + }, + { + "epoch": 1.76, + "grad_norm": 0.5869961977005005, + "learning_rate": 0.00022077534346839344, + "loss": 2.9429, + "step": 35815 + }, + { + "epoch": 1.76, + "grad_norm": 0.6305521726608276, + "learning_rate": 0.0002207604939443031, + "loss": 3.2122, + "step": 35816 + }, + { + "epoch": 1.76, + "grad_norm": 0.621493399143219, + "learning_rate": 0.00022074564462891454, + "loss": 2.9271, + "step": 35817 + }, + { + "epoch": 1.76, + "grad_norm": 0.6501767039299011, + "learning_rate": 0.0002207307955222666, + "loss": 2.8088, + "step": 35818 + }, + { + "epoch": 1.76, + "grad_norm": 0.6253821849822998, + "learning_rate": 0.00022071594662439846, + "loss": 3.0041, + "step": 35819 + }, + { + "epoch": 1.76, + "grad_norm": 0.6286084651947021, + "learning_rate": 0.00022070109793534935, + "loss": 2.8641, + "step": 35820 + }, + { + "epoch": 1.76, + "grad_norm": 0.8458594083786011, + "learning_rate": 0.00022068624945515818, + "loss": 3.1558, + "step": 35821 + }, + { + "epoch": 1.76, + "grad_norm": 0.6667223572731018, + "learning_rate": 0.00022067140118386432, + "loss": 2.9657, + "step": 35822 + }, + { + "epoch": 1.76, + "grad_norm": 0.6399785280227661, + "learning_rate": 0.00022065655312150664, + "loss": 2.9304, + "step": 35823 + }, + { + "epoch": 1.76, + "grad_norm": 0.6804106831550598, + "learning_rate": 0.00022064170526812425, + "loss": 3.1718, + "step": 35824 + }, + { + "epoch": 1.76, + "grad_norm": 0.6263512969017029, + "learning_rate": 0.00022062685762375646, + "loss": 2.7909, + "step": 35825 + }, + { + "epoch": 1.76, + "grad_norm": 0.6424779891967773, + "learning_rate": 0.00022061201018844218, + "loss": 3.1985, + "step": 35826 + }, + { + "epoch": 1.76, + "grad_norm": 0.7617523074150085, + "learning_rate": 0.00022059716296222065, + "loss": 2.9046, + "step": 35827 + }, + { + "epoch": 1.76, + "grad_norm": 0.6302613019943237, + "learning_rate": 0.0002205823159451308, + "loss": 3.043, + "step": 35828 + }, + { + "epoch": 1.76, + "grad_norm": 0.6046407222747803, + "learning_rate": 0.00022056746913721192, + "loss": 3.1451, + "step": 35829 + }, + { + "epoch": 1.76, + "grad_norm": 0.634103536605835, + "learning_rate": 0.0002205526225385031, + "loss": 3.1131, + "step": 35830 + }, + { + "epoch": 1.76, + "grad_norm": 0.6643496155738831, + "learning_rate": 0.0002205377761490432, + "loss": 3.097, + "step": 35831 + }, + { + "epoch": 1.76, + "grad_norm": 0.6304755806922913, + "learning_rate": 0.00022052292996887173, + "loss": 3.1338, + "step": 35832 + }, + { + "epoch": 1.76, + "grad_norm": 0.6238082051277161, + "learning_rate": 0.00022050808399802743, + "loss": 2.871, + "step": 35833 + }, + { + "epoch": 1.76, + "grad_norm": 0.6616991758346558, + "learning_rate": 0.00022049323823654962, + "loss": 3.0218, + "step": 35834 + }, + { + "epoch": 1.76, + "grad_norm": 0.6222161650657654, + "learning_rate": 0.0002204783926844772, + "loss": 3.1102, + "step": 35835 + }, + { + "epoch": 1.76, + "grad_norm": 0.6417911648750305, + "learning_rate": 0.00022046354734184945, + "loss": 2.897, + "step": 35836 + }, + { + "epoch": 1.76, + "grad_norm": 0.5990554094314575, + "learning_rate": 0.00022044870220870545, + "loss": 2.9511, + "step": 35837 + }, + { + "epoch": 1.76, + "grad_norm": 0.5991533994674683, + "learning_rate": 0.00022043385728508417, + "loss": 2.89, + "step": 35838 + }, + { + "epoch": 1.76, + "grad_norm": 0.6272563934326172, + "learning_rate": 0.0002204190125710249, + "loss": 3.0124, + "step": 35839 + }, + { + "epoch": 1.76, + "grad_norm": 0.6233096122741699, + "learning_rate": 0.00022040416806656652, + "loss": 3.1823, + "step": 35840 + }, + { + "epoch": 1.76, + "grad_norm": 0.6353172659873962, + "learning_rate": 0.0002203893237717483, + "loss": 2.8572, + "step": 35841 + }, + { + "epoch": 1.76, + "grad_norm": 0.6314014792442322, + "learning_rate": 0.00022037447968660935, + "loss": 2.7837, + "step": 35842 + }, + { + "epoch": 1.76, + "grad_norm": 0.6511485576629639, + "learning_rate": 0.00022035963581118855, + "loss": 3.0148, + "step": 35843 + }, + { + "epoch": 1.76, + "grad_norm": 0.6228102445602417, + "learning_rate": 0.00022034479214552528, + "loss": 2.9736, + "step": 35844 + }, + { + "epoch": 1.76, + "grad_norm": 0.5977052450180054, + "learning_rate": 0.00022032994868965844, + "loss": 3.0123, + "step": 35845 + }, + { + "epoch": 1.76, + "grad_norm": 0.6566368341445923, + "learning_rate": 0.0002203151054436271, + "loss": 3.0192, + "step": 35846 + }, + { + "epoch": 1.76, + "grad_norm": 0.620186984539032, + "learning_rate": 0.00022030026240747058, + "loss": 3.0604, + "step": 35847 + }, + { + "epoch": 1.76, + "grad_norm": 0.6430531144142151, + "learning_rate": 0.00022028541958122775, + "loss": 3.0086, + "step": 35848 + }, + { + "epoch": 1.76, + "grad_norm": 0.6357125639915466, + "learning_rate": 0.00022027057696493785, + "loss": 2.8469, + "step": 35849 + }, + { + "epoch": 1.76, + "grad_norm": 0.6521148085594177, + "learning_rate": 0.0002202557345586398, + "loss": 2.7497, + "step": 35850 + }, + { + "epoch": 1.76, + "grad_norm": 0.6499366164207458, + "learning_rate": 0.00022024089236237287, + "loss": 3.2067, + "step": 35851 + }, + { + "epoch": 1.76, + "grad_norm": 0.6402422785758972, + "learning_rate": 0.00022022605037617612, + "loss": 3.1193, + "step": 35852 + }, + { + "epoch": 1.76, + "grad_norm": 0.6213492155075073, + "learning_rate": 0.00022021120860008848, + "loss": 3.0745, + "step": 35853 + }, + { + "epoch": 1.76, + "grad_norm": 0.6546719074249268, + "learning_rate": 0.00022019636703414928, + "loss": 3.1135, + "step": 35854 + }, + { + "epoch": 1.76, + "grad_norm": 0.627903163433075, + "learning_rate": 0.00022018152567839747, + "loss": 3.1131, + "step": 35855 + }, + { + "epoch": 1.76, + "grad_norm": 0.628490686416626, + "learning_rate": 0.00022016668453287203, + "loss": 2.8469, + "step": 35856 + }, + { + "epoch": 1.76, + "grad_norm": 0.6247190833091736, + "learning_rate": 0.00022015184359761237, + "loss": 3.0752, + "step": 35857 + }, + { + "epoch": 1.76, + "grad_norm": 0.6584323048591614, + "learning_rate": 0.00022013700287265727, + "loss": 2.7668, + "step": 35858 + }, + { + "epoch": 1.76, + "grad_norm": 0.6163456439971924, + "learning_rate": 0.000220122162358046, + "loss": 3.1157, + "step": 35859 + }, + { + "epoch": 1.76, + "grad_norm": 0.6366605162620544, + "learning_rate": 0.0002201073220538175, + "loss": 2.9302, + "step": 35860 + }, + { + "epoch": 1.76, + "grad_norm": 0.7113758325576782, + "learning_rate": 0.0002200924819600109, + "loss": 3.1147, + "step": 35861 + }, + { + "epoch": 1.76, + "grad_norm": 0.5954180955886841, + "learning_rate": 0.00022007764207666546, + "loss": 2.9149, + "step": 35862 + }, + { + "epoch": 1.76, + "grad_norm": 0.6021776795387268, + "learning_rate": 0.00022006280240382007, + "loss": 3.0365, + "step": 35863 + }, + { + "epoch": 1.76, + "grad_norm": 0.617344856262207, + "learning_rate": 0.00022004796294151392, + "loss": 3.2413, + "step": 35864 + }, + { + "epoch": 1.76, + "grad_norm": 0.6328892707824707, + "learning_rate": 0.00022003312368978597, + "loss": 3.1775, + "step": 35865 + }, + { + "epoch": 1.76, + "grad_norm": 0.6340292096138, + "learning_rate": 0.0002200182846486754, + "loss": 2.7447, + "step": 35866 + }, + { + "epoch": 1.76, + "grad_norm": 0.6463967561721802, + "learning_rate": 0.00022000344581822137, + "loss": 2.9128, + "step": 35867 + }, + { + "epoch": 1.76, + "grad_norm": 0.6408669352531433, + "learning_rate": 0.0002199886071984627, + "loss": 3.0877, + "step": 35868 + }, + { + "epoch": 1.76, + "grad_norm": 0.6117002367973328, + "learning_rate": 0.0002199737687894388, + "loss": 3.2649, + "step": 35869 + }, + { + "epoch": 1.76, + "grad_norm": 0.6672056913375854, + "learning_rate": 0.0002199589305911885, + "loss": 2.74, + "step": 35870 + }, + { + "epoch": 1.76, + "grad_norm": 0.6419618129730225, + "learning_rate": 0.0002199440926037509, + "loss": 2.9445, + "step": 35871 + }, + { + "epoch": 1.76, + "grad_norm": 1.165690302848816, + "learning_rate": 0.00021992925482716534, + "loss": 3.0519, + "step": 35872 + }, + { + "epoch": 1.76, + "grad_norm": 0.6113609671592712, + "learning_rate": 0.0002199144172614706, + "loss": 3.0843, + "step": 35873 + }, + { + "epoch": 1.76, + "grad_norm": 0.6271687746047974, + "learning_rate": 0.00021989957990670595, + "loss": 2.9394, + "step": 35874 + }, + { + "epoch": 1.76, + "grad_norm": 0.6276607513427734, + "learning_rate": 0.00021988474276291028, + "loss": 3.0057, + "step": 35875 + }, + { + "epoch": 1.76, + "grad_norm": 0.6973236203193665, + "learning_rate": 0.00021986990583012283, + "loss": 2.699, + "step": 35876 + }, + { + "epoch": 1.76, + "grad_norm": 0.6413176655769348, + "learning_rate": 0.00021985506910838267, + "loss": 3.0356, + "step": 35877 + }, + { + "epoch": 1.76, + "grad_norm": 0.6662982106208801, + "learning_rate": 0.00021984023259772874, + "loss": 2.9587, + "step": 35878 + }, + { + "epoch": 1.76, + "grad_norm": 0.6634508967399597, + "learning_rate": 0.0002198253962982003, + "loss": 2.69, + "step": 35879 + }, + { + "epoch": 1.76, + "grad_norm": 0.6144524812698364, + "learning_rate": 0.00021981056020983623, + "loss": 2.9196, + "step": 35880 + }, + { + "epoch": 1.76, + "grad_norm": 0.6477610468864441, + "learning_rate": 0.0002197957243326758, + "loss": 3.0727, + "step": 35881 + }, + { + "epoch": 1.76, + "grad_norm": 0.6386900544166565, + "learning_rate": 0.00021978088866675803, + "loss": 2.9197, + "step": 35882 + }, + { + "epoch": 1.76, + "grad_norm": 0.6349610686302185, + "learning_rate": 0.00021976605321212184, + "loss": 3.2623, + "step": 35883 + }, + { + "epoch": 1.76, + "grad_norm": 0.6554539799690247, + "learning_rate": 0.00021975121796880657, + "loss": 3.2208, + "step": 35884 + }, + { + "epoch": 1.76, + "grad_norm": 0.6648715138435364, + "learning_rate": 0.00021973638293685104, + "loss": 2.9701, + "step": 35885 + }, + { + "epoch": 1.76, + "grad_norm": 0.6611798405647278, + "learning_rate": 0.00021972154811629438, + "loss": 3.0644, + "step": 35886 + }, + { + "epoch": 1.76, + "grad_norm": 0.6232627034187317, + "learning_rate": 0.00021970671350717588, + "loss": 3.0797, + "step": 35887 + }, + { + "epoch": 1.76, + "grad_norm": 0.6150153875350952, + "learning_rate": 0.00021969187910953435, + "loss": 2.9063, + "step": 35888 + }, + { + "epoch": 1.76, + "grad_norm": 0.6366429328918457, + "learning_rate": 0.000219677044923409, + "loss": 2.9047, + "step": 35889 + }, + { + "epoch": 1.76, + "grad_norm": 0.6754364967346191, + "learning_rate": 0.00021966221094883877, + "loss": 3.1375, + "step": 35890 + }, + { + "epoch": 1.76, + "grad_norm": 0.5876303315162659, + "learning_rate": 0.00021964737718586287, + "loss": 2.973, + "step": 35891 + }, + { + "epoch": 1.76, + "grad_norm": 0.7163461446762085, + "learning_rate": 0.00021963254363452036, + "loss": 2.9597, + "step": 35892 + }, + { + "epoch": 1.76, + "grad_norm": 0.6201485991477966, + "learning_rate": 0.0002196177102948501, + "loss": 2.992, + "step": 35893 + }, + { + "epoch": 1.76, + "grad_norm": 0.6001695990562439, + "learning_rate": 0.0002196028771668915, + "loss": 3.0862, + "step": 35894 + }, + { + "epoch": 1.76, + "grad_norm": 0.6142009496688843, + "learning_rate": 0.0002195880442506834, + "loss": 2.9972, + "step": 35895 + }, + { + "epoch": 1.76, + "grad_norm": 0.6313881278038025, + "learning_rate": 0.00021957321154626483, + "loss": 3.204, + "step": 35896 + }, + { + "epoch": 1.76, + "grad_norm": 0.6144121885299683, + "learning_rate": 0.00021955837905367507, + "loss": 2.8235, + "step": 35897 + }, + { + "epoch": 1.76, + "grad_norm": 0.6347367763519287, + "learning_rate": 0.00021954354677295297, + "loss": 2.9528, + "step": 35898 + }, + { + "epoch": 1.76, + "grad_norm": 0.655368447303772, + "learning_rate": 0.00021952871470413778, + "loss": 2.9214, + "step": 35899 + }, + { + "epoch": 1.76, + "grad_norm": 0.6603178977966309, + "learning_rate": 0.00021951388284726834, + "loss": 2.9844, + "step": 35900 + }, + { + "epoch": 1.76, + "grad_norm": 0.6305208802223206, + "learning_rate": 0.00021949905120238384, + "loss": 2.933, + "step": 35901 + }, + { + "epoch": 1.76, + "grad_norm": 0.6413267850875854, + "learning_rate": 0.00021948421976952345, + "loss": 2.8795, + "step": 35902 + }, + { + "epoch": 1.76, + "grad_norm": 0.6816225647926331, + "learning_rate": 0.00021946938854872607, + "loss": 3.0087, + "step": 35903 + }, + { + "epoch": 1.76, + "grad_norm": 0.6837611794471741, + "learning_rate": 0.00021945455754003088, + "loss": 2.9229, + "step": 35904 + }, + { + "epoch": 1.76, + "grad_norm": 0.7396782040596008, + "learning_rate": 0.0002194397267434768, + "loss": 3.0543, + "step": 35905 + }, + { + "epoch": 1.76, + "grad_norm": 0.6411307454109192, + "learning_rate": 0.00021942489615910303, + "loss": 2.8873, + "step": 35906 + }, + { + "epoch": 1.76, + "grad_norm": 0.6512781381607056, + "learning_rate": 0.00021941006578694863, + "loss": 3.1282, + "step": 35907 + }, + { + "epoch": 1.76, + "grad_norm": 0.6359968185424805, + "learning_rate": 0.00021939523562705242, + "loss": 3.0821, + "step": 35908 + }, + { + "epoch": 1.76, + "grad_norm": 0.6434915661811829, + "learning_rate": 0.00021938040567945387, + "loss": 3.2044, + "step": 35909 + }, + { + "epoch": 1.76, + "grad_norm": 0.6505936980247498, + "learning_rate": 0.0002193655759441917, + "loss": 2.8933, + "step": 35910 + }, + { + "epoch": 1.76, + "grad_norm": 0.6163533926010132, + "learning_rate": 0.00021935074642130515, + "loss": 2.9099, + "step": 35911 + }, + { + "epoch": 1.76, + "grad_norm": 0.6336808800697327, + "learning_rate": 0.0002193359171108331, + "loss": 2.9613, + "step": 35912 + }, + { + "epoch": 1.76, + "grad_norm": 0.6180527806282043, + "learning_rate": 0.00021932108801281478, + "loss": 3.2311, + "step": 35913 + }, + { + "epoch": 1.76, + "grad_norm": 0.6565324068069458, + "learning_rate": 0.00021930625912728926, + "loss": 3.0751, + "step": 35914 + }, + { + "epoch": 1.76, + "grad_norm": 0.6686374545097351, + "learning_rate": 0.00021929143045429535, + "loss": 3.0744, + "step": 35915 + }, + { + "epoch": 1.76, + "grad_norm": 0.6567208170890808, + "learning_rate": 0.00021927660199387247, + "loss": 3.1457, + "step": 35916 + }, + { + "epoch": 1.76, + "grad_norm": 0.6428233981132507, + "learning_rate": 0.00021926177374605938, + "loss": 3.1215, + "step": 35917 + }, + { + "epoch": 1.76, + "grad_norm": 0.6695310473442078, + "learning_rate": 0.00021924694571089518, + "loss": 3.1795, + "step": 35918 + }, + { + "epoch": 1.76, + "grad_norm": 0.6748586297035217, + "learning_rate": 0.0002192321178884191, + "loss": 3.2222, + "step": 35919 + }, + { + "epoch": 1.76, + "grad_norm": 0.6294280290603638, + "learning_rate": 0.0002192172902786701, + "loss": 2.8973, + "step": 35920 + }, + { + "epoch": 1.76, + "grad_norm": 0.6525007486343384, + "learning_rate": 0.00021920246288168718, + "loss": 2.9294, + "step": 35921 + }, + { + "epoch": 1.76, + "grad_norm": 0.6621073484420776, + "learning_rate": 0.00021918763569750934, + "loss": 3.0736, + "step": 35922 + }, + { + "epoch": 1.76, + "grad_norm": 0.620073139667511, + "learning_rate": 0.00021917280872617566, + "loss": 3.2611, + "step": 35923 + }, + { + "epoch": 1.76, + "grad_norm": 0.641943633556366, + "learning_rate": 0.00021915798196772544, + "loss": 2.9386, + "step": 35924 + }, + { + "epoch": 1.76, + "grad_norm": 0.6125879287719727, + "learning_rate": 0.00021914315542219742, + "loss": 2.8696, + "step": 35925 + }, + { + "epoch": 1.76, + "grad_norm": 0.6328999996185303, + "learning_rate": 0.0002191283290896309, + "loss": 2.9842, + "step": 35926 + }, + { + "epoch": 1.76, + "grad_norm": 0.582988977432251, + "learning_rate": 0.00021911350297006458, + "loss": 2.8296, + "step": 35927 + }, + { + "epoch": 1.76, + "grad_norm": 0.6015204787254333, + "learning_rate": 0.00021909867706353787, + "loss": 3.0296, + "step": 35928 + }, + { + "epoch": 1.76, + "grad_norm": 0.6688986420631409, + "learning_rate": 0.00021908385137008968, + "loss": 2.8411, + "step": 35929 + }, + { + "epoch": 1.76, + "grad_norm": 0.6483864188194275, + "learning_rate": 0.00021906902588975896, + "loss": 3.0242, + "step": 35930 + }, + { + "epoch": 1.76, + "grad_norm": 0.6931516528129578, + "learning_rate": 0.00021905420062258497, + "loss": 2.9332, + "step": 35931 + }, + { + "epoch": 1.76, + "grad_norm": 0.6430986523628235, + "learning_rate": 0.00021903937556860657, + "loss": 2.9444, + "step": 35932 + }, + { + "epoch": 1.76, + "grad_norm": 0.6340141296386719, + "learning_rate": 0.00021902455072786282, + "loss": 2.7925, + "step": 35933 + }, + { + "epoch": 1.76, + "grad_norm": 0.608032763004303, + "learning_rate": 0.0002190097261003929, + "loss": 3.0971, + "step": 35934 + }, + { + "epoch": 1.76, + "grad_norm": 0.6480880379676819, + "learning_rate": 0.0002189949016862358, + "loss": 2.8061, + "step": 35935 + }, + { + "epoch": 1.76, + "grad_norm": 0.6553376317024231, + "learning_rate": 0.00021898007748543053, + "loss": 3.0343, + "step": 35936 + }, + { + "epoch": 1.76, + "grad_norm": 0.6663204431533813, + "learning_rate": 0.00021896525349801606, + "loss": 3.0759, + "step": 35937 + }, + { + "epoch": 1.76, + "grad_norm": 0.6248182654380798, + "learning_rate": 0.00021895042972403154, + "loss": 2.7778, + "step": 35938 + }, + { + "epoch": 1.76, + "grad_norm": 0.6360815167427063, + "learning_rate": 0.00021893560616351607, + "loss": 2.9929, + "step": 35939 + }, + { + "epoch": 1.76, + "grad_norm": 0.6211484670639038, + "learning_rate": 0.00021892078281650847, + "loss": 3.0523, + "step": 35940 + }, + { + "epoch": 1.76, + "grad_norm": 0.6059033274650574, + "learning_rate": 0.00021890595968304807, + "loss": 2.8255, + "step": 35941 + }, + { + "epoch": 1.76, + "grad_norm": 0.6170439124107361, + "learning_rate": 0.00021889113676317365, + "loss": 3.0655, + "step": 35942 + }, + { + "epoch": 1.76, + "grad_norm": 0.6702554821968079, + "learning_rate": 0.00021887631405692444, + "loss": 3.0742, + "step": 35943 + }, + { + "epoch": 1.76, + "grad_norm": 0.6257311105728149, + "learning_rate": 0.00021886149156433944, + "loss": 2.9236, + "step": 35944 + }, + { + "epoch": 1.76, + "grad_norm": 0.6383312344551086, + "learning_rate": 0.0002188466692854575, + "loss": 3.0114, + "step": 35945 + }, + { + "epoch": 1.76, + "grad_norm": 0.6520895957946777, + "learning_rate": 0.000218831847220318, + "loss": 3.0147, + "step": 35946 + }, + { + "epoch": 1.76, + "grad_norm": 0.6120529174804688, + "learning_rate": 0.0002188170253689597, + "loss": 2.9981, + "step": 35947 + }, + { + "epoch": 1.76, + "grad_norm": 0.6751158237457275, + "learning_rate": 0.0002188022037314217, + "loss": 3.0975, + "step": 35948 + }, + { + "epoch": 1.76, + "grad_norm": 0.6635515689849854, + "learning_rate": 0.00021878738230774315, + "loss": 2.8741, + "step": 35949 + }, + { + "epoch": 1.76, + "grad_norm": 0.6444944143295288, + "learning_rate": 0.00021877256109796299, + "loss": 2.788, + "step": 35950 + }, + { + "epoch": 1.76, + "grad_norm": 0.6126725673675537, + "learning_rate": 0.00021875774010212027, + "loss": 2.8899, + "step": 35951 + }, + { + "epoch": 1.76, + "grad_norm": 0.6406420469284058, + "learning_rate": 0.00021874291932025394, + "loss": 3.0294, + "step": 35952 + }, + { + "epoch": 1.76, + "grad_norm": 0.6252112984657288, + "learning_rate": 0.00021872809875240318, + "loss": 3.0093, + "step": 35953 + }, + { + "epoch": 1.76, + "grad_norm": 0.6004651188850403, + "learning_rate": 0.00021871327839860703, + "loss": 3.0745, + "step": 35954 + }, + { + "epoch": 1.76, + "grad_norm": 0.6479066014289856, + "learning_rate": 0.00021869845825890435, + "loss": 3.2259, + "step": 35955 + }, + { + "epoch": 1.76, + "grad_norm": 0.6312503218650818, + "learning_rate": 0.00021868363833333439, + "loss": 3.0985, + "step": 35956 + }, + { + "epoch": 1.76, + "grad_norm": 0.6492436528205872, + "learning_rate": 0.00021866881862193604, + "loss": 3.0955, + "step": 35957 + }, + { + "epoch": 1.76, + "grad_norm": 0.6168732643127441, + "learning_rate": 0.00021865399912474824, + "loss": 2.8809, + "step": 35958 + }, + { + "epoch": 1.76, + "grad_norm": 0.7132405638694763, + "learning_rate": 0.00021863917984181035, + "loss": 3.2223, + "step": 35959 + }, + { + "epoch": 1.76, + "grad_norm": 0.6588032245635986, + "learning_rate": 0.00021862436077316113, + "loss": 3.2283, + "step": 35960 + }, + { + "epoch": 1.76, + "grad_norm": 0.8370882272720337, + "learning_rate": 0.00021860954191883973, + "loss": 2.8941, + "step": 35961 + }, + { + "epoch": 1.76, + "grad_norm": 0.634719729423523, + "learning_rate": 0.00021859472327888496, + "loss": 3.1808, + "step": 35962 + }, + { + "epoch": 1.76, + "grad_norm": 0.6163418292999268, + "learning_rate": 0.00021857990485333608, + "loss": 2.9578, + "step": 35963 + }, + { + "epoch": 1.76, + "grad_norm": 0.6694480180740356, + "learning_rate": 0.00021856508664223217, + "loss": 2.8498, + "step": 35964 + }, + { + "epoch": 1.76, + "grad_norm": 0.6347652673721313, + "learning_rate": 0.0002185502686456121, + "loss": 2.8425, + "step": 35965 + }, + { + "epoch": 1.76, + "grad_norm": 0.6317447423934937, + "learning_rate": 0.000218535450863515, + "loss": 3.0694, + "step": 35966 + }, + { + "epoch": 1.76, + "grad_norm": 0.6160470843315125, + "learning_rate": 0.00021852063329597972, + "loss": 2.9192, + "step": 35967 + }, + { + "epoch": 1.76, + "grad_norm": 0.5880303978919983, + "learning_rate": 0.00021850581594304547, + "loss": 2.6224, + "step": 35968 + }, + { + "epoch": 1.76, + "grad_norm": 0.7462405562400818, + "learning_rate": 0.00021849099880475126, + "loss": 3.0498, + "step": 35969 + }, + { + "epoch": 1.76, + "grad_norm": 0.6336656212806702, + "learning_rate": 0.000218476181881136, + "loss": 3.2386, + "step": 35970 + }, + { + "epoch": 1.76, + "grad_norm": 0.638992965221405, + "learning_rate": 0.0002184613651722389, + "loss": 2.9897, + "step": 35971 + }, + { + "epoch": 1.76, + "grad_norm": 0.6258759498596191, + "learning_rate": 0.00021844654867809877, + "loss": 3.0165, + "step": 35972 + }, + { + "epoch": 1.76, + "grad_norm": 0.6453997492790222, + "learning_rate": 0.00021843173239875468, + "loss": 2.7978, + "step": 35973 + }, + { + "epoch": 1.76, + "grad_norm": 0.6440539360046387, + "learning_rate": 0.00021841691633424587, + "loss": 2.8896, + "step": 35974 + }, + { + "epoch": 1.76, + "grad_norm": 0.6205512285232544, + "learning_rate": 0.0002184021004846111, + "loss": 3.1415, + "step": 35975 + }, + { + "epoch": 1.76, + "grad_norm": 0.6318713426589966, + "learning_rate": 0.00021838728484988956, + "loss": 2.9567, + "step": 35976 + }, + { + "epoch": 1.76, + "grad_norm": 0.6605975031852722, + "learning_rate": 0.0002183724694301201, + "loss": 2.9058, + "step": 35977 + }, + { + "epoch": 1.76, + "grad_norm": 0.6602401733398438, + "learning_rate": 0.00021835765422534192, + "loss": 2.8578, + "step": 35978 + }, + { + "epoch": 1.76, + "grad_norm": 0.6470891237258911, + "learning_rate": 0.000218342839235594, + "loss": 2.8326, + "step": 35979 + }, + { + "epoch": 1.76, + "grad_norm": 0.6551663279533386, + "learning_rate": 0.00021832802446091523, + "loss": 3.1398, + "step": 35980 + }, + { + "epoch": 1.76, + "grad_norm": 0.6770197153091431, + "learning_rate": 0.00021831320990134485, + "loss": 2.9125, + "step": 35981 + }, + { + "epoch": 1.76, + "grad_norm": 0.6041737794876099, + "learning_rate": 0.00021829839555692165, + "loss": 3.1245, + "step": 35982 + }, + { + "epoch": 1.76, + "grad_norm": 0.6864397525787354, + "learning_rate": 0.0002182835814276847, + "loss": 2.9672, + "step": 35983 + }, + { + "epoch": 1.76, + "grad_norm": 0.628705620765686, + "learning_rate": 0.00021826876751367323, + "loss": 2.9742, + "step": 35984 + }, + { + "epoch": 1.76, + "grad_norm": 0.6601821184158325, + "learning_rate": 0.00021825395381492593, + "loss": 2.941, + "step": 35985 + }, + { + "epoch": 1.76, + "grad_norm": 0.679912269115448, + "learning_rate": 0.00021823914033148215, + "loss": 3.0776, + "step": 35986 + }, + { + "epoch": 1.76, + "grad_norm": 0.7171043753623962, + "learning_rate": 0.00021822432706338065, + "loss": 3.0341, + "step": 35987 + }, + { + "epoch": 1.76, + "grad_norm": 0.6408360600471497, + "learning_rate": 0.0002182095140106605, + "loss": 2.934, + "step": 35988 + }, + { + "epoch": 1.76, + "grad_norm": 0.6534038186073303, + "learning_rate": 0.00021819470117336083, + "loss": 2.9033, + "step": 35989 + }, + { + "epoch": 1.76, + "grad_norm": 0.6343661546707153, + "learning_rate": 0.00021817988855152051, + "loss": 3.1754, + "step": 35990 + }, + { + "epoch": 1.76, + "grad_norm": 0.6617305278778076, + "learning_rate": 0.00021816507614517874, + "loss": 3.0125, + "step": 35991 + }, + { + "epoch": 1.76, + "grad_norm": 0.636042594909668, + "learning_rate": 0.00021815026395437422, + "loss": 3.2112, + "step": 35992 + }, + { + "epoch": 1.76, + "grad_norm": 0.6248968243598938, + "learning_rate": 0.0002181354519791463, + "loss": 2.7923, + "step": 35993 + }, + { + "epoch": 1.76, + "grad_norm": 0.6503058671951294, + "learning_rate": 0.0002181206402195338, + "loss": 2.963, + "step": 35994 + }, + { + "epoch": 1.76, + "grad_norm": 0.6492918133735657, + "learning_rate": 0.0002181058286755757, + "loss": 2.9537, + "step": 35995 + }, + { + "epoch": 1.76, + "grad_norm": 0.6048479676246643, + "learning_rate": 0.0002180910173473112, + "loss": 2.8097, + "step": 35996 + }, + { + "epoch": 1.76, + "grad_norm": 0.6944062113761902, + "learning_rate": 0.00021807620623477916, + "loss": 2.8929, + "step": 35997 + }, + { + "epoch": 1.76, + "grad_norm": 0.6303960084915161, + "learning_rate": 0.00021806139533801867, + "loss": 3.0977, + "step": 35998 + }, + { + "epoch": 1.76, + "grad_norm": 0.6644660234451294, + "learning_rate": 0.00021804658465706853, + "loss": 2.9469, + "step": 35999 + }, + { + "epoch": 1.76, + "grad_norm": 0.5891596078872681, + "learning_rate": 0.00021803177419196806, + "loss": 3.255, + "step": 36000 + }, + { + "epoch": 1.76, + "grad_norm": 0.6277598142623901, + "learning_rate": 0.00021801696394275615, + "loss": 3.1176, + "step": 36001 + }, + { + "epoch": 1.76, + "grad_norm": 0.6298516392707825, + "learning_rate": 0.00021800215390947164, + "loss": 2.9568, + "step": 36002 + }, + { + "epoch": 1.76, + "grad_norm": 0.6497096419334412, + "learning_rate": 0.0002179873440921538, + "loss": 2.9517, + "step": 36003 + }, + { + "epoch": 1.76, + "grad_norm": 0.6442708969116211, + "learning_rate": 0.00021797253449084143, + "loss": 3.0019, + "step": 36004 + }, + { + "epoch": 1.76, + "grad_norm": 0.6389861702919006, + "learning_rate": 0.00021795772510557363, + "loss": 2.8269, + "step": 36005 + }, + { + "epoch": 1.76, + "grad_norm": 0.6288984417915344, + "learning_rate": 0.00021794291593638947, + "loss": 2.7268, + "step": 36006 + }, + { + "epoch": 1.76, + "grad_norm": 0.6967528462409973, + "learning_rate": 0.00021792810698332773, + "loss": 2.8301, + "step": 36007 + }, + { + "epoch": 1.76, + "grad_norm": 0.7129323482513428, + "learning_rate": 0.00021791329824642774, + "loss": 3.0183, + "step": 36008 + }, + { + "epoch": 1.76, + "grad_norm": 1.0993932485580444, + "learning_rate": 0.0002178984897257282, + "loss": 2.9254, + "step": 36009 + }, + { + "epoch": 1.76, + "grad_norm": 0.6599279642105103, + "learning_rate": 0.00021788368142126822, + "loss": 2.9239, + "step": 36010 + }, + { + "epoch": 1.76, + "grad_norm": 0.6514578461647034, + "learning_rate": 0.00021786887333308694, + "loss": 2.9384, + "step": 36011 + }, + { + "epoch": 1.76, + "grad_norm": 0.6423161029815674, + "learning_rate": 0.00021785406546122316, + "loss": 3.0236, + "step": 36012 + }, + { + "epoch": 1.76, + "grad_norm": 0.6230955719947815, + "learning_rate": 0.00021783925780571604, + "loss": 3.0301, + "step": 36013 + }, + { + "epoch": 1.76, + "grad_norm": 0.6230641007423401, + "learning_rate": 0.00021782445036660437, + "loss": 2.9346, + "step": 36014 + }, + { + "epoch": 1.77, + "grad_norm": 0.6333297491073608, + "learning_rate": 0.00021780964314392734, + "loss": 2.7931, + "step": 36015 + }, + { + "epoch": 1.77, + "grad_norm": 0.6389443874359131, + "learning_rate": 0.000217794836137724, + "loss": 2.7033, + "step": 36016 + }, + { + "epoch": 1.77, + "grad_norm": 0.6404136419296265, + "learning_rate": 0.00021778002934803306, + "loss": 2.8099, + "step": 36017 + }, + { + "epoch": 1.77, + "grad_norm": 0.6511685848236084, + "learning_rate": 0.00021776522277489386, + "loss": 3.1501, + "step": 36018 + }, + { + "epoch": 1.77, + "grad_norm": 0.6114490628242493, + "learning_rate": 0.00021775041641834513, + "loss": 2.9718, + "step": 36019 + }, + { + "epoch": 1.77, + "grad_norm": 0.6275057792663574, + "learning_rate": 0.00021773561027842595, + "loss": 2.9344, + "step": 36020 + }, + { + "epoch": 1.77, + "grad_norm": 0.6325558423995972, + "learning_rate": 0.00021772080435517545, + "loss": 3.1454, + "step": 36021 + }, + { + "epoch": 1.77, + "grad_norm": 0.6032198667526245, + "learning_rate": 0.00021770599864863246, + "loss": 2.9158, + "step": 36022 + }, + { + "epoch": 1.77, + "grad_norm": 0.5971514582633972, + "learning_rate": 0.0002176911931588361, + "loss": 2.8928, + "step": 36023 + }, + { + "epoch": 1.77, + "grad_norm": 0.6245431900024414, + "learning_rate": 0.00021767638788582517, + "loss": 3.1731, + "step": 36024 + }, + { + "epoch": 1.77, + "grad_norm": 0.6920629143714905, + "learning_rate": 0.00021766158282963876, + "loss": 3.0704, + "step": 36025 + }, + { + "epoch": 1.77, + "grad_norm": 0.6170417666435242, + "learning_rate": 0.00021764677799031607, + "loss": 2.942, + "step": 36026 + }, + { + "epoch": 1.77, + "grad_norm": 0.6285954713821411, + "learning_rate": 0.0002176319733678958, + "loss": 2.9836, + "step": 36027 + }, + { + "epoch": 1.77, + "grad_norm": 0.6193482875823975, + "learning_rate": 0.00021761716896241716, + "loss": 2.8479, + "step": 36028 + }, + { + "epoch": 1.77, + "grad_norm": 0.6073896884918213, + "learning_rate": 0.00021760236477391887, + "loss": 3.2373, + "step": 36029 + }, + { + "epoch": 1.77, + "grad_norm": 0.6277605295181274, + "learning_rate": 0.00021758756080244023, + "loss": 3.0198, + "step": 36030 + }, + { + "epoch": 1.77, + "grad_norm": 0.6205739974975586, + "learning_rate": 0.0002175727570480201, + "loss": 3.0834, + "step": 36031 + }, + { + "epoch": 1.77, + "grad_norm": 0.9096717238426208, + "learning_rate": 0.00021755795351069738, + "loss": 3.0458, + "step": 36032 + }, + { + "epoch": 1.77, + "grad_norm": 0.6602234840393066, + "learning_rate": 0.00021754315019051126, + "loss": 3.2214, + "step": 36033 + }, + { + "epoch": 1.77, + "grad_norm": 0.650767982006073, + "learning_rate": 0.00021752834708750054, + "loss": 3.0056, + "step": 36034 + }, + { + "epoch": 1.77, + "grad_norm": 0.5995840430259705, + "learning_rate": 0.0002175135442017042, + "loss": 3.1619, + "step": 36035 + }, + { + "epoch": 1.77, + "grad_norm": 0.6354784369468689, + "learning_rate": 0.00021749874153316148, + "loss": 3.2268, + "step": 36036 + }, + { + "epoch": 1.77, + "grad_norm": 0.6659834980964661, + "learning_rate": 0.0002174839390819111, + "loss": 3.1833, + "step": 36037 + }, + { + "epoch": 1.77, + "grad_norm": 0.649095892906189, + "learning_rate": 0.00021746913684799222, + "loss": 2.8661, + "step": 36038 + }, + { + "epoch": 1.77, + "grad_norm": 0.7001377940177917, + "learning_rate": 0.0002174543348314436, + "loss": 2.85, + "step": 36039 + }, + { + "epoch": 1.77, + "grad_norm": 0.6997196674346924, + "learning_rate": 0.00021743953303230448, + "loss": 3.1971, + "step": 36040 + }, + { + "epoch": 1.77, + "grad_norm": 0.6453948020935059, + "learning_rate": 0.00021742473145061378, + "loss": 3.01, + "step": 36041 + }, + { + "epoch": 1.77, + "grad_norm": 0.639747679233551, + "learning_rate": 0.0002174099300864103, + "loss": 2.9441, + "step": 36042 + }, + { + "epoch": 1.77, + "grad_norm": 0.619999885559082, + "learning_rate": 0.00021739512893973335, + "loss": 2.8838, + "step": 36043 + }, + { + "epoch": 1.77, + "grad_norm": 0.6154755353927612, + "learning_rate": 0.00021738032801062155, + "loss": 3.1061, + "step": 36044 + }, + { + "epoch": 1.77, + "grad_norm": 0.6470183730125427, + "learning_rate": 0.00021736552729911417, + "loss": 3.0322, + "step": 36045 + }, + { + "epoch": 1.77, + "grad_norm": 0.6488195657730103, + "learning_rate": 0.00021735072680525015, + "loss": 2.9479, + "step": 36046 + }, + { + "epoch": 1.77, + "grad_norm": 0.6353014707565308, + "learning_rate": 0.00021733592652906826, + "loss": 3.135, + "step": 36047 + }, + { + "epoch": 1.77, + "grad_norm": 0.6322101354598999, + "learning_rate": 0.00021732112647060775, + "loss": 3.1161, + "step": 36048 + }, + { + "epoch": 1.77, + "grad_norm": 0.5940348505973816, + "learning_rate": 0.00021730632662990743, + "loss": 2.9735, + "step": 36049 + }, + { + "epoch": 1.77, + "grad_norm": 0.5832706689834595, + "learning_rate": 0.00021729152700700628, + "loss": 2.9688, + "step": 36050 + }, + { + "epoch": 1.77, + "grad_norm": 0.6051971316337585, + "learning_rate": 0.00021727672760194343, + "loss": 2.9654, + "step": 36051 + }, + { + "epoch": 1.77, + "grad_norm": 0.6165816783905029, + "learning_rate": 0.0002172619284147577, + "loss": 2.8744, + "step": 36052 + }, + { + "epoch": 1.77, + "grad_norm": 0.6479892134666443, + "learning_rate": 0.0002172471294454882, + "loss": 2.8019, + "step": 36053 + }, + { + "epoch": 1.77, + "grad_norm": 0.6285097002983093, + "learning_rate": 0.00021723233069417374, + "loss": 2.8948, + "step": 36054 + }, + { + "epoch": 1.77, + "grad_norm": 0.6039531826972961, + "learning_rate": 0.0002172175321608534, + "loss": 2.8057, + "step": 36055 + }, + { + "epoch": 1.77, + "grad_norm": 0.6159113049507141, + "learning_rate": 0.00021720273384556624, + "loss": 2.9787, + "step": 36056 + }, + { + "epoch": 1.77, + "grad_norm": 0.642290472984314, + "learning_rate": 0.00021718793574835102, + "loss": 3.0758, + "step": 36057 + }, + { + "epoch": 1.77, + "grad_norm": 0.6488417387008667, + "learning_rate": 0.00021717313786924695, + "loss": 2.9597, + "step": 36058 + }, + { + "epoch": 1.77, + "grad_norm": 0.6642580628395081, + "learning_rate": 0.00021715834020829284, + "loss": 3.1399, + "step": 36059 + }, + { + "epoch": 1.77, + "grad_norm": 0.686511218547821, + "learning_rate": 0.00021714354276552766, + "loss": 2.9219, + "step": 36060 + }, + { + "epoch": 1.77, + "grad_norm": 0.7143248319625854, + "learning_rate": 0.00021712874554099057, + "loss": 2.8625, + "step": 36061 + }, + { + "epoch": 1.77, + "grad_norm": 0.6069398522377014, + "learning_rate": 0.00021711394853472037, + "loss": 2.6463, + "step": 36062 + }, + { + "epoch": 1.77, + "grad_norm": 0.5863265991210938, + "learning_rate": 0.00021709915174675614, + "loss": 3.0262, + "step": 36063 + }, + { + "epoch": 1.77, + "grad_norm": 0.6850087642669678, + "learning_rate": 0.00021708435517713663, + "loss": 3.182, + "step": 36064 + }, + { + "epoch": 1.77, + "grad_norm": 0.6968920230865479, + "learning_rate": 0.00021706955882590096, + "loss": 2.9166, + "step": 36065 + }, + { + "epoch": 1.77, + "grad_norm": 0.662936270236969, + "learning_rate": 0.0002170547626930883, + "loss": 2.9049, + "step": 36066 + }, + { + "epoch": 1.77, + "grad_norm": 0.6453676223754883, + "learning_rate": 0.0002170399667787373, + "loss": 3.0578, + "step": 36067 + }, + { + "epoch": 1.77, + "grad_norm": 0.6160590052604675, + "learning_rate": 0.00021702517108288715, + "loss": 3.0075, + "step": 36068 + }, + { + "epoch": 1.77, + "grad_norm": 0.6253478527069092, + "learning_rate": 0.00021701037560557663, + "loss": 3.0597, + "step": 36069 + }, + { + "epoch": 1.77, + "grad_norm": 0.6356991529464722, + "learning_rate": 0.0002169955803468449, + "loss": 2.8705, + "step": 36070 + }, + { + "epoch": 1.77, + "grad_norm": 0.6288435459136963, + "learning_rate": 0.0002169807853067308, + "loss": 3.2712, + "step": 36071 + }, + { + "epoch": 1.77, + "grad_norm": 0.6195838451385498, + "learning_rate": 0.00021696599048527326, + "loss": 2.9045, + "step": 36072 + }, + { + "epoch": 1.77, + "grad_norm": 0.6254141330718994, + "learning_rate": 0.00021695119588251144, + "loss": 3.2025, + "step": 36073 + }, + { + "epoch": 1.77, + "grad_norm": 0.6363393664360046, + "learning_rate": 0.00021693640149848416, + "loss": 3.0731, + "step": 36074 + }, + { + "epoch": 1.77, + "grad_norm": 0.5991094708442688, + "learning_rate": 0.00021692160733323044, + "loss": 3.021, + "step": 36075 + }, + { + "epoch": 1.77, + "grad_norm": 0.6361525654792786, + "learning_rate": 0.0002169068133867891, + "loss": 2.9463, + "step": 36076 + }, + { + "epoch": 1.77, + "grad_norm": 0.653708279132843, + "learning_rate": 0.00021689201965919929, + "loss": 2.9209, + "step": 36077 + }, + { + "epoch": 1.77, + "grad_norm": 0.6295130848884583, + "learning_rate": 0.00021687722615049994, + "loss": 2.9804, + "step": 36078 + }, + { + "epoch": 1.77, + "grad_norm": 0.6194791197776794, + "learning_rate": 0.00021686243286072988, + "loss": 2.979, + "step": 36079 + }, + { + "epoch": 1.77, + "grad_norm": 0.6836393475532532, + "learning_rate": 0.0002168476397899283, + "loss": 2.8899, + "step": 36080 + }, + { + "epoch": 1.77, + "grad_norm": 0.5709426403045654, + "learning_rate": 0.00021683284693813396, + "loss": 3.0018, + "step": 36081 + }, + { + "epoch": 1.77, + "grad_norm": 0.637849748134613, + "learning_rate": 0.00021681805430538587, + "loss": 3.072, + "step": 36082 + }, + { + "epoch": 1.77, + "grad_norm": 0.5979206562042236, + "learning_rate": 0.00021680326189172307, + "loss": 2.9527, + "step": 36083 + }, + { + "epoch": 1.77, + "grad_norm": 0.6736457347869873, + "learning_rate": 0.00021678846969718445, + "loss": 3.1289, + "step": 36084 + }, + { + "epoch": 1.77, + "grad_norm": 0.6516305208206177, + "learning_rate": 0.00021677367772180906, + "loss": 3.0119, + "step": 36085 + }, + { + "epoch": 1.77, + "grad_norm": 0.6282474994659424, + "learning_rate": 0.0002167588859656356, + "loss": 2.9219, + "step": 36086 + }, + { + "epoch": 1.77, + "grad_norm": 0.6169061660766602, + "learning_rate": 0.00021674409442870327, + "loss": 3.0883, + "step": 36087 + }, + { + "epoch": 1.77, + "grad_norm": 0.7025720477104187, + "learning_rate": 0.00021672930311105107, + "loss": 3.1129, + "step": 36088 + }, + { + "epoch": 1.77, + "grad_norm": 0.6823790073394775, + "learning_rate": 0.0002167145120127178, + "loss": 2.9719, + "step": 36089 + }, + { + "epoch": 1.77, + "grad_norm": 0.6688551902770996, + "learning_rate": 0.00021669972113374258, + "loss": 2.8693, + "step": 36090 + }, + { + "epoch": 1.77, + "grad_norm": 0.6762685775756836, + "learning_rate": 0.0002166849304741641, + "loss": 2.9417, + "step": 36091 + }, + { + "epoch": 1.77, + "grad_norm": 0.6573099493980408, + "learning_rate": 0.00021667014003402154, + "loss": 2.9705, + "step": 36092 + }, + { + "epoch": 1.77, + "grad_norm": 0.6102715730667114, + "learning_rate": 0.00021665534981335388, + "loss": 3.0654, + "step": 36093 + }, + { + "epoch": 1.77, + "grad_norm": 0.6486304998397827, + "learning_rate": 0.00021664055981219984, + "loss": 2.9239, + "step": 36094 + }, + { + "epoch": 1.77, + "grad_norm": 0.5909685492515564, + "learning_rate": 0.00021662577003059869, + "loss": 3.0793, + "step": 36095 + }, + { + "epoch": 1.77, + "grad_norm": 0.6453856229782104, + "learning_rate": 0.00021661098046858908, + "loss": 2.9635, + "step": 36096 + }, + { + "epoch": 1.77, + "grad_norm": 0.6692531108856201, + "learning_rate": 0.0002165961911262101, + "loss": 3.0343, + "step": 36097 + }, + { + "epoch": 1.77, + "grad_norm": 0.6578263640403748, + "learning_rate": 0.0002165814020035008, + "loss": 3.1648, + "step": 36098 + }, + { + "epoch": 1.77, + "grad_norm": 0.6265367865562439, + "learning_rate": 0.0002165666131005, + "loss": 3.0647, + "step": 36099 + }, + { + "epoch": 1.77, + "grad_norm": 0.615085780620575, + "learning_rate": 0.00021655182441724673, + "loss": 3.1935, + "step": 36100 + }, + { + "epoch": 1.77, + "grad_norm": 0.6076129078865051, + "learning_rate": 0.0002165370359537797, + "loss": 3.0154, + "step": 36101 + }, + { + "epoch": 1.77, + "grad_norm": 0.6174434423446655, + "learning_rate": 0.0002165222477101382, + "loss": 3.0799, + "step": 36102 + }, + { + "epoch": 1.77, + "grad_norm": 0.6437174081802368, + "learning_rate": 0.00021650745968636108, + "loss": 2.9149, + "step": 36103 + }, + { + "epoch": 1.77, + "grad_norm": 0.614182710647583, + "learning_rate": 0.0002164926718824871, + "loss": 3.0644, + "step": 36104 + }, + { + "epoch": 1.77, + "grad_norm": 0.6699419617652893, + "learning_rate": 0.0002164778842985555, + "loss": 2.8719, + "step": 36105 + }, + { + "epoch": 1.77, + "grad_norm": 0.636222243309021, + "learning_rate": 0.00021646309693460495, + "loss": 3.0052, + "step": 36106 + }, + { + "epoch": 1.77, + "grad_norm": 0.6297125816345215, + "learning_rate": 0.00021644830979067456, + "loss": 3.184, + "step": 36107 + }, + { + "epoch": 1.77, + "grad_norm": 0.6243493556976318, + "learning_rate": 0.00021643352286680333, + "loss": 2.8433, + "step": 36108 + }, + { + "epoch": 1.77, + "grad_norm": 0.6312358975410461, + "learning_rate": 0.00021641873616303, + "loss": 3.1941, + "step": 36109 + }, + { + "epoch": 1.77, + "grad_norm": 0.6219147443771362, + "learning_rate": 0.00021640394967939374, + "loss": 3.0337, + "step": 36110 + }, + { + "epoch": 1.77, + "grad_norm": 0.6636743545532227, + "learning_rate": 0.0002163891634159333, + "loss": 3.2584, + "step": 36111 + }, + { + "epoch": 1.77, + "grad_norm": 0.6117985844612122, + "learning_rate": 0.00021637437737268773, + "loss": 3.284, + "step": 36112 + }, + { + "epoch": 1.77, + "grad_norm": 0.6436432003974915, + "learning_rate": 0.00021635959154969603, + "loss": 2.99, + "step": 36113 + }, + { + "epoch": 1.77, + "grad_norm": 0.6268394589424133, + "learning_rate": 0.00021634480594699698, + "loss": 3.0048, + "step": 36114 + }, + { + "epoch": 1.77, + "grad_norm": 0.6493527889251709, + "learning_rate": 0.0002163300205646297, + "loss": 3.037, + "step": 36115 + }, + { + "epoch": 1.77, + "grad_norm": 0.6208270192146301, + "learning_rate": 0.00021631523540263292, + "loss": 2.973, + "step": 36116 + }, + { + "epoch": 1.77, + "grad_norm": 0.6634708642959595, + "learning_rate": 0.00021630045046104577, + "loss": 2.9894, + "step": 36117 + }, + { + "epoch": 1.77, + "grad_norm": 0.6609952449798584, + "learning_rate": 0.00021628566573990717, + "loss": 3.1561, + "step": 36118 + }, + { + "epoch": 1.77, + "grad_norm": 0.6642773747444153, + "learning_rate": 0.0002162708812392559, + "loss": 2.9295, + "step": 36119 + }, + { + "epoch": 1.77, + "grad_norm": 0.6038638353347778, + "learning_rate": 0.00021625609695913116, + "loss": 2.869, + "step": 36120 + }, + { + "epoch": 1.77, + "grad_norm": 0.6279966831207275, + "learning_rate": 0.00021624131289957167, + "loss": 2.9704, + "step": 36121 + }, + { + "epoch": 1.77, + "grad_norm": 0.6348757147789001, + "learning_rate": 0.00021622652906061636, + "loss": 3.0251, + "step": 36122 + }, + { + "epoch": 1.77, + "grad_norm": 0.6414353847503662, + "learning_rate": 0.0002162117454423044, + "loss": 2.9168, + "step": 36123 + }, + { + "epoch": 1.77, + "grad_norm": 0.6199612617492676, + "learning_rate": 0.00021619696204467452, + "loss": 2.956, + "step": 36124 + }, + { + "epoch": 1.77, + "grad_norm": 0.6189448833465576, + "learning_rate": 0.00021618217886776577, + "loss": 3.0051, + "step": 36125 + }, + { + "epoch": 1.77, + "grad_norm": 0.6277577877044678, + "learning_rate": 0.0002161673959116169, + "loss": 3.0413, + "step": 36126 + }, + { + "epoch": 1.77, + "grad_norm": 0.6269301772117615, + "learning_rate": 0.00021615261317626692, + "loss": 3.1164, + "step": 36127 + }, + { + "epoch": 1.77, + "grad_norm": 0.6273691058158875, + "learning_rate": 0.00021613783066175505, + "loss": 2.7998, + "step": 36128 + }, + { + "epoch": 1.77, + "grad_norm": 0.6344377398490906, + "learning_rate": 0.00021612304836811984, + "loss": 3.1015, + "step": 36129 + }, + { + "epoch": 1.77, + "grad_norm": 0.6333027482032776, + "learning_rate": 0.00021610826629540048, + "loss": 2.893, + "step": 36130 + }, + { + "epoch": 1.77, + "grad_norm": 0.6379823088645935, + "learning_rate": 0.0002160934844436357, + "loss": 3.0585, + "step": 36131 + }, + { + "epoch": 1.77, + "grad_norm": 0.6152437925338745, + "learning_rate": 0.00021607870281286455, + "loss": 3.0948, + "step": 36132 + }, + { + "epoch": 1.77, + "grad_norm": 0.8633241057395935, + "learning_rate": 0.00021606392140312606, + "loss": 2.9737, + "step": 36133 + }, + { + "epoch": 1.77, + "grad_norm": 0.6464073061943054, + "learning_rate": 0.00021604914021445886, + "loss": 2.9933, + "step": 36134 + }, + { + "epoch": 1.77, + "grad_norm": 0.6235373616218567, + "learning_rate": 0.00021603435924690223, + "loss": 3.0425, + "step": 36135 + }, + { + "epoch": 1.77, + "grad_norm": 0.7127751708030701, + "learning_rate": 0.00021601957850049487, + "loss": 2.9248, + "step": 36136 + }, + { + "epoch": 1.77, + "grad_norm": 0.6603729724884033, + "learning_rate": 0.00021600479797527573, + "loss": 3.1996, + "step": 36137 + }, + { + "epoch": 1.77, + "grad_norm": 0.6313992738723755, + "learning_rate": 0.00021599001767128387, + "loss": 3.0105, + "step": 36138 + }, + { + "epoch": 1.77, + "grad_norm": 0.6753054857254028, + "learning_rate": 0.00021597523758855813, + "loss": 2.942, + "step": 36139 + }, + { + "epoch": 1.77, + "grad_norm": 0.6346279978752136, + "learning_rate": 0.0002159604577271375, + "loss": 2.9444, + "step": 36140 + }, + { + "epoch": 1.77, + "grad_norm": 0.6592559218406677, + "learning_rate": 0.00021594567808706067, + "loss": 3.264, + "step": 36141 + }, + { + "epoch": 1.77, + "grad_norm": 0.6272072196006775, + "learning_rate": 0.00021593089866836683, + "loss": 3.0916, + "step": 36142 + }, + { + "epoch": 1.77, + "grad_norm": 0.6413544416427612, + "learning_rate": 0.0002159161194710949, + "loss": 3.0556, + "step": 36143 + }, + { + "epoch": 1.77, + "grad_norm": 0.6144356727600098, + "learning_rate": 0.00021590134049528364, + "loss": 3.0897, + "step": 36144 + }, + { + "epoch": 1.77, + "grad_norm": 0.6145825982093811, + "learning_rate": 0.00021588656174097214, + "loss": 3.1299, + "step": 36145 + }, + { + "epoch": 1.77, + "grad_norm": 0.6415427327156067, + "learning_rate": 0.0002158717832081992, + "loss": 3.0902, + "step": 36146 + }, + { + "epoch": 1.77, + "grad_norm": 0.6182569265365601, + "learning_rate": 0.00021585700489700383, + "loss": 2.9496, + "step": 36147 + }, + { + "epoch": 1.77, + "grad_norm": 0.6118202209472656, + "learning_rate": 0.00021584222680742484, + "loss": 2.9401, + "step": 36148 + }, + { + "epoch": 1.77, + "grad_norm": 0.630115807056427, + "learning_rate": 0.00021582744893950122, + "loss": 2.9302, + "step": 36149 + }, + { + "epoch": 1.77, + "grad_norm": 0.6311905384063721, + "learning_rate": 0.000215812671293272, + "loss": 3.1874, + "step": 36150 + }, + { + "epoch": 1.77, + "grad_norm": 0.632701575756073, + "learning_rate": 0.0002157978938687759, + "loss": 2.9891, + "step": 36151 + }, + { + "epoch": 1.77, + "grad_norm": 0.6322634220123291, + "learning_rate": 0.00021578311666605208, + "loss": 2.8718, + "step": 36152 + }, + { + "epoch": 1.77, + "grad_norm": 0.6122119426727295, + "learning_rate": 0.00021576833968513917, + "loss": 3.0455, + "step": 36153 + }, + { + "epoch": 1.77, + "grad_norm": 0.6663072109222412, + "learning_rate": 0.0002157535629260763, + "loss": 2.8699, + "step": 36154 + }, + { + "epoch": 1.77, + "grad_norm": 0.6152451038360596, + "learning_rate": 0.0002157387863889024, + "loss": 3.0347, + "step": 36155 + }, + { + "epoch": 1.77, + "grad_norm": 0.614471435546875, + "learning_rate": 0.0002157240100736562, + "loss": 3.1776, + "step": 36156 + }, + { + "epoch": 1.77, + "grad_norm": 1.1464407444000244, + "learning_rate": 0.00021570923398037684, + "loss": 3.0467, + "step": 36157 + }, + { + "epoch": 1.77, + "grad_norm": 0.626950204372406, + "learning_rate": 0.0002156944581091031, + "loss": 2.7787, + "step": 36158 + }, + { + "epoch": 1.77, + "grad_norm": 0.6673448085784912, + "learning_rate": 0.00021567968245987386, + "loss": 3.0078, + "step": 36159 + }, + { + "epoch": 1.77, + "grad_norm": 0.6380613446235657, + "learning_rate": 0.00021566490703272824, + "loss": 2.8045, + "step": 36160 + }, + { + "epoch": 1.77, + "grad_norm": 0.6106299161911011, + "learning_rate": 0.00021565013182770493, + "loss": 3.2147, + "step": 36161 + }, + { + "epoch": 1.77, + "grad_norm": 0.6322900652885437, + "learning_rate": 0.00021563535684484305, + "loss": 3.0431, + "step": 36162 + }, + { + "epoch": 1.77, + "grad_norm": 0.603399932384491, + "learning_rate": 0.00021562058208418123, + "loss": 2.9781, + "step": 36163 + }, + { + "epoch": 1.77, + "grad_norm": 0.6214710474014282, + "learning_rate": 0.00021560580754575865, + "loss": 3.0257, + "step": 36164 + }, + { + "epoch": 1.77, + "grad_norm": 0.6370988488197327, + "learning_rate": 0.00021559103322961419, + "loss": 3.0746, + "step": 36165 + }, + { + "epoch": 1.77, + "grad_norm": 0.6562209129333496, + "learning_rate": 0.00021557625913578654, + "loss": 2.9225, + "step": 36166 + }, + { + "epoch": 1.77, + "grad_norm": 0.6525112986564636, + "learning_rate": 0.00021556148526431497, + "loss": 2.9325, + "step": 36167 + }, + { + "epoch": 1.77, + "grad_norm": 0.6724076867103577, + "learning_rate": 0.00021554671161523806, + "loss": 2.9713, + "step": 36168 + }, + { + "epoch": 1.77, + "grad_norm": 0.6039425134658813, + "learning_rate": 0.00021553193818859488, + "loss": 3.0612, + "step": 36169 + }, + { + "epoch": 1.77, + "grad_norm": 0.6516081690788269, + "learning_rate": 0.0002155171649844244, + "loss": 2.8402, + "step": 36170 + }, + { + "epoch": 1.77, + "grad_norm": 0.6352439522743225, + "learning_rate": 0.00021550239200276536, + "loss": 3.0824, + "step": 36171 + }, + { + "epoch": 1.77, + "grad_norm": 0.6601967215538025, + "learning_rate": 0.00021548761924365683, + "loss": 2.9832, + "step": 36172 + }, + { + "epoch": 1.77, + "grad_norm": 0.6137501001358032, + "learning_rate": 0.0002154728467071376, + "loss": 3.0011, + "step": 36173 + }, + { + "epoch": 1.77, + "grad_norm": 0.6737114787101746, + "learning_rate": 0.00021545807439324655, + "loss": 2.8073, + "step": 36174 + }, + { + "epoch": 1.77, + "grad_norm": 0.6427306532859802, + "learning_rate": 0.0002154433023020228, + "loss": 2.913, + "step": 36175 + }, + { + "epoch": 1.77, + "grad_norm": 0.6364057660102844, + "learning_rate": 0.00021542853043350507, + "loss": 3.0798, + "step": 36176 + }, + { + "epoch": 1.77, + "grad_norm": 0.6504160165786743, + "learning_rate": 0.00021541375878773237, + "loss": 2.9817, + "step": 36177 + }, + { + "epoch": 1.77, + "grad_norm": 0.669624924659729, + "learning_rate": 0.00021539898736474338, + "loss": 2.8296, + "step": 36178 + }, + { + "epoch": 1.77, + "grad_norm": 0.6107624769210815, + "learning_rate": 0.0002153842161645773, + "loss": 3.046, + "step": 36179 + }, + { + "epoch": 1.77, + "grad_norm": 0.6231289505958557, + "learning_rate": 0.00021536944518727296, + "loss": 3.1231, + "step": 36180 + }, + { + "epoch": 1.77, + "grad_norm": 0.6516662836074829, + "learning_rate": 0.0002153546744328691, + "loss": 3.0611, + "step": 36181 + }, + { + "epoch": 1.77, + "grad_norm": 0.6371469497680664, + "learning_rate": 0.00021533990390140484, + "loss": 3.0647, + "step": 36182 + }, + { + "epoch": 1.77, + "grad_norm": 0.6329872608184814, + "learning_rate": 0.00021532513359291886, + "loss": 2.9429, + "step": 36183 + }, + { + "epoch": 1.77, + "grad_norm": 0.6683485507965088, + "learning_rate": 0.00021531036350745018, + "loss": 2.8126, + "step": 36184 + }, + { + "epoch": 1.77, + "grad_norm": 0.6202999949455261, + "learning_rate": 0.00021529559364503784, + "loss": 3.0846, + "step": 36185 + }, + { + "epoch": 1.77, + "grad_norm": 0.6209010481834412, + "learning_rate": 0.00021528082400572055, + "loss": 2.9513, + "step": 36186 + }, + { + "epoch": 1.77, + "grad_norm": 0.6557936072349548, + "learning_rate": 0.00021526605458953733, + "loss": 3.028, + "step": 36187 + }, + { + "epoch": 1.77, + "grad_norm": 0.6131925582885742, + "learning_rate": 0.0002152512853965268, + "loss": 2.867, + "step": 36188 + }, + { + "epoch": 1.77, + "grad_norm": 0.6493362188339233, + "learning_rate": 0.00021523651642672815, + "loss": 2.935, + "step": 36189 + }, + { + "epoch": 1.77, + "grad_norm": 0.8060845136642456, + "learning_rate": 0.00021522174768018033, + "loss": 3.1432, + "step": 36190 + }, + { + "epoch": 1.77, + "grad_norm": 0.5938668251037598, + "learning_rate": 0.00021520697915692203, + "loss": 2.9079, + "step": 36191 + }, + { + "epoch": 1.77, + "grad_norm": 0.6498206257820129, + "learning_rate": 0.0002151922108569923, + "loss": 2.9356, + "step": 36192 + }, + { + "epoch": 1.77, + "grad_norm": 0.6662101745605469, + "learning_rate": 0.00021517744278042982, + "loss": 2.9902, + "step": 36193 + }, + { + "epoch": 1.77, + "grad_norm": 0.686980128288269, + "learning_rate": 0.00021516267492727373, + "loss": 2.9666, + "step": 36194 + }, + { + "epoch": 1.77, + "grad_norm": 0.6905176639556885, + "learning_rate": 0.00021514790729756285, + "loss": 2.9056, + "step": 36195 + }, + { + "epoch": 1.77, + "grad_norm": 0.6709184050559998, + "learning_rate": 0.00021513313989133596, + "loss": 3.0483, + "step": 36196 + }, + { + "epoch": 1.77, + "grad_norm": 0.6174628734588623, + "learning_rate": 0.00021511837270863214, + "loss": 3.0596, + "step": 36197 + }, + { + "epoch": 1.77, + "grad_norm": 0.6258738040924072, + "learning_rate": 0.0002151036057494902, + "loss": 3.0001, + "step": 36198 + }, + { + "epoch": 1.77, + "grad_norm": 0.6265406012535095, + "learning_rate": 0.00021508883901394886, + "loss": 3.172, + "step": 36199 + }, + { + "epoch": 1.77, + "grad_norm": 0.6480788588523865, + "learning_rate": 0.00021507407250204736, + "loss": 2.9221, + "step": 36200 + }, + { + "epoch": 1.77, + "grad_norm": 0.6287563443183899, + "learning_rate": 0.00021505930621382437, + "loss": 3.0638, + "step": 36201 + }, + { + "epoch": 1.77, + "grad_norm": 0.6304842829704285, + "learning_rate": 0.00021504454014931884, + "loss": 2.9461, + "step": 36202 + }, + { + "epoch": 1.77, + "grad_norm": 0.6325621604919434, + "learning_rate": 0.0002150297743085695, + "loss": 2.892, + "step": 36203 + }, + { + "epoch": 1.77, + "grad_norm": 0.6825348734855652, + "learning_rate": 0.00021501500869161548, + "loss": 2.8529, + "step": 36204 + }, + { + "epoch": 1.77, + "grad_norm": 0.6398288607597351, + "learning_rate": 0.00021500024329849563, + "loss": 2.9031, + "step": 36205 + }, + { + "epoch": 1.77, + "grad_norm": 0.5954134464263916, + "learning_rate": 0.00021498547812924865, + "loss": 2.8321, + "step": 36206 + }, + { + "epoch": 1.77, + "grad_norm": 0.6618627309799194, + "learning_rate": 0.00021497071318391368, + "loss": 3.1237, + "step": 36207 + }, + { + "epoch": 1.77, + "grad_norm": 0.6097262501716614, + "learning_rate": 0.00021495594846252937, + "loss": 2.97, + "step": 36208 + }, + { + "epoch": 1.77, + "grad_norm": 0.6133797764778137, + "learning_rate": 0.00021494118396513475, + "loss": 3.1634, + "step": 36209 + }, + { + "epoch": 1.77, + "grad_norm": 0.7019491791725159, + "learning_rate": 0.0002149264196917688, + "loss": 2.9923, + "step": 36210 + }, + { + "epoch": 1.77, + "grad_norm": 0.6512349843978882, + "learning_rate": 0.00021491165564247011, + "loss": 3.1578, + "step": 36211 + }, + { + "epoch": 1.77, + "grad_norm": 0.6159108281135559, + "learning_rate": 0.0002148968918172779, + "loss": 2.9716, + "step": 36212 + }, + { + "epoch": 1.77, + "grad_norm": 0.6668187975883484, + "learning_rate": 0.00021488212821623085, + "loss": 2.9477, + "step": 36213 + }, + { + "epoch": 1.77, + "grad_norm": 0.6413983106613159, + "learning_rate": 0.0002148673648393678, + "loss": 3.215, + "step": 36214 + }, + { + "epoch": 1.77, + "grad_norm": 0.6101641058921814, + "learning_rate": 0.0002148526016867279, + "loss": 2.7737, + "step": 36215 + }, + { + "epoch": 1.77, + "grad_norm": 0.622040867805481, + "learning_rate": 0.00021483783875834972, + "loss": 2.9313, + "step": 36216 + }, + { + "epoch": 1.77, + "grad_norm": 0.6527264714241028, + "learning_rate": 0.0002148230760542724, + "loss": 3.2522, + "step": 36217 + }, + { + "epoch": 1.77, + "grad_norm": 0.6784375905990601, + "learning_rate": 0.00021480831357453456, + "loss": 2.8937, + "step": 36218 + }, + { + "epoch": 1.78, + "grad_norm": 0.6425294876098633, + "learning_rate": 0.0002147935513191753, + "loss": 2.905, + "step": 36219 + }, + { + "epoch": 1.78, + "grad_norm": 0.6469739675521851, + "learning_rate": 0.0002147787892882335, + "loss": 2.9366, + "step": 36220 + }, + { + "epoch": 1.78, + "grad_norm": 0.6778955459594727, + "learning_rate": 0.00021476402748174783, + "loss": 3.0863, + "step": 36221 + }, + { + "epoch": 1.78, + "grad_norm": 0.5806019306182861, + "learning_rate": 0.00021474926589975745, + "loss": 2.7963, + "step": 36222 + }, + { + "epoch": 1.78, + "grad_norm": 0.6367674469947815, + "learning_rate": 0.000214734504542301, + "loss": 2.9159, + "step": 36223 + }, + { + "epoch": 1.78, + "grad_norm": 0.6361095905303955, + "learning_rate": 0.00021471974340941754, + "loss": 3.1667, + "step": 36224 + }, + { + "epoch": 1.78, + "grad_norm": 0.6673516631126404, + "learning_rate": 0.0002147049825011457, + "loss": 3.1121, + "step": 36225 + }, + { + "epoch": 1.78, + "grad_norm": 0.6467750072479248, + "learning_rate": 0.00021469022181752465, + "loss": 3.1073, + "step": 36226 + }, + { + "epoch": 1.78, + "grad_norm": 0.6565433740615845, + "learning_rate": 0.00021467546135859315, + "loss": 2.8489, + "step": 36227 + }, + { + "epoch": 1.78, + "grad_norm": 0.6047545075416565, + "learning_rate": 0.00021466070112438997, + "loss": 3.0128, + "step": 36228 + }, + { + "epoch": 1.78, + "grad_norm": 0.6131613254547119, + "learning_rate": 0.00021464594111495419, + "loss": 2.8185, + "step": 36229 + }, + { + "epoch": 1.78, + "grad_norm": 0.6076128482818604, + "learning_rate": 0.0002146311813303244, + "loss": 3.0758, + "step": 36230 + }, + { + "epoch": 1.78, + "grad_norm": 0.6186553835868835, + "learning_rate": 0.0002146164217705398, + "loss": 2.9951, + "step": 36231 + }, + { + "epoch": 1.78, + "grad_norm": 0.6514021754264832, + "learning_rate": 0.00021460166243563914, + "loss": 3.0689, + "step": 36232 + }, + { + "epoch": 1.78, + "grad_norm": 0.6270773410797119, + "learning_rate": 0.00021458690332566113, + "loss": 3.036, + "step": 36233 + }, + { + "epoch": 1.78, + "grad_norm": 0.6818325519561768, + "learning_rate": 0.00021457214444064492, + "loss": 2.9342, + "step": 36234 + }, + { + "epoch": 1.78, + "grad_norm": 0.6430490016937256, + "learning_rate": 0.00021455738578062916, + "loss": 2.9211, + "step": 36235 + }, + { + "epoch": 1.78, + "grad_norm": 0.6284935474395752, + "learning_rate": 0.00021454262734565277, + "loss": 3.1729, + "step": 36236 + }, + { + "epoch": 1.78, + "grad_norm": 0.646930456161499, + "learning_rate": 0.00021452786913575474, + "loss": 2.983, + "step": 36237 + }, + { + "epoch": 1.78, + "grad_norm": 0.6334282159805298, + "learning_rate": 0.00021451311115097378, + "loss": 3.1183, + "step": 36238 + }, + { + "epoch": 1.78, + "grad_norm": 0.6594672203063965, + "learning_rate": 0.00021449835339134893, + "loss": 3.0003, + "step": 36239 + }, + { + "epoch": 1.78, + "grad_norm": 0.6455363035202026, + "learning_rate": 0.00021448359585691883, + "loss": 3.0601, + "step": 36240 + }, + { + "epoch": 1.78, + "grad_norm": 0.6710865497589111, + "learning_rate": 0.00021446883854772255, + "loss": 2.9493, + "step": 36241 + }, + { + "epoch": 1.78, + "grad_norm": 0.6526823043823242, + "learning_rate": 0.00021445408146379894, + "loss": 3.1155, + "step": 36242 + }, + { + "epoch": 1.78, + "grad_norm": 0.6468623876571655, + "learning_rate": 0.0002144393246051867, + "loss": 2.7825, + "step": 36243 + }, + { + "epoch": 1.78, + "grad_norm": 0.6826371550559998, + "learning_rate": 0.00021442456797192492, + "loss": 3.0516, + "step": 36244 + }, + { + "epoch": 1.78, + "grad_norm": 0.6300487518310547, + "learning_rate": 0.0002144098115640523, + "loss": 3.0863, + "step": 36245 + }, + { + "epoch": 1.78, + "grad_norm": 0.6709060072898865, + "learning_rate": 0.00021439505538160776, + "loss": 3.0067, + "step": 36246 + }, + { + "epoch": 1.78, + "grad_norm": 0.6281489729881287, + "learning_rate": 0.0002143802994246302, + "loss": 3.1286, + "step": 36247 + }, + { + "epoch": 1.78, + "grad_norm": 0.6108605861663818, + "learning_rate": 0.00021436554369315845, + "loss": 3.0594, + "step": 36248 + }, + { + "epoch": 1.78, + "grad_norm": 0.692886233329773, + "learning_rate": 0.00021435078818723144, + "loss": 2.9815, + "step": 36249 + }, + { + "epoch": 1.78, + "grad_norm": 0.6155291199684143, + "learning_rate": 0.0002143360329068878, + "loss": 2.9099, + "step": 36250 + }, + { + "epoch": 1.78, + "grad_norm": 0.5946487784385681, + "learning_rate": 0.0002143212778521666, + "loss": 3.0502, + "step": 36251 + }, + { + "epoch": 1.78, + "grad_norm": 0.6302567720413208, + "learning_rate": 0.0002143065230231068, + "loss": 3.019, + "step": 36252 + }, + { + "epoch": 1.78, + "grad_norm": 0.610625684261322, + "learning_rate": 0.00021429176841974702, + "loss": 2.9311, + "step": 36253 + }, + { + "epoch": 1.78, + "grad_norm": 0.6491797566413879, + "learning_rate": 0.00021427701404212634, + "loss": 2.978, + "step": 36254 + }, + { + "epoch": 1.78, + "grad_norm": 0.6238184571266174, + "learning_rate": 0.00021426225989028334, + "loss": 3.1842, + "step": 36255 + }, + { + "epoch": 1.78, + "grad_norm": 0.6471647024154663, + "learning_rate": 0.00021424750596425713, + "loss": 3.1372, + "step": 36256 + }, + { + "epoch": 1.78, + "grad_norm": 0.6920592188835144, + "learning_rate": 0.00021423275226408654, + "loss": 3.1878, + "step": 36257 + }, + { + "epoch": 1.78, + "grad_norm": 0.6681246757507324, + "learning_rate": 0.00021421799878981027, + "loss": 2.9845, + "step": 36258 + }, + { + "epoch": 1.78, + "grad_norm": 0.6273609399795532, + "learning_rate": 0.0002142032455414674, + "loss": 3.0139, + "step": 36259 + }, + { + "epoch": 1.78, + "grad_norm": 0.6397222876548767, + "learning_rate": 0.0002141884925190966, + "loss": 2.9536, + "step": 36260 + }, + { + "epoch": 1.78, + "grad_norm": 0.6393402814865112, + "learning_rate": 0.00021417373972273672, + "loss": 2.983, + "step": 36261 + }, + { + "epoch": 1.78, + "grad_norm": 0.619844913482666, + "learning_rate": 0.00021415898715242682, + "loss": 3.053, + "step": 36262 + }, + { + "epoch": 1.78, + "grad_norm": 0.6226319074630737, + "learning_rate": 0.00021414423480820558, + "loss": 3.3036, + "step": 36263 + }, + { + "epoch": 1.78, + "grad_norm": 0.7460436820983887, + "learning_rate": 0.000214129482690112, + "loss": 2.7747, + "step": 36264 + }, + { + "epoch": 1.78, + "grad_norm": 0.6607783436775208, + "learning_rate": 0.00021411473079818464, + "loss": 3.0174, + "step": 36265 + }, + { + "epoch": 1.78, + "grad_norm": 0.6538329124450684, + "learning_rate": 0.00021409997913246262, + "loss": 3.0033, + "step": 36266 + }, + { + "epoch": 1.78, + "grad_norm": 0.704361081123352, + "learning_rate": 0.0002140852276929848, + "loss": 3.0254, + "step": 36267 + }, + { + "epoch": 1.78, + "grad_norm": 0.7130153775215149, + "learning_rate": 0.00021407047647978988, + "loss": 3.0591, + "step": 36268 + }, + { + "epoch": 1.78, + "grad_norm": 0.598651111125946, + "learning_rate": 0.00021405572549291685, + "loss": 3.1185, + "step": 36269 + }, + { + "epoch": 1.78, + "grad_norm": 0.6102203726768494, + "learning_rate": 0.0002140409747324044, + "loss": 3.2775, + "step": 36270 + }, + { + "epoch": 1.78, + "grad_norm": 0.6122444868087769, + "learning_rate": 0.00021402622419829155, + "loss": 3.1641, + "step": 36271 + }, + { + "epoch": 1.78, + "grad_norm": 0.6745235919952393, + "learning_rate": 0.00021401147389061716, + "loss": 2.8793, + "step": 36272 + }, + { + "epoch": 1.78, + "grad_norm": 0.6349541544914246, + "learning_rate": 0.0002139967238094198, + "loss": 3.0279, + "step": 36273 + }, + { + "epoch": 1.78, + "grad_norm": 0.631254255771637, + "learning_rate": 0.0002139819739547387, + "loss": 2.9795, + "step": 36274 + }, + { + "epoch": 1.78, + "grad_norm": 0.6068877577781677, + "learning_rate": 0.00021396722432661245, + "loss": 2.9905, + "step": 36275 + }, + { + "epoch": 1.78, + "grad_norm": 0.677623987197876, + "learning_rate": 0.0002139524749250799, + "loss": 2.8977, + "step": 36276 + }, + { + "epoch": 1.78, + "grad_norm": 0.6311962604522705, + "learning_rate": 0.00021393772575018012, + "loss": 3.075, + "step": 36277 + }, + { + "epoch": 1.78, + "grad_norm": 0.6224701404571533, + "learning_rate": 0.00021392297680195173, + "loss": 3.0223, + "step": 36278 + }, + { + "epoch": 1.78, + "grad_norm": 0.6306285858154297, + "learning_rate": 0.00021390822808043373, + "loss": 3.163, + "step": 36279 + }, + { + "epoch": 1.78, + "grad_norm": 0.6172642707824707, + "learning_rate": 0.00021389347958566474, + "loss": 3.0831, + "step": 36280 + }, + { + "epoch": 1.78, + "grad_norm": 0.6490658521652222, + "learning_rate": 0.0002138787313176838, + "loss": 2.973, + "step": 36281 + }, + { + "epoch": 1.78, + "grad_norm": 0.6317246556282043, + "learning_rate": 0.00021386398327652981, + "loss": 3.3257, + "step": 36282 + }, + { + "epoch": 1.78, + "grad_norm": 0.6703366041183472, + "learning_rate": 0.00021384923546224137, + "loss": 3.067, + "step": 36283 + }, + { + "epoch": 1.78, + "grad_norm": 0.6874723434448242, + "learning_rate": 0.0002138344878748576, + "loss": 2.9391, + "step": 36284 + }, + { + "epoch": 1.78, + "grad_norm": 0.6609402894973755, + "learning_rate": 0.0002138197405144171, + "loss": 2.9994, + "step": 36285 + }, + { + "epoch": 1.78, + "grad_norm": 0.6028761267662048, + "learning_rate": 0.0002138049933809588, + "loss": 3.0421, + "step": 36286 + }, + { + "epoch": 1.78, + "grad_norm": 0.6413368582725525, + "learning_rate": 0.00021379024647452167, + "loss": 3.0925, + "step": 36287 + }, + { + "epoch": 1.78, + "grad_norm": 0.660110354423523, + "learning_rate": 0.00021377549979514438, + "loss": 2.8916, + "step": 36288 + }, + { + "epoch": 1.78, + "grad_norm": 0.6747713088989258, + "learning_rate": 0.0002137607533428659, + "loss": 2.8905, + "step": 36289 + }, + { + "epoch": 1.78, + "grad_norm": 0.600744366645813, + "learning_rate": 0.00021374600711772488, + "loss": 3.1411, + "step": 36290 + }, + { + "epoch": 1.78, + "grad_norm": 0.6018193960189819, + "learning_rate": 0.00021373126111976027, + "loss": 3.0956, + "step": 36291 + }, + { + "epoch": 1.78, + "grad_norm": 0.611671507358551, + "learning_rate": 0.00021371651534901102, + "loss": 3.064, + "step": 36292 + }, + { + "epoch": 1.78, + "grad_norm": 0.6379413604736328, + "learning_rate": 0.00021370176980551583, + "loss": 2.9962, + "step": 36293 + }, + { + "epoch": 1.78, + "grad_norm": 0.644549548625946, + "learning_rate": 0.00021368702448931363, + "loss": 2.9289, + "step": 36294 + }, + { + "epoch": 1.78, + "grad_norm": 0.6469461917877197, + "learning_rate": 0.00021367227940044303, + "loss": 3.3018, + "step": 36295 + }, + { + "epoch": 1.78, + "grad_norm": 0.6457450985908508, + "learning_rate": 0.00021365753453894313, + "loss": 3.0339, + "step": 36296 + }, + { + "epoch": 1.78, + "grad_norm": 0.6415975689888, + "learning_rate": 0.00021364278990485273, + "loss": 3.1975, + "step": 36297 + }, + { + "epoch": 1.78, + "grad_norm": 0.6394268870353699, + "learning_rate": 0.0002136280454982105, + "loss": 3.05, + "step": 36298 + }, + { + "epoch": 1.78, + "grad_norm": 0.6522897481918335, + "learning_rate": 0.00021361330131905543, + "loss": 3.0616, + "step": 36299 + }, + { + "epoch": 1.78, + "grad_norm": 0.6101564764976501, + "learning_rate": 0.0002135985573674263, + "loss": 3.218, + "step": 36300 + }, + { + "epoch": 1.78, + "grad_norm": 0.6102758049964905, + "learning_rate": 0.000213583813643362, + "loss": 2.7773, + "step": 36301 + }, + { + "epoch": 1.78, + "grad_norm": 0.5993099212646484, + "learning_rate": 0.00021356907014690113, + "loss": 2.6897, + "step": 36302 + }, + { + "epoch": 1.78, + "grad_norm": 0.629689633846283, + "learning_rate": 0.0002135543268780828, + "loss": 3.0496, + "step": 36303 + }, + { + "epoch": 1.78, + "grad_norm": 0.5861761569976807, + "learning_rate": 0.00021353958383694575, + "loss": 3.0468, + "step": 36304 + }, + { + "epoch": 1.78, + "grad_norm": 0.6414809226989746, + "learning_rate": 0.00021352484102352872, + "loss": 2.7947, + "step": 36305 + }, + { + "epoch": 1.78, + "grad_norm": 0.6364729404449463, + "learning_rate": 0.00021351009843787074, + "loss": 3.0792, + "step": 36306 + }, + { + "epoch": 1.78, + "grad_norm": 0.6140216588973999, + "learning_rate": 0.00021349535608001044, + "loss": 2.8048, + "step": 36307 + }, + { + "epoch": 1.78, + "grad_norm": 0.6230908036231995, + "learning_rate": 0.00021348061394998663, + "loss": 2.9799, + "step": 36308 + }, + { + "epoch": 1.78, + "grad_norm": 0.6611496210098267, + "learning_rate": 0.00021346587204783843, + "loss": 2.7688, + "step": 36309 + }, + { + "epoch": 1.78, + "grad_norm": 0.638316810131073, + "learning_rate": 0.00021345113037360438, + "loss": 3.2603, + "step": 36310 + }, + { + "epoch": 1.78, + "grad_norm": 0.6499404311180115, + "learning_rate": 0.00021343638892732343, + "loss": 2.8078, + "step": 36311 + }, + { + "epoch": 1.78, + "grad_norm": 0.6417861580848694, + "learning_rate": 0.00021342164770903426, + "loss": 2.8695, + "step": 36312 + }, + { + "epoch": 1.78, + "grad_norm": 0.6680542230606079, + "learning_rate": 0.0002134069067187758, + "loss": 3.202, + "step": 36313 + }, + { + "epoch": 1.78, + "grad_norm": 0.6242054104804993, + "learning_rate": 0.000213392165956587, + "loss": 2.8746, + "step": 36314 + }, + { + "epoch": 1.78, + "grad_norm": 0.6265760064125061, + "learning_rate": 0.00021337742542250652, + "loss": 3.0401, + "step": 36315 + }, + { + "epoch": 1.78, + "grad_norm": 0.6692229509353638, + "learning_rate": 0.00021336268511657332, + "loss": 2.6561, + "step": 36316 + }, + { + "epoch": 1.78, + "grad_norm": 0.7591632008552551, + "learning_rate": 0.00021334794503882596, + "loss": 3.0317, + "step": 36317 + }, + { + "epoch": 1.78, + "grad_norm": 0.6061434745788574, + "learning_rate": 0.00021333320518930355, + "loss": 3.0425, + "step": 36318 + }, + { + "epoch": 1.78, + "grad_norm": 0.6159639954566956, + "learning_rate": 0.00021331846556804483, + "loss": 2.9656, + "step": 36319 + }, + { + "epoch": 1.78, + "grad_norm": 0.5951846241950989, + "learning_rate": 0.00021330372617508846, + "loss": 2.867, + "step": 36320 + }, + { + "epoch": 1.78, + "grad_norm": 0.5990233421325684, + "learning_rate": 0.00021328898701047352, + "loss": 2.9664, + "step": 36321 + }, + { + "epoch": 1.78, + "grad_norm": 0.6403229236602783, + "learning_rate": 0.00021327424807423865, + "loss": 3.045, + "step": 36322 + }, + { + "epoch": 1.78, + "grad_norm": 0.6548666954040527, + "learning_rate": 0.00021325950936642264, + "loss": 3.1726, + "step": 36323 + }, + { + "epoch": 1.78, + "grad_norm": 0.6505340933799744, + "learning_rate": 0.00021324477088706452, + "loss": 3.0317, + "step": 36324 + }, + { + "epoch": 1.78, + "grad_norm": 0.6289445757865906, + "learning_rate": 0.0002132300326362029, + "loss": 3.0337, + "step": 36325 + }, + { + "epoch": 1.78, + "grad_norm": 0.6409799456596375, + "learning_rate": 0.00021321529461387675, + "loss": 3.0774, + "step": 36326 + }, + { + "epoch": 1.78, + "grad_norm": 0.6213870048522949, + "learning_rate": 0.00021320055682012468, + "loss": 3.1011, + "step": 36327 + }, + { + "epoch": 1.78, + "grad_norm": 0.5979524254798889, + "learning_rate": 0.00021318581925498572, + "loss": 2.9269, + "step": 36328 + }, + { + "epoch": 1.78, + "grad_norm": 0.6520103812217712, + "learning_rate": 0.0002131710819184987, + "loss": 2.9932, + "step": 36329 + }, + { + "epoch": 1.78, + "grad_norm": 0.6027911305427551, + "learning_rate": 0.00021315634481070214, + "loss": 3.1689, + "step": 36330 + }, + { + "epoch": 1.78, + "grad_norm": 0.6382830142974854, + "learning_rate": 0.0002131416079316352, + "loss": 3.0135, + "step": 36331 + }, + { + "epoch": 1.78, + "grad_norm": 0.6174009442329407, + "learning_rate": 0.00021312687128133645, + "loss": 2.9106, + "step": 36332 + }, + { + "epoch": 1.78, + "grad_norm": 0.640430212020874, + "learning_rate": 0.00021311213485984486, + "loss": 3.1547, + "step": 36333 + }, + { + "epoch": 1.78, + "grad_norm": 0.6170399785041809, + "learning_rate": 0.00021309739866719922, + "loss": 3.18, + "step": 36334 + }, + { + "epoch": 1.78, + "grad_norm": 0.6654266715049744, + "learning_rate": 0.0002130826627034382, + "loss": 3.0434, + "step": 36335 + }, + { + "epoch": 1.78, + "grad_norm": 0.6619197130203247, + "learning_rate": 0.00021306792696860086, + "loss": 2.9076, + "step": 36336 + }, + { + "epoch": 1.78, + "grad_norm": 0.6682602167129517, + "learning_rate": 0.0002130531914627258, + "loss": 2.9377, + "step": 36337 + }, + { + "epoch": 1.78, + "grad_norm": 0.8006364107131958, + "learning_rate": 0.00021303845618585182, + "loss": 3.014, + "step": 36338 + }, + { + "epoch": 1.78, + "grad_norm": 0.6219844818115234, + "learning_rate": 0.00021302372113801796, + "loss": 2.8757, + "step": 36339 + }, + { + "epoch": 1.78, + "grad_norm": 0.6059569716453552, + "learning_rate": 0.00021300898631926275, + "loss": 2.9424, + "step": 36340 + }, + { + "epoch": 1.78, + "grad_norm": 0.6212844848632812, + "learning_rate": 0.00021299425172962528, + "loss": 2.9297, + "step": 36341 + }, + { + "epoch": 1.78, + "grad_norm": 0.6634481549263, + "learning_rate": 0.00021297951736914402, + "loss": 3.2482, + "step": 36342 + }, + { + "epoch": 1.78, + "grad_norm": 0.6362781524658203, + "learning_rate": 0.00021296478323785804, + "loss": 3.0874, + "step": 36343 + }, + { + "epoch": 1.78, + "grad_norm": 0.721335768699646, + "learning_rate": 0.00021295004933580614, + "loss": 2.8786, + "step": 36344 + }, + { + "epoch": 1.78, + "grad_norm": 0.6178750395774841, + "learning_rate": 0.00021293531566302692, + "loss": 2.8094, + "step": 36345 + }, + { + "epoch": 1.78, + "grad_norm": 0.6878019571304321, + "learning_rate": 0.00021292058221955947, + "loss": 2.9951, + "step": 36346 + }, + { + "epoch": 1.78, + "grad_norm": 0.6131126284599304, + "learning_rate": 0.0002129058490054424, + "loss": 2.993, + "step": 36347 + }, + { + "epoch": 1.78, + "grad_norm": 0.6379331946372986, + "learning_rate": 0.00021289111602071448, + "loss": 2.8931, + "step": 36348 + }, + { + "epoch": 1.78, + "grad_norm": 0.6440445780754089, + "learning_rate": 0.00021287638326541468, + "loss": 3.142, + "step": 36349 + }, + { + "epoch": 1.78, + "grad_norm": 0.6755915880203247, + "learning_rate": 0.00021286165073958175, + "loss": 3.0775, + "step": 36350 + }, + { + "epoch": 1.78, + "grad_norm": 0.6246413588523865, + "learning_rate": 0.0002128469184432545, + "loss": 2.8214, + "step": 36351 + }, + { + "epoch": 1.78, + "grad_norm": 0.6133143305778503, + "learning_rate": 0.0002128321863764715, + "loss": 2.7959, + "step": 36352 + }, + { + "epoch": 1.78, + "grad_norm": 0.6247037649154663, + "learning_rate": 0.0002128174545392718, + "loss": 2.9561, + "step": 36353 + }, + { + "epoch": 1.78, + "grad_norm": 0.6576348543167114, + "learning_rate": 0.00021280272293169424, + "loss": 3.043, + "step": 36354 + }, + { + "epoch": 1.78, + "grad_norm": 0.633882462978363, + "learning_rate": 0.00021278799155377746, + "loss": 3.1632, + "step": 36355 + }, + { + "epoch": 1.78, + "grad_norm": 0.6401931643486023, + "learning_rate": 0.0002127732604055604, + "loss": 3.1631, + "step": 36356 + }, + { + "epoch": 1.78, + "grad_norm": 0.6456989645957947, + "learning_rate": 0.0002127585294870817, + "loss": 2.7861, + "step": 36357 + }, + { + "epoch": 1.78, + "grad_norm": 0.5906917452812195, + "learning_rate": 0.00021274379879838027, + "loss": 3.2061, + "step": 36358 + }, + { + "epoch": 1.78, + "grad_norm": 0.6478468179702759, + "learning_rate": 0.0002127290683394949, + "loss": 3.1233, + "step": 36359 + }, + { + "epoch": 1.78, + "grad_norm": 0.643356204032898, + "learning_rate": 0.00021271433811046432, + "loss": 2.7415, + "step": 36360 + }, + { + "epoch": 1.78, + "grad_norm": 0.651016116142273, + "learning_rate": 0.00021269960811132747, + "loss": 2.8069, + "step": 36361 + }, + { + "epoch": 1.78, + "grad_norm": 0.6640231609344482, + "learning_rate": 0.000212684878342123, + "loss": 2.8503, + "step": 36362 + }, + { + "epoch": 1.78, + "grad_norm": 0.6326684951782227, + "learning_rate": 0.0002126701488028897, + "loss": 2.9992, + "step": 36363 + }, + { + "epoch": 1.78, + "grad_norm": 0.6268022656440735, + "learning_rate": 0.00021265541949366652, + "loss": 3.0261, + "step": 36364 + }, + { + "epoch": 1.78, + "grad_norm": 0.6220657229423523, + "learning_rate": 0.00021264069041449207, + "loss": 2.7213, + "step": 36365 + }, + { + "epoch": 1.78, + "grad_norm": 0.6234197616577148, + "learning_rate": 0.00021262596156540535, + "loss": 3.0385, + "step": 36366 + }, + { + "epoch": 1.78, + "grad_norm": 0.6607543230056763, + "learning_rate": 0.00021261123294644485, + "loss": 3.0352, + "step": 36367 + }, + { + "epoch": 1.78, + "grad_norm": 0.6178925037384033, + "learning_rate": 0.00021259650455764966, + "loss": 3.0404, + "step": 36368 + }, + { + "epoch": 1.78, + "grad_norm": 0.6788184642791748, + "learning_rate": 0.00021258177639905848, + "loss": 2.9027, + "step": 36369 + }, + { + "epoch": 1.78, + "grad_norm": 0.6631069779396057, + "learning_rate": 0.00021256704847071, + "loss": 3.1795, + "step": 36370 + }, + { + "epoch": 1.78, + "grad_norm": 0.6293043494224548, + "learning_rate": 0.00021255232077264314, + "loss": 2.953, + "step": 36371 + }, + { + "epoch": 1.78, + "grad_norm": 0.6410183906555176, + "learning_rate": 0.00021253759330489656, + "loss": 3.0083, + "step": 36372 + }, + { + "epoch": 1.78, + "grad_norm": 0.6360601782798767, + "learning_rate": 0.00021252286606750914, + "loss": 3.0385, + "step": 36373 + }, + { + "epoch": 1.78, + "grad_norm": 0.6407144069671631, + "learning_rate": 0.00021250813906051978, + "loss": 3.0849, + "step": 36374 + }, + { + "epoch": 1.78, + "grad_norm": 0.5937209129333496, + "learning_rate": 0.00021249341228396702, + "loss": 2.8081, + "step": 36375 + }, + { + "epoch": 1.78, + "grad_norm": 0.612413763999939, + "learning_rate": 0.00021247868573788986, + "loss": 3.133, + "step": 36376 + }, + { + "epoch": 1.78, + "grad_norm": 0.6570436358451843, + "learning_rate": 0.00021246395942232693, + "loss": 3.0547, + "step": 36377 + }, + { + "epoch": 1.78, + "grad_norm": 0.6165720820426941, + "learning_rate": 0.00021244923333731702, + "loss": 3.0068, + "step": 36378 + }, + { + "epoch": 1.78, + "grad_norm": 0.6688787341117859, + "learning_rate": 0.00021243450748289916, + "loss": 2.9723, + "step": 36379 + }, + { + "epoch": 1.78, + "grad_norm": 0.6865296363830566, + "learning_rate": 0.00021241978185911184, + "loss": 2.8357, + "step": 36380 + }, + { + "epoch": 1.78, + "grad_norm": 0.6525271534919739, + "learning_rate": 0.00021240505646599405, + "loss": 3.1706, + "step": 36381 + }, + { + "epoch": 1.78, + "grad_norm": 0.6239628195762634, + "learning_rate": 0.00021239033130358433, + "loss": 3.0773, + "step": 36382 + }, + { + "epoch": 1.78, + "grad_norm": 0.6260185241699219, + "learning_rate": 0.00021237560637192178, + "loss": 2.9808, + "step": 36383 + }, + { + "epoch": 1.78, + "grad_norm": 0.6808270812034607, + "learning_rate": 0.00021236088167104494, + "loss": 2.7452, + "step": 36384 + }, + { + "epoch": 1.78, + "grad_norm": 0.6365841031074524, + "learning_rate": 0.00021234615720099257, + "loss": 3.1358, + "step": 36385 + }, + { + "epoch": 1.78, + "grad_norm": 0.6161235570907593, + "learning_rate": 0.00021233143296180374, + "loss": 3.1963, + "step": 36386 + }, + { + "epoch": 1.78, + "grad_norm": 0.6361461281776428, + "learning_rate": 0.00021231670895351695, + "loss": 3.1814, + "step": 36387 + }, + { + "epoch": 1.78, + "grad_norm": 0.6474431157112122, + "learning_rate": 0.00021230198517617114, + "loss": 2.9853, + "step": 36388 + }, + { + "epoch": 1.78, + "grad_norm": 0.679755687713623, + "learning_rate": 0.0002122872616298049, + "loss": 3.062, + "step": 36389 + }, + { + "epoch": 1.78, + "grad_norm": 0.6086769700050354, + "learning_rate": 0.00021227253831445717, + "loss": 3.1973, + "step": 36390 + }, + { + "epoch": 1.78, + "grad_norm": 0.6710407733917236, + "learning_rate": 0.0002122578152301668, + "loss": 2.8719, + "step": 36391 + }, + { + "epoch": 1.78, + "grad_norm": 0.664715051651001, + "learning_rate": 0.0002122430923769723, + "loss": 2.9214, + "step": 36392 + }, + { + "epoch": 1.78, + "grad_norm": 0.625869631767273, + "learning_rate": 0.00021222836975491274, + "loss": 3.0165, + "step": 36393 + }, + { + "epoch": 1.78, + "grad_norm": 0.6978575587272644, + "learning_rate": 0.00021221364736402664, + "loss": 3.0272, + "step": 36394 + }, + { + "epoch": 1.78, + "grad_norm": 0.6914448142051697, + "learning_rate": 0.00021219892520435294, + "loss": 2.8926, + "step": 36395 + }, + { + "epoch": 1.78, + "grad_norm": 0.6650540828704834, + "learning_rate": 0.00021218420327593043, + "loss": 3.0207, + "step": 36396 + }, + { + "epoch": 1.78, + "grad_norm": 0.6832113862037659, + "learning_rate": 0.00021216948157879776, + "loss": 2.9073, + "step": 36397 + }, + { + "epoch": 1.78, + "grad_norm": 0.6525067687034607, + "learning_rate": 0.00021215476011299383, + "loss": 2.9059, + "step": 36398 + }, + { + "epoch": 1.78, + "grad_norm": 0.6345611214637756, + "learning_rate": 0.00021214003887855737, + "loss": 2.8414, + "step": 36399 + }, + { + "epoch": 1.78, + "grad_norm": 0.6637008786201477, + "learning_rate": 0.000212125317875527, + "loss": 2.9859, + "step": 36400 + }, + { + "epoch": 1.78, + "grad_norm": 0.6102834343910217, + "learning_rate": 0.00021211059710394178, + "loss": 3.1545, + "step": 36401 + }, + { + "epoch": 1.78, + "grad_norm": 0.6296905875205994, + "learning_rate": 0.0002120958765638403, + "loss": 2.6042, + "step": 36402 + }, + { + "epoch": 1.78, + "grad_norm": 0.6705476641654968, + "learning_rate": 0.00021208115625526142, + "loss": 2.9978, + "step": 36403 + }, + { + "epoch": 1.78, + "grad_norm": 0.6274638772010803, + "learning_rate": 0.0002120664361782437, + "loss": 3.0183, + "step": 36404 + }, + { + "epoch": 1.78, + "grad_norm": 0.6929411888122559, + "learning_rate": 0.00021205171633282614, + "loss": 3.3224, + "step": 36405 + }, + { + "epoch": 1.78, + "grad_norm": 0.6245270371437073, + "learning_rate": 0.0002120369967190475, + "loss": 3.1516, + "step": 36406 + }, + { + "epoch": 1.78, + "grad_norm": 0.6495602130889893, + "learning_rate": 0.00021202227733694635, + "loss": 2.9227, + "step": 36407 + }, + { + "epoch": 1.78, + "grad_norm": 0.6302369832992554, + "learning_rate": 0.0002120075581865617, + "loss": 3.09, + "step": 36408 + }, + { + "epoch": 1.78, + "grad_norm": 0.6170496344566345, + "learning_rate": 0.00021199283926793218, + "loss": 2.8749, + "step": 36409 + }, + { + "epoch": 1.78, + "grad_norm": 0.6352107524871826, + "learning_rate": 0.00021197812058109652, + "loss": 2.9232, + "step": 36410 + }, + { + "epoch": 1.78, + "grad_norm": 0.6431768536567688, + "learning_rate": 0.00021196340212609362, + "loss": 3.116, + "step": 36411 + }, + { + "epoch": 1.78, + "grad_norm": 0.6413148045539856, + "learning_rate": 0.00021194868390296217, + "loss": 3.0991, + "step": 36412 + }, + { + "epoch": 1.78, + "grad_norm": 0.607494592666626, + "learning_rate": 0.00021193396591174098, + "loss": 3.0078, + "step": 36413 + }, + { + "epoch": 1.78, + "grad_norm": 0.6016876101493835, + "learning_rate": 0.00021191924815246865, + "loss": 2.9614, + "step": 36414 + }, + { + "epoch": 1.78, + "grad_norm": 0.6435602903366089, + "learning_rate": 0.00021190453062518407, + "loss": 2.8346, + "step": 36415 + }, + { + "epoch": 1.78, + "grad_norm": 0.6488229036331177, + "learning_rate": 0.00021188981332992613, + "loss": 2.989, + "step": 36416 + }, + { + "epoch": 1.78, + "grad_norm": 0.623367428779602, + "learning_rate": 0.00021187509626673342, + "loss": 3.1408, + "step": 36417 + }, + { + "epoch": 1.78, + "grad_norm": 0.6525915265083313, + "learning_rate": 0.0002118603794356448, + "loss": 3.0817, + "step": 36418 + }, + { + "epoch": 1.78, + "grad_norm": 0.6700499653816223, + "learning_rate": 0.00021184566283669883, + "loss": 3.0911, + "step": 36419 + }, + { + "epoch": 1.78, + "grad_norm": 0.6590023636817932, + "learning_rate": 0.0002118309464699345, + "loss": 2.8538, + "step": 36420 + }, + { + "epoch": 1.78, + "grad_norm": 0.6719303131103516, + "learning_rate": 0.00021181623033539052, + "loss": 3.0935, + "step": 36421 + }, + { + "epoch": 1.78, + "grad_norm": 0.6459015607833862, + "learning_rate": 0.00021180151443310556, + "loss": 2.809, + "step": 36422 + }, + { + "epoch": 1.79, + "grad_norm": 0.6407613158226013, + "learning_rate": 0.0002117867987631185, + "loss": 2.7572, + "step": 36423 + }, + { + "epoch": 1.79, + "grad_norm": 0.6750385165214539, + "learning_rate": 0.000211772083325468, + "loss": 2.9058, + "step": 36424 + }, + { + "epoch": 1.79, + "grad_norm": 0.6313875317573547, + "learning_rate": 0.00021175736812019275, + "loss": 3.0272, + "step": 36425 + }, + { + "epoch": 1.79, + "grad_norm": 0.6446053385734558, + "learning_rate": 0.0002117426531473318, + "loss": 2.9416, + "step": 36426 + }, + { + "epoch": 1.79, + "grad_norm": 0.6613894104957581, + "learning_rate": 0.00021172793840692358, + "loss": 2.9213, + "step": 36427 + }, + { + "epoch": 1.79, + "grad_norm": 0.6796953082084656, + "learning_rate": 0.00021171322389900715, + "loss": 2.9999, + "step": 36428 + }, + { + "epoch": 1.79, + "grad_norm": 0.6764907836914062, + "learning_rate": 0.00021169850962362088, + "loss": 2.9982, + "step": 36429 + }, + { + "epoch": 1.79, + "grad_norm": 0.6619737148284912, + "learning_rate": 0.00021168379558080385, + "loss": 2.8365, + "step": 36430 + }, + { + "epoch": 1.79, + "grad_norm": 0.6781612038612366, + "learning_rate": 0.00021166908177059474, + "loss": 2.7274, + "step": 36431 + }, + { + "epoch": 1.79, + "grad_norm": 0.6520367860794067, + "learning_rate": 0.00021165436819303214, + "loss": 3.0871, + "step": 36432 + }, + { + "epoch": 1.79, + "grad_norm": 0.6807015538215637, + "learning_rate": 0.00021163965484815507, + "loss": 3.1435, + "step": 36433 + }, + { + "epoch": 1.79, + "grad_norm": 0.688068687915802, + "learning_rate": 0.00021162494173600202, + "loss": 3.1147, + "step": 36434 + }, + { + "epoch": 1.79, + "grad_norm": 0.6110426187515259, + "learning_rate": 0.00021161022885661195, + "loss": 2.9672, + "step": 36435 + }, + { + "epoch": 1.79, + "grad_norm": 0.6746833920478821, + "learning_rate": 0.00021159551621002356, + "loss": 2.9923, + "step": 36436 + }, + { + "epoch": 1.79, + "grad_norm": 0.6675704717636108, + "learning_rate": 0.00021158080379627547, + "loss": 3.146, + "step": 36437 + }, + { + "epoch": 1.79, + "grad_norm": 0.6473252773284912, + "learning_rate": 0.00021156609161540664, + "loss": 2.9734, + "step": 36438 + }, + { + "epoch": 1.79, + "grad_norm": 0.6055066585540771, + "learning_rate": 0.0002115513796674556, + "loss": 2.9343, + "step": 36439 + }, + { + "epoch": 1.79, + "grad_norm": 0.6825117468833923, + "learning_rate": 0.00021153666795246118, + "loss": 2.9982, + "step": 36440 + }, + { + "epoch": 1.79, + "grad_norm": 0.7194039821624756, + "learning_rate": 0.00021152195647046226, + "loss": 3.1512, + "step": 36441 + }, + { + "epoch": 1.79, + "grad_norm": 0.6608765125274658, + "learning_rate": 0.00021150724522149742, + "loss": 2.9463, + "step": 36442 + }, + { + "epoch": 1.79, + "grad_norm": 0.6718124747276306, + "learning_rate": 0.00021149253420560556, + "loss": 2.7968, + "step": 36443 + }, + { + "epoch": 1.79, + "grad_norm": 0.6314850449562073, + "learning_rate": 0.00021147782342282519, + "loss": 2.8741, + "step": 36444 + }, + { + "epoch": 1.79, + "grad_norm": 0.6046271324157715, + "learning_rate": 0.00021146311287319522, + "loss": 3.0566, + "step": 36445 + }, + { + "epoch": 1.79, + "grad_norm": 0.6407644152641296, + "learning_rate": 0.00021144840255675446, + "loss": 2.9809, + "step": 36446 + }, + { + "epoch": 1.79, + "grad_norm": 0.632928192615509, + "learning_rate": 0.00021143369247354147, + "loss": 3.0315, + "step": 36447 + }, + { + "epoch": 1.79, + "grad_norm": 0.6315308213233948, + "learning_rate": 0.00021141898262359515, + "loss": 3.1262, + "step": 36448 + }, + { + "epoch": 1.79, + "grad_norm": 0.6301126480102539, + "learning_rate": 0.00021140427300695417, + "loss": 2.8378, + "step": 36449 + }, + { + "epoch": 1.79, + "grad_norm": 0.6344707012176514, + "learning_rate": 0.00021138956362365722, + "loss": 3.2058, + "step": 36450 + }, + { + "epoch": 1.79, + "grad_norm": 0.6632879972457886, + "learning_rate": 0.00021137485447374324, + "loss": 3.1878, + "step": 36451 + }, + { + "epoch": 1.79, + "grad_norm": 0.5994957089424133, + "learning_rate": 0.00021136014555725075, + "loss": 3.1941, + "step": 36452 + }, + { + "epoch": 1.79, + "grad_norm": 0.5996524691581726, + "learning_rate": 0.00021134543687421865, + "loss": 3.1972, + "step": 36453 + }, + { + "epoch": 1.79, + "grad_norm": 0.6582194566726685, + "learning_rate": 0.00021133072842468545, + "loss": 2.8146, + "step": 36454 + }, + { + "epoch": 1.79, + "grad_norm": 0.6702811121940613, + "learning_rate": 0.00021131602020869013, + "loss": 3.0313, + "step": 36455 + }, + { + "epoch": 1.79, + "grad_norm": 0.6455747485160828, + "learning_rate": 0.0002113013122262714, + "loss": 3.1189, + "step": 36456 + }, + { + "epoch": 1.79, + "grad_norm": 0.6200703382492065, + "learning_rate": 0.00021128660447746787, + "loss": 2.7576, + "step": 36457 + }, + { + "epoch": 1.79, + "grad_norm": 0.6644160747528076, + "learning_rate": 0.00021127189696231846, + "loss": 3.0674, + "step": 36458 + }, + { + "epoch": 1.79, + "grad_norm": 0.6624540090560913, + "learning_rate": 0.00021125718968086163, + "loss": 2.9791, + "step": 36459 + }, + { + "epoch": 1.79, + "grad_norm": 0.6029409766197205, + "learning_rate": 0.00021124248263313644, + "loss": 3.0949, + "step": 36460 + }, + { + "epoch": 1.79, + "grad_norm": 0.6144557595252991, + "learning_rate": 0.0002112277758191814, + "loss": 3.1555, + "step": 36461 + }, + { + "epoch": 1.79, + "grad_norm": 0.6617509722709656, + "learning_rate": 0.00021121306923903524, + "loss": 2.9906, + "step": 36462 + }, + { + "epoch": 1.79, + "grad_norm": 0.6040384769439697, + "learning_rate": 0.0002111983628927369, + "loss": 2.941, + "step": 36463 + }, + { + "epoch": 1.79, + "grad_norm": 0.6424322724342346, + "learning_rate": 0.0002111836567803249, + "loss": 3.1002, + "step": 36464 + }, + { + "epoch": 1.79, + "grad_norm": 0.6235278844833374, + "learning_rate": 0.00021116895090183817, + "loss": 3.0474, + "step": 36465 + }, + { + "epoch": 1.79, + "grad_norm": 0.6009097099304199, + "learning_rate": 0.00021115424525731515, + "loss": 3.0715, + "step": 36466 + }, + { + "epoch": 1.79, + "grad_norm": 0.6081644892692566, + "learning_rate": 0.00021113953984679484, + "loss": 3.1374, + "step": 36467 + }, + { + "epoch": 1.79, + "grad_norm": 0.6132400631904602, + "learning_rate": 0.00021112483467031594, + "loss": 2.8668, + "step": 36468 + }, + { + "epoch": 1.79, + "grad_norm": 0.6370624303817749, + "learning_rate": 0.000211110129727917, + "loss": 2.8138, + "step": 36469 + }, + { + "epoch": 1.79, + "grad_norm": 0.6113362908363342, + "learning_rate": 0.00021109542501963695, + "loss": 3.1049, + "step": 36470 + }, + { + "epoch": 1.79, + "grad_norm": 0.6134870052337646, + "learning_rate": 0.00021108072054551444, + "loss": 3.051, + "step": 36471 + }, + { + "epoch": 1.79, + "grad_norm": 0.6292585730552673, + "learning_rate": 0.0002110660163055881, + "loss": 2.9592, + "step": 36472 + }, + { + "epoch": 1.79, + "grad_norm": 0.6173241138458252, + "learning_rate": 0.0002110513122998969, + "loss": 3.2329, + "step": 36473 + }, + { + "epoch": 1.79, + "grad_norm": 0.6382222771644592, + "learning_rate": 0.00021103660852847937, + "loss": 2.9698, + "step": 36474 + }, + { + "epoch": 1.79, + "grad_norm": 0.6229817867279053, + "learning_rate": 0.00021102190499137433, + "loss": 2.9552, + "step": 36475 + }, + { + "epoch": 1.79, + "grad_norm": 0.6382546424865723, + "learning_rate": 0.00021100720168862038, + "loss": 3.0138, + "step": 36476 + }, + { + "epoch": 1.79, + "grad_norm": 0.63431316614151, + "learning_rate": 0.0002109924986202563, + "loss": 2.7953, + "step": 36477 + }, + { + "epoch": 1.79, + "grad_norm": 0.629733681678772, + "learning_rate": 0.00021097779578632095, + "loss": 3.2056, + "step": 36478 + }, + { + "epoch": 1.79, + "grad_norm": 0.6950914859771729, + "learning_rate": 0.00021096309318685292, + "loss": 3.1653, + "step": 36479 + }, + { + "epoch": 1.79, + "grad_norm": 0.6350083351135254, + "learning_rate": 0.00021094839082189104, + "loss": 2.9533, + "step": 36480 + }, + { + "epoch": 1.79, + "grad_norm": 0.6433498859405518, + "learning_rate": 0.00021093368869147384, + "loss": 2.9834, + "step": 36481 + }, + { + "epoch": 1.79, + "grad_norm": 0.6216697692871094, + "learning_rate": 0.0002109189867956402, + "loss": 3.0459, + "step": 36482 + }, + { + "epoch": 1.79, + "grad_norm": 0.6351829767227173, + "learning_rate": 0.00021090428513442888, + "loss": 3.0169, + "step": 36483 + }, + { + "epoch": 1.79, + "grad_norm": 0.6634454727172852, + "learning_rate": 0.00021088958370787838, + "loss": 2.9469, + "step": 36484 + }, + { + "epoch": 1.79, + "grad_norm": 0.6411239504814148, + "learning_rate": 0.00021087488251602776, + "loss": 3.1319, + "step": 36485 + }, + { + "epoch": 1.79, + "grad_norm": 0.6567073464393616, + "learning_rate": 0.00021086018155891541, + "loss": 2.8596, + "step": 36486 + }, + { + "epoch": 1.79, + "grad_norm": 0.6086127758026123, + "learning_rate": 0.00021084548083658017, + "loss": 3.0581, + "step": 36487 + }, + { + "epoch": 1.79, + "grad_norm": 0.6434993743896484, + "learning_rate": 0.00021083078034906085, + "loss": 3.1005, + "step": 36488 + }, + { + "epoch": 1.79, + "grad_norm": 0.685982882976532, + "learning_rate": 0.0002108160800963961, + "loss": 3.0861, + "step": 36489 + }, + { + "epoch": 1.79, + "grad_norm": 0.6597147583961487, + "learning_rate": 0.00021080138007862465, + "loss": 3.0276, + "step": 36490 + }, + { + "epoch": 1.79, + "grad_norm": 0.6205618977546692, + "learning_rate": 0.00021078668029578506, + "loss": 3.1633, + "step": 36491 + }, + { + "epoch": 1.79, + "grad_norm": 0.6635233759880066, + "learning_rate": 0.00021077198074791626, + "loss": 2.885, + "step": 36492 + }, + { + "epoch": 1.79, + "grad_norm": 0.6489311456680298, + "learning_rate": 0.00021075728143505698, + "loss": 2.8726, + "step": 36493 + }, + { + "epoch": 1.79, + "grad_norm": 0.6681040525436401, + "learning_rate": 0.0002107425823572457, + "loss": 2.8963, + "step": 36494 + }, + { + "epoch": 1.79, + "grad_norm": 0.6183825731277466, + "learning_rate": 0.0002107278835145214, + "loss": 3.2596, + "step": 36495 + }, + { + "epoch": 1.79, + "grad_norm": 0.9229050278663635, + "learning_rate": 0.00021071318490692255, + "loss": 3.1778, + "step": 36496 + }, + { + "epoch": 1.79, + "grad_norm": 0.6755209565162659, + "learning_rate": 0.00021069848653448804, + "loss": 2.8989, + "step": 36497 + }, + { + "epoch": 1.79, + "grad_norm": 0.6122171878814697, + "learning_rate": 0.00021068378839725663, + "loss": 3.1738, + "step": 36498 + }, + { + "epoch": 1.79, + "grad_norm": 0.6368758082389832, + "learning_rate": 0.00021066909049526677, + "loss": 2.9932, + "step": 36499 + }, + { + "epoch": 1.79, + "grad_norm": 0.6233503222465515, + "learning_rate": 0.00021065439282855748, + "loss": 3.0565, + "step": 36500 + }, + { + "epoch": 1.79, + "grad_norm": 0.6276006698608398, + "learning_rate": 0.00021063969539716727, + "loss": 2.7239, + "step": 36501 + }, + { + "epoch": 1.79, + "grad_norm": 0.6331776976585388, + "learning_rate": 0.0002106249982011348, + "loss": 2.9001, + "step": 36502 + }, + { + "epoch": 1.79, + "grad_norm": 0.6143416166305542, + "learning_rate": 0.00021061030124049903, + "loss": 3.2166, + "step": 36503 + }, + { + "epoch": 1.79, + "grad_norm": 0.6579669713973999, + "learning_rate": 0.0002105956045152985, + "loss": 2.8755, + "step": 36504 + }, + { + "epoch": 1.79, + "grad_norm": 0.6055128574371338, + "learning_rate": 0.00021058090802557198, + "loss": 2.7474, + "step": 36505 + }, + { + "epoch": 1.79, + "grad_norm": 0.6531729102134705, + "learning_rate": 0.000210566211771358, + "loss": 3.0321, + "step": 36506 + }, + { + "epoch": 1.79, + "grad_norm": 0.6186026334762573, + "learning_rate": 0.00021055151575269547, + "loss": 3.0292, + "step": 36507 + }, + { + "epoch": 1.79, + "grad_norm": 0.6538664102554321, + "learning_rate": 0.0002105368199696231, + "loss": 3.0134, + "step": 36508 + }, + { + "epoch": 1.79, + "grad_norm": 0.5929544568061829, + "learning_rate": 0.0002105221244221794, + "loss": 3.0028, + "step": 36509 + }, + { + "epoch": 1.79, + "grad_norm": 0.6697933673858643, + "learning_rate": 0.0002105074291104033, + "loss": 2.9312, + "step": 36510 + }, + { + "epoch": 1.79, + "grad_norm": 0.6528447270393372, + "learning_rate": 0.00021049273403433337, + "loss": 2.8436, + "step": 36511 + }, + { + "epoch": 1.79, + "grad_norm": 0.6155452728271484, + "learning_rate": 0.0002104780391940083, + "loss": 3.1464, + "step": 36512 + }, + { + "epoch": 1.79, + "grad_norm": 0.6547124981880188, + "learning_rate": 0.00021046334458946697, + "loss": 2.8919, + "step": 36513 + }, + { + "epoch": 1.79, + "grad_norm": 0.6116988658905029, + "learning_rate": 0.0002104486502207479, + "loss": 3.0086, + "step": 36514 + }, + { + "epoch": 1.79, + "grad_norm": 0.6030555367469788, + "learning_rate": 0.00021043395608788993, + "loss": 3.0789, + "step": 36515 + }, + { + "epoch": 1.79, + "grad_norm": 0.6490576267242432, + "learning_rate": 0.0002104192621909315, + "loss": 2.8825, + "step": 36516 + }, + { + "epoch": 1.79, + "grad_norm": 0.643524169921875, + "learning_rate": 0.00021040456852991157, + "loss": 2.931, + "step": 36517 + }, + { + "epoch": 1.79, + "grad_norm": 0.6385647654533386, + "learning_rate": 0.0002103898751048688, + "loss": 3.002, + "step": 36518 + }, + { + "epoch": 1.79, + "grad_norm": 0.6039292812347412, + "learning_rate": 0.00021037518191584188, + "loss": 3.1068, + "step": 36519 + }, + { + "epoch": 1.79, + "grad_norm": 0.6315106153488159, + "learning_rate": 0.0002103604889628695, + "loss": 3.1007, + "step": 36520 + }, + { + "epoch": 1.79, + "grad_norm": 0.5782997012138367, + "learning_rate": 0.0002103457962459902, + "loss": 3.0973, + "step": 36521 + }, + { + "epoch": 1.79, + "grad_norm": 0.6795260310173035, + "learning_rate": 0.00021033110376524291, + "loss": 3.1494, + "step": 36522 + }, + { + "epoch": 1.79, + "grad_norm": 0.6649677157402039, + "learning_rate": 0.00021031641152066628, + "loss": 3.0697, + "step": 36523 + }, + { + "epoch": 1.79, + "grad_norm": 0.6292036771774292, + "learning_rate": 0.0002103017195122988, + "loss": 3.1645, + "step": 36524 + }, + { + "epoch": 1.79, + "grad_norm": 0.6512131690979004, + "learning_rate": 0.00021028702774017954, + "loss": 2.9064, + "step": 36525 + }, + { + "epoch": 1.79, + "grad_norm": 0.6254218816757202, + "learning_rate": 0.00021027233620434687, + "loss": 3.1291, + "step": 36526 + }, + { + "epoch": 1.79, + "grad_norm": 0.6537070274353027, + "learning_rate": 0.00021025764490483956, + "loss": 3.0194, + "step": 36527 + }, + { + "epoch": 1.79, + "grad_norm": 0.6867677569389343, + "learning_rate": 0.00021024295384169646, + "loss": 2.7768, + "step": 36528 + }, + { + "epoch": 1.79, + "grad_norm": 0.6541476249694824, + "learning_rate": 0.00021022826301495605, + "loss": 2.8653, + "step": 36529 + }, + { + "epoch": 1.79, + "grad_norm": 0.6634353995323181, + "learning_rate": 0.0002102135724246572, + "loss": 3.1185, + "step": 36530 + }, + { + "epoch": 1.79, + "grad_norm": 0.6761667728424072, + "learning_rate": 0.0002101988820708384, + "loss": 2.9844, + "step": 36531 + }, + { + "epoch": 1.79, + "grad_norm": 0.6391075849533081, + "learning_rate": 0.00021018419195353852, + "loss": 2.9843, + "step": 36532 + }, + { + "epoch": 1.79, + "grad_norm": 0.6631091833114624, + "learning_rate": 0.00021016950207279627, + "loss": 2.9588, + "step": 36533 + }, + { + "epoch": 1.79, + "grad_norm": 0.6815654039382935, + "learning_rate": 0.00021015481242865012, + "loss": 2.9905, + "step": 36534 + }, + { + "epoch": 1.79, + "grad_norm": 0.6456697583198547, + "learning_rate": 0.00021014012302113903, + "loss": 3.0181, + "step": 36535 + }, + { + "epoch": 1.79, + "grad_norm": 0.6634407639503479, + "learning_rate": 0.00021012543385030147, + "loss": 3.1435, + "step": 36536 + }, + { + "epoch": 1.79, + "grad_norm": 0.678929328918457, + "learning_rate": 0.00021011074491617636, + "loss": 2.8749, + "step": 36537 + }, + { + "epoch": 1.79, + "grad_norm": 0.6539340019226074, + "learning_rate": 0.00021009605621880208, + "loss": 2.8567, + "step": 36538 + }, + { + "epoch": 1.79, + "grad_norm": 0.6301677227020264, + "learning_rate": 0.00021008136775821752, + "loss": 3.1227, + "step": 36539 + }, + { + "epoch": 1.79, + "grad_norm": 0.6353392601013184, + "learning_rate": 0.0002100666795344614, + "loss": 3.1078, + "step": 36540 + }, + { + "epoch": 1.79, + "grad_norm": 0.6301056742668152, + "learning_rate": 0.0002100519915475723, + "loss": 3.1015, + "step": 36541 + }, + { + "epoch": 1.79, + "grad_norm": 0.6307017803192139, + "learning_rate": 0.000210037303797589, + "loss": 2.948, + "step": 36542 + }, + { + "epoch": 1.79, + "grad_norm": 0.5895671248435974, + "learning_rate": 0.00021002261628455, + "loss": 3.1825, + "step": 36543 + }, + { + "epoch": 1.79, + "grad_norm": 0.6554360389709473, + "learning_rate": 0.00021000792900849422, + "loss": 3.0251, + "step": 36544 + }, + { + "epoch": 1.79, + "grad_norm": 0.6623850464820862, + "learning_rate": 0.00020999324196946026, + "loss": 2.9006, + "step": 36545 + }, + { + "epoch": 1.79, + "grad_norm": 0.6320748329162598, + "learning_rate": 0.00020997855516748665, + "loss": 2.932, + "step": 36546 + }, + { + "epoch": 1.79, + "grad_norm": 0.655175507068634, + "learning_rate": 0.0002099638686026123, + "loss": 2.8594, + "step": 36547 + }, + { + "epoch": 1.79, + "grad_norm": 0.719323456287384, + "learning_rate": 0.00020994918227487577, + "loss": 3.0602, + "step": 36548 + }, + { + "epoch": 1.79, + "grad_norm": 0.6324592232704163, + "learning_rate": 0.0002099344961843157, + "loss": 2.763, + "step": 36549 + }, + { + "epoch": 1.79, + "grad_norm": 0.5989832282066345, + "learning_rate": 0.00020991981033097095, + "loss": 2.6408, + "step": 36550 + }, + { + "epoch": 1.79, + "grad_norm": 0.6369208693504333, + "learning_rate": 0.00020990512471488003, + "loss": 3.09, + "step": 36551 + }, + { + "epoch": 1.79, + "grad_norm": 0.6124125123023987, + "learning_rate": 0.00020989043933608169, + "loss": 3.0194, + "step": 36552 + }, + { + "epoch": 1.79, + "grad_norm": 0.6564167141914368, + "learning_rate": 0.00020987575419461452, + "loss": 3.2123, + "step": 36553 + }, + { + "epoch": 1.79, + "grad_norm": 0.6500516533851624, + "learning_rate": 0.00020986106929051732, + "loss": 3.0145, + "step": 36554 + }, + { + "epoch": 1.79, + "grad_norm": 0.6239467859268188, + "learning_rate": 0.00020984638462382877, + "loss": 3.0401, + "step": 36555 + }, + { + "epoch": 1.79, + "grad_norm": 0.640312910079956, + "learning_rate": 0.00020983170019458737, + "loss": 3.0706, + "step": 36556 + }, + { + "epoch": 1.79, + "grad_norm": 0.6156535744667053, + "learning_rate": 0.00020981701600283204, + "loss": 2.9437, + "step": 36557 + }, + { + "epoch": 1.79, + "grad_norm": 0.6147194504737854, + "learning_rate": 0.00020980233204860122, + "loss": 3.0339, + "step": 36558 + }, + { + "epoch": 1.79, + "grad_norm": 0.64543616771698, + "learning_rate": 0.00020978764833193376, + "loss": 2.9397, + "step": 36559 + }, + { + "epoch": 1.79, + "grad_norm": 0.627521276473999, + "learning_rate": 0.0002097729648528683, + "loss": 2.9861, + "step": 36560 + }, + { + "epoch": 1.79, + "grad_norm": 0.6274319887161255, + "learning_rate": 0.0002097582816114434, + "loss": 2.9149, + "step": 36561 + }, + { + "epoch": 1.79, + "grad_norm": 0.6580536961555481, + "learning_rate": 0.00020974359860769792, + "loss": 2.9476, + "step": 36562 + }, + { + "epoch": 1.79, + "grad_norm": 0.6078628301620483, + "learning_rate": 0.0002097289158416704, + "loss": 2.911, + "step": 36563 + }, + { + "epoch": 1.79, + "grad_norm": 0.6547374725341797, + "learning_rate": 0.00020971423331339945, + "loss": 3.0069, + "step": 36564 + }, + { + "epoch": 1.79, + "grad_norm": 0.6162861585617065, + "learning_rate": 0.00020969955102292394, + "loss": 2.9177, + "step": 36565 + }, + { + "epoch": 1.79, + "grad_norm": 0.6419432163238525, + "learning_rate": 0.00020968486897028242, + "loss": 3.0997, + "step": 36566 + }, + { + "epoch": 1.79, + "grad_norm": 0.6692230105400085, + "learning_rate": 0.00020967018715551362, + "loss": 3.0527, + "step": 36567 + }, + { + "epoch": 1.79, + "grad_norm": 0.6301555633544922, + "learning_rate": 0.00020965550557865606, + "loss": 3.1243, + "step": 36568 + }, + { + "epoch": 1.79, + "grad_norm": 0.6723204851150513, + "learning_rate": 0.00020964082423974853, + "loss": 3.1511, + "step": 36569 + }, + { + "epoch": 1.79, + "grad_norm": 0.6410068273544312, + "learning_rate": 0.00020962614313882978, + "loss": 3.0315, + "step": 36570 + }, + { + "epoch": 1.79, + "grad_norm": 0.6543024182319641, + "learning_rate": 0.00020961146227593821, + "loss": 2.8805, + "step": 36571 + }, + { + "epoch": 1.79, + "grad_norm": 0.6214228868484497, + "learning_rate": 0.00020959678165111283, + "loss": 2.9974, + "step": 36572 + }, + { + "epoch": 1.79, + "grad_norm": 0.5998808145523071, + "learning_rate": 0.00020958210126439204, + "loss": 3.0079, + "step": 36573 + }, + { + "epoch": 1.79, + "grad_norm": 0.6029256582260132, + "learning_rate": 0.00020956742111581447, + "loss": 2.9974, + "step": 36574 + }, + { + "epoch": 1.79, + "grad_norm": 0.6017154455184937, + "learning_rate": 0.00020955274120541915, + "loss": 3.0842, + "step": 36575 + }, + { + "epoch": 1.79, + "grad_norm": 0.6353750228881836, + "learning_rate": 0.00020953806153324437, + "loss": 2.8448, + "step": 36576 + }, + { + "epoch": 1.79, + "grad_norm": 0.6240317821502686, + "learning_rate": 0.000209523382099329, + "loss": 3.0912, + "step": 36577 + }, + { + "epoch": 1.79, + "grad_norm": 0.6466373205184937, + "learning_rate": 0.0002095087029037115, + "loss": 2.9799, + "step": 36578 + }, + { + "epoch": 1.79, + "grad_norm": 0.7123924493789673, + "learning_rate": 0.00020949402394643062, + "loss": 3.0003, + "step": 36579 + }, + { + "epoch": 1.79, + "grad_norm": 0.6275160312652588, + "learning_rate": 0.00020947934522752524, + "loss": 2.9326, + "step": 36580 + }, + { + "epoch": 1.79, + "grad_norm": 0.5951224565505981, + "learning_rate": 0.00020946466674703378, + "loss": 2.9995, + "step": 36581 + }, + { + "epoch": 1.79, + "grad_norm": 0.6290155053138733, + "learning_rate": 0.00020944998850499501, + "loss": 2.9477, + "step": 36582 + }, + { + "epoch": 1.79, + "grad_norm": 0.645408570766449, + "learning_rate": 0.00020943531050144745, + "loss": 3.0292, + "step": 36583 + }, + { + "epoch": 1.79, + "grad_norm": 0.6126695871353149, + "learning_rate": 0.00020942063273642988, + "loss": 2.7644, + "step": 36584 + }, + { + "epoch": 1.79, + "grad_norm": 0.6352578401565552, + "learning_rate": 0.000209405955209981, + "loss": 2.9679, + "step": 36585 + }, + { + "epoch": 1.79, + "grad_norm": 0.6484764218330383, + "learning_rate": 0.00020939127792213928, + "loss": 3.058, + "step": 36586 + }, + { + "epoch": 1.79, + "grad_norm": 0.6158818602561951, + "learning_rate": 0.00020937660087294366, + "loss": 3.0141, + "step": 36587 + }, + { + "epoch": 1.79, + "grad_norm": 0.6046254634857178, + "learning_rate": 0.0002093619240624325, + "loss": 3.0229, + "step": 36588 + }, + { + "epoch": 1.79, + "grad_norm": 0.6500994563102722, + "learning_rate": 0.00020934724749064457, + "loss": 2.7405, + "step": 36589 + }, + { + "epoch": 1.79, + "grad_norm": 0.6411953568458557, + "learning_rate": 0.00020933257115761866, + "loss": 2.9773, + "step": 36590 + }, + { + "epoch": 1.79, + "grad_norm": 0.6058011651039124, + "learning_rate": 0.00020931789506339324, + "loss": 3.0519, + "step": 36591 + }, + { + "epoch": 1.79, + "grad_norm": 0.6400535702705383, + "learning_rate": 0.0002093032192080071, + "loss": 2.8411, + "step": 36592 + }, + { + "epoch": 1.79, + "grad_norm": 0.6149111986160278, + "learning_rate": 0.0002092885435914987, + "loss": 3.0371, + "step": 36593 + }, + { + "epoch": 1.79, + "grad_norm": 0.64310222864151, + "learning_rate": 0.00020927386821390688, + "loss": 2.8115, + "step": 36594 + }, + { + "epoch": 1.79, + "grad_norm": 0.620502769947052, + "learning_rate": 0.0002092591930752703, + "loss": 2.8578, + "step": 36595 + }, + { + "epoch": 1.79, + "grad_norm": 0.6260932683944702, + "learning_rate": 0.00020924451817562738, + "loss": 3.0976, + "step": 36596 + }, + { + "epoch": 1.79, + "grad_norm": 0.6364988088607788, + "learning_rate": 0.00020922984351501707, + "loss": 3.0012, + "step": 36597 + }, + { + "epoch": 1.79, + "grad_norm": 0.6061363220214844, + "learning_rate": 0.00020921516909347778, + "loss": 3.0253, + "step": 36598 + }, + { + "epoch": 1.79, + "grad_norm": 0.6426119804382324, + "learning_rate": 0.00020920049491104833, + "loss": 3.0864, + "step": 36599 + }, + { + "epoch": 1.79, + "grad_norm": 0.6786094903945923, + "learning_rate": 0.00020918582096776737, + "loss": 2.8078, + "step": 36600 + }, + { + "epoch": 1.79, + "grad_norm": 0.60618656873703, + "learning_rate": 0.0002091711472636733, + "loss": 2.9329, + "step": 36601 + }, + { + "epoch": 1.79, + "grad_norm": 0.6546763777732849, + "learning_rate": 0.00020915647379880515, + "loss": 3.0702, + "step": 36602 + }, + { + "epoch": 1.79, + "grad_norm": 0.6468024849891663, + "learning_rate": 0.00020914180057320127, + "loss": 3.0346, + "step": 36603 + }, + { + "epoch": 1.79, + "grad_norm": 0.6212300062179565, + "learning_rate": 0.00020912712758690033, + "loss": 3.0142, + "step": 36604 + }, + { + "epoch": 1.79, + "grad_norm": 0.6541531682014465, + "learning_rate": 0.00020911245483994117, + "loss": 2.9084, + "step": 36605 + }, + { + "epoch": 1.79, + "grad_norm": 0.680400550365448, + "learning_rate": 0.00020909778233236224, + "loss": 2.9649, + "step": 36606 + }, + { + "epoch": 1.79, + "grad_norm": 0.6672208309173584, + "learning_rate": 0.00020908311006420234, + "loss": 2.812, + "step": 36607 + }, + { + "epoch": 1.79, + "grad_norm": 0.6206700205802917, + "learning_rate": 0.0002090684380354999, + "loss": 3.1004, + "step": 36608 + }, + { + "epoch": 1.79, + "grad_norm": 0.6560788154602051, + "learning_rate": 0.00020905376624629377, + "loss": 2.8389, + "step": 36609 + }, + { + "epoch": 1.79, + "grad_norm": 0.61861652135849, + "learning_rate": 0.00020903909469662253, + "loss": 3.2776, + "step": 36610 + }, + { + "epoch": 1.79, + "grad_norm": 0.6285805702209473, + "learning_rate": 0.00020902442338652474, + "loss": 3.0981, + "step": 36611 + }, + { + "epoch": 1.79, + "grad_norm": 0.6283038258552551, + "learning_rate": 0.0002090097523160392, + "loss": 2.9732, + "step": 36612 + }, + { + "epoch": 1.79, + "grad_norm": 0.6750781536102295, + "learning_rate": 0.00020899508148520442, + "loss": 2.8903, + "step": 36613 + }, + { + "epoch": 1.79, + "grad_norm": 0.6676583886146545, + "learning_rate": 0.00020898041089405912, + "loss": 3.013, + "step": 36614 + }, + { + "epoch": 1.79, + "grad_norm": 0.6667542457580566, + "learning_rate": 0.00020896574054264183, + "loss": 2.9005, + "step": 36615 + }, + { + "epoch": 1.79, + "grad_norm": 0.6553710699081421, + "learning_rate": 0.00020895107043099125, + "loss": 2.7584, + "step": 36616 + }, + { + "epoch": 1.79, + "grad_norm": 0.6519506573677063, + "learning_rate": 0.00020893640055914613, + "loss": 3.0171, + "step": 36617 + }, + { + "epoch": 1.79, + "grad_norm": 0.6207144260406494, + "learning_rate": 0.00020892173092714491, + "loss": 2.9434, + "step": 36618 + }, + { + "epoch": 1.79, + "grad_norm": 0.6540098190307617, + "learning_rate": 0.0002089070615350264, + "loss": 3.1374, + "step": 36619 + }, + { + "epoch": 1.79, + "grad_norm": 0.6426064372062683, + "learning_rate": 0.00020889239238282902, + "loss": 2.8674, + "step": 36620 + }, + { + "epoch": 1.79, + "grad_norm": 0.6309428811073303, + "learning_rate": 0.00020887772347059165, + "loss": 2.9389, + "step": 36621 + }, + { + "epoch": 1.79, + "grad_norm": 0.6225348711013794, + "learning_rate": 0.00020886305479835286, + "loss": 2.9193, + "step": 36622 + }, + { + "epoch": 1.79, + "grad_norm": 0.626919686794281, + "learning_rate": 0.00020884838636615113, + "loss": 3.2561, + "step": 36623 + }, + { + "epoch": 1.79, + "grad_norm": 0.637225329875946, + "learning_rate": 0.00020883371817402533, + "loss": 2.9154, + "step": 36624 + }, + { + "epoch": 1.79, + "grad_norm": 0.7885748744010925, + "learning_rate": 0.00020881905022201392, + "loss": 2.9749, + "step": 36625 + }, + { + "epoch": 1.79, + "grad_norm": 0.6135236620903015, + "learning_rate": 0.00020880438251015548, + "loss": 3.3823, + "step": 36626 + }, + { + "epoch": 1.8, + "grad_norm": 0.634709358215332, + "learning_rate": 0.00020878971503848892, + "loss": 2.9316, + "step": 36627 + }, + { + "epoch": 1.8, + "grad_norm": 0.6408893465995789, + "learning_rate": 0.00020877504780705258, + "loss": 3.0149, + "step": 36628 + }, + { + "epoch": 1.8, + "grad_norm": 0.8755736351013184, + "learning_rate": 0.00020876038081588534, + "loss": 2.8576, + "step": 36629 + }, + { + "epoch": 1.8, + "grad_norm": 0.6238410472869873, + "learning_rate": 0.00020874571406502551, + "loss": 2.9549, + "step": 36630 + }, + { + "epoch": 1.8, + "grad_norm": 0.633625864982605, + "learning_rate": 0.00020873104755451198, + "loss": 2.7371, + "step": 36631 + }, + { + "epoch": 1.8, + "grad_norm": 0.6014534831047058, + "learning_rate": 0.00020871638128438342, + "loss": 3.0362, + "step": 36632 + }, + { + "epoch": 1.8, + "grad_norm": 0.6362054944038391, + "learning_rate": 0.00020870171525467812, + "loss": 2.9729, + "step": 36633 + }, + { + "epoch": 1.8, + "grad_norm": 0.6586917638778687, + "learning_rate": 0.00020868704946543518, + "loss": 3.0641, + "step": 36634 + }, + { + "epoch": 1.8, + "grad_norm": 0.6921262145042419, + "learning_rate": 0.00020867238391669284, + "loss": 3.0557, + "step": 36635 + }, + { + "epoch": 1.8, + "grad_norm": 0.6216939687728882, + "learning_rate": 0.0002086577186084898, + "loss": 3.0349, + "step": 36636 + }, + { + "epoch": 1.8, + "grad_norm": 0.6409044861793518, + "learning_rate": 0.00020864305354086489, + "loss": 3.1367, + "step": 36637 + }, + { + "epoch": 1.8, + "grad_norm": 0.6375213265419006, + "learning_rate": 0.00020862838871385655, + "loss": 3.0367, + "step": 36638 + }, + { + "epoch": 1.8, + "grad_norm": 0.6405977010726929, + "learning_rate": 0.00020861372412750347, + "loss": 3.1688, + "step": 36639 + }, + { + "epoch": 1.8, + "grad_norm": 0.6716753244400024, + "learning_rate": 0.00020859905978184415, + "loss": 3.0811, + "step": 36640 + }, + { + "epoch": 1.8, + "grad_norm": 0.7119506597518921, + "learning_rate": 0.0002085843956769173, + "loss": 3.0607, + "step": 36641 + }, + { + "epoch": 1.8, + "grad_norm": 0.6559655070304871, + "learning_rate": 0.0002085697318127617, + "loss": 3.1062, + "step": 36642 + }, + { + "epoch": 1.8, + "grad_norm": 0.6267669796943665, + "learning_rate": 0.0002085550681894158, + "loss": 3.0712, + "step": 36643 + }, + { + "epoch": 1.8, + "grad_norm": 0.6666412949562073, + "learning_rate": 0.00020854040480691824, + "loss": 3.0376, + "step": 36644 + }, + { + "epoch": 1.8, + "grad_norm": 0.6373438835144043, + "learning_rate": 0.0002085257416653076, + "loss": 3.1018, + "step": 36645 + }, + { + "epoch": 1.8, + "grad_norm": 0.6477558612823486, + "learning_rate": 0.00020851107876462257, + "loss": 3.0902, + "step": 36646 + }, + { + "epoch": 1.8, + "grad_norm": 0.6476315259933472, + "learning_rate": 0.00020849641610490185, + "loss": 3.0337, + "step": 36647 + }, + { + "epoch": 1.8, + "grad_norm": 0.6021722555160522, + "learning_rate": 0.0002084817536861838, + "loss": 3.0909, + "step": 36648 + }, + { + "epoch": 1.8, + "grad_norm": 0.6318761110305786, + "learning_rate": 0.00020846709150850736, + "loss": 2.9111, + "step": 36649 + }, + { + "epoch": 1.8, + "grad_norm": 0.6342579126358032, + "learning_rate": 0.00020845242957191087, + "loss": 3.0186, + "step": 36650 + }, + { + "epoch": 1.8, + "grad_norm": 0.6395136713981628, + "learning_rate": 0.00020843776787643302, + "loss": 3.0277, + "step": 36651 + }, + { + "epoch": 1.8, + "grad_norm": 0.613551914691925, + "learning_rate": 0.0002084231064221126, + "loss": 3.0372, + "step": 36652 + }, + { + "epoch": 1.8, + "grad_norm": 0.6066928505897522, + "learning_rate": 0.00020840844520898804, + "loss": 3.0808, + "step": 36653 + }, + { + "epoch": 1.8, + "grad_norm": 0.6370232701301575, + "learning_rate": 0.00020839378423709805, + "loss": 3.1052, + "step": 36654 + }, + { + "epoch": 1.8, + "grad_norm": 0.6237879991531372, + "learning_rate": 0.0002083791235064811, + "loss": 2.753, + "step": 36655 + }, + { + "epoch": 1.8, + "grad_norm": 0.6108696460723877, + "learning_rate": 0.00020836446301717598, + "loss": 2.9691, + "step": 36656 + }, + { + "epoch": 1.8, + "grad_norm": 0.6503059267997742, + "learning_rate": 0.00020834980276922125, + "loss": 3.0589, + "step": 36657 + }, + { + "epoch": 1.8, + "grad_norm": 0.6078692078590393, + "learning_rate": 0.0002083351427626554, + "loss": 3.2604, + "step": 36658 + }, + { + "epoch": 1.8, + "grad_norm": 0.6196121573448181, + "learning_rate": 0.00020832048299751726, + "loss": 3.1257, + "step": 36659 + }, + { + "epoch": 1.8, + "grad_norm": 0.651357889175415, + "learning_rate": 0.0002083058234738452, + "loss": 2.9815, + "step": 36660 + }, + { + "epoch": 1.8, + "grad_norm": 0.5992416143417358, + "learning_rate": 0.000208291164191678, + "loss": 2.7149, + "step": 36661 + }, + { + "epoch": 1.8, + "grad_norm": 0.6600878238677979, + "learning_rate": 0.0002082765051510543, + "loss": 3.1287, + "step": 36662 + }, + { + "epoch": 1.8, + "grad_norm": 0.6135623455047607, + "learning_rate": 0.00020826184635201254, + "loss": 2.8863, + "step": 36663 + }, + { + "epoch": 1.8, + "grad_norm": 0.614033579826355, + "learning_rate": 0.0002082471877945915, + "loss": 3.1524, + "step": 36664 + }, + { + "epoch": 1.8, + "grad_norm": 0.6445769667625427, + "learning_rate": 0.00020823252947882969, + "loss": 2.993, + "step": 36665 + }, + { + "epoch": 1.8, + "grad_norm": 0.6738548874855042, + "learning_rate": 0.00020821787140476564, + "loss": 3.0463, + "step": 36666 + }, + { + "epoch": 1.8, + "grad_norm": 0.678357720375061, + "learning_rate": 0.00020820321357243818, + "loss": 2.7473, + "step": 36667 + }, + { + "epoch": 1.8, + "grad_norm": 0.6646535396575928, + "learning_rate": 0.00020818855598188574, + "loss": 2.9655, + "step": 36668 + }, + { + "epoch": 1.8, + "grad_norm": 0.6294272541999817, + "learning_rate": 0.000208173898633147, + "loss": 3.0103, + "step": 36669 + }, + { + "epoch": 1.8, + "grad_norm": 0.6239528059959412, + "learning_rate": 0.0002081592415262605, + "loss": 3.0741, + "step": 36670 + }, + { + "epoch": 1.8, + "grad_norm": 0.630536675453186, + "learning_rate": 0.00020814458466126485, + "loss": 3.0073, + "step": 36671 + }, + { + "epoch": 1.8, + "grad_norm": 0.6464529633522034, + "learning_rate": 0.0002081299280381988, + "loss": 3.1235, + "step": 36672 + }, + { + "epoch": 1.8, + "grad_norm": 0.6047073602676392, + "learning_rate": 0.00020811527165710069, + "loss": 2.9439, + "step": 36673 + }, + { + "epoch": 1.8, + "grad_norm": 0.6120465397834778, + "learning_rate": 0.0002081006155180094, + "loss": 2.8782, + "step": 36674 + }, + { + "epoch": 1.8, + "grad_norm": 0.631659984588623, + "learning_rate": 0.00020808595962096336, + "loss": 3.1335, + "step": 36675 + }, + { + "epoch": 1.8, + "grad_norm": 0.6841307282447815, + "learning_rate": 0.00020807130396600115, + "loss": 2.9434, + "step": 36676 + }, + { + "epoch": 1.8, + "grad_norm": 0.626285970211029, + "learning_rate": 0.00020805664855316153, + "loss": 2.8426, + "step": 36677 + }, + { + "epoch": 1.8, + "grad_norm": 0.6159030795097351, + "learning_rate": 0.00020804199338248292, + "loss": 3.0015, + "step": 36678 + }, + { + "epoch": 1.8, + "grad_norm": 0.593059778213501, + "learning_rate": 0.00020802733845400411, + "loss": 2.8796, + "step": 36679 + }, + { + "epoch": 1.8, + "grad_norm": 0.7196869254112244, + "learning_rate": 0.00020801268376776348, + "loss": 2.7227, + "step": 36680 + }, + { + "epoch": 1.8, + "grad_norm": 0.6337694525718689, + "learning_rate": 0.0002079980293237997, + "loss": 3.0181, + "step": 36681 + }, + { + "epoch": 1.8, + "grad_norm": 0.612427830696106, + "learning_rate": 0.00020798337512215154, + "loss": 2.9788, + "step": 36682 + }, + { + "epoch": 1.8, + "grad_norm": 0.629952073097229, + "learning_rate": 0.0002079687211628574, + "loss": 2.9669, + "step": 36683 + }, + { + "epoch": 1.8, + "grad_norm": 0.657768964767456, + "learning_rate": 0.00020795406744595604, + "loss": 2.9131, + "step": 36684 + }, + { + "epoch": 1.8, + "grad_norm": 0.6409008502960205, + "learning_rate": 0.00020793941397148576, + "loss": 3.0031, + "step": 36685 + }, + { + "epoch": 1.8, + "grad_norm": 0.6390206217765808, + "learning_rate": 0.00020792476073948544, + "loss": 3.0613, + "step": 36686 + }, + { + "epoch": 1.8, + "grad_norm": 0.663670539855957, + "learning_rate": 0.00020791010774999367, + "loss": 3.0369, + "step": 36687 + }, + { + "epoch": 1.8, + "grad_norm": 0.6246560215950012, + "learning_rate": 0.00020789545500304873, + "loss": 3.0468, + "step": 36688 + }, + { + "epoch": 1.8, + "grad_norm": 0.6665436625480652, + "learning_rate": 0.00020788080249868965, + "loss": 2.7913, + "step": 36689 + }, + { + "epoch": 1.8, + "grad_norm": 0.6187042593955994, + "learning_rate": 0.00020786615023695473, + "loss": 2.8444, + "step": 36690 + }, + { + "epoch": 1.8, + "grad_norm": 0.659570038318634, + "learning_rate": 0.0002078514982178827, + "loss": 2.9988, + "step": 36691 + }, + { + "epoch": 1.8, + "grad_norm": 0.6804064512252808, + "learning_rate": 0.00020783684644151194, + "loss": 3.032, + "step": 36692 + }, + { + "epoch": 1.8, + "grad_norm": 0.6458125710487366, + "learning_rate": 0.00020782219490788126, + "loss": 2.9141, + "step": 36693 + }, + { + "epoch": 1.8, + "grad_norm": 0.6113386154174805, + "learning_rate": 0.00020780754361702925, + "loss": 2.9778, + "step": 36694 + }, + { + "epoch": 1.8, + "grad_norm": 0.6785792112350464, + "learning_rate": 0.00020779289256899435, + "loss": 3.1989, + "step": 36695 + }, + { + "epoch": 1.8, + "grad_norm": 0.5986670851707458, + "learning_rate": 0.0002077782417638153, + "loss": 2.8518, + "step": 36696 + }, + { + "epoch": 1.8, + "grad_norm": 0.6187350749969482, + "learning_rate": 0.00020776359120153054, + "loss": 2.9814, + "step": 36697 + }, + { + "epoch": 1.8, + "grad_norm": 0.6327255368232727, + "learning_rate": 0.00020774894088217868, + "loss": 3.1217, + "step": 36698 + }, + { + "epoch": 1.8, + "grad_norm": 0.6451022028923035, + "learning_rate": 0.0002077342908057985, + "loss": 3.1749, + "step": 36699 + }, + { + "epoch": 1.8, + "grad_norm": 0.6500363945960999, + "learning_rate": 0.00020771964097242827, + "loss": 2.8666, + "step": 36700 + }, + { + "epoch": 1.8, + "grad_norm": 0.659353494644165, + "learning_rate": 0.00020770499138210696, + "loss": 3.2217, + "step": 36701 + }, + { + "epoch": 1.8, + "grad_norm": 0.6432385444641113, + "learning_rate": 0.00020769034203487274, + "loss": 2.7555, + "step": 36702 + }, + { + "epoch": 1.8, + "grad_norm": 0.7030990123748779, + "learning_rate": 0.00020767569293076444, + "loss": 2.9348, + "step": 36703 + }, + { + "epoch": 1.8, + "grad_norm": 0.6431054472923279, + "learning_rate": 0.00020766104406982067, + "loss": 2.866, + "step": 36704 + }, + { + "epoch": 1.8, + "grad_norm": 0.6525076627731323, + "learning_rate": 0.00020764639545207988, + "loss": 3.0982, + "step": 36705 + }, + { + "epoch": 1.8, + "grad_norm": 0.6153354644775391, + "learning_rate": 0.0002076317470775808, + "loss": 2.7865, + "step": 36706 + }, + { + "epoch": 1.8, + "grad_norm": 0.6521280407905579, + "learning_rate": 0.0002076170989463618, + "loss": 2.9171, + "step": 36707 + }, + { + "epoch": 1.8, + "grad_norm": 0.677487313747406, + "learning_rate": 0.00020760245105846164, + "loss": 3.1123, + "step": 36708 + }, + { + "epoch": 1.8, + "grad_norm": 0.6668803095817566, + "learning_rate": 0.00020758780341391888, + "loss": 2.9094, + "step": 36709 + }, + { + "epoch": 1.8, + "grad_norm": 0.6921082139015198, + "learning_rate": 0.00020757315601277197, + "loss": 3.0438, + "step": 36710 + }, + { + "epoch": 1.8, + "grad_norm": 0.6118077039718628, + "learning_rate": 0.00020755850885505965, + "loss": 3.111, + "step": 36711 + }, + { + "epoch": 1.8, + "grad_norm": 0.6800283789634705, + "learning_rate": 0.00020754386194082037, + "loss": 2.9849, + "step": 36712 + }, + { + "epoch": 1.8, + "grad_norm": 0.6486184597015381, + "learning_rate": 0.00020752921527009277, + "loss": 2.8923, + "step": 36713 + }, + { + "epoch": 1.8, + "grad_norm": 0.631157636642456, + "learning_rate": 0.0002075145688429155, + "loss": 3.034, + "step": 36714 + }, + { + "epoch": 1.8, + "grad_norm": 0.610759973526001, + "learning_rate": 0.00020749992265932697, + "loss": 3.0504, + "step": 36715 + }, + { + "epoch": 1.8, + "grad_norm": 0.6135387420654297, + "learning_rate": 0.00020748527671936596, + "loss": 2.8504, + "step": 36716 + }, + { + "epoch": 1.8, + "grad_norm": 0.6542940139770508, + "learning_rate": 0.00020747063102307078, + "loss": 3.0165, + "step": 36717 + }, + { + "epoch": 1.8, + "grad_norm": 0.7032894492149353, + "learning_rate": 0.00020745598557048018, + "loss": 2.9044, + "step": 36718 + }, + { + "epoch": 1.8, + "grad_norm": 0.6352038383483887, + "learning_rate": 0.0002074413403616328, + "loss": 2.8658, + "step": 36719 + }, + { + "epoch": 1.8, + "grad_norm": 0.6255574822425842, + "learning_rate": 0.000207426695396567, + "loss": 3.1381, + "step": 36720 + }, + { + "epoch": 1.8, + "grad_norm": 0.6670256853103638, + "learning_rate": 0.00020741205067532158, + "loss": 2.993, + "step": 36721 + }, + { + "epoch": 1.8, + "grad_norm": 0.6089436411857605, + "learning_rate": 0.00020739740619793486, + "loss": 3.0545, + "step": 36722 + }, + { + "epoch": 1.8, + "grad_norm": 0.6588670015335083, + "learning_rate": 0.0002073827619644456, + "loss": 3.0358, + "step": 36723 + }, + { + "epoch": 1.8, + "grad_norm": 0.6333926916122437, + "learning_rate": 0.00020736811797489243, + "loss": 2.8797, + "step": 36724 + }, + { + "epoch": 1.8, + "grad_norm": 0.6069063544273376, + "learning_rate": 0.00020735347422931366, + "loss": 3.113, + "step": 36725 + }, + { + "epoch": 1.8, + "grad_norm": 0.6227714419364929, + "learning_rate": 0.00020733883072774815, + "loss": 3.0379, + "step": 36726 + }, + { + "epoch": 1.8, + "grad_norm": 0.635350227355957, + "learning_rate": 0.00020732418747023424, + "loss": 3.0705, + "step": 36727 + }, + { + "epoch": 1.8, + "grad_norm": 0.6420906186103821, + "learning_rate": 0.00020730954445681053, + "loss": 2.8377, + "step": 36728 + }, + { + "epoch": 1.8, + "grad_norm": 0.6361296772956848, + "learning_rate": 0.0002072949016875158, + "loss": 3.1277, + "step": 36729 + }, + { + "epoch": 1.8, + "grad_norm": 0.6772409677505493, + "learning_rate": 0.00020728025916238836, + "loss": 2.8932, + "step": 36730 + }, + { + "epoch": 1.8, + "grad_norm": 0.6195856928825378, + "learning_rate": 0.00020726561688146696, + "loss": 3.0179, + "step": 36731 + }, + { + "epoch": 1.8, + "grad_norm": 0.6762132048606873, + "learning_rate": 0.0002072509748447899, + "loss": 3.0329, + "step": 36732 + }, + { + "epoch": 1.8, + "grad_norm": 0.6098534464836121, + "learning_rate": 0.000207236333052396, + "loss": 2.9847, + "step": 36733 + }, + { + "epoch": 1.8, + "grad_norm": 0.6518588066101074, + "learning_rate": 0.00020722169150432383, + "loss": 3.0305, + "step": 36734 + }, + { + "epoch": 1.8, + "grad_norm": 0.6830816268920898, + "learning_rate": 0.00020720705020061173, + "loss": 3.1915, + "step": 36735 + }, + { + "epoch": 1.8, + "grad_norm": 0.6383629441261292, + "learning_rate": 0.00020719240914129854, + "loss": 3.1503, + "step": 36736 + }, + { + "epoch": 1.8, + "grad_norm": 0.5932306051254272, + "learning_rate": 0.0002071777683264226, + "loss": 3.0398, + "step": 36737 + }, + { + "epoch": 1.8, + "grad_norm": 0.6360545754432678, + "learning_rate": 0.00020716312775602254, + "loss": 3.0491, + "step": 36738 + }, + { + "epoch": 1.8, + "grad_norm": 0.5986540913581848, + "learning_rate": 0.000207148487430137, + "loss": 2.8491, + "step": 36739 + }, + { + "epoch": 1.8, + "grad_norm": 0.6128685474395752, + "learning_rate": 0.00020713384734880445, + "loss": 2.9966, + "step": 36740 + }, + { + "epoch": 1.8, + "grad_norm": 0.5811918377876282, + "learning_rate": 0.00020711920751206354, + "loss": 2.828, + "step": 36741 + }, + { + "epoch": 1.8, + "grad_norm": 0.6315313577651978, + "learning_rate": 0.00020710456791995266, + "loss": 3.0263, + "step": 36742 + }, + { + "epoch": 1.8, + "grad_norm": 0.5910419821739197, + "learning_rate": 0.0002070899285725104, + "loss": 2.8827, + "step": 36743 + }, + { + "epoch": 1.8, + "grad_norm": 0.6899007558822632, + "learning_rate": 0.00020707528946977555, + "loss": 2.9803, + "step": 36744 + }, + { + "epoch": 1.8, + "grad_norm": 0.6172998547554016, + "learning_rate": 0.00020706065061178644, + "loss": 3.0134, + "step": 36745 + }, + { + "epoch": 1.8, + "grad_norm": 0.6163078546524048, + "learning_rate": 0.00020704601199858172, + "loss": 3.1796, + "step": 36746 + }, + { + "epoch": 1.8, + "grad_norm": 0.6540799140930176, + "learning_rate": 0.00020703137363019985, + "loss": 2.8401, + "step": 36747 + }, + { + "epoch": 1.8, + "grad_norm": 0.6538397073745728, + "learning_rate": 0.0002070167355066795, + "loss": 2.8745, + "step": 36748 + }, + { + "epoch": 1.8, + "grad_norm": 0.7262605428695679, + "learning_rate": 0.00020700209762805922, + "loss": 3.2279, + "step": 36749 + }, + { + "epoch": 1.8, + "grad_norm": 0.6116203665733337, + "learning_rate": 0.00020698745999437736, + "loss": 3.0517, + "step": 36750 + }, + { + "epoch": 1.8, + "grad_norm": 0.6457718014717102, + "learning_rate": 0.0002069728226056728, + "loss": 2.8412, + "step": 36751 + }, + { + "epoch": 1.8, + "grad_norm": 0.6367230415344238, + "learning_rate": 0.00020695818546198383, + "loss": 3.0288, + "step": 36752 + }, + { + "epoch": 1.8, + "grad_norm": 0.6102105379104614, + "learning_rate": 0.00020694354856334908, + "loss": 2.8687, + "step": 36753 + }, + { + "epoch": 1.8, + "grad_norm": 0.679972767829895, + "learning_rate": 0.00020692891190980716, + "loss": 2.9442, + "step": 36754 + }, + { + "epoch": 1.8, + "grad_norm": 0.6194684505462646, + "learning_rate": 0.00020691427550139656, + "loss": 2.9587, + "step": 36755 + }, + { + "epoch": 1.8, + "grad_norm": 0.7451155781745911, + "learning_rate": 0.00020689963933815593, + "loss": 3.0858, + "step": 36756 + }, + { + "epoch": 1.8, + "grad_norm": 0.615552544593811, + "learning_rate": 0.00020688500342012355, + "loss": 2.8748, + "step": 36757 + }, + { + "epoch": 1.8, + "grad_norm": 0.6059805750846863, + "learning_rate": 0.00020687036774733823, + "loss": 2.7704, + "step": 36758 + }, + { + "epoch": 1.8, + "grad_norm": 0.6106909513473511, + "learning_rate": 0.00020685573231983852, + "loss": 2.8743, + "step": 36759 + }, + { + "epoch": 1.8, + "grad_norm": 0.6602317094802856, + "learning_rate": 0.00020684109713766278, + "loss": 3.0426, + "step": 36760 + }, + { + "epoch": 1.8, + "grad_norm": 0.6297293305397034, + "learning_rate": 0.00020682646220084974, + "loss": 3.0277, + "step": 36761 + }, + { + "epoch": 1.8, + "grad_norm": 0.6300654411315918, + "learning_rate": 0.00020681182750943776, + "loss": 3.0978, + "step": 36762 + }, + { + "epoch": 1.8, + "grad_norm": 0.6634638905525208, + "learning_rate": 0.00020679719306346557, + "loss": 3.0136, + "step": 36763 + }, + { + "epoch": 1.8, + "grad_norm": 0.6517794728279114, + "learning_rate": 0.00020678255886297171, + "loss": 2.971, + "step": 36764 + }, + { + "epoch": 1.8, + "grad_norm": 0.6523165106773376, + "learning_rate": 0.00020676792490799447, + "loss": 3.0286, + "step": 36765 + }, + { + "epoch": 1.8, + "grad_norm": 0.6734626889228821, + "learning_rate": 0.00020675329119857275, + "loss": 3.1295, + "step": 36766 + }, + { + "epoch": 1.8, + "grad_norm": 0.632370114326477, + "learning_rate": 0.00020673865773474477, + "loss": 3.0338, + "step": 36767 + }, + { + "epoch": 1.8, + "grad_norm": 0.6716340780258179, + "learning_rate": 0.0002067240245165494, + "loss": 3.0109, + "step": 36768 + }, + { + "epoch": 1.8, + "grad_norm": 0.641351044178009, + "learning_rate": 0.00020670939154402478, + "loss": 2.9696, + "step": 36769 + }, + { + "epoch": 1.8, + "grad_norm": 0.610369086265564, + "learning_rate": 0.00020669475881720975, + "loss": 3.0894, + "step": 36770 + }, + { + "epoch": 1.8, + "grad_norm": 0.6596361398696899, + "learning_rate": 0.00020668012633614287, + "loss": 3.0013, + "step": 36771 + }, + { + "epoch": 1.8, + "grad_norm": 0.6531994938850403, + "learning_rate": 0.00020666549410086244, + "loss": 3.2047, + "step": 36772 + }, + { + "epoch": 1.8, + "grad_norm": 0.6469523906707764, + "learning_rate": 0.00020665086211140723, + "loss": 2.853, + "step": 36773 + }, + { + "epoch": 1.8, + "grad_norm": 0.6518253684043884, + "learning_rate": 0.00020663623036781565, + "loss": 2.9245, + "step": 36774 + }, + { + "epoch": 1.8, + "grad_norm": 0.617925763130188, + "learning_rate": 0.00020662159887012616, + "loss": 2.7722, + "step": 36775 + }, + { + "epoch": 1.8, + "grad_norm": 0.6405405402183533, + "learning_rate": 0.00020660696761837761, + "loss": 3.2794, + "step": 36776 + }, + { + "epoch": 1.8, + "grad_norm": 0.6911517977714539, + "learning_rate": 0.00020659233661260822, + "loss": 2.9798, + "step": 36777 + }, + { + "epoch": 1.8, + "grad_norm": 0.627225935459137, + "learning_rate": 0.0002065777058528567, + "loss": 3.0352, + "step": 36778 + }, + { + "epoch": 1.8, + "grad_norm": 0.6320593953132629, + "learning_rate": 0.00020656307533916142, + "loss": 2.9623, + "step": 36779 + }, + { + "epoch": 1.8, + "grad_norm": 0.6159602403640747, + "learning_rate": 0.00020654844507156107, + "loss": 3.0543, + "step": 36780 + }, + { + "epoch": 1.8, + "grad_norm": 0.6653395891189575, + "learning_rate": 0.00020653381505009418, + "loss": 2.9946, + "step": 36781 + }, + { + "epoch": 1.8, + "grad_norm": 0.6371135711669922, + "learning_rate": 0.00020651918527479914, + "loss": 3.1782, + "step": 36782 + }, + { + "epoch": 1.8, + "grad_norm": 0.6792336106300354, + "learning_rate": 0.0002065045557457147, + "loss": 3.0116, + "step": 36783 + }, + { + "epoch": 1.8, + "grad_norm": 0.6385068297386169, + "learning_rate": 0.0002064899264628791, + "loss": 2.9848, + "step": 36784 + }, + { + "epoch": 1.8, + "grad_norm": 0.6543515920639038, + "learning_rate": 0.00020647529742633117, + "loss": 2.9478, + "step": 36785 + }, + { + "epoch": 1.8, + "grad_norm": 0.6753814816474915, + "learning_rate": 0.00020646066863610931, + "loss": 3.0857, + "step": 36786 + }, + { + "epoch": 1.8, + "grad_norm": 0.6674555540084839, + "learning_rate": 0.00020644604009225191, + "loss": 3.0606, + "step": 36787 + }, + { + "epoch": 1.8, + "grad_norm": 0.6608149409294128, + "learning_rate": 0.00020643141179479775, + "loss": 3.1004, + "step": 36788 + }, + { + "epoch": 1.8, + "grad_norm": 0.6545112729072571, + "learning_rate": 0.00020641678374378527, + "loss": 2.8895, + "step": 36789 + }, + { + "epoch": 1.8, + "grad_norm": 0.6464564800262451, + "learning_rate": 0.0002064021559392528, + "loss": 3.1448, + "step": 36790 + }, + { + "epoch": 1.8, + "grad_norm": 0.6789308190345764, + "learning_rate": 0.00020638752838123925, + "loss": 3.0127, + "step": 36791 + }, + { + "epoch": 1.8, + "grad_norm": 0.6076009273529053, + "learning_rate": 0.00020637290106978283, + "loss": 3.0271, + "step": 36792 + }, + { + "epoch": 1.8, + "grad_norm": 0.6446407437324524, + "learning_rate": 0.00020635827400492223, + "loss": 2.9744, + "step": 36793 + }, + { + "epoch": 1.8, + "grad_norm": 0.6316608786582947, + "learning_rate": 0.00020634364718669584, + "loss": 3.008, + "step": 36794 + }, + { + "epoch": 1.8, + "grad_norm": 0.6050397157669067, + "learning_rate": 0.0002063290206151423, + "loss": 2.8838, + "step": 36795 + }, + { + "epoch": 1.8, + "grad_norm": 0.6316719651222229, + "learning_rate": 0.00020631439429030012, + "loss": 2.9403, + "step": 36796 + }, + { + "epoch": 1.8, + "grad_norm": 0.6683484315872192, + "learning_rate": 0.00020629976821220767, + "loss": 3.0497, + "step": 36797 + }, + { + "epoch": 1.8, + "grad_norm": 0.663461446762085, + "learning_rate": 0.00020628514238090373, + "loss": 3.0129, + "step": 36798 + }, + { + "epoch": 1.8, + "grad_norm": 0.6077750325202942, + "learning_rate": 0.00020627051679642666, + "loss": 2.9524, + "step": 36799 + }, + { + "epoch": 1.8, + "grad_norm": 0.632233738899231, + "learning_rate": 0.0002062558914588149, + "loss": 2.8329, + "step": 36800 + }, + { + "epoch": 1.8, + "grad_norm": 0.6921725273132324, + "learning_rate": 0.00020624126636810725, + "loss": 3.0573, + "step": 36801 + }, + { + "epoch": 1.8, + "grad_norm": 0.6208415031433105, + "learning_rate": 0.000206226641524342, + "loss": 2.8597, + "step": 36802 + }, + { + "epoch": 1.8, + "grad_norm": 0.6545069217681885, + "learning_rate": 0.00020621201692755777, + "loss": 3.0219, + "step": 36803 + }, + { + "epoch": 1.8, + "grad_norm": 0.649098813533783, + "learning_rate": 0.0002061973925777929, + "loss": 3.1196, + "step": 36804 + }, + { + "epoch": 1.8, + "grad_norm": 0.6079701781272888, + "learning_rate": 0.00020618276847508604, + "loss": 2.788, + "step": 36805 + }, + { + "epoch": 1.8, + "grad_norm": 0.6139565110206604, + "learning_rate": 0.00020616814461947588, + "loss": 2.9477, + "step": 36806 + }, + { + "epoch": 1.8, + "grad_norm": 0.6319758296012878, + "learning_rate": 0.0002061535210110007, + "loss": 3.0009, + "step": 36807 + }, + { + "epoch": 1.8, + "grad_norm": 0.6935994029045105, + "learning_rate": 0.0002061388976496991, + "loss": 2.9628, + "step": 36808 + }, + { + "epoch": 1.8, + "grad_norm": 0.6240067481994629, + "learning_rate": 0.00020612427453560946, + "loss": 2.795, + "step": 36809 + }, + { + "epoch": 1.8, + "grad_norm": 0.6220824122428894, + "learning_rate": 0.00020610965166877054, + "loss": 3.0931, + "step": 36810 + }, + { + "epoch": 1.8, + "grad_norm": 0.6552855968475342, + "learning_rate": 0.00020609502904922074, + "loss": 2.8704, + "step": 36811 + }, + { + "epoch": 1.8, + "grad_norm": 0.6498665809631348, + "learning_rate": 0.00020608040667699845, + "loss": 3.0281, + "step": 36812 + }, + { + "epoch": 1.8, + "grad_norm": 0.6534984707832336, + "learning_rate": 0.00020606578455214242, + "loss": 3.0395, + "step": 36813 + }, + { + "epoch": 1.8, + "grad_norm": 0.7032010555267334, + "learning_rate": 0.00020605116267469097, + "loss": 3.0231, + "step": 36814 + }, + { + "epoch": 1.8, + "grad_norm": 0.6376596093177795, + "learning_rate": 0.0002060365410446826, + "loss": 3.1822, + "step": 36815 + }, + { + "epoch": 1.8, + "grad_norm": 0.6410520672798157, + "learning_rate": 0.00020602191966215605, + "loss": 3.0589, + "step": 36816 + }, + { + "epoch": 1.8, + "grad_norm": 0.6917757391929626, + "learning_rate": 0.00020600729852714958, + "loss": 2.9896, + "step": 36817 + }, + { + "epoch": 1.8, + "grad_norm": 0.6153084635734558, + "learning_rate": 0.0002059926776397019, + "loss": 3.0361, + "step": 36818 + }, + { + "epoch": 1.8, + "grad_norm": 0.720618724822998, + "learning_rate": 0.00020597805699985124, + "loss": 3.0778, + "step": 36819 + }, + { + "epoch": 1.8, + "grad_norm": 0.6662018895149231, + "learning_rate": 0.00020596343660763642, + "loss": 2.7993, + "step": 36820 + }, + { + "epoch": 1.8, + "grad_norm": 0.6655521392822266, + "learning_rate": 0.0002059488164630958, + "loss": 2.942, + "step": 36821 + }, + { + "epoch": 1.8, + "grad_norm": 0.6253843307495117, + "learning_rate": 0.00020593419656626778, + "loss": 3.0623, + "step": 36822 + }, + { + "epoch": 1.8, + "grad_norm": 0.6495913863182068, + "learning_rate": 0.00020591957691719112, + "loss": 2.8426, + "step": 36823 + }, + { + "epoch": 1.8, + "grad_norm": 0.6686586737632751, + "learning_rate": 0.00020590495751590408, + "loss": 2.9081, + "step": 36824 + }, + { + "epoch": 1.8, + "grad_norm": 0.6453542113304138, + "learning_rate": 0.00020589033836244534, + "loss": 3.1822, + "step": 36825 + }, + { + "epoch": 1.8, + "grad_norm": 0.6410621404647827, + "learning_rate": 0.0002058757194568534, + "loss": 2.809, + "step": 36826 + }, + { + "epoch": 1.8, + "grad_norm": 0.6789267659187317, + "learning_rate": 0.00020586110079916654, + "loss": 2.9868, + "step": 36827 + }, + { + "epoch": 1.8, + "grad_norm": 0.6374423503875732, + "learning_rate": 0.00020584648238942358, + "loss": 3.154, + "step": 36828 + }, + { + "epoch": 1.8, + "grad_norm": 0.6617890000343323, + "learning_rate": 0.0002058318642276628, + "loss": 3.0522, + "step": 36829 + }, + { + "epoch": 1.8, + "grad_norm": 0.6505021452903748, + "learning_rate": 0.00020581724631392267, + "loss": 3.2386, + "step": 36830 + }, + { + "epoch": 1.8, + "grad_norm": 0.6423779726028442, + "learning_rate": 0.00020580262864824193, + "loss": 3.1164, + "step": 36831 + }, + { + "epoch": 1.81, + "grad_norm": 0.6466403603553772, + "learning_rate": 0.0002057880112306589, + "loss": 3.0535, + "step": 36832 + }, + { + "epoch": 1.81, + "grad_norm": 0.6527153849601746, + "learning_rate": 0.00020577339406121215, + "loss": 2.8668, + "step": 36833 + }, + { + "epoch": 1.81, + "grad_norm": 0.6217302083969116, + "learning_rate": 0.00020575877713994002, + "loss": 3.0263, + "step": 36834 + }, + { + "epoch": 1.81, + "grad_norm": 0.6141082048416138, + "learning_rate": 0.00020574416046688124, + "loss": 3.1425, + "step": 36835 + }, + { + "epoch": 1.81, + "grad_norm": 0.6378345489501953, + "learning_rate": 0.00020572954404207423, + "loss": 3.1281, + "step": 36836 + }, + { + "epoch": 1.81, + "grad_norm": 0.6423177719116211, + "learning_rate": 0.00020571492786555733, + "loss": 3.0076, + "step": 36837 + }, + { + "epoch": 1.81, + "grad_norm": 0.6553202867507935, + "learning_rate": 0.0002057003119373693, + "loss": 2.9455, + "step": 36838 + }, + { + "epoch": 1.81, + "grad_norm": 0.627715528011322, + "learning_rate": 0.00020568569625754844, + "loss": 2.9104, + "step": 36839 + }, + { + "epoch": 1.81, + "grad_norm": 0.6472757458686829, + "learning_rate": 0.00020567108082613324, + "loss": 2.9738, + "step": 36840 + }, + { + "epoch": 1.81, + "grad_norm": 0.6219276785850525, + "learning_rate": 0.00020565646564316237, + "loss": 3.0335, + "step": 36841 + }, + { + "epoch": 1.81, + "grad_norm": 0.6370683312416077, + "learning_rate": 0.00020564185070867416, + "loss": 3.1745, + "step": 36842 + }, + { + "epoch": 1.81, + "grad_norm": 0.6401722431182861, + "learning_rate": 0.0002056272360227072, + "loss": 3.118, + "step": 36843 + }, + { + "epoch": 1.81, + "grad_norm": 0.7073693871498108, + "learning_rate": 0.00020561262158529984, + "loss": 2.9147, + "step": 36844 + }, + { + "epoch": 1.81, + "grad_norm": 0.637370228767395, + "learning_rate": 0.00020559800739649065, + "loss": 3.0401, + "step": 36845 + }, + { + "epoch": 1.81, + "grad_norm": 0.658949077129364, + "learning_rate": 0.00020558339345631827, + "loss": 2.925, + "step": 36846 + }, + { + "epoch": 1.81, + "grad_norm": 0.6802607774734497, + "learning_rate": 0.000205568779764821, + "loss": 2.9548, + "step": 36847 + }, + { + "epoch": 1.81, + "grad_norm": 0.6683867573738098, + "learning_rate": 0.00020555416632203749, + "loss": 3.0257, + "step": 36848 + }, + { + "epoch": 1.81, + "grad_norm": 0.6157516241073608, + "learning_rate": 0.00020553955312800596, + "loss": 3.0218, + "step": 36849 + }, + { + "epoch": 1.81, + "grad_norm": 0.6470605134963989, + "learning_rate": 0.0002055249401827652, + "loss": 2.8591, + "step": 36850 + }, + { + "epoch": 1.81, + "grad_norm": 0.7139477729797363, + "learning_rate": 0.0002055103274863535, + "loss": 2.8813, + "step": 36851 + }, + { + "epoch": 1.81, + "grad_norm": 0.5850152373313904, + "learning_rate": 0.00020549571503880937, + "loss": 2.9898, + "step": 36852 + }, + { + "epoch": 1.81, + "grad_norm": 0.6298661828041077, + "learning_rate": 0.00020548110284017144, + "loss": 3.1898, + "step": 36853 + }, + { + "epoch": 1.81, + "grad_norm": 0.6115210056304932, + "learning_rate": 0.00020546649089047805, + "loss": 2.9295, + "step": 36854 + }, + { + "epoch": 1.81, + "grad_norm": 0.6343759894371033, + "learning_rate": 0.0002054518791897678, + "loss": 3.2084, + "step": 36855 + }, + { + "epoch": 1.81, + "grad_norm": 0.6652531623840332, + "learning_rate": 0.00020543726773807896, + "loss": 2.8345, + "step": 36856 + }, + { + "epoch": 1.81, + "grad_norm": 0.6212843060493469, + "learning_rate": 0.00020542265653545022, + "loss": 2.9031, + "step": 36857 + }, + { + "epoch": 1.81, + "grad_norm": 0.6636337637901306, + "learning_rate": 0.00020540804558192007, + "loss": 3.254, + "step": 36858 + }, + { + "epoch": 1.81, + "grad_norm": 0.6373041868209839, + "learning_rate": 0.0002053934348775268, + "loss": 2.9458, + "step": 36859 + }, + { + "epoch": 1.81, + "grad_norm": 0.6539564728736877, + "learning_rate": 0.00020537882442230916, + "loss": 2.9724, + "step": 36860 + }, + { + "epoch": 1.81, + "grad_norm": 0.6322240829467773, + "learning_rate": 0.00020536421421630537, + "loss": 2.9376, + "step": 36861 + }, + { + "epoch": 1.81, + "grad_norm": 0.6140871047973633, + "learning_rate": 0.000205349604259554, + "loss": 3.0766, + "step": 36862 + }, + { + "epoch": 1.81, + "grad_norm": 0.6811372637748718, + "learning_rate": 0.00020533499455209364, + "loss": 2.9396, + "step": 36863 + }, + { + "epoch": 1.81, + "grad_norm": 0.6444950699806213, + "learning_rate": 0.0002053203850939626, + "loss": 2.9081, + "step": 36864 + }, + { + "epoch": 1.81, + "grad_norm": 0.6304511427879333, + "learning_rate": 0.00020530577588519968, + "loss": 3.3028, + "step": 36865 + }, + { + "epoch": 1.81, + "grad_norm": 0.6575809121131897, + "learning_rate": 0.00020529116692584286, + "loss": 2.9909, + "step": 36866 + }, + { + "epoch": 1.81, + "grad_norm": 0.6433678865432739, + "learning_rate": 0.00020527655821593092, + "loss": 3.0047, + "step": 36867 + }, + { + "epoch": 1.81, + "grad_norm": 0.6921300888061523, + "learning_rate": 0.00020526194975550242, + "loss": 3.0075, + "step": 36868 + }, + { + "epoch": 1.81, + "grad_norm": 0.6458017230033875, + "learning_rate": 0.00020524734154459562, + "loss": 3.1427, + "step": 36869 + }, + { + "epoch": 1.81, + "grad_norm": 0.6515001058578491, + "learning_rate": 0.00020523273358324912, + "loss": 2.9976, + "step": 36870 + }, + { + "epoch": 1.81, + "grad_norm": 0.6600772142410278, + "learning_rate": 0.0002052181258715013, + "loss": 2.9814, + "step": 36871 + }, + { + "epoch": 1.81, + "grad_norm": 0.6418211460113525, + "learning_rate": 0.00020520351840939074, + "loss": 3.1198, + "step": 36872 + }, + { + "epoch": 1.81, + "grad_norm": 0.6537681221961975, + "learning_rate": 0.00020518891119695592, + "loss": 3.0484, + "step": 36873 + }, + { + "epoch": 1.81, + "grad_norm": 0.669597327709198, + "learning_rate": 0.00020517430423423513, + "loss": 2.9601, + "step": 36874 + }, + { + "epoch": 1.81, + "grad_norm": 0.6508209109306335, + "learning_rate": 0.0002051596975212671, + "loss": 2.887, + "step": 36875 + }, + { + "epoch": 1.81, + "grad_norm": 0.580568790435791, + "learning_rate": 0.00020514509105809013, + "loss": 3.0521, + "step": 36876 + }, + { + "epoch": 1.81, + "grad_norm": 0.6386202573776245, + "learning_rate": 0.00020513048484474265, + "loss": 2.9794, + "step": 36877 + }, + { + "epoch": 1.81, + "grad_norm": 0.6095972657203674, + "learning_rate": 0.00020511587888126338, + "loss": 3.0316, + "step": 36878 + }, + { + "epoch": 1.81, + "grad_norm": 0.6701319813728333, + "learning_rate": 0.00020510127316769055, + "loss": 2.9153, + "step": 36879 + }, + { + "epoch": 1.81, + "grad_norm": 0.6238023042678833, + "learning_rate": 0.00020508666770406277, + "loss": 3.0534, + "step": 36880 + }, + { + "epoch": 1.81, + "grad_norm": 0.6750576496124268, + "learning_rate": 0.0002050720624904183, + "loss": 2.9337, + "step": 36881 + }, + { + "epoch": 1.81, + "grad_norm": 0.67641282081604, + "learning_rate": 0.0002050574575267958, + "loss": 3.0228, + "step": 36882 + }, + { + "epoch": 1.81, + "grad_norm": 0.6155452132225037, + "learning_rate": 0.00020504285281323375, + "loss": 3.0172, + "step": 36883 + }, + { + "epoch": 1.81, + "grad_norm": 0.6545352339744568, + "learning_rate": 0.0002050282483497704, + "loss": 2.9602, + "step": 36884 + }, + { + "epoch": 1.81, + "grad_norm": 0.630028247833252, + "learning_rate": 0.00020501364413644455, + "loss": 3.0454, + "step": 36885 + }, + { + "epoch": 1.81, + "grad_norm": 0.6380795836448669, + "learning_rate": 0.00020499904017329433, + "loss": 3.0451, + "step": 36886 + }, + { + "epoch": 1.81, + "grad_norm": 0.6383315324783325, + "learning_rate": 0.0002049844364603584, + "loss": 3.048, + "step": 36887 + }, + { + "epoch": 1.81, + "grad_norm": 0.6407351493835449, + "learning_rate": 0.00020496983299767528, + "loss": 3.145, + "step": 36888 + }, + { + "epoch": 1.81, + "grad_norm": 0.6724693775177002, + "learning_rate": 0.00020495522978528316, + "loss": 3.084, + "step": 36889 + }, + { + "epoch": 1.81, + "grad_norm": 0.625685453414917, + "learning_rate": 0.00020494062682322085, + "loss": 3.1366, + "step": 36890 + }, + { + "epoch": 1.81, + "grad_norm": 0.6715691089630127, + "learning_rate": 0.00020492602411152655, + "loss": 3.0932, + "step": 36891 + }, + { + "epoch": 1.81, + "grad_norm": 0.635994017124176, + "learning_rate": 0.00020491142165023874, + "loss": 2.9926, + "step": 36892 + }, + { + "epoch": 1.81, + "grad_norm": 0.6208715438842773, + "learning_rate": 0.00020489681943939604, + "loss": 3.1724, + "step": 36893 + }, + { + "epoch": 1.81, + "grad_norm": 0.6574182510375977, + "learning_rate": 0.0002048822174790368, + "loss": 3.1069, + "step": 36894 + }, + { + "epoch": 1.81, + "grad_norm": 0.6716117858886719, + "learning_rate": 0.0002048676157691995, + "loss": 2.7032, + "step": 36895 + }, + { + "epoch": 1.81, + "grad_norm": 0.6087467670440674, + "learning_rate": 0.00020485301430992254, + "loss": 3.0921, + "step": 36896 + }, + { + "epoch": 1.81, + "grad_norm": 0.6368101835250854, + "learning_rate": 0.00020483841310124445, + "loss": 3.028, + "step": 36897 + }, + { + "epoch": 1.81, + "grad_norm": 0.634164035320282, + "learning_rate": 0.0002048238121432037, + "loss": 3.1461, + "step": 36898 + }, + { + "epoch": 1.81, + "grad_norm": 0.6230823993682861, + "learning_rate": 0.00020480921143583864, + "loss": 2.879, + "step": 36899 + }, + { + "epoch": 1.81, + "grad_norm": 0.656328022480011, + "learning_rate": 0.00020479461097918787, + "loss": 3.0447, + "step": 36900 + }, + { + "epoch": 1.81, + "grad_norm": 0.6206152439117432, + "learning_rate": 0.00020478001077328973, + "loss": 2.7903, + "step": 36901 + }, + { + "epoch": 1.81, + "grad_norm": 0.6653019785881042, + "learning_rate": 0.00020476541081818268, + "loss": 3.1942, + "step": 36902 + }, + { + "epoch": 1.81, + "grad_norm": 0.6336932182312012, + "learning_rate": 0.00020475081111390533, + "loss": 2.9204, + "step": 36903 + }, + { + "epoch": 1.81, + "grad_norm": 0.66854327917099, + "learning_rate": 0.00020473621166049592, + "loss": 2.9795, + "step": 36904 + }, + { + "epoch": 1.81, + "grad_norm": 0.6467189788818359, + "learning_rate": 0.0002047216124579931, + "loss": 2.8518, + "step": 36905 + }, + { + "epoch": 1.81, + "grad_norm": 0.6561312079429626, + "learning_rate": 0.00020470701350643503, + "loss": 2.9816, + "step": 36906 + }, + { + "epoch": 1.81, + "grad_norm": 0.6752562522888184, + "learning_rate": 0.0002046924148058604, + "loss": 3.0072, + "step": 36907 + }, + { + "epoch": 1.81, + "grad_norm": 0.6663380861282349, + "learning_rate": 0.00020467781635630773, + "loss": 3.0714, + "step": 36908 + }, + { + "epoch": 1.81, + "grad_norm": 0.6871305108070374, + "learning_rate": 0.00020466321815781525, + "loss": 2.7063, + "step": 36909 + }, + { + "epoch": 1.81, + "grad_norm": 0.6497218608856201, + "learning_rate": 0.0002046486202104216, + "loss": 2.948, + "step": 36910 + }, + { + "epoch": 1.81, + "grad_norm": 0.633631706237793, + "learning_rate": 0.000204634022514165, + "loss": 2.9519, + "step": 36911 + }, + { + "epoch": 1.81, + "grad_norm": 0.6532074809074402, + "learning_rate": 0.00020461942506908408, + "loss": 2.8991, + "step": 36912 + }, + { + "epoch": 1.81, + "grad_norm": 0.6703870892524719, + "learning_rate": 0.0002046048278752173, + "loss": 3.0202, + "step": 36913 + }, + { + "epoch": 1.81, + "grad_norm": 0.6608051657676697, + "learning_rate": 0.00020459023093260295, + "loss": 3.1402, + "step": 36914 + }, + { + "epoch": 1.81, + "grad_norm": 0.6233830451965332, + "learning_rate": 0.00020457563424127967, + "loss": 3.1028, + "step": 36915 + }, + { + "epoch": 1.81, + "grad_norm": 0.6100077629089355, + "learning_rate": 0.00020456103780128575, + "loss": 2.8412, + "step": 36916 + }, + { + "epoch": 1.81, + "grad_norm": 0.60553377866745, + "learning_rate": 0.00020454644161265958, + "loss": 2.9459, + "step": 36917 + }, + { + "epoch": 1.81, + "grad_norm": 0.6492863893508911, + "learning_rate": 0.00020453184567543992, + "loss": 3.0801, + "step": 36918 + }, + { + "epoch": 1.81, + "grad_norm": 0.6354946494102478, + "learning_rate": 0.00020451724998966488, + "loss": 3.0188, + "step": 36919 + }, + { + "epoch": 1.81, + "grad_norm": 0.617892861366272, + "learning_rate": 0.00020450265455537308, + "loss": 2.9104, + "step": 36920 + }, + { + "epoch": 1.81, + "grad_norm": 0.6572944521903992, + "learning_rate": 0.00020448805937260287, + "loss": 2.9129, + "step": 36921 + }, + { + "epoch": 1.81, + "grad_norm": 0.7081995010375977, + "learning_rate": 0.00020447346444139275, + "loss": 2.9972, + "step": 36922 + }, + { + "epoch": 1.81, + "grad_norm": 0.6335873007774353, + "learning_rate": 0.00020445886976178115, + "loss": 3.0526, + "step": 36923 + }, + { + "epoch": 1.81, + "grad_norm": 0.6316371560096741, + "learning_rate": 0.00020444427533380642, + "loss": 2.906, + "step": 36924 + }, + { + "epoch": 1.81, + "grad_norm": 0.6577939391136169, + "learning_rate": 0.00020442968115750722, + "loss": 2.9412, + "step": 36925 + }, + { + "epoch": 1.81, + "grad_norm": 0.6616895794868469, + "learning_rate": 0.0002044150872329217, + "loss": 2.8354, + "step": 36926 + }, + { + "epoch": 1.81, + "grad_norm": 0.6275897026062012, + "learning_rate": 0.0002044004935600886, + "loss": 3.1783, + "step": 36927 + }, + { + "epoch": 1.81, + "grad_norm": 0.6467520594596863, + "learning_rate": 0.00020438590013904605, + "loss": 3.1131, + "step": 36928 + }, + { + "epoch": 1.81, + "grad_norm": 0.6437161564826965, + "learning_rate": 0.00020437130696983265, + "loss": 3.0071, + "step": 36929 + }, + { + "epoch": 1.81, + "grad_norm": 0.6312966346740723, + "learning_rate": 0.00020435671405248694, + "loss": 3.2288, + "step": 36930 + }, + { + "epoch": 1.81, + "grad_norm": 0.6609817743301392, + "learning_rate": 0.00020434212138704716, + "loss": 3.136, + "step": 36931 + }, + { + "epoch": 1.81, + "grad_norm": 0.6926374435424805, + "learning_rate": 0.0002043275289735519, + "loss": 2.8855, + "step": 36932 + }, + { + "epoch": 1.81, + "grad_norm": 0.6676579713821411, + "learning_rate": 0.00020431293681203938, + "loss": 3.1017, + "step": 36933 + }, + { + "epoch": 1.81, + "grad_norm": 0.676917314529419, + "learning_rate": 0.00020429834490254823, + "loss": 3.0384, + "step": 36934 + }, + { + "epoch": 1.81, + "grad_norm": 0.6208324432373047, + "learning_rate": 0.0002042837532451169, + "loss": 3.0039, + "step": 36935 + }, + { + "epoch": 1.81, + "grad_norm": 0.6595355868339539, + "learning_rate": 0.00020426916183978363, + "loss": 2.996, + "step": 36936 + }, + { + "epoch": 1.81, + "grad_norm": 0.6129118800163269, + "learning_rate": 0.00020425457068658706, + "loss": 3.043, + "step": 36937 + }, + { + "epoch": 1.81, + "grad_norm": 0.5993137955665588, + "learning_rate": 0.00020423997978556547, + "loss": 3.0566, + "step": 36938 + }, + { + "epoch": 1.81, + "grad_norm": 0.620607316493988, + "learning_rate": 0.00020422538913675726, + "loss": 2.8563, + "step": 36939 + }, + { + "epoch": 1.81, + "grad_norm": 0.6551635265350342, + "learning_rate": 0.00020421079874020112, + "loss": 3.1258, + "step": 36940 + }, + { + "epoch": 1.81, + "grad_norm": 0.6164003610610962, + "learning_rate": 0.00020419620859593517, + "loss": 3.1104, + "step": 36941 + }, + { + "epoch": 1.81, + "grad_norm": 0.6258066296577454, + "learning_rate": 0.00020418161870399808, + "loss": 3.1211, + "step": 36942 + }, + { + "epoch": 1.81, + "grad_norm": 0.6458246111869812, + "learning_rate": 0.00020416702906442805, + "loss": 3.0477, + "step": 36943 + }, + { + "epoch": 1.81, + "grad_norm": 0.6024858951568604, + "learning_rate": 0.00020415243967726362, + "loss": 2.9224, + "step": 36944 + }, + { + "epoch": 1.81, + "grad_norm": 0.6260032653808594, + "learning_rate": 0.00020413785054254337, + "loss": 2.8545, + "step": 36945 + }, + { + "epoch": 1.81, + "grad_norm": 0.6995924711227417, + "learning_rate": 0.00020412326166030539, + "loss": 2.8598, + "step": 36946 + }, + { + "epoch": 1.81, + "grad_norm": 0.634792149066925, + "learning_rate": 0.0002041086730305884, + "loss": 2.72, + "step": 36947 + }, + { + "epoch": 1.81, + "grad_norm": 0.657988965511322, + "learning_rate": 0.00020409408465343066, + "loss": 2.8743, + "step": 36948 + }, + { + "epoch": 1.81, + "grad_norm": 0.6405299305915833, + "learning_rate": 0.00020407949652887064, + "loss": 2.9527, + "step": 36949 + }, + { + "epoch": 1.81, + "grad_norm": 0.6475312113761902, + "learning_rate": 0.0002040649086569468, + "loss": 3.0618, + "step": 36950 + }, + { + "epoch": 1.81, + "grad_norm": 0.6201655864715576, + "learning_rate": 0.00020405032103769746, + "loss": 3.1179, + "step": 36951 + }, + { + "epoch": 1.81, + "grad_norm": 0.6073014140129089, + "learning_rate": 0.0002040357336711612, + "loss": 2.9965, + "step": 36952 + }, + { + "epoch": 1.81, + "grad_norm": 0.6534898281097412, + "learning_rate": 0.00020402114655737635, + "loss": 2.9554, + "step": 36953 + }, + { + "epoch": 1.81, + "grad_norm": 0.630856990814209, + "learning_rate": 0.0002040065596963812, + "loss": 2.8513, + "step": 36954 + }, + { + "epoch": 1.81, + "grad_norm": 0.6377066969871521, + "learning_rate": 0.00020399197308821444, + "loss": 3.083, + "step": 36955 + }, + { + "epoch": 1.81, + "grad_norm": 0.6434876322746277, + "learning_rate": 0.00020397738673291432, + "loss": 2.7419, + "step": 36956 + }, + { + "epoch": 1.81, + "grad_norm": 0.6340557932853699, + "learning_rate": 0.0002039628006305193, + "loss": 2.9313, + "step": 36957 + }, + { + "epoch": 1.81, + "grad_norm": 0.6318641304969788, + "learning_rate": 0.00020394821478106768, + "loss": 2.8285, + "step": 36958 + }, + { + "epoch": 1.81, + "grad_norm": 0.6800630688667297, + "learning_rate": 0.000203933629184598, + "loss": 3.031, + "step": 36959 + }, + { + "epoch": 1.81, + "grad_norm": 0.6454338431358337, + "learning_rate": 0.00020391904384114875, + "loss": 2.8901, + "step": 36960 + }, + { + "epoch": 1.81, + "grad_norm": 0.7657660245895386, + "learning_rate": 0.00020390445875075812, + "loss": 3.1257, + "step": 36961 + }, + { + "epoch": 1.81, + "grad_norm": 0.6502024531364441, + "learning_rate": 0.00020388987391346477, + "loss": 3.2133, + "step": 36962 + }, + { + "epoch": 1.81, + "grad_norm": 0.6643862128257751, + "learning_rate": 0.00020387528932930691, + "loss": 3.0175, + "step": 36963 + }, + { + "epoch": 1.81, + "grad_norm": 0.6402032375335693, + "learning_rate": 0.00020386070499832303, + "loss": 2.9454, + "step": 36964 + }, + { + "epoch": 1.81, + "grad_norm": 0.6753803491592407, + "learning_rate": 0.00020384612092055166, + "loss": 3.1392, + "step": 36965 + }, + { + "epoch": 1.81, + "grad_norm": 0.6361395716667175, + "learning_rate": 0.000203831537096031, + "loss": 2.8411, + "step": 36966 + }, + { + "epoch": 1.81, + "grad_norm": 0.6042212843894958, + "learning_rate": 0.00020381695352479967, + "loss": 2.9097, + "step": 36967 + }, + { + "epoch": 1.81, + "grad_norm": 0.724984347820282, + "learning_rate": 0.00020380237020689582, + "loss": 3.0332, + "step": 36968 + }, + { + "epoch": 1.81, + "grad_norm": 0.6241597533226013, + "learning_rate": 0.00020378778714235803, + "loss": 2.9793, + "step": 36969 + }, + { + "epoch": 1.81, + "grad_norm": 0.6064009070396423, + "learning_rate": 0.00020377320433122482, + "loss": 3.2852, + "step": 36970 + }, + { + "epoch": 1.81, + "grad_norm": 0.6465131044387817, + "learning_rate": 0.00020375862177353436, + "loss": 3.1043, + "step": 36971 + }, + { + "epoch": 1.81, + "grad_norm": 0.6775936484336853, + "learning_rate": 0.0002037440394693253, + "loss": 3.1991, + "step": 36972 + }, + { + "epoch": 1.81, + "grad_norm": 0.6569826602935791, + "learning_rate": 0.00020372945741863575, + "loss": 2.8564, + "step": 36973 + }, + { + "epoch": 1.81, + "grad_norm": 0.6317790746688843, + "learning_rate": 0.00020371487562150436, + "loss": 3.0964, + "step": 36974 + }, + { + "epoch": 1.81, + "grad_norm": 0.642630398273468, + "learning_rate": 0.00020370029407796951, + "loss": 3.1208, + "step": 36975 + }, + { + "epoch": 1.81, + "grad_norm": 0.6569013595581055, + "learning_rate": 0.00020368571278806943, + "loss": 3.1136, + "step": 36976 + }, + { + "epoch": 1.81, + "grad_norm": 0.6564520597457886, + "learning_rate": 0.00020367113175184278, + "loss": 2.9458, + "step": 36977 + }, + { + "epoch": 1.81, + "grad_norm": 0.6364352703094482, + "learning_rate": 0.00020365655096932773, + "loss": 3.0549, + "step": 36978 + }, + { + "epoch": 1.81, + "grad_norm": 0.6397393345832825, + "learning_rate": 0.00020364197044056277, + "loss": 2.7911, + "step": 36979 + }, + { + "epoch": 1.81, + "grad_norm": 0.6152938008308411, + "learning_rate": 0.00020362739016558642, + "loss": 2.8501, + "step": 36980 + }, + { + "epoch": 1.81, + "grad_norm": 0.6240938305854797, + "learning_rate": 0.00020361281014443693, + "loss": 2.9638, + "step": 36981 + }, + { + "epoch": 1.81, + "grad_norm": 0.6312567591667175, + "learning_rate": 0.0002035982303771528, + "loss": 2.9071, + "step": 36982 + }, + { + "epoch": 1.81, + "grad_norm": 0.6281710267066956, + "learning_rate": 0.00020358365086377224, + "loss": 2.9096, + "step": 36983 + }, + { + "epoch": 1.81, + "grad_norm": 0.6307974457740784, + "learning_rate": 0.00020356907160433386, + "loss": 2.7694, + "step": 36984 + }, + { + "epoch": 1.81, + "grad_norm": 0.6692641377449036, + "learning_rate": 0.00020355449259887607, + "loss": 3.0278, + "step": 36985 + }, + { + "epoch": 1.81, + "grad_norm": 0.6274914145469666, + "learning_rate": 0.000203539913847437, + "loss": 3.0309, + "step": 36986 + }, + { + "epoch": 1.81, + "grad_norm": 0.6962708234786987, + "learning_rate": 0.00020352533535005545, + "loss": 2.827, + "step": 36987 + }, + { + "epoch": 1.81, + "grad_norm": 0.6521985530853271, + "learning_rate": 0.0002035107571067694, + "loss": 3.0369, + "step": 36988 + }, + { + "epoch": 1.81, + "grad_norm": 0.6316908597946167, + "learning_rate": 0.00020349617911761753, + "loss": 2.8481, + "step": 36989 + }, + { + "epoch": 1.81, + "grad_norm": 0.6510263085365295, + "learning_rate": 0.00020348160138263817, + "loss": 3.0658, + "step": 36990 + }, + { + "epoch": 1.81, + "grad_norm": 0.6218271255493164, + "learning_rate": 0.00020346702390186964, + "loss": 2.7787, + "step": 36991 + }, + { + "epoch": 1.81, + "grad_norm": 0.6902745366096497, + "learning_rate": 0.0002034524466753505, + "loss": 2.9119, + "step": 36992 + }, + { + "epoch": 1.81, + "grad_norm": 0.6275454163551331, + "learning_rate": 0.00020343786970311895, + "loss": 2.893, + "step": 36993 + }, + { + "epoch": 1.81, + "grad_norm": 0.6849902868270874, + "learning_rate": 0.00020342329298521345, + "loss": 2.9944, + "step": 36994 + }, + { + "epoch": 1.81, + "grad_norm": 0.7208778262138367, + "learning_rate": 0.00020340871652167248, + "loss": 2.9227, + "step": 36995 + }, + { + "epoch": 1.81, + "grad_norm": 0.6108225584030151, + "learning_rate": 0.00020339414031253432, + "loss": 3.1913, + "step": 36996 + }, + { + "epoch": 1.81, + "grad_norm": 0.6657624840736389, + "learning_rate": 0.00020337956435783749, + "loss": 2.6087, + "step": 36997 + }, + { + "epoch": 1.81, + "grad_norm": 0.6316897869110107, + "learning_rate": 0.00020336498865762014, + "loss": 3.0783, + "step": 36998 + }, + { + "epoch": 1.81, + "grad_norm": 0.6183792352676392, + "learning_rate": 0.00020335041321192088, + "loss": 2.9349, + "step": 36999 + }, + { + "epoch": 1.81, + "grad_norm": 0.6724928617477417, + "learning_rate": 0.00020333583802077811, + "loss": 2.9243, + "step": 37000 + }, + { + "epoch": 1.81, + "grad_norm": 0.5862394571304321, + "learning_rate": 0.00020332126308422998, + "loss": 3.1631, + "step": 37001 + }, + { + "epoch": 1.81, + "grad_norm": 0.6738343834877014, + "learning_rate": 0.00020330668840231515, + "loss": 2.9031, + "step": 37002 + }, + { + "epoch": 1.81, + "grad_norm": 0.637536346912384, + "learning_rate": 0.0002032921139750719, + "loss": 2.9616, + "step": 37003 + }, + { + "epoch": 1.81, + "grad_norm": 0.6663991212844849, + "learning_rate": 0.00020327753980253863, + "loss": 3.0612, + "step": 37004 + }, + { + "epoch": 1.81, + "grad_norm": 0.6609890460968018, + "learning_rate": 0.00020326296588475356, + "loss": 3.0707, + "step": 37005 + }, + { + "epoch": 1.81, + "grad_norm": 0.645164966583252, + "learning_rate": 0.0002032483922217553, + "loss": 2.9354, + "step": 37006 + }, + { + "epoch": 1.81, + "grad_norm": 0.675213098526001, + "learning_rate": 0.00020323381881358224, + "loss": 3.0725, + "step": 37007 + }, + { + "epoch": 1.81, + "grad_norm": 0.6274236440658569, + "learning_rate": 0.00020321924566027254, + "loss": 3.0605, + "step": 37008 + }, + { + "epoch": 1.81, + "grad_norm": 0.6301320791244507, + "learning_rate": 0.00020320467276186484, + "loss": 3.1105, + "step": 37009 + }, + { + "epoch": 1.81, + "grad_norm": 0.6352813839912415, + "learning_rate": 0.00020319010011839728, + "loss": 3.2368, + "step": 37010 + }, + { + "epoch": 1.81, + "grad_norm": 0.6481574773788452, + "learning_rate": 0.00020317552772990843, + "loss": 2.9269, + "step": 37011 + }, + { + "epoch": 1.81, + "grad_norm": 0.6353967189788818, + "learning_rate": 0.00020316095559643667, + "loss": 3.0593, + "step": 37012 + }, + { + "epoch": 1.81, + "grad_norm": 0.6281538009643555, + "learning_rate": 0.0002031463837180202, + "loss": 2.9999, + "step": 37013 + }, + { + "epoch": 1.81, + "grad_norm": 0.6500021815299988, + "learning_rate": 0.00020313181209469762, + "loss": 3.1799, + "step": 37014 + }, + { + "epoch": 1.81, + "grad_norm": 0.611717164516449, + "learning_rate": 0.00020311724072650717, + "loss": 2.9209, + "step": 37015 + }, + { + "epoch": 1.81, + "grad_norm": 0.657289981842041, + "learning_rate": 0.00020310266961348714, + "loss": 2.8855, + "step": 37016 + }, + { + "epoch": 1.81, + "grad_norm": 0.6263495683670044, + "learning_rate": 0.00020308809875567622, + "loss": 2.8046, + "step": 37017 + }, + { + "epoch": 1.81, + "grad_norm": 0.6198089122772217, + "learning_rate": 0.00020307352815311248, + "loss": 2.8548, + "step": 37018 + }, + { + "epoch": 1.81, + "grad_norm": 0.6605949401855469, + "learning_rate": 0.00020305895780583453, + "loss": 3.0922, + "step": 37019 + }, + { + "epoch": 1.81, + "grad_norm": 0.6008433103561401, + "learning_rate": 0.0002030443877138805, + "loss": 2.9261, + "step": 37020 + }, + { + "epoch": 1.81, + "grad_norm": 0.6339818835258484, + "learning_rate": 0.0002030298178772889, + "loss": 2.9345, + "step": 37021 + }, + { + "epoch": 1.81, + "grad_norm": 0.6539232730865479, + "learning_rate": 0.0002030152482960982, + "loss": 2.9125, + "step": 37022 + }, + { + "epoch": 1.81, + "grad_norm": 0.5978472232818604, + "learning_rate": 0.00020300067897034655, + "loss": 3.0355, + "step": 37023 + }, + { + "epoch": 1.81, + "grad_norm": 0.6346756219863892, + "learning_rate": 0.00020298610990007255, + "loss": 2.9446, + "step": 37024 + }, + { + "epoch": 1.81, + "grad_norm": 0.6300790905952454, + "learning_rate": 0.00020297154108531442, + "loss": 2.8722, + "step": 37025 + }, + { + "epoch": 1.81, + "grad_norm": 0.6479358077049255, + "learning_rate": 0.0002029569725261105, + "loss": 3.0601, + "step": 37026 + }, + { + "epoch": 1.81, + "grad_norm": 0.6204195618629456, + "learning_rate": 0.00020294240422249937, + "loss": 2.9397, + "step": 37027 + }, + { + "epoch": 1.81, + "grad_norm": 0.7228114008903503, + "learning_rate": 0.0002029278361745192, + "loss": 2.7625, + "step": 37028 + }, + { + "epoch": 1.81, + "grad_norm": 0.6030262112617493, + "learning_rate": 0.0002029132683822085, + "loss": 2.9429, + "step": 37029 + }, + { + "epoch": 1.81, + "grad_norm": 0.623123049736023, + "learning_rate": 0.00020289870084560542, + "loss": 2.9507, + "step": 37030 + }, + { + "epoch": 1.81, + "grad_norm": 0.6438615322113037, + "learning_rate": 0.0002028841335647485, + "loss": 3.0401, + "step": 37031 + }, + { + "epoch": 1.81, + "grad_norm": 0.7645979523658752, + "learning_rate": 0.0002028695665396762, + "loss": 3.062, + "step": 37032 + }, + { + "epoch": 1.81, + "grad_norm": 0.6688880920410156, + "learning_rate": 0.0002028549997704267, + "loss": 2.9529, + "step": 37033 + }, + { + "epoch": 1.81, + "grad_norm": 0.6559062004089355, + "learning_rate": 0.0002028404332570385, + "loss": 3.097, + "step": 37034 + }, + { + "epoch": 1.81, + "grad_norm": 0.6532920002937317, + "learning_rate": 0.00020282586699954977, + "loss": 2.7324, + "step": 37035 + }, + { + "epoch": 1.82, + "grad_norm": 0.6466086506843567, + "learning_rate": 0.0002028113009979991, + "loss": 3.0552, + "step": 37036 + }, + { + "epoch": 1.82, + "grad_norm": 0.6443438529968262, + "learning_rate": 0.00020279673525242477, + "loss": 2.9941, + "step": 37037 + }, + { + "epoch": 1.82, + "grad_norm": 0.6671386957168579, + "learning_rate": 0.00020278216976286504, + "loss": 2.95, + "step": 37038 + }, + { + "epoch": 1.82, + "grad_norm": 0.6276841163635254, + "learning_rate": 0.00020276760452935848, + "loss": 3.0235, + "step": 37039 + }, + { + "epoch": 1.82, + "grad_norm": 0.650061845779419, + "learning_rate": 0.0002027530395519433, + "loss": 3.0023, + "step": 37040 + }, + { + "epoch": 1.82, + "grad_norm": 0.6341570019721985, + "learning_rate": 0.0002027384748306578, + "loss": 2.9555, + "step": 37041 + }, + { + "epoch": 1.82, + "grad_norm": 0.6999353766441345, + "learning_rate": 0.00020272391036554058, + "loss": 3.009, + "step": 37042 + }, + { + "epoch": 1.82, + "grad_norm": 0.630683183670044, + "learning_rate": 0.00020270934615662978, + "loss": 3.1733, + "step": 37043 + }, + { + "epoch": 1.82, + "grad_norm": 0.6502330303192139, + "learning_rate": 0.00020269478220396393, + "loss": 2.8276, + "step": 37044 + }, + { + "epoch": 1.82, + "grad_norm": 0.599058210849762, + "learning_rate": 0.00020268021850758115, + "loss": 2.8763, + "step": 37045 + }, + { + "epoch": 1.82, + "grad_norm": 0.5828586220741272, + "learning_rate": 0.00020266565506752002, + "loss": 2.9274, + "step": 37046 + }, + { + "epoch": 1.82, + "grad_norm": 0.6535758376121521, + "learning_rate": 0.00020265109188381887, + "loss": 3.153, + "step": 37047 + }, + { + "epoch": 1.82, + "grad_norm": 0.624528169631958, + "learning_rate": 0.00020263652895651587, + "loss": 2.9203, + "step": 37048 + }, + { + "epoch": 1.82, + "grad_norm": 0.6232485175132751, + "learning_rate": 0.0002026219662856497, + "loss": 2.9709, + "step": 37049 + }, + { + "epoch": 1.82, + "grad_norm": 0.6133432388305664, + "learning_rate": 0.00020260740387125834, + "loss": 2.9712, + "step": 37050 + }, + { + "epoch": 1.82, + "grad_norm": 0.6627100706100464, + "learning_rate": 0.00020259284171338042, + "loss": 2.9813, + "step": 37051 + }, + { + "epoch": 1.82, + "grad_norm": 0.6731305122375488, + "learning_rate": 0.0002025782798120543, + "loss": 3.0706, + "step": 37052 + }, + { + "epoch": 1.82, + "grad_norm": 0.6375182867050171, + "learning_rate": 0.0002025637181673181, + "loss": 3.0896, + "step": 37053 + }, + { + "epoch": 1.82, + "grad_norm": 0.684897243976593, + "learning_rate": 0.0002025491567792104, + "loss": 2.8677, + "step": 37054 + }, + { + "epoch": 1.82, + "grad_norm": 0.6456209421157837, + "learning_rate": 0.00020253459564776945, + "loss": 3.1135, + "step": 37055 + }, + { + "epoch": 1.82, + "grad_norm": 0.6744033098220825, + "learning_rate": 0.00020252003477303353, + "loss": 3.0566, + "step": 37056 + }, + { + "epoch": 1.82, + "grad_norm": 0.6282669305801392, + "learning_rate": 0.0002025054741550412, + "loss": 3.0302, + "step": 37057 + }, + { + "epoch": 1.82, + "grad_norm": 0.6827451586723328, + "learning_rate": 0.00020249091379383066, + "loss": 2.8749, + "step": 37058 + }, + { + "epoch": 1.82, + "grad_norm": 0.631802499294281, + "learning_rate": 0.0002024763536894403, + "loss": 3.0074, + "step": 37059 + }, + { + "epoch": 1.82, + "grad_norm": 0.6587291359901428, + "learning_rate": 0.00020246179384190836, + "loss": 2.7468, + "step": 37060 + }, + { + "epoch": 1.82, + "grad_norm": 0.655880868434906, + "learning_rate": 0.00020244723425127335, + "loss": 3.1387, + "step": 37061 + }, + { + "epoch": 1.82, + "grad_norm": 0.6162218451499939, + "learning_rate": 0.0002024326749175736, + "loss": 3.1554, + "step": 37062 + }, + { + "epoch": 1.82, + "grad_norm": 0.6140300631523132, + "learning_rate": 0.0002024181158408473, + "loss": 3.0222, + "step": 37063 + }, + { + "epoch": 1.82, + "grad_norm": 0.6147719025611877, + "learning_rate": 0.000202403557021133, + "loss": 2.9423, + "step": 37064 + }, + { + "epoch": 1.82, + "grad_norm": 0.6162436604499817, + "learning_rate": 0.00020238899845846889, + "loss": 2.9565, + "step": 37065 + }, + { + "epoch": 1.82, + "grad_norm": 0.6531295776367188, + "learning_rate": 0.00020237444015289336, + "loss": 3.1544, + "step": 37066 + }, + { + "epoch": 1.82, + "grad_norm": 0.6456297039985657, + "learning_rate": 0.00020235988210444482, + "loss": 2.9994, + "step": 37067 + }, + { + "epoch": 1.82, + "grad_norm": 0.6418159008026123, + "learning_rate": 0.00020234532431316157, + "loss": 2.8051, + "step": 37068 + }, + { + "epoch": 1.82, + "grad_norm": 0.6710839867591858, + "learning_rate": 0.00020233076677908196, + "loss": 2.9623, + "step": 37069 + }, + { + "epoch": 1.82, + "grad_norm": 0.6115990877151489, + "learning_rate": 0.00020231620950224419, + "loss": 2.956, + "step": 37070 + }, + { + "epoch": 1.82, + "grad_norm": 0.601249098777771, + "learning_rate": 0.0002023016524826867, + "loss": 3.0491, + "step": 37071 + }, + { + "epoch": 1.82, + "grad_norm": 0.6476225256919861, + "learning_rate": 0.00020228709572044803, + "loss": 3.0574, + "step": 37072 + }, + { + "epoch": 1.82, + "grad_norm": 0.6286438703536987, + "learning_rate": 0.00020227253921556624, + "loss": 2.8102, + "step": 37073 + }, + { + "epoch": 1.82, + "grad_norm": 0.6054357886314392, + "learning_rate": 0.00020225798296807986, + "loss": 3.1466, + "step": 37074 + }, + { + "epoch": 1.82, + "grad_norm": 0.681904673576355, + "learning_rate": 0.00020224342697802705, + "loss": 3.0506, + "step": 37075 + }, + { + "epoch": 1.82, + "grad_norm": 0.6787092089653015, + "learning_rate": 0.00020222887124544625, + "loss": 3.0671, + "step": 37076 + }, + { + "epoch": 1.82, + "grad_norm": 0.6227284669876099, + "learning_rate": 0.00020221431577037584, + "loss": 3.0675, + "step": 37077 + }, + { + "epoch": 1.82, + "grad_norm": 0.6418759226799011, + "learning_rate": 0.000202199760552854, + "loss": 3.0101, + "step": 37078 + }, + { + "epoch": 1.82, + "grad_norm": 0.6091601252555847, + "learning_rate": 0.00020218520559291928, + "loss": 2.8531, + "step": 37079 + }, + { + "epoch": 1.82, + "grad_norm": 0.6450102925300598, + "learning_rate": 0.0002021706508906099, + "loss": 3.0782, + "step": 37080 + }, + { + "epoch": 1.82, + "grad_norm": 0.6219157576560974, + "learning_rate": 0.00020215609644596422, + "loss": 3.0429, + "step": 37081 + }, + { + "epoch": 1.82, + "grad_norm": 0.6502217650413513, + "learning_rate": 0.0002021415422590204, + "loss": 2.9689, + "step": 37082 + }, + { + "epoch": 1.82, + "grad_norm": 0.6541635990142822, + "learning_rate": 0.000202126988329817, + "loss": 2.9687, + "step": 37083 + }, + { + "epoch": 1.82, + "grad_norm": 0.6724274754524231, + "learning_rate": 0.00020211243465839237, + "loss": 2.9509, + "step": 37084 + }, + { + "epoch": 1.82, + "grad_norm": 0.6167418360710144, + "learning_rate": 0.0002020978812447846, + "loss": 3.0759, + "step": 37085 + }, + { + "epoch": 1.82, + "grad_norm": 0.6464261412620544, + "learning_rate": 0.0002020833280890323, + "loss": 2.7714, + "step": 37086 + }, + { + "epoch": 1.82, + "grad_norm": 0.6340137720108032, + "learning_rate": 0.0002020687751911736, + "loss": 3.0417, + "step": 37087 + }, + { + "epoch": 1.82, + "grad_norm": 0.631371259689331, + "learning_rate": 0.00020205422255124684, + "loss": 2.8924, + "step": 37088 + }, + { + "epoch": 1.82, + "grad_norm": 0.6182030439376831, + "learning_rate": 0.00020203967016929053, + "loss": 3.1116, + "step": 37089 + }, + { + "epoch": 1.82, + "grad_norm": 0.6417081356048584, + "learning_rate": 0.00020202511804534278, + "loss": 3.0076, + "step": 37090 + }, + { + "epoch": 1.82, + "grad_norm": 0.6194267868995667, + "learning_rate": 0.00020201056617944217, + "loss": 3.0782, + "step": 37091 + }, + { + "epoch": 1.82, + "grad_norm": 0.6341774463653564, + "learning_rate": 0.00020199601457162667, + "loss": 2.9582, + "step": 37092 + }, + { + "epoch": 1.82, + "grad_norm": 0.6601331233978271, + "learning_rate": 0.00020198146322193483, + "loss": 3.0365, + "step": 37093 + }, + { + "epoch": 1.82, + "grad_norm": 0.6085912585258484, + "learning_rate": 0.00020196691213040507, + "loss": 3.1794, + "step": 37094 + }, + { + "epoch": 1.82, + "grad_norm": 0.621552586555481, + "learning_rate": 0.00020195236129707552, + "loss": 3.1996, + "step": 37095 + }, + { + "epoch": 1.82, + "grad_norm": 0.6433894634246826, + "learning_rate": 0.00020193781072198468, + "loss": 2.7632, + "step": 37096 + }, + { + "epoch": 1.82, + "grad_norm": 0.633199155330658, + "learning_rate": 0.00020192326040517064, + "loss": 3.1733, + "step": 37097 + }, + { + "epoch": 1.82, + "grad_norm": 0.6254742741584778, + "learning_rate": 0.0002019087103466719, + "loss": 3.0559, + "step": 37098 + }, + { + "epoch": 1.82, + "grad_norm": 0.6049128770828247, + "learning_rate": 0.0002018941605465268, + "loss": 3.0833, + "step": 37099 + }, + { + "epoch": 1.82, + "grad_norm": 0.6894818544387817, + "learning_rate": 0.0002018796110047735, + "loss": 2.7616, + "step": 37100 + }, + { + "epoch": 1.82, + "grad_norm": 0.596031665802002, + "learning_rate": 0.0002018650617214505, + "loss": 2.9947, + "step": 37101 + }, + { + "epoch": 1.82, + "grad_norm": 0.6057289838790894, + "learning_rate": 0.000201850512696596, + "loss": 3.1525, + "step": 37102 + }, + { + "epoch": 1.82, + "grad_norm": 0.6352101564407349, + "learning_rate": 0.00020183596393024832, + "loss": 3.0793, + "step": 37103 + }, + { + "epoch": 1.82, + "grad_norm": 0.6023186445236206, + "learning_rate": 0.00020182141542244593, + "loss": 3.0214, + "step": 37104 + }, + { + "epoch": 1.82, + "grad_norm": 0.6341410875320435, + "learning_rate": 0.000201806867173227, + "loss": 2.868, + "step": 37105 + }, + { + "epoch": 1.82, + "grad_norm": 0.6453239917755127, + "learning_rate": 0.00020179231918262988, + "loss": 3.0226, + "step": 37106 + }, + { + "epoch": 1.82, + "grad_norm": 0.6963949203491211, + "learning_rate": 0.00020177777145069284, + "loss": 3.0891, + "step": 37107 + }, + { + "epoch": 1.82, + "grad_norm": 0.6552883386611938, + "learning_rate": 0.00020176322397745427, + "loss": 3.1864, + "step": 37108 + }, + { + "epoch": 1.82, + "grad_norm": 0.6849330067634583, + "learning_rate": 0.00020174867676295256, + "loss": 3.0045, + "step": 37109 + }, + { + "epoch": 1.82, + "grad_norm": 0.7277083992958069, + "learning_rate": 0.00020173412980722575, + "loss": 3.0083, + "step": 37110 + }, + { + "epoch": 1.82, + "grad_norm": 0.6667925119400024, + "learning_rate": 0.00020171958311031245, + "loss": 2.9899, + "step": 37111 + }, + { + "epoch": 1.82, + "grad_norm": 0.6309471726417542, + "learning_rate": 0.00020170503667225076, + "loss": 3.0762, + "step": 37112 + }, + { + "epoch": 1.82, + "grad_norm": 0.6317311525344849, + "learning_rate": 0.00020169049049307918, + "loss": 2.9453, + "step": 37113 + }, + { + "epoch": 1.82, + "grad_norm": 0.6759694814682007, + "learning_rate": 0.00020167594457283595, + "loss": 3.2062, + "step": 37114 + }, + { + "epoch": 1.82, + "grad_norm": 0.6160515546798706, + "learning_rate": 0.0002016613989115592, + "loss": 3.099, + "step": 37115 + }, + { + "epoch": 1.82, + "grad_norm": 0.6787965297698975, + "learning_rate": 0.00020164685350928757, + "loss": 2.8703, + "step": 37116 + }, + { + "epoch": 1.82, + "grad_norm": 0.6101091504096985, + "learning_rate": 0.00020163230836605912, + "loss": 3.2368, + "step": 37117 + }, + { + "epoch": 1.82, + "grad_norm": 0.6226139068603516, + "learning_rate": 0.00020161776348191223, + "loss": 3.125, + "step": 37118 + }, + { + "epoch": 1.82, + "grad_norm": 0.7263720035552979, + "learning_rate": 0.00020160321885688527, + "loss": 3.0666, + "step": 37119 + }, + { + "epoch": 1.82, + "grad_norm": 0.6440698504447937, + "learning_rate": 0.00020158867449101645, + "loss": 2.9552, + "step": 37120 + }, + { + "epoch": 1.82, + "grad_norm": 0.6304377317428589, + "learning_rate": 0.0002015741303843442, + "loss": 3.2824, + "step": 37121 + }, + { + "epoch": 1.82, + "grad_norm": 0.6487731337547302, + "learning_rate": 0.00020155958653690664, + "loss": 2.8647, + "step": 37122 + }, + { + "epoch": 1.82, + "grad_norm": 0.6729735732078552, + "learning_rate": 0.00020154504294874224, + "loss": 2.879, + "step": 37123 + }, + { + "epoch": 1.82, + "grad_norm": 0.6333503127098083, + "learning_rate": 0.0002015304996198893, + "loss": 3.0657, + "step": 37124 + }, + { + "epoch": 1.82, + "grad_norm": 0.6147133708000183, + "learning_rate": 0.0002015159565503859, + "loss": 3.0688, + "step": 37125 + }, + { + "epoch": 1.82, + "grad_norm": 0.6360808610916138, + "learning_rate": 0.0002015014137402707, + "loss": 3.21, + "step": 37126 + }, + { + "epoch": 1.82, + "grad_norm": 0.6403216123580933, + "learning_rate": 0.00020148687118958175, + "loss": 3.0954, + "step": 37127 + }, + { + "epoch": 1.82, + "grad_norm": 0.6334720849990845, + "learning_rate": 0.00020147232889835735, + "loss": 3.0747, + "step": 37128 + }, + { + "epoch": 1.82, + "grad_norm": 0.688264787197113, + "learning_rate": 0.000201457786866636, + "loss": 2.8259, + "step": 37129 + }, + { + "epoch": 1.82, + "grad_norm": 0.6472107172012329, + "learning_rate": 0.00020144324509445584, + "loss": 3.025, + "step": 37130 + }, + { + "epoch": 1.82, + "grad_norm": 0.6436012983322144, + "learning_rate": 0.0002014287035818552, + "loss": 2.9706, + "step": 37131 + }, + { + "epoch": 1.82, + "grad_norm": 0.6599947810173035, + "learning_rate": 0.00020141416232887232, + "loss": 2.8894, + "step": 37132 + }, + { + "epoch": 1.82, + "grad_norm": 0.65586918592453, + "learning_rate": 0.00020139962133554552, + "loss": 3.006, + "step": 37133 + }, + { + "epoch": 1.82, + "grad_norm": 0.7479538321495056, + "learning_rate": 0.0002013850806019133, + "loss": 2.8773, + "step": 37134 + }, + { + "epoch": 1.82, + "grad_norm": 0.6416367888450623, + "learning_rate": 0.00020137054012801377, + "loss": 3.0427, + "step": 37135 + }, + { + "epoch": 1.82, + "grad_norm": 0.5910057425498962, + "learning_rate": 0.00020135599991388524, + "loss": 2.9003, + "step": 37136 + }, + { + "epoch": 1.82, + "grad_norm": 0.604770839214325, + "learning_rate": 0.00020134145995956597, + "loss": 2.9453, + "step": 37137 + }, + { + "epoch": 1.82, + "grad_norm": 0.6588855385780334, + "learning_rate": 0.0002013269202650943, + "loss": 2.9322, + "step": 37138 + }, + { + "epoch": 1.82, + "grad_norm": 0.6265450716018677, + "learning_rate": 0.00020131238083050863, + "loss": 2.8692, + "step": 37139 + }, + { + "epoch": 1.82, + "grad_norm": 0.607767641544342, + "learning_rate": 0.00020129784165584703, + "loss": 3.0773, + "step": 37140 + }, + { + "epoch": 1.82, + "grad_norm": 0.660254716873169, + "learning_rate": 0.00020128330274114803, + "loss": 2.9246, + "step": 37141 + }, + { + "epoch": 1.82, + "grad_norm": 0.6739830374717712, + "learning_rate": 0.00020126876408644976, + "loss": 2.7952, + "step": 37142 + }, + { + "epoch": 1.82, + "grad_norm": 0.6582867503166199, + "learning_rate": 0.00020125422569179053, + "loss": 2.9389, + "step": 37143 + }, + { + "epoch": 1.82, + "grad_norm": 0.6650797724723816, + "learning_rate": 0.00020123968755720876, + "loss": 2.9109, + "step": 37144 + }, + { + "epoch": 1.82, + "grad_norm": 0.7328686118125916, + "learning_rate": 0.00020122514968274257, + "loss": 3.0024, + "step": 37145 + }, + { + "epoch": 1.82, + "grad_norm": 0.6152870059013367, + "learning_rate": 0.00020121061206843042, + "loss": 2.9541, + "step": 37146 + }, + { + "epoch": 1.82, + "grad_norm": 0.603813648223877, + "learning_rate": 0.00020119607471431034, + "loss": 2.8763, + "step": 37147 + }, + { + "epoch": 1.82, + "grad_norm": 0.646477222442627, + "learning_rate": 0.00020118153762042088, + "loss": 3.0344, + "step": 37148 + }, + { + "epoch": 1.82, + "grad_norm": 0.7055385112762451, + "learning_rate": 0.0002011670007868003, + "loss": 3.0673, + "step": 37149 + }, + { + "epoch": 1.82, + "grad_norm": 0.6229759454727173, + "learning_rate": 0.00020115246421348668, + "loss": 3.1857, + "step": 37150 + }, + { + "epoch": 1.82, + "grad_norm": 0.6314061284065247, + "learning_rate": 0.00020113792790051854, + "loss": 3.0096, + "step": 37151 + }, + { + "epoch": 1.82, + "grad_norm": 0.6126551032066345, + "learning_rate": 0.000201123391847934, + "loss": 2.9149, + "step": 37152 + }, + { + "epoch": 1.82, + "grad_norm": 0.6453806161880493, + "learning_rate": 0.00020110885605577146, + "loss": 3.0979, + "step": 37153 + }, + { + "epoch": 1.82, + "grad_norm": 0.6280014514923096, + "learning_rate": 0.0002010943205240692, + "loss": 2.8085, + "step": 37154 + }, + { + "epoch": 1.82, + "grad_norm": 0.6255245208740234, + "learning_rate": 0.00020107978525286537, + "loss": 3.2936, + "step": 37155 + }, + { + "epoch": 1.82, + "grad_norm": 0.6780776977539062, + "learning_rate": 0.0002010652502421985, + "loss": 2.9829, + "step": 37156 + }, + { + "epoch": 1.82, + "grad_norm": 0.7052608132362366, + "learning_rate": 0.0002010507154921066, + "loss": 2.8949, + "step": 37157 + }, + { + "epoch": 1.82, + "grad_norm": 0.6485824584960938, + "learning_rate": 0.00020103618100262815, + "loss": 2.9753, + "step": 37158 + }, + { + "epoch": 1.82, + "grad_norm": 0.6526051163673401, + "learning_rate": 0.00020102164677380122, + "loss": 3.1261, + "step": 37159 + }, + { + "epoch": 1.82, + "grad_norm": 0.6495037078857422, + "learning_rate": 0.00020100711280566431, + "loss": 2.9435, + "step": 37160 + }, + { + "epoch": 1.82, + "grad_norm": 0.5939731597900391, + "learning_rate": 0.00020099257909825567, + "loss": 3.1176, + "step": 37161 + }, + { + "epoch": 1.82, + "grad_norm": 0.6784581542015076, + "learning_rate": 0.00020097804565161342, + "loss": 3.1203, + "step": 37162 + }, + { + "epoch": 1.82, + "grad_norm": 0.6253423690795898, + "learning_rate": 0.000200963512465776, + "loss": 3.0622, + "step": 37163 + }, + { + "epoch": 1.82, + "grad_norm": 0.6293935179710388, + "learning_rate": 0.00020094897954078163, + "loss": 3.1353, + "step": 37164 + }, + { + "epoch": 1.82, + "grad_norm": 0.7159479856491089, + "learning_rate": 0.0002009344468766685, + "loss": 3.2139, + "step": 37165 + }, + { + "epoch": 1.82, + "grad_norm": 0.6202086806297302, + "learning_rate": 0.0002009199144734751, + "loss": 3.0405, + "step": 37166 + }, + { + "epoch": 1.82, + "grad_norm": 0.6366106867790222, + "learning_rate": 0.00020090538233123948, + "loss": 3.0614, + "step": 37167 + }, + { + "epoch": 1.82, + "grad_norm": 0.6750251650810242, + "learning_rate": 0.00020089085045000012, + "loss": 3.0452, + "step": 37168 + }, + { + "epoch": 1.82, + "grad_norm": 0.6211447715759277, + "learning_rate": 0.00020087631882979504, + "loss": 3.0842, + "step": 37169 + }, + { + "epoch": 1.82, + "grad_norm": 0.6459939479827881, + "learning_rate": 0.00020086178747066272, + "loss": 2.9701, + "step": 37170 + }, + { + "epoch": 1.82, + "grad_norm": 0.6417645812034607, + "learning_rate": 0.00020084725637264147, + "loss": 2.9645, + "step": 37171 + }, + { + "epoch": 1.82, + "grad_norm": 0.610141396522522, + "learning_rate": 0.00020083272553576929, + "loss": 2.9528, + "step": 37172 + }, + { + "epoch": 1.82, + "grad_norm": 0.6073371171951294, + "learning_rate": 0.00020081819496008477, + "loss": 3.0393, + "step": 37173 + }, + { + "epoch": 1.82, + "grad_norm": 0.6643429398536682, + "learning_rate": 0.00020080366464562594, + "loss": 2.9228, + "step": 37174 + }, + { + "epoch": 1.82, + "grad_norm": 0.6577807664871216, + "learning_rate": 0.00020078913459243119, + "loss": 3.1136, + "step": 37175 + }, + { + "epoch": 1.82, + "grad_norm": 0.5988003015518188, + "learning_rate": 0.00020077460480053883, + "loss": 2.9344, + "step": 37176 + }, + { + "epoch": 1.82, + "grad_norm": 0.6243207454681396, + "learning_rate": 0.00020076007526998696, + "loss": 2.9445, + "step": 37177 + }, + { + "epoch": 1.82, + "grad_norm": 0.6623547077178955, + "learning_rate": 0.00020074554600081406, + "loss": 2.9545, + "step": 37178 + }, + { + "epoch": 1.82, + "grad_norm": 0.6492032408714294, + "learning_rate": 0.00020073101699305822, + "loss": 2.8503, + "step": 37179 + }, + { + "epoch": 1.82, + "grad_norm": 0.6891651153564453, + "learning_rate": 0.00020071648824675773, + "loss": 3.1972, + "step": 37180 + }, + { + "epoch": 1.82, + "grad_norm": 0.6541674733161926, + "learning_rate": 0.00020070195976195105, + "loss": 2.9552, + "step": 37181 + }, + { + "epoch": 1.82, + "grad_norm": 0.6345112323760986, + "learning_rate": 0.0002006874315386762, + "loss": 2.9296, + "step": 37182 + }, + { + "epoch": 1.82, + "grad_norm": 0.6238939762115479, + "learning_rate": 0.00020067290357697162, + "loss": 2.9192, + "step": 37183 + }, + { + "epoch": 1.82, + "grad_norm": 0.6258445978164673, + "learning_rate": 0.00020065837587687535, + "loss": 2.798, + "step": 37184 + }, + { + "epoch": 1.82, + "grad_norm": 0.6790409088134766, + "learning_rate": 0.00020064384843842588, + "loss": 3.1312, + "step": 37185 + }, + { + "epoch": 1.82, + "grad_norm": 0.6279799342155457, + "learning_rate": 0.00020062932126166147, + "loss": 2.8466, + "step": 37186 + }, + { + "epoch": 1.82, + "grad_norm": 0.6571778059005737, + "learning_rate": 0.00020061479434662017, + "loss": 3.1089, + "step": 37187 + }, + { + "epoch": 1.82, + "grad_norm": 0.6268858909606934, + "learning_rate": 0.0002006002676933405, + "loss": 2.922, + "step": 37188 + }, + { + "epoch": 1.82, + "grad_norm": 0.6086148023605347, + "learning_rate": 0.0002005857413018605, + "loss": 2.881, + "step": 37189 + }, + { + "epoch": 1.82, + "grad_norm": 0.6448004245758057, + "learning_rate": 0.0002005712151722185, + "loss": 2.9411, + "step": 37190 + }, + { + "epoch": 1.82, + "grad_norm": 0.6258628964424133, + "learning_rate": 0.0002005566893044529, + "loss": 3.1739, + "step": 37191 + }, + { + "epoch": 1.82, + "grad_norm": 0.6352370381355286, + "learning_rate": 0.00020054216369860177, + "loss": 2.9686, + "step": 37192 + }, + { + "epoch": 1.82, + "grad_norm": 0.6704267859458923, + "learning_rate": 0.0002005276383547035, + "loss": 3.0863, + "step": 37193 + }, + { + "epoch": 1.82, + "grad_norm": 0.6442691087722778, + "learning_rate": 0.00020051311327279618, + "loss": 2.9182, + "step": 37194 + }, + { + "epoch": 1.82, + "grad_norm": 0.6933001279830933, + "learning_rate": 0.00020049858845291814, + "loss": 3.0166, + "step": 37195 + }, + { + "epoch": 1.82, + "grad_norm": 0.6700044870376587, + "learning_rate": 0.00020048406389510786, + "loss": 2.9624, + "step": 37196 + }, + { + "epoch": 1.82, + "grad_norm": 0.6275744438171387, + "learning_rate": 0.00020046953959940324, + "loss": 2.9598, + "step": 37197 + }, + { + "epoch": 1.82, + "grad_norm": 0.6295133829116821, + "learning_rate": 0.00020045501556584285, + "loss": 2.9946, + "step": 37198 + }, + { + "epoch": 1.82, + "grad_norm": 0.6550989747047424, + "learning_rate": 0.00020044049179446463, + "loss": 2.8582, + "step": 37199 + }, + { + "epoch": 1.82, + "grad_norm": 0.6450958251953125, + "learning_rate": 0.00020042596828530706, + "loss": 2.9827, + "step": 37200 + }, + { + "epoch": 1.82, + "grad_norm": 0.6483104228973389, + "learning_rate": 0.00020041144503840836, + "loss": 2.8909, + "step": 37201 + }, + { + "epoch": 1.82, + "grad_norm": 0.6364044547080994, + "learning_rate": 0.00020039692205380666, + "loss": 2.9006, + "step": 37202 + }, + { + "epoch": 1.82, + "grad_norm": 0.6128686666488647, + "learning_rate": 0.00020038239933154042, + "loss": 2.91, + "step": 37203 + }, + { + "epoch": 1.82, + "grad_norm": 0.6625557541847229, + "learning_rate": 0.0002003678768716477, + "loss": 2.8524, + "step": 37204 + }, + { + "epoch": 1.82, + "grad_norm": 0.6292389631271362, + "learning_rate": 0.00020035335467416676, + "loss": 3.1534, + "step": 37205 + }, + { + "epoch": 1.82, + "grad_norm": 0.6541522741317749, + "learning_rate": 0.000200338832739136, + "loss": 2.8124, + "step": 37206 + }, + { + "epoch": 1.82, + "grad_norm": 0.6432307958602905, + "learning_rate": 0.00020032431106659353, + "loss": 2.9206, + "step": 37207 + }, + { + "epoch": 1.82, + "grad_norm": 0.6428229212760925, + "learning_rate": 0.00020030978965657775, + "loss": 2.7471, + "step": 37208 + }, + { + "epoch": 1.82, + "grad_norm": 0.6233975887298584, + "learning_rate": 0.00020029526850912663, + "loss": 2.965, + "step": 37209 + }, + { + "epoch": 1.82, + "grad_norm": 0.6788715720176697, + "learning_rate": 0.00020028074762427865, + "loss": 3.0143, + "step": 37210 + }, + { + "epoch": 1.82, + "grad_norm": 0.6513542532920837, + "learning_rate": 0.00020026622700207206, + "loss": 3.0382, + "step": 37211 + }, + { + "epoch": 1.82, + "grad_norm": 0.6843482255935669, + "learning_rate": 0.00020025170664254488, + "loss": 2.7828, + "step": 37212 + }, + { + "epoch": 1.82, + "grad_norm": 0.5919236540794373, + "learning_rate": 0.00020023718654573567, + "loss": 2.951, + "step": 37213 + }, + { + "epoch": 1.82, + "grad_norm": 0.6735742092132568, + "learning_rate": 0.0002002226667116824, + "loss": 3.0408, + "step": 37214 + }, + { + "epoch": 1.82, + "grad_norm": 0.6410362720489502, + "learning_rate": 0.0002002081471404234, + "loss": 3.0412, + "step": 37215 + }, + { + "epoch": 1.82, + "grad_norm": 0.623496949672699, + "learning_rate": 0.0002001936278319971, + "loss": 3.1536, + "step": 37216 + }, + { + "epoch": 1.82, + "grad_norm": 0.5960062742233276, + "learning_rate": 0.0002001791087864414, + "loss": 3.2316, + "step": 37217 + }, + { + "epoch": 1.82, + "grad_norm": 0.6214715242385864, + "learning_rate": 0.00020016459000379483, + "loss": 2.8934, + "step": 37218 + }, + { + "epoch": 1.82, + "grad_norm": 0.6391525268554688, + "learning_rate": 0.00020015007148409548, + "loss": 2.8583, + "step": 37219 + }, + { + "epoch": 1.82, + "grad_norm": 0.616982102394104, + "learning_rate": 0.00020013555322738157, + "loss": 3.1318, + "step": 37220 + }, + { + "epoch": 1.82, + "grad_norm": 0.665523111820221, + "learning_rate": 0.0002001210352336915, + "loss": 2.9528, + "step": 37221 + }, + { + "epoch": 1.82, + "grad_norm": 0.6505708694458008, + "learning_rate": 0.00020010651750306332, + "loss": 2.9226, + "step": 37222 + }, + { + "epoch": 1.82, + "grad_norm": 0.6481497883796692, + "learning_rate": 0.00020009200003553545, + "loss": 3.2122, + "step": 37223 + }, + { + "epoch": 1.82, + "grad_norm": 0.6302831768989563, + "learning_rate": 0.00020007748283114586, + "loss": 3.0263, + "step": 37224 + }, + { + "epoch": 1.82, + "grad_norm": 0.6752156019210815, + "learning_rate": 0.00020006296588993304, + "loss": 3.0486, + "step": 37225 + }, + { + "epoch": 1.82, + "grad_norm": 0.6285132169723511, + "learning_rate": 0.0002000484492119352, + "loss": 2.983, + "step": 37226 + }, + { + "epoch": 1.82, + "grad_norm": 0.651867151260376, + "learning_rate": 0.0002000339327971904, + "loss": 3.0873, + "step": 37227 + }, + { + "epoch": 1.82, + "grad_norm": 0.6278545260429382, + "learning_rate": 0.00020001941664573705, + "loss": 3.0538, + "step": 37228 + }, + { + "epoch": 1.82, + "grad_norm": 0.6236552000045776, + "learning_rate": 0.0002000049007576133, + "loss": 2.9181, + "step": 37229 + }, + { + "epoch": 1.82, + "grad_norm": 0.647704005241394, + "learning_rate": 0.00019999038513285732, + "loss": 2.9183, + "step": 37230 + }, + { + "epoch": 1.82, + "grad_norm": 0.6166619658470154, + "learning_rate": 0.00019997586977150752, + "loss": 3.0254, + "step": 37231 + }, + { + "epoch": 1.82, + "grad_norm": 0.636547863483429, + "learning_rate": 0.000199961354673602, + "loss": 3.1082, + "step": 37232 + }, + { + "epoch": 1.82, + "grad_norm": 0.6511338353157043, + "learning_rate": 0.00019994683983917905, + "loss": 2.9792, + "step": 37233 + }, + { + "epoch": 1.82, + "grad_norm": 0.6390637755393982, + "learning_rate": 0.0001999323252682768, + "loss": 3.0991, + "step": 37234 + }, + { + "epoch": 1.82, + "grad_norm": 0.6388706564903259, + "learning_rate": 0.0001999178109609335, + "loss": 3.0653, + "step": 37235 + }, + { + "epoch": 1.82, + "grad_norm": 0.629075825214386, + "learning_rate": 0.00019990329691718753, + "loss": 2.9239, + "step": 37236 + }, + { + "epoch": 1.82, + "grad_norm": 0.6319878101348877, + "learning_rate": 0.00019988878313707698, + "loss": 3.2043, + "step": 37237 + }, + { + "epoch": 1.82, + "grad_norm": 0.6719925403594971, + "learning_rate": 0.0001998742696206402, + "loss": 3.0834, + "step": 37238 + }, + { + "epoch": 1.82, + "grad_norm": 0.6301445960998535, + "learning_rate": 0.00019985975636791517, + "loss": 3.2402, + "step": 37239 + }, + { + "epoch": 1.83, + "grad_norm": 0.6451971530914307, + "learning_rate": 0.00019984524337894038, + "loss": 3.2114, + "step": 37240 + }, + { + "epoch": 1.83, + "grad_norm": 0.6485806107521057, + "learning_rate": 0.00019983073065375393, + "loss": 2.8606, + "step": 37241 + }, + { + "epoch": 1.83, + "grad_norm": 0.6495173573493958, + "learning_rate": 0.00019981621819239396, + "loss": 2.9877, + "step": 37242 + }, + { + "epoch": 1.83, + "grad_norm": 0.6307535171508789, + "learning_rate": 0.0001998017059948989, + "loss": 2.8788, + "step": 37243 + }, + { + "epoch": 1.83, + "grad_norm": 0.6974294781684875, + "learning_rate": 0.0001997871940613068, + "loss": 3.1262, + "step": 37244 + }, + { + "epoch": 1.83, + "grad_norm": 0.6444449424743652, + "learning_rate": 0.00019977268239165604, + "loss": 3.0831, + "step": 37245 + }, + { + "epoch": 1.83, + "grad_norm": 0.6444473266601562, + "learning_rate": 0.0001997581709859846, + "loss": 2.886, + "step": 37246 + }, + { + "epoch": 1.83, + "grad_norm": 0.6505341529846191, + "learning_rate": 0.0001997436598443309, + "loss": 2.8883, + "step": 37247 + }, + { + "epoch": 1.83, + "grad_norm": 0.6732316017150879, + "learning_rate": 0.00019972914896673318, + "loss": 3.0509, + "step": 37248 + }, + { + "epoch": 1.83, + "grad_norm": 0.6295874714851379, + "learning_rate": 0.00019971463835322947, + "loss": 3.2458, + "step": 37249 + }, + { + "epoch": 1.83, + "grad_norm": 0.6318504214286804, + "learning_rate": 0.00019970012800385822, + "loss": 2.9675, + "step": 37250 + }, + { + "epoch": 1.83, + "grad_norm": 0.6528605818748474, + "learning_rate": 0.00019968561791865744, + "loss": 2.8613, + "step": 37251 + }, + { + "epoch": 1.83, + "grad_norm": 0.649989902973175, + "learning_rate": 0.0001996711080976654, + "loss": 3.1373, + "step": 37252 + }, + { + "epoch": 1.83, + "grad_norm": 0.6412109136581421, + "learning_rate": 0.00019965659854092048, + "loss": 2.7814, + "step": 37253 + }, + { + "epoch": 1.83, + "grad_norm": 0.6689494848251343, + "learning_rate": 0.0001996420892484606, + "loss": 2.877, + "step": 37254 + }, + { + "epoch": 1.83, + "grad_norm": 0.6729632616043091, + "learning_rate": 0.00019962758022032441, + "loss": 3.1106, + "step": 37255 + }, + { + "epoch": 1.83, + "grad_norm": 0.6817084550857544, + "learning_rate": 0.00019961307145654962, + "loss": 3.0034, + "step": 37256 + }, + { + "epoch": 1.83, + "grad_norm": 0.6519462466239929, + "learning_rate": 0.0001995985629571747, + "loss": 3.2566, + "step": 37257 + }, + { + "epoch": 1.83, + "grad_norm": 0.6344906687736511, + "learning_rate": 0.00019958405472223796, + "loss": 3.128, + "step": 37258 + }, + { + "epoch": 1.83, + "grad_norm": 0.6581130027770996, + "learning_rate": 0.0001995695467517774, + "loss": 3.1058, + "step": 37259 + }, + { + "epoch": 1.83, + "grad_norm": 0.6368814706802368, + "learning_rate": 0.00019955503904583142, + "loss": 2.973, + "step": 37260 + }, + { + "epoch": 1.83, + "grad_norm": 0.6296173334121704, + "learning_rate": 0.000199540531604438, + "loss": 2.9619, + "step": 37261 + }, + { + "epoch": 1.83, + "grad_norm": 0.6157219409942627, + "learning_rate": 0.00019952602442763553, + "loss": 3.0496, + "step": 37262 + }, + { + "epoch": 1.83, + "grad_norm": 0.645746111869812, + "learning_rate": 0.0001995115175154623, + "loss": 2.9254, + "step": 37263 + }, + { + "epoch": 1.83, + "grad_norm": 0.6452582478523254, + "learning_rate": 0.00019949701086795625, + "loss": 2.8589, + "step": 37264 + }, + { + "epoch": 1.83, + "grad_norm": 0.6056995987892151, + "learning_rate": 0.00019948250448515578, + "loss": 2.9054, + "step": 37265 + }, + { + "epoch": 1.83, + "grad_norm": 0.649617612361908, + "learning_rate": 0.00019946799836709908, + "loss": 2.7798, + "step": 37266 + }, + { + "epoch": 1.83, + "grad_norm": 0.6778903007507324, + "learning_rate": 0.0001994534925138242, + "loss": 2.7532, + "step": 37267 + }, + { + "epoch": 1.83, + "grad_norm": 0.6305443644523621, + "learning_rate": 0.0001994389869253696, + "loss": 3.0986, + "step": 37268 + }, + { + "epoch": 1.83, + "grad_norm": 0.6616302132606506, + "learning_rate": 0.00019942448160177335, + "loss": 3.2042, + "step": 37269 + }, + { + "epoch": 1.83, + "grad_norm": 0.6154059767723083, + "learning_rate": 0.00019940997654307367, + "loss": 2.8201, + "step": 37270 + }, + { + "epoch": 1.83, + "grad_norm": 0.6408655643463135, + "learning_rate": 0.00019939547174930867, + "loss": 3.0539, + "step": 37271 + }, + { + "epoch": 1.83, + "grad_norm": 0.6339004635810852, + "learning_rate": 0.00019938096722051666, + "loss": 2.887, + "step": 37272 + }, + { + "epoch": 1.83, + "grad_norm": 0.6301832795143127, + "learning_rate": 0.00019936646295673586, + "loss": 3.0111, + "step": 37273 + }, + { + "epoch": 1.83, + "grad_norm": 0.6578487157821655, + "learning_rate": 0.00019935195895800435, + "loss": 3.0007, + "step": 37274 + }, + { + "epoch": 1.83, + "grad_norm": 0.6201621294021606, + "learning_rate": 0.00019933745522436052, + "loss": 3.0404, + "step": 37275 + }, + { + "epoch": 1.83, + "grad_norm": 0.63294517993927, + "learning_rate": 0.00019932295175584234, + "loss": 2.8971, + "step": 37276 + }, + { + "epoch": 1.83, + "grad_norm": 0.6506541967391968, + "learning_rate": 0.00019930844855248816, + "loss": 3.0214, + "step": 37277 + }, + { + "epoch": 1.83, + "grad_norm": 0.6443052887916565, + "learning_rate": 0.00019929394561433628, + "loss": 2.9546, + "step": 37278 + }, + { + "epoch": 1.83, + "grad_norm": 0.641716480255127, + "learning_rate": 0.00019927944294142455, + "loss": 3.1024, + "step": 37279 + }, + { + "epoch": 1.83, + "grad_norm": 0.6462839245796204, + "learning_rate": 0.00019926494053379155, + "loss": 3.033, + "step": 37280 + }, + { + "epoch": 1.83, + "grad_norm": 0.6586825251579285, + "learning_rate": 0.00019925043839147527, + "loss": 2.9807, + "step": 37281 + }, + { + "epoch": 1.83, + "grad_norm": 0.6649317145347595, + "learning_rate": 0.00019923593651451386, + "loss": 3.1034, + "step": 37282 + }, + { + "epoch": 1.83, + "grad_norm": 0.6174647212028503, + "learning_rate": 0.00019922143490294576, + "loss": 2.9981, + "step": 37283 + }, + { + "epoch": 1.83, + "grad_norm": 0.7386031746864319, + "learning_rate": 0.00019920693355680893, + "loss": 2.9414, + "step": 37284 + }, + { + "epoch": 1.83, + "grad_norm": 0.7577504515647888, + "learning_rate": 0.00019919243247614168, + "loss": 3.0452, + "step": 37285 + }, + { + "epoch": 1.83, + "grad_norm": 0.624578058719635, + "learning_rate": 0.00019917793166098204, + "loss": 3.1585, + "step": 37286 + }, + { + "epoch": 1.83, + "grad_norm": 0.6715158224105835, + "learning_rate": 0.0001991634311113684, + "loss": 3.2085, + "step": 37287 + }, + { + "epoch": 1.83, + "grad_norm": 0.672528862953186, + "learning_rate": 0.00019914893082733888, + "loss": 3.1, + "step": 37288 + }, + { + "epoch": 1.83, + "grad_norm": 0.6003857254981995, + "learning_rate": 0.0001991344308089316, + "loss": 3.0433, + "step": 37289 + }, + { + "epoch": 1.83, + "grad_norm": 0.6297593116760254, + "learning_rate": 0.00019911993105618494, + "loss": 2.9949, + "step": 37290 + }, + { + "epoch": 1.83, + "grad_norm": 0.6269044280052185, + "learning_rate": 0.00019910543156913688, + "loss": 3.0166, + "step": 37291 + }, + { + "epoch": 1.83, + "grad_norm": 0.6183241009712219, + "learning_rate": 0.00019909093234782565, + "loss": 3.0963, + "step": 37292 + }, + { + "epoch": 1.83, + "grad_norm": 0.6317001581192017, + "learning_rate": 0.0001990764333922896, + "loss": 3.1374, + "step": 37293 + }, + { + "epoch": 1.83, + "grad_norm": 0.6698792576789856, + "learning_rate": 0.00019906193470256678, + "loss": 2.7781, + "step": 37294 + }, + { + "epoch": 1.83, + "grad_norm": 0.6696431040763855, + "learning_rate": 0.00019904743627869543, + "loss": 3.0622, + "step": 37295 + }, + { + "epoch": 1.83, + "grad_norm": 0.6266303062438965, + "learning_rate": 0.00019903293812071357, + "loss": 3.1569, + "step": 37296 + }, + { + "epoch": 1.83, + "grad_norm": 0.6246880292892456, + "learning_rate": 0.0001990184402286595, + "loss": 3.3158, + "step": 37297 + }, + { + "epoch": 1.83, + "grad_norm": 0.7014569044113159, + "learning_rate": 0.0001990039426025716, + "loss": 2.9063, + "step": 37298 + }, + { + "epoch": 1.83, + "grad_norm": 0.5825400352478027, + "learning_rate": 0.0001989894452424878, + "loss": 2.9569, + "step": 37299 + }, + { + "epoch": 1.83, + "grad_norm": 0.6470881104469299, + "learning_rate": 0.00019897494814844643, + "loss": 2.9374, + "step": 37300 + }, + { + "epoch": 1.83, + "grad_norm": 0.6545881628990173, + "learning_rate": 0.00019896045132048544, + "loss": 3.0919, + "step": 37301 + }, + { + "epoch": 1.83, + "grad_norm": 0.701209306716919, + "learning_rate": 0.00019894595475864325, + "loss": 3.0953, + "step": 37302 + }, + { + "epoch": 1.83, + "grad_norm": 0.7519078254699707, + "learning_rate": 0.00019893145846295806, + "loss": 2.9928, + "step": 37303 + }, + { + "epoch": 1.83, + "grad_norm": 0.6551284193992615, + "learning_rate": 0.00019891696243346782, + "loss": 3.2031, + "step": 37304 + }, + { + "epoch": 1.83, + "grad_norm": 0.6280120611190796, + "learning_rate": 0.00019890246667021095, + "loss": 3.0596, + "step": 37305 + }, + { + "epoch": 1.83, + "grad_norm": 0.62892746925354, + "learning_rate": 0.0001988879711732255, + "loss": 2.8681, + "step": 37306 + }, + { + "epoch": 1.83, + "grad_norm": 0.6775756478309631, + "learning_rate": 0.00019887347594254958, + "loss": 2.9004, + "step": 37307 + }, + { + "epoch": 1.83, + "grad_norm": 0.6382337808609009, + "learning_rate": 0.0001988589809782216, + "loss": 2.8997, + "step": 37308 + }, + { + "epoch": 1.83, + "grad_norm": 0.639171838760376, + "learning_rate": 0.00019884448628027955, + "loss": 3.1746, + "step": 37309 + }, + { + "epoch": 1.83, + "grad_norm": 0.643237829208374, + "learning_rate": 0.0001988299918487617, + "loss": 3.0516, + "step": 37310 + }, + { + "epoch": 1.83, + "grad_norm": 0.6277839541435242, + "learning_rate": 0.00019881549768370604, + "loss": 2.9276, + "step": 37311 + }, + { + "epoch": 1.83, + "grad_norm": 0.6903034448623657, + "learning_rate": 0.000198801003785151, + "loss": 2.7113, + "step": 37312 + }, + { + "epoch": 1.83, + "grad_norm": 0.6506561636924744, + "learning_rate": 0.00019878651015313464, + "loss": 3.142, + "step": 37313 + }, + { + "epoch": 1.83, + "grad_norm": 0.6318259835243225, + "learning_rate": 0.00019877201678769504, + "loss": 3.018, + "step": 37314 + }, + { + "epoch": 1.83, + "grad_norm": 0.660740852355957, + "learning_rate": 0.0001987575236888706, + "loss": 3.2023, + "step": 37315 + }, + { + "epoch": 1.83, + "grad_norm": 0.6094775199890137, + "learning_rate": 0.00019874303085669923, + "loss": 2.8146, + "step": 37316 + }, + { + "epoch": 1.83, + "grad_norm": 0.64765465259552, + "learning_rate": 0.0001987285382912194, + "loss": 3.1413, + "step": 37317 + }, + { + "epoch": 1.83, + "grad_norm": 0.6887152791023254, + "learning_rate": 0.00019871404599246893, + "loss": 3.0623, + "step": 37318 + }, + { + "epoch": 1.83, + "grad_norm": 0.6461672782897949, + "learning_rate": 0.0001986995539604862, + "loss": 3.0541, + "step": 37319 + }, + { + "epoch": 1.83, + "grad_norm": 0.6719304323196411, + "learning_rate": 0.0001986850621953094, + "loss": 3.1318, + "step": 37320 + }, + { + "epoch": 1.83, + "grad_norm": 0.6449258327484131, + "learning_rate": 0.00019867057069697665, + "loss": 2.7289, + "step": 37321 + }, + { + "epoch": 1.83, + "grad_norm": 0.6388571858406067, + "learning_rate": 0.00019865607946552616, + "loss": 2.9219, + "step": 37322 + }, + { + "epoch": 1.83, + "grad_norm": 0.6894384622573853, + "learning_rate": 0.00019864158850099595, + "loss": 3.1242, + "step": 37323 + }, + { + "epoch": 1.83, + "grad_norm": 0.6467133164405823, + "learning_rate": 0.00019862709780342432, + "loss": 3.1348, + "step": 37324 + }, + { + "epoch": 1.83, + "grad_norm": 0.6446911096572876, + "learning_rate": 0.0001986126073728495, + "loss": 2.8621, + "step": 37325 + }, + { + "epoch": 1.83, + "grad_norm": 0.6543609499931335, + "learning_rate": 0.0001985981172093094, + "loss": 2.8432, + "step": 37326 + }, + { + "epoch": 1.83, + "grad_norm": 0.6412947773933411, + "learning_rate": 0.00019858362731284247, + "loss": 2.9529, + "step": 37327 + }, + { + "epoch": 1.83, + "grad_norm": 0.6755062341690063, + "learning_rate": 0.0001985691376834867, + "loss": 3.0239, + "step": 37328 + }, + { + "epoch": 1.83, + "grad_norm": 0.6204738616943359, + "learning_rate": 0.00019855464832128024, + "loss": 3.0975, + "step": 37329 + }, + { + "epoch": 1.83, + "grad_norm": 0.6035327315330505, + "learning_rate": 0.00019854015922626141, + "loss": 3.0631, + "step": 37330 + }, + { + "epoch": 1.83, + "grad_norm": 0.6611145734786987, + "learning_rate": 0.00019852567039846825, + "loss": 2.9024, + "step": 37331 + }, + { + "epoch": 1.83, + "grad_norm": 0.6760570406913757, + "learning_rate": 0.00019851118183793898, + "loss": 2.8843, + "step": 37332 + }, + { + "epoch": 1.83, + "grad_norm": 0.6354165077209473, + "learning_rate": 0.00019849669354471162, + "loss": 3.0704, + "step": 37333 + }, + { + "epoch": 1.83, + "grad_norm": 0.6146211624145508, + "learning_rate": 0.00019848220551882447, + "loss": 3.0125, + "step": 37334 + }, + { + "epoch": 1.83, + "grad_norm": 0.726223349571228, + "learning_rate": 0.00019846771776031572, + "loss": 2.838, + "step": 37335 + }, + { + "epoch": 1.83, + "grad_norm": 0.6218348145484924, + "learning_rate": 0.00019845323026922339, + "loss": 3.0697, + "step": 37336 + }, + { + "epoch": 1.83, + "grad_norm": 0.6220173835754395, + "learning_rate": 0.00019843874304558578, + "loss": 3.224, + "step": 37337 + }, + { + "epoch": 1.83, + "grad_norm": 0.6685629487037659, + "learning_rate": 0.00019842425608944085, + "loss": 3.0376, + "step": 37338 + }, + { + "epoch": 1.83, + "grad_norm": 0.6236324906349182, + "learning_rate": 0.00019840976940082696, + "loss": 3.0332, + "step": 37339 + }, + { + "epoch": 1.83, + "grad_norm": 0.6823529601097107, + "learning_rate": 0.00019839528297978227, + "loss": 2.8418, + "step": 37340 + }, + { + "epoch": 1.83, + "grad_norm": 0.6390722393989563, + "learning_rate": 0.0001983807968263447, + "loss": 3.0181, + "step": 37341 + }, + { + "epoch": 1.83, + "grad_norm": 0.6559518575668335, + "learning_rate": 0.00019836631094055266, + "loss": 2.774, + "step": 37342 + }, + { + "epoch": 1.83, + "grad_norm": 0.6333982348442078, + "learning_rate": 0.0001983518253224442, + "loss": 2.8799, + "step": 37343 + }, + { + "epoch": 1.83, + "grad_norm": 0.6427645683288574, + "learning_rate": 0.00019833733997205735, + "loss": 2.924, + "step": 37344 + }, + { + "epoch": 1.83, + "grad_norm": 0.6432594656944275, + "learning_rate": 0.0001983228548894305, + "loss": 2.9778, + "step": 37345 + }, + { + "epoch": 1.83, + "grad_norm": 0.6509485244750977, + "learning_rate": 0.00019830837007460162, + "loss": 3.065, + "step": 37346 + }, + { + "epoch": 1.83, + "grad_norm": 0.6645312905311584, + "learning_rate": 0.000198293885527609, + "loss": 3.0997, + "step": 37347 + }, + { + "epoch": 1.83, + "grad_norm": 0.6213983297348022, + "learning_rate": 0.00019827940124849061, + "loss": 3.0474, + "step": 37348 + }, + { + "epoch": 1.83, + "grad_norm": 0.7174972295761108, + "learning_rate": 0.00019826491723728474, + "loss": 3.0375, + "step": 37349 + }, + { + "epoch": 1.83, + "grad_norm": 0.6308889389038086, + "learning_rate": 0.00019825043349402954, + "loss": 2.937, + "step": 37350 + }, + { + "epoch": 1.83, + "grad_norm": 0.6400142312049866, + "learning_rate": 0.00019823595001876301, + "loss": 3.1328, + "step": 37351 + }, + { + "epoch": 1.83, + "grad_norm": 0.6188294887542725, + "learning_rate": 0.00019822146681152354, + "loss": 2.9726, + "step": 37352 + }, + { + "epoch": 1.83, + "grad_norm": 0.6347734928131104, + "learning_rate": 0.0001982069838723491, + "loss": 2.8248, + "step": 37353 + }, + { + "epoch": 1.83, + "grad_norm": 0.6330137252807617, + "learning_rate": 0.00019819250120127778, + "loss": 3.0705, + "step": 37354 + }, + { + "epoch": 1.83, + "grad_norm": 0.6539512872695923, + "learning_rate": 0.00019817801879834794, + "loss": 2.9606, + "step": 37355 + }, + { + "epoch": 1.83, + "grad_norm": 0.6931697726249695, + "learning_rate": 0.00019816353666359752, + "loss": 2.8545, + "step": 37356 + }, + { + "epoch": 1.83, + "grad_norm": 0.6446145176887512, + "learning_rate": 0.0001981490547970648, + "loss": 3.0497, + "step": 37357 + }, + { + "epoch": 1.83, + "grad_norm": 0.6381685137748718, + "learning_rate": 0.00019813457319878782, + "loss": 3.0407, + "step": 37358 + }, + { + "epoch": 1.83, + "grad_norm": 0.7098675966262817, + "learning_rate": 0.0001981200918688047, + "loss": 3.0888, + "step": 37359 + }, + { + "epoch": 1.83, + "grad_norm": 0.623608410358429, + "learning_rate": 0.00019810561080715384, + "loss": 2.9005, + "step": 37360 + }, + { + "epoch": 1.83, + "grad_norm": 0.6890565156936646, + "learning_rate": 0.00019809113001387305, + "loss": 2.7902, + "step": 37361 + }, + { + "epoch": 1.83, + "grad_norm": 0.6344024538993835, + "learning_rate": 0.00019807664948900071, + "loss": 2.9944, + "step": 37362 + }, + { + "epoch": 1.83, + "grad_norm": 0.6248427033424377, + "learning_rate": 0.00019806216923257475, + "loss": 3.179, + "step": 37363 + }, + { + "epoch": 1.83, + "grad_norm": 0.6155762672424316, + "learning_rate": 0.00019804768924463348, + "loss": 2.9478, + "step": 37364 + }, + { + "epoch": 1.83, + "grad_norm": 0.6568313837051392, + "learning_rate": 0.00019803320952521501, + "loss": 3.0381, + "step": 37365 + }, + { + "epoch": 1.83, + "grad_norm": 0.6843270063400269, + "learning_rate": 0.0001980187300743573, + "loss": 2.9444, + "step": 37366 + }, + { + "epoch": 1.83, + "grad_norm": 0.6298990845680237, + "learning_rate": 0.00019800425089209881, + "loss": 3.0637, + "step": 37367 + }, + { + "epoch": 1.83, + "grad_norm": 0.6255677938461304, + "learning_rate": 0.00019798977197847743, + "loss": 3.1191, + "step": 37368 + }, + { + "epoch": 1.83, + "grad_norm": 0.6226163506507874, + "learning_rate": 0.00019797529333353127, + "loss": 3.1296, + "step": 37369 + }, + { + "epoch": 1.83, + "grad_norm": 0.6100264191627502, + "learning_rate": 0.00019796081495729868, + "loss": 3.1796, + "step": 37370 + }, + { + "epoch": 1.83, + "grad_norm": 0.644378662109375, + "learning_rate": 0.00019794633684981758, + "loss": 2.7442, + "step": 37371 + }, + { + "epoch": 1.83, + "grad_norm": 0.6284385919570923, + "learning_rate": 0.00019793185901112627, + "loss": 3.1884, + "step": 37372 + }, + { + "epoch": 1.83, + "grad_norm": 0.659661054611206, + "learning_rate": 0.0001979173814412627, + "loss": 3.0421, + "step": 37373 + }, + { + "epoch": 1.83, + "grad_norm": 0.6167201399803162, + "learning_rate": 0.00019790290414026514, + "loss": 3.1204, + "step": 37374 + }, + { + "epoch": 1.83, + "grad_norm": 0.6492601037025452, + "learning_rate": 0.00019788842710817173, + "loss": 3.0217, + "step": 37375 + }, + { + "epoch": 1.83, + "grad_norm": 0.8590417504310608, + "learning_rate": 0.00019787395034502047, + "loss": 2.93, + "step": 37376 + }, + { + "epoch": 1.83, + "grad_norm": 0.6162007451057434, + "learning_rate": 0.00019785947385084966, + "loss": 2.8552, + "step": 37377 + }, + { + "epoch": 1.83, + "grad_norm": 0.6711397767066956, + "learning_rate": 0.0001978449976256972, + "loss": 3.0362, + "step": 37378 + }, + { + "epoch": 1.83, + "grad_norm": 0.6198904514312744, + "learning_rate": 0.00019783052166960147, + "loss": 2.8681, + "step": 37379 + }, + { + "epoch": 1.83, + "grad_norm": 0.6615649461746216, + "learning_rate": 0.00019781604598260048, + "loss": 3.008, + "step": 37380 + }, + { + "epoch": 1.83, + "grad_norm": 0.6402595639228821, + "learning_rate": 0.0001978015705647323, + "loss": 2.8823, + "step": 37381 + }, + { + "epoch": 1.83, + "grad_norm": 0.6121137738227844, + "learning_rate": 0.0001977870954160352, + "loss": 3.1466, + "step": 37382 + }, + { + "epoch": 1.83, + "grad_norm": 0.63599693775177, + "learning_rate": 0.00019777262053654714, + "loss": 2.923, + "step": 37383 + }, + { + "epoch": 1.83, + "grad_norm": 0.6528685092926025, + "learning_rate": 0.0001977581459263063, + "loss": 2.9125, + "step": 37384 + }, + { + "epoch": 1.83, + "grad_norm": 0.6184086799621582, + "learning_rate": 0.00019774367158535095, + "loss": 3.0302, + "step": 37385 + }, + { + "epoch": 1.83, + "grad_norm": 0.6583097577095032, + "learning_rate": 0.00019772919751371903, + "loss": 3.0069, + "step": 37386 + }, + { + "epoch": 1.83, + "grad_norm": 0.6171321868896484, + "learning_rate": 0.00019771472371144874, + "loss": 2.911, + "step": 37387 + }, + { + "epoch": 1.83, + "grad_norm": 0.6588572263717651, + "learning_rate": 0.0001977002501785781, + "loss": 3.247, + "step": 37388 + }, + { + "epoch": 1.83, + "grad_norm": 0.6455833911895752, + "learning_rate": 0.00019768577691514538, + "loss": 2.9604, + "step": 37389 + }, + { + "epoch": 1.83, + "grad_norm": 0.6067509651184082, + "learning_rate": 0.00019767130392118868, + "loss": 2.9472, + "step": 37390 + }, + { + "epoch": 1.83, + "grad_norm": 0.6183087229728699, + "learning_rate": 0.00019765683119674595, + "loss": 3.1244, + "step": 37391 + }, + { + "epoch": 1.83, + "grad_norm": 0.7128816843032837, + "learning_rate": 0.0001976423587418555, + "loss": 3.1313, + "step": 37392 + }, + { + "epoch": 1.83, + "grad_norm": 0.6646550893783569, + "learning_rate": 0.0001976278865565554, + "loss": 2.901, + "step": 37393 + }, + { + "epoch": 1.83, + "grad_norm": 0.6627399325370789, + "learning_rate": 0.0001976134146408838, + "loss": 3.0871, + "step": 37394 + }, + { + "epoch": 1.83, + "grad_norm": 0.6553120017051697, + "learning_rate": 0.00019759894299487858, + "loss": 3.2268, + "step": 37395 + }, + { + "epoch": 1.83, + "grad_norm": 0.6263793110847473, + "learning_rate": 0.00019758447161857812, + "loss": 3.1037, + "step": 37396 + }, + { + "epoch": 1.83, + "grad_norm": 0.647152304649353, + "learning_rate": 0.0001975700005120205, + "loss": 2.8602, + "step": 37397 + }, + { + "epoch": 1.83, + "grad_norm": 0.6689240336418152, + "learning_rate": 0.00019755552967524372, + "loss": 3.1095, + "step": 37398 + }, + { + "epoch": 1.83, + "grad_norm": 0.6836102604866028, + "learning_rate": 0.00019754105910828603, + "loss": 2.922, + "step": 37399 + }, + { + "epoch": 1.83, + "grad_norm": 0.6442393064498901, + "learning_rate": 0.00019752658881118536, + "loss": 2.9583, + "step": 37400 + }, + { + "epoch": 1.83, + "grad_norm": 0.6044512391090393, + "learning_rate": 0.00019751211878398002, + "loss": 3.21, + "step": 37401 + }, + { + "epoch": 1.83, + "grad_norm": 0.627376139163971, + "learning_rate": 0.00019749764902670804, + "loss": 3.017, + "step": 37402 + }, + { + "epoch": 1.83, + "grad_norm": 0.6755691766738892, + "learning_rate": 0.00019748317953940744, + "loss": 3.0251, + "step": 37403 + }, + { + "epoch": 1.83, + "grad_norm": 0.621547520160675, + "learning_rate": 0.00019746871032211657, + "loss": 3.0488, + "step": 37404 + }, + { + "epoch": 1.83, + "grad_norm": 0.6141914129257202, + "learning_rate": 0.00019745424137487328, + "loss": 3.1355, + "step": 37405 + }, + { + "epoch": 1.83, + "grad_norm": 0.6534335017204285, + "learning_rate": 0.00019743977269771573, + "loss": 3.0441, + "step": 37406 + }, + { + "epoch": 1.83, + "grad_norm": 0.6744629144668579, + "learning_rate": 0.0001974253042906822, + "loss": 3.0055, + "step": 37407 + }, + { + "epoch": 1.83, + "grad_norm": 0.6473421454429626, + "learning_rate": 0.0001974108361538106, + "loss": 2.8622, + "step": 37408 + }, + { + "epoch": 1.83, + "grad_norm": 0.6605812311172485, + "learning_rate": 0.00019739636828713922, + "loss": 3.0587, + "step": 37409 + }, + { + "epoch": 1.83, + "grad_norm": 0.6951032876968384, + "learning_rate": 0.0001973819006907059, + "loss": 3.0455, + "step": 37410 + }, + { + "epoch": 1.83, + "grad_norm": 0.6221358180046082, + "learning_rate": 0.000197367433364549, + "loss": 3.1964, + "step": 37411 + }, + { + "epoch": 1.83, + "grad_norm": 0.6235369443893433, + "learning_rate": 0.00019735296630870656, + "loss": 2.9421, + "step": 37412 + }, + { + "epoch": 1.83, + "grad_norm": 0.6678821444511414, + "learning_rate": 0.00019733849952321655, + "loss": 2.8644, + "step": 37413 + }, + { + "epoch": 1.83, + "grad_norm": 0.6573663949966431, + "learning_rate": 0.00019732403300811729, + "loss": 3.0983, + "step": 37414 + }, + { + "epoch": 1.83, + "grad_norm": 0.6279765367507935, + "learning_rate": 0.00019730956676344667, + "loss": 2.7258, + "step": 37415 + }, + { + "epoch": 1.83, + "grad_norm": 0.6428644061088562, + "learning_rate": 0.0001972951007892429, + "loss": 2.9142, + "step": 37416 + }, + { + "epoch": 1.83, + "grad_norm": 0.6456839442253113, + "learning_rate": 0.0001972806350855441, + "loss": 2.9727, + "step": 37417 + }, + { + "epoch": 1.83, + "grad_norm": 0.6260696649551392, + "learning_rate": 0.00019726616965238828, + "loss": 3.1471, + "step": 37418 + }, + { + "epoch": 1.83, + "grad_norm": 0.6509988307952881, + "learning_rate": 0.00019725170448981377, + "loss": 2.923, + "step": 37419 + }, + { + "epoch": 1.83, + "grad_norm": 0.6517656445503235, + "learning_rate": 0.0001972372395978583, + "loss": 2.9567, + "step": 37420 + }, + { + "epoch": 1.83, + "grad_norm": 0.6638441681861877, + "learning_rate": 0.00019722277497656018, + "loss": 3.1524, + "step": 37421 + }, + { + "epoch": 1.83, + "grad_norm": 0.6186351776123047, + "learning_rate": 0.00019720831062595759, + "loss": 3.002, + "step": 37422 + }, + { + "epoch": 1.83, + "grad_norm": 0.6337801814079285, + "learning_rate": 0.00019719384654608849, + "loss": 2.9353, + "step": 37423 + }, + { + "epoch": 1.83, + "grad_norm": 0.6668829917907715, + "learning_rate": 0.00019717938273699107, + "loss": 3.0473, + "step": 37424 + }, + { + "epoch": 1.83, + "grad_norm": 0.6589478850364685, + "learning_rate": 0.00019716491919870324, + "loss": 3.1595, + "step": 37425 + }, + { + "epoch": 1.83, + "grad_norm": 0.6218236684799194, + "learning_rate": 0.00019715045593126325, + "loss": 2.9327, + "step": 37426 + }, + { + "epoch": 1.83, + "grad_norm": 0.6272194981575012, + "learning_rate": 0.00019713599293470928, + "loss": 2.9345, + "step": 37427 + }, + { + "epoch": 1.83, + "grad_norm": 0.655690610408783, + "learning_rate": 0.00019712153020907917, + "loss": 3.021, + "step": 37428 + }, + { + "epoch": 1.83, + "grad_norm": 0.6610934734344482, + "learning_rate": 0.0001971070677544113, + "loss": 3.0346, + "step": 37429 + }, + { + "epoch": 1.83, + "grad_norm": 0.6667243242263794, + "learning_rate": 0.00019709260557074348, + "loss": 3.0287, + "step": 37430 + }, + { + "epoch": 1.83, + "grad_norm": 0.6456306576728821, + "learning_rate": 0.00019707814365811391, + "loss": 2.8273, + "step": 37431 + }, + { + "epoch": 1.83, + "grad_norm": 0.6371907591819763, + "learning_rate": 0.00019706368201656086, + "loss": 3.0458, + "step": 37432 + }, + { + "epoch": 1.83, + "grad_norm": 0.6370196342468262, + "learning_rate": 0.00019704922064612217, + "loss": 3.024, + "step": 37433 + }, + { + "epoch": 1.83, + "grad_norm": 0.6374519467353821, + "learning_rate": 0.00019703475954683606, + "loss": 2.9035, + "step": 37434 + }, + { + "epoch": 1.83, + "grad_norm": 0.7033529877662659, + "learning_rate": 0.00019702029871874048, + "loss": 3.0132, + "step": 37435 + }, + { + "epoch": 1.83, + "grad_norm": 0.6438736915588379, + "learning_rate": 0.00019700583816187363, + "loss": 3.004, + "step": 37436 + }, + { + "epoch": 1.83, + "grad_norm": 0.8195931911468506, + "learning_rate": 0.00019699137787627369, + "loss": 2.8433, + "step": 37437 + }, + { + "epoch": 1.83, + "grad_norm": 0.6257565021514893, + "learning_rate": 0.0001969769178619785, + "loss": 3.02, + "step": 37438 + }, + { + "epoch": 1.83, + "grad_norm": 0.6688423752784729, + "learning_rate": 0.0001969624581190264, + "loss": 3.1861, + "step": 37439 + }, + { + "epoch": 1.83, + "grad_norm": 0.6222291588783264, + "learning_rate": 0.00019694799864745525, + "loss": 2.9031, + "step": 37440 + }, + { + "epoch": 1.83, + "grad_norm": 0.6126012206077576, + "learning_rate": 0.00019693353944730325, + "loss": 2.9896, + "step": 37441 + }, + { + "epoch": 1.83, + "grad_norm": 0.6390916705131531, + "learning_rate": 0.0001969190805186086, + "loss": 2.7175, + "step": 37442 + }, + { + "epoch": 1.83, + "grad_norm": 0.6593005061149597, + "learning_rate": 0.00019690462186140907, + "loss": 2.9528, + "step": 37443 + }, + { + "epoch": 1.84, + "grad_norm": 0.6446021199226379, + "learning_rate": 0.00019689016347574308, + "loss": 3.0649, + "step": 37444 + }, + { + "epoch": 1.84, + "grad_norm": 0.6597580909729004, + "learning_rate": 0.0001968757053616485, + "loss": 2.949, + "step": 37445 + }, + { + "epoch": 1.84, + "grad_norm": 0.6188734173774719, + "learning_rate": 0.0001968612475191634, + "loss": 3.0953, + "step": 37446 + }, + { + "epoch": 1.84, + "grad_norm": 0.6276201009750366, + "learning_rate": 0.00019684678994832605, + "loss": 2.9462, + "step": 37447 + }, + { + "epoch": 1.84, + "grad_norm": 0.6137672662734985, + "learning_rate": 0.00019683233264917435, + "loss": 2.7369, + "step": 37448 + }, + { + "epoch": 1.84, + "grad_norm": 0.6281610727310181, + "learning_rate": 0.00019681787562174645, + "loss": 2.8975, + "step": 37449 + }, + { + "epoch": 1.84, + "grad_norm": 0.675428032875061, + "learning_rate": 0.00019680341886608034, + "loss": 3.0545, + "step": 37450 + }, + { + "epoch": 1.84, + "grad_norm": 0.6224192976951599, + "learning_rate": 0.0001967889623822142, + "loss": 2.9262, + "step": 37451 + }, + { + "epoch": 1.84, + "grad_norm": 0.6284218430519104, + "learning_rate": 0.00019677450617018613, + "loss": 2.9402, + "step": 37452 + }, + { + "epoch": 1.84, + "grad_norm": 0.6586654186248779, + "learning_rate": 0.000196760050230034, + "loss": 2.9344, + "step": 37453 + }, + { + "epoch": 1.84, + "grad_norm": 0.6120323538780212, + "learning_rate": 0.0001967455945617962, + "loss": 2.8945, + "step": 37454 + }, + { + "epoch": 1.84, + "grad_norm": 0.6563277840614319, + "learning_rate": 0.00019673113916551053, + "loss": 3.0017, + "step": 37455 + }, + { + "epoch": 1.84, + "grad_norm": 0.6687843799591064, + "learning_rate": 0.0001967166840412151, + "loss": 3.0225, + "step": 37456 + }, + { + "epoch": 1.84, + "grad_norm": 0.6557430028915405, + "learning_rate": 0.0001967022291889482, + "loss": 3.1305, + "step": 37457 + }, + { + "epoch": 1.84, + "grad_norm": 0.6802811026573181, + "learning_rate": 0.00019668777460874766, + "loss": 3.053, + "step": 37458 + }, + { + "epoch": 1.84, + "grad_norm": 0.6781749725341797, + "learning_rate": 0.00019667332030065172, + "loss": 2.8287, + "step": 37459 + }, + { + "epoch": 1.84, + "grad_norm": 0.6726826429367065, + "learning_rate": 0.00019665886626469824, + "loss": 3.1735, + "step": 37460 + }, + { + "epoch": 1.84, + "grad_norm": 0.6283700466156006, + "learning_rate": 0.00019664441250092542, + "loss": 2.9045, + "step": 37461 + }, + { + "epoch": 1.84, + "grad_norm": 0.6707321405410767, + "learning_rate": 0.00019662995900937143, + "loss": 3.1, + "step": 37462 + }, + { + "epoch": 1.84, + "grad_norm": 0.6527551412582397, + "learning_rate": 0.0001966155057900742, + "loss": 2.8893, + "step": 37463 + }, + { + "epoch": 1.84, + "grad_norm": 0.6310652494430542, + "learning_rate": 0.0001966010528430719, + "loss": 3.1475, + "step": 37464 + }, + { + "epoch": 1.84, + "grad_norm": 0.626528799533844, + "learning_rate": 0.00019658660016840237, + "loss": 3.2232, + "step": 37465 + }, + { + "epoch": 1.84, + "grad_norm": 0.6604952812194824, + "learning_rate": 0.00019657214776610388, + "loss": 3.2123, + "step": 37466 + }, + { + "epoch": 1.84, + "grad_norm": 0.6406633257865906, + "learning_rate": 0.00019655769563621457, + "loss": 2.8845, + "step": 37467 + }, + { + "epoch": 1.84, + "grad_norm": 0.6594672799110413, + "learning_rate": 0.00019654324377877224, + "loss": 3.0813, + "step": 37468 + }, + { + "epoch": 1.84, + "grad_norm": 0.6008974313735962, + "learning_rate": 0.0001965287921938152, + "loss": 3.1406, + "step": 37469 + }, + { + "epoch": 1.84, + "grad_norm": 0.6440598964691162, + "learning_rate": 0.00019651434088138135, + "loss": 3.0571, + "step": 37470 + }, + { + "epoch": 1.84, + "grad_norm": 0.6732915043830872, + "learning_rate": 0.00019649988984150887, + "loss": 2.9316, + "step": 37471 + }, + { + "epoch": 1.84, + "grad_norm": 0.6244654059410095, + "learning_rate": 0.00019648543907423567, + "loss": 3.0407, + "step": 37472 + }, + { + "epoch": 1.84, + "grad_norm": 0.6534292697906494, + "learning_rate": 0.0001964709885795999, + "loss": 2.9746, + "step": 37473 + }, + { + "epoch": 1.84, + "grad_norm": 0.626899778842926, + "learning_rate": 0.0001964565383576397, + "loss": 2.8462, + "step": 37474 + }, + { + "epoch": 1.84, + "grad_norm": 0.6582539677619934, + "learning_rate": 0.00019644208840839295, + "loss": 3.0684, + "step": 37475 + }, + { + "epoch": 1.84, + "grad_norm": 0.6599977016448975, + "learning_rate": 0.00019642763873189796, + "loss": 2.8524, + "step": 37476 + }, + { + "epoch": 1.84, + "grad_norm": 0.6857508420944214, + "learning_rate": 0.00019641318932819255, + "loss": 2.9823, + "step": 37477 + }, + { + "epoch": 1.84, + "grad_norm": 0.6616058349609375, + "learning_rate": 0.00019639874019731476, + "loss": 3.2488, + "step": 37478 + }, + { + "epoch": 1.84, + "grad_norm": 0.6988261938095093, + "learning_rate": 0.0001963842913393029, + "loss": 2.8922, + "step": 37479 + }, + { + "epoch": 1.84, + "grad_norm": 0.6614097952842712, + "learning_rate": 0.0001963698427541948, + "loss": 2.8559, + "step": 37480 + }, + { + "epoch": 1.84, + "grad_norm": 0.6063705086708069, + "learning_rate": 0.00019635539444202874, + "loss": 2.972, + "step": 37481 + }, + { + "epoch": 1.84, + "grad_norm": 0.6515814065933228, + "learning_rate": 0.00019634094640284244, + "loss": 2.9072, + "step": 37482 + }, + { + "epoch": 1.84, + "grad_norm": 0.6901261210441589, + "learning_rate": 0.00019632649863667414, + "loss": 3.0161, + "step": 37483 + }, + { + "epoch": 1.84, + "grad_norm": 0.628663957118988, + "learning_rate": 0.000196312051143562, + "loss": 3.0743, + "step": 37484 + }, + { + "epoch": 1.84, + "grad_norm": 0.6244681477546692, + "learning_rate": 0.0001962976039235439, + "loss": 3.1726, + "step": 37485 + }, + { + "epoch": 1.84, + "grad_norm": 0.6625844836235046, + "learning_rate": 0.000196283156976658, + "loss": 2.9569, + "step": 37486 + }, + { + "epoch": 1.84, + "grad_norm": 0.6282349228858948, + "learning_rate": 0.0001962687103029422, + "loss": 3.1225, + "step": 37487 + }, + { + "epoch": 1.84, + "grad_norm": 0.6397706866264343, + "learning_rate": 0.00019625426390243472, + "loss": 3.0557, + "step": 37488 + }, + { + "epoch": 1.84, + "grad_norm": 0.6789543032646179, + "learning_rate": 0.0001962398177751736, + "loss": 2.9712, + "step": 37489 + }, + { + "epoch": 1.84, + "grad_norm": 0.6492639780044556, + "learning_rate": 0.0001962253719211967, + "loss": 2.9644, + "step": 37490 + }, + { + "epoch": 1.84, + "grad_norm": 0.6266045570373535, + "learning_rate": 0.00019621092634054235, + "loss": 3.053, + "step": 37491 + }, + { + "epoch": 1.84, + "grad_norm": 0.633960485458374, + "learning_rate": 0.00019619648103324835, + "loss": 3.073, + "step": 37492 + }, + { + "epoch": 1.84, + "grad_norm": 0.680493175983429, + "learning_rate": 0.00019618203599935282, + "loss": 3.0203, + "step": 37493 + }, + { + "epoch": 1.84, + "grad_norm": 0.6308938264846802, + "learning_rate": 0.0001961675912388939, + "loss": 3.03, + "step": 37494 + }, + { + "epoch": 1.84, + "grad_norm": 0.6352134943008423, + "learning_rate": 0.00019615314675190955, + "loss": 3.1219, + "step": 37495 + }, + { + "epoch": 1.84, + "grad_norm": 0.608165442943573, + "learning_rate": 0.00019613870253843785, + "loss": 3.0078, + "step": 37496 + }, + { + "epoch": 1.84, + "grad_norm": 0.6532719731330872, + "learning_rate": 0.00019612425859851673, + "loss": 2.8869, + "step": 37497 + }, + { + "epoch": 1.84, + "grad_norm": 0.6677577495574951, + "learning_rate": 0.00019610981493218437, + "loss": 2.8604, + "step": 37498 + }, + { + "epoch": 1.84, + "grad_norm": 0.5935999751091003, + "learning_rate": 0.0001960953715394788, + "loss": 3.175, + "step": 37499 + }, + { + "epoch": 1.84, + "grad_norm": 0.6511732935905457, + "learning_rate": 0.0001960809284204379, + "loss": 2.9567, + "step": 37500 + }, + { + "epoch": 1.84, + "grad_norm": 0.6400048136711121, + "learning_rate": 0.00019606648557509996, + "loss": 2.9456, + "step": 37501 + }, + { + "epoch": 1.84, + "grad_norm": 0.6501739025115967, + "learning_rate": 0.0001960520430035028, + "loss": 2.9489, + "step": 37502 + }, + { + "epoch": 1.84, + "grad_norm": 0.6374745965003967, + "learning_rate": 0.0001960376007056846, + "loss": 3.0406, + "step": 37503 + }, + { + "epoch": 1.84, + "grad_norm": 0.6723983883857727, + "learning_rate": 0.0001960231586816834, + "loss": 2.8661, + "step": 37504 + }, + { + "epoch": 1.84, + "grad_norm": 0.6227818727493286, + "learning_rate": 0.0001960087169315371, + "loss": 2.7228, + "step": 37505 + }, + { + "epoch": 1.84, + "grad_norm": 0.6718969345092773, + "learning_rate": 0.00019599427545528392, + "loss": 3.0161, + "step": 37506 + }, + { + "epoch": 1.84, + "grad_norm": 0.6838817596435547, + "learning_rate": 0.00019597983425296176, + "loss": 2.8973, + "step": 37507 + }, + { + "epoch": 1.84, + "grad_norm": 0.710553765296936, + "learning_rate": 0.00019596539332460863, + "loss": 2.8391, + "step": 37508 + }, + { + "epoch": 1.84, + "grad_norm": 0.6641603708267212, + "learning_rate": 0.00019595095267026277, + "loss": 3.1281, + "step": 37509 + }, + { + "epoch": 1.84, + "grad_norm": 0.6101526618003845, + "learning_rate": 0.00019593651228996201, + "loss": 3.0391, + "step": 37510 + }, + { + "epoch": 1.84, + "grad_norm": 0.6275225281715393, + "learning_rate": 0.00019592207218374448, + "loss": 2.9472, + "step": 37511 + }, + { + "epoch": 1.84, + "grad_norm": 0.6456922888755798, + "learning_rate": 0.0001959076323516481, + "loss": 3.2232, + "step": 37512 + }, + { + "epoch": 1.84, + "grad_norm": 0.6447487473487854, + "learning_rate": 0.000195893192793711, + "loss": 2.7937, + "step": 37513 + }, + { + "epoch": 1.84, + "grad_norm": 0.6500653624534607, + "learning_rate": 0.00019587875350997127, + "loss": 3.1145, + "step": 37514 + }, + { + "epoch": 1.84, + "grad_norm": 0.5911208391189575, + "learning_rate": 0.00019586431450046676, + "loss": 3.0935, + "step": 37515 + }, + { + "epoch": 1.84, + "grad_norm": 0.6784209609031677, + "learning_rate": 0.00019584987576523572, + "loss": 2.9249, + "step": 37516 + }, + { + "epoch": 1.84, + "grad_norm": 0.6476365923881531, + "learning_rate": 0.000195835437304316, + "loss": 2.9984, + "step": 37517 + }, + { + "epoch": 1.84, + "grad_norm": 0.6495740413665771, + "learning_rate": 0.0001958209991177456, + "loss": 2.7983, + "step": 37518 + }, + { + "epoch": 1.84, + "grad_norm": 0.6220631003379822, + "learning_rate": 0.0001958065612055628, + "loss": 3.1553, + "step": 37519 + }, + { + "epoch": 1.84, + "grad_norm": 0.6869008541107178, + "learning_rate": 0.0001957921235678054, + "loss": 2.9559, + "step": 37520 + }, + { + "epoch": 1.84, + "grad_norm": 0.6284224390983582, + "learning_rate": 0.00019577768620451154, + "loss": 2.9683, + "step": 37521 + }, + { + "epoch": 1.84, + "grad_norm": 0.6778497695922852, + "learning_rate": 0.00019576324911571908, + "loss": 3.1064, + "step": 37522 + }, + { + "epoch": 1.84, + "grad_norm": 0.6434932947158813, + "learning_rate": 0.00019574881230146615, + "loss": 3.1636, + "step": 37523 + }, + { + "epoch": 1.84, + "grad_norm": 0.7298178672790527, + "learning_rate": 0.0001957343757617909, + "loss": 3.0143, + "step": 37524 + }, + { + "epoch": 1.84, + "grad_norm": 0.6708294749259949, + "learning_rate": 0.0001957199394967312, + "loss": 2.9184, + "step": 37525 + }, + { + "epoch": 1.84, + "grad_norm": 0.6386827826499939, + "learning_rate": 0.0001957055035063252, + "loss": 3.0899, + "step": 37526 + }, + { + "epoch": 1.84, + "grad_norm": 0.7549542784690857, + "learning_rate": 0.00019569106779061067, + "loss": 2.9044, + "step": 37527 + }, + { + "epoch": 1.84, + "grad_norm": 0.7159115076065063, + "learning_rate": 0.00019567663234962586, + "loss": 3.0351, + "step": 37528 + }, + { + "epoch": 1.84, + "grad_norm": 0.643603503704071, + "learning_rate": 0.00019566219718340877, + "loss": 3.097, + "step": 37529 + }, + { + "epoch": 1.84, + "grad_norm": 0.6065301895141602, + "learning_rate": 0.00019564776229199726, + "loss": 2.8429, + "step": 37530 + }, + { + "epoch": 1.84, + "grad_norm": 0.6380607485771179, + "learning_rate": 0.00019563332767542964, + "loss": 3.0454, + "step": 37531 + }, + { + "epoch": 1.84, + "grad_norm": 0.6211923956871033, + "learning_rate": 0.0001956188933337436, + "loss": 2.7386, + "step": 37532 + }, + { + "epoch": 1.84, + "grad_norm": 0.6757135987281799, + "learning_rate": 0.0001956044592669773, + "loss": 2.9283, + "step": 37533 + }, + { + "epoch": 1.84, + "grad_norm": 0.5980146527290344, + "learning_rate": 0.00019559002547516887, + "loss": 3.097, + "step": 37534 + }, + { + "epoch": 1.84, + "grad_norm": 0.6270644664764404, + "learning_rate": 0.00019557559195835614, + "loss": 2.8905, + "step": 37535 + }, + { + "epoch": 1.84, + "grad_norm": 0.6243385076522827, + "learning_rate": 0.00019556115871657728, + "loss": 3.0983, + "step": 37536 + }, + { + "epoch": 1.84, + "grad_norm": 0.6248341798782349, + "learning_rate": 0.00019554672574987011, + "loss": 3.1228, + "step": 37537 + }, + { + "epoch": 1.84, + "grad_norm": 0.6501051187515259, + "learning_rate": 0.00019553229305827282, + "loss": 3.0654, + "step": 37538 + }, + { + "epoch": 1.84, + "grad_norm": 0.7006635069847107, + "learning_rate": 0.0001955178606418234, + "loss": 2.8608, + "step": 37539 + }, + { + "epoch": 1.84, + "grad_norm": 0.6526589393615723, + "learning_rate": 0.00019550342850055974, + "loss": 3.0982, + "step": 37540 + }, + { + "epoch": 1.84, + "grad_norm": 0.6131229996681213, + "learning_rate": 0.00019548899663452007, + "loss": 2.8902, + "step": 37541 + }, + { + "epoch": 1.84, + "grad_norm": 0.6289817094802856, + "learning_rate": 0.00019547456504374213, + "loss": 3.0206, + "step": 37542 + }, + { + "epoch": 1.84, + "grad_norm": 0.6378176212310791, + "learning_rate": 0.00019546013372826411, + "loss": 3.0923, + "step": 37543 + }, + { + "epoch": 1.84, + "grad_norm": 0.6519317030906677, + "learning_rate": 0.00019544570268812406, + "loss": 3.0326, + "step": 37544 + }, + { + "epoch": 1.84, + "grad_norm": 0.7056100368499756, + "learning_rate": 0.00019543127192335979, + "loss": 2.8943, + "step": 37545 + }, + { + "epoch": 1.84, + "grad_norm": 0.6589441895484924, + "learning_rate": 0.00019541684143400953, + "loss": 3.1757, + "step": 37546 + }, + { + "epoch": 1.84, + "grad_norm": 0.6761717200279236, + "learning_rate": 0.00019540241122011111, + "loss": 2.8114, + "step": 37547 + }, + { + "epoch": 1.84, + "grad_norm": 0.6543384790420532, + "learning_rate": 0.00019538798128170272, + "loss": 3.2523, + "step": 37548 + }, + { + "epoch": 1.84, + "grad_norm": 0.633036196231842, + "learning_rate": 0.0001953735516188221, + "loss": 2.858, + "step": 37549 + }, + { + "epoch": 1.84, + "grad_norm": 0.6491231322288513, + "learning_rate": 0.00019535912223150745, + "loss": 3.1472, + "step": 37550 + }, + { + "epoch": 1.84, + "grad_norm": 0.6392214894294739, + "learning_rate": 0.00019534469311979685, + "loss": 2.9719, + "step": 37551 + }, + { + "epoch": 1.84, + "grad_norm": 0.6699644923210144, + "learning_rate": 0.00019533026428372803, + "loss": 2.9833, + "step": 37552 + }, + { + "epoch": 1.84, + "grad_norm": 0.695570707321167, + "learning_rate": 0.00019531583572333924, + "loss": 2.985, + "step": 37553 + }, + { + "epoch": 1.84, + "grad_norm": 0.7695941925048828, + "learning_rate": 0.0001953014074386684, + "loss": 2.8777, + "step": 37554 + }, + { + "epoch": 1.84, + "grad_norm": 0.6346907615661621, + "learning_rate": 0.0001952869794297534, + "loss": 3.1383, + "step": 37555 + }, + { + "epoch": 1.84, + "grad_norm": 0.6936262249946594, + "learning_rate": 0.00019527255169663244, + "loss": 3.0131, + "step": 37556 + }, + { + "epoch": 1.84, + "grad_norm": 0.6635444760322571, + "learning_rate": 0.0001952581242393434, + "loss": 3.0441, + "step": 37557 + }, + { + "epoch": 1.84, + "grad_norm": 0.6767592430114746, + "learning_rate": 0.0001952436970579244, + "loss": 2.9566, + "step": 37558 + }, + { + "epoch": 1.84, + "grad_norm": 0.6856628656387329, + "learning_rate": 0.00019522927015241316, + "loss": 3.2645, + "step": 37559 + }, + { + "epoch": 1.84, + "grad_norm": 0.6861822009086609, + "learning_rate": 0.00019521484352284793, + "loss": 3.02, + "step": 37560 + }, + { + "epoch": 1.84, + "grad_norm": 0.6693269610404968, + "learning_rate": 0.00019520041716926674, + "loss": 3.0023, + "step": 37561 + }, + { + "epoch": 1.84, + "grad_norm": 0.6560589075088501, + "learning_rate": 0.00019518599109170727, + "loss": 2.8327, + "step": 37562 + }, + { + "epoch": 1.84, + "grad_norm": 0.6303350925445557, + "learning_rate": 0.00019517156529020793, + "loss": 3.0216, + "step": 37563 + }, + { + "epoch": 1.84, + "grad_norm": 0.6482394337654114, + "learning_rate": 0.0001951571397648064, + "loss": 3.0575, + "step": 37564 + }, + { + "epoch": 1.84, + "grad_norm": 0.6522819399833679, + "learning_rate": 0.00019514271451554076, + "loss": 3.136, + "step": 37565 + }, + { + "epoch": 1.84, + "grad_norm": 0.6398262977600098, + "learning_rate": 0.00019512828954244913, + "loss": 2.9499, + "step": 37566 + }, + { + "epoch": 1.84, + "grad_norm": 0.6437414288520813, + "learning_rate": 0.00019511386484556933, + "loss": 2.9961, + "step": 37567 + }, + { + "epoch": 1.84, + "grad_norm": 0.6147285103797913, + "learning_rate": 0.00019509944042493953, + "loss": 3.1381, + "step": 37568 + }, + { + "epoch": 1.84, + "grad_norm": 0.6700540781021118, + "learning_rate": 0.0001950850162805975, + "loss": 2.8628, + "step": 37569 + }, + { + "epoch": 1.84, + "grad_norm": 0.6323624849319458, + "learning_rate": 0.00019507059241258133, + "loss": 2.9542, + "step": 37570 + }, + { + "epoch": 1.84, + "grad_norm": 0.6200016140937805, + "learning_rate": 0.00019505616882092917, + "loss": 3.0798, + "step": 37571 + }, + { + "epoch": 1.84, + "grad_norm": 0.6565930247306824, + "learning_rate": 0.00019504174550567875, + "loss": 2.9682, + "step": 37572 + }, + { + "epoch": 1.84, + "grad_norm": 0.6288124918937683, + "learning_rate": 0.00019502732246686825, + "loss": 3.1476, + "step": 37573 + }, + { + "epoch": 1.84, + "grad_norm": 0.7063016891479492, + "learning_rate": 0.00019501289970453545, + "loss": 3.0224, + "step": 37574 + }, + { + "epoch": 1.84, + "grad_norm": 0.626835286617279, + "learning_rate": 0.00019499847721871855, + "loss": 2.9865, + "step": 37575 + }, + { + "epoch": 1.84, + "grad_norm": 0.6863059401512146, + "learning_rate": 0.0001949840550094555, + "loss": 3.0436, + "step": 37576 + }, + { + "epoch": 1.84, + "grad_norm": 0.6676422357559204, + "learning_rate": 0.0001949696330767841, + "loss": 3.0392, + "step": 37577 + }, + { + "epoch": 1.84, + "grad_norm": 0.649277925491333, + "learning_rate": 0.00019495521142074264, + "loss": 3.0748, + "step": 37578 + }, + { + "epoch": 1.84, + "grad_norm": 0.6728671789169312, + "learning_rate": 0.00019494079004136886, + "loss": 3.0207, + "step": 37579 + }, + { + "epoch": 1.84, + "grad_norm": 0.6115674376487732, + "learning_rate": 0.00019492636893870074, + "loss": 3.0674, + "step": 37580 + }, + { + "epoch": 1.84, + "grad_norm": 0.6326895356178284, + "learning_rate": 0.00019491194811277647, + "loss": 3.042, + "step": 37581 + }, + { + "epoch": 1.84, + "grad_norm": 0.6360620260238647, + "learning_rate": 0.00019489752756363378, + "loss": 2.882, + "step": 37582 + }, + { + "epoch": 1.84, + "grad_norm": 0.6419123411178589, + "learning_rate": 0.00019488310729131095, + "loss": 3.1109, + "step": 37583 + }, + { + "epoch": 1.84, + "grad_norm": 0.6531715989112854, + "learning_rate": 0.00019486868729584565, + "loss": 2.713, + "step": 37584 + }, + { + "epoch": 1.84, + "grad_norm": 0.6210694909095764, + "learning_rate": 0.00019485426757727593, + "loss": 3.0199, + "step": 37585 + }, + { + "epoch": 1.84, + "grad_norm": 0.6679146885871887, + "learning_rate": 0.00019483984813564002, + "loss": 2.7999, + "step": 37586 + }, + { + "epoch": 1.84, + "grad_norm": 0.6969517469406128, + "learning_rate": 0.0001948254289709756, + "loss": 2.928, + "step": 37587 + }, + { + "epoch": 1.84, + "grad_norm": 0.6617041826248169, + "learning_rate": 0.00019481101008332082, + "loss": 3.1227, + "step": 37588 + }, + { + "epoch": 1.84, + "grad_norm": 0.6362742185592651, + "learning_rate": 0.00019479659147271354, + "loss": 2.732, + "step": 37589 + }, + { + "epoch": 1.84, + "grad_norm": 0.6810065507888794, + "learning_rate": 0.00019478217313919176, + "loss": 3.0502, + "step": 37590 + }, + { + "epoch": 1.84, + "grad_norm": 0.643323540687561, + "learning_rate": 0.00019476775508279361, + "loss": 2.8579, + "step": 37591 + }, + { + "epoch": 1.84, + "grad_norm": 0.6467273235321045, + "learning_rate": 0.00019475333730355681, + "loss": 3.1401, + "step": 37592 + }, + { + "epoch": 1.84, + "grad_norm": 0.6345007419586182, + "learning_rate": 0.00019473891980151958, + "loss": 2.7846, + "step": 37593 + }, + { + "epoch": 1.84, + "grad_norm": 0.6979330778121948, + "learning_rate": 0.00019472450257671973, + "loss": 2.8787, + "step": 37594 + }, + { + "epoch": 1.84, + "grad_norm": 0.8724361062049866, + "learning_rate": 0.00019471008562919518, + "loss": 3.1034, + "step": 37595 + }, + { + "epoch": 1.84, + "grad_norm": 0.6298291087150574, + "learning_rate": 0.00019469566895898412, + "loss": 3.1297, + "step": 37596 + }, + { + "epoch": 1.84, + "grad_norm": 0.6355240941047668, + "learning_rate": 0.0001946812525661244, + "loss": 3.1283, + "step": 37597 + }, + { + "epoch": 1.84, + "grad_norm": 0.6391889452934265, + "learning_rate": 0.00019466683645065402, + "loss": 3.0097, + "step": 37598 + }, + { + "epoch": 1.84, + "grad_norm": 0.6222915649414062, + "learning_rate": 0.0001946524206126108, + "loss": 3.0043, + "step": 37599 + }, + { + "epoch": 1.84, + "grad_norm": 0.6683493256568909, + "learning_rate": 0.00019463800505203284, + "loss": 3.053, + "step": 37600 + }, + { + "epoch": 1.84, + "grad_norm": 0.6785693168640137, + "learning_rate": 0.00019462358976895822, + "loss": 3.0694, + "step": 37601 + }, + { + "epoch": 1.84, + "grad_norm": 0.6351543068885803, + "learning_rate": 0.00019460917476342464, + "loss": 2.9983, + "step": 37602 + }, + { + "epoch": 1.84, + "grad_norm": 0.6268134713172913, + "learning_rate": 0.00019459476003547035, + "loss": 2.9023, + "step": 37603 + }, + { + "epoch": 1.84, + "grad_norm": 0.6076909303665161, + "learning_rate": 0.00019458034558513304, + "loss": 3.0121, + "step": 37604 + }, + { + "epoch": 1.84, + "grad_norm": 0.6232816576957703, + "learning_rate": 0.00019456593141245086, + "loss": 2.9033, + "step": 37605 + }, + { + "epoch": 1.84, + "grad_norm": 0.6748607754707336, + "learning_rate": 0.00019455151751746183, + "loss": 3.0505, + "step": 37606 + }, + { + "epoch": 1.84, + "grad_norm": 0.6353899836540222, + "learning_rate": 0.00019453710390020365, + "loss": 3.1189, + "step": 37607 + }, + { + "epoch": 1.84, + "grad_norm": 0.6453273892402649, + "learning_rate": 0.00019452269056071457, + "loss": 3.3284, + "step": 37608 + }, + { + "epoch": 1.84, + "grad_norm": 0.6326226592063904, + "learning_rate": 0.00019450827749903238, + "loss": 2.9296, + "step": 37609 + }, + { + "epoch": 1.84, + "grad_norm": 0.6521055102348328, + "learning_rate": 0.00019449386471519502, + "loss": 3.0117, + "step": 37610 + }, + { + "epoch": 1.84, + "grad_norm": 0.663453221321106, + "learning_rate": 0.00019447945220924063, + "loss": 2.8768, + "step": 37611 + }, + { + "epoch": 1.84, + "grad_norm": 0.6682723760604858, + "learning_rate": 0.000194465039981207, + "loss": 2.9598, + "step": 37612 + }, + { + "epoch": 1.84, + "grad_norm": 0.6204665899276733, + "learning_rate": 0.00019445062803113218, + "loss": 3.1624, + "step": 37613 + }, + { + "epoch": 1.84, + "grad_norm": 0.6165514588356018, + "learning_rate": 0.000194436216359054, + "loss": 3.3258, + "step": 37614 + }, + { + "epoch": 1.84, + "grad_norm": 0.6465550065040588, + "learning_rate": 0.00019442180496501054, + "loss": 2.9518, + "step": 37615 + }, + { + "epoch": 1.84, + "grad_norm": 0.6707178354263306, + "learning_rate": 0.00019440739384903982, + "loss": 2.9864, + "step": 37616 + }, + { + "epoch": 1.84, + "grad_norm": 0.6141831278800964, + "learning_rate": 0.00019439298301117955, + "loss": 2.9937, + "step": 37617 + }, + { + "epoch": 1.84, + "grad_norm": 0.6654483675956726, + "learning_rate": 0.00019437857245146797, + "loss": 2.8547, + "step": 37618 + }, + { + "epoch": 1.84, + "grad_norm": 0.6087260842323303, + "learning_rate": 0.00019436416216994285, + "loss": 2.8802, + "step": 37619 + }, + { + "epoch": 1.84, + "grad_norm": 0.7105410099029541, + "learning_rate": 0.00019434975216664217, + "loss": 3.0672, + "step": 37620 + }, + { + "epoch": 1.84, + "grad_norm": 0.6254940629005432, + "learning_rate": 0.000194335342441604, + "loss": 2.9234, + "step": 37621 + }, + { + "epoch": 1.84, + "grad_norm": 0.6136469841003418, + "learning_rate": 0.00019432093299486612, + "loss": 3.0558, + "step": 37622 + }, + { + "epoch": 1.84, + "grad_norm": 0.6719083786010742, + "learning_rate": 0.00019430652382646665, + "loss": 2.9483, + "step": 37623 + }, + { + "epoch": 1.84, + "grad_norm": 0.6281088590621948, + "learning_rate": 0.00019429211493644335, + "loss": 2.958, + "step": 37624 + }, + { + "epoch": 1.84, + "grad_norm": 0.642126739025116, + "learning_rate": 0.00019427770632483423, + "loss": 3.0231, + "step": 37625 + }, + { + "epoch": 1.84, + "grad_norm": 0.6834142208099365, + "learning_rate": 0.00019426329799167742, + "loss": 2.9678, + "step": 37626 + }, + { + "epoch": 1.84, + "grad_norm": 0.6461591124534607, + "learning_rate": 0.00019424888993701072, + "loss": 2.944, + "step": 37627 + }, + { + "epoch": 1.84, + "grad_norm": 0.6434966325759888, + "learning_rate": 0.0001942344821608721, + "loss": 2.8844, + "step": 37628 + }, + { + "epoch": 1.84, + "grad_norm": 0.62734055519104, + "learning_rate": 0.00019422007466329938, + "loss": 3.0886, + "step": 37629 + }, + { + "epoch": 1.84, + "grad_norm": 0.6289473176002502, + "learning_rate": 0.00019420566744433074, + "loss": 2.8589, + "step": 37630 + }, + { + "epoch": 1.84, + "grad_norm": 0.6288855671882629, + "learning_rate": 0.000194191260504004, + "loss": 2.986, + "step": 37631 + }, + { + "epoch": 1.84, + "grad_norm": 0.6580570936203003, + "learning_rate": 0.000194176853842357, + "loss": 3.0736, + "step": 37632 + }, + { + "epoch": 1.84, + "grad_norm": 0.6353893280029297, + "learning_rate": 0.00019416244745942797, + "loss": 3.0207, + "step": 37633 + }, + { + "epoch": 1.84, + "grad_norm": 0.6546821594238281, + "learning_rate": 0.0001941480413552546, + "loss": 2.9067, + "step": 37634 + }, + { + "epoch": 1.84, + "grad_norm": 0.6692838668823242, + "learning_rate": 0.00019413363552987496, + "loss": 3.0202, + "step": 37635 + }, + { + "epoch": 1.84, + "grad_norm": 0.6453374028205872, + "learning_rate": 0.00019411922998332682, + "loss": 3.0015, + "step": 37636 + }, + { + "epoch": 1.84, + "grad_norm": 0.6587013006210327, + "learning_rate": 0.00019410482471564838, + "loss": 2.9682, + "step": 37637 + }, + { + "epoch": 1.84, + "grad_norm": 0.6185652613639832, + "learning_rate": 0.00019409041972687745, + "loss": 2.9989, + "step": 37638 + }, + { + "epoch": 1.84, + "grad_norm": 0.6610234379768372, + "learning_rate": 0.0001940760150170519, + "loss": 3.0296, + "step": 37639 + }, + { + "epoch": 1.84, + "grad_norm": 0.610658586025238, + "learning_rate": 0.00019406161058620983, + "loss": 3.0412, + "step": 37640 + }, + { + "epoch": 1.84, + "grad_norm": 0.6481900215148926, + "learning_rate": 0.000194047206434389, + "loss": 2.9011, + "step": 37641 + }, + { + "epoch": 1.84, + "grad_norm": 0.6394829750061035, + "learning_rate": 0.00019403280256162744, + "loss": 3.0083, + "step": 37642 + }, + { + "epoch": 1.84, + "grad_norm": 0.6193023920059204, + "learning_rate": 0.0001940183989679632, + "loss": 3.0251, + "step": 37643 + }, + { + "epoch": 1.84, + "grad_norm": 0.6388663053512573, + "learning_rate": 0.00019400399565343393, + "loss": 3.0921, + "step": 37644 + }, + { + "epoch": 1.84, + "grad_norm": 0.6341210603713989, + "learning_rate": 0.00019398959261807794, + "loss": 3.1424, + "step": 37645 + }, + { + "epoch": 1.84, + "grad_norm": 0.662173330783844, + "learning_rate": 0.00019397518986193277, + "loss": 2.9784, + "step": 37646 + }, + { + "epoch": 1.84, + "grad_norm": 0.6525291800498962, + "learning_rate": 0.00019396078738503659, + "loss": 2.8043, + "step": 37647 + }, + { + "epoch": 1.85, + "grad_norm": 0.6806787252426147, + "learning_rate": 0.00019394638518742733, + "loss": 2.8409, + "step": 37648 + }, + { + "epoch": 1.85, + "grad_norm": 0.6746661067008972, + "learning_rate": 0.0001939319832691429, + "loss": 3.0212, + "step": 37649 + }, + { + "epoch": 1.85, + "grad_norm": 0.6305448412895203, + "learning_rate": 0.00019391758163022125, + "loss": 3.1187, + "step": 37650 + }, + { + "epoch": 1.85, + "grad_norm": 0.6361280679702759, + "learning_rate": 0.00019390318027070013, + "loss": 3.0798, + "step": 37651 + }, + { + "epoch": 1.85, + "grad_norm": 0.6559592485427856, + "learning_rate": 0.0001938887791906177, + "loss": 3.1859, + "step": 37652 + }, + { + "epoch": 1.85, + "grad_norm": 0.6498610973358154, + "learning_rate": 0.00019387437839001184, + "loss": 3.1285, + "step": 37653 + }, + { + "epoch": 1.85, + "grad_norm": 0.6713485717773438, + "learning_rate": 0.0001938599778689203, + "loss": 2.8758, + "step": 37654 + }, + { + "epoch": 1.85, + "grad_norm": 0.6473253965377808, + "learning_rate": 0.00019384557762738134, + "loss": 2.882, + "step": 37655 + }, + { + "epoch": 1.85, + "grad_norm": 0.6119524240493774, + "learning_rate": 0.0001938311776654326, + "loss": 2.7755, + "step": 37656 + }, + { + "epoch": 1.85, + "grad_norm": 0.6408953070640564, + "learning_rate": 0.00019381677798311207, + "loss": 3.2346, + "step": 37657 + }, + { + "epoch": 1.85, + "grad_norm": 0.6806536912918091, + "learning_rate": 0.0001938023785804578, + "loss": 3.0154, + "step": 37658 + }, + { + "epoch": 1.85, + "grad_norm": 0.6582728028297424, + "learning_rate": 0.00019378797945750756, + "loss": 2.8902, + "step": 37659 + }, + { + "epoch": 1.85, + "grad_norm": 0.6802835464477539, + "learning_rate": 0.00019377358061429943, + "loss": 3.0137, + "step": 37660 + }, + { + "epoch": 1.85, + "grad_norm": 0.6308269500732422, + "learning_rate": 0.00019375918205087112, + "loss": 3.1318, + "step": 37661 + }, + { + "epoch": 1.85, + "grad_norm": 0.7181369662284851, + "learning_rate": 0.00019374478376726074, + "loss": 3.1762, + "step": 37662 + }, + { + "epoch": 1.85, + "grad_norm": 0.6746264100074768, + "learning_rate": 0.0001937303857635062, + "loss": 2.9109, + "step": 37663 + }, + { + "epoch": 1.85, + "grad_norm": 0.653678834438324, + "learning_rate": 0.00019371598803964523, + "loss": 2.9796, + "step": 37664 + }, + { + "epoch": 1.85, + "grad_norm": 0.6139053106307983, + "learning_rate": 0.00019370159059571608, + "loss": 2.995, + "step": 37665 + }, + { + "epoch": 1.85, + "grad_norm": 0.6195376515388489, + "learning_rate": 0.0001936871934317563, + "loss": 2.9449, + "step": 37666 + }, + { + "epoch": 1.85, + "grad_norm": 0.6081912517547607, + "learning_rate": 0.0001936727965478041, + "loss": 3.0824, + "step": 37667 + }, + { + "epoch": 1.85, + "grad_norm": 0.7261514067649841, + "learning_rate": 0.0001936583999438973, + "loss": 3.1328, + "step": 37668 + }, + { + "epoch": 1.85, + "grad_norm": 0.6595963835716248, + "learning_rate": 0.0001936440036200737, + "loss": 3.1274, + "step": 37669 + }, + { + "epoch": 1.85, + "grad_norm": 0.6450942158699036, + "learning_rate": 0.00019362960757637147, + "loss": 3.0672, + "step": 37670 + }, + { + "epoch": 1.85, + "grad_norm": 0.6324230432510376, + "learning_rate": 0.00019361521181282833, + "loss": 3.0313, + "step": 37671 + }, + { + "epoch": 1.85, + "grad_norm": 0.6502272486686707, + "learning_rate": 0.00019360081632948216, + "loss": 3.1039, + "step": 37672 + }, + { + "epoch": 1.85, + "grad_norm": 0.6629947423934937, + "learning_rate": 0.0001935864211263711, + "loss": 2.9381, + "step": 37673 + }, + { + "epoch": 1.85, + "grad_norm": 0.6200689077377319, + "learning_rate": 0.00019357202620353284, + "loss": 3.0525, + "step": 37674 + }, + { + "epoch": 1.85, + "grad_norm": 0.6390634775161743, + "learning_rate": 0.00019355763156100544, + "loss": 3.0149, + "step": 37675 + }, + { + "epoch": 1.85, + "grad_norm": 0.6882568597793579, + "learning_rate": 0.00019354323719882667, + "loss": 2.9673, + "step": 37676 + }, + { + "epoch": 1.85, + "grad_norm": 0.6751754283905029, + "learning_rate": 0.00019352884311703457, + "loss": 3.0688, + "step": 37677 + }, + { + "epoch": 1.85, + "grad_norm": 0.6495875716209412, + "learning_rate": 0.00019351444931566707, + "loss": 3.0199, + "step": 37678 + }, + { + "epoch": 1.85, + "grad_norm": 0.6434128880500793, + "learning_rate": 0.00019350005579476186, + "loss": 2.9602, + "step": 37679 + }, + { + "epoch": 1.85, + "grad_norm": 0.6609219312667847, + "learning_rate": 0.0001934856625543572, + "loss": 3.1063, + "step": 37680 + }, + { + "epoch": 1.85, + "grad_norm": 0.6295854449272156, + "learning_rate": 0.00019347126959449068, + "loss": 2.9994, + "step": 37681 + }, + { + "epoch": 1.85, + "grad_norm": 0.6681686639785767, + "learning_rate": 0.00019345687691520028, + "loss": 3.0875, + "step": 37682 + }, + { + "epoch": 1.85, + "grad_norm": 0.7291771173477173, + "learning_rate": 0.00019344248451652412, + "loss": 3.2816, + "step": 37683 + }, + { + "epoch": 1.85, + "grad_norm": 0.6248905062675476, + "learning_rate": 0.00019342809239849987, + "loss": 2.9861, + "step": 37684 + }, + { + "epoch": 1.85, + "grad_norm": 0.6470739841461182, + "learning_rate": 0.00019341370056116556, + "loss": 2.7754, + "step": 37685 + }, + { + "epoch": 1.85, + "grad_norm": 0.6730750203132629, + "learning_rate": 0.00019339930900455894, + "loss": 2.8785, + "step": 37686 + }, + { + "epoch": 1.85, + "grad_norm": 0.6697092652320862, + "learning_rate": 0.00019338491772871806, + "loss": 3.0509, + "step": 37687 + }, + { + "epoch": 1.85, + "grad_norm": 0.6595104336738586, + "learning_rate": 0.00019337052673368087, + "loss": 3.1409, + "step": 37688 + }, + { + "epoch": 1.85, + "grad_norm": 0.6694124937057495, + "learning_rate": 0.00019335613601948511, + "loss": 3.0137, + "step": 37689 + }, + { + "epoch": 1.85, + "grad_norm": 0.649567186832428, + "learning_rate": 0.00019334174558616887, + "loss": 3.1952, + "step": 37690 + }, + { + "epoch": 1.85, + "grad_norm": 0.6771290302276611, + "learning_rate": 0.0001933273554337698, + "loss": 2.8044, + "step": 37691 + }, + { + "epoch": 1.85, + "grad_norm": 0.6197347044944763, + "learning_rate": 0.00019331296556232598, + "loss": 3.1106, + "step": 37692 + }, + { + "epoch": 1.85, + "grad_norm": 0.6491581797599792, + "learning_rate": 0.0001932985759718754, + "loss": 3.0107, + "step": 37693 + }, + { + "epoch": 1.85, + "grad_norm": 0.6176472902297974, + "learning_rate": 0.00019328418666245568, + "loss": 2.936, + "step": 37694 + }, + { + "epoch": 1.85, + "grad_norm": 0.6549054980278015, + "learning_rate": 0.00019326979763410501, + "loss": 2.8333, + "step": 37695 + }, + { + "epoch": 1.85, + "grad_norm": 0.6261773109436035, + "learning_rate": 0.0001932554088868611, + "loss": 2.8714, + "step": 37696 + }, + { + "epoch": 1.85, + "grad_norm": 0.6560527086257935, + "learning_rate": 0.00019324102042076176, + "loss": 2.9073, + "step": 37697 + }, + { + "epoch": 1.85, + "grad_norm": 0.6137068867683411, + "learning_rate": 0.00019322663223584523, + "loss": 2.9635, + "step": 37698 + }, + { + "epoch": 1.85, + "grad_norm": 0.71515953540802, + "learning_rate": 0.0001932122443321491, + "loss": 2.8384, + "step": 37699 + }, + { + "epoch": 1.85, + "grad_norm": 0.6288661360740662, + "learning_rate": 0.00019319785670971143, + "loss": 2.9895, + "step": 37700 + }, + { + "epoch": 1.85, + "grad_norm": 0.6185125112533569, + "learning_rate": 0.00019318346936856998, + "loss": 3.028, + "step": 37701 + }, + { + "epoch": 1.85, + "grad_norm": 0.6613661646842957, + "learning_rate": 0.00019316908230876276, + "loss": 3.1521, + "step": 37702 + }, + { + "epoch": 1.85, + "grad_norm": 0.6700349450111389, + "learning_rate": 0.00019315469553032766, + "loss": 2.9103, + "step": 37703 + }, + { + "epoch": 1.85, + "grad_norm": 0.6402429938316345, + "learning_rate": 0.00019314030903330238, + "loss": 3.1274, + "step": 37704 + }, + { + "epoch": 1.85, + "grad_norm": 0.6753808856010437, + "learning_rate": 0.00019312592281772509, + "loss": 2.8903, + "step": 37705 + }, + { + "epoch": 1.85, + "grad_norm": 0.6379236578941345, + "learning_rate": 0.00019311153688363346, + "loss": 3.0307, + "step": 37706 + }, + { + "epoch": 1.85, + "grad_norm": 0.6957564353942871, + "learning_rate": 0.00019309715123106558, + "loss": 2.9885, + "step": 37707 + }, + { + "epoch": 1.85, + "grad_norm": 0.694107711315155, + "learning_rate": 0.00019308276586005915, + "loss": 3.096, + "step": 37708 + }, + { + "epoch": 1.85, + "grad_norm": 0.7162430882453918, + "learning_rate": 0.00019306838077065208, + "loss": 2.9719, + "step": 37709 + }, + { + "epoch": 1.85, + "grad_norm": 0.6701442003250122, + "learning_rate": 0.0001930539959628825, + "loss": 3.0133, + "step": 37710 + }, + { + "epoch": 1.85, + "grad_norm": 0.6491703391075134, + "learning_rate": 0.00019303961143678797, + "loss": 2.9792, + "step": 37711 + }, + { + "epoch": 1.85, + "grad_norm": 0.6918108463287354, + "learning_rate": 0.00019302522719240664, + "loss": 2.9754, + "step": 37712 + }, + { + "epoch": 1.85, + "grad_norm": 0.6854971647262573, + "learning_rate": 0.00019301084322977612, + "loss": 3.0784, + "step": 37713 + }, + { + "epoch": 1.85, + "grad_norm": 0.6159098148345947, + "learning_rate": 0.00019299645954893452, + "loss": 2.9788, + "step": 37714 + }, + { + "epoch": 1.85, + "grad_norm": 0.6346889138221741, + "learning_rate": 0.0001929820761499197, + "loss": 2.8104, + "step": 37715 + }, + { + "epoch": 1.85, + "grad_norm": 0.6619777679443359, + "learning_rate": 0.0001929676930327694, + "loss": 3.0457, + "step": 37716 + }, + { + "epoch": 1.85, + "grad_norm": 0.6497694253921509, + "learning_rate": 0.00019295331019752166, + "loss": 3.2556, + "step": 37717 + }, + { + "epoch": 1.85, + "grad_norm": 0.6476145386695862, + "learning_rate": 0.0001929389276442143, + "loss": 2.798, + "step": 37718 + }, + { + "epoch": 1.85, + "grad_norm": 0.6143190264701843, + "learning_rate": 0.00019292454537288514, + "loss": 3.0311, + "step": 37719 + }, + { + "epoch": 1.85, + "grad_norm": 0.635507345199585, + "learning_rate": 0.00019291016338357218, + "loss": 2.9349, + "step": 37720 + }, + { + "epoch": 1.85, + "grad_norm": 0.6005894541740417, + "learning_rate": 0.00019289578167631322, + "loss": 3.1017, + "step": 37721 + }, + { + "epoch": 1.85, + "grad_norm": 0.6846247315406799, + "learning_rate": 0.00019288140025114622, + "loss": 3.0524, + "step": 37722 + }, + { + "epoch": 1.85, + "grad_norm": 0.6498640179634094, + "learning_rate": 0.0001928670191081089, + "loss": 3.1925, + "step": 37723 + }, + { + "epoch": 1.85, + "grad_norm": 0.6696533560752869, + "learning_rate": 0.00019285263824723927, + "loss": 2.9829, + "step": 37724 + }, + { + "epoch": 1.85, + "grad_norm": 0.6601403951644897, + "learning_rate": 0.00019283825766857522, + "loss": 3.0455, + "step": 37725 + }, + { + "epoch": 1.85, + "grad_norm": 0.650351345539093, + "learning_rate": 0.00019282387737215448, + "loss": 3.0277, + "step": 37726 + }, + { + "epoch": 1.85, + "grad_norm": 0.5891628265380859, + "learning_rate": 0.0001928094973580151, + "loss": 2.8122, + "step": 37727 + }, + { + "epoch": 1.85, + "grad_norm": 0.7312580943107605, + "learning_rate": 0.00019279511762619478, + "loss": 2.9223, + "step": 37728 + }, + { + "epoch": 1.85, + "grad_norm": 0.6708784103393555, + "learning_rate": 0.00019278073817673155, + "loss": 3.0713, + "step": 37729 + }, + { + "epoch": 1.85, + "grad_norm": 0.6447456479072571, + "learning_rate": 0.0001927663590096633, + "loss": 2.892, + "step": 37730 + }, + { + "epoch": 1.85, + "grad_norm": 0.6298274993896484, + "learning_rate": 0.00019275198012502767, + "loss": 3.187, + "step": 37731 + }, + { + "epoch": 1.85, + "grad_norm": 0.683275043964386, + "learning_rate": 0.0001927376015228628, + "loss": 3.1043, + "step": 37732 + }, + { + "epoch": 1.85, + "grad_norm": 0.6236386299133301, + "learning_rate": 0.0001927232232032064, + "loss": 3.0443, + "step": 37733 + }, + { + "epoch": 1.85, + "grad_norm": 0.6350182294845581, + "learning_rate": 0.00019270884516609633, + "loss": 2.9401, + "step": 37734 + }, + { + "epoch": 1.85, + "grad_norm": 0.6639793515205383, + "learning_rate": 0.0001926944674115706, + "loss": 3.0851, + "step": 37735 + }, + { + "epoch": 1.85, + "grad_norm": 0.611555814743042, + "learning_rate": 0.000192680089939667, + "loss": 3.0449, + "step": 37736 + }, + { + "epoch": 1.85, + "grad_norm": 0.7056550979614258, + "learning_rate": 0.0001926657127504234, + "loss": 3.1318, + "step": 37737 + }, + { + "epoch": 1.85, + "grad_norm": 0.6342179775238037, + "learning_rate": 0.00019265133584387754, + "loss": 2.7571, + "step": 37738 + }, + { + "epoch": 1.85, + "grad_norm": 0.6653878688812256, + "learning_rate": 0.00019263695922006746, + "loss": 3.1323, + "step": 37739 + }, + { + "epoch": 1.85, + "grad_norm": 0.6867647171020508, + "learning_rate": 0.00019262258287903105, + "loss": 3.0003, + "step": 37740 + }, + { + "epoch": 1.85, + "grad_norm": 0.6258794069290161, + "learning_rate": 0.00019260820682080596, + "loss": 3.0193, + "step": 37741 + }, + { + "epoch": 1.85, + "grad_norm": 0.6566473841667175, + "learning_rate": 0.00019259383104543027, + "loss": 3.1575, + "step": 37742 + }, + { + "epoch": 1.85, + "grad_norm": 0.6476341485977173, + "learning_rate": 0.0001925794555529417, + "loss": 3.0576, + "step": 37743 + }, + { + "epoch": 1.85, + "grad_norm": 0.6693728566169739, + "learning_rate": 0.00019256508034337817, + "loss": 2.9551, + "step": 37744 + }, + { + "epoch": 1.85, + "grad_norm": 0.623833954334259, + "learning_rate": 0.0001925507054167776, + "loss": 3.0357, + "step": 37745 + }, + { + "epoch": 1.85, + "grad_norm": 0.6552188992500305, + "learning_rate": 0.00019253633077317772, + "loss": 2.9078, + "step": 37746 + }, + { + "epoch": 1.85, + "grad_norm": 0.6924782991409302, + "learning_rate": 0.00019252195641261663, + "loss": 2.8484, + "step": 37747 + }, + { + "epoch": 1.85, + "grad_norm": 0.6190445423126221, + "learning_rate": 0.00019250758233513185, + "loss": 3.0351, + "step": 37748 + }, + { + "epoch": 1.85, + "grad_norm": 0.7065290212631226, + "learning_rate": 0.0001924932085407614, + "loss": 3.1009, + "step": 37749 + }, + { + "epoch": 1.85, + "grad_norm": 0.6103439331054688, + "learning_rate": 0.00019247883502954326, + "loss": 3.1459, + "step": 37750 + }, + { + "epoch": 1.85, + "grad_norm": 0.6249575018882751, + "learning_rate": 0.00019246446180151515, + "loss": 3.0093, + "step": 37751 + }, + { + "epoch": 1.85, + "grad_norm": 0.6286122798919678, + "learning_rate": 0.00019245008885671494, + "loss": 3.0115, + "step": 37752 + }, + { + "epoch": 1.85, + "grad_norm": 0.6574020385742188, + "learning_rate": 0.00019243571619518044, + "loss": 2.8001, + "step": 37753 + }, + { + "epoch": 1.85, + "grad_norm": 0.6263217329978943, + "learning_rate": 0.00019242134381694962, + "loss": 3.0588, + "step": 37754 + }, + { + "epoch": 1.85, + "grad_norm": 0.6395589709281921, + "learning_rate": 0.0001924069717220603, + "loss": 2.9058, + "step": 37755 + }, + { + "epoch": 1.85, + "grad_norm": 0.6206561326980591, + "learning_rate": 0.00019239259991055022, + "loss": 3.1611, + "step": 37756 + }, + { + "epoch": 1.85, + "grad_norm": 0.6132239699363708, + "learning_rate": 0.00019237822838245744, + "loss": 2.9598, + "step": 37757 + }, + { + "epoch": 1.85, + "grad_norm": 0.6488093137741089, + "learning_rate": 0.00019236385713781965, + "loss": 3.0767, + "step": 37758 + }, + { + "epoch": 1.85, + "grad_norm": 0.6497656106948853, + "learning_rate": 0.00019234948617667463, + "loss": 3.0172, + "step": 37759 + }, + { + "epoch": 1.85, + "grad_norm": 0.6418945789337158, + "learning_rate": 0.00019233511549906053, + "loss": 2.9671, + "step": 37760 + }, + { + "epoch": 1.85, + "grad_norm": 0.6646747589111328, + "learning_rate": 0.0001923207451050149, + "loss": 3.0554, + "step": 37761 + }, + { + "epoch": 1.85, + "grad_norm": 0.5927530527114868, + "learning_rate": 0.0001923063749945758, + "loss": 2.8786, + "step": 37762 + }, + { + "epoch": 1.85, + "grad_norm": 0.6495875716209412, + "learning_rate": 0.00019229200516778083, + "loss": 3.0241, + "step": 37763 + }, + { + "epoch": 1.85, + "grad_norm": 0.6332212686538696, + "learning_rate": 0.00019227763562466808, + "loss": 3.0432, + "step": 37764 + }, + { + "epoch": 1.85, + "grad_norm": 0.6797601580619812, + "learning_rate": 0.00019226326636527538, + "loss": 2.9573, + "step": 37765 + }, + { + "epoch": 1.85, + "grad_norm": 0.6078984141349792, + "learning_rate": 0.00019224889738964033, + "loss": 2.9746, + "step": 37766 + }, + { + "epoch": 1.85, + "grad_norm": 0.6482206583023071, + "learning_rate": 0.0001922345286978011, + "loss": 3.1526, + "step": 37767 + }, + { + "epoch": 1.85, + "grad_norm": 0.6135627627372742, + "learning_rate": 0.00019222016028979527, + "loss": 2.9846, + "step": 37768 + }, + { + "epoch": 1.85, + "grad_norm": 0.6265912055969238, + "learning_rate": 0.00019220579216566082, + "loss": 3.0213, + "step": 37769 + }, + { + "epoch": 1.85, + "grad_norm": 0.648912787437439, + "learning_rate": 0.00019219142432543566, + "loss": 3.17, + "step": 37770 + }, + { + "epoch": 1.85, + "grad_norm": 0.6402798295021057, + "learning_rate": 0.00019217705676915744, + "loss": 3.302, + "step": 37771 + }, + { + "epoch": 1.85, + "grad_norm": 0.6315393447875977, + "learning_rate": 0.00019216268949686415, + "loss": 3.0794, + "step": 37772 + }, + { + "epoch": 1.85, + "grad_norm": 0.6344591975212097, + "learning_rate": 0.00019214832250859357, + "loss": 3.0817, + "step": 37773 + }, + { + "epoch": 1.85, + "grad_norm": 0.6381920576095581, + "learning_rate": 0.0001921339558043835, + "loss": 3.1171, + "step": 37774 + }, + { + "epoch": 1.85, + "grad_norm": 0.6676338315010071, + "learning_rate": 0.00019211958938427192, + "loss": 3.1187, + "step": 37775 + }, + { + "epoch": 1.85, + "grad_norm": 0.6421663761138916, + "learning_rate": 0.00019210522324829656, + "loss": 3.0546, + "step": 37776 + }, + { + "epoch": 1.85, + "grad_norm": 0.6194716691970825, + "learning_rate": 0.00019209085739649529, + "loss": 3.1053, + "step": 37777 + }, + { + "epoch": 1.85, + "grad_norm": 0.6466617584228516, + "learning_rate": 0.00019207649182890584, + "loss": 3.0838, + "step": 37778 + }, + { + "epoch": 1.85, + "grad_norm": 0.6185702085494995, + "learning_rate": 0.00019206212654556619, + "loss": 3.0664, + "step": 37779 + }, + { + "epoch": 1.85, + "grad_norm": 0.6287671327590942, + "learning_rate": 0.0001920477615465142, + "loss": 3.0368, + "step": 37780 + }, + { + "epoch": 1.85, + "grad_norm": 0.6562327146530151, + "learning_rate": 0.00019203339683178755, + "loss": 2.8597, + "step": 37781 + }, + { + "epoch": 1.85, + "grad_norm": 0.609699010848999, + "learning_rate": 0.0001920190324014242, + "loss": 3.0641, + "step": 37782 + }, + { + "epoch": 1.85, + "grad_norm": 0.631375789642334, + "learning_rate": 0.00019200466825546189, + "loss": 2.9766, + "step": 37783 + }, + { + "epoch": 1.85, + "grad_norm": 0.6531733870506287, + "learning_rate": 0.0001919903043939386, + "loss": 2.9142, + "step": 37784 + }, + { + "epoch": 1.85, + "grad_norm": 0.6809682846069336, + "learning_rate": 0.0001919759408168919, + "loss": 2.7791, + "step": 37785 + }, + { + "epoch": 1.85, + "grad_norm": 0.6292451024055481, + "learning_rate": 0.00019196157752435986, + "loss": 3.1373, + "step": 37786 + }, + { + "epoch": 1.85, + "grad_norm": 0.6322467923164368, + "learning_rate": 0.0001919472145163803, + "loss": 2.9073, + "step": 37787 + }, + { + "epoch": 1.85, + "grad_norm": 0.6325113773345947, + "learning_rate": 0.00019193285179299084, + "loss": 3.0091, + "step": 37788 + }, + { + "epoch": 1.85, + "grad_norm": 0.6427119374275208, + "learning_rate": 0.0001919184893542296, + "loss": 2.873, + "step": 37789 + }, + { + "epoch": 1.85, + "grad_norm": 0.6506245732307434, + "learning_rate": 0.0001919041272001341, + "loss": 2.9152, + "step": 37790 + }, + { + "epoch": 1.85, + "grad_norm": 0.7006027698516846, + "learning_rate": 0.0001918897653307424, + "loss": 3.2097, + "step": 37791 + }, + { + "epoch": 1.85, + "grad_norm": 0.6487759947776794, + "learning_rate": 0.00019187540374609235, + "loss": 3.1275, + "step": 37792 + }, + { + "epoch": 1.85, + "grad_norm": 0.6356369256973267, + "learning_rate": 0.00019186104244622152, + "loss": 2.8266, + "step": 37793 + }, + { + "epoch": 1.85, + "grad_norm": 0.6854275465011597, + "learning_rate": 0.000191846681431168, + "loss": 2.9221, + "step": 37794 + }, + { + "epoch": 1.85, + "grad_norm": 0.6778642535209656, + "learning_rate": 0.0001918323207009695, + "loss": 3.0824, + "step": 37795 + }, + { + "epoch": 1.85, + "grad_norm": 0.6329032182693481, + "learning_rate": 0.00019181796025566375, + "loss": 2.8184, + "step": 37796 + }, + { + "epoch": 1.85, + "grad_norm": 0.6632543206214905, + "learning_rate": 0.0001918036000952888, + "loss": 2.7941, + "step": 37797 + }, + { + "epoch": 1.85, + "grad_norm": 0.6179139614105225, + "learning_rate": 0.00019178924021988225, + "loss": 2.9767, + "step": 37798 + }, + { + "epoch": 1.85, + "grad_norm": 0.6899917721748352, + "learning_rate": 0.00019177488062948215, + "loss": 3.0879, + "step": 37799 + }, + { + "epoch": 1.85, + "grad_norm": 0.6436970829963684, + "learning_rate": 0.00019176052132412598, + "loss": 3.2205, + "step": 37800 + }, + { + "epoch": 1.85, + "grad_norm": 0.6844620108604431, + "learning_rate": 0.00019174616230385188, + "loss": 3.1832, + "step": 37801 + }, + { + "epoch": 1.85, + "grad_norm": 0.6446673274040222, + "learning_rate": 0.0001917318035686976, + "loss": 3.1017, + "step": 37802 + }, + { + "epoch": 1.85, + "grad_norm": 0.676476001739502, + "learning_rate": 0.0001917174451187008, + "loss": 3.0669, + "step": 37803 + }, + { + "epoch": 1.85, + "grad_norm": 0.6377735733985901, + "learning_rate": 0.00019170308695389956, + "loss": 3.1079, + "step": 37804 + }, + { + "epoch": 1.85, + "grad_norm": 0.6439332962036133, + "learning_rate": 0.00019168872907433142, + "loss": 2.9107, + "step": 37805 + }, + { + "epoch": 1.85, + "grad_norm": 0.6758326888084412, + "learning_rate": 0.00019167437148003428, + "loss": 2.9954, + "step": 37806 + }, + { + "epoch": 1.85, + "grad_norm": 0.6786984801292419, + "learning_rate": 0.00019166001417104616, + "loss": 3.1813, + "step": 37807 + }, + { + "epoch": 1.85, + "grad_norm": 0.6583569645881653, + "learning_rate": 0.00019164565714740458, + "loss": 3.01, + "step": 37808 + }, + { + "epoch": 1.85, + "grad_norm": 0.6564452648162842, + "learning_rate": 0.0001916313004091476, + "loss": 2.9767, + "step": 37809 + }, + { + "epoch": 1.85, + "grad_norm": 0.6123052835464478, + "learning_rate": 0.00019161694395631283, + "loss": 2.8546, + "step": 37810 + }, + { + "epoch": 1.85, + "grad_norm": 0.6195050477981567, + "learning_rate": 0.00019160258778893813, + "loss": 2.8823, + "step": 37811 + }, + { + "epoch": 1.85, + "grad_norm": 0.6337482333183289, + "learning_rate": 0.00019158823190706148, + "loss": 2.9034, + "step": 37812 + }, + { + "epoch": 1.85, + "grad_norm": 0.6698657274246216, + "learning_rate": 0.0001915738763107205, + "loss": 3.1051, + "step": 37813 + }, + { + "epoch": 1.85, + "grad_norm": 0.6584785580635071, + "learning_rate": 0.00019155952099995311, + "loss": 3.0516, + "step": 37814 + }, + { + "epoch": 1.85, + "grad_norm": 0.715290367603302, + "learning_rate": 0.00019154516597479697, + "loss": 3.1058, + "step": 37815 + }, + { + "epoch": 1.85, + "grad_norm": 0.6434201598167419, + "learning_rate": 0.00019153081123529005, + "loss": 2.9279, + "step": 37816 + }, + { + "epoch": 1.85, + "grad_norm": 0.6424633264541626, + "learning_rate": 0.00019151645678147015, + "loss": 3.0143, + "step": 37817 + }, + { + "epoch": 1.85, + "grad_norm": 0.6202142238616943, + "learning_rate": 0.00019150210261337488, + "loss": 3.1368, + "step": 37818 + }, + { + "epoch": 1.85, + "grad_norm": 0.6413096189498901, + "learning_rate": 0.00019148774873104237, + "loss": 3.1654, + "step": 37819 + }, + { + "epoch": 1.85, + "grad_norm": 0.6295573711395264, + "learning_rate": 0.00019147339513451016, + "loss": 3.0959, + "step": 37820 + }, + { + "epoch": 1.85, + "grad_norm": 0.6364253163337708, + "learning_rate": 0.00019145904182381606, + "loss": 3.0315, + "step": 37821 + }, + { + "epoch": 1.85, + "grad_norm": 0.6383337378501892, + "learning_rate": 0.00019144468879899807, + "loss": 2.9825, + "step": 37822 + }, + { + "epoch": 1.85, + "grad_norm": 0.6247068643569946, + "learning_rate": 0.00019143033606009384, + "loss": 2.9444, + "step": 37823 + }, + { + "epoch": 1.85, + "grad_norm": 0.6323005557060242, + "learning_rate": 0.00019141598360714128, + "loss": 3.0731, + "step": 37824 + }, + { + "epoch": 1.85, + "grad_norm": 0.6761791706085205, + "learning_rate": 0.00019140163144017798, + "loss": 2.9698, + "step": 37825 + }, + { + "epoch": 1.85, + "grad_norm": 0.710123598575592, + "learning_rate": 0.00019138727955924197, + "loss": 2.9033, + "step": 37826 + }, + { + "epoch": 1.85, + "grad_norm": 0.6326896548271179, + "learning_rate": 0.000191372927964371, + "loss": 3.2679, + "step": 37827 + }, + { + "epoch": 1.85, + "grad_norm": 0.6370804309844971, + "learning_rate": 0.00019135857665560272, + "loss": 3.0771, + "step": 37828 + }, + { + "epoch": 1.85, + "grad_norm": 0.8834513425827026, + "learning_rate": 0.00019134422563297514, + "loss": 2.8565, + "step": 37829 + }, + { + "epoch": 1.85, + "grad_norm": 0.6568150520324707, + "learning_rate": 0.00019132987489652584, + "loss": 3.1293, + "step": 37830 + }, + { + "epoch": 1.85, + "grad_norm": 0.6432253122329712, + "learning_rate": 0.00019131552444629282, + "loss": 2.9941, + "step": 37831 + }, + { + "epoch": 1.85, + "grad_norm": 0.6338484883308411, + "learning_rate": 0.00019130117428231384, + "loss": 2.9888, + "step": 37832 + }, + { + "epoch": 1.85, + "grad_norm": 0.6891887187957764, + "learning_rate": 0.00019128682440462653, + "loss": 3.0169, + "step": 37833 + }, + { + "epoch": 1.85, + "grad_norm": 0.6474312543869019, + "learning_rate": 0.00019127247481326895, + "loss": 2.8492, + "step": 37834 + }, + { + "epoch": 1.85, + "grad_norm": 0.6678059101104736, + "learning_rate": 0.00019125812550827866, + "loss": 3.1827, + "step": 37835 + }, + { + "epoch": 1.85, + "grad_norm": 0.663590133190155, + "learning_rate": 0.00019124377648969346, + "loss": 3.037, + "step": 37836 + }, + { + "epoch": 1.85, + "grad_norm": 0.7006127238273621, + "learning_rate": 0.00019122942775755134, + "loss": 2.9233, + "step": 37837 + }, + { + "epoch": 1.85, + "grad_norm": 0.6066510677337646, + "learning_rate": 0.00019121507931188993, + "loss": 3.0044, + "step": 37838 + }, + { + "epoch": 1.85, + "grad_norm": 0.6588680744171143, + "learning_rate": 0.00019120073115274716, + "loss": 2.8884, + "step": 37839 + }, + { + "epoch": 1.85, + "grad_norm": 0.6584458351135254, + "learning_rate": 0.00019118638328016055, + "loss": 3.0469, + "step": 37840 + }, + { + "epoch": 1.85, + "grad_norm": 0.6320998668670654, + "learning_rate": 0.00019117203569416814, + "loss": 3.0939, + "step": 37841 + }, + { + "epoch": 1.85, + "grad_norm": 0.699286162853241, + "learning_rate": 0.00019115768839480772, + "loss": 2.8838, + "step": 37842 + }, + { + "epoch": 1.85, + "grad_norm": 0.6417528986930847, + "learning_rate": 0.00019114334138211692, + "loss": 2.9171, + "step": 37843 + }, + { + "epoch": 1.85, + "grad_norm": 0.614177942276001, + "learning_rate": 0.00019112899465613365, + "loss": 2.8298, + "step": 37844 + }, + { + "epoch": 1.85, + "grad_norm": 0.6882364153862, + "learning_rate": 0.00019111464821689565, + "loss": 2.9626, + "step": 37845 + }, + { + "epoch": 1.85, + "grad_norm": 0.6353859305381775, + "learning_rate": 0.0001911003020644406, + "loss": 2.9776, + "step": 37846 + }, + { + "epoch": 1.85, + "grad_norm": 0.6176462769508362, + "learning_rate": 0.00019108595619880657, + "loss": 3.1251, + "step": 37847 + }, + { + "epoch": 1.85, + "grad_norm": 0.6521666049957275, + "learning_rate": 0.00019107161062003106, + "loss": 2.8021, + "step": 37848 + }, + { + "epoch": 1.85, + "grad_norm": 0.6447446346282959, + "learning_rate": 0.0001910572653281521, + "loss": 3.2144, + "step": 37849 + }, + { + "epoch": 1.85, + "grad_norm": 0.6715746521949768, + "learning_rate": 0.00019104292032320712, + "loss": 3.0234, + "step": 37850 + }, + { + "epoch": 1.85, + "grad_norm": 0.6217811107635498, + "learning_rate": 0.0001910285756052342, + "loss": 2.7758, + "step": 37851 + }, + { + "epoch": 1.86, + "grad_norm": 0.6977801322937012, + "learning_rate": 0.0001910142311742711, + "loss": 2.9053, + "step": 37852 + }, + { + "epoch": 1.86, + "grad_norm": 0.6272152066230774, + "learning_rate": 0.0001909998870303555, + "loss": 3.087, + "step": 37853 + }, + { + "epoch": 1.86, + "grad_norm": 0.6529848575592041, + "learning_rate": 0.0001909855431735253, + "loss": 3.0135, + "step": 37854 + }, + { + "epoch": 1.86, + "grad_norm": 0.6610116362571716, + "learning_rate": 0.00019097119960381805, + "loss": 2.8011, + "step": 37855 + }, + { + "epoch": 1.86, + "grad_norm": 0.6474286913871765, + "learning_rate": 0.00019095685632127172, + "loss": 3.1778, + "step": 37856 + }, + { + "epoch": 1.86, + "grad_norm": 0.7384703755378723, + "learning_rate": 0.00019094251332592418, + "loss": 3.0711, + "step": 37857 + }, + { + "epoch": 1.86, + "grad_norm": 0.668545126914978, + "learning_rate": 0.0001909281706178129, + "loss": 2.834, + "step": 37858 + }, + { + "epoch": 1.86, + "grad_norm": 0.640242874622345, + "learning_rate": 0.00019091382819697597, + "loss": 3.1228, + "step": 37859 + }, + { + "epoch": 1.86, + "grad_norm": 0.6718035936355591, + "learning_rate": 0.00019089948606345095, + "loss": 2.866, + "step": 37860 + }, + { + "epoch": 1.86, + "grad_norm": 0.6487429141998291, + "learning_rate": 0.00019088514421727572, + "loss": 2.8989, + "step": 37861 + }, + { + "epoch": 1.86, + "grad_norm": 0.6288765668869019, + "learning_rate": 0.00019087080265848795, + "loss": 2.8211, + "step": 37862 + }, + { + "epoch": 1.86, + "grad_norm": 0.644413411617279, + "learning_rate": 0.0001908564613871255, + "loss": 3.0838, + "step": 37863 + }, + { + "epoch": 1.86, + "grad_norm": 0.6302248239517212, + "learning_rate": 0.00019084212040322622, + "loss": 2.9669, + "step": 37864 + }, + { + "epoch": 1.86, + "grad_norm": 0.6651326417922974, + "learning_rate": 0.00019082777970682763, + "loss": 2.8824, + "step": 37865 + }, + { + "epoch": 1.86, + "grad_norm": 0.631876528263092, + "learning_rate": 0.0001908134392979678, + "loss": 3.0457, + "step": 37866 + }, + { + "epoch": 1.86, + "grad_norm": 0.7018389105796814, + "learning_rate": 0.00019079909917668432, + "loss": 3.0692, + "step": 37867 + }, + { + "epoch": 1.86, + "grad_norm": 0.6403344869613647, + "learning_rate": 0.00019078475934301492, + "loss": 2.8391, + "step": 37868 + }, + { + "epoch": 1.86, + "grad_norm": 0.6221745014190674, + "learning_rate": 0.00019077041979699753, + "loss": 3.002, + "step": 37869 + }, + { + "epoch": 1.86, + "grad_norm": 0.6578125357627869, + "learning_rate": 0.00019075608053866974, + "loss": 3.0516, + "step": 37870 + }, + { + "epoch": 1.86, + "grad_norm": 0.6671814918518066, + "learning_rate": 0.00019074174156806956, + "loss": 2.9886, + "step": 37871 + }, + { + "epoch": 1.86, + "grad_norm": 0.6524838805198669, + "learning_rate": 0.00019072740288523452, + "loss": 2.9684, + "step": 37872 + }, + { + "epoch": 1.86, + "grad_norm": 0.6703388690948486, + "learning_rate": 0.00019071306449020236, + "loss": 2.9369, + "step": 37873 + }, + { + "epoch": 1.86, + "grad_norm": 0.6536246538162231, + "learning_rate": 0.00019069872638301116, + "loss": 3.1842, + "step": 37874 + }, + { + "epoch": 1.86, + "grad_norm": 0.6407559514045715, + "learning_rate": 0.00019068438856369838, + "loss": 3.1175, + "step": 37875 + }, + { + "epoch": 1.86, + "grad_norm": 0.6341151595115662, + "learning_rate": 0.00019067005103230196, + "loss": 3.2322, + "step": 37876 + }, + { + "epoch": 1.86, + "grad_norm": 0.6281828880310059, + "learning_rate": 0.00019065571378885942, + "loss": 2.9174, + "step": 37877 + }, + { + "epoch": 1.86, + "grad_norm": 0.6292392611503601, + "learning_rate": 0.0001906413768334088, + "loss": 2.999, + "step": 37878 + }, + { + "epoch": 1.86, + "grad_norm": 0.7081571221351624, + "learning_rate": 0.00019062704016598777, + "loss": 3.2075, + "step": 37879 + }, + { + "epoch": 1.86, + "grad_norm": 0.6541411876678467, + "learning_rate": 0.00019061270378663395, + "loss": 3.3458, + "step": 37880 + }, + { + "epoch": 1.86, + "grad_norm": 0.6368046402931213, + "learning_rate": 0.00019059836769538534, + "loss": 2.9417, + "step": 37881 + }, + { + "epoch": 1.86, + "grad_norm": 0.6417937874794006, + "learning_rate": 0.00019058403189227952, + "loss": 3.0076, + "step": 37882 + }, + { + "epoch": 1.86, + "grad_norm": 0.6951587200164795, + "learning_rate": 0.00019056969637735422, + "loss": 2.9381, + "step": 37883 + }, + { + "epoch": 1.86, + "grad_norm": 0.6338977813720703, + "learning_rate": 0.00019055536115064747, + "loss": 3.0194, + "step": 37884 + }, + { + "epoch": 1.86, + "grad_norm": 0.6625250577926636, + "learning_rate": 0.0001905410262121967, + "loss": 3.0054, + "step": 37885 + }, + { + "epoch": 1.86, + "grad_norm": 0.6405186653137207, + "learning_rate": 0.0001905266915620399, + "loss": 3.0492, + "step": 37886 + }, + { + "epoch": 1.86, + "grad_norm": 0.644585132598877, + "learning_rate": 0.0001905123572002146, + "loss": 2.828, + "step": 37887 + }, + { + "epoch": 1.86, + "grad_norm": 0.6336441040039062, + "learning_rate": 0.00019049802312675872, + "loss": 2.9381, + "step": 37888 + }, + { + "epoch": 1.86, + "grad_norm": 0.6756220459938049, + "learning_rate": 0.00019048368934171004, + "loss": 2.8513, + "step": 37889 + }, + { + "epoch": 1.86, + "grad_norm": 0.6149240136146545, + "learning_rate": 0.00019046935584510614, + "loss": 2.9183, + "step": 37890 + }, + { + "epoch": 1.86, + "grad_norm": 0.6757703423500061, + "learning_rate": 0.00019045502263698497, + "loss": 2.9603, + "step": 37891 + }, + { + "epoch": 1.86, + "grad_norm": 0.6618746519088745, + "learning_rate": 0.0001904406897173841, + "loss": 2.9956, + "step": 37892 + }, + { + "epoch": 1.86, + "grad_norm": 0.6471590995788574, + "learning_rate": 0.0001904263570863414, + "loss": 2.9915, + "step": 37893 + }, + { + "epoch": 1.86, + "grad_norm": 0.6565477252006531, + "learning_rate": 0.00019041202474389467, + "loss": 2.9342, + "step": 37894 + }, + { + "epoch": 1.86, + "grad_norm": 0.6662431955337524, + "learning_rate": 0.00019039769269008148, + "loss": 2.9803, + "step": 37895 + }, + { + "epoch": 1.86, + "grad_norm": 0.6561885476112366, + "learning_rate": 0.00019038336092493975, + "loss": 3.0549, + "step": 37896 + }, + { + "epoch": 1.86, + "grad_norm": 0.6919810771942139, + "learning_rate": 0.0001903690294485071, + "loss": 3.1325, + "step": 37897 + }, + { + "epoch": 1.86, + "grad_norm": 0.6764747500419617, + "learning_rate": 0.00019035469826082126, + "loss": 2.9849, + "step": 37898 + }, + { + "epoch": 1.86, + "grad_norm": 0.6264052987098694, + "learning_rate": 0.00019034036736192018, + "loss": 2.9931, + "step": 37899 + }, + { + "epoch": 1.86, + "grad_norm": 0.652153491973877, + "learning_rate": 0.00019032603675184142, + "loss": 2.9467, + "step": 37900 + }, + { + "epoch": 1.86, + "grad_norm": 0.6279745101928711, + "learning_rate": 0.00019031170643062282, + "loss": 3.1106, + "step": 37901 + }, + { + "epoch": 1.86, + "grad_norm": 0.6377081871032715, + "learning_rate": 0.00019029737639830193, + "loss": 2.8446, + "step": 37902 + }, + { + "epoch": 1.86, + "grad_norm": 0.6817088723182678, + "learning_rate": 0.0001902830466549167, + "loss": 2.9539, + "step": 37903 + }, + { + "epoch": 1.86, + "grad_norm": 0.6729699969291687, + "learning_rate": 0.0001902687172005049, + "loss": 3.0809, + "step": 37904 + }, + { + "epoch": 1.86, + "grad_norm": 0.680040180683136, + "learning_rate": 0.00019025438803510405, + "loss": 2.9381, + "step": 37905 + }, + { + "epoch": 1.86, + "grad_norm": 0.6663704514503479, + "learning_rate": 0.00019024005915875214, + "loss": 3.1179, + "step": 37906 + }, + { + "epoch": 1.86, + "grad_norm": 0.6278336048126221, + "learning_rate": 0.00019022573057148675, + "loss": 2.857, + "step": 37907 + }, + { + "epoch": 1.86, + "grad_norm": 0.6568112373352051, + "learning_rate": 0.0001902114022733456, + "loss": 3.1337, + "step": 37908 + }, + { + "epoch": 1.86, + "grad_norm": 0.6545429229736328, + "learning_rate": 0.00019019707426436654, + "loss": 3.0168, + "step": 37909 + }, + { + "epoch": 1.86, + "grad_norm": 0.6433941125869751, + "learning_rate": 0.0001901827465445872, + "loss": 3.0279, + "step": 37910 + }, + { + "epoch": 1.86, + "grad_norm": 0.6414997577667236, + "learning_rate": 0.00019016841911404555, + "loss": 3.0132, + "step": 37911 + }, + { + "epoch": 1.86, + "grad_norm": 0.6626715660095215, + "learning_rate": 0.00019015409197277892, + "loss": 2.9186, + "step": 37912 + }, + { + "epoch": 1.86, + "grad_norm": 0.6758489608764648, + "learning_rate": 0.00019013976512082532, + "loss": 2.8539, + "step": 37913 + }, + { + "epoch": 1.86, + "grad_norm": 0.661674439907074, + "learning_rate": 0.00019012543855822253, + "loss": 3.1409, + "step": 37914 + }, + { + "epoch": 1.86, + "grad_norm": 0.6630668640136719, + "learning_rate": 0.00019011111228500814, + "loss": 3.066, + "step": 37915 + }, + { + "epoch": 1.86, + "grad_norm": 0.6342833638191223, + "learning_rate": 0.00019009678630121997, + "loss": 2.8859, + "step": 37916 + }, + { + "epoch": 1.86, + "grad_norm": 0.6333215832710266, + "learning_rate": 0.00019008246060689562, + "loss": 3.1166, + "step": 37917 + }, + { + "epoch": 1.86, + "grad_norm": 0.6316998600959778, + "learning_rate": 0.00019006813520207295, + "loss": 2.9157, + "step": 37918 + }, + { + "epoch": 1.86, + "grad_norm": 0.6606517434120178, + "learning_rate": 0.00019005381008678973, + "loss": 2.9766, + "step": 37919 + }, + { + "epoch": 1.86, + "grad_norm": 0.6711786985397339, + "learning_rate": 0.0001900394852610835, + "loss": 3.1096, + "step": 37920 + }, + { + "epoch": 1.86, + "grad_norm": 0.7660440802574158, + "learning_rate": 0.00019002516072499223, + "loss": 2.9022, + "step": 37921 + }, + { + "epoch": 1.86, + "grad_norm": 0.7094885110855103, + "learning_rate": 0.00019001083647855347, + "loss": 2.9963, + "step": 37922 + }, + { + "epoch": 1.86, + "grad_norm": 0.6385478377342224, + "learning_rate": 0.00018999651252180486, + "loss": 3.0875, + "step": 37923 + }, + { + "epoch": 1.86, + "grad_norm": 0.632442831993103, + "learning_rate": 0.00018998218885478446, + "loss": 3.0835, + "step": 37924 + }, + { + "epoch": 1.86, + "grad_norm": 0.6296939849853516, + "learning_rate": 0.00018996786547752974, + "loss": 3.0832, + "step": 37925 + }, + { + "epoch": 1.86, + "grad_norm": 0.6628495454788208, + "learning_rate": 0.00018995354239007852, + "loss": 3.208, + "step": 37926 + }, + { + "epoch": 1.86, + "grad_norm": 0.6310528516769409, + "learning_rate": 0.0001899392195924684, + "loss": 3.0514, + "step": 37927 + }, + { + "epoch": 1.86, + "grad_norm": 0.6705329418182373, + "learning_rate": 0.00018992489708473723, + "loss": 2.9223, + "step": 37928 + }, + { + "epoch": 1.86, + "grad_norm": 0.6532049179077148, + "learning_rate": 0.00018991057486692276, + "loss": 2.8727, + "step": 37929 + }, + { + "epoch": 1.86, + "grad_norm": 0.6473034024238586, + "learning_rate": 0.00018989625293906252, + "loss": 2.9542, + "step": 37930 + }, + { + "epoch": 1.86, + "grad_norm": 0.7011779546737671, + "learning_rate": 0.00018988193130119448, + "loss": 3.082, + "step": 37931 + }, + { + "epoch": 1.86, + "grad_norm": 0.6515766382217407, + "learning_rate": 0.00018986760995335614, + "loss": 2.863, + "step": 37932 + }, + { + "epoch": 1.86, + "grad_norm": 0.6576693654060364, + "learning_rate": 0.0001898532888955854, + "loss": 2.8862, + "step": 37933 + }, + { + "epoch": 1.86, + "grad_norm": 0.6526912450790405, + "learning_rate": 0.0001898389681279199, + "loss": 2.8443, + "step": 37934 + }, + { + "epoch": 1.86, + "grad_norm": 0.6956529021263123, + "learning_rate": 0.00018982464765039726, + "loss": 3.0819, + "step": 37935 + }, + { + "epoch": 1.86, + "grad_norm": 0.6562744975090027, + "learning_rate": 0.00018981032746305542, + "loss": 2.8812, + "step": 37936 + }, + { + "epoch": 1.86, + "grad_norm": 0.6449558138847351, + "learning_rate": 0.00018979600756593193, + "loss": 2.9919, + "step": 37937 + }, + { + "epoch": 1.86, + "grad_norm": 0.6720443367958069, + "learning_rate": 0.00018978168795906458, + "loss": 3.1166, + "step": 37938 + }, + { + "epoch": 1.86, + "grad_norm": 0.6303071975708008, + "learning_rate": 0.00018976736864249094, + "loss": 3.0804, + "step": 37939 + }, + { + "epoch": 1.86, + "grad_norm": 0.6402961015701294, + "learning_rate": 0.00018975304961624893, + "loss": 2.8876, + "step": 37940 + }, + { + "epoch": 1.86, + "grad_norm": 0.654398500919342, + "learning_rate": 0.00018973873088037616, + "loss": 2.8619, + "step": 37941 + }, + { + "epoch": 1.86, + "grad_norm": 0.6452508568763733, + "learning_rate": 0.00018972441243491028, + "loss": 2.9692, + "step": 37942 + }, + { + "epoch": 1.86, + "grad_norm": 0.6764039397239685, + "learning_rate": 0.00018971009427988918, + "loss": 3.1798, + "step": 37943 + }, + { + "epoch": 1.86, + "grad_norm": 0.6741904616355896, + "learning_rate": 0.00018969577641535042, + "loss": 3.0523, + "step": 37944 + }, + { + "epoch": 1.86, + "grad_norm": 0.6469334959983826, + "learning_rate": 0.00018968145884133169, + "loss": 3.0406, + "step": 37945 + }, + { + "epoch": 1.86, + "grad_norm": 0.678054690361023, + "learning_rate": 0.0001896671415578709, + "loss": 3.0538, + "step": 37946 + }, + { + "epoch": 1.86, + "grad_norm": 0.678111732006073, + "learning_rate": 0.00018965282456500553, + "loss": 3.0945, + "step": 37947 + }, + { + "epoch": 1.86, + "grad_norm": 0.6882033348083496, + "learning_rate": 0.00018963850786277346, + "loss": 3.1515, + "step": 37948 + }, + { + "epoch": 1.86, + "grad_norm": 0.6521106958389282, + "learning_rate": 0.0001896241914512122, + "loss": 2.9909, + "step": 37949 + }, + { + "epoch": 1.86, + "grad_norm": 0.6649858355522156, + "learning_rate": 0.00018960987533035963, + "loss": 2.954, + "step": 37950 + }, + { + "epoch": 1.86, + "grad_norm": 0.6313856244087219, + "learning_rate": 0.0001895955595002535, + "loss": 3.0676, + "step": 37951 + }, + { + "epoch": 1.86, + "grad_norm": 0.6400474309921265, + "learning_rate": 0.0001895812439609313, + "loss": 3.0938, + "step": 37952 + }, + { + "epoch": 1.86, + "grad_norm": 0.6551691293716431, + "learning_rate": 0.0001895669287124309, + "loss": 2.9153, + "step": 37953 + }, + { + "epoch": 1.86, + "grad_norm": 0.6542170643806458, + "learning_rate": 0.0001895526137547899, + "loss": 3.0044, + "step": 37954 + }, + { + "epoch": 1.86, + "grad_norm": 0.6334107518196106, + "learning_rate": 0.00018953829908804608, + "loss": 2.9579, + "step": 37955 + }, + { + "epoch": 1.86, + "grad_norm": 0.6488938927650452, + "learning_rate": 0.0001895239847122372, + "loss": 3.2192, + "step": 37956 + }, + { + "epoch": 1.86, + "grad_norm": 0.7389078736305237, + "learning_rate": 0.00018950967062740076, + "loss": 2.7784, + "step": 37957 + }, + { + "epoch": 1.86, + "grad_norm": 0.6441724300384521, + "learning_rate": 0.0001894953568335747, + "loss": 3.2414, + "step": 37958 + }, + { + "epoch": 1.86, + "grad_norm": 0.6733617186546326, + "learning_rate": 0.00018948104333079658, + "loss": 3.0559, + "step": 37959 + }, + { + "epoch": 1.86, + "grad_norm": 0.6783096790313721, + "learning_rate": 0.00018946673011910404, + "loss": 2.9956, + "step": 37960 + }, + { + "epoch": 1.86, + "grad_norm": 0.6289350390434265, + "learning_rate": 0.00018945241719853497, + "loss": 3.144, + "step": 37961 + }, + { + "epoch": 1.86, + "grad_norm": 0.7053776383399963, + "learning_rate": 0.0001894381045691269, + "loss": 3.1361, + "step": 37962 + }, + { + "epoch": 1.86, + "grad_norm": 0.6618072986602783, + "learning_rate": 0.00018942379223091765, + "loss": 2.9112, + "step": 37963 + }, + { + "epoch": 1.86, + "grad_norm": 0.6476094722747803, + "learning_rate": 0.00018940948018394472, + "loss": 3.1336, + "step": 37964 + }, + { + "epoch": 1.86, + "grad_norm": 0.6161331534385681, + "learning_rate": 0.00018939516842824599, + "loss": 3.2245, + "step": 37965 + }, + { + "epoch": 1.86, + "grad_norm": 0.6259023547172546, + "learning_rate": 0.00018938085696385917, + "loss": 2.9133, + "step": 37966 + }, + { + "epoch": 1.86, + "grad_norm": 0.6382219195365906, + "learning_rate": 0.00018936654579082175, + "loss": 2.9625, + "step": 37967 + }, + { + "epoch": 1.86, + "grad_norm": 0.6519684791564941, + "learning_rate": 0.0001893522349091717, + "loss": 3.0695, + "step": 37968 + }, + { + "epoch": 1.86, + "grad_norm": 0.7257705926895142, + "learning_rate": 0.0001893379243189465, + "loss": 3.1289, + "step": 37969 + }, + { + "epoch": 1.86, + "grad_norm": 0.7212842702865601, + "learning_rate": 0.00018932361402018388, + "loss": 3.1242, + "step": 37970 + }, + { + "epoch": 1.86, + "grad_norm": 0.6503854990005493, + "learning_rate": 0.00018930930401292163, + "loss": 2.8469, + "step": 37971 + }, + { + "epoch": 1.86, + "grad_norm": 0.6265800595283508, + "learning_rate": 0.00018929499429719726, + "loss": 2.973, + "step": 37972 + }, + { + "epoch": 1.86, + "grad_norm": 0.6678257584571838, + "learning_rate": 0.0001892806848730488, + "loss": 3.1497, + "step": 37973 + }, + { + "epoch": 1.86, + "grad_norm": 0.6247530579566956, + "learning_rate": 0.00018926637574051346, + "loss": 2.8594, + "step": 37974 + }, + { + "epoch": 1.86, + "grad_norm": 0.7319062352180481, + "learning_rate": 0.0001892520668996292, + "loss": 3.0092, + "step": 37975 + }, + { + "epoch": 1.86, + "grad_norm": 0.6562508940696716, + "learning_rate": 0.00018923775835043383, + "loss": 2.9743, + "step": 37976 + }, + { + "epoch": 1.86, + "grad_norm": 0.6403142213821411, + "learning_rate": 0.00018922345009296477, + "loss": 2.8766, + "step": 37977 + }, + { + "epoch": 1.86, + "grad_norm": 0.6423630714416504, + "learning_rate": 0.00018920914212725994, + "loss": 2.9736, + "step": 37978 + }, + { + "epoch": 1.86, + "grad_norm": 0.6349278688430786, + "learning_rate": 0.00018919483445335676, + "loss": 3.0937, + "step": 37979 + }, + { + "epoch": 1.86, + "grad_norm": 0.6362356543540955, + "learning_rate": 0.00018918052707129313, + "loss": 2.9766, + "step": 37980 + }, + { + "epoch": 1.86, + "grad_norm": 0.642997145652771, + "learning_rate": 0.0001891662199811067, + "loss": 2.864, + "step": 37981 + }, + { + "epoch": 1.86, + "grad_norm": 0.6604081988334656, + "learning_rate": 0.00018915191318283502, + "loss": 2.9531, + "step": 37982 + }, + { + "epoch": 1.86, + "grad_norm": 0.6715200543403625, + "learning_rate": 0.00018913760667651602, + "loss": 3.0857, + "step": 37983 + }, + { + "epoch": 1.86, + "grad_norm": 0.6149404644966125, + "learning_rate": 0.0001891233004621871, + "loss": 3.145, + "step": 37984 + }, + { + "epoch": 1.86, + "grad_norm": 0.6347184181213379, + "learning_rate": 0.00018910899453988603, + "loss": 2.9672, + "step": 37985 + }, + { + "epoch": 1.86, + "grad_norm": 0.6498271822929382, + "learning_rate": 0.00018909468890965068, + "loss": 3.1004, + "step": 37986 + }, + { + "epoch": 1.86, + "grad_norm": 0.6263914108276367, + "learning_rate": 0.00018908038357151849, + "loss": 2.935, + "step": 37987 + }, + { + "epoch": 1.86, + "grad_norm": 0.6811741590499878, + "learning_rate": 0.00018906607852552728, + "loss": 2.8293, + "step": 37988 + }, + { + "epoch": 1.86, + "grad_norm": 0.6744228601455688, + "learning_rate": 0.00018905177377171457, + "loss": 3.1593, + "step": 37989 + }, + { + "epoch": 1.86, + "grad_norm": 0.6605095267295837, + "learning_rate": 0.00018903746931011816, + "loss": 3.1381, + "step": 37990 + }, + { + "epoch": 1.86, + "grad_norm": 0.625415563583374, + "learning_rate": 0.00018902316514077578, + "loss": 3.067, + "step": 37991 + }, + { + "epoch": 1.86, + "grad_norm": 0.6394377946853638, + "learning_rate": 0.00018900886126372492, + "loss": 3.0368, + "step": 37992 + }, + { + "epoch": 1.86, + "grad_norm": 0.6430702209472656, + "learning_rate": 0.00018899455767900345, + "loss": 3.0768, + "step": 37993 + }, + { + "epoch": 1.86, + "grad_norm": 0.7407238483428955, + "learning_rate": 0.00018898025438664884, + "loss": 2.9264, + "step": 37994 + }, + { + "epoch": 1.86, + "grad_norm": 0.6624389290809631, + "learning_rate": 0.00018896595138669895, + "loss": 2.9439, + "step": 37995 + }, + { + "epoch": 1.86, + "grad_norm": 0.649860143661499, + "learning_rate": 0.00018895164867919142, + "loss": 3.0471, + "step": 37996 + }, + { + "epoch": 1.86, + "grad_norm": 0.652580201625824, + "learning_rate": 0.00018893734626416374, + "loss": 3.1846, + "step": 37997 + }, + { + "epoch": 1.86, + "grad_norm": 0.6489927172660828, + "learning_rate": 0.00018892304414165387, + "loss": 3.0071, + "step": 37998 + }, + { + "epoch": 1.86, + "grad_norm": 0.6793820858001709, + "learning_rate": 0.00018890874231169925, + "loss": 2.9431, + "step": 37999 + }, + { + "epoch": 1.86, + "grad_norm": 0.6238475441932678, + "learning_rate": 0.00018889444077433757, + "loss": 3.0595, + "step": 38000 + }, + { + "epoch": 1.86, + "grad_norm": 0.6435956954956055, + "learning_rate": 0.00018888013952960667, + "loss": 2.8716, + "step": 38001 + }, + { + "epoch": 1.86, + "grad_norm": 0.6251559257507324, + "learning_rate": 0.00018886583857754405, + "loss": 3.0129, + "step": 38002 + }, + { + "epoch": 1.86, + "grad_norm": 0.6116903424263, + "learning_rate": 0.00018885153791818742, + "loss": 2.9931, + "step": 38003 + }, + { + "epoch": 1.86, + "grad_norm": 0.6142878532409668, + "learning_rate": 0.0001888372375515744, + "loss": 2.9865, + "step": 38004 + }, + { + "epoch": 1.86, + "grad_norm": 0.6752248406410217, + "learning_rate": 0.00018882293747774276, + "loss": 2.8574, + "step": 38005 + }, + { + "epoch": 1.86, + "grad_norm": 0.6765214204788208, + "learning_rate": 0.0001888086376967301, + "loss": 3.0034, + "step": 38006 + }, + { + "epoch": 1.86, + "grad_norm": 0.7171432375907898, + "learning_rate": 0.00018879433820857402, + "loss": 2.8449, + "step": 38007 + }, + { + "epoch": 1.86, + "grad_norm": 0.6618970632553101, + "learning_rate": 0.00018878003901331235, + "loss": 3.0472, + "step": 38008 + }, + { + "epoch": 1.86, + "grad_norm": 0.6702781915664673, + "learning_rate": 0.00018876574011098262, + "loss": 3.0298, + "step": 38009 + }, + { + "epoch": 1.86, + "grad_norm": 0.6527828574180603, + "learning_rate": 0.00018875144150162245, + "loss": 2.8084, + "step": 38010 + }, + { + "epoch": 1.86, + "grad_norm": 0.6515904068946838, + "learning_rate": 0.00018873714318526968, + "loss": 3.1088, + "step": 38011 + }, + { + "epoch": 1.86, + "grad_norm": 0.6282628774642944, + "learning_rate": 0.00018872284516196182, + "loss": 3.0204, + "step": 38012 + }, + { + "epoch": 1.86, + "grad_norm": 0.6256953477859497, + "learning_rate": 0.0001887085474317366, + "loss": 3.097, + "step": 38013 + }, + { + "epoch": 1.86, + "grad_norm": 0.7299720048904419, + "learning_rate": 0.00018869424999463154, + "loss": 2.9651, + "step": 38014 + }, + { + "epoch": 1.86, + "grad_norm": 0.6656356453895569, + "learning_rate": 0.00018867995285068442, + "loss": 3.061, + "step": 38015 + }, + { + "epoch": 1.86, + "grad_norm": 0.7072047591209412, + "learning_rate": 0.000188665655999933, + "loss": 2.9438, + "step": 38016 + }, + { + "epoch": 1.86, + "grad_norm": 0.6499008536338806, + "learning_rate": 0.00018865135944241476, + "loss": 3.1369, + "step": 38017 + }, + { + "epoch": 1.86, + "grad_norm": 0.6523368954658508, + "learning_rate": 0.00018863706317816747, + "loss": 3.0557, + "step": 38018 + }, + { + "epoch": 1.86, + "grad_norm": 0.6852509379386902, + "learning_rate": 0.0001886227672072286, + "loss": 2.9922, + "step": 38019 + }, + { + "epoch": 1.86, + "grad_norm": 0.695910632610321, + "learning_rate": 0.00018860847152963602, + "loss": 2.8368, + "step": 38020 + }, + { + "epoch": 1.86, + "grad_norm": 0.687065839767456, + "learning_rate": 0.00018859417614542725, + "loss": 2.9775, + "step": 38021 + }, + { + "epoch": 1.86, + "grad_norm": 0.6046010851860046, + "learning_rate": 0.00018857988105463991, + "loss": 2.9377, + "step": 38022 + }, + { + "epoch": 1.86, + "grad_norm": 0.6309628486633301, + "learning_rate": 0.00018856558625731187, + "loss": 3.0841, + "step": 38023 + }, + { + "epoch": 1.86, + "grad_norm": 0.6558736562728882, + "learning_rate": 0.00018855129175348052, + "loss": 2.9364, + "step": 38024 + }, + { + "epoch": 1.86, + "grad_norm": 0.6964730620384216, + "learning_rate": 0.0001885369975431837, + "loss": 3.0563, + "step": 38025 + }, + { + "epoch": 1.86, + "grad_norm": 0.722326934337616, + "learning_rate": 0.00018852270362645885, + "loss": 3.048, + "step": 38026 + }, + { + "epoch": 1.86, + "grad_norm": 0.626826286315918, + "learning_rate": 0.00018850841000334382, + "loss": 2.9608, + "step": 38027 + }, + { + "epoch": 1.86, + "grad_norm": 0.656711995601654, + "learning_rate": 0.00018849411667387625, + "loss": 3.0312, + "step": 38028 + }, + { + "epoch": 1.86, + "grad_norm": 0.6362355351448059, + "learning_rate": 0.0001884798236380936, + "loss": 2.9385, + "step": 38029 + }, + { + "epoch": 1.86, + "grad_norm": 0.7519611716270447, + "learning_rate": 0.00018846553089603372, + "loss": 2.8248, + "step": 38030 + }, + { + "epoch": 1.86, + "grad_norm": 0.6189745664596558, + "learning_rate": 0.00018845123844773412, + "loss": 2.846, + "step": 38031 + }, + { + "epoch": 1.86, + "grad_norm": 0.6334044337272644, + "learning_rate": 0.00018843694629323244, + "loss": 2.9264, + "step": 38032 + }, + { + "epoch": 1.86, + "grad_norm": 0.6451401114463806, + "learning_rate": 0.0001884226544325665, + "loss": 2.9789, + "step": 38033 + }, + { + "epoch": 1.86, + "grad_norm": 0.6401827931404114, + "learning_rate": 0.00018840836286577366, + "loss": 3.155, + "step": 38034 + }, + { + "epoch": 1.86, + "grad_norm": 0.6569207906723022, + "learning_rate": 0.00018839407159289192, + "loss": 3.1407, + "step": 38035 + }, + { + "epoch": 1.86, + "grad_norm": 0.6262386441230774, + "learning_rate": 0.00018837978061395856, + "loss": 2.9666, + "step": 38036 + }, + { + "epoch": 1.86, + "grad_norm": 0.6198394894599915, + "learning_rate": 0.0001883654899290113, + "loss": 3.0311, + "step": 38037 + }, + { + "epoch": 1.86, + "grad_norm": 0.6523306369781494, + "learning_rate": 0.00018835119953808805, + "loss": 2.9531, + "step": 38038 + }, + { + "epoch": 1.86, + "grad_norm": 0.6170300245285034, + "learning_rate": 0.00018833690944122615, + "loss": 3.0921, + "step": 38039 + }, + { + "epoch": 1.86, + "grad_norm": 0.6404187679290771, + "learning_rate": 0.00018832261963846344, + "loss": 3.2716, + "step": 38040 + }, + { + "epoch": 1.86, + "grad_norm": 0.6466928124427795, + "learning_rate": 0.00018830833012983727, + "loss": 2.9561, + "step": 38041 + }, + { + "epoch": 1.86, + "grad_norm": 0.6404629349708557, + "learning_rate": 0.0001882940409153856, + "loss": 3.045, + "step": 38042 + }, + { + "epoch": 1.86, + "grad_norm": 0.6768361926078796, + "learning_rate": 0.00018827975199514597, + "loss": 3.0527, + "step": 38043 + }, + { + "epoch": 1.86, + "grad_norm": 0.6463939547538757, + "learning_rate": 0.0001882654633691558, + "loss": 3.1942, + "step": 38044 + }, + { + "epoch": 1.86, + "grad_norm": 0.6092776656150818, + "learning_rate": 0.00018825117503745308, + "loss": 3.0076, + "step": 38045 + }, + { + "epoch": 1.86, + "grad_norm": 0.6425269246101379, + "learning_rate": 0.00018823688700007516, + "loss": 2.8472, + "step": 38046 + }, + { + "epoch": 1.86, + "grad_norm": 0.6583645939826965, + "learning_rate": 0.00018822259925705973, + "loss": 2.9726, + "step": 38047 + }, + { + "epoch": 1.86, + "grad_norm": 0.6546190977096558, + "learning_rate": 0.00018820831180844455, + "loss": 3.0767, + "step": 38048 + }, + { + "epoch": 1.86, + "grad_norm": 0.6298822164535522, + "learning_rate": 0.00018819402465426716, + "loss": 2.9643, + "step": 38049 + }, + { + "epoch": 1.86, + "grad_norm": 0.6522963047027588, + "learning_rate": 0.0001881797377945652, + "loss": 3.0874, + "step": 38050 + }, + { + "epoch": 1.86, + "grad_norm": 0.6804884672164917, + "learning_rate": 0.0001881654512293762, + "loss": 3.01, + "step": 38051 + }, + { + "epoch": 1.86, + "grad_norm": 0.6772292852401733, + "learning_rate": 0.00018815116495873792, + "loss": 3.0163, + "step": 38052 + }, + { + "epoch": 1.86, + "grad_norm": 0.6438421607017517, + "learning_rate": 0.00018813687898268798, + "loss": 3.2455, + "step": 38053 + }, + { + "epoch": 1.86, + "grad_norm": 0.6451230645179749, + "learning_rate": 0.00018812259330126392, + "loss": 2.9777, + "step": 38054 + }, + { + "epoch": 1.86, + "grad_norm": 0.6543946862220764, + "learning_rate": 0.0001881083079145035, + "loss": 2.9877, + "step": 38055 + }, + { + "epoch": 1.87, + "grad_norm": 0.6809148192405701, + "learning_rate": 0.00018809402282244414, + "loss": 3.0758, + "step": 38056 + }, + { + "epoch": 1.87, + "grad_norm": 0.644823431968689, + "learning_rate": 0.00018807973802512367, + "loss": 2.9312, + "step": 38057 + }, + { + "epoch": 1.87, + "grad_norm": 0.6681671142578125, + "learning_rate": 0.00018806545352257963, + "loss": 3.0616, + "step": 38058 + }, + { + "epoch": 1.87, + "grad_norm": 0.6262026429176331, + "learning_rate": 0.0001880511693148496, + "loss": 2.9225, + "step": 38059 + }, + { + "epoch": 1.87, + "grad_norm": 0.6308271884918213, + "learning_rate": 0.00018803688540197132, + "loss": 3.0833, + "step": 38060 + }, + { + "epoch": 1.87, + "grad_norm": 0.6894946098327637, + "learning_rate": 0.00018802260178398228, + "loss": 2.9456, + "step": 38061 + }, + { + "epoch": 1.87, + "grad_norm": 0.6232718825340271, + "learning_rate": 0.0001880083184609201, + "loss": 2.8134, + "step": 38062 + }, + { + "epoch": 1.87, + "grad_norm": 0.6857800483703613, + "learning_rate": 0.00018799403543282259, + "loss": 2.9333, + "step": 38063 + }, + { + "epoch": 1.87, + "grad_norm": 0.6674224734306335, + "learning_rate": 0.00018797975269972718, + "loss": 3.0611, + "step": 38064 + }, + { + "epoch": 1.87, + "grad_norm": 0.6705251336097717, + "learning_rate": 0.00018796547026167157, + "loss": 3.1249, + "step": 38065 + }, + { + "epoch": 1.87, + "grad_norm": 0.6390047073364258, + "learning_rate": 0.00018795118811869323, + "loss": 3.0813, + "step": 38066 + }, + { + "epoch": 1.87, + "grad_norm": 0.6156495809555054, + "learning_rate": 0.00018793690627082999, + "loss": 2.861, + "step": 38067 + }, + { + "epoch": 1.87, + "grad_norm": 0.6561599373817444, + "learning_rate": 0.00018792262471811942, + "loss": 2.9476, + "step": 38068 + }, + { + "epoch": 1.87, + "grad_norm": 0.6761384606361389, + "learning_rate": 0.00018790834346059894, + "loss": 3.1729, + "step": 38069 + }, + { + "epoch": 1.87, + "grad_norm": 0.6904240846633911, + "learning_rate": 0.00018789406249830647, + "loss": 2.988, + "step": 38070 + }, + { + "epoch": 1.87, + "grad_norm": 0.6512880325317383, + "learning_rate": 0.00018787978183127939, + "loss": 3.1239, + "step": 38071 + }, + { + "epoch": 1.87, + "grad_norm": 0.6401209831237793, + "learning_rate": 0.00018786550145955534, + "loss": 3.0033, + "step": 38072 + }, + { + "epoch": 1.87, + "grad_norm": 0.6807664632797241, + "learning_rate": 0.00018785122138317208, + "loss": 2.7812, + "step": 38073 + }, + { + "epoch": 1.87, + "grad_norm": 0.6754370927810669, + "learning_rate": 0.00018783694160216701, + "loss": 2.8129, + "step": 38074 + }, + { + "epoch": 1.87, + "grad_norm": 0.6359378695487976, + "learning_rate": 0.000187822662116578, + "loss": 3.2968, + "step": 38075 + }, + { + "epoch": 1.87, + "grad_norm": 0.6342616081237793, + "learning_rate": 0.00018780838292644235, + "loss": 2.9636, + "step": 38076 + }, + { + "epoch": 1.87, + "grad_norm": 0.6342082023620605, + "learning_rate": 0.00018779410403179782, + "loss": 3.1626, + "step": 38077 + }, + { + "epoch": 1.87, + "grad_norm": 0.6407181024551392, + "learning_rate": 0.00018777982543268215, + "loss": 3.0483, + "step": 38078 + }, + { + "epoch": 1.87, + "grad_norm": 0.6541846394538879, + "learning_rate": 0.0001877655471291328, + "loss": 3.0294, + "step": 38079 + }, + { + "epoch": 1.87, + "grad_norm": 0.6525475978851318, + "learning_rate": 0.0001877512691211874, + "loss": 2.8933, + "step": 38080 + }, + { + "epoch": 1.87, + "grad_norm": 0.6654950976371765, + "learning_rate": 0.00018773699140888346, + "loss": 3.043, + "step": 38081 + }, + { + "epoch": 1.87, + "grad_norm": 0.684727668762207, + "learning_rate": 0.00018772271399225877, + "loss": 3.1511, + "step": 38082 + }, + { + "epoch": 1.87, + "grad_norm": 0.6460987329483032, + "learning_rate": 0.00018770843687135083, + "loss": 3.2699, + "step": 38083 + }, + { + "epoch": 1.87, + "grad_norm": 0.6055113673210144, + "learning_rate": 0.00018769416004619718, + "loss": 2.9939, + "step": 38084 + }, + { + "epoch": 1.87, + "grad_norm": 0.6769329309463501, + "learning_rate": 0.00018767988351683562, + "loss": 2.9545, + "step": 38085 + }, + { + "epoch": 1.87, + "grad_norm": 0.6439892053604126, + "learning_rate": 0.00018766560728330359, + "loss": 3.0184, + "step": 38086 + }, + { + "epoch": 1.87, + "grad_norm": 0.6827326416969299, + "learning_rate": 0.00018765133134563864, + "loss": 3.0605, + "step": 38087 + }, + { + "epoch": 1.87, + "grad_norm": 0.6329249739646912, + "learning_rate": 0.0001876370557038786, + "loss": 3.1776, + "step": 38088 + }, + { + "epoch": 1.87, + "grad_norm": 0.6486325263977051, + "learning_rate": 0.00018762278035806083, + "loss": 3.0524, + "step": 38089 + }, + { + "epoch": 1.87, + "grad_norm": 0.6577807664871216, + "learning_rate": 0.00018760850530822313, + "loss": 2.9419, + "step": 38090 + }, + { + "epoch": 1.87, + "grad_norm": 0.6746330857276917, + "learning_rate": 0.00018759423055440285, + "loss": 2.9862, + "step": 38091 + }, + { + "epoch": 1.87, + "grad_norm": 0.6533883213996887, + "learning_rate": 0.00018757995609663782, + "loss": 3.0289, + "step": 38092 + }, + { + "epoch": 1.87, + "grad_norm": 0.6736960411071777, + "learning_rate": 0.00018756568193496558, + "loss": 3.0924, + "step": 38093 + }, + { + "epoch": 1.87, + "grad_norm": 0.6582047343254089, + "learning_rate": 0.0001875514080694236, + "loss": 2.9104, + "step": 38094 + }, + { + "epoch": 1.87, + "grad_norm": 0.6109486818313599, + "learning_rate": 0.00018753713450004968, + "loss": 2.941, + "step": 38095 + }, + { + "epoch": 1.87, + "grad_norm": 0.6732144355773926, + "learning_rate": 0.0001875228612268812, + "loss": 3.1589, + "step": 38096 + }, + { + "epoch": 1.87, + "grad_norm": 0.7052947282791138, + "learning_rate": 0.00018750858824995598, + "loss": 2.7537, + "step": 38097 + }, + { + "epoch": 1.87, + "grad_norm": 0.6800371408462524, + "learning_rate": 0.00018749431556931142, + "loss": 3.0416, + "step": 38098 + }, + { + "epoch": 1.87, + "grad_norm": 0.675338864326477, + "learning_rate": 0.00018748004318498512, + "loss": 2.8279, + "step": 38099 + }, + { + "epoch": 1.87, + "grad_norm": 0.6601358652114868, + "learning_rate": 0.00018746577109701482, + "loss": 3.0966, + "step": 38100 + }, + { + "epoch": 1.87, + "grad_norm": 0.6854960322380066, + "learning_rate": 0.000187451499305438, + "loss": 3.2466, + "step": 38101 + }, + { + "epoch": 1.87, + "grad_norm": 0.6765010356903076, + "learning_rate": 0.00018743722781029235, + "loss": 3.161, + "step": 38102 + }, + { + "epoch": 1.87, + "grad_norm": 0.6960597634315491, + "learning_rate": 0.0001874229566116152, + "loss": 3.2019, + "step": 38103 + }, + { + "epoch": 1.87, + "grad_norm": 0.6276587247848511, + "learning_rate": 0.0001874086857094444, + "loss": 3.1574, + "step": 38104 + }, + { + "epoch": 1.87, + "grad_norm": 0.6926313042640686, + "learning_rate": 0.00018739441510381752, + "loss": 2.9851, + "step": 38105 + }, + { + "epoch": 1.87, + "grad_norm": 0.6597981452941895, + "learning_rate": 0.00018738014479477197, + "loss": 2.8751, + "step": 38106 + }, + { + "epoch": 1.87, + "grad_norm": 0.6823752522468567, + "learning_rate": 0.00018736587478234554, + "loss": 3.0197, + "step": 38107 + }, + { + "epoch": 1.87, + "grad_norm": 0.6578922271728516, + "learning_rate": 0.00018735160506657566, + "loss": 2.9142, + "step": 38108 + }, + { + "epoch": 1.87, + "grad_norm": 0.7423891425132751, + "learning_rate": 0.0001873373356474999, + "loss": 2.882, + "step": 38109 + }, + { + "epoch": 1.87, + "grad_norm": 0.6776301264762878, + "learning_rate": 0.00018732306652515608, + "loss": 2.981, + "step": 38110 + }, + { + "epoch": 1.87, + "grad_norm": 0.6616842150688171, + "learning_rate": 0.0001873087976995815, + "loss": 2.7471, + "step": 38111 + }, + { + "epoch": 1.87, + "grad_norm": 0.706602156162262, + "learning_rate": 0.000187294529170814, + "loss": 3.0262, + "step": 38112 + }, + { + "epoch": 1.87, + "grad_norm": 0.6833293437957764, + "learning_rate": 0.00018728026093889084, + "loss": 3.0769, + "step": 38113 + }, + { + "epoch": 1.87, + "grad_norm": 0.6360612511634827, + "learning_rate": 0.00018726599300384985, + "loss": 2.8711, + "step": 38114 + }, + { + "epoch": 1.87, + "grad_norm": 0.6546128392219543, + "learning_rate": 0.0001872517253657286, + "loss": 3.115, + "step": 38115 + }, + { + "epoch": 1.87, + "grad_norm": 0.614295482635498, + "learning_rate": 0.00018723745802456447, + "loss": 2.9583, + "step": 38116 + }, + { + "epoch": 1.87, + "grad_norm": 0.6379879713058472, + "learning_rate": 0.00018722319098039532, + "loss": 3.0077, + "step": 38117 + }, + { + "epoch": 1.87, + "grad_norm": 0.6464399099349976, + "learning_rate": 0.00018720892423325846, + "loss": 2.9822, + "step": 38118 + }, + { + "epoch": 1.87, + "grad_norm": 0.6336324214935303, + "learning_rate": 0.00018719465778319161, + "loss": 3.0851, + "step": 38119 + }, + { + "epoch": 1.87, + "grad_norm": 0.6779120564460754, + "learning_rate": 0.00018718039163023246, + "loss": 2.8303, + "step": 38120 + }, + { + "epoch": 1.87, + "grad_norm": 0.6924359202384949, + "learning_rate": 0.00018716612577441828, + "loss": 3.1107, + "step": 38121 + }, + { + "epoch": 1.87, + "grad_norm": 0.643219530582428, + "learning_rate": 0.0001871518602157869, + "loss": 3.1419, + "step": 38122 + }, + { + "epoch": 1.87, + "grad_norm": 0.6407486200332642, + "learning_rate": 0.00018713759495437576, + "loss": 2.9626, + "step": 38123 + }, + { + "epoch": 1.87, + "grad_norm": 0.6377759575843811, + "learning_rate": 0.00018712332999022246, + "loss": 3.0036, + "step": 38124 + }, + { + "epoch": 1.87, + "grad_norm": 0.6537527441978455, + "learning_rate": 0.00018710906532336466, + "loss": 3.0181, + "step": 38125 + }, + { + "epoch": 1.87, + "grad_norm": 0.6885370016098022, + "learning_rate": 0.00018709480095383977, + "loss": 2.8599, + "step": 38126 + }, + { + "epoch": 1.87, + "grad_norm": 0.6301379203796387, + "learning_rate": 0.00018708053688168554, + "loss": 2.8084, + "step": 38127 + }, + { + "epoch": 1.87, + "grad_norm": 0.6435237526893616, + "learning_rate": 0.00018706627310693933, + "loss": 3.1439, + "step": 38128 + }, + { + "epoch": 1.87, + "grad_norm": 0.6685776114463806, + "learning_rate": 0.00018705200962963885, + "loss": 2.9715, + "step": 38129 + }, + { + "epoch": 1.87, + "grad_norm": 0.6370143294334412, + "learning_rate": 0.00018703774644982177, + "loss": 3.0221, + "step": 38130 + }, + { + "epoch": 1.87, + "grad_norm": 0.6487584710121155, + "learning_rate": 0.00018702348356752535, + "loss": 2.9997, + "step": 38131 + }, + { + "epoch": 1.87, + "grad_norm": 0.6534705758094788, + "learning_rate": 0.00018700922098278746, + "loss": 2.8816, + "step": 38132 + }, + { + "epoch": 1.87, + "grad_norm": 0.6294593214988708, + "learning_rate": 0.00018699495869564547, + "loss": 2.9037, + "step": 38133 + }, + { + "epoch": 1.87, + "grad_norm": 0.8882706761360168, + "learning_rate": 0.00018698069670613696, + "loss": 2.9112, + "step": 38134 + }, + { + "epoch": 1.87, + "grad_norm": 0.6950933933258057, + "learning_rate": 0.00018696643501429967, + "loss": 2.9052, + "step": 38135 + }, + { + "epoch": 1.87, + "grad_norm": 0.6648854613304138, + "learning_rate": 0.0001869521736201709, + "loss": 2.9956, + "step": 38136 + }, + { + "epoch": 1.87, + "grad_norm": 0.712222695350647, + "learning_rate": 0.00018693791252378855, + "loss": 3.2136, + "step": 38137 + }, + { + "epoch": 1.87, + "grad_norm": 0.6462759375572205, + "learning_rate": 0.0001869236517251898, + "loss": 3.0022, + "step": 38138 + }, + { + "epoch": 1.87, + "grad_norm": 0.6655752658843994, + "learning_rate": 0.00018690939122441237, + "loss": 2.9743, + "step": 38139 + }, + { + "epoch": 1.87, + "grad_norm": 0.6417601108551025, + "learning_rate": 0.00018689513102149394, + "loss": 3.051, + "step": 38140 + }, + { + "epoch": 1.87, + "grad_norm": 0.6326993107795715, + "learning_rate": 0.00018688087111647195, + "loss": 3.0341, + "step": 38141 + }, + { + "epoch": 1.87, + "grad_norm": 0.651273787021637, + "learning_rate": 0.00018686661150938403, + "loss": 3.1457, + "step": 38142 + }, + { + "epoch": 1.87, + "grad_norm": 0.668491005897522, + "learning_rate": 0.00018685235220026756, + "loss": 2.8735, + "step": 38143 + }, + { + "epoch": 1.87, + "grad_norm": 0.6334226727485657, + "learning_rate": 0.00018683809318916023, + "loss": 2.9159, + "step": 38144 + }, + { + "epoch": 1.87, + "grad_norm": 0.6658082008361816, + "learning_rate": 0.0001868238344760997, + "loss": 2.978, + "step": 38145 + }, + { + "epoch": 1.87, + "grad_norm": 0.6330041289329529, + "learning_rate": 0.00018680957606112325, + "loss": 2.9432, + "step": 38146 + }, + { + "epoch": 1.87, + "grad_norm": 0.6996323466300964, + "learning_rate": 0.00018679531794426875, + "loss": 3.1075, + "step": 38147 + }, + { + "epoch": 1.87, + "grad_norm": 0.6674297451972961, + "learning_rate": 0.00018678106012557353, + "loss": 3.0515, + "step": 38148 + }, + { + "epoch": 1.87, + "grad_norm": 0.632754921913147, + "learning_rate": 0.00018676680260507511, + "loss": 2.9696, + "step": 38149 + }, + { + "epoch": 1.87, + "grad_norm": 0.6195089221000671, + "learning_rate": 0.0001867525453828113, + "loss": 3.0214, + "step": 38150 + }, + { + "epoch": 1.87, + "grad_norm": 0.6678367853164673, + "learning_rate": 0.00018673828845881943, + "loss": 2.8939, + "step": 38151 + }, + { + "epoch": 1.87, + "grad_norm": 0.6400881409645081, + "learning_rate": 0.00018672403183313716, + "loss": 3.1865, + "step": 38152 + }, + { + "epoch": 1.87, + "grad_norm": 0.6747406125068665, + "learning_rate": 0.00018670977550580185, + "loss": 3.1998, + "step": 38153 + }, + { + "epoch": 1.87, + "grad_norm": 0.6130285263061523, + "learning_rate": 0.00018669551947685128, + "loss": 3.1601, + "step": 38154 + }, + { + "epoch": 1.87, + "grad_norm": 0.6921128034591675, + "learning_rate": 0.00018668126374632294, + "loss": 2.9525, + "step": 38155 + }, + { + "epoch": 1.87, + "grad_norm": 0.6503788232803345, + "learning_rate": 0.00018666700831425424, + "loss": 3.0248, + "step": 38156 + }, + { + "epoch": 1.87, + "grad_norm": 0.624090850353241, + "learning_rate": 0.00018665275318068296, + "loss": 3.0273, + "step": 38157 + }, + { + "epoch": 1.87, + "grad_norm": 0.6640616059303284, + "learning_rate": 0.00018663849834564634, + "loss": 3.284, + "step": 38158 + }, + { + "epoch": 1.87, + "grad_norm": 0.7166267037391663, + "learning_rate": 0.00018662424380918221, + "loss": 2.9603, + "step": 38159 + }, + { + "epoch": 1.87, + "grad_norm": 0.6211531758308411, + "learning_rate": 0.00018660998957132805, + "loss": 3.0816, + "step": 38160 + }, + { + "epoch": 1.87, + "grad_norm": 0.6357093453407288, + "learning_rate": 0.00018659573563212123, + "loss": 3.1217, + "step": 38161 + }, + { + "epoch": 1.87, + "grad_norm": 0.6868173480033875, + "learning_rate": 0.00018658148199159953, + "loss": 2.9255, + "step": 38162 + }, + { + "epoch": 1.87, + "grad_norm": 0.6988732218742371, + "learning_rate": 0.00018656722864980035, + "loss": 2.914, + "step": 38163 + }, + { + "epoch": 1.87, + "grad_norm": 0.620403528213501, + "learning_rate": 0.0001865529756067612, + "loss": 3.0708, + "step": 38164 + }, + { + "epoch": 1.87, + "grad_norm": 0.7112835049629211, + "learning_rate": 0.00018653872286251974, + "loss": 3.1756, + "step": 38165 + }, + { + "epoch": 1.87, + "grad_norm": 0.6396851539611816, + "learning_rate": 0.00018652447041711344, + "loss": 2.86, + "step": 38166 + }, + { + "epoch": 1.87, + "grad_norm": 0.6468586325645447, + "learning_rate": 0.00018651021827057992, + "loss": 2.9201, + "step": 38167 + }, + { + "epoch": 1.87, + "grad_norm": 0.6509426832199097, + "learning_rate": 0.00018649596642295645, + "loss": 2.9801, + "step": 38168 + }, + { + "epoch": 1.87, + "grad_norm": 0.684259295463562, + "learning_rate": 0.00018648171487428087, + "loss": 3.1006, + "step": 38169 + }, + { + "epoch": 1.87, + "grad_norm": 0.6615915298461914, + "learning_rate": 0.00018646746362459065, + "loss": 3.0024, + "step": 38170 + }, + { + "epoch": 1.87, + "grad_norm": 0.6847988963127136, + "learning_rate": 0.0001864532126739232, + "loss": 2.9606, + "step": 38171 + }, + { + "epoch": 1.87, + "grad_norm": 0.6112357974052429, + "learning_rate": 0.00018643896202231623, + "loss": 3.182, + "step": 38172 + }, + { + "epoch": 1.87, + "grad_norm": 0.6447160243988037, + "learning_rate": 0.00018642471166980707, + "loss": 3.0568, + "step": 38173 + }, + { + "epoch": 1.87, + "grad_norm": 0.6257251501083374, + "learning_rate": 0.00018641046161643353, + "loss": 3.1314, + "step": 38174 + }, + { + "epoch": 1.87, + "grad_norm": 0.6292673349380493, + "learning_rate": 0.00018639621186223278, + "loss": 2.9662, + "step": 38175 + }, + { + "epoch": 1.87, + "grad_norm": 0.6028404235839844, + "learning_rate": 0.0001863819624072426, + "loss": 3.1069, + "step": 38176 + }, + { + "epoch": 1.87, + "grad_norm": 0.6941775679588318, + "learning_rate": 0.00018636771325150056, + "loss": 2.98, + "step": 38177 + }, + { + "epoch": 1.87, + "grad_norm": 0.6567956805229187, + "learning_rate": 0.00018635346439504395, + "loss": 2.9761, + "step": 38178 + }, + { + "epoch": 1.87, + "grad_norm": 0.6520336866378784, + "learning_rate": 0.00018633921583791055, + "loss": 3.1373, + "step": 38179 + }, + { + "epoch": 1.87, + "grad_norm": 0.6364136338233948, + "learning_rate": 0.00018632496758013768, + "loss": 3.017, + "step": 38180 + }, + { + "epoch": 1.87, + "grad_norm": 0.6324988603591919, + "learning_rate": 0.00018631071962176303, + "loss": 2.9674, + "step": 38181 + }, + { + "epoch": 1.87, + "grad_norm": 0.6260587573051453, + "learning_rate": 0.00018629647196282412, + "loss": 2.8975, + "step": 38182 + }, + { + "epoch": 1.87, + "grad_norm": 0.6402279138565063, + "learning_rate": 0.0001862822246033583, + "loss": 3.0804, + "step": 38183 + }, + { + "epoch": 1.87, + "grad_norm": 0.6180815100669861, + "learning_rate": 0.00018626797754340332, + "loss": 2.917, + "step": 38184 + }, + { + "epoch": 1.87, + "grad_norm": 0.6797022819519043, + "learning_rate": 0.00018625373078299658, + "loss": 2.9086, + "step": 38185 + }, + { + "epoch": 1.87, + "grad_norm": 0.6699610352516174, + "learning_rate": 0.00018623948432217554, + "loss": 3.0638, + "step": 38186 + }, + { + "epoch": 1.87, + "grad_norm": 0.6808142066001892, + "learning_rate": 0.0001862252381609779, + "loss": 2.9938, + "step": 38187 + }, + { + "epoch": 1.87, + "grad_norm": 0.6589908599853516, + "learning_rate": 0.00018621099229944107, + "loss": 3.1829, + "step": 38188 + }, + { + "epoch": 1.87, + "grad_norm": 0.6513395309448242, + "learning_rate": 0.00018619674673760262, + "loss": 2.9098, + "step": 38189 + }, + { + "epoch": 1.87, + "grad_norm": 0.6903216242790222, + "learning_rate": 0.00018618250147549995, + "loss": 2.9158, + "step": 38190 + }, + { + "epoch": 1.87, + "grad_norm": 0.71368807554245, + "learning_rate": 0.0001861682565131707, + "loss": 3.026, + "step": 38191 + }, + { + "epoch": 1.87, + "grad_norm": 0.659199595451355, + "learning_rate": 0.00018615401185065238, + "loss": 2.8882, + "step": 38192 + }, + { + "epoch": 1.87, + "grad_norm": 0.6380458474159241, + "learning_rate": 0.0001861397674879824, + "loss": 3.1365, + "step": 38193 + }, + { + "epoch": 1.87, + "grad_norm": 0.6615623235702515, + "learning_rate": 0.00018612552342519849, + "loss": 2.7819, + "step": 38194 + }, + { + "epoch": 1.87, + "grad_norm": 0.6508620977401733, + "learning_rate": 0.00018611127966233797, + "loss": 2.9649, + "step": 38195 + }, + { + "epoch": 1.87, + "grad_norm": 0.6376355290412903, + "learning_rate": 0.00018609703619943833, + "loss": 2.9156, + "step": 38196 + }, + { + "epoch": 1.87, + "grad_norm": 0.6251353621482849, + "learning_rate": 0.00018608279303653733, + "loss": 3.0193, + "step": 38197 + }, + { + "epoch": 1.87, + "grad_norm": 0.6305531859397888, + "learning_rate": 0.00018606855017367223, + "loss": 2.8546, + "step": 38198 + }, + { + "epoch": 1.87, + "grad_norm": 0.6258016228675842, + "learning_rate": 0.00018605430761088073, + "loss": 2.9225, + "step": 38199 + }, + { + "epoch": 1.87, + "grad_norm": 0.6111253499984741, + "learning_rate": 0.00018604006534820016, + "loss": 2.9206, + "step": 38200 + }, + { + "epoch": 1.87, + "grad_norm": 0.6775639057159424, + "learning_rate": 0.0001860258233856681, + "loss": 2.9885, + "step": 38201 + }, + { + "epoch": 1.87, + "grad_norm": 0.6494293212890625, + "learning_rate": 0.0001860115817233222, + "loss": 3.0052, + "step": 38202 + }, + { + "epoch": 1.87, + "grad_norm": 0.6443254947662354, + "learning_rate": 0.0001859973403611998, + "loss": 3.0249, + "step": 38203 + }, + { + "epoch": 1.87, + "grad_norm": 0.5837622880935669, + "learning_rate": 0.00018598309929933853, + "loss": 2.9753, + "step": 38204 + }, + { + "epoch": 1.87, + "grad_norm": 0.6752540469169617, + "learning_rate": 0.00018596885853777574, + "loss": 2.9707, + "step": 38205 + }, + { + "epoch": 1.87, + "grad_norm": 0.6625667810440063, + "learning_rate": 0.00018595461807654906, + "loss": 3.0735, + "step": 38206 + }, + { + "epoch": 1.87, + "grad_norm": 0.6508365273475647, + "learning_rate": 0.00018594037791569602, + "loss": 3.2523, + "step": 38207 + }, + { + "epoch": 1.87, + "grad_norm": 0.6375356316566467, + "learning_rate": 0.00018592613805525396, + "loss": 3.043, + "step": 38208 + }, + { + "epoch": 1.87, + "grad_norm": 0.6329519152641296, + "learning_rate": 0.00018591189849526062, + "loss": 2.8927, + "step": 38209 + }, + { + "epoch": 1.87, + "grad_norm": 0.6348059177398682, + "learning_rate": 0.00018589765923575333, + "loss": 3.1013, + "step": 38210 + }, + { + "epoch": 1.87, + "grad_norm": 0.6728472709655762, + "learning_rate": 0.00018588342027676957, + "loss": 3.0327, + "step": 38211 + }, + { + "epoch": 1.87, + "grad_norm": 0.6233043074607849, + "learning_rate": 0.00018586918161834706, + "loss": 2.726, + "step": 38212 + }, + { + "epoch": 1.87, + "grad_norm": 0.7845079302787781, + "learning_rate": 0.00018585494326052307, + "loss": 2.8162, + "step": 38213 + }, + { + "epoch": 1.87, + "grad_norm": 0.662473738193512, + "learning_rate": 0.00018584070520333528, + "loss": 2.9308, + "step": 38214 + }, + { + "epoch": 1.87, + "grad_norm": 0.6366502642631531, + "learning_rate": 0.000185826467446821, + "loss": 2.8987, + "step": 38215 + }, + { + "epoch": 1.87, + "grad_norm": 0.6556762456893921, + "learning_rate": 0.00018581222999101784, + "loss": 2.9589, + "step": 38216 + }, + { + "epoch": 1.87, + "grad_norm": 0.626169741153717, + "learning_rate": 0.0001857979928359634, + "loss": 3.0476, + "step": 38217 + }, + { + "epoch": 1.87, + "grad_norm": 0.6835764050483704, + "learning_rate": 0.0001857837559816949, + "loss": 3.0164, + "step": 38218 + }, + { + "epoch": 1.87, + "grad_norm": 0.6800299286842346, + "learning_rate": 0.00018576951942825018, + "loss": 2.9386, + "step": 38219 + }, + { + "epoch": 1.87, + "grad_norm": 0.6377986073493958, + "learning_rate": 0.0001857552831756664, + "loss": 2.98, + "step": 38220 + }, + { + "epoch": 1.87, + "grad_norm": 0.6307637095451355, + "learning_rate": 0.0001857410472239813, + "loss": 2.9888, + "step": 38221 + }, + { + "epoch": 1.87, + "grad_norm": 0.6545106768608093, + "learning_rate": 0.00018572681157323233, + "loss": 3.1622, + "step": 38222 + }, + { + "epoch": 1.87, + "grad_norm": 0.6275971531867981, + "learning_rate": 0.00018571257622345685, + "loss": 3.1065, + "step": 38223 + }, + { + "epoch": 1.87, + "grad_norm": 0.6176348328590393, + "learning_rate": 0.00018569834117469254, + "loss": 2.9758, + "step": 38224 + }, + { + "epoch": 1.87, + "grad_norm": 0.6735263466835022, + "learning_rate": 0.00018568410642697678, + "loss": 2.9344, + "step": 38225 + }, + { + "epoch": 1.87, + "grad_norm": 0.6405964493751526, + "learning_rate": 0.00018566987198034696, + "loss": 3.0465, + "step": 38226 + }, + { + "epoch": 1.87, + "grad_norm": 0.6843787431716919, + "learning_rate": 0.0001856556378348409, + "loss": 2.8972, + "step": 38227 + }, + { + "epoch": 1.87, + "grad_norm": 0.687495231628418, + "learning_rate": 0.00018564140399049574, + "loss": 3.133, + "step": 38228 + }, + { + "epoch": 1.87, + "grad_norm": 0.6218973994255066, + "learning_rate": 0.00018562717044734924, + "loss": 3.1439, + "step": 38229 + }, + { + "epoch": 1.87, + "grad_norm": 0.6834261417388916, + "learning_rate": 0.0001856129372054386, + "loss": 2.9368, + "step": 38230 + }, + { + "epoch": 1.87, + "grad_norm": 0.6899203658103943, + "learning_rate": 0.0001855987042648015, + "loss": 3.1648, + "step": 38231 + }, + { + "epoch": 1.87, + "grad_norm": 0.6193641424179077, + "learning_rate": 0.00018558447162547557, + "loss": 3.1351, + "step": 38232 + }, + { + "epoch": 1.87, + "grad_norm": 0.6698924899101257, + "learning_rate": 0.0001855702392874979, + "loss": 3.0177, + "step": 38233 + }, + { + "epoch": 1.87, + "grad_norm": 0.6577365398406982, + "learning_rate": 0.0001855560072509064, + "loss": 2.8135, + "step": 38234 + }, + { + "epoch": 1.87, + "grad_norm": 0.705444872379303, + "learning_rate": 0.0001855417755157382, + "loss": 2.8679, + "step": 38235 + }, + { + "epoch": 1.87, + "grad_norm": 0.6559796929359436, + "learning_rate": 0.00018552754408203092, + "loss": 3.0659, + "step": 38236 + }, + { + "epoch": 1.87, + "grad_norm": 0.6978949904441833, + "learning_rate": 0.00018551331294982214, + "loss": 2.8357, + "step": 38237 + }, + { + "epoch": 1.87, + "grad_norm": 0.6330851316452026, + "learning_rate": 0.00018549908211914928, + "loss": 2.945, + "step": 38238 + }, + { + "epoch": 1.87, + "grad_norm": 0.6387531161308289, + "learning_rate": 0.00018548485159004978, + "loss": 3.0267, + "step": 38239 + }, + { + "epoch": 1.87, + "grad_norm": 0.6540210843086243, + "learning_rate": 0.00018547062136256107, + "loss": 3.0548, + "step": 38240 + }, + { + "epoch": 1.87, + "grad_norm": 0.6874194741249084, + "learning_rate": 0.00018545639143672068, + "loss": 2.8508, + "step": 38241 + }, + { + "epoch": 1.87, + "grad_norm": 0.6516175270080566, + "learning_rate": 0.00018544216181256624, + "loss": 3.0576, + "step": 38242 + }, + { + "epoch": 1.87, + "grad_norm": 0.6130465865135193, + "learning_rate": 0.00018542793249013505, + "loss": 2.9599, + "step": 38243 + }, + { + "epoch": 1.87, + "grad_norm": 0.6488755941390991, + "learning_rate": 0.00018541370346946467, + "loss": 3.0966, + "step": 38244 + }, + { + "epoch": 1.87, + "grad_norm": 0.6888542771339417, + "learning_rate": 0.00018539947475059245, + "loss": 2.9866, + "step": 38245 + }, + { + "epoch": 1.87, + "grad_norm": 0.7215191721916199, + "learning_rate": 0.000185385246333556, + "loss": 3.2059, + "step": 38246 + }, + { + "epoch": 1.87, + "grad_norm": 0.6577127575874329, + "learning_rate": 0.00018537101821839282, + "loss": 2.8346, + "step": 38247 + }, + { + "epoch": 1.87, + "grad_norm": 0.6458978056907654, + "learning_rate": 0.0001853567904051402, + "loss": 3.181, + "step": 38248 + }, + { + "epoch": 1.87, + "grad_norm": 0.6692994236946106, + "learning_rate": 0.00018534256289383583, + "loss": 2.9622, + "step": 38249 + }, + { + "epoch": 1.87, + "grad_norm": 0.6456918716430664, + "learning_rate": 0.00018532833568451706, + "loss": 3.1947, + "step": 38250 + }, + { + "epoch": 1.87, + "grad_norm": 0.6313451528549194, + "learning_rate": 0.00018531410877722142, + "loss": 3.005, + "step": 38251 + }, + { + "epoch": 1.87, + "grad_norm": 1.0270297527313232, + "learning_rate": 0.00018529988217198622, + "loss": 2.9805, + "step": 38252 + }, + { + "epoch": 1.87, + "grad_norm": 0.6870240569114685, + "learning_rate": 0.00018528565586884915, + "loss": 3.0719, + "step": 38253 + }, + { + "epoch": 1.87, + "grad_norm": 0.631271481513977, + "learning_rate": 0.00018527142986784763, + "loss": 2.9909, + "step": 38254 + }, + { + "epoch": 1.87, + "grad_norm": 0.6817666292190552, + "learning_rate": 0.00018525720416901894, + "loss": 3.0439, + "step": 38255 + }, + { + "epoch": 1.87, + "grad_norm": 0.6389595866203308, + "learning_rate": 0.00018524297877240084, + "loss": 3.2042, + "step": 38256 + }, + { + "epoch": 1.87, + "grad_norm": 0.6673776507377625, + "learning_rate": 0.00018522875367803053, + "loss": 2.914, + "step": 38257 + }, + { + "epoch": 1.87, + "grad_norm": 0.6235078573226929, + "learning_rate": 0.0001852145288859456, + "loss": 2.8799, + "step": 38258 + }, + { + "epoch": 1.87, + "grad_norm": 0.6437073945999146, + "learning_rate": 0.00018520030439618364, + "loss": 3.0012, + "step": 38259 + }, + { + "epoch": 1.88, + "grad_norm": 0.634657621383667, + "learning_rate": 0.00018518608020878184, + "loss": 2.9875, + "step": 38260 + }, + { + "epoch": 1.88, + "grad_norm": 0.6310805678367615, + "learning_rate": 0.00018517185632377787, + "loss": 3.0513, + "step": 38261 + }, + { + "epoch": 1.88, + "grad_norm": 0.6541125178337097, + "learning_rate": 0.00018515763274120912, + "loss": 3.0226, + "step": 38262 + }, + { + "epoch": 1.88, + "grad_norm": 0.6451280117034912, + "learning_rate": 0.00018514340946111302, + "loss": 2.9589, + "step": 38263 + }, + { + "epoch": 1.88, + "grad_norm": 0.6415655016899109, + "learning_rate": 0.00018512918648352718, + "loss": 3.0097, + "step": 38264 + }, + { + "epoch": 1.88, + "grad_norm": 0.6348758339881897, + "learning_rate": 0.0001851149638084889, + "loss": 2.8844, + "step": 38265 + }, + { + "epoch": 1.88, + "grad_norm": 0.6602434515953064, + "learning_rate": 0.00018510074143603575, + "loss": 2.8191, + "step": 38266 + }, + { + "epoch": 1.88, + "grad_norm": 0.6239683032035828, + "learning_rate": 0.00018508651936620504, + "loss": 2.9728, + "step": 38267 + }, + { + "epoch": 1.88, + "grad_norm": 0.7023791074752808, + "learning_rate": 0.00018507229759903436, + "loss": 2.9814, + "step": 38268 + }, + { + "epoch": 1.88, + "grad_norm": 0.7200928330421448, + "learning_rate": 0.00018505807613456116, + "loss": 2.8517, + "step": 38269 + }, + { + "epoch": 1.88, + "grad_norm": 0.6196789741516113, + "learning_rate": 0.00018504385497282281, + "loss": 3.0157, + "step": 38270 + }, + { + "epoch": 1.88, + "grad_norm": 0.6774440407752991, + "learning_rate": 0.00018502963411385688, + "loss": 2.9213, + "step": 38271 + }, + { + "epoch": 1.88, + "grad_norm": 0.6709611415863037, + "learning_rate": 0.00018501541355770074, + "loss": 3.0191, + "step": 38272 + }, + { + "epoch": 1.88, + "grad_norm": 0.6436893343925476, + "learning_rate": 0.00018500119330439182, + "loss": 3.0043, + "step": 38273 + }, + { + "epoch": 1.88, + "grad_norm": 0.6333373785018921, + "learning_rate": 0.00018498697335396773, + "loss": 2.9307, + "step": 38274 + }, + { + "epoch": 1.88, + "grad_norm": 0.6157318949699402, + "learning_rate": 0.00018497275370646575, + "loss": 3.2422, + "step": 38275 + }, + { + "epoch": 1.88, + "grad_norm": 0.6362408995628357, + "learning_rate": 0.00018495853436192348, + "loss": 2.9209, + "step": 38276 + }, + { + "epoch": 1.88, + "grad_norm": 0.6697286367416382, + "learning_rate": 0.00018494431532037816, + "loss": 2.9718, + "step": 38277 + }, + { + "epoch": 1.88, + "grad_norm": 0.6749752759933472, + "learning_rate": 0.00018493009658186743, + "loss": 2.9866, + "step": 38278 + }, + { + "epoch": 1.88, + "grad_norm": 0.6495433449745178, + "learning_rate": 0.00018491587814642874, + "loss": 3.0554, + "step": 38279 + }, + { + "epoch": 1.88, + "grad_norm": 0.6848012208938599, + "learning_rate": 0.00018490166001409935, + "loss": 3.0164, + "step": 38280 + }, + { + "epoch": 1.88, + "grad_norm": 0.7173035740852356, + "learning_rate": 0.00018488744218491694, + "loss": 3.077, + "step": 38281 + }, + { + "epoch": 1.88, + "grad_norm": 0.6794202327728271, + "learning_rate": 0.00018487322465891875, + "loss": 3.0307, + "step": 38282 + }, + { + "epoch": 1.88, + "grad_norm": 0.640831470489502, + "learning_rate": 0.00018485900743614244, + "loss": 2.995, + "step": 38283 + }, + { + "epoch": 1.88, + "grad_norm": 0.6711544394493103, + "learning_rate": 0.00018484479051662534, + "loss": 2.9166, + "step": 38284 + }, + { + "epoch": 1.88, + "grad_norm": 0.654302179813385, + "learning_rate": 0.00018483057390040482, + "loss": 3.2323, + "step": 38285 + }, + { + "epoch": 1.88, + "grad_norm": 0.6835788488388062, + "learning_rate": 0.00018481635758751852, + "loss": 3.0623, + "step": 38286 + }, + { + "epoch": 1.88, + "grad_norm": 0.6237308382987976, + "learning_rate": 0.00018480214157800368, + "loss": 3.0551, + "step": 38287 + }, + { + "epoch": 1.88, + "grad_norm": 0.652237594127655, + "learning_rate": 0.00018478792587189774, + "loss": 2.9883, + "step": 38288 + }, + { + "epoch": 1.88, + "grad_norm": 0.6164937019348145, + "learning_rate": 0.0001847737104692384, + "loss": 3.1875, + "step": 38289 + }, + { + "epoch": 1.88, + "grad_norm": 0.6360146403312683, + "learning_rate": 0.0001847594953700629, + "loss": 3.3418, + "step": 38290 + }, + { + "epoch": 1.88, + "grad_norm": 0.639178991317749, + "learning_rate": 0.0001847452805744087, + "loss": 2.9557, + "step": 38291 + }, + { + "epoch": 1.88, + "grad_norm": 0.6623160243034363, + "learning_rate": 0.0001847310660823132, + "loss": 2.8778, + "step": 38292 + }, + { + "epoch": 1.88, + "grad_norm": 0.7035412788391113, + "learning_rate": 0.0001847168518938139, + "loss": 3.1368, + "step": 38293 + }, + { + "epoch": 1.88, + "grad_norm": 0.6533422470092773, + "learning_rate": 0.00018470263800894827, + "loss": 3.0838, + "step": 38294 + }, + { + "epoch": 1.88, + "grad_norm": 0.7027701139450073, + "learning_rate": 0.00018468842442775362, + "loss": 2.8786, + "step": 38295 + }, + { + "epoch": 1.88, + "grad_norm": 0.6815171241760254, + "learning_rate": 0.0001846742111502676, + "loss": 2.8839, + "step": 38296 + }, + { + "epoch": 1.88, + "grad_norm": 0.6483194828033447, + "learning_rate": 0.0001846599981765274, + "loss": 2.8239, + "step": 38297 + }, + { + "epoch": 1.88, + "grad_norm": 0.6512202024459839, + "learning_rate": 0.00018464578550657053, + "loss": 2.9617, + "step": 38298 + }, + { + "epoch": 1.88, + "grad_norm": 0.6246964335441589, + "learning_rate": 0.00018463157314043462, + "loss": 2.7436, + "step": 38299 + }, + { + "epoch": 1.88, + "grad_norm": 0.6432050466537476, + "learning_rate": 0.00018461736107815675, + "loss": 3.1272, + "step": 38300 + }, + { + "epoch": 1.88, + "grad_norm": 0.7117769718170166, + "learning_rate": 0.00018460314931977481, + "loss": 2.9599, + "step": 38301 + }, + { + "epoch": 1.88, + "grad_norm": 0.6452391743659973, + "learning_rate": 0.0001845889378653257, + "loss": 3.0305, + "step": 38302 + }, + { + "epoch": 1.88, + "grad_norm": 0.6579294800758362, + "learning_rate": 0.00018457472671484717, + "loss": 3.053, + "step": 38303 + }, + { + "epoch": 1.88, + "grad_norm": 0.5954685211181641, + "learning_rate": 0.00018456051586837672, + "loss": 2.9531, + "step": 38304 + }, + { + "epoch": 1.88, + "grad_norm": 0.673682451248169, + "learning_rate": 0.00018454630532595154, + "loss": 2.987, + "step": 38305 + }, + { + "epoch": 1.88, + "grad_norm": 0.6650246381759644, + "learning_rate": 0.0001845320950876093, + "loss": 2.9635, + "step": 38306 + }, + { + "epoch": 1.88, + "grad_norm": 0.6644670367240906, + "learning_rate": 0.00018451788515338715, + "loss": 3.0036, + "step": 38307 + }, + { + "epoch": 1.88, + "grad_norm": 0.6150769591331482, + "learning_rate": 0.00018450367552332272, + "loss": 3.0552, + "step": 38308 + }, + { + "epoch": 1.88, + "grad_norm": 0.6673305630683899, + "learning_rate": 0.00018448946619745342, + "loss": 2.8861, + "step": 38309 + }, + { + "epoch": 1.88, + "grad_norm": 0.6792820692062378, + "learning_rate": 0.00018447525717581656, + "loss": 3.0457, + "step": 38310 + }, + { + "epoch": 1.88, + "grad_norm": 0.6672223210334778, + "learning_rate": 0.00018446104845844976, + "loss": 2.9729, + "step": 38311 + }, + { + "epoch": 1.88, + "grad_norm": 0.6425515413284302, + "learning_rate": 0.0001844468400453902, + "loss": 2.9787, + "step": 38312 + }, + { + "epoch": 1.88, + "grad_norm": 0.6276784539222717, + "learning_rate": 0.00018443263193667543, + "loss": 2.833, + "step": 38313 + }, + { + "epoch": 1.88, + "grad_norm": 0.6771143674850464, + "learning_rate": 0.00018441842413234296, + "loss": 2.79, + "step": 38314 + }, + { + "epoch": 1.88, + "grad_norm": 0.6400710940361023, + "learning_rate": 0.00018440421663243009, + "loss": 3.1405, + "step": 38315 + }, + { + "epoch": 1.88, + "grad_norm": 0.6542527079582214, + "learning_rate": 0.00018439000943697427, + "loss": 2.8716, + "step": 38316 + }, + { + "epoch": 1.88, + "grad_norm": 0.648661196231842, + "learning_rate": 0.0001843758025460128, + "loss": 2.9464, + "step": 38317 + }, + { + "epoch": 1.88, + "grad_norm": 0.6391183733940125, + "learning_rate": 0.00018436159595958332, + "loss": 3.1304, + "step": 38318 + }, + { + "epoch": 1.88, + "grad_norm": 0.6920697093009949, + "learning_rate": 0.0001843473896777232, + "loss": 2.9166, + "step": 38319 + }, + { + "epoch": 1.88, + "grad_norm": 0.6402799487113953, + "learning_rate": 0.00018433318370046964, + "loss": 2.9813, + "step": 38320 + }, + { + "epoch": 1.88, + "grad_norm": 0.6674555540084839, + "learning_rate": 0.00018431897802786035, + "loss": 3.056, + "step": 38321 + }, + { + "epoch": 1.88, + "grad_norm": 0.6335968971252441, + "learning_rate": 0.00018430477265993247, + "loss": 3.0924, + "step": 38322 + }, + { + "epoch": 1.88, + "grad_norm": 0.6576682329177856, + "learning_rate": 0.00018429056759672362, + "loss": 2.7713, + "step": 38323 + }, + { + "epoch": 1.88, + "grad_norm": 0.6932410001754761, + "learning_rate": 0.00018427636283827124, + "loss": 3.0169, + "step": 38324 + }, + { + "epoch": 1.88, + "grad_norm": 0.6399856209754944, + "learning_rate": 0.0001842621583846125, + "loss": 3.008, + "step": 38325 + }, + { + "epoch": 1.88, + "grad_norm": 0.65628582239151, + "learning_rate": 0.0001842479542357851, + "loss": 3.279, + "step": 38326 + }, + { + "epoch": 1.88, + "grad_norm": 0.6356684565544128, + "learning_rate": 0.00018423375039182624, + "loss": 2.8128, + "step": 38327 + }, + { + "epoch": 1.88, + "grad_norm": 0.6769824028015137, + "learning_rate": 0.00018421954685277347, + "loss": 2.8816, + "step": 38328 + }, + { + "epoch": 1.88, + "grad_norm": 0.6468422412872314, + "learning_rate": 0.00018420534361866396, + "loss": 2.7391, + "step": 38329 + }, + { + "epoch": 1.88, + "grad_norm": 0.653346598148346, + "learning_rate": 0.0001841911406895354, + "loss": 3.1066, + "step": 38330 + }, + { + "epoch": 1.88, + "grad_norm": 0.671597957611084, + "learning_rate": 0.00018417693806542512, + "loss": 2.9741, + "step": 38331 + }, + { + "epoch": 1.88, + "grad_norm": 0.6430724859237671, + "learning_rate": 0.00018416273574637043, + "loss": 3.0607, + "step": 38332 + }, + { + "epoch": 1.88, + "grad_norm": 0.6749132871627808, + "learning_rate": 0.00018414853373240888, + "loss": 3.0446, + "step": 38333 + }, + { + "epoch": 1.88, + "grad_norm": 0.6767664551734924, + "learning_rate": 0.00018413433202357774, + "loss": 2.8677, + "step": 38334 + }, + { + "epoch": 1.88, + "grad_norm": 0.633886992931366, + "learning_rate": 0.0001841201306199144, + "loss": 2.7959, + "step": 38335 + }, + { + "epoch": 1.88, + "grad_norm": 0.6434829831123352, + "learning_rate": 0.00018410592952145647, + "loss": 2.8927, + "step": 38336 + }, + { + "epoch": 1.88, + "grad_norm": 0.6419342160224915, + "learning_rate": 0.00018409172872824116, + "loss": 3.1004, + "step": 38337 + }, + { + "epoch": 1.88, + "grad_norm": 0.6574919819831848, + "learning_rate": 0.00018407752824030598, + "loss": 3.0997, + "step": 38338 + }, + { + "epoch": 1.88, + "grad_norm": 0.6425691246986389, + "learning_rate": 0.00018406332805768813, + "loss": 2.7971, + "step": 38339 + }, + { + "epoch": 1.88, + "grad_norm": 0.677888810634613, + "learning_rate": 0.0001840491281804253, + "loss": 2.7808, + "step": 38340 + }, + { + "epoch": 1.88, + "grad_norm": 0.6419331431388855, + "learning_rate": 0.00018403492860855474, + "loss": 2.9517, + "step": 38341 + }, + { + "epoch": 1.88, + "grad_norm": 0.6205721497535706, + "learning_rate": 0.00018402072934211377, + "loss": 3.0624, + "step": 38342 + }, + { + "epoch": 1.88, + "grad_norm": 0.6749025583267212, + "learning_rate": 0.00018400653038114, + "loss": 2.9666, + "step": 38343 + }, + { + "epoch": 1.88, + "grad_norm": 0.6735125184059143, + "learning_rate": 0.00018399233172567057, + "loss": 2.8711, + "step": 38344 + }, + { + "epoch": 1.88, + "grad_norm": 0.640764594078064, + "learning_rate": 0.00018397813337574307, + "loss": 3.1523, + "step": 38345 + }, + { + "epoch": 1.88, + "grad_norm": 0.6429493427276611, + "learning_rate": 0.00018396393533139492, + "loss": 2.983, + "step": 38346 + }, + { + "epoch": 1.88, + "grad_norm": 0.6589473485946655, + "learning_rate": 0.0001839497375926633, + "loss": 2.9291, + "step": 38347 + }, + { + "epoch": 1.88, + "grad_norm": 0.6674602031707764, + "learning_rate": 0.00018393554015958588, + "loss": 3.0551, + "step": 38348 + }, + { + "epoch": 1.88, + "grad_norm": 0.6474887132644653, + "learning_rate": 0.00018392134303219984, + "loss": 2.9849, + "step": 38349 + }, + { + "epoch": 1.88, + "grad_norm": 0.6512155532836914, + "learning_rate": 0.00018390714621054253, + "loss": 3.0877, + "step": 38350 + }, + { + "epoch": 1.88, + "grad_norm": 0.6206179261207581, + "learning_rate": 0.00018389294969465165, + "loss": 3.1422, + "step": 38351 + }, + { + "epoch": 1.88, + "grad_norm": 0.6520265340805054, + "learning_rate": 0.00018387875348456427, + "loss": 2.8446, + "step": 38352 + }, + { + "epoch": 1.88, + "grad_norm": 0.6797086596488953, + "learning_rate": 0.000183864557580318, + "loss": 3.1531, + "step": 38353 + }, + { + "epoch": 1.88, + "grad_norm": 0.6597195863723755, + "learning_rate": 0.00018385036198195, + "loss": 2.9861, + "step": 38354 + }, + { + "epoch": 1.88, + "grad_norm": 0.6018645167350769, + "learning_rate": 0.0001838361666894979, + "loss": 3.0342, + "step": 38355 + }, + { + "epoch": 1.88, + "grad_norm": 0.6335331797599792, + "learning_rate": 0.000183821971702999, + "loss": 2.9257, + "step": 38356 + }, + { + "epoch": 1.88, + "grad_norm": 0.6828946471214294, + "learning_rate": 0.0001838077770224905, + "loss": 2.9478, + "step": 38357 + }, + { + "epoch": 1.88, + "grad_norm": 0.6436010599136353, + "learning_rate": 0.00018379358264801018, + "loss": 3.0097, + "step": 38358 + }, + { + "epoch": 1.88, + "grad_norm": 0.6501964926719666, + "learning_rate": 0.00018377938857959509, + "loss": 2.8804, + "step": 38359 + }, + { + "epoch": 1.88, + "grad_norm": 0.6490870118141174, + "learning_rate": 0.00018376519481728263, + "loss": 2.8949, + "step": 38360 + }, + { + "epoch": 1.88, + "grad_norm": 0.6395012140274048, + "learning_rate": 0.00018375100136111043, + "loss": 3.0246, + "step": 38361 + }, + { + "epoch": 1.88, + "grad_norm": 0.6163665056228638, + "learning_rate": 0.00018373680821111562, + "loss": 3.0793, + "step": 38362 + }, + { + "epoch": 1.88, + "grad_norm": 0.635441243648529, + "learning_rate": 0.00018372261536733582, + "loss": 3.1323, + "step": 38363 + }, + { + "epoch": 1.88, + "grad_norm": 0.6642354726791382, + "learning_rate": 0.0001837084228298081, + "loss": 2.957, + "step": 38364 + }, + { + "epoch": 1.88, + "grad_norm": 0.666587769985199, + "learning_rate": 0.00018369423059857, + "loss": 3.0932, + "step": 38365 + }, + { + "epoch": 1.88, + "grad_norm": 0.6363824605941772, + "learning_rate": 0.00018368003867365909, + "loss": 2.7929, + "step": 38366 + }, + { + "epoch": 1.88, + "grad_norm": 0.6824597716331482, + "learning_rate": 0.0001836658470551125, + "loss": 3.0304, + "step": 38367 + }, + { + "epoch": 1.88, + "grad_norm": 0.6351178288459778, + "learning_rate": 0.00018365165574296766, + "loss": 3.0697, + "step": 38368 + }, + { + "epoch": 1.88, + "grad_norm": 0.6279469728469849, + "learning_rate": 0.00018363746473726194, + "loss": 2.9615, + "step": 38369 + }, + { + "epoch": 1.88, + "grad_norm": 0.6463478803634644, + "learning_rate": 0.00018362327403803277, + "loss": 3.1272, + "step": 38370 + }, + { + "epoch": 1.88, + "grad_norm": 0.6736152768135071, + "learning_rate": 0.00018360908364531756, + "loss": 3.0822, + "step": 38371 + }, + { + "epoch": 1.88, + "grad_norm": 0.627798318862915, + "learning_rate": 0.0001835948935591535, + "loss": 3.2404, + "step": 38372 + }, + { + "epoch": 1.88, + "grad_norm": 0.6646186709403992, + "learning_rate": 0.00018358070377957824, + "loss": 2.8242, + "step": 38373 + }, + { + "epoch": 1.88, + "grad_norm": 0.6356680393218994, + "learning_rate": 0.00018356651430662888, + "loss": 2.7888, + "step": 38374 + }, + { + "epoch": 1.88, + "grad_norm": 0.6446818709373474, + "learning_rate": 0.0001835523251403429, + "loss": 2.9223, + "step": 38375 + }, + { + "epoch": 1.88, + "grad_norm": 0.7403597235679626, + "learning_rate": 0.00018353813628075775, + "loss": 3.102, + "step": 38376 + }, + { + "epoch": 1.88, + "grad_norm": 0.6128467321395874, + "learning_rate": 0.00018352394772791071, + "loss": 2.79, + "step": 38377 + }, + { + "epoch": 1.88, + "grad_norm": 0.6641044020652771, + "learning_rate": 0.00018350975948183926, + "loss": 2.8962, + "step": 38378 + }, + { + "epoch": 1.88, + "grad_norm": 0.6116800308227539, + "learning_rate": 0.00018349557154258056, + "loss": 3.0184, + "step": 38379 + }, + { + "epoch": 1.88, + "grad_norm": 0.6737449765205383, + "learning_rate": 0.00018348138391017214, + "loss": 3.0201, + "step": 38380 + }, + { + "epoch": 1.88, + "grad_norm": 0.6416964530944824, + "learning_rate": 0.0001834671965846514, + "loss": 3.0682, + "step": 38381 + }, + { + "epoch": 1.88, + "grad_norm": 0.6830700635910034, + "learning_rate": 0.00018345300956605553, + "loss": 2.9621, + "step": 38382 + }, + { + "epoch": 1.88, + "grad_norm": 0.6257073283195496, + "learning_rate": 0.0001834388228544221, + "loss": 3.1164, + "step": 38383 + }, + { + "epoch": 1.88, + "grad_norm": 0.6321331858634949, + "learning_rate": 0.00018342463644978825, + "loss": 3.0862, + "step": 38384 + }, + { + "epoch": 1.88, + "grad_norm": 0.6541038155555725, + "learning_rate": 0.00018341045035219153, + "loss": 3.0973, + "step": 38385 + }, + { + "epoch": 1.88, + "grad_norm": 0.691582202911377, + "learning_rate": 0.00018339626456166934, + "loss": 3.0136, + "step": 38386 + }, + { + "epoch": 1.88, + "grad_norm": 0.6166252493858337, + "learning_rate": 0.00018338207907825878, + "loss": 3.0924, + "step": 38387 + }, + { + "epoch": 1.88, + "grad_norm": 0.6885400414466858, + "learning_rate": 0.00018336789390199757, + "loss": 3.0155, + "step": 38388 + }, + { + "epoch": 1.88, + "grad_norm": 0.6976376175880432, + "learning_rate": 0.00018335370903292277, + "loss": 3.1868, + "step": 38389 + }, + { + "epoch": 1.88, + "grad_norm": 0.6312760710716248, + "learning_rate": 0.0001833395244710718, + "loss": 3.015, + "step": 38390 + }, + { + "epoch": 1.88, + "grad_norm": 0.6591302752494812, + "learning_rate": 0.0001833253402164822, + "loss": 2.9672, + "step": 38391 + }, + { + "epoch": 1.88, + "grad_norm": 0.647793710231781, + "learning_rate": 0.00018331115626919113, + "loss": 2.8549, + "step": 38392 + }, + { + "epoch": 1.88, + "grad_norm": 0.6505529284477234, + "learning_rate": 0.0001832969726292361, + "loss": 2.9445, + "step": 38393 + }, + { + "epoch": 1.88, + "grad_norm": 0.7024715542793274, + "learning_rate": 0.00018328278929665424, + "loss": 3.0583, + "step": 38394 + }, + { + "epoch": 1.88, + "grad_norm": 0.6506431698799133, + "learning_rate": 0.00018326860627148312, + "loss": 2.8853, + "step": 38395 + }, + { + "epoch": 1.88, + "grad_norm": 0.6874439120292664, + "learning_rate": 0.0001832544235537601, + "loss": 3.0919, + "step": 38396 + }, + { + "epoch": 1.88, + "grad_norm": 0.6847971081733704, + "learning_rate": 0.00018324024114352227, + "loss": 3.0551, + "step": 38397 + }, + { + "epoch": 1.88, + "grad_norm": 0.6514188647270203, + "learning_rate": 0.00018322605904080738, + "loss": 3.0029, + "step": 38398 + }, + { + "epoch": 1.88, + "grad_norm": 0.6254274249076843, + "learning_rate": 0.00018321187724565244, + "loss": 2.9289, + "step": 38399 + }, + { + "epoch": 1.88, + "grad_norm": 0.7166518568992615, + "learning_rate": 0.00018319769575809498, + "loss": 3.158, + "step": 38400 + }, + { + "epoch": 1.88, + "grad_norm": 0.6519727110862732, + "learning_rate": 0.00018318351457817235, + "loss": 3.1565, + "step": 38401 + }, + { + "epoch": 1.88, + "grad_norm": 0.6201913356781006, + "learning_rate": 0.00018316933370592182, + "loss": 2.9268, + "step": 38402 + }, + { + "epoch": 1.88, + "grad_norm": 0.6431904435157776, + "learning_rate": 0.00018315515314138085, + "loss": 2.8744, + "step": 38403 + }, + { + "epoch": 1.88, + "grad_norm": 0.6314598917961121, + "learning_rate": 0.0001831409728845866, + "loss": 3.0051, + "step": 38404 + }, + { + "epoch": 1.88, + "grad_norm": 0.6558628082275391, + "learning_rate": 0.0001831267929355765, + "loss": 2.7542, + "step": 38405 + }, + { + "epoch": 1.88, + "grad_norm": 0.6240569353103638, + "learning_rate": 0.0001831126132943881, + "loss": 3.0997, + "step": 38406 + }, + { + "epoch": 1.88, + "grad_norm": 0.6257813572883606, + "learning_rate": 0.00018309843396105852, + "loss": 2.8727, + "step": 38407 + }, + { + "epoch": 1.88, + "grad_norm": 0.6663781404495239, + "learning_rate": 0.0001830842549356252, + "loss": 2.9911, + "step": 38408 + }, + { + "epoch": 1.88, + "grad_norm": 0.634472131729126, + "learning_rate": 0.00018307007621812534, + "loss": 3.0498, + "step": 38409 + }, + { + "epoch": 1.88, + "grad_norm": 0.6562722325325012, + "learning_rate": 0.0001830558978085965, + "loss": 2.6464, + "step": 38410 + }, + { + "epoch": 1.88, + "grad_norm": 0.6011884212493896, + "learning_rate": 0.0001830417197070759, + "loss": 2.8752, + "step": 38411 + }, + { + "epoch": 1.88, + "grad_norm": 0.6544702053070068, + "learning_rate": 0.00018302754191360082, + "loss": 2.94, + "step": 38412 + }, + { + "epoch": 1.88, + "grad_norm": 0.6202430129051208, + "learning_rate": 0.00018301336442820882, + "loss": 3.1568, + "step": 38413 + }, + { + "epoch": 1.88, + "grad_norm": 0.6908934712409973, + "learning_rate": 0.00018299918725093702, + "loss": 3.1212, + "step": 38414 + }, + { + "epoch": 1.88, + "grad_norm": 0.6435255408287048, + "learning_rate": 0.0001829850103818229, + "loss": 2.9659, + "step": 38415 + }, + { + "epoch": 1.88, + "grad_norm": 0.6239068508148193, + "learning_rate": 0.00018297083382090364, + "loss": 2.8303, + "step": 38416 + }, + { + "epoch": 1.88, + "grad_norm": 0.6315820813179016, + "learning_rate": 0.00018295665756821674, + "loss": 3.0459, + "step": 38417 + }, + { + "epoch": 1.88, + "grad_norm": 0.6654682755470276, + "learning_rate": 0.00018294248162379953, + "loss": 3.1875, + "step": 38418 + }, + { + "epoch": 1.88, + "grad_norm": 0.6455845832824707, + "learning_rate": 0.00018292830598768916, + "loss": 3.2678, + "step": 38419 + }, + { + "epoch": 1.88, + "grad_norm": 0.6427332758903503, + "learning_rate": 0.00018291413065992326, + "loss": 2.9188, + "step": 38420 + }, + { + "epoch": 1.88, + "grad_norm": 0.6541956067085266, + "learning_rate": 0.0001828999556405389, + "loss": 3.0159, + "step": 38421 + }, + { + "epoch": 1.88, + "grad_norm": 0.6762649416923523, + "learning_rate": 0.0001828857809295735, + "loss": 3.1659, + "step": 38422 + }, + { + "epoch": 1.88, + "grad_norm": 0.6119546890258789, + "learning_rate": 0.00018287160652706455, + "loss": 2.9543, + "step": 38423 + }, + { + "epoch": 1.88, + "grad_norm": 0.6221927404403687, + "learning_rate": 0.00018285743243304903, + "loss": 2.8137, + "step": 38424 + }, + { + "epoch": 1.88, + "grad_norm": 0.6315422654151917, + "learning_rate": 0.00018284325864756468, + "loss": 3.1488, + "step": 38425 + }, + { + "epoch": 1.88, + "grad_norm": 0.6791592240333557, + "learning_rate": 0.0001828290851706486, + "loss": 3.1033, + "step": 38426 + }, + { + "epoch": 1.88, + "grad_norm": 0.6015139222145081, + "learning_rate": 0.00018281491200233807, + "loss": 2.8296, + "step": 38427 + }, + { + "epoch": 1.88, + "grad_norm": 0.6524919867515564, + "learning_rate": 0.00018280073914267065, + "loss": 3.0343, + "step": 38428 + }, + { + "epoch": 1.88, + "grad_norm": 0.663172721862793, + "learning_rate": 0.00018278656659168345, + "loss": 3.046, + "step": 38429 + }, + { + "epoch": 1.88, + "grad_norm": 0.634985625743866, + "learning_rate": 0.0001827723943494139, + "loss": 3.1256, + "step": 38430 + }, + { + "epoch": 1.88, + "grad_norm": 0.6688196063041687, + "learning_rate": 0.00018275822241589927, + "loss": 2.8842, + "step": 38431 + }, + { + "epoch": 1.88, + "grad_norm": 0.6805686354637146, + "learning_rate": 0.0001827440507911769, + "loss": 2.8134, + "step": 38432 + }, + { + "epoch": 1.88, + "grad_norm": 0.6817598342895508, + "learning_rate": 0.00018272987947528423, + "loss": 3.2789, + "step": 38433 + }, + { + "epoch": 1.88, + "grad_norm": 0.6709921956062317, + "learning_rate": 0.00018271570846825833, + "loss": 3.0719, + "step": 38434 + }, + { + "epoch": 1.88, + "grad_norm": 0.6306078433990479, + "learning_rate": 0.00018270153777013684, + "loss": 2.9929, + "step": 38435 + }, + { + "epoch": 1.88, + "grad_norm": 0.6629069447517395, + "learning_rate": 0.00018268736738095688, + "loss": 3.0485, + "step": 38436 + }, + { + "epoch": 1.88, + "grad_norm": 0.6367194652557373, + "learning_rate": 0.00018267319730075573, + "loss": 2.9225, + "step": 38437 + }, + { + "epoch": 1.88, + "grad_norm": 0.6755772829055786, + "learning_rate": 0.0001826590275295709, + "loss": 2.9001, + "step": 38438 + }, + { + "epoch": 1.88, + "grad_norm": 0.6794211268424988, + "learning_rate": 0.0001826448580674396, + "loss": 3.0339, + "step": 38439 + }, + { + "epoch": 1.88, + "grad_norm": 0.7521235942840576, + "learning_rate": 0.0001826306889143992, + "loss": 2.8449, + "step": 38440 + }, + { + "epoch": 1.88, + "grad_norm": 0.6859071254730225, + "learning_rate": 0.0001826165200704868, + "loss": 3.0413, + "step": 38441 + }, + { + "epoch": 1.88, + "grad_norm": 0.6678243279457092, + "learning_rate": 0.00018260235153574, + "loss": 2.9641, + "step": 38442 + }, + { + "epoch": 1.88, + "grad_norm": 0.6535446643829346, + "learning_rate": 0.00018258818331019606, + "loss": 3.1897, + "step": 38443 + }, + { + "epoch": 1.88, + "grad_norm": 0.6140435338020325, + "learning_rate": 0.00018257401539389218, + "loss": 2.9456, + "step": 38444 + }, + { + "epoch": 1.88, + "grad_norm": 0.6568311452865601, + "learning_rate": 0.00018255984778686583, + "loss": 3.0298, + "step": 38445 + }, + { + "epoch": 1.88, + "grad_norm": 0.6603125333786011, + "learning_rate": 0.0001825456804891541, + "loss": 2.9342, + "step": 38446 + }, + { + "epoch": 1.88, + "grad_norm": 0.623427152633667, + "learning_rate": 0.0001825315135007945, + "loss": 3.0201, + "step": 38447 + }, + { + "epoch": 1.88, + "grad_norm": 0.6774618029594421, + "learning_rate": 0.00018251734682182437, + "loss": 3.215, + "step": 38448 + }, + { + "epoch": 1.88, + "grad_norm": 0.6762425899505615, + "learning_rate": 0.00018250318045228081, + "loss": 2.904, + "step": 38449 + }, + { + "epoch": 1.88, + "grad_norm": 0.6597325205802917, + "learning_rate": 0.0001824890143922014, + "loss": 3.0122, + "step": 38450 + }, + { + "epoch": 1.88, + "grad_norm": 0.6491790413856506, + "learning_rate": 0.00018247484864162322, + "loss": 3.1313, + "step": 38451 + }, + { + "epoch": 1.88, + "grad_norm": 0.6365981101989746, + "learning_rate": 0.00018246068320058366, + "loss": 3.0639, + "step": 38452 + }, + { + "epoch": 1.88, + "grad_norm": 0.6332027912139893, + "learning_rate": 0.0001824465180691201, + "loss": 2.9355, + "step": 38453 + }, + { + "epoch": 1.88, + "grad_norm": 0.6683153510093689, + "learning_rate": 0.00018243235324726974, + "loss": 3.1984, + "step": 38454 + }, + { + "epoch": 1.88, + "grad_norm": 0.6562347412109375, + "learning_rate": 0.00018241818873507003, + "loss": 2.9301, + "step": 38455 + }, + { + "epoch": 1.88, + "grad_norm": 0.6307790279388428, + "learning_rate": 0.00018240402453255803, + "loss": 2.8323, + "step": 38456 + }, + { + "epoch": 1.88, + "grad_norm": 0.626919686794281, + "learning_rate": 0.00018238986063977126, + "loss": 3.0347, + "step": 38457 + }, + { + "epoch": 1.88, + "grad_norm": 0.6405884027481079, + "learning_rate": 0.00018237569705674704, + "loss": 3.0111, + "step": 38458 + }, + { + "epoch": 1.88, + "grad_norm": 0.6130412817001343, + "learning_rate": 0.00018236153378352248, + "loss": 3.0065, + "step": 38459 + }, + { + "epoch": 1.88, + "grad_norm": 0.6970798969268799, + "learning_rate": 0.00018234737082013506, + "loss": 2.88, + "step": 38460 + }, + { + "epoch": 1.88, + "grad_norm": 0.6523925065994263, + "learning_rate": 0.00018233320816662206, + "loss": 2.9462, + "step": 38461 + }, + { + "epoch": 1.88, + "grad_norm": 0.6778101921081543, + "learning_rate": 0.0001823190458230206, + "loss": 3.0931, + "step": 38462 + }, + { + "epoch": 1.88, + "grad_norm": 0.6423189640045166, + "learning_rate": 0.00018230488378936826, + "loss": 3.1616, + "step": 38463 + }, + { + "epoch": 1.89, + "grad_norm": 0.6352372765541077, + "learning_rate": 0.00018229072206570212, + "loss": 2.9447, + "step": 38464 + }, + { + "epoch": 1.89, + "grad_norm": 0.627549946308136, + "learning_rate": 0.0001822765606520597, + "loss": 3.098, + "step": 38465 + }, + { + "epoch": 1.89, + "grad_norm": 0.6376739740371704, + "learning_rate": 0.00018226239954847805, + "loss": 3.143, + "step": 38466 + }, + { + "epoch": 1.89, + "grad_norm": 0.7025358080863953, + "learning_rate": 0.00018224823875499451, + "loss": 3.028, + "step": 38467 + }, + { + "epoch": 1.89, + "grad_norm": 0.6670701503753662, + "learning_rate": 0.00018223407827164663, + "loss": 2.9701, + "step": 38468 + }, + { + "epoch": 1.89, + "grad_norm": 0.6625725626945496, + "learning_rate": 0.00018221991809847142, + "loss": 2.9864, + "step": 38469 + }, + { + "epoch": 1.89, + "grad_norm": 0.6725914478302002, + "learning_rate": 0.00018220575823550635, + "loss": 3.1172, + "step": 38470 + }, + { + "epoch": 1.89, + "grad_norm": 0.6515083909034729, + "learning_rate": 0.00018219159868278852, + "loss": 3.0334, + "step": 38471 + }, + { + "epoch": 1.89, + "grad_norm": 0.6303300261497498, + "learning_rate": 0.0001821774394403554, + "loss": 2.7711, + "step": 38472 + }, + { + "epoch": 1.89, + "grad_norm": 0.6697096228599548, + "learning_rate": 0.0001821632805082443, + "loss": 2.9665, + "step": 38473 + }, + { + "epoch": 1.89, + "grad_norm": 0.657671332359314, + "learning_rate": 0.00018214912188649232, + "loss": 2.8763, + "step": 38474 + }, + { + "epoch": 1.89, + "grad_norm": 0.6618888974189758, + "learning_rate": 0.00018213496357513703, + "loss": 2.7808, + "step": 38475 + }, + { + "epoch": 1.89, + "grad_norm": 0.652420163154602, + "learning_rate": 0.00018212080557421543, + "loss": 3.0118, + "step": 38476 + }, + { + "epoch": 1.89, + "grad_norm": 0.6736677289009094, + "learning_rate": 0.00018210664788376491, + "loss": 3.188, + "step": 38477 + }, + { + "epoch": 1.89, + "grad_norm": 0.6545586585998535, + "learning_rate": 0.0001820924905038229, + "loss": 3.0641, + "step": 38478 + }, + { + "epoch": 1.89, + "grad_norm": 0.6806058883666992, + "learning_rate": 0.00018207833343442657, + "loss": 2.9763, + "step": 38479 + }, + { + "epoch": 1.89, + "grad_norm": 0.6442785263061523, + "learning_rate": 0.00018206417667561322, + "loss": 3.1018, + "step": 38480 + }, + { + "epoch": 1.89, + "grad_norm": 0.6341136693954468, + "learning_rate": 0.00018205002022742006, + "loss": 2.9789, + "step": 38481 + }, + { + "epoch": 1.89, + "grad_norm": 0.6478147506713867, + "learning_rate": 0.00018203586408988446, + "loss": 3.0614, + "step": 38482 + }, + { + "epoch": 1.89, + "grad_norm": 0.6580110788345337, + "learning_rate": 0.00018202170826304374, + "loss": 2.8771, + "step": 38483 + }, + { + "epoch": 1.89, + "grad_norm": 0.6745438575744629, + "learning_rate": 0.00018200755274693506, + "loss": 3.0367, + "step": 38484 + }, + { + "epoch": 1.89, + "grad_norm": 0.6208056211471558, + "learning_rate": 0.0001819933975415959, + "loss": 3.0012, + "step": 38485 + }, + { + "epoch": 1.89, + "grad_norm": 0.6185520887374878, + "learning_rate": 0.00018197924264706325, + "loss": 2.8297, + "step": 38486 + }, + { + "epoch": 1.89, + "grad_norm": 0.6431528925895691, + "learning_rate": 0.00018196508806337476, + "loss": 3.266, + "step": 38487 + }, + { + "epoch": 1.89, + "grad_norm": 0.660499095916748, + "learning_rate": 0.0001819509337905674, + "loss": 2.9249, + "step": 38488 + }, + { + "epoch": 1.89, + "grad_norm": 0.636012077331543, + "learning_rate": 0.00018193677982867847, + "loss": 3.2491, + "step": 38489 + }, + { + "epoch": 1.89, + "grad_norm": 0.6246479153633118, + "learning_rate": 0.00018192262617774553, + "loss": 2.9983, + "step": 38490 + }, + { + "epoch": 1.89, + "grad_norm": 0.6834371089935303, + "learning_rate": 0.00018190847283780555, + "loss": 2.9373, + "step": 38491 + }, + { + "epoch": 1.89, + "grad_norm": 0.6637904047966003, + "learning_rate": 0.000181894319808896, + "loss": 2.8806, + "step": 38492 + }, + { + "epoch": 1.89, + "grad_norm": 0.6326305270195007, + "learning_rate": 0.000181880167091054, + "loss": 3.0709, + "step": 38493 + }, + { + "epoch": 1.89, + "grad_norm": 0.6314154267311096, + "learning_rate": 0.00018186601468431695, + "loss": 2.9915, + "step": 38494 + }, + { + "epoch": 1.89, + "grad_norm": 0.6132782697677612, + "learning_rate": 0.00018185186258872212, + "loss": 3.0361, + "step": 38495 + }, + { + "epoch": 1.89, + "grad_norm": 0.6412111520767212, + "learning_rate": 0.0001818377108043066, + "loss": 2.7906, + "step": 38496 + }, + { + "epoch": 1.89, + "grad_norm": 0.6674846410751343, + "learning_rate": 0.000181823559331108, + "loss": 2.857, + "step": 38497 + }, + { + "epoch": 1.89, + "grad_norm": 0.8719302415847778, + "learning_rate": 0.00018180940816916327, + "loss": 2.9331, + "step": 38498 + }, + { + "epoch": 1.89, + "grad_norm": 0.6447135210037231, + "learning_rate": 0.00018179525731850982, + "loss": 3.0817, + "step": 38499 + }, + { + "epoch": 1.89, + "grad_norm": 0.6724530458450317, + "learning_rate": 0.000181781106779185, + "loss": 2.9817, + "step": 38500 + }, + { + "epoch": 1.89, + "grad_norm": 0.6975722908973694, + "learning_rate": 0.00018176695655122596, + "loss": 3.1141, + "step": 38501 + }, + { + "epoch": 1.89, + "grad_norm": 0.6536849141120911, + "learning_rate": 0.00018175280663467003, + "loss": 2.867, + "step": 38502 + }, + { + "epoch": 1.89, + "grad_norm": 0.6566682457923889, + "learning_rate": 0.0001817386570295543, + "loss": 2.8168, + "step": 38503 + }, + { + "epoch": 1.89, + "grad_norm": 0.7100996971130371, + "learning_rate": 0.0001817245077359163, + "loss": 2.9156, + "step": 38504 + }, + { + "epoch": 1.89, + "grad_norm": 0.6142645478248596, + "learning_rate": 0.00018171035875379324, + "loss": 3.0425, + "step": 38505 + }, + { + "epoch": 1.89, + "grad_norm": 0.6454697251319885, + "learning_rate": 0.0001816962100832222, + "loss": 3.0476, + "step": 38506 + }, + { + "epoch": 1.89, + "grad_norm": 0.6202601790428162, + "learning_rate": 0.0001816820617242407, + "loss": 2.9302, + "step": 38507 + }, + { + "epoch": 1.89, + "grad_norm": 0.6279861927032471, + "learning_rate": 0.00018166791367688574, + "loss": 2.9165, + "step": 38508 + }, + { + "epoch": 1.89, + "grad_norm": 0.6671816110610962, + "learning_rate": 0.0001816537659411948, + "loss": 2.832, + "step": 38509 + }, + { + "epoch": 1.89, + "grad_norm": 0.6781206130981445, + "learning_rate": 0.00018163961851720513, + "loss": 2.9789, + "step": 38510 + }, + { + "epoch": 1.89, + "grad_norm": 0.6522396802902222, + "learning_rate": 0.0001816254714049538, + "loss": 2.8528, + "step": 38511 + }, + { + "epoch": 1.89, + "grad_norm": 0.643043577671051, + "learning_rate": 0.0001816113246044783, + "loss": 2.8771, + "step": 38512 + }, + { + "epoch": 1.89, + "grad_norm": 0.6434921026229858, + "learning_rate": 0.00018159717811581575, + "loss": 2.9918, + "step": 38513 + }, + { + "epoch": 1.89, + "grad_norm": 0.6609666347503662, + "learning_rate": 0.00018158303193900336, + "loss": 3.013, + "step": 38514 + }, + { + "epoch": 1.89, + "grad_norm": 0.6703172326087952, + "learning_rate": 0.0001815688860740786, + "loss": 3.0498, + "step": 38515 + }, + { + "epoch": 1.89, + "grad_norm": 0.6868824362754822, + "learning_rate": 0.00018155474052107857, + "loss": 3.0622, + "step": 38516 + }, + { + "epoch": 1.89, + "grad_norm": 0.632976770401001, + "learning_rate": 0.00018154059528004057, + "loss": 3.2264, + "step": 38517 + }, + { + "epoch": 1.89, + "grad_norm": 0.7155187129974365, + "learning_rate": 0.00018152645035100177, + "loss": 2.9952, + "step": 38518 + }, + { + "epoch": 1.89, + "grad_norm": 0.6054431796073914, + "learning_rate": 0.00018151230573399955, + "loss": 2.9829, + "step": 38519 + }, + { + "epoch": 1.89, + "grad_norm": 0.6408286690711975, + "learning_rate": 0.00018149816142907114, + "loss": 2.8755, + "step": 38520 + }, + { + "epoch": 1.89, + "grad_norm": 0.6466242074966431, + "learning_rate": 0.00018148401743625368, + "loss": 3.2028, + "step": 38521 + }, + { + "epoch": 1.89, + "grad_norm": 0.6467182636260986, + "learning_rate": 0.00018146987375558462, + "loss": 2.9694, + "step": 38522 + }, + { + "epoch": 1.89, + "grad_norm": 0.6525686979293823, + "learning_rate": 0.00018145573038710105, + "loss": 2.9897, + "step": 38523 + }, + { + "epoch": 1.89, + "grad_norm": 0.6323734521865845, + "learning_rate": 0.00018144158733084023, + "loss": 2.8444, + "step": 38524 + }, + { + "epoch": 1.89, + "grad_norm": 0.6349821090698242, + "learning_rate": 0.00018142744458683952, + "loss": 3.0805, + "step": 38525 + }, + { + "epoch": 1.89, + "grad_norm": 0.6364793181419373, + "learning_rate": 0.000181413302155136, + "loss": 2.8423, + "step": 38526 + }, + { + "epoch": 1.89, + "grad_norm": 0.6155493259429932, + "learning_rate": 0.00018139916003576722, + "loss": 3.0647, + "step": 38527 + }, + { + "epoch": 1.89, + "grad_norm": 0.62147456407547, + "learning_rate": 0.00018138501822877003, + "loss": 2.977, + "step": 38528 + }, + { + "epoch": 1.89, + "grad_norm": 0.6351068615913391, + "learning_rate": 0.00018137087673418184, + "loss": 3.0379, + "step": 38529 + }, + { + "epoch": 1.89, + "grad_norm": 0.6419429779052734, + "learning_rate": 0.00018135673555204012, + "loss": 2.8855, + "step": 38530 + }, + { + "epoch": 1.89, + "grad_norm": 0.6373555660247803, + "learning_rate": 0.00018134259468238182, + "loss": 2.8341, + "step": 38531 + }, + { + "epoch": 1.89, + "grad_norm": 0.687985360622406, + "learning_rate": 0.0001813284541252444, + "loss": 2.7618, + "step": 38532 + }, + { + "epoch": 1.89, + "grad_norm": 0.6934620141983032, + "learning_rate": 0.0001813143138806648, + "loss": 2.9882, + "step": 38533 + }, + { + "epoch": 1.89, + "grad_norm": 0.6637023687362671, + "learning_rate": 0.00018130017394868056, + "loss": 3.0607, + "step": 38534 + }, + { + "epoch": 1.89, + "grad_norm": 0.6849845051765442, + "learning_rate": 0.0001812860343293289, + "loss": 2.9292, + "step": 38535 + }, + { + "epoch": 1.89, + "grad_norm": 0.6725578308105469, + "learning_rate": 0.0001812718950226468, + "loss": 3.3626, + "step": 38536 + }, + { + "epoch": 1.89, + "grad_norm": 0.6605555415153503, + "learning_rate": 0.0001812577560286718, + "loss": 2.9055, + "step": 38537 + }, + { + "epoch": 1.89, + "grad_norm": 0.6455115079879761, + "learning_rate": 0.000181243617347441, + "loss": 2.9905, + "step": 38538 + }, + { + "epoch": 1.89, + "grad_norm": 0.6488058567047119, + "learning_rate": 0.0001812294789789916, + "loss": 2.8396, + "step": 38539 + }, + { + "epoch": 1.89, + "grad_norm": 0.6056329011917114, + "learning_rate": 0.00018121534092336097, + "loss": 2.8001, + "step": 38540 + }, + { + "epoch": 1.89, + "grad_norm": 0.6151305437088013, + "learning_rate": 0.00018120120318058624, + "loss": 3.2172, + "step": 38541 + }, + { + "epoch": 1.89, + "grad_norm": 0.6283738613128662, + "learning_rate": 0.00018118706575070474, + "loss": 2.85, + "step": 38542 + }, + { + "epoch": 1.89, + "grad_norm": 0.6253724098205566, + "learning_rate": 0.00018117292863375353, + "loss": 2.6845, + "step": 38543 + }, + { + "epoch": 1.89, + "grad_norm": 0.6234411597251892, + "learning_rate": 0.00018115879182977, + "loss": 3.0064, + "step": 38544 + }, + { + "epoch": 1.89, + "grad_norm": 0.6679108142852783, + "learning_rate": 0.00018114465533879142, + "loss": 2.8951, + "step": 38545 + }, + { + "epoch": 1.89, + "grad_norm": 0.6431871056556702, + "learning_rate": 0.0001811305191608548, + "loss": 2.9186, + "step": 38546 + }, + { + "epoch": 1.89, + "grad_norm": 0.6422107219696045, + "learning_rate": 0.00018111638329599763, + "loss": 2.9075, + "step": 38547 + }, + { + "epoch": 1.89, + "grad_norm": 0.6774196028709412, + "learning_rate": 0.00018110224774425696, + "loss": 2.915, + "step": 38548 + }, + { + "epoch": 1.89, + "grad_norm": 0.6358468532562256, + "learning_rate": 0.00018108811250567005, + "loss": 2.9972, + "step": 38549 + }, + { + "epoch": 1.89, + "grad_norm": 0.6383891701698303, + "learning_rate": 0.00018107397758027432, + "loss": 2.936, + "step": 38550 + }, + { + "epoch": 1.89, + "grad_norm": 0.6550706028938293, + "learning_rate": 0.00018105984296810668, + "loss": 3.0692, + "step": 38551 + }, + { + "epoch": 1.89, + "grad_norm": 0.6626957654953003, + "learning_rate": 0.00018104570866920466, + "loss": 3.0563, + "step": 38552 + }, + { + "epoch": 1.89, + "grad_norm": 0.6499350666999817, + "learning_rate": 0.00018103157468360526, + "loss": 3.0258, + "step": 38553 + }, + { + "epoch": 1.89, + "grad_norm": 0.6834374666213989, + "learning_rate": 0.00018101744101134577, + "loss": 2.8288, + "step": 38554 + }, + { + "epoch": 1.89, + "grad_norm": 0.5994073152542114, + "learning_rate": 0.00018100330765246356, + "loss": 3.1779, + "step": 38555 + }, + { + "epoch": 1.89, + "grad_norm": 0.675519585609436, + "learning_rate": 0.0001809891746069957, + "loss": 2.8748, + "step": 38556 + }, + { + "epoch": 1.89, + "grad_norm": 0.624893069267273, + "learning_rate": 0.00018097504187497944, + "loss": 3.2098, + "step": 38557 + }, + { + "epoch": 1.89, + "grad_norm": 0.637237548828125, + "learning_rate": 0.0001809609094564519, + "loss": 2.9335, + "step": 38558 + }, + { + "epoch": 1.89, + "grad_norm": 0.6514628529548645, + "learning_rate": 0.00018094677735145057, + "loss": 3.2457, + "step": 38559 + }, + { + "epoch": 1.89, + "grad_norm": 0.6226351857185364, + "learning_rate": 0.0001809326455600125, + "loss": 2.851, + "step": 38560 + }, + { + "epoch": 1.89, + "grad_norm": 0.6673992276191711, + "learning_rate": 0.0001809185140821748, + "loss": 3.0208, + "step": 38561 + }, + { + "epoch": 1.89, + "grad_norm": 0.6154362559318542, + "learning_rate": 0.00018090438291797496, + "loss": 2.9796, + "step": 38562 + }, + { + "epoch": 1.89, + "grad_norm": 0.6860130429267883, + "learning_rate": 0.00018089025206744999, + "loss": 2.9236, + "step": 38563 + }, + { + "epoch": 1.89, + "grad_norm": 0.7032354474067688, + "learning_rate": 0.00018087612153063723, + "loss": 2.9412, + "step": 38564 + }, + { + "epoch": 1.89, + "grad_norm": 0.6549293994903564, + "learning_rate": 0.00018086199130757375, + "loss": 3.201, + "step": 38565 + }, + { + "epoch": 1.89, + "grad_norm": 0.6441744565963745, + "learning_rate": 0.00018084786139829686, + "loss": 2.9833, + "step": 38566 + }, + { + "epoch": 1.89, + "grad_norm": 0.6373230218887329, + "learning_rate": 0.00018083373180284386, + "loss": 2.9114, + "step": 38567 + }, + { + "epoch": 1.89, + "grad_norm": 0.7155922651290894, + "learning_rate": 0.00018081960252125178, + "loss": 2.8217, + "step": 38568 + }, + { + "epoch": 1.89, + "grad_norm": 0.6718721985816956, + "learning_rate": 0.00018080547355355802, + "loss": 2.8134, + "step": 38569 + }, + { + "epoch": 1.89, + "grad_norm": 0.7212534546852112, + "learning_rate": 0.00018079134489979958, + "loss": 2.8709, + "step": 38570 + }, + { + "epoch": 1.89, + "grad_norm": 0.6554144024848938, + "learning_rate": 0.0001807772165600139, + "loss": 2.8055, + "step": 38571 + }, + { + "epoch": 1.89, + "grad_norm": 0.6695471405982971, + "learning_rate": 0.0001807630885342381, + "loss": 2.8279, + "step": 38572 + }, + { + "epoch": 1.89, + "grad_norm": 0.696575403213501, + "learning_rate": 0.00018074896082250928, + "loss": 3.1577, + "step": 38573 + }, + { + "epoch": 1.89, + "grad_norm": 0.6690512895584106, + "learning_rate": 0.00018073483342486481, + "loss": 3.0392, + "step": 38574 + }, + { + "epoch": 1.89, + "grad_norm": 0.6713618040084839, + "learning_rate": 0.00018072070634134185, + "loss": 3.108, + "step": 38575 + }, + { + "epoch": 1.89, + "grad_norm": 0.6604920625686646, + "learning_rate": 0.00018070657957197753, + "loss": 2.8892, + "step": 38576 + }, + { + "epoch": 1.89, + "grad_norm": 0.6354912519454956, + "learning_rate": 0.0001806924531168092, + "loss": 3.2366, + "step": 38577 + }, + { + "epoch": 1.89, + "grad_norm": 0.6330511569976807, + "learning_rate": 0.00018067832697587393, + "loss": 2.9807, + "step": 38578 + }, + { + "epoch": 1.89, + "grad_norm": 0.6298176646232605, + "learning_rate": 0.00018066420114920906, + "loss": 3.1927, + "step": 38579 + }, + { + "epoch": 1.89, + "grad_norm": 0.6028285622596741, + "learning_rate": 0.0001806500756368516, + "loss": 3.075, + "step": 38580 + }, + { + "epoch": 1.89, + "grad_norm": 0.6283663511276245, + "learning_rate": 0.00018063595043883894, + "loss": 2.7543, + "step": 38581 + }, + { + "epoch": 1.89, + "grad_norm": 0.6311411261558533, + "learning_rate": 0.00018062182555520824, + "loss": 3.0354, + "step": 38582 + }, + { + "epoch": 1.89, + "grad_norm": 0.708346426486969, + "learning_rate": 0.00018060770098599657, + "loss": 3.1341, + "step": 38583 + }, + { + "epoch": 1.89, + "grad_norm": 0.6508063673973083, + "learning_rate": 0.0001805935767312414, + "loss": 2.8863, + "step": 38584 + }, + { + "epoch": 1.89, + "grad_norm": 0.6476802825927734, + "learning_rate": 0.00018057945279097964, + "loss": 2.9496, + "step": 38585 + }, + { + "epoch": 1.89, + "grad_norm": 0.6408866047859192, + "learning_rate": 0.0001805653291652486, + "loss": 3.0421, + "step": 38586 + }, + { + "epoch": 1.89, + "grad_norm": 0.6279966831207275, + "learning_rate": 0.0001805512058540856, + "loss": 3.1745, + "step": 38587 + }, + { + "epoch": 1.89, + "grad_norm": 0.6622072458267212, + "learning_rate": 0.00018053708285752763, + "loss": 2.9178, + "step": 38588 + }, + { + "epoch": 1.89, + "grad_norm": 0.6587921977043152, + "learning_rate": 0.00018052296017561213, + "loss": 3.1572, + "step": 38589 + }, + { + "epoch": 1.89, + "grad_norm": 0.6754596829414368, + "learning_rate": 0.00018050883780837607, + "loss": 2.918, + "step": 38590 + }, + { + "epoch": 1.89, + "grad_norm": 0.6614739298820496, + "learning_rate": 0.0001804947157558567, + "loss": 2.8633, + "step": 38591 + }, + { + "epoch": 1.89, + "grad_norm": 0.6809832453727722, + "learning_rate": 0.00018048059401809134, + "loss": 2.962, + "step": 38592 + }, + { + "epoch": 1.89, + "grad_norm": 0.6957449316978455, + "learning_rate": 0.00018046647259511704, + "loss": 2.9203, + "step": 38593 + }, + { + "epoch": 1.89, + "grad_norm": 0.6387645602226257, + "learning_rate": 0.0001804523514869711, + "loss": 3.0615, + "step": 38594 + }, + { + "epoch": 1.89, + "grad_norm": 0.6960155963897705, + "learning_rate": 0.00018043823069369057, + "loss": 2.8501, + "step": 38595 + }, + { + "epoch": 1.89, + "grad_norm": 0.6919668316841125, + "learning_rate": 0.00018042411021531273, + "loss": 3.0659, + "step": 38596 + }, + { + "epoch": 1.89, + "grad_norm": 0.6586015820503235, + "learning_rate": 0.0001804099900518749, + "loss": 3.0833, + "step": 38597 + }, + { + "epoch": 1.89, + "grad_norm": 0.6569781303405762, + "learning_rate": 0.000180395870203414, + "loss": 3.1045, + "step": 38598 + }, + { + "epoch": 1.89, + "grad_norm": 0.6461341977119446, + "learning_rate": 0.00018038175066996745, + "loss": 3.1601, + "step": 38599 + }, + { + "epoch": 1.89, + "grad_norm": 0.6257455945014954, + "learning_rate": 0.0001803676314515723, + "loss": 2.9019, + "step": 38600 + }, + { + "epoch": 1.89, + "grad_norm": 0.6397156119346619, + "learning_rate": 0.00018035351254826575, + "loss": 2.9765, + "step": 38601 + }, + { + "epoch": 1.89, + "grad_norm": 0.6896104216575623, + "learning_rate": 0.00018033939396008513, + "loss": 3.2906, + "step": 38602 + }, + { + "epoch": 1.89, + "grad_norm": 0.6564487814903259, + "learning_rate": 0.00018032527568706745, + "loss": 2.951, + "step": 38603 + }, + { + "epoch": 1.89, + "grad_norm": 0.6752212047576904, + "learning_rate": 0.00018031115772925005, + "loss": 2.8648, + "step": 38604 + }, + { + "epoch": 1.89, + "grad_norm": 0.6492764353752136, + "learning_rate": 0.00018029704008666988, + "loss": 3.0707, + "step": 38605 + }, + { + "epoch": 1.89, + "grad_norm": 0.668976366519928, + "learning_rate": 0.00018028292275936431, + "loss": 3.0693, + "step": 38606 + }, + { + "epoch": 1.89, + "grad_norm": 0.6379303932189941, + "learning_rate": 0.00018026880574737055, + "loss": 3.0406, + "step": 38607 + }, + { + "epoch": 1.89, + "grad_norm": 0.8051410913467407, + "learning_rate": 0.00018025468905072563, + "loss": 3.0841, + "step": 38608 + }, + { + "epoch": 1.89, + "grad_norm": 0.6822032928466797, + "learning_rate": 0.0001802405726694669, + "loss": 2.7943, + "step": 38609 + }, + { + "epoch": 1.89, + "grad_norm": 0.6568723917007446, + "learning_rate": 0.00018022645660363133, + "loss": 2.8804, + "step": 38610 + }, + { + "epoch": 1.89, + "grad_norm": 0.6965839266777039, + "learning_rate": 0.00018021234085325628, + "loss": 3.0882, + "step": 38611 + }, + { + "epoch": 1.89, + "grad_norm": 0.6454933881759644, + "learning_rate": 0.00018019822541837896, + "loss": 3.1348, + "step": 38612 + }, + { + "epoch": 1.89, + "grad_norm": 0.6876438856124878, + "learning_rate": 0.00018018411029903632, + "loss": 2.9058, + "step": 38613 + }, + { + "epoch": 1.89, + "grad_norm": 0.6394537687301636, + "learning_rate": 0.0001801699954952658, + "loss": 2.9671, + "step": 38614 + }, + { + "epoch": 1.89, + "grad_norm": 0.6840302348136902, + "learning_rate": 0.00018015588100710439, + "loss": 2.8984, + "step": 38615 + }, + { + "epoch": 1.89, + "grad_norm": 0.6187319159507751, + "learning_rate": 0.00018014176683458927, + "loss": 3.1231, + "step": 38616 + }, + { + "epoch": 1.89, + "grad_norm": 0.605553150177002, + "learning_rate": 0.00018012765297775778, + "loss": 2.8378, + "step": 38617 + }, + { + "epoch": 1.89, + "grad_norm": 0.6925824880599976, + "learning_rate": 0.0001801135394366469, + "loss": 2.8799, + "step": 38618 + }, + { + "epoch": 1.89, + "grad_norm": 0.651581883430481, + "learning_rate": 0.00018009942621129398, + "loss": 3.0663, + "step": 38619 + }, + { + "epoch": 1.89, + "grad_norm": 0.6642777323722839, + "learning_rate": 0.00018008531330173598, + "loss": 2.7444, + "step": 38620 + }, + { + "epoch": 1.89, + "grad_norm": 0.6268136501312256, + "learning_rate": 0.00018007120070801023, + "loss": 3.073, + "step": 38621 + }, + { + "epoch": 1.89, + "grad_norm": 0.6606512069702148, + "learning_rate": 0.00018005708843015392, + "loss": 3.14, + "step": 38622 + }, + { + "epoch": 1.89, + "grad_norm": 0.7048143744468689, + "learning_rate": 0.000180042976468204, + "loss": 2.6478, + "step": 38623 + }, + { + "epoch": 1.89, + "grad_norm": 0.6552999019622803, + "learning_rate": 0.00018002886482219798, + "loss": 2.8626, + "step": 38624 + }, + { + "epoch": 1.89, + "grad_norm": 0.6531506180763245, + "learning_rate": 0.00018001475349217275, + "loss": 2.9233, + "step": 38625 + }, + { + "epoch": 1.89, + "grad_norm": 0.682345986366272, + "learning_rate": 0.0001800006424781655, + "loss": 2.9008, + "step": 38626 + }, + { + "epoch": 1.89, + "grad_norm": 0.6473051905632019, + "learning_rate": 0.0001799865317802136, + "loss": 3.1767, + "step": 38627 + }, + { + "epoch": 1.89, + "grad_norm": 0.6639640927314758, + "learning_rate": 0.0001799724213983539, + "loss": 2.9966, + "step": 38628 + }, + { + "epoch": 1.89, + "grad_norm": 0.6246669888496399, + "learning_rate": 0.00017995831133262402, + "loss": 3.0385, + "step": 38629 + }, + { + "epoch": 1.89, + "grad_norm": 0.6045641899108887, + "learning_rate": 0.0001799442015830606, + "loss": 3.017, + "step": 38630 + }, + { + "epoch": 1.89, + "grad_norm": 0.7342897057533264, + "learning_rate": 0.00017993009214970106, + "loss": 3.1561, + "step": 38631 + }, + { + "epoch": 1.89, + "grad_norm": 0.7383832335472107, + "learning_rate": 0.00017991598303258266, + "loss": 2.8414, + "step": 38632 + }, + { + "epoch": 1.89, + "grad_norm": 0.7174133062362671, + "learning_rate": 0.00017990187423174242, + "loss": 2.9566, + "step": 38633 + }, + { + "epoch": 1.89, + "grad_norm": 0.6822835206985474, + "learning_rate": 0.00017988776574721758, + "loss": 3.0601, + "step": 38634 + }, + { + "epoch": 1.89, + "grad_norm": 0.6779569387435913, + "learning_rate": 0.0001798736575790451, + "loss": 2.7663, + "step": 38635 + }, + { + "epoch": 1.89, + "grad_norm": 0.6900989413261414, + "learning_rate": 0.00017985954972726237, + "loss": 2.9155, + "step": 38636 + }, + { + "epoch": 1.89, + "grad_norm": 0.6946017146110535, + "learning_rate": 0.00017984544219190653, + "loss": 3.2055, + "step": 38637 + }, + { + "epoch": 1.89, + "grad_norm": 0.6267918944358826, + "learning_rate": 0.00017983133497301453, + "loss": 2.9432, + "step": 38638 + }, + { + "epoch": 1.89, + "grad_norm": 0.7059924006462097, + "learning_rate": 0.0001798172280706238, + "loss": 3.08, + "step": 38639 + }, + { + "epoch": 1.89, + "grad_norm": 0.6676067113876343, + "learning_rate": 0.0001798031214847713, + "loss": 2.9552, + "step": 38640 + }, + { + "epoch": 1.89, + "grad_norm": 0.6539041996002197, + "learning_rate": 0.00017978901521549428, + "loss": 3.0205, + "step": 38641 + }, + { + "epoch": 1.89, + "grad_norm": 0.6838967204093933, + "learning_rate": 0.0001797749092628298, + "loss": 2.9309, + "step": 38642 + }, + { + "epoch": 1.89, + "grad_norm": 0.6737627983093262, + "learning_rate": 0.0001797608036268151, + "loss": 3.1568, + "step": 38643 + }, + { + "epoch": 1.89, + "grad_norm": 0.6425014734268188, + "learning_rate": 0.00017974669830748735, + "loss": 3.0889, + "step": 38644 + }, + { + "epoch": 1.89, + "grad_norm": 0.6968850493431091, + "learning_rate": 0.00017973259330488354, + "loss": 2.8293, + "step": 38645 + }, + { + "epoch": 1.89, + "grad_norm": 0.6469689011573792, + "learning_rate": 0.00017971848861904105, + "loss": 2.9373, + "step": 38646 + }, + { + "epoch": 1.89, + "grad_norm": 0.6483293175697327, + "learning_rate": 0.00017970438424999688, + "loss": 3.098, + "step": 38647 + }, + { + "epoch": 1.89, + "grad_norm": 0.6595166325569153, + "learning_rate": 0.0001796902801977881, + "loss": 3.055, + "step": 38648 + }, + { + "epoch": 1.89, + "grad_norm": 0.6388629078865051, + "learning_rate": 0.00017967617646245213, + "loss": 2.9905, + "step": 38649 + }, + { + "epoch": 1.89, + "grad_norm": 0.6459506750106812, + "learning_rate": 0.00017966207304402583, + "loss": 2.9324, + "step": 38650 + }, + { + "epoch": 1.89, + "grad_norm": 0.7083884477615356, + "learning_rate": 0.00017964796994254656, + "loss": 2.7932, + "step": 38651 + }, + { + "epoch": 1.89, + "grad_norm": 0.5959693193435669, + "learning_rate": 0.00017963386715805137, + "loss": 2.8373, + "step": 38652 + }, + { + "epoch": 1.89, + "grad_norm": 0.6729698181152344, + "learning_rate": 0.0001796197646905773, + "loss": 3.1081, + "step": 38653 + }, + { + "epoch": 1.89, + "grad_norm": 0.6342985033988953, + "learning_rate": 0.00017960566254016176, + "loss": 3.0374, + "step": 38654 + }, + { + "epoch": 1.89, + "grad_norm": 0.7330729365348816, + "learning_rate": 0.00017959156070684163, + "loss": 3.2231, + "step": 38655 + }, + { + "epoch": 1.89, + "grad_norm": 0.6425516605377197, + "learning_rate": 0.00017957745919065427, + "loss": 2.9848, + "step": 38656 + }, + { + "epoch": 1.89, + "grad_norm": 0.6559476256370544, + "learning_rate": 0.00017956335799163654, + "loss": 3.0129, + "step": 38657 + }, + { + "epoch": 1.89, + "grad_norm": 0.7110540866851807, + "learning_rate": 0.00017954925710982582, + "loss": 3.008, + "step": 38658 + }, + { + "epoch": 1.89, + "grad_norm": 0.6435421109199524, + "learning_rate": 0.00017953515654525924, + "loss": 2.9261, + "step": 38659 + }, + { + "epoch": 1.89, + "grad_norm": 0.6702364683151245, + "learning_rate": 0.00017952105629797375, + "loss": 2.9763, + "step": 38660 + }, + { + "epoch": 1.89, + "grad_norm": 0.645945131778717, + "learning_rate": 0.00017950695636800672, + "loss": 3.0355, + "step": 38661 + }, + { + "epoch": 1.89, + "grad_norm": 0.6295376420021057, + "learning_rate": 0.00017949285675539515, + "loss": 3.0032, + "step": 38662 + }, + { + "epoch": 1.89, + "grad_norm": 0.640484631061554, + "learning_rate": 0.0001794787574601761, + "loss": 3.0606, + "step": 38663 + }, + { + "epoch": 1.89, + "grad_norm": 0.6546970009803772, + "learning_rate": 0.00017946465848238697, + "loss": 2.9448, + "step": 38664 + }, + { + "epoch": 1.89, + "grad_norm": 0.6797558665275574, + "learning_rate": 0.00017945055982206465, + "loss": 2.8821, + "step": 38665 + }, + { + "epoch": 1.89, + "grad_norm": 0.6290567517280579, + "learning_rate": 0.00017943646147924639, + "loss": 2.9998, + "step": 38666 + }, + { + "epoch": 1.89, + "grad_norm": 0.6903430819511414, + "learning_rate": 0.0001794223634539692, + "loss": 3.0059, + "step": 38667 + }, + { + "epoch": 1.9, + "grad_norm": 0.7165091037750244, + "learning_rate": 0.00017940826574627038, + "loss": 2.9054, + "step": 38668 + }, + { + "epoch": 1.9, + "grad_norm": 0.6272432208061218, + "learning_rate": 0.00017939416835618702, + "loss": 3.0671, + "step": 38669 + }, + { + "epoch": 1.9, + "grad_norm": 0.6418277621269226, + "learning_rate": 0.00017938007128375609, + "loss": 2.8885, + "step": 38670 + }, + { + "epoch": 1.9, + "grad_norm": 0.671662449836731, + "learning_rate": 0.00017936597452901493, + "loss": 2.9865, + "step": 38671 + }, + { + "epoch": 1.9, + "grad_norm": 0.6581867337226868, + "learning_rate": 0.00017935187809200046, + "loss": 2.9492, + "step": 38672 + }, + { + "epoch": 1.9, + "grad_norm": 0.6353850364685059, + "learning_rate": 0.00017933778197275002, + "loss": 2.9451, + "step": 38673 + }, + { + "epoch": 1.9, + "grad_norm": 0.6606138348579407, + "learning_rate": 0.0001793236861713007, + "loss": 2.9165, + "step": 38674 + }, + { + "epoch": 1.9, + "grad_norm": 0.7428664565086365, + "learning_rate": 0.00017930959068768944, + "loss": 2.9639, + "step": 38675 + }, + { + "epoch": 1.9, + "grad_norm": 0.6777299046516418, + "learning_rate": 0.0001792954955219536, + "loss": 3.3009, + "step": 38676 + }, + { + "epoch": 1.9, + "grad_norm": 0.639334499835968, + "learning_rate": 0.0001792814006741301, + "loss": 2.9518, + "step": 38677 + }, + { + "epoch": 1.9, + "grad_norm": 0.6822625994682312, + "learning_rate": 0.00017926730614425615, + "loss": 3.0377, + "step": 38678 + }, + { + "epoch": 1.9, + "grad_norm": 0.6702042818069458, + "learning_rate": 0.00017925321193236897, + "loss": 2.9242, + "step": 38679 + }, + { + "epoch": 1.9, + "grad_norm": 0.6273990273475647, + "learning_rate": 0.0001792391180385056, + "loss": 2.8685, + "step": 38680 + }, + { + "epoch": 1.9, + "grad_norm": 0.6678882837295532, + "learning_rate": 0.00017922502446270314, + "loss": 2.9388, + "step": 38681 + }, + { + "epoch": 1.9, + "grad_norm": 0.6841011047363281, + "learning_rate": 0.0001792109312049986, + "loss": 3.025, + "step": 38682 + }, + { + "epoch": 1.9, + "grad_norm": 0.6396910548210144, + "learning_rate": 0.0001791968382654293, + "loss": 3.0088, + "step": 38683 + }, + { + "epoch": 1.9, + "grad_norm": 0.63197261095047, + "learning_rate": 0.00017918274564403236, + "loss": 3.0014, + "step": 38684 + }, + { + "epoch": 1.9, + "grad_norm": 0.6431912779808044, + "learning_rate": 0.00017916865334084464, + "loss": 3.0681, + "step": 38685 + }, + { + "epoch": 1.9, + "grad_norm": 0.6578310132026672, + "learning_rate": 0.00017915456135590363, + "loss": 2.9865, + "step": 38686 + }, + { + "epoch": 1.9, + "grad_norm": 0.6344066262245178, + "learning_rate": 0.00017914046968924612, + "loss": 2.8682, + "step": 38687 + }, + { + "epoch": 1.9, + "grad_norm": 0.6525806784629822, + "learning_rate": 0.00017912637834090932, + "loss": 2.86, + "step": 38688 + }, + { + "epoch": 1.9, + "grad_norm": 0.6194393038749695, + "learning_rate": 0.00017911228731093052, + "loss": 2.8638, + "step": 38689 + }, + { + "epoch": 1.9, + "grad_norm": 0.6648958325386047, + "learning_rate": 0.0001790981965993465, + "loss": 2.9185, + "step": 38690 + }, + { + "epoch": 1.9, + "grad_norm": 0.6630010008811951, + "learning_rate": 0.0001790841062061948, + "loss": 2.8877, + "step": 38691 + }, + { + "epoch": 1.9, + "grad_norm": 0.643262505531311, + "learning_rate": 0.00017907001613151212, + "loss": 2.8769, + "step": 38692 + }, + { + "epoch": 1.9, + "grad_norm": 0.6801116466522217, + "learning_rate": 0.00017905592637533572, + "loss": 2.9138, + "step": 38693 + }, + { + "epoch": 1.9, + "grad_norm": 0.6177447438240051, + "learning_rate": 0.00017904183693770285, + "loss": 3.2762, + "step": 38694 + }, + { + "epoch": 1.9, + "grad_norm": 0.6487329602241516, + "learning_rate": 0.00017902774781865043, + "loss": 2.8668, + "step": 38695 + }, + { + "epoch": 1.9, + "grad_norm": 0.6444410085678101, + "learning_rate": 0.00017901365901821573, + "loss": 3.0458, + "step": 38696 + }, + { + "epoch": 1.9, + "grad_norm": 0.6466326713562012, + "learning_rate": 0.00017899957053643567, + "loss": 3.0129, + "step": 38697 + }, + { + "epoch": 1.9, + "grad_norm": 0.6659789085388184, + "learning_rate": 0.00017898548237334748, + "loss": 3.0764, + "step": 38698 + }, + { + "epoch": 1.9, + "grad_norm": 0.6836056709289551, + "learning_rate": 0.0001789713945289883, + "loss": 2.9231, + "step": 38699 + }, + { + "epoch": 1.9, + "grad_norm": 0.6317378282546997, + "learning_rate": 0.00017895730700339508, + "loss": 2.9773, + "step": 38700 + }, + { + "epoch": 1.9, + "grad_norm": 0.7079015374183655, + "learning_rate": 0.0001789432197966051, + "loss": 3.0086, + "step": 38701 + }, + { + "epoch": 1.9, + "grad_norm": 0.6322833895683289, + "learning_rate": 0.00017892913290865533, + "loss": 2.9109, + "step": 38702 + }, + { + "epoch": 1.9, + "grad_norm": 0.620879590511322, + "learning_rate": 0.00017891504633958286, + "loss": 3.1562, + "step": 38703 + }, + { + "epoch": 1.9, + "grad_norm": 0.6418386697769165, + "learning_rate": 0.00017890096008942502, + "loss": 3.0976, + "step": 38704 + }, + { + "epoch": 1.9, + "grad_norm": 0.6648834943771362, + "learning_rate": 0.00017888687415821864, + "loss": 3.162, + "step": 38705 + }, + { + "epoch": 1.9, + "grad_norm": 0.6503156423568726, + "learning_rate": 0.000178872788546001, + "loss": 2.9803, + "step": 38706 + }, + { + "epoch": 1.9, + "grad_norm": 0.6655746698379517, + "learning_rate": 0.000178858703252809, + "loss": 2.8608, + "step": 38707 + }, + { + "epoch": 1.9, + "grad_norm": 0.7488763928413391, + "learning_rate": 0.0001788446182786799, + "loss": 3.0323, + "step": 38708 + }, + { + "epoch": 1.9, + "grad_norm": 0.6233779788017273, + "learning_rate": 0.00017883053362365088, + "loss": 3.0267, + "step": 38709 + }, + { + "epoch": 1.9, + "grad_norm": 0.6482000946998596, + "learning_rate": 0.0001788164492877587, + "loss": 2.9888, + "step": 38710 + }, + { + "epoch": 1.9, + "grad_norm": 0.6482628583908081, + "learning_rate": 0.00017880236527104086, + "loss": 2.9677, + "step": 38711 + }, + { + "epoch": 1.9, + "grad_norm": 0.6418460011482239, + "learning_rate": 0.00017878828157353417, + "loss": 2.7318, + "step": 38712 + }, + { + "epoch": 1.9, + "grad_norm": 0.6458458304405212, + "learning_rate": 0.00017877419819527582, + "loss": 3.0026, + "step": 38713 + }, + { + "epoch": 1.9, + "grad_norm": 0.6432719826698303, + "learning_rate": 0.00017876011513630297, + "loss": 3.0038, + "step": 38714 + }, + { + "epoch": 1.9, + "grad_norm": 0.6705681085586548, + "learning_rate": 0.00017874603239665255, + "loss": 2.8765, + "step": 38715 + }, + { + "epoch": 1.9, + "grad_norm": 0.6631069779396057, + "learning_rate": 0.00017873194997636182, + "loss": 3.0594, + "step": 38716 + }, + { + "epoch": 1.9, + "grad_norm": 0.6457616090774536, + "learning_rate": 0.00017871786787546776, + "loss": 2.9703, + "step": 38717 + }, + { + "epoch": 1.9, + "grad_norm": 0.6328929662704468, + "learning_rate": 0.0001787037860940076, + "loss": 2.7767, + "step": 38718 + }, + { + "epoch": 1.9, + "grad_norm": 0.6492588520050049, + "learning_rate": 0.00017868970463201815, + "loss": 3.0519, + "step": 38719 + }, + { + "epoch": 1.9, + "grad_norm": 0.6573117971420288, + "learning_rate": 0.0001786756234895367, + "loss": 3.0632, + "step": 38720 + }, + { + "epoch": 1.9, + "grad_norm": 0.6802584528923035, + "learning_rate": 0.00017866154266660045, + "loss": 3.218, + "step": 38721 + }, + { + "epoch": 1.9, + "grad_norm": 0.6282143592834473, + "learning_rate": 0.0001786474621632462, + "loss": 2.9557, + "step": 38722 + }, + { + "epoch": 1.9, + "grad_norm": 0.6655977964401245, + "learning_rate": 0.00017863338197951126, + "loss": 2.8931, + "step": 38723 + }, + { + "epoch": 1.9, + "grad_norm": 0.6891822218894958, + "learning_rate": 0.00017861930211543264, + "loss": 2.9934, + "step": 38724 + }, + { + "epoch": 1.9, + "grad_norm": 0.6471545100212097, + "learning_rate": 0.0001786052225710473, + "loss": 3.1537, + "step": 38725 + }, + { + "epoch": 1.9, + "grad_norm": 0.6515899896621704, + "learning_rate": 0.0001785911433463926, + "loss": 3.0171, + "step": 38726 + }, + { + "epoch": 1.9, + "grad_norm": 0.6359424591064453, + "learning_rate": 0.00017857706444150535, + "loss": 3.0864, + "step": 38727 + }, + { + "epoch": 1.9, + "grad_norm": 0.6311108469963074, + "learning_rate": 0.00017856298585642284, + "loss": 3.1186, + "step": 38728 + }, + { + "epoch": 1.9, + "grad_norm": 0.6485701203346252, + "learning_rate": 0.00017854890759118192, + "loss": 3.1168, + "step": 38729 + }, + { + "epoch": 1.9, + "grad_norm": 0.6353026628494263, + "learning_rate": 0.00017853482964581992, + "loss": 3.0755, + "step": 38730 + }, + { + "epoch": 1.9, + "grad_norm": 0.6661299467086792, + "learning_rate": 0.00017852075202037378, + "loss": 2.908, + "step": 38731 + }, + { + "epoch": 1.9, + "grad_norm": 0.6596245765686035, + "learning_rate": 0.00017850667471488053, + "loss": 2.9477, + "step": 38732 + }, + { + "epoch": 1.9, + "grad_norm": 0.67014080286026, + "learning_rate": 0.00017849259772937743, + "loss": 3.2171, + "step": 38733 + }, + { + "epoch": 1.9, + "grad_norm": 0.6512858867645264, + "learning_rate": 0.00017847852106390132, + "loss": 2.8545, + "step": 38734 + }, + { + "epoch": 1.9, + "grad_norm": 0.6994531154632568, + "learning_rate": 0.00017846444471848944, + "loss": 2.9238, + "step": 38735 + }, + { + "epoch": 1.9, + "grad_norm": 0.6024457216262817, + "learning_rate": 0.0001784503686931789, + "loss": 2.9689, + "step": 38736 + }, + { + "epoch": 1.9, + "grad_norm": 0.6673498153686523, + "learning_rate": 0.00017843629298800655, + "loss": 2.8969, + "step": 38737 + }, + { + "epoch": 1.9, + "grad_norm": 0.6520382761955261, + "learning_rate": 0.00017842221760300977, + "loss": 3.0239, + "step": 38738 + }, + { + "epoch": 1.9, + "grad_norm": 0.6899427175521851, + "learning_rate": 0.0001784081425382254, + "loss": 3.0428, + "step": 38739 + }, + { + "epoch": 1.9, + "grad_norm": 0.6339791417121887, + "learning_rate": 0.00017839406779369052, + "loss": 2.8405, + "step": 38740 + }, + { + "epoch": 1.9, + "grad_norm": 1.6099035739898682, + "learning_rate": 0.00017837999336944236, + "loss": 2.9079, + "step": 38741 + }, + { + "epoch": 1.9, + "grad_norm": 0.6363784074783325, + "learning_rate": 0.00017836591926551786, + "loss": 3.0636, + "step": 38742 + }, + { + "epoch": 1.9, + "grad_norm": 0.6817944049835205, + "learning_rate": 0.00017835184548195418, + "loss": 2.865, + "step": 38743 + }, + { + "epoch": 1.9, + "grad_norm": 0.6493276953697205, + "learning_rate": 0.0001783377720187882, + "loss": 2.8713, + "step": 38744 + }, + { + "epoch": 1.9, + "grad_norm": 0.6394343376159668, + "learning_rate": 0.00017832369887605718, + "loss": 3.0309, + "step": 38745 + }, + { + "epoch": 1.9, + "grad_norm": 0.6780338883399963, + "learning_rate": 0.00017830962605379813, + "loss": 3.0202, + "step": 38746 + }, + { + "epoch": 1.9, + "grad_norm": 0.6320281028747559, + "learning_rate": 0.00017829555355204803, + "loss": 2.8572, + "step": 38747 + }, + { + "epoch": 1.9, + "grad_norm": 0.6657953858375549, + "learning_rate": 0.00017828148137084411, + "loss": 3.0652, + "step": 38748 + }, + { + "epoch": 1.9, + "grad_norm": 0.6262355446815491, + "learning_rate": 0.00017826740951022336, + "loss": 3.1262, + "step": 38749 + }, + { + "epoch": 1.9, + "grad_norm": 0.7236253023147583, + "learning_rate": 0.0001782533379702227, + "loss": 2.9621, + "step": 38750 + }, + { + "epoch": 1.9, + "grad_norm": 0.6585520505905151, + "learning_rate": 0.00017823926675087945, + "loss": 3.0541, + "step": 38751 + }, + { + "epoch": 1.9, + "grad_norm": 0.6381440162658691, + "learning_rate": 0.0001782251958522304, + "loss": 2.9744, + "step": 38752 + }, + { + "epoch": 1.9, + "grad_norm": 0.6845026612281799, + "learning_rate": 0.00017821112527431287, + "loss": 3.0564, + "step": 38753 + }, + { + "epoch": 1.9, + "grad_norm": 0.6539150476455688, + "learning_rate": 0.0001781970550171638, + "loss": 2.9587, + "step": 38754 + }, + { + "epoch": 1.9, + "grad_norm": 0.7004772424697876, + "learning_rate": 0.00017818298508082015, + "loss": 3.0137, + "step": 38755 + }, + { + "epoch": 1.9, + "grad_norm": 0.6276587247848511, + "learning_rate": 0.00017816891546531914, + "loss": 2.9447, + "step": 38756 + }, + { + "epoch": 1.9, + "grad_norm": 0.6287436485290527, + "learning_rate": 0.00017815484617069775, + "loss": 2.9367, + "step": 38757 + }, + { + "epoch": 1.9, + "grad_norm": 0.7259926795959473, + "learning_rate": 0.00017814077719699312, + "loss": 3.1074, + "step": 38758 + }, + { + "epoch": 1.9, + "grad_norm": 0.6566614508628845, + "learning_rate": 0.00017812670854424207, + "loss": 3.2498, + "step": 38759 + }, + { + "epoch": 1.9, + "grad_norm": 0.6689494848251343, + "learning_rate": 0.0001781126402124819, + "loss": 3.0541, + "step": 38760 + }, + { + "epoch": 1.9, + "grad_norm": 0.6920581459999084, + "learning_rate": 0.00017809857220174962, + "loss": 2.9865, + "step": 38761 + }, + { + "epoch": 1.9, + "grad_norm": 0.627387285232544, + "learning_rate": 0.00017808450451208213, + "loss": 3.109, + "step": 38762 + }, + { + "epoch": 1.9, + "grad_norm": 0.6338323354721069, + "learning_rate": 0.0001780704371435167, + "loss": 2.9502, + "step": 38763 + }, + { + "epoch": 1.9, + "grad_norm": 0.6786047220230103, + "learning_rate": 0.00017805637009609025, + "loss": 3.134, + "step": 38764 + }, + { + "epoch": 1.9, + "grad_norm": 0.6783745884895325, + "learning_rate": 0.00017804230336983975, + "loss": 3.161, + "step": 38765 + }, + { + "epoch": 1.9, + "grad_norm": 0.6686196327209473, + "learning_rate": 0.0001780282369648025, + "loss": 3.1341, + "step": 38766 + }, + { + "epoch": 1.9, + "grad_norm": 0.6732924580574036, + "learning_rate": 0.00017801417088101533, + "loss": 2.8498, + "step": 38767 + }, + { + "epoch": 1.9, + "grad_norm": 0.6651960015296936, + "learning_rate": 0.00017800010511851542, + "loss": 3.1484, + "step": 38768 + }, + { + "epoch": 1.9, + "grad_norm": 0.6445776224136353, + "learning_rate": 0.00017798603967733963, + "loss": 3.0089, + "step": 38769 + }, + { + "epoch": 1.9, + "grad_norm": 0.7008956074714661, + "learning_rate": 0.0001779719745575252, + "loss": 3.0585, + "step": 38770 + }, + { + "epoch": 1.9, + "grad_norm": 0.7105851173400879, + "learning_rate": 0.00017795790975910913, + "loss": 2.9633, + "step": 38771 + }, + { + "epoch": 1.9, + "grad_norm": 0.6775414347648621, + "learning_rate": 0.00017794384528212835, + "loss": 2.9435, + "step": 38772 + }, + { + "epoch": 1.9, + "grad_norm": 0.6525750160217285, + "learning_rate": 0.00017792978112662013, + "loss": 2.8013, + "step": 38773 + }, + { + "epoch": 1.9, + "grad_norm": 0.6795325875282288, + "learning_rate": 0.0001779157172926212, + "loss": 3.1657, + "step": 38774 + }, + { + "epoch": 1.9, + "grad_norm": 0.6364821791648865, + "learning_rate": 0.00017790165378016884, + "loss": 2.9929, + "step": 38775 + }, + { + "epoch": 1.9, + "grad_norm": 0.6568708419799805, + "learning_rate": 0.00017788759058930015, + "loss": 3.088, + "step": 38776 + }, + { + "epoch": 1.9, + "grad_norm": 0.6670739650726318, + "learning_rate": 0.00017787352772005184, + "loss": 2.9832, + "step": 38777 + }, + { + "epoch": 1.9, + "grad_norm": 0.6435451507568359, + "learning_rate": 0.00017785946517246136, + "loss": 2.9281, + "step": 38778 + }, + { + "epoch": 1.9, + "grad_norm": 0.6362979412078857, + "learning_rate": 0.00017784540294656543, + "loss": 2.9789, + "step": 38779 + }, + { + "epoch": 1.9, + "grad_norm": 0.6384310722351074, + "learning_rate": 0.00017783134104240117, + "loss": 2.8507, + "step": 38780 + }, + { + "epoch": 1.9, + "grad_norm": 0.6559550166130066, + "learning_rate": 0.00017781727946000574, + "loss": 2.9265, + "step": 38781 + }, + { + "epoch": 1.9, + "grad_norm": 0.650518000125885, + "learning_rate": 0.000177803218199416, + "loss": 2.9284, + "step": 38782 + }, + { + "epoch": 1.9, + "grad_norm": 0.6557312607765198, + "learning_rate": 0.00017778915726066917, + "loss": 3.059, + "step": 38783 + }, + { + "epoch": 1.9, + "grad_norm": 0.613975465297699, + "learning_rate": 0.00017777509664380206, + "loss": 2.9277, + "step": 38784 + }, + { + "epoch": 1.9, + "grad_norm": 0.6250554323196411, + "learning_rate": 0.00017776103634885185, + "loss": 2.9664, + "step": 38785 + }, + { + "epoch": 1.9, + "grad_norm": 0.6322828531265259, + "learning_rate": 0.00017774697637585563, + "loss": 2.8287, + "step": 38786 + }, + { + "epoch": 1.9, + "grad_norm": 0.6861053109169006, + "learning_rate": 0.00017773291672485022, + "loss": 3.1791, + "step": 38787 + }, + { + "epoch": 1.9, + "grad_norm": 0.630342960357666, + "learning_rate": 0.00017771885739587288, + "loss": 2.999, + "step": 38788 + }, + { + "epoch": 1.9, + "grad_norm": 0.6999707818031311, + "learning_rate": 0.0001777047983889605, + "loss": 3.0539, + "step": 38789 + }, + { + "epoch": 1.9, + "grad_norm": 0.6790862679481506, + "learning_rate": 0.00017769073970415005, + "loss": 3.0786, + "step": 38790 + }, + { + "epoch": 1.9, + "grad_norm": 0.6557303667068481, + "learning_rate": 0.0001776766813414788, + "loss": 2.7987, + "step": 38791 + }, + { + "epoch": 1.9, + "grad_norm": 0.6694257855415344, + "learning_rate": 0.0001776626233009835, + "loss": 2.9747, + "step": 38792 + }, + { + "epoch": 1.9, + "grad_norm": 0.7797983288764954, + "learning_rate": 0.00017764856558270153, + "loss": 2.932, + "step": 38793 + }, + { + "epoch": 1.9, + "grad_norm": 0.6885984539985657, + "learning_rate": 0.00017763450818666945, + "loss": 2.8699, + "step": 38794 + }, + { + "epoch": 1.9, + "grad_norm": 0.6026552319526672, + "learning_rate": 0.00017762045111292455, + "loss": 2.9986, + "step": 38795 + }, + { + "epoch": 1.9, + "grad_norm": 0.7198888063430786, + "learning_rate": 0.00017760639436150398, + "loss": 2.9756, + "step": 38796 + }, + { + "epoch": 1.9, + "grad_norm": 0.646704375743866, + "learning_rate": 0.00017759233793244452, + "loss": 2.9899, + "step": 38797 + }, + { + "epoch": 1.9, + "grad_norm": 0.6183916926383972, + "learning_rate": 0.00017757828182578332, + "loss": 2.8782, + "step": 38798 + }, + { + "epoch": 1.9, + "grad_norm": 0.629848062992096, + "learning_rate": 0.00017756422604155728, + "loss": 2.9032, + "step": 38799 + }, + { + "epoch": 1.9, + "grad_norm": 0.6715112924575806, + "learning_rate": 0.00017755017057980362, + "loss": 2.9572, + "step": 38800 + }, + { + "epoch": 1.9, + "grad_norm": 0.6943894028663635, + "learning_rate": 0.00017753611544055916, + "loss": 3.0577, + "step": 38801 + }, + { + "epoch": 1.9, + "grad_norm": 0.6915984153747559, + "learning_rate": 0.00017752206062386096, + "loss": 2.9434, + "step": 38802 + }, + { + "epoch": 1.9, + "grad_norm": 0.6393081545829773, + "learning_rate": 0.00017750800612974626, + "loss": 2.9935, + "step": 38803 + }, + { + "epoch": 1.9, + "grad_norm": 0.6286363005638123, + "learning_rate": 0.00017749395195825173, + "loss": 2.7904, + "step": 38804 + }, + { + "epoch": 1.9, + "grad_norm": 0.6637555956840515, + "learning_rate": 0.0001774798981094147, + "loss": 2.8944, + "step": 38805 + }, + { + "epoch": 1.9, + "grad_norm": 0.6585133075714111, + "learning_rate": 0.00017746584458327189, + "loss": 2.8192, + "step": 38806 + }, + { + "epoch": 1.9, + "grad_norm": 0.6345171332359314, + "learning_rate": 0.0001774517913798605, + "loss": 3.0814, + "step": 38807 + }, + { + "epoch": 1.9, + "grad_norm": 0.6817945241928101, + "learning_rate": 0.00017743773849921758, + "loss": 3.1959, + "step": 38808 + }, + { + "epoch": 1.9, + "grad_norm": 0.673495888710022, + "learning_rate": 0.00017742368594137993, + "loss": 2.8187, + "step": 38809 + }, + { + "epoch": 1.9, + "grad_norm": 0.6089171171188354, + "learning_rate": 0.00017740963370638485, + "loss": 3.0861, + "step": 38810 + }, + { + "epoch": 1.9, + "grad_norm": 0.6373415589332581, + "learning_rate": 0.00017739558179426915, + "loss": 2.9421, + "step": 38811 + }, + { + "epoch": 1.9, + "grad_norm": 0.7025952935218811, + "learning_rate": 0.00017738153020506982, + "loss": 3.0454, + "step": 38812 + }, + { + "epoch": 1.9, + "grad_norm": 0.66726154088974, + "learning_rate": 0.0001773674789388241, + "loss": 3.1199, + "step": 38813 + }, + { + "epoch": 1.9, + "grad_norm": 0.630937397480011, + "learning_rate": 0.00017735342799556864, + "loss": 2.9137, + "step": 38814 + }, + { + "epoch": 1.9, + "grad_norm": 0.6461975574493408, + "learning_rate": 0.00017733937737534082, + "loss": 2.9248, + "step": 38815 + }, + { + "epoch": 1.9, + "grad_norm": 0.6686995029449463, + "learning_rate": 0.0001773253270781774, + "loss": 2.8179, + "step": 38816 + }, + { + "epoch": 1.9, + "grad_norm": 0.6307097673416138, + "learning_rate": 0.0001773112771041154, + "loss": 2.9402, + "step": 38817 + }, + { + "epoch": 1.9, + "grad_norm": 0.6663997769355774, + "learning_rate": 0.000177297227453192, + "loss": 2.7335, + "step": 38818 + }, + { + "epoch": 1.9, + "grad_norm": 0.660294771194458, + "learning_rate": 0.00017728317812544406, + "loss": 2.9602, + "step": 38819 + }, + { + "epoch": 1.9, + "grad_norm": 0.7105454206466675, + "learning_rate": 0.0001772691291209086, + "loss": 2.9683, + "step": 38820 + }, + { + "epoch": 1.9, + "grad_norm": 0.6138967275619507, + "learning_rate": 0.00017725508043962257, + "loss": 2.9629, + "step": 38821 + }, + { + "epoch": 1.9, + "grad_norm": 0.7000978589057922, + "learning_rate": 0.00017724103208162308, + "loss": 3.1553, + "step": 38822 + }, + { + "epoch": 1.9, + "grad_norm": 0.6332494020462036, + "learning_rate": 0.00017722698404694715, + "loss": 3.03, + "step": 38823 + }, + { + "epoch": 1.9, + "grad_norm": 0.5949512720108032, + "learning_rate": 0.00017721293633563164, + "loss": 3.0115, + "step": 38824 + }, + { + "epoch": 1.9, + "grad_norm": 0.6465287208557129, + "learning_rate": 0.0001771988889477137, + "loss": 2.9231, + "step": 38825 + }, + { + "epoch": 1.9, + "grad_norm": 0.6742639541625977, + "learning_rate": 0.00017718484188323014, + "loss": 3.1172, + "step": 38826 + }, + { + "epoch": 1.9, + "grad_norm": 0.6116287708282471, + "learning_rate": 0.00017717079514221806, + "loss": 3.1104, + "step": 38827 + }, + { + "epoch": 1.9, + "grad_norm": 0.7612192034721375, + "learning_rate": 0.0001771567487247146, + "loss": 2.9702, + "step": 38828 + }, + { + "epoch": 1.9, + "grad_norm": 0.6515193581581116, + "learning_rate": 0.0001771427026307566, + "loss": 3.0939, + "step": 38829 + }, + { + "epoch": 1.9, + "grad_norm": 0.6422175765037537, + "learning_rate": 0.00017712865686038106, + "loss": 2.9786, + "step": 38830 + }, + { + "epoch": 1.9, + "grad_norm": 0.6645070314407349, + "learning_rate": 0.0001771146114136249, + "loss": 2.7252, + "step": 38831 + }, + { + "epoch": 1.9, + "grad_norm": 0.6393051743507385, + "learning_rate": 0.00017710056629052526, + "loss": 2.7584, + "step": 38832 + }, + { + "epoch": 1.9, + "grad_norm": 0.6298916935920715, + "learning_rate": 0.00017708652149111913, + "loss": 3.3263, + "step": 38833 + }, + { + "epoch": 1.9, + "grad_norm": 0.6236022710800171, + "learning_rate": 0.00017707247701544335, + "loss": 2.9878, + "step": 38834 + }, + { + "epoch": 1.9, + "grad_norm": 0.6486828923225403, + "learning_rate": 0.0001770584328635351, + "loss": 3.0399, + "step": 38835 + }, + { + "epoch": 1.9, + "grad_norm": 0.6714288592338562, + "learning_rate": 0.00017704438903543116, + "loss": 2.9953, + "step": 38836 + }, + { + "epoch": 1.9, + "grad_norm": 0.6709398627281189, + "learning_rate": 0.0001770303455311687, + "loss": 2.9666, + "step": 38837 + }, + { + "epoch": 1.9, + "grad_norm": 0.6724570393562317, + "learning_rate": 0.0001770163023507847, + "loss": 3.0689, + "step": 38838 + }, + { + "epoch": 1.9, + "grad_norm": 0.6929112076759338, + "learning_rate": 0.00017700225949431598, + "loss": 3.0246, + "step": 38839 + }, + { + "epoch": 1.9, + "grad_norm": 0.6533306837081909, + "learning_rate": 0.00017698821696179978, + "loss": 2.8858, + "step": 38840 + }, + { + "epoch": 1.9, + "grad_norm": 0.6718860268592834, + "learning_rate": 0.00017697417475327288, + "loss": 2.9085, + "step": 38841 + }, + { + "epoch": 1.9, + "grad_norm": 0.650233805179596, + "learning_rate": 0.00017696013286877226, + "loss": 3.1286, + "step": 38842 + }, + { + "epoch": 1.9, + "grad_norm": 0.7158471941947937, + "learning_rate": 0.0001769460913083351, + "loss": 2.9035, + "step": 38843 + }, + { + "epoch": 1.9, + "grad_norm": 0.6609078049659729, + "learning_rate": 0.00017693205007199812, + "loss": 3.0562, + "step": 38844 + }, + { + "epoch": 1.9, + "grad_norm": 0.672481894493103, + "learning_rate": 0.00017691800915979855, + "loss": 2.8383, + "step": 38845 + }, + { + "epoch": 1.9, + "grad_norm": 0.6177345514297485, + "learning_rate": 0.00017690396857177317, + "loss": 2.8635, + "step": 38846 + }, + { + "epoch": 1.9, + "grad_norm": 0.6650989651679993, + "learning_rate": 0.00017688992830795905, + "loss": 3.1194, + "step": 38847 + }, + { + "epoch": 1.9, + "grad_norm": 0.6523521542549133, + "learning_rate": 0.00017687588836839323, + "loss": 2.92, + "step": 38848 + }, + { + "epoch": 1.9, + "grad_norm": 0.6387304067611694, + "learning_rate": 0.00017686184875311255, + "loss": 3.0017, + "step": 38849 + }, + { + "epoch": 1.9, + "grad_norm": 0.650626540184021, + "learning_rate": 0.00017684780946215412, + "loss": 3.224, + "step": 38850 + }, + { + "epoch": 1.9, + "grad_norm": 0.7072687745094299, + "learning_rate": 0.00017683377049555483, + "loss": 3.068, + "step": 38851 + }, + { + "epoch": 1.9, + "grad_norm": 0.6670072674751282, + "learning_rate": 0.00017681973185335165, + "loss": 2.7412, + "step": 38852 + }, + { + "epoch": 1.9, + "grad_norm": 0.6184037327766418, + "learning_rate": 0.0001768056935355817, + "loss": 3.0716, + "step": 38853 + }, + { + "epoch": 1.9, + "grad_norm": 0.6479713916778564, + "learning_rate": 0.0001767916555422817, + "loss": 3.0369, + "step": 38854 + }, + { + "epoch": 1.9, + "grad_norm": 0.6216630339622498, + "learning_rate": 0.00017677761787348892, + "loss": 3.1023, + "step": 38855 + }, + { + "epoch": 1.9, + "grad_norm": 0.667361319065094, + "learning_rate": 0.00017676358052924008, + "loss": 2.9674, + "step": 38856 + }, + { + "epoch": 1.9, + "grad_norm": 0.6166158318519592, + "learning_rate": 0.00017674954350957217, + "loss": 2.8018, + "step": 38857 + }, + { + "epoch": 1.9, + "grad_norm": 0.6484586596488953, + "learning_rate": 0.00017673550681452242, + "loss": 2.9074, + "step": 38858 + }, + { + "epoch": 1.9, + "grad_norm": 0.6432170271873474, + "learning_rate": 0.00017672147044412757, + "loss": 2.7897, + "step": 38859 + }, + { + "epoch": 1.9, + "grad_norm": 0.645042359828949, + "learning_rate": 0.00017670743439842466, + "loss": 3.1734, + "step": 38860 + }, + { + "epoch": 1.9, + "grad_norm": 0.6494126915931702, + "learning_rate": 0.00017669339867745052, + "loss": 3.0803, + "step": 38861 + }, + { + "epoch": 1.9, + "grad_norm": 0.6544557809829712, + "learning_rate": 0.00017667936328124233, + "loss": 2.9205, + "step": 38862 + }, + { + "epoch": 1.9, + "grad_norm": 0.666323184967041, + "learning_rate": 0.000176665328209837, + "loss": 2.8539, + "step": 38863 + }, + { + "epoch": 1.9, + "grad_norm": 0.6175612211227417, + "learning_rate": 0.00017665129346327134, + "loss": 3.1707, + "step": 38864 + }, + { + "epoch": 1.9, + "grad_norm": 0.7000352740287781, + "learning_rate": 0.00017663725904158253, + "loss": 3.1623, + "step": 38865 + }, + { + "epoch": 1.9, + "grad_norm": 0.7238625884056091, + "learning_rate": 0.00017662322494480743, + "loss": 2.7223, + "step": 38866 + }, + { + "epoch": 1.9, + "grad_norm": 0.6424652338027954, + "learning_rate": 0.00017660919117298292, + "loss": 2.8492, + "step": 38867 + }, + { + "epoch": 1.9, + "grad_norm": 0.6737831234931946, + "learning_rate": 0.00017659515772614618, + "loss": 3.1558, + "step": 38868 + }, + { + "epoch": 1.9, + "grad_norm": 0.6455912590026855, + "learning_rate": 0.00017658112460433397, + "loss": 2.8725, + "step": 38869 + }, + { + "epoch": 1.9, + "grad_norm": 0.6594234704971313, + "learning_rate": 0.00017656709180758343, + "loss": 3.1363, + "step": 38870 + }, + { + "epoch": 1.9, + "grad_norm": 0.6191946864128113, + "learning_rate": 0.00017655305933593124, + "loss": 2.8576, + "step": 38871 + }, + { + "epoch": 1.91, + "grad_norm": 0.6468594074249268, + "learning_rate": 0.00017653902718941464, + "loss": 3.2429, + "step": 38872 + }, + { + "epoch": 1.91, + "grad_norm": 0.653252124786377, + "learning_rate": 0.00017652499536807052, + "loss": 2.8514, + "step": 38873 + }, + { + "epoch": 1.91, + "grad_norm": 0.6690567135810852, + "learning_rate": 0.00017651096387193567, + "loss": 2.8927, + "step": 38874 + }, + { + "epoch": 1.91, + "grad_norm": 0.6499602794647217, + "learning_rate": 0.0001764969327010473, + "loss": 2.9986, + "step": 38875 + }, + { + "epoch": 1.91, + "grad_norm": 0.6645433902740479, + "learning_rate": 0.00017648290185544213, + "loss": 3.0087, + "step": 38876 + }, + { + "epoch": 1.91, + "grad_norm": 0.6643459796905518, + "learning_rate": 0.00017646887133515734, + "loss": 3.1536, + "step": 38877 + }, + { + "epoch": 1.91, + "grad_norm": 0.6513760685920715, + "learning_rate": 0.00017645484114022972, + "loss": 3.0292, + "step": 38878 + }, + { + "epoch": 1.91, + "grad_norm": 0.646967887878418, + "learning_rate": 0.00017644081127069622, + "loss": 3.1265, + "step": 38879 + }, + { + "epoch": 1.91, + "grad_norm": 0.7216665744781494, + "learning_rate": 0.00017642678172659395, + "loss": 3.0569, + "step": 38880 + }, + { + "epoch": 1.91, + "grad_norm": 0.6408677101135254, + "learning_rate": 0.00017641275250795968, + "loss": 2.9765, + "step": 38881 + }, + { + "epoch": 1.91, + "grad_norm": 0.6516188979148865, + "learning_rate": 0.00017639872361483052, + "loss": 3.033, + "step": 38882 + }, + { + "epoch": 1.91, + "grad_norm": 0.6692624688148499, + "learning_rate": 0.0001763846950472432, + "loss": 2.9756, + "step": 38883 + }, + { + "epoch": 1.91, + "grad_norm": 0.6817802786827087, + "learning_rate": 0.00017637066680523483, + "loss": 2.9432, + "step": 38884 + }, + { + "epoch": 1.91, + "grad_norm": 0.7394009828567505, + "learning_rate": 0.00017635663888884243, + "loss": 3.1279, + "step": 38885 + }, + { + "epoch": 1.91, + "grad_norm": 0.6396210789680481, + "learning_rate": 0.00017634261129810273, + "loss": 3.2632, + "step": 38886 + }, + { + "epoch": 1.91, + "grad_norm": 0.6615631580352783, + "learning_rate": 0.0001763285840330529, + "loss": 3.0701, + "step": 38887 + }, + { + "epoch": 1.91, + "grad_norm": 0.6404348611831665, + "learning_rate": 0.00017631455709372971, + "loss": 2.9918, + "step": 38888 + }, + { + "epoch": 1.91, + "grad_norm": 0.667752742767334, + "learning_rate": 0.00017630053048017016, + "loss": 3.0932, + "step": 38889 + }, + { + "epoch": 1.91, + "grad_norm": 0.6676562428474426, + "learning_rate": 0.0001762865041924113, + "loss": 2.9182, + "step": 38890 + }, + { + "epoch": 1.91, + "grad_norm": 0.6496162414550781, + "learning_rate": 0.00017627247823048987, + "loss": 3.0129, + "step": 38891 + }, + { + "epoch": 1.91, + "grad_norm": 0.650679886341095, + "learning_rate": 0.00017625845259444304, + "loss": 2.9894, + "step": 38892 + }, + { + "epoch": 1.91, + "grad_norm": 0.6663594245910645, + "learning_rate": 0.00017624442728430748, + "loss": 3.0855, + "step": 38893 + }, + { + "epoch": 1.91, + "grad_norm": 0.6415731310844421, + "learning_rate": 0.00017623040230012037, + "loss": 3.0113, + "step": 38894 + }, + { + "epoch": 1.91, + "grad_norm": 0.6369303464889526, + "learning_rate": 0.0001762163776419186, + "loss": 2.9257, + "step": 38895 + }, + { + "epoch": 1.91, + "grad_norm": 0.6577563285827637, + "learning_rate": 0.00017620235330973897, + "loss": 3.0859, + "step": 38896 + }, + { + "epoch": 1.91, + "grad_norm": 0.6328701972961426, + "learning_rate": 0.00017618832930361858, + "loss": 3.0281, + "step": 38897 + }, + { + "epoch": 1.91, + "grad_norm": 0.6667453050613403, + "learning_rate": 0.00017617430562359422, + "loss": 3.0712, + "step": 38898 + }, + { + "epoch": 1.91, + "grad_norm": 0.6224763989448547, + "learning_rate": 0.00017616028226970297, + "loss": 2.9339, + "step": 38899 + }, + { + "epoch": 1.91, + "grad_norm": 0.6647245287895203, + "learning_rate": 0.0001761462592419818, + "loss": 3.0157, + "step": 38900 + }, + { + "epoch": 1.91, + "grad_norm": 0.6523852944374084, + "learning_rate": 0.0001761322365404674, + "loss": 2.9646, + "step": 38901 + }, + { + "epoch": 1.91, + "grad_norm": 0.6578008532524109, + "learning_rate": 0.00017611821416519693, + "loss": 2.9051, + "step": 38902 + }, + { + "epoch": 1.91, + "grad_norm": 0.6416773200035095, + "learning_rate": 0.0001761041921162072, + "loss": 3.0441, + "step": 38903 + }, + { + "epoch": 1.91, + "grad_norm": 0.6548252701759338, + "learning_rate": 0.00017609017039353517, + "loss": 2.8382, + "step": 38904 + }, + { + "epoch": 1.91, + "grad_norm": 0.641315758228302, + "learning_rate": 0.00017607614899721786, + "loss": 2.9971, + "step": 38905 + }, + { + "epoch": 1.91, + "grad_norm": 0.6452323794364929, + "learning_rate": 0.00017606212792729207, + "loss": 3.0848, + "step": 38906 + }, + { + "epoch": 1.91, + "grad_norm": 0.6650744080543518, + "learning_rate": 0.00017604810718379487, + "loss": 3.0076, + "step": 38907 + }, + { + "epoch": 1.91, + "grad_norm": 0.678641140460968, + "learning_rate": 0.00017603408676676295, + "loss": 3.0404, + "step": 38908 + }, + { + "epoch": 1.91, + "grad_norm": 0.6717094779014587, + "learning_rate": 0.00017602006667623345, + "loss": 3.0307, + "step": 38909 + }, + { + "epoch": 1.91, + "grad_norm": 0.6648962497711182, + "learning_rate": 0.0001760060469122433, + "loss": 2.8707, + "step": 38910 + }, + { + "epoch": 1.91, + "grad_norm": 0.6424950361251831, + "learning_rate": 0.00017599202747482925, + "loss": 2.8513, + "step": 38911 + }, + { + "epoch": 1.91, + "grad_norm": 0.6827686429023743, + "learning_rate": 0.00017597800836402844, + "loss": 2.8907, + "step": 38912 + }, + { + "epoch": 1.91, + "grad_norm": 0.6990754008293152, + "learning_rate": 0.0001759639895798776, + "loss": 3.1935, + "step": 38913 + }, + { + "epoch": 1.91, + "grad_norm": 0.6267073154449463, + "learning_rate": 0.00017594997112241375, + "loss": 2.9517, + "step": 38914 + }, + { + "epoch": 1.91, + "grad_norm": 0.6684610843658447, + "learning_rate": 0.00017593595299167386, + "loss": 2.6714, + "step": 38915 + }, + { + "epoch": 1.91, + "grad_norm": 0.65615314245224, + "learning_rate": 0.0001759219351876947, + "loss": 3.2431, + "step": 38916 + }, + { + "epoch": 1.91, + "grad_norm": 0.6717365980148315, + "learning_rate": 0.00017590791771051343, + "loss": 3.0229, + "step": 38917 + }, + { + "epoch": 1.91, + "grad_norm": 0.6720296740531921, + "learning_rate": 0.0001758939005601667, + "loss": 2.9914, + "step": 38918 + }, + { + "epoch": 1.91, + "grad_norm": 0.6723449230194092, + "learning_rate": 0.00017587988373669156, + "loss": 3.0294, + "step": 38919 + }, + { + "epoch": 1.91, + "grad_norm": 0.6474607586860657, + "learning_rate": 0.000175865867240125, + "loss": 2.9674, + "step": 38920 + }, + { + "epoch": 1.91, + "grad_norm": 0.6441729068756104, + "learning_rate": 0.00017585185107050374, + "loss": 3.0136, + "step": 38921 + }, + { + "epoch": 1.91, + "grad_norm": 0.6469467878341675, + "learning_rate": 0.00017583783522786496, + "loss": 2.8793, + "step": 38922 + }, + { + "epoch": 1.91, + "grad_norm": 0.6387615203857422, + "learning_rate": 0.00017582381971224527, + "loss": 3.1108, + "step": 38923 + }, + { + "epoch": 1.91, + "grad_norm": 0.6527659296989441, + "learning_rate": 0.0001758098045236818, + "loss": 2.997, + "step": 38924 + }, + { + "epoch": 1.91, + "grad_norm": 0.6545872092247009, + "learning_rate": 0.00017579578966221145, + "loss": 3.3368, + "step": 38925 + }, + { + "epoch": 1.91, + "grad_norm": 0.6568055152893066, + "learning_rate": 0.00017578177512787103, + "loss": 3.0761, + "step": 38926 + }, + { + "epoch": 1.91, + "grad_norm": 0.6849158406257629, + "learning_rate": 0.00017576776092069759, + "loss": 2.8588, + "step": 38927 + }, + { + "epoch": 1.91, + "grad_norm": 0.6461156606674194, + "learning_rate": 0.00017575374704072787, + "loss": 2.8396, + "step": 38928 + }, + { + "epoch": 1.91, + "grad_norm": 0.6694230437278748, + "learning_rate": 0.00017573973348799883, + "loss": 3.0169, + "step": 38929 + }, + { + "epoch": 1.91, + "grad_norm": 0.6535854935646057, + "learning_rate": 0.00017572572026254755, + "loss": 2.9841, + "step": 38930 + }, + { + "epoch": 1.91, + "grad_norm": 0.6744319200515747, + "learning_rate": 0.00017571170736441072, + "loss": 3.0615, + "step": 38931 + }, + { + "epoch": 1.91, + "grad_norm": 0.6252499222755432, + "learning_rate": 0.00017569769479362543, + "loss": 3.0025, + "step": 38932 + }, + { + "epoch": 1.91, + "grad_norm": 0.6410253047943115, + "learning_rate": 0.00017568368255022835, + "loss": 3.1207, + "step": 38933 + }, + { + "epoch": 1.91, + "grad_norm": 0.6542544960975647, + "learning_rate": 0.00017566967063425658, + "loss": 2.8341, + "step": 38934 + }, + { + "epoch": 1.91, + "grad_norm": 0.6271516680717468, + "learning_rate": 0.00017565565904574702, + "loss": 3.0428, + "step": 38935 + }, + { + "epoch": 1.91, + "grad_norm": 0.6435152888298035, + "learning_rate": 0.00017564164778473642, + "loss": 3.0798, + "step": 38936 + }, + { + "epoch": 1.91, + "grad_norm": 0.6573494672775269, + "learning_rate": 0.00017562763685126194, + "loss": 3.2102, + "step": 38937 + }, + { + "epoch": 1.91, + "grad_norm": 0.6944133639335632, + "learning_rate": 0.00017561362624536015, + "loss": 2.9844, + "step": 38938 + }, + { + "epoch": 1.91, + "grad_norm": 0.7348315119743347, + "learning_rate": 0.0001755996159670682, + "loss": 3.2201, + "step": 38939 + }, + { + "epoch": 1.91, + "grad_norm": 0.6667076945304871, + "learning_rate": 0.00017558560601642302, + "loss": 3.1203, + "step": 38940 + }, + { + "epoch": 1.91, + "grad_norm": 0.9448343515396118, + "learning_rate": 0.00017557159639346124, + "loss": 2.9531, + "step": 38941 + }, + { + "epoch": 1.91, + "grad_norm": 0.6497033834457397, + "learning_rate": 0.00017555758709822004, + "loss": 2.9215, + "step": 38942 + }, + { + "epoch": 1.91, + "grad_norm": 1.2949033975601196, + "learning_rate": 0.00017554357813073622, + "loss": 3.0897, + "step": 38943 + }, + { + "epoch": 1.91, + "grad_norm": 0.6681045293807983, + "learning_rate": 0.00017552956949104656, + "loss": 3.1463, + "step": 38944 + }, + { + "epoch": 1.91, + "grad_norm": 0.6465238332748413, + "learning_rate": 0.0001755155611791882, + "loss": 2.9325, + "step": 38945 + }, + { + "epoch": 1.91, + "grad_norm": 0.6648427844047546, + "learning_rate": 0.00017550155319519777, + "loss": 2.8387, + "step": 38946 + }, + { + "epoch": 1.91, + "grad_norm": 0.8117325305938721, + "learning_rate": 0.00017548754553911241, + "loss": 2.6738, + "step": 38947 + }, + { + "epoch": 1.91, + "grad_norm": 0.629001796245575, + "learning_rate": 0.00017547353821096873, + "loss": 3.0694, + "step": 38948 + }, + { + "epoch": 1.91, + "grad_norm": 0.6560392379760742, + "learning_rate": 0.00017545953121080388, + "loss": 2.8574, + "step": 38949 + }, + { + "epoch": 1.91, + "grad_norm": 0.636073887348175, + "learning_rate": 0.00017544552453865474, + "loss": 3.1828, + "step": 38950 + }, + { + "epoch": 1.91, + "grad_norm": 0.6364843845367432, + "learning_rate": 0.0001754315181945579, + "loss": 2.8929, + "step": 38951 + }, + { + "epoch": 1.91, + "grad_norm": 0.7276064157485962, + "learning_rate": 0.00017541751217855066, + "loss": 2.9348, + "step": 38952 + }, + { + "epoch": 1.91, + "grad_norm": 0.67525315284729, + "learning_rate": 0.00017540350649066966, + "loss": 2.9005, + "step": 38953 + }, + { + "epoch": 1.91, + "grad_norm": 0.6328374743461609, + "learning_rate": 0.00017538950113095192, + "loss": 2.9466, + "step": 38954 + }, + { + "epoch": 1.91, + "grad_norm": 0.6840808987617493, + "learning_rate": 0.0001753754960994341, + "loss": 2.9267, + "step": 38955 + }, + { + "epoch": 1.91, + "grad_norm": 0.635502278804779, + "learning_rate": 0.00017536149139615322, + "loss": 3.1029, + "step": 38956 + }, + { + "epoch": 1.91, + "grad_norm": 0.6700057983398438, + "learning_rate": 0.00017534748702114638, + "loss": 2.954, + "step": 38957 + }, + { + "epoch": 1.91, + "grad_norm": 0.6268248558044434, + "learning_rate": 0.0001753334829744501, + "loss": 2.9197, + "step": 38958 + }, + { + "epoch": 1.91, + "grad_norm": 0.6551121473312378, + "learning_rate": 0.00017531947925610155, + "loss": 3.0124, + "step": 38959 + }, + { + "epoch": 1.91, + "grad_norm": 0.7056483030319214, + "learning_rate": 0.00017530547586613738, + "loss": 3.1388, + "step": 38960 + }, + { + "epoch": 1.91, + "grad_norm": 0.6477718949317932, + "learning_rate": 0.00017529147280459466, + "loss": 3.0884, + "step": 38961 + }, + { + "epoch": 1.91, + "grad_norm": 0.6173779964447021, + "learning_rate": 0.00017527747007151022, + "loss": 3.0833, + "step": 38962 + }, + { + "epoch": 1.91, + "grad_norm": 0.6646998524665833, + "learning_rate": 0.00017526346766692083, + "loss": 3.027, + "step": 38963 + }, + { + "epoch": 1.91, + "grad_norm": 0.7051180005073547, + "learning_rate": 0.0001752494655908636, + "loss": 2.7579, + "step": 38964 + }, + { + "epoch": 1.91, + "grad_norm": 0.6498520374298096, + "learning_rate": 0.00017523546384337515, + "loss": 2.9913, + "step": 38965 + }, + { + "epoch": 1.91, + "grad_norm": 0.6855219602584839, + "learning_rate": 0.00017522146242449245, + "loss": 2.9673, + "step": 38966 + }, + { + "epoch": 1.91, + "grad_norm": 0.6545772552490234, + "learning_rate": 0.00017520746133425252, + "loss": 3.0786, + "step": 38967 + }, + { + "epoch": 1.91, + "grad_norm": 0.7005255222320557, + "learning_rate": 0.00017519346057269207, + "loss": 2.9772, + "step": 38968 + }, + { + "epoch": 1.91, + "grad_norm": 0.652550220489502, + "learning_rate": 0.0001751794601398481, + "loss": 3.0452, + "step": 38969 + }, + { + "epoch": 1.91, + "grad_norm": 0.626100480556488, + "learning_rate": 0.00017516546003575726, + "loss": 3.0872, + "step": 38970 + }, + { + "epoch": 1.91, + "grad_norm": 0.6244769096374512, + "learning_rate": 0.00017515146026045664, + "loss": 2.9593, + "step": 38971 + }, + { + "epoch": 1.91, + "grad_norm": 0.6298187375068665, + "learning_rate": 0.0001751374608139831, + "loss": 3.0651, + "step": 38972 + }, + { + "epoch": 1.91, + "grad_norm": 0.611535906791687, + "learning_rate": 0.00017512346169637336, + "loss": 3.2176, + "step": 38973 + }, + { + "epoch": 1.91, + "grad_norm": 0.6949870586395264, + "learning_rate": 0.00017510946290766446, + "loss": 3.2544, + "step": 38974 + }, + { + "epoch": 1.91, + "grad_norm": 0.6494085192680359, + "learning_rate": 0.00017509546444789315, + "loss": 2.9234, + "step": 38975 + }, + { + "epoch": 1.91, + "grad_norm": 0.656135618686676, + "learning_rate": 0.00017508146631709643, + "loss": 3.0353, + "step": 38976 + }, + { + "epoch": 1.91, + "grad_norm": 0.6190617084503174, + "learning_rate": 0.00017506746851531107, + "loss": 2.99, + "step": 38977 + }, + { + "epoch": 1.91, + "grad_norm": 0.6905009746551514, + "learning_rate": 0.0001750534710425739, + "loss": 2.7008, + "step": 38978 + }, + { + "epoch": 1.91, + "grad_norm": 0.6204220056533813, + "learning_rate": 0.0001750394738989219, + "loss": 3.1247, + "step": 38979 + }, + { + "epoch": 1.91, + "grad_norm": 0.6508306264877319, + "learning_rate": 0.00017502547708439178, + "loss": 3.0195, + "step": 38980 + }, + { + "epoch": 1.91, + "grad_norm": 0.684338390827179, + "learning_rate": 0.00017501148059902047, + "loss": 2.9336, + "step": 38981 + }, + { + "epoch": 1.91, + "grad_norm": 0.6646551489830017, + "learning_rate": 0.000174997484442845, + "loss": 2.8238, + "step": 38982 + }, + { + "epoch": 1.91, + "grad_norm": 0.7016298770904541, + "learning_rate": 0.000174983488615902, + "loss": 3.1572, + "step": 38983 + }, + { + "epoch": 1.91, + "grad_norm": 0.7458086013793945, + "learning_rate": 0.00017496949311822854, + "loss": 2.8436, + "step": 38984 + }, + { + "epoch": 1.91, + "grad_norm": 0.6394898891448975, + "learning_rate": 0.00017495549794986138, + "loss": 2.989, + "step": 38985 + }, + { + "epoch": 1.91, + "grad_norm": 0.6666737198829651, + "learning_rate": 0.00017494150311083724, + "loss": 2.8933, + "step": 38986 + }, + { + "epoch": 1.91, + "grad_norm": 0.6851913332939148, + "learning_rate": 0.00017492750860119324, + "loss": 2.8082, + "step": 38987 + }, + { + "epoch": 1.91, + "grad_norm": 0.674731433391571, + "learning_rate": 0.00017491351442096599, + "loss": 3.1715, + "step": 38988 + }, + { + "epoch": 1.91, + "grad_norm": 0.6982241868972778, + "learning_rate": 0.0001748995205701926, + "loss": 2.9834, + "step": 38989 + }, + { + "epoch": 1.91, + "grad_norm": 0.7044297456741333, + "learning_rate": 0.00017488552704890965, + "loss": 2.8811, + "step": 38990 + }, + { + "epoch": 1.91, + "grad_norm": 0.6905117630958557, + "learning_rate": 0.00017487153385715426, + "loss": 2.897, + "step": 38991 + }, + { + "epoch": 1.91, + "grad_norm": 0.6385732889175415, + "learning_rate": 0.00017485754099496318, + "loss": 2.971, + "step": 38992 + }, + { + "epoch": 1.91, + "grad_norm": 0.6887711882591248, + "learning_rate": 0.0001748435484623732, + "loss": 2.9601, + "step": 38993 + }, + { + "epoch": 1.91, + "grad_norm": 0.6429010033607483, + "learning_rate": 0.00017482955625942125, + "loss": 3.0313, + "step": 38994 + }, + { + "epoch": 1.91, + "grad_norm": 0.6536113023757935, + "learning_rate": 0.00017481556438614411, + "loss": 2.9513, + "step": 38995 + }, + { + "epoch": 1.91, + "grad_norm": 0.6555978655815125, + "learning_rate": 0.00017480157284257864, + "loss": 2.8433, + "step": 38996 + }, + { + "epoch": 1.91, + "grad_norm": 0.6542037725448608, + "learning_rate": 0.00017478758162876184, + "loss": 2.9936, + "step": 38997 + }, + { + "epoch": 1.91, + "grad_norm": 0.6809813976287842, + "learning_rate": 0.00017477359074473034, + "loss": 2.9191, + "step": 38998 + }, + { + "epoch": 1.91, + "grad_norm": 0.6719657182693481, + "learning_rate": 0.00017475960019052122, + "loss": 2.9595, + "step": 38999 + }, + { + "epoch": 1.91, + "grad_norm": 0.6527001857757568, + "learning_rate": 0.00017474560996617126, + "loss": 3.1502, + "step": 39000 + }, + { + "epoch": 1.91, + "grad_norm": 0.6487961411476135, + "learning_rate": 0.0001747316200717171, + "loss": 2.6611, + "step": 39001 + }, + { + "epoch": 1.91, + "grad_norm": 0.689109742641449, + "learning_rate": 0.00017471763050719584, + "loss": 2.9463, + "step": 39002 + }, + { + "epoch": 1.91, + "grad_norm": 0.6380165815353394, + "learning_rate": 0.00017470364127264413, + "loss": 3.0582, + "step": 39003 + }, + { + "epoch": 1.91, + "grad_norm": 0.7141815423965454, + "learning_rate": 0.00017468965236809906, + "loss": 3.0516, + "step": 39004 + }, + { + "epoch": 1.91, + "grad_norm": 0.6605910658836365, + "learning_rate": 0.0001746756637935972, + "loss": 3.1249, + "step": 39005 + }, + { + "epoch": 1.91, + "grad_norm": 0.6353509426116943, + "learning_rate": 0.0001746616755491756, + "loss": 3.0085, + "step": 39006 + }, + { + "epoch": 1.91, + "grad_norm": 0.6613842248916626, + "learning_rate": 0.0001746476876348711, + "loss": 2.9168, + "step": 39007 + }, + { + "epoch": 1.91, + "grad_norm": 0.6803551316261292, + "learning_rate": 0.0001746337000507203, + "loss": 2.9971, + "step": 39008 + }, + { + "epoch": 1.91, + "grad_norm": 0.6482704877853394, + "learning_rate": 0.00017461971279676027, + "loss": 3.0769, + "step": 39009 + }, + { + "epoch": 1.91, + "grad_norm": 0.6631468534469604, + "learning_rate": 0.00017460572587302777, + "loss": 3.2677, + "step": 39010 + }, + { + "epoch": 1.91, + "grad_norm": 0.740536630153656, + "learning_rate": 0.0001745917392795596, + "loss": 2.9764, + "step": 39011 + }, + { + "epoch": 1.91, + "grad_norm": 0.6036262512207031, + "learning_rate": 0.00017457775301639276, + "loss": 3.0704, + "step": 39012 + }, + { + "epoch": 1.91, + "grad_norm": 0.6554529666900635, + "learning_rate": 0.00017456376708356404, + "loss": 2.9901, + "step": 39013 + }, + { + "epoch": 1.91, + "grad_norm": 0.6793158650398254, + "learning_rate": 0.00017454978148111016, + "loss": 3.0852, + "step": 39014 + }, + { + "epoch": 1.91, + "grad_norm": 0.6710419654846191, + "learning_rate": 0.00017453579620906794, + "loss": 3.0756, + "step": 39015 + }, + { + "epoch": 1.91, + "grad_norm": 0.65449458360672, + "learning_rate": 0.00017452181126747425, + "loss": 3.0163, + "step": 39016 + }, + { + "epoch": 1.91, + "grad_norm": 0.6442614793777466, + "learning_rate": 0.0001745078266563661, + "loss": 2.8513, + "step": 39017 + }, + { + "epoch": 1.91, + "grad_norm": 0.6554681658744812, + "learning_rate": 0.00017449384237578004, + "loss": 3.0563, + "step": 39018 + }, + { + "epoch": 1.91, + "grad_norm": 0.6386138796806335, + "learning_rate": 0.00017447985842575315, + "loss": 3.106, + "step": 39019 + }, + { + "epoch": 1.91, + "grad_norm": 0.6400095820426941, + "learning_rate": 0.0001744658748063221, + "loss": 3.0464, + "step": 39020 + }, + { + "epoch": 1.91, + "grad_norm": 0.6746090650558472, + "learning_rate": 0.00017445189151752387, + "loss": 3.0485, + "step": 39021 + }, + { + "epoch": 1.91, + "grad_norm": 0.6967078447341919, + "learning_rate": 0.0001744379085593952, + "loss": 2.9496, + "step": 39022 + }, + { + "epoch": 1.91, + "grad_norm": 0.6654866337776184, + "learning_rate": 0.00017442392593197274, + "loss": 3.1855, + "step": 39023 + }, + { + "epoch": 1.91, + "grad_norm": 0.6376835107803345, + "learning_rate": 0.00017440994363529365, + "loss": 2.8765, + "step": 39024 + }, + { + "epoch": 1.91, + "grad_norm": 0.6771338582038879, + "learning_rate": 0.00017439596166939452, + "loss": 2.9306, + "step": 39025 + }, + { + "epoch": 1.91, + "grad_norm": 0.6553162932395935, + "learning_rate": 0.0001743819800343122, + "loss": 2.962, + "step": 39026 + }, + { + "epoch": 1.91, + "grad_norm": 0.6516581773757935, + "learning_rate": 0.00017436799873008372, + "loss": 2.978, + "step": 39027 + }, + { + "epoch": 1.91, + "grad_norm": 0.6631935834884644, + "learning_rate": 0.00017435401775674574, + "loss": 3.1225, + "step": 39028 + }, + { + "epoch": 1.91, + "grad_norm": 0.6429182291030884, + "learning_rate": 0.00017434003711433508, + "loss": 2.9737, + "step": 39029 + }, + { + "epoch": 1.91, + "grad_norm": 0.634573757648468, + "learning_rate": 0.00017432605680288847, + "loss": 3.0406, + "step": 39030 + }, + { + "epoch": 1.91, + "grad_norm": 0.613707423210144, + "learning_rate": 0.00017431207682244295, + "loss": 3.0347, + "step": 39031 + }, + { + "epoch": 1.91, + "grad_norm": 0.681470513343811, + "learning_rate": 0.0001742980971730351, + "loss": 3.0185, + "step": 39032 + }, + { + "epoch": 1.91, + "grad_norm": 0.6601608395576477, + "learning_rate": 0.00017428411785470194, + "loss": 2.944, + "step": 39033 + }, + { + "epoch": 1.91, + "grad_norm": 0.6461837291717529, + "learning_rate": 0.00017427013886748028, + "loss": 3.1707, + "step": 39034 + }, + { + "epoch": 1.91, + "grad_norm": 0.6573340892791748, + "learning_rate": 0.0001742561602114069, + "loss": 3.0512, + "step": 39035 + }, + { + "epoch": 1.91, + "grad_norm": 0.6389563083648682, + "learning_rate": 0.00017424218188651859, + "loss": 2.9478, + "step": 39036 + }, + { + "epoch": 1.91, + "grad_norm": 0.6320285797119141, + "learning_rate": 0.00017422820389285202, + "loss": 3.2002, + "step": 39037 + }, + { + "epoch": 1.91, + "grad_norm": 0.6769739985466003, + "learning_rate": 0.0001742142262304442, + "loss": 3.0265, + "step": 39038 + }, + { + "epoch": 1.91, + "grad_norm": 0.6444475054740906, + "learning_rate": 0.000174200248899332, + "loss": 2.9721, + "step": 39039 + }, + { + "epoch": 1.91, + "grad_norm": 0.6511723399162292, + "learning_rate": 0.00017418627189955198, + "loss": 3.2104, + "step": 39040 + }, + { + "epoch": 1.91, + "grad_norm": 0.8678764700889587, + "learning_rate": 0.00017417229523114126, + "loss": 3.0062, + "step": 39041 + }, + { + "epoch": 1.91, + "grad_norm": 0.6624598503112793, + "learning_rate": 0.0001741583188941364, + "loss": 3.0018, + "step": 39042 + }, + { + "epoch": 1.91, + "grad_norm": 0.6774799823760986, + "learning_rate": 0.00017414434288857438, + "loss": 2.7333, + "step": 39043 + }, + { + "epoch": 1.91, + "grad_norm": 0.6489653587341309, + "learning_rate": 0.00017413036721449198, + "loss": 3.112, + "step": 39044 + }, + { + "epoch": 1.91, + "grad_norm": 0.6598437428474426, + "learning_rate": 0.00017411639187192582, + "loss": 2.9259, + "step": 39045 + }, + { + "epoch": 1.91, + "grad_norm": 0.6542210578918457, + "learning_rate": 0.00017410241686091298, + "loss": 2.8695, + "step": 39046 + }, + { + "epoch": 1.91, + "grad_norm": 0.6994691491127014, + "learning_rate": 0.00017408844218149008, + "loss": 2.8518, + "step": 39047 + }, + { + "epoch": 1.91, + "grad_norm": 0.6199400424957275, + "learning_rate": 0.00017407446783369393, + "loss": 2.9526, + "step": 39048 + }, + { + "epoch": 1.91, + "grad_norm": 0.6711297631263733, + "learning_rate": 0.00017406049381756152, + "loss": 2.8745, + "step": 39049 + }, + { + "epoch": 1.91, + "grad_norm": 0.6305173635482788, + "learning_rate": 0.00017404652013312954, + "loss": 3.0848, + "step": 39050 + }, + { + "epoch": 1.91, + "grad_norm": 0.6580938100814819, + "learning_rate": 0.0001740325467804348, + "loss": 3.0753, + "step": 39051 + }, + { + "epoch": 1.91, + "grad_norm": 0.6921206116676331, + "learning_rate": 0.00017401857375951395, + "loss": 3.0142, + "step": 39052 + }, + { + "epoch": 1.91, + "grad_norm": 1.0677298307418823, + "learning_rate": 0.00017400460107040394, + "loss": 3.0512, + "step": 39053 + }, + { + "epoch": 1.91, + "grad_norm": 0.6595743298530579, + "learning_rate": 0.0001739906287131417, + "loss": 3.0075, + "step": 39054 + }, + { + "epoch": 1.91, + "grad_norm": 0.6712070107460022, + "learning_rate": 0.00017397665668776374, + "loss": 3.1811, + "step": 39055 + }, + { + "epoch": 1.91, + "grad_norm": 0.6308397650718689, + "learning_rate": 0.00017396268499430718, + "loss": 3.1166, + "step": 39056 + }, + { + "epoch": 1.91, + "grad_norm": 0.6080089807510376, + "learning_rate": 0.0001739487136328086, + "loss": 3.0898, + "step": 39057 + }, + { + "epoch": 1.91, + "grad_norm": 0.6288923025131226, + "learning_rate": 0.00017393474260330477, + "loss": 3.0053, + "step": 39058 + }, + { + "epoch": 1.91, + "grad_norm": 0.6430457830429077, + "learning_rate": 0.00017392077190583267, + "loss": 3.2299, + "step": 39059 + }, + { + "epoch": 1.91, + "grad_norm": 0.6252223253250122, + "learning_rate": 0.00017390680154042888, + "loss": 3.121, + "step": 39060 + }, + { + "epoch": 1.91, + "grad_norm": 0.6593741774559021, + "learning_rate": 0.00017389283150713038, + "loss": 3.0993, + "step": 39061 + }, + { + "epoch": 1.91, + "grad_norm": 0.6887229681015015, + "learning_rate": 0.0001738788618059738, + "loss": 3.1088, + "step": 39062 + }, + { + "epoch": 1.91, + "grad_norm": 0.6626426577568054, + "learning_rate": 0.00017386489243699608, + "loss": 3.0802, + "step": 39063 + }, + { + "epoch": 1.91, + "grad_norm": 0.663589358329773, + "learning_rate": 0.00017385092340023404, + "loss": 2.8885, + "step": 39064 + }, + { + "epoch": 1.91, + "grad_norm": 0.6585755944252014, + "learning_rate": 0.00017383695469572435, + "loss": 2.9546, + "step": 39065 + }, + { + "epoch": 1.91, + "grad_norm": 0.6396164894104004, + "learning_rate": 0.00017382298632350387, + "loss": 3.1609, + "step": 39066 + }, + { + "epoch": 1.91, + "grad_norm": 0.647235095500946, + "learning_rate": 0.00017380901828360925, + "loss": 2.9741, + "step": 39067 + }, + { + "epoch": 1.91, + "grad_norm": 0.6416546702384949, + "learning_rate": 0.0001737950505760774, + "loss": 3.097, + "step": 39068 + }, + { + "epoch": 1.91, + "grad_norm": 0.697792649269104, + "learning_rate": 0.00017378108320094518, + "loss": 2.6795, + "step": 39069 + }, + { + "epoch": 1.91, + "grad_norm": 0.716894805431366, + "learning_rate": 0.0001737671161582492, + "loss": 2.7536, + "step": 39070 + }, + { + "epoch": 1.91, + "grad_norm": 0.6366348266601562, + "learning_rate": 0.00017375314944802648, + "loss": 2.8962, + "step": 39071 + }, + { + "epoch": 1.91, + "grad_norm": 0.6648399829864502, + "learning_rate": 0.00017373918307031364, + "loss": 3.1113, + "step": 39072 + }, + { + "epoch": 1.91, + "grad_norm": 0.6838628649711609, + "learning_rate": 0.0001737252170251474, + "loss": 3.1721, + "step": 39073 + }, + { + "epoch": 1.91, + "grad_norm": 0.6086000204086304, + "learning_rate": 0.0001737112513125647, + "loss": 3.0741, + "step": 39074 + }, + { + "epoch": 1.91, + "grad_norm": 0.6315998435020447, + "learning_rate": 0.0001736972859326022, + "loss": 3.0657, + "step": 39075 + }, + { + "epoch": 1.92, + "grad_norm": 0.6525732278823853, + "learning_rate": 0.0001736833208852968, + "loss": 2.9191, + "step": 39076 + }, + { + "epoch": 1.92, + "grad_norm": 0.6616672873497009, + "learning_rate": 0.00017366935617068517, + "loss": 2.8436, + "step": 39077 + }, + { + "epoch": 1.92, + "grad_norm": 0.6813806295394897, + "learning_rate": 0.00017365539178880422, + "loss": 3.0425, + "step": 39078 + }, + { + "epoch": 1.92, + "grad_norm": 1.1355561017990112, + "learning_rate": 0.00017364142773969066, + "loss": 2.9847, + "step": 39079 + }, + { + "epoch": 1.92, + "grad_norm": 0.6255960464477539, + "learning_rate": 0.00017362746402338114, + "loss": 2.9503, + "step": 39080 + }, + { + "epoch": 1.92, + "grad_norm": 0.6622676253318787, + "learning_rate": 0.0001736135006399127, + "loss": 3.4366, + "step": 39081 + }, + { + "epoch": 1.92, + "grad_norm": 0.6947815418243408, + "learning_rate": 0.00017359953758932183, + "loss": 2.9106, + "step": 39082 + }, + { + "epoch": 1.92, + "grad_norm": 0.6500281095504761, + "learning_rate": 0.00017358557487164548, + "loss": 2.6224, + "step": 39083 + }, + { + "epoch": 1.92, + "grad_norm": 0.6997030973434448, + "learning_rate": 0.0001735716124869205, + "loss": 2.9147, + "step": 39084 + }, + { + "epoch": 1.92, + "grad_norm": 0.6823716759681702, + "learning_rate": 0.0001735576504351834, + "loss": 2.9929, + "step": 39085 + }, + { + "epoch": 1.92, + "grad_norm": 0.6616767644882202, + "learning_rate": 0.0001735436887164713, + "loss": 2.9272, + "step": 39086 + }, + { + "epoch": 1.92, + "grad_norm": 0.6855137348175049, + "learning_rate": 0.00017352972733082075, + "loss": 2.9099, + "step": 39087 + }, + { + "epoch": 1.92, + "grad_norm": 0.6969751715660095, + "learning_rate": 0.00017351576627826847, + "loss": 2.7265, + "step": 39088 + }, + { + "epoch": 1.92, + "grad_norm": 0.6577869057655334, + "learning_rate": 0.00017350180555885143, + "loss": 3.2084, + "step": 39089 + }, + { + "epoch": 1.92, + "grad_norm": 0.6708926558494568, + "learning_rate": 0.00017348784517260614, + "loss": 3.1078, + "step": 39090 + }, + { + "epoch": 1.92, + "grad_norm": 0.6796287894248962, + "learning_rate": 0.00017347388511956965, + "loss": 2.8497, + "step": 39091 + }, + { + "epoch": 1.92, + "grad_norm": 0.6603464484214783, + "learning_rate": 0.00017345992539977847, + "loss": 2.8234, + "step": 39092 + }, + { + "epoch": 1.92, + "grad_norm": 0.635272741317749, + "learning_rate": 0.00017344596601326964, + "loss": 3.1138, + "step": 39093 + }, + { + "epoch": 1.92, + "grad_norm": 0.6617568731307983, + "learning_rate": 0.00017343200696007979, + "loss": 3.1461, + "step": 39094 + }, + { + "epoch": 1.92, + "grad_norm": 0.6393948793411255, + "learning_rate": 0.0001734180482402455, + "loss": 2.8271, + "step": 39095 + }, + { + "epoch": 1.92, + "grad_norm": 0.6940094232559204, + "learning_rate": 0.0001734040898538039, + "loss": 3.0738, + "step": 39096 + }, + { + "epoch": 1.92, + "grad_norm": 0.6491406559944153, + "learning_rate": 0.00017339013180079136, + "loss": 3.0714, + "step": 39097 + }, + { + "epoch": 1.92, + "grad_norm": 0.6679297089576721, + "learning_rate": 0.0001733761740812449, + "loss": 2.9234, + "step": 39098 + }, + { + "epoch": 1.92, + "grad_norm": 0.6698212027549744, + "learning_rate": 0.00017336221669520135, + "loss": 2.956, + "step": 39099 + }, + { + "epoch": 1.92, + "grad_norm": 0.6277477145195007, + "learning_rate": 0.00017334825964269722, + "loss": 2.9522, + "step": 39100 + }, + { + "epoch": 1.92, + "grad_norm": 0.63001549243927, + "learning_rate": 0.00017333430292376965, + "loss": 3.1925, + "step": 39101 + }, + { + "epoch": 1.92, + "grad_norm": 0.6419739127159119, + "learning_rate": 0.0001733203465384549, + "loss": 2.9653, + "step": 39102 + }, + { + "epoch": 1.92, + "grad_norm": 0.6369063258171082, + "learning_rate": 0.00017330639048678995, + "loss": 2.8996, + "step": 39103 + }, + { + "epoch": 1.92, + "grad_norm": 0.7089871764183044, + "learning_rate": 0.0001732924347688117, + "loss": 2.9241, + "step": 39104 + }, + { + "epoch": 1.92, + "grad_norm": 0.639457106590271, + "learning_rate": 0.00017327847938455673, + "loss": 2.9672, + "step": 39105 + }, + { + "epoch": 1.92, + "grad_norm": 0.641934335231781, + "learning_rate": 0.00017326452433406194, + "loss": 2.8944, + "step": 39106 + }, + { + "epoch": 1.92, + "grad_norm": 0.6629417538642883, + "learning_rate": 0.0001732505696173639, + "loss": 2.8033, + "step": 39107 + }, + { + "epoch": 1.92, + "grad_norm": 0.6630560159683228, + "learning_rate": 0.00017323661523449958, + "loss": 2.9384, + "step": 39108 + }, + { + "epoch": 1.92, + "grad_norm": 0.6467246413230896, + "learning_rate": 0.00017322266118550557, + "loss": 2.9888, + "step": 39109 + }, + { + "epoch": 1.92, + "grad_norm": 0.6524175405502319, + "learning_rate": 0.00017320870747041862, + "loss": 3.2388, + "step": 39110 + }, + { + "epoch": 1.92, + "grad_norm": 0.6555929183959961, + "learning_rate": 0.00017319475408927564, + "loss": 3.0857, + "step": 39111 + }, + { + "epoch": 1.92, + "grad_norm": 0.6451212167739868, + "learning_rate": 0.00017318080104211314, + "loss": 2.8654, + "step": 39112 + }, + { + "epoch": 1.92, + "grad_norm": 0.6508936285972595, + "learning_rate": 0.00017316684832896815, + "loss": 2.9725, + "step": 39113 + }, + { + "epoch": 1.92, + "grad_norm": 0.6641062498092651, + "learning_rate": 0.0001731528959498771, + "loss": 2.8809, + "step": 39114 + }, + { + "epoch": 1.92, + "grad_norm": 0.6545625329017639, + "learning_rate": 0.0001731389439048771, + "loss": 2.9962, + "step": 39115 + }, + { + "epoch": 1.92, + "grad_norm": 0.652595579624176, + "learning_rate": 0.00017312499219400461, + "loss": 2.9298, + "step": 39116 + }, + { + "epoch": 1.92, + "grad_norm": 0.6414437890052795, + "learning_rate": 0.00017311104081729642, + "loss": 2.9284, + "step": 39117 + }, + { + "epoch": 1.92, + "grad_norm": 0.7035130262374878, + "learning_rate": 0.00017309708977478942, + "loss": 3.058, + "step": 39118 + }, + { + "epoch": 1.92, + "grad_norm": 0.6662630438804626, + "learning_rate": 0.00017308313906652016, + "loss": 3.0701, + "step": 39119 + }, + { + "epoch": 1.92, + "grad_norm": 0.645362377166748, + "learning_rate": 0.0001730691886925255, + "loss": 3.1298, + "step": 39120 + }, + { + "epoch": 1.92, + "grad_norm": 0.6399428844451904, + "learning_rate": 0.0001730552386528423, + "loss": 3.0839, + "step": 39121 + }, + { + "epoch": 1.92, + "grad_norm": 0.6200828552246094, + "learning_rate": 0.000173041288947507, + "loss": 2.9407, + "step": 39122 + }, + { + "epoch": 1.92, + "grad_norm": 0.6302321553230286, + "learning_rate": 0.00017302733957655662, + "loss": 2.9237, + "step": 39123 + }, + { + "epoch": 1.92, + "grad_norm": 0.6495705246925354, + "learning_rate": 0.00017301339054002785, + "loss": 2.8885, + "step": 39124 + }, + { + "epoch": 1.92, + "grad_norm": 0.634657621383667, + "learning_rate": 0.00017299944183795724, + "loss": 3.1106, + "step": 39125 + }, + { + "epoch": 1.92, + "grad_norm": 0.6378446817398071, + "learning_rate": 0.00017298549347038178, + "loss": 2.7874, + "step": 39126 + }, + { + "epoch": 1.92, + "grad_norm": 0.6395211815834045, + "learning_rate": 0.00017297154543733796, + "loss": 3.0792, + "step": 39127 + }, + { + "epoch": 1.92, + "grad_norm": 0.6783758997917175, + "learning_rate": 0.0001729575977388627, + "loss": 2.8279, + "step": 39128 + }, + { + "epoch": 1.92, + "grad_norm": 0.6913485527038574, + "learning_rate": 0.00017294365037499263, + "loss": 3.101, + "step": 39129 + }, + { + "epoch": 1.92, + "grad_norm": 0.659443199634552, + "learning_rate": 0.00017292970334576466, + "loss": 3.1545, + "step": 39130 + }, + { + "epoch": 1.92, + "grad_norm": 0.6611900329589844, + "learning_rate": 0.00017291575665121534, + "loss": 2.865, + "step": 39131 + }, + { + "epoch": 1.92, + "grad_norm": 0.6406236290931702, + "learning_rate": 0.0001729018102913814, + "loss": 3.0973, + "step": 39132 + }, + { + "epoch": 1.92, + "grad_norm": 0.6783467531204224, + "learning_rate": 0.0001728878642662997, + "loss": 3.074, + "step": 39133 + }, + { + "epoch": 1.92, + "grad_norm": 0.6730901002883911, + "learning_rate": 0.0001728739185760068, + "loss": 2.8599, + "step": 39134 + }, + { + "epoch": 1.92, + "grad_norm": 0.6598025560379028, + "learning_rate": 0.00017285997322053954, + "loss": 3.1268, + "step": 39135 + }, + { + "epoch": 1.92, + "grad_norm": 0.6816617846488953, + "learning_rate": 0.00017284602819993476, + "loss": 2.944, + "step": 39136 + }, + { + "epoch": 1.92, + "grad_norm": 0.6511579155921936, + "learning_rate": 0.00017283208351422908, + "loss": 3.1893, + "step": 39137 + }, + { + "epoch": 1.92, + "grad_norm": 0.693321943283081, + "learning_rate": 0.00017281813916345917, + "loss": 3.0119, + "step": 39138 + }, + { + "epoch": 1.92, + "grad_norm": 0.6485716700553894, + "learning_rate": 0.00017280419514766174, + "loss": 2.7844, + "step": 39139 + }, + { + "epoch": 1.92, + "grad_norm": 0.6522268056869507, + "learning_rate": 0.00017279025146687353, + "loss": 2.9277, + "step": 39140 + }, + { + "epoch": 1.92, + "grad_norm": 0.6479056477546692, + "learning_rate": 0.0001727763081211315, + "loss": 2.686, + "step": 39141 + }, + { + "epoch": 1.92, + "grad_norm": 0.6812885999679565, + "learning_rate": 0.00017276236511047202, + "loss": 2.8859, + "step": 39142 + }, + { + "epoch": 1.92, + "grad_norm": 0.6709456443786621, + "learning_rate": 0.0001727484224349321, + "loss": 2.9349, + "step": 39143 + }, + { + "epoch": 1.92, + "grad_norm": 0.6877416372299194, + "learning_rate": 0.00017273448009454825, + "loss": 2.9836, + "step": 39144 + }, + { + "epoch": 1.92, + "grad_norm": 0.6465352177619934, + "learning_rate": 0.00017272053808935743, + "loss": 3.0815, + "step": 39145 + }, + { + "epoch": 1.92, + "grad_norm": 0.6705036163330078, + "learning_rate": 0.0001727065964193962, + "loss": 3.1177, + "step": 39146 + }, + { + "epoch": 1.92, + "grad_norm": 0.6689993739128113, + "learning_rate": 0.00017269265508470112, + "loss": 3.0918, + "step": 39147 + }, + { + "epoch": 1.92, + "grad_norm": 0.6450504660606384, + "learning_rate": 0.00017267871408530928, + "loss": 3.0432, + "step": 39148 + }, + { + "epoch": 1.92, + "grad_norm": 0.6703768968582153, + "learning_rate": 0.00017266477342125702, + "loss": 3.0998, + "step": 39149 + }, + { + "epoch": 1.92, + "grad_norm": 0.6024968028068542, + "learning_rate": 0.00017265083309258127, + "loss": 2.9523, + "step": 39150 + }, + { + "epoch": 1.92, + "grad_norm": 0.6327059268951416, + "learning_rate": 0.00017263689309931886, + "loss": 3.127, + "step": 39151 + }, + { + "epoch": 1.92, + "grad_norm": 0.709504246711731, + "learning_rate": 0.00017262295344150634, + "loss": 3.1062, + "step": 39152 + }, + { + "epoch": 1.92, + "grad_norm": 0.6775631308555603, + "learning_rate": 0.00017260901411918043, + "loss": 3.0224, + "step": 39153 + }, + { + "epoch": 1.92, + "grad_norm": 0.6679512858390808, + "learning_rate": 0.00017259507513237777, + "loss": 2.8959, + "step": 39154 + }, + { + "epoch": 1.92, + "grad_norm": 0.7322613596916199, + "learning_rate": 0.00017258113648113517, + "loss": 2.9774, + "step": 39155 + }, + { + "epoch": 1.92, + "grad_norm": 0.6525643467903137, + "learning_rate": 0.00017256719816548943, + "loss": 3.0671, + "step": 39156 + }, + { + "epoch": 1.92, + "grad_norm": 0.662835419178009, + "learning_rate": 0.00017255326018547707, + "loss": 2.9745, + "step": 39157 + }, + { + "epoch": 1.92, + "grad_norm": 0.7078630328178406, + "learning_rate": 0.000172539322541135, + "loss": 3.0263, + "step": 39158 + }, + { + "epoch": 1.92, + "grad_norm": 0.6778189539909363, + "learning_rate": 0.0001725253852324998, + "loss": 3.1786, + "step": 39159 + }, + { + "epoch": 1.92, + "grad_norm": 0.7059077024459839, + "learning_rate": 0.00017251144825960812, + "loss": 2.7726, + "step": 39160 + }, + { + "epoch": 1.92, + "grad_norm": 0.6346893310546875, + "learning_rate": 0.00017249751162249686, + "loss": 2.9413, + "step": 39161 + }, + { + "epoch": 1.92, + "grad_norm": 0.6974074840545654, + "learning_rate": 0.00017248357532120248, + "loss": 3.048, + "step": 39162 + }, + { + "epoch": 1.92, + "grad_norm": 0.6645182967185974, + "learning_rate": 0.00017246963935576196, + "loss": 2.7881, + "step": 39163 + }, + { + "epoch": 1.92, + "grad_norm": 0.6341907978057861, + "learning_rate": 0.0001724557037262117, + "loss": 3.1263, + "step": 39164 + }, + { + "epoch": 1.92, + "grad_norm": 0.625954806804657, + "learning_rate": 0.00017244176843258862, + "loss": 2.7248, + "step": 39165 + }, + { + "epoch": 1.92, + "grad_norm": 0.6558056473731995, + "learning_rate": 0.0001724278334749295, + "loss": 3.1032, + "step": 39166 + }, + { + "epoch": 1.92, + "grad_norm": 0.6719792485237122, + "learning_rate": 0.00017241389885327087, + "loss": 2.9548, + "step": 39167 + }, + { + "epoch": 1.92, + "grad_norm": 0.682371199131012, + "learning_rate": 0.00017239996456764952, + "loss": 3.1351, + "step": 39168 + }, + { + "epoch": 1.92, + "grad_norm": 0.6656419634819031, + "learning_rate": 0.00017238603061810195, + "loss": 2.8981, + "step": 39169 + }, + { + "epoch": 1.92, + "grad_norm": 0.6694563031196594, + "learning_rate": 0.00017237209700466502, + "loss": 3.2168, + "step": 39170 + }, + { + "epoch": 1.92, + "grad_norm": 0.7101410627365112, + "learning_rate": 0.00017235816372737558, + "loss": 2.9831, + "step": 39171 + }, + { + "epoch": 1.92, + "grad_norm": 0.6762426495552063, + "learning_rate": 0.00017234423078627003, + "loss": 2.8293, + "step": 39172 + }, + { + "epoch": 1.92, + "grad_norm": 0.617826521396637, + "learning_rate": 0.00017233029818138532, + "loss": 2.8919, + "step": 39173 + }, + { + "epoch": 1.92, + "grad_norm": 0.7168938517570496, + "learning_rate": 0.00017231636591275807, + "loss": 2.9602, + "step": 39174 + }, + { + "epoch": 1.92, + "grad_norm": 0.6657389998435974, + "learning_rate": 0.00017230243398042475, + "loss": 2.9613, + "step": 39175 + }, + { + "epoch": 1.92, + "grad_norm": 0.6456219553947449, + "learning_rate": 0.0001722885023844224, + "loss": 2.8773, + "step": 39176 + }, + { + "epoch": 1.92, + "grad_norm": 0.6605064868927002, + "learning_rate": 0.00017227457112478745, + "loss": 3.088, + "step": 39177 + }, + { + "epoch": 1.92, + "grad_norm": 0.6815346479415894, + "learning_rate": 0.0001722606402015568, + "loss": 3.0484, + "step": 39178 + }, + { + "epoch": 1.92, + "grad_norm": 0.6612982153892517, + "learning_rate": 0.0001722467096147669, + "loss": 3.0271, + "step": 39179 + }, + { + "epoch": 1.92, + "grad_norm": 0.6247538328170776, + "learning_rate": 0.00017223277936445475, + "loss": 2.8698, + "step": 39180 + }, + { + "epoch": 1.92, + "grad_norm": 0.6820695996284485, + "learning_rate": 0.00017221884945065687, + "loss": 3.0734, + "step": 39181 + }, + { + "epoch": 1.92, + "grad_norm": 0.6383649110794067, + "learning_rate": 0.0001722049198734098, + "loss": 3.1281, + "step": 39182 + }, + { + "epoch": 1.92, + "grad_norm": 0.6495553255081177, + "learning_rate": 0.00017219099063275052, + "loss": 3.0078, + "step": 39183 + }, + { + "epoch": 1.92, + "grad_norm": 0.6677046418190002, + "learning_rate": 0.0001721770617287154, + "loss": 3.0541, + "step": 39184 + }, + { + "epoch": 1.92, + "grad_norm": 0.6571451425552368, + "learning_rate": 0.00017216313316134135, + "loss": 2.964, + "step": 39185 + }, + { + "epoch": 1.92, + "grad_norm": 0.6236145496368408, + "learning_rate": 0.0001721492049306651, + "loss": 2.8809, + "step": 39186 + }, + { + "epoch": 1.92, + "grad_norm": 0.6534735560417175, + "learning_rate": 0.00017213527703672314, + "loss": 2.8744, + "step": 39187 + }, + { + "epoch": 1.92, + "grad_norm": 0.6529717445373535, + "learning_rate": 0.00017212134947955235, + "loss": 3.0686, + "step": 39188 + }, + { + "epoch": 1.92, + "grad_norm": 0.6746391654014587, + "learning_rate": 0.00017210742225918935, + "loss": 3.1402, + "step": 39189 + }, + { + "epoch": 1.92, + "grad_norm": 0.6508035659790039, + "learning_rate": 0.00017209349537567072, + "loss": 2.9857, + "step": 39190 + }, + { + "epoch": 1.92, + "grad_norm": 0.6782659888267517, + "learning_rate": 0.00017207956882903314, + "loss": 3.0049, + "step": 39191 + }, + { + "epoch": 1.92, + "grad_norm": 0.6732063293457031, + "learning_rate": 0.00017206564261931334, + "loss": 2.904, + "step": 39192 + }, + { + "epoch": 1.92, + "grad_norm": 0.719319760799408, + "learning_rate": 0.00017205171674654815, + "loss": 3.0394, + "step": 39193 + }, + { + "epoch": 1.92, + "grad_norm": 0.6435275673866272, + "learning_rate": 0.00017203779121077397, + "loss": 2.911, + "step": 39194 + }, + { + "epoch": 1.92, + "grad_norm": 0.6958181858062744, + "learning_rate": 0.00017202386601202776, + "loss": 3.1103, + "step": 39195 + }, + { + "epoch": 1.92, + "grad_norm": 0.6404264569282532, + "learning_rate": 0.00017200994115034606, + "loss": 2.9457, + "step": 39196 + }, + { + "epoch": 1.92, + "grad_norm": 0.6635984778404236, + "learning_rate": 0.00017199601662576543, + "loss": 2.8922, + "step": 39197 + }, + { + "epoch": 1.92, + "grad_norm": 0.6762363910675049, + "learning_rate": 0.00017198209243832278, + "loss": 3.0213, + "step": 39198 + }, + { + "epoch": 1.92, + "grad_norm": 0.6661695241928101, + "learning_rate": 0.0001719681685880545, + "loss": 2.809, + "step": 39199 + }, + { + "epoch": 1.92, + "grad_norm": 0.6384485363960266, + "learning_rate": 0.00017195424507499755, + "loss": 3.1867, + "step": 39200 + }, + { + "epoch": 1.92, + "grad_norm": 0.6842610836029053, + "learning_rate": 0.0001719403218991884, + "loss": 2.9495, + "step": 39201 + }, + { + "epoch": 1.92, + "grad_norm": 0.6691260933876038, + "learning_rate": 0.00017192639906066386, + "loss": 3.0114, + "step": 39202 + }, + { + "epoch": 1.92, + "grad_norm": 0.6943384408950806, + "learning_rate": 0.00017191247655946057, + "loss": 3.1103, + "step": 39203 + }, + { + "epoch": 1.92, + "grad_norm": 0.6369804739952087, + "learning_rate": 0.00017189855439561503, + "loss": 3.121, + "step": 39204 + }, + { + "epoch": 1.92, + "grad_norm": 0.6395381689071655, + "learning_rate": 0.00017188463256916417, + "loss": 2.9585, + "step": 39205 + }, + { + "epoch": 1.92, + "grad_norm": 0.621781051158905, + "learning_rate": 0.00017187071108014447, + "loss": 2.9129, + "step": 39206 + }, + { + "epoch": 1.92, + "grad_norm": 0.6474804282188416, + "learning_rate": 0.00017185678992859258, + "loss": 2.9731, + "step": 39207 + }, + { + "epoch": 1.92, + "grad_norm": 0.6578860282897949, + "learning_rate": 0.0001718428691145454, + "loss": 2.8956, + "step": 39208 + }, + { + "epoch": 1.92, + "grad_norm": 0.7093460559844971, + "learning_rate": 0.00017182894863803928, + "loss": 2.9726, + "step": 39209 + }, + { + "epoch": 1.92, + "grad_norm": 0.6357724070549011, + "learning_rate": 0.00017181502849911121, + "loss": 3.2573, + "step": 39210 + }, + { + "epoch": 1.92, + "grad_norm": 0.6507198214530945, + "learning_rate": 0.0001718011086977977, + "loss": 2.9724, + "step": 39211 + }, + { + "epoch": 1.92, + "grad_norm": 0.6744953393936157, + "learning_rate": 0.00017178718923413524, + "loss": 3.127, + "step": 39212 + }, + { + "epoch": 1.92, + "grad_norm": 0.6369718313217163, + "learning_rate": 0.00017177327010816076, + "loss": 2.9405, + "step": 39213 + }, + { + "epoch": 1.92, + "grad_norm": 0.6532459259033203, + "learning_rate": 0.00017175935131991067, + "loss": 2.9442, + "step": 39214 + }, + { + "epoch": 1.92, + "grad_norm": 0.6640715003013611, + "learning_rate": 0.00017174543286942192, + "loss": 2.7625, + "step": 39215 + }, + { + "epoch": 1.92, + "grad_norm": 0.7218378782272339, + "learning_rate": 0.0001717315147567309, + "loss": 3.116, + "step": 39216 + }, + { + "epoch": 1.92, + "grad_norm": 0.7001231908798218, + "learning_rate": 0.0001717175969818745, + "loss": 2.9236, + "step": 39217 + }, + { + "epoch": 1.92, + "grad_norm": 0.6841039657592773, + "learning_rate": 0.00017170367954488926, + "loss": 2.7602, + "step": 39218 + }, + { + "epoch": 1.92, + "grad_norm": 0.67069011926651, + "learning_rate": 0.00017168976244581174, + "loss": 3.0251, + "step": 39219 + }, + { + "epoch": 1.92, + "grad_norm": 0.6532872319221497, + "learning_rate": 0.0001716758456846788, + "loss": 3.0348, + "step": 39220 + }, + { + "epoch": 1.92, + "grad_norm": 0.6450393795967102, + "learning_rate": 0.00017166192926152684, + "loss": 3.1042, + "step": 39221 + }, + { + "epoch": 1.92, + "grad_norm": 0.6509761214256287, + "learning_rate": 0.0001716480131763927, + "loss": 2.8918, + "step": 39222 + }, + { + "epoch": 1.92, + "grad_norm": 0.6896138191223145, + "learning_rate": 0.00017163409742931303, + "loss": 3.0114, + "step": 39223 + }, + { + "epoch": 1.92, + "grad_norm": 0.6828722953796387, + "learning_rate": 0.00017162018202032442, + "loss": 3.0185, + "step": 39224 + }, + { + "epoch": 1.92, + "grad_norm": 0.6410359144210815, + "learning_rate": 0.0001716062669494636, + "loss": 3.0876, + "step": 39225 + }, + { + "epoch": 1.92, + "grad_norm": 0.6558801531791687, + "learning_rate": 0.0001715923522167672, + "loss": 2.8841, + "step": 39226 + }, + { + "epoch": 1.92, + "grad_norm": 0.6510255336761475, + "learning_rate": 0.00017157843782227169, + "loss": 2.9658, + "step": 39227 + }, + { + "epoch": 1.92, + "grad_norm": 0.6774110794067383, + "learning_rate": 0.00017156452376601398, + "loss": 3.0839, + "step": 39228 + }, + { + "epoch": 1.92, + "grad_norm": 0.6347213983535767, + "learning_rate": 0.0001715506100480305, + "loss": 3.1037, + "step": 39229 + }, + { + "epoch": 1.92, + "grad_norm": 0.6718534827232361, + "learning_rate": 0.00017153669666835809, + "loss": 3.0421, + "step": 39230 + }, + { + "epoch": 1.92, + "grad_norm": 0.674842894077301, + "learning_rate": 0.00017152278362703318, + "loss": 3.0798, + "step": 39231 + }, + { + "epoch": 1.92, + "grad_norm": 0.6837745308876038, + "learning_rate": 0.00017150887092409265, + "loss": 2.7996, + "step": 39232 + }, + { + "epoch": 1.92, + "grad_norm": 0.6717110276222229, + "learning_rate": 0.00017149495855957303, + "loss": 2.842, + "step": 39233 + }, + { + "epoch": 1.92, + "grad_norm": 0.6553992629051208, + "learning_rate": 0.00017148104653351087, + "loss": 2.9463, + "step": 39234 + }, + { + "epoch": 1.92, + "grad_norm": 0.702616810798645, + "learning_rate": 0.00017146713484594297, + "loss": 2.8664, + "step": 39235 + }, + { + "epoch": 1.92, + "grad_norm": 0.6626030206680298, + "learning_rate": 0.0001714532234969058, + "loss": 3.052, + "step": 39236 + }, + { + "epoch": 1.92, + "grad_norm": 0.6390082240104675, + "learning_rate": 0.00017143931248643607, + "loss": 3.0915, + "step": 39237 + }, + { + "epoch": 1.92, + "grad_norm": 0.6931836009025574, + "learning_rate": 0.0001714254018145706, + "loss": 3.2053, + "step": 39238 + }, + { + "epoch": 1.92, + "grad_norm": 0.6288201212882996, + "learning_rate": 0.0001714114914813459, + "loss": 3.1003, + "step": 39239 + }, + { + "epoch": 1.92, + "grad_norm": 0.6410542130470276, + "learning_rate": 0.00017139758148679854, + "loss": 3.0281, + "step": 39240 + }, + { + "epoch": 1.92, + "grad_norm": 0.664252758026123, + "learning_rate": 0.0001713836718309651, + "loss": 3.0345, + "step": 39241 + }, + { + "epoch": 1.92, + "grad_norm": 0.6826432943344116, + "learning_rate": 0.00017136976251388232, + "loss": 3.0711, + "step": 39242 + }, + { + "epoch": 1.92, + "grad_norm": 0.6907158493995667, + "learning_rate": 0.00017135585353558695, + "loss": 2.7451, + "step": 39243 + }, + { + "epoch": 1.92, + "grad_norm": 0.6534987092018127, + "learning_rate": 0.00017134194489611538, + "loss": 3.0869, + "step": 39244 + }, + { + "epoch": 1.92, + "grad_norm": 0.6228570342063904, + "learning_rate": 0.0001713280365955045, + "loss": 3.0009, + "step": 39245 + }, + { + "epoch": 1.92, + "grad_norm": 0.697776198387146, + "learning_rate": 0.00017131412863379068, + "loss": 3.175, + "step": 39246 + }, + { + "epoch": 1.92, + "grad_norm": 0.6710581183433533, + "learning_rate": 0.0001713002210110108, + "loss": 2.8232, + "step": 39247 + }, + { + "epoch": 1.92, + "grad_norm": 0.7085484862327576, + "learning_rate": 0.00017128631372720138, + "loss": 2.9369, + "step": 39248 + }, + { + "epoch": 1.92, + "grad_norm": 0.666691780090332, + "learning_rate": 0.00017127240678239884, + "loss": 2.9581, + "step": 39249 + }, + { + "epoch": 1.92, + "grad_norm": 0.686436653137207, + "learning_rate": 0.0001712585001766402, + "loss": 2.9945, + "step": 39250 + }, + { + "epoch": 1.92, + "grad_norm": 0.6662260890007019, + "learning_rate": 0.00017124459390996174, + "loss": 2.924, + "step": 39251 + }, + { + "epoch": 1.92, + "grad_norm": 0.6701951026916504, + "learning_rate": 0.0001712306879824003, + "loss": 3.0694, + "step": 39252 + }, + { + "epoch": 1.92, + "grad_norm": 0.6563501358032227, + "learning_rate": 0.00017121678239399252, + "loss": 3.0597, + "step": 39253 + }, + { + "epoch": 1.92, + "grad_norm": 0.6329666972160339, + "learning_rate": 0.00017120287714477494, + "loss": 2.7156, + "step": 39254 + }, + { + "epoch": 1.92, + "grad_norm": 0.6433852314949036, + "learning_rate": 0.0001711889722347842, + "loss": 2.9309, + "step": 39255 + }, + { + "epoch": 1.92, + "grad_norm": 0.7456541657447815, + "learning_rate": 0.00017117506766405684, + "loss": 2.9767, + "step": 39256 + }, + { + "epoch": 1.92, + "grad_norm": 0.6476452350616455, + "learning_rate": 0.0001711611634326295, + "loss": 2.9284, + "step": 39257 + }, + { + "epoch": 1.92, + "grad_norm": 0.6435794830322266, + "learning_rate": 0.00017114725954053904, + "loss": 3.1301, + "step": 39258 + }, + { + "epoch": 1.92, + "grad_norm": 0.6286764144897461, + "learning_rate": 0.0001711333559878217, + "loss": 3.2672, + "step": 39259 + }, + { + "epoch": 1.92, + "grad_norm": 0.6440817713737488, + "learning_rate": 0.00017111945277451445, + "loss": 3.0443, + "step": 39260 + }, + { + "epoch": 1.92, + "grad_norm": 0.6571938991546631, + "learning_rate": 0.0001711055499006538, + "loss": 2.9511, + "step": 39261 + }, + { + "epoch": 1.92, + "grad_norm": 0.7274107933044434, + "learning_rate": 0.00017109164736627618, + "loss": 2.9802, + "step": 39262 + }, + { + "epoch": 1.92, + "grad_norm": 0.6378130912780762, + "learning_rate": 0.00017107774517141847, + "loss": 3.0211, + "step": 39263 + }, + { + "epoch": 1.92, + "grad_norm": 0.6885344386100769, + "learning_rate": 0.00017106384331611705, + "loss": 2.9844, + "step": 39264 + }, + { + "epoch": 1.92, + "grad_norm": 0.6548222303390503, + "learning_rate": 0.0001710499418004088, + "loss": 3.0318, + "step": 39265 + }, + { + "epoch": 1.92, + "grad_norm": 0.681775689125061, + "learning_rate": 0.00017103604062433, + "loss": 2.7903, + "step": 39266 + }, + { + "epoch": 1.92, + "grad_norm": 0.674473226070404, + "learning_rate": 0.00017102213978791763, + "loss": 3.0038, + "step": 39267 + }, + { + "epoch": 1.92, + "grad_norm": 0.6563646793365479, + "learning_rate": 0.00017100823929120797, + "loss": 2.8295, + "step": 39268 + }, + { + "epoch": 1.92, + "grad_norm": 0.6491327881813049, + "learning_rate": 0.00017099433913423791, + "loss": 3.0033, + "step": 39269 + }, + { + "epoch": 1.92, + "grad_norm": 0.6606160402297974, + "learning_rate": 0.00017098043931704394, + "loss": 3.0782, + "step": 39270 + }, + { + "epoch": 1.92, + "grad_norm": 0.6784059405326843, + "learning_rate": 0.00017096653983966252, + "loss": 3.1224, + "step": 39271 + }, + { + "epoch": 1.92, + "grad_norm": 0.6741899251937866, + "learning_rate": 0.00017095264070213055, + "loss": 2.8379, + "step": 39272 + }, + { + "epoch": 1.92, + "grad_norm": 0.6609475016593933, + "learning_rate": 0.00017093874190448434, + "loss": 3.0115, + "step": 39273 + }, + { + "epoch": 1.92, + "grad_norm": 0.7126678228378296, + "learning_rate": 0.00017092484344676068, + "loss": 2.9805, + "step": 39274 + }, + { + "epoch": 1.92, + "grad_norm": 0.6484171152114868, + "learning_rate": 0.00017091094532899624, + "loss": 2.8915, + "step": 39275 + }, + { + "epoch": 1.92, + "grad_norm": 0.6500241160392761, + "learning_rate": 0.0001708970475512275, + "loss": 2.8879, + "step": 39276 + }, + { + "epoch": 1.92, + "grad_norm": 0.6842443943023682, + "learning_rate": 0.00017088315011349115, + "loss": 3.0103, + "step": 39277 + }, + { + "epoch": 1.92, + "grad_norm": 0.6493595838546753, + "learning_rate": 0.00017086925301582358, + "loss": 2.9825, + "step": 39278 + }, + { + "epoch": 1.92, + "grad_norm": 0.6276535391807556, + "learning_rate": 0.00017085535625826156, + "loss": 3.0221, + "step": 39279 + }, + { + "epoch": 1.93, + "grad_norm": 0.693079948425293, + "learning_rate": 0.00017084145984084175, + "loss": 3.1203, + "step": 39280 + }, + { + "epoch": 1.93, + "grad_norm": 0.6312422156333923, + "learning_rate": 0.0001708275637636006, + "loss": 3.001, + "step": 39281 + }, + { + "epoch": 1.93, + "grad_norm": 0.6259576082229614, + "learning_rate": 0.0001708136680265749, + "loss": 2.9753, + "step": 39282 + }, + { + "epoch": 1.93, + "grad_norm": 0.6311910152435303, + "learning_rate": 0.00017079977262980116, + "loss": 3.1335, + "step": 39283 + }, + { + "epoch": 1.93, + "grad_norm": 0.6822161674499512, + "learning_rate": 0.0001707858775733158, + "loss": 2.8774, + "step": 39284 + }, + { + "epoch": 1.93, + "grad_norm": 0.7114952802658081, + "learning_rate": 0.0001707719828571557, + "loss": 3.1555, + "step": 39285 + }, + { + "epoch": 1.93, + "grad_norm": 0.6649879813194275, + "learning_rate": 0.00017075808848135722, + "loss": 3.0921, + "step": 39286 + }, + { + "epoch": 1.93, + "grad_norm": 0.6248159408569336, + "learning_rate": 0.00017074419444595716, + "loss": 2.9811, + "step": 39287 + }, + { + "epoch": 1.93, + "grad_norm": 0.6573299169540405, + "learning_rate": 0.00017073030075099191, + "loss": 2.8717, + "step": 39288 + }, + { + "epoch": 1.93, + "grad_norm": 0.7020032405853271, + "learning_rate": 0.00017071640739649814, + "loss": 2.8755, + "step": 39289 + }, + { + "epoch": 1.93, + "grad_norm": 0.6383627653121948, + "learning_rate": 0.00017070251438251264, + "loss": 2.9567, + "step": 39290 + }, + { + "epoch": 1.93, + "grad_norm": 0.7252497673034668, + "learning_rate": 0.00017068862170907181, + "loss": 2.9716, + "step": 39291 + }, + { + "epoch": 1.93, + "grad_norm": 0.666100025177002, + "learning_rate": 0.00017067472937621229, + "loss": 3.1238, + "step": 39292 + }, + { + "epoch": 1.93, + "grad_norm": 0.6566705107688904, + "learning_rate": 0.00017066083738397048, + "loss": 3.0706, + "step": 39293 + }, + { + "epoch": 1.93, + "grad_norm": 0.657951295375824, + "learning_rate": 0.00017064694573238316, + "loss": 3.0127, + "step": 39294 + }, + { + "epoch": 1.93, + "grad_norm": 0.6560387015342712, + "learning_rate": 0.000170633054421487, + "loss": 2.9246, + "step": 39295 + }, + { + "epoch": 1.93, + "grad_norm": 0.6837169528007507, + "learning_rate": 0.00017061916345131834, + "loss": 3.0309, + "step": 39296 + }, + { + "epoch": 1.93, + "grad_norm": 0.7252517938613892, + "learning_rate": 0.00017060527282191406, + "loss": 2.6591, + "step": 39297 + }, + { + "epoch": 1.93, + "grad_norm": 0.6759359240531921, + "learning_rate": 0.00017059138253331055, + "loss": 3.1497, + "step": 39298 + }, + { + "epoch": 1.93, + "grad_norm": 0.631263256072998, + "learning_rate": 0.00017057749258554434, + "loss": 3.1716, + "step": 39299 + }, + { + "epoch": 1.93, + "grad_norm": 0.6345106959342957, + "learning_rate": 0.0001705636029786522, + "loss": 2.8964, + "step": 39300 + }, + { + "epoch": 1.93, + "grad_norm": 0.6319267153739929, + "learning_rate": 0.00017054971371267054, + "loss": 3.0213, + "step": 39301 + }, + { + "epoch": 1.93, + "grad_norm": 0.6083059906959534, + "learning_rate": 0.00017053582478763614, + "loss": 2.8726, + "step": 39302 + }, + { + "epoch": 1.93, + "grad_norm": 0.6584345698356628, + "learning_rate": 0.0001705219362035853, + "loss": 2.9741, + "step": 39303 + }, + { + "epoch": 1.93, + "grad_norm": 0.720427930355072, + "learning_rate": 0.0001705080479605549, + "loss": 2.8017, + "step": 39304 + }, + { + "epoch": 1.93, + "grad_norm": 0.6475998759269714, + "learning_rate": 0.00017049416005858142, + "loss": 3.1804, + "step": 39305 + }, + { + "epoch": 1.93, + "grad_norm": 0.6432875394821167, + "learning_rate": 0.00017048027249770122, + "loss": 2.8589, + "step": 39306 + }, + { + "epoch": 1.93, + "grad_norm": 0.645972490310669, + "learning_rate": 0.0001704663852779512, + "loss": 2.9266, + "step": 39307 + }, + { + "epoch": 1.93, + "grad_norm": 0.6561703681945801, + "learning_rate": 0.00017045249839936768, + "loss": 3.0237, + "step": 39308 + }, + { + "epoch": 1.93, + "grad_norm": 0.6930022239685059, + "learning_rate": 0.00017043861186198732, + "loss": 2.9589, + "step": 39309 + }, + { + "epoch": 1.93, + "grad_norm": 0.6722618937492371, + "learning_rate": 0.00017042472566584688, + "loss": 3.1136, + "step": 39310 + }, + { + "epoch": 1.93, + "grad_norm": 0.6967858672142029, + "learning_rate": 0.00017041083981098263, + "loss": 2.8958, + "step": 39311 + }, + { + "epoch": 1.93, + "grad_norm": 0.649520218372345, + "learning_rate": 0.00017039695429743143, + "loss": 3.0567, + "step": 39312 + }, + { + "epoch": 1.93, + "grad_norm": 0.6798179745674133, + "learning_rate": 0.0001703830691252297, + "loss": 3.0234, + "step": 39313 + }, + { + "epoch": 1.93, + "grad_norm": 0.6915568709373474, + "learning_rate": 0.00017036918429441387, + "loss": 2.9978, + "step": 39314 + }, + { + "epoch": 1.93, + "grad_norm": 0.6442030072212219, + "learning_rate": 0.0001703552998050208, + "loss": 3.2237, + "step": 39315 + }, + { + "epoch": 1.93, + "grad_norm": 0.6549793481826782, + "learning_rate": 0.00017034141565708683, + "loss": 3.2327, + "step": 39316 + }, + { + "epoch": 1.93, + "grad_norm": 0.6851274371147156, + "learning_rate": 0.0001703275318506487, + "loss": 2.8293, + "step": 39317 + }, + { + "epoch": 1.93, + "grad_norm": 0.6663837432861328, + "learning_rate": 0.0001703136483857428, + "loss": 2.8694, + "step": 39318 + }, + { + "epoch": 1.93, + "grad_norm": 0.6201865077018738, + "learning_rate": 0.00017029976526240588, + "loss": 2.9923, + "step": 39319 + }, + { + "epoch": 1.93, + "grad_norm": 0.6380475759506226, + "learning_rate": 0.00017028588248067445, + "loss": 2.9472, + "step": 39320 + }, + { + "epoch": 1.93, + "grad_norm": 0.6493070721626282, + "learning_rate": 0.0001702720000405849, + "loss": 3.1009, + "step": 39321 + }, + { + "epoch": 1.93, + "grad_norm": 0.6758432388305664, + "learning_rate": 0.00017025811794217408, + "loss": 2.8572, + "step": 39322 + }, + { + "epoch": 1.93, + "grad_norm": 0.6773945093154907, + "learning_rate": 0.00017024423618547824, + "loss": 3.0127, + "step": 39323 + }, + { + "epoch": 1.93, + "grad_norm": 0.6368323564529419, + "learning_rate": 0.00017023035477053416, + "loss": 2.8353, + "step": 39324 + }, + { + "epoch": 1.93, + "grad_norm": 0.6623914241790771, + "learning_rate": 0.00017021647369737844, + "loss": 3.0751, + "step": 39325 + }, + { + "epoch": 1.93, + "grad_norm": 0.6415942311286926, + "learning_rate": 0.00017020259296604743, + "loss": 2.9105, + "step": 39326 + }, + { + "epoch": 1.93, + "grad_norm": 0.6625025868415833, + "learning_rate": 0.00017018871257657808, + "loss": 2.912, + "step": 39327 + }, + { + "epoch": 1.93, + "grad_norm": 0.6630984544754028, + "learning_rate": 0.00017017483252900641, + "loss": 2.9823, + "step": 39328 + }, + { + "epoch": 1.93, + "grad_norm": 0.637005090713501, + "learning_rate": 0.00017016095282336926, + "loss": 3.0265, + "step": 39329 + }, + { + "epoch": 1.93, + "grad_norm": 0.6559372544288635, + "learning_rate": 0.0001701470734597033, + "loss": 3.1208, + "step": 39330 + }, + { + "epoch": 1.93, + "grad_norm": 0.683228075504303, + "learning_rate": 0.0001701331944380448, + "loss": 3.2746, + "step": 39331 + }, + { + "epoch": 1.93, + "grad_norm": 0.6639457941055298, + "learning_rate": 0.00017011931575843062, + "loss": 3.0369, + "step": 39332 + }, + { + "epoch": 1.93, + "grad_norm": 0.691205620765686, + "learning_rate": 0.00017010543742089707, + "loss": 2.921, + "step": 39333 + }, + { + "epoch": 1.93, + "grad_norm": 0.6341986656188965, + "learning_rate": 0.00017009155942548088, + "loss": 3.1622, + "step": 39334 + }, + { + "epoch": 1.93, + "grad_norm": 0.6385431289672852, + "learning_rate": 0.00017007768177221857, + "loss": 2.8519, + "step": 39335 + }, + { + "epoch": 1.93, + "grad_norm": 0.6719568967819214, + "learning_rate": 0.00017006380446114646, + "loss": 2.9594, + "step": 39336 + }, + { + "epoch": 1.93, + "grad_norm": 0.6813459396362305, + "learning_rate": 0.00017004992749230144, + "loss": 2.9986, + "step": 39337 + }, + { + "epoch": 1.93, + "grad_norm": 0.7117205262184143, + "learning_rate": 0.0001700360508657198, + "loss": 3.0113, + "step": 39338 + }, + { + "epoch": 1.93, + "grad_norm": 0.6474804878234863, + "learning_rate": 0.00017002217458143813, + "loss": 2.8304, + "step": 39339 + }, + { + "epoch": 1.93, + "grad_norm": 0.6792690753936768, + "learning_rate": 0.00017000829863949316, + "loss": 2.9919, + "step": 39340 + }, + { + "epoch": 1.93, + "grad_norm": 0.6472276449203491, + "learning_rate": 0.00016999442303992137, + "loss": 3.0074, + "step": 39341 + }, + { + "epoch": 1.93, + "grad_norm": 0.6135537028312683, + "learning_rate": 0.0001699805477827592, + "loss": 2.929, + "step": 39342 + }, + { + "epoch": 1.93, + "grad_norm": 0.6506457924842834, + "learning_rate": 0.00016996667286804318, + "loss": 2.8383, + "step": 39343 + }, + { + "epoch": 1.93, + "grad_norm": 0.6891106963157654, + "learning_rate": 0.00016995279829580998, + "loss": 2.7531, + "step": 39344 + }, + { + "epoch": 1.93, + "grad_norm": 0.6155516505241394, + "learning_rate": 0.000169938924066096, + "loss": 2.9584, + "step": 39345 + }, + { + "epoch": 1.93, + "grad_norm": 0.6519812941551208, + "learning_rate": 0.00016992505017893782, + "loss": 3.0768, + "step": 39346 + }, + { + "epoch": 1.93, + "grad_norm": 0.6716668605804443, + "learning_rate": 0.0001699111766343722, + "loss": 2.9624, + "step": 39347 + }, + { + "epoch": 1.93, + "grad_norm": 0.6471678614616394, + "learning_rate": 0.00016989730343243536, + "loss": 3.1556, + "step": 39348 + }, + { + "epoch": 1.93, + "grad_norm": 0.6281684637069702, + "learning_rate": 0.0001698834305731641, + "loss": 3.1132, + "step": 39349 + }, + { + "epoch": 1.93, + "grad_norm": 0.621414840221405, + "learning_rate": 0.00016986955805659487, + "loss": 2.9695, + "step": 39350 + }, + { + "epoch": 1.93, + "grad_norm": 0.6490262746810913, + "learning_rate": 0.000169855685882764, + "loss": 2.8509, + "step": 39351 + }, + { + "epoch": 1.93, + "grad_norm": 0.687820553779602, + "learning_rate": 0.00016984181405170837, + "loss": 3.0534, + "step": 39352 + }, + { + "epoch": 1.93, + "grad_norm": 0.6453794240951538, + "learning_rate": 0.00016982794256346424, + "loss": 3.0, + "step": 39353 + }, + { + "epoch": 1.93, + "grad_norm": 0.7020411491394043, + "learning_rate": 0.00016981407141806836, + "loss": 3.0087, + "step": 39354 + }, + { + "epoch": 1.93, + "grad_norm": 0.6691149473190308, + "learning_rate": 0.00016980020061555702, + "loss": 2.934, + "step": 39355 + }, + { + "epoch": 1.93, + "grad_norm": 0.6700564622879028, + "learning_rate": 0.00016978633015596708, + "loss": 2.7317, + "step": 39356 + }, + { + "epoch": 1.93, + "grad_norm": 0.6754282712936401, + "learning_rate": 0.0001697724600393348, + "loss": 3.1237, + "step": 39357 + }, + { + "epoch": 1.93, + "grad_norm": 0.6140692830085754, + "learning_rate": 0.00016975859026569672, + "loss": 3.0405, + "step": 39358 + }, + { + "epoch": 1.93, + "grad_norm": 0.6934306025505066, + "learning_rate": 0.00016974472083508958, + "loss": 3.0548, + "step": 39359 + }, + { + "epoch": 1.93, + "grad_norm": 0.7280018925666809, + "learning_rate": 0.00016973085174754962, + "loss": 2.8646, + "step": 39360 + }, + { + "epoch": 1.93, + "grad_norm": 0.664756178855896, + "learning_rate": 0.00016971698300311357, + "loss": 3.0039, + "step": 39361 + }, + { + "epoch": 1.93, + "grad_norm": 0.6578002572059631, + "learning_rate": 0.000169703114601818, + "loss": 2.6559, + "step": 39362 + }, + { + "epoch": 1.93, + "grad_norm": 0.661143958568573, + "learning_rate": 0.0001696892465436994, + "loss": 2.5943, + "step": 39363 + }, + { + "epoch": 1.93, + "grad_norm": 0.6856162548065186, + "learning_rate": 0.00016967537882879417, + "loss": 3.0467, + "step": 39364 + }, + { + "epoch": 1.93, + "grad_norm": 0.661816418170929, + "learning_rate": 0.00016966151145713886, + "loss": 3.0191, + "step": 39365 + }, + { + "epoch": 1.93, + "grad_norm": 0.665028989315033, + "learning_rate": 0.00016964764442877001, + "loss": 2.9423, + "step": 39366 + }, + { + "epoch": 1.93, + "grad_norm": 0.702292799949646, + "learning_rate": 0.00016963377774372433, + "loss": 3.2149, + "step": 39367 + }, + { + "epoch": 1.93, + "grad_norm": 0.6974702477455139, + "learning_rate": 0.00016961991140203802, + "loss": 2.791, + "step": 39368 + }, + { + "epoch": 1.93, + "grad_norm": 0.6458693146705627, + "learning_rate": 0.0001696060454037479, + "loss": 3.1692, + "step": 39369 + }, + { + "epoch": 1.93, + "grad_norm": 0.6239611506462097, + "learning_rate": 0.0001695921797488903, + "loss": 3.1647, + "step": 39370 + }, + { + "epoch": 1.93, + "grad_norm": 0.6828367710113525, + "learning_rate": 0.00016957831443750184, + "loss": 2.7982, + "step": 39371 + }, + { + "epoch": 1.93, + "grad_norm": 0.6554055213928223, + "learning_rate": 0.00016956444946961905, + "loss": 2.841, + "step": 39372 + }, + { + "epoch": 1.93, + "grad_norm": 0.6682385206222534, + "learning_rate": 0.00016955058484527825, + "loss": 3.153, + "step": 39373 + }, + { + "epoch": 1.93, + "grad_norm": 0.6697953939437866, + "learning_rate": 0.00016953672056451626, + "loss": 3.0903, + "step": 39374 + }, + { + "epoch": 1.93, + "grad_norm": 0.693271279335022, + "learning_rate": 0.0001695228566273693, + "loss": 2.9609, + "step": 39375 + }, + { + "epoch": 1.93, + "grad_norm": 0.6608006358146667, + "learning_rate": 0.000169508993033874, + "loss": 3.1086, + "step": 39376 + }, + { + "epoch": 1.93, + "grad_norm": 0.6983053088188171, + "learning_rate": 0.00016949512978406704, + "loss": 2.9174, + "step": 39377 + }, + { + "epoch": 1.93, + "grad_norm": 0.6728823184967041, + "learning_rate": 0.00016948126687798483, + "loss": 3.0248, + "step": 39378 + }, + { + "epoch": 1.93, + "grad_norm": 0.6789221167564392, + "learning_rate": 0.00016946740431566376, + "loss": 3.0388, + "step": 39379 + }, + { + "epoch": 1.93, + "grad_norm": 0.619369387626648, + "learning_rate": 0.00016945354209714036, + "loss": 3.1008, + "step": 39380 + }, + { + "epoch": 1.93, + "grad_norm": 0.6840798258781433, + "learning_rate": 0.0001694396802224512, + "loss": 3.2137, + "step": 39381 + }, + { + "epoch": 1.93, + "grad_norm": 0.6583610773086548, + "learning_rate": 0.00016942581869163292, + "loss": 3.0252, + "step": 39382 + }, + { + "epoch": 1.93, + "grad_norm": 0.6744231581687927, + "learning_rate": 0.00016941195750472176, + "loss": 2.9969, + "step": 39383 + }, + { + "epoch": 1.93, + "grad_norm": 0.7209470868110657, + "learning_rate": 0.00016939809666175452, + "loss": 3.0379, + "step": 39384 + }, + { + "epoch": 1.93, + "grad_norm": 0.6821282505989075, + "learning_rate": 0.00016938423616276755, + "loss": 2.6843, + "step": 39385 + }, + { + "epoch": 1.93, + "grad_norm": 0.6183244585990906, + "learning_rate": 0.00016937037600779729, + "loss": 2.9519, + "step": 39386 + }, + { + "epoch": 1.93, + "grad_norm": 0.6722292304039001, + "learning_rate": 0.00016935651619688036, + "loss": 2.8854, + "step": 39387 + }, + { + "epoch": 1.93, + "grad_norm": 0.6663751006126404, + "learning_rate": 0.00016934265673005313, + "loss": 2.9347, + "step": 39388 + }, + { + "epoch": 1.93, + "grad_norm": 0.694520890712738, + "learning_rate": 0.00016932879760735235, + "loss": 2.9421, + "step": 39389 + }, + { + "epoch": 1.93, + "grad_norm": 0.6110134124755859, + "learning_rate": 0.0001693149388288142, + "loss": 3.0589, + "step": 39390 + }, + { + "epoch": 1.93, + "grad_norm": 0.6420994400978088, + "learning_rate": 0.00016930108039447538, + "loss": 3.1698, + "step": 39391 + }, + { + "epoch": 1.93, + "grad_norm": 0.6498850584030151, + "learning_rate": 0.00016928722230437248, + "loss": 3.1342, + "step": 39392 + }, + { + "epoch": 1.93, + "grad_norm": 0.7132246494293213, + "learning_rate": 0.00016927336455854186, + "loss": 2.945, + "step": 39393 + }, + { + "epoch": 1.93, + "grad_norm": 0.6706508994102478, + "learning_rate": 0.00016925950715702005, + "loss": 2.9076, + "step": 39394 + }, + { + "epoch": 1.93, + "grad_norm": 0.7032055854797363, + "learning_rate": 0.00016924565009984342, + "loss": 2.9207, + "step": 39395 + }, + { + "epoch": 1.93, + "grad_norm": 0.6941987872123718, + "learning_rate": 0.0001692317933870486, + "loss": 3.0327, + "step": 39396 + }, + { + "epoch": 1.93, + "grad_norm": 0.6361653208732605, + "learning_rate": 0.00016921793701867217, + "loss": 3.0947, + "step": 39397 + }, + { + "epoch": 1.93, + "grad_norm": 0.6204395890235901, + "learning_rate": 0.0001692040809947504, + "loss": 2.8356, + "step": 39398 + }, + { + "epoch": 1.93, + "grad_norm": 0.6345608830451965, + "learning_rate": 0.00016919022531532, + "loss": 3.0054, + "step": 39399 + }, + { + "epoch": 1.93, + "grad_norm": 0.9609873294830322, + "learning_rate": 0.00016917636998041742, + "loss": 2.9836, + "step": 39400 + }, + { + "epoch": 1.93, + "grad_norm": 0.6671732664108276, + "learning_rate": 0.000169162514990079, + "loss": 2.8382, + "step": 39401 + }, + { + "epoch": 1.93, + "grad_norm": 0.6544909477233887, + "learning_rate": 0.0001691486603443414, + "loss": 2.9315, + "step": 39402 + }, + { + "epoch": 1.93, + "grad_norm": 0.6705605387687683, + "learning_rate": 0.00016913480604324095, + "loss": 2.9288, + "step": 39403 + }, + { + "epoch": 1.93, + "grad_norm": 0.6991368532180786, + "learning_rate": 0.00016912095208681437, + "loss": 2.7712, + "step": 39404 + }, + { + "epoch": 1.93, + "grad_norm": 0.7298339009284973, + "learning_rate": 0.00016910709847509786, + "loss": 3.005, + "step": 39405 + }, + { + "epoch": 1.93, + "grad_norm": 0.6603250503540039, + "learning_rate": 0.00016909324520812823, + "loss": 2.8994, + "step": 39406 + }, + { + "epoch": 1.93, + "grad_norm": 0.6446770429611206, + "learning_rate": 0.00016907939228594175, + "loss": 2.9513, + "step": 39407 + }, + { + "epoch": 1.93, + "grad_norm": 0.6649962067604065, + "learning_rate": 0.00016906553970857488, + "loss": 2.9068, + "step": 39408 + }, + { + "epoch": 1.93, + "grad_norm": 0.6251447796821594, + "learning_rate": 0.00016905168747606428, + "loss": 2.8336, + "step": 39409 + }, + { + "epoch": 1.93, + "grad_norm": 0.6825475692749023, + "learning_rate": 0.00016903783558844617, + "loss": 2.9488, + "step": 39410 + }, + { + "epoch": 1.93, + "grad_norm": 0.6691312789916992, + "learning_rate": 0.00016902398404575726, + "loss": 3.0109, + "step": 39411 + }, + { + "epoch": 1.93, + "grad_norm": 0.6389507055282593, + "learning_rate": 0.00016901013284803407, + "loss": 3.1017, + "step": 39412 + }, + { + "epoch": 1.93, + "grad_norm": 0.6700393557548523, + "learning_rate": 0.00016899628199531282, + "loss": 2.999, + "step": 39413 + }, + { + "epoch": 1.93, + "grad_norm": 0.6409572958946228, + "learning_rate": 0.00016898243148763032, + "loss": 2.8796, + "step": 39414 + }, + { + "epoch": 1.93, + "grad_norm": 0.6346601843833923, + "learning_rate": 0.00016896858132502287, + "loss": 3.0772, + "step": 39415 + }, + { + "epoch": 1.93, + "grad_norm": 0.6301149129867554, + "learning_rate": 0.00016895473150752683, + "loss": 3.0398, + "step": 39416 + }, + { + "epoch": 1.93, + "grad_norm": 0.6977820992469788, + "learning_rate": 0.00016894088203517894, + "loss": 2.8781, + "step": 39417 + }, + { + "epoch": 1.93, + "grad_norm": 0.9036363363265991, + "learning_rate": 0.00016892703290801543, + "loss": 2.9435, + "step": 39418 + }, + { + "epoch": 1.93, + "grad_norm": 0.7055711150169373, + "learning_rate": 0.00016891318412607298, + "loss": 2.9065, + "step": 39419 + }, + { + "epoch": 1.93, + "grad_norm": 0.6767593026161194, + "learning_rate": 0.00016889933568938788, + "loss": 2.8684, + "step": 39420 + }, + { + "epoch": 1.93, + "grad_norm": 0.6799156665802002, + "learning_rate": 0.0001688854875979968, + "loss": 3.161, + "step": 39421 + }, + { + "epoch": 1.93, + "grad_norm": 0.615311324596405, + "learning_rate": 0.00016887163985193615, + "loss": 3.001, + "step": 39422 + }, + { + "epoch": 1.93, + "grad_norm": 0.6541116833686829, + "learning_rate": 0.00016885779245124223, + "loss": 3.0344, + "step": 39423 + }, + { + "epoch": 1.93, + "grad_norm": 0.6408389806747437, + "learning_rate": 0.00016884394539595172, + "loss": 2.9967, + "step": 39424 + }, + { + "epoch": 1.93, + "grad_norm": 0.7208709716796875, + "learning_rate": 0.00016883009868610093, + "loss": 2.9828, + "step": 39425 + }, + { + "epoch": 1.93, + "grad_norm": 0.6646652221679688, + "learning_rate": 0.00016881625232172652, + "loss": 2.9533, + "step": 39426 + }, + { + "epoch": 1.93, + "grad_norm": 0.637306809425354, + "learning_rate": 0.00016880240630286477, + "loss": 3.0291, + "step": 39427 + }, + { + "epoch": 1.93, + "grad_norm": 0.6594432592391968, + "learning_rate": 0.0001687885606295522, + "loss": 3.0415, + "step": 39428 + }, + { + "epoch": 1.93, + "grad_norm": 0.6402209401130676, + "learning_rate": 0.0001687747153018256, + "loss": 2.9242, + "step": 39429 + }, + { + "epoch": 1.93, + "grad_norm": 0.6318802237510681, + "learning_rate": 0.00016876087031972084, + "loss": 2.9576, + "step": 39430 + }, + { + "epoch": 1.93, + "grad_norm": 0.6425923109054565, + "learning_rate": 0.00016874702568327489, + "loss": 2.804, + "step": 39431 + }, + { + "epoch": 1.93, + "grad_norm": 0.6498662829399109, + "learning_rate": 0.00016873318139252382, + "loss": 2.924, + "step": 39432 + }, + { + "epoch": 1.93, + "grad_norm": 0.6536316871643066, + "learning_rate": 0.0001687193374475043, + "loss": 2.885, + "step": 39433 + }, + { + "epoch": 1.93, + "grad_norm": 0.6774493455886841, + "learning_rate": 0.0001687054938482529, + "loss": 2.8245, + "step": 39434 + }, + { + "epoch": 1.93, + "grad_norm": 0.6602688431739807, + "learning_rate": 0.00016869165059480584, + "loss": 3.3949, + "step": 39435 + }, + { + "epoch": 1.93, + "grad_norm": 0.6157000660896301, + "learning_rate": 0.00016867780768719983, + "loss": 2.9687, + "step": 39436 + }, + { + "epoch": 1.93, + "grad_norm": 0.6612973213195801, + "learning_rate": 0.00016866396512547124, + "loss": 2.9654, + "step": 39437 + }, + { + "epoch": 1.93, + "grad_norm": 0.6561480760574341, + "learning_rate": 0.0001686501229096563, + "loss": 2.9176, + "step": 39438 + }, + { + "epoch": 1.93, + "grad_norm": 0.680400550365448, + "learning_rate": 0.0001686362810397918, + "loss": 3.068, + "step": 39439 + }, + { + "epoch": 1.93, + "grad_norm": 0.6502717733383179, + "learning_rate": 0.00016862243951591395, + "loss": 3.0433, + "step": 39440 + }, + { + "epoch": 1.93, + "grad_norm": 0.641892671585083, + "learning_rate": 0.00016860859833805943, + "loss": 2.891, + "step": 39441 + }, + { + "epoch": 1.93, + "grad_norm": 0.6740272641181946, + "learning_rate": 0.00016859475750626444, + "loss": 3.0464, + "step": 39442 + }, + { + "epoch": 1.93, + "grad_norm": 0.6247941851615906, + "learning_rate": 0.0001685809170205657, + "loss": 2.8856, + "step": 39443 + }, + { + "epoch": 1.93, + "grad_norm": 0.6555472016334534, + "learning_rate": 0.00016856707688099956, + "loss": 3.1368, + "step": 39444 + }, + { + "epoch": 1.93, + "grad_norm": 0.6383205056190491, + "learning_rate": 0.0001685532370876023, + "loss": 2.6563, + "step": 39445 + }, + { + "epoch": 1.93, + "grad_norm": 0.6372672319412231, + "learning_rate": 0.0001685393976404106, + "loss": 3.1764, + "step": 39446 + }, + { + "epoch": 1.93, + "grad_norm": 0.6690751910209656, + "learning_rate": 0.00016852555853946074, + "loss": 2.9395, + "step": 39447 + }, + { + "epoch": 1.93, + "grad_norm": 0.6416040658950806, + "learning_rate": 0.00016851171978478927, + "loss": 2.9809, + "step": 39448 + }, + { + "epoch": 1.93, + "grad_norm": 0.6781452894210815, + "learning_rate": 0.0001684978813764327, + "loss": 3.1393, + "step": 39449 + }, + { + "epoch": 1.93, + "grad_norm": 0.6450480818748474, + "learning_rate": 0.00016848404331442732, + "loss": 3.0979, + "step": 39450 + }, + { + "epoch": 1.93, + "grad_norm": 0.6464423537254333, + "learning_rate": 0.0001684702055988098, + "loss": 3.0349, + "step": 39451 + }, + { + "epoch": 1.93, + "grad_norm": 0.6569921374320984, + "learning_rate": 0.00016845636822961638, + "loss": 2.9834, + "step": 39452 + }, + { + "epoch": 1.93, + "grad_norm": 0.6588374972343445, + "learning_rate": 0.0001684425312068835, + "loss": 3.0806, + "step": 39453 + }, + { + "epoch": 1.93, + "grad_norm": 0.6816110014915466, + "learning_rate": 0.00016842869453064776, + "loss": 2.9424, + "step": 39454 + }, + { + "epoch": 1.93, + "grad_norm": 0.6436490416526794, + "learning_rate": 0.00016841485820094543, + "loss": 2.9879, + "step": 39455 + }, + { + "epoch": 1.93, + "grad_norm": 0.6433058381080627, + "learning_rate": 0.00016840102221781316, + "loss": 2.932, + "step": 39456 + }, + { + "epoch": 1.93, + "grad_norm": 0.6444736123085022, + "learning_rate": 0.00016838718658128714, + "loss": 2.9911, + "step": 39457 + }, + { + "epoch": 1.93, + "grad_norm": 0.6469634175300598, + "learning_rate": 0.00016837335129140404, + "loss": 2.8933, + "step": 39458 + }, + { + "epoch": 1.93, + "grad_norm": 0.6529709100723267, + "learning_rate": 0.00016835951634820023, + "loss": 3.0116, + "step": 39459 + }, + { + "epoch": 1.93, + "grad_norm": 0.6344585418701172, + "learning_rate": 0.00016834568175171197, + "loss": 2.9716, + "step": 39460 + }, + { + "epoch": 1.93, + "grad_norm": 0.6618320941925049, + "learning_rate": 0.00016833184750197596, + "loss": 3.0455, + "step": 39461 + }, + { + "epoch": 1.93, + "grad_norm": 0.6651561260223389, + "learning_rate": 0.00016831801359902846, + "loss": 3.1178, + "step": 39462 + }, + { + "epoch": 1.93, + "grad_norm": 0.6415659785270691, + "learning_rate": 0.00016830418004290594, + "loss": 2.7613, + "step": 39463 + }, + { + "epoch": 1.93, + "grad_norm": 0.7086910605430603, + "learning_rate": 0.00016829034683364498, + "loss": 2.8569, + "step": 39464 + }, + { + "epoch": 1.93, + "grad_norm": 0.619654655456543, + "learning_rate": 0.00016827651397128188, + "loss": 3.1485, + "step": 39465 + }, + { + "epoch": 1.93, + "grad_norm": 0.6941916942596436, + "learning_rate": 0.00016826268145585314, + "loss": 2.7932, + "step": 39466 + }, + { + "epoch": 1.93, + "grad_norm": 0.7001242637634277, + "learning_rate": 0.00016824884928739496, + "loss": 2.9649, + "step": 39467 + }, + { + "epoch": 1.93, + "grad_norm": 0.6473196148872375, + "learning_rate": 0.00016823501746594403, + "loss": 3.0718, + "step": 39468 + }, + { + "epoch": 1.93, + "grad_norm": 0.6944223642349243, + "learning_rate": 0.00016822118599153676, + "loss": 3.0691, + "step": 39469 + }, + { + "epoch": 1.93, + "grad_norm": 0.6287515759468079, + "learning_rate": 0.00016820735486420944, + "loss": 2.9734, + "step": 39470 + }, + { + "epoch": 1.93, + "grad_norm": 0.699953556060791, + "learning_rate": 0.00016819352408399869, + "loss": 2.9848, + "step": 39471 + }, + { + "epoch": 1.93, + "grad_norm": 0.6306780576705933, + "learning_rate": 0.00016817969365094074, + "loss": 3.1997, + "step": 39472 + }, + { + "epoch": 1.93, + "grad_norm": 0.6510525345802307, + "learning_rate": 0.0001681658635650722, + "loss": 2.9396, + "step": 39473 + }, + { + "epoch": 1.93, + "grad_norm": 0.6423694491386414, + "learning_rate": 0.0001681520338264294, + "loss": 3.058, + "step": 39474 + }, + { + "epoch": 1.93, + "grad_norm": 0.662390410900116, + "learning_rate": 0.00016813820443504866, + "loss": 3.0601, + "step": 39475 + }, + { + "epoch": 1.93, + "grad_norm": 0.653954267501831, + "learning_rate": 0.0001681243753909666, + "loss": 3.0233, + "step": 39476 + }, + { + "epoch": 1.93, + "grad_norm": 0.6505160331726074, + "learning_rate": 0.00016811054669421948, + "loss": 3.0404, + "step": 39477 + }, + { + "epoch": 1.93, + "grad_norm": 0.6412315368652344, + "learning_rate": 0.00016809671834484377, + "loss": 2.9671, + "step": 39478 + }, + { + "epoch": 1.93, + "grad_norm": 0.646365225315094, + "learning_rate": 0.00016808289034287604, + "loss": 2.9477, + "step": 39479 + }, + { + "epoch": 1.93, + "grad_norm": 0.6499340534210205, + "learning_rate": 0.00016806906268835263, + "loss": 3.2125, + "step": 39480 + }, + { + "epoch": 1.93, + "grad_norm": 0.6421015858650208, + "learning_rate": 0.00016805523538130986, + "loss": 3.1327, + "step": 39481 + }, + { + "epoch": 1.93, + "grad_norm": 0.6436095833778381, + "learning_rate": 0.00016804140842178412, + "loss": 2.9494, + "step": 39482 + }, + { + "epoch": 1.93, + "grad_norm": 0.658585786819458, + "learning_rate": 0.0001680275818098119, + "loss": 3.0283, + "step": 39483 + }, + { + "epoch": 1.94, + "grad_norm": 0.6562331318855286, + "learning_rate": 0.00016801375554542977, + "loss": 3.0036, + "step": 39484 + }, + { + "epoch": 1.94, + "grad_norm": 0.7266846299171448, + "learning_rate": 0.00016799992962867385, + "loss": 2.9681, + "step": 39485 + }, + { + "epoch": 1.94, + "grad_norm": 0.6242222785949707, + "learning_rate": 0.00016798610405958087, + "loss": 2.9437, + "step": 39486 + }, + { + "epoch": 1.94, + "grad_norm": 0.6533987522125244, + "learning_rate": 0.00016797227883818706, + "loss": 2.7359, + "step": 39487 + }, + { + "epoch": 1.94, + "grad_norm": 0.6742711067199707, + "learning_rate": 0.00016795845396452876, + "loss": 2.8229, + "step": 39488 + }, + { + "epoch": 1.94, + "grad_norm": 0.6334719657897949, + "learning_rate": 0.00016794462943864257, + "loss": 2.9316, + "step": 39489 + }, + { + "epoch": 1.94, + "grad_norm": 0.651542067527771, + "learning_rate": 0.00016793080526056472, + "loss": 3.0136, + "step": 39490 + }, + { + "epoch": 1.94, + "grad_norm": 0.6780474185943604, + "learning_rate": 0.00016791698143033183, + "loss": 2.9013, + "step": 39491 + }, + { + "epoch": 1.94, + "grad_norm": 0.674872100353241, + "learning_rate": 0.00016790315794798006, + "loss": 2.8134, + "step": 39492 + }, + { + "epoch": 1.94, + "grad_norm": 0.6489875912666321, + "learning_rate": 0.00016788933481354595, + "loss": 2.7779, + "step": 39493 + }, + { + "epoch": 1.94, + "grad_norm": 0.6219632625579834, + "learning_rate": 0.00016787551202706604, + "loss": 2.8627, + "step": 39494 + }, + { + "epoch": 1.94, + "grad_norm": 0.640094518661499, + "learning_rate": 0.0001678616895885766, + "loss": 2.8384, + "step": 39495 + }, + { + "epoch": 1.94, + "grad_norm": 0.7231557965278625, + "learning_rate": 0.000167847867498114, + "loss": 2.9999, + "step": 39496 + }, + { + "epoch": 1.94, + "grad_norm": 0.6574937701225281, + "learning_rate": 0.0001678340457557146, + "loss": 2.9708, + "step": 39497 + }, + { + "epoch": 1.94, + "grad_norm": 0.6397494077682495, + "learning_rate": 0.000167820224361415, + "loss": 2.8948, + "step": 39498 + }, + { + "epoch": 1.94, + "grad_norm": 0.6591821908950806, + "learning_rate": 0.0001678064033152514, + "loss": 3.1144, + "step": 39499 + }, + { + "epoch": 1.94, + "grad_norm": 0.684015691280365, + "learning_rate": 0.00016779258261726029, + "loss": 3.0855, + "step": 39500 + }, + { + "epoch": 1.94, + "grad_norm": 0.6491175293922424, + "learning_rate": 0.00016777876226747816, + "loss": 2.8436, + "step": 39501 + }, + { + "epoch": 1.94, + "grad_norm": 0.6679362058639526, + "learning_rate": 0.0001677649422659413, + "loss": 2.9078, + "step": 39502 + }, + { + "epoch": 1.94, + "grad_norm": 0.6936571598052979, + "learning_rate": 0.00016775112261268617, + "loss": 2.9691, + "step": 39503 + }, + { + "epoch": 1.94, + "grad_norm": 0.673439621925354, + "learning_rate": 0.000167737303307749, + "loss": 3.0569, + "step": 39504 + }, + { + "epoch": 1.94, + "grad_norm": 0.6793358325958252, + "learning_rate": 0.00016772348435116637, + "loss": 3.1337, + "step": 39505 + }, + { + "epoch": 1.94, + "grad_norm": 0.6524600982666016, + "learning_rate": 0.00016770966574297465, + "loss": 2.9261, + "step": 39506 + }, + { + "epoch": 1.94, + "grad_norm": 0.6856192946434021, + "learning_rate": 0.00016769584748321014, + "loss": 3.0568, + "step": 39507 + }, + { + "epoch": 1.94, + "grad_norm": 0.6691726446151733, + "learning_rate": 0.00016768202957190943, + "loss": 2.7727, + "step": 39508 + }, + { + "epoch": 1.94, + "grad_norm": 0.7057440876960754, + "learning_rate": 0.00016766821200910883, + "loss": 2.9153, + "step": 39509 + }, + { + "epoch": 1.94, + "grad_norm": 0.6541669368743896, + "learning_rate": 0.00016765439479484448, + "loss": 3.16, + "step": 39510 + }, + { + "epoch": 1.94, + "grad_norm": 0.7234159708023071, + "learning_rate": 0.00016764057792915317, + "loss": 3.1711, + "step": 39511 + }, + { + "epoch": 1.94, + "grad_norm": 0.6421957015991211, + "learning_rate": 0.00016762676141207096, + "loss": 2.8255, + "step": 39512 + }, + { + "epoch": 1.94, + "grad_norm": 0.6890547871589661, + "learning_rate": 0.0001676129452436345, + "loss": 2.9754, + "step": 39513 + }, + { + "epoch": 1.94, + "grad_norm": 0.6445096731185913, + "learning_rate": 0.00016759912942388, + "loss": 2.8582, + "step": 39514 + }, + { + "epoch": 1.94, + "grad_norm": 0.6753875017166138, + "learning_rate": 0.00016758531395284383, + "loss": 3.0514, + "step": 39515 + }, + { + "epoch": 1.94, + "grad_norm": 0.6762891411781311, + "learning_rate": 0.00016757149883056264, + "loss": 2.9444, + "step": 39516 + }, + { + "epoch": 1.94, + "grad_norm": 0.6587990522384644, + "learning_rate": 0.0001675576840570726, + "loss": 2.8884, + "step": 39517 + }, + { + "epoch": 1.94, + "grad_norm": 0.6049191951751709, + "learning_rate": 0.00016754386963241014, + "loss": 2.928, + "step": 39518 + }, + { + "epoch": 1.94, + "grad_norm": 0.6751249432563782, + "learning_rate": 0.00016753005555661152, + "loss": 3.0188, + "step": 39519 + }, + { + "epoch": 1.94, + "grad_norm": 0.6588065028190613, + "learning_rate": 0.00016751624182971325, + "loss": 3.018, + "step": 39520 + }, + { + "epoch": 1.94, + "grad_norm": 0.6722527742385864, + "learning_rate": 0.00016750242845175182, + "loss": 2.9835, + "step": 39521 + }, + { + "epoch": 1.94, + "grad_norm": 0.7164931893348694, + "learning_rate": 0.00016748861542276332, + "loss": 2.8443, + "step": 39522 + }, + { + "epoch": 1.94, + "grad_norm": 0.6754207015037537, + "learning_rate": 0.0001674748027427845, + "loss": 3.1071, + "step": 39523 + }, + { + "epoch": 1.94, + "grad_norm": 0.6811302304267883, + "learning_rate": 0.00016746099041185152, + "loss": 2.8322, + "step": 39524 + }, + { + "epoch": 1.94, + "grad_norm": 0.6523797512054443, + "learning_rate": 0.00016744717843000066, + "loss": 3.0216, + "step": 39525 + }, + { + "epoch": 1.94, + "grad_norm": 0.6112352013587952, + "learning_rate": 0.0001674333667972685, + "loss": 2.9525, + "step": 39526 + }, + { + "epoch": 1.94, + "grad_norm": 0.6537315249443054, + "learning_rate": 0.0001674195555136913, + "loss": 3.0354, + "step": 39527 + }, + { + "epoch": 1.94, + "grad_norm": 0.6480567455291748, + "learning_rate": 0.00016740574457930552, + "loss": 3.0474, + "step": 39528 + }, + { + "epoch": 1.94, + "grad_norm": 0.6240382194519043, + "learning_rate": 0.00016739193399414743, + "loss": 2.7532, + "step": 39529 + }, + { + "epoch": 1.94, + "grad_norm": 0.6805428266525269, + "learning_rate": 0.00016737812375825357, + "loss": 2.878, + "step": 39530 + }, + { + "epoch": 1.94, + "grad_norm": 0.6385943293571472, + "learning_rate": 0.00016736431387166017, + "loss": 3.1757, + "step": 39531 + }, + { + "epoch": 1.94, + "grad_norm": 0.6469088196754456, + "learning_rate": 0.00016735050433440353, + "loss": 2.8245, + "step": 39532 + }, + { + "epoch": 1.94, + "grad_norm": 0.6487658619880676, + "learning_rate": 0.0001673366951465203, + "loss": 2.864, + "step": 39533 + }, + { + "epoch": 1.94, + "grad_norm": 0.6648675799369812, + "learning_rate": 0.0001673228863080465, + "loss": 3.1449, + "step": 39534 + }, + { + "epoch": 1.94, + "grad_norm": 0.6737173199653625, + "learning_rate": 0.00016730907781901873, + "loss": 2.8128, + "step": 39535 + }, + { + "epoch": 1.94, + "grad_norm": 0.6540577411651611, + "learning_rate": 0.00016729526967947344, + "loss": 3.0712, + "step": 39536 + }, + { + "epoch": 1.94, + "grad_norm": 0.6505504250526428, + "learning_rate": 0.00016728146188944672, + "loss": 3.0605, + "step": 39537 + }, + { + "epoch": 1.94, + "grad_norm": 0.6246081590652466, + "learning_rate": 0.0001672676544489752, + "loss": 2.902, + "step": 39538 + }, + { + "epoch": 1.94, + "grad_norm": 0.6178304553031921, + "learning_rate": 0.00016725384735809517, + "loss": 2.738, + "step": 39539 + }, + { + "epoch": 1.94, + "grad_norm": 0.711046040058136, + "learning_rate": 0.00016724004061684282, + "loss": 3.0829, + "step": 39540 + }, + { + "epoch": 1.94, + "grad_norm": 0.6916102766990662, + "learning_rate": 0.00016722623422525478, + "loss": 3.0951, + "step": 39541 + }, + { + "epoch": 1.94, + "grad_norm": 0.6540066599845886, + "learning_rate": 0.00016721242818336712, + "loss": 3.0883, + "step": 39542 + }, + { + "epoch": 1.94, + "grad_norm": 0.6461518406867981, + "learning_rate": 0.00016719862249121655, + "loss": 3.0277, + "step": 39543 + }, + { + "epoch": 1.94, + "grad_norm": 0.6972729563713074, + "learning_rate": 0.00016718481714883912, + "loss": 3.252, + "step": 39544 + }, + { + "epoch": 1.94, + "grad_norm": 0.6396310329437256, + "learning_rate": 0.00016717101215627143, + "loss": 2.5932, + "step": 39545 + }, + { + "epoch": 1.94, + "grad_norm": 0.7706670165061951, + "learning_rate": 0.0001671572075135497, + "loss": 3.0133, + "step": 39546 + }, + { + "epoch": 1.94, + "grad_norm": 0.6633245944976807, + "learning_rate": 0.00016714340322071023, + "loss": 3.0178, + "step": 39547 + }, + { + "epoch": 1.94, + "grad_norm": 0.7238604426383972, + "learning_rate": 0.00016712959927778957, + "loss": 2.921, + "step": 39548 + }, + { + "epoch": 1.94, + "grad_norm": 0.6844163537025452, + "learning_rate": 0.0001671157956848239, + "loss": 3.0155, + "step": 39549 + }, + { + "epoch": 1.94, + "grad_norm": 0.6485677361488342, + "learning_rate": 0.0001671019924418496, + "loss": 3.1059, + "step": 39550 + }, + { + "epoch": 1.94, + "grad_norm": 0.6759480237960815, + "learning_rate": 0.0001670881895489032, + "loss": 3.2116, + "step": 39551 + }, + { + "epoch": 1.94, + "grad_norm": 0.641191303730011, + "learning_rate": 0.00016707438700602084, + "loss": 2.8688, + "step": 39552 + }, + { + "epoch": 1.94, + "grad_norm": 0.6632412672042847, + "learning_rate": 0.00016706058481323908, + "loss": 3.114, + "step": 39553 + }, + { + "epoch": 1.94, + "grad_norm": 0.6805461049079895, + "learning_rate": 0.0001670467829705941, + "loss": 2.9444, + "step": 39554 + }, + { + "epoch": 1.94, + "grad_norm": 0.6704729199409485, + "learning_rate": 0.00016703298147812223, + "loss": 2.851, + "step": 39555 + }, + { + "epoch": 1.94, + "grad_norm": 0.625964879989624, + "learning_rate": 0.00016701918033586, + "loss": 3.2429, + "step": 39556 + }, + { + "epoch": 1.94, + "grad_norm": 0.6714993119239807, + "learning_rate": 0.00016700537954384356, + "loss": 2.9443, + "step": 39557 + }, + { + "epoch": 1.94, + "grad_norm": 0.6405186057090759, + "learning_rate": 0.0001669915791021095, + "loss": 3.0493, + "step": 39558 + }, + { + "epoch": 1.94, + "grad_norm": 0.6585462093353271, + "learning_rate": 0.00016697777901069385, + "loss": 3.0023, + "step": 39559 + }, + { + "epoch": 1.94, + "grad_norm": 0.7218492031097412, + "learning_rate": 0.00016696397926963325, + "loss": 3.137, + "step": 39560 + }, + { + "epoch": 1.94, + "grad_norm": 0.6385161280632019, + "learning_rate": 0.00016695017987896398, + "loss": 3.008, + "step": 39561 + }, + { + "epoch": 1.94, + "grad_norm": 0.6418231725692749, + "learning_rate": 0.00016693638083872217, + "loss": 3.023, + "step": 39562 + }, + { + "epoch": 1.94, + "grad_norm": 0.6520888805389404, + "learning_rate": 0.00016692258214894446, + "loss": 3.0538, + "step": 39563 + }, + { + "epoch": 1.94, + "grad_norm": 0.6636407375335693, + "learning_rate": 0.0001669087838096669, + "loss": 2.8663, + "step": 39564 + }, + { + "epoch": 1.94, + "grad_norm": 0.6697338223457336, + "learning_rate": 0.000166894985820926, + "loss": 2.9433, + "step": 39565 + }, + { + "epoch": 1.94, + "grad_norm": 0.6582183241844177, + "learning_rate": 0.00016688118818275824, + "loss": 3.0544, + "step": 39566 + }, + { + "epoch": 1.94, + "grad_norm": 0.6985874176025391, + "learning_rate": 0.0001668673908951998, + "loss": 3.1551, + "step": 39567 + }, + { + "epoch": 1.94, + "grad_norm": 0.6799558997154236, + "learning_rate": 0.00016685359395828702, + "loss": 2.9644, + "step": 39568 + }, + { + "epoch": 1.94, + "grad_norm": 0.6577656865119934, + "learning_rate": 0.00016683979737205614, + "loss": 2.9913, + "step": 39569 + }, + { + "epoch": 1.94, + "grad_norm": 0.6054019331932068, + "learning_rate": 0.0001668260011365436, + "loss": 2.8949, + "step": 39570 + }, + { + "epoch": 1.94, + "grad_norm": 0.7262248992919922, + "learning_rate": 0.00016681220525178585, + "loss": 3.1803, + "step": 39571 + }, + { + "epoch": 1.94, + "grad_norm": 0.6668470501899719, + "learning_rate": 0.00016679840971781904, + "loss": 3.1008, + "step": 39572 + }, + { + "epoch": 1.94, + "grad_norm": 0.6440438032150269, + "learning_rate": 0.00016678461453467963, + "loss": 2.9656, + "step": 39573 + }, + { + "epoch": 1.94, + "grad_norm": 0.6357288956642151, + "learning_rate": 0.00016677081970240386, + "loss": 2.9421, + "step": 39574 + }, + { + "epoch": 1.94, + "grad_norm": 0.6600980162620544, + "learning_rate": 0.00016675702522102822, + "loss": 2.661, + "step": 39575 + }, + { + "epoch": 1.94, + "grad_norm": 0.6919565796852112, + "learning_rate": 0.0001667432310905889, + "loss": 2.8373, + "step": 39576 + }, + { + "epoch": 1.94, + "grad_norm": 0.6861438751220703, + "learning_rate": 0.0001667294373111222, + "loss": 2.9764, + "step": 39577 + }, + { + "epoch": 1.94, + "grad_norm": 0.7029724717140198, + "learning_rate": 0.00016671564388266456, + "loss": 2.9436, + "step": 39578 + }, + { + "epoch": 1.94, + "grad_norm": 0.6634342670440674, + "learning_rate": 0.0001667018508052522, + "loss": 2.8775, + "step": 39579 + }, + { + "epoch": 1.94, + "grad_norm": 0.6349776387214661, + "learning_rate": 0.0001666880580789216, + "loss": 2.9675, + "step": 39580 + }, + { + "epoch": 1.94, + "grad_norm": 0.6573511362075806, + "learning_rate": 0.00016667426570370888, + "loss": 2.9222, + "step": 39581 + }, + { + "epoch": 1.94, + "grad_norm": 0.6723518967628479, + "learning_rate": 0.00016666047367965058, + "loss": 2.9447, + "step": 39582 + }, + { + "epoch": 1.94, + "grad_norm": 0.6813273429870605, + "learning_rate": 0.00016664668200678297, + "loss": 3.0502, + "step": 39583 + }, + { + "epoch": 1.94, + "grad_norm": 0.6408233046531677, + "learning_rate": 0.0001666328906851422, + "loss": 2.913, + "step": 39584 + }, + { + "epoch": 1.94, + "grad_norm": 0.6882518529891968, + "learning_rate": 0.00016661909971476489, + "loss": 2.9673, + "step": 39585 + }, + { + "epoch": 1.94, + "grad_norm": 0.7611137628555298, + "learning_rate": 0.00016660530909568704, + "loss": 3.0111, + "step": 39586 + }, + { + "epoch": 1.94, + "grad_norm": 0.6440632343292236, + "learning_rate": 0.00016659151882794516, + "loss": 3.0178, + "step": 39587 + }, + { + "epoch": 1.94, + "grad_norm": 0.6395631432533264, + "learning_rate": 0.00016657772891157563, + "loss": 2.963, + "step": 39588 + }, + { + "epoch": 1.94, + "grad_norm": 0.6646182537078857, + "learning_rate": 0.0001665639393466147, + "loss": 2.8769, + "step": 39589 + }, + { + "epoch": 1.94, + "grad_norm": 0.6622651815414429, + "learning_rate": 0.00016655015013309867, + "loss": 3.1089, + "step": 39590 + }, + { + "epoch": 1.94, + "grad_norm": 0.6579053997993469, + "learning_rate": 0.00016653636127106374, + "loss": 2.8252, + "step": 39591 + }, + { + "epoch": 1.94, + "grad_norm": 0.635835587978363, + "learning_rate": 0.00016652257276054638, + "loss": 3.1497, + "step": 39592 + }, + { + "epoch": 1.94, + "grad_norm": 0.645119309425354, + "learning_rate": 0.00016650878460158295, + "loss": 3.0745, + "step": 39593 + }, + { + "epoch": 1.94, + "grad_norm": 0.707358717918396, + "learning_rate": 0.0001664949967942096, + "loss": 3.0034, + "step": 39594 + }, + { + "epoch": 1.94, + "grad_norm": 0.6341613531112671, + "learning_rate": 0.00016648120933846289, + "loss": 2.9241, + "step": 39595 + }, + { + "epoch": 1.94, + "grad_norm": 0.6709751486778259, + "learning_rate": 0.00016646742223437879, + "loss": 3.0487, + "step": 39596 + }, + { + "epoch": 1.94, + "grad_norm": 0.6443012356758118, + "learning_rate": 0.00016645363548199396, + "loss": 2.947, + "step": 39597 + }, + { + "epoch": 1.94, + "grad_norm": 0.6468132734298706, + "learning_rate": 0.0001664398490813445, + "loss": 3.1464, + "step": 39598 + }, + { + "epoch": 1.94, + "grad_norm": 0.6579627394676208, + "learning_rate": 0.0001664260630324667, + "loss": 2.9348, + "step": 39599 + }, + { + "epoch": 1.94, + "grad_norm": 0.6067564487457275, + "learning_rate": 0.00016641227733539703, + "loss": 2.9836, + "step": 39600 + }, + { + "epoch": 1.94, + "grad_norm": 0.6814645528793335, + "learning_rate": 0.00016639849199017164, + "loss": 2.9497, + "step": 39601 + }, + { + "epoch": 1.94, + "grad_norm": 0.7532250285148621, + "learning_rate": 0.00016638470699682688, + "loss": 2.8528, + "step": 39602 + }, + { + "epoch": 1.94, + "grad_norm": 0.6621189713478088, + "learning_rate": 0.00016637092235539925, + "loss": 3.103, + "step": 39603 + }, + { + "epoch": 1.94, + "grad_norm": 0.6254897713661194, + "learning_rate": 0.00016635713806592483, + "loss": 2.6625, + "step": 39604 + }, + { + "epoch": 1.94, + "grad_norm": 0.6650421023368835, + "learning_rate": 0.00016634335412844, + "loss": 3.0827, + "step": 39605 + }, + { + "epoch": 1.94, + "grad_norm": 0.6702773571014404, + "learning_rate": 0.00016632957054298097, + "loss": 3.0373, + "step": 39606 + }, + { + "epoch": 1.94, + "grad_norm": 0.629437267780304, + "learning_rate": 0.0001663157873095841, + "loss": 2.8015, + "step": 39607 + }, + { + "epoch": 1.94, + "grad_norm": 0.6859908103942871, + "learning_rate": 0.00016630200442828582, + "loss": 2.8849, + "step": 39608 + }, + { + "epoch": 1.94, + "grad_norm": 0.6508818864822388, + "learning_rate": 0.00016628822189912226, + "loss": 2.7454, + "step": 39609 + }, + { + "epoch": 1.94, + "grad_norm": 0.6694782972335815, + "learning_rate": 0.00016627443972212986, + "loss": 2.9539, + "step": 39610 + }, + { + "epoch": 1.94, + "grad_norm": 0.6411961317062378, + "learning_rate": 0.00016626065789734486, + "loss": 2.9037, + "step": 39611 + }, + { + "epoch": 1.94, + "grad_norm": 0.6573426127433777, + "learning_rate": 0.00016624687642480344, + "loss": 2.8916, + "step": 39612 + }, + { + "epoch": 1.94, + "grad_norm": 0.7193799614906311, + "learning_rate": 0.00016623309530454215, + "loss": 2.843, + "step": 39613 + }, + { + "epoch": 1.94, + "grad_norm": 0.6922988891601562, + "learning_rate": 0.00016621931453659697, + "loss": 3.0807, + "step": 39614 + }, + { + "epoch": 1.94, + "grad_norm": 0.6599312424659729, + "learning_rate": 0.00016620553412100453, + "loss": 2.8812, + "step": 39615 + }, + { + "epoch": 1.94, + "grad_norm": 0.6752668619155884, + "learning_rate": 0.00016619175405780081, + "loss": 2.9444, + "step": 39616 + }, + { + "epoch": 1.94, + "grad_norm": 0.6617905497550964, + "learning_rate": 0.00016617797434702233, + "loss": 3.1259, + "step": 39617 + }, + { + "epoch": 1.94, + "grad_norm": 0.6991008520126343, + "learning_rate": 0.00016616419498870533, + "loss": 3.0066, + "step": 39618 + }, + { + "epoch": 1.94, + "grad_norm": 0.6415068507194519, + "learning_rate": 0.00016615041598288613, + "loss": 3.1282, + "step": 39619 + }, + { + "epoch": 1.94, + "grad_norm": 0.6914513111114502, + "learning_rate": 0.000166136637329601, + "loss": 3.2745, + "step": 39620 + }, + { + "epoch": 1.94, + "grad_norm": 0.6138527989387512, + "learning_rate": 0.00016612285902888605, + "loss": 2.8569, + "step": 39621 + }, + { + "epoch": 1.94, + "grad_norm": 0.6552730798721313, + "learning_rate": 0.00016610908108077776, + "loss": 3.088, + "step": 39622 + }, + { + "epoch": 1.94, + "grad_norm": 0.6678380370140076, + "learning_rate": 0.00016609530348531246, + "loss": 3.0937, + "step": 39623 + }, + { + "epoch": 1.94, + "grad_norm": 0.7249351143836975, + "learning_rate": 0.00016608152624252628, + "loss": 3.1583, + "step": 39624 + }, + { + "epoch": 1.94, + "grad_norm": 0.6721065044403076, + "learning_rate": 0.00016606774935245568, + "loss": 3.0651, + "step": 39625 + }, + { + "epoch": 1.94, + "grad_norm": 0.6632776260375977, + "learning_rate": 0.00016605397281513684, + "loss": 2.9947, + "step": 39626 + }, + { + "epoch": 1.94, + "grad_norm": 0.6905990839004517, + "learning_rate": 0.00016604019663060595, + "loss": 2.9802, + "step": 39627 + }, + { + "epoch": 1.94, + "grad_norm": 0.7240660190582275, + "learning_rate": 0.00016602642079889955, + "loss": 2.9532, + "step": 39628 + }, + { + "epoch": 1.94, + "grad_norm": 0.653408944606781, + "learning_rate": 0.00016601264532005363, + "loss": 3.0149, + "step": 39629 + }, + { + "epoch": 1.94, + "grad_norm": 0.665780246257782, + "learning_rate": 0.00016599887019410473, + "loss": 2.9089, + "step": 39630 + }, + { + "epoch": 1.94, + "grad_norm": 0.6445152163505554, + "learning_rate": 0.0001659850954210889, + "loss": 2.9931, + "step": 39631 + }, + { + "epoch": 1.94, + "grad_norm": 0.6995826363563538, + "learning_rate": 0.00016597132100104264, + "loss": 3.13, + "step": 39632 + }, + { + "epoch": 1.94, + "grad_norm": 0.6662886738777161, + "learning_rate": 0.00016595754693400215, + "loss": 3.0491, + "step": 39633 + }, + { + "epoch": 1.94, + "grad_norm": 0.6708129048347473, + "learning_rate": 0.00016594377322000356, + "loss": 2.9691, + "step": 39634 + }, + { + "epoch": 1.94, + "grad_norm": 0.6616761088371277, + "learning_rate": 0.00016592999985908343, + "loss": 3.0653, + "step": 39635 + }, + { + "epoch": 1.94, + "grad_norm": 0.6500592827796936, + "learning_rate": 0.0001659162268512777, + "loss": 3.0368, + "step": 39636 + }, + { + "epoch": 1.94, + "grad_norm": 0.6823159456253052, + "learning_rate": 0.00016590245419662283, + "loss": 2.9157, + "step": 39637 + }, + { + "epoch": 1.94, + "grad_norm": 0.6613301634788513, + "learning_rate": 0.00016588868189515522, + "loss": 2.8774, + "step": 39638 + }, + { + "epoch": 1.94, + "grad_norm": 0.647834062576294, + "learning_rate": 0.00016587490994691092, + "loss": 2.8103, + "step": 39639 + }, + { + "epoch": 1.94, + "grad_norm": 0.6654465198516846, + "learning_rate": 0.00016586113835192635, + "loss": 2.9277, + "step": 39640 + }, + { + "epoch": 1.94, + "grad_norm": 0.6587596535682678, + "learning_rate": 0.0001658473671102378, + "loss": 2.9219, + "step": 39641 + }, + { + "epoch": 1.94, + "grad_norm": 0.6656188368797302, + "learning_rate": 0.00016583359622188132, + "loss": 3.0531, + "step": 39642 + }, + { + "epoch": 1.94, + "grad_norm": 0.6441434025764465, + "learning_rate": 0.00016581982568689342, + "loss": 2.9493, + "step": 39643 + }, + { + "epoch": 1.94, + "grad_norm": 0.6490876078605652, + "learning_rate": 0.00016580605550531018, + "loss": 2.9725, + "step": 39644 + }, + { + "epoch": 1.94, + "grad_norm": 0.7419745326042175, + "learning_rate": 0.00016579228567716807, + "loss": 2.88, + "step": 39645 + }, + { + "epoch": 1.94, + "grad_norm": 0.6507675051689148, + "learning_rate": 0.00016577851620250313, + "loss": 2.8218, + "step": 39646 + }, + { + "epoch": 1.94, + "grad_norm": 0.6721502542495728, + "learning_rate": 0.00016576474708135188, + "loss": 3.1349, + "step": 39647 + }, + { + "epoch": 1.94, + "grad_norm": 0.6784391403198242, + "learning_rate": 0.00016575097831375045, + "loss": 2.9116, + "step": 39648 + }, + { + "epoch": 1.94, + "grad_norm": 0.6822816133499146, + "learning_rate": 0.000165737209899735, + "loss": 2.8589, + "step": 39649 + }, + { + "epoch": 1.94, + "grad_norm": 0.6557987332344055, + "learning_rate": 0.000165723441839342, + "loss": 2.8979, + "step": 39650 + }, + { + "epoch": 1.94, + "grad_norm": 0.6441841125488281, + "learning_rate": 0.00016570967413260748, + "loss": 2.9015, + "step": 39651 + }, + { + "epoch": 1.94, + "grad_norm": 0.7314832210540771, + "learning_rate": 0.00016569590677956785, + "loss": 3.1157, + "step": 39652 + }, + { + "epoch": 1.94, + "grad_norm": 0.6173034310340881, + "learning_rate": 0.00016568213978025947, + "loss": 3.1966, + "step": 39653 + }, + { + "epoch": 1.94, + "grad_norm": 0.6204674243927002, + "learning_rate": 0.00016566837313471838, + "loss": 3.0183, + "step": 39654 + }, + { + "epoch": 1.94, + "grad_norm": 0.6612196564674377, + "learning_rate": 0.00016565460684298116, + "loss": 2.8712, + "step": 39655 + }, + { + "epoch": 1.94, + "grad_norm": 0.6848688721656799, + "learning_rate": 0.00016564084090508362, + "loss": 3.0089, + "step": 39656 + }, + { + "epoch": 1.94, + "grad_norm": 0.6412416696548462, + "learning_rate": 0.00016562707532106235, + "loss": 2.9572, + "step": 39657 + }, + { + "epoch": 1.94, + "grad_norm": 0.667161226272583, + "learning_rate": 0.0001656133100909534, + "loss": 2.8977, + "step": 39658 + }, + { + "epoch": 1.94, + "grad_norm": 0.639665961265564, + "learning_rate": 0.00016559954521479315, + "loss": 3.0027, + "step": 39659 + }, + { + "epoch": 1.94, + "grad_norm": 0.6585216522216797, + "learning_rate": 0.00016558578069261791, + "loss": 3.0025, + "step": 39660 + }, + { + "epoch": 1.94, + "grad_norm": 0.637984573841095, + "learning_rate": 0.00016557201652446374, + "loss": 2.8542, + "step": 39661 + }, + { + "epoch": 1.94, + "grad_norm": 0.6432386040687561, + "learning_rate": 0.00016555825271036715, + "loss": 2.9786, + "step": 39662 + }, + { + "epoch": 1.94, + "grad_norm": 0.6286998391151428, + "learning_rate": 0.00016554448925036426, + "loss": 3.0735, + "step": 39663 + }, + { + "epoch": 1.94, + "grad_norm": 0.6470546722412109, + "learning_rate": 0.00016553072614449118, + "loss": 2.8628, + "step": 39664 + }, + { + "epoch": 1.94, + "grad_norm": 0.7382664084434509, + "learning_rate": 0.0001655169633927844, + "loss": 3.0028, + "step": 39665 + }, + { + "epoch": 1.94, + "grad_norm": 0.6897470355033875, + "learning_rate": 0.0001655032009952799, + "loss": 2.8576, + "step": 39666 + }, + { + "epoch": 1.94, + "grad_norm": 0.6223475933074951, + "learning_rate": 0.00016548943895201428, + "loss": 2.8861, + "step": 39667 + }, + { + "epoch": 1.94, + "grad_norm": 0.721520721912384, + "learning_rate": 0.0001654756772630234, + "loss": 3.0925, + "step": 39668 + }, + { + "epoch": 1.94, + "grad_norm": 0.6690992116928101, + "learning_rate": 0.00016546191592834385, + "loss": 2.8354, + "step": 39669 + }, + { + "epoch": 1.94, + "grad_norm": 0.6852384209632874, + "learning_rate": 0.00016544815494801173, + "loss": 3.0092, + "step": 39670 + }, + { + "epoch": 1.94, + "grad_norm": 0.6654887795448303, + "learning_rate": 0.00016543439432206312, + "loss": 3.1952, + "step": 39671 + }, + { + "epoch": 1.94, + "grad_norm": 0.6750345826148987, + "learning_rate": 0.00016542063405053457, + "loss": 3.0411, + "step": 39672 + }, + { + "epoch": 1.94, + "grad_norm": 0.695560872554779, + "learning_rate": 0.00016540687413346203, + "loss": 3.0145, + "step": 39673 + }, + { + "epoch": 1.94, + "grad_norm": 0.7053586840629578, + "learning_rate": 0.0001653931145708819, + "loss": 3.1015, + "step": 39674 + }, + { + "epoch": 1.94, + "grad_norm": 0.6435714960098267, + "learning_rate": 0.0001653793553628305, + "loss": 3.0385, + "step": 39675 + }, + { + "epoch": 1.94, + "grad_norm": 0.6890304684638977, + "learning_rate": 0.00016536559650934385, + "loss": 2.8383, + "step": 39676 + }, + { + "epoch": 1.94, + "grad_norm": 0.7017620205879211, + "learning_rate": 0.00016535183801045847, + "loss": 2.8856, + "step": 39677 + }, + { + "epoch": 1.94, + "grad_norm": 0.638668417930603, + "learning_rate": 0.0001653380798662104, + "loss": 2.9055, + "step": 39678 + }, + { + "epoch": 1.94, + "grad_norm": 0.7142661809921265, + "learning_rate": 0.0001653243220766358, + "loss": 2.8482, + "step": 39679 + }, + { + "epoch": 1.94, + "grad_norm": 0.6465545892715454, + "learning_rate": 0.0001653105646417711, + "loss": 2.9118, + "step": 39680 + }, + { + "epoch": 1.94, + "grad_norm": 0.640198826789856, + "learning_rate": 0.00016529680756165237, + "loss": 3.0155, + "step": 39681 + }, + { + "epoch": 1.94, + "grad_norm": 0.6553422212600708, + "learning_rate": 0.00016528305083631604, + "loss": 3.0283, + "step": 39682 + }, + { + "epoch": 1.94, + "grad_norm": 0.842480480670929, + "learning_rate": 0.0001652692944657981, + "loss": 2.7033, + "step": 39683 + }, + { + "epoch": 1.94, + "grad_norm": 0.6969196200370789, + "learning_rate": 0.000165255538450135, + "loss": 3.1925, + "step": 39684 + }, + { + "epoch": 1.94, + "grad_norm": 0.6522579193115234, + "learning_rate": 0.00016524178278936295, + "loss": 2.8948, + "step": 39685 + }, + { + "epoch": 1.94, + "grad_norm": 0.659083902835846, + "learning_rate": 0.00016522802748351797, + "loss": 2.8768, + "step": 39686 + }, + { + "epoch": 1.94, + "grad_norm": 0.6438480615615845, + "learning_rate": 0.00016521427253263652, + "loss": 2.9011, + "step": 39687 + }, + { + "epoch": 1.95, + "grad_norm": 0.6511509418487549, + "learning_rate": 0.0001652005179367546, + "loss": 2.9839, + "step": 39688 + }, + { + "epoch": 1.95, + "grad_norm": 0.6866702437400818, + "learning_rate": 0.0001651867636959086, + "loss": 2.9139, + "step": 39689 + }, + { + "epoch": 1.95, + "grad_norm": 0.6492428183555603, + "learning_rate": 0.00016517300981013484, + "loss": 3.0238, + "step": 39690 + }, + { + "epoch": 1.95, + "grad_norm": 0.629295825958252, + "learning_rate": 0.00016515925627946942, + "loss": 3.1701, + "step": 39691 + }, + { + "epoch": 1.95, + "grad_norm": 0.7377083897590637, + "learning_rate": 0.0001651455031039486, + "loss": 2.8515, + "step": 39692 + }, + { + "epoch": 1.95, + "grad_norm": 0.6295761466026306, + "learning_rate": 0.00016513175028360844, + "loss": 2.862, + "step": 39693 + }, + { + "epoch": 1.95, + "grad_norm": 0.6240883469581604, + "learning_rate": 0.00016511799781848528, + "loss": 2.7688, + "step": 39694 + }, + { + "epoch": 1.95, + "grad_norm": 0.7190060615539551, + "learning_rate": 0.00016510424570861544, + "loss": 3.069, + "step": 39695 + }, + { + "epoch": 1.95, + "grad_norm": 0.6325370073318481, + "learning_rate": 0.00016509049395403496, + "loss": 2.8299, + "step": 39696 + }, + { + "epoch": 1.95, + "grad_norm": 0.6780902743339539, + "learning_rate": 0.00016507674255478028, + "loss": 2.9697, + "step": 39697 + }, + { + "epoch": 1.95, + "grad_norm": 0.6456710696220398, + "learning_rate": 0.0001650629915108874, + "loss": 2.8723, + "step": 39698 + }, + { + "epoch": 1.95, + "grad_norm": 0.6725703477859497, + "learning_rate": 0.00016504924082239272, + "loss": 3.1925, + "step": 39699 + }, + { + "epoch": 1.95, + "grad_norm": 0.6326077580451965, + "learning_rate": 0.00016503549048933237, + "loss": 2.9816, + "step": 39700 + }, + { + "epoch": 1.95, + "grad_norm": 0.6344588994979858, + "learning_rate": 0.00016502174051174245, + "loss": 3.0964, + "step": 39701 + }, + { + "epoch": 1.95, + "grad_norm": 0.6674083471298218, + "learning_rate": 0.00016500799088965942, + "loss": 2.851, + "step": 39702 + }, + { + "epoch": 1.95, + "grad_norm": 0.6438904404640198, + "learning_rate": 0.0001649942416231192, + "loss": 3.1094, + "step": 39703 + }, + { + "epoch": 1.95, + "grad_norm": 0.6310093402862549, + "learning_rate": 0.00016498049271215822, + "loss": 3.0114, + "step": 39704 + }, + { + "epoch": 1.95, + "grad_norm": 0.6457886099815369, + "learning_rate": 0.0001649667441568127, + "loss": 3.0389, + "step": 39705 + }, + { + "epoch": 1.95, + "grad_norm": 0.6540348529815674, + "learning_rate": 0.00016495299595711886, + "loss": 3.1004, + "step": 39706 + }, + { + "epoch": 1.95, + "grad_norm": 0.6192497611045837, + "learning_rate": 0.00016493924811311278, + "loss": 2.7377, + "step": 39707 + }, + { + "epoch": 1.95, + "grad_norm": 0.6560617685317993, + "learning_rate": 0.00016492550062483064, + "loss": 2.9241, + "step": 39708 + }, + { + "epoch": 1.95, + "grad_norm": 0.6480932831764221, + "learning_rate": 0.0001649117534923087, + "loss": 3.131, + "step": 39709 + }, + { + "epoch": 1.95, + "grad_norm": 0.6678999662399292, + "learning_rate": 0.00016489800671558335, + "loss": 3.0523, + "step": 39710 + }, + { + "epoch": 1.95, + "grad_norm": 0.7164120674133301, + "learning_rate": 0.0001648842602946905, + "loss": 2.8345, + "step": 39711 + }, + { + "epoch": 1.95, + "grad_norm": 0.6553846001625061, + "learning_rate": 0.00016487051422966667, + "loss": 2.9484, + "step": 39712 + }, + { + "epoch": 1.95, + "grad_norm": 0.6383706331253052, + "learning_rate": 0.00016485676852054785, + "loss": 2.9157, + "step": 39713 + }, + { + "epoch": 1.95, + "grad_norm": 0.6358631253242493, + "learning_rate": 0.00016484302316737022, + "loss": 3.0168, + "step": 39714 + }, + { + "epoch": 1.95, + "grad_norm": 0.7204211354255676, + "learning_rate": 0.00016482927817017018, + "loss": 3.0729, + "step": 39715 + }, + { + "epoch": 1.95, + "grad_norm": 0.7235838770866394, + "learning_rate": 0.00016481553352898365, + "loss": 3.097, + "step": 39716 + }, + { + "epoch": 1.95, + "grad_norm": 0.6387816667556763, + "learning_rate": 0.0001648017892438471, + "loss": 3.0294, + "step": 39717 + }, + { + "epoch": 1.95, + "grad_norm": 0.71495121717453, + "learning_rate": 0.00016478804531479653, + "loss": 3.0884, + "step": 39718 + }, + { + "epoch": 1.95, + "grad_norm": 0.6762590408325195, + "learning_rate": 0.00016477430174186822, + "loss": 3.1445, + "step": 39719 + }, + { + "epoch": 1.95, + "grad_norm": 0.6747094392776489, + "learning_rate": 0.0001647605585250985, + "loss": 2.7726, + "step": 39720 + }, + { + "epoch": 1.95, + "grad_norm": 0.6876216530799866, + "learning_rate": 0.00016474681566452343, + "loss": 2.9254, + "step": 39721 + }, + { + "epoch": 1.95, + "grad_norm": 0.662525475025177, + "learning_rate": 0.00016473307316017922, + "loss": 3.0107, + "step": 39722 + }, + { + "epoch": 1.95, + "grad_norm": 0.6243906617164612, + "learning_rate": 0.00016471933101210194, + "loss": 2.8323, + "step": 39723 + }, + { + "epoch": 1.95, + "grad_norm": 0.6680139303207397, + "learning_rate": 0.00016470558922032793, + "loss": 2.8206, + "step": 39724 + }, + { + "epoch": 1.95, + "grad_norm": 0.6394197940826416, + "learning_rate": 0.00016469184778489346, + "loss": 3.0246, + "step": 39725 + }, + { + "epoch": 1.95, + "grad_norm": 0.7584770917892456, + "learning_rate": 0.00016467810670583446, + "loss": 2.7716, + "step": 39726 + }, + { + "epoch": 1.95, + "grad_norm": 0.7045025825500488, + "learning_rate": 0.0001646643659831875, + "loss": 2.8362, + "step": 39727 + }, + { + "epoch": 1.95, + "grad_norm": 0.676650881767273, + "learning_rate": 0.00016465062561698846, + "loss": 3.054, + "step": 39728 + }, + { + "epoch": 1.95, + "grad_norm": 0.686996340751648, + "learning_rate": 0.00016463688560727354, + "loss": 2.9678, + "step": 39729 + }, + { + "epoch": 1.95, + "grad_norm": 0.6655346155166626, + "learning_rate": 0.00016462314595407914, + "loss": 3.0299, + "step": 39730 + }, + { + "epoch": 1.95, + "grad_norm": 0.6884318590164185, + "learning_rate": 0.0001646094066574412, + "loss": 2.9813, + "step": 39731 + }, + { + "epoch": 1.95, + "grad_norm": 0.6549413204193115, + "learning_rate": 0.00016459566771739612, + "loss": 2.9558, + "step": 39732 + }, + { + "epoch": 1.95, + "grad_norm": 0.6976496577262878, + "learning_rate": 0.0001645819291339799, + "loss": 2.9014, + "step": 39733 + }, + { + "epoch": 1.95, + "grad_norm": 0.6469190716743469, + "learning_rate": 0.00016456819090722894, + "loss": 2.9572, + "step": 39734 + }, + { + "epoch": 1.95, + "grad_norm": 0.6241649985313416, + "learning_rate": 0.0001645544530371793, + "loss": 3.1763, + "step": 39735 + }, + { + "epoch": 1.95, + "grad_norm": 0.6479203701019287, + "learning_rate": 0.000164540715523867, + "loss": 2.9041, + "step": 39736 + }, + { + "epoch": 1.95, + "grad_norm": 0.7182705402374268, + "learning_rate": 0.00016452697836732858, + "loss": 3.0069, + "step": 39737 + }, + { + "epoch": 1.95, + "grad_norm": 0.6855570077896118, + "learning_rate": 0.00016451324156759986, + "loss": 3.1234, + "step": 39738 + }, + { + "epoch": 1.95, + "grad_norm": 0.7441180944442749, + "learning_rate": 0.0001644995051247173, + "loss": 3.099, + "step": 39739 + }, + { + "epoch": 1.95, + "grad_norm": 0.7113296985626221, + "learning_rate": 0.00016448576903871687, + "loss": 3.0428, + "step": 39740 + }, + { + "epoch": 1.95, + "grad_norm": 0.6470918655395508, + "learning_rate": 0.00016447203330963482, + "loss": 2.9745, + "step": 39741 + }, + { + "epoch": 1.95, + "grad_norm": 0.6484912037849426, + "learning_rate": 0.0001644582979375075, + "loss": 3.0841, + "step": 39742 + }, + { + "epoch": 1.95, + "grad_norm": 0.7016591429710388, + "learning_rate": 0.00016444456292237093, + "loss": 3.1074, + "step": 39743 + }, + { + "epoch": 1.95, + "grad_norm": 0.7112533450126648, + "learning_rate": 0.00016443082826426126, + "loss": 2.7582, + "step": 39744 + }, + { + "epoch": 1.95, + "grad_norm": 0.6694428324699402, + "learning_rate": 0.00016441709396321463, + "loss": 2.7622, + "step": 39745 + }, + { + "epoch": 1.95, + "grad_norm": 0.6349228024482727, + "learning_rate": 0.00016440336001926722, + "loss": 2.8594, + "step": 39746 + }, + { + "epoch": 1.95, + "grad_norm": 0.6667150855064392, + "learning_rate": 0.00016438962643245544, + "loss": 3.1511, + "step": 39747 + }, + { + "epoch": 1.95, + "grad_norm": 0.6422046422958374, + "learning_rate": 0.00016437589320281515, + "loss": 3.1107, + "step": 39748 + }, + { + "epoch": 1.95, + "grad_norm": 0.644203782081604, + "learning_rate": 0.00016436216033038274, + "loss": 2.8401, + "step": 39749 + }, + { + "epoch": 1.95, + "grad_norm": 0.6971120834350586, + "learning_rate": 0.00016434842781519433, + "loss": 2.9795, + "step": 39750 + }, + { + "epoch": 1.95, + "grad_norm": 0.6946253180503845, + "learning_rate": 0.00016433469565728593, + "loss": 2.9588, + "step": 39751 + }, + { + "epoch": 1.95, + "grad_norm": 0.6759042143821716, + "learning_rate": 0.00016432096385669397, + "loss": 3.0513, + "step": 39752 + }, + { + "epoch": 1.95, + "grad_norm": 0.8178485035896301, + "learning_rate": 0.00016430723241345432, + "loss": 2.9891, + "step": 39753 + }, + { + "epoch": 1.95, + "grad_norm": 0.6548961997032166, + "learning_rate": 0.00016429350132760346, + "loss": 3.034, + "step": 39754 + }, + { + "epoch": 1.95, + "grad_norm": 0.6240988969802856, + "learning_rate": 0.00016427977059917727, + "loss": 2.8678, + "step": 39755 + }, + { + "epoch": 1.95, + "grad_norm": 0.6791014671325684, + "learning_rate": 0.00016426604022821205, + "loss": 2.9517, + "step": 39756 + }, + { + "epoch": 1.95, + "grad_norm": 0.6903157234191895, + "learning_rate": 0.00016425231021474418, + "loss": 3.2275, + "step": 39757 + }, + { + "epoch": 1.95, + "grad_norm": 0.6479774117469788, + "learning_rate": 0.00016423858055880937, + "loss": 2.8358, + "step": 39758 + }, + { + "epoch": 1.95, + "grad_norm": 0.6902576088905334, + "learning_rate": 0.0001642248512604441, + "loss": 3.2466, + "step": 39759 + }, + { + "epoch": 1.95, + "grad_norm": 0.6335325837135315, + "learning_rate": 0.00016421112231968437, + "loss": 2.871, + "step": 39760 + }, + { + "epoch": 1.95, + "grad_norm": 0.7053777575492859, + "learning_rate": 0.00016419739373656642, + "loss": 2.9095, + "step": 39761 + }, + { + "epoch": 1.95, + "grad_norm": 0.6629794239997864, + "learning_rate": 0.00016418366551112644, + "loss": 3.159, + "step": 39762 + }, + { + "epoch": 1.95, + "grad_norm": 0.6280484199523926, + "learning_rate": 0.0001641699376434005, + "loss": 3.0098, + "step": 39763 + }, + { + "epoch": 1.95, + "grad_norm": 0.6917252540588379, + "learning_rate": 0.00016415621013342492, + "loss": 2.9122, + "step": 39764 + }, + { + "epoch": 1.95, + "grad_norm": 0.6806678771972656, + "learning_rate": 0.00016414248298123572, + "loss": 3.1373, + "step": 39765 + }, + { + "epoch": 1.95, + "grad_norm": 0.6609867215156555, + "learning_rate": 0.00016412875618686894, + "loss": 2.8685, + "step": 39766 + }, + { + "epoch": 1.95, + "grad_norm": 0.6640313267707825, + "learning_rate": 0.00016411502975036097, + "loss": 2.9706, + "step": 39767 + }, + { + "epoch": 1.95, + "grad_norm": 0.6441121697425842, + "learning_rate": 0.0001641013036717478, + "loss": 2.926, + "step": 39768 + }, + { + "epoch": 1.95, + "grad_norm": 0.6373971104621887, + "learning_rate": 0.00016408757795106574, + "loss": 3.1379, + "step": 39769 + }, + { + "epoch": 1.95, + "grad_norm": 0.6475521326065063, + "learning_rate": 0.0001640738525883507, + "loss": 3.017, + "step": 39770 + }, + { + "epoch": 1.95, + "grad_norm": 0.6653847694396973, + "learning_rate": 0.0001640601275836391, + "loss": 2.8866, + "step": 39771 + }, + { + "epoch": 1.95, + "grad_norm": 0.6433156132698059, + "learning_rate": 0.000164046402936967, + "loss": 2.9023, + "step": 39772 + }, + { + "epoch": 1.95, + "grad_norm": 0.6815069913864136, + "learning_rate": 0.00016403267864837035, + "loss": 3.0265, + "step": 39773 + }, + { + "epoch": 1.95, + "grad_norm": 0.6718235015869141, + "learning_rate": 0.00016401895471788557, + "loss": 3.1506, + "step": 39774 + }, + { + "epoch": 1.95, + "grad_norm": 0.7037975788116455, + "learning_rate": 0.00016400523114554858, + "loss": 3.1061, + "step": 39775 + }, + { + "epoch": 1.95, + "grad_norm": 0.6948851346969604, + "learning_rate": 0.00016399150793139567, + "loss": 2.9012, + "step": 39776 + }, + { + "epoch": 1.95, + "grad_norm": 0.6527612209320068, + "learning_rate": 0.00016397778507546307, + "loss": 2.7146, + "step": 39777 + }, + { + "epoch": 1.95, + "grad_norm": 0.6541974544525146, + "learning_rate": 0.00016396406257778663, + "loss": 2.8992, + "step": 39778 + }, + { + "epoch": 1.95, + "grad_norm": 0.6501400470733643, + "learning_rate": 0.00016395034043840284, + "loss": 3.1708, + "step": 39779 + }, + { + "epoch": 1.95, + "grad_norm": 0.6790065169334412, + "learning_rate": 0.00016393661865734765, + "loss": 3.1035, + "step": 39780 + }, + { + "epoch": 1.95, + "grad_norm": 0.7167571187019348, + "learning_rate": 0.0001639228972346571, + "loss": 2.9389, + "step": 39781 + }, + { + "epoch": 1.95, + "grad_norm": 0.6675640344619751, + "learning_rate": 0.00016390917617036758, + "loss": 2.938, + "step": 39782 + }, + { + "epoch": 1.95, + "grad_norm": 0.6446592211723328, + "learning_rate": 0.00016389545546451498, + "loss": 3.2109, + "step": 39783 + }, + { + "epoch": 1.95, + "grad_norm": 0.637033998966217, + "learning_rate": 0.00016388173511713565, + "loss": 2.8193, + "step": 39784 + }, + { + "epoch": 1.95, + "grad_norm": 0.6496434807777405, + "learning_rate": 0.00016386801512826554, + "loss": 3.1458, + "step": 39785 + }, + { + "epoch": 1.95, + "grad_norm": 0.6477523446083069, + "learning_rate": 0.000163854295497941, + "loss": 3.1229, + "step": 39786 + }, + { + "epoch": 1.95, + "grad_norm": 0.6604776382446289, + "learning_rate": 0.00016384057622619803, + "loss": 2.9036, + "step": 39787 + }, + { + "epoch": 1.95, + "grad_norm": 0.6297851800918579, + "learning_rate": 0.00016382685731307267, + "loss": 2.9685, + "step": 39788 + }, + { + "epoch": 1.95, + "grad_norm": 0.713854968547821, + "learning_rate": 0.0001638131387586013, + "loss": 3.1751, + "step": 39789 + }, + { + "epoch": 1.95, + "grad_norm": 0.6805860996246338, + "learning_rate": 0.00016379942056281977, + "loss": 2.9272, + "step": 39790 + }, + { + "epoch": 1.95, + "grad_norm": 0.6449320912361145, + "learning_rate": 0.00016378570272576436, + "loss": 2.8766, + "step": 39791 + }, + { + "epoch": 1.95, + "grad_norm": 0.6588577628135681, + "learning_rate": 0.00016377198524747128, + "loss": 2.8883, + "step": 39792 + }, + { + "epoch": 1.95, + "grad_norm": 0.6518809795379639, + "learning_rate": 0.00016375826812797662, + "loss": 2.8129, + "step": 39793 + }, + { + "epoch": 1.95, + "grad_norm": 0.6627069711685181, + "learning_rate": 0.00016374455136731642, + "loss": 3.0616, + "step": 39794 + }, + { + "epoch": 1.95, + "grad_norm": 0.6738892197608948, + "learning_rate": 0.0001637308349655267, + "loss": 3.2886, + "step": 39795 + }, + { + "epoch": 1.95, + "grad_norm": 0.6399411559104919, + "learning_rate": 0.00016371711892264386, + "loss": 3.0394, + "step": 39796 + }, + { + "epoch": 1.95, + "grad_norm": 0.6393530964851379, + "learning_rate": 0.00016370340323870392, + "loss": 2.8555, + "step": 39797 + }, + { + "epoch": 1.95, + "grad_norm": 0.6676591038703918, + "learning_rate": 0.0001636896879137429, + "loss": 3.174, + "step": 39798 + }, + { + "epoch": 1.95, + "grad_norm": 0.6630427241325378, + "learning_rate": 0.0001636759729477971, + "loss": 2.9689, + "step": 39799 + }, + { + "epoch": 1.95, + "grad_norm": 0.6797800064086914, + "learning_rate": 0.00016366225834090246, + "loss": 2.9354, + "step": 39800 + }, + { + "epoch": 1.95, + "grad_norm": 0.6970747709274292, + "learning_rate": 0.0001636485440930953, + "loss": 3.0008, + "step": 39801 + }, + { + "epoch": 1.95, + "grad_norm": 0.7164403796195984, + "learning_rate": 0.00016363483020441166, + "loss": 3.0449, + "step": 39802 + }, + { + "epoch": 1.95, + "grad_norm": 0.6394767165184021, + "learning_rate": 0.00016362111667488747, + "loss": 2.8067, + "step": 39803 + }, + { + "epoch": 1.95, + "grad_norm": 0.6600726842880249, + "learning_rate": 0.00016360740350455916, + "loss": 3.053, + "step": 39804 + }, + { + "epoch": 1.95, + "grad_norm": 0.6320900321006775, + "learning_rate": 0.00016359369069346257, + "loss": 3.0765, + "step": 39805 + }, + { + "epoch": 1.95, + "grad_norm": 0.7160202264785767, + "learning_rate": 0.00016357997824163395, + "loss": 2.9636, + "step": 39806 + }, + { + "epoch": 1.95, + "grad_norm": 0.6760608553886414, + "learning_rate": 0.00016356626614910956, + "loss": 3.1093, + "step": 39807 + }, + { + "epoch": 1.95, + "grad_norm": 0.6356624960899353, + "learning_rate": 0.0001635525544159253, + "loss": 3.0642, + "step": 39808 + }, + { + "epoch": 1.95, + "grad_norm": 0.6428079009056091, + "learning_rate": 0.0001635388430421174, + "loss": 3.1223, + "step": 39809 + }, + { + "epoch": 1.95, + "grad_norm": 0.7246979475021362, + "learning_rate": 0.00016352513202772182, + "loss": 2.7984, + "step": 39810 + }, + { + "epoch": 1.95, + "grad_norm": 0.6196163892745972, + "learning_rate": 0.00016351142137277483, + "loss": 3.017, + "step": 39811 + }, + { + "epoch": 1.95, + "grad_norm": 0.7229598164558411, + "learning_rate": 0.00016349771107731245, + "loss": 2.9938, + "step": 39812 + }, + { + "epoch": 1.95, + "grad_norm": 0.6564931869506836, + "learning_rate": 0.00016348400114137078, + "loss": 3.0232, + "step": 39813 + }, + { + "epoch": 1.95, + "grad_norm": 0.7086764574050903, + "learning_rate": 0.00016347029156498608, + "loss": 2.7794, + "step": 39814 + }, + { + "epoch": 1.95, + "grad_norm": 0.6638086438179016, + "learning_rate": 0.0001634565823481944, + "loss": 3.0231, + "step": 39815 + }, + { + "epoch": 1.95, + "grad_norm": 0.6503745317459106, + "learning_rate": 0.00016344287349103178, + "loss": 2.9724, + "step": 39816 + }, + { + "epoch": 1.95, + "grad_norm": 0.6424645781517029, + "learning_rate": 0.00016342916499353428, + "loss": 3.037, + "step": 39817 + }, + { + "epoch": 1.95, + "grad_norm": 0.6524776816368103, + "learning_rate": 0.00016341545685573798, + "loss": 2.8168, + "step": 39818 + }, + { + "epoch": 1.95, + "grad_norm": 0.6896033883094788, + "learning_rate": 0.00016340174907767926, + "loss": 2.7819, + "step": 39819 + }, + { + "epoch": 1.95, + "grad_norm": 0.6743185520172119, + "learning_rate": 0.00016338804165939394, + "loss": 2.8043, + "step": 39820 + }, + { + "epoch": 1.95, + "grad_norm": 0.7410269379615784, + "learning_rate": 0.0001633743346009183, + "loss": 2.8985, + "step": 39821 + }, + { + "epoch": 1.95, + "grad_norm": 0.6511000990867615, + "learning_rate": 0.00016336062790228828, + "loss": 2.9387, + "step": 39822 + }, + { + "epoch": 1.95, + "grad_norm": 0.6686780452728271, + "learning_rate": 0.0001633469215635402, + "loss": 3.1086, + "step": 39823 + }, + { + "epoch": 1.95, + "grad_norm": 0.6792110800743103, + "learning_rate": 0.00016333321558471, + "loss": 2.9515, + "step": 39824 + }, + { + "epoch": 1.95, + "grad_norm": 0.6629397869110107, + "learning_rate": 0.00016331950996583373, + "loss": 2.9703, + "step": 39825 + }, + { + "epoch": 1.95, + "grad_norm": 0.6876334547996521, + "learning_rate": 0.00016330580470694762, + "loss": 3.1131, + "step": 39826 + }, + { + "epoch": 1.95, + "grad_norm": 0.6542200446128845, + "learning_rate": 0.00016329209980808765, + "loss": 3.0926, + "step": 39827 + }, + { + "epoch": 1.95, + "grad_norm": 0.7039780616760254, + "learning_rate": 0.00016327839526928995, + "loss": 2.9963, + "step": 39828 + }, + { + "epoch": 1.95, + "grad_norm": 0.6578562259674072, + "learning_rate": 0.00016326469109059077, + "loss": 3.1772, + "step": 39829 + }, + { + "epoch": 1.95, + "grad_norm": 0.6808101534843445, + "learning_rate": 0.00016325098727202608, + "loss": 3.1833, + "step": 39830 + }, + { + "epoch": 1.95, + "grad_norm": 0.650033175945282, + "learning_rate": 0.00016323728381363196, + "loss": 2.9292, + "step": 39831 + }, + { + "epoch": 1.95, + "grad_norm": 0.7056677341461182, + "learning_rate": 0.00016322358071544442, + "loss": 2.8928, + "step": 39832 + }, + { + "epoch": 1.95, + "grad_norm": 0.6719111204147339, + "learning_rate": 0.00016320987797749966, + "loss": 2.8892, + "step": 39833 + }, + { + "epoch": 1.95, + "grad_norm": 0.6775871515274048, + "learning_rate": 0.00016319617559983386, + "loss": 2.929, + "step": 39834 + }, + { + "epoch": 1.95, + "grad_norm": 0.6379905343055725, + "learning_rate": 0.00016318247358248285, + "loss": 3.0039, + "step": 39835 + }, + { + "epoch": 1.95, + "grad_norm": 0.636997640132904, + "learning_rate": 0.00016316877192548303, + "loss": 3.059, + "step": 39836 + }, + { + "epoch": 1.95, + "grad_norm": 0.6647437810897827, + "learning_rate": 0.00016315507062887033, + "loss": 2.9804, + "step": 39837 + }, + { + "epoch": 1.95, + "grad_norm": 0.6169809103012085, + "learning_rate": 0.00016314136969268069, + "loss": 3.0341, + "step": 39838 + }, + { + "epoch": 1.95, + "grad_norm": 0.661453366279602, + "learning_rate": 0.00016312766911695046, + "loss": 3.0617, + "step": 39839 + }, + { + "epoch": 1.95, + "grad_norm": 0.6447640657424927, + "learning_rate": 0.00016311396890171553, + "loss": 3.1025, + "step": 39840 + }, + { + "epoch": 1.95, + "grad_norm": 0.6600822806358337, + "learning_rate": 0.00016310026904701213, + "loss": 3.0792, + "step": 39841 + }, + { + "epoch": 1.95, + "grad_norm": 0.6973288655281067, + "learning_rate": 0.00016308656955287617, + "loss": 2.8341, + "step": 39842 + }, + { + "epoch": 1.95, + "grad_norm": 0.656122088432312, + "learning_rate": 0.0001630728704193439, + "loss": 3.031, + "step": 39843 + }, + { + "epoch": 1.95, + "grad_norm": 0.6374558210372925, + "learning_rate": 0.0001630591716464514, + "loss": 3.0751, + "step": 39844 + }, + { + "epoch": 1.95, + "grad_norm": 0.6601612567901611, + "learning_rate": 0.00016304547323423469, + "loss": 3.0094, + "step": 39845 + }, + { + "epoch": 1.95, + "grad_norm": 0.7332333326339722, + "learning_rate": 0.00016303177518272983, + "loss": 2.8679, + "step": 39846 + }, + { + "epoch": 1.95, + "grad_norm": 0.6448028683662415, + "learning_rate": 0.00016301807749197284, + "loss": 3.2066, + "step": 39847 + }, + { + "epoch": 1.95, + "grad_norm": 0.660399317741394, + "learning_rate": 0.00016300438016199982, + "loss": 3.1289, + "step": 39848 + }, + { + "epoch": 1.95, + "grad_norm": 0.5987576246261597, + "learning_rate": 0.00016299068319284706, + "loss": 3.0243, + "step": 39849 + }, + { + "epoch": 1.95, + "grad_norm": 0.6372739672660828, + "learning_rate": 0.0001629769865845503, + "loss": 2.5937, + "step": 39850 + }, + { + "epoch": 1.95, + "grad_norm": 0.7394205927848816, + "learning_rate": 0.00016296329033714597, + "loss": 3.0907, + "step": 39851 + }, + { + "epoch": 1.95, + "grad_norm": 0.6508176922798157, + "learning_rate": 0.0001629495944506699, + "loss": 3.2754, + "step": 39852 + }, + { + "epoch": 1.95, + "grad_norm": 0.6771491765975952, + "learning_rate": 0.00016293589892515817, + "loss": 3.012, + "step": 39853 + }, + { + "epoch": 1.95, + "grad_norm": 0.6909971833229065, + "learning_rate": 0.000162922203760647, + "loss": 3.0311, + "step": 39854 + }, + { + "epoch": 1.95, + "grad_norm": 0.7127150297164917, + "learning_rate": 0.00016290850895717222, + "loss": 3.0109, + "step": 39855 + }, + { + "epoch": 1.95, + "grad_norm": 0.6183626055717468, + "learning_rate": 0.00016289481451477015, + "loss": 2.9736, + "step": 39856 + }, + { + "epoch": 1.95, + "grad_norm": 0.6582019329071045, + "learning_rate": 0.0001628811204334767, + "loss": 2.9166, + "step": 39857 + }, + { + "epoch": 1.95, + "grad_norm": 0.5948113203048706, + "learning_rate": 0.00016286742671332803, + "loss": 3.0286, + "step": 39858 + }, + { + "epoch": 1.95, + "grad_norm": 0.6852336525917053, + "learning_rate": 0.00016285373335436022, + "loss": 2.9953, + "step": 39859 + }, + { + "epoch": 1.95, + "grad_norm": 0.7192272543907166, + "learning_rate": 0.00016284004035660915, + "loss": 2.9652, + "step": 39860 + }, + { + "epoch": 1.95, + "grad_norm": 0.705686092376709, + "learning_rate": 0.00016282634772011112, + "loss": 2.9451, + "step": 39861 + }, + { + "epoch": 1.95, + "grad_norm": 0.6376373767852783, + "learning_rate": 0.00016281265544490202, + "loss": 3.0674, + "step": 39862 + }, + { + "epoch": 1.95, + "grad_norm": 0.680194079875946, + "learning_rate": 0.00016279896353101793, + "loss": 3.0108, + "step": 39863 + }, + { + "epoch": 1.95, + "grad_norm": 0.6736658811569214, + "learning_rate": 0.00016278527197849507, + "loss": 2.8899, + "step": 39864 + }, + { + "epoch": 1.95, + "grad_norm": 0.688944935798645, + "learning_rate": 0.00016277158078736932, + "loss": 2.7236, + "step": 39865 + }, + { + "epoch": 1.95, + "grad_norm": 0.6319748163223267, + "learning_rate": 0.00016275788995767692, + "loss": 2.7577, + "step": 39866 + }, + { + "epoch": 1.95, + "grad_norm": 0.683836817741394, + "learning_rate": 0.00016274419948945383, + "loss": 2.9514, + "step": 39867 + }, + { + "epoch": 1.95, + "grad_norm": 0.6992816925048828, + "learning_rate": 0.00016273050938273598, + "loss": 3.1241, + "step": 39868 + }, + { + "epoch": 1.95, + "grad_norm": 0.6931037306785583, + "learning_rate": 0.00016271681963755968, + "loss": 3.0657, + "step": 39869 + }, + { + "epoch": 1.95, + "grad_norm": 0.7147294282913208, + "learning_rate": 0.0001627031302539607, + "loss": 2.9349, + "step": 39870 + }, + { + "epoch": 1.95, + "grad_norm": 0.6773857474327087, + "learning_rate": 0.0001626894412319754, + "loss": 2.9131, + "step": 39871 + }, + { + "epoch": 1.95, + "grad_norm": 0.6557199954986572, + "learning_rate": 0.0001626757525716396, + "loss": 2.7906, + "step": 39872 + }, + { + "epoch": 1.95, + "grad_norm": 0.6949388980865479, + "learning_rate": 0.00016266206427298954, + "loss": 3.2067, + "step": 39873 + }, + { + "epoch": 1.95, + "grad_norm": 0.6508885025978088, + "learning_rate": 0.00016264837633606113, + "loss": 2.8906, + "step": 39874 + }, + { + "epoch": 1.95, + "grad_norm": 0.6729174852371216, + "learning_rate": 0.00016263468876089038, + "loss": 3.0934, + "step": 39875 + }, + { + "epoch": 1.95, + "grad_norm": 0.6540240049362183, + "learning_rate": 0.00016262100154751353, + "loss": 2.9799, + "step": 39876 + }, + { + "epoch": 1.95, + "grad_norm": 0.7319003939628601, + "learning_rate": 0.00016260731469596639, + "loss": 3.0345, + "step": 39877 + }, + { + "epoch": 1.95, + "grad_norm": 0.6470528841018677, + "learning_rate": 0.00016259362820628513, + "loss": 2.9739, + "step": 39878 + }, + { + "epoch": 1.95, + "grad_norm": 0.6552753448486328, + "learning_rate": 0.000162579942078506, + "loss": 3.0781, + "step": 39879 + }, + { + "epoch": 1.95, + "grad_norm": 0.6793324947357178, + "learning_rate": 0.00016256625631266467, + "loss": 3.0156, + "step": 39880 + }, + { + "epoch": 1.95, + "grad_norm": 0.636117160320282, + "learning_rate": 0.00016255257090879753, + "loss": 3.1659, + "step": 39881 + }, + { + "epoch": 1.95, + "grad_norm": 0.6674087643623352, + "learning_rate": 0.00016253888586694045, + "loss": 3.0635, + "step": 39882 + }, + { + "epoch": 1.95, + "grad_norm": 0.7054731249809265, + "learning_rate": 0.00016252520118712935, + "loss": 3.1278, + "step": 39883 + }, + { + "epoch": 1.95, + "grad_norm": 0.6451898813247681, + "learning_rate": 0.00016251151686940055, + "loss": 2.7129, + "step": 39884 + }, + { + "epoch": 1.95, + "grad_norm": 0.6893364787101746, + "learning_rate": 0.00016249783291378986, + "loss": 3.0023, + "step": 39885 + }, + { + "epoch": 1.95, + "grad_norm": 0.6900562047958374, + "learning_rate": 0.0001624841493203335, + "loss": 3.0193, + "step": 39886 + }, + { + "epoch": 1.95, + "grad_norm": 0.6447057723999023, + "learning_rate": 0.00016247046608906728, + "loss": 3.0807, + "step": 39887 + }, + { + "epoch": 1.95, + "grad_norm": 0.7077726125717163, + "learning_rate": 0.00016245678322002755, + "loss": 2.8504, + "step": 39888 + }, + { + "epoch": 1.95, + "grad_norm": 0.6641325354576111, + "learning_rate": 0.00016244310071325017, + "loss": 3.2085, + "step": 39889 + }, + { + "epoch": 1.95, + "grad_norm": 0.701665461063385, + "learning_rate": 0.00016242941856877107, + "loss": 3.1182, + "step": 39890 + }, + { + "epoch": 1.95, + "grad_norm": 0.6562961339950562, + "learning_rate": 0.00016241573678662656, + "loss": 3.1035, + "step": 39891 + }, + { + "epoch": 1.96, + "grad_norm": 0.6483310461044312, + "learning_rate": 0.00016240205536685235, + "loss": 2.9854, + "step": 39892 + }, + { + "epoch": 1.96, + "grad_norm": 0.6495265364646912, + "learning_rate": 0.0001623883743094848, + "loss": 3.0443, + "step": 39893 + }, + { + "epoch": 1.96, + "grad_norm": 0.7036017179489136, + "learning_rate": 0.00016237469361455963, + "loss": 3.0209, + "step": 39894 + }, + { + "epoch": 1.96, + "grad_norm": 0.674313485622406, + "learning_rate": 0.00016236101328211317, + "loss": 2.8208, + "step": 39895 + }, + { + "epoch": 1.96, + "grad_norm": 0.7025102972984314, + "learning_rate": 0.00016234733331218132, + "loss": 2.8201, + "step": 39896 + }, + { + "epoch": 1.96, + "grad_norm": 0.66330885887146, + "learning_rate": 0.00016233365370479997, + "loss": 3.1881, + "step": 39897 + }, + { + "epoch": 1.96, + "grad_norm": 0.6553804874420166, + "learning_rate": 0.0001623199744600054, + "loss": 2.9553, + "step": 39898 + }, + { + "epoch": 1.96, + "grad_norm": 0.6467947959899902, + "learning_rate": 0.00016230629557783342, + "loss": 2.9607, + "step": 39899 + }, + { + "epoch": 1.96, + "grad_norm": 0.6393236517906189, + "learning_rate": 0.00016229261705832015, + "loss": 3.1014, + "step": 39900 + }, + { + "epoch": 1.96, + "grad_norm": 0.675190269947052, + "learning_rate": 0.00016227893890150173, + "loss": 2.8303, + "step": 39901 + }, + { + "epoch": 1.96, + "grad_norm": 0.6858446002006531, + "learning_rate": 0.00016226526110741396, + "loss": 2.9271, + "step": 39902 + }, + { + "epoch": 1.96, + "grad_norm": 0.6574386954307556, + "learning_rate": 0.00016225158367609314, + "loss": 2.8188, + "step": 39903 + }, + { + "epoch": 1.96, + "grad_norm": 0.6161906123161316, + "learning_rate": 0.0001622379066075751, + "loss": 2.8077, + "step": 39904 + }, + { + "epoch": 1.96, + "grad_norm": 0.6860578060150146, + "learning_rate": 0.0001622242299018958, + "loss": 2.8483, + "step": 39905 + }, + { + "epoch": 1.96, + "grad_norm": 0.7148755788803101, + "learning_rate": 0.00016221055355909148, + "loss": 3.0416, + "step": 39906 + }, + { + "epoch": 1.96, + "grad_norm": 0.7170261740684509, + "learning_rate": 0.00016219687757919793, + "loss": 2.8434, + "step": 39907 + }, + { + "epoch": 1.96, + "grad_norm": 0.6350681185722351, + "learning_rate": 0.00016218320196225138, + "loss": 3.034, + "step": 39908 + }, + { + "epoch": 1.96, + "grad_norm": 0.6371099948883057, + "learning_rate": 0.00016216952670828762, + "loss": 3.0398, + "step": 39909 + }, + { + "epoch": 1.96, + "grad_norm": 0.6577742695808411, + "learning_rate": 0.00016215585181734296, + "loss": 2.7016, + "step": 39910 + }, + { + "epoch": 1.96, + "grad_norm": 0.6610209345817566, + "learning_rate": 0.00016214217728945324, + "loss": 3.0579, + "step": 39911 + }, + { + "epoch": 1.96, + "grad_norm": 0.685881495475769, + "learning_rate": 0.00016212850312465437, + "loss": 2.9577, + "step": 39912 + }, + { + "epoch": 1.96, + "grad_norm": 0.6734350919723511, + "learning_rate": 0.00016211482932298264, + "loss": 3.0301, + "step": 39913 + }, + { + "epoch": 1.96, + "grad_norm": 0.6929466128349304, + "learning_rate": 0.00016210115588447378, + "loss": 3.0674, + "step": 39914 + }, + { + "epoch": 1.96, + "grad_norm": 0.6628326177597046, + "learning_rate": 0.00016208748280916393, + "loss": 3.0554, + "step": 39915 + }, + { + "epoch": 1.96, + "grad_norm": 0.6858869791030884, + "learning_rate": 0.00016207381009708923, + "loss": 2.9002, + "step": 39916 + }, + { + "epoch": 1.96, + "grad_norm": 0.6722764372825623, + "learning_rate": 0.00016206013774828557, + "loss": 2.8665, + "step": 39917 + }, + { + "epoch": 1.96, + "grad_norm": 0.6647232174873352, + "learning_rate": 0.00016204646576278893, + "loss": 3.0932, + "step": 39918 + }, + { + "epoch": 1.96, + "grad_norm": 0.7160334587097168, + "learning_rate": 0.00016203279414063525, + "loss": 2.9176, + "step": 39919 + }, + { + "epoch": 1.96, + "grad_norm": 0.6342455148696899, + "learning_rate": 0.00016201912288186067, + "loss": 3.0347, + "step": 39920 + }, + { + "epoch": 1.96, + "grad_norm": 0.711457371711731, + "learning_rate": 0.00016200545198650126, + "loss": 3.0407, + "step": 39921 + }, + { + "epoch": 1.96, + "grad_norm": 0.6820720434188843, + "learning_rate": 0.00016199178145459277, + "loss": 3.1249, + "step": 39922 + }, + { + "epoch": 1.96, + "grad_norm": 0.7002881765365601, + "learning_rate": 0.00016197811128617153, + "loss": 2.8607, + "step": 39923 + }, + { + "epoch": 1.96, + "grad_norm": 0.7007981538772583, + "learning_rate": 0.0001619644414812733, + "loss": 2.8763, + "step": 39924 + }, + { + "epoch": 1.96, + "grad_norm": 0.6570713520050049, + "learning_rate": 0.0001619507720399342, + "loss": 2.885, + "step": 39925 + }, + { + "epoch": 1.96, + "grad_norm": 0.6478700637817383, + "learning_rate": 0.00016193710296219025, + "loss": 3.041, + "step": 39926 + }, + { + "epoch": 1.96, + "grad_norm": 0.7194265127182007, + "learning_rate": 0.0001619234342480773, + "loss": 2.9688, + "step": 39927 + }, + { + "epoch": 1.96, + "grad_norm": 0.6922836303710938, + "learning_rate": 0.00016190976589763151, + "loss": 2.9779, + "step": 39928 + }, + { + "epoch": 1.96, + "grad_norm": 0.6822351217269897, + "learning_rate": 0.00016189609791088878, + "loss": 2.9213, + "step": 39929 + }, + { + "epoch": 1.96, + "grad_norm": 0.6374616622924805, + "learning_rate": 0.00016188243028788507, + "loss": 2.8969, + "step": 39930 + }, + { + "epoch": 1.96, + "grad_norm": 0.6212733387947083, + "learning_rate": 0.00016186876302865663, + "loss": 2.8603, + "step": 39931 + }, + { + "epoch": 1.96, + "grad_norm": 0.6491883397102356, + "learning_rate": 0.00016185509613323928, + "loss": 3.0147, + "step": 39932 + }, + { + "epoch": 1.96, + "grad_norm": 0.6715843677520752, + "learning_rate": 0.000161841429601669, + "loss": 2.8747, + "step": 39933 + }, + { + "epoch": 1.96, + "grad_norm": 0.658258318901062, + "learning_rate": 0.0001618277634339817, + "loss": 2.9536, + "step": 39934 + }, + { + "epoch": 1.96, + "grad_norm": 0.7133314609527588, + "learning_rate": 0.00016181409763021346, + "loss": 3.0629, + "step": 39935 + }, + { + "epoch": 1.96, + "grad_norm": 0.6304369568824768, + "learning_rate": 0.00016180043219040041, + "loss": 2.8939, + "step": 39936 + }, + { + "epoch": 1.96, + "grad_norm": 0.6943387985229492, + "learning_rate": 0.00016178676711457832, + "loss": 3.0713, + "step": 39937 + }, + { + "epoch": 1.96, + "grad_norm": 0.6555782556533813, + "learning_rate": 0.00016177310240278342, + "loss": 2.8656, + "step": 39938 + }, + { + "epoch": 1.96, + "grad_norm": 0.634103000164032, + "learning_rate": 0.00016175943805505152, + "loss": 2.9851, + "step": 39939 + }, + { + "epoch": 1.96, + "grad_norm": 0.6928660869598389, + "learning_rate": 0.00016174577407141857, + "loss": 2.9063, + "step": 39940 + }, + { + "epoch": 1.96, + "grad_norm": 0.6522265076637268, + "learning_rate": 0.0001617321104519208, + "loss": 2.973, + "step": 39941 + }, + { + "epoch": 1.96, + "grad_norm": 0.7050772905349731, + "learning_rate": 0.00016171844719659384, + "loss": 2.8929, + "step": 39942 + }, + { + "epoch": 1.96, + "grad_norm": 0.6689801216125488, + "learning_rate": 0.000161704784305474, + "loss": 3.0081, + "step": 39943 + }, + { + "epoch": 1.96, + "grad_norm": 0.7276343107223511, + "learning_rate": 0.0001616911217785971, + "loss": 2.9564, + "step": 39944 + }, + { + "epoch": 1.96, + "grad_norm": 0.6313931345939636, + "learning_rate": 0.00016167745961599914, + "loss": 3.0023, + "step": 39945 + }, + { + "epoch": 1.96, + "grad_norm": 0.6859260201454163, + "learning_rate": 0.0001616637978177162, + "loss": 2.778, + "step": 39946 + }, + { + "epoch": 1.96, + "grad_norm": 0.6386987566947937, + "learning_rate": 0.00016165013638378423, + "loss": 2.9645, + "step": 39947 + }, + { + "epoch": 1.96, + "grad_norm": 0.6548264026641846, + "learning_rate": 0.0001616364753142392, + "loss": 2.9473, + "step": 39948 + }, + { + "epoch": 1.96, + "grad_norm": 0.6422974467277527, + "learning_rate": 0.00016162281460911694, + "loss": 2.829, + "step": 39949 + }, + { + "epoch": 1.96, + "grad_norm": 0.6441436409950256, + "learning_rate": 0.0001616091542684535, + "loss": 3.045, + "step": 39950 + }, + { + "epoch": 1.96, + "grad_norm": 0.6431066989898682, + "learning_rate": 0.0001615954942922851, + "loss": 3.079, + "step": 39951 + }, + { + "epoch": 1.96, + "grad_norm": 0.6600309610366821, + "learning_rate": 0.00016158183468064738, + "loss": 3.1532, + "step": 39952 + }, + { + "epoch": 1.96, + "grad_norm": 0.7169448733329773, + "learning_rate": 0.00016156817543357656, + "loss": 2.9717, + "step": 39953 + }, + { + "epoch": 1.96, + "grad_norm": 0.6597325801849365, + "learning_rate": 0.00016155451655110856, + "loss": 3.0926, + "step": 39954 + }, + { + "epoch": 1.96, + "grad_norm": 0.6279834508895874, + "learning_rate": 0.0001615408580332792, + "loss": 2.9351, + "step": 39955 + }, + { + "epoch": 1.96, + "grad_norm": 0.6826247572898865, + "learning_rate": 0.00016152719988012467, + "loss": 2.9439, + "step": 39956 + }, + { + "epoch": 1.96, + "grad_norm": 0.6647608876228333, + "learning_rate": 0.00016151354209168074, + "loss": 2.9722, + "step": 39957 + }, + { + "epoch": 1.96, + "grad_norm": 0.6797239780426025, + "learning_rate": 0.00016149988466798365, + "loss": 3.0933, + "step": 39958 + }, + { + "epoch": 1.96, + "grad_norm": 0.7346055507659912, + "learning_rate": 0.000161486227609069, + "loss": 2.8231, + "step": 39959 + }, + { + "epoch": 1.96, + "grad_norm": 0.6545577049255371, + "learning_rate": 0.00016147257091497316, + "loss": 3.0375, + "step": 39960 + }, + { + "epoch": 1.96, + "grad_norm": 0.6826615929603577, + "learning_rate": 0.0001614589145857319, + "loss": 2.7554, + "step": 39961 + }, + { + "epoch": 1.96, + "grad_norm": 0.6645894646644592, + "learning_rate": 0.00016144525862138106, + "loss": 2.9856, + "step": 39962 + }, + { + "epoch": 1.96, + "grad_norm": 0.6471953392028809, + "learning_rate": 0.00016143160302195682, + "loss": 3.0498, + "step": 39963 + }, + { + "epoch": 1.96, + "grad_norm": 0.6736137866973877, + "learning_rate": 0.00016141794778749502, + "loss": 3.0682, + "step": 39964 + }, + { + "epoch": 1.96, + "grad_norm": 0.65671706199646, + "learning_rate": 0.00016140429291803166, + "loss": 2.9904, + "step": 39965 + }, + { + "epoch": 1.96, + "grad_norm": 0.6748507618904114, + "learning_rate": 0.00016139063841360285, + "loss": 3.1008, + "step": 39966 + }, + { + "epoch": 1.96, + "grad_norm": 0.6879149079322815, + "learning_rate": 0.00016137698427424427, + "loss": 2.9373, + "step": 39967 + }, + { + "epoch": 1.96, + "grad_norm": 0.706725001335144, + "learning_rate": 0.00016136333049999215, + "loss": 3.0366, + "step": 39968 + }, + { + "epoch": 1.96, + "grad_norm": 0.6533642411231995, + "learning_rate": 0.00016134967709088236, + "loss": 2.8464, + "step": 39969 + }, + { + "epoch": 1.96, + "grad_norm": 0.6878400444984436, + "learning_rate": 0.0001613360240469508, + "loss": 3.0078, + "step": 39970 + }, + { + "epoch": 1.96, + "grad_norm": 0.6259313225746155, + "learning_rate": 0.00016132237136823338, + "loss": 2.9485, + "step": 39971 + }, + { + "epoch": 1.96, + "grad_norm": 0.6699492335319519, + "learning_rate": 0.00016130871905476615, + "loss": 3.141, + "step": 39972 + }, + { + "epoch": 1.96, + "grad_norm": 0.653860867023468, + "learning_rate": 0.0001612950671065852, + "loss": 3.0009, + "step": 39973 + }, + { + "epoch": 1.96, + "grad_norm": 0.6694608330726624, + "learning_rate": 0.00016128141552372618, + "loss": 3.0055, + "step": 39974 + }, + { + "epoch": 1.96, + "grad_norm": 0.6927410960197449, + "learning_rate": 0.00016126776430622536, + "loss": 3.1213, + "step": 39975 + }, + { + "epoch": 1.96, + "grad_norm": 0.690608561038971, + "learning_rate": 0.00016125411345411855, + "loss": 2.9176, + "step": 39976 + }, + { + "epoch": 1.96, + "grad_norm": 0.6918655633926392, + "learning_rate": 0.00016124046296744159, + "loss": 2.9613, + "step": 39977 + }, + { + "epoch": 1.96, + "grad_norm": 0.6636469960212708, + "learning_rate": 0.00016122681284623064, + "loss": 2.9775, + "step": 39978 + }, + { + "epoch": 1.96, + "grad_norm": 0.6595686078071594, + "learning_rate": 0.00016121316309052145, + "loss": 2.8676, + "step": 39979 + }, + { + "epoch": 1.96, + "grad_norm": 0.6494026184082031, + "learning_rate": 0.00016119951370035021, + "loss": 2.8945, + "step": 39980 + }, + { + "epoch": 1.96, + "grad_norm": 0.7089259624481201, + "learning_rate": 0.0001611858646757526, + "loss": 2.8564, + "step": 39981 + }, + { + "epoch": 1.96, + "grad_norm": 0.6717854142189026, + "learning_rate": 0.0001611722160167647, + "loss": 2.9878, + "step": 39982 + }, + { + "epoch": 1.96, + "grad_norm": 0.6851207613945007, + "learning_rate": 0.00016115856772342275, + "loss": 3.0027, + "step": 39983 + }, + { + "epoch": 1.96, + "grad_norm": 0.6713508367538452, + "learning_rate": 0.00016114491979576215, + "loss": 3.0956, + "step": 39984 + }, + { + "epoch": 1.96, + "grad_norm": 0.6420844793319702, + "learning_rate": 0.00016113127223381919, + "loss": 3.0019, + "step": 39985 + }, + { + "epoch": 1.96, + "grad_norm": 0.63764488697052, + "learning_rate": 0.00016111762503762965, + "loss": 3.0084, + "step": 39986 + }, + { + "epoch": 1.96, + "grad_norm": 0.6774821877479553, + "learning_rate": 0.00016110397820722953, + "loss": 2.9804, + "step": 39987 + }, + { + "epoch": 1.96, + "grad_norm": 0.6736724376678467, + "learning_rate": 0.00016109033174265498, + "loss": 3.0381, + "step": 39988 + }, + { + "epoch": 1.96, + "grad_norm": 0.6819289922714233, + "learning_rate": 0.00016107668564394157, + "loss": 3.1965, + "step": 39989 + }, + { + "epoch": 1.96, + "grad_norm": 0.7004780769348145, + "learning_rate": 0.0001610630399111256, + "loss": 2.8297, + "step": 39990 + }, + { + "epoch": 1.96, + "grad_norm": 0.6639523506164551, + "learning_rate": 0.00016104939454424284, + "loss": 3.05, + "step": 39991 + }, + { + "epoch": 1.96, + "grad_norm": 0.7015445828437805, + "learning_rate": 0.00016103574954332906, + "loss": 2.9532, + "step": 39992 + }, + { + "epoch": 1.96, + "grad_norm": 0.6690757274627686, + "learning_rate": 0.00016102210490842058, + "loss": 2.9515, + "step": 39993 + }, + { + "epoch": 1.96, + "grad_norm": 0.6284964680671692, + "learning_rate": 0.00016100846063955295, + "loss": 3.1455, + "step": 39994 + }, + { + "epoch": 1.96, + "grad_norm": 0.7172853350639343, + "learning_rate": 0.00016099481673676242, + "loss": 2.9555, + "step": 39995 + }, + { + "epoch": 1.96, + "grad_norm": 0.6635323166847229, + "learning_rate": 0.00016098117320008465, + "loss": 3.0923, + "step": 39996 + }, + { + "epoch": 1.96, + "grad_norm": 0.6090884208679199, + "learning_rate": 0.00016096753002955587, + "loss": 3.099, + "step": 39997 + }, + { + "epoch": 1.96, + "grad_norm": 0.6727035045623779, + "learning_rate": 0.00016095388722521184, + "loss": 3.0681, + "step": 39998 + }, + { + "epoch": 1.96, + "grad_norm": 0.6885789036750793, + "learning_rate": 0.0001609402447870884, + "loss": 3.3607, + "step": 39999 + }, + { + "epoch": 1.96, + "grad_norm": 0.6628068685531616, + "learning_rate": 0.0001609266027152217, + "loss": 2.766, + "step": 40000 + }, + { + "epoch": 1.96, + "grad_norm": 0.7326607704162598, + "learning_rate": 0.00016091296100964746, + "loss": 2.8427, + "step": 40001 + }, + { + "epoch": 1.96, + "grad_norm": 0.637006938457489, + "learning_rate": 0.00016089931967040171, + "loss": 2.9066, + "step": 40002 + }, + { + "epoch": 1.96, + "grad_norm": 0.6497381329536438, + "learning_rate": 0.0001608856786975205, + "loss": 2.8787, + "step": 40003 + }, + { + "epoch": 1.96, + "grad_norm": 0.6747108101844788, + "learning_rate": 0.00016087203809103952, + "loss": 3.0503, + "step": 40004 + }, + { + "epoch": 1.96, + "grad_norm": 0.6333727240562439, + "learning_rate": 0.00016085839785099493, + "loss": 2.9565, + "step": 40005 + }, + { + "epoch": 1.96, + "grad_norm": 0.6521469950675964, + "learning_rate": 0.00016084475797742252, + "loss": 3.0972, + "step": 40006 + }, + { + "epoch": 1.96, + "grad_norm": 0.7146298885345459, + "learning_rate": 0.00016083111847035815, + "loss": 2.9459, + "step": 40007 + }, + { + "epoch": 1.96, + "grad_norm": 0.6354973912239075, + "learning_rate": 0.00016081747932983792, + "loss": 3.2163, + "step": 40008 + }, + { + "epoch": 1.96, + "grad_norm": 0.6594139337539673, + "learning_rate": 0.00016080384055589756, + "loss": 2.9049, + "step": 40009 + }, + { + "epoch": 1.96, + "grad_norm": 0.693379282951355, + "learning_rate": 0.00016079020214857325, + "loss": 2.8049, + "step": 40010 + }, + { + "epoch": 1.96, + "grad_norm": 0.6748722791671753, + "learning_rate": 0.0001607765641079006, + "loss": 2.7994, + "step": 40011 + }, + { + "epoch": 1.96, + "grad_norm": 0.6617879867553711, + "learning_rate": 0.0001607629264339158, + "loss": 2.9751, + "step": 40012 + }, + { + "epoch": 1.96, + "grad_norm": 0.6831246018409729, + "learning_rate": 0.00016074928912665468, + "loss": 2.9856, + "step": 40013 + }, + { + "epoch": 1.96, + "grad_norm": 0.6496304869651794, + "learning_rate": 0.000160735652186153, + "loss": 2.932, + "step": 40014 + }, + { + "epoch": 1.96, + "grad_norm": 0.6654090881347656, + "learning_rate": 0.0001607220156124469, + "loss": 3.2564, + "step": 40015 + }, + { + "epoch": 1.96, + "grad_norm": 0.690848171710968, + "learning_rate": 0.00016070837940557214, + "loss": 3.0275, + "step": 40016 + }, + { + "epoch": 1.96, + "grad_norm": 0.6424733400344849, + "learning_rate": 0.0001606947435655647, + "loss": 3.1051, + "step": 40017 + }, + { + "epoch": 1.96, + "grad_norm": 0.6729825735092163, + "learning_rate": 0.00016068110809246058, + "loss": 3.0315, + "step": 40018 + }, + { + "epoch": 1.96, + "grad_norm": 0.6387492418289185, + "learning_rate": 0.00016066747298629558, + "loss": 3.0497, + "step": 40019 + }, + { + "epoch": 1.96, + "grad_norm": 0.6648398637771606, + "learning_rate": 0.0001606538382471057, + "loss": 2.9517, + "step": 40020 + }, + { + "epoch": 1.96, + "grad_norm": 0.6861499547958374, + "learning_rate": 0.00016064020387492666, + "loss": 2.9324, + "step": 40021 + }, + { + "epoch": 1.96, + "grad_norm": 0.6802994608879089, + "learning_rate": 0.00016062656986979448, + "loss": 2.9477, + "step": 40022 + }, + { + "epoch": 1.96, + "grad_norm": 0.6521588563919067, + "learning_rate": 0.00016061293623174523, + "loss": 2.8447, + "step": 40023 + }, + { + "epoch": 1.96, + "grad_norm": 0.7014820575714111, + "learning_rate": 0.00016059930296081457, + "loss": 2.9435, + "step": 40024 + }, + { + "epoch": 1.96, + "grad_norm": 0.634397029876709, + "learning_rate": 0.00016058567005703864, + "loss": 3.1462, + "step": 40025 + }, + { + "epoch": 1.96, + "grad_norm": 0.6457260847091675, + "learning_rate": 0.00016057203752045307, + "loss": 2.9698, + "step": 40026 + }, + { + "epoch": 1.96, + "grad_norm": 0.7065685391426086, + "learning_rate": 0.00016055840535109407, + "loss": 3.1251, + "step": 40027 + }, + { + "epoch": 1.96, + "grad_norm": 0.6713890433311462, + "learning_rate": 0.00016054477354899739, + "loss": 2.9537, + "step": 40028 + }, + { + "epoch": 1.96, + "grad_norm": 0.674758791923523, + "learning_rate": 0.0001605311421141988, + "loss": 3.1935, + "step": 40029 + }, + { + "epoch": 1.96, + "grad_norm": 0.6733911633491516, + "learning_rate": 0.00016051751104673447, + "loss": 3.0881, + "step": 40030 + }, + { + "epoch": 1.96, + "grad_norm": 0.6857780814170837, + "learning_rate": 0.00016050388034664006, + "loss": 2.8133, + "step": 40031 + }, + { + "epoch": 1.96, + "grad_norm": 0.6858083605766296, + "learning_rate": 0.00016049025001395158, + "loss": 2.9045, + "step": 40032 + }, + { + "epoch": 1.96, + "grad_norm": 0.6884641647338867, + "learning_rate": 0.00016047662004870506, + "loss": 3.1941, + "step": 40033 + }, + { + "epoch": 1.96, + "grad_norm": 0.6528140902519226, + "learning_rate": 0.00016046299045093624, + "loss": 2.9926, + "step": 40034 + }, + { + "epoch": 1.96, + "grad_norm": 0.644972026348114, + "learning_rate": 0.00016044936122068106, + "loss": 2.9634, + "step": 40035 + }, + { + "epoch": 1.96, + "grad_norm": 0.6678783297538757, + "learning_rate": 0.00016043573235797532, + "loss": 3.021, + "step": 40036 + }, + { + "epoch": 1.96, + "grad_norm": 0.6511356830596924, + "learning_rate": 0.00016042210386285497, + "loss": 2.9231, + "step": 40037 + }, + { + "epoch": 1.96, + "grad_norm": 0.6782459020614624, + "learning_rate": 0.00016040847573535607, + "loss": 2.848, + "step": 40038 + }, + { + "epoch": 1.96, + "grad_norm": 0.7153217792510986, + "learning_rate": 0.00016039484797551425, + "loss": 3.042, + "step": 40039 + }, + { + "epoch": 1.96, + "grad_norm": 0.6678678393363953, + "learning_rate": 0.00016038122058336567, + "loss": 3.0906, + "step": 40040 + }, + { + "epoch": 1.96, + "grad_norm": 0.6697278618812561, + "learning_rate": 0.00016036759355894604, + "loss": 3.0417, + "step": 40041 + }, + { + "epoch": 1.96, + "grad_norm": 0.6855822801589966, + "learning_rate": 0.00016035396690229124, + "loss": 3.2378, + "step": 40042 + }, + { + "epoch": 1.96, + "grad_norm": 0.6451258659362793, + "learning_rate": 0.00016034034061343728, + "loss": 3.1821, + "step": 40043 + }, + { + "epoch": 1.96, + "grad_norm": 0.8559314012527466, + "learning_rate": 0.00016032671469241988, + "loss": 2.936, + "step": 40044 + }, + { + "epoch": 1.96, + "grad_norm": 0.6715162396430969, + "learning_rate": 0.00016031308913927515, + "loss": 3.0956, + "step": 40045 + }, + { + "epoch": 1.96, + "grad_norm": 0.6763110160827637, + "learning_rate": 0.00016029946395403877, + "loss": 3.0766, + "step": 40046 + }, + { + "epoch": 1.96, + "grad_norm": 0.6643348336219788, + "learning_rate": 0.0001602858391367468, + "loss": 2.9967, + "step": 40047 + }, + { + "epoch": 1.96, + "grad_norm": 0.6329568028450012, + "learning_rate": 0.00016027221468743497, + "loss": 3.013, + "step": 40048 + }, + { + "epoch": 1.96, + "grad_norm": 0.6494516134262085, + "learning_rate": 0.0001602585906061393, + "loss": 2.789, + "step": 40049 + }, + { + "epoch": 1.96, + "grad_norm": 0.6577889919281006, + "learning_rate": 0.00016024496689289564, + "loss": 3.0301, + "step": 40050 + }, + { + "epoch": 1.96, + "grad_norm": 0.7097561359405518, + "learning_rate": 0.0001602313435477397, + "loss": 2.8324, + "step": 40051 + }, + { + "epoch": 1.96, + "grad_norm": 0.652508556842804, + "learning_rate": 0.0001602177205707076, + "loss": 2.8232, + "step": 40052 + }, + { + "epoch": 1.96, + "grad_norm": 0.6659618020057678, + "learning_rate": 0.00016020409796183507, + "loss": 2.7219, + "step": 40053 + }, + { + "epoch": 1.96, + "grad_norm": 0.6810920834541321, + "learning_rate": 0.000160190475721158, + "loss": 2.9999, + "step": 40054 + }, + { + "epoch": 1.96, + "grad_norm": 0.7163436412811279, + "learning_rate": 0.00016017685384871243, + "loss": 3.0742, + "step": 40055 + }, + { + "epoch": 1.96, + "grad_norm": 0.6159772276878357, + "learning_rate": 0.00016016323234453413, + "loss": 2.757, + "step": 40056 + }, + { + "epoch": 1.96, + "grad_norm": 0.642678439617157, + "learning_rate": 0.00016014961120865894, + "loss": 2.9309, + "step": 40057 + }, + { + "epoch": 1.96, + "grad_norm": 0.7379891276359558, + "learning_rate": 0.0001601359904411227, + "loss": 2.9141, + "step": 40058 + }, + { + "epoch": 1.96, + "grad_norm": 0.6562783122062683, + "learning_rate": 0.0001601223700419613, + "loss": 3.2553, + "step": 40059 + }, + { + "epoch": 1.96, + "grad_norm": 0.6276502013206482, + "learning_rate": 0.00016010875001121075, + "loss": 3.117, + "step": 40060 + }, + { + "epoch": 1.96, + "grad_norm": 0.681545615196228, + "learning_rate": 0.00016009513034890674, + "loss": 3.1442, + "step": 40061 + }, + { + "epoch": 1.96, + "grad_norm": 0.6715508699417114, + "learning_rate": 0.00016008151105508536, + "loss": 2.8528, + "step": 40062 + }, + { + "epoch": 1.96, + "grad_norm": 0.6921969652175903, + "learning_rate": 0.00016006789212978233, + "loss": 3.1951, + "step": 40063 + }, + { + "epoch": 1.96, + "grad_norm": 0.635017454624176, + "learning_rate": 0.00016005427357303347, + "loss": 3.0769, + "step": 40064 + }, + { + "epoch": 1.96, + "grad_norm": 0.6426366567611694, + "learning_rate": 0.00016004065538487478, + "loss": 3.112, + "step": 40065 + }, + { + "epoch": 1.96, + "grad_norm": 0.6538386940956116, + "learning_rate": 0.00016002703756534195, + "loss": 3.0287, + "step": 40066 + }, + { + "epoch": 1.96, + "grad_norm": 0.6401711702346802, + "learning_rate": 0.00016001342011447113, + "loss": 2.9856, + "step": 40067 + }, + { + "epoch": 1.96, + "grad_norm": 0.6818058490753174, + "learning_rate": 0.00015999980303229788, + "loss": 2.9975, + "step": 40068 + }, + { + "epoch": 1.96, + "grad_norm": 0.6866037845611572, + "learning_rate": 0.0001599861863188582, + "loss": 2.8041, + "step": 40069 + }, + { + "epoch": 1.96, + "grad_norm": 0.6289970874786377, + "learning_rate": 0.00015997256997418808, + "loss": 2.9064, + "step": 40070 + }, + { + "epoch": 1.96, + "grad_norm": 0.6188867688179016, + "learning_rate": 0.0001599589539983233, + "loss": 2.975, + "step": 40071 + }, + { + "epoch": 1.96, + "grad_norm": 0.6899058818817139, + "learning_rate": 0.00015994533839129961, + "loss": 3.1588, + "step": 40072 + }, + { + "epoch": 1.96, + "grad_norm": 0.6458503603935242, + "learning_rate": 0.00015993172315315285, + "loss": 2.9297, + "step": 40073 + }, + { + "epoch": 1.96, + "grad_norm": 0.6466934084892273, + "learning_rate": 0.000159918108283919, + "loss": 2.9095, + "step": 40074 + }, + { + "epoch": 1.96, + "grad_norm": 0.6431629657745361, + "learning_rate": 0.000159904493783634, + "loss": 3.019, + "step": 40075 + }, + { + "epoch": 1.96, + "grad_norm": 0.6387059688568115, + "learning_rate": 0.00015989087965233347, + "loss": 2.7465, + "step": 40076 + }, + { + "epoch": 1.96, + "grad_norm": 0.6597829461097717, + "learning_rate": 0.00015987726589005352, + "loss": 3.1271, + "step": 40077 + }, + { + "epoch": 1.96, + "grad_norm": 0.673991858959198, + "learning_rate": 0.00015986365249682992, + "loss": 3.0424, + "step": 40078 + }, + { + "epoch": 1.96, + "grad_norm": 0.6489351391792297, + "learning_rate": 0.00015985003947269834, + "loss": 2.8951, + "step": 40079 + }, + { + "epoch": 1.96, + "grad_norm": 0.7270581722259521, + "learning_rate": 0.00015983642681769494, + "loss": 2.8268, + "step": 40080 + }, + { + "epoch": 1.96, + "grad_norm": 0.7210774421691895, + "learning_rate": 0.00015982281453185522, + "loss": 3.0683, + "step": 40081 + }, + { + "epoch": 1.96, + "grad_norm": 0.6519688963890076, + "learning_rate": 0.0001598092026152154, + "loss": 2.9316, + "step": 40082 + }, + { + "epoch": 1.96, + "grad_norm": 0.6710745692253113, + "learning_rate": 0.00015979559106781103, + "loss": 3.1201, + "step": 40083 + }, + { + "epoch": 1.96, + "grad_norm": 0.6843429803848267, + "learning_rate": 0.00015978197988967811, + "loss": 2.9192, + "step": 40084 + }, + { + "epoch": 1.96, + "grad_norm": 0.6874869465827942, + "learning_rate": 0.00015976836908085268, + "loss": 2.9876, + "step": 40085 + }, + { + "epoch": 1.96, + "grad_norm": 0.6737939715385437, + "learning_rate": 0.00015975475864137017, + "loss": 2.9837, + "step": 40086 + }, + { + "epoch": 1.96, + "grad_norm": 0.69536292552948, + "learning_rate": 0.00015974114857126675, + "loss": 3.025, + "step": 40087 + }, + { + "epoch": 1.96, + "grad_norm": 0.6652324795722961, + "learning_rate": 0.00015972753887057799, + "loss": 2.9949, + "step": 40088 + }, + { + "epoch": 1.96, + "grad_norm": 0.6829466223716736, + "learning_rate": 0.0001597139295393399, + "loss": 3.1127, + "step": 40089 + }, + { + "epoch": 1.96, + "grad_norm": 0.7316030859947205, + "learning_rate": 0.0001597003205775885, + "loss": 2.9742, + "step": 40090 + }, + { + "epoch": 1.96, + "grad_norm": 0.6916874051094055, + "learning_rate": 0.0001596867119853593, + "loss": 3.2263, + "step": 40091 + }, + { + "epoch": 1.96, + "grad_norm": 0.647091805934906, + "learning_rate": 0.00015967310376268837, + "loss": 3.0349, + "step": 40092 + }, + { + "epoch": 1.96, + "grad_norm": 0.6750726699829102, + "learning_rate": 0.00015965949590961156, + "loss": 2.8189, + "step": 40093 + }, + { + "epoch": 1.96, + "grad_norm": 0.6377853751182556, + "learning_rate": 0.00015964588842616446, + "loss": 2.7829, + "step": 40094 + }, + { + "epoch": 1.96, + "grad_norm": 0.7172527313232422, + "learning_rate": 0.00015963228131238324, + "loss": 3.0597, + "step": 40095 + }, + { + "epoch": 1.97, + "grad_norm": 0.6895819902420044, + "learning_rate": 0.00015961867456830339, + "loss": 3.0861, + "step": 40096 + }, + { + "epoch": 1.97, + "grad_norm": 0.7082463502883911, + "learning_rate": 0.0001596050681939611, + "loss": 2.7711, + "step": 40097 + }, + { + "epoch": 1.97, + "grad_norm": 0.6395615339279175, + "learning_rate": 0.00015959146218939193, + "loss": 3.0708, + "step": 40098 + }, + { + "epoch": 1.97, + "grad_norm": 0.6931965351104736, + "learning_rate": 0.00015957785655463193, + "loss": 2.9829, + "step": 40099 + }, + { + "epoch": 1.97, + "grad_norm": 0.6493892073631287, + "learning_rate": 0.00015956425128971682, + "loss": 3.0342, + "step": 40100 + }, + { + "epoch": 1.97, + "grad_norm": 0.6747897267341614, + "learning_rate": 0.0001595506463946823, + "loss": 2.8114, + "step": 40101 + }, + { + "epoch": 1.97, + "grad_norm": 0.6698330044746399, + "learning_rate": 0.00015953704186956452, + "loss": 3.0373, + "step": 40102 + }, + { + "epoch": 1.97, + "grad_norm": 0.6742802262306213, + "learning_rate": 0.00015952343771439903, + "loss": 3.0902, + "step": 40103 + }, + { + "epoch": 1.97, + "grad_norm": 0.6375805735588074, + "learning_rate": 0.00015950983392922175, + "loss": 3.0517, + "step": 40104 + }, + { + "epoch": 1.97, + "grad_norm": 0.6356774568557739, + "learning_rate": 0.0001594962305140686, + "loss": 2.8752, + "step": 40105 + }, + { + "epoch": 1.97, + "grad_norm": 0.6412781476974487, + "learning_rate": 0.00015948262746897526, + "loss": 3.0477, + "step": 40106 + }, + { + "epoch": 1.97, + "grad_norm": 0.6381820440292358, + "learning_rate": 0.00015946902479397777, + "loss": 3.0045, + "step": 40107 + }, + { + "epoch": 1.97, + "grad_norm": 0.6312680840492249, + "learning_rate": 0.0001594554224891118, + "loss": 2.9117, + "step": 40108 + }, + { + "epoch": 1.97, + "grad_norm": 0.6366028785705566, + "learning_rate": 0.0001594418205544131, + "loss": 3.0584, + "step": 40109 + }, + { + "epoch": 1.97, + "grad_norm": 0.6530911922454834, + "learning_rate": 0.0001594282189899177, + "loss": 2.8788, + "step": 40110 + }, + { + "epoch": 1.97, + "grad_norm": 0.653728187084198, + "learning_rate": 0.0001594146177956612, + "loss": 2.8951, + "step": 40111 + }, + { + "epoch": 1.97, + "grad_norm": 0.6730470061302185, + "learning_rate": 0.0001594010169716797, + "loss": 3.071, + "step": 40112 + }, + { + "epoch": 1.97, + "grad_norm": 0.6829031705856323, + "learning_rate": 0.00015938741651800868, + "loss": 2.9737, + "step": 40113 + }, + { + "epoch": 1.97, + "grad_norm": 0.6463053226470947, + "learning_rate": 0.00015937381643468427, + "loss": 3.0131, + "step": 40114 + }, + { + "epoch": 1.97, + "grad_norm": 0.6664642095565796, + "learning_rate": 0.00015936021672174216, + "loss": 2.7908, + "step": 40115 + }, + { + "epoch": 1.97, + "grad_norm": 0.6486683487892151, + "learning_rate": 0.00015934661737921808, + "loss": 3.1284, + "step": 40116 + }, + { + "epoch": 1.97, + "grad_norm": 0.6821164488792419, + "learning_rate": 0.00015933301840714804, + "loss": 3.0977, + "step": 40117 + }, + { + "epoch": 1.97, + "grad_norm": 0.6829363107681274, + "learning_rate": 0.00015931941980556767, + "loss": 3.0408, + "step": 40118 + }, + { + "epoch": 1.97, + "grad_norm": 0.7041226625442505, + "learning_rate": 0.00015930582157451284, + "loss": 2.9775, + "step": 40119 + }, + { + "epoch": 1.97, + "grad_norm": 0.6732624769210815, + "learning_rate": 0.00015929222371401956, + "loss": 2.9952, + "step": 40120 + }, + { + "epoch": 1.97, + "grad_norm": 0.6260468363761902, + "learning_rate": 0.00015927862622412345, + "loss": 3.1095, + "step": 40121 + }, + { + "epoch": 1.97, + "grad_norm": 0.6741329431533813, + "learning_rate": 0.00015926502910486033, + "loss": 2.9427, + "step": 40122 + }, + { + "epoch": 1.97, + "grad_norm": 0.6579768657684326, + "learning_rate": 0.00015925143235626595, + "loss": 3.0081, + "step": 40123 + }, + { + "epoch": 1.97, + "grad_norm": 0.6937235593795776, + "learning_rate": 0.00015923783597837635, + "loss": 3.1918, + "step": 40124 + }, + { + "epoch": 1.97, + "grad_norm": 0.6266592144966125, + "learning_rate": 0.00015922423997122705, + "loss": 2.9933, + "step": 40125 + }, + { + "epoch": 1.97, + "grad_norm": 0.6419720649719238, + "learning_rate": 0.00015921064433485404, + "loss": 2.8718, + "step": 40126 + }, + { + "epoch": 1.97, + "grad_norm": 0.7935446500778198, + "learning_rate": 0.00015919704906929317, + "loss": 2.76, + "step": 40127 + }, + { + "epoch": 1.97, + "grad_norm": 0.6469537615776062, + "learning_rate": 0.00015918345417458008, + "loss": 2.8866, + "step": 40128 + }, + { + "epoch": 1.97, + "grad_norm": 0.6699641346931458, + "learning_rate": 0.00015916985965075078, + "loss": 2.925, + "step": 40129 + }, + { + "epoch": 1.97, + "grad_norm": 0.6339029669761658, + "learning_rate": 0.000159156265497841, + "loss": 2.9374, + "step": 40130 + }, + { + "epoch": 1.97, + "grad_norm": 0.6614867448806763, + "learning_rate": 0.00015914267171588637, + "loss": 2.8457, + "step": 40131 + }, + { + "epoch": 1.97, + "grad_norm": 0.6942468881607056, + "learning_rate": 0.00015912907830492292, + "loss": 3.162, + "step": 40132 + }, + { + "epoch": 1.97, + "grad_norm": 0.6900768280029297, + "learning_rate": 0.00015911548526498627, + "loss": 3.1054, + "step": 40133 + }, + { + "epoch": 1.97, + "grad_norm": 0.6508978009223938, + "learning_rate": 0.00015910189259611248, + "loss": 2.7635, + "step": 40134 + }, + { + "epoch": 1.97, + "grad_norm": 0.6943615674972534, + "learning_rate": 0.00015908830029833702, + "loss": 3.0908, + "step": 40135 + }, + { + "epoch": 1.97, + "grad_norm": 0.6539590954780579, + "learning_rate": 0.00015907470837169598, + "loss": 2.8507, + "step": 40136 + }, + { + "epoch": 1.97, + "grad_norm": 0.6708890795707703, + "learning_rate": 0.0001590611168162251, + "loss": 3.1577, + "step": 40137 + }, + { + "epoch": 1.97, + "grad_norm": 0.7016001343727112, + "learning_rate": 0.00015904752563195994, + "loss": 2.9599, + "step": 40138 + }, + { + "epoch": 1.97, + "grad_norm": 0.716665506362915, + "learning_rate": 0.00015903393481893658, + "loss": 3.1311, + "step": 40139 + }, + { + "epoch": 1.97, + "grad_norm": 0.6903090476989746, + "learning_rate": 0.0001590203443771906, + "loss": 2.9423, + "step": 40140 + }, + { + "epoch": 1.97, + "grad_norm": 0.6860828995704651, + "learning_rate": 0.0001590067543067579, + "loss": 3.0349, + "step": 40141 + }, + { + "epoch": 1.97, + "grad_norm": 0.6501972675323486, + "learning_rate": 0.00015899316460767437, + "loss": 2.9293, + "step": 40142 + }, + { + "epoch": 1.97, + "grad_norm": 0.6735811233520508, + "learning_rate": 0.00015897957527997574, + "loss": 3.2349, + "step": 40143 + }, + { + "epoch": 1.97, + "grad_norm": 0.6617088317871094, + "learning_rate": 0.00015896598632369774, + "loss": 2.9299, + "step": 40144 + }, + { + "epoch": 1.97, + "grad_norm": 0.6840248703956604, + "learning_rate": 0.00015895239773887611, + "loss": 3.0315, + "step": 40145 + }, + { + "epoch": 1.97, + "grad_norm": 0.6260735988616943, + "learning_rate": 0.0001589388095255467, + "loss": 2.7975, + "step": 40146 + }, + { + "epoch": 1.97, + "grad_norm": 0.7066246867179871, + "learning_rate": 0.00015892522168374547, + "loss": 2.9681, + "step": 40147 + }, + { + "epoch": 1.97, + "grad_norm": 0.6458545327186584, + "learning_rate": 0.00015891163421350788, + "loss": 2.8748, + "step": 40148 + }, + { + "epoch": 1.97, + "grad_norm": 0.6603613495826721, + "learning_rate": 0.00015889804711487007, + "loss": 3.029, + "step": 40149 + }, + { + "epoch": 1.97, + "grad_norm": 0.6358199715614319, + "learning_rate": 0.00015888446038786746, + "loss": 2.9619, + "step": 40150 + }, + { + "epoch": 1.97, + "grad_norm": 0.7179542779922485, + "learning_rate": 0.00015887087403253618, + "loss": 2.9119, + "step": 40151 + }, + { + "epoch": 1.97, + "grad_norm": 0.6448338627815247, + "learning_rate": 0.00015885728804891186, + "loss": 2.9725, + "step": 40152 + }, + { + "epoch": 1.97, + "grad_norm": 0.7053864002227783, + "learning_rate": 0.00015884370243703018, + "loss": 2.6094, + "step": 40153 + }, + { + "epoch": 1.97, + "grad_norm": 0.6579211354255676, + "learning_rate": 0.0001588301171969271, + "loss": 2.9604, + "step": 40154 + }, + { + "epoch": 1.97, + "grad_norm": 0.6514232754707336, + "learning_rate": 0.00015881653232863822, + "loss": 3.1176, + "step": 40155 + }, + { + "epoch": 1.97, + "grad_norm": 0.6942654848098755, + "learning_rate": 0.00015880294783219943, + "loss": 3.1528, + "step": 40156 + }, + { + "epoch": 1.97, + "grad_norm": 0.7441583871841431, + "learning_rate": 0.0001587893637076466, + "loss": 2.9195, + "step": 40157 + }, + { + "epoch": 1.97, + "grad_norm": 0.6600908041000366, + "learning_rate": 0.00015877577995501542, + "loss": 2.9988, + "step": 40158 + }, + { + "epoch": 1.97, + "grad_norm": 0.6638275980949402, + "learning_rate": 0.00015876219657434164, + "loss": 2.9348, + "step": 40159 + }, + { + "epoch": 1.97, + "grad_norm": 0.6445525288581848, + "learning_rate": 0.00015874861356566096, + "loss": 2.8587, + "step": 40160 + }, + { + "epoch": 1.97, + "grad_norm": 0.6753540635108948, + "learning_rate": 0.00015873503092900924, + "loss": 2.8209, + "step": 40161 + }, + { + "epoch": 1.97, + "grad_norm": 0.6722790002822876, + "learning_rate": 0.0001587214486644224, + "loss": 2.9628, + "step": 40162 + }, + { + "epoch": 1.97, + "grad_norm": 0.7291383147239685, + "learning_rate": 0.00015870786677193592, + "loss": 3.0806, + "step": 40163 + }, + { + "epoch": 1.97, + "grad_norm": 0.6721978187561035, + "learning_rate": 0.00015869428525158588, + "loss": 3.0379, + "step": 40164 + }, + { + "epoch": 1.97, + "grad_norm": 0.7075223922729492, + "learning_rate": 0.00015868070410340787, + "loss": 3.0957, + "step": 40165 + }, + { + "epoch": 1.97, + "grad_norm": 0.6854174733161926, + "learning_rate": 0.00015866712332743757, + "loss": 2.9606, + "step": 40166 + }, + { + "epoch": 1.97, + "grad_norm": 0.678521454334259, + "learning_rate": 0.00015865354292371098, + "loss": 2.8023, + "step": 40167 + }, + { + "epoch": 1.97, + "grad_norm": 0.6438807249069214, + "learning_rate": 0.0001586399628922637, + "loss": 3.0427, + "step": 40168 + }, + { + "epoch": 1.97, + "grad_norm": 0.6650844216346741, + "learning_rate": 0.00015862638323313157, + "loss": 3.129, + "step": 40169 + }, + { + "epoch": 1.97, + "grad_norm": 0.6512045860290527, + "learning_rate": 0.00015861280394635028, + "loss": 3.1556, + "step": 40170 + }, + { + "epoch": 1.97, + "grad_norm": 0.6742256879806519, + "learning_rate": 0.00015859922503195563, + "loss": 2.9698, + "step": 40171 + }, + { + "epoch": 1.97, + "grad_norm": 0.6796380877494812, + "learning_rate": 0.00015858564648998355, + "loss": 2.9838, + "step": 40172 + }, + { + "epoch": 1.97, + "grad_norm": 0.6988964080810547, + "learning_rate": 0.00015857206832046967, + "loss": 2.9046, + "step": 40173 + }, + { + "epoch": 1.97, + "grad_norm": 0.6292441487312317, + "learning_rate": 0.00015855849052344974, + "loss": 3.0492, + "step": 40174 + }, + { + "epoch": 1.97, + "grad_norm": 0.6308088898658752, + "learning_rate": 0.0001585449130989594, + "loss": 2.9864, + "step": 40175 + }, + { + "epoch": 1.97, + "grad_norm": 0.6453452706336975, + "learning_rate": 0.00015853133604703452, + "loss": 3.0101, + "step": 40176 + }, + { + "epoch": 1.97, + "grad_norm": 0.7138879299163818, + "learning_rate": 0.00015851775936771102, + "loss": 3.0329, + "step": 40177 + }, + { + "epoch": 1.97, + "grad_norm": 0.687667191028595, + "learning_rate": 0.00015850418306102436, + "loss": 2.7237, + "step": 40178 + }, + { + "epoch": 1.97, + "grad_norm": 0.720391571521759, + "learning_rate": 0.0001584906071270106, + "loss": 3.1241, + "step": 40179 + }, + { + "epoch": 1.97, + "grad_norm": 0.6851097345352173, + "learning_rate": 0.0001584770315657053, + "loss": 2.8208, + "step": 40180 + }, + { + "epoch": 1.97, + "grad_norm": 0.6699183583259583, + "learning_rate": 0.00015846345637714418, + "loss": 3.0279, + "step": 40181 + }, + { + "epoch": 1.97, + "grad_norm": 0.6366994976997375, + "learning_rate": 0.00015844988156136323, + "loss": 2.9347, + "step": 40182 + }, + { + "epoch": 1.97, + "grad_norm": 0.6660268306732178, + "learning_rate": 0.00015843630711839788, + "loss": 2.9707, + "step": 40183 + }, + { + "epoch": 1.97, + "grad_norm": 0.6552741527557373, + "learning_rate": 0.00015842273304828415, + "loss": 2.951, + "step": 40184 + }, + { + "epoch": 1.97, + "grad_norm": 0.7053703665733337, + "learning_rate": 0.0001584091593510576, + "loss": 2.9394, + "step": 40185 + }, + { + "epoch": 1.97, + "grad_norm": 0.6609983444213867, + "learning_rate": 0.00015839558602675423, + "loss": 2.9316, + "step": 40186 + }, + { + "epoch": 1.97, + "grad_norm": 0.6549779772758484, + "learning_rate": 0.00015838201307540959, + "loss": 2.8942, + "step": 40187 + }, + { + "epoch": 1.97, + "grad_norm": 0.6527225971221924, + "learning_rate": 0.0001583684404970594, + "loss": 3.155, + "step": 40188 + }, + { + "epoch": 1.97, + "grad_norm": 0.6407042145729065, + "learning_rate": 0.00015835486829173954, + "loss": 2.8456, + "step": 40189 + }, + { + "epoch": 1.97, + "grad_norm": 0.6975271701812744, + "learning_rate": 0.00015834129645948564, + "loss": 2.7692, + "step": 40190 + }, + { + "epoch": 1.97, + "grad_norm": 0.6264335513114929, + "learning_rate": 0.00015832772500033344, + "loss": 2.8132, + "step": 40191 + }, + { + "epoch": 1.97, + "grad_norm": 0.6382396817207336, + "learning_rate": 0.00015831415391431893, + "loss": 2.8311, + "step": 40192 + }, + { + "epoch": 1.97, + "grad_norm": 0.6645261645317078, + "learning_rate": 0.0001583005832014775, + "loss": 3.2165, + "step": 40193 + }, + { + "epoch": 1.97, + "grad_norm": 0.6498206257820129, + "learning_rate": 0.00015828701286184517, + "loss": 2.9257, + "step": 40194 + }, + { + "epoch": 1.97, + "grad_norm": 0.6160268187522888, + "learning_rate": 0.00015827344289545765, + "loss": 2.9067, + "step": 40195 + }, + { + "epoch": 1.97, + "grad_norm": 0.6547241806983948, + "learning_rate": 0.00015825987330235044, + "loss": 2.8953, + "step": 40196 + }, + { + "epoch": 1.97, + "grad_norm": 0.6759021282196045, + "learning_rate": 0.00015824630408255955, + "loss": 2.8691, + "step": 40197 + }, + { + "epoch": 1.97, + "grad_norm": 0.6542566418647766, + "learning_rate": 0.0001582327352361205, + "loss": 2.8604, + "step": 40198 + }, + { + "epoch": 1.97, + "grad_norm": 0.6945884227752686, + "learning_rate": 0.0001582191667630693, + "loss": 3.0368, + "step": 40199 + }, + { + "epoch": 1.97, + "grad_norm": 0.6422370076179504, + "learning_rate": 0.00015820559866344138, + "loss": 2.9877, + "step": 40200 + }, + { + "epoch": 1.97, + "grad_norm": 0.6469019651412964, + "learning_rate": 0.00015819203093727271, + "loss": 3.0606, + "step": 40201 + }, + { + "epoch": 1.97, + "grad_norm": 0.6708106398582458, + "learning_rate": 0.000158178463584599, + "loss": 3.2249, + "step": 40202 + }, + { + "epoch": 1.97, + "grad_norm": 0.6726407408714294, + "learning_rate": 0.00015816489660545579, + "loss": 2.8245, + "step": 40203 + }, + { + "epoch": 1.97, + "grad_norm": 0.6494171619415283, + "learning_rate": 0.00015815132999987905, + "loss": 3.1015, + "step": 40204 + }, + { + "epoch": 1.97, + "grad_norm": 0.6671391725540161, + "learning_rate": 0.00015813776376790426, + "loss": 2.937, + "step": 40205 + }, + { + "epoch": 1.97, + "grad_norm": 0.6737003922462463, + "learning_rate": 0.00015812419790956744, + "loss": 2.9281, + "step": 40206 + }, + { + "epoch": 1.97, + "grad_norm": 0.6515318155288696, + "learning_rate": 0.00015811063242490407, + "loss": 3.1345, + "step": 40207 + }, + { + "epoch": 1.97, + "grad_norm": 0.6995930671691895, + "learning_rate": 0.00015809706731395002, + "loss": 2.8716, + "step": 40208 + }, + { + "epoch": 1.97, + "grad_norm": 0.6816954612731934, + "learning_rate": 0.00015808350257674113, + "loss": 3.0883, + "step": 40209 + }, + { + "epoch": 1.97, + "grad_norm": 0.6604844331741333, + "learning_rate": 0.00015806993821331283, + "loss": 3.192, + "step": 40210 + }, + { + "epoch": 1.97, + "grad_norm": 0.6579017043113708, + "learning_rate": 0.0001580563742237011, + "loss": 2.839, + "step": 40211 + }, + { + "epoch": 1.97, + "grad_norm": 0.6625229716300964, + "learning_rate": 0.00015804281060794142, + "loss": 2.9405, + "step": 40212 + }, + { + "epoch": 1.97, + "grad_norm": 0.7139385938644409, + "learning_rate": 0.00015802924736606965, + "loss": 3.0133, + "step": 40213 + }, + { + "epoch": 1.97, + "grad_norm": 0.6507391333580017, + "learning_rate": 0.00015801568449812166, + "loss": 3.032, + "step": 40214 + }, + { + "epoch": 1.97, + "grad_norm": 0.6921162009239197, + "learning_rate": 0.00015800212200413292, + "loss": 2.862, + "step": 40215 + }, + { + "epoch": 1.97, + "grad_norm": 0.6766071319580078, + "learning_rate": 0.00015798855988413937, + "loss": 3.1456, + "step": 40216 + }, + { + "epoch": 1.97, + "grad_norm": 0.7535398602485657, + "learning_rate": 0.00015797499813817665, + "loss": 2.9585, + "step": 40217 + }, + { + "epoch": 1.97, + "grad_norm": 0.6730259656906128, + "learning_rate": 0.0001579614367662803, + "loss": 3.04, + "step": 40218 + }, + { + "epoch": 1.97, + "grad_norm": 0.6672115921974182, + "learning_rate": 0.0001579478757684863, + "loss": 2.9337, + "step": 40219 + }, + { + "epoch": 1.97, + "grad_norm": 0.708581805229187, + "learning_rate": 0.00015793431514483016, + "loss": 2.9796, + "step": 40220 + }, + { + "epoch": 1.97, + "grad_norm": 0.6547756791114807, + "learning_rate": 0.00015792075489534782, + "loss": 2.9422, + "step": 40221 + }, + { + "epoch": 1.97, + "grad_norm": 0.6528579592704773, + "learning_rate": 0.0001579071950200747, + "loss": 2.9878, + "step": 40222 + }, + { + "epoch": 1.97, + "grad_norm": 0.6749711036682129, + "learning_rate": 0.00015789363551904685, + "loss": 2.8666, + "step": 40223 + }, + { + "epoch": 1.97, + "grad_norm": 0.6884815692901611, + "learning_rate": 0.0001578800763922998, + "loss": 2.9652, + "step": 40224 + }, + { + "epoch": 1.97, + "grad_norm": 0.6515948176383972, + "learning_rate": 0.00015786651763986912, + "loss": 2.9444, + "step": 40225 + }, + { + "epoch": 1.97, + "grad_norm": 0.7087914347648621, + "learning_rate": 0.00015785295926179087, + "loss": 2.9341, + "step": 40226 + }, + { + "epoch": 1.97, + "grad_norm": 0.6495019197463989, + "learning_rate": 0.0001578394012581004, + "loss": 3.0155, + "step": 40227 + }, + { + "epoch": 1.97, + "grad_norm": 0.6784089207649231, + "learning_rate": 0.00015782584362883362, + "loss": 3.0786, + "step": 40228 + }, + { + "epoch": 1.97, + "grad_norm": 0.6523367166519165, + "learning_rate": 0.00015781228637402629, + "loss": 2.9048, + "step": 40229 + }, + { + "epoch": 1.97, + "grad_norm": 0.6989542245864868, + "learning_rate": 0.00015779872949371395, + "loss": 3.0083, + "step": 40230 + }, + { + "epoch": 1.97, + "grad_norm": 0.6683775186538696, + "learning_rate": 0.00015778517298793249, + "loss": 2.8116, + "step": 40231 + }, + { + "epoch": 1.97, + "grad_norm": 0.7025153636932373, + "learning_rate": 0.0001577716168567175, + "loss": 3.2109, + "step": 40232 + }, + { + "epoch": 1.97, + "grad_norm": 0.6500304937362671, + "learning_rate": 0.0001577580611001046, + "loss": 3.2659, + "step": 40233 + }, + { + "epoch": 1.97, + "grad_norm": 0.6912034749984741, + "learning_rate": 0.0001577445057181297, + "loss": 2.8978, + "step": 40234 + }, + { + "epoch": 1.97, + "grad_norm": 0.6490848660469055, + "learning_rate": 0.00015773095071082825, + "loss": 3.2104, + "step": 40235 + }, + { + "epoch": 1.97, + "grad_norm": 0.7083659172058105, + "learning_rate": 0.00015771739607823627, + "loss": 2.9851, + "step": 40236 + }, + { + "epoch": 1.97, + "grad_norm": 0.6967908143997192, + "learning_rate": 0.00015770384182038914, + "loss": 3.0441, + "step": 40237 + }, + { + "epoch": 1.97, + "grad_norm": 0.7174261808395386, + "learning_rate": 0.00015769028793732282, + "loss": 3.1049, + "step": 40238 + }, + { + "epoch": 1.97, + "grad_norm": 0.6764041185379028, + "learning_rate": 0.0001576767344290729, + "loss": 2.8842, + "step": 40239 + }, + { + "epoch": 1.97, + "grad_norm": 0.6803184747695923, + "learning_rate": 0.00015766318129567498, + "loss": 3.1289, + "step": 40240 + }, + { + "epoch": 1.97, + "grad_norm": 0.6668773293495178, + "learning_rate": 0.0001576496285371649, + "loss": 3.0398, + "step": 40241 + }, + { + "epoch": 1.97, + "grad_norm": 0.639971137046814, + "learning_rate": 0.00015763607615357821, + "loss": 3.1722, + "step": 40242 + }, + { + "epoch": 1.97, + "grad_norm": 0.6707262396812439, + "learning_rate": 0.00015762252414495073, + "loss": 2.9338, + "step": 40243 + }, + { + "epoch": 1.97, + "grad_norm": 0.6912310123443604, + "learning_rate": 0.00015760897251131822, + "loss": 2.9795, + "step": 40244 + }, + { + "epoch": 1.97, + "grad_norm": 0.6759024858474731, + "learning_rate": 0.0001575954212527163, + "loss": 3.0455, + "step": 40245 + }, + { + "epoch": 1.97, + "grad_norm": 0.6769490242004395, + "learning_rate": 0.00015758187036918054, + "loss": 2.9084, + "step": 40246 + }, + { + "epoch": 1.97, + "grad_norm": 0.7240374684333801, + "learning_rate": 0.0001575683198607467, + "loss": 3.026, + "step": 40247 + }, + { + "epoch": 1.97, + "grad_norm": 0.7014404535293579, + "learning_rate": 0.0001575547697274505, + "loss": 2.8883, + "step": 40248 + }, + { + "epoch": 1.97, + "grad_norm": 0.6260035634040833, + "learning_rate": 0.0001575412199693277, + "loss": 2.9092, + "step": 40249 + }, + { + "epoch": 1.97, + "grad_norm": 0.6681224703788757, + "learning_rate": 0.00015752767058641376, + "loss": 3.019, + "step": 40250 + }, + { + "epoch": 1.97, + "grad_norm": 0.6676934957504272, + "learning_rate": 0.00015751412157874467, + "loss": 3.0937, + "step": 40251 + }, + { + "epoch": 1.97, + "grad_norm": 0.7086468935012817, + "learning_rate": 0.00015750057294635584, + "loss": 2.9019, + "step": 40252 + }, + { + "epoch": 1.97, + "grad_norm": 0.6498782634735107, + "learning_rate": 0.00015748702468928322, + "loss": 2.9223, + "step": 40253 + }, + { + "epoch": 1.97, + "grad_norm": 0.6463537216186523, + "learning_rate": 0.0001574734768075623, + "loss": 2.8543, + "step": 40254 + }, + { + "epoch": 1.97, + "grad_norm": 0.66627037525177, + "learning_rate": 0.00015745992930122873, + "loss": 2.9977, + "step": 40255 + }, + { + "epoch": 1.97, + "grad_norm": 0.6671886444091797, + "learning_rate": 0.0001574463821703184, + "loss": 3.0417, + "step": 40256 + }, + { + "epoch": 1.97, + "grad_norm": 0.6478363275527954, + "learning_rate": 0.00015743283541486673, + "loss": 2.8345, + "step": 40257 + }, + { + "epoch": 1.97, + "grad_norm": 0.7030870914459229, + "learning_rate": 0.00015741928903490954, + "loss": 2.8857, + "step": 40258 + }, + { + "epoch": 1.97, + "grad_norm": 0.6763714551925659, + "learning_rate": 0.0001574057430304826, + "loss": 3.0909, + "step": 40259 + }, + { + "epoch": 1.97, + "grad_norm": 0.6961510181427002, + "learning_rate": 0.0001573921974016215, + "loss": 2.933, + "step": 40260 + }, + { + "epoch": 1.97, + "grad_norm": 0.6534474492073059, + "learning_rate": 0.00015737865214836193, + "loss": 3.1245, + "step": 40261 + }, + { + "epoch": 1.97, + "grad_norm": 0.6820818185806274, + "learning_rate": 0.0001573651072707394, + "loss": 3.09, + "step": 40262 + }, + { + "epoch": 1.97, + "grad_norm": 0.6295596361160278, + "learning_rate": 0.00015735156276878976, + "loss": 3.1183, + "step": 40263 + }, + { + "epoch": 1.97, + "grad_norm": 0.6853962540626526, + "learning_rate": 0.00015733801864254874, + "loss": 3.0549, + "step": 40264 + }, + { + "epoch": 1.97, + "grad_norm": 0.7549799084663391, + "learning_rate": 0.0001573244748920518, + "loss": 3.0955, + "step": 40265 + }, + { + "epoch": 1.97, + "grad_norm": 0.6778276562690735, + "learning_rate": 0.00015731093151733484, + "loss": 2.9153, + "step": 40266 + }, + { + "epoch": 1.97, + "grad_norm": 0.6821315884590149, + "learning_rate": 0.00015729738851843344, + "loss": 3.0041, + "step": 40267 + }, + { + "epoch": 1.97, + "grad_norm": 0.7044394016265869, + "learning_rate": 0.00015728384589538315, + "loss": 2.8476, + "step": 40268 + }, + { + "epoch": 1.97, + "grad_norm": 0.6207550764083862, + "learning_rate": 0.00015727030364821984, + "loss": 3.1545, + "step": 40269 + }, + { + "epoch": 1.97, + "grad_norm": 0.6819803714752197, + "learning_rate": 0.000157256761776979, + "loss": 3.0553, + "step": 40270 + }, + { + "epoch": 1.97, + "grad_norm": 0.6631947159767151, + "learning_rate": 0.00015724322028169645, + "loss": 2.9441, + "step": 40271 + }, + { + "epoch": 1.97, + "grad_norm": 0.6724246144294739, + "learning_rate": 0.0001572296791624077, + "loss": 3.1259, + "step": 40272 + }, + { + "epoch": 1.97, + "grad_norm": 0.6821426749229431, + "learning_rate": 0.00015721613841914852, + "loss": 3.0238, + "step": 40273 + }, + { + "epoch": 1.97, + "grad_norm": 0.6199890971183777, + "learning_rate": 0.0001572025980519546, + "loss": 3.0075, + "step": 40274 + }, + { + "epoch": 1.97, + "grad_norm": 0.637385368347168, + "learning_rate": 0.00015718905806086158, + "loss": 2.9653, + "step": 40275 + }, + { + "epoch": 1.97, + "grad_norm": 0.6946399211883545, + "learning_rate": 0.00015717551844590512, + "loss": 3.0097, + "step": 40276 + }, + { + "epoch": 1.97, + "grad_norm": 0.6899637579917908, + "learning_rate": 0.00015716197920712072, + "loss": 2.9472, + "step": 40277 + }, + { + "epoch": 1.97, + "grad_norm": 0.6917797923088074, + "learning_rate": 0.00015714844034454432, + "loss": 2.9553, + "step": 40278 + }, + { + "epoch": 1.97, + "grad_norm": 0.6806402206420898, + "learning_rate": 0.00015713490185821128, + "loss": 3.078, + "step": 40279 + }, + { + "epoch": 1.97, + "grad_norm": 0.6591494679450989, + "learning_rate": 0.00015712136374815744, + "loss": 3.1675, + "step": 40280 + }, + { + "epoch": 1.97, + "grad_norm": 0.632714033126831, + "learning_rate": 0.00015710782601441854, + "loss": 2.9648, + "step": 40281 + }, + { + "epoch": 1.97, + "grad_norm": 0.6618544459342957, + "learning_rate": 0.00015709428865703012, + "loss": 3.0405, + "step": 40282 + }, + { + "epoch": 1.97, + "grad_norm": 0.6480012536048889, + "learning_rate": 0.00015708075167602785, + "loss": 2.8349, + "step": 40283 + }, + { + "epoch": 1.97, + "grad_norm": 0.6463007926940918, + "learning_rate": 0.00015706721507144727, + "loss": 3.0346, + "step": 40284 + }, + { + "epoch": 1.97, + "grad_norm": 0.652336597442627, + "learning_rate": 0.00015705367884332413, + "loss": 3.1047, + "step": 40285 + }, + { + "epoch": 1.97, + "grad_norm": 0.6886484026908875, + "learning_rate": 0.0001570401429916942, + "loss": 2.8904, + "step": 40286 + }, + { + "epoch": 1.97, + "grad_norm": 0.632122814655304, + "learning_rate": 0.00015702660751659288, + "loss": 3.155, + "step": 40287 + }, + { + "epoch": 1.97, + "grad_norm": 0.7023600339889526, + "learning_rate": 0.00015701307241805613, + "loss": 2.8844, + "step": 40288 + }, + { + "epoch": 1.97, + "grad_norm": 0.6865475177764893, + "learning_rate": 0.00015699953769611938, + "loss": 3.0419, + "step": 40289 + }, + { + "epoch": 1.97, + "grad_norm": 0.676014244556427, + "learning_rate": 0.0001569860033508182, + "loss": 2.9082, + "step": 40290 + }, + { + "epoch": 1.97, + "grad_norm": 0.6413493752479553, + "learning_rate": 0.00015697246938218854, + "loss": 2.8572, + "step": 40291 + }, + { + "epoch": 1.97, + "grad_norm": 0.6815982460975647, + "learning_rate": 0.00015695893579026576, + "loss": 3.0746, + "step": 40292 + }, + { + "epoch": 1.97, + "grad_norm": 0.6308925151824951, + "learning_rate": 0.00015694540257508565, + "loss": 2.9461, + "step": 40293 + }, + { + "epoch": 1.97, + "grad_norm": 0.7394782304763794, + "learning_rate": 0.00015693186973668376, + "loss": 2.8717, + "step": 40294 + }, + { + "epoch": 1.97, + "grad_norm": 0.656547486782074, + "learning_rate": 0.00015691833727509577, + "loss": 2.8996, + "step": 40295 + }, + { + "epoch": 1.97, + "grad_norm": 0.6732604503631592, + "learning_rate": 0.00015690480519035745, + "loss": 3.065, + "step": 40296 + }, + { + "epoch": 1.97, + "grad_norm": 0.6596916317939758, + "learning_rate": 0.00015689127348250434, + "loss": 3.0307, + "step": 40297 + }, + { + "epoch": 1.97, + "grad_norm": 0.7515655159950256, + "learning_rate": 0.0001568777421515721, + "loss": 2.9068, + "step": 40298 + }, + { + "epoch": 1.97, + "grad_norm": 0.6503599286079407, + "learning_rate": 0.0001568642111975962, + "loss": 2.922, + "step": 40299 + }, + { + "epoch": 1.98, + "grad_norm": 0.6916700601577759, + "learning_rate": 0.00015685068062061243, + "loss": 2.9162, + "step": 40300 + }, + { + "epoch": 1.98, + "grad_norm": 0.6582286357879639, + "learning_rate": 0.00015683715042065654, + "loss": 2.8721, + "step": 40301 + }, + { + "epoch": 1.98, + "grad_norm": 0.6865876317024231, + "learning_rate": 0.00015682362059776393, + "loss": 3.0664, + "step": 40302 + }, + { + "epoch": 1.98, + "grad_norm": 0.7114717960357666, + "learning_rate": 0.00015681009115197048, + "loss": 3.087, + "step": 40303 + }, + { + "epoch": 1.98, + "grad_norm": 0.6616129875183105, + "learning_rate": 0.00015679656208331165, + "loss": 3.0272, + "step": 40304 + }, + { + "epoch": 1.98, + "grad_norm": 0.6594287753105164, + "learning_rate": 0.00015678303339182305, + "loss": 2.9298, + "step": 40305 + }, + { + "epoch": 1.98, + "grad_norm": 0.7083821892738342, + "learning_rate": 0.00015676950507754044, + "loss": 3.2278, + "step": 40306 + }, + { + "epoch": 1.98, + "grad_norm": 0.6950168013572693, + "learning_rate": 0.00015675597714049925, + "loss": 2.9107, + "step": 40307 + }, + { + "epoch": 1.98, + "grad_norm": 0.7014042139053345, + "learning_rate": 0.00015674244958073543, + "loss": 2.892, + "step": 40308 + }, + { + "epoch": 1.98, + "grad_norm": 0.7078503370285034, + "learning_rate": 0.00015672892239828432, + "loss": 2.8923, + "step": 40309 + }, + { + "epoch": 1.98, + "grad_norm": 0.6410806179046631, + "learning_rate": 0.00015671539559318162, + "loss": 3.0353, + "step": 40310 + }, + { + "epoch": 1.98, + "grad_norm": 0.6679813265800476, + "learning_rate": 0.00015670186916546318, + "loss": 2.9274, + "step": 40311 + }, + { + "epoch": 1.98, + "grad_norm": 0.6931245923042297, + "learning_rate": 0.00015668834311516428, + "loss": 2.9186, + "step": 40312 + }, + { + "epoch": 1.98, + "grad_norm": 0.6886323094367981, + "learning_rate": 0.00015667481744232082, + "loss": 3.0251, + "step": 40313 + }, + { + "epoch": 1.98, + "grad_norm": 0.6314588785171509, + "learning_rate": 0.00015666129214696815, + "loss": 3.0817, + "step": 40314 + }, + { + "epoch": 1.98, + "grad_norm": 0.6671131253242493, + "learning_rate": 0.00015664776722914205, + "loss": 2.987, + "step": 40315 + }, + { + "epoch": 1.98, + "grad_norm": 0.6521415114402771, + "learning_rate": 0.00015663424268887828, + "loss": 2.884, + "step": 40316 + }, + { + "epoch": 1.98, + "grad_norm": 0.6604424118995667, + "learning_rate": 0.00015662071852621222, + "loss": 2.767, + "step": 40317 + }, + { + "epoch": 1.98, + "grad_norm": 0.6475251317024231, + "learning_rate": 0.00015660719474117968, + "loss": 3.0083, + "step": 40318 + }, + { + "epoch": 1.98, + "grad_norm": 0.6570774912834167, + "learning_rate": 0.0001565936713338162, + "loss": 3.0535, + "step": 40319 + }, + { + "epoch": 1.98, + "grad_norm": 0.6820668578147888, + "learning_rate": 0.00015658014830415728, + "loss": 2.9305, + "step": 40320 + }, + { + "epoch": 1.98, + "grad_norm": 0.6360170841217041, + "learning_rate": 0.00015656662565223878, + "loss": 3.0399, + "step": 40321 + }, + { + "epoch": 1.98, + "grad_norm": 0.6604784727096558, + "learning_rate": 0.00015655310337809604, + "loss": 3.2472, + "step": 40322 + }, + { + "epoch": 1.98, + "grad_norm": 0.694609522819519, + "learning_rate": 0.00015653958148176492, + "loss": 2.9491, + "step": 40323 + }, + { + "epoch": 1.98, + "grad_norm": 0.6777598857879639, + "learning_rate": 0.00015652605996328085, + "loss": 3.0202, + "step": 40324 + }, + { + "epoch": 1.98, + "grad_norm": 0.6691681742668152, + "learning_rate": 0.00015651253882267965, + "loss": 3.0187, + "step": 40325 + }, + { + "epoch": 1.98, + "grad_norm": 0.6447973251342773, + "learning_rate": 0.0001564990180599968, + "loss": 3.2181, + "step": 40326 + }, + { + "epoch": 1.98, + "grad_norm": 0.6347460746765137, + "learning_rate": 0.00015648549767526777, + "loss": 3.0601, + "step": 40327 + }, + { + "epoch": 1.98, + "grad_norm": 0.6752090454101562, + "learning_rate": 0.00015647197766852848, + "loss": 2.9086, + "step": 40328 + }, + { + "epoch": 1.98, + "grad_norm": 0.6812394261360168, + "learning_rate": 0.00015645845803981425, + "loss": 2.9608, + "step": 40329 + }, + { + "epoch": 1.98, + "grad_norm": 0.7588211894035339, + "learning_rate": 0.00015644493878916077, + "loss": 3.021, + "step": 40330 + }, + { + "epoch": 1.98, + "grad_norm": 0.6755926012992859, + "learning_rate": 0.00015643141991660388, + "loss": 3.0217, + "step": 40331 + }, + { + "epoch": 1.98, + "grad_norm": 0.6614903211593628, + "learning_rate": 0.00015641790142217882, + "loss": 3.1655, + "step": 40332 + }, + { + "epoch": 1.98, + "grad_norm": 0.6844372749328613, + "learning_rate": 0.00015640438330592158, + "loss": 2.9864, + "step": 40333 + }, + { + "epoch": 1.98, + "grad_norm": 0.6596974730491638, + "learning_rate": 0.00015639086556786747, + "loss": 3.099, + "step": 40334 + }, + { + "epoch": 1.98, + "grad_norm": 0.6417086720466614, + "learning_rate": 0.00015637734820805212, + "loss": 3.0785, + "step": 40335 + }, + { + "epoch": 1.98, + "grad_norm": 0.6517664194107056, + "learning_rate": 0.00015636383122651125, + "loss": 2.8524, + "step": 40336 + }, + { + "epoch": 1.98, + "grad_norm": 0.7032948136329651, + "learning_rate": 0.00015635031462328034, + "loss": 3.0513, + "step": 40337 + }, + { + "epoch": 1.98, + "grad_norm": 0.666313886642456, + "learning_rate": 0.0001563367983983952, + "loss": 3.2258, + "step": 40338 + }, + { + "epoch": 1.98, + "grad_norm": 0.6794551610946655, + "learning_rate": 0.0001563232825518911, + "loss": 3.0646, + "step": 40339 + }, + { + "epoch": 1.98, + "grad_norm": 0.6857566833496094, + "learning_rate": 0.00015630976708380396, + "loss": 3.1632, + "step": 40340 + }, + { + "epoch": 1.98, + "grad_norm": 0.6492341756820679, + "learning_rate": 0.00015629625199416926, + "loss": 3.1526, + "step": 40341 + }, + { + "epoch": 1.98, + "grad_norm": 0.6422709226608276, + "learning_rate": 0.00015628273728302241, + "loss": 3.0216, + "step": 40342 + }, + { + "epoch": 1.98, + "grad_norm": 0.677402138710022, + "learning_rate": 0.00015626922295039936, + "loss": 2.9903, + "step": 40343 + }, + { + "epoch": 1.98, + "grad_norm": 0.6274300217628479, + "learning_rate": 0.00015625570899633533, + "loss": 2.9704, + "step": 40344 + }, + { + "epoch": 1.98, + "grad_norm": 0.6816664934158325, + "learning_rate": 0.00015624219542086613, + "loss": 3.1336, + "step": 40345 + }, + { + "epoch": 1.98, + "grad_norm": 0.719647228717804, + "learning_rate": 0.00015622868222402747, + "loss": 3.0212, + "step": 40346 + }, + { + "epoch": 1.98, + "grad_norm": 0.7077556252479553, + "learning_rate": 0.00015621516940585474, + "loss": 2.945, + "step": 40347 + }, + { + "epoch": 1.98, + "grad_norm": 0.7448094487190247, + "learning_rate": 0.0001562016569663836, + "loss": 2.854, + "step": 40348 + }, + { + "epoch": 1.98, + "grad_norm": 0.6531862616539001, + "learning_rate": 0.0001561881449056495, + "loss": 3.0116, + "step": 40349 + }, + { + "epoch": 1.98, + "grad_norm": 0.6494091749191284, + "learning_rate": 0.00015617463322368814, + "loss": 2.9924, + "step": 40350 + }, + { + "epoch": 1.98, + "grad_norm": 0.6503402590751648, + "learning_rate": 0.00015616112192053526, + "loss": 3.1069, + "step": 40351 + }, + { + "epoch": 1.98, + "grad_norm": 0.7476397752761841, + "learning_rate": 0.00015614761099622617, + "loss": 2.9541, + "step": 40352 + }, + { + "epoch": 1.98, + "grad_norm": 0.6769456267356873, + "learning_rate": 0.0001561341004507967, + "loss": 2.9051, + "step": 40353 + }, + { + "epoch": 1.98, + "grad_norm": 0.6509860157966614, + "learning_rate": 0.00015612059028428224, + "loss": 2.997, + "step": 40354 + }, + { + "epoch": 1.98, + "grad_norm": 0.6968262195587158, + "learning_rate": 0.0001561070804967185, + "loss": 2.9891, + "step": 40355 + }, + { + "epoch": 1.98, + "grad_norm": 0.6519584655761719, + "learning_rate": 0.0001560935710881411, + "loss": 3.1037, + "step": 40356 + }, + { + "epoch": 1.98, + "grad_norm": 0.6796095967292786, + "learning_rate": 0.00015608006205858539, + "loss": 2.8397, + "step": 40357 + }, + { + "epoch": 1.98, + "grad_norm": 0.6914886832237244, + "learning_rate": 0.00015606655340808718, + "loss": 3.1213, + "step": 40358 + }, + { + "epoch": 1.98, + "grad_norm": 0.6436863541603088, + "learning_rate": 0.00015605304513668187, + "loss": 3.0416, + "step": 40359 + }, + { + "epoch": 1.98, + "grad_norm": 0.7205463647842407, + "learning_rate": 0.00015603953724440528, + "loss": 3.1417, + "step": 40360 + }, + { + "epoch": 1.98, + "grad_norm": 0.6410875916481018, + "learning_rate": 0.00015602602973129273, + "loss": 2.9679, + "step": 40361 + }, + { + "epoch": 1.98, + "grad_norm": 0.6640878915786743, + "learning_rate": 0.00015601252259737997, + "loss": 3.0664, + "step": 40362 + }, + { + "epoch": 1.98, + "grad_norm": 0.6714800000190735, + "learning_rate": 0.00015599901584270257, + "loss": 2.942, + "step": 40363 + }, + { + "epoch": 1.98, + "grad_norm": 0.6468148231506348, + "learning_rate": 0.0001559855094672959, + "loss": 2.7303, + "step": 40364 + }, + { + "epoch": 1.98, + "grad_norm": 0.6648983955383301, + "learning_rate": 0.00015597200347119581, + "loss": 3.0801, + "step": 40365 + }, + { + "epoch": 1.98, + "grad_norm": 0.784553050994873, + "learning_rate": 0.00015595849785443762, + "loss": 2.8801, + "step": 40366 + }, + { + "epoch": 1.98, + "grad_norm": 0.6907336711883545, + "learning_rate": 0.00015594499261705703, + "loss": 2.9111, + "step": 40367 + }, + { + "epoch": 1.98, + "grad_norm": 0.6566920876502991, + "learning_rate": 0.00015593148775908973, + "loss": 2.9668, + "step": 40368 + }, + { + "epoch": 1.98, + "grad_norm": 0.6529011130332947, + "learning_rate": 0.00015591798328057113, + "loss": 2.9668, + "step": 40369 + }, + { + "epoch": 1.98, + "grad_norm": 0.6959037184715271, + "learning_rate": 0.00015590447918153687, + "loss": 3.1346, + "step": 40370 + }, + { + "epoch": 1.98, + "grad_norm": 0.6820028424263, + "learning_rate": 0.00015589097546202235, + "loss": 2.9913, + "step": 40371 + }, + { + "epoch": 1.98, + "grad_norm": 0.688714325428009, + "learning_rate": 0.0001558774721220633, + "loss": 3.0301, + "step": 40372 + }, + { + "epoch": 1.98, + "grad_norm": 0.6635189056396484, + "learning_rate": 0.00015586396916169532, + "loss": 3.0752, + "step": 40373 + }, + { + "epoch": 1.98, + "grad_norm": 0.6321013569831848, + "learning_rate": 0.00015585046658095383, + "loss": 3.0512, + "step": 40374 + }, + { + "epoch": 1.98, + "grad_norm": 0.6665242910385132, + "learning_rate": 0.00015583696437987459, + "loss": 3.0112, + "step": 40375 + }, + { + "epoch": 1.98, + "grad_norm": 0.6853665709495544, + "learning_rate": 0.00015582346255849287, + "loss": 3.0421, + "step": 40376 + }, + { + "epoch": 1.98, + "grad_norm": 0.6680829524993896, + "learning_rate": 0.00015580996111684456, + "loss": 3.0632, + "step": 40377 + }, + { + "epoch": 1.98, + "grad_norm": 0.6497752070426941, + "learning_rate": 0.00015579646005496505, + "loss": 2.9086, + "step": 40378 + }, + { + "epoch": 1.98, + "grad_norm": 0.6857454776763916, + "learning_rate": 0.00015578295937288979, + "loss": 2.7439, + "step": 40379 + }, + { + "epoch": 1.98, + "grad_norm": 0.6618149280548096, + "learning_rate": 0.00015576945907065456, + "loss": 2.9654, + "step": 40380 + }, + { + "epoch": 1.98, + "grad_norm": 0.6831642389297485, + "learning_rate": 0.00015575595914829474, + "loss": 3.0545, + "step": 40381 + }, + { + "epoch": 1.98, + "grad_norm": 0.6822869777679443, + "learning_rate": 0.00015574245960584595, + "loss": 3.1841, + "step": 40382 + }, + { + "epoch": 1.98, + "grad_norm": 0.6751419305801392, + "learning_rate": 0.00015572896044334386, + "loss": 2.9026, + "step": 40383 + }, + { + "epoch": 1.98, + "grad_norm": 0.7170292735099792, + "learning_rate": 0.00015571546166082397, + "loss": 3.2591, + "step": 40384 + }, + { + "epoch": 1.98, + "grad_norm": 0.6834322214126587, + "learning_rate": 0.00015570196325832175, + "loss": 2.9228, + "step": 40385 + }, + { + "epoch": 1.98, + "grad_norm": 0.675579309463501, + "learning_rate": 0.00015568846523587263, + "loss": 3.0748, + "step": 40386 + }, + { + "epoch": 1.98, + "grad_norm": 0.694878101348877, + "learning_rate": 0.00015567496759351238, + "loss": 3.1112, + "step": 40387 + }, + { + "epoch": 1.98, + "grad_norm": 0.6923894882202148, + "learning_rate": 0.00015566147033127656, + "loss": 3.0667, + "step": 40388 + }, + { + "epoch": 1.98, + "grad_norm": 0.7370680570602417, + "learning_rate": 0.00015564797344920058, + "loss": 3.0021, + "step": 40389 + }, + { + "epoch": 1.98, + "grad_norm": 0.7134600281715393, + "learning_rate": 0.00015563447694732016, + "loss": 3.0047, + "step": 40390 + }, + { + "epoch": 1.98, + "grad_norm": 0.6903583407402039, + "learning_rate": 0.00015562098082567071, + "loss": 3.044, + "step": 40391 + }, + { + "epoch": 1.98, + "grad_norm": 0.6875481605529785, + "learning_rate": 0.00015560748508428775, + "loss": 2.8929, + "step": 40392 + }, + { + "epoch": 1.98, + "grad_norm": 0.6734123826026917, + "learning_rate": 0.00015559398972320694, + "loss": 2.7966, + "step": 40393 + }, + { + "epoch": 1.98, + "grad_norm": 0.6842463612556458, + "learning_rate": 0.0001555804947424637, + "loss": 2.912, + "step": 40394 + }, + { + "epoch": 1.98, + "grad_norm": 0.6658815741539001, + "learning_rate": 0.00015556700014209376, + "loss": 3.0623, + "step": 40395 + }, + { + "epoch": 1.98, + "grad_norm": 0.6950148344039917, + "learning_rate": 0.00015555350592213237, + "loss": 2.833, + "step": 40396 + }, + { + "epoch": 1.98, + "grad_norm": 0.6506776809692383, + "learning_rate": 0.0001555400120826153, + "loss": 2.842, + "step": 40397 + }, + { + "epoch": 1.98, + "grad_norm": 0.6653673052787781, + "learning_rate": 0.00015552651862357818, + "loss": 3.0188, + "step": 40398 + }, + { + "epoch": 1.98, + "grad_norm": 0.6891602873802185, + "learning_rate": 0.00015551302554505633, + "loss": 2.92, + "step": 40399 + }, + { + "epoch": 1.98, + "grad_norm": 0.6852928400039673, + "learning_rate": 0.00015549953284708543, + "loss": 3.1038, + "step": 40400 + }, + { + "epoch": 1.98, + "grad_norm": 0.6566252708435059, + "learning_rate": 0.00015548604052970078, + "loss": 3.0613, + "step": 40401 + }, + { + "epoch": 1.98, + "grad_norm": 0.7153254151344299, + "learning_rate": 0.00015547254859293812, + "loss": 2.9753, + "step": 40402 + }, + { + "epoch": 1.98, + "grad_norm": 0.6642997860908508, + "learning_rate": 0.00015545905703683304, + "loss": 2.9404, + "step": 40403 + }, + { + "epoch": 1.98, + "grad_norm": 0.6532772779464722, + "learning_rate": 0.0001554455658614209, + "loss": 2.8544, + "step": 40404 + }, + { + "epoch": 1.98, + "grad_norm": 0.6967506408691406, + "learning_rate": 0.00015543207506673744, + "loss": 2.8222, + "step": 40405 + }, + { + "epoch": 1.98, + "grad_norm": 0.6587045788764954, + "learning_rate": 0.00015541858465281807, + "loss": 2.8928, + "step": 40406 + }, + { + "epoch": 1.98, + "grad_norm": 0.668938398361206, + "learning_rate": 0.00015540509461969818, + "loss": 3.1886, + "step": 40407 + }, + { + "epoch": 1.98, + "grad_norm": 0.6699063181877136, + "learning_rate": 0.00015539160496741357, + "loss": 3.013, + "step": 40408 + }, + { + "epoch": 1.98, + "grad_norm": 0.6921393275260925, + "learning_rate": 0.00015537811569599954, + "loss": 2.9297, + "step": 40409 + }, + { + "epoch": 1.98, + "grad_norm": 0.7287542223930359, + "learning_rate": 0.0001553646268054918, + "loss": 3.1772, + "step": 40410 + }, + { + "epoch": 1.98, + "grad_norm": 0.6881520748138428, + "learning_rate": 0.0001553511382959257, + "loss": 2.9042, + "step": 40411 + }, + { + "epoch": 1.98, + "grad_norm": 0.6746994256973267, + "learning_rate": 0.000155337650167337, + "loss": 3.0089, + "step": 40412 + }, + { + "epoch": 1.98, + "grad_norm": 0.6475083827972412, + "learning_rate": 0.00015532416241976108, + "loss": 2.9789, + "step": 40413 + }, + { + "epoch": 1.98, + "grad_norm": 0.7340616583824158, + "learning_rate": 0.00015531067505323334, + "loss": 2.9834, + "step": 40414 + }, + { + "epoch": 1.98, + "grad_norm": 0.6629500389099121, + "learning_rate": 0.00015529718806778955, + "loss": 3.1256, + "step": 40415 + }, + { + "epoch": 1.98, + "grad_norm": 0.6916855573654175, + "learning_rate": 0.00015528370146346504, + "loss": 2.9015, + "step": 40416 + }, + { + "epoch": 1.98, + "grad_norm": 0.6919158697128296, + "learning_rate": 0.0001552702152402954, + "loss": 3.1538, + "step": 40417 + }, + { + "epoch": 1.98, + "grad_norm": 0.6587050557136536, + "learning_rate": 0.00015525672939831623, + "loss": 2.9421, + "step": 40418 + }, + { + "epoch": 1.98, + "grad_norm": 0.6316267848014832, + "learning_rate": 0.00015524324393756292, + "loss": 2.9828, + "step": 40419 + }, + { + "epoch": 1.98, + "grad_norm": 0.6622289419174194, + "learning_rate": 0.0001552297588580711, + "loss": 2.9457, + "step": 40420 + }, + { + "epoch": 1.98, + "grad_norm": 0.656522274017334, + "learning_rate": 0.0001552162741598763, + "loss": 2.9718, + "step": 40421 + }, + { + "epoch": 1.98, + "grad_norm": 0.6395564079284668, + "learning_rate": 0.0001552027898430138, + "loss": 3.034, + "step": 40422 + }, + { + "epoch": 1.98, + "grad_norm": 0.6881017684936523, + "learning_rate": 0.00015518930590751942, + "loss": 3.075, + "step": 40423 + }, + { + "epoch": 1.98, + "grad_norm": 0.7126860022544861, + "learning_rate": 0.0001551758223534284, + "loss": 2.8778, + "step": 40424 + }, + { + "epoch": 1.98, + "grad_norm": 0.6271448135375977, + "learning_rate": 0.00015516233918077657, + "loss": 3.032, + "step": 40425 + }, + { + "epoch": 1.98, + "grad_norm": 0.6470649838447571, + "learning_rate": 0.0001551488563895991, + "loss": 2.8064, + "step": 40426 + }, + { + "epoch": 1.98, + "grad_norm": 0.6680401563644409, + "learning_rate": 0.0001551353739799318, + "loss": 2.9065, + "step": 40427 + }, + { + "epoch": 1.98, + "grad_norm": 0.716007649898529, + "learning_rate": 0.00015512189195181003, + "loss": 3.0163, + "step": 40428 + }, + { + "epoch": 1.98, + "grad_norm": 0.6913284063339233, + "learning_rate": 0.0001551084103052692, + "loss": 3.1034, + "step": 40429 + }, + { + "epoch": 1.98, + "grad_norm": 0.689062774181366, + "learning_rate": 0.00015509492904034505, + "loss": 3.1922, + "step": 40430 + }, + { + "epoch": 1.98, + "grad_norm": 0.6667220592498779, + "learning_rate": 0.0001550814481570729, + "loss": 2.9831, + "step": 40431 + }, + { + "epoch": 1.98, + "grad_norm": 0.665783166885376, + "learning_rate": 0.00015506796765548823, + "loss": 2.9346, + "step": 40432 + }, + { + "epoch": 1.98, + "grad_norm": 0.6425278186798096, + "learning_rate": 0.00015505448753562682, + "loss": 2.8453, + "step": 40433 + }, + { + "epoch": 1.98, + "grad_norm": 0.6473648548126221, + "learning_rate": 0.00015504100779752385, + "loss": 2.9768, + "step": 40434 + }, + { + "epoch": 1.98, + "grad_norm": 0.6833198070526123, + "learning_rate": 0.00015502752844121512, + "loss": 2.8831, + "step": 40435 + }, + { + "epoch": 1.98, + "grad_norm": 0.9269675016403198, + "learning_rate": 0.00015501404946673597, + "loss": 3.0654, + "step": 40436 + }, + { + "epoch": 1.98, + "grad_norm": 0.7202673554420471, + "learning_rate": 0.0001550005708741219, + "loss": 3.1508, + "step": 40437 + }, + { + "epoch": 1.98, + "grad_norm": 0.6838274598121643, + "learning_rate": 0.0001549870926634083, + "loss": 2.8832, + "step": 40438 + }, + { + "epoch": 1.98, + "grad_norm": 0.6578678488731384, + "learning_rate": 0.0001549736148346308, + "loss": 3.1325, + "step": 40439 + }, + { + "epoch": 1.98, + "grad_norm": 0.6588073968887329, + "learning_rate": 0.00015496013738782501, + "loss": 2.9813, + "step": 40440 + }, + { + "epoch": 1.98, + "grad_norm": 0.6366355419158936, + "learning_rate": 0.00015494666032302617, + "loss": 2.9525, + "step": 40441 + }, + { + "epoch": 1.98, + "grad_norm": 0.6619384288787842, + "learning_rate": 0.00015493318364027007, + "loss": 2.9329, + "step": 40442 + }, + { + "epoch": 1.98, + "grad_norm": 0.6706056594848633, + "learning_rate": 0.00015491970733959198, + "loss": 2.832, + "step": 40443 + }, + { + "epoch": 1.98, + "grad_norm": 0.638305127620697, + "learning_rate": 0.00015490623142102737, + "loss": 2.9308, + "step": 40444 + }, + { + "epoch": 1.98, + "grad_norm": 0.6442788243293762, + "learning_rate": 0.00015489275588461196, + "loss": 3.0546, + "step": 40445 + }, + { + "epoch": 1.98, + "grad_norm": 0.6786911487579346, + "learning_rate": 0.00015487928073038094, + "loss": 3.0076, + "step": 40446 + }, + { + "epoch": 1.98, + "grad_norm": 0.6867267489433289, + "learning_rate": 0.00015486580595837013, + "loss": 2.8399, + "step": 40447 + }, + { + "epoch": 1.98, + "grad_norm": 0.6981363296508789, + "learning_rate": 0.0001548523315686147, + "loss": 2.8704, + "step": 40448 + }, + { + "epoch": 1.98, + "grad_norm": 0.6754472851753235, + "learning_rate": 0.00015483885756115045, + "loss": 3.1064, + "step": 40449 + }, + { + "epoch": 1.98, + "grad_norm": 0.7413714528083801, + "learning_rate": 0.0001548253839360127, + "loss": 2.9747, + "step": 40450 + }, + { + "epoch": 1.98, + "grad_norm": 0.6990171670913696, + "learning_rate": 0.0001548119106932368, + "loss": 2.9777, + "step": 40451 + }, + { + "epoch": 1.98, + "grad_norm": 0.7089271545410156, + "learning_rate": 0.00015479843783285856, + "loss": 2.9411, + "step": 40452 + }, + { + "epoch": 1.98, + "grad_norm": 0.6640129089355469, + "learning_rate": 0.00015478496535491315, + "loss": 3.1461, + "step": 40453 + }, + { + "epoch": 1.98, + "grad_norm": 0.6691185832023621, + "learning_rate": 0.00015477149325943617, + "loss": 3.0974, + "step": 40454 + }, + { + "epoch": 1.98, + "grad_norm": 0.6627618074417114, + "learning_rate": 0.00015475802154646327, + "loss": 2.9525, + "step": 40455 + }, + { + "epoch": 1.98, + "grad_norm": 0.7101806998252869, + "learning_rate": 0.00015474455021602967, + "loss": 2.8795, + "step": 40456 + }, + { + "epoch": 1.98, + "grad_norm": 0.6481982469558716, + "learning_rate": 0.00015473107926817107, + "loss": 3.1016, + "step": 40457 + }, + { + "epoch": 1.98, + "grad_norm": 0.6829726696014404, + "learning_rate": 0.00015471760870292285, + "loss": 3.1465, + "step": 40458 + }, + { + "epoch": 1.98, + "grad_norm": 0.708656370639801, + "learning_rate": 0.00015470413852032038, + "loss": 2.91, + "step": 40459 + }, + { + "epoch": 1.98, + "grad_norm": 0.6740055084228516, + "learning_rate": 0.0001546906687203994, + "loss": 2.9464, + "step": 40460 + }, + { + "epoch": 1.98, + "grad_norm": 0.6535217761993408, + "learning_rate": 0.00015467719930319508, + "loss": 3.3205, + "step": 40461 + }, + { + "epoch": 1.98, + "grad_norm": 0.656358540058136, + "learning_rate": 0.00015466373026874313, + "loss": 3.0558, + "step": 40462 + }, + { + "epoch": 1.98, + "grad_norm": 0.6297536492347717, + "learning_rate": 0.00015465026161707888, + "loss": 3.3642, + "step": 40463 + }, + { + "epoch": 1.98, + "grad_norm": 0.6395627856254578, + "learning_rate": 0.00015463679334823795, + "loss": 2.8474, + "step": 40464 + }, + { + "epoch": 1.98, + "grad_norm": 0.6379209756851196, + "learning_rate": 0.00015462332546225578, + "loss": 3.1087, + "step": 40465 + }, + { + "epoch": 1.98, + "grad_norm": 0.7360820770263672, + "learning_rate": 0.00015460985795916766, + "loss": 2.8954, + "step": 40466 + }, + { + "epoch": 1.98, + "grad_norm": 0.6527162790298462, + "learning_rate": 0.00015459639083900928, + "loss": 3.1293, + "step": 40467 + }, + { + "epoch": 1.98, + "grad_norm": 0.6693764328956604, + "learning_rate": 0.0001545829241018159, + "loss": 2.9546, + "step": 40468 + }, + { + "epoch": 1.98, + "grad_norm": 0.7010257244110107, + "learning_rate": 0.00015456945774762316, + "loss": 2.8822, + "step": 40469 + }, + { + "epoch": 1.98, + "grad_norm": 0.6681190133094788, + "learning_rate": 0.00015455599177646655, + "loss": 2.9926, + "step": 40470 + }, + { + "epoch": 1.98, + "grad_norm": 0.7095162868499756, + "learning_rate": 0.0001545425261883815, + "loss": 3.2781, + "step": 40471 + }, + { + "epoch": 1.98, + "grad_norm": 0.6761109828948975, + "learning_rate": 0.0001545290609834034, + "loss": 2.8825, + "step": 40472 + }, + { + "epoch": 1.98, + "grad_norm": 0.6583576202392578, + "learning_rate": 0.00015451559616156766, + "loss": 2.9249, + "step": 40473 + }, + { + "epoch": 1.98, + "grad_norm": 0.6421191692352295, + "learning_rate": 0.0001545021317229099, + "loss": 2.8429, + "step": 40474 + }, + { + "epoch": 1.98, + "grad_norm": 0.6621305346488953, + "learning_rate": 0.00015448866766746552, + "loss": 2.9998, + "step": 40475 + }, + { + "epoch": 1.98, + "grad_norm": 0.708996057510376, + "learning_rate": 0.00015447520399526998, + "loss": 2.8448, + "step": 40476 + }, + { + "epoch": 1.98, + "grad_norm": 0.6437864899635315, + "learning_rate": 0.00015446174070635882, + "loss": 2.9074, + "step": 40477 + }, + { + "epoch": 1.98, + "grad_norm": 0.6887125372886658, + "learning_rate": 0.00015444827780076726, + "loss": 3.0311, + "step": 40478 + }, + { + "epoch": 1.98, + "grad_norm": 0.6327663660049438, + "learning_rate": 0.00015443481527853113, + "loss": 3.1015, + "step": 40479 + }, + { + "epoch": 1.98, + "grad_norm": 0.6891899704933167, + "learning_rate": 0.00015442135313968564, + "loss": 2.9709, + "step": 40480 + }, + { + "epoch": 1.98, + "grad_norm": 0.6350082159042358, + "learning_rate": 0.00015440789138426614, + "loss": 3.0892, + "step": 40481 + }, + { + "epoch": 1.98, + "grad_norm": 0.6630797386169434, + "learning_rate": 0.00015439443001230842, + "loss": 3.0736, + "step": 40482 + }, + { + "epoch": 1.98, + "grad_norm": 0.6615853309631348, + "learning_rate": 0.0001543809690238476, + "loss": 3.1135, + "step": 40483 + }, + { + "epoch": 1.98, + "grad_norm": 0.69512939453125, + "learning_rate": 0.00015436750841891925, + "loss": 3.0358, + "step": 40484 + }, + { + "epoch": 1.98, + "grad_norm": 0.7393690943717957, + "learning_rate": 0.000154354048197559, + "loss": 2.8721, + "step": 40485 + }, + { + "epoch": 1.98, + "grad_norm": 0.6448101997375488, + "learning_rate": 0.0001543405883598022, + "loss": 2.8879, + "step": 40486 + }, + { + "epoch": 1.98, + "grad_norm": 0.6300211548805237, + "learning_rate": 0.00015432712890568422, + "loss": 2.9113, + "step": 40487 + }, + { + "epoch": 1.98, + "grad_norm": 0.7020927667617798, + "learning_rate": 0.00015431366983524041, + "loss": 2.9078, + "step": 40488 + }, + { + "epoch": 1.98, + "grad_norm": 0.717994213104248, + "learning_rate": 0.00015430021114850635, + "loss": 2.9872, + "step": 40489 + }, + { + "epoch": 1.98, + "grad_norm": 0.6843698620796204, + "learning_rate": 0.00015428675284551765, + "loss": 2.9769, + "step": 40490 + }, + { + "epoch": 1.98, + "grad_norm": 0.6766223907470703, + "learning_rate": 0.00015427329492630944, + "loss": 2.8667, + "step": 40491 + }, + { + "epoch": 1.98, + "grad_norm": 0.6386144161224365, + "learning_rate": 0.00015425983739091747, + "loss": 2.972, + "step": 40492 + }, + { + "epoch": 1.98, + "grad_norm": 0.6815845966339111, + "learning_rate": 0.00015424638023937707, + "loss": 3.0052, + "step": 40493 + }, + { + "epoch": 1.98, + "grad_norm": 0.6865285038948059, + "learning_rate": 0.00015423292347172348, + "loss": 2.8614, + "step": 40494 + }, + { + "epoch": 1.98, + "grad_norm": 0.8084060549736023, + "learning_rate": 0.00015421946708799248, + "loss": 2.7633, + "step": 40495 + }, + { + "epoch": 1.98, + "grad_norm": 0.6883880496025085, + "learning_rate": 0.00015420601108821918, + "loss": 3.018, + "step": 40496 + }, + { + "epoch": 1.98, + "grad_norm": 0.6464810967445374, + "learning_rate": 0.00015419255547243936, + "loss": 2.8539, + "step": 40497 + }, + { + "epoch": 1.98, + "grad_norm": 0.6759091019630432, + "learning_rate": 0.00015417910024068815, + "loss": 2.9933, + "step": 40498 + }, + { + "epoch": 1.98, + "grad_norm": 0.7044882774353027, + "learning_rate": 0.0001541656453930011, + "loss": 3.1197, + "step": 40499 + }, + { + "epoch": 1.98, + "grad_norm": 0.6467495560646057, + "learning_rate": 0.0001541521909294138, + "loss": 2.8981, + "step": 40500 + }, + { + "epoch": 1.98, + "grad_norm": 0.6994920372962952, + "learning_rate": 0.00015413873684996156, + "loss": 3.0469, + "step": 40501 + }, + { + "epoch": 1.98, + "grad_norm": 0.6590608954429626, + "learning_rate": 0.00015412528315467987, + "loss": 2.7399, + "step": 40502 + }, + { + "epoch": 1.98, + "grad_norm": 0.6317247748374939, + "learning_rate": 0.0001541118298436039, + "loss": 2.947, + "step": 40503 + }, + { + "epoch": 1.99, + "grad_norm": 0.7404329180717468, + "learning_rate": 0.0001540983769167694, + "loss": 2.9812, + "step": 40504 + }, + { + "epoch": 1.99, + "grad_norm": 0.6499910950660706, + "learning_rate": 0.00015408492437421175, + "loss": 2.9587, + "step": 40505 + }, + { + "epoch": 1.99, + "grad_norm": 0.6785842180252075, + "learning_rate": 0.0001540714722159662, + "loss": 2.9186, + "step": 40506 + }, + { + "epoch": 1.99, + "grad_norm": 0.7388772368431091, + "learning_rate": 0.00015405802044206844, + "loss": 2.8404, + "step": 40507 + }, + { + "epoch": 1.99, + "grad_norm": 0.6410542726516724, + "learning_rate": 0.00015404456905255377, + "loss": 2.897, + "step": 40508 + }, + { + "epoch": 1.99, + "grad_norm": 0.6782169342041016, + "learning_rate": 0.0001540311180474575, + "loss": 3.1239, + "step": 40509 + }, + { + "epoch": 1.99, + "grad_norm": 0.6322621703147888, + "learning_rate": 0.00015401766742681528, + "loss": 2.882, + "step": 40510 + }, + { + "epoch": 1.99, + "grad_norm": 0.6843157410621643, + "learning_rate": 0.0001540042171906623, + "loss": 2.9253, + "step": 40511 + }, + { + "epoch": 1.99, + "grad_norm": 0.6755958795547485, + "learning_rate": 0.00015399076733903425, + "loss": 2.7574, + "step": 40512 + }, + { + "epoch": 1.99, + "grad_norm": 0.6259028911590576, + "learning_rate": 0.00015397731787196632, + "loss": 2.7801, + "step": 40513 + }, + { + "epoch": 1.99, + "grad_norm": 0.6746711134910583, + "learning_rate": 0.0001539638687894941, + "loss": 2.9839, + "step": 40514 + }, + { + "epoch": 1.99, + "grad_norm": 0.639659583568573, + "learning_rate": 0.000153950420091653, + "loss": 3.1256, + "step": 40515 + }, + { + "epoch": 1.99, + "grad_norm": 0.6220987439155579, + "learning_rate": 0.00015393697177847824, + "loss": 2.8792, + "step": 40516 + }, + { + "epoch": 1.99, + "grad_norm": 0.6759223937988281, + "learning_rate": 0.0001539235238500055, + "loss": 2.9302, + "step": 40517 + }, + { + "epoch": 1.99, + "grad_norm": 0.6941930055618286, + "learning_rate": 0.00015391007630627, + "loss": 2.8693, + "step": 40518 + }, + { + "epoch": 1.99, + "grad_norm": 0.6453860402107239, + "learning_rate": 0.00015389662914730732, + "loss": 2.9384, + "step": 40519 + }, + { + "epoch": 1.99, + "grad_norm": 0.6453847885131836, + "learning_rate": 0.0001538831823731527, + "loss": 2.9717, + "step": 40520 + }, + { + "epoch": 1.99, + "grad_norm": 0.6334241032600403, + "learning_rate": 0.00015386973598384166, + "loss": 2.8373, + "step": 40521 + }, + { + "epoch": 1.99, + "grad_norm": 0.6756218075752258, + "learning_rate": 0.00015385628997940972, + "loss": 2.9777, + "step": 40522 + }, + { + "epoch": 1.99, + "grad_norm": 0.6967214345932007, + "learning_rate": 0.00015384284435989218, + "loss": 2.9146, + "step": 40523 + }, + { + "epoch": 1.99, + "grad_norm": 0.692237138748169, + "learning_rate": 0.00015382939912532447, + "loss": 2.9344, + "step": 40524 + }, + { + "epoch": 1.99, + "grad_norm": 0.6731816530227661, + "learning_rate": 0.00015381595427574189, + "loss": 3.0359, + "step": 40525 + }, + { + "epoch": 1.99, + "grad_norm": 0.6681119203567505, + "learning_rate": 0.00015380250981117993, + "loss": 2.9214, + "step": 40526 + }, + { + "epoch": 1.99, + "grad_norm": 0.6464305520057678, + "learning_rate": 0.00015378906573167413, + "loss": 2.9634, + "step": 40527 + }, + { + "epoch": 1.99, + "grad_norm": 0.6641029119491577, + "learning_rate": 0.0001537756220372597, + "loss": 3.0965, + "step": 40528 + }, + { + "epoch": 1.99, + "grad_norm": 0.6519691944122314, + "learning_rate": 0.00015376217872797228, + "loss": 3.0628, + "step": 40529 + }, + { + "epoch": 1.99, + "grad_norm": 0.7187452912330627, + "learning_rate": 0.00015374873580384707, + "loss": 3.1994, + "step": 40530 + }, + { + "epoch": 1.99, + "grad_norm": 0.666256308555603, + "learning_rate": 0.0001537352932649195, + "loss": 2.9878, + "step": 40531 + }, + { + "epoch": 1.99, + "grad_norm": 0.6944901347160339, + "learning_rate": 0.0001537218511112251, + "loss": 3.0209, + "step": 40532 + }, + { + "epoch": 1.99, + "grad_norm": 0.7129674553871155, + "learning_rate": 0.00015370840934279906, + "loss": 3.0064, + "step": 40533 + }, + { + "epoch": 1.99, + "grad_norm": 0.6722531914710999, + "learning_rate": 0.00015369496795967705, + "loss": 2.8155, + "step": 40534 + }, + { + "epoch": 1.99, + "grad_norm": 0.6533281803131104, + "learning_rate": 0.0001536815269618942, + "loss": 2.9369, + "step": 40535 + }, + { + "epoch": 1.99, + "grad_norm": 0.6529192328453064, + "learning_rate": 0.00015366808634948605, + "loss": 2.8809, + "step": 40536 + }, + { + "epoch": 1.99, + "grad_norm": 0.6997420787811279, + "learning_rate": 0.00015365464612248825, + "loss": 3.0539, + "step": 40537 + }, + { + "epoch": 1.99, + "grad_norm": 0.7066969275474548, + "learning_rate": 0.00015364120628093573, + "loss": 2.9932, + "step": 40538 + }, + { + "epoch": 1.99, + "grad_norm": 0.6391503810882568, + "learning_rate": 0.0001536277668248642, + "loss": 2.9198, + "step": 40539 + }, + { + "epoch": 1.99, + "grad_norm": 0.6769759058952332, + "learning_rate": 0.00015361432775430885, + "loss": 2.9239, + "step": 40540 + }, + { + "epoch": 1.99, + "grad_norm": 0.6748432517051697, + "learning_rate": 0.0001536008890693052, + "loss": 3.0284, + "step": 40541 + }, + { + "epoch": 1.99, + "grad_norm": 0.7206935882568359, + "learning_rate": 0.00015358745076988873, + "loss": 2.8153, + "step": 40542 + }, + { + "epoch": 1.99, + "grad_norm": 0.6426916122436523, + "learning_rate": 0.0001535740128560946, + "loss": 2.8724, + "step": 40543 + }, + { + "epoch": 1.99, + "grad_norm": 0.7100978493690491, + "learning_rate": 0.00015356057532795855, + "loss": 2.7467, + "step": 40544 + }, + { + "epoch": 1.99, + "grad_norm": 0.6432538032531738, + "learning_rate": 0.00015354713818551568, + "loss": 2.8214, + "step": 40545 + }, + { + "epoch": 1.99, + "grad_norm": 0.6779876351356506, + "learning_rate": 0.00015353370142880132, + "loss": 2.8472, + "step": 40546 + }, + { + "epoch": 1.99, + "grad_norm": 0.6484413743019104, + "learning_rate": 0.0001535202650578512, + "loss": 2.9393, + "step": 40547 + }, + { + "epoch": 1.99, + "grad_norm": 0.6245802640914917, + "learning_rate": 0.00015350682907270033, + "loss": 3.1935, + "step": 40548 + }, + { + "epoch": 1.99, + "grad_norm": 0.6747826337814331, + "learning_rate": 0.00015349339347338442, + "loss": 3.0041, + "step": 40549 + }, + { + "epoch": 1.99, + "grad_norm": 0.6607273817062378, + "learning_rate": 0.0001534799582599386, + "loss": 2.8044, + "step": 40550 + }, + { + "epoch": 1.99, + "grad_norm": 0.6659206748008728, + "learning_rate": 0.0001534665234323985, + "loss": 2.9159, + "step": 40551 + }, + { + "epoch": 1.99, + "grad_norm": 0.6552886962890625, + "learning_rate": 0.00015345308899079936, + "loss": 2.8379, + "step": 40552 + }, + { + "epoch": 1.99, + "grad_norm": 0.6883482933044434, + "learning_rate": 0.0001534396549351765, + "loss": 2.922, + "step": 40553 + }, + { + "epoch": 1.99, + "grad_norm": 0.6916592121124268, + "learning_rate": 0.00015342622126556546, + "loss": 2.9854, + "step": 40554 + }, + { + "epoch": 1.99, + "grad_norm": 0.6768868565559387, + "learning_rate": 0.00015341278798200141, + "loss": 2.969, + "step": 40555 + }, + { + "epoch": 1.99, + "grad_norm": 0.6460238695144653, + "learning_rate": 0.0001533993550845199, + "loss": 3.0113, + "step": 40556 + }, + { + "epoch": 1.99, + "grad_norm": 0.7118356823921204, + "learning_rate": 0.00015338592257315638, + "loss": 2.9523, + "step": 40557 + }, + { + "epoch": 1.99, + "grad_norm": 0.6956257224082947, + "learning_rate": 0.00015337249044794602, + "loss": 2.9509, + "step": 40558 + }, + { + "epoch": 1.99, + "grad_norm": 0.702347457408905, + "learning_rate": 0.0001533590587089244, + "loss": 2.9354, + "step": 40559 + }, + { + "epoch": 1.99, + "grad_norm": 0.6793229579925537, + "learning_rate": 0.0001533456273561268, + "loss": 2.9578, + "step": 40560 + }, + { + "epoch": 1.99, + "grad_norm": 0.738370418548584, + "learning_rate": 0.00015333219638958849, + "loss": 3.0186, + "step": 40561 + }, + { + "epoch": 1.99, + "grad_norm": 0.7036491632461548, + "learning_rate": 0.00015331876580934505, + "loss": 3.0268, + "step": 40562 + }, + { + "epoch": 1.99, + "grad_norm": 0.7223859429359436, + "learning_rate": 0.00015330533561543164, + "loss": 2.8532, + "step": 40563 + }, + { + "epoch": 1.99, + "grad_norm": 0.6854945421218872, + "learning_rate": 0.00015329190580788387, + "loss": 3.0173, + "step": 40564 + }, + { + "epoch": 1.99, + "grad_norm": 0.6289524435997009, + "learning_rate": 0.00015327847638673688, + "loss": 2.9122, + "step": 40565 + }, + { + "epoch": 1.99, + "grad_norm": 0.7152692675590515, + "learning_rate": 0.00015326504735202622, + "loss": 2.9671, + "step": 40566 + }, + { + "epoch": 1.99, + "grad_norm": 0.6731276512145996, + "learning_rate": 0.0001532516187037872, + "loss": 3.0716, + "step": 40567 + }, + { + "epoch": 1.99, + "grad_norm": 0.6647647619247437, + "learning_rate": 0.00015323819044205507, + "loss": 2.9302, + "step": 40568 + }, + { + "epoch": 1.99, + "grad_norm": 0.7299318909645081, + "learning_rate": 0.0001532247625668654, + "loss": 2.6701, + "step": 40569 + }, + { + "epoch": 1.99, + "grad_norm": 0.7116023302078247, + "learning_rate": 0.0001532113350782534, + "loss": 2.8473, + "step": 40570 + }, + { + "epoch": 1.99, + "grad_norm": 0.6508423089981079, + "learning_rate": 0.0001531979079762544, + "loss": 3.0764, + "step": 40571 + }, + { + "epoch": 1.99, + "grad_norm": 0.6736328601837158, + "learning_rate": 0.00015318448126090407, + "loss": 3.047, + "step": 40572 + }, + { + "epoch": 1.99, + "grad_norm": 0.6313000917434692, + "learning_rate": 0.00015317105493223746, + "loss": 2.8944, + "step": 40573 + }, + { + "epoch": 1.99, + "grad_norm": 0.6281668543815613, + "learning_rate": 0.0001531576289902901, + "loss": 2.7681, + "step": 40574 + }, + { + "epoch": 1.99, + "grad_norm": 0.6827503442764282, + "learning_rate": 0.00015314420343509716, + "loss": 3.1714, + "step": 40575 + }, + { + "epoch": 1.99, + "grad_norm": 0.6493374109268188, + "learning_rate": 0.00015313077826669414, + "loss": 3.0646, + "step": 40576 + }, + { + "epoch": 1.99, + "grad_norm": 0.702488899230957, + "learning_rate": 0.00015311735348511648, + "loss": 2.9156, + "step": 40577 + }, + { + "epoch": 1.99, + "grad_norm": 0.6838111281394958, + "learning_rate": 0.00015310392909039932, + "loss": 2.7314, + "step": 40578 + }, + { + "epoch": 1.99, + "grad_norm": 0.7085728645324707, + "learning_rate": 0.0001530905050825783, + "loss": 2.9128, + "step": 40579 + }, + { + "epoch": 1.99, + "grad_norm": 0.6542535424232483, + "learning_rate": 0.0001530770814616884, + "loss": 3.0594, + "step": 40580 + }, + { + "epoch": 1.99, + "grad_norm": 0.6909346580505371, + "learning_rate": 0.0001530636582277654, + "loss": 2.9486, + "step": 40581 + }, + { + "epoch": 1.99, + "grad_norm": 0.6850855350494385, + "learning_rate": 0.0001530502353808444, + "loss": 3.1318, + "step": 40582 + }, + { + "epoch": 1.99, + "grad_norm": 0.633678138256073, + "learning_rate": 0.0001530368129209607, + "loss": 3.0352, + "step": 40583 + }, + { + "epoch": 1.99, + "grad_norm": 0.6679724454879761, + "learning_rate": 0.0001530233908481499, + "loss": 3.1599, + "step": 40584 + }, + { + "epoch": 1.99, + "grad_norm": 0.7710044384002686, + "learning_rate": 0.00015300996916244703, + "loss": 3.0995, + "step": 40585 + }, + { + "epoch": 1.99, + "grad_norm": 0.6416032314300537, + "learning_rate": 0.00015299654786388766, + "loss": 2.9725, + "step": 40586 + }, + { + "epoch": 1.99, + "grad_norm": 0.6761838793754578, + "learning_rate": 0.0001529831269525072, + "loss": 3.2538, + "step": 40587 + }, + { + "epoch": 1.99, + "grad_norm": 0.6844536066055298, + "learning_rate": 0.00015296970642834088, + "loss": 2.9308, + "step": 40588 + }, + { + "epoch": 1.99, + "grad_norm": 0.6797305941581726, + "learning_rate": 0.000152956286291424, + "loss": 2.9528, + "step": 40589 + }, + { + "epoch": 1.99, + "grad_norm": 0.6660833954811096, + "learning_rate": 0.00015294286654179192, + "loss": 3.0079, + "step": 40590 + }, + { + "epoch": 1.99, + "grad_norm": 0.6612690091133118, + "learning_rate": 0.0001529294471794801, + "loss": 3.0193, + "step": 40591 + }, + { + "epoch": 1.99, + "grad_norm": 0.6777284145355225, + "learning_rate": 0.00015291602820452374, + "loss": 3.0346, + "step": 40592 + }, + { + "epoch": 1.99, + "grad_norm": 0.678779125213623, + "learning_rate": 0.00015290260961695824, + "loss": 2.9215, + "step": 40593 + }, + { + "epoch": 1.99, + "grad_norm": 0.6552702784538269, + "learning_rate": 0.00015288919141681905, + "loss": 3.193, + "step": 40594 + }, + { + "epoch": 1.99, + "grad_norm": 0.66253662109375, + "learning_rate": 0.00015287577360414146, + "loss": 2.8081, + "step": 40595 + }, + { + "epoch": 1.99, + "grad_norm": 0.6897073984146118, + "learning_rate": 0.00015286235617896073, + "loss": 3.0626, + "step": 40596 + }, + { + "epoch": 1.99, + "grad_norm": 0.6485527753829956, + "learning_rate": 0.0001528489391413121, + "loss": 3.1438, + "step": 40597 + }, + { + "epoch": 1.99, + "grad_norm": 0.6479106545448303, + "learning_rate": 0.00015283552249123112, + "loss": 3.1592, + "step": 40598 + }, + { + "epoch": 1.99, + "grad_norm": 0.6787962317466736, + "learning_rate": 0.00015282210622875308, + "loss": 3.2376, + "step": 40599 + }, + { + "epoch": 1.99, + "grad_norm": 0.656581699848175, + "learning_rate": 0.00015280869035391324, + "loss": 3.0471, + "step": 40600 + }, + { + "epoch": 1.99, + "grad_norm": 0.6656479835510254, + "learning_rate": 0.00015279527486674707, + "loss": 2.8886, + "step": 40601 + }, + { + "epoch": 1.99, + "grad_norm": 0.6412723660469055, + "learning_rate": 0.00015278185976728972, + "loss": 3.0208, + "step": 40602 + }, + { + "epoch": 1.99, + "grad_norm": 0.7223705649375916, + "learning_rate": 0.0001527684450555767, + "loss": 3.1058, + "step": 40603 + }, + { + "epoch": 1.99, + "grad_norm": 0.7059385180473328, + "learning_rate": 0.00015275503073164327, + "loss": 3.0413, + "step": 40604 + }, + { + "epoch": 1.99, + "grad_norm": 0.7070484161376953, + "learning_rate": 0.00015274161679552464, + "loss": 3.0564, + "step": 40605 + }, + { + "epoch": 1.99, + "grad_norm": 0.6791709661483765, + "learning_rate": 0.00015272820324725635, + "loss": 2.8612, + "step": 40606 + }, + { + "epoch": 1.99, + "grad_norm": 0.6438567638397217, + "learning_rate": 0.00015271479008687359, + "loss": 2.9881, + "step": 40607 + }, + { + "epoch": 1.99, + "grad_norm": 0.631389856338501, + "learning_rate": 0.00015270137731441164, + "loss": 3.0258, + "step": 40608 + }, + { + "epoch": 1.99, + "grad_norm": 0.7261651754379272, + "learning_rate": 0.00015268796492990603, + "loss": 2.9647, + "step": 40609 + }, + { + "epoch": 1.99, + "grad_norm": 0.6476674675941467, + "learning_rate": 0.00015267455293339205, + "loss": 2.9719, + "step": 40610 + }, + { + "epoch": 1.99, + "grad_norm": 0.6464186310768127, + "learning_rate": 0.00015266114132490488, + "loss": 3.1635, + "step": 40611 + }, + { + "epoch": 1.99, + "grad_norm": 0.6697189211845398, + "learning_rate": 0.0001526477301044798, + "loss": 2.9862, + "step": 40612 + }, + { + "epoch": 1.99, + "grad_norm": 0.6742683053016663, + "learning_rate": 0.00015263431927215224, + "loss": 2.984, + "step": 40613 + }, + { + "epoch": 1.99, + "grad_norm": 0.6472331881523132, + "learning_rate": 0.00015262090882795765, + "loss": 3.1692, + "step": 40614 + }, + { + "epoch": 1.99, + "grad_norm": 0.6672264337539673, + "learning_rate": 0.0001526074987719311, + "loss": 2.9423, + "step": 40615 + }, + { + "epoch": 1.99, + "grad_norm": 0.6128020286560059, + "learning_rate": 0.00015259408910410816, + "loss": 3.0283, + "step": 40616 + }, + { + "epoch": 1.99, + "grad_norm": 0.665045976638794, + "learning_rate": 0.00015258067982452402, + "loss": 2.9884, + "step": 40617 + }, + { + "epoch": 1.99, + "grad_norm": 0.6567105650901794, + "learning_rate": 0.00015256727093321387, + "loss": 3.0353, + "step": 40618 + }, + { + "epoch": 1.99, + "grad_norm": 0.6515582203865051, + "learning_rate": 0.00015255386243021328, + "loss": 2.936, + "step": 40619 + }, + { + "epoch": 1.99, + "grad_norm": 0.6716775298118591, + "learning_rate": 0.0001525404543155573, + "loss": 2.9665, + "step": 40620 + }, + { + "epoch": 1.99, + "grad_norm": 0.651340901851654, + "learning_rate": 0.00015252704658928152, + "loss": 3.1269, + "step": 40621 + }, + { + "epoch": 1.99, + "grad_norm": 0.6484601497650146, + "learning_rate": 0.00015251363925142098, + "loss": 3.0912, + "step": 40622 + }, + { + "epoch": 1.99, + "grad_norm": 0.6475279331207275, + "learning_rate": 0.00015250023230201114, + "loss": 2.8503, + "step": 40623 + }, + { + "epoch": 1.99, + "grad_norm": 0.653603732585907, + "learning_rate": 0.00015248682574108745, + "loss": 2.9839, + "step": 40624 + }, + { + "epoch": 1.99, + "grad_norm": 0.6498973965644836, + "learning_rate": 0.00015247341956868505, + "loss": 2.8576, + "step": 40625 + }, + { + "epoch": 1.99, + "grad_norm": 0.6715899705886841, + "learning_rate": 0.00015246001378483925, + "loss": 2.9632, + "step": 40626 + }, + { + "epoch": 1.99, + "grad_norm": 0.6394970417022705, + "learning_rate": 0.00015244660838958528, + "loss": 2.8658, + "step": 40627 + }, + { + "epoch": 1.99, + "grad_norm": 0.6495617032051086, + "learning_rate": 0.00015243320338295856, + "loss": 2.9026, + "step": 40628 + }, + { + "epoch": 1.99, + "grad_norm": 0.710526168346405, + "learning_rate": 0.00015241979876499446, + "loss": 3.0276, + "step": 40629 + }, + { + "epoch": 1.99, + "grad_norm": 0.6743832230567932, + "learning_rate": 0.00015240639453572812, + "loss": 2.8014, + "step": 40630 + }, + { + "epoch": 1.99, + "grad_norm": 0.6878013610839844, + "learning_rate": 0.00015239299069519504, + "loss": 2.8822, + "step": 40631 + }, + { + "epoch": 1.99, + "grad_norm": 0.6774446368217468, + "learning_rate": 0.00015237958724343044, + "loss": 2.9242, + "step": 40632 + }, + { + "epoch": 1.99, + "grad_norm": 0.6619464159011841, + "learning_rate": 0.00015236618418046943, + "loss": 3.0985, + "step": 40633 + }, + { + "epoch": 1.99, + "grad_norm": 0.7469950914382935, + "learning_rate": 0.00015235278150634764, + "loss": 3.0865, + "step": 40634 + }, + { + "epoch": 1.99, + "grad_norm": 0.6729745864868164, + "learning_rate": 0.0001523393792211001, + "loss": 2.9876, + "step": 40635 + }, + { + "epoch": 1.99, + "grad_norm": 0.665226936340332, + "learning_rate": 0.00015232597732476227, + "loss": 2.9907, + "step": 40636 + }, + { + "epoch": 1.99, + "grad_norm": 0.6587421298027039, + "learning_rate": 0.00015231257581736933, + "loss": 3.2124, + "step": 40637 + }, + { + "epoch": 1.99, + "grad_norm": 0.6516327261924744, + "learning_rate": 0.00015229917469895664, + "loss": 3.0794, + "step": 40638 + }, + { + "epoch": 1.99, + "grad_norm": 1.108006477355957, + "learning_rate": 0.00015228577396955974, + "loss": 2.8191, + "step": 40639 + }, + { + "epoch": 1.99, + "grad_norm": 0.7497507929801941, + "learning_rate": 0.0001522723736292134, + "loss": 2.9436, + "step": 40640 + }, + { + "epoch": 1.99, + "grad_norm": 0.6455642580986023, + "learning_rate": 0.0001522589736779534, + "loss": 3.1215, + "step": 40641 + }, + { + "epoch": 1.99, + "grad_norm": 0.6802120804786682, + "learning_rate": 0.00015224557411581464, + "loss": 2.6826, + "step": 40642 + }, + { + "epoch": 1.99, + "grad_norm": 0.6976883411407471, + "learning_rate": 0.0001522321749428326, + "loss": 3.0047, + "step": 40643 + }, + { + "epoch": 1.99, + "grad_norm": 0.6384730339050293, + "learning_rate": 0.0001522187761590427, + "loss": 2.865, + "step": 40644 + }, + { + "epoch": 1.99, + "grad_norm": 0.635405957698822, + "learning_rate": 0.00015220537776448, + "loss": 2.9101, + "step": 40645 + }, + { + "epoch": 1.99, + "grad_norm": 0.6480907797813416, + "learning_rate": 0.00015219197975918002, + "loss": 3.0178, + "step": 40646 + }, + { + "epoch": 1.99, + "grad_norm": 0.6745615005493164, + "learning_rate": 0.00015217858214317792, + "loss": 2.8982, + "step": 40647 + }, + { + "epoch": 1.99, + "grad_norm": 0.658542811870575, + "learning_rate": 0.0001521651849165088, + "loss": 3.2529, + "step": 40648 + }, + { + "epoch": 1.99, + "grad_norm": 0.6688123941421509, + "learning_rate": 0.00015215178807920834, + "loss": 3.1468, + "step": 40649 + }, + { + "epoch": 1.99, + "grad_norm": 0.6520625352859497, + "learning_rate": 0.00015213839163131146, + "loss": 3.004, + "step": 40650 + }, + { + "epoch": 1.99, + "grad_norm": 0.6656911969184875, + "learning_rate": 0.00015212499557285374, + "loss": 3.0078, + "step": 40651 + }, + { + "epoch": 1.99, + "grad_norm": 0.6434037685394287, + "learning_rate": 0.00015211159990387015, + "loss": 3.0344, + "step": 40652 + }, + { + "epoch": 1.99, + "grad_norm": 0.6860767006874084, + "learning_rate": 0.0001520982046243963, + "loss": 3.1034, + "step": 40653 + }, + { + "epoch": 1.99, + "grad_norm": 0.6536909341812134, + "learning_rate": 0.00015208480973446733, + "loss": 3.039, + "step": 40654 + }, + { + "epoch": 1.99, + "grad_norm": 0.6466705203056335, + "learning_rate": 0.00015207141523411834, + "loss": 3.0428, + "step": 40655 + }, + { + "epoch": 1.99, + "grad_norm": 0.65459805727005, + "learning_rate": 0.00015205802112338492, + "loss": 2.9133, + "step": 40656 + }, + { + "epoch": 1.99, + "grad_norm": 0.6901729106903076, + "learning_rate": 0.0001520446274023021, + "loss": 3.2814, + "step": 40657 + }, + { + "epoch": 1.99, + "grad_norm": 0.6484500765800476, + "learning_rate": 0.00015203123407090522, + "loss": 2.9905, + "step": 40658 + }, + { + "epoch": 1.99, + "grad_norm": 0.6647442579269409, + "learning_rate": 0.00015201784112922975, + "loss": 3.1639, + "step": 40659 + }, + { + "epoch": 1.99, + "grad_norm": 0.6574181914329529, + "learning_rate": 0.00015200444857731062, + "loss": 2.8888, + "step": 40660 + }, + { + "epoch": 1.99, + "grad_norm": 0.6742486953735352, + "learning_rate": 0.0001519910564151835, + "loss": 2.9154, + "step": 40661 + }, + { + "epoch": 1.99, + "grad_norm": 0.6795637607574463, + "learning_rate": 0.00015197766464288337, + "loss": 2.9062, + "step": 40662 + }, + { + "epoch": 1.99, + "grad_norm": 0.642927885055542, + "learning_rate": 0.00015196427326044553, + "loss": 3.0751, + "step": 40663 + }, + { + "epoch": 1.99, + "grad_norm": 0.6584546566009521, + "learning_rate": 0.00015195088226790536, + "loss": 3.1254, + "step": 40664 + }, + { + "epoch": 1.99, + "grad_norm": 0.6724053621292114, + "learning_rate": 0.00015193749166529804, + "loss": 3.3392, + "step": 40665 + }, + { + "epoch": 1.99, + "grad_norm": 0.6559529900550842, + "learning_rate": 0.00015192410145265891, + "loss": 3.0559, + "step": 40666 + }, + { + "epoch": 1.99, + "grad_norm": 0.6671188473701477, + "learning_rate": 0.00015191071163002308, + "loss": 3.1581, + "step": 40667 + }, + { + "epoch": 1.99, + "grad_norm": 0.6583215594291687, + "learning_rate": 0.0001518973221974261, + "loss": 2.9865, + "step": 40668 + }, + { + "epoch": 1.99, + "grad_norm": 0.7388840317726135, + "learning_rate": 0.00015188393315490308, + "loss": 2.8475, + "step": 40669 + }, + { + "epoch": 1.99, + "grad_norm": 0.6607951521873474, + "learning_rate": 0.0001518705445024891, + "loss": 3.153, + "step": 40670 + }, + { + "epoch": 1.99, + "grad_norm": 0.6548836827278137, + "learning_rate": 0.00015185715624021977, + "loss": 2.8508, + "step": 40671 + }, + { + "epoch": 1.99, + "grad_norm": 0.741969883441925, + "learning_rate": 0.00015184376836813003, + "loss": 3.0824, + "step": 40672 + }, + { + "epoch": 1.99, + "grad_norm": 0.6918534636497498, + "learning_rate": 0.0001518303808862554, + "loss": 2.9278, + "step": 40673 + }, + { + "epoch": 1.99, + "grad_norm": 0.628976583480835, + "learning_rate": 0.00015181699379463094, + "loss": 2.9431, + "step": 40674 + }, + { + "epoch": 1.99, + "grad_norm": 0.6541244983673096, + "learning_rate": 0.0001518036070932921, + "loss": 2.7107, + "step": 40675 + }, + { + "epoch": 1.99, + "grad_norm": 0.6655003428459167, + "learning_rate": 0.00015179022078227406, + "loss": 3.0281, + "step": 40676 + }, + { + "epoch": 1.99, + "grad_norm": 0.6962637901306152, + "learning_rate": 0.00015177683486161195, + "loss": 3.0287, + "step": 40677 + }, + { + "epoch": 1.99, + "grad_norm": 0.6648634672164917, + "learning_rate": 0.00015176344933134117, + "loss": 3.0775, + "step": 40678 + }, + { + "epoch": 1.99, + "grad_norm": 0.6377243995666504, + "learning_rate": 0.0001517500641914969, + "loss": 2.7844, + "step": 40679 + }, + { + "epoch": 1.99, + "grad_norm": 0.6637604236602783, + "learning_rate": 0.0001517366794421144, + "loss": 3.2034, + "step": 40680 + }, + { + "epoch": 1.99, + "grad_norm": 0.6475324630737305, + "learning_rate": 0.00015172329508322908, + "loss": 3.0258, + "step": 40681 + }, + { + "epoch": 1.99, + "grad_norm": 0.7287402749061584, + "learning_rate": 0.0001517099111148759, + "loss": 2.7741, + "step": 40682 + }, + { + "epoch": 1.99, + "grad_norm": 0.6636556386947632, + "learning_rate": 0.00015169652753709045, + "loss": 2.9621, + "step": 40683 + }, + { + "epoch": 1.99, + "grad_norm": 0.6482304334640503, + "learning_rate": 0.0001516831443499078, + "loss": 2.9935, + "step": 40684 + }, + { + "epoch": 1.99, + "grad_norm": 0.6574996709823608, + "learning_rate": 0.00015166976155336304, + "loss": 2.9147, + "step": 40685 + }, + { + "epoch": 1.99, + "grad_norm": 0.6734505891799927, + "learning_rate": 0.0001516563791474917, + "loss": 3.0778, + "step": 40686 + }, + { + "epoch": 1.99, + "grad_norm": 0.6687676906585693, + "learning_rate": 0.00015164299713232886, + "loss": 2.9375, + "step": 40687 + }, + { + "epoch": 1.99, + "grad_norm": 0.6934749484062195, + "learning_rate": 0.00015162961550790988, + "loss": 3.0035, + "step": 40688 + }, + { + "epoch": 1.99, + "grad_norm": 0.7008736729621887, + "learning_rate": 0.00015161623427426983, + "loss": 2.7388, + "step": 40689 + }, + { + "epoch": 1.99, + "grad_norm": 0.6899448037147522, + "learning_rate": 0.00015160285343144417, + "loss": 2.8754, + "step": 40690 + }, + { + "epoch": 1.99, + "grad_norm": 0.6953763961791992, + "learning_rate": 0.00015158947297946805, + "loss": 3.1054, + "step": 40691 + }, + { + "epoch": 1.99, + "grad_norm": 0.7021351456642151, + "learning_rate": 0.00015157609291837658, + "loss": 2.9845, + "step": 40692 + }, + { + "epoch": 1.99, + "grad_norm": 0.657197892665863, + "learning_rate": 0.00015156271324820523, + "loss": 2.8504, + "step": 40693 + }, + { + "epoch": 1.99, + "grad_norm": 0.6748011708259583, + "learning_rate": 0.000151549333968989, + "loss": 2.7495, + "step": 40694 + }, + { + "epoch": 1.99, + "grad_norm": 0.6754131317138672, + "learning_rate": 0.00015153595508076326, + "loss": 2.8573, + "step": 40695 + }, + { + "epoch": 1.99, + "grad_norm": 0.6298975944519043, + "learning_rate": 0.00015152257658356336, + "loss": 2.8821, + "step": 40696 + }, + { + "epoch": 1.99, + "grad_norm": 0.6881123185157776, + "learning_rate": 0.0001515091984774244, + "loss": 2.8627, + "step": 40697 + }, + { + "epoch": 1.99, + "grad_norm": 0.6599323153495789, + "learning_rate": 0.00015149582076238164, + "loss": 2.8482, + "step": 40698 + }, + { + "epoch": 1.99, + "grad_norm": 0.640341579914093, + "learning_rate": 0.00015148244343847025, + "loss": 3.0448, + "step": 40699 + }, + { + "epoch": 1.99, + "grad_norm": 0.7159940004348755, + "learning_rate": 0.00015146906650572546, + "loss": 3.1676, + "step": 40700 + }, + { + "epoch": 1.99, + "grad_norm": 0.6907761096954346, + "learning_rate": 0.00015145568996418268, + "loss": 2.9688, + "step": 40701 + }, + { + "epoch": 1.99, + "grad_norm": 0.6505234837532043, + "learning_rate": 0.00015144231381387693, + "loss": 2.9647, + "step": 40702 + }, + { + "epoch": 1.99, + "grad_norm": 0.6479827761650085, + "learning_rate": 0.00015142893805484363, + "loss": 3.0348, + "step": 40703 + }, + { + "epoch": 1.99, + "grad_norm": 0.7323071956634521, + "learning_rate": 0.00015141556268711783, + "loss": 2.7454, + "step": 40704 + }, + { + "epoch": 1.99, + "grad_norm": 0.6864801049232483, + "learning_rate": 0.00015140218771073495, + "loss": 2.9794, + "step": 40705 + }, + { + "epoch": 1.99, + "grad_norm": 0.675094485282898, + "learning_rate": 0.00015138881312573007, + "loss": 2.9768, + "step": 40706 + }, + { + "epoch": 1.99, + "grad_norm": 0.6760757565498352, + "learning_rate": 0.00015137543893213836, + "loss": 2.6509, + "step": 40707 + }, + { + "epoch": 2.0, + "grad_norm": 0.6964802145957947, + "learning_rate": 0.00015136206512999527, + "loss": 2.8603, + "step": 40708 + }, + { + "epoch": 2.0, + "grad_norm": 0.6976985931396484, + "learning_rate": 0.00015134869171933577, + "loss": 2.8392, + "step": 40709 + }, + { + "epoch": 2.0, + "grad_norm": 0.6337934136390686, + "learning_rate": 0.00015133531870019525, + "loss": 3.0877, + "step": 40710 + }, + { + "epoch": 2.0, + "grad_norm": 0.6604355573654175, + "learning_rate": 0.00015132194607260892, + "loss": 2.9733, + "step": 40711 + }, + { + "epoch": 2.0, + "grad_norm": 0.6681331992149353, + "learning_rate": 0.00015130857383661203, + "loss": 2.9892, + "step": 40712 + }, + { + "epoch": 2.0, + "grad_norm": 0.7403008937835693, + "learning_rate": 0.00015129520199223974, + "loss": 2.8967, + "step": 40713 + }, + { + "epoch": 2.0, + "grad_norm": 0.6657782196998596, + "learning_rate": 0.00015128183053952714, + "loss": 2.9276, + "step": 40714 + }, + { + "epoch": 2.0, + "grad_norm": 0.6958048343658447, + "learning_rate": 0.00015126845947850955, + "loss": 2.9373, + "step": 40715 + }, + { + "epoch": 2.0, + "grad_norm": 0.6533171534538269, + "learning_rate": 0.00015125508880922236, + "loss": 3.2072, + "step": 40716 + }, + { + "epoch": 2.0, + "grad_norm": 0.6738389730453491, + "learning_rate": 0.00015124171853170054, + "loss": 3.0335, + "step": 40717 + }, + { + "epoch": 2.0, + "grad_norm": 0.6788296699523926, + "learning_rate": 0.0001512283486459795, + "loss": 3.0401, + "step": 40718 + }, + { + "epoch": 2.0, + "grad_norm": 0.6417179703712463, + "learning_rate": 0.00015121497915209436, + "loss": 2.66, + "step": 40719 + }, + { + "epoch": 2.0, + "grad_norm": 0.6341338157653809, + "learning_rate": 0.00015120161005008017, + "loss": 3.0029, + "step": 40720 + }, + { + "epoch": 2.0, + "grad_norm": 0.6832126975059509, + "learning_rate": 0.0001511882413399725, + "loss": 2.7799, + "step": 40721 + }, + { + "epoch": 2.0, + "grad_norm": 0.7208033800125122, + "learning_rate": 0.0001511748730218062, + "loss": 2.8199, + "step": 40722 + }, + { + "epoch": 2.0, + "grad_norm": 0.7194510698318481, + "learning_rate": 0.00015116150509561675, + "loss": 3.077, + "step": 40723 + }, + { + "epoch": 2.0, + "grad_norm": 0.665046751499176, + "learning_rate": 0.00015114813756143917, + "loss": 3.0192, + "step": 40724 + }, + { + "epoch": 2.0, + "grad_norm": 0.6552749872207642, + "learning_rate": 0.0001511347704193087, + "loss": 3.1042, + "step": 40725 + }, + { + "epoch": 2.0, + "grad_norm": 0.7193647623062134, + "learning_rate": 0.00015112140366926074, + "loss": 3.11, + "step": 40726 + }, + { + "epoch": 2.0, + "grad_norm": 0.718508243560791, + "learning_rate": 0.00015110803731133038, + "loss": 2.9426, + "step": 40727 + }, + { + "epoch": 2.0, + "grad_norm": 0.6804858446121216, + "learning_rate": 0.00015109467134555277, + "loss": 3.0969, + "step": 40728 + }, + { + "epoch": 2.0, + "grad_norm": 0.6869458556175232, + "learning_rate": 0.00015108130577196302, + "loss": 3.13, + "step": 40729 + }, + { + "epoch": 2.0, + "grad_norm": 0.669521152973175, + "learning_rate": 0.00015106794059059647, + "loss": 2.9521, + "step": 40730 + }, + { + "epoch": 2.0, + "grad_norm": 0.6213893294334412, + "learning_rate": 0.00015105457580148838, + "loss": 3.031, + "step": 40731 + }, + { + "epoch": 2.0, + "grad_norm": 0.6913130879402161, + "learning_rate": 0.00015104121140467378, + "loss": 2.8968, + "step": 40732 + }, + { + "epoch": 2.0, + "grad_norm": 0.676311731338501, + "learning_rate": 0.00015102784740018803, + "loss": 3.1907, + "step": 40733 + }, + { + "epoch": 2.0, + "grad_norm": 0.6928296685218811, + "learning_rate": 0.00015101448378806632, + "loss": 2.9377, + "step": 40734 + }, + { + "epoch": 2.0, + "grad_norm": 0.6946173906326294, + "learning_rate": 0.00015100112056834367, + "loss": 3.0254, + "step": 40735 + }, + { + "epoch": 2.0, + "grad_norm": 0.6546354293823242, + "learning_rate": 0.0001509877577410555, + "loss": 3.2543, + "step": 40736 + }, + { + "epoch": 2.0, + "grad_norm": 0.707718551158905, + "learning_rate": 0.00015097439530623675, + "loss": 2.915, + "step": 40737 + }, + { + "epoch": 2.0, + "grad_norm": 0.6598696708679199, + "learning_rate": 0.00015096103326392291, + "loss": 2.7935, + "step": 40738 + }, + { + "epoch": 2.0, + "grad_norm": 0.626047670841217, + "learning_rate": 0.0001509476716141489, + "loss": 3.126, + "step": 40739 + }, + { + "epoch": 2.0, + "grad_norm": 0.6716310977935791, + "learning_rate": 0.00015093431035695013, + "loss": 3.1383, + "step": 40740 + }, + { + "epoch": 2.0, + "grad_norm": 0.6620728969573975, + "learning_rate": 0.0001509209494923617, + "loss": 3.0502, + "step": 40741 + }, + { + "epoch": 2.0, + "grad_norm": 0.6665850877761841, + "learning_rate": 0.00015090758902041874, + "loss": 3.2004, + "step": 40742 + }, + { + "epoch": 2.0, + "grad_norm": 0.6744237542152405, + "learning_rate": 0.00015089422894115656, + "loss": 3.0869, + "step": 40743 + }, + { + "epoch": 2.0, + "grad_norm": 0.6411026120185852, + "learning_rate": 0.00015088086925461018, + "loss": 3.0758, + "step": 40744 + }, + { + "epoch": 2.0, + "grad_norm": 0.6751980781555176, + "learning_rate": 0.00015086750996081493, + "loss": 2.9283, + "step": 40745 + }, + { + "epoch": 2.0, + "grad_norm": 0.6658977270126343, + "learning_rate": 0.000150854151059806, + "loss": 3.2339, + "step": 40746 + }, + { + "epoch": 2.0, + "grad_norm": 0.6690980195999146, + "learning_rate": 0.0001508407925516185, + "loss": 2.8654, + "step": 40747 + }, + { + "epoch": 2.0, + "grad_norm": 0.7223593592643738, + "learning_rate": 0.0001508274344362877, + "loss": 2.8638, + "step": 40748 + }, + { + "epoch": 2.0, + "grad_norm": 0.673263669013977, + "learning_rate": 0.00015081407671384874, + "loss": 2.9679, + "step": 40749 + }, + { + "epoch": 2.0, + "grad_norm": 0.706080436706543, + "learning_rate": 0.0001508007193843368, + "loss": 3.065, + "step": 40750 + }, + { + "epoch": 2.0, + "grad_norm": 0.6620662212371826, + "learning_rate": 0.00015078736244778692, + "loss": 2.858, + "step": 40751 + }, + { + "epoch": 2.0, + "grad_norm": 0.6945991516113281, + "learning_rate": 0.00015077400590423443, + "loss": 2.972, + "step": 40752 + }, + { + "epoch": 2.0, + "grad_norm": 0.6781968474388123, + "learning_rate": 0.0001507606497537146, + "loss": 2.8857, + "step": 40753 + }, + { + "epoch": 2.0, + "grad_norm": 0.6542848348617554, + "learning_rate": 0.00015074729399626243, + "loss": 2.7439, + "step": 40754 + }, + { + "epoch": 2.0, + "grad_norm": 0.6352688670158386, + "learning_rate": 0.0001507339386319132, + "loss": 2.9773, + "step": 40755 + }, + { + "epoch": 2.0, + "grad_norm": 0.6776319742202759, + "learning_rate": 0.00015072058366070207, + "loss": 2.7851, + "step": 40756 + }, + { + "epoch": 2.0, + "grad_norm": 0.6314222812652588, + "learning_rate": 0.00015070722908266409, + "loss": 2.9906, + "step": 40757 + }, + { + "epoch": 2.0, + "grad_norm": 0.6974877715110779, + "learning_rate": 0.00015069387489783466, + "loss": 2.8347, + "step": 40758 + }, + { + "epoch": 2.0, + "grad_norm": 0.6645646691322327, + "learning_rate": 0.0001506805211062487, + "loss": 2.8755, + "step": 40759 + }, + { + "epoch": 2.0, + "grad_norm": 0.6492670774459839, + "learning_rate": 0.00015066716770794164, + "loss": 2.9298, + "step": 40760 + }, + { + "epoch": 2.0, + "grad_norm": 0.6646486520767212, + "learning_rate": 0.00015065381470294838, + "loss": 3.1262, + "step": 40761 + }, + { + "epoch": 2.0, + "grad_norm": 0.6600433588027954, + "learning_rate": 0.00015064046209130425, + "loss": 3.1513, + "step": 40762 + }, + { + "epoch": 2.0, + "grad_norm": 0.6794731616973877, + "learning_rate": 0.0001506271098730445, + "loss": 2.994, + "step": 40763 + }, + { + "epoch": 2.0, + "grad_norm": 0.6728107929229736, + "learning_rate": 0.00015061375804820422, + "loss": 3.0659, + "step": 40764 + }, + { + "epoch": 2.0, + "grad_norm": 0.6461384296417236, + "learning_rate": 0.00015060040661681853, + "loss": 2.8795, + "step": 40765 + }, + { + "epoch": 2.0, + "grad_norm": 0.6834413409233093, + "learning_rate": 0.0001505870555789225, + "loss": 2.884, + "step": 40766 + }, + { + "epoch": 2.0, + "grad_norm": 0.6609336733818054, + "learning_rate": 0.00015057370493455142, + "loss": 2.9974, + "step": 40767 + }, + { + "epoch": 2.0, + "grad_norm": 0.6304715275764465, + "learning_rate": 0.00015056035468374054, + "loss": 2.9703, + "step": 40768 + }, + { + "epoch": 2.0, + "grad_norm": 0.6766150593757629, + "learning_rate": 0.00015054700482652482, + "loss": 2.9168, + "step": 40769 + }, + { + "epoch": 2.0, + "grad_norm": 0.6861264705657959, + "learning_rate": 0.00015053365536293967, + "loss": 2.6705, + "step": 40770 + }, + { + "epoch": 2.0, + "grad_norm": 0.7163991928100586, + "learning_rate": 0.00015052030629302007, + "loss": 2.8756, + "step": 40771 + }, + { + "epoch": 2.0, + "grad_norm": 0.6539211869239807, + "learning_rate": 0.00015050695761680108, + "loss": 2.8508, + "step": 40772 + }, + { + "epoch": 2.0, + "grad_norm": 0.6503493785858154, + "learning_rate": 0.00015049360933431816, + "loss": 2.8387, + "step": 40773 + }, + { + "epoch": 2.0, + "grad_norm": 0.6681285500526428, + "learning_rate": 0.00015048026144560617, + "loss": 3.1689, + "step": 40774 + }, + { + "epoch": 2.0, + "grad_norm": 0.6761608719825745, + "learning_rate": 0.0001504669139507005, + "loss": 2.8518, + "step": 40775 + }, + { + "epoch": 2.0, + "grad_norm": 0.7086718082427979, + "learning_rate": 0.00015045356684963612, + "loss": 3.0821, + "step": 40776 + }, + { + "epoch": 2.0, + "grad_norm": 0.6684795618057251, + "learning_rate": 0.00015044022014244835, + "loss": 3.0127, + "step": 40777 + }, + { + "epoch": 2.0, + "grad_norm": 0.6545731425285339, + "learning_rate": 0.00015042687382917224, + "loss": 3.1053, + "step": 40778 + }, + { + "epoch": 2.0, + "grad_norm": 0.7030220627784729, + "learning_rate": 0.00015041352790984286, + "loss": 2.9971, + "step": 40779 + }, + { + "epoch": 2.0, + "grad_norm": 0.6445237994194031, + "learning_rate": 0.00015040018238449558, + "loss": 2.959, + "step": 40780 + }, + { + "epoch": 2.0, + "grad_norm": 0.702018678188324, + "learning_rate": 0.0001503868372531653, + "loss": 2.873, + "step": 40781 + }, + { + "epoch": 2.0, + "grad_norm": 0.6564288139343262, + "learning_rate": 0.00015037349251588732, + "loss": 2.7837, + "step": 40782 + }, + { + "epoch": 2.0, + "grad_norm": 0.732306957244873, + "learning_rate": 0.00015036014817269687, + "loss": 3.0326, + "step": 40783 + }, + { + "epoch": 2.0, + "grad_norm": 0.6501794457435608, + "learning_rate": 0.00015034680422362884, + "loss": 2.9075, + "step": 40784 + }, + { + "epoch": 2.0, + "grad_norm": 0.6539938449859619, + "learning_rate": 0.00015033346066871866, + "loss": 2.8234, + "step": 40785 + }, + { + "epoch": 2.0, + "grad_norm": 0.6749308705329895, + "learning_rate": 0.0001503201175080014, + "loss": 3.2814, + "step": 40786 + }, + { + "epoch": 2.0, + "grad_norm": 0.6837059855461121, + "learning_rate": 0.00015030677474151195, + "loss": 2.97, + "step": 40787 + }, + { + "epoch": 2.0, + "grad_norm": 0.6461108922958374, + "learning_rate": 0.00015029343236928582, + "loss": 3.1164, + "step": 40788 + }, + { + "epoch": 2.0, + "grad_norm": 0.6855670213699341, + "learning_rate": 0.00015028009039135781, + "loss": 2.8901, + "step": 40789 + }, + { + "epoch": 2.0, + "grad_norm": 0.683847188949585, + "learning_rate": 0.00015026674880776338, + "loss": 2.9545, + "step": 40790 + }, + { + "epoch": 2.0, + "grad_norm": 0.6807911396026611, + "learning_rate": 0.00015025340761853738, + "loss": 2.9702, + "step": 40791 + }, + { + "epoch": 2.0, + "grad_norm": 0.689594566822052, + "learning_rate": 0.00015024006682371523, + "loss": 3.1826, + "step": 40792 + }, + { + "epoch": 2.0, + "grad_norm": 0.667519211769104, + "learning_rate": 0.00015022672642333188, + "loss": 2.9398, + "step": 40793 + }, + { + "epoch": 2.0, + "grad_norm": 0.7207837700843811, + "learning_rate": 0.00015021338641742245, + "loss": 2.7841, + "step": 40794 + }, + { + "epoch": 2.0, + "grad_norm": 0.661036491394043, + "learning_rate": 0.00015020004680602218, + "loss": 3.011, + "step": 40795 + }, + { + "epoch": 2.0, + "grad_norm": 0.72581946849823, + "learning_rate": 0.00015018670758916607, + "loss": 3.1146, + "step": 40796 + }, + { + "epoch": 2.0, + "grad_norm": 0.6660535335540771, + "learning_rate": 0.00015017336876688934, + "loss": 3.1452, + "step": 40797 + }, + { + "epoch": 2.0, + "grad_norm": 0.6611814498901367, + "learning_rate": 0.00015016003033922726, + "loss": 2.895, + "step": 40798 + }, + { + "epoch": 2.0, + "grad_norm": 0.6168580651283264, + "learning_rate": 0.00015014669230621483, + "loss": 3.0379, + "step": 40799 + }, + { + "epoch": 2.0, + "grad_norm": 0.6940125226974487, + "learning_rate": 0.00015013335466788712, + "loss": 2.9946, + "step": 40800 + }, + { + "epoch": 2.0, + "grad_norm": 0.6904090642929077, + "learning_rate": 0.00015012001742427923, + "loss": 2.9315, + "step": 40801 + }, + { + "epoch": 2.0, + "grad_norm": 0.6492003202438354, + "learning_rate": 0.00015010668057542637, + "loss": 2.9914, + "step": 40802 + }, + { + "epoch": 2.0, + "grad_norm": 0.652450442314148, + "learning_rate": 0.00015009334412136377, + "loss": 3.0156, + "step": 40803 + }, + { + "epoch": 2.0, + "grad_norm": 0.6898772716522217, + "learning_rate": 0.00015008000806212635, + "loss": 3.0286, + "step": 40804 + }, + { + "epoch": 2.0, + "grad_norm": 0.6315729022026062, + "learning_rate": 0.00015006667239774943, + "loss": 2.6668, + "step": 40805 + }, + { + "epoch": 2.0, + "grad_norm": 0.6902825832366943, + "learning_rate": 0.00015005333712826794, + "loss": 2.9763, + "step": 40806 + }, + { + "epoch": 2.0, + "grad_norm": 0.6964982151985168, + "learning_rate": 0.0001500400022537172, + "loss": 2.8922, + "step": 40807 + }, + { + "epoch": 2.0, + "grad_norm": 0.6511028409004211, + "learning_rate": 0.00015002666777413229, + "loss": 3.2217, + "step": 40808 + }, + { + "epoch": 2.0, + "grad_norm": 0.6448655128479004, + "learning_rate": 0.0001500133336895481, + "loss": 3.1245, + "step": 40809 + }, + { + "epoch": 2.0, + "grad_norm": 0.8458114862442017, + "learning_rate": 0.00015000000000000004, + "loss": 2.9976, + "step": 40810 + }, + { + "epoch": 2.0, + "grad_norm": 0.6726029515266418, + "learning_rate": 0.00014998666670552302, + "loss": 2.8374, + "step": 40811 + }, + { + "epoch": 2.0, + "grad_norm": 0.6877138614654541, + "learning_rate": 0.00014997333380615223, + "loss": 2.915, + "step": 40812 + }, + { + "epoch": 2.0, + "grad_norm": 0.6989501714706421, + "learning_rate": 0.00014996000130192293, + "loss": 2.9306, + "step": 40813 + }, + { + "epoch": 2.0, + "grad_norm": 0.6564236283302307, + "learning_rate": 0.00014994666919287012, + "loss": 2.9193, + "step": 40814 + }, + { + "epoch": 2.0, + "grad_norm": 0.6724837422370911, + "learning_rate": 0.0001499333374790289, + "loss": 2.9177, + "step": 40815 + }, + { + "epoch": 2.0, + "grad_norm": 0.6566126346588135, + "learning_rate": 0.00014992000616043428, + "loss": 3.009, + "step": 40816 + }, + { + "epoch": 2.0, + "grad_norm": 0.6281498074531555, + "learning_rate": 0.00014990667523712145, + "loss": 3.0989, + "step": 40817 + }, + { + "epoch": 2.0, + "grad_norm": 0.6152768731117249, + "learning_rate": 0.0001498933447091257, + "loss": 2.8553, + "step": 40818 + }, + { + "epoch": 2.0, + "grad_norm": 0.649753749370575, + "learning_rate": 0.00014988001457648186, + "loss": 2.8566, + "step": 40819 + }, + { + "epoch": 2.0, + "grad_norm": 0.6632745862007141, + "learning_rate": 0.0001498666848392253, + "loss": 2.9984, + "step": 40820 + }, + { + "epoch": 2.0, + "grad_norm": 0.6726300120353699, + "learning_rate": 0.00014985335549739097, + "loss": 2.6237, + "step": 40821 + }, + { + "epoch": 2.0, + "grad_norm": 0.6407365202903748, + "learning_rate": 0.00014984002655101393, + "loss": 2.9663, + "step": 40822 + }, + { + "epoch": 2.0, + "grad_norm": 0.6597623825073242, + "learning_rate": 0.00014982669800012944, + "loss": 2.8987, + "step": 40823 + }, + { + "epoch": 2.0, + "grad_norm": 0.689781904220581, + "learning_rate": 0.00014981336984477243, + "loss": 2.9595, + "step": 40824 + }, + { + "epoch": 2.0, + "grad_norm": 0.6714945435523987, + "learning_rate": 0.0001498000420849782, + "loss": 2.9263, + "step": 40825 + }, + { + "epoch": 2.0, + "grad_norm": 0.635680615901947, + "learning_rate": 0.00014978671472078167, + "loss": 2.8758, + "step": 40826 + }, + { + "epoch": 2.0, + "grad_norm": 0.6559689044952393, + "learning_rate": 0.0001497733877522181, + "loss": 3.0803, + "step": 40827 + }, + { + "epoch": 2.0, + "grad_norm": 0.6731073260307312, + "learning_rate": 0.00014976006117932243, + "loss": 2.9924, + "step": 40828 + }, + { + "epoch": 2.0, + "grad_norm": 0.6569121479988098, + "learning_rate": 0.00014974673500212996, + "loss": 2.9098, + "step": 40829 + }, + { + "epoch": 2.0, + "grad_norm": 0.6949353814125061, + "learning_rate": 0.00014973340922067564, + "loss": 2.73, + "step": 40830 + }, + { + "epoch": 2.0, + "grad_norm": 0.6641110777854919, + "learning_rate": 0.0001497200838349945, + "loss": 3.1152, + "step": 40831 + }, + { + "epoch": 2.0, + "grad_norm": 0.7344586253166199, + "learning_rate": 0.00014970675884512188, + "loss": 2.8915, + "step": 40832 + }, + { + "epoch": 2.0, + "grad_norm": 0.7318583726882935, + "learning_rate": 0.00014969343425109258, + "loss": 2.968, + "step": 40833 + }, + { + "epoch": 2.0, + "grad_norm": 0.7301783561706543, + "learning_rate": 0.00014968011005294187, + "loss": 3.0687, + "step": 40834 + }, + { + "epoch": 2.0, + "grad_norm": 0.6690212488174438, + "learning_rate": 0.00014966678625070494, + "loss": 2.7896, + "step": 40835 + }, + { + "epoch": 2.0, + "grad_norm": 0.6483722925186157, + "learning_rate": 0.00014965346284441674, + "loss": 2.9243, + "step": 40836 + }, + { + "epoch": 2.0, + "grad_norm": 0.6564923524856567, + "learning_rate": 0.00014964013983411237, + "loss": 2.853, + "step": 40837 + }, + { + "epoch": 2.0, + "grad_norm": 0.6334251165390015, + "learning_rate": 0.00014962681721982686, + "loss": 2.8823, + "step": 40838 + }, + { + "epoch": 2.0, + "grad_norm": 0.7064406275749207, + "learning_rate": 0.00014961349500159537, + "loss": 2.8174, + "step": 40839 + }, + { + "epoch": 2.0, + "grad_norm": 0.7101202607154846, + "learning_rate": 0.0001496001731794531, + "loss": 3.037, + "step": 40840 + }, + { + "epoch": 2.0, + "grad_norm": 0.688157320022583, + "learning_rate": 0.0001495868517534349, + "loss": 2.8154, + "step": 40841 + }, + { + "epoch": 2.0, + "grad_norm": 0.6820224523544312, + "learning_rate": 0.0001495735307235761, + "loss": 2.9316, + "step": 40842 + }, + { + "epoch": 2.0, + "grad_norm": 0.6670019030570984, + "learning_rate": 0.0001495602100899117, + "loss": 2.9859, + "step": 40843 + }, + { + "epoch": 2.0, + "grad_norm": 0.7046992778778076, + "learning_rate": 0.0001495468898524766, + "loss": 2.7433, + "step": 40844 + }, + { + "epoch": 2.0, + "grad_norm": 0.6730843782424927, + "learning_rate": 0.00014953357001130617, + "loss": 2.7432, + "step": 40845 + }, + { + "epoch": 2.0, + "grad_norm": 0.6858168840408325, + "learning_rate": 0.00014952025056643524, + "loss": 2.9308, + "step": 40846 + }, + { + "epoch": 2.0, + "grad_norm": 0.6220602989196777, + "learning_rate": 0.00014950693151789913, + "loss": 2.903, + "step": 40847 + }, + { + "epoch": 2.0, + "grad_norm": 0.6544325947761536, + "learning_rate": 0.0001494936128657327, + "loss": 3.0168, + "step": 40848 + }, + { + "epoch": 2.0, + "grad_norm": 0.668447732925415, + "learning_rate": 0.0001494802946099711, + "loss": 2.9114, + "step": 40849 + }, + { + "epoch": 2.0, + "grad_norm": 0.6595551371574402, + "learning_rate": 0.00014946697675064955, + "loss": 2.9192, + "step": 40850 + }, + { + "epoch": 2.0, + "grad_norm": 0.6544173359870911, + "learning_rate": 0.00014945365928780306, + "loss": 3.0946, + "step": 40851 + }, + { + "epoch": 2.0, + "grad_norm": 0.7191558480262756, + "learning_rate": 0.0001494403422214666, + "loss": 3.0198, + "step": 40852 + }, + { + "epoch": 2.0, + "grad_norm": 0.6780735850334167, + "learning_rate": 0.00014942702555167521, + "loss": 2.9645, + "step": 40853 + }, + { + "epoch": 2.0, + "grad_norm": 0.6664487719535828, + "learning_rate": 0.00014941370927846407, + "loss": 2.9704, + "step": 40854 + }, + { + "epoch": 2.0, + "grad_norm": 0.706179678440094, + "learning_rate": 0.00014940039340186835, + "loss": 2.8314, + "step": 40855 + }, + { + "epoch": 2.0, + "grad_norm": 0.6947509050369263, + "learning_rate": 0.00014938707792192286, + "loss": 2.823, + "step": 40856 + }, + { + "epoch": 2.0, + "grad_norm": 0.6411890387535095, + "learning_rate": 0.00014937376283866296, + "loss": 3.0037, + "step": 40857 + }, + { + "epoch": 2.0, + "grad_norm": 0.7116650938987732, + "learning_rate": 0.00014936044815212356, + "loss": 2.8026, + "step": 40858 + }, + { + "epoch": 2.0, + "grad_norm": 0.6230390071868896, + "learning_rate": 0.00014934713386233968, + "loss": 2.948, + "step": 40859 + }, + { + "epoch": 2.0, + "grad_norm": 0.6088177561759949, + "learning_rate": 0.00014933381996934655, + "loss": 2.8931, + "step": 40860 + }, + { + "epoch": 2.0, + "grad_norm": 0.68245530128479, + "learning_rate": 0.00014932050647317903, + "loss": 3.0912, + "step": 40861 + }, + { + "epoch": 2.0, + "grad_norm": 0.6987802386283875, + "learning_rate": 0.0001493071933738724, + "loss": 2.9147, + "step": 40862 + }, + { + "epoch": 2.0, + "grad_norm": 0.6714006066322327, + "learning_rate": 0.0001492938806714615, + "loss": 2.9907, + "step": 40863 + }, + { + "epoch": 2.0, + "grad_norm": 0.6982893347740173, + "learning_rate": 0.0001492805683659815, + "loss": 2.9568, + "step": 40864 + }, + { + "epoch": 2.0, + "grad_norm": 0.6661611795425415, + "learning_rate": 0.00014926725645746775, + "loss": 2.8846, + "step": 40865 + }, + { + "epoch": 2.0, + "grad_norm": 0.7231001257896423, + "learning_rate": 0.0001492539449459548, + "loss": 2.9071, + "step": 40866 + }, + { + "epoch": 2.0, + "grad_norm": 0.6181085705757141, + "learning_rate": 0.00014924063383147805, + "loss": 2.9526, + "step": 40867 + }, + { + "epoch": 2.0, + "grad_norm": 0.6579425930976868, + "learning_rate": 0.00014922732311407235, + "loss": 2.8463, + "step": 40868 + }, + { + "epoch": 2.0, + "grad_norm": 0.6170904636383057, + "learning_rate": 0.00014921401279377285, + "loss": 2.8966, + "step": 40869 + }, + { + "epoch": 2.0, + "grad_norm": 0.6523815393447876, + "learning_rate": 0.00014920070287061473, + "loss": 2.8954, + "step": 40870 + }, + { + "epoch": 2.0, + "grad_norm": 0.7443393468856812, + "learning_rate": 0.00014918739334463286, + "loss": 2.858, + "step": 40871 + }, + { + "epoch": 2.0, + "grad_norm": 0.6416648030281067, + "learning_rate": 0.00014917408421586247, + "loss": 2.8031, + "step": 40872 + }, + { + "epoch": 2.0, + "grad_norm": 0.6666995286941528, + "learning_rate": 0.0001491607754843385, + "loss": 2.951, + "step": 40873 + }, + { + "epoch": 2.0, + "grad_norm": 0.642076849937439, + "learning_rate": 0.0001491474671500959, + "loss": 2.7547, + "step": 40874 + }, + { + "epoch": 2.0, + "grad_norm": 0.6719287037849426, + "learning_rate": 0.00014913415921316995, + "loss": 2.9337, + "step": 40875 + }, + { + "epoch": 2.0, + "grad_norm": 0.6449657678604126, + "learning_rate": 0.0001491208516735955, + "loss": 2.9809, + "step": 40876 + }, + { + "epoch": 2.0, + "grad_norm": 0.6794679164886475, + "learning_rate": 0.00014910754453140775, + "loss": 2.9901, + "step": 40877 + }, + { + "epoch": 2.0, + "grad_norm": 0.6926255822181702, + "learning_rate": 0.00014909423778664165, + "loss": 2.8638, + "step": 40878 + }, + { + "epoch": 2.0, + "grad_norm": 0.6646115779876709, + "learning_rate": 0.00014908093143933235, + "loss": 2.7086, + "step": 40879 + }, + { + "epoch": 2.0, + "grad_norm": 0.6892146468162537, + "learning_rate": 0.00014906762548951485, + "loss": 2.8719, + "step": 40880 + }, + { + "epoch": 2.0, + "grad_norm": 0.6423749923706055, + "learning_rate": 0.00014905431993722407, + "loss": 2.862, + "step": 40881 + }, + { + "epoch": 2.0, + "grad_norm": 0.7053487300872803, + "learning_rate": 0.00014904101478249525, + "loss": 3.0384, + "step": 40882 + }, + { + "epoch": 2.0, + "grad_norm": 0.6942588090896606, + "learning_rate": 0.00014902771002536325, + "loss": 2.9993, + "step": 40883 + }, + { + "epoch": 2.0, + "grad_norm": 0.773780345916748, + "learning_rate": 0.0001490144056658632, + "loss": 2.9149, + "step": 40884 + }, + { + "epoch": 2.0, + "grad_norm": 0.7698376774787903, + "learning_rate": 0.00014900110170403027, + "loss": 2.89, + "step": 40885 + }, + { + "epoch": 2.0, + "grad_norm": 0.7158568501472473, + "learning_rate": 0.00014898779813989925, + "loss": 2.7576, + "step": 40886 + }, + { + "epoch": 2.0, + "grad_norm": 0.671528697013855, + "learning_rate": 0.00014897449497350545, + "loss": 2.9737, + "step": 40887 + }, + { + "epoch": 2.0, + "grad_norm": 0.6958116888999939, + "learning_rate": 0.0001489611922048837, + "loss": 2.8742, + "step": 40888 + }, + { + "epoch": 2.0, + "grad_norm": 0.6262465119361877, + "learning_rate": 0.00014894788983406905, + "loss": 2.7978, + "step": 40889 + }, + { + "epoch": 2.0, + "grad_norm": 0.7286080718040466, + "learning_rate": 0.0001489345878610967, + "loss": 2.6667, + "step": 40890 + }, + { + "epoch": 2.0, + "grad_norm": 0.686791718006134, + "learning_rate": 0.0001489212862860014, + "loss": 2.9975, + "step": 40891 + }, + { + "epoch": 2.0, + "grad_norm": 0.6629527807235718, + "learning_rate": 0.00014890798510881854, + "loss": 2.7829, + "step": 40892 + }, + { + "epoch": 2.0, + "grad_norm": 0.6882234215736389, + "learning_rate": 0.00014889468432958283, + "loss": 2.8711, + "step": 40893 + }, + { + "epoch": 2.0, + "grad_norm": 0.6697384119033813, + "learning_rate": 0.00014888138394832955, + "loss": 3.0293, + "step": 40894 + }, + { + "epoch": 2.0, + "grad_norm": 0.6356438994407654, + "learning_rate": 0.0001488680839650936, + "loss": 2.9981, + "step": 40895 + }, + { + "epoch": 2.0, + "grad_norm": 0.6785897612571716, + "learning_rate": 0.00014885478437990998, + "loss": 2.9045, + "step": 40896 + }, + { + "epoch": 2.0, + "grad_norm": 0.6552438139915466, + "learning_rate": 0.0001488414851928138, + "loss": 3.0607, + "step": 40897 + }, + { + "epoch": 2.0, + "grad_norm": 0.6709671020507812, + "learning_rate": 0.00014882818640384, + "loss": 3.0093, + "step": 40898 + }, + { + "epoch": 2.0, + "grad_norm": 0.632817268371582, + "learning_rate": 0.00014881488801302367, + "loss": 3.0937, + "step": 40899 + }, + { + "epoch": 2.0, + "grad_norm": 0.6626100540161133, + "learning_rate": 0.0001488015900203999, + "loss": 2.7311, + "step": 40900 + }, + { + "epoch": 2.0, + "grad_norm": 0.7310886979103088, + "learning_rate": 0.0001487882924260037, + "loss": 2.8512, + "step": 40901 + }, + { + "epoch": 2.0, + "grad_norm": 0.6690772771835327, + "learning_rate": 0.00014877499522986996, + "loss": 2.8298, + "step": 40902 + }, + { + "epoch": 2.0, + "grad_norm": 0.6155535578727722, + "learning_rate": 0.0001487616984320337, + "loss": 2.8708, + "step": 40903 + }, + { + "epoch": 2.0, + "grad_norm": 0.6756638884544373, + "learning_rate": 0.00014874840203253016, + "loss": 3.0289, + "step": 40904 + }, + { + "epoch": 2.0, + "grad_norm": 0.6889369487762451, + "learning_rate": 0.00014873510603139408, + "loss": 2.9191, + "step": 40905 + }, + { + "epoch": 2.0, + "grad_norm": 0.6636484265327454, + "learning_rate": 0.0001487218104286606, + "loss": 2.8135, + "step": 40906 + }, + { + "epoch": 2.0, + "grad_norm": 0.7536406517028809, + "learning_rate": 0.0001487085152243649, + "loss": 3.1152, + "step": 40907 + }, + { + "epoch": 2.0, + "grad_norm": 0.6910778880119324, + "learning_rate": 0.00014869522041854175, + "loss": 2.8008, + "step": 40908 + }, + { + "epoch": 2.0, + "grad_norm": 0.7060720324516296, + "learning_rate": 0.00014868192601122637, + "loss": 3.0552, + "step": 40909 + }, + { + "epoch": 2.0, + "grad_norm": 0.6742306351661682, + "learning_rate": 0.0001486686320024537, + "loss": 2.7962, + "step": 40910 + }, + { + "epoch": 2.0, + "grad_norm": 0.6626346111297607, + "learning_rate": 0.00014865533839225858, + "loss": 2.9618, + "step": 40911 + }, + { + "epoch": 2.0, + "grad_norm": 0.6891857385635376, + "learning_rate": 0.00014864204518067628, + "loss": 2.9439, + "step": 40912 + }, + { + "epoch": 2.01, + "grad_norm": 0.650107741355896, + "learning_rate": 0.00014862875236774164, + "loss": 2.9421, + "step": 40913 + }, + { + "epoch": 2.01, + "grad_norm": 0.7103627324104309, + "learning_rate": 0.0001486154599534898, + "loss": 3.0348, + "step": 40914 + }, + { + "epoch": 2.01, + "grad_norm": 0.6941289901733398, + "learning_rate": 0.0001486021679379556, + "loss": 3.0479, + "step": 40915 + }, + { + "epoch": 2.01, + "grad_norm": 0.6747994422912598, + "learning_rate": 0.00014858887632117424, + "loss": 3.063, + "step": 40916 + }, + { + "epoch": 2.01, + "grad_norm": 0.6820676922798157, + "learning_rate": 0.00014857558510318072, + "loss": 3.0172, + "step": 40917 + }, + { + "epoch": 2.01, + "grad_norm": 0.6499535441398621, + "learning_rate": 0.0001485622942840098, + "loss": 2.9887, + "step": 40918 + }, + { + "epoch": 2.01, + "grad_norm": 0.6234838962554932, + "learning_rate": 0.00014854900386369676, + "loss": 2.8712, + "step": 40919 + }, + { + "epoch": 2.01, + "grad_norm": 0.7115194201469421, + "learning_rate": 0.00014853571384227637, + "loss": 2.9185, + "step": 40920 + }, + { + "epoch": 2.01, + "grad_norm": 0.6853716373443604, + "learning_rate": 0.00014852242421978378, + "loss": 2.8748, + "step": 40921 + }, + { + "epoch": 2.01, + "grad_norm": 0.7544880509376526, + "learning_rate": 0.00014850913499625412, + "loss": 2.7981, + "step": 40922 + }, + { + "epoch": 2.01, + "grad_norm": 0.6600387692451477, + "learning_rate": 0.00014849584617172222, + "loss": 2.9013, + "step": 40923 + }, + { + "epoch": 2.01, + "grad_norm": 0.6780473589897156, + "learning_rate": 0.0001484825577462231, + "loss": 2.8438, + "step": 40924 + }, + { + "epoch": 2.01, + "grad_norm": 0.667023241519928, + "learning_rate": 0.00014846926971979162, + "loss": 2.8672, + "step": 40925 + }, + { + "epoch": 2.01, + "grad_norm": 0.6934083104133606, + "learning_rate": 0.00014845598209246294, + "loss": 2.7644, + "step": 40926 + }, + { + "epoch": 2.01, + "grad_norm": 0.7000552415847778, + "learning_rate": 0.00014844269486427217, + "loss": 2.9773, + "step": 40927 + }, + { + "epoch": 2.01, + "grad_norm": 0.6767484545707703, + "learning_rate": 0.00014842940803525405, + "loss": 2.5985, + "step": 40928 + }, + { + "epoch": 2.01, + "grad_norm": 0.6668387055397034, + "learning_rate": 0.00014841612160544382, + "loss": 2.9201, + "step": 40929 + }, + { + "epoch": 2.01, + "grad_norm": 0.6447761654853821, + "learning_rate": 0.0001484028355748762, + "loss": 3.0512, + "step": 40930 + }, + { + "epoch": 2.01, + "grad_norm": 0.6750105619430542, + "learning_rate": 0.00014838954994358648, + "loss": 3.1484, + "step": 40931 + }, + { + "epoch": 2.01, + "grad_norm": 0.6737284064292908, + "learning_rate": 0.00014837626471160952, + "loss": 2.8619, + "step": 40932 + }, + { + "epoch": 2.01, + "grad_norm": 0.6524981260299683, + "learning_rate": 0.00014836297987898018, + "loss": 3.1476, + "step": 40933 + }, + { + "epoch": 2.01, + "grad_norm": 0.6673911809921265, + "learning_rate": 0.00014834969544573367, + "loss": 2.9744, + "step": 40934 + }, + { + "epoch": 2.01, + "grad_norm": 0.6576171517372131, + "learning_rate": 0.00014833641141190476, + "loss": 2.8008, + "step": 40935 + }, + { + "epoch": 2.01, + "grad_norm": 0.6706809997558594, + "learning_rate": 0.00014832312777752856, + "loss": 2.8009, + "step": 40936 + }, + { + "epoch": 2.01, + "grad_norm": 0.6633245348930359, + "learning_rate": 0.00014830984454264015, + "loss": 2.7447, + "step": 40937 + }, + { + "epoch": 2.01, + "grad_norm": 0.6687329411506653, + "learning_rate": 0.00014829656170727443, + "loss": 2.8792, + "step": 40938 + }, + { + "epoch": 2.01, + "grad_norm": 0.7036616206169128, + "learning_rate": 0.0001482832792714664, + "loss": 2.9527, + "step": 40939 + }, + { + "epoch": 2.01, + "grad_norm": 0.6736128330230713, + "learning_rate": 0.00014826999723525084, + "loss": 2.9944, + "step": 40940 + }, + { + "epoch": 2.01, + "grad_norm": 0.6891912817955017, + "learning_rate": 0.00014825671559866294, + "loss": 2.9328, + "step": 40941 + }, + { + "epoch": 2.01, + "grad_norm": 0.6677616834640503, + "learning_rate": 0.00014824343436173774, + "loss": 2.9777, + "step": 40942 + }, + { + "epoch": 2.01, + "grad_norm": 0.6840057969093323, + "learning_rate": 0.00014823015352451006, + "loss": 2.9738, + "step": 40943 + }, + { + "epoch": 2.01, + "grad_norm": 0.7241817712783813, + "learning_rate": 0.00014821687308701502, + "loss": 2.8735, + "step": 40944 + }, + { + "epoch": 2.01, + "grad_norm": 0.7008278369903564, + "learning_rate": 0.0001482035930492875, + "loss": 2.8839, + "step": 40945 + }, + { + "epoch": 2.01, + "grad_norm": 0.639018714427948, + "learning_rate": 0.00014819031341136246, + "loss": 2.9546, + "step": 40946 + }, + { + "epoch": 2.01, + "grad_norm": 0.6628701686859131, + "learning_rate": 0.000148177034173275, + "loss": 2.8829, + "step": 40947 + }, + { + "epoch": 2.01, + "grad_norm": 0.7044912576675415, + "learning_rate": 0.00014816375533505992, + "loss": 2.7321, + "step": 40948 + }, + { + "epoch": 2.01, + "grad_norm": 0.658323347568512, + "learning_rate": 0.00014815047689675234, + "loss": 2.9143, + "step": 40949 + }, + { + "epoch": 2.01, + "grad_norm": 0.6978631615638733, + "learning_rate": 0.00014813719885838712, + "loss": 2.7587, + "step": 40950 + }, + { + "epoch": 2.01, + "grad_norm": 0.6369495987892151, + "learning_rate": 0.00014812392121999927, + "loss": 2.9732, + "step": 40951 + }, + { + "epoch": 2.01, + "grad_norm": 0.6941462159156799, + "learning_rate": 0.00014811064398162388, + "loss": 2.7965, + "step": 40952 + }, + { + "epoch": 2.01, + "grad_norm": 0.6733207106590271, + "learning_rate": 0.00014809736714329588, + "loss": 2.9032, + "step": 40953 + }, + { + "epoch": 2.01, + "grad_norm": 0.6448726058006287, + "learning_rate": 0.00014808409070505012, + "loss": 2.8493, + "step": 40954 + }, + { + "epoch": 2.01, + "grad_norm": 0.6968315839767456, + "learning_rate": 0.00014807081466692155, + "loss": 2.8681, + "step": 40955 + }, + { + "epoch": 2.01, + "grad_norm": 0.6684523820877075, + "learning_rate": 0.0001480575390289452, + "loss": 2.9887, + "step": 40956 + }, + { + "epoch": 2.01, + "grad_norm": 0.6307798027992249, + "learning_rate": 0.0001480442637911562, + "loss": 2.8416, + "step": 40957 + }, + { + "epoch": 2.01, + "grad_norm": 0.660571277141571, + "learning_rate": 0.0001480309889535892, + "loss": 3.104, + "step": 40958 + }, + { + "epoch": 2.01, + "grad_norm": 0.6901208758354187, + "learning_rate": 0.00014801771451627944, + "loss": 2.873, + "step": 40959 + }, + { + "epoch": 2.01, + "grad_norm": 0.6825695037841797, + "learning_rate": 0.0001480044404792618, + "loss": 2.9064, + "step": 40960 + }, + { + "epoch": 2.01, + "grad_norm": 0.6804020404815674, + "learning_rate": 0.00014799116684257108, + "loss": 2.9686, + "step": 40961 + }, + { + "epoch": 2.01, + "grad_norm": 0.6763026714324951, + "learning_rate": 0.00014797789360624248, + "loss": 2.9093, + "step": 40962 + }, + { + "epoch": 2.01, + "grad_norm": 0.6673914194107056, + "learning_rate": 0.00014796462077031075, + "loss": 3.0376, + "step": 40963 + }, + { + "epoch": 2.01, + "grad_norm": 0.6648275852203369, + "learning_rate": 0.0001479513483348111, + "loss": 2.8259, + "step": 40964 + }, + { + "epoch": 2.01, + "grad_norm": 0.6990496516227722, + "learning_rate": 0.00014793807629977817, + "loss": 2.86, + "step": 40965 + }, + { + "epoch": 2.01, + "grad_norm": 0.6605084538459778, + "learning_rate": 0.0001479248046652471, + "loss": 3.1202, + "step": 40966 + }, + { + "epoch": 2.01, + "grad_norm": 0.7304693460464478, + "learning_rate": 0.00014791153343125305, + "loss": 2.9801, + "step": 40967 + }, + { + "epoch": 2.01, + "grad_norm": 0.7068866491317749, + "learning_rate": 0.0001478982625978305, + "loss": 2.8674, + "step": 40968 + }, + { + "epoch": 2.01, + "grad_norm": 0.6797716021537781, + "learning_rate": 0.00014788499216501481, + "loss": 2.8571, + "step": 40969 + }, + { + "epoch": 2.01, + "grad_norm": 0.6691450476646423, + "learning_rate": 0.00014787172213284066, + "loss": 2.9016, + "step": 40970 + }, + { + "epoch": 2.01, + "grad_norm": 0.6694939732551575, + "learning_rate": 0.0001478584525013431, + "loss": 2.7443, + "step": 40971 + }, + { + "epoch": 2.01, + "grad_norm": 0.6383887529373169, + "learning_rate": 0.00014784518327055722, + "loss": 2.8351, + "step": 40972 + }, + { + "epoch": 2.01, + "grad_norm": 0.6808183193206787, + "learning_rate": 0.0001478319144405177, + "loss": 2.9814, + "step": 40973 + }, + { + "epoch": 2.01, + "grad_norm": 0.6715272665023804, + "learning_rate": 0.00014781864601125976, + "loss": 3.099, + "step": 40974 + }, + { + "epoch": 2.01, + "grad_norm": 0.7185418009757996, + "learning_rate": 0.00014780537798281823, + "loss": 2.9109, + "step": 40975 + }, + { + "epoch": 2.01, + "grad_norm": 0.6826639175415039, + "learning_rate": 0.00014779211035522794, + "loss": 2.8128, + "step": 40976 + }, + { + "epoch": 2.01, + "grad_norm": 0.6765093207359314, + "learning_rate": 0.00014777884312852405, + "loss": 2.9096, + "step": 40977 + }, + { + "epoch": 2.01, + "grad_norm": 0.691008448600769, + "learning_rate": 0.00014776557630274127, + "loss": 2.8658, + "step": 40978 + }, + { + "epoch": 2.01, + "grad_norm": 0.6892204880714417, + "learning_rate": 0.00014775230987791477, + "loss": 2.9025, + "step": 40979 + }, + { + "epoch": 2.01, + "grad_norm": 0.7367006540298462, + "learning_rate": 0.00014773904385407925, + "loss": 2.7799, + "step": 40980 + }, + { + "epoch": 2.01, + "grad_norm": 0.7002752423286438, + "learning_rate": 0.00014772577823126993, + "loss": 2.981, + "step": 40981 + }, + { + "epoch": 2.01, + "grad_norm": 0.6693331003189087, + "learning_rate": 0.00014771251300952162, + "loss": 2.9569, + "step": 40982 + }, + { + "epoch": 2.01, + "grad_norm": 0.6542279124259949, + "learning_rate": 0.00014769924818886908, + "loss": 2.99, + "step": 40983 + }, + { + "epoch": 2.01, + "grad_norm": 0.7216241955757141, + "learning_rate": 0.00014768598376934758, + "loss": 2.9451, + "step": 40984 + }, + { + "epoch": 2.01, + "grad_norm": 0.6984073519706726, + "learning_rate": 0.00014767271975099172, + "loss": 3.0107, + "step": 40985 + }, + { + "epoch": 2.01, + "grad_norm": 0.6856890320777893, + "learning_rate": 0.00014765945613383675, + "loss": 2.8309, + "step": 40986 + }, + { + "epoch": 2.01, + "grad_norm": 0.6620368361473083, + "learning_rate": 0.00014764619291791732, + "loss": 2.8103, + "step": 40987 + }, + { + "epoch": 2.01, + "grad_norm": 0.6587978005409241, + "learning_rate": 0.00014763293010326855, + "loss": 2.8216, + "step": 40988 + }, + { + "epoch": 2.01, + "grad_norm": 0.6450566649436951, + "learning_rate": 0.00014761966768992536, + "loss": 3.0106, + "step": 40989 + }, + { + "epoch": 2.01, + "grad_norm": 0.6524011492729187, + "learning_rate": 0.00014760640567792265, + "loss": 2.8636, + "step": 40990 + }, + { + "epoch": 2.01, + "grad_norm": 0.6809319257736206, + "learning_rate": 0.00014759314406729537, + "loss": 2.8167, + "step": 40991 + }, + { + "epoch": 2.01, + "grad_norm": 0.6798052787780762, + "learning_rate": 0.0001475798828580783, + "loss": 2.8203, + "step": 40992 + }, + { + "epoch": 2.01, + "grad_norm": 0.6698510050773621, + "learning_rate": 0.00014756662205030646, + "loss": 2.9901, + "step": 40993 + }, + { + "epoch": 2.01, + "grad_norm": 0.6708970665931702, + "learning_rate": 0.00014755336164401492, + "loss": 2.9513, + "step": 40994 + }, + { + "epoch": 2.01, + "grad_norm": 0.7181163430213928, + "learning_rate": 0.00014754010163923838, + "loss": 2.9335, + "step": 40995 + }, + { + "epoch": 2.01, + "grad_norm": 0.6596902012825012, + "learning_rate": 0.00014752684203601197, + "loss": 3.0563, + "step": 40996 + }, + { + "epoch": 2.01, + "grad_norm": 0.681885838508606, + "learning_rate": 0.00014751358283437055, + "loss": 2.9588, + "step": 40997 + }, + { + "epoch": 2.01, + "grad_norm": 0.6878100037574768, + "learning_rate": 0.00014750032403434884, + "loss": 2.9884, + "step": 40998 + }, + { + "epoch": 2.01, + "grad_norm": 0.696635365486145, + "learning_rate": 0.0001474870656359821, + "loss": 2.9776, + "step": 40999 + }, + { + "epoch": 2.01, + "grad_norm": 0.6515666842460632, + "learning_rate": 0.00014747380763930492, + "loss": 3.0161, + "step": 41000 + }, + { + "epoch": 2.01, + "grad_norm": 0.673594057559967, + "learning_rate": 0.0001474605500443525, + "loss": 3.0754, + "step": 41001 + }, + { + "epoch": 2.01, + "grad_norm": 0.6816129088401794, + "learning_rate": 0.00014744729285115954, + "loss": 2.928, + "step": 41002 + }, + { + "epoch": 2.01, + "grad_norm": 0.6874986290931702, + "learning_rate": 0.00014743403605976113, + "loss": 2.9407, + "step": 41003 + }, + { + "epoch": 2.01, + "grad_norm": 0.6564611792564392, + "learning_rate": 0.00014742077967019215, + "loss": 2.8386, + "step": 41004 + }, + { + "epoch": 2.01, + "grad_norm": 0.6538634300231934, + "learning_rate": 0.00014740752368248733, + "loss": 2.9536, + "step": 41005 + }, + { + "epoch": 2.01, + "grad_norm": 0.6837107539176941, + "learning_rate": 0.00014739426809668186, + "loss": 2.7744, + "step": 41006 + }, + { + "epoch": 2.01, + "grad_norm": 0.7023835778236389, + "learning_rate": 0.00014738101291281036, + "loss": 2.8523, + "step": 41007 + }, + { + "epoch": 2.01, + "grad_norm": 0.6810896396636963, + "learning_rate": 0.00014736775813090793, + "loss": 3.1852, + "step": 41008 + }, + { + "epoch": 2.01, + "grad_norm": 0.6751614809036255, + "learning_rate": 0.0001473545037510096, + "loss": 2.9334, + "step": 41009 + }, + { + "epoch": 2.01, + "grad_norm": 0.6512244939804077, + "learning_rate": 0.00014734124977315, + "loss": 2.8679, + "step": 41010 + }, + { + "epoch": 2.01, + "grad_norm": 0.6602606773376465, + "learning_rate": 0.00014732799619736423, + "loss": 2.7867, + "step": 41011 + }, + { + "epoch": 2.01, + "grad_norm": 0.6242491006851196, + "learning_rate": 0.00014731474302368715, + "loss": 3.0989, + "step": 41012 + }, + { + "epoch": 2.01, + "grad_norm": 0.6728277206420898, + "learning_rate": 0.00014730149025215356, + "loss": 2.9024, + "step": 41013 + }, + { + "epoch": 2.01, + "grad_norm": 0.6889778971672058, + "learning_rate": 0.00014728823788279859, + "loss": 2.952, + "step": 41014 + }, + { + "epoch": 2.01, + "grad_norm": 0.709487795829773, + "learning_rate": 0.00014727498591565687, + "loss": 3.0053, + "step": 41015 + }, + { + "epoch": 2.01, + "grad_norm": 0.7062227129936218, + "learning_rate": 0.00014726173435076356, + "loss": 2.8784, + "step": 41016 + }, + { + "epoch": 2.01, + "grad_norm": 0.6593536138534546, + "learning_rate": 0.00014724848318815333, + "loss": 3.0108, + "step": 41017 + }, + { + "epoch": 2.01, + "grad_norm": 0.6188623309135437, + "learning_rate": 0.0001472352324278613, + "loss": 2.8337, + "step": 41018 + }, + { + "epoch": 2.01, + "grad_norm": 0.6538429260253906, + "learning_rate": 0.00014722198206992232, + "loss": 2.7436, + "step": 41019 + }, + { + "epoch": 2.01, + "grad_norm": 0.7170898914337158, + "learning_rate": 0.00014720873211437107, + "loss": 2.916, + "step": 41020 + }, + { + "epoch": 2.01, + "grad_norm": 0.6724970936775208, + "learning_rate": 0.00014719548256124276, + "loss": 2.9541, + "step": 41021 + }, + { + "epoch": 2.01, + "grad_norm": 0.6659535765647888, + "learning_rate": 0.00014718223341057203, + "loss": 3.0944, + "step": 41022 + }, + { + "epoch": 2.01, + "grad_norm": 0.6524555683135986, + "learning_rate": 0.00014716898466239384, + "loss": 3.1875, + "step": 41023 + }, + { + "epoch": 2.01, + "grad_norm": 0.6858758926391602, + "learning_rate": 0.00014715573631674328, + "loss": 2.8407, + "step": 41024 + }, + { + "epoch": 2.01, + "grad_norm": 0.64168781042099, + "learning_rate": 0.0001471424883736551, + "loss": 2.8617, + "step": 41025 + }, + { + "epoch": 2.01, + "grad_norm": 0.6399775743484497, + "learning_rate": 0.00014712924083316417, + "loss": 3.0222, + "step": 41026 + }, + { + "epoch": 2.01, + "grad_norm": 0.6765655875205994, + "learning_rate": 0.0001471159936953053, + "loss": 2.8945, + "step": 41027 + }, + { + "epoch": 2.01, + "grad_norm": 0.6615230441093445, + "learning_rate": 0.00014710274696011347, + "loss": 2.7149, + "step": 41028 + }, + { + "epoch": 2.01, + "grad_norm": 0.726775586605072, + "learning_rate": 0.00014708950062762368, + "loss": 2.961, + "step": 41029 + }, + { + "epoch": 2.01, + "grad_norm": 0.7074599862098694, + "learning_rate": 0.00014707625469787065, + "loss": 3.0504, + "step": 41030 + }, + { + "epoch": 2.01, + "grad_norm": 0.6598540544509888, + "learning_rate": 0.0001470630091708894, + "loss": 2.9812, + "step": 41031 + }, + { + "epoch": 2.01, + "grad_norm": 0.6536500453948975, + "learning_rate": 0.00014704976404671466, + "loss": 3.0781, + "step": 41032 + }, + { + "epoch": 2.01, + "grad_norm": 0.7111579775810242, + "learning_rate": 0.00014703651932538152, + "loss": 3.1258, + "step": 41033 + }, + { + "epoch": 2.01, + "grad_norm": 0.6571623682975769, + "learning_rate": 0.00014702327500692474, + "loss": 3.0178, + "step": 41034 + }, + { + "epoch": 2.01, + "grad_norm": 0.6660772562026978, + "learning_rate": 0.00014701003109137908, + "loss": 2.8174, + "step": 41035 + }, + { + "epoch": 2.01, + "grad_norm": 0.6512477993965149, + "learning_rate": 0.0001469967875787797, + "loss": 3.073, + "step": 41036 + }, + { + "epoch": 2.01, + "grad_norm": 0.6898970603942871, + "learning_rate": 0.00014698354446916124, + "loss": 2.8279, + "step": 41037 + }, + { + "epoch": 2.01, + "grad_norm": 0.6960706114768982, + "learning_rate": 0.00014697030176255864, + "loss": 3.0873, + "step": 41038 + }, + { + "epoch": 2.01, + "grad_norm": 0.6520676612854004, + "learning_rate": 0.00014695705945900691, + "loss": 3.0638, + "step": 41039 + }, + { + "epoch": 2.01, + "grad_norm": 0.716157853603363, + "learning_rate": 0.00014694381755854088, + "loss": 2.9452, + "step": 41040 + }, + { + "epoch": 2.01, + "grad_norm": 0.7131884098052979, + "learning_rate": 0.00014693057606119533, + "loss": 3.0743, + "step": 41041 + }, + { + "epoch": 2.01, + "grad_norm": 0.6574112772941589, + "learning_rate": 0.0001469173349670051, + "loss": 2.943, + "step": 41042 + }, + { + "epoch": 2.01, + "grad_norm": 0.6817065477371216, + "learning_rate": 0.00014690409427600516, + "loss": 2.8438, + "step": 41043 + }, + { + "epoch": 2.01, + "grad_norm": 0.6450731754302979, + "learning_rate": 0.00014689085398823047, + "loss": 3.0053, + "step": 41044 + }, + { + "epoch": 2.01, + "grad_norm": 0.6768467426300049, + "learning_rate": 0.0001468776141037157, + "loss": 3.0556, + "step": 41045 + }, + { + "epoch": 2.01, + "grad_norm": 0.6716314554214478, + "learning_rate": 0.00014686437462249592, + "loss": 2.6741, + "step": 41046 + }, + { + "epoch": 2.01, + "grad_norm": 0.6731881499290466, + "learning_rate": 0.0001468511355446059, + "loss": 2.8936, + "step": 41047 + }, + { + "epoch": 2.01, + "grad_norm": 0.705536961555481, + "learning_rate": 0.00014683789687008043, + "loss": 2.8119, + "step": 41048 + }, + { + "epoch": 2.01, + "grad_norm": 0.6849344372749329, + "learning_rate": 0.00014682465859895458, + "loss": 3.0863, + "step": 41049 + }, + { + "epoch": 2.01, + "grad_norm": 0.7398621439933777, + "learning_rate": 0.00014681142073126295, + "loss": 2.9127, + "step": 41050 + }, + { + "epoch": 2.01, + "grad_norm": 0.6680430769920349, + "learning_rate": 0.00014679818326704072, + "loss": 2.9321, + "step": 41051 + }, + { + "epoch": 2.01, + "grad_norm": 0.6883407235145569, + "learning_rate": 0.00014678494620632242, + "loss": 2.9762, + "step": 41052 + }, + { + "epoch": 2.01, + "grad_norm": 0.6741495132446289, + "learning_rate": 0.00014677170954914312, + "loss": 2.9253, + "step": 41053 + }, + { + "epoch": 2.01, + "grad_norm": 0.6511622071266174, + "learning_rate": 0.00014675847329553772, + "loss": 3.0779, + "step": 41054 + }, + { + "epoch": 2.01, + "grad_norm": 0.6725462079048157, + "learning_rate": 0.00014674523744554109, + "loss": 2.8217, + "step": 41055 + }, + { + "epoch": 2.01, + "grad_norm": 0.67629075050354, + "learning_rate": 0.00014673200199918796, + "loss": 2.9472, + "step": 41056 + }, + { + "epoch": 2.01, + "grad_norm": 0.6784157752990723, + "learning_rate": 0.0001467187669565131, + "loss": 2.8195, + "step": 41057 + }, + { + "epoch": 2.01, + "grad_norm": 0.6392921209335327, + "learning_rate": 0.00014670553231755165, + "loss": 3.0216, + "step": 41058 + }, + { + "epoch": 2.01, + "grad_norm": 0.6643005013465881, + "learning_rate": 0.00014669229808233822, + "loss": 3.0051, + "step": 41059 + }, + { + "epoch": 2.01, + "grad_norm": 0.617713987827301, + "learning_rate": 0.00014667906425090775, + "loss": 3.0391, + "step": 41060 + }, + { + "epoch": 2.01, + "grad_norm": 0.6412746906280518, + "learning_rate": 0.00014666583082329526, + "loss": 2.7262, + "step": 41061 + }, + { + "epoch": 2.01, + "grad_norm": 0.6644930243492126, + "learning_rate": 0.00014665259779953545, + "loss": 2.8943, + "step": 41062 + }, + { + "epoch": 2.01, + "grad_norm": 0.67530357837677, + "learning_rate": 0.00014663936517966315, + "loss": 2.7935, + "step": 41063 + }, + { + "epoch": 2.01, + "grad_norm": 0.7180463075637817, + "learning_rate": 0.00014662613296371316, + "loss": 2.7565, + "step": 41064 + }, + { + "epoch": 2.01, + "grad_norm": 0.6856942176818848, + "learning_rate": 0.00014661290115172045, + "loss": 2.9684, + "step": 41065 + }, + { + "epoch": 2.01, + "grad_norm": 0.672465443611145, + "learning_rate": 0.00014659966974371993, + "loss": 2.8019, + "step": 41066 + }, + { + "epoch": 2.01, + "grad_norm": 0.6677531003952026, + "learning_rate": 0.00014658643873974623, + "loss": 2.8741, + "step": 41067 + }, + { + "epoch": 2.01, + "grad_norm": 0.6654563546180725, + "learning_rate": 0.00014657320813983446, + "loss": 2.8794, + "step": 41068 + }, + { + "epoch": 2.01, + "grad_norm": 0.6987980604171753, + "learning_rate": 0.00014655997794401932, + "loss": 2.9344, + "step": 41069 + }, + { + "epoch": 2.01, + "grad_norm": 0.6206268668174744, + "learning_rate": 0.00014654674815233555, + "loss": 2.898, + "step": 41070 + }, + { + "epoch": 2.01, + "grad_norm": 0.6681779026985168, + "learning_rate": 0.00014653351876481823, + "loss": 2.8274, + "step": 41071 + }, + { + "epoch": 2.01, + "grad_norm": 0.6701809167861938, + "learning_rate": 0.00014652028978150196, + "loss": 2.7613, + "step": 41072 + }, + { + "epoch": 2.01, + "grad_norm": 0.6777552962303162, + "learning_rate": 0.00014650706120242186, + "loss": 2.967, + "step": 41073 + }, + { + "epoch": 2.01, + "grad_norm": 0.7489084601402283, + "learning_rate": 0.00014649383302761247, + "loss": 3.0146, + "step": 41074 + }, + { + "epoch": 2.01, + "grad_norm": 0.6484798192977905, + "learning_rate": 0.0001464806052571088, + "loss": 3.095, + "step": 41075 + }, + { + "epoch": 2.01, + "grad_norm": 0.716480553150177, + "learning_rate": 0.00014646737789094578, + "loss": 2.8885, + "step": 41076 + }, + { + "epoch": 2.01, + "grad_norm": 0.6795092821121216, + "learning_rate": 0.00014645415092915813, + "loss": 2.5886, + "step": 41077 + }, + { + "epoch": 2.01, + "grad_norm": 0.7201026678085327, + "learning_rate": 0.00014644092437178074, + "loss": 2.8806, + "step": 41078 + }, + { + "epoch": 2.01, + "grad_norm": 0.6745376586914062, + "learning_rate": 0.00014642769821884824, + "loss": 3.0144, + "step": 41079 + }, + { + "epoch": 2.01, + "grad_norm": 0.7620112895965576, + "learning_rate": 0.0001464144724703956, + "loss": 2.797, + "step": 41080 + }, + { + "epoch": 2.01, + "grad_norm": 0.6425158381462097, + "learning_rate": 0.00014640124712645786, + "loss": 2.6399, + "step": 41081 + }, + { + "epoch": 2.01, + "grad_norm": 0.6472249031066895, + "learning_rate": 0.00014638802218706955, + "loss": 3.0311, + "step": 41082 + }, + { + "epoch": 2.01, + "grad_norm": 0.6940520405769348, + "learning_rate": 0.00014637479765226572, + "loss": 2.9101, + "step": 41083 + }, + { + "epoch": 2.01, + "grad_norm": 0.6446316242218018, + "learning_rate": 0.00014636157352208113, + "loss": 2.8199, + "step": 41084 + }, + { + "epoch": 2.01, + "grad_norm": 0.662503719329834, + "learning_rate": 0.00014634834979655042, + "loss": 3.034, + "step": 41085 + }, + { + "epoch": 2.01, + "grad_norm": 0.6603543758392334, + "learning_rate": 0.00014633512647570877, + "loss": 2.9694, + "step": 41086 + }, + { + "epoch": 2.01, + "grad_norm": 0.6480185389518738, + "learning_rate": 0.00014632190355959066, + "loss": 3.0205, + "step": 41087 + }, + { + "epoch": 2.01, + "grad_norm": 0.681262195110321, + "learning_rate": 0.00014630868104823118, + "loss": 3.0011, + "step": 41088 + }, + { + "epoch": 2.01, + "grad_norm": 0.6547122001647949, + "learning_rate": 0.000146295458941665, + "loss": 2.9701, + "step": 41089 + }, + { + "epoch": 2.01, + "grad_norm": 0.6928819417953491, + "learning_rate": 0.00014628223723992694, + "loss": 2.9301, + "step": 41090 + }, + { + "epoch": 2.01, + "grad_norm": 0.6732745170593262, + "learning_rate": 0.00014626901594305203, + "loss": 2.7723, + "step": 41091 + }, + { + "epoch": 2.01, + "grad_norm": 0.6303524971008301, + "learning_rate": 0.00014625579505107497, + "loss": 2.7936, + "step": 41092 + }, + { + "epoch": 2.01, + "grad_norm": 0.675657331943512, + "learning_rate": 0.00014624257456403054, + "loss": 2.9224, + "step": 41093 + }, + { + "epoch": 2.01, + "grad_norm": 0.7712878584861755, + "learning_rate": 0.00014622935448195345, + "loss": 2.9784, + "step": 41094 + }, + { + "epoch": 2.01, + "grad_norm": 0.6558848023414612, + "learning_rate": 0.00014621613480487864, + "loss": 2.9069, + "step": 41095 + }, + { + "epoch": 2.01, + "grad_norm": 0.7321421504020691, + "learning_rate": 0.00014620291553284106, + "loss": 2.8179, + "step": 41096 + }, + { + "epoch": 2.01, + "grad_norm": 0.6873998641967773, + "learning_rate": 0.0001461896966658753, + "loss": 2.9708, + "step": 41097 + }, + { + "epoch": 2.01, + "grad_norm": 0.7613605856895447, + "learning_rate": 0.00014617647820401637, + "loss": 3.1689, + "step": 41098 + }, + { + "epoch": 2.01, + "grad_norm": 0.6851860284805298, + "learning_rate": 0.000146163260147299, + "loss": 3.0292, + "step": 41099 + }, + { + "epoch": 2.01, + "grad_norm": 0.6968518495559692, + "learning_rate": 0.0001461500424957579, + "loss": 2.7074, + "step": 41100 + }, + { + "epoch": 2.01, + "grad_norm": 0.6626551151275635, + "learning_rate": 0.00014613682524942804, + "loss": 2.8939, + "step": 41101 + }, + { + "epoch": 2.01, + "grad_norm": 0.6604430079460144, + "learning_rate": 0.00014612360840834411, + "loss": 2.9987, + "step": 41102 + }, + { + "epoch": 2.01, + "grad_norm": 0.711830735206604, + "learning_rate": 0.00014611039197254106, + "loss": 3.0629, + "step": 41103 + }, + { + "epoch": 2.01, + "grad_norm": 0.6499634385108948, + "learning_rate": 0.00014609717594205355, + "loss": 2.7851, + "step": 41104 + }, + { + "epoch": 2.01, + "grad_norm": 0.6638641953468323, + "learning_rate": 0.00014608396031691657, + "loss": 2.8837, + "step": 41105 + }, + { + "epoch": 2.01, + "grad_norm": 0.6749254465103149, + "learning_rate": 0.00014607074509716478, + "loss": 3.0239, + "step": 41106 + }, + { + "epoch": 2.01, + "grad_norm": 0.6623033285140991, + "learning_rate": 0.00014605753028283292, + "loss": 2.9147, + "step": 41107 + }, + { + "epoch": 2.01, + "grad_norm": 0.7307236790657043, + "learning_rate": 0.00014604431587395602, + "loss": 2.833, + "step": 41108 + }, + { + "epoch": 2.01, + "grad_norm": 0.6709109544754028, + "learning_rate": 0.00014603110187056865, + "loss": 2.9349, + "step": 41109 + }, + { + "epoch": 2.01, + "grad_norm": 0.6868018507957458, + "learning_rate": 0.00014601788827270574, + "loss": 3.0805, + "step": 41110 + }, + { + "epoch": 2.01, + "grad_norm": 0.6887187361717224, + "learning_rate": 0.00014600467508040218, + "loss": 2.9418, + "step": 41111 + }, + { + "epoch": 2.01, + "grad_norm": 0.6684630513191223, + "learning_rate": 0.00014599146229369252, + "loss": 2.9893, + "step": 41112 + }, + { + "epoch": 2.01, + "grad_norm": 0.6830692291259766, + "learning_rate": 0.00014597824991261185, + "loss": 2.9589, + "step": 41113 + }, + { + "epoch": 2.01, + "grad_norm": 0.6974941492080688, + "learning_rate": 0.00014596503793719482, + "loss": 2.9326, + "step": 41114 + }, + { + "epoch": 2.01, + "grad_norm": 0.638469934463501, + "learning_rate": 0.00014595182636747614, + "loss": 2.9861, + "step": 41115 + }, + { + "epoch": 2.01, + "grad_norm": 0.6490311026573181, + "learning_rate": 0.0001459386152034908, + "loss": 3.0936, + "step": 41116 + }, + { + "epoch": 2.02, + "grad_norm": 0.6784543991088867, + "learning_rate": 0.0001459254044452734, + "loss": 3.0954, + "step": 41117 + }, + { + "epoch": 2.02, + "grad_norm": 0.6834042072296143, + "learning_rate": 0.00014591219409285894, + "loss": 2.914, + "step": 41118 + }, + { + "epoch": 2.02, + "grad_norm": 0.6517180800437927, + "learning_rate": 0.00014589898414628198, + "loss": 3.13, + "step": 41119 + }, + { + "epoch": 2.02, + "grad_norm": 0.6616392731666565, + "learning_rate": 0.00014588577460557755, + "loss": 3.1142, + "step": 41120 + }, + { + "epoch": 2.02, + "grad_norm": 0.6413401365280151, + "learning_rate": 0.00014587256547078035, + "loss": 3.092, + "step": 41121 + }, + { + "epoch": 2.02, + "grad_norm": 0.6451615691184998, + "learning_rate": 0.00014585935674192504, + "loss": 2.8525, + "step": 41122 + }, + { + "epoch": 2.02, + "grad_norm": 0.6675742268562317, + "learning_rate": 0.0001458461484190466, + "loss": 3.0727, + "step": 41123 + }, + { + "epoch": 2.02, + "grad_norm": 0.6669958829879761, + "learning_rate": 0.00014583294050217967, + "loss": 2.8502, + "step": 41124 + }, + { + "epoch": 2.02, + "grad_norm": 0.6869798302650452, + "learning_rate": 0.0001458197329913591, + "loss": 2.9034, + "step": 41125 + }, + { + "epoch": 2.02, + "grad_norm": 0.6724489331245422, + "learning_rate": 0.00014580652588661974, + "loss": 2.9271, + "step": 41126 + }, + { + "epoch": 2.02, + "grad_norm": 0.6963558793067932, + "learning_rate": 0.00014579331918799634, + "loss": 2.8939, + "step": 41127 + }, + { + "epoch": 2.02, + "grad_norm": 0.6525717973709106, + "learning_rate": 0.0001457801128955237, + "loss": 2.8894, + "step": 41128 + }, + { + "epoch": 2.02, + "grad_norm": 0.6995739936828613, + "learning_rate": 0.0001457669070092364, + "loss": 2.807, + "step": 41129 + }, + { + "epoch": 2.02, + "grad_norm": 0.6798244714736938, + "learning_rate": 0.0001457537015291694, + "loss": 3.1188, + "step": 41130 + }, + { + "epoch": 2.02, + "grad_norm": 0.6413742303848267, + "learning_rate": 0.0001457404964553576, + "loss": 2.8515, + "step": 41131 + }, + { + "epoch": 2.02, + "grad_norm": 0.68084317445755, + "learning_rate": 0.0001457272917878355, + "loss": 2.7809, + "step": 41132 + }, + { + "epoch": 2.02, + "grad_norm": 0.6480937004089355, + "learning_rate": 0.00014571408752663817, + "loss": 2.7539, + "step": 41133 + }, + { + "epoch": 2.02, + "grad_norm": 0.6867110729217529, + "learning_rate": 0.00014570088367180006, + "loss": 3.0271, + "step": 41134 + }, + { + "epoch": 2.02, + "grad_norm": 0.7035594582557678, + "learning_rate": 0.00014568768022335632, + "loss": 3.1673, + "step": 41135 + }, + { + "epoch": 2.02, + "grad_norm": 0.6588776707649231, + "learning_rate": 0.00014567447718134147, + "loss": 2.9375, + "step": 41136 + }, + { + "epoch": 2.02, + "grad_norm": 0.6488538980484009, + "learning_rate": 0.00014566127454579026, + "loss": 2.9297, + "step": 41137 + }, + { + "epoch": 2.02, + "grad_norm": 0.6975100040435791, + "learning_rate": 0.00014564807231673768, + "loss": 3.0464, + "step": 41138 + }, + { + "epoch": 2.02, + "grad_norm": 0.6509141325950623, + "learning_rate": 0.00014563487049421825, + "loss": 2.8602, + "step": 41139 + }, + { + "epoch": 2.02, + "grad_norm": 0.6687818169593811, + "learning_rate": 0.00014562166907826696, + "loss": 2.992, + "step": 41140 + }, + { + "epoch": 2.02, + "grad_norm": 0.6703136563301086, + "learning_rate": 0.00014560846806891837, + "loss": 2.8122, + "step": 41141 + }, + { + "epoch": 2.02, + "grad_norm": 0.7211496829986572, + "learning_rate": 0.00014559526746620748, + "loss": 2.7898, + "step": 41142 + }, + { + "epoch": 2.02, + "grad_norm": 0.6537548303604126, + "learning_rate": 0.00014558206727016896, + "loss": 2.7898, + "step": 41143 + }, + { + "epoch": 2.02, + "grad_norm": 0.6834998726844788, + "learning_rate": 0.00014556886748083743, + "loss": 2.9153, + "step": 41144 + }, + { + "epoch": 2.02, + "grad_norm": 0.6743541359901428, + "learning_rate": 0.00014555566809824788, + "loss": 2.9644, + "step": 41145 + }, + { + "epoch": 2.02, + "grad_norm": 0.6714619994163513, + "learning_rate": 0.00014554246912243488, + "loss": 2.9662, + "step": 41146 + }, + { + "epoch": 2.02, + "grad_norm": 0.6737746596336365, + "learning_rate": 0.0001455292705534333, + "loss": 2.7696, + "step": 41147 + }, + { + "epoch": 2.02, + "grad_norm": 0.654802143573761, + "learning_rate": 0.00014551607239127797, + "loss": 2.8942, + "step": 41148 + }, + { + "epoch": 2.02, + "grad_norm": 0.6912543177604675, + "learning_rate": 0.0001455028746360036, + "loss": 3.193, + "step": 41149 + }, + { + "epoch": 2.02, + "grad_norm": 0.6937560439109802, + "learning_rate": 0.00014548967728764492, + "loss": 2.8186, + "step": 41150 + }, + { + "epoch": 2.02, + "grad_norm": 0.6445648074150085, + "learning_rate": 0.00014547648034623657, + "loss": 2.8501, + "step": 41151 + }, + { + "epoch": 2.02, + "grad_norm": 0.7112509608268738, + "learning_rate": 0.00014546328381181346, + "loss": 2.6133, + "step": 41152 + }, + { + "epoch": 2.02, + "grad_norm": 0.6472128629684448, + "learning_rate": 0.00014545008768441042, + "loss": 2.8535, + "step": 41153 + }, + { + "epoch": 2.02, + "grad_norm": 0.6357482075691223, + "learning_rate": 0.00014543689196406197, + "loss": 2.9497, + "step": 41154 + }, + { + "epoch": 2.02, + "grad_norm": 0.6605250239372253, + "learning_rate": 0.00014542369665080315, + "loss": 2.8173, + "step": 41155 + }, + { + "epoch": 2.02, + "grad_norm": 0.6524414420127869, + "learning_rate": 0.00014541050174466844, + "loss": 3.0099, + "step": 41156 + }, + { + "epoch": 2.02, + "grad_norm": 0.6539895534515381, + "learning_rate": 0.0001453973072456928, + "loss": 3.0537, + "step": 41157 + }, + { + "epoch": 2.02, + "grad_norm": 0.6876203417778015, + "learning_rate": 0.00014538411315391094, + "loss": 2.9975, + "step": 41158 + }, + { + "epoch": 2.02, + "grad_norm": 0.6869372725486755, + "learning_rate": 0.00014537091946935742, + "loss": 3.1229, + "step": 41159 + }, + { + "epoch": 2.02, + "grad_norm": 0.6883721947669983, + "learning_rate": 0.0001453577261920673, + "loss": 2.8087, + "step": 41160 + }, + { + "epoch": 2.02, + "grad_norm": 0.6697179675102234, + "learning_rate": 0.000145344533322075, + "loss": 2.9453, + "step": 41161 + }, + { + "epoch": 2.02, + "grad_norm": 0.6436740159988403, + "learning_rate": 0.00014533134085941546, + "loss": 3.0862, + "step": 41162 + }, + { + "epoch": 2.02, + "grad_norm": 0.6619797945022583, + "learning_rate": 0.0001453181488041235, + "loss": 3.0743, + "step": 41163 + }, + { + "epoch": 2.02, + "grad_norm": 0.6886985898017883, + "learning_rate": 0.0001453049571562338, + "loss": 2.8477, + "step": 41164 + }, + { + "epoch": 2.02, + "grad_norm": 0.6293699145317078, + "learning_rate": 0.00014529176591578105, + "loss": 2.8921, + "step": 41165 + }, + { + "epoch": 2.02, + "grad_norm": 0.7019802331924438, + "learning_rate": 0.00014527857508279992, + "loss": 2.9149, + "step": 41166 + }, + { + "epoch": 2.02, + "grad_norm": 0.6821876168251038, + "learning_rate": 0.00014526538465732522, + "loss": 2.7066, + "step": 41167 + }, + { + "epoch": 2.02, + "grad_norm": 0.7083041071891785, + "learning_rate": 0.00014525219463939183, + "loss": 2.9684, + "step": 41168 + }, + { + "epoch": 2.02, + "grad_norm": 0.7001522183418274, + "learning_rate": 0.0001452390050290343, + "loss": 2.9329, + "step": 41169 + }, + { + "epoch": 2.02, + "grad_norm": 0.7127074003219604, + "learning_rate": 0.00014522581582628754, + "loss": 2.958, + "step": 41170 + }, + { + "epoch": 2.02, + "grad_norm": 0.6899229884147644, + "learning_rate": 0.00014521262703118617, + "loss": 2.9127, + "step": 41171 + }, + { + "epoch": 2.02, + "grad_norm": 0.6654265522956848, + "learning_rate": 0.00014519943864376486, + "loss": 2.997, + "step": 41172 + }, + { + "epoch": 2.02, + "grad_norm": 0.670941174030304, + "learning_rate": 0.00014518625066405852, + "loss": 2.6803, + "step": 41173 + }, + { + "epoch": 2.02, + "grad_norm": 0.6345453858375549, + "learning_rate": 0.00014517306309210176, + "loss": 3.1015, + "step": 41174 + }, + { + "epoch": 2.02, + "grad_norm": 0.6700384616851807, + "learning_rate": 0.0001451598759279294, + "loss": 3.0479, + "step": 41175 + }, + { + "epoch": 2.02, + "grad_norm": 0.686065673828125, + "learning_rate": 0.00014514668917157603, + "loss": 2.9507, + "step": 41176 + }, + { + "epoch": 2.02, + "grad_norm": 0.6618055701255798, + "learning_rate": 0.00014513350282307648, + "loss": 2.918, + "step": 41177 + }, + { + "epoch": 2.02, + "grad_norm": 0.7136662006378174, + "learning_rate": 0.00014512031688246563, + "loss": 3.132, + "step": 41178 + }, + { + "epoch": 2.02, + "grad_norm": 0.6863377690315247, + "learning_rate": 0.00014510713134977804, + "loss": 2.8091, + "step": 41179 + }, + { + "epoch": 2.02, + "grad_norm": 0.6750915050506592, + "learning_rate": 0.00014509394622504843, + "loss": 2.9447, + "step": 41180 + }, + { + "epoch": 2.02, + "grad_norm": 0.6594275832176208, + "learning_rate": 0.00014508076150831145, + "loss": 3.0114, + "step": 41181 + }, + { + "epoch": 2.02, + "grad_norm": 0.6762315630912781, + "learning_rate": 0.00014506757719960192, + "loss": 2.8729, + "step": 41182 + }, + { + "epoch": 2.02, + "grad_norm": 0.7663244605064392, + "learning_rate": 0.0001450543932989547, + "loss": 2.8163, + "step": 41183 + }, + { + "epoch": 2.02, + "grad_norm": 0.7097091674804688, + "learning_rate": 0.0001450412098064043, + "loss": 2.8352, + "step": 41184 + }, + { + "epoch": 2.02, + "grad_norm": 0.7125594615936279, + "learning_rate": 0.0001450280267219856, + "loss": 3.0027, + "step": 41185 + }, + { + "epoch": 2.02, + "grad_norm": 0.6626073718070984, + "learning_rate": 0.00014501484404573327, + "loss": 2.8722, + "step": 41186 + }, + { + "epoch": 2.02, + "grad_norm": 0.6956759691238403, + "learning_rate": 0.0001450016617776819, + "loss": 2.7935, + "step": 41187 + }, + { + "epoch": 2.02, + "grad_norm": 0.638950765132904, + "learning_rate": 0.00014498847991786644, + "loss": 2.8794, + "step": 41188 + }, + { + "epoch": 2.02, + "grad_norm": 0.6870553493499756, + "learning_rate": 0.00014497529846632136, + "loss": 3.0417, + "step": 41189 + }, + { + "epoch": 2.02, + "grad_norm": 0.681865394115448, + "learning_rate": 0.00014496211742308165, + "loss": 2.9795, + "step": 41190 + }, + { + "epoch": 2.02, + "grad_norm": 0.6893475651741028, + "learning_rate": 0.00014494893678818174, + "loss": 2.7124, + "step": 41191 + }, + { + "epoch": 2.02, + "grad_norm": 0.6525934338569641, + "learning_rate": 0.00014493575656165654, + "loss": 2.8924, + "step": 41192 + }, + { + "epoch": 2.02, + "grad_norm": 0.7462884783744812, + "learning_rate": 0.0001449225767435409, + "loss": 2.8665, + "step": 41193 + }, + { + "epoch": 2.02, + "grad_norm": 0.662916898727417, + "learning_rate": 0.00014490939733386915, + "loss": 3.0731, + "step": 41194 + }, + { + "epoch": 2.02, + "grad_norm": 0.639997661113739, + "learning_rate": 0.00014489621833267628, + "loss": 3.1213, + "step": 41195 + }, + { + "epoch": 2.02, + "grad_norm": 0.6598368883132935, + "learning_rate": 0.00014488303973999683, + "loss": 2.8662, + "step": 41196 + }, + { + "epoch": 2.02, + "grad_norm": 0.6675333380699158, + "learning_rate": 0.0001448698615558656, + "loss": 2.9091, + "step": 41197 + }, + { + "epoch": 2.02, + "grad_norm": 0.6575494408607483, + "learning_rate": 0.0001448566837803174, + "loss": 3.143, + "step": 41198 + }, + { + "epoch": 2.02, + "grad_norm": 0.7101870775222778, + "learning_rate": 0.00014484350641338672, + "loss": 2.8739, + "step": 41199 + }, + { + "epoch": 2.02, + "grad_norm": 0.7055264711380005, + "learning_rate": 0.00014483032945510852, + "loss": 3.0787, + "step": 41200 + }, + { + "epoch": 2.02, + "grad_norm": 0.6996819376945496, + "learning_rate": 0.00014481715290551734, + "loss": 3.099, + "step": 41201 + }, + { + "epoch": 2.02, + "grad_norm": 0.665325939655304, + "learning_rate": 0.0001448039767646478, + "loss": 2.913, + "step": 41202 + }, + { + "epoch": 2.02, + "grad_norm": 0.6452702879905701, + "learning_rate": 0.00014479080103253487, + "loss": 2.9902, + "step": 41203 + }, + { + "epoch": 2.02, + "grad_norm": 0.6679407358169556, + "learning_rate": 0.00014477762570921294, + "loss": 3.0373, + "step": 41204 + }, + { + "epoch": 2.02, + "grad_norm": 0.6822876930236816, + "learning_rate": 0.000144764450794717, + "loss": 2.9035, + "step": 41205 + }, + { + "epoch": 2.02, + "grad_norm": 0.6947330236434937, + "learning_rate": 0.00014475127628908152, + "loss": 3.0485, + "step": 41206 + }, + { + "epoch": 2.02, + "grad_norm": 0.6485691070556641, + "learning_rate": 0.00014473810219234142, + "loss": 2.6635, + "step": 41207 + }, + { + "epoch": 2.02, + "grad_norm": 0.6676596999168396, + "learning_rate": 0.00014472492850453126, + "loss": 2.921, + "step": 41208 + }, + { + "epoch": 2.02, + "grad_norm": 0.676518440246582, + "learning_rate": 0.00014471175522568568, + "loss": 3.0537, + "step": 41209 + }, + { + "epoch": 2.02, + "grad_norm": 0.6761108040809631, + "learning_rate": 0.0001446985823558395, + "loss": 2.8734, + "step": 41210 + }, + { + "epoch": 2.02, + "grad_norm": 0.6406627893447876, + "learning_rate": 0.0001446854098950273, + "loss": 3.074, + "step": 41211 + }, + { + "epoch": 2.02, + "grad_norm": 0.6818718314170837, + "learning_rate": 0.00014467223784328382, + "loss": 3.0591, + "step": 41212 + }, + { + "epoch": 2.02, + "grad_norm": 0.6850602030754089, + "learning_rate": 0.00014465906620064387, + "loss": 3.0959, + "step": 41213 + }, + { + "epoch": 2.02, + "grad_norm": 0.6739491820335388, + "learning_rate": 0.00014464589496714198, + "loss": 2.9633, + "step": 41214 + }, + { + "epoch": 2.02, + "grad_norm": 0.7058461308479309, + "learning_rate": 0.00014463272414281297, + "loss": 2.8375, + "step": 41215 + }, + { + "epoch": 2.02, + "grad_norm": 0.6772422790527344, + "learning_rate": 0.00014461955372769149, + "loss": 2.969, + "step": 41216 + }, + { + "epoch": 2.02, + "grad_norm": 0.64620441198349, + "learning_rate": 0.00014460638372181217, + "loss": 2.8003, + "step": 41217 + }, + { + "epoch": 2.02, + "grad_norm": 0.6821702718734741, + "learning_rate": 0.00014459321412520963, + "loss": 2.873, + "step": 41218 + }, + { + "epoch": 2.02, + "grad_norm": 0.6443877220153809, + "learning_rate": 0.00014458004493791868, + "loss": 2.7943, + "step": 41219 + }, + { + "epoch": 2.02, + "grad_norm": 0.651509165763855, + "learning_rate": 0.00014456687615997408, + "loss": 2.7859, + "step": 41220 + }, + { + "epoch": 2.02, + "grad_norm": 0.7200280427932739, + "learning_rate": 0.0001445537077914103, + "loss": 2.7788, + "step": 41221 + }, + { + "epoch": 2.02, + "grad_norm": 0.71668940782547, + "learning_rate": 0.00014454053983226225, + "loss": 2.7504, + "step": 41222 + }, + { + "epoch": 2.02, + "grad_norm": 0.6477024555206299, + "learning_rate": 0.00014452737228256448, + "loss": 2.8924, + "step": 41223 + }, + { + "epoch": 2.02, + "grad_norm": 0.6733539700508118, + "learning_rate": 0.00014451420514235158, + "loss": 2.9713, + "step": 41224 + }, + { + "epoch": 2.02, + "grad_norm": 0.6498569250106812, + "learning_rate": 0.0001445010384116585, + "loss": 2.7993, + "step": 41225 + }, + { + "epoch": 2.02, + "grad_norm": 0.8216848373413086, + "learning_rate": 0.00014448787209051962, + "loss": 2.8396, + "step": 41226 + }, + { + "epoch": 2.02, + "grad_norm": 0.7104951739311218, + "learning_rate": 0.00014447470617896986, + "loss": 2.8338, + "step": 41227 + }, + { + "epoch": 2.02, + "grad_norm": 0.6697823405265808, + "learning_rate": 0.00014446154067704366, + "loss": 2.9435, + "step": 41228 + }, + { + "epoch": 2.02, + "grad_norm": 0.6788064241409302, + "learning_rate": 0.00014444837558477596, + "loss": 2.8841, + "step": 41229 + }, + { + "epoch": 2.02, + "grad_norm": 0.6955843567848206, + "learning_rate": 0.0001444352109022013, + "loss": 2.8783, + "step": 41230 + }, + { + "epoch": 2.02, + "grad_norm": 0.6401929259300232, + "learning_rate": 0.00014442204662935425, + "loss": 2.9695, + "step": 41231 + }, + { + "epoch": 2.02, + "grad_norm": 0.6757379174232483, + "learning_rate": 0.0001444088827662697, + "loss": 3.1244, + "step": 41232 + }, + { + "epoch": 2.02, + "grad_norm": 0.6907119154930115, + "learning_rate": 0.0001443957193129821, + "loss": 2.9758, + "step": 41233 + }, + { + "epoch": 2.02, + "grad_norm": 0.659367561340332, + "learning_rate": 0.00014438255626952624, + "loss": 2.9205, + "step": 41234 + }, + { + "epoch": 2.02, + "grad_norm": 0.685149073600769, + "learning_rate": 0.00014436939363593685, + "loss": 3.0351, + "step": 41235 + }, + { + "epoch": 2.02, + "grad_norm": 0.6743383407592773, + "learning_rate": 0.00014435623141224844, + "loss": 2.8732, + "step": 41236 + }, + { + "epoch": 2.02, + "grad_norm": 0.7082371115684509, + "learning_rate": 0.00014434306959849587, + "loss": 2.9676, + "step": 41237 + }, + { + "epoch": 2.02, + "grad_norm": 0.6763203144073486, + "learning_rate": 0.0001443299081947137, + "loss": 3.0997, + "step": 41238 + }, + { + "epoch": 2.02, + "grad_norm": 0.7034807205200195, + "learning_rate": 0.00014431674720093648, + "loss": 3.0158, + "step": 41239 + }, + { + "epoch": 2.02, + "grad_norm": 0.693721354007721, + "learning_rate": 0.00014430358661719908, + "loss": 2.9763, + "step": 41240 + }, + { + "epoch": 2.02, + "grad_norm": 0.6822949051856995, + "learning_rate": 0.000144290426443536, + "loss": 2.9631, + "step": 41241 + }, + { + "epoch": 2.02, + "grad_norm": 0.6607871055603027, + "learning_rate": 0.00014427726667998203, + "loss": 3.0862, + "step": 41242 + }, + { + "epoch": 2.02, + "grad_norm": 0.6698300838470459, + "learning_rate": 0.00014426410732657172, + "loss": 2.8321, + "step": 41243 + }, + { + "epoch": 2.02, + "grad_norm": 0.6640063524246216, + "learning_rate": 0.00014425094838333986, + "loss": 2.9378, + "step": 41244 + }, + { + "epoch": 2.02, + "grad_norm": 0.6912232637405396, + "learning_rate": 0.000144237789850321, + "loss": 3.0132, + "step": 41245 + }, + { + "epoch": 2.02, + "grad_norm": 0.6797363758087158, + "learning_rate": 0.00014422463172754973, + "loss": 2.7892, + "step": 41246 + }, + { + "epoch": 2.02, + "grad_norm": 0.7383774518966675, + "learning_rate": 0.0001442114740150609, + "loss": 2.9162, + "step": 41247 + }, + { + "epoch": 2.02, + "grad_norm": 0.7191385626792908, + "learning_rate": 0.000144198316712889, + "loss": 2.8664, + "step": 41248 + }, + { + "epoch": 2.02, + "grad_norm": 0.733830988407135, + "learning_rate": 0.0001441851598210687, + "loss": 2.8591, + "step": 41249 + }, + { + "epoch": 2.02, + "grad_norm": 0.6828484535217285, + "learning_rate": 0.00014417200333963486, + "loss": 3.1103, + "step": 41250 + }, + { + "epoch": 2.02, + "grad_norm": 0.6779079437255859, + "learning_rate": 0.0001441588472686219, + "loss": 2.8456, + "step": 41251 + }, + { + "epoch": 2.02, + "grad_norm": 0.6722716093063354, + "learning_rate": 0.0001441456916080646, + "loss": 2.9776, + "step": 41252 + }, + { + "epoch": 2.02, + "grad_norm": 0.7105075716972351, + "learning_rate": 0.00014413253635799743, + "loss": 3.1086, + "step": 41253 + }, + { + "epoch": 2.02, + "grad_norm": 0.6589248180389404, + "learning_rate": 0.00014411938151845518, + "loss": 3.064, + "step": 41254 + }, + { + "epoch": 2.02, + "grad_norm": 0.6822506785392761, + "learning_rate": 0.00014410622708947257, + "loss": 2.9046, + "step": 41255 + }, + { + "epoch": 2.02, + "grad_norm": 0.7151714563369751, + "learning_rate": 0.00014409307307108405, + "loss": 2.8397, + "step": 41256 + }, + { + "epoch": 2.02, + "grad_norm": 0.684058427810669, + "learning_rate": 0.0001440799194633245, + "loss": 2.7317, + "step": 41257 + }, + { + "epoch": 2.02, + "grad_norm": 0.6461659073829651, + "learning_rate": 0.00014406676626622827, + "loss": 3.0171, + "step": 41258 + }, + { + "epoch": 2.02, + "grad_norm": 0.6536449790000916, + "learning_rate": 0.00014405361347983033, + "loss": 3.0362, + "step": 41259 + }, + { + "epoch": 2.02, + "grad_norm": 0.6458353400230408, + "learning_rate": 0.00014404046110416517, + "loss": 2.7066, + "step": 41260 + }, + { + "epoch": 2.02, + "grad_norm": 0.7059634923934937, + "learning_rate": 0.00014402730913926724, + "loss": 2.8764, + "step": 41261 + }, + { + "epoch": 2.02, + "grad_norm": 0.6733324527740479, + "learning_rate": 0.0001440141575851715, + "loss": 2.9203, + "step": 41262 + }, + { + "epoch": 2.02, + "grad_norm": 0.6528270244598389, + "learning_rate": 0.0001440010064419124, + "loss": 2.8455, + "step": 41263 + }, + { + "epoch": 2.02, + "grad_norm": 0.6702911257743835, + "learning_rate": 0.00014398785570952458, + "loss": 2.8677, + "step": 41264 + }, + { + "epoch": 2.02, + "grad_norm": 0.671634316444397, + "learning_rate": 0.00014397470538804282, + "loss": 3.0517, + "step": 41265 + }, + { + "epoch": 2.02, + "grad_norm": 0.7339351177215576, + "learning_rate": 0.00014396155547750166, + "loss": 2.8713, + "step": 41266 + }, + { + "epoch": 2.02, + "grad_norm": 0.6775936484336853, + "learning_rate": 0.00014394840597793576, + "loss": 2.8699, + "step": 41267 + }, + { + "epoch": 2.02, + "grad_norm": 0.6924892663955688, + "learning_rate": 0.0001439352568893796, + "loss": 3.0556, + "step": 41268 + }, + { + "epoch": 2.02, + "grad_norm": 0.6678718328475952, + "learning_rate": 0.0001439221082118679, + "loss": 3.0232, + "step": 41269 + }, + { + "epoch": 2.02, + "grad_norm": 0.7233895063400269, + "learning_rate": 0.0001439089599454355, + "loss": 2.8655, + "step": 41270 + }, + { + "epoch": 2.02, + "grad_norm": 0.6184765100479126, + "learning_rate": 0.0001438958120901167, + "loss": 3.0533, + "step": 41271 + }, + { + "epoch": 2.02, + "grad_norm": 0.7089718580245972, + "learning_rate": 0.00014388266464594645, + "loss": 3.1122, + "step": 41272 + }, + { + "epoch": 2.02, + "grad_norm": 0.6720311045646667, + "learning_rate": 0.0001438695176129592, + "loss": 2.9664, + "step": 41273 + }, + { + "epoch": 2.02, + "grad_norm": 0.66576087474823, + "learning_rate": 0.00014385637099118944, + "loss": 2.8472, + "step": 41274 + }, + { + "epoch": 2.02, + "grad_norm": 0.6577184200286865, + "learning_rate": 0.0001438432247806721, + "loss": 2.7898, + "step": 41275 + }, + { + "epoch": 2.02, + "grad_norm": 0.6298528909683228, + "learning_rate": 0.00014383007898144151, + "loss": 2.9125, + "step": 41276 + }, + { + "epoch": 2.02, + "grad_norm": 0.6644882559776306, + "learning_rate": 0.0001438169335935326, + "loss": 2.9001, + "step": 41277 + }, + { + "epoch": 2.02, + "grad_norm": 0.6496797800064087, + "learning_rate": 0.00014380378861697968, + "loss": 2.8136, + "step": 41278 + }, + { + "epoch": 2.02, + "grad_norm": 0.6571464538574219, + "learning_rate": 0.00014379064405181752, + "loss": 2.9994, + "step": 41279 + }, + { + "epoch": 2.02, + "grad_norm": 0.6770309209823608, + "learning_rate": 0.00014377749989808087, + "loss": 2.9402, + "step": 41280 + }, + { + "epoch": 2.02, + "grad_norm": 0.6546334028244019, + "learning_rate": 0.0001437643561558042, + "loss": 2.9614, + "step": 41281 + }, + { + "epoch": 2.02, + "grad_norm": 0.6517234444618225, + "learning_rate": 0.00014375121282502216, + "loss": 2.8285, + "step": 41282 + }, + { + "epoch": 2.02, + "grad_norm": 0.6837851405143738, + "learning_rate": 0.00014373806990576924, + "loss": 3.1049, + "step": 41283 + }, + { + "epoch": 2.02, + "grad_norm": 0.6652602553367615, + "learning_rate": 0.00014372492739808017, + "loss": 2.958, + "step": 41284 + }, + { + "epoch": 2.02, + "grad_norm": 0.6891589760780334, + "learning_rate": 0.0001437117853019897, + "loss": 3.0539, + "step": 41285 + }, + { + "epoch": 2.02, + "grad_norm": 0.6521453857421875, + "learning_rate": 0.00014369864361753217, + "loss": 2.7239, + "step": 41286 + }, + { + "epoch": 2.02, + "grad_norm": 0.642013669013977, + "learning_rate": 0.00014368550234474244, + "loss": 3.1218, + "step": 41287 + }, + { + "epoch": 2.02, + "grad_norm": 0.6464647650718689, + "learning_rate": 0.00014367236148365504, + "loss": 2.8118, + "step": 41288 + }, + { + "epoch": 2.02, + "grad_norm": 0.6746804714202881, + "learning_rate": 0.0001436592210343044, + "loss": 2.9244, + "step": 41289 + }, + { + "epoch": 2.02, + "grad_norm": 0.6726866364479065, + "learning_rate": 0.00014364608099672543, + "loss": 2.8907, + "step": 41290 + }, + { + "epoch": 2.02, + "grad_norm": 0.686701774597168, + "learning_rate": 0.00014363294137095247, + "loss": 2.9126, + "step": 41291 + }, + { + "epoch": 2.02, + "grad_norm": 0.6961394548416138, + "learning_rate": 0.00014361980215702035, + "loss": 3.023, + "step": 41292 + }, + { + "epoch": 2.02, + "grad_norm": 0.662632167339325, + "learning_rate": 0.0001436066633549635, + "loss": 2.9869, + "step": 41293 + }, + { + "epoch": 2.02, + "grad_norm": 0.6637516617774963, + "learning_rate": 0.00014359352496481667, + "loss": 2.8747, + "step": 41294 + }, + { + "epoch": 2.02, + "grad_norm": 0.6575713753700256, + "learning_rate": 0.00014358038698661442, + "loss": 2.8826, + "step": 41295 + }, + { + "epoch": 2.02, + "grad_norm": 0.6427962779998779, + "learning_rate": 0.00014356724942039118, + "loss": 3.012, + "step": 41296 + }, + { + "epoch": 2.02, + "grad_norm": 0.6798694729804993, + "learning_rate": 0.00014355411226618184, + "loss": 2.9795, + "step": 41297 + }, + { + "epoch": 2.02, + "grad_norm": 0.680004894733429, + "learning_rate": 0.00014354097552402077, + "loss": 3.1077, + "step": 41298 + }, + { + "epoch": 2.02, + "grad_norm": 0.6552472114562988, + "learning_rate": 0.00014352783919394273, + "loss": 2.9108, + "step": 41299 + }, + { + "epoch": 2.02, + "grad_norm": 0.6746847629547119, + "learning_rate": 0.00014351470327598215, + "loss": 2.8963, + "step": 41300 + }, + { + "epoch": 2.02, + "grad_norm": 0.6662331223487854, + "learning_rate": 0.0001435015677701737, + "loss": 2.7933, + "step": 41301 + }, + { + "epoch": 2.02, + "grad_norm": 0.6595388650894165, + "learning_rate": 0.00014348843267655219, + "loss": 2.7368, + "step": 41302 + }, + { + "epoch": 2.02, + "grad_norm": 0.6785330176353455, + "learning_rate": 0.00014347529799515197, + "loss": 2.8712, + "step": 41303 + }, + { + "epoch": 2.02, + "grad_norm": 0.6657465696334839, + "learning_rate": 0.00014346216372600768, + "loss": 2.7114, + "step": 41304 + }, + { + "epoch": 2.02, + "grad_norm": 0.6740917563438416, + "learning_rate": 0.00014344902986915386, + "loss": 2.8561, + "step": 41305 + }, + { + "epoch": 2.02, + "grad_norm": 0.6666470170021057, + "learning_rate": 0.00014343589642462512, + "loss": 2.9495, + "step": 41306 + }, + { + "epoch": 2.02, + "grad_norm": 0.71805739402771, + "learning_rate": 0.00014342276339245622, + "loss": 3.068, + "step": 41307 + }, + { + "epoch": 2.02, + "grad_norm": 0.7400341033935547, + "learning_rate": 0.00014340963077268153, + "loss": 3.0277, + "step": 41308 + }, + { + "epoch": 2.02, + "grad_norm": 0.6645949482917786, + "learning_rate": 0.00014339649856533583, + "loss": 2.9738, + "step": 41309 + }, + { + "epoch": 2.02, + "grad_norm": 0.6728555560112, + "learning_rate": 0.00014338336677045365, + "loss": 3.0966, + "step": 41310 + }, + { + "epoch": 2.02, + "grad_norm": 0.6942307353019714, + "learning_rate": 0.0001433702353880694, + "loss": 2.7839, + "step": 41311 + }, + { + "epoch": 2.02, + "grad_norm": 0.705938458442688, + "learning_rate": 0.0001433571044182179, + "loss": 2.6229, + "step": 41312 + }, + { + "epoch": 2.02, + "grad_norm": 0.6518526077270508, + "learning_rate": 0.00014334397386093356, + "loss": 2.8772, + "step": 41313 + }, + { + "epoch": 2.02, + "grad_norm": 0.7186595797538757, + "learning_rate": 0.00014333084371625116, + "loss": 2.8394, + "step": 41314 + }, + { + "epoch": 2.02, + "grad_norm": 0.6730000376701355, + "learning_rate": 0.00014331771398420502, + "loss": 2.9375, + "step": 41315 + }, + { + "epoch": 2.02, + "grad_norm": 0.672404408454895, + "learning_rate": 0.00014330458466482986, + "loss": 2.8477, + "step": 41316 + }, + { + "epoch": 2.02, + "grad_norm": 0.6947441697120667, + "learning_rate": 0.00014329145575816044, + "loss": 2.8878, + "step": 41317 + }, + { + "epoch": 2.02, + "grad_norm": 0.7594353556632996, + "learning_rate": 0.00014327832726423114, + "loss": 3.0445, + "step": 41318 + }, + { + "epoch": 2.02, + "grad_norm": 0.6902020573616028, + "learning_rate": 0.00014326519918307653, + "loss": 3.1234, + "step": 41319 + }, + { + "epoch": 2.02, + "grad_norm": 0.641372799873352, + "learning_rate": 0.00014325207151473114, + "loss": 2.9663, + "step": 41320 + }, + { + "epoch": 2.03, + "grad_norm": 0.6673992276191711, + "learning_rate": 0.00014323894425922963, + "loss": 2.906, + "step": 41321 + }, + { + "epoch": 2.03, + "grad_norm": 0.6893587708473206, + "learning_rate": 0.00014322581741660668, + "loss": 2.7704, + "step": 41322 + }, + { + "epoch": 2.03, + "grad_norm": 0.6781730055809021, + "learning_rate": 0.00014321269098689664, + "loss": 2.6757, + "step": 41323 + }, + { + "epoch": 2.03, + "grad_norm": 0.6808968782424927, + "learning_rate": 0.0001431995649701343, + "loss": 3.0846, + "step": 41324 + }, + { + "epoch": 2.03, + "grad_norm": 0.650240421295166, + "learning_rate": 0.00014318643936635415, + "loss": 2.9279, + "step": 41325 + }, + { + "epoch": 2.03, + "grad_norm": 0.6758670210838318, + "learning_rate": 0.0001431733141755906, + "loss": 2.9108, + "step": 41326 + }, + { + "epoch": 2.03, + "grad_norm": 0.6735976934432983, + "learning_rate": 0.0001431601893978785, + "loss": 3.1225, + "step": 41327 + }, + { + "epoch": 2.03, + "grad_norm": 0.6825075745582581, + "learning_rate": 0.00014314706503325213, + "loss": 2.8253, + "step": 41328 + }, + { + "epoch": 2.03, + "grad_norm": 0.6795275211334229, + "learning_rate": 0.00014313394108174637, + "loss": 3.0741, + "step": 41329 + }, + { + "epoch": 2.03, + "grad_norm": 0.6708423495292664, + "learning_rate": 0.00014312081754339545, + "loss": 2.9368, + "step": 41330 + }, + { + "epoch": 2.03, + "grad_norm": 0.7268308997154236, + "learning_rate": 0.00014310769441823424, + "loss": 2.8793, + "step": 41331 + }, + { + "epoch": 2.03, + "grad_norm": 0.662429690361023, + "learning_rate": 0.00014309457170629718, + "loss": 2.9008, + "step": 41332 + }, + { + "epoch": 2.03, + "grad_norm": 0.6782928109169006, + "learning_rate": 0.00014308144940761868, + "loss": 2.9867, + "step": 41333 + }, + { + "epoch": 2.03, + "grad_norm": 0.6727951765060425, + "learning_rate": 0.00014306832752223358, + "loss": 3.1139, + "step": 41334 + }, + { + "epoch": 2.03, + "grad_norm": 0.6834380626678467, + "learning_rate": 0.00014305520605017617, + "loss": 3.0817, + "step": 41335 + }, + { + "epoch": 2.03, + "grad_norm": 0.6775288581848145, + "learning_rate": 0.00014304208499148118, + "loss": 3.0145, + "step": 41336 + }, + { + "epoch": 2.03, + "grad_norm": 0.6631368398666382, + "learning_rate": 0.00014302896434618322, + "loss": 3.1274, + "step": 41337 + }, + { + "epoch": 2.03, + "grad_norm": 0.65370112657547, + "learning_rate": 0.00014301584411431664, + "loss": 3.0237, + "step": 41338 + }, + { + "epoch": 2.03, + "grad_norm": 0.6937180757522583, + "learning_rate": 0.00014300272429591625, + "loss": 3.0834, + "step": 41339 + }, + { + "epoch": 2.03, + "grad_norm": 0.6696368455886841, + "learning_rate": 0.00014298960489101642, + "loss": 3.1994, + "step": 41340 + }, + { + "epoch": 2.03, + "grad_norm": 0.6452273726463318, + "learning_rate": 0.00014297648589965169, + "loss": 2.9717, + "step": 41341 + }, + { + "epoch": 2.03, + "grad_norm": 0.6574122309684753, + "learning_rate": 0.0001429633673218568, + "loss": 2.9185, + "step": 41342 + }, + { + "epoch": 2.03, + "grad_norm": 0.6667985320091248, + "learning_rate": 0.000142950249157666, + "loss": 2.9371, + "step": 41343 + }, + { + "epoch": 2.03, + "grad_norm": 0.6507903337478638, + "learning_rate": 0.00014293713140711416, + "loss": 2.9564, + "step": 41344 + }, + { + "epoch": 2.03, + "grad_norm": 0.6721150875091553, + "learning_rate": 0.0001429240140702356, + "loss": 2.9744, + "step": 41345 + }, + { + "epoch": 2.03, + "grad_norm": 0.6516289710998535, + "learning_rate": 0.00014291089714706505, + "loss": 2.9591, + "step": 41346 + }, + { + "epoch": 2.03, + "grad_norm": 0.6478187441825867, + "learning_rate": 0.00014289778063763698, + "loss": 2.8334, + "step": 41347 + }, + { + "epoch": 2.03, + "grad_norm": 0.6945951581001282, + "learning_rate": 0.0001428846645419858, + "loss": 3.1982, + "step": 41348 + }, + { + "epoch": 2.03, + "grad_norm": 0.7314687967300415, + "learning_rate": 0.00014287154886014628, + "loss": 2.7935, + "step": 41349 + }, + { + "epoch": 2.03, + "grad_norm": 0.7077362537384033, + "learning_rate": 0.00014285843359215274, + "loss": 2.9902, + "step": 41350 + }, + { + "epoch": 2.03, + "grad_norm": 0.676292359828949, + "learning_rate": 0.00014284531873803987, + "loss": 3.0049, + "step": 41351 + }, + { + "epoch": 2.03, + "grad_norm": 0.6695364117622375, + "learning_rate": 0.00014283220429784228, + "loss": 3.1166, + "step": 41352 + }, + { + "epoch": 2.03, + "grad_norm": 0.7214575409889221, + "learning_rate": 0.00014281909027159443, + "loss": 2.924, + "step": 41353 + }, + { + "epoch": 2.03, + "grad_norm": 0.6631299257278442, + "learning_rate": 0.00014280597665933083, + "loss": 3.1442, + "step": 41354 + }, + { + "epoch": 2.03, + "grad_norm": 0.6750709414482117, + "learning_rate": 0.00014279286346108593, + "loss": 3.065, + "step": 41355 + }, + { + "epoch": 2.03, + "grad_norm": 0.7027854323387146, + "learning_rate": 0.0001427797506768944, + "loss": 3.1048, + "step": 41356 + }, + { + "epoch": 2.03, + "grad_norm": 0.6556305885314941, + "learning_rate": 0.00014276663830679081, + "loss": 2.944, + "step": 41357 + }, + { + "epoch": 2.03, + "grad_norm": 0.7109807729721069, + "learning_rate": 0.00014275352635080956, + "loss": 2.8792, + "step": 41358 + }, + { + "epoch": 2.03, + "grad_norm": 0.7511147260665894, + "learning_rate": 0.00014274041480898534, + "loss": 2.8699, + "step": 41359 + }, + { + "epoch": 2.03, + "grad_norm": 0.665418803691864, + "learning_rate": 0.0001427273036813525, + "loss": 2.9801, + "step": 41360 + }, + { + "epoch": 2.03, + "grad_norm": 0.7250953316688538, + "learning_rate": 0.0001427141929679458, + "loss": 3.0798, + "step": 41361 + }, + { + "epoch": 2.03, + "grad_norm": 0.6798324584960938, + "learning_rate": 0.00014270108266879964, + "loss": 3.0348, + "step": 41362 + }, + { + "epoch": 2.03, + "grad_norm": 0.6868711113929749, + "learning_rate": 0.00014268797278394843, + "loss": 2.9232, + "step": 41363 + }, + { + "epoch": 2.03, + "grad_norm": 0.683142900466919, + "learning_rate": 0.00014267486331342695, + "loss": 2.918, + "step": 41364 + }, + { + "epoch": 2.03, + "grad_norm": 0.6508060097694397, + "learning_rate": 0.00014266175425726952, + "loss": 2.8509, + "step": 41365 + }, + { + "epoch": 2.03, + "grad_norm": 0.6772064566612244, + "learning_rate": 0.0001426486456155107, + "loss": 2.9454, + "step": 41366 + }, + { + "epoch": 2.03, + "grad_norm": 0.6569616794586182, + "learning_rate": 0.00014263553738818517, + "loss": 2.8806, + "step": 41367 + }, + { + "epoch": 2.03, + "grad_norm": 0.6555266380310059, + "learning_rate": 0.0001426224295753274, + "loss": 2.7536, + "step": 41368 + }, + { + "epoch": 2.03, + "grad_norm": 0.6969777345657349, + "learning_rate": 0.00014260932217697182, + "loss": 3.0595, + "step": 41369 + }, + { + "epoch": 2.03, + "grad_norm": 0.6618245840072632, + "learning_rate": 0.00014259621519315292, + "loss": 2.9002, + "step": 41370 + }, + { + "epoch": 2.03, + "grad_norm": 0.6819080710411072, + "learning_rate": 0.0001425831086239054, + "loss": 2.9353, + "step": 41371 + }, + { + "epoch": 2.03, + "grad_norm": 0.8382446765899658, + "learning_rate": 0.00014257000246926352, + "loss": 2.6845, + "step": 41372 + }, + { + "epoch": 2.03, + "grad_norm": 0.6515592336654663, + "learning_rate": 0.00014255689672926198, + "loss": 3.1173, + "step": 41373 + }, + { + "epoch": 2.03, + "grad_norm": 0.6922911405563354, + "learning_rate": 0.0001425437914039354, + "loss": 2.8563, + "step": 41374 + }, + { + "epoch": 2.03, + "grad_norm": 0.6700411438941956, + "learning_rate": 0.00014253068649331817, + "loss": 3.0166, + "step": 41375 + }, + { + "epoch": 2.03, + "grad_norm": 0.666197657585144, + "learning_rate": 0.0001425175819974448, + "loss": 2.7031, + "step": 41376 + }, + { + "epoch": 2.03, + "grad_norm": 0.7301174402236938, + "learning_rate": 0.0001425044779163497, + "loss": 2.8902, + "step": 41377 + }, + { + "epoch": 2.03, + "grad_norm": 0.6734825968742371, + "learning_rate": 0.0001424913742500675, + "loss": 3.0026, + "step": 41378 + }, + { + "epoch": 2.03, + "grad_norm": 0.7192350625991821, + "learning_rate": 0.0001424782709986328, + "loss": 3.0602, + "step": 41379 + }, + { + "epoch": 2.03, + "grad_norm": 0.7443360090255737, + "learning_rate": 0.00014246516816207995, + "loss": 2.7758, + "step": 41380 + }, + { + "epoch": 2.03, + "grad_norm": 0.6618134379386902, + "learning_rate": 0.00014245206574044358, + "loss": 2.6856, + "step": 41381 + }, + { + "epoch": 2.03, + "grad_norm": 0.7043996453285217, + "learning_rate": 0.00014243896373375807, + "loss": 2.8153, + "step": 41382 + }, + { + "epoch": 2.03, + "grad_norm": 0.7315269708633423, + "learning_rate": 0.00014242586214205814, + "loss": 2.9694, + "step": 41383 + }, + { + "epoch": 2.03, + "grad_norm": 0.6584864258766174, + "learning_rate": 0.0001424127609653781, + "loss": 2.7616, + "step": 41384 + }, + { + "epoch": 2.03, + "grad_norm": 0.6800797581672668, + "learning_rate": 0.00014239966020375246, + "loss": 2.8515, + "step": 41385 + }, + { + "epoch": 2.03, + "grad_norm": 0.7683088183403015, + "learning_rate": 0.0001423865598572159, + "loss": 2.9446, + "step": 41386 + }, + { + "epoch": 2.03, + "grad_norm": 0.6943177580833435, + "learning_rate": 0.00014237345992580266, + "loss": 3.2236, + "step": 41387 + }, + { + "epoch": 2.03, + "grad_norm": 0.7042199969291687, + "learning_rate": 0.0001423603604095474, + "loss": 2.973, + "step": 41388 + }, + { + "epoch": 2.03, + "grad_norm": 0.6742187738418579, + "learning_rate": 0.00014234726130848472, + "loss": 2.6971, + "step": 41389 + }, + { + "epoch": 2.03, + "grad_norm": 0.6640881299972534, + "learning_rate": 0.00014233416262264903, + "loss": 2.7863, + "step": 41390 + }, + { + "epoch": 2.03, + "grad_norm": 0.6789493560791016, + "learning_rate": 0.0001423210643520748, + "loss": 2.9233, + "step": 41391 + }, + { + "epoch": 2.03, + "grad_norm": 0.6761623620986938, + "learning_rate": 0.00014230796649679643, + "loss": 3.1005, + "step": 41392 + }, + { + "epoch": 2.03, + "grad_norm": 0.6618014574050903, + "learning_rate": 0.00014229486905684854, + "loss": 3.047, + "step": 41393 + }, + { + "epoch": 2.03, + "grad_norm": 0.7283720970153809, + "learning_rate": 0.00014228177203226574, + "loss": 3.1616, + "step": 41394 + }, + { + "epoch": 2.03, + "grad_norm": 0.6539528965950012, + "learning_rate": 0.00014226867542308226, + "loss": 2.833, + "step": 41395 + }, + { + "epoch": 2.03, + "grad_norm": 0.6887137293815613, + "learning_rate": 0.00014225557922933286, + "loss": 3.0419, + "step": 41396 + }, + { + "epoch": 2.03, + "grad_norm": 0.6629752516746521, + "learning_rate": 0.00014224248345105192, + "loss": 2.8952, + "step": 41397 + }, + { + "epoch": 2.03, + "grad_norm": 0.6697224378585815, + "learning_rate": 0.00014222938808827377, + "loss": 3.0586, + "step": 41398 + }, + { + "epoch": 2.03, + "grad_norm": 0.6652210354804993, + "learning_rate": 0.00014221629314103317, + "loss": 3.0246, + "step": 41399 + }, + { + "epoch": 2.03, + "grad_norm": 0.6590397953987122, + "learning_rate": 0.0001422031986093644, + "loss": 2.7692, + "step": 41400 + }, + { + "epoch": 2.03, + "grad_norm": 0.7004410028457642, + "learning_rate": 0.0001421901044933021, + "loss": 2.9138, + "step": 41401 + }, + { + "epoch": 2.03, + "grad_norm": 0.7048691511154175, + "learning_rate": 0.00014217701079288062, + "loss": 2.7807, + "step": 41402 + }, + { + "epoch": 2.03, + "grad_norm": 0.7057774066925049, + "learning_rate": 0.00014216391750813452, + "loss": 2.8305, + "step": 41403 + }, + { + "epoch": 2.03, + "grad_norm": 0.6988634467124939, + "learning_rate": 0.00014215082463909837, + "loss": 2.8967, + "step": 41404 + }, + { + "epoch": 2.03, + "grad_norm": 0.644032895565033, + "learning_rate": 0.00014213773218580658, + "loss": 2.9624, + "step": 41405 + }, + { + "epoch": 2.03, + "grad_norm": 0.659142792224884, + "learning_rate": 0.00014212464014829362, + "loss": 3.1234, + "step": 41406 + }, + { + "epoch": 2.03, + "grad_norm": 0.7245345115661621, + "learning_rate": 0.00014211154852659388, + "loss": 3.121, + "step": 41407 + }, + { + "epoch": 2.03, + "grad_norm": 0.6617531180381775, + "learning_rate": 0.0001420984573207419, + "loss": 2.9157, + "step": 41408 + }, + { + "epoch": 2.03, + "grad_norm": 0.6753281354904175, + "learning_rate": 0.00014208536653077236, + "loss": 2.8922, + "step": 41409 + }, + { + "epoch": 2.03, + "grad_norm": 0.7002780437469482, + "learning_rate": 0.00014207227615671943, + "loss": 3.0797, + "step": 41410 + }, + { + "epoch": 2.03, + "grad_norm": 0.7127296328544617, + "learning_rate": 0.00014205918619861785, + "loss": 3.0361, + "step": 41411 + }, + { + "epoch": 2.03, + "grad_norm": 0.6621199250221252, + "learning_rate": 0.00014204609665650198, + "loss": 2.8687, + "step": 41412 + }, + { + "epoch": 2.03, + "grad_norm": 0.6912297010421753, + "learning_rate": 0.00014203300753040616, + "loss": 2.8506, + "step": 41413 + }, + { + "epoch": 2.03, + "grad_norm": 0.6624032258987427, + "learning_rate": 0.00014201991882036513, + "loss": 2.7342, + "step": 41414 + }, + { + "epoch": 2.03, + "grad_norm": 0.7326284646987915, + "learning_rate": 0.00014200683052641307, + "loss": 2.9032, + "step": 41415 + }, + { + "epoch": 2.03, + "grad_norm": 0.6524210572242737, + "learning_rate": 0.00014199374264858478, + "loss": 3.0304, + "step": 41416 + }, + { + "epoch": 2.03, + "grad_norm": 0.6589338183403015, + "learning_rate": 0.00014198065518691443, + "loss": 2.9891, + "step": 41417 + }, + { + "epoch": 2.03, + "grad_norm": 0.7966728806495667, + "learning_rate": 0.00014196756814143665, + "loss": 2.8156, + "step": 41418 + }, + { + "epoch": 2.03, + "grad_norm": 0.6796247363090515, + "learning_rate": 0.00014195448151218607, + "loss": 2.8564, + "step": 41419 + }, + { + "epoch": 2.03, + "grad_norm": 0.9536327123641968, + "learning_rate": 0.00014194139529919675, + "loss": 3.0352, + "step": 41420 + }, + { + "epoch": 2.03, + "grad_norm": 0.7161813378334045, + "learning_rate": 0.0001419283095025035, + "loss": 2.9973, + "step": 41421 + }, + { + "epoch": 2.03, + "grad_norm": 0.7191630005836487, + "learning_rate": 0.00014191522412214055, + "loss": 2.8155, + "step": 41422 + }, + { + "epoch": 2.03, + "grad_norm": 0.6631558537483215, + "learning_rate": 0.00014190213915814246, + "loss": 2.923, + "step": 41423 + }, + { + "epoch": 2.03, + "grad_norm": 0.7255247235298157, + "learning_rate": 0.00014188905461054384, + "loss": 2.9193, + "step": 41424 + }, + { + "epoch": 2.03, + "grad_norm": 0.733752965927124, + "learning_rate": 0.0001418759704793789, + "loss": 2.7613, + "step": 41425 + }, + { + "epoch": 2.03, + "grad_norm": 0.7541595697402954, + "learning_rate": 0.00014186288676468236, + "loss": 2.9079, + "step": 41426 + }, + { + "epoch": 2.03, + "grad_norm": 0.6659170389175415, + "learning_rate": 0.00014184980346648848, + "loss": 2.9627, + "step": 41427 + }, + { + "epoch": 2.03, + "grad_norm": 0.6909793019294739, + "learning_rate": 0.00014183672058483174, + "loss": 2.79, + "step": 41428 + }, + { + "epoch": 2.03, + "grad_norm": 0.6981406807899475, + "learning_rate": 0.00014182363811974667, + "loss": 3.0426, + "step": 41429 + }, + { + "epoch": 2.03, + "grad_norm": 0.6924818754196167, + "learning_rate": 0.00014181055607126764, + "loss": 2.6948, + "step": 41430 + }, + { + "epoch": 2.03, + "grad_norm": 0.6846038699150085, + "learning_rate": 0.00014179747443942927, + "loss": 2.9601, + "step": 41431 + }, + { + "epoch": 2.03, + "grad_norm": 0.722434937953949, + "learning_rate": 0.0001417843932242658, + "loss": 2.7735, + "step": 41432 + }, + { + "epoch": 2.03, + "grad_norm": 0.6762760877609253, + "learning_rate": 0.0001417713124258119, + "loss": 3.0094, + "step": 41433 + }, + { + "epoch": 2.03, + "grad_norm": 0.7446743845939636, + "learning_rate": 0.0001417582320441019, + "loss": 3.1196, + "step": 41434 + }, + { + "epoch": 2.03, + "grad_norm": 0.6979431509971619, + "learning_rate": 0.00014174515207917014, + "loss": 3.1234, + "step": 41435 + }, + { + "epoch": 2.03, + "grad_norm": 0.6926884055137634, + "learning_rate": 0.0001417320725310513, + "loss": 3.0273, + "step": 41436 + }, + { + "epoch": 2.03, + "grad_norm": 0.6868926286697388, + "learning_rate": 0.00014171899339977963, + "loss": 2.8642, + "step": 41437 + }, + { + "epoch": 2.03, + "grad_norm": 0.6802986860275269, + "learning_rate": 0.00014170591468538966, + "loss": 2.9907, + "step": 41438 + }, + { + "epoch": 2.03, + "grad_norm": 0.7366862893104553, + "learning_rate": 0.00014169283638791597, + "loss": 2.7683, + "step": 41439 + }, + { + "epoch": 2.03, + "grad_norm": 0.6840645670890808, + "learning_rate": 0.00014167975850739277, + "loss": 3.0044, + "step": 41440 + }, + { + "epoch": 2.03, + "grad_norm": 0.6888478398323059, + "learning_rate": 0.00014166668104385473, + "loss": 3.0112, + "step": 41441 + }, + { + "epoch": 2.03, + "grad_norm": 0.6686787009239197, + "learning_rate": 0.00014165360399733616, + "loss": 2.9337, + "step": 41442 + }, + { + "epoch": 2.03, + "grad_norm": 0.6644048690795898, + "learning_rate": 0.0001416405273678714, + "loss": 3.073, + "step": 41443 + }, + { + "epoch": 2.03, + "grad_norm": 0.6644816994667053, + "learning_rate": 0.0001416274511554952, + "loss": 2.9437, + "step": 41444 + }, + { + "epoch": 2.03, + "grad_norm": 0.6657946705818176, + "learning_rate": 0.00014161437536024162, + "loss": 2.9644, + "step": 41445 + }, + { + "epoch": 2.03, + "grad_norm": 0.6964468955993652, + "learning_rate": 0.00014160129998214545, + "loss": 3.0717, + "step": 41446 + }, + { + "epoch": 2.03, + "grad_norm": 0.7129657864570618, + "learning_rate": 0.00014158822502124084, + "loss": 3.0621, + "step": 41447 + }, + { + "epoch": 2.03, + "grad_norm": 0.6289086937904358, + "learning_rate": 0.00014157515047756247, + "loss": 2.828, + "step": 41448 + }, + { + "epoch": 2.03, + "grad_norm": 0.6725035905838013, + "learning_rate": 0.00014156207635114467, + "loss": 2.9882, + "step": 41449 + }, + { + "epoch": 2.03, + "grad_norm": 0.6678059101104736, + "learning_rate": 0.00014154900264202176, + "loss": 2.9306, + "step": 41450 + }, + { + "epoch": 2.03, + "grad_norm": 0.6416679620742798, + "learning_rate": 0.0001415359293502284, + "loss": 2.7726, + "step": 41451 + }, + { + "epoch": 2.03, + "grad_norm": 0.6881377100944519, + "learning_rate": 0.00014152285647579878, + "loss": 2.6733, + "step": 41452 + }, + { + "epoch": 2.03, + "grad_norm": 0.6680883765220642, + "learning_rate": 0.00014150978401876755, + "loss": 2.9299, + "step": 41453 + }, + { + "epoch": 2.03, + "grad_norm": 0.6864650249481201, + "learning_rate": 0.000141496711979169, + "loss": 2.9543, + "step": 41454 + }, + { + "epoch": 2.03, + "grad_norm": 0.7557768225669861, + "learning_rate": 0.00014148364035703763, + "loss": 3.0048, + "step": 41455 + }, + { + "epoch": 2.03, + "grad_norm": 0.6464671492576599, + "learning_rate": 0.0001414705691524079, + "loss": 2.8257, + "step": 41456 + }, + { + "epoch": 2.03, + "grad_norm": 0.6590653657913208, + "learning_rate": 0.00014145749836531405, + "loss": 2.8427, + "step": 41457 + }, + { + "epoch": 2.03, + "grad_norm": 0.6637300252914429, + "learning_rate": 0.00014144442799579075, + "loss": 2.8594, + "step": 41458 + }, + { + "epoch": 2.03, + "grad_norm": 0.7155690789222717, + "learning_rate": 0.00014143135804387221, + "loss": 2.8372, + "step": 41459 + }, + { + "epoch": 2.03, + "grad_norm": 0.6667417287826538, + "learning_rate": 0.00014141828850959296, + "loss": 2.8662, + "step": 41460 + }, + { + "epoch": 2.03, + "grad_norm": 0.7127690315246582, + "learning_rate": 0.00014140521939298755, + "loss": 2.9396, + "step": 41461 + }, + { + "epoch": 2.03, + "grad_norm": 0.6322091817855835, + "learning_rate": 0.00014139215069409013, + "loss": 2.8889, + "step": 41462 + }, + { + "epoch": 2.03, + "grad_norm": 0.6614790558815002, + "learning_rate": 0.00014137908241293538, + "loss": 2.8967, + "step": 41463 + }, + { + "epoch": 2.03, + "grad_norm": 0.6741194128990173, + "learning_rate": 0.0001413660145495576, + "loss": 2.9764, + "step": 41464 + }, + { + "epoch": 2.03, + "grad_norm": 0.6606133580207825, + "learning_rate": 0.0001413529471039911, + "loss": 2.7991, + "step": 41465 + }, + { + "epoch": 2.03, + "grad_norm": 0.6671067476272583, + "learning_rate": 0.0001413398800762705, + "loss": 3.006, + "step": 41466 + }, + { + "epoch": 2.03, + "grad_norm": 0.6627110242843628, + "learning_rate": 0.00014132681346643, + "loss": 2.884, + "step": 41467 + }, + { + "epoch": 2.03, + "grad_norm": 0.7146353721618652, + "learning_rate": 0.00014131374727450427, + "loss": 2.889, + "step": 41468 + }, + { + "epoch": 2.03, + "grad_norm": 0.6611478924751282, + "learning_rate": 0.0001413006815005275, + "loss": 2.8787, + "step": 41469 + }, + { + "epoch": 2.03, + "grad_norm": 0.6627159118652344, + "learning_rate": 0.00014128761614453424, + "loss": 3.0298, + "step": 41470 + }, + { + "epoch": 2.03, + "grad_norm": 0.6639991998672485, + "learning_rate": 0.0001412745512065589, + "loss": 2.9859, + "step": 41471 + }, + { + "epoch": 2.03, + "grad_norm": 0.6625120639801025, + "learning_rate": 0.0001412614866866357, + "loss": 2.9387, + "step": 41472 + }, + { + "epoch": 2.03, + "grad_norm": 0.6913546919822693, + "learning_rate": 0.00014124842258479936, + "loss": 2.8833, + "step": 41473 + }, + { + "epoch": 2.03, + "grad_norm": 0.6938081979751587, + "learning_rate": 0.00014123535890108398, + "loss": 2.8589, + "step": 41474 + }, + { + "epoch": 2.03, + "grad_norm": 0.6782478094100952, + "learning_rate": 0.0001412222956355241, + "loss": 3.0988, + "step": 41475 + }, + { + "epoch": 2.03, + "grad_norm": 0.6729618906974792, + "learning_rate": 0.00014120923278815428, + "loss": 2.9759, + "step": 41476 + }, + { + "epoch": 2.03, + "grad_norm": 0.670555055141449, + "learning_rate": 0.00014119617035900875, + "loss": 2.8674, + "step": 41477 + }, + { + "epoch": 2.03, + "grad_norm": 0.7138615250587463, + "learning_rate": 0.00014118310834812194, + "loss": 2.9513, + "step": 41478 + }, + { + "epoch": 2.03, + "grad_norm": 0.6860772967338562, + "learning_rate": 0.00014117004675552817, + "loss": 2.845, + "step": 41479 + }, + { + "epoch": 2.03, + "grad_norm": 0.7426106333732605, + "learning_rate": 0.00014115698558126195, + "loss": 2.8868, + "step": 41480 + }, + { + "epoch": 2.03, + "grad_norm": 0.6855692267417908, + "learning_rate": 0.00014114392482535772, + "loss": 2.9, + "step": 41481 + }, + { + "epoch": 2.03, + "grad_norm": 0.7088014483451843, + "learning_rate": 0.00014113086448784974, + "loss": 2.9, + "step": 41482 + }, + { + "epoch": 2.03, + "grad_norm": 0.6505365967750549, + "learning_rate": 0.0001411178045687726, + "loss": 3.0525, + "step": 41483 + }, + { + "epoch": 2.03, + "grad_norm": 0.702044665813446, + "learning_rate": 0.00014110474506816043, + "loss": 2.9941, + "step": 41484 + }, + { + "epoch": 2.03, + "grad_norm": 0.6758970022201538, + "learning_rate": 0.00014109168598604795, + "loss": 3.1789, + "step": 41485 + }, + { + "epoch": 2.03, + "grad_norm": 0.6437978744506836, + "learning_rate": 0.00014107862732246935, + "loss": 2.6538, + "step": 41486 + }, + { + "epoch": 2.03, + "grad_norm": 0.7227490544319153, + "learning_rate": 0.00014106556907745896, + "loss": 2.884, + "step": 41487 + }, + { + "epoch": 2.03, + "grad_norm": 0.7027838826179504, + "learning_rate": 0.00014105251125105139, + "loss": 3.0361, + "step": 41488 + }, + { + "epoch": 2.03, + "grad_norm": 0.6415818333625793, + "learning_rate": 0.0001410394538432808, + "loss": 2.9468, + "step": 41489 + }, + { + "epoch": 2.03, + "grad_norm": 0.6897194981575012, + "learning_rate": 0.0001410263968541817, + "loss": 2.9501, + "step": 41490 + }, + { + "epoch": 2.03, + "grad_norm": 0.6428824663162231, + "learning_rate": 0.0001410133402837886, + "loss": 2.9695, + "step": 41491 + }, + { + "epoch": 2.03, + "grad_norm": 0.7042983770370483, + "learning_rate": 0.00014100028413213572, + "loss": 2.9618, + "step": 41492 + }, + { + "epoch": 2.03, + "grad_norm": 0.7326884269714355, + "learning_rate": 0.00014098722839925754, + "loss": 3.2218, + "step": 41493 + }, + { + "epoch": 2.03, + "grad_norm": 0.6782544255256653, + "learning_rate": 0.00014097417308518825, + "loss": 2.9167, + "step": 41494 + }, + { + "epoch": 2.03, + "grad_norm": 0.6886657476425171, + "learning_rate": 0.0001409611181899624, + "loss": 2.8998, + "step": 41495 + }, + { + "epoch": 2.03, + "grad_norm": 0.7487680912017822, + "learning_rate": 0.00014094806371361447, + "loss": 2.9263, + "step": 41496 + }, + { + "epoch": 2.03, + "grad_norm": 0.654858410358429, + "learning_rate": 0.0001409350096561786, + "loss": 3.0155, + "step": 41497 + }, + { + "epoch": 2.03, + "grad_norm": 0.6685404181480408, + "learning_rate": 0.00014092195601768945, + "loss": 3.0298, + "step": 41498 + }, + { + "epoch": 2.03, + "grad_norm": 0.7062793970108032, + "learning_rate": 0.0001409089027981812, + "loss": 2.883, + "step": 41499 + }, + { + "epoch": 2.03, + "grad_norm": 0.7158371210098267, + "learning_rate": 0.00014089584999768821, + "loss": 2.9065, + "step": 41500 + }, + { + "epoch": 2.03, + "grad_norm": 0.7162919640541077, + "learning_rate": 0.00014088279761624505, + "loss": 3.1325, + "step": 41501 + }, + { + "epoch": 2.03, + "grad_norm": 0.6152122616767883, + "learning_rate": 0.00014086974565388587, + "loss": 2.8914, + "step": 41502 + }, + { + "epoch": 2.03, + "grad_norm": 0.7153500318527222, + "learning_rate": 0.00014085669411064523, + "loss": 3.1672, + "step": 41503 + }, + { + "epoch": 2.03, + "grad_norm": 0.6505460143089294, + "learning_rate": 0.00014084364298655734, + "loss": 3.0139, + "step": 41504 + }, + { + "epoch": 2.03, + "grad_norm": 0.7156624794006348, + "learning_rate": 0.00014083059228165665, + "loss": 2.918, + "step": 41505 + }, + { + "epoch": 2.03, + "grad_norm": 0.657463550567627, + "learning_rate": 0.00014081754199597767, + "loss": 3.0337, + "step": 41506 + }, + { + "epoch": 2.03, + "grad_norm": 0.6673892736434937, + "learning_rate": 0.00014080449212955468, + "loss": 2.8168, + "step": 41507 + }, + { + "epoch": 2.03, + "grad_norm": 0.6680115461349487, + "learning_rate": 0.00014079144268242197, + "loss": 2.916, + "step": 41508 + }, + { + "epoch": 2.03, + "grad_norm": 0.676265299320221, + "learning_rate": 0.00014077839365461386, + "loss": 3.0468, + "step": 41509 + }, + { + "epoch": 2.03, + "grad_norm": 0.6526424288749695, + "learning_rate": 0.00014076534504616484, + "loss": 2.9184, + "step": 41510 + }, + { + "epoch": 2.03, + "grad_norm": 0.695165753364563, + "learning_rate": 0.00014075229685710935, + "loss": 3.1055, + "step": 41511 + }, + { + "epoch": 2.03, + "grad_norm": 0.6782224178314209, + "learning_rate": 0.00014073924908748154, + "loss": 2.893, + "step": 41512 + }, + { + "epoch": 2.03, + "grad_norm": 0.6610283255577087, + "learning_rate": 0.00014072620173731604, + "loss": 3.0361, + "step": 41513 + }, + { + "epoch": 2.03, + "grad_norm": 0.6884645223617554, + "learning_rate": 0.00014071315480664704, + "loss": 3.005, + "step": 41514 + }, + { + "epoch": 2.03, + "grad_norm": 0.7342187166213989, + "learning_rate": 0.00014070010829550885, + "loss": 2.9996, + "step": 41515 + }, + { + "epoch": 2.03, + "grad_norm": 0.6680518388748169, + "learning_rate": 0.000140687062203936, + "loss": 3.1241, + "step": 41516 + }, + { + "epoch": 2.03, + "grad_norm": 0.6695300936698914, + "learning_rate": 0.00014067401653196268, + "loss": 3.0162, + "step": 41517 + }, + { + "epoch": 2.03, + "grad_norm": 0.6923040747642517, + "learning_rate": 0.00014066097127962346, + "loss": 2.6873, + "step": 41518 + }, + { + "epoch": 2.03, + "grad_norm": 0.7008634209632874, + "learning_rate": 0.00014064792644695242, + "loss": 2.9065, + "step": 41519 + }, + { + "epoch": 2.03, + "grad_norm": 0.6829542517662048, + "learning_rate": 0.0001406348820339841, + "loss": 2.8814, + "step": 41520 + }, + { + "epoch": 2.03, + "grad_norm": 0.6533739566802979, + "learning_rate": 0.00014062183804075307, + "loss": 2.969, + "step": 41521 + }, + { + "epoch": 2.03, + "grad_norm": 0.6787754893302917, + "learning_rate": 0.0001406087944672932, + "loss": 2.8614, + "step": 41522 + }, + { + "epoch": 2.03, + "grad_norm": 0.6453655958175659, + "learning_rate": 0.00014059575131363918, + "loss": 2.9535, + "step": 41523 + }, + { + "epoch": 2.03, + "grad_norm": 0.733506441116333, + "learning_rate": 0.00014058270857982523, + "loss": 2.9048, + "step": 41524 + }, + { + "epoch": 2.04, + "grad_norm": 0.6687877774238586, + "learning_rate": 0.00014056966626588578, + "loss": 2.7681, + "step": 41525 + }, + { + "epoch": 2.04, + "grad_norm": 0.6705677509307861, + "learning_rate": 0.0001405566243718551, + "loss": 3.2083, + "step": 41526 + }, + { + "epoch": 2.04, + "grad_norm": 0.6572078466415405, + "learning_rate": 0.00014054358289776754, + "loss": 2.8929, + "step": 41527 + }, + { + "epoch": 2.04, + "grad_norm": 0.6767847537994385, + "learning_rate": 0.0001405305418436576, + "loss": 2.8771, + "step": 41528 + }, + { + "epoch": 2.04, + "grad_norm": 0.6408040523529053, + "learning_rate": 0.00014051750120955956, + "loss": 2.841, + "step": 41529 + }, + { + "epoch": 2.04, + "grad_norm": 0.6896417140960693, + "learning_rate": 0.00014050446099550773, + "loss": 2.8297, + "step": 41530 + }, + { + "epoch": 2.04, + "grad_norm": 0.7329106330871582, + "learning_rate": 0.00014049142120153632, + "loss": 3.0008, + "step": 41531 + }, + { + "epoch": 2.04, + "grad_norm": 0.6867200136184692, + "learning_rate": 0.0001404783818276798, + "loss": 2.6561, + "step": 41532 + }, + { + "epoch": 2.04, + "grad_norm": 0.6792352199554443, + "learning_rate": 0.00014046534287397264, + "loss": 3.0706, + "step": 41533 + }, + { + "epoch": 2.04, + "grad_norm": 0.7385411858558655, + "learning_rate": 0.00014045230434044895, + "loss": 2.5935, + "step": 41534 + }, + { + "epoch": 2.04, + "grad_norm": 0.662626326084137, + "learning_rate": 0.00014043926622714332, + "loss": 2.9917, + "step": 41535 + }, + { + "epoch": 2.04, + "grad_norm": 0.734079897403717, + "learning_rate": 0.00014042622853408993, + "loss": 3.1046, + "step": 41536 + }, + { + "epoch": 2.04, + "grad_norm": 0.646392822265625, + "learning_rate": 0.0001404131912613231, + "loss": 2.8517, + "step": 41537 + }, + { + "epoch": 2.04, + "grad_norm": 0.6550691723823547, + "learning_rate": 0.00014040015440887723, + "loss": 3.0403, + "step": 41538 + }, + { + "epoch": 2.04, + "grad_norm": 0.7334123849868774, + "learning_rate": 0.00014038711797678658, + "loss": 2.8179, + "step": 41539 + }, + { + "epoch": 2.04, + "grad_norm": 0.6451386213302612, + "learning_rate": 0.00014037408196508562, + "loss": 2.9276, + "step": 41540 + }, + { + "epoch": 2.04, + "grad_norm": 0.6706496477127075, + "learning_rate": 0.00014036104637380853, + "loss": 2.8136, + "step": 41541 + }, + { + "epoch": 2.04, + "grad_norm": 0.6933931112289429, + "learning_rate": 0.00014034801120298967, + "loss": 3.1359, + "step": 41542 + }, + { + "epoch": 2.04, + "grad_norm": 0.6907314658164978, + "learning_rate": 0.00014033497645266356, + "loss": 2.8287, + "step": 41543 + }, + { + "epoch": 2.04, + "grad_norm": 0.6742256879806519, + "learning_rate": 0.00014032194212286442, + "loss": 2.9517, + "step": 41544 + }, + { + "epoch": 2.04, + "grad_norm": 0.6563650369644165, + "learning_rate": 0.00014030890821362655, + "loss": 2.9116, + "step": 41545 + }, + { + "epoch": 2.04, + "grad_norm": 0.665621280670166, + "learning_rate": 0.00014029587472498415, + "loss": 2.9368, + "step": 41546 + }, + { + "epoch": 2.04, + "grad_norm": 0.6880399584770203, + "learning_rate": 0.00014028284165697167, + "loss": 2.9152, + "step": 41547 + }, + { + "epoch": 2.04, + "grad_norm": 0.7508476376533508, + "learning_rate": 0.00014026980900962358, + "loss": 3.043, + "step": 41548 + }, + { + "epoch": 2.04, + "grad_norm": 0.7052724957466125, + "learning_rate": 0.00014025677678297394, + "loss": 3.0261, + "step": 41549 + }, + { + "epoch": 2.04, + "grad_norm": 0.698532223701477, + "learning_rate": 0.00014024374497705736, + "loss": 3.0202, + "step": 41550 + }, + { + "epoch": 2.04, + "grad_norm": 0.6969287395477295, + "learning_rate": 0.00014023071359190795, + "loss": 2.8709, + "step": 41551 + }, + { + "epoch": 2.04, + "grad_norm": 0.7008760571479797, + "learning_rate": 0.00014021768262756002, + "loss": 2.9338, + "step": 41552 + }, + { + "epoch": 2.04, + "grad_norm": 0.6544531583786011, + "learning_rate": 0.00014020465208404808, + "loss": 2.8513, + "step": 41553 + }, + { + "epoch": 2.04, + "grad_norm": 0.659297525882721, + "learning_rate": 0.00014019162196140622, + "loss": 2.8528, + "step": 41554 + }, + { + "epoch": 2.04, + "grad_norm": 0.6890259385108948, + "learning_rate": 0.00014017859225966896, + "loss": 2.9265, + "step": 41555 + }, + { + "epoch": 2.04, + "grad_norm": 0.6693836450576782, + "learning_rate": 0.00014016556297887043, + "loss": 2.9833, + "step": 41556 + }, + { + "epoch": 2.04, + "grad_norm": 0.7449944615364075, + "learning_rate": 0.00014015253411904519, + "loss": 2.8482, + "step": 41557 + }, + { + "epoch": 2.04, + "grad_norm": 0.6576284766197205, + "learning_rate": 0.00014013950568022735, + "loss": 2.9817, + "step": 41558 + }, + { + "epoch": 2.04, + "grad_norm": 0.6823240518569946, + "learning_rate": 0.0001401264776624512, + "loss": 2.7373, + "step": 41559 + }, + { + "epoch": 2.04, + "grad_norm": 0.6630463600158691, + "learning_rate": 0.00014011345006575127, + "loss": 2.9244, + "step": 41560 + }, + { + "epoch": 2.04, + "grad_norm": 0.6817547082901001, + "learning_rate": 0.00014010042289016163, + "loss": 3.0798, + "step": 41561 + }, + { + "epoch": 2.04, + "grad_norm": 0.697632908821106, + "learning_rate": 0.00014008739613571668, + "loss": 2.9706, + "step": 41562 + }, + { + "epoch": 2.04, + "grad_norm": 0.6645799279212952, + "learning_rate": 0.00014007436980245088, + "loss": 3.0473, + "step": 41563 + }, + { + "epoch": 2.04, + "grad_norm": 0.7861841320991516, + "learning_rate": 0.00014006134389039828, + "loss": 2.9287, + "step": 41564 + }, + { + "epoch": 2.04, + "grad_norm": 0.7191126942634583, + "learning_rate": 0.00014004831839959346, + "loss": 2.8517, + "step": 41565 + }, + { + "epoch": 2.04, + "grad_norm": 0.6533079743385315, + "learning_rate": 0.0001400352933300706, + "loss": 2.7748, + "step": 41566 + }, + { + "epoch": 2.04, + "grad_norm": 0.6802554130554199, + "learning_rate": 0.00014002226868186382, + "loss": 2.985, + "step": 41567 + }, + { + "epoch": 2.04, + "grad_norm": 0.6917102932929993, + "learning_rate": 0.00014000924445500774, + "loss": 2.9208, + "step": 41568 + }, + { + "epoch": 2.04, + "grad_norm": 0.6812888383865356, + "learning_rate": 0.0001399962206495364, + "loss": 3.2155, + "step": 41569 + }, + { + "epoch": 2.04, + "grad_norm": 0.6770250797271729, + "learning_rate": 0.00013998319726548435, + "loss": 2.7714, + "step": 41570 + }, + { + "epoch": 2.04, + "grad_norm": 0.6774451732635498, + "learning_rate": 0.00013997017430288566, + "loss": 2.9362, + "step": 41571 + }, + { + "epoch": 2.04, + "grad_norm": 0.6566124558448792, + "learning_rate": 0.00013995715176177482, + "loss": 2.9055, + "step": 41572 + }, + { + "epoch": 2.04, + "grad_norm": 0.6735152006149292, + "learning_rate": 0.00013994412964218604, + "loss": 2.8136, + "step": 41573 + }, + { + "epoch": 2.04, + "grad_norm": 0.6460737586021423, + "learning_rate": 0.00013993110794415352, + "loss": 2.9369, + "step": 41574 + }, + { + "epoch": 2.04, + "grad_norm": 0.6937167644500732, + "learning_rate": 0.00013991808666771177, + "loss": 2.94, + "step": 41575 + }, + { + "epoch": 2.04, + "grad_norm": 0.6455183625221252, + "learning_rate": 0.00013990506581289483, + "loss": 2.7323, + "step": 41576 + }, + { + "epoch": 2.04, + "grad_norm": 0.6627008318901062, + "learning_rate": 0.00013989204537973715, + "loss": 2.8377, + "step": 41577 + }, + { + "epoch": 2.04, + "grad_norm": 0.6939973831176758, + "learning_rate": 0.00013987902536827314, + "loss": 2.9573, + "step": 41578 + }, + { + "epoch": 2.04, + "grad_norm": 0.7010552883148193, + "learning_rate": 0.00013986600577853696, + "loss": 2.984, + "step": 41579 + }, + { + "epoch": 2.04, + "grad_norm": 0.6695321798324585, + "learning_rate": 0.0001398529866105629, + "loss": 2.967, + "step": 41580 + }, + { + "epoch": 2.04, + "grad_norm": 0.7119990587234497, + "learning_rate": 0.0001398399678643851, + "loss": 2.8192, + "step": 41581 + }, + { + "epoch": 2.04, + "grad_norm": 0.6729029417037964, + "learning_rate": 0.00013982694954003805, + "loss": 2.8459, + "step": 41582 + }, + { + "epoch": 2.04, + "grad_norm": 0.6772317290306091, + "learning_rate": 0.0001398139316375561, + "loss": 2.9779, + "step": 41583 + }, + { + "epoch": 2.04, + "grad_norm": 0.6854050159454346, + "learning_rate": 0.0001398009141569733, + "loss": 2.9556, + "step": 41584 + }, + { + "epoch": 2.04, + "grad_norm": 0.6610407829284668, + "learning_rate": 0.00013978789709832414, + "loss": 2.862, + "step": 41585 + }, + { + "epoch": 2.04, + "grad_norm": 0.7166022658348083, + "learning_rate": 0.00013977488046164276, + "loss": 3.1247, + "step": 41586 + }, + { + "epoch": 2.04, + "grad_norm": 0.673302948474884, + "learning_rate": 0.00013976186424696362, + "loss": 2.812, + "step": 41587 + }, + { + "epoch": 2.04, + "grad_norm": 0.6916917562484741, + "learning_rate": 0.00013974884845432088, + "loss": 3.0233, + "step": 41588 + }, + { + "epoch": 2.04, + "grad_norm": 0.6967342495918274, + "learning_rate": 0.0001397358330837487, + "loss": 2.9757, + "step": 41589 + }, + { + "epoch": 2.04, + "grad_norm": 0.741462767124176, + "learning_rate": 0.0001397228181352816, + "loss": 2.9999, + "step": 41590 + }, + { + "epoch": 2.04, + "grad_norm": 0.7056967616081238, + "learning_rate": 0.0001397098036089537, + "loss": 2.8734, + "step": 41591 + }, + { + "epoch": 2.04, + "grad_norm": 0.6610565781593323, + "learning_rate": 0.00013969678950479926, + "loss": 2.9993, + "step": 41592 + }, + { + "epoch": 2.04, + "grad_norm": 0.653553307056427, + "learning_rate": 0.00013968377582285278, + "loss": 2.873, + "step": 41593 + }, + { + "epoch": 2.04, + "grad_norm": 0.724814772605896, + "learning_rate": 0.0001396707625631484, + "loss": 2.916, + "step": 41594 + }, + { + "epoch": 2.04, + "grad_norm": 0.6634653210639954, + "learning_rate": 0.00013965774972572032, + "loss": 2.9582, + "step": 41595 + }, + { + "epoch": 2.04, + "grad_norm": 0.6641644835472107, + "learning_rate": 0.00013964473731060276, + "loss": 3.0106, + "step": 41596 + }, + { + "epoch": 2.04, + "grad_norm": 0.6954382061958313, + "learning_rate": 0.00013963172531783013, + "loss": 3.1128, + "step": 41597 + }, + { + "epoch": 2.04, + "grad_norm": 0.6707383394241333, + "learning_rate": 0.00013961871374743677, + "loss": 2.7836, + "step": 41598 + }, + { + "epoch": 2.04, + "grad_norm": 0.6706055402755737, + "learning_rate": 0.00013960570259945675, + "loss": 2.9707, + "step": 41599 + }, + { + "epoch": 2.04, + "grad_norm": 0.6660712361335754, + "learning_rate": 0.00013959269187392457, + "loss": 2.9676, + "step": 41600 + }, + { + "epoch": 2.04, + "grad_norm": 0.7226710319519043, + "learning_rate": 0.00013957968157087434, + "loss": 3.0102, + "step": 41601 + }, + { + "epoch": 2.04, + "grad_norm": 0.70147705078125, + "learning_rate": 0.00013956667169034021, + "loss": 3.0778, + "step": 41602 + }, + { + "epoch": 2.04, + "grad_norm": 0.7111939787864685, + "learning_rate": 0.00013955366223235679, + "loss": 2.958, + "step": 41603 + }, + { + "epoch": 2.04, + "grad_norm": 0.6577525734901428, + "learning_rate": 0.00013954065319695796, + "loss": 3.0126, + "step": 41604 + }, + { + "epoch": 2.04, + "grad_norm": 0.6963541507720947, + "learning_rate": 0.00013952764458417832, + "loss": 2.9144, + "step": 41605 + }, + { + "epoch": 2.04, + "grad_norm": 0.6417427659034729, + "learning_rate": 0.00013951463639405186, + "loss": 2.8289, + "step": 41606 + }, + { + "epoch": 2.04, + "grad_norm": 0.6816616654396057, + "learning_rate": 0.00013950162862661305, + "loss": 2.8721, + "step": 41607 + }, + { + "epoch": 2.04, + "grad_norm": 0.7183780670166016, + "learning_rate": 0.00013948862128189597, + "loss": 2.8842, + "step": 41608 + }, + { + "epoch": 2.04, + "grad_norm": 0.6773726940155029, + "learning_rate": 0.00013947561435993507, + "loss": 2.8267, + "step": 41609 + }, + { + "epoch": 2.04, + "grad_norm": 0.715185821056366, + "learning_rate": 0.00013946260786076452, + "loss": 2.8606, + "step": 41610 + }, + { + "epoch": 2.04, + "grad_norm": 0.6540136337280273, + "learning_rate": 0.00013944960178441848, + "loss": 3.098, + "step": 41611 + }, + { + "epoch": 2.04, + "grad_norm": 0.6640399098396301, + "learning_rate": 0.00013943659613093137, + "loss": 2.9417, + "step": 41612 + }, + { + "epoch": 2.04, + "grad_norm": 0.6632117629051208, + "learning_rate": 0.00013942359090033724, + "loss": 2.8215, + "step": 41613 + }, + { + "epoch": 2.04, + "grad_norm": 0.7612461447715759, + "learning_rate": 0.00013941058609267048, + "loss": 2.7336, + "step": 41614 + }, + { + "epoch": 2.04, + "grad_norm": 0.6554035544395447, + "learning_rate": 0.00013939758170796547, + "loss": 2.7637, + "step": 41615 + }, + { + "epoch": 2.04, + "grad_norm": 0.6623538732528687, + "learning_rate": 0.0001393845777462563, + "loss": 3.0458, + "step": 41616 + }, + { + "epoch": 2.04, + "grad_norm": 0.6795579195022583, + "learning_rate": 0.00013937157420757722, + "loss": 2.8377, + "step": 41617 + }, + { + "epoch": 2.04, + "grad_norm": 0.6572181582450867, + "learning_rate": 0.0001393585710919624, + "loss": 2.7483, + "step": 41618 + }, + { + "epoch": 2.04, + "grad_norm": 0.6754372715950012, + "learning_rate": 0.00013934556839944622, + "loss": 2.8756, + "step": 41619 + }, + { + "epoch": 2.04, + "grad_norm": 0.6733306646347046, + "learning_rate": 0.00013933256613006298, + "loss": 2.8464, + "step": 41620 + }, + { + "epoch": 2.04, + "grad_norm": 0.6594640016555786, + "learning_rate": 0.0001393195642838467, + "loss": 2.9303, + "step": 41621 + }, + { + "epoch": 2.04, + "grad_norm": 0.6582934260368347, + "learning_rate": 0.00013930656286083193, + "loss": 2.913, + "step": 41622 + }, + { + "epoch": 2.04, + "grad_norm": 0.6640312671661377, + "learning_rate": 0.00013929356186105272, + "loss": 2.8359, + "step": 41623 + }, + { + "epoch": 2.04, + "grad_norm": 0.643187403678894, + "learning_rate": 0.0001392805612845432, + "loss": 3.2914, + "step": 41624 + }, + { + "epoch": 2.04, + "grad_norm": 0.7015854120254517, + "learning_rate": 0.00013926756113133792, + "loss": 3.1108, + "step": 41625 + }, + { + "epoch": 2.04, + "grad_norm": 0.6656563878059387, + "learning_rate": 0.0001392545614014708, + "loss": 2.9735, + "step": 41626 + }, + { + "epoch": 2.04, + "grad_norm": 0.6701093316078186, + "learning_rate": 0.00013924156209497638, + "loss": 2.7599, + "step": 41627 + }, + { + "epoch": 2.04, + "grad_norm": 0.6650725603103638, + "learning_rate": 0.0001392285632118886, + "loss": 2.8968, + "step": 41628 + }, + { + "epoch": 2.04, + "grad_norm": 0.6751466989517212, + "learning_rate": 0.00013921556475224184, + "loss": 2.9247, + "step": 41629 + }, + { + "epoch": 2.04, + "grad_norm": 0.6479591727256775, + "learning_rate": 0.00013920256671607047, + "loss": 3.0724, + "step": 41630 + }, + { + "epoch": 2.04, + "grad_norm": 0.7332056164741516, + "learning_rate": 0.00013918956910340856, + "loss": 2.8461, + "step": 41631 + }, + { + "epoch": 2.04, + "grad_norm": 0.690640926361084, + "learning_rate": 0.00013917657191429042, + "loss": 2.9417, + "step": 41632 + }, + { + "epoch": 2.04, + "grad_norm": 0.6897100806236267, + "learning_rate": 0.0001391635751487501, + "loss": 2.9565, + "step": 41633 + }, + { + "epoch": 2.04, + "grad_norm": 0.6536319851875305, + "learning_rate": 0.000139150578806822, + "loss": 2.6714, + "step": 41634 + }, + { + "epoch": 2.04, + "grad_norm": 0.8222829699516296, + "learning_rate": 0.0001391375828885404, + "loss": 2.9552, + "step": 41635 + }, + { + "epoch": 2.04, + "grad_norm": 0.7058513164520264, + "learning_rate": 0.00013912458739393933, + "loss": 2.6434, + "step": 41636 + }, + { + "epoch": 2.04, + "grad_norm": 0.6645085215568542, + "learning_rate": 0.00013911159232305327, + "loss": 3.0102, + "step": 41637 + }, + { + "epoch": 2.04, + "grad_norm": 0.6806157231330872, + "learning_rate": 0.0001390985976759163, + "loss": 2.9867, + "step": 41638 + }, + { + "epoch": 2.04, + "grad_norm": 0.7031665444374084, + "learning_rate": 0.00013908560345256255, + "loss": 3.0195, + "step": 41639 + }, + { + "epoch": 2.04, + "grad_norm": 0.6819475889205933, + "learning_rate": 0.00013907260965302648, + "loss": 3.0876, + "step": 41640 + }, + { + "epoch": 2.04, + "grad_norm": 0.6787337064743042, + "learning_rate": 0.00013905961627734204, + "loss": 2.7164, + "step": 41641 + }, + { + "epoch": 2.04, + "grad_norm": 0.7834779024124146, + "learning_rate": 0.00013904662332554372, + "loss": 2.7447, + "step": 41642 + }, + { + "epoch": 2.04, + "grad_norm": 0.6589922904968262, + "learning_rate": 0.0001390336307976655, + "loss": 2.8369, + "step": 41643 + }, + { + "epoch": 2.04, + "grad_norm": 0.6925103664398193, + "learning_rate": 0.00013902063869374175, + "loss": 2.7699, + "step": 41644 + }, + { + "epoch": 2.04, + "grad_norm": 0.6515282392501831, + "learning_rate": 0.00013900764701380675, + "loss": 2.9382, + "step": 41645 + }, + { + "epoch": 2.04, + "grad_norm": 0.7050473690032959, + "learning_rate": 0.0001389946557578946, + "loss": 2.8169, + "step": 41646 + }, + { + "epoch": 2.04, + "grad_norm": 0.7375444769859314, + "learning_rate": 0.00013898166492603957, + "loss": 2.8523, + "step": 41647 + }, + { + "epoch": 2.04, + "grad_norm": 0.8539415597915649, + "learning_rate": 0.00013896867451827576, + "loss": 2.9165, + "step": 41648 + }, + { + "epoch": 2.04, + "grad_norm": 0.7187871336936951, + "learning_rate": 0.0001389556845346374, + "loss": 2.6562, + "step": 41649 + }, + { + "epoch": 2.04, + "grad_norm": 0.6538518071174622, + "learning_rate": 0.00013894269497515894, + "loss": 2.9595, + "step": 41650 + }, + { + "epoch": 2.04, + "grad_norm": 0.6808871030807495, + "learning_rate": 0.00013892970583987427, + "loss": 2.7846, + "step": 41651 + }, + { + "epoch": 2.04, + "grad_norm": 0.6848717927932739, + "learning_rate": 0.0001389167171288179, + "loss": 2.8772, + "step": 41652 + }, + { + "epoch": 2.04, + "grad_norm": 0.6991068720817566, + "learning_rate": 0.0001389037288420239, + "loss": 3.1921, + "step": 41653 + }, + { + "epoch": 2.04, + "grad_norm": 0.701252818107605, + "learning_rate": 0.00013889074097952637, + "loss": 2.9685, + "step": 41654 + }, + { + "epoch": 2.04, + "grad_norm": 0.6782150864601135, + "learning_rate": 0.00013887775354135971, + "loss": 2.807, + "step": 41655 + }, + { + "epoch": 2.04, + "grad_norm": 0.6534834504127502, + "learning_rate": 0.00013886476652755794, + "loss": 2.9893, + "step": 41656 + }, + { + "epoch": 2.04, + "grad_norm": 0.6911651492118835, + "learning_rate": 0.00013885177993815548, + "loss": 2.8997, + "step": 41657 + }, + { + "epoch": 2.04, + "grad_norm": 0.6815614700317383, + "learning_rate": 0.00013883879377318627, + "loss": 2.8842, + "step": 41658 + }, + { + "epoch": 2.04, + "grad_norm": 0.681822657585144, + "learning_rate": 0.00013882580803268482, + "loss": 2.9365, + "step": 41659 + }, + { + "epoch": 2.04, + "grad_norm": 0.6272294521331787, + "learning_rate": 0.00013881282271668515, + "loss": 2.9309, + "step": 41660 + }, + { + "epoch": 2.04, + "grad_norm": 0.7433640360832214, + "learning_rate": 0.0001387998378252214, + "loss": 2.9243, + "step": 41661 + }, + { + "epoch": 2.04, + "grad_norm": 0.6774015426635742, + "learning_rate": 0.00013878685335832789, + "loss": 3.0579, + "step": 41662 + }, + { + "epoch": 2.04, + "grad_norm": 0.6565629839897156, + "learning_rate": 0.00013877386931603874, + "loss": 3.0814, + "step": 41663 + }, + { + "epoch": 2.04, + "grad_norm": 0.6806601285934448, + "learning_rate": 0.00013876088569838815, + "loss": 2.9186, + "step": 41664 + }, + { + "epoch": 2.04, + "grad_norm": 0.6668409109115601, + "learning_rate": 0.0001387479025054105, + "loss": 2.8923, + "step": 41665 + }, + { + "epoch": 2.04, + "grad_norm": 0.6895928978919983, + "learning_rate": 0.0001387349197371397, + "loss": 3.0788, + "step": 41666 + }, + { + "epoch": 2.04, + "grad_norm": 0.634085476398468, + "learning_rate": 0.00013872193739361023, + "loss": 2.8866, + "step": 41667 + }, + { + "epoch": 2.04, + "grad_norm": 0.704136073589325, + "learning_rate": 0.0001387089554748561, + "loss": 3.0615, + "step": 41668 + }, + { + "epoch": 2.04, + "grad_norm": 0.6691809296607971, + "learning_rate": 0.00013869597398091148, + "loss": 3.0299, + "step": 41669 + }, + { + "epoch": 2.04, + "grad_norm": 0.6336991190910339, + "learning_rate": 0.00013868299291181068, + "loss": 2.7573, + "step": 41670 + }, + { + "epoch": 2.04, + "grad_norm": 0.698936939239502, + "learning_rate": 0.00013867001226758774, + "loss": 2.8835, + "step": 41671 + }, + { + "epoch": 2.04, + "grad_norm": 0.7142788767814636, + "learning_rate": 0.00013865703204827707, + "loss": 3.3196, + "step": 41672 + }, + { + "epoch": 2.04, + "grad_norm": 0.694664716720581, + "learning_rate": 0.0001386440522539126, + "loss": 2.9352, + "step": 41673 + }, + { + "epoch": 2.04, + "grad_norm": 0.666140079498291, + "learning_rate": 0.00013863107288452877, + "loss": 2.9518, + "step": 41674 + }, + { + "epoch": 2.04, + "grad_norm": 0.722091555595398, + "learning_rate": 0.00013861809394015963, + "loss": 2.8776, + "step": 41675 + }, + { + "epoch": 2.04, + "grad_norm": 0.6839096546173096, + "learning_rate": 0.00013860511542083923, + "loss": 3.0636, + "step": 41676 + }, + { + "epoch": 2.04, + "grad_norm": 0.666094183921814, + "learning_rate": 0.000138592137326602, + "loss": 2.9686, + "step": 41677 + }, + { + "epoch": 2.04, + "grad_norm": 0.696448802947998, + "learning_rate": 0.00013857915965748195, + "loss": 3.0072, + "step": 41678 + }, + { + "epoch": 2.04, + "grad_norm": 0.7537996172904968, + "learning_rate": 0.00013856618241351332, + "loss": 2.8667, + "step": 41679 + }, + { + "epoch": 2.04, + "grad_norm": 0.6860277652740479, + "learning_rate": 0.00013855320559473037, + "loss": 2.8746, + "step": 41680 + }, + { + "epoch": 2.04, + "grad_norm": 0.6779218316078186, + "learning_rate": 0.00013854022920116725, + "loss": 2.9084, + "step": 41681 + }, + { + "epoch": 2.04, + "grad_norm": 0.680435836315155, + "learning_rate": 0.00013852725323285807, + "loss": 2.9952, + "step": 41682 + }, + { + "epoch": 2.04, + "grad_norm": 0.6816911697387695, + "learning_rate": 0.00013851427768983693, + "loss": 2.7457, + "step": 41683 + }, + { + "epoch": 2.04, + "grad_norm": 0.7383108735084534, + "learning_rate": 0.0001385013025721382, + "loss": 3.1948, + "step": 41684 + }, + { + "epoch": 2.04, + "grad_norm": 0.6533733606338501, + "learning_rate": 0.00013848832787979586, + "loss": 2.7713, + "step": 41685 + }, + { + "epoch": 2.04, + "grad_norm": 0.6958843469619751, + "learning_rate": 0.00013847535361284418, + "loss": 3.1565, + "step": 41686 + }, + { + "epoch": 2.04, + "grad_norm": 0.6978946924209595, + "learning_rate": 0.00013846237977131742, + "loss": 2.8521, + "step": 41687 + }, + { + "epoch": 2.04, + "grad_norm": 0.6730976104736328, + "learning_rate": 0.0001384494063552496, + "loss": 3.025, + "step": 41688 + }, + { + "epoch": 2.04, + "grad_norm": 0.6826785802841187, + "learning_rate": 0.000138436433364675, + "loss": 2.9969, + "step": 41689 + }, + { + "epoch": 2.04, + "grad_norm": 0.7159188389778137, + "learning_rate": 0.0001384234607996278, + "loss": 2.9785, + "step": 41690 + }, + { + "epoch": 2.04, + "grad_norm": 0.6930112838745117, + "learning_rate": 0.00013841048866014197, + "loss": 2.9677, + "step": 41691 + }, + { + "epoch": 2.04, + "grad_norm": 0.6859667897224426, + "learning_rate": 0.00013839751694625196, + "loss": 2.9933, + "step": 41692 + }, + { + "epoch": 2.04, + "grad_norm": 0.7143214344978333, + "learning_rate": 0.00013838454565799163, + "loss": 2.9476, + "step": 41693 + }, + { + "epoch": 2.04, + "grad_norm": 0.7018111944198608, + "learning_rate": 0.00013837157479539542, + "loss": 2.7747, + "step": 41694 + }, + { + "epoch": 2.04, + "grad_norm": 0.6992251873016357, + "learning_rate": 0.00013835860435849727, + "loss": 3.0552, + "step": 41695 + }, + { + "epoch": 2.04, + "grad_norm": 0.6739795207977295, + "learning_rate": 0.00013834563434733156, + "loss": 2.8918, + "step": 41696 + }, + { + "epoch": 2.04, + "grad_norm": 0.7193833589553833, + "learning_rate": 0.00013833266476193237, + "loss": 3.0882, + "step": 41697 + }, + { + "epoch": 2.04, + "grad_norm": 0.6857161521911621, + "learning_rate": 0.0001383196956023337, + "loss": 3.0136, + "step": 41698 + }, + { + "epoch": 2.04, + "grad_norm": 0.73293137550354, + "learning_rate": 0.0001383067268685699, + "loss": 2.7954, + "step": 41699 + }, + { + "epoch": 2.04, + "grad_norm": 0.6800030469894409, + "learning_rate": 0.000138293758560675, + "loss": 3.0618, + "step": 41700 + }, + { + "epoch": 2.04, + "grad_norm": 0.6845666766166687, + "learning_rate": 0.0001382807906786832, + "loss": 3.0205, + "step": 41701 + }, + { + "epoch": 2.04, + "grad_norm": 0.6896492838859558, + "learning_rate": 0.00013826782322262881, + "loss": 2.8907, + "step": 41702 + }, + { + "epoch": 2.04, + "grad_norm": 0.6447883248329163, + "learning_rate": 0.00013825485619254582, + "loss": 2.8803, + "step": 41703 + }, + { + "epoch": 2.04, + "grad_norm": 0.6869139671325684, + "learning_rate": 0.00013824188958846845, + "loss": 3.2243, + "step": 41704 + }, + { + "epoch": 2.04, + "grad_norm": 0.6388272047042847, + "learning_rate": 0.00013822892341043067, + "loss": 2.7329, + "step": 41705 + }, + { + "epoch": 2.04, + "grad_norm": 0.6573567986488342, + "learning_rate": 0.00013821595765846676, + "loss": 2.9315, + "step": 41706 + }, + { + "epoch": 2.04, + "grad_norm": 0.6946920156478882, + "learning_rate": 0.00013820299233261102, + "loss": 2.8361, + "step": 41707 + }, + { + "epoch": 2.04, + "grad_norm": 0.7050734758377075, + "learning_rate": 0.00013819002743289732, + "loss": 2.8692, + "step": 41708 + }, + { + "epoch": 2.04, + "grad_norm": 0.7093161940574646, + "learning_rate": 0.00013817706295936007, + "loss": 2.8641, + "step": 41709 + }, + { + "epoch": 2.04, + "grad_norm": 0.6960554122924805, + "learning_rate": 0.0001381640989120332, + "loss": 2.9397, + "step": 41710 + }, + { + "epoch": 2.04, + "grad_norm": 0.6445857882499695, + "learning_rate": 0.00013815113529095104, + "loss": 2.9978, + "step": 41711 + }, + { + "epoch": 2.04, + "grad_norm": 0.7119023203849792, + "learning_rate": 0.00013813817209614766, + "loss": 2.9848, + "step": 41712 + }, + { + "epoch": 2.04, + "grad_norm": 0.6626527309417725, + "learning_rate": 0.00013812520932765702, + "loss": 2.8887, + "step": 41713 + }, + { + "epoch": 2.04, + "grad_norm": 0.6804422736167908, + "learning_rate": 0.00013811224698551358, + "loss": 3.0319, + "step": 41714 + }, + { + "epoch": 2.04, + "grad_norm": 0.7170805335044861, + "learning_rate": 0.0001380992850697512, + "loss": 2.9286, + "step": 41715 + }, + { + "epoch": 2.04, + "grad_norm": 0.6595010161399841, + "learning_rate": 0.0001380863235804042, + "loss": 2.9433, + "step": 41716 + }, + { + "epoch": 2.04, + "grad_norm": 0.6733608245849609, + "learning_rate": 0.0001380733625175067, + "loss": 2.8411, + "step": 41717 + }, + { + "epoch": 2.04, + "grad_norm": 0.8234678506851196, + "learning_rate": 0.00013806040188109282, + "loss": 2.8652, + "step": 41718 + }, + { + "epoch": 2.04, + "grad_norm": 0.7054716348648071, + "learning_rate": 0.00013804744167119666, + "loss": 2.7808, + "step": 41719 + }, + { + "epoch": 2.04, + "grad_norm": 0.7219303250312805, + "learning_rate": 0.00013803448188785228, + "loss": 2.8326, + "step": 41720 + }, + { + "epoch": 2.04, + "grad_norm": 0.6587951183319092, + "learning_rate": 0.0001380215225310939, + "loss": 2.8626, + "step": 41721 + }, + { + "epoch": 2.04, + "grad_norm": 0.6685791611671448, + "learning_rate": 0.0001380085636009558, + "loss": 2.9384, + "step": 41722 + }, + { + "epoch": 2.04, + "grad_norm": 0.7046728134155273, + "learning_rate": 0.00013799560509747185, + "loss": 2.9112, + "step": 41723 + }, + { + "epoch": 2.04, + "grad_norm": 1.1903138160705566, + "learning_rate": 0.00013798264702067636, + "loss": 2.7223, + "step": 41724 + }, + { + "epoch": 2.04, + "grad_norm": 0.6698146462440491, + "learning_rate": 0.00013796968937060345, + "loss": 3.0392, + "step": 41725 + }, + { + "epoch": 2.04, + "grad_norm": 0.6656641960144043, + "learning_rate": 0.00013795673214728703, + "loss": 3.1732, + "step": 41726 + }, + { + "epoch": 2.04, + "grad_norm": 0.6809387803077698, + "learning_rate": 0.00013794377535076153, + "loss": 3.057, + "step": 41727 + }, + { + "epoch": 2.04, + "grad_norm": 0.7437176704406738, + "learning_rate": 0.00013793081898106081, + "loss": 3.0012, + "step": 41728 + }, + { + "epoch": 2.05, + "grad_norm": 0.6619541645050049, + "learning_rate": 0.0001379178630382193, + "loss": 3.0339, + "step": 41729 + }, + { + "epoch": 2.05, + "grad_norm": 0.6481929421424866, + "learning_rate": 0.00013790490752227074, + "loss": 2.806, + "step": 41730 + }, + { + "epoch": 2.05, + "grad_norm": 0.6587005853652954, + "learning_rate": 0.00013789195243324952, + "loss": 3.1365, + "step": 41731 + }, + { + "epoch": 2.05, + "grad_norm": 0.6814006567001343, + "learning_rate": 0.00013787899777118978, + "loss": 2.8502, + "step": 41732 + }, + { + "epoch": 2.05, + "grad_norm": 0.6785389184951782, + "learning_rate": 0.00013786604353612558, + "loss": 2.901, + "step": 41733 + }, + { + "epoch": 2.05, + "grad_norm": 0.6514621376991272, + "learning_rate": 0.00013785308972809102, + "loss": 2.7796, + "step": 41734 + }, + { + "epoch": 2.05, + "grad_norm": 0.7052819728851318, + "learning_rate": 0.00013784013634712009, + "loss": 2.825, + "step": 41735 + }, + { + "epoch": 2.05, + "grad_norm": 0.6751008629798889, + "learning_rate": 0.00013782718339324702, + "loss": 2.9968, + "step": 41736 + }, + { + "epoch": 2.05, + "grad_norm": 0.6512966752052307, + "learning_rate": 0.00013781423086650608, + "loss": 2.9316, + "step": 41737 + }, + { + "epoch": 2.05, + "grad_norm": 0.6523333787918091, + "learning_rate": 0.00013780127876693112, + "loss": 3.0541, + "step": 41738 + }, + { + "epoch": 2.05, + "grad_norm": 0.7050060033798218, + "learning_rate": 0.0001377883270945565, + "loss": 3.0663, + "step": 41739 + }, + { + "epoch": 2.05, + "grad_norm": 0.7077402472496033, + "learning_rate": 0.00013777537584941618, + "loss": 3.0436, + "step": 41740 + }, + { + "epoch": 2.05, + "grad_norm": 0.6588230133056641, + "learning_rate": 0.0001377624250315442, + "loss": 2.858, + "step": 41741 + }, + { + "epoch": 2.05, + "grad_norm": 0.708586573600769, + "learning_rate": 0.00013774947464097488, + "loss": 2.8115, + "step": 41742 + }, + { + "epoch": 2.05, + "grad_norm": 0.6874078512191772, + "learning_rate": 0.0001377365246777421, + "loss": 2.7809, + "step": 41743 + }, + { + "epoch": 2.05, + "grad_norm": 0.6789014935493469, + "learning_rate": 0.00013772357514188022, + "loss": 3.0268, + "step": 41744 + }, + { + "epoch": 2.05, + "grad_norm": 0.7179741263389587, + "learning_rate": 0.00013771062603342308, + "loss": 3.13, + "step": 41745 + }, + { + "epoch": 2.05, + "grad_norm": 0.6951627731323242, + "learning_rate": 0.00013769767735240493, + "loss": 3.0357, + "step": 41746 + }, + { + "epoch": 2.05, + "grad_norm": 0.6996464133262634, + "learning_rate": 0.00013768472909886007, + "loss": 2.9498, + "step": 41747 + }, + { + "epoch": 2.05, + "grad_norm": 0.6561077237129211, + "learning_rate": 0.0001376717812728222, + "loss": 2.8745, + "step": 41748 + }, + { + "epoch": 2.05, + "grad_norm": 0.736611008644104, + "learning_rate": 0.00013765883387432568, + "loss": 2.8959, + "step": 41749 + }, + { + "epoch": 2.05, + "grad_norm": 0.6801815629005432, + "learning_rate": 0.00013764588690340447, + "loss": 3.0878, + "step": 41750 + }, + { + "epoch": 2.05, + "grad_norm": 0.6809715032577515, + "learning_rate": 0.00013763294036009274, + "loss": 2.9666, + "step": 41751 + }, + { + "epoch": 2.05, + "grad_norm": 0.7071965336799622, + "learning_rate": 0.00013761999424442473, + "loss": 2.9907, + "step": 41752 + }, + { + "epoch": 2.05, + "grad_norm": 0.6758064031600952, + "learning_rate": 0.00013760704855643428, + "loss": 2.7936, + "step": 41753 + }, + { + "epoch": 2.05, + "grad_norm": 0.6896607875823975, + "learning_rate": 0.0001375941032961557, + "loss": 2.8758, + "step": 41754 + }, + { + "epoch": 2.05, + "grad_norm": 0.6973819136619568, + "learning_rate": 0.000137581158463623, + "loss": 2.994, + "step": 41755 + }, + { + "epoch": 2.05, + "grad_norm": 0.6735284924507141, + "learning_rate": 0.0001375682140588702, + "loss": 2.8614, + "step": 41756 + }, + { + "epoch": 2.05, + "grad_norm": 0.680656373500824, + "learning_rate": 0.00013755527008193154, + "loss": 2.9821, + "step": 41757 + }, + { + "epoch": 2.05, + "grad_norm": 0.6982008814811707, + "learning_rate": 0.00013754232653284093, + "loss": 2.8955, + "step": 41758 + }, + { + "epoch": 2.05, + "grad_norm": 0.683266282081604, + "learning_rate": 0.0001375293834116327, + "loss": 3.0358, + "step": 41759 + }, + { + "epoch": 2.05, + "grad_norm": 0.6826072335243225, + "learning_rate": 0.00013751644071834066, + "loss": 2.7453, + "step": 41760 + }, + { + "epoch": 2.05, + "grad_norm": 0.7050215005874634, + "learning_rate": 0.0001375034984529992, + "loss": 2.7758, + "step": 41761 + }, + { + "epoch": 2.05, + "grad_norm": 0.7060586214065552, + "learning_rate": 0.0001374905566156422, + "loss": 2.8637, + "step": 41762 + }, + { + "epoch": 2.05, + "grad_norm": 0.7489438056945801, + "learning_rate": 0.0001374776152063037, + "loss": 2.8327, + "step": 41763 + }, + { + "epoch": 2.05, + "grad_norm": 0.7340999841690063, + "learning_rate": 0.000137464674225018, + "loss": 2.963, + "step": 41764 + }, + { + "epoch": 2.05, + "grad_norm": 0.6897082328796387, + "learning_rate": 0.00013745173367181899, + "loss": 2.8568, + "step": 41765 + }, + { + "epoch": 2.05, + "grad_norm": 0.7091054320335388, + "learning_rate": 0.0001374387935467409, + "loss": 2.9136, + "step": 41766 + }, + { + "epoch": 2.05, + "grad_norm": 0.7176274657249451, + "learning_rate": 0.00013742585384981767, + "loss": 2.8918, + "step": 41767 + }, + { + "epoch": 2.05, + "grad_norm": 0.675040602684021, + "learning_rate": 0.00013741291458108342, + "loss": 2.8829, + "step": 41768 + }, + { + "epoch": 2.05, + "grad_norm": 0.6755308508872986, + "learning_rate": 0.00013739997574057237, + "loss": 3.0007, + "step": 41769 + }, + { + "epoch": 2.05, + "grad_norm": 0.698590874671936, + "learning_rate": 0.0001373870373283185, + "loss": 3.1754, + "step": 41770 + }, + { + "epoch": 2.05, + "grad_norm": 0.6927210688591003, + "learning_rate": 0.00013737409934435588, + "loss": 2.8816, + "step": 41771 + }, + { + "epoch": 2.05, + "grad_norm": 0.7321736812591553, + "learning_rate": 0.00013736116178871848, + "loss": 2.995, + "step": 41772 + }, + { + "epoch": 2.05, + "grad_norm": 0.7068862318992615, + "learning_rate": 0.0001373482246614405, + "loss": 2.895, + "step": 41773 + }, + { + "epoch": 2.05, + "grad_norm": 0.6535940170288086, + "learning_rate": 0.00013733528796255605, + "loss": 2.9816, + "step": 41774 + }, + { + "epoch": 2.05, + "grad_norm": 0.6542276740074158, + "learning_rate": 0.00013732235169209908, + "loss": 2.8163, + "step": 41775 + }, + { + "epoch": 2.05, + "grad_norm": 0.7153056859970093, + "learning_rate": 0.0001373094158501038, + "loss": 2.7931, + "step": 41776 + }, + { + "epoch": 2.05, + "grad_norm": 0.7129161357879639, + "learning_rate": 0.00013729648043660423, + "loss": 2.7878, + "step": 41777 + }, + { + "epoch": 2.05, + "grad_norm": 0.656928300857544, + "learning_rate": 0.0001372835454516343, + "loss": 2.8457, + "step": 41778 + }, + { + "epoch": 2.05, + "grad_norm": 0.733238935470581, + "learning_rate": 0.00013727061089522833, + "loss": 2.9476, + "step": 41779 + }, + { + "epoch": 2.05, + "grad_norm": 0.6839013695716858, + "learning_rate": 0.0001372576767674201, + "loss": 2.9249, + "step": 41780 + }, + { + "epoch": 2.05, + "grad_norm": 0.7424287796020508, + "learning_rate": 0.000137244743068244, + "loss": 2.9104, + "step": 41781 + }, + { + "epoch": 2.05, + "grad_norm": 0.7072665691375732, + "learning_rate": 0.00013723180979773378, + "loss": 2.9528, + "step": 41782 + }, + { + "epoch": 2.05, + "grad_norm": 0.7041286826133728, + "learning_rate": 0.00013721887695592372, + "loss": 2.7528, + "step": 41783 + }, + { + "epoch": 2.05, + "grad_norm": 0.7332327961921692, + "learning_rate": 0.00013720594454284788, + "loss": 2.7556, + "step": 41784 + }, + { + "epoch": 2.05, + "grad_norm": 0.6847696900367737, + "learning_rate": 0.0001371930125585401, + "loss": 2.7119, + "step": 41785 + }, + { + "epoch": 2.05, + "grad_norm": 0.6434175968170166, + "learning_rate": 0.00013718008100303472, + "loss": 2.8625, + "step": 41786 + }, + { + "epoch": 2.05, + "grad_norm": 0.6727170944213867, + "learning_rate": 0.00013716714987636555, + "loss": 2.9554, + "step": 41787 + }, + { + "epoch": 2.05, + "grad_norm": 0.6850985884666443, + "learning_rate": 0.00013715421917856677, + "loss": 3.069, + "step": 41788 + }, + { + "epoch": 2.05, + "grad_norm": 0.6658537983894348, + "learning_rate": 0.00013714128890967252, + "loss": 2.8858, + "step": 41789 + }, + { + "epoch": 2.05, + "grad_norm": 0.6863417625427246, + "learning_rate": 0.00013712835906971667, + "loss": 2.8266, + "step": 41790 + }, + { + "epoch": 2.05, + "grad_norm": 0.7340531349182129, + "learning_rate": 0.00013711542965873352, + "loss": 2.7212, + "step": 41791 + }, + { + "epoch": 2.05, + "grad_norm": 0.7077760696411133, + "learning_rate": 0.00013710250067675697, + "loss": 2.9836, + "step": 41792 + }, + { + "epoch": 2.05, + "grad_norm": 0.7007636427879333, + "learning_rate": 0.000137089572123821, + "loss": 2.7947, + "step": 41793 + }, + { + "epoch": 2.05, + "grad_norm": 0.6507523655891418, + "learning_rate": 0.0001370766439999598, + "loss": 2.7803, + "step": 41794 + }, + { + "epoch": 2.05, + "grad_norm": 0.6822836995124817, + "learning_rate": 0.00013706371630520727, + "loss": 2.7836, + "step": 41795 + }, + { + "epoch": 2.05, + "grad_norm": 1.2069634199142456, + "learning_rate": 0.00013705078903959768, + "loss": 2.9114, + "step": 41796 + }, + { + "epoch": 2.05, + "grad_norm": 0.6869301795959473, + "learning_rate": 0.00013703786220316483, + "loss": 2.7778, + "step": 41797 + }, + { + "epoch": 2.05, + "grad_norm": 0.6838812828063965, + "learning_rate": 0.000137024935795943, + "loss": 2.9701, + "step": 41798 + }, + { + "epoch": 2.05, + "grad_norm": 0.6679784655570984, + "learning_rate": 0.00013701200981796615, + "loss": 2.873, + "step": 41799 + }, + { + "epoch": 2.05, + "grad_norm": 0.6981244683265686, + "learning_rate": 0.00013699908426926814, + "loss": 3.1318, + "step": 41800 + }, + { + "epoch": 2.05, + "grad_norm": 0.6437106728553772, + "learning_rate": 0.00013698615914988334, + "loss": 2.8379, + "step": 41801 + }, + { + "epoch": 2.05, + "grad_norm": 0.6813018321990967, + "learning_rate": 0.00013697323445984545, + "loss": 2.8552, + "step": 41802 + }, + { + "epoch": 2.05, + "grad_norm": 0.7143990993499756, + "learning_rate": 0.00013696031019918873, + "loss": 2.9512, + "step": 41803 + }, + { + "epoch": 2.05, + "grad_norm": 0.7127072215080261, + "learning_rate": 0.00013694738636794727, + "loss": 2.966, + "step": 41804 + }, + { + "epoch": 2.05, + "grad_norm": 0.7043535709381104, + "learning_rate": 0.000136934462966155, + "loss": 2.801, + "step": 41805 + }, + { + "epoch": 2.05, + "grad_norm": 0.6857683062553406, + "learning_rate": 0.000136921539993846, + "loss": 2.8319, + "step": 41806 + }, + { + "epoch": 2.05, + "grad_norm": 0.6381077766418457, + "learning_rate": 0.00013690861745105414, + "loss": 3.0228, + "step": 41807 + }, + { + "epoch": 2.05, + "grad_norm": 0.6954429745674133, + "learning_rate": 0.00013689569533781364, + "loss": 2.9477, + "step": 41808 + }, + { + "epoch": 2.05, + "grad_norm": 0.6456500291824341, + "learning_rate": 0.00013688277365415854, + "loss": 3.0681, + "step": 41809 + }, + { + "epoch": 2.05, + "grad_norm": 0.6630203127861023, + "learning_rate": 0.00013686985240012276, + "loss": 2.8573, + "step": 41810 + }, + { + "epoch": 2.05, + "grad_norm": 0.6759008169174194, + "learning_rate": 0.00013685693157574046, + "loss": 2.7842, + "step": 41811 + }, + { + "epoch": 2.05, + "grad_norm": 0.6748966574668884, + "learning_rate": 0.00013684401118104552, + "loss": 2.9473, + "step": 41812 + }, + { + "epoch": 2.05, + "grad_norm": 0.7730910778045654, + "learning_rate": 0.00013683109121607218, + "loss": 2.636, + "step": 41813 + }, + { + "epoch": 2.05, + "grad_norm": 0.6848281025886536, + "learning_rate": 0.00013681817168085432, + "loss": 2.8989, + "step": 41814 + }, + { + "epoch": 2.05, + "grad_norm": 0.6651986241340637, + "learning_rate": 0.0001368052525754259, + "loss": 2.7469, + "step": 41815 + }, + { + "epoch": 2.05, + "grad_norm": 0.6602210998535156, + "learning_rate": 0.00013679233389982112, + "loss": 3.0205, + "step": 41816 + }, + { + "epoch": 2.05, + "grad_norm": 0.7078651189804077, + "learning_rate": 0.00013677941565407384, + "loss": 2.8248, + "step": 41817 + }, + { + "epoch": 2.05, + "grad_norm": 0.6716862320899963, + "learning_rate": 0.00013676649783821813, + "loss": 2.8242, + "step": 41818 + }, + { + "epoch": 2.05, + "grad_norm": 0.6786720156669617, + "learning_rate": 0.00013675358045228821, + "loss": 3.0426, + "step": 41819 + }, + { + "epoch": 2.05, + "grad_norm": 0.6606864929199219, + "learning_rate": 0.00013674066349631791, + "loss": 3.0192, + "step": 41820 + }, + { + "epoch": 2.05, + "grad_norm": 0.6600707769393921, + "learning_rate": 0.00013672774697034127, + "loss": 3.1479, + "step": 41821 + }, + { + "epoch": 2.05, + "grad_norm": 0.7069227695465088, + "learning_rate": 0.00013671483087439223, + "loss": 2.8687, + "step": 41822 + }, + { + "epoch": 2.05, + "grad_norm": 0.6884543895721436, + "learning_rate": 0.0001367019152085049, + "loss": 2.9547, + "step": 41823 + }, + { + "epoch": 2.05, + "grad_norm": 0.6656718850135803, + "learning_rate": 0.0001366889999727134, + "loss": 2.9701, + "step": 41824 + }, + { + "epoch": 2.05, + "grad_norm": 0.6968078017234802, + "learning_rate": 0.00013667608516705154, + "loss": 3.0355, + "step": 41825 + }, + { + "epoch": 2.05, + "grad_norm": 0.7189076542854309, + "learning_rate": 0.0001366631707915536, + "loss": 2.9283, + "step": 41826 + }, + { + "epoch": 2.05, + "grad_norm": 0.6748219728469849, + "learning_rate": 0.00013665025684625335, + "loss": 3.0839, + "step": 41827 + }, + { + "epoch": 2.05, + "grad_norm": 0.6655088067054749, + "learning_rate": 0.00013663734333118483, + "loss": 2.8599, + "step": 41828 + }, + { + "epoch": 2.05, + "grad_norm": 0.6688360571861267, + "learning_rate": 0.00013662443024638218, + "loss": 3.2349, + "step": 41829 + }, + { + "epoch": 2.05, + "grad_norm": 0.7066716551780701, + "learning_rate": 0.00013661151759187927, + "loss": 2.9871, + "step": 41830 + }, + { + "epoch": 2.05, + "grad_norm": 0.6709800362586975, + "learning_rate": 0.00013659860536771023, + "loss": 2.815, + "step": 41831 + }, + { + "epoch": 2.05, + "grad_norm": 0.7102221250534058, + "learning_rate": 0.00013658569357390897, + "loss": 2.8565, + "step": 41832 + }, + { + "epoch": 2.05, + "grad_norm": 0.6634939312934875, + "learning_rate": 0.0001365727822105095, + "loss": 2.7641, + "step": 41833 + }, + { + "epoch": 2.05, + "grad_norm": 0.6929799318313599, + "learning_rate": 0.000136559871277546, + "loss": 2.8275, + "step": 41834 + }, + { + "epoch": 2.05, + "grad_norm": 0.695609450340271, + "learning_rate": 0.00013654696077505232, + "loss": 3.0188, + "step": 41835 + }, + { + "epoch": 2.05, + "grad_norm": 0.6726747751235962, + "learning_rate": 0.00013653405070306252, + "loss": 2.9284, + "step": 41836 + }, + { + "epoch": 2.05, + "grad_norm": 0.7245111465454102, + "learning_rate": 0.00013652114106161043, + "loss": 2.8936, + "step": 41837 + }, + { + "epoch": 2.05, + "grad_norm": 0.7520313858985901, + "learning_rate": 0.00013650823185073035, + "loss": 2.9826, + "step": 41838 + }, + { + "epoch": 2.05, + "grad_norm": 0.6968186497688293, + "learning_rate": 0.00013649532307045595, + "loss": 3.0164, + "step": 41839 + }, + { + "epoch": 2.05, + "grad_norm": 0.7093591094017029, + "learning_rate": 0.00013648241472082143, + "loss": 3.0614, + "step": 41840 + }, + { + "epoch": 2.05, + "grad_norm": 0.6695505380630493, + "learning_rate": 0.0001364695068018609, + "loss": 2.957, + "step": 41841 + }, + { + "epoch": 2.05, + "grad_norm": 0.721322238445282, + "learning_rate": 0.0001364565993136082, + "loss": 2.7591, + "step": 41842 + }, + { + "epoch": 2.05, + "grad_norm": 0.6672532558441162, + "learning_rate": 0.00013644369225609734, + "loss": 3.027, + "step": 41843 + }, + { + "epoch": 2.05, + "grad_norm": 0.7743553519248962, + "learning_rate": 0.00013643078562936218, + "loss": 2.7336, + "step": 41844 + }, + { + "epoch": 2.05, + "grad_norm": 0.7321560978889465, + "learning_rate": 0.00013641787943343686, + "loss": 2.8884, + "step": 41845 + }, + { + "epoch": 2.05, + "grad_norm": 0.7255355715751648, + "learning_rate": 0.0001364049736683555, + "loss": 2.8195, + "step": 41846 + }, + { + "epoch": 2.05, + "grad_norm": 0.721168577671051, + "learning_rate": 0.00013639206833415183, + "loss": 2.7635, + "step": 41847 + }, + { + "epoch": 2.05, + "grad_norm": 0.6950680613517761, + "learning_rate": 0.0001363791634308601, + "loss": 2.7786, + "step": 41848 + }, + { + "epoch": 2.05, + "grad_norm": 0.7091295719146729, + "learning_rate": 0.00013636625895851413, + "loss": 3.0322, + "step": 41849 + }, + { + "epoch": 2.05, + "grad_norm": 0.7242528796195984, + "learning_rate": 0.00013635335491714786, + "loss": 2.8183, + "step": 41850 + }, + { + "epoch": 2.05, + "grad_norm": 0.7005438804626465, + "learning_rate": 0.00013634045130679544, + "loss": 2.9768, + "step": 41851 + }, + { + "epoch": 2.05, + "grad_norm": 0.6915015578269958, + "learning_rate": 0.00013632754812749066, + "loss": 3.1398, + "step": 41852 + }, + { + "epoch": 2.05, + "grad_norm": 0.6812850832939148, + "learning_rate": 0.00013631464537926775, + "loss": 3.0446, + "step": 41853 + }, + { + "epoch": 2.05, + "grad_norm": 0.69903165102005, + "learning_rate": 0.00013630174306216044, + "loss": 2.9758, + "step": 41854 + }, + { + "epoch": 2.05, + "grad_norm": 0.6556075215339661, + "learning_rate": 0.00013628884117620284, + "loss": 2.8251, + "step": 41855 + }, + { + "epoch": 2.05, + "grad_norm": 0.9440958499908447, + "learning_rate": 0.00013627593972142903, + "loss": 2.9879, + "step": 41856 + }, + { + "epoch": 2.05, + "grad_norm": 0.6949568390846252, + "learning_rate": 0.00013626303869787292, + "loss": 2.7712, + "step": 41857 + }, + { + "epoch": 2.05, + "grad_norm": 0.7072689533233643, + "learning_rate": 0.0001362501381055684, + "loss": 2.9006, + "step": 41858 + }, + { + "epoch": 2.05, + "grad_norm": 0.7018836140632629, + "learning_rate": 0.00013623723794454944, + "loss": 3.0257, + "step": 41859 + }, + { + "epoch": 2.05, + "grad_norm": 0.6747689247131348, + "learning_rate": 0.00013622433821485006, + "loss": 2.9541, + "step": 41860 + }, + { + "epoch": 2.05, + "grad_norm": 0.704422116279602, + "learning_rate": 0.00013621143891650437, + "loss": 2.9823, + "step": 41861 + }, + { + "epoch": 2.05, + "grad_norm": 0.6954941749572754, + "learning_rate": 0.00013619854004954612, + "loss": 3.0837, + "step": 41862 + }, + { + "epoch": 2.05, + "grad_norm": 0.7778325080871582, + "learning_rate": 0.00013618564161400947, + "loss": 2.797, + "step": 41863 + }, + { + "epoch": 2.05, + "grad_norm": 0.7002326846122742, + "learning_rate": 0.00013617274360992832, + "loss": 2.8489, + "step": 41864 + }, + { + "epoch": 2.05, + "grad_norm": 0.6892054080963135, + "learning_rate": 0.00013615984603733655, + "loss": 2.9662, + "step": 41865 + }, + { + "epoch": 2.05, + "grad_norm": 0.6607648730278015, + "learning_rate": 0.0001361469488962683, + "loss": 2.9692, + "step": 41866 + }, + { + "epoch": 2.05, + "grad_norm": 0.6442403197288513, + "learning_rate": 0.00013613405218675735, + "loss": 2.8593, + "step": 41867 + }, + { + "epoch": 2.05, + "grad_norm": 0.6695894002914429, + "learning_rate": 0.0001361211559088379, + "loss": 2.7185, + "step": 41868 + }, + { + "epoch": 2.05, + "grad_norm": 0.725757360458374, + "learning_rate": 0.00013610826006254368, + "loss": 2.9114, + "step": 41869 + }, + { + "epoch": 2.05, + "grad_norm": 0.7685546278953552, + "learning_rate": 0.00013609536464790871, + "loss": 2.8982, + "step": 41870 + }, + { + "epoch": 2.05, + "grad_norm": 0.6293439865112305, + "learning_rate": 0.00013608246966496714, + "loss": 2.8666, + "step": 41871 + }, + { + "epoch": 2.05, + "grad_norm": 0.7110158205032349, + "learning_rate": 0.00013606957511375285, + "loss": 2.7713, + "step": 41872 + }, + { + "epoch": 2.05, + "grad_norm": 0.687812864780426, + "learning_rate": 0.00013605668099429967, + "loss": 3.1352, + "step": 41873 + }, + { + "epoch": 2.05, + "grad_norm": 0.6826140284538269, + "learning_rate": 0.00013604378730664156, + "loss": 2.7359, + "step": 41874 + }, + { + "epoch": 2.05, + "grad_norm": 0.6955284476280212, + "learning_rate": 0.0001360308940508126, + "loss": 2.8839, + "step": 41875 + }, + { + "epoch": 2.05, + "grad_norm": 0.744838535785675, + "learning_rate": 0.00013601800122684676, + "loss": 2.8754, + "step": 41876 + }, + { + "epoch": 2.05, + "grad_norm": 0.7015159130096436, + "learning_rate": 0.00013600510883477786, + "loss": 2.9217, + "step": 41877 + }, + { + "epoch": 2.05, + "grad_norm": 0.6986700296401978, + "learning_rate": 0.00013599221687464005, + "loss": 2.8404, + "step": 41878 + }, + { + "epoch": 2.05, + "grad_norm": 0.7709964513778687, + "learning_rate": 0.00013597932534646718, + "loss": 2.8286, + "step": 41879 + }, + { + "epoch": 2.05, + "grad_norm": 0.6718994379043579, + "learning_rate": 0.0001359664342502931, + "loss": 2.9081, + "step": 41880 + }, + { + "epoch": 2.05, + "grad_norm": 0.7286249399185181, + "learning_rate": 0.00013595354358615194, + "loss": 2.9856, + "step": 41881 + }, + { + "epoch": 2.05, + "grad_norm": 0.6866543889045715, + "learning_rate": 0.00013594065335407752, + "loss": 2.8534, + "step": 41882 + }, + { + "epoch": 2.05, + "grad_norm": 0.6790507435798645, + "learning_rate": 0.00013592776355410394, + "loss": 2.9359, + "step": 41883 + }, + { + "epoch": 2.05, + "grad_norm": 0.6834301948547363, + "learning_rate": 0.0001359148741862649, + "loss": 2.9917, + "step": 41884 + }, + { + "epoch": 2.05, + "grad_norm": 0.6941182613372803, + "learning_rate": 0.00013590198525059466, + "loss": 2.926, + "step": 41885 + }, + { + "epoch": 2.05, + "grad_norm": 0.7055644392967224, + "learning_rate": 0.00013588909674712698, + "loss": 2.7773, + "step": 41886 + }, + { + "epoch": 2.05, + "grad_norm": 0.6956052780151367, + "learning_rate": 0.0001358762086758957, + "loss": 2.8471, + "step": 41887 + }, + { + "epoch": 2.05, + "grad_norm": 0.6483541131019592, + "learning_rate": 0.00013586332103693506, + "loss": 3.033, + "step": 41888 + }, + { + "epoch": 2.05, + "grad_norm": 0.6963681578636169, + "learning_rate": 0.00013585043383027874, + "loss": 3.1238, + "step": 41889 + }, + { + "epoch": 2.05, + "grad_norm": 0.6419037580490112, + "learning_rate": 0.00013583754705596076, + "loss": 2.8311, + "step": 41890 + }, + { + "epoch": 2.05, + "grad_norm": 0.6917277574539185, + "learning_rate": 0.0001358246607140152, + "loss": 2.8747, + "step": 41891 + }, + { + "epoch": 2.05, + "grad_norm": 0.7247151732444763, + "learning_rate": 0.00013581177480447576, + "loss": 2.9386, + "step": 41892 + }, + { + "epoch": 2.05, + "grad_norm": 0.71748948097229, + "learning_rate": 0.00013579888932737665, + "loss": 3.047, + "step": 41893 + }, + { + "epoch": 2.05, + "grad_norm": 0.6673287749290466, + "learning_rate": 0.00013578600428275166, + "loss": 3.0716, + "step": 41894 + }, + { + "epoch": 2.05, + "grad_norm": 0.7329942584037781, + "learning_rate": 0.00013577311967063462, + "loss": 2.7382, + "step": 41895 + }, + { + "epoch": 2.05, + "grad_norm": 0.652654767036438, + "learning_rate": 0.00013576023549105966, + "loss": 3.2103, + "step": 41896 + }, + { + "epoch": 2.05, + "grad_norm": 0.730176568031311, + "learning_rate": 0.00013574735174406055, + "loss": 2.8715, + "step": 41897 + }, + { + "epoch": 2.05, + "grad_norm": 0.6587347984313965, + "learning_rate": 0.0001357344684296714, + "loss": 2.7973, + "step": 41898 + }, + { + "epoch": 2.05, + "grad_norm": 0.6811947822570801, + "learning_rate": 0.00013572158554792595, + "loss": 2.9729, + "step": 41899 + }, + { + "epoch": 2.05, + "grad_norm": 0.6677283644676208, + "learning_rate": 0.0001357087030988583, + "loss": 3.0673, + "step": 41900 + }, + { + "epoch": 2.05, + "grad_norm": 0.7097472548484802, + "learning_rate": 0.00013569582108250236, + "loss": 2.7919, + "step": 41901 + }, + { + "epoch": 2.05, + "grad_norm": 0.6857821345329285, + "learning_rate": 0.00013568293949889186, + "loss": 3.1361, + "step": 41902 + }, + { + "epoch": 2.05, + "grad_norm": 0.6500460505485535, + "learning_rate": 0.000135670058348061, + "loss": 2.869, + "step": 41903 + }, + { + "epoch": 2.05, + "grad_norm": 0.6371486186981201, + "learning_rate": 0.00013565717763004348, + "loss": 2.9958, + "step": 41904 + }, + { + "epoch": 2.05, + "grad_norm": 0.6974950432777405, + "learning_rate": 0.0001356442973448733, + "loss": 2.9836, + "step": 41905 + }, + { + "epoch": 2.05, + "grad_norm": 0.727595329284668, + "learning_rate": 0.00013563141749258455, + "loss": 2.9796, + "step": 41906 + }, + { + "epoch": 2.05, + "grad_norm": 0.681317150592804, + "learning_rate": 0.000135618538073211, + "loss": 2.7705, + "step": 41907 + }, + { + "epoch": 2.05, + "grad_norm": 0.6713190674781799, + "learning_rate": 0.00013560565908678654, + "loss": 2.934, + "step": 41908 + }, + { + "epoch": 2.05, + "grad_norm": 0.698500394821167, + "learning_rate": 0.00013559278053334507, + "loss": 2.9041, + "step": 41909 + }, + { + "epoch": 2.05, + "grad_norm": 0.6464803814888, + "learning_rate": 0.0001355799024129206, + "loss": 2.9814, + "step": 41910 + }, + { + "epoch": 2.05, + "grad_norm": 0.7684870362281799, + "learning_rate": 0.00013556702472554712, + "loss": 2.9662, + "step": 41911 + }, + { + "epoch": 2.05, + "grad_norm": 0.7648671269416809, + "learning_rate": 0.00013555414747125832, + "loss": 2.8802, + "step": 41912 + }, + { + "epoch": 2.05, + "grad_norm": 0.6603399515151978, + "learning_rate": 0.00013554127065008837, + "loss": 3.0558, + "step": 41913 + }, + { + "epoch": 2.05, + "grad_norm": 0.6440830230712891, + "learning_rate": 0.00013552839426207093, + "loss": 2.8296, + "step": 41914 + }, + { + "epoch": 2.05, + "grad_norm": 0.6936287879943848, + "learning_rate": 0.00013551551830724018, + "loss": 2.9706, + "step": 41915 + }, + { + "epoch": 2.05, + "grad_norm": 0.6846458911895752, + "learning_rate": 0.0001355026427856299, + "loss": 3.0088, + "step": 41916 + }, + { + "epoch": 2.05, + "grad_norm": 0.6697417497634888, + "learning_rate": 0.00013548976769727386, + "loss": 2.8968, + "step": 41917 + }, + { + "epoch": 2.05, + "grad_norm": 0.7135060429573059, + "learning_rate": 0.00013547689304220625, + "loss": 3.0002, + "step": 41918 + }, + { + "epoch": 2.05, + "grad_norm": 0.69883131980896, + "learning_rate": 0.0001354640188204607, + "loss": 2.8384, + "step": 41919 + }, + { + "epoch": 2.05, + "grad_norm": 0.663644552230835, + "learning_rate": 0.00013545114503207142, + "loss": 2.6999, + "step": 41920 + }, + { + "epoch": 2.05, + "grad_norm": 0.6672549247741699, + "learning_rate": 0.00013543827167707202, + "loss": 3.0655, + "step": 41921 + }, + { + "epoch": 2.05, + "grad_norm": 0.7014299035072327, + "learning_rate": 0.00013542539875549664, + "loss": 2.8712, + "step": 41922 + }, + { + "epoch": 2.05, + "grad_norm": 0.6401073336601257, + "learning_rate": 0.00013541252626737908, + "loss": 2.9725, + "step": 41923 + }, + { + "epoch": 2.05, + "grad_norm": 0.7074952125549316, + "learning_rate": 0.00013539965421275317, + "loss": 2.8557, + "step": 41924 + }, + { + "epoch": 2.05, + "grad_norm": 0.6953228712081909, + "learning_rate": 0.00013538678259165297, + "loss": 2.674, + "step": 41925 + }, + { + "epoch": 2.05, + "grad_norm": 0.6561875939369202, + "learning_rate": 0.00013537391140411222, + "loss": 3.0387, + "step": 41926 + }, + { + "epoch": 2.05, + "grad_norm": 0.70853191614151, + "learning_rate": 0.00013536104065016492, + "loss": 2.6825, + "step": 41927 + }, + { + "epoch": 2.05, + "grad_norm": 0.7176726460456848, + "learning_rate": 0.00013534817032984504, + "loss": 2.6226, + "step": 41928 + }, + { + "epoch": 2.05, + "grad_norm": 0.7297140955924988, + "learning_rate": 0.0001353353004431864, + "loss": 2.9366, + "step": 41929 + }, + { + "epoch": 2.05, + "grad_norm": 0.6884056329727173, + "learning_rate": 0.00013532243099022287, + "loss": 2.9259, + "step": 41930 + }, + { + "epoch": 2.05, + "grad_norm": 0.7293509244918823, + "learning_rate": 0.0001353095619709883, + "loss": 2.9394, + "step": 41931 + }, + { + "epoch": 2.05, + "grad_norm": 0.7084832191467285, + "learning_rate": 0.0001352966933855166, + "loss": 3.1906, + "step": 41932 + }, + { + "epoch": 2.06, + "grad_norm": 0.7524933815002441, + "learning_rate": 0.00013528382523384183, + "loss": 2.9092, + "step": 41933 + }, + { + "epoch": 2.06, + "grad_norm": 0.6961275935173035, + "learning_rate": 0.00013527095751599767, + "loss": 2.8531, + "step": 41934 + }, + { + "epoch": 2.06, + "grad_norm": 0.6843053698539734, + "learning_rate": 0.0001352580902320182, + "loss": 2.7244, + "step": 41935 + }, + { + "epoch": 2.06, + "grad_norm": 0.7608031034469604, + "learning_rate": 0.0001352452233819371, + "loss": 3.1026, + "step": 41936 + }, + { + "epoch": 2.06, + "grad_norm": 0.6482999324798584, + "learning_rate": 0.0001352323569657885, + "loss": 2.9609, + "step": 41937 + }, + { + "epoch": 2.06, + "grad_norm": 0.6896520853042603, + "learning_rate": 0.00013521949098360616, + "loss": 2.9405, + "step": 41938 + }, + { + "epoch": 2.06, + "grad_norm": 0.7269614934921265, + "learning_rate": 0.00013520662543542385, + "loss": 2.7086, + "step": 41939 + }, + { + "epoch": 2.06, + "grad_norm": 0.6921659111976624, + "learning_rate": 0.00013519376032127567, + "loss": 2.7784, + "step": 41940 + }, + { + "epoch": 2.06, + "grad_norm": 0.6634232401847839, + "learning_rate": 0.00013518089564119534, + "loss": 3.0583, + "step": 41941 + }, + { + "epoch": 2.06, + "grad_norm": 0.7075629830360413, + "learning_rate": 0.00013516803139521677, + "loss": 2.8788, + "step": 41942 + }, + { + "epoch": 2.06, + "grad_norm": 0.7123188972473145, + "learning_rate": 0.000135155167583374, + "loss": 2.802, + "step": 41943 + }, + { + "epoch": 2.06, + "grad_norm": 0.6718761920928955, + "learning_rate": 0.00013514230420570085, + "loss": 2.9926, + "step": 41944 + }, + { + "epoch": 2.06, + "grad_norm": 0.6959552764892578, + "learning_rate": 0.00013512944126223107, + "loss": 2.9625, + "step": 41945 + }, + { + "epoch": 2.06, + "grad_norm": 0.7005223035812378, + "learning_rate": 0.00013511657875299853, + "loss": 2.8976, + "step": 41946 + }, + { + "epoch": 2.06, + "grad_norm": 0.6967735290527344, + "learning_rate": 0.00013510371667803717, + "loss": 2.9729, + "step": 41947 + }, + { + "epoch": 2.06, + "grad_norm": 0.6776131391525269, + "learning_rate": 0.00013509085503738102, + "loss": 2.8577, + "step": 41948 + }, + { + "epoch": 2.06, + "grad_norm": 0.6918829679489136, + "learning_rate": 0.0001350779938310637, + "loss": 3.0947, + "step": 41949 + }, + { + "epoch": 2.06, + "grad_norm": 0.7643651366233826, + "learning_rate": 0.00013506513305911933, + "loss": 3.176, + "step": 41950 + }, + { + "epoch": 2.06, + "grad_norm": 0.6947302222251892, + "learning_rate": 0.00013505227272158163, + "loss": 2.9131, + "step": 41951 + }, + { + "epoch": 2.06, + "grad_norm": 0.7408885359764099, + "learning_rate": 0.00013503941281848439, + "loss": 2.9874, + "step": 41952 + }, + { + "epoch": 2.06, + "grad_norm": 0.7771589159965515, + "learning_rate": 0.00013502655334986173, + "loss": 3.0083, + "step": 41953 + }, + { + "epoch": 2.06, + "grad_norm": 0.6549042463302612, + "learning_rate": 0.00013501369431574725, + "loss": 2.8394, + "step": 41954 + }, + { + "epoch": 2.06, + "grad_norm": 0.687390148639679, + "learning_rate": 0.00013500083571617508, + "loss": 2.7261, + "step": 41955 + }, + { + "epoch": 2.06, + "grad_norm": 0.6740281581878662, + "learning_rate": 0.0001349879775511788, + "loss": 2.8224, + "step": 41956 + }, + { + "epoch": 2.06, + "grad_norm": 0.6710662841796875, + "learning_rate": 0.00013497511982079246, + "loss": 2.709, + "step": 41957 + }, + { + "epoch": 2.06, + "grad_norm": 0.7352544665336609, + "learning_rate": 0.00013496226252504998, + "loss": 3.0141, + "step": 41958 + }, + { + "epoch": 2.06, + "grad_norm": 0.6972978115081787, + "learning_rate": 0.0001349494056639852, + "loss": 2.9088, + "step": 41959 + }, + { + "epoch": 2.06, + "grad_norm": 0.6581162214279175, + "learning_rate": 0.00013493654923763183, + "loss": 2.8458, + "step": 41960 + }, + { + "epoch": 2.06, + "grad_norm": 0.6925591230392456, + "learning_rate": 0.00013492369324602375, + "loss": 2.9503, + "step": 41961 + }, + { + "epoch": 2.06, + "grad_norm": 0.663356602191925, + "learning_rate": 0.00013491083768919494, + "loss": 2.8988, + "step": 41962 + }, + { + "epoch": 2.06, + "grad_norm": 0.7245420813560486, + "learning_rate": 0.00013489798256717926, + "loss": 2.8092, + "step": 41963 + }, + { + "epoch": 2.06, + "grad_norm": 0.6705074310302734, + "learning_rate": 0.00013488512788001043, + "loss": 2.8578, + "step": 41964 + }, + { + "epoch": 2.06, + "grad_norm": 0.6748921871185303, + "learning_rate": 0.00013487227362772252, + "loss": 3.1966, + "step": 41965 + }, + { + "epoch": 2.06, + "grad_norm": 0.728326141834259, + "learning_rate": 0.00013485941981034922, + "loss": 2.8332, + "step": 41966 + }, + { + "epoch": 2.06, + "grad_norm": 0.7034401297569275, + "learning_rate": 0.00013484656642792436, + "loss": 2.9848, + "step": 41967 + }, + { + "epoch": 2.06, + "grad_norm": 0.68290776014328, + "learning_rate": 0.00013483371348048197, + "loss": 2.8918, + "step": 41968 + }, + { + "epoch": 2.06, + "grad_norm": 0.6936345100402832, + "learning_rate": 0.00013482086096805565, + "loss": 2.779, + "step": 41969 + }, + { + "epoch": 2.06, + "grad_norm": 0.7086421251296997, + "learning_rate": 0.00013480800889067955, + "loss": 2.9097, + "step": 41970 + }, + { + "epoch": 2.06, + "grad_norm": 0.7316939830780029, + "learning_rate": 0.00013479515724838722, + "loss": 2.8459, + "step": 41971 + }, + { + "epoch": 2.06, + "grad_norm": 0.6890463829040527, + "learning_rate": 0.00013478230604121265, + "loss": 3.0289, + "step": 41972 + }, + { + "epoch": 2.06, + "grad_norm": 0.6866118907928467, + "learning_rate": 0.00013476945526918978, + "loss": 2.7847, + "step": 41973 + }, + { + "epoch": 2.06, + "grad_norm": 0.7601082921028137, + "learning_rate": 0.00013475660493235237, + "loss": 2.8525, + "step": 41974 + }, + { + "epoch": 2.06, + "grad_norm": 0.7042220234870911, + "learning_rate": 0.00013474375503073433, + "loss": 2.945, + "step": 41975 + }, + { + "epoch": 2.06, + "grad_norm": 0.6664708852767944, + "learning_rate": 0.00013473090556436927, + "loss": 2.9917, + "step": 41976 + }, + { + "epoch": 2.06, + "grad_norm": 0.6848310232162476, + "learning_rate": 0.00013471805653329118, + "loss": 2.8884, + "step": 41977 + }, + { + "epoch": 2.06, + "grad_norm": 0.6613977551460266, + "learning_rate": 0.00013470520793753407, + "loss": 2.9344, + "step": 41978 + }, + { + "epoch": 2.06, + "grad_norm": 0.7326992750167847, + "learning_rate": 0.00013469235977713148, + "loss": 2.8053, + "step": 41979 + }, + { + "epoch": 2.06, + "grad_norm": 0.6662197113037109, + "learning_rate": 0.00013467951205211755, + "loss": 2.8492, + "step": 41980 + }, + { + "epoch": 2.06, + "grad_norm": 0.6890004873275757, + "learning_rate": 0.00013466666476252595, + "loss": 2.9529, + "step": 41981 + }, + { + "epoch": 2.06, + "grad_norm": 0.6676129102706909, + "learning_rate": 0.00013465381790839042, + "loss": 2.8345, + "step": 41982 + }, + { + "epoch": 2.06, + "grad_norm": 0.676196813583374, + "learning_rate": 0.00013464097148974504, + "loss": 3.2066, + "step": 41983 + }, + { + "epoch": 2.06, + "grad_norm": 0.6789147257804871, + "learning_rate": 0.0001346281255066234, + "loss": 3.0088, + "step": 41984 + }, + { + "epoch": 2.06, + "grad_norm": 0.7471140623092651, + "learning_rate": 0.00013461527995905955, + "loss": 2.9146, + "step": 41985 + }, + { + "epoch": 2.06, + "grad_norm": 0.6926723718643188, + "learning_rate": 0.00013460243484708713, + "loss": 2.9489, + "step": 41986 + }, + { + "epoch": 2.06, + "grad_norm": 0.7110167145729065, + "learning_rate": 0.00013458959017074017, + "loss": 2.8836, + "step": 41987 + }, + { + "epoch": 2.06, + "grad_norm": 0.7335017323493958, + "learning_rate": 0.0001345767459300524, + "loss": 2.9397, + "step": 41988 + }, + { + "epoch": 2.06, + "grad_norm": 0.7071158289909363, + "learning_rate": 0.0001345639021250575, + "loss": 2.8792, + "step": 41989 + }, + { + "epoch": 2.06, + "grad_norm": 0.7941440343856812, + "learning_rate": 0.0001345510587557896, + "loss": 2.9275, + "step": 41990 + }, + { + "epoch": 2.06, + "grad_norm": 0.6561496257781982, + "learning_rate": 0.00013453821582228224, + "loss": 2.9598, + "step": 41991 + }, + { + "epoch": 2.06, + "grad_norm": 0.7104381918907166, + "learning_rate": 0.0001345253733245694, + "loss": 2.897, + "step": 41992 + }, + { + "epoch": 2.06, + "grad_norm": 0.702156662940979, + "learning_rate": 0.00013451253126268496, + "loss": 2.8701, + "step": 41993 + }, + { + "epoch": 2.06, + "grad_norm": 0.7097292542457581, + "learning_rate": 0.00013449968963666253, + "loss": 2.7465, + "step": 41994 + }, + { + "epoch": 2.06, + "grad_norm": 0.6726009249687195, + "learning_rate": 0.00013448684844653624, + "loss": 2.6797, + "step": 41995 + }, + { + "epoch": 2.06, + "grad_norm": 0.6798008680343628, + "learning_rate": 0.0001344740076923397, + "loss": 2.8157, + "step": 41996 + }, + { + "epoch": 2.06, + "grad_norm": 0.7259616255760193, + "learning_rate": 0.00013446116737410676, + "loss": 2.9134, + "step": 41997 + }, + { + "epoch": 2.06, + "grad_norm": 0.7127224802970886, + "learning_rate": 0.00013444832749187114, + "loss": 3.0528, + "step": 41998 + }, + { + "epoch": 2.06, + "grad_norm": 0.6722402572631836, + "learning_rate": 0.00013443548804566678, + "loss": 2.7478, + "step": 41999 + }, + { + "epoch": 2.06, + "grad_norm": 0.7285507321357727, + "learning_rate": 0.0001344226490355276, + "loss": 2.9906, + "step": 42000 + }, + { + "epoch": 2.06, + "grad_norm": 0.6551629304885864, + "learning_rate": 0.00013440981046148717, + "loss": 2.8337, + "step": 42001 + }, + { + "epoch": 2.06, + "grad_norm": 0.678065836429596, + "learning_rate": 0.00013439697232357954, + "loss": 2.8061, + "step": 42002 + }, + { + "epoch": 2.06, + "grad_norm": 0.7075507044792175, + "learning_rate": 0.00013438413462183842, + "loss": 2.7313, + "step": 42003 + }, + { + "epoch": 2.06, + "grad_norm": 0.658933699131012, + "learning_rate": 0.00013437129735629748, + "loss": 2.8594, + "step": 42004 + }, + { + "epoch": 2.06, + "grad_norm": 0.6496484279632568, + "learning_rate": 0.0001343584605269908, + "loss": 3.0021, + "step": 42005 + }, + { + "epoch": 2.06, + "grad_norm": 0.6887893080711365, + "learning_rate": 0.00013434562413395195, + "loss": 2.803, + "step": 42006 + }, + { + "epoch": 2.06, + "grad_norm": 0.6488357782363892, + "learning_rate": 0.00013433278817721496, + "loss": 2.9725, + "step": 42007 + }, + { + "epoch": 2.06, + "grad_norm": 0.7380465865135193, + "learning_rate": 0.00013431995265681337, + "loss": 3.0395, + "step": 42008 + }, + { + "epoch": 2.06, + "grad_norm": 0.703506350517273, + "learning_rate": 0.0001343071175727813, + "loss": 3.0208, + "step": 42009 + }, + { + "epoch": 2.06, + "grad_norm": 0.7320290803909302, + "learning_rate": 0.00013429428292515238, + "loss": 2.8408, + "step": 42010 + }, + { + "epoch": 2.06, + "grad_norm": 0.6865428686141968, + "learning_rate": 0.0001342814487139603, + "loss": 3.1323, + "step": 42011 + }, + { + "epoch": 2.06, + "grad_norm": 0.6920920014381409, + "learning_rate": 0.00013426861493923915, + "loss": 2.8396, + "step": 42012 + }, + { + "epoch": 2.06, + "grad_norm": 0.7023778557777405, + "learning_rate": 0.00013425578160102238, + "loss": 2.9399, + "step": 42013 + }, + { + "epoch": 2.06, + "grad_norm": 0.648321270942688, + "learning_rate": 0.00013424294869934407, + "loss": 3.1877, + "step": 42014 + }, + { + "epoch": 2.06, + "grad_norm": 0.6870456337928772, + "learning_rate": 0.000134230116234238, + "loss": 3.2613, + "step": 42015 + }, + { + "epoch": 2.06, + "grad_norm": 0.7093237638473511, + "learning_rate": 0.00013421728420573778, + "loss": 2.8811, + "step": 42016 + }, + { + "epoch": 2.06, + "grad_norm": 0.6749858260154724, + "learning_rate": 0.00013420445261387747, + "loss": 2.7412, + "step": 42017 + }, + { + "epoch": 2.06, + "grad_norm": 0.7406046986579895, + "learning_rate": 0.0001341916214586907, + "loss": 2.8874, + "step": 42018 + }, + { + "epoch": 2.06, + "grad_norm": 0.6919505596160889, + "learning_rate": 0.00013417879074021117, + "loss": 2.9656, + "step": 42019 + }, + { + "epoch": 2.06, + "grad_norm": 0.7033610939979553, + "learning_rate": 0.00013416596045847293, + "loss": 3.0286, + "step": 42020 + }, + { + "epoch": 2.06, + "grad_norm": 0.6647442579269409, + "learning_rate": 0.00013415313061350948, + "loss": 2.861, + "step": 42021 + }, + { + "epoch": 2.06, + "grad_norm": 0.6930309534072876, + "learning_rate": 0.00013414030120535491, + "loss": 3.1232, + "step": 42022 + }, + { + "epoch": 2.06, + "grad_norm": 0.6791087985038757, + "learning_rate": 0.00013412747223404275, + "loss": 2.9181, + "step": 42023 + }, + { + "epoch": 2.06, + "grad_norm": 0.6947936415672302, + "learning_rate": 0.00013411464369960706, + "loss": 2.7641, + "step": 42024 + }, + { + "epoch": 2.06, + "grad_norm": 0.7209773659706116, + "learning_rate": 0.00013410181560208142, + "loss": 2.9464, + "step": 42025 + }, + { + "epoch": 2.06, + "grad_norm": 0.7256671190261841, + "learning_rate": 0.00013408898794149956, + "loss": 2.8287, + "step": 42026 + }, + { + "epoch": 2.06, + "grad_norm": 0.67732834815979, + "learning_rate": 0.00013407616071789552, + "loss": 3.0294, + "step": 42027 + }, + { + "epoch": 2.06, + "grad_norm": 0.6837236285209656, + "learning_rate": 0.00013406333393130277, + "loss": 2.7781, + "step": 42028 + }, + { + "epoch": 2.06, + "grad_norm": 0.6771765351295471, + "learning_rate": 0.0001340505075817553, + "loss": 2.9683, + "step": 42029 + }, + { + "epoch": 2.06, + "grad_norm": 0.685924768447876, + "learning_rate": 0.00013403768166928698, + "loss": 2.9958, + "step": 42030 + }, + { + "epoch": 2.06, + "grad_norm": 0.6353333592414856, + "learning_rate": 0.00013402485619393146, + "loss": 2.9384, + "step": 42031 + }, + { + "epoch": 2.06, + "grad_norm": 0.697657585144043, + "learning_rate": 0.00013401203115572254, + "loss": 2.921, + "step": 42032 + }, + { + "epoch": 2.06, + "grad_norm": 0.6873392462730408, + "learning_rate": 0.00013399920655469383, + "loss": 2.7485, + "step": 42033 + }, + { + "epoch": 2.06, + "grad_norm": 0.6944345235824585, + "learning_rate": 0.0001339863823908793, + "loss": 2.8502, + "step": 42034 + }, + { + "epoch": 2.06, + "grad_norm": 0.6621580719947815, + "learning_rate": 0.0001339735586643128, + "loss": 2.8554, + "step": 42035 + }, + { + "epoch": 2.06, + "grad_norm": 0.6833303570747375, + "learning_rate": 0.00013396073537502782, + "loss": 3.2196, + "step": 42036 + }, + { + "epoch": 2.06, + "grad_norm": 0.6794368028640747, + "learning_rate": 0.00013394791252305848, + "loss": 2.8268, + "step": 42037 + }, + { + "epoch": 2.06, + "grad_norm": 0.7149304747581482, + "learning_rate": 0.00013393509010843827, + "loss": 3.1625, + "step": 42038 + }, + { + "epoch": 2.06, + "grad_norm": 0.6875823736190796, + "learning_rate": 0.00013392226813120118, + "loss": 2.8485, + "step": 42039 + }, + { + "epoch": 2.06, + "grad_norm": 0.7069306969642639, + "learning_rate": 0.00013390944659138085, + "loss": 3.0752, + "step": 42040 + }, + { + "epoch": 2.06, + "grad_norm": 0.6851629614830017, + "learning_rate": 0.000133896625489011, + "loss": 3.107, + "step": 42041 + }, + { + "epoch": 2.06, + "grad_norm": 0.6919379234313965, + "learning_rate": 0.00013388380482412554, + "loss": 3.0562, + "step": 42042 + }, + { + "epoch": 2.06, + "grad_norm": 0.724163830280304, + "learning_rate": 0.0001338709845967581, + "loss": 3.1261, + "step": 42043 + }, + { + "epoch": 2.06, + "grad_norm": 0.6805170178413391, + "learning_rate": 0.00013385816480694247, + "loss": 2.8018, + "step": 42044 + }, + { + "epoch": 2.06, + "grad_norm": 0.660524845123291, + "learning_rate": 0.0001338453454547126, + "loss": 2.8098, + "step": 42045 + }, + { + "epoch": 2.06, + "grad_norm": 0.6695048809051514, + "learning_rate": 0.00013383252654010212, + "loss": 3.0506, + "step": 42046 + }, + { + "epoch": 2.06, + "grad_norm": 0.6940860748291016, + "learning_rate": 0.00013381970806314474, + "loss": 2.9447, + "step": 42047 + }, + { + "epoch": 2.06, + "grad_norm": 0.6937212944030762, + "learning_rate": 0.0001338068900238742, + "loss": 2.954, + "step": 42048 + }, + { + "epoch": 2.06, + "grad_norm": 0.6832905411720276, + "learning_rate": 0.0001337940724223243, + "loss": 2.8383, + "step": 42049 + }, + { + "epoch": 2.06, + "grad_norm": 0.6810662746429443, + "learning_rate": 0.00013378125525852897, + "loss": 2.6696, + "step": 42050 + }, + { + "epoch": 2.06, + "grad_norm": 0.6970690488815308, + "learning_rate": 0.0001337684385325217, + "loss": 3.0258, + "step": 42051 + }, + { + "epoch": 2.06, + "grad_norm": 0.6836695671081543, + "learning_rate": 0.00013375562224433646, + "loss": 3.0219, + "step": 42052 + }, + { + "epoch": 2.06, + "grad_norm": 0.6802341341972351, + "learning_rate": 0.00013374280639400694, + "loss": 3.1268, + "step": 42053 + }, + { + "epoch": 2.06, + "grad_norm": 0.6934910416603088, + "learning_rate": 0.00013372999098156674, + "loss": 2.9171, + "step": 42054 + }, + { + "epoch": 2.06, + "grad_norm": 0.6853800415992737, + "learning_rate": 0.00013371717600704986, + "loss": 2.9585, + "step": 42055 + }, + { + "epoch": 2.06, + "grad_norm": 0.9536087512969971, + "learning_rate": 0.00013370436147048986, + "loss": 2.9071, + "step": 42056 + }, + { + "epoch": 2.06, + "grad_norm": 0.6992372274398804, + "learning_rate": 0.00013369154737192065, + "loss": 2.9655, + "step": 42057 + }, + { + "epoch": 2.06, + "grad_norm": 0.6866870522499084, + "learning_rate": 0.00013367873371137576, + "loss": 2.9159, + "step": 42058 + }, + { + "epoch": 2.06, + "grad_norm": 0.6795086860656738, + "learning_rate": 0.0001336659204888891, + "loss": 2.8051, + "step": 42059 + }, + { + "epoch": 2.06, + "grad_norm": 0.6785724759101868, + "learning_rate": 0.00013365310770449454, + "loss": 3.0133, + "step": 42060 + }, + { + "epoch": 2.06, + "grad_norm": 0.6523509621620178, + "learning_rate": 0.00013364029535822564, + "loss": 2.8145, + "step": 42061 + }, + { + "epoch": 2.06, + "grad_norm": 0.7120859026908875, + "learning_rate": 0.0001336274834501162, + "loss": 3.0528, + "step": 42062 + }, + { + "epoch": 2.06, + "grad_norm": 0.6999312043190002, + "learning_rate": 0.00013361467198019984, + "loss": 2.9157, + "step": 42063 + }, + { + "epoch": 2.06, + "grad_norm": 0.6902523040771484, + "learning_rate": 0.00013360186094851038, + "loss": 2.6057, + "step": 42064 + }, + { + "epoch": 2.06, + "grad_norm": 0.6437488198280334, + "learning_rate": 0.00013358905035508172, + "loss": 2.9131, + "step": 42065 + }, + { + "epoch": 2.06, + "grad_norm": 0.6648966670036316, + "learning_rate": 0.0001335762401999474, + "loss": 2.6556, + "step": 42066 + }, + { + "epoch": 2.06, + "grad_norm": 0.7151343822479248, + "learning_rate": 0.00013356343048314133, + "loss": 2.9348, + "step": 42067 + }, + { + "epoch": 2.06, + "grad_norm": 0.6936889290809631, + "learning_rate": 0.0001335506212046971, + "loss": 2.8895, + "step": 42068 + }, + { + "epoch": 2.06, + "grad_norm": 0.6722537279129028, + "learning_rate": 0.00013353781236464846, + "loss": 3.094, + "step": 42069 + }, + { + "epoch": 2.06, + "grad_norm": 0.6585366725921631, + "learning_rate": 0.00013352500396302927, + "loss": 2.9741, + "step": 42070 + }, + { + "epoch": 2.06, + "grad_norm": 0.7267591953277588, + "learning_rate": 0.0001335121959998731, + "loss": 2.9534, + "step": 42071 + }, + { + "epoch": 2.06, + "grad_norm": 0.6568361520767212, + "learning_rate": 0.00013349938847521383, + "loss": 2.7674, + "step": 42072 + }, + { + "epoch": 2.06, + "grad_norm": 0.700718343257904, + "learning_rate": 0.00013348658138908503, + "loss": 2.9192, + "step": 42073 + }, + { + "epoch": 2.06, + "grad_norm": 0.7025381326675415, + "learning_rate": 0.00013347377474152064, + "loss": 3.0146, + "step": 42074 + }, + { + "epoch": 2.06, + "grad_norm": 0.7157812714576721, + "learning_rate": 0.00013346096853255427, + "loss": 2.8323, + "step": 42075 + }, + { + "epoch": 2.06, + "grad_norm": 0.6481377482414246, + "learning_rate": 0.00013344816276221957, + "loss": 3.0237, + "step": 42076 + }, + { + "epoch": 2.06, + "grad_norm": 0.6517822742462158, + "learning_rate": 0.00013343535743055043, + "loss": 2.7866, + "step": 42077 + }, + { + "epoch": 2.06, + "grad_norm": 0.6869397759437561, + "learning_rate": 0.00013342255253758042, + "loss": 2.9609, + "step": 42078 + }, + { + "epoch": 2.06, + "grad_norm": 0.6711219549179077, + "learning_rate": 0.00013340974808334345, + "loss": 3.0348, + "step": 42079 + }, + { + "epoch": 2.06, + "grad_norm": 0.7152694463729858, + "learning_rate": 0.00013339694406787304, + "loss": 3.1945, + "step": 42080 + }, + { + "epoch": 2.06, + "grad_norm": 0.7110965251922607, + "learning_rate": 0.00013338414049120302, + "loss": 2.8687, + "step": 42081 + }, + { + "epoch": 2.06, + "grad_norm": 0.6851297616958618, + "learning_rate": 0.0001333713373533672, + "loss": 3.0068, + "step": 42082 + }, + { + "epoch": 2.06, + "grad_norm": 0.6575678586959839, + "learning_rate": 0.00013335853465439924, + "loss": 2.7863, + "step": 42083 + }, + { + "epoch": 2.06, + "grad_norm": 0.6966585516929626, + "learning_rate": 0.0001333457323943328, + "loss": 2.9452, + "step": 42084 + }, + { + "epoch": 2.06, + "grad_norm": 0.6733703017234802, + "learning_rate": 0.0001333329305732015, + "loss": 3.0046, + "step": 42085 + }, + { + "epoch": 2.06, + "grad_norm": 0.7381994128227234, + "learning_rate": 0.00013332012919103922, + "loss": 2.8515, + "step": 42086 + }, + { + "epoch": 2.06, + "grad_norm": 0.6656725406646729, + "learning_rate": 0.00013330732824787974, + "loss": 2.9327, + "step": 42087 + }, + { + "epoch": 2.06, + "grad_norm": 0.6955260634422302, + "learning_rate": 0.00013329452774375657, + "loss": 3.0594, + "step": 42088 + }, + { + "epoch": 2.06, + "grad_norm": 0.6607492566108704, + "learning_rate": 0.00013328172767870363, + "loss": 2.8572, + "step": 42089 + }, + { + "epoch": 2.06, + "grad_norm": 0.6896944046020508, + "learning_rate": 0.00013326892805275457, + "loss": 2.8783, + "step": 42090 + }, + { + "epoch": 2.06, + "grad_norm": 0.7322929501533508, + "learning_rate": 0.00013325612886594293, + "loss": 2.9448, + "step": 42091 + }, + { + "epoch": 2.06, + "grad_norm": 0.6637055277824402, + "learning_rate": 0.00013324333011830266, + "loss": 2.9994, + "step": 42092 + }, + { + "epoch": 2.06, + "grad_norm": 0.66081303358078, + "learning_rate": 0.00013323053180986724, + "loss": 2.9542, + "step": 42093 + }, + { + "epoch": 2.06, + "grad_norm": 0.7156060934066772, + "learning_rate": 0.00013321773394067064, + "loss": 2.8346, + "step": 42094 + }, + { + "epoch": 2.06, + "grad_norm": 0.7147414088249207, + "learning_rate": 0.0001332049365107463, + "loss": 2.9021, + "step": 42095 + }, + { + "epoch": 2.06, + "grad_norm": 0.6858595013618469, + "learning_rate": 0.0001331921395201281, + "loss": 2.8172, + "step": 42096 + }, + { + "epoch": 2.06, + "grad_norm": 0.6633896231651306, + "learning_rate": 0.00013317934296884978, + "loss": 2.9635, + "step": 42097 + }, + { + "epoch": 2.06, + "grad_norm": 0.6843210458755493, + "learning_rate": 0.00013316654685694497, + "loss": 3.0135, + "step": 42098 + }, + { + "epoch": 2.06, + "grad_norm": 0.7373406887054443, + "learning_rate": 0.00013315375118444733, + "loss": 2.8179, + "step": 42099 + }, + { + "epoch": 2.06, + "grad_norm": 0.6996890306472778, + "learning_rate": 0.00013314095595139057, + "loss": 2.9772, + "step": 42100 + }, + { + "epoch": 2.06, + "grad_norm": 0.6776704788208008, + "learning_rate": 0.00013312816115780836, + "loss": 2.874, + "step": 42101 + }, + { + "epoch": 2.06, + "grad_norm": 0.7402383089065552, + "learning_rate": 0.00013311536680373458, + "loss": 3.0208, + "step": 42102 + }, + { + "epoch": 2.06, + "grad_norm": 0.6806477308273315, + "learning_rate": 0.0001331025728892027, + "loss": 2.8178, + "step": 42103 + }, + { + "epoch": 2.06, + "grad_norm": 0.7480725049972534, + "learning_rate": 0.00013308977941424663, + "loss": 2.759, + "step": 42104 + }, + { + "epoch": 2.06, + "grad_norm": 0.6960058808326721, + "learning_rate": 0.00013307698637889996, + "loss": 3.0031, + "step": 42105 + }, + { + "epoch": 2.06, + "grad_norm": 0.7045592665672302, + "learning_rate": 0.00013306419378319626, + "loss": 2.8392, + "step": 42106 + }, + { + "epoch": 2.06, + "grad_norm": 0.6933001279830933, + "learning_rate": 0.00013305140162716948, + "loss": 2.8758, + "step": 42107 + }, + { + "epoch": 2.06, + "grad_norm": 0.6979026794433594, + "learning_rate": 0.00013303860991085306, + "loss": 2.8514, + "step": 42108 + }, + { + "epoch": 2.06, + "grad_norm": 0.6984500885009766, + "learning_rate": 0.00013302581863428094, + "loss": 2.853, + "step": 42109 + }, + { + "epoch": 2.06, + "grad_norm": 0.7042974829673767, + "learning_rate": 0.00013301302779748653, + "loss": 2.9368, + "step": 42110 + }, + { + "epoch": 2.06, + "grad_norm": 0.6589822173118591, + "learning_rate": 0.00013300023740050378, + "loss": 2.8031, + "step": 42111 + }, + { + "epoch": 2.06, + "grad_norm": 0.6807518005371094, + "learning_rate": 0.0001329874474433663, + "loss": 3.1159, + "step": 42112 + }, + { + "epoch": 2.06, + "grad_norm": 0.7369489669799805, + "learning_rate": 0.00013297465792610761, + "loss": 3.026, + "step": 42113 + }, + { + "epoch": 2.06, + "grad_norm": 0.6795954704284668, + "learning_rate": 0.00013296186884876164, + "loss": 2.8004, + "step": 42114 + }, + { + "epoch": 2.06, + "grad_norm": 0.6816520690917969, + "learning_rate": 0.00013294908021136185, + "loss": 2.9879, + "step": 42115 + }, + { + "epoch": 2.06, + "grad_norm": 0.6890973448753357, + "learning_rate": 0.00013293629201394203, + "loss": 2.9887, + "step": 42116 + }, + { + "epoch": 2.06, + "grad_norm": 0.6854380369186401, + "learning_rate": 0.00013292350425653596, + "loss": 3.0083, + "step": 42117 + }, + { + "epoch": 2.06, + "grad_norm": 0.7107696533203125, + "learning_rate": 0.00013291071693917713, + "loss": 2.912, + "step": 42118 + }, + { + "epoch": 2.06, + "grad_norm": 0.6896004676818848, + "learning_rate": 0.00013289793006189942, + "loss": 2.9846, + "step": 42119 + }, + { + "epoch": 2.06, + "grad_norm": 0.7244950532913208, + "learning_rate": 0.0001328851436247364, + "loss": 2.9464, + "step": 42120 + }, + { + "epoch": 2.06, + "grad_norm": 0.6931073069572449, + "learning_rate": 0.00013287235762772165, + "loss": 2.7884, + "step": 42121 + }, + { + "epoch": 2.06, + "grad_norm": 0.6745514869689941, + "learning_rate": 0.00013285957207088907, + "loss": 2.7589, + "step": 42122 + }, + { + "epoch": 2.06, + "grad_norm": 0.737249493598938, + "learning_rate": 0.00013284678695427206, + "loss": 2.9354, + "step": 42123 + }, + { + "epoch": 2.06, + "grad_norm": 0.6523367762565613, + "learning_rate": 0.0001328340022779046, + "loss": 2.7846, + "step": 42124 + }, + { + "epoch": 2.06, + "grad_norm": 0.6890686750411987, + "learning_rate": 0.0001328212180418201, + "loss": 3.1741, + "step": 42125 + }, + { + "epoch": 2.06, + "grad_norm": 0.7183402180671692, + "learning_rate": 0.00013280843424605239, + "loss": 2.9694, + "step": 42126 + }, + { + "epoch": 2.06, + "grad_norm": 0.6588950157165527, + "learning_rate": 0.0001327956508906351, + "loss": 2.9488, + "step": 42127 + }, + { + "epoch": 2.06, + "grad_norm": 0.6817463040351868, + "learning_rate": 0.00013278286797560184, + "loss": 2.8579, + "step": 42128 + }, + { + "epoch": 2.06, + "grad_norm": 0.6694334745407104, + "learning_rate": 0.00013277008550098638, + "loss": 2.9406, + "step": 42129 + }, + { + "epoch": 2.06, + "grad_norm": 0.6861422657966614, + "learning_rate": 0.0001327573034668222, + "loss": 3.0737, + "step": 42130 + }, + { + "epoch": 2.06, + "grad_norm": 0.6940646171569824, + "learning_rate": 0.00013274452187314316, + "loss": 2.8372, + "step": 42131 + }, + { + "epoch": 2.06, + "grad_norm": 0.6737503409385681, + "learning_rate": 0.0001327317407199829, + "loss": 3.0897, + "step": 42132 + }, + { + "epoch": 2.06, + "grad_norm": 0.692583441734314, + "learning_rate": 0.00013271896000737512, + "loss": 2.9895, + "step": 42133 + }, + { + "epoch": 2.06, + "grad_norm": 0.7276341915130615, + "learning_rate": 0.00013270617973535332, + "loss": 2.7632, + "step": 42134 + }, + { + "epoch": 2.06, + "grad_norm": 0.6521527767181396, + "learning_rate": 0.0001326933999039512, + "loss": 2.7446, + "step": 42135 + }, + { + "epoch": 2.06, + "grad_norm": 0.7015560865402222, + "learning_rate": 0.00013268062051320246, + "loss": 3.0649, + "step": 42136 + }, + { + "epoch": 2.07, + "grad_norm": 0.6922124028205872, + "learning_rate": 0.00013266784156314087, + "loss": 2.8697, + "step": 42137 + }, + { + "epoch": 2.07, + "grad_norm": 0.7114979028701782, + "learning_rate": 0.00013265506305379983, + "loss": 2.9049, + "step": 42138 + }, + { + "epoch": 2.07, + "grad_norm": 0.6223366260528564, + "learning_rate": 0.0001326422849852133, + "loss": 2.7975, + "step": 42139 + }, + { + "epoch": 2.07, + "grad_norm": 0.7140401005744934, + "learning_rate": 0.00013262950735741467, + "loss": 3.1222, + "step": 42140 + }, + { + "epoch": 2.07, + "grad_norm": 0.6884499192237854, + "learning_rate": 0.00013261673017043782, + "loss": 2.8993, + "step": 42141 + }, + { + "epoch": 2.07, + "grad_norm": 0.6920923590660095, + "learning_rate": 0.00013260395342431627, + "loss": 2.9722, + "step": 42142 + }, + { + "epoch": 2.07, + "grad_norm": 0.662397563457489, + "learning_rate": 0.00013259117711908358, + "loss": 3.0986, + "step": 42143 + }, + { + "epoch": 2.07, + "grad_norm": 0.6610293388366699, + "learning_rate": 0.00013257840125477363, + "loss": 3.0419, + "step": 42144 + }, + { + "epoch": 2.07, + "grad_norm": 0.7399603724479675, + "learning_rate": 0.00013256562583141985, + "loss": 2.8981, + "step": 42145 + }, + { + "epoch": 2.07, + "grad_norm": 0.681763768196106, + "learning_rate": 0.00013255285084905598, + "loss": 2.756, + "step": 42146 + }, + { + "epoch": 2.07, + "grad_norm": 0.7297256588935852, + "learning_rate": 0.0001325400763077158, + "loss": 3.0475, + "step": 42147 + }, + { + "epoch": 2.07, + "grad_norm": 0.6886538863182068, + "learning_rate": 0.0001325273022074328, + "loss": 3.1051, + "step": 42148 + }, + { + "epoch": 2.07, + "grad_norm": 0.7075888514518738, + "learning_rate": 0.00013251452854824068, + "loss": 3.0244, + "step": 42149 + }, + { + "epoch": 2.07, + "grad_norm": 0.6723986864089966, + "learning_rate": 0.00013250175533017293, + "loss": 3.032, + "step": 42150 + }, + { + "epoch": 2.07, + "grad_norm": 0.6775681972503662, + "learning_rate": 0.00013248898255326346, + "loss": 2.9657, + "step": 42151 + }, + { + "epoch": 2.07, + "grad_norm": 0.6784015893936157, + "learning_rate": 0.00013247621021754562, + "loss": 2.8706, + "step": 42152 + }, + { + "epoch": 2.07, + "grad_norm": 0.6836172342300415, + "learning_rate": 0.00013246343832305324, + "loss": 2.9455, + "step": 42153 + }, + { + "epoch": 2.07, + "grad_norm": 0.7069783806800842, + "learning_rate": 0.00013245066686982, + "loss": 2.92, + "step": 42154 + }, + { + "epoch": 2.07, + "grad_norm": 0.6457093358039856, + "learning_rate": 0.00013243789585787948, + "loss": 2.9114, + "step": 42155 + }, + { + "epoch": 2.07, + "grad_norm": 0.7186350226402283, + "learning_rate": 0.0001324251252872653, + "loss": 2.7841, + "step": 42156 + }, + { + "epoch": 2.07, + "grad_norm": 0.672428548336029, + "learning_rate": 0.00013241235515801093, + "loss": 3.0373, + "step": 42157 + }, + { + "epoch": 2.07, + "grad_norm": 0.7136212587356567, + "learning_rate": 0.0001323995854701502, + "loss": 2.8389, + "step": 42158 + }, + { + "epoch": 2.07, + "grad_norm": 0.6720640659332275, + "learning_rate": 0.0001323868162237168, + "loss": 2.7359, + "step": 42159 + }, + { + "epoch": 2.07, + "grad_norm": 0.6694479584693909, + "learning_rate": 0.0001323740474187442, + "loss": 2.7698, + "step": 42160 + }, + { + "epoch": 2.07, + "grad_norm": 0.6907969117164612, + "learning_rate": 0.0001323612790552662, + "loss": 2.7868, + "step": 42161 + }, + { + "epoch": 2.07, + "grad_norm": 0.6790691018104553, + "learning_rate": 0.0001323485111333162, + "loss": 3.055, + "step": 42162 + }, + { + "epoch": 2.07, + "grad_norm": 0.7087593674659729, + "learning_rate": 0.00013233574365292803, + "loss": 2.9253, + "step": 42163 + }, + { + "epoch": 2.07, + "grad_norm": 0.6997819542884827, + "learning_rate": 0.0001323229766141353, + "loss": 2.972, + "step": 42164 + }, + { + "epoch": 2.07, + "grad_norm": 0.7132028341293335, + "learning_rate": 0.00013231021001697146, + "loss": 2.8296, + "step": 42165 + }, + { + "epoch": 2.07, + "grad_norm": 0.6946109533309937, + "learning_rate": 0.00013229744386147036, + "loss": 2.8329, + "step": 42166 + }, + { + "epoch": 2.07, + "grad_norm": 0.7032156586647034, + "learning_rate": 0.0001322846781476654, + "loss": 2.7633, + "step": 42167 + }, + { + "epoch": 2.07, + "grad_norm": 0.6949759721755981, + "learning_rate": 0.0001322719128755903, + "loss": 3.0027, + "step": 42168 + }, + { + "epoch": 2.07, + "grad_norm": 0.6636202335357666, + "learning_rate": 0.00013225914804527883, + "loss": 2.9693, + "step": 42169 + }, + { + "epoch": 2.07, + "grad_norm": 0.6825640797615051, + "learning_rate": 0.00013224638365676451, + "loss": 3.0983, + "step": 42170 + }, + { + "epoch": 2.07, + "grad_norm": 0.669562816619873, + "learning_rate": 0.0001322336197100809, + "loss": 2.9744, + "step": 42171 + }, + { + "epoch": 2.07, + "grad_norm": 0.6761650443077087, + "learning_rate": 0.00013222085620526153, + "loss": 2.8963, + "step": 42172 + }, + { + "epoch": 2.07, + "grad_norm": 0.7180191278457642, + "learning_rate": 0.00013220809314234017, + "loss": 2.925, + "step": 42173 + }, + { + "epoch": 2.07, + "grad_norm": 0.6580066084861755, + "learning_rate": 0.00013219533052135051, + "loss": 2.7424, + "step": 42174 + }, + { + "epoch": 2.07, + "grad_norm": 0.7591055035591125, + "learning_rate": 0.0001321825683423259, + "loss": 2.7706, + "step": 42175 + }, + { + "epoch": 2.07, + "grad_norm": 0.6858645677566528, + "learning_rate": 0.00013216980660530025, + "loss": 2.8627, + "step": 42176 + }, + { + "epoch": 2.07, + "grad_norm": 0.7282005548477173, + "learning_rate": 0.000132157045310307, + "loss": 2.7958, + "step": 42177 + }, + { + "epoch": 2.07, + "grad_norm": 0.6546103954315186, + "learning_rate": 0.00013214428445737974, + "loss": 2.7244, + "step": 42178 + }, + { + "epoch": 2.07, + "grad_norm": 0.7295477986335754, + "learning_rate": 0.00013213152404655218, + "loss": 2.9721, + "step": 42179 + }, + { + "epoch": 2.07, + "grad_norm": 0.6619624495506287, + "learning_rate": 0.00013211876407785779, + "loss": 2.9897, + "step": 42180 + }, + { + "epoch": 2.07, + "grad_norm": 0.7149553298950195, + "learning_rate": 0.00013210600455133037, + "loss": 2.7235, + "step": 42181 + }, + { + "epoch": 2.07, + "grad_norm": 0.6860978603363037, + "learning_rate": 0.00013209324546700334, + "loss": 3.0681, + "step": 42182 + }, + { + "epoch": 2.07, + "grad_norm": 0.6987245678901672, + "learning_rate": 0.00013208048682491038, + "loss": 2.7269, + "step": 42183 + }, + { + "epoch": 2.07, + "grad_norm": 0.6757438778877258, + "learning_rate": 0.0001320677286250852, + "loss": 2.8652, + "step": 42184 + }, + { + "epoch": 2.07, + "grad_norm": 0.6813808679580688, + "learning_rate": 0.0001320549708675613, + "loss": 2.8605, + "step": 42185 + }, + { + "epoch": 2.07, + "grad_norm": 0.6611071825027466, + "learning_rate": 0.0001320422135523723, + "loss": 2.7863, + "step": 42186 + }, + { + "epoch": 2.07, + "grad_norm": 0.6753425598144531, + "learning_rate": 0.00013202945667955163, + "loss": 2.9375, + "step": 42187 + }, + { + "epoch": 2.07, + "grad_norm": 0.6801412105560303, + "learning_rate": 0.00013201670024913308, + "loss": 2.9191, + "step": 42188 + }, + { + "epoch": 2.07, + "grad_norm": 0.6811955571174622, + "learning_rate": 0.00013200394426115034, + "loss": 2.8201, + "step": 42189 + }, + { + "epoch": 2.07, + "grad_norm": 0.6670415997505188, + "learning_rate": 0.00013199118871563674, + "loss": 2.7303, + "step": 42190 + }, + { + "epoch": 2.07, + "grad_norm": 0.7115445733070374, + "learning_rate": 0.00013197843361262613, + "loss": 2.8177, + "step": 42191 + }, + { + "epoch": 2.07, + "grad_norm": 0.6806511878967285, + "learning_rate": 0.00013196567895215202, + "loss": 3.0169, + "step": 42192 + }, + { + "epoch": 2.07, + "grad_norm": 0.6739731431007385, + "learning_rate": 0.00013195292473424782, + "loss": 2.9967, + "step": 42193 + }, + { + "epoch": 2.07, + "grad_norm": 0.6881250143051147, + "learning_rate": 0.0001319401709589474, + "loss": 2.8704, + "step": 42194 + }, + { + "epoch": 2.07, + "grad_norm": 0.6715559363365173, + "learning_rate": 0.0001319274176262841, + "loss": 2.9177, + "step": 42195 + }, + { + "epoch": 2.07, + "grad_norm": 0.6683921813964844, + "learning_rate": 0.0001319146647362918, + "loss": 2.8413, + "step": 42196 + }, + { + "epoch": 2.07, + "grad_norm": 0.7129725813865662, + "learning_rate": 0.00013190191228900378, + "loss": 2.8465, + "step": 42197 + }, + { + "epoch": 2.07, + "grad_norm": 0.6918213963508606, + "learning_rate": 0.00013188916028445376, + "loss": 2.9743, + "step": 42198 + }, + { + "epoch": 2.07, + "grad_norm": 0.7041875720024109, + "learning_rate": 0.00013187640872267545, + "loss": 2.9971, + "step": 42199 + }, + { + "epoch": 2.07, + "grad_norm": 0.7519069314002991, + "learning_rate": 0.00013186365760370233, + "loss": 2.8209, + "step": 42200 + }, + { + "epoch": 2.07, + "grad_norm": 0.7009527087211609, + "learning_rate": 0.00013185090692756796, + "loss": 2.8803, + "step": 42201 + }, + { + "epoch": 2.07, + "grad_norm": 0.7051267623901367, + "learning_rate": 0.00013183815669430587, + "loss": 2.7405, + "step": 42202 + }, + { + "epoch": 2.07, + "grad_norm": 0.7135884761810303, + "learning_rate": 0.00013182540690394968, + "loss": 2.9261, + "step": 42203 + }, + { + "epoch": 2.07, + "grad_norm": 0.6404792070388794, + "learning_rate": 0.00013181265755653313, + "loss": 2.755, + "step": 42204 + }, + { + "epoch": 2.07, + "grad_norm": 0.7060980200767517, + "learning_rate": 0.00013179990865208957, + "loss": 2.8539, + "step": 42205 + }, + { + "epoch": 2.07, + "grad_norm": 0.6715150475502014, + "learning_rate": 0.00013178716019065275, + "loss": 2.8695, + "step": 42206 + }, + { + "epoch": 2.07, + "grad_norm": 0.7175034880638123, + "learning_rate": 0.0001317744121722562, + "loss": 2.8646, + "step": 42207 + }, + { + "epoch": 2.07, + "grad_norm": 0.669849157333374, + "learning_rate": 0.00013176166459693338, + "loss": 2.9067, + "step": 42208 + }, + { + "epoch": 2.07, + "grad_norm": 0.6535002589225769, + "learning_rate": 0.00013174891746471805, + "loss": 2.9545, + "step": 42209 + }, + { + "epoch": 2.07, + "grad_norm": 0.6995859146118164, + "learning_rate": 0.00013173617077564357, + "loss": 2.938, + "step": 42210 + }, + { + "epoch": 2.07, + "grad_norm": 0.6866672039031982, + "learning_rate": 0.00013172342452974378, + "loss": 2.8747, + "step": 42211 + }, + { + "epoch": 2.07, + "grad_norm": 0.7818604111671448, + "learning_rate": 0.00013171067872705196, + "loss": 3.0297, + "step": 42212 + }, + { + "epoch": 2.07, + "grad_norm": 0.6974015831947327, + "learning_rate": 0.00013169793336760194, + "loss": 2.9315, + "step": 42213 + }, + { + "epoch": 2.07, + "grad_norm": 0.683866560459137, + "learning_rate": 0.00013168518845142716, + "loss": 2.9169, + "step": 42214 + }, + { + "epoch": 2.07, + "grad_norm": 0.6737753748893738, + "learning_rate": 0.00013167244397856113, + "loss": 3.0056, + "step": 42215 + }, + { + "epoch": 2.07, + "grad_norm": 0.7229675054550171, + "learning_rate": 0.00013165969994903757, + "loss": 3.1931, + "step": 42216 + }, + { + "epoch": 2.07, + "grad_norm": 0.7247565388679504, + "learning_rate": 0.00013164695636288986, + "loss": 2.9204, + "step": 42217 + }, + { + "epoch": 2.07, + "grad_norm": 0.7432151436805725, + "learning_rate": 0.00013163421322015168, + "loss": 3.1582, + "step": 42218 + }, + { + "epoch": 2.07, + "grad_norm": 0.6654666662216187, + "learning_rate": 0.00013162147052085666, + "loss": 3.0593, + "step": 42219 + }, + { + "epoch": 2.07, + "grad_norm": 0.6909993886947632, + "learning_rate": 0.00013160872826503816, + "loss": 2.7321, + "step": 42220 + }, + { + "epoch": 2.07, + "grad_norm": 0.7081809043884277, + "learning_rate": 0.00013159598645273002, + "loss": 2.8623, + "step": 42221 + }, + { + "epoch": 2.07, + "grad_norm": 0.6888182163238525, + "learning_rate": 0.00013158324508396563, + "loss": 2.9768, + "step": 42222 + }, + { + "epoch": 2.07, + "grad_norm": 0.6894894242286682, + "learning_rate": 0.00013157050415877843, + "loss": 3.0968, + "step": 42223 + }, + { + "epoch": 2.07, + "grad_norm": 0.6939984560012817, + "learning_rate": 0.00013155776367720223, + "loss": 3.0215, + "step": 42224 + }, + { + "epoch": 2.07, + "grad_norm": 0.7307678461074829, + "learning_rate": 0.00013154502363927037, + "loss": 2.9241, + "step": 42225 + }, + { + "epoch": 2.07, + "grad_norm": 0.6858737468719482, + "learning_rate": 0.00013153228404501658, + "loss": 2.9914, + "step": 42226 + }, + { + "epoch": 2.07, + "grad_norm": 0.6769899725914001, + "learning_rate": 0.00013151954489447424, + "loss": 2.9458, + "step": 42227 + }, + { + "epoch": 2.07, + "grad_norm": 0.6825265884399414, + "learning_rate": 0.0001315068061876771, + "loss": 2.8933, + "step": 42228 + }, + { + "epoch": 2.07, + "grad_norm": 0.6815954446792603, + "learning_rate": 0.00013149406792465859, + "loss": 3.093, + "step": 42229 + }, + { + "epoch": 2.07, + "grad_norm": 0.652065634727478, + "learning_rate": 0.00013148133010545214, + "loss": 3.0218, + "step": 42230 + }, + { + "epoch": 2.07, + "grad_norm": 0.6865938901901245, + "learning_rate": 0.00013146859273009158, + "loss": 2.7974, + "step": 42231 + }, + { + "epoch": 2.07, + "grad_norm": 0.6675371527671814, + "learning_rate": 0.00013145585579861022, + "loss": 2.9922, + "step": 42232 + }, + { + "epoch": 2.07, + "grad_norm": 0.6868575811386108, + "learning_rate": 0.0001314431193110418, + "loss": 2.9725, + "step": 42233 + }, + { + "epoch": 2.07, + "grad_norm": 0.689803421497345, + "learning_rate": 0.00013143038326741963, + "loss": 2.7922, + "step": 42234 + }, + { + "epoch": 2.07, + "grad_norm": 0.7004591822624207, + "learning_rate": 0.00013141764766777753, + "loss": 2.8264, + "step": 42235 + }, + { + "epoch": 2.07, + "grad_norm": 0.7188933491706848, + "learning_rate": 0.00013140491251214888, + "loss": 2.9029, + "step": 42236 + }, + { + "epoch": 2.07, + "grad_norm": 0.6429142355918884, + "learning_rate": 0.00013139217780056713, + "loss": 2.8006, + "step": 42237 + }, + { + "epoch": 2.07, + "grad_norm": 0.6708921194076538, + "learning_rate": 0.00013137944353306603, + "loss": 2.9435, + "step": 42238 + }, + { + "epoch": 2.07, + "grad_norm": 0.6704015135765076, + "learning_rate": 0.00013136670970967897, + "loss": 2.9552, + "step": 42239 + }, + { + "epoch": 2.07, + "grad_norm": 0.7032222151756287, + "learning_rate": 0.0001313539763304395, + "loss": 2.898, + "step": 42240 + }, + { + "epoch": 2.07, + "grad_norm": 0.6727782487869263, + "learning_rate": 0.0001313412433953813, + "loss": 3.1218, + "step": 42241 + }, + { + "epoch": 2.07, + "grad_norm": 0.7007694244384766, + "learning_rate": 0.00013132851090453775, + "loss": 3.0368, + "step": 42242 + }, + { + "epoch": 2.07, + "grad_norm": 0.6816314458847046, + "learning_rate": 0.00013131577885794248, + "loss": 2.9854, + "step": 42243 + }, + { + "epoch": 2.07, + "grad_norm": 0.7471539974212646, + "learning_rate": 0.00013130304725562906, + "loss": 2.9592, + "step": 42244 + }, + { + "epoch": 2.07, + "grad_norm": 0.6539857983589172, + "learning_rate": 0.0001312903160976308, + "loss": 2.988, + "step": 42245 + }, + { + "epoch": 2.07, + "grad_norm": 0.6955440044403076, + "learning_rate": 0.00013127758538398145, + "loss": 3.0305, + "step": 42246 + }, + { + "epoch": 2.07, + "grad_norm": 0.6992368698120117, + "learning_rate": 0.0001312648551147144, + "loss": 2.9075, + "step": 42247 + }, + { + "epoch": 2.07, + "grad_norm": 0.7383258938789368, + "learning_rate": 0.00013125212528986336, + "loss": 2.7885, + "step": 42248 + }, + { + "epoch": 2.07, + "grad_norm": 0.7086347341537476, + "learning_rate": 0.00013123939590946164, + "loss": 3.0402, + "step": 42249 + }, + { + "epoch": 2.07, + "grad_norm": 0.715691089630127, + "learning_rate": 0.00013122666697354296, + "loss": 2.9558, + "step": 42250 + }, + { + "epoch": 2.07, + "grad_norm": 0.6848446726799011, + "learning_rate": 0.00013121393848214077, + "loss": 2.9925, + "step": 42251 + }, + { + "epoch": 2.07, + "grad_norm": 0.679757297039032, + "learning_rate": 0.00013120121043528845, + "loss": 2.9478, + "step": 42252 + }, + { + "epoch": 2.07, + "grad_norm": 0.6886526346206665, + "learning_rate": 0.00013118848283301981, + "loss": 3.0612, + "step": 42253 + }, + { + "epoch": 2.07, + "grad_norm": 0.7555577158927917, + "learning_rate": 0.00013117575567536807, + "loss": 3.0184, + "step": 42254 + }, + { + "epoch": 2.07, + "grad_norm": 0.6762755513191223, + "learning_rate": 0.00013116302896236693, + "loss": 2.84, + "step": 42255 + }, + { + "epoch": 2.07, + "grad_norm": 0.6835176944732666, + "learning_rate": 0.00013115030269404996, + "loss": 2.9008, + "step": 42256 + }, + { + "epoch": 2.07, + "grad_norm": 0.6507018804550171, + "learning_rate": 0.00013113757687045063, + "loss": 2.7376, + "step": 42257 + }, + { + "epoch": 2.07, + "grad_norm": 0.7082822322845459, + "learning_rate": 0.0001311248514916024, + "loss": 3.1054, + "step": 42258 + }, + { + "epoch": 2.07, + "grad_norm": 0.6697868704795837, + "learning_rate": 0.0001311121265575387, + "loss": 2.9395, + "step": 42259 + }, + { + "epoch": 2.07, + "grad_norm": 0.7068183422088623, + "learning_rate": 0.00013109940206829316, + "loss": 2.9129, + "step": 42260 + }, + { + "epoch": 2.07, + "grad_norm": 0.7108052968978882, + "learning_rate": 0.0001310866780238994, + "loss": 3.0556, + "step": 42261 + }, + { + "epoch": 2.07, + "grad_norm": 0.7249935865402222, + "learning_rate": 0.00013107395442439073, + "loss": 3.0242, + "step": 42262 + }, + { + "epoch": 2.07, + "grad_norm": 0.6782382130622864, + "learning_rate": 0.00013106123126980083, + "loss": 2.988, + "step": 42263 + }, + { + "epoch": 2.07, + "grad_norm": 0.6726512908935547, + "learning_rate": 0.00013104850856016304, + "loss": 2.9917, + "step": 42264 + }, + { + "epoch": 2.07, + "grad_norm": 0.6596064567565918, + "learning_rate": 0.00013103578629551104, + "loss": 3.1468, + "step": 42265 + }, + { + "epoch": 2.07, + "grad_norm": 0.6861616969108582, + "learning_rate": 0.0001310230644758783, + "loss": 2.9792, + "step": 42266 + }, + { + "epoch": 2.07, + "grad_norm": 0.7167417407035828, + "learning_rate": 0.0001310103431012982, + "loss": 2.8683, + "step": 42267 + }, + { + "epoch": 2.07, + "grad_norm": 0.671953558921814, + "learning_rate": 0.0001309976221718044, + "loss": 3.0194, + "step": 42268 + }, + { + "epoch": 2.07, + "grad_norm": 0.6684266924858093, + "learning_rate": 0.00013098490168743024, + "loss": 2.8826, + "step": 42269 + }, + { + "epoch": 2.07, + "grad_norm": 0.6714996099472046, + "learning_rate": 0.00013097218164820934, + "loss": 2.8523, + "step": 42270 + }, + { + "epoch": 2.07, + "grad_norm": 0.6615957617759705, + "learning_rate": 0.00013095946205417527, + "loss": 2.9081, + "step": 42271 + }, + { + "epoch": 2.07, + "grad_norm": 0.7182602882385254, + "learning_rate": 0.00013094674290536144, + "loss": 2.9164, + "step": 42272 + }, + { + "epoch": 2.07, + "grad_norm": 0.6356480717658997, + "learning_rate": 0.00013093402420180137, + "loss": 2.974, + "step": 42273 + }, + { + "epoch": 2.07, + "grad_norm": 0.6807072162628174, + "learning_rate": 0.00013092130594352845, + "loss": 2.9493, + "step": 42274 + }, + { + "epoch": 2.07, + "grad_norm": 0.7285915613174438, + "learning_rate": 0.00013090858813057622, + "loss": 3.148, + "step": 42275 + }, + { + "epoch": 2.07, + "grad_norm": 0.6464709639549255, + "learning_rate": 0.0001308958707629784, + "loss": 3.0066, + "step": 42276 + }, + { + "epoch": 2.07, + "grad_norm": 0.7085345387458801, + "learning_rate": 0.00013088315384076813, + "loss": 2.9503, + "step": 42277 + }, + { + "epoch": 2.07, + "grad_norm": 0.7010959386825562, + "learning_rate": 0.00013087043736397923, + "loss": 2.9879, + "step": 42278 + }, + { + "epoch": 2.07, + "grad_norm": 0.6488636136054993, + "learning_rate": 0.00013085772133264505, + "loss": 3.0544, + "step": 42279 + }, + { + "epoch": 2.07, + "grad_norm": 0.6806224584579468, + "learning_rate": 0.00013084500574679896, + "loss": 3.0751, + "step": 42280 + }, + { + "epoch": 2.07, + "grad_norm": 0.7017746567726135, + "learning_rate": 0.00013083229060647473, + "loss": 2.8714, + "step": 42281 + }, + { + "epoch": 2.07, + "grad_norm": 0.7165690660476685, + "learning_rate": 0.00013081957591170551, + "loss": 2.8805, + "step": 42282 + }, + { + "epoch": 2.07, + "grad_norm": 0.6798978447914124, + "learning_rate": 0.00013080686166252515, + "loss": 2.9131, + "step": 42283 + }, + { + "epoch": 2.07, + "grad_norm": 0.6638495922088623, + "learning_rate": 0.00013079414785896676, + "loss": 2.9656, + "step": 42284 + }, + { + "epoch": 2.07, + "grad_norm": 0.7176745533943176, + "learning_rate": 0.00013078143450106407, + "loss": 2.8036, + "step": 42285 + }, + { + "epoch": 2.07, + "grad_norm": 0.6890358328819275, + "learning_rate": 0.00013076872158885062, + "loss": 2.8643, + "step": 42286 + }, + { + "epoch": 2.07, + "grad_norm": 0.6619595289230347, + "learning_rate": 0.0001307560091223598, + "loss": 2.8526, + "step": 42287 + }, + { + "epoch": 2.07, + "grad_norm": 0.682287871837616, + "learning_rate": 0.00013074329710162507, + "loss": 2.9174, + "step": 42288 + }, + { + "epoch": 2.07, + "grad_norm": 0.6877726912498474, + "learning_rate": 0.00013073058552667978, + "loss": 3.0925, + "step": 42289 + }, + { + "epoch": 2.07, + "grad_norm": 0.6723368167877197, + "learning_rate": 0.0001307178743975576, + "loss": 2.8354, + "step": 42290 + }, + { + "epoch": 2.07, + "grad_norm": 0.6675642132759094, + "learning_rate": 0.00013070516371429204, + "loss": 2.9596, + "step": 42291 + }, + { + "epoch": 2.07, + "grad_norm": 0.6696404218673706, + "learning_rate": 0.0001306924534769164, + "loss": 2.8843, + "step": 42292 + }, + { + "epoch": 2.07, + "grad_norm": 1.3090136051177979, + "learning_rate": 0.00013067974368546437, + "loss": 3.0016, + "step": 42293 + }, + { + "epoch": 2.07, + "grad_norm": 0.6999812722206116, + "learning_rate": 0.0001306670343399693, + "loss": 2.9312, + "step": 42294 + }, + { + "epoch": 2.07, + "grad_norm": 0.6614927053451538, + "learning_rate": 0.00013065432544046455, + "loss": 2.9818, + "step": 42295 + }, + { + "epoch": 2.07, + "grad_norm": 0.6659789085388184, + "learning_rate": 0.00013064161698698383, + "loss": 2.8157, + "step": 42296 + }, + { + "epoch": 2.07, + "grad_norm": 0.6961437463760376, + "learning_rate": 0.0001306289089795604, + "loss": 2.8556, + "step": 42297 + }, + { + "epoch": 2.07, + "grad_norm": 0.6694318056106567, + "learning_rate": 0.00013061620141822792, + "loss": 2.9884, + "step": 42298 + }, + { + "epoch": 2.07, + "grad_norm": 0.6987858414649963, + "learning_rate": 0.0001306034943030197, + "loss": 2.8139, + "step": 42299 + }, + { + "epoch": 2.07, + "grad_norm": 0.6902264952659607, + "learning_rate": 0.00013059078763396923, + "loss": 2.9773, + "step": 42300 + }, + { + "epoch": 2.07, + "grad_norm": 0.6752839088439941, + "learning_rate": 0.00013057808141111026, + "loss": 2.8498, + "step": 42301 + }, + { + "epoch": 2.07, + "grad_norm": 0.7169557809829712, + "learning_rate": 0.00013056537563447578, + "loss": 2.7842, + "step": 42302 + }, + { + "epoch": 2.07, + "grad_norm": 0.7090547680854797, + "learning_rate": 0.00013055267030409964, + "loss": 3.2104, + "step": 42303 + }, + { + "epoch": 2.07, + "grad_norm": 0.6589310169219971, + "learning_rate": 0.00013053996542001503, + "loss": 2.7599, + "step": 42304 + }, + { + "epoch": 2.07, + "grad_norm": 0.7081121802330017, + "learning_rate": 0.00013052726098225563, + "loss": 3.0029, + "step": 42305 + }, + { + "epoch": 2.07, + "grad_norm": 0.6937905550003052, + "learning_rate": 0.0001305145569908547, + "loss": 3.0007, + "step": 42306 + }, + { + "epoch": 2.07, + "grad_norm": 0.6769243478775024, + "learning_rate": 0.0001305018534458458, + "loss": 2.8763, + "step": 42307 + }, + { + "epoch": 2.07, + "grad_norm": 0.7453411221504211, + "learning_rate": 0.00013048915034726254, + "loss": 2.8378, + "step": 42308 + }, + { + "epoch": 2.07, + "grad_norm": 0.674867570400238, + "learning_rate": 0.0001304764476951382, + "loss": 3.0571, + "step": 42309 + }, + { + "epoch": 2.07, + "grad_norm": 0.707789421081543, + "learning_rate": 0.00013046374548950632, + "loss": 3.1763, + "step": 42310 + }, + { + "epoch": 2.07, + "grad_norm": 0.7105609178543091, + "learning_rate": 0.00013045104373040015, + "loss": 2.9595, + "step": 42311 + }, + { + "epoch": 2.07, + "grad_norm": 0.7053055167198181, + "learning_rate": 0.0001304383424178533, + "loss": 2.9741, + "step": 42312 + }, + { + "epoch": 2.07, + "grad_norm": 0.6682379841804504, + "learning_rate": 0.00013042564155189937, + "loss": 2.9207, + "step": 42313 + }, + { + "epoch": 2.07, + "grad_norm": 0.6992611289024353, + "learning_rate": 0.00013041294113257153, + "loss": 2.9781, + "step": 42314 + }, + { + "epoch": 2.07, + "grad_norm": 0.6636302471160889, + "learning_rate": 0.0001304002411599035, + "loss": 2.9719, + "step": 42315 + }, + { + "epoch": 2.07, + "grad_norm": 0.6672903299331665, + "learning_rate": 0.00013038754163392858, + "loss": 3.0387, + "step": 42316 + }, + { + "epoch": 2.07, + "grad_norm": 0.7034769654273987, + "learning_rate": 0.0001303748425546801, + "loss": 2.8552, + "step": 42317 + }, + { + "epoch": 2.07, + "grad_norm": 0.665863037109375, + "learning_rate": 0.00013036214392219176, + "loss": 2.9452, + "step": 42318 + }, + { + "epoch": 2.07, + "grad_norm": 0.6769158244132996, + "learning_rate": 0.0001303494457364968, + "loss": 2.951, + "step": 42319 + }, + { + "epoch": 2.07, + "grad_norm": 0.6977242827415466, + "learning_rate": 0.00013033674799762884, + "loss": 2.9524, + "step": 42320 + }, + { + "epoch": 2.07, + "grad_norm": 0.6679743528366089, + "learning_rate": 0.00013032405070562112, + "loss": 2.8681, + "step": 42321 + }, + { + "epoch": 2.07, + "grad_norm": 0.7165648341178894, + "learning_rate": 0.0001303113538605072, + "loss": 3.006, + "step": 42322 + }, + { + "epoch": 2.07, + "grad_norm": 0.7313899993896484, + "learning_rate": 0.00013029865746232063, + "loss": 2.9289, + "step": 42323 + }, + { + "epoch": 2.07, + "grad_norm": 0.6785582304000854, + "learning_rate": 0.00013028596151109473, + "loss": 2.9195, + "step": 42324 + }, + { + "epoch": 2.07, + "grad_norm": 0.6553201079368591, + "learning_rate": 0.00013027326600686294, + "loss": 2.9507, + "step": 42325 + }, + { + "epoch": 2.07, + "grad_norm": 0.7384634017944336, + "learning_rate": 0.0001302605709496586, + "loss": 2.8676, + "step": 42326 + }, + { + "epoch": 2.07, + "grad_norm": 0.6803930401802063, + "learning_rate": 0.00013024787633951528, + "loss": 2.7254, + "step": 42327 + }, + { + "epoch": 2.07, + "grad_norm": 0.7045084834098816, + "learning_rate": 0.0001302351821764665, + "loss": 2.9962, + "step": 42328 + }, + { + "epoch": 2.07, + "grad_norm": 0.7233037352561951, + "learning_rate": 0.00013022248846054538, + "loss": 2.6192, + "step": 42329 + }, + { + "epoch": 2.07, + "grad_norm": 0.7065985798835754, + "learning_rate": 0.00013020979519178574, + "loss": 3.0464, + "step": 42330 + }, + { + "epoch": 2.07, + "grad_norm": 0.6664899587631226, + "learning_rate": 0.00013019710237022078, + "loss": 2.9836, + "step": 42331 + }, + { + "epoch": 2.07, + "grad_norm": 0.6708837151527405, + "learning_rate": 0.0001301844099958839, + "loss": 3.0906, + "step": 42332 + }, + { + "epoch": 2.07, + "grad_norm": 0.6945251822471619, + "learning_rate": 0.0001301717180688087, + "loss": 2.8813, + "step": 42333 + }, + { + "epoch": 2.07, + "grad_norm": 0.6718724370002747, + "learning_rate": 0.00013015902658902837, + "loss": 3.0567, + "step": 42334 + }, + { + "epoch": 2.07, + "grad_norm": 0.6948716044425964, + "learning_rate": 0.00013014633555657665, + "loss": 3.2154, + "step": 42335 + }, + { + "epoch": 2.07, + "grad_norm": 0.6696521639823914, + "learning_rate": 0.00013013364497148661, + "loss": 2.8059, + "step": 42336 + }, + { + "epoch": 2.07, + "grad_norm": 0.723652184009552, + "learning_rate": 0.000130120954833792, + "loss": 2.7653, + "step": 42337 + }, + { + "epoch": 2.07, + "grad_norm": 0.7283807992935181, + "learning_rate": 0.00013010826514352615, + "loss": 2.9663, + "step": 42338 + }, + { + "epoch": 2.07, + "grad_norm": 0.7475623488426208, + "learning_rate": 0.00013009557590072226, + "loss": 3.0521, + "step": 42339 + }, + { + "epoch": 2.07, + "grad_norm": 0.6831005811691284, + "learning_rate": 0.00013008288710541406, + "loss": 3.1819, + "step": 42340 + }, + { + "epoch": 2.08, + "grad_norm": 0.6755504012107849, + "learning_rate": 0.0001300701987576347, + "loss": 2.8866, + "step": 42341 + }, + { + "epoch": 2.08, + "grad_norm": 0.6740015149116516, + "learning_rate": 0.00013005751085741777, + "loss": 2.8911, + "step": 42342 + }, + { + "epoch": 2.08, + "grad_norm": 0.6407496929168701, + "learning_rate": 0.00013004482340479672, + "loss": 2.9256, + "step": 42343 + }, + { + "epoch": 2.08, + "grad_norm": 0.7576172947883606, + "learning_rate": 0.0001300321363998048, + "loss": 3.1794, + "step": 42344 + }, + { + "epoch": 2.08, + "grad_norm": 0.6567968726158142, + "learning_rate": 0.00013001944984247565, + "loss": 2.9828, + "step": 42345 + }, + { + "epoch": 2.08, + "grad_norm": 0.7320488691329956, + "learning_rate": 0.00013000676373284254, + "loss": 2.9482, + "step": 42346 + }, + { + "epoch": 2.08, + "grad_norm": 0.646527886390686, + "learning_rate": 0.0001299940780709388, + "loss": 2.8785, + "step": 42347 + }, + { + "epoch": 2.08, + "grad_norm": 0.6653791069984436, + "learning_rate": 0.00012998139285679802, + "loss": 3.0511, + "step": 42348 + }, + { + "epoch": 2.08, + "grad_norm": 0.6764574646949768, + "learning_rate": 0.00012996870809045346, + "loss": 2.9864, + "step": 42349 + }, + { + "epoch": 2.08, + "grad_norm": 0.6396170854568481, + "learning_rate": 0.00012995602377193868, + "loss": 2.8156, + "step": 42350 + }, + { + "epoch": 2.08, + "grad_norm": 0.6769349575042725, + "learning_rate": 0.00012994333990128693, + "loss": 2.9659, + "step": 42351 + }, + { + "epoch": 2.08, + "grad_norm": 0.711395263671875, + "learning_rate": 0.00012993065647853177, + "loss": 3.0182, + "step": 42352 + }, + { + "epoch": 2.08, + "grad_norm": 0.6921821236610413, + "learning_rate": 0.00012991797350370656, + "loss": 2.8534, + "step": 42353 + }, + { + "epoch": 2.08, + "grad_norm": 0.686550498008728, + "learning_rate": 0.00012990529097684455, + "loss": 2.9083, + "step": 42354 + }, + { + "epoch": 2.08, + "grad_norm": 0.7336958646774292, + "learning_rate": 0.00012989260889797934, + "loss": 2.7641, + "step": 42355 + }, + { + "epoch": 2.08, + "grad_norm": 0.7120491862297058, + "learning_rate": 0.0001298799272671442, + "loss": 2.9055, + "step": 42356 + }, + { + "epoch": 2.08, + "grad_norm": 0.6996833086013794, + "learning_rate": 0.0001298672460843726, + "loss": 2.9461, + "step": 42357 + }, + { + "epoch": 2.08, + "grad_norm": 0.7006560564041138, + "learning_rate": 0.000129854565349698, + "loss": 2.8888, + "step": 42358 + }, + { + "epoch": 2.08, + "grad_norm": 0.6544038653373718, + "learning_rate": 0.0001298418850631538, + "loss": 2.8266, + "step": 42359 + }, + { + "epoch": 2.08, + "grad_norm": 0.7305658459663391, + "learning_rate": 0.00012982920522477324, + "loss": 3.1084, + "step": 42360 + }, + { + "epoch": 2.08, + "grad_norm": 0.6678386330604553, + "learning_rate": 0.00012981652583458973, + "loss": 2.7814, + "step": 42361 + }, + { + "epoch": 2.08, + "grad_norm": 0.6706693768501282, + "learning_rate": 0.00012980384689263675, + "loss": 2.9772, + "step": 42362 + }, + { + "epoch": 2.08, + "grad_norm": 0.7006844878196716, + "learning_rate": 0.00012979116839894779, + "loss": 2.9108, + "step": 42363 + }, + { + "epoch": 2.08, + "grad_norm": 0.7549619078636169, + "learning_rate": 0.000129778490353556, + "loss": 3.0531, + "step": 42364 + }, + { + "epoch": 2.08, + "grad_norm": 0.6899545192718506, + "learning_rate": 0.00012976581275649503, + "loss": 2.8675, + "step": 42365 + }, + { + "epoch": 2.08, + "grad_norm": 0.6634476184844971, + "learning_rate": 0.00012975313560779805, + "loss": 3.0111, + "step": 42366 + }, + { + "epoch": 2.08, + "grad_norm": 0.6710315942764282, + "learning_rate": 0.00012974045890749866, + "loss": 2.9529, + "step": 42367 + }, + { + "epoch": 2.08, + "grad_norm": 0.6514169573783875, + "learning_rate": 0.0001297277826556301, + "loss": 2.697, + "step": 42368 + }, + { + "epoch": 2.08, + "grad_norm": 0.6755321025848389, + "learning_rate": 0.00012971510685222572, + "loss": 3.0741, + "step": 42369 + }, + { + "epoch": 2.08, + "grad_norm": 0.7050230503082275, + "learning_rate": 0.00012970243149731904, + "loss": 2.6275, + "step": 42370 + }, + { + "epoch": 2.08, + "grad_norm": 0.7091357707977295, + "learning_rate": 0.00012968975659094332, + "loss": 2.6535, + "step": 42371 + }, + { + "epoch": 2.08, + "grad_norm": 0.6768811941146851, + "learning_rate": 0.00012967708213313197, + "loss": 3.0096, + "step": 42372 + }, + { + "epoch": 2.08, + "grad_norm": 0.6693670153617859, + "learning_rate": 0.00012966440812391854, + "loss": 2.8613, + "step": 42373 + }, + { + "epoch": 2.08, + "grad_norm": 0.7181096076965332, + "learning_rate": 0.00012965173456333626, + "loss": 2.7873, + "step": 42374 + }, + { + "epoch": 2.08, + "grad_norm": 0.7134459614753723, + "learning_rate": 0.00012963906145141855, + "loss": 2.8422, + "step": 42375 + }, + { + "epoch": 2.08, + "grad_norm": 0.7182124853134155, + "learning_rate": 0.00012962638878819864, + "loss": 2.9213, + "step": 42376 + }, + { + "epoch": 2.08, + "grad_norm": 0.7274218201637268, + "learning_rate": 0.00012961371657371003, + "loss": 2.7016, + "step": 42377 + }, + { + "epoch": 2.08, + "grad_norm": 0.6675017476081848, + "learning_rate": 0.0001296010448079862, + "loss": 2.8717, + "step": 42378 + }, + { + "epoch": 2.08, + "grad_norm": 0.7228912115097046, + "learning_rate": 0.00012958837349106032, + "loss": 3.0004, + "step": 42379 + }, + { + "epoch": 2.08, + "grad_norm": 0.6356688737869263, + "learning_rate": 0.000129575702622966, + "loss": 2.5134, + "step": 42380 + }, + { + "epoch": 2.08, + "grad_norm": 0.663016140460968, + "learning_rate": 0.00012956303220373647, + "loss": 2.9052, + "step": 42381 + }, + { + "epoch": 2.08, + "grad_norm": 0.6874765157699585, + "learning_rate": 0.00012955036223340498, + "loss": 3.127, + "step": 42382 + }, + { + "epoch": 2.08, + "grad_norm": 0.7107385396957397, + "learning_rate": 0.00012953769271200518, + "loss": 2.9775, + "step": 42383 + }, + { + "epoch": 2.08, + "grad_norm": 0.7340335249900818, + "learning_rate": 0.00012952502363957014, + "loss": 2.8887, + "step": 42384 + }, + { + "epoch": 2.08, + "grad_norm": 0.6798539161682129, + "learning_rate": 0.00012951235501613352, + "loss": 2.9121, + "step": 42385 + }, + { + "epoch": 2.08, + "grad_norm": 0.7123425006866455, + "learning_rate": 0.00012949968684172845, + "loss": 3.0914, + "step": 42386 + }, + { + "epoch": 2.08, + "grad_norm": 0.6899136900901794, + "learning_rate": 0.00012948701911638845, + "loss": 2.9722, + "step": 42387 + }, + { + "epoch": 2.08, + "grad_norm": 0.7115585207939148, + "learning_rate": 0.00012947435184014673, + "loss": 3.0044, + "step": 42388 + }, + { + "epoch": 2.08, + "grad_norm": 0.6917968392372131, + "learning_rate": 0.0001294616850130369, + "loss": 3.1324, + "step": 42389 + }, + { + "epoch": 2.08, + "grad_norm": 0.6570953726768494, + "learning_rate": 0.00012944901863509213, + "loss": 2.9288, + "step": 42390 + }, + { + "epoch": 2.08, + "grad_norm": 0.6744385957717896, + "learning_rate": 0.00012943635270634574, + "loss": 2.7654, + "step": 42391 + }, + { + "epoch": 2.08, + "grad_norm": 0.7136799693107605, + "learning_rate": 0.00012942368722683125, + "loss": 2.899, + "step": 42392 + }, + { + "epoch": 2.08, + "grad_norm": 0.6737086772918701, + "learning_rate": 0.00012941102219658182, + "loss": 2.7251, + "step": 42393 + }, + { + "epoch": 2.08, + "grad_norm": 0.6692909598350525, + "learning_rate": 0.00012939835761563095, + "loss": 3.0238, + "step": 42394 + }, + { + "epoch": 2.08, + "grad_norm": 0.6992953419685364, + "learning_rate": 0.0001293856934840121, + "loss": 2.8346, + "step": 42395 + }, + { + "epoch": 2.08, + "grad_norm": 0.6643773913383484, + "learning_rate": 0.00012937302980175848, + "loss": 2.7622, + "step": 42396 + }, + { + "epoch": 2.08, + "grad_norm": 0.6850830912590027, + "learning_rate": 0.00012936036656890345, + "loss": 2.9824, + "step": 42397 + }, + { + "epoch": 2.08, + "grad_norm": 0.7364450097084045, + "learning_rate": 0.0001293477037854803, + "loss": 2.953, + "step": 42398 + }, + { + "epoch": 2.08, + "grad_norm": 0.7071236968040466, + "learning_rate": 0.00012933504145152245, + "loss": 3.1013, + "step": 42399 + }, + { + "epoch": 2.08, + "grad_norm": 0.7141284942626953, + "learning_rate": 0.00012932237956706332, + "loss": 2.7942, + "step": 42400 + }, + { + "epoch": 2.08, + "grad_norm": 0.7371499538421631, + "learning_rate": 0.00012930971813213612, + "loss": 2.533, + "step": 42401 + }, + { + "epoch": 2.08, + "grad_norm": 0.7052841782569885, + "learning_rate": 0.0001292970571467744, + "loss": 2.8772, + "step": 42402 + }, + { + "epoch": 2.08, + "grad_norm": 0.7750392556190491, + "learning_rate": 0.00012928439661101133, + "loss": 2.9739, + "step": 42403 + }, + { + "epoch": 2.08, + "grad_norm": 0.7135288715362549, + "learning_rate": 0.00012927173652488023, + "loss": 2.8134, + "step": 42404 + }, + { + "epoch": 2.08, + "grad_norm": 0.6652043461799622, + "learning_rate": 0.00012925907688841464, + "loss": 3.063, + "step": 42405 + }, + { + "epoch": 2.08, + "grad_norm": 0.6913604736328125, + "learning_rate": 0.00012924641770164762, + "loss": 2.7423, + "step": 42406 + }, + { + "epoch": 2.08, + "grad_norm": 0.7233631014823914, + "learning_rate": 0.0001292337589646128, + "loss": 2.9022, + "step": 42407 + }, + { + "epoch": 2.08, + "grad_norm": 0.6916614174842834, + "learning_rate": 0.0001292211006773433, + "loss": 2.9954, + "step": 42408 + }, + { + "epoch": 2.08, + "grad_norm": 0.7121450901031494, + "learning_rate": 0.00012920844283987256, + "loss": 3.0694, + "step": 42409 + }, + { + "epoch": 2.08, + "grad_norm": 0.7093349695205688, + "learning_rate": 0.00012919578545223404, + "loss": 3.1103, + "step": 42410 + }, + { + "epoch": 2.08, + "grad_norm": 0.6733999848365784, + "learning_rate": 0.0001291831285144609, + "loss": 2.8649, + "step": 42411 + }, + { + "epoch": 2.08, + "grad_norm": 0.6598990559577942, + "learning_rate": 0.00012917047202658656, + "loss": 3.0327, + "step": 42412 + }, + { + "epoch": 2.08, + "grad_norm": 0.6923961043357849, + "learning_rate": 0.00012915781598864422, + "loss": 2.8068, + "step": 42413 + }, + { + "epoch": 2.08, + "grad_norm": 0.6872490048408508, + "learning_rate": 0.00012914516040066727, + "loss": 2.7399, + "step": 42414 + }, + { + "epoch": 2.08, + "grad_norm": 0.6673179268836975, + "learning_rate": 0.00012913250526268923, + "loss": 2.9066, + "step": 42415 + }, + { + "epoch": 2.08, + "grad_norm": 0.6778900027275085, + "learning_rate": 0.00012911985057474315, + "loss": 3.0026, + "step": 42416 + }, + { + "epoch": 2.08, + "grad_norm": 0.7145534157752991, + "learning_rate": 0.00012910719633686264, + "loss": 2.9472, + "step": 42417 + }, + { + "epoch": 2.08, + "grad_norm": 0.6881458163261414, + "learning_rate": 0.00012909454254908084, + "loss": 2.8058, + "step": 42418 + }, + { + "epoch": 2.08, + "grad_norm": 0.6922593712806702, + "learning_rate": 0.00012908188921143104, + "loss": 2.9075, + "step": 42419 + }, + { + "epoch": 2.08, + "grad_norm": 0.6974817514419556, + "learning_rate": 0.00012906923632394676, + "loss": 3.0196, + "step": 42420 + }, + { + "epoch": 2.08, + "grad_norm": 0.6990817189216614, + "learning_rate": 0.0001290565838866611, + "loss": 2.8226, + "step": 42421 + }, + { + "epoch": 2.08, + "grad_norm": 0.7017726302146912, + "learning_rate": 0.00012904393189960763, + "loss": 2.9575, + "step": 42422 + }, + { + "epoch": 2.08, + "grad_norm": 0.7260956168174744, + "learning_rate": 0.00012903128036281943, + "loss": 2.9157, + "step": 42423 + }, + { + "epoch": 2.08, + "grad_norm": 0.675983726978302, + "learning_rate": 0.00012901862927632994, + "loss": 2.8013, + "step": 42424 + }, + { + "epoch": 2.08, + "grad_norm": 0.7136183977127075, + "learning_rate": 0.00012900597864017258, + "loss": 2.784, + "step": 42425 + }, + { + "epoch": 2.08, + "grad_norm": 0.7095462679862976, + "learning_rate": 0.00012899332845438052, + "loss": 2.9734, + "step": 42426 + }, + { + "epoch": 2.08, + "grad_norm": 0.6412888765335083, + "learning_rate": 0.00012898067871898716, + "loss": 3.1996, + "step": 42427 + }, + { + "epoch": 2.08, + "grad_norm": 0.7176571488380432, + "learning_rate": 0.00012896802943402564, + "loss": 2.721, + "step": 42428 + }, + { + "epoch": 2.08, + "grad_norm": 0.685215950012207, + "learning_rate": 0.00012895538059952947, + "loss": 3.2016, + "step": 42429 + }, + { + "epoch": 2.08, + "grad_norm": 0.7202776670455933, + "learning_rate": 0.00012894273221553197, + "loss": 2.8996, + "step": 42430 + }, + { + "epoch": 2.08, + "grad_norm": 0.6567890644073486, + "learning_rate": 0.0001289300842820663, + "loss": 3.0631, + "step": 42431 + }, + { + "epoch": 2.08, + "grad_norm": 0.7814381718635559, + "learning_rate": 0.00012891743679916597, + "loss": 2.7118, + "step": 42432 + }, + { + "epoch": 2.08, + "grad_norm": 0.6643449068069458, + "learning_rate": 0.00012890478976686418, + "loss": 2.8286, + "step": 42433 + }, + { + "epoch": 2.08, + "grad_norm": 0.6698668003082275, + "learning_rate": 0.00012889214318519417, + "loss": 2.9327, + "step": 42434 + }, + { + "epoch": 2.08, + "grad_norm": 0.6875276565551758, + "learning_rate": 0.00012887949705418938, + "loss": 2.9944, + "step": 42435 + }, + { + "epoch": 2.08, + "grad_norm": 0.7161490321159363, + "learning_rate": 0.00012886685137388298, + "loss": 2.9799, + "step": 42436 + }, + { + "epoch": 2.08, + "grad_norm": 0.6984416246414185, + "learning_rate": 0.00012885420614430846, + "loss": 2.8483, + "step": 42437 + }, + { + "epoch": 2.08, + "grad_norm": 0.6920896768569946, + "learning_rate": 0.00012884156136549893, + "loss": 3.0116, + "step": 42438 + }, + { + "epoch": 2.08, + "grad_norm": 0.704794704914093, + "learning_rate": 0.00012882891703748793, + "loss": 2.6875, + "step": 42439 + }, + { + "epoch": 2.08, + "grad_norm": 0.7424866557121277, + "learning_rate": 0.00012881627316030858, + "loss": 2.78, + "step": 42440 + }, + { + "epoch": 2.08, + "grad_norm": 0.6924298405647278, + "learning_rate": 0.0001288036297339941, + "loss": 2.8718, + "step": 42441 + }, + { + "epoch": 2.08, + "grad_norm": 0.6861541271209717, + "learning_rate": 0.000128790986758578, + "loss": 2.786, + "step": 42442 + }, + { + "epoch": 2.08, + "grad_norm": 0.720872700214386, + "learning_rate": 0.00012877834423409345, + "loss": 2.7509, + "step": 42443 + }, + { + "epoch": 2.08, + "grad_norm": 0.6972219347953796, + "learning_rate": 0.00012876570216057375, + "loss": 3.0085, + "step": 42444 + }, + { + "epoch": 2.08, + "grad_norm": 0.6819028854370117, + "learning_rate": 0.00012875306053805236, + "loss": 2.9576, + "step": 42445 + }, + { + "epoch": 2.08, + "grad_norm": 0.6733766794204712, + "learning_rate": 0.0001287404193665623, + "loss": 2.9113, + "step": 42446 + }, + { + "epoch": 2.08, + "grad_norm": 0.6779769062995911, + "learning_rate": 0.00012872777864613718, + "loss": 2.7558, + "step": 42447 + }, + { + "epoch": 2.08, + "grad_norm": 0.7019845843315125, + "learning_rate": 0.00012871513837681007, + "loss": 2.8872, + "step": 42448 + }, + { + "epoch": 2.08, + "grad_norm": 0.7672656178474426, + "learning_rate": 0.00012870249855861428, + "loss": 3.0762, + "step": 42449 + }, + { + "epoch": 2.08, + "grad_norm": 0.6681340932846069, + "learning_rate": 0.00012868985919158322, + "loss": 3.0499, + "step": 42450 + }, + { + "epoch": 2.08, + "grad_norm": 0.7144621014595032, + "learning_rate": 0.00012867722027574998, + "loss": 3.2119, + "step": 42451 + }, + { + "epoch": 2.08, + "grad_norm": 0.671342134475708, + "learning_rate": 0.0001286645818111481, + "loss": 3.0014, + "step": 42452 + }, + { + "epoch": 2.08, + "grad_norm": 0.6688801646232605, + "learning_rate": 0.00012865194379781063, + "loss": 2.8508, + "step": 42453 + }, + { + "epoch": 2.08, + "grad_norm": 0.6864688396453857, + "learning_rate": 0.00012863930623577103, + "loss": 3.1261, + "step": 42454 + }, + { + "epoch": 2.08, + "grad_norm": 0.7038698792457581, + "learning_rate": 0.00012862666912506255, + "loss": 2.8566, + "step": 42455 + }, + { + "epoch": 2.08, + "grad_norm": 0.6805946230888367, + "learning_rate": 0.00012861403246571836, + "loss": 2.8152, + "step": 42456 + }, + { + "epoch": 2.08, + "grad_norm": 0.6786236763000488, + "learning_rate": 0.00012860139625777187, + "loss": 2.89, + "step": 42457 + }, + { + "epoch": 2.08, + "grad_norm": 0.694636881351471, + "learning_rate": 0.00012858876050125627, + "loss": 2.8622, + "step": 42458 + }, + { + "epoch": 2.08, + "grad_norm": 0.7160537838935852, + "learning_rate": 0.00012857612519620487, + "loss": 2.8614, + "step": 42459 + }, + { + "epoch": 2.08, + "grad_norm": 0.675896406173706, + "learning_rate": 0.00012856349034265105, + "loss": 2.8066, + "step": 42460 + }, + { + "epoch": 2.08, + "grad_norm": 0.6733073592185974, + "learning_rate": 0.000128550855940628, + "loss": 3.0653, + "step": 42461 + }, + { + "epoch": 2.08, + "grad_norm": 0.7059203386306763, + "learning_rate": 0.000128538221990169, + "loss": 3.1194, + "step": 42462 + }, + { + "epoch": 2.08, + "grad_norm": 0.6924806237220764, + "learning_rate": 0.00012852558849130725, + "loss": 2.6967, + "step": 42463 + }, + { + "epoch": 2.08, + "grad_norm": 0.6774609684944153, + "learning_rate": 0.00012851295544407618, + "loss": 3.0024, + "step": 42464 + }, + { + "epoch": 2.08, + "grad_norm": 0.6729218363761902, + "learning_rate": 0.0001285003228485089, + "loss": 2.94, + "step": 42465 + }, + { + "epoch": 2.08, + "grad_norm": 0.673554539680481, + "learning_rate": 0.0001284876907046387, + "loss": 2.9425, + "step": 42466 + }, + { + "epoch": 2.08, + "grad_norm": 0.7064787149429321, + "learning_rate": 0.00012847505901249912, + "loss": 2.8488, + "step": 42467 + }, + { + "epoch": 2.08, + "grad_norm": 0.6640927791595459, + "learning_rate": 0.00012846242777212304, + "loss": 2.7895, + "step": 42468 + }, + { + "epoch": 2.08, + "grad_norm": 0.6812401413917542, + "learning_rate": 0.00012844979698354406, + "loss": 2.8939, + "step": 42469 + }, + { + "epoch": 2.08, + "grad_norm": 0.6853512525558472, + "learning_rate": 0.00012843716664679528, + "loss": 3.0094, + "step": 42470 + }, + { + "epoch": 2.08, + "grad_norm": 0.673121988773346, + "learning_rate": 0.00012842453676190988, + "loss": 2.8918, + "step": 42471 + }, + { + "epoch": 2.08, + "grad_norm": 0.7033911347389221, + "learning_rate": 0.00012841190732892135, + "loss": 2.8784, + "step": 42472 + }, + { + "epoch": 2.08, + "grad_norm": 0.6943319439888, + "learning_rate": 0.00012839927834786268, + "loss": 2.8472, + "step": 42473 + }, + { + "epoch": 2.08, + "grad_norm": 0.6493386626243591, + "learning_rate": 0.00012838664981876743, + "loss": 2.7605, + "step": 42474 + }, + { + "epoch": 2.08, + "grad_norm": 0.6886091232299805, + "learning_rate": 0.00012837402174166863, + "loss": 2.9289, + "step": 42475 + }, + { + "epoch": 2.08, + "grad_norm": 0.6950169205665588, + "learning_rate": 0.0001283613941165997, + "loss": 3.0136, + "step": 42476 + }, + { + "epoch": 2.08, + "grad_norm": 0.7558622360229492, + "learning_rate": 0.00012834876694359386, + "loss": 2.9728, + "step": 42477 + }, + { + "epoch": 2.08, + "grad_norm": 0.7118024230003357, + "learning_rate": 0.00012833614022268418, + "loss": 2.9034, + "step": 42478 + }, + { + "epoch": 2.08, + "grad_norm": 0.6858274340629578, + "learning_rate": 0.0001283235139539042, + "loss": 2.7353, + "step": 42479 + }, + { + "epoch": 2.08, + "grad_norm": 0.6761425733566284, + "learning_rate": 0.00012831088813728694, + "loss": 2.8738, + "step": 42480 + }, + { + "epoch": 2.08, + "grad_norm": 0.686199426651001, + "learning_rate": 0.00012829826277286577, + "loss": 2.7441, + "step": 42481 + }, + { + "epoch": 2.08, + "grad_norm": 0.7111159563064575, + "learning_rate": 0.00012828563786067398, + "loss": 2.8098, + "step": 42482 + }, + { + "epoch": 2.08, + "grad_norm": 0.7450211644172668, + "learning_rate": 0.00012827301340074485, + "loss": 3.027, + "step": 42483 + }, + { + "epoch": 2.08, + "grad_norm": 0.6911284923553467, + "learning_rate": 0.0001282603893931115, + "loss": 3.0419, + "step": 42484 + }, + { + "epoch": 2.08, + "grad_norm": 0.6608761548995972, + "learning_rate": 0.00012824776583780716, + "loss": 2.6919, + "step": 42485 + }, + { + "epoch": 2.08, + "grad_norm": 0.687088131904602, + "learning_rate": 0.00012823514273486511, + "loss": 2.9157, + "step": 42486 + }, + { + "epoch": 2.08, + "grad_norm": 0.6786913871765137, + "learning_rate": 0.00012822252008431878, + "loss": 2.8025, + "step": 42487 + }, + { + "epoch": 2.08, + "grad_norm": 0.6923184394836426, + "learning_rate": 0.00012820989788620119, + "loss": 2.9774, + "step": 42488 + }, + { + "epoch": 2.08, + "grad_norm": 0.7364439964294434, + "learning_rate": 0.0001281972761405457, + "loss": 3.0177, + "step": 42489 + }, + { + "epoch": 2.08, + "grad_norm": 0.6883888840675354, + "learning_rate": 0.00012818465484738548, + "loss": 3.1355, + "step": 42490 + }, + { + "epoch": 2.08, + "grad_norm": 0.6888550519943237, + "learning_rate": 0.0001281720340067539, + "loss": 2.9592, + "step": 42491 + }, + { + "epoch": 2.08, + "grad_norm": 0.6707346439361572, + "learning_rate": 0.0001281594136186841, + "loss": 2.9254, + "step": 42492 + }, + { + "epoch": 2.08, + "grad_norm": 0.695807158946991, + "learning_rate": 0.00012814679368320923, + "loss": 2.833, + "step": 42493 + }, + { + "epoch": 2.08, + "grad_norm": 0.7045168876647949, + "learning_rate": 0.00012813417420036273, + "loss": 2.9055, + "step": 42494 + }, + { + "epoch": 2.08, + "grad_norm": 0.6547208428382874, + "learning_rate": 0.00012812155517017763, + "loss": 2.7095, + "step": 42495 + }, + { + "epoch": 2.08, + "grad_norm": 0.6461189985275269, + "learning_rate": 0.0001281089365926873, + "loss": 2.9019, + "step": 42496 + }, + { + "epoch": 2.08, + "grad_norm": 0.6801860332489014, + "learning_rate": 0.00012809631846792503, + "loss": 2.8929, + "step": 42497 + }, + { + "epoch": 2.08, + "grad_norm": 0.704507052898407, + "learning_rate": 0.00012808370079592398, + "loss": 2.8624, + "step": 42498 + }, + { + "epoch": 2.08, + "grad_norm": 0.6679760217666626, + "learning_rate": 0.00012807108357671738, + "loss": 2.8551, + "step": 42499 + }, + { + "epoch": 2.08, + "grad_norm": 0.6944798231124878, + "learning_rate": 0.0001280584668103384, + "loss": 3.0003, + "step": 42500 + }, + { + "epoch": 2.08, + "grad_norm": 0.7155947089195251, + "learning_rate": 0.00012804585049682024, + "loss": 2.9066, + "step": 42501 + }, + { + "epoch": 2.08, + "grad_norm": 0.7650623321533203, + "learning_rate": 0.00012803323463619638, + "loss": 2.8278, + "step": 42502 + }, + { + "epoch": 2.08, + "grad_norm": 0.6742417812347412, + "learning_rate": 0.00012802061922849976, + "loss": 2.7601, + "step": 42503 + }, + { + "epoch": 2.08, + "grad_norm": 0.6518091559410095, + "learning_rate": 0.00012800800427376384, + "loss": 3.0368, + "step": 42504 + }, + { + "epoch": 2.08, + "grad_norm": 0.7054563164710999, + "learning_rate": 0.00012799538977202173, + "loss": 2.8426, + "step": 42505 + }, + { + "epoch": 2.08, + "grad_norm": 0.7084836959838867, + "learning_rate": 0.00012798277572330656, + "loss": 3.0179, + "step": 42506 + }, + { + "epoch": 2.08, + "grad_norm": 0.6683297753334045, + "learning_rate": 0.00012797016212765175, + "loss": 2.9062, + "step": 42507 + }, + { + "epoch": 2.08, + "grad_norm": 0.6678385734558105, + "learning_rate": 0.00012795754898509035, + "loss": 3.106, + "step": 42508 + }, + { + "epoch": 2.08, + "grad_norm": 0.7056717276573181, + "learning_rate": 0.00012794493629565576, + "loss": 2.9415, + "step": 42509 + }, + { + "epoch": 2.08, + "grad_norm": 0.7119282484054565, + "learning_rate": 0.00012793232405938095, + "loss": 2.7636, + "step": 42510 + }, + { + "epoch": 2.08, + "grad_norm": 0.743801474571228, + "learning_rate": 0.0001279197122762993, + "loss": 2.6962, + "step": 42511 + }, + { + "epoch": 2.08, + "grad_norm": 0.7152933478355408, + "learning_rate": 0.0001279071009464441, + "loss": 2.87, + "step": 42512 + }, + { + "epoch": 2.08, + "grad_norm": 0.6678538918495178, + "learning_rate": 0.00012789449006984853, + "loss": 3.0891, + "step": 42513 + }, + { + "epoch": 2.08, + "grad_norm": 0.7106472849845886, + "learning_rate": 0.0001278818796465457, + "loss": 2.802, + "step": 42514 + }, + { + "epoch": 2.08, + "grad_norm": 0.7291449904441833, + "learning_rate": 0.0001278692696765688, + "loss": 2.8319, + "step": 42515 + }, + { + "epoch": 2.08, + "grad_norm": 0.6768184304237366, + "learning_rate": 0.00012785666015995107, + "loss": 2.8618, + "step": 42516 + }, + { + "epoch": 2.08, + "grad_norm": 0.6788813471794128, + "learning_rate": 0.00012784405109672595, + "loss": 2.8516, + "step": 42517 + }, + { + "epoch": 2.08, + "grad_norm": 0.6908031702041626, + "learning_rate": 0.0001278314424869263, + "loss": 3.0243, + "step": 42518 + }, + { + "epoch": 2.08, + "grad_norm": 0.7076452374458313, + "learning_rate": 0.0001278188343305856, + "loss": 2.8712, + "step": 42519 + }, + { + "epoch": 2.08, + "grad_norm": 0.6761187314987183, + "learning_rate": 0.00012780622662773698, + "loss": 2.963, + "step": 42520 + }, + { + "epoch": 2.08, + "grad_norm": 0.7147707939147949, + "learning_rate": 0.0001277936193784135, + "loss": 3.1233, + "step": 42521 + }, + { + "epoch": 2.08, + "grad_norm": 1.0204122066497803, + "learning_rate": 0.0001277810125826486, + "loss": 2.9935, + "step": 42522 + }, + { + "epoch": 2.08, + "grad_norm": 0.690827488899231, + "learning_rate": 0.00012776840624047526, + "loss": 2.8386, + "step": 42523 + }, + { + "epoch": 2.08, + "grad_norm": 0.6465492248535156, + "learning_rate": 0.00012775580035192692, + "loss": 2.8819, + "step": 42524 + }, + { + "epoch": 2.08, + "grad_norm": 0.6738273501396179, + "learning_rate": 0.00012774319491703652, + "loss": 2.924, + "step": 42525 + }, + { + "epoch": 2.08, + "grad_norm": 0.6924593448638916, + "learning_rate": 0.00012773058993583735, + "loss": 3.0896, + "step": 42526 + }, + { + "epoch": 2.08, + "grad_norm": 0.6848966479301453, + "learning_rate": 0.00012771798540836283, + "loss": 2.8388, + "step": 42527 + }, + { + "epoch": 2.08, + "grad_norm": 0.7153657674789429, + "learning_rate": 0.00012770538133464595, + "loss": 3.1066, + "step": 42528 + }, + { + "epoch": 2.08, + "grad_norm": 0.6560115814208984, + "learning_rate": 0.00012769277771471996, + "loss": 2.9721, + "step": 42529 + }, + { + "epoch": 2.08, + "grad_norm": 0.6960185766220093, + "learning_rate": 0.0001276801745486179, + "loss": 2.9123, + "step": 42530 + }, + { + "epoch": 2.08, + "grad_norm": 0.64568030834198, + "learning_rate": 0.00012766757183637308, + "loss": 3.2167, + "step": 42531 + }, + { + "epoch": 2.08, + "grad_norm": 0.697411060333252, + "learning_rate": 0.00012765496957801886, + "loss": 2.7463, + "step": 42532 + }, + { + "epoch": 2.08, + "grad_norm": 0.6693733334541321, + "learning_rate": 0.00012764236777358816, + "loss": 2.99, + "step": 42533 + }, + { + "epoch": 2.08, + "grad_norm": 0.6906144618988037, + "learning_rate": 0.00012762976642311438, + "loss": 2.9348, + "step": 42534 + }, + { + "epoch": 2.08, + "grad_norm": 0.7639672160148621, + "learning_rate": 0.00012761716552663063, + "loss": 2.8812, + "step": 42535 + }, + { + "epoch": 2.08, + "grad_norm": 0.6580451130867004, + "learning_rate": 0.00012760456508417002, + "loss": 2.9266, + "step": 42536 + }, + { + "epoch": 2.08, + "grad_norm": 0.7195175886154175, + "learning_rate": 0.00012759196509576587, + "loss": 2.6738, + "step": 42537 + }, + { + "epoch": 2.08, + "grad_norm": 0.7359403371810913, + "learning_rate": 0.0001275793655614512, + "loss": 2.9124, + "step": 42538 + }, + { + "epoch": 2.08, + "grad_norm": 0.7050012946128845, + "learning_rate": 0.00012756676648125942, + "loss": 2.9858, + "step": 42539 + }, + { + "epoch": 2.08, + "grad_norm": 0.6860030293464661, + "learning_rate": 0.00012755416785522347, + "loss": 2.7879, + "step": 42540 + }, + { + "epoch": 2.08, + "grad_norm": 0.667359471321106, + "learning_rate": 0.00012754156968337678, + "loss": 3.0746, + "step": 42541 + }, + { + "epoch": 2.08, + "grad_norm": 0.6795210242271423, + "learning_rate": 0.0001275289719657524, + "loss": 2.9643, + "step": 42542 + }, + { + "epoch": 2.08, + "grad_norm": 0.6692383885383606, + "learning_rate": 0.0001275163747023834, + "loss": 2.9197, + "step": 42543 + }, + { + "epoch": 2.08, + "grad_norm": 0.6974451541900635, + "learning_rate": 0.0001275037778933032, + "loss": 2.857, + "step": 42544 + }, + { + "epoch": 2.09, + "grad_norm": 0.7106644511222839, + "learning_rate": 0.0001274911815385447, + "loss": 2.8773, + "step": 42545 + }, + { + "epoch": 2.09, + "grad_norm": 0.7244455814361572, + "learning_rate": 0.00012747858563814138, + "loss": 2.9333, + "step": 42546 + }, + { + "epoch": 2.09, + "grad_norm": 0.6804713606834412, + "learning_rate": 0.00012746599019212613, + "loss": 2.7637, + "step": 42547 + }, + { + "epoch": 2.09, + "grad_norm": 0.7047104835510254, + "learning_rate": 0.0001274533952005323, + "loss": 2.9819, + "step": 42548 + }, + { + "epoch": 2.09, + "grad_norm": 0.6896386742591858, + "learning_rate": 0.00012744080066339305, + "loss": 2.9393, + "step": 42549 + }, + { + "epoch": 2.09, + "grad_norm": 0.7217565178871155, + "learning_rate": 0.00012742820658074157, + "loss": 3.0425, + "step": 42550 + }, + { + "epoch": 2.09, + "grad_norm": 0.6903233528137207, + "learning_rate": 0.000127415612952611, + "loss": 3.207, + "step": 42551 + }, + { + "epoch": 2.09, + "grad_norm": 0.6575881838798523, + "learning_rate": 0.00012740301977903434, + "loss": 2.8382, + "step": 42552 + }, + { + "epoch": 2.09, + "grad_norm": 0.6848930716514587, + "learning_rate": 0.00012739042706004495, + "loss": 2.8969, + "step": 42553 + }, + { + "epoch": 2.09, + "grad_norm": 0.6955052614212036, + "learning_rate": 0.00012737783479567605, + "loss": 3.1267, + "step": 42554 + }, + { + "epoch": 2.09, + "grad_norm": 0.669148862361908, + "learning_rate": 0.00012736524298596061, + "loss": 3.1156, + "step": 42555 + }, + { + "epoch": 2.09, + "grad_norm": 0.6611831188201904, + "learning_rate": 0.00012735265163093198, + "loss": 2.8058, + "step": 42556 + }, + { + "epoch": 2.09, + "grad_norm": 0.6889923810958862, + "learning_rate": 0.00012734006073062326, + "loss": 2.7144, + "step": 42557 + }, + { + "epoch": 2.09, + "grad_norm": 0.6790809631347656, + "learning_rate": 0.00012732747028506745, + "loss": 3.0519, + "step": 42558 + }, + { + "epoch": 2.09, + "grad_norm": 0.6864499449729919, + "learning_rate": 0.00012731488029429797, + "loss": 2.8055, + "step": 42559 + }, + { + "epoch": 2.09, + "grad_norm": 0.6939069628715515, + "learning_rate": 0.00012730229075834777, + "loss": 2.8274, + "step": 42560 + }, + { + "epoch": 2.09, + "grad_norm": 0.6930176019668579, + "learning_rate": 0.00012728970167725023, + "loss": 2.8174, + "step": 42561 + }, + { + "epoch": 2.09, + "grad_norm": 0.7337930798530579, + "learning_rate": 0.00012727711305103822, + "loss": 2.9908, + "step": 42562 + }, + { + "epoch": 2.09, + "grad_norm": 0.6973217725753784, + "learning_rate": 0.0001272645248797452, + "loss": 3.1223, + "step": 42563 + }, + { + "epoch": 2.09, + "grad_norm": 0.6723378896713257, + "learning_rate": 0.00012725193716340414, + "loss": 3.0629, + "step": 42564 + }, + { + "epoch": 2.09, + "grad_norm": 0.7197105884552002, + "learning_rate": 0.00012723934990204816, + "loss": 3.0432, + "step": 42565 + }, + { + "epoch": 2.09, + "grad_norm": 0.6942613124847412, + "learning_rate": 0.00012722676309571057, + "loss": 2.7, + "step": 42566 + }, + { + "epoch": 2.09, + "grad_norm": 0.6984099745750427, + "learning_rate": 0.00012721417674442433, + "loss": 3.1964, + "step": 42567 + }, + { + "epoch": 2.09, + "grad_norm": 0.6897585988044739, + "learning_rate": 0.0001272015908482227, + "loss": 2.9698, + "step": 42568 + }, + { + "epoch": 2.09, + "grad_norm": 0.6685306429862976, + "learning_rate": 0.00012718900540713892, + "loss": 2.8009, + "step": 42569 + }, + { + "epoch": 2.09, + "grad_norm": 0.7120461463928223, + "learning_rate": 0.00012717642042120596, + "loss": 2.9525, + "step": 42570 + }, + { + "epoch": 2.09, + "grad_norm": 0.6789481043815613, + "learning_rate": 0.00012716383589045715, + "loss": 2.6623, + "step": 42571 + }, + { + "epoch": 2.09, + "grad_norm": 0.6555876135826111, + "learning_rate": 0.00012715125181492554, + "loss": 2.8546, + "step": 42572 + }, + { + "epoch": 2.09, + "grad_norm": 0.7006644010543823, + "learning_rate": 0.00012713866819464412, + "loss": 2.9681, + "step": 42573 + }, + { + "epoch": 2.09, + "grad_norm": 0.681387722492218, + "learning_rate": 0.0001271260850296463, + "loss": 2.846, + "step": 42574 + }, + { + "epoch": 2.09, + "grad_norm": 0.7118346095085144, + "learning_rate": 0.00012711350231996502, + "loss": 3.0905, + "step": 42575 + }, + { + "epoch": 2.09, + "grad_norm": 0.673995316028595, + "learning_rate": 0.0001271009200656336, + "loss": 2.8904, + "step": 42576 + }, + { + "epoch": 2.09, + "grad_norm": 0.6982349157333374, + "learning_rate": 0.00012708833826668495, + "loss": 2.7196, + "step": 42577 + }, + { + "epoch": 2.09, + "grad_norm": 0.6825549006462097, + "learning_rate": 0.00012707575692315248, + "loss": 3.1081, + "step": 42578 + }, + { + "epoch": 2.09, + "grad_norm": 0.6906778812408447, + "learning_rate": 0.00012706317603506916, + "loss": 2.7565, + "step": 42579 + }, + { + "epoch": 2.09, + "grad_norm": 0.6175122857093811, + "learning_rate": 0.00012705059560246806, + "loss": 3.0223, + "step": 42580 + }, + { + "epoch": 2.09, + "grad_norm": 0.681134819984436, + "learning_rate": 0.00012703801562538252, + "loss": 2.924, + "step": 42581 + }, + { + "epoch": 2.09, + "grad_norm": 0.6651497483253479, + "learning_rate": 0.00012702543610384546, + "loss": 2.8553, + "step": 42582 + }, + { + "epoch": 2.09, + "grad_norm": 0.6877047419548035, + "learning_rate": 0.00012701285703789006, + "loss": 3.2676, + "step": 42583 + }, + { + "epoch": 2.09, + "grad_norm": 0.737549901008606, + "learning_rate": 0.00012700027842754968, + "loss": 2.7642, + "step": 42584 + }, + { + "epoch": 2.09, + "grad_norm": 0.669161319732666, + "learning_rate": 0.0001269877002728572, + "loss": 2.8865, + "step": 42585 + }, + { + "epoch": 2.09, + "grad_norm": 0.6651346683502197, + "learning_rate": 0.00012697512257384586, + "loss": 3.0816, + "step": 42586 + }, + { + "epoch": 2.09, + "grad_norm": 0.6895802021026611, + "learning_rate": 0.00012696254533054867, + "loss": 3.0483, + "step": 42587 + }, + { + "epoch": 2.09, + "grad_norm": 0.6694716811180115, + "learning_rate": 0.0001269499685429988, + "loss": 3.1017, + "step": 42588 + }, + { + "epoch": 2.09, + "grad_norm": 0.7255198955535889, + "learning_rate": 0.0001269373922112295, + "loss": 3.0076, + "step": 42589 + }, + { + "epoch": 2.09, + "grad_norm": 0.6456109881401062, + "learning_rate": 0.0001269248163352737, + "loss": 2.855, + "step": 42590 + }, + { + "epoch": 2.09, + "grad_norm": 0.6860771775245667, + "learning_rate": 0.00012691224091516476, + "loss": 3.0306, + "step": 42591 + }, + { + "epoch": 2.09, + "grad_norm": 0.6903136968612671, + "learning_rate": 0.00012689966595093557, + "loss": 3.0801, + "step": 42592 + }, + { + "epoch": 2.09, + "grad_norm": 0.7179170846939087, + "learning_rate": 0.00012688709144261938, + "loss": 2.8021, + "step": 42593 + }, + { + "epoch": 2.09, + "grad_norm": 0.6981700658798218, + "learning_rate": 0.00012687451739024934, + "loss": 2.8115, + "step": 42594 + }, + { + "epoch": 2.09, + "grad_norm": 0.6771155595779419, + "learning_rate": 0.0001268619437938584, + "loss": 2.8058, + "step": 42595 + }, + { + "epoch": 2.09, + "grad_norm": 0.6560533046722412, + "learning_rate": 0.00012684937065347984, + "loss": 2.8685, + "step": 42596 + }, + { + "epoch": 2.09, + "grad_norm": 0.6963851451873779, + "learning_rate": 0.00012683679796914664, + "loss": 2.8669, + "step": 42597 + }, + { + "epoch": 2.09, + "grad_norm": 0.7008418440818787, + "learning_rate": 0.00012682422574089198, + "loss": 3.0551, + "step": 42598 + }, + { + "epoch": 2.09, + "grad_norm": 0.7024134993553162, + "learning_rate": 0.00012681165396874908, + "loss": 2.8471, + "step": 42599 + }, + { + "epoch": 2.09, + "grad_norm": 0.7309966683387756, + "learning_rate": 0.00012679908265275094, + "loss": 2.7808, + "step": 42600 + }, + { + "epoch": 2.09, + "grad_norm": 0.6708288192749023, + "learning_rate": 0.00012678651179293072, + "loss": 2.6906, + "step": 42601 + }, + { + "epoch": 2.09, + "grad_norm": 0.6789960861206055, + "learning_rate": 0.00012677394138932137, + "loss": 2.7973, + "step": 42602 + }, + { + "epoch": 2.09, + "grad_norm": 0.7056165337562561, + "learning_rate": 0.0001267613714419561, + "loss": 3.0738, + "step": 42603 + }, + { + "epoch": 2.09, + "grad_norm": 0.6883794069290161, + "learning_rate": 0.0001267488019508682, + "loss": 3.0586, + "step": 42604 + }, + { + "epoch": 2.09, + "grad_norm": 0.7074939012527466, + "learning_rate": 0.00012673623291609043, + "loss": 2.7751, + "step": 42605 + }, + { + "epoch": 2.09, + "grad_norm": 0.7160875797271729, + "learning_rate": 0.00012672366433765623, + "loss": 2.8472, + "step": 42606 + }, + { + "epoch": 2.09, + "grad_norm": 0.7391530871391296, + "learning_rate": 0.00012671109621559856, + "loss": 2.7816, + "step": 42607 + }, + { + "epoch": 2.09, + "grad_norm": 0.7099108695983887, + "learning_rate": 0.00012669852854995037, + "loss": 2.8312, + "step": 42608 + }, + { + "epoch": 2.09, + "grad_norm": 0.6549707651138306, + "learning_rate": 0.00012668596134074504, + "loss": 2.9045, + "step": 42609 + }, + { + "epoch": 2.09, + "grad_norm": 0.6979371309280396, + "learning_rate": 0.00012667339458801542, + "loss": 2.9533, + "step": 42610 + }, + { + "epoch": 2.09, + "grad_norm": 0.7130467295646667, + "learning_rate": 0.0001266608282917948, + "loss": 2.8201, + "step": 42611 + }, + { + "epoch": 2.09, + "grad_norm": 0.674332857131958, + "learning_rate": 0.00012664826245211612, + "loss": 2.9441, + "step": 42612 + }, + { + "epoch": 2.09, + "grad_norm": 0.6953020095825195, + "learning_rate": 0.0001266356970690126, + "loss": 2.7931, + "step": 42613 + }, + { + "epoch": 2.09, + "grad_norm": 0.6988925933837891, + "learning_rate": 0.0001266231321425173, + "loss": 2.7282, + "step": 42614 + }, + { + "epoch": 2.09, + "grad_norm": 0.7261149287223816, + "learning_rate": 0.00012661056767266333, + "loss": 2.8631, + "step": 42615 + }, + { + "epoch": 2.09, + "grad_norm": 0.667811393737793, + "learning_rate": 0.0001265980036594838, + "loss": 2.9077, + "step": 42616 + }, + { + "epoch": 2.09, + "grad_norm": 0.694728434085846, + "learning_rate": 0.00012658544010301165, + "loss": 3.0601, + "step": 42617 + }, + { + "epoch": 2.09, + "grad_norm": 0.7233669757843018, + "learning_rate": 0.00012657287700328012, + "loss": 2.75, + "step": 42618 + }, + { + "epoch": 2.09, + "grad_norm": 0.7053804993629456, + "learning_rate": 0.00012656031436032218, + "loss": 3.0824, + "step": 42619 + }, + { + "epoch": 2.09, + "grad_norm": 0.6913226246833801, + "learning_rate": 0.000126547752174171, + "loss": 2.8967, + "step": 42620 + }, + { + "epoch": 2.09, + "grad_norm": 0.720045804977417, + "learning_rate": 0.00012653519044485975, + "loss": 2.7501, + "step": 42621 + }, + { + "epoch": 2.09, + "grad_norm": 0.7346246242523193, + "learning_rate": 0.00012652262917242144, + "loss": 3.0411, + "step": 42622 + }, + { + "epoch": 2.09, + "grad_norm": 0.6882084012031555, + "learning_rate": 0.00012651006835688917, + "loss": 2.87, + "step": 42623 + }, + { + "epoch": 2.09, + "grad_norm": 0.6975372433662415, + "learning_rate": 0.00012649750799829585, + "loss": 2.9828, + "step": 42624 + }, + { + "epoch": 2.09, + "grad_norm": 0.6522185802459717, + "learning_rate": 0.00012648494809667469, + "loss": 3.0208, + "step": 42625 + }, + { + "epoch": 2.09, + "grad_norm": 0.709596574306488, + "learning_rate": 0.0001264723886520589, + "loss": 2.7963, + "step": 42626 + }, + { + "epoch": 2.09, + "grad_norm": 0.6858445405960083, + "learning_rate": 0.00012645982966448134, + "loss": 3.0234, + "step": 42627 + }, + { + "epoch": 2.09, + "grad_norm": 0.6798964142799377, + "learning_rate": 0.00012644727113397532, + "loss": 3.0091, + "step": 42628 + }, + { + "epoch": 2.09, + "grad_norm": 0.7389910221099854, + "learning_rate": 0.00012643471306057372, + "loss": 2.7046, + "step": 42629 + }, + { + "epoch": 2.09, + "grad_norm": 0.6868226528167725, + "learning_rate": 0.00012642215544430964, + "loss": 2.7799, + "step": 42630 + }, + { + "epoch": 2.09, + "grad_norm": 0.6818183064460754, + "learning_rate": 0.00012640959828521628, + "loss": 3.1201, + "step": 42631 + }, + { + "epoch": 2.09, + "grad_norm": 0.6521576046943665, + "learning_rate": 0.0001263970415833265, + "loss": 3.1, + "step": 42632 + }, + { + "epoch": 2.09, + "grad_norm": 0.7379617691040039, + "learning_rate": 0.00012638448533867366, + "loss": 2.8706, + "step": 42633 + }, + { + "epoch": 2.09, + "grad_norm": 0.7239534258842468, + "learning_rate": 0.00012637192955129053, + "loss": 2.6417, + "step": 42634 + }, + { + "epoch": 2.09, + "grad_norm": 0.7074881792068481, + "learning_rate": 0.00012635937422121035, + "loss": 2.987, + "step": 42635 + }, + { + "epoch": 2.09, + "grad_norm": 0.7031037211418152, + "learning_rate": 0.00012634681934846624, + "loss": 2.9419, + "step": 42636 + }, + { + "epoch": 2.09, + "grad_norm": 0.6728188991546631, + "learning_rate": 0.0001263342649330912, + "loss": 2.7092, + "step": 42637 + }, + { + "epoch": 2.09, + "grad_norm": 0.7104039788246155, + "learning_rate": 0.00012632171097511828, + "loss": 2.6515, + "step": 42638 + }, + { + "epoch": 2.09, + "grad_norm": 0.6465256214141846, + "learning_rate": 0.00012630915747458043, + "loss": 3.0457, + "step": 42639 + }, + { + "epoch": 2.09, + "grad_norm": 0.690544605255127, + "learning_rate": 0.00012629660443151086, + "loss": 2.8131, + "step": 42640 + }, + { + "epoch": 2.09, + "grad_norm": 0.6940127611160278, + "learning_rate": 0.0001262840518459427, + "loss": 2.9941, + "step": 42641 + }, + { + "epoch": 2.09, + "grad_norm": 0.6851858496665955, + "learning_rate": 0.0001262714997179088, + "loss": 2.9212, + "step": 42642 + }, + { + "epoch": 2.09, + "grad_norm": 0.6655691862106323, + "learning_rate": 0.00012625894804744244, + "loss": 2.8587, + "step": 42643 + }, + { + "epoch": 2.09, + "grad_norm": 0.688534140586853, + "learning_rate": 0.0001262463968345766, + "loss": 3.0816, + "step": 42644 + }, + { + "epoch": 2.09, + "grad_norm": 0.7009671330451965, + "learning_rate": 0.00012623384607934415, + "loss": 2.8889, + "step": 42645 + }, + { + "epoch": 2.09, + "grad_norm": 0.6391334533691406, + "learning_rate": 0.00012622129578177845, + "loss": 2.8735, + "step": 42646 + }, + { + "epoch": 2.09, + "grad_norm": 0.6934916973114014, + "learning_rate": 0.0001262087459419123, + "loss": 2.8965, + "step": 42647 + }, + { + "epoch": 2.09, + "grad_norm": 0.6752188205718994, + "learning_rate": 0.00012619619655977897, + "loss": 2.9981, + "step": 42648 + }, + { + "epoch": 2.09, + "grad_norm": 0.6940574645996094, + "learning_rate": 0.0001261836476354113, + "loss": 2.7263, + "step": 42649 + }, + { + "epoch": 2.09, + "grad_norm": 0.7268062829971313, + "learning_rate": 0.00012617109916884247, + "loss": 3.1159, + "step": 42650 + }, + { + "epoch": 2.09, + "grad_norm": 0.656769871711731, + "learning_rate": 0.0001261585511601056, + "loss": 2.9465, + "step": 42651 + }, + { + "epoch": 2.09, + "grad_norm": 0.6630927920341492, + "learning_rate": 0.00012614600360923367, + "loss": 2.9908, + "step": 42652 + }, + { + "epoch": 2.09, + "grad_norm": 0.6990768909454346, + "learning_rate": 0.0001261334565162597, + "loss": 2.9062, + "step": 42653 + }, + { + "epoch": 2.09, + "grad_norm": 0.689720630645752, + "learning_rate": 0.00012612090988121661, + "loss": 2.9548, + "step": 42654 + }, + { + "epoch": 2.09, + "grad_norm": 0.7280600666999817, + "learning_rate": 0.00012610836370413763, + "loss": 2.8982, + "step": 42655 + }, + { + "epoch": 2.09, + "grad_norm": 0.6745010614395142, + "learning_rate": 0.0001260958179850558, + "loss": 2.8903, + "step": 42656 + }, + { + "epoch": 2.09, + "grad_norm": 0.7021997570991516, + "learning_rate": 0.00012608327272400406, + "loss": 3.0062, + "step": 42657 + }, + { + "epoch": 2.09, + "grad_norm": 0.648439884185791, + "learning_rate": 0.00012607072792101558, + "loss": 2.9718, + "step": 42658 + }, + { + "epoch": 2.09, + "grad_norm": 0.7959494590759277, + "learning_rate": 0.00012605818357612334, + "loss": 2.9918, + "step": 42659 + }, + { + "epoch": 2.09, + "grad_norm": 0.676099419593811, + "learning_rate": 0.00012604563968936027, + "loss": 3.0113, + "step": 42660 + }, + { + "epoch": 2.09, + "grad_norm": 0.6749578714370728, + "learning_rate": 0.00012603309626075962, + "loss": 2.8392, + "step": 42661 + }, + { + "epoch": 2.09, + "grad_norm": 0.6763250231742859, + "learning_rate": 0.0001260205532903542, + "loss": 3.0267, + "step": 42662 + }, + { + "epoch": 2.09, + "grad_norm": 0.7066740989685059, + "learning_rate": 0.00012600801077817728, + "loss": 2.9946, + "step": 42663 + }, + { + "epoch": 2.09, + "grad_norm": 0.6909931898117065, + "learning_rate": 0.00012599546872426164, + "loss": 2.8585, + "step": 42664 + }, + { + "epoch": 2.09, + "grad_norm": 0.6577308177947998, + "learning_rate": 0.00012598292712864058, + "loss": 2.7868, + "step": 42665 + }, + { + "epoch": 2.09, + "grad_norm": 0.7595558166503906, + "learning_rate": 0.000125970385991347, + "loss": 2.8711, + "step": 42666 + }, + { + "epoch": 2.09, + "grad_norm": 0.670626699924469, + "learning_rate": 0.0001259578453124138, + "loss": 2.9837, + "step": 42667 + }, + { + "epoch": 2.09, + "grad_norm": 0.7183277010917664, + "learning_rate": 0.00012594530509187424, + "loss": 2.9113, + "step": 42668 + }, + { + "epoch": 2.09, + "grad_norm": 0.6675026416778564, + "learning_rate": 0.00012593276532976117, + "loss": 3.0022, + "step": 42669 + }, + { + "epoch": 2.09, + "grad_norm": 0.703376293182373, + "learning_rate": 0.00012592022602610768, + "loss": 2.9697, + "step": 42670 + }, + { + "epoch": 2.09, + "grad_norm": 0.6627922654151917, + "learning_rate": 0.00012590768718094694, + "loss": 3.0211, + "step": 42671 + }, + { + "epoch": 2.09, + "grad_norm": 0.727324366569519, + "learning_rate": 0.00012589514879431172, + "loss": 2.8978, + "step": 42672 + }, + { + "epoch": 2.09, + "grad_norm": 0.6731564402580261, + "learning_rate": 0.00012588261086623527, + "loss": 2.9288, + "step": 42673 + }, + { + "epoch": 2.09, + "grad_norm": 0.6762987971305847, + "learning_rate": 0.00012587007339675052, + "loss": 2.8168, + "step": 42674 + }, + { + "epoch": 2.09, + "grad_norm": 0.7034830451011658, + "learning_rate": 0.0001258575363858904, + "loss": 2.9776, + "step": 42675 + }, + { + "epoch": 2.09, + "grad_norm": 0.6867435574531555, + "learning_rate": 0.00012584499983368807, + "loss": 2.8703, + "step": 42676 + }, + { + "epoch": 2.09, + "grad_norm": 0.6966498494148254, + "learning_rate": 0.00012583246374017645, + "loss": 2.6529, + "step": 42677 + }, + { + "epoch": 2.09, + "grad_norm": 0.6629147529602051, + "learning_rate": 0.00012581992810538868, + "loss": 2.7213, + "step": 42678 + }, + { + "epoch": 2.09, + "grad_norm": 0.6690661907196045, + "learning_rate": 0.00012580739292935759, + "loss": 3.1114, + "step": 42679 + }, + { + "epoch": 2.09, + "grad_norm": 0.7012913227081299, + "learning_rate": 0.00012579485821211643, + "loss": 2.9342, + "step": 42680 + }, + { + "epoch": 2.09, + "grad_norm": 0.7117915749549866, + "learning_rate": 0.00012578232395369808, + "loss": 2.8775, + "step": 42681 + }, + { + "epoch": 2.09, + "grad_norm": 0.6585233807563782, + "learning_rate": 0.00012576979015413542, + "loss": 2.899, + "step": 42682 + }, + { + "epoch": 2.09, + "grad_norm": 0.744594156742096, + "learning_rate": 0.00012575725681346174, + "loss": 2.9088, + "step": 42683 + }, + { + "epoch": 2.09, + "grad_norm": 0.6832166910171509, + "learning_rate": 0.00012574472393170978, + "loss": 3.0143, + "step": 42684 + }, + { + "epoch": 2.09, + "grad_norm": 0.7105337977409363, + "learning_rate": 0.0001257321915089127, + "loss": 3.03, + "step": 42685 + }, + { + "epoch": 2.09, + "grad_norm": 0.6521598100662231, + "learning_rate": 0.00012571965954510358, + "loss": 2.9901, + "step": 42686 + }, + { + "epoch": 2.09, + "grad_norm": 0.6741447448730469, + "learning_rate": 0.00012570712804031535, + "loss": 2.9124, + "step": 42687 + }, + { + "epoch": 2.09, + "grad_norm": 0.6789092421531677, + "learning_rate": 0.000125694596994581, + "loss": 2.9516, + "step": 42688 + }, + { + "epoch": 2.09, + "grad_norm": 0.7190807461738586, + "learning_rate": 0.00012568206640793344, + "loss": 2.7755, + "step": 42689 + }, + { + "epoch": 2.09, + "grad_norm": 0.7378038763999939, + "learning_rate": 0.00012566953628040574, + "loss": 2.8331, + "step": 42690 + }, + { + "epoch": 2.09, + "grad_norm": 0.695372998714447, + "learning_rate": 0.00012565700661203107, + "loss": 3.0781, + "step": 42691 + }, + { + "epoch": 2.09, + "grad_norm": 0.7172736525535583, + "learning_rate": 0.00012564447740284217, + "loss": 2.9251, + "step": 42692 + }, + { + "epoch": 2.09, + "grad_norm": 0.6886417865753174, + "learning_rate": 0.00012563194865287228, + "loss": 2.979, + "step": 42693 + }, + { + "epoch": 2.09, + "grad_norm": 0.7642080187797546, + "learning_rate": 0.00012561942036215415, + "loss": 2.9911, + "step": 42694 + }, + { + "epoch": 2.09, + "grad_norm": 0.6821017265319824, + "learning_rate": 0.00012560689253072102, + "loss": 2.8739, + "step": 42695 + }, + { + "epoch": 2.09, + "grad_norm": 0.7164608240127563, + "learning_rate": 0.00012559436515860575, + "loss": 2.6133, + "step": 42696 + }, + { + "epoch": 2.09, + "grad_norm": 0.6764235496520996, + "learning_rate": 0.0001255818382458413, + "loss": 3.0949, + "step": 42697 + }, + { + "epoch": 2.09, + "grad_norm": 0.7024563550949097, + "learning_rate": 0.0001255693117924608, + "loss": 2.678, + "step": 42698 + }, + { + "epoch": 2.09, + "grad_norm": 0.7356938719749451, + "learning_rate": 0.00012555678579849707, + "loss": 2.7922, + "step": 42699 + }, + { + "epoch": 2.09, + "grad_norm": 0.6670063734054565, + "learning_rate": 0.00012554426026398327, + "loss": 2.9197, + "step": 42700 + }, + { + "epoch": 2.09, + "grad_norm": 0.709257185459137, + "learning_rate": 0.00012553173518895224, + "loss": 2.8767, + "step": 42701 + }, + { + "epoch": 2.09, + "grad_norm": 0.6837866902351379, + "learning_rate": 0.00012551921057343715, + "loss": 2.9562, + "step": 42702 + }, + { + "epoch": 2.09, + "grad_norm": 0.6681262850761414, + "learning_rate": 0.00012550668641747085, + "loss": 2.9842, + "step": 42703 + }, + { + "epoch": 2.09, + "grad_norm": 0.7419562339782715, + "learning_rate": 0.00012549416272108625, + "loss": 2.988, + "step": 42704 + }, + { + "epoch": 2.09, + "grad_norm": 0.7583781480789185, + "learning_rate": 0.0001254816394843166, + "loss": 3.1105, + "step": 42705 + }, + { + "epoch": 2.09, + "grad_norm": 0.6745335459709167, + "learning_rate": 0.00012546911670719457, + "loss": 2.9673, + "step": 42706 + }, + { + "epoch": 2.09, + "grad_norm": 0.6698441505432129, + "learning_rate": 0.00012545659438975332, + "loss": 2.9013, + "step": 42707 + }, + { + "epoch": 2.09, + "grad_norm": 0.7208741903305054, + "learning_rate": 0.00012544407253202588, + "loss": 2.9384, + "step": 42708 + }, + { + "epoch": 2.09, + "grad_norm": 0.6900094747543335, + "learning_rate": 0.0001254315511340452, + "loss": 2.9388, + "step": 42709 + }, + { + "epoch": 2.09, + "grad_norm": 0.7146252393722534, + "learning_rate": 0.00012541903019584418, + "loss": 3.1268, + "step": 42710 + }, + { + "epoch": 2.09, + "grad_norm": 0.6867741942405701, + "learning_rate": 0.0001254065097174558, + "loss": 2.9237, + "step": 42711 + }, + { + "epoch": 2.09, + "grad_norm": 0.7199037075042725, + "learning_rate": 0.00012539398969891298, + "loss": 2.8591, + "step": 42712 + }, + { + "epoch": 2.09, + "grad_norm": 0.7059955596923828, + "learning_rate": 0.00012538147014024895, + "loss": 2.8159, + "step": 42713 + }, + { + "epoch": 2.09, + "grad_norm": 0.6868063807487488, + "learning_rate": 0.00012536895104149642, + "loss": 3.0604, + "step": 42714 + }, + { + "epoch": 2.09, + "grad_norm": 0.6989514231681824, + "learning_rate": 0.00012535643240268852, + "loss": 2.9723, + "step": 42715 + }, + { + "epoch": 2.09, + "grad_norm": 0.7568475008010864, + "learning_rate": 0.0001253439142238581, + "loss": 3.1044, + "step": 42716 + }, + { + "epoch": 2.09, + "grad_norm": 0.672238290309906, + "learning_rate": 0.0001253313965050383, + "loss": 2.7832, + "step": 42717 + }, + { + "epoch": 2.09, + "grad_norm": 0.7147806882858276, + "learning_rate": 0.000125318879246262, + "loss": 2.7591, + "step": 42718 + }, + { + "epoch": 2.09, + "grad_norm": 0.6832359433174133, + "learning_rate": 0.000125306362447562, + "loss": 2.829, + "step": 42719 + }, + { + "epoch": 2.09, + "grad_norm": 0.6849291324615479, + "learning_rate": 0.00012529384610897158, + "loss": 3.1068, + "step": 42720 + }, + { + "epoch": 2.09, + "grad_norm": 0.6654471755027771, + "learning_rate": 0.0001252813302305234, + "loss": 2.8561, + "step": 42721 + }, + { + "epoch": 2.09, + "grad_norm": 0.7064377069473267, + "learning_rate": 0.0001252688148122506, + "loss": 2.907, + "step": 42722 + }, + { + "epoch": 2.09, + "grad_norm": 0.665331244468689, + "learning_rate": 0.0001252562998541862, + "loss": 2.8152, + "step": 42723 + }, + { + "epoch": 2.09, + "grad_norm": 0.738166332244873, + "learning_rate": 0.0001252437853563631, + "loss": 2.8205, + "step": 42724 + }, + { + "epoch": 2.09, + "grad_norm": 0.7051920890808105, + "learning_rate": 0.0001252312713188142, + "loss": 2.7961, + "step": 42725 + }, + { + "epoch": 2.09, + "grad_norm": 0.7235719561576843, + "learning_rate": 0.00012521875774157242, + "loss": 3.0615, + "step": 42726 + }, + { + "epoch": 2.09, + "grad_norm": 0.7330026626586914, + "learning_rate": 0.00012520624462467077, + "loss": 2.9727, + "step": 42727 + }, + { + "epoch": 2.09, + "grad_norm": 0.6940485835075378, + "learning_rate": 0.00012519373196814238, + "loss": 2.731, + "step": 42728 + }, + { + "epoch": 2.09, + "grad_norm": 0.6567136645317078, + "learning_rate": 0.00012518121977201993, + "loss": 2.971, + "step": 42729 + }, + { + "epoch": 2.09, + "grad_norm": 0.6531281471252441, + "learning_rate": 0.0001251687080363366, + "loss": 2.893, + "step": 42730 + }, + { + "epoch": 2.09, + "grad_norm": 0.6377910375595093, + "learning_rate": 0.00012515619676112526, + "loss": 2.8912, + "step": 42731 + }, + { + "epoch": 2.09, + "grad_norm": 0.6578353643417358, + "learning_rate": 0.0001251436859464187, + "loss": 2.7222, + "step": 42732 + }, + { + "epoch": 2.09, + "grad_norm": 0.6784064173698425, + "learning_rate": 0.00012513117559225016, + "loss": 2.9927, + "step": 42733 + }, + { + "epoch": 2.09, + "grad_norm": 0.6903968453407288, + "learning_rate": 0.00012511866569865236, + "loss": 2.8334, + "step": 42734 + }, + { + "epoch": 2.09, + "grad_norm": 0.6765158772468567, + "learning_rate": 0.00012510615626565842, + "loss": 2.9354, + "step": 42735 + }, + { + "epoch": 2.09, + "grad_norm": 0.7345734238624573, + "learning_rate": 0.0001250936472933011, + "loss": 2.8646, + "step": 42736 + }, + { + "epoch": 2.09, + "grad_norm": 0.6818332076072693, + "learning_rate": 0.00012508113878161348, + "loss": 3.0219, + "step": 42737 + }, + { + "epoch": 2.09, + "grad_norm": 0.6421600580215454, + "learning_rate": 0.00012506863073062855, + "loss": 2.9526, + "step": 42738 + }, + { + "epoch": 2.09, + "grad_norm": 0.7006136178970337, + "learning_rate": 0.00012505612314037918, + "loss": 2.8891, + "step": 42739 + }, + { + "epoch": 2.09, + "grad_norm": 0.6548375487327576, + "learning_rate": 0.00012504361601089833, + "loss": 2.9922, + "step": 42740 + }, + { + "epoch": 2.09, + "grad_norm": 0.6888555884361267, + "learning_rate": 0.0001250311093422188, + "loss": 2.828, + "step": 42741 + }, + { + "epoch": 2.09, + "grad_norm": 0.7467426657676697, + "learning_rate": 0.00012501860313437368, + "loss": 3.1437, + "step": 42742 + }, + { + "epoch": 2.09, + "grad_norm": 0.6846618056297302, + "learning_rate": 0.00012500609738739596, + "loss": 2.9356, + "step": 42743 + }, + { + "epoch": 2.09, + "grad_norm": 0.6674962043762207, + "learning_rate": 0.00012499359210131842, + "loss": 2.9799, + "step": 42744 + }, + { + "epoch": 2.09, + "grad_norm": 0.6744292378425598, + "learning_rate": 0.00012498108727617416, + "loss": 2.8018, + "step": 42745 + }, + { + "epoch": 2.09, + "grad_norm": 0.6720545291900635, + "learning_rate": 0.00012496858291199608, + "loss": 2.9251, + "step": 42746 + }, + { + "epoch": 2.09, + "grad_norm": 0.702659547328949, + "learning_rate": 0.00012495607900881692, + "loss": 2.9121, + "step": 42747 + }, + { + "epoch": 2.09, + "grad_norm": 0.6851487159729004, + "learning_rate": 0.00012494357556666987, + "loss": 2.9867, + "step": 42748 + }, + { + "epoch": 2.1, + "grad_norm": 0.6916007995605469, + "learning_rate": 0.00012493107258558763, + "loss": 2.9223, + "step": 42749 + }, + { + "epoch": 2.1, + "grad_norm": 0.6949058771133423, + "learning_rate": 0.0001249185700656034, + "loss": 2.7184, + "step": 42750 + }, + { + "epoch": 2.1, + "grad_norm": 0.6698328256607056, + "learning_rate": 0.00012490606800674982, + "loss": 2.7598, + "step": 42751 + }, + { + "epoch": 2.1, + "grad_norm": 0.678553581237793, + "learning_rate": 0.00012489356640906, + "loss": 2.9062, + "step": 42752 + }, + { + "epoch": 2.1, + "grad_norm": 0.6635074615478516, + "learning_rate": 0.0001248810652725669, + "loss": 3.1156, + "step": 42753 + }, + { + "epoch": 2.1, + "grad_norm": 0.6807247400283813, + "learning_rate": 0.00012486856459730336, + "loss": 2.9481, + "step": 42754 + }, + { + "epoch": 2.1, + "grad_norm": 0.7227141261100769, + "learning_rate": 0.00012485606438330236, + "loss": 2.9745, + "step": 42755 + }, + { + "epoch": 2.1, + "grad_norm": 0.7141606211662292, + "learning_rate": 0.00012484356463059662, + "loss": 2.7561, + "step": 42756 + }, + { + "epoch": 2.1, + "grad_norm": 0.7132400870323181, + "learning_rate": 0.00012483106533921925, + "loss": 2.8926, + "step": 42757 + }, + { + "epoch": 2.1, + "grad_norm": 0.7027788162231445, + "learning_rate": 0.00012481856650920327, + "loss": 2.9338, + "step": 42758 + }, + { + "epoch": 2.1, + "grad_norm": 0.6948914527893066, + "learning_rate": 0.00012480606814058132, + "loss": 2.8984, + "step": 42759 + }, + { + "epoch": 2.1, + "grad_norm": 0.6889869570732117, + "learning_rate": 0.0001247935702333866, + "loss": 2.8425, + "step": 42760 + }, + { + "epoch": 2.1, + "grad_norm": 0.7095881104469299, + "learning_rate": 0.00012478107278765188, + "loss": 2.9341, + "step": 42761 + }, + { + "epoch": 2.1, + "grad_norm": 0.7111302018165588, + "learning_rate": 0.00012476857580341, + "loss": 2.8932, + "step": 42762 + }, + { + "epoch": 2.1, + "grad_norm": 0.6622797846794128, + "learning_rate": 0.00012475607928069408, + "loss": 2.7765, + "step": 42763 + }, + { + "epoch": 2.1, + "grad_norm": 0.6576592922210693, + "learning_rate": 0.00012474358321953681, + "loss": 2.9659, + "step": 42764 + }, + { + "epoch": 2.1, + "grad_norm": 0.68706876039505, + "learning_rate": 0.00012473108761997133, + "loss": 2.8225, + "step": 42765 + }, + { + "epoch": 2.1, + "grad_norm": 0.7006784081459045, + "learning_rate": 0.0001247185924820303, + "loss": 2.9659, + "step": 42766 + }, + { + "epoch": 2.1, + "grad_norm": 0.7574266195297241, + "learning_rate": 0.00012470609780574688, + "loss": 3.0306, + "step": 42767 + }, + { + "epoch": 2.1, + "grad_norm": 0.678636372089386, + "learning_rate": 0.00012469360359115389, + "loss": 2.8967, + "step": 42768 + }, + { + "epoch": 2.1, + "grad_norm": 0.7261688113212585, + "learning_rate": 0.00012468110983828408, + "loss": 2.8648, + "step": 42769 + }, + { + "epoch": 2.1, + "grad_norm": 0.6693586707115173, + "learning_rate": 0.00012466861654717058, + "loss": 3.0083, + "step": 42770 + }, + { + "epoch": 2.1, + "grad_norm": 0.699759840965271, + "learning_rate": 0.00012465612371784611, + "loss": 2.8289, + "step": 42771 + }, + { + "epoch": 2.1, + "grad_norm": 0.6655587553977966, + "learning_rate": 0.0001246436313503437, + "loss": 2.9072, + "step": 42772 + }, + { + "epoch": 2.1, + "grad_norm": 0.6945595741271973, + "learning_rate": 0.00012463113944469626, + "loss": 2.9781, + "step": 42773 + }, + { + "epoch": 2.1, + "grad_norm": 0.6992049217224121, + "learning_rate": 0.00012461864800093655, + "loss": 2.8408, + "step": 42774 + }, + { + "epoch": 2.1, + "grad_norm": 0.6617697477340698, + "learning_rate": 0.0001246061570190977, + "loss": 2.8496, + "step": 42775 + }, + { + "epoch": 2.1, + "grad_norm": 0.6566224098205566, + "learning_rate": 0.00012459366649921247, + "loss": 2.8566, + "step": 42776 + }, + { + "epoch": 2.1, + "grad_norm": 0.6914481520652771, + "learning_rate": 0.00012458117644131375, + "loss": 2.9731, + "step": 42777 + }, + { + "epoch": 2.1, + "grad_norm": 0.6699321866035461, + "learning_rate": 0.00012456868684543437, + "loss": 2.8621, + "step": 42778 + }, + { + "epoch": 2.1, + "grad_norm": 0.7015001177787781, + "learning_rate": 0.00012455619771160732, + "loss": 2.9381, + "step": 42779 + }, + { + "epoch": 2.1, + "grad_norm": 0.7836304306983948, + "learning_rate": 0.00012454370903986555, + "loss": 2.8052, + "step": 42780 + }, + { + "epoch": 2.1, + "grad_norm": 0.6836073994636536, + "learning_rate": 0.0001245312208302418, + "loss": 2.8882, + "step": 42781 + }, + { + "epoch": 2.1, + "grad_norm": 0.6994935870170593, + "learning_rate": 0.00012451873308276913, + "loss": 3.035, + "step": 42782 + }, + { + "epoch": 2.1, + "grad_norm": 0.6761261224746704, + "learning_rate": 0.00012450624579748035, + "loss": 2.9866, + "step": 42783 + }, + { + "epoch": 2.1, + "grad_norm": 0.6878436207771301, + "learning_rate": 0.00012449375897440824, + "loss": 2.9566, + "step": 42784 + }, + { + "epoch": 2.1, + "grad_norm": 0.6756868958473206, + "learning_rate": 0.0001244812726135859, + "loss": 2.941, + "step": 42785 + }, + { + "epoch": 2.1, + "grad_norm": 0.669620156288147, + "learning_rate": 0.00012446878671504603, + "loss": 2.9263, + "step": 42786 + }, + { + "epoch": 2.1, + "grad_norm": 0.6921489834785461, + "learning_rate": 0.00012445630127882166, + "loss": 2.9708, + "step": 42787 + }, + { + "epoch": 2.1, + "grad_norm": 0.7075048089027405, + "learning_rate": 0.00012444381630494553, + "loss": 2.9319, + "step": 42788 + }, + { + "epoch": 2.1, + "grad_norm": 0.7006393074989319, + "learning_rate": 0.0001244313317934507, + "loss": 2.8783, + "step": 42789 + }, + { + "epoch": 2.1, + "grad_norm": 0.698784351348877, + "learning_rate": 0.00012441884774436993, + "loss": 3.0471, + "step": 42790 + }, + { + "epoch": 2.1, + "grad_norm": 0.6695194840431213, + "learning_rate": 0.00012440636415773606, + "loss": 2.7919, + "step": 42791 + }, + { + "epoch": 2.1, + "grad_norm": 0.6728737354278564, + "learning_rate": 0.0001243938810335821, + "loss": 2.995, + "step": 42792 + }, + { + "epoch": 2.1, + "grad_norm": 0.6940301656723022, + "learning_rate": 0.00012438139837194076, + "loss": 2.8124, + "step": 42793 + }, + { + "epoch": 2.1, + "grad_norm": 0.683421790599823, + "learning_rate": 0.00012436891617284503, + "loss": 2.7807, + "step": 42794 + }, + { + "epoch": 2.1, + "grad_norm": 0.6776772141456604, + "learning_rate": 0.00012435643443632787, + "loss": 2.8754, + "step": 42795 + }, + { + "epoch": 2.1, + "grad_norm": 0.6897674202919006, + "learning_rate": 0.00012434395316242197, + "loss": 3.0021, + "step": 42796 + }, + { + "epoch": 2.1, + "grad_norm": 0.6728845238685608, + "learning_rate": 0.00012433147235116037, + "loss": 3.2454, + "step": 42797 + }, + { + "epoch": 2.1, + "grad_norm": 0.7249746918678284, + "learning_rate": 0.0001243189920025759, + "loss": 2.924, + "step": 42798 + }, + { + "epoch": 2.1, + "grad_norm": 0.7298779487609863, + "learning_rate": 0.00012430651211670129, + "loss": 3.0579, + "step": 42799 + }, + { + "epoch": 2.1, + "grad_norm": 0.8019329905509949, + "learning_rate": 0.00012429403269356956, + "loss": 3.0202, + "step": 42800 + }, + { + "epoch": 2.1, + "grad_norm": 0.7995365858078003, + "learning_rate": 0.00012428155373321348, + "loss": 2.7468, + "step": 42801 + }, + { + "epoch": 2.1, + "grad_norm": 0.672685980796814, + "learning_rate": 0.00012426907523566607, + "loss": 2.8585, + "step": 42802 + }, + { + "epoch": 2.1, + "grad_norm": 0.7194468379020691, + "learning_rate": 0.00012425659720096, + "loss": 2.9569, + "step": 42803 + }, + { + "epoch": 2.1, + "grad_norm": 0.728124737739563, + "learning_rate": 0.0001242441196291283, + "loss": 2.766, + "step": 42804 + }, + { + "epoch": 2.1, + "grad_norm": 0.7027937173843384, + "learning_rate": 0.00012423164252020377, + "loss": 2.9544, + "step": 42805 + }, + { + "epoch": 2.1, + "grad_norm": 0.7222430109977722, + "learning_rate": 0.00012421916587421918, + "loss": 2.8564, + "step": 42806 + }, + { + "epoch": 2.1, + "grad_norm": 0.693401038646698, + "learning_rate": 0.0001242066896912076, + "loss": 2.9747, + "step": 42807 + }, + { + "epoch": 2.1, + "grad_norm": 0.7571882605552673, + "learning_rate": 0.00012419421397120162, + "loss": 2.9809, + "step": 42808 + }, + { + "epoch": 2.1, + "grad_norm": 0.6618812680244446, + "learning_rate": 0.00012418173871423427, + "loss": 3.0709, + "step": 42809 + }, + { + "epoch": 2.1, + "grad_norm": 0.6777374148368835, + "learning_rate": 0.00012416926392033847, + "loss": 2.8907, + "step": 42810 + }, + { + "epoch": 2.1, + "grad_norm": 0.6922290325164795, + "learning_rate": 0.00012415678958954703, + "loss": 2.8615, + "step": 42811 + }, + { + "epoch": 2.1, + "grad_norm": 0.7021549940109253, + "learning_rate": 0.00012414431572189272, + "loss": 2.8617, + "step": 42812 + }, + { + "epoch": 2.1, + "grad_norm": 0.7654609084129333, + "learning_rate": 0.00012413184231740835, + "loss": 3.0118, + "step": 42813 + }, + { + "epoch": 2.1, + "grad_norm": 0.6596476435661316, + "learning_rate": 0.00012411936937612686, + "loss": 2.9509, + "step": 42814 + }, + { + "epoch": 2.1, + "grad_norm": 0.7117953896522522, + "learning_rate": 0.00012410689689808122, + "loss": 3.0015, + "step": 42815 + }, + { + "epoch": 2.1, + "grad_norm": 0.7276239991188049, + "learning_rate": 0.00012409442488330404, + "loss": 2.7217, + "step": 42816 + }, + { + "epoch": 2.1, + "grad_norm": 0.7083224058151245, + "learning_rate": 0.0001240819533318284, + "loss": 2.9467, + "step": 42817 + }, + { + "epoch": 2.1, + "grad_norm": 0.6544482111930847, + "learning_rate": 0.0001240694822436869, + "loss": 2.9033, + "step": 42818 + }, + { + "epoch": 2.1, + "grad_norm": 0.6893659234046936, + "learning_rate": 0.00012405701161891268, + "loss": 2.8934, + "step": 42819 + }, + { + "epoch": 2.1, + "grad_norm": 0.6625657677650452, + "learning_rate": 0.0001240445414575384, + "loss": 2.988, + "step": 42820 + }, + { + "epoch": 2.1, + "grad_norm": 0.6781535148620605, + "learning_rate": 0.00012403207175959688, + "loss": 2.9722, + "step": 42821 + }, + { + "epoch": 2.1, + "grad_norm": 0.6784567832946777, + "learning_rate": 0.00012401960252512108, + "loss": 2.8418, + "step": 42822 + }, + { + "epoch": 2.1, + "grad_norm": 0.6653459668159485, + "learning_rate": 0.0001240071337541437, + "loss": 2.7502, + "step": 42823 + }, + { + "epoch": 2.1, + "grad_norm": 0.7122622132301331, + "learning_rate": 0.00012399466544669763, + "loss": 2.9881, + "step": 42824 + }, + { + "epoch": 2.1, + "grad_norm": 0.6495645642280579, + "learning_rate": 0.00012398219760281586, + "loss": 2.9813, + "step": 42825 + }, + { + "epoch": 2.1, + "grad_norm": 0.6739984154701233, + "learning_rate": 0.0001239697302225311, + "loss": 2.9362, + "step": 42826 + }, + { + "epoch": 2.1, + "grad_norm": 0.7379757165908813, + "learning_rate": 0.00012395726330587619, + "loss": 2.8813, + "step": 42827 + }, + { + "epoch": 2.1, + "grad_norm": 0.6480869054794312, + "learning_rate": 0.00012394479685288385, + "loss": 2.7832, + "step": 42828 + }, + { + "epoch": 2.1, + "grad_norm": 0.7142587304115295, + "learning_rate": 0.00012393233086358706, + "loss": 2.8355, + "step": 42829 + }, + { + "epoch": 2.1, + "grad_norm": 0.6860468983650208, + "learning_rate": 0.00012391986533801872, + "loss": 2.9661, + "step": 42830 + }, + { + "epoch": 2.1, + "grad_norm": 0.6586563587188721, + "learning_rate": 0.00012390740027621146, + "loss": 2.6982, + "step": 42831 + }, + { + "epoch": 2.1, + "grad_norm": 0.6689882278442383, + "learning_rate": 0.00012389493567819832, + "loss": 3.0087, + "step": 42832 + }, + { + "epoch": 2.1, + "grad_norm": 0.6792545914649963, + "learning_rate": 0.00012388247154401205, + "loss": 3.1049, + "step": 42833 + }, + { + "epoch": 2.1, + "grad_norm": 0.7196153998374939, + "learning_rate": 0.0001238700078736853, + "loss": 3.0143, + "step": 42834 + }, + { + "epoch": 2.1, + "grad_norm": 0.6919540762901306, + "learning_rate": 0.00012385754466725122, + "loss": 2.8527, + "step": 42835 + }, + { + "epoch": 2.1, + "grad_norm": 0.6894993185997009, + "learning_rate": 0.00012384508192474233, + "loss": 2.8945, + "step": 42836 + }, + { + "epoch": 2.1, + "grad_norm": 0.6774792075157166, + "learning_rate": 0.00012383261964619171, + "loss": 2.8883, + "step": 42837 + }, + { + "epoch": 2.1, + "grad_norm": 0.6447596549987793, + "learning_rate": 0.00012382015783163195, + "loss": 3.1129, + "step": 42838 + }, + { + "epoch": 2.1, + "grad_norm": 0.6821197271347046, + "learning_rate": 0.000123807696481096, + "loss": 2.8552, + "step": 42839 + }, + { + "epoch": 2.1, + "grad_norm": 0.6983603835105896, + "learning_rate": 0.00012379523559461674, + "loss": 2.7279, + "step": 42840 + }, + { + "epoch": 2.1, + "grad_norm": 0.7073538303375244, + "learning_rate": 0.00012378277517222693, + "loss": 2.8989, + "step": 42841 + }, + { + "epoch": 2.1, + "grad_norm": 0.6931063532829285, + "learning_rate": 0.00012377031521395942, + "loss": 2.8065, + "step": 42842 + }, + { + "epoch": 2.1, + "grad_norm": 0.6739084124565125, + "learning_rate": 0.00012375785571984682, + "loss": 3.0719, + "step": 42843 + }, + { + "epoch": 2.1, + "grad_norm": 0.8324639797210693, + "learning_rate": 0.00012374539668992212, + "loss": 2.7407, + "step": 42844 + }, + { + "epoch": 2.1, + "grad_norm": 0.6686299443244934, + "learning_rate": 0.00012373293812421827, + "loss": 2.8201, + "step": 42845 + }, + { + "epoch": 2.1, + "grad_norm": 0.6707850694656372, + "learning_rate": 0.00012372048002276778, + "loss": 2.8545, + "step": 42846 + }, + { + "epoch": 2.1, + "grad_norm": 0.6706002354621887, + "learning_rate": 0.00012370802238560378, + "loss": 2.9133, + "step": 42847 + }, + { + "epoch": 2.1, + "grad_norm": 0.6939883828163147, + "learning_rate": 0.0001236955652127589, + "loss": 2.7663, + "step": 42848 + }, + { + "epoch": 2.1, + "grad_norm": 0.7302611470222473, + "learning_rate": 0.00012368310850426585, + "loss": 2.8755, + "step": 42849 + }, + { + "epoch": 2.1, + "grad_norm": 0.6648881435394287, + "learning_rate": 0.00012367065226015768, + "loss": 3.1254, + "step": 42850 + }, + { + "epoch": 2.1, + "grad_norm": 0.7274592518806458, + "learning_rate": 0.00012365819648046697, + "loss": 2.9416, + "step": 42851 + }, + { + "epoch": 2.1, + "grad_norm": 0.7390041947364807, + "learning_rate": 0.0001236457411652267, + "loss": 2.8705, + "step": 42852 + }, + { + "epoch": 2.1, + "grad_norm": 0.7336735129356384, + "learning_rate": 0.00012363328631446956, + "loss": 2.8143, + "step": 42853 + }, + { + "epoch": 2.1, + "grad_norm": 0.7421209216117859, + "learning_rate": 0.00012362083192822847, + "loss": 2.9359, + "step": 42854 + }, + { + "epoch": 2.1, + "grad_norm": 0.6754651665687561, + "learning_rate": 0.00012360837800653608, + "loss": 2.9147, + "step": 42855 + }, + { + "epoch": 2.1, + "grad_norm": 0.6838744282722473, + "learning_rate": 0.00012359592454942538, + "loss": 2.9293, + "step": 42856 + }, + { + "epoch": 2.1, + "grad_norm": 0.6924825310707092, + "learning_rate": 0.00012358347155692906, + "loss": 2.908, + "step": 42857 + }, + { + "epoch": 2.1, + "grad_norm": 0.6746190190315247, + "learning_rate": 0.00012357101902907983, + "loss": 3.1511, + "step": 42858 + }, + { + "epoch": 2.1, + "grad_norm": 0.7157896757125854, + "learning_rate": 0.00012355856696591065, + "loss": 3.0123, + "step": 42859 + }, + { + "epoch": 2.1, + "grad_norm": 0.6908437609672546, + "learning_rate": 0.00012354611536745416, + "loss": 2.7298, + "step": 42860 + }, + { + "epoch": 2.1, + "grad_norm": 0.6571826934814453, + "learning_rate": 0.00012353366423374323, + "loss": 2.7174, + "step": 42861 + }, + { + "epoch": 2.1, + "grad_norm": 0.7128993272781372, + "learning_rate": 0.00012352121356481083, + "loss": 2.9394, + "step": 42862 + }, + { + "epoch": 2.1, + "grad_norm": 0.7356405258178711, + "learning_rate": 0.00012350876336068953, + "loss": 2.8878, + "step": 42863 + }, + { + "epoch": 2.1, + "grad_norm": 0.7242211699485779, + "learning_rate": 0.0001234963136214122, + "loss": 3.0483, + "step": 42864 + }, + { + "epoch": 2.1, + "grad_norm": 0.6854356527328491, + "learning_rate": 0.0001234838643470115, + "loss": 2.7815, + "step": 42865 + }, + { + "epoch": 2.1, + "grad_norm": 0.7101714015007019, + "learning_rate": 0.00012347141553752035, + "loss": 3.0021, + "step": 42866 + }, + { + "epoch": 2.1, + "grad_norm": 0.6805014610290527, + "learning_rate": 0.0001234589671929716, + "loss": 2.8371, + "step": 42867 + }, + { + "epoch": 2.1, + "grad_norm": 0.70173180103302, + "learning_rate": 0.00012344651931339785, + "loss": 3.0057, + "step": 42868 + }, + { + "epoch": 2.1, + "grad_norm": 0.6723647117614746, + "learning_rate": 0.0001234340718988321, + "loss": 2.9304, + "step": 42869 + }, + { + "epoch": 2.1, + "grad_norm": 0.6882205009460449, + "learning_rate": 0.00012342162494930704, + "loss": 2.8999, + "step": 42870 + }, + { + "epoch": 2.1, + "grad_norm": 0.7191662788391113, + "learning_rate": 0.0001234091784648553, + "loss": 3.0633, + "step": 42871 + }, + { + "epoch": 2.1, + "grad_norm": 0.6749732494354248, + "learning_rate": 0.00012339673244550993, + "loss": 2.9349, + "step": 42872 + }, + { + "epoch": 2.1, + "grad_norm": 0.7107117772102356, + "learning_rate": 0.00012338428689130348, + "loss": 3.0515, + "step": 42873 + }, + { + "epoch": 2.1, + "grad_norm": 0.6865886449813843, + "learning_rate": 0.00012337184180226892, + "loss": 2.7808, + "step": 42874 + }, + { + "epoch": 2.1, + "grad_norm": 0.6923301219940186, + "learning_rate": 0.00012335939717843882, + "loss": 2.9724, + "step": 42875 + }, + { + "epoch": 2.1, + "grad_norm": 0.6706522107124329, + "learning_rate": 0.00012334695301984607, + "loss": 3.0265, + "step": 42876 + }, + { + "epoch": 2.1, + "grad_norm": 0.6823163628578186, + "learning_rate": 0.00012333450932652357, + "loss": 3.1459, + "step": 42877 + }, + { + "epoch": 2.1, + "grad_norm": 0.7010007500648499, + "learning_rate": 0.00012332206609850399, + "loss": 2.9557, + "step": 42878 + }, + { + "epoch": 2.1, + "grad_norm": 0.7058342695236206, + "learning_rate": 0.00012330962333582005, + "loss": 2.85, + "step": 42879 + }, + { + "epoch": 2.1, + "grad_norm": 0.7387443780899048, + "learning_rate": 0.00012329718103850447, + "loss": 2.7243, + "step": 42880 + }, + { + "epoch": 2.1, + "grad_norm": 0.6735854744911194, + "learning_rate": 0.0001232847392065901, + "loss": 2.9853, + "step": 42881 + }, + { + "epoch": 2.1, + "grad_norm": 0.7551968693733215, + "learning_rate": 0.00012327229784010982, + "loss": 2.844, + "step": 42882 + }, + { + "epoch": 2.1, + "grad_norm": 0.6592845916748047, + "learning_rate": 0.0001232598569390962, + "loss": 2.9419, + "step": 42883 + }, + { + "epoch": 2.1, + "grad_norm": 0.7430188655853271, + "learning_rate": 0.00012324741650358221, + "loss": 2.8737, + "step": 42884 + }, + { + "epoch": 2.1, + "grad_norm": 0.683355450630188, + "learning_rate": 0.0001232349765336005, + "loss": 2.8757, + "step": 42885 + }, + { + "epoch": 2.1, + "grad_norm": 0.7336805462837219, + "learning_rate": 0.00012322253702918374, + "loss": 2.922, + "step": 42886 + }, + { + "epoch": 2.1, + "grad_norm": 0.7294582724571228, + "learning_rate": 0.0001232100979903649, + "loss": 2.9502, + "step": 42887 + }, + { + "epoch": 2.1, + "grad_norm": 0.7106903195381165, + "learning_rate": 0.0001231976594171765, + "loss": 2.7697, + "step": 42888 + }, + { + "epoch": 2.1, + "grad_norm": 0.702437698841095, + "learning_rate": 0.00012318522130965162, + "loss": 3.0185, + "step": 42889 + }, + { + "epoch": 2.1, + "grad_norm": 0.673358142375946, + "learning_rate": 0.0001231727836678227, + "loss": 3.0657, + "step": 42890 + }, + { + "epoch": 2.1, + "grad_norm": 0.7490993142127991, + "learning_rate": 0.00012316034649172272, + "loss": 2.8019, + "step": 42891 + }, + { + "epoch": 2.1, + "grad_norm": 0.6476032137870789, + "learning_rate": 0.00012314790978138436, + "loss": 2.7008, + "step": 42892 + }, + { + "epoch": 2.1, + "grad_norm": 0.6794806122779846, + "learning_rate": 0.00012313547353684027, + "loss": 2.9429, + "step": 42893 + }, + { + "epoch": 2.1, + "grad_norm": 0.6979109048843384, + "learning_rate": 0.0001231230377581234, + "loss": 2.7561, + "step": 42894 + }, + { + "epoch": 2.1, + "grad_norm": 0.7019332647323608, + "learning_rate": 0.0001231106024452663, + "loss": 2.9071, + "step": 42895 + }, + { + "epoch": 2.1, + "grad_norm": 0.7101191878318787, + "learning_rate": 0.0001230981675983019, + "loss": 3.0134, + "step": 42896 + }, + { + "epoch": 2.1, + "grad_norm": 0.6974600553512573, + "learning_rate": 0.00012308573321726288, + "loss": 2.9611, + "step": 42897 + }, + { + "epoch": 2.1, + "grad_norm": 0.6884080171585083, + "learning_rate": 0.00012307329930218196, + "loss": 2.8888, + "step": 42898 + }, + { + "epoch": 2.1, + "grad_norm": 0.7027839422225952, + "learning_rate": 0.00012306086585309197, + "loss": 3.136, + "step": 42899 + }, + { + "epoch": 2.1, + "grad_norm": 0.6527525186538696, + "learning_rate": 0.00012304843287002566, + "loss": 2.8276, + "step": 42900 + }, + { + "epoch": 2.1, + "grad_norm": 0.7058283686637878, + "learning_rate": 0.00012303600035301558, + "loss": 2.8065, + "step": 42901 + }, + { + "epoch": 2.1, + "grad_norm": 0.6885665655136108, + "learning_rate": 0.00012302356830209477, + "loss": 2.69, + "step": 42902 + }, + { + "epoch": 2.1, + "grad_norm": 0.6902278661727905, + "learning_rate": 0.00012301113671729565, + "loss": 2.8309, + "step": 42903 + }, + { + "epoch": 2.1, + "grad_norm": 0.6766996383666992, + "learning_rate": 0.0001229987055986513, + "loss": 2.8522, + "step": 42904 + }, + { + "epoch": 2.1, + "grad_norm": 0.7072258591651917, + "learning_rate": 0.00012298627494619415, + "loss": 2.952, + "step": 42905 + }, + { + "epoch": 2.1, + "grad_norm": 0.7212955355644226, + "learning_rate": 0.0001229738447599572, + "loss": 2.9739, + "step": 42906 + }, + { + "epoch": 2.1, + "grad_norm": 0.6990088224411011, + "learning_rate": 0.0001229614150399731, + "loss": 2.8831, + "step": 42907 + }, + { + "epoch": 2.1, + "grad_norm": 0.6565753221511841, + "learning_rate": 0.0001229489857862744, + "loss": 3.0789, + "step": 42908 + }, + { + "epoch": 2.1, + "grad_norm": 0.7555392384529114, + "learning_rate": 0.00012293655699889414, + "loss": 2.8962, + "step": 42909 + }, + { + "epoch": 2.1, + "grad_norm": 0.6769063472747803, + "learning_rate": 0.0001229241286778648, + "loss": 2.825, + "step": 42910 + }, + { + "epoch": 2.1, + "grad_norm": 0.7057018876075745, + "learning_rate": 0.00012291170082321924, + "loss": 2.8159, + "step": 42911 + }, + { + "epoch": 2.1, + "grad_norm": 0.6804623007774353, + "learning_rate": 0.0001228992734349903, + "loss": 2.8989, + "step": 42912 + }, + { + "epoch": 2.1, + "grad_norm": 0.6761884093284607, + "learning_rate": 0.00012288684651321055, + "loss": 2.9359, + "step": 42913 + }, + { + "epoch": 2.1, + "grad_norm": 0.7361788153648376, + "learning_rate": 0.00012287442005791275, + "loss": 2.9682, + "step": 42914 + }, + { + "epoch": 2.1, + "grad_norm": 0.7586960792541504, + "learning_rate": 0.00012286199406912957, + "loss": 2.6563, + "step": 42915 + }, + { + "epoch": 2.1, + "grad_norm": 0.7192031145095825, + "learning_rate": 0.00012284956854689377, + "loss": 2.9592, + "step": 42916 + }, + { + "epoch": 2.1, + "grad_norm": 0.673794686794281, + "learning_rate": 0.00012283714349123825, + "loss": 2.9138, + "step": 42917 + }, + { + "epoch": 2.1, + "grad_norm": 0.6666536331176758, + "learning_rate": 0.00012282471890219548, + "loss": 2.8351, + "step": 42918 + }, + { + "epoch": 2.1, + "grad_norm": 0.6872618794441223, + "learning_rate": 0.0001228122947797984, + "loss": 3.0766, + "step": 42919 + }, + { + "epoch": 2.1, + "grad_norm": 0.6858128309249878, + "learning_rate": 0.00012279987112407954, + "loss": 2.9871, + "step": 42920 + }, + { + "epoch": 2.1, + "grad_norm": 0.6915315389633179, + "learning_rate": 0.00012278744793507184, + "loss": 2.796, + "step": 42921 + }, + { + "epoch": 2.1, + "grad_norm": 0.6912660002708435, + "learning_rate": 0.0001227750252128079, + "loss": 2.8063, + "step": 42922 + }, + { + "epoch": 2.1, + "grad_norm": 0.7054868340492249, + "learning_rate": 0.00012276260295732028, + "loss": 2.9651, + "step": 42923 + }, + { + "epoch": 2.1, + "grad_norm": 0.6961238980293274, + "learning_rate": 0.00012275018116864202, + "loss": 3.0532, + "step": 42924 + }, + { + "epoch": 2.1, + "grad_norm": 0.7195843458175659, + "learning_rate": 0.00012273775984680553, + "loss": 2.7511, + "step": 42925 + }, + { + "epoch": 2.1, + "grad_norm": 0.6970654129981995, + "learning_rate": 0.00012272533899184367, + "loss": 3.1432, + "step": 42926 + }, + { + "epoch": 2.1, + "grad_norm": 0.6991729736328125, + "learning_rate": 0.00012271291860378925, + "loss": 3.0489, + "step": 42927 + }, + { + "epoch": 2.1, + "grad_norm": 0.6705859303474426, + "learning_rate": 0.00012270049868267488, + "loss": 2.7341, + "step": 42928 + }, + { + "epoch": 2.1, + "grad_norm": 0.679558277130127, + "learning_rate": 0.0001226880792285333, + "loss": 2.8421, + "step": 42929 + }, + { + "epoch": 2.1, + "grad_norm": 0.7303120493888855, + "learning_rate": 0.00012267566024139705, + "loss": 3.0701, + "step": 42930 + }, + { + "epoch": 2.1, + "grad_norm": 0.6934303641319275, + "learning_rate": 0.00012266324172129913, + "loss": 2.879, + "step": 42931 + }, + { + "epoch": 2.1, + "grad_norm": 0.6617268323898315, + "learning_rate": 0.00012265082366827196, + "loss": 3.037, + "step": 42932 + }, + { + "epoch": 2.1, + "grad_norm": 0.7124339938163757, + "learning_rate": 0.0001226384060823484, + "loss": 2.8715, + "step": 42933 + }, + { + "epoch": 2.1, + "grad_norm": 0.710786759853363, + "learning_rate": 0.00012262598896356128, + "loss": 3.0071, + "step": 42934 + }, + { + "epoch": 2.1, + "grad_norm": 0.708827793598175, + "learning_rate": 0.00012261357231194313, + "loss": 3.0311, + "step": 42935 + }, + { + "epoch": 2.1, + "grad_norm": 0.7814995646476746, + "learning_rate": 0.00012260115612752672, + "loss": 3.0753, + "step": 42936 + }, + { + "epoch": 2.1, + "grad_norm": 0.6750069260597229, + "learning_rate": 0.0001225887404103446, + "loss": 2.8407, + "step": 42937 + }, + { + "epoch": 2.1, + "grad_norm": 0.6857645511627197, + "learning_rate": 0.0001225763251604296, + "loss": 3.0635, + "step": 42938 + }, + { + "epoch": 2.1, + "grad_norm": 0.7252575755119324, + "learning_rate": 0.00012256391037781454, + "loss": 3.1004, + "step": 42939 + }, + { + "epoch": 2.1, + "grad_norm": 0.671009361743927, + "learning_rate": 0.00012255149606253183, + "loss": 3.0177, + "step": 42940 + }, + { + "epoch": 2.1, + "grad_norm": 0.7091543674468994, + "learning_rate": 0.0001225390822146145, + "loss": 3.0017, + "step": 42941 + }, + { + "epoch": 2.1, + "grad_norm": 0.7315735816955566, + "learning_rate": 0.00012252666883409493, + "loss": 2.9222, + "step": 42942 + }, + { + "epoch": 2.1, + "grad_norm": 0.7119027376174927, + "learning_rate": 0.00012251425592100607, + "loss": 2.8792, + "step": 42943 + }, + { + "epoch": 2.1, + "grad_norm": 0.6784539222717285, + "learning_rate": 0.0001225018434753805, + "loss": 2.9135, + "step": 42944 + }, + { + "epoch": 2.1, + "grad_norm": 0.7480369806289673, + "learning_rate": 0.00012248943149725084, + "loss": 2.8613, + "step": 42945 + }, + { + "epoch": 2.1, + "grad_norm": 0.7432013154029846, + "learning_rate": 0.00012247701998664994, + "loss": 2.8941, + "step": 42946 + }, + { + "epoch": 2.1, + "grad_norm": 0.7087968587875366, + "learning_rate": 0.0001224646089436103, + "loss": 2.9759, + "step": 42947 + }, + { + "epoch": 2.1, + "grad_norm": 0.7181050181388855, + "learning_rate": 0.0001224521983681647, + "loss": 2.8121, + "step": 42948 + }, + { + "epoch": 2.1, + "grad_norm": 0.6602891087532043, + "learning_rate": 0.00012243978826034595, + "loss": 3.1781, + "step": 42949 + }, + { + "epoch": 2.1, + "grad_norm": 0.7048685550689697, + "learning_rate": 0.00012242737862018662, + "loss": 2.9432, + "step": 42950 + }, + { + "epoch": 2.1, + "grad_norm": 0.6855930685997009, + "learning_rate": 0.0001224149694477194, + "loss": 2.8298, + "step": 42951 + }, + { + "epoch": 2.1, + "grad_norm": 0.6815904378890991, + "learning_rate": 0.00012240256074297687, + "loss": 2.8817, + "step": 42952 + }, + { + "epoch": 2.11, + "grad_norm": 0.6977430582046509, + "learning_rate": 0.00012239015250599183, + "loss": 2.9694, + "step": 42953 + }, + { + "epoch": 2.11, + "grad_norm": 0.7615177631378174, + "learning_rate": 0.00012237774473679704, + "loss": 2.9469, + "step": 42954 + }, + { + "epoch": 2.11, + "grad_norm": 0.6653721332550049, + "learning_rate": 0.00012236533743542496, + "loss": 2.8976, + "step": 42955 + }, + { + "epoch": 2.11, + "grad_norm": 0.6686670184135437, + "learning_rate": 0.00012235293060190855, + "loss": 2.8846, + "step": 42956 + }, + { + "epoch": 2.11, + "grad_norm": 0.6878864765167236, + "learning_rate": 0.00012234052423628033, + "loss": 2.6982, + "step": 42957 + }, + { + "epoch": 2.11, + "grad_norm": 0.720576822757721, + "learning_rate": 0.00012232811833857283, + "loss": 2.9867, + "step": 42958 + }, + { + "epoch": 2.11, + "grad_norm": 0.686820387840271, + "learning_rate": 0.00012231571290881898, + "loss": 3.0024, + "step": 42959 + }, + { + "epoch": 2.11, + "grad_norm": 0.6706504821777344, + "learning_rate": 0.00012230330794705125, + "loss": 2.9889, + "step": 42960 + }, + { + "epoch": 2.11, + "grad_norm": 0.672160804271698, + "learning_rate": 0.00012229090345330254, + "loss": 3.0674, + "step": 42961 + }, + { + "epoch": 2.11, + "grad_norm": 0.7444672584533691, + "learning_rate": 0.00012227849942760528, + "loss": 2.9431, + "step": 42962 + }, + { + "epoch": 2.11, + "grad_norm": 0.7115716338157654, + "learning_rate": 0.00012226609586999223, + "loss": 2.8684, + "step": 42963 + }, + { + "epoch": 2.11, + "grad_norm": 0.768132746219635, + "learning_rate": 0.00012225369278049617, + "loss": 2.7876, + "step": 42964 + }, + { + "epoch": 2.11, + "grad_norm": 0.7231113314628601, + "learning_rate": 0.0001222412901591497, + "loss": 2.8806, + "step": 42965 + }, + { + "epoch": 2.11, + "grad_norm": 0.6965844035148621, + "learning_rate": 0.00012222888800598544, + "loss": 2.8743, + "step": 42966 + }, + { + "epoch": 2.11, + "grad_norm": 0.6945066452026367, + "learning_rate": 0.000122216486321036, + "loss": 2.7055, + "step": 42967 + }, + { + "epoch": 2.11, + "grad_norm": 0.675247073173523, + "learning_rate": 0.00012220408510433409, + "loss": 2.7753, + "step": 42968 + }, + { + "epoch": 2.11, + "grad_norm": 0.6834036707878113, + "learning_rate": 0.00012219168435591252, + "loss": 2.7571, + "step": 42969 + }, + { + "epoch": 2.11, + "grad_norm": 0.6711874008178711, + "learning_rate": 0.00012217928407580377, + "loss": 2.9645, + "step": 42970 + }, + { + "epoch": 2.11, + "grad_norm": 0.6867322325706482, + "learning_rate": 0.00012216688426404062, + "loss": 2.949, + "step": 42971 + }, + { + "epoch": 2.11, + "grad_norm": 0.713107168674469, + "learning_rate": 0.00012215448492065568, + "loss": 2.974, + "step": 42972 + }, + { + "epoch": 2.11, + "grad_norm": 0.6954575777053833, + "learning_rate": 0.00012214208604568148, + "loss": 2.8547, + "step": 42973 + }, + { + "epoch": 2.11, + "grad_norm": 0.7084396481513977, + "learning_rate": 0.00012212968763915095, + "loss": 3.0541, + "step": 42974 + }, + { + "epoch": 2.11, + "grad_norm": 0.6738825440406799, + "learning_rate": 0.00012211728970109643, + "loss": 2.9123, + "step": 42975 + }, + { + "epoch": 2.11, + "grad_norm": 0.730086624622345, + "learning_rate": 0.00012210489223155087, + "loss": 3.0752, + "step": 42976 + }, + { + "epoch": 2.11, + "grad_norm": 0.7121165990829468, + "learning_rate": 0.00012209249523054667, + "loss": 2.9596, + "step": 42977 + }, + { + "epoch": 2.11, + "grad_norm": 0.6749788522720337, + "learning_rate": 0.00012208009869811663, + "loss": 2.9951, + "step": 42978 + }, + { + "epoch": 2.11, + "grad_norm": 0.7120500206947327, + "learning_rate": 0.00012206770263429344, + "loss": 2.8199, + "step": 42979 + }, + { + "epoch": 2.11, + "grad_norm": 0.71989506483078, + "learning_rate": 0.00012205530703910967, + "loss": 3.0151, + "step": 42980 + }, + { + "epoch": 2.11, + "grad_norm": 0.698340117931366, + "learning_rate": 0.00012204291191259799, + "loss": 2.9014, + "step": 42981 + }, + { + "epoch": 2.11, + "grad_norm": 0.6799280047416687, + "learning_rate": 0.00012203051725479094, + "loss": 2.9977, + "step": 42982 + }, + { + "epoch": 2.11, + "grad_norm": 0.6946273446083069, + "learning_rate": 0.00012201812306572124, + "loss": 2.6704, + "step": 42983 + }, + { + "epoch": 2.11, + "grad_norm": 0.7306190133094788, + "learning_rate": 0.00012200572934542167, + "loss": 2.9042, + "step": 42984 + }, + { + "epoch": 2.11, + "grad_norm": 0.7344149947166443, + "learning_rate": 0.00012199333609392462, + "loss": 2.9512, + "step": 42985 + }, + { + "epoch": 2.11, + "grad_norm": 0.7163786292076111, + "learning_rate": 0.00012198094331126299, + "loss": 2.9448, + "step": 42986 + }, + { + "epoch": 2.11, + "grad_norm": 0.7278892397880554, + "learning_rate": 0.00012196855099746928, + "loss": 3.0489, + "step": 42987 + }, + { + "epoch": 2.11, + "grad_norm": 0.7170532941818237, + "learning_rate": 0.00012195615915257607, + "loss": 2.9299, + "step": 42988 + }, + { + "epoch": 2.11, + "grad_norm": 0.6998884677886963, + "learning_rate": 0.00012194376777661615, + "loss": 2.6829, + "step": 42989 + }, + { + "epoch": 2.11, + "grad_norm": 0.7183066606521606, + "learning_rate": 0.00012193137686962197, + "loss": 2.6611, + "step": 42990 + }, + { + "epoch": 2.11, + "grad_norm": 0.6729427576065063, + "learning_rate": 0.00012191898643162638, + "loss": 3.1633, + "step": 42991 + }, + { + "epoch": 2.11, + "grad_norm": 0.7029131054878235, + "learning_rate": 0.00012190659646266182, + "loss": 2.9584, + "step": 42992 + }, + { + "epoch": 2.11, + "grad_norm": 0.709124743938446, + "learning_rate": 0.0001218942069627611, + "loss": 3.1043, + "step": 42993 + }, + { + "epoch": 2.11, + "grad_norm": 0.640663743019104, + "learning_rate": 0.00012188181793195679, + "loss": 3.0283, + "step": 42994 + }, + { + "epoch": 2.11, + "grad_norm": 0.6382255554199219, + "learning_rate": 0.00012186942937028135, + "loss": 2.9777, + "step": 42995 + }, + { + "epoch": 2.11, + "grad_norm": 0.6668031215667725, + "learning_rate": 0.00012185704127776767, + "loss": 2.933, + "step": 42996 + }, + { + "epoch": 2.11, + "grad_norm": 0.6796296834945679, + "learning_rate": 0.00012184465365444815, + "loss": 2.7516, + "step": 42997 + }, + { + "epoch": 2.11, + "grad_norm": 0.6396820545196533, + "learning_rate": 0.00012183226650035555, + "loss": 2.989, + "step": 42998 + }, + { + "epoch": 2.11, + "grad_norm": 0.6721262335777283, + "learning_rate": 0.00012181987981552255, + "loss": 3.1257, + "step": 42999 + }, + { + "epoch": 2.11, + "grad_norm": 0.7053427696228027, + "learning_rate": 0.0001218074935999816, + "loss": 2.7152, + "step": 43000 + }, + { + "epoch": 2.11, + "grad_norm": 0.6978045105934143, + "learning_rate": 0.00012179510785376552, + "loss": 3.0351, + "step": 43001 + }, + { + "epoch": 2.11, + "grad_norm": 0.6725461483001709, + "learning_rate": 0.00012178272257690682, + "loss": 2.95, + "step": 43002 + }, + { + "epoch": 2.11, + "grad_norm": 0.7001270651817322, + "learning_rate": 0.00012177033776943805, + "loss": 2.9442, + "step": 43003 + }, + { + "epoch": 2.11, + "grad_norm": 0.7013981938362122, + "learning_rate": 0.00012175795343139199, + "loss": 2.8046, + "step": 43004 + }, + { + "epoch": 2.11, + "grad_norm": 0.7114749550819397, + "learning_rate": 0.00012174556956280109, + "loss": 2.9528, + "step": 43005 + }, + { + "epoch": 2.11, + "grad_norm": 0.687053918838501, + "learning_rate": 0.00012173318616369815, + "loss": 3.0319, + "step": 43006 + }, + { + "epoch": 2.11, + "grad_norm": 0.7071757912635803, + "learning_rate": 0.00012172080323411557, + "loss": 3.0435, + "step": 43007 + }, + { + "epoch": 2.11, + "grad_norm": 0.7466889023780823, + "learning_rate": 0.0001217084207740862, + "loss": 3.0157, + "step": 43008 + }, + { + "epoch": 2.11, + "grad_norm": 0.6620727777481079, + "learning_rate": 0.00012169603878364255, + "loss": 2.9545, + "step": 43009 + }, + { + "epoch": 2.11, + "grad_norm": 0.7407063245773315, + "learning_rate": 0.0001216836572628171, + "loss": 3.0802, + "step": 43010 + }, + { + "epoch": 2.11, + "grad_norm": 0.6958205699920654, + "learning_rate": 0.0001216712762116427, + "loss": 2.79, + "step": 43011 + }, + { + "epoch": 2.11, + "grad_norm": 0.7473426461219788, + "learning_rate": 0.00012165889563015171, + "loss": 2.844, + "step": 43012 + }, + { + "epoch": 2.11, + "grad_norm": 0.701392412185669, + "learning_rate": 0.00012164651551837698, + "loss": 3.0328, + "step": 43013 + }, + { + "epoch": 2.11, + "grad_norm": 0.7354457974433899, + "learning_rate": 0.00012163413587635091, + "loss": 3.0237, + "step": 43014 + }, + { + "epoch": 2.11, + "grad_norm": 0.6868544220924377, + "learning_rate": 0.0001216217567041063, + "loss": 3.0438, + "step": 43015 + }, + { + "epoch": 2.11, + "grad_norm": 0.6519982814788818, + "learning_rate": 0.00012160937800167564, + "loss": 2.9513, + "step": 43016 + }, + { + "epoch": 2.11, + "grad_norm": 0.6850939393043518, + "learning_rate": 0.00012159699976909144, + "loss": 2.9938, + "step": 43017 + }, + { + "epoch": 2.11, + "grad_norm": 0.6990240812301636, + "learning_rate": 0.00012158462200638655, + "loss": 3.0489, + "step": 43018 + }, + { + "epoch": 2.11, + "grad_norm": 0.6919039487838745, + "learning_rate": 0.0001215722447135933, + "loss": 2.9913, + "step": 43019 + }, + { + "epoch": 2.11, + "grad_norm": 0.6520496606826782, + "learning_rate": 0.00012155986789074439, + "loss": 2.8828, + "step": 43020 + }, + { + "epoch": 2.11, + "grad_norm": 0.6608403921127319, + "learning_rate": 0.00012154749153787257, + "loss": 2.7283, + "step": 43021 + }, + { + "epoch": 2.11, + "grad_norm": 0.6845476031303406, + "learning_rate": 0.00012153511565501023, + "loss": 3.0882, + "step": 43022 + }, + { + "epoch": 2.11, + "grad_norm": 0.6670951247215271, + "learning_rate": 0.00012152274024219016, + "loss": 2.8529, + "step": 43023 + }, + { + "epoch": 2.11, + "grad_norm": 0.684021532535553, + "learning_rate": 0.0001215103652994448, + "loss": 2.8678, + "step": 43024 + }, + { + "epoch": 2.11, + "grad_norm": 0.7129572033882141, + "learning_rate": 0.00012149799082680672, + "loss": 3.1019, + "step": 43025 + }, + { + "epoch": 2.11, + "grad_norm": 0.6857742667198181, + "learning_rate": 0.00012148561682430866, + "loss": 2.7703, + "step": 43026 + }, + { + "epoch": 2.11, + "grad_norm": 0.6729403138160706, + "learning_rate": 0.00012147324329198302, + "loss": 2.7764, + "step": 43027 + }, + { + "epoch": 2.11, + "grad_norm": 0.7263168692588806, + "learning_rate": 0.00012146087022986264, + "loss": 2.9186, + "step": 43028 + }, + { + "epoch": 2.11, + "grad_norm": 0.6728549003601074, + "learning_rate": 0.00012144849763797981, + "loss": 2.9232, + "step": 43029 + }, + { + "epoch": 2.11, + "grad_norm": 0.7353836894035339, + "learning_rate": 0.00012143612551636741, + "loss": 3.0423, + "step": 43030 + }, + { + "epoch": 2.11, + "grad_norm": 0.7096356749534607, + "learning_rate": 0.00012142375386505791, + "loss": 3.0581, + "step": 43031 + }, + { + "epoch": 2.11, + "grad_norm": 0.6555191874504089, + "learning_rate": 0.00012141138268408373, + "loss": 2.6584, + "step": 43032 + }, + { + "epoch": 2.11, + "grad_norm": 0.7258883118629456, + "learning_rate": 0.00012139901197347775, + "loss": 2.9691, + "step": 43033 + }, + { + "epoch": 2.11, + "grad_norm": 0.6590479016304016, + "learning_rate": 0.00012138664173327224, + "loss": 2.7755, + "step": 43034 + }, + { + "epoch": 2.11, + "grad_norm": 0.6529909372329712, + "learning_rate": 0.00012137427196349996, + "loss": 2.913, + "step": 43035 + }, + { + "epoch": 2.11, + "grad_norm": 0.6840808391571045, + "learning_rate": 0.00012136190266419361, + "loss": 2.9598, + "step": 43036 + }, + { + "epoch": 2.11, + "grad_norm": 0.6781811714172363, + "learning_rate": 0.0001213495338353856, + "loss": 2.9957, + "step": 43037 + }, + { + "epoch": 2.11, + "grad_norm": 0.7219123840332031, + "learning_rate": 0.00012133716547710852, + "loss": 3.1447, + "step": 43038 + }, + { + "epoch": 2.11, + "grad_norm": 0.6630083322525024, + "learning_rate": 0.00012132479758939487, + "loss": 2.7962, + "step": 43039 + }, + { + "epoch": 2.11, + "grad_norm": 0.6876730918884277, + "learning_rate": 0.00012131243017227734, + "loss": 2.7907, + "step": 43040 + }, + { + "epoch": 2.11, + "grad_norm": 0.6795483827590942, + "learning_rate": 0.00012130006322578856, + "loss": 2.9542, + "step": 43041 + }, + { + "epoch": 2.11, + "grad_norm": 0.6933289766311646, + "learning_rate": 0.00012128769674996093, + "loss": 2.8632, + "step": 43042 + }, + { + "epoch": 2.11, + "grad_norm": 0.7193173170089722, + "learning_rate": 0.00012127533074482718, + "loss": 2.946, + "step": 43043 + }, + { + "epoch": 2.11, + "grad_norm": 0.7053383588790894, + "learning_rate": 0.00012126296521041976, + "loss": 2.74, + "step": 43044 + }, + { + "epoch": 2.11, + "grad_norm": 0.6752833127975464, + "learning_rate": 0.00012125060014677137, + "loss": 2.6228, + "step": 43045 + }, + { + "epoch": 2.11, + "grad_norm": 0.6935940980911255, + "learning_rate": 0.00012123823555391449, + "loss": 3.0145, + "step": 43046 + }, + { + "epoch": 2.11, + "grad_norm": 0.6865079998970032, + "learning_rate": 0.00012122587143188162, + "loss": 2.8768, + "step": 43047 + }, + { + "epoch": 2.11, + "grad_norm": 0.6913439631462097, + "learning_rate": 0.00012121350778070547, + "loss": 2.8929, + "step": 43048 + }, + { + "epoch": 2.11, + "grad_norm": 0.7718340754508972, + "learning_rate": 0.00012120114460041844, + "loss": 3.1285, + "step": 43049 + }, + { + "epoch": 2.11, + "grad_norm": 0.6865830421447754, + "learning_rate": 0.00012118878189105318, + "loss": 2.8353, + "step": 43050 + }, + { + "epoch": 2.11, + "grad_norm": 0.6636896729469299, + "learning_rate": 0.00012117641965264238, + "loss": 2.7779, + "step": 43051 + }, + { + "epoch": 2.11, + "grad_norm": 0.7035448551177979, + "learning_rate": 0.00012116405788521848, + "loss": 3.0683, + "step": 43052 + }, + { + "epoch": 2.11, + "grad_norm": 0.6768655180931091, + "learning_rate": 0.00012115169658881401, + "loss": 3.105, + "step": 43053 + }, + { + "epoch": 2.11, + "grad_norm": 0.688899576663971, + "learning_rate": 0.00012113933576346143, + "loss": 2.7824, + "step": 43054 + }, + { + "epoch": 2.11, + "grad_norm": 0.7167072892189026, + "learning_rate": 0.00012112697540919346, + "loss": 2.7641, + "step": 43055 + }, + { + "epoch": 2.11, + "grad_norm": 0.7082327008247375, + "learning_rate": 0.0001211146155260427, + "loss": 2.829, + "step": 43056 + }, + { + "epoch": 2.11, + "grad_norm": 0.6907978653907776, + "learning_rate": 0.0001211022561140415, + "loss": 3.096, + "step": 43057 + }, + { + "epoch": 2.11, + "grad_norm": 0.6979928016662598, + "learning_rate": 0.00012108989717322266, + "loss": 2.9573, + "step": 43058 + }, + { + "epoch": 2.11, + "grad_norm": 0.7088404297828674, + "learning_rate": 0.00012107753870361857, + "loss": 2.769, + "step": 43059 + }, + { + "epoch": 2.11, + "grad_norm": 0.6992489695549011, + "learning_rate": 0.0001210651807052617, + "loss": 2.806, + "step": 43060 + }, + { + "epoch": 2.11, + "grad_norm": 0.6478296518325806, + "learning_rate": 0.00012105282317818484, + "loss": 2.7097, + "step": 43061 + }, + { + "epoch": 2.11, + "grad_norm": 0.6695234775543213, + "learning_rate": 0.00012104046612242028, + "loss": 3.1377, + "step": 43062 + }, + { + "epoch": 2.11, + "grad_norm": 0.6910408735275269, + "learning_rate": 0.0001210281095380008, + "loss": 2.951, + "step": 43063 + }, + { + "epoch": 2.11, + "grad_norm": 0.6836959719657898, + "learning_rate": 0.00012101575342495874, + "loss": 2.9718, + "step": 43064 + }, + { + "epoch": 2.11, + "grad_norm": 0.6675450801849365, + "learning_rate": 0.00012100339778332671, + "loss": 2.9584, + "step": 43065 + }, + { + "epoch": 2.11, + "grad_norm": 0.6959847807884216, + "learning_rate": 0.00012099104261313742, + "loss": 2.9065, + "step": 43066 + }, + { + "epoch": 2.11, + "grad_norm": 0.6748732924461365, + "learning_rate": 0.00012097868791442327, + "loss": 2.9597, + "step": 43067 + }, + { + "epoch": 2.11, + "grad_norm": 0.6882777810096741, + "learning_rate": 0.00012096633368721678, + "loss": 2.9002, + "step": 43068 + }, + { + "epoch": 2.11, + "grad_norm": 0.7168293595314026, + "learning_rate": 0.00012095397993155042, + "loss": 2.871, + "step": 43069 + }, + { + "epoch": 2.11, + "grad_norm": 0.73194420337677, + "learning_rate": 0.00012094162664745682, + "loss": 2.8233, + "step": 43070 + }, + { + "epoch": 2.11, + "grad_norm": 0.7441113591194153, + "learning_rate": 0.00012092927383496862, + "loss": 2.9003, + "step": 43071 + }, + { + "epoch": 2.11, + "grad_norm": 0.6556605696678162, + "learning_rate": 0.00012091692149411815, + "loss": 2.7515, + "step": 43072 + }, + { + "epoch": 2.11, + "grad_norm": 0.6657657623291016, + "learning_rate": 0.00012090456962493811, + "loss": 2.959, + "step": 43073 + }, + { + "epoch": 2.11, + "grad_norm": 0.6989374756813049, + "learning_rate": 0.00012089221822746101, + "loss": 3.0809, + "step": 43074 + }, + { + "epoch": 2.11, + "grad_norm": 0.6639280319213867, + "learning_rate": 0.00012087986730171923, + "loss": 2.8231, + "step": 43075 + }, + { + "epoch": 2.11, + "grad_norm": 0.7291037440299988, + "learning_rate": 0.00012086751684774548, + "loss": 2.933, + "step": 43076 + }, + { + "epoch": 2.11, + "grad_norm": 0.7148304581642151, + "learning_rate": 0.00012085516686557214, + "loss": 2.7072, + "step": 43077 + }, + { + "epoch": 2.11, + "grad_norm": 0.693949818611145, + "learning_rate": 0.00012084281735523191, + "loss": 2.8575, + "step": 43078 + }, + { + "epoch": 2.11, + "grad_norm": 0.7028672695159912, + "learning_rate": 0.00012083046831675711, + "loss": 2.8634, + "step": 43079 + }, + { + "epoch": 2.11, + "grad_norm": 0.7147821187973022, + "learning_rate": 0.0001208181197501804, + "loss": 2.8629, + "step": 43080 + }, + { + "epoch": 2.11, + "grad_norm": 0.706278383731842, + "learning_rate": 0.00012080577165553434, + "loss": 2.8998, + "step": 43081 + }, + { + "epoch": 2.11, + "grad_norm": 0.6842054128646851, + "learning_rate": 0.00012079342403285143, + "loss": 2.9569, + "step": 43082 + }, + { + "epoch": 2.11, + "grad_norm": 0.6698657870292664, + "learning_rate": 0.00012078107688216415, + "loss": 2.9066, + "step": 43083 + }, + { + "epoch": 2.11, + "grad_norm": 0.6596769690513611, + "learning_rate": 0.00012076873020350489, + "loss": 2.799, + "step": 43084 + }, + { + "epoch": 2.11, + "grad_norm": 0.7220476865768433, + "learning_rate": 0.00012075638399690642, + "loss": 2.8281, + "step": 43085 + }, + { + "epoch": 2.11, + "grad_norm": 0.7004108428955078, + "learning_rate": 0.00012074403826240103, + "loss": 2.7315, + "step": 43086 + }, + { + "epoch": 2.11, + "grad_norm": 0.68475341796875, + "learning_rate": 0.00012073169300002136, + "loss": 2.9782, + "step": 43087 + }, + { + "epoch": 2.11, + "grad_norm": 0.7112210988998413, + "learning_rate": 0.0001207193482098, + "loss": 2.8447, + "step": 43088 + }, + { + "epoch": 2.11, + "grad_norm": 0.6924461722373962, + "learning_rate": 0.00012070700389176937, + "loss": 2.9229, + "step": 43089 + }, + { + "epoch": 2.11, + "grad_norm": 0.6814031004905701, + "learning_rate": 0.000120694660045962, + "loss": 2.8308, + "step": 43090 + }, + { + "epoch": 2.11, + "grad_norm": 0.7307980060577393, + "learning_rate": 0.00012068231667241027, + "loss": 3.0753, + "step": 43091 + }, + { + "epoch": 2.11, + "grad_norm": 0.729032576084137, + "learning_rate": 0.0001206699737711468, + "loss": 2.8441, + "step": 43092 + }, + { + "epoch": 2.11, + "grad_norm": 0.6686051487922668, + "learning_rate": 0.00012065763134220422, + "loss": 2.9681, + "step": 43093 + }, + { + "epoch": 2.11, + "grad_norm": 0.7233748435974121, + "learning_rate": 0.00012064528938561483, + "loss": 2.8097, + "step": 43094 + }, + { + "epoch": 2.11, + "grad_norm": 0.6904696822166443, + "learning_rate": 0.00012063294790141134, + "loss": 2.8041, + "step": 43095 + }, + { + "epoch": 2.11, + "grad_norm": 0.6654314994812012, + "learning_rate": 0.00012062060688962611, + "loss": 2.9394, + "step": 43096 + }, + { + "epoch": 2.11, + "grad_norm": 0.6917867064476013, + "learning_rate": 0.0001206082663502916, + "loss": 2.9945, + "step": 43097 + }, + { + "epoch": 2.11, + "grad_norm": 0.6887367963790894, + "learning_rate": 0.00012059592628344051, + "loss": 2.899, + "step": 43098 + }, + { + "epoch": 2.11, + "grad_norm": 0.7017031311988831, + "learning_rate": 0.00012058358668910509, + "loss": 2.7895, + "step": 43099 + }, + { + "epoch": 2.11, + "grad_norm": 0.6768372654914856, + "learning_rate": 0.00012057124756731808, + "loss": 2.9466, + "step": 43100 + }, + { + "epoch": 2.11, + "grad_norm": 0.7897763252258301, + "learning_rate": 0.0001205589089181118, + "loss": 3.1558, + "step": 43101 + }, + { + "epoch": 2.11, + "grad_norm": 0.7289750576019287, + "learning_rate": 0.00012054657074151879, + "loss": 2.8482, + "step": 43102 + }, + { + "epoch": 2.11, + "grad_norm": 0.6732428073883057, + "learning_rate": 0.00012053423303757167, + "loss": 2.9046, + "step": 43103 + }, + { + "epoch": 2.11, + "grad_norm": 0.6708196997642517, + "learning_rate": 0.00012052189580630288, + "loss": 2.9676, + "step": 43104 + }, + { + "epoch": 2.11, + "grad_norm": 0.7545298933982849, + "learning_rate": 0.00012050955904774486, + "loss": 3.1395, + "step": 43105 + }, + { + "epoch": 2.11, + "grad_norm": 0.7323744893074036, + "learning_rate": 0.00012049722276192999, + "loss": 2.966, + "step": 43106 + }, + { + "epoch": 2.11, + "grad_norm": 0.6957453489303589, + "learning_rate": 0.0001204848869488909, + "loss": 2.8968, + "step": 43107 + }, + { + "epoch": 2.11, + "grad_norm": 0.6939908266067505, + "learning_rate": 0.0001204725516086602, + "loss": 2.7944, + "step": 43108 + }, + { + "epoch": 2.11, + "grad_norm": 0.6847617626190186, + "learning_rate": 0.00012046021674127011, + "loss": 2.9505, + "step": 43109 + }, + { + "epoch": 2.11, + "grad_norm": 0.7532222867012024, + "learning_rate": 0.0001204478823467534, + "loss": 2.8696, + "step": 43110 + }, + { + "epoch": 2.11, + "grad_norm": 0.6626277565956116, + "learning_rate": 0.00012043554842514238, + "loss": 3.0799, + "step": 43111 + }, + { + "epoch": 2.11, + "grad_norm": 0.6772818565368652, + "learning_rate": 0.00012042321497646946, + "loss": 2.9087, + "step": 43112 + }, + { + "epoch": 2.11, + "grad_norm": 0.6805033683776855, + "learning_rate": 0.00012041088200076734, + "loss": 2.8525, + "step": 43113 + }, + { + "epoch": 2.11, + "grad_norm": 0.7096461057662964, + "learning_rate": 0.00012039854949806833, + "loss": 2.9631, + "step": 43114 + }, + { + "epoch": 2.11, + "grad_norm": 0.7710531949996948, + "learning_rate": 0.00012038621746840501, + "loss": 3.0431, + "step": 43115 + }, + { + "epoch": 2.11, + "grad_norm": 0.6965796947479248, + "learning_rate": 0.00012037388591180978, + "loss": 2.9314, + "step": 43116 + }, + { + "epoch": 2.11, + "grad_norm": 0.6995751857757568, + "learning_rate": 0.00012036155482831526, + "loss": 2.9496, + "step": 43117 + }, + { + "epoch": 2.11, + "grad_norm": 0.6960641741752625, + "learning_rate": 0.00012034922421795378, + "loss": 3.0477, + "step": 43118 + }, + { + "epoch": 2.11, + "grad_norm": 0.6957606673240662, + "learning_rate": 0.00012033689408075783, + "loss": 2.8089, + "step": 43119 + }, + { + "epoch": 2.11, + "grad_norm": 0.7263481616973877, + "learning_rate": 0.00012032456441675999, + "loss": 3.0083, + "step": 43120 + }, + { + "epoch": 2.11, + "grad_norm": 0.6750003695487976, + "learning_rate": 0.0001203122352259926, + "loss": 2.727, + "step": 43121 + }, + { + "epoch": 2.11, + "grad_norm": 0.6566562056541443, + "learning_rate": 0.00012029990650848817, + "loss": 2.8773, + "step": 43122 + }, + { + "epoch": 2.11, + "grad_norm": 0.6913471221923828, + "learning_rate": 0.00012028757826427931, + "loss": 3.0402, + "step": 43123 + }, + { + "epoch": 2.11, + "grad_norm": 0.6827965974807739, + "learning_rate": 0.00012027525049339829, + "loss": 2.7393, + "step": 43124 + }, + { + "epoch": 2.11, + "grad_norm": 0.6656849980354309, + "learning_rate": 0.00012026292319587776, + "loss": 3.0347, + "step": 43125 + }, + { + "epoch": 2.11, + "grad_norm": 0.7655985355377197, + "learning_rate": 0.00012025059637175012, + "loss": 2.9697, + "step": 43126 + }, + { + "epoch": 2.11, + "grad_norm": 0.7039027810096741, + "learning_rate": 0.00012023827002104767, + "loss": 2.8746, + "step": 43127 + }, + { + "epoch": 2.11, + "grad_norm": 0.6682581901550293, + "learning_rate": 0.00012022594414380317, + "loss": 3.0307, + "step": 43128 + }, + { + "epoch": 2.11, + "grad_norm": 0.6695263981819153, + "learning_rate": 0.00012021361874004882, + "loss": 2.9126, + "step": 43129 + }, + { + "epoch": 2.11, + "grad_norm": 0.6603823900222778, + "learning_rate": 0.00012020129380981729, + "loss": 3.0492, + "step": 43130 + }, + { + "epoch": 2.11, + "grad_norm": 0.6570489406585693, + "learning_rate": 0.00012018896935314086, + "loss": 2.8269, + "step": 43131 + }, + { + "epoch": 2.11, + "grad_norm": 0.7224562764167786, + "learning_rate": 0.00012017664537005215, + "loss": 2.9248, + "step": 43132 + }, + { + "epoch": 2.11, + "grad_norm": 0.7442463636398315, + "learning_rate": 0.00012016432186058359, + "loss": 2.8481, + "step": 43133 + }, + { + "epoch": 2.11, + "grad_norm": 0.6883277893066406, + "learning_rate": 0.00012015199882476748, + "loss": 2.8931, + "step": 43134 + }, + { + "epoch": 2.11, + "grad_norm": 0.6893338561058044, + "learning_rate": 0.0001201396762626365, + "loss": 3.1737, + "step": 43135 + }, + { + "epoch": 2.11, + "grad_norm": 0.6887253522872925, + "learning_rate": 0.00012012735417422288, + "loss": 2.9977, + "step": 43136 + }, + { + "epoch": 2.11, + "grad_norm": 0.7243923544883728, + "learning_rate": 0.0001201150325595592, + "loss": 2.78, + "step": 43137 + }, + { + "epoch": 2.11, + "grad_norm": 0.6988846659660339, + "learning_rate": 0.000120102711418678, + "loss": 2.8849, + "step": 43138 + }, + { + "epoch": 2.11, + "grad_norm": 0.6805744171142578, + "learning_rate": 0.00012009039075161162, + "loss": 2.8165, + "step": 43139 + }, + { + "epoch": 2.11, + "grad_norm": 0.7949154376983643, + "learning_rate": 0.00012007807055839257, + "loss": 2.8517, + "step": 43140 + }, + { + "epoch": 2.11, + "grad_norm": 0.6621174216270447, + "learning_rate": 0.00012006575083905312, + "loss": 2.9517, + "step": 43141 + }, + { + "epoch": 2.11, + "grad_norm": 0.702193021774292, + "learning_rate": 0.00012005343159362588, + "loss": 2.8943, + "step": 43142 + }, + { + "epoch": 2.11, + "grad_norm": 0.6978039145469666, + "learning_rate": 0.00012004111282214336, + "loss": 2.9092, + "step": 43143 + }, + { + "epoch": 2.11, + "grad_norm": 0.7211534976959229, + "learning_rate": 0.0001200287945246378, + "loss": 2.7052, + "step": 43144 + }, + { + "epoch": 2.11, + "grad_norm": 0.7075209617614746, + "learning_rate": 0.00012001647670114186, + "loss": 3.0433, + "step": 43145 + }, + { + "epoch": 2.11, + "grad_norm": 0.7127019166946411, + "learning_rate": 0.0001200041593516878, + "loss": 3.0556, + "step": 43146 + }, + { + "epoch": 2.11, + "grad_norm": 0.728064239025116, + "learning_rate": 0.00011999184247630822, + "loss": 3.0178, + "step": 43147 + }, + { + "epoch": 2.11, + "grad_norm": 0.6802637577056885, + "learning_rate": 0.00011997952607503553, + "loss": 2.7863, + "step": 43148 + }, + { + "epoch": 2.11, + "grad_norm": 0.716955840587616, + "learning_rate": 0.00011996721014790195, + "loss": 2.9714, + "step": 43149 + }, + { + "epoch": 2.11, + "grad_norm": 0.772565484046936, + "learning_rate": 0.00011995489469494026, + "loss": 2.9096, + "step": 43150 + }, + { + "epoch": 2.11, + "grad_norm": 0.6997973322868347, + "learning_rate": 0.00011994257971618261, + "loss": 2.8717, + "step": 43151 + }, + { + "epoch": 2.11, + "grad_norm": 0.7149245142936707, + "learning_rate": 0.00011993026521166154, + "loss": 2.8153, + "step": 43152 + }, + { + "epoch": 2.11, + "grad_norm": 0.7250232696533203, + "learning_rate": 0.0001199179511814096, + "loss": 2.9153, + "step": 43153 + }, + { + "epoch": 2.11, + "grad_norm": 0.7074378728866577, + "learning_rate": 0.0001199056376254591, + "loss": 3.0232, + "step": 43154 + }, + { + "epoch": 2.11, + "grad_norm": 0.7458380460739136, + "learning_rate": 0.0001198933245438425, + "loss": 2.7417, + "step": 43155 + }, + { + "epoch": 2.11, + "grad_norm": 0.6658608913421631, + "learning_rate": 0.00011988101193659214, + "loss": 2.7459, + "step": 43156 + }, + { + "epoch": 2.12, + "grad_norm": 0.698132336139679, + "learning_rate": 0.00011986869980374052, + "loss": 3.1057, + "step": 43157 + }, + { + "epoch": 2.12, + "grad_norm": 0.6716579794883728, + "learning_rate": 0.00011985638814532016, + "loss": 3.0451, + "step": 43158 + }, + { + "epoch": 2.12, + "grad_norm": 0.7294808030128479, + "learning_rate": 0.00011984407696136332, + "loss": 3.0347, + "step": 43159 + }, + { + "epoch": 2.12, + "grad_norm": 0.7262159585952759, + "learning_rate": 0.00011983176625190261, + "loss": 2.9116, + "step": 43160 + }, + { + "epoch": 2.12, + "grad_norm": 0.7426044344902039, + "learning_rate": 0.00011981945601697036, + "loss": 3.1963, + "step": 43161 + }, + { + "epoch": 2.12, + "grad_norm": 0.6831039190292358, + "learning_rate": 0.0001198071462565989, + "loss": 2.8845, + "step": 43162 + }, + { + "epoch": 2.12, + "grad_norm": 0.717583954334259, + "learning_rate": 0.0001197948369708208, + "loss": 2.8779, + "step": 43163 + }, + { + "epoch": 2.12, + "grad_norm": 0.7191320061683655, + "learning_rate": 0.00011978252815966833, + "loss": 2.7909, + "step": 43164 + }, + { + "epoch": 2.12, + "grad_norm": 0.7239760756492615, + "learning_rate": 0.00011977021982317412, + "loss": 3.0215, + "step": 43165 + }, + { + "epoch": 2.12, + "grad_norm": 0.7211264967918396, + "learning_rate": 0.00011975791196137032, + "loss": 3.0887, + "step": 43166 + }, + { + "epoch": 2.12, + "grad_norm": 0.7530267238616943, + "learning_rate": 0.00011974560457428965, + "loss": 2.6428, + "step": 43167 + }, + { + "epoch": 2.12, + "grad_norm": 0.7176784873008728, + "learning_rate": 0.00011973329766196422, + "loss": 3.0019, + "step": 43168 + }, + { + "epoch": 2.12, + "grad_norm": 0.6822056174278259, + "learning_rate": 0.0001197209912244267, + "loss": 3.0091, + "step": 43169 + }, + { + "epoch": 2.12, + "grad_norm": 0.7103027701377869, + "learning_rate": 0.00011970868526170942, + "loss": 2.9864, + "step": 43170 + }, + { + "epoch": 2.12, + "grad_norm": 0.6895686984062195, + "learning_rate": 0.00011969637977384464, + "loss": 3.0177, + "step": 43171 + }, + { + "epoch": 2.12, + "grad_norm": 0.7093585729598999, + "learning_rate": 0.00011968407476086502, + "loss": 3.0156, + "step": 43172 + }, + { + "epoch": 2.12, + "grad_norm": 0.7251325249671936, + "learning_rate": 0.0001196717702228027, + "loss": 2.7788, + "step": 43173 + }, + { + "epoch": 2.12, + "grad_norm": 0.7053051590919495, + "learning_rate": 0.00011965946615969028, + "loss": 3.0574, + "step": 43174 + }, + { + "epoch": 2.12, + "grad_norm": 0.6850510239601135, + "learning_rate": 0.00011964716257156019, + "loss": 3.1772, + "step": 43175 + }, + { + "epoch": 2.12, + "grad_norm": 0.768746554851532, + "learning_rate": 0.00011963485945844477, + "loss": 2.9536, + "step": 43176 + }, + { + "epoch": 2.12, + "grad_norm": 0.7009607553482056, + "learning_rate": 0.00011962255682037643, + "loss": 2.8981, + "step": 43177 + }, + { + "epoch": 2.12, + "grad_norm": 0.714339017868042, + "learning_rate": 0.00011961025465738745, + "loss": 2.7134, + "step": 43178 + }, + { + "epoch": 2.12, + "grad_norm": 0.6838600039482117, + "learning_rate": 0.00011959795296951032, + "loss": 2.9917, + "step": 43179 + }, + { + "epoch": 2.12, + "grad_norm": 0.6880089640617371, + "learning_rate": 0.00011958565175677759, + "loss": 2.663, + "step": 43180 + }, + { + "epoch": 2.12, + "grad_norm": 0.7251917123794556, + "learning_rate": 0.00011957335101922145, + "loss": 3.0461, + "step": 43181 + }, + { + "epoch": 2.12, + "grad_norm": 0.7145901918411255, + "learning_rate": 0.00011956105075687445, + "loss": 3.0669, + "step": 43182 + }, + { + "epoch": 2.12, + "grad_norm": 0.7030085325241089, + "learning_rate": 0.00011954875096976883, + "loss": 3.0314, + "step": 43183 + }, + { + "epoch": 2.12, + "grad_norm": 0.6625970005989075, + "learning_rate": 0.00011953645165793717, + "loss": 2.8142, + "step": 43184 + }, + { + "epoch": 2.12, + "grad_norm": 0.718930184841156, + "learning_rate": 0.00011952415282141176, + "loss": 2.8112, + "step": 43185 + }, + { + "epoch": 2.12, + "grad_norm": 0.7372531294822693, + "learning_rate": 0.0001195118544602249, + "loss": 2.9687, + "step": 43186 + }, + { + "epoch": 2.12, + "grad_norm": 0.714538037776947, + "learning_rate": 0.00011949955657440918, + "loss": 2.6542, + "step": 43187 + }, + { + "epoch": 2.12, + "grad_norm": 0.6826031804084778, + "learning_rate": 0.0001194872591639968, + "loss": 2.8859, + "step": 43188 + }, + { + "epoch": 2.12, + "grad_norm": 0.7082632780075073, + "learning_rate": 0.0001194749622290202, + "loss": 2.933, + "step": 43189 + }, + { + "epoch": 2.12, + "grad_norm": 0.6917764544487, + "learning_rate": 0.00011946266576951195, + "loss": 3.0133, + "step": 43190 + }, + { + "epoch": 2.12, + "grad_norm": 0.7382910251617432, + "learning_rate": 0.00011945036978550428, + "loss": 2.9339, + "step": 43191 + }, + { + "epoch": 2.12, + "grad_norm": 0.7105042338371277, + "learning_rate": 0.00011943807427702957, + "loss": 3.0067, + "step": 43192 + }, + { + "epoch": 2.12, + "grad_norm": 0.6918284296989441, + "learning_rate": 0.00011942577924412016, + "loss": 3.0175, + "step": 43193 + }, + { + "epoch": 2.12, + "grad_norm": 0.7218431830406189, + "learning_rate": 0.00011941348468680847, + "loss": 2.8759, + "step": 43194 + }, + { + "epoch": 2.12, + "grad_norm": 0.695606529712677, + "learning_rate": 0.00011940119060512702, + "loss": 2.9613, + "step": 43195 + }, + { + "epoch": 2.12, + "grad_norm": 0.7161937952041626, + "learning_rate": 0.00011938889699910794, + "loss": 2.9276, + "step": 43196 + }, + { + "epoch": 2.12, + "grad_norm": 0.6780910491943359, + "learning_rate": 0.00011937660386878388, + "loss": 2.9779, + "step": 43197 + }, + { + "epoch": 2.12, + "grad_norm": 0.6835953593254089, + "learning_rate": 0.00011936431121418708, + "loss": 2.7212, + "step": 43198 + }, + { + "epoch": 2.12, + "grad_norm": 0.7107114791870117, + "learning_rate": 0.00011935201903534981, + "loss": 3.214, + "step": 43199 + }, + { + "epoch": 2.12, + "grad_norm": 0.6647638082504272, + "learning_rate": 0.00011933972733230469, + "loss": 2.9332, + "step": 43200 + }, + { + "epoch": 2.12, + "grad_norm": 0.7023340463638306, + "learning_rate": 0.00011932743610508383, + "loss": 3.1139, + "step": 43201 + }, + { + "epoch": 2.12, + "grad_norm": 0.7224500775337219, + "learning_rate": 0.00011931514535371983, + "loss": 2.8405, + "step": 43202 + }, + { + "epoch": 2.12, + "grad_norm": 0.6990615725517273, + "learning_rate": 0.00011930285507824487, + "loss": 2.9217, + "step": 43203 + }, + { + "epoch": 2.12, + "grad_norm": 0.7267166972160339, + "learning_rate": 0.0001192905652786914, + "loss": 2.9494, + "step": 43204 + }, + { + "epoch": 2.12, + "grad_norm": 0.6758955717086792, + "learning_rate": 0.00011927827595509191, + "loss": 3.1795, + "step": 43205 + }, + { + "epoch": 2.12, + "grad_norm": 0.7026833295822144, + "learning_rate": 0.00011926598710747867, + "loss": 3.0531, + "step": 43206 + }, + { + "epoch": 2.12, + "grad_norm": 0.7141755819320679, + "learning_rate": 0.00011925369873588404, + "loss": 2.865, + "step": 43207 + }, + { + "epoch": 2.12, + "grad_norm": 0.6519131064414978, + "learning_rate": 0.00011924141084034027, + "loss": 2.8592, + "step": 43208 + }, + { + "epoch": 2.12, + "grad_norm": 0.6646400690078735, + "learning_rate": 0.00011922912342087986, + "loss": 2.6698, + "step": 43209 + }, + { + "epoch": 2.12, + "grad_norm": 0.7201839685440063, + "learning_rate": 0.00011921683647753523, + "loss": 2.891, + "step": 43210 + }, + { + "epoch": 2.12, + "grad_norm": 0.7078610062599182, + "learning_rate": 0.00011920455001033855, + "loss": 2.9393, + "step": 43211 + }, + { + "epoch": 2.12, + "grad_norm": 0.7181376218795776, + "learning_rate": 0.0001191922640193224, + "loss": 2.7381, + "step": 43212 + }, + { + "epoch": 2.12, + "grad_norm": 0.6957877278327942, + "learning_rate": 0.00011917997850451902, + "loss": 2.9347, + "step": 43213 + }, + { + "epoch": 2.12, + "grad_norm": 0.7163203358650208, + "learning_rate": 0.00011916769346596069, + "loss": 3.1533, + "step": 43214 + }, + { + "epoch": 2.12, + "grad_norm": 0.7181594967842102, + "learning_rate": 0.00011915540890367997, + "loss": 2.8268, + "step": 43215 + }, + { + "epoch": 2.12, + "grad_norm": 0.6955720782279968, + "learning_rate": 0.00011914312481770898, + "loss": 3.059, + "step": 43216 + }, + { + "epoch": 2.12, + "grad_norm": 0.6729768514633179, + "learning_rate": 0.00011913084120808029, + "loss": 2.8435, + "step": 43217 + }, + { + "epoch": 2.12, + "grad_norm": 0.7270863056182861, + "learning_rate": 0.00011911855807482606, + "loss": 2.7089, + "step": 43218 + }, + { + "epoch": 2.12, + "grad_norm": 0.6758031845092773, + "learning_rate": 0.00011910627541797882, + "loss": 2.683, + "step": 43219 + }, + { + "epoch": 2.12, + "grad_norm": 0.6879879236221313, + "learning_rate": 0.00011909399323757088, + "loss": 3.0191, + "step": 43220 + }, + { + "epoch": 2.12, + "grad_norm": 0.6507390737533569, + "learning_rate": 0.00011908171153363439, + "loss": 2.9589, + "step": 43221 + }, + { + "epoch": 2.12, + "grad_norm": 0.7297090291976929, + "learning_rate": 0.000119069430306202, + "loss": 3.0383, + "step": 43222 + }, + { + "epoch": 2.12, + "grad_norm": 0.6910653114318848, + "learning_rate": 0.00011905714955530578, + "loss": 2.8401, + "step": 43223 + }, + { + "epoch": 2.12, + "grad_norm": 0.7071283459663391, + "learning_rate": 0.00011904486928097823, + "loss": 2.9656, + "step": 43224 + }, + { + "epoch": 2.12, + "grad_norm": 0.6900044083595276, + "learning_rate": 0.00011903258948325173, + "loss": 2.885, + "step": 43225 + }, + { + "epoch": 2.12, + "grad_norm": 0.7246055006980896, + "learning_rate": 0.0001190203101621585, + "loss": 3.0716, + "step": 43226 + }, + { + "epoch": 2.12, + "grad_norm": 0.6860259175300598, + "learning_rate": 0.00011900803131773104, + "loss": 3.0308, + "step": 43227 + }, + { + "epoch": 2.12, + "grad_norm": 0.6874629855155945, + "learning_rate": 0.00011899575295000155, + "loss": 2.9469, + "step": 43228 + }, + { + "epoch": 2.12, + "grad_norm": 0.6858367919921875, + "learning_rate": 0.00011898347505900236, + "loss": 2.8059, + "step": 43229 + }, + { + "epoch": 2.12, + "grad_norm": 0.6908622980117798, + "learning_rate": 0.00011897119764476593, + "loss": 2.7786, + "step": 43230 + }, + { + "epoch": 2.12, + "grad_norm": 0.729853630065918, + "learning_rate": 0.00011895892070732445, + "loss": 2.9305, + "step": 43231 + }, + { + "epoch": 2.12, + "grad_norm": 0.6964291334152222, + "learning_rate": 0.00011894664424671043, + "loss": 2.8263, + "step": 43232 + }, + { + "epoch": 2.12, + "grad_norm": 0.7044500112533569, + "learning_rate": 0.00011893436826295597, + "loss": 2.928, + "step": 43233 + }, + { + "epoch": 2.12, + "grad_norm": 0.7081640958786011, + "learning_rate": 0.00011892209275609367, + "loss": 2.9927, + "step": 43234 + }, + { + "epoch": 2.12, + "grad_norm": 0.7295190095901489, + "learning_rate": 0.00011890981772615573, + "loss": 3.0935, + "step": 43235 + }, + { + "epoch": 2.12, + "grad_norm": 0.6798872351646423, + "learning_rate": 0.00011889754317317436, + "loss": 2.8526, + "step": 43236 + }, + { + "epoch": 2.12, + "grad_norm": 0.7571244835853577, + "learning_rate": 0.00011888526909718213, + "loss": 2.9359, + "step": 43237 + }, + { + "epoch": 2.12, + "grad_norm": 0.7170717120170593, + "learning_rate": 0.00011887299549821114, + "loss": 2.925, + "step": 43238 + }, + { + "epoch": 2.12, + "grad_norm": 0.7144536375999451, + "learning_rate": 0.00011886072237629378, + "loss": 2.9109, + "step": 43239 + }, + { + "epoch": 2.12, + "grad_norm": 0.7104244232177734, + "learning_rate": 0.00011884844973146254, + "loss": 2.8062, + "step": 43240 + }, + { + "epoch": 2.12, + "grad_norm": 0.672351062297821, + "learning_rate": 0.00011883617756374963, + "loss": 2.9023, + "step": 43241 + }, + { + "epoch": 2.12, + "grad_norm": 0.6850356459617615, + "learning_rate": 0.00011882390587318737, + "loss": 3.0001, + "step": 43242 + }, + { + "epoch": 2.12, + "grad_norm": 0.7461333870887756, + "learning_rate": 0.00011881163465980797, + "loss": 2.7922, + "step": 43243 + }, + { + "epoch": 2.12, + "grad_norm": 0.6776725053787231, + "learning_rate": 0.00011879936392364395, + "loss": 3.152, + "step": 43244 + }, + { + "epoch": 2.12, + "grad_norm": 0.7232372760772705, + "learning_rate": 0.00011878709366472742, + "loss": 2.9213, + "step": 43245 + }, + { + "epoch": 2.12, + "grad_norm": 0.6616763472557068, + "learning_rate": 0.00011877482388309084, + "loss": 3.0254, + "step": 43246 + }, + { + "epoch": 2.12, + "grad_norm": 0.6551421880722046, + "learning_rate": 0.00011876255457876656, + "loss": 2.9206, + "step": 43247 + }, + { + "epoch": 2.12, + "grad_norm": 0.712017834186554, + "learning_rate": 0.00011875028575178676, + "loss": 2.8227, + "step": 43248 + }, + { + "epoch": 2.12, + "grad_norm": 0.6586719751358032, + "learning_rate": 0.00011873801740218392, + "loss": 3.1015, + "step": 43249 + }, + { + "epoch": 2.12, + "grad_norm": 0.7056365013122559, + "learning_rate": 0.00011872574952999024, + "loss": 2.7413, + "step": 43250 + }, + { + "epoch": 2.12, + "grad_norm": 0.6891767978668213, + "learning_rate": 0.00011871348213523796, + "loss": 2.8436, + "step": 43251 + }, + { + "epoch": 2.12, + "grad_norm": 0.7506184577941895, + "learning_rate": 0.0001187012152179596, + "loss": 2.9205, + "step": 43252 + }, + { + "epoch": 2.12, + "grad_norm": 0.7506493926048279, + "learning_rate": 0.00011868894877818721, + "loss": 2.7396, + "step": 43253 + }, + { + "epoch": 2.12, + "grad_norm": 0.7198980450630188, + "learning_rate": 0.00011867668281595336, + "loss": 2.991, + "step": 43254 + }, + { + "epoch": 2.12, + "grad_norm": 0.6733829975128174, + "learning_rate": 0.00011866441733129013, + "loss": 2.8639, + "step": 43255 + }, + { + "epoch": 2.12, + "grad_norm": 0.726628839969635, + "learning_rate": 0.00011865215232423003, + "loss": 2.8438, + "step": 43256 + }, + { + "epoch": 2.12, + "grad_norm": 0.6809700727462769, + "learning_rate": 0.00011863988779480524, + "loss": 2.8867, + "step": 43257 + }, + { + "epoch": 2.12, + "grad_norm": 0.6893267631530762, + "learning_rate": 0.00011862762374304798, + "loss": 3.0118, + "step": 43258 + }, + { + "epoch": 2.12, + "grad_norm": 0.7157814502716064, + "learning_rate": 0.00011861536016899074, + "loss": 3.1546, + "step": 43259 + }, + { + "epoch": 2.12, + "grad_norm": 0.6700130701065063, + "learning_rate": 0.00011860309707266568, + "loss": 3.0809, + "step": 43260 + }, + { + "epoch": 2.12, + "grad_norm": 0.6558499932289124, + "learning_rate": 0.00011859083445410513, + "loss": 2.9951, + "step": 43261 + }, + { + "epoch": 2.12, + "grad_norm": 0.7448561787605286, + "learning_rate": 0.0001185785723133415, + "loss": 2.9421, + "step": 43262 + }, + { + "epoch": 2.12, + "grad_norm": 0.716131865978241, + "learning_rate": 0.000118566310650407, + "loss": 2.95, + "step": 43263 + }, + { + "epoch": 2.12, + "grad_norm": 0.709511399269104, + "learning_rate": 0.00011855404946533395, + "loss": 3.1927, + "step": 43264 + }, + { + "epoch": 2.12, + "grad_norm": 0.7204448580741882, + "learning_rate": 0.00011854178875815446, + "loss": 2.855, + "step": 43265 + }, + { + "epoch": 2.12, + "grad_norm": 0.6846634745597839, + "learning_rate": 0.000118529528528901, + "loss": 2.9704, + "step": 43266 + }, + { + "epoch": 2.12, + "grad_norm": 0.6869163513183594, + "learning_rate": 0.00011851726877760594, + "loss": 2.8469, + "step": 43267 + }, + { + "epoch": 2.12, + "grad_norm": 0.7922883033752441, + "learning_rate": 0.00011850500950430138, + "loss": 2.7902, + "step": 43268 + }, + { + "epoch": 2.12, + "grad_norm": 0.672174870967865, + "learning_rate": 0.00011849275070901979, + "loss": 3.0636, + "step": 43269 + }, + { + "epoch": 2.12, + "grad_norm": 0.7000868916511536, + "learning_rate": 0.00011848049239179324, + "loss": 2.9595, + "step": 43270 + }, + { + "epoch": 2.12, + "grad_norm": 0.6964728236198425, + "learning_rate": 0.00011846823455265425, + "loss": 2.8609, + "step": 43271 + }, + { + "epoch": 2.12, + "grad_norm": 0.7069092392921448, + "learning_rate": 0.00011845597719163501, + "loss": 3.1638, + "step": 43272 + }, + { + "epoch": 2.12, + "grad_norm": 0.6975874304771423, + "learning_rate": 0.00011844372030876769, + "loss": 2.9583, + "step": 43273 + }, + { + "epoch": 2.12, + "grad_norm": 0.6885384917259216, + "learning_rate": 0.00011843146390408476, + "loss": 2.817, + "step": 43274 + }, + { + "epoch": 2.12, + "grad_norm": 0.7786584496498108, + "learning_rate": 0.00011841920797761829, + "loss": 2.8851, + "step": 43275 + }, + { + "epoch": 2.12, + "grad_norm": 0.6855915784835815, + "learning_rate": 0.0001184069525294007, + "loss": 2.7401, + "step": 43276 + }, + { + "epoch": 2.12, + "grad_norm": 0.6739755272865295, + "learning_rate": 0.00011839469755946432, + "loss": 2.9035, + "step": 43277 + }, + { + "epoch": 2.12, + "grad_norm": 0.6957394480705261, + "learning_rate": 0.0001183824430678414, + "loss": 3.0632, + "step": 43278 + }, + { + "epoch": 2.12, + "grad_norm": 0.6879928112030029, + "learning_rate": 0.00011837018905456415, + "loss": 2.8398, + "step": 43279 + }, + { + "epoch": 2.12, + "grad_norm": 0.701741635799408, + "learning_rate": 0.00011835793551966477, + "loss": 2.8844, + "step": 43280 + }, + { + "epoch": 2.12, + "grad_norm": 0.6822145581245422, + "learning_rate": 0.00011834568246317562, + "loss": 2.8215, + "step": 43281 + }, + { + "epoch": 2.12, + "grad_norm": 0.6779221296310425, + "learning_rate": 0.00011833342988512909, + "loss": 3.0876, + "step": 43282 + }, + { + "epoch": 2.12, + "grad_norm": 0.6709139347076416, + "learning_rate": 0.00011832117778555727, + "loss": 2.7494, + "step": 43283 + }, + { + "epoch": 2.12, + "grad_norm": 0.7509815096855164, + "learning_rate": 0.00011830892616449257, + "loss": 2.8125, + "step": 43284 + }, + { + "epoch": 2.12, + "grad_norm": 0.6872658133506775, + "learning_rate": 0.00011829667502196722, + "loss": 2.7154, + "step": 43285 + }, + { + "epoch": 2.12, + "grad_norm": 0.7320376634597778, + "learning_rate": 0.00011828442435801331, + "loss": 2.9323, + "step": 43286 + }, + { + "epoch": 2.12, + "grad_norm": 0.6770885586738586, + "learning_rate": 0.00011827217417266339, + "loss": 2.9723, + "step": 43287 + }, + { + "epoch": 2.12, + "grad_norm": 0.7534797191619873, + "learning_rate": 0.00011825992446594948, + "loss": 3.0001, + "step": 43288 + }, + { + "epoch": 2.12, + "grad_norm": 0.720881998538971, + "learning_rate": 0.00011824767523790406, + "loss": 2.8738, + "step": 43289 + }, + { + "epoch": 2.12, + "grad_norm": 0.6875128149986267, + "learning_rate": 0.00011823542648855916, + "loss": 3.1074, + "step": 43290 + }, + { + "epoch": 2.12, + "grad_norm": 0.7054754495620728, + "learning_rate": 0.00011822317821794718, + "loss": 2.8814, + "step": 43291 + }, + { + "epoch": 2.12, + "grad_norm": 0.647639811038971, + "learning_rate": 0.00011821093042610049, + "loss": 2.9831, + "step": 43292 + }, + { + "epoch": 2.12, + "grad_norm": 0.6805564165115356, + "learning_rate": 0.0001181986831130512, + "loss": 2.9677, + "step": 43293 + }, + { + "epoch": 2.12, + "grad_norm": 0.724385678768158, + "learning_rate": 0.0001181864362788316, + "loss": 2.9741, + "step": 43294 + }, + { + "epoch": 2.12, + "grad_norm": 0.6952561736106873, + "learning_rate": 0.00011817418992347381, + "loss": 2.7953, + "step": 43295 + }, + { + "epoch": 2.12, + "grad_norm": 0.7037989497184753, + "learning_rate": 0.00011816194404701024, + "loss": 2.5987, + "step": 43296 + }, + { + "epoch": 2.12, + "grad_norm": 0.6977366209030151, + "learning_rate": 0.0001181496986494732, + "loss": 2.6767, + "step": 43297 + }, + { + "epoch": 2.12, + "grad_norm": 0.6811829805374146, + "learning_rate": 0.00011813745373089473, + "loss": 3.0466, + "step": 43298 + }, + { + "epoch": 2.12, + "grad_norm": 0.6867378354072571, + "learning_rate": 0.00011812520929130736, + "loss": 3.0597, + "step": 43299 + }, + { + "epoch": 2.12, + "grad_norm": 0.6996347308158875, + "learning_rate": 0.00011811296533074315, + "loss": 2.8842, + "step": 43300 + }, + { + "epoch": 2.12, + "grad_norm": 0.661796510219574, + "learning_rate": 0.00011810072184923429, + "loss": 3.1053, + "step": 43301 + }, + { + "epoch": 2.12, + "grad_norm": 0.6767133474349976, + "learning_rate": 0.00011808847884681323, + "loss": 2.8145, + "step": 43302 + }, + { + "epoch": 2.12, + "grad_norm": 0.710540771484375, + "learning_rate": 0.00011807623632351201, + "loss": 2.9525, + "step": 43303 + }, + { + "epoch": 2.12, + "grad_norm": 0.6866672039031982, + "learning_rate": 0.00011806399427936308, + "loss": 2.9246, + "step": 43304 + }, + { + "epoch": 2.12, + "grad_norm": 0.6833668947219849, + "learning_rate": 0.00011805175271439846, + "loss": 3.2081, + "step": 43305 + }, + { + "epoch": 2.12, + "grad_norm": 0.6920410394668579, + "learning_rate": 0.00011803951162865048, + "loss": 3.0166, + "step": 43306 + }, + { + "epoch": 2.12, + "grad_norm": 0.7049962878227234, + "learning_rate": 0.00011802727102215155, + "loss": 2.7362, + "step": 43307 + }, + { + "epoch": 2.12, + "grad_norm": 0.727013349533081, + "learning_rate": 0.00011801503089493376, + "loss": 2.8944, + "step": 43308 + }, + { + "epoch": 2.12, + "grad_norm": 0.6701471209526062, + "learning_rate": 0.00011800279124702936, + "loss": 2.7099, + "step": 43309 + }, + { + "epoch": 2.12, + "grad_norm": 0.6716243624687195, + "learning_rate": 0.00011799055207847045, + "loss": 3.0203, + "step": 43310 + }, + { + "epoch": 2.12, + "grad_norm": 0.7197967171669006, + "learning_rate": 0.00011797831338928943, + "loss": 2.8381, + "step": 43311 + }, + { + "epoch": 2.12, + "grad_norm": 0.6998694539070129, + "learning_rate": 0.00011796607517951858, + "loss": 2.9622, + "step": 43312 + }, + { + "epoch": 2.12, + "grad_norm": 0.6807000041007996, + "learning_rate": 0.00011795383744918996, + "loss": 3.1586, + "step": 43313 + }, + { + "epoch": 2.12, + "grad_norm": 0.6802100539207458, + "learning_rate": 0.00011794160019833599, + "loss": 2.6665, + "step": 43314 + }, + { + "epoch": 2.12, + "grad_norm": 0.6740386486053467, + "learning_rate": 0.00011792936342698882, + "loss": 2.8575, + "step": 43315 + }, + { + "epoch": 2.12, + "grad_norm": 0.6891768574714661, + "learning_rate": 0.00011791712713518056, + "loss": 2.8772, + "step": 43316 + }, + { + "epoch": 2.12, + "grad_norm": 0.7108803391456604, + "learning_rate": 0.00011790489132294368, + "loss": 3.0195, + "step": 43317 + }, + { + "epoch": 2.12, + "grad_norm": 0.7088941335678101, + "learning_rate": 0.00011789265599031012, + "loss": 2.92, + "step": 43318 + }, + { + "epoch": 2.12, + "grad_norm": 0.6881635785102844, + "learning_rate": 0.00011788042113731239, + "loss": 2.8271, + "step": 43319 + }, + { + "epoch": 2.12, + "grad_norm": 0.6677702069282532, + "learning_rate": 0.00011786818676398243, + "loss": 3.0857, + "step": 43320 + }, + { + "epoch": 2.12, + "grad_norm": 0.7288718223571777, + "learning_rate": 0.00011785595287035276, + "loss": 3.0319, + "step": 43321 + }, + { + "epoch": 2.12, + "grad_norm": 0.6868347525596619, + "learning_rate": 0.00011784371945645546, + "loss": 2.5492, + "step": 43322 + }, + { + "epoch": 2.12, + "grad_norm": 0.6867601275444031, + "learning_rate": 0.00011783148652232264, + "loss": 2.7741, + "step": 43323 + }, + { + "epoch": 2.12, + "grad_norm": 0.715054988861084, + "learning_rate": 0.0001178192540679867, + "loss": 3.0581, + "step": 43324 + }, + { + "epoch": 2.12, + "grad_norm": 0.6956353187561035, + "learning_rate": 0.0001178070220934797, + "loss": 2.9861, + "step": 43325 + }, + { + "epoch": 2.12, + "grad_norm": 0.7079643607139587, + "learning_rate": 0.00011779479059883407, + "loss": 2.8541, + "step": 43326 + }, + { + "epoch": 2.12, + "grad_norm": 0.712270200252533, + "learning_rate": 0.00011778255958408175, + "loss": 2.7609, + "step": 43327 + }, + { + "epoch": 2.12, + "grad_norm": 0.6944162845611572, + "learning_rate": 0.00011777032904925512, + "loss": 2.9822, + "step": 43328 + }, + { + "epoch": 2.12, + "grad_norm": 0.6619276404380798, + "learning_rate": 0.00011775809899438645, + "loss": 2.9275, + "step": 43329 + }, + { + "epoch": 2.12, + "grad_norm": 0.7741801738739014, + "learning_rate": 0.0001177458694195079, + "loss": 2.7809, + "step": 43330 + }, + { + "epoch": 2.12, + "grad_norm": 0.7182871103286743, + "learning_rate": 0.00011773364032465167, + "loss": 2.8416, + "step": 43331 + }, + { + "epoch": 2.12, + "grad_norm": 0.6957319974899292, + "learning_rate": 0.00011772141170984984, + "loss": 2.8044, + "step": 43332 + }, + { + "epoch": 2.12, + "grad_norm": 0.7118658423423767, + "learning_rate": 0.0001177091835751347, + "loss": 2.9776, + "step": 43333 + }, + { + "epoch": 2.12, + "grad_norm": 0.7111032605171204, + "learning_rate": 0.00011769695592053861, + "loss": 2.8035, + "step": 43334 + }, + { + "epoch": 2.12, + "grad_norm": 0.7072035074234009, + "learning_rate": 0.00011768472874609355, + "loss": 2.9224, + "step": 43335 + }, + { + "epoch": 2.12, + "grad_norm": 0.6957293748855591, + "learning_rate": 0.00011767250205183195, + "loss": 2.8217, + "step": 43336 + }, + { + "epoch": 2.12, + "grad_norm": 0.6990187764167786, + "learning_rate": 0.00011766027583778586, + "loss": 3.1363, + "step": 43337 + }, + { + "epoch": 2.12, + "grad_norm": 0.6784077882766724, + "learning_rate": 0.00011764805010398743, + "loss": 2.9643, + "step": 43338 + }, + { + "epoch": 2.12, + "grad_norm": 0.6869450211524963, + "learning_rate": 0.00011763582485046902, + "loss": 2.9682, + "step": 43339 + }, + { + "epoch": 2.12, + "grad_norm": 0.7190410494804382, + "learning_rate": 0.00011762360007726266, + "loss": 2.8506, + "step": 43340 + }, + { + "epoch": 2.12, + "grad_norm": 0.667700469493866, + "learning_rate": 0.00011761137578440076, + "loss": 3.0683, + "step": 43341 + }, + { + "epoch": 2.12, + "grad_norm": 0.7749731540679932, + "learning_rate": 0.00011759915197191527, + "loss": 2.8538, + "step": 43342 + }, + { + "epoch": 2.12, + "grad_norm": 0.7492234706878662, + "learning_rate": 0.00011758692863983863, + "loss": 2.7344, + "step": 43343 + }, + { + "epoch": 2.12, + "grad_norm": 0.6971567273139954, + "learning_rate": 0.0001175747057882029, + "loss": 2.8228, + "step": 43344 + }, + { + "epoch": 2.12, + "grad_norm": 0.6927439570426941, + "learning_rate": 0.00011756248341704017, + "loss": 2.9262, + "step": 43345 + }, + { + "epoch": 2.12, + "grad_norm": 0.6876089572906494, + "learning_rate": 0.00011755026152638288, + "loss": 3.01, + "step": 43346 + }, + { + "epoch": 2.12, + "grad_norm": 0.7251725792884827, + "learning_rate": 0.00011753804011626297, + "loss": 2.8696, + "step": 43347 + }, + { + "epoch": 2.12, + "grad_norm": 0.6793904900550842, + "learning_rate": 0.00011752581918671276, + "loss": 3.1156, + "step": 43348 + }, + { + "epoch": 2.12, + "grad_norm": 0.7280372381210327, + "learning_rate": 0.00011751359873776453, + "loss": 3.0312, + "step": 43349 + }, + { + "epoch": 2.12, + "grad_norm": 0.7209813594818115, + "learning_rate": 0.00011750137876945021, + "loss": 3.1338, + "step": 43350 + }, + { + "epoch": 2.12, + "grad_norm": 0.7237643003463745, + "learning_rate": 0.00011748915928180228, + "loss": 2.7921, + "step": 43351 + }, + { + "epoch": 2.12, + "grad_norm": 0.7128032445907593, + "learning_rate": 0.0001174769402748528, + "loss": 2.8238, + "step": 43352 + }, + { + "epoch": 2.12, + "grad_norm": 0.694153368473053, + "learning_rate": 0.0001174647217486338, + "loss": 2.8686, + "step": 43353 + }, + { + "epoch": 2.12, + "grad_norm": 0.7112940549850464, + "learning_rate": 0.00011745250370317769, + "loss": 2.7379, + "step": 43354 + }, + { + "epoch": 2.12, + "grad_norm": 0.7519726753234863, + "learning_rate": 0.00011744028613851646, + "loss": 2.7854, + "step": 43355 + }, + { + "epoch": 2.12, + "grad_norm": 0.7124344706535339, + "learning_rate": 0.00011742806905468248, + "loss": 3.0715, + "step": 43356 + }, + { + "epoch": 2.12, + "grad_norm": 0.6832916736602783, + "learning_rate": 0.00011741585245170773, + "loss": 3.0614, + "step": 43357 + }, + { + "epoch": 2.12, + "grad_norm": 0.7317155599594116, + "learning_rate": 0.0001174036363296246, + "loss": 3.0663, + "step": 43358 + }, + { + "epoch": 2.12, + "grad_norm": 0.684130072593689, + "learning_rate": 0.00011739142068846512, + "loss": 2.7761, + "step": 43359 + }, + { + "epoch": 2.12, + "grad_norm": 0.7288141846656799, + "learning_rate": 0.00011737920552826142, + "loss": 2.9951, + "step": 43360 + }, + { + "epoch": 2.13, + "grad_norm": 0.7078750133514404, + "learning_rate": 0.00011736699084904584, + "loss": 2.8456, + "step": 43361 + }, + { + "epoch": 2.13, + "grad_norm": 0.6609907746315002, + "learning_rate": 0.00011735477665085035, + "loss": 2.8533, + "step": 43362 + }, + { + "epoch": 2.13, + "grad_norm": 0.6616264581680298, + "learning_rate": 0.00011734256293370722, + "loss": 2.7806, + "step": 43363 + }, + { + "epoch": 2.13, + "grad_norm": 0.7512163519859314, + "learning_rate": 0.00011733034969764876, + "loss": 2.9013, + "step": 43364 + }, + { + "epoch": 2.13, + "grad_norm": 0.6941211819648743, + "learning_rate": 0.00011731813694270699, + "loss": 2.8684, + "step": 43365 + }, + { + "epoch": 2.13, + "grad_norm": 0.6857606768608093, + "learning_rate": 0.00011730592466891406, + "loss": 2.8198, + "step": 43366 + }, + { + "epoch": 2.13, + "grad_norm": 0.6707267165184021, + "learning_rate": 0.0001172937128763021, + "loss": 2.9227, + "step": 43367 + }, + { + "epoch": 2.13, + "grad_norm": 0.6939967274665833, + "learning_rate": 0.00011728150156490332, + "loss": 2.8507, + "step": 43368 + }, + { + "epoch": 2.13, + "grad_norm": 0.7025797367095947, + "learning_rate": 0.00011726929073475, + "loss": 2.8585, + "step": 43369 + }, + { + "epoch": 2.13, + "grad_norm": 0.7331152558326721, + "learning_rate": 0.0001172570803858741, + "loss": 3.0419, + "step": 43370 + }, + { + "epoch": 2.13, + "grad_norm": 0.669756293296814, + "learning_rate": 0.000117244870518308, + "loss": 2.9938, + "step": 43371 + }, + { + "epoch": 2.13, + "grad_norm": 0.6814187169075012, + "learning_rate": 0.00011723266113208363, + "loss": 2.9681, + "step": 43372 + }, + { + "epoch": 2.13, + "grad_norm": 0.6948679089546204, + "learning_rate": 0.00011722045222723333, + "loss": 3.1171, + "step": 43373 + }, + { + "epoch": 2.13, + "grad_norm": 0.6597676277160645, + "learning_rate": 0.00011720824380378922, + "loss": 2.6783, + "step": 43374 + }, + { + "epoch": 2.13, + "grad_norm": 0.6691949367523193, + "learning_rate": 0.00011719603586178329, + "loss": 2.845, + "step": 43375 + }, + { + "epoch": 2.13, + "grad_norm": 0.6968749165534973, + "learning_rate": 0.00011718382840124793, + "loss": 3.0667, + "step": 43376 + }, + { + "epoch": 2.13, + "grad_norm": 0.7081393599510193, + "learning_rate": 0.0001171716214222151, + "loss": 2.9093, + "step": 43377 + }, + { + "epoch": 2.13, + "grad_norm": 0.677230954170227, + "learning_rate": 0.00011715941492471702, + "loss": 2.9554, + "step": 43378 + }, + { + "epoch": 2.13, + "grad_norm": 0.6978651285171509, + "learning_rate": 0.00011714720890878592, + "loss": 2.8337, + "step": 43379 + }, + { + "epoch": 2.13, + "grad_norm": 0.7071506381034851, + "learning_rate": 0.00011713500337445392, + "loss": 3.0446, + "step": 43380 + }, + { + "epoch": 2.13, + "grad_norm": 0.7174202799797058, + "learning_rate": 0.00011712279832175313, + "loss": 2.9749, + "step": 43381 + }, + { + "epoch": 2.13, + "grad_norm": 0.7007304430007935, + "learning_rate": 0.00011711059375071558, + "loss": 3.0169, + "step": 43382 + }, + { + "epoch": 2.13, + "grad_norm": 0.7151933908462524, + "learning_rate": 0.00011709838966137354, + "loss": 3.0288, + "step": 43383 + }, + { + "epoch": 2.13, + "grad_norm": 0.7130842208862305, + "learning_rate": 0.00011708618605375923, + "loss": 3.0052, + "step": 43384 + }, + { + "epoch": 2.13, + "grad_norm": 0.7058139443397522, + "learning_rate": 0.0001170739829279046, + "loss": 2.9758, + "step": 43385 + }, + { + "epoch": 2.13, + "grad_norm": 0.6666308641433716, + "learning_rate": 0.000117061780283842, + "loss": 2.9686, + "step": 43386 + }, + { + "epoch": 2.13, + "grad_norm": 0.7057749032974243, + "learning_rate": 0.00011704957812160347, + "loss": 2.9458, + "step": 43387 + }, + { + "epoch": 2.13, + "grad_norm": 0.6779221892356873, + "learning_rate": 0.00011703737644122103, + "loss": 2.8908, + "step": 43388 + }, + { + "epoch": 2.13, + "grad_norm": 0.690578818321228, + "learning_rate": 0.00011702517524272703, + "loss": 3.1712, + "step": 43389 + }, + { + "epoch": 2.13, + "grad_norm": 0.6773284077644348, + "learning_rate": 0.00011701297452615342, + "loss": 2.9113, + "step": 43390 + }, + { + "epoch": 2.13, + "grad_norm": 0.700973629951477, + "learning_rate": 0.00011700077429153249, + "loss": 2.9496, + "step": 43391 + }, + { + "epoch": 2.13, + "grad_norm": 0.6661964654922485, + "learning_rate": 0.00011698857453889621, + "loss": 2.9452, + "step": 43392 + }, + { + "epoch": 2.13, + "grad_norm": 0.6915501952171326, + "learning_rate": 0.00011697637526827682, + "loss": 2.8738, + "step": 43393 + }, + { + "epoch": 2.13, + "grad_norm": 0.6811676621437073, + "learning_rate": 0.00011696417647970654, + "loss": 3.0137, + "step": 43394 + }, + { + "epoch": 2.13, + "grad_norm": 0.6550053358078003, + "learning_rate": 0.00011695197817321736, + "loss": 2.9058, + "step": 43395 + }, + { + "epoch": 2.13, + "grad_norm": 0.7086590528488159, + "learning_rate": 0.00011693978034884147, + "loss": 2.7958, + "step": 43396 + }, + { + "epoch": 2.13, + "grad_norm": 0.6640387177467346, + "learning_rate": 0.00011692758300661087, + "loss": 2.9049, + "step": 43397 + }, + { + "epoch": 2.13, + "grad_norm": 0.7309964299201965, + "learning_rate": 0.00011691538614655786, + "loss": 2.6874, + "step": 43398 + }, + { + "epoch": 2.13, + "grad_norm": 0.7164502143859863, + "learning_rate": 0.0001169031897687144, + "loss": 3.0238, + "step": 43399 + }, + { + "epoch": 2.13, + "grad_norm": 0.6886323094367981, + "learning_rate": 0.00011689099387311268, + "loss": 3.0394, + "step": 43400 + }, + { + "epoch": 2.13, + "grad_norm": 0.6922336220741272, + "learning_rate": 0.00011687879845978499, + "loss": 2.758, + "step": 43401 + }, + { + "epoch": 2.13, + "grad_norm": 0.6996184587478638, + "learning_rate": 0.00011686660352876325, + "loss": 2.8422, + "step": 43402 + }, + { + "epoch": 2.13, + "grad_norm": 0.6815118789672852, + "learning_rate": 0.00011685440908007968, + "loss": 2.984, + "step": 43403 + }, + { + "epoch": 2.13, + "grad_norm": 0.7413270473480225, + "learning_rate": 0.00011684221511376619, + "loss": 2.9322, + "step": 43404 + }, + { + "epoch": 2.13, + "grad_norm": 0.6742251515388489, + "learning_rate": 0.00011683002162985512, + "loss": 3.0762, + "step": 43405 + }, + { + "epoch": 2.13, + "grad_norm": 0.6736028790473938, + "learning_rate": 0.00011681782862837858, + "loss": 2.8401, + "step": 43406 + }, + { + "epoch": 2.13, + "grad_norm": 0.6667304635047913, + "learning_rate": 0.00011680563610936853, + "loss": 2.9525, + "step": 43407 + }, + { + "epoch": 2.13, + "grad_norm": 0.7072355151176453, + "learning_rate": 0.0001167934440728573, + "loss": 2.7609, + "step": 43408 + }, + { + "epoch": 2.13, + "grad_norm": 0.6967011094093323, + "learning_rate": 0.00011678125251887677, + "loss": 2.8879, + "step": 43409 + }, + { + "epoch": 2.13, + "grad_norm": 0.6703258156776428, + "learning_rate": 0.00011676906144745923, + "loss": 3.1206, + "step": 43410 + }, + { + "epoch": 2.13, + "grad_norm": 0.7139511108398438, + "learning_rate": 0.00011675687085863677, + "loss": 2.9037, + "step": 43411 + }, + { + "epoch": 2.13, + "grad_norm": 0.7220322489738464, + "learning_rate": 0.0001167446807524413, + "loss": 2.8709, + "step": 43412 + }, + { + "epoch": 2.13, + "grad_norm": 0.6931965351104736, + "learning_rate": 0.00011673249112890518, + "loss": 3.021, + "step": 43413 + }, + { + "epoch": 2.13, + "grad_norm": 0.7006539106369019, + "learning_rate": 0.00011672030198806031, + "loss": 3.0521, + "step": 43414 + }, + { + "epoch": 2.13, + "grad_norm": 0.7006272673606873, + "learning_rate": 0.0001167081133299389, + "loss": 2.8595, + "step": 43415 + }, + { + "epoch": 2.13, + "grad_norm": 0.7924833297729492, + "learning_rate": 0.00011669592515457318, + "loss": 2.7673, + "step": 43416 + }, + { + "epoch": 2.13, + "grad_norm": 0.683752179145813, + "learning_rate": 0.00011668373746199508, + "loss": 2.8574, + "step": 43417 + }, + { + "epoch": 2.13, + "grad_norm": 0.7187227010726929, + "learning_rate": 0.00011667155025223674, + "loss": 2.9725, + "step": 43418 + }, + { + "epoch": 2.13, + "grad_norm": 0.6991389393806458, + "learning_rate": 0.00011665936352533013, + "loss": 3.0743, + "step": 43419 + }, + { + "epoch": 2.13, + "grad_norm": 0.7020309567451477, + "learning_rate": 0.00011664717728130751, + "loss": 2.9786, + "step": 43420 + }, + { + "epoch": 2.13, + "grad_norm": 0.6862619519233704, + "learning_rate": 0.00011663499152020101, + "loss": 2.9249, + "step": 43421 + }, + { + "epoch": 2.13, + "grad_norm": 0.6912420392036438, + "learning_rate": 0.00011662280624204258, + "loss": 2.787, + "step": 43422 + }, + { + "epoch": 2.13, + "grad_norm": 0.7102240920066833, + "learning_rate": 0.00011661062144686448, + "loss": 2.9573, + "step": 43423 + }, + { + "epoch": 2.13, + "grad_norm": 0.6816980838775635, + "learning_rate": 0.0001165984371346987, + "loss": 2.8612, + "step": 43424 + }, + { + "epoch": 2.13, + "grad_norm": 0.6912957429885864, + "learning_rate": 0.00011658625330557722, + "loss": 2.8478, + "step": 43425 + }, + { + "epoch": 2.13, + "grad_norm": 0.690959095954895, + "learning_rate": 0.00011657406995953237, + "loss": 2.9166, + "step": 43426 + }, + { + "epoch": 2.13, + "grad_norm": 0.7246711254119873, + "learning_rate": 0.00011656188709659603, + "loss": 2.7915, + "step": 43427 + }, + { + "epoch": 2.13, + "grad_norm": 0.6712881326675415, + "learning_rate": 0.00011654970471680046, + "loss": 3.0455, + "step": 43428 + }, + { + "epoch": 2.13, + "grad_norm": 0.7149039506912231, + "learning_rate": 0.00011653752282017754, + "loss": 2.949, + "step": 43429 + }, + { + "epoch": 2.13, + "grad_norm": 0.7015359401702881, + "learning_rate": 0.00011652534140675951, + "loss": 2.9687, + "step": 43430 + }, + { + "epoch": 2.13, + "grad_norm": 0.694905161857605, + "learning_rate": 0.00011651316047657853, + "loss": 2.9374, + "step": 43431 + }, + { + "epoch": 2.13, + "grad_norm": 0.6754507422447205, + "learning_rate": 0.00011650098002966656, + "loss": 3.0613, + "step": 43432 + }, + { + "epoch": 2.13, + "grad_norm": 0.695451021194458, + "learning_rate": 0.00011648880006605567, + "loss": 3.0479, + "step": 43433 + }, + { + "epoch": 2.13, + "grad_norm": 0.7183724045753479, + "learning_rate": 0.00011647662058577789, + "loss": 2.8461, + "step": 43434 + }, + { + "epoch": 2.13, + "grad_norm": 0.6836041808128357, + "learning_rate": 0.00011646444158886538, + "loss": 2.9376, + "step": 43435 + }, + { + "epoch": 2.13, + "grad_norm": 0.6931686401367188, + "learning_rate": 0.00011645226307535027, + "loss": 3.1012, + "step": 43436 + }, + { + "epoch": 2.13, + "grad_norm": 0.682621419429779, + "learning_rate": 0.00011644008504526452, + "loss": 3.041, + "step": 43437 + }, + { + "epoch": 2.13, + "grad_norm": 0.685973584651947, + "learning_rate": 0.00011642790749864034, + "loss": 2.728, + "step": 43438 + }, + { + "epoch": 2.13, + "grad_norm": 0.737464427947998, + "learning_rate": 0.0001164157304355097, + "loss": 2.8201, + "step": 43439 + }, + { + "epoch": 2.13, + "grad_norm": 0.6804065108299255, + "learning_rate": 0.00011640355385590462, + "loss": 2.7606, + "step": 43440 + }, + { + "epoch": 2.13, + "grad_norm": 0.7319862842559814, + "learning_rate": 0.00011639137775985736, + "loss": 2.9008, + "step": 43441 + }, + { + "epoch": 2.13, + "grad_norm": 0.6785258650779724, + "learning_rate": 0.00011637920214739974, + "loss": 2.857, + "step": 43442 + }, + { + "epoch": 2.13, + "grad_norm": 0.7257917523384094, + "learning_rate": 0.0001163670270185641, + "loss": 2.9225, + "step": 43443 + }, + { + "epoch": 2.13, + "grad_norm": 0.7215557098388672, + "learning_rate": 0.00011635485237338224, + "loss": 3.1464, + "step": 43444 + }, + { + "epoch": 2.13, + "grad_norm": 0.7050979733467102, + "learning_rate": 0.00011634267821188646, + "loss": 2.9606, + "step": 43445 + }, + { + "epoch": 2.13, + "grad_norm": 0.6802178025245667, + "learning_rate": 0.00011633050453410873, + "loss": 2.981, + "step": 43446 + }, + { + "epoch": 2.13, + "grad_norm": 0.7454336285591125, + "learning_rate": 0.00011631833134008099, + "loss": 2.9528, + "step": 43447 + }, + { + "epoch": 2.13, + "grad_norm": 0.6547221541404724, + "learning_rate": 0.00011630615862983551, + "loss": 2.689, + "step": 43448 + }, + { + "epoch": 2.13, + "grad_norm": 0.7083719372749329, + "learning_rate": 0.00011629398640340418, + "loss": 2.882, + "step": 43449 + }, + { + "epoch": 2.13, + "grad_norm": 0.72013258934021, + "learning_rate": 0.00011628181466081912, + "loss": 2.7815, + "step": 43450 + }, + { + "epoch": 2.13, + "grad_norm": 0.6839480400085449, + "learning_rate": 0.00011626964340211249, + "loss": 2.9452, + "step": 43451 + }, + { + "epoch": 2.13, + "grad_norm": 0.7362170815467834, + "learning_rate": 0.00011625747262731617, + "loss": 2.8755, + "step": 43452 + }, + { + "epoch": 2.13, + "grad_norm": 0.6892048716545105, + "learning_rate": 0.00011624530233646241, + "loss": 3.0673, + "step": 43453 + }, + { + "epoch": 2.13, + "grad_norm": 0.7359662652015686, + "learning_rate": 0.00011623313252958317, + "loss": 3.0499, + "step": 43454 + }, + { + "epoch": 2.13, + "grad_norm": 0.7010781168937683, + "learning_rate": 0.00011622096320671038, + "loss": 2.8973, + "step": 43455 + }, + { + "epoch": 2.13, + "grad_norm": 0.6811627745628357, + "learning_rate": 0.00011620879436787631, + "loss": 2.8123, + "step": 43456 + }, + { + "epoch": 2.13, + "grad_norm": 0.7134902477264404, + "learning_rate": 0.0001161966260131128, + "loss": 2.972, + "step": 43457 + }, + { + "epoch": 2.13, + "grad_norm": 0.7200930714607239, + "learning_rate": 0.00011618445814245211, + "loss": 2.9097, + "step": 43458 + }, + { + "epoch": 2.13, + "grad_norm": 0.7086115479469299, + "learning_rate": 0.00011617229075592606, + "loss": 3.1063, + "step": 43459 + }, + { + "epoch": 2.13, + "grad_norm": 0.7229481935501099, + "learning_rate": 0.00011616012385356693, + "loss": 3.1506, + "step": 43460 + }, + { + "epoch": 2.13, + "grad_norm": 0.6957486271858215, + "learning_rate": 0.00011614795743540666, + "loss": 2.677, + "step": 43461 + }, + { + "epoch": 2.13, + "grad_norm": 0.7093002200126648, + "learning_rate": 0.00011613579150147718, + "loss": 2.6256, + "step": 43462 + }, + { + "epoch": 2.13, + "grad_norm": 0.7090989351272583, + "learning_rate": 0.00011612362605181073, + "loss": 2.9412, + "step": 43463 + }, + { + "epoch": 2.13, + "grad_norm": 0.7614361047744751, + "learning_rate": 0.0001161114610864392, + "loss": 2.6719, + "step": 43464 + }, + { + "epoch": 2.13, + "grad_norm": 0.7168323993682861, + "learning_rate": 0.00011609929660539467, + "loss": 2.8601, + "step": 43465 + }, + { + "epoch": 2.13, + "grad_norm": 0.7068440914154053, + "learning_rate": 0.0001160871326087093, + "loss": 2.883, + "step": 43466 + }, + { + "epoch": 2.13, + "grad_norm": 0.7084940671920776, + "learning_rate": 0.000116074969096415, + "loss": 2.763, + "step": 43467 + }, + { + "epoch": 2.13, + "grad_norm": 0.6470160484313965, + "learning_rate": 0.00011606280606854385, + "loss": 2.9537, + "step": 43468 + }, + { + "epoch": 2.13, + "grad_norm": 0.7235172986984253, + "learning_rate": 0.00011605064352512779, + "loss": 3.1622, + "step": 43469 + }, + { + "epoch": 2.13, + "grad_norm": 0.7104700207710266, + "learning_rate": 0.00011603848146619893, + "loss": 2.9741, + "step": 43470 + }, + { + "epoch": 2.13, + "grad_norm": 0.7352017164230347, + "learning_rate": 0.0001160263198917894, + "loss": 2.8902, + "step": 43471 + }, + { + "epoch": 2.13, + "grad_norm": 0.6944370269775391, + "learning_rate": 0.000116014158801931, + "loss": 2.685, + "step": 43472 + }, + { + "epoch": 2.13, + "grad_norm": 0.6649462580680847, + "learning_rate": 0.00011600199819665605, + "loss": 2.8065, + "step": 43473 + }, + { + "epoch": 2.13, + "grad_norm": 0.7043724060058594, + "learning_rate": 0.0001159898380759963, + "loss": 2.8261, + "step": 43474 + }, + { + "epoch": 2.13, + "grad_norm": 0.7225465178489685, + "learning_rate": 0.000115977678439984, + "loss": 3.0143, + "step": 43475 + }, + { + "epoch": 2.13, + "grad_norm": 0.7245696187019348, + "learning_rate": 0.00011596551928865109, + "loss": 2.8686, + "step": 43476 + }, + { + "epoch": 2.13, + "grad_norm": 0.7076426148414612, + "learning_rate": 0.0001159533606220295, + "loss": 2.9664, + "step": 43477 + }, + { + "epoch": 2.13, + "grad_norm": 0.7165865302085876, + "learning_rate": 0.00011594120244015143, + "loss": 2.9488, + "step": 43478 + }, + { + "epoch": 2.13, + "grad_norm": 0.7248859405517578, + "learning_rate": 0.00011592904474304867, + "loss": 2.8313, + "step": 43479 + }, + { + "epoch": 2.13, + "grad_norm": 0.6996978521347046, + "learning_rate": 0.00011591688753075351, + "loss": 2.9924, + "step": 43480 + }, + { + "epoch": 2.13, + "grad_norm": 0.7273635268211365, + "learning_rate": 0.00011590473080329775, + "loss": 3.0422, + "step": 43481 + }, + { + "epoch": 2.13, + "grad_norm": 0.6897189021110535, + "learning_rate": 0.0001158925745607136, + "loss": 2.9329, + "step": 43482 + }, + { + "epoch": 2.13, + "grad_norm": 0.717872142791748, + "learning_rate": 0.00011588041880303297, + "loss": 3.0231, + "step": 43483 + }, + { + "epoch": 2.13, + "grad_norm": 0.717965304851532, + "learning_rate": 0.00011586826353028777, + "loss": 2.9274, + "step": 43484 + }, + { + "epoch": 2.13, + "grad_norm": 0.7113906741142273, + "learning_rate": 0.00011585610874251022, + "loss": 2.8744, + "step": 43485 + }, + { + "epoch": 2.13, + "grad_norm": 0.6816174387931824, + "learning_rate": 0.00011584395443973216, + "loss": 3.0101, + "step": 43486 + }, + { + "epoch": 2.13, + "grad_norm": 0.6823785305023193, + "learning_rate": 0.00011583180062198567, + "loss": 2.9838, + "step": 43487 + }, + { + "epoch": 2.13, + "grad_norm": 0.7035296559333801, + "learning_rate": 0.00011581964728930291, + "loss": 2.6212, + "step": 43488 + }, + { + "epoch": 2.13, + "grad_norm": 0.699919581413269, + "learning_rate": 0.0001158074944417157, + "loss": 2.8144, + "step": 43489 + }, + { + "epoch": 2.13, + "grad_norm": 0.7117913961410522, + "learning_rate": 0.00011579534207925614, + "loss": 2.9497, + "step": 43490 + }, + { + "epoch": 2.13, + "grad_norm": 0.7389628887176514, + "learning_rate": 0.00011578319020195607, + "loss": 2.7566, + "step": 43491 + }, + { + "epoch": 2.13, + "grad_norm": 0.6526266932487488, + "learning_rate": 0.00011577103880984764, + "loss": 2.7454, + "step": 43492 + }, + { + "epoch": 2.13, + "grad_norm": 0.6850768327713013, + "learning_rate": 0.00011575888790296294, + "loss": 2.7901, + "step": 43493 + }, + { + "epoch": 2.13, + "grad_norm": 0.70717853307724, + "learning_rate": 0.00011574673748133377, + "loss": 2.8612, + "step": 43494 + }, + { + "epoch": 2.13, + "grad_norm": 0.6925551295280457, + "learning_rate": 0.00011573458754499231, + "loss": 2.8711, + "step": 43495 + }, + { + "epoch": 2.13, + "grad_norm": 0.6883793473243713, + "learning_rate": 0.00011572243809397042, + "loss": 2.872, + "step": 43496 + }, + { + "epoch": 2.13, + "grad_norm": 0.6649243235588074, + "learning_rate": 0.00011571028912830027, + "loss": 2.8072, + "step": 43497 + }, + { + "epoch": 2.13, + "grad_norm": 0.7801764011383057, + "learning_rate": 0.00011569814064801373, + "loss": 2.809, + "step": 43498 + }, + { + "epoch": 2.13, + "grad_norm": 0.6581512093544006, + "learning_rate": 0.0001156859926531427, + "loss": 3.2304, + "step": 43499 + }, + { + "epoch": 2.13, + "grad_norm": 0.6470869779586792, + "learning_rate": 0.0001156738451437194, + "loss": 3.0452, + "step": 43500 + }, + { + "epoch": 2.13, + "grad_norm": 0.6938720345497131, + "learning_rate": 0.00011566169811977564, + "loss": 3.0109, + "step": 43501 + }, + { + "epoch": 2.13, + "grad_norm": 0.7173318266868591, + "learning_rate": 0.00011564955158134347, + "loss": 2.6295, + "step": 43502 + }, + { + "epoch": 2.13, + "grad_norm": 0.6866092085838318, + "learning_rate": 0.000115637405528455, + "loss": 2.907, + "step": 43503 + }, + { + "epoch": 2.13, + "grad_norm": 0.6871329545974731, + "learning_rate": 0.00011562525996114216, + "loss": 3.1121, + "step": 43504 + }, + { + "epoch": 2.13, + "grad_norm": 0.7441400289535522, + "learning_rate": 0.00011561311487943686, + "loss": 2.8122, + "step": 43505 + }, + { + "epoch": 2.13, + "grad_norm": 0.7241978645324707, + "learning_rate": 0.00011560097028337107, + "loss": 2.8481, + "step": 43506 + }, + { + "epoch": 2.13, + "grad_norm": 0.7289275527000427, + "learning_rate": 0.0001155888261729768, + "loss": 2.8336, + "step": 43507 + }, + { + "epoch": 2.13, + "grad_norm": 0.6423391103744507, + "learning_rate": 0.00011557668254828618, + "loss": 2.8059, + "step": 43508 + }, + { + "epoch": 2.13, + "grad_norm": 0.7227512001991272, + "learning_rate": 0.00011556453940933098, + "loss": 2.8517, + "step": 43509 + }, + { + "epoch": 2.13, + "grad_norm": 0.6960424780845642, + "learning_rate": 0.00011555239675614337, + "loss": 2.8775, + "step": 43510 + }, + { + "epoch": 2.13, + "grad_norm": 0.7161725163459778, + "learning_rate": 0.0001155402545887553, + "loss": 2.7589, + "step": 43511 + }, + { + "epoch": 2.13, + "grad_norm": 0.7142117619514465, + "learning_rate": 0.00011552811290719856, + "loss": 3.1114, + "step": 43512 + }, + { + "epoch": 2.13, + "grad_norm": 0.7251697182655334, + "learning_rate": 0.00011551597171150539, + "loss": 2.9898, + "step": 43513 + }, + { + "epoch": 2.13, + "grad_norm": 0.6893921494483948, + "learning_rate": 0.00011550383100170752, + "loss": 2.9428, + "step": 43514 + }, + { + "epoch": 2.13, + "grad_norm": 0.6587451100349426, + "learning_rate": 0.0001154916907778372, + "loss": 2.6993, + "step": 43515 + }, + { + "epoch": 2.13, + "grad_norm": 0.6924602389335632, + "learning_rate": 0.00011547955103992611, + "loss": 2.8499, + "step": 43516 + }, + { + "epoch": 2.13, + "grad_norm": 0.6500839591026306, + "learning_rate": 0.00011546741178800637, + "loss": 3.2, + "step": 43517 + }, + { + "epoch": 2.13, + "grad_norm": 0.7085121273994446, + "learning_rate": 0.00011545527302211009, + "loss": 2.839, + "step": 43518 + }, + { + "epoch": 2.13, + "grad_norm": 0.6867219805717468, + "learning_rate": 0.0001154431347422691, + "loss": 2.7854, + "step": 43519 + }, + { + "epoch": 2.13, + "grad_norm": 0.7688367366790771, + "learning_rate": 0.00011543099694851534, + "loss": 2.9432, + "step": 43520 + }, + { + "epoch": 2.13, + "grad_norm": 0.6721521019935608, + "learning_rate": 0.00011541885964088074, + "loss": 2.9551, + "step": 43521 + }, + { + "epoch": 2.13, + "grad_norm": 0.6870953440666199, + "learning_rate": 0.00011540672281939731, + "loss": 2.9556, + "step": 43522 + }, + { + "epoch": 2.13, + "grad_norm": 0.7174733281135559, + "learning_rate": 0.00011539458648409718, + "loss": 2.8465, + "step": 43523 + }, + { + "epoch": 2.13, + "grad_norm": 0.6979454159736633, + "learning_rate": 0.00011538245063501205, + "loss": 2.8541, + "step": 43524 + }, + { + "epoch": 2.13, + "grad_norm": 0.7371723651885986, + "learning_rate": 0.00011537031527217417, + "loss": 3.0364, + "step": 43525 + }, + { + "epoch": 2.13, + "grad_norm": 0.6876466274261475, + "learning_rate": 0.0001153581803956153, + "loss": 2.9516, + "step": 43526 + }, + { + "epoch": 2.13, + "grad_norm": 0.7251150608062744, + "learning_rate": 0.00011534604600536737, + "loss": 2.7975, + "step": 43527 + }, + { + "epoch": 2.13, + "grad_norm": 0.7008710503578186, + "learning_rate": 0.00011533391210146249, + "loss": 2.9521, + "step": 43528 + }, + { + "epoch": 2.13, + "grad_norm": 0.690662145614624, + "learning_rate": 0.00011532177868393248, + "loss": 2.7882, + "step": 43529 + }, + { + "epoch": 2.13, + "grad_norm": 0.6789386868476868, + "learning_rate": 0.00011530964575280944, + "loss": 3.1341, + "step": 43530 + }, + { + "epoch": 2.13, + "grad_norm": 0.6649258732795715, + "learning_rate": 0.00011529751330812516, + "loss": 2.8537, + "step": 43531 + }, + { + "epoch": 2.13, + "grad_norm": 0.7068399786949158, + "learning_rate": 0.00011528538134991167, + "loss": 2.5939, + "step": 43532 + }, + { + "epoch": 2.13, + "grad_norm": 0.7076483964920044, + "learning_rate": 0.00011527324987820104, + "loss": 3.0746, + "step": 43533 + }, + { + "epoch": 2.13, + "grad_norm": 0.6737373471260071, + "learning_rate": 0.00011526111889302511, + "loss": 3.0245, + "step": 43534 + }, + { + "epoch": 2.13, + "grad_norm": 0.7310186624526978, + "learning_rate": 0.00011524898839441589, + "loss": 2.9546, + "step": 43535 + }, + { + "epoch": 2.13, + "grad_norm": 0.6874125003814697, + "learning_rate": 0.00011523685838240512, + "loss": 2.9512, + "step": 43536 + }, + { + "epoch": 2.13, + "grad_norm": 0.7353755831718445, + "learning_rate": 0.00011522472885702491, + "loss": 2.8711, + "step": 43537 + }, + { + "epoch": 2.13, + "grad_norm": 0.6856021285057068, + "learning_rate": 0.00011521259981830732, + "loss": 2.8758, + "step": 43538 + }, + { + "epoch": 2.13, + "grad_norm": 0.7210877537727356, + "learning_rate": 0.00011520047126628409, + "loss": 3.0204, + "step": 43539 + }, + { + "epoch": 2.13, + "grad_norm": 0.6792193055152893, + "learning_rate": 0.00011518834320098735, + "loss": 2.9862, + "step": 43540 + }, + { + "epoch": 2.13, + "grad_norm": 0.67461097240448, + "learning_rate": 0.00011517621562244893, + "loss": 2.9252, + "step": 43541 + }, + { + "epoch": 2.13, + "grad_norm": 0.6909298896789551, + "learning_rate": 0.0001151640885307007, + "loss": 2.9947, + "step": 43542 + }, + { + "epoch": 2.13, + "grad_norm": 0.695124626159668, + "learning_rate": 0.0001151519619257748, + "loss": 2.9083, + "step": 43543 + }, + { + "epoch": 2.13, + "grad_norm": 0.6613640189170837, + "learning_rate": 0.00011513983580770294, + "loss": 2.9106, + "step": 43544 + }, + { + "epoch": 2.13, + "grad_norm": 0.6796993017196655, + "learning_rate": 0.00011512771017651727, + "loss": 2.9502, + "step": 43545 + }, + { + "epoch": 2.13, + "grad_norm": 0.6530843377113342, + "learning_rate": 0.00011511558503224954, + "loss": 2.8476, + "step": 43546 + }, + { + "epoch": 2.13, + "grad_norm": 0.7151798605918884, + "learning_rate": 0.0001151034603749319, + "loss": 2.8159, + "step": 43547 + }, + { + "epoch": 2.13, + "grad_norm": 0.7040915489196777, + "learning_rate": 0.00011509133620459615, + "loss": 3.0598, + "step": 43548 + }, + { + "epoch": 2.13, + "grad_norm": 0.7070721983909607, + "learning_rate": 0.00011507921252127411, + "loss": 2.9802, + "step": 43549 + }, + { + "epoch": 2.13, + "grad_norm": 0.6628305315971375, + "learning_rate": 0.00011506708932499795, + "loss": 3.0236, + "step": 43550 + }, + { + "epoch": 2.13, + "grad_norm": 0.6968042850494385, + "learning_rate": 0.00011505496661579937, + "loss": 2.8769, + "step": 43551 + }, + { + "epoch": 2.13, + "grad_norm": 0.7575791478157043, + "learning_rate": 0.00011504284439371044, + "loss": 2.7383, + "step": 43552 + }, + { + "epoch": 2.13, + "grad_norm": 0.6684389114379883, + "learning_rate": 0.00011503072265876314, + "loss": 3.0318, + "step": 43553 + }, + { + "epoch": 2.13, + "grad_norm": 0.67277991771698, + "learning_rate": 0.00011501860141098924, + "loss": 2.9531, + "step": 43554 + }, + { + "epoch": 2.13, + "grad_norm": 0.6650310754776001, + "learning_rate": 0.00011500648065042082, + "loss": 3.0134, + "step": 43555 + }, + { + "epoch": 2.13, + "grad_norm": 0.7093574404716492, + "learning_rate": 0.00011499436037708975, + "loss": 2.7774, + "step": 43556 + }, + { + "epoch": 2.13, + "grad_norm": 0.701271653175354, + "learning_rate": 0.00011498224059102793, + "loss": 2.8878, + "step": 43557 + }, + { + "epoch": 2.13, + "grad_norm": 0.6936562657356262, + "learning_rate": 0.00011497012129226714, + "loss": 2.7935, + "step": 43558 + }, + { + "epoch": 2.13, + "grad_norm": 0.6467651724815369, + "learning_rate": 0.00011495800248083947, + "loss": 2.8307, + "step": 43559 + }, + { + "epoch": 2.13, + "grad_norm": 0.6694419384002686, + "learning_rate": 0.00011494588415677694, + "loss": 2.8483, + "step": 43560 + }, + { + "epoch": 2.13, + "grad_norm": 0.7048700451850891, + "learning_rate": 0.00011493376632011121, + "loss": 2.719, + "step": 43561 + }, + { + "epoch": 2.13, + "grad_norm": 0.6826921701431274, + "learning_rate": 0.0001149216489708744, + "loss": 2.984, + "step": 43562 + }, + { + "epoch": 2.13, + "grad_norm": 0.7334887385368347, + "learning_rate": 0.00011490953210909837, + "loss": 3.0855, + "step": 43563 + }, + { + "epoch": 2.13, + "grad_norm": 0.7349691987037659, + "learning_rate": 0.00011489741573481491, + "loss": 2.7208, + "step": 43564 + }, + { + "epoch": 2.14, + "grad_norm": 0.7117055058479309, + "learning_rate": 0.00011488529984805614, + "loss": 2.8431, + "step": 43565 + }, + { + "epoch": 2.14, + "grad_norm": 0.7462189197540283, + "learning_rate": 0.00011487318444885377, + "loss": 3.0923, + "step": 43566 + }, + { + "epoch": 2.14, + "grad_norm": 0.7095943689346313, + "learning_rate": 0.00011486106953723987, + "loss": 3.0918, + "step": 43567 + }, + { + "epoch": 2.14, + "grad_norm": 0.7356390953063965, + "learning_rate": 0.0001148489551132462, + "loss": 2.9536, + "step": 43568 + }, + { + "epoch": 2.14, + "grad_norm": 0.6879743337631226, + "learning_rate": 0.00011483684117690487, + "loss": 2.749, + "step": 43569 + }, + { + "epoch": 2.14, + "grad_norm": 0.6889345645904541, + "learning_rate": 0.00011482472772824765, + "loss": 2.9549, + "step": 43570 + }, + { + "epoch": 2.14, + "grad_norm": 0.7350010871887207, + "learning_rate": 0.00011481261476730634, + "loss": 2.9164, + "step": 43571 + }, + { + "epoch": 2.14, + "grad_norm": 0.7565581202507019, + "learning_rate": 0.00011480050229411307, + "loss": 2.873, + "step": 43572 + }, + { + "epoch": 2.14, + "grad_norm": 0.7308693528175354, + "learning_rate": 0.00011478839030869956, + "loss": 2.8584, + "step": 43573 + }, + { + "epoch": 2.14, + "grad_norm": 0.7340457439422607, + "learning_rate": 0.00011477627881109776, + "loss": 3.0429, + "step": 43574 + }, + { + "epoch": 2.14, + "grad_norm": 0.7506102323532104, + "learning_rate": 0.00011476416780133972, + "loss": 3.0489, + "step": 43575 + }, + { + "epoch": 2.14, + "grad_norm": 0.7139230966567993, + "learning_rate": 0.0001147520572794571, + "loss": 2.948, + "step": 43576 + }, + { + "epoch": 2.14, + "grad_norm": 0.7269354462623596, + "learning_rate": 0.00011473994724548199, + "loss": 2.8071, + "step": 43577 + }, + { + "epoch": 2.14, + "grad_norm": 0.7375877499580383, + "learning_rate": 0.00011472783769944623, + "loss": 2.8066, + "step": 43578 + }, + { + "epoch": 2.14, + "grad_norm": 0.701732337474823, + "learning_rate": 0.00011471572864138158, + "loss": 2.7454, + "step": 43579 + }, + { + "epoch": 2.14, + "grad_norm": 0.6951040029525757, + "learning_rate": 0.00011470362007132013, + "loss": 3.1121, + "step": 43580 + }, + { + "epoch": 2.14, + "grad_norm": 0.745621919631958, + "learning_rate": 0.00011469151198929362, + "loss": 2.6036, + "step": 43581 + }, + { + "epoch": 2.14, + "grad_norm": 0.7371377944946289, + "learning_rate": 0.00011467940439533406, + "loss": 2.7472, + "step": 43582 + }, + { + "epoch": 2.14, + "grad_norm": 0.7030379176139832, + "learning_rate": 0.00011466729728947321, + "loss": 2.8501, + "step": 43583 + }, + { + "epoch": 2.14, + "grad_norm": 0.6986172795295715, + "learning_rate": 0.00011465519067174312, + "loss": 2.8457, + "step": 43584 + }, + { + "epoch": 2.14, + "grad_norm": 0.673969566822052, + "learning_rate": 0.0001146430845421756, + "loss": 2.8234, + "step": 43585 + }, + { + "epoch": 2.14, + "grad_norm": 0.6397623419761658, + "learning_rate": 0.00011463097890080241, + "loss": 3.3042, + "step": 43586 + }, + { + "epoch": 2.14, + "grad_norm": 0.7095158696174622, + "learning_rate": 0.00011461887374765564, + "loss": 2.8524, + "step": 43587 + }, + { + "epoch": 2.14, + "grad_norm": 0.6966813206672668, + "learning_rate": 0.000114606769082767, + "loss": 3.1101, + "step": 43588 + }, + { + "epoch": 2.14, + "grad_norm": 0.7600648999214172, + "learning_rate": 0.00011459466490616845, + "loss": 2.838, + "step": 43589 + }, + { + "epoch": 2.14, + "grad_norm": 0.7137221693992615, + "learning_rate": 0.00011458256121789194, + "loss": 2.8731, + "step": 43590 + }, + { + "epoch": 2.14, + "grad_norm": 0.6784641742706299, + "learning_rate": 0.00011457045801796932, + "loss": 3.0268, + "step": 43591 + }, + { + "epoch": 2.14, + "grad_norm": 0.7136608362197876, + "learning_rate": 0.0001145583553064324, + "loss": 2.9876, + "step": 43592 + }, + { + "epoch": 2.14, + "grad_norm": 0.7022787928581238, + "learning_rate": 0.00011454625308331298, + "loss": 2.7072, + "step": 43593 + }, + { + "epoch": 2.14, + "grad_norm": 0.6970979571342468, + "learning_rate": 0.00011453415134864305, + "loss": 3.2014, + "step": 43594 + }, + { + "epoch": 2.14, + "grad_norm": 0.6926916241645813, + "learning_rate": 0.00011452205010245456, + "loss": 2.9012, + "step": 43595 + }, + { + "epoch": 2.14, + "grad_norm": 0.6950446367263794, + "learning_rate": 0.00011450994934477922, + "loss": 3.2737, + "step": 43596 + }, + { + "epoch": 2.14, + "grad_norm": 0.6955012083053589, + "learning_rate": 0.00011449784907564908, + "loss": 2.9293, + "step": 43597 + }, + { + "epoch": 2.14, + "grad_norm": 0.6871076226234436, + "learning_rate": 0.00011448574929509578, + "loss": 3.1272, + "step": 43598 + }, + { + "epoch": 2.14, + "grad_norm": 0.7409434914588928, + "learning_rate": 0.0001144736500031514, + "loss": 2.9277, + "step": 43599 + }, + { + "epoch": 2.14, + "grad_norm": 0.7080227732658386, + "learning_rate": 0.00011446155119984774, + "loss": 3.0437, + "step": 43600 + }, + { + "epoch": 2.14, + "grad_norm": 0.7121559977531433, + "learning_rate": 0.00011444945288521653, + "loss": 2.964, + "step": 43601 + }, + { + "epoch": 2.14, + "grad_norm": 0.7132676839828491, + "learning_rate": 0.00011443735505928989, + "loss": 2.9648, + "step": 43602 + }, + { + "epoch": 2.14, + "grad_norm": 0.7137494683265686, + "learning_rate": 0.0001144252577220994, + "loss": 2.8898, + "step": 43603 + }, + { + "epoch": 2.14, + "grad_norm": 0.7248656153678894, + "learning_rate": 0.00011441316087367711, + "loss": 2.808, + "step": 43604 + }, + { + "epoch": 2.14, + "grad_norm": 0.6689263582229614, + "learning_rate": 0.00011440106451405492, + "loss": 2.6834, + "step": 43605 + }, + { + "epoch": 2.14, + "grad_norm": 0.6864780783653259, + "learning_rate": 0.0001143889686432646, + "loss": 2.8517, + "step": 43606 + }, + { + "epoch": 2.14, + "grad_norm": 0.7175418734550476, + "learning_rate": 0.000114376873261338, + "loss": 3.1024, + "step": 43607 + }, + { + "epoch": 2.14, + "grad_norm": 0.723504900932312, + "learning_rate": 0.00011436477836830692, + "loss": 2.971, + "step": 43608 + }, + { + "epoch": 2.14, + "grad_norm": 0.7824511528015137, + "learning_rate": 0.00011435268396420327, + "loss": 2.944, + "step": 43609 + }, + { + "epoch": 2.14, + "grad_norm": 0.7112360596656799, + "learning_rate": 0.00011434059004905903, + "loss": 2.7707, + "step": 43610 + }, + { + "epoch": 2.14, + "grad_norm": 0.7221190929412842, + "learning_rate": 0.00011432849662290584, + "loss": 3.0907, + "step": 43611 + }, + { + "epoch": 2.14, + "grad_norm": 0.6884748339653015, + "learning_rate": 0.00011431640368577577, + "loss": 2.9186, + "step": 43612 + }, + { + "epoch": 2.14, + "grad_norm": 0.7097779512405396, + "learning_rate": 0.00011430431123770052, + "loss": 3.1311, + "step": 43613 + }, + { + "epoch": 2.14, + "grad_norm": 0.6831579804420471, + "learning_rate": 0.00011429221927871192, + "loss": 2.7814, + "step": 43614 + }, + { + "epoch": 2.14, + "grad_norm": 0.7085390686988831, + "learning_rate": 0.00011428012780884197, + "loss": 2.8956, + "step": 43615 + }, + { + "epoch": 2.14, + "grad_norm": 0.7171388268470764, + "learning_rate": 0.0001142680368281223, + "loss": 3.1167, + "step": 43616 + }, + { + "epoch": 2.14, + "grad_norm": 0.679538905620575, + "learning_rate": 0.00011425594633658499, + "loss": 3.0024, + "step": 43617 + }, + { + "epoch": 2.14, + "grad_norm": 0.7115911245346069, + "learning_rate": 0.00011424385633426165, + "loss": 2.8164, + "step": 43618 + }, + { + "epoch": 2.14, + "grad_norm": 0.7108883857727051, + "learning_rate": 0.00011423176682118425, + "loss": 2.9752, + "step": 43619 + }, + { + "epoch": 2.14, + "grad_norm": 0.6663087010383606, + "learning_rate": 0.00011421967779738475, + "loss": 2.8533, + "step": 43620 + }, + { + "epoch": 2.14, + "grad_norm": 0.6840021014213562, + "learning_rate": 0.00011420758926289487, + "loss": 3.0553, + "step": 43621 + }, + { + "epoch": 2.14, + "grad_norm": 0.7471802234649658, + "learning_rate": 0.00011419550121774641, + "loss": 2.944, + "step": 43622 + }, + { + "epoch": 2.14, + "grad_norm": 0.6915409564971924, + "learning_rate": 0.00011418341366197113, + "loss": 2.7266, + "step": 43623 + }, + { + "epoch": 2.14, + "grad_norm": 0.6648789048194885, + "learning_rate": 0.00011417132659560102, + "loss": 2.988, + "step": 43624 + }, + { + "epoch": 2.14, + "grad_norm": 0.7293175458908081, + "learning_rate": 0.00011415924001866795, + "loss": 2.966, + "step": 43625 + }, + { + "epoch": 2.14, + "grad_norm": 0.6726661324501038, + "learning_rate": 0.00011414715393120358, + "loss": 3.2105, + "step": 43626 + }, + { + "epoch": 2.14, + "grad_norm": 0.6855407357215881, + "learning_rate": 0.00011413506833323991, + "loss": 2.7334, + "step": 43627 + }, + { + "epoch": 2.14, + "grad_norm": 0.7026200890541077, + "learning_rate": 0.00011412298322480873, + "loss": 2.9252, + "step": 43628 + }, + { + "epoch": 2.14, + "grad_norm": 0.7490710020065308, + "learning_rate": 0.00011411089860594174, + "loss": 2.815, + "step": 43629 + }, + { + "epoch": 2.14, + "grad_norm": 0.6966449022293091, + "learning_rate": 0.00011409881447667098, + "loss": 2.8681, + "step": 43630 + }, + { + "epoch": 2.14, + "grad_norm": 0.7190315127372742, + "learning_rate": 0.00011408673083702801, + "loss": 3.0971, + "step": 43631 + }, + { + "epoch": 2.14, + "grad_norm": 0.6999360918998718, + "learning_rate": 0.00011407464768704497, + "loss": 3.0055, + "step": 43632 + }, + { + "epoch": 2.14, + "grad_norm": 0.7241888046264648, + "learning_rate": 0.00011406256502675338, + "loss": 2.8215, + "step": 43633 + }, + { + "epoch": 2.14, + "grad_norm": 0.6731426119804382, + "learning_rate": 0.00011405048285618535, + "loss": 3.0833, + "step": 43634 + }, + { + "epoch": 2.14, + "grad_norm": 0.6741295456886292, + "learning_rate": 0.00011403840117537243, + "loss": 2.9559, + "step": 43635 + }, + { + "epoch": 2.14, + "grad_norm": 0.6998021602630615, + "learning_rate": 0.00011402631998434671, + "loss": 2.9883, + "step": 43636 + }, + { + "epoch": 2.14, + "grad_norm": 0.7177157402038574, + "learning_rate": 0.00011401423928313985, + "loss": 2.9022, + "step": 43637 + }, + { + "epoch": 2.14, + "grad_norm": 0.764556348323822, + "learning_rate": 0.00011400215907178359, + "loss": 3.133, + "step": 43638 + }, + { + "epoch": 2.14, + "grad_norm": 0.7035207748413086, + "learning_rate": 0.00011399007935030994, + "loss": 3.065, + "step": 43639 + }, + { + "epoch": 2.14, + "grad_norm": 0.6932657361030579, + "learning_rate": 0.00011397800011875052, + "loss": 2.9998, + "step": 43640 + }, + { + "epoch": 2.14, + "grad_norm": 0.7348163723945618, + "learning_rate": 0.00011396592137713725, + "loss": 2.9956, + "step": 43641 + }, + { + "epoch": 2.14, + "grad_norm": 0.6634372472763062, + "learning_rate": 0.00011395384312550207, + "loss": 2.6726, + "step": 43642 + }, + { + "epoch": 2.14, + "grad_norm": 0.667530357837677, + "learning_rate": 0.00011394176536387664, + "loss": 2.8978, + "step": 43643 + }, + { + "epoch": 2.14, + "grad_norm": 0.6972176432609558, + "learning_rate": 0.00011392968809229277, + "loss": 3.0545, + "step": 43644 + }, + { + "epoch": 2.14, + "grad_norm": 0.7097292542457581, + "learning_rate": 0.00011391761131078224, + "loss": 2.9076, + "step": 43645 + }, + { + "epoch": 2.14, + "grad_norm": 0.66300368309021, + "learning_rate": 0.00011390553501937687, + "loss": 2.765, + "step": 43646 + }, + { + "epoch": 2.14, + "grad_norm": 0.6970599889755249, + "learning_rate": 0.00011389345921810862, + "loss": 2.7709, + "step": 43647 + }, + { + "epoch": 2.14, + "grad_norm": 0.6801822781562805, + "learning_rate": 0.00011388138390700906, + "loss": 2.7156, + "step": 43648 + }, + { + "epoch": 2.14, + "grad_norm": 0.6642677187919617, + "learning_rate": 0.00011386930908611024, + "loss": 2.9296, + "step": 43649 + }, + { + "epoch": 2.14, + "grad_norm": 0.726485550403595, + "learning_rate": 0.00011385723475544383, + "loss": 2.9243, + "step": 43650 + }, + { + "epoch": 2.14, + "grad_norm": 0.6999016404151917, + "learning_rate": 0.00011384516091504156, + "loss": 2.9357, + "step": 43651 + }, + { + "epoch": 2.14, + "grad_norm": 0.6688977479934692, + "learning_rate": 0.0001138330875649354, + "loss": 2.9612, + "step": 43652 + }, + { + "epoch": 2.14, + "grad_norm": 0.6783860921859741, + "learning_rate": 0.00011382101470515694, + "loss": 2.9645, + "step": 43653 + }, + { + "epoch": 2.14, + "grad_norm": 0.6920573711395264, + "learning_rate": 0.0001138089423357382, + "loss": 2.8786, + "step": 43654 + }, + { + "epoch": 2.14, + "grad_norm": 0.6857960820198059, + "learning_rate": 0.0001137968704567108, + "loss": 2.9031, + "step": 43655 + }, + { + "epoch": 2.14, + "grad_norm": 0.7088309526443481, + "learning_rate": 0.00011378479906810658, + "loss": 2.9249, + "step": 43656 + }, + { + "epoch": 2.14, + "grad_norm": 0.6725496649742126, + "learning_rate": 0.00011377272816995746, + "loss": 2.9255, + "step": 43657 + }, + { + "epoch": 2.14, + "grad_norm": 0.7024503350257874, + "learning_rate": 0.00011376065776229515, + "loss": 3.0291, + "step": 43658 + }, + { + "epoch": 2.14, + "grad_norm": 0.7618658542633057, + "learning_rate": 0.00011374858784515143, + "loss": 2.8862, + "step": 43659 + }, + { + "epoch": 2.14, + "grad_norm": 0.7083359956741333, + "learning_rate": 0.00011373651841855797, + "loss": 2.9298, + "step": 43660 + }, + { + "epoch": 2.14, + "grad_norm": 0.7162266373634338, + "learning_rate": 0.00011372444948254667, + "loss": 3.0221, + "step": 43661 + }, + { + "epoch": 2.14, + "grad_norm": 0.6709370613098145, + "learning_rate": 0.00011371238103714944, + "loss": 2.74, + "step": 43662 + }, + { + "epoch": 2.14, + "grad_norm": 0.6915350556373596, + "learning_rate": 0.00011370031308239783, + "loss": 2.9314, + "step": 43663 + }, + { + "epoch": 2.14, + "grad_norm": 0.7247147560119629, + "learning_rate": 0.00011368824561832383, + "loss": 3.0371, + "step": 43664 + }, + { + "epoch": 2.14, + "grad_norm": 0.8002133965492249, + "learning_rate": 0.00011367617864495916, + "loss": 3.008, + "step": 43665 + }, + { + "epoch": 2.14, + "grad_norm": 0.6851048469543457, + "learning_rate": 0.00011366411216233548, + "loss": 2.9939, + "step": 43666 + }, + { + "epoch": 2.14, + "grad_norm": 0.7082476019859314, + "learning_rate": 0.00011365204617048474, + "loss": 3.0785, + "step": 43667 + }, + { + "epoch": 2.14, + "grad_norm": 0.668368399143219, + "learning_rate": 0.00011363998066943857, + "loss": 2.8655, + "step": 43668 + }, + { + "epoch": 2.14, + "grad_norm": 0.6958811283111572, + "learning_rate": 0.00011362791565922892, + "loss": 2.7918, + "step": 43669 + }, + { + "epoch": 2.14, + "grad_norm": 0.7040616273880005, + "learning_rate": 0.00011361585113988737, + "loss": 3.1085, + "step": 43670 + }, + { + "epoch": 2.14, + "grad_norm": 0.7405644655227661, + "learning_rate": 0.00011360378711144588, + "loss": 2.7158, + "step": 43671 + }, + { + "epoch": 2.14, + "grad_norm": 0.66157066822052, + "learning_rate": 0.00011359172357393618, + "loss": 2.8066, + "step": 43672 + }, + { + "epoch": 2.14, + "grad_norm": 0.8532086610794067, + "learning_rate": 0.00011357966052738988, + "loss": 2.8688, + "step": 43673 + }, + { + "epoch": 2.14, + "grad_norm": 0.7060077786445618, + "learning_rate": 0.00011356759797183896, + "loss": 2.921, + "step": 43674 + }, + { + "epoch": 2.14, + "grad_norm": 0.71628737449646, + "learning_rate": 0.00011355553590731503, + "loss": 3.0828, + "step": 43675 + }, + { + "epoch": 2.14, + "grad_norm": 0.6801706552505493, + "learning_rate": 0.00011354347433384993, + "loss": 2.9732, + "step": 43676 + }, + { + "epoch": 2.14, + "grad_norm": 0.6853070259094238, + "learning_rate": 0.00011353141325147555, + "loss": 2.8799, + "step": 43677 + }, + { + "epoch": 2.14, + "grad_norm": 0.6793110966682434, + "learning_rate": 0.00011351935266022343, + "loss": 2.8989, + "step": 43678 + }, + { + "epoch": 2.14, + "grad_norm": 0.6548277735710144, + "learning_rate": 0.00011350729256012555, + "loss": 2.7669, + "step": 43679 + }, + { + "epoch": 2.14, + "grad_norm": 0.7266643047332764, + "learning_rate": 0.00011349523295121356, + "loss": 2.8731, + "step": 43680 + }, + { + "epoch": 2.14, + "grad_norm": 0.7011723518371582, + "learning_rate": 0.00011348317383351915, + "loss": 3.0052, + "step": 43681 + }, + { + "epoch": 2.14, + "grad_norm": 0.6877959370613098, + "learning_rate": 0.00011347111520707424, + "loss": 3.0605, + "step": 43682 + }, + { + "epoch": 2.14, + "grad_norm": 0.7112438678741455, + "learning_rate": 0.00011345905707191044, + "loss": 3.0148, + "step": 43683 + }, + { + "epoch": 2.14, + "grad_norm": 0.7566985487937927, + "learning_rate": 0.00011344699942805966, + "loss": 3.1465, + "step": 43684 + }, + { + "epoch": 2.14, + "grad_norm": 0.7064225673675537, + "learning_rate": 0.0001134349422755535, + "loss": 3.0024, + "step": 43685 + }, + { + "epoch": 2.14, + "grad_norm": 0.6750404238700867, + "learning_rate": 0.00011342288561442386, + "loss": 2.8038, + "step": 43686 + }, + { + "epoch": 2.14, + "grad_norm": 0.6803438663482666, + "learning_rate": 0.00011341082944470248, + "loss": 2.9661, + "step": 43687 + }, + { + "epoch": 2.14, + "grad_norm": 0.6766833066940308, + "learning_rate": 0.00011339877376642095, + "loss": 3.184, + "step": 43688 + }, + { + "epoch": 2.14, + "grad_norm": 0.70738685131073, + "learning_rate": 0.00011338671857961125, + "loss": 2.949, + "step": 43689 + }, + { + "epoch": 2.14, + "grad_norm": 0.6981825828552246, + "learning_rate": 0.00011337466388430489, + "loss": 2.8143, + "step": 43690 + }, + { + "epoch": 2.14, + "grad_norm": 0.6753236651420593, + "learning_rate": 0.00011336260968053374, + "loss": 3.0315, + "step": 43691 + }, + { + "epoch": 2.14, + "grad_norm": 0.7474436163902283, + "learning_rate": 0.0001133505559683297, + "loss": 2.7655, + "step": 43692 + }, + { + "epoch": 2.14, + "grad_norm": 0.717900812625885, + "learning_rate": 0.00011333850274772433, + "loss": 2.8381, + "step": 43693 + }, + { + "epoch": 2.14, + "grad_norm": 0.6985374093055725, + "learning_rate": 0.00011332645001874947, + "loss": 2.7779, + "step": 43694 + }, + { + "epoch": 2.14, + "grad_norm": 0.6623815298080444, + "learning_rate": 0.00011331439778143669, + "loss": 2.7031, + "step": 43695 + }, + { + "epoch": 2.14, + "grad_norm": 0.7060630321502686, + "learning_rate": 0.00011330234603581782, + "loss": 3.1725, + "step": 43696 + }, + { + "epoch": 2.14, + "grad_norm": 0.6988420486450195, + "learning_rate": 0.00011329029478192478, + "loss": 2.9677, + "step": 43697 + }, + { + "epoch": 2.14, + "grad_norm": 0.7007870078086853, + "learning_rate": 0.00011327824401978907, + "loss": 2.9562, + "step": 43698 + }, + { + "epoch": 2.14, + "grad_norm": 0.6870179176330566, + "learning_rate": 0.00011326619374944263, + "loss": 3.0245, + "step": 43699 + }, + { + "epoch": 2.14, + "grad_norm": 0.7430259585380554, + "learning_rate": 0.00011325414397091699, + "loss": 2.8442, + "step": 43700 + }, + { + "epoch": 2.14, + "grad_norm": 0.7168817520141602, + "learning_rate": 0.0001132420946842441, + "loss": 2.5137, + "step": 43701 + }, + { + "epoch": 2.14, + "grad_norm": 0.7056675553321838, + "learning_rate": 0.00011323004588945558, + "loss": 2.8598, + "step": 43702 + }, + { + "epoch": 2.14, + "grad_norm": 0.6785736680030823, + "learning_rate": 0.0001132179975865831, + "loss": 2.6708, + "step": 43703 + }, + { + "epoch": 2.14, + "grad_norm": 0.6528010368347168, + "learning_rate": 0.00011320594977565854, + "loss": 2.8608, + "step": 43704 + }, + { + "epoch": 2.14, + "grad_norm": 0.7285547256469727, + "learning_rate": 0.00011319390245671346, + "loss": 2.785, + "step": 43705 + }, + { + "epoch": 2.14, + "grad_norm": 0.6859037280082703, + "learning_rate": 0.00011318185562977969, + "loss": 2.9215, + "step": 43706 + }, + { + "epoch": 2.14, + "grad_norm": 0.683616578578949, + "learning_rate": 0.00011316980929488908, + "loss": 2.921, + "step": 43707 + }, + { + "epoch": 2.14, + "grad_norm": 0.7073354125022888, + "learning_rate": 0.00011315776345207322, + "loss": 3.038, + "step": 43708 + }, + { + "epoch": 2.14, + "grad_norm": 0.6951118111610413, + "learning_rate": 0.00011314571810136386, + "loss": 3.0371, + "step": 43709 + }, + { + "epoch": 2.14, + "grad_norm": 0.7080778479576111, + "learning_rate": 0.00011313367324279263, + "loss": 3.0615, + "step": 43710 + }, + { + "epoch": 2.14, + "grad_norm": 0.712817907333374, + "learning_rate": 0.0001131216288763914, + "loss": 2.7927, + "step": 43711 + }, + { + "epoch": 2.14, + "grad_norm": 0.7057470679283142, + "learning_rate": 0.00011310958500219177, + "loss": 3.0071, + "step": 43712 + }, + { + "epoch": 2.14, + "grad_norm": 0.7209771275520325, + "learning_rate": 0.00011309754162022551, + "loss": 2.9617, + "step": 43713 + }, + { + "epoch": 2.14, + "grad_norm": 0.6734073162078857, + "learning_rate": 0.00011308549873052447, + "loss": 2.7933, + "step": 43714 + }, + { + "epoch": 2.14, + "grad_norm": 0.6648086905479431, + "learning_rate": 0.00011307345633312024, + "loss": 2.9428, + "step": 43715 + }, + { + "epoch": 2.14, + "grad_norm": 0.6905327439308167, + "learning_rate": 0.00011306141442804456, + "loss": 2.8132, + "step": 43716 + }, + { + "epoch": 2.14, + "grad_norm": 0.7069293856620789, + "learning_rate": 0.00011304937301532901, + "loss": 2.7406, + "step": 43717 + }, + { + "epoch": 2.14, + "grad_norm": 0.6863256096839905, + "learning_rate": 0.00011303733209500548, + "loss": 2.9026, + "step": 43718 + }, + { + "epoch": 2.14, + "grad_norm": 0.6748086810112, + "learning_rate": 0.0001130252916671057, + "loss": 2.9089, + "step": 43719 + }, + { + "epoch": 2.14, + "grad_norm": 0.68429034948349, + "learning_rate": 0.0001130132517316612, + "loss": 2.9713, + "step": 43720 + }, + { + "epoch": 2.14, + "grad_norm": 0.6415186524391174, + "learning_rate": 0.00011300121228870396, + "loss": 2.6823, + "step": 43721 + }, + { + "epoch": 2.14, + "grad_norm": 0.6968873143196106, + "learning_rate": 0.0001129891733382654, + "loss": 2.8784, + "step": 43722 + }, + { + "epoch": 2.14, + "grad_norm": 0.7179439067840576, + "learning_rate": 0.00011297713488037748, + "loss": 2.8995, + "step": 43723 + }, + { + "epoch": 2.14, + "grad_norm": 0.7167483568191528, + "learning_rate": 0.0001129650969150718, + "loss": 2.8282, + "step": 43724 + }, + { + "epoch": 2.14, + "grad_norm": 0.7356143593788147, + "learning_rate": 0.00011295305944237995, + "loss": 2.6649, + "step": 43725 + }, + { + "epoch": 2.14, + "grad_norm": 0.6669628024101257, + "learning_rate": 0.00011294102246233385, + "loss": 2.8961, + "step": 43726 + }, + { + "epoch": 2.14, + "grad_norm": 0.7157459855079651, + "learning_rate": 0.000112928985974965, + "loss": 2.9598, + "step": 43727 + }, + { + "epoch": 2.14, + "grad_norm": 0.6857950091362, + "learning_rate": 0.00011291694998030517, + "loss": 2.9189, + "step": 43728 + }, + { + "epoch": 2.14, + "grad_norm": 0.7263401746749878, + "learning_rate": 0.00011290491447838622, + "loss": 2.8394, + "step": 43729 + }, + { + "epoch": 2.14, + "grad_norm": 0.7011303901672363, + "learning_rate": 0.00011289287946923973, + "loss": 2.8701, + "step": 43730 + }, + { + "epoch": 2.14, + "grad_norm": 0.7553688287734985, + "learning_rate": 0.00011288084495289737, + "loss": 2.8729, + "step": 43731 + }, + { + "epoch": 2.14, + "grad_norm": 0.6972824931144714, + "learning_rate": 0.00011286881092939074, + "loss": 2.9316, + "step": 43732 + }, + { + "epoch": 2.14, + "grad_norm": 0.7040778398513794, + "learning_rate": 0.00011285677739875163, + "loss": 2.7414, + "step": 43733 + }, + { + "epoch": 2.14, + "grad_norm": 0.6931003332138062, + "learning_rate": 0.00011284474436101188, + "loss": 2.7205, + "step": 43734 + }, + { + "epoch": 2.14, + "grad_norm": 0.6897353529930115, + "learning_rate": 0.00011283271181620295, + "loss": 2.8586, + "step": 43735 + }, + { + "epoch": 2.14, + "grad_norm": 0.716933012008667, + "learning_rate": 0.00011282067976435676, + "loss": 3.0455, + "step": 43736 + }, + { + "epoch": 2.14, + "grad_norm": 0.7096711993217468, + "learning_rate": 0.00011280864820550474, + "loss": 2.9453, + "step": 43737 + }, + { + "epoch": 2.14, + "grad_norm": 0.7447996139526367, + "learning_rate": 0.00011279661713967882, + "loss": 2.9867, + "step": 43738 + }, + { + "epoch": 2.14, + "grad_norm": 0.6898730993270874, + "learning_rate": 0.0001127845865669106, + "loss": 2.8042, + "step": 43739 + }, + { + "epoch": 2.14, + "grad_norm": 0.6772977709770203, + "learning_rate": 0.00011277255648723163, + "loss": 2.9059, + "step": 43740 + }, + { + "epoch": 2.14, + "grad_norm": 0.7092178463935852, + "learning_rate": 0.00011276052690067382, + "loss": 2.8711, + "step": 43741 + }, + { + "epoch": 2.14, + "grad_norm": 0.6839644312858582, + "learning_rate": 0.00011274849780726866, + "loss": 2.8759, + "step": 43742 + }, + { + "epoch": 2.14, + "grad_norm": 0.7505874037742615, + "learning_rate": 0.00011273646920704792, + "loss": 3.0373, + "step": 43743 + }, + { + "epoch": 2.14, + "grad_norm": 0.7028711438179016, + "learning_rate": 0.00011272444110004335, + "loss": 2.7585, + "step": 43744 + }, + { + "epoch": 2.14, + "grad_norm": 0.7121570706367493, + "learning_rate": 0.00011271241348628659, + "loss": 2.9666, + "step": 43745 + }, + { + "epoch": 2.14, + "grad_norm": 0.7014894485473633, + "learning_rate": 0.00011270038636580928, + "loss": 2.9703, + "step": 43746 + }, + { + "epoch": 2.14, + "grad_norm": 0.7833147644996643, + "learning_rate": 0.000112688359738643, + "loss": 2.7485, + "step": 43747 + }, + { + "epoch": 2.14, + "grad_norm": 0.6829766631126404, + "learning_rate": 0.00011267633360481954, + "loss": 3.0008, + "step": 43748 + }, + { + "epoch": 2.14, + "grad_norm": 0.7053651809692383, + "learning_rate": 0.00011266430796437067, + "loss": 3.0635, + "step": 43749 + }, + { + "epoch": 2.14, + "grad_norm": 0.7544901371002197, + "learning_rate": 0.00011265228281732784, + "loss": 2.9556, + "step": 43750 + }, + { + "epoch": 2.14, + "grad_norm": 0.6905205845832825, + "learning_rate": 0.00011264025816372298, + "loss": 3.0138, + "step": 43751 + }, + { + "epoch": 2.14, + "grad_norm": 0.7467920780181885, + "learning_rate": 0.00011262823400358756, + "loss": 2.7328, + "step": 43752 + }, + { + "epoch": 2.14, + "grad_norm": 0.6714316606521606, + "learning_rate": 0.00011261621033695327, + "loss": 2.9427, + "step": 43753 + }, + { + "epoch": 2.14, + "grad_norm": 0.6609356999397278, + "learning_rate": 0.00011260418716385189, + "loss": 2.8976, + "step": 43754 + }, + { + "epoch": 2.14, + "grad_norm": 0.728022038936615, + "learning_rate": 0.00011259216448431493, + "loss": 2.8679, + "step": 43755 + }, + { + "epoch": 2.14, + "grad_norm": 0.7265163660049438, + "learning_rate": 0.00011258014229837424, + "loss": 2.9928, + "step": 43756 + }, + { + "epoch": 2.14, + "grad_norm": 0.6698508262634277, + "learning_rate": 0.00011256812060606127, + "loss": 2.8256, + "step": 43757 + }, + { + "epoch": 2.14, + "grad_norm": 0.7141711115837097, + "learning_rate": 0.0001125560994074078, + "loss": 2.6912, + "step": 43758 + }, + { + "epoch": 2.14, + "grad_norm": 0.705557644367218, + "learning_rate": 0.0001125440787024456, + "loss": 2.8845, + "step": 43759 + }, + { + "epoch": 2.14, + "grad_norm": 0.7180753946304321, + "learning_rate": 0.00011253205849120621, + "loss": 2.883, + "step": 43760 + }, + { + "epoch": 2.14, + "grad_norm": 0.6521044969558716, + "learning_rate": 0.00011252003877372132, + "loss": 2.9457, + "step": 43761 + }, + { + "epoch": 2.14, + "grad_norm": 0.7582170367240906, + "learning_rate": 0.00011250801955002245, + "loss": 3.0808, + "step": 43762 + }, + { + "epoch": 2.14, + "grad_norm": 0.6841397881507874, + "learning_rate": 0.00011249600082014135, + "loss": 2.8299, + "step": 43763 + }, + { + "epoch": 2.14, + "grad_norm": 0.6913440227508545, + "learning_rate": 0.00011248398258410986, + "loss": 2.8295, + "step": 43764 + }, + { + "epoch": 2.14, + "grad_norm": 0.6824356317520142, + "learning_rate": 0.00011247196484195933, + "loss": 2.8794, + "step": 43765 + }, + { + "epoch": 2.14, + "grad_norm": 0.7842134237289429, + "learning_rate": 0.00011245994759372164, + "loss": 2.8076, + "step": 43766 + }, + { + "epoch": 2.14, + "grad_norm": 0.7321650981903076, + "learning_rate": 0.00011244793083942838, + "loss": 3.0722, + "step": 43767 + }, + { + "epoch": 2.14, + "grad_norm": 0.7260737419128418, + "learning_rate": 0.00011243591457911107, + "loss": 2.994, + "step": 43768 + }, + { + "epoch": 2.15, + "grad_norm": 0.7154833674430847, + "learning_rate": 0.00011242389881280158, + "loss": 2.985, + "step": 43769 + }, + { + "epoch": 2.15, + "grad_norm": 0.6974570155143738, + "learning_rate": 0.00011241188354053131, + "loss": 2.9196, + "step": 43770 + }, + { + "epoch": 2.15, + "grad_norm": 0.7746239900588989, + "learning_rate": 0.00011239986876233214, + "loss": 2.7981, + "step": 43771 + }, + { + "epoch": 2.15, + "grad_norm": 0.6849657297134399, + "learning_rate": 0.00011238785447823552, + "loss": 2.9164, + "step": 43772 + }, + { + "epoch": 2.15, + "grad_norm": 0.7732822895050049, + "learning_rate": 0.0001123758406882733, + "loss": 2.8262, + "step": 43773 + }, + { + "epoch": 2.15, + "grad_norm": 0.7015544176101685, + "learning_rate": 0.000112363827392477, + "loss": 2.9393, + "step": 43774 + }, + { + "epoch": 2.15, + "grad_norm": 0.6607134342193604, + "learning_rate": 0.00011235181459087816, + "loss": 2.9479, + "step": 43775 + }, + { + "epoch": 2.15, + "grad_norm": 0.7178341150283813, + "learning_rate": 0.00011233980228350863, + "loss": 2.9219, + "step": 43776 + }, + { + "epoch": 2.15, + "grad_norm": 0.6694148778915405, + "learning_rate": 0.00011232779047039983, + "loss": 2.7359, + "step": 43777 + }, + { + "epoch": 2.15, + "grad_norm": 0.7360499501228333, + "learning_rate": 0.00011231577915158353, + "loss": 2.9348, + "step": 43778 + }, + { + "epoch": 2.15, + "grad_norm": 0.672639787197113, + "learning_rate": 0.00011230376832709144, + "loss": 2.846, + "step": 43779 + }, + { + "epoch": 2.15, + "grad_norm": 0.7251139879226685, + "learning_rate": 0.00011229175799695498, + "loss": 3.0, + "step": 43780 + }, + { + "epoch": 2.15, + "grad_norm": 0.6625097393989563, + "learning_rate": 0.00011227974816120603, + "loss": 2.9168, + "step": 43781 + }, + { + "epoch": 2.15, + "grad_norm": 0.6882855296134949, + "learning_rate": 0.0001122677388198761, + "loss": 3.1654, + "step": 43782 + }, + { + "epoch": 2.15, + "grad_norm": 0.6938197612762451, + "learning_rate": 0.0001122557299729967, + "loss": 2.8398, + "step": 43783 + }, + { + "epoch": 2.15, + "grad_norm": 0.701835572719574, + "learning_rate": 0.00011224372162059972, + "loss": 3.03, + "step": 43784 + }, + { + "epoch": 2.15, + "grad_norm": 0.6981818675994873, + "learning_rate": 0.0001122317137627165, + "loss": 2.9271, + "step": 43785 + }, + { + "epoch": 2.15, + "grad_norm": 0.7233262658119202, + "learning_rate": 0.00011221970639937891, + "loss": 2.7305, + "step": 43786 + }, + { + "epoch": 2.15, + "grad_norm": 0.7583865523338318, + "learning_rate": 0.00011220769953061837, + "loss": 2.9369, + "step": 43787 + }, + { + "epoch": 2.15, + "grad_norm": 0.6952337622642517, + "learning_rate": 0.00011219569315646672, + "loss": 2.8671, + "step": 43788 + }, + { + "epoch": 2.15, + "grad_norm": 0.6880810260772705, + "learning_rate": 0.00011218368727695546, + "loss": 2.9465, + "step": 43789 + }, + { + "epoch": 2.15, + "grad_norm": 0.6928128600120544, + "learning_rate": 0.00011217168189211614, + "loss": 2.7894, + "step": 43790 + }, + { + "epoch": 2.15, + "grad_norm": 0.6935843825340271, + "learning_rate": 0.00011215967700198057, + "loss": 2.9268, + "step": 43791 + }, + { + "epoch": 2.15, + "grad_norm": 0.7136497497558594, + "learning_rate": 0.00011214767260658011, + "loss": 3.0224, + "step": 43792 + }, + { + "epoch": 2.15, + "grad_norm": 0.7205958962440491, + "learning_rate": 0.00011213566870594668, + "loss": 2.8294, + "step": 43793 + }, + { + "epoch": 2.15, + "grad_norm": 0.714300811290741, + "learning_rate": 0.00011212366530011162, + "loss": 3.0795, + "step": 43794 + }, + { + "epoch": 2.15, + "grad_norm": 0.6874080300331116, + "learning_rate": 0.00011211166238910677, + "loss": 3.0615, + "step": 43795 + }, + { + "epoch": 2.15, + "grad_norm": 0.7105938196182251, + "learning_rate": 0.00011209965997296363, + "loss": 2.9694, + "step": 43796 + }, + { + "epoch": 2.15, + "grad_norm": 0.7178138494491577, + "learning_rate": 0.00011208765805171373, + "loss": 3.0575, + "step": 43797 + }, + { + "epoch": 2.15, + "grad_norm": 0.7538326978683472, + "learning_rate": 0.0001120756566253889, + "loss": 3.0899, + "step": 43798 + }, + { + "epoch": 2.15, + "grad_norm": 0.6940291523933411, + "learning_rate": 0.00011206365569402051, + "loss": 2.9107, + "step": 43799 + }, + { + "epoch": 2.15, + "grad_norm": 0.754417896270752, + "learning_rate": 0.00011205165525764026, + "loss": 2.9723, + "step": 43800 + }, + { + "epoch": 2.15, + "grad_norm": 0.7753744125366211, + "learning_rate": 0.00011203965531627991, + "loss": 2.9622, + "step": 43801 + }, + { + "epoch": 2.15, + "grad_norm": 0.7098350524902344, + "learning_rate": 0.00011202765586997082, + "loss": 2.8377, + "step": 43802 + }, + { + "epoch": 2.15, + "grad_norm": 0.7141963839530945, + "learning_rate": 0.00011201565691874482, + "loss": 2.887, + "step": 43803 + }, + { + "epoch": 2.15, + "grad_norm": 0.6638633608818054, + "learning_rate": 0.00011200365846263338, + "loss": 2.9423, + "step": 43804 + }, + { + "epoch": 2.15, + "grad_norm": 0.6971076726913452, + "learning_rate": 0.00011199166050166801, + "loss": 2.8963, + "step": 43805 + }, + { + "epoch": 2.15, + "grad_norm": 0.6292498111724854, + "learning_rate": 0.00011197966303588055, + "loss": 3.1008, + "step": 43806 + }, + { + "epoch": 2.15, + "grad_norm": 0.696465790271759, + "learning_rate": 0.00011196766606530238, + "loss": 2.8679, + "step": 43807 + }, + { + "epoch": 2.15, + "grad_norm": 0.6825982332229614, + "learning_rate": 0.00011195566958996526, + "loss": 3.0, + "step": 43808 + }, + { + "epoch": 2.15, + "grad_norm": 0.7092751264572144, + "learning_rate": 0.00011194367360990063, + "loss": 3.0249, + "step": 43809 + }, + { + "epoch": 2.15, + "grad_norm": 0.7213112115859985, + "learning_rate": 0.0001119316781251403, + "loss": 2.9623, + "step": 43810 + }, + { + "epoch": 2.15, + "grad_norm": 0.6883493065834045, + "learning_rate": 0.0001119196831357157, + "loss": 3.0142, + "step": 43811 + }, + { + "epoch": 2.15, + "grad_norm": 0.6896858215332031, + "learning_rate": 0.00011190768864165837, + "loss": 2.941, + "step": 43812 + }, + { + "epoch": 2.15, + "grad_norm": 0.7189692258834839, + "learning_rate": 0.00011189569464300007, + "loss": 2.9459, + "step": 43813 + }, + { + "epoch": 2.15, + "grad_norm": 0.67086261510849, + "learning_rate": 0.00011188370113977226, + "loss": 3.2535, + "step": 43814 + }, + { + "epoch": 2.15, + "grad_norm": 0.6902649998664856, + "learning_rate": 0.00011187170813200654, + "loss": 3.2002, + "step": 43815 + }, + { + "epoch": 2.15, + "grad_norm": 0.6927818655967712, + "learning_rate": 0.00011185971561973463, + "loss": 2.8208, + "step": 43816 + }, + { + "epoch": 2.15, + "grad_norm": 0.728071391582489, + "learning_rate": 0.00011184772360298804, + "loss": 3.154, + "step": 43817 + }, + { + "epoch": 2.15, + "grad_norm": 0.6964516043663025, + "learning_rate": 0.00011183573208179832, + "loss": 3.2181, + "step": 43818 + }, + { + "epoch": 2.15, + "grad_norm": 0.7113494277000427, + "learning_rate": 0.00011182374105619696, + "loss": 2.8732, + "step": 43819 + }, + { + "epoch": 2.15, + "grad_norm": 0.7100608944892883, + "learning_rate": 0.00011181175052621568, + "loss": 3.012, + "step": 43820 + }, + { + "epoch": 2.15, + "grad_norm": 0.6832837462425232, + "learning_rate": 0.00011179976049188609, + "loss": 2.7346, + "step": 43821 + }, + { + "epoch": 2.15, + "grad_norm": 0.7172902226448059, + "learning_rate": 0.00011178777095323966, + "loss": 3.0578, + "step": 43822 + }, + { + "epoch": 2.15, + "grad_norm": 0.7123684883117676, + "learning_rate": 0.00011177578191030807, + "loss": 3.0289, + "step": 43823 + }, + { + "epoch": 2.15, + "grad_norm": 0.7194732427597046, + "learning_rate": 0.00011176379336312279, + "loss": 2.8659, + "step": 43824 + }, + { + "epoch": 2.15, + "grad_norm": 0.7032181620597839, + "learning_rate": 0.00011175180531171553, + "loss": 2.9368, + "step": 43825 + }, + { + "epoch": 2.15, + "grad_norm": 0.7210255265235901, + "learning_rate": 0.0001117398177561178, + "loss": 2.8385, + "step": 43826 + }, + { + "epoch": 2.15, + "grad_norm": 0.7119678258895874, + "learning_rate": 0.00011172783069636106, + "loss": 2.687, + "step": 43827 + }, + { + "epoch": 2.15, + "grad_norm": 0.6808481812477112, + "learning_rate": 0.00011171584413247706, + "loss": 2.9508, + "step": 43828 + }, + { + "epoch": 2.15, + "grad_norm": 0.6783161759376526, + "learning_rate": 0.00011170385806449719, + "loss": 3.0674, + "step": 43829 + }, + { + "epoch": 2.15, + "grad_norm": 0.7140722274780273, + "learning_rate": 0.00011169187249245313, + "loss": 2.7876, + "step": 43830 + }, + { + "epoch": 2.15, + "grad_norm": 0.6937413215637207, + "learning_rate": 0.00011167988741637652, + "loss": 3.2024, + "step": 43831 + }, + { + "epoch": 2.15, + "grad_norm": 0.706088125705719, + "learning_rate": 0.00011166790283629889, + "loss": 2.8087, + "step": 43832 + }, + { + "epoch": 2.15, + "grad_norm": 0.7085302472114563, + "learning_rate": 0.0001116559187522517, + "loss": 2.7154, + "step": 43833 + }, + { + "epoch": 2.15, + "grad_norm": 0.6916400790214539, + "learning_rate": 0.0001116439351642665, + "loss": 2.9563, + "step": 43834 + }, + { + "epoch": 2.15, + "grad_norm": 0.6995882987976074, + "learning_rate": 0.00011163195207237493, + "loss": 2.7844, + "step": 43835 + }, + { + "epoch": 2.15, + "grad_norm": 0.7213637232780457, + "learning_rate": 0.00011161996947660864, + "loss": 3.0642, + "step": 43836 + }, + { + "epoch": 2.15, + "grad_norm": 0.6776930689811707, + "learning_rate": 0.000111607987376999, + "loss": 3.0639, + "step": 43837 + }, + { + "epoch": 2.15, + "grad_norm": 0.6928814649581909, + "learning_rate": 0.00011159600577357778, + "loss": 2.9717, + "step": 43838 + }, + { + "epoch": 2.15, + "grad_norm": 0.6714714169502258, + "learning_rate": 0.0001115840246663764, + "loss": 2.9155, + "step": 43839 + }, + { + "epoch": 2.15, + "grad_norm": 0.6734461784362793, + "learning_rate": 0.00011157204405542631, + "loss": 3.2625, + "step": 43840 + }, + { + "epoch": 2.15, + "grad_norm": 0.7229616641998291, + "learning_rate": 0.00011156006394075935, + "loss": 2.9042, + "step": 43841 + }, + { + "epoch": 2.15, + "grad_norm": 0.7451503276824951, + "learning_rate": 0.00011154808432240677, + "loss": 2.969, + "step": 43842 + }, + { + "epoch": 2.15, + "grad_norm": 0.6939261555671692, + "learning_rate": 0.00011153610520040039, + "loss": 2.8367, + "step": 43843 + }, + { + "epoch": 2.15, + "grad_norm": 0.7914009690284729, + "learning_rate": 0.00011152412657477155, + "loss": 2.8234, + "step": 43844 + }, + { + "epoch": 2.15, + "grad_norm": 0.7722774744033813, + "learning_rate": 0.00011151214844555186, + "loss": 2.8591, + "step": 43845 + }, + { + "epoch": 2.15, + "grad_norm": 1.1138639450073242, + "learning_rate": 0.00011150017081277302, + "loss": 3.0576, + "step": 43846 + }, + { + "epoch": 2.15, + "grad_norm": 0.7328840494155884, + "learning_rate": 0.00011148819367646643, + "loss": 2.8833, + "step": 43847 + }, + { + "epoch": 2.15, + "grad_norm": 0.6928589344024658, + "learning_rate": 0.00011147621703666368, + "loss": 2.7493, + "step": 43848 + }, + { + "epoch": 2.15, + "grad_norm": 0.7033262848854065, + "learning_rate": 0.00011146424089339616, + "loss": 2.7993, + "step": 43849 + }, + { + "epoch": 2.15, + "grad_norm": 0.7417920827865601, + "learning_rate": 0.0001114522652466956, + "loss": 3.0088, + "step": 43850 + }, + { + "epoch": 2.15, + "grad_norm": 0.7107601761817932, + "learning_rate": 0.00011144029009659352, + "loss": 2.8515, + "step": 43851 + }, + { + "epoch": 2.15, + "grad_norm": 0.6659914255142212, + "learning_rate": 0.00011142831544312139, + "loss": 2.9079, + "step": 43852 + }, + { + "epoch": 2.15, + "grad_norm": 0.6738490462303162, + "learning_rate": 0.00011141634128631082, + "loss": 2.926, + "step": 43853 + }, + { + "epoch": 2.15, + "grad_norm": 0.708010733127594, + "learning_rate": 0.00011140436762619336, + "loss": 2.9225, + "step": 43854 + }, + { + "epoch": 2.15, + "grad_norm": 0.7281856536865234, + "learning_rate": 0.00011139239446280039, + "loss": 3.0498, + "step": 43855 + }, + { + "epoch": 2.15, + "grad_norm": 0.7190080881118774, + "learning_rate": 0.00011138042179616364, + "loss": 2.6951, + "step": 43856 + }, + { + "epoch": 2.15, + "grad_norm": 0.7010225057601929, + "learning_rate": 0.00011136844962631449, + "loss": 2.8787, + "step": 43857 + }, + { + "epoch": 2.15, + "grad_norm": 0.7194762229919434, + "learning_rate": 0.00011135647795328461, + "loss": 2.806, + "step": 43858 + }, + { + "epoch": 2.15, + "grad_norm": 0.7051863670349121, + "learning_rate": 0.0001113445067771054, + "loss": 2.9298, + "step": 43859 + }, + { + "epoch": 2.15, + "grad_norm": 0.7123686671257019, + "learning_rate": 0.00011133253609780841, + "loss": 2.8497, + "step": 43860 + }, + { + "epoch": 2.15, + "grad_norm": 0.7194857597351074, + "learning_rate": 0.00011132056591542535, + "loss": 2.9762, + "step": 43861 + }, + { + "epoch": 2.15, + "grad_norm": 0.705207884311676, + "learning_rate": 0.00011130859622998755, + "loss": 2.7528, + "step": 43862 + }, + { + "epoch": 2.15, + "grad_norm": 0.6921209692955017, + "learning_rate": 0.00011129662704152663, + "loss": 2.9349, + "step": 43863 + }, + { + "epoch": 2.15, + "grad_norm": 0.6830700635910034, + "learning_rate": 0.00011128465835007397, + "loss": 2.8637, + "step": 43864 + }, + { + "epoch": 2.15, + "grad_norm": 0.7572240829467773, + "learning_rate": 0.00011127269015566133, + "loss": 3.0668, + "step": 43865 + }, + { + "epoch": 2.15, + "grad_norm": 0.6755383014678955, + "learning_rate": 0.00011126072245831996, + "loss": 2.814, + "step": 43866 + }, + { + "epoch": 2.15, + "grad_norm": 0.7219648361206055, + "learning_rate": 0.00011124875525808156, + "loss": 2.7501, + "step": 43867 + }, + { + "epoch": 2.15, + "grad_norm": 0.7269349098205566, + "learning_rate": 0.00011123678855497768, + "loss": 2.7667, + "step": 43868 + }, + { + "epoch": 2.15, + "grad_norm": 0.7479811310768127, + "learning_rate": 0.0001112248223490398, + "loss": 2.907, + "step": 43869 + }, + { + "epoch": 2.15, + "grad_norm": 0.7241057753562927, + "learning_rate": 0.00011121285664029938, + "loss": 3.0555, + "step": 43870 + }, + { + "epoch": 2.15, + "grad_norm": 0.6962974071502686, + "learning_rate": 0.00011120089142878789, + "loss": 2.985, + "step": 43871 + }, + { + "epoch": 2.15, + "grad_norm": 0.7270846366882324, + "learning_rate": 0.0001111889267145369, + "loss": 2.7828, + "step": 43872 + }, + { + "epoch": 2.15, + "grad_norm": 0.7334160208702087, + "learning_rate": 0.00011117696249757801, + "loss": 2.902, + "step": 43873 + }, + { + "epoch": 2.15, + "grad_norm": 0.7237550616264343, + "learning_rate": 0.00011116499877794262, + "loss": 3.0554, + "step": 43874 + }, + { + "epoch": 2.15, + "grad_norm": 0.7188262343406677, + "learning_rate": 0.00011115303555566233, + "loss": 2.7096, + "step": 43875 + }, + { + "epoch": 2.15, + "grad_norm": 0.7280434370040894, + "learning_rate": 0.00011114107283076862, + "loss": 2.7683, + "step": 43876 + }, + { + "epoch": 2.15, + "grad_norm": 0.6606218218803406, + "learning_rate": 0.0001111291106032929, + "loss": 2.9093, + "step": 43877 + }, + { + "epoch": 2.15, + "grad_norm": 0.7308887839317322, + "learning_rate": 0.00011111714887326684, + "loss": 2.7953, + "step": 43878 + }, + { + "epoch": 2.15, + "grad_norm": 0.7169768214225769, + "learning_rate": 0.00011110518764072174, + "loss": 2.6644, + "step": 43879 + }, + { + "epoch": 2.15, + "grad_norm": 0.7009234428405762, + "learning_rate": 0.00011109322690568934, + "loss": 2.9879, + "step": 43880 + }, + { + "epoch": 2.15, + "grad_norm": 0.7171613574028015, + "learning_rate": 0.00011108126666820094, + "loss": 3.0013, + "step": 43881 + }, + { + "epoch": 2.15, + "grad_norm": 0.7147977352142334, + "learning_rate": 0.0001110693069282881, + "loss": 2.7512, + "step": 43882 + }, + { + "epoch": 2.15, + "grad_norm": 0.7143003344535828, + "learning_rate": 0.00011105734768598244, + "loss": 2.9142, + "step": 43883 + }, + { + "epoch": 2.15, + "grad_norm": 0.6658389568328857, + "learning_rate": 0.00011104538894131537, + "loss": 2.8317, + "step": 43884 + }, + { + "epoch": 2.15, + "grad_norm": 0.7165668606758118, + "learning_rate": 0.00011103343069431838, + "loss": 2.9532, + "step": 43885 + }, + { + "epoch": 2.15, + "grad_norm": 0.7112611532211304, + "learning_rate": 0.00011102147294502287, + "loss": 3.2125, + "step": 43886 + }, + { + "epoch": 2.15, + "grad_norm": 0.6863123178482056, + "learning_rate": 0.00011100951569346045, + "loss": 2.9475, + "step": 43887 + }, + { + "epoch": 2.15, + "grad_norm": 0.7303819060325623, + "learning_rate": 0.00011099755893966266, + "loss": 2.982, + "step": 43888 + }, + { + "epoch": 2.15, + "grad_norm": 0.7144728899002075, + "learning_rate": 0.00011098560268366087, + "loss": 2.8729, + "step": 43889 + }, + { + "epoch": 2.15, + "grad_norm": 0.6913188099861145, + "learning_rate": 0.00011097364692548669, + "loss": 2.8669, + "step": 43890 + }, + { + "epoch": 2.15, + "grad_norm": 0.7074322700500488, + "learning_rate": 0.00011096169166517153, + "loss": 3.1753, + "step": 43891 + }, + { + "epoch": 2.15, + "grad_norm": 0.6860098242759705, + "learning_rate": 0.00011094973690274684, + "loss": 2.9712, + "step": 43892 + }, + { + "epoch": 2.15, + "grad_norm": 0.6915674209594727, + "learning_rate": 0.00011093778263824422, + "loss": 2.9525, + "step": 43893 + }, + { + "epoch": 2.15, + "grad_norm": 0.7016196846961975, + "learning_rate": 0.00011092582887169501, + "loss": 2.9581, + "step": 43894 + }, + { + "epoch": 2.15, + "grad_norm": 0.6921491622924805, + "learning_rate": 0.00011091387560313088, + "loss": 2.9924, + "step": 43895 + }, + { + "epoch": 2.15, + "grad_norm": 0.7028470635414124, + "learning_rate": 0.00011090192283258312, + "loss": 2.7785, + "step": 43896 + }, + { + "epoch": 2.15, + "grad_norm": 0.6804261803627014, + "learning_rate": 0.0001108899705600834, + "loss": 2.8979, + "step": 43897 + }, + { + "epoch": 2.15, + "grad_norm": 0.7549912929534912, + "learning_rate": 0.00011087801878566308, + "loss": 3.1259, + "step": 43898 + }, + { + "epoch": 2.15, + "grad_norm": 0.6933797001838684, + "learning_rate": 0.00011086606750935358, + "loss": 2.9454, + "step": 43899 + }, + { + "epoch": 2.15, + "grad_norm": 0.6788550615310669, + "learning_rate": 0.00011085411673118654, + "loss": 2.9116, + "step": 43900 + }, + { + "epoch": 2.15, + "grad_norm": 0.7214063405990601, + "learning_rate": 0.00011084216645119328, + "loss": 2.8206, + "step": 43901 + }, + { + "epoch": 2.15, + "grad_norm": 0.6881868839263916, + "learning_rate": 0.00011083021666940533, + "loss": 2.8979, + "step": 43902 + }, + { + "epoch": 2.15, + "grad_norm": 0.7121565937995911, + "learning_rate": 0.00011081826738585428, + "loss": 2.8872, + "step": 43903 + }, + { + "epoch": 2.15, + "grad_norm": 0.6824859380722046, + "learning_rate": 0.00011080631860057144, + "loss": 2.7616, + "step": 43904 + }, + { + "epoch": 2.15, + "grad_norm": 0.7560449242591858, + "learning_rate": 0.00011079437031358844, + "loss": 3.0213, + "step": 43905 + }, + { + "epoch": 2.15, + "grad_norm": 0.7062668204307556, + "learning_rate": 0.00011078242252493663, + "loss": 3.0686, + "step": 43906 + }, + { + "epoch": 2.15, + "grad_norm": 0.6976759433746338, + "learning_rate": 0.00011077047523464738, + "loss": 2.8067, + "step": 43907 + }, + { + "epoch": 2.15, + "grad_norm": 0.7655328512191772, + "learning_rate": 0.00011075852844275242, + "loss": 2.8811, + "step": 43908 + }, + { + "epoch": 2.15, + "grad_norm": 0.6743826270103455, + "learning_rate": 0.00011074658214928298, + "loss": 2.7917, + "step": 43909 + }, + { + "epoch": 2.15, + "grad_norm": 0.6853476762771606, + "learning_rate": 0.00011073463635427068, + "loss": 3.049, + "step": 43910 + }, + { + "epoch": 2.15, + "grad_norm": 0.6936709880828857, + "learning_rate": 0.00011072269105774686, + "loss": 3.0572, + "step": 43911 + }, + { + "epoch": 2.15, + "grad_norm": 0.7399135231971741, + "learning_rate": 0.00011071074625974316, + "loss": 2.7455, + "step": 43912 + }, + { + "epoch": 2.15, + "grad_norm": 0.6544350385665894, + "learning_rate": 0.00011069880196029089, + "loss": 2.8913, + "step": 43913 + }, + { + "epoch": 2.15, + "grad_norm": 0.7409467101097107, + "learning_rate": 0.00011068685815942146, + "loss": 2.8284, + "step": 43914 + }, + { + "epoch": 2.15, + "grad_norm": 0.7109572291374207, + "learning_rate": 0.00011067491485716651, + "loss": 2.8065, + "step": 43915 + }, + { + "epoch": 2.15, + "grad_norm": 0.6747780442237854, + "learning_rate": 0.00011066297205355728, + "loss": 2.9514, + "step": 43916 + }, + { + "epoch": 2.15, + "grad_norm": 0.7135562896728516, + "learning_rate": 0.00011065102974862537, + "loss": 2.9687, + "step": 43917 + }, + { + "epoch": 2.15, + "grad_norm": 0.6812368035316467, + "learning_rate": 0.00011063908794240228, + "loss": 2.9524, + "step": 43918 + }, + { + "epoch": 2.15, + "grad_norm": 0.8200016617774963, + "learning_rate": 0.00011062714663491941, + "loss": 2.8971, + "step": 43919 + }, + { + "epoch": 2.15, + "grad_norm": 0.6595801115036011, + "learning_rate": 0.00011061520582620818, + "loss": 3.0807, + "step": 43920 + }, + { + "epoch": 2.15, + "grad_norm": 0.6876057982444763, + "learning_rate": 0.00011060326551629993, + "loss": 2.9015, + "step": 43921 + }, + { + "epoch": 2.15, + "grad_norm": 0.6769374012947083, + "learning_rate": 0.00011059132570522627, + "loss": 2.9048, + "step": 43922 + }, + { + "epoch": 2.15, + "grad_norm": 0.7245048880577087, + "learning_rate": 0.00011057938639301868, + "loss": 3.0612, + "step": 43923 + }, + { + "epoch": 2.15, + "grad_norm": 0.6851451396942139, + "learning_rate": 0.00011056744757970844, + "loss": 2.8888, + "step": 43924 + }, + { + "epoch": 2.15, + "grad_norm": 0.7699195742607117, + "learning_rate": 0.00011055550926532718, + "loss": 2.7047, + "step": 43925 + }, + { + "epoch": 2.15, + "grad_norm": 0.6714342832565308, + "learning_rate": 0.00011054357144990615, + "loss": 3.0598, + "step": 43926 + }, + { + "epoch": 2.15, + "grad_norm": 0.7293652892112732, + "learning_rate": 0.00011053163413347701, + "loss": 2.7219, + "step": 43927 + }, + { + "epoch": 2.15, + "grad_norm": 0.7357663512229919, + "learning_rate": 0.00011051969731607106, + "loss": 3.0263, + "step": 43928 + }, + { + "epoch": 2.15, + "grad_norm": 0.6910032033920288, + "learning_rate": 0.00011050776099771966, + "loss": 2.9287, + "step": 43929 + }, + { + "epoch": 2.15, + "grad_norm": 0.6952933669090271, + "learning_rate": 0.00011049582517845445, + "loss": 2.846, + "step": 43930 + }, + { + "epoch": 2.15, + "grad_norm": 0.742231547832489, + "learning_rate": 0.00011048388985830668, + "loss": 2.9032, + "step": 43931 + }, + { + "epoch": 2.15, + "grad_norm": 0.7026122212409973, + "learning_rate": 0.00011047195503730786, + "loss": 2.7614, + "step": 43932 + }, + { + "epoch": 2.15, + "grad_norm": 0.7984775304794312, + "learning_rate": 0.00011046002071548954, + "loss": 2.631, + "step": 43933 + }, + { + "epoch": 2.15, + "grad_norm": 0.6924632787704468, + "learning_rate": 0.00011044808689288306, + "loss": 2.9778, + "step": 43934 + }, + { + "epoch": 2.15, + "grad_norm": 0.7308608889579773, + "learning_rate": 0.00011043615356951982, + "loss": 2.9257, + "step": 43935 + }, + { + "epoch": 2.15, + "grad_norm": 0.719944953918457, + "learning_rate": 0.00011042422074543118, + "loss": 2.8804, + "step": 43936 + }, + { + "epoch": 2.15, + "grad_norm": 0.7189018130302429, + "learning_rate": 0.00011041228842064864, + "loss": 2.9876, + "step": 43937 + }, + { + "epoch": 2.15, + "grad_norm": 0.7342965006828308, + "learning_rate": 0.00011040035659520375, + "loss": 3.0549, + "step": 43938 + }, + { + "epoch": 2.15, + "grad_norm": 0.7213689684867859, + "learning_rate": 0.00011038842526912776, + "loss": 2.8142, + "step": 43939 + }, + { + "epoch": 2.15, + "grad_norm": 0.7366634011268616, + "learning_rate": 0.00011037649444245225, + "loss": 2.757, + "step": 43940 + }, + { + "epoch": 2.15, + "grad_norm": 0.7220673561096191, + "learning_rate": 0.00011036456411520853, + "loss": 2.5639, + "step": 43941 + }, + { + "epoch": 2.15, + "grad_norm": 0.705657422542572, + "learning_rate": 0.00011035263428742799, + "loss": 2.9537, + "step": 43942 + }, + { + "epoch": 2.15, + "grad_norm": 0.7454379200935364, + "learning_rate": 0.00011034070495914219, + "loss": 2.7032, + "step": 43943 + }, + { + "epoch": 2.15, + "grad_norm": 0.7317204475402832, + "learning_rate": 0.00011032877613038238, + "loss": 2.9766, + "step": 43944 + }, + { + "epoch": 2.15, + "grad_norm": 0.7271674871444702, + "learning_rate": 0.00011031684780118016, + "loss": 2.9546, + "step": 43945 + }, + { + "epoch": 2.15, + "grad_norm": 0.6818316578865051, + "learning_rate": 0.00011030491997156679, + "loss": 2.8576, + "step": 43946 + }, + { + "epoch": 2.15, + "grad_norm": 0.6842644810676575, + "learning_rate": 0.00011029299264157382, + "loss": 3.1527, + "step": 43947 + }, + { + "epoch": 2.15, + "grad_norm": 0.6735297441482544, + "learning_rate": 0.00011028106581123253, + "loss": 3.0781, + "step": 43948 + }, + { + "epoch": 2.15, + "grad_norm": 0.745730996131897, + "learning_rate": 0.00011026913948057448, + "loss": 2.9958, + "step": 43949 + }, + { + "epoch": 2.15, + "grad_norm": 0.7029087543487549, + "learning_rate": 0.000110257213649631, + "loss": 2.845, + "step": 43950 + }, + { + "epoch": 2.15, + "grad_norm": 0.6922839879989624, + "learning_rate": 0.0001102452883184334, + "loss": 2.9185, + "step": 43951 + }, + { + "epoch": 2.15, + "grad_norm": 0.7484878301620483, + "learning_rate": 0.00011023336348701332, + "loss": 3.0122, + "step": 43952 + }, + { + "epoch": 2.15, + "grad_norm": 0.7015834450721741, + "learning_rate": 0.00011022143915540189, + "loss": 2.9061, + "step": 43953 + }, + { + "epoch": 2.15, + "grad_norm": 0.7120810151100159, + "learning_rate": 0.0001102095153236307, + "loss": 2.984, + "step": 43954 + }, + { + "epoch": 2.15, + "grad_norm": 0.730353832244873, + "learning_rate": 0.00011019759199173123, + "loss": 2.9169, + "step": 43955 + }, + { + "epoch": 2.15, + "grad_norm": 0.7234374284744263, + "learning_rate": 0.00011018566915973477, + "loss": 3.1308, + "step": 43956 + }, + { + "epoch": 2.15, + "grad_norm": 0.6735356450080872, + "learning_rate": 0.0001101737468276727, + "loss": 2.9635, + "step": 43957 + }, + { + "epoch": 2.15, + "grad_norm": 0.6654012799263, + "learning_rate": 0.00011016182499557638, + "loss": 2.8069, + "step": 43958 + }, + { + "epoch": 2.15, + "grad_norm": 0.6770622134208679, + "learning_rate": 0.00011014990366347725, + "loss": 2.9357, + "step": 43959 + }, + { + "epoch": 2.15, + "grad_norm": 0.7623130083084106, + "learning_rate": 0.00011013798283140686, + "loss": 3.0011, + "step": 43960 + }, + { + "epoch": 2.15, + "grad_norm": 0.7139075398445129, + "learning_rate": 0.00011012606249939635, + "loss": 3.1044, + "step": 43961 + }, + { + "epoch": 2.15, + "grad_norm": 0.7361212372779846, + "learning_rate": 0.00011011414266747738, + "loss": 2.9142, + "step": 43962 + }, + { + "epoch": 2.15, + "grad_norm": 0.6977233290672302, + "learning_rate": 0.00011010222333568108, + "loss": 3.1171, + "step": 43963 + }, + { + "epoch": 2.15, + "grad_norm": 0.6813346743583679, + "learning_rate": 0.0001100903045040391, + "loss": 3.0365, + "step": 43964 + }, + { + "epoch": 2.15, + "grad_norm": 0.7580142021179199, + "learning_rate": 0.00011007838617258272, + "loss": 2.8071, + "step": 43965 + }, + { + "epoch": 2.15, + "grad_norm": 0.6768600940704346, + "learning_rate": 0.0001100664683413432, + "loss": 2.8438, + "step": 43966 + }, + { + "epoch": 2.15, + "grad_norm": 0.6694023013114929, + "learning_rate": 0.00011005455101035216, + "loss": 2.9374, + "step": 43967 + }, + { + "epoch": 2.15, + "grad_norm": 0.6759893894195557, + "learning_rate": 0.00011004263417964081, + "loss": 2.9976, + "step": 43968 + }, + { + "epoch": 2.15, + "grad_norm": 0.6973206996917725, + "learning_rate": 0.00011003071784924054, + "loss": 3.0073, + "step": 43969 + }, + { + "epoch": 2.15, + "grad_norm": 0.7266998291015625, + "learning_rate": 0.00011001880201918295, + "loss": 2.9469, + "step": 43970 + }, + { + "epoch": 2.15, + "grad_norm": 0.6822577118873596, + "learning_rate": 0.00011000688668949927, + "loss": 2.8839, + "step": 43971 + }, + { + "epoch": 2.15, + "grad_norm": 0.7668481469154358, + "learning_rate": 0.00010999497186022086, + "loss": 3.0304, + "step": 43972 + }, + { + "epoch": 2.16, + "grad_norm": 0.7106690406799316, + "learning_rate": 0.00010998305753137901, + "loss": 2.7812, + "step": 43973 + }, + { + "epoch": 2.16, + "grad_norm": 0.6828577518463135, + "learning_rate": 0.00010997114370300523, + "loss": 2.889, + "step": 43974 + }, + { + "epoch": 2.16, + "grad_norm": 0.6851549744606018, + "learning_rate": 0.00010995923037513101, + "loss": 2.9489, + "step": 43975 + }, + { + "epoch": 2.16, + "grad_norm": 0.6917469501495361, + "learning_rate": 0.00010994731754778749, + "loss": 3.2144, + "step": 43976 + }, + { + "epoch": 2.16, + "grad_norm": 0.6860583424568176, + "learning_rate": 0.00010993540522100626, + "loss": 2.9111, + "step": 43977 + }, + { + "epoch": 2.16, + "grad_norm": 0.6750058531761169, + "learning_rate": 0.0001099234933948186, + "loss": 2.7888, + "step": 43978 + }, + { + "epoch": 2.16, + "grad_norm": 0.6400216817855835, + "learning_rate": 0.00010991158206925578, + "loss": 3.1329, + "step": 43979 + }, + { + "epoch": 2.16, + "grad_norm": 0.6475121378898621, + "learning_rate": 0.00010989967124434934, + "loss": 2.9708, + "step": 43980 + }, + { + "epoch": 2.16, + "grad_norm": 0.6716324687004089, + "learning_rate": 0.0001098877609201305, + "loss": 2.8048, + "step": 43981 + }, + { + "epoch": 2.16, + "grad_norm": 0.7722020745277405, + "learning_rate": 0.00010987585109663082, + "loss": 2.9072, + "step": 43982 + }, + { + "epoch": 2.16, + "grad_norm": 0.750815212726593, + "learning_rate": 0.00010986394177388144, + "loss": 2.8756, + "step": 43983 + }, + { + "epoch": 2.16, + "grad_norm": 0.725193977355957, + "learning_rate": 0.00010985203295191385, + "loss": 2.9399, + "step": 43984 + }, + { + "epoch": 2.16, + "grad_norm": 0.7670903205871582, + "learning_rate": 0.00010984012463075953, + "loss": 2.6659, + "step": 43985 + }, + { + "epoch": 2.16, + "grad_norm": 0.6852924823760986, + "learning_rate": 0.0001098282168104497, + "loss": 2.96, + "step": 43986 + }, + { + "epoch": 2.16, + "grad_norm": 0.7253130078315735, + "learning_rate": 0.00010981630949101578, + "loss": 2.5994, + "step": 43987 + }, + { + "epoch": 2.16, + "grad_norm": 0.7172092795372009, + "learning_rate": 0.00010980440267248896, + "loss": 3.1027, + "step": 43988 + }, + { + "epoch": 2.16, + "grad_norm": 0.7548342347145081, + "learning_rate": 0.00010979249635490075, + "loss": 2.9589, + "step": 43989 + }, + { + "epoch": 2.16, + "grad_norm": 0.7267433404922485, + "learning_rate": 0.0001097805905382826, + "loss": 2.8408, + "step": 43990 + }, + { + "epoch": 2.16, + "grad_norm": 0.6913394927978516, + "learning_rate": 0.00010976868522266569, + "loss": 2.9435, + "step": 43991 + }, + { + "epoch": 2.16, + "grad_norm": 0.7468941807746887, + "learning_rate": 0.00010975678040808153, + "loss": 2.8856, + "step": 43992 + }, + { + "epoch": 2.16, + "grad_norm": 0.7527220249176025, + "learning_rate": 0.00010974487609456138, + "loss": 2.7722, + "step": 43993 + }, + { + "epoch": 2.16, + "grad_norm": 0.8277668952941895, + "learning_rate": 0.00010973297228213654, + "loss": 2.9391, + "step": 43994 + }, + { + "epoch": 2.16, + "grad_norm": 0.7322959899902344, + "learning_rate": 0.00010972106897083851, + "loss": 2.8405, + "step": 43995 + }, + { + "epoch": 2.16, + "grad_norm": 0.6588741540908813, + "learning_rate": 0.0001097091661606985, + "loss": 2.8649, + "step": 43996 + }, + { + "epoch": 2.16, + "grad_norm": 0.7032271027565002, + "learning_rate": 0.00010969726385174799, + "loss": 2.939, + "step": 43997 + }, + { + "epoch": 2.16, + "grad_norm": 0.666032075881958, + "learning_rate": 0.00010968536204401817, + "loss": 3.0517, + "step": 43998 + }, + { + "epoch": 2.16, + "grad_norm": 0.6766948699951172, + "learning_rate": 0.00010967346073754061, + "loss": 2.9951, + "step": 43999 + }, + { + "epoch": 2.16, + "grad_norm": 0.6832386255264282, + "learning_rate": 0.0001096615599323465, + "loss": 2.7958, + "step": 44000 + }, + { + "epoch": 2.16, + "grad_norm": 0.6713371276855469, + "learning_rate": 0.00010964965962846708, + "loss": 2.9533, + "step": 44001 + }, + { + "epoch": 2.16, + "grad_norm": 0.7049140930175781, + "learning_rate": 0.00010963775982593396, + "loss": 3.0323, + "step": 44002 + }, + { + "epoch": 2.16, + "grad_norm": 0.7148593068122864, + "learning_rate": 0.00010962586052477825, + "loss": 3.0128, + "step": 44003 + }, + { + "epoch": 2.16, + "grad_norm": 0.7361739277839661, + "learning_rate": 0.00010961396172503137, + "loss": 2.8242, + "step": 44004 + }, + { + "epoch": 2.16, + "grad_norm": 0.6860854029655457, + "learning_rate": 0.00010960206342672477, + "loss": 2.8884, + "step": 44005 + }, + { + "epoch": 2.16, + "grad_norm": 0.8418459892272949, + "learning_rate": 0.00010959016562988962, + "loss": 3.0021, + "step": 44006 + }, + { + "epoch": 2.16, + "grad_norm": 0.728326678276062, + "learning_rate": 0.00010957826833455737, + "loss": 2.8175, + "step": 44007 + }, + { + "epoch": 2.16, + "grad_norm": 0.7612557411193848, + "learning_rate": 0.00010956637154075938, + "loss": 2.8144, + "step": 44008 + }, + { + "epoch": 2.16, + "grad_norm": 0.7064254283905029, + "learning_rate": 0.00010955447524852677, + "loss": 2.8047, + "step": 44009 + }, + { + "epoch": 2.16, + "grad_norm": 0.694352924823761, + "learning_rate": 0.00010954257945789114, + "loss": 3.0043, + "step": 44010 + }, + { + "epoch": 2.16, + "grad_norm": 0.6741226315498352, + "learning_rate": 0.00010953068416888359, + "loss": 2.9749, + "step": 44011 + }, + { + "epoch": 2.16, + "grad_norm": 0.6707633137702942, + "learning_rate": 0.00010951878938153567, + "loss": 3.0853, + "step": 44012 + }, + { + "epoch": 2.16, + "grad_norm": 0.7317514419555664, + "learning_rate": 0.00010950689509587851, + "loss": 2.8133, + "step": 44013 + }, + { + "epoch": 2.16, + "grad_norm": 0.6689852476119995, + "learning_rate": 0.00010949500131194364, + "loss": 2.7656, + "step": 44014 + }, + { + "epoch": 2.16, + "grad_norm": 0.7170653939247131, + "learning_rate": 0.00010948310802976224, + "loss": 2.8662, + "step": 44015 + }, + { + "epoch": 2.16, + "grad_norm": 0.6306023597717285, + "learning_rate": 0.0001094712152493656, + "loss": 2.8672, + "step": 44016 + }, + { + "epoch": 2.16, + "grad_norm": 0.6976908445358276, + "learning_rate": 0.00010945932297078516, + "loss": 3.2326, + "step": 44017 + }, + { + "epoch": 2.16, + "grad_norm": 0.6834633946418762, + "learning_rate": 0.00010944743119405214, + "loss": 2.9629, + "step": 44018 + }, + { + "epoch": 2.16, + "grad_norm": 0.666874885559082, + "learning_rate": 0.00010943553991919788, + "loss": 2.8406, + "step": 44019 + }, + { + "epoch": 2.16, + "grad_norm": 0.6879183650016785, + "learning_rate": 0.00010942364914625387, + "loss": 2.7367, + "step": 44020 + }, + { + "epoch": 2.16, + "grad_norm": 0.6888954043388367, + "learning_rate": 0.00010941175887525128, + "loss": 3.0009, + "step": 44021 + }, + { + "epoch": 2.16, + "grad_norm": 0.7139040231704712, + "learning_rate": 0.00010939986910622141, + "loss": 2.979, + "step": 44022 + }, + { + "epoch": 2.16, + "grad_norm": 0.7036026120185852, + "learning_rate": 0.00010938797983919553, + "loss": 2.9986, + "step": 44023 + }, + { + "epoch": 2.16, + "grad_norm": 0.7085335850715637, + "learning_rate": 0.00010937609107420512, + "loss": 2.8668, + "step": 44024 + }, + { + "epoch": 2.16, + "grad_norm": 0.7103490233421326, + "learning_rate": 0.0001093642028112813, + "loss": 2.8923, + "step": 44025 + }, + { + "epoch": 2.16, + "grad_norm": 0.6850371956825256, + "learning_rate": 0.00010935231505045547, + "loss": 2.8388, + "step": 44026 + }, + { + "epoch": 2.16, + "grad_norm": 0.6872725486755371, + "learning_rate": 0.00010934042779175905, + "loss": 3.0184, + "step": 44027 + }, + { + "epoch": 2.16, + "grad_norm": 0.6997257471084595, + "learning_rate": 0.00010932854103522318, + "loss": 3.055, + "step": 44028 + }, + { + "epoch": 2.16, + "grad_norm": 0.6553372144699097, + "learning_rate": 0.00010931665478087933, + "loss": 2.7495, + "step": 44029 + }, + { + "epoch": 2.16, + "grad_norm": 0.7009562253952026, + "learning_rate": 0.0001093047690287587, + "loss": 3.0277, + "step": 44030 + }, + { + "epoch": 2.16, + "grad_norm": 0.7105560302734375, + "learning_rate": 0.00010929288377889249, + "loss": 3.132, + "step": 44031 + }, + { + "epoch": 2.16, + "grad_norm": 0.6748614311218262, + "learning_rate": 0.00010928099903131225, + "loss": 3.0997, + "step": 44032 + }, + { + "epoch": 2.16, + "grad_norm": 0.7473457455635071, + "learning_rate": 0.00010926911478604903, + "loss": 2.9778, + "step": 44033 + }, + { + "epoch": 2.16, + "grad_norm": 0.6980936527252197, + "learning_rate": 0.00010925723104313439, + "loss": 3.0761, + "step": 44034 + }, + { + "epoch": 2.16, + "grad_norm": 0.6893765330314636, + "learning_rate": 0.00010924534780259937, + "loss": 2.7932, + "step": 44035 + }, + { + "epoch": 2.16, + "grad_norm": 0.6904683113098145, + "learning_rate": 0.0001092334650644755, + "loss": 2.8992, + "step": 44036 + }, + { + "epoch": 2.16, + "grad_norm": 0.7253763675689697, + "learning_rate": 0.00010922158282879398, + "loss": 2.9312, + "step": 44037 + }, + { + "epoch": 2.16, + "grad_norm": 0.6974086165428162, + "learning_rate": 0.00010920970109558597, + "loss": 2.9157, + "step": 44038 + }, + { + "epoch": 2.16, + "grad_norm": 0.7278842329978943, + "learning_rate": 0.00010919781986488302, + "loss": 2.8189, + "step": 44039 + }, + { + "epoch": 2.16, + "grad_norm": 0.73302161693573, + "learning_rate": 0.00010918593913671615, + "loss": 2.7879, + "step": 44040 + }, + { + "epoch": 2.16, + "grad_norm": 0.7178779244422913, + "learning_rate": 0.00010917405891111684, + "loss": 2.8211, + "step": 44041 + }, + { + "epoch": 2.16, + "grad_norm": 0.7002609372138977, + "learning_rate": 0.00010916217918811642, + "loss": 2.9447, + "step": 44042 + }, + { + "epoch": 2.16, + "grad_norm": 0.7161980867385864, + "learning_rate": 0.00010915029996774607, + "loss": 2.8354, + "step": 44043 + }, + { + "epoch": 2.16, + "grad_norm": 0.7740674614906311, + "learning_rate": 0.0001091384212500371, + "loss": 3.0059, + "step": 44044 + }, + { + "epoch": 2.16, + "grad_norm": 0.7472088932991028, + "learning_rate": 0.00010912654303502072, + "loss": 2.5457, + "step": 44045 + }, + { + "epoch": 2.16, + "grad_norm": 0.6775591373443604, + "learning_rate": 0.00010911466532272827, + "loss": 2.7964, + "step": 44046 + }, + { + "epoch": 2.16, + "grad_norm": 0.6801954507827759, + "learning_rate": 0.00010910278811319119, + "loss": 2.9941, + "step": 44047 + }, + { + "epoch": 2.16, + "grad_norm": 0.7444169521331787, + "learning_rate": 0.00010909091140644049, + "loss": 3.0355, + "step": 44048 + }, + { + "epoch": 2.16, + "grad_norm": 0.7006654739379883, + "learning_rate": 0.0001090790352025077, + "loss": 2.8252, + "step": 44049 + }, + { + "epoch": 2.16, + "grad_norm": 0.7231435775756836, + "learning_rate": 0.00010906715950142389, + "loss": 2.8966, + "step": 44050 + }, + { + "epoch": 2.16, + "grad_norm": 0.6741910576820374, + "learning_rate": 0.00010905528430322054, + "loss": 2.9011, + "step": 44051 + }, + { + "epoch": 2.16, + "grad_norm": 0.6673889756202698, + "learning_rate": 0.0001090434096079288, + "loss": 2.8311, + "step": 44052 + }, + { + "epoch": 2.16, + "grad_norm": 0.7044496536254883, + "learning_rate": 0.00010903153541557988, + "loss": 2.8997, + "step": 44053 + }, + { + "epoch": 2.16, + "grad_norm": 0.7047674655914307, + "learning_rate": 0.00010901966172620527, + "loss": 2.9994, + "step": 44054 + }, + { + "epoch": 2.16, + "grad_norm": 0.7237603664398193, + "learning_rate": 0.00010900778853983599, + "loss": 2.6291, + "step": 44055 + }, + { + "epoch": 2.16, + "grad_norm": 0.8270739316940308, + "learning_rate": 0.00010899591585650345, + "loss": 2.7419, + "step": 44056 + }, + { + "epoch": 2.16, + "grad_norm": 0.7122185230255127, + "learning_rate": 0.000108984043676239, + "loss": 3.0659, + "step": 44057 + }, + { + "epoch": 2.16, + "grad_norm": 0.6983858346939087, + "learning_rate": 0.00010897217199907381, + "loss": 3.0083, + "step": 44058 + }, + { + "epoch": 2.16, + "grad_norm": 0.7358377575874329, + "learning_rate": 0.00010896030082503915, + "loss": 2.6745, + "step": 44059 + }, + { + "epoch": 2.16, + "grad_norm": 0.7250675559043884, + "learning_rate": 0.00010894843015416622, + "loss": 2.9184, + "step": 44060 + }, + { + "epoch": 2.16, + "grad_norm": 0.7548567056655884, + "learning_rate": 0.00010893655998648634, + "loss": 3.1155, + "step": 44061 + }, + { + "epoch": 2.16, + "grad_norm": 0.7145891189575195, + "learning_rate": 0.00010892469032203088, + "loss": 2.9655, + "step": 44062 + }, + { + "epoch": 2.16, + "grad_norm": 0.7194769382476807, + "learning_rate": 0.00010891282116083093, + "loss": 2.9013, + "step": 44063 + }, + { + "epoch": 2.16, + "grad_norm": 0.6655535101890564, + "learning_rate": 0.00010890095250291792, + "loss": 3.0525, + "step": 44064 + }, + { + "epoch": 2.16, + "grad_norm": 0.7517229318618774, + "learning_rate": 0.00010888908434832294, + "loss": 3.1682, + "step": 44065 + }, + { + "epoch": 2.16, + "grad_norm": 0.6540136337280273, + "learning_rate": 0.00010887721669707742, + "loss": 2.9677, + "step": 44066 + }, + { + "epoch": 2.16, + "grad_norm": 0.7223973274230957, + "learning_rate": 0.00010886534954921253, + "loss": 2.8192, + "step": 44067 + }, + { + "epoch": 2.16, + "grad_norm": 0.7570539116859436, + "learning_rate": 0.00010885348290475943, + "loss": 3.0616, + "step": 44068 + }, + { + "epoch": 2.16, + "grad_norm": 0.6754083037376404, + "learning_rate": 0.00010884161676374957, + "loss": 3.1191, + "step": 44069 + }, + { + "epoch": 2.16, + "grad_norm": 0.6810846328735352, + "learning_rate": 0.00010882975112621401, + "loss": 3.0448, + "step": 44070 + }, + { + "epoch": 2.16, + "grad_norm": 0.7092759609222412, + "learning_rate": 0.00010881788599218406, + "loss": 2.939, + "step": 44071 + }, + { + "epoch": 2.16, + "grad_norm": 0.6637600064277649, + "learning_rate": 0.00010880602136169116, + "loss": 2.9407, + "step": 44072 + }, + { + "epoch": 2.16, + "grad_norm": 0.7038546800613403, + "learning_rate": 0.00010879415723476636, + "loss": 3.0425, + "step": 44073 + }, + { + "epoch": 2.16, + "grad_norm": 0.7373028993606567, + "learning_rate": 0.00010878229361144098, + "loss": 3.0257, + "step": 44074 + }, + { + "epoch": 2.16, + "grad_norm": 0.6334795355796814, + "learning_rate": 0.00010877043049174608, + "loss": 2.9988, + "step": 44075 + }, + { + "epoch": 2.16, + "grad_norm": 0.716715395450592, + "learning_rate": 0.0001087585678757131, + "loss": 2.8645, + "step": 44076 + }, + { + "epoch": 2.16, + "grad_norm": 0.7330484986305237, + "learning_rate": 0.00010874670576337335, + "loss": 3.0347, + "step": 44077 + }, + { + "epoch": 2.16, + "grad_norm": 0.6826630234718323, + "learning_rate": 0.00010873484415475789, + "loss": 3.0281, + "step": 44078 + }, + { + "epoch": 2.16, + "grad_norm": 0.6680867671966553, + "learning_rate": 0.00010872298304989811, + "loss": 3.0422, + "step": 44079 + }, + { + "epoch": 2.16, + "grad_norm": 0.6701593995094299, + "learning_rate": 0.00010871112244882519, + "loss": 2.7085, + "step": 44080 + }, + { + "epoch": 2.16, + "grad_norm": 0.7055572271347046, + "learning_rate": 0.00010869926235157028, + "loss": 3.1402, + "step": 44081 + }, + { + "epoch": 2.16, + "grad_norm": 0.6829777359962463, + "learning_rate": 0.00010868740275816478, + "loss": 2.7673, + "step": 44082 + }, + { + "epoch": 2.16, + "grad_norm": 0.6810120344161987, + "learning_rate": 0.00010867554366863972, + "loss": 3.0536, + "step": 44083 + }, + { + "epoch": 2.16, + "grad_norm": 0.6956943869590759, + "learning_rate": 0.00010866368508302658, + "loss": 2.9712, + "step": 44084 + }, + { + "epoch": 2.16, + "grad_norm": 0.7180123329162598, + "learning_rate": 0.00010865182700135638, + "loss": 2.8563, + "step": 44085 + }, + { + "epoch": 2.16, + "grad_norm": 0.6888797283172607, + "learning_rate": 0.00010863996942366042, + "loss": 3.0218, + "step": 44086 + }, + { + "epoch": 2.16, + "grad_norm": 0.6845893263816833, + "learning_rate": 0.00010862811234997007, + "loss": 2.9983, + "step": 44087 + }, + { + "epoch": 2.16, + "grad_norm": 0.7172102928161621, + "learning_rate": 0.00010861625578031642, + "loss": 2.9244, + "step": 44088 + }, + { + "epoch": 2.16, + "grad_norm": 0.7032397985458374, + "learning_rate": 0.00010860439971473072, + "loss": 2.9089, + "step": 44089 + }, + { + "epoch": 2.16, + "grad_norm": 0.7030555605888367, + "learning_rate": 0.00010859254415324413, + "loss": 2.8213, + "step": 44090 + }, + { + "epoch": 2.16, + "grad_norm": 0.7154149413108826, + "learning_rate": 0.0001085806890958879, + "loss": 2.6665, + "step": 44091 + }, + { + "epoch": 2.16, + "grad_norm": 0.6783580780029297, + "learning_rate": 0.00010856883454269342, + "loss": 2.9993, + "step": 44092 + }, + { + "epoch": 2.16, + "grad_norm": 0.7258152365684509, + "learning_rate": 0.0001085569804936917, + "loss": 2.9013, + "step": 44093 + }, + { + "epoch": 2.16, + "grad_norm": 0.6536148190498352, + "learning_rate": 0.00010854512694891413, + "loss": 2.8165, + "step": 44094 + }, + { + "epoch": 2.16, + "grad_norm": 0.712685763835907, + "learning_rate": 0.00010853327390839184, + "loss": 2.9026, + "step": 44095 + }, + { + "epoch": 2.16, + "grad_norm": 0.6730918288230896, + "learning_rate": 0.00010852142137215597, + "loss": 2.9592, + "step": 44096 + }, + { + "epoch": 2.16, + "grad_norm": 0.6709897518157959, + "learning_rate": 0.00010850956934023795, + "loss": 2.5844, + "step": 44097 + }, + { + "epoch": 2.16, + "grad_norm": 0.6992678642272949, + "learning_rate": 0.00010849771781266872, + "loss": 2.9257, + "step": 44098 + }, + { + "epoch": 2.16, + "grad_norm": 0.7160495519638062, + "learning_rate": 0.0001084858667894798, + "loss": 3.0139, + "step": 44099 + }, + { + "epoch": 2.16, + "grad_norm": 0.6836722493171692, + "learning_rate": 0.00010847401627070212, + "loss": 3.0412, + "step": 44100 + }, + { + "epoch": 2.16, + "grad_norm": 0.6511483192443848, + "learning_rate": 0.00010846216625636712, + "loss": 2.8481, + "step": 44101 + }, + { + "epoch": 2.16, + "grad_norm": 0.6964370012283325, + "learning_rate": 0.00010845031674650589, + "loss": 2.901, + "step": 44102 + }, + { + "epoch": 2.16, + "grad_norm": 0.7739323377609253, + "learning_rate": 0.0001084384677411496, + "loss": 3.06, + "step": 44103 + }, + { + "epoch": 2.16, + "grad_norm": 0.7379111051559448, + "learning_rate": 0.0001084266192403296, + "loss": 2.8302, + "step": 44104 + }, + { + "epoch": 2.16, + "grad_norm": 0.7733993530273438, + "learning_rate": 0.00010841477124407694, + "loss": 2.986, + "step": 44105 + }, + { + "epoch": 2.16, + "grad_norm": 0.7553689479827881, + "learning_rate": 0.00010840292375242298, + "loss": 2.9365, + "step": 44106 + }, + { + "epoch": 2.16, + "grad_norm": 0.6967439651489258, + "learning_rate": 0.00010839107676539875, + "loss": 2.9238, + "step": 44107 + }, + { + "epoch": 2.16, + "grad_norm": 0.6885391473770142, + "learning_rate": 0.00010837923028303555, + "loss": 2.9053, + "step": 44108 + }, + { + "epoch": 2.16, + "grad_norm": 0.6458328366279602, + "learning_rate": 0.00010836738430536466, + "loss": 2.945, + "step": 44109 + }, + { + "epoch": 2.16, + "grad_norm": 0.7221814393997192, + "learning_rate": 0.00010835553883241723, + "loss": 2.827, + "step": 44110 + }, + { + "epoch": 2.16, + "grad_norm": 0.6976704001426697, + "learning_rate": 0.00010834369386422441, + "loss": 2.8029, + "step": 44111 + }, + { + "epoch": 2.16, + "grad_norm": 0.7037103176116943, + "learning_rate": 0.0001083318494008173, + "loss": 2.8437, + "step": 44112 + }, + { + "epoch": 2.16, + "grad_norm": 0.7122330069541931, + "learning_rate": 0.00010832000544222726, + "loss": 2.9479, + "step": 44113 + }, + { + "epoch": 2.16, + "grad_norm": 0.705687403678894, + "learning_rate": 0.0001083081619884855, + "loss": 2.8805, + "step": 44114 + }, + { + "epoch": 2.16, + "grad_norm": 0.6933003067970276, + "learning_rate": 0.00010829631903962304, + "loss": 2.9338, + "step": 44115 + }, + { + "epoch": 2.16, + "grad_norm": 0.7246118783950806, + "learning_rate": 0.00010828447659567132, + "loss": 2.9069, + "step": 44116 + }, + { + "epoch": 2.16, + "grad_norm": 0.6804153323173523, + "learning_rate": 0.00010827263465666137, + "loss": 2.8831, + "step": 44117 + }, + { + "epoch": 2.16, + "grad_norm": 0.7856549024581909, + "learning_rate": 0.00010826079322262427, + "loss": 2.8996, + "step": 44118 + }, + { + "epoch": 2.16, + "grad_norm": 0.6795721650123596, + "learning_rate": 0.00010824895229359146, + "loss": 2.716, + "step": 44119 + }, + { + "epoch": 2.16, + "grad_norm": 0.7031411528587341, + "learning_rate": 0.00010823711186959392, + "loss": 2.9412, + "step": 44120 + }, + { + "epoch": 2.16, + "grad_norm": 0.6899999976158142, + "learning_rate": 0.00010822527195066301, + "loss": 3.0925, + "step": 44121 + }, + { + "epoch": 2.16, + "grad_norm": 0.7404148578643799, + "learning_rate": 0.00010821343253682977, + "loss": 2.9588, + "step": 44122 + }, + { + "epoch": 2.16, + "grad_norm": 0.7039032578468323, + "learning_rate": 0.0001082015936281255, + "loss": 2.9895, + "step": 44123 + }, + { + "epoch": 2.16, + "grad_norm": 0.6816439032554626, + "learning_rate": 0.0001081897552245813, + "loss": 2.707, + "step": 44124 + }, + { + "epoch": 2.16, + "grad_norm": 0.7398577928543091, + "learning_rate": 0.00010817791732622829, + "loss": 3.0734, + "step": 44125 + }, + { + "epoch": 2.16, + "grad_norm": 0.7134485244750977, + "learning_rate": 0.00010816607993309785, + "loss": 2.9215, + "step": 44126 + }, + { + "epoch": 2.16, + "grad_norm": 0.7148016691207886, + "learning_rate": 0.0001081542430452209, + "loss": 3.0477, + "step": 44127 + }, + { + "epoch": 2.16, + "grad_norm": 0.7097327709197998, + "learning_rate": 0.00010814240666262879, + "loss": 2.9591, + "step": 44128 + }, + { + "epoch": 2.16, + "grad_norm": 0.7393547892570496, + "learning_rate": 0.00010813057078535272, + "loss": 3.0054, + "step": 44129 + }, + { + "epoch": 2.16, + "grad_norm": 0.7232438325881958, + "learning_rate": 0.00010811873541342372, + "loss": 2.7291, + "step": 44130 + }, + { + "epoch": 2.16, + "grad_norm": 0.8301907181739807, + "learning_rate": 0.00010810690054687315, + "loss": 2.7839, + "step": 44131 + }, + { + "epoch": 2.16, + "grad_norm": 0.751732349395752, + "learning_rate": 0.00010809506618573207, + "loss": 2.8497, + "step": 44132 + }, + { + "epoch": 2.16, + "grad_norm": 0.7149835228919983, + "learning_rate": 0.00010808323233003154, + "loss": 3.0302, + "step": 44133 + }, + { + "epoch": 2.16, + "grad_norm": 0.7186267375946045, + "learning_rate": 0.00010807139897980295, + "loss": 2.8241, + "step": 44134 + }, + { + "epoch": 2.16, + "grad_norm": 0.6884257197380066, + "learning_rate": 0.00010805956613507726, + "loss": 2.8621, + "step": 44135 + }, + { + "epoch": 2.16, + "grad_norm": 0.7444910407066345, + "learning_rate": 0.00010804773379588582, + "loss": 2.8246, + "step": 44136 + }, + { + "epoch": 2.16, + "grad_norm": 0.7724918127059937, + "learning_rate": 0.00010803590196225963, + "loss": 2.9742, + "step": 44137 + }, + { + "epoch": 2.16, + "grad_norm": 0.707662045955658, + "learning_rate": 0.00010802407063423001, + "loss": 2.7956, + "step": 44138 + }, + { + "epoch": 2.16, + "grad_norm": 0.7242353558540344, + "learning_rate": 0.00010801223981182806, + "loss": 3.0574, + "step": 44139 + }, + { + "epoch": 2.16, + "grad_norm": 0.6966133713722229, + "learning_rate": 0.0001080004094950848, + "loss": 3.0594, + "step": 44140 + }, + { + "epoch": 2.16, + "grad_norm": 0.7180776000022888, + "learning_rate": 0.00010798857968403162, + "loss": 2.9338, + "step": 44141 + }, + { + "epoch": 2.16, + "grad_norm": 0.97182697057724, + "learning_rate": 0.00010797675037869947, + "loss": 2.9121, + "step": 44142 + }, + { + "epoch": 2.16, + "grad_norm": 0.7087554335594177, + "learning_rate": 0.0001079649215791196, + "loss": 2.6782, + "step": 44143 + }, + { + "epoch": 2.16, + "grad_norm": 0.6923799514770508, + "learning_rate": 0.00010795309328532327, + "loss": 2.8516, + "step": 44144 + }, + { + "epoch": 2.16, + "grad_norm": 0.6832737922668457, + "learning_rate": 0.00010794126549734155, + "loss": 2.8526, + "step": 44145 + }, + { + "epoch": 2.16, + "grad_norm": 0.6955265402793884, + "learning_rate": 0.00010792943821520554, + "loss": 2.892, + "step": 44146 + }, + { + "epoch": 2.16, + "grad_norm": 0.7082833051681519, + "learning_rate": 0.00010791761143894635, + "loss": 2.7952, + "step": 44147 + }, + { + "epoch": 2.16, + "grad_norm": 0.6620471477508545, + "learning_rate": 0.00010790578516859517, + "loss": 3.1079, + "step": 44148 + }, + { + "epoch": 2.16, + "grad_norm": 0.7037109136581421, + "learning_rate": 0.00010789395940418331, + "loss": 3.0704, + "step": 44149 + }, + { + "epoch": 2.16, + "grad_norm": 0.6921175122261047, + "learning_rate": 0.0001078821341457417, + "loss": 2.9341, + "step": 44150 + }, + { + "epoch": 2.16, + "grad_norm": 0.7180284261703491, + "learning_rate": 0.00010787030939330165, + "loss": 2.9196, + "step": 44151 + }, + { + "epoch": 2.16, + "grad_norm": 0.6754325032234192, + "learning_rate": 0.00010785848514689414, + "loss": 2.8187, + "step": 44152 + }, + { + "epoch": 2.16, + "grad_norm": 0.7477900981903076, + "learning_rate": 0.00010784666140655051, + "loss": 3.0184, + "step": 44153 + }, + { + "epoch": 2.16, + "grad_norm": 0.7051811814308167, + "learning_rate": 0.0001078348381723018, + "loss": 2.8238, + "step": 44154 + }, + { + "epoch": 2.16, + "grad_norm": 0.7260898947715759, + "learning_rate": 0.00010782301544417903, + "loss": 2.8263, + "step": 44155 + }, + { + "epoch": 2.16, + "grad_norm": 0.7143333554267883, + "learning_rate": 0.00010781119322221354, + "loss": 2.773, + "step": 44156 + }, + { + "epoch": 2.16, + "grad_norm": 0.7197613716125488, + "learning_rate": 0.00010779937150643628, + "loss": 2.9153, + "step": 44157 + }, + { + "epoch": 2.16, + "grad_norm": 0.7276310324668884, + "learning_rate": 0.00010778755029687852, + "loss": 2.8371, + "step": 44158 + }, + { + "epoch": 2.16, + "grad_norm": 0.6611834764480591, + "learning_rate": 0.00010777572959357143, + "loss": 3.0825, + "step": 44159 + }, + { + "epoch": 2.16, + "grad_norm": 0.6871948838233948, + "learning_rate": 0.0001077639093965461, + "loss": 3.0084, + "step": 44160 + }, + { + "epoch": 2.16, + "grad_norm": 0.7019148468971252, + "learning_rate": 0.00010775208970583359, + "loss": 2.9931, + "step": 44161 + }, + { + "epoch": 2.16, + "grad_norm": 0.6876431703567505, + "learning_rate": 0.000107740270521465, + "loss": 3.141, + "step": 44162 + }, + { + "epoch": 2.16, + "grad_norm": 0.667335033416748, + "learning_rate": 0.00010772845184347154, + "loss": 2.7994, + "step": 44163 + }, + { + "epoch": 2.16, + "grad_norm": 0.6756302118301392, + "learning_rate": 0.00010771663367188445, + "loss": 3.0605, + "step": 44164 + }, + { + "epoch": 2.16, + "grad_norm": 0.7222525477409363, + "learning_rate": 0.00010770481600673463, + "loss": 3.2013, + "step": 44165 + }, + { + "epoch": 2.16, + "grad_norm": 0.7120800018310547, + "learning_rate": 0.0001076929988480534, + "loss": 2.8396, + "step": 44166 + }, + { + "epoch": 2.16, + "grad_norm": 0.698549211025238, + "learning_rate": 0.00010768118219587182, + "loss": 2.9708, + "step": 44167 + }, + { + "epoch": 2.16, + "grad_norm": 0.6790973544120789, + "learning_rate": 0.00010766936605022088, + "loss": 3.1835, + "step": 44168 + }, + { + "epoch": 2.16, + "grad_norm": 0.6756714582443237, + "learning_rate": 0.00010765755041113194, + "loss": 2.6655, + "step": 44169 + }, + { + "epoch": 2.16, + "grad_norm": 0.7276606559753418, + "learning_rate": 0.00010764573527863587, + "loss": 2.8426, + "step": 44170 + }, + { + "epoch": 2.16, + "grad_norm": 0.6843422055244446, + "learning_rate": 0.00010763392065276404, + "loss": 2.9493, + "step": 44171 + }, + { + "epoch": 2.16, + "grad_norm": 0.7065037488937378, + "learning_rate": 0.00010762210653354733, + "loss": 2.8149, + "step": 44172 + }, + { + "epoch": 2.16, + "grad_norm": 0.7013691067695618, + "learning_rate": 0.00010761029292101693, + "loss": 2.811, + "step": 44173 + }, + { + "epoch": 2.16, + "grad_norm": 0.727316677570343, + "learning_rate": 0.00010759847981520412, + "loss": 3.0262, + "step": 44174 + }, + { + "epoch": 2.16, + "grad_norm": 0.7056801319122314, + "learning_rate": 0.0001075866672161399, + "loss": 2.8486, + "step": 44175 + }, + { + "epoch": 2.16, + "grad_norm": 0.6833690404891968, + "learning_rate": 0.00010757485512385535, + "loss": 2.8004, + "step": 44176 + }, + { + "epoch": 2.17, + "grad_norm": 0.6740131974220276, + "learning_rate": 0.00010756304353838152, + "loss": 2.829, + "step": 44177 + }, + { + "epoch": 2.17, + "grad_norm": 0.7866957187652588, + "learning_rate": 0.00010755123245974969, + "loss": 2.8718, + "step": 44178 + }, + { + "epoch": 2.17, + "grad_norm": 0.6585379838943481, + "learning_rate": 0.00010753942188799077, + "loss": 2.9546, + "step": 44179 + }, + { + "epoch": 2.17, + "grad_norm": 0.7182008624076843, + "learning_rate": 0.00010752761182313597, + "loss": 2.8891, + "step": 44180 + }, + { + "epoch": 2.17, + "grad_norm": 0.695152223110199, + "learning_rate": 0.0001075158022652165, + "loss": 3.0113, + "step": 44181 + }, + { + "epoch": 2.17, + "grad_norm": 0.7504631280899048, + "learning_rate": 0.00010750399321426336, + "loss": 2.8999, + "step": 44182 + }, + { + "epoch": 2.17, + "grad_norm": 0.6874749064445496, + "learning_rate": 0.00010749218467030767, + "loss": 2.7791, + "step": 44183 + }, + { + "epoch": 2.17, + "grad_norm": 0.725240170955658, + "learning_rate": 0.00010748037663338039, + "loss": 2.7461, + "step": 44184 + }, + { + "epoch": 2.17, + "grad_norm": 0.7044048309326172, + "learning_rate": 0.00010746856910351275, + "loss": 2.9411, + "step": 44185 + }, + { + "epoch": 2.17, + "grad_norm": 0.7227499485015869, + "learning_rate": 0.00010745676208073597, + "loss": 2.9253, + "step": 44186 + }, + { + "epoch": 2.17, + "grad_norm": 0.700607180595398, + "learning_rate": 0.0001074449555650809, + "loss": 2.886, + "step": 44187 + }, + { + "epoch": 2.17, + "grad_norm": 0.6638893485069275, + "learning_rate": 0.00010743314955657889, + "loss": 2.8618, + "step": 44188 + }, + { + "epoch": 2.17, + "grad_norm": 0.7097645401954651, + "learning_rate": 0.00010742134405526078, + "loss": 3.0554, + "step": 44189 + }, + { + "epoch": 2.17, + "grad_norm": 0.7432897686958313, + "learning_rate": 0.0001074095390611579, + "loss": 3.0488, + "step": 44190 + }, + { + "epoch": 2.17, + "grad_norm": 0.7031722068786621, + "learning_rate": 0.00010739773457430122, + "loss": 2.9038, + "step": 44191 + }, + { + "epoch": 2.17, + "grad_norm": 0.6727100610733032, + "learning_rate": 0.00010738593059472174, + "loss": 2.8709, + "step": 44192 + }, + { + "epoch": 2.17, + "grad_norm": 0.7058899402618408, + "learning_rate": 0.00010737412712245074, + "loss": 3.1592, + "step": 44193 + }, + { + "epoch": 2.17, + "grad_norm": 0.6950834393501282, + "learning_rate": 0.00010736232415751913, + "loss": 3.0163, + "step": 44194 + }, + { + "epoch": 2.17, + "grad_norm": 0.7071797251701355, + "learning_rate": 0.00010735052169995809, + "loss": 2.9139, + "step": 44195 + }, + { + "epoch": 2.17, + "grad_norm": 0.6854180097579956, + "learning_rate": 0.00010733871974979878, + "loss": 2.8134, + "step": 44196 + }, + { + "epoch": 2.17, + "grad_norm": 0.7189562916755676, + "learning_rate": 0.00010732691830707221, + "loss": 2.8558, + "step": 44197 + }, + { + "epoch": 2.17, + "grad_norm": 0.6695606112480164, + "learning_rate": 0.00010731511737180947, + "loss": 3.0807, + "step": 44198 + }, + { + "epoch": 2.17, + "grad_norm": 0.6888101696968079, + "learning_rate": 0.00010730331694404153, + "loss": 3.014, + "step": 44199 + }, + { + "epoch": 2.17, + "grad_norm": 0.6768573522567749, + "learning_rate": 0.00010729151702379955, + "loss": 2.9749, + "step": 44200 + }, + { + "epoch": 2.17, + "grad_norm": 0.7207431197166443, + "learning_rate": 0.00010727971761111472, + "loss": 3.0053, + "step": 44201 + }, + { + "epoch": 2.17, + "grad_norm": 0.7181470394134521, + "learning_rate": 0.00010726791870601795, + "loss": 2.881, + "step": 44202 + }, + { + "epoch": 2.17, + "grad_norm": 0.7218087911605835, + "learning_rate": 0.00010725612030854048, + "loss": 2.86, + "step": 44203 + }, + { + "epoch": 2.17, + "grad_norm": 0.7341431379318237, + "learning_rate": 0.00010724432241871329, + "loss": 2.9529, + "step": 44204 + }, + { + "epoch": 2.17, + "grad_norm": 0.7279463410377502, + "learning_rate": 0.00010723252503656737, + "loss": 2.9883, + "step": 44205 + }, + { + "epoch": 2.17, + "grad_norm": 0.6712480187416077, + "learning_rate": 0.00010722072816213396, + "loss": 2.8739, + "step": 44206 + }, + { + "epoch": 2.17, + "grad_norm": 0.6901949048042297, + "learning_rate": 0.00010720893179544397, + "loss": 2.8589, + "step": 44207 + }, + { + "epoch": 2.17, + "grad_norm": 0.7036940455436707, + "learning_rate": 0.00010719713593652866, + "loss": 2.8149, + "step": 44208 + }, + { + "epoch": 2.17, + "grad_norm": 0.7346253991127014, + "learning_rate": 0.00010718534058541885, + "loss": 2.9119, + "step": 44209 + }, + { + "epoch": 2.17, + "grad_norm": 0.7171415686607361, + "learning_rate": 0.00010717354574214577, + "loss": 3.0225, + "step": 44210 + }, + { + "epoch": 2.17, + "grad_norm": 0.7323898673057556, + "learning_rate": 0.00010716175140674055, + "loss": 2.8658, + "step": 44211 + }, + { + "epoch": 2.17, + "grad_norm": 0.7688388824462891, + "learning_rate": 0.00010714995757923418, + "loss": 2.8879, + "step": 44212 + }, + { + "epoch": 2.17, + "grad_norm": 0.6723164319992065, + "learning_rate": 0.00010713816425965768, + "loss": 2.9502, + "step": 44213 + }, + { + "epoch": 2.17, + "grad_norm": 0.6992418766021729, + "learning_rate": 0.00010712637144804207, + "loss": 2.9745, + "step": 44214 + }, + { + "epoch": 2.17, + "grad_norm": 0.6746936440467834, + "learning_rate": 0.00010711457914441846, + "loss": 2.9747, + "step": 44215 + }, + { + "epoch": 2.17, + "grad_norm": 0.7169411182403564, + "learning_rate": 0.00010710278734881804, + "loss": 3.0951, + "step": 44216 + }, + { + "epoch": 2.17, + "grad_norm": 0.7070459127426147, + "learning_rate": 0.00010709099606127165, + "loss": 2.8509, + "step": 44217 + }, + { + "epoch": 2.17, + "grad_norm": 0.6970311999320984, + "learning_rate": 0.00010707920528181053, + "loss": 2.991, + "step": 44218 + }, + { + "epoch": 2.17, + "grad_norm": 0.7649372816085815, + "learning_rate": 0.00010706741501046566, + "loss": 3.022, + "step": 44219 + }, + { + "epoch": 2.17, + "grad_norm": 0.727436900138855, + "learning_rate": 0.00010705562524726802, + "loss": 2.9837, + "step": 44220 + }, + { + "epoch": 2.17, + "grad_norm": 0.6991956830024719, + "learning_rate": 0.0001070438359922488, + "loss": 2.7914, + "step": 44221 + }, + { + "epoch": 2.17, + "grad_norm": 0.6980074048042297, + "learning_rate": 0.0001070320472454389, + "loss": 3.0317, + "step": 44222 + }, + { + "epoch": 2.17, + "grad_norm": 0.6773894429206848, + "learning_rate": 0.00010702025900686954, + "loss": 2.8222, + "step": 44223 + }, + { + "epoch": 2.17, + "grad_norm": 0.7050642967224121, + "learning_rate": 0.00010700847127657159, + "loss": 2.8748, + "step": 44224 + }, + { + "epoch": 2.17, + "grad_norm": 0.7063111662864685, + "learning_rate": 0.00010699668405457625, + "loss": 3.1069, + "step": 44225 + }, + { + "epoch": 2.17, + "grad_norm": 0.7139179706573486, + "learning_rate": 0.00010698489734091454, + "loss": 2.855, + "step": 44226 + }, + { + "epoch": 2.17, + "grad_norm": 0.6964961886405945, + "learning_rate": 0.00010697311113561733, + "loss": 3.0488, + "step": 44227 + }, + { + "epoch": 2.17, + "grad_norm": 0.6961595416069031, + "learning_rate": 0.00010696132543871591, + "loss": 2.9491, + "step": 44228 + }, + { + "epoch": 2.17, + "grad_norm": 0.6952822208404541, + "learning_rate": 0.00010694954025024115, + "loss": 3.0702, + "step": 44229 + }, + { + "epoch": 2.17, + "grad_norm": 0.7170188426971436, + "learning_rate": 0.00010693775557022407, + "loss": 2.9606, + "step": 44230 + }, + { + "epoch": 2.17, + "grad_norm": 0.7045404314994812, + "learning_rate": 0.00010692597139869595, + "loss": 2.8441, + "step": 44231 + }, + { + "epoch": 2.17, + "grad_norm": 0.7429175972938538, + "learning_rate": 0.00010691418773568752, + "loss": 3.0192, + "step": 44232 + }, + { + "epoch": 2.17, + "grad_norm": 0.7014785408973694, + "learning_rate": 0.00010690240458123008, + "loss": 2.8359, + "step": 44233 + }, + { + "epoch": 2.17, + "grad_norm": 0.7238576412200928, + "learning_rate": 0.00010689062193535456, + "loss": 2.9959, + "step": 44234 + }, + { + "epoch": 2.17, + "grad_norm": 0.701926052570343, + "learning_rate": 0.00010687883979809183, + "loss": 2.9024, + "step": 44235 + }, + { + "epoch": 2.17, + "grad_norm": 0.7221980094909668, + "learning_rate": 0.00010686705816947323, + "loss": 2.9919, + "step": 44236 + }, + { + "epoch": 2.17, + "grad_norm": 0.7259619235992432, + "learning_rate": 0.0001068552770495295, + "loss": 2.8936, + "step": 44237 + }, + { + "epoch": 2.17, + "grad_norm": 0.7003749012947083, + "learning_rate": 0.00010684349643829188, + "loss": 3.1794, + "step": 44238 + }, + { + "epoch": 2.17, + "grad_norm": 0.7119761109352112, + "learning_rate": 0.00010683171633579124, + "loss": 2.933, + "step": 44239 + }, + { + "epoch": 2.17, + "grad_norm": 0.7327340841293335, + "learning_rate": 0.00010681993674205877, + "loss": 2.743, + "step": 44240 + }, + { + "epoch": 2.17, + "grad_norm": 0.7414251565933228, + "learning_rate": 0.00010680815765712543, + "loss": 2.885, + "step": 44241 + }, + { + "epoch": 2.17, + "grad_norm": 0.683208703994751, + "learning_rate": 0.0001067963790810221, + "loss": 2.9057, + "step": 44242 + }, + { + "epoch": 2.17, + "grad_norm": 0.711431086063385, + "learning_rate": 0.00010678460101378003, + "loss": 2.8266, + "step": 44243 + }, + { + "epoch": 2.17, + "grad_norm": 0.711889386177063, + "learning_rate": 0.00010677282345543005, + "loss": 2.8303, + "step": 44244 + }, + { + "epoch": 2.17, + "grad_norm": 0.7022669315338135, + "learning_rate": 0.00010676104640600324, + "loss": 2.8831, + "step": 44245 + }, + { + "epoch": 2.17, + "grad_norm": 0.684972882270813, + "learning_rate": 0.00010674926986553074, + "loss": 2.8915, + "step": 44246 + }, + { + "epoch": 2.17, + "grad_norm": 0.6765041947364807, + "learning_rate": 0.0001067374938340435, + "loss": 2.8127, + "step": 44247 + }, + { + "epoch": 2.17, + "grad_norm": 0.6731730103492737, + "learning_rate": 0.00010672571831157251, + "loss": 2.9058, + "step": 44248 + }, + { + "epoch": 2.17, + "grad_norm": 0.7448628544807434, + "learning_rate": 0.00010671394329814864, + "loss": 2.6898, + "step": 44249 + }, + { + "epoch": 2.17, + "grad_norm": 0.7296499609947205, + "learning_rate": 0.00010670216879380308, + "loss": 3.0543, + "step": 44250 + }, + { + "epoch": 2.17, + "grad_norm": 0.7346038222312927, + "learning_rate": 0.00010669039479856691, + "loss": 2.8413, + "step": 44251 + }, + { + "epoch": 2.17, + "grad_norm": 0.6547524929046631, + "learning_rate": 0.00010667862131247092, + "loss": 2.7705, + "step": 44252 + }, + { + "epoch": 2.17, + "grad_norm": 0.719100832939148, + "learning_rate": 0.00010666684833554635, + "loss": 3.0006, + "step": 44253 + }, + { + "epoch": 2.17, + "grad_norm": 0.7397176027297974, + "learning_rate": 0.00010665507586782399, + "loss": 2.7199, + "step": 44254 + }, + { + "epoch": 2.17, + "grad_norm": 0.7125775814056396, + "learning_rate": 0.00010664330390933505, + "loss": 2.9314, + "step": 44255 + }, + { + "epoch": 2.17, + "grad_norm": 0.7293382287025452, + "learning_rate": 0.00010663153246011042, + "loss": 3.1108, + "step": 44256 + }, + { + "epoch": 2.17, + "grad_norm": 0.6914783120155334, + "learning_rate": 0.00010661976152018101, + "loss": 2.8427, + "step": 44257 + }, + { + "epoch": 2.17, + "grad_norm": 0.7073983550071716, + "learning_rate": 0.00010660799108957806, + "loss": 2.8991, + "step": 44258 + }, + { + "epoch": 2.17, + "grad_norm": 0.7683893442153931, + "learning_rate": 0.00010659622116833233, + "loss": 2.8923, + "step": 44259 + }, + { + "epoch": 2.17, + "grad_norm": 0.7236352562904358, + "learning_rate": 0.00010658445175647506, + "loss": 2.9805, + "step": 44260 + }, + { + "epoch": 2.17, + "grad_norm": 0.7174046635627747, + "learning_rate": 0.00010657268285403697, + "loss": 3.0995, + "step": 44261 + }, + { + "epoch": 2.17, + "grad_norm": 0.6961384415626526, + "learning_rate": 0.00010656091446104936, + "loss": 2.8501, + "step": 44262 + }, + { + "epoch": 2.17, + "grad_norm": 0.7074950933456421, + "learning_rate": 0.00010654914657754304, + "loss": 2.8524, + "step": 44263 + }, + { + "epoch": 2.17, + "grad_norm": 0.6755015850067139, + "learning_rate": 0.00010653737920354896, + "loss": 2.8667, + "step": 44264 + }, + { + "epoch": 2.17, + "grad_norm": 0.673994779586792, + "learning_rate": 0.00010652561233909828, + "loss": 2.8819, + "step": 44265 + }, + { + "epoch": 2.17, + "grad_norm": 0.6857726573944092, + "learning_rate": 0.00010651384598422178, + "loss": 2.9646, + "step": 44266 + }, + { + "epoch": 2.17, + "grad_norm": 0.7020795941352844, + "learning_rate": 0.00010650208013895057, + "loss": 2.888, + "step": 44267 + }, + { + "epoch": 2.17, + "grad_norm": 0.7138441801071167, + "learning_rate": 0.00010649031480331577, + "loss": 2.9728, + "step": 44268 + }, + { + "epoch": 2.17, + "grad_norm": 0.6965253949165344, + "learning_rate": 0.00010647854997734824, + "loss": 2.8346, + "step": 44269 + }, + { + "epoch": 2.17, + "grad_norm": 0.6731846332550049, + "learning_rate": 0.00010646678566107895, + "loss": 2.8683, + "step": 44270 + }, + { + "epoch": 2.17, + "grad_norm": 0.7317405939102173, + "learning_rate": 0.0001064550218545388, + "loss": 2.9809, + "step": 44271 + }, + { + "epoch": 2.17, + "grad_norm": 0.7305527925491333, + "learning_rate": 0.00010644325855775889, + "loss": 2.9701, + "step": 44272 + }, + { + "epoch": 2.17, + "grad_norm": 0.6821311712265015, + "learning_rate": 0.00010643149577077027, + "loss": 2.8188, + "step": 44273 + }, + { + "epoch": 2.17, + "grad_norm": 0.7498180866241455, + "learning_rate": 0.00010641973349360374, + "loss": 3.0598, + "step": 44274 + }, + { + "epoch": 2.17, + "grad_norm": 0.731899619102478, + "learning_rate": 0.00010640797172629051, + "loss": 2.8892, + "step": 44275 + }, + { + "epoch": 2.17, + "grad_norm": 0.7006483674049377, + "learning_rate": 0.00010639621046886128, + "loss": 2.86, + "step": 44276 + }, + { + "epoch": 2.17, + "grad_norm": 0.7303549647331238, + "learning_rate": 0.00010638444972134728, + "loss": 2.597, + "step": 44277 + }, + { + "epoch": 2.17, + "grad_norm": 0.6863207817077637, + "learning_rate": 0.00010637268948377939, + "loss": 2.6882, + "step": 44278 + }, + { + "epoch": 2.17, + "grad_norm": 0.66311115026474, + "learning_rate": 0.00010636092975618847, + "loss": 2.9305, + "step": 44279 + }, + { + "epoch": 2.17, + "grad_norm": 0.7255011796951294, + "learning_rate": 0.00010634917053860569, + "loss": 2.8173, + "step": 44280 + }, + { + "epoch": 2.17, + "grad_norm": 0.6959855556488037, + "learning_rate": 0.00010633741183106184, + "loss": 2.8846, + "step": 44281 + }, + { + "epoch": 2.17, + "grad_norm": 0.6742451190948486, + "learning_rate": 0.00010632565363358797, + "loss": 3.0126, + "step": 44282 + }, + { + "epoch": 2.17, + "grad_norm": 0.7093662023544312, + "learning_rate": 0.00010631389594621515, + "loss": 3.0639, + "step": 44283 + }, + { + "epoch": 2.17, + "grad_norm": 0.6777709126472473, + "learning_rate": 0.00010630213876897423, + "loss": 2.9066, + "step": 44284 + }, + { + "epoch": 2.17, + "grad_norm": 0.700195848941803, + "learning_rate": 0.00010629038210189623, + "loss": 2.9751, + "step": 44285 + }, + { + "epoch": 2.17, + "grad_norm": 0.6865321397781372, + "learning_rate": 0.00010627862594501196, + "loss": 3.0077, + "step": 44286 + }, + { + "epoch": 2.17, + "grad_norm": 0.6771736741065979, + "learning_rate": 0.0001062668702983525, + "loss": 2.7006, + "step": 44287 + }, + { + "epoch": 2.17, + "grad_norm": 0.7320778965950012, + "learning_rate": 0.00010625511516194895, + "loss": 2.9485, + "step": 44288 + }, + { + "epoch": 2.17, + "grad_norm": 0.7206733226776123, + "learning_rate": 0.00010624336053583202, + "loss": 2.8812, + "step": 44289 + }, + { + "epoch": 2.17, + "grad_norm": 0.6806963682174683, + "learning_rate": 0.00010623160642003292, + "loss": 2.8639, + "step": 44290 + }, + { + "epoch": 2.17, + "grad_norm": 0.7328031659126282, + "learning_rate": 0.00010621985281458233, + "loss": 2.9097, + "step": 44291 + }, + { + "epoch": 2.17, + "grad_norm": 0.7521789073944092, + "learning_rate": 0.00010620809971951148, + "loss": 2.605, + "step": 44292 + }, + { + "epoch": 2.17, + "grad_norm": 0.7122011780738831, + "learning_rate": 0.00010619634713485122, + "loss": 2.9234, + "step": 44293 + }, + { + "epoch": 2.17, + "grad_norm": 0.7520004510879517, + "learning_rate": 0.00010618459506063235, + "loss": 2.8301, + "step": 44294 + }, + { + "epoch": 2.17, + "grad_norm": 0.7129939794540405, + "learning_rate": 0.00010617284349688607, + "loss": 2.9049, + "step": 44295 + }, + { + "epoch": 2.17, + "grad_norm": 0.6909685134887695, + "learning_rate": 0.0001061610924436431, + "loss": 2.9319, + "step": 44296 + }, + { + "epoch": 2.17, + "grad_norm": 0.6826851963996887, + "learning_rate": 0.00010614934190093454, + "loss": 2.7563, + "step": 44297 + }, + { + "epoch": 2.17, + "grad_norm": 0.6831250190734863, + "learning_rate": 0.00010613759186879138, + "loss": 2.9022, + "step": 44298 + }, + { + "epoch": 2.17, + "grad_norm": 0.6965422034263611, + "learning_rate": 0.00010612584234724452, + "loss": 3.158, + "step": 44299 + }, + { + "epoch": 2.17, + "grad_norm": 0.7100363969802856, + "learning_rate": 0.00010611409333632486, + "loss": 2.6984, + "step": 44300 + }, + { + "epoch": 2.17, + "grad_norm": 0.7082723379135132, + "learning_rate": 0.00010610234483606325, + "loss": 2.8544, + "step": 44301 + }, + { + "epoch": 2.17, + "grad_norm": 0.7065490484237671, + "learning_rate": 0.00010609059684649076, + "loss": 3.014, + "step": 44302 + }, + { + "epoch": 2.17, + "grad_norm": 0.7121196389198303, + "learning_rate": 0.00010607884936763841, + "loss": 2.8571, + "step": 44303 + }, + { + "epoch": 2.17, + "grad_norm": 0.7432886958122253, + "learning_rate": 0.00010606710239953697, + "loss": 2.9936, + "step": 44304 + }, + { + "epoch": 2.17, + "grad_norm": 0.6981472969055176, + "learning_rate": 0.00010605535594221757, + "loss": 2.8229, + "step": 44305 + }, + { + "epoch": 2.17, + "grad_norm": 0.7373912334442139, + "learning_rate": 0.000106043609995711, + "loss": 2.8332, + "step": 44306 + }, + { + "epoch": 2.17, + "grad_norm": 0.7022235989570618, + "learning_rate": 0.00010603186456004816, + "loss": 2.783, + "step": 44307 + }, + { + "epoch": 2.17, + "grad_norm": 0.7700810432434082, + "learning_rate": 0.00010602011963526013, + "loss": 2.9577, + "step": 44308 + }, + { + "epoch": 2.17, + "grad_norm": 0.7263349294662476, + "learning_rate": 0.0001060083752213777, + "loss": 2.8884, + "step": 44309 + }, + { + "epoch": 2.17, + "grad_norm": 0.6605125665664673, + "learning_rate": 0.00010599663131843197, + "loss": 2.9842, + "step": 44310 + }, + { + "epoch": 2.17, + "grad_norm": 0.680346667766571, + "learning_rate": 0.00010598488792645367, + "loss": 3.0116, + "step": 44311 + }, + { + "epoch": 2.17, + "grad_norm": 0.696643054485321, + "learning_rate": 0.00010597314504547386, + "loss": 2.5903, + "step": 44312 + }, + { + "epoch": 2.17, + "grad_norm": 0.7104796171188354, + "learning_rate": 0.00010596140267552351, + "loss": 3.0219, + "step": 44313 + }, + { + "epoch": 2.17, + "grad_norm": 0.8690118193626404, + "learning_rate": 0.00010594966081663352, + "loss": 2.9789, + "step": 44314 + }, + { + "epoch": 2.17, + "grad_norm": 0.7163686156272888, + "learning_rate": 0.00010593791946883473, + "loss": 3.0463, + "step": 44315 + }, + { + "epoch": 2.17, + "grad_norm": 0.7209365367889404, + "learning_rate": 0.00010592617863215806, + "loss": 2.8777, + "step": 44316 + }, + { + "epoch": 2.17, + "grad_norm": 0.6892098188400269, + "learning_rate": 0.00010591443830663447, + "loss": 2.7619, + "step": 44317 + }, + { + "epoch": 2.17, + "grad_norm": 0.7479181289672852, + "learning_rate": 0.00010590269849229497, + "loss": 2.9923, + "step": 44318 + }, + { + "epoch": 2.17, + "grad_norm": 0.6602400541305542, + "learning_rate": 0.00010589095918917032, + "loss": 3.0618, + "step": 44319 + }, + { + "epoch": 2.17, + "grad_norm": 0.6800590753555298, + "learning_rate": 0.00010587922039729162, + "loss": 2.837, + "step": 44320 + }, + { + "epoch": 2.17, + "grad_norm": 0.6915212869644165, + "learning_rate": 0.00010586748211668975, + "loss": 3.046, + "step": 44321 + }, + { + "epoch": 2.17, + "grad_norm": 0.6551668643951416, + "learning_rate": 0.00010585574434739542, + "loss": 2.8772, + "step": 44322 + }, + { + "epoch": 2.17, + "grad_norm": 0.7143257856369019, + "learning_rate": 0.00010584400708943979, + "loss": 2.9301, + "step": 44323 + }, + { + "epoch": 2.17, + "grad_norm": 0.6705986857414246, + "learning_rate": 0.00010583227034285357, + "loss": 2.8245, + "step": 44324 + }, + { + "epoch": 2.17, + "grad_norm": 0.6597686409950256, + "learning_rate": 0.00010582053410766792, + "loss": 2.9585, + "step": 44325 + }, + { + "epoch": 2.17, + "grad_norm": 0.7265187501907349, + "learning_rate": 0.00010580879838391349, + "loss": 2.9596, + "step": 44326 + }, + { + "epoch": 2.17, + "grad_norm": 0.7158097624778748, + "learning_rate": 0.00010579706317162145, + "loss": 2.7455, + "step": 44327 + }, + { + "epoch": 2.17, + "grad_norm": 0.7624508142471313, + "learning_rate": 0.0001057853284708225, + "loss": 2.8151, + "step": 44328 + }, + { + "epoch": 2.17, + "grad_norm": 0.7447002530097961, + "learning_rate": 0.00010577359428154759, + "loss": 2.5754, + "step": 44329 + }, + { + "epoch": 2.17, + "grad_norm": 0.6778799295425415, + "learning_rate": 0.00010576186060382771, + "loss": 2.855, + "step": 44330 + }, + { + "epoch": 2.17, + "grad_norm": 0.7103135585784912, + "learning_rate": 0.0001057501274376936, + "loss": 2.9367, + "step": 44331 + }, + { + "epoch": 2.17, + "grad_norm": 0.6705018281936646, + "learning_rate": 0.0001057383947831763, + "loss": 3.1099, + "step": 44332 + }, + { + "epoch": 2.17, + "grad_norm": 0.7252798676490784, + "learning_rate": 0.00010572666264030677, + "loss": 2.8609, + "step": 44333 + }, + { + "epoch": 2.17, + "grad_norm": 0.7287251353263855, + "learning_rate": 0.00010571493100911573, + "loss": 2.8493, + "step": 44334 + }, + { + "epoch": 2.17, + "grad_norm": 0.7141810059547424, + "learning_rate": 0.00010570319988963424, + "loss": 2.855, + "step": 44335 + }, + { + "epoch": 2.17, + "grad_norm": 0.7226595878601074, + "learning_rate": 0.00010569146928189316, + "loss": 2.6597, + "step": 44336 + }, + { + "epoch": 2.17, + "grad_norm": 0.7063793540000916, + "learning_rate": 0.00010567973918592334, + "loss": 3.0757, + "step": 44337 + }, + { + "epoch": 2.17, + "grad_norm": 0.7134846448898315, + "learning_rate": 0.0001056680096017556, + "loss": 2.8873, + "step": 44338 + }, + { + "epoch": 2.17, + "grad_norm": 0.7262305021286011, + "learning_rate": 0.00010565628052942091, + "loss": 3.1055, + "step": 44339 + }, + { + "epoch": 2.17, + "grad_norm": 0.729193389415741, + "learning_rate": 0.00010564455196895029, + "loss": 2.9053, + "step": 44340 + }, + { + "epoch": 2.17, + "grad_norm": 0.7212259769439697, + "learning_rate": 0.00010563282392037443, + "loss": 2.9384, + "step": 44341 + }, + { + "epoch": 2.17, + "grad_norm": 0.7601874470710754, + "learning_rate": 0.00010562109638372436, + "loss": 2.7878, + "step": 44342 + }, + { + "epoch": 2.17, + "grad_norm": 0.6938914656639099, + "learning_rate": 0.00010560936935903093, + "loss": 3.0421, + "step": 44343 + }, + { + "epoch": 2.17, + "grad_norm": 0.7167030572891235, + "learning_rate": 0.00010559764284632493, + "loss": 3.0963, + "step": 44344 + }, + { + "epoch": 2.17, + "grad_norm": 0.7385327219963074, + "learning_rate": 0.0001055859168456374, + "loss": 3.0804, + "step": 44345 + }, + { + "epoch": 2.17, + "grad_norm": 0.7111912369728088, + "learning_rate": 0.00010557419135699906, + "loss": 3.0677, + "step": 44346 + }, + { + "epoch": 2.17, + "grad_norm": 0.7527398467063904, + "learning_rate": 0.000105562466380441, + "loss": 3.0916, + "step": 44347 + }, + { + "epoch": 2.17, + "grad_norm": 0.6765745282173157, + "learning_rate": 0.00010555074191599387, + "loss": 2.9669, + "step": 44348 + }, + { + "epoch": 2.17, + "grad_norm": 0.6760923266410828, + "learning_rate": 0.00010553901796368876, + "loss": 2.8139, + "step": 44349 + }, + { + "epoch": 2.17, + "grad_norm": 0.7211750149726868, + "learning_rate": 0.00010552729452355644, + "loss": 2.7989, + "step": 44350 + }, + { + "epoch": 2.17, + "grad_norm": 0.6986032128334045, + "learning_rate": 0.00010551557159562769, + "loss": 3.0181, + "step": 44351 + }, + { + "epoch": 2.17, + "grad_norm": 0.7039467692375183, + "learning_rate": 0.00010550384917993362, + "loss": 2.9076, + "step": 44352 + }, + { + "epoch": 2.17, + "grad_norm": 0.7151457667350769, + "learning_rate": 0.00010549212727650487, + "loss": 3.0128, + "step": 44353 + }, + { + "epoch": 2.17, + "grad_norm": 0.7260016202926636, + "learning_rate": 0.00010548040588537244, + "loss": 3.0107, + "step": 44354 + }, + { + "epoch": 2.17, + "grad_norm": 0.7048295140266418, + "learning_rate": 0.00010546868500656728, + "loss": 2.9732, + "step": 44355 + }, + { + "epoch": 2.17, + "grad_norm": 0.6705439686775208, + "learning_rate": 0.00010545696464012006, + "loss": 2.736, + "step": 44356 + }, + { + "epoch": 2.17, + "grad_norm": 0.6685308814048767, + "learning_rate": 0.00010544524478606185, + "loss": 3.0948, + "step": 44357 + }, + { + "epoch": 2.17, + "grad_norm": 0.7032225728034973, + "learning_rate": 0.00010543352544442342, + "loss": 2.8816, + "step": 44358 + }, + { + "epoch": 2.17, + "grad_norm": 0.7694846987724304, + "learning_rate": 0.00010542180661523555, + "loss": 3.0432, + "step": 44359 + }, + { + "epoch": 2.17, + "grad_norm": 0.6599303483963013, + "learning_rate": 0.00010541008829852929, + "loss": 2.9444, + "step": 44360 + }, + { + "epoch": 2.17, + "grad_norm": 0.7349076271057129, + "learning_rate": 0.00010539837049433531, + "loss": 2.981, + "step": 44361 + }, + { + "epoch": 2.17, + "grad_norm": 0.7268679141998291, + "learning_rate": 0.00010538665320268466, + "loss": 2.9341, + "step": 44362 + }, + { + "epoch": 2.17, + "grad_norm": 0.6871877312660217, + "learning_rate": 0.00010537493642360802, + "loss": 2.8963, + "step": 44363 + }, + { + "epoch": 2.17, + "grad_norm": 0.669917643070221, + "learning_rate": 0.00010536322015713642, + "loss": 2.921, + "step": 44364 + }, + { + "epoch": 2.17, + "grad_norm": 0.7450718283653259, + "learning_rate": 0.00010535150440330068, + "loss": 2.6976, + "step": 44365 + }, + { + "epoch": 2.17, + "grad_norm": 0.6828457713127136, + "learning_rate": 0.00010533978916213148, + "loss": 2.8831, + "step": 44366 + }, + { + "epoch": 2.17, + "grad_norm": 0.7049870491027832, + "learning_rate": 0.00010532807443365994, + "loss": 2.9459, + "step": 44367 + }, + { + "epoch": 2.17, + "grad_norm": 0.6955314874649048, + "learning_rate": 0.00010531636021791664, + "loss": 2.8951, + "step": 44368 + }, + { + "epoch": 2.17, + "grad_norm": 0.6996759176254272, + "learning_rate": 0.00010530464651493263, + "loss": 2.9312, + "step": 44369 + }, + { + "epoch": 2.17, + "grad_norm": 0.7178577184677124, + "learning_rate": 0.00010529293332473879, + "loss": 2.7958, + "step": 44370 + }, + { + "epoch": 2.17, + "grad_norm": 0.6655074954032898, + "learning_rate": 0.00010528122064736591, + "loss": 2.9293, + "step": 44371 + }, + { + "epoch": 2.17, + "grad_norm": 0.6931262016296387, + "learning_rate": 0.00010526950848284482, + "loss": 2.9426, + "step": 44372 + }, + { + "epoch": 2.17, + "grad_norm": 0.6952087879180908, + "learning_rate": 0.00010525779683120621, + "loss": 2.921, + "step": 44373 + }, + { + "epoch": 2.17, + "grad_norm": 0.6865946054458618, + "learning_rate": 0.00010524608569248113, + "loss": 2.8322, + "step": 44374 + }, + { + "epoch": 2.17, + "grad_norm": 0.6755927801132202, + "learning_rate": 0.00010523437506670052, + "loss": 2.7113, + "step": 44375 + }, + { + "epoch": 2.17, + "grad_norm": 0.704673171043396, + "learning_rate": 0.00010522266495389491, + "loss": 2.8718, + "step": 44376 + }, + { + "epoch": 2.17, + "grad_norm": 0.6965922713279724, + "learning_rate": 0.00010521095535409547, + "loss": 2.8379, + "step": 44377 + }, + { + "epoch": 2.17, + "grad_norm": 0.7056841850280762, + "learning_rate": 0.00010519924626733275, + "loss": 3.0354, + "step": 44378 + }, + { + "epoch": 2.17, + "grad_norm": 0.7823092937469482, + "learning_rate": 0.00010518753769363782, + "loss": 3.1314, + "step": 44379 + }, + { + "epoch": 2.17, + "grad_norm": 0.6921338438987732, + "learning_rate": 0.00010517582963304146, + "loss": 3.0036, + "step": 44380 + }, + { + "epoch": 2.18, + "grad_norm": 0.6734832525253296, + "learning_rate": 0.00010516412208557434, + "loss": 2.9865, + "step": 44381 + }, + { + "epoch": 2.18, + "grad_norm": 0.7287729382514954, + "learning_rate": 0.00010515241505126754, + "loss": 3.0678, + "step": 44382 + }, + { + "epoch": 2.18, + "grad_norm": 0.6677972674369812, + "learning_rate": 0.00010514070853015169, + "loss": 2.9314, + "step": 44383 + }, + { + "epoch": 2.18, + "grad_norm": 0.7038114070892334, + "learning_rate": 0.00010512900252225766, + "loss": 3.0285, + "step": 44384 + }, + { + "epoch": 2.18, + "grad_norm": 0.6778184175491333, + "learning_rate": 0.00010511729702761648, + "loss": 2.8647, + "step": 44385 + }, + { + "epoch": 2.18, + "grad_norm": 0.7425260543823242, + "learning_rate": 0.00010510559204625882, + "loss": 2.9822, + "step": 44386 + }, + { + "epoch": 2.18, + "grad_norm": 0.720942497253418, + "learning_rate": 0.00010509388757821555, + "loss": 2.9104, + "step": 44387 + }, + { + "epoch": 2.18, + "grad_norm": 0.7403482794761658, + "learning_rate": 0.00010508218362351735, + "loss": 2.881, + "step": 44388 + }, + { + "epoch": 2.18, + "grad_norm": 0.6942797303199768, + "learning_rate": 0.00010507048018219515, + "loss": 3.1106, + "step": 44389 + }, + { + "epoch": 2.18, + "grad_norm": 0.7015599012374878, + "learning_rate": 0.00010505877725427991, + "loss": 2.8412, + "step": 44390 + }, + { + "epoch": 2.18, + "grad_norm": 0.7615519165992737, + "learning_rate": 0.00010504707483980221, + "loss": 2.9333, + "step": 44391 + }, + { + "epoch": 2.18, + "grad_norm": 0.711105465888977, + "learning_rate": 0.00010503537293879313, + "loss": 3.13, + "step": 44392 + }, + { + "epoch": 2.18, + "grad_norm": 0.7053470015525818, + "learning_rate": 0.00010502367155128332, + "loss": 2.9103, + "step": 44393 + }, + { + "epoch": 2.18, + "grad_norm": 0.6969516277313232, + "learning_rate": 0.00010501197067730357, + "loss": 2.8205, + "step": 44394 + }, + { + "epoch": 2.18, + "grad_norm": 0.7566748857498169, + "learning_rate": 0.00010500027031688482, + "loss": 3.0093, + "step": 44395 + }, + { + "epoch": 2.18, + "grad_norm": 0.6897950172424316, + "learning_rate": 0.00010498857047005777, + "loss": 3.0615, + "step": 44396 + }, + { + "epoch": 2.18, + "grad_norm": 0.7079964280128479, + "learning_rate": 0.00010497687113685337, + "loss": 3.1859, + "step": 44397 + }, + { + "epoch": 2.18, + "grad_norm": 0.72109454870224, + "learning_rate": 0.00010496517231730228, + "loss": 2.4967, + "step": 44398 + }, + { + "epoch": 2.18, + "grad_norm": 0.7014322876930237, + "learning_rate": 0.00010495347401143538, + "loss": 3.1057, + "step": 44399 + }, + { + "epoch": 2.18, + "grad_norm": 0.7019551396369934, + "learning_rate": 0.00010494177621928358, + "loss": 2.9177, + "step": 44400 + }, + { + "epoch": 2.18, + "grad_norm": 0.721088171005249, + "learning_rate": 0.00010493007894087762, + "loss": 3.1077, + "step": 44401 + }, + { + "epoch": 2.18, + "grad_norm": 0.6936663389205933, + "learning_rate": 0.00010491838217624829, + "loss": 2.9189, + "step": 44402 + }, + { + "epoch": 2.18, + "grad_norm": 0.7555628418922424, + "learning_rate": 0.0001049066859254263, + "loss": 2.906, + "step": 44403 + }, + { + "epoch": 2.18, + "grad_norm": 0.6784551739692688, + "learning_rate": 0.00010489499018844253, + "loss": 2.9672, + "step": 44404 + }, + { + "epoch": 2.18, + "grad_norm": 0.7072699069976807, + "learning_rate": 0.00010488329496532791, + "loss": 2.966, + "step": 44405 + }, + { + "epoch": 2.18, + "grad_norm": 0.7025425434112549, + "learning_rate": 0.00010487160025611305, + "loss": 2.966, + "step": 44406 + }, + { + "epoch": 2.18, + "grad_norm": 0.7028836607933044, + "learning_rate": 0.00010485990606082895, + "loss": 2.991, + "step": 44407 + }, + { + "epoch": 2.18, + "grad_norm": 0.7300164103507996, + "learning_rate": 0.0001048482123795063, + "loss": 2.8152, + "step": 44408 + }, + { + "epoch": 2.18, + "grad_norm": 0.6703711748123169, + "learning_rate": 0.0001048365192121758, + "loss": 2.9551, + "step": 44409 + }, + { + "epoch": 2.18, + "grad_norm": 0.7038151621818542, + "learning_rate": 0.00010482482655886841, + "loss": 2.5322, + "step": 44410 + }, + { + "epoch": 2.18, + "grad_norm": 0.7217896580696106, + "learning_rate": 0.0001048131344196148, + "loss": 2.7781, + "step": 44411 + }, + { + "epoch": 2.18, + "grad_norm": 0.7087219953536987, + "learning_rate": 0.00010480144279444595, + "loss": 2.8338, + "step": 44412 + }, + { + "epoch": 2.18, + "grad_norm": 0.7262712121009827, + "learning_rate": 0.0001047897516833924, + "loss": 3.0148, + "step": 44413 + }, + { + "epoch": 2.18, + "grad_norm": 0.6909366846084595, + "learning_rate": 0.0001047780610864852, + "loss": 2.9738, + "step": 44414 + }, + { + "epoch": 2.18, + "grad_norm": 0.6847735047340393, + "learning_rate": 0.0001047663710037549, + "loss": 2.733, + "step": 44415 + }, + { + "epoch": 2.18, + "grad_norm": 0.7341592907905579, + "learning_rate": 0.00010475468143523248, + "loss": 2.8791, + "step": 44416 + }, + { + "epoch": 2.18, + "grad_norm": 0.7717563509941101, + "learning_rate": 0.0001047429923809487, + "loss": 2.9478, + "step": 44417 + }, + { + "epoch": 2.18, + "grad_norm": 0.6956798434257507, + "learning_rate": 0.00010473130384093417, + "loss": 2.9691, + "step": 44418 + }, + { + "epoch": 2.18, + "grad_norm": 0.7189202904701233, + "learning_rate": 0.00010471961581521993, + "loss": 3.0525, + "step": 44419 + }, + { + "epoch": 2.18, + "grad_norm": 0.6681233048439026, + "learning_rate": 0.0001047079283038365, + "loss": 2.8649, + "step": 44420 + }, + { + "epoch": 2.18, + "grad_norm": 0.7311338186264038, + "learning_rate": 0.00010469624130681485, + "loss": 3.015, + "step": 44421 + }, + { + "epoch": 2.18, + "grad_norm": 0.722417950630188, + "learning_rate": 0.00010468455482418577, + "loss": 2.8841, + "step": 44422 + }, + { + "epoch": 2.18, + "grad_norm": 0.6731986403465271, + "learning_rate": 0.00010467286885598003, + "loss": 2.7704, + "step": 44423 + }, + { + "epoch": 2.18, + "grad_norm": 0.6961910724639893, + "learning_rate": 0.00010466118340222833, + "loss": 2.8648, + "step": 44424 + }, + { + "epoch": 2.18, + "grad_norm": 0.7090938687324524, + "learning_rate": 0.00010464949846296138, + "loss": 2.8517, + "step": 44425 + }, + { + "epoch": 2.18, + "grad_norm": 0.690933108329773, + "learning_rate": 0.00010463781403821004, + "loss": 2.8032, + "step": 44426 + }, + { + "epoch": 2.18, + "grad_norm": 0.6868634819984436, + "learning_rate": 0.00010462613012800524, + "loss": 3.0002, + "step": 44427 + }, + { + "epoch": 2.18, + "grad_norm": 0.6949028968811035, + "learning_rate": 0.00010461444673237747, + "loss": 2.784, + "step": 44428 + }, + { + "epoch": 2.18, + "grad_norm": 0.7235555648803711, + "learning_rate": 0.00010460276385135778, + "loss": 2.958, + "step": 44429 + }, + { + "epoch": 2.18, + "grad_norm": 0.7306112051010132, + "learning_rate": 0.00010459108148497681, + "loss": 2.9012, + "step": 44430 + }, + { + "epoch": 2.18, + "grad_norm": 0.7331579327583313, + "learning_rate": 0.0001045793996332652, + "loss": 2.8728, + "step": 44431 + }, + { + "epoch": 2.18, + "grad_norm": 0.7387495636940002, + "learning_rate": 0.00010456771829625396, + "loss": 2.9061, + "step": 44432 + }, + { + "epoch": 2.18, + "grad_norm": 0.6881783604621887, + "learning_rate": 0.0001045560374739736, + "loss": 2.9365, + "step": 44433 + }, + { + "epoch": 2.18, + "grad_norm": 0.7136433720588684, + "learning_rate": 0.00010454435716645518, + "loss": 2.8599, + "step": 44434 + }, + { + "epoch": 2.18, + "grad_norm": 0.6872485280036926, + "learning_rate": 0.00010453267737372915, + "loss": 2.7754, + "step": 44435 + }, + { + "epoch": 2.18, + "grad_norm": 0.7080912590026855, + "learning_rate": 0.00010452099809582646, + "loss": 3.0945, + "step": 44436 + }, + { + "epoch": 2.18, + "grad_norm": 0.7158118486404419, + "learning_rate": 0.00010450931933277795, + "loss": 2.9047, + "step": 44437 + }, + { + "epoch": 2.18, + "grad_norm": 0.6851445436477661, + "learning_rate": 0.00010449764108461425, + "loss": 2.8758, + "step": 44438 + }, + { + "epoch": 2.18, + "grad_norm": 0.7490003705024719, + "learning_rate": 0.00010448596335136615, + "loss": 2.8528, + "step": 44439 + }, + { + "epoch": 2.18, + "grad_norm": 0.7051994800567627, + "learning_rate": 0.00010447428613306428, + "loss": 3.0044, + "step": 44440 + }, + { + "epoch": 2.18, + "grad_norm": 0.7388154864311218, + "learning_rate": 0.00010446260942973953, + "loss": 3.0324, + "step": 44441 + }, + { + "epoch": 2.18, + "grad_norm": 0.7008410096168518, + "learning_rate": 0.00010445093324142272, + "loss": 3.035, + "step": 44442 + }, + { + "epoch": 2.18, + "grad_norm": 0.6961836218833923, + "learning_rate": 0.00010443925756814442, + "loss": 3.0496, + "step": 44443 + }, + { + "epoch": 2.18, + "grad_norm": 0.7032221555709839, + "learning_rate": 0.00010442758240993557, + "loss": 2.9266, + "step": 44444 + }, + { + "epoch": 2.18, + "grad_norm": 0.7082571387290955, + "learning_rate": 0.00010441590776682686, + "loss": 3.0056, + "step": 44445 + }, + { + "epoch": 2.18, + "grad_norm": 0.7039412260055542, + "learning_rate": 0.00010440423363884888, + "loss": 2.8115, + "step": 44446 + }, + { + "epoch": 2.18, + "grad_norm": 0.6998090744018555, + "learning_rate": 0.00010439256002603261, + "loss": 3.0467, + "step": 44447 + }, + { + "epoch": 2.18, + "grad_norm": 0.7542231678962708, + "learning_rate": 0.0001043808869284086, + "loss": 2.9567, + "step": 44448 + }, + { + "epoch": 2.18, + "grad_norm": 0.7332549095153809, + "learning_rate": 0.00010436921434600778, + "loss": 2.8982, + "step": 44449 + }, + { + "epoch": 2.18, + "grad_norm": 0.6957454681396484, + "learning_rate": 0.00010435754227886072, + "loss": 2.8315, + "step": 44450 + }, + { + "epoch": 2.18, + "grad_norm": 0.715060293674469, + "learning_rate": 0.00010434587072699832, + "loss": 2.8012, + "step": 44451 + }, + { + "epoch": 2.18, + "grad_norm": 0.7155557870864868, + "learning_rate": 0.00010433419969045122, + "loss": 3.2461, + "step": 44452 + }, + { + "epoch": 2.18, + "grad_norm": 0.7036327123641968, + "learning_rate": 0.00010432252916925012, + "loss": 3.0977, + "step": 44453 + }, + { + "epoch": 2.18, + "grad_norm": 0.7026028037071228, + "learning_rate": 0.00010431085916342589, + "loss": 2.854, + "step": 44454 + }, + { + "epoch": 2.18, + "grad_norm": 0.7156589031219482, + "learning_rate": 0.00010429918967300912, + "loss": 2.8383, + "step": 44455 + }, + { + "epoch": 2.18, + "grad_norm": 0.7051274180412292, + "learning_rate": 0.00010428752069803063, + "loss": 2.9757, + "step": 44456 + }, + { + "epoch": 2.18, + "grad_norm": 0.6463907957077026, + "learning_rate": 0.00010427585223852122, + "loss": 2.9478, + "step": 44457 + }, + { + "epoch": 2.18, + "grad_norm": 0.7040029764175415, + "learning_rate": 0.00010426418429451144, + "loss": 2.6731, + "step": 44458 + }, + { + "epoch": 2.18, + "grad_norm": 0.7455715537071228, + "learning_rate": 0.00010425251686603225, + "loss": 2.8721, + "step": 44459 + }, + { + "epoch": 2.18, + "grad_norm": 0.7634180784225464, + "learning_rate": 0.00010424084995311426, + "loss": 3.066, + "step": 44460 + }, + { + "epoch": 2.18, + "grad_norm": 0.7320479154586792, + "learning_rate": 0.0001042291835557881, + "loss": 2.9601, + "step": 44461 + }, + { + "epoch": 2.18, + "grad_norm": 0.7387556433677673, + "learning_rate": 0.00010421751767408468, + "loss": 2.9843, + "step": 44462 + }, + { + "epoch": 2.18, + "grad_norm": 0.7048659324645996, + "learning_rate": 0.00010420585230803452, + "loss": 2.8313, + "step": 44463 + }, + { + "epoch": 2.18, + "grad_norm": 0.7122004628181458, + "learning_rate": 0.00010419418745766858, + "loss": 2.9771, + "step": 44464 + }, + { + "epoch": 2.18, + "grad_norm": 0.7229511141777039, + "learning_rate": 0.0001041825231230174, + "loss": 2.9935, + "step": 44465 + }, + { + "epoch": 2.18, + "grad_norm": 0.7182523608207703, + "learning_rate": 0.00010417085930411185, + "loss": 2.9483, + "step": 44466 + }, + { + "epoch": 2.18, + "grad_norm": 0.7179908156394958, + "learning_rate": 0.00010415919600098255, + "loss": 2.85, + "step": 44467 + }, + { + "epoch": 2.18, + "grad_norm": 0.7596323490142822, + "learning_rate": 0.00010414753321366015, + "loss": 2.7197, + "step": 44468 + }, + { + "epoch": 2.18, + "grad_norm": 0.7082833051681519, + "learning_rate": 0.00010413587094217552, + "loss": 2.9153, + "step": 44469 + }, + { + "epoch": 2.18, + "grad_norm": 0.6668033003807068, + "learning_rate": 0.00010412420918655925, + "loss": 3.0079, + "step": 44470 + }, + { + "epoch": 2.18, + "grad_norm": 0.7020968794822693, + "learning_rate": 0.00010411254794684212, + "loss": 2.8986, + "step": 44471 + }, + { + "epoch": 2.18, + "grad_norm": 0.6882535815238953, + "learning_rate": 0.00010410088722305493, + "loss": 2.9372, + "step": 44472 + }, + { + "epoch": 2.18, + "grad_norm": 0.7956342101097107, + "learning_rate": 0.0001040892270152283, + "loss": 3.0606, + "step": 44473 + }, + { + "epoch": 2.18, + "grad_norm": 0.7361034750938416, + "learning_rate": 0.00010407756732339293, + "loss": 2.9101, + "step": 44474 + }, + { + "epoch": 2.18, + "grad_norm": 0.7044380903244019, + "learning_rate": 0.00010406590814757946, + "loss": 2.798, + "step": 44475 + }, + { + "epoch": 2.18, + "grad_norm": 0.6986120343208313, + "learning_rate": 0.00010405424948781869, + "loss": 2.7251, + "step": 44476 + }, + { + "epoch": 2.18, + "grad_norm": 0.712506890296936, + "learning_rate": 0.0001040425913441414, + "loss": 2.9249, + "step": 44477 + }, + { + "epoch": 2.18, + "grad_norm": 0.7093831896781921, + "learning_rate": 0.00010403093371657807, + "loss": 2.7835, + "step": 44478 + }, + { + "epoch": 2.18, + "grad_norm": 0.7020789384841919, + "learning_rate": 0.00010401927660515972, + "loss": 2.7917, + "step": 44479 + }, + { + "epoch": 2.18, + "grad_norm": 0.70029616355896, + "learning_rate": 0.00010400762000991675, + "loss": 3.0307, + "step": 44480 + }, + { + "epoch": 2.18, + "grad_norm": 0.7338650226593018, + "learning_rate": 0.00010399596393088007, + "loss": 3.0209, + "step": 44481 + }, + { + "epoch": 2.18, + "grad_norm": 0.700697124004364, + "learning_rate": 0.00010398430836808032, + "loss": 2.859, + "step": 44482 + }, + { + "epoch": 2.18, + "grad_norm": 0.6929804086685181, + "learning_rate": 0.00010397265332154807, + "loss": 2.814, + "step": 44483 + }, + { + "epoch": 2.18, + "grad_norm": 0.6898903250694275, + "learning_rate": 0.00010396099879131422, + "loss": 2.8862, + "step": 44484 + }, + { + "epoch": 2.18, + "grad_norm": 0.7237740755081177, + "learning_rate": 0.00010394934477740928, + "loss": 3.033, + "step": 44485 + }, + { + "epoch": 2.18, + "grad_norm": 0.6766157150268555, + "learning_rate": 0.00010393769127986403, + "loss": 2.9624, + "step": 44486 + }, + { + "epoch": 2.18, + "grad_norm": 0.7159266471862793, + "learning_rate": 0.00010392603829870928, + "loss": 2.8887, + "step": 44487 + }, + { + "epoch": 2.18, + "grad_norm": 0.7595884799957275, + "learning_rate": 0.0001039143858339756, + "loss": 3.0751, + "step": 44488 + }, + { + "epoch": 2.18, + "grad_norm": 0.6963126063346863, + "learning_rate": 0.00010390273388569367, + "loss": 2.8867, + "step": 44489 + }, + { + "epoch": 2.18, + "grad_norm": 0.810042142868042, + "learning_rate": 0.00010389108245389412, + "loss": 2.8458, + "step": 44490 + }, + { + "epoch": 2.18, + "grad_norm": 0.7496387362480164, + "learning_rate": 0.00010387943153860781, + "loss": 2.8515, + "step": 44491 + }, + { + "epoch": 2.18, + "grad_norm": 0.7184833884239197, + "learning_rate": 0.00010386778113986525, + "loss": 2.8776, + "step": 44492 + }, + { + "epoch": 2.18, + "grad_norm": 0.7305948734283447, + "learning_rate": 0.00010385613125769719, + "loss": 2.8932, + "step": 44493 + }, + { + "epoch": 2.18, + "grad_norm": 0.7090479135513306, + "learning_rate": 0.00010384448189213446, + "loss": 2.8305, + "step": 44494 + }, + { + "epoch": 2.18, + "grad_norm": 0.7010915279388428, + "learning_rate": 0.00010383283304320758, + "loss": 2.9246, + "step": 44495 + }, + { + "epoch": 2.18, + "grad_norm": 0.7132809162139893, + "learning_rate": 0.0001038211847109473, + "loss": 2.831, + "step": 44496 + }, + { + "epoch": 2.18, + "grad_norm": 0.7011138796806335, + "learning_rate": 0.00010380953689538413, + "loss": 2.9277, + "step": 44497 + }, + { + "epoch": 2.18, + "grad_norm": 0.7759160995483398, + "learning_rate": 0.00010379788959654889, + "loss": 2.8512, + "step": 44498 + }, + { + "epoch": 2.18, + "grad_norm": 0.7896947264671326, + "learning_rate": 0.00010378624281447236, + "loss": 2.8438, + "step": 44499 + }, + { + "epoch": 2.18, + "grad_norm": 0.7089138627052307, + "learning_rate": 0.000103774596549185, + "loss": 2.9216, + "step": 44500 + }, + { + "epoch": 2.18, + "grad_norm": 0.7258244752883911, + "learning_rate": 0.00010376295080071769, + "loss": 2.8647, + "step": 44501 + }, + { + "epoch": 2.18, + "grad_norm": 0.7142013311386108, + "learning_rate": 0.00010375130556910088, + "loss": 2.9314, + "step": 44502 + }, + { + "epoch": 2.18, + "grad_norm": 0.7016758322715759, + "learning_rate": 0.00010373966085436548, + "loss": 3.0253, + "step": 44503 + }, + { + "epoch": 2.18, + "grad_norm": 0.7166877388954163, + "learning_rate": 0.00010372801665654205, + "loss": 2.858, + "step": 44504 + }, + { + "epoch": 2.18, + "grad_norm": 0.6986833214759827, + "learning_rate": 0.00010371637297566112, + "loss": 3.0418, + "step": 44505 + }, + { + "epoch": 2.18, + "grad_norm": 0.7260124683380127, + "learning_rate": 0.00010370472981175361, + "loss": 2.9055, + "step": 44506 + }, + { + "epoch": 2.18, + "grad_norm": 0.7071568369865417, + "learning_rate": 0.00010369308716484999, + "loss": 2.8121, + "step": 44507 + }, + { + "epoch": 2.18, + "grad_norm": 0.6688234210014343, + "learning_rate": 0.00010368144503498097, + "loss": 3.0533, + "step": 44508 + }, + { + "epoch": 2.18, + "grad_norm": 0.6707281470298767, + "learning_rate": 0.00010366980342217734, + "loss": 3.061, + "step": 44509 + }, + { + "epoch": 2.18, + "grad_norm": 0.7021904587745667, + "learning_rate": 0.00010365816232646965, + "loss": 2.7661, + "step": 44510 + }, + { + "epoch": 2.18, + "grad_norm": 0.7200233936309814, + "learning_rate": 0.0001036465217478886, + "loss": 3.0572, + "step": 44511 + }, + { + "epoch": 2.18, + "grad_norm": 0.7074561715126038, + "learning_rate": 0.00010363488168646473, + "loss": 3.0943, + "step": 44512 + }, + { + "epoch": 2.18, + "grad_norm": 0.7289097309112549, + "learning_rate": 0.00010362324214222876, + "loss": 2.9338, + "step": 44513 + }, + { + "epoch": 2.18, + "grad_norm": 0.7148504257202148, + "learning_rate": 0.00010361160311521148, + "loss": 3.0845, + "step": 44514 + }, + { + "epoch": 2.18, + "grad_norm": 0.6821744441986084, + "learning_rate": 0.00010359996460544335, + "loss": 3.1103, + "step": 44515 + }, + { + "epoch": 2.18, + "grad_norm": 0.6762775182723999, + "learning_rate": 0.00010358832661295524, + "loss": 2.9369, + "step": 44516 + }, + { + "epoch": 2.18, + "grad_norm": 0.7053170204162598, + "learning_rate": 0.00010357668913777756, + "loss": 2.8441, + "step": 44517 + }, + { + "epoch": 2.18, + "grad_norm": 0.7255358099937439, + "learning_rate": 0.00010356505217994118, + "loss": 2.8484, + "step": 44518 + }, + { + "epoch": 2.18, + "grad_norm": 0.6944178938865662, + "learning_rate": 0.00010355341573947665, + "loss": 2.8011, + "step": 44519 + }, + { + "epoch": 2.18, + "grad_norm": 0.722078800201416, + "learning_rate": 0.00010354177981641449, + "loss": 2.8922, + "step": 44520 + }, + { + "epoch": 2.18, + "grad_norm": 0.7471203207969666, + "learning_rate": 0.00010353014441078558, + "loss": 2.8489, + "step": 44521 + }, + { + "epoch": 2.18, + "grad_norm": 0.6949812173843384, + "learning_rate": 0.00010351850952262036, + "loss": 2.863, + "step": 44522 + }, + { + "epoch": 2.18, + "grad_norm": 0.7270025610923767, + "learning_rate": 0.0001035068751519496, + "loss": 3.0245, + "step": 44523 + }, + { + "epoch": 2.18, + "grad_norm": 0.7315390110015869, + "learning_rate": 0.00010349524129880399, + "loss": 2.8959, + "step": 44524 + }, + { + "epoch": 2.18, + "grad_norm": 0.747600793838501, + "learning_rate": 0.00010348360796321411, + "loss": 2.7812, + "step": 44525 + }, + { + "epoch": 2.18, + "grad_norm": 0.7093079686164856, + "learning_rate": 0.00010347197514521058, + "loss": 2.6684, + "step": 44526 + }, + { + "epoch": 2.18, + "grad_norm": 0.7230992317199707, + "learning_rate": 0.00010346034284482392, + "loss": 2.9073, + "step": 44527 + }, + { + "epoch": 2.18, + "grad_norm": 0.6705207824707031, + "learning_rate": 0.00010344871106208492, + "loss": 2.9985, + "step": 44528 + }, + { + "epoch": 2.18, + "grad_norm": 0.6426035761833191, + "learning_rate": 0.00010343707979702429, + "loss": 2.997, + "step": 44529 + }, + { + "epoch": 2.18, + "grad_norm": 0.7323707938194275, + "learning_rate": 0.00010342544904967247, + "loss": 2.7661, + "step": 44530 + }, + { + "epoch": 2.18, + "grad_norm": 0.7027085423469543, + "learning_rate": 0.00010341381882006027, + "loss": 2.7511, + "step": 44531 + }, + { + "epoch": 2.18, + "grad_norm": 0.6900554299354553, + "learning_rate": 0.00010340218910821823, + "loss": 2.9398, + "step": 44532 + }, + { + "epoch": 2.18, + "grad_norm": 0.7302149534225464, + "learning_rate": 0.00010339055991417688, + "loss": 3.0751, + "step": 44533 + }, + { + "epoch": 2.18, + "grad_norm": 0.7446348071098328, + "learning_rate": 0.0001033789312379671, + "loss": 2.897, + "step": 44534 + }, + { + "epoch": 2.18, + "grad_norm": 0.7004778385162354, + "learning_rate": 0.00010336730307961925, + "loss": 2.9385, + "step": 44535 + }, + { + "epoch": 2.18, + "grad_norm": 0.6848006844520569, + "learning_rate": 0.00010335567543916417, + "loss": 3.0406, + "step": 44536 + }, + { + "epoch": 2.18, + "grad_norm": 0.6743120551109314, + "learning_rate": 0.00010334404831663232, + "loss": 2.8368, + "step": 44537 + }, + { + "epoch": 2.18, + "grad_norm": 0.6830146908760071, + "learning_rate": 0.00010333242171205436, + "loss": 2.7874, + "step": 44538 + }, + { + "epoch": 2.18, + "grad_norm": 0.6831412315368652, + "learning_rate": 0.00010332079562546108, + "loss": 2.5951, + "step": 44539 + }, + { + "epoch": 2.18, + "grad_norm": 0.7036148309707642, + "learning_rate": 0.00010330917005688297, + "loss": 2.7791, + "step": 44540 + }, + { + "epoch": 2.18, + "grad_norm": 0.7623158693313599, + "learning_rate": 0.00010329754500635067, + "loss": 2.9707, + "step": 44541 + }, + { + "epoch": 2.18, + "grad_norm": 0.6715494394302368, + "learning_rate": 0.00010328592047389464, + "loss": 3.103, + "step": 44542 + }, + { + "epoch": 2.18, + "grad_norm": 0.7432985305786133, + "learning_rate": 0.00010327429645954567, + "loss": 2.717, + "step": 44543 + }, + { + "epoch": 2.18, + "grad_norm": 0.7097046375274658, + "learning_rate": 0.00010326267296333445, + "loss": 3.0353, + "step": 44544 + }, + { + "epoch": 2.18, + "grad_norm": 0.7098416090011597, + "learning_rate": 0.00010325104998529136, + "loss": 2.8349, + "step": 44545 + }, + { + "epoch": 2.18, + "grad_norm": 0.719925582408905, + "learning_rate": 0.00010323942752544725, + "loss": 2.8611, + "step": 44546 + }, + { + "epoch": 2.18, + "grad_norm": 0.7523378133773804, + "learning_rate": 0.0001032278055838326, + "loss": 2.8775, + "step": 44547 + }, + { + "epoch": 2.18, + "grad_norm": 0.7218157649040222, + "learning_rate": 0.00010321618416047796, + "loss": 2.8731, + "step": 44548 + }, + { + "epoch": 2.18, + "grad_norm": 0.6982775926589966, + "learning_rate": 0.00010320456325541412, + "loss": 2.6777, + "step": 44549 + }, + { + "epoch": 2.18, + "grad_norm": 0.7226266264915466, + "learning_rate": 0.0001031929428686715, + "loss": 3.0359, + "step": 44550 + }, + { + "epoch": 2.18, + "grad_norm": 0.7298018932342529, + "learning_rate": 0.00010318132300028087, + "loss": 2.7433, + "step": 44551 + }, + { + "epoch": 2.18, + "grad_norm": 0.725264310836792, + "learning_rate": 0.00010316970365027267, + "loss": 2.873, + "step": 44552 + }, + { + "epoch": 2.18, + "grad_norm": 0.70024573802948, + "learning_rate": 0.00010315808481867765, + "loss": 3.1367, + "step": 44553 + }, + { + "epoch": 2.18, + "grad_norm": 0.6710588335990906, + "learning_rate": 0.0001031464665055264, + "loss": 2.8933, + "step": 44554 + }, + { + "epoch": 2.18, + "grad_norm": 0.7196180820465088, + "learning_rate": 0.00010313484871084935, + "loss": 2.9405, + "step": 44555 + }, + { + "epoch": 2.18, + "grad_norm": 0.6892144083976746, + "learning_rate": 0.00010312323143467734, + "loss": 2.879, + "step": 44556 + }, + { + "epoch": 2.18, + "grad_norm": 0.7082641124725342, + "learning_rate": 0.00010311161467704073, + "loss": 2.9238, + "step": 44557 + }, + { + "epoch": 2.18, + "grad_norm": 0.709845244884491, + "learning_rate": 0.00010309999843797025, + "loss": 2.761, + "step": 44558 + }, + { + "epoch": 2.18, + "grad_norm": 0.7715518474578857, + "learning_rate": 0.00010308838271749657, + "loss": 2.8075, + "step": 44559 + }, + { + "epoch": 2.18, + "grad_norm": 0.6767800450325012, + "learning_rate": 0.0001030767675156501, + "loss": 2.8401, + "step": 44560 + }, + { + "epoch": 2.18, + "grad_norm": 0.74344402551651, + "learning_rate": 0.00010306515283246161, + "loss": 2.967, + "step": 44561 + }, + { + "epoch": 2.18, + "grad_norm": 0.7046286463737488, + "learning_rate": 0.00010305353866796163, + "loss": 3.0008, + "step": 44562 + }, + { + "epoch": 2.18, + "grad_norm": 0.6904906034469604, + "learning_rate": 0.00010304192502218059, + "loss": 2.8009, + "step": 44563 + }, + { + "epoch": 2.18, + "grad_norm": 0.7006396055221558, + "learning_rate": 0.00010303031189514934, + "loss": 2.9043, + "step": 44564 + }, + { + "epoch": 2.18, + "grad_norm": 0.6934382915496826, + "learning_rate": 0.00010301869928689824, + "loss": 2.9064, + "step": 44565 + }, + { + "epoch": 2.18, + "grad_norm": 0.7191388010978699, + "learning_rate": 0.00010300708719745807, + "loss": 2.9023, + "step": 44566 + }, + { + "epoch": 2.18, + "grad_norm": 0.6918665170669556, + "learning_rate": 0.00010299547562685924, + "loss": 2.9055, + "step": 44567 + }, + { + "epoch": 2.18, + "grad_norm": 0.7260489463806152, + "learning_rate": 0.00010298386457513247, + "loss": 3.0175, + "step": 44568 + }, + { + "epoch": 2.18, + "grad_norm": 0.688124418258667, + "learning_rate": 0.00010297225404230834, + "loss": 2.9031, + "step": 44569 + }, + { + "epoch": 2.18, + "grad_norm": 0.6864445805549622, + "learning_rate": 0.00010296064402841724, + "loss": 2.9554, + "step": 44570 + }, + { + "epoch": 2.18, + "grad_norm": 0.6903675198554993, + "learning_rate": 0.00010294903453349, + "loss": 2.8524, + "step": 44571 + }, + { + "epoch": 2.18, + "grad_norm": 0.6836926937103271, + "learning_rate": 0.00010293742555755695, + "loss": 3.106, + "step": 44572 + }, + { + "epoch": 2.18, + "grad_norm": 0.6697653532028198, + "learning_rate": 0.00010292581710064892, + "loss": 2.6941, + "step": 44573 + }, + { + "epoch": 2.18, + "grad_norm": 0.7098059058189392, + "learning_rate": 0.00010291420916279625, + "loss": 2.8763, + "step": 44574 + }, + { + "epoch": 2.18, + "grad_norm": 0.6917982697486877, + "learning_rate": 0.00010290260174402974, + "loss": 2.9143, + "step": 44575 + }, + { + "epoch": 2.18, + "grad_norm": 0.7418469786643982, + "learning_rate": 0.00010289099484437982, + "loss": 2.8845, + "step": 44576 + }, + { + "epoch": 2.18, + "grad_norm": 0.6864647269248962, + "learning_rate": 0.00010287938846387701, + "loss": 2.9388, + "step": 44577 + }, + { + "epoch": 2.18, + "grad_norm": 0.6718887686729431, + "learning_rate": 0.00010286778260255203, + "loss": 3.0018, + "step": 44578 + }, + { + "epoch": 2.18, + "grad_norm": 0.7265290021896362, + "learning_rate": 0.00010285617726043529, + "loss": 2.9683, + "step": 44579 + }, + { + "epoch": 2.18, + "grad_norm": 0.6988957524299622, + "learning_rate": 0.00010284457243755743, + "loss": 2.8479, + "step": 44580 + }, + { + "epoch": 2.18, + "grad_norm": 0.7348703145980835, + "learning_rate": 0.00010283296813394911, + "loss": 2.7909, + "step": 44581 + }, + { + "epoch": 2.18, + "grad_norm": 0.7384020090103149, + "learning_rate": 0.00010282136434964072, + "loss": 2.8118, + "step": 44582 + }, + { + "epoch": 2.18, + "grad_norm": 0.7310593724250793, + "learning_rate": 0.00010280976108466301, + "loss": 2.8275, + "step": 44583 + }, + { + "epoch": 2.18, + "grad_norm": 0.7117740511894226, + "learning_rate": 0.00010279815833904645, + "loss": 2.8532, + "step": 44584 + }, + { + "epoch": 2.19, + "grad_norm": 0.6952208280563354, + "learning_rate": 0.0001027865561128215, + "loss": 2.6818, + "step": 44585 + }, + { + "epoch": 2.19, + "grad_norm": 0.6998794078826904, + "learning_rate": 0.00010277495440601887, + "loss": 2.7961, + "step": 44586 + }, + { + "epoch": 2.19, + "grad_norm": 0.71541827917099, + "learning_rate": 0.00010276335321866896, + "loss": 3.0561, + "step": 44587 + }, + { + "epoch": 2.19, + "grad_norm": 0.7112535834312439, + "learning_rate": 0.00010275175255080251, + "loss": 2.9963, + "step": 44588 + }, + { + "epoch": 2.19, + "grad_norm": 0.7849178314208984, + "learning_rate": 0.00010274015240244992, + "loss": 2.8191, + "step": 44589 + }, + { + "epoch": 2.19, + "grad_norm": 0.7260420918464661, + "learning_rate": 0.00010272855277364188, + "loss": 2.6095, + "step": 44590 + }, + { + "epoch": 2.19, + "grad_norm": 0.6618551015853882, + "learning_rate": 0.00010271695366440885, + "loss": 2.9215, + "step": 44591 + }, + { + "epoch": 2.19, + "grad_norm": 0.6712572574615479, + "learning_rate": 0.00010270535507478135, + "loss": 2.9213, + "step": 44592 + }, + { + "epoch": 2.19, + "grad_norm": 0.7594893574714661, + "learning_rate": 0.00010269375700479002, + "loss": 2.7903, + "step": 44593 + }, + { + "epoch": 2.19, + "grad_norm": 0.7108433246612549, + "learning_rate": 0.00010268215945446529, + "loss": 2.9075, + "step": 44594 + }, + { + "epoch": 2.19, + "grad_norm": 0.725123941898346, + "learning_rate": 0.00010267056242383777, + "loss": 2.88, + "step": 44595 + }, + { + "epoch": 2.19, + "grad_norm": 0.6654282808303833, + "learning_rate": 0.0001026589659129381, + "loss": 2.8843, + "step": 44596 + }, + { + "epoch": 2.19, + "grad_norm": 0.751349687576294, + "learning_rate": 0.00010264736992179675, + "loss": 3.0375, + "step": 44597 + }, + { + "epoch": 2.19, + "grad_norm": 0.7495384216308594, + "learning_rate": 0.00010263577445044425, + "loss": 2.9641, + "step": 44598 + }, + { + "epoch": 2.19, + "grad_norm": 0.7475329041481018, + "learning_rate": 0.00010262417949891104, + "loss": 2.908, + "step": 44599 + }, + { + "epoch": 2.19, + "grad_norm": 0.6864625811576843, + "learning_rate": 0.00010261258506722772, + "loss": 2.9052, + "step": 44600 + }, + { + "epoch": 2.19, + "grad_norm": 0.6639381647109985, + "learning_rate": 0.00010260099115542499, + "loss": 2.8416, + "step": 44601 + }, + { + "epoch": 2.19, + "grad_norm": 0.7066051959991455, + "learning_rate": 0.00010258939776353316, + "loss": 2.7455, + "step": 44602 + }, + { + "epoch": 2.19, + "grad_norm": 0.6839228868484497, + "learning_rate": 0.00010257780489158295, + "loss": 3.0094, + "step": 44603 + }, + { + "epoch": 2.19, + "grad_norm": 0.7170974016189575, + "learning_rate": 0.00010256621253960474, + "loss": 2.9012, + "step": 44604 + }, + { + "epoch": 2.19, + "grad_norm": 0.7041524052619934, + "learning_rate": 0.00010255462070762921, + "loss": 3.0658, + "step": 44605 + }, + { + "epoch": 2.19, + "grad_norm": 0.6791936159133911, + "learning_rate": 0.0001025430293956868, + "loss": 2.9157, + "step": 44606 + }, + { + "epoch": 2.19, + "grad_norm": 0.7103440165519714, + "learning_rate": 0.00010253143860380793, + "loss": 2.9272, + "step": 44607 + }, + { + "epoch": 2.19, + "grad_norm": 0.6542874574661255, + "learning_rate": 0.00010251984833202337, + "loss": 2.9294, + "step": 44608 + }, + { + "epoch": 2.19, + "grad_norm": 0.7594596147537231, + "learning_rate": 0.00010250825858036346, + "loss": 2.8874, + "step": 44609 + }, + { + "epoch": 2.19, + "grad_norm": 0.7474169731140137, + "learning_rate": 0.00010249666934885874, + "loss": 2.8206, + "step": 44610 + }, + { + "epoch": 2.19, + "grad_norm": 0.7913607954978943, + "learning_rate": 0.0001024850806375399, + "loss": 2.8971, + "step": 44611 + }, + { + "epoch": 2.19, + "grad_norm": 0.6927233934402466, + "learning_rate": 0.00010247349244643734, + "loss": 2.7587, + "step": 44612 + }, + { + "epoch": 2.19, + "grad_norm": 0.6824236512184143, + "learning_rate": 0.0001024619047755816, + "loss": 2.9516, + "step": 44613 + }, + { + "epoch": 2.19, + "grad_norm": 0.7336830496788025, + "learning_rate": 0.00010245031762500306, + "loss": 2.9245, + "step": 44614 + }, + { + "epoch": 2.19, + "grad_norm": 0.7069776654243469, + "learning_rate": 0.0001024387309947324, + "loss": 2.7558, + "step": 44615 + }, + { + "epoch": 2.19, + "grad_norm": 0.6687659621238708, + "learning_rate": 0.00010242714488480018, + "loss": 2.8424, + "step": 44616 + }, + { + "epoch": 2.19, + "grad_norm": 0.7121585607528687, + "learning_rate": 0.00010241555929523673, + "loss": 2.8238, + "step": 44617 + }, + { + "epoch": 2.19, + "grad_norm": 0.6747063398361206, + "learning_rate": 0.00010240397422607277, + "loss": 3.1629, + "step": 44618 + }, + { + "epoch": 2.19, + "grad_norm": 0.7251124978065491, + "learning_rate": 0.00010239238967733865, + "loss": 2.8919, + "step": 44619 + }, + { + "epoch": 2.19, + "grad_norm": 0.6380995512008667, + "learning_rate": 0.000102380805649065, + "loss": 2.9561, + "step": 44620 + }, + { + "epoch": 2.19, + "grad_norm": 0.7634221315383911, + "learning_rate": 0.0001023692221412823, + "loss": 2.9855, + "step": 44621 + }, + { + "epoch": 2.19, + "grad_norm": 0.6780490875244141, + "learning_rate": 0.00010235763915402094, + "loss": 2.9611, + "step": 44622 + }, + { + "epoch": 2.19, + "grad_norm": 0.715461254119873, + "learning_rate": 0.0001023460566873116, + "loss": 2.8342, + "step": 44623 + }, + { + "epoch": 2.19, + "grad_norm": 0.6985108852386475, + "learning_rate": 0.0001023344747411846, + "loss": 2.9235, + "step": 44624 + }, + { + "epoch": 2.19, + "grad_norm": 0.7491456866264343, + "learning_rate": 0.00010232289331567057, + "loss": 2.9031, + "step": 44625 + }, + { + "epoch": 2.19, + "grad_norm": 0.709112823009491, + "learning_rate": 0.00010231131241080009, + "loss": 2.8829, + "step": 44626 + }, + { + "epoch": 2.19, + "grad_norm": 0.6923708319664001, + "learning_rate": 0.00010229973202660358, + "loss": 3.0084, + "step": 44627 + }, + { + "epoch": 2.19, + "grad_norm": 0.7466208934783936, + "learning_rate": 0.00010228815216311152, + "loss": 2.9558, + "step": 44628 + }, + { + "epoch": 2.19, + "grad_norm": 0.7512304782867432, + "learning_rate": 0.00010227657282035431, + "loss": 2.763, + "step": 44629 + }, + { + "epoch": 2.19, + "grad_norm": 0.7179058790206909, + "learning_rate": 0.00010226499399836256, + "loss": 2.9326, + "step": 44630 + }, + { + "epoch": 2.19, + "grad_norm": 0.7273489236831665, + "learning_rate": 0.00010225341569716686, + "loss": 2.8466, + "step": 44631 + }, + { + "epoch": 2.19, + "grad_norm": 0.6925496459007263, + "learning_rate": 0.00010224183791679751, + "loss": 3.0426, + "step": 44632 + }, + { + "epoch": 2.19, + "grad_norm": 0.7936264276504517, + "learning_rate": 0.00010223026065728517, + "loss": 2.8253, + "step": 44633 + }, + { + "epoch": 2.19, + "grad_norm": 0.7013174295425415, + "learning_rate": 0.00010221868391866031, + "loss": 2.9166, + "step": 44634 + }, + { + "epoch": 2.19, + "grad_norm": 0.7175893187522888, + "learning_rate": 0.00010220710770095326, + "loss": 2.8768, + "step": 44635 + }, + { + "epoch": 2.19, + "grad_norm": 0.6898190975189209, + "learning_rate": 0.00010219553200419469, + "loss": 2.7969, + "step": 44636 + }, + { + "epoch": 2.19, + "grad_norm": 0.7262145280838013, + "learning_rate": 0.00010218395682841494, + "loss": 2.6368, + "step": 44637 + }, + { + "epoch": 2.19, + "grad_norm": 0.693634033203125, + "learning_rate": 0.00010217238217364471, + "loss": 3.0816, + "step": 44638 + }, + { + "epoch": 2.19, + "grad_norm": 0.7244294285774231, + "learning_rate": 0.0001021608080399142, + "loss": 2.8939, + "step": 44639 + }, + { + "epoch": 2.19, + "grad_norm": 0.6989253163337708, + "learning_rate": 0.00010214923442725407, + "loss": 2.8912, + "step": 44640 + }, + { + "epoch": 2.19, + "grad_norm": 0.694697380065918, + "learning_rate": 0.00010213766133569488, + "loss": 2.85, + "step": 44641 + }, + { + "epoch": 2.19, + "grad_norm": 0.7070184350013733, + "learning_rate": 0.000102126088765267, + "loss": 3.0055, + "step": 44642 + }, + { + "epoch": 2.19, + "grad_norm": 0.7489998936653137, + "learning_rate": 0.00010211451671600095, + "loss": 2.6904, + "step": 44643 + }, + { + "epoch": 2.19, + "grad_norm": 0.7626986503601074, + "learning_rate": 0.00010210294518792708, + "loss": 2.8428, + "step": 44644 + }, + { + "epoch": 2.19, + "grad_norm": 0.6796693801879883, + "learning_rate": 0.00010209137418107604, + "loss": 3.1733, + "step": 44645 + }, + { + "epoch": 2.19, + "grad_norm": 0.7073202133178711, + "learning_rate": 0.00010207980369547814, + "loss": 3.0606, + "step": 44646 + }, + { + "epoch": 2.19, + "grad_norm": 0.7209928035736084, + "learning_rate": 0.00010206823373116391, + "loss": 2.6608, + "step": 44647 + }, + { + "epoch": 2.19, + "grad_norm": 0.6928178668022156, + "learning_rate": 0.00010205666428816403, + "loss": 2.8192, + "step": 44648 + }, + { + "epoch": 2.19, + "grad_norm": 0.8096620440483093, + "learning_rate": 0.00010204509536650876, + "loss": 2.9148, + "step": 44649 + }, + { + "epoch": 2.19, + "grad_norm": 0.75314861536026, + "learning_rate": 0.00010203352696622862, + "loss": 2.6021, + "step": 44650 + }, + { + "epoch": 2.19, + "grad_norm": 0.7307797074317932, + "learning_rate": 0.00010202195908735396, + "loss": 2.9043, + "step": 44651 + }, + { + "epoch": 2.19, + "grad_norm": 0.7347580790519714, + "learning_rate": 0.00010201039172991536, + "loss": 2.7061, + "step": 44652 + }, + { + "epoch": 2.19, + "grad_norm": 0.7171032428741455, + "learning_rate": 0.00010199882489394339, + "loss": 3.0612, + "step": 44653 + }, + { + "epoch": 2.19, + "grad_norm": 0.7112158536911011, + "learning_rate": 0.0001019872585794683, + "loss": 2.8196, + "step": 44654 + }, + { + "epoch": 2.19, + "grad_norm": 0.6790067553520203, + "learning_rate": 0.0001019756927865208, + "loss": 2.8819, + "step": 44655 + }, + { + "epoch": 2.19, + "grad_norm": 0.7215171456336975, + "learning_rate": 0.00010196412751513117, + "loss": 2.9303, + "step": 44656 + }, + { + "epoch": 2.19, + "grad_norm": 0.7309430241584778, + "learning_rate": 0.00010195256276532984, + "loss": 3.0757, + "step": 44657 + }, + { + "epoch": 2.19, + "grad_norm": 0.7545703649520874, + "learning_rate": 0.00010194099853714742, + "loss": 2.9345, + "step": 44658 + }, + { + "epoch": 2.19, + "grad_norm": 0.7092612981796265, + "learning_rate": 0.00010192943483061423, + "loss": 2.8953, + "step": 44659 + }, + { + "epoch": 2.19, + "grad_norm": 0.6743860840797424, + "learning_rate": 0.00010191787164576088, + "loss": 3.0049, + "step": 44660 + }, + { + "epoch": 2.19, + "grad_norm": 0.676878035068512, + "learning_rate": 0.00010190630898261759, + "loss": 2.8814, + "step": 44661 + }, + { + "epoch": 2.19, + "grad_norm": 0.7018594741821289, + "learning_rate": 0.000101894746841215, + "loss": 2.926, + "step": 44662 + }, + { + "epoch": 2.19, + "grad_norm": 0.6919565200805664, + "learning_rate": 0.00010188318522158362, + "loss": 3.0665, + "step": 44663 + }, + { + "epoch": 2.19, + "grad_norm": 0.7357789278030396, + "learning_rate": 0.00010187162412375378, + "loss": 2.9906, + "step": 44664 + }, + { + "epoch": 2.19, + "grad_norm": 0.7537810802459717, + "learning_rate": 0.00010186006354775598, + "loss": 2.9191, + "step": 44665 + }, + { + "epoch": 2.19, + "grad_norm": 0.7341231107711792, + "learning_rate": 0.00010184850349362051, + "loss": 3.0268, + "step": 44666 + }, + { + "epoch": 2.19, + "grad_norm": 0.6830435991287231, + "learning_rate": 0.00010183694396137797, + "loss": 2.8872, + "step": 44667 + }, + { + "epoch": 2.19, + "grad_norm": 0.7135305404663086, + "learning_rate": 0.00010182538495105886, + "loss": 2.8633, + "step": 44668 + }, + { + "epoch": 2.19, + "grad_norm": 0.7441238164901733, + "learning_rate": 0.00010181382646269347, + "loss": 2.9731, + "step": 44669 + }, + { + "epoch": 2.19, + "grad_norm": 0.7147581577301025, + "learning_rate": 0.00010180226849631244, + "loss": 2.7636, + "step": 44670 + }, + { + "epoch": 2.19, + "grad_norm": 0.7436352968215942, + "learning_rate": 0.00010179071105194603, + "loss": 2.9936, + "step": 44671 + }, + { + "epoch": 2.19, + "grad_norm": 0.7712357640266418, + "learning_rate": 0.0001017791541296247, + "loss": 2.7682, + "step": 44672 + }, + { + "epoch": 2.19, + "grad_norm": 0.6808838248252869, + "learning_rate": 0.00010176759772937898, + "loss": 2.7931, + "step": 44673 + }, + { + "epoch": 2.19, + "grad_norm": 0.6870922446250916, + "learning_rate": 0.0001017560418512392, + "loss": 2.9253, + "step": 44674 + }, + { + "epoch": 2.19, + "grad_norm": 0.7315462827682495, + "learning_rate": 0.0001017444864952359, + "loss": 3.0074, + "step": 44675 + }, + { + "epoch": 2.19, + "grad_norm": 0.7336083054542542, + "learning_rate": 0.00010173293166139942, + "loss": 2.8301, + "step": 44676 + }, + { + "epoch": 2.19, + "grad_norm": 0.7012789845466614, + "learning_rate": 0.00010172137734976032, + "loss": 2.7756, + "step": 44677 + }, + { + "epoch": 2.19, + "grad_norm": 0.704544723033905, + "learning_rate": 0.00010170982356034896, + "loss": 2.9755, + "step": 44678 + }, + { + "epoch": 2.19, + "grad_norm": 0.6943780183792114, + "learning_rate": 0.00010169827029319562, + "loss": 2.9111, + "step": 44679 + }, + { + "epoch": 2.19, + "grad_norm": 0.7300577759742737, + "learning_rate": 0.00010168671754833104, + "loss": 2.8051, + "step": 44680 + }, + { + "epoch": 2.19, + "grad_norm": 0.7285476922988892, + "learning_rate": 0.00010167516532578535, + "loss": 2.6524, + "step": 44681 + }, + { + "epoch": 2.19, + "grad_norm": 0.7168903946876526, + "learning_rate": 0.00010166361362558908, + "loss": 2.9434, + "step": 44682 + }, + { + "epoch": 2.19, + "grad_norm": 0.6996946930885315, + "learning_rate": 0.00010165206244777282, + "loss": 2.8507, + "step": 44683 + }, + { + "epoch": 2.19, + "grad_norm": 0.6721283197402954, + "learning_rate": 0.00010164051179236674, + "loss": 2.8241, + "step": 44684 + }, + { + "epoch": 2.19, + "grad_norm": 0.6588842272758484, + "learning_rate": 0.00010162896165940148, + "loss": 2.8318, + "step": 44685 + }, + { + "epoch": 2.19, + "grad_norm": 0.6902130842208862, + "learning_rate": 0.00010161741204890734, + "loss": 2.8502, + "step": 44686 + }, + { + "epoch": 2.19, + "grad_norm": 0.6781286001205444, + "learning_rate": 0.00010160586296091468, + "loss": 3.0355, + "step": 44687 + }, + { + "epoch": 2.19, + "grad_norm": 0.7113543748855591, + "learning_rate": 0.00010159431439545409, + "loss": 2.9687, + "step": 44688 + }, + { + "epoch": 2.19, + "grad_norm": 0.737820029258728, + "learning_rate": 0.00010158276635255579, + "loss": 2.955, + "step": 44689 + }, + { + "epoch": 2.19, + "grad_norm": 0.7128449082374573, + "learning_rate": 0.0001015712188322504, + "loss": 2.8879, + "step": 44690 + }, + { + "epoch": 2.19, + "grad_norm": 0.6655691862106323, + "learning_rate": 0.0001015596718345681, + "loss": 2.86, + "step": 44691 + }, + { + "epoch": 2.19, + "grad_norm": 0.7450001239776611, + "learning_rate": 0.00010154812535953957, + "loss": 2.9949, + "step": 44692 + }, + { + "epoch": 2.19, + "grad_norm": 0.6946448087692261, + "learning_rate": 0.0001015365794071951, + "loss": 3.2081, + "step": 44693 + }, + { + "epoch": 2.19, + "grad_norm": 0.6991720199584961, + "learning_rate": 0.00010152503397756493, + "loss": 2.9359, + "step": 44694 + }, + { + "epoch": 2.19, + "grad_norm": 0.7270272970199585, + "learning_rate": 0.00010151348907067978, + "loss": 2.7, + "step": 44695 + }, + { + "epoch": 2.19, + "grad_norm": 0.6899583339691162, + "learning_rate": 0.00010150194468656975, + "loss": 3.1077, + "step": 44696 + }, + { + "epoch": 2.19, + "grad_norm": 0.7620866894721985, + "learning_rate": 0.00010149040082526541, + "loss": 2.9878, + "step": 44697 + }, + { + "epoch": 2.19, + "grad_norm": 0.6977541446685791, + "learning_rate": 0.0001014788574867973, + "loss": 2.9698, + "step": 44698 + }, + { + "epoch": 2.19, + "grad_norm": 0.7355969548225403, + "learning_rate": 0.00010146731467119562, + "loss": 2.9087, + "step": 44699 + }, + { + "epoch": 2.19, + "grad_norm": 0.6769136190414429, + "learning_rate": 0.00010145577237849085, + "loss": 2.988, + "step": 44700 + }, + { + "epoch": 2.19, + "grad_norm": 0.6845611333847046, + "learning_rate": 0.00010144423060871322, + "loss": 2.7466, + "step": 44701 + }, + { + "epoch": 2.19, + "grad_norm": 0.6977527141571045, + "learning_rate": 0.00010143268936189332, + "loss": 2.8652, + "step": 44702 + }, + { + "epoch": 2.19, + "grad_norm": 0.7107285261154175, + "learning_rate": 0.00010142114863806158, + "loss": 3.1307, + "step": 44703 + }, + { + "epoch": 2.19, + "grad_norm": 0.7218692898750305, + "learning_rate": 0.00010140960843724822, + "loss": 2.9057, + "step": 44704 + }, + { + "epoch": 2.19, + "grad_norm": 0.6744379997253418, + "learning_rate": 0.00010139806875948385, + "loss": 2.8445, + "step": 44705 + }, + { + "epoch": 2.19, + "grad_norm": 0.7107676267623901, + "learning_rate": 0.00010138652960479864, + "loss": 3.0127, + "step": 44706 + }, + { + "epoch": 2.19, + "grad_norm": 0.7380881309509277, + "learning_rate": 0.00010137499097322315, + "loss": 2.743, + "step": 44707 + }, + { + "epoch": 2.19, + "grad_norm": 0.6708388924598694, + "learning_rate": 0.00010136345286478777, + "loss": 2.9925, + "step": 44708 + }, + { + "epoch": 2.19, + "grad_norm": 0.7307834625244141, + "learning_rate": 0.0001013519152795227, + "loss": 2.9303, + "step": 44709 + }, + { + "epoch": 2.19, + "grad_norm": 0.7044564485549927, + "learning_rate": 0.00010134037821745853, + "loss": 2.8851, + "step": 44710 + }, + { + "epoch": 2.19, + "grad_norm": 0.7049198746681213, + "learning_rate": 0.00010132884167862551, + "loss": 2.8379, + "step": 44711 + }, + { + "epoch": 2.19, + "grad_norm": 0.7499628067016602, + "learning_rate": 0.00010131730566305404, + "loss": 2.8606, + "step": 44712 + }, + { + "epoch": 2.19, + "grad_norm": 0.7331987023353577, + "learning_rate": 0.00010130577017077468, + "loss": 3.0568, + "step": 44713 + }, + { + "epoch": 2.19, + "grad_norm": 0.7273868322372437, + "learning_rate": 0.00010129423520181768, + "loss": 2.8243, + "step": 44714 + }, + { + "epoch": 2.19, + "grad_norm": 0.7468082308769226, + "learning_rate": 0.00010128270075621343, + "loss": 2.9944, + "step": 44715 + }, + { + "epoch": 2.19, + "grad_norm": 0.6693267226219177, + "learning_rate": 0.00010127116683399219, + "loss": 2.9631, + "step": 44716 + }, + { + "epoch": 2.19, + "grad_norm": 0.7424618005752563, + "learning_rate": 0.00010125963343518446, + "loss": 3.0344, + "step": 44717 + }, + { + "epoch": 2.19, + "grad_norm": 0.6935816407203674, + "learning_rate": 0.00010124810055982069, + "loss": 3.0167, + "step": 44718 + }, + { + "epoch": 2.19, + "grad_norm": 0.7153097987174988, + "learning_rate": 0.00010123656820793108, + "loss": 3.0504, + "step": 44719 + }, + { + "epoch": 2.19, + "grad_norm": 0.7567762136459351, + "learning_rate": 0.0001012250363795462, + "loss": 2.9657, + "step": 44720 + }, + { + "epoch": 2.19, + "grad_norm": 0.7075486183166504, + "learning_rate": 0.00010121350507469632, + "loss": 2.9507, + "step": 44721 + }, + { + "epoch": 2.19, + "grad_norm": 0.7259413599967957, + "learning_rate": 0.00010120197429341169, + "loss": 2.8704, + "step": 44722 + }, + { + "epoch": 2.19, + "grad_norm": 0.6765598654747009, + "learning_rate": 0.00010119044403572293, + "loss": 2.7903, + "step": 44723 + }, + { + "epoch": 2.19, + "grad_norm": 0.6906706690788269, + "learning_rate": 0.00010117891430166018, + "loss": 2.7619, + "step": 44724 + }, + { + "epoch": 2.19, + "grad_norm": 0.7371125221252441, + "learning_rate": 0.00010116738509125397, + "loss": 2.7714, + "step": 44725 + }, + { + "epoch": 2.19, + "grad_norm": 0.711427628993988, + "learning_rate": 0.00010115585640453453, + "loss": 2.9452, + "step": 44726 + }, + { + "epoch": 2.19, + "grad_norm": 0.6924580931663513, + "learning_rate": 0.00010114432824153239, + "loss": 2.8359, + "step": 44727 + }, + { + "epoch": 2.19, + "grad_norm": 0.7518722414970398, + "learning_rate": 0.00010113280060227773, + "loss": 2.8871, + "step": 44728 + }, + { + "epoch": 2.19, + "grad_norm": 0.671915590763092, + "learning_rate": 0.00010112127348680109, + "loss": 3.0578, + "step": 44729 + }, + { + "epoch": 2.19, + "grad_norm": 0.7346581220626831, + "learning_rate": 0.00010110974689513275, + "loss": 2.9824, + "step": 44730 + }, + { + "epoch": 2.19, + "grad_norm": 0.6971009373664856, + "learning_rate": 0.00010109822082730295, + "loss": 2.8534, + "step": 44731 + }, + { + "epoch": 2.19, + "grad_norm": 0.7431304454803467, + "learning_rate": 0.00010108669528334225, + "loss": 2.9716, + "step": 44732 + }, + { + "epoch": 2.19, + "grad_norm": 0.7102420330047607, + "learning_rate": 0.00010107517026328083, + "loss": 3.0343, + "step": 44733 + }, + { + "epoch": 2.19, + "grad_norm": 0.7316977381706238, + "learning_rate": 0.0001010636457671491, + "loss": 3.0913, + "step": 44734 + }, + { + "epoch": 2.19, + "grad_norm": 0.705828845500946, + "learning_rate": 0.00010105212179497757, + "loss": 3.0188, + "step": 44735 + }, + { + "epoch": 2.19, + "grad_norm": 0.7134692072868347, + "learning_rate": 0.00010104059834679643, + "loss": 3.1093, + "step": 44736 + }, + { + "epoch": 2.19, + "grad_norm": 0.7043414115905762, + "learning_rate": 0.00010102907542263608, + "loss": 2.8323, + "step": 44737 + }, + { + "epoch": 2.19, + "grad_norm": 0.694769561290741, + "learning_rate": 0.00010101755302252678, + "loss": 2.843, + "step": 44738 + }, + { + "epoch": 2.19, + "grad_norm": 0.7042750120162964, + "learning_rate": 0.00010100603114649892, + "loss": 2.9738, + "step": 44739 + }, + { + "epoch": 2.19, + "grad_norm": 0.7265387177467346, + "learning_rate": 0.000100994509794583, + "loss": 2.8053, + "step": 44740 + }, + { + "epoch": 2.19, + "grad_norm": 0.743010401725769, + "learning_rate": 0.00010098298896680912, + "loss": 2.7329, + "step": 44741 + }, + { + "epoch": 2.19, + "grad_norm": 0.6783542633056641, + "learning_rate": 0.00010097146866320784, + "loss": 3.1741, + "step": 44742 + }, + { + "epoch": 2.19, + "grad_norm": 0.6887816190719604, + "learning_rate": 0.00010095994888380932, + "loss": 2.9669, + "step": 44743 + }, + { + "epoch": 2.19, + "grad_norm": 0.6920436024665833, + "learning_rate": 0.00010094842962864411, + "loss": 3.0444, + "step": 44744 + }, + { + "epoch": 2.19, + "grad_norm": 0.6976600289344788, + "learning_rate": 0.0001009369108977424, + "loss": 3.287, + "step": 44745 + }, + { + "epoch": 2.19, + "grad_norm": 0.7294341921806335, + "learning_rate": 0.00010092539269113444, + "loss": 2.784, + "step": 44746 + }, + { + "epoch": 2.19, + "grad_norm": 0.687296986579895, + "learning_rate": 0.00010091387500885083, + "loss": 2.8358, + "step": 44747 + }, + { + "epoch": 2.19, + "grad_norm": 0.7157872319221497, + "learning_rate": 0.00010090235785092164, + "loss": 2.8321, + "step": 44748 + }, + { + "epoch": 2.19, + "grad_norm": 0.6645101308822632, + "learning_rate": 0.0001008908412173773, + "loss": 3.1118, + "step": 44749 + }, + { + "epoch": 2.19, + "grad_norm": 0.7363380193710327, + "learning_rate": 0.00010087932510824829, + "loss": 2.7542, + "step": 44750 + }, + { + "epoch": 2.19, + "grad_norm": 0.732443630695343, + "learning_rate": 0.00010086780952356481, + "loss": 3.1261, + "step": 44751 + }, + { + "epoch": 2.19, + "grad_norm": 0.7466281652450562, + "learning_rate": 0.00010085629446335718, + "loss": 2.8356, + "step": 44752 + }, + { + "epoch": 2.19, + "grad_norm": 0.705704391002655, + "learning_rate": 0.00010084477992765566, + "loss": 2.7202, + "step": 44753 + }, + { + "epoch": 2.19, + "grad_norm": 0.6960504055023193, + "learning_rate": 0.00010083326591649065, + "loss": 2.9017, + "step": 44754 + }, + { + "epoch": 2.19, + "grad_norm": 0.7246660590171814, + "learning_rate": 0.0001008217524298926, + "loss": 2.9993, + "step": 44755 + }, + { + "epoch": 2.19, + "grad_norm": 0.7061540484428406, + "learning_rate": 0.00010081023946789163, + "loss": 2.8255, + "step": 44756 + }, + { + "epoch": 2.19, + "grad_norm": 0.7059864401817322, + "learning_rate": 0.00010079872703051826, + "loss": 2.7661, + "step": 44757 + }, + { + "epoch": 2.19, + "grad_norm": 0.7149134278297424, + "learning_rate": 0.00010078721511780266, + "loss": 2.8085, + "step": 44758 + }, + { + "epoch": 2.19, + "grad_norm": 0.6905004978179932, + "learning_rate": 0.00010077570372977513, + "loss": 2.9624, + "step": 44759 + }, + { + "epoch": 2.19, + "grad_norm": 0.7025719285011292, + "learning_rate": 0.00010076419286646614, + "loss": 2.9586, + "step": 44760 + }, + { + "epoch": 2.19, + "grad_norm": 0.7020415663719177, + "learning_rate": 0.00010075268252790586, + "loss": 2.8406, + "step": 44761 + }, + { + "epoch": 2.19, + "grad_norm": 0.7688156366348267, + "learning_rate": 0.00010074117271412473, + "loss": 2.9296, + "step": 44762 + }, + { + "epoch": 2.19, + "grad_norm": 0.7104856967926025, + "learning_rate": 0.0001007296634251529, + "loss": 2.875, + "step": 44763 + }, + { + "epoch": 2.19, + "grad_norm": 0.7477237582206726, + "learning_rate": 0.0001007181546610208, + "loss": 2.8623, + "step": 44764 + }, + { + "epoch": 2.19, + "grad_norm": 0.7128356099128723, + "learning_rate": 0.00010070664642175885, + "loss": 2.9696, + "step": 44765 + }, + { + "epoch": 2.19, + "grad_norm": 0.730711042881012, + "learning_rate": 0.0001006951387073972, + "loss": 2.933, + "step": 44766 + }, + { + "epoch": 2.19, + "grad_norm": 0.7141094207763672, + "learning_rate": 0.0001006836315179662, + "loss": 2.9074, + "step": 44767 + }, + { + "epoch": 2.19, + "grad_norm": 0.6910660266876221, + "learning_rate": 0.00010067212485349609, + "loss": 2.9853, + "step": 44768 + }, + { + "epoch": 2.19, + "grad_norm": 0.6767820715904236, + "learning_rate": 0.00010066061871401721, + "loss": 2.9947, + "step": 44769 + }, + { + "epoch": 2.19, + "grad_norm": 0.7095887660980225, + "learning_rate": 0.00010064911309956001, + "loss": 2.7702, + "step": 44770 + }, + { + "epoch": 2.19, + "grad_norm": 0.7199665904045105, + "learning_rate": 0.00010063760801015457, + "loss": 2.8923, + "step": 44771 + }, + { + "epoch": 2.19, + "grad_norm": 0.7285208702087402, + "learning_rate": 0.0001006261034458314, + "loss": 2.9582, + "step": 44772 + }, + { + "epoch": 2.19, + "grad_norm": 0.7214688658714294, + "learning_rate": 0.00010061459940662073, + "loss": 2.7663, + "step": 44773 + }, + { + "epoch": 2.19, + "grad_norm": 0.7294226288795471, + "learning_rate": 0.00010060309589255273, + "loss": 3.0265, + "step": 44774 + }, + { + "epoch": 2.19, + "grad_norm": 0.7086969614028931, + "learning_rate": 0.00010059159290365788, + "loss": 2.8846, + "step": 44775 + }, + { + "epoch": 2.19, + "grad_norm": 0.7389560341835022, + "learning_rate": 0.00010058009043996628, + "loss": 2.8932, + "step": 44776 + }, + { + "epoch": 2.19, + "grad_norm": 0.6759814620018005, + "learning_rate": 0.00010056858850150849, + "loss": 2.9601, + "step": 44777 + }, + { + "epoch": 2.19, + "grad_norm": 0.7065625786781311, + "learning_rate": 0.00010055708708831451, + "loss": 2.806, + "step": 44778 + }, + { + "epoch": 2.19, + "grad_norm": 0.7095101475715637, + "learning_rate": 0.00010054558620041492, + "loss": 2.9055, + "step": 44779 + }, + { + "epoch": 2.19, + "grad_norm": 0.791003942489624, + "learning_rate": 0.00010053408583783984, + "loss": 2.9966, + "step": 44780 + }, + { + "epoch": 2.19, + "grad_norm": 0.7229596376419067, + "learning_rate": 0.00010052258600061953, + "loss": 2.7862, + "step": 44781 + }, + { + "epoch": 2.19, + "grad_norm": 0.7416842579841614, + "learning_rate": 0.0001005110866887844, + "loss": 2.8895, + "step": 44782 + }, + { + "epoch": 2.19, + "grad_norm": 0.737678050994873, + "learning_rate": 0.0001004995879023646, + "loss": 3.0674, + "step": 44783 + }, + { + "epoch": 2.19, + "grad_norm": 0.6982100605964661, + "learning_rate": 0.00010048808964139045, + "loss": 2.7872, + "step": 44784 + }, + { + "epoch": 2.19, + "grad_norm": 0.7631065845489502, + "learning_rate": 0.00010047659190589243, + "loss": 2.8958, + "step": 44785 + }, + { + "epoch": 2.19, + "grad_norm": 0.6911045908927917, + "learning_rate": 0.00010046509469590054, + "loss": 2.7337, + "step": 44786 + }, + { + "epoch": 2.19, + "grad_norm": 0.6925402283668518, + "learning_rate": 0.00010045359801144526, + "loss": 2.8405, + "step": 44787 + }, + { + "epoch": 2.19, + "grad_norm": 0.7005817890167236, + "learning_rate": 0.00010044210185255685, + "loss": 3.0554, + "step": 44788 + }, + { + "epoch": 2.2, + "grad_norm": 0.7038406133651733, + "learning_rate": 0.0001004306062192654, + "loss": 2.8066, + "step": 44789 + }, + { + "epoch": 2.2, + "grad_norm": 0.7171939015388489, + "learning_rate": 0.00010041911111160143, + "loss": 2.9208, + "step": 44790 + }, + { + "epoch": 2.2, + "grad_norm": 0.757029116153717, + "learning_rate": 0.00010040761652959505, + "loss": 2.892, + "step": 44791 + }, + { + "epoch": 2.2, + "grad_norm": 0.703034520149231, + "learning_rate": 0.00010039612247327666, + "loss": 2.908, + "step": 44792 + }, + { + "epoch": 2.2, + "grad_norm": 0.7032923698425293, + "learning_rate": 0.00010038462894267639, + "loss": 2.9547, + "step": 44793 + }, + { + "epoch": 2.2, + "grad_norm": 0.7094815373420715, + "learning_rate": 0.00010037313593782467, + "loss": 2.9108, + "step": 44794 + }, + { + "epoch": 2.2, + "grad_norm": 0.6859133839607239, + "learning_rate": 0.00010036164345875168, + "loss": 2.9368, + "step": 44795 + }, + { + "epoch": 2.2, + "grad_norm": 0.7258820533752441, + "learning_rate": 0.00010035015150548763, + "loss": 2.901, + "step": 44796 + }, + { + "epoch": 2.2, + "grad_norm": 0.7605149149894714, + "learning_rate": 0.00010033866007806296, + "loss": 2.7665, + "step": 44797 + }, + { + "epoch": 2.2, + "grad_norm": 0.7210118174552917, + "learning_rate": 0.00010032716917650771, + "loss": 3.0509, + "step": 44798 + }, + { + "epoch": 2.2, + "grad_norm": 0.7114245891571045, + "learning_rate": 0.0001003156788008523, + "loss": 2.6478, + "step": 44799 + }, + { + "epoch": 2.2, + "grad_norm": 0.7188319563865662, + "learning_rate": 0.00010030418895112701, + "loss": 2.9248, + "step": 44800 + }, + { + "epoch": 2.2, + "grad_norm": 0.7011023759841919, + "learning_rate": 0.00010029269962736209, + "loss": 2.8442, + "step": 44801 + }, + { + "epoch": 2.2, + "grad_norm": 0.7070204019546509, + "learning_rate": 0.00010028121082958777, + "loss": 2.9462, + "step": 44802 + }, + { + "epoch": 2.2, + "grad_norm": 0.7259615659713745, + "learning_rate": 0.0001002697225578342, + "loss": 3.1676, + "step": 44803 + }, + { + "epoch": 2.2, + "grad_norm": 0.6928755640983582, + "learning_rate": 0.00010025823481213183, + "loss": 2.977, + "step": 44804 + }, + { + "epoch": 2.2, + "grad_norm": 0.7410005331039429, + "learning_rate": 0.00010024674759251074, + "loss": 3.0249, + "step": 44805 + }, + { + "epoch": 2.2, + "grad_norm": 0.6871789693832397, + "learning_rate": 0.00010023526089900125, + "loss": 2.867, + "step": 44806 + }, + { + "epoch": 2.2, + "grad_norm": 0.6890696883201599, + "learning_rate": 0.00010022377473163378, + "loss": 3.096, + "step": 44807 + }, + { + "epoch": 2.2, + "grad_norm": 0.6946797966957092, + "learning_rate": 0.00010021228909043831, + "loss": 2.9074, + "step": 44808 + }, + { + "epoch": 2.2, + "grad_norm": 0.7045637369155884, + "learning_rate": 0.0001002008039754453, + "loss": 3.0301, + "step": 44809 + }, + { + "epoch": 2.2, + "grad_norm": 0.7553611397743225, + "learning_rate": 0.00010018931938668497, + "loss": 2.8607, + "step": 44810 + }, + { + "epoch": 2.2, + "grad_norm": 0.6659156680107117, + "learning_rate": 0.00010017783532418738, + "loss": 2.957, + "step": 44811 + }, + { + "epoch": 2.2, + "grad_norm": 0.7131342887878418, + "learning_rate": 0.00010016635178798304, + "loss": 2.9252, + "step": 44812 + }, + { + "epoch": 2.2, + "grad_norm": 0.741779625415802, + "learning_rate": 0.00010015486877810194, + "loss": 2.9722, + "step": 44813 + }, + { + "epoch": 2.2, + "grad_norm": 0.6962339282035828, + "learning_rate": 0.0001001433862945746, + "loss": 2.8588, + "step": 44814 + }, + { + "epoch": 2.2, + "grad_norm": 0.6637634038925171, + "learning_rate": 0.00010013190433743097, + "loss": 2.912, + "step": 44815 + }, + { + "epoch": 2.2, + "grad_norm": 0.6877065896987915, + "learning_rate": 0.00010012042290670158, + "loss": 3.0355, + "step": 44816 + }, + { + "epoch": 2.2, + "grad_norm": 0.7660030126571655, + "learning_rate": 0.00010010894200241648, + "loss": 3.0438, + "step": 44817 + }, + { + "epoch": 2.2, + "grad_norm": 0.711549699306488, + "learning_rate": 0.00010009746162460592, + "loss": 2.8984, + "step": 44818 + }, + { + "epoch": 2.2, + "grad_norm": 0.7448240518569946, + "learning_rate": 0.00010008598177330022, + "loss": 2.9206, + "step": 44819 + }, + { + "epoch": 2.2, + "grad_norm": 0.6869280934333801, + "learning_rate": 0.00010007450244852951, + "loss": 2.9734, + "step": 44820 + }, + { + "epoch": 2.2, + "grad_norm": 0.8840466141700745, + "learning_rate": 0.00010006302365032404, + "loss": 2.6196, + "step": 44821 + }, + { + "epoch": 2.2, + "grad_norm": 0.7302159070968628, + "learning_rate": 0.00010005154537871423, + "loss": 2.9981, + "step": 44822 + }, + { + "epoch": 2.2, + "grad_norm": 0.7159656882286072, + "learning_rate": 0.00010004006763373014, + "loss": 2.7305, + "step": 44823 + }, + { + "epoch": 2.2, + "grad_norm": 0.7011760473251343, + "learning_rate": 0.00010002859041540202, + "loss": 2.9915, + "step": 44824 + }, + { + "epoch": 2.2, + "grad_norm": 0.8024114370346069, + "learning_rate": 0.00010001711372376005, + "loss": 2.9443, + "step": 44825 + }, + { + "epoch": 2.2, + "grad_norm": 0.7106544971466064, + "learning_rate": 0.00010000563755883449, + "loss": 2.8404, + "step": 44826 + }, + { + "epoch": 2.2, + "grad_norm": 0.7113576531410217, + "learning_rate": 9.999416192065568e-05, + "loss": 3.0292, + "step": 44827 + }, + { + "epoch": 2.2, + "grad_norm": 0.7601901888847351, + "learning_rate": 9.998268680925368e-05, + "loss": 2.8893, + "step": 44828 + }, + { + "epoch": 2.2, + "grad_norm": 0.721528172492981, + "learning_rate": 9.997121222465886e-05, + "loss": 2.8252, + "step": 44829 + }, + { + "epoch": 2.2, + "grad_norm": 0.7221820950508118, + "learning_rate": 9.99597381669013e-05, + "loss": 2.986, + "step": 44830 + }, + { + "epoch": 2.2, + "grad_norm": 0.733727216720581, + "learning_rate": 9.994826463601136e-05, + "loss": 2.9532, + "step": 44831 + }, + { + "epoch": 2.2, + "grad_norm": 0.6860153675079346, + "learning_rate": 9.993679163201921e-05, + "loss": 2.9451, + "step": 44832 + }, + { + "epoch": 2.2, + "grad_norm": 0.7270806431770325, + "learning_rate": 9.992531915495494e-05, + "loss": 2.8611, + "step": 44833 + }, + { + "epoch": 2.2, + "grad_norm": 0.6891951560974121, + "learning_rate": 9.9913847204849e-05, + "loss": 3.1389, + "step": 44834 + }, + { + "epoch": 2.2, + "grad_norm": 0.7794495224952698, + "learning_rate": 9.990237578173135e-05, + "loss": 2.7893, + "step": 44835 + }, + { + "epoch": 2.2, + "grad_norm": 0.707525908946991, + "learning_rate": 9.989090488563233e-05, + "loss": 2.9123, + "step": 44836 + }, + { + "epoch": 2.2, + "grad_norm": 0.788293719291687, + "learning_rate": 9.987943451658228e-05, + "loss": 2.8435, + "step": 44837 + }, + { + "epoch": 2.2, + "grad_norm": 0.7396550178527832, + "learning_rate": 9.98679646746113e-05, + "loss": 2.9739, + "step": 44838 + }, + { + "epoch": 2.2, + "grad_norm": 0.7474350929260254, + "learning_rate": 9.985649535974954e-05, + "loss": 2.8834, + "step": 44839 + }, + { + "epoch": 2.2, + "grad_norm": 0.7349193096160889, + "learning_rate": 9.984502657202716e-05, + "loss": 2.9247, + "step": 44840 + }, + { + "epoch": 2.2, + "grad_norm": 0.6847119927406311, + "learning_rate": 9.983355831147446e-05, + "loss": 3.1132, + "step": 44841 + }, + { + "epoch": 2.2, + "grad_norm": 0.7022256255149841, + "learning_rate": 9.982209057812177e-05, + "loss": 2.9059, + "step": 44842 + }, + { + "epoch": 2.2, + "grad_norm": 0.7146331667900085, + "learning_rate": 9.981062337199905e-05, + "loss": 2.7936, + "step": 44843 + }, + { + "epoch": 2.2, + "grad_norm": 0.7405462265014648, + "learning_rate": 9.979915669313672e-05, + "loss": 2.8458, + "step": 44844 + }, + { + "epoch": 2.2, + "grad_norm": 0.7254295945167542, + "learning_rate": 9.97876905415648e-05, + "loss": 3.0363, + "step": 44845 + }, + { + "epoch": 2.2, + "grad_norm": 0.6980922222137451, + "learning_rate": 9.977622491731364e-05, + "loss": 3.0195, + "step": 44846 + }, + { + "epoch": 2.2, + "grad_norm": 0.6992781758308411, + "learning_rate": 9.97647598204134e-05, + "loss": 3.0519, + "step": 44847 + }, + { + "epoch": 2.2, + "grad_norm": 0.7055832147598267, + "learning_rate": 9.975329525089412e-05, + "loss": 2.8358, + "step": 44848 + }, + { + "epoch": 2.2, + "grad_norm": 0.6944383382797241, + "learning_rate": 9.974183120878624e-05, + "loss": 3.0116, + "step": 44849 + }, + { + "epoch": 2.2, + "grad_norm": 0.714631199836731, + "learning_rate": 9.973036769411972e-05, + "loss": 3.0383, + "step": 44850 + }, + { + "epoch": 2.2, + "grad_norm": 0.7210114598274231, + "learning_rate": 9.971890470692489e-05, + "loss": 2.9004, + "step": 44851 + }, + { + "epoch": 2.2, + "grad_norm": 0.7096450924873352, + "learning_rate": 9.970744224723201e-05, + "loss": 2.8675, + "step": 44852 + }, + { + "epoch": 2.2, + "grad_norm": 0.6983294486999512, + "learning_rate": 9.969598031507119e-05, + "loss": 3.1334, + "step": 44853 + }, + { + "epoch": 2.2, + "grad_norm": 0.6857281923294067, + "learning_rate": 9.968451891047259e-05, + "loss": 2.967, + "step": 44854 + }, + { + "epoch": 2.2, + "grad_norm": 0.6746247410774231, + "learning_rate": 9.967305803346633e-05, + "loss": 3.0097, + "step": 44855 + }, + { + "epoch": 2.2, + "grad_norm": 0.7361533045768738, + "learning_rate": 9.966159768408267e-05, + "loss": 2.8268, + "step": 44856 + }, + { + "epoch": 2.2, + "grad_norm": 0.7003144025802612, + "learning_rate": 9.965013786235192e-05, + "loss": 3.0137, + "step": 44857 + }, + { + "epoch": 2.2, + "grad_norm": 0.7251600623130798, + "learning_rate": 9.963867856830404e-05, + "loss": 2.7261, + "step": 44858 + }, + { + "epoch": 2.2, + "grad_norm": 0.7403432130813599, + "learning_rate": 9.962721980196942e-05, + "loss": 3.0198, + "step": 44859 + }, + { + "epoch": 2.2, + "grad_norm": 0.7124742865562439, + "learning_rate": 9.961576156337814e-05, + "loss": 2.8851, + "step": 44860 + }, + { + "epoch": 2.2, + "grad_norm": 0.6854257583618164, + "learning_rate": 9.960430385256029e-05, + "loss": 3.0093, + "step": 44861 + }, + { + "epoch": 2.2, + "grad_norm": 0.7400795221328735, + "learning_rate": 9.95928466695462e-05, + "loss": 3.0098, + "step": 44862 + }, + { + "epoch": 2.2, + "grad_norm": 0.7203598022460938, + "learning_rate": 9.958139001436593e-05, + "loss": 2.8234, + "step": 44863 + }, + { + "epoch": 2.2, + "grad_norm": 0.7296319007873535, + "learning_rate": 9.956993388704977e-05, + "loss": 3.0328, + "step": 44864 + }, + { + "epoch": 2.2, + "grad_norm": 0.7356207966804504, + "learning_rate": 9.955847828762778e-05, + "loss": 2.9695, + "step": 44865 + }, + { + "epoch": 2.2, + "grad_norm": 0.7014438509941101, + "learning_rate": 9.954702321613015e-05, + "loss": 3.0264, + "step": 44866 + }, + { + "epoch": 2.2, + "grad_norm": 0.740602970123291, + "learning_rate": 9.953556867258717e-05, + "loss": 2.8827, + "step": 44867 + }, + { + "epoch": 2.2, + "grad_norm": 0.699821412563324, + "learning_rate": 9.952411465702893e-05, + "loss": 2.8705, + "step": 44868 + }, + { + "epoch": 2.2, + "grad_norm": 0.7270916104316711, + "learning_rate": 9.95126611694856e-05, + "loss": 3.002, + "step": 44869 + }, + { + "epoch": 2.2, + "grad_norm": 0.6713517904281616, + "learning_rate": 9.950120820998721e-05, + "loss": 2.8885, + "step": 44870 + }, + { + "epoch": 2.2, + "grad_norm": 0.7152098417282104, + "learning_rate": 9.948975577856409e-05, + "loss": 3.0077, + "step": 44871 + }, + { + "epoch": 2.2, + "grad_norm": 0.703411340713501, + "learning_rate": 9.947830387524644e-05, + "loss": 2.8513, + "step": 44872 + }, + { + "epoch": 2.2, + "grad_norm": 0.7669411301612854, + "learning_rate": 9.946685250006427e-05, + "loss": 3.0245, + "step": 44873 + }, + { + "epoch": 2.2, + "grad_norm": 0.6936809420585632, + "learning_rate": 9.945540165304789e-05, + "loss": 2.8511, + "step": 44874 + }, + { + "epoch": 2.2, + "grad_norm": 0.7249215841293335, + "learning_rate": 9.944395133422741e-05, + "loss": 2.9093, + "step": 44875 + }, + { + "epoch": 2.2, + "grad_norm": 0.7058155536651611, + "learning_rate": 9.943250154363283e-05, + "loss": 3.0376, + "step": 44876 + }, + { + "epoch": 2.2, + "grad_norm": 0.7191633582115173, + "learning_rate": 9.942105228129456e-05, + "loss": 3.0574, + "step": 44877 + }, + { + "epoch": 2.2, + "grad_norm": 0.7496932744979858, + "learning_rate": 9.940960354724255e-05, + "loss": 2.8689, + "step": 44878 + }, + { + "epoch": 2.2, + "grad_norm": 0.7125669121742249, + "learning_rate": 9.939815534150711e-05, + "loss": 3.145, + "step": 44879 + }, + { + "epoch": 2.2, + "grad_norm": 0.7709475159645081, + "learning_rate": 9.938670766411825e-05, + "loss": 2.8035, + "step": 44880 + }, + { + "epoch": 2.2, + "grad_norm": 0.705838680267334, + "learning_rate": 9.937526051510629e-05, + "loss": 2.9948, + "step": 44881 + }, + { + "epoch": 2.2, + "grad_norm": 0.7062419056892395, + "learning_rate": 9.936381389450129e-05, + "loss": 2.7527, + "step": 44882 + }, + { + "epoch": 2.2, + "grad_norm": 0.7100111246109009, + "learning_rate": 9.935236780233325e-05, + "loss": 2.9641, + "step": 44883 + }, + { + "epoch": 2.2, + "grad_norm": 0.6903637051582336, + "learning_rate": 9.934092223863261e-05, + "loss": 2.9546, + "step": 44884 + }, + { + "epoch": 2.2, + "grad_norm": 0.6916725039482117, + "learning_rate": 9.932947720342921e-05, + "loss": 2.8971, + "step": 44885 + }, + { + "epoch": 2.2, + "grad_norm": 0.7668691277503967, + "learning_rate": 9.931803269675351e-05, + "loss": 2.7209, + "step": 44886 + }, + { + "epoch": 2.2, + "grad_norm": 0.6978892087936401, + "learning_rate": 9.930658871863533e-05, + "loss": 2.9635, + "step": 44887 + }, + { + "epoch": 2.2, + "grad_norm": 0.7086914777755737, + "learning_rate": 9.929514526910499e-05, + "loss": 3.0682, + "step": 44888 + }, + { + "epoch": 2.2, + "grad_norm": 0.7077818512916565, + "learning_rate": 9.928370234819274e-05, + "loss": 2.9384, + "step": 44889 + }, + { + "epoch": 2.2, + "grad_norm": 0.6918936967849731, + "learning_rate": 9.927225995592855e-05, + "loss": 2.7194, + "step": 44890 + }, + { + "epoch": 2.2, + "grad_norm": 0.7205008864402771, + "learning_rate": 9.926081809234262e-05, + "loss": 2.9189, + "step": 44891 + }, + { + "epoch": 2.2, + "grad_norm": 0.7491716146469116, + "learning_rate": 9.924937675746493e-05, + "loss": 3.0693, + "step": 44892 + }, + { + "epoch": 2.2, + "grad_norm": 0.7079654932022095, + "learning_rate": 9.923793595132577e-05, + "loss": 3.0083, + "step": 44893 + }, + { + "epoch": 2.2, + "grad_norm": 0.7141336798667908, + "learning_rate": 9.922649567395537e-05, + "loss": 2.8574, + "step": 44894 + }, + { + "epoch": 2.2, + "grad_norm": 0.7455933690071106, + "learning_rate": 9.921505592538362e-05, + "loss": 3.0319, + "step": 44895 + }, + { + "epoch": 2.2, + "grad_norm": 0.7107555270195007, + "learning_rate": 9.920361670564087e-05, + "loss": 2.7566, + "step": 44896 + }, + { + "epoch": 2.2, + "grad_norm": 0.7711759209632874, + "learning_rate": 9.919217801475715e-05, + "loss": 2.9126, + "step": 44897 + }, + { + "epoch": 2.2, + "grad_norm": 0.6815950870513916, + "learning_rate": 9.918073985276248e-05, + "loss": 2.7761, + "step": 44898 + }, + { + "epoch": 2.2, + "grad_norm": 0.7206303477287292, + "learning_rate": 9.916930221968724e-05, + "loss": 3.123, + "step": 44899 + }, + { + "epoch": 2.2, + "grad_norm": 0.675313413143158, + "learning_rate": 9.915786511556123e-05, + "loss": 2.9108, + "step": 44900 + }, + { + "epoch": 2.2, + "grad_norm": 0.7314799427986145, + "learning_rate": 9.91464285404149e-05, + "loss": 2.7628, + "step": 44901 + }, + { + "epoch": 2.2, + "grad_norm": 0.7563438415527344, + "learning_rate": 9.913499249427812e-05, + "loss": 2.858, + "step": 44902 + }, + { + "epoch": 2.2, + "grad_norm": 0.7086213231086731, + "learning_rate": 9.912355697718119e-05, + "loss": 2.8109, + "step": 44903 + }, + { + "epoch": 2.2, + "grad_norm": 0.7429339289665222, + "learning_rate": 9.911212198915415e-05, + "loss": 2.8593, + "step": 44904 + }, + { + "epoch": 2.2, + "grad_norm": 0.6543605327606201, + "learning_rate": 9.910068753022706e-05, + "loss": 2.8576, + "step": 44905 + }, + { + "epoch": 2.2, + "grad_norm": 0.7347574830055237, + "learning_rate": 9.908925360043015e-05, + "loss": 2.725, + "step": 44906 + }, + { + "epoch": 2.2, + "grad_norm": 0.7006809115409851, + "learning_rate": 9.907782019979343e-05, + "loss": 2.8389, + "step": 44907 + }, + { + "epoch": 2.2, + "grad_norm": 0.7191192507743835, + "learning_rate": 9.906638732834701e-05, + "loss": 2.9768, + "step": 44908 + }, + { + "epoch": 2.2, + "grad_norm": 0.7019370794296265, + "learning_rate": 9.905495498612118e-05, + "loss": 2.7632, + "step": 44909 + }, + { + "epoch": 2.2, + "grad_norm": 0.7217749357223511, + "learning_rate": 9.904352317314583e-05, + "loss": 2.9252, + "step": 44910 + }, + { + "epoch": 2.2, + "grad_norm": 0.7480356097221375, + "learning_rate": 9.903209188945127e-05, + "loss": 2.9351, + "step": 44911 + }, + { + "epoch": 2.2, + "grad_norm": 0.7302222847938538, + "learning_rate": 9.902066113506747e-05, + "loss": 2.8074, + "step": 44912 + }, + { + "epoch": 2.2, + "grad_norm": 0.7058594226837158, + "learning_rate": 9.90092309100245e-05, + "loss": 2.976, + "step": 44913 + }, + { + "epoch": 2.2, + "grad_norm": 0.7320786118507385, + "learning_rate": 9.899780121435262e-05, + "loss": 2.8483, + "step": 44914 + }, + { + "epoch": 2.2, + "grad_norm": 0.7021784782409668, + "learning_rate": 9.898637204808176e-05, + "loss": 2.9878, + "step": 44915 + }, + { + "epoch": 2.2, + "grad_norm": 0.7117753028869629, + "learning_rate": 9.89749434112422e-05, + "loss": 2.871, + "step": 44916 + }, + { + "epoch": 2.2, + "grad_norm": 0.7381766438484192, + "learning_rate": 9.896351530386385e-05, + "loss": 2.8505, + "step": 44917 + }, + { + "epoch": 2.2, + "grad_norm": 0.69048011302948, + "learning_rate": 9.895208772597705e-05, + "loss": 3.1488, + "step": 44918 + }, + { + "epoch": 2.2, + "grad_norm": 0.7255758047103882, + "learning_rate": 9.894066067761171e-05, + "loss": 2.8735, + "step": 44919 + }, + { + "epoch": 2.2, + "grad_norm": 0.6982675790786743, + "learning_rate": 9.89292341587979e-05, + "loss": 2.9529, + "step": 44920 + }, + { + "epoch": 2.2, + "grad_norm": 0.7159181237220764, + "learning_rate": 9.891780816956587e-05, + "loss": 2.9456, + "step": 44921 + }, + { + "epoch": 2.2, + "grad_norm": 0.6875530481338501, + "learning_rate": 9.890638270994553e-05, + "loss": 2.8369, + "step": 44922 + }, + { + "epoch": 2.2, + "grad_norm": 0.6837921142578125, + "learning_rate": 9.889495777996709e-05, + "loss": 3.0856, + "step": 44923 + }, + { + "epoch": 2.2, + "grad_norm": 0.7514594793319702, + "learning_rate": 9.888353337966075e-05, + "loss": 3.0671, + "step": 44924 + }, + { + "epoch": 2.2, + "grad_norm": 0.7067429423332214, + "learning_rate": 9.887210950905643e-05, + "loss": 2.7875, + "step": 44925 + }, + { + "epoch": 2.2, + "grad_norm": 0.7050120830535889, + "learning_rate": 9.88606861681843e-05, + "loss": 3.0002, + "step": 44926 + }, + { + "epoch": 2.2, + "grad_norm": 0.7361913919448853, + "learning_rate": 9.884926335707431e-05, + "loss": 2.9435, + "step": 44927 + }, + { + "epoch": 2.2, + "grad_norm": 0.7067899107933044, + "learning_rate": 9.883784107575665e-05, + "loss": 2.8466, + "step": 44928 + }, + { + "epoch": 2.2, + "grad_norm": 0.7761995792388916, + "learning_rate": 9.882641932426149e-05, + "loss": 2.9587, + "step": 44929 + }, + { + "epoch": 2.2, + "grad_norm": 0.7403863668441772, + "learning_rate": 9.881499810261871e-05, + "loss": 2.7738, + "step": 44930 + }, + { + "epoch": 2.2, + "grad_norm": 0.6781747341156006, + "learning_rate": 9.880357741085865e-05, + "loss": 2.973, + "step": 44931 + }, + { + "epoch": 2.2, + "grad_norm": 0.7273198366165161, + "learning_rate": 9.879215724901109e-05, + "loss": 2.8573, + "step": 44932 + }, + { + "epoch": 2.2, + "grad_norm": 0.7114270925521851, + "learning_rate": 9.878073761710639e-05, + "loss": 2.7882, + "step": 44933 + }, + { + "epoch": 2.2, + "grad_norm": 0.7011170387268066, + "learning_rate": 9.876931851517454e-05, + "loss": 2.8851, + "step": 44934 + }, + { + "epoch": 2.2, + "grad_norm": 0.7164783477783203, + "learning_rate": 9.875789994324544e-05, + "loss": 3.0467, + "step": 44935 + }, + { + "epoch": 2.2, + "grad_norm": 0.7922382950782776, + "learning_rate": 9.874648190134941e-05, + "loss": 3.0482, + "step": 44936 + }, + { + "epoch": 2.2, + "grad_norm": 0.6847321391105652, + "learning_rate": 9.87350643895163e-05, + "loss": 2.9396, + "step": 44937 + }, + { + "epoch": 2.2, + "grad_norm": 0.7641184329986572, + "learning_rate": 9.872364740777632e-05, + "loss": 3.0284, + "step": 44938 + }, + { + "epoch": 2.2, + "grad_norm": 0.7687472105026245, + "learning_rate": 9.871223095615959e-05, + "loss": 3.0097, + "step": 44939 + }, + { + "epoch": 2.2, + "grad_norm": 0.7540838122367859, + "learning_rate": 9.87008150346961e-05, + "loss": 2.9874, + "step": 44940 + }, + { + "epoch": 2.2, + "grad_norm": 0.8029380440711975, + "learning_rate": 9.868939964341597e-05, + "loss": 2.6693, + "step": 44941 + }, + { + "epoch": 2.2, + "grad_norm": 0.7017350792884827, + "learning_rate": 9.86779847823491e-05, + "loss": 3.0033, + "step": 44942 + }, + { + "epoch": 2.2, + "grad_norm": 0.7199008464813232, + "learning_rate": 9.866657045152564e-05, + "loss": 3.0994, + "step": 44943 + }, + { + "epoch": 2.2, + "grad_norm": 0.6676777005195618, + "learning_rate": 9.865515665097581e-05, + "loss": 2.7228, + "step": 44944 + }, + { + "epoch": 2.2, + "grad_norm": 0.6894500851631165, + "learning_rate": 9.864374338072945e-05, + "loss": 2.9413, + "step": 44945 + }, + { + "epoch": 2.2, + "grad_norm": 0.7256870269775391, + "learning_rate": 9.863233064081682e-05, + "loss": 2.7942, + "step": 44946 + }, + { + "epoch": 2.2, + "grad_norm": 0.7502337098121643, + "learning_rate": 9.862091843126777e-05, + "loss": 3.1109, + "step": 44947 + }, + { + "epoch": 2.2, + "grad_norm": 0.6998251080513, + "learning_rate": 9.860950675211255e-05, + "loss": 2.8859, + "step": 44948 + }, + { + "epoch": 2.2, + "grad_norm": 0.7343108057975769, + "learning_rate": 9.859809560338115e-05, + "loss": 3.0134, + "step": 44949 + }, + { + "epoch": 2.2, + "grad_norm": 0.7494150996208191, + "learning_rate": 9.85866849851035e-05, + "loss": 2.7913, + "step": 44950 + }, + { + "epoch": 2.2, + "grad_norm": 0.6747055649757385, + "learning_rate": 9.857527489730987e-05, + "loss": 2.9572, + "step": 44951 + }, + { + "epoch": 2.2, + "grad_norm": 0.6779135465621948, + "learning_rate": 9.856386534003011e-05, + "loss": 2.8614, + "step": 44952 + }, + { + "epoch": 2.2, + "grad_norm": 0.713498592376709, + "learning_rate": 9.855245631329432e-05, + "loss": 2.8165, + "step": 44953 + }, + { + "epoch": 2.2, + "grad_norm": 0.6696012616157532, + "learning_rate": 9.854104781713272e-05, + "loss": 3.0592, + "step": 44954 + }, + { + "epoch": 2.2, + "grad_norm": 0.7587345242500305, + "learning_rate": 9.852963985157525e-05, + "loss": 2.8299, + "step": 44955 + }, + { + "epoch": 2.2, + "grad_norm": 0.7578355669975281, + "learning_rate": 9.851823241665188e-05, + "loss": 2.9793, + "step": 44956 + }, + { + "epoch": 2.2, + "grad_norm": 0.6964207291603088, + "learning_rate": 9.850682551239265e-05, + "loss": 2.9437, + "step": 44957 + }, + { + "epoch": 2.2, + "grad_norm": 0.8269992470741272, + "learning_rate": 9.849541913882774e-05, + "loss": 2.9062, + "step": 44958 + }, + { + "epoch": 2.2, + "grad_norm": 0.6761069893836975, + "learning_rate": 9.848401329598705e-05, + "loss": 2.9209, + "step": 44959 + }, + { + "epoch": 2.2, + "grad_norm": 0.7033849954605103, + "learning_rate": 9.847260798390064e-05, + "loss": 2.974, + "step": 44960 + }, + { + "epoch": 2.2, + "grad_norm": 0.7039138078689575, + "learning_rate": 9.846120320259873e-05, + "loss": 2.9274, + "step": 44961 + }, + { + "epoch": 2.2, + "grad_norm": 0.6891793608665466, + "learning_rate": 9.844979895211121e-05, + "loss": 2.7909, + "step": 44962 + }, + { + "epoch": 2.2, + "grad_norm": 0.6782719492912292, + "learning_rate": 9.843839523246815e-05, + "loss": 2.8842, + "step": 44963 + }, + { + "epoch": 2.2, + "grad_norm": 0.7017290592193604, + "learning_rate": 9.842699204369944e-05, + "loss": 3.038, + "step": 44964 + }, + { + "epoch": 2.2, + "grad_norm": 0.8234224915504456, + "learning_rate": 9.841558938583525e-05, + "loss": 2.8788, + "step": 44965 + }, + { + "epoch": 2.2, + "grad_norm": 0.720818817615509, + "learning_rate": 9.840418725890569e-05, + "loss": 2.7221, + "step": 44966 + }, + { + "epoch": 2.2, + "grad_norm": 0.7300722599029541, + "learning_rate": 9.839278566294063e-05, + "loss": 2.9979, + "step": 44967 + }, + { + "epoch": 2.2, + "grad_norm": 0.6792693138122559, + "learning_rate": 9.838138459797026e-05, + "loss": 2.7892, + "step": 44968 + }, + { + "epoch": 2.2, + "grad_norm": 0.7436376214027405, + "learning_rate": 9.83699840640244e-05, + "loss": 3.0367, + "step": 44969 + }, + { + "epoch": 2.2, + "grad_norm": 0.6573380827903748, + "learning_rate": 9.835858406113333e-05, + "loss": 2.9167, + "step": 44970 + }, + { + "epoch": 2.2, + "grad_norm": 0.6874247789382935, + "learning_rate": 9.834718458932692e-05, + "loss": 2.7693, + "step": 44971 + }, + { + "epoch": 2.2, + "grad_norm": 0.7144591808319092, + "learning_rate": 9.833578564863514e-05, + "loss": 2.7311, + "step": 44972 + }, + { + "epoch": 2.2, + "grad_norm": 0.72472083568573, + "learning_rate": 9.832438723908814e-05, + "loss": 3.0349, + "step": 44973 + }, + { + "epoch": 2.2, + "grad_norm": 0.7377344369888306, + "learning_rate": 9.831298936071586e-05, + "loss": 3.0525, + "step": 44974 + }, + { + "epoch": 2.2, + "grad_norm": 0.7162564992904663, + "learning_rate": 9.830159201354828e-05, + "loss": 2.935, + "step": 44975 + }, + { + "epoch": 2.2, + "grad_norm": 0.7077290415763855, + "learning_rate": 9.829019519761564e-05, + "loss": 3.0306, + "step": 44976 + }, + { + "epoch": 2.2, + "grad_norm": 0.6880072951316833, + "learning_rate": 9.827879891294777e-05, + "loss": 2.8846, + "step": 44977 + }, + { + "epoch": 2.2, + "grad_norm": 0.7291051149368286, + "learning_rate": 9.826740315957476e-05, + "loss": 2.8189, + "step": 44978 + }, + { + "epoch": 2.2, + "grad_norm": 0.7197936177253723, + "learning_rate": 9.825600793752647e-05, + "loss": 2.816, + "step": 44979 + }, + { + "epoch": 2.2, + "grad_norm": 0.6987482309341431, + "learning_rate": 9.824461324683304e-05, + "loss": 2.7839, + "step": 44980 + }, + { + "epoch": 2.2, + "grad_norm": 0.7558214664459229, + "learning_rate": 9.823321908752454e-05, + "loss": 2.8651, + "step": 44981 + }, + { + "epoch": 2.2, + "grad_norm": 0.7327513694763184, + "learning_rate": 9.822182545963082e-05, + "loss": 2.8579, + "step": 44982 + }, + { + "epoch": 2.2, + "grad_norm": 0.7057175040245056, + "learning_rate": 9.821043236318209e-05, + "loss": 2.9509, + "step": 44983 + }, + { + "epoch": 2.2, + "grad_norm": 0.7285510301589966, + "learning_rate": 9.819903979820825e-05, + "loss": 2.9622, + "step": 44984 + }, + { + "epoch": 2.2, + "grad_norm": 0.6786232590675354, + "learning_rate": 9.81876477647392e-05, + "loss": 3.0846, + "step": 44985 + }, + { + "epoch": 2.2, + "grad_norm": 0.7109321355819702, + "learning_rate": 9.817625626280511e-05, + "loss": 2.8528, + "step": 44986 + }, + { + "epoch": 2.2, + "grad_norm": 0.7196544408798218, + "learning_rate": 9.816486529243587e-05, + "loss": 2.7905, + "step": 44987 + }, + { + "epoch": 2.2, + "grad_norm": 0.6925801634788513, + "learning_rate": 9.815347485366162e-05, + "loss": 2.8368, + "step": 44988 + }, + { + "epoch": 2.2, + "grad_norm": 0.687707245349884, + "learning_rate": 9.814208494651214e-05, + "loss": 2.8255, + "step": 44989 + }, + { + "epoch": 2.2, + "grad_norm": 0.7053709030151367, + "learning_rate": 9.81306955710176e-05, + "loss": 3.0476, + "step": 44990 + }, + { + "epoch": 2.2, + "grad_norm": 0.6974628567695618, + "learning_rate": 9.811930672720803e-05, + "loss": 2.739, + "step": 44991 + }, + { + "epoch": 2.2, + "grad_norm": 0.7128167152404785, + "learning_rate": 9.810791841511336e-05, + "loss": 3.0011, + "step": 44992 + }, + { + "epoch": 2.2, + "grad_norm": 0.694808304309845, + "learning_rate": 9.809653063476358e-05, + "loss": 2.8174, + "step": 44993 + }, + { + "epoch": 2.21, + "grad_norm": 0.7266348600387573, + "learning_rate": 9.808514338618856e-05, + "loss": 2.8004, + "step": 44994 + }, + { + "epoch": 2.21, + "grad_norm": 0.6718068718910217, + "learning_rate": 9.807375666941846e-05, + "loss": 2.9927, + "step": 44995 + }, + { + "epoch": 2.21, + "grad_norm": 0.7475922107696533, + "learning_rate": 9.806237048448329e-05, + "loss": 2.8641, + "step": 44996 + }, + { + "epoch": 2.21, + "grad_norm": 0.6909314393997192, + "learning_rate": 9.805098483141287e-05, + "loss": 2.8757, + "step": 44997 + }, + { + "epoch": 2.21, + "grad_norm": 0.7345043420791626, + "learning_rate": 9.803959971023742e-05, + "loss": 2.8709, + "step": 44998 + }, + { + "epoch": 2.21, + "grad_norm": 0.7070412635803223, + "learning_rate": 9.802821512098677e-05, + "loss": 2.997, + "step": 44999 + }, + { + "epoch": 2.21, + "grad_norm": 0.6929289698600769, + "learning_rate": 9.801683106369082e-05, + "loss": 3.0065, + "step": 45000 + }, + { + "epoch": 2.21, + "grad_norm": 0.7302191257476807, + "learning_rate": 9.800544753837978e-05, + "loss": 2.8856, + "step": 45001 + }, + { + "epoch": 2.21, + "grad_norm": 0.7192659378051758, + "learning_rate": 9.799406454508343e-05, + "loss": 2.8797, + "step": 45002 + }, + { + "epoch": 2.21, + "grad_norm": 0.757664680480957, + "learning_rate": 9.798268208383192e-05, + "loss": 2.4703, + "step": 45003 + }, + { + "epoch": 2.21, + "grad_norm": 0.7271679043769836, + "learning_rate": 9.797130015465507e-05, + "loss": 2.98, + "step": 45004 + }, + { + "epoch": 2.21, + "grad_norm": 0.7153617739677429, + "learning_rate": 9.7959918757583e-05, + "loss": 2.992, + "step": 45005 + }, + { + "epoch": 2.21, + "grad_norm": 0.7023801207542419, + "learning_rate": 9.794853789264564e-05, + "loss": 2.8245, + "step": 45006 + }, + { + "epoch": 2.21, + "grad_norm": 0.7436354160308838, + "learning_rate": 9.793715755987283e-05, + "loss": 2.9209, + "step": 45007 + }, + { + "epoch": 2.21, + "grad_norm": 0.7024301886558533, + "learning_rate": 9.792577775929476e-05, + "loss": 3.0383, + "step": 45008 + }, + { + "epoch": 2.21, + "grad_norm": 0.680385410785675, + "learning_rate": 9.791439849094124e-05, + "loss": 2.8592, + "step": 45009 + }, + { + "epoch": 2.21, + "grad_norm": 0.6959872245788574, + "learning_rate": 9.790301975484228e-05, + "loss": 2.9236, + "step": 45010 + }, + { + "epoch": 2.21, + "grad_norm": 0.7033017873764038, + "learning_rate": 9.789164155102795e-05, + "loss": 3.054, + "step": 45011 + }, + { + "epoch": 2.21, + "grad_norm": 0.6920191645622253, + "learning_rate": 9.788026387952807e-05, + "loss": 2.8266, + "step": 45012 + }, + { + "epoch": 2.21, + "grad_norm": 0.6858896017074585, + "learning_rate": 9.786888674037276e-05, + "loss": 2.9281, + "step": 45013 + }, + { + "epoch": 2.21, + "grad_norm": 0.6978284120559692, + "learning_rate": 9.785751013359189e-05, + "loss": 2.9375, + "step": 45014 + }, + { + "epoch": 2.21, + "grad_norm": 0.7071654200553894, + "learning_rate": 9.784613405921535e-05, + "loss": 2.8444, + "step": 45015 + }, + { + "epoch": 2.21, + "grad_norm": 0.6944267749786377, + "learning_rate": 9.78347585172733e-05, + "loss": 2.8455, + "step": 45016 + }, + { + "epoch": 2.21, + "grad_norm": 0.7331633567810059, + "learning_rate": 9.782338350779548e-05, + "loss": 3.0644, + "step": 45017 + }, + { + "epoch": 2.21, + "grad_norm": 0.7539132237434387, + "learning_rate": 9.781200903081206e-05, + "loss": 2.8949, + "step": 45018 + }, + { + "epoch": 2.21, + "grad_norm": 0.7396433353424072, + "learning_rate": 9.780063508635279e-05, + "loss": 2.938, + "step": 45019 + }, + { + "epoch": 2.21, + "grad_norm": 0.7210110425949097, + "learning_rate": 9.778926167444781e-05, + "loss": 2.6836, + "step": 45020 + }, + { + "epoch": 2.21, + "grad_norm": 0.7514802813529968, + "learning_rate": 9.777788879512702e-05, + "loss": 2.9036, + "step": 45021 + }, + { + "epoch": 2.21, + "grad_norm": 0.7779740691184998, + "learning_rate": 9.776651644842026e-05, + "loss": 2.8719, + "step": 45022 + }, + { + "epoch": 2.21, + "grad_norm": 0.7061024308204651, + "learning_rate": 9.775514463435767e-05, + "loss": 2.7755, + "step": 45023 + }, + { + "epoch": 2.21, + "grad_norm": 0.7684993743896484, + "learning_rate": 9.7743773352969e-05, + "loss": 2.855, + "step": 45024 + }, + { + "epoch": 2.21, + "grad_norm": 0.7269738912582397, + "learning_rate": 9.773240260428431e-05, + "loss": 3.2446, + "step": 45025 + }, + { + "epoch": 2.21, + "grad_norm": 0.6850230693817139, + "learning_rate": 9.772103238833364e-05, + "loss": 3.0956, + "step": 45026 + }, + { + "epoch": 2.21, + "grad_norm": 0.6893491148948669, + "learning_rate": 9.770966270514684e-05, + "loss": 2.9326, + "step": 45027 + }, + { + "epoch": 2.21, + "grad_norm": 0.7849996089935303, + "learning_rate": 9.769829355475388e-05, + "loss": 2.8876, + "step": 45028 + }, + { + "epoch": 2.21, + "grad_norm": 0.7221559882164001, + "learning_rate": 9.768692493718457e-05, + "loss": 2.9922, + "step": 45029 + }, + { + "epoch": 2.21, + "grad_norm": 0.6811572909355164, + "learning_rate": 9.767555685246896e-05, + "loss": 2.9712, + "step": 45030 + }, + { + "epoch": 2.21, + "grad_norm": 0.7301697731018066, + "learning_rate": 9.766418930063709e-05, + "loss": 3.1497, + "step": 45031 + }, + { + "epoch": 2.21, + "grad_norm": 0.7387075424194336, + "learning_rate": 9.765282228171869e-05, + "loss": 2.731, + "step": 45032 + }, + { + "epoch": 2.21, + "grad_norm": 0.7403630018234253, + "learning_rate": 9.764145579574394e-05, + "loss": 3.0009, + "step": 45033 + }, + { + "epoch": 2.21, + "grad_norm": 0.7094658613204956, + "learning_rate": 9.763008984274254e-05, + "loss": 2.7886, + "step": 45034 + }, + { + "epoch": 2.21, + "grad_norm": 0.6980571150779724, + "learning_rate": 9.761872442274464e-05, + "loss": 3.1573, + "step": 45035 + }, + { + "epoch": 2.21, + "grad_norm": 0.7270064949989319, + "learning_rate": 9.760735953578006e-05, + "loss": 2.9224, + "step": 45036 + }, + { + "epoch": 2.21, + "grad_norm": 0.8115057349205017, + "learning_rate": 9.759599518187867e-05, + "loss": 3.1933, + "step": 45037 + }, + { + "epoch": 2.21, + "grad_norm": 0.7091026306152344, + "learning_rate": 9.758463136107055e-05, + "loss": 2.6673, + "step": 45038 + }, + { + "epoch": 2.21, + "grad_norm": 0.7223027348518372, + "learning_rate": 9.757326807338548e-05, + "loss": 2.7039, + "step": 45039 + }, + { + "epoch": 2.21, + "grad_norm": 0.7187199592590332, + "learning_rate": 9.756190531885353e-05, + "loss": 2.971, + "step": 45040 + }, + { + "epoch": 2.21, + "grad_norm": 0.7661718726158142, + "learning_rate": 9.755054309750451e-05, + "loss": 2.9357, + "step": 45041 + }, + { + "epoch": 2.21, + "grad_norm": 0.710060179233551, + "learning_rate": 9.753918140936846e-05, + "loss": 3.0332, + "step": 45042 + }, + { + "epoch": 2.21, + "grad_norm": 0.6724956035614014, + "learning_rate": 9.752782025447523e-05, + "loss": 3.0016, + "step": 45043 + }, + { + "epoch": 2.21, + "grad_norm": 0.6956879496574402, + "learning_rate": 9.751645963285469e-05, + "loss": 3.0643, + "step": 45044 + }, + { + "epoch": 2.21, + "grad_norm": 0.7180279493331909, + "learning_rate": 9.750509954453689e-05, + "loss": 3.1198, + "step": 45045 + }, + { + "epoch": 2.21, + "grad_norm": 0.6832389235496521, + "learning_rate": 9.749373998955163e-05, + "loss": 2.9694, + "step": 45046 + }, + { + "epoch": 2.21, + "grad_norm": 0.6939116716384888, + "learning_rate": 9.748238096792886e-05, + "loss": 2.8727, + "step": 45047 + }, + { + "epoch": 2.21, + "grad_norm": 0.6981992721557617, + "learning_rate": 9.747102247969862e-05, + "loss": 2.8948, + "step": 45048 + }, + { + "epoch": 2.21, + "grad_norm": 0.7490467429161072, + "learning_rate": 9.745966452489072e-05, + "loss": 2.6799, + "step": 45049 + }, + { + "epoch": 2.21, + "grad_norm": 0.7155380845069885, + "learning_rate": 9.74483071035351e-05, + "loss": 2.9145, + "step": 45050 + }, + { + "epoch": 2.21, + "grad_norm": 0.7536515593528748, + "learning_rate": 9.743695021566157e-05, + "loss": 2.81, + "step": 45051 + }, + { + "epoch": 2.21, + "grad_norm": 0.6787039041519165, + "learning_rate": 9.742559386130012e-05, + "loss": 2.8627, + "step": 45052 + }, + { + "epoch": 2.21, + "grad_norm": 0.7021150588989258, + "learning_rate": 9.741423804048077e-05, + "loss": 2.9647, + "step": 45053 + }, + { + "epoch": 2.21, + "grad_norm": 0.7255040407180786, + "learning_rate": 9.740288275323321e-05, + "loss": 3.0502, + "step": 45054 + }, + { + "epoch": 2.21, + "grad_norm": 0.7342783212661743, + "learning_rate": 9.739152799958758e-05, + "loss": 2.8024, + "step": 45055 + }, + { + "epoch": 2.21, + "grad_norm": 0.6887534260749817, + "learning_rate": 9.73801737795736e-05, + "loss": 2.7358, + "step": 45056 + }, + { + "epoch": 2.21, + "grad_norm": 0.6920632123947144, + "learning_rate": 9.736882009322131e-05, + "loss": 2.8555, + "step": 45057 + }, + { + "epoch": 2.21, + "grad_norm": 0.7355381846427917, + "learning_rate": 9.735746694056055e-05, + "loss": 3.0263, + "step": 45058 + }, + { + "epoch": 2.21, + "grad_norm": 0.7083019018173218, + "learning_rate": 9.734611432162114e-05, + "loss": 2.8918, + "step": 45059 + }, + { + "epoch": 2.21, + "grad_norm": 0.7241811156272888, + "learning_rate": 9.733476223643317e-05, + "loss": 2.8173, + "step": 45060 + }, + { + "epoch": 2.21, + "grad_norm": 0.7107488512992859, + "learning_rate": 9.73234106850263e-05, + "loss": 2.9891, + "step": 45061 + }, + { + "epoch": 2.21, + "grad_norm": 0.7031468152999878, + "learning_rate": 9.731205966743058e-05, + "loss": 2.7615, + "step": 45062 + }, + { + "epoch": 2.21, + "grad_norm": 0.6482835412025452, + "learning_rate": 9.730070918367598e-05, + "loss": 2.8392, + "step": 45063 + }, + { + "epoch": 2.21, + "grad_norm": 0.696492612361908, + "learning_rate": 9.728935923379234e-05, + "loss": 2.9115, + "step": 45064 + }, + { + "epoch": 2.21, + "grad_norm": 0.6701128482818604, + "learning_rate": 9.727800981780946e-05, + "loss": 2.6987, + "step": 45065 + }, + { + "epoch": 2.21, + "grad_norm": 0.6930135488510132, + "learning_rate": 9.726666093575723e-05, + "loss": 3.1348, + "step": 45066 + }, + { + "epoch": 2.21, + "grad_norm": 0.7013523578643799, + "learning_rate": 9.725531258766555e-05, + "loss": 2.9139, + "step": 45067 + }, + { + "epoch": 2.21, + "grad_norm": 0.6763277649879456, + "learning_rate": 9.72439647735645e-05, + "loss": 2.9529, + "step": 45068 + }, + { + "epoch": 2.21, + "grad_norm": 0.7029493451118469, + "learning_rate": 9.723261749348373e-05, + "loss": 2.7961, + "step": 45069 + }, + { + "epoch": 2.21, + "grad_norm": 0.6882089376449585, + "learning_rate": 9.72212707474533e-05, + "loss": 2.9743, + "step": 45070 + }, + { + "epoch": 2.21, + "grad_norm": 0.7075545191764832, + "learning_rate": 9.720992453550292e-05, + "loss": 3.0006, + "step": 45071 + }, + { + "epoch": 2.21, + "grad_norm": 0.7261462211608887, + "learning_rate": 9.719857885766266e-05, + "loss": 2.9622, + "step": 45072 + }, + { + "epoch": 2.21, + "grad_norm": 0.7145310044288635, + "learning_rate": 9.718723371396234e-05, + "loss": 2.9799, + "step": 45073 + }, + { + "epoch": 2.21, + "grad_norm": 0.7117692828178406, + "learning_rate": 9.71758891044317e-05, + "loss": 2.9716, + "step": 45074 + }, + { + "epoch": 2.21, + "grad_norm": 0.7104676961898804, + "learning_rate": 9.716454502910082e-05, + "loss": 3.0304, + "step": 45075 + }, + { + "epoch": 2.21, + "grad_norm": 0.7376676797866821, + "learning_rate": 9.715320148799942e-05, + "loss": 2.8945, + "step": 45076 + }, + { + "epoch": 2.21, + "grad_norm": 0.7351451516151428, + "learning_rate": 9.714185848115746e-05, + "loss": 2.8031, + "step": 45077 + }, + { + "epoch": 2.21, + "grad_norm": 0.7405024766921997, + "learning_rate": 9.713051600860486e-05, + "loss": 2.973, + "step": 45078 + }, + { + "epoch": 2.21, + "grad_norm": 0.7368142604827881, + "learning_rate": 9.711917407037147e-05, + "loss": 3.0478, + "step": 45079 + }, + { + "epoch": 2.21, + "grad_norm": 0.756643533706665, + "learning_rate": 9.71078326664871e-05, + "loss": 3.123, + "step": 45080 + }, + { + "epoch": 2.21, + "grad_norm": 0.742498517036438, + "learning_rate": 9.709649179698156e-05, + "loss": 2.9582, + "step": 45081 + }, + { + "epoch": 2.21, + "grad_norm": 0.7527225017547607, + "learning_rate": 9.708515146188482e-05, + "loss": 2.8509, + "step": 45082 + }, + { + "epoch": 2.21, + "grad_norm": 0.7525281310081482, + "learning_rate": 9.707381166122685e-05, + "loss": 3.034, + "step": 45083 + }, + { + "epoch": 2.21, + "grad_norm": 0.69659823179245, + "learning_rate": 9.70624723950373e-05, + "loss": 2.8755, + "step": 45084 + }, + { + "epoch": 2.21, + "grad_norm": 0.7091747522354126, + "learning_rate": 9.705113366334622e-05, + "loss": 2.8933, + "step": 45085 + }, + { + "epoch": 2.21, + "grad_norm": 0.6765292882919312, + "learning_rate": 9.70397954661834e-05, + "loss": 2.8283, + "step": 45086 + }, + { + "epoch": 2.21, + "grad_norm": 0.7405532002449036, + "learning_rate": 9.702845780357864e-05, + "loss": 2.9788, + "step": 45087 + }, + { + "epoch": 2.21, + "grad_norm": 0.7477160692214966, + "learning_rate": 9.701712067556192e-05, + "loss": 2.8926, + "step": 45088 + }, + { + "epoch": 2.21, + "grad_norm": 0.7491059303283691, + "learning_rate": 9.700578408216296e-05, + "loss": 2.7266, + "step": 45089 + }, + { + "epoch": 2.21, + "grad_norm": 0.6882579326629639, + "learning_rate": 9.69944480234118e-05, + "loss": 3.1125, + "step": 45090 + }, + { + "epoch": 2.21, + "grad_norm": 0.7268614768981934, + "learning_rate": 9.698311249933807e-05, + "loss": 2.9157, + "step": 45091 + }, + { + "epoch": 2.21, + "grad_norm": 0.7116838693618774, + "learning_rate": 9.697177750997175e-05, + "loss": 2.7934, + "step": 45092 + }, + { + "epoch": 2.21, + "grad_norm": 0.718450129032135, + "learning_rate": 9.696044305534281e-05, + "loss": 2.9242, + "step": 45093 + }, + { + "epoch": 2.21, + "grad_norm": 0.7193343043327332, + "learning_rate": 9.694910913548099e-05, + "loss": 2.7036, + "step": 45094 + }, + { + "epoch": 2.21, + "grad_norm": 0.6873836517333984, + "learning_rate": 9.693777575041613e-05, + "loss": 2.9068, + "step": 45095 + }, + { + "epoch": 2.21, + "grad_norm": 0.7232233881950378, + "learning_rate": 9.692644290017801e-05, + "loss": 3.0087, + "step": 45096 + }, + { + "epoch": 2.21, + "grad_norm": 0.6899810433387756, + "learning_rate": 9.691511058479653e-05, + "loss": 3.0144, + "step": 45097 + }, + { + "epoch": 2.21, + "grad_norm": 0.7038057446479797, + "learning_rate": 9.690377880430168e-05, + "loss": 2.8663, + "step": 45098 + }, + { + "epoch": 2.21, + "grad_norm": 0.6957258582115173, + "learning_rate": 9.689244755872308e-05, + "loss": 3.0446, + "step": 45099 + }, + { + "epoch": 2.21, + "grad_norm": 0.7041094899177551, + "learning_rate": 9.688111684809078e-05, + "loss": 2.7151, + "step": 45100 + }, + { + "epoch": 2.21, + "grad_norm": 0.668713390827179, + "learning_rate": 9.686978667243453e-05, + "loss": 3.0839, + "step": 45101 + }, + { + "epoch": 2.21, + "grad_norm": 0.7322125434875488, + "learning_rate": 9.685845703178408e-05, + "loss": 2.8446, + "step": 45102 + }, + { + "epoch": 2.21, + "grad_norm": 0.7117242217063904, + "learning_rate": 9.684712792616946e-05, + "loss": 3.0491, + "step": 45103 + }, + { + "epoch": 2.21, + "grad_norm": 0.724531352519989, + "learning_rate": 9.683579935562029e-05, + "loss": 2.8113, + "step": 45104 + }, + { + "epoch": 2.21, + "grad_norm": 0.7037317752838135, + "learning_rate": 9.682447132016664e-05, + "loss": 2.9621, + "step": 45105 + }, + { + "epoch": 2.21, + "grad_norm": 0.7283707857131958, + "learning_rate": 9.681314381983811e-05, + "loss": 3.037, + "step": 45106 + }, + { + "epoch": 2.21, + "grad_norm": 0.6818432807922363, + "learning_rate": 9.680181685466477e-05, + "loss": 2.7621, + "step": 45107 + }, + { + "epoch": 2.21, + "grad_norm": 0.6957379579544067, + "learning_rate": 9.679049042467636e-05, + "loss": 2.7626, + "step": 45108 + }, + { + "epoch": 2.21, + "grad_norm": 0.6879173517227173, + "learning_rate": 9.677916452990255e-05, + "loss": 2.795, + "step": 45109 + }, + { + "epoch": 2.21, + "grad_norm": 0.6849760413169861, + "learning_rate": 9.676783917037343e-05, + "loss": 2.8795, + "step": 45110 + }, + { + "epoch": 2.21, + "grad_norm": 0.7054252028465271, + "learning_rate": 9.67565143461186e-05, + "loss": 2.7672, + "step": 45111 + }, + { + "epoch": 2.21, + "grad_norm": 0.6849786043167114, + "learning_rate": 9.674519005716809e-05, + "loss": 2.8216, + "step": 45112 + }, + { + "epoch": 2.21, + "grad_norm": 0.6915918588638306, + "learning_rate": 9.673386630355158e-05, + "loss": 2.9443, + "step": 45113 + }, + { + "epoch": 2.21, + "grad_norm": 0.6977018117904663, + "learning_rate": 9.672254308529891e-05, + "loss": 2.933, + "step": 45114 + }, + { + "epoch": 2.21, + "grad_norm": 0.7735402584075928, + "learning_rate": 9.671122040244004e-05, + "loss": 2.7578, + "step": 45115 + }, + { + "epoch": 2.21, + "grad_norm": 0.7309532165527344, + "learning_rate": 9.669989825500467e-05, + "loss": 2.9269, + "step": 45116 + }, + { + "epoch": 2.21, + "grad_norm": 0.7018609046936035, + "learning_rate": 9.668857664302269e-05, + "loss": 2.8062, + "step": 45117 + }, + { + "epoch": 2.21, + "grad_norm": 0.7004704475402832, + "learning_rate": 9.667725556652372e-05, + "loss": 2.9491, + "step": 45118 + }, + { + "epoch": 2.21, + "grad_norm": 0.7916359305381775, + "learning_rate": 9.666593502553776e-05, + "loss": 3.0852, + "step": 45119 + }, + { + "epoch": 2.21, + "grad_norm": 0.7221834659576416, + "learning_rate": 9.665461502009468e-05, + "loss": 2.9854, + "step": 45120 + }, + { + "epoch": 2.21, + "grad_norm": 0.706676721572876, + "learning_rate": 9.664329555022415e-05, + "loss": 2.9534, + "step": 45121 + }, + { + "epoch": 2.21, + "grad_norm": 0.7531946301460266, + "learning_rate": 9.663197661595609e-05, + "loss": 2.8679, + "step": 45122 + }, + { + "epoch": 2.21, + "grad_norm": 0.7091394066810608, + "learning_rate": 9.66206582173203e-05, + "loss": 3.028, + "step": 45123 + }, + { + "epoch": 2.21, + "grad_norm": 0.7693554162979126, + "learning_rate": 9.660934035434644e-05, + "loss": 2.9596, + "step": 45124 + }, + { + "epoch": 2.21, + "grad_norm": 0.7340413928031921, + "learning_rate": 9.659802302706452e-05, + "loss": 2.7628, + "step": 45125 + }, + { + "epoch": 2.21, + "grad_norm": 0.7072880864143372, + "learning_rate": 9.65867062355042e-05, + "loss": 2.7449, + "step": 45126 + }, + { + "epoch": 2.21, + "grad_norm": 0.6786409020423889, + "learning_rate": 9.657538997969542e-05, + "loss": 2.7573, + "step": 45127 + }, + { + "epoch": 2.21, + "grad_norm": 0.6778995394706726, + "learning_rate": 9.656407425966782e-05, + "loss": 3.0114, + "step": 45128 + }, + { + "epoch": 2.21, + "grad_norm": 0.7381721138954163, + "learning_rate": 9.655275907545139e-05, + "loss": 2.7733, + "step": 45129 + }, + { + "epoch": 2.21, + "grad_norm": 0.7340525984764099, + "learning_rate": 9.654144442707583e-05, + "loss": 2.7722, + "step": 45130 + }, + { + "epoch": 2.21, + "grad_norm": 0.72030109167099, + "learning_rate": 9.653013031457089e-05, + "loss": 2.9553, + "step": 45131 + }, + { + "epoch": 2.21, + "grad_norm": 0.7066547870635986, + "learning_rate": 9.65188167379665e-05, + "loss": 2.9542, + "step": 45132 + }, + { + "epoch": 2.21, + "grad_norm": 0.7587066292762756, + "learning_rate": 9.650750369729228e-05, + "loss": 2.8834, + "step": 45133 + }, + { + "epoch": 2.21, + "grad_norm": 0.7313148379325867, + "learning_rate": 9.649619119257813e-05, + "loss": 2.7555, + "step": 45134 + }, + { + "epoch": 2.21, + "grad_norm": 0.713611900806427, + "learning_rate": 9.648487922385397e-05, + "loss": 2.8716, + "step": 45135 + }, + { + "epoch": 2.21, + "grad_norm": 0.6982900500297546, + "learning_rate": 9.647356779114934e-05, + "loss": 2.9958, + "step": 45136 + }, + { + "epoch": 2.21, + "grad_norm": 0.7607917189598083, + "learning_rate": 9.646225689449428e-05, + "loss": 2.9409, + "step": 45137 + }, + { + "epoch": 2.21, + "grad_norm": 0.6986991763114929, + "learning_rate": 9.645094653391848e-05, + "loss": 2.8637, + "step": 45138 + }, + { + "epoch": 2.21, + "grad_norm": 0.7355843782424927, + "learning_rate": 9.643963670945159e-05, + "loss": 2.914, + "step": 45139 + }, + { + "epoch": 2.21, + "grad_norm": 0.746160089969635, + "learning_rate": 9.642832742112363e-05, + "loss": 2.9648, + "step": 45140 + }, + { + "epoch": 2.21, + "grad_norm": 0.718352198600769, + "learning_rate": 9.641701866896418e-05, + "loss": 2.7866, + "step": 45141 + }, + { + "epoch": 2.21, + "grad_norm": 0.6826595067977905, + "learning_rate": 9.640571045300323e-05, + "loss": 2.7788, + "step": 45142 + }, + { + "epoch": 2.21, + "grad_norm": 0.7046639919281006, + "learning_rate": 9.639440277327033e-05, + "loss": 2.8692, + "step": 45143 + }, + { + "epoch": 2.21, + "grad_norm": 0.6994836926460266, + "learning_rate": 9.638309562979551e-05, + "loss": 2.9952, + "step": 45144 + }, + { + "epoch": 2.21, + "grad_norm": 0.7048764824867249, + "learning_rate": 9.637178902260843e-05, + "loss": 3.0218, + "step": 45145 + }, + { + "epoch": 2.21, + "grad_norm": 0.6403487920761108, + "learning_rate": 9.636048295173873e-05, + "loss": 2.8368, + "step": 45146 + }, + { + "epoch": 2.21, + "grad_norm": 0.7118167281150818, + "learning_rate": 9.634917741721648e-05, + "loss": 2.7272, + "step": 45147 + }, + { + "epoch": 2.21, + "grad_norm": 0.7441112399101257, + "learning_rate": 9.633787241907117e-05, + "loss": 3.164, + "step": 45148 + }, + { + "epoch": 2.21, + "grad_norm": 0.707645833492279, + "learning_rate": 9.632656795733271e-05, + "loss": 3.0041, + "step": 45149 + }, + { + "epoch": 2.21, + "grad_norm": 0.6987390518188477, + "learning_rate": 9.631526403203098e-05, + "loss": 2.8839, + "step": 45150 + }, + { + "epoch": 2.21, + "grad_norm": 0.7065211534500122, + "learning_rate": 9.630396064319565e-05, + "loss": 2.8892, + "step": 45151 + }, + { + "epoch": 2.21, + "grad_norm": 0.7638340592384338, + "learning_rate": 9.629265779085647e-05, + "loss": 2.8871, + "step": 45152 + }, + { + "epoch": 2.21, + "grad_norm": 0.7217139005661011, + "learning_rate": 9.628135547504315e-05, + "loss": 3.0046, + "step": 45153 + }, + { + "epoch": 2.21, + "grad_norm": 0.7055973410606384, + "learning_rate": 9.627005369578553e-05, + "loss": 2.738, + "step": 45154 + }, + { + "epoch": 2.21, + "grad_norm": 0.7258812785148621, + "learning_rate": 9.625875245311345e-05, + "loss": 3.0541, + "step": 45155 + }, + { + "epoch": 2.21, + "grad_norm": 0.7114151120185852, + "learning_rate": 9.624745174705655e-05, + "loss": 3.0669, + "step": 45156 + }, + { + "epoch": 2.21, + "grad_norm": 0.6896607875823975, + "learning_rate": 9.623615157764473e-05, + "loss": 2.8723, + "step": 45157 + }, + { + "epoch": 2.21, + "grad_norm": 0.7319231033325195, + "learning_rate": 9.622485194490757e-05, + "loss": 3.1262, + "step": 45158 + }, + { + "epoch": 2.21, + "grad_norm": 0.6564500331878662, + "learning_rate": 9.621355284887502e-05, + "loss": 2.8374, + "step": 45159 + }, + { + "epoch": 2.21, + "grad_norm": 0.7160139083862305, + "learning_rate": 9.620225428957677e-05, + "loss": 2.93, + "step": 45160 + }, + { + "epoch": 2.21, + "grad_norm": 0.7266965508460999, + "learning_rate": 9.619095626704244e-05, + "loss": 2.8838, + "step": 45161 + }, + { + "epoch": 2.21, + "grad_norm": 0.7377114295959473, + "learning_rate": 9.617965878130203e-05, + "loss": 2.9046, + "step": 45162 + }, + { + "epoch": 2.21, + "grad_norm": 0.7144932746887207, + "learning_rate": 9.616836183238504e-05, + "loss": 3.0129, + "step": 45163 + }, + { + "epoch": 2.21, + "grad_norm": 0.6912056803703308, + "learning_rate": 9.61570654203214e-05, + "loss": 2.7707, + "step": 45164 + }, + { + "epoch": 2.21, + "grad_norm": 0.730238139629364, + "learning_rate": 9.614576954514088e-05, + "loss": 3.1059, + "step": 45165 + }, + { + "epoch": 2.21, + "grad_norm": 0.6824841499328613, + "learning_rate": 9.613447420687319e-05, + "loss": 2.7708, + "step": 45166 + }, + { + "epoch": 2.21, + "grad_norm": 0.7248892188072205, + "learning_rate": 9.612317940554806e-05, + "loss": 2.8777, + "step": 45167 + }, + { + "epoch": 2.21, + "grad_norm": 0.7106010913848877, + "learning_rate": 9.611188514119515e-05, + "loss": 2.8532, + "step": 45168 + }, + { + "epoch": 2.21, + "grad_norm": 0.7336381077766418, + "learning_rate": 9.610059141384427e-05, + "loss": 2.8722, + "step": 45169 + }, + { + "epoch": 2.21, + "grad_norm": 0.7006831765174866, + "learning_rate": 9.60892982235253e-05, + "loss": 2.8112, + "step": 45170 + }, + { + "epoch": 2.21, + "grad_norm": 0.7395011782646179, + "learning_rate": 9.607800557026777e-05, + "loss": 2.9595, + "step": 45171 + }, + { + "epoch": 2.21, + "grad_norm": 0.730022668838501, + "learning_rate": 9.606671345410164e-05, + "loss": 3.002, + "step": 45172 + }, + { + "epoch": 2.21, + "grad_norm": 0.7525355815887451, + "learning_rate": 9.605542187505643e-05, + "loss": 2.8871, + "step": 45173 + }, + { + "epoch": 2.21, + "grad_norm": 0.6975340247154236, + "learning_rate": 9.60441308331621e-05, + "loss": 2.8554, + "step": 45174 + }, + { + "epoch": 2.21, + "grad_norm": 0.7521911859512329, + "learning_rate": 9.603284032844828e-05, + "loss": 2.772, + "step": 45175 + }, + { + "epoch": 2.21, + "grad_norm": 0.7508547306060791, + "learning_rate": 9.60215503609446e-05, + "loss": 2.8397, + "step": 45176 + }, + { + "epoch": 2.21, + "grad_norm": 0.7319634556770325, + "learning_rate": 9.601026093068098e-05, + "loss": 2.9338, + "step": 45177 + }, + { + "epoch": 2.21, + "grad_norm": 0.6809556484222412, + "learning_rate": 9.5998972037687e-05, + "loss": 2.8386, + "step": 45178 + }, + { + "epoch": 2.21, + "grad_norm": 0.7330763936042786, + "learning_rate": 9.598768368199245e-05, + "loss": 2.6866, + "step": 45179 + }, + { + "epoch": 2.21, + "grad_norm": 0.7155676484107971, + "learning_rate": 9.59763958636272e-05, + "loss": 2.8636, + "step": 45180 + }, + { + "epoch": 2.21, + "grad_norm": 0.6808676719665527, + "learning_rate": 9.596510858262085e-05, + "loss": 2.9295, + "step": 45181 + }, + { + "epoch": 2.21, + "grad_norm": 0.7002042531967163, + "learning_rate": 9.595382183900312e-05, + "loss": 3.0021, + "step": 45182 + }, + { + "epoch": 2.21, + "grad_norm": 0.7323654294013977, + "learning_rate": 9.594253563280364e-05, + "loss": 2.8457, + "step": 45183 + }, + { + "epoch": 2.21, + "grad_norm": 0.677535891532898, + "learning_rate": 9.593124996405228e-05, + "loss": 2.9339, + "step": 45184 + }, + { + "epoch": 2.21, + "grad_norm": 0.7651351094245911, + "learning_rate": 9.591996483277882e-05, + "loss": 2.7807, + "step": 45185 + }, + { + "epoch": 2.21, + "grad_norm": 0.6927827000617981, + "learning_rate": 9.590868023901283e-05, + "loss": 2.9166, + "step": 45186 + }, + { + "epoch": 2.21, + "grad_norm": 0.731319010257721, + "learning_rate": 9.589739618278415e-05, + "loss": 2.9789, + "step": 45187 + }, + { + "epoch": 2.21, + "grad_norm": 0.7109827995300293, + "learning_rate": 9.588611266412247e-05, + "loss": 2.8897, + "step": 45188 + }, + { + "epoch": 2.21, + "grad_norm": 0.7135148048400879, + "learning_rate": 9.58748296830574e-05, + "loss": 2.9604, + "step": 45189 + }, + { + "epoch": 2.21, + "grad_norm": 0.6991270184516907, + "learning_rate": 9.586354723961883e-05, + "loss": 3.079, + "step": 45190 + }, + { + "epoch": 2.21, + "grad_norm": 0.692098081111908, + "learning_rate": 9.585226533383627e-05, + "loss": 2.877, + "step": 45191 + }, + { + "epoch": 2.21, + "grad_norm": 0.7232987880706787, + "learning_rate": 9.584098396573969e-05, + "loss": 3.0033, + "step": 45192 + }, + { + "epoch": 2.21, + "grad_norm": 0.7077509760856628, + "learning_rate": 9.582970313535857e-05, + "loss": 3.0322, + "step": 45193 + }, + { + "epoch": 2.21, + "grad_norm": 0.7513511776924133, + "learning_rate": 9.581842284272279e-05, + "loss": 2.9617, + "step": 45194 + }, + { + "epoch": 2.21, + "grad_norm": 0.7315356135368347, + "learning_rate": 9.58071430878619e-05, + "loss": 3.0344, + "step": 45195 + }, + { + "epoch": 2.21, + "grad_norm": 0.745606005191803, + "learning_rate": 9.57958638708058e-05, + "loss": 2.9354, + "step": 45196 + }, + { + "epoch": 2.21, + "grad_norm": 0.7538384795188904, + "learning_rate": 9.578458519158409e-05, + "loss": 3.0113, + "step": 45197 + }, + { + "epoch": 2.22, + "grad_norm": 0.7272838354110718, + "learning_rate": 9.577330705022637e-05, + "loss": 2.9533, + "step": 45198 + }, + { + "epoch": 2.22, + "grad_norm": 0.6927639842033386, + "learning_rate": 9.576202944676254e-05, + "loss": 3.0826, + "step": 45199 + }, + { + "epoch": 2.22, + "grad_norm": 0.6876212358474731, + "learning_rate": 9.575075238122217e-05, + "loss": 2.7837, + "step": 45200 + }, + { + "epoch": 2.22, + "grad_norm": 0.721787691116333, + "learning_rate": 9.573947585363499e-05, + "loss": 2.7393, + "step": 45201 + }, + { + "epoch": 2.22, + "grad_norm": 0.7061600685119629, + "learning_rate": 9.572819986403079e-05, + "loss": 2.8766, + "step": 45202 + }, + { + "epoch": 2.22, + "grad_norm": 0.7322124242782593, + "learning_rate": 9.571692441243925e-05, + "loss": 2.9802, + "step": 45203 + }, + { + "epoch": 2.22, + "grad_norm": 0.7068262696266174, + "learning_rate": 9.570564949888998e-05, + "loss": 3.033, + "step": 45204 + }, + { + "epoch": 2.22, + "grad_norm": 0.7690457105636597, + "learning_rate": 9.569437512341261e-05, + "loss": 2.9735, + "step": 45205 + }, + { + "epoch": 2.22, + "grad_norm": 0.7033405303955078, + "learning_rate": 9.568310128603696e-05, + "loss": 3.0367, + "step": 45206 + }, + { + "epoch": 2.22, + "grad_norm": 0.7443763613700867, + "learning_rate": 9.567182798679278e-05, + "loss": 2.7348, + "step": 45207 + }, + { + "epoch": 2.22, + "grad_norm": 0.7035591006278992, + "learning_rate": 9.56605552257096e-05, + "loss": 2.8969, + "step": 45208 + }, + { + "epoch": 2.22, + "grad_norm": 0.693112313747406, + "learning_rate": 9.564928300281726e-05, + "loss": 2.8667, + "step": 45209 + }, + { + "epoch": 2.22, + "grad_norm": 0.7365559339523315, + "learning_rate": 9.563801131814539e-05, + "loss": 2.7897, + "step": 45210 + }, + { + "epoch": 2.22, + "grad_norm": 0.7358099818229675, + "learning_rate": 9.56267401717236e-05, + "loss": 3.0661, + "step": 45211 + }, + { + "epoch": 2.22, + "grad_norm": 0.7144678235054016, + "learning_rate": 9.56154695635817e-05, + "loss": 3.0905, + "step": 45212 + }, + { + "epoch": 2.22, + "grad_norm": 0.7409480810165405, + "learning_rate": 9.560419949374927e-05, + "loss": 3.028, + "step": 45213 + }, + { + "epoch": 2.22, + "grad_norm": 0.7363123297691345, + "learning_rate": 9.55929299622561e-05, + "loss": 2.9172, + "step": 45214 + }, + { + "epoch": 2.22, + "grad_norm": 0.7069172263145447, + "learning_rate": 9.558166096913173e-05, + "loss": 2.9742, + "step": 45215 + }, + { + "epoch": 2.22, + "grad_norm": 0.7248828411102295, + "learning_rate": 9.55703925144059e-05, + "loss": 3.0391, + "step": 45216 + }, + { + "epoch": 2.22, + "grad_norm": 0.7333267331123352, + "learning_rate": 9.555912459810845e-05, + "loss": 2.8933, + "step": 45217 + }, + { + "epoch": 2.22, + "grad_norm": 0.6944749355316162, + "learning_rate": 9.554785722026892e-05, + "loss": 2.8961, + "step": 45218 + }, + { + "epoch": 2.22, + "grad_norm": 0.7157734036445618, + "learning_rate": 9.553659038091695e-05, + "loss": 2.8081, + "step": 45219 + }, + { + "epoch": 2.22, + "grad_norm": 0.743134081363678, + "learning_rate": 9.552532408008216e-05, + "loss": 2.7779, + "step": 45220 + }, + { + "epoch": 2.22, + "grad_norm": 0.6870113611221313, + "learning_rate": 9.551405831779431e-05, + "loss": 2.8549, + "step": 45221 + }, + { + "epoch": 2.22, + "grad_norm": 0.7241684198379517, + "learning_rate": 9.550279309408321e-05, + "loss": 2.9022, + "step": 45222 + }, + { + "epoch": 2.22, + "grad_norm": 0.6814631819725037, + "learning_rate": 9.549152840897827e-05, + "loss": 2.82, + "step": 45223 + }, + { + "epoch": 2.22, + "grad_norm": 0.6824125647544861, + "learning_rate": 9.54802642625094e-05, + "loss": 3.0936, + "step": 45224 + }, + { + "epoch": 2.22, + "grad_norm": 0.7024969458580017, + "learning_rate": 9.546900065470615e-05, + "loss": 2.9814, + "step": 45225 + }, + { + "epoch": 2.22, + "grad_norm": 0.7450183629989624, + "learning_rate": 9.545773758559809e-05, + "loss": 2.8562, + "step": 45226 + }, + { + "epoch": 2.22, + "grad_norm": 0.6997722387313843, + "learning_rate": 9.544647505521508e-05, + "loss": 3.0313, + "step": 45227 + }, + { + "epoch": 2.22, + "grad_norm": 0.7680618166923523, + "learning_rate": 9.543521306358657e-05, + "loss": 3.1848, + "step": 45228 + }, + { + "epoch": 2.22, + "grad_norm": 0.7047203779220581, + "learning_rate": 9.542395161074246e-05, + "loss": 2.8618, + "step": 45229 + }, + { + "epoch": 2.22, + "grad_norm": 0.6786448359489441, + "learning_rate": 9.541269069671221e-05, + "loss": 2.9918, + "step": 45230 + }, + { + "epoch": 2.22, + "grad_norm": 0.7172592878341675, + "learning_rate": 9.540143032152564e-05, + "loss": 3.0713, + "step": 45231 + }, + { + "epoch": 2.22, + "grad_norm": 0.7199307680130005, + "learning_rate": 9.539017048521233e-05, + "loss": 3.0211, + "step": 45232 + }, + { + "epoch": 2.22, + "grad_norm": 0.7008538842201233, + "learning_rate": 9.537891118780181e-05, + "loss": 2.8901, + "step": 45233 + }, + { + "epoch": 2.22, + "grad_norm": 0.6756240129470825, + "learning_rate": 9.536765242932396e-05, + "loss": 2.9633, + "step": 45234 + }, + { + "epoch": 2.22, + "grad_norm": 0.6890457272529602, + "learning_rate": 9.535639420980826e-05, + "loss": 2.9048, + "step": 45235 + }, + { + "epoch": 2.22, + "grad_norm": 0.6927510499954224, + "learning_rate": 9.534513652928442e-05, + "loss": 2.8731, + "step": 45236 + }, + { + "epoch": 2.22, + "grad_norm": 0.6908006072044373, + "learning_rate": 9.53338793877822e-05, + "loss": 2.7577, + "step": 45237 + }, + { + "epoch": 2.22, + "grad_norm": 0.7486088871955872, + "learning_rate": 9.532262278533107e-05, + "loss": 2.8153, + "step": 45238 + }, + { + "epoch": 2.22, + "grad_norm": 0.7346619963645935, + "learning_rate": 9.531136672196082e-05, + "loss": 2.8707, + "step": 45239 + }, + { + "epoch": 2.22, + "grad_norm": 0.7260370254516602, + "learning_rate": 9.530011119770107e-05, + "loss": 2.7412, + "step": 45240 + }, + { + "epoch": 2.22, + "grad_norm": 0.743890106678009, + "learning_rate": 9.528885621258134e-05, + "loss": 2.8874, + "step": 45241 + }, + { + "epoch": 2.22, + "grad_norm": 0.7786975502967834, + "learning_rate": 9.527760176663143e-05, + "loss": 2.8397, + "step": 45242 + }, + { + "epoch": 2.22, + "grad_norm": 0.6872302293777466, + "learning_rate": 9.526634785988087e-05, + "loss": 2.9666, + "step": 45243 + }, + { + "epoch": 2.22, + "grad_norm": 0.6927504539489746, + "learning_rate": 9.52550944923594e-05, + "loss": 2.873, + "step": 45244 + }, + { + "epoch": 2.22, + "grad_norm": 0.7176833152770996, + "learning_rate": 9.524384166409653e-05, + "loss": 3.0863, + "step": 45245 + }, + { + "epoch": 2.22, + "grad_norm": 0.71924889087677, + "learning_rate": 9.523258937512204e-05, + "loss": 2.7674, + "step": 45246 + }, + { + "epoch": 2.22, + "grad_norm": 0.6952587962150574, + "learning_rate": 9.522133762546555e-05, + "loss": 2.9272, + "step": 45247 + }, + { + "epoch": 2.22, + "grad_norm": 0.7094781398773193, + "learning_rate": 9.521008641515653e-05, + "loss": 2.8632, + "step": 45248 + }, + { + "epoch": 2.22, + "grad_norm": 0.7094677686691284, + "learning_rate": 9.519883574422481e-05, + "loss": 2.9484, + "step": 45249 + }, + { + "epoch": 2.22, + "grad_norm": 0.6779662370681763, + "learning_rate": 9.518758561269988e-05, + "loss": 2.7069, + "step": 45250 + }, + { + "epoch": 2.22, + "grad_norm": 0.6990089416503906, + "learning_rate": 9.517633602061141e-05, + "loss": 3.1154, + "step": 45251 + }, + { + "epoch": 2.22, + "grad_norm": 0.7468245625495911, + "learning_rate": 9.516508696798914e-05, + "loss": 3.0644, + "step": 45252 + }, + { + "epoch": 2.22, + "grad_norm": 0.7292611598968506, + "learning_rate": 9.515383845486263e-05, + "loss": 3.1084, + "step": 45253 + }, + { + "epoch": 2.22, + "grad_norm": 0.6730436682701111, + "learning_rate": 9.514259048126147e-05, + "loss": 2.9853, + "step": 45254 + }, + { + "epoch": 2.22, + "grad_norm": 0.6685360074043274, + "learning_rate": 9.513134304721521e-05, + "loss": 2.9323, + "step": 45255 + }, + { + "epoch": 2.22, + "grad_norm": 0.7453835606575012, + "learning_rate": 9.512009615275356e-05, + "loss": 2.8677, + "step": 45256 + }, + { + "epoch": 2.22, + "grad_norm": 0.7116197347640991, + "learning_rate": 9.510884979790624e-05, + "loss": 2.7648, + "step": 45257 + }, + { + "epoch": 2.22, + "grad_norm": 0.7128415703773499, + "learning_rate": 9.50976039827027e-05, + "loss": 2.8006, + "step": 45258 + }, + { + "epoch": 2.22, + "grad_norm": 0.6778522729873657, + "learning_rate": 9.508635870717272e-05, + "loss": 3.0609, + "step": 45259 + }, + { + "epoch": 2.22, + "grad_norm": 0.6794968843460083, + "learning_rate": 9.507511397134574e-05, + "loss": 2.8762, + "step": 45260 + }, + { + "epoch": 2.22, + "grad_norm": 0.6920288801193237, + "learning_rate": 9.506386977525156e-05, + "loss": 2.7519, + "step": 45261 + }, + { + "epoch": 2.22, + "grad_norm": 0.7245926856994629, + "learning_rate": 9.50526261189197e-05, + "loss": 2.9135, + "step": 45262 + }, + { + "epoch": 2.22, + "grad_norm": 0.7115115523338318, + "learning_rate": 9.50413830023797e-05, + "loss": 3.035, + "step": 45263 + }, + { + "epoch": 2.22, + "grad_norm": 0.6681340336799622, + "learning_rate": 9.503014042566135e-05, + "loss": 2.8618, + "step": 45264 + }, + { + "epoch": 2.22, + "grad_norm": 0.6481229662895203, + "learning_rate": 9.501889838879407e-05, + "loss": 2.9743, + "step": 45265 + }, + { + "epoch": 2.22, + "grad_norm": 0.7262416481971741, + "learning_rate": 9.500765689180754e-05, + "loss": 2.9994, + "step": 45266 + }, + { + "epoch": 2.22, + "grad_norm": 0.7244082093238831, + "learning_rate": 9.499641593473152e-05, + "loss": 2.7771, + "step": 45267 + }, + { + "epoch": 2.22, + "grad_norm": 0.7363994121551514, + "learning_rate": 9.498517551759548e-05, + "loss": 3.0425, + "step": 45268 + }, + { + "epoch": 2.22, + "grad_norm": 0.7266939878463745, + "learning_rate": 9.497393564042901e-05, + "loss": 2.9354, + "step": 45269 + }, + { + "epoch": 2.22, + "grad_norm": 0.7804268002510071, + "learning_rate": 9.496269630326165e-05, + "loss": 2.9353, + "step": 45270 + }, + { + "epoch": 2.22, + "grad_norm": 0.6953659653663635, + "learning_rate": 9.495145750612319e-05, + "loss": 3.292, + "step": 45271 + }, + { + "epoch": 2.22, + "grad_norm": 0.6974464654922485, + "learning_rate": 9.494021924904302e-05, + "loss": 2.9793, + "step": 45272 + }, + { + "epoch": 2.22, + "grad_norm": 0.7132477760314941, + "learning_rate": 9.492898153205085e-05, + "loss": 2.9696, + "step": 45273 + }, + { + "epoch": 2.22, + "grad_norm": 0.6814945340156555, + "learning_rate": 9.491774435517637e-05, + "loss": 2.5446, + "step": 45274 + }, + { + "epoch": 2.22, + "grad_norm": 0.6983672976493835, + "learning_rate": 9.490650771844899e-05, + "loss": 2.9157, + "step": 45275 + }, + { + "epoch": 2.22, + "grad_norm": 0.7214673161506653, + "learning_rate": 9.48952716218986e-05, + "loss": 2.794, + "step": 45276 + }, + { + "epoch": 2.22, + "grad_norm": 0.7465963363647461, + "learning_rate": 9.48840360655544e-05, + "loss": 2.991, + "step": 45277 + }, + { + "epoch": 2.22, + "grad_norm": 0.7039017677307129, + "learning_rate": 9.487280104944616e-05, + "loss": 2.845, + "step": 45278 + }, + { + "epoch": 2.22, + "grad_norm": 0.737330436706543, + "learning_rate": 9.486156657360357e-05, + "loss": 2.8189, + "step": 45279 + }, + { + "epoch": 2.22, + "grad_norm": 0.6810418367385864, + "learning_rate": 9.485033263805607e-05, + "loss": 3.0818, + "step": 45280 + }, + { + "epoch": 2.22, + "grad_norm": 0.6977468729019165, + "learning_rate": 9.483909924283338e-05, + "loss": 2.782, + "step": 45281 + }, + { + "epoch": 2.22, + "grad_norm": 0.7160059213638306, + "learning_rate": 9.482786638796496e-05, + "loss": 2.9945, + "step": 45282 + }, + { + "epoch": 2.22, + "grad_norm": 0.7127458453178406, + "learning_rate": 9.48166340734805e-05, + "loss": 2.7842, + "step": 45283 + }, + { + "epoch": 2.22, + "grad_norm": 0.6878350973129272, + "learning_rate": 9.480540229940956e-05, + "loss": 2.7996, + "step": 45284 + }, + { + "epoch": 2.22, + "grad_norm": 0.6775646209716797, + "learning_rate": 9.47941710657816e-05, + "loss": 2.926, + "step": 45285 + }, + { + "epoch": 2.22, + "grad_norm": 0.7190422415733337, + "learning_rate": 9.478294037262642e-05, + "loss": 2.9772, + "step": 45286 + }, + { + "epoch": 2.22, + "grad_norm": 0.6985235214233398, + "learning_rate": 9.477171021997335e-05, + "loss": 3.0118, + "step": 45287 + }, + { + "epoch": 2.22, + "grad_norm": 0.7282534241676331, + "learning_rate": 9.476048060785213e-05, + "loss": 2.8941, + "step": 45288 + }, + { + "epoch": 2.22, + "grad_norm": 0.7488526701927185, + "learning_rate": 9.474925153629238e-05, + "loss": 3.0031, + "step": 45289 + }, + { + "epoch": 2.22, + "grad_norm": 0.7149317860603333, + "learning_rate": 9.473802300532361e-05, + "loss": 3.0255, + "step": 45290 + }, + { + "epoch": 2.22, + "grad_norm": 0.6947160959243774, + "learning_rate": 9.472679501497538e-05, + "loss": 2.9155, + "step": 45291 + }, + { + "epoch": 2.22, + "grad_norm": 0.7043709754943848, + "learning_rate": 9.471556756527716e-05, + "loss": 2.9663, + "step": 45292 + }, + { + "epoch": 2.22, + "grad_norm": 0.7300408482551575, + "learning_rate": 9.470434065625865e-05, + "loss": 2.7279, + "step": 45293 + }, + { + "epoch": 2.22, + "grad_norm": 0.6901246309280396, + "learning_rate": 9.469311428794948e-05, + "loss": 2.9823, + "step": 45294 + }, + { + "epoch": 2.22, + "grad_norm": 0.719524621963501, + "learning_rate": 9.468188846037904e-05, + "loss": 3.1144, + "step": 45295 + }, + { + "epoch": 2.22, + "grad_norm": 0.7352761030197144, + "learning_rate": 9.467066317357707e-05, + "loss": 2.8261, + "step": 45296 + }, + { + "epoch": 2.22, + "grad_norm": 0.7071006894111633, + "learning_rate": 9.4659438427573e-05, + "loss": 3.0008, + "step": 45297 + }, + { + "epoch": 2.22, + "grad_norm": 0.7212498188018799, + "learning_rate": 9.46482142223965e-05, + "loss": 2.8313, + "step": 45298 + }, + { + "epoch": 2.22, + "grad_norm": 0.7842340469360352, + "learning_rate": 9.463699055807713e-05, + "loss": 2.8263, + "step": 45299 + }, + { + "epoch": 2.22, + "grad_norm": 0.6890537142753601, + "learning_rate": 9.462576743464426e-05, + "loss": 2.9741, + "step": 45300 + }, + { + "epoch": 2.22, + "grad_norm": 0.7267663478851318, + "learning_rate": 9.461454485212774e-05, + "loss": 2.9529, + "step": 45301 + }, + { + "epoch": 2.22, + "grad_norm": 0.7003971934318542, + "learning_rate": 9.460332281055684e-05, + "loss": 2.9145, + "step": 45302 + }, + { + "epoch": 2.22, + "grad_norm": 0.7068154215812683, + "learning_rate": 9.45921013099613e-05, + "loss": 2.8808, + "step": 45303 + }, + { + "epoch": 2.22, + "grad_norm": 0.7120148539543152, + "learning_rate": 9.45808803503707e-05, + "loss": 2.8749, + "step": 45304 + }, + { + "epoch": 2.22, + "grad_norm": 0.6946321725845337, + "learning_rate": 9.456965993181453e-05, + "loss": 2.9578, + "step": 45305 + }, + { + "epoch": 2.22, + "grad_norm": 0.699981689453125, + "learning_rate": 9.455844005432234e-05, + "loss": 2.8174, + "step": 45306 + }, + { + "epoch": 2.22, + "grad_norm": 0.6788482666015625, + "learning_rate": 9.45472207179236e-05, + "loss": 2.8667, + "step": 45307 + }, + { + "epoch": 2.22, + "grad_norm": 0.7016308307647705, + "learning_rate": 9.453600192264792e-05, + "loss": 2.7427, + "step": 45308 + }, + { + "epoch": 2.22, + "grad_norm": 0.7483291625976562, + "learning_rate": 9.452478366852498e-05, + "loss": 2.8937, + "step": 45309 + }, + { + "epoch": 2.22, + "grad_norm": 0.6895776987075806, + "learning_rate": 9.451356595558412e-05, + "loss": 2.9125, + "step": 45310 + }, + { + "epoch": 2.22, + "grad_norm": 0.7158215641975403, + "learning_rate": 9.450234878385505e-05, + "loss": 2.9937, + "step": 45311 + }, + { + "epoch": 2.22, + "grad_norm": 0.7149104475975037, + "learning_rate": 9.449113215336727e-05, + "loss": 2.8569, + "step": 45312 + }, + { + "epoch": 2.22, + "grad_norm": 0.7052308320999146, + "learning_rate": 9.447991606415021e-05, + "loss": 3.0051, + "step": 45313 + }, + { + "epoch": 2.22, + "grad_norm": 0.7073119878768921, + "learning_rate": 9.446870051623358e-05, + "loss": 2.9697, + "step": 45314 + }, + { + "epoch": 2.22, + "grad_norm": 0.7047192454338074, + "learning_rate": 9.445748550964675e-05, + "loss": 3.0161, + "step": 45315 + }, + { + "epoch": 2.22, + "grad_norm": 0.7001875042915344, + "learning_rate": 9.444627104441944e-05, + "loss": 2.9102, + "step": 45316 + }, + { + "epoch": 2.22, + "grad_norm": 0.6942550539970398, + "learning_rate": 9.443505712058098e-05, + "loss": 3.0716, + "step": 45317 + }, + { + "epoch": 2.22, + "grad_norm": 0.6810586452484131, + "learning_rate": 9.4423843738161e-05, + "loss": 2.7442, + "step": 45318 + }, + { + "epoch": 2.22, + "grad_norm": 0.688390851020813, + "learning_rate": 9.441263089718919e-05, + "loss": 2.7613, + "step": 45319 + }, + { + "epoch": 2.22, + "grad_norm": 0.7119620442390442, + "learning_rate": 9.440141859769494e-05, + "loss": 3.0136, + "step": 45320 + }, + { + "epoch": 2.22, + "grad_norm": 0.7164089679718018, + "learning_rate": 9.439020683970778e-05, + "loss": 2.7685, + "step": 45321 + }, + { + "epoch": 2.22, + "grad_norm": 0.7110692858695984, + "learning_rate": 9.437899562325715e-05, + "loss": 2.7599, + "step": 45322 + }, + { + "epoch": 2.22, + "grad_norm": 0.8134690523147583, + "learning_rate": 9.436778494837264e-05, + "loss": 2.9939, + "step": 45323 + }, + { + "epoch": 2.22, + "grad_norm": 0.7323058843612671, + "learning_rate": 9.435657481508393e-05, + "loss": 2.9119, + "step": 45324 + }, + { + "epoch": 2.22, + "grad_norm": 0.6736330389976501, + "learning_rate": 9.434536522342033e-05, + "loss": 3.0363, + "step": 45325 + }, + { + "epoch": 2.22, + "grad_norm": 0.710964560508728, + "learning_rate": 9.433415617341156e-05, + "loss": 2.7434, + "step": 45326 + }, + { + "epoch": 2.22, + "grad_norm": 0.6646226644515991, + "learning_rate": 9.432294766508705e-05, + "loss": 3.0675, + "step": 45327 + }, + { + "epoch": 2.22, + "grad_norm": 0.7395647168159485, + "learning_rate": 9.43117396984762e-05, + "loss": 2.9635, + "step": 45328 + }, + { + "epoch": 2.22, + "grad_norm": 0.691923975944519, + "learning_rate": 9.430053227360875e-05, + "loss": 2.7976, + "step": 45329 + }, + { + "epoch": 2.22, + "grad_norm": 0.6718361377716064, + "learning_rate": 9.428932539051399e-05, + "loss": 2.9561, + "step": 45330 + }, + { + "epoch": 2.22, + "grad_norm": 0.7138594388961792, + "learning_rate": 9.427811904922167e-05, + "loss": 3.0096, + "step": 45331 + }, + { + "epoch": 2.22, + "grad_norm": 0.7084429860115051, + "learning_rate": 9.426691324976113e-05, + "loss": 2.8364, + "step": 45332 + }, + { + "epoch": 2.22, + "grad_norm": 0.7155402898788452, + "learning_rate": 9.4255707992162e-05, + "loss": 3.0008, + "step": 45333 + }, + { + "epoch": 2.22, + "grad_norm": 0.7018200755119324, + "learning_rate": 9.424450327645374e-05, + "loss": 2.7881, + "step": 45334 + }, + { + "epoch": 2.22, + "grad_norm": 0.7858231663703918, + "learning_rate": 9.423329910266576e-05, + "loss": 2.9398, + "step": 45335 + }, + { + "epoch": 2.22, + "grad_norm": 0.69808030128479, + "learning_rate": 9.42220954708278e-05, + "loss": 2.9118, + "step": 45336 + }, + { + "epoch": 2.22, + "grad_norm": 0.7413972020149231, + "learning_rate": 9.42108923809691e-05, + "loss": 2.9831, + "step": 45337 + }, + { + "epoch": 2.22, + "grad_norm": 0.7176880836486816, + "learning_rate": 9.419968983311934e-05, + "loss": 2.7619, + "step": 45338 + }, + { + "epoch": 2.22, + "grad_norm": 0.7590557336807251, + "learning_rate": 9.418848782730805e-05, + "loss": 3.0727, + "step": 45339 + }, + { + "epoch": 2.22, + "grad_norm": 0.6953451633453369, + "learning_rate": 9.417728636356459e-05, + "loss": 3.0103, + "step": 45340 + }, + { + "epoch": 2.22, + "grad_norm": 0.7350430488586426, + "learning_rate": 9.416608544191866e-05, + "loss": 3.0276, + "step": 45341 + }, + { + "epoch": 2.22, + "grad_norm": 0.6567845344543457, + "learning_rate": 9.415488506239964e-05, + "loss": 2.8636, + "step": 45342 + }, + { + "epoch": 2.22, + "grad_norm": 0.7473620772361755, + "learning_rate": 9.414368522503695e-05, + "loss": 2.96, + "step": 45343 + }, + { + "epoch": 2.22, + "grad_norm": 0.7204223871231079, + "learning_rate": 9.413248592986025e-05, + "loss": 3.0137, + "step": 45344 + }, + { + "epoch": 2.22, + "grad_norm": 0.7221093773841858, + "learning_rate": 9.412128717689888e-05, + "loss": 3.0327, + "step": 45345 + }, + { + "epoch": 2.22, + "grad_norm": 0.7267528176307678, + "learning_rate": 9.41100889661825e-05, + "loss": 2.7521, + "step": 45346 + }, + { + "epoch": 2.22, + "grad_norm": 0.7274415493011475, + "learning_rate": 9.409889129774043e-05, + "loss": 2.9669, + "step": 45347 + }, + { + "epoch": 2.22, + "grad_norm": 0.7493104934692383, + "learning_rate": 9.408769417160237e-05, + "loss": 2.8927, + "step": 45348 + }, + { + "epoch": 2.22, + "grad_norm": 0.7367200255393982, + "learning_rate": 9.407649758779766e-05, + "loss": 3.0401, + "step": 45349 + }, + { + "epoch": 2.22, + "grad_norm": 0.6605836749076843, + "learning_rate": 9.406530154635574e-05, + "loss": 2.9622, + "step": 45350 + }, + { + "epoch": 2.22, + "grad_norm": 0.7259601950645447, + "learning_rate": 9.405410604730629e-05, + "loss": 3.0617, + "step": 45351 + }, + { + "epoch": 2.22, + "grad_norm": 1.0172241926193237, + "learning_rate": 9.40429110906786e-05, + "loss": 2.8292, + "step": 45352 + }, + { + "epoch": 2.22, + "grad_norm": 0.6789811849594116, + "learning_rate": 9.403171667650235e-05, + "loss": 3.0832, + "step": 45353 + }, + { + "epoch": 2.22, + "grad_norm": 0.7042773962020874, + "learning_rate": 9.402052280480677e-05, + "loss": 2.8346, + "step": 45354 + }, + { + "epoch": 2.22, + "grad_norm": 0.7092491388320923, + "learning_rate": 9.400932947562165e-05, + "loss": 2.7202, + "step": 45355 + }, + { + "epoch": 2.22, + "grad_norm": 0.7057832479476929, + "learning_rate": 9.399813668897625e-05, + "loss": 3.0595, + "step": 45356 + }, + { + "epoch": 2.22, + "grad_norm": 0.751794159412384, + "learning_rate": 9.398694444490006e-05, + "loss": 2.8588, + "step": 45357 + }, + { + "epoch": 2.22, + "grad_norm": 0.7302895784378052, + "learning_rate": 9.39757527434227e-05, + "loss": 2.9267, + "step": 45358 + }, + { + "epoch": 2.22, + "grad_norm": 0.6791434288024902, + "learning_rate": 9.396456158457348e-05, + "loss": 2.9432, + "step": 45359 + }, + { + "epoch": 2.22, + "grad_norm": 0.7375596761703491, + "learning_rate": 9.395337096838191e-05, + "loss": 2.9125, + "step": 45360 + }, + { + "epoch": 2.22, + "grad_norm": 0.7245837450027466, + "learning_rate": 9.394218089487763e-05, + "loss": 2.8705, + "step": 45361 + }, + { + "epoch": 2.22, + "grad_norm": 0.7496141195297241, + "learning_rate": 9.393099136408989e-05, + "loss": 2.8422, + "step": 45362 + }, + { + "epoch": 2.22, + "grad_norm": 0.7486103773117065, + "learning_rate": 9.391980237604834e-05, + "loss": 2.9145, + "step": 45363 + }, + { + "epoch": 2.22, + "grad_norm": 0.6848008632659912, + "learning_rate": 9.390861393078237e-05, + "loss": 2.8837, + "step": 45364 + }, + { + "epoch": 2.22, + "grad_norm": 0.70891273021698, + "learning_rate": 9.389742602832137e-05, + "loss": 2.8657, + "step": 45365 + }, + { + "epoch": 2.22, + "grad_norm": 0.7543848752975464, + "learning_rate": 9.388623866869498e-05, + "loss": 3.0147, + "step": 45366 + }, + { + "epoch": 2.22, + "grad_norm": 0.7381017804145813, + "learning_rate": 9.387505185193246e-05, + "loss": 2.9008, + "step": 45367 + }, + { + "epoch": 2.22, + "grad_norm": 0.7030778527259827, + "learning_rate": 9.38638655780635e-05, + "loss": 2.7483, + "step": 45368 + }, + { + "epoch": 2.22, + "grad_norm": 0.6804453730583191, + "learning_rate": 9.385267984711733e-05, + "loss": 2.8687, + "step": 45369 + }, + { + "epoch": 2.22, + "grad_norm": 0.7367046475410461, + "learning_rate": 9.384149465912362e-05, + "loss": 2.8755, + "step": 45370 + }, + { + "epoch": 2.22, + "grad_norm": 0.7181659936904907, + "learning_rate": 9.383031001411177e-05, + "loss": 3.0086, + "step": 45371 + }, + { + "epoch": 2.22, + "grad_norm": 0.7256984114646912, + "learning_rate": 9.381912591211108e-05, + "loss": 3.0784, + "step": 45372 + }, + { + "epoch": 2.22, + "grad_norm": 0.6880785226821899, + "learning_rate": 9.380794235315125e-05, + "loss": 3.0513, + "step": 45373 + }, + { + "epoch": 2.22, + "grad_norm": 0.7282678484916687, + "learning_rate": 9.379675933726149e-05, + "loss": 2.742, + "step": 45374 + }, + { + "epoch": 2.22, + "grad_norm": 0.7389535307884216, + "learning_rate": 9.378557686447138e-05, + "loss": 2.7666, + "step": 45375 + }, + { + "epoch": 2.22, + "grad_norm": 0.6855918169021606, + "learning_rate": 9.377439493481051e-05, + "loss": 2.9682, + "step": 45376 + }, + { + "epoch": 2.22, + "grad_norm": 0.6976229548454285, + "learning_rate": 9.376321354830815e-05, + "loss": 3.0365, + "step": 45377 + }, + { + "epoch": 2.22, + "grad_norm": 0.6974002122879028, + "learning_rate": 9.375203270499382e-05, + "loss": 2.7856, + "step": 45378 + }, + { + "epoch": 2.22, + "grad_norm": 0.7219595313072205, + "learning_rate": 9.374085240489686e-05, + "loss": 3.002, + "step": 45379 + }, + { + "epoch": 2.22, + "grad_norm": 0.7123830318450928, + "learning_rate": 9.372967264804678e-05, + "loss": 2.9279, + "step": 45380 + }, + { + "epoch": 2.22, + "grad_norm": 0.7480422258377075, + "learning_rate": 9.371849343447313e-05, + "loss": 2.6871, + "step": 45381 + }, + { + "epoch": 2.22, + "grad_norm": 0.7664074897766113, + "learning_rate": 9.370731476420521e-05, + "loss": 2.9509, + "step": 45382 + }, + { + "epoch": 2.22, + "grad_norm": 0.7201911211013794, + "learning_rate": 9.369613663727257e-05, + "loss": 2.9775, + "step": 45383 + }, + { + "epoch": 2.22, + "grad_norm": 0.7252894639968872, + "learning_rate": 9.368495905370454e-05, + "loss": 2.9017, + "step": 45384 + }, + { + "epoch": 2.22, + "grad_norm": 0.7937513589859009, + "learning_rate": 9.367378201353072e-05, + "loss": 3.3099, + "step": 45385 + }, + { + "epoch": 2.22, + "grad_norm": 0.7466810345649719, + "learning_rate": 9.366260551678043e-05, + "loss": 2.7903, + "step": 45386 + }, + { + "epoch": 2.22, + "grad_norm": 0.7079269289970398, + "learning_rate": 9.365142956348303e-05, + "loss": 2.878, + "step": 45387 + }, + { + "epoch": 2.22, + "grad_norm": 0.75801682472229, + "learning_rate": 9.364025415366815e-05, + "loss": 2.7751, + "step": 45388 + }, + { + "epoch": 2.22, + "grad_norm": 0.7508972883224487, + "learning_rate": 9.362907928736504e-05, + "loss": 2.8401, + "step": 45389 + }, + { + "epoch": 2.22, + "grad_norm": 0.6906934380531311, + "learning_rate": 9.361790496460321e-05, + "loss": 2.8999, + "step": 45390 + }, + { + "epoch": 2.22, + "grad_norm": 0.7488749027252197, + "learning_rate": 9.36067311854122e-05, + "loss": 2.8891, + "step": 45391 + }, + { + "epoch": 2.22, + "grad_norm": 0.7199647426605225, + "learning_rate": 9.359555794982136e-05, + "loss": 2.8431, + "step": 45392 + }, + { + "epoch": 2.22, + "grad_norm": 0.7045316100120544, + "learning_rate": 9.358438525786006e-05, + "loss": 2.8693, + "step": 45393 + }, + { + "epoch": 2.22, + "grad_norm": 0.7193567156791687, + "learning_rate": 9.357321310955766e-05, + "loss": 3.1406, + "step": 45394 + }, + { + "epoch": 2.22, + "grad_norm": 0.6742414832115173, + "learning_rate": 9.356204150494374e-05, + "loss": 2.8068, + "step": 45395 + }, + { + "epoch": 2.22, + "grad_norm": 0.7182538509368896, + "learning_rate": 9.355087044404772e-05, + "loss": 3.1212, + "step": 45396 + }, + { + "epoch": 2.22, + "grad_norm": 0.7040326595306396, + "learning_rate": 9.353969992689891e-05, + "loss": 3.1036, + "step": 45397 + }, + { + "epoch": 2.22, + "grad_norm": 0.7186934947967529, + "learning_rate": 9.352852995352689e-05, + "loss": 2.7082, + "step": 45398 + }, + { + "epoch": 2.22, + "grad_norm": 0.7571741342544556, + "learning_rate": 9.351736052396088e-05, + "loss": 3.0488, + "step": 45399 + }, + { + "epoch": 2.22, + "grad_norm": 0.7321252822875977, + "learning_rate": 9.350619163823051e-05, + "loss": 2.9676, + "step": 45400 + }, + { + "epoch": 2.22, + "grad_norm": 0.7001104950904846, + "learning_rate": 9.349502329636505e-05, + "loss": 2.819, + "step": 45401 + }, + { + "epoch": 2.23, + "grad_norm": 0.718051016330719, + "learning_rate": 9.34838554983939e-05, + "loss": 2.897, + "step": 45402 + }, + { + "epoch": 2.23, + "grad_norm": 0.7028896808624268, + "learning_rate": 9.347268824434662e-05, + "loss": 2.8919, + "step": 45403 + }, + { + "epoch": 2.23, + "grad_norm": 0.7708454728126526, + "learning_rate": 9.346152153425245e-05, + "loss": 2.753, + "step": 45404 + }, + { + "epoch": 2.23, + "grad_norm": 0.712142288684845, + "learning_rate": 9.345035536814087e-05, + "loss": 2.8239, + "step": 45405 + }, + { + "epoch": 2.23, + "grad_norm": 0.7452940344810486, + "learning_rate": 9.34391897460414e-05, + "loss": 2.6923, + "step": 45406 + }, + { + "epoch": 2.23, + "grad_norm": 0.7222115993499756, + "learning_rate": 9.342802466798334e-05, + "loss": 2.8162, + "step": 45407 + }, + { + "epoch": 2.23, + "grad_norm": 0.7443071603775024, + "learning_rate": 9.341686013399609e-05, + "loss": 3.0949, + "step": 45408 + }, + { + "epoch": 2.23, + "grad_norm": 0.7208560109138489, + "learning_rate": 9.340569614410902e-05, + "loss": 3.0639, + "step": 45409 + }, + { + "epoch": 2.23, + "grad_norm": 0.7204750180244446, + "learning_rate": 9.339453269835158e-05, + "loss": 2.9431, + "step": 45410 + }, + { + "epoch": 2.23, + "grad_norm": 0.6994853615760803, + "learning_rate": 9.338336979675323e-05, + "loss": 2.8151, + "step": 45411 + }, + { + "epoch": 2.23, + "grad_norm": 0.6930583119392395, + "learning_rate": 9.337220743934328e-05, + "loss": 3.2522, + "step": 45412 + }, + { + "epoch": 2.23, + "grad_norm": 0.783711314201355, + "learning_rate": 9.336104562615123e-05, + "loss": 2.9764, + "step": 45413 + }, + { + "epoch": 2.23, + "grad_norm": 0.766595721244812, + "learning_rate": 9.334988435720643e-05, + "loss": 2.9608, + "step": 45414 + }, + { + "epoch": 2.23, + "grad_norm": 0.7293518781661987, + "learning_rate": 9.333872363253817e-05, + "loss": 3.0567, + "step": 45415 + }, + { + "epoch": 2.23, + "grad_norm": 0.6588782668113708, + "learning_rate": 9.332756345217604e-05, + "loss": 3.0559, + "step": 45416 + }, + { + "epoch": 2.23, + "grad_norm": 0.7530074715614319, + "learning_rate": 9.331640381614923e-05, + "loss": 2.9012, + "step": 45417 + }, + { + "epoch": 2.23, + "grad_norm": 0.740515410900116, + "learning_rate": 9.330524472448735e-05, + "loss": 2.8191, + "step": 45418 + }, + { + "epoch": 2.23, + "grad_norm": 0.7402000427246094, + "learning_rate": 9.329408617721953e-05, + "loss": 2.8501, + "step": 45419 + }, + { + "epoch": 2.23, + "grad_norm": 0.7017174959182739, + "learning_rate": 9.328292817437535e-05, + "loss": 3.007, + "step": 45420 + }, + { + "epoch": 2.23, + "grad_norm": 0.7643221616744995, + "learning_rate": 9.327177071598425e-05, + "loss": 3.0573, + "step": 45421 + }, + { + "epoch": 2.23, + "grad_norm": 0.7307689785957336, + "learning_rate": 9.326061380207548e-05, + "loss": 3.0476, + "step": 45422 + }, + { + "epoch": 2.23, + "grad_norm": 0.7625921368598938, + "learning_rate": 9.324945743267852e-05, + "loss": 2.6859, + "step": 45423 + }, + { + "epoch": 2.23, + "grad_norm": 0.7725873589515686, + "learning_rate": 9.323830160782256e-05, + "loss": 2.987, + "step": 45424 + }, + { + "epoch": 2.23, + "grad_norm": 0.7039408683776855, + "learning_rate": 9.322714632753721e-05, + "loss": 3.0487, + "step": 45425 + }, + { + "epoch": 2.23, + "grad_norm": 0.725426435470581, + "learning_rate": 9.321599159185171e-05, + "loss": 2.9296, + "step": 45426 + }, + { + "epoch": 2.23, + "grad_norm": 0.7159402966499329, + "learning_rate": 9.320483740079549e-05, + "loss": 2.8903, + "step": 45427 + }, + { + "epoch": 2.23, + "grad_norm": 0.7357923984527588, + "learning_rate": 9.319368375439798e-05, + "loss": 2.6201, + "step": 45428 + }, + { + "epoch": 2.23, + "grad_norm": 0.7285069227218628, + "learning_rate": 9.318253065268856e-05, + "loss": 2.908, + "step": 45429 + }, + { + "epoch": 2.23, + "grad_norm": 0.7228646278381348, + "learning_rate": 9.317137809569651e-05, + "loss": 3.1464, + "step": 45430 + }, + { + "epoch": 2.23, + "grad_norm": 0.6842017769813538, + "learning_rate": 9.316022608345115e-05, + "loss": 3.0632, + "step": 45431 + }, + { + "epoch": 2.23, + "grad_norm": 0.7215100526809692, + "learning_rate": 9.314907461598198e-05, + "loss": 2.8692, + "step": 45432 + }, + { + "epoch": 2.23, + "grad_norm": 0.6963567137718201, + "learning_rate": 9.31379236933184e-05, + "loss": 2.8567, + "step": 45433 + }, + { + "epoch": 2.23, + "grad_norm": 0.6841099262237549, + "learning_rate": 9.312677331548965e-05, + "loss": 2.6787, + "step": 45434 + }, + { + "epoch": 2.23, + "grad_norm": 0.6942437291145325, + "learning_rate": 9.311562348252523e-05, + "loss": 3.0439, + "step": 45435 + }, + { + "epoch": 2.23, + "grad_norm": 0.7410614490509033, + "learning_rate": 9.310447419445446e-05, + "loss": 2.9797, + "step": 45436 + }, + { + "epoch": 2.23, + "grad_norm": 0.685966432094574, + "learning_rate": 9.30933254513066e-05, + "loss": 2.8331, + "step": 45437 + }, + { + "epoch": 2.23, + "grad_norm": 0.7045655846595764, + "learning_rate": 9.30821772531112e-05, + "loss": 2.9347, + "step": 45438 + }, + { + "epoch": 2.23, + "grad_norm": 0.6830521821975708, + "learning_rate": 9.307102959989741e-05, + "loss": 2.9615, + "step": 45439 + }, + { + "epoch": 2.23, + "grad_norm": 0.6648380756378174, + "learning_rate": 9.305988249169481e-05, + "loss": 3.034, + "step": 45440 + }, + { + "epoch": 2.23, + "grad_norm": 0.7092429995536804, + "learning_rate": 9.304873592853253e-05, + "loss": 2.9286, + "step": 45441 + }, + { + "epoch": 2.23, + "grad_norm": 0.7085046172142029, + "learning_rate": 9.303758991044007e-05, + "loss": 2.8802, + "step": 45442 + }, + { + "epoch": 2.23, + "grad_norm": 0.7178999781608582, + "learning_rate": 9.302644443744689e-05, + "loss": 2.7642, + "step": 45443 + }, + { + "epoch": 2.23, + "grad_norm": 0.7461972236633301, + "learning_rate": 9.301529950958218e-05, + "loss": 2.9643, + "step": 45444 + }, + { + "epoch": 2.23, + "grad_norm": 0.7783269286155701, + "learning_rate": 9.300415512687538e-05, + "loss": 3.0405, + "step": 45445 + }, + { + "epoch": 2.23, + "grad_norm": 0.7068384885787964, + "learning_rate": 9.299301128935568e-05, + "loss": 2.7719, + "step": 45446 + }, + { + "epoch": 2.23, + "grad_norm": 0.7588903903961182, + "learning_rate": 9.298186799705251e-05, + "loss": 2.9524, + "step": 45447 + }, + { + "epoch": 2.23, + "grad_norm": 0.7695575952529907, + "learning_rate": 9.297072524999541e-05, + "loss": 2.719, + "step": 45448 + }, + { + "epoch": 2.23, + "grad_norm": 0.6897878646850586, + "learning_rate": 9.295958304821344e-05, + "loss": 2.7618, + "step": 45449 + }, + { + "epoch": 2.23, + "grad_norm": 0.7157047390937805, + "learning_rate": 9.294844139173622e-05, + "loss": 2.966, + "step": 45450 + }, + { + "epoch": 2.23, + "grad_norm": 0.7025874257087708, + "learning_rate": 9.293730028059295e-05, + "loss": 3.0808, + "step": 45451 + }, + { + "epoch": 2.23, + "grad_norm": 0.6616024374961853, + "learning_rate": 9.292615971481287e-05, + "loss": 3.149, + "step": 45452 + }, + { + "epoch": 2.23, + "grad_norm": 0.6998037099838257, + "learning_rate": 9.291501969442551e-05, + "loss": 2.9832, + "step": 45453 + }, + { + "epoch": 2.23, + "grad_norm": 0.7034969925880432, + "learning_rate": 9.290388021946005e-05, + "loss": 2.8652, + "step": 45454 + }, + { + "epoch": 2.23, + "grad_norm": 0.6883959174156189, + "learning_rate": 9.289274128994603e-05, + "loss": 2.9851, + "step": 45455 + }, + { + "epoch": 2.23, + "grad_norm": 0.6983670592308044, + "learning_rate": 9.288160290591255e-05, + "loss": 2.6393, + "step": 45456 + }, + { + "epoch": 2.23, + "grad_norm": 0.7119463086128235, + "learning_rate": 9.287046506738919e-05, + "loss": 2.8454, + "step": 45457 + }, + { + "epoch": 2.23, + "grad_norm": 0.6975045204162598, + "learning_rate": 9.285932777440517e-05, + "loss": 2.897, + "step": 45458 + }, + { + "epoch": 2.23, + "grad_norm": 0.7082361578941345, + "learning_rate": 9.284819102698967e-05, + "loss": 2.7897, + "step": 45459 + }, + { + "epoch": 2.23, + "grad_norm": 0.6975535154342651, + "learning_rate": 9.283705482517231e-05, + "loss": 2.9471, + "step": 45460 + }, + { + "epoch": 2.23, + "grad_norm": 0.6530376076698303, + "learning_rate": 9.282591916898216e-05, + "loss": 2.8358, + "step": 45461 + }, + { + "epoch": 2.23, + "grad_norm": 0.7379580140113831, + "learning_rate": 9.281478405844866e-05, + "loss": 2.7979, + "step": 45462 + }, + { + "epoch": 2.23, + "grad_norm": 0.7067075967788696, + "learning_rate": 9.280364949360125e-05, + "loss": 2.9221, + "step": 45463 + }, + { + "epoch": 2.23, + "grad_norm": 0.7226753234863281, + "learning_rate": 9.279251547446906e-05, + "loss": 2.8154, + "step": 45464 + }, + { + "epoch": 2.23, + "grad_norm": 0.72322678565979, + "learning_rate": 9.278138200108161e-05, + "loss": 2.9727, + "step": 45465 + }, + { + "epoch": 2.23, + "grad_norm": 0.6826564073562622, + "learning_rate": 9.277024907346812e-05, + "loss": 2.9562, + "step": 45466 + }, + { + "epoch": 2.23, + "grad_norm": 0.7021982669830322, + "learning_rate": 9.27591166916578e-05, + "loss": 2.8354, + "step": 45467 + }, + { + "epoch": 2.23, + "grad_norm": 0.7087491154670715, + "learning_rate": 9.274798485568019e-05, + "loss": 2.7002, + "step": 45468 + }, + { + "epoch": 2.23, + "grad_norm": 0.7043088674545288, + "learning_rate": 9.273685356556437e-05, + "loss": 2.8718, + "step": 45469 + }, + { + "epoch": 2.23, + "grad_norm": 0.7019631266593933, + "learning_rate": 9.272572282133992e-05, + "loss": 2.7755, + "step": 45470 + }, + { + "epoch": 2.23, + "grad_norm": 0.6537641882896423, + "learning_rate": 9.271459262303594e-05, + "loss": 3.0271, + "step": 45471 + }, + { + "epoch": 2.23, + "grad_norm": 0.7400332689285278, + "learning_rate": 9.270346297068192e-05, + "loss": 2.8694, + "step": 45472 + }, + { + "epoch": 2.23, + "grad_norm": 0.7204238772392273, + "learning_rate": 9.269233386430705e-05, + "loss": 2.9346, + "step": 45473 + }, + { + "epoch": 2.23, + "grad_norm": 0.6749469637870789, + "learning_rate": 9.26812053039406e-05, + "loss": 2.7155, + "step": 45474 + }, + { + "epoch": 2.23, + "grad_norm": 0.6849311590194702, + "learning_rate": 9.267007728961205e-05, + "loss": 2.9678, + "step": 45475 + }, + { + "epoch": 2.23, + "grad_norm": 0.6702030301094055, + "learning_rate": 9.265894982135054e-05, + "loss": 2.946, + "step": 45476 + }, + { + "epoch": 2.23, + "grad_norm": 0.7221550345420837, + "learning_rate": 9.264782289918542e-05, + "loss": 3.0065, + "step": 45477 + }, + { + "epoch": 2.23, + "grad_norm": 0.6980705857276917, + "learning_rate": 9.263669652314615e-05, + "loss": 3.0259, + "step": 45478 + }, + { + "epoch": 2.23, + "grad_norm": 0.7406960725784302, + "learning_rate": 9.26255706932619e-05, + "loss": 2.8508, + "step": 45479 + }, + { + "epoch": 2.23, + "grad_norm": 0.7291260957717896, + "learning_rate": 9.261444540956201e-05, + "loss": 2.8585, + "step": 45480 + }, + { + "epoch": 2.23, + "grad_norm": 0.7162569165229797, + "learning_rate": 9.260332067207563e-05, + "loss": 3.0759, + "step": 45481 + }, + { + "epoch": 2.23, + "grad_norm": 0.6985812783241272, + "learning_rate": 9.25921964808322e-05, + "loss": 2.9614, + "step": 45482 + }, + { + "epoch": 2.23, + "grad_norm": 0.7387300133705139, + "learning_rate": 9.258107283586111e-05, + "loss": 2.8777, + "step": 45483 + }, + { + "epoch": 2.23, + "grad_norm": 0.7236722111701965, + "learning_rate": 9.256994973719148e-05, + "loss": 2.9205, + "step": 45484 + }, + { + "epoch": 2.23, + "grad_norm": 0.8212042450904846, + "learning_rate": 9.255882718485272e-05, + "loss": 2.83, + "step": 45485 + }, + { + "epoch": 2.23, + "grad_norm": 0.7353643774986267, + "learning_rate": 9.254770517887403e-05, + "loss": 2.8415, + "step": 45486 + }, + { + "epoch": 2.23, + "grad_norm": 0.6906113624572754, + "learning_rate": 9.253658371928487e-05, + "loss": 2.7953, + "step": 45487 + }, + { + "epoch": 2.23, + "grad_norm": 0.7319288849830627, + "learning_rate": 9.25254628061144e-05, + "loss": 3.0537, + "step": 45488 + }, + { + "epoch": 2.23, + "grad_norm": 0.6931775212287903, + "learning_rate": 9.251434243939183e-05, + "loss": 2.9983, + "step": 45489 + }, + { + "epoch": 2.23, + "grad_norm": 0.6803140640258789, + "learning_rate": 9.250322261914666e-05, + "loss": 2.9426, + "step": 45490 + }, + { + "epoch": 2.23, + "grad_norm": 0.7066940665245056, + "learning_rate": 9.249210334540798e-05, + "loss": 2.713, + "step": 45491 + }, + { + "epoch": 2.23, + "grad_norm": 0.6657136082649231, + "learning_rate": 9.248098461820513e-05, + "loss": 2.6789, + "step": 45492 + }, + { + "epoch": 2.23, + "grad_norm": 0.7160326242446899, + "learning_rate": 9.246986643756755e-05, + "loss": 2.6241, + "step": 45493 + }, + { + "epoch": 2.23, + "grad_norm": 0.7386571764945984, + "learning_rate": 9.245874880352441e-05, + "loss": 3.0217, + "step": 45494 + }, + { + "epoch": 2.23, + "grad_norm": 0.7494421601295471, + "learning_rate": 9.244763171610494e-05, + "loss": 2.9368, + "step": 45495 + }, + { + "epoch": 2.23, + "grad_norm": 0.7182496786117554, + "learning_rate": 9.24365151753384e-05, + "loss": 3.0068, + "step": 45496 + }, + { + "epoch": 2.23, + "grad_norm": 0.7699613571166992, + "learning_rate": 9.242539918125416e-05, + "loss": 2.7448, + "step": 45497 + }, + { + "epoch": 2.23, + "grad_norm": 0.6990972757339478, + "learning_rate": 9.241428373388152e-05, + "loss": 2.9623, + "step": 45498 + }, + { + "epoch": 2.23, + "grad_norm": 0.7023851275444031, + "learning_rate": 9.240316883324964e-05, + "loss": 2.8577, + "step": 45499 + }, + { + "epoch": 2.23, + "grad_norm": 0.710388720035553, + "learning_rate": 9.239205447938793e-05, + "loss": 2.885, + "step": 45500 + }, + { + "epoch": 2.23, + "grad_norm": 0.7227920889854431, + "learning_rate": 9.238094067232551e-05, + "loss": 2.8459, + "step": 45501 + }, + { + "epoch": 2.23, + "grad_norm": 0.7624646425247192, + "learning_rate": 9.236982741209191e-05, + "loss": 2.9227, + "step": 45502 + }, + { + "epoch": 2.23, + "grad_norm": 0.7066624164581299, + "learning_rate": 9.235871469871607e-05, + "loss": 3.1932, + "step": 45503 + }, + { + "epoch": 2.23, + "grad_norm": 0.7189279198646545, + "learning_rate": 9.23476025322274e-05, + "loss": 2.8258, + "step": 45504 + }, + { + "epoch": 2.23, + "grad_norm": 0.6847929954528809, + "learning_rate": 9.233649091265529e-05, + "loss": 2.6352, + "step": 45505 + }, + { + "epoch": 2.23, + "grad_norm": 0.7308363318443298, + "learning_rate": 9.232537984002878e-05, + "loss": 3.0242, + "step": 45506 + }, + { + "epoch": 2.23, + "grad_norm": 0.7426416873931885, + "learning_rate": 9.231426931437734e-05, + "loss": 2.9488, + "step": 45507 + }, + { + "epoch": 2.23, + "grad_norm": 0.6900115609169006, + "learning_rate": 9.230315933573005e-05, + "loss": 2.8308, + "step": 45508 + }, + { + "epoch": 2.23, + "grad_norm": 0.7146220207214355, + "learning_rate": 9.229204990411637e-05, + "loss": 2.9156, + "step": 45509 + }, + { + "epoch": 2.23, + "grad_norm": 0.733957052230835, + "learning_rate": 9.228094101956546e-05, + "loss": 2.8853, + "step": 45510 + }, + { + "epoch": 2.23, + "grad_norm": 0.7220903635025024, + "learning_rate": 9.226983268210649e-05, + "loss": 2.9089, + "step": 45511 + }, + { + "epoch": 2.23, + "grad_norm": 0.7509836554527283, + "learning_rate": 9.225872489176888e-05, + "loss": 2.9585, + "step": 45512 + }, + { + "epoch": 2.23, + "grad_norm": 0.7305395603179932, + "learning_rate": 9.224761764858172e-05, + "loss": 3.0472, + "step": 45513 + }, + { + "epoch": 2.23, + "grad_norm": 0.7344761490821838, + "learning_rate": 9.223651095257435e-05, + "loss": 2.9322, + "step": 45514 + }, + { + "epoch": 2.23, + "grad_norm": 0.693290650844574, + "learning_rate": 9.222540480377613e-05, + "loss": 2.8171, + "step": 45515 + }, + { + "epoch": 2.23, + "grad_norm": 0.6867750287055969, + "learning_rate": 9.22142992022162e-05, + "loss": 3.0071, + "step": 45516 + }, + { + "epoch": 2.23, + "grad_norm": 0.7052892446517944, + "learning_rate": 9.220319414792382e-05, + "loss": 2.9504, + "step": 45517 + }, + { + "epoch": 2.23, + "grad_norm": 0.7113419771194458, + "learning_rate": 9.219208964092812e-05, + "loss": 2.9487, + "step": 45518 + }, + { + "epoch": 2.23, + "grad_norm": 0.7192926406860352, + "learning_rate": 9.218098568125849e-05, + "loss": 2.9342, + "step": 45519 + }, + { + "epoch": 2.23, + "grad_norm": 0.7447081208229065, + "learning_rate": 9.216988226894423e-05, + "loss": 2.9044, + "step": 45520 + }, + { + "epoch": 2.23, + "grad_norm": 0.7151831388473511, + "learning_rate": 9.215877940401441e-05, + "loss": 3.0041, + "step": 45521 + }, + { + "epoch": 2.23, + "grad_norm": 0.7142324447631836, + "learning_rate": 9.214767708649845e-05, + "loss": 2.8919, + "step": 45522 + }, + { + "epoch": 2.23, + "grad_norm": 0.7118008732795715, + "learning_rate": 9.213657531642541e-05, + "loss": 2.7913, + "step": 45523 + }, + { + "epoch": 2.23, + "grad_norm": 0.7367550730705261, + "learning_rate": 9.212547409382473e-05, + "loss": 3.1136, + "step": 45524 + }, + { + "epoch": 2.23, + "grad_norm": 0.7201369404792786, + "learning_rate": 9.211437341872557e-05, + "loss": 2.8379, + "step": 45525 + }, + { + "epoch": 2.23, + "grad_norm": 0.7369980216026306, + "learning_rate": 9.2103273291157e-05, + "loss": 2.9005, + "step": 45526 + }, + { + "epoch": 2.23, + "grad_norm": 0.7533488869667053, + "learning_rate": 9.209217371114852e-05, + "loss": 3.1198, + "step": 45527 + }, + { + "epoch": 2.23, + "grad_norm": 0.6949545741081238, + "learning_rate": 9.208107467872914e-05, + "loss": 2.8966, + "step": 45528 + }, + { + "epoch": 2.23, + "grad_norm": 0.7416513562202454, + "learning_rate": 9.20699761939282e-05, + "loss": 2.7839, + "step": 45529 + }, + { + "epoch": 2.23, + "grad_norm": 0.6859050989151001, + "learning_rate": 9.205887825677504e-05, + "loss": 2.9379, + "step": 45530 + }, + { + "epoch": 2.23, + "grad_norm": 0.7521802186965942, + "learning_rate": 9.204778086729875e-05, + "loss": 3.0114, + "step": 45531 + }, + { + "epoch": 2.23, + "grad_norm": 0.6823815703392029, + "learning_rate": 9.203668402552857e-05, + "loss": 2.9262, + "step": 45532 + }, + { + "epoch": 2.23, + "grad_norm": 0.7390176057815552, + "learning_rate": 9.202558773149368e-05, + "loss": 2.6721, + "step": 45533 + }, + { + "epoch": 2.23, + "grad_norm": 0.7297099232673645, + "learning_rate": 9.201449198522334e-05, + "loss": 2.9628, + "step": 45534 + }, + { + "epoch": 2.23, + "grad_norm": 0.7210890054702759, + "learning_rate": 9.20033967867469e-05, + "loss": 2.8604, + "step": 45535 + }, + { + "epoch": 2.23, + "grad_norm": 0.7243456244468689, + "learning_rate": 9.199230213609342e-05, + "loss": 2.7529, + "step": 45536 + }, + { + "epoch": 2.23, + "grad_norm": 0.7083402872085571, + "learning_rate": 9.198120803329225e-05, + "loss": 3.0748, + "step": 45537 + }, + { + "epoch": 2.23, + "grad_norm": 0.7190839648246765, + "learning_rate": 9.197011447837253e-05, + "loss": 2.7782, + "step": 45538 + }, + { + "epoch": 2.23, + "grad_norm": 0.7521587014198303, + "learning_rate": 9.19590214713634e-05, + "loss": 3.3249, + "step": 45539 + }, + { + "epoch": 2.23, + "grad_norm": 0.7044076323509216, + "learning_rate": 9.194792901229429e-05, + "loss": 2.9068, + "step": 45540 + }, + { + "epoch": 2.23, + "grad_norm": 0.7235004901885986, + "learning_rate": 9.19368371011942e-05, + "loss": 3.2096, + "step": 45541 + }, + { + "epoch": 2.23, + "grad_norm": 0.728836715221405, + "learning_rate": 9.19257457380925e-05, + "loss": 2.7914, + "step": 45542 + }, + { + "epoch": 2.23, + "grad_norm": 0.7259882688522339, + "learning_rate": 9.191465492301827e-05, + "loss": 2.8201, + "step": 45543 + }, + { + "epoch": 2.23, + "grad_norm": 0.6856926083564758, + "learning_rate": 9.190356465600081e-05, + "loss": 3.0713, + "step": 45544 + }, + { + "epoch": 2.23, + "grad_norm": 0.7389464974403381, + "learning_rate": 9.189247493706939e-05, + "loss": 3.0653, + "step": 45545 + }, + { + "epoch": 2.23, + "grad_norm": 0.7572897672653198, + "learning_rate": 9.188138576625314e-05, + "loss": 3.0166, + "step": 45546 + }, + { + "epoch": 2.23, + "grad_norm": 0.7614259123802185, + "learning_rate": 9.187029714358124e-05, + "loss": 2.9215, + "step": 45547 + }, + { + "epoch": 2.23, + "grad_norm": 0.8179017305374146, + "learning_rate": 9.185920906908288e-05, + "loss": 2.6157, + "step": 45548 + }, + { + "epoch": 2.23, + "grad_norm": 0.7034313082695007, + "learning_rate": 9.184812154278726e-05, + "loss": 3.0541, + "step": 45549 + }, + { + "epoch": 2.23, + "grad_norm": 0.7329689264297485, + "learning_rate": 9.183703456472373e-05, + "loss": 3.0281, + "step": 45550 + }, + { + "epoch": 2.23, + "grad_norm": 0.7217639088630676, + "learning_rate": 9.18259481349213e-05, + "loss": 2.6217, + "step": 45551 + }, + { + "epoch": 2.23, + "grad_norm": 0.7059627771377563, + "learning_rate": 9.181486225340938e-05, + "loss": 2.8865, + "step": 45552 + }, + { + "epoch": 2.23, + "grad_norm": 0.726000964641571, + "learning_rate": 9.180377692021702e-05, + "loss": 2.8841, + "step": 45553 + }, + { + "epoch": 2.23, + "grad_norm": 0.7023661136627197, + "learning_rate": 9.179269213537334e-05, + "loss": 3.087, + "step": 45554 + }, + { + "epoch": 2.23, + "grad_norm": 0.736512303352356, + "learning_rate": 9.178160789890776e-05, + "loss": 2.9646, + "step": 45555 + }, + { + "epoch": 2.23, + "grad_norm": 0.7114843130111694, + "learning_rate": 9.177052421084923e-05, + "loss": 2.8579, + "step": 45556 + }, + { + "epoch": 2.23, + "grad_norm": 0.7629355192184448, + "learning_rate": 9.175944107122718e-05, + "loss": 2.6514, + "step": 45557 + }, + { + "epoch": 2.23, + "grad_norm": 0.7430128455162048, + "learning_rate": 9.17483584800706e-05, + "loss": 2.9601, + "step": 45558 + }, + { + "epoch": 2.23, + "grad_norm": 0.6909064650535583, + "learning_rate": 9.173727643740882e-05, + "loss": 2.8679, + "step": 45559 + }, + { + "epoch": 2.23, + "grad_norm": 0.6793957948684692, + "learning_rate": 9.172619494327101e-05, + "loss": 2.9662, + "step": 45560 + }, + { + "epoch": 2.23, + "grad_norm": 0.6956231594085693, + "learning_rate": 9.171511399768621e-05, + "loss": 2.8303, + "step": 45561 + }, + { + "epoch": 2.23, + "grad_norm": 0.7205344438552856, + "learning_rate": 9.170403360068381e-05, + "loss": 2.8042, + "step": 45562 + }, + { + "epoch": 2.23, + "grad_norm": 0.7068600058555603, + "learning_rate": 9.16929537522928e-05, + "loss": 2.7959, + "step": 45563 + }, + { + "epoch": 2.23, + "grad_norm": 0.7382459044456482, + "learning_rate": 9.168187445254247e-05, + "loss": 3.0018, + "step": 45564 + }, + { + "epoch": 2.23, + "grad_norm": 0.7958078980445862, + "learning_rate": 9.167079570146207e-05, + "loss": 2.7408, + "step": 45565 + }, + { + "epoch": 2.23, + "grad_norm": 0.7165841460227966, + "learning_rate": 9.165971749908063e-05, + "loss": 2.958, + "step": 45566 + }, + { + "epoch": 2.23, + "grad_norm": 0.7124565839767456, + "learning_rate": 9.164863984542746e-05, + "loss": 2.9686, + "step": 45567 + }, + { + "epoch": 2.23, + "grad_norm": 0.7308372855186462, + "learning_rate": 9.163756274053171e-05, + "loss": 2.7848, + "step": 45568 + }, + { + "epoch": 2.23, + "grad_norm": 0.6912724375724792, + "learning_rate": 9.162648618442239e-05, + "loss": 2.6554, + "step": 45569 + }, + { + "epoch": 2.23, + "grad_norm": 0.7064422965049744, + "learning_rate": 9.161541017712893e-05, + "loss": 2.9808, + "step": 45570 + }, + { + "epoch": 2.23, + "grad_norm": 0.7386999130249023, + "learning_rate": 9.160433471868026e-05, + "loss": 2.8824, + "step": 45571 + }, + { + "epoch": 2.23, + "grad_norm": 0.7071724534034729, + "learning_rate": 9.159325980910578e-05, + "loss": 3.0607, + "step": 45572 + }, + { + "epoch": 2.23, + "grad_norm": 0.734996497631073, + "learning_rate": 9.158218544843442e-05, + "loss": 2.6605, + "step": 45573 + }, + { + "epoch": 2.23, + "grad_norm": 0.7159255743026733, + "learning_rate": 9.15711116366956e-05, + "loss": 3.0333, + "step": 45574 + }, + { + "epoch": 2.23, + "grad_norm": 0.6946538686752319, + "learning_rate": 9.156003837391834e-05, + "loss": 3.0226, + "step": 45575 + }, + { + "epoch": 2.23, + "grad_norm": 0.678464949131012, + "learning_rate": 9.154896566013172e-05, + "loss": 2.7583, + "step": 45576 + }, + { + "epoch": 2.23, + "grad_norm": 0.7494169473648071, + "learning_rate": 9.153789349536511e-05, + "loss": 3.1025, + "step": 45577 + }, + { + "epoch": 2.23, + "grad_norm": 0.7178239822387695, + "learning_rate": 9.152682187964749e-05, + "loss": 3.0465, + "step": 45578 + }, + { + "epoch": 2.23, + "grad_norm": 0.7179162502288818, + "learning_rate": 9.151575081300807e-05, + "loss": 2.8249, + "step": 45579 + }, + { + "epoch": 2.23, + "grad_norm": 0.6873793005943298, + "learning_rate": 9.150468029547616e-05, + "loss": 3.0525, + "step": 45580 + }, + { + "epoch": 2.23, + "grad_norm": 0.7262808680534363, + "learning_rate": 9.14936103270808e-05, + "loss": 2.9266, + "step": 45581 + }, + { + "epoch": 2.23, + "grad_norm": 0.6978574991226196, + "learning_rate": 9.148254090785113e-05, + "loss": 2.8802, + "step": 45582 + }, + { + "epoch": 2.23, + "grad_norm": 0.7047426104545593, + "learning_rate": 9.147147203781622e-05, + "loss": 2.6446, + "step": 45583 + }, + { + "epoch": 2.23, + "grad_norm": 0.7362146377563477, + "learning_rate": 9.14604037170054e-05, + "loss": 2.7547, + "step": 45584 + }, + { + "epoch": 2.23, + "grad_norm": 0.6840211153030396, + "learning_rate": 9.14493359454477e-05, + "loss": 3.0048, + "step": 45585 + }, + { + "epoch": 2.23, + "grad_norm": 0.727241575717926, + "learning_rate": 9.143826872317224e-05, + "loss": 2.8566, + "step": 45586 + }, + { + "epoch": 2.23, + "grad_norm": 0.6769251823425293, + "learning_rate": 9.14272020502084e-05, + "loss": 2.8878, + "step": 45587 + }, + { + "epoch": 2.23, + "grad_norm": 0.7173831462860107, + "learning_rate": 9.141613592658504e-05, + "loss": 2.9605, + "step": 45588 + }, + { + "epoch": 2.23, + "grad_norm": 0.7770647406578064, + "learning_rate": 9.140507035233154e-05, + "loss": 2.8595, + "step": 45589 + }, + { + "epoch": 2.23, + "grad_norm": 0.7519656419754028, + "learning_rate": 9.139400532747694e-05, + "loss": 2.6777, + "step": 45590 + }, + { + "epoch": 2.23, + "grad_norm": 0.7570061087608337, + "learning_rate": 9.138294085205031e-05, + "loss": 2.9145, + "step": 45591 + }, + { + "epoch": 2.23, + "grad_norm": 0.7038795351982117, + "learning_rate": 9.137187692608091e-05, + "loss": 2.9259, + "step": 45592 + }, + { + "epoch": 2.23, + "grad_norm": 0.7403584718704224, + "learning_rate": 9.136081354959777e-05, + "loss": 3.0187, + "step": 45593 + }, + { + "epoch": 2.23, + "grad_norm": 0.6944155693054199, + "learning_rate": 9.134975072263022e-05, + "loss": 2.928, + "step": 45594 + }, + { + "epoch": 2.23, + "grad_norm": 0.7204017043113708, + "learning_rate": 9.133868844520712e-05, + "loss": 2.8462, + "step": 45595 + }, + { + "epoch": 2.23, + "grad_norm": 0.7419741153717041, + "learning_rate": 9.132762671735789e-05, + "loss": 3.0493, + "step": 45596 + }, + { + "epoch": 2.23, + "grad_norm": 0.6750164031982422, + "learning_rate": 9.13165655391115e-05, + "loss": 2.957, + "step": 45597 + }, + { + "epoch": 2.23, + "grad_norm": 0.7366441488265991, + "learning_rate": 9.130550491049705e-05, + "loss": 2.8784, + "step": 45598 + }, + { + "epoch": 2.23, + "grad_norm": 0.8148265480995178, + "learning_rate": 9.129444483154379e-05, + "loss": 3.0984, + "step": 45599 + }, + { + "epoch": 2.23, + "grad_norm": 0.7063391804695129, + "learning_rate": 9.128338530228074e-05, + "loss": 2.8162, + "step": 45600 + }, + { + "epoch": 2.23, + "grad_norm": 0.7022308707237244, + "learning_rate": 9.127232632273708e-05, + "loss": 2.8945, + "step": 45601 + }, + { + "epoch": 2.23, + "grad_norm": 0.6840649247169495, + "learning_rate": 9.126126789294203e-05, + "loss": 2.8795, + "step": 45602 + }, + { + "epoch": 2.23, + "grad_norm": 0.7260749340057373, + "learning_rate": 9.125021001292462e-05, + "loss": 2.9185, + "step": 45603 + }, + { + "epoch": 2.23, + "grad_norm": 0.7081303596496582, + "learning_rate": 9.1239152682714e-05, + "loss": 3.0542, + "step": 45604 + }, + { + "epoch": 2.23, + "grad_norm": 0.6989736557006836, + "learning_rate": 9.122809590233916e-05, + "loss": 3.1102, + "step": 45605 + }, + { + "epoch": 2.24, + "grad_norm": 0.6952615976333618, + "learning_rate": 9.121703967182934e-05, + "loss": 2.8599, + "step": 45606 + }, + { + "epoch": 2.24, + "grad_norm": 0.714263916015625, + "learning_rate": 9.120598399121375e-05, + "loss": 2.8885, + "step": 45607 + }, + { + "epoch": 2.24, + "grad_norm": 0.7635734677314758, + "learning_rate": 9.119492886052133e-05, + "loss": 2.9962, + "step": 45608 + }, + { + "epoch": 2.24, + "grad_norm": 0.690173864364624, + "learning_rate": 9.118387427978137e-05, + "loss": 2.9021, + "step": 45609 + }, + { + "epoch": 2.24, + "grad_norm": 0.6887124180793762, + "learning_rate": 9.117282024902282e-05, + "loss": 2.9197, + "step": 45610 + }, + { + "epoch": 2.24, + "grad_norm": 0.6885898113250732, + "learning_rate": 9.116176676827496e-05, + "loss": 2.9658, + "step": 45611 + }, + { + "epoch": 2.24, + "grad_norm": 0.6705561280250549, + "learning_rate": 9.115071383756681e-05, + "loss": 3.082, + "step": 45612 + }, + { + "epoch": 2.24, + "grad_norm": 0.6776615977287292, + "learning_rate": 9.113966145692737e-05, + "loss": 3.0067, + "step": 45613 + }, + { + "epoch": 2.24, + "grad_norm": 0.7082581520080566, + "learning_rate": 9.112860962638598e-05, + "loss": 3.1007, + "step": 45614 + }, + { + "epoch": 2.24, + "grad_norm": 0.7481296062469482, + "learning_rate": 9.111755834597155e-05, + "loss": 2.9351, + "step": 45615 + }, + { + "epoch": 2.24, + "grad_norm": 0.6857460737228394, + "learning_rate": 9.110650761571325e-05, + "loss": 2.7641, + "step": 45616 + }, + { + "epoch": 2.24, + "grad_norm": 0.7014007568359375, + "learning_rate": 9.109545743564032e-05, + "loss": 2.897, + "step": 45617 + }, + { + "epoch": 2.24, + "grad_norm": 0.7194677591323853, + "learning_rate": 9.108440780578172e-05, + "loss": 2.8158, + "step": 45618 + }, + { + "epoch": 2.24, + "grad_norm": 0.7104588150978088, + "learning_rate": 9.107335872616661e-05, + "loss": 2.8991, + "step": 45619 + }, + { + "epoch": 2.24, + "grad_norm": 0.7148900628089905, + "learning_rate": 9.106231019682396e-05, + "loss": 2.6667, + "step": 45620 + }, + { + "epoch": 2.24, + "grad_norm": 0.7613111734390259, + "learning_rate": 9.105126221778299e-05, + "loss": 2.9804, + "step": 45621 + }, + { + "epoch": 2.24, + "grad_norm": 0.7180856466293335, + "learning_rate": 9.104021478907289e-05, + "loss": 3.0473, + "step": 45622 + }, + { + "epoch": 2.24, + "grad_norm": 0.7277432680130005, + "learning_rate": 9.102916791072252e-05, + "loss": 3.0207, + "step": 45623 + }, + { + "epoch": 2.24, + "grad_norm": 0.6842195987701416, + "learning_rate": 9.101812158276121e-05, + "loss": 2.8484, + "step": 45624 + }, + { + "epoch": 2.24, + "grad_norm": 0.719687819480896, + "learning_rate": 9.100707580521785e-05, + "loss": 3.1122, + "step": 45625 + }, + { + "epoch": 2.24, + "grad_norm": 0.7382856607437134, + "learning_rate": 9.099603057812173e-05, + "loss": 2.9134, + "step": 45626 + }, + { + "epoch": 2.24, + "grad_norm": 0.7554395198822021, + "learning_rate": 9.098498590150183e-05, + "loss": 2.8051, + "step": 45627 + }, + { + "epoch": 2.24, + "grad_norm": 0.7509669661521912, + "learning_rate": 9.097394177538717e-05, + "loss": 2.9297, + "step": 45628 + }, + { + "epoch": 2.24, + "grad_norm": 1.0624964237213135, + "learning_rate": 9.096289819980701e-05, + "loss": 2.9979, + "step": 45629 + }, + { + "epoch": 2.24, + "grad_norm": 0.6812724471092224, + "learning_rate": 9.095185517479025e-05, + "loss": 2.8224, + "step": 45630 + }, + { + "epoch": 2.24, + "grad_norm": 0.7406055927276611, + "learning_rate": 9.094081270036608e-05, + "loss": 2.8994, + "step": 45631 + }, + { + "epoch": 2.24, + "grad_norm": 0.7246912717819214, + "learning_rate": 9.092977077656366e-05, + "loss": 2.7739, + "step": 45632 + }, + { + "epoch": 2.24, + "grad_norm": 0.7108069658279419, + "learning_rate": 9.091872940341199e-05, + "loss": 2.8253, + "step": 45633 + }, + { + "epoch": 2.24, + "grad_norm": 0.7075560688972473, + "learning_rate": 9.090768858094015e-05, + "loss": 3.2586, + "step": 45634 + }, + { + "epoch": 2.24, + "grad_norm": 0.7471787333488464, + "learning_rate": 9.089664830917713e-05, + "loss": 2.915, + "step": 45635 + }, + { + "epoch": 2.24, + "grad_norm": 0.6982598304748535, + "learning_rate": 9.088560858815208e-05, + "loss": 2.8928, + "step": 45636 + }, + { + "epoch": 2.24, + "grad_norm": 0.6782969236373901, + "learning_rate": 9.087456941789421e-05, + "loss": 2.8622, + "step": 45637 + }, + { + "epoch": 2.24, + "grad_norm": 0.7105681896209717, + "learning_rate": 9.086353079843238e-05, + "loss": 2.9933, + "step": 45638 + }, + { + "epoch": 2.24, + "grad_norm": 0.719509482383728, + "learning_rate": 9.085249272979582e-05, + "loss": 2.9826, + "step": 45639 + }, + { + "epoch": 2.24, + "grad_norm": 0.7250756025314331, + "learning_rate": 9.084145521201357e-05, + "loss": 2.8474, + "step": 45640 + }, + { + "epoch": 2.24, + "grad_norm": 0.7311588525772095, + "learning_rate": 9.083041824511455e-05, + "loss": 2.9898, + "step": 45641 + }, + { + "epoch": 2.24, + "grad_norm": 0.7223825454711914, + "learning_rate": 9.081938182912809e-05, + "loss": 2.9584, + "step": 45642 + }, + { + "epoch": 2.24, + "grad_norm": 0.7354487180709839, + "learning_rate": 9.080834596408301e-05, + "loss": 2.9169, + "step": 45643 + }, + { + "epoch": 2.24, + "grad_norm": 0.7054993510246277, + "learning_rate": 9.079731065000858e-05, + "loss": 2.7169, + "step": 45644 + }, + { + "epoch": 2.24, + "grad_norm": 0.7050856351852417, + "learning_rate": 9.07862758869337e-05, + "loss": 2.8571, + "step": 45645 + }, + { + "epoch": 2.24, + "grad_norm": 0.7472595572471619, + "learning_rate": 9.077524167488746e-05, + "loss": 3.0456, + "step": 45646 + }, + { + "epoch": 2.24, + "grad_norm": 0.7568950653076172, + "learning_rate": 9.076420801389909e-05, + "loss": 2.7929, + "step": 45647 + }, + { + "epoch": 2.24, + "grad_norm": 0.6783512830734253, + "learning_rate": 9.075317490399752e-05, + "loss": 2.9669, + "step": 45648 + }, + { + "epoch": 2.24, + "grad_norm": 0.706605851650238, + "learning_rate": 9.074214234521182e-05, + "loss": 2.7641, + "step": 45649 + }, + { + "epoch": 2.24, + "grad_norm": 0.6669334173202515, + "learning_rate": 9.073111033757094e-05, + "loss": 2.7432, + "step": 45650 + }, + { + "epoch": 2.24, + "grad_norm": 0.7224730253219604, + "learning_rate": 9.072007888110404e-05, + "loss": 2.7118, + "step": 45651 + }, + { + "epoch": 2.24, + "grad_norm": 0.7125197052955627, + "learning_rate": 9.070904797584029e-05, + "loss": 2.9566, + "step": 45652 + }, + { + "epoch": 2.24, + "grad_norm": 0.7195077538490295, + "learning_rate": 9.069801762180853e-05, + "loss": 2.6613, + "step": 45653 + }, + { + "epoch": 2.24, + "grad_norm": 0.7205254435539246, + "learning_rate": 9.0686987819038e-05, + "loss": 3.0001, + "step": 45654 + }, + { + "epoch": 2.24, + "grad_norm": 0.7503374218940735, + "learning_rate": 9.067595856755765e-05, + "loss": 2.7024, + "step": 45655 + }, + { + "epoch": 2.24, + "grad_norm": 0.6984966397285461, + "learning_rate": 9.066492986739647e-05, + "loss": 2.7526, + "step": 45656 + }, + { + "epoch": 2.24, + "grad_norm": 0.6726536154747009, + "learning_rate": 9.065390171858366e-05, + "loss": 2.9738, + "step": 45657 + }, + { + "epoch": 2.24, + "grad_norm": 0.6979069113731384, + "learning_rate": 9.064287412114808e-05, + "loss": 2.9208, + "step": 45658 + }, + { + "epoch": 2.24, + "grad_norm": 0.695800244808197, + "learning_rate": 9.063184707511899e-05, + "loss": 2.9246, + "step": 45659 + }, + { + "epoch": 2.24, + "grad_norm": 0.7155253887176514, + "learning_rate": 9.06208205805252e-05, + "loss": 2.8638, + "step": 45660 + }, + { + "epoch": 2.24, + "grad_norm": 0.6801446676254272, + "learning_rate": 9.060979463739598e-05, + "loss": 2.9644, + "step": 45661 + }, + { + "epoch": 2.24, + "grad_norm": 0.6838302612304688, + "learning_rate": 9.059876924576029e-05, + "loss": 2.9361, + "step": 45662 + }, + { + "epoch": 2.24, + "grad_norm": 0.7186485528945923, + "learning_rate": 9.0587744405647e-05, + "loss": 2.8708, + "step": 45663 + }, + { + "epoch": 2.24, + "grad_norm": 0.664668083190918, + "learning_rate": 9.057672011708541e-05, + "loss": 2.7418, + "step": 45664 + }, + { + "epoch": 2.24, + "grad_norm": 0.7575511336326599, + "learning_rate": 9.056569638010436e-05, + "loss": 2.9639, + "step": 45665 + }, + { + "epoch": 2.24, + "grad_norm": 0.7134124040603638, + "learning_rate": 9.055467319473303e-05, + "loss": 2.6629, + "step": 45666 + }, + { + "epoch": 2.24, + "grad_norm": 0.7106820344924927, + "learning_rate": 9.054365056100027e-05, + "loss": 2.8322, + "step": 45667 + }, + { + "epoch": 2.24, + "grad_norm": 0.6775752902030945, + "learning_rate": 9.053262847893526e-05, + "loss": 2.8653, + "step": 45668 + }, + { + "epoch": 2.24, + "grad_norm": 0.7096874713897705, + "learning_rate": 9.052160694856706e-05, + "loss": 3.0051, + "step": 45669 + }, + { + "epoch": 2.24, + "grad_norm": 0.7215758562088013, + "learning_rate": 9.051058596992463e-05, + "loss": 2.9697, + "step": 45670 + }, + { + "epoch": 2.24, + "grad_norm": 0.7285184860229492, + "learning_rate": 9.049956554303702e-05, + "loss": 2.8607, + "step": 45671 + }, + { + "epoch": 2.24, + "grad_norm": 0.7512555122375488, + "learning_rate": 9.048854566793314e-05, + "loss": 3.0024, + "step": 45672 + }, + { + "epoch": 2.24, + "grad_norm": 0.7152148485183716, + "learning_rate": 9.04775263446421e-05, + "loss": 3.0288, + "step": 45673 + }, + { + "epoch": 2.24, + "grad_norm": 0.7303699851036072, + "learning_rate": 9.046650757319304e-05, + "loss": 2.9171, + "step": 45674 + }, + { + "epoch": 2.24, + "grad_norm": 0.6994890570640564, + "learning_rate": 9.045548935361475e-05, + "loss": 2.7558, + "step": 45675 + }, + { + "epoch": 2.24, + "grad_norm": 0.7585562467575073, + "learning_rate": 9.044447168593649e-05, + "loss": 3.004, + "step": 45676 + }, + { + "epoch": 2.24, + "grad_norm": 0.7422113418579102, + "learning_rate": 9.043345457018718e-05, + "loss": 2.9295, + "step": 45677 + }, + { + "epoch": 2.24, + "grad_norm": 0.7175257802009583, + "learning_rate": 9.042243800639569e-05, + "loss": 2.9137, + "step": 45678 + }, + { + "epoch": 2.24, + "grad_norm": 0.7064406275749207, + "learning_rate": 9.041142199459127e-05, + "loss": 2.9965, + "step": 45679 + }, + { + "epoch": 2.24, + "grad_norm": 0.7073319554328918, + "learning_rate": 9.040040653480272e-05, + "loss": 2.8982, + "step": 45680 + }, + { + "epoch": 2.24, + "grad_norm": 0.7129002213478088, + "learning_rate": 9.038939162705926e-05, + "loss": 2.8788, + "step": 45681 + }, + { + "epoch": 2.24, + "grad_norm": 0.7113178968429565, + "learning_rate": 9.037837727138974e-05, + "loss": 3.0197, + "step": 45682 + }, + { + "epoch": 2.24, + "grad_norm": 0.7069679498672485, + "learning_rate": 9.036736346782331e-05, + "loss": 2.7209, + "step": 45683 + }, + { + "epoch": 2.24, + "grad_norm": 0.7247236371040344, + "learning_rate": 9.035635021638889e-05, + "loss": 2.9684, + "step": 45684 + }, + { + "epoch": 2.24, + "grad_norm": 0.685911238193512, + "learning_rate": 9.034533751711539e-05, + "loss": 3.061, + "step": 45685 + }, + { + "epoch": 2.24, + "grad_norm": 0.7022624611854553, + "learning_rate": 9.033432537003204e-05, + "loss": 2.9441, + "step": 45686 + }, + { + "epoch": 2.24, + "grad_norm": 0.7100083827972412, + "learning_rate": 9.032331377516764e-05, + "loss": 2.7339, + "step": 45687 + }, + { + "epoch": 2.24, + "grad_norm": 0.7229495048522949, + "learning_rate": 9.031230273255124e-05, + "loss": 2.8123, + "step": 45688 + }, + { + "epoch": 2.24, + "grad_norm": 0.7110562920570374, + "learning_rate": 9.030129224221201e-05, + "loss": 2.9297, + "step": 45689 + }, + { + "epoch": 2.24, + "grad_norm": 0.6981791257858276, + "learning_rate": 9.029028230417871e-05, + "loss": 3.0289, + "step": 45690 + }, + { + "epoch": 2.24, + "grad_norm": 0.7141596674919128, + "learning_rate": 9.027927291848055e-05, + "loss": 2.9254, + "step": 45691 + }, + { + "epoch": 2.24, + "grad_norm": 0.7503494620323181, + "learning_rate": 9.026826408514641e-05, + "loss": 2.9531, + "step": 45692 + }, + { + "epoch": 2.24, + "grad_norm": 0.729244589805603, + "learning_rate": 9.025725580420521e-05, + "loss": 2.9759, + "step": 45693 + }, + { + "epoch": 2.24, + "grad_norm": 0.7077925801277161, + "learning_rate": 9.024624807568615e-05, + "loss": 2.9944, + "step": 45694 + }, + { + "epoch": 2.24, + "grad_norm": 0.7241457104682922, + "learning_rate": 9.023524089961797e-05, + "loss": 2.8535, + "step": 45695 + }, + { + "epoch": 2.24, + "grad_norm": 0.7095908522605896, + "learning_rate": 9.022423427602991e-05, + "loss": 2.9944, + "step": 45696 + }, + { + "epoch": 2.24, + "grad_norm": 0.717788815498352, + "learning_rate": 9.021322820495074e-05, + "loss": 2.9462, + "step": 45697 + }, + { + "epoch": 2.24, + "grad_norm": 0.6879000663757324, + "learning_rate": 9.020222268640968e-05, + "loss": 2.8645, + "step": 45698 + }, + { + "epoch": 2.24, + "grad_norm": 0.7336751222610474, + "learning_rate": 9.019121772043554e-05, + "loss": 2.8715, + "step": 45699 + }, + { + "epoch": 2.24, + "grad_norm": 0.7165769338607788, + "learning_rate": 9.01802133070573e-05, + "loss": 3.0437, + "step": 45700 + }, + { + "epoch": 2.24, + "grad_norm": 0.7212401032447815, + "learning_rate": 9.016920944630407e-05, + "loss": 2.9006, + "step": 45701 + }, + { + "epoch": 2.24, + "grad_norm": 0.7075793147087097, + "learning_rate": 9.015820613820468e-05, + "loss": 2.9832, + "step": 45702 + }, + { + "epoch": 2.24, + "grad_norm": 0.6998143196105957, + "learning_rate": 9.014720338278818e-05, + "loss": 2.8956, + "step": 45703 + }, + { + "epoch": 2.24, + "grad_norm": 0.6937516927719116, + "learning_rate": 9.013620118008365e-05, + "loss": 3.1962, + "step": 45704 + }, + { + "epoch": 2.24, + "grad_norm": 0.6879575252532959, + "learning_rate": 9.012519953011999e-05, + "loss": 2.9001, + "step": 45705 + }, + { + "epoch": 2.24, + "grad_norm": 0.7310060858726501, + "learning_rate": 9.011419843292615e-05, + "loss": 2.6941, + "step": 45706 + }, + { + "epoch": 2.24, + "grad_norm": 0.6938087344169617, + "learning_rate": 9.010319788853102e-05, + "loss": 2.8176, + "step": 45707 + }, + { + "epoch": 2.24, + "grad_norm": 0.7372106313705444, + "learning_rate": 9.009219789696369e-05, + "loss": 2.8348, + "step": 45708 + }, + { + "epoch": 2.24, + "grad_norm": 0.7054558992385864, + "learning_rate": 9.00811984582532e-05, + "loss": 2.8556, + "step": 45709 + }, + { + "epoch": 2.24, + "grad_norm": 0.709180474281311, + "learning_rate": 9.007019957242831e-05, + "loss": 2.8494, + "step": 45710 + }, + { + "epoch": 2.24, + "grad_norm": 0.714429497718811, + "learning_rate": 9.005920123951824e-05, + "loss": 3.0477, + "step": 45711 + }, + { + "epoch": 2.24, + "grad_norm": 0.7273450493812561, + "learning_rate": 9.004820345955173e-05, + "loss": 2.9587, + "step": 45712 + }, + { + "epoch": 2.24, + "grad_norm": 0.7173202633857727, + "learning_rate": 9.003720623255795e-05, + "loss": 3.0399, + "step": 45713 + }, + { + "epoch": 2.24, + "grad_norm": 0.7252642512321472, + "learning_rate": 9.002620955856574e-05, + "loss": 3.1725, + "step": 45714 + }, + { + "epoch": 2.24, + "grad_norm": 0.707578182220459, + "learning_rate": 9.0015213437604e-05, + "loss": 2.9012, + "step": 45715 + }, + { + "epoch": 2.24, + "grad_norm": 0.7207184433937073, + "learning_rate": 9.000421786970188e-05, + "loss": 2.5106, + "step": 45716 + }, + { + "epoch": 2.24, + "grad_norm": 0.7439934611320496, + "learning_rate": 8.999322285488814e-05, + "loss": 2.9038, + "step": 45717 + }, + { + "epoch": 2.24, + "grad_norm": 0.6846615076065063, + "learning_rate": 8.998222839319182e-05, + "loss": 2.8526, + "step": 45718 + }, + { + "epoch": 2.24, + "grad_norm": 0.6808404326438904, + "learning_rate": 8.997123448464199e-05, + "loss": 3.0216, + "step": 45719 + }, + { + "epoch": 2.24, + "grad_norm": 0.6881184577941895, + "learning_rate": 8.99602411292675e-05, + "loss": 2.9798, + "step": 45720 + }, + { + "epoch": 2.24, + "grad_norm": 0.7790664434432983, + "learning_rate": 8.994924832709732e-05, + "loss": 3.0434, + "step": 45721 + }, + { + "epoch": 2.24, + "grad_norm": 0.7797252535820007, + "learning_rate": 8.993825607816031e-05, + "loss": 2.7815, + "step": 45722 + }, + { + "epoch": 2.24, + "grad_norm": 0.6707017421722412, + "learning_rate": 8.99272643824855e-05, + "loss": 3.0048, + "step": 45723 + }, + { + "epoch": 2.24, + "grad_norm": 0.7407448887825012, + "learning_rate": 8.991627324010193e-05, + "loss": 2.8102, + "step": 45724 + }, + { + "epoch": 2.24, + "grad_norm": 0.7850496768951416, + "learning_rate": 8.990528265103839e-05, + "loss": 3.0061, + "step": 45725 + }, + { + "epoch": 2.24, + "grad_norm": 0.7100532054901123, + "learning_rate": 8.989429261532398e-05, + "loss": 2.9508, + "step": 45726 + }, + { + "epoch": 2.24, + "grad_norm": 0.7451736927032471, + "learning_rate": 8.988330313298748e-05, + "loss": 2.9667, + "step": 45727 + }, + { + "epoch": 2.24, + "grad_norm": 0.7049230337142944, + "learning_rate": 8.9872314204058e-05, + "loss": 2.787, + "step": 45728 + }, + { + "epoch": 2.24, + "grad_norm": 0.6660589575767517, + "learning_rate": 8.986132582856442e-05, + "loss": 2.7769, + "step": 45729 + }, + { + "epoch": 2.24, + "grad_norm": 0.7338511347770691, + "learning_rate": 8.985033800653558e-05, + "loss": 2.8149, + "step": 45730 + }, + { + "epoch": 2.24, + "grad_norm": 0.6853067278862, + "learning_rate": 8.983935073800059e-05, + "loss": 2.7641, + "step": 45731 + }, + { + "epoch": 2.24, + "grad_norm": 0.7548957467079163, + "learning_rate": 8.98283640229882e-05, + "loss": 2.7663, + "step": 45732 + }, + { + "epoch": 2.24, + "grad_norm": 0.7389004826545715, + "learning_rate": 8.981737786152748e-05, + "loss": 3.224, + "step": 45733 + }, + { + "epoch": 2.24, + "grad_norm": 0.6959095001220703, + "learning_rate": 8.98063922536474e-05, + "loss": 2.9059, + "step": 45734 + }, + { + "epoch": 2.24, + "grad_norm": 0.7141698002815247, + "learning_rate": 8.979540719937684e-05, + "loss": 3.022, + "step": 45735 + }, + { + "epoch": 2.24, + "grad_norm": 0.7193446159362793, + "learning_rate": 8.978442269874473e-05, + "loss": 2.8698, + "step": 45736 + }, + { + "epoch": 2.24, + "grad_norm": 0.6984773874282837, + "learning_rate": 8.977343875177988e-05, + "loss": 2.8872, + "step": 45737 + }, + { + "epoch": 2.24, + "grad_norm": 0.7044937610626221, + "learning_rate": 8.976245535851145e-05, + "loss": 2.9768, + "step": 45738 + }, + { + "epoch": 2.24, + "grad_norm": 0.732440173625946, + "learning_rate": 8.975147251896816e-05, + "loss": 2.8359, + "step": 45739 + }, + { + "epoch": 2.24, + "grad_norm": 0.7215022444725037, + "learning_rate": 8.974049023317898e-05, + "loss": 2.7151, + "step": 45740 + }, + { + "epoch": 2.24, + "grad_norm": 0.741974949836731, + "learning_rate": 8.972950850117303e-05, + "loss": 2.7282, + "step": 45741 + }, + { + "epoch": 2.24, + "grad_norm": 0.702601432800293, + "learning_rate": 8.971852732297908e-05, + "loss": 2.9357, + "step": 45742 + }, + { + "epoch": 2.24, + "grad_norm": 0.690036952495575, + "learning_rate": 8.970754669862605e-05, + "loss": 2.7958, + "step": 45743 + }, + { + "epoch": 2.24, + "grad_norm": 0.7078272700309753, + "learning_rate": 8.969656662814278e-05, + "loss": 2.6961, + "step": 45744 + }, + { + "epoch": 2.24, + "grad_norm": 0.7023754119873047, + "learning_rate": 8.968558711155828e-05, + "loss": 2.9431, + "step": 45745 + }, + { + "epoch": 2.24, + "grad_norm": 0.7015004754066467, + "learning_rate": 8.967460814890154e-05, + "loss": 2.9241, + "step": 45746 + }, + { + "epoch": 2.24, + "grad_norm": 0.7229611277580261, + "learning_rate": 8.966362974020135e-05, + "loss": 2.9218, + "step": 45747 + }, + { + "epoch": 2.24, + "grad_norm": 0.7046184539794922, + "learning_rate": 8.965265188548675e-05, + "loss": 2.7128, + "step": 45748 + }, + { + "epoch": 2.24, + "grad_norm": 0.6686332821846008, + "learning_rate": 8.964167458478649e-05, + "loss": 2.9473, + "step": 45749 + }, + { + "epoch": 2.24, + "grad_norm": 0.7126154899597168, + "learning_rate": 8.963069783812966e-05, + "loss": 3.0076, + "step": 45750 + }, + { + "epoch": 2.24, + "grad_norm": 0.720814049243927, + "learning_rate": 8.961972164554509e-05, + "loss": 2.8959, + "step": 45751 + }, + { + "epoch": 2.24, + "grad_norm": 0.767537534236908, + "learning_rate": 8.960874600706163e-05, + "loss": 2.9654, + "step": 45752 + }, + { + "epoch": 2.24, + "grad_norm": 0.7737765312194824, + "learning_rate": 8.95977709227083e-05, + "loss": 2.9991, + "step": 45753 + }, + { + "epoch": 2.24, + "grad_norm": 0.7451054453849792, + "learning_rate": 8.958679639251383e-05, + "loss": 2.76, + "step": 45754 + }, + { + "epoch": 2.24, + "grad_norm": 0.682572066783905, + "learning_rate": 8.957582241650729e-05, + "loss": 3.0767, + "step": 45755 + }, + { + "epoch": 2.24, + "grad_norm": 0.7428975701332092, + "learning_rate": 8.956484899471761e-05, + "loss": 2.8986, + "step": 45756 + }, + { + "epoch": 2.24, + "grad_norm": 0.7675819993019104, + "learning_rate": 8.955387612717362e-05, + "loss": 2.911, + "step": 45757 + }, + { + "epoch": 2.24, + "grad_norm": 0.7058717012405396, + "learning_rate": 8.954290381390422e-05, + "loss": 2.8263, + "step": 45758 + }, + { + "epoch": 2.24, + "grad_norm": 0.7053318023681641, + "learning_rate": 8.953193205493819e-05, + "loss": 2.8086, + "step": 45759 + }, + { + "epoch": 2.24, + "grad_norm": 0.7443513870239258, + "learning_rate": 8.952096085030458e-05, + "loss": 2.9749, + "step": 45760 + }, + { + "epoch": 2.24, + "grad_norm": 0.6907855868339539, + "learning_rate": 8.950999020003236e-05, + "loss": 2.86, + "step": 45761 + }, + { + "epoch": 2.24, + "grad_norm": 0.6709179282188416, + "learning_rate": 8.949902010415016e-05, + "loss": 2.8877, + "step": 45762 + }, + { + "epoch": 2.24, + "grad_norm": 0.6978246569633484, + "learning_rate": 8.948805056268718e-05, + "loss": 2.9097, + "step": 45763 + }, + { + "epoch": 2.24, + "grad_norm": 0.7394630908966064, + "learning_rate": 8.947708157567213e-05, + "loss": 2.9971, + "step": 45764 + }, + { + "epoch": 2.24, + "grad_norm": 0.7391385436058044, + "learning_rate": 8.946611314313386e-05, + "loss": 2.8428, + "step": 45765 + }, + { + "epoch": 2.24, + "grad_norm": 0.7328847646713257, + "learning_rate": 8.945514526510141e-05, + "loss": 2.82, + "step": 45766 + }, + { + "epoch": 2.24, + "grad_norm": 0.7323940992355347, + "learning_rate": 8.944417794160352e-05, + "loss": 3.0405, + "step": 45767 + }, + { + "epoch": 2.24, + "grad_norm": 0.7202035784721375, + "learning_rate": 8.94332111726692e-05, + "loss": 2.8026, + "step": 45768 + }, + { + "epoch": 2.24, + "grad_norm": 0.6832515597343445, + "learning_rate": 8.942224495832721e-05, + "loss": 2.8618, + "step": 45769 + }, + { + "epoch": 2.24, + "grad_norm": 0.6816558241844177, + "learning_rate": 8.941127929860645e-05, + "loss": 2.9306, + "step": 45770 + }, + { + "epoch": 2.24, + "grad_norm": 0.7403445839881897, + "learning_rate": 8.9400314193536e-05, + "loss": 3.0726, + "step": 45771 + }, + { + "epoch": 2.24, + "grad_norm": 0.6996352672576904, + "learning_rate": 8.938934964314456e-05, + "loss": 2.7873, + "step": 45772 + }, + { + "epoch": 2.24, + "grad_norm": 0.7047386169433594, + "learning_rate": 8.937838564746107e-05, + "loss": 2.8053, + "step": 45773 + }, + { + "epoch": 2.24, + "grad_norm": 0.7159958481788635, + "learning_rate": 8.936742220651427e-05, + "loss": 2.9682, + "step": 45774 + }, + { + "epoch": 2.24, + "grad_norm": 0.6776139736175537, + "learning_rate": 8.935645932033311e-05, + "loss": 2.88, + "step": 45775 + }, + { + "epoch": 2.24, + "grad_norm": 0.6902870535850525, + "learning_rate": 8.934549698894664e-05, + "loss": 2.8853, + "step": 45776 + }, + { + "epoch": 2.24, + "grad_norm": 0.7082247138023376, + "learning_rate": 8.933453521238348e-05, + "loss": 2.9993, + "step": 45777 + }, + { + "epoch": 2.24, + "grad_norm": 0.694507360458374, + "learning_rate": 8.93235739906727e-05, + "loss": 2.9914, + "step": 45778 + }, + { + "epoch": 2.24, + "grad_norm": 0.6993120908737183, + "learning_rate": 8.931261332384308e-05, + "loss": 2.9088, + "step": 45779 + }, + { + "epoch": 2.24, + "grad_norm": 0.701582670211792, + "learning_rate": 8.930165321192341e-05, + "loss": 2.9265, + "step": 45780 + }, + { + "epoch": 2.24, + "grad_norm": 0.7343215942382812, + "learning_rate": 8.92906936549427e-05, + "loss": 2.856, + "step": 45781 + }, + { + "epoch": 2.24, + "grad_norm": 0.7673470973968506, + "learning_rate": 8.927973465292967e-05, + "loss": 3.0379, + "step": 45782 + }, + { + "epoch": 2.24, + "grad_norm": 0.7404628396034241, + "learning_rate": 8.926877620591339e-05, + "loss": 3.0584, + "step": 45783 + }, + { + "epoch": 2.24, + "grad_norm": 0.7033378481864929, + "learning_rate": 8.925781831392246e-05, + "loss": 2.9548, + "step": 45784 + }, + { + "epoch": 2.24, + "grad_norm": 0.70482337474823, + "learning_rate": 8.9246860976986e-05, + "loss": 2.8364, + "step": 45785 + }, + { + "epoch": 2.24, + "grad_norm": 0.6887717247009277, + "learning_rate": 8.923590419513273e-05, + "loss": 3.0289, + "step": 45786 + }, + { + "epoch": 2.24, + "grad_norm": 0.7255544066429138, + "learning_rate": 8.922494796839143e-05, + "loss": 2.9807, + "step": 45787 + }, + { + "epoch": 2.24, + "grad_norm": 0.6836867928504944, + "learning_rate": 8.921399229679118e-05, + "loss": 2.8207, + "step": 45788 + }, + { + "epoch": 2.24, + "grad_norm": 0.707114040851593, + "learning_rate": 8.920303718036058e-05, + "loss": 2.8818, + "step": 45789 + }, + { + "epoch": 2.24, + "grad_norm": 0.7001112699508667, + "learning_rate": 8.919208261912862e-05, + "loss": 3.0064, + "step": 45790 + }, + { + "epoch": 2.24, + "grad_norm": 0.7056055068969727, + "learning_rate": 8.918112861312422e-05, + "loss": 2.915, + "step": 45791 + }, + { + "epoch": 2.24, + "grad_norm": 0.71812504529953, + "learning_rate": 8.91701751623761e-05, + "loss": 2.6304, + "step": 45792 + }, + { + "epoch": 2.24, + "grad_norm": 0.7017490267753601, + "learning_rate": 8.915922226691323e-05, + "loss": 3.0009, + "step": 45793 + }, + { + "epoch": 2.24, + "grad_norm": 0.6928110718727112, + "learning_rate": 8.914826992676439e-05, + "loss": 2.8037, + "step": 45794 + }, + { + "epoch": 2.24, + "grad_norm": 0.7178500890731812, + "learning_rate": 8.913731814195836e-05, + "loss": 3.0088, + "step": 45795 + }, + { + "epoch": 2.24, + "grad_norm": 0.7326123118400574, + "learning_rate": 8.91263669125241e-05, + "loss": 2.7809, + "step": 45796 + }, + { + "epoch": 2.24, + "grad_norm": 0.7544444799423218, + "learning_rate": 8.911541623849035e-05, + "loss": 2.9505, + "step": 45797 + }, + { + "epoch": 2.24, + "grad_norm": 0.6959039568901062, + "learning_rate": 8.910446611988609e-05, + "loss": 2.9881, + "step": 45798 + }, + { + "epoch": 2.24, + "grad_norm": 0.7174462676048279, + "learning_rate": 8.909351655674e-05, + "loss": 2.8899, + "step": 45799 + }, + { + "epoch": 2.24, + "grad_norm": 0.7137072682380676, + "learning_rate": 8.908256754908108e-05, + "loss": 2.8453, + "step": 45800 + }, + { + "epoch": 2.24, + "grad_norm": 0.7283577919006348, + "learning_rate": 8.907161909693806e-05, + "loss": 2.7934, + "step": 45801 + }, + { + "epoch": 2.24, + "grad_norm": 0.732901930809021, + "learning_rate": 8.906067120033975e-05, + "loss": 2.6958, + "step": 45802 + }, + { + "epoch": 2.24, + "grad_norm": 0.7311907410621643, + "learning_rate": 8.904972385931509e-05, + "loss": 2.8987, + "step": 45803 + }, + { + "epoch": 2.24, + "grad_norm": 0.6583136320114136, + "learning_rate": 8.903877707389279e-05, + "loss": 2.809, + "step": 45804 + }, + { + "epoch": 2.24, + "grad_norm": 0.6969159245491028, + "learning_rate": 8.902783084410174e-05, + "loss": 2.8369, + "step": 45805 + }, + { + "epoch": 2.24, + "grad_norm": 0.7024007439613342, + "learning_rate": 8.901688516997091e-05, + "loss": 2.8592, + "step": 45806 + }, + { + "epoch": 2.24, + "grad_norm": 0.8098918795585632, + "learning_rate": 8.900594005152896e-05, + "loss": 2.9321, + "step": 45807 + }, + { + "epoch": 2.24, + "grad_norm": 0.7050246000289917, + "learning_rate": 8.899499548880476e-05, + "loss": 2.8934, + "step": 45808 + }, + { + "epoch": 2.24, + "grad_norm": 0.7347972393035889, + "learning_rate": 8.898405148182708e-05, + "loss": 2.9272, + "step": 45809 + }, + { + "epoch": 2.25, + "grad_norm": 0.7350515127182007, + "learning_rate": 8.897310803062475e-05, + "loss": 2.971, + "step": 45810 + }, + { + "epoch": 2.25, + "grad_norm": 0.7553649544715881, + "learning_rate": 8.896216513522676e-05, + "loss": 3.0858, + "step": 45811 + }, + { + "epoch": 2.25, + "grad_norm": 0.736858069896698, + "learning_rate": 8.895122279566168e-05, + "loss": 2.8214, + "step": 45812 + }, + { + "epoch": 2.25, + "grad_norm": 0.7918192744255066, + "learning_rate": 8.894028101195859e-05, + "loss": 2.7361, + "step": 45813 + }, + { + "epoch": 2.25, + "grad_norm": 0.70150226354599, + "learning_rate": 8.89293397841461e-05, + "loss": 2.9136, + "step": 45814 + }, + { + "epoch": 2.25, + "grad_norm": 0.7298532724380493, + "learning_rate": 8.89183991122532e-05, + "loss": 2.9162, + "step": 45815 + }, + { + "epoch": 2.25, + "grad_norm": 0.6884034276008606, + "learning_rate": 8.890745899630857e-05, + "loss": 2.7884, + "step": 45816 + }, + { + "epoch": 2.25, + "grad_norm": 0.7185069918632507, + "learning_rate": 8.889651943634101e-05, + "loss": 2.9132, + "step": 45817 + }, + { + "epoch": 2.25, + "grad_norm": 0.7185012698173523, + "learning_rate": 8.888558043237946e-05, + "loss": 2.8284, + "step": 45818 + }, + { + "epoch": 2.25, + "grad_norm": 0.6928901672363281, + "learning_rate": 8.88746419844526e-05, + "loss": 2.8727, + "step": 45819 + }, + { + "epoch": 2.25, + "grad_norm": 0.7991043329238892, + "learning_rate": 8.886370409258935e-05, + "loss": 3.0365, + "step": 45820 + }, + { + "epoch": 2.25, + "grad_norm": 0.7154514193534851, + "learning_rate": 8.885276675681842e-05, + "loss": 2.7826, + "step": 45821 + }, + { + "epoch": 2.25, + "grad_norm": 0.6965875625610352, + "learning_rate": 8.884182997716874e-05, + "loss": 3.0469, + "step": 45822 + }, + { + "epoch": 2.25, + "grad_norm": 0.7708733677864075, + "learning_rate": 8.883089375366904e-05, + "loss": 2.8216, + "step": 45823 + }, + { + "epoch": 2.25, + "grad_norm": 0.7230860590934753, + "learning_rate": 8.881995808634801e-05, + "loss": 3.0006, + "step": 45824 + }, + { + "epoch": 2.25, + "grad_norm": 0.6626743674278259, + "learning_rate": 8.880902297523471e-05, + "loss": 2.7648, + "step": 45825 + }, + { + "epoch": 2.25, + "grad_norm": 0.7058641910552979, + "learning_rate": 8.879808842035769e-05, + "loss": 2.9872, + "step": 45826 + }, + { + "epoch": 2.25, + "grad_norm": 0.7745085954666138, + "learning_rate": 8.878715442174585e-05, + "loss": 2.9538, + "step": 45827 + }, + { + "epoch": 2.25, + "grad_norm": 0.6953856945037842, + "learning_rate": 8.877622097942807e-05, + "loss": 2.8034, + "step": 45828 + }, + { + "epoch": 2.25, + "grad_norm": 0.8223050236701965, + "learning_rate": 8.876528809343302e-05, + "loss": 2.8538, + "step": 45829 + }, + { + "epoch": 2.25, + "grad_norm": 0.7568581700325012, + "learning_rate": 8.87543557637897e-05, + "loss": 2.9885, + "step": 45830 + }, + { + "epoch": 2.25, + "grad_norm": 0.7025486826896667, + "learning_rate": 8.874342399052658e-05, + "loss": 2.6614, + "step": 45831 + }, + { + "epoch": 2.25, + "grad_norm": 0.7281057834625244, + "learning_rate": 8.873249277367262e-05, + "loss": 2.9158, + "step": 45832 + }, + { + "epoch": 2.25, + "grad_norm": 0.7436516284942627, + "learning_rate": 8.872156211325673e-05, + "loss": 2.8681, + "step": 45833 + }, + { + "epoch": 2.25, + "grad_norm": 0.6978209018707275, + "learning_rate": 8.871063200930747e-05, + "loss": 2.8703, + "step": 45834 + }, + { + "epoch": 2.25, + "grad_norm": 0.7288390398025513, + "learning_rate": 8.869970246185382e-05, + "loss": 2.9508, + "step": 45835 + }, + { + "epoch": 2.25, + "grad_norm": 0.6683169007301331, + "learning_rate": 8.868877347092445e-05, + "loss": 3.0472, + "step": 45836 + }, + { + "epoch": 2.25, + "grad_norm": 0.7435875535011292, + "learning_rate": 8.867784503654823e-05, + "loss": 3.1275, + "step": 45837 + }, + { + "epoch": 2.25, + "grad_norm": 0.751097559928894, + "learning_rate": 8.86669171587539e-05, + "loss": 2.6619, + "step": 45838 + }, + { + "epoch": 2.25, + "grad_norm": 0.7168517708778381, + "learning_rate": 8.865598983757017e-05, + "loss": 2.9837, + "step": 45839 + }, + { + "epoch": 2.25, + "grad_norm": 0.7347090840339661, + "learning_rate": 8.864506307302596e-05, + "loss": 2.6833, + "step": 45840 + }, + { + "epoch": 2.25, + "grad_norm": 0.6629171967506409, + "learning_rate": 8.863413686514986e-05, + "loss": 2.9058, + "step": 45841 + }, + { + "epoch": 2.25, + "grad_norm": 0.7456681132316589, + "learning_rate": 8.862321121397079e-05, + "loss": 2.7433, + "step": 45842 + }, + { + "epoch": 2.25, + "grad_norm": 0.6898133158683777, + "learning_rate": 8.861228611951762e-05, + "loss": 2.9031, + "step": 45843 + }, + { + "epoch": 2.25, + "grad_norm": 0.7045638561248779, + "learning_rate": 8.860136158181898e-05, + "loss": 2.8671, + "step": 45844 + }, + { + "epoch": 2.25, + "grad_norm": 0.7033824324607849, + "learning_rate": 8.859043760090368e-05, + "loss": 3.0791, + "step": 45845 + }, + { + "epoch": 2.25, + "grad_norm": 0.7060011029243469, + "learning_rate": 8.857951417680036e-05, + "loss": 2.7681, + "step": 45846 + }, + { + "epoch": 2.25, + "grad_norm": 0.6827572584152222, + "learning_rate": 8.856859130953795e-05, + "loss": 3.0132, + "step": 45847 + }, + { + "epoch": 2.25, + "grad_norm": 0.725695788860321, + "learning_rate": 8.855766899914527e-05, + "loss": 3.0672, + "step": 45848 + }, + { + "epoch": 2.25, + "grad_norm": 0.7227092981338501, + "learning_rate": 8.854674724565086e-05, + "loss": 3.0037, + "step": 45849 + }, + { + "epoch": 2.25, + "grad_norm": 0.6998618245124817, + "learning_rate": 8.853582604908374e-05, + "loss": 2.7627, + "step": 45850 + }, + { + "epoch": 2.25, + "grad_norm": 0.7095021605491638, + "learning_rate": 8.852490540947249e-05, + "loss": 2.8435, + "step": 45851 + }, + { + "epoch": 2.25, + "grad_norm": 0.7027034163475037, + "learning_rate": 8.8513985326846e-05, + "loss": 2.8237, + "step": 45852 + }, + { + "epoch": 2.25, + "grad_norm": 0.7380695939064026, + "learning_rate": 8.850306580123298e-05, + "loss": 2.7772, + "step": 45853 + }, + { + "epoch": 2.25, + "grad_norm": 0.7337902784347534, + "learning_rate": 8.849214683266208e-05, + "loss": 3.0559, + "step": 45854 + }, + { + "epoch": 2.25, + "grad_norm": 0.7061793804168701, + "learning_rate": 8.848122842116227e-05, + "loss": 2.8721, + "step": 45855 + }, + { + "epoch": 2.25, + "grad_norm": 0.7373761534690857, + "learning_rate": 8.847031056676211e-05, + "loss": 2.9133, + "step": 45856 + }, + { + "epoch": 2.25, + "grad_norm": 0.7317122220993042, + "learning_rate": 8.84593932694904e-05, + "loss": 2.997, + "step": 45857 + }, + { + "epoch": 2.25, + "grad_norm": 0.7210045456886292, + "learning_rate": 8.844847652937609e-05, + "loss": 2.9035, + "step": 45858 + }, + { + "epoch": 2.25, + "grad_norm": 0.7218835949897766, + "learning_rate": 8.843756034644773e-05, + "loss": 2.761, + "step": 45859 + }, + { + "epoch": 2.25, + "grad_norm": 0.6628503799438477, + "learning_rate": 8.842664472073415e-05, + "loss": 2.9027, + "step": 45860 + }, + { + "epoch": 2.25, + "grad_norm": 0.694086492061615, + "learning_rate": 8.841572965226396e-05, + "loss": 2.9911, + "step": 45861 + }, + { + "epoch": 2.25, + "grad_norm": 0.7303391098976135, + "learning_rate": 8.840481514106602e-05, + "loss": 2.8838, + "step": 45862 + }, + { + "epoch": 2.25, + "grad_norm": 0.7094843983650208, + "learning_rate": 8.839390118716915e-05, + "loss": 3.0449, + "step": 45863 + }, + { + "epoch": 2.25, + "grad_norm": 0.6973925828933716, + "learning_rate": 8.838298779060198e-05, + "loss": 2.9713, + "step": 45864 + }, + { + "epoch": 2.25, + "grad_norm": 0.7205259799957275, + "learning_rate": 8.837207495139336e-05, + "loss": 2.8643, + "step": 45865 + }, + { + "epoch": 2.25, + "grad_norm": 0.6511806845664978, + "learning_rate": 8.836116266957193e-05, + "loss": 2.7828, + "step": 45866 + }, + { + "epoch": 2.25, + "grad_norm": 0.7268171906471252, + "learning_rate": 8.83502509451664e-05, + "loss": 2.982, + "step": 45867 + }, + { + "epoch": 2.25, + "grad_norm": 0.751723051071167, + "learning_rate": 8.833933977820569e-05, + "loss": 3.0784, + "step": 45868 + }, + { + "epoch": 2.25, + "grad_norm": 0.7350054979324341, + "learning_rate": 8.83284291687183e-05, + "loss": 2.9919, + "step": 45869 + }, + { + "epoch": 2.25, + "grad_norm": 0.7295348644256592, + "learning_rate": 8.83175191167332e-05, + "loss": 2.9076, + "step": 45870 + }, + { + "epoch": 2.25, + "grad_norm": 0.7354061603546143, + "learning_rate": 8.830660962227892e-05, + "loss": 2.8714, + "step": 45871 + }, + { + "epoch": 2.25, + "grad_norm": 0.7122185230255127, + "learning_rate": 8.82957006853843e-05, + "loss": 2.8408, + "step": 45872 + }, + { + "epoch": 2.25, + "grad_norm": 0.7535485029220581, + "learning_rate": 8.828479230607814e-05, + "loss": 3.0789, + "step": 45873 + }, + { + "epoch": 2.25, + "grad_norm": 0.7035789489746094, + "learning_rate": 8.827388448438908e-05, + "loss": 2.9393, + "step": 45874 + }, + { + "epoch": 2.25, + "grad_norm": 0.7172368764877319, + "learning_rate": 8.826297722034589e-05, + "loss": 2.8124, + "step": 45875 + }, + { + "epoch": 2.25, + "grad_norm": 0.7242939472198486, + "learning_rate": 8.825207051397717e-05, + "loss": 2.8569, + "step": 45876 + }, + { + "epoch": 2.25, + "grad_norm": 0.7067150473594666, + "learning_rate": 8.824116436531174e-05, + "loss": 2.7216, + "step": 45877 + }, + { + "epoch": 2.25, + "grad_norm": 0.6949864029884338, + "learning_rate": 8.823025877437839e-05, + "loss": 3.1062, + "step": 45878 + }, + { + "epoch": 2.25, + "grad_norm": 0.6859537363052368, + "learning_rate": 8.821935374120572e-05, + "loss": 2.8185, + "step": 45879 + }, + { + "epoch": 2.25, + "grad_norm": 0.7593329548835754, + "learning_rate": 8.82084492658226e-05, + "loss": 2.8668, + "step": 45880 + }, + { + "epoch": 2.25, + "grad_norm": 0.6818985342979431, + "learning_rate": 8.819754534825766e-05, + "loss": 2.8889, + "step": 45881 + }, + { + "epoch": 2.25, + "grad_norm": 0.7531813383102417, + "learning_rate": 8.818664198853954e-05, + "loss": 2.6414, + "step": 45882 + }, + { + "epoch": 2.25, + "grad_norm": 0.6858770847320557, + "learning_rate": 8.817573918669716e-05, + "loss": 2.6542, + "step": 45883 + }, + { + "epoch": 2.25, + "grad_norm": 0.7132046222686768, + "learning_rate": 8.8164836942759e-05, + "loss": 2.9236, + "step": 45884 + }, + { + "epoch": 2.25, + "grad_norm": 0.7180655598640442, + "learning_rate": 8.815393525675397e-05, + "loss": 2.9675, + "step": 45885 + }, + { + "epoch": 2.25, + "grad_norm": 0.6883565187454224, + "learning_rate": 8.814303412871063e-05, + "loss": 2.8287, + "step": 45886 + }, + { + "epoch": 2.25, + "grad_norm": 0.7425944209098816, + "learning_rate": 8.813213355865783e-05, + "loss": 2.9131, + "step": 45887 + }, + { + "epoch": 2.25, + "grad_norm": 0.736083447933197, + "learning_rate": 8.812123354662427e-05, + "loss": 2.8409, + "step": 45888 + }, + { + "epoch": 2.25, + "grad_norm": 0.6757725477218628, + "learning_rate": 8.811033409263844e-05, + "loss": 2.7647, + "step": 45889 + }, + { + "epoch": 2.25, + "grad_norm": 0.7013656497001648, + "learning_rate": 8.809943519672936e-05, + "loss": 2.9786, + "step": 45890 + }, + { + "epoch": 2.25, + "grad_norm": 0.7240048050880432, + "learning_rate": 8.808853685892545e-05, + "loss": 3.0001, + "step": 45891 + }, + { + "epoch": 2.25, + "grad_norm": 0.7245268821716309, + "learning_rate": 8.807763907925568e-05, + "loss": 2.9762, + "step": 45892 + }, + { + "epoch": 2.25, + "grad_norm": 0.7503712177276611, + "learning_rate": 8.806674185774852e-05, + "loss": 2.8701, + "step": 45893 + }, + { + "epoch": 2.25, + "grad_norm": 0.7144389152526855, + "learning_rate": 8.805584519443276e-05, + "loss": 2.8848, + "step": 45894 + }, + { + "epoch": 2.25, + "grad_norm": 0.6713013052940369, + "learning_rate": 8.804494908933723e-05, + "loss": 3.0602, + "step": 45895 + }, + { + "epoch": 2.25, + "grad_norm": 0.7255938053131104, + "learning_rate": 8.803405354249049e-05, + "loss": 2.8985, + "step": 45896 + }, + { + "epoch": 2.25, + "grad_norm": 0.7103670835494995, + "learning_rate": 8.802315855392127e-05, + "loss": 2.865, + "step": 45897 + }, + { + "epoch": 2.25, + "grad_norm": 0.7415536046028137, + "learning_rate": 8.801226412365818e-05, + "loss": 2.9041, + "step": 45898 + }, + { + "epoch": 2.25, + "grad_norm": 0.7270449995994568, + "learning_rate": 8.800137025172998e-05, + "loss": 2.9277, + "step": 45899 + }, + { + "epoch": 2.25, + "grad_norm": 0.7240244150161743, + "learning_rate": 8.799047693816545e-05, + "loss": 2.9637, + "step": 45900 + }, + { + "epoch": 2.25, + "grad_norm": 0.6903491616249084, + "learning_rate": 8.79795841829931e-05, + "loss": 2.8616, + "step": 45901 + }, + { + "epoch": 2.25, + "grad_norm": 0.6921778917312622, + "learning_rate": 8.796869198624182e-05, + "loss": 3.0011, + "step": 45902 + }, + { + "epoch": 2.25, + "grad_norm": 0.7207080125808716, + "learning_rate": 8.795780034794024e-05, + "loss": 2.7592, + "step": 45903 + }, + { + "epoch": 2.25, + "grad_norm": 0.7446100115776062, + "learning_rate": 8.794690926811687e-05, + "loss": 3.0125, + "step": 45904 + }, + { + "epoch": 2.25, + "grad_norm": 0.7436581254005432, + "learning_rate": 8.793601874680061e-05, + "loss": 2.8783, + "step": 45905 + }, + { + "epoch": 2.25, + "grad_norm": 0.69056236743927, + "learning_rate": 8.792512878402001e-05, + "loss": 2.9237, + "step": 45906 + }, + { + "epoch": 2.25, + "grad_norm": 0.743360698223114, + "learning_rate": 8.791423937980387e-05, + "loss": 2.9432, + "step": 45907 + }, + { + "epoch": 2.25, + "grad_norm": 0.7039749026298523, + "learning_rate": 8.790335053418074e-05, + "loss": 3.0534, + "step": 45908 + }, + { + "epoch": 2.25, + "grad_norm": 0.6992817521095276, + "learning_rate": 8.789246224717945e-05, + "loss": 3.0331, + "step": 45909 + }, + { + "epoch": 2.25, + "grad_norm": 0.6970164179801941, + "learning_rate": 8.78815745188286e-05, + "loss": 2.9252, + "step": 45910 + }, + { + "epoch": 2.25, + "grad_norm": 0.7451713681221008, + "learning_rate": 8.787068734915673e-05, + "loss": 3.0809, + "step": 45911 + }, + { + "epoch": 2.25, + "grad_norm": 0.7480039000511169, + "learning_rate": 8.78598007381928e-05, + "loss": 3.0333, + "step": 45912 + }, + { + "epoch": 2.25, + "grad_norm": 0.7085638046264648, + "learning_rate": 8.784891468596517e-05, + "loss": 2.881, + "step": 45913 + }, + { + "epoch": 2.25, + "grad_norm": 0.728363573551178, + "learning_rate": 8.783802919250271e-05, + "loss": 2.8696, + "step": 45914 + }, + { + "epoch": 2.25, + "grad_norm": 0.708055317401886, + "learning_rate": 8.782714425783414e-05, + "loss": 3.2101, + "step": 45915 + }, + { + "epoch": 2.25, + "grad_norm": 0.6821580529212952, + "learning_rate": 8.781625988198792e-05, + "loss": 3.1762, + "step": 45916 + }, + { + "epoch": 2.25, + "grad_norm": 0.7033885717391968, + "learning_rate": 8.780537606499297e-05, + "loss": 2.8277, + "step": 45917 + }, + { + "epoch": 2.25, + "grad_norm": 0.7111839652061462, + "learning_rate": 8.779449280687778e-05, + "loss": 2.9337, + "step": 45918 + }, + { + "epoch": 2.25, + "grad_norm": 0.7263917326927185, + "learning_rate": 8.778361010767099e-05, + "loss": 2.8357, + "step": 45919 + }, + { + "epoch": 2.25, + "grad_norm": 0.7069695591926575, + "learning_rate": 8.777272796740139e-05, + "loss": 2.7324, + "step": 45920 + }, + { + "epoch": 2.25, + "grad_norm": 0.7394420504570007, + "learning_rate": 8.77618463860975e-05, + "loss": 2.7066, + "step": 45921 + }, + { + "epoch": 2.25, + "grad_norm": 0.7445680499076843, + "learning_rate": 8.77509653637882e-05, + "loss": 2.8956, + "step": 45922 + }, + { + "epoch": 2.25, + "grad_norm": 0.7384760975837708, + "learning_rate": 8.774008490050185e-05, + "loss": 2.944, + "step": 45923 + }, + { + "epoch": 2.25, + "grad_norm": 0.7003813982009888, + "learning_rate": 8.772920499626741e-05, + "loss": 2.7802, + "step": 45924 + }, + { + "epoch": 2.25, + "grad_norm": 0.7218019366264343, + "learning_rate": 8.771832565111335e-05, + "loss": 2.6604, + "step": 45925 + }, + { + "epoch": 2.25, + "grad_norm": 0.7279051542282104, + "learning_rate": 8.770744686506828e-05, + "loss": 2.8343, + "step": 45926 + }, + { + "epoch": 2.25, + "grad_norm": 0.6893950700759888, + "learning_rate": 8.769656863816106e-05, + "loss": 2.986, + "step": 45927 + }, + { + "epoch": 2.25, + "grad_norm": 0.7431288361549377, + "learning_rate": 8.768569097042011e-05, + "loss": 2.7472, + "step": 45928 + }, + { + "epoch": 2.25, + "grad_norm": 0.7378048896789551, + "learning_rate": 8.767481386187418e-05, + "loss": 2.8564, + "step": 45929 + }, + { + "epoch": 2.25, + "grad_norm": 0.7334486842155457, + "learning_rate": 8.766393731255205e-05, + "loss": 2.9231, + "step": 45930 + }, + { + "epoch": 2.25, + "grad_norm": 0.7250813841819763, + "learning_rate": 8.76530613224822e-05, + "loss": 2.9032, + "step": 45931 + }, + { + "epoch": 2.25, + "grad_norm": 0.7708786725997925, + "learning_rate": 8.764218589169338e-05, + "loss": 2.8288, + "step": 45932 + }, + { + "epoch": 2.25, + "grad_norm": 0.7491968274116516, + "learning_rate": 8.763131102021402e-05, + "loss": 2.9354, + "step": 45933 + }, + { + "epoch": 2.25, + "grad_norm": 0.7650907039642334, + "learning_rate": 8.762043670807293e-05, + "loss": 2.8662, + "step": 45934 + }, + { + "epoch": 2.25, + "grad_norm": 0.7074927091598511, + "learning_rate": 8.760956295529887e-05, + "loss": 3.0082, + "step": 45935 + }, + { + "epoch": 2.25, + "grad_norm": 0.6994934678077698, + "learning_rate": 8.75986897619202e-05, + "loss": 2.7367, + "step": 45936 + }, + { + "epoch": 2.25, + "grad_norm": 0.7439029812812805, + "learning_rate": 8.758781712796584e-05, + "loss": 2.9358, + "step": 45937 + }, + { + "epoch": 2.25, + "grad_norm": 0.737886369228363, + "learning_rate": 8.757694505346416e-05, + "loss": 2.7807, + "step": 45938 + }, + { + "epoch": 2.25, + "grad_norm": 0.6919266581535339, + "learning_rate": 8.756607353844405e-05, + "loss": 2.9541, + "step": 45939 + }, + { + "epoch": 2.25, + "grad_norm": 0.6991779804229736, + "learning_rate": 8.7555202582934e-05, + "loss": 2.8848, + "step": 45940 + }, + { + "epoch": 2.25, + "grad_norm": 0.7198300361633301, + "learning_rate": 8.754433218696258e-05, + "loss": 2.7478, + "step": 45941 + }, + { + "epoch": 2.25, + "grad_norm": 0.7473375201225281, + "learning_rate": 8.753346235055857e-05, + "loss": 2.7545, + "step": 45942 + }, + { + "epoch": 2.25, + "grad_norm": 0.6938808560371399, + "learning_rate": 8.752259307375046e-05, + "loss": 2.8858, + "step": 45943 + }, + { + "epoch": 2.25, + "grad_norm": 0.6992427110671997, + "learning_rate": 8.751172435656695e-05, + "loss": 3.0062, + "step": 45944 + }, + { + "epoch": 2.25, + "grad_norm": 0.7342805862426758, + "learning_rate": 8.750085619903672e-05, + "loss": 3.0506, + "step": 45945 + }, + { + "epoch": 2.25, + "grad_norm": 0.714984655380249, + "learning_rate": 8.748998860118838e-05, + "loss": 2.9983, + "step": 45946 + }, + { + "epoch": 2.25, + "grad_norm": 0.6858196258544922, + "learning_rate": 8.74791215630505e-05, + "loss": 2.8878, + "step": 45947 + }, + { + "epoch": 2.25, + "grad_norm": 0.7082827687263489, + "learning_rate": 8.74682550846516e-05, + "loss": 2.883, + "step": 45948 + }, + { + "epoch": 2.25, + "grad_norm": 0.7259243726730347, + "learning_rate": 8.745738916602044e-05, + "loss": 2.6689, + "step": 45949 + }, + { + "epoch": 2.25, + "grad_norm": 0.7247135639190674, + "learning_rate": 8.744652380718568e-05, + "loss": 2.9188, + "step": 45950 + }, + { + "epoch": 2.25, + "grad_norm": 0.7118974924087524, + "learning_rate": 8.743565900817582e-05, + "loss": 2.8574, + "step": 45951 + }, + { + "epoch": 2.25, + "grad_norm": 0.6941021680831909, + "learning_rate": 8.742479476901958e-05, + "loss": 3.0137, + "step": 45952 + }, + { + "epoch": 2.25, + "grad_norm": 0.7223904728889465, + "learning_rate": 8.741393108974545e-05, + "loss": 2.9941, + "step": 45953 + }, + { + "epoch": 2.25, + "grad_norm": 0.7473703026771545, + "learning_rate": 8.740306797038217e-05, + "loss": 2.8769, + "step": 45954 + }, + { + "epoch": 2.25, + "grad_norm": 0.6923856735229492, + "learning_rate": 8.739220541095831e-05, + "loss": 3.0033, + "step": 45955 + }, + { + "epoch": 2.25, + "grad_norm": 0.7007562518119812, + "learning_rate": 8.73813434115024e-05, + "loss": 3.1408, + "step": 45956 + }, + { + "epoch": 2.25, + "grad_norm": 0.7167068123817444, + "learning_rate": 8.737048197204317e-05, + "loss": 2.8446, + "step": 45957 + }, + { + "epoch": 2.25, + "grad_norm": 0.7058669328689575, + "learning_rate": 8.73596210926091e-05, + "loss": 2.9255, + "step": 45958 + }, + { + "epoch": 2.25, + "grad_norm": 0.7336747646331787, + "learning_rate": 8.734876077322887e-05, + "loss": 2.907, + "step": 45959 + }, + { + "epoch": 2.25, + "grad_norm": 0.6712983250617981, + "learning_rate": 8.733790101393114e-05, + "loss": 2.8126, + "step": 45960 + }, + { + "epoch": 2.25, + "grad_norm": 0.7041860818862915, + "learning_rate": 8.732704181474448e-05, + "loss": 2.9029, + "step": 45961 + }, + { + "epoch": 2.25, + "grad_norm": 0.739032506942749, + "learning_rate": 8.731618317569742e-05, + "loss": 2.9452, + "step": 45962 + }, + { + "epoch": 2.25, + "grad_norm": 0.790059506893158, + "learning_rate": 8.730532509681853e-05, + "loss": 3.131, + "step": 45963 + }, + { + "epoch": 2.25, + "grad_norm": 0.6874712109565735, + "learning_rate": 8.729446757813651e-05, + "loss": 2.8963, + "step": 45964 + }, + { + "epoch": 2.25, + "grad_norm": 0.718464732170105, + "learning_rate": 8.728361061967998e-05, + "loss": 2.9917, + "step": 45965 + }, + { + "epoch": 2.25, + "grad_norm": 0.7371983528137207, + "learning_rate": 8.727275422147743e-05, + "loss": 3.0787, + "step": 45966 + }, + { + "epoch": 2.25, + "grad_norm": 0.689892590045929, + "learning_rate": 8.726189838355756e-05, + "loss": 2.9387, + "step": 45967 + }, + { + "epoch": 2.25, + "grad_norm": 0.7095820903778076, + "learning_rate": 8.725104310594893e-05, + "loss": 2.8798, + "step": 45968 + }, + { + "epoch": 2.25, + "grad_norm": 0.7040510773658752, + "learning_rate": 8.724018838867999e-05, + "loss": 2.8971, + "step": 45969 + }, + { + "epoch": 2.25, + "grad_norm": 0.713518500328064, + "learning_rate": 8.722933423177956e-05, + "loss": 2.9733, + "step": 45970 + }, + { + "epoch": 2.25, + "grad_norm": 0.6998233199119568, + "learning_rate": 8.721848063527601e-05, + "loss": 2.6608, + "step": 45971 + }, + { + "epoch": 2.25, + "grad_norm": 0.6755211353302002, + "learning_rate": 8.720762759919814e-05, + "loss": 2.9071, + "step": 45972 + }, + { + "epoch": 2.25, + "grad_norm": 0.7184854745864868, + "learning_rate": 8.719677512357432e-05, + "loss": 2.8504, + "step": 45973 + }, + { + "epoch": 2.25, + "grad_norm": 0.704601526260376, + "learning_rate": 8.718592320843335e-05, + "loss": 2.9019, + "step": 45974 + }, + { + "epoch": 2.25, + "grad_norm": 0.7312195301055908, + "learning_rate": 8.717507185380359e-05, + "loss": 2.9244, + "step": 45975 + }, + { + "epoch": 2.25, + "grad_norm": 0.7099519371986389, + "learning_rate": 8.716422105971385e-05, + "loss": 2.9385, + "step": 45976 + }, + { + "epoch": 2.25, + "grad_norm": 0.7298969626426697, + "learning_rate": 8.715337082619256e-05, + "loss": 2.9582, + "step": 45977 + }, + { + "epoch": 2.25, + "grad_norm": 0.7228435277938843, + "learning_rate": 8.714252115326825e-05, + "loss": 3.0337, + "step": 45978 + }, + { + "epoch": 2.25, + "grad_norm": 0.7032173275947571, + "learning_rate": 8.713167204096967e-05, + "loss": 2.8777, + "step": 45979 + }, + { + "epoch": 2.25, + "grad_norm": 0.7508836388587952, + "learning_rate": 8.712082348932518e-05, + "loss": 2.8243, + "step": 45980 + }, + { + "epoch": 2.25, + "grad_norm": 0.7287681102752686, + "learning_rate": 8.71099754983635e-05, + "loss": 3.0163, + "step": 45981 + }, + { + "epoch": 2.25, + "grad_norm": 0.7005704045295715, + "learning_rate": 8.709912806811327e-05, + "loss": 2.8342, + "step": 45982 + }, + { + "epoch": 2.25, + "grad_norm": 0.7210054993629456, + "learning_rate": 8.708828119860295e-05, + "loss": 3.0389, + "step": 45983 + }, + { + "epoch": 2.25, + "grad_norm": 0.693412184715271, + "learning_rate": 8.70774348898611e-05, + "loss": 2.8852, + "step": 45984 + }, + { + "epoch": 2.25, + "grad_norm": 0.687518835067749, + "learning_rate": 8.706658914191625e-05, + "loss": 2.9454, + "step": 45985 + }, + { + "epoch": 2.25, + "grad_norm": 0.7263759970664978, + "learning_rate": 8.7055743954797e-05, + "loss": 2.9181, + "step": 45986 + }, + { + "epoch": 2.25, + "grad_norm": 0.7387471199035645, + "learning_rate": 8.704489932853203e-05, + "loss": 2.8456, + "step": 45987 + }, + { + "epoch": 2.25, + "grad_norm": 0.6722941398620605, + "learning_rate": 8.703405526314975e-05, + "loss": 2.8528, + "step": 45988 + }, + { + "epoch": 2.25, + "grad_norm": 0.6813567876815796, + "learning_rate": 8.702321175867884e-05, + "loss": 2.9437, + "step": 45989 + }, + { + "epoch": 2.25, + "grad_norm": 0.7118014693260193, + "learning_rate": 8.701236881514784e-05, + "loss": 2.8783, + "step": 45990 + }, + { + "epoch": 2.25, + "grad_norm": 0.7127276659011841, + "learning_rate": 8.700152643258516e-05, + "loss": 3.0367, + "step": 45991 + }, + { + "epoch": 2.25, + "grad_norm": 0.8109259605407715, + "learning_rate": 8.699068461101955e-05, + "loss": 2.8818, + "step": 45992 + }, + { + "epoch": 2.25, + "grad_norm": 0.689415693283081, + "learning_rate": 8.697984335047939e-05, + "loss": 3.0599, + "step": 45993 + }, + { + "epoch": 2.25, + "grad_norm": 0.7123958468437195, + "learning_rate": 8.696900265099343e-05, + "loss": 2.8822, + "step": 45994 + }, + { + "epoch": 2.25, + "grad_norm": 0.7250252366065979, + "learning_rate": 8.695816251259002e-05, + "loss": 2.9265, + "step": 45995 + }, + { + "epoch": 2.25, + "grad_norm": 0.7128042578697205, + "learning_rate": 8.694732293529783e-05, + "loss": 3.0612, + "step": 45996 + }, + { + "epoch": 2.25, + "grad_norm": 0.7381935119628906, + "learning_rate": 8.693648391914543e-05, + "loss": 2.7554, + "step": 45997 + }, + { + "epoch": 2.25, + "grad_norm": 0.6966440677642822, + "learning_rate": 8.692564546416137e-05, + "loss": 2.7047, + "step": 45998 + }, + { + "epoch": 2.25, + "grad_norm": 0.7341095209121704, + "learning_rate": 8.691480757037414e-05, + "loss": 3.1269, + "step": 45999 + }, + { + "epoch": 2.25, + "grad_norm": 0.6964755654335022, + "learning_rate": 8.690397023781219e-05, + "loss": 3.019, + "step": 46000 + }, + { + "epoch": 2.25, + "grad_norm": 0.7326573729515076, + "learning_rate": 8.689313346650419e-05, + "loss": 2.7295, + "step": 46001 + }, + { + "epoch": 2.25, + "grad_norm": 0.7106379866600037, + "learning_rate": 8.688229725647876e-05, + "loss": 2.8008, + "step": 46002 + }, + { + "epoch": 2.25, + "grad_norm": 0.6738712787628174, + "learning_rate": 8.687146160776424e-05, + "loss": 2.7775, + "step": 46003 + }, + { + "epoch": 2.25, + "grad_norm": 0.6668511033058167, + "learning_rate": 8.686062652038939e-05, + "loss": 2.9989, + "step": 46004 + }, + { + "epoch": 2.25, + "grad_norm": 0.8261511921882629, + "learning_rate": 8.684979199438259e-05, + "loss": 2.7222, + "step": 46005 + }, + { + "epoch": 2.25, + "grad_norm": 0.6966729760169983, + "learning_rate": 8.683895802977236e-05, + "loss": 3.1088, + "step": 46006 + }, + { + "epoch": 2.25, + "grad_norm": 0.7278913259506226, + "learning_rate": 8.682812462658736e-05, + "loss": 3.029, + "step": 46007 + }, + { + "epoch": 2.25, + "grad_norm": 0.6950763463973999, + "learning_rate": 8.681729178485595e-05, + "loss": 2.7216, + "step": 46008 + }, + { + "epoch": 2.25, + "grad_norm": 0.753512442111969, + "learning_rate": 8.68064595046069e-05, + "loss": 2.9456, + "step": 46009 + }, + { + "epoch": 2.25, + "grad_norm": 0.7178856730461121, + "learning_rate": 8.679562778586848e-05, + "loss": 2.7979, + "step": 46010 + }, + { + "epoch": 2.25, + "grad_norm": 0.6799273490905762, + "learning_rate": 8.678479662866944e-05, + "loss": 3.0293, + "step": 46011 + }, + { + "epoch": 2.25, + "grad_norm": 0.7092909812927246, + "learning_rate": 8.677396603303821e-05, + "loss": 2.8555, + "step": 46012 + }, + { + "epoch": 2.25, + "grad_norm": 0.744310736656189, + "learning_rate": 8.676313599900325e-05, + "loss": 3.0313, + "step": 46013 + }, + { + "epoch": 2.26, + "grad_norm": 0.7400776743888855, + "learning_rate": 8.675230652659322e-05, + "loss": 2.9776, + "step": 46014 + }, + { + "epoch": 2.26, + "grad_norm": 0.7001795768737793, + "learning_rate": 8.674147761583649e-05, + "loss": 3.0375, + "step": 46015 + }, + { + "epoch": 2.26, + "grad_norm": 0.7090204954147339, + "learning_rate": 8.673064926676166e-05, + "loss": 3.1105, + "step": 46016 + }, + { + "epoch": 2.26, + "grad_norm": 0.6803956031799316, + "learning_rate": 8.671982147939733e-05, + "loss": 2.9111, + "step": 46017 + }, + { + "epoch": 2.26, + "grad_norm": 0.6881393790245056, + "learning_rate": 8.670899425377191e-05, + "loss": 2.9803, + "step": 46018 + }, + { + "epoch": 2.26, + "grad_norm": 0.7794377207756042, + "learning_rate": 8.669816758991397e-05, + "loss": 3.0788, + "step": 46019 + }, + { + "epoch": 2.26, + "grad_norm": 0.6823427677154541, + "learning_rate": 8.668734148785205e-05, + "loss": 2.7918, + "step": 46020 + }, + { + "epoch": 2.26, + "grad_norm": 0.7123182415962219, + "learning_rate": 8.667651594761452e-05, + "loss": 2.896, + "step": 46021 + }, + { + "epoch": 2.26, + "grad_norm": 0.7293609976768494, + "learning_rate": 8.666569096923007e-05, + "loss": 3.039, + "step": 46022 + }, + { + "epoch": 2.26, + "grad_norm": 0.7326741218566895, + "learning_rate": 8.665486655272708e-05, + "loss": 3.0871, + "step": 46023 + }, + { + "epoch": 2.26, + "grad_norm": 0.7077974677085876, + "learning_rate": 8.664404269813416e-05, + "loss": 2.9412, + "step": 46024 + }, + { + "epoch": 2.26, + "grad_norm": 0.7263396382331848, + "learning_rate": 8.663321940547971e-05, + "loss": 2.8853, + "step": 46025 + }, + { + "epoch": 2.26, + "grad_norm": 0.6902568340301514, + "learning_rate": 8.662239667479238e-05, + "loss": 2.8761, + "step": 46026 + }, + { + "epoch": 2.26, + "grad_norm": 0.6834216713905334, + "learning_rate": 8.661157450610058e-05, + "loss": 2.987, + "step": 46027 + }, + { + "epoch": 2.26, + "grad_norm": 0.7192767262458801, + "learning_rate": 8.660075289943274e-05, + "loss": 3.0016, + "step": 46028 + }, + { + "epoch": 2.26, + "grad_norm": 0.711889922618866, + "learning_rate": 8.658993185481752e-05, + "loss": 2.9948, + "step": 46029 + }, + { + "epoch": 2.26, + "grad_norm": 0.7222273349761963, + "learning_rate": 8.657911137228328e-05, + "loss": 2.9268, + "step": 46030 + }, + { + "epoch": 2.26, + "grad_norm": 0.7341220378875732, + "learning_rate": 8.656829145185858e-05, + "loss": 2.8919, + "step": 46031 + }, + { + "epoch": 2.26, + "grad_norm": 0.6676339507102966, + "learning_rate": 8.655747209357203e-05, + "loss": 2.9097, + "step": 46032 + }, + { + "epoch": 2.26, + "grad_norm": 0.7094964385032654, + "learning_rate": 8.654665329745199e-05, + "loss": 3.0597, + "step": 46033 + }, + { + "epoch": 2.26, + "grad_norm": 0.7654356360435486, + "learning_rate": 8.653583506352702e-05, + "loss": 2.7896, + "step": 46034 + }, + { + "epoch": 2.26, + "grad_norm": 0.7312438488006592, + "learning_rate": 8.652501739182546e-05, + "loss": 2.9281, + "step": 46035 + }, + { + "epoch": 2.26, + "grad_norm": 0.6991698741912842, + "learning_rate": 8.651420028237594e-05, + "loss": 2.9008, + "step": 46036 + }, + { + "epoch": 2.26, + "grad_norm": 0.725242018699646, + "learning_rate": 8.650338373520702e-05, + "loss": 2.9847, + "step": 46037 + }, + { + "epoch": 2.26, + "grad_norm": 0.6978802680969238, + "learning_rate": 8.6492567750347e-05, + "loss": 2.6138, + "step": 46038 + }, + { + "epoch": 2.26, + "grad_norm": 0.7490013241767883, + "learning_rate": 8.648175232782458e-05, + "loss": 2.8943, + "step": 46039 + }, + { + "epoch": 2.26, + "grad_norm": 0.7092497944831848, + "learning_rate": 8.647093746766799e-05, + "loss": 2.9318, + "step": 46040 + }, + { + "epoch": 2.26, + "grad_norm": 0.709816575050354, + "learning_rate": 8.6460123169906e-05, + "loss": 2.881, + "step": 46041 + }, + { + "epoch": 2.26, + "grad_norm": 0.7483521699905396, + "learning_rate": 8.644930943456693e-05, + "loss": 2.7731, + "step": 46042 + }, + { + "epoch": 2.26, + "grad_norm": 0.7091911435127258, + "learning_rate": 8.643849626167917e-05, + "loss": 2.8516, + "step": 46043 + }, + { + "epoch": 2.26, + "grad_norm": 0.7004270553588867, + "learning_rate": 8.642768365127141e-05, + "loss": 2.8595, + "step": 46044 + }, + { + "epoch": 2.26, + "grad_norm": 0.703554093837738, + "learning_rate": 8.641687160337195e-05, + "loss": 2.5378, + "step": 46045 + }, + { + "epoch": 2.26, + "grad_norm": 0.68243408203125, + "learning_rate": 8.640606011800935e-05, + "loss": 2.8221, + "step": 46046 + }, + { + "epoch": 2.26, + "grad_norm": 0.743057906627655, + "learning_rate": 8.639524919521213e-05, + "loss": 2.9218, + "step": 46047 + }, + { + "epoch": 2.26, + "grad_norm": 0.6928690075874329, + "learning_rate": 8.638443883500877e-05, + "loss": 2.9012, + "step": 46048 + }, + { + "epoch": 2.26, + "grad_norm": 0.7314662337303162, + "learning_rate": 8.637362903742768e-05, + "loss": 2.9544, + "step": 46049 + }, + { + "epoch": 2.26, + "grad_norm": 0.6911783218383789, + "learning_rate": 8.63628198024972e-05, + "loss": 2.8013, + "step": 46050 + }, + { + "epoch": 2.26, + "grad_norm": 0.8123417496681213, + "learning_rate": 8.635201113024607e-05, + "loss": 2.9374, + "step": 46051 + }, + { + "epoch": 2.26, + "grad_norm": 0.7197860479354858, + "learning_rate": 8.634120302070254e-05, + "loss": 3.0019, + "step": 46052 + }, + { + "epoch": 2.26, + "grad_norm": 0.7315776944160461, + "learning_rate": 8.633039547389511e-05, + "loss": 2.7753, + "step": 46053 + }, + { + "epoch": 2.26, + "grad_norm": 0.790367841720581, + "learning_rate": 8.631958848985241e-05, + "loss": 2.9611, + "step": 46054 + }, + { + "epoch": 2.26, + "grad_norm": 0.8130670189857483, + "learning_rate": 8.630878206860275e-05, + "loss": 2.861, + "step": 46055 + }, + { + "epoch": 2.26, + "grad_norm": 0.7119998931884766, + "learning_rate": 8.629797621017475e-05, + "loss": 2.7987, + "step": 46056 + }, + { + "epoch": 2.26, + "grad_norm": 0.7341967821121216, + "learning_rate": 8.62871709145966e-05, + "loss": 2.9332, + "step": 46057 + }, + { + "epoch": 2.26, + "grad_norm": 0.7142411470413208, + "learning_rate": 8.62763661818969e-05, + "loss": 2.8448, + "step": 46058 + }, + { + "epoch": 2.26, + "grad_norm": 0.7042397260665894, + "learning_rate": 8.626556201210424e-05, + "loss": 2.8789, + "step": 46059 + }, + { + "epoch": 2.26, + "grad_norm": 0.7835875153541565, + "learning_rate": 8.625475840524681e-05, + "loss": 2.9927, + "step": 46060 + }, + { + "epoch": 2.26, + "grad_norm": 0.718367338180542, + "learning_rate": 8.624395536135336e-05, + "loss": 3.0031, + "step": 46061 + }, + { + "epoch": 2.26, + "grad_norm": 0.742847204208374, + "learning_rate": 8.623315288045207e-05, + "loss": 2.9724, + "step": 46062 + }, + { + "epoch": 2.26, + "grad_norm": 0.7116962671279907, + "learning_rate": 8.62223509625716e-05, + "loss": 2.7999, + "step": 46063 + }, + { + "epoch": 2.26, + "grad_norm": 0.7158562541007996, + "learning_rate": 8.621154960774035e-05, + "loss": 3.0854, + "step": 46064 + }, + { + "epoch": 2.26, + "grad_norm": 0.7016698122024536, + "learning_rate": 8.620074881598662e-05, + "loss": 2.6261, + "step": 46065 + }, + { + "epoch": 2.26, + "grad_norm": 0.7218732833862305, + "learning_rate": 8.618994858733906e-05, + "loss": 3.0268, + "step": 46066 + }, + { + "epoch": 2.26, + "grad_norm": 0.7430708408355713, + "learning_rate": 8.617914892182592e-05, + "loss": 3.0253, + "step": 46067 + }, + { + "epoch": 2.26, + "grad_norm": 0.7524659633636475, + "learning_rate": 8.616834981947577e-05, + "loss": 2.8653, + "step": 46068 + }, + { + "epoch": 2.26, + "grad_norm": 0.6865372061729431, + "learning_rate": 8.615755128031714e-05, + "loss": 3.1231, + "step": 46069 + }, + { + "epoch": 2.26, + "grad_norm": 0.7529311180114746, + "learning_rate": 8.614675330437835e-05, + "loss": 2.8919, + "step": 46070 + }, + { + "epoch": 2.26, + "grad_norm": 0.7249378561973572, + "learning_rate": 8.613595589168787e-05, + "loss": 2.9534, + "step": 46071 + }, + { + "epoch": 2.26, + "grad_norm": 0.7141491770744324, + "learning_rate": 8.612515904227401e-05, + "loss": 2.8534, + "step": 46072 + }, + { + "epoch": 2.26, + "grad_norm": 0.6884422302246094, + "learning_rate": 8.611436275616534e-05, + "loss": 2.8794, + "step": 46073 + }, + { + "epoch": 2.26, + "grad_norm": 0.7267909646034241, + "learning_rate": 8.610356703339033e-05, + "loss": 2.7915, + "step": 46074 + }, + { + "epoch": 2.26, + "grad_norm": 0.7484807968139648, + "learning_rate": 8.609277187397728e-05, + "loss": 2.9957, + "step": 46075 + }, + { + "epoch": 2.26, + "grad_norm": 0.7652937769889832, + "learning_rate": 8.60819772779548e-05, + "loss": 2.8617, + "step": 46076 + }, + { + "epoch": 2.26, + "grad_norm": 0.6917968392372131, + "learning_rate": 8.607118324535114e-05, + "loss": 2.7486, + "step": 46077 + }, + { + "epoch": 2.26, + "grad_norm": 0.7388394474983215, + "learning_rate": 8.606038977619492e-05, + "loss": 2.768, + "step": 46078 + }, + { + "epoch": 2.26, + "grad_norm": 0.7591552734375, + "learning_rate": 8.604959687051444e-05, + "loss": 2.7493, + "step": 46079 + }, + { + "epoch": 2.26, + "grad_norm": 0.7432824969291687, + "learning_rate": 8.603880452833805e-05, + "loss": 2.7448, + "step": 46080 + }, + { + "epoch": 2.26, + "grad_norm": 0.6959530115127563, + "learning_rate": 8.602801274969436e-05, + "loss": 3.0574, + "step": 46081 + }, + { + "epoch": 2.26, + "grad_norm": 0.7385079264640808, + "learning_rate": 8.601722153461162e-05, + "loss": 2.8571, + "step": 46082 + }, + { + "epoch": 2.26, + "grad_norm": 0.6802938580513, + "learning_rate": 8.600643088311833e-05, + "loss": 2.9388, + "step": 46083 + }, + { + "epoch": 2.26, + "grad_norm": 0.7278597950935364, + "learning_rate": 8.599564079524302e-05, + "loss": 2.9041, + "step": 46084 + }, + { + "epoch": 2.26, + "grad_norm": 0.7153709530830383, + "learning_rate": 8.598485127101399e-05, + "loss": 2.9392, + "step": 46085 + }, + { + "epoch": 2.26, + "grad_norm": 0.7356995344161987, + "learning_rate": 8.59740623104597e-05, + "loss": 3.0901, + "step": 46086 + }, + { + "epoch": 2.26, + "grad_norm": 0.7170118689537048, + "learning_rate": 8.596327391360843e-05, + "loss": 3.0913, + "step": 46087 + }, + { + "epoch": 2.26, + "grad_norm": 0.7780619859695435, + "learning_rate": 8.595248608048869e-05, + "loss": 2.8865, + "step": 46088 + }, + { + "epoch": 2.26, + "grad_norm": 0.7607010006904602, + "learning_rate": 8.594169881112902e-05, + "loss": 2.9598, + "step": 46089 + }, + { + "epoch": 2.26, + "grad_norm": 0.7569159269332886, + "learning_rate": 8.593091210555764e-05, + "loss": 2.8838, + "step": 46090 + }, + { + "epoch": 2.26, + "grad_norm": 0.738321840763092, + "learning_rate": 8.592012596380311e-05, + "loss": 2.94, + "step": 46091 + }, + { + "epoch": 2.26, + "grad_norm": 0.722771406173706, + "learning_rate": 8.59093403858938e-05, + "loss": 3.0879, + "step": 46092 + }, + { + "epoch": 2.26, + "grad_norm": 0.7251681089401245, + "learning_rate": 8.589855537185794e-05, + "loss": 3.1147, + "step": 46093 + }, + { + "epoch": 2.26, + "grad_norm": 0.7316486239433289, + "learning_rate": 8.58877709217242e-05, + "loss": 2.9141, + "step": 46094 + }, + { + "epoch": 2.26, + "grad_norm": 0.6975620985031128, + "learning_rate": 8.587698703552078e-05, + "loss": 2.7911, + "step": 46095 + }, + { + "epoch": 2.26, + "grad_norm": 0.69795161485672, + "learning_rate": 8.586620371327625e-05, + "loss": 2.9119, + "step": 46096 + }, + { + "epoch": 2.26, + "grad_norm": 0.7358713746070862, + "learning_rate": 8.585542095501883e-05, + "loss": 2.9688, + "step": 46097 + }, + { + "epoch": 2.26, + "grad_norm": 0.692922830581665, + "learning_rate": 8.584463876077703e-05, + "loss": 3.0083, + "step": 46098 + }, + { + "epoch": 2.26, + "grad_norm": 0.7178481817245483, + "learning_rate": 8.58338571305793e-05, + "loss": 2.887, + "step": 46099 + }, + { + "epoch": 2.26, + "grad_norm": 0.6919596195220947, + "learning_rate": 8.582307606445402e-05, + "loss": 2.746, + "step": 46100 + }, + { + "epoch": 2.26, + "grad_norm": 0.7563260197639465, + "learning_rate": 8.581229556242951e-05, + "loss": 2.7533, + "step": 46101 + }, + { + "epoch": 2.26, + "grad_norm": 0.7244362831115723, + "learning_rate": 8.58015156245341e-05, + "loss": 3.0494, + "step": 46102 + }, + { + "epoch": 2.26, + "grad_norm": 0.7256781458854675, + "learning_rate": 8.579073625079627e-05, + "loss": 2.8626, + "step": 46103 + }, + { + "epoch": 2.26, + "grad_norm": 0.7332637310028076, + "learning_rate": 8.577995744124451e-05, + "loss": 2.8991, + "step": 46104 + }, + { + "epoch": 2.26, + "grad_norm": 0.7425779700279236, + "learning_rate": 8.576917919590704e-05, + "loss": 2.7907, + "step": 46105 + }, + { + "epoch": 2.26, + "grad_norm": 0.7489483952522278, + "learning_rate": 8.57584015148124e-05, + "loss": 2.9985, + "step": 46106 + }, + { + "epoch": 2.26, + "grad_norm": 0.7290831208229065, + "learning_rate": 8.57476243979889e-05, + "loss": 2.969, + "step": 46107 + }, + { + "epoch": 2.26, + "grad_norm": 0.752694308757782, + "learning_rate": 8.573684784546485e-05, + "loss": 3.069, + "step": 46108 + }, + { + "epoch": 2.26, + "grad_norm": 0.7758665084838867, + "learning_rate": 8.572607185726879e-05, + "loss": 2.8742, + "step": 46109 + }, + { + "epoch": 2.26, + "grad_norm": 0.7011808753013611, + "learning_rate": 8.571529643342891e-05, + "loss": 3.0316, + "step": 46110 + }, + { + "epoch": 2.26, + "grad_norm": 0.7500031590461731, + "learning_rate": 8.570452157397378e-05, + "loss": 3.0522, + "step": 46111 + }, + { + "epoch": 2.26, + "grad_norm": 0.7314960956573486, + "learning_rate": 8.569374727893164e-05, + "loss": 3.0971, + "step": 46112 + }, + { + "epoch": 2.26, + "grad_norm": 0.6992918252944946, + "learning_rate": 8.568297354833102e-05, + "loss": 2.9469, + "step": 46113 + }, + { + "epoch": 2.26, + "grad_norm": 0.7275790572166443, + "learning_rate": 8.567220038220016e-05, + "loss": 3.1835, + "step": 46114 + }, + { + "epoch": 2.26, + "grad_norm": 0.7153680920600891, + "learning_rate": 8.566142778056743e-05, + "loss": 2.79, + "step": 46115 + }, + { + "epoch": 2.26, + "grad_norm": 0.7057393193244934, + "learning_rate": 8.565065574346132e-05, + "loss": 3.0257, + "step": 46116 + }, + { + "epoch": 2.26, + "grad_norm": 0.7389050126075745, + "learning_rate": 8.563988427091005e-05, + "loss": 2.9956, + "step": 46117 + }, + { + "epoch": 2.26, + "grad_norm": 0.69362473487854, + "learning_rate": 8.562911336294204e-05, + "loss": 2.6957, + "step": 46118 + }, + { + "epoch": 2.26, + "grad_norm": 0.7017712593078613, + "learning_rate": 8.56183430195858e-05, + "loss": 2.9043, + "step": 46119 + }, + { + "epoch": 2.26, + "grad_norm": 0.7296519875526428, + "learning_rate": 8.560757324086951e-05, + "loss": 3.0865, + "step": 46120 + }, + { + "epoch": 2.26, + "grad_norm": 0.7155781388282776, + "learning_rate": 8.559680402682168e-05, + "loss": 2.9978, + "step": 46121 + }, + { + "epoch": 2.26, + "grad_norm": 0.7151504158973694, + "learning_rate": 8.558603537747062e-05, + "loss": 2.8498, + "step": 46122 + }, + { + "epoch": 2.26, + "grad_norm": 0.7091836929321289, + "learning_rate": 8.557526729284459e-05, + "loss": 3.0795, + "step": 46123 + }, + { + "epoch": 2.26, + "grad_norm": 0.701084315776825, + "learning_rate": 8.55644997729721e-05, + "loss": 2.8647, + "step": 46124 + }, + { + "epoch": 2.26, + "grad_norm": 0.7470046281814575, + "learning_rate": 8.55537328178814e-05, + "loss": 2.8312, + "step": 46125 + }, + { + "epoch": 2.26, + "grad_norm": 0.7091386318206787, + "learning_rate": 8.554296642760095e-05, + "loss": 2.9283, + "step": 46126 + }, + { + "epoch": 2.26, + "grad_norm": 0.7808635234832764, + "learning_rate": 8.5532200602159e-05, + "loss": 2.9434, + "step": 46127 + }, + { + "epoch": 2.26, + "grad_norm": 0.6947294473648071, + "learning_rate": 8.552143534158403e-05, + "loss": 2.8923, + "step": 46128 + }, + { + "epoch": 2.26, + "grad_norm": 0.6899562478065491, + "learning_rate": 8.551067064590432e-05, + "loss": 2.7862, + "step": 46129 + }, + { + "epoch": 2.26, + "grad_norm": 0.6616258025169373, + "learning_rate": 8.54999065151481e-05, + "loss": 2.9572, + "step": 46130 + }, + { + "epoch": 2.26, + "grad_norm": 0.7294912338256836, + "learning_rate": 8.548914294934399e-05, + "loss": 3.0817, + "step": 46131 + }, + { + "epoch": 2.26, + "grad_norm": 0.7399985194206238, + "learning_rate": 8.547837994852007e-05, + "loss": 2.8683, + "step": 46132 + }, + { + "epoch": 2.26, + "grad_norm": 0.6928227543830872, + "learning_rate": 8.546761751270492e-05, + "loss": 2.9985, + "step": 46133 + }, + { + "epoch": 2.26, + "grad_norm": 0.7888341546058655, + "learning_rate": 8.545685564192669e-05, + "loss": 2.8663, + "step": 46134 + }, + { + "epoch": 2.26, + "grad_norm": 0.7424514889717102, + "learning_rate": 8.544609433621388e-05, + "loss": 2.8363, + "step": 46135 + }, + { + "epoch": 2.26, + "grad_norm": 0.7457790970802307, + "learning_rate": 8.543533359559479e-05, + "loss": 2.7649, + "step": 46136 + }, + { + "epoch": 2.26, + "grad_norm": 0.7245312333106995, + "learning_rate": 8.542457342009765e-05, + "loss": 2.641, + "step": 46137 + }, + { + "epoch": 2.26, + "grad_norm": 0.761008620262146, + "learning_rate": 8.541381380975096e-05, + "loss": 2.9057, + "step": 46138 + }, + { + "epoch": 2.26, + "grad_norm": 0.7968191504478455, + "learning_rate": 8.54030547645829e-05, + "loss": 2.6337, + "step": 46139 + }, + { + "epoch": 2.26, + "grad_norm": 0.7402268648147583, + "learning_rate": 8.53922962846219e-05, + "loss": 3.0397, + "step": 46140 + }, + { + "epoch": 2.26, + "grad_norm": 0.8710418939590454, + "learning_rate": 8.538153836989635e-05, + "loss": 2.8891, + "step": 46141 + }, + { + "epoch": 2.26, + "grad_norm": 0.734683096408844, + "learning_rate": 8.537078102043447e-05, + "loss": 2.929, + "step": 46142 + }, + { + "epoch": 2.26, + "grad_norm": 0.7593229413032532, + "learning_rate": 8.536002423626475e-05, + "loss": 2.8161, + "step": 46143 + }, + { + "epoch": 2.26, + "grad_norm": 0.6786648035049438, + "learning_rate": 8.534926801741538e-05, + "loss": 2.9832, + "step": 46144 + }, + { + "epoch": 2.26, + "grad_norm": 0.7301282286643982, + "learning_rate": 8.533851236391463e-05, + "loss": 2.9853, + "step": 46145 + }, + { + "epoch": 2.26, + "grad_norm": 0.725745439529419, + "learning_rate": 8.532775727579104e-05, + "loss": 3.1637, + "step": 46146 + }, + { + "epoch": 2.26, + "grad_norm": 0.7195970416069031, + "learning_rate": 8.531700275307274e-05, + "loss": 2.8625, + "step": 46147 + }, + { + "epoch": 2.26, + "grad_norm": 0.7576825618743896, + "learning_rate": 8.530624879578824e-05, + "loss": 2.8941, + "step": 46148 + }, + { + "epoch": 2.26, + "grad_norm": 0.7363550066947937, + "learning_rate": 8.529549540396564e-05, + "loss": 3.004, + "step": 46149 + }, + { + "epoch": 2.26, + "grad_norm": 0.6844567656517029, + "learning_rate": 8.528474257763346e-05, + "loss": 2.9282, + "step": 46150 + }, + { + "epoch": 2.26, + "grad_norm": 0.6960411667823792, + "learning_rate": 8.527399031682e-05, + "loss": 3.001, + "step": 46151 + }, + { + "epoch": 2.26, + "grad_norm": 0.7222846150398254, + "learning_rate": 8.52632386215534e-05, + "loss": 2.987, + "step": 46152 + }, + { + "epoch": 2.26, + "grad_norm": 0.7160438895225525, + "learning_rate": 8.525248749186219e-05, + "loss": 2.9166, + "step": 46153 + }, + { + "epoch": 2.26, + "grad_norm": 0.7318093776702881, + "learning_rate": 8.52417369277745e-05, + "loss": 2.9424, + "step": 46154 + }, + { + "epoch": 2.26, + "grad_norm": 0.7212334275245667, + "learning_rate": 8.523098692931877e-05, + "loss": 2.9443, + "step": 46155 + }, + { + "epoch": 2.26, + "grad_norm": 0.7121982574462891, + "learning_rate": 8.522023749652338e-05, + "loss": 2.9703, + "step": 46156 + }, + { + "epoch": 2.26, + "grad_norm": 0.722030520439148, + "learning_rate": 8.520948862941646e-05, + "loss": 3.0426, + "step": 46157 + }, + { + "epoch": 2.26, + "grad_norm": 0.7229368090629578, + "learning_rate": 8.519874032802658e-05, + "loss": 3.1051, + "step": 46158 + }, + { + "epoch": 2.26, + "grad_norm": 0.7376280426979065, + "learning_rate": 8.518799259238167e-05, + "loss": 2.8847, + "step": 46159 + }, + { + "epoch": 2.26, + "grad_norm": 0.6587594151496887, + "learning_rate": 8.517724542251026e-05, + "loss": 2.7757, + "step": 46160 + }, + { + "epoch": 2.26, + "grad_norm": 0.7172147631645203, + "learning_rate": 8.516649881844077e-05, + "loss": 2.6424, + "step": 46161 + }, + { + "epoch": 2.26, + "grad_norm": 0.7306764125823975, + "learning_rate": 8.515575278020125e-05, + "loss": 2.9843, + "step": 46162 + }, + { + "epoch": 2.26, + "grad_norm": 0.7316793203353882, + "learning_rate": 8.514500730782023e-05, + "loss": 2.9263, + "step": 46163 + }, + { + "epoch": 2.26, + "grad_norm": 0.8195810317993164, + "learning_rate": 8.513426240132584e-05, + "loss": 2.9318, + "step": 46164 + }, + { + "epoch": 2.26, + "grad_norm": 0.7335458397865295, + "learning_rate": 8.51235180607465e-05, + "loss": 2.8091, + "step": 46165 + }, + { + "epoch": 2.26, + "grad_norm": 0.6977335810661316, + "learning_rate": 8.511277428611048e-05, + "loss": 3.0965, + "step": 46166 + }, + { + "epoch": 2.26, + "grad_norm": 0.6865370273590088, + "learning_rate": 8.510203107744598e-05, + "loss": 2.9652, + "step": 46167 + }, + { + "epoch": 2.26, + "grad_norm": 0.737516462802887, + "learning_rate": 8.509128843478145e-05, + "loss": 2.9361, + "step": 46168 + }, + { + "epoch": 2.26, + "grad_norm": 0.7313123345375061, + "learning_rate": 8.508054635814501e-05, + "loss": 2.9974, + "step": 46169 + }, + { + "epoch": 2.26, + "grad_norm": 0.7545174360275269, + "learning_rate": 8.506980484756507e-05, + "loss": 3.1408, + "step": 46170 + }, + { + "epoch": 2.26, + "grad_norm": 0.7121573090553284, + "learning_rate": 8.505906390306994e-05, + "loss": 2.9658, + "step": 46171 + }, + { + "epoch": 2.26, + "grad_norm": 0.7106088399887085, + "learning_rate": 8.504832352468795e-05, + "loss": 2.9336, + "step": 46172 + }, + { + "epoch": 2.26, + "grad_norm": 0.741375744342804, + "learning_rate": 8.503758371244724e-05, + "loss": 3.0173, + "step": 46173 + }, + { + "epoch": 2.26, + "grad_norm": 0.7445472478866577, + "learning_rate": 8.502684446637608e-05, + "loss": 2.7716, + "step": 46174 + }, + { + "epoch": 2.26, + "grad_norm": 0.7355408668518066, + "learning_rate": 8.501610578650289e-05, + "loss": 2.6886, + "step": 46175 + }, + { + "epoch": 2.26, + "grad_norm": 0.7312442660331726, + "learning_rate": 8.500536767285595e-05, + "loss": 2.8622, + "step": 46176 + }, + { + "epoch": 2.26, + "grad_norm": 0.7817408442497253, + "learning_rate": 8.49946301254634e-05, + "loss": 2.7271, + "step": 46177 + }, + { + "epoch": 2.26, + "grad_norm": 0.7041735649108887, + "learning_rate": 8.498389314435372e-05, + "loss": 2.8909, + "step": 46178 + }, + { + "epoch": 2.26, + "grad_norm": 0.7182589173316956, + "learning_rate": 8.497315672955498e-05, + "loss": 2.8677, + "step": 46179 + }, + { + "epoch": 2.26, + "grad_norm": 0.6795207262039185, + "learning_rate": 8.496242088109569e-05, + "loss": 2.7094, + "step": 46180 + }, + { + "epoch": 2.26, + "grad_norm": 0.6875749826431274, + "learning_rate": 8.495168559900397e-05, + "loss": 2.8307, + "step": 46181 + }, + { + "epoch": 2.26, + "grad_norm": 0.7328691482543945, + "learning_rate": 8.494095088330803e-05, + "loss": 2.9532, + "step": 46182 + }, + { + "epoch": 2.26, + "grad_norm": 0.7125970721244812, + "learning_rate": 8.493021673403635e-05, + "loss": 2.7698, + "step": 46183 + }, + { + "epoch": 2.26, + "grad_norm": 0.7266592383384705, + "learning_rate": 8.491948315121696e-05, + "loss": 2.8341, + "step": 46184 + }, + { + "epoch": 2.26, + "grad_norm": 0.7378901243209839, + "learning_rate": 8.490875013487831e-05, + "loss": 2.9183, + "step": 46185 + }, + { + "epoch": 2.26, + "grad_norm": 0.7208519577980042, + "learning_rate": 8.489801768504868e-05, + "loss": 3.2133, + "step": 46186 + }, + { + "epoch": 2.26, + "grad_norm": 0.7134647369384766, + "learning_rate": 8.488728580175625e-05, + "loss": 2.9837, + "step": 46187 + }, + { + "epoch": 2.26, + "grad_norm": 0.7949679493904114, + "learning_rate": 8.487655448502934e-05, + "loss": 2.9484, + "step": 46188 + }, + { + "epoch": 2.26, + "grad_norm": 0.7748767137527466, + "learning_rate": 8.48658237348961e-05, + "loss": 2.7965, + "step": 46189 + }, + { + "epoch": 2.26, + "grad_norm": 0.7083140015602112, + "learning_rate": 8.485509355138485e-05, + "loss": 2.912, + "step": 46190 + }, + { + "epoch": 2.26, + "grad_norm": 0.6907854676246643, + "learning_rate": 8.484436393452399e-05, + "loss": 2.6598, + "step": 46191 + }, + { + "epoch": 2.26, + "grad_norm": 0.7154836058616638, + "learning_rate": 8.483363488434156e-05, + "loss": 2.9983, + "step": 46192 + }, + { + "epoch": 2.26, + "grad_norm": 0.6884499788284302, + "learning_rate": 8.482290640086602e-05, + "loss": 2.947, + "step": 46193 + }, + { + "epoch": 2.26, + "grad_norm": 0.723860502243042, + "learning_rate": 8.481217848412554e-05, + "loss": 2.8889, + "step": 46194 + }, + { + "epoch": 2.26, + "grad_norm": 0.7943236827850342, + "learning_rate": 8.480145113414828e-05, + "loss": 2.9106, + "step": 46195 + }, + { + "epoch": 2.26, + "grad_norm": 0.7377872467041016, + "learning_rate": 8.479072435096265e-05, + "loss": 2.8235, + "step": 46196 + }, + { + "epoch": 2.26, + "grad_norm": 0.6710457801818848, + "learning_rate": 8.477999813459678e-05, + "loss": 2.9226, + "step": 46197 + }, + { + "epoch": 2.26, + "grad_norm": 0.6965027451515198, + "learning_rate": 8.476927248507904e-05, + "loss": 3.1496, + "step": 46198 + }, + { + "epoch": 2.26, + "grad_norm": 0.6677283048629761, + "learning_rate": 8.475854740243755e-05, + "loss": 2.8667, + "step": 46199 + }, + { + "epoch": 2.26, + "grad_norm": 0.7114673256874084, + "learning_rate": 8.474782288670057e-05, + "loss": 3.0478, + "step": 46200 + }, + { + "epoch": 2.26, + "grad_norm": 0.734982967376709, + "learning_rate": 8.473709893789654e-05, + "loss": 3.0154, + "step": 46201 + }, + { + "epoch": 2.26, + "grad_norm": 0.7224012613296509, + "learning_rate": 8.472637555605356e-05, + "loss": 2.8927, + "step": 46202 + }, + { + "epoch": 2.26, + "grad_norm": 0.7512854337692261, + "learning_rate": 8.471565274119984e-05, + "loss": 2.7395, + "step": 46203 + }, + { + "epoch": 2.26, + "grad_norm": 0.7039549946784973, + "learning_rate": 8.470493049336361e-05, + "loss": 2.8335, + "step": 46204 + }, + { + "epoch": 2.26, + "grad_norm": 0.7275938987731934, + "learning_rate": 8.469420881257322e-05, + "loss": 2.8, + "step": 46205 + }, + { + "epoch": 2.26, + "grad_norm": 0.6578214168548584, + "learning_rate": 8.468348769885678e-05, + "loss": 2.8258, + "step": 46206 + }, + { + "epoch": 2.26, + "grad_norm": 0.6866016387939453, + "learning_rate": 8.467276715224257e-05, + "loss": 2.9636, + "step": 46207 + }, + { + "epoch": 2.26, + "grad_norm": 0.7433063983917236, + "learning_rate": 8.466204717275894e-05, + "loss": 3.011, + "step": 46208 + }, + { + "epoch": 2.26, + "grad_norm": 0.6856465935707092, + "learning_rate": 8.465132776043407e-05, + "loss": 2.7571, + "step": 46209 + }, + { + "epoch": 2.26, + "grad_norm": 0.6921495795249939, + "learning_rate": 8.464060891529612e-05, + "loss": 2.729, + "step": 46210 + }, + { + "epoch": 2.26, + "grad_norm": 0.7133631706237793, + "learning_rate": 8.46298906373733e-05, + "loss": 2.9614, + "step": 46211 + }, + { + "epoch": 2.26, + "grad_norm": 0.6963069438934326, + "learning_rate": 8.461917292669389e-05, + "loss": 2.9202, + "step": 46212 + }, + { + "epoch": 2.26, + "grad_norm": 0.7098976373672485, + "learning_rate": 8.46084557832862e-05, + "loss": 2.7825, + "step": 46213 + }, + { + "epoch": 2.26, + "grad_norm": 0.6936521530151367, + "learning_rate": 8.459773920717831e-05, + "loss": 2.7475, + "step": 46214 + }, + { + "epoch": 2.26, + "grad_norm": 0.6678090691566467, + "learning_rate": 8.458702319839861e-05, + "loss": 2.8444, + "step": 46215 + }, + { + "epoch": 2.26, + "grad_norm": 0.8158767223358154, + "learning_rate": 8.457630775697522e-05, + "loss": 2.8724, + "step": 46216 + }, + { + "epoch": 2.26, + "grad_norm": 0.7068122625350952, + "learning_rate": 8.456559288293632e-05, + "loss": 2.9663, + "step": 46217 + }, + { + "epoch": 2.27, + "grad_norm": 0.7318935990333557, + "learning_rate": 8.455487857631023e-05, + "loss": 2.9523, + "step": 46218 + }, + { + "epoch": 2.27, + "grad_norm": 0.7318170666694641, + "learning_rate": 8.454416483712505e-05, + "loss": 2.8784, + "step": 46219 + }, + { + "epoch": 2.27, + "grad_norm": 0.718914270401001, + "learning_rate": 8.45334516654092e-05, + "loss": 2.9895, + "step": 46220 + }, + { + "epoch": 2.27, + "grad_norm": 0.7228567600250244, + "learning_rate": 8.452273906119064e-05, + "loss": 2.9328, + "step": 46221 + }, + { + "epoch": 2.27, + "grad_norm": 0.7042717933654785, + "learning_rate": 8.451202702449773e-05, + "loss": 3.196, + "step": 46222 + }, + { + "epoch": 2.27, + "grad_norm": 0.7152907848358154, + "learning_rate": 8.450131555535876e-05, + "loss": 2.9061, + "step": 46223 + }, + { + "epoch": 2.27, + "grad_norm": 0.7358282208442688, + "learning_rate": 8.449060465380188e-05, + "loss": 2.9795, + "step": 46224 + }, + { + "epoch": 2.27, + "grad_norm": 0.6989845037460327, + "learning_rate": 8.447989431985521e-05, + "loss": 2.791, + "step": 46225 + }, + { + "epoch": 2.27, + "grad_norm": 0.7169927954673767, + "learning_rate": 8.446918455354699e-05, + "loss": 2.9684, + "step": 46226 + }, + { + "epoch": 2.27, + "grad_norm": 0.7589333653450012, + "learning_rate": 8.44584753549054e-05, + "loss": 2.7864, + "step": 46227 + }, + { + "epoch": 2.27, + "grad_norm": 0.7092520594596863, + "learning_rate": 8.444776672395886e-05, + "loss": 3.1328, + "step": 46228 + }, + { + "epoch": 2.27, + "grad_norm": 0.7001926898956299, + "learning_rate": 8.443705866073526e-05, + "loss": 2.773, + "step": 46229 + }, + { + "epoch": 2.27, + "grad_norm": 0.713153600692749, + "learning_rate": 8.44263511652631e-05, + "loss": 2.9185, + "step": 46230 + }, + { + "epoch": 2.27, + "grad_norm": 0.7044378519058228, + "learning_rate": 8.441564423757045e-05, + "loss": 2.8194, + "step": 46231 + }, + { + "epoch": 2.27, + "grad_norm": 0.7447625398635864, + "learning_rate": 8.44049378776854e-05, + "loss": 2.9084, + "step": 46232 + }, + { + "epoch": 2.27, + "grad_norm": 0.7260729074478149, + "learning_rate": 8.439423208563633e-05, + "loss": 2.9366, + "step": 46233 + }, + { + "epoch": 2.27, + "grad_norm": 0.6695955991744995, + "learning_rate": 8.438352686145124e-05, + "loss": 2.9983, + "step": 46234 + }, + { + "epoch": 2.27, + "grad_norm": 0.7658150792121887, + "learning_rate": 8.43728222051586e-05, + "loss": 2.9357, + "step": 46235 + }, + { + "epoch": 2.27, + "grad_norm": 0.6993694305419922, + "learning_rate": 8.436211811678632e-05, + "loss": 2.9622, + "step": 46236 + }, + { + "epoch": 2.27, + "grad_norm": 0.7462490200996399, + "learning_rate": 8.435141459636281e-05, + "loss": 2.8533, + "step": 46237 + }, + { + "epoch": 2.27, + "grad_norm": 0.7425763607025146, + "learning_rate": 8.43407116439162e-05, + "loss": 2.9425, + "step": 46238 + }, + { + "epoch": 2.27, + "grad_norm": 0.7130029201507568, + "learning_rate": 8.433000925947452e-05, + "loss": 2.857, + "step": 46239 + }, + { + "epoch": 2.27, + "grad_norm": 0.6981950402259827, + "learning_rate": 8.43193074430662e-05, + "loss": 2.7975, + "step": 46240 + }, + { + "epoch": 2.27, + "grad_norm": 0.6925346255302429, + "learning_rate": 8.430860619471921e-05, + "loss": 2.9858, + "step": 46241 + }, + { + "epoch": 2.27, + "grad_norm": 0.6922650933265686, + "learning_rate": 8.429790551446186e-05, + "loss": 2.8014, + "step": 46242 + }, + { + "epoch": 2.27, + "grad_norm": 0.7049052119255066, + "learning_rate": 8.42872054023224e-05, + "loss": 3.0081, + "step": 46243 + }, + { + "epoch": 2.27, + "grad_norm": 0.7104154825210571, + "learning_rate": 8.427650585832884e-05, + "loss": 2.9192, + "step": 46244 + }, + { + "epoch": 2.27, + "grad_norm": 0.6965845227241516, + "learning_rate": 8.426580688250952e-05, + "loss": 3.0567, + "step": 46245 + }, + { + "epoch": 2.27, + "grad_norm": 0.6756129264831543, + "learning_rate": 8.425510847489257e-05, + "loss": 2.8244, + "step": 46246 + }, + { + "epoch": 2.27, + "grad_norm": 0.7112818956375122, + "learning_rate": 8.424441063550603e-05, + "loss": 2.77, + "step": 46247 + }, + { + "epoch": 2.27, + "grad_norm": 0.7068428993225098, + "learning_rate": 8.42337133643783e-05, + "loss": 2.8858, + "step": 46248 + }, + { + "epoch": 2.27, + "grad_norm": 0.6985403299331665, + "learning_rate": 8.422301666153734e-05, + "loss": 2.8649, + "step": 46249 + }, + { + "epoch": 2.27, + "grad_norm": 0.6854354739189148, + "learning_rate": 8.421232052701152e-05, + "loss": 2.9009, + "step": 46250 + }, + { + "epoch": 2.27, + "grad_norm": 0.6568443775177002, + "learning_rate": 8.42016249608288e-05, + "loss": 2.7745, + "step": 46251 + }, + { + "epoch": 2.27, + "grad_norm": 0.7257750034332275, + "learning_rate": 8.419092996301761e-05, + "loss": 2.9674, + "step": 46252 + }, + { + "epoch": 2.27, + "grad_norm": 0.7708278894424438, + "learning_rate": 8.418023553360596e-05, + "loss": 2.837, + "step": 46253 + }, + { + "epoch": 2.27, + "grad_norm": 0.7032803893089294, + "learning_rate": 8.416954167262193e-05, + "loss": 3.0264, + "step": 46254 + }, + { + "epoch": 2.27, + "grad_norm": 0.6957308650016785, + "learning_rate": 8.415884838009388e-05, + "loss": 2.9592, + "step": 46255 + }, + { + "epoch": 2.27, + "grad_norm": 0.7218675017356873, + "learning_rate": 8.414815565604983e-05, + "loss": 2.8956, + "step": 46256 + }, + { + "epoch": 2.27, + "grad_norm": 0.7233446836471558, + "learning_rate": 8.413746350051797e-05, + "loss": 2.9225, + "step": 46257 + }, + { + "epoch": 2.27, + "grad_norm": 0.7115861177444458, + "learning_rate": 8.412677191352656e-05, + "loss": 3.0288, + "step": 46258 + }, + { + "epoch": 2.27, + "grad_norm": 0.7438410520553589, + "learning_rate": 8.41160808951037e-05, + "loss": 2.8894, + "step": 46259 + }, + { + "epoch": 2.27, + "grad_norm": 0.700964629650116, + "learning_rate": 8.410539044527753e-05, + "loss": 3.0023, + "step": 46260 + }, + { + "epoch": 2.27, + "grad_norm": 0.7393155694007874, + "learning_rate": 8.409470056407614e-05, + "loss": 2.9852, + "step": 46261 + }, + { + "epoch": 2.27, + "grad_norm": 0.6867033243179321, + "learning_rate": 8.408401125152776e-05, + "loss": 2.9322, + "step": 46262 + }, + { + "epoch": 2.27, + "grad_norm": 0.6977315545082092, + "learning_rate": 8.407332250766063e-05, + "loss": 2.9155, + "step": 46263 + }, + { + "epoch": 2.27, + "grad_norm": 0.6820677518844604, + "learning_rate": 8.40626343325027e-05, + "loss": 2.9321, + "step": 46264 + }, + { + "epoch": 2.27, + "grad_norm": 0.7445458173751831, + "learning_rate": 8.405194672608233e-05, + "loss": 2.9113, + "step": 46265 + }, + { + "epoch": 2.27, + "grad_norm": 0.6989564895629883, + "learning_rate": 8.40412596884275e-05, + "loss": 3.0236, + "step": 46266 + }, + { + "epoch": 2.27, + "grad_norm": 0.7086830735206604, + "learning_rate": 8.403057321956648e-05, + "loss": 2.9824, + "step": 46267 + }, + { + "epoch": 2.27, + "grad_norm": 0.7059872150421143, + "learning_rate": 8.401988731952741e-05, + "loss": 3.0183, + "step": 46268 + }, + { + "epoch": 2.27, + "grad_norm": 0.7030812501907349, + "learning_rate": 8.400920198833829e-05, + "loss": 3.0954, + "step": 46269 + }, + { + "epoch": 2.27, + "grad_norm": 0.7751390337944031, + "learning_rate": 8.399851722602743e-05, + "loss": 2.9932, + "step": 46270 + }, + { + "epoch": 2.27, + "grad_norm": 0.7409400343894958, + "learning_rate": 8.398783303262283e-05, + "loss": 2.9746, + "step": 46271 + }, + { + "epoch": 2.27, + "grad_norm": 0.7673764228820801, + "learning_rate": 8.39771494081527e-05, + "loss": 2.8363, + "step": 46272 + }, + { + "epoch": 2.27, + "grad_norm": 0.7108684182167053, + "learning_rate": 8.396646635264528e-05, + "loss": 3.0562, + "step": 46273 + }, + { + "epoch": 2.27, + "grad_norm": 0.6982866525650024, + "learning_rate": 8.395578386612861e-05, + "loss": 2.8194, + "step": 46274 + }, + { + "epoch": 2.27, + "grad_norm": 0.7467199563980103, + "learning_rate": 8.394510194863085e-05, + "loss": 3.0077, + "step": 46275 + }, + { + "epoch": 2.27, + "grad_norm": 0.7095677256584167, + "learning_rate": 8.393442060017998e-05, + "loss": 2.9502, + "step": 46276 + }, + { + "epoch": 2.27, + "grad_norm": 0.7236535549163818, + "learning_rate": 8.392373982080428e-05, + "loss": 2.7709, + "step": 46277 + }, + { + "epoch": 2.27, + "grad_norm": 0.7553972005844116, + "learning_rate": 8.391305961053197e-05, + "loss": 2.9195, + "step": 46278 + }, + { + "epoch": 2.27, + "grad_norm": 0.7126523852348328, + "learning_rate": 8.390237996939098e-05, + "loss": 2.9921, + "step": 46279 + }, + { + "epoch": 2.27, + "grad_norm": 0.743773341178894, + "learning_rate": 8.389170089740962e-05, + "loss": 2.9643, + "step": 46280 + }, + { + "epoch": 2.27, + "grad_norm": 0.7017213106155396, + "learning_rate": 8.388102239461582e-05, + "loss": 3.0562, + "step": 46281 + }, + { + "epoch": 2.27, + "grad_norm": 0.7224516868591309, + "learning_rate": 8.387034446103794e-05, + "loss": 2.9265, + "step": 46282 + }, + { + "epoch": 2.27, + "grad_norm": 0.7359697818756104, + "learning_rate": 8.385966709670395e-05, + "loss": 3.0036, + "step": 46283 + }, + { + "epoch": 2.27, + "grad_norm": 0.6996152400970459, + "learning_rate": 8.384899030164189e-05, + "loss": 2.9987, + "step": 46284 + }, + { + "epoch": 2.27, + "grad_norm": 0.6811954975128174, + "learning_rate": 8.383831407588011e-05, + "loss": 3.0215, + "step": 46285 + }, + { + "epoch": 2.27, + "grad_norm": 0.746880829334259, + "learning_rate": 8.382763841944654e-05, + "loss": 3.0656, + "step": 46286 + }, + { + "epoch": 2.27, + "grad_norm": 0.7244267463684082, + "learning_rate": 8.381696333236944e-05, + "loss": 2.9721, + "step": 46287 + }, + { + "epoch": 2.27, + "grad_norm": 0.7403231263160706, + "learning_rate": 8.380628881467674e-05, + "loss": 3.0218, + "step": 46288 + }, + { + "epoch": 2.27, + "grad_norm": 0.7562906742095947, + "learning_rate": 8.379561486639679e-05, + "loss": 2.9035, + "step": 46289 + }, + { + "epoch": 2.27, + "grad_norm": 0.7179818749427795, + "learning_rate": 8.378494148755753e-05, + "loss": 2.8786, + "step": 46290 + }, + { + "epoch": 2.27, + "grad_norm": 0.7321277856826782, + "learning_rate": 8.377426867818708e-05, + "loss": 2.9989, + "step": 46291 + }, + { + "epoch": 2.27, + "grad_norm": 0.7414536476135254, + "learning_rate": 8.376359643831369e-05, + "loss": 2.8159, + "step": 46292 + }, + { + "epoch": 2.27, + "grad_norm": 0.7076948285102844, + "learning_rate": 8.375292476796524e-05, + "loss": 2.7403, + "step": 46293 + }, + { + "epoch": 2.27, + "grad_norm": 0.7004036903381348, + "learning_rate": 8.374225366717002e-05, + "loss": 2.9799, + "step": 46294 + }, + { + "epoch": 2.27, + "grad_norm": 0.728317379951477, + "learning_rate": 8.373158313595612e-05, + "loss": 2.6187, + "step": 46295 + }, + { + "epoch": 2.27, + "grad_norm": 0.7136837840080261, + "learning_rate": 8.372091317435164e-05, + "loss": 2.8687, + "step": 46296 + }, + { + "epoch": 2.27, + "grad_norm": 0.6955955028533936, + "learning_rate": 8.371024378238464e-05, + "loss": 2.9601, + "step": 46297 + }, + { + "epoch": 2.27, + "grad_norm": 0.6891588568687439, + "learning_rate": 8.369957496008314e-05, + "loss": 2.9733, + "step": 46298 + }, + { + "epoch": 2.27, + "grad_norm": 0.7407911419868469, + "learning_rate": 8.368890670747535e-05, + "loss": 2.8449, + "step": 46299 + }, + { + "epoch": 2.27, + "grad_norm": 0.7350627183914185, + "learning_rate": 8.367823902458941e-05, + "loss": 2.957, + "step": 46300 + }, + { + "epoch": 2.27, + "grad_norm": 0.7319565415382385, + "learning_rate": 8.366757191145328e-05, + "loss": 2.77, + "step": 46301 + }, + { + "epoch": 2.27, + "grad_norm": 0.7195742130279541, + "learning_rate": 8.365690536809525e-05, + "loss": 2.8073, + "step": 46302 + }, + { + "epoch": 2.27, + "grad_norm": 0.7700269818305969, + "learning_rate": 8.364623939454318e-05, + "loss": 2.8723, + "step": 46303 + }, + { + "epoch": 2.27, + "grad_norm": 0.7323468923568726, + "learning_rate": 8.363557399082535e-05, + "loss": 2.8773, + "step": 46304 + }, + { + "epoch": 2.27, + "grad_norm": 0.7363598346710205, + "learning_rate": 8.362490915696981e-05, + "loss": 2.843, + "step": 46305 + }, + { + "epoch": 2.27, + "grad_norm": 0.7405996322631836, + "learning_rate": 8.36142448930045e-05, + "loss": 3.0222, + "step": 46306 + }, + { + "epoch": 2.27, + "grad_norm": 0.7685582041740417, + "learning_rate": 8.360358119895773e-05, + "loss": 2.8072, + "step": 46307 + }, + { + "epoch": 2.27, + "grad_norm": 0.7079715728759766, + "learning_rate": 8.35929180748574e-05, + "loss": 2.8948, + "step": 46308 + }, + { + "epoch": 2.27, + "grad_norm": 0.7493276000022888, + "learning_rate": 8.358225552073166e-05, + "loss": 2.8799, + "step": 46309 + }, + { + "epoch": 2.27, + "grad_norm": 0.7022292613983154, + "learning_rate": 8.357159353660871e-05, + "loss": 3.0506, + "step": 46310 + }, + { + "epoch": 2.27, + "grad_norm": 0.7002536058425903, + "learning_rate": 8.356093212251652e-05, + "loss": 2.9324, + "step": 46311 + }, + { + "epoch": 2.27, + "grad_norm": 0.7278270721435547, + "learning_rate": 8.35502712784832e-05, + "loss": 2.9016, + "step": 46312 + }, + { + "epoch": 2.27, + "grad_norm": 0.6652935147285461, + "learning_rate": 8.353961100453668e-05, + "loss": 3.0654, + "step": 46313 + }, + { + "epoch": 2.27, + "grad_norm": 0.7217055559158325, + "learning_rate": 8.352895130070518e-05, + "loss": 2.75, + "step": 46314 + }, + { + "epoch": 2.27, + "grad_norm": 0.7330880165100098, + "learning_rate": 8.351829216701685e-05, + "loss": 2.9563, + "step": 46315 + }, + { + "epoch": 2.27, + "grad_norm": 0.6977697014808655, + "learning_rate": 8.350763360349961e-05, + "loss": 3.0273, + "step": 46316 + }, + { + "epoch": 2.27, + "grad_norm": 0.7105409502983093, + "learning_rate": 8.349697561018167e-05, + "loss": 2.9189, + "step": 46317 + }, + { + "epoch": 2.27, + "grad_norm": 0.7222777009010315, + "learning_rate": 8.348631818709103e-05, + "loss": 2.8138, + "step": 46318 + }, + { + "epoch": 2.27, + "grad_norm": 0.6798412203788757, + "learning_rate": 8.347566133425568e-05, + "loss": 2.9272, + "step": 46319 + }, + { + "epoch": 2.27, + "grad_norm": 0.6820101737976074, + "learning_rate": 8.346500505170385e-05, + "loss": 3.1816, + "step": 46320 + }, + { + "epoch": 2.27, + "grad_norm": 0.7206324934959412, + "learning_rate": 8.345434933946345e-05, + "loss": 2.8773, + "step": 46321 + }, + { + "epoch": 2.27, + "grad_norm": 0.7084947228431702, + "learning_rate": 8.344369419756268e-05, + "loss": 2.8789, + "step": 46322 + }, + { + "epoch": 2.27, + "grad_norm": 0.7819315195083618, + "learning_rate": 8.343303962602948e-05, + "loss": 2.7798, + "step": 46323 + }, + { + "epoch": 2.27, + "grad_norm": 0.7215809226036072, + "learning_rate": 8.342238562489192e-05, + "loss": 3.1057, + "step": 46324 + }, + { + "epoch": 2.27, + "grad_norm": 0.7359743118286133, + "learning_rate": 8.341173219417826e-05, + "loss": 2.7669, + "step": 46325 + }, + { + "epoch": 2.27, + "grad_norm": 0.7675909996032715, + "learning_rate": 8.340107933391641e-05, + "loss": 2.9143, + "step": 46326 + }, + { + "epoch": 2.27, + "grad_norm": 0.7441685199737549, + "learning_rate": 8.339042704413443e-05, + "loss": 2.8385, + "step": 46327 + }, + { + "epoch": 2.27, + "grad_norm": 0.6832141280174255, + "learning_rate": 8.337977532486024e-05, + "loss": 2.7445, + "step": 46328 + }, + { + "epoch": 2.27, + "grad_norm": 0.6876572966575623, + "learning_rate": 8.336912417612208e-05, + "loss": 3.1471, + "step": 46329 + }, + { + "epoch": 2.27, + "grad_norm": 0.6860120892524719, + "learning_rate": 8.335847359794805e-05, + "loss": 3.0647, + "step": 46330 + }, + { + "epoch": 2.27, + "grad_norm": 0.6835752725601196, + "learning_rate": 8.334782359036599e-05, + "loss": 3.1026, + "step": 46331 + }, + { + "epoch": 2.27, + "grad_norm": 0.7607629299163818, + "learning_rate": 8.333717415340419e-05, + "loss": 2.8308, + "step": 46332 + }, + { + "epoch": 2.27, + "grad_norm": 0.7346097826957703, + "learning_rate": 8.332652528709054e-05, + "loss": 2.8343, + "step": 46333 + }, + { + "epoch": 2.27, + "grad_norm": 0.7202624678611755, + "learning_rate": 8.331587699145303e-05, + "loss": 2.9422, + "step": 46334 + }, + { + "epoch": 2.27, + "grad_norm": 0.7167713642120361, + "learning_rate": 8.330522926651992e-05, + "loss": 2.8753, + "step": 46335 + }, + { + "epoch": 2.27, + "grad_norm": 0.6617588400840759, + "learning_rate": 8.329458211231902e-05, + "loss": 2.9843, + "step": 46336 + }, + { + "epoch": 2.27, + "grad_norm": 0.7064581513404846, + "learning_rate": 8.32839355288786e-05, + "loss": 2.9314, + "step": 46337 + }, + { + "epoch": 2.27, + "grad_norm": 0.670749306678772, + "learning_rate": 8.327328951622646e-05, + "loss": 2.8856, + "step": 46338 + }, + { + "epoch": 2.27, + "grad_norm": 0.6679714918136597, + "learning_rate": 8.326264407439088e-05, + "loss": 2.9836, + "step": 46339 + }, + { + "epoch": 2.27, + "grad_norm": 0.729651153087616, + "learning_rate": 8.325199920339978e-05, + "loss": 2.982, + "step": 46340 + }, + { + "epoch": 2.27, + "grad_norm": 0.706098735332489, + "learning_rate": 8.324135490328112e-05, + "loss": 2.766, + "step": 46341 + }, + { + "epoch": 2.27, + "grad_norm": 0.7295530438423157, + "learning_rate": 8.32307111740631e-05, + "loss": 3.0328, + "step": 46342 + }, + { + "epoch": 2.27, + "grad_norm": 0.6685080528259277, + "learning_rate": 8.322006801577356e-05, + "loss": 2.9178, + "step": 46343 + }, + { + "epoch": 2.27, + "grad_norm": 0.6723880171775818, + "learning_rate": 8.320942542844065e-05, + "loss": 2.8969, + "step": 46344 + }, + { + "epoch": 2.27, + "grad_norm": 0.741836428642273, + "learning_rate": 8.319878341209248e-05, + "loss": 3.0116, + "step": 46345 + }, + { + "epoch": 2.27, + "grad_norm": 0.7528204321861267, + "learning_rate": 8.318814196675689e-05, + "loss": 2.9499, + "step": 46346 + }, + { + "epoch": 2.27, + "grad_norm": 0.7128376364707947, + "learning_rate": 8.317750109246213e-05, + "loss": 2.8801, + "step": 46347 + }, + { + "epoch": 2.27, + "grad_norm": 0.7809441685676575, + "learning_rate": 8.316686078923608e-05, + "loss": 2.8204, + "step": 46348 + }, + { + "epoch": 2.27, + "grad_norm": 0.7417147755622864, + "learning_rate": 8.315622105710672e-05, + "loss": 2.7649, + "step": 46349 + }, + { + "epoch": 2.27, + "grad_norm": 0.6948042511940002, + "learning_rate": 8.31455818961022e-05, + "loss": 2.8864, + "step": 46350 + }, + { + "epoch": 2.27, + "grad_norm": 0.757095456123352, + "learning_rate": 8.313494330625041e-05, + "loss": 2.9817, + "step": 46351 + }, + { + "epoch": 2.27, + "grad_norm": 0.6737290024757385, + "learning_rate": 8.312430528757954e-05, + "loss": 2.9095, + "step": 46352 + }, + { + "epoch": 2.27, + "grad_norm": 0.7008237838745117, + "learning_rate": 8.31136678401174e-05, + "loss": 2.9619, + "step": 46353 + }, + { + "epoch": 2.27, + "grad_norm": 0.7046443223953247, + "learning_rate": 8.310303096389224e-05, + "loss": 2.9833, + "step": 46354 + }, + { + "epoch": 2.27, + "grad_norm": 0.7089089751243591, + "learning_rate": 8.309239465893191e-05, + "loss": 2.67, + "step": 46355 + }, + { + "epoch": 2.27, + "grad_norm": 0.7592144012451172, + "learning_rate": 8.30817589252644e-05, + "loss": 2.9825, + "step": 46356 + }, + { + "epoch": 2.27, + "grad_norm": 0.6998030543327332, + "learning_rate": 8.307112376291789e-05, + "loss": 2.8194, + "step": 46357 + }, + { + "epoch": 2.27, + "grad_norm": 0.714889407157898, + "learning_rate": 8.306048917192021e-05, + "loss": 2.9081, + "step": 46358 + }, + { + "epoch": 2.27, + "grad_norm": 0.7006431818008423, + "learning_rate": 8.304985515229943e-05, + "loss": 3.1693, + "step": 46359 + }, + { + "epoch": 2.27, + "grad_norm": 0.7032368183135986, + "learning_rate": 8.303922170408366e-05, + "loss": 2.8906, + "step": 46360 + }, + { + "epoch": 2.27, + "grad_norm": 0.7039076089859009, + "learning_rate": 8.302858882730085e-05, + "loss": 3.0944, + "step": 46361 + }, + { + "epoch": 2.27, + "grad_norm": 0.7378221750259399, + "learning_rate": 8.301795652197898e-05, + "loss": 2.7746, + "step": 46362 + }, + { + "epoch": 2.27, + "grad_norm": 0.738791823387146, + "learning_rate": 8.300732478814594e-05, + "loss": 2.9973, + "step": 46363 + }, + { + "epoch": 2.27, + "grad_norm": 0.7050439715385437, + "learning_rate": 8.299669362582995e-05, + "loss": 2.8897, + "step": 46364 + }, + { + "epoch": 2.27, + "grad_norm": 0.7334348559379578, + "learning_rate": 8.298606303505883e-05, + "loss": 2.8901, + "step": 46365 + }, + { + "epoch": 2.27, + "grad_norm": 0.6942351460456848, + "learning_rate": 8.297543301586062e-05, + "loss": 3.1832, + "step": 46366 + }, + { + "epoch": 2.27, + "grad_norm": 0.7496381998062134, + "learning_rate": 8.296480356826347e-05, + "loss": 2.9803, + "step": 46367 + }, + { + "epoch": 2.27, + "grad_norm": 0.7410102486610413, + "learning_rate": 8.295417469229514e-05, + "loss": 2.7448, + "step": 46368 + }, + { + "epoch": 2.27, + "grad_norm": 0.6778762936592102, + "learning_rate": 8.294354638798386e-05, + "loss": 2.9249, + "step": 46369 + }, + { + "epoch": 2.27, + "grad_norm": 0.7150377035140991, + "learning_rate": 8.293291865535748e-05, + "loss": 2.9564, + "step": 46370 + }, + { + "epoch": 2.27, + "grad_norm": 0.7311868071556091, + "learning_rate": 8.292229149444394e-05, + "loss": 2.8055, + "step": 46371 + }, + { + "epoch": 2.27, + "grad_norm": 0.7008099555969238, + "learning_rate": 8.291166490527143e-05, + "loss": 2.923, + "step": 46372 + }, + { + "epoch": 2.27, + "grad_norm": 0.7243220210075378, + "learning_rate": 8.290103888786768e-05, + "loss": 3.0309, + "step": 46373 + }, + { + "epoch": 2.27, + "grad_norm": 0.7139156460762024, + "learning_rate": 8.289041344226091e-05, + "loss": 3.1218, + "step": 46374 + }, + { + "epoch": 2.27, + "grad_norm": 0.7168537378311157, + "learning_rate": 8.287978856847894e-05, + "loss": 2.9758, + "step": 46375 + }, + { + "epoch": 2.27, + "grad_norm": 0.7514629364013672, + "learning_rate": 8.286916426654988e-05, + "loss": 2.6922, + "step": 46376 + }, + { + "epoch": 2.27, + "grad_norm": 0.7429746985435486, + "learning_rate": 8.285854053650167e-05, + "loss": 3.0634, + "step": 46377 + }, + { + "epoch": 2.27, + "grad_norm": 0.8082495331764221, + "learning_rate": 8.284791737836219e-05, + "loss": 2.968, + "step": 46378 + }, + { + "epoch": 2.27, + "grad_norm": 0.7452201843261719, + "learning_rate": 8.283729479215959e-05, + "loss": 2.9215, + "step": 46379 + }, + { + "epoch": 2.27, + "grad_norm": 0.7076332569122314, + "learning_rate": 8.282667277792168e-05, + "loss": 2.6926, + "step": 46380 + }, + { + "epoch": 2.27, + "grad_norm": 0.6897568106651306, + "learning_rate": 8.281605133567652e-05, + "loss": 2.9806, + "step": 46381 + }, + { + "epoch": 2.27, + "grad_norm": 0.7367916703224182, + "learning_rate": 8.280543046545215e-05, + "loss": 2.9092, + "step": 46382 + }, + { + "epoch": 2.27, + "grad_norm": 0.7310504913330078, + "learning_rate": 8.27948101672764e-05, + "loss": 2.8276, + "step": 46383 + }, + { + "epoch": 2.27, + "grad_norm": 0.7102162837982178, + "learning_rate": 8.278419044117749e-05, + "loss": 2.9182, + "step": 46384 + }, + { + "epoch": 2.27, + "grad_norm": 0.6854374408721924, + "learning_rate": 8.277357128718305e-05, + "loss": 2.9456, + "step": 46385 + }, + { + "epoch": 2.27, + "grad_norm": 0.7474464178085327, + "learning_rate": 8.276295270532119e-05, + "loss": 2.8981, + "step": 46386 + }, + { + "epoch": 2.27, + "grad_norm": 0.7687355875968933, + "learning_rate": 8.275233469562004e-05, + "loss": 2.7403, + "step": 46387 + }, + { + "epoch": 2.27, + "grad_norm": 0.7094236612319946, + "learning_rate": 8.274171725810729e-05, + "loss": 2.9686, + "step": 46388 + }, + { + "epoch": 2.27, + "grad_norm": 0.7075121402740479, + "learning_rate": 8.273110039281117e-05, + "loss": 2.8354, + "step": 46389 + }, + { + "epoch": 2.27, + "grad_norm": 0.724186360836029, + "learning_rate": 8.272048409975942e-05, + "loss": 2.7629, + "step": 46390 + }, + { + "epoch": 2.27, + "grad_norm": 0.7304971218109131, + "learning_rate": 8.27098683789802e-05, + "loss": 2.7759, + "step": 46391 + }, + { + "epoch": 2.27, + "grad_norm": 0.7718026041984558, + "learning_rate": 8.269925323050134e-05, + "loss": 3.0894, + "step": 46392 + }, + { + "epoch": 2.27, + "grad_norm": 0.7265807390213013, + "learning_rate": 8.268863865435074e-05, + "loss": 2.9327, + "step": 46393 + }, + { + "epoch": 2.27, + "grad_norm": 0.6995274424552917, + "learning_rate": 8.267802465055656e-05, + "loss": 2.6844, + "step": 46394 + }, + { + "epoch": 2.27, + "grad_norm": 0.7589938044548035, + "learning_rate": 8.266741121914653e-05, + "loss": 2.9637, + "step": 46395 + }, + { + "epoch": 2.27, + "grad_norm": 0.7459881901741028, + "learning_rate": 8.265679836014872e-05, + "loss": 2.7266, + "step": 46396 + }, + { + "epoch": 2.27, + "grad_norm": 0.7490134239196777, + "learning_rate": 8.264618607359115e-05, + "loss": 2.9801, + "step": 46397 + }, + { + "epoch": 2.27, + "grad_norm": 0.7336806058883667, + "learning_rate": 8.263557435950172e-05, + "loss": 2.8788, + "step": 46398 + }, + { + "epoch": 2.27, + "grad_norm": 0.6905733942985535, + "learning_rate": 8.262496321790831e-05, + "loss": 2.9608, + "step": 46399 + }, + { + "epoch": 2.27, + "grad_norm": 0.6890795230865479, + "learning_rate": 8.261435264883887e-05, + "loss": 2.9883, + "step": 46400 + }, + { + "epoch": 2.27, + "grad_norm": 0.7028673887252808, + "learning_rate": 8.260374265232135e-05, + "loss": 2.8123, + "step": 46401 + }, + { + "epoch": 2.27, + "grad_norm": 0.7058315873146057, + "learning_rate": 8.259313322838386e-05, + "loss": 2.807, + "step": 46402 + }, + { + "epoch": 2.27, + "grad_norm": 0.7144296169281006, + "learning_rate": 8.258252437705411e-05, + "loss": 2.912, + "step": 46403 + }, + { + "epoch": 2.27, + "grad_norm": 0.7268756628036499, + "learning_rate": 8.257191609836024e-05, + "loss": 2.9637, + "step": 46404 + }, + { + "epoch": 2.27, + "grad_norm": 0.6951913237571716, + "learning_rate": 8.256130839233001e-05, + "loss": 2.7703, + "step": 46405 + }, + { + "epoch": 2.27, + "grad_norm": 0.7235651016235352, + "learning_rate": 8.255070125899151e-05, + "loss": 3.0491, + "step": 46406 + }, + { + "epoch": 2.27, + "grad_norm": 0.6981080770492554, + "learning_rate": 8.254009469837264e-05, + "loss": 2.9714, + "step": 46407 + }, + { + "epoch": 2.27, + "grad_norm": 0.6944175362586975, + "learning_rate": 8.252948871050122e-05, + "loss": 2.8881, + "step": 46408 + }, + { + "epoch": 2.27, + "grad_norm": 0.7283467054367065, + "learning_rate": 8.251888329540536e-05, + "loss": 2.8444, + "step": 46409 + }, + { + "epoch": 2.27, + "grad_norm": 0.7321710586547852, + "learning_rate": 8.250827845311283e-05, + "loss": 3.0853, + "step": 46410 + }, + { + "epoch": 2.27, + "grad_norm": 0.747199535369873, + "learning_rate": 8.249767418365162e-05, + "loss": 3.0333, + "step": 46411 + }, + { + "epoch": 2.27, + "grad_norm": 0.693293571472168, + "learning_rate": 8.248707048704975e-05, + "loss": 2.949, + "step": 46412 + }, + { + "epoch": 2.27, + "grad_norm": 0.6886857151985168, + "learning_rate": 8.247646736333511e-05, + "loss": 3.0812, + "step": 46413 + }, + { + "epoch": 2.27, + "grad_norm": 0.7032137513160706, + "learning_rate": 8.24658648125356e-05, + "loss": 2.7775, + "step": 46414 + }, + { + "epoch": 2.27, + "grad_norm": 0.7268944382667542, + "learning_rate": 8.2455262834679e-05, + "loss": 2.8439, + "step": 46415 + }, + { + "epoch": 2.27, + "grad_norm": 0.6935317516326904, + "learning_rate": 8.244466142979341e-05, + "loss": 2.8731, + "step": 46416 + }, + { + "epoch": 2.27, + "grad_norm": 0.7060828804969788, + "learning_rate": 8.243406059790676e-05, + "loss": 2.8517, + "step": 46417 + }, + { + "epoch": 2.27, + "grad_norm": 0.6857595443725586, + "learning_rate": 8.242346033904688e-05, + "loss": 2.9907, + "step": 46418 + }, + { + "epoch": 2.27, + "grad_norm": 0.7835713624954224, + "learning_rate": 8.24128606532418e-05, + "loss": 2.6536, + "step": 46419 + }, + { + "epoch": 2.27, + "grad_norm": 0.7131385803222656, + "learning_rate": 8.240226154051936e-05, + "loss": 2.8273, + "step": 46420 + }, + { + "epoch": 2.27, + "grad_norm": 0.7091208100318909, + "learning_rate": 8.23916630009074e-05, + "loss": 2.8953, + "step": 46421 + }, + { + "epoch": 2.28, + "grad_norm": 0.7581696510314941, + "learning_rate": 8.238106503443402e-05, + "loss": 2.8396, + "step": 46422 + }, + { + "epoch": 2.28, + "grad_norm": 0.7144935131072998, + "learning_rate": 8.237046764112694e-05, + "loss": 2.9896, + "step": 46423 + }, + { + "epoch": 2.28, + "grad_norm": 0.7232292890548706, + "learning_rate": 8.235987082101427e-05, + "loss": 2.5879, + "step": 46424 + }, + { + "epoch": 2.28, + "grad_norm": 0.7325698137283325, + "learning_rate": 8.234927457412371e-05, + "loss": 2.9548, + "step": 46425 + }, + { + "epoch": 2.28, + "grad_norm": 0.7658261656761169, + "learning_rate": 8.233867890048327e-05, + "loss": 2.8471, + "step": 46426 + }, + { + "epoch": 2.28, + "grad_norm": 0.7133269906044006, + "learning_rate": 8.232808380012095e-05, + "loss": 3.0446, + "step": 46427 + }, + { + "epoch": 2.28, + "grad_norm": 0.7498300075531006, + "learning_rate": 8.231748927306456e-05, + "loss": 2.9269, + "step": 46428 + }, + { + "epoch": 2.28, + "grad_norm": 0.7016955614089966, + "learning_rate": 8.230689531934203e-05, + "loss": 3.0732, + "step": 46429 + }, + { + "epoch": 2.28, + "grad_norm": 0.6857497692108154, + "learning_rate": 8.229630193898111e-05, + "loss": 2.9937, + "step": 46430 + }, + { + "epoch": 2.28, + "grad_norm": 0.7271170020103455, + "learning_rate": 8.228570913200987e-05, + "loss": 2.8245, + "step": 46431 + }, + { + "epoch": 2.28, + "grad_norm": 0.720839262008667, + "learning_rate": 8.227511689845627e-05, + "loss": 3.0504, + "step": 46432 + }, + { + "epoch": 2.28, + "grad_norm": 0.720252513885498, + "learning_rate": 8.2264525238348e-05, + "loss": 3.0355, + "step": 46433 + }, + { + "epoch": 2.28, + "grad_norm": 0.7447952032089233, + "learning_rate": 8.225393415171316e-05, + "loss": 3.0458, + "step": 46434 + }, + { + "epoch": 2.28, + "grad_norm": 0.720052182674408, + "learning_rate": 8.224334363857954e-05, + "loss": 2.916, + "step": 46435 + }, + { + "epoch": 2.28, + "grad_norm": 0.7244362235069275, + "learning_rate": 8.223275369897497e-05, + "loss": 2.7844, + "step": 46436 + }, + { + "epoch": 2.28, + "grad_norm": 0.7059257626533508, + "learning_rate": 8.222216433292751e-05, + "loss": 3.0789, + "step": 46437 + }, + { + "epoch": 2.28, + "grad_norm": 0.6963223218917847, + "learning_rate": 8.221157554046486e-05, + "loss": 2.9243, + "step": 46438 + }, + { + "epoch": 2.28, + "grad_norm": 0.7097539305686951, + "learning_rate": 8.22009873216151e-05, + "loss": 2.9694, + "step": 46439 + }, + { + "epoch": 2.28, + "grad_norm": 0.70842045545578, + "learning_rate": 8.219039967640594e-05, + "loss": 2.9949, + "step": 46440 + }, + { + "epoch": 2.28, + "grad_norm": 0.730398952960968, + "learning_rate": 8.217981260486542e-05, + "loss": 2.9813, + "step": 46441 + }, + { + "epoch": 2.28, + "grad_norm": 0.7620899081230164, + "learning_rate": 8.216922610702136e-05, + "loss": 2.8141, + "step": 46442 + }, + { + "epoch": 2.28, + "grad_norm": 0.7119383811950684, + "learning_rate": 8.215864018290154e-05, + "loss": 2.92, + "step": 46443 + }, + { + "epoch": 2.28, + "grad_norm": 0.7197368144989014, + "learning_rate": 8.214805483253403e-05, + "loss": 2.9015, + "step": 46444 + }, + { + "epoch": 2.28, + "grad_norm": 0.7236267924308777, + "learning_rate": 8.213747005594655e-05, + "loss": 2.8312, + "step": 46445 + }, + { + "epoch": 2.28, + "grad_norm": 0.743288516998291, + "learning_rate": 8.21268858531671e-05, + "loss": 3.0393, + "step": 46446 + }, + { + "epoch": 2.28, + "grad_norm": 0.6969639658927917, + "learning_rate": 8.211630222422343e-05, + "loss": 3.0242, + "step": 46447 + }, + { + "epoch": 2.28, + "grad_norm": 0.9204838275909424, + "learning_rate": 8.210571916914348e-05, + "loss": 2.927, + "step": 46448 + }, + { + "epoch": 2.28, + "grad_norm": 0.7029668092727661, + "learning_rate": 8.209513668795523e-05, + "loss": 2.8216, + "step": 46449 + }, + { + "epoch": 2.28, + "grad_norm": 0.7012711763381958, + "learning_rate": 8.208455478068646e-05, + "loss": 2.8378, + "step": 46450 + }, + { + "epoch": 2.28, + "grad_norm": 0.7693458199501038, + "learning_rate": 8.207397344736502e-05, + "loss": 2.926, + "step": 46451 + }, + { + "epoch": 2.28, + "grad_norm": 0.6860461831092834, + "learning_rate": 8.20633926880187e-05, + "loss": 2.8357, + "step": 46452 + }, + { + "epoch": 2.28, + "grad_norm": 0.7062203288078308, + "learning_rate": 8.20528125026755e-05, + "loss": 3.0731, + "step": 46453 + }, + { + "epoch": 2.28, + "grad_norm": 0.7307056188583374, + "learning_rate": 8.20422328913633e-05, + "loss": 2.6463, + "step": 46454 + }, + { + "epoch": 2.28, + "grad_norm": 0.6875963807106018, + "learning_rate": 8.203165385410981e-05, + "loss": 2.8983, + "step": 46455 + }, + { + "epoch": 2.28, + "grad_norm": 0.8253306150436401, + "learning_rate": 8.202107539094311e-05, + "loss": 2.8164, + "step": 46456 + }, + { + "epoch": 2.28, + "grad_norm": 0.703727662563324, + "learning_rate": 8.201049750189094e-05, + "loss": 2.9208, + "step": 46457 + }, + { + "epoch": 2.28, + "grad_norm": 0.7636400461196899, + "learning_rate": 8.19999201869811e-05, + "loss": 2.9006, + "step": 46458 + }, + { + "epoch": 2.28, + "grad_norm": 0.6992866396903992, + "learning_rate": 8.198934344624156e-05, + "loss": 3.0271, + "step": 46459 + }, + { + "epoch": 2.28, + "grad_norm": 0.7500672340393066, + "learning_rate": 8.197876727970006e-05, + "loss": 2.8297, + "step": 46460 + }, + { + "epoch": 2.28, + "grad_norm": 0.682694137096405, + "learning_rate": 8.196819168738465e-05, + "loss": 2.9394, + "step": 46461 + }, + { + "epoch": 2.28, + "grad_norm": 0.6699866056442261, + "learning_rate": 8.195761666932291e-05, + "loss": 2.9927, + "step": 46462 + }, + { + "epoch": 2.28, + "grad_norm": 0.7636361718177795, + "learning_rate": 8.194704222554297e-05, + "loss": 2.6734, + "step": 46463 + }, + { + "epoch": 2.28, + "grad_norm": 0.8084840774536133, + "learning_rate": 8.193646835607256e-05, + "loss": 2.8364, + "step": 46464 + }, + { + "epoch": 2.28, + "grad_norm": 0.7584612965583801, + "learning_rate": 8.192589506093943e-05, + "loss": 2.8917, + "step": 46465 + }, + { + "epoch": 2.28, + "grad_norm": 0.8246785998344421, + "learning_rate": 8.191532234017162e-05, + "loss": 2.7962, + "step": 46466 + }, + { + "epoch": 2.28, + "grad_norm": 0.7840319871902466, + "learning_rate": 8.190475019379678e-05, + "loss": 2.8299, + "step": 46467 + }, + { + "epoch": 2.28, + "grad_norm": 0.7328280806541443, + "learning_rate": 8.189417862184287e-05, + "loss": 2.7687, + "step": 46468 + }, + { + "epoch": 2.28, + "grad_norm": 0.7843512892723083, + "learning_rate": 8.18836076243378e-05, + "loss": 3.0041, + "step": 46469 + }, + { + "epoch": 2.28, + "grad_norm": 0.7056668400764465, + "learning_rate": 8.187303720130922e-05, + "loss": 3.0747, + "step": 46470 + }, + { + "epoch": 2.28, + "grad_norm": 0.7091785669326782, + "learning_rate": 8.186246735278518e-05, + "loss": 2.9689, + "step": 46471 + }, + { + "epoch": 2.28, + "grad_norm": 0.7240222096443176, + "learning_rate": 8.185189807879345e-05, + "loss": 2.8462, + "step": 46472 + }, + { + "epoch": 2.28, + "grad_norm": 0.6857366561889648, + "learning_rate": 8.18413293793617e-05, + "loss": 3.0774, + "step": 46473 + }, + { + "epoch": 2.28, + "grad_norm": 0.7089368104934692, + "learning_rate": 8.183076125451803e-05, + "loss": 2.8356, + "step": 46474 + }, + { + "epoch": 2.28, + "grad_norm": 0.8050333261489868, + "learning_rate": 8.182019370429005e-05, + "loss": 3.0691, + "step": 46475 + }, + { + "epoch": 2.28, + "grad_norm": 0.740153431892395, + "learning_rate": 8.180962672870577e-05, + "loss": 2.796, + "step": 46476 + }, + { + "epoch": 2.28, + "grad_norm": 0.7142683267593384, + "learning_rate": 8.179906032779283e-05, + "loss": 2.8724, + "step": 46477 + }, + { + "epoch": 2.28, + "grad_norm": 0.6894727349281311, + "learning_rate": 8.178849450157932e-05, + "loss": 2.6895, + "step": 46478 + }, + { + "epoch": 2.28, + "grad_norm": 0.7132574915885925, + "learning_rate": 8.177792925009286e-05, + "loss": 2.925, + "step": 46479 + }, + { + "epoch": 2.28, + "grad_norm": 0.7323726415634155, + "learning_rate": 8.176736457336129e-05, + "loss": 2.7728, + "step": 46480 + }, + { + "epoch": 2.28, + "grad_norm": 0.7239104509353638, + "learning_rate": 8.175680047141254e-05, + "loss": 2.7761, + "step": 46481 + }, + { + "epoch": 2.28, + "grad_norm": 0.8582285046577454, + "learning_rate": 8.174623694427433e-05, + "loss": 3.2315, + "step": 46482 + }, + { + "epoch": 2.28, + "grad_norm": 0.6943944692611694, + "learning_rate": 8.173567399197448e-05, + "loss": 2.8219, + "step": 46483 + }, + { + "epoch": 2.28, + "grad_norm": 0.7097330689430237, + "learning_rate": 8.172511161454095e-05, + "loss": 2.7027, + "step": 46484 + }, + { + "epoch": 2.28, + "grad_norm": 0.7035298347473145, + "learning_rate": 8.17145498120015e-05, + "loss": 2.9617, + "step": 46485 + }, + { + "epoch": 2.28, + "grad_norm": 0.7039844393730164, + "learning_rate": 8.17039885843839e-05, + "loss": 2.9092, + "step": 46486 + }, + { + "epoch": 2.28, + "grad_norm": 0.7377235889434814, + "learning_rate": 8.16934279317159e-05, + "loss": 2.912, + "step": 46487 + }, + { + "epoch": 2.28, + "grad_norm": 0.6866671442985535, + "learning_rate": 8.168286785402537e-05, + "loss": 2.9288, + "step": 46488 + }, + { + "epoch": 2.28, + "grad_norm": 0.7029548287391663, + "learning_rate": 8.167230835134027e-05, + "loss": 2.9683, + "step": 46489 + }, + { + "epoch": 2.28, + "grad_norm": 0.7336953282356262, + "learning_rate": 8.16617494236882e-05, + "loss": 2.921, + "step": 46490 + }, + { + "epoch": 2.28, + "grad_norm": 0.6844274997711182, + "learning_rate": 8.165119107109714e-05, + "loss": 2.8364, + "step": 46491 + }, + { + "epoch": 2.28, + "grad_norm": 0.7402933835983276, + "learning_rate": 8.164063329359473e-05, + "loss": 2.6783, + "step": 46492 + }, + { + "epoch": 2.28, + "grad_norm": 0.6989611983299255, + "learning_rate": 8.163007609120897e-05, + "loss": 3.0188, + "step": 46493 + }, + { + "epoch": 2.28, + "grad_norm": 0.7093426585197449, + "learning_rate": 8.161951946396756e-05, + "loss": 2.6546, + "step": 46494 + }, + { + "epoch": 2.28, + "grad_norm": 0.7372518181800842, + "learning_rate": 8.160896341189821e-05, + "loss": 2.7915, + "step": 46495 + }, + { + "epoch": 2.28, + "grad_norm": 0.7127835154533386, + "learning_rate": 8.159840793502891e-05, + "loss": 2.8474, + "step": 46496 + }, + { + "epoch": 2.28, + "grad_norm": 0.7472658157348633, + "learning_rate": 8.158785303338731e-05, + "loss": 2.912, + "step": 46497 + }, + { + "epoch": 2.28, + "grad_norm": 0.7016239166259766, + "learning_rate": 8.157729870700125e-05, + "loss": 2.835, + "step": 46498 + }, + { + "epoch": 2.28, + "grad_norm": 0.7203468680381775, + "learning_rate": 8.156674495589862e-05, + "loss": 2.7075, + "step": 46499 + }, + { + "epoch": 2.28, + "grad_norm": 0.6911539435386658, + "learning_rate": 8.155619178010717e-05, + "loss": 2.7875, + "step": 46500 + }, + { + "epoch": 2.28, + "grad_norm": 0.8295294046401978, + "learning_rate": 8.154563917965465e-05, + "loss": 2.7513, + "step": 46501 + }, + { + "epoch": 2.28, + "grad_norm": 0.715740978717804, + "learning_rate": 8.15350871545688e-05, + "loss": 2.8735, + "step": 46502 + }, + { + "epoch": 2.28, + "grad_norm": 0.7473269104957581, + "learning_rate": 8.152453570487749e-05, + "loss": 2.6743, + "step": 46503 + }, + { + "epoch": 2.28, + "grad_norm": 0.7766650319099426, + "learning_rate": 8.151398483060859e-05, + "loss": 2.8402, + "step": 46504 + }, + { + "epoch": 2.28, + "grad_norm": 0.6803699731826782, + "learning_rate": 8.150343453178971e-05, + "loss": 2.6261, + "step": 46505 + }, + { + "epoch": 2.28, + "grad_norm": 0.9094350934028625, + "learning_rate": 8.149288480844884e-05, + "loss": 2.9338, + "step": 46506 + }, + { + "epoch": 2.28, + "grad_norm": 0.7277110815048218, + "learning_rate": 8.148233566061352e-05, + "loss": 3.097, + "step": 46507 + }, + { + "epoch": 2.28, + "grad_norm": 0.7063420414924622, + "learning_rate": 8.147178708831181e-05, + "loss": 2.9711, + "step": 46508 + }, + { + "epoch": 2.28, + "grad_norm": 0.7325363159179688, + "learning_rate": 8.146123909157137e-05, + "loss": 2.838, + "step": 46509 + }, + { + "epoch": 2.28, + "grad_norm": 0.6835830211639404, + "learning_rate": 8.145069167041982e-05, + "loss": 2.8725, + "step": 46510 + }, + { + "epoch": 2.28, + "grad_norm": 0.7027117609977722, + "learning_rate": 8.144014482488523e-05, + "loss": 2.8241, + "step": 46511 + }, + { + "epoch": 2.28, + "grad_norm": 0.7053887248039246, + "learning_rate": 8.142959855499511e-05, + "loss": 2.895, + "step": 46512 + }, + { + "epoch": 2.28, + "grad_norm": 0.6964938044548035, + "learning_rate": 8.141905286077736e-05, + "loss": 3.0054, + "step": 46513 + }, + { + "epoch": 2.28, + "grad_norm": 0.7192620038986206, + "learning_rate": 8.140850774225988e-05, + "loss": 2.8054, + "step": 46514 + }, + { + "epoch": 2.28, + "grad_norm": 0.7804059386253357, + "learning_rate": 8.139796319947032e-05, + "loss": 2.9574, + "step": 46515 + }, + { + "epoch": 2.28, + "grad_norm": 0.6990054845809937, + "learning_rate": 8.138741923243643e-05, + "loss": 2.7702, + "step": 46516 + }, + { + "epoch": 2.28, + "grad_norm": 0.7165387272834778, + "learning_rate": 8.137687584118594e-05, + "loss": 2.841, + "step": 46517 + }, + { + "epoch": 2.28, + "grad_norm": 0.7471354603767395, + "learning_rate": 8.136633302574678e-05, + "loss": 2.9077, + "step": 46518 + }, + { + "epoch": 2.28, + "grad_norm": 0.7257309556007385, + "learning_rate": 8.13557907861465e-05, + "loss": 3.0477, + "step": 46519 + }, + { + "epoch": 2.28, + "grad_norm": 0.7215765118598938, + "learning_rate": 8.1345249122413e-05, + "loss": 2.9011, + "step": 46520 + }, + { + "epoch": 2.28, + "grad_norm": 0.6833771467208862, + "learning_rate": 8.133470803457413e-05, + "loss": 2.7987, + "step": 46521 + }, + { + "epoch": 2.28, + "grad_norm": 0.7473302483558655, + "learning_rate": 8.132416752265754e-05, + "loss": 2.707, + "step": 46522 + }, + { + "epoch": 2.28, + "grad_norm": 0.7858054637908936, + "learning_rate": 8.131362758669103e-05, + "loss": 3.0084, + "step": 46523 + }, + { + "epoch": 2.28, + "grad_norm": 0.7041542530059814, + "learning_rate": 8.130308822670223e-05, + "loss": 2.665, + "step": 46524 + }, + { + "epoch": 2.28, + "grad_norm": 0.705814778804779, + "learning_rate": 8.1292549442719e-05, + "loss": 3.0473, + "step": 46525 + }, + { + "epoch": 2.28, + "grad_norm": 0.7002612352371216, + "learning_rate": 8.128201123476923e-05, + "loss": 2.7564, + "step": 46526 + }, + { + "epoch": 2.28, + "grad_norm": 0.746374249458313, + "learning_rate": 8.127147360288043e-05, + "loss": 2.9071, + "step": 46527 + }, + { + "epoch": 2.28, + "grad_norm": 0.7429247498512268, + "learning_rate": 8.126093654708056e-05, + "loss": 2.9663, + "step": 46528 + }, + { + "epoch": 2.28, + "grad_norm": 0.7674092650413513, + "learning_rate": 8.125040006739722e-05, + "loss": 3.1434, + "step": 46529 + }, + { + "epoch": 2.28, + "grad_norm": 0.6966801285743713, + "learning_rate": 8.123986416385831e-05, + "loss": 2.9157, + "step": 46530 + }, + { + "epoch": 2.28, + "grad_norm": 0.7209636569023132, + "learning_rate": 8.122932883649147e-05, + "loss": 3.0265, + "step": 46531 + }, + { + "epoch": 2.28, + "grad_norm": 0.7358286380767822, + "learning_rate": 8.121879408532439e-05, + "loss": 3.1313, + "step": 46532 + }, + { + "epoch": 2.28, + "grad_norm": 0.7346293926239014, + "learning_rate": 8.120825991038502e-05, + "loss": 2.9752, + "step": 46533 + }, + { + "epoch": 2.28, + "grad_norm": 0.7059187889099121, + "learning_rate": 8.119772631170088e-05, + "loss": 2.8991, + "step": 46534 + }, + { + "epoch": 2.28, + "grad_norm": 0.6938578486442566, + "learning_rate": 8.118719328929983e-05, + "loss": 2.9862, + "step": 46535 + }, + { + "epoch": 2.28, + "grad_norm": 0.7114361524581909, + "learning_rate": 8.117666084320967e-05, + "loss": 2.7991, + "step": 46536 + }, + { + "epoch": 2.28, + "grad_norm": 0.7124288082122803, + "learning_rate": 8.116612897345807e-05, + "loss": 3.0468, + "step": 46537 + }, + { + "epoch": 2.28, + "grad_norm": 0.7127500772476196, + "learning_rate": 8.115559768007278e-05, + "loss": 2.8632, + "step": 46538 + }, + { + "epoch": 2.28, + "grad_norm": 0.6895161867141724, + "learning_rate": 8.114506696308147e-05, + "loss": 2.7845, + "step": 46539 + }, + { + "epoch": 2.28, + "grad_norm": 0.7293110489845276, + "learning_rate": 8.113453682251188e-05, + "loss": 2.8978, + "step": 46540 + }, + { + "epoch": 2.28, + "grad_norm": 0.7473793029785156, + "learning_rate": 8.112400725839193e-05, + "loss": 3.1251, + "step": 46541 + }, + { + "epoch": 2.28, + "grad_norm": 0.710602343082428, + "learning_rate": 8.111347827074912e-05, + "loss": 2.9912, + "step": 46542 + }, + { + "epoch": 2.28, + "grad_norm": 0.6810015439987183, + "learning_rate": 8.110294985961136e-05, + "loss": 2.8958, + "step": 46543 + }, + { + "epoch": 2.28, + "grad_norm": 0.6719366312026978, + "learning_rate": 8.10924220250063e-05, + "loss": 3.0995, + "step": 46544 + }, + { + "epoch": 2.28, + "grad_norm": 0.6884929537773132, + "learning_rate": 8.108189476696162e-05, + "loss": 3.2298, + "step": 46545 + }, + { + "epoch": 2.28, + "grad_norm": 0.7371322512626648, + "learning_rate": 8.107136808550513e-05, + "loss": 2.9997, + "step": 46546 + }, + { + "epoch": 2.28, + "grad_norm": 0.7723472118377686, + "learning_rate": 8.106084198066448e-05, + "loss": 2.9791, + "step": 46547 + }, + { + "epoch": 2.28, + "grad_norm": 0.7253080606460571, + "learning_rate": 8.105031645246752e-05, + "loss": 2.6226, + "step": 46548 + }, + { + "epoch": 2.28, + "grad_norm": 0.7322795987129211, + "learning_rate": 8.103979150094178e-05, + "loss": 2.8257, + "step": 46549 + }, + { + "epoch": 2.28, + "grad_norm": 0.7000114321708679, + "learning_rate": 8.102926712611508e-05, + "loss": 3.0231, + "step": 46550 + }, + { + "epoch": 2.28, + "grad_norm": 0.7282766699790955, + "learning_rate": 8.101874332801526e-05, + "loss": 3.0256, + "step": 46551 + }, + { + "epoch": 2.28, + "grad_norm": 0.702883243560791, + "learning_rate": 8.100822010666995e-05, + "loss": 2.8617, + "step": 46552 + }, + { + "epoch": 2.28, + "grad_norm": 0.7281079292297363, + "learning_rate": 8.099769746210681e-05, + "loss": 3.0817, + "step": 46553 + }, + { + "epoch": 2.28, + "grad_norm": 0.7449455261230469, + "learning_rate": 8.098717539435349e-05, + "loss": 2.7433, + "step": 46554 + }, + { + "epoch": 2.28, + "grad_norm": 0.7251495122909546, + "learning_rate": 8.09766539034378e-05, + "loss": 2.6756, + "step": 46555 + }, + { + "epoch": 2.28, + "grad_norm": 0.7186842560768127, + "learning_rate": 8.096613298938758e-05, + "loss": 2.9822, + "step": 46556 + }, + { + "epoch": 2.28, + "grad_norm": 0.7262799739837646, + "learning_rate": 8.095561265223028e-05, + "loss": 2.6999, + "step": 46557 + }, + { + "epoch": 2.28, + "grad_norm": 0.7726730108261108, + "learning_rate": 8.094509289199384e-05, + "loss": 2.7446, + "step": 46558 + }, + { + "epoch": 2.28, + "grad_norm": 0.7000957727432251, + "learning_rate": 8.093457370870588e-05, + "loss": 2.8547, + "step": 46559 + }, + { + "epoch": 2.28, + "grad_norm": 0.7143601179122925, + "learning_rate": 8.092405510239402e-05, + "loss": 2.8717, + "step": 46560 + }, + { + "epoch": 2.28, + "grad_norm": 0.6854216456413269, + "learning_rate": 8.09135370730861e-05, + "loss": 2.9581, + "step": 46561 + }, + { + "epoch": 2.28, + "grad_norm": 0.745839536190033, + "learning_rate": 8.090301962080968e-05, + "loss": 2.9961, + "step": 46562 + }, + { + "epoch": 2.28, + "grad_norm": 0.7243121862411499, + "learning_rate": 8.089250274559262e-05, + "loss": 2.821, + "step": 46563 + }, + { + "epoch": 2.28, + "grad_norm": 0.7288719415664673, + "learning_rate": 8.088198644746244e-05, + "loss": 2.6618, + "step": 46564 + }, + { + "epoch": 2.28, + "grad_norm": 0.7389212250709534, + "learning_rate": 8.087147072644704e-05, + "loss": 2.9932, + "step": 46565 + }, + { + "epoch": 2.28, + "grad_norm": 0.7060454487800598, + "learning_rate": 8.086095558257402e-05, + "loss": 2.9091, + "step": 46566 + }, + { + "epoch": 2.28, + "grad_norm": 0.7329028248786926, + "learning_rate": 8.085044101587097e-05, + "loss": 2.9153, + "step": 46567 + }, + { + "epoch": 2.28, + "grad_norm": 0.7609493136405945, + "learning_rate": 8.083992702636578e-05, + "loss": 3.0979, + "step": 46568 + }, + { + "epoch": 2.28, + "grad_norm": 0.7355934381484985, + "learning_rate": 8.082941361408593e-05, + "loss": 2.9956, + "step": 46569 + }, + { + "epoch": 2.28, + "grad_norm": 0.6936449408531189, + "learning_rate": 8.081890077905925e-05, + "loss": 3.0962, + "step": 46570 + }, + { + "epoch": 2.28, + "grad_norm": 0.7607849836349487, + "learning_rate": 8.08083885213135e-05, + "loss": 2.7487, + "step": 46571 + }, + { + "epoch": 2.28, + "grad_norm": 0.7081700563430786, + "learning_rate": 8.079787684087614e-05, + "loss": 2.7324, + "step": 46572 + }, + { + "epoch": 2.28, + "grad_norm": 0.7279213070869446, + "learning_rate": 8.07873657377751e-05, + "loss": 2.8963, + "step": 46573 + }, + { + "epoch": 2.28, + "grad_norm": 0.7351059317588806, + "learning_rate": 8.077685521203795e-05, + "loss": 2.8639, + "step": 46574 + }, + { + "epoch": 2.28, + "grad_norm": 0.7259153127670288, + "learning_rate": 8.07663452636923e-05, + "loss": 2.8075, + "step": 46575 + }, + { + "epoch": 2.28, + "grad_norm": 0.7282455563545227, + "learning_rate": 8.0755835892766e-05, + "loss": 3.0483, + "step": 46576 + }, + { + "epoch": 2.28, + "grad_norm": 0.6987950801849365, + "learning_rate": 8.074532709928651e-05, + "loss": 2.9277, + "step": 46577 + }, + { + "epoch": 2.28, + "grad_norm": 0.7334350347518921, + "learning_rate": 8.073481888328174e-05, + "loss": 2.9942, + "step": 46578 + }, + { + "epoch": 2.28, + "grad_norm": 0.7089470624923706, + "learning_rate": 8.072431124477917e-05, + "loss": 2.9821, + "step": 46579 + }, + { + "epoch": 2.28, + "grad_norm": 0.7162338495254517, + "learning_rate": 8.071380418380663e-05, + "loss": 2.7657, + "step": 46580 + }, + { + "epoch": 2.28, + "grad_norm": 0.7498306035995483, + "learning_rate": 8.070329770039175e-05, + "loss": 2.8501, + "step": 46581 + }, + { + "epoch": 2.28, + "grad_norm": 0.735768735408783, + "learning_rate": 8.069279179456209e-05, + "loss": 2.9595, + "step": 46582 + }, + { + "epoch": 2.28, + "grad_norm": 0.7566409111022949, + "learning_rate": 8.068228646634549e-05, + "loss": 2.9093, + "step": 46583 + }, + { + "epoch": 2.28, + "grad_norm": 0.7465990781784058, + "learning_rate": 8.067178171576945e-05, + "loss": 2.9164, + "step": 46584 + }, + { + "epoch": 2.28, + "grad_norm": 0.7334000468254089, + "learning_rate": 8.066127754286173e-05, + "loss": 3.0872, + "step": 46585 + }, + { + "epoch": 2.28, + "grad_norm": 0.8011319041252136, + "learning_rate": 8.06507739476501e-05, + "loss": 2.9013, + "step": 46586 + }, + { + "epoch": 2.28, + "grad_norm": 0.729567289352417, + "learning_rate": 8.064027093016208e-05, + "loss": 2.8262, + "step": 46587 + }, + { + "epoch": 2.28, + "grad_norm": 0.7324111461639404, + "learning_rate": 8.062976849042539e-05, + "loss": 2.7336, + "step": 46588 + }, + { + "epoch": 2.28, + "grad_norm": 0.7233032584190369, + "learning_rate": 8.06192666284676e-05, + "loss": 2.4499, + "step": 46589 + }, + { + "epoch": 2.28, + "grad_norm": 0.7346892952919006, + "learning_rate": 8.060876534431642e-05, + "loss": 2.9368, + "step": 46590 + }, + { + "epoch": 2.28, + "grad_norm": 0.7054365277290344, + "learning_rate": 8.059826463799961e-05, + "loss": 2.6252, + "step": 46591 + }, + { + "epoch": 2.28, + "grad_norm": 0.7693349719047546, + "learning_rate": 8.058776450954467e-05, + "loss": 2.9058, + "step": 46592 + }, + { + "epoch": 2.28, + "grad_norm": 0.7086447477340698, + "learning_rate": 8.057726495897941e-05, + "loss": 2.8654, + "step": 46593 + }, + { + "epoch": 2.28, + "grad_norm": 0.7039918303489685, + "learning_rate": 8.056676598633134e-05, + "loss": 2.985, + "step": 46594 + }, + { + "epoch": 2.28, + "grad_norm": 0.722930371761322, + "learning_rate": 8.055626759162824e-05, + "loss": 2.9315, + "step": 46595 + }, + { + "epoch": 2.28, + "grad_norm": 0.7084254026412964, + "learning_rate": 8.054576977489771e-05, + "loss": 2.7143, + "step": 46596 + }, + { + "epoch": 2.28, + "grad_norm": 0.7326437830924988, + "learning_rate": 8.053527253616731e-05, + "loss": 2.91, + "step": 46597 + }, + { + "epoch": 2.28, + "grad_norm": 0.709077000617981, + "learning_rate": 8.052477587546486e-05, + "loss": 2.8769, + "step": 46598 + }, + { + "epoch": 2.28, + "grad_norm": 0.7226983308792114, + "learning_rate": 8.051427979281779e-05, + "loss": 2.8338, + "step": 46599 + }, + { + "epoch": 2.28, + "grad_norm": 0.7126448154449463, + "learning_rate": 8.050378428825398e-05, + "loss": 2.8205, + "step": 46600 + }, + { + "epoch": 2.28, + "grad_norm": 0.6979407668113708, + "learning_rate": 8.049328936180087e-05, + "loss": 2.9157, + "step": 46601 + }, + { + "epoch": 2.28, + "grad_norm": 0.7206785678863525, + "learning_rate": 8.048279501348624e-05, + "loss": 2.8154, + "step": 46602 + }, + { + "epoch": 2.28, + "grad_norm": 0.711652934551239, + "learning_rate": 8.047230124333771e-05, + "loss": 3.0502, + "step": 46603 + }, + { + "epoch": 2.28, + "grad_norm": 1.0295687913894653, + "learning_rate": 8.04618080513828e-05, + "loss": 2.8277, + "step": 46604 + }, + { + "epoch": 2.28, + "grad_norm": 0.7558762431144714, + "learning_rate": 8.045131543764933e-05, + "loss": 2.846, + "step": 46605 + }, + { + "epoch": 2.28, + "grad_norm": 0.7087622284889221, + "learning_rate": 8.044082340216474e-05, + "loss": 2.8821, + "step": 46606 + }, + { + "epoch": 2.28, + "grad_norm": 0.7025753259658813, + "learning_rate": 8.043033194495678e-05, + "loss": 3.1409, + "step": 46607 + }, + { + "epoch": 2.28, + "grad_norm": 0.6761602163314819, + "learning_rate": 8.041984106605316e-05, + "loss": 2.7138, + "step": 46608 + }, + { + "epoch": 2.28, + "grad_norm": 0.7491973042488098, + "learning_rate": 8.040935076548135e-05, + "loss": 2.8269, + "step": 46609 + }, + { + "epoch": 2.28, + "grad_norm": 0.7301766276359558, + "learning_rate": 8.039886104326921e-05, + "loss": 3.1797, + "step": 46610 + }, + { + "epoch": 2.28, + "grad_norm": 0.6868095993995667, + "learning_rate": 8.0388371899444e-05, + "loss": 3.0877, + "step": 46611 + }, + { + "epoch": 2.28, + "grad_norm": 0.7199649214744568, + "learning_rate": 8.037788333403357e-05, + "loss": 3.0416, + "step": 46612 + }, + { + "epoch": 2.28, + "grad_norm": 0.6923564076423645, + "learning_rate": 8.036739534706563e-05, + "loss": 2.9795, + "step": 46613 + }, + { + "epoch": 2.28, + "grad_norm": 0.7084708213806152, + "learning_rate": 8.03569079385676e-05, + "loss": 2.8675, + "step": 46614 + }, + { + "epoch": 2.28, + "grad_norm": 0.7493816614151001, + "learning_rate": 8.03464211085673e-05, + "loss": 3.133, + "step": 46615 + }, + { + "epoch": 2.28, + "grad_norm": 0.6935444474220276, + "learning_rate": 8.033593485709214e-05, + "loss": 2.8476, + "step": 46616 + }, + { + "epoch": 2.28, + "grad_norm": 0.7323094606399536, + "learning_rate": 8.032544918416996e-05, + "loss": 3.0348, + "step": 46617 + }, + { + "epoch": 2.28, + "grad_norm": 0.7361232042312622, + "learning_rate": 8.03149640898283e-05, + "loss": 2.812, + "step": 46618 + }, + { + "epoch": 2.28, + "grad_norm": 0.7444538474082947, + "learning_rate": 8.030447957409464e-05, + "loss": 2.874, + "step": 46619 + }, + { + "epoch": 2.28, + "grad_norm": 0.718134343624115, + "learning_rate": 8.029399563699674e-05, + "loss": 2.8746, + "step": 46620 + }, + { + "epoch": 2.28, + "grad_norm": 0.7226566076278687, + "learning_rate": 8.028351227856213e-05, + "loss": 3.0168, + "step": 46621 + }, + { + "epoch": 2.28, + "grad_norm": 0.844632089138031, + "learning_rate": 8.027302949881845e-05, + "loss": 2.8895, + "step": 46622 + }, + { + "epoch": 2.28, + "grad_norm": 0.7833714485168457, + "learning_rate": 8.026254729779344e-05, + "loss": 2.9055, + "step": 46623 + }, + { + "epoch": 2.28, + "grad_norm": 0.7003939747810364, + "learning_rate": 8.025206567551455e-05, + "loss": 3.0071, + "step": 46624 + }, + { + "epoch": 2.28, + "grad_norm": 0.6972513794898987, + "learning_rate": 8.024158463200946e-05, + "loss": 2.9896, + "step": 46625 + }, + { + "epoch": 2.29, + "grad_norm": 0.7376452684402466, + "learning_rate": 8.023110416730564e-05, + "loss": 2.8573, + "step": 46626 + }, + { + "epoch": 2.29, + "grad_norm": 0.720854640007019, + "learning_rate": 8.022062428143076e-05, + "loss": 3.1533, + "step": 46627 + }, + { + "epoch": 2.29, + "grad_norm": 0.7451248168945312, + "learning_rate": 8.02101449744126e-05, + "loss": 3.0563, + "step": 46628 + }, + { + "epoch": 2.29, + "grad_norm": 0.7125359773635864, + "learning_rate": 8.019966624627852e-05, + "loss": 2.8611, + "step": 46629 + }, + { + "epoch": 2.29, + "grad_norm": 0.7607391476631165, + "learning_rate": 8.018918809705628e-05, + "loss": 2.8573, + "step": 46630 + }, + { + "epoch": 2.29, + "grad_norm": 0.7929643988609314, + "learning_rate": 8.017871052677335e-05, + "loss": 2.8047, + "step": 46631 + }, + { + "epoch": 2.29, + "grad_norm": 0.7148974537849426, + "learning_rate": 8.016823353545749e-05, + "loss": 2.8755, + "step": 46632 + }, + { + "epoch": 2.29, + "grad_norm": 0.8018413782119751, + "learning_rate": 8.015775712313616e-05, + "loss": 2.7893, + "step": 46633 + }, + { + "epoch": 2.29, + "grad_norm": 0.6894913911819458, + "learning_rate": 8.014728128983693e-05, + "loss": 2.7284, + "step": 46634 + }, + { + "epoch": 2.29, + "grad_norm": 0.7120516896247864, + "learning_rate": 8.013680603558754e-05, + "loss": 2.9376, + "step": 46635 + }, + { + "epoch": 2.29, + "grad_norm": 0.7336519360542297, + "learning_rate": 8.012633136041537e-05, + "loss": 2.8246, + "step": 46636 + }, + { + "epoch": 2.29, + "grad_norm": 0.7027100920677185, + "learning_rate": 8.011585726434813e-05, + "loss": 2.955, + "step": 46637 + }, + { + "epoch": 2.29, + "grad_norm": 0.7373955249786377, + "learning_rate": 8.010538374741355e-05, + "loss": 2.8527, + "step": 46638 + }, + { + "epoch": 2.29, + "grad_norm": 0.6577633619308472, + "learning_rate": 8.0094910809639e-05, + "loss": 2.8324, + "step": 46639 + }, + { + "epoch": 2.29, + "grad_norm": 0.7087864279747009, + "learning_rate": 8.008443845105216e-05, + "loss": 2.7932, + "step": 46640 + }, + { + "epoch": 2.29, + "grad_norm": 0.7115554809570312, + "learning_rate": 8.007396667168052e-05, + "loss": 2.8594, + "step": 46641 + }, + { + "epoch": 2.29, + "grad_norm": 0.7160695791244507, + "learning_rate": 8.006349547155171e-05, + "loss": 2.9263, + "step": 46642 + }, + { + "epoch": 2.29, + "grad_norm": 0.7285076379776001, + "learning_rate": 8.005302485069339e-05, + "loss": 2.9813, + "step": 46643 + }, + { + "epoch": 2.29, + "grad_norm": 0.7814637422561646, + "learning_rate": 8.004255480913302e-05, + "loss": 2.8582, + "step": 46644 + }, + { + "epoch": 2.29, + "grad_norm": 0.6893579959869385, + "learning_rate": 8.00320853468983e-05, + "loss": 2.6703, + "step": 46645 + }, + { + "epoch": 2.29, + "grad_norm": 0.8030416965484619, + "learning_rate": 8.002161646401671e-05, + "loss": 2.9652, + "step": 46646 + }, + { + "epoch": 2.29, + "grad_norm": 0.7509509325027466, + "learning_rate": 8.00111481605158e-05, + "loss": 2.639, + "step": 46647 + }, + { + "epoch": 2.29, + "grad_norm": 0.6878439784049988, + "learning_rate": 8.000068043642325e-05, + "loss": 3.2898, + "step": 46648 + }, + { + "epoch": 2.29, + "grad_norm": 0.6887226104736328, + "learning_rate": 7.999021329176649e-05, + "loss": 2.7212, + "step": 46649 + }, + { + "epoch": 2.29, + "grad_norm": 0.7231206297874451, + "learning_rate": 7.997974672657325e-05, + "loss": 2.9418, + "step": 46650 + }, + { + "epoch": 2.29, + "grad_norm": 0.7704378366470337, + "learning_rate": 7.996928074087092e-05, + "loss": 2.7953, + "step": 46651 + }, + { + "epoch": 2.29, + "grad_norm": 0.6910778284072876, + "learning_rate": 7.995881533468717e-05, + "loss": 3.2411, + "step": 46652 + }, + { + "epoch": 2.29, + "grad_norm": 0.6970378160476685, + "learning_rate": 7.994835050804966e-05, + "loss": 2.6902, + "step": 46653 + }, + { + "epoch": 2.29, + "grad_norm": 0.6877171993255615, + "learning_rate": 7.993788626098583e-05, + "loss": 2.701, + "step": 46654 + }, + { + "epoch": 2.29, + "grad_norm": 0.7191632390022278, + "learning_rate": 7.992742259352324e-05, + "loss": 2.7879, + "step": 46655 + }, + { + "epoch": 2.29, + "grad_norm": 0.7293527722358704, + "learning_rate": 7.991695950568939e-05, + "loss": 2.9878, + "step": 46656 + }, + { + "epoch": 2.29, + "grad_norm": 0.6775935292243958, + "learning_rate": 7.99064969975119e-05, + "loss": 2.8923, + "step": 46657 + }, + { + "epoch": 2.29, + "grad_norm": 0.7595655918121338, + "learning_rate": 7.989603506901842e-05, + "loss": 2.995, + "step": 46658 + }, + { + "epoch": 2.29, + "grad_norm": 0.6996396780014038, + "learning_rate": 7.988557372023637e-05, + "loss": 2.8157, + "step": 46659 + }, + { + "epoch": 2.29, + "grad_norm": 0.7048630714416504, + "learning_rate": 7.987511295119342e-05, + "loss": 2.8022, + "step": 46660 + }, + { + "epoch": 2.29, + "grad_norm": 0.7288792133331299, + "learning_rate": 7.986465276191708e-05, + "loss": 2.8384, + "step": 46661 + }, + { + "epoch": 2.29, + "grad_norm": 0.6870715618133545, + "learning_rate": 7.985419315243476e-05, + "loss": 3.0466, + "step": 46662 + }, + { + "epoch": 2.29, + "grad_norm": 0.7056323289871216, + "learning_rate": 7.984373412277422e-05, + "loss": 2.8514, + "step": 46663 + }, + { + "epoch": 2.29, + "grad_norm": 0.6987048387527466, + "learning_rate": 7.983327567296284e-05, + "loss": 3.0196, + "step": 46664 + }, + { + "epoch": 2.29, + "grad_norm": 0.7563616037368774, + "learning_rate": 7.982281780302834e-05, + "loss": 2.9289, + "step": 46665 + }, + { + "epoch": 2.29, + "grad_norm": 0.684380054473877, + "learning_rate": 7.981236051299806e-05, + "loss": 2.8782, + "step": 46666 + }, + { + "epoch": 2.29, + "grad_norm": 0.715981662273407, + "learning_rate": 7.980190380289973e-05, + "loss": 2.9129, + "step": 46667 + }, + { + "epoch": 2.29, + "grad_norm": 0.7292078733444214, + "learning_rate": 7.97914476727608e-05, + "loss": 2.8948, + "step": 46668 + }, + { + "epoch": 2.29, + "grad_norm": 0.791476845741272, + "learning_rate": 7.978099212260877e-05, + "loss": 2.6953, + "step": 46669 + }, + { + "epoch": 2.29, + "grad_norm": 0.7197558879852295, + "learning_rate": 7.977053715247127e-05, + "loss": 2.9594, + "step": 46670 + }, + { + "epoch": 2.29, + "grad_norm": 0.7770285606384277, + "learning_rate": 7.976008276237572e-05, + "loss": 2.7839, + "step": 46671 + }, + { + "epoch": 2.29, + "grad_norm": 0.6870531439781189, + "learning_rate": 7.97496289523498e-05, + "loss": 2.8836, + "step": 46672 + }, + { + "epoch": 2.29, + "grad_norm": 0.7081472277641296, + "learning_rate": 7.973917572242091e-05, + "loss": 2.8042, + "step": 46673 + }, + { + "epoch": 2.29, + "grad_norm": 0.702302098274231, + "learning_rate": 7.972872307261661e-05, + "loss": 2.8638, + "step": 46674 + }, + { + "epoch": 2.29, + "grad_norm": 0.7121188044548035, + "learning_rate": 7.971827100296456e-05, + "loss": 2.8277, + "step": 46675 + }, + { + "epoch": 2.29, + "grad_norm": 0.7616391777992249, + "learning_rate": 7.970781951349218e-05, + "loss": 2.8267, + "step": 46676 + }, + { + "epoch": 2.29, + "grad_norm": 0.7238693237304688, + "learning_rate": 7.969736860422704e-05, + "loss": 3.0476, + "step": 46677 + }, + { + "epoch": 2.29, + "grad_norm": 0.7182580828666687, + "learning_rate": 7.968691827519653e-05, + "loss": 3.1638, + "step": 46678 + }, + { + "epoch": 2.29, + "grad_norm": 0.708895742893219, + "learning_rate": 7.967646852642827e-05, + "loss": 3.0584, + "step": 46679 + }, + { + "epoch": 2.29, + "grad_norm": 0.7667748332023621, + "learning_rate": 7.96660193579499e-05, + "loss": 2.935, + "step": 46680 + }, + { + "epoch": 2.29, + "grad_norm": 0.7279300093650818, + "learning_rate": 7.965557076978871e-05, + "loss": 2.8778, + "step": 46681 + }, + { + "epoch": 2.29, + "grad_norm": 0.6909590363502502, + "learning_rate": 7.964512276197244e-05, + "loss": 2.6265, + "step": 46682 + }, + { + "epoch": 2.29, + "grad_norm": 0.7029200792312622, + "learning_rate": 7.963467533452851e-05, + "loss": 2.8785, + "step": 46683 + }, + { + "epoch": 2.29, + "grad_norm": 0.6875868439674377, + "learning_rate": 7.962422848748434e-05, + "loss": 2.9, + "step": 46684 + }, + { + "epoch": 2.29, + "grad_norm": 0.7104182243347168, + "learning_rate": 7.961378222086765e-05, + "loss": 2.7144, + "step": 46685 + }, + { + "epoch": 2.29, + "grad_norm": 0.7271749973297119, + "learning_rate": 7.960333653470575e-05, + "loss": 2.947, + "step": 46686 + }, + { + "epoch": 2.29, + "grad_norm": 0.7475045919418335, + "learning_rate": 7.959289142902633e-05, + "loss": 2.7681, + "step": 46687 + }, + { + "epoch": 2.29, + "grad_norm": 0.7369611263275146, + "learning_rate": 7.958244690385669e-05, + "loss": 3.0237, + "step": 46688 + }, + { + "epoch": 2.29, + "grad_norm": 0.7487080097198486, + "learning_rate": 7.95720029592246e-05, + "loss": 2.9306, + "step": 46689 + }, + { + "epoch": 2.29, + "grad_norm": 0.7130888104438782, + "learning_rate": 7.956155959515741e-05, + "loss": 2.7723, + "step": 46690 + }, + { + "epoch": 2.29, + "grad_norm": 0.7896283268928528, + "learning_rate": 7.955111681168256e-05, + "loss": 2.7895, + "step": 46691 + }, + { + "epoch": 2.29, + "grad_norm": 0.783572256565094, + "learning_rate": 7.954067460882776e-05, + "loss": 2.797, + "step": 46692 + }, + { + "epoch": 2.29, + "grad_norm": 0.7980788350105286, + "learning_rate": 7.953023298662029e-05, + "loss": 2.9751, + "step": 46693 + }, + { + "epoch": 2.29, + "grad_norm": 0.7563785314559937, + "learning_rate": 7.951979194508772e-05, + "loss": 2.9569, + "step": 46694 + }, + { + "epoch": 2.29, + "grad_norm": 0.7462531328201294, + "learning_rate": 7.950935148425772e-05, + "loss": 2.9621, + "step": 46695 + }, + { + "epoch": 2.29, + "grad_norm": 0.7288581132888794, + "learning_rate": 7.949891160415755e-05, + "loss": 2.8099, + "step": 46696 + }, + { + "epoch": 2.29, + "grad_norm": 0.7490408420562744, + "learning_rate": 7.948847230481491e-05, + "loss": 2.7926, + "step": 46697 + }, + { + "epoch": 2.29, + "grad_norm": 0.726197361946106, + "learning_rate": 7.947803358625716e-05, + "loss": 2.9515, + "step": 46698 + }, + { + "epoch": 2.29, + "grad_norm": 0.7250626087188721, + "learning_rate": 7.946759544851178e-05, + "loss": 2.9783, + "step": 46699 + }, + { + "epoch": 2.29, + "grad_norm": 0.7216305732727051, + "learning_rate": 7.945715789160637e-05, + "loss": 2.7268, + "step": 46700 + }, + { + "epoch": 2.29, + "grad_norm": 0.7879276275634766, + "learning_rate": 7.944672091556829e-05, + "loss": 2.9204, + "step": 46701 + }, + { + "epoch": 2.29, + "grad_norm": 0.7333400845527649, + "learning_rate": 7.94362845204252e-05, + "loss": 2.8761, + "step": 46702 + }, + { + "epoch": 2.29, + "grad_norm": 0.7444015145301819, + "learning_rate": 7.942584870620439e-05, + "loss": 3.0149, + "step": 46703 + }, + { + "epoch": 2.29, + "grad_norm": 0.6867544651031494, + "learning_rate": 7.941541347293353e-05, + "loss": 2.848, + "step": 46704 + }, + { + "epoch": 2.29, + "grad_norm": 0.7290630340576172, + "learning_rate": 7.940497882064e-05, + "loss": 3.0744, + "step": 46705 + }, + { + "epoch": 2.29, + "grad_norm": 0.7640529870986938, + "learning_rate": 7.939454474935122e-05, + "loss": 3.1576, + "step": 46706 + }, + { + "epoch": 2.29, + "grad_norm": 0.7068827748298645, + "learning_rate": 7.938411125909487e-05, + "loss": 2.9971, + "step": 46707 + }, + { + "epoch": 2.29, + "grad_norm": 0.6828535199165344, + "learning_rate": 7.937367834989821e-05, + "loss": 2.9971, + "step": 46708 + }, + { + "epoch": 2.29, + "grad_norm": 0.6947302222251892, + "learning_rate": 7.93632460217888e-05, + "loss": 2.9175, + "step": 46709 + }, + { + "epoch": 2.29, + "grad_norm": 0.7065187096595764, + "learning_rate": 7.935281427479425e-05, + "loss": 3.0409, + "step": 46710 + }, + { + "epoch": 2.29, + "grad_norm": 0.7438076734542847, + "learning_rate": 7.934238310894183e-05, + "loss": 2.8061, + "step": 46711 + }, + { + "epoch": 2.29, + "grad_norm": 0.7637118101119995, + "learning_rate": 7.933195252425929e-05, + "loss": 2.8936, + "step": 46712 + }, + { + "epoch": 2.29, + "grad_norm": 0.7260427474975586, + "learning_rate": 7.932152252077374e-05, + "loss": 2.9526, + "step": 46713 + }, + { + "epoch": 2.29, + "grad_norm": 0.7055556774139404, + "learning_rate": 7.931109309851283e-05, + "loss": 2.802, + "step": 46714 + }, + { + "epoch": 2.29, + "grad_norm": 0.7367199659347534, + "learning_rate": 7.93006642575041e-05, + "loss": 3.0472, + "step": 46715 + }, + { + "epoch": 2.29, + "grad_norm": 0.6921574473381042, + "learning_rate": 7.929023599777484e-05, + "loss": 2.6497, + "step": 46716 + }, + { + "epoch": 2.29, + "grad_norm": 0.6976795792579651, + "learning_rate": 7.927980831935276e-05, + "loss": 2.7202, + "step": 46717 + }, + { + "epoch": 2.29, + "grad_norm": 0.7396517992019653, + "learning_rate": 7.926938122226508e-05, + "loss": 3.1099, + "step": 46718 + }, + { + "epoch": 2.29, + "grad_norm": 0.7003223299980164, + "learning_rate": 7.925895470653943e-05, + "loss": 2.8682, + "step": 46719 + }, + { + "epoch": 2.29, + "grad_norm": 0.7250968217849731, + "learning_rate": 7.924852877220324e-05, + "loss": 3.0162, + "step": 46720 + }, + { + "epoch": 2.29, + "grad_norm": 0.7336358428001404, + "learning_rate": 7.923810341928385e-05, + "loss": 2.7529, + "step": 46721 + }, + { + "epoch": 2.29, + "grad_norm": 0.6997026205062866, + "learning_rate": 7.92276786478089e-05, + "loss": 2.8156, + "step": 46722 + }, + { + "epoch": 2.29, + "grad_norm": 0.7014487981796265, + "learning_rate": 7.921725445780567e-05, + "loss": 3.0013, + "step": 46723 + }, + { + "epoch": 2.29, + "grad_norm": 0.6987916827201843, + "learning_rate": 7.920683084930166e-05, + "loss": 3.029, + "step": 46724 + }, + { + "epoch": 2.29, + "grad_norm": 0.7134968042373657, + "learning_rate": 7.919640782232451e-05, + "loss": 2.8053, + "step": 46725 + }, + { + "epoch": 2.29, + "grad_norm": 0.7232719659805298, + "learning_rate": 7.91859853769015e-05, + "loss": 3.2245, + "step": 46726 + }, + { + "epoch": 2.29, + "grad_norm": 0.718622624874115, + "learning_rate": 7.917556351306011e-05, + "loss": 2.8381, + "step": 46727 + }, + { + "epoch": 2.29, + "grad_norm": 0.708051860332489, + "learning_rate": 7.916514223082768e-05, + "loss": 2.9364, + "step": 46728 + }, + { + "epoch": 2.29, + "grad_norm": 0.7392677068710327, + "learning_rate": 7.91547215302318e-05, + "loss": 3.0071, + "step": 46729 + }, + { + "epoch": 2.29, + "grad_norm": 0.7383646368980408, + "learning_rate": 7.914430141129994e-05, + "loss": 2.6716, + "step": 46730 + }, + { + "epoch": 2.29, + "grad_norm": 0.6965837478637695, + "learning_rate": 7.913388187405943e-05, + "loss": 2.9672, + "step": 46731 + }, + { + "epoch": 2.29, + "grad_norm": 0.7411499619483948, + "learning_rate": 7.912346291853783e-05, + "loss": 2.9199, + "step": 46732 + }, + { + "epoch": 2.29, + "grad_norm": 0.7892962098121643, + "learning_rate": 7.911304454476244e-05, + "loss": 2.9208, + "step": 46733 + }, + { + "epoch": 2.29, + "grad_norm": 0.6945703029632568, + "learning_rate": 7.910262675276087e-05, + "loss": 2.8022, + "step": 46734 + }, + { + "epoch": 2.29, + "grad_norm": 0.7150418162345886, + "learning_rate": 7.909220954256046e-05, + "loss": 2.8295, + "step": 46735 + }, + { + "epoch": 2.29, + "grad_norm": 0.6972780823707581, + "learning_rate": 7.90817929141886e-05, + "loss": 2.9666, + "step": 46736 + }, + { + "epoch": 2.29, + "grad_norm": 0.7024328708648682, + "learning_rate": 7.907137686767281e-05, + "loss": 2.7697, + "step": 46737 + }, + { + "epoch": 2.29, + "grad_norm": 0.7076515555381775, + "learning_rate": 7.906096140304046e-05, + "loss": 2.7878, + "step": 46738 + }, + { + "epoch": 2.29, + "grad_norm": 0.7264235615730286, + "learning_rate": 7.905054652031897e-05, + "loss": 2.7891, + "step": 46739 + }, + { + "epoch": 2.29, + "grad_norm": 0.7383031845092773, + "learning_rate": 7.904013221953594e-05, + "loss": 3.013, + "step": 46740 + }, + { + "epoch": 2.29, + "grad_norm": 0.7196172475814819, + "learning_rate": 7.90297185007187e-05, + "loss": 2.5622, + "step": 46741 + }, + { + "epoch": 2.29, + "grad_norm": 0.7266930341720581, + "learning_rate": 7.901930536389461e-05, + "loss": 3.0448, + "step": 46742 + }, + { + "epoch": 2.29, + "grad_norm": 0.6897668242454529, + "learning_rate": 7.900889280909106e-05, + "loss": 2.7908, + "step": 46743 + }, + { + "epoch": 2.29, + "grad_norm": 0.7381284832954407, + "learning_rate": 7.899848083633556e-05, + "loss": 2.889, + "step": 46744 + }, + { + "epoch": 2.29, + "grad_norm": 0.6909975409507751, + "learning_rate": 7.898806944565563e-05, + "loss": 3.0759, + "step": 46745 + }, + { + "epoch": 2.29, + "grad_norm": 0.791771411895752, + "learning_rate": 7.897765863707848e-05, + "loss": 2.8085, + "step": 46746 + }, + { + "epoch": 2.29, + "grad_norm": 0.7384874224662781, + "learning_rate": 7.896724841063176e-05, + "loss": 2.9682, + "step": 46747 + }, + { + "epoch": 2.29, + "grad_norm": 0.7547340989112854, + "learning_rate": 7.895683876634274e-05, + "loss": 2.9999, + "step": 46748 + }, + { + "epoch": 2.29, + "grad_norm": 0.7127857208251953, + "learning_rate": 7.894642970423878e-05, + "loss": 2.5953, + "step": 46749 + }, + { + "epoch": 2.29, + "grad_norm": 0.7140387296676636, + "learning_rate": 7.893602122434748e-05, + "loss": 2.8312, + "step": 46750 + }, + { + "epoch": 2.29, + "grad_norm": 0.6861268877983093, + "learning_rate": 7.892561332669609e-05, + "loss": 2.9191, + "step": 46751 + }, + { + "epoch": 2.29, + "grad_norm": 0.731544017791748, + "learning_rate": 7.891520601131213e-05, + "loss": 2.9862, + "step": 46752 + }, + { + "epoch": 2.29, + "grad_norm": 0.7233723402023315, + "learning_rate": 7.89047992782229e-05, + "loss": 3.1886, + "step": 46753 + }, + { + "epoch": 2.29, + "grad_norm": 0.7004035115242004, + "learning_rate": 7.889439312745598e-05, + "loss": 2.8099, + "step": 46754 + }, + { + "epoch": 2.29, + "grad_norm": 0.7136520743370056, + "learning_rate": 7.888398755903857e-05, + "loss": 2.9018, + "step": 46755 + }, + { + "epoch": 2.29, + "grad_norm": 0.7034738659858704, + "learning_rate": 7.88735825729983e-05, + "loss": 2.9447, + "step": 46756 + }, + { + "epoch": 2.29, + "grad_norm": 0.7353273630142212, + "learning_rate": 7.886317816936242e-05, + "loss": 2.9035, + "step": 46757 + }, + { + "epoch": 2.29, + "grad_norm": 0.7724609971046448, + "learning_rate": 7.885277434815828e-05, + "loss": 2.8167, + "step": 46758 + }, + { + "epoch": 2.29, + "grad_norm": 0.9952621459960938, + "learning_rate": 7.884237110941348e-05, + "loss": 3.0053, + "step": 46759 + }, + { + "epoch": 2.29, + "grad_norm": 0.7721501588821411, + "learning_rate": 7.883196845315521e-05, + "loss": 2.9342, + "step": 46760 + }, + { + "epoch": 2.29, + "grad_norm": 0.7446574568748474, + "learning_rate": 7.882156637941098e-05, + "loss": 2.8327, + "step": 46761 + }, + { + "epoch": 2.29, + "grad_norm": 0.7871111631393433, + "learning_rate": 7.881116488820827e-05, + "loss": 2.8406, + "step": 46762 + }, + { + "epoch": 2.29, + "grad_norm": 0.7120261788368225, + "learning_rate": 7.880076397957438e-05, + "loss": 2.7964, + "step": 46763 + }, + { + "epoch": 2.29, + "grad_norm": 0.8707037568092346, + "learning_rate": 7.879036365353672e-05, + "loss": 2.9292, + "step": 46764 + }, + { + "epoch": 2.29, + "grad_norm": 0.699736475944519, + "learning_rate": 7.877996391012256e-05, + "loss": 2.8143, + "step": 46765 + }, + { + "epoch": 2.29, + "grad_norm": 0.731170117855072, + "learning_rate": 7.876956474935941e-05, + "loss": 2.8974, + "step": 46766 + }, + { + "epoch": 2.29, + "grad_norm": 0.7486166954040527, + "learning_rate": 7.875916617127475e-05, + "loss": 3.0931, + "step": 46767 + }, + { + "epoch": 2.29, + "grad_norm": 0.6976475119590759, + "learning_rate": 7.874876817589572e-05, + "loss": 3.0062, + "step": 46768 + }, + { + "epoch": 2.29, + "grad_norm": 0.76833176612854, + "learning_rate": 7.873837076325001e-05, + "loss": 2.9005, + "step": 46769 + }, + { + "epoch": 2.29, + "grad_norm": 0.7090509533882141, + "learning_rate": 7.872797393336482e-05, + "loss": 3.1187, + "step": 46770 + }, + { + "epoch": 2.29, + "grad_norm": 0.7460590600967407, + "learning_rate": 7.871757768626749e-05, + "loss": 2.9877, + "step": 46771 + }, + { + "epoch": 2.29, + "grad_norm": 0.7121726870536804, + "learning_rate": 7.870718202198555e-05, + "loss": 2.9237, + "step": 46772 + }, + { + "epoch": 2.29, + "grad_norm": 0.7376371026039124, + "learning_rate": 7.869678694054622e-05, + "loss": 2.9882, + "step": 46773 + }, + { + "epoch": 2.29, + "grad_norm": 0.7384995818138123, + "learning_rate": 7.868639244197704e-05, + "loss": 2.9219, + "step": 46774 + }, + { + "epoch": 2.29, + "grad_norm": 0.8270297646522522, + "learning_rate": 7.867599852630523e-05, + "loss": 2.8947, + "step": 46775 + }, + { + "epoch": 2.29, + "grad_norm": 0.765021026134491, + "learning_rate": 7.866560519355825e-05, + "loss": 2.8757, + "step": 46776 + }, + { + "epoch": 2.29, + "grad_norm": 0.6961572170257568, + "learning_rate": 7.865521244376353e-05, + "loss": 3.0606, + "step": 46777 + }, + { + "epoch": 2.29, + "grad_norm": 0.7141311764717102, + "learning_rate": 7.864482027694842e-05, + "loss": 2.6212, + "step": 46778 + }, + { + "epoch": 2.29, + "grad_norm": 0.6754100322723389, + "learning_rate": 7.863442869314024e-05, + "loss": 2.7227, + "step": 46779 + }, + { + "epoch": 2.29, + "grad_norm": 0.7049227952957153, + "learning_rate": 7.862403769236628e-05, + "loss": 3.0977, + "step": 46780 + }, + { + "epoch": 2.29, + "grad_norm": 0.780094563961029, + "learning_rate": 7.861364727465399e-05, + "loss": 2.8449, + "step": 46781 + }, + { + "epoch": 2.29, + "grad_norm": 0.6903862357139587, + "learning_rate": 7.860325744003085e-05, + "loss": 3.1181, + "step": 46782 + }, + { + "epoch": 2.29, + "grad_norm": 0.6869261860847473, + "learning_rate": 7.859286818852401e-05, + "loss": 2.9011, + "step": 46783 + }, + { + "epoch": 2.29, + "grad_norm": 0.7356374859809875, + "learning_rate": 7.858247952016106e-05, + "loss": 3.0236, + "step": 46784 + }, + { + "epoch": 2.29, + "grad_norm": 0.6988884210586548, + "learning_rate": 7.857209143496925e-05, + "loss": 2.8257, + "step": 46785 + }, + { + "epoch": 2.29, + "grad_norm": 0.7464091777801514, + "learning_rate": 7.856170393297582e-05, + "loss": 2.5629, + "step": 46786 + }, + { + "epoch": 2.29, + "grad_norm": 0.7155094742774963, + "learning_rate": 7.855131701420836e-05, + "loss": 3.0864, + "step": 46787 + }, + { + "epoch": 2.29, + "grad_norm": 0.7148899435997009, + "learning_rate": 7.854093067869401e-05, + "loss": 2.9552, + "step": 46788 + }, + { + "epoch": 2.29, + "grad_norm": 0.7054211497306824, + "learning_rate": 7.853054492646033e-05, + "loss": 3.0602, + "step": 46789 + }, + { + "epoch": 2.29, + "grad_norm": 0.7065744996070862, + "learning_rate": 7.852015975753447e-05, + "loss": 2.9686, + "step": 46790 + }, + { + "epoch": 2.29, + "grad_norm": 0.7648019194602966, + "learning_rate": 7.850977517194397e-05, + "loss": 3.0381, + "step": 46791 + }, + { + "epoch": 2.29, + "grad_norm": 0.7037829756736755, + "learning_rate": 7.849939116971608e-05, + "loss": 3.0974, + "step": 46792 + }, + { + "epoch": 2.29, + "grad_norm": 0.7240031957626343, + "learning_rate": 7.84890077508781e-05, + "loss": 3.1276, + "step": 46793 + }, + { + "epoch": 2.29, + "grad_norm": 0.7773902416229248, + "learning_rate": 7.847862491545755e-05, + "loss": 2.8614, + "step": 46794 + }, + { + "epoch": 2.29, + "grad_norm": 0.708374559879303, + "learning_rate": 7.846824266348153e-05, + "loss": 2.7868, + "step": 46795 + }, + { + "epoch": 2.29, + "grad_norm": 0.7300131916999817, + "learning_rate": 7.845786099497757e-05, + "loss": 2.8318, + "step": 46796 + }, + { + "epoch": 2.29, + "grad_norm": 0.7228390574455261, + "learning_rate": 7.844747990997305e-05, + "loss": 3.0077, + "step": 46797 + }, + { + "epoch": 2.29, + "grad_norm": 0.6896500587463379, + "learning_rate": 7.843709940849513e-05, + "loss": 2.915, + "step": 46798 + }, + { + "epoch": 2.29, + "grad_norm": 0.7400826811790466, + "learning_rate": 7.842671949057134e-05, + "loss": 3.0408, + "step": 46799 + }, + { + "epoch": 2.29, + "grad_norm": 0.7104637622833252, + "learning_rate": 7.841634015622891e-05, + "loss": 2.9581, + "step": 46800 + }, + { + "epoch": 2.29, + "grad_norm": 0.7338810563087463, + "learning_rate": 7.840596140549514e-05, + "loss": 3.0433, + "step": 46801 + }, + { + "epoch": 2.29, + "grad_norm": 0.727415919303894, + "learning_rate": 7.83955832383975e-05, + "loss": 3.0346, + "step": 46802 + }, + { + "epoch": 2.29, + "grad_norm": 0.7342788577079773, + "learning_rate": 7.838520565496314e-05, + "loss": 2.829, + "step": 46803 + }, + { + "epoch": 2.29, + "grad_norm": 0.7318736910820007, + "learning_rate": 7.837482865521959e-05, + "loss": 3.0128, + "step": 46804 + }, + { + "epoch": 2.29, + "grad_norm": 0.7315552830696106, + "learning_rate": 7.836445223919401e-05, + "loss": 3.0684, + "step": 46805 + }, + { + "epoch": 2.29, + "grad_norm": 0.7073324918746948, + "learning_rate": 7.835407640691388e-05, + "loss": 2.7697, + "step": 46806 + }, + { + "epoch": 2.29, + "grad_norm": 0.659433126449585, + "learning_rate": 7.834370115840647e-05, + "loss": 2.8127, + "step": 46807 + }, + { + "epoch": 2.29, + "grad_norm": 0.7416350245475769, + "learning_rate": 7.833332649369902e-05, + "loss": 2.9297, + "step": 46808 + }, + { + "epoch": 2.29, + "grad_norm": 0.6925668120384216, + "learning_rate": 7.832295241281898e-05, + "loss": 2.7095, + "step": 46809 + }, + { + "epoch": 2.29, + "grad_norm": 0.7669311761856079, + "learning_rate": 7.831257891579358e-05, + "loss": 2.9366, + "step": 46810 + }, + { + "epoch": 2.29, + "grad_norm": 0.7601991891860962, + "learning_rate": 7.830220600265014e-05, + "loss": 2.8255, + "step": 46811 + }, + { + "epoch": 2.29, + "grad_norm": 0.6947373747825623, + "learning_rate": 7.829183367341613e-05, + "loss": 2.9179, + "step": 46812 + }, + { + "epoch": 2.29, + "grad_norm": 0.7191194295883179, + "learning_rate": 7.828146192811877e-05, + "loss": 2.9629, + "step": 46813 + }, + { + "epoch": 2.29, + "grad_norm": 0.753840982913971, + "learning_rate": 7.827109076678533e-05, + "loss": 2.7047, + "step": 46814 + }, + { + "epoch": 2.29, + "grad_norm": 0.7168121933937073, + "learning_rate": 7.826072018944312e-05, + "loss": 3.094, + "step": 46815 + }, + { + "epoch": 2.29, + "grad_norm": 0.75309157371521, + "learning_rate": 7.825035019611948e-05, + "loss": 2.8804, + "step": 46816 + }, + { + "epoch": 2.29, + "grad_norm": 0.6898667216300964, + "learning_rate": 7.82399807868418e-05, + "loss": 3.1949, + "step": 46817 + }, + { + "epoch": 2.29, + "grad_norm": 0.7027104496955872, + "learning_rate": 7.82296119616373e-05, + "loss": 2.7947, + "step": 46818 + }, + { + "epoch": 2.29, + "grad_norm": 0.7549039125442505, + "learning_rate": 7.821924372053338e-05, + "loss": 3.0059, + "step": 46819 + }, + { + "epoch": 2.29, + "grad_norm": 0.7107914090156555, + "learning_rate": 7.82088760635572e-05, + "loss": 3.0119, + "step": 46820 + }, + { + "epoch": 2.29, + "grad_norm": 0.7330605387687683, + "learning_rate": 7.819850899073625e-05, + "loss": 3.0567, + "step": 46821 + }, + { + "epoch": 2.29, + "grad_norm": 0.7199442982673645, + "learning_rate": 7.818814250209775e-05, + "loss": 2.9784, + "step": 46822 + }, + { + "epoch": 2.29, + "grad_norm": 0.7049921751022339, + "learning_rate": 7.81777765976689e-05, + "loss": 3.0305, + "step": 46823 + }, + { + "epoch": 2.29, + "grad_norm": 0.7271491289138794, + "learning_rate": 7.816741127747716e-05, + "loss": 2.9861, + "step": 46824 + }, + { + "epoch": 2.29, + "grad_norm": 0.7731443643569946, + "learning_rate": 7.815704654154971e-05, + "loss": 2.9895, + "step": 46825 + }, + { + "epoch": 2.29, + "grad_norm": 0.7125991582870483, + "learning_rate": 7.81466823899139e-05, + "loss": 3.0968, + "step": 46826 + }, + { + "epoch": 2.29, + "grad_norm": 0.7187132239341736, + "learning_rate": 7.813631882259708e-05, + "loss": 2.8701, + "step": 46827 + }, + { + "epoch": 2.29, + "grad_norm": 0.7413823008537292, + "learning_rate": 7.812595583962653e-05, + "loss": 2.9285, + "step": 46828 + }, + { + "epoch": 2.29, + "grad_norm": 0.6879110932350159, + "learning_rate": 7.811559344102953e-05, + "loss": 3.0651, + "step": 46829 + }, + { + "epoch": 2.3, + "grad_norm": 0.6981056332588196, + "learning_rate": 7.810523162683323e-05, + "loss": 2.7413, + "step": 46830 + }, + { + "epoch": 2.3, + "grad_norm": 0.7995371222496033, + "learning_rate": 7.809487039706517e-05, + "loss": 2.7898, + "step": 46831 + }, + { + "epoch": 2.3, + "grad_norm": 0.6955015063285828, + "learning_rate": 7.808450975175239e-05, + "loss": 2.8881, + "step": 46832 + }, + { + "epoch": 2.3, + "grad_norm": 0.7273989319801331, + "learning_rate": 7.807414969092233e-05, + "loss": 3.0188, + "step": 46833 + }, + { + "epoch": 2.3, + "grad_norm": 0.7289535403251648, + "learning_rate": 7.806379021460233e-05, + "loss": 2.9289, + "step": 46834 + }, + { + "epoch": 2.3, + "grad_norm": 0.7082308530807495, + "learning_rate": 7.805343132281951e-05, + "loss": 2.8786, + "step": 46835 + }, + { + "epoch": 2.3, + "grad_norm": 0.736067533493042, + "learning_rate": 7.804307301560131e-05, + "loss": 3.1124, + "step": 46836 + }, + { + "epoch": 2.3, + "grad_norm": 0.751034677028656, + "learning_rate": 7.803271529297496e-05, + "loss": 2.9974, + "step": 46837 + }, + { + "epoch": 2.3, + "grad_norm": 0.7050460577011108, + "learning_rate": 7.802235815496762e-05, + "loss": 2.7239, + "step": 46838 + }, + { + "epoch": 2.3, + "grad_norm": 0.7277799844741821, + "learning_rate": 7.801200160160676e-05, + "loss": 3.0167, + "step": 46839 + }, + { + "epoch": 2.3, + "grad_norm": 0.7088861465454102, + "learning_rate": 7.800164563291947e-05, + "loss": 2.8003, + "step": 46840 + }, + { + "epoch": 2.3, + "grad_norm": 0.6924530267715454, + "learning_rate": 7.79912902489332e-05, + "loss": 2.7845, + "step": 46841 + }, + { + "epoch": 2.3, + "grad_norm": 0.7400920391082764, + "learning_rate": 7.798093544967509e-05, + "loss": 3.0177, + "step": 46842 + }, + { + "epoch": 2.3, + "grad_norm": 0.7346185445785522, + "learning_rate": 7.797058123517253e-05, + "loss": 3.1688, + "step": 46843 + }, + { + "epoch": 2.3, + "grad_norm": 0.7445531487464905, + "learning_rate": 7.796022760545272e-05, + "loss": 2.8967, + "step": 46844 + }, + { + "epoch": 2.3, + "grad_norm": 0.7266702651977539, + "learning_rate": 7.794987456054288e-05, + "loss": 2.9966, + "step": 46845 + }, + { + "epoch": 2.3, + "grad_norm": 0.7435703277587891, + "learning_rate": 7.793952210047039e-05, + "loss": 2.9924, + "step": 46846 + }, + { + "epoch": 2.3, + "grad_norm": 0.7759212255477905, + "learning_rate": 7.792917022526239e-05, + "loss": 2.6906, + "step": 46847 + }, + { + "epoch": 2.3, + "grad_norm": 0.7034785747528076, + "learning_rate": 7.791881893494622e-05, + "loss": 3.0356, + "step": 46848 + }, + { + "epoch": 2.3, + "grad_norm": 0.7417486906051636, + "learning_rate": 7.790846822954922e-05, + "loss": 2.8477, + "step": 46849 + }, + { + "epoch": 2.3, + "grad_norm": 0.73434978723526, + "learning_rate": 7.78981181090986e-05, + "loss": 2.7031, + "step": 46850 + }, + { + "epoch": 2.3, + "grad_norm": 0.7167014479637146, + "learning_rate": 7.788776857362157e-05, + "loss": 2.9902, + "step": 46851 + }, + { + "epoch": 2.3, + "grad_norm": 0.717901885509491, + "learning_rate": 7.787741962314533e-05, + "loss": 3.0099, + "step": 46852 + }, + { + "epoch": 2.3, + "grad_norm": 0.7119569778442383, + "learning_rate": 7.78670712576972e-05, + "loss": 2.8717, + "step": 46853 + }, + { + "epoch": 2.3, + "grad_norm": 0.7578343152999878, + "learning_rate": 7.785672347730459e-05, + "loss": 2.9561, + "step": 46854 + }, + { + "epoch": 2.3, + "grad_norm": 0.741181492805481, + "learning_rate": 7.784637628199448e-05, + "loss": 2.934, + "step": 46855 + }, + { + "epoch": 2.3, + "grad_norm": 0.7272793054580688, + "learning_rate": 7.783602967179438e-05, + "loss": 2.8121, + "step": 46856 + }, + { + "epoch": 2.3, + "grad_norm": 0.7096351981163025, + "learning_rate": 7.78256836467313e-05, + "loss": 3.0762, + "step": 46857 + }, + { + "epoch": 2.3, + "grad_norm": 0.703946590423584, + "learning_rate": 7.781533820683272e-05, + "loss": 2.9331, + "step": 46858 + }, + { + "epoch": 2.3, + "grad_norm": 0.7070743441581726, + "learning_rate": 7.780499335212576e-05, + "loss": 2.8825, + "step": 46859 + }, + { + "epoch": 2.3, + "grad_norm": 0.7188315987586975, + "learning_rate": 7.779464908263762e-05, + "loss": 2.9131, + "step": 46860 + }, + { + "epoch": 2.3, + "grad_norm": 0.6855785846710205, + "learning_rate": 7.778430539839569e-05, + "loss": 2.9938, + "step": 46861 + }, + { + "epoch": 2.3, + "grad_norm": 0.7094498872756958, + "learning_rate": 7.777396229942705e-05, + "loss": 2.9944, + "step": 46862 + }, + { + "epoch": 2.3, + "grad_norm": 0.7395783066749573, + "learning_rate": 7.776361978575902e-05, + "loss": 2.9292, + "step": 46863 + }, + { + "epoch": 2.3, + "grad_norm": 0.729633629322052, + "learning_rate": 7.775327785741894e-05, + "loss": 2.8902, + "step": 46864 + }, + { + "epoch": 2.3, + "grad_norm": 0.7409253716468811, + "learning_rate": 7.774293651443394e-05, + "loss": 3.0206, + "step": 46865 + }, + { + "epoch": 2.3, + "grad_norm": 0.7665625810623169, + "learning_rate": 7.773259575683131e-05, + "loss": 2.5951, + "step": 46866 + }, + { + "epoch": 2.3, + "grad_norm": 0.7642672657966614, + "learning_rate": 7.772225558463813e-05, + "loss": 2.8159, + "step": 46867 + }, + { + "epoch": 2.3, + "grad_norm": 0.7062388062477112, + "learning_rate": 7.771191599788174e-05, + "loss": 2.9574, + "step": 46868 + }, + { + "epoch": 2.3, + "grad_norm": 0.7327240705490112, + "learning_rate": 7.770157699658947e-05, + "loss": 2.9558, + "step": 46869 + }, + { + "epoch": 2.3, + "grad_norm": 0.7051011323928833, + "learning_rate": 7.76912385807884e-05, + "loss": 2.7563, + "step": 46870 + }, + { + "epoch": 2.3, + "grad_norm": 0.721517026424408, + "learning_rate": 7.768090075050588e-05, + "loss": 2.9567, + "step": 46871 + }, + { + "epoch": 2.3, + "grad_norm": 0.7337429523468018, + "learning_rate": 7.767056350576911e-05, + "loss": 2.8228, + "step": 46872 + }, + { + "epoch": 2.3, + "grad_norm": 0.7145420908927917, + "learning_rate": 7.76602268466052e-05, + "loss": 2.8212, + "step": 46873 + }, + { + "epoch": 2.3, + "grad_norm": 0.717994749546051, + "learning_rate": 7.764989077304153e-05, + "loss": 2.808, + "step": 46874 + }, + { + "epoch": 2.3, + "grad_norm": 0.6848964095115662, + "learning_rate": 7.763955528510516e-05, + "loss": 3.0177, + "step": 46875 + }, + { + "epoch": 2.3, + "grad_norm": 0.7062221765518188, + "learning_rate": 7.762922038282352e-05, + "loss": 2.721, + "step": 46876 + }, + { + "epoch": 2.3, + "grad_norm": 0.7325926423072815, + "learning_rate": 7.761888606622362e-05, + "loss": 2.8309, + "step": 46877 + }, + { + "epoch": 2.3, + "grad_norm": 0.7448124289512634, + "learning_rate": 7.760855233533279e-05, + "loss": 2.8649, + "step": 46878 + }, + { + "epoch": 2.3, + "grad_norm": 0.6935537457466125, + "learning_rate": 7.759821919017831e-05, + "loss": 2.824, + "step": 46879 + }, + { + "epoch": 2.3, + "grad_norm": 0.7377654314041138, + "learning_rate": 7.75878866307873e-05, + "loss": 2.9616, + "step": 46880 + }, + { + "epoch": 2.3, + "grad_norm": 0.7462741732597351, + "learning_rate": 7.757755465718703e-05, + "loss": 2.763, + "step": 46881 + }, + { + "epoch": 2.3, + "grad_norm": 0.7333121299743652, + "learning_rate": 7.756722326940455e-05, + "loss": 3.0852, + "step": 46882 + }, + { + "epoch": 2.3, + "grad_norm": 0.7244988083839417, + "learning_rate": 7.755689246746721e-05, + "loss": 2.975, + "step": 46883 + }, + { + "epoch": 2.3, + "grad_norm": 0.7320109605789185, + "learning_rate": 7.754656225140229e-05, + "loss": 2.8481, + "step": 46884 + }, + { + "epoch": 2.3, + "grad_norm": 0.6962860226631165, + "learning_rate": 7.753623262123683e-05, + "loss": 2.9392, + "step": 46885 + }, + { + "epoch": 2.3, + "grad_norm": 0.761470377445221, + "learning_rate": 7.75259035769982e-05, + "loss": 3.0863, + "step": 46886 + }, + { + "epoch": 2.3, + "grad_norm": 0.6684213876724243, + "learning_rate": 7.751557511871356e-05, + "loss": 2.7991, + "step": 46887 + }, + { + "epoch": 2.3, + "grad_norm": 0.6986576318740845, + "learning_rate": 7.750524724640996e-05, + "loss": 2.8605, + "step": 46888 + }, + { + "epoch": 2.3, + "grad_norm": 0.7003403902053833, + "learning_rate": 7.74949199601148e-05, + "loss": 2.8998, + "step": 46889 + }, + { + "epoch": 2.3, + "grad_norm": 0.7219340205192566, + "learning_rate": 7.748459325985512e-05, + "loss": 2.9807, + "step": 46890 + }, + { + "epoch": 2.3, + "grad_norm": 0.7780990600585938, + "learning_rate": 7.747426714565828e-05, + "loss": 2.9845, + "step": 46891 + }, + { + "epoch": 2.3, + "grad_norm": 0.7375266551971436, + "learning_rate": 7.74639416175513e-05, + "loss": 2.8539, + "step": 46892 + }, + { + "epoch": 2.3, + "grad_norm": 0.702833890914917, + "learning_rate": 7.745361667556159e-05, + "loss": 2.7213, + "step": 46893 + }, + { + "epoch": 2.3, + "grad_norm": 0.7389039397239685, + "learning_rate": 7.744329231971621e-05, + "loss": 3.0499, + "step": 46894 + }, + { + "epoch": 2.3, + "grad_norm": 0.7284427285194397, + "learning_rate": 7.743296855004224e-05, + "loss": 2.8602, + "step": 46895 + }, + { + "epoch": 2.3, + "grad_norm": 0.7277424335479736, + "learning_rate": 7.742264536656713e-05, + "loss": 2.7597, + "step": 46896 + }, + { + "epoch": 2.3, + "grad_norm": 0.7396055459976196, + "learning_rate": 7.741232276931785e-05, + "loss": 2.9429, + "step": 46897 + }, + { + "epoch": 2.3, + "grad_norm": 0.7248284220695496, + "learning_rate": 7.740200075832164e-05, + "loss": 2.803, + "step": 46898 + }, + { + "epoch": 2.3, + "grad_norm": 0.7068817019462585, + "learning_rate": 7.739167933360584e-05, + "loss": 2.8822, + "step": 46899 + }, + { + "epoch": 2.3, + "grad_norm": 0.724687933921814, + "learning_rate": 7.73813584951974e-05, + "loss": 2.7615, + "step": 46900 + }, + { + "epoch": 2.3, + "grad_norm": 0.7012940645217896, + "learning_rate": 7.73710382431237e-05, + "loss": 3.0116, + "step": 46901 + }, + { + "epoch": 2.3, + "grad_norm": 0.706241250038147, + "learning_rate": 7.736071857741188e-05, + "loss": 3.0091, + "step": 46902 + }, + { + "epoch": 2.3, + "grad_norm": 0.8327338099479675, + "learning_rate": 7.735039949808897e-05, + "loss": 2.8047, + "step": 46903 + }, + { + "epoch": 2.3, + "grad_norm": 0.7665473818778992, + "learning_rate": 7.734008100518233e-05, + "loss": 2.9935, + "step": 46904 + }, + { + "epoch": 2.3, + "grad_norm": 0.6795613169670105, + "learning_rate": 7.732976309871901e-05, + "loss": 2.8637, + "step": 46905 + }, + { + "epoch": 2.3, + "grad_norm": 0.7590937614440918, + "learning_rate": 7.731944577872629e-05, + "loss": 2.9044, + "step": 46906 + }, + { + "epoch": 2.3, + "grad_norm": 0.6913984417915344, + "learning_rate": 7.730912904523123e-05, + "loss": 2.8735, + "step": 46907 + }, + { + "epoch": 2.3, + "grad_norm": 0.7234129309654236, + "learning_rate": 7.729881289826116e-05, + "loss": 2.7942, + "step": 46908 + }, + { + "epoch": 2.3, + "grad_norm": 0.7211498618125916, + "learning_rate": 7.728849733784313e-05, + "loss": 3.0061, + "step": 46909 + }, + { + "epoch": 2.3, + "grad_norm": 0.7252943515777588, + "learning_rate": 7.72781823640043e-05, + "loss": 2.9296, + "step": 46910 + }, + { + "epoch": 2.3, + "grad_norm": 0.7195742130279541, + "learning_rate": 7.726786797677191e-05, + "loss": 3.0563, + "step": 46911 + }, + { + "epoch": 2.3, + "grad_norm": 0.7221367955207825, + "learning_rate": 7.7257554176173e-05, + "loss": 3.0016, + "step": 46912 + }, + { + "epoch": 2.3, + "grad_norm": 0.7329986095428467, + "learning_rate": 7.724724096223496e-05, + "loss": 3.0884, + "step": 46913 + }, + { + "epoch": 2.3, + "grad_norm": 0.7828996181488037, + "learning_rate": 7.723692833498471e-05, + "loss": 2.7201, + "step": 46914 + }, + { + "epoch": 2.3, + "grad_norm": 0.7411604523658752, + "learning_rate": 7.722661629444959e-05, + "loss": 2.9872, + "step": 46915 + }, + { + "epoch": 2.3, + "grad_norm": 0.7232510447502136, + "learning_rate": 7.72163048406567e-05, + "loss": 2.959, + "step": 46916 + }, + { + "epoch": 2.3, + "grad_norm": 0.7410550713539124, + "learning_rate": 7.720599397363309e-05, + "loss": 2.7832, + "step": 46917 + }, + { + "epoch": 2.3, + "grad_norm": 0.6873605251312256, + "learning_rate": 7.719568369340611e-05, + "loss": 2.9664, + "step": 46918 + }, + { + "epoch": 2.3, + "grad_norm": 0.6807610392570496, + "learning_rate": 7.718537400000275e-05, + "loss": 2.8011, + "step": 46919 + }, + { + "epoch": 2.3, + "grad_norm": 0.7145941257476807, + "learning_rate": 7.71750648934502e-05, + "loss": 2.9894, + "step": 46920 + }, + { + "epoch": 2.3, + "grad_norm": 0.6965793371200562, + "learning_rate": 7.716475637377575e-05, + "loss": 3.0861, + "step": 46921 + }, + { + "epoch": 2.3, + "grad_norm": 0.7429308295249939, + "learning_rate": 7.715444844100637e-05, + "loss": 2.9534, + "step": 46922 + }, + { + "epoch": 2.3, + "grad_norm": 0.7118332386016846, + "learning_rate": 7.714414109516937e-05, + "loss": 3.0844, + "step": 46923 + }, + { + "epoch": 2.3, + "grad_norm": 0.7275873422622681, + "learning_rate": 7.71338343362918e-05, + "loss": 2.7865, + "step": 46924 + }, + { + "epoch": 2.3, + "grad_norm": 0.779687762260437, + "learning_rate": 7.712352816440072e-05, + "loss": 3.0541, + "step": 46925 + }, + { + "epoch": 2.3, + "grad_norm": 0.7450523376464844, + "learning_rate": 7.711322257952349e-05, + "loss": 2.8133, + "step": 46926 + }, + { + "epoch": 2.3, + "grad_norm": 0.725862979888916, + "learning_rate": 7.7102917581687e-05, + "loss": 2.8848, + "step": 46927 + }, + { + "epoch": 2.3, + "grad_norm": 0.705909788608551, + "learning_rate": 7.709261317091869e-05, + "loss": 2.7166, + "step": 46928 + }, + { + "epoch": 2.3, + "grad_norm": 0.7430562376976013, + "learning_rate": 7.70823093472454e-05, + "loss": 2.7686, + "step": 46929 + }, + { + "epoch": 2.3, + "grad_norm": 0.717370867729187, + "learning_rate": 7.70720061106945e-05, + "loss": 2.9506, + "step": 46930 + }, + { + "epoch": 2.3, + "grad_norm": 0.7363516688346863, + "learning_rate": 7.706170346129303e-05, + "loss": 2.9871, + "step": 46931 + }, + { + "epoch": 2.3, + "grad_norm": 0.7221070528030396, + "learning_rate": 7.705140139906806e-05, + "loss": 2.7575, + "step": 46932 + }, + { + "epoch": 2.3, + "grad_norm": 0.6913543939590454, + "learning_rate": 7.704109992404684e-05, + "loss": 3.0035, + "step": 46933 + }, + { + "epoch": 2.3, + "grad_norm": 0.7119285464286804, + "learning_rate": 7.703079903625639e-05, + "loss": 2.9111, + "step": 46934 + }, + { + "epoch": 2.3, + "grad_norm": 0.7075133323669434, + "learning_rate": 7.702049873572392e-05, + "loss": 2.7502, + "step": 46935 + }, + { + "epoch": 2.3, + "grad_norm": 0.717917263507843, + "learning_rate": 7.701019902247661e-05, + "loss": 2.8735, + "step": 46936 + }, + { + "epoch": 2.3, + "grad_norm": 0.735043466091156, + "learning_rate": 7.699989989654145e-05, + "loss": 2.7548, + "step": 46937 + }, + { + "epoch": 2.3, + "grad_norm": 0.7249195575714111, + "learning_rate": 7.698960135794581e-05, + "loss": 2.8316, + "step": 46938 + }, + { + "epoch": 2.3, + "grad_norm": 0.7049363255500793, + "learning_rate": 7.697930340671641e-05, + "loss": 3.0058, + "step": 46939 + }, + { + "epoch": 2.3, + "grad_norm": 0.7325100302696228, + "learning_rate": 7.696900604288066e-05, + "loss": 2.9678, + "step": 46940 + }, + { + "epoch": 2.3, + "grad_norm": 0.7083897590637207, + "learning_rate": 7.695870926646573e-05, + "loss": 2.813, + "step": 46941 + }, + { + "epoch": 2.3, + "grad_norm": 0.7134562730789185, + "learning_rate": 7.69484130774985e-05, + "loss": 3.0571, + "step": 46942 + }, + { + "epoch": 2.3, + "grad_norm": 0.7213225364685059, + "learning_rate": 7.693811747600633e-05, + "loss": 2.9821, + "step": 46943 + }, + { + "epoch": 2.3, + "grad_norm": 0.6950119137763977, + "learning_rate": 7.692782246201615e-05, + "loss": 2.9263, + "step": 46944 + }, + { + "epoch": 2.3, + "grad_norm": 0.6881087422370911, + "learning_rate": 7.691752803555524e-05, + "loss": 2.8272, + "step": 46945 + }, + { + "epoch": 2.3, + "grad_norm": 0.7488217353820801, + "learning_rate": 7.69072341966506e-05, + "loss": 2.9328, + "step": 46946 + }, + { + "epoch": 2.3, + "grad_norm": 0.7348080277442932, + "learning_rate": 7.68969409453293e-05, + "loss": 2.8666, + "step": 46947 + }, + { + "epoch": 2.3, + "grad_norm": 0.738717257976532, + "learning_rate": 7.688664828161861e-05, + "loss": 2.9959, + "step": 46948 + }, + { + "epoch": 2.3, + "grad_norm": 0.7138335108757019, + "learning_rate": 7.687635620554549e-05, + "loss": 3.0209, + "step": 46949 + }, + { + "epoch": 2.3, + "grad_norm": 0.6965306997299194, + "learning_rate": 7.686606471713708e-05, + "loss": 2.7597, + "step": 46950 + }, + { + "epoch": 2.3, + "grad_norm": 0.702285885810852, + "learning_rate": 7.685577381642059e-05, + "loss": 2.8853, + "step": 46951 + }, + { + "epoch": 2.3, + "grad_norm": 0.7388809323310852, + "learning_rate": 7.684548350342305e-05, + "loss": 2.8097, + "step": 46952 + }, + { + "epoch": 2.3, + "grad_norm": 0.7757939696311951, + "learning_rate": 7.683519377817156e-05, + "loss": 2.7593, + "step": 46953 + }, + { + "epoch": 2.3, + "grad_norm": 0.7448767423629761, + "learning_rate": 7.682490464069315e-05, + "loss": 2.9256, + "step": 46954 + }, + { + "epoch": 2.3, + "grad_norm": 0.7056995630264282, + "learning_rate": 7.6814616091015e-05, + "loss": 2.8403, + "step": 46955 + }, + { + "epoch": 2.3, + "grad_norm": 0.6833332180976868, + "learning_rate": 7.680432812916427e-05, + "loss": 2.839, + "step": 46956 + }, + { + "epoch": 2.3, + "grad_norm": 0.6968466639518738, + "learning_rate": 7.679404075516786e-05, + "loss": 2.8719, + "step": 46957 + }, + { + "epoch": 2.3, + "grad_norm": 0.7767067551612854, + "learning_rate": 7.678375396905314e-05, + "loss": 3.0657, + "step": 46958 + }, + { + "epoch": 2.3, + "grad_norm": 0.690322995185852, + "learning_rate": 7.677346777084691e-05, + "loss": 2.9365, + "step": 46959 + }, + { + "epoch": 2.3, + "grad_norm": 0.7153850197792053, + "learning_rate": 7.676318216057654e-05, + "loss": 2.8756, + "step": 46960 + }, + { + "epoch": 2.3, + "grad_norm": 0.6876060962677002, + "learning_rate": 7.675289713826896e-05, + "loss": 2.8716, + "step": 46961 + }, + { + "epoch": 2.3, + "grad_norm": 0.6872727870941162, + "learning_rate": 7.674261270395119e-05, + "loss": 2.7886, + "step": 46962 + }, + { + "epoch": 2.3, + "grad_norm": 0.7221353054046631, + "learning_rate": 7.673232885765053e-05, + "loss": 2.7449, + "step": 46963 + }, + { + "epoch": 2.3, + "grad_norm": 0.7340434789657593, + "learning_rate": 7.672204559939381e-05, + "loss": 2.7857, + "step": 46964 + }, + { + "epoch": 2.3, + "grad_norm": 0.7158043384552002, + "learning_rate": 7.671176292920828e-05, + "loss": 2.9561, + "step": 46965 + }, + { + "epoch": 2.3, + "grad_norm": 0.68252032995224, + "learning_rate": 7.67014808471211e-05, + "loss": 2.8707, + "step": 46966 + }, + { + "epoch": 2.3, + "grad_norm": 0.7790015339851379, + "learning_rate": 7.669119935315923e-05, + "loss": 2.9822, + "step": 46967 + }, + { + "epoch": 2.3, + "grad_norm": 0.7351877689361572, + "learning_rate": 7.668091844734975e-05, + "loss": 2.7752, + "step": 46968 + }, + { + "epoch": 2.3, + "grad_norm": 0.7354260683059692, + "learning_rate": 7.667063812971968e-05, + "loss": 2.8619, + "step": 46969 + }, + { + "epoch": 2.3, + "grad_norm": 0.7240676283836365, + "learning_rate": 7.666035840029615e-05, + "loss": 2.9733, + "step": 46970 + }, + { + "epoch": 2.3, + "grad_norm": 0.6971290111541748, + "learning_rate": 7.665007925910635e-05, + "loss": 2.8701, + "step": 46971 + }, + { + "epoch": 2.3, + "grad_norm": 0.7421733140945435, + "learning_rate": 7.663980070617721e-05, + "loss": 2.8046, + "step": 46972 + }, + { + "epoch": 2.3, + "grad_norm": 0.7057068347930908, + "learning_rate": 7.66295227415359e-05, + "loss": 2.9185, + "step": 46973 + }, + { + "epoch": 2.3, + "grad_norm": 0.704230010509491, + "learning_rate": 7.661924536520943e-05, + "loss": 2.961, + "step": 46974 + }, + { + "epoch": 2.3, + "grad_norm": 0.7011248469352722, + "learning_rate": 7.66089685772248e-05, + "loss": 3.1366, + "step": 46975 + }, + { + "epoch": 2.3, + "grad_norm": 0.7493485808372498, + "learning_rate": 7.659869237760925e-05, + "loss": 2.9313, + "step": 46976 + }, + { + "epoch": 2.3, + "grad_norm": 0.7271580696105957, + "learning_rate": 7.658841676638963e-05, + "loss": 2.659, + "step": 46977 + }, + { + "epoch": 2.3, + "grad_norm": 0.7556811571121216, + "learning_rate": 7.657814174359323e-05, + "loss": 2.7887, + "step": 46978 + }, + { + "epoch": 2.3, + "grad_norm": 0.786780595779419, + "learning_rate": 7.656786730924692e-05, + "loss": 2.8375, + "step": 46979 + }, + { + "epoch": 2.3, + "grad_norm": 0.7101176381111145, + "learning_rate": 7.655759346337788e-05, + "loss": 3.0459, + "step": 46980 + }, + { + "epoch": 2.3, + "grad_norm": 0.753944456577301, + "learning_rate": 7.654732020601318e-05, + "loss": 3.0886, + "step": 46981 + }, + { + "epoch": 2.3, + "grad_norm": 0.6981664896011353, + "learning_rate": 7.653704753717983e-05, + "loss": 2.9098, + "step": 46982 + }, + { + "epoch": 2.3, + "grad_norm": 0.7053401470184326, + "learning_rate": 7.652677545690493e-05, + "loss": 2.8841, + "step": 46983 + }, + { + "epoch": 2.3, + "grad_norm": 0.7201017737388611, + "learning_rate": 7.65165039652154e-05, + "loss": 2.8831, + "step": 46984 + }, + { + "epoch": 2.3, + "grad_norm": 0.7298591732978821, + "learning_rate": 7.650623306213845e-05, + "loss": 2.8965, + "step": 46985 + }, + { + "epoch": 2.3, + "grad_norm": 0.7369044423103333, + "learning_rate": 7.649596274770098e-05, + "loss": 2.7928, + "step": 46986 + }, + { + "epoch": 2.3, + "grad_norm": 0.7387498021125793, + "learning_rate": 7.648569302193014e-05, + "loss": 3.0365, + "step": 46987 + }, + { + "epoch": 2.3, + "grad_norm": 0.7449101209640503, + "learning_rate": 7.647542388485308e-05, + "loss": 2.7906, + "step": 46988 + }, + { + "epoch": 2.3, + "grad_norm": 0.7396485805511475, + "learning_rate": 7.64651553364967e-05, + "loss": 2.9116, + "step": 46989 + }, + { + "epoch": 2.3, + "grad_norm": 0.6997805237770081, + "learning_rate": 7.64548873768881e-05, + "loss": 2.776, + "step": 46990 + }, + { + "epoch": 2.3, + "grad_norm": 0.6954943537712097, + "learning_rate": 7.644462000605423e-05, + "loss": 2.9855, + "step": 46991 + }, + { + "epoch": 2.3, + "grad_norm": 0.7475723028182983, + "learning_rate": 7.64343532240222e-05, + "loss": 3.1449, + "step": 46992 + }, + { + "epoch": 2.3, + "grad_norm": 0.7167398929595947, + "learning_rate": 7.642408703081911e-05, + "loss": 2.9414, + "step": 46993 + }, + { + "epoch": 2.3, + "grad_norm": 0.7235413789749146, + "learning_rate": 7.64138214264719e-05, + "loss": 3.0369, + "step": 46994 + }, + { + "epoch": 2.3, + "grad_norm": 0.7205052971839905, + "learning_rate": 7.640355641100773e-05, + "loss": 2.9086, + "step": 46995 + }, + { + "epoch": 2.3, + "grad_norm": 0.7066383957862854, + "learning_rate": 7.639329198445354e-05, + "loss": 3.0573, + "step": 46996 + }, + { + "epoch": 2.3, + "grad_norm": 0.7013208270072937, + "learning_rate": 7.638302814683632e-05, + "loss": 2.8161, + "step": 46997 + }, + { + "epoch": 2.3, + "grad_norm": 0.7191861271858215, + "learning_rate": 7.637276489818325e-05, + "loss": 2.9867, + "step": 46998 + }, + { + "epoch": 2.3, + "grad_norm": 0.7596316337585449, + "learning_rate": 7.63625022385212e-05, + "loss": 2.9672, + "step": 46999 + }, + { + "epoch": 2.3, + "grad_norm": 0.7377591729164124, + "learning_rate": 7.635224016787734e-05, + "loss": 3.1616, + "step": 47000 + }, + { + "epoch": 2.3, + "grad_norm": 0.6754733324050903, + "learning_rate": 7.634197868627858e-05, + "loss": 2.9541, + "step": 47001 + }, + { + "epoch": 2.3, + "grad_norm": 0.6994279026985168, + "learning_rate": 7.633171779375199e-05, + "loss": 2.7826, + "step": 47002 + }, + { + "epoch": 2.3, + "grad_norm": 0.7444949746131897, + "learning_rate": 7.632145749032468e-05, + "loss": 2.913, + "step": 47003 + }, + { + "epoch": 2.3, + "grad_norm": 0.7129213213920593, + "learning_rate": 7.631119777602365e-05, + "loss": 2.9534, + "step": 47004 + }, + { + "epoch": 2.3, + "grad_norm": 0.7305833101272583, + "learning_rate": 7.630093865087584e-05, + "loss": 2.8632, + "step": 47005 + }, + { + "epoch": 2.3, + "grad_norm": 0.7327616214752197, + "learning_rate": 7.629068011490822e-05, + "loss": 2.9634, + "step": 47006 + }, + { + "epoch": 2.3, + "grad_norm": 0.7405668497085571, + "learning_rate": 7.628042216814789e-05, + "loss": 3.079, + "step": 47007 + }, + { + "epoch": 2.3, + "grad_norm": 0.721222996711731, + "learning_rate": 7.627016481062196e-05, + "loss": 3.0155, + "step": 47008 + }, + { + "epoch": 2.3, + "grad_norm": 0.7441813945770264, + "learning_rate": 7.625990804235729e-05, + "loss": 2.9115, + "step": 47009 + }, + { + "epoch": 2.3, + "grad_norm": 0.7018464803695679, + "learning_rate": 7.624965186338106e-05, + "loss": 2.7319, + "step": 47010 + }, + { + "epoch": 2.3, + "grad_norm": 0.7066758275032043, + "learning_rate": 7.623939627372015e-05, + "loss": 2.9433, + "step": 47011 + }, + { + "epoch": 2.3, + "grad_norm": 0.7435897588729858, + "learning_rate": 7.622914127340156e-05, + "loss": 3.0037, + "step": 47012 + }, + { + "epoch": 2.3, + "grad_norm": 0.7094926238059998, + "learning_rate": 7.62188868624524e-05, + "loss": 2.9102, + "step": 47013 + }, + { + "epoch": 2.3, + "grad_norm": 0.7312250733375549, + "learning_rate": 7.620863304089955e-05, + "loss": 2.9911, + "step": 47014 + }, + { + "epoch": 2.3, + "grad_norm": 0.7396236658096313, + "learning_rate": 7.619837980877019e-05, + "loss": 3.1673, + "step": 47015 + }, + { + "epoch": 2.3, + "grad_norm": 0.7736823558807373, + "learning_rate": 7.61881271660911e-05, + "loss": 2.6794, + "step": 47016 + }, + { + "epoch": 2.3, + "grad_norm": 0.7479914426803589, + "learning_rate": 7.617787511288953e-05, + "loss": 3.0262, + "step": 47017 + }, + { + "epoch": 2.3, + "grad_norm": 0.7253764271736145, + "learning_rate": 7.616762364919236e-05, + "loss": 2.7085, + "step": 47018 + }, + { + "epoch": 2.3, + "grad_norm": 0.6933542490005493, + "learning_rate": 7.615737277502649e-05, + "loss": 3.026, + "step": 47019 + }, + { + "epoch": 2.3, + "grad_norm": 0.7153028845787048, + "learning_rate": 7.614712249041912e-05, + "loss": 3.0809, + "step": 47020 + }, + { + "epoch": 2.3, + "grad_norm": 0.7277600765228271, + "learning_rate": 7.613687279539706e-05, + "loss": 3.0096, + "step": 47021 + }, + { + "epoch": 2.3, + "grad_norm": 0.7183275818824768, + "learning_rate": 7.612662368998736e-05, + "loss": 2.9682, + "step": 47022 + }, + { + "epoch": 2.3, + "grad_norm": 0.7380486726760864, + "learning_rate": 7.611637517421717e-05, + "loss": 2.8966, + "step": 47023 + }, + { + "epoch": 2.3, + "grad_norm": 0.7355697154998779, + "learning_rate": 7.61061272481133e-05, + "loss": 3.1274, + "step": 47024 + }, + { + "epoch": 2.3, + "grad_norm": 0.6961773633956909, + "learning_rate": 7.609587991170284e-05, + "loss": 2.9941, + "step": 47025 + }, + { + "epoch": 2.3, + "grad_norm": 0.7682431936264038, + "learning_rate": 7.608563316501276e-05, + "loss": 2.8109, + "step": 47026 + }, + { + "epoch": 2.3, + "grad_norm": 0.7720531225204468, + "learning_rate": 7.607538700806994e-05, + "loss": 2.9825, + "step": 47027 + }, + { + "epoch": 2.3, + "grad_norm": 0.725392758846283, + "learning_rate": 7.606514144090154e-05, + "loss": 3.0161, + "step": 47028 + }, + { + "epoch": 2.3, + "grad_norm": 0.7036164999008179, + "learning_rate": 7.605489646353437e-05, + "loss": 2.7043, + "step": 47029 + }, + { + "epoch": 2.3, + "grad_norm": 0.734400749206543, + "learning_rate": 7.604465207599561e-05, + "loss": 3.0003, + "step": 47030 + }, + { + "epoch": 2.3, + "grad_norm": 0.7298409342765808, + "learning_rate": 7.603440827831203e-05, + "loss": 2.6084, + "step": 47031 + }, + { + "epoch": 2.3, + "grad_norm": 0.7258648872375488, + "learning_rate": 7.60241650705108e-05, + "loss": 2.769, + "step": 47032 + }, + { + "epoch": 2.3, + "grad_norm": 0.676993191242218, + "learning_rate": 7.601392245261883e-05, + "loss": 2.8381, + "step": 47033 + }, + { + "epoch": 2.31, + "grad_norm": 0.7960457801818848, + "learning_rate": 7.600368042466298e-05, + "loss": 2.8848, + "step": 47034 + }, + { + "epoch": 2.31, + "grad_norm": 0.7140025496482849, + "learning_rate": 7.599343898667042e-05, + "loss": 2.9983, + "step": 47035 + }, + { + "epoch": 2.31, + "grad_norm": 0.707834005355835, + "learning_rate": 7.598319813866794e-05, + "loss": 2.9884, + "step": 47036 + }, + { + "epoch": 2.31, + "grad_norm": 0.8292667269706726, + "learning_rate": 7.59729578806826e-05, + "loss": 2.8562, + "step": 47037 + }, + { + "epoch": 2.31, + "grad_norm": 0.6989453434944153, + "learning_rate": 7.596271821274148e-05, + "loss": 2.7451, + "step": 47038 + }, + { + "epoch": 2.31, + "grad_norm": 0.7435256838798523, + "learning_rate": 7.595247913487135e-05, + "loss": 2.8466, + "step": 47039 + }, + { + "epoch": 2.31, + "grad_norm": 0.7675233483314514, + "learning_rate": 7.594224064709946e-05, + "loss": 2.978, + "step": 47040 + }, + { + "epoch": 2.31, + "grad_norm": 0.7073809504508972, + "learning_rate": 7.593200274945238e-05, + "loss": 2.8363, + "step": 47041 + }, + { + "epoch": 2.31, + "grad_norm": 0.7317488789558411, + "learning_rate": 7.592176544195729e-05, + "loss": 2.9328, + "step": 47042 + }, + { + "epoch": 2.31, + "grad_norm": 0.7460283041000366, + "learning_rate": 7.591152872464123e-05, + "loss": 2.8801, + "step": 47043 + }, + { + "epoch": 2.31, + "grad_norm": 0.7187976241111755, + "learning_rate": 7.5901292597531e-05, + "loss": 2.9518, + "step": 47044 + }, + { + "epoch": 2.31, + "grad_norm": 0.7074181437492371, + "learning_rate": 7.589105706065371e-05, + "loss": 2.8221, + "step": 47045 + }, + { + "epoch": 2.31, + "grad_norm": 0.6958547830581665, + "learning_rate": 7.588082211403617e-05, + "loss": 2.9221, + "step": 47046 + }, + { + "epoch": 2.31, + "grad_norm": 0.7111318707466125, + "learning_rate": 7.58705877577055e-05, + "loss": 2.8711, + "step": 47047 + }, + { + "epoch": 2.31, + "grad_norm": 0.6999523639678955, + "learning_rate": 7.586035399168856e-05, + "loss": 2.839, + "step": 47048 + }, + { + "epoch": 2.31, + "grad_norm": 0.726325511932373, + "learning_rate": 7.585012081601222e-05, + "loss": 2.83, + "step": 47049 + }, + { + "epoch": 2.31, + "grad_norm": 0.7136778235435486, + "learning_rate": 7.583988823070362e-05, + "loss": 2.9703, + "step": 47050 + }, + { + "epoch": 2.31, + "grad_norm": 0.7415741086006165, + "learning_rate": 7.582965623578954e-05, + "loss": 2.8102, + "step": 47051 + }, + { + "epoch": 2.31, + "grad_norm": 0.83095782995224, + "learning_rate": 7.581942483129697e-05, + "loss": 2.9474, + "step": 47052 + }, + { + "epoch": 2.31, + "grad_norm": 0.7691175937652588, + "learning_rate": 7.580919401725303e-05, + "loss": 2.9403, + "step": 47053 + }, + { + "epoch": 2.31, + "grad_norm": 0.7467166185379028, + "learning_rate": 7.579896379368449e-05, + "loss": 3.0011, + "step": 47054 + }, + { + "epoch": 2.31, + "grad_norm": 0.6988757252693176, + "learning_rate": 7.578873416061833e-05, + "loss": 3.056, + "step": 47055 + }, + { + "epoch": 2.31, + "grad_norm": 0.666217029094696, + "learning_rate": 7.577850511808139e-05, + "loss": 2.89, + "step": 47056 + }, + { + "epoch": 2.31, + "grad_norm": 0.7455030083656311, + "learning_rate": 7.576827666610075e-05, + "loss": 2.9968, + "step": 47057 + }, + { + "epoch": 2.31, + "grad_norm": 0.742010235786438, + "learning_rate": 7.575804880470338e-05, + "loss": 3.046, + "step": 47058 + }, + { + "epoch": 2.31, + "grad_norm": 0.7573196291923523, + "learning_rate": 7.574782153391608e-05, + "loss": 2.8716, + "step": 47059 + }, + { + "epoch": 2.31, + "grad_norm": 0.725332498550415, + "learning_rate": 7.573759485376593e-05, + "loss": 2.8598, + "step": 47060 + }, + { + "epoch": 2.31, + "grad_norm": 0.7262852787971497, + "learning_rate": 7.572736876427973e-05, + "loss": 2.8723, + "step": 47061 + }, + { + "epoch": 2.31, + "grad_norm": 0.6897485852241516, + "learning_rate": 7.571714326548454e-05, + "loss": 2.8583, + "step": 47062 + }, + { + "epoch": 2.31, + "grad_norm": 0.6986570954322815, + "learning_rate": 7.570691835740725e-05, + "loss": 2.6931, + "step": 47063 + }, + { + "epoch": 2.31, + "grad_norm": 0.7163300514221191, + "learning_rate": 7.569669404007471e-05, + "loss": 3.034, + "step": 47064 + }, + { + "epoch": 2.31, + "grad_norm": 0.7340909838676453, + "learning_rate": 7.568647031351396e-05, + "loss": 2.7613, + "step": 47065 + }, + { + "epoch": 2.31, + "grad_norm": 0.75508713722229, + "learning_rate": 7.567624717775179e-05, + "loss": 3.004, + "step": 47066 + }, + { + "epoch": 2.31, + "grad_norm": 0.6939140558242798, + "learning_rate": 7.566602463281528e-05, + "loss": 2.7456, + "step": 47067 + }, + { + "epoch": 2.31, + "grad_norm": 0.7123984694480896, + "learning_rate": 7.565580267873124e-05, + "loss": 2.9987, + "step": 47068 + }, + { + "epoch": 2.31, + "grad_norm": 0.7077834010124207, + "learning_rate": 7.56455813155267e-05, + "loss": 2.9786, + "step": 47069 + }, + { + "epoch": 2.31, + "grad_norm": 0.7424761652946472, + "learning_rate": 7.563536054322856e-05, + "loss": 2.8784, + "step": 47070 + }, + { + "epoch": 2.31, + "grad_norm": 0.7440283894538879, + "learning_rate": 7.562514036186359e-05, + "loss": 2.7986, + "step": 47071 + }, + { + "epoch": 2.31, + "grad_norm": 0.7975735068321228, + "learning_rate": 7.56149207714589e-05, + "loss": 2.9229, + "step": 47072 + }, + { + "epoch": 2.31, + "grad_norm": 0.6961278915405273, + "learning_rate": 7.560470177204124e-05, + "loss": 2.8992, + "step": 47073 + }, + { + "epoch": 2.31, + "grad_norm": 0.6985488533973694, + "learning_rate": 7.55944833636376e-05, + "loss": 2.6921, + "step": 47074 + }, + { + "epoch": 2.31, + "grad_norm": 0.7156649827957153, + "learning_rate": 7.558426554627499e-05, + "loss": 2.8174, + "step": 47075 + }, + { + "epoch": 2.31, + "grad_norm": 0.776537299156189, + "learning_rate": 7.557404831998028e-05, + "loss": 2.9881, + "step": 47076 + }, + { + "epoch": 2.31, + "grad_norm": 0.7213156819343567, + "learning_rate": 7.55638316847803e-05, + "loss": 2.8593, + "step": 47077 + }, + { + "epoch": 2.31, + "grad_norm": 0.7126505374908447, + "learning_rate": 7.55536156407019e-05, + "loss": 2.7945, + "step": 47078 + }, + { + "epoch": 2.31, + "grad_norm": 0.7194722890853882, + "learning_rate": 7.554340018777207e-05, + "loss": 2.8476, + "step": 47079 + }, + { + "epoch": 2.31, + "grad_norm": 0.718202531337738, + "learning_rate": 7.553318532601785e-05, + "loss": 2.6935, + "step": 47080 + }, + { + "epoch": 2.31, + "grad_norm": 0.7163863778114319, + "learning_rate": 7.552297105546588e-05, + "loss": 3.0641, + "step": 47081 + }, + { + "epoch": 2.31, + "grad_norm": 0.7556397914886475, + "learning_rate": 7.551275737614335e-05, + "loss": 2.9546, + "step": 47082 + }, + { + "epoch": 2.31, + "grad_norm": 0.7361882925033569, + "learning_rate": 7.55025442880769e-05, + "loss": 2.9487, + "step": 47083 + }, + { + "epoch": 2.31, + "grad_norm": 0.7101148962974548, + "learning_rate": 7.549233179129362e-05, + "loss": 2.9951, + "step": 47084 + }, + { + "epoch": 2.31, + "grad_norm": 0.7225397229194641, + "learning_rate": 7.548211988582035e-05, + "loss": 3.103, + "step": 47085 + }, + { + "epoch": 2.31, + "grad_norm": 0.7092863917350769, + "learning_rate": 7.547190857168387e-05, + "loss": 2.7833, + "step": 47086 + }, + { + "epoch": 2.31, + "grad_norm": 0.6932755708694458, + "learning_rate": 7.546169784891127e-05, + "loss": 2.8817, + "step": 47087 + }, + { + "epoch": 2.31, + "grad_norm": 0.6869648098945618, + "learning_rate": 7.545148771752925e-05, + "loss": 2.8556, + "step": 47088 + }, + { + "epoch": 2.31, + "grad_norm": 0.7577425837516785, + "learning_rate": 7.54412781775648e-05, + "loss": 2.88, + "step": 47089 + }, + { + "epoch": 2.31, + "grad_norm": 0.7419918179512024, + "learning_rate": 7.543106922904488e-05, + "loss": 2.9787, + "step": 47090 + }, + { + "epoch": 2.31, + "grad_norm": 0.7026674747467041, + "learning_rate": 7.542086087199631e-05, + "loss": 2.8453, + "step": 47091 + }, + { + "epoch": 2.31, + "grad_norm": 0.7323837280273438, + "learning_rate": 7.541065310644597e-05, + "loss": 2.9609, + "step": 47092 + }, + { + "epoch": 2.31, + "grad_norm": 0.6933581233024597, + "learning_rate": 7.540044593242067e-05, + "loss": 3.0251, + "step": 47093 + }, + { + "epoch": 2.31, + "grad_norm": 0.7326129078865051, + "learning_rate": 7.539023934994734e-05, + "loss": 2.8222, + "step": 47094 + }, + { + "epoch": 2.31, + "grad_norm": 0.7666764259338379, + "learning_rate": 7.5380033359053e-05, + "loss": 2.9111, + "step": 47095 + }, + { + "epoch": 2.31, + "grad_norm": 0.7760982513427734, + "learning_rate": 7.536982795976436e-05, + "loss": 2.896, + "step": 47096 + }, + { + "epoch": 2.31, + "grad_norm": 0.6980563998222351, + "learning_rate": 7.535962315210843e-05, + "loss": 2.9076, + "step": 47097 + }, + { + "epoch": 2.31, + "grad_norm": 0.730495810508728, + "learning_rate": 7.534941893611201e-05, + "loss": 3.0254, + "step": 47098 + }, + { + "epoch": 2.31, + "grad_norm": 0.6933460235595703, + "learning_rate": 7.533921531180191e-05, + "loss": 2.8862, + "step": 47099 + }, + { + "epoch": 2.31, + "grad_norm": 0.7039830088615417, + "learning_rate": 7.532901227920517e-05, + "loss": 2.9391, + "step": 47100 + }, + { + "epoch": 2.31, + "grad_norm": 0.7359967827796936, + "learning_rate": 7.531880983834845e-05, + "loss": 2.8176, + "step": 47101 + }, + { + "epoch": 2.31, + "grad_norm": 0.6860753893852234, + "learning_rate": 7.530860798925888e-05, + "loss": 2.8287, + "step": 47102 + }, + { + "epoch": 2.31, + "grad_norm": 0.7534465789794922, + "learning_rate": 7.529840673196309e-05, + "loss": 2.7622, + "step": 47103 + }, + { + "epoch": 2.31, + "grad_norm": 0.7103568315505981, + "learning_rate": 7.528820606648805e-05, + "loss": 3.1858, + "step": 47104 + }, + { + "epoch": 2.31, + "grad_norm": 0.6781308054924011, + "learning_rate": 7.527800599286073e-05, + "loss": 3.0361, + "step": 47105 + }, + { + "epoch": 2.31, + "grad_norm": 0.7126346230506897, + "learning_rate": 7.526780651110788e-05, + "loss": 2.8852, + "step": 47106 + }, + { + "epoch": 2.31, + "grad_norm": 0.7078046798706055, + "learning_rate": 7.525760762125636e-05, + "loss": 2.8481, + "step": 47107 + }, + { + "epoch": 2.31, + "grad_norm": 0.7195985317230225, + "learning_rate": 7.524740932333298e-05, + "loss": 2.8783, + "step": 47108 + }, + { + "epoch": 2.31, + "grad_norm": 0.7203580737113953, + "learning_rate": 7.523721161736468e-05, + "loss": 2.9165, + "step": 47109 + }, + { + "epoch": 2.31, + "grad_norm": 0.7243421673774719, + "learning_rate": 7.522701450337837e-05, + "loss": 2.8335, + "step": 47110 + }, + { + "epoch": 2.31, + "grad_norm": 0.7248387336730957, + "learning_rate": 7.521681798140076e-05, + "loss": 2.9192, + "step": 47111 + }, + { + "epoch": 2.31, + "grad_norm": 0.7272186875343323, + "learning_rate": 7.520662205145888e-05, + "loss": 2.9508, + "step": 47112 + }, + { + "epoch": 2.31, + "grad_norm": 0.6788175702095032, + "learning_rate": 7.519642671357951e-05, + "loss": 2.9344, + "step": 47113 + }, + { + "epoch": 2.31, + "grad_norm": 0.7410783767700195, + "learning_rate": 7.518623196778939e-05, + "loss": 2.9244, + "step": 47114 + }, + { + "epoch": 2.31, + "grad_norm": 0.7222927212715149, + "learning_rate": 7.517603781411558e-05, + "loss": 2.7248, + "step": 47115 + }, + { + "epoch": 2.31, + "grad_norm": 0.7629684805870056, + "learning_rate": 7.516584425258471e-05, + "loss": 3.0434, + "step": 47116 + }, + { + "epoch": 2.31, + "grad_norm": 0.762963056564331, + "learning_rate": 7.515565128322382e-05, + "loss": 3.1039, + "step": 47117 + }, + { + "epoch": 2.31, + "grad_norm": 0.7189714908599854, + "learning_rate": 7.51454589060596e-05, + "loss": 2.8823, + "step": 47118 + }, + { + "epoch": 2.31, + "grad_norm": 0.7012630105018616, + "learning_rate": 7.513526712111904e-05, + "loss": 2.8067, + "step": 47119 + }, + { + "epoch": 2.31, + "grad_norm": 0.734362006187439, + "learning_rate": 7.512507592842892e-05, + "loss": 2.7973, + "step": 47120 + }, + { + "epoch": 2.31, + "grad_norm": 0.7629452347755432, + "learning_rate": 7.511488532801598e-05, + "loss": 3.0645, + "step": 47121 + }, + { + "epoch": 2.31, + "grad_norm": 0.7741067409515381, + "learning_rate": 7.510469531990723e-05, + "loss": 2.7464, + "step": 47122 + }, + { + "epoch": 2.31, + "grad_norm": 0.7644525170326233, + "learning_rate": 7.509450590412936e-05, + "loss": 2.8866, + "step": 47123 + }, + { + "epoch": 2.31, + "grad_norm": 0.7327432036399841, + "learning_rate": 7.508431708070927e-05, + "loss": 2.83, + "step": 47124 + }, + { + "epoch": 2.31, + "grad_norm": 0.6900554299354553, + "learning_rate": 7.507412884967387e-05, + "loss": 2.9412, + "step": 47125 + }, + { + "epoch": 2.31, + "grad_norm": 0.742673933506012, + "learning_rate": 7.506394121104988e-05, + "loss": 2.7973, + "step": 47126 + }, + { + "epoch": 2.31, + "grad_norm": 0.7525250315666199, + "learning_rate": 7.505375416486423e-05, + "loss": 2.7082, + "step": 47127 + }, + { + "epoch": 2.31, + "grad_norm": 0.6825203895568848, + "learning_rate": 7.50435677111437e-05, + "loss": 2.8795, + "step": 47128 + }, + { + "epoch": 2.31, + "grad_norm": 0.6943187117576599, + "learning_rate": 7.503338184991506e-05, + "loss": 3.0387, + "step": 47129 + }, + { + "epoch": 2.31, + "grad_norm": 0.781815767288208, + "learning_rate": 7.502319658120526e-05, + "loss": 3.0341, + "step": 47130 + }, + { + "epoch": 2.31, + "grad_norm": 0.7488953471183777, + "learning_rate": 7.5013011905041e-05, + "loss": 2.9886, + "step": 47131 + }, + { + "epoch": 2.31, + "grad_norm": 0.7378432750701904, + "learning_rate": 7.500282782144923e-05, + "loss": 2.9618, + "step": 47132 + }, + { + "epoch": 2.31, + "grad_norm": 0.6865012049674988, + "learning_rate": 7.499264433045663e-05, + "loss": 2.6925, + "step": 47133 + }, + { + "epoch": 2.31, + "grad_norm": 0.7527883648872375, + "learning_rate": 7.498246143209022e-05, + "loss": 2.9324, + "step": 47134 + }, + { + "epoch": 2.31, + "grad_norm": 0.7522280216217041, + "learning_rate": 7.497227912637666e-05, + "loss": 3.007, + "step": 47135 + }, + { + "epoch": 2.31, + "grad_norm": 0.7576622366905212, + "learning_rate": 7.496209741334274e-05, + "loss": 2.9856, + "step": 47136 + }, + { + "epoch": 2.31, + "grad_norm": 0.7035020589828491, + "learning_rate": 7.49519162930154e-05, + "loss": 3.0091, + "step": 47137 + }, + { + "epoch": 2.31, + "grad_norm": 0.7081121206283569, + "learning_rate": 7.494173576542137e-05, + "loss": 2.7836, + "step": 47138 + }, + { + "epoch": 2.31, + "grad_norm": 0.7172082662582397, + "learning_rate": 7.493155583058747e-05, + "loss": 2.775, + "step": 47139 + }, + { + "epoch": 2.31, + "grad_norm": 0.7473821043968201, + "learning_rate": 7.492137648854062e-05, + "loss": 3.1286, + "step": 47140 + }, + { + "epoch": 2.31, + "grad_norm": 0.7130926847457886, + "learning_rate": 7.491119773930754e-05, + "loss": 2.9354, + "step": 47141 + }, + { + "epoch": 2.31, + "grad_norm": 0.7745637893676758, + "learning_rate": 7.490101958291504e-05, + "loss": 2.7904, + "step": 47142 + }, + { + "epoch": 2.31, + "grad_norm": 0.6880003213882446, + "learning_rate": 7.489084201938985e-05, + "loss": 2.8231, + "step": 47143 + }, + { + "epoch": 2.31, + "grad_norm": 0.7113720178604126, + "learning_rate": 7.488066504875897e-05, + "loss": 2.9742, + "step": 47144 + }, + { + "epoch": 2.31, + "grad_norm": 0.710329532623291, + "learning_rate": 7.4870488671049e-05, + "loss": 3.0068, + "step": 47145 + }, + { + "epoch": 2.31, + "grad_norm": 0.7468894720077515, + "learning_rate": 7.486031288628682e-05, + "loss": 2.9647, + "step": 47146 + }, + { + "epoch": 2.31, + "grad_norm": 0.7187853455543518, + "learning_rate": 7.485013769449935e-05, + "loss": 2.9264, + "step": 47147 + }, + { + "epoch": 2.31, + "grad_norm": 0.6992723941802979, + "learning_rate": 7.483996309571319e-05, + "loss": 2.8219, + "step": 47148 + }, + { + "epoch": 2.31, + "grad_norm": 0.7436049580574036, + "learning_rate": 7.482978908995532e-05, + "loss": 2.9799, + "step": 47149 + }, + { + "epoch": 2.31, + "grad_norm": 0.7207313776016235, + "learning_rate": 7.481961567725246e-05, + "loss": 2.915, + "step": 47150 + }, + { + "epoch": 2.31, + "grad_norm": 0.7421787977218628, + "learning_rate": 7.480944285763127e-05, + "loss": 2.8734, + "step": 47151 + }, + { + "epoch": 2.31, + "grad_norm": 0.73797607421875, + "learning_rate": 7.479927063111879e-05, + "loss": 2.9639, + "step": 47152 + }, + { + "epoch": 2.31, + "grad_norm": 0.7333713173866272, + "learning_rate": 7.478909899774161e-05, + "loss": 2.8489, + "step": 47153 + }, + { + "epoch": 2.31, + "grad_norm": 0.717434823513031, + "learning_rate": 7.477892795752669e-05, + "loss": 2.7659, + "step": 47154 + }, + { + "epoch": 2.31, + "grad_norm": 0.715705156326294, + "learning_rate": 7.476875751050063e-05, + "loss": 2.7922, + "step": 47155 + }, + { + "epoch": 2.31, + "grad_norm": 0.7075499892234802, + "learning_rate": 7.475858765669038e-05, + "loss": 3.0544, + "step": 47156 + }, + { + "epoch": 2.31, + "grad_norm": 0.7804867625236511, + "learning_rate": 7.474841839612269e-05, + "loss": 2.7703, + "step": 47157 + }, + { + "epoch": 2.31, + "grad_norm": 0.7036874890327454, + "learning_rate": 7.473824972882422e-05, + "loss": 2.846, + "step": 47158 + }, + { + "epoch": 2.31, + "grad_norm": 0.7310692071914673, + "learning_rate": 7.472808165482195e-05, + "loss": 2.9657, + "step": 47159 + }, + { + "epoch": 2.31, + "grad_norm": 0.7222773432731628, + "learning_rate": 7.471791417414246e-05, + "loss": 3.0682, + "step": 47160 + }, + { + "epoch": 2.31, + "grad_norm": 0.6930948495864868, + "learning_rate": 7.47077472868126e-05, + "loss": 2.6749, + "step": 47161 + }, + { + "epoch": 2.31, + "grad_norm": 0.748211681842804, + "learning_rate": 7.46975809928593e-05, + "loss": 2.9393, + "step": 47162 + }, + { + "epoch": 2.31, + "grad_norm": 0.7243354916572571, + "learning_rate": 7.468741529230911e-05, + "loss": 2.8963, + "step": 47163 + }, + { + "epoch": 2.31, + "grad_norm": 0.7261446714401245, + "learning_rate": 7.4677250185189e-05, + "loss": 2.9313, + "step": 47164 + }, + { + "epoch": 2.31, + "grad_norm": 0.7376651167869568, + "learning_rate": 7.466708567152565e-05, + "loss": 2.7539, + "step": 47165 + }, + { + "epoch": 2.31, + "grad_norm": 0.7748557925224304, + "learning_rate": 7.465692175134575e-05, + "loss": 2.8347, + "step": 47166 + }, + { + "epoch": 2.31, + "grad_norm": 0.6955622434616089, + "learning_rate": 7.46467584246762e-05, + "loss": 2.6828, + "step": 47167 + }, + { + "epoch": 2.31, + "grad_norm": 0.7150453329086304, + "learning_rate": 7.463659569154367e-05, + "loss": 2.9824, + "step": 47168 + }, + { + "epoch": 2.31, + "grad_norm": 0.7358688712120056, + "learning_rate": 7.462643355197506e-05, + "loss": 2.9723, + "step": 47169 + }, + { + "epoch": 2.31, + "grad_norm": 0.7066740393638611, + "learning_rate": 7.461627200599694e-05, + "loss": 2.6598, + "step": 47170 + }, + { + "epoch": 2.31, + "grad_norm": 0.7047446370124817, + "learning_rate": 7.46061110536363e-05, + "loss": 2.9277, + "step": 47171 + }, + { + "epoch": 2.31, + "grad_norm": 0.7581452131271362, + "learning_rate": 7.459595069491976e-05, + "loss": 2.9114, + "step": 47172 + }, + { + "epoch": 2.31, + "grad_norm": 0.7496849894523621, + "learning_rate": 7.4585790929874e-05, + "loss": 2.9983, + "step": 47173 + }, + { + "epoch": 2.31, + "grad_norm": 0.7440602779388428, + "learning_rate": 7.4575631758526e-05, + "loss": 3.0909, + "step": 47174 + }, + { + "epoch": 2.31, + "grad_norm": 0.7057591080665588, + "learning_rate": 7.456547318090234e-05, + "loss": 2.9252, + "step": 47175 + }, + { + "epoch": 2.31, + "grad_norm": 0.7352858781814575, + "learning_rate": 7.455531519702978e-05, + "loss": 2.806, + "step": 47176 + }, + { + "epoch": 2.31, + "grad_norm": 0.7194220423698425, + "learning_rate": 7.454515780693527e-05, + "loss": 2.9611, + "step": 47177 + }, + { + "epoch": 2.31, + "grad_norm": 0.7085871696472168, + "learning_rate": 7.453500101064542e-05, + "loss": 3.1392, + "step": 47178 + }, + { + "epoch": 2.31, + "grad_norm": 0.7426517605781555, + "learning_rate": 7.452484480818694e-05, + "loss": 2.7845, + "step": 47179 + }, + { + "epoch": 2.31, + "grad_norm": 0.671974241733551, + "learning_rate": 7.451468919958657e-05, + "loss": 2.9442, + "step": 47180 + }, + { + "epoch": 2.31, + "grad_norm": 0.6885994672775269, + "learning_rate": 7.450453418487112e-05, + "loss": 2.8299, + "step": 47181 + }, + { + "epoch": 2.31, + "grad_norm": 0.7266910672187805, + "learning_rate": 7.44943797640674e-05, + "loss": 3.0384, + "step": 47182 + }, + { + "epoch": 2.31, + "grad_norm": 0.7851859927177429, + "learning_rate": 7.448422593720203e-05, + "loss": 2.8663, + "step": 47183 + }, + { + "epoch": 2.31, + "grad_norm": 0.704609751701355, + "learning_rate": 7.447407270430185e-05, + "loss": 2.9252, + "step": 47184 + }, + { + "epoch": 2.31, + "grad_norm": 0.705119252204895, + "learning_rate": 7.446392006539351e-05, + "loss": 3.0968, + "step": 47185 + }, + { + "epoch": 2.31, + "grad_norm": 0.739789605140686, + "learning_rate": 7.445376802050387e-05, + "loss": 2.7645, + "step": 47186 + }, + { + "epoch": 2.31, + "grad_norm": 0.7058214545249939, + "learning_rate": 7.444361656965956e-05, + "loss": 2.7778, + "step": 47187 + }, + { + "epoch": 2.31, + "grad_norm": 0.6922550201416016, + "learning_rate": 7.443346571288733e-05, + "loss": 3.1309, + "step": 47188 + }, + { + "epoch": 2.31, + "grad_norm": 0.7196448445320129, + "learning_rate": 7.4423315450214e-05, + "loss": 2.9757, + "step": 47189 + }, + { + "epoch": 2.31, + "grad_norm": 0.7384520173072815, + "learning_rate": 7.441316578166617e-05, + "loss": 2.8736, + "step": 47190 + }, + { + "epoch": 2.31, + "grad_norm": 0.8098287582397461, + "learning_rate": 7.440301670727062e-05, + "loss": 2.8157, + "step": 47191 + }, + { + "epoch": 2.31, + "grad_norm": 0.7096481919288635, + "learning_rate": 7.439286822705423e-05, + "loss": 2.9116, + "step": 47192 + }, + { + "epoch": 2.31, + "grad_norm": 0.7278345823287964, + "learning_rate": 7.438272034104356e-05, + "loss": 2.9966, + "step": 47193 + }, + { + "epoch": 2.31, + "grad_norm": 0.7431257367134094, + "learning_rate": 7.437257304926541e-05, + "loss": 2.9264, + "step": 47194 + }, + { + "epoch": 2.31, + "grad_norm": 0.7055538892745972, + "learning_rate": 7.436242635174638e-05, + "loss": 2.9575, + "step": 47195 + }, + { + "epoch": 2.31, + "grad_norm": 0.7002086639404297, + "learning_rate": 7.43522802485133e-05, + "loss": 2.7348, + "step": 47196 + }, + { + "epoch": 2.31, + "grad_norm": 0.7327378988265991, + "learning_rate": 7.434213473959299e-05, + "loss": 2.9672, + "step": 47197 + }, + { + "epoch": 2.31, + "grad_norm": 0.6708829402923584, + "learning_rate": 7.4331989825012e-05, + "loss": 2.9387, + "step": 47198 + }, + { + "epoch": 2.31, + "grad_norm": 0.6982355117797852, + "learning_rate": 7.432184550479715e-05, + "loss": 2.7249, + "step": 47199 + }, + { + "epoch": 2.31, + "grad_norm": 0.719652533531189, + "learning_rate": 7.431170177897514e-05, + "loss": 3.0177, + "step": 47200 + }, + { + "epoch": 2.31, + "grad_norm": 0.7651442885398865, + "learning_rate": 7.430155864757261e-05, + "loss": 3.0903, + "step": 47201 + }, + { + "epoch": 2.31, + "grad_norm": 0.7121278047561646, + "learning_rate": 7.42914161106164e-05, + "loss": 2.9088, + "step": 47202 + }, + { + "epoch": 2.31, + "grad_norm": 0.6944317817687988, + "learning_rate": 7.42812741681331e-05, + "loss": 3.1477, + "step": 47203 + }, + { + "epoch": 2.31, + "grad_norm": 0.7282336950302124, + "learning_rate": 7.427113282014955e-05, + "loss": 2.9578, + "step": 47204 + }, + { + "epoch": 2.31, + "grad_norm": 0.6893202662467957, + "learning_rate": 7.426099206669234e-05, + "loss": 2.8417, + "step": 47205 + }, + { + "epoch": 2.31, + "grad_norm": 0.7046981453895569, + "learning_rate": 7.425085190778818e-05, + "loss": 2.9142, + "step": 47206 + }, + { + "epoch": 2.31, + "grad_norm": 0.694952130317688, + "learning_rate": 7.424071234346395e-05, + "loss": 3.0216, + "step": 47207 + }, + { + "epoch": 2.31, + "grad_norm": 0.7959489822387695, + "learning_rate": 7.423057337374623e-05, + "loss": 2.7588, + "step": 47208 + }, + { + "epoch": 2.31, + "grad_norm": 0.8016543388366699, + "learning_rate": 7.422043499866174e-05, + "loss": 2.9142, + "step": 47209 + }, + { + "epoch": 2.31, + "grad_norm": 0.7541260719299316, + "learning_rate": 7.421029721823706e-05, + "loss": 2.8306, + "step": 47210 + }, + { + "epoch": 2.31, + "grad_norm": 0.7044775485992432, + "learning_rate": 7.420016003249904e-05, + "loss": 2.8552, + "step": 47211 + }, + { + "epoch": 2.31, + "grad_norm": 0.704594075679779, + "learning_rate": 7.419002344147441e-05, + "loss": 3.0789, + "step": 47212 + }, + { + "epoch": 2.31, + "grad_norm": 0.7283633351325989, + "learning_rate": 7.417988744518968e-05, + "loss": 2.8004, + "step": 47213 + }, + { + "epoch": 2.31, + "grad_norm": 0.9223310351371765, + "learning_rate": 7.416975204367182e-05, + "loss": 2.9323, + "step": 47214 + }, + { + "epoch": 2.31, + "grad_norm": 0.7180284857749939, + "learning_rate": 7.41596172369473e-05, + "loss": 3.002, + "step": 47215 + }, + { + "epoch": 2.31, + "grad_norm": 0.7672765254974365, + "learning_rate": 7.414948302504284e-05, + "loss": 3.1372, + "step": 47216 + }, + { + "epoch": 2.31, + "grad_norm": 0.7458218932151794, + "learning_rate": 7.413934940798525e-05, + "loss": 2.8517, + "step": 47217 + }, + { + "epoch": 2.31, + "grad_norm": 0.6984786987304688, + "learning_rate": 7.412921638580106e-05, + "loss": 2.946, + "step": 47218 + }, + { + "epoch": 2.31, + "grad_norm": 0.7315978407859802, + "learning_rate": 7.411908395851715e-05, + "loss": 2.9058, + "step": 47219 + }, + { + "epoch": 2.31, + "grad_norm": 0.7283531427383423, + "learning_rate": 7.410895212616e-05, + "loss": 3.0075, + "step": 47220 + }, + { + "epoch": 2.31, + "grad_norm": 0.7277526259422302, + "learning_rate": 7.409882088875649e-05, + "loss": 2.9993, + "step": 47221 + }, + { + "epoch": 2.31, + "grad_norm": 0.7374258637428284, + "learning_rate": 7.408869024633321e-05, + "loss": 2.9424, + "step": 47222 + }, + { + "epoch": 2.31, + "grad_norm": 0.7135753035545349, + "learning_rate": 7.407856019891673e-05, + "loss": 2.9005, + "step": 47223 + }, + { + "epoch": 2.31, + "grad_norm": 0.7380639910697937, + "learning_rate": 7.406843074653395e-05, + "loss": 3.0794, + "step": 47224 + }, + { + "epoch": 2.31, + "grad_norm": 0.7215073108673096, + "learning_rate": 7.405830188921134e-05, + "loss": 3.0077, + "step": 47225 + }, + { + "epoch": 2.31, + "grad_norm": 0.7313400506973267, + "learning_rate": 7.404817362697579e-05, + "loss": 3.0572, + "step": 47226 + }, + { + "epoch": 2.31, + "grad_norm": 0.7082253694534302, + "learning_rate": 7.403804595985377e-05, + "loss": 2.9114, + "step": 47227 + }, + { + "epoch": 2.31, + "grad_norm": 0.7084292769432068, + "learning_rate": 7.402791888787204e-05, + "loss": 2.8432, + "step": 47228 + }, + { + "epoch": 2.31, + "grad_norm": 0.6946665048599243, + "learning_rate": 7.401779241105739e-05, + "loss": 2.9464, + "step": 47229 + }, + { + "epoch": 2.31, + "grad_norm": 0.697638988494873, + "learning_rate": 7.400766652943636e-05, + "loss": 2.7472, + "step": 47230 + }, + { + "epoch": 2.31, + "grad_norm": 0.7209494709968567, + "learning_rate": 7.399754124303568e-05, + "loss": 2.576, + "step": 47231 + }, + { + "epoch": 2.31, + "grad_norm": 0.7583011984825134, + "learning_rate": 7.398741655188184e-05, + "loss": 3.1185, + "step": 47232 + }, + { + "epoch": 2.31, + "grad_norm": 0.8109094500541687, + "learning_rate": 7.39772924560017e-05, + "loss": 3.1152, + "step": 47233 + }, + { + "epoch": 2.31, + "grad_norm": 0.7156438231468201, + "learning_rate": 7.396716895542191e-05, + "loss": 2.9722, + "step": 47234 + }, + { + "epoch": 2.31, + "grad_norm": 0.712462842464447, + "learning_rate": 7.395704605016903e-05, + "loss": 2.8642, + "step": 47235 + }, + { + "epoch": 2.31, + "grad_norm": 0.6899236440658569, + "learning_rate": 7.394692374026989e-05, + "loss": 2.9877, + "step": 47236 + }, + { + "epoch": 2.31, + "grad_norm": 0.7041913270950317, + "learning_rate": 7.393680202575106e-05, + "loss": 2.9676, + "step": 47237 + }, + { + "epoch": 2.32, + "grad_norm": 0.7307323217391968, + "learning_rate": 7.392668090663905e-05, + "loss": 2.9229, + "step": 47238 + }, + { + "epoch": 2.32, + "grad_norm": 0.7059488892555237, + "learning_rate": 7.39165603829608e-05, + "loss": 3.1652, + "step": 47239 + }, + { + "epoch": 2.32, + "grad_norm": 0.723430335521698, + "learning_rate": 7.39064404547427e-05, + "loss": 2.8094, + "step": 47240 + }, + { + "epoch": 2.32, + "grad_norm": 0.700806200504303, + "learning_rate": 7.38963211220116e-05, + "loss": 2.7546, + "step": 47241 + }, + { + "epoch": 2.32, + "grad_norm": 0.7575515508651733, + "learning_rate": 7.388620238479399e-05, + "loss": 2.9442, + "step": 47242 + }, + { + "epoch": 2.32, + "grad_norm": 0.7537017464637756, + "learning_rate": 7.387608424311671e-05, + "loss": 2.818, + "step": 47243 + }, + { + "epoch": 2.32, + "grad_norm": 0.7906556129455566, + "learning_rate": 7.38659666970063e-05, + "loss": 2.6717, + "step": 47244 + }, + { + "epoch": 2.32, + "grad_norm": 0.7230514287948608, + "learning_rate": 7.385584974648932e-05, + "loss": 2.937, + "step": 47245 + }, + { + "epoch": 2.32, + "grad_norm": 0.7493695020675659, + "learning_rate": 7.384573339159261e-05, + "loss": 2.8982, + "step": 47246 + }, + { + "epoch": 2.32, + "grad_norm": 0.6845947504043579, + "learning_rate": 7.383561763234262e-05, + "loss": 2.7624, + "step": 47247 + }, + { + "epoch": 2.32, + "grad_norm": 0.6852810382843018, + "learning_rate": 7.382550246876609e-05, + "loss": 2.8006, + "step": 47248 + }, + { + "epoch": 2.32, + "grad_norm": 0.7320659756660461, + "learning_rate": 7.381538790088974e-05, + "loss": 2.9585, + "step": 47249 + }, + { + "epoch": 2.32, + "grad_norm": 0.704412579536438, + "learning_rate": 7.380527392874004e-05, + "loss": 2.7925, + "step": 47250 + }, + { + "epoch": 2.32, + "grad_norm": 0.7940717935562134, + "learning_rate": 7.379516055234381e-05, + "loss": 3.0233, + "step": 47251 + }, + { + "epoch": 2.32, + "grad_norm": 0.7513147592544556, + "learning_rate": 7.378504777172757e-05, + "loss": 3.0405, + "step": 47252 + }, + { + "epoch": 2.32, + "grad_norm": 0.7274273633956909, + "learning_rate": 7.377493558691793e-05, + "loss": 3.1427, + "step": 47253 + }, + { + "epoch": 2.32, + "grad_norm": 0.7024219632148743, + "learning_rate": 7.376482399794164e-05, + "loss": 2.915, + "step": 47254 + }, + { + "epoch": 2.32, + "grad_norm": 0.7163192629814148, + "learning_rate": 7.375471300482517e-05, + "loss": 3.0311, + "step": 47255 + }, + { + "epoch": 2.32, + "grad_norm": 0.685631513595581, + "learning_rate": 7.374460260759531e-05, + "loss": 2.7272, + "step": 47256 + }, + { + "epoch": 2.32, + "grad_norm": 0.7799549102783203, + "learning_rate": 7.373449280627858e-05, + "loss": 2.7492, + "step": 47257 + }, + { + "epoch": 2.32, + "grad_norm": 0.6964437365531921, + "learning_rate": 7.372438360090174e-05, + "loss": 2.9227, + "step": 47258 + }, + { + "epoch": 2.32, + "grad_norm": 0.7885956764221191, + "learning_rate": 7.371427499149127e-05, + "loss": 2.8981, + "step": 47259 + }, + { + "epoch": 2.32, + "grad_norm": 0.7466039657592773, + "learning_rate": 7.370416697807384e-05, + "loss": 2.7457, + "step": 47260 + }, + { + "epoch": 2.32, + "grad_norm": 0.7249606251716614, + "learning_rate": 7.369405956067612e-05, + "loss": 2.8658, + "step": 47261 + }, + { + "epoch": 2.32, + "grad_norm": 0.7615867853164673, + "learning_rate": 7.368395273932459e-05, + "loss": 2.8922, + "step": 47262 + }, + { + "epoch": 2.32, + "grad_norm": 0.6936567425727844, + "learning_rate": 7.367384651404602e-05, + "loss": 2.9543, + "step": 47263 + }, + { + "epoch": 2.32, + "grad_norm": 0.7578475475311279, + "learning_rate": 7.366374088486705e-05, + "loss": 2.954, + "step": 47264 + }, + { + "epoch": 2.32, + "grad_norm": 0.7116115689277649, + "learning_rate": 7.365363585181413e-05, + "loss": 3.0167, + "step": 47265 + }, + { + "epoch": 2.32, + "grad_norm": 0.7062137722969055, + "learning_rate": 7.364353141491413e-05, + "loss": 2.9352, + "step": 47266 + }, + { + "epoch": 2.32, + "grad_norm": 0.7953941226005554, + "learning_rate": 7.363342757419338e-05, + "loss": 2.9842, + "step": 47267 + }, + { + "epoch": 2.32, + "grad_norm": 0.7345401644706726, + "learning_rate": 7.362332432967857e-05, + "loss": 2.936, + "step": 47268 + }, + { + "epoch": 2.32, + "grad_norm": 0.768124520778656, + "learning_rate": 7.361322168139647e-05, + "loss": 2.9616, + "step": 47269 + }, + { + "epoch": 2.32, + "grad_norm": 0.7298167943954468, + "learning_rate": 7.360311962937347e-05, + "loss": 3.0135, + "step": 47270 + }, + { + "epoch": 2.32, + "grad_norm": 0.6858828067779541, + "learning_rate": 7.35930181736364e-05, + "loss": 3.1113, + "step": 47271 + }, + { + "epoch": 2.32, + "grad_norm": 0.7418141961097717, + "learning_rate": 7.358291731421165e-05, + "loss": 2.8592, + "step": 47272 + }, + { + "epoch": 2.32, + "grad_norm": 0.7051644921302795, + "learning_rate": 7.357281705112598e-05, + "loss": 2.9234, + "step": 47273 + }, + { + "epoch": 2.32, + "grad_norm": 0.7438302040100098, + "learning_rate": 7.356271738440597e-05, + "loss": 2.9264, + "step": 47274 + }, + { + "epoch": 2.32, + "grad_norm": 0.700322151184082, + "learning_rate": 7.355261831407807e-05, + "loss": 2.9011, + "step": 47275 + }, + { + "epoch": 2.32, + "grad_norm": 0.7566295862197876, + "learning_rate": 7.354251984016907e-05, + "loss": 3.0113, + "step": 47276 + }, + { + "epoch": 2.32, + "grad_norm": 0.7644765377044678, + "learning_rate": 7.353242196270545e-05, + "loss": 2.7803, + "step": 47277 + }, + { + "epoch": 2.32, + "grad_norm": 0.7297467589378357, + "learning_rate": 7.352232468171381e-05, + "loss": 2.927, + "step": 47278 + }, + { + "epoch": 2.32, + "grad_norm": 0.7533126473426819, + "learning_rate": 7.351222799722087e-05, + "loss": 2.7635, + "step": 47279 + }, + { + "epoch": 2.32, + "grad_norm": 0.7412834763526917, + "learning_rate": 7.350213190925316e-05, + "loss": 2.8367, + "step": 47280 + }, + { + "epoch": 2.32, + "grad_norm": 0.7042602300643921, + "learning_rate": 7.349203641783722e-05, + "loss": 2.9655, + "step": 47281 + }, + { + "epoch": 2.32, + "grad_norm": 0.7818167209625244, + "learning_rate": 7.348194152299956e-05, + "loss": 2.9304, + "step": 47282 + }, + { + "epoch": 2.32, + "grad_norm": 0.6732924580574036, + "learning_rate": 7.34718472247669e-05, + "loss": 2.804, + "step": 47283 + }, + { + "epoch": 2.32, + "grad_norm": 0.7412101030349731, + "learning_rate": 7.346175352316587e-05, + "loss": 2.9378, + "step": 47284 + }, + { + "epoch": 2.32, + "grad_norm": 0.7184247374534607, + "learning_rate": 7.345166041822291e-05, + "loss": 2.9313, + "step": 47285 + }, + { + "epoch": 2.32, + "grad_norm": 0.7510634064674377, + "learning_rate": 7.344156790996476e-05, + "loss": 2.8656, + "step": 47286 + }, + { + "epoch": 2.32, + "grad_norm": 0.6889684200286865, + "learning_rate": 7.343147599841783e-05, + "loss": 2.844, + "step": 47287 + }, + { + "epoch": 2.32, + "grad_norm": 0.688102126121521, + "learning_rate": 7.342138468360885e-05, + "loss": 2.9992, + "step": 47288 + }, + { + "epoch": 2.32, + "grad_norm": 0.7358808517456055, + "learning_rate": 7.341129396556437e-05, + "loss": 3.0776, + "step": 47289 + }, + { + "epoch": 2.32, + "grad_norm": 0.7751854062080383, + "learning_rate": 7.340120384431084e-05, + "loss": 2.8425, + "step": 47290 + }, + { + "epoch": 2.32, + "grad_norm": 0.7157254815101624, + "learning_rate": 7.339111431987501e-05, + "loss": 2.9434, + "step": 47291 + }, + { + "epoch": 2.32, + "grad_norm": 0.703119695186615, + "learning_rate": 7.338102539228328e-05, + "loss": 2.9188, + "step": 47292 + }, + { + "epoch": 2.32, + "grad_norm": 0.6870105862617493, + "learning_rate": 7.337093706156232e-05, + "loss": 3.0188, + "step": 47293 + }, + { + "epoch": 2.32, + "grad_norm": 0.753438413143158, + "learning_rate": 7.336084932773878e-05, + "loss": 2.8878, + "step": 47294 + }, + { + "epoch": 2.32, + "grad_norm": 0.715949535369873, + "learning_rate": 7.335076219083915e-05, + "loss": 2.9588, + "step": 47295 + }, + { + "epoch": 2.32, + "grad_norm": 0.6934475898742676, + "learning_rate": 7.334067565089001e-05, + "loss": 2.8501, + "step": 47296 + }, + { + "epoch": 2.32, + "grad_norm": 0.696337878704071, + "learning_rate": 7.333058970791781e-05, + "loss": 2.8632, + "step": 47297 + }, + { + "epoch": 2.32, + "grad_norm": 0.7058655023574829, + "learning_rate": 7.332050436194928e-05, + "loss": 3.0071, + "step": 47298 + }, + { + "epoch": 2.32, + "grad_norm": 0.7588328719139099, + "learning_rate": 7.331041961301086e-05, + "loss": 2.8746, + "step": 47299 + }, + { + "epoch": 2.32, + "grad_norm": 0.7144933938980103, + "learning_rate": 7.330033546112915e-05, + "loss": 2.7623, + "step": 47300 + }, + { + "epoch": 2.32, + "grad_norm": 0.7363554239273071, + "learning_rate": 7.329025190633081e-05, + "loss": 2.9335, + "step": 47301 + }, + { + "epoch": 2.32, + "grad_norm": 0.7441504597663879, + "learning_rate": 7.328016894864232e-05, + "loss": 2.7166, + "step": 47302 + }, + { + "epoch": 2.32, + "grad_norm": 0.7541495561599731, + "learning_rate": 7.327008658809025e-05, + "loss": 2.8355, + "step": 47303 + }, + { + "epoch": 2.32, + "grad_norm": 0.7827197909355164, + "learning_rate": 7.326000482470102e-05, + "loss": 3.0587, + "step": 47304 + }, + { + "epoch": 2.32, + "grad_norm": 0.7094131112098694, + "learning_rate": 7.324992365850133e-05, + "loss": 2.8584, + "step": 47305 + }, + { + "epoch": 2.32, + "grad_norm": 0.7611328959465027, + "learning_rate": 7.323984308951778e-05, + "loss": 2.8228, + "step": 47306 + }, + { + "epoch": 2.32, + "grad_norm": 0.6947973370552063, + "learning_rate": 7.322976311777675e-05, + "loss": 2.6941, + "step": 47307 + }, + { + "epoch": 2.32, + "grad_norm": 0.7067661881446838, + "learning_rate": 7.321968374330495e-05, + "loss": 3.1039, + "step": 47308 + }, + { + "epoch": 2.32, + "grad_norm": 0.7562273144721985, + "learning_rate": 7.320960496612881e-05, + "loss": 2.8796, + "step": 47309 + }, + { + "epoch": 2.32, + "grad_norm": 0.7289214134216309, + "learning_rate": 7.319952678627499e-05, + "loss": 3.0467, + "step": 47310 + }, + { + "epoch": 2.32, + "grad_norm": 0.7128106951713562, + "learning_rate": 7.318944920376997e-05, + "loss": 2.882, + "step": 47311 + }, + { + "epoch": 2.32, + "grad_norm": 0.7181450128555298, + "learning_rate": 7.31793722186402e-05, + "loss": 3.0642, + "step": 47312 + }, + { + "epoch": 2.32, + "grad_norm": 0.7351670265197754, + "learning_rate": 7.316929583091238e-05, + "loss": 2.862, + "step": 47313 + }, + { + "epoch": 2.32, + "grad_norm": 0.693118691444397, + "learning_rate": 7.315922004061292e-05, + "loss": 2.9532, + "step": 47314 + }, + { + "epoch": 2.32, + "grad_norm": 0.7117541432380676, + "learning_rate": 7.314914484776841e-05, + "loss": 2.8341, + "step": 47315 + }, + { + "epoch": 2.32, + "grad_norm": 0.690302312374115, + "learning_rate": 7.313907025240549e-05, + "loss": 2.7515, + "step": 47316 + }, + { + "epoch": 2.32, + "grad_norm": 0.7544472217559814, + "learning_rate": 7.312899625455057e-05, + "loss": 2.8819, + "step": 47317 + }, + { + "epoch": 2.32, + "grad_norm": 0.7129111886024475, + "learning_rate": 7.311892285423025e-05, + "loss": 2.878, + "step": 47318 + }, + { + "epoch": 2.32, + "grad_norm": 0.7087661623954773, + "learning_rate": 7.31088500514709e-05, + "loss": 2.8482, + "step": 47319 + }, + { + "epoch": 2.32, + "grad_norm": 0.7653520703315735, + "learning_rate": 7.30987778462992e-05, + "loss": 2.7622, + "step": 47320 + }, + { + "epoch": 2.32, + "grad_norm": 0.6924532055854797, + "learning_rate": 7.308870623874171e-05, + "loss": 2.8243, + "step": 47321 + }, + { + "epoch": 2.32, + "grad_norm": 0.7128939032554626, + "learning_rate": 7.307863522882483e-05, + "loss": 2.8937, + "step": 47322 + }, + { + "epoch": 2.32, + "grad_norm": 0.6976691484451294, + "learning_rate": 7.306856481657526e-05, + "loss": 2.7315, + "step": 47323 + }, + { + "epoch": 2.32, + "grad_norm": 0.7400538921356201, + "learning_rate": 7.30584950020194e-05, + "loss": 2.7653, + "step": 47324 + }, + { + "epoch": 2.32, + "grad_norm": 0.734775960445404, + "learning_rate": 7.30484257851837e-05, + "loss": 3.0132, + "step": 47325 + }, + { + "epoch": 2.32, + "grad_norm": 0.7255701422691345, + "learning_rate": 7.303835716609482e-05, + "loss": 2.882, + "step": 47326 + }, + { + "epoch": 2.32, + "grad_norm": 0.7620897889137268, + "learning_rate": 7.302828914477918e-05, + "loss": 2.9522, + "step": 47327 + }, + { + "epoch": 2.32, + "grad_norm": 0.8050826787948608, + "learning_rate": 7.301822172126344e-05, + "loss": 2.9349, + "step": 47328 + }, + { + "epoch": 2.32, + "grad_norm": 0.737463116645813, + "learning_rate": 7.300815489557394e-05, + "loss": 2.7589, + "step": 47329 + }, + { + "epoch": 2.32, + "grad_norm": 0.7167901992797852, + "learning_rate": 7.299808866773722e-05, + "loss": 3.015, + "step": 47330 + }, + { + "epoch": 2.32, + "grad_norm": 0.7121860384941101, + "learning_rate": 7.298802303777999e-05, + "loss": 2.9854, + "step": 47331 + }, + { + "epoch": 2.32, + "grad_norm": 0.7176076769828796, + "learning_rate": 7.297795800572858e-05, + "loss": 2.8822, + "step": 47332 + }, + { + "epoch": 2.32, + "grad_norm": 0.7200085520744324, + "learning_rate": 7.296789357160958e-05, + "loss": 3.0043, + "step": 47333 + }, + { + "epoch": 2.32, + "grad_norm": 0.7072357535362244, + "learning_rate": 7.295782973544932e-05, + "loss": 2.8592, + "step": 47334 + }, + { + "epoch": 2.32, + "grad_norm": 0.7582478523254395, + "learning_rate": 7.294776649727446e-05, + "loss": 3.0922, + "step": 47335 + }, + { + "epoch": 2.32, + "grad_norm": 0.767278254032135, + "learning_rate": 7.29377038571116e-05, + "loss": 2.7644, + "step": 47336 + }, + { + "epoch": 2.32, + "grad_norm": 0.7298552989959717, + "learning_rate": 7.2927641814987e-05, + "loss": 2.8319, + "step": 47337 + }, + { + "epoch": 2.32, + "grad_norm": 0.6883513331413269, + "learning_rate": 7.291758037092739e-05, + "loss": 3.0519, + "step": 47338 + }, + { + "epoch": 2.32, + "grad_norm": 0.7653185725212097, + "learning_rate": 7.290751952495918e-05, + "loss": 3.0538, + "step": 47339 + }, + { + "epoch": 2.32, + "grad_norm": 0.7106703519821167, + "learning_rate": 7.289745927710878e-05, + "loss": 2.7821, + "step": 47340 + }, + { + "epoch": 2.32, + "grad_norm": 0.7490907311439514, + "learning_rate": 7.288739962740285e-05, + "loss": 3.0965, + "step": 47341 + }, + { + "epoch": 2.32, + "grad_norm": 0.7440788149833679, + "learning_rate": 7.287734057586768e-05, + "loss": 2.9511, + "step": 47342 + }, + { + "epoch": 2.32, + "grad_norm": 0.7107495665550232, + "learning_rate": 7.286728212252997e-05, + "loss": 2.9284, + "step": 47343 + }, + { + "epoch": 2.32, + "grad_norm": 0.7320402264595032, + "learning_rate": 7.285722426741608e-05, + "loss": 2.8895, + "step": 47344 + }, + { + "epoch": 2.32, + "grad_norm": 0.7153285145759583, + "learning_rate": 7.284716701055263e-05, + "loss": 2.7128, + "step": 47345 + }, + { + "epoch": 2.32, + "grad_norm": 0.7777250409126282, + "learning_rate": 7.283711035196602e-05, + "loss": 2.8695, + "step": 47346 + }, + { + "epoch": 2.32, + "grad_norm": 0.7311616539955139, + "learning_rate": 7.282705429168263e-05, + "loss": 3.0246, + "step": 47347 + }, + { + "epoch": 2.32, + "grad_norm": 0.7358239889144897, + "learning_rate": 7.281699882972916e-05, + "loss": 2.912, + "step": 47348 + }, + { + "epoch": 2.32, + "grad_norm": 0.7160769701004028, + "learning_rate": 7.280694396613194e-05, + "loss": 3.0558, + "step": 47349 + }, + { + "epoch": 2.32, + "grad_norm": 0.8619295358657837, + "learning_rate": 7.279688970091744e-05, + "loss": 2.6971, + "step": 47350 + }, + { + "epoch": 2.32, + "grad_norm": 0.7436091303825378, + "learning_rate": 7.278683603411235e-05, + "loss": 2.9886, + "step": 47351 + }, + { + "epoch": 2.32, + "grad_norm": 0.7118484973907471, + "learning_rate": 7.277678296574288e-05, + "loss": 2.9102, + "step": 47352 + }, + { + "epoch": 2.32, + "grad_norm": 0.7110477089881897, + "learning_rate": 7.276673049583576e-05, + "loss": 2.9047, + "step": 47353 + }, + { + "epoch": 2.32, + "grad_norm": 0.7068123817443848, + "learning_rate": 7.275667862441731e-05, + "loss": 3.0401, + "step": 47354 + }, + { + "epoch": 2.32, + "grad_norm": 0.7033482789993286, + "learning_rate": 7.274662735151396e-05, + "loss": 2.8322, + "step": 47355 + }, + { + "epoch": 2.32, + "grad_norm": 0.707513689994812, + "learning_rate": 7.273657667715235e-05, + "loss": 2.9297, + "step": 47356 + }, + { + "epoch": 2.32, + "grad_norm": 0.7551565170288086, + "learning_rate": 7.272652660135877e-05, + "loss": 3.0123, + "step": 47357 + }, + { + "epoch": 2.32, + "grad_norm": 0.7359175682067871, + "learning_rate": 7.271647712415987e-05, + "loss": 2.9357, + "step": 47358 + }, + { + "epoch": 2.32, + "grad_norm": 0.6989636421203613, + "learning_rate": 7.270642824558192e-05, + "loss": 2.9436, + "step": 47359 + }, + { + "epoch": 2.32, + "grad_norm": 0.7528616189956665, + "learning_rate": 7.269637996565159e-05, + "loss": 2.9145, + "step": 47360 + }, + { + "epoch": 2.32, + "grad_norm": 0.7683088183403015, + "learning_rate": 7.268633228439526e-05, + "loss": 2.9134, + "step": 47361 + }, + { + "epoch": 2.32, + "grad_norm": 0.7808603048324585, + "learning_rate": 7.267628520183927e-05, + "loss": 2.794, + "step": 47362 + }, + { + "epoch": 2.32, + "grad_norm": 0.763262927532196, + "learning_rate": 7.266623871801032e-05, + "loss": 2.8442, + "step": 47363 + }, + { + "epoch": 2.32, + "grad_norm": 0.7707550525665283, + "learning_rate": 7.265619283293464e-05, + "loss": 2.8606, + "step": 47364 + }, + { + "epoch": 2.32, + "grad_norm": 0.7410865426063538, + "learning_rate": 7.26461475466388e-05, + "loss": 2.995, + "step": 47365 + }, + { + "epoch": 2.32, + "grad_norm": 0.7402563095092773, + "learning_rate": 7.263610285914933e-05, + "loss": 2.8527, + "step": 47366 + }, + { + "epoch": 2.32, + "grad_norm": 0.774781346321106, + "learning_rate": 7.26260587704925e-05, + "loss": 2.9038, + "step": 47367 + }, + { + "epoch": 2.32, + "grad_norm": 0.7119985222816467, + "learning_rate": 7.261601528069507e-05, + "loss": 2.8396, + "step": 47368 + }, + { + "epoch": 2.32, + "grad_norm": 0.8101449608802795, + "learning_rate": 7.260597238978316e-05, + "loss": 2.9007, + "step": 47369 + }, + { + "epoch": 2.32, + "grad_norm": 0.7611187696456909, + "learning_rate": 7.259593009778329e-05, + "loss": 2.9946, + "step": 47370 + }, + { + "epoch": 2.32, + "grad_norm": 0.7523656487464905, + "learning_rate": 7.258588840472209e-05, + "loss": 2.9665, + "step": 47371 + }, + { + "epoch": 2.32, + "grad_norm": 0.7050994634628296, + "learning_rate": 7.257584731062583e-05, + "loss": 2.9543, + "step": 47372 + }, + { + "epoch": 2.32, + "grad_norm": 0.7177386283874512, + "learning_rate": 7.256580681552106e-05, + "loss": 3.003, + "step": 47373 + }, + { + "epoch": 2.32, + "grad_norm": 0.7362361550331116, + "learning_rate": 7.255576691943413e-05, + "loss": 2.6465, + "step": 47374 + }, + { + "epoch": 2.32, + "grad_norm": 0.7049803733825684, + "learning_rate": 7.25457276223916e-05, + "loss": 2.9968, + "step": 47375 + }, + { + "epoch": 2.32, + "grad_norm": 0.6966225504875183, + "learning_rate": 7.253568892441988e-05, + "loss": 2.8923, + "step": 47376 + }, + { + "epoch": 2.32, + "grad_norm": 0.733306884765625, + "learning_rate": 7.252565082554526e-05, + "loss": 2.816, + "step": 47377 + }, + { + "epoch": 2.32, + "grad_norm": 0.7428738474845886, + "learning_rate": 7.251561332579441e-05, + "loss": 2.5529, + "step": 47378 + }, + { + "epoch": 2.32, + "grad_norm": 0.7241402268409729, + "learning_rate": 7.250557642519354e-05, + "loss": 2.6333, + "step": 47379 + }, + { + "epoch": 2.32, + "grad_norm": 0.7124062776565552, + "learning_rate": 7.249554012376931e-05, + "loss": 2.9414, + "step": 47380 + }, + { + "epoch": 2.32, + "grad_norm": 0.721976101398468, + "learning_rate": 7.248550442154797e-05, + "loss": 2.9294, + "step": 47381 + }, + { + "epoch": 2.32, + "grad_norm": 0.7031937837600708, + "learning_rate": 7.247546931855609e-05, + "loss": 2.6558, + "step": 47382 + }, + { + "epoch": 2.32, + "grad_norm": 0.7368708848953247, + "learning_rate": 7.246543481482002e-05, + "loss": 2.6632, + "step": 47383 + }, + { + "epoch": 2.32, + "grad_norm": 0.7131219506263733, + "learning_rate": 7.245540091036614e-05, + "loss": 2.7965, + "step": 47384 + }, + { + "epoch": 2.32, + "grad_norm": 0.7883298993110657, + "learning_rate": 7.244536760522103e-05, + "loss": 2.7718, + "step": 47385 + }, + { + "epoch": 2.32, + "grad_norm": 0.8104693293571472, + "learning_rate": 7.243533489941094e-05, + "loss": 2.874, + "step": 47386 + }, + { + "epoch": 2.32, + "grad_norm": 0.7419458031654358, + "learning_rate": 7.242530279296239e-05, + "loss": 2.8306, + "step": 47387 + }, + { + "epoch": 2.32, + "grad_norm": 0.7081753015518188, + "learning_rate": 7.241527128590185e-05, + "loss": 2.8082, + "step": 47388 + }, + { + "epoch": 2.32, + "grad_norm": 0.7221062183380127, + "learning_rate": 7.240524037825561e-05, + "loss": 2.83, + "step": 47389 + }, + { + "epoch": 2.32, + "grad_norm": 0.736663281917572, + "learning_rate": 7.239521007005026e-05, + "loss": 2.8742, + "step": 47390 + }, + { + "epoch": 2.32, + "grad_norm": 0.7576841711997986, + "learning_rate": 7.238518036131214e-05, + "loss": 2.9401, + "step": 47391 + }, + { + "epoch": 2.32, + "grad_norm": 0.7570251822471619, + "learning_rate": 7.237515125206755e-05, + "loss": 2.7961, + "step": 47392 + }, + { + "epoch": 2.32, + "grad_norm": 0.7038759589195251, + "learning_rate": 7.236512274234308e-05, + "loss": 2.9395, + "step": 47393 + }, + { + "epoch": 2.32, + "grad_norm": 0.714281439781189, + "learning_rate": 7.235509483216501e-05, + "loss": 3.0216, + "step": 47394 + }, + { + "epoch": 2.32, + "grad_norm": 0.7493951916694641, + "learning_rate": 7.234506752155986e-05, + "loss": 2.8871, + "step": 47395 + }, + { + "epoch": 2.32, + "grad_norm": 0.7389973998069763, + "learning_rate": 7.233504081055393e-05, + "loss": 3.0943, + "step": 47396 + }, + { + "epoch": 2.32, + "grad_norm": 0.7300322651863098, + "learning_rate": 7.232501469917377e-05, + "loss": 2.9865, + "step": 47397 + }, + { + "epoch": 2.32, + "grad_norm": 0.6961638331413269, + "learning_rate": 7.231498918744567e-05, + "loss": 2.7805, + "step": 47398 + }, + { + "epoch": 2.32, + "grad_norm": 0.7474333047866821, + "learning_rate": 7.230496427539601e-05, + "loss": 2.7427, + "step": 47399 + }, + { + "epoch": 2.32, + "grad_norm": 0.7698484063148499, + "learning_rate": 7.229493996305135e-05, + "loss": 2.8524, + "step": 47400 + }, + { + "epoch": 2.32, + "grad_norm": 0.7011622190475464, + "learning_rate": 7.228491625043791e-05, + "loss": 3.0087, + "step": 47401 + }, + { + "epoch": 2.32, + "grad_norm": 0.7040664553642273, + "learning_rate": 7.227489313758214e-05, + "loss": 2.7225, + "step": 47402 + }, + { + "epoch": 2.32, + "grad_norm": 0.7392282485961914, + "learning_rate": 7.226487062451061e-05, + "loss": 2.9377, + "step": 47403 + }, + { + "epoch": 2.32, + "grad_norm": 0.697232723236084, + "learning_rate": 7.225484871124953e-05, + "loss": 2.9972, + "step": 47404 + }, + { + "epoch": 2.32, + "grad_norm": 0.6810755133628845, + "learning_rate": 7.224482739782538e-05, + "loss": 3.0003, + "step": 47405 + }, + { + "epoch": 2.32, + "grad_norm": 0.7662976384162903, + "learning_rate": 7.223480668426444e-05, + "loss": 2.6729, + "step": 47406 + }, + { + "epoch": 2.32, + "grad_norm": 0.7100536227226257, + "learning_rate": 7.222478657059318e-05, + "loss": 2.8443, + "step": 47407 + }, + { + "epoch": 2.32, + "grad_norm": 0.7035341262817383, + "learning_rate": 7.221476705683807e-05, + "loss": 2.8649, + "step": 47408 + }, + { + "epoch": 2.32, + "grad_norm": 0.7688108086585999, + "learning_rate": 7.220474814302536e-05, + "loss": 2.8819, + "step": 47409 + }, + { + "epoch": 2.32, + "grad_norm": 0.7461357116699219, + "learning_rate": 7.219472982918155e-05, + "loss": 2.9177, + "step": 47410 + }, + { + "epoch": 2.32, + "grad_norm": 0.727735161781311, + "learning_rate": 7.218471211533293e-05, + "loss": 2.9527, + "step": 47411 + }, + { + "epoch": 2.32, + "grad_norm": 0.7366343140602112, + "learning_rate": 7.217469500150602e-05, + "loss": 2.886, + "step": 47412 + }, + { + "epoch": 2.32, + "grad_norm": 0.76343834400177, + "learning_rate": 7.21646784877271e-05, + "loss": 2.9228, + "step": 47413 + }, + { + "epoch": 2.32, + "grad_norm": 0.7279701828956604, + "learning_rate": 7.215466257402252e-05, + "loss": 2.9934, + "step": 47414 + }, + { + "epoch": 2.32, + "grad_norm": 0.7330734133720398, + "learning_rate": 7.214464726041873e-05, + "loss": 2.8277, + "step": 47415 + }, + { + "epoch": 2.32, + "grad_norm": 0.7413004636764526, + "learning_rate": 7.213463254694206e-05, + "loss": 2.9595, + "step": 47416 + }, + { + "epoch": 2.32, + "grad_norm": 0.7532314658164978, + "learning_rate": 7.212461843361888e-05, + "loss": 2.9631, + "step": 47417 + }, + { + "epoch": 2.32, + "grad_norm": 0.7230085134506226, + "learning_rate": 7.211460492047571e-05, + "loss": 3.1592, + "step": 47418 + }, + { + "epoch": 2.32, + "grad_norm": 0.7556701302528381, + "learning_rate": 7.21045920075388e-05, + "loss": 2.9073, + "step": 47419 + }, + { + "epoch": 2.32, + "grad_norm": 0.713424801826477, + "learning_rate": 7.209457969483454e-05, + "loss": 3.1115, + "step": 47420 + }, + { + "epoch": 2.32, + "grad_norm": 0.7252991199493408, + "learning_rate": 7.20845679823892e-05, + "loss": 2.8228, + "step": 47421 + }, + { + "epoch": 2.32, + "grad_norm": 0.7200642228126526, + "learning_rate": 7.207455687022924e-05, + "loss": 2.9238, + "step": 47422 + }, + { + "epoch": 2.32, + "grad_norm": 0.7238421440124512, + "learning_rate": 7.206454635838115e-05, + "loss": 2.826, + "step": 47423 + }, + { + "epoch": 2.32, + "grad_norm": 0.7494496703147888, + "learning_rate": 7.205453644687107e-05, + "loss": 2.9615, + "step": 47424 + }, + { + "epoch": 2.32, + "grad_norm": 0.7229695916175842, + "learning_rate": 7.204452713572555e-05, + "loss": 2.6852, + "step": 47425 + }, + { + "epoch": 2.32, + "grad_norm": 0.7765418887138367, + "learning_rate": 7.20345184249709e-05, + "loss": 2.8497, + "step": 47426 + }, + { + "epoch": 2.32, + "grad_norm": 0.7109969258308411, + "learning_rate": 7.202451031463334e-05, + "loss": 2.8812, + "step": 47427 + }, + { + "epoch": 2.32, + "grad_norm": 0.7664215564727783, + "learning_rate": 7.201450280473943e-05, + "loss": 2.7231, + "step": 47428 + }, + { + "epoch": 2.32, + "grad_norm": 0.7280341982841492, + "learning_rate": 7.20044958953154e-05, + "loss": 3.0021, + "step": 47429 + }, + { + "epoch": 2.32, + "grad_norm": 0.7474103569984436, + "learning_rate": 7.199448958638768e-05, + "loss": 2.6722, + "step": 47430 + }, + { + "epoch": 2.32, + "grad_norm": 0.8022735714912415, + "learning_rate": 7.198448387798253e-05, + "loss": 2.6483, + "step": 47431 + }, + { + "epoch": 2.32, + "grad_norm": 0.7711005210876465, + "learning_rate": 7.19744787701264e-05, + "loss": 2.7537, + "step": 47432 + }, + { + "epoch": 2.32, + "grad_norm": 0.7397340536117554, + "learning_rate": 7.196447426284565e-05, + "loss": 2.9846, + "step": 47433 + }, + { + "epoch": 2.32, + "grad_norm": 0.7473747730255127, + "learning_rate": 7.195447035616661e-05, + "loss": 2.7482, + "step": 47434 + }, + { + "epoch": 2.32, + "grad_norm": 0.7164973020553589, + "learning_rate": 7.194446705011561e-05, + "loss": 3.0895, + "step": 47435 + }, + { + "epoch": 2.32, + "grad_norm": 0.7285258769989014, + "learning_rate": 7.19344643447189e-05, + "loss": 2.9992, + "step": 47436 + }, + { + "epoch": 2.32, + "grad_norm": 0.741519033908844, + "learning_rate": 7.19244622400029e-05, + "loss": 2.8107, + "step": 47437 + }, + { + "epoch": 2.32, + "grad_norm": 0.7468116283416748, + "learning_rate": 7.191446073599408e-05, + "loss": 3.1146, + "step": 47438 + }, + { + "epoch": 2.32, + "grad_norm": 0.7286897301673889, + "learning_rate": 7.19044598327186e-05, + "loss": 2.855, + "step": 47439 + }, + { + "epoch": 2.32, + "grad_norm": 0.7384865880012512, + "learning_rate": 7.189445953020292e-05, + "loss": 2.6907, + "step": 47440 + }, + { + "epoch": 2.32, + "grad_norm": 0.7932009696960449, + "learning_rate": 7.188445982847336e-05, + "loss": 2.9254, + "step": 47441 + }, + { + "epoch": 2.33, + "grad_norm": 0.7377317547798157, + "learning_rate": 7.187446072755613e-05, + "loss": 2.8892, + "step": 47442 + }, + { + "epoch": 2.33, + "grad_norm": 0.7107809782028198, + "learning_rate": 7.18644622274778e-05, + "loss": 2.9628, + "step": 47443 + }, + { + "epoch": 2.33, + "grad_norm": 0.8373969197273254, + "learning_rate": 7.185446432826444e-05, + "loss": 2.7945, + "step": 47444 + }, + { + "epoch": 2.33, + "grad_norm": 0.7064989805221558, + "learning_rate": 7.184446702994262e-05, + "loss": 2.7455, + "step": 47445 + }, + { + "epoch": 2.33, + "grad_norm": 0.7400098443031311, + "learning_rate": 7.183447033253845e-05, + "loss": 2.8944, + "step": 47446 + }, + { + "epoch": 2.33, + "grad_norm": 0.8077271580696106, + "learning_rate": 7.182447423607852e-05, + "loss": 3.0493, + "step": 47447 + }, + { + "epoch": 2.33, + "grad_norm": 0.6956033706665039, + "learning_rate": 7.181447874058895e-05, + "loss": 2.9979, + "step": 47448 + }, + { + "epoch": 2.33, + "grad_norm": 0.6707293391227722, + "learning_rate": 7.180448384609606e-05, + "loss": 2.9002, + "step": 47449 + }, + { + "epoch": 2.33, + "grad_norm": 0.6920158863067627, + "learning_rate": 7.179448955262634e-05, + "loss": 3.0506, + "step": 47450 + }, + { + "epoch": 2.33, + "grad_norm": 0.7085862159729004, + "learning_rate": 7.17844958602059e-05, + "loss": 2.8566, + "step": 47451 + }, + { + "epoch": 2.33, + "grad_norm": 0.7076865434646606, + "learning_rate": 7.177450276886129e-05, + "loss": 2.936, + "step": 47452 + }, + { + "epoch": 2.33, + "grad_norm": 0.7326858043670654, + "learning_rate": 7.176451027861863e-05, + "loss": 3.1567, + "step": 47453 + }, + { + "epoch": 2.33, + "grad_norm": 0.6912264823913574, + "learning_rate": 7.175451838950433e-05, + "loss": 3.0702, + "step": 47454 + }, + { + "epoch": 2.33, + "grad_norm": 0.7292463779449463, + "learning_rate": 7.174452710154475e-05, + "loss": 2.7248, + "step": 47455 + }, + { + "epoch": 2.33, + "grad_norm": 0.7355402112007141, + "learning_rate": 7.173453641476622e-05, + "loss": 2.8516, + "step": 47456 + }, + { + "epoch": 2.33, + "grad_norm": 0.7186945080757141, + "learning_rate": 7.172454632919493e-05, + "loss": 2.7671, + "step": 47457 + }, + { + "epoch": 2.33, + "grad_norm": 0.7160497903823853, + "learning_rate": 7.171455684485721e-05, + "loss": 2.8474, + "step": 47458 + }, + { + "epoch": 2.33, + "grad_norm": 0.7210008502006531, + "learning_rate": 7.170456796177939e-05, + "loss": 2.6995, + "step": 47459 + }, + { + "epoch": 2.33, + "grad_norm": 0.7707409858703613, + "learning_rate": 7.16945796799879e-05, + "loss": 2.9242, + "step": 47460 + }, + { + "epoch": 2.33, + "grad_norm": 0.6895835399627686, + "learning_rate": 7.168459199950884e-05, + "loss": 2.9707, + "step": 47461 + }, + { + "epoch": 2.33, + "grad_norm": 0.734785795211792, + "learning_rate": 7.167460492036871e-05, + "loss": 2.9447, + "step": 47462 + }, + { + "epoch": 2.33, + "grad_norm": 0.7238161563873291, + "learning_rate": 7.166461844259376e-05, + "loss": 2.8899, + "step": 47463 + }, + { + "epoch": 2.33, + "grad_norm": 0.6985582709312439, + "learning_rate": 7.165463256621016e-05, + "loss": 3.0099, + "step": 47464 + }, + { + "epoch": 2.33, + "grad_norm": 0.8546037673950195, + "learning_rate": 7.164464729124439e-05, + "loss": 2.9665, + "step": 47465 + }, + { + "epoch": 2.33, + "grad_norm": 0.7167295813560486, + "learning_rate": 7.163466261772261e-05, + "loss": 2.8678, + "step": 47466 + }, + { + "epoch": 2.33, + "grad_norm": 0.7737851142883301, + "learning_rate": 7.162467854567125e-05, + "loss": 2.9525, + "step": 47467 + }, + { + "epoch": 2.33, + "grad_norm": 0.741790771484375, + "learning_rate": 7.161469507511642e-05, + "loss": 3.0993, + "step": 47468 + }, + { + "epoch": 2.33, + "grad_norm": 0.7221540808677673, + "learning_rate": 7.160471220608466e-05, + "loss": 2.7828, + "step": 47469 + }, + { + "epoch": 2.33, + "grad_norm": 0.7722229957580566, + "learning_rate": 7.159472993860212e-05, + "loss": 2.8201, + "step": 47470 + }, + { + "epoch": 2.33, + "grad_norm": 0.7124035358428955, + "learning_rate": 7.1584748272695e-05, + "loss": 2.8623, + "step": 47471 + }, + { + "epoch": 2.33, + "grad_norm": 0.6828311681747437, + "learning_rate": 7.157476720838981e-05, + "loss": 3.045, + "step": 47472 + }, + { + "epoch": 2.33, + "grad_norm": 0.6699220538139343, + "learning_rate": 7.156478674571261e-05, + "loss": 2.8361, + "step": 47473 + }, + { + "epoch": 2.33, + "grad_norm": 0.7215057015419006, + "learning_rate": 7.15548068846898e-05, + "loss": 3.0085, + "step": 47474 + }, + { + "epoch": 2.33, + "grad_norm": 0.7264130115509033, + "learning_rate": 7.154482762534776e-05, + "loss": 2.7652, + "step": 47475 + }, + { + "epoch": 2.33, + "grad_norm": 0.7632395029067993, + "learning_rate": 7.15348489677126e-05, + "loss": 2.9227, + "step": 47476 + }, + { + "epoch": 2.33, + "grad_norm": 0.7025119662284851, + "learning_rate": 7.152487091181074e-05, + "loss": 2.9366, + "step": 47477 + }, + { + "epoch": 2.33, + "grad_norm": 0.700484037399292, + "learning_rate": 7.151489345766842e-05, + "loss": 2.8368, + "step": 47478 + }, + { + "epoch": 2.33, + "grad_norm": 0.7083252668380737, + "learning_rate": 7.150491660531179e-05, + "loss": 2.9546, + "step": 47479 + }, + { + "epoch": 2.33, + "grad_norm": 0.7263261675834656, + "learning_rate": 7.149494035476732e-05, + "loss": 3.1148, + "step": 47480 + }, + { + "epoch": 2.33, + "grad_norm": 0.755137026309967, + "learning_rate": 7.148496470606114e-05, + "loss": 2.9635, + "step": 47481 + }, + { + "epoch": 2.33, + "grad_norm": 0.7463118433952332, + "learning_rate": 7.147498965921965e-05, + "loss": 2.8403, + "step": 47482 + }, + { + "epoch": 2.33, + "grad_norm": 0.6622539162635803, + "learning_rate": 7.1465015214269e-05, + "loss": 2.8063, + "step": 47483 + }, + { + "epoch": 2.33, + "grad_norm": 0.7079895734786987, + "learning_rate": 7.145504137123555e-05, + "loss": 2.8559, + "step": 47484 + }, + { + "epoch": 2.33, + "grad_norm": 0.6947416067123413, + "learning_rate": 7.144506813014557e-05, + "loss": 2.9942, + "step": 47485 + }, + { + "epoch": 2.33, + "grad_norm": 0.7773357629776001, + "learning_rate": 7.143509549102524e-05, + "loss": 3.0241, + "step": 47486 + }, + { + "epoch": 2.33, + "grad_norm": 0.7051486372947693, + "learning_rate": 7.142512345390093e-05, + "loss": 2.9468, + "step": 47487 + }, + { + "epoch": 2.33, + "grad_norm": 0.7427360415458679, + "learning_rate": 7.141515201879877e-05, + "loss": 2.8745, + "step": 47488 + }, + { + "epoch": 2.33, + "grad_norm": 0.6730073094367981, + "learning_rate": 7.14051811857451e-05, + "loss": 2.7274, + "step": 47489 + }, + { + "epoch": 2.33, + "grad_norm": 0.6964417695999146, + "learning_rate": 7.139521095476629e-05, + "loss": 2.8322, + "step": 47490 + }, + { + "epoch": 2.33, + "grad_norm": 0.6890613436698914, + "learning_rate": 7.13852413258884e-05, + "loss": 3.0825, + "step": 47491 + }, + { + "epoch": 2.33, + "grad_norm": 0.7256914973258972, + "learning_rate": 7.137527229913796e-05, + "loss": 2.9446, + "step": 47492 + }, + { + "epoch": 2.33, + "grad_norm": 0.6925685405731201, + "learning_rate": 7.136530387454089e-05, + "loss": 2.8681, + "step": 47493 + }, + { + "epoch": 2.33, + "grad_norm": 0.712538480758667, + "learning_rate": 7.135533605212363e-05, + "loss": 3.152, + "step": 47494 + }, + { + "epoch": 2.33, + "grad_norm": 0.7189764380455017, + "learning_rate": 7.134536883191244e-05, + "loss": 2.7111, + "step": 47495 + }, + { + "epoch": 2.33, + "grad_norm": 0.73213130235672, + "learning_rate": 7.133540221393349e-05, + "loss": 2.8043, + "step": 47496 + }, + { + "epoch": 2.33, + "grad_norm": 0.704254150390625, + "learning_rate": 7.132543619821318e-05, + "loss": 2.9035, + "step": 47497 + }, + { + "epoch": 2.33, + "grad_norm": 0.7130454778671265, + "learning_rate": 7.131547078477757e-05, + "loss": 2.6674, + "step": 47498 + }, + { + "epoch": 2.33, + "grad_norm": 0.6928410530090332, + "learning_rate": 7.130550597365308e-05, + "loss": 2.9636, + "step": 47499 + }, + { + "epoch": 2.33, + "grad_norm": 0.6888012886047363, + "learning_rate": 7.129554176486587e-05, + "loss": 2.921, + "step": 47500 + }, + { + "epoch": 2.33, + "grad_norm": 0.7133302688598633, + "learning_rate": 7.12855781584421e-05, + "loss": 2.9925, + "step": 47501 + }, + { + "epoch": 2.33, + "grad_norm": 0.7603508234024048, + "learning_rate": 7.127561515440818e-05, + "loss": 2.8541, + "step": 47502 + }, + { + "epoch": 2.33, + "grad_norm": 0.8669589757919312, + "learning_rate": 7.126565275279019e-05, + "loss": 2.982, + "step": 47503 + }, + { + "epoch": 2.33, + "grad_norm": 0.7202988862991333, + "learning_rate": 7.125569095361447e-05, + "loss": 2.8785, + "step": 47504 + }, + { + "epoch": 2.33, + "grad_norm": 0.6969466805458069, + "learning_rate": 7.124572975690731e-05, + "loss": 3.0048, + "step": 47505 + }, + { + "epoch": 2.33, + "grad_norm": 0.7168153524398804, + "learning_rate": 7.123576916269487e-05, + "loss": 2.8723, + "step": 47506 + }, + { + "epoch": 2.33, + "grad_norm": 0.7444748282432556, + "learning_rate": 7.122580917100338e-05, + "loss": 3.053, + "step": 47507 + }, + { + "epoch": 2.33, + "grad_norm": 0.7564586997032166, + "learning_rate": 7.121584978185903e-05, + "loss": 2.9999, + "step": 47508 + }, + { + "epoch": 2.33, + "grad_norm": 0.7514041066169739, + "learning_rate": 7.120589099528807e-05, + "loss": 2.8877, + "step": 47509 + }, + { + "epoch": 2.33, + "grad_norm": 0.7307299971580505, + "learning_rate": 7.119593281131684e-05, + "loss": 3.0657, + "step": 47510 + }, + { + "epoch": 2.33, + "grad_norm": 0.7431744933128357, + "learning_rate": 7.118597522997142e-05, + "loss": 2.7934, + "step": 47511 + }, + { + "epoch": 2.33, + "grad_norm": 0.7438200116157532, + "learning_rate": 7.117601825127817e-05, + "loss": 2.9412, + "step": 47512 + }, + { + "epoch": 2.33, + "grad_norm": 0.7108175754547119, + "learning_rate": 7.11660618752632e-05, + "loss": 2.8302, + "step": 47513 + }, + { + "epoch": 2.33, + "grad_norm": 0.6957178711891174, + "learning_rate": 7.115610610195285e-05, + "loss": 3.1017, + "step": 47514 + }, + { + "epoch": 2.33, + "grad_norm": 0.7372037768363953, + "learning_rate": 7.114615093137326e-05, + "loss": 2.8933, + "step": 47515 + }, + { + "epoch": 2.33, + "grad_norm": 0.7208225131034851, + "learning_rate": 7.113619636355061e-05, + "loss": 3.0632, + "step": 47516 + }, + { + "epoch": 2.33, + "grad_norm": 0.7117692232131958, + "learning_rate": 7.112624239851123e-05, + "loss": 2.95, + "step": 47517 + }, + { + "epoch": 2.33, + "grad_norm": 0.7726485133171082, + "learning_rate": 7.11162890362812e-05, + "loss": 3.0097, + "step": 47518 + }, + { + "epoch": 2.33, + "grad_norm": 0.7577906847000122, + "learning_rate": 7.110633627688684e-05, + "loss": 2.875, + "step": 47519 + }, + { + "epoch": 2.33, + "grad_norm": 0.7112172245979309, + "learning_rate": 7.109638412035439e-05, + "loss": 2.795, + "step": 47520 + }, + { + "epoch": 2.33, + "grad_norm": 0.7526218891143799, + "learning_rate": 7.108643256671002e-05, + "loss": 3.0337, + "step": 47521 + }, + { + "epoch": 2.33, + "grad_norm": 0.883800208568573, + "learning_rate": 7.107648161597993e-05, + "loss": 2.7492, + "step": 47522 + }, + { + "epoch": 2.33, + "grad_norm": 0.7418673634529114, + "learning_rate": 7.106653126819027e-05, + "loss": 2.9126, + "step": 47523 + }, + { + "epoch": 2.33, + "grad_norm": 0.7279163002967834, + "learning_rate": 7.10565815233673e-05, + "loss": 3.0511, + "step": 47524 + }, + { + "epoch": 2.33, + "grad_norm": 0.7238751649856567, + "learning_rate": 7.104663238153731e-05, + "loss": 2.8631, + "step": 47525 + }, + { + "epoch": 2.33, + "grad_norm": 0.7138600945472717, + "learning_rate": 7.103668384272632e-05, + "loss": 2.7983, + "step": 47526 + }, + { + "epoch": 2.33, + "grad_norm": 0.7913941144943237, + "learning_rate": 7.102673590696074e-05, + "loss": 3.039, + "step": 47527 + }, + { + "epoch": 2.33, + "grad_norm": 0.7563286423683167, + "learning_rate": 7.101678857426669e-05, + "loss": 2.974, + "step": 47528 + }, + { + "epoch": 2.33, + "grad_norm": 0.744836151599884, + "learning_rate": 7.100684184467025e-05, + "loss": 2.9618, + "step": 47529 + }, + { + "epoch": 2.33, + "grad_norm": 0.7190981507301331, + "learning_rate": 7.099689571819778e-05, + "loss": 3.0592, + "step": 47530 + }, + { + "epoch": 2.33, + "grad_norm": 0.7641801834106445, + "learning_rate": 7.098695019487535e-05, + "loss": 2.817, + "step": 47531 + }, + { + "epoch": 2.33, + "grad_norm": 0.7573862075805664, + "learning_rate": 7.097700527472934e-05, + "loss": 2.9914, + "step": 47532 + }, + { + "epoch": 2.33, + "grad_norm": 0.7357966899871826, + "learning_rate": 7.096706095778567e-05, + "loss": 2.8585, + "step": 47533 + }, + { + "epoch": 2.33, + "grad_norm": 0.8040800094604492, + "learning_rate": 7.095711724407081e-05, + "loss": 3.1379, + "step": 47534 + }, + { + "epoch": 2.33, + "grad_norm": 0.7585517764091492, + "learning_rate": 7.094717413361075e-05, + "loss": 2.8646, + "step": 47535 + }, + { + "epoch": 2.33, + "grad_norm": 0.7271699905395508, + "learning_rate": 7.09372316264318e-05, + "loss": 3.0099, + "step": 47536 + }, + { + "epoch": 2.33, + "grad_norm": 0.7077714800834656, + "learning_rate": 7.092728972256013e-05, + "loss": 2.8668, + "step": 47537 + }, + { + "epoch": 2.33, + "grad_norm": 0.7180951237678528, + "learning_rate": 7.09173484220218e-05, + "loss": 2.758, + "step": 47538 + }, + { + "epoch": 2.33, + "grad_norm": 0.7497140765190125, + "learning_rate": 7.090740772484315e-05, + "loss": 2.8124, + "step": 47539 + }, + { + "epoch": 2.33, + "grad_norm": 0.7486867904663086, + "learning_rate": 7.089746763105021e-05, + "loss": 2.8029, + "step": 47540 + }, + { + "epoch": 2.33, + "grad_norm": 0.6781291961669922, + "learning_rate": 7.088752814066926e-05, + "loss": 3.086, + "step": 47541 + }, + { + "epoch": 2.33, + "grad_norm": 0.7236211895942688, + "learning_rate": 7.087758925372658e-05, + "loss": 2.9152, + "step": 47542 + }, + { + "epoch": 2.33, + "grad_norm": 0.7155327200889587, + "learning_rate": 7.08676509702482e-05, + "loss": 2.8893, + "step": 47543 + }, + { + "epoch": 2.33, + "grad_norm": 0.7060009837150574, + "learning_rate": 7.085771329026036e-05, + "loss": 2.9783, + "step": 47544 + }, + { + "epoch": 2.33, + "grad_norm": 0.7284157872200012, + "learning_rate": 7.084777621378908e-05, + "loss": 2.9028, + "step": 47545 + }, + { + "epoch": 2.33, + "grad_norm": 0.7081487774848938, + "learning_rate": 7.083783974086068e-05, + "loss": 2.9875, + "step": 47546 + }, + { + "epoch": 2.33, + "grad_norm": 0.7967720627784729, + "learning_rate": 7.082790387150137e-05, + "loss": 2.9191, + "step": 47547 + }, + { + "epoch": 2.33, + "grad_norm": 0.6838148832321167, + "learning_rate": 7.081796860573721e-05, + "loss": 2.9943, + "step": 47548 + }, + { + "epoch": 2.33, + "grad_norm": 0.7207189798355103, + "learning_rate": 7.080803394359448e-05, + "loss": 2.8106, + "step": 47549 + }, + { + "epoch": 2.33, + "grad_norm": 0.7479190826416016, + "learning_rate": 7.079809988509926e-05, + "loss": 2.8428, + "step": 47550 + }, + { + "epoch": 2.33, + "grad_norm": 0.6582784652709961, + "learning_rate": 7.078816643027767e-05, + "loss": 2.7417, + "step": 47551 + }, + { + "epoch": 2.33, + "grad_norm": 0.7573397755622864, + "learning_rate": 7.077823357915602e-05, + "loss": 3.0219, + "step": 47552 + }, + { + "epoch": 2.33, + "grad_norm": 0.7158548831939697, + "learning_rate": 7.076830133176032e-05, + "loss": 2.9151, + "step": 47553 + }, + { + "epoch": 2.33, + "grad_norm": 0.7018261551856995, + "learning_rate": 7.075836968811687e-05, + "loss": 2.9753, + "step": 47554 + }, + { + "epoch": 2.33, + "grad_norm": 0.7017635703086853, + "learning_rate": 7.074843864825164e-05, + "loss": 3.038, + "step": 47555 + }, + { + "epoch": 2.33, + "grad_norm": 0.7433605194091797, + "learning_rate": 7.073850821219094e-05, + "loss": 2.915, + "step": 47556 + }, + { + "epoch": 2.33, + "grad_norm": 0.7094408273696899, + "learning_rate": 7.072857837996098e-05, + "loss": 2.9466, + "step": 47557 + }, + { + "epoch": 2.33, + "grad_norm": 0.7520463466644287, + "learning_rate": 7.071864915158777e-05, + "loss": 2.8484, + "step": 47558 + }, + { + "epoch": 2.33, + "grad_norm": 0.7429606318473816, + "learning_rate": 7.070872052709754e-05, + "loss": 2.9665, + "step": 47559 + }, + { + "epoch": 2.33, + "grad_norm": 0.7459293603897095, + "learning_rate": 7.069879250651634e-05, + "loss": 3.1393, + "step": 47560 + }, + { + "epoch": 2.33, + "grad_norm": 0.7327176928520203, + "learning_rate": 7.06888650898704e-05, + "loss": 2.6332, + "step": 47561 + }, + { + "epoch": 2.33, + "grad_norm": 0.7204429507255554, + "learning_rate": 7.06789382771859e-05, + "loss": 2.6896, + "step": 47562 + }, + { + "epoch": 2.33, + "grad_norm": 0.751361608505249, + "learning_rate": 7.066901206848888e-05, + "loss": 3.0084, + "step": 47563 + }, + { + "epoch": 2.33, + "grad_norm": 0.7837120294570923, + "learning_rate": 7.065908646380565e-05, + "loss": 2.9443, + "step": 47564 + }, + { + "epoch": 2.33, + "grad_norm": 0.7149581909179688, + "learning_rate": 7.064916146316221e-05, + "loss": 3.048, + "step": 47565 + }, + { + "epoch": 2.33, + "grad_norm": 0.7094987630844116, + "learning_rate": 7.063923706658469e-05, + "loss": 2.8596, + "step": 47566 + }, + { + "epoch": 2.33, + "grad_norm": 0.7471241354942322, + "learning_rate": 7.062931327409933e-05, + "loss": 2.9456, + "step": 47567 + }, + { + "epoch": 2.33, + "grad_norm": 0.8007349967956543, + "learning_rate": 7.061939008573216e-05, + "loss": 3.0527, + "step": 47568 + }, + { + "epoch": 2.33, + "grad_norm": 0.7094486355781555, + "learning_rate": 7.060946750150944e-05, + "loss": 2.8813, + "step": 47569 + }, + { + "epoch": 2.33, + "grad_norm": 0.6963364481925964, + "learning_rate": 7.059954552145718e-05, + "loss": 2.9723, + "step": 47570 + }, + { + "epoch": 2.33, + "grad_norm": 0.737665057182312, + "learning_rate": 7.058962414560161e-05, + "loss": 2.8979, + "step": 47571 + }, + { + "epoch": 2.33, + "grad_norm": 0.7348478436470032, + "learning_rate": 7.057970337396884e-05, + "loss": 3.1241, + "step": 47572 + }, + { + "epoch": 2.33, + "grad_norm": 0.7251835465431213, + "learning_rate": 7.05697832065849e-05, + "loss": 2.9297, + "step": 47573 + }, + { + "epoch": 2.33, + "grad_norm": 0.7356771230697632, + "learning_rate": 7.055986364347608e-05, + "loss": 2.9944, + "step": 47574 + }, + { + "epoch": 2.33, + "grad_norm": 0.7503240704536438, + "learning_rate": 7.05499446846683e-05, + "loss": 2.7325, + "step": 47575 + }, + { + "epoch": 2.33, + "grad_norm": 0.7172363996505737, + "learning_rate": 7.054002633018785e-05, + "loss": 2.8792, + "step": 47576 + }, + { + "epoch": 2.33, + "grad_norm": 0.7425442934036255, + "learning_rate": 7.05301085800609e-05, + "loss": 2.9969, + "step": 47577 + }, + { + "epoch": 2.33, + "grad_norm": 0.7278176546096802, + "learning_rate": 7.052019143431336e-05, + "loss": 2.8364, + "step": 47578 + }, + { + "epoch": 2.33, + "grad_norm": 0.711143970489502, + "learning_rate": 7.051027489297157e-05, + "loss": 2.8685, + "step": 47579 + }, + { + "epoch": 2.33, + "grad_norm": 0.7150108218193054, + "learning_rate": 7.050035895606156e-05, + "loss": 2.9239, + "step": 47580 + }, + { + "epoch": 2.33, + "grad_norm": 0.7470396161079407, + "learning_rate": 7.049044362360938e-05, + "loss": 2.9947, + "step": 47581 + }, + { + "epoch": 2.33, + "grad_norm": 0.6868398189544678, + "learning_rate": 7.048052889564125e-05, + "loss": 3.0457, + "step": 47582 + }, + { + "epoch": 2.33, + "grad_norm": 0.7090058326721191, + "learning_rate": 7.047061477218312e-05, + "loss": 2.9331, + "step": 47583 + }, + { + "epoch": 2.33, + "grad_norm": 0.7475082278251648, + "learning_rate": 7.046070125326135e-05, + "loss": 2.9428, + "step": 47584 + }, + { + "epoch": 2.33, + "grad_norm": 0.7163534164428711, + "learning_rate": 7.045078833890183e-05, + "loss": 2.9627, + "step": 47585 + }, + { + "epoch": 2.33, + "grad_norm": 0.7448176741600037, + "learning_rate": 7.044087602913084e-05, + "loss": 2.9439, + "step": 47586 + }, + { + "epoch": 2.33, + "grad_norm": 0.7112734317779541, + "learning_rate": 7.043096432397437e-05, + "loss": 2.8171, + "step": 47587 + }, + { + "epoch": 2.33, + "grad_norm": 0.7507283687591553, + "learning_rate": 7.042105322345847e-05, + "loss": 2.9742, + "step": 47588 + }, + { + "epoch": 2.33, + "grad_norm": 0.749320924282074, + "learning_rate": 7.041114272760948e-05, + "loss": 2.8345, + "step": 47589 + }, + { + "epoch": 2.33, + "grad_norm": 0.7169381976127625, + "learning_rate": 7.040123283645322e-05, + "loss": 3.0269, + "step": 47590 + }, + { + "epoch": 2.33, + "grad_norm": 0.7252839207649231, + "learning_rate": 7.039132355001592e-05, + "loss": 2.8734, + "step": 47591 + }, + { + "epoch": 2.33, + "grad_norm": 0.7459837794303894, + "learning_rate": 7.03814148683238e-05, + "loss": 2.8828, + "step": 47592 + }, + { + "epoch": 2.33, + "grad_norm": 0.7205169200897217, + "learning_rate": 7.037150679140276e-05, + "loss": 2.8298, + "step": 47593 + }, + { + "epoch": 2.33, + "grad_norm": 0.7566838264465332, + "learning_rate": 7.036159931927911e-05, + "loss": 2.6596, + "step": 47594 + }, + { + "epoch": 2.33, + "grad_norm": 0.7399532198905945, + "learning_rate": 7.035169245197867e-05, + "loss": 2.791, + "step": 47595 + }, + { + "epoch": 2.33, + "grad_norm": 0.744898796081543, + "learning_rate": 7.034178618952769e-05, + "loss": 2.9397, + "step": 47596 + }, + { + "epoch": 2.33, + "grad_norm": 0.741045355796814, + "learning_rate": 7.03318805319523e-05, + "loss": 2.9292, + "step": 47597 + }, + { + "epoch": 2.33, + "grad_norm": 0.7274682521820068, + "learning_rate": 7.032197547927847e-05, + "loss": 2.7797, + "step": 47598 + }, + { + "epoch": 2.33, + "grad_norm": 0.7332300543785095, + "learning_rate": 7.031207103153241e-05, + "loss": 2.758, + "step": 47599 + }, + { + "epoch": 2.33, + "grad_norm": 0.7493497729301453, + "learning_rate": 7.030216718874011e-05, + "loss": 3.1175, + "step": 47600 + }, + { + "epoch": 2.33, + "grad_norm": 0.7280975580215454, + "learning_rate": 7.029226395092774e-05, + "loss": 2.8282, + "step": 47601 + }, + { + "epoch": 2.33, + "grad_norm": 0.7699424028396606, + "learning_rate": 7.028236131812138e-05, + "loss": 2.8258, + "step": 47602 + }, + { + "epoch": 2.33, + "grad_norm": 0.7214636206626892, + "learning_rate": 7.027245929034695e-05, + "loss": 2.7958, + "step": 47603 + }, + { + "epoch": 2.33, + "grad_norm": 0.7424951195716858, + "learning_rate": 7.026255786763075e-05, + "loss": 2.8481, + "step": 47604 + }, + { + "epoch": 2.33, + "grad_norm": 0.7581061124801636, + "learning_rate": 7.025265704999867e-05, + "loss": 2.9243, + "step": 47605 + }, + { + "epoch": 2.33, + "grad_norm": 0.6965128779411316, + "learning_rate": 7.024275683747688e-05, + "loss": 2.958, + "step": 47606 + }, + { + "epoch": 2.33, + "grad_norm": 0.7387304902076721, + "learning_rate": 7.023285723009155e-05, + "loss": 3.0201, + "step": 47607 + }, + { + "epoch": 2.33, + "grad_norm": 0.7237225770950317, + "learning_rate": 7.022295822786861e-05, + "loss": 2.7551, + "step": 47608 + }, + { + "epoch": 2.33, + "grad_norm": 0.6971107721328735, + "learning_rate": 7.021305983083423e-05, + "loss": 2.8338, + "step": 47609 + }, + { + "epoch": 2.33, + "grad_norm": 0.7018718719482422, + "learning_rate": 7.020316203901432e-05, + "loss": 2.6294, + "step": 47610 + }, + { + "epoch": 2.33, + "grad_norm": 0.678143322467804, + "learning_rate": 7.019326485243513e-05, + "loss": 3.0442, + "step": 47611 + }, + { + "epoch": 2.33, + "grad_norm": 0.718937873840332, + "learning_rate": 7.018336827112257e-05, + "loss": 2.7917, + "step": 47612 + }, + { + "epoch": 2.33, + "grad_norm": 0.701008677482605, + "learning_rate": 7.01734722951028e-05, + "loss": 2.7951, + "step": 47613 + }, + { + "epoch": 2.33, + "grad_norm": 0.7457568645477295, + "learning_rate": 7.016357692440193e-05, + "loss": 2.8648, + "step": 47614 + }, + { + "epoch": 2.33, + "grad_norm": 0.6575247049331665, + "learning_rate": 7.015368215904593e-05, + "loss": 2.987, + "step": 47615 + }, + { + "epoch": 2.33, + "grad_norm": 0.7346352338790894, + "learning_rate": 7.014378799906095e-05, + "loss": 2.8045, + "step": 47616 + }, + { + "epoch": 2.33, + "grad_norm": 0.7044388055801392, + "learning_rate": 7.013389444447301e-05, + "loss": 2.7003, + "step": 47617 + }, + { + "epoch": 2.33, + "grad_norm": 0.7355045676231384, + "learning_rate": 7.012400149530804e-05, + "loss": 2.9629, + "step": 47618 + }, + { + "epoch": 2.33, + "grad_norm": 0.7335948348045349, + "learning_rate": 7.011410915159232e-05, + "loss": 2.8067, + "step": 47619 + }, + { + "epoch": 2.33, + "grad_norm": 0.7035436630249023, + "learning_rate": 7.010421741335168e-05, + "loss": 2.7938, + "step": 47620 + }, + { + "epoch": 2.33, + "grad_norm": 0.6967812180519104, + "learning_rate": 7.009432628061243e-05, + "loss": 2.8902, + "step": 47621 + }, + { + "epoch": 2.33, + "grad_norm": 0.7366414070129395, + "learning_rate": 7.008443575340032e-05, + "loss": 2.6877, + "step": 47622 + }, + { + "epoch": 2.33, + "grad_norm": 0.7091039419174194, + "learning_rate": 7.00745458317417e-05, + "loss": 2.7761, + "step": 47623 + }, + { + "epoch": 2.33, + "grad_norm": 0.7306919693946838, + "learning_rate": 7.006465651566247e-05, + "loss": 2.782, + "step": 47624 + }, + { + "epoch": 2.33, + "grad_norm": 0.6974079012870789, + "learning_rate": 7.005476780518856e-05, + "loss": 2.9357, + "step": 47625 + }, + { + "epoch": 2.33, + "grad_norm": 0.7919893860816956, + "learning_rate": 7.004487970034627e-05, + "loss": 2.9349, + "step": 47626 + }, + { + "epoch": 2.33, + "grad_norm": 0.7445873022079468, + "learning_rate": 7.00349922011614e-05, + "loss": 3.0683, + "step": 47627 + }, + { + "epoch": 2.33, + "grad_norm": 0.7282187342643738, + "learning_rate": 7.00251053076601e-05, + "loss": 2.9457, + "step": 47628 + }, + { + "epoch": 2.33, + "grad_norm": 0.6826276779174805, + "learning_rate": 7.001521901986848e-05, + "loss": 2.7284, + "step": 47629 + }, + { + "epoch": 2.33, + "grad_norm": 0.7542269825935364, + "learning_rate": 7.000533333781256e-05, + "loss": 2.8766, + "step": 47630 + }, + { + "epoch": 2.33, + "grad_norm": 0.763077974319458, + "learning_rate": 6.999544826151829e-05, + "loss": 2.846, + "step": 47631 + }, + { + "epoch": 2.33, + "grad_norm": 0.7759633660316467, + "learning_rate": 6.99855637910117e-05, + "loss": 3.0532, + "step": 47632 + }, + { + "epoch": 2.33, + "grad_norm": 0.7288686037063599, + "learning_rate": 6.997567992631883e-05, + "loss": 3.0587, + "step": 47633 + }, + { + "epoch": 2.33, + "grad_norm": 0.7107670307159424, + "learning_rate": 6.996579666746584e-05, + "loss": 2.9452, + "step": 47634 + }, + { + "epoch": 2.33, + "grad_norm": 0.7164334654808044, + "learning_rate": 6.995591401447858e-05, + "loss": 3.1213, + "step": 47635 + }, + { + "epoch": 2.33, + "grad_norm": 0.7293843030929565, + "learning_rate": 6.994603196738326e-05, + "loss": 2.9488, + "step": 47636 + }, + { + "epoch": 2.33, + "grad_norm": 0.7001025676727295, + "learning_rate": 6.993615052620576e-05, + "loss": 2.8525, + "step": 47637 + }, + { + "epoch": 2.33, + "grad_norm": 0.736518919467926, + "learning_rate": 6.992626969097221e-05, + "loss": 3.0508, + "step": 47638 + }, + { + "epoch": 2.33, + "grad_norm": 0.6913434267044067, + "learning_rate": 6.991638946170861e-05, + "loss": 2.8179, + "step": 47639 + }, + { + "epoch": 2.33, + "grad_norm": 0.6732978820800781, + "learning_rate": 6.990650983844087e-05, + "loss": 3.0415, + "step": 47640 + }, + { + "epoch": 2.33, + "grad_norm": 0.7510814070701599, + "learning_rate": 6.989663082119518e-05, + "loss": 2.9317, + "step": 47641 + }, + { + "epoch": 2.33, + "grad_norm": 0.6988676190376282, + "learning_rate": 6.98867524099974e-05, + "loss": 2.9012, + "step": 47642 + }, + { + "epoch": 2.33, + "grad_norm": 0.7889525890350342, + "learning_rate": 6.987687460487362e-05, + "loss": 2.7416, + "step": 47643 + }, + { + "epoch": 2.33, + "grad_norm": 0.6934266686439514, + "learning_rate": 6.986699740584998e-05, + "loss": 3.0119, + "step": 47644 + }, + { + "epoch": 2.33, + "grad_norm": 0.7091248631477356, + "learning_rate": 6.985712081295235e-05, + "loss": 2.8714, + "step": 47645 + }, + { + "epoch": 2.34, + "grad_norm": 0.7630372047424316, + "learning_rate": 6.984724482620678e-05, + "loss": 3.0798, + "step": 47646 + }, + { + "epoch": 2.34, + "grad_norm": 0.7124559879302979, + "learning_rate": 6.983736944563919e-05, + "loss": 3.0748, + "step": 47647 + }, + { + "epoch": 2.34, + "grad_norm": 0.7879320979118347, + "learning_rate": 6.982749467127566e-05, + "loss": 2.7061, + "step": 47648 + }, + { + "epoch": 2.34, + "grad_norm": 0.7572959065437317, + "learning_rate": 6.981762050314232e-05, + "loss": 3.1158, + "step": 47649 + }, + { + "epoch": 2.34, + "grad_norm": 0.7393097877502441, + "learning_rate": 6.980774694126495e-05, + "loss": 2.8247, + "step": 47650 + }, + { + "epoch": 2.34, + "grad_norm": 0.7403393387794495, + "learning_rate": 6.979787398566979e-05, + "loss": 2.862, + "step": 47651 + }, + { + "epoch": 2.34, + "grad_norm": 0.7622966170310974, + "learning_rate": 6.978800163638268e-05, + "loss": 2.9045, + "step": 47652 + }, + { + "epoch": 2.34, + "grad_norm": 0.7208401560783386, + "learning_rate": 6.977812989342964e-05, + "loss": 2.8656, + "step": 47653 + }, + { + "epoch": 2.34, + "grad_norm": 0.7213475108146667, + "learning_rate": 6.976825875683674e-05, + "loss": 2.9971, + "step": 47654 + }, + { + "epoch": 2.34, + "grad_norm": 0.7142321467399597, + "learning_rate": 6.975838822662987e-05, + "loss": 3.0197, + "step": 47655 + }, + { + "epoch": 2.34, + "grad_norm": 0.8058083057403564, + "learning_rate": 6.97485183028352e-05, + "loss": 2.7902, + "step": 47656 + }, + { + "epoch": 2.34, + "grad_norm": 0.7557923197746277, + "learning_rate": 6.973864898547852e-05, + "loss": 2.9234, + "step": 47657 + }, + { + "epoch": 2.34, + "grad_norm": 0.6770955324172974, + "learning_rate": 6.97287802745859e-05, + "loss": 2.6823, + "step": 47658 + }, + { + "epoch": 2.34, + "grad_norm": 0.7257001399993896, + "learning_rate": 6.971891217018349e-05, + "loss": 2.9746, + "step": 47659 + }, + { + "epoch": 2.34, + "grad_norm": 0.665259063243866, + "learning_rate": 6.97090446722971e-05, + "loss": 2.6029, + "step": 47660 + }, + { + "epoch": 2.34, + "grad_norm": 0.7421854734420776, + "learning_rate": 6.96991777809528e-05, + "loss": 2.9455, + "step": 47661 + }, + { + "epoch": 2.34, + "grad_norm": 0.7707141041755676, + "learning_rate": 6.968931149617643e-05, + "loss": 3.1785, + "step": 47662 + }, + { + "epoch": 2.34, + "grad_norm": 0.7522193789482117, + "learning_rate": 6.967944581799409e-05, + "loss": 2.8248, + "step": 47663 + }, + { + "epoch": 2.34, + "grad_norm": 0.747096836566925, + "learning_rate": 6.966958074643189e-05, + "loss": 3.0049, + "step": 47664 + }, + { + "epoch": 2.34, + "grad_norm": 0.7807623744010925, + "learning_rate": 6.965971628151558e-05, + "loss": 2.9797, + "step": 47665 + }, + { + "epoch": 2.34, + "grad_norm": 0.6890551447868347, + "learning_rate": 6.964985242327132e-05, + "loss": 2.7737, + "step": 47666 + }, + { + "epoch": 2.34, + "grad_norm": 0.744841456413269, + "learning_rate": 6.963998917172501e-05, + "loss": 2.9702, + "step": 47667 + }, + { + "epoch": 2.34, + "grad_norm": 0.7255067825317383, + "learning_rate": 6.963012652690257e-05, + "loss": 2.9437, + "step": 47668 + }, + { + "epoch": 2.34, + "grad_norm": 0.7375694513320923, + "learning_rate": 6.962026448883013e-05, + "loss": 2.8523, + "step": 47669 + }, + { + "epoch": 2.34, + "grad_norm": 0.7139278054237366, + "learning_rate": 6.961040305753347e-05, + "loss": 2.7384, + "step": 47670 + }, + { + "epoch": 2.34, + "grad_norm": 0.74039626121521, + "learning_rate": 6.960054223303875e-05, + "loss": 2.9185, + "step": 47671 + }, + { + "epoch": 2.34, + "grad_norm": 0.7546448707580566, + "learning_rate": 6.95906820153718e-05, + "loss": 2.917, + "step": 47672 + }, + { + "epoch": 2.34, + "grad_norm": 0.726020872592926, + "learning_rate": 6.958082240455871e-05, + "loss": 2.8537, + "step": 47673 + }, + { + "epoch": 2.34, + "grad_norm": 0.7269747257232666, + "learning_rate": 6.95709634006254e-05, + "loss": 2.9355, + "step": 47674 + }, + { + "epoch": 2.34, + "grad_norm": 0.7227620482444763, + "learning_rate": 6.956110500359776e-05, + "loss": 2.8675, + "step": 47675 + }, + { + "epoch": 2.34, + "grad_norm": 0.6997947096824646, + "learning_rate": 6.955124721350187e-05, + "loss": 2.8877, + "step": 47676 + }, + { + "epoch": 2.34, + "grad_norm": 0.7475396990776062, + "learning_rate": 6.954139003036357e-05, + "loss": 2.9018, + "step": 47677 + }, + { + "epoch": 2.34, + "grad_norm": 0.6795568466186523, + "learning_rate": 6.953153345420892e-05, + "loss": 2.901, + "step": 47678 + }, + { + "epoch": 2.34, + "grad_norm": 0.7249543070793152, + "learning_rate": 6.952167748506393e-05, + "loss": 2.9914, + "step": 47679 + }, + { + "epoch": 2.34, + "grad_norm": 0.715927243232727, + "learning_rate": 6.951182212295438e-05, + "loss": 2.8587, + "step": 47680 + }, + { + "epoch": 2.34, + "grad_norm": 0.7298700213432312, + "learning_rate": 6.950196736790645e-05, + "loss": 2.942, + "step": 47681 + }, + { + "epoch": 2.34, + "grad_norm": 0.7658237814903259, + "learning_rate": 6.949211321994597e-05, + "loss": 3.0521, + "step": 47682 + }, + { + "epoch": 2.34, + "grad_norm": 0.7058979272842407, + "learning_rate": 6.948225967909878e-05, + "loss": 2.8072, + "step": 47683 + }, + { + "epoch": 2.34, + "grad_norm": 0.6835278868675232, + "learning_rate": 6.947240674539109e-05, + "loss": 3.063, + "step": 47684 + }, + { + "epoch": 2.34, + "grad_norm": 0.7149368524551392, + "learning_rate": 6.946255441884858e-05, + "loss": 3.0687, + "step": 47685 + }, + { + "epoch": 2.34, + "grad_norm": 0.7184211611747742, + "learning_rate": 6.945270269949744e-05, + "loss": 2.9289, + "step": 47686 + }, + { + "epoch": 2.34, + "grad_norm": 0.8042710423469543, + "learning_rate": 6.944285158736344e-05, + "loss": 2.8712, + "step": 47687 + }, + { + "epoch": 2.34, + "grad_norm": 0.7624470591545105, + "learning_rate": 6.943300108247268e-05, + "loss": 2.86, + "step": 47688 + }, + { + "epoch": 2.34, + "grad_norm": 0.6996687650680542, + "learning_rate": 6.9423151184851e-05, + "loss": 2.894, + "step": 47689 + }, + { + "epoch": 2.34, + "grad_norm": 0.7101437449455261, + "learning_rate": 6.941330189452431e-05, + "loss": 2.7634, + "step": 47690 + }, + { + "epoch": 2.34, + "grad_norm": 0.7409912943840027, + "learning_rate": 6.940345321151867e-05, + "loss": 2.7299, + "step": 47691 + }, + { + "epoch": 2.34, + "grad_norm": 0.7612147331237793, + "learning_rate": 6.939360513585988e-05, + "loss": 2.9949, + "step": 47692 + }, + { + "epoch": 2.34, + "grad_norm": 0.7287157773971558, + "learning_rate": 6.938375766757401e-05, + "loss": 2.9987, + "step": 47693 + }, + { + "epoch": 2.34, + "grad_norm": 0.7364184260368347, + "learning_rate": 6.937391080668691e-05, + "loss": 2.9102, + "step": 47694 + }, + { + "epoch": 2.34, + "grad_norm": 0.7477836608886719, + "learning_rate": 6.936406455322458e-05, + "loss": 2.9471, + "step": 47695 + }, + { + "epoch": 2.34, + "grad_norm": 0.746062159538269, + "learning_rate": 6.935421890721293e-05, + "loss": 2.8956, + "step": 47696 + }, + { + "epoch": 2.34, + "grad_norm": 0.7618799209594727, + "learning_rate": 6.934437386867779e-05, + "loss": 3.0421, + "step": 47697 + }, + { + "epoch": 2.34, + "grad_norm": 0.6989330053329468, + "learning_rate": 6.933452943764527e-05, + "loss": 3.0442, + "step": 47698 + }, + { + "epoch": 2.34, + "grad_norm": 0.7096387147903442, + "learning_rate": 6.932468561414115e-05, + "loss": 2.7441, + "step": 47699 + }, + { + "epoch": 2.34, + "grad_norm": 0.7569087147712708, + "learning_rate": 6.931484239819134e-05, + "loss": 2.8035, + "step": 47700 + }, + { + "epoch": 2.34, + "grad_norm": 0.7354052662849426, + "learning_rate": 6.930499978982198e-05, + "loss": 3.0321, + "step": 47701 + }, + { + "epoch": 2.34, + "grad_norm": 0.723621129989624, + "learning_rate": 6.929515778905878e-05, + "loss": 2.9771, + "step": 47702 + }, + { + "epoch": 2.34, + "grad_norm": 0.7257896661758423, + "learning_rate": 6.928531639592778e-05, + "loss": 2.5832, + "step": 47703 + }, + { + "epoch": 2.34, + "grad_norm": 0.7249346375465393, + "learning_rate": 6.927547561045487e-05, + "loss": 2.8028, + "step": 47704 + }, + { + "epoch": 2.34, + "grad_norm": 0.7552078366279602, + "learning_rate": 6.926563543266587e-05, + "loss": 2.9345, + "step": 47705 + }, + { + "epoch": 2.34, + "grad_norm": 0.7495518922805786, + "learning_rate": 6.925579586258689e-05, + "loss": 2.7828, + "step": 47706 + }, + { + "epoch": 2.34, + "grad_norm": 0.6926343441009521, + "learning_rate": 6.92459569002436e-05, + "loss": 3.0711, + "step": 47707 + }, + { + "epoch": 2.34, + "grad_norm": 0.7301257252693176, + "learning_rate": 6.923611854566219e-05, + "loss": 2.9068, + "step": 47708 + }, + { + "epoch": 2.34, + "grad_norm": 0.7383841276168823, + "learning_rate": 6.922628079886829e-05, + "loss": 2.9509, + "step": 47709 + }, + { + "epoch": 2.34, + "grad_norm": 0.7082839608192444, + "learning_rate": 6.921644365988807e-05, + "loss": 2.8345, + "step": 47710 + }, + { + "epoch": 2.34, + "grad_norm": 0.8540667295455933, + "learning_rate": 6.920660712874732e-05, + "loss": 2.9104, + "step": 47711 + }, + { + "epoch": 2.34, + "grad_norm": 0.6855905652046204, + "learning_rate": 6.919677120547189e-05, + "loss": 3.1228, + "step": 47712 + }, + { + "epoch": 2.34, + "grad_norm": 0.7041099667549133, + "learning_rate": 6.91869358900878e-05, + "loss": 2.6503, + "step": 47713 + }, + { + "epoch": 2.34, + "grad_norm": 0.7083460688591003, + "learning_rate": 6.91771011826208e-05, + "loss": 2.8676, + "step": 47714 + }, + { + "epoch": 2.34, + "grad_norm": 0.7578673362731934, + "learning_rate": 6.916726708309693e-05, + "loss": 3.0593, + "step": 47715 + }, + { + "epoch": 2.34, + "grad_norm": 0.7105020880699158, + "learning_rate": 6.915743359154212e-05, + "loss": 2.7786, + "step": 47716 + }, + { + "epoch": 2.34, + "grad_norm": 0.7636972665786743, + "learning_rate": 6.91476007079821e-05, + "loss": 2.8917, + "step": 47717 + }, + { + "epoch": 2.34, + "grad_norm": 0.7262598276138306, + "learning_rate": 6.9137768432443e-05, + "loss": 3.1578, + "step": 47718 + }, + { + "epoch": 2.34, + "grad_norm": 0.7343432307243347, + "learning_rate": 6.912793676495057e-05, + "loss": 3.0025, + "step": 47719 + }, + { + "epoch": 2.34, + "grad_norm": 0.7269200086593628, + "learning_rate": 6.911810570553063e-05, + "loss": 2.8975, + "step": 47720 + }, + { + "epoch": 2.34, + "grad_norm": 0.7389751672744751, + "learning_rate": 6.910827525420923e-05, + "loss": 3.0294, + "step": 47721 + }, + { + "epoch": 2.34, + "grad_norm": 0.7205168604850769, + "learning_rate": 6.909844541101213e-05, + "loss": 2.8921, + "step": 47722 + }, + { + "epoch": 2.34, + "grad_norm": 0.7263239622116089, + "learning_rate": 6.908861617596538e-05, + "loss": 2.9286, + "step": 47723 + }, + { + "epoch": 2.34, + "grad_norm": 0.7304479479789734, + "learning_rate": 6.907878754909467e-05, + "loss": 2.722, + "step": 47724 + }, + { + "epoch": 2.34, + "grad_norm": 0.7433699369430542, + "learning_rate": 6.906895953042611e-05, + "loss": 2.9203, + "step": 47725 + }, + { + "epoch": 2.34, + "grad_norm": 0.7258157730102539, + "learning_rate": 6.905913211998546e-05, + "loss": 2.8464, + "step": 47726 + }, + { + "epoch": 2.34, + "grad_norm": 0.7196770906448364, + "learning_rate": 6.904930531779852e-05, + "loss": 2.9407, + "step": 47727 + }, + { + "epoch": 2.34, + "grad_norm": 0.736857533454895, + "learning_rate": 6.903947912389133e-05, + "loss": 2.902, + "step": 47728 + }, + { + "epoch": 2.34, + "grad_norm": 0.8262715935707092, + "learning_rate": 6.902965353828963e-05, + "loss": 3.0228, + "step": 47729 + }, + { + "epoch": 2.34, + "grad_norm": 0.740009069442749, + "learning_rate": 6.901982856101938e-05, + "loss": 2.985, + "step": 47730 + }, + { + "epoch": 2.34, + "grad_norm": 0.7470229864120483, + "learning_rate": 6.901000419210652e-05, + "loss": 2.9201, + "step": 47731 + }, + { + "epoch": 2.34, + "grad_norm": 0.705930769443512, + "learning_rate": 6.900018043157686e-05, + "loss": 2.4926, + "step": 47732 + }, + { + "epoch": 2.34, + "grad_norm": 0.7490159869194031, + "learning_rate": 6.899035727945624e-05, + "loss": 2.8409, + "step": 47733 + }, + { + "epoch": 2.34, + "grad_norm": 0.7825806736946106, + "learning_rate": 6.898053473577052e-05, + "loss": 2.9334, + "step": 47734 + }, + { + "epoch": 2.34, + "grad_norm": 0.7251452207565308, + "learning_rate": 6.897071280054558e-05, + "loss": 2.864, + "step": 47735 + }, + { + "epoch": 2.34, + "grad_norm": 0.7280852794647217, + "learning_rate": 6.896089147380743e-05, + "loss": 2.8302, + "step": 47736 + }, + { + "epoch": 2.34, + "grad_norm": 0.7135215401649475, + "learning_rate": 6.895107075558173e-05, + "loss": 2.8002, + "step": 47737 + }, + { + "epoch": 2.34, + "grad_norm": 0.7429882884025574, + "learning_rate": 6.894125064589455e-05, + "loss": 2.7959, + "step": 47738 + }, + { + "epoch": 2.34, + "grad_norm": 0.7092627882957458, + "learning_rate": 6.893143114477152e-05, + "loss": 2.8633, + "step": 47739 + }, + { + "epoch": 2.34, + "grad_norm": 0.7626802921295166, + "learning_rate": 6.892161225223875e-05, + "loss": 2.9394, + "step": 47740 + }, + { + "epoch": 2.34, + "grad_norm": 0.7585441470146179, + "learning_rate": 6.891179396832196e-05, + "loss": 2.8403, + "step": 47741 + }, + { + "epoch": 2.34, + "grad_norm": 0.6958228945732117, + "learning_rate": 6.890197629304694e-05, + "loss": 2.9464, + "step": 47742 + }, + { + "epoch": 2.34, + "grad_norm": 0.7759091854095459, + "learning_rate": 6.889215922643975e-05, + "loss": 2.838, + "step": 47743 + }, + { + "epoch": 2.34, + "grad_norm": 0.7096752524375916, + "learning_rate": 6.888234276852604e-05, + "loss": 2.9345, + "step": 47744 + }, + { + "epoch": 2.34, + "grad_norm": 0.7344189882278442, + "learning_rate": 6.887252691933175e-05, + "loss": 3.2115, + "step": 47745 + }, + { + "epoch": 2.34, + "grad_norm": 0.7086279392242432, + "learning_rate": 6.886271167888284e-05, + "loss": 2.7383, + "step": 47746 + }, + { + "epoch": 2.34, + "grad_norm": 0.7072312235832214, + "learning_rate": 6.885289704720507e-05, + "loss": 2.9756, + "step": 47747 + }, + { + "epoch": 2.34, + "grad_norm": 0.7190287709236145, + "learning_rate": 6.884308302432428e-05, + "loss": 2.8416, + "step": 47748 + }, + { + "epoch": 2.34, + "grad_norm": 0.7287060618400574, + "learning_rate": 6.883326961026621e-05, + "loss": 2.9435, + "step": 47749 + }, + { + "epoch": 2.34, + "grad_norm": 0.7083621621131897, + "learning_rate": 6.882345680505684e-05, + "loss": 2.9013, + "step": 47750 + }, + { + "epoch": 2.34, + "grad_norm": 0.7850791215896606, + "learning_rate": 6.881364460872209e-05, + "loss": 2.8539, + "step": 47751 + }, + { + "epoch": 2.34, + "grad_norm": 0.7175219655036926, + "learning_rate": 6.880383302128761e-05, + "loss": 2.8696, + "step": 47752 + }, + { + "epoch": 2.34, + "grad_norm": 0.7923054099082947, + "learning_rate": 6.879402204277945e-05, + "loss": 2.7013, + "step": 47753 + }, + { + "epoch": 2.34, + "grad_norm": 0.7685131430625916, + "learning_rate": 6.878421167322329e-05, + "loss": 2.782, + "step": 47754 + }, + { + "epoch": 2.34, + "grad_norm": 0.6883244514465332, + "learning_rate": 6.877440191264497e-05, + "loss": 2.882, + "step": 47755 + }, + { + "epoch": 2.34, + "grad_norm": 0.7355750203132629, + "learning_rate": 6.876459276107043e-05, + "loss": 2.6442, + "step": 47756 + }, + { + "epoch": 2.34, + "grad_norm": 0.7222840189933777, + "learning_rate": 6.875478421852539e-05, + "loss": 2.9603, + "step": 47757 + }, + { + "epoch": 2.34, + "grad_norm": 0.7285621762275696, + "learning_rate": 6.874497628503582e-05, + "loss": 2.9548, + "step": 47758 + }, + { + "epoch": 2.34, + "grad_norm": 0.7584347724914551, + "learning_rate": 6.87351689606274e-05, + "loss": 2.9329, + "step": 47759 + }, + { + "epoch": 2.34, + "grad_norm": 0.8179945945739746, + "learning_rate": 6.872536224532602e-05, + "loss": 3.0256, + "step": 47760 + }, + { + "epoch": 2.34, + "grad_norm": 0.7475566267967224, + "learning_rate": 6.871555613915764e-05, + "loss": 3.0588, + "step": 47761 + }, + { + "epoch": 2.34, + "grad_norm": 0.6885706186294556, + "learning_rate": 6.870575064214795e-05, + "loss": 3.1584, + "step": 47762 + }, + { + "epoch": 2.34, + "grad_norm": 0.713989794254303, + "learning_rate": 6.869594575432282e-05, + "loss": 2.9286, + "step": 47763 + }, + { + "epoch": 2.34, + "grad_norm": 0.7147015929222107, + "learning_rate": 6.868614147570795e-05, + "loss": 2.9192, + "step": 47764 + }, + { + "epoch": 2.34, + "grad_norm": 0.718229353427887, + "learning_rate": 6.867633780632934e-05, + "loss": 2.8925, + "step": 47765 + }, + { + "epoch": 2.34, + "grad_norm": 0.7605919241905212, + "learning_rate": 6.866653474621268e-05, + "loss": 2.8551, + "step": 47766 + }, + { + "epoch": 2.34, + "grad_norm": 0.7661386728286743, + "learning_rate": 6.865673229538384e-05, + "loss": 2.9573, + "step": 47767 + }, + { + "epoch": 2.34, + "grad_norm": 0.7024096846580505, + "learning_rate": 6.864693045386872e-05, + "loss": 2.832, + "step": 47768 + }, + { + "epoch": 2.34, + "grad_norm": 0.7051783800125122, + "learning_rate": 6.863712922169305e-05, + "loss": 2.7672, + "step": 47769 + }, + { + "epoch": 2.34, + "grad_norm": 0.7214155793190002, + "learning_rate": 6.862732859888267e-05, + "loss": 2.6812, + "step": 47770 + }, + { + "epoch": 2.34, + "grad_norm": 0.7348127365112305, + "learning_rate": 6.861752858546328e-05, + "loss": 2.9103, + "step": 47771 + }, + { + "epoch": 2.34, + "grad_norm": 0.6963437795639038, + "learning_rate": 6.860772918146082e-05, + "loss": 2.7701, + "step": 47772 + }, + { + "epoch": 2.34, + "grad_norm": 0.733690619468689, + "learning_rate": 6.859793038690113e-05, + "loss": 2.8199, + "step": 47773 + }, + { + "epoch": 2.34, + "grad_norm": 0.6885663270950317, + "learning_rate": 6.858813220180988e-05, + "loss": 2.8728, + "step": 47774 + }, + { + "epoch": 2.34, + "grad_norm": 0.708145797252655, + "learning_rate": 6.857833462621303e-05, + "loss": 2.7329, + "step": 47775 + }, + { + "epoch": 2.34, + "grad_norm": 0.71966552734375, + "learning_rate": 6.85685376601363e-05, + "loss": 2.8923, + "step": 47776 + }, + { + "epoch": 2.34, + "grad_norm": 0.7468180656433105, + "learning_rate": 6.855874130360546e-05, + "loss": 2.9768, + "step": 47777 + }, + { + "epoch": 2.34, + "grad_norm": 0.7484171986579895, + "learning_rate": 6.85489455566464e-05, + "loss": 2.7614, + "step": 47778 + }, + { + "epoch": 2.34, + "grad_norm": 0.7327077388763428, + "learning_rate": 6.853915041928479e-05, + "loss": 3.0613, + "step": 47779 + }, + { + "epoch": 2.34, + "grad_norm": 0.7350683212280273, + "learning_rate": 6.85293558915466e-05, + "loss": 2.9889, + "step": 47780 + }, + { + "epoch": 2.34, + "grad_norm": 0.7374870181083679, + "learning_rate": 6.851956197345747e-05, + "loss": 2.9741, + "step": 47781 + }, + { + "epoch": 2.34, + "grad_norm": 0.7582040429115295, + "learning_rate": 6.850976866504325e-05, + "loss": 2.8146, + "step": 47782 + }, + { + "epoch": 2.34, + "grad_norm": 0.6711180806159973, + "learning_rate": 6.849997596632983e-05, + "loss": 3.099, + "step": 47783 + }, + { + "epoch": 2.34, + "grad_norm": 0.7463443279266357, + "learning_rate": 6.849018387734293e-05, + "loss": 2.9372, + "step": 47784 + }, + { + "epoch": 2.34, + "grad_norm": 0.74971604347229, + "learning_rate": 6.848039239810832e-05, + "loss": 2.8886, + "step": 47785 + }, + { + "epoch": 2.34, + "grad_norm": 0.7736210227012634, + "learning_rate": 6.847060152865173e-05, + "loss": 2.7884, + "step": 47786 + }, + { + "epoch": 2.34, + "grad_norm": 0.8005310297012329, + "learning_rate": 6.846081126899903e-05, + "loss": 2.9747, + "step": 47787 + }, + { + "epoch": 2.34, + "grad_norm": 0.7372254133224487, + "learning_rate": 6.845102161917604e-05, + "loss": 2.6831, + "step": 47788 + }, + { + "epoch": 2.34, + "grad_norm": 0.7566565275192261, + "learning_rate": 6.844123257920845e-05, + "loss": 3.0529, + "step": 47789 + }, + { + "epoch": 2.34, + "grad_norm": 0.726959764957428, + "learning_rate": 6.843144414912214e-05, + "loss": 2.8369, + "step": 47790 + }, + { + "epoch": 2.34, + "grad_norm": 0.7393621206283569, + "learning_rate": 6.842165632894286e-05, + "loss": 3.0986, + "step": 47791 + }, + { + "epoch": 2.34, + "grad_norm": 0.7415285110473633, + "learning_rate": 6.841186911869628e-05, + "loss": 2.9621, + "step": 47792 + }, + { + "epoch": 2.34, + "grad_norm": 0.7591820955276489, + "learning_rate": 6.840208251840834e-05, + "loss": 2.9591, + "step": 47793 + }, + { + "epoch": 2.34, + "grad_norm": 0.7299678325653076, + "learning_rate": 6.839229652810467e-05, + "loss": 2.8123, + "step": 47794 + }, + { + "epoch": 2.34, + "grad_norm": 0.727067232131958, + "learning_rate": 6.838251114781118e-05, + "loss": 3.0317, + "step": 47795 + }, + { + "epoch": 2.34, + "grad_norm": 0.721146285533905, + "learning_rate": 6.837272637755351e-05, + "loss": 2.9183, + "step": 47796 + }, + { + "epoch": 2.34, + "grad_norm": 0.7278393507003784, + "learning_rate": 6.836294221735761e-05, + "loss": 2.8804, + "step": 47797 + }, + { + "epoch": 2.34, + "grad_norm": 0.6947075724601746, + "learning_rate": 6.835315866724911e-05, + "loss": 2.8544, + "step": 47798 + }, + { + "epoch": 2.34, + "grad_norm": 0.7457790970802307, + "learning_rate": 6.834337572725372e-05, + "loss": 2.9784, + "step": 47799 + }, + { + "epoch": 2.34, + "grad_norm": 0.7459518313407898, + "learning_rate": 6.83335933973974e-05, + "loss": 2.8344, + "step": 47800 + }, + { + "epoch": 2.34, + "grad_norm": 0.7571932673454285, + "learning_rate": 6.832381167770574e-05, + "loss": 2.968, + "step": 47801 + }, + { + "epoch": 2.34, + "grad_norm": 0.6742973327636719, + "learning_rate": 6.831403056820455e-05, + "loss": 2.9231, + "step": 47802 + }, + { + "epoch": 2.34, + "grad_norm": 0.7140032052993774, + "learning_rate": 6.83042500689197e-05, + "loss": 2.7132, + "step": 47803 + }, + { + "epoch": 2.34, + "grad_norm": 0.7056100368499756, + "learning_rate": 6.82944701798768e-05, + "loss": 2.9182, + "step": 47804 + }, + { + "epoch": 2.34, + "grad_norm": 0.7132856845855713, + "learning_rate": 6.828469090110172e-05, + "loss": 2.9506, + "step": 47805 + }, + { + "epoch": 2.34, + "grad_norm": 0.7402212619781494, + "learning_rate": 6.827491223262018e-05, + "loss": 2.9935, + "step": 47806 + }, + { + "epoch": 2.34, + "grad_norm": 0.7678213119506836, + "learning_rate": 6.826513417445785e-05, + "loss": 2.8589, + "step": 47807 + }, + { + "epoch": 2.34, + "grad_norm": 0.6965996623039246, + "learning_rate": 6.825535672664063e-05, + "loss": 2.8132, + "step": 47808 + }, + { + "epoch": 2.34, + "grad_norm": 0.7514899969100952, + "learning_rate": 6.824557988919414e-05, + "loss": 2.8645, + "step": 47809 + }, + { + "epoch": 2.34, + "grad_norm": 0.7812216281890869, + "learning_rate": 6.823580366214428e-05, + "loss": 2.6188, + "step": 47810 + }, + { + "epoch": 2.34, + "grad_norm": 0.7255864143371582, + "learning_rate": 6.822602804551659e-05, + "loss": 2.8784, + "step": 47811 + }, + { + "epoch": 2.34, + "grad_norm": 0.7047956585884094, + "learning_rate": 6.821625303933703e-05, + "loss": 2.9166, + "step": 47812 + }, + { + "epoch": 2.34, + "grad_norm": 0.6879226565361023, + "learning_rate": 6.820647864363128e-05, + "loss": 2.9923, + "step": 47813 + }, + { + "epoch": 2.34, + "grad_norm": 0.7026329636573792, + "learning_rate": 6.819670485842495e-05, + "loss": 2.7337, + "step": 47814 + }, + { + "epoch": 2.34, + "grad_norm": 0.7578959465026855, + "learning_rate": 6.818693168374398e-05, + "loss": 2.8791, + "step": 47815 + }, + { + "epoch": 2.34, + "grad_norm": 0.6991091966629028, + "learning_rate": 6.817715911961391e-05, + "loss": 2.6325, + "step": 47816 + }, + { + "epoch": 2.34, + "grad_norm": 0.769070029258728, + "learning_rate": 6.81673871660606e-05, + "loss": 3.1683, + "step": 47817 + }, + { + "epoch": 2.34, + "grad_norm": 0.72699373960495, + "learning_rate": 6.815761582310986e-05, + "loss": 2.7933, + "step": 47818 + }, + { + "epoch": 2.34, + "grad_norm": 0.756587028503418, + "learning_rate": 6.814784509078728e-05, + "loss": 2.7654, + "step": 47819 + }, + { + "epoch": 2.34, + "grad_norm": 0.7062783241271973, + "learning_rate": 6.813807496911882e-05, + "loss": 2.9737, + "step": 47820 + }, + { + "epoch": 2.34, + "grad_norm": 0.708519458770752, + "learning_rate": 6.812830545812985e-05, + "loss": 2.8675, + "step": 47821 + }, + { + "epoch": 2.34, + "grad_norm": 0.7568124532699585, + "learning_rate": 6.811853655784633e-05, + "loss": 2.9477, + "step": 47822 + }, + { + "epoch": 2.34, + "grad_norm": 0.7141517400741577, + "learning_rate": 6.810876826829403e-05, + "loss": 2.8733, + "step": 47823 + }, + { + "epoch": 2.34, + "grad_norm": 0.732783854007721, + "learning_rate": 6.80990005894985e-05, + "loss": 2.923, + "step": 47824 + }, + { + "epoch": 2.34, + "grad_norm": 0.7289860248565674, + "learning_rate": 6.808923352148567e-05, + "loss": 2.9246, + "step": 47825 + }, + { + "epoch": 2.34, + "grad_norm": 0.7409937977790833, + "learning_rate": 6.80794670642811e-05, + "loss": 3.007, + "step": 47826 + }, + { + "epoch": 2.34, + "grad_norm": 0.7329813838005066, + "learning_rate": 6.806970121791062e-05, + "loss": 2.8461, + "step": 47827 + }, + { + "epoch": 2.34, + "grad_norm": 0.6916979551315308, + "learning_rate": 6.805993598239998e-05, + "loss": 2.9873, + "step": 47828 + }, + { + "epoch": 2.34, + "grad_norm": 0.7569551467895508, + "learning_rate": 6.805017135777469e-05, + "loss": 2.7445, + "step": 47829 + }, + { + "epoch": 2.34, + "grad_norm": 0.7574961185455322, + "learning_rate": 6.804040734406071e-05, + "loss": 2.7435, + "step": 47830 + }, + { + "epoch": 2.34, + "grad_norm": 0.7416138648986816, + "learning_rate": 6.803064394128355e-05, + "loss": 2.873, + "step": 47831 + }, + { + "epoch": 2.34, + "grad_norm": 0.7477595210075378, + "learning_rate": 6.802088114946908e-05, + "loss": 2.9572, + "step": 47832 + }, + { + "epoch": 2.34, + "grad_norm": 0.6934207677841187, + "learning_rate": 6.801111896864301e-05, + "loss": 2.7564, + "step": 47833 + }, + { + "epoch": 2.34, + "grad_norm": 0.7345800995826721, + "learning_rate": 6.800135739883103e-05, + "loss": 2.9987, + "step": 47834 + }, + { + "epoch": 2.34, + "grad_norm": 0.7357404232025146, + "learning_rate": 6.799159644005882e-05, + "loss": 2.8571, + "step": 47835 + }, + { + "epoch": 2.34, + "grad_norm": 0.7066752314567566, + "learning_rate": 6.7981836092352e-05, + "loss": 2.9571, + "step": 47836 + }, + { + "epoch": 2.34, + "grad_norm": 0.766917884349823, + "learning_rate": 6.797207635573636e-05, + "loss": 2.8954, + "step": 47837 + }, + { + "epoch": 2.34, + "grad_norm": 0.8036574721336365, + "learning_rate": 6.796231723023772e-05, + "loss": 2.9591, + "step": 47838 + }, + { + "epoch": 2.34, + "grad_norm": 0.7077456116676331, + "learning_rate": 6.795255871588157e-05, + "loss": 2.9171, + "step": 47839 + }, + { + "epoch": 2.34, + "grad_norm": 0.7191286087036133, + "learning_rate": 6.794280081269385e-05, + "loss": 2.8068, + "step": 47840 + }, + { + "epoch": 2.34, + "grad_norm": 0.6867917776107788, + "learning_rate": 6.79330435207e-05, + "loss": 2.9191, + "step": 47841 + }, + { + "epoch": 2.34, + "grad_norm": 0.7825203537940979, + "learning_rate": 6.792328683992598e-05, + "loss": 2.9092, + "step": 47842 + }, + { + "epoch": 2.34, + "grad_norm": 0.7153095602989197, + "learning_rate": 6.791353077039733e-05, + "loss": 2.7737, + "step": 47843 + }, + { + "epoch": 2.34, + "grad_norm": 0.768951416015625, + "learning_rate": 6.790377531213969e-05, + "loss": 2.8843, + "step": 47844 + }, + { + "epoch": 2.34, + "grad_norm": 0.8321551084518433, + "learning_rate": 6.789402046517896e-05, + "loss": 2.6498, + "step": 47845 + }, + { + "epoch": 2.34, + "grad_norm": 0.726223886013031, + "learning_rate": 6.788426622954059e-05, + "loss": 2.914, + "step": 47846 + }, + { + "epoch": 2.34, + "grad_norm": 0.7852296233177185, + "learning_rate": 6.787451260525045e-05, + "loss": 2.9952, + "step": 47847 + }, + { + "epoch": 2.34, + "grad_norm": 0.7488097548484802, + "learning_rate": 6.786475959233414e-05, + "loss": 3.0178, + "step": 47848 + }, + { + "epoch": 2.34, + "grad_norm": 0.7254324555397034, + "learning_rate": 6.785500719081742e-05, + "loss": 3.0343, + "step": 47849 + }, + { + "epoch": 2.35, + "grad_norm": 0.7108405828475952, + "learning_rate": 6.784525540072596e-05, + "loss": 2.789, + "step": 47850 + }, + { + "epoch": 2.35, + "grad_norm": 0.7349480986595154, + "learning_rate": 6.783550422208534e-05, + "loss": 2.6636, + "step": 47851 + }, + { + "epoch": 2.35, + "grad_norm": 0.7456988096237183, + "learning_rate": 6.782575365492138e-05, + "loss": 2.9112, + "step": 47852 + }, + { + "epoch": 2.35, + "grad_norm": 0.7272082567214966, + "learning_rate": 6.781600369925962e-05, + "loss": 3.1821, + "step": 47853 + }, + { + "epoch": 2.35, + "grad_norm": 0.7798593640327454, + "learning_rate": 6.780625435512585e-05, + "loss": 2.9526, + "step": 47854 + }, + { + "epoch": 2.35, + "grad_norm": 0.7535591721534729, + "learning_rate": 6.77965056225458e-05, + "loss": 2.8622, + "step": 47855 + }, + { + "epoch": 2.35, + "grad_norm": 0.680900514125824, + "learning_rate": 6.778675750154506e-05, + "loss": 2.9729, + "step": 47856 + }, + { + "epoch": 2.35, + "grad_norm": 0.7286996245384216, + "learning_rate": 6.777700999214928e-05, + "loss": 3.0362, + "step": 47857 + }, + { + "epoch": 2.35, + "grad_norm": 0.772244930267334, + "learning_rate": 6.776726309438408e-05, + "loss": 3.0197, + "step": 47858 + }, + { + "epoch": 2.35, + "grad_norm": 0.7076226472854614, + "learning_rate": 6.775751680827525e-05, + "loss": 3.0025, + "step": 47859 + }, + { + "epoch": 2.35, + "grad_norm": 0.7190698385238647, + "learning_rate": 6.774777113384848e-05, + "loss": 3.1247, + "step": 47860 + }, + { + "epoch": 2.35, + "grad_norm": 0.7115904688835144, + "learning_rate": 6.773802607112934e-05, + "loss": 2.7755, + "step": 47861 + }, + { + "epoch": 2.35, + "grad_norm": 0.7206274271011353, + "learning_rate": 6.772828162014359e-05, + "loss": 2.8821, + "step": 47862 + }, + { + "epoch": 2.35, + "grad_norm": 0.7294838428497314, + "learning_rate": 6.771853778091678e-05, + "loss": 2.8908, + "step": 47863 + }, + { + "epoch": 2.35, + "grad_norm": 0.728641927242279, + "learning_rate": 6.77087945534747e-05, + "loss": 2.984, + "step": 47864 + }, + { + "epoch": 2.35, + "grad_norm": 0.6940865516662598, + "learning_rate": 6.769905193784295e-05, + "loss": 2.7064, + "step": 47865 + }, + { + "epoch": 2.35, + "grad_norm": 0.7145792245864868, + "learning_rate": 6.768930993404712e-05, + "loss": 2.7834, + "step": 47866 + }, + { + "epoch": 2.35, + "grad_norm": 0.7666870951652527, + "learning_rate": 6.767956854211302e-05, + "loss": 2.8824, + "step": 47867 + }, + { + "epoch": 2.35, + "grad_norm": 0.6863735318183899, + "learning_rate": 6.766982776206616e-05, + "loss": 2.7799, + "step": 47868 + }, + { + "epoch": 2.35, + "grad_norm": 0.714529275894165, + "learning_rate": 6.766008759393222e-05, + "loss": 2.8986, + "step": 47869 + }, + { + "epoch": 2.35, + "grad_norm": 0.6909911036491394, + "learning_rate": 6.7650348037737e-05, + "loss": 2.6996, + "step": 47870 + }, + { + "epoch": 2.35, + "grad_norm": 0.6930615901947021, + "learning_rate": 6.764060909350604e-05, + "loss": 3.0113, + "step": 47871 + }, + { + "epoch": 2.35, + "grad_norm": 0.7450976967811584, + "learning_rate": 6.7630870761265e-05, + "loss": 2.7526, + "step": 47872 + }, + { + "epoch": 2.35, + "grad_norm": 0.7528846263885498, + "learning_rate": 6.762113304103947e-05, + "loss": 3.0688, + "step": 47873 + }, + { + "epoch": 2.35, + "grad_norm": 0.7500634789466858, + "learning_rate": 6.761139593285512e-05, + "loss": 2.8791, + "step": 47874 + }, + { + "epoch": 2.35, + "grad_norm": 0.7247843146324158, + "learning_rate": 6.760165943673774e-05, + "loss": 3.054, + "step": 47875 + }, + { + "epoch": 2.35, + "grad_norm": 0.6777768731117249, + "learning_rate": 6.759192355271272e-05, + "loss": 3.0018, + "step": 47876 + }, + { + "epoch": 2.35, + "grad_norm": 0.7233495116233826, + "learning_rate": 6.758218828080598e-05, + "loss": 2.7997, + "step": 47877 + }, + { + "epoch": 2.35, + "grad_norm": 0.7728328108787537, + "learning_rate": 6.757245362104303e-05, + "loss": 3.1007, + "step": 47878 + }, + { + "epoch": 2.35, + "grad_norm": 0.6918376684188843, + "learning_rate": 6.756271957344938e-05, + "loss": 2.9147, + "step": 47879 + }, + { + "epoch": 2.35, + "grad_norm": 0.6865652799606323, + "learning_rate": 6.755298613805092e-05, + "loss": 3.0058, + "step": 47880 + }, + { + "epoch": 2.35, + "grad_norm": 0.7141489386558533, + "learning_rate": 6.754325331487308e-05, + "loss": 2.8366, + "step": 47881 + }, + { + "epoch": 2.35, + "grad_norm": 0.7318657636642456, + "learning_rate": 6.753352110394163e-05, + "loss": 2.7465, + "step": 47882 + }, + { + "epoch": 2.35, + "grad_norm": 0.7256183624267578, + "learning_rate": 6.752378950528206e-05, + "loss": 2.975, + "step": 47883 + }, + { + "epoch": 2.35, + "grad_norm": 0.8105762600898743, + "learning_rate": 6.751405851892009e-05, + "loss": 2.629, + "step": 47884 + }, + { + "epoch": 2.35, + "grad_norm": 0.7484055757522583, + "learning_rate": 6.750432814488145e-05, + "loss": 2.9615, + "step": 47885 + }, + { + "epoch": 2.35, + "grad_norm": 0.7243873476982117, + "learning_rate": 6.749459838319164e-05, + "loss": 2.8934, + "step": 47886 + }, + { + "epoch": 2.35, + "grad_norm": 0.7047001719474792, + "learning_rate": 6.748486923387636e-05, + "loss": 2.9622, + "step": 47887 + }, + { + "epoch": 2.35, + "grad_norm": 0.6941709518432617, + "learning_rate": 6.747514069696104e-05, + "loss": 3.0624, + "step": 47888 + }, + { + "epoch": 2.35, + "grad_norm": 0.7410936951637268, + "learning_rate": 6.74654127724715e-05, + "loss": 2.6276, + "step": 47889 + }, + { + "epoch": 2.35, + "grad_norm": 0.74365234375, + "learning_rate": 6.745568546043338e-05, + "loss": 3.0654, + "step": 47890 + }, + { + "epoch": 2.35, + "grad_norm": 0.7380276322364807, + "learning_rate": 6.744595876087211e-05, + "loss": 2.9712, + "step": 47891 + }, + { + "epoch": 2.35, + "grad_norm": 0.6733323335647583, + "learning_rate": 6.743623267381355e-05, + "loss": 2.6858, + "step": 47892 + }, + { + "epoch": 2.35, + "grad_norm": 0.7377191781997681, + "learning_rate": 6.742650719928317e-05, + "loss": 2.7803, + "step": 47893 + }, + { + "epoch": 2.35, + "grad_norm": 0.7515687942504883, + "learning_rate": 6.741678233730654e-05, + "loss": 3.0201, + "step": 47894 + }, + { + "epoch": 2.35, + "grad_norm": 0.7286888360977173, + "learning_rate": 6.740705808790943e-05, + "loss": 3.0113, + "step": 47895 + }, + { + "epoch": 2.35, + "grad_norm": 0.7062298059463501, + "learning_rate": 6.739733445111728e-05, + "loss": 2.7799, + "step": 47896 + }, + { + "epoch": 2.35, + "grad_norm": 0.7720447778701782, + "learning_rate": 6.738761142695588e-05, + "loss": 2.9522, + "step": 47897 + }, + { + "epoch": 2.35, + "grad_norm": 0.714799165725708, + "learning_rate": 6.737788901545067e-05, + "loss": 2.951, + "step": 47898 + }, + { + "epoch": 2.35, + "grad_norm": 0.7219805717468262, + "learning_rate": 6.736816721662737e-05, + "loss": 2.8175, + "step": 47899 + }, + { + "epoch": 2.35, + "grad_norm": 0.7763769626617432, + "learning_rate": 6.735844603051158e-05, + "loss": 2.9363, + "step": 47900 + }, + { + "epoch": 2.35, + "grad_norm": 0.7317289113998413, + "learning_rate": 6.73487254571288e-05, + "loss": 2.9334, + "step": 47901 + }, + { + "epoch": 2.35, + "grad_norm": 0.7248978018760681, + "learning_rate": 6.733900549650475e-05, + "loss": 3.0164, + "step": 47902 + }, + { + "epoch": 2.35, + "grad_norm": 0.7333472967147827, + "learning_rate": 6.732928614866494e-05, + "loss": 3.0241, + "step": 47903 + }, + { + "epoch": 2.35, + "grad_norm": 0.7930324673652649, + "learning_rate": 6.731956741363498e-05, + "loss": 2.8516, + "step": 47904 + }, + { + "epoch": 2.35, + "grad_norm": 0.7575118541717529, + "learning_rate": 6.730984929144058e-05, + "loss": 2.876, + "step": 47905 + }, + { + "epoch": 2.35, + "grad_norm": 0.7359734773635864, + "learning_rate": 6.730013178210717e-05, + "loss": 2.8377, + "step": 47906 + }, + { + "epoch": 2.35, + "grad_norm": 0.7023991942405701, + "learning_rate": 6.729041488566054e-05, + "loss": 2.9254, + "step": 47907 + }, + { + "epoch": 2.35, + "grad_norm": 0.7352596521377563, + "learning_rate": 6.728069860212616e-05, + "loss": 2.9139, + "step": 47908 + }, + { + "epoch": 2.35, + "grad_norm": 0.6935974359512329, + "learning_rate": 6.727098293152952e-05, + "loss": 2.9124, + "step": 47909 + }, + { + "epoch": 2.35, + "grad_norm": 0.7225365042686462, + "learning_rate": 6.726126787389645e-05, + "loss": 2.8963, + "step": 47910 + }, + { + "epoch": 2.35, + "grad_norm": 0.7281026840209961, + "learning_rate": 6.725155342925228e-05, + "loss": 2.9505, + "step": 47911 + }, + { + "epoch": 2.35, + "grad_norm": 0.7165417075157166, + "learning_rate": 6.724183959762285e-05, + "loss": 2.8035, + "step": 47912 + }, + { + "epoch": 2.35, + "grad_norm": 0.7453783750534058, + "learning_rate": 6.72321263790335e-05, + "loss": 2.9435, + "step": 47913 + }, + { + "epoch": 2.35, + "grad_norm": 0.7814623117446899, + "learning_rate": 6.722241377351006e-05, + "loss": 2.9833, + "step": 47914 + }, + { + "epoch": 2.35, + "grad_norm": 0.7536753416061401, + "learning_rate": 6.721270178107794e-05, + "loss": 2.8194, + "step": 47915 + }, + { + "epoch": 2.35, + "grad_norm": 0.7346058487892151, + "learning_rate": 6.72029904017627e-05, + "loss": 2.7661, + "step": 47916 + }, + { + "epoch": 2.35, + "grad_norm": 0.7435902953147888, + "learning_rate": 6.719327963559005e-05, + "loss": 2.7599, + "step": 47917 + }, + { + "epoch": 2.35, + "grad_norm": 0.7150677442550659, + "learning_rate": 6.718356948258543e-05, + "loss": 2.984, + "step": 47918 + }, + { + "epoch": 2.35, + "grad_norm": 0.7524111270904541, + "learning_rate": 6.717385994277446e-05, + "loss": 2.838, + "step": 47919 + }, + { + "epoch": 2.35, + "grad_norm": 0.7796128988265991, + "learning_rate": 6.716415101618284e-05, + "loss": 2.7455, + "step": 47920 + }, + { + "epoch": 2.35, + "grad_norm": 0.7386764883995056, + "learning_rate": 6.715444270283598e-05, + "loss": 2.9067, + "step": 47921 + }, + { + "epoch": 2.35, + "grad_norm": 0.7747188806533813, + "learning_rate": 6.71447350027596e-05, + "loss": 2.8656, + "step": 47922 + }, + { + "epoch": 2.35, + "grad_norm": 0.7116460800170898, + "learning_rate": 6.713502791597906e-05, + "loss": 2.8489, + "step": 47923 + }, + { + "epoch": 2.35, + "grad_norm": 0.7225497961044312, + "learning_rate": 6.712532144252014e-05, + "loss": 2.9257, + "step": 47924 + }, + { + "epoch": 2.35, + "grad_norm": 0.7185220122337341, + "learning_rate": 6.711561558240818e-05, + "loss": 2.9328, + "step": 47925 + }, + { + "epoch": 2.35, + "grad_norm": 0.7107573747634888, + "learning_rate": 6.710591033566888e-05, + "loss": 2.8896, + "step": 47926 + }, + { + "epoch": 2.35, + "grad_norm": 0.6854578852653503, + "learning_rate": 6.709620570232787e-05, + "loss": 3.0585, + "step": 47927 + }, + { + "epoch": 2.35, + "grad_norm": 0.7601986527442932, + "learning_rate": 6.708650168241055e-05, + "loss": 2.7619, + "step": 47928 + }, + { + "epoch": 2.35, + "grad_norm": 0.7219326496124268, + "learning_rate": 6.707679827594266e-05, + "loss": 3.0518, + "step": 47929 + }, + { + "epoch": 2.35, + "grad_norm": 0.7219486236572266, + "learning_rate": 6.706709548294963e-05, + "loss": 2.8814, + "step": 47930 + }, + { + "epoch": 2.35, + "grad_norm": 0.7470270395278931, + "learning_rate": 6.705739330345699e-05, + "loss": 2.8936, + "step": 47931 + }, + { + "epoch": 2.35, + "grad_norm": 0.7633888125419617, + "learning_rate": 6.70476917374904e-05, + "loss": 3.121, + "step": 47932 + }, + { + "epoch": 2.35, + "grad_norm": 0.7289944291114807, + "learning_rate": 6.703799078507527e-05, + "loss": 2.9113, + "step": 47933 + }, + { + "epoch": 2.35, + "grad_norm": 0.7318068742752075, + "learning_rate": 6.702829044623736e-05, + "loss": 2.9518, + "step": 47934 + }, + { + "epoch": 2.35, + "grad_norm": 0.6930985450744629, + "learning_rate": 6.701859072100197e-05, + "loss": 3.0098, + "step": 47935 + }, + { + "epoch": 2.35, + "grad_norm": 0.691349983215332, + "learning_rate": 6.700889160939488e-05, + "loss": 3.0021, + "step": 47936 + }, + { + "epoch": 2.35, + "grad_norm": 0.6962461471557617, + "learning_rate": 6.699919311144151e-05, + "loss": 2.9648, + "step": 47937 + }, + { + "epoch": 2.35, + "grad_norm": 0.6997219324111938, + "learning_rate": 6.698949522716737e-05, + "loss": 3.0215, + "step": 47938 + }, + { + "epoch": 2.35, + "grad_norm": 0.7151573896408081, + "learning_rate": 6.69797979565981e-05, + "loss": 2.9138, + "step": 47939 + }, + { + "epoch": 2.35, + "grad_norm": 0.7079306840896606, + "learning_rate": 6.697010129975917e-05, + "loss": 2.7507, + "step": 47940 + }, + { + "epoch": 2.35, + "grad_norm": 0.7613853216171265, + "learning_rate": 6.69604052566761e-05, + "loss": 2.7584, + "step": 47941 + }, + { + "epoch": 2.35, + "grad_norm": 0.7670363187789917, + "learning_rate": 6.695070982737456e-05, + "loss": 2.9046, + "step": 47942 + }, + { + "epoch": 2.35, + "grad_norm": 0.7608156204223633, + "learning_rate": 6.694101501187993e-05, + "loss": 2.9756, + "step": 47943 + }, + { + "epoch": 2.35, + "grad_norm": 0.7187494039535522, + "learning_rate": 6.693132081021788e-05, + "loss": 2.8245, + "step": 47944 + }, + { + "epoch": 2.35, + "grad_norm": 0.766250729560852, + "learning_rate": 6.69216272224139e-05, + "loss": 2.7639, + "step": 47945 + }, + { + "epoch": 2.35, + "grad_norm": 0.7323248386383057, + "learning_rate": 6.69119342484934e-05, + "loss": 2.975, + "step": 47946 + }, + { + "epoch": 2.35, + "grad_norm": 0.7261918187141418, + "learning_rate": 6.690224188848208e-05, + "loss": 3.0653, + "step": 47947 + }, + { + "epoch": 2.35, + "grad_norm": 0.7363904118537903, + "learning_rate": 6.689255014240533e-05, + "loss": 2.8412, + "step": 47948 + }, + { + "epoch": 2.35, + "grad_norm": 0.7276012301445007, + "learning_rate": 6.688285901028883e-05, + "loss": 3.0029, + "step": 47949 + }, + { + "epoch": 2.35, + "grad_norm": 0.7301309108734131, + "learning_rate": 6.68731684921579e-05, + "loss": 2.9247, + "step": 47950 + }, + { + "epoch": 2.35, + "grad_norm": 0.7242274880409241, + "learning_rate": 6.686347858803827e-05, + "loss": 2.767, + "step": 47951 + }, + { + "epoch": 2.35, + "grad_norm": 0.7706395387649536, + "learning_rate": 6.685378929795541e-05, + "loss": 3.0477, + "step": 47952 + }, + { + "epoch": 2.35, + "grad_norm": 0.7164594531059265, + "learning_rate": 6.684410062193467e-05, + "loss": 2.8957, + "step": 47953 + }, + { + "epoch": 2.35, + "grad_norm": 0.7162006497383118, + "learning_rate": 6.683441256000182e-05, + "loss": 2.9765, + "step": 47954 + }, + { + "epoch": 2.35, + "grad_norm": 0.7521647214889526, + "learning_rate": 6.682472511218213e-05, + "loss": 2.8028, + "step": 47955 + }, + { + "epoch": 2.35, + "grad_norm": 0.7329918146133423, + "learning_rate": 6.681503827850124e-05, + "loss": 3.0094, + "step": 47956 + }, + { + "epoch": 2.35, + "grad_norm": 0.7082628011703491, + "learning_rate": 6.680535205898476e-05, + "loss": 3.1608, + "step": 47957 + }, + { + "epoch": 2.35, + "grad_norm": 0.721556544303894, + "learning_rate": 6.679566645365813e-05, + "loss": 2.7341, + "step": 47958 + }, + { + "epoch": 2.35, + "grad_norm": 0.7148202657699585, + "learning_rate": 6.67859814625468e-05, + "loss": 3.0916, + "step": 47959 + }, + { + "epoch": 2.35, + "grad_norm": 0.7291754484176636, + "learning_rate": 6.677629708567624e-05, + "loss": 2.8334, + "step": 47960 + }, + { + "epoch": 2.35, + "grad_norm": 0.8292950391769409, + "learning_rate": 6.676661332307201e-05, + "loss": 3.0277, + "step": 47961 + }, + { + "epoch": 2.35, + "grad_norm": 0.7232643365859985, + "learning_rate": 6.675693017475972e-05, + "loss": 2.8574, + "step": 47962 + }, + { + "epoch": 2.35, + "grad_norm": 0.7282116413116455, + "learning_rate": 6.674724764076471e-05, + "loss": 3.191, + "step": 47963 + }, + { + "epoch": 2.35, + "grad_norm": 0.7802262902259827, + "learning_rate": 6.673756572111268e-05, + "loss": 2.9908, + "step": 47964 + }, + { + "epoch": 2.35, + "grad_norm": 0.732752799987793, + "learning_rate": 6.672788441582886e-05, + "loss": 2.8625, + "step": 47965 + }, + { + "epoch": 2.35, + "grad_norm": 0.7601721286773682, + "learning_rate": 6.671820372493901e-05, + "loss": 2.9282, + "step": 47966 + }, + { + "epoch": 2.35, + "grad_norm": 0.7560071349143982, + "learning_rate": 6.670852364846855e-05, + "loss": 3.0195, + "step": 47967 + }, + { + "epoch": 2.35, + "grad_norm": 0.694574236869812, + "learning_rate": 6.669884418644281e-05, + "loss": 3.183, + "step": 47968 + }, + { + "epoch": 2.35, + "grad_norm": 0.7665233016014099, + "learning_rate": 6.668916533888752e-05, + "loss": 3.1757, + "step": 47969 + }, + { + "epoch": 2.35, + "grad_norm": 0.7764652371406555, + "learning_rate": 6.667948710582798e-05, + "loss": 2.9517, + "step": 47970 + }, + { + "epoch": 2.35, + "grad_norm": 0.7054910063743591, + "learning_rate": 6.666980948728978e-05, + "loss": 2.9967, + "step": 47971 + }, + { + "epoch": 2.35, + "grad_norm": 0.7566725611686707, + "learning_rate": 6.666013248329845e-05, + "loss": 2.9945, + "step": 47972 + }, + { + "epoch": 2.35, + "grad_norm": 0.7435179948806763, + "learning_rate": 6.665045609387944e-05, + "loss": 2.9586, + "step": 47973 + }, + { + "epoch": 2.35, + "grad_norm": 0.6983252167701721, + "learning_rate": 6.664078031905822e-05, + "loss": 2.8248, + "step": 47974 + }, + { + "epoch": 2.35, + "grad_norm": 0.7251109480857849, + "learning_rate": 6.663110515886021e-05, + "loss": 3.0567, + "step": 47975 + }, + { + "epoch": 2.35, + "grad_norm": 0.7957937717437744, + "learning_rate": 6.662143061331093e-05, + "loss": 2.6457, + "step": 47976 + }, + { + "epoch": 2.35, + "grad_norm": 0.7234058380126953, + "learning_rate": 6.661175668243596e-05, + "loss": 3.1194, + "step": 47977 + }, + { + "epoch": 2.35, + "grad_norm": 0.7288504242897034, + "learning_rate": 6.660208336626066e-05, + "loss": 2.9923, + "step": 47978 + }, + { + "epoch": 2.35, + "grad_norm": 0.7363245487213135, + "learning_rate": 6.659241066481063e-05, + "loss": 3.0909, + "step": 47979 + }, + { + "epoch": 2.35, + "grad_norm": 0.7586546540260315, + "learning_rate": 6.658273857811123e-05, + "loss": 2.919, + "step": 47980 + }, + { + "epoch": 2.35, + "grad_norm": 0.7015671730041504, + "learning_rate": 6.657306710618794e-05, + "loss": 2.8089, + "step": 47981 + }, + { + "epoch": 2.35, + "grad_norm": 0.7294623851776123, + "learning_rate": 6.656339624906633e-05, + "loss": 3.0357, + "step": 47982 + }, + { + "epoch": 2.35, + "grad_norm": 0.7198174595832825, + "learning_rate": 6.655372600677171e-05, + "loss": 2.9374, + "step": 47983 + }, + { + "epoch": 2.35, + "grad_norm": 0.7496002316474915, + "learning_rate": 6.654405637932976e-05, + "loss": 2.7852, + "step": 47984 + }, + { + "epoch": 2.35, + "grad_norm": 0.71015465259552, + "learning_rate": 6.653438736676575e-05, + "loss": 2.9953, + "step": 47985 + }, + { + "epoch": 2.35, + "grad_norm": 0.7362428307533264, + "learning_rate": 6.65247189691052e-05, + "loss": 2.6275, + "step": 47986 + }, + { + "epoch": 2.35, + "grad_norm": 0.7148375511169434, + "learning_rate": 6.651505118637374e-05, + "loss": 2.952, + "step": 47987 + }, + { + "epoch": 2.35, + "grad_norm": 0.7447728514671326, + "learning_rate": 6.650538401859665e-05, + "loss": 3.0321, + "step": 47988 + }, + { + "epoch": 2.35, + "grad_norm": 0.716418981552124, + "learning_rate": 6.649571746579946e-05, + "loss": 2.7179, + "step": 47989 + }, + { + "epoch": 2.35, + "grad_norm": 0.7445757389068604, + "learning_rate": 6.648605152800753e-05, + "loss": 2.9258, + "step": 47990 + }, + { + "epoch": 2.35, + "grad_norm": 0.7147215008735657, + "learning_rate": 6.647638620524642e-05, + "loss": 2.9995, + "step": 47991 + }, + { + "epoch": 2.35, + "grad_norm": 0.7629702091217041, + "learning_rate": 6.646672149754161e-05, + "loss": 2.941, + "step": 47992 + }, + { + "epoch": 2.35, + "grad_norm": 0.6876111030578613, + "learning_rate": 6.645705740491847e-05, + "loss": 2.9911, + "step": 47993 + }, + { + "epoch": 2.35, + "grad_norm": 0.7215240597724915, + "learning_rate": 6.644739392740255e-05, + "loss": 2.8185, + "step": 47994 + }, + { + "epoch": 2.35, + "grad_norm": 0.7500731348991394, + "learning_rate": 6.643773106501925e-05, + "loss": 2.9694, + "step": 47995 + }, + { + "epoch": 2.35, + "grad_norm": 0.7356647253036499, + "learning_rate": 6.642806881779392e-05, + "loss": 2.8889, + "step": 47996 + }, + { + "epoch": 2.35, + "grad_norm": 0.7396929860115051, + "learning_rate": 6.641840718575221e-05, + "loss": 2.9491, + "step": 47997 + }, + { + "epoch": 2.35, + "grad_norm": 0.7047430276870728, + "learning_rate": 6.640874616891938e-05, + "loss": 2.9185, + "step": 47998 + }, + { + "epoch": 2.35, + "grad_norm": 0.7342384457588196, + "learning_rate": 6.639908576732105e-05, + "loss": 2.8062, + "step": 47999 + }, + { + "epoch": 2.35, + "grad_norm": 0.7048220634460449, + "learning_rate": 6.638942598098245e-05, + "loss": 3.0256, + "step": 48000 + }, + { + "epoch": 2.35, + "grad_norm": 0.6992772817611694, + "learning_rate": 6.637976680992926e-05, + "loss": 2.8008, + "step": 48001 + }, + { + "epoch": 2.35, + "grad_norm": 0.7305018305778503, + "learning_rate": 6.63701082541868e-05, + "loss": 2.9617, + "step": 48002 + }, + { + "epoch": 2.35, + "grad_norm": 0.7258499264717102, + "learning_rate": 6.636045031378041e-05, + "loss": 2.9443, + "step": 48003 + }, + { + "epoch": 2.35, + "grad_norm": 0.7828282713890076, + "learning_rate": 6.635079298873572e-05, + "loss": 3.0035, + "step": 48004 + }, + { + "epoch": 2.35, + "grad_norm": 0.7275944948196411, + "learning_rate": 6.634113627907798e-05, + "loss": 2.8979, + "step": 48005 + }, + { + "epoch": 2.35, + "grad_norm": 0.7850326299667358, + "learning_rate": 6.633148018483281e-05, + "loss": 3.0463, + "step": 48006 + }, + { + "epoch": 2.35, + "grad_norm": 0.7134331464767456, + "learning_rate": 6.63218247060255e-05, + "loss": 2.8976, + "step": 48007 + }, + { + "epoch": 2.35, + "grad_norm": 0.7655074596405029, + "learning_rate": 6.631216984268149e-05, + "loss": 2.9853, + "step": 48008 + }, + { + "epoch": 2.35, + "grad_norm": 0.7371658682823181, + "learning_rate": 6.630251559482633e-05, + "loss": 2.8503, + "step": 48009 + }, + { + "epoch": 2.35, + "grad_norm": 0.7096714973449707, + "learning_rate": 6.629286196248539e-05, + "loss": 2.905, + "step": 48010 + }, + { + "epoch": 2.35, + "grad_norm": 0.7324158549308777, + "learning_rate": 6.628320894568407e-05, + "loss": 3.0072, + "step": 48011 + }, + { + "epoch": 2.35, + "grad_norm": 0.7133394479751587, + "learning_rate": 6.627355654444774e-05, + "loss": 2.8081, + "step": 48012 + }, + { + "epoch": 2.35, + "grad_norm": 0.7191671133041382, + "learning_rate": 6.626390475880183e-05, + "loss": 2.7956, + "step": 48013 + }, + { + "epoch": 2.35, + "grad_norm": 0.7937895655632019, + "learning_rate": 6.625425358877194e-05, + "loss": 2.8923, + "step": 48014 + }, + { + "epoch": 2.35, + "grad_norm": 0.7353552579879761, + "learning_rate": 6.624460303438325e-05, + "loss": 2.9375, + "step": 48015 + }, + { + "epoch": 2.35, + "grad_norm": 0.7080419063568115, + "learning_rate": 6.623495309566138e-05, + "loss": 2.9092, + "step": 48016 + }, + { + "epoch": 2.35, + "grad_norm": 0.7076489925384521, + "learning_rate": 6.622530377263164e-05, + "loss": 3.1517, + "step": 48017 + }, + { + "epoch": 2.35, + "grad_norm": 0.7452599406242371, + "learning_rate": 6.621565506531939e-05, + "loss": 2.9966, + "step": 48018 + }, + { + "epoch": 2.35, + "grad_norm": 0.7066365480422974, + "learning_rate": 6.620600697375019e-05, + "loss": 2.7327, + "step": 48019 + }, + { + "epoch": 2.35, + "grad_norm": 0.6905827522277832, + "learning_rate": 6.619635949794929e-05, + "loss": 3.0837, + "step": 48020 + }, + { + "epoch": 2.35, + "grad_norm": 0.7231424450874329, + "learning_rate": 6.618671263794226e-05, + "loss": 3.1329, + "step": 48021 + }, + { + "epoch": 2.35, + "grad_norm": 0.7269807457923889, + "learning_rate": 6.617706639375437e-05, + "loss": 2.7666, + "step": 48022 + }, + { + "epoch": 2.35, + "grad_norm": 0.7062513828277588, + "learning_rate": 6.616742076541117e-05, + "loss": 2.7635, + "step": 48023 + }, + { + "epoch": 2.35, + "grad_norm": 0.7632284760475159, + "learning_rate": 6.615777575293797e-05, + "loss": 2.9912, + "step": 48024 + }, + { + "epoch": 2.35, + "grad_norm": 0.7120453715324402, + "learning_rate": 6.614813135636008e-05, + "loss": 3.2382, + "step": 48025 + }, + { + "epoch": 2.35, + "grad_norm": 0.7771921157836914, + "learning_rate": 6.61384875757031e-05, + "loss": 2.8899, + "step": 48026 + }, + { + "epoch": 2.35, + "grad_norm": 0.71280837059021, + "learning_rate": 6.612884441099229e-05, + "loss": 2.8302, + "step": 48027 + }, + { + "epoch": 2.35, + "grad_norm": 0.7356342077255249, + "learning_rate": 6.611920186225305e-05, + "loss": 3.1114, + "step": 48028 + }, + { + "epoch": 2.35, + "grad_norm": 0.7350482940673828, + "learning_rate": 6.610955992951092e-05, + "loss": 3.127, + "step": 48029 + }, + { + "epoch": 2.35, + "grad_norm": 0.7893431782722473, + "learning_rate": 6.60999186127911e-05, + "loss": 2.8438, + "step": 48030 + }, + { + "epoch": 2.35, + "grad_norm": 0.716373085975647, + "learning_rate": 6.609027791211919e-05, + "loss": 2.901, + "step": 48031 + }, + { + "epoch": 2.35, + "grad_norm": 0.7196996212005615, + "learning_rate": 6.608063782752045e-05, + "loss": 2.7733, + "step": 48032 + }, + { + "epoch": 2.35, + "grad_norm": 0.7472556829452515, + "learning_rate": 6.607099835902023e-05, + "loss": 3.0575, + "step": 48033 + }, + { + "epoch": 2.35, + "grad_norm": 0.7139104604721069, + "learning_rate": 6.606135950664403e-05, + "loss": 2.9093, + "step": 48034 + }, + { + "epoch": 2.35, + "grad_norm": 0.8291552662849426, + "learning_rate": 6.605172127041715e-05, + "loss": 2.989, + "step": 48035 + }, + { + "epoch": 2.35, + "grad_norm": 0.7301540970802307, + "learning_rate": 6.604208365036507e-05, + "loss": 2.89, + "step": 48036 + }, + { + "epoch": 2.35, + "grad_norm": 0.7189569473266602, + "learning_rate": 6.603244664651301e-05, + "loss": 2.8514, + "step": 48037 + }, + { + "epoch": 2.35, + "grad_norm": 0.7034281492233276, + "learning_rate": 6.602281025888656e-05, + "loss": 2.8714, + "step": 48038 + }, + { + "epoch": 2.35, + "grad_norm": 0.7544010281562805, + "learning_rate": 6.601317448751101e-05, + "loss": 2.8995, + "step": 48039 + }, + { + "epoch": 2.35, + "grad_norm": 0.6748170256614685, + "learning_rate": 6.600353933241164e-05, + "loss": 2.6243, + "step": 48040 + }, + { + "epoch": 2.35, + "grad_norm": 0.685340940952301, + "learning_rate": 6.599390479361398e-05, + "loss": 2.7344, + "step": 48041 + }, + { + "epoch": 2.35, + "grad_norm": 0.6887692213058472, + "learning_rate": 6.598427087114326e-05, + "loss": 3.0116, + "step": 48042 + }, + { + "epoch": 2.35, + "grad_norm": 0.7316964864730835, + "learning_rate": 6.597463756502492e-05, + "loss": 3.0474, + "step": 48043 + }, + { + "epoch": 2.35, + "grad_norm": 0.7430610060691833, + "learning_rate": 6.596500487528447e-05, + "loss": 2.7318, + "step": 48044 + }, + { + "epoch": 2.35, + "grad_norm": 0.71198570728302, + "learning_rate": 6.595537280194704e-05, + "loss": 2.9316, + "step": 48045 + }, + { + "epoch": 2.35, + "grad_norm": 0.720703661441803, + "learning_rate": 6.594574134503822e-05, + "loss": 3.1858, + "step": 48046 + }, + { + "epoch": 2.35, + "grad_norm": 0.7150319218635559, + "learning_rate": 6.593611050458325e-05, + "loss": 2.8819, + "step": 48047 + }, + { + "epoch": 2.35, + "grad_norm": 0.7347413897514343, + "learning_rate": 6.592648028060746e-05, + "loss": 3.1495, + "step": 48048 + }, + { + "epoch": 2.35, + "grad_norm": 0.7172079682350159, + "learning_rate": 6.591685067313634e-05, + "loss": 2.856, + "step": 48049 + }, + { + "epoch": 2.35, + "grad_norm": 0.7229471802711487, + "learning_rate": 6.59072216821951e-05, + "loss": 2.9738, + "step": 48050 + }, + { + "epoch": 2.35, + "grad_norm": 0.7183575630187988, + "learning_rate": 6.589759330780926e-05, + "loss": 3.0105, + "step": 48051 + }, + { + "epoch": 2.35, + "grad_norm": 0.7318481802940369, + "learning_rate": 6.588796555000403e-05, + "loss": 2.8139, + "step": 48052 + }, + { + "epoch": 2.35, + "grad_norm": 0.7228243350982666, + "learning_rate": 6.587833840880493e-05, + "loss": 2.8365, + "step": 48053 + }, + { + "epoch": 2.36, + "grad_norm": 0.7609462738037109, + "learning_rate": 6.586871188423724e-05, + "loss": 2.6898, + "step": 48054 + }, + { + "epoch": 2.36, + "grad_norm": 0.7056893110275269, + "learning_rate": 6.585908597632622e-05, + "loss": 2.9007, + "step": 48055 + }, + { + "epoch": 2.36, + "grad_norm": 0.7319220304489136, + "learning_rate": 6.584946068509737e-05, + "loss": 2.9661, + "step": 48056 + }, + { + "epoch": 2.36, + "grad_norm": 0.7054177522659302, + "learning_rate": 6.583983601057593e-05, + "loss": 2.7722, + "step": 48057 + }, + { + "epoch": 2.36, + "grad_norm": 0.7039554119110107, + "learning_rate": 6.583021195278726e-05, + "loss": 2.7155, + "step": 48058 + }, + { + "epoch": 2.36, + "grad_norm": 0.7091907858848572, + "learning_rate": 6.582058851175683e-05, + "loss": 3.0179, + "step": 48059 + }, + { + "epoch": 2.36, + "grad_norm": 0.7543073892593384, + "learning_rate": 6.581096568750993e-05, + "loss": 2.9976, + "step": 48060 + }, + { + "epoch": 2.36, + "grad_norm": 0.7862129211425781, + "learning_rate": 6.580134348007183e-05, + "loss": 2.6822, + "step": 48061 + }, + { + "epoch": 2.36, + "grad_norm": 0.7358943223953247, + "learning_rate": 6.579172188946789e-05, + "loss": 2.8732, + "step": 48062 + }, + { + "epoch": 2.36, + "grad_norm": 0.7497488260269165, + "learning_rate": 6.578210091572344e-05, + "loss": 2.9566, + "step": 48063 + }, + { + "epoch": 2.36, + "grad_norm": 0.6996098160743713, + "learning_rate": 6.577248055886396e-05, + "loss": 3.085, + "step": 48064 + }, + { + "epoch": 2.36, + "grad_norm": 0.7074849009513855, + "learning_rate": 6.576286081891462e-05, + "loss": 2.998, + "step": 48065 + }, + { + "epoch": 2.36, + "grad_norm": 0.7271658182144165, + "learning_rate": 6.575324169590086e-05, + "loss": 3.0539, + "step": 48066 + }, + { + "epoch": 2.36, + "grad_norm": 0.8061689138412476, + "learning_rate": 6.574362318984792e-05, + "loss": 2.6798, + "step": 48067 + }, + { + "epoch": 2.36, + "grad_norm": 0.7243710160255432, + "learning_rate": 6.573400530078129e-05, + "loss": 2.8028, + "step": 48068 + }, + { + "epoch": 2.36, + "grad_norm": 0.7632570862770081, + "learning_rate": 6.57243880287262e-05, + "loss": 2.8476, + "step": 48069 + }, + { + "epoch": 2.36, + "grad_norm": 0.7301486134529114, + "learning_rate": 6.571477137370788e-05, + "loss": 2.9998, + "step": 48070 + }, + { + "epoch": 2.36, + "grad_norm": 0.7374549508094788, + "learning_rate": 6.570515533575188e-05, + "loss": 3.0035, + "step": 48071 + }, + { + "epoch": 2.36, + "grad_norm": 0.7524304986000061, + "learning_rate": 6.569553991488332e-05, + "loss": 2.8548, + "step": 48072 + }, + { + "epoch": 2.36, + "grad_norm": 0.706081748008728, + "learning_rate": 6.568592511112759e-05, + "loss": 2.8905, + "step": 48073 + }, + { + "epoch": 2.36, + "grad_norm": 0.7375027537345886, + "learning_rate": 6.567631092451014e-05, + "loss": 3.1871, + "step": 48074 + }, + { + "epoch": 2.36, + "grad_norm": 0.7398857474327087, + "learning_rate": 6.566669735505619e-05, + "loss": 2.9136, + "step": 48075 + }, + { + "epoch": 2.36, + "grad_norm": 0.7311112880706787, + "learning_rate": 6.565708440279106e-05, + "loss": 2.6949, + "step": 48076 + }, + { + "epoch": 2.36, + "grad_norm": 0.7611480355262756, + "learning_rate": 6.564747206773996e-05, + "loss": 3.016, + "step": 48077 + }, + { + "epoch": 2.36, + "grad_norm": 0.7483656406402588, + "learning_rate": 6.563786034992843e-05, + "loss": 3.0053, + "step": 48078 + }, + { + "epoch": 2.36, + "grad_norm": 0.7387386560440063, + "learning_rate": 6.562824924938162e-05, + "loss": 2.9809, + "step": 48079 + }, + { + "epoch": 2.36, + "grad_norm": 0.745112955570221, + "learning_rate": 6.561863876612484e-05, + "loss": 2.9586, + "step": 48080 + }, + { + "epoch": 2.36, + "grad_norm": 0.7196276783943176, + "learning_rate": 6.560902890018357e-05, + "loss": 3.2109, + "step": 48081 + }, + { + "epoch": 2.36, + "grad_norm": 0.7283993363380432, + "learning_rate": 6.559941965158297e-05, + "loss": 3.1374, + "step": 48082 + }, + { + "epoch": 2.36, + "grad_norm": 0.7114156484603882, + "learning_rate": 6.558981102034844e-05, + "loss": 3.015, + "step": 48083 + }, + { + "epoch": 2.36, + "grad_norm": 0.7603095769882202, + "learning_rate": 6.55802030065051e-05, + "loss": 2.9397, + "step": 48084 + }, + { + "epoch": 2.36, + "grad_norm": 0.6935713291168213, + "learning_rate": 6.557059561007843e-05, + "loss": 3.1052, + "step": 48085 + }, + { + "epoch": 2.36, + "grad_norm": 0.6886460185050964, + "learning_rate": 6.556098883109378e-05, + "loss": 2.9214, + "step": 48086 + }, + { + "epoch": 2.36, + "grad_norm": 0.7370249032974243, + "learning_rate": 6.555138266957627e-05, + "loss": 2.9907, + "step": 48087 + }, + { + "epoch": 2.36, + "grad_norm": 0.7281854748725891, + "learning_rate": 6.554177712555138e-05, + "loss": 2.7215, + "step": 48088 + }, + { + "epoch": 2.36, + "grad_norm": 0.7895329594612122, + "learning_rate": 6.553217219904424e-05, + "loss": 2.8622, + "step": 48089 + }, + { + "epoch": 2.36, + "grad_norm": 0.7070068717002869, + "learning_rate": 6.552256789008031e-05, + "loss": 2.8635, + "step": 48090 + }, + { + "epoch": 2.36, + "grad_norm": 0.8132027387619019, + "learning_rate": 6.551296419868481e-05, + "loss": 2.7608, + "step": 48091 + }, + { + "epoch": 2.36, + "grad_norm": 0.7146351933479309, + "learning_rate": 6.5503361124883e-05, + "loss": 2.5896, + "step": 48092 + }, + { + "epoch": 2.36, + "grad_norm": 0.7069380879402161, + "learning_rate": 6.549375866870023e-05, + "loss": 3.038, + "step": 48093 + }, + { + "epoch": 2.36, + "grad_norm": 0.7371593713760376, + "learning_rate": 6.548415683016169e-05, + "loss": 2.9843, + "step": 48094 + }, + { + "epoch": 2.36, + "grad_norm": 0.7395793795585632, + "learning_rate": 6.547455560929278e-05, + "loss": 2.6887, + "step": 48095 + }, + { + "epoch": 2.36, + "grad_norm": 0.7200589179992676, + "learning_rate": 6.546495500611882e-05, + "loss": 2.8007, + "step": 48096 + }, + { + "epoch": 2.36, + "grad_norm": 0.7023578882217407, + "learning_rate": 6.545535502066505e-05, + "loss": 2.632, + "step": 48097 + }, + { + "epoch": 2.36, + "grad_norm": 0.7165631055831909, + "learning_rate": 6.544575565295672e-05, + "loss": 2.9052, + "step": 48098 + }, + { + "epoch": 2.36, + "grad_norm": 0.7894178032875061, + "learning_rate": 6.543615690301903e-05, + "loss": 2.7053, + "step": 48099 + }, + { + "epoch": 2.36, + "grad_norm": 0.6898087859153748, + "learning_rate": 6.54265587708774e-05, + "loss": 2.7166, + "step": 48100 + }, + { + "epoch": 2.36, + "grad_norm": 0.7402461171150208, + "learning_rate": 6.541696125655711e-05, + "loss": 3.0452, + "step": 48101 + }, + { + "epoch": 2.36, + "grad_norm": 0.7221176028251648, + "learning_rate": 6.540736436008333e-05, + "loss": 2.9999, + "step": 48102 + }, + { + "epoch": 2.36, + "grad_norm": 0.7226662039756775, + "learning_rate": 6.539776808148148e-05, + "loss": 2.8164, + "step": 48103 + }, + { + "epoch": 2.36, + "grad_norm": 0.7799202799797058, + "learning_rate": 6.538817242077676e-05, + "loss": 2.9798, + "step": 48104 + }, + { + "epoch": 2.36, + "grad_norm": 0.7280610203742981, + "learning_rate": 6.537857737799438e-05, + "loss": 2.9594, + "step": 48105 + }, + { + "epoch": 2.36, + "grad_norm": 0.7227134108543396, + "learning_rate": 6.536898295315975e-05, + "loss": 2.6752, + "step": 48106 + }, + { + "epoch": 2.36, + "grad_norm": 0.6850351691246033, + "learning_rate": 6.535938914629797e-05, + "loss": 2.8372, + "step": 48107 + }, + { + "epoch": 2.36, + "grad_norm": 0.716296911239624, + "learning_rate": 6.534979595743449e-05, + "loss": 2.9671, + "step": 48108 + }, + { + "epoch": 2.36, + "grad_norm": 0.6905850172042847, + "learning_rate": 6.534020338659439e-05, + "loss": 2.728, + "step": 48109 + }, + { + "epoch": 2.36, + "grad_norm": 0.7352511882781982, + "learning_rate": 6.533061143380304e-05, + "loss": 2.9374, + "step": 48110 + }, + { + "epoch": 2.36, + "grad_norm": 0.7677817940711975, + "learning_rate": 6.53210200990858e-05, + "loss": 2.7527, + "step": 48111 + }, + { + "epoch": 2.36, + "grad_norm": 0.7257785201072693, + "learning_rate": 6.531142938246781e-05, + "loss": 3.1291, + "step": 48112 + }, + { + "epoch": 2.36, + "grad_norm": 0.7301912903785706, + "learning_rate": 6.530183928397435e-05, + "loss": 2.7511, + "step": 48113 + }, + { + "epoch": 2.36, + "grad_norm": 0.7411414384841919, + "learning_rate": 6.529224980363062e-05, + "loss": 2.822, + "step": 48114 + }, + { + "epoch": 2.36, + "grad_norm": 0.7773602604866028, + "learning_rate": 6.528266094146191e-05, + "loss": 2.8123, + "step": 48115 + }, + { + "epoch": 2.36, + "grad_norm": 0.7311223149299622, + "learning_rate": 6.527307269749358e-05, + "loss": 2.9162, + "step": 48116 + }, + { + "epoch": 2.36, + "grad_norm": 0.7010335922241211, + "learning_rate": 6.526348507175073e-05, + "loss": 2.9099, + "step": 48117 + }, + { + "epoch": 2.36, + "grad_norm": 0.7506341338157654, + "learning_rate": 6.525389806425875e-05, + "loss": 2.963, + "step": 48118 + }, + { + "epoch": 2.36, + "grad_norm": 0.7527403235435486, + "learning_rate": 6.524431167504283e-05, + "loss": 2.9887, + "step": 48119 + }, + { + "epoch": 2.36, + "grad_norm": 0.7092116475105286, + "learning_rate": 6.523472590412814e-05, + "loss": 2.7487, + "step": 48120 + }, + { + "epoch": 2.36, + "grad_norm": 0.7055248618125916, + "learning_rate": 6.522514075154006e-05, + "loss": 3.0072, + "step": 48121 + }, + { + "epoch": 2.36, + "grad_norm": 0.7026886940002441, + "learning_rate": 6.521555621730372e-05, + "loss": 2.7419, + "step": 48122 + }, + { + "epoch": 2.36, + "grad_norm": 0.6752901673316956, + "learning_rate": 6.52059723014445e-05, + "loss": 2.918, + "step": 48123 + }, + { + "epoch": 2.36, + "grad_norm": 0.715085506439209, + "learning_rate": 6.519638900398747e-05, + "loss": 2.9607, + "step": 48124 + }, + { + "epoch": 2.36, + "grad_norm": 0.7532249093055725, + "learning_rate": 6.518680632495805e-05, + "loss": 2.8491, + "step": 48125 + }, + { + "epoch": 2.36, + "grad_norm": 0.7319445610046387, + "learning_rate": 6.517722426438139e-05, + "loss": 2.9336, + "step": 48126 + }, + { + "epoch": 2.36, + "grad_norm": 0.7512108683586121, + "learning_rate": 6.516764282228264e-05, + "loss": 2.6582, + "step": 48127 + }, + { + "epoch": 2.36, + "grad_norm": 0.7304036021232605, + "learning_rate": 6.515806199868722e-05, + "loss": 3.0088, + "step": 48128 + }, + { + "epoch": 2.36, + "grad_norm": 0.7679119110107422, + "learning_rate": 6.514848179362017e-05, + "loss": 2.9008, + "step": 48129 + }, + { + "epoch": 2.36, + "grad_norm": 0.6933061480522156, + "learning_rate": 6.513890220710682e-05, + "loss": 3.1265, + "step": 48130 + }, + { + "epoch": 2.36, + "grad_norm": 0.7610794305801392, + "learning_rate": 6.512932323917251e-05, + "loss": 2.8423, + "step": 48131 + }, + { + "epoch": 2.36, + "grad_norm": 0.7362642288208008, + "learning_rate": 6.511974488984226e-05, + "loss": 2.924, + "step": 48132 + }, + { + "epoch": 2.36, + "grad_norm": 0.7452098727226257, + "learning_rate": 6.511016715914147e-05, + "loss": 2.9748, + "step": 48133 + }, + { + "epoch": 2.36, + "grad_norm": 0.7530220150947571, + "learning_rate": 6.51005900470953e-05, + "loss": 3.0002, + "step": 48134 + }, + { + "epoch": 2.36, + "grad_norm": 0.8055080771446228, + "learning_rate": 6.50910135537289e-05, + "loss": 3.0089, + "step": 48135 + }, + { + "epoch": 2.36, + "grad_norm": 0.722628653049469, + "learning_rate": 6.508143767906761e-05, + "loss": 2.9678, + "step": 48136 + }, + { + "epoch": 2.36, + "grad_norm": 0.7451878190040588, + "learning_rate": 6.507186242313657e-05, + "loss": 2.8849, + "step": 48137 + }, + { + "epoch": 2.36, + "grad_norm": 0.7400360107421875, + "learning_rate": 6.50622877859611e-05, + "loss": 2.9511, + "step": 48138 + }, + { + "epoch": 2.36, + "grad_norm": 0.7950756549835205, + "learning_rate": 6.505271376756625e-05, + "loss": 2.7903, + "step": 48139 + }, + { + "epoch": 2.36, + "grad_norm": 0.7754967212677002, + "learning_rate": 6.504314036797742e-05, + "loss": 2.8494, + "step": 48140 + }, + { + "epoch": 2.36, + "grad_norm": 0.719846785068512, + "learning_rate": 6.503356758721974e-05, + "loss": 3.1789, + "step": 48141 + }, + { + "epoch": 2.36, + "grad_norm": 0.7460191249847412, + "learning_rate": 6.502399542531835e-05, + "loss": 2.7791, + "step": 48142 + }, + { + "epoch": 2.36, + "grad_norm": 0.6921855211257935, + "learning_rate": 6.501442388229862e-05, + "loss": 2.8102, + "step": 48143 + }, + { + "epoch": 2.36, + "grad_norm": 0.8616728782653809, + "learning_rate": 6.500485295818557e-05, + "loss": 2.8678, + "step": 48144 + }, + { + "epoch": 2.36, + "grad_norm": 0.7126127481460571, + "learning_rate": 6.499528265300453e-05, + "loss": 3.0098, + "step": 48145 + }, + { + "epoch": 2.36, + "grad_norm": 0.723812460899353, + "learning_rate": 6.498571296678075e-05, + "loss": 2.8521, + "step": 48146 + }, + { + "epoch": 2.36, + "grad_norm": 0.7311473488807678, + "learning_rate": 6.497614389953932e-05, + "loss": 3.0363, + "step": 48147 + }, + { + "epoch": 2.36, + "grad_norm": 0.7247294783592224, + "learning_rate": 6.496657545130565e-05, + "loss": 2.7482, + "step": 48148 + }, + { + "epoch": 2.36, + "grad_norm": 0.7272924184799194, + "learning_rate": 6.495700762210459e-05, + "loss": 3.0696, + "step": 48149 + }, + { + "epoch": 2.36, + "grad_norm": 0.7352733016014099, + "learning_rate": 6.494744041196158e-05, + "loss": 2.9417, + "step": 48150 + }, + { + "epoch": 2.36, + "grad_norm": 0.7280297875404358, + "learning_rate": 6.493787382090186e-05, + "loss": 3.0833, + "step": 48151 + }, + { + "epoch": 2.36, + "grad_norm": 0.6772411465644836, + "learning_rate": 6.492830784895042e-05, + "loss": 3.0068, + "step": 48152 + }, + { + "epoch": 2.36, + "grad_norm": 0.7198385000228882, + "learning_rate": 6.491874249613267e-05, + "loss": 3.0322, + "step": 48153 + }, + { + "epoch": 2.36, + "grad_norm": 0.7675883173942566, + "learning_rate": 6.490917776247364e-05, + "loss": 3.009, + "step": 48154 + }, + { + "epoch": 2.36, + "grad_norm": 0.7238621115684509, + "learning_rate": 6.489961364799868e-05, + "loss": 2.8473, + "step": 48155 + }, + { + "epoch": 2.36, + "grad_norm": 0.804047703742981, + "learning_rate": 6.489005015273288e-05, + "loss": 2.7081, + "step": 48156 + }, + { + "epoch": 2.36, + "grad_norm": 0.688144862651825, + "learning_rate": 6.488048727670136e-05, + "loss": 3.1035, + "step": 48157 + }, + { + "epoch": 2.36, + "grad_norm": 0.8125393390655518, + "learning_rate": 6.487092501992946e-05, + "loss": 2.8867, + "step": 48158 + }, + { + "epoch": 2.36, + "grad_norm": 0.7027606964111328, + "learning_rate": 6.486136338244222e-05, + "loss": 3.0313, + "step": 48159 + }, + { + "epoch": 2.36, + "grad_norm": 0.7894521355628967, + "learning_rate": 6.485180236426499e-05, + "loss": 2.9952, + "step": 48160 + }, + { + "epoch": 2.36, + "grad_norm": 0.7138534188270569, + "learning_rate": 6.484224196542276e-05, + "loss": 2.9025, + "step": 48161 + }, + { + "epoch": 2.36, + "grad_norm": 0.7329240441322327, + "learning_rate": 6.48326821859409e-05, + "loss": 2.9142, + "step": 48162 + }, + { + "epoch": 2.36, + "grad_norm": 0.7437616586685181, + "learning_rate": 6.482312302584448e-05, + "loss": 2.9068, + "step": 48163 + }, + { + "epoch": 2.36, + "grad_norm": 0.7317550182342529, + "learning_rate": 6.481356448515863e-05, + "loss": 2.7661, + "step": 48164 + }, + { + "epoch": 2.36, + "grad_norm": 0.7197153568267822, + "learning_rate": 6.480400656390866e-05, + "loss": 2.8229, + "step": 48165 + }, + { + "epoch": 2.36, + "grad_norm": 0.7194134593009949, + "learning_rate": 6.479444926211959e-05, + "loss": 2.7384, + "step": 48166 + }, + { + "epoch": 2.36, + "grad_norm": 0.7750094532966614, + "learning_rate": 6.478489257981667e-05, + "loss": 2.7421, + "step": 48167 + }, + { + "epoch": 2.36, + "grad_norm": 0.7340207695960999, + "learning_rate": 6.477533651702517e-05, + "loss": 2.7373, + "step": 48168 + }, + { + "epoch": 2.36, + "grad_norm": 0.7263909578323364, + "learning_rate": 6.476578107377008e-05, + "loss": 2.9162, + "step": 48169 + }, + { + "epoch": 2.36, + "grad_norm": 0.7772607207298279, + "learning_rate": 6.475622625007674e-05, + "loss": 3.024, + "step": 48170 + }, + { + "epoch": 2.36, + "grad_norm": 0.7200969457626343, + "learning_rate": 6.47466720459702e-05, + "loss": 3.0136, + "step": 48171 + }, + { + "epoch": 2.36, + "grad_norm": 0.7096529006958008, + "learning_rate": 6.473711846147557e-05, + "loss": 3.0276, + "step": 48172 + }, + { + "epoch": 2.36, + "grad_norm": 0.7910388708114624, + "learning_rate": 6.47275654966182e-05, + "loss": 2.8153, + "step": 48173 + }, + { + "epoch": 2.36, + "grad_norm": 0.753176212310791, + "learning_rate": 6.471801315142304e-05, + "loss": 2.9071, + "step": 48174 + }, + { + "epoch": 2.36, + "grad_norm": 0.7686089277267456, + "learning_rate": 6.470846142591545e-05, + "loss": 2.9368, + "step": 48175 + }, + { + "epoch": 2.36, + "grad_norm": 0.7298809885978699, + "learning_rate": 6.46989103201204e-05, + "loss": 2.8194, + "step": 48176 + }, + { + "epoch": 2.36, + "grad_norm": 0.7550878524780273, + "learning_rate": 6.46893598340632e-05, + "loss": 3.235, + "step": 48177 + }, + { + "epoch": 2.36, + "grad_norm": 0.7033806443214417, + "learning_rate": 6.467980996776898e-05, + "loss": 2.869, + "step": 48178 + }, + { + "epoch": 2.36, + "grad_norm": 0.743836522102356, + "learning_rate": 6.467026072126275e-05, + "loss": 2.7897, + "step": 48179 + }, + { + "epoch": 2.36, + "grad_norm": 0.7496225833892822, + "learning_rate": 6.466071209456984e-05, + "loss": 2.9425, + "step": 48180 + }, + { + "epoch": 2.36, + "grad_norm": 0.7606794238090515, + "learning_rate": 6.465116408771526e-05, + "loss": 2.9058, + "step": 48181 + }, + { + "epoch": 2.36, + "grad_norm": 0.7309315204620361, + "learning_rate": 6.46416167007242e-05, + "loss": 2.6442, + "step": 48182 + }, + { + "epoch": 2.36, + "grad_norm": 0.7935462594032288, + "learning_rate": 6.463206993362191e-05, + "loss": 2.9268, + "step": 48183 + }, + { + "epoch": 2.36, + "grad_norm": 0.7384037971496582, + "learning_rate": 6.462252378643345e-05, + "loss": 3.1264, + "step": 48184 + }, + { + "epoch": 2.36, + "grad_norm": 0.7172885537147522, + "learning_rate": 6.461297825918394e-05, + "loss": 2.9865, + "step": 48185 + }, + { + "epoch": 2.36, + "grad_norm": 0.7741020321846008, + "learning_rate": 6.460343335189849e-05, + "loss": 2.7657, + "step": 48186 + }, + { + "epoch": 2.36, + "grad_norm": 0.742149293422699, + "learning_rate": 6.45938890646023e-05, + "loss": 2.8609, + "step": 48187 + }, + { + "epoch": 2.36, + "grad_norm": 0.6724493503570557, + "learning_rate": 6.458434539732058e-05, + "loss": 2.9769, + "step": 48188 + }, + { + "epoch": 2.36, + "grad_norm": 0.7558503746986389, + "learning_rate": 6.457480235007829e-05, + "loss": 2.8232, + "step": 48189 + }, + { + "epoch": 2.36, + "grad_norm": 0.7640053033828735, + "learning_rate": 6.456525992290075e-05, + "loss": 2.8353, + "step": 48190 + }, + { + "epoch": 2.36, + "grad_norm": 0.7217065095901489, + "learning_rate": 6.455571811581293e-05, + "loss": 2.814, + "step": 48191 + }, + { + "epoch": 2.36, + "grad_norm": 0.7169857025146484, + "learning_rate": 6.454617692884014e-05, + "loss": 2.9869, + "step": 48192 + }, + { + "epoch": 2.36, + "grad_norm": 0.7590023279190063, + "learning_rate": 6.453663636200738e-05, + "loss": 2.8972, + "step": 48193 + }, + { + "epoch": 2.36, + "grad_norm": 0.6975011825561523, + "learning_rate": 6.45270964153397e-05, + "loss": 3.0994, + "step": 48194 + }, + { + "epoch": 2.36, + "grad_norm": 0.7145510911941528, + "learning_rate": 6.451755708886243e-05, + "loss": 2.8702, + "step": 48195 + }, + { + "epoch": 2.36, + "grad_norm": 0.7827848196029663, + "learning_rate": 6.450801838260055e-05, + "loss": 3.2263, + "step": 48196 + }, + { + "epoch": 2.36, + "grad_norm": 0.7232081890106201, + "learning_rate": 6.449848029657918e-05, + "loss": 2.9435, + "step": 48197 + }, + { + "epoch": 2.36, + "grad_norm": 0.7153977155685425, + "learning_rate": 6.448894283082358e-05, + "loss": 2.8311, + "step": 48198 + }, + { + "epoch": 2.36, + "grad_norm": 0.7476628422737122, + "learning_rate": 6.44794059853588e-05, + "loss": 2.9139, + "step": 48199 + }, + { + "epoch": 2.36, + "grad_norm": 0.7741516828536987, + "learning_rate": 6.446986976020994e-05, + "loss": 2.8474, + "step": 48200 + }, + { + "epoch": 2.36, + "grad_norm": 0.7252960801124573, + "learning_rate": 6.4460334155402e-05, + "loss": 3.0896, + "step": 48201 + }, + { + "epoch": 2.36, + "grad_norm": 0.711409866809845, + "learning_rate": 6.445079917096024e-05, + "loss": 2.9098, + "step": 48202 + }, + { + "epoch": 2.36, + "grad_norm": 0.6946399211883545, + "learning_rate": 6.44412648069098e-05, + "loss": 2.8107, + "step": 48203 + }, + { + "epoch": 2.36, + "grad_norm": 0.7085345983505249, + "learning_rate": 6.443173106327568e-05, + "loss": 3.0131, + "step": 48204 + }, + { + "epoch": 2.36, + "grad_norm": 0.7308792471885681, + "learning_rate": 6.44221979400831e-05, + "loss": 3.0106, + "step": 48205 + }, + { + "epoch": 2.36, + "grad_norm": 0.7140591144561768, + "learning_rate": 6.441266543735712e-05, + "loss": 3.0825, + "step": 48206 + }, + { + "epoch": 2.36, + "grad_norm": 0.7082121968269348, + "learning_rate": 6.440313355512277e-05, + "loss": 2.9419, + "step": 48207 + }, + { + "epoch": 2.36, + "grad_norm": 0.7402740120887756, + "learning_rate": 6.43936022934053e-05, + "loss": 3.0304, + "step": 48208 + }, + { + "epoch": 2.36, + "grad_norm": 0.7371035814285278, + "learning_rate": 6.438407165222964e-05, + "loss": 2.9194, + "step": 48209 + }, + { + "epoch": 2.36, + "grad_norm": 0.695787250995636, + "learning_rate": 6.437454163162108e-05, + "loss": 3.0194, + "step": 48210 + }, + { + "epoch": 2.36, + "grad_norm": 0.7401152849197388, + "learning_rate": 6.436501223160456e-05, + "loss": 3.0295, + "step": 48211 + }, + { + "epoch": 2.36, + "grad_norm": 0.7056068778038025, + "learning_rate": 6.435548345220523e-05, + "loss": 3.2774, + "step": 48212 + }, + { + "epoch": 2.36, + "grad_norm": 0.7429165840148926, + "learning_rate": 6.434595529344832e-05, + "loss": 2.6779, + "step": 48213 + }, + { + "epoch": 2.36, + "grad_norm": 0.7436327338218689, + "learning_rate": 6.43364277553588e-05, + "loss": 2.8903, + "step": 48214 + }, + { + "epoch": 2.36, + "grad_norm": 0.7075293064117432, + "learning_rate": 6.432690083796177e-05, + "loss": 2.9532, + "step": 48215 + }, + { + "epoch": 2.36, + "grad_norm": 0.728111982345581, + "learning_rate": 6.431737454128224e-05, + "loss": 2.8952, + "step": 48216 + }, + { + "epoch": 2.36, + "grad_norm": 0.7516396045684814, + "learning_rate": 6.43078488653454e-05, + "loss": 3.0249, + "step": 48217 + }, + { + "epoch": 2.36, + "grad_norm": 0.7346088290214539, + "learning_rate": 6.42983238101764e-05, + "loss": 2.7594, + "step": 48218 + }, + { + "epoch": 2.36, + "grad_norm": 0.7349608540534973, + "learning_rate": 6.428879937580015e-05, + "loss": 2.7937, + "step": 48219 + }, + { + "epoch": 2.36, + "grad_norm": 0.7432407140731812, + "learning_rate": 6.427927556224196e-05, + "loss": 2.8822, + "step": 48220 + }, + { + "epoch": 2.36, + "grad_norm": 0.7252591848373413, + "learning_rate": 6.426975236952675e-05, + "loss": 2.7845, + "step": 48221 + }, + { + "epoch": 2.36, + "grad_norm": 0.6931779384613037, + "learning_rate": 6.42602297976796e-05, + "loss": 2.9773, + "step": 48222 + }, + { + "epoch": 2.36, + "grad_norm": 0.7488669157028198, + "learning_rate": 6.425070784672571e-05, + "loss": 2.9075, + "step": 48223 + }, + { + "epoch": 2.36, + "grad_norm": 0.7043687105178833, + "learning_rate": 6.424118651668998e-05, + "loss": 3.07, + "step": 48224 + }, + { + "epoch": 2.36, + "grad_norm": 0.726883053779602, + "learning_rate": 6.423166580759771e-05, + "loss": 2.6815, + "step": 48225 + }, + { + "epoch": 2.36, + "grad_norm": 0.7561427354812622, + "learning_rate": 6.422214571947376e-05, + "loss": 2.7455, + "step": 48226 + }, + { + "epoch": 2.36, + "grad_norm": 0.6870282292366028, + "learning_rate": 6.421262625234337e-05, + "loss": 2.6591, + "step": 48227 + }, + { + "epoch": 2.36, + "grad_norm": 0.7265474200248718, + "learning_rate": 6.420310740623154e-05, + "loss": 2.7049, + "step": 48228 + }, + { + "epoch": 2.36, + "grad_norm": 0.7331658601760864, + "learning_rate": 6.419358918116329e-05, + "loss": 2.8969, + "step": 48229 + }, + { + "epoch": 2.36, + "grad_norm": 0.7479391098022461, + "learning_rate": 6.418407157716381e-05, + "loss": 3.0111, + "step": 48230 + }, + { + "epoch": 2.36, + "grad_norm": 0.738805890083313, + "learning_rate": 6.417455459425804e-05, + "loss": 2.7992, + "step": 48231 + }, + { + "epoch": 2.36, + "grad_norm": 0.7408674955368042, + "learning_rate": 6.416503823247115e-05, + "loss": 2.7176, + "step": 48232 + }, + { + "epoch": 2.36, + "grad_norm": 0.7549987435340881, + "learning_rate": 6.415552249182812e-05, + "loss": 2.8767, + "step": 48233 + }, + { + "epoch": 2.36, + "grad_norm": 0.7115577459335327, + "learning_rate": 6.414600737235402e-05, + "loss": 2.8056, + "step": 48234 + }, + { + "epoch": 2.36, + "grad_norm": 0.7305412292480469, + "learning_rate": 6.413649287407406e-05, + "loss": 2.9721, + "step": 48235 + }, + { + "epoch": 2.36, + "grad_norm": 0.7211993336677551, + "learning_rate": 6.412697899701315e-05, + "loss": 2.8724, + "step": 48236 + }, + { + "epoch": 2.36, + "grad_norm": 0.7803027629852295, + "learning_rate": 6.411746574119642e-05, + "loss": 2.7537, + "step": 48237 + }, + { + "epoch": 2.36, + "grad_norm": 0.7594745755195618, + "learning_rate": 6.410795310664878e-05, + "loss": 3.07, + "step": 48238 + }, + { + "epoch": 2.36, + "grad_norm": 0.7324116230010986, + "learning_rate": 6.409844109339542e-05, + "loss": 2.8141, + "step": 48239 + }, + { + "epoch": 2.36, + "grad_norm": 0.6955828070640564, + "learning_rate": 6.408892970146143e-05, + "loss": 2.7801, + "step": 48240 + }, + { + "epoch": 2.36, + "grad_norm": 0.7122442722320557, + "learning_rate": 6.407941893087172e-05, + "loss": 2.8966, + "step": 48241 + }, + { + "epoch": 2.36, + "grad_norm": 0.7142869234085083, + "learning_rate": 6.40699087816515e-05, + "loss": 2.817, + "step": 48242 + }, + { + "epoch": 2.36, + "grad_norm": 0.7723212838172913, + "learning_rate": 6.40603992538257e-05, + "loss": 2.9792, + "step": 48243 + }, + { + "epoch": 2.36, + "grad_norm": 0.7571322917938232, + "learning_rate": 6.405089034741937e-05, + "loss": 2.9604, + "step": 48244 + }, + { + "epoch": 2.36, + "grad_norm": 0.730701744556427, + "learning_rate": 6.404138206245765e-05, + "loss": 2.9751, + "step": 48245 + }, + { + "epoch": 2.36, + "grad_norm": 0.7998728156089783, + "learning_rate": 6.403187439896541e-05, + "loss": 3.1012, + "step": 48246 + }, + { + "epoch": 2.36, + "grad_norm": 0.6990532279014587, + "learning_rate": 6.402236735696793e-05, + "loss": 2.839, + "step": 48247 + }, + { + "epoch": 2.36, + "grad_norm": 0.7441544532775879, + "learning_rate": 6.401286093648998e-05, + "loss": 2.9951, + "step": 48248 + }, + { + "epoch": 2.36, + "grad_norm": 0.6886500716209412, + "learning_rate": 6.400335513755679e-05, + "loss": 2.8154, + "step": 48249 + }, + { + "epoch": 2.36, + "grad_norm": 0.7198939323425293, + "learning_rate": 6.399384996019349e-05, + "loss": 3.056, + "step": 48250 + }, + { + "epoch": 2.36, + "grad_norm": 0.7159644961357117, + "learning_rate": 6.398434540442479e-05, + "loss": 2.8903, + "step": 48251 + }, + { + "epoch": 2.36, + "grad_norm": 0.733924388885498, + "learning_rate": 6.3974841470276e-05, + "loss": 2.9423, + "step": 48252 + }, + { + "epoch": 2.36, + "grad_norm": 0.7536724209785461, + "learning_rate": 6.396533815777197e-05, + "loss": 2.8155, + "step": 48253 + }, + { + "epoch": 2.36, + "grad_norm": 0.7304123044013977, + "learning_rate": 6.395583546693781e-05, + "loss": 2.8958, + "step": 48254 + }, + { + "epoch": 2.36, + "grad_norm": 0.7245292663574219, + "learning_rate": 6.394633339779865e-05, + "loss": 2.9736, + "step": 48255 + }, + { + "epoch": 2.36, + "grad_norm": 0.7507905960083008, + "learning_rate": 6.393683195037931e-05, + "loss": 2.8121, + "step": 48256 + }, + { + "epoch": 2.36, + "grad_norm": 0.7602225542068481, + "learning_rate": 6.392733112470502e-05, + "loss": 3.0037, + "step": 48257 + }, + { + "epoch": 2.37, + "grad_norm": 0.7352206707000732, + "learning_rate": 6.391783092080069e-05, + "loss": 2.929, + "step": 48258 + }, + { + "epoch": 2.37, + "grad_norm": 0.7542200684547424, + "learning_rate": 6.390833133869132e-05, + "loss": 2.9124, + "step": 48259 + }, + { + "epoch": 2.37, + "grad_norm": 0.7620871663093567, + "learning_rate": 6.389883237840201e-05, + "loss": 2.9583, + "step": 48260 + }, + { + "epoch": 2.37, + "grad_norm": 0.756596028804779, + "learning_rate": 6.388933403995768e-05, + "loss": 2.8351, + "step": 48261 + }, + { + "epoch": 2.37, + "grad_norm": 0.7041810750961304, + "learning_rate": 6.387983632338346e-05, + "loss": 2.9144, + "step": 48262 + }, + { + "epoch": 2.37, + "grad_norm": 0.7106823921203613, + "learning_rate": 6.387033922870425e-05, + "loss": 3.1492, + "step": 48263 + }, + { + "epoch": 2.37, + "grad_norm": 0.7551729083061218, + "learning_rate": 6.386084275594519e-05, + "loss": 2.929, + "step": 48264 + }, + { + "epoch": 2.37, + "grad_norm": 0.7914819717407227, + "learning_rate": 6.385134690513122e-05, + "loss": 2.8709, + "step": 48265 + }, + { + "epoch": 2.37, + "grad_norm": 0.7370805144309998, + "learning_rate": 6.384185167628726e-05, + "loss": 2.7838, + "step": 48266 + }, + { + "epoch": 2.37, + "grad_norm": 0.7235316634178162, + "learning_rate": 6.383235706943852e-05, + "loss": 3.0178, + "step": 48267 + }, + { + "epoch": 2.37, + "grad_norm": 0.7227551341056824, + "learning_rate": 6.382286308460982e-05, + "loss": 3.1076, + "step": 48268 + }, + { + "epoch": 2.37, + "grad_norm": 0.7112234830856323, + "learning_rate": 6.381336972182624e-05, + "loss": 2.9721, + "step": 48269 + }, + { + "epoch": 2.37, + "grad_norm": 0.7669709920883179, + "learning_rate": 6.380387698111288e-05, + "loss": 2.8578, + "step": 48270 + }, + { + "epoch": 2.37, + "grad_norm": 0.7495376467704773, + "learning_rate": 6.379438486249457e-05, + "loss": 2.8566, + "step": 48271 + }, + { + "epoch": 2.37, + "grad_norm": 0.6867067813873291, + "learning_rate": 6.378489336599644e-05, + "loss": 3.2184, + "step": 48272 + }, + { + "epoch": 2.37, + "grad_norm": 0.6908702254295349, + "learning_rate": 6.377540249164348e-05, + "loss": 2.8861, + "step": 48273 + }, + { + "epoch": 2.37, + "grad_norm": 0.7597622871398926, + "learning_rate": 6.376591223946054e-05, + "loss": 2.673, + "step": 48274 + }, + { + "epoch": 2.37, + "grad_norm": 0.7196870446205139, + "learning_rate": 6.375642260947278e-05, + "loss": 2.8766, + "step": 48275 + }, + { + "epoch": 2.37, + "grad_norm": 0.7505701780319214, + "learning_rate": 6.374693360170512e-05, + "loss": 2.9577, + "step": 48276 + }, + { + "epoch": 2.37, + "grad_norm": 0.7584325671195984, + "learning_rate": 6.373744521618261e-05, + "loss": 2.9858, + "step": 48277 + }, + { + "epoch": 2.37, + "grad_norm": 0.7458747625350952, + "learning_rate": 6.372795745293013e-05, + "loss": 3.0763, + "step": 48278 + }, + { + "epoch": 2.37, + "grad_norm": 0.79670649766922, + "learning_rate": 6.371847031197283e-05, + "loss": 2.8207, + "step": 48279 + }, + { + "epoch": 2.37, + "grad_norm": 0.67808997631073, + "learning_rate": 6.37089837933356e-05, + "loss": 2.705, + "step": 48280 + }, + { + "epoch": 2.37, + "grad_norm": 0.741486132144928, + "learning_rate": 6.369949789704332e-05, + "loss": 2.8786, + "step": 48281 + }, + { + "epoch": 2.37, + "grad_norm": 0.7459058165550232, + "learning_rate": 6.369001262312118e-05, + "loss": 3.1757, + "step": 48282 + }, + { + "epoch": 2.37, + "grad_norm": 0.6977839469909668, + "learning_rate": 6.368052797159402e-05, + "loss": 2.8612, + "step": 48283 + }, + { + "epoch": 2.37, + "grad_norm": 0.7027153968811035, + "learning_rate": 6.367104394248684e-05, + "loss": 2.9531, + "step": 48284 + }, + { + "epoch": 2.37, + "grad_norm": 0.6924808621406555, + "learning_rate": 6.366156053582475e-05, + "loss": 2.8736, + "step": 48285 + }, + { + "epoch": 2.37, + "grad_norm": 0.7258365750312805, + "learning_rate": 6.365207775163259e-05, + "loss": 2.7555, + "step": 48286 + }, + { + "epoch": 2.37, + "grad_norm": 0.7599617838859558, + "learning_rate": 6.36425955899354e-05, + "loss": 2.9788, + "step": 48287 + }, + { + "epoch": 2.37, + "grad_norm": 0.6839301586151123, + "learning_rate": 6.363311405075803e-05, + "loss": 3.2282, + "step": 48288 + }, + { + "epoch": 2.37, + "grad_norm": 0.7343278527259827, + "learning_rate": 6.362363313412557e-05, + "loss": 2.9432, + "step": 48289 + }, + { + "epoch": 2.37, + "grad_norm": 0.739058256149292, + "learning_rate": 6.361415284006301e-05, + "loss": 2.7162, + "step": 48290 + }, + { + "epoch": 2.37, + "grad_norm": 0.7455694675445557, + "learning_rate": 6.360467316859522e-05, + "loss": 3.0939, + "step": 48291 + }, + { + "epoch": 2.37, + "grad_norm": 0.7573563456535339, + "learning_rate": 6.359519411974733e-05, + "loss": 2.9621, + "step": 48292 + }, + { + "epoch": 2.37, + "grad_norm": 0.8001168370246887, + "learning_rate": 6.358571569354408e-05, + "loss": 2.9751, + "step": 48293 + }, + { + "epoch": 2.37, + "grad_norm": 0.7187392115592957, + "learning_rate": 6.357623789001064e-05, + "loss": 2.984, + "step": 48294 + }, + { + "epoch": 2.37, + "grad_norm": 0.6924291849136353, + "learning_rate": 6.356676070917192e-05, + "loss": 2.9977, + "step": 48295 + }, + { + "epoch": 2.37, + "grad_norm": 0.7191535830497742, + "learning_rate": 6.355728415105276e-05, + "loss": 2.8086, + "step": 48296 + }, + { + "epoch": 2.37, + "grad_norm": 0.7379903793334961, + "learning_rate": 6.354780821567828e-05, + "loss": 2.7732, + "step": 48297 + }, + { + "epoch": 2.37, + "grad_norm": 0.7109218239784241, + "learning_rate": 6.353833290307329e-05, + "loss": 2.6506, + "step": 48298 + }, + { + "epoch": 2.37, + "grad_norm": 0.7022459506988525, + "learning_rate": 6.35288582132628e-05, + "loss": 2.8524, + "step": 48299 + }, + { + "epoch": 2.37, + "grad_norm": 0.734695553779602, + "learning_rate": 6.351938414627191e-05, + "loss": 2.8649, + "step": 48300 + }, + { + "epoch": 2.37, + "grad_norm": 0.7235419750213623, + "learning_rate": 6.350991070212544e-05, + "loss": 2.6928, + "step": 48301 + }, + { + "epoch": 2.37, + "grad_norm": 0.7387106418609619, + "learning_rate": 6.350043788084835e-05, + "loss": 2.7351, + "step": 48302 + }, + { + "epoch": 2.37, + "grad_norm": 0.709895133972168, + "learning_rate": 6.349096568246549e-05, + "loss": 2.8434, + "step": 48303 + }, + { + "epoch": 2.37, + "grad_norm": 0.757611870765686, + "learning_rate": 6.348149410700192e-05, + "loss": 2.9994, + "step": 48304 + }, + { + "epoch": 2.37, + "grad_norm": 0.6948244571685791, + "learning_rate": 6.347202315448265e-05, + "loss": 2.7651, + "step": 48305 + }, + { + "epoch": 2.37, + "grad_norm": 0.7108848094940186, + "learning_rate": 6.346255282493248e-05, + "loss": 2.6992, + "step": 48306 + }, + { + "epoch": 2.37, + "grad_norm": 0.7500197291374207, + "learning_rate": 6.34530831183765e-05, + "loss": 2.9149, + "step": 48307 + }, + { + "epoch": 2.37, + "grad_norm": 0.7581573724746704, + "learning_rate": 6.34436140348396e-05, + "loss": 2.8101, + "step": 48308 + }, + { + "epoch": 2.37, + "grad_norm": 0.7715035676956177, + "learning_rate": 6.343414557434658e-05, + "loss": 2.795, + "step": 48309 + }, + { + "epoch": 2.37, + "grad_norm": 0.7194180488586426, + "learning_rate": 6.342467773692258e-05, + "loss": 2.9186, + "step": 48310 + }, + { + "epoch": 2.37, + "grad_norm": 0.7068958282470703, + "learning_rate": 6.341521052259236e-05, + "loss": 2.9039, + "step": 48311 + }, + { + "epoch": 2.37, + "grad_norm": 0.7456514239311218, + "learning_rate": 6.340574393138104e-05, + "loss": 2.9809, + "step": 48312 + }, + { + "epoch": 2.37, + "grad_norm": 0.7299491763114929, + "learning_rate": 6.339627796331338e-05, + "loss": 3.1068, + "step": 48313 + }, + { + "epoch": 2.37, + "grad_norm": 0.7144024968147278, + "learning_rate": 6.338681261841447e-05, + "loss": 2.7794, + "step": 48314 + }, + { + "epoch": 2.37, + "grad_norm": 0.7453769445419312, + "learning_rate": 6.337734789670907e-05, + "loss": 2.8954, + "step": 48315 + }, + { + "epoch": 2.37, + "grad_norm": 0.7227847576141357, + "learning_rate": 6.336788379822226e-05, + "loss": 2.9263, + "step": 48316 + }, + { + "epoch": 2.37, + "grad_norm": 0.7394989132881165, + "learning_rate": 6.335842032297892e-05, + "loss": 2.9357, + "step": 48317 + }, + { + "epoch": 2.37, + "grad_norm": 0.7643879652023315, + "learning_rate": 6.334895747100388e-05, + "loss": 2.8486, + "step": 48318 + }, + { + "epoch": 2.37, + "grad_norm": 0.710767388343811, + "learning_rate": 6.333949524232222e-05, + "loss": 2.8892, + "step": 48319 + }, + { + "epoch": 2.37, + "grad_norm": 0.743543803691864, + "learning_rate": 6.33300336369587e-05, + "loss": 3.076, + "step": 48320 + }, + { + "epoch": 2.37, + "grad_norm": 0.7929369211196899, + "learning_rate": 6.332057265493834e-05, + "loss": 2.9037, + "step": 48321 + }, + { + "epoch": 2.37, + "grad_norm": 0.7415796518325806, + "learning_rate": 6.33111122962861e-05, + "loss": 2.8008, + "step": 48322 + }, + { + "epoch": 2.37, + "grad_norm": 0.7029473185539246, + "learning_rate": 6.330165256102687e-05, + "loss": 2.8482, + "step": 48323 + }, + { + "epoch": 2.37, + "grad_norm": 0.7240558862686157, + "learning_rate": 6.32921934491855e-05, + "loss": 2.8949, + "step": 48324 + }, + { + "epoch": 2.37, + "grad_norm": 0.9161472320556641, + "learning_rate": 6.328273496078686e-05, + "loss": 2.7527, + "step": 48325 + }, + { + "epoch": 2.37, + "grad_norm": 0.7548016309738159, + "learning_rate": 6.327327709585597e-05, + "loss": 2.7239, + "step": 48326 + }, + { + "epoch": 2.37, + "grad_norm": 0.7531619071960449, + "learning_rate": 6.326381985441776e-05, + "loss": 2.9115, + "step": 48327 + }, + { + "epoch": 2.37, + "grad_norm": 0.7124982476234436, + "learning_rate": 6.3254363236497e-05, + "loss": 2.8327, + "step": 48328 + }, + { + "epoch": 2.37, + "grad_norm": 0.7740418910980225, + "learning_rate": 6.324490724211879e-05, + "loss": 2.811, + "step": 48329 + }, + { + "epoch": 2.37, + "grad_norm": 0.7585882544517517, + "learning_rate": 6.32354518713079e-05, + "loss": 2.7411, + "step": 48330 + }, + { + "epoch": 2.37, + "grad_norm": 0.6965351104736328, + "learning_rate": 6.322599712408922e-05, + "loss": 2.8907, + "step": 48331 + }, + { + "epoch": 2.37, + "grad_norm": 0.7372415065765381, + "learning_rate": 6.321654300048774e-05, + "loss": 3.0206, + "step": 48332 + }, + { + "epoch": 2.37, + "grad_norm": 0.7417186498641968, + "learning_rate": 6.320708950052827e-05, + "loss": 2.8534, + "step": 48333 + }, + { + "epoch": 2.37, + "grad_norm": 0.701483964920044, + "learning_rate": 6.31976366242358e-05, + "loss": 2.8217, + "step": 48334 + }, + { + "epoch": 2.37, + "grad_norm": 0.7164797186851501, + "learning_rate": 6.318818437163514e-05, + "loss": 2.8596, + "step": 48335 + }, + { + "epoch": 2.37, + "grad_norm": 0.727675199508667, + "learning_rate": 6.31787327427512e-05, + "loss": 2.7906, + "step": 48336 + }, + { + "epoch": 2.37, + "grad_norm": 0.6889267563819885, + "learning_rate": 6.3169281737609e-05, + "loss": 3.2159, + "step": 48337 + }, + { + "epoch": 2.37, + "grad_norm": 0.7056227326393127, + "learning_rate": 6.315983135623331e-05, + "loss": 2.9742, + "step": 48338 + }, + { + "epoch": 2.37, + "grad_norm": 0.7440009713172913, + "learning_rate": 6.315038159864905e-05, + "loss": 2.872, + "step": 48339 + }, + { + "epoch": 2.37, + "grad_norm": 0.7045519948005676, + "learning_rate": 6.314093246488101e-05, + "loss": 3.1583, + "step": 48340 + }, + { + "epoch": 2.37, + "grad_norm": 0.7586000561714172, + "learning_rate": 6.313148395495418e-05, + "loss": 2.7775, + "step": 48341 + }, + { + "epoch": 2.37, + "grad_norm": 0.7050657272338867, + "learning_rate": 6.312203606889356e-05, + "loss": 2.7034, + "step": 48342 + }, + { + "epoch": 2.37, + "grad_norm": 0.734698474407196, + "learning_rate": 6.311258880672376e-05, + "loss": 2.8888, + "step": 48343 + }, + { + "epoch": 2.37, + "grad_norm": 0.7230560183525085, + "learning_rate": 6.310314216846992e-05, + "loss": 2.8722, + "step": 48344 + }, + { + "epoch": 2.37, + "grad_norm": 0.7460218667984009, + "learning_rate": 6.309369615415681e-05, + "loss": 2.943, + "step": 48345 + }, + { + "epoch": 2.37, + "grad_norm": 0.7174545526504517, + "learning_rate": 6.308425076380925e-05, + "loss": 2.6023, + "step": 48346 + }, + { + "epoch": 2.37, + "grad_norm": 0.7263779044151306, + "learning_rate": 6.307480599745225e-05, + "loss": 2.9191, + "step": 48347 + }, + { + "epoch": 2.37, + "grad_norm": 0.7491353154182434, + "learning_rate": 6.306536185511053e-05, + "loss": 2.9063, + "step": 48348 + }, + { + "epoch": 2.37, + "grad_norm": 0.691290557384491, + "learning_rate": 6.30559183368091e-05, + "loss": 3.0822, + "step": 48349 + }, + { + "epoch": 2.37, + "grad_norm": 0.776136577129364, + "learning_rate": 6.304647544257273e-05, + "loss": 2.7441, + "step": 48350 + }, + { + "epoch": 2.37, + "grad_norm": 0.7215666174888611, + "learning_rate": 6.30370331724264e-05, + "loss": 2.9793, + "step": 48351 + }, + { + "epoch": 2.37, + "grad_norm": 0.7477503418922424, + "learning_rate": 6.302759152639496e-05, + "loss": 3.0098, + "step": 48352 + }, + { + "epoch": 2.37, + "grad_norm": 0.7336341142654419, + "learning_rate": 6.301815050450313e-05, + "loss": 2.9931, + "step": 48353 + }, + { + "epoch": 2.37, + "grad_norm": 0.8228724598884583, + "learning_rate": 6.300871010677596e-05, + "loss": 2.9006, + "step": 48354 + }, + { + "epoch": 2.37, + "grad_norm": 0.7225062847137451, + "learning_rate": 6.299927033323817e-05, + "loss": 3.0748, + "step": 48355 + }, + { + "epoch": 2.37, + "grad_norm": 0.7133581638336182, + "learning_rate": 6.298983118391467e-05, + "loss": 2.8641, + "step": 48356 + }, + { + "epoch": 2.37, + "grad_norm": 0.7188670635223389, + "learning_rate": 6.298039265883044e-05, + "loss": 2.9304, + "step": 48357 + }, + { + "epoch": 2.37, + "grad_norm": 0.6973915100097656, + "learning_rate": 6.297095475801014e-05, + "loss": 3.0409, + "step": 48358 + }, + { + "epoch": 2.37, + "grad_norm": 0.7095557451248169, + "learning_rate": 6.296151748147883e-05, + "loss": 3.026, + "step": 48359 + }, + { + "epoch": 2.37, + "grad_norm": 0.75076824426651, + "learning_rate": 6.295208082926126e-05, + "loss": 3.1059, + "step": 48360 + }, + { + "epoch": 2.37, + "grad_norm": 0.7055838704109192, + "learning_rate": 6.294264480138222e-05, + "loss": 2.9436, + "step": 48361 + }, + { + "epoch": 2.37, + "grad_norm": 0.7464839220046997, + "learning_rate": 6.29332093978667e-05, + "loss": 3.007, + "step": 48362 + }, + { + "epoch": 2.37, + "grad_norm": 0.715093731880188, + "learning_rate": 6.29237746187394e-05, + "loss": 2.8428, + "step": 48363 + }, + { + "epoch": 2.37, + "grad_norm": 0.7014065384864807, + "learning_rate": 6.291434046402533e-05, + "loss": 3.0259, + "step": 48364 + }, + { + "epoch": 2.37, + "grad_norm": 0.7219024896621704, + "learning_rate": 6.290490693374915e-05, + "loss": 3.1076, + "step": 48365 + }, + { + "epoch": 2.37, + "grad_norm": 0.7014488577842712, + "learning_rate": 6.289547402793594e-05, + "loss": 2.9713, + "step": 48366 + }, + { + "epoch": 2.37, + "grad_norm": 0.703105092048645, + "learning_rate": 6.288604174661041e-05, + "loss": 2.9675, + "step": 48367 + }, + { + "epoch": 2.37, + "grad_norm": 0.7211623191833496, + "learning_rate": 6.287661008979732e-05, + "loss": 2.8565, + "step": 48368 + }, + { + "epoch": 2.37, + "grad_norm": 0.7292293310165405, + "learning_rate": 6.286717905752169e-05, + "loss": 3.0596, + "step": 48369 + }, + { + "epoch": 2.37, + "grad_norm": 0.7422485947608948, + "learning_rate": 6.285774864980819e-05, + "loss": 2.8302, + "step": 48370 + }, + { + "epoch": 2.37, + "grad_norm": 0.7382919192314148, + "learning_rate": 6.284831886668175e-05, + "loss": 2.8995, + "step": 48371 + }, + { + "epoch": 2.37, + "grad_norm": 0.7146931886672974, + "learning_rate": 6.283888970816726e-05, + "loss": 3.0275, + "step": 48372 + }, + { + "epoch": 2.37, + "grad_norm": 0.6943666934967041, + "learning_rate": 6.282946117428942e-05, + "loss": 2.9928, + "step": 48373 + }, + { + "epoch": 2.37, + "grad_norm": 0.730255126953125, + "learning_rate": 6.282003326507323e-05, + "loss": 2.9221, + "step": 48374 + }, + { + "epoch": 2.37, + "grad_norm": 0.6991233825683594, + "learning_rate": 6.281060598054342e-05, + "loss": 2.6633, + "step": 48375 + }, + { + "epoch": 2.37, + "grad_norm": 0.7602810859680176, + "learning_rate": 6.280117932072474e-05, + "loss": 2.8916, + "step": 48376 + }, + { + "epoch": 2.37, + "grad_norm": 0.6797572374343872, + "learning_rate": 6.279175328564217e-05, + "loss": 2.6735, + "step": 48377 + }, + { + "epoch": 2.37, + "grad_norm": 0.8254128694534302, + "learning_rate": 6.278232787532041e-05, + "loss": 2.8041, + "step": 48378 + }, + { + "epoch": 2.37, + "grad_norm": 0.701127827167511, + "learning_rate": 6.277290308978445e-05, + "loss": 2.8629, + "step": 48379 + }, + { + "epoch": 2.37, + "grad_norm": 0.7156503796577454, + "learning_rate": 6.276347892905887e-05, + "loss": 2.8143, + "step": 48380 + }, + { + "epoch": 2.37, + "grad_norm": 0.7461564540863037, + "learning_rate": 6.275405539316876e-05, + "loss": 3.0213, + "step": 48381 + }, + { + "epoch": 2.37, + "grad_norm": 0.7387633323669434, + "learning_rate": 6.274463248213877e-05, + "loss": 3.003, + "step": 48382 + }, + { + "epoch": 2.37, + "grad_norm": 0.7231044173240662, + "learning_rate": 6.27352101959937e-05, + "loss": 2.9757, + "step": 48383 + }, + { + "epoch": 2.37, + "grad_norm": 0.7560230493545532, + "learning_rate": 6.27257885347585e-05, + "loss": 2.792, + "step": 48384 + }, + { + "epoch": 2.37, + "grad_norm": 0.7416335344314575, + "learning_rate": 6.271636749845783e-05, + "loss": 2.8567, + "step": 48385 + }, + { + "epoch": 2.37, + "grad_norm": 0.7156395316123962, + "learning_rate": 6.270694708711656e-05, + "loss": 2.8991, + "step": 48386 + }, + { + "epoch": 2.37, + "grad_norm": 0.7483652830123901, + "learning_rate": 6.269752730075962e-05, + "loss": 2.8864, + "step": 48387 + }, + { + "epoch": 2.37, + "grad_norm": 0.7325050234794617, + "learning_rate": 6.268810813941173e-05, + "loss": 2.7133, + "step": 48388 + }, + { + "epoch": 2.37, + "grad_norm": 0.7214424014091492, + "learning_rate": 6.26786896030977e-05, + "loss": 2.7828, + "step": 48389 + }, + { + "epoch": 2.37, + "grad_norm": 0.7492168545722961, + "learning_rate": 6.266927169184223e-05, + "loss": 2.9989, + "step": 48390 + }, + { + "epoch": 2.37, + "grad_norm": 0.7547978758811951, + "learning_rate": 6.265985440567029e-05, + "loss": 2.6112, + "step": 48391 + }, + { + "epoch": 2.37, + "grad_norm": 0.6953510642051697, + "learning_rate": 6.265043774460659e-05, + "loss": 3.0589, + "step": 48392 + }, + { + "epoch": 2.37, + "grad_norm": 0.7603529095649719, + "learning_rate": 6.264102170867588e-05, + "loss": 2.8438, + "step": 48393 + }, + { + "epoch": 2.37, + "grad_norm": 0.6982311606407166, + "learning_rate": 6.263160629790318e-05, + "loss": 2.8759, + "step": 48394 + }, + { + "epoch": 2.37, + "grad_norm": 0.6933879852294922, + "learning_rate": 6.262219151231305e-05, + "loss": 3.0471, + "step": 48395 + }, + { + "epoch": 2.37, + "grad_norm": 0.7062197923660278, + "learning_rate": 6.261277735193047e-05, + "loss": 2.9002, + "step": 48396 + }, + { + "epoch": 2.37, + "grad_norm": 0.7573047280311584, + "learning_rate": 6.260336381678011e-05, + "loss": 2.892, + "step": 48397 + }, + { + "epoch": 2.37, + "grad_norm": 0.6713040471076965, + "learning_rate": 6.259395090688678e-05, + "loss": 2.9715, + "step": 48398 + }, + { + "epoch": 2.37, + "grad_norm": 0.7490809559822083, + "learning_rate": 6.258453862227534e-05, + "loss": 2.9012, + "step": 48399 + }, + { + "epoch": 2.37, + "grad_norm": 0.7417430877685547, + "learning_rate": 6.257512696297047e-05, + "loss": 2.9564, + "step": 48400 + }, + { + "epoch": 2.37, + "grad_norm": 0.7003105878829956, + "learning_rate": 6.256571592899708e-05, + "loss": 2.9267, + "step": 48401 + }, + { + "epoch": 2.37, + "grad_norm": 0.7170599102973938, + "learning_rate": 6.255630552037983e-05, + "loss": 2.8603, + "step": 48402 + }, + { + "epoch": 2.37, + "grad_norm": 0.7321606874465942, + "learning_rate": 6.254689573714363e-05, + "loss": 3.1191, + "step": 48403 + }, + { + "epoch": 2.37, + "grad_norm": 0.7242743372917175, + "learning_rate": 6.253748657931326e-05, + "loss": 2.9785, + "step": 48404 + }, + { + "epoch": 2.37, + "grad_norm": 0.7323653101921082, + "learning_rate": 6.252807804691333e-05, + "loss": 2.8328, + "step": 48405 + }, + { + "epoch": 2.37, + "grad_norm": 0.7143465280532837, + "learning_rate": 6.251867013996884e-05, + "loss": 2.9296, + "step": 48406 + }, + { + "epoch": 2.37, + "grad_norm": 0.7798712849617004, + "learning_rate": 6.250926285850437e-05, + "loss": 3.0471, + "step": 48407 + }, + { + "epoch": 2.37, + "grad_norm": 0.7423224449157715, + "learning_rate": 6.249985620254479e-05, + "loss": 2.9293, + "step": 48408 + }, + { + "epoch": 2.37, + "grad_norm": 0.7109093070030212, + "learning_rate": 6.249045017211499e-05, + "loss": 2.9326, + "step": 48409 + }, + { + "epoch": 2.37, + "grad_norm": 0.7001209259033203, + "learning_rate": 6.248104476723963e-05, + "loss": 2.6705, + "step": 48410 + }, + { + "epoch": 2.37, + "grad_norm": 0.7019423842430115, + "learning_rate": 6.247163998794349e-05, + "loss": 2.7414, + "step": 48411 + }, + { + "epoch": 2.37, + "grad_norm": 0.7206318378448486, + "learning_rate": 6.246223583425122e-05, + "loss": 2.8257, + "step": 48412 + }, + { + "epoch": 2.37, + "grad_norm": 0.7256973385810852, + "learning_rate": 6.245283230618773e-05, + "loss": 2.7982, + "step": 48413 + }, + { + "epoch": 2.37, + "grad_norm": 0.7365603446960449, + "learning_rate": 6.244342940377787e-05, + "loss": 2.8841, + "step": 48414 + }, + { + "epoch": 2.37, + "grad_norm": 0.7689672708511353, + "learning_rate": 6.243402712704618e-05, + "loss": 2.956, + "step": 48415 + }, + { + "epoch": 2.37, + "grad_norm": 0.7122086882591248, + "learning_rate": 6.242462547601762e-05, + "loss": 2.8686, + "step": 48416 + }, + { + "epoch": 2.37, + "grad_norm": 0.7387104034423828, + "learning_rate": 6.241522445071683e-05, + "loss": 3.0019, + "step": 48417 + }, + { + "epoch": 2.37, + "grad_norm": 0.7708308100700378, + "learning_rate": 6.240582405116865e-05, + "loss": 2.8456, + "step": 48418 + }, + { + "epoch": 2.37, + "grad_norm": 0.7489545345306396, + "learning_rate": 6.239642427739783e-05, + "loss": 2.8216, + "step": 48419 + }, + { + "epoch": 2.37, + "grad_norm": 0.7343196272850037, + "learning_rate": 6.238702512942904e-05, + "loss": 2.7546, + "step": 48420 + }, + { + "epoch": 2.37, + "grad_norm": 0.7246786952018738, + "learning_rate": 6.237762660728712e-05, + "loss": 3.0248, + "step": 48421 + }, + { + "epoch": 2.37, + "grad_norm": 0.7109408378601074, + "learning_rate": 6.236822871099678e-05, + "loss": 2.9897, + "step": 48422 + }, + { + "epoch": 2.37, + "grad_norm": 0.7401118278503418, + "learning_rate": 6.235883144058275e-05, + "loss": 2.7363, + "step": 48423 + }, + { + "epoch": 2.37, + "grad_norm": 0.6865191459655762, + "learning_rate": 6.234943479606993e-05, + "loss": 2.7305, + "step": 48424 + }, + { + "epoch": 2.37, + "grad_norm": 0.7721907496452332, + "learning_rate": 6.234003877748295e-05, + "loss": 2.9704, + "step": 48425 + }, + { + "epoch": 2.37, + "grad_norm": 0.728862464427948, + "learning_rate": 6.233064338484656e-05, + "loss": 2.7578, + "step": 48426 + }, + { + "epoch": 2.37, + "grad_norm": 0.7266650199890137, + "learning_rate": 6.232124861818543e-05, + "loss": 2.904, + "step": 48427 + }, + { + "epoch": 2.37, + "grad_norm": 0.698026716709137, + "learning_rate": 6.231185447752444e-05, + "loss": 3.1348, + "step": 48428 + }, + { + "epoch": 2.37, + "grad_norm": 0.7453986406326294, + "learning_rate": 6.23024609628883e-05, + "loss": 2.982, + "step": 48429 + }, + { + "epoch": 2.37, + "grad_norm": 0.7807963490486145, + "learning_rate": 6.229306807430168e-05, + "loss": 3.0589, + "step": 48430 + }, + { + "epoch": 2.37, + "grad_norm": 0.7143594622612, + "learning_rate": 6.228367581178945e-05, + "loss": 2.7484, + "step": 48431 + }, + { + "epoch": 2.37, + "grad_norm": 0.7836489081382751, + "learning_rate": 6.227428417537629e-05, + "loss": 2.9194, + "step": 48432 + }, + { + "epoch": 2.37, + "grad_norm": 0.702145516872406, + "learning_rate": 6.226489316508678e-05, + "loss": 2.9083, + "step": 48433 + }, + { + "epoch": 2.37, + "grad_norm": 0.8490208387374878, + "learning_rate": 6.225550278094591e-05, + "loss": 2.8837, + "step": 48434 + }, + { + "epoch": 2.37, + "grad_norm": 0.6956592798233032, + "learning_rate": 6.224611302297821e-05, + "loss": 3.056, + "step": 48435 + }, + { + "epoch": 2.37, + "grad_norm": 0.7090912461280823, + "learning_rate": 6.223672389120855e-05, + "loss": 2.9451, + "step": 48436 + }, + { + "epoch": 2.37, + "grad_norm": 0.7443285584449768, + "learning_rate": 6.222733538566153e-05, + "loss": 2.851, + "step": 48437 + }, + { + "epoch": 2.37, + "grad_norm": 0.7666028738021851, + "learning_rate": 6.221794750636195e-05, + "loss": 2.8243, + "step": 48438 + }, + { + "epoch": 2.37, + "grad_norm": 0.7153770923614502, + "learning_rate": 6.220856025333462e-05, + "loss": 2.7963, + "step": 48439 + }, + { + "epoch": 2.37, + "grad_norm": 0.7110601663589478, + "learning_rate": 6.219917362660419e-05, + "loss": 2.9492, + "step": 48440 + }, + { + "epoch": 2.37, + "grad_norm": 0.7466623187065125, + "learning_rate": 6.218978762619533e-05, + "loss": 2.8231, + "step": 48441 + }, + { + "epoch": 2.37, + "grad_norm": 0.746893584728241, + "learning_rate": 6.218040225213277e-05, + "loss": 2.8501, + "step": 48442 + }, + { + "epoch": 2.37, + "grad_norm": 0.7358802556991577, + "learning_rate": 6.217101750444124e-05, + "loss": 2.9031, + "step": 48443 + }, + { + "epoch": 2.37, + "grad_norm": 0.6966966986656189, + "learning_rate": 6.216163338314555e-05, + "loss": 3.0072, + "step": 48444 + }, + { + "epoch": 2.37, + "grad_norm": 0.7609187960624695, + "learning_rate": 6.215224988827027e-05, + "loss": 2.7928, + "step": 48445 + }, + { + "epoch": 2.37, + "grad_norm": 0.6976915001869202, + "learning_rate": 6.214286701984028e-05, + "loss": 3.0634, + "step": 48446 + }, + { + "epoch": 2.37, + "grad_norm": 0.7734736204147339, + "learning_rate": 6.213348477788017e-05, + "loss": 3.026, + "step": 48447 + }, + { + "epoch": 2.37, + "grad_norm": 0.7737905383110046, + "learning_rate": 6.212410316241465e-05, + "loss": 2.9261, + "step": 48448 + }, + { + "epoch": 2.37, + "grad_norm": 0.7344973683357239, + "learning_rate": 6.21147221734685e-05, + "loss": 2.6841, + "step": 48449 + }, + { + "epoch": 2.37, + "grad_norm": 0.687893807888031, + "learning_rate": 6.210534181106634e-05, + "loss": 2.7757, + "step": 48450 + }, + { + "epoch": 2.37, + "grad_norm": 0.7061976194381714, + "learning_rate": 6.209596207523299e-05, + "loss": 3.0322, + "step": 48451 + }, + { + "epoch": 2.37, + "grad_norm": 0.7552989721298218, + "learning_rate": 6.2086582965993e-05, + "loss": 2.8587, + "step": 48452 + }, + { + "epoch": 2.37, + "grad_norm": 0.7478323578834534, + "learning_rate": 6.207720448337127e-05, + "loss": 2.93, + "step": 48453 + }, + { + "epoch": 2.37, + "grad_norm": 0.7599266767501831, + "learning_rate": 6.206782662739236e-05, + "loss": 2.9545, + "step": 48454 + }, + { + "epoch": 2.37, + "grad_norm": 0.752623975276947, + "learning_rate": 6.205844939808096e-05, + "loss": 2.79, + "step": 48455 + }, + { + "epoch": 2.37, + "grad_norm": 0.7089788913726807, + "learning_rate": 6.204907279546188e-05, + "loss": 2.9757, + "step": 48456 + }, + { + "epoch": 2.37, + "grad_norm": 0.7725157737731934, + "learning_rate": 6.203969681955967e-05, + "loss": 3.0366, + "step": 48457 + }, + { + "epoch": 2.37, + "grad_norm": 0.6831510663032532, + "learning_rate": 6.203032147039909e-05, + "loss": 3.068, + "step": 48458 + }, + { + "epoch": 2.37, + "grad_norm": 0.7839120626449585, + "learning_rate": 6.202094674800492e-05, + "loss": 2.8344, + "step": 48459 + }, + { + "epoch": 2.37, + "grad_norm": 0.7446629405021667, + "learning_rate": 6.20115726524017e-05, + "loss": 2.8361, + "step": 48460 + }, + { + "epoch": 2.37, + "grad_norm": 0.7064789533615112, + "learning_rate": 6.200219918361431e-05, + "loss": 2.8209, + "step": 48461 + }, + { + "epoch": 2.38, + "grad_norm": 0.7748939394950867, + "learning_rate": 6.199282634166726e-05, + "loss": 2.9435, + "step": 48462 + }, + { + "epoch": 2.38, + "grad_norm": 0.7227396368980408, + "learning_rate": 6.198345412658529e-05, + "loss": 2.6958, + "step": 48463 + }, + { + "epoch": 2.38, + "grad_norm": 0.705480694770813, + "learning_rate": 6.197408253839314e-05, + "loss": 2.9662, + "step": 48464 + }, + { + "epoch": 2.38, + "grad_norm": 0.704855740070343, + "learning_rate": 6.196471157711535e-05, + "loss": 3.0984, + "step": 48465 + }, + { + "epoch": 2.38, + "grad_norm": 0.8089683651924133, + "learning_rate": 6.19553412427768e-05, + "loss": 2.8041, + "step": 48466 + }, + { + "epoch": 2.38, + "grad_norm": 0.7625295519828796, + "learning_rate": 6.194597153540199e-05, + "loss": 2.6505, + "step": 48467 + }, + { + "epoch": 2.38, + "grad_norm": 0.7611942887306213, + "learning_rate": 6.193660245501575e-05, + "loss": 2.988, + "step": 48468 + }, + { + "epoch": 2.38, + "grad_norm": 0.7407697439193726, + "learning_rate": 6.192723400164268e-05, + "loss": 2.9857, + "step": 48469 + }, + { + "epoch": 2.38, + "grad_norm": 0.7640385627746582, + "learning_rate": 6.191786617530739e-05, + "loss": 2.7547, + "step": 48470 + }, + { + "epoch": 2.38, + "grad_norm": 0.7157742977142334, + "learning_rate": 6.190849897603472e-05, + "loss": 2.9381, + "step": 48471 + }, + { + "epoch": 2.38, + "grad_norm": 0.7664361596107483, + "learning_rate": 6.189913240384911e-05, + "loss": 2.8453, + "step": 48472 + }, + { + "epoch": 2.38, + "grad_norm": 0.7323439121246338, + "learning_rate": 6.188976645877548e-05, + "loss": 2.9364, + "step": 48473 + }, + { + "epoch": 2.38, + "grad_norm": 0.7110058069229126, + "learning_rate": 6.188040114083829e-05, + "loss": 2.8488, + "step": 48474 + }, + { + "epoch": 2.38, + "grad_norm": 0.7112416625022888, + "learning_rate": 6.187103645006228e-05, + "loss": 2.9664, + "step": 48475 + }, + { + "epoch": 2.38, + "grad_norm": 0.7495659589767456, + "learning_rate": 6.186167238647232e-05, + "loss": 2.6364, + "step": 48476 + }, + { + "epoch": 2.38, + "grad_norm": 0.7033055424690247, + "learning_rate": 6.18523089500927e-05, + "loss": 2.9183, + "step": 48477 + }, + { + "epoch": 2.38, + "grad_norm": 0.6921057105064392, + "learning_rate": 6.184294614094835e-05, + "loss": 2.8459, + "step": 48478 + }, + { + "epoch": 2.38, + "grad_norm": 0.7195352911949158, + "learning_rate": 6.183358395906376e-05, + "loss": 2.8092, + "step": 48479 + }, + { + "epoch": 2.38, + "grad_norm": 0.6981593370437622, + "learning_rate": 6.182422240446372e-05, + "loss": 2.9155, + "step": 48480 + }, + { + "epoch": 2.38, + "grad_norm": 0.715376079082489, + "learning_rate": 6.181486147717287e-05, + "loss": 2.7584, + "step": 48481 + }, + { + "epoch": 2.38, + "grad_norm": 0.7249025106430054, + "learning_rate": 6.180550117721577e-05, + "loss": 2.8825, + "step": 48482 + }, + { + "epoch": 2.38, + "grad_norm": 0.7154632806777954, + "learning_rate": 6.179614150461722e-05, + "loss": 2.695, + "step": 48483 + }, + { + "epoch": 2.38, + "grad_norm": 0.7087334394454956, + "learning_rate": 6.178678245940179e-05, + "loss": 2.7371, + "step": 48484 + }, + { + "epoch": 2.38, + "grad_norm": 0.7212601900100708, + "learning_rate": 6.177742404159404e-05, + "loss": 2.6432, + "step": 48485 + }, + { + "epoch": 2.38, + "grad_norm": 0.7712526321411133, + "learning_rate": 6.176806625121882e-05, + "loss": 2.7397, + "step": 48486 + }, + { + "epoch": 2.38, + "grad_norm": 0.7613638639450073, + "learning_rate": 6.175870908830056e-05, + "loss": 2.9963, + "step": 48487 + }, + { + "epoch": 2.38, + "grad_norm": 0.7476872205734253, + "learning_rate": 6.174935255286408e-05, + "loss": 2.7339, + "step": 48488 + }, + { + "epoch": 2.38, + "grad_norm": 0.7133546471595764, + "learning_rate": 6.17399966449339e-05, + "loss": 2.8263, + "step": 48489 + }, + { + "epoch": 2.38, + "grad_norm": 0.736723005771637, + "learning_rate": 6.173064136453477e-05, + "loss": 2.7688, + "step": 48490 + }, + { + "epoch": 2.38, + "grad_norm": 0.7491676807403564, + "learning_rate": 6.172128671169128e-05, + "loss": 3.0156, + "step": 48491 + }, + { + "epoch": 2.38, + "grad_norm": 0.7248450517654419, + "learning_rate": 6.171193268642799e-05, + "loss": 2.8164, + "step": 48492 + }, + { + "epoch": 2.38, + "grad_norm": 0.7055536508560181, + "learning_rate": 6.170257928876969e-05, + "loss": 2.9714, + "step": 48493 + }, + { + "epoch": 2.38, + "grad_norm": 0.7106114029884338, + "learning_rate": 6.169322651874085e-05, + "loss": 3.0877, + "step": 48494 + }, + { + "epoch": 2.38, + "grad_norm": 0.7218091487884521, + "learning_rate": 6.168387437636619e-05, + "loss": 2.9154, + "step": 48495 + }, + { + "epoch": 2.38, + "grad_norm": 0.7130762338638306, + "learning_rate": 6.167452286167042e-05, + "loss": 2.9306, + "step": 48496 + }, + { + "epoch": 2.38, + "grad_norm": 0.7284539937973022, + "learning_rate": 6.1665171974678e-05, + "loss": 2.9702, + "step": 48497 + }, + { + "epoch": 2.38, + "grad_norm": 0.736548125743866, + "learning_rate": 6.165582171541375e-05, + "loss": 2.7348, + "step": 48498 + }, + { + "epoch": 2.38, + "grad_norm": 0.7026635408401489, + "learning_rate": 6.164647208390217e-05, + "loss": 3.0233, + "step": 48499 + }, + { + "epoch": 2.38, + "grad_norm": 0.7796565890312195, + "learning_rate": 6.163712308016788e-05, + "loss": 2.9025, + "step": 48500 + }, + { + "epoch": 2.38, + "grad_norm": 0.8084689378738403, + "learning_rate": 6.162777470423555e-05, + "loss": 2.9667, + "step": 48501 + }, + { + "epoch": 2.38, + "grad_norm": 0.7033407092094421, + "learning_rate": 6.161842695612974e-05, + "loss": 2.8283, + "step": 48502 + }, + { + "epoch": 2.38, + "grad_norm": 0.7982221841812134, + "learning_rate": 6.160907983587521e-05, + "loss": 3.0055, + "step": 48503 + }, + { + "epoch": 2.38, + "grad_norm": 0.7438850402832031, + "learning_rate": 6.159973334349637e-05, + "loss": 2.9091, + "step": 48504 + }, + { + "epoch": 2.38, + "grad_norm": 0.7206811904907227, + "learning_rate": 6.159038747901804e-05, + "loss": 2.7498, + "step": 48505 + }, + { + "epoch": 2.38, + "grad_norm": 0.6850907206535339, + "learning_rate": 6.158104224246477e-05, + "loss": 2.9421, + "step": 48506 + }, + { + "epoch": 2.38, + "grad_norm": 0.7406187057495117, + "learning_rate": 6.157169763386102e-05, + "loss": 2.954, + "step": 48507 + }, + { + "epoch": 2.38, + "grad_norm": 0.728126585483551, + "learning_rate": 6.156235365323165e-05, + "loss": 2.7771, + "step": 48508 + }, + { + "epoch": 2.38, + "grad_norm": 0.6951186656951904, + "learning_rate": 6.15530103006011e-05, + "loss": 2.8559, + "step": 48509 + }, + { + "epoch": 2.38, + "grad_norm": 0.7429779767990112, + "learning_rate": 6.154366757599399e-05, + "loss": 2.92, + "step": 48510 + }, + { + "epoch": 2.38, + "grad_norm": 0.6777859330177307, + "learning_rate": 6.153432547943504e-05, + "loss": 2.9665, + "step": 48511 + }, + { + "epoch": 2.38, + "grad_norm": 0.7383310794830322, + "learning_rate": 6.15249840109488e-05, + "loss": 2.7535, + "step": 48512 + }, + { + "epoch": 2.38, + "grad_norm": 0.7504093050956726, + "learning_rate": 6.151564317055986e-05, + "loss": 2.9476, + "step": 48513 + }, + { + "epoch": 2.38, + "grad_norm": 0.6900618672370911, + "learning_rate": 6.150630295829275e-05, + "loss": 2.7624, + "step": 48514 + }, + { + "epoch": 2.38, + "grad_norm": 0.7341283559799194, + "learning_rate": 6.149696337417213e-05, + "loss": 2.8167, + "step": 48515 + }, + { + "epoch": 2.38, + "grad_norm": 0.7598667144775391, + "learning_rate": 6.148762441822266e-05, + "loss": 2.7693, + "step": 48516 + }, + { + "epoch": 2.38, + "grad_norm": 0.7878901362419128, + "learning_rate": 6.147828609046882e-05, + "loss": 2.7556, + "step": 48517 + }, + { + "epoch": 2.38, + "grad_norm": 0.7257072329521179, + "learning_rate": 6.146894839093535e-05, + "loss": 3.006, + "step": 48518 + }, + { + "epoch": 2.38, + "grad_norm": 0.7379465103149414, + "learning_rate": 6.145961131964671e-05, + "loss": 3.0419, + "step": 48519 + }, + { + "epoch": 2.38, + "grad_norm": 0.7503682374954224, + "learning_rate": 6.145027487662762e-05, + "loss": 2.8252, + "step": 48520 + }, + { + "epoch": 2.38, + "grad_norm": 0.7141533493995667, + "learning_rate": 6.144093906190256e-05, + "loss": 2.9226, + "step": 48521 + }, + { + "epoch": 2.38, + "grad_norm": 0.7183959484100342, + "learning_rate": 6.14316038754961e-05, + "loss": 2.8882, + "step": 48522 + }, + { + "epoch": 2.38, + "grad_norm": 0.7660282254219055, + "learning_rate": 6.142226931743296e-05, + "loss": 2.8312, + "step": 48523 + }, + { + "epoch": 2.38, + "grad_norm": 0.7023894190788269, + "learning_rate": 6.141293538773758e-05, + "loss": 2.787, + "step": 48524 + }, + { + "epoch": 2.38, + "grad_norm": 0.7554915547370911, + "learning_rate": 6.140360208643459e-05, + "loss": 2.9541, + "step": 48525 + }, + { + "epoch": 2.38, + "grad_norm": 0.6967014670372009, + "learning_rate": 6.139426941354866e-05, + "loss": 2.9179, + "step": 48526 + }, + { + "epoch": 2.38, + "grad_norm": 0.7923035025596619, + "learning_rate": 6.138493736910433e-05, + "loss": 3.1075, + "step": 48527 + }, + { + "epoch": 2.38, + "grad_norm": 0.7242387533187866, + "learning_rate": 6.137560595312614e-05, + "loss": 2.7576, + "step": 48528 + }, + { + "epoch": 2.38, + "grad_norm": 0.7367933392524719, + "learning_rate": 6.136627516563861e-05, + "loss": 2.8662, + "step": 48529 + }, + { + "epoch": 2.38, + "grad_norm": 0.7342579364776611, + "learning_rate": 6.135694500666638e-05, + "loss": 2.8216, + "step": 48530 + }, + { + "epoch": 2.38, + "grad_norm": 0.7151744365692139, + "learning_rate": 6.134761547623413e-05, + "loss": 2.8982, + "step": 48531 + }, + { + "epoch": 2.38, + "grad_norm": 0.7614980340003967, + "learning_rate": 6.133828657436622e-05, + "loss": 2.969, + "step": 48532 + }, + { + "epoch": 2.38, + "grad_norm": 0.7628613114356995, + "learning_rate": 6.132895830108744e-05, + "loss": 2.946, + "step": 48533 + }, + { + "epoch": 2.38, + "grad_norm": 0.7301518321037292, + "learning_rate": 6.131963065642221e-05, + "loss": 2.8227, + "step": 48534 + }, + { + "epoch": 2.38, + "grad_norm": 0.7378628253936768, + "learning_rate": 6.13103036403951e-05, + "loss": 3.0836, + "step": 48535 + }, + { + "epoch": 2.38, + "grad_norm": 0.7157490253448486, + "learning_rate": 6.130097725303078e-05, + "loss": 2.8048, + "step": 48536 + }, + { + "epoch": 2.38, + "grad_norm": 0.7131938338279724, + "learning_rate": 6.129165149435364e-05, + "loss": 3.134, + "step": 48537 + }, + { + "epoch": 2.38, + "grad_norm": 0.7120147943496704, + "learning_rate": 6.128232636438846e-05, + "loss": 2.9152, + "step": 48538 + }, + { + "epoch": 2.38, + "grad_norm": 0.8592355847358704, + "learning_rate": 6.127300186315962e-05, + "loss": 2.9719, + "step": 48539 + }, + { + "epoch": 2.38, + "grad_norm": 0.7270697951316833, + "learning_rate": 6.126367799069172e-05, + "loss": 2.8289, + "step": 48540 + }, + { + "epoch": 2.38, + "grad_norm": 0.7252556681632996, + "learning_rate": 6.125435474700944e-05, + "loss": 2.7769, + "step": 48541 + }, + { + "epoch": 2.38, + "grad_norm": 0.8012216687202454, + "learning_rate": 6.124503213213723e-05, + "loss": 2.9533, + "step": 48542 + }, + { + "epoch": 2.38, + "grad_norm": 0.7498360872268677, + "learning_rate": 6.123571014609967e-05, + "loss": 2.8852, + "step": 48543 + }, + { + "epoch": 2.38, + "grad_norm": 0.7274046540260315, + "learning_rate": 6.122638878892121e-05, + "loss": 3.0165, + "step": 48544 + }, + { + "epoch": 2.38, + "grad_norm": 0.713111400604248, + "learning_rate": 6.12170680606266e-05, + "loss": 2.8686, + "step": 48545 + }, + { + "epoch": 2.38, + "grad_norm": 0.7824342250823975, + "learning_rate": 6.120774796124017e-05, + "loss": 2.973, + "step": 48546 + }, + { + "epoch": 2.38, + "grad_norm": 0.7419819235801697, + "learning_rate": 6.119842849078656e-05, + "loss": 2.9885, + "step": 48547 + }, + { + "epoch": 2.38, + "grad_norm": 0.7952620983123779, + "learning_rate": 6.118910964929042e-05, + "loss": 2.8178, + "step": 48548 + }, + { + "epoch": 2.38, + "grad_norm": 0.7857388854026794, + "learning_rate": 6.117979143677623e-05, + "loss": 2.9158, + "step": 48549 + }, + { + "epoch": 2.38, + "grad_norm": 0.7016808986663818, + "learning_rate": 6.117047385326851e-05, + "loss": 3.0688, + "step": 48550 + }, + { + "epoch": 2.38, + "grad_norm": 0.7236056923866272, + "learning_rate": 6.116115689879168e-05, + "loss": 3.1444, + "step": 48551 + }, + { + "epoch": 2.38, + "grad_norm": 0.7185097336769104, + "learning_rate": 6.11518405733704e-05, + "loss": 2.6668, + "step": 48552 + }, + { + "epoch": 2.38, + "grad_norm": 0.7458245754241943, + "learning_rate": 6.114252487702931e-05, + "loss": 2.8227, + "step": 48553 + }, + { + "epoch": 2.38, + "grad_norm": 0.7336084246635437, + "learning_rate": 6.113320980979275e-05, + "loss": 2.7382, + "step": 48554 + }, + { + "epoch": 2.38, + "grad_norm": 0.750745415687561, + "learning_rate": 6.112389537168544e-05, + "loss": 2.87, + "step": 48555 + }, + { + "epoch": 2.38, + "grad_norm": 0.7281879186630249, + "learning_rate": 6.11145815627318e-05, + "loss": 2.9631, + "step": 48556 + }, + { + "epoch": 2.38, + "grad_norm": 0.777651309967041, + "learning_rate": 6.110526838295631e-05, + "loss": 2.7747, + "step": 48557 + }, + { + "epoch": 2.38, + "grad_norm": 0.7897119522094727, + "learning_rate": 6.109595583238363e-05, + "loss": 2.9692, + "step": 48558 + }, + { + "epoch": 2.38, + "grad_norm": 0.7822614312171936, + "learning_rate": 6.108664391103815e-05, + "loss": 2.7721, + "step": 48559 + }, + { + "epoch": 2.38, + "grad_norm": 0.7523203492164612, + "learning_rate": 6.107733261894457e-05, + "loss": 2.9797, + "step": 48560 + }, + { + "epoch": 2.38, + "grad_norm": 0.747348964214325, + "learning_rate": 6.10680219561272e-05, + "loss": 3.1613, + "step": 48561 + }, + { + "epoch": 2.38, + "grad_norm": 0.7030299305915833, + "learning_rate": 6.105871192261068e-05, + "loss": 2.8633, + "step": 48562 + }, + { + "epoch": 2.38, + "grad_norm": 0.703755259513855, + "learning_rate": 6.104940251841966e-05, + "loss": 2.968, + "step": 48563 + }, + { + "epoch": 2.38, + "grad_norm": 0.7231656908988953, + "learning_rate": 6.104009374357846e-05, + "loss": 2.7655, + "step": 48564 + }, + { + "epoch": 2.38, + "grad_norm": 0.7645540833473206, + "learning_rate": 6.10307855981117e-05, + "loss": 3.0025, + "step": 48565 + }, + { + "epoch": 2.38, + "grad_norm": 0.7383845448493958, + "learning_rate": 6.1021478082043765e-05, + "loss": 2.9483, + "step": 48566 + }, + { + "epoch": 2.38, + "grad_norm": 0.7293666005134583, + "learning_rate": 6.101217119539927e-05, + "loss": 3.0159, + "step": 48567 + }, + { + "epoch": 2.38, + "grad_norm": 0.736205518245697, + "learning_rate": 6.100286493820281e-05, + "loss": 2.806, + "step": 48568 + }, + { + "epoch": 2.38, + "grad_norm": 0.7195806503295898, + "learning_rate": 6.099355931047872e-05, + "loss": 2.8859, + "step": 48569 + }, + { + "epoch": 2.38, + "grad_norm": 0.7205501198768616, + "learning_rate": 6.098425431225169e-05, + "loss": 2.6709, + "step": 48570 + }, + { + "epoch": 2.38, + "grad_norm": 0.7264590263366699, + "learning_rate": 6.097494994354612e-05, + "loss": 2.8121, + "step": 48571 + }, + { + "epoch": 2.38, + "grad_norm": 0.7377135753631592, + "learning_rate": 6.0965646204386445e-05, + "loss": 2.7602, + "step": 48572 + }, + { + "epoch": 2.38, + "grad_norm": 0.7292772531509399, + "learning_rate": 6.0956343094797355e-05, + "loss": 2.9107, + "step": 48573 + }, + { + "epoch": 2.38, + "grad_norm": 0.8081817626953125, + "learning_rate": 6.0947040614803144e-05, + "loss": 2.7616, + "step": 48574 + }, + { + "epoch": 2.38, + "grad_norm": 0.6972966194152832, + "learning_rate": 6.093773876442852e-05, + "loss": 2.8046, + "step": 48575 + }, + { + "epoch": 2.38, + "grad_norm": 0.6994813084602356, + "learning_rate": 6.09284375436978e-05, + "loss": 2.872, + "step": 48576 + }, + { + "epoch": 2.38, + "grad_norm": 0.7522056102752686, + "learning_rate": 6.091913695263557e-05, + "loss": 3.0694, + "step": 48577 + }, + { + "epoch": 2.38, + "grad_norm": 0.7025183439254761, + "learning_rate": 6.0909836991266506e-05, + "loss": 2.9151, + "step": 48578 + }, + { + "epoch": 2.38, + "grad_norm": 0.7216269373893738, + "learning_rate": 6.090053765961471e-05, + "loss": 2.8595, + "step": 48579 + }, + { + "epoch": 2.38, + "grad_norm": 0.7327861189842224, + "learning_rate": 6.089123895770498e-05, + "loss": 2.7038, + "step": 48580 + }, + { + "epoch": 2.38, + "grad_norm": 0.7535680532455444, + "learning_rate": 6.088194088556165e-05, + "loss": 2.866, + "step": 48581 + }, + { + "epoch": 2.38, + "grad_norm": 0.7392609119415283, + "learning_rate": 6.0872643443209246e-05, + "loss": 2.8061, + "step": 48582 + }, + { + "epoch": 2.38, + "grad_norm": 0.7233937382698059, + "learning_rate": 6.0863346630672336e-05, + "loss": 2.8853, + "step": 48583 + }, + { + "epoch": 2.38, + "grad_norm": 0.7154392600059509, + "learning_rate": 6.085405044797531e-05, + "loss": 3.0673, + "step": 48584 + }, + { + "epoch": 2.38, + "grad_norm": 0.7913565039634705, + "learning_rate": 6.084475489514274e-05, + "loss": 3.0256, + "step": 48585 + }, + { + "epoch": 2.38, + "grad_norm": 0.7132130861282349, + "learning_rate": 6.0835459972199086e-05, + "loss": 2.9848, + "step": 48586 + }, + { + "epoch": 2.38, + "grad_norm": 0.7644349932670593, + "learning_rate": 6.082616567916871e-05, + "loss": 2.9009, + "step": 48587 + }, + { + "epoch": 2.38, + "grad_norm": 0.7143843173980713, + "learning_rate": 6.081687201607628e-05, + "loss": 2.6485, + "step": 48588 + }, + { + "epoch": 2.38, + "grad_norm": 0.6977376341819763, + "learning_rate": 6.080757898294606e-05, + "loss": 2.8017, + "step": 48589 + }, + { + "epoch": 2.38, + "grad_norm": 0.7115483283996582, + "learning_rate": 6.0798286579802765e-05, + "loss": 2.9167, + "step": 48590 + }, + { + "epoch": 2.38, + "grad_norm": 0.7304779887199402, + "learning_rate": 6.0788994806670656e-05, + "loss": 2.9317, + "step": 48591 + }, + { + "epoch": 2.38, + "grad_norm": 0.7648419141769409, + "learning_rate": 6.0779703663574364e-05, + "loss": 3.0281, + "step": 48592 + }, + { + "epoch": 2.38, + "grad_norm": 0.7392321825027466, + "learning_rate": 6.077041315053828e-05, + "loss": 2.9541, + "step": 48593 + }, + { + "epoch": 2.38, + "grad_norm": 0.8053700923919678, + "learning_rate": 6.076112326758684e-05, + "loss": 3.0026, + "step": 48594 + }, + { + "epoch": 2.38, + "grad_norm": 0.7006090879440308, + "learning_rate": 6.075183401474464e-05, + "loss": 3.0143, + "step": 48595 + }, + { + "epoch": 2.38, + "grad_norm": 0.7133437395095825, + "learning_rate": 6.0742545392035945e-05, + "loss": 2.8935, + "step": 48596 + }, + { + "epoch": 2.38, + "grad_norm": 0.7549411654472351, + "learning_rate": 6.073325739948538e-05, + "loss": 2.9046, + "step": 48597 + }, + { + "epoch": 2.38, + "grad_norm": 0.6961774826049805, + "learning_rate": 6.072397003711742e-05, + "loss": 2.9923, + "step": 48598 + }, + { + "epoch": 2.38, + "grad_norm": 0.8232783675193787, + "learning_rate": 6.0714683304956415e-05, + "loss": 2.8209, + "step": 48599 + }, + { + "epoch": 2.38, + "grad_norm": 0.7344974875450134, + "learning_rate": 6.070539720302694e-05, + "loss": 2.8822, + "step": 48600 + }, + { + "epoch": 2.38, + "grad_norm": 0.7219202518463135, + "learning_rate": 6.0696111731353414e-05, + "loss": 2.8987, + "step": 48601 + }, + { + "epoch": 2.38, + "grad_norm": 0.7357993721961975, + "learning_rate": 6.0686826889960185e-05, + "loss": 2.9248, + "step": 48602 + }, + { + "epoch": 2.38, + "grad_norm": 0.7212379574775696, + "learning_rate": 6.0677542678871894e-05, + "loss": 2.7127, + "step": 48603 + }, + { + "epoch": 2.38, + "grad_norm": 0.7509791254997253, + "learning_rate": 6.066825909811279e-05, + "loss": 2.7341, + "step": 48604 + }, + { + "epoch": 2.38, + "grad_norm": 0.6835814714431763, + "learning_rate": 6.065897614770753e-05, + "loss": 2.8247, + "step": 48605 + }, + { + "epoch": 2.38, + "grad_norm": 0.7493883371353149, + "learning_rate": 6.0649693827680376e-05, + "loss": 2.8594, + "step": 48606 + }, + { + "epoch": 2.38, + "grad_norm": 0.7076842784881592, + "learning_rate": 6.064041213805596e-05, + "loss": 2.963, + "step": 48607 + }, + { + "epoch": 2.38, + "grad_norm": 0.7497107982635498, + "learning_rate": 6.063113107885862e-05, + "loss": 2.8824, + "step": 48608 + }, + { + "epoch": 2.38, + "grad_norm": 0.7138265371322632, + "learning_rate": 6.0621850650112735e-05, + "loss": 2.7468, + "step": 48609 + }, + { + "epoch": 2.38, + "grad_norm": 0.7438095211982727, + "learning_rate": 6.061257085184292e-05, + "loss": 2.7966, + "step": 48610 + }, + { + "epoch": 2.38, + "grad_norm": 0.7051562666893005, + "learning_rate": 6.060329168407343e-05, + "loss": 2.8157, + "step": 48611 + }, + { + "epoch": 2.38, + "grad_norm": 0.7326208353042603, + "learning_rate": 6.059401314682877e-05, + "loss": 2.7358, + "step": 48612 + }, + { + "epoch": 2.38, + "grad_norm": 0.7127295732498169, + "learning_rate": 6.058473524013352e-05, + "loss": 2.9006, + "step": 48613 + }, + { + "epoch": 2.38, + "grad_norm": 0.7416107654571533, + "learning_rate": 6.0575457964012e-05, + "loss": 2.9238, + "step": 48614 + }, + { + "epoch": 2.38, + "grad_norm": 0.7051231265068054, + "learning_rate": 6.0566181318488614e-05, + "loss": 2.9534, + "step": 48615 + }, + { + "epoch": 2.38, + "grad_norm": 0.6944036483764648, + "learning_rate": 6.0556905303587786e-05, + "loss": 2.7596, + "step": 48616 + }, + { + "epoch": 2.38, + "grad_norm": 0.7137592434883118, + "learning_rate": 6.054762991933396e-05, + "loss": 2.7521, + "step": 48617 + }, + { + "epoch": 2.38, + "grad_norm": 0.7165379524230957, + "learning_rate": 6.053835516575168e-05, + "loss": 2.7684, + "step": 48618 + }, + { + "epoch": 2.38, + "grad_norm": 0.7815676331520081, + "learning_rate": 6.052908104286524e-05, + "loss": 2.6846, + "step": 48619 + }, + { + "epoch": 2.38, + "grad_norm": 0.7360538840293884, + "learning_rate": 6.0519807550699164e-05, + "loss": 2.9516, + "step": 48620 + }, + { + "epoch": 2.38, + "grad_norm": 0.7272911667823792, + "learning_rate": 6.0510534689277746e-05, + "loss": 2.9062, + "step": 48621 + }, + { + "epoch": 2.38, + "grad_norm": 0.761579155921936, + "learning_rate": 6.050126245862557e-05, + "loss": 2.8845, + "step": 48622 + }, + { + "epoch": 2.38, + "grad_norm": 0.7632530331611633, + "learning_rate": 6.049199085876697e-05, + "loss": 2.9801, + "step": 48623 + }, + { + "epoch": 2.38, + "grad_norm": 0.8465824723243713, + "learning_rate": 6.04827198897263e-05, + "loss": 2.775, + "step": 48624 + }, + { + "epoch": 2.38, + "grad_norm": 0.7626355290412903, + "learning_rate": 6.047344955152812e-05, + "loss": 2.7857, + "step": 48625 + }, + { + "epoch": 2.38, + "grad_norm": 0.7368438839912415, + "learning_rate": 6.0464179844196693e-05, + "loss": 2.8184, + "step": 48626 + }, + { + "epoch": 2.38, + "grad_norm": 0.72593754529953, + "learning_rate": 6.04549107677566e-05, + "loss": 2.9202, + "step": 48627 + }, + { + "epoch": 2.38, + "grad_norm": 0.7318313717842102, + "learning_rate": 6.04456423222321e-05, + "loss": 2.9295, + "step": 48628 + }, + { + "epoch": 2.38, + "grad_norm": 0.7129902243614197, + "learning_rate": 6.043637450764777e-05, + "loss": 2.8795, + "step": 48629 + }, + { + "epoch": 2.38, + "grad_norm": 0.7956576943397522, + "learning_rate": 6.042710732402791e-05, + "loss": 2.9245, + "step": 48630 + }, + { + "epoch": 2.38, + "grad_norm": 0.7019463777542114, + "learning_rate": 6.0417840771396854e-05, + "loss": 2.6454, + "step": 48631 + }, + { + "epoch": 2.38, + "grad_norm": 0.7243613600730896, + "learning_rate": 6.04085748497792e-05, + "loss": 2.916, + "step": 48632 + }, + { + "epoch": 2.38, + "grad_norm": 0.6990789771080017, + "learning_rate": 6.039930955919914e-05, + "loss": 3.0681, + "step": 48633 + }, + { + "epoch": 2.38, + "grad_norm": 0.7699520587921143, + "learning_rate": 6.039004489968121e-05, + "loss": 2.863, + "step": 48634 + }, + { + "epoch": 2.38, + "grad_norm": 0.7257212400436401, + "learning_rate": 6.038078087124983e-05, + "loss": 2.7365, + "step": 48635 + }, + { + "epoch": 2.38, + "grad_norm": 0.7219493985176086, + "learning_rate": 6.037151747392939e-05, + "loss": 2.9616, + "step": 48636 + }, + { + "epoch": 2.38, + "grad_norm": 0.7039251327514648, + "learning_rate": 6.036225470774425e-05, + "loss": 2.5781, + "step": 48637 + }, + { + "epoch": 2.38, + "grad_norm": 0.7816781401634216, + "learning_rate": 6.035299257271874e-05, + "loss": 2.8161, + "step": 48638 + }, + { + "epoch": 2.38, + "grad_norm": 0.7119601368904114, + "learning_rate": 6.034373106887733e-05, + "loss": 2.9124, + "step": 48639 + }, + { + "epoch": 2.38, + "grad_norm": 0.7373980283737183, + "learning_rate": 6.033447019624448e-05, + "loss": 2.7964, + "step": 48640 + }, + { + "epoch": 2.38, + "grad_norm": 0.6862812042236328, + "learning_rate": 6.032520995484442e-05, + "loss": 2.9101, + "step": 48641 + }, + { + "epoch": 2.38, + "grad_norm": 0.7756862640380859, + "learning_rate": 6.031595034470171e-05, + "loss": 2.986, + "step": 48642 + }, + { + "epoch": 2.38, + "grad_norm": 0.6969619393348694, + "learning_rate": 6.0306691365840596e-05, + "loss": 2.9996, + "step": 48643 + }, + { + "epoch": 2.38, + "grad_norm": 0.7498837113380432, + "learning_rate": 6.029743301828559e-05, + "loss": 2.8987, + "step": 48644 + }, + { + "epoch": 2.38, + "grad_norm": 0.7080926895141602, + "learning_rate": 6.028817530206104e-05, + "loss": 2.9105, + "step": 48645 + }, + { + "epoch": 2.38, + "grad_norm": 0.7312246561050415, + "learning_rate": 6.027891821719122e-05, + "loss": 3.199, + "step": 48646 + }, + { + "epoch": 2.38, + "grad_norm": 0.7515241503715515, + "learning_rate": 6.026966176370065e-05, + "loss": 2.9646, + "step": 48647 + }, + { + "epoch": 2.38, + "grad_norm": 0.7634444832801819, + "learning_rate": 6.026040594161358e-05, + "loss": 3.049, + "step": 48648 + }, + { + "epoch": 2.38, + "grad_norm": 0.7122647166252136, + "learning_rate": 6.025115075095448e-05, + "loss": 2.6741, + "step": 48649 + }, + { + "epoch": 2.38, + "grad_norm": 0.6937204003334045, + "learning_rate": 6.0241896191747774e-05, + "loss": 2.8746, + "step": 48650 + }, + { + "epoch": 2.38, + "grad_norm": 0.7191386222839355, + "learning_rate": 6.023264226401777e-05, + "loss": 2.843, + "step": 48651 + }, + { + "epoch": 2.38, + "grad_norm": 0.7549066543579102, + "learning_rate": 6.022338896778882e-05, + "loss": 2.9481, + "step": 48652 + }, + { + "epoch": 2.38, + "grad_norm": 0.731148898601532, + "learning_rate": 6.021413630308527e-05, + "loss": 2.8964, + "step": 48653 + }, + { + "epoch": 2.38, + "grad_norm": 0.7206429243087769, + "learning_rate": 6.020488426993153e-05, + "loss": 2.709, + "step": 48654 + }, + { + "epoch": 2.38, + "grad_norm": 0.7050144076347351, + "learning_rate": 6.019563286835205e-05, + "loss": 2.7941, + "step": 48655 + }, + { + "epoch": 2.38, + "grad_norm": 0.7599301338195801, + "learning_rate": 6.0186382098371047e-05, + "loss": 2.9327, + "step": 48656 + }, + { + "epoch": 2.38, + "grad_norm": 0.6911978721618652, + "learning_rate": 6.0177131960013056e-05, + "loss": 2.8945, + "step": 48657 + }, + { + "epoch": 2.38, + "grad_norm": 0.7535004019737244, + "learning_rate": 6.016788245330231e-05, + "loss": 2.9582, + "step": 48658 + }, + { + "epoch": 2.38, + "grad_norm": 0.7646125555038452, + "learning_rate": 6.015863357826314e-05, + "loss": 3.0724, + "step": 48659 + }, + { + "epoch": 2.38, + "grad_norm": 0.7803003787994385, + "learning_rate": 6.014938533492006e-05, + "loss": 2.959, + "step": 48660 + }, + { + "epoch": 2.38, + "grad_norm": 0.730233371257782, + "learning_rate": 6.014013772329727e-05, + "loss": 2.8426, + "step": 48661 + }, + { + "epoch": 2.38, + "grad_norm": 0.7188946008682251, + "learning_rate": 6.0130890743419225e-05, + "loss": 3.0968, + "step": 48662 + }, + { + "epoch": 2.38, + "grad_norm": 0.7240670323371887, + "learning_rate": 6.012164439531022e-05, + "loss": 2.8007, + "step": 48663 + }, + { + "epoch": 2.38, + "grad_norm": 0.7316474318504333, + "learning_rate": 6.011239867899463e-05, + "loss": 2.9304, + "step": 48664 + }, + { + "epoch": 2.38, + "grad_norm": 0.8154338598251343, + "learning_rate": 6.010315359449688e-05, + "loss": 2.9659, + "step": 48665 + }, + { + "epoch": 2.39, + "grad_norm": 0.7354592084884644, + "learning_rate": 6.0093909141841266e-05, + "loss": 2.9925, + "step": 48666 + }, + { + "epoch": 2.39, + "grad_norm": 0.7725946307182312, + "learning_rate": 6.008466532105212e-05, + "loss": 2.8038, + "step": 48667 + }, + { + "epoch": 2.39, + "grad_norm": 0.7260931134223938, + "learning_rate": 6.0075422132153705e-05, + "loss": 2.8687, + "step": 48668 + }, + { + "epoch": 2.39, + "grad_norm": 0.7305506467819214, + "learning_rate": 6.0066179575170445e-05, + "loss": 2.9901, + "step": 48669 + }, + { + "epoch": 2.39, + "grad_norm": 0.8036696910858154, + "learning_rate": 6.005693765012679e-05, + "loss": 2.9703, + "step": 48670 + }, + { + "epoch": 2.39, + "grad_norm": 0.7172324061393738, + "learning_rate": 6.004769635704691e-05, + "loss": 2.9001, + "step": 48671 + }, + { + "epoch": 2.39, + "grad_norm": 0.7303609251976013, + "learning_rate": 6.0038455695955265e-05, + "loss": 3.0238, + "step": 48672 + }, + { + "epoch": 2.39, + "grad_norm": 0.7004274725914001, + "learning_rate": 6.0029215666876194e-05, + "loss": 2.9343, + "step": 48673 + }, + { + "epoch": 2.39, + "grad_norm": 0.7272935509681702, + "learning_rate": 6.001997626983388e-05, + "loss": 2.738, + "step": 48674 + }, + { + "epoch": 2.39, + "grad_norm": 0.7348398566246033, + "learning_rate": 6.001073750485283e-05, + "loss": 2.7692, + "step": 48675 + }, + { + "epoch": 2.39, + "grad_norm": 0.7049981951713562, + "learning_rate": 6.000149937195724e-05, + "loss": 2.9259, + "step": 48676 + }, + { + "epoch": 2.39, + "grad_norm": 0.7831222414970398, + "learning_rate": 5.999226187117161e-05, + "loss": 2.91, + "step": 48677 + }, + { + "epoch": 2.39, + "grad_norm": 0.7655361294746399, + "learning_rate": 5.998302500252009e-05, + "loss": 2.6398, + "step": 48678 + }, + { + "epoch": 2.39, + "grad_norm": 0.7228807210922241, + "learning_rate": 5.997378876602713e-05, + "loss": 2.789, + "step": 48679 + }, + { + "epoch": 2.39, + "grad_norm": 0.7596110105514526, + "learning_rate": 5.996455316171708e-05, + "loss": 2.8773, + "step": 48680 + }, + { + "epoch": 2.39, + "grad_norm": 0.7604156136512756, + "learning_rate": 5.995531818961408e-05, + "loss": 2.9434, + "step": 48681 + }, + { + "epoch": 2.39, + "grad_norm": 0.7096068859100342, + "learning_rate": 5.9946083849742645e-05, + "loss": 2.9145, + "step": 48682 + }, + { + "epoch": 2.39, + "grad_norm": 0.7317915558815002, + "learning_rate": 5.993685014212696e-05, + "loss": 2.9156, + "step": 48683 + }, + { + "epoch": 2.39, + "grad_norm": 0.6918458342552185, + "learning_rate": 5.992761706679143e-05, + "loss": 2.7525, + "step": 48684 + }, + { + "epoch": 2.39, + "grad_norm": 0.7611396908760071, + "learning_rate": 5.991838462376042e-05, + "loss": 2.9744, + "step": 48685 + }, + { + "epoch": 2.39, + "grad_norm": 0.7494409680366516, + "learning_rate": 5.990915281305809e-05, + "loss": 3.0139, + "step": 48686 + }, + { + "epoch": 2.39, + "grad_norm": 0.7255983352661133, + "learning_rate": 5.9899921634708936e-05, + "loss": 3.0039, + "step": 48687 + }, + { + "epoch": 2.39, + "grad_norm": 0.7247331738471985, + "learning_rate": 5.9890691088737165e-05, + "loss": 2.9438, + "step": 48688 + }, + { + "epoch": 2.39, + "grad_norm": 0.7208533883094788, + "learning_rate": 5.988146117516702e-05, + "loss": 2.7187, + "step": 48689 + }, + { + "epoch": 2.39, + "grad_norm": 0.7259299755096436, + "learning_rate": 5.987223189402299e-05, + "loss": 3.0192, + "step": 48690 + }, + { + "epoch": 2.39, + "grad_norm": 0.7051576375961304, + "learning_rate": 5.986300324532922e-05, + "loss": 3.1708, + "step": 48691 + }, + { + "epoch": 2.39, + "grad_norm": 0.8053131699562073, + "learning_rate": 5.985377522911016e-05, + "loss": 2.8649, + "step": 48692 + }, + { + "epoch": 2.39, + "grad_norm": 0.7644606828689575, + "learning_rate": 5.984454784538994e-05, + "loss": 2.9055, + "step": 48693 + }, + { + "epoch": 2.39, + "grad_norm": 0.7201364636421204, + "learning_rate": 5.9835321094193035e-05, + "loss": 2.8487, + "step": 48694 + }, + { + "epoch": 2.39, + "grad_norm": 0.7057961821556091, + "learning_rate": 5.9826094975543705e-05, + "loss": 3.0138, + "step": 48695 + }, + { + "epoch": 2.39, + "grad_norm": 0.7159346342086792, + "learning_rate": 5.9816869489466135e-05, + "loss": 2.6607, + "step": 48696 + }, + { + "epoch": 2.39, + "grad_norm": 0.7882915735244751, + "learning_rate": 5.9807644635984775e-05, + "loss": 2.9877, + "step": 48697 + }, + { + "epoch": 2.39, + "grad_norm": 0.7171709537506104, + "learning_rate": 5.9798420415123774e-05, + "loss": 2.8216, + "step": 48698 + }, + { + "epoch": 2.39, + "grad_norm": 0.7261692881584167, + "learning_rate": 5.97891968269075e-05, + "loss": 2.7935, + "step": 48699 + }, + { + "epoch": 2.39, + "grad_norm": 0.7348390221595764, + "learning_rate": 5.977997387136032e-05, + "loss": 3.0635, + "step": 48700 + }, + { + "epoch": 2.39, + "grad_norm": 0.7532333731651306, + "learning_rate": 5.97707515485064e-05, + "loss": 2.9131, + "step": 48701 + }, + { + "epoch": 2.39, + "grad_norm": 0.6886923313140869, + "learning_rate": 5.976152985837026e-05, + "loss": 2.7078, + "step": 48702 + }, + { + "epoch": 2.39, + "grad_norm": 0.7542856335639954, + "learning_rate": 5.975230880097581e-05, + "loss": 2.6983, + "step": 48703 + }, + { + "epoch": 2.39, + "grad_norm": 0.7375133037567139, + "learning_rate": 5.974308837634767e-05, + "loss": 2.6483, + "step": 48704 + }, + { + "epoch": 2.39, + "grad_norm": 0.7276037335395813, + "learning_rate": 5.973386858450988e-05, + "loss": 2.8235, + "step": 48705 + }, + { + "epoch": 2.39, + "grad_norm": 0.7167425751686096, + "learning_rate": 5.972464942548686e-05, + "loss": 2.6771, + "step": 48706 + }, + { + "epoch": 2.39, + "grad_norm": 0.760288655757904, + "learning_rate": 5.9715430899302954e-05, + "loss": 2.7104, + "step": 48707 + }, + { + "epoch": 2.39, + "grad_norm": 0.7757988572120667, + "learning_rate": 5.970621300598227e-05, + "loss": 2.8529, + "step": 48708 + }, + { + "epoch": 2.39, + "grad_norm": 0.7403610348701477, + "learning_rate": 5.969699574554927e-05, + "loss": 2.9158, + "step": 48709 + }, + { + "epoch": 2.39, + "grad_norm": 0.7176191210746765, + "learning_rate": 5.968777911802812e-05, + "loss": 3.0029, + "step": 48710 + }, + { + "epoch": 2.39, + "grad_norm": 0.711616039276123, + "learning_rate": 5.9678563123443045e-05, + "loss": 3.1245, + "step": 48711 + }, + { + "epoch": 2.39, + "grad_norm": 0.7264795303344727, + "learning_rate": 5.9669347761818455e-05, + "loss": 2.7855, + "step": 48712 + }, + { + "epoch": 2.39, + "grad_norm": 0.7248842120170593, + "learning_rate": 5.966013303317846e-05, + "loss": 2.8838, + "step": 48713 + }, + { + "epoch": 2.39, + "grad_norm": 0.7030981183052063, + "learning_rate": 5.965091893754751e-05, + "loss": 3.0203, + "step": 48714 + }, + { + "epoch": 2.39, + "grad_norm": 0.7812050580978394, + "learning_rate": 5.964170547494972e-05, + "loss": 2.9395, + "step": 48715 + }, + { + "epoch": 2.39, + "grad_norm": 0.762490451335907, + "learning_rate": 5.9632492645409467e-05, + "loss": 2.9752, + "step": 48716 + }, + { + "epoch": 2.39, + "grad_norm": 0.68095463514328, + "learning_rate": 5.962328044895097e-05, + "loss": 2.803, + "step": 48717 + }, + { + "epoch": 2.39, + "grad_norm": 0.6975252628326416, + "learning_rate": 5.961406888559844e-05, + "loss": 3.1397, + "step": 48718 + }, + { + "epoch": 2.39, + "grad_norm": 0.7371811866760254, + "learning_rate": 5.960485795537626e-05, + "loss": 2.9046, + "step": 48719 + }, + { + "epoch": 2.39, + "grad_norm": 0.7280109524726868, + "learning_rate": 5.959564765830854e-05, + "loss": 2.8761, + "step": 48720 + }, + { + "epoch": 2.39, + "grad_norm": 0.7430052161216736, + "learning_rate": 5.95864379944196e-05, + "loss": 2.8337, + "step": 48721 + }, + { + "epoch": 2.39, + "grad_norm": 0.7345473170280457, + "learning_rate": 5.957722896373381e-05, + "loss": 2.9135, + "step": 48722 + }, + { + "epoch": 2.39, + "grad_norm": 0.7026997208595276, + "learning_rate": 5.9568020566275235e-05, + "loss": 2.9278, + "step": 48723 + }, + { + "epoch": 2.39, + "grad_norm": 0.7376667857170105, + "learning_rate": 5.9558812802068335e-05, + "loss": 2.7952, + "step": 48724 + }, + { + "epoch": 2.39, + "grad_norm": 0.7143561840057373, + "learning_rate": 5.95496056711372e-05, + "loss": 2.7486, + "step": 48725 + }, + { + "epoch": 2.39, + "grad_norm": 0.7290409803390503, + "learning_rate": 5.954039917350608e-05, + "loss": 2.697, + "step": 48726 + }, + { + "epoch": 2.39, + "grad_norm": 0.7321444749832153, + "learning_rate": 5.953119330919935e-05, + "loss": 2.8251, + "step": 48727 + }, + { + "epoch": 2.39, + "grad_norm": 0.7561767101287842, + "learning_rate": 5.95219880782411e-05, + "loss": 2.7957, + "step": 48728 + }, + { + "epoch": 2.39, + "grad_norm": 0.750032365322113, + "learning_rate": 5.95127834806557e-05, + "loss": 3.2731, + "step": 48729 + }, + { + "epoch": 2.39, + "grad_norm": 0.7444902658462524, + "learning_rate": 5.95035795164673e-05, + "loss": 2.8988, + "step": 48730 + }, + { + "epoch": 2.39, + "grad_norm": 0.735755205154419, + "learning_rate": 5.9494376185700256e-05, + "loss": 2.9861, + "step": 48731 + }, + { + "epoch": 2.39, + "grad_norm": 0.7328507900238037, + "learning_rate": 5.948517348837874e-05, + "loss": 2.9542, + "step": 48732 + }, + { + "epoch": 2.39, + "grad_norm": 0.7095339894294739, + "learning_rate": 5.947597142452693e-05, + "loss": 2.7841, + "step": 48733 + }, + { + "epoch": 2.39, + "grad_norm": 0.7547805309295654, + "learning_rate": 5.946676999416917e-05, + "loss": 2.9635, + "step": 48734 + }, + { + "epoch": 2.39, + "grad_norm": 0.6976374387741089, + "learning_rate": 5.945756919732956e-05, + "loss": 2.8442, + "step": 48735 + }, + { + "epoch": 2.39, + "grad_norm": 0.7196562886238098, + "learning_rate": 5.944836903403246e-05, + "loss": 3.1681, + "step": 48736 + }, + { + "epoch": 2.39, + "grad_norm": 0.7531615495681763, + "learning_rate": 5.94391695043021e-05, + "loss": 3.1516, + "step": 48737 + }, + { + "epoch": 2.39, + "grad_norm": 0.7510145306587219, + "learning_rate": 5.942997060816268e-05, + "loss": 2.8634, + "step": 48738 + }, + { + "epoch": 2.39, + "grad_norm": 0.7500221133232117, + "learning_rate": 5.9420772345638423e-05, + "loss": 3.1042, + "step": 48739 + }, + { + "epoch": 2.39, + "grad_norm": 0.7888630032539368, + "learning_rate": 5.9411574716753474e-05, + "loss": 2.7464, + "step": 48740 + }, + { + "epoch": 2.39, + "grad_norm": 0.7947269082069397, + "learning_rate": 5.9402377721532144e-05, + "loss": 2.9241, + "step": 48741 + }, + { + "epoch": 2.39, + "grad_norm": 0.6809054613113403, + "learning_rate": 5.939318135999872e-05, + "loss": 2.7824, + "step": 48742 + }, + { + "epoch": 2.39, + "grad_norm": 0.785391628742218, + "learning_rate": 5.938398563217728e-05, + "loss": 2.9763, + "step": 48743 + }, + { + "epoch": 2.39, + "grad_norm": 0.7565441131591797, + "learning_rate": 5.9374790538092166e-05, + "loss": 3.02, + "step": 48744 + }, + { + "epoch": 2.39, + "grad_norm": 0.719491720199585, + "learning_rate": 5.93655960777675e-05, + "loss": 3.204, + "step": 48745 + }, + { + "epoch": 2.39, + "grad_norm": 0.7319931983947754, + "learning_rate": 5.9356402251227585e-05, + "loss": 2.8652, + "step": 48746 + }, + { + "epoch": 2.39, + "grad_norm": 0.7423975467681885, + "learning_rate": 5.9347209058496615e-05, + "loss": 2.956, + "step": 48747 + }, + { + "epoch": 2.39, + "grad_norm": 0.7304463386535645, + "learning_rate": 5.9338016499598725e-05, + "loss": 2.9522, + "step": 48748 + }, + { + "epoch": 2.39, + "grad_norm": 0.7549775838851929, + "learning_rate": 5.932882457455821e-05, + "loss": 2.9967, + "step": 48749 + }, + { + "epoch": 2.39, + "grad_norm": 0.6921327710151672, + "learning_rate": 5.93196332833992e-05, + "loss": 2.9852, + "step": 48750 + }, + { + "epoch": 2.39, + "grad_norm": 0.7499932646751404, + "learning_rate": 5.931044262614596e-05, + "loss": 3.0477, + "step": 48751 + }, + { + "epoch": 2.39, + "grad_norm": 0.7360295057296753, + "learning_rate": 5.9301252602822794e-05, + "loss": 3.2046, + "step": 48752 + }, + { + "epoch": 2.39, + "grad_norm": 0.70384281873703, + "learning_rate": 5.929206321345378e-05, + "loss": 2.6769, + "step": 48753 + }, + { + "epoch": 2.39, + "grad_norm": 0.7233741879463196, + "learning_rate": 5.9282874458063136e-05, + "loss": 2.8938, + "step": 48754 + }, + { + "epoch": 2.39, + "grad_norm": 0.7333195805549622, + "learning_rate": 5.9273686336675007e-05, + "loss": 2.8446, + "step": 48755 + }, + { + "epoch": 2.39, + "grad_norm": 0.7375016808509827, + "learning_rate": 5.9264498849313644e-05, + "loss": 3.0102, + "step": 48756 + }, + { + "epoch": 2.39, + "grad_norm": 0.7300297617912292, + "learning_rate": 5.925531199600336e-05, + "loss": 2.8862, + "step": 48757 + }, + { + "epoch": 2.39, + "grad_norm": 0.7252567410469055, + "learning_rate": 5.924612577676817e-05, + "loss": 2.9107, + "step": 48758 + }, + { + "epoch": 2.39, + "grad_norm": 0.7813132405281067, + "learning_rate": 5.923694019163242e-05, + "loss": 2.8536, + "step": 48759 + }, + { + "epoch": 2.39, + "grad_norm": 0.714250385761261, + "learning_rate": 5.9227755240620254e-05, + "loss": 2.9715, + "step": 48760 + }, + { + "epoch": 2.39, + "grad_norm": 0.7643177509307861, + "learning_rate": 5.921857092375576e-05, + "loss": 2.942, + "step": 48761 + }, + { + "epoch": 2.39, + "grad_norm": 0.7428061366081238, + "learning_rate": 5.920938724106328e-05, + "loss": 2.9505, + "step": 48762 + }, + { + "epoch": 2.39, + "grad_norm": 0.7188575267791748, + "learning_rate": 5.9200204192566856e-05, + "loss": 2.9623, + "step": 48763 + }, + { + "epoch": 2.39, + "grad_norm": 0.7388699650764465, + "learning_rate": 5.9191021778290817e-05, + "loss": 3.0572, + "step": 48764 + }, + { + "epoch": 2.39, + "grad_norm": 0.7034398913383484, + "learning_rate": 5.918183999825923e-05, + "loss": 2.8113, + "step": 48765 + }, + { + "epoch": 2.39, + "grad_norm": 0.7972329258918762, + "learning_rate": 5.917265885249631e-05, + "loss": 3.0816, + "step": 48766 + }, + { + "epoch": 2.39, + "grad_norm": 0.7589764595031738, + "learning_rate": 5.916347834102635e-05, + "loss": 2.9365, + "step": 48767 + }, + { + "epoch": 2.39, + "grad_norm": 0.7065830826759338, + "learning_rate": 5.915429846387342e-05, + "loss": 2.9464, + "step": 48768 + }, + { + "epoch": 2.39, + "grad_norm": 0.8015071153640747, + "learning_rate": 5.914511922106173e-05, + "loss": 3.0396, + "step": 48769 + }, + { + "epoch": 2.39, + "grad_norm": 0.7204930186271667, + "learning_rate": 5.913594061261534e-05, + "loss": 2.7337, + "step": 48770 + }, + { + "epoch": 2.39, + "grad_norm": 0.7478088140487671, + "learning_rate": 5.9126762638558556e-05, + "loss": 2.9955, + "step": 48771 + }, + { + "epoch": 2.39, + "grad_norm": 0.7554129362106323, + "learning_rate": 5.911758529891559e-05, + "loss": 2.9007, + "step": 48772 + }, + { + "epoch": 2.39, + "grad_norm": 0.7005829811096191, + "learning_rate": 5.9108408593710465e-05, + "loss": 2.9987, + "step": 48773 + }, + { + "epoch": 2.39, + "grad_norm": 0.7382482290267944, + "learning_rate": 5.909923252296752e-05, + "loss": 3.1134, + "step": 48774 + }, + { + "epoch": 2.39, + "grad_norm": 0.8164120316505432, + "learning_rate": 5.909005708671084e-05, + "loss": 2.9471, + "step": 48775 + }, + { + "epoch": 2.39, + "grad_norm": 0.706451416015625, + "learning_rate": 5.9080882284964506e-05, + "loss": 3.1401, + "step": 48776 + }, + { + "epoch": 2.39, + "grad_norm": 0.7515252232551575, + "learning_rate": 5.907170811775283e-05, + "loss": 2.8985, + "step": 48777 + }, + { + "epoch": 2.39, + "grad_norm": 0.6926350593566895, + "learning_rate": 5.906253458509982e-05, + "loss": 2.9885, + "step": 48778 + }, + { + "epoch": 2.39, + "grad_norm": 0.7306018471717834, + "learning_rate": 5.90533616870298e-05, + "loss": 2.8483, + "step": 48779 + }, + { + "epoch": 2.39, + "grad_norm": 0.6878507733345032, + "learning_rate": 5.904418942356678e-05, + "loss": 2.9312, + "step": 48780 + }, + { + "epoch": 2.39, + "grad_norm": 0.728437602519989, + "learning_rate": 5.9035017794735107e-05, + "loss": 2.9679, + "step": 48781 + }, + { + "epoch": 2.39, + "grad_norm": 0.710863471031189, + "learning_rate": 5.902584680055876e-05, + "loss": 2.9908, + "step": 48782 + }, + { + "epoch": 2.39, + "grad_norm": 0.7090634703636169, + "learning_rate": 5.901667644106193e-05, + "loss": 3.095, + "step": 48783 + }, + { + "epoch": 2.39, + "grad_norm": 0.7385855913162231, + "learning_rate": 5.900750671626885e-05, + "loss": 3.1334, + "step": 48784 + }, + { + "epoch": 2.39, + "grad_norm": 0.705529510974884, + "learning_rate": 5.899833762620354e-05, + "loss": 2.8821, + "step": 48785 + }, + { + "epoch": 2.39, + "grad_norm": 0.716135561466217, + "learning_rate": 5.8989169170890316e-05, + "loss": 2.8941, + "step": 48786 + }, + { + "epoch": 2.39, + "grad_norm": 0.690018355846405, + "learning_rate": 5.898000135035316e-05, + "loss": 2.8815, + "step": 48787 + }, + { + "epoch": 2.39, + "grad_norm": 0.7214192748069763, + "learning_rate": 5.8970834164616273e-05, + "loss": 2.8851, + "step": 48788 + }, + { + "epoch": 2.39, + "grad_norm": 0.8130916357040405, + "learning_rate": 5.8961667613703923e-05, + "loss": 2.9957, + "step": 48789 + }, + { + "epoch": 2.39, + "grad_norm": 0.7414829730987549, + "learning_rate": 5.8952501697640145e-05, + "loss": 2.6515, + "step": 48790 + }, + { + "epoch": 2.39, + "grad_norm": 0.7804091572761536, + "learning_rate": 5.894333641644908e-05, + "loss": 2.7691, + "step": 48791 + }, + { + "epoch": 2.39, + "grad_norm": 0.7167255878448486, + "learning_rate": 5.8934171770154824e-05, + "loss": 2.9098, + "step": 48792 + }, + { + "epoch": 2.39, + "grad_norm": 0.7627856135368347, + "learning_rate": 5.892500775878155e-05, + "loss": 3.1166, + "step": 48793 + }, + { + "epoch": 2.39, + "grad_norm": 0.7267736792564392, + "learning_rate": 5.8915844382353474e-05, + "loss": 3.0016, + "step": 48794 + }, + { + "epoch": 2.39, + "grad_norm": 0.7814797163009644, + "learning_rate": 5.89066816408946e-05, + "loss": 2.6712, + "step": 48795 + }, + { + "epoch": 2.39, + "grad_norm": 0.7293899059295654, + "learning_rate": 5.889751953442919e-05, + "loss": 2.9074, + "step": 48796 + }, + { + "epoch": 2.39, + "grad_norm": 0.7417458891868591, + "learning_rate": 5.888835806298132e-05, + "loss": 2.8068, + "step": 48797 + }, + { + "epoch": 2.39, + "grad_norm": 0.742914080619812, + "learning_rate": 5.8879197226575034e-05, + "loss": 2.6278, + "step": 48798 + }, + { + "epoch": 2.39, + "grad_norm": 0.7378799915313721, + "learning_rate": 5.887003702523462e-05, + "loss": 2.7636, + "step": 48799 + }, + { + "epoch": 2.39, + "grad_norm": 0.7445825338363647, + "learning_rate": 5.886087745898405e-05, + "loss": 2.9709, + "step": 48800 + }, + { + "epoch": 2.39, + "grad_norm": 0.7399541139602661, + "learning_rate": 5.88517185278476e-05, + "loss": 2.699, + "step": 48801 + }, + { + "epoch": 2.39, + "grad_norm": 0.7600640654563904, + "learning_rate": 5.884256023184924e-05, + "loss": 2.9231, + "step": 48802 + }, + { + "epoch": 2.39, + "grad_norm": 0.7569655776023865, + "learning_rate": 5.883340257101312e-05, + "loss": 3.065, + "step": 48803 + }, + { + "epoch": 2.39, + "grad_norm": 0.7483676671981812, + "learning_rate": 5.8824245545363615e-05, + "loss": 2.8574, + "step": 48804 + }, + { + "epoch": 2.39, + "grad_norm": 0.7173334360122681, + "learning_rate": 5.8815089154924456e-05, + "loss": 2.865, + "step": 48805 + }, + { + "epoch": 2.39, + "grad_norm": 0.7077659964561462, + "learning_rate": 5.8805933399719993e-05, + "loss": 2.8537, + "step": 48806 + }, + { + "epoch": 2.39, + "grad_norm": 0.7615037560462952, + "learning_rate": 5.879677827977425e-05, + "loss": 2.7053, + "step": 48807 + }, + { + "epoch": 2.39, + "grad_norm": 0.7203567028045654, + "learning_rate": 5.878762379511133e-05, + "loss": 2.8675, + "step": 48808 + }, + { + "epoch": 2.39, + "grad_norm": 0.7756044864654541, + "learning_rate": 5.877846994575548e-05, + "loss": 2.9412, + "step": 48809 + }, + { + "epoch": 2.39, + "grad_norm": 0.7094483971595764, + "learning_rate": 5.876931673173062e-05, + "loss": 2.9699, + "step": 48810 + }, + { + "epoch": 2.39, + "grad_norm": 0.7442003488540649, + "learning_rate": 5.8760164153061064e-05, + "loss": 2.912, + "step": 48811 + }, + { + "epoch": 2.39, + "grad_norm": 0.7222998142242432, + "learning_rate": 5.8751012209770786e-05, + "loss": 2.8013, + "step": 48812 + }, + { + "epoch": 2.39, + "grad_norm": 0.7189565300941467, + "learning_rate": 5.8741860901883844e-05, + "loss": 2.8706, + "step": 48813 + }, + { + "epoch": 2.39, + "grad_norm": 0.7589866518974304, + "learning_rate": 5.8732710229424507e-05, + "loss": 3.0814, + "step": 48814 + }, + { + "epoch": 2.39, + "grad_norm": 0.7047926783561707, + "learning_rate": 5.8723560192416665e-05, + "loss": 2.7251, + "step": 48815 + }, + { + "epoch": 2.39, + "grad_norm": 0.7266496419906616, + "learning_rate": 5.8714410790884626e-05, + "loss": 2.8967, + "step": 48816 + }, + { + "epoch": 2.39, + "grad_norm": 0.7001505494117737, + "learning_rate": 5.8705262024852304e-05, + "loss": 2.843, + "step": 48817 + }, + { + "epoch": 2.39, + "grad_norm": 0.7681334018707275, + "learning_rate": 5.869611389434399e-05, + "loss": 2.7958, + "step": 48818 + }, + { + "epoch": 2.39, + "grad_norm": 0.7098401784896851, + "learning_rate": 5.868696639938365e-05, + "loss": 2.9353, + "step": 48819 + }, + { + "epoch": 2.39, + "grad_norm": 0.7518061995506287, + "learning_rate": 5.867781953999531e-05, + "loss": 2.8312, + "step": 48820 + }, + { + "epoch": 2.39, + "grad_norm": 0.7705138325691223, + "learning_rate": 5.866867331620324e-05, + "loss": 2.8373, + "step": 48821 + }, + { + "epoch": 2.39, + "grad_norm": 0.7576525211334229, + "learning_rate": 5.865952772803136e-05, + "loss": 2.8809, + "step": 48822 + }, + { + "epoch": 2.39, + "grad_norm": 0.7420570850372314, + "learning_rate": 5.8650382775503855e-05, + "loss": 3.1835, + "step": 48823 + }, + { + "epoch": 2.39, + "grad_norm": 0.7061996459960938, + "learning_rate": 5.8641238458644835e-05, + "loss": 2.9064, + "step": 48824 + }, + { + "epoch": 2.39, + "grad_norm": 0.7823624610900879, + "learning_rate": 5.8632094777478275e-05, + "loss": 2.9696, + "step": 48825 + }, + { + "epoch": 2.39, + "grad_norm": 0.7167471647262573, + "learning_rate": 5.8622951732028435e-05, + "loss": 3.052, + "step": 48826 + }, + { + "epoch": 2.39, + "grad_norm": 0.7196730375289917, + "learning_rate": 5.861380932231925e-05, + "loss": 2.752, + "step": 48827 + }, + { + "epoch": 2.39, + "grad_norm": 0.758525013923645, + "learning_rate": 5.8604667548374776e-05, + "loss": 2.8304, + "step": 48828 + }, + { + "epoch": 2.39, + "grad_norm": 0.7166337370872498, + "learning_rate": 5.859552641021922e-05, + "loss": 3.0448, + "step": 48829 + }, + { + "epoch": 2.39, + "grad_norm": 0.6951048970222473, + "learning_rate": 5.8586385907876534e-05, + "loss": 2.9832, + "step": 48830 + }, + { + "epoch": 2.39, + "grad_norm": 0.7193121314048767, + "learning_rate": 5.8577246041370896e-05, + "loss": 2.9533, + "step": 48831 + }, + { + "epoch": 2.39, + "grad_norm": 0.7775723934173584, + "learning_rate": 5.856810681072626e-05, + "loss": 2.7201, + "step": 48832 + }, + { + "epoch": 2.39, + "grad_norm": 0.7279043793678284, + "learning_rate": 5.855896821596683e-05, + "loss": 2.7716, + "step": 48833 + }, + { + "epoch": 2.39, + "grad_norm": 0.7554843425750732, + "learning_rate": 5.854983025711662e-05, + "loss": 2.8317, + "step": 48834 + }, + { + "epoch": 2.39, + "grad_norm": 0.7403421401977539, + "learning_rate": 5.854069293419963e-05, + "loss": 3.1781, + "step": 48835 + }, + { + "epoch": 2.39, + "grad_norm": 0.70306396484375, + "learning_rate": 5.8531556247240054e-05, + "loss": 2.8058, + "step": 48836 + }, + { + "epoch": 2.39, + "grad_norm": 0.716789960861206, + "learning_rate": 5.852242019626181e-05, + "loss": 2.9193, + "step": 48837 + }, + { + "epoch": 2.39, + "grad_norm": 0.7575057148933411, + "learning_rate": 5.851328478128902e-05, + "loss": 2.7854, + "step": 48838 + }, + { + "epoch": 2.39, + "grad_norm": 0.7853378057479858, + "learning_rate": 5.850415000234585e-05, + "loss": 3.17, + "step": 48839 + }, + { + "epoch": 2.39, + "grad_norm": 0.7358184456825256, + "learning_rate": 5.8495015859456294e-05, + "loss": 2.8608, + "step": 48840 + }, + { + "epoch": 2.39, + "grad_norm": 0.6891388893127441, + "learning_rate": 5.8485882352644366e-05, + "loss": 2.8855, + "step": 48841 + }, + { + "epoch": 2.39, + "grad_norm": 0.7379217743873596, + "learning_rate": 5.847674948193407e-05, + "loss": 2.6099, + "step": 48842 + }, + { + "epoch": 2.39, + "grad_norm": 0.7153098583221436, + "learning_rate": 5.846761724734953e-05, + "loss": 2.8372, + "step": 48843 + }, + { + "epoch": 2.39, + "grad_norm": 0.750810980796814, + "learning_rate": 5.845848564891489e-05, + "loss": 2.8569, + "step": 48844 + }, + { + "epoch": 2.39, + "grad_norm": 0.7509940266609192, + "learning_rate": 5.8449354686654025e-05, + "loss": 2.6827, + "step": 48845 + }, + { + "epoch": 2.39, + "grad_norm": 0.7342125773429871, + "learning_rate": 5.844022436059116e-05, + "loss": 2.7864, + "step": 48846 + }, + { + "epoch": 2.39, + "grad_norm": 0.7348741292953491, + "learning_rate": 5.8431094670750165e-05, + "loss": 2.8741, + "step": 48847 + }, + { + "epoch": 2.39, + "grad_norm": 0.7125906348228455, + "learning_rate": 5.842196561715526e-05, + "loss": 3.0068, + "step": 48848 + }, + { + "epoch": 2.39, + "grad_norm": 0.7036176323890686, + "learning_rate": 5.84128371998304e-05, + "loss": 2.9852, + "step": 48849 + }, + { + "epoch": 2.39, + "grad_norm": 0.7853854298591614, + "learning_rate": 5.8403709418799546e-05, + "loss": 2.9464, + "step": 48850 + }, + { + "epoch": 2.39, + "grad_norm": 0.7350966930389404, + "learning_rate": 5.8394582274086935e-05, + "loss": 2.9554, + "step": 48851 + }, + { + "epoch": 2.39, + "grad_norm": 0.7737547755241394, + "learning_rate": 5.838545576571639e-05, + "loss": 2.9423, + "step": 48852 + }, + { + "epoch": 2.39, + "grad_norm": 0.7212133407592773, + "learning_rate": 5.837632989371205e-05, + "loss": 2.8444, + "step": 48853 + }, + { + "epoch": 2.39, + "grad_norm": 0.7580508589744568, + "learning_rate": 5.836720465809808e-05, + "loss": 2.9049, + "step": 48854 + }, + { + "epoch": 2.39, + "grad_norm": 0.7183226943016052, + "learning_rate": 5.8358080058898326e-05, + "loss": 2.894, + "step": 48855 + }, + { + "epoch": 2.39, + "grad_norm": 0.7060414552688599, + "learning_rate": 5.8348956096136945e-05, + "loss": 2.6954, + "step": 48856 + }, + { + "epoch": 2.39, + "grad_norm": 0.7525982856750488, + "learning_rate": 5.833983276983778e-05, + "loss": 3.1094, + "step": 48857 + }, + { + "epoch": 2.39, + "grad_norm": 0.7521215677261353, + "learning_rate": 5.833071008002508e-05, + "loss": 2.8246, + "step": 48858 + }, + { + "epoch": 2.39, + "grad_norm": 0.7773438096046448, + "learning_rate": 5.832158802672272e-05, + "loss": 2.7105, + "step": 48859 + }, + { + "epoch": 2.39, + "grad_norm": 0.7317736148834229, + "learning_rate": 5.8312466609954756e-05, + "loss": 3.0221, + "step": 48860 + }, + { + "epoch": 2.39, + "grad_norm": 0.7350138425827026, + "learning_rate": 5.830334582974533e-05, + "loss": 2.9349, + "step": 48861 + }, + { + "epoch": 2.39, + "grad_norm": 0.7256360650062561, + "learning_rate": 5.829422568611839e-05, + "loss": 2.9305, + "step": 48862 + }, + { + "epoch": 2.39, + "grad_norm": 0.7844482660293579, + "learning_rate": 5.828510617909795e-05, + "loss": 3.0334, + "step": 48863 + }, + { + "epoch": 2.39, + "grad_norm": 0.7349040508270264, + "learning_rate": 5.827598730870793e-05, + "loss": 2.9445, + "step": 48864 + }, + { + "epoch": 2.39, + "grad_norm": 0.7633810639381409, + "learning_rate": 5.8266869074972436e-05, + "loss": 2.8846, + "step": 48865 + }, + { + "epoch": 2.39, + "grad_norm": 0.7389031052589417, + "learning_rate": 5.825775147791555e-05, + "loss": 2.833, + "step": 48866 + }, + { + "epoch": 2.39, + "grad_norm": 0.7971253395080566, + "learning_rate": 5.8248634517561156e-05, + "loss": 2.8842, + "step": 48867 + }, + { + "epoch": 2.39, + "grad_norm": 0.7472830414772034, + "learning_rate": 5.82395181939334e-05, + "loss": 2.6718, + "step": 48868 + }, + { + "epoch": 2.39, + "grad_norm": 1.0377117395401, + "learning_rate": 5.8230402507056165e-05, + "loss": 2.8771, + "step": 48869 + }, + { + "epoch": 2.4, + "grad_norm": 0.7521938681602478, + "learning_rate": 5.822128745695359e-05, + "loss": 2.8167, + "step": 48870 + }, + { + "epoch": 2.4, + "grad_norm": 0.7492688298225403, + "learning_rate": 5.821217304364963e-05, + "loss": 2.8992, + "step": 48871 + }, + { + "epoch": 2.4, + "grad_norm": 0.7289814949035645, + "learning_rate": 5.820305926716817e-05, + "loss": 2.9214, + "step": 48872 + }, + { + "epoch": 2.4, + "grad_norm": 0.6818594336509705, + "learning_rate": 5.819394612753342e-05, + "loss": 2.7029, + "step": 48873 + }, + { + "epoch": 2.4, + "grad_norm": 0.7078396677970886, + "learning_rate": 5.818483362476919e-05, + "loss": 2.6411, + "step": 48874 + }, + { + "epoch": 2.4, + "grad_norm": 0.7174808979034424, + "learning_rate": 5.8175721758899566e-05, + "loss": 2.9228, + "step": 48875 + }, + { + "epoch": 2.4, + "grad_norm": 0.7881730794906616, + "learning_rate": 5.8166610529948624e-05, + "loss": 2.882, + "step": 48876 + }, + { + "epoch": 2.4, + "grad_norm": 0.7044768929481506, + "learning_rate": 5.8157499937940275e-05, + "loss": 2.976, + "step": 48877 + }, + { + "epoch": 2.4, + "grad_norm": 0.72385573387146, + "learning_rate": 5.814838998289855e-05, + "loss": 2.8273, + "step": 48878 + }, + { + "epoch": 2.4, + "grad_norm": 0.68699049949646, + "learning_rate": 5.813928066484734e-05, + "loss": 2.8484, + "step": 48879 + }, + { + "epoch": 2.4, + "grad_norm": 0.7593944072723389, + "learning_rate": 5.813017198381071e-05, + "loss": 2.8658, + "step": 48880 + }, + { + "epoch": 2.4, + "grad_norm": 0.7795584201812744, + "learning_rate": 5.8121063939812776e-05, + "loss": 2.9373, + "step": 48881 + }, + { + "epoch": 2.4, + "grad_norm": 0.7256165742874146, + "learning_rate": 5.811195653287727e-05, + "loss": 2.8412, + "step": 48882 + }, + { + "epoch": 2.4, + "grad_norm": 0.8968254923820496, + "learning_rate": 5.8102849763028444e-05, + "loss": 2.9702, + "step": 48883 + }, + { + "epoch": 2.4, + "grad_norm": 0.7178112864494324, + "learning_rate": 5.8093743630290125e-05, + "loss": 2.5675, + "step": 48884 + }, + { + "epoch": 2.4, + "grad_norm": 0.75639408826828, + "learning_rate": 5.808463813468628e-05, + "loss": 2.9394, + "step": 48885 + }, + { + "epoch": 2.4, + "grad_norm": 0.7796924114227295, + "learning_rate": 5.807553327624099e-05, + "loss": 2.9314, + "step": 48886 + }, + { + "epoch": 2.4, + "grad_norm": 0.7454410195350647, + "learning_rate": 5.806642905497811e-05, + "loss": 3.0443, + "step": 48887 + }, + { + "epoch": 2.4, + "grad_norm": 0.7198060750961304, + "learning_rate": 5.805732547092179e-05, + "loss": 2.7605, + "step": 48888 + }, + { + "epoch": 2.4, + "grad_norm": 0.6760872006416321, + "learning_rate": 5.8048222524095833e-05, + "loss": 2.7732, + "step": 48889 + }, + { + "epoch": 2.4, + "grad_norm": 0.7360389828681946, + "learning_rate": 5.8039120214524293e-05, + "loss": 2.6977, + "step": 48890 + }, + { + "epoch": 2.4, + "grad_norm": 0.7301698327064514, + "learning_rate": 5.803001854223122e-05, + "loss": 2.8464, + "step": 48891 + }, + { + "epoch": 2.4, + "grad_norm": 0.7200692296028137, + "learning_rate": 5.8020917507240516e-05, + "loss": 3.0045, + "step": 48892 + }, + { + "epoch": 2.4, + "grad_norm": 0.7441954016685486, + "learning_rate": 5.8011817109576145e-05, + "loss": 3.1825, + "step": 48893 + }, + { + "epoch": 2.4, + "grad_norm": 0.7359301447868347, + "learning_rate": 5.800271734926197e-05, + "loss": 2.9192, + "step": 48894 + }, + { + "epoch": 2.4, + "grad_norm": 0.7263439893722534, + "learning_rate": 5.799361822632209e-05, + "loss": 2.9108, + "step": 48895 + }, + { + "epoch": 2.4, + "grad_norm": 0.7272346615791321, + "learning_rate": 5.798451974078051e-05, + "loss": 2.8907, + "step": 48896 + }, + { + "epoch": 2.4, + "grad_norm": 0.7894495129585266, + "learning_rate": 5.7975421892661045e-05, + "loss": 2.9208, + "step": 48897 + }, + { + "epoch": 2.4, + "grad_norm": 0.8710970282554626, + "learning_rate": 5.796632468198784e-05, + "loss": 3.0931, + "step": 48898 + }, + { + "epoch": 2.4, + "grad_norm": 0.6832615733146667, + "learning_rate": 5.795722810878475e-05, + "loss": 2.9985, + "step": 48899 + }, + { + "epoch": 2.4, + "grad_norm": 0.7081145644187927, + "learning_rate": 5.794813217307565e-05, + "loss": 3.0453, + "step": 48900 + }, + { + "epoch": 2.4, + "grad_norm": 0.7711806893348694, + "learning_rate": 5.793903687488468e-05, + "loss": 2.8482, + "step": 48901 + }, + { + "epoch": 2.4, + "grad_norm": 0.8075990676879883, + "learning_rate": 5.79299422142356e-05, + "loss": 3.1117, + "step": 48902 + }, + { + "epoch": 2.4, + "grad_norm": 0.7430427670478821, + "learning_rate": 5.7920848191152555e-05, + "loss": 2.9316, + "step": 48903 + }, + { + "epoch": 2.4, + "grad_norm": 0.7735027074813843, + "learning_rate": 5.791175480565936e-05, + "loss": 2.8317, + "step": 48904 + }, + { + "epoch": 2.4, + "grad_norm": 0.7572428584098816, + "learning_rate": 5.790266205778006e-05, + "loss": 2.9534, + "step": 48905 + }, + { + "epoch": 2.4, + "grad_norm": 0.721071720123291, + "learning_rate": 5.789356994753853e-05, + "loss": 2.9628, + "step": 48906 + }, + { + "epoch": 2.4, + "grad_norm": 0.7726892828941345, + "learning_rate": 5.788447847495869e-05, + "loss": 2.8635, + "step": 48907 + }, + { + "epoch": 2.4, + "grad_norm": 0.7207630276679993, + "learning_rate": 5.787538764006463e-05, + "loss": 2.9928, + "step": 48908 + }, + { + "epoch": 2.4, + "grad_norm": 0.7308686971664429, + "learning_rate": 5.786629744288012e-05, + "loss": 2.8482, + "step": 48909 + }, + { + "epoch": 2.4, + "grad_norm": 0.8075633645057678, + "learning_rate": 5.785720788342917e-05, + "loss": 2.8302, + "step": 48910 + }, + { + "epoch": 2.4, + "grad_norm": 0.7197324633598328, + "learning_rate": 5.78481189617358e-05, + "loss": 2.9422, + "step": 48911 + }, + { + "epoch": 2.4, + "grad_norm": 0.7496452927589417, + "learning_rate": 5.783903067782383e-05, + "loss": 2.7668, + "step": 48912 + }, + { + "epoch": 2.4, + "grad_norm": 0.7587379813194275, + "learning_rate": 5.7829943031717295e-05, + "loss": 2.8343, + "step": 48913 + }, + { + "epoch": 2.4, + "grad_norm": 0.7361343502998352, + "learning_rate": 5.782085602344012e-05, + "loss": 2.9595, + "step": 48914 + }, + { + "epoch": 2.4, + "grad_norm": 0.6897303462028503, + "learning_rate": 5.7811769653016117e-05, + "loss": 2.8756, + "step": 48915 + }, + { + "epoch": 2.4, + "grad_norm": 0.7227274179458618, + "learning_rate": 5.7802683920469364e-05, + "loss": 3.046, + "step": 48916 + }, + { + "epoch": 2.4, + "grad_norm": 0.7711227536201477, + "learning_rate": 5.779359882582364e-05, + "loss": 3.0059, + "step": 48917 + }, + { + "epoch": 2.4, + "grad_norm": 0.6948781609535217, + "learning_rate": 5.7784514369103064e-05, + "loss": 2.9202, + "step": 48918 + }, + { + "epoch": 2.4, + "grad_norm": 0.7455137968063354, + "learning_rate": 5.777543055033138e-05, + "loss": 2.9468, + "step": 48919 + }, + { + "epoch": 2.4, + "grad_norm": 0.738844096660614, + "learning_rate": 5.776634736953267e-05, + "loss": 3.0334, + "step": 48920 + }, + { + "epoch": 2.4, + "grad_norm": 0.7826026082038879, + "learning_rate": 5.7757264826730786e-05, + "loss": 2.8642, + "step": 48921 + }, + { + "epoch": 2.4, + "grad_norm": 0.7317241430282593, + "learning_rate": 5.774818292194957e-05, + "loss": 2.8831, + "step": 48922 + }, + { + "epoch": 2.4, + "grad_norm": 0.745648980140686, + "learning_rate": 5.773910165521311e-05, + "loss": 3.1187, + "step": 48923 + }, + { + "epoch": 2.4, + "grad_norm": 0.7066932916641235, + "learning_rate": 5.7730021026545107e-05, + "loss": 2.9324, + "step": 48924 + }, + { + "epoch": 2.4, + "grad_norm": 0.719154417514801, + "learning_rate": 5.772094103596965e-05, + "loss": 2.9116, + "step": 48925 + }, + { + "epoch": 2.4, + "grad_norm": 0.7419759631156921, + "learning_rate": 5.7711861683510654e-05, + "loss": 2.7544, + "step": 48926 + }, + { + "epoch": 2.4, + "grad_norm": 0.7109057307243347, + "learning_rate": 5.770278296919193e-05, + "loss": 2.6735, + "step": 48927 + }, + { + "epoch": 2.4, + "grad_norm": 0.6973155736923218, + "learning_rate": 5.769370489303749e-05, + "loss": 2.9472, + "step": 48928 + }, + { + "epoch": 2.4, + "grad_norm": 0.7412685751914978, + "learning_rate": 5.768462745507123e-05, + "loss": 2.96, + "step": 48929 + }, + { + "epoch": 2.4, + "grad_norm": 0.726198136806488, + "learning_rate": 5.767555065531695e-05, + "loss": 2.9192, + "step": 48930 + }, + { + "epoch": 2.4, + "grad_norm": 0.7647451162338257, + "learning_rate": 5.76664744937987e-05, + "loss": 2.9987, + "step": 48931 + }, + { + "epoch": 2.4, + "grad_norm": 0.790251612663269, + "learning_rate": 5.7657398970540226e-05, + "loss": 2.8102, + "step": 48932 + }, + { + "epoch": 2.4, + "grad_norm": 0.7141204476356506, + "learning_rate": 5.764832408556561e-05, + "loss": 3.3217, + "step": 48933 + }, + { + "epoch": 2.4, + "grad_norm": 0.742522120475769, + "learning_rate": 5.76392498388986e-05, + "loss": 2.9367, + "step": 48934 + }, + { + "epoch": 2.4, + "grad_norm": 0.6904557347297668, + "learning_rate": 5.763017623056325e-05, + "loss": 2.5379, + "step": 48935 + }, + { + "epoch": 2.4, + "grad_norm": 0.6816965937614441, + "learning_rate": 5.762110326058337e-05, + "loss": 2.8829, + "step": 48936 + }, + { + "epoch": 2.4, + "grad_norm": 0.7373006343841553, + "learning_rate": 5.761203092898277e-05, + "loss": 2.9315, + "step": 48937 + }, + { + "epoch": 2.4, + "grad_norm": 0.7109895348548889, + "learning_rate": 5.76029592357855e-05, + "loss": 2.756, + "step": 48938 + }, + { + "epoch": 2.4, + "grad_norm": 0.7374069690704346, + "learning_rate": 5.75938881810153e-05, + "loss": 3.042, + "step": 48939 + }, + { + "epoch": 2.4, + "grad_norm": 0.7400507926940918, + "learning_rate": 5.758481776469623e-05, + "loss": 2.9827, + "step": 48940 + }, + { + "epoch": 2.4, + "grad_norm": 0.7280579209327698, + "learning_rate": 5.757574798685204e-05, + "loss": 2.8857, + "step": 48941 + }, + { + "epoch": 2.4, + "grad_norm": 0.7616007924079895, + "learning_rate": 5.756667884750673e-05, + "loss": 2.8854, + "step": 48942 + }, + { + "epoch": 2.4, + "grad_norm": 0.7113364338874817, + "learning_rate": 5.755761034668413e-05, + "loss": 3.0651, + "step": 48943 + }, + { + "epoch": 2.4, + "grad_norm": 0.7248356938362122, + "learning_rate": 5.754854248440807e-05, + "loss": 2.927, + "step": 48944 + }, + { + "epoch": 2.4, + "grad_norm": 0.7483928799629211, + "learning_rate": 5.753947526070254e-05, + "loss": 2.9758, + "step": 48945 + }, + { + "epoch": 2.4, + "grad_norm": 0.7303197383880615, + "learning_rate": 5.753040867559132e-05, + "loss": 3.0986, + "step": 48946 + }, + { + "epoch": 2.4, + "grad_norm": 0.7678343653678894, + "learning_rate": 5.752134272909833e-05, + "loss": 2.8695, + "step": 48947 + }, + { + "epoch": 2.4, + "grad_norm": 0.7396532893180847, + "learning_rate": 5.751227742124751e-05, + "loss": 3.1046, + "step": 48948 + }, + { + "epoch": 2.4, + "grad_norm": 0.7055127620697021, + "learning_rate": 5.7503212752062646e-05, + "loss": 2.9769, + "step": 48949 + }, + { + "epoch": 2.4, + "grad_norm": 0.8863729238510132, + "learning_rate": 5.7494148721567676e-05, + "loss": 2.8523, + "step": 48950 + }, + { + "epoch": 2.4, + "grad_norm": 0.7113443613052368, + "learning_rate": 5.748508532978649e-05, + "loss": 2.8839, + "step": 48951 + }, + { + "epoch": 2.4, + "grad_norm": 0.7774294018745422, + "learning_rate": 5.747602257674284e-05, + "loss": 2.8566, + "step": 48952 + }, + { + "epoch": 2.4, + "grad_norm": 0.747187614440918, + "learning_rate": 5.746696046246073e-05, + "loss": 2.8786, + "step": 48953 + }, + { + "epoch": 2.4, + "grad_norm": 0.6938170790672302, + "learning_rate": 5.745789898696388e-05, + "loss": 2.8971, + "step": 48954 + }, + { + "epoch": 2.4, + "grad_norm": 0.7477014064788818, + "learning_rate": 5.7448838150276366e-05, + "loss": 2.9451, + "step": 48955 + }, + { + "epoch": 2.4, + "grad_norm": 0.7510203123092651, + "learning_rate": 5.7439777952421805e-05, + "loss": 2.918, + "step": 48956 + }, + { + "epoch": 2.4, + "grad_norm": 0.7017959356307983, + "learning_rate": 5.7430718393424315e-05, + "loss": 2.7877, + "step": 48957 + }, + { + "epoch": 2.4, + "grad_norm": 0.7510301470756531, + "learning_rate": 5.7421659473307605e-05, + "loss": 2.9053, + "step": 48958 + }, + { + "epoch": 2.4, + "grad_norm": 0.6991977095603943, + "learning_rate": 5.7412601192095454e-05, + "loss": 3.0594, + "step": 48959 + }, + { + "epoch": 2.4, + "grad_norm": 0.6977912783622742, + "learning_rate": 5.740354354981195e-05, + "loss": 3.0323, + "step": 48960 + }, + { + "epoch": 2.4, + "grad_norm": 0.7294617295265198, + "learning_rate": 5.73944865464807e-05, + "loss": 3.0129, + "step": 48961 + }, + { + "epoch": 2.4, + "grad_norm": 0.7957404255867004, + "learning_rate": 5.738543018212573e-05, + "loss": 2.7911, + "step": 48962 + }, + { + "epoch": 2.4, + "grad_norm": 0.850426971912384, + "learning_rate": 5.73763744567709e-05, + "loss": 2.9213, + "step": 48963 + }, + { + "epoch": 2.4, + "grad_norm": 0.7113354802131653, + "learning_rate": 5.7367319370440014e-05, + "loss": 2.8616, + "step": 48964 + }, + { + "epoch": 2.4, + "grad_norm": 0.77390456199646, + "learning_rate": 5.735826492315687e-05, + "loss": 3.1204, + "step": 48965 + }, + { + "epoch": 2.4, + "grad_norm": 0.734718382358551, + "learning_rate": 5.7349211114945325e-05, + "loss": 2.8761, + "step": 48966 + }, + { + "epoch": 2.4, + "grad_norm": 0.9392869472503662, + "learning_rate": 5.734015794582926e-05, + "loss": 2.9756, + "step": 48967 + }, + { + "epoch": 2.4, + "grad_norm": 0.7067123055458069, + "learning_rate": 5.733110541583258e-05, + "loss": 3.0634, + "step": 48968 + }, + { + "epoch": 2.4, + "grad_norm": 0.7297945618629456, + "learning_rate": 5.732205352497901e-05, + "loss": 2.9535, + "step": 48969 + }, + { + "epoch": 2.4, + "grad_norm": 0.7064141631126404, + "learning_rate": 5.731300227329249e-05, + "loss": 3.0467, + "step": 48970 + }, + { + "epoch": 2.4, + "grad_norm": 0.7072996497154236, + "learning_rate": 5.730395166079677e-05, + "loss": 2.9136, + "step": 48971 + }, + { + "epoch": 2.4, + "grad_norm": 0.749560534954071, + "learning_rate": 5.7294901687515795e-05, + "loss": 2.8613, + "step": 48972 + }, + { + "epoch": 2.4, + "grad_norm": 0.735333263874054, + "learning_rate": 5.7285852353473315e-05, + "loss": 2.983, + "step": 48973 + }, + { + "epoch": 2.4, + "grad_norm": 0.7418922185897827, + "learning_rate": 5.727680365869315e-05, + "loss": 3.1744, + "step": 48974 + }, + { + "epoch": 2.4, + "grad_norm": 0.7562124133110046, + "learning_rate": 5.726775560319924e-05, + "loss": 2.8761, + "step": 48975 + }, + { + "epoch": 2.4, + "grad_norm": 0.724221408367157, + "learning_rate": 5.725870818701527e-05, + "loss": 2.9732, + "step": 48976 + }, + { + "epoch": 2.4, + "grad_norm": 0.7300118207931519, + "learning_rate": 5.724966141016515e-05, + "loss": 2.77, + "step": 48977 + }, + { + "epoch": 2.4, + "grad_norm": 0.7261303663253784, + "learning_rate": 5.7240615272672763e-05, + "loss": 2.919, + "step": 48978 + }, + { + "epoch": 2.4, + "grad_norm": 0.7926436066627502, + "learning_rate": 5.723156977456189e-05, + "loss": 2.8964, + "step": 48979 + }, + { + "epoch": 2.4, + "grad_norm": 0.7375580072402954, + "learning_rate": 5.7222524915856314e-05, + "loss": 3.0317, + "step": 48980 + }, + { + "epoch": 2.4, + "grad_norm": 0.7428630590438843, + "learning_rate": 5.721348069657985e-05, + "loss": 2.9936, + "step": 48981 + }, + { + "epoch": 2.4, + "grad_norm": 0.766548216342926, + "learning_rate": 5.720443711675631e-05, + "loss": 2.9699, + "step": 48982 + }, + { + "epoch": 2.4, + "grad_norm": 0.7440727949142456, + "learning_rate": 5.719539417640967e-05, + "loss": 3.1029, + "step": 48983 + }, + { + "epoch": 2.4, + "grad_norm": 0.7492101192474365, + "learning_rate": 5.718635187556355e-05, + "loss": 2.7294, + "step": 48984 + }, + { + "epoch": 2.4, + "grad_norm": 0.8655734658241272, + "learning_rate": 5.71773102142419e-05, + "loss": 2.683, + "step": 48985 + }, + { + "epoch": 2.4, + "grad_norm": 0.7333561182022095, + "learning_rate": 5.71682691924685e-05, + "loss": 2.9099, + "step": 48986 + }, + { + "epoch": 2.4, + "grad_norm": 0.7671554684638977, + "learning_rate": 5.715922881026705e-05, + "loss": 2.9234, + "step": 48987 + }, + { + "epoch": 2.4, + "grad_norm": 0.6992800235748291, + "learning_rate": 5.715018906766156e-05, + "loss": 3.0187, + "step": 48988 + }, + { + "epoch": 2.4, + "grad_norm": 0.707787036895752, + "learning_rate": 5.714114996467565e-05, + "loss": 2.8687, + "step": 48989 + }, + { + "epoch": 2.4, + "grad_norm": 0.7215185165405273, + "learning_rate": 5.713211150133328e-05, + "loss": 2.97, + "step": 48990 + }, + { + "epoch": 2.4, + "grad_norm": 0.7288063168525696, + "learning_rate": 5.7123073677658126e-05, + "loss": 3.0844, + "step": 48991 + }, + { + "epoch": 2.4, + "grad_norm": 0.6853630542755127, + "learning_rate": 5.7114036493674045e-05, + "loss": 3.0357, + "step": 48992 + }, + { + "epoch": 2.4, + "grad_norm": 0.694296658039093, + "learning_rate": 5.710499994940491e-05, + "loss": 3.081, + "step": 48993 + }, + { + "epoch": 2.4, + "grad_norm": 0.7616432905197144, + "learning_rate": 5.7095964044874496e-05, + "loss": 3.1458, + "step": 48994 + }, + { + "epoch": 2.4, + "grad_norm": 0.7581666707992554, + "learning_rate": 5.708692878010653e-05, + "loss": 2.9142, + "step": 48995 + }, + { + "epoch": 2.4, + "grad_norm": 0.8747384548187256, + "learning_rate": 5.7077894155124795e-05, + "loss": 2.6311, + "step": 48996 + }, + { + "epoch": 2.4, + "grad_norm": 0.71066814661026, + "learning_rate": 5.706886016995309e-05, + "loss": 2.8955, + "step": 48997 + }, + { + "epoch": 2.4, + "grad_norm": 0.7173435688018799, + "learning_rate": 5.7059826824615377e-05, + "loss": 2.8613, + "step": 48998 + }, + { + "epoch": 2.4, + "grad_norm": 0.7304638028144836, + "learning_rate": 5.705079411913522e-05, + "loss": 2.5475, + "step": 48999 + }, + { + "epoch": 2.4, + "grad_norm": 0.7528974413871765, + "learning_rate": 5.704176205353662e-05, + "loss": 2.7217, + "step": 49000 + }, + { + "epoch": 2.4, + "grad_norm": 0.7631033062934875, + "learning_rate": 5.703273062784321e-05, + "loss": 2.8355, + "step": 49001 + }, + { + "epoch": 2.4, + "grad_norm": 0.734296441078186, + "learning_rate": 5.702369984207882e-05, + "loss": 2.9648, + "step": 49002 + }, + { + "epoch": 2.4, + "grad_norm": 0.6912224888801575, + "learning_rate": 5.701466969626728e-05, + "loss": 3.0517, + "step": 49003 + }, + { + "epoch": 2.4, + "grad_norm": 0.7426257133483887, + "learning_rate": 5.700564019043224e-05, + "loss": 2.8531, + "step": 49004 + }, + { + "epoch": 2.4, + "grad_norm": 0.795962393283844, + "learning_rate": 5.6996611324597695e-05, + "loss": 2.8197, + "step": 49005 + }, + { + "epoch": 2.4, + "grad_norm": 0.7349714040756226, + "learning_rate": 5.698758309878721e-05, + "loss": 2.938, + "step": 49006 + }, + { + "epoch": 2.4, + "grad_norm": 0.7070464491844177, + "learning_rate": 5.697855551302475e-05, + "loss": 3.0683, + "step": 49007 + }, + { + "epoch": 2.4, + "grad_norm": 0.7317219972610474, + "learning_rate": 5.696952856733401e-05, + "loss": 2.8491, + "step": 49008 + }, + { + "epoch": 2.4, + "grad_norm": 0.7292014956474304, + "learning_rate": 5.696050226173865e-05, + "loss": 2.8261, + "step": 49009 + }, + { + "epoch": 2.4, + "grad_norm": 0.7460183501243591, + "learning_rate": 5.695147659626265e-05, + "loss": 2.7796, + "step": 49010 + }, + { + "epoch": 2.4, + "grad_norm": 0.7135501503944397, + "learning_rate": 5.6942451570929614e-05, + "loss": 3.0784, + "step": 49011 + }, + { + "epoch": 2.4, + "grad_norm": 0.7707333564758301, + "learning_rate": 5.6933427185763466e-05, + "loss": 3.0009, + "step": 49012 + }, + { + "epoch": 2.4, + "grad_norm": 0.7286562323570251, + "learning_rate": 5.692440344078779e-05, + "loss": 2.8597, + "step": 49013 + }, + { + "epoch": 2.4, + "grad_norm": 0.7598409056663513, + "learning_rate": 5.6915380336026494e-05, + "loss": 3.0185, + "step": 49014 + }, + { + "epoch": 2.4, + "grad_norm": 0.7345313429832458, + "learning_rate": 5.6906357871503326e-05, + "loss": 3.0258, + "step": 49015 + }, + { + "epoch": 2.4, + "grad_norm": 0.7436714172363281, + "learning_rate": 5.689733604724207e-05, + "loss": 2.8548, + "step": 49016 + }, + { + "epoch": 2.4, + "grad_norm": 0.750116765499115, + "learning_rate": 5.6888314863266436e-05, + "loss": 2.9594, + "step": 49017 + }, + { + "epoch": 2.4, + "grad_norm": 0.7049277424812317, + "learning_rate": 5.687929431960011e-05, + "loss": 2.9449, + "step": 49018 + }, + { + "epoch": 2.4, + "grad_norm": 0.7214593291282654, + "learning_rate": 5.6870274416266934e-05, + "loss": 3.0394, + "step": 49019 + }, + { + "epoch": 2.4, + "grad_norm": 0.7183932662010193, + "learning_rate": 5.6861255153290765e-05, + "loss": 2.8671, + "step": 49020 + }, + { + "epoch": 2.4, + "grad_norm": 0.7463213801383972, + "learning_rate": 5.685223653069514e-05, + "loss": 2.714, + "step": 49021 + }, + { + "epoch": 2.4, + "grad_norm": 0.7598058581352234, + "learning_rate": 5.684321854850401e-05, + "loss": 2.9408, + "step": 49022 + }, + { + "epoch": 2.4, + "grad_norm": 0.7609604597091675, + "learning_rate": 5.683420120674106e-05, + "loss": 2.9313, + "step": 49023 + }, + { + "epoch": 2.4, + "grad_norm": 0.7309617400169373, + "learning_rate": 5.682518450542996e-05, + "loss": 2.8476, + "step": 49024 + }, + { + "epoch": 2.4, + "grad_norm": 0.7466161251068115, + "learning_rate": 5.6816168444594576e-05, + "loss": 2.8262, + "step": 49025 + }, + { + "epoch": 2.4, + "grad_norm": 0.7562272548675537, + "learning_rate": 5.680715302425855e-05, + "loss": 2.8261, + "step": 49026 + }, + { + "epoch": 2.4, + "grad_norm": 0.7695786952972412, + "learning_rate": 5.6798138244445745e-05, + "loss": 2.9201, + "step": 49027 + }, + { + "epoch": 2.4, + "grad_norm": 0.686572790145874, + "learning_rate": 5.678912410517976e-05, + "loss": 2.8701, + "step": 49028 + }, + { + "epoch": 2.4, + "grad_norm": 0.746677815914154, + "learning_rate": 5.678011060648441e-05, + "loss": 2.8989, + "step": 49029 + }, + { + "epoch": 2.4, + "grad_norm": 0.7117769122123718, + "learning_rate": 5.677109774838363e-05, + "loss": 2.8207, + "step": 49030 + }, + { + "epoch": 2.4, + "grad_norm": 0.7121254205703735, + "learning_rate": 5.676208553090078e-05, + "loss": 3.1207, + "step": 49031 + }, + { + "epoch": 2.4, + "grad_norm": 0.7127178907394409, + "learning_rate": 5.6753073954059866e-05, + "loss": 3.032, + "step": 49032 + }, + { + "epoch": 2.4, + "grad_norm": 0.7273410558700562, + "learning_rate": 5.6744063017884447e-05, + "loss": 2.8643, + "step": 49033 + }, + { + "epoch": 2.4, + "grad_norm": 0.7432453632354736, + "learning_rate": 5.673505272239839e-05, + "loss": 2.9869, + "step": 49034 + }, + { + "epoch": 2.4, + "grad_norm": 0.7287410497665405, + "learning_rate": 5.6726043067625447e-05, + "loss": 2.8154, + "step": 49035 + }, + { + "epoch": 2.4, + "grad_norm": 0.7052610516548157, + "learning_rate": 5.6717034053589205e-05, + "loss": 2.8844, + "step": 49036 + }, + { + "epoch": 2.4, + "grad_norm": 0.7654694318771362, + "learning_rate": 5.670802568031354e-05, + "loss": 2.9412, + "step": 49037 + }, + { + "epoch": 2.4, + "grad_norm": 0.7241032123565674, + "learning_rate": 5.6699017947822137e-05, + "loss": 2.8518, + "step": 49038 + }, + { + "epoch": 2.4, + "grad_norm": 0.7288097143173218, + "learning_rate": 5.66900108561386e-05, + "loss": 2.8373, + "step": 49039 + }, + { + "epoch": 2.4, + "grad_norm": 0.7443707585334778, + "learning_rate": 5.6681004405286826e-05, + "loss": 2.9401, + "step": 49040 + }, + { + "epoch": 2.4, + "grad_norm": 0.7623165249824524, + "learning_rate": 5.667199859529038e-05, + "loss": 2.8009, + "step": 49041 + }, + { + "epoch": 2.4, + "grad_norm": 0.7050398588180542, + "learning_rate": 5.666299342617315e-05, + "loss": 2.7906, + "step": 49042 + }, + { + "epoch": 2.4, + "grad_norm": 0.7741838097572327, + "learning_rate": 5.665398889795869e-05, + "loss": 2.944, + "step": 49043 + }, + { + "epoch": 2.4, + "grad_norm": 0.7164863348007202, + "learning_rate": 5.6644985010670877e-05, + "loss": 2.9287, + "step": 49044 + }, + { + "epoch": 2.4, + "grad_norm": 0.7640129327774048, + "learning_rate": 5.663598176433332e-05, + "loss": 2.988, + "step": 49045 + }, + { + "epoch": 2.4, + "grad_norm": 0.7587370872497559, + "learning_rate": 5.662697915896967e-05, + "loss": 2.884, + "step": 49046 + }, + { + "epoch": 2.4, + "grad_norm": 0.8077144026756287, + "learning_rate": 5.66179771946038e-05, + "loss": 2.9191, + "step": 49047 + }, + { + "epoch": 2.4, + "grad_norm": 0.7830314636230469, + "learning_rate": 5.660897587125928e-05, + "loss": 2.935, + "step": 49048 + }, + { + "epoch": 2.4, + "grad_norm": 0.7073858976364136, + "learning_rate": 5.6599975188959835e-05, + "loss": 3.1283, + "step": 49049 + }, + { + "epoch": 2.4, + "grad_norm": 0.7274441719055176, + "learning_rate": 5.659097514772932e-05, + "loss": 2.8923, + "step": 49050 + }, + { + "epoch": 2.4, + "grad_norm": 0.7369108200073242, + "learning_rate": 5.658197574759126e-05, + "loss": 2.8339, + "step": 49051 + }, + { + "epoch": 2.4, + "grad_norm": 0.7142553925514221, + "learning_rate": 5.6572976988569486e-05, + "loss": 2.8431, + "step": 49052 + }, + { + "epoch": 2.4, + "grad_norm": 0.7000463604927063, + "learning_rate": 5.656397887068765e-05, + "loss": 2.8695, + "step": 49053 + }, + { + "epoch": 2.4, + "grad_norm": 0.693121612071991, + "learning_rate": 5.655498139396938e-05, + "loss": 2.7841, + "step": 49054 + }, + { + "epoch": 2.4, + "grad_norm": 0.6943217515945435, + "learning_rate": 5.654598455843851e-05, + "loss": 3.0532, + "step": 49055 + }, + { + "epoch": 2.4, + "grad_norm": 0.7242630124092102, + "learning_rate": 5.653698836411855e-05, + "loss": 2.8341, + "step": 49056 + }, + { + "epoch": 2.4, + "grad_norm": 0.7549852132797241, + "learning_rate": 5.6527992811033427e-05, + "loss": 2.8954, + "step": 49057 + }, + { + "epoch": 2.4, + "grad_norm": 0.6854625940322876, + "learning_rate": 5.6518997899206616e-05, + "loss": 3.1429, + "step": 49058 + }, + { + "epoch": 2.4, + "grad_norm": 0.7364606857299805, + "learning_rate": 5.6510003628662006e-05, + "loss": 2.7754, + "step": 49059 + }, + { + "epoch": 2.4, + "grad_norm": 0.7241211533546448, + "learning_rate": 5.6501009999423164e-05, + "loss": 3.0467, + "step": 49060 + }, + { + "epoch": 2.4, + "grad_norm": 0.7505494952201843, + "learning_rate": 5.649201701151371e-05, + "loss": 3.0433, + "step": 49061 + }, + { + "epoch": 2.4, + "grad_norm": 0.7099612355232239, + "learning_rate": 5.648302466495753e-05, + "loss": 2.716, + "step": 49062 + }, + { + "epoch": 2.4, + "grad_norm": 0.7239779233932495, + "learning_rate": 5.6474032959778104e-05, + "loss": 2.9557, + "step": 49063 + }, + { + "epoch": 2.4, + "grad_norm": 0.7321217656135559, + "learning_rate": 5.6465041895999205e-05, + "loss": 2.7211, + "step": 49064 + }, + { + "epoch": 2.4, + "grad_norm": 0.7070171236991882, + "learning_rate": 5.645605147364459e-05, + "loss": 2.7646, + "step": 49065 + }, + { + "epoch": 2.4, + "grad_norm": 0.6989107131958008, + "learning_rate": 5.644706169273787e-05, + "loss": 2.9561, + "step": 49066 + }, + { + "epoch": 2.4, + "grad_norm": 0.730031430721283, + "learning_rate": 5.6438072553302725e-05, + "loss": 2.9178, + "step": 49067 + }, + { + "epoch": 2.4, + "grad_norm": 0.7238811254501343, + "learning_rate": 5.642908405536274e-05, + "loss": 2.7931, + "step": 49068 + }, + { + "epoch": 2.4, + "grad_norm": 0.7353603839874268, + "learning_rate": 5.6420096198941654e-05, + "loss": 2.9666, + "step": 49069 + }, + { + "epoch": 2.4, + "grad_norm": 0.738623321056366, + "learning_rate": 5.6411108984063255e-05, + "loss": 2.9291, + "step": 49070 + }, + { + "epoch": 2.4, + "grad_norm": 0.7306110858917236, + "learning_rate": 5.640212241075103e-05, + "loss": 2.9059, + "step": 49071 + }, + { + "epoch": 2.4, + "grad_norm": 0.7487077713012695, + "learning_rate": 5.639313647902881e-05, + "loss": 2.8363, + "step": 49072 + }, + { + "epoch": 2.4, + "grad_norm": 0.7644553184509277, + "learning_rate": 5.638415118892009e-05, + "loss": 2.9736, + "step": 49073 + }, + { + "epoch": 2.4, + "grad_norm": 1.27792489528656, + "learning_rate": 5.6375166540448744e-05, + "loss": 3.1355, + "step": 49074 + }, + { + "epoch": 2.41, + "grad_norm": 0.7529338598251343, + "learning_rate": 5.636618253363829e-05, + "loss": 2.8665, + "step": 49075 + }, + { + "epoch": 2.41, + "grad_norm": 0.7581132650375366, + "learning_rate": 5.635719916851238e-05, + "loss": 3.0295, + "step": 49076 + }, + { + "epoch": 2.41, + "grad_norm": 0.7204800248146057, + "learning_rate": 5.634821644509479e-05, + "loss": 2.7356, + "step": 49077 + }, + { + "epoch": 2.41, + "grad_norm": 0.7828012704849243, + "learning_rate": 5.6339234363408994e-05, + "loss": 2.927, + "step": 49078 + }, + { + "epoch": 2.41, + "grad_norm": 0.786764919757843, + "learning_rate": 5.633025292347878e-05, + "loss": 2.9383, + "step": 49079 + }, + { + "epoch": 2.41, + "grad_norm": 0.7644099593162537, + "learning_rate": 5.6321272125327865e-05, + "loss": 2.9283, + "step": 49080 + }, + { + "epoch": 2.41, + "grad_norm": 0.7182053923606873, + "learning_rate": 5.631229196897983e-05, + "loss": 2.9578, + "step": 49081 + }, + { + "epoch": 2.41, + "grad_norm": 0.7207154631614685, + "learning_rate": 5.6303312454458316e-05, + "loss": 3.0525, + "step": 49082 + }, + { + "epoch": 2.41, + "grad_norm": 0.773795485496521, + "learning_rate": 5.6294333581786875e-05, + "loss": 3.0164, + "step": 49083 + }, + { + "epoch": 2.41, + "grad_norm": 0.7547546625137329, + "learning_rate": 5.628535535098929e-05, + "loss": 3.0012, + "step": 49084 + }, + { + "epoch": 2.41, + "grad_norm": 0.7342237830162048, + "learning_rate": 5.6276377762089243e-05, + "loss": 2.9923, + "step": 49085 + }, + { + "epoch": 2.41, + "grad_norm": 0.7141954898834229, + "learning_rate": 5.626740081511024e-05, + "loss": 2.6876, + "step": 49086 + }, + { + "epoch": 2.41, + "grad_norm": 0.7377126216888428, + "learning_rate": 5.6258424510076074e-05, + "loss": 2.8332, + "step": 49087 + }, + { + "epoch": 2.41, + "grad_norm": 0.7284415364265442, + "learning_rate": 5.624944884701028e-05, + "loss": 3.057, + "step": 49088 + }, + { + "epoch": 2.41, + "grad_norm": 0.7273825407028198, + "learning_rate": 5.624047382593655e-05, + "loss": 2.7143, + "step": 49089 + }, + { + "epoch": 2.41, + "grad_norm": 0.7405761480331421, + "learning_rate": 5.6231499446878424e-05, + "loss": 2.8364, + "step": 49090 + }, + { + "epoch": 2.41, + "grad_norm": 0.7068529725074768, + "learning_rate": 5.622252570985959e-05, + "loss": 2.8806, + "step": 49091 + }, + { + "epoch": 2.41, + "grad_norm": 0.6906842589378357, + "learning_rate": 5.621355261490382e-05, + "loss": 2.7804, + "step": 49092 + }, + { + "epoch": 2.41, + "grad_norm": 0.7543202638626099, + "learning_rate": 5.620458016203454e-05, + "loss": 2.6974, + "step": 49093 + }, + { + "epoch": 2.41, + "grad_norm": 0.735177218914032, + "learning_rate": 5.619560835127556e-05, + "loss": 3.0508, + "step": 49094 + }, + { + "epoch": 2.41, + "grad_norm": 0.733551025390625, + "learning_rate": 5.618663718265032e-05, + "loss": 2.8695, + "step": 49095 + }, + { + "epoch": 2.41, + "grad_norm": 0.7386725544929504, + "learning_rate": 5.617766665618267e-05, + "loss": 2.9166, + "step": 49096 + }, + { + "epoch": 2.41, + "grad_norm": 0.7156366109848022, + "learning_rate": 5.61686967718961e-05, + "loss": 2.7209, + "step": 49097 + }, + { + "epoch": 2.41, + "grad_norm": 0.7398567199707031, + "learning_rate": 5.615972752981419e-05, + "loss": 2.716, + "step": 49098 + }, + { + "epoch": 2.41, + "grad_norm": 0.7204150557518005, + "learning_rate": 5.6150758929960715e-05, + "loss": 3.0253, + "step": 49099 + }, + { + "epoch": 2.41, + "grad_norm": 0.7159655690193176, + "learning_rate": 5.614179097235913e-05, + "loss": 2.8584, + "step": 49100 + }, + { + "epoch": 2.41, + "grad_norm": 0.6986108422279358, + "learning_rate": 5.61328236570331e-05, + "loss": 2.7455, + "step": 49101 + }, + { + "epoch": 2.41, + "grad_norm": 0.7308990955352783, + "learning_rate": 5.612385698400639e-05, + "loss": 3.1152, + "step": 49102 + }, + { + "epoch": 2.41, + "grad_norm": 0.7399376034736633, + "learning_rate": 5.611489095330251e-05, + "loss": 2.8173, + "step": 49103 + }, + { + "epoch": 2.41, + "grad_norm": 0.7102044820785522, + "learning_rate": 5.6105925564945066e-05, + "loss": 2.8044, + "step": 49104 + }, + { + "epoch": 2.41, + "grad_norm": 0.7224224805831909, + "learning_rate": 5.609696081895758e-05, + "loss": 2.9648, + "step": 49105 + }, + { + "epoch": 2.41, + "grad_norm": 0.6727746725082397, + "learning_rate": 5.608799671536377e-05, + "loss": 2.7387, + "step": 49106 + }, + { + "epoch": 2.41, + "grad_norm": 0.7503066062927246, + "learning_rate": 5.607903325418731e-05, + "loss": 3.0684, + "step": 49107 + }, + { + "epoch": 2.41, + "grad_norm": 0.7269359827041626, + "learning_rate": 5.6070070435451685e-05, + "loss": 2.9476, + "step": 49108 + }, + { + "epoch": 2.41, + "grad_norm": 0.7300078868865967, + "learning_rate": 5.6061108259180575e-05, + "loss": 2.7939, + "step": 49109 + }, + { + "epoch": 2.41, + "grad_norm": 0.7930198311805725, + "learning_rate": 5.60521467253976e-05, + "loss": 2.9471, + "step": 49110 + }, + { + "epoch": 2.41, + "grad_norm": 0.7446059584617615, + "learning_rate": 5.604318583412622e-05, + "loss": 2.9892, + "step": 49111 + }, + { + "epoch": 2.41, + "grad_norm": 0.6801463961601257, + "learning_rate": 5.6034225585390245e-05, + "loss": 2.8705, + "step": 49112 + }, + { + "epoch": 2.41, + "grad_norm": 0.82508385181427, + "learning_rate": 5.6025265979213074e-05, + "loss": 2.7941, + "step": 49113 + }, + { + "epoch": 2.41, + "grad_norm": 0.7679817080497742, + "learning_rate": 5.6016307015618454e-05, + "loss": 2.8535, + "step": 49114 + }, + { + "epoch": 2.41, + "grad_norm": 0.7149144411087036, + "learning_rate": 5.600734869462987e-05, + "loss": 2.7853, + "step": 49115 + }, + { + "epoch": 2.41, + "grad_norm": 0.7261170148849487, + "learning_rate": 5.599839101627097e-05, + "loss": 2.8658, + "step": 49116 + }, + { + "epoch": 2.41, + "grad_norm": 0.7425785660743713, + "learning_rate": 5.5989433980565435e-05, + "loss": 2.9014, + "step": 49117 + }, + { + "epoch": 2.41, + "grad_norm": 0.7709639072418213, + "learning_rate": 5.5980477587536775e-05, + "loss": 2.8676, + "step": 49118 + }, + { + "epoch": 2.41, + "grad_norm": 0.6940927505493164, + "learning_rate": 5.5971521837208546e-05, + "loss": 2.9155, + "step": 49119 + }, + { + "epoch": 2.41, + "grad_norm": 0.747685968875885, + "learning_rate": 5.5962566729604317e-05, + "loss": 2.6554, + "step": 49120 + }, + { + "epoch": 2.41, + "grad_norm": 0.7228460907936096, + "learning_rate": 5.595361226474768e-05, + "loss": 2.7611, + "step": 49121 + }, + { + "epoch": 2.41, + "grad_norm": 0.6861898303031921, + "learning_rate": 5.5944658442662385e-05, + "loss": 2.8041, + "step": 49122 + }, + { + "epoch": 2.41, + "grad_norm": 0.7472571134567261, + "learning_rate": 5.593570526337176e-05, + "loss": 2.7697, + "step": 49123 + }, + { + "epoch": 2.41, + "grad_norm": 0.7464053630828857, + "learning_rate": 5.592675272689965e-05, + "loss": 2.8793, + "step": 49124 + }, + { + "epoch": 2.41, + "grad_norm": 0.6809940338134766, + "learning_rate": 5.591780083326948e-05, + "loss": 2.857, + "step": 49125 + }, + { + "epoch": 2.41, + "grad_norm": 0.6927462220191956, + "learning_rate": 5.5908849582504764e-05, + "loss": 2.9232, + "step": 49126 + }, + { + "epoch": 2.41, + "grad_norm": 0.7227660417556763, + "learning_rate": 5.589989897462923e-05, + "loss": 2.784, + "step": 49127 + }, + { + "epoch": 2.41, + "grad_norm": 0.7501949071884155, + "learning_rate": 5.589094900966632e-05, + "loss": 2.7866, + "step": 49128 + }, + { + "epoch": 2.41, + "grad_norm": 0.7532203197479248, + "learning_rate": 5.5881999687639746e-05, + "loss": 2.9745, + "step": 49129 + }, + { + "epoch": 2.41, + "grad_norm": 0.7279804944992065, + "learning_rate": 5.587305100857292e-05, + "loss": 2.9465, + "step": 49130 + }, + { + "epoch": 2.41, + "grad_norm": 0.7034523487091064, + "learning_rate": 5.58641029724895e-05, + "loss": 2.9068, + "step": 49131 + }, + { + "epoch": 2.41, + "grad_norm": 0.7209697961807251, + "learning_rate": 5.585515557941322e-05, + "loss": 2.7805, + "step": 49132 + }, + { + "epoch": 2.41, + "grad_norm": 0.7370447516441345, + "learning_rate": 5.584620882936731e-05, + "loss": 2.8805, + "step": 49133 + }, + { + "epoch": 2.41, + "grad_norm": 0.7712350487709045, + "learning_rate": 5.5837262722375574e-05, + "loss": 2.7915, + "step": 49134 + }, + { + "epoch": 2.41, + "grad_norm": 0.7178522944450378, + "learning_rate": 5.58283172584614e-05, + "loss": 2.9186, + "step": 49135 + }, + { + "epoch": 2.41, + "grad_norm": 0.730476438999176, + "learning_rate": 5.581937243764846e-05, + "loss": 2.9216, + "step": 49136 + }, + { + "epoch": 2.41, + "grad_norm": 0.7608109712600708, + "learning_rate": 5.5810428259960375e-05, + "loss": 3.1027, + "step": 49137 + }, + { + "epoch": 2.41, + "grad_norm": 0.7009885907173157, + "learning_rate": 5.580148472542055e-05, + "loss": 2.7478, + "step": 49138 + }, + { + "epoch": 2.41, + "grad_norm": 0.7496047019958496, + "learning_rate": 5.579254183405272e-05, + "loss": 2.9551, + "step": 49139 + }, + { + "epoch": 2.41, + "grad_norm": 0.7258723974227905, + "learning_rate": 5.578359958588028e-05, + "loss": 2.7385, + "step": 49140 + }, + { + "epoch": 2.41, + "grad_norm": 0.7414917945861816, + "learning_rate": 5.5774657980926815e-05, + "loss": 2.8253, + "step": 49141 + }, + { + "epoch": 2.41, + "grad_norm": 0.7586607933044434, + "learning_rate": 5.576571701921595e-05, + "loss": 2.8642, + "step": 49142 + }, + { + "epoch": 2.41, + "grad_norm": 0.7755255699157715, + "learning_rate": 5.575677670077112e-05, + "loss": 2.964, + "step": 49143 + }, + { + "epoch": 2.41, + "grad_norm": 0.7838589549064636, + "learning_rate": 5.574783702561598e-05, + "loss": 2.9531, + "step": 49144 + }, + { + "epoch": 2.41, + "grad_norm": 0.7484715580940247, + "learning_rate": 5.573889799377398e-05, + "loss": 2.9606, + "step": 49145 + }, + { + "epoch": 2.41, + "grad_norm": 0.7210723757743835, + "learning_rate": 5.572995960526879e-05, + "loss": 2.9659, + "step": 49146 + }, + { + "epoch": 2.41, + "grad_norm": 0.7425600290298462, + "learning_rate": 5.572102186012387e-05, + "loss": 3.0856, + "step": 49147 + }, + { + "epoch": 2.41, + "grad_norm": 0.7232694625854492, + "learning_rate": 5.571208475836269e-05, + "loss": 2.9543, + "step": 49148 + }, + { + "epoch": 2.41, + "grad_norm": 0.713165819644928, + "learning_rate": 5.570314830000895e-05, + "loss": 2.8556, + "step": 49149 + }, + { + "epoch": 2.41, + "grad_norm": 0.741438627243042, + "learning_rate": 5.569421248508601e-05, + "loss": 3.0215, + "step": 49150 + }, + { + "epoch": 2.41, + "grad_norm": 0.7061321139335632, + "learning_rate": 5.56852773136175e-05, + "loss": 3.0182, + "step": 49151 + }, + { + "epoch": 2.41, + "grad_norm": 0.7206674814224243, + "learning_rate": 5.5676342785627045e-05, + "loss": 2.8039, + "step": 49152 + }, + { + "epoch": 2.41, + "grad_norm": 0.7030647397041321, + "learning_rate": 5.566740890113799e-05, + "loss": 3.02, + "step": 49153 + }, + { + "epoch": 2.41, + "grad_norm": 0.7327060699462891, + "learning_rate": 5.565847566017403e-05, + "loss": 2.7408, + "step": 49154 + }, + { + "epoch": 2.41, + "grad_norm": 0.7196124196052551, + "learning_rate": 5.5649543062758617e-05, + "loss": 2.8843, + "step": 49155 + }, + { + "epoch": 2.41, + "grad_norm": 0.7554773092269897, + "learning_rate": 5.5640611108915204e-05, + "loss": 2.8299, + "step": 49156 + }, + { + "epoch": 2.41, + "grad_norm": 0.7063097357749939, + "learning_rate": 5.5631679798667495e-05, + "loss": 3.0298, + "step": 49157 + }, + { + "epoch": 2.41, + "grad_norm": 0.7428682446479797, + "learning_rate": 5.5622749132038815e-05, + "loss": 3.0282, + "step": 49158 + }, + { + "epoch": 2.41, + "grad_norm": 0.7211458086967468, + "learning_rate": 5.561381910905287e-05, + "loss": 2.7996, + "step": 49159 + }, + { + "epoch": 2.41, + "grad_norm": 0.7217656373977661, + "learning_rate": 5.5604889729733016e-05, + "loss": 2.6599, + "step": 49160 + }, + { + "epoch": 2.41, + "grad_norm": 0.7122716903686523, + "learning_rate": 5.55959609941029e-05, + "loss": 2.7535, + "step": 49161 + }, + { + "epoch": 2.41, + "grad_norm": 0.7155596613883972, + "learning_rate": 5.558703290218599e-05, + "loss": 2.8666, + "step": 49162 + }, + { + "epoch": 2.41, + "grad_norm": 0.7553215622901917, + "learning_rate": 5.557810545400575e-05, + "loss": 2.9545, + "step": 49163 + }, + { + "epoch": 2.41, + "grad_norm": 0.727124810218811, + "learning_rate": 5.556917864958579e-05, + "loss": 2.8262, + "step": 49164 + }, + { + "epoch": 2.41, + "grad_norm": 0.7256839871406555, + "learning_rate": 5.556025248894952e-05, + "loss": 2.8445, + "step": 49165 + }, + { + "epoch": 2.41, + "grad_norm": 0.7141228318214417, + "learning_rate": 5.555132697212049e-05, + "loss": 2.7558, + "step": 49166 + }, + { + "epoch": 2.41, + "grad_norm": 0.7409408688545227, + "learning_rate": 5.5542402099122287e-05, + "loss": 2.7731, + "step": 49167 + }, + { + "epoch": 2.41, + "grad_norm": 0.7171137928962708, + "learning_rate": 5.553347786997835e-05, + "loss": 3.0352, + "step": 49168 + }, + { + "epoch": 2.41, + "grad_norm": 0.7588046789169312, + "learning_rate": 5.5524554284712196e-05, + "loss": 3.0276, + "step": 49169 + }, + { + "epoch": 2.41, + "grad_norm": 0.7045204639434814, + "learning_rate": 5.5515631343347244e-05, + "loss": 2.807, + "step": 49170 + }, + { + "epoch": 2.41, + "grad_norm": 0.7897858619689941, + "learning_rate": 5.550670904590714e-05, + "loss": 2.7827, + "step": 49171 + }, + { + "epoch": 2.41, + "grad_norm": 0.7354269623756409, + "learning_rate": 5.5497787392415227e-05, + "loss": 2.6785, + "step": 49172 + }, + { + "epoch": 2.41, + "grad_norm": 0.7657710313796997, + "learning_rate": 5.548886638289506e-05, + "loss": 2.7895, + "step": 49173 + }, + { + "epoch": 2.41, + "grad_norm": 0.789874255657196, + "learning_rate": 5.547994601737028e-05, + "loss": 2.8425, + "step": 49174 + }, + { + "epoch": 2.41, + "grad_norm": 0.7321118712425232, + "learning_rate": 5.547102629586417e-05, + "loss": 2.8132, + "step": 49175 + }, + { + "epoch": 2.41, + "grad_norm": 0.7470834851264954, + "learning_rate": 5.546210721840039e-05, + "loss": 2.9483, + "step": 49176 + }, + { + "epoch": 2.41, + "grad_norm": 0.725573718547821, + "learning_rate": 5.5453188785002366e-05, + "loss": 3.0163, + "step": 49177 + }, + { + "epoch": 2.41, + "grad_norm": 0.7329528331756592, + "learning_rate": 5.5444270995693496e-05, + "loss": 2.7606, + "step": 49178 + }, + { + "epoch": 2.41, + "grad_norm": 0.6969563364982605, + "learning_rate": 5.543535385049742e-05, + "loss": 2.8289, + "step": 49179 + }, + { + "epoch": 2.41, + "grad_norm": 0.7398406267166138, + "learning_rate": 5.542643734943749e-05, + "loss": 3.0499, + "step": 49180 + }, + { + "epoch": 2.41, + "grad_norm": 0.7166543006896973, + "learning_rate": 5.541752149253735e-05, + "loss": 2.9627, + "step": 49181 + }, + { + "epoch": 2.41, + "grad_norm": 0.7646118998527527, + "learning_rate": 5.540860627982029e-05, + "loss": 2.8877, + "step": 49182 + }, + { + "epoch": 2.41, + "grad_norm": 0.7465002536773682, + "learning_rate": 5.5399691711309944e-05, + "loss": 2.8294, + "step": 49183 + }, + { + "epoch": 2.41, + "grad_norm": 0.7415258288383484, + "learning_rate": 5.539077778702977e-05, + "loss": 3.1178, + "step": 49184 + }, + { + "epoch": 2.41, + "grad_norm": 0.7385164499282837, + "learning_rate": 5.538186450700312e-05, + "loss": 3.0409, + "step": 49185 + }, + { + "epoch": 2.41, + "grad_norm": 0.7214295864105225, + "learning_rate": 5.537295187125366e-05, + "loss": 2.9575, + "step": 49186 + }, + { + "epoch": 2.41, + "grad_norm": 0.742912769317627, + "learning_rate": 5.5364039879804646e-05, + "loss": 2.7598, + "step": 49187 + }, + { + "epoch": 2.41, + "grad_norm": 0.726211428642273, + "learning_rate": 5.535512853267969e-05, + "loss": 3.0765, + "step": 49188 + }, + { + "epoch": 2.41, + "grad_norm": 0.6831627488136292, + "learning_rate": 5.534621782990232e-05, + "loss": 2.8655, + "step": 49189 + }, + { + "epoch": 2.41, + "grad_norm": 0.7041841745376587, + "learning_rate": 5.5337307771495933e-05, + "loss": 2.9758, + "step": 49190 + }, + { + "epoch": 2.41, + "grad_norm": 0.7825841307640076, + "learning_rate": 5.5328398357483986e-05, + "loss": 2.8501, + "step": 49191 + }, + { + "epoch": 2.41, + "grad_norm": 0.795807957649231, + "learning_rate": 5.5319489587889886e-05, + "loss": 2.7646, + "step": 49192 + }, + { + "epoch": 2.41, + "grad_norm": 0.7698444724082947, + "learning_rate": 5.5310581462737155e-05, + "loss": 2.7806, + "step": 49193 + }, + { + "epoch": 2.41, + "grad_norm": 0.6966370940208435, + "learning_rate": 5.5301673982049314e-05, + "loss": 3.0312, + "step": 49194 + }, + { + "epoch": 2.41, + "grad_norm": 0.7309635877609253, + "learning_rate": 5.5292767145849726e-05, + "loss": 2.7085, + "step": 49195 + }, + { + "epoch": 2.41, + "grad_norm": 0.7802844643592834, + "learning_rate": 5.528386095416195e-05, + "loss": 2.9802, + "step": 49196 + }, + { + "epoch": 2.41, + "grad_norm": 0.7086724638938904, + "learning_rate": 5.5274955407009305e-05, + "loss": 3.0243, + "step": 49197 + }, + { + "epoch": 2.41, + "grad_norm": 0.8560823798179626, + "learning_rate": 5.5266050504415424e-05, + "loss": 2.8772, + "step": 49198 + }, + { + "epoch": 2.41, + "grad_norm": 0.6955791115760803, + "learning_rate": 5.525714624640367e-05, + "loss": 2.7175, + "step": 49199 + }, + { + "epoch": 2.41, + "grad_norm": 0.7265035510063171, + "learning_rate": 5.524824263299738e-05, + "loss": 2.8838, + "step": 49200 + }, + { + "epoch": 2.41, + "grad_norm": 0.7440124750137329, + "learning_rate": 5.523933966422025e-05, + "loss": 2.9098, + "step": 49201 + }, + { + "epoch": 2.41, + "grad_norm": 0.7409399151802063, + "learning_rate": 5.523043734009548e-05, + "loss": 2.7065, + "step": 49202 + }, + { + "epoch": 2.41, + "grad_norm": 0.7265484929084778, + "learning_rate": 5.522153566064662e-05, + "loss": 3.1762, + "step": 49203 + }, + { + "epoch": 2.41, + "grad_norm": 0.7463249564170837, + "learning_rate": 5.521263462589722e-05, + "loss": 2.8447, + "step": 49204 + }, + { + "epoch": 2.41, + "grad_norm": 0.7473812699317932, + "learning_rate": 5.520373423587064e-05, + "loss": 2.7834, + "step": 49205 + }, + { + "epoch": 2.41, + "grad_norm": 0.725105881690979, + "learning_rate": 5.51948344905903e-05, + "loss": 3.1029, + "step": 49206 + }, + { + "epoch": 2.41, + "grad_norm": 0.694555938243866, + "learning_rate": 5.518593539007957e-05, + "loss": 2.9168, + "step": 49207 + }, + { + "epoch": 2.41, + "grad_norm": 0.7044192552566528, + "learning_rate": 5.5177036934362e-05, + "loss": 2.7009, + "step": 49208 + }, + { + "epoch": 2.41, + "grad_norm": 0.7090144753456116, + "learning_rate": 5.516813912346102e-05, + "loss": 3.0191, + "step": 49209 + }, + { + "epoch": 2.41, + "grad_norm": 0.7357432842254639, + "learning_rate": 5.515924195740002e-05, + "loss": 2.8309, + "step": 49210 + }, + { + "epoch": 2.41, + "grad_norm": 0.6832718253135681, + "learning_rate": 5.5150345436202495e-05, + "loss": 3.1344, + "step": 49211 + }, + { + "epoch": 2.41, + "grad_norm": 0.6962417960166931, + "learning_rate": 5.514144955989187e-05, + "loss": 2.9699, + "step": 49212 + }, + { + "epoch": 2.41, + "grad_norm": 0.7619498372077942, + "learning_rate": 5.513255432849144e-05, + "loss": 2.7511, + "step": 49213 + }, + { + "epoch": 2.41, + "grad_norm": 0.7212022542953491, + "learning_rate": 5.512365974202482e-05, + "loss": 2.8064, + "step": 49214 + }, + { + "epoch": 2.41, + "grad_norm": 0.7095093131065369, + "learning_rate": 5.511476580051528e-05, + "loss": 2.7405, + "step": 49215 + }, + { + "epoch": 2.41, + "grad_norm": 0.7469050884246826, + "learning_rate": 5.510587250398638e-05, + "loss": 2.773, + "step": 49216 + }, + { + "epoch": 2.41, + "grad_norm": 0.708609938621521, + "learning_rate": 5.509697985246143e-05, + "loss": 2.9376, + "step": 49217 + }, + { + "epoch": 2.41, + "grad_norm": 0.7691136598587036, + "learning_rate": 5.508808784596388e-05, + "loss": 2.9028, + "step": 49218 + }, + { + "epoch": 2.41, + "grad_norm": 0.7040303945541382, + "learning_rate": 5.5079196484517274e-05, + "loss": 2.8717, + "step": 49219 + }, + { + "epoch": 2.41, + "grad_norm": 0.7086603045463562, + "learning_rate": 5.507030576814493e-05, + "loss": 2.9845, + "step": 49220 + }, + { + "epoch": 2.41, + "grad_norm": 0.7217570543289185, + "learning_rate": 5.5061415696870223e-05, + "loss": 3.0093, + "step": 49221 + }, + { + "epoch": 2.41, + "grad_norm": 0.7315422892570496, + "learning_rate": 5.505252627071655e-05, + "loss": 2.9514, + "step": 49222 + }, + { + "epoch": 2.41, + "grad_norm": 0.7391554117202759, + "learning_rate": 5.5043637489707406e-05, + "loss": 2.9218, + "step": 49223 + }, + { + "epoch": 2.41, + "grad_norm": 0.7177310585975647, + "learning_rate": 5.503474935386621e-05, + "loss": 2.8227, + "step": 49224 + }, + { + "epoch": 2.41, + "grad_norm": 0.7659426331520081, + "learning_rate": 5.502586186321629e-05, + "loss": 2.9178, + "step": 49225 + }, + { + "epoch": 2.41, + "grad_norm": 0.7270904183387756, + "learning_rate": 5.5016975017781184e-05, + "loss": 2.9833, + "step": 49226 + }, + { + "epoch": 2.41, + "grad_norm": 0.6957828998565674, + "learning_rate": 5.500808881758421e-05, + "loss": 2.883, + "step": 49227 + }, + { + "epoch": 2.41, + "grad_norm": 0.7378817200660706, + "learning_rate": 5.49992032626487e-05, + "loss": 2.8653, + "step": 49228 + }, + { + "epoch": 2.41, + "grad_norm": 0.750215470790863, + "learning_rate": 5.499031835299823e-05, + "loss": 2.5859, + "step": 49229 + }, + { + "epoch": 2.41, + "grad_norm": 0.790286123752594, + "learning_rate": 5.4981434088656026e-05, + "loss": 2.9305, + "step": 49230 + }, + { + "epoch": 2.41, + "grad_norm": 0.7604691386222839, + "learning_rate": 5.497255046964565e-05, + "loss": 2.8586, + "step": 49231 + }, + { + "epoch": 2.41, + "grad_norm": 0.7047723531723022, + "learning_rate": 5.496366749599034e-05, + "loss": 2.9883, + "step": 49232 + }, + { + "epoch": 2.41, + "grad_norm": 0.7287726998329163, + "learning_rate": 5.4954785167713645e-05, + "loss": 2.9042, + "step": 49233 + }, + { + "epoch": 2.41, + "grad_norm": 0.7202035188674927, + "learning_rate": 5.494590348483888e-05, + "loss": 2.8477, + "step": 49234 + }, + { + "epoch": 2.41, + "grad_norm": 0.7223737239837646, + "learning_rate": 5.4937022447389365e-05, + "loss": 2.9645, + "step": 49235 + }, + { + "epoch": 2.41, + "grad_norm": 0.7385289669036865, + "learning_rate": 5.4928142055388645e-05, + "loss": 2.9676, + "step": 49236 + }, + { + "epoch": 2.41, + "grad_norm": 0.7490689754486084, + "learning_rate": 5.491926230885997e-05, + "loss": 2.8101, + "step": 49237 + }, + { + "epoch": 2.41, + "grad_norm": 0.7428318858146667, + "learning_rate": 5.491038320782678e-05, + "loss": 2.9541, + "step": 49238 + }, + { + "epoch": 2.41, + "grad_norm": 0.7282332181930542, + "learning_rate": 5.490150475231257e-05, + "loss": 3.0235, + "step": 49239 + }, + { + "epoch": 2.41, + "grad_norm": 0.7052958011627197, + "learning_rate": 5.4892626942340533e-05, + "loss": 2.8107, + "step": 49240 + }, + { + "epoch": 2.41, + "grad_norm": 0.7355846166610718, + "learning_rate": 5.488374977793421e-05, + "loss": 3.0651, + "step": 49241 + }, + { + "epoch": 2.41, + "grad_norm": 0.7115058898925781, + "learning_rate": 5.4874873259116924e-05, + "loss": 2.8074, + "step": 49242 + }, + { + "epoch": 2.41, + "grad_norm": 0.6894238591194153, + "learning_rate": 5.4865997385911976e-05, + "loss": 2.9663, + "step": 49243 + }, + { + "epoch": 2.41, + "grad_norm": 0.7079063653945923, + "learning_rate": 5.48571221583429e-05, + "loss": 2.9716, + "step": 49244 + }, + { + "epoch": 2.41, + "grad_norm": 0.7443049550056458, + "learning_rate": 5.484824757643289e-05, + "loss": 2.9997, + "step": 49245 + }, + { + "epoch": 2.41, + "grad_norm": 0.7488800883293152, + "learning_rate": 5.483937364020552e-05, + "loss": 2.8637, + "step": 49246 + }, + { + "epoch": 2.41, + "grad_norm": 0.6989515423774719, + "learning_rate": 5.483050034968397e-05, + "loss": 2.9274, + "step": 49247 + }, + { + "epoch": 2.41, + "grad_norm": 0.66061931848526, + "learning_rate": 5.4821627704891756e-05, + "loss": 2.9987, + "step": 49248 + }, + { + "epoch": 2.41, + "grad_norm": 0.7440872192382812, + "learning_rate": 5.4812755705852194e-05, + "loss": 3.2342, + "step": 49249 + }, + { + "epoch": 2.41, + "grad_norm": 0.7412754893302917, + "learning_rate": 5.4803884352588566e-05, + "loss": 2.899, + "step": 49250 + }, + { + "epoch": 2.41, + "grad_norm": 0.7123976945877075, + "learning_rate": 5.479501364512442e-05, + "loss": 2.9107, + "step": 49251 + }, + { + "epoch": 2.41, + "grad_norm": 0.7113731503486633, + "learning_rate": 5.478614358348293e-05, + "loss": 3.1027, + "step": 49252 + }, + { + "epoch": 2.41, + "grad_norm": 0.7375507354736328, + "learning_rate": 5.4777274167687625e-05, + "loss": 2.7018, + "step": 49253 + }, + { + "epoch": 2.41, + "grad_norm": 0.7448770999908447, + "learning_rate": 5.4768405397761674e-05, + "loss": 2.9676, + "step": 49254 + }, + { + "epoch": 2.41, + "grad_norm": 0.7443416714668274, + "learning_rate": 5.4759537273728595e-05, + "loss": 2.904, + "step": 49255 + }, + { + "epoch": 2.41, + "grad_norm": 0.7311747670173645, + "learning_rate": 5.475066979561174e-05, + "loss": 3.1093, + "step": 49256 + }, + { + "epoch": 2.41, + "grad_norm": 0.728496253490448, + "learning_rate": 5.474180296343444e-05, + "loss": 3.0179, + "step": 49257 + }, + { + "epoch": 2.41, + "grad_norm": 0.7192690968513489, + "learning_rate": 5.4732936777220035e-05, + "loss": 2.9779, + "step": 49258 + }, + { + "epoch": 2.41, + "grad_norm": 0.7940871715545654, + "learning_rate": 5.472407123699176e-05, + "loss": 2.894, + "step": 49259 + }, + { + "epoch": 2.41, + "grad_norm": 0.7260642647743225, + "learning_rate": 5.471520634277313e-05, + "loss": 2.9742, + "step": 49260 + }, + { + "epoch": 2.41, + "grad_norm": 0.7162463665008545, + "learning_rate": 5.470634209458746e-05, + "loss": 2.9621, + "step": 49261 + }, + { + "epoch": 2.41, + "grad_norm": 0.7385729551315308, + "learning_rate": 5.469747849245803e-05, + "loss": 2.6355, + "step": 49262 + }, + { + "epoch": 2.41, + "grad_norm": 0.7240334153175354, + "learning_rate": 5.468861553640833e-05, + "loss": 2.7548, + "step": 49263 + }, + { + "epoch": 2.41, + "grad_norm": 0.7283338308334351, + "learning_rate": 5.46797532264616e-05, + "loss": 2.7219, + "step": 49264 + }, + { + "epoch": 2.41, + "grad_norm": 0.8645117282867432, + "learning_rate": 5.467089156264109e-05, + "loss": 2.8017, + "step": 49265 + }, + { + "epoch": 2.41, + "grad_norm": 0.7759827375411987, + "learning_rate": 5.46620305449703e-05, + "loss": 2.8121, + "step": 49266 + }, + { + "epoch": 2.41, + "grad_norm": 0.7253652811050415, + "learning_rate": 5.465317017347244e-05, + "loss": 2.9753, + "step": 49267 + }, + { + "epoch": 2.41, + "grad_norm": 0.720482349395752, + "learning_rate": 5.4644310448171e-05, + "loss": 3.0358, + "step": 49268 + }, + { + "epoch": 2.41, + "grad_norm": 0.7371129393577576, + "learning_rate": 5.463545136908915e-05, + "loss": 2.7949, + "step": 49269 + }, + { + "epoch": 2.41, + "grad_norm": 0.6884015202522278, + "learning_rate": 5.462659293625038e-05, + "loss": 2.8906, + "step": 49270 + }, + { + "epoch": 2.41, + "grad_norm": 0.7280858159065247, + "learning_rate": 5.4617735149677925e-05, + "loss": 3.058, + "step": 49271 + }, + { + "epoch": 2.41, + "grad_norm": 0.7321854829788208, + "learning_rate": 5.4608878009395084e-05, + "loss": 3.049, + "step": 49272 + }, + { + "epoch": 2.41, + "grad_norm": 0.7545171976089478, + "learning_rate": 5.4600021515425284e-05, + "loss": 2.9722, + "step": 49273 + }, + { + "epoch": 2.41, + "grad_norm": 0.7591923475265503, + "learning_rate": 5.45911656677917e-05, + "loss": 2.9183, + "step": 49274 + }, + { + "epoch": 2.41, + "grad_norm": 0.6920126676559448, + "learning_rate": 5.458231046651779e-05, + "loss": 2.8652, + "step": 49275 + }, + { + "epoch": 2.41, + "grad_norm": 0.7270497679710388, + "learning_rate": 5.457345591162692e-05, + "loss": 2.8887, + "step": 49276 + }, + { + "epoch": 2.41, + "grad_norm": 0.8276441693305969, + "learning_rate": 5.456460200314224e-05, + "loss": 2.7831, + "step": 49277 + }, + { + "epoch": 2.41, + "grad_norm": 0.7535173892974854, + "learning_rate": 5.455574874108725e-05, + "loss": 3.0215, + "step": 49278 + }, + { + "epoch": 2.42, + "grad_norm": 0.7337012887001038, + "learning_rate": 5.4546896125485173e-05, + "loss": 2.7542, + "step": 49279 + }, + { + "epoch": 2.42, + "grad_norm": 0.7610710859298706, + "learning_rate": 5.453804415635927e-05, + "loss": 3.0278, + "step": 49280 + }, + { + "epoch": 2.42, + "grad_norm": 0.751594066619873, + "learning_rate": 5.4529192833732996e-05, + "loss": 3.1184, + "step": 49281 + }, + { + "epoch": 2.42, + "grad_norm": 0.6849843859672546, + "learning_rate": 5.452034215762948e-05, + "loss": 3.1234, + "step": 49282 + }, + { + "epoch": 2.42, + "grad_norm": 0.7445322871208191, + "learning_rate": 5.4511492128072255e-05, + "loss": 2.7937, + "step": 49283 + }, + { + "epoch": 2.42, + "grad_norm": 0.7667579054832458, + "learning_rate": 5.4502642745084425e-05, + "loss": 2.9009, + "step": 49284 + }, + { + "epoch": 2.42, + "grad_norm": 0.6802947521209717, + "learning_rate": 5.449379400868944e-05, + "loss": 3.005, + "step": 49285 + }, + { + "epoch": 2.42, + "grad_norm": 0.7073135375976562, + "learning_rate": 5.4484945918910585e-05, + "loss": 2.8548, + "step": 49286 + }, + { + "epoch": 2.42, + "grad_norm": 0.7205443978309631, + "learning_rate": 5.4476098475771033e-05, + "loss": 2.8307, + "step": 49287 + }, + { + "epoch": 2.42, + "grad_norm": 0.7851844429969788, + "learning_rate": 5.446725167929428e-05, + "loss": 2.8409, + "step": 49288 + }, + { + "epoch": 2.42, + "grad_norm": 0.7031289339065552, + "learning_rate": 5.445840552950346e-05, + "loss": 2.8974, + "step": 49289 + }, + { + "epoch": 2.42, + "grad_norm": 0.7477833032608032, + "learning_rate": 5.4449560026421924e-05, + "loss": 2.9867, + "step": 49290 + }, + { + "epoch": 2.42, + "grad_norm": 0.7260494828224182, + "learning_rate": 5.444071517007306e-05, + "loss": 2.6814, + "step": 49291 + }, + { + "epoch": 2.42, + "grad_norm": 0.7753791809082031, + "learning_rate": 5.443187096048011e-05, + "loss": 2.9645, + "step": 49292 + }, + { + "epoch": 2.42, + "grad_norm": 0.8131752610206604, + "learning_rate": 5.4423027397666354e-05, + "loss": 3.0435, + "step": 49293 + }, + { + "epoch": 2.42, + "grad_norm": 0.7920990586280823, + "learning_rate": 5.441418448165501e-05, + "loss": 2.957, + "step": 49294 + }, + { + "epoch": 2.42, + "grad_norm": 0.7970216274261475, + "learning_rate": 5.4405342212469427e-05, + "loss": 2.7669, + "step": 49295 + }, + { + "epoch": 2.42, + "grad_norm": 0.7318688631057739, + "learning_rate": 5.4396500590132984e-05, + "loss": 2.9577, + "step": 49296 + }, + { + "epoch": 2.42, + "grad_norm": 0.7441763281822205, + "learning_rate": 5.4387659614668834e-05, + "loss": 2.9414, + "step": 49297 + }, + { + "epoch": 2.42, + "grad_norm": 0.7256457209587097, + "learning_rate": 5.437881928610035e-05, + "loss": 2.8141, + "step": 49298 + }, + { + "epoch": 2.42, + "grad_norm": 0.790198564529419, + "learning_rate": 5.436997960445075e-05, + "loss": 3.0438, + "step": 49299 + }, + { + "epoch": 2.42, + "grad_norm": 0.7521136999130249, + "learning_rate": 5.4361140569743423e-05, + "loss": 2.976, + "step": 49300 + }, + { + "epoch": 2.42, + "grad_norm": 0.7699940800666809, + "learning_rate": 5.435230218200157e-05, + "loss": 2.8781, + "step": 49301 + }, + { + "epoch": 2.42, + "grad_norm": 0.7283733487129211, + "learning_rate": 5.4343464441248384e-05, + "loss": 2.7156, + "step": 49302 + }, + { + "epoch": 2.42, + "grad_norm": 0.7702507376670837, + "learning_rate": 5.433462734750734e-05, + "loss": 3.0198, + "step": 49303 + }, + { + "epoch": 2.42, + "grad_norm": 1.300162672996521, + "learning_rate": 5.432579090080152e-05, + "loss": 2.9121, + "step": 49304 + }, + { + "epoch": 2.42, + "grad_norm": 0.7907041907310486, + "learning_rate": 5.431695510115428e-05, + "loss": 2.9838, + "step": 49305 + }, + { + "epoch": 2.42, + "grad_norm": 0.7288923859596252, + "learning_rate": 5.4308119948588994e-05, + "loss": 2.9276, + "step": 49306 + }, + { + "epoch": 2.42, + "grad_norm": 0.7126811742782593, + "learning_rate": 5.4299285443128806e-05, + "loss": 2.9427, + "step": 49307 + }, + { + "epoch": 2.42, + "grad_norm": 0.7134640216827393, + "learning_rate": 5.429045158479703e-05, + "loss": 2.8017, + "step": 49308 + }, + { + "epoch": 2.42, + "grad_norm": 0.7535879015922546, + "learning_rate": 5.428161837361686e-05, + "loss": 2.7213, + "step": 49309 + }, + { + "epoch": 2.42, + "grad_norm": 0.6850473880767822, + "learning_rate": 5.4272785809611606e-05, + "loss": 2.9667, + "step": 49310 + }, + { + "epoch": 2.42, + "grad_norm": 0.7917988896369934, + "learning_rate": 5.4263953892804614e-05, + "loss": 2.9065, + "step": 49311 + }, + { + "epoch": 2.42, + "grad_norm": 0.7340681552886963, + "learning_rate": 5.4255122623218994e-05, + "loss": 2.9701, + "step": 49312 + }, + { + "epoch": 2.42, + "grad_norm": 0.7131487131118774, + "learning_rate": 5.4246292000878164e-05, + "loss": 2.864, + "step": 49313 + }, + { + "epoch": 2.42, + "grad_norm": 0.7326993942260742, + "learning_rate": 5.423746202580528e-05, + "loss": 2.9615, + "step": 49314 + }, + { + "epoch": 2.42, + "grad_norm": 0.7834832668304443, + "learning_rate": 5.422863269802358e-05, + "loss": 3.0211, + "step": 49315 + }, + { + "epoch": 2.42, + "grad_norm": 0.7300680875778198, + "learning_rate": 5.421980401755645e-05, + "loss": 2.9014, + "step": 49316 + }, + { + "epoch": 2.42, + "grad_norm": 0.7191560864448547, + "learning_rate": 5.421097598442693e-05, + "loss": 2.6949, + "step": 49317 + }, + { + "epoch": 2.42, + "grad_norm": 0.7470296025276184, + "learning_rate": 5.420214859865851e-05, + "loss": 2.8543, + "step": 49318 + }, + { + "epoch": 2.42, + "grad_norm": 0.7371562719345093, + "learning_rate": 5.419332186027424e-05, + "loss": 2.9266, + "step": 49319 + }, + { + "epoch": 2.42, + "grad_norm": 0.7162860631942749, + "learning_rate": 5.418449576929746e-05, + "loss": 2.8709, + "step": 49320 + }, + { + "epoch": 2.42, + "grad_norm": 0.7437654137611389, + "learning_rate": 5.41756703257515e-05, + "loss": 2.857, + "step": 49321 + }, + { + "epoch": 2.42, + "grad_norm": 0.7650477290153503, + "learning_rate": 5.416684552965949e-05, + "loss": 2.6571, + "step": 49322 + }, + { + "epoch": 2.42, + "grad_norm": 0.743986189365387, + "learning_rate": 5.415802138104468e-05, + "loss": 2.8046, + "step": 49323 + }, + { + "epoch": 2.42, + "grad_norm": 0.7592489719390869, + "learning_rate": 5.4149197879930296e-05, + "loss": 2.8845, + "step": 49324 + }, + { + "epoch": 2.42, + "grad_norm": 0.700624406337738, + "learning_rate": 5.414037502633965e-05, + "loss": 2.8208, + "step": 49325 + }, + { + "epoch": 2.42, + "grad_norm": 0.7299220561981201, + "learning_rate": 5.4131552820295875e-05, + "loss": 2.7412, + "step": 49326 + }, + { + "epoch": 2.42, + "grad_norm": 0.756610095500946, + "learning_rate": 5.412273126182227e-05, + "loss": 3.0103, + "step": 49327 + }, + { + "epoch": 2.42, + "grad_norm": 0.7145278453826904, + "learning_rate": 5.411391035094218e-05, + "loss": 2.8637, + "step": 49328 + }, + { + "epoch": 2.42, + "grad_norm": 0.749764621257782, + "learning_rate": 5.410509008767868e-05, + "loss": 2.8035, + "step": 49329 + }, + { + "epoch": 2.42, + "grad_norm": 0.7430508136749268, + "learning_rate": 5.409627047205509e-05, + "loss": 2.7404, + "step": 49330 + }, + { + "epoch": 2.42, + "grad_norm": 0.7532656788825989, + "learning_rate": 5.4087451504094494e-05, + "loss": 3.022, + "step": 49331 + }, + { + "epoch": 2.42, + "grad_norm": 0.7438516020774841, + "learning_rate": 5.407863318382023e-05, + "loss": 3.084, + "step": 49332 + }, + { + "epoch": 2.42, + "grad_norm": 0.7108733654022217, + "learning_rate": 5.4069815511255585e-05, + "loss": 2.985, + "step": 49333 + }, + { + "epoch": 2.42, + "grad_norm": 0.7077670693397522, + "learning_rate": 5.406099848642368e-05, + "loss": 3.0003, + "step": 49334 + }, + { + "epoch": 2.42, + "grad_norm": 0.7985681891441345, + "learning_rate": 5.405218210934782e-05, + "loss": 2.78, + "step": 49335 + }, + { + "epoch": 2.42, + "grad_norm": 0.7880358695983887, + "learning_rate": 5.404336638005116e-05, + "loss": 3.0255, + "step": 49336 + }, + { + "epoch": 2.42, + "grad_norm": 0.7047920227050781, + "learning_rate": 5.403455129855687e-05, + "loss": 3.151, + "step": 49337 + }, + { + "epoch": 2.42, + "grad_norm": 0.7189116477966309, + "learning_rate": 5.4025736864888346e-05, + "loss": 2.9223, + "step": 49338 + }, + { + "epoch": 2.42, + "grad_norm": 0.7386056780815125, + "learning_rate": 5.4016923079068565e-05, + "loss": 2.9541, + "step": 49339 + }, + { + "epoch": 2.42, + "grad_norm": 0.7559934854507446, + "learning_rate": 5.400810994112097e-05, + "loss": 2.9204, + "step": 49340 + }, + { + "epoch": 2.42, + "grad_norm": 0.7456420063972473, + "learning_rate": 5.3999297451068614e-05, + "loss": 3.1086, + "step": 49341 + }, + { + "epoch": 2.42, + "grad_norm": 0.748192310333252, + "learning_rate": 5.399048560893474e-05, + "loss": 2.7161, + "step": 49342 + }, + { + "epoch": 2.42, + "grad_norm": 0.7148739099502563, + "learning_rate": 5.398167441474267e-05, + "loss": 2.9892, + "step": 49343 + }, + { + "epoch": 2.42, + "grad_norm": 0.7266228795051575, + "learning_rate": 5.397286386851552e-05, + "loss": 3.0108, + "step": 49344 + }, + { + "epoch": 2.42, + "grad_norm": 0.7317888736724854, + "learning_rate": 5.396405397027649e-05, + "loss": 3.0233, + "step": 49345 + }, + { + "epoch": 2.42, + "grad_norm": 0.7267348766326904, + "learning_rate": 5.3955244720048704e-05, + "loss": 2.9689, + "step": 49346 + }, + { + "epoch": 2.42, + "grad_norm": 0.7102034687995911, + "learning_rate": 5.394643611785549e-05, + "loss": 2.6048, + "step": 49347 + }, + { + "epoch": 2.42, + "grad_norm": 0.7675157785415649, + "learning_rate": 5.393762816372004e-05, + "loss": 3.0444, + "step": 49348 + }, + { + "epoch": 2.42, + "grad_norm": 0.7861278653144836, + "learning_rate": 5.392882085766548e-05, + "loss": 2.9381, + "step": 49349 + }, + { + "epoch": 2.42, + "grad_norm": 0.7278153300285339, + "learning_rate": 5.392001419971512e-05, + "loss": 2.6851, + "step": 49350 + }, + { + "epoch": 2.42, + "grad_norm": 0.7110613584518433, + "learning_rate": 5.391120818989205e-05, + "loss": 3.0432, + "step": 49351 + }, + { + "epoch": 2.42, + "grad_norm": 0.7356806993484497, + "learning_rate": 5.390240282821945e-05, + "loss": 2.8609, + "step": 49352 + }, + { + "epoch": 2.42, + "grad_norm": 0.73133784532547, + "learning_rate": 5.389359811472066e-05, + "loss": 2.9498, + "step": 49353 + }, + { + "epoch": 2.42, + "grad_norm": 0.7367465496063232, + "learning_rate": 5.388479404941863e-05, + "loss": 2.9129, + "step": 49354 + }, + { + "epoch": 2.42, + "grad_norm": 0.7416877746582031, + "learning_rate": 5.387599063233682e-05, + "loss": 2.9267, + "step": 49355 + }, + { + "epoch": 2.42, + "grad_norm": 0.7233571410179138, + "learning_rate": 5.386718786349819e-05, + "loss": 3.1255, + "step": 49356 + }, + { + "epoch": 2.42, + "grad_norm": 0.7607924342155457, + "learning_rate": 5.3858385742926016e-05, + "loss": 2.7879, + "step": 49357 + }, + { + "epoch": 2.42, + "grad_norm": 0.7684453129768372, + "learning_rate": 5.384958427064362e-05, + "loss": 2.8697, + "step": 49358 + }, + { + "epoch": 2.42, + "grad_norm": 0.7534241676330566, + "learning_rate": 5.3840783446673895e-05, + "loss": 2.7628, + "step": 49359 + }, + { + "epoch": 2.42, + "grad_norm": 0.7305625677108765, + "learning_rate": 5.383198327104026e-05, + "loss": 2.663, + "step": 49360 + }, + { + "epoch": 2.42, + "grad_norm": 0.7438086867332458, + "learning_rate": 5.3823183743765716e-05, + "loss": 2.9783, + "step": 49361 + }, + { + "epoch": 2.42, + "grad_norm": 0.7273472547531128, + "learning_rate": 5.381438486487356e-05, + "loss": 2.9193, + "step": 49362 + }, + { + "epoch": 2.42, + "grad_norm": 0.7327690124511719, + "learning_rate": 5.3805586634387e-05, + "loss": 2.7964, + "step": 49363 + }, + { + "epoch": 2.42, + "grad_norm": 0.7488052248954773, + "learning_rate": 5.379678905232905e-05, + "loss": 2.9159, + "step": 49364 + }, + { + "epoch": 2.42, + "grad_norm": 0.7673367857933044, + "learning_rate": 5.378799211872306e-05, + "loss": 3.1029, + "step": 49365 + }, + { + "epoch": 2.42, + "grad_norm": 0.7909415364265442, + "learning_rate": 5.37791958335921e-05, + "loss": 3.0563, + "step": 49366 + }, + { + "epoch": 2.42, + "grad_norm": 0.7198272347450256, + "learning_rate": 5.377040019695931e-05, + "loss": 2.782, + "step": 49367 + }, + { + "epoch": 2.42, + "grad_norm": 0.7427800893783569, + "learning_rate": 5.3761605208847944e-05, + "loss": 3.2383, + "step": 49368 + }, + { + "epoch": 2.42, + "grad_norm": 0.6806737184524536, + "learning_rate": 5.375281086928104e-05, + "loss": 2.7696, + "step": 49369 + }, + { + "epoch": 2.42, + "grad_norm": 0.7873852252960205, + "learning_rate": 5.374401717828193e-05, + "loss": 2.9428, + "step": 49370 + }, + { + "epoch": 2.42, + "grad_norm": 0.7423635721206665, + "learning_rate": 5.373522413587363e-05, + "loss": 2.8635, + "step": 49371 + }, + { + "epoch": 2.42, + "grad_norm": 0.7548320293426514, + "learning_rate": 5.37264317420794e-05, + "loss": 3.1002, + "step": 49372 + }, + { + "epoch": 2.42, + "grad_norm": 0.711924135684967, + "learning_rate": 5.371763999692238e-05, + "loss": 3.0354, + "step": 49373 + }, + { + "epoch": 2.42, + "grad_norm": 0.773473858833313, + "learning_rate": 5.370884890042558e-05, + "loss": 2.8397, + "step": 49374 + }, + { + "epoch": 2.42, + "grad_norm": 0.7292789220809937, + "learning_rate": 5.370005845261239e-05, + "loss": 2.9119, + "step": 49375 + }, + { + "epoch": 2.42, + "grad_norm": 0.7399142980575562, + "learning_rate": 5.369126865350576e-05, + "loss": 2.8673, + "step": 49376 + }, + { + "epoch": 2.42, + "grad_norm": 0.7260851263999939, + "learning_rate": 5.368247950312889e-05, + "loss": 2.8445, + "step": 49377 + }, + { + "epoch": 2.42, + "grad_norm": 0.7297760248184204, + "learning_rate": 5.367369100150508e-05, + "loss": 2.8716, + "step": 49378 + }, + { + "epoch": 2.42, + "grad_norm": 0.7921993136405945, + "learning_rate": 5.366490314865727e-05, + "loss": 3.0418, + "step": 49379 + }, + { + "epoch": 2.42, + "grad_norm": 0.7311543226242065, + "learning_rate": 5.36561159446088e-05, + "loss": 2.768, + "step": 49380 + }, + { + "epoch": 2.42, + "grad_norm": 0.7326648831367493, + "learning_rate": 5.364732938938266e-05, + "loss": 3.0984, + "step": 49381 + }, + { + "epoch": 2.42, + "grad_norm": 0.6894707083702087, + "learning_rate": 5.363854348300201e-05, + "loss": 2.6443, + "step": 49382 + }, + { + "epoch": 2.42, + "grad_norm": 0.754602313041687, + "learning_rate": 5.362975822549007e-05, + "loss": 2.7576, + "step": 49383 + }, + { + "epoch": 2.42, + "grad_norm": 0.7371578216552734, + "learning_rate": 5.362097361686988e-05, + "loss": 2.9871, + "step": 49384 + }, + { + "epoch": 2.42, + "grad_norm": 0.7002536058425903, + "learning_rate": 5.36121896571647e-05, + "loss": 2.982, + "step": 49385 + }, + { + "epoch": 2.42, + "grad_norm": 0.7025123238563538, + "learning_rate": 5.3603406346397504e-05, + "loss": 2.8782, + "step": 49386 + }, + { + "epoch": 2.42, + "grad_norm": 0.7461484670639038, + "learning_rate": 5.359462368459162e-05, + "loss": 3.0057, + "step": 49387 + }, + { + "epoch": 2.42, + "grad_norm": 0.7868021130561829, + "learning_rate": 5.3585841671770036e-05, + "loss": 2.9907, + "step": 49388 + }, + { + "epoch": 2.42, + "grad_norm": 0.7514649033546448, + "learning_rate": 5.3577060307955867e-05, + "loss": 3.0199, + "step": 49389 + }, + { + "epoch": 2.42, + "grad_norm": 0.7672540545463562, + "learning_rate": 5.356827959317239e-05, + "loss": 2.8189, + "step": 49390 + }, + { + "epoch": 2.42, + "grad_norm": 0.6891894340515137, + "learning_rate": 5.355949952744253e-05, + "loss": 2.785, + "step": 49391 + }, + { + "epoch": 2.42, + "grad_norm": 0.775546669960022, + "learning_rate": 5.355072011078953e-05, + "loss": 2.8854, + "step": 49392 + }, + { + "epoch": 2.42, + "grad_norm": 0.7490105628967285, + "learning_rate": 5.35419413432366e-05, + "loss": 2.9474, + "step": 49393 + }, + { + "epoch": 2.42, + "grad_norm": 0.7415432333946228, + "learning_rate": 5.3533163224806755e-05, + "loss": 2.5371, + "step": 49394 + }, + { + "epoch": 2.42, + "grad_norm": 0.7412639260292053, + "learning_rate": 5.352438575552312e-05, + "loss": 2.828, + "step": 49395 + }, + { + "epoch": 2.42, + "grad_norm": 0.7198470234870911, + "learning_rate": 5.351560893540873e-05, + "loss": 2.9522, + "step": 49396 + }, + { + "epoch": 2.42, + "grad_norm": 0.7310846447944641, + "learning_rate": 5.3506832764486776e-05, + "loss": 3.1216, + "step": 49397 + }, + { + "epoch": 2.42, + "grad_norm": 0.776674747467041, + "learning_rate": 5.34980572427805e-05, + "loss": 2.9169, + "step": 49398 + }, + { + "epoch": 2.42, + "grad_norm": 0.7521768808364868, + "learning_rate": 5.348928237031279e-05, + "loss": 3.0317, + "step": 49399 + }, + { + "epoch": 2.42, + "grad_norm": 0.7370387315750122, + "learning_rate": 5.348050814710696e-05, + "loss": 3.0164, + "step": 49400 + }, + { + "epoch": 2.42, + "grad_norm": 0.727357804775238, + "learning_rate": 5.347173457318595e-05, + "loss": 2.8261, + "step": 49401 + }, + { + "epoch": 2.42, + "grad_norm": 0.697250247001648, + "learning_rate": 5.346296164857305e-05, + "loss": 2.782, + "step": 49402 + }, + { + "epoch": 2.42, + "grad_norm": 0.7319405674934387, + "learning_rate": 5.3454189373291246e-05, + "loss": 2.8101, + "step": 49403 + }, + { + "epoch": 2.42, + "grad_norm": 0.7242304682731628, + "learning_rate": 5.344541774736357e-05, + "loss": 2.9944, + "step": 49404 + }, + { + "epoch": 2.42, + "grad_norm": 0.7132266163825989, + "learning_rate": 5.3436646770813286e-05, + "loss": 2.9694, + "step": 49405 + }, + { + "epoch": 2.42, + "grad_norm": 0.7501279711723328, + "learning_rate": 5.342787644366337e-05, + "loss": 2.7354, + "step": 49406 + }, + { + "epoch": 2.42, + "grad_norm": 0.6961253881454468, + "learning_rate": 5.3419106765937025e-05, + "loss": 2.8499, + "step": 49407 + }, + { + "epoch": 2.42, + "grad_norm": 0.7209061980247498, + "learning_rate": 5.3410337737657216e-05, + "loss": 3.1587, + "step": 49408 + }, + { + "epoch": 2.42, + "grad_norm": 0.718574047088623, + "learning_rate": 5.340156935884722e-05, + "loss": 2.867, + "step": 49409 + }, + { + "epoch": 2.42, + "grad_norm": 0.8040580153465271, + "learning_rate": 5.3392801629530015e-05, + "loss": 2.8241, + "step": 49410 + }, + { + "epoch": 2.42, + "grad_norm": 0.717680037021637, + "learning_rate": 5.338403454972865e-05, + "loss": 3.0446, + "step": 49411 + }, + { + "epoch": 2.42, + "grad_norm": 0.7457809448242188, + "learning_rate": 5.337526811946634e-05, + "loss": 2.5612, + "step": 49412 + }, + { + "epoch": 2.42, + "grad_norm": 0.7153359055519104, + "learning_rate": 5.336650233876604e-05, + "loss": 3.0441, + "step": 49413 + }, + { + "epoch": 2.42, + "grad_norm": 0.7195101976394653, + "learning_rate": 5.3357737207650886e-05, + "loss": 3.1782, + "step": 49414 + }, + { + "epoch": 2.42, + "grad_norm": 0.6793602705001831, + "learning_rate": 5.3348972726144104e-05, + "loss": 2.7251, + "step": 49415 + }, + { + "epoch": 2.42, + "grad_norm": 0.7900977730751038, + "learning_rate": 5.334020889426863e-05, + "loss": 2.7028, + "step": 49416 + }, + { + "epoch": 2.42, + "grad_norm": 0.7003040909767151, + "learning_rate": 5.333144571204756e-05, + "loss": 2.7757, + "step": 49417 + }, + { + "epoch": 2.42, + "grad_norm": 0.7215965986251831, + "learning_rate": 5.332268317950393e-05, + "loss": 2.8256, + "step": 49418 + }, + { + "epoch": 2.42, + "grad_norm": 0.6919942498207092, + "learning_rate": 5.33139212966609e-05, + "loss": 2.7628, + "step": 49419 + }, + { + "epoch": 2.42, + "grad_norm": 0.7243634462356567, + "learning_rate": 5.3305160063541576e-05, + "loss": 2.8523, + "step": 49420 + }, + { + "epoch": 2.42, + "grad_norm": 0.7183544635772705, + "learning_rate": 5.329639948016891e-05, + "loss": 2.7497, + "step": 49421 + }, + { + "epoch": 2.42, + "grad_norm": 0.7528842687606812, + "learning_rate": 5.3287639546566144e-05, + "loss": 2.9952, + "step": 49422 + }, + { + "epoch": 2.42, + "grad_norm": 0.7461656332015991, + "learning_rate": 5.327888026275616e-05, + "loss": 2.9245, + "step": 49423 + }, + { + "epoch": 2.42, + "grad_norm": 0.7598857879638672, + "learning_rate": 5.327012162876219e-05, + "loss": 2.8832, + "step": 49424 + }, + { + "epoch": 2.42, + "grad_norm": 0.7612981200218201, + "learning_rate": 5.3261363644607255e-05, + "loss": 3.0593, + "step": 49425 + }, + { + "epoch": 2.42, + "grad_norm": 0.7197171449661255, + "learning_rate": 5.325260631031433e-05, + "loss": 2.6926, + "step": 49426 + }, + { + "epoch": 2.42, + "grad_norm": 0.7351194620132446, + "learning_rate": 5.324384962590661e-05, + "loss": 3.1384, + "step": 49427 + }, + { + "epoch": 2.42, + "grad_norm": 0.7707248330116272, + "learning_rate": 5.323509359140702e-05, + "loss": 2.9092, + "step": 49428 + }, + { + "epoch": 2.42, + "grad_norm": 0.7522650957107544, + "learning_rate": 5.322633820683874e-05, + "loss": 2.8542, + "step": 49429 + }, + { + "epoch": 2.42, + "grad_norm": 0.7015370726585388, + "learning_rate": 5.321758347222481e-05, + "loss": 2.7529, + "step": 49430 + }, + { + "epoch": 2.42, + "grad_norm": 0.733875036239624, + "learning_rate": 5.320882938758833e-05, + "loss": 2.9341, + "step": 49431 + }, + { + "epoch": 2.42, + "grad_norm": 0.7439164519309998, + "learning_rate": 5.320007595295226e-05, + "loss": 3.0875, + "step": 49432 + }, + { + "epoch": 2.42, + "grad_norm": 0.7210975289344788, + "learning_rate": 5.319132316833963e-05, + "loss": 2.8665, + "step": 49433 + }, + { + "epoch": 2.42, + "grad_norm": 0.7494742274284363, + "learning_rate": 5.3182571033773543e-05, + "loss": 2.8854, + "step": 49434 + }, + { + "epoch": 2.42, + "grad_norm": 0.7517364025115967, + "learning_rate": 5.3173819549277164e-05, + "loss": 2.7901, + "step": 49435 + }, + { + "epoch": 2.42, + "grad_norm": 0.8002448678016663, + "learning_rate": 5.316506871487336e-05, + "loss": 2.639, + "step": 49436 + }, + { + "epoch": 2.42, + "grad_norm": 0.6921734809875488, + "learning_rate": 5.3156318530585315e-05, + "loss": 2.8351, + "step": 49437 + }, + { + "epoch": 2.42, + "grad_norm": 0.7180572152137756, + "learning_rate": 5.314756899643605e-05, + "loss": 2.8778, + "step": 49438 + }, + { + "epoch": 2.42, + "grad_norm": 0.7430548667907715, + "learning_rate": 5.313882011244848e-05, + "loss": 3.0747, + "step": 49439 + }, + { + "epoch": 2.42, + "grad_norm": 0.7170864939689636, + "learning_rate": 5.313007187864584e-05, + "loss": 2.9257, + "step": 49440 + }, + { + "epoch": 2.42, + "grad_norm": 0.7517858743667603, + "learning_rate": 5.312132429505099e-05, + "loss": 2.9093, + "step": 49441 + }, + { + "epoch": 2.42, + "grad_norm": 0.7107861042022705, + "learning_rate": 5.311257736168715e-05, + "loss": 2.7721, + "step": 49442 + }, + { + "epoch": 2.42, + "grad_norm": 0.7427153587341309, + "learning_rate": 5.310383107857718e-05, + "loss": 2.8472, + "step": 49443 + }, + { + "epoch": 2.42, + "grad_norm": 0.7173393964767456, + "learning_rate": 5.309508544574419e-05, + "loss": 2.7931, + "step": 49444 + }, + { + "epoch": 2.42, + "grad_norm": 0.7084386944770813, + "learning_rate": 5.308634046321131e-05, + "loss": 2.6603, + "step": 49445 + }, + { + "epoch": 2.42, + "grad_norm": 0.6978672742843628, + "learning_rate": 5.307759613100152e-05, + "loss": 2.8571, + "step": 49446 + }, + { + "epoch": 2.42, + "grad_norm": 0.7057198286056519, + "learning_rate": 5.3068852449137775e-05, + "loss": 2.7171, + "step": 49447 + }, + { + "epoch": 2.42, + "grad_norm": 0.7628961801528931, + "learning_rate": 5.306010941764313e-05, + "loss": 2.8952, + "step": 49448 + }, + { + "epoch": 2.42, + "grad_norm": 0.7418630123138428, + "learning_rate": 5.305136703654058e-05, + "loss": 2.7643, + "step": 49449 + }, + { + "epoch": 2.42, + "grad_norm": 0.7383913993835449, + "learning_rate": 5.304262530585329e-05, + "loss": 2.8741, + "step": 49450 + }, + { + "epoch": 2.42, + "grad_norm": 0.7368167042732239, + "learning_rate": 5.303388422560414e-05, + "loss": 2.8344, + "step": 49451 + }, + { + "epoch": 2.42, + "grad_norm": 0.7100529670715332, + "learning_rate": 5.302514379581627e-05, + "loss": 2.9651, + "step": 49452 + }, + { + "epoch": 2.42, + "grad_norm": 0.7233177423477173, + "learning_rate": 5.301640401651267e-05, + "loss": 3.1435, + "step": 49453 + }, + { + "epoch": 2.42, + "grad_norm": 0.7448151111602783, + "learning_rate": 5.3007664887716216e-05, + "loss": 2.6896, + "step": 49454 + }, + { + "epoch": 2.42, + "grad_norm": 0.7152411341667175, + "learning_rate": 5.2998926409450126e-05, + "loss": 2.9082, + "step": 49455 + }, + { + "epoch": 2.42, + "grad_norm": 0.7502825260162354, + "learning_rate": 5.299018858173725e-05, + "loss": 2.7657, + "step": 49456 + }, + { + "epoch": 2.42, + "grad_norm": 0.7130440473556519, + "learning_rate": 5.298145140460076e-05, + "loss": 2.8697, + "step": 49457 + }, + { + "epoch": 2.42, + "grad_norm": 0.7099788188934326, + "learning_rate": 5.297271487806352e-05, + "loss": 2.6939, + "step": 49458 + }, + { + "epoch": 2.42, + "grad_norm": 0.706955075263977, + "learning_rate": 5.2963979002148596e-05, + "loss": 2.7304, + "step": 49459 + }, + { + "epoch": 2.42, + "grad_norm": 0.7011268734931946, + "learning_rate": 5.295524377687914e-05, + "loss": 2.7672, + "step": 49460 + }, + { + "epoch": 2.42, + "grad_norm": 0.7620493769645691, + "learning_rate": 5.294650920227791e-05, + "loss": 2.7686, + "step": 49461 + }, + { + "epoch": 2.42, + "grad_norm": 0.7569233775138855, + "learning_rate": 5.293777527836808e-05, + "loss": 2.7607, + "step": 49462 + }, + { + "epoch": 2.42, + "grad_norm": 0.7677346467971802, + "learning_rate": 5.292904200517253e-05, + "loss": 2.9383, + "step": 49463 + }, + { + "epoch": 2.42, + "grad_norm": 0.7182645201683044, + "learning_rate": 5.292030938271434e-05, + "loss": 2.9959, + "step": 49464 + }, + { + "epoch": 2.42, + "grad_norm": 0.7732892036437988, + "learning_rate": 5.291157741101657e-05, + "loss": 2.8637, + "step": 49465 + }, + { + "epoch": 2.42, + "grad_norm": 0.7590673565864563, + "learning_rate": 5.290284609010209e-05, + "loss": 2.7361, + "step": 49466 + }, + { + "epoch": 2.42, + "grad_norm": 0.6932297945022583, + "learning_rate": 5.289411541999406e-05, + "loss": 3.0343, + "step": 49467 + }, + { + "epoch": 2.42, + "grad_norm": 0.7510533332824707, + "learning_rate": 5.288538540071534e-05, + "loss": 2.8191, + "step": 49468 + }, + { + "epoch": 2.42, + "grad_norm": 0.7724784016609192, + "learning_rate": 5.287665603228887e-05, + "loss": 2.7771, + "step": 49469 + }, + { + "epoch": 2.42, + "grad_norm": 0.7106216549873352, + "learning_rate": 5.2867927314737815e-05, + "loss": 2.9812, + "step": 49470 + }, + { + "epoch": 2.42, + "grad_norm": 0.7474266290664673, + "learning_rate": 5.2859199248084994e-05, + "loss": 2.8478, + "step": 49471 + }, + { + "epoch": 2.42, + "grad_norm": 0.7339901924133301, + "learning_rate": 5.2850471832353594e-05, + "loss": 2.9416, + "step": 49472 + }, + { + "epoch": 2.42, + "grad_norm": 0.7034941911697388, + "learning_rate": 5.284174506756636e-05, + "loss": 2.9021, + "step": 49473 + }, + { + "epoch": 2.42, + "grad_norm": 0.7442716956138611, + "learning_rate": 5.28330189537465e-05, + "loss": 2.9255, + "step": 49474 + }, + { + "epoch": 2.42, + "grad_norm": 0.7037724852561951, + "learning_rate": 5.28242934909169e-05, + "loss": 2.6829, + "step": 49475 + }, + { + "epoch": 2.42, + "grad_norm": 0.7233594655990601, + "learning_rate": 5.281556867910048e-05, + "loss": 2.8918, + "step": 49476 + }, + { + "epoch": 2.42, + "grad_norm": 0.7807363867759705, + "learning_rate": 5.280684451832032e-05, + "loss": 2.9158, + "step": 49477 + }, + { + "epoch": 2.42, + "grad_norm": 0.7810481786727905, + "learning_rate": 5.279812100859934e-05, + "loss": 2.9131, + "step": 49478 + }, + { + "epoch": 2.42, + "grad_norm": 0.7510912418365479, + "learning_rate": 5.278939814996057e-05, + "loss": 2.8349, + "step": 49479 + }, + { + "epoch": 2.42, + "grad_norm": 0.8030551671981812, + "learning_rate": 5.2780675942426874e-05, + "loss": 2.7635, + "step": 49480 + }, + { + "epoch": 2.42, + "grad_norm": 0.7197742462158203, + "learning_rate": 5.2771954386021296e-05, + "loss": 3.1214, + "step": 49481 + }, + { + "epoch": 2.42, + "grad_norm": 0.7838296890258789, + "learning_rate": 5.276323348076689e-05, + "loss": 3.0325, + "step": 49482 + }, + { + "epoch": 2.43, + "grad_norm": 0.777555525302887, + "learning_rate": 5.2754513226686556e-05, + "loss": 2.9526, + "step": 49483 + }, + { + "epoch": 2.43, + "grad_norm": 0.7960125207901001, + "learning_rate": 5.274579362380322e-05, + "loss": 2.8986, + "step": 49484 + }, + { + "epoch": 2.43, + "grad_norm": 0.7355430126190186, + "learning_rate": 5.2737074672139837e-05, + "loss": 2.8928, + "step": 49485 + }, + { + "epoch": 2.43, + "grad_norm": 0.7454689741134644, + "learning_rate": 5.272835637171941e-05, + "loss": 2.8384, + "step": 49486 + }, + { + "epoch": 2.43, + "grad_norm": 0.7542171478271484, + "learning_rate": 5.2719638722564986e-05, + "loss": 3.036, + "step": 49487 + }, + { + "epoch": 2.43, + "grad_norm": 0.7559950947761536, + "learning_rate": 5.271092172469935e-05, + "loss": 2.6835, + "step": 49488 + }, + { + "epoch": 2.43, + "grad_norm": 0.6909935474395752, + "learning_rate": 5.270220537814562e-05, + "loss": 2.8508, + "step": 49489 + }, + { + "epoch": 2.43, + "grad_norm": 0.7282992005348206, + "learning_rate": 5.269348968292671e-05, + "loss": 2.9179, + "step": 49490 + }, + { + "epoch": 2.43, + "grad_norm": 0.7650557160377502, + "learning_rate": 5.268477463906548e-05, + "loss": 2.8272, + "step": 49491 + }, + { + "epoch": 2.43, + "grad_norm": 0.744831383228302, + "learning_rate": 5.267606024658502e-05, + "loss": 2.8916, + "step": 49492 + }, + { + "epoch": 2.43, + "grad_norm": 0.7628011107444763, + "learning_rate": 5.266734650550813e-05, + "loss": 2.6101, + "step": 49493 + }, + { + "epoch": 2.43, + "grad_norm": 0.7256711721420288, + "learning_rate": 5.265863341585794e-05, + "loss": 3.0817, + "step": 49494 + }, + { + "epoch": 2.43, + "grad_norm": 0.7441546320915222, + "learning_rate": 5.2649920977657225e-05, + "loss": 2.9964, + "step": 49495 + }, + { + "epoch": 2.43, + "grad_norm": 0.7593109607696533, + "learning_rate": 5.264120919092911e-05, + "loss": 2.8387, + "step": 49496 + }, + { + "epoch": 2.43, + "grad_norm": 0.722559928894043, + "learning_rate": 5.2632498055696424e-05, + "loss": 3.0698, + "step": 49497 + }, + { + "epoch": 2.43, + "grad_norm": 0.7306622862815857, + "learning_rate": 5.2623787571982044e-05, + "loss": 2.8901, + "step": 49498 + }, + { + "epoch": 2.43, + "grad_norm": 0.7256033420562744, + "learning_rate": 5.261507773980912e-05, + "loss": 2.958, + "step": 49499 + }, + { + "epoch": 2.43, + "grad_norm": 0.7559966444969177, + "learning_rate": 5.260636855920036e-05, + "loss": 2.9003, + "step": 49500 + }, + { + "epoch": 2.43, + "grad_norm": 0.7539312839508057, + "learning_rate": 5.2597660030178825e-05, + "loss": 2.7591, + "step": 49501 + }, + { + "epoch": 2.43, + "grad_norm": 0.7615699172019958, + "learning_rate": 5.258895215276752e-05, + "loss": 2.8271, + "step": 49502 + }, + { + "epoch": 2.43, + "grad_norm": 0.720807671546936, + "learning_rate": 5.258024492698919e-05, + "loss": 2.739, + "step": 49503 + }, + { + "epoch": 2.43, + "grad_norm": 0.8670095801353455, + "learning_rate": 5.2571538352866994e-05, + "loss": 2.8535, + "step": 49504 + }, + { + "epoch": 2.43, + "grad_norm": 0.708129346370697, + "learning_rate": 5.256283243042374e-05, + "loss": 3.1879, + "step": 49505 + }, + { + "epoch": 2.43, + "grad_norm": 0.6828888058662415, + "learning_rate": 5.255412715968228e-05, + "loss": 3.0061, + "step": 49506 + }, + { + "epoch": 2.43, + "grad_norm": 0.722131073474884, + "learning_rate": 5.254542254066569e-05, + "loss": 3.0957, + "step": 49507 + }, + { + "epoch": 2.43, + "grad_norm": 0.7353353500366211, + "learning_rate": 5.2536718573396796e-05, + "loss": 2.8735, + "step": 49508 + }, + { + "epoch": 2.43, + "grad_norm": 0.7344322204589844, + "learning_rate": 5.2528015257898605e-05, + "loss": 2.8081, + "step": 49509 + }, + { + "epoch": 2.43, + "grad_norm": 0.7135079503059387, + "learning_rate": 5.251931259419393e-05, + "loss": 2.9412, + "step": 49510 + }, + { + "epoch": 2.43, + "grad_norm": 0.7717560529708862, + "learning_rate": 5.251061058230582e-05, + "loss": 2.8145, + "step": 49511 + }, + { + "epoch": 2.43, + "grad_norm": 0.7272468209266663, + "learning_rate": 5.250190922225717e-05, + "loss": 2.9457, + "step": 49512 + }, + { + "epoch": 2.43, + "grad_norm": 0.7593507170677185, + "learning_rate": 5.2493208514070736e-05, + "loss": 2.7668, + "step": 49513 + }, + { + "epoch": 2.43, + "grad_norm": 0.7987231612205505, + "learning_rate": 5.2484508457769657e-05, + "loss": 2.7785, + "step": 49514 + }, + { + "epoch": 2.43, + "grad_norm": 0.6929014325141907, + "learning_rate": 5.247580905337667e-05, + "loss": 2.7896, + "step": 49515 + }, + { + "epoch": 2.43, + "grad_norm": 0.7104371190071106, + "learning_rate": 5.246711030091476e-05, + "loss": 2.8161, + "step": 49516 + }, + { + "epoch": 2.43, + "grad_norm": 0.6814081072807312, + "learning_rate": 5.245841220040691e-05, + "loss": 2.8394, + "step": 49517 + }, + { + "epoch": 2.43, + "grad_norm": 0.691145658493042, + "learning_rate": 5.244971475187599e-05, + "loss": 2.9412, + "step": 49518 + }, + { + "epoch": 2.43, + "grad_norm": 0.7448225021362305, + "learning_rate": 5.244101795534487e-05, + "loss": 2.8151, + "step": 49519 + }, + { + "epoch": 2.43, + "grad_norm": 0.7341607213020325, + "learning_rate": 5.243232181083638e-05, + "loss": 2.8329, + "step": 49520 + }, + { + "epoch": 2.43, + "grad_norm": 0.7958085536956787, + "learning_rate": 5.24236263183735e-05, + "loss": 2.9727, + "step": 49521 + }, + { + "epoch": 2.43, + "grad_norm": 0.7446455359458923, + "learning_rate": 5.2414931477979225e-05, + "loss": 3.0168, + "step": 49522 + }, + { + "epoch": 2.43, + "grad_norm": 0.7200234532356262, + "learning_rate": 5.24062372896763e-05, + "loss": 2.8772, + "step": 49523 + }, + { + "epoch": 2.43, + "grad_norm": 0.7179722785949707, + "learning_rate": 5.239754375348777e-05, + "loss": 2.9839, + "step": 49524 + }, + { + "epoch": 2.43, + "grad_norm": 0.7151455879211426, + "learning_rate": 5.2388850869436385e-05, + "loss": 2.7769, + "step": 49525 + }, + { + "epoch": 2.43, + "grad_norm": 0.7526031732559204, + "learning_rate": 5.238015863754519e-05, + "loss": 3.0104, + "step": 49526 + }, + { + "epoch": 2.43, + "grad_norm": 0.7519762516021729, + "learning_rate": 5.2371467057837003e-05, + "loss": 2.9088, + "step": 49527 + }, + { + "epoch": 2.43, + "grad_norm": 0.7097587585449219, + "learning_rate": 5.236277613033461e-05, + "loss": 2.985, + "step": 49528 + }, + { + "epoch": 2.43, + "grad_norm": 0.6919777989387512, + "learning_rate": 5.235408585506111e-05, + "loss": 2.8978, + "step": 49529 + }, + { + "epoch": 2.43, + "grad_norm": 0.7104206681251526, + "learning_rate": 5.2345396232039205e-05, + "loss": 2.9985, + "step": 49530 + }, + { + "epoch": 2.43, + "grad_norm": 0.721882164478302, + "learning_rate": 5.2336707261291865e-05, + "loss": 2.9761, + "step": 49531 + }, + { + "epoch": 2.43, + "grad_norm": 0.7098190784454346, + "learning_rate": 5.2328018942842044e-05, + "loss": 2.9406, + "step": 49532 + }, + { + "epoch": 2.43, + "grad_norm": 0.7680677771568298, + "learning_rate": 5.231933127671258e-05, + "loss": 2.9672, + "step": 49533 + }, + { + "epoch": 2.43, + "grad_norm": 0.7131537795066833, + "learning_rate": 5.2310644262926305e-05, + "loss": 2.9661, + "step": 49534 + }, + { + "epoch": 2.43, + "grad_norm": 0.7221238017082214, + "learning_rate": 5.2301957901506054e-05, + "loss": 2.9696, + "step": 49535 + }, + { + "epoch": 2.43, + "grad_norm": 0.7687680125236511, + "learning_rate": 5.229327219247478e-05, + "loss": 2.8797, + "step": 49536 + }, + { + "epoch": 2.43, + "grad_norm": 0.7497595548629761, + "learning_rate": 5.228458713585543e-05, + "loss": 2.8141, + "step": 49537 + }, + { + "epoch": 2.43, + "grad_norm": 0.7552316784858704, + "learning_rate": 5.2275902731670716e-05, + "loss": 2.9196, + "step": 49538 + }, + { + "epoch": 2.43, + "grad_norm": 0.7658446431159973, + "learning_rate": 5.2267218979943695e-05, + "loss": 2.9684, + "step": 49539 + }, + { + "epoch": 2.43, + "grad_norm": 0.7868543267250061, + "learning_rate": 5.225853588069714e-05, + "loss": 2.8374, + "step": 49540 + }, + { + "epoch": 2.43, + "grad_norm": 0.72367262840271, + "learning_rate": 5.224985343395384e-05, + "loss": 2.9543, + "step": 49541 + }, + { + "epoch": 2.43, + "grad_norm": 0.7303585410118103, + "learning_rate": 5.22411716397368e-05, + "loss": 2.9499, + "step": 49542 + }, + { + "epoch": 2.43, + "grad_norm": 0.7236695885658264, + "learning_rate": 5.223249049806878e-05, + "loss": 3.0774, + "step": 49543 + }, + { + "epoch": 2.43, + "grad_norm": 0.7127900123596191, + "learning_rate": 5.222381000897279e-05, + "loss": 3.1424, + "step": 49544 + }, + { + "epoch": 2.43, + "grad_norm": 0.7575745582580566, + "learning_rate": 5.2215130172471476e-05, + "loss": 3.0335, + "step": 49545 + }, + { + "epoch": 2.43, + "grad_norm": 0.7566746473312378, + "learning_rate": 5.220645098858786e-05, + "loss": 2.8638, + "step": 49546 + }, + { + "epoch": 2.43, + "grad_norm": 0.7185924053192139, + "learning_rate": 5.219777245734484e-05, + "loss": 2.9908, + "step": 49547 + }, + { + "epoch": 2.43, + "grad_norm": 0.7872166633605957, + "learning_rate": 5.218909457876515e-05, + "loss": 2.8108, + "step": 49548 + }, + { + "epoch": 2.43, + "grad_norm": 0.7263684868812561, + "learning_rate": 5.2180417352871736e-05, + "loss": 2.7144, + "step": 49549 + }, + { + "epoch": 2.43, + "grad_norm": 0.7238753437995911, + "learning_rate": 5.2171740779687346e-05, + "loss": 2.9509, + "step": 49550 + }, + { + "epoch": 2.43, + "grad_norm": 0.7120121121406555, + "learning_rate": 5.216306485923485e-05, + "loss": 2.8628, + "step": 49551 + }, + { + "epoch": 2.43, + "grad_norm": 0.6943619251251221, + "learning_rate": 5.2154389591537236e-05, + "loss": 2.7111, + "step": 49552 + }, + { + "epoch": 2.43, + "grad_norm": 0.709274411201477, + "learning_rate": 5.214571497661723e-05, + "loss": 2.7601, + "step": 49553 + }, + { + "epoch": 2.43, + "grad_norm": 0.7480226755142212, + "learning_rate": 5.2137041014497726e-05, + "loss": 2.9873, + "step": 49554 + }, + { + "epoch": 2.43, + "grad_norm": 0.7200178503990173, + "learning_rate": 5.2128367705201594e-05, + "loss": 2.7593, + "step": 49555 + }, + { + "epoch": 2.43, + "grad_norm": 0.7735962867736816, + "learning_rate": 5.211969504875153e-05, + "loss": 2.8678, + "step": 49556 + }, + { + "epoch": 2.43, + "grad_norm": 0.7781702876091003, + "learning_rate": 5.2111023045170565e-05, + "loss": 2.9205, + "step": 49557 + }, + { + "epoch": 2.43, + "grad_norm": 0.7568464875221252, + "learning_rate": 5.21023516944814e-05, + "loss": 3.0284, + "step": 49558 + }, + { + "epoch": 2.43, + "grad_norm": 0.7191801071166992, + "learning_rate": 5.2093680996707e-05, + "loss": 2.9578, + "step": 49559 + }, + { + "epoch": 2.43, + "grad_norm": 0.7512852549552917, + "learning_rate": 5.208501095187003e-05, + "loss": 3.0233, + "step": 49560 + }, + { + "epoch": 2.43, + "grad_norm": 0.7531700134277344, + "learning_rate": 5.207634155999355e-05, + "loss": 2.6399, + "step": 49561 + }, + { + "epoch": 2.43, + "grad_norm": 0.7530140280723572, + "learning_rate": 5.2067672821100234e-05, + "loss": 3.0416, + "step": 49562 + }, + { + "epoch": 2.43, + "grad_norm": 0.7887686491012573, + "learning_rate": 5.2059004735212904e-05, + "loss": 3.0472, + "step": 49563 + }, + { + "epoch": 2.43, + "grad_norm": 0.7277945280075073, + "learning_rate": 5.20503373023545e-05, + "loss": 2.7531, + "step": 49564 + }, + { + "epoch": 2.43, + "grad_norm": 0.7439764142036438, + "learning_rate": 5.2041670522547715e-05, + "loss": 2.9304, + "step": 49565 + }, + { + "epoch": 2.43, + "grad_norm": 0.737322986125946, + "learning_rate": 5.203300439581556e-05, + "loss": 3.0708, + "step": 49566 + }, + { + "epoch": 2.43, + "grad_norm": 0.7382588982582092, + "learning_rate": 5.2024338922180606e-05, + "loss": 2.8939, + "step": 49567 + }, + { + "epoch": 2.43, + "grad_norm": 0.729706883430481, + "learning_rate": 5.201567410166588e-05, + "loss": 3.2045, + "step": 49568 + }, + { + "epoch": 2.43, + "grad_norm": 0.7282545566558838, + "learning_rate": 5.20070099342942e-05, + "loss": 2.8105, + "step": 49569 + }, + { + "epoch": 2.43, + "grad_norm": 0.7177003026008606, + "learning_rate": 5.19983464200883e-05, + "loss": 2.7949, + "step": 49570 + }, + { + "epoch": 2.43, + "grad_norm": 0.7702353000640869, + "learning_rate": 5.1989683559071074e-05, + "loss": 2.9669, + "step": 49571 + }, + { + "epoch": 2.43, + "grad_norm": 0.7214787602424622, + "learning_rate": 5.198102135126516e-05, + "loss": 2.8121, + "step": 49572 + }, + { + "epoch": 2.43, + "grad_norm": 0.8282303214073181, + "learning_rate": 5.1972359796693554e-05, + "loss": 2.9693, + "step": 49573 + }, + { + "epoch": 2.43, + "grad_norm": 0.7337009906768799, + "learning_rate": 5.196369889537906e-05, + "loss": 2.7557, + "step": 49574 + }, + { + "epoch": 2.43, + "grad_norm": 0.7773966789245605, + "learning_rate": 5.195503864734439e-05, + "loss": 2.9181, + "step": 49575 + }, + { + "epoch": 2.43, + "grad_norm": 0.6771313548088074, + "learning_rate": 5.194637905261247e-05, + "loss": 2.6734, + "step": 49576 + }, + { + "epoch": 2.43, + "grad_norm": 0.7496398091316223, + "learning_rate": 5.1937720111206085e-05, + "loss": 2.9345, + "step": 49577 + }, + { + "epoch": 2.43, + "grad_norm": 0.7166499495506287, + "learning_rate": 5.19290618231479e-05, + "loss": 2.7787, + "step": 49578 + }, + { + "epoch": 2.43, + "grad_norm": 0.7843281626701355, + "learning_rate": 5.192040418846091e-05, + "loss": 2.7845, + "step": 49579 + }, + { + "epoch": 2.43, + "grad_norm": 0.7141745686531067, + "learning_rate": 5.1911747207167754e-05, + "loss": 2.7912, + "step": 49580 + }, + { + "epoch": 2.43, + "grad_norm": 0.7347038388252258, + "learning_rate": 5.1903090879291384e-05, + "loss": 2.7822, + "step": 49581 + }, + { + "epoch": 2.43, + "grad_norm": 0.7614521980285645, + "learning_rate": 5.189443520485442e-05, + "loss": 2.749, + "step": 49582 + }, + { + "epoch": 2.43, + "grad_norm": 0.7512968182563782, + "learning_rate": 5.188578018387981e-05, + "loss": 2.9849, + "step": 49583 + }, + { + "epoch": 2.43, + "grad_norm": 0.7517346143722534, + "learning_rate": 5.1877125816390365e-05, + "loss": 2.7312, + "step": 49584 + }, + { + "epoch": 2.43, + "grad_norm": 0.728527307510376, + "learning_rate": 5.1868472102408834e-05, + "loss": 2.9731, + "step": 49585 + }, + { + "epoch": 2.43, + "grad_norm": 0.7840355634689331, + "learning_rate": 5.185981904195796e-05, + "loss": 2.8932, + "step": 49586 + }, + { + "epoch": 2.43, + "grad_norm": 0.7409562468528748, + "learning_rate": 5.18511666350605e-05, + "loss": 3.0215, + "step": 49587 + }, + { + "epoch": 2.43, + "grad_norm": 0.756843626499176, + "learning_rate": 5.184251488173934e-05, + "loss": 3.0411, + "step": 49588 + }, + { + "epoch": 2.43, + "grad_norm": 0.7006279826164246, + "learning_rate": 5.1833863782017284e-05, + "loss": 2.7748, + "step": 49589 + }, + { + "epoch": 2.43, + "grad_norm": 0.708868145942688, + "learning_rate": 5.182521333591698e-05, + "loss": 2.8136, + "step": 49590 + }, + { + "epoch": 2.43, + "grad_norm": 0.779094934463501, + "learning_rate": 5.181656354346142e-05, + "loss": 2.9222, + "step": 49591 + }, + { + "epoch": 2.43, + "grad_norm": 0.7329817414283752, + "learning_rate": 5.1807914404673244e-05, + "loss": 2.7675, + "step": 49592 + }, + { + "epoch": 2.43, + "grad_norm": 0.7352889180183411, + "learning_rate": 5.1799265919575175e-05, + "loss": 2.6653, + "step": 49593 + }, + { + "epoch": 2.43, + "grad_norm": 0.737707793712616, + "learning_rate": 5.179061808819015e-05, + "loss": 3.0523, + "step": 49594 + }, + { + "epoch": 2.43, + "grad_norm": 0.737191915512085, + "learning_rate": 5.178197091054079e-05, + "loss": 3.004, + "step": 49595 + }, + { + "epoch": 2.43, + "grad_norm": 0.7531353831291199, + "learning_rate": 5.177332438665005e-05, + "loss": 3.0624, + "step": 49596 + }, + { + "epoch": 2.43, + "grad_norm": 0.6949726939201355, + "learning_rate": 5.17646785165405e-05, + "loss": 2.8186, + "step": 49597 + }, + { + "epoch": 2.43, + "grad_norm": 0.6904205083847046, + "learning_rate": 5.17560333002351e-05, + "loss": 3.0604, + "step": 49598 + }, + { + "epoch": 2.43, + "grad_norm": 0.7312195301055908, + "learning_rate": 5.174738873775652e-05, + "loss": 3.1245, + "step": 49599 + }, + { + "epoch": 2.43, + "grad_norm": 0.7542881965637207, + "learning_rate": 5.173874482912749e-05, + "loss": 2.9555, + "step": 49600 + }, + { + "epoch": 2.43, + "grad_norm": 0.7179239988327026, + "learning_rate": 5.173010157437088e-05, + "loss": 2.8191, + "step": 49601 + }, + { + "epoch": 2.43, + "grad_norm": 0.7660733461380005, + "learning_rate": 5.172145897350933e-05, + "loss": 2.7249, + "step": 49602 + }, + { + "epoch": 2.43, + "grad_norm": 0.7874742150306702, + "learning_rate": 5.171281702656568e-05, + "loss": 3.0061, + "step": 49603 + }, + { + "epoch": 2.43, + "grad_norm": 0.7039852738380432, + "learning_rate": 5.170417573356276e-05, + "loss": 3.0712, + "step": 49604 + }, + { + "epoch": 2.43, + "grad_norm": 0.7054070830345154, + "learning_rate": 5.169553509452319e-05, + "loss": 2.7834, + "step": 49605 + }, + { + "epoch": 2.43, + "grad_norm": 0.7984621524810791, + "learning_rate": 5.168689510946985e-05, + "loss": 2.8207, + "step": 49606 + }, + { + "epoch": 2.43, + "grad_norm": 0.7172672748565674, + "learning_rate": 5.167825577842546e-05, + "loss": 2.8539, + "step": 49607 + }, + { + "epoch": 2.43, + "grad_norm": 0.7304677367210388, + "learning_rate": 5.1669617101412663e-05, + "loss": 2.7716, + "step": 49608 + }, + { + "epoch": 2.43, + "grad_norm": 0.6986484527587891, + "learning_rate": 5.166097907845438e-05, + "loss": 3.0376, + "step": 49609 + }, + { + "epoch": 2.43, + "grad_norm": 0.7038609981536865, + "learning_rate": 5.165234170957319e-05, + "loss": 2.8795, + "step": 49610 + }, + { + "epoch": 2.43, + "grad_norm": 0.7226280570030212, + "learning_rate": 5.164370499479204e-05, + "loss": 2.8304, + "step": 49611 + }, + { + "epoch": 2.43, + "grad_norm": 0.7109252214431763, + "learning_rate": 5.16350689341335e-05, + "loss": 2.8575, + "step": 49612 + }, + { + "epoch": 2.43, + "grad_norm": 0.7385297417640686, + "learning_rate": 5.162643352762045e-05, + "loss": 2.8261, + "step": 49613 + }, + { + "epoch": 2.43, + "grad_norm": 0.7628830671310425, + "learning_rate": 5.1617798775275576e-05, + "loss": 2.9239, + "step": 49614 + }, + { + "epoch": 2.43, + "grad_norm": 0.787331759929657, + "learning_rate": 5.160916467712154e-05, + "loss": 3.0273, + "step": 49615 + }, + { + "epoch": 2.43, + "grad_norm": 0.7409013509750366, + "learning_rate": 5.160053123318125e-05, + "loss": 2.9011, + "step": 49616 + }, + { + "epoch": 2.43, + "grad_norm": 0.7271203994750977, + "learning_rate": 5.1591898443477263e-05, + "loss": 2.9749, + "step": 49617 + }, + { + "epoch": 2.43, + "grad_norm": 0.7120565176010132, + "learning_rate": 5.158326630803242e-05, + "loss": 2.8932, + "step": 49618 + }, + { + "epoch": 2.43, + "grad_norm": 0.706852376461029, + "learning_rate": 5.1574634826869544e-05, + "loss": 3.0749, + "step": 49619 + }, + { + "epoch": 2.43, + "grad_norm": 0.7524295449256897, + "learning_rate": 5.156600400001124e-05, + "loss": 2.855, + "step": 49620 + }, + { + "epoch": 2.43, + "grad_norm": 0.7389339208602905, + "learning_rate": 5.1557373827480264e-05, + "loss": 2.967, + "step": 49621 + }, + { + "epoch": 2.43, + "grad_norm": 0.7299748063087463, + "learning_rate": 5.1548744309299295e-05, + "loss": 2.9577, + "step": 49622 + }, + { + "epoch": 2.43, + "grad_norm": 0.7077644467353821, + "learning_rate": 5.154011544549108e-05, + "loss": 2.8618, + "step": 49623 + }, + { + "epoch": 2.43, + "grad_norm": 0.7535231113433838, + "learning_rate": 5.153148723607851e-05, + "loss": 2.6616, + "step": 49624 + }, + { + "epoch": 2.43, + "grad_norm": 0.7145181894302368, + "learning_rate": 5.1522859681084084e-05, + "loss": 2.9405, + "step": 49625 + }, + { + "epoch": 2.43, + "grad_norm": 0.7331188917160034, + "learning_rate": 5.151423278053069e-05, + "loss": 2.9301, + "step": 49626 + }, + { + "epoch": 2.43, + "grad_norm": 0.756357729434967, + "learning_rate": 5.150560653444093e-05, + "loss": 2.965, + "step": 49627 + }, + { + "epoch": 2.43, + "grad_norm": 0.7148869633674622, + "learning_rate": 5.149698094283765e-05, + "loss": 2.9109, + "step": 49628 + }, + { + "epoch": 2.43, + "grad_norm": 0.7142993211746216, + "learning_rate": 5.148835600574349e-05, + "loss": 3.0988, + "step": 49629 + }, + { + "epoch": 2.43, + "grad_norm": 0.8061341047286987, + "learning_rate": 5.14797317231811e-05, + "loss": 2.7739, + "step": 49630 + }, + { + "epoch": 2.43, + "grad_norm": 0.7073236107826233, + "learning_rate": 5.147110809517334e-05, + "loss": 2.8595, + "step": 49631 + }, + { + "epoch": 2.43, + "grad_norm": 0.7803508043289185, + "learning_rate": 5.146248512174278e-05, + "loss": 2.7204, + "step": 49632 + }, + { + "epoch": 2.43, + "grad_norm": 0.758151113986969, + "learning_rate": 5.1453862802912195e-05, + "loss": 2.9361, + "step": 49633 + }, + { + "epoch": 2.43, + "grad_norm": 0.7642468214035034, + "learning_rate": 5.1445241138704395e-05, + "loss": 2.6602, + "step": 49634 + }, + { + "epoch": 2.43, + "grad_norm": 0.7350394129753113, + "learning_rate": 5.143662012914197e-05, + "loss": 2.734, + "step": 49635 + }, + { + "epoch": 2.43, + "grad_norm": 0.6950120329856873, + "learning_rate": 5.1427999774247664e-05, + "loss": 2.6326, + "step": 49636 + }, + { + "epoch": 2.43, + "grad_norm": 0.7212783694267273, + "learning_rate": 5.141938007404409e-05, + "loss": 2.8121, + "step": 49637 + }, + { + "epoch": 2.43, + "grad_norm": 0.7310404777526855, + "learning_rate": 5.14107610285541e-05, + "loss": 2.9614, + "step": 49638 + }, + { + "epoch": 2.43, + "grad_norm": 0.7168389558792114, + "learning_rate": 5.1402142637800246e-05, + "loss": 2.9321, + "step": 49639 + }, + { + "epoch": 2.43, + "grad_norm": 0.7117984294891357, + "learning_rate": 5.139352490180534e-05, + "loss": 2.7888, + "step": 49640 + }, + { + "epoch": 2.43, + "grad_norm": 0.7920997738838196, + "learning_rate": 5.138490782059209e-05, + "loss": 2.9202, + "step": 49641 + }, + { + "epoch": 2.43, + "grad_norm": 0.6791305541992188, + "learning_rate": 5.1376291394183154e-05, + "loss": 2.5108, + "step": 49642 + }, + { + "epoch": 2.43, + "grad_norm": 0.7199430465698242, + "learning_rate": 5.136767562260121e-05, + "loss": 2.7794, + "step": 49643 + }, + { + "epoch": 2.43, + "grad_norm": 0.6989941596984863, + "learning_rate": 5.135906050586888e-05, + "loss": 2.9884, + "step": 49644 + }, + { + "epoch": 2.43, + "grad_norm": 0.7715273499488831, + "learning_rate": 5.135044604400893e-05, + "loss": 2.8518, + "step": 49645 + }, + { + "epoch": 2.43, + "grad_norm": 0.7331072688102722, + "learning_rate": 5.134183223704412e-05, + "loss": 2.9443, + "step": 49646 + }, + { + "epoch": 2.43, + "grad_norm": 0.7313644886016846, + "learning_rate": 5.1333219084997036e-05, + "loss": 2.8292, + "step": 49647 + }, + { + "epoch": 2.43, + "grad_norm": 0.7467599511146545, + "learning_rate": 5.1324606587890416e-05, + "loss": 3.0896, + "step": 49648 + }, + { + "epoch": 2.43, + "grad_norm": 0.7694347500801086, + "learning_rate": 5.131599474574688e-05, + "loss": 3.0131, + "step": 49649 + }, + { + "epoch": 2.43, + "grad_norm": 0.7463330626487732, + "learning_rate": 5.130738355858924e-05, + "loss": 3.0468, + "step": 49650 + }, + { + "epoch": 2.43, + "grad_norm": 0.7481557130813599, + "learning_rate": 5.1298773026440056e-05, + "loss": 2.9252, + "step": 49651 + }, + { + "epoch": 2.43, + "grad_norm": 0.705471932888031, + "learning_rate": 5.129016314932199e-05, + "loss": 2.9475, + "step": 49652 + }, + { + "epoch": 2.43, + "grad_norm": 0.7567241191864014, + "learning_rate": 5.128155392725785e-05, + "loss": 2.8548, + "step": 49653 + }, + { + "epoch": 2.43, + "grad_norm": 0.6947323083877563, + "learning_rate": 5.127294536027012e-05, + "loss": 2.675, + "step": 49654 + }, + { + "epoch": 2.43, + "grad_norm": 0.7445532083511353, + "learning_rate": 5.126433744838161e-05, + "loss": 3.0083, + "step": 49655 + }, + { + "epoch": 2.43, + "grad_norm": 0.7622706294059753, + "learning_rate": 5.1255730191615055e-05, + "loss": 2.7358, + "step": 49656 + }, + { + "epoch": 2.43, + "grad_norm": 0.7756778597831726, + "learning_rate": 5.1247123589993e-05, + "loss": 3.0063, + "step": 49657 + }, + { + "epoch": 2.43, + "grad_norm": 0.7175281047821045, + "learning_rate": 5.123851764353817e-05, + "loss": 2.9401, + "step": 49658 + }, + { + "epoch": 2.43, + "grad_norm": 0.710747480392456, + "learning_rate": 5.122991235227314e-05, + "loss": 2.8863, + "step": 49659 + }, + { + "epoch": 2.43, + "grad_norm": 0.7191362977027893, + "learning_rate": 5.122130771622063e-05, + "loss": 2.8623, + "step": 49660 + }, + { + "epoch": 2.43, + "grad_norm": 0.7421351075172424, + "learning_rate": 5.121270373540338e-05, + "loss": 2.8252, + "step": 49661 + }, + { + "epoch": 2.43, + "grad_norm": 0.7567232847213745, + "learning_rate": 5.120410040984395e-05, + "loss": 3.1095, + "step": 49662 + }, + { + "epoch": 2.43, + "grad_norm": 0.7147611975669861, + "learning_rate": 5.1195497739565086e-05, + "loss": 2.9548, + "step": 49663 + }, + { + "epoch": 2.43, + "grad_norm": 0.7359359264373779, + "learning_rate": 5.11868957245894e-05, + "loss": 2.8819, + "step": 49664 + }, + { + "epoch": 2.43, + "grad_norm": 0.7486942410469055, + "learning_rate": 5.1178294364939464e-05, + "loss": 3.0448, + "step": 49665 + }, + { + "epoch": 2.43, + "grad_norm": 0.7085438966751099, + "learning_rate": 5.116969366063811e-05, + "loss": 2.9634, + "step": 49666 + }, + { + "epoch": 2.43, + "grad_norm": 0.7525193095207214, + "learning_rate": 5.1161093611707816e-05, + "loss": 2.8993, + "step": 49667 + }, + { + "epoch": 2.43, + "grad_norm": 0.760326623916626, + "learning_rate": 5.115249421817136e-05, + "loss": 3.0144, + "step": 49668 + }, + { + "epoch": 2.43, + "grad_norm": 0.75190669298172, + "learning_rate": 5.114389548005129e-05, + "loss": 2.6683, + "step": 49669 + }, + { + "epoch": 2.43, + "grad_norm": 0.7118992805480957, + "learning_rate": 5.113529739737029e-05, + "loss": 3.0528, + "step": 49670 + }, + { + "epoch": 2.43, + "grad_norm": 0.7461121082305908, + "learning_rate": 5.112669997015111e-05, + "loss": 2.8172, + "step": 49671 + }, + { + "epoch": 2.43, + "grad_norm": 0.7285375595092773, + "learning_rate": 5.1118103198416325e-05, + "loss": 2.8323, + "step": 49672 + }, + { + "epoch": 2.43, + "grad_norm": 0.7130274176597595, + "learning_rate": 5.110950708218853e-05, + "loss": 2.9635, + "step": 49673 + }, + { + "epoch": 2.43, + "grad_norm": 0.7329024076461792, + "learning_rate": 5.110091162149033e-05, + "loss": 2.9495, + "step": 49674 + }, + { + "epoch": 2.43, + "grad_norm": 0.7448126077651978, + "learning_rate": 5.1092316816344404e-05, + "loss": 3.0704, + "step": 49675 + }, + { + "epoch": 2.43, + "grad_norm": 0.7353721857070923, + "learning_rate": 5.108372266677351e-05, + "loss": 2.918, + "step": 49676 + }, + { + "epoch": 2.43, + "grad_norm": 0.7445089817047119, + "learning_rate": 5.107512917280012e-05, + "loss": 3.0014, + "step": 49677 + }, + { + "epoch": 2.43, + "grad_norm": 0.7183898091316223, + "learning_rate": 5.1066536334447e-05, + "loss": 2.6663, + "step": 49678 + }, + { + "epoch": 2.43, + "grad_norm": 0.7457320690155029, + "learning_rate": 5.105794415173671e-05, + "loss": 2.7268, + "step": 49679 + }, + { + "epoch": 2.43, + "grad_norm": 0.7721786499023438, + "learning_rate": 5.104935262469181e-05, + "loss": 2.8341, + "step": 49680 + }, + { + "epoch": 2.43, + "grad_norm": 0.7069118022918701, + "learning_rate": 5.1040761753335044e-05, + "loss": 2.6973, + "step": 49681 + }, + { + "epoch": 2.43, + "grad_norm": 0.7151609659194946, + "learning_rate": 5.103217153768897e-05, + "loss": 2.901, + "step": 49682 + }, + { + "epoch": 2.43, + "grad_norm": 0.7430471777915955, + "learning_rate": 5.1023581977776296e-05, + "loss": 3.03, + "step": 49683 + }, + { + "epoch": 2.43, + "grad_norm": 0.7141035199165344, + "learning_rate": 5.1014993073619536e-05, + "loss": 2.9409, + "step": 49684 + }, + { + "epoch": 2.43, + "grad_norm": 0.7425801753997803, + "learning_rate": 5.100640482524134e-05, + "loss": 2.962, + "step": 49685 + }, + { + "epoch": 2.43, + "grad_norm": 0.8909529447555542, + "learning_rate": 5.099781723266453e-05, + "loss": 2.8787, + "step": 49686 + }, + { + "epoch": 2.44, + "grad_norm": 0.7250303030014038, + "learning_rate": 5.098923029591138e-05, + "loss": 2.8883, + "step": 49687 + }, + { + "epoch": 2.44, + "grad_norm": 0.7350665926933289, + "learning_rate": 5.098064401500477e-05, + "loss": 2.9089, + "step": 49688 + }, + { + "epoch": 2.44, + "grad_norm": 0.7405259013175964, + "learning_rate": 5.0972058389967116e-05, + "loss": 2.9799, + "step": 49689 + }, + { + "epoch": 2.44, + "grad_norm": 0.7162747383117676, + "learning_rate": 5.096347342082114e-05, + "loss": 2.8799, + "step": 49690 + }, + { + "epoch": 2.44, + "grad_norm": 0.7429664731025696, + "learning_rate": 5.095488910758956e-05, + "loss": 2.8375, + "step": 49691 + }, + { + "epoch": 2.44, + "grad_norm": 0.7161949872970581, + "learning_rate": 5.094630545029475e-05, + "loss": 2.8939, + "step": 49692 + }, + { + "epoch": 2.44, + "grad_norm": 0.7380734086036682, + "learning_rate": 5.093772244895952e-05, + "loss": 2.928, + "step": 49693 + }, + { + "epoch": 2.44, + "grad_norm": 0.778752326965332, + "learning_rate": 5.092914010360642e-05, + "loss": 2.9385, + "step": 49694 + }, + { + "epoch": 2.44, + "grad_norm": 0.6871793866157532, + "learning_rate": 5.092055841425795e-05, + "loss": 2.8888, + "step": 49695 + }, + { + "epoch": 2.44, + "grad_norm": 0.7245429158210754, + "learning_rate": 5.091197738093684e-05, + "loss": 3.1834, + "step": 49696 + }, + { + "epoch": 2.44, + "grad_norm": 0.7372999787330627, + "learning_rate": 5.0903397003665616e-05, + "loss": 2.8991, + "step": 49697 + }, + { + "epoch": 2.44, + "grad_norm": 0.7384405732154846, + "learning_rate": 5.089481728246695e-05, + "loss": 3.0387, + "step": 49698 + }, + { + "epoch": 2.44, + "grad_norm": 0.7918789386749268, + "learning_rate": 5.088623821736333e-05, + "loss": 2.6822, + "step": 49699 + }, + { + "epoch": 2.44, + "grad_norm": 0.6995186805725098, + "learning_rate": 5.087765980837747e-05, + "loss": 2.7499, + "step": 49700 + }, + { + "epoch": 2.44, + "grad_norm": 0.7654358148574829, + "learning_rate": 5.0869082055531915e-05, + "loss": 2.7592, + "step": 49701 + }, + { + "epoch": 2.44, + "grad_norm": 0.7654218077659607, + "learning_rate": 5.0860504958849194e-05, + "loss": 2.7973, + "step": 49702 + }, + { + "epoch": 2.44, + "grad_norm": 0.7214524149894714, + "learning_rate": 5.0851928518352e-05, + "loss": 2.9906, + "step": 49703 + }, + { + "epoch": 2.44, + "grad_norm": 0.737037718296051, + "learning_rate": 5.084335273406284e-05, + "loss": 2.8326, + "step": 49704 + }, + { + "epoch": 2.44, + "grad_norm": 0.6899082660675049, + "learning_rate": 5.083477760600431e-05, + "loss": 3.0577, + "step": 49705 + }, + { + "epoch": 2.44, + "grad_norm": 0.717755138874054, + "learning_rate": 5.0826203134199095e-05, + "loss": 2.9758, + "step": 49706 + }, + { + "epoch": 2.44, + "grad_norm": 0.7414624094963074, + "learning_rate": 5.081762931866965e-05, + "loss": 2.791, + "step": 49707 + }, + { + "epoch": 2.44, + "grad_norm": 0.7816479802131653, + "learning_rate": 5.0809056159438686e-05, + "loss": 2.9598, + "step": 49708 + }, + { + "epoch": 2.44, + "grad_norm": 0.7575608491897583, + "learning_rate": 5.080048365652872e-05, + "loss": 2.9766, + "step": 49709 + }, + { + "epoch": 2.44, + "grad_norm": 0.7834826707839966, + "learning_rate": 5.079191180996224e-05, + "loss": 2.8922, + "step": 49710 + }, + { + "epoch": 2.44, + "grad_norm": 0.7255894541740417, + "learning_rate": 5.078334061976195e-05, + "loss": 3.0187, + "step": 49711 + }, + { + "epoch": 2.44, + "grad_norm": 0.7802708745002747, + "learning_rate": 5.0774770085950336e-05, + "loss": 2.9521, + "step": 49712 + }, + { + "epoch": 2.44, + "grad_norm": 0.6934137940406799, + "learning_rate": 5.076620020855008e-05, + "loss": 2.8263, + "step": 49713 + }, + { + "epoch": 2.44, + "grad_norm": 0.7279984951019287, + "learning_rate": 5.0757630987583596e-05, + "loss": 2.8758, + "step": 49714 + }, + { + "epoch": 2.44, + "grad_norm": 0.7556265592575073, + "learning_rate": 5.074906242307364e-05, + "loss": 2.8469, + "step": 49715 + }, + { + "epoch": 2.44, + "grad_norm": 0.784926176071167, + "learning_rate": 5.074049451504265e-05, + "loss": 2.8924, + "step": 49716 + }, + { + "epoch": 2.44, + "grad_norm": 0.7134363651275635, + "learning_rate": 5.0731927263513185e-05, + "loss": 2.9432, + "step": 49717 + }, + { + "epoch": 2.44, + "grad_norm": 0.7337852120399475, + "learning_rate": 5.072336066850792e-05, + "loss": 3.0742, + "step": 49718 + }, + { + "epoch": 2.44, + "grad_norm": 0.7417206168174744, + "learning_rate": 5.071479473004927e-05, + "loss": 2.9438, + "step": 49719 + }, + { + "epoch": 2.44, + "grad_norm": 0.7676438689231873, + "learning_rate": 5.0706229448159954e-05, + "loss": 3.0447, + "step": 49720 + }, + { + "epoch": 2.44, + "grad_norm": 0.7735446691513062, + "learning_rate": 5.069766482286238e-05, + "loss": 3.1604, + "step": 49721 + }, + { + "epoch": 2.44, + "grad_norm": 0.7267362475395203, + "learning_rate": 5.0689100854179245e-05, + "loss": 2.8411, + "step": 49722 + }, + { + "epoch": 2.44, + "grad_norm": 0.7745874524116516, + "learning_rate": 5.068053754213305e-05, + "loss": 3.0884, + "step": 49723 + }, + { + "epoch": 2.44, + "grad_norm": 0.7192590236663818, + "learning_rate": 5.067197488674625e-05, + "loss": 2.8488, + "step": 49724 + }, + { + "epoch": 2.44, + "grad_norm": 0.766343891620636, + "learning_rate": 5.066341288804155e-05, + "loss": 2.883, + "step": 49725 + }, + { + "epoch": 2.44, + "grad_norm": 0.7043959498405457, + "learning_rate": 5.0654851546041356e-05, + "loss": 2.8619, + "step": 49726 + }, + { + "epoch": 2.44, + "grad_norm": 0.7229441404342651, + "learning_rate": 5.064629086076832e-05, + "loss": 2.9009, + "step": 49727 + }, + { + "epoch": 2.44, + "grad_norm": 0.71528559923172, + "learning_rate": 5.063773083224502e-05, + "loss": 2.8358, + "step": 49728 + }, + { + "epoch": 2.44, + "grad_norm": 0.7294989824295044, + "learning_rate": 5.062917146049388e-05, + "loss": 2.5765, + "step": 49729 + }, + { + "epoch": 2.44, + "grad_norm": 0.7297892570495605, + "learning_rate": 5.062061274553759e-05, + "loss": 3.1941, + "step": 49730 + }, + { + "epoch": 2.44, + "grad_norm": 0.7526649832725525, + "learning_rate": 5.061205468739861e-05, + "loss": 2.9326, + "step": 49731 + }, + { + "epoch": 2.44, + "grad_norm": 0.7116498351097107, + "learning_rate": 5.06034972860994e-05, + "loss": 2.7313, + "step": 49732 + }, + { + "epoch": 2.44, + "grad_norm": 0.7177486419677734, + "learning_rate": 5.0594940541662664e-05, + "loss": 2.9048, + "step": 49733 + }, + { + "epoch": 2.44, + "grad_norm": 0.7500801682472229, + "learning_rate": 5.058638445411077e-05, + "loss": 2.8435, + "step": 49734 + }, + { + "epoch": 2.44, + "grad_norm": 0.7371991872787476, + "learning_rate": 5.0577829023466446e-05, + "loss": 2.8987, + "step": 49735 + }, + { + "epoch": 2.44, + "grad_norm": 0.7661231756210327, + "learning_rate": 5.056927424975199e-05, + "loss": 2.7352, + "step": 49736 + }, + { + "epoch": 2.44, + "grad_norm": 0.7188680171966553, + "learning_rate": 5.056072013299016e-05, + "loss": 3.0446, + "step": 49737 + }, + { + "epoch": 2.44, + "grad_norm": 0.7141361832618713, + "learning_rate": 5.055216667320341e-05, + "loss": 3.023, + "step": 49738 + }, + { + "epoch": 2.44, + "grad_norm": 0.7152678966522217, + "learning_rate": 5.0543613870414144e-05, + "loss": 2.9097, + "step": 49739 + }, + { + "epoch": 2.44, + "grad_norm": 0.7155481576919556, + "learning_rate": 5.0535061724645086e-05, + "loss": 2.9771, + "step": 49740 + }, + { + "epoch": 2.44, + "grad_norm": 0.7778782844543457, + "learning_rate": 5.052651023591858e-05, + "loss": 3.011, + "step": 49741 + }, + { + "epoch": 2.44, + "grad_norm": 0.6794896125793457, + "learning_rate": 5.0517959404257215e-05, + "loss": 2.8614, + "step": 49742 + }, + { + "epoch": 2.44, + "grad_norm": 0.7529333233833313, + "learning_rate": 5.050940922968363e-05, + "loss": 3.0088, + "step": 49743 + }, + { + "epoch": 2.44, + "grad_norm": 0.7487618923187256, + "learning_rate": 5.0500859712220245e-05, + "loss": 2.8297, + "step": 49744 + }, + { + "epoch": 2.44, + "grad_norm": 0.735675573348999, + "learning_rate": 5.0492310851889575e-05, + "loss": 2.8424, + "step": 49745 + }, + { + "epoch": 2.44, + "grad_norm": 0.7913566827774048, + "learning_rate": 5.048376264871406e-05, + "loss": 2.986, + "step": 49746 + }, + { + "epoch": 2.44, + "grad_norm": 0.668773353099823, + "learning_rate": 5.0475215102716306e-05, + "loss": 2.8925, + "step": 49747 + }, + { + "epoch": 2.44, + "grad_norm": 0.7151376008987427, + "learning_rate": 5.046666821391887e-05, + "loss": 2.7141, + "step": 49748 + }, + { + "epoch": 2.44, + "grad_norm": 0.7672746777534485, + "learning_rate": 5.0458121982344144e-05, + "loss": 2.8902, + "step": 49749 + }, + { + "epoch": 2.44, + "grad_norm": 0.6590734124183655, + "learning_rate": 5.044957640801475e-05, + "loss": 2.8578, + "step": 49750 + }, + { + "epoch": 2.44, + "grad_norm": 0.7381081581115723, + "learning_rate": 5.044103149095309e-05, + "loss": 2.8413, + "step": 49751 + }, + { + "epoch": 2.44, + "grad_norm": 0.7295042872428894, + "learning_rate": 5.043248723118182e-05, + "loss": 2.9015, + "step": 49752 + }, + { + "epoch": 2.44, + "grad_norm": 0.7408165335655212, + "learning_rate": 5.042394362872332e-05, + "loss": 2.8261, + "step": 49753 + }, + { + "epoch": 2.44, + "grad_norm": 0.7612532377243042, + "learning_rate": 5.041540068360004e-05, + "loss": 2.8644, + "step": 49754 + }, + { + "epoch": 2.44, + "grad_norm": 0.7137171030044556, + "learning_rate": 5.0406858395834656e-05, + "loss": 3.0074, + "step": 49755 + }, + { + "epoch": 2.44, + "grad_norm": 0.6950783133506775, + "learning_rate": 5.03983167654495e-05, + "loss": 2.9107, + "step": 49756 + }, + { + "epoch": 2.44, + "grad_norm": 0.7441567778587341, + "learning_rate": 5.0389775792467134e-05, + "loss": 2.7601, + "step": 49757 + }, + { + "epoch": 2.44, + "grad_norm": 0.7424207925796509, + "learning_rate": 5.038123547691012e-05, + "loss": 2.8487, + "step": 49758 + }, + { + "epoch": 2.44, + "grad_norm": 0.6980047225952148, + "learning_rate": 5.03726958188009e-05, + "loss": 2.7768, + "step": 49759 + }, + { + "epoch": 2.44, + "grad_norm": 0.7469981908798218, + "learning_rate": 5.036415681816196e-05, + "loss": 2.9057, + "step": 49760 + }, + { + "epoch": 2.44, + "grad_norm": 0.7088938355445862, + "learning_rate": 5.0355618475015716e-05, + "loss": 2.9294, + "step": 49761 + }, + { + "epoch": 2.44, + "grad_norm": 0.7196033000946045, + "learning_rate": 5.034708078938474e-05, + "loss": 2.9308, + "step": 49762 + }, + { + "epoch": 2.44, + "grad_norm": 0.7394540309906006, + "learning_rate": 5.033854376129156e-05, + "loss": 2.7985, + "step": 49763 + }, + { + "epoch": 2.44, + "grad_norm": 0.7375257611274719, + "learning_rate": 5.033000739075852e-05, + "loss": 2.968, + "step": 49764 + }, + { + "epoch": 2.44, + "grad_norm": 0.7190216779708862, + "learning_rate": 5.0321471677808304e-05, + "loss": 2.9031, + "step": 49765 + }, + { + "epoch": 2.44, + "grad_norm": 0.769286572933197, + "learning_rate": 5.031293662246325e-05, + "loss": 2.5801, + "step": 49766 + }, + { + "epoch": 2.44, + "grad_norm": 0.7621033191680908, + "learning_rate": 5.030440222474582e-05, + "loss": 2.91, + "step": 49767 + }, + { + "epoch": 2.44, + "grad_norm": 0.7350282669067383, + "learning_rate": 5.0295868484678595e-05, + "loss": 2.8721, + "step": 49768 + }, + { + "epoch": 2.44, + "grad_norm": 0.7469178438186646, + "learning_rate": 5.028733540228391e-05, + "loss": 2.9342, + "step": 49769 + }, + { + "epoch": 2.44, + "grad_norm": 0.8052636384963989, + "learning_rate": 5.027880297758443e-05, + "loss": 3.0468, + "step": 49770 + }, + { + "epoch": 2.44, + "grad_norm": 0.7492589354515076, + "learning_rate": 5.027027121060243e-05, + "loss": 2.9264, + "step": 49771 + }, + { + "epoch": 2.44, + "grad_norm": 0.7447794675827026, + "learning_rate": 5.026174010136046e-05, + "loss": 2.9722, + "step": 49772 + }, + { + "epoch": 2.44, + "grad_norm": 0.7605386972427368, + "learning_rate": 5.0253209649881096e-05, + "loss": 2.9998, + "step": 49773 + }, + { + "epoch": 2.44, + "grad_norm": 0.7599433660507202, + "learning_rate": 5.024467985618673e-05, + "loss": 2.7689, + "step": 49774 + }, + { + "epoch": 2.44, + "grad_norm": 0.7454763650894165, + "learning_rate": 5.023615072029977e-05, + "loss": 2.8439, + "step": 49775 + }, + { + "epoch": 2.44, + "grad_norm": 0.7649785876274109, + "learning_rate": 5.022762224224267e-05, + "loss": 2.7798, + "step": 49776 + }, + { + "epoch": 2.44, + "grad_norm": 0.7098384499549866, + "learning_rate": 5.021909442203794e-05, + "loss": 2.8681, + "step": 49777 + }, + { + "epoch": 2.44, + "grad_norm": 0.7371013760566711, + "learning_rate": 5.0210567259708135e-05, + "loss": 2.8648, + "step": 49778 + }, + { + "epoch": 2.44, + "grad_norm": 0.7675368785858154, + "learning_rate": 5.020204075527553e-05, + "loss": 2.9071, + "step": 49779 + }, + { + "epoch": 2.44, + "grad_norm": 0.7284151315689087, + "learning_rate": 5.0193514908762775e-05, + "loss": 2.9625, + "step": 49780 + }, + { + "epoch": 2.44, + "grad_norm": 0.7707260251045227, + "learning_rate": 5.0184989720192194e-05, + "loss": 2.9693, + "step": 49781 + }, + { + "epoch": 2.44, + "grad_norm": 0.7367619276046753, + "learning_rate": 5.017646518958619e-05, + "loss": 2.9056, + "step": 49782 + }, + { + "epoch": 2.44, + "grad_norm": 0.7275775074958801, + "learning_rate": 5.0167941316967417e-05, + "loss": 2.9327, + "step": 49783 + }, + { + "epoch": 2.44, + "grad_norm": 0.7671332359313965, + "learning_rate": 5.015941810235809e-05, + "loss": 2.7362, + "step": 49784 + }, + { + "epoch": 2.44, + "grad_norm": 0.7026022672653198, + "learning_rate": 5.015089554578087e-05, + "loss": 2.8569, + "step": 49785 + }, + { + "epoch": 2.44, + "grad_norm": 0.7949316501617432, + "learning_rate": 5.014237364725801e-05, + "loss": 2.9605, + "step": 49786 + }, + { + "epoch": 2.44, + "grad_norm": 0.7388706207275391, + "learning_rate": 5.013385240681216e-05, + "loss": 2.7446, + "step": 49787 + }, + { + "epoch": 2.44, + "grad_norm": 0.7240619659423828, + "learning_rate": 5.012533182446561e-05, + "loss": 2.7338, + "step": 49788 + }, + { + "epoch": 2.44, + "grad_norm": 0.7435773015022278, + "learning_rate": 5.011681190024082e-05, + "loss": 2.9415, + "step": 49789 + }, + { + "epoch": 2.44, + "grad_norm": 0.7612718939781189, + "learning_rate": 5.0108292634160294e-05, + "loss": 2.857, + "step": 49790 + }, + { + "epoch": 2.44, + "grad_norm": 0.7587111592292786, + "learning_rate": 5.0099774026246366e-05, + "loss": 3.0447, + "step": 49791 + }, + { + "epoch": 2.44, + "grad_norm": 0.7427108287811279, + "learning_rate": 5.009125607652163e-05, + "loss": 2.682, + "step": 49792 + }, + { + "epoch": 2.44, + "grad_norm": 0.75941002368927, + "learning_rate": 5.008273878500836e-05, + "loss": 2.8607, + "step": 49793 + }, + { + "epoch": 2.44, + "grad_norm": 0.7439859509468079, + "learning_rate": 5.007422215172905e-05, + "loss": 2.8351, + "step": 49794 + }, + { + "epoch": 2.44, + "grad_norm": 0.758802056312561, + "learning_rate": 5.006570617670619e-05, + "loss": 2.8682, + "step": 49795 + }, + { + "epoch": 2.44, + "grad_norm": 0.7315456867218018, + "learning_rate": 5.005719085996216e-05, + "loss": 3.1513, + "step": 49796 + }, + { + "epoch": 2.44, + "grad_norm": 0.7400312423706055, + "learning_rate": 5.004867620151941e-05, + "loss": 2.9039, + "step": 49797 + }, + { + "epoch": 2.44, + "grad_norm": 0.766927182674408, + "learning_rate": 5.0040162201400256e-05, + "loss": 3.0392, + "step": 49798 + }, + { + "epoch": 2.44, + "grad_norm": 0.756879985332489, + "learning_rate": 5.003164885962722e-05, + "loss": 2.956, + "step": 49799 + }, + { + "epoch": 2.44, + "grad_norm": 0.7322831153869629, + "learning_rate": 5.00231361762228e-05, + "loss": 2.8099, + "step": 49800 + }, + { + "epoch": 2.44, + "grad_norm": 0.7429162859916687, + "learning_rate": 5.0014624151209224e-05, + "loss": 2.9539, + "step": 49801 + }, + { + "epoch": 2.44, + "grad_norm": 0.7556778192520142, + "learning_rate": 5.000611278460911e-05, + "loss": 3.0639, + "step": 49802 + }, + { + "epoch": 2.44, + "grad_norm": 0.7347615957260132, + "learning_rate": 4.99976020764448e-05, + "loss": 2.9699, + "step": 49803 + }, + { + "epoch": 2.44, + "grad_norm": 0.7194696068763733, + "learning_rate": 4.998909202673858e-05, + "loss": 2.9908, + "step": 49804 + }, + { + "epoch": 2.44, + "grad_norm": 0.7030249238014221, + "learning_rate": 4.998058263551306e-05, + "loss": 2.8154, + "step": 49805 + }, + { + "epoch": 2.44, + "grad_norm": 0.7510625123977661, + "learning_rate": 4.997207390279052e-05, + "loss": 2.9402, + "step": 49806 + }, + { + "epoch": 2.44, + "grad_norm": 0.7080065011978149, + "learning_rate": 4.9963565828593455e-05, + "loss": 2.8348, + "step": 49807 + }, + { + "epoch": 2.44, + "grad_norm": 0.7457221150398254, + "learning_rate": 4.9955058412944205e-05, + "loss": 3.1089, + "step": 49808 + }, + { + "epoch": 2.44, + "grad_norm": 0.7274998426437378, + "learning_rate": 4.994655165586519e-05, + "loss": 2.8142, + "step": 49809 + }, + { + "epoch": 2.44, + "grad_norm": 0.7139880657196045, + "learning_rate": 4.993804555737888e-05, + "loss": 2.9572, + "step": 49810 + }, + { + "epoch": 2.44, + "grad_norm": 0.7105646133422852, + "learning_rate": 4.9929540117507674e-05, + "loss": 2.8571, + "step": 49811 + }, + { + "epoch": 2.44, + "grad_norm": 0.6922122240066528, + "learning_rate": 4.9921035336273904e-05, + "loss": 2.9435, + "step": 49812 + }, + { + "epoch": 2.44, + "grad_norm": 0.7425985932350159, + "learning_rate": 4.991253121369993e-05, + "loss": 2.8171, + "step": 49813 + }, + { + "epoch": 2.44, + "grad_norm": 0.7572745084762573, + "learning_rate": 4.990402774980824e-05, + "loss": 3.0096, + "step": 49814 + }, + { + "epoch": 2.44, + "grad_norm": 0.7443933486938477, + "learning_rate": 4.989552494462127e-05, + "loss": 2.6377, + "step": 49815 + }, + { + "epoch": 2.44, + "grad_norm": 0.7220988273620605, + "learning_rate": 4.9887022798161266e-05, + "loss": 3.0297, + "step": 49816 + }, + { + "epoch": 2.44, + "grad_norm": 0.7042877078056335, + "learning_rate": 4.987852131045079e-05, + "loss": 2.9561, + "step": 49817 + }, + { + "epoch": 2.44, + "grad_norm": 0.7358666062355042, + "learning_rate": 4.987002048151215e-05, + "loss": 3.0252, + "step": 49818 + }, + { + "epoch": 2.44, + "grad_norm": 0.7317989468574524, + "learning_rate": 4.986152031136766e-05, + "loss": 2.9775, + "step": 49819 + }, + { + "epoch": 2.44, + "grad_norm": 0.7053791284561157, + "learning_rate": 4.985302080003988e-05, + "loss": 2.7853, + "step": 49820 + }, + { + "epoch": 2.44, + "grad_norm": 0.7373185753822327, + "learning_rate": 4.984452194755101e-05, + "loss": 2.8829, + "step": 49821 + }, + { + "epoch": 2.44, + "grad_norm": 0.7272002100944519, + "learning_rate": 4.98360237539236e-05, + "loss": 3.0424, + "step": 49822 + }, + { + "epoch": 2.44, + "grad_norm": 0.7341743111610413, + "learning_rate": 4.9827526219179914e-05, + "loss": 2.8273, + "step": 49823 + }, + { + "epoch": 2.44, + "grad_norm": 0.7472624778747559, + "learning_rate": 4.981902934334242e-05, + "loss": 2.8973, + "step": 49824 + }, + { + "epoch": 2.44, + "grad_norm": 0.7138078808784485, + "learning_rate": 4.9810533126433475e-05, + "loss": 2.9605, + "step": 49825 + }, + { + "epoch": 2.44, + "grad_norm": 0.7111098766326904, + "learning_rate": 4.980203756847532e-05, + "loss": 3.0122, + "step": 49826 + }, + { + "epoch": 2.44, + "grad_norm": 0.7603220343589783, + "learning_rate": 4.979354266949057e-05, + "loss": 2.9321, + "step": 49827 + }, + { + "epoch": 2.44, + "grad_norm": 0.7600682973861694, + "learning_rate": 4.978504842950138e-05, + "loss": 2.6835, + "step": 49828 + }, + { + "epoch": 2.44, + "grad_norm": 0.7622936367988586, + "learning_rate": 4.9776554848530205e-05, + "loss": 2.6772, + "step": 49829 + }, + { + "epoch": 2.44, + "grad_norm": 0.7368950843811035, + "learning_rate": 4.976806192659951e-05, + "loss": 2.8547, + "step": 49830 + }, + { + "epoch": 2.44, + "grad_norm": 0.7676935195922852, + "learning_rate": 4.975956966373153e-05, + "loss": 2.8766, + "step": 49831 + }, + { + "epoch": 2.44, + "grad_norm": 0.7402030229568481, + "learning_rate": 4.975107805994872e-05, + "loss": 2.8607, + "step": 49832 + }, + { + "epoch": 2.44, + "grad_norm": 0.745424211025238, + "learning_rate": 4.974258711527346e-05, + "loss": 2.6656, + "step": 49833 + }, + { + "epoch": 2.44, + "grad_norm": 0.756466269493103, + "learning_rate": 4.973409682972794e-05, + "loss": 2.926, + "step": 49834 + }, + { + "epoch": 2.44, + "grad_norm": 0.7675303816795349, + "learning_rate": 4.972560720333473e-05, + "loss": 2.8321, + "step": 49835 + }, + { + "epoch": 2.44, + "grad_norm": 0.758357584476471, + "learning_rate": 4.971711823611606e-05, + "loss": 3.0498, + "step": 49836 + }, + { + "epoch": 2.44, + "grad_norm": 0.8041175603866577, + "learning_rate": 4.970862992809436e-05, + "loss": 2.9132, + "step": 49837 + }, + { + "epoch": 2.44, + "grad_norm": 0.7109980583190918, + "learning_rate": 4.9700142279291933e-05, + "loss": 2.7802, + "step": 49838 + }, + { + "epoch": 2.44, + "grad_norm": 0.785557746887207, + "learning_rate": 4.9691655289731205e-05, + "loss": 3.0044, + "step": 49839 + }, + { + "epoch": 2.44, + "grad_norm": 0.726588249206543, + "learning_rate": 4.968316895943448e-05, + "loss": 2.7941, + "step": 49840 + }, + { + "epoch": 2.44, + "grad_norm": 0.7485399842262268, + "learning_rate": 4.967468328842406e-05, + "loss": 2.9152, + "step": 49841 + }, + { + "epoch": 2.44, + "grad_norm": 0.740860104560852, + "learning_rate": 4.96661982767224e-05, + "loss": 2.8871, + "step": 49842 + }, + { + "epoch": 2.44, + "grad_norm": 0.790346622467041, + "learning_rate": 4.965771392435174e-05, + "loss": 3.1134, + "step": 49843 + }, + { + "epoch": 2.44, + "grad_norm": 0.7316777110099792, + "learning_rate": 4.964923023133448e-05, + "loss": 3.0394, + "step": 49844 + }, + { + "epoch": 2.44, + "grad_norm": 0.7692133784294128, + "learning_rate": 4.9640747197693007e-05, + "loss": 2.8604, + "step": 49845 + }, + { + "epoch": 2.44, + "grad_norm": 0.7087564468383789, + "learning_rate": 4.963226482344968e-05, + "loss": 2.8347, + "step": 49846 + }, + { + "epoch": 2.44, + "grad_norm": 0.7064667344093323, + "learning_rate": 4.962378310862672e-05, + "loss": 3.0958, + "step": 49847 + }, + { + "epoch": 2.44, + "grad_norm": 0.7500705122947693, + "learning_rate": 4.961530205324651e-05, + "loss": 2.8577, + "step": 49848 + }, + { + "epoch": 2.44, + "grad_norm": 0.7233101725578308, + "learning_rate": 4.960682165733137e-05, + "loss": 2.8902, + "step": 49849 + }, + { + "epoch": 2.44, + "grad_norm": 0.7733365893363953, + "learning_rate": 4.959834192090377e-05, + "loss": 2.8184, + "step": 49850 + }, + { + "epoch": 2.44, + "grad_norm": 0.7586425542831421, + "learning_rate": 4.9589862843985886e-05, + "loss": 2.9315, + "step": 49851 + }, + { + "epoch": 2.44, + "grad_norm": 0.6858609914779663, + "learning_rate": 4.958138442660016e-05, + "loss": 2.6727, + "step": 49852 + }, + { + "epoch": 2.44, + "grad_norm": 0.6984794735908508, + "learning_rate": 4.957290666876881e-05, + "loss": 2.9782, + "step": 49853 + }, + { + "epoch": 2.44, + "grad_norm": 0.7984606027603149, + "learning_rate": 4.956442957051429e-05, + "loss": 3.0055, + "step": 49854 + }, + { + "epoch": 2.44, + "grad_norm": 0.7043806314468384, + "learning_rate": 4.9555953131858875e-05, + "loss": 2.9892, + "step": 49855 + }, + { + "epoch": 2.44, + "grad_norm": 0.732420802116394, + "learning_rate": 4.9547477352824815e-05, + "loss": 2.8946, + "step": 49856 + }, + { + "epoch": 2.44, + "grad_norm": 0.7292214632034302, + "learning_rate": 4.9539002233434535e-05, + "loss": 2.8447, + "step": 49857 + }, + { + "epoch": 2.44, + "grad_norm": 0.7352430820465088, + "learning_rate": 4.95305277737103e-05, + "loss": 2.9061, + "step": 49858 + }, + { + "epoch": 2.44, + "grad_norm": 0.7342036962509155, + "learning_rate": 4.952205397367441e-05, + "loss": 3.0595, + "step": 49859 + }, + { + "epoch": 2.44, + "grad_norm": 0.6796708106994629, + "learning_rate": 4.951358083334931e-05, + "loss": 3.1384, + "step": 49860 + }, + { + "epoch": 2.44, + "grad_norm": 0.7156011462211609, + "learning_rate": 4.950510835275724e-05, + "loss": 2.8325, + "step": 49861 + }, + { + "epoch": 2.44, + "grad_norm": 0.7859975099563599, + "learning_rate": 4.9496636531920484e-05, + "loss": 2.9611, + "step": 49862 + }, + { + "epoch": 2.44, + "grad_norm": 0.7750837206840515, + "learning_rate": 4.948816537086133e-05, + "loss": 3.0251, + "step": 49863 + }, + { + "epoch": 2.44, + "grad_norm": 0.7656334638595581, + "learning_rate": 4.9479694869602123e-05, + "loss": 2.9598, + "step": 49864 + }, + { + "epoch": 2.44, + "grad_norm": 0.720266580581665, + "learning_rate": 4.9471225028165285e-05, + "loss": 2.8096, + "step": 49865 + }, + { + "epoch": 2.44, + "grad_norm": 0.7152171730995178, + "learning_rate": 4.946275584657292e-05, + "loss": 3.0708, + "step": 49866 + }, + { + "epoch": 2.44, + "grad_norm": 0.7371180653572083, + "learning_rate": 4.945428732484752e-05, + "loss": 2.7161, + "step": 49867 + }, + { + "epoch": 2.44, + "grad_norm": 0.7790018916130066, + "learning_rate": 4.9445819463011294e-05, + "loss": 2.8841, + "step": 49868 + }, + { + "epoch": 2.44, + "grad_norm": 0.7312780022621155, + "learning_rate": 4.9437352261086594e-05, + "loss": 3.1128, + "step": 49869 + }, + { + "epoch": 2.44, + "grad_norm": 0.7291393876075745, + "learning_rate": 4.94288857190956e-05, + "loss": 3.0187, + "step": 49870 + }, + { + "epoch": 2.44, + "grad_norm": 0.7577651739120483, + "learning_rate": 4.94204198370607e-05, + "loss": 2.9457, + "step": 49871 + }, + { + "epoch": 2.44, + "grad_norm": 0.7718584537506104, + "learning_rate": 4.941195461500424e-05, + "loss": 2.8328, + "step": 49872 + }, + { + "epoch": 2.44, + "grad_norm": 0.7502276301383972, + "learning_rate": 4.940349005294839e-05, + "loss": 2.9141, + "step": 49873 + }, + { + "epoch": 2.44, + "grad_norm": 0.7148169279098511, + "learning_rate": 4.939502615091562e-05, + "loss": 2.9375, + "step": 49874 + }, + { + "epoch": 2.44, + "grad_norm": 0.7954051494598389, + "learning_rate": 4.938656290892803e-05, + "loss": 2.9624, + "step": 49875 + }, + { + "epoch": 2.44, + "grad_norm": 0.7510395646095276, + "learning_rate": 4.937810032700811e-05, + "loss": 2.9211, + "step": 49876 + }, + { + "epoch": 2.44, + "grad_norm": 0.7023428678512573, + "learning_rate": 4.9369638405178e-05, + "loss": 3.2698, + "step": 49877 + }, + { + "epoch": 2.44, + "grad_norm": 0.7281398773193359, + "learning_rate": 4.936117714345996e-05, + "loss": 2.9131, + "step": 49878 + }, + { + "epoch": 2.44, + "grad_norm": 0.7942187190055847, + "learning_rate": 4.9352716541876436e-05, + "loss": 2.9639, + "step": 49879 + }, + { + "epoch": 2.44, + "grad_norm": 0.70033860206604, + "learning_rate": 4.934425660044951e-05, + "loss": 2.9204, + "step": 49880 + }, + { + "epoch": 2.44, + "grad_norm": 0.7631633281707764, + "learning_rate": 4.933579731920162e-05, + "loss": 3.0097, + "step": 49881 + }, + { + "epoch": 2.44, + "grad_norm": 0.7145242691040039, + "learning_rate": 4.932733869815503e-05, + "loss": 2.8329, + "step": 49882 + }, + { + "epoch": 2.44, + "grad_norm": 0.7550216913223267, + "learning_rate": 4.9318880737332015e-05, + "loss": 2.8755, + "step": 49883 + }, + { + "epoch": 2.44, + "grad_norm": 0.7773282527923584, + "learning_rate": 4.931042343675482e-05, + "loss": 2.9673, + "step": 49884 + }, + { + "epoch": 2.44, + "grad_norm": 0.7082733511924744, + "learning_rate": 4.9301966796445636e-05, + "loss": 2.9873, + "step": 49885 + }, + { + "epoch": 2.44, + "grad_norm": 0.6855611801147461, + "learning_rate": 4.9293510816426807e-05, + "loss": 2.7685, + "step": 49886 + }, + { + "epoch": 2.44, + "grad_norm": 0.696873664855957, + "learning_rate": 4.928505549672072e-05, + "loss": 3.0811, + "step": 49887 + }, + { + "epoch": 2.44, + "grad_norm": 0.7408643364906311, + "learning_rate": 4.927660083734948e-05, + "loss": 2.7622, + "step": 49888 + }, + { + "epoch": 2.44, + "grad_norm": 0.7321268916130066, + "learning_rate": 4.926814683833547e-05, + "loss": 2.9631, + "step": 49889 + }, + { + "epoch": 2.44, + "grad_norm": 0.7376499176025391, + "learning_rate": 4.925969349970093e-05, + "loss": 3.2182, + "step": 49890 + }, + { + "epoch": 2.45, + "grad_norm": 0.6759738922119141, + "learning_rate": 4.925124082146804e-05, + "loss": 2.8273, + "step": 49891 + }, + { + "epoch": 2.45, + "grad_norm": 0.7222961783409119, + "learning_rate": 4.924278880365917e-05, + "loss": 2.7853, + "step": 49892 + }, + { + "epoch": 2.45, + "grad_norm": 0.7377529740333557, + "learning_rate": 4.923433744629647e-05, + "loss": 3.1511, + "step": 49893 + }, + { + "epoch": 2.45, + "grad_norm": 0.8030790686607361, + "learning_rate": 4.922588674940231e-05, + "loss": 2.9492, + "step": 49894 + }, + { + "epoch": 2.45, + "grad_norm": 0.7189480066299438, + "learning_rate": 4.921743671299888e-05, + "loss": 2.8765, + "step": 49895 + }, + { + "epoch": 2.45, + "grad_norm": 0.7487554550170898, + "learning_rate": 4.920898733710843e-05, + "loss": 2.9909, + "step": 49896 + }, + { + "epoch": 2.45, + "grad_norm": 0.7432091236114502, + "learning_rate": 4.92005386217533e-05, + "loss": 2.8594, + "step": 49897 + }, + { + "epoch": 2.45, + "grad_norm": 0.7807390093803406, + "learning_rate": 4.919209056695572e-05, + "loss": 3.0344, + "step": 49898 + }, + { + "epoch": 2.45, + "grad_norm": 0.7431802749633789, + "learning_rate": 4.918364317273786e-05, + "loss": 2.9692, + "step": 49899 + }, + { + "epoch": 2.45, + "grad_norm": 0.7242401242256165, + "learning_rate": 4.917519643912198e-05, + "loss": 2.6837, + "step": 49900 + }, + { + "epoch": 2.45, + "grad_norm": 0.7620343565940857, + "learning_rate": 4.9166750366130345e-05, + "loss": 2.7063, + "step": 49901 + }, + { + "epoch": 2.45, + "grad_norm": 0.7530914545059204, + "learning_rate": 4.9158304953785286e-05, + "loss": 2.923, + "step": 49902 + }, + { + "epoch": 2.45, + "grad_norm": 0.7380638122558594, + "learning_rate": 4.914986020210892e-05, + "loss": 3.0402, + "step": 49903 + }, + { + "epoch": 2.45, + "grad_norm": 0.7419988512992859, + "learning_rate": 4.914141611112361e-05, + "loss": 2.7586, + "step": 49904 + }, + { + "epoch": 2.45, + "grad_norm": 0.7261704802513123, + "learning_rate": 4.913297268085152e-05, + "loss": 3.0975, + "step": 49905 + }, + { + "epoch": 2.45, + "grad_norm": 0.7934512495994568, + "learning_rate": 4.912452991131483e-05, + "loss": 2.8156, + "step": 49906 + }, + { + "epoch": 2.45, + "grad_norm": 0.7128746509552002, + "learning_rate": 4.911608780253593e-05, + "loss": 2.8872, + "step": 49907 + }, + { + "epoch": 2.45, + "grad_norm": 0.7485001683235168, + "learning_rate": 4.910764635453689e-05, + "loss": 2.9185, + "step": 49908 + }, + { + "epoch": 2.45, + "grad_norm": 0.7733572721481323, + "learning_rate": 4.909920556734009e-05, + "loss": 2.8178, + "step": 49909 + }, + { + "epoch": 2.45, + "grad_norm": 0.7357202172279358, + "learning_rate": 4.909076544096763e-05, + "loss": 2.7962, + "step": 49910 + }, + { + "epoch": 2.45, + "grad_norm": 1.1617584228515625, + "learning_rate": 4.9082325975441805e-05, + "loss": 3.1216, + "step": 49911 + }, + { + "epoch": 2.45, + "grad_norm": 0.7278109788894653, + "learning_rate": 4.907388717078498e-05, + "loss": 3.0274, + "step": 49912 + }, + { + "epoch": 2.45, + "grad_norm": 0.7125959992408752, + "learning_rate": 4.9065449027019125e-05, + "loss": 2.8246, + "step": 49913 + }, + { + "epoch": 2.45, + "grad_norm": 0.8183368444442749, + "learning_rate": 4.9057011544166625e-05, + "loss": 2.6864, + "step": 49914 + }, + { + "epoch": 2.45, + "grad_norm": 0.7317843437194824, + "learning_rate": 4.90485747222496e-05, + "loss": 2.7996, + "step": 49915 + }, + { + "epoch": 2.45, + "grad_norm": 0.8040397763252258, + "learning_rate": 4.904013856129036e-05, + "loss": 3.1255, + "step": 49916 + }, + { + "epoch": 2.45, + "grad_norm": 0.6812474131584167, + "learning_rate": 4.903170306131113e-05, + "loss": 2.9811, + "step": 49917 + }, + { + "epoch": 2.45, + "grad_norm": 0.7134155631065369, + "learning_rate": 4.9023268222334054e-05, + "loss": 2.9236, + "step": 49918 + }, + { + "epoch": 2.45, + "grad_norm": 0.730156421661377, + "learning_rate": 4.901483404438145e-05, + "loss": 3.0052, + "step": 49919 + }, + { + "epoch": 2.45, + "grad_norm": 0.7486832737922668, + "learning_rate": 4.9006400527475455e-05, + "loss": 2.9662, + "step": 49920 + }, + { + "epoch": 2.45, + "grad_norm": 0.7389925122261047, + "learning_rate": 4.8997967671638225e-05, + "loss": 2.913, + "step": 49921 + }, + { + "epoch": 2.45, + "grad_norm": 0.7639115452766418, + "learning_rate": 4.898953547689212e-05, + "loss": 2.9612, + "step": 49922 + }, + { + "epoch": 2.45, + "grad_norm": 0.8156792521476746, + "learning_rate": 4.89811039432592e-05, + "loss": 3.0135, + "step": 49923 + }, + { + "epoch": 2.45, + "grad_norm": 0.7284063696861267, + "learning_rate": 4.8972673070761836e-05, + "loss": 2.9221, + "step": 49924 + }, + { + "epoch": 2.45, + "grad_norm": 0.7258008122444153, + "learning_rate": 4.8964242859422033e-05, + "loss": 2.9088, + "step": 49925 + }, + { + "epoch": 2.45, + "grad_norm": 0.7349685430526733, + "learning_rate": 4.8955813309262204e-05, + "loss": 3.1959, + "step": 49926 + }, + { + "epoch": 2.45, + "grad_norm": 0.7146064043045044, + "learning_rate": 4.894738442030444e-05, + "loss": 2.7013, + "step": 49927 + }, + { + "epoch": 2.45, + "grad_norm": 0.75479656457901, + "learning_rate": 4.8938956192570847e-05, + "loss": 3.0321, + "step": 49928 + }, + { + "epoch": 2.45, + "grad_norm": 0.7491357922554016, + "learning_rate": 4.8930528626083846e-05, + "loss": 2.6871, + "step": 49929 + }, + { + "epoch": 2.45, + "grad_norm": 0.7528855800628662, + "learning_rate": 4.8922101720865384e-05, + "loss": 2.7284, + "step": 49930 + }, + { + "epoch": 2.45, + "grad_norm": 0.7925986647605896, + "learning_rate": 4.891367547693784e-05, + "loss": 2.9621, + "step": 49931 + }, + { + "epoch": 2.45, + "grad_norm": 0.7412816882133484, + "learning_rate": 4.890524989432341e-05, + "loss": 2.8603, + "step": 49932 + }, + { + "epoch": 2.45, + "grad_norm": 0.7685806751251221, + "learning_rate": 4.889682497304412e-05, + "loss": 2.7904, + "step": 49933 + }, + { + "epoch": 2.45, + "grad_norm": 0.7178759574890137, + "learning_rate": 4.8888400713122376e-05, + "loss": 2.9155, + "step": 49934 + }, + { + "epoch": 2.45, + "grad_norm": 0.7180420160293579, + "learning_rate": 4.887997711458025e-05, + "loss": 2.8599, + "step": 49935 + }, + { + "epoch": 2.45, + "grad_norm": 0.7158687710762024, + "learning_rate": 4.887155417743985e-05, + "loss": 2.684, + "step": 49936 + }, + { + "epoch": 2.45, + "grad_norm": 0.7684953212738037, + "learning_rate": 4.886313190172354e-05, + "loss": 2.7244, + "step": 49937 + }, + { + "epoch": 2.45, + "grad_norm": 0.7468008399009705, + "learning_rate": 4.885471028745332e-05, + "loss": 2.9189, + "step": 49938 + }, + { + "epoch": 2.45, + "grad_norm": 0.7421904802322388, + "learning_rate": 4.8846289334651554e-05, + "loss": 2.9611, + "step": 49939 + }, + { + "epoch": 2.45, + "grad_norm": 0.7222840785980225, + "learning_rate": 4.883786904334021e-05, + "loss": 2.7801, + "step": 49940 + }, + { + "epoch": 2.45, + "grad_norm": 0.7169114351272583, + "learning_rate": 4.882944941354169e-05, + "loss": 3.0289, + "step": 49941 + }, + { + "epoch": 2.45, + "grad_norm": 0.7277195453643799, + "learning_rate": 4.882103044527805e-05, + "loss": 2.8314, + "step": 49942 + }, + { + "epoch": 2.45, + "grad_norm": 0.7513947486877441, + "learning_rate": 4.8812612138571426e-05, + "loss": 2.7394, + "step": 49943 + }, + { + "epoch": 2.45, + "grad_norm": 0.7872393727302551, + "learning_rate": 4.880419449344406e-05, + "loss": 2.8735, + "step": 49944 + }, + { + "epoch": 2.45, + "grad_norm": 0.8093016743659973, + "learning_rate": 4.8795777509918064e-05, + "loss": 2.8194, + "step": 49945 + }, + { + "epoch": 2.45, + "grad_norm": 0.7250414490699768, + "learning_rate": 4.878736118801566e-05, + "loss": 2.9011, + "step": 49946 + }, + { + "epoch": 2.45, + "grad_norm": 0.7930007576942444, + "learning_rate": 4.877894552775906e-05, + "loss": 2.9038, + "step": 49947 + }, + { + "epoch": 2.45, + "grad_norm": 0.8064557909965515, + "learning_rate": 4.877053052917035e-05, + "loss": 2.7648, + "step": 49948 + }, + { + "epoch": 2.45, + "grad_norm": 0.7449289560317993, + "learning_rate": 4.876211619227174e-05, + "loss": 2.8408, + "step": 49949 + }, + { + "epoch": 2.45, + "grad_norm": 0.7603309154510498, + "learning_rate": 4.875370251708528e-05, + "loss": 2.8804, + "step": 49950 + }, + { + "epoch": 2.45, + "grad_norm": 0.7044733762741089, + "learning_rate": 4.874528950363325e-05, + "loss": 2.9399, + "step": 49951 + }, + { + "epoch": 2.45, + "grad_norm": 0.7044036388397217, + "learning_rate": 4.873687715193775e-05, + "loss": 2.7616, + "step": 49952 + }, + { + "epoch": 2.45, + "grad_norm": 0.7298387885093689, + "learning_rate": 4.8728465462020935e-05, + "loss": 2.9534, + "step": 49953 + }, + { + "epoch": 2.45, + "grad_norm": 0.7330809831619263, + "learning_rate": 4.872005443390508e-05, + "loss": 3.0583, + "step": 49954 + }, + { + "epoch": 2.45, + "grad_norm": 0.7165080904960632, + "learning_rate": 4.8711644067612145e-05, + "loss": 2.9493, + "step": 49955 + }, + { + "epoch": 2.45, + "grad_norm": 0.7864888310432434, + "learning_rate": 4.870323436316446e-05, + "loss": 2.9212, + "step": 49956 + }, + { + "epoch": 2.45, + "grad_norm": 0.7288803458213806, + "learning_rate": 4.8694825320584097e-05, + "loss": 2.8663, + "step": 49957 + }, + { + "epoch": 2.45, + "grad_norm": 0.7244741320610046, + "learning_rate": 4.8686416939893114e-05, + "loss": 2.874, + "step": 49958 + }, + { + "epoch": 2.45, + "grad_norm": 0.7267864346504211, + "learning_rate": 4.8678009221113826e-05, + "loss": 2.8159, + "step": 49959 + }, + { + "epoch": 2.45, + "grad_norm": 0.7464987635612488, + "learning_rate": 4.866960216426821e-05, + "loss": 2.8817, + "step": 49960 + }, + { + "epoch": 2.45, + "grad_norm": 0.7408061027526855, + "learning_rate": 4.866119576937858e-05, + "loss": 2.9639, + "step": 49961 + }, + { + "epoch": 2.45, + "grad_norm": 0.7390897870063782, + "learning_rate": 4.8652790036466925e-05, + "loss": 3.0278, + "step": 49962 + }, + { + "epoch": 2.45, + "grad_norm": 0.7427120804786682, + "learning_rate": 4.86443849655555e-05, + "loss": 3.0713, + "step": 49963 + }, + { + "epoch": 2.45, + "grad_norm": 0.7576953172683716, + "learning_rate": 4.86359805566664e-05, + "loss": 2.905, + "step": 49964 + }, + { + "epoch": 2.45, + "grad_norm": 0.7212951183319092, + "learning_rate": 4.8627576809821654e-05, + "loss": 3.0014, + "step": 49965 + }, + { + "epoch": 2.45, + "grad_norm": 0.761062741279602, + "learning_rate": 4.861917372504361e-05, + "loss": 2.7335, + "step": 49966 + }, + { + "epoch": 2.45, + "grad_norm": 0.7463256120681763, + "learning_rate": 4.861077130235419e-05, + "loss": 2.7925, + "step": 49967 + }, + { + "epoch": 2.45, + "grad_norm": 0.7250059843063354, + "learning_rate": 4.86023695417756e-05, + "loss": 2.9038, + "step": 49968 + }, + { + "epoch": 2.45, + "grad_norm": 0.7397086024284363, + "learning_rate": 4.859396844333009e-05, + "loss": 3.0888, + "step": 49969 + }, + { + "epoch": 2.45, + "grad_norm": 0.7244906425476074, + "learning_rate": 4.858556800703964e-05, + "loss": 3.0914, + "step": 49970 + }, + { + "epoch": 2.45, + "grad_norm": 0.7587701082229614, + "learning_rate": 4.857716823292644e-05, + "loss": 2.8498, + "step": 49971 + }, + { + "epoch": 2.45, + "grad_norm": 0.7259935736656189, + "learning_rate": 4.856876912101254e-05, + "loss": 2.7492, + "step": 49972 + }, + { + "epoch": 2.45, + "grad_norm": 0.7055765390396118, + "learning_rate": 4.856037067132012e-05, + "loss": 3.1046, + "step": 49973 + }, + { + "epoch": 2.45, + "grad_norm": 0.7304884195327759, + "learning_rate": 4.8551972883871313e-05, + "loss": 2.8807, + "step": 49974 + }, + { + "epoch": 2.45, + "grad_norm": 0.7568566203117371, + "learning_rate": 4.854357575868819e-05, + "loss": 2.9917, + "step": 49975 + }, + { + "epoch": 2.45, + "grad_norm": 0.7316765189170837, + "learning_rate": 4.8535179295792957e-05, + "loss": 3.0772, + "step": 49976 + }, + { + "epoch": 2.45, + "grad_norm": 0.7427469491958618, + "learning_rate": 4.8526783495207565e-05, + "loss": 2.8457, + "step": 49977 + }, + { + "epoch": 2.45, + "grad_norm": 0.7722180485725403, + "learning_rate": 4.851838835695433e-05, + "loss": 3.0185, + "step": 49978 + }, + { + "epoch": 2.45, + "grad_norm": 0.7619324922561646, + "learning_rate": 4.850999388105527e-05, + "loss": 2.7384, + "step": 49979 + }, + { + "epoch": 2.45, + "grad_norm": 0.7218577861785889, + "learning_rate": 4.850160006753239e-05, + "loss": 2.9204, + "step": 49980 + }, + { + "epoch": 2.45, + "grad_norm": 0.7448925375938416, + "learning_rate": 4.8493206916407976e-05, + "loss": 2.8308, + "step": 49981 + }, + { + "epoch": 2.45, + "grad_norm": 0.7064295411109924, + "learning_rate": 4.8484814427703986e-05, + "loss": 3.0217, + "step": 49982 + }, + { + "epoch": 2.45, + "grad_norm": 0.7662560939788818, + "learning_rate": 4.8476422601442564e-05, + "loss": 2.6942, + "step": 49983 + }, + { + "epoch": 2.45, + "grad_norm": 0.7750674486160278, + "learning_rate": 4.846803143764593e-05, + "loss": 2.761, + "step": 49984 + }, + { + "epoch": 2.45, + "grad_norm": 0.7387879490852356, + "learning_rate": 4.845964093633612e-05, + "loss": 2.8892, + "step": 49985 + }, + { + "epoch": 2.45, + "grad_norm": 0.7031064033508301, + "learning_rate": 4.845125109753516e-05, + "loss": 3.0094, + "step": 49986 + }, + { + "epoch": 2.45, + "grad_norm": 0.7086855173110962, + "learning_rate": 4.844286192126513e-05, + "loss": 2.8456, + "step": 49987 + }, + { + "epoch": 2.45, + "grad_norm": 0.7467719912528992, + "learning_rate": 4.843447340754821e-05, + "loss": 2.914, + "step": 49988 + }, + { + "epoch": 2.45, + "grad_norm": 0.7186830639839172, + "learning_rate": 4.8426085556406523e-05, + "loss": 3.1394, + "step": 49989 + }, + { + "epoch": 2.45, + "grad_norm": 0.7500671744346619, + "learning_rate": 4.841769836786205e-05, + "loss": 2.966, + "step": 49990 + }, + { + "epoch": 2.45, + "grad_norm": 0.7453409433364868, + "learning_rate": 4.840931184193703e-05, + "loss": 2.7501, + "step": 49991 + }, + { + "epoch": 2.45, + "grad_norm": 0.7155488133430481, + "learning_rate": 4.840092597865342e-05, + "loss": 2.8779, + "step": 49992 + }, + { + "epoch": 2.45, + "grad_norm": 0.7144157290458679, + "learning_rate": 4.8392540778033295e-05, + "loss": 3.0127, + "step": 49993 + }, + { + "epoch": 2.45, + "grad_norm": 0.7175836563110352, + "learning_rate": 4.838415624009888e-05, + "loss": 2.9094, + "step": 49994 + }, + { + "epoch": 2.45, + "grad_norm": 0.7162435054779053, + "learning_rate": 4.837577236487209e-05, + "loss": 2.8378, + "step": 49995 + }, + { + "epoch": 2.45, + "grad_norm": 0.7388350963592529, + "learning_rate": 4.8367389152375166e-05, + "loss": 2.8836, + "step": 49996 + }, + { + "epoch": 2.45, + "grad_norm": 0.7505469918251038, + "learning_rate": 4.835900660263003e-05, + "loss": 2.8015, + "step": 49997 + }, + { + "epoch": 2.45, + "grad_norm": 0.734264612197876, + "learning_rate": 4.835062471565887e-05, + "loss": 2.9549, + "step": 49998 + }, + { + "epoch": 2.45, + "grad_norm": 0.753314733505249, + "learning_rate": 4.834224349148379e-05, + "loss": 2.9851, + "step": 49999 + }, + { + "epoch": 2.45, + "grad_norm": 0.7283154726028442, + "learning_rate": 4.833386293012681e-05, + "loss": 2.9575, + "step": 50000 + }, + { + "epoch": 2.45, + "grad_norm": 0.6950429081916809, + "learning_rate": 4.832548303160998e-05, + "loss": 3.0719, + "step": 50001 + }, + { + "epoch": 2.45, + "grad_norm": 0.7324820756912231, + "learning_rate": 4.831710379595535e-05, + "loss": 2.906, + "step": 50002 + }, + { + "epoch": 2.45, + "grad_norm": 0.7295659780502319, + "learning_rate": 4.8308725223185e-05, + "loss": 2.8802, + "step": 50003 + }, + { + "epoch": 2.45, + "grad_norm": 0.7294290661811829, + "learning_rate": 4.830034731332111e-05, + "loss": 3.0532, + "step": 50004 + }, + { + "epoch": 2.45, + "grad_norm": 0.764670729637146, + "learning_rate": 4.82919700663856e-05, + "loss": 3.0021, + "step": 50005 + }, + { + "epoch": 2.45, + "grad_norm": 0.7712165713310242, + "learning_rate": 4.828359348240069e-05, + "loss": 2.8191, + "step": 50006 + }, + { + "epoch": 2.45, + "grad_norm": 0.749573290348053, + "learning_rate": 4.827521756138831e-05, + "loss": 3.0938, + "step": 50007 + }, + { + "epoch": 2.45, + "grad_norm": 0.7555610537528992, + "learning_rate": 4.8266842303370534e-05, + "loss": 2.7916, + "step": 50008 + }, + { + "epoch": 2.45, + "grad_norm": 0.716448187828064, + "learning_rate": 4.8258467708369495e-05, + "loss": 3.0179, + "step": 50009 + }, + { + "epoch": 2.45, + "grad_norm": 0.7162739038467407, + "learning_rate": 4.825009377640712e-05, + "loss": 2.8407, + "step": 50010 + }, + { + "epoch": 2.45, + "grad_norm": 0.7964804768562317, + "learning_rate": 4.824172050750564e-05, + "loss": 2.8375, + "step": 50011 + }, + { + "epoch": 2.45, + "grad_norm": 0.7051910161972046, + "learning_rate": 4.823334790168693e-05, + "loss": 2.7792, + "step": 50012 + }, + { + "epoch": 2.45, + "grad_norm": 0.7237557768821716, + "learning_rate": 4.822497595897315e-05, + "loss": 2.9245, + "step": 50013 + }, + { + "epoch": 2.45, + "grad_norm": 0.7709342837333679, + "learning_rate": 4.821660467938645e-05, + "loss": 2.8221, + "step": 50014 + }, + { + "epoch": 2.45, + "grad_norm": 0.7040632367134094, + "learning_rate": 4.8208234062948626e-05, + "loss": 2.7416, + "step": 50015 + }, + { + "epoch": 2.45, + "grad_norm": 0.6828323602676392, + "learning_rate": 4.819986410968192e-05, + "loss": 2.9322, + "step": 50016 + }, + { + "epoch": 2.45, + "grad_norm": 0.7632225751876831, + "learning_rate": 4.819149481960825e-05, + "loss": 2.7827, + "step": 50017 + }, + { + "epoch": 2.45, + "grad_norm": 0.7432723641395569, + "learning_rate": 4.8183126192749686e-05, + "loss": 2.7461, + "step": 50018 + }, + { + "epoch": 2.45, + "grad_norm": 0.7697473764419556, + "learning_rate": 4.81747582291284e-05, + "loss": 2.8574, + "step": 50019 + }, + { + "epoch": 2.45, + "grad_norm": 0.6835653185844421, + "learning_rate": 4.8166390928766286e-05, + "loss": 2.9083, + "step": 50020 + }, + { + "epoch": 2.45, + "grad_norm": 0.6899628639221191, + "learning_rate": 4.815802429168547e-05, + "loss": 2.7771, + "step": 50021 + }, + { + "epoch": 2.45, + "grad_norm": 0.7158491015434265, + "learning_rate": 4.814965831790797e-05, + "loss": 2.8758, + "step": 50022 + }, + { + "epoch": 2.45, + "grad_norm": 0.7541047930717468, + "learning_rate": 4.8141293007455727e-05, + "loss": 2.8541, + "step": 50023 + }, + { + "epoch": 2.45, + "grad_norm": 0.7383518815040588, + "learning_rate": 4.8132928360350934e-05, + "loss": 2.932, + "step": 50024 + }, + { + "epoch": 2.45, + "grad_norm": 0.736765444278717, + "learning_rate": 4.8124564376615424e-05, + "loss": 2.6686, + "step": 50025 + }, + { + "epoch": 2.45, + "grad_norm": 0.7256408929824829, + "learning_rate": 4.811620105627143e-05, + "loss": 3.1262, + "step": 50026 + }, + { + "epoch": 2.45, + "grad_norm": 0.7852066159248352, + "learning_rate": 4.810783839934082e-05, + "loss": 2.8989, + "step": 50027 + }, + { + "epoch": 2.45, + "grad_norm": 0.7440248131752014, + "learning_rate": 4.809947640584576e-05, + "loss": 2.8755, + "step": 50028 + }, + { + "epoch": 2.45, + "grad_norm": 0.7258433699607849, + "learning_rate": 4.809111507580819e-05, + "loss": 2.9438, + "step": 50029 + }, + { + "epoch": 2.45, + "grad_norm": 0.7557896375656128, + "learning_rate": 4.808275440925005e-05, + "loss": 2.8328, + "step": 50030 + }, + { + "epoch": 2.45, + "grad_norm": 0.7530323266983032, + "learning_rate": 4.8074394406193564e-05, + "loss": 2.973, + "step": 50031 + }, + { + "epoch": 2.45, + "grad_norm": 0.7720298767089844, + "learning_rate": 4.8066035066660544e-05, + "loss": 2.9985, + "step": 50032 + }, + { + "epoch": 2.45, + "grad_norm": 0.7010929584503174, + "learning_rate": 4.8057676390673184e-05, + "loss": 2.9103, + "step": 50033 + }, + { + "epoch": 2.45, + "grad_norm": 0.8532752990722656, + "learning_rate": 4.804931837825332e-05, + "loss": 2.9676, + "step": 50034 + }, + { + "epoch": 2.45, + "grad_norm": 0.7763757705688477, + "learning_rate": 4.804096102942311e-05, + "loss": 2.7373, + "step": 50035 + }, + { + "epoch": 2.45, + "grad_norm": 0.7673393487930298, + "learning_rate": 4.803260434420456e-05, + "loss": 2.762, + "step": 50036 + }, + { + "epoch": 2.45, + "grad_norm": 0.722378134727478, + "learning_rate": 4.802424832261963e-05, + "loss": 3.0049, + "step": 50037 + }, + { + "epoch": 2.45, + "grad_norm": 0.7699034810066223, + "learning_rate": 4.8015892964690364e-05, + "loss": 3.0195, + "step": 50038 + }, + { + "epoch": 2.45, + "grad_norm": 0.8004751801490784, + "learning_rate": 4.800753827043864e-05, + "loss": 2.8241, + "step": 50039 + }, + { + "epoch": 2.45, + "grad_norm": 0.7232542037963867, + "learning_rate": 4.799918423988658e-05, + "loss": 2.8742, + "step": 50040 + }, + { + "epoch": 2.45, + "grad_norm": 0.778252363204956, + "learning_rate": 4.799083087305623e-05, + "loss": 2.7134, + "step": 50041 + }, + { + "epoch": 2.45, + "grad_norm": 0.7834492325782776, + "learning_rate": 4.798247816996948e-05, + "loss": 2.9848, + "step": 50042 + }, + { + "epoch": 2.45, + "grad_norm": 0.7058895230293274, + "learning_rate": 4.7974126130648425e-05, + "loss": 3.0141, + "step": 50043 + }, + { + "epoch": 2.45, + "grad_norm": 0.7730516791343689, + "learning_rate": 4.7965774755115035e-05, + "loss": 2.8361, + "step": 50044 + }, + { + "epoch": 2.45, + "grad_norm": 0.7547017931938171, + "learning_rate": 4.795742404339122e-05, + "loss": 2.7368, + "step": 50045 + }, + { + "epoch": 2.45, + "grad_norm": 0.747089684009552, + "learning_rate": 4.79490739954991e-05, + "loss": 2.7331, + "step": 50046 + }, + { + "epoch": 2.45, + "grad_norm": 0.7834062576293945, + "learning_rate": 4.794072461146055e-05, + "loss": 2.8744, + "step": 50047 + }, + { + "epoch": 2.45, + "grad_norm": 0.7755687236785889, + "learning_rate": 4.793237589129772e-05, + "loss": 2.6205, + "step": 50048 + }, + { + "epoch": 2.45, + "grad_norm": 0.7240215539932251, + "learning_rate": 4.7924027835032385e-05, + "loss": 3.0052, + "step": 50049 + }, + { + "epoch": 2.45, + "grad_norm": 0.7895364165306091, + "learning_rate": 4.7915680442686745e-05, + "loss": 3.0121, + "step": 50050 + }, + { + "epoch": 2.45, + "grad_norm": 0.7438759207725525, + "learning_rate": 4.790733371428267e-05, + "loss": 2.6353, + "step": 50051 + }, + { + "epoch": 2.45, + "grad_norm": 0.7612919211387634, + "learning_rate": 4.789898764984211e-05, + "loss": 2.974, + "step": 50052 + }, + { + "epoch": 2.45, + "grad_norm": 0.7200536727905273, + "learning_rate": 4.7890642249387154e-05, + "loss": 2.8291, + "step": 50053 + }, + { + "epoch": 2.45, + "grad_norm": 0.744355320930481, + "learning_rate": 4.788229751293965e-05, + "loss": 2.8287, + "step": 50054 + }, + { + "epoch": 2.45, + "grad_norm": 0.7230464816093445, + "learning_rate": 4.787395344052167e-05, + "loss": 2.8453, + "step": 50055 + }, + { + "epoch": 2.45, + "grad_norm": 0.7152355909347534, + "learning_rate": 4.7865610032155246e-05, + "loss": 2.9215, + "step": 50056 + }, + { + "epoch": 2.45, + "grad_norm": 0.7902154922485352, + "learning_rate": 4.785726728786221e-05, + "loss": 2.7708, + "step": 50057 + }, + { + "epoch": 2.45, + "grad_norm": 0.7618465423583984, + "learning_rate": 4.784892520766466e-05, + "loss": 2.8228, + "step": 50058 + }, + { + "epoch": 2.45, + "grad_norm": 0.8009930849075317, + "learning_rate": 4.784058379158453e-05, + "loss": 2.6846, + "step": 50059 + }, + { + "epoch": 2.45, + "grad_norm": 0.7363435626029968, + "learning_rate": 4.7832243039643714e-05, + "loss": 2.7056, + "step": 50060 + }, + { + "epoch": 2.45, + "grad_norm": 0.7347774505615234, + "learning_rate": 4.782390295186429e-05, + "loss": 2.8223, + "step": 50061 + }, + { + "epoch": 2.45, + "grad_norm": 0.7615084648132324, + "learning_rate": 4.7815563528268105e-05, + "loss": 3.0836, + "step": 50062 + }, + { + "epoch": 2.45, + "grad_norm": 0.7269819378852844, + "learning_rate": 4.780722476887725e-05, + "loss": 2.7853, + "step": 50063 + }, + { + "epoch": 2.45, + "grad_norm": 0.7375494837760925, + "learning_rate": 4.779888667371356e-05, + "loss": 2.589, + "step": 50064 + }, + { + "epoch": 2.45, + "grad_norm": 0.7187495827674866, + "learning_rate": 4.779054924279917e-05, + "loss": 2.8879, + "step": 50065 + }, + { + "epoch": 2.45, + "grad_norm": 0.7238545417785645, + "learning_rate": 4.778221247615591e-05, + "loss": 2.8817, + "step": 50066 + }, + { + "epoch": 2.45, + "grad_norm": 0.7386276125907898, + "learning_rate": 4.7773876373805676e-05, + "loss": 2.8928, + "step": 50067 + }, + { + "epoch": 2.45, + "grad_norm": 0.720185399055481, + "learning_rate": 4.776554093577061e-05, + "loss": 2.9978, + "step": 50068 + }, + { + "epoch": 2.45, + "grad_norm": 0.7330771684646606, + "learning_rate": 4.775720616207249e-05, + "loss": 2.6761, + "step": 50069 + }, + { + "epoch": 2.45, + "grad_norm": 0.6842424273490906, + "learning_rate": 4.774887205273335e-05, + "loss": 2.7894, + "step": 50070 + }, + { + "epoch": 2.45, + "grad_norm": 0.7490212321281433, + "learning_rate": 4.774053860777518e-05, + "loss": 2.9307, + "step": 50071 + }, + { + "epoch": 2.45, + "grad_norm": 0.7413821220397949, + "learning_rate": 4.773220582721988e-05, + "loss": 2.7725, + "step": 50072 + }, + { + "epoch": 2.45, + "grad_norm": 0.6934046149253845, + "learning_rate": 4.7723873711089436e-05, + "loss": 2.8426, + "step": 50073 + }, + { + "epoch": 2.45, + "grad_norm": 0.7559138536453247, + "learning_rate": 4.7715542259405645e-05, + "loss": 2.9685, + "step": 50074 + }, + { + "epoch": 2.45, + "grad_norm": 0.7740827798843384, + "learning_rate": 4.770721147219061e-05, + "loss": 3.0002, + "step": 50075 + }, + { + "epoch": 2.45, + "grad_norm": 0.7439709305763245, + "learning_rate": 4.7698881349466265e-05, + "loss": 3.0176, + "step": 50076 + }, + { + "epoch": 2.45, + "grad_norm": 0.7715969085693359, + "learning_rate": 4.7690551891254436e-05, + "loss": 2.862, + "step": 50077 + }, + { + "epoch": 2.45, + "grad_norm": 0.7283157110214233, + "learning_rate": 4.76822230975772e-05, + "loss": 3.0025, + "step": 50078 + }, + { + "epoch": 2.45, + "grad_norm": 0.7790409326553345, + "learning_rate": 4.7673894968456375e-05, + "loss": 2.9668, + "step": 50079 + }, + { + "epoch": 2.45, + "grad_norm": 0.7363762259483337, + "learning_rate": 4.766556750391404e-05, + "loss": 2.8291, + "step": 50080 + }, + { + "epoch": 2.45, + "grad_norm": 0.723741888999939, + "learning_rate": 4.765724070397201e-05, + "loss": 3.0666, + "step": 50081 + }, + { + "epoch": 2.45, + "grad_norm": 0.7611827254295349, + "learning_rate": 4.7648914568652176e-05, + "loss": 2.9559, + "step": 50082 + }, + { + "epoch": 2.45, + "grad_norm": 0.7597978115081787, + "learning_rate": 4.764058909797661e-05, + "loss": 2.7526, + "step": 50083 + }, + { + "epoch": 2.45, + "grad_norm": 0.7658380270004272, + "learning_rate": 4.763226429196707e-05, + "loss": 2.6695, + "step": 50084 + }, + { + "epoch": 2.45, + "grad_norm": 0.7255105972290039, + "learning_rate": 4.76239401506456e-05, + "loss": 2.7215, + "step": 50085 + }, + { + "epoch": 2.45, + "grad_norm": 0.715186595916748, + "learning_rate": 4.761561667403418e-05, + "loss": 2.8727, + "step": 50086 + }, + { + "epoch": 2.45, + "grad_norm": 0.7413427233695984, + "learning_rate": 4.760729386215464e-05, + "loss": 2.8856, + "step": 50087 + }, + { + "epoch": 2.45, + "grad_norm": 0.7388500571250916, + "learning_rate": 4.759897171502891e-05, + "loss": 3.0116, + "step": 50088 + }, + { + "epoch": 2.45, + "grad_norm": 0.6811955571174622, + "learning_rate": 4.759065023267885e-05, + "loss": 3.0865, + "step": 50089 + }, + { + "epoch": 2.45, + "grad_norm": 0.7523310780525208, + "learning_rate": 4.758232941512644e-05, + "loss": 2.9596, + "step": 50090 + }, + { + "epoch": 2.45, + "grad_norm": 0.7621850371360779, + "learning_rate": 4.757400926239366e-05, + "loss": 2.8558, + "step": 50091 + }, + { + "epoch": 2.45, + "grad_norm": 0.697986900806427, + "learning_rate": 4.75656897745023e-05, + "loss": 2.8994, + "step": 50092 + }, + { + "epoch": 2.45, + "grad_norm": 0.722909688949585, + "learning_rate": 4.7557370951474404e-05, + "loss": 2.8762, + "step": 50093 + }, + { + "epoch": 2.45, + "grad_norm": 0.7483739852905273, + "learning_rate": 4.754905279333182e-05, + "loss": 2.8685, + "step": 50094 + }, + { + "epoch": 2.46, + "grad_norm": 0.7431113123893738, + "learning_rate": 4.754073530009633e-05, + "loss": 2.8286, + "step": 50095 + }, + { + "epoch": 2.46, + "grad_norm": 0.750974178314209, + "learning_rate": 4.7532418471790055e-05, + "loss": 3.0905, + "step": 50096 + }, + { + "epoch": 2.46, + "grad_norm": 0.7384655475616455, + "learning_rate": 4.752410230843474e-05, + "loss": 2.9023, + "step": 50097 + }, + { + "epoch": 2.46, + "grad_norm": 0.7771185636520386, + "learning_rate": 4.75157868100524e-05, + "loss": 2.6056, + "step": 50098 + }, + { + "epoch": 2.46, + "grad_norm": 0.7554793357849121, + "learning_rate": 4.750747197666482e-05, + "loss": 2.7112, + "step": 50099 + }, + { + "epoch": 2.46, + "grad_norm": 0.7633035182952881, + "learning_rate": 4.749915780829399e-05, + "loss": 2.9715, + "step": 50100 + }, + { + "epoch": 2.46, + "grad_norm": 0.7684073448181152, + "learning_rate": 4.749084430496184e-05, + "loss": 3.0109, + "step": 50101 + }, + { + "epoch": 2.46, + "grad_norm": 0.8110712170600891, + "learning_rate": 4.74825314666902e-05, + "loss": 2.9253, + "step": 50102 + }, + { + "epoch": 2.46, + "grad_norm": 0.6965272426605225, + "learning_rate": 4.747421929350098e-05, + "loss": 2.757, + "step": 50103 + }, + { + "epoch": 2.46, + "grad_norm": 0.834354817867279, + "learning_rate": 4.7465907785416005e-05, + "loss": 2.9619, + "step": 50104 + }, + { + "epoch": 2.46, + "grad_norm": 0.7369504570960999, + "learning_rate": 4.7457596942457274e-05, + "loss": 3.0316, + "step": 50105 + }, + { + "epoch": 2.46, + "grad_norm": 0.727476954460144, + "learning_rate": 4.744928676464659e-05, + "loss": 2.924, + "step": 50106 + }, + { + "epoch": 2.46, + "grad_norm": 0.7420288920402527, + "learning_rate": 4.744097725200585e-05, + "loss": 2.7727, + "step": 50107 + }, + { + "epoch": 2.46, + "grad_norm": 0.7241715788841248, + "learning_rate": 4.7432668404557054e-05, + "loss": 2.7901, + "step": 50108 + }, + { + "epoch": 2.46, + "grad_norm": 0.70522540807724, + "learning_rate": 4.7424360222322e-05, + "loss": 2.6263, + "step": 50109 + }, + { + "epoch": 2.46, + "grad_norm": 0.7506316304206848, + "learning_rate": 4.7416052705322584e-05, + "loss": 3.0343, + "step": 50110 + }, + { + "epoch": 2.46, + "grad_norm": 0.7315349578857422, + "learning_rate": 4.740774585358058e-05, + "loss": 3.0413, + "step": 50111 + }, + { + "epoch": 2.46, + "grad_norm": 0.7280415892601013, + "learning_rate": 4.739943966711798e-05, + "loss": 2.9838, + "step": 50112 + }, + { + "epoch": 2.46, + "grad_norm": 0.7337613701820374, + "learning_rate": 4.739113414595672e-05, + "loss": 3.1778, + "step": 50113 + }, + { + "epoch": 2.46, + "grad_norm": 0.7724540829658508, + "learning_rate": 4.73828292901185e-05, + "loss": 2.7373, + "step": 50114 + }, + { + "epoch": 2.46, + "grad_norm": 0.728657603263855, + "learning_rate": 4.737452509962538e-05, + "loss": 2.6146, + "step": 50115 + }, + { + "epoch": 2.46, + "grad_norm": 0.774933397769928, + "learning_rate": 4.736622157449913e-05, + "loss": 2.9025, + "step": 50116 + }, + { + "epoch": 2.46, + "grad_norm": 0.7137971520423889, + "learning_rate": 4.735791871476159e-05, + "loss": 2.8057, + "step": 50117 + }, + { + "epoch": 2.46, + "grad_norm": 0.723777711391449, + "learning_rate": 4.734961652043471e-05, + "loss": 2.9628, + "step": 50118 + }, + { + "epoch": 2.46, + "grad_norm": 0.7265045642852783, + "learning_rate": 4.734131499154027e-05, + "loss": 2.6844, + "step": 50119 + }, + { + "epoch": 2.46, + "grad_norm": 0.7293531894683838, + "learning_rate": 4.733301412810022e-05, + "loss": 2.9703, + "step": 50120 + }, + { + "epoch": 2.46, + "grad_norm": 0.7541471719741821, + "learning_rate": 4.7324713930136346e-05, + "loss": 2.9009, + "step": 50121 + }, + { + "epoch": 2.46, + "grad_norm": 0.7486446499824524, + "learning_rate": 4.7316414397670535e-05, + "loss": 2.8286, + "step": 50122 + }, + { + "epoch": 2.46, + "grad_norm": 0.7592146396636963, + "learning_rate": 4.730811553072472e-05, + "loss": 2.7649, + "step": 50123 + }, + { + "epoch": 2.46, + "grad_norm": 0.7211746573448181, + "learning_rate": 4.72998173293207e-05, + "loss": 2.611, + "step": 50124 + }, + { + "epoch": 2.46, + "grad_norm": 0.7064350843429565, + "learning_rate": 4.729151979348032e-05, + "loss": 2.9046, + "step": 50125 + }, + { + "epoch": 2.46, + "grad_norm": 0.7288532257080078, + "learning_rate": 4.7283222923225385e-05, + "loss": 2.9871, + "step": 50126 + }, + { + "epoch": 2.46, + "grad_norm": 0.7138639092445374, + "learning_rate": 4.727492671857782e-05, + "loss": 3.0573, + "step": 50127 + }, + { + "epoch": 2.46, + "grad_norm": 0.7234009504318237, + "learning_rate": 4.726663117955951e-05, + "loss": 2.9079, + "step": 50128 + }, + { + "epoch": 2.46, + "grad_norm": 0.7255566716194153, + "learning_rate": 4.725833630619217e-05, + "loss": 2.8949, + "step": 50129 + }, + { + "epoch": 2.46, + "grad_norm": 0.7464993596076965, + "learning_rate": 4.725004209849784e-05, + "loss": 2.8752, + "step": 50130 + }, + { + "epoch": 2.46, + "grad_norm": 0.7366446256637573, + "learning_rate": 4.724174855649822e-05, + "loss": 2.8406, + "step": 50131 + }, + { + "epoch": 2.46, + "grad_norm": 0.7235312461853027, + "learning_rate": 4.723345568021514e-05, + "loss": 2.7292, + "step": 50132 + }, + { + "epoch": 2.46, + "grad_norm": 0.7639561891555786, + "learning_rate": 4.7225163469670555e-05, + "loss": 2.93, + "step": 50133 + }, + { + "epoch": 2.46, + "grad_norm": 0.7084791660308838, + "learning_rate": 4.721687192488616e-05, + "loss": 3.0532, + "step": 50134 + }, + { + "epoch": 2.46, + "grad_norm": 0.7618387937545776, + "learning_rate": 4.720858104588399e-05, + "loss": 2.9025, + "step": 50135 + }, + { + "epoch": 2.46, + "grad_norm": 0.7836553454399109, + "learning_rate": 4.720029083268566e-05, + "loss": 2.9431, + "step": 50136 + }, + { + "epoch": 2.46, + "grad_norm": 0.7372074127197266, + "learning_rate": 4.7192001285313095e-05, + "loss": 2.4869, + "step": 50137 + }, + { + "epoch": 2.46, + "grad_norm": 0.7809687852859497, + "learning_rate": 4.718371240378824e-05, + "loss": 2.9196, + "step": 50138 + }, + { + "epoch": 2.46, + "grad_norm": 0.7471933960914612, + "learning_rate": 4.717542418813285e-05, + "loss": 3.0387, + "step": 50139 + }, + { + "epoch": 2.46, + "grad_norm": 0.7411444187164307, + "learning_rate": 4.7167136638368694e-05, + "loss": 3.055, + "step": 50140 + }, + { + "epoch": 2.46, + "grad_norm": 0.7536376118659973, + "learning_rate": 4.7158849754517604e-05, + "loss": 2.8807, + "step": 50141 + }, + { + "epoch": 2.46, + "grad_norm": 0.7287087440490723, + "learning_rate": 4.715056353660142e-05, + "loss": 3.0737, + "step": 50142 + }, + { + "epoch": 2.46, + "grad_norm": 0.7162652611732483, + "learning_rate": 4.7142277984642084e-05, + "loss": 3.0043, + "step": 50143 + }, + { + "epoch": 2.46, + "grad_norm": 0.7836141586303711, + "learning_rate": 4.7133993098661226e-05, + "loss": 2.9177, + "step": 50144 + }, + { + "epoch": 2.46, + "grad_norm": 0.7386676669120789, + "learning_rate": 4.712570887868086e-05, + "loss": 2.6577, + "step": 50145 + }, + { + "epoch": 2.46, + "grad_norm": 0.7532510757446289, + "learning_rate": 4.711742532472269e-05, + "loss": 2.8556, + "step": 50146 + }, + { + "epoch": 2.46, + "grad_norm": 0.7517036199569702, + "learning_rate": 4.710914243680851e-05, + "loss": 2.9074, + "step": 50147 + }, + { + "epoch": 2.46, + "grad_norm": 0.7926651239395142, + "learning_rate": 4.710086021496024e-05, + "loss": 2.6681, + "step": 50148 + }, + { + "epoch": 2.46, + "grad_norm": 0.7204487919807434, + "learning_rate": 4.709257865919954e-05, + "loss": 2.8429, + "step": 50149 + }, + { + "epoch": 2.46, + "grad_norm": 0.7931594252586365, + "learning_rate": 4.708429776954838e-05, + "loss": 2.6764, + "step": 50150 + }, + { + "epoch": 2.46, + "grad_norm": 0.7327009439468384, + "learning_rate": 4.7076017546028474e-05, + "loss": 2.8861, + "step": 50151 + }, + { + "epoch": 2.46, + "grad_norm": 0.7598114609718323, + "learning_rate": 4.7067737988661706e-05, + "loss": 2.5865, + "step": 50152 + }, + { + "epoch": 2.46, + "grad_norm": 0.7243731617927551, + "learning_rate": 4.705945909746984e-05, + "loss": 2.872, + "step": 50153 + }, + { + "epoch": 2.46, + "grad_norm": 0.7335036396980286, + "learning_rate": 4.705118087247461e-05, + "loss": 2.701, + "step": 50154 + }, + { + "epoch": 2.46, + "grad_norm": 0.7911424040794373, + "learning_rate": 4.7042903313697966e-05, + "loss": 2.7696, + "step": 50155 + }, + { + "epoch": 2.46, + "grad_norm": 0.6929256916046143, + "learning_rate": 4.703462642116158e-05, + "loss": 2.9583, + "step": 50156 + }, + { + "epoch": 2.46, + "grad_norm": 0.7362402081489563, + "learning_rate": 4.7026350194887276e-05, + "loss": 3.0049, + "step": 50157 + }, + { + "epoch": 2.46, + "grad_norm": 0.6989345550537109, + "learning_rate": 4.7018074634896966e-05, + "loss": 3.0146, + "step": 50158 + }, + { + "epoch": 2.46, + "grad_norm": 0.7882124781608582, + "learning_rate": 4.700979974121226e-05, + "loss": 2.9058, + "step": 50159 + }, + { + "epoch": 2.46, + "grad_norm": 0.7394413948059082, + "learning_rate": 4.7001525513855155e-05, + "loss": 3.002, + "step": 50160 + }, + { + "epoch": 2.46, + "grad_norm": 0.7787472009658813, + "learning_rate": 4.699325195284732e-05, + "loss": 2.9563, + "step": 50161 + }, + { + "epoch": 2.46, + "grad_norm": 0.7208383083343506, + "learning_rate": 4.698497905821052e-05, + "loss": 2.9109, + "step": 50162 + }, + { + "epoch": 2.46, + "grad_norm": 0.7691134214401245, + "learning_rate": 4.6976706829966646e-05, + "loss": 2.6999, + "step": 50163 + }, + { + "epoch": 2.46, + "grad_norm": 0.7439152598381042, + "learning_rate": 4.696843526813737e-05, + "loss": 2.7271, + "step": 50164 + }, + { + "epoch": 2.46, + "grad_norm": 0.7308996915817261, + "learning_rate": 4.69601643727446e-05, + "loss": 2.6876, + "step": 50165 + }, + { + "epoch": 2.46, + "grad_norm": 0.8340305685997009, + "learning_rate": 4.695189414380999e-05, + "loss": 3.0734, + "step": 50166 + }, + { + "epoch": 2.46, + "grad_norm": 0.6861405968666077, + "learning_rate": 4.694362458135544e-05, + "loss": 2.9448, + "step": 50167 + }, + { + "epoch": 2.46, + "grad_norm": 0.7236627340316772, + "learning_rate": 4.693535568540272e-05, + "loss": 2.7754, + "step": 50168 + }, + { + "epoch": 2.46, + "grad_norm": 0.7470681071281433, + "learning_rate": 4.6927087455973466e-05, + "loss": 2.9247, + "step": 50169 + }, + { + "epoch": 2.46, + "grad_norm": 0.7589225172996521, + "learning_rate": 4.6918819893089667e-05, + "loss": 2.902, + "step": 50170 + }, + { + "epoch": 2.46, + "grad_norm": 0.7107172608375549, + "learning_rate": 4.69105529967729e-05, + "loss": 2.9717, + "step": 50171 + }, + { + "epoch": 2.46, + "grad_norm": 0.7193456888198853, + "learning_rate": 4.690228676704502e-05, + "loss": 2.9345, + "step": 50172 + }, + { + "epoch": 2.46, + "grad_norm": 0.7701574563980103, + "learning_rate": 4.689402120392787e-05, + "loss": 2.6803, + "step": 50173 + }, + { + "epoch": 2.46, + "grad_norm": 0.7131604552268982, + "learning_rate": 4.688575630744317e-05, + "loss": 3.0103, + "step": 50174 + }, + { + "epoch": 2.46, + "grad_norm": 0.7473611831665039, + "learning_rate": 4.68774920776127e-05, + "loss": 2.9097, + "step": 50175 + }, + { + "epoch": 2.46, + "grad_norm": 0.7232372164726257, + "learning_rate": 4.6869228514458076e-05, + "loss": 2.8485, + "step": 50176 + }, + { + "epoch": 2.46, + "grad_norm": 0.7372998595237732, + "learning_rate": 4.6860965618001223e-05, + "loss": 2.696, + "step": 50177 + }, + { + "epoch": 2.46, + "grad_norm": 0.7132263779640198, + "learning_rate": 4.6852703388263946e-05, + "loss": 2.7469, + "step": 50178 + }, + { + "epoch": 2.46, + "grad_norm": 0.7559736967086792, + "learning_rate": 4.684444182526787e-05, + "loss": 2.8729, + "step": 50179 + }, + { + "epoch": 2.46, + "grad_norm": 0.7392177581787109, + "learning_rate": 4.6836180929034826e-05, + "loss": 2.8812, + "step": 50180 + }, + { + "epoch": 2.46, + "grad_norm": 0.7340937256813049, + "learning_rate": 4.6827920699586555e-05, + "loss": 2.896, + "step": 50181 + }, + { + "epoch": 2.46, + "grad_norm": 0.7307325005531311, + "learning_rate": 4.681966113694485e-05, + "loss": 2.8654, + "step": 50182 + }, + { + "epoch": 2.46, + "grad_norm": 0.7325087785720825, + "learning_rate": 4.6811402241131445e-05, + "loss": 2.8743, + "step": 50183 + }, + { + "epoch": 2.46, + "grad_norm": 0.7375662326812744, + "learning_rate": 4.680314401216798e-05, + "loss": 2.886, + "step": 50184 + }, + { + "epoch": 2.46, + "grad_norm": 0.7653541564941406, + "learning_rate": 4.679488645007641e-05, + "loss": 2.9494, + "step": 50185 + }, + { + "epoch": 2.46, + "grad_norm": 0.7421068549156189, + "learning_rate": 4.678662955487831e-05, + "loss": 3.1609, + "step": 50186 + }, + { + "epoch": 2.46, + "grad_norm": 0.7914893627166748, + "learning_rate": 4.6778373326595534e-05, + "loss": 3.062, + "step": 50187 + }, + { + "epoch": 2.46, + "grad_norm": 0.7131171226501465, + "learning_rate": 4.677011776524973e-05, + "loss": 2.5862, + "step": 50188 + }, + { + "epoch": 2.46, + "grad_norm": 0.7471457123756409, + "learning_rate": 4.6761862870862776e-05, + "loss": 3.0483, + "step": 50189 + }, + { + "epoch": 2.46, + "grad_norm": 0.7291505932807922, + "learning_rate": 4.6753608643456365e-05, + "loss": 2.9675, + "step": 50190 + }, + { + "epoch": 2.46, + "grad_norm": 0.7441790103912354, + "learning_rate": 4.674535508305211e-05, + "loss": 2.9056, + "step": 50191 + }, + { + "epoch": 2.46, + "grad_norm": 0.7517664432525635, + "learning_rate": 4.6737102189671916e-05, + "loss": 2.8592, + "step": 50192 + }, + { + "epoch": 2.46, + "grad_norm": 0.7586925029754639, + "learning_rate": 4.6728849963337415e-05, + "loss": 2.7071, + "step": 50193 + }, + { + "epoch": 2.46, + "grad_norm": 0.754487931728363, + "learning_rate": 4.672059840407034e-05, + "loss": 3.0674, + "step": 50194 + }, + { + "epoch": 2.46, + "grad_norm": 0.7265577912330627, + "learning_rate": 4.6712347511892556e-05, + "loss": 2.9569, + "step": 50195 + }, + { + "epoch": 2.46, + "grad_norm": 0.7128158807754517, + "learning_rate": 4.67040972868257e-05, + "loss": 3.096, + "step": 50196 + }, + { + "epoch": 2.46, + "grad_norm": 0.6750921607017517, + "learning_rate": 4.66958477288915e-05, + "loss": 2.7767, + "step": 50197 + }, + { + "epoch": 2.46, + "grad_norm": 0.724058210849762, + "learning_rate": 4.668759883811163e-05, + "loss": 2.9174, + "step": 50198 + }, + { + "epoch": 2.46, + "grad_norm": 0.7566876411437988, + "learning_rate": 4.667935061450787e-05, + "loss": 2.9974, + "step": 50199 + }, + { + "epoch": 2.46, + "grad_norm": 0.7634623050689697, + "learning_rate": 4.667110305810201e-05, + "loss": 2.8993, + "step": 50200 + }, + { + "epoch": 2.46, + "grad_norm": 0.7499482035636902, + "learning_rate": 4.666285616891563e-05, + "loss": 3.0086, + "step": 50201 + }, + { + "epoch": 2.46, + "grad_norm": 0.7458711862564087, + "learning_rate": 4.6654609946970614e-05, + "loss": 3.0176, + "step": 50202 + }, + { + "epoch": 2.46, + "grad_norm": 0.7508730292320251, + "learning_rate": 4.664636439228854e-05, + "loss": 2.8624, + "step": 50203 + }, + { + "epoch": 2.46, + "grad_norm": 0.7427207231521606, + "learning_rate": 4.663811950489122e-05, + "loss": 2.8796, + "step": 50204 + }, + { + "epoch": 2.46, + "grad_norm": 0.7459037899971008, + "learning_rate": 4.6629875284800385e-05, + "loss": 2.821, + "step": 50205 + }, + { + "epoch": 2.46, + "grad_norm": 0.6995728611946106, + "learning_rate": 4.662163173203758e-05, + "loss": 2.9301, + "step": 50206 + }, + { + "epoch": 2.46, + "grad_norm": 0.7462338805198669, + "learning_rate": 4.661338884662471e-05, + "loss": 2.9296, + "step": 50207 + }, + { + "epoch": 2.46, + "grad_norm": 0.7074584364891052, + "learning_rate": 4.6605146628583365e-05, + "loss": 2.956, + "step": 50208 + }, + { + "epoch": 2.46, + "grad_norm": 0.745572030544281, + "learning_rate": 4.6596905077935266e-05, + "loss": 2.9507, + "step": 50209 + }, + { + "epoch": 2.46, + "grad_norm": 0.7482078075408936, + "learning_rate": 4.658866419470225e-05, + "loss": 2.9588, + "step": 50210 + }, + { + "epoch": 2.46, + "grad_norm": 0.6879667043685913, + "learning_rate": 4.658042397890591e-05, + "loss": 2.7677, + "step": 50211 + }, + { + "epoch": 2.46, + "grad_norm": 0.7026304602622986, + "learning_rate": 4.657218443056796e-05, + "loss": 3.1308, + "step": 50212 + }, + { + "epoch": 2.46, + "grad_norm": 0.7168485522270203, + "learning_rate": 4.656394554971002e-05, + "loss": 2.6956, + "step": 50213 + }, + { + "epoch": 2.46, + "grad_norm": 0.7557949423789978, + "learning_rate": 4.655570733635389e-05, + "loss": 2.959, + "step": 50214 + }, + { + "epoch": 2.46, + "grad_norm": 0.7124577760696411, + "learning_rate": 4.654746979052133e-05, + "loss": 2.9676, + "step": 50215 + }, + { + "epoch": 2.46, + "grad_norm": 0.7355839014053345, + "learning_rate": 4.653923291223386e-05, + "loss": 3.0602, + "step": 50216 + }, + { + "epoch": 2.46, + "grad_norm": 0.7148059606552124, + "learning_rate": 4.653099670151336e-05, + "loss": 2.8388, + "step": 50217 + }, + { + "epoch": 2.46, + "grad_norm": 0.7521902918815613, + "learning_rate": 4.6522761158381407e-05, + "loss": 2.7998, + "step": 50218 + }, + { + "epoch": 2.46, + "grad_norm": 0.7478179335594177, + "learning_rate": 4.651452628285969e-05, + "loss": 3.0467, + "step": 50219 + }, + { + "epoch": 2.46, + "grad_norm": 0.7289862036705017, + "learning_rate": 4.6506292074969955e-05, + "loss": 2.8849, + "step": 50220 + }, + { + "epoch": 2.46, + "grad_norm": 0.7552449703216553, + "learning_rate": 4.649805853473382e-05, + "loss": 2.9719, + "step": 50221 + }, + { + "epoch": 2.46, + "grad_norm": 0.7331597805023193, + "learning_rate": 4.648982566217307e-05, + "loss": 3.1323, + "step": 50222 + }, + { + "epoch": 2.46, + "grad_norm": 0.7304408550262451, + "learning_rate": 4.648159345730927e-05, + "loss": 2.7518, + "step": 50223 + }, + { + "epoch": 2.46, + "grad_norm": 0.7600448727607727, + "learning_rate": 4.647336192016413e-05, + "loss": 2.8241, + "step": 50224 + }, + { + "epoch": 2.46, + "grad_norm": 0.7154588103294373, + "learning_rate": 4.646513105075949e-05, + "loss": 2.7548, + "step": 50225 + }, + { + "epoch": 2.46, + "grad_norm": 0.729921281337738, + "learning_rate": 4.645690084911685e-05, + "loss": 2.94, + "step": 50226 + }, + { + "epoch": 2.46, + "grad_norm": 0.7225162386894226, + "learning_rate": 4.644867131525795e-05, + "loss": 2.8544, + "step": 50227 + }, + { + "epoch": 2.46, + "grad_norm": 0.7441264390945435, + "learning_rate": 4.6440442449204386e-05, + "loss": 3.0495, + "step": 50228 + }, + { + "epoch": 2.46, + "grad_norm": 0.7257991433143616, + "learning_rate": 4.643221425097792e-05, + "loss": 2.7045, + "step": 50229 + }, + { + "epoch": 2.46, + "grad_norm": 0.7100118398666382, + "learning_rate": 4.642398672060026e-05, + "loss": 2.7428, + "step": 50230 + }, + { + "epoch": 2.46, + "grad_norm": 0.7242577075958252, + "learning_rate": 4.6415759858092926e-05, + "loss": 2.9569, + "step": 50231 + }, + { + "epoch": 2.46, + "grad_norm": 0.7217210531234741, + "learning_rate": 4.6407533663477766e-05, + "loss": 3.0893, + "step": 50232 + }, + { + "epoch": 2.46, + "grad_norm": 0.7589887380599976, + "learning_rate": 4.6399308136776336e-05, + "loss": 2.7432, + "step": 50233 + }, + { + "epoch": 2.46, + "grad_norm": 0.6991437077522278, + "learning_rate": 4.639108327801028e-05, + "loss": 3.0003, + "step": 50234 + }, + { + "epoch": 2.46, + "grad_norm": 0.7275975942611694, + "learning_rate": 4.638285908720137e-05, + "loss": 2.8634, + "step": 50235 + }, + { + "epoch": 2.46, + "grad_norm": 0.7588894367218018, + "learning_rate": 4.637463556437111e-05, + "loss": 3.0073, + "step": 50236 + }, + { + "epoch": 2.46, + "grad_norm": 0.7242416739463806, + "learning_rate": 4.6366412709541336e-05, + "loss": 3.0063, + "step": 50237 + }, + { + "epoch": 2.46, + "grad_norm": 0.7257980704307556, + "learning_rate": 4.635819052273353e-05, + "loss": 3.0214, + "step": 50238 + }, + { + "epoch": 2.46, + "grad_norm": 0.7471466064453125, + "learning_rate": 4.6349969003969444e-05, + "loss": 2.9178, + "step": 50239 + }, + { + "epoch": 2.46, + "grad_norm": 0.746959924697876, + "learning_rate": 4.6341748153270864e-05, + "loss": 2.9513, + "step": 50240 + }, + { + "epoch": 2.46, + "grad_norm": 0.7836167812347412, + "learning_rate": 4.633352797065917e-05, + "loss": 2.9061, + "step": 50241 + }, + { + "epoch": 2.46, + "grad_norm": 0.764286994934082, + "learning_rate": 4.632530845615621e-05, + "loss": 2.8445, + "step": 50242 + }, + { + "epoch": 2.46, + "grad_norm": 0.7559465765953064, + "learning_rate": 4.63170896097835e-05, + "loss": 2.8471, + "step": 50243 + }, + { + "epoch": 2.46, + "grad_norm": 0.7415547966957092, + "learning_rate": 4.630887143156276e-05, + "loss": 2.8428, + "step": 50244 + }, + { + "epoch": 2.46, + "grad_norm": 0.7760792970657349, + "learning_rate": 4.6300653921515665e-05, + "loss": 2.8229, + "step": 50245 + }, + { + "epoch": 2.46, + "grad_norm": 0.7538564801216125, + "learning_rate": 4.6292437079663766e-05, + "loss": 2.8987, + "step": 50246 + }, + { + "epoch": 2.46, + "grad_norm": 0.7254053354263306, + "learning_rate": 4.6284220906028854e-05, + "loss": 2.9147, + "step": 50247 + }, + { + "epoch": 2.46, + "grad_norm": 0.808498203754425, + "learning_rate": 4.627600540063248e-05, + "loss": 2.7537, + "step": 50248 + }, + { + "epoch": 2.46, + "grad_norm": 0.7465435862541199, + "learning_rate": 4.626779056349617e-05, + "loss": 2.7917, + "step": 50249 + }, + { + "epoch": 2.46, + "grad_norm": 0.7259182929992676, + "learning_rate": 4.625957639464177e-05, + "loss": 2.9146, + "step": 50250 + }, + { + "epoch": 2.46, + "grad_norm": 0.7807119488716125, + "learning_rate": 4.6251362894090715e-05, + "loss": 2.967, + "step": 50251 + }, + { + "epoch": 2.46, + "grad_norm": 0.744609534740448, + "learning_rate": 4.624315006186482e-05, + "loss": 2.8336, + "step": 50252 + }, + { + "epoch": 2.46, + "grad_norm": 0.703451931476593, + "learning_rate": 4.623493789798557e-05, + "loss": 2.8774, + "step": 50253 + }, + { + "epoch": 2.46, + "grad_norm": 0.8264244198799133, + "learning_rate": 4.622672640247471e-05, + "loss": 2.7383, + "step": 50254 + }, + { + "epoch": 2.46, + "grad_norm": 0.736638069152832, + "learning_rate": 4.621851557535383e-05, + "loss": 2.7002, + "step": 50255 + }, + { + "epoch": 2.46, + "grad_norm": 0.7538456320762634, + "learning_rate": 4.621030541664444e-05, + "loss": 2.9702, + "step": 50256 + }, + { + "epoch": 2.46, + "grad_norm": 0.7488062977790833, + "learning_rate": 4.6202095926368366e-05, + "loss": 2.812, + "step": 50257 + }, + { + "epoch": 2.46, + "grad_norm": 0.7549282908439636, + "learning_rate": 4.619388710454704e-05, + "loss": 2.9381, + "step": 50258 + }, + { + "epoch": 2.46, + "grad_norm": 0.7125292420387268, + "learning_rate": 4.618567895120223e-05, + "loss": 3.0659, + "step": 50259 + }, + { + "epoch": 2.46, + "grad_norm": 0.8093187808990479, + "learning_rate": 4.617747146635544e-05, + "loss": 2.9393, + "step": 50260 + }, + { + "epoch": 2.46, + "grad_norm": 0.7467921376228333, + "learning_rate": 4.616926465002836e-05, + "loss": 2.9444, + "step": 50261 + }, + { + "epoch": 2.46, + "grad_norm": 0.774636447429657, + "learning_rate": 4.616105850224263e-05, + "loss": 2.9592, + "step": 50262 + }, + { + "epoch": 2.46, + "grad_norm": 0.7182961702346802, + "learning_rate": 4.615285302301981e-05, + "loss": 2.8498, + "step": 50263 + }, + { + "epoch": 2.46, + "grad_norm": 0.7876618504524231, + "learning_rate": 4.614464821238154e-05, + "loss": 2.8896, + "step": 50264 + }, + { + "epoch": 2.46, + "grad_norm": 0.7740433812141418, + "learning_rate": 4.613644407034934e-05, + "loss": 3.0115, + "step": 50265 + }, + { + "epoch": 2.46, + "grad_norm": 0.7827932834625244, + "learning_rate": 4.612824059694486e-05, + "loss": 3.0513, + "step": 50266 + }, + { + "epoch": 2.46, + "grad_norm": 0.7410049438476562, + "learning_rate": 4.6120037792189854e-05, + "loss": 2.7407, + "step": 50267 + }, + { + "epoch": 2.46, + "grad_norm": 0.7162790298461914, + "learning_rate": 4.61118356561057e-05, + "loss": 2.7981, + "step": 50268 + }, + { + "epoch": 2.46, + "grad_norm": 0.7440741658210754, + "learning_rate": 4.6103634188714214e-05, + "loss": 2.9791, + "step": 50269 + }, + { + "epoch": 2.46, + "grad_norm": 0.715707540512085, + "learning_rate": 4.6095433390036876e-05, + "loss": 2.86, + "step": 50270 + }, + { + "epoch": 2.46, + "grad_norm": 0.7865996956825256, + "learning_rate": 4.60872332600952e-05, + "loss": 2.8264, + "step": 50271 + }, + { + "epoch": 2.46, + "grad_norm": 0.7152213454246521, + "learning_rate": 4.607903379891098e-05, + "loss": 2.9668, + "step": 50272 + }, + { + "epoch": 2.46, + "grad_norm": 0.7056474089622498, + "learning_rate": 4.6070835006505655e-05, + "loss": 2.9386, + "step": 50273 + }, + { + "epoch": 2.46, + "grad_norm": 0.7383698225021362, + "learning_rate": 4.606263688290095e-05, + "loss": 3.0749, + "step": 50274 + }, + { + "epoch": 2.46, + "grad_norm": 0.7201040387153625, + "learning_rate": 4.6054439428118306e-05, + "loss": 2.8478, + "step": 50275 + }, + { + "epoch": 2.46, + "grad_norm": 0.7062273025512695, + "learning_rate": 4.6046242642179474e-05, + "loss": 2.9832, + "step": 50276 + }, + { + "epoch": 2.46, + "grad_norm": 0.687692403793335, + "learning_rate": 4.6038046525105974e-05, + "loss": 2.6567, + "step": 50277 + }, + { + "epoch": 2.46, + "grad_norm": 0.7824928760528564, + "learning_rate": 4.602985107691929e-05, + "loss": 2.8693, + "step": 50278 + }, + { + "epoch": 2.46, + "grad_norm": 0.7675195932388306, + "learning_rate": 4.60216562976412e-05, + "loss": 2.7285, + "step": 50279 + }, + { + "epoch": 2.46, + "grad_norm": 0.7547996640205383, + "learning_rate": 4.6013462187293085e-05, + "loss": 2.709, + "step": 50280 + }, + { + "epoch": 2.46, + "grad_norm": 0.7081446647644043, + "learning_rate": 4.600526874589666e-05, + "loss": 2.9257, + "step": 50281 + }, + { + "epoch": 2.46, + "grad_norm": 0.7581326961517334, + "learning_rate": 4.599707597347352e-05, + "loss": 2.8561, + "step": 50282 + }, + { + "epoch": 2.46, + "grad_norm": 0.6948836445808411, + "learning_rate": 4.5988883870045135e-05, + "loss": 2.9192, + "step": 50283 + }, + { + "epoch": 2.46, + "grad_norm": 0.7415758371353149, + "learning_rate": 4.598069243563323e-05, + "loss": 2.7314, + "step": 50284 + }, + { + "epoch": 2.46, + "grad_norm": 0.7500677704811096, + "learning_rate": 4.5972501670259275e-05, + "loss": 2.8871, + "step": 50285 + }, + { + "epoch": 2.46, + "grad_norm": 0.739080548286438, + "learning_rate": 4.59643115739448e-05, + "loss": 3.08, + "step": 50286 + }, + { + "epoch": 2.46, + "grad_norm": 0.7174320816993713, + "learning_rate": 4.595612214671151e-05, + "loss": 3.0523, + "step": 50287 + }, + { + "epoch": 2.46, + "grad_norm": 0.7595369219779968, + "learning_rate": 4.5947933388580835e-05, + "loss": 3.0033, + "step": 50288 + }, + { + "epoch": 2.46, + "grad_norm": 0.741030216217041, + "learning_rate": 4.593974529957448e-05, + "loss": 2.7456, + "step": 50289 + }, + { + "epoch": 2.46, + "grad_norm": 0.7448769807815552, + "learning_rate": 4.5931557879713865e-05, + "loss": 2.9838, + "step": 50290 + }, + { + "epoch": 2.46, + "grad_norm": 0.744935154914856, + "learning_rate": 4.5923371129020704e-05, + "loss": 2.8322, + "step": 50291 + }, + { + "epoch": 2.46, + "grad_norm": 0.7401701211929321, + "learning_rate": 4.591518504751648e-05, + "loss": 3.0576, + "step": 50292 + }, + { + "epoch": 2.46, + "grad_norm": 0.748890221118927, + "learning_rate": 4.5906999635222706e-05, + "loss": 2.9098, + "step": 50293 + }, + { + "epoch": 2.46, + "grad_norm": 0.7550206780433655, + "learning_rate": 4.589881489216104e-05, + "loss": 2.846, + "step": 50294 + }, + { + "epoch": 2.46, + "grad_norm": 0.7691561579704285, + "learning_rate": 4.5890630818352926e-05, + "loss": 2.8187, + "step": 50295 + }, + { + "epoch": 2.46, + "grad_norm": 0.7871925234794617, + "learning_rate": 4.5882447413820015e-05, + "loss": 2.7289, + "step": 50296 + }, + { + "epoch": 2.46, + "grad_norm": 0.7003697752952576, + "learning_rate": 4.587426467858388e-05, + "loss": 2.8819, + "step": 50297 + }, + { + "epoch": 2.46, + "grad_norm": 0.7725687026977539, + "learning_rate": 4.586608261266602e-05, + "loss": 2.8305, + "step": 50298 + }, + { + "epoch": 2.47, + "grad_norm": 0.7272656559944153, + "learning_rate": 4.585790121608798e-05, + "loss": 3.1106, + "step": 50299 + }, + { + "epoch": 2.47, + "grad_norm": 0.7602006196975708, + "learning_rate": 4.584972048887127e-05, + "loss": 2.934, + "step": 50300 + }, + { + "epoch": 2.47, + "grad_norm": 0.7348072528839111, + "learning_rate": 4.584154043103747e-05, + "loss": 3.0664, + "step": 50301 + }, + { + "epoch": 2.47, + "grad_norm": 0.752991795539856, + "learning_rate": 4.583336104260821e-05, + "loss": 2.8116, + "step": 50302 + }, + { + "epoch": 2.47, + "grad_norm": 0.7089395523071289, + "learning_rate": 4.582518232360489e-05, + "loss": 2.9561, + "step": 50303 + }, + { + "epoch": 2.47, + "grad_norm": 0.7611316442489624, + "learning_rate": 4.5817004274049205e-05, + "loss": 2.9413, + "step": 50304 + }, + { + "epoch": 2.47, + "grad_norm": 0.7629949450492859, + "learning_rate": 4.5808826893962535e-05, + "loss": 2.7534, + "step": 50305 + }, + { + "epoch": 2.47, + "grad_norm": 0.7246274948120117, + "learning_rate": 4.580065018336656e-05, + "loss": 2.9361, + "step": 50306 + }, + { + "epoch": 2.47, + "grad_norm": 0.7618169784545898, + "learning_rate": 4.5792474142282766e-05, + "loss": 2.8229, + "step": 50307 + }, + { + "epoch": 2.47, + "grad_norm": 0.7503147721290588, + "learning_rate": 4.5784298770732595e-05, + "loss": 3.0257, + "step": 50308 + }, + { + "epoch": 2.47, + "grad_norm": 0.721960723400116, + "learning_rate": 4.577612406873774e-05, + "loss": 2.9242, + "step": 50309 + }, + { + "epoch": 2.47, + "grad_norm": 0.7781165242195129, + "learning_rate": 4.576795003631954e-05, + "loss": 2.9042, + "step": 50310 + }, + { + "epoch": 2.47, + "grad_norm": 0.7347601056098938, + "learning_rate": 4.575977667349966e-05, + "loss": 2.8163, + "step": 50311 + }, + { + "epoch": 2.47, + "grad_norm": 0.7000023722648621, + "learning_rate": 4.575160398029967e-05, + "loss": 2.8964, + "step": 50312 + }, + { + "epoch": 2.47, + "grad_norm": 0.7539365887641907, + "learning_rate": 4.574343195674102e-05, + "loss": 3.0921, + "step": 50313 + }, + { + "epoch": 2.47, + "grad_norm": 0.6888923645019531, + "learning_rate": 4.573526060284523e-05, + "loss": 2.8024, + "step": 50314 + }, + { + "epoch": 2.47, + "grad_norm": 0.7311072945594788, + "learning_rate": 4.572708991863374e-05, + "loss": 2.6267, + "step": 50315 + }, + { + "epoch": 2.47, + "grad_norm": 0.7336209416389465, + "learning_rate": 4.571891990412822e-05, + "loss": 2.8586, + "step": 50316 + }, + { + "epoch": 2.47, + "grad_norm": 0.7051305174827576, + "learning_rate": 4.5710750559350126e-05, + "loss": 2.9322, + "step": 50317 + }, + { + "epoch": 2.47, + "grad_norm": 0.7536783814430237, + "learning_rate": 4.5702581884320964e-05, + "loss": 2.7619, + "step": 50318 + }, + { + "epoch": 2.47, + "grad_norm": 0.7377327680587769, + "learning_rate": 4.569441387906231e-05, + "loss": 2.7176, + "step": 50319 + }, + { + "epoch": 2.47, + "grad_norm": 0.7616892457008362, + "learning_rate": 4.568624654359561e-05, + "loss": 2.8448, + "step": 50320 + }, + { + "epoch": 2.47, + "grad_norm": 0.7389867305755615, + "learning_rate": 4.567807987794234e-05, + "loss": 2.8724, + "step": 50321 + }, + { + "epoch": 2.47, + "grad_norm": 0.7348322868347168, + "learning_rate": 4.566991388212413e-05, + "loss": 3.0046, + "step": 50322 + }, + { + "epoch": 2.47, + "grad_norm": 0.7657172679901123, + "learning_rate": 4.5661748556162357e-05, + "loss": 2.9771, + "step": 50323 + }, + { + "epoch": 2.47, + "grad_norm": 0.6982909440994263, + "learning_rate": 4.5653583900078636e-05, + "loss": 2.8583, + "step": 50324 + }, + { + "epoch": 2.47, + "grad_norm": 0.8317862153053284, + "learning_rate": 4.564541991389439e-05, + "loss": 2.9129, + "step": 50325 + }, + { + "epoch": 2.47, + "grad_norm": 0.6974571943283081, + "learning_rate": 4.563725659763112e-05, + "loss": 2.9483, + "step": 50326 + }, + { + "epoch": 2.47, + "grad_norm": 0.813891589641571, + "learning_rate": 4.5629093951310456e-05, + "loss": 2.9055, + "step": 50327 + }, + { + "epoch": 2.47, + "grad_norm": 0.7852758765220642, + "learning_rate": 4.5620931974953815e-05, + "loss": 2.8643, + "step": 50328 + }, + { + "epoch": 2.47, + "grad_norm": 0.7360036373138428, + "learning_rate": 4.5612770668582664e-05, + "loss": 2.8992, + "step": 50329 + }, + { + "epoch": 2.47, + "grad_norm": 0.7240647077560425, + "learning_rate": 4.560461003221844e-05, + "loss": 2.815, + "step": 50330 + }, + { + "epoch": 2.47, + "grad_norm": 0.7547604441642761, + "learning_rate": 4.559645006588274e-05, + "loss": 3.0039, + "step": 50331 + }, + { + "epoch": 2.47, + "grad_norm": 0.7343550324440002, + "learning_rate": 4.558829076959706e-05, + "loss": 2.8534, + "step": 50332 + }, + { + "epoch": 2.47, + "grad_norm": 0.7282590866088867, + "learning_rate": 4.5580132143382805e-05, + "loss": 3.1069, + "step": 50333 + }, + { + "epoch": 2.47, + "grad_norm": 0.7644495964050293, + "learning_rate": 4.557197418726161e-05, + "loss": 3.1148, + "step": 50334 + }, + { + "epoch": 2.47, + "grad_norm": 0.7662924528121948, + "learning_rate": 4.556381690125487e-05, + "loss": 3.0473, + "step": 50335 + }, + { + "epoch": 2.47, + "grad_norm": 0.7156445980072021, + "learning_rate": 4.555566028538398e-05, + "loss": 2.9062, + "step": 50336 + }, + { + "epoch": 2.47, + "grad_norm": 0.7530838251113892, + "learning_rate": 4.554750433967058e-05, + "loss": 2.9819, + "step": 50337 + }, + { + "epoch": 2.47, + "grad_norm": 0.7620644569396973, + "learning_rate": 4.5539349064136034e-05, + "loss": 2.9154, + "step": 50338 + }, + { + "epoch": 2.47, + "grad_norm": 0.7103702425956726, + "learning_rate": 4.5531194458801914e-05, + "loss": 3.012, + "step": 50339 + }, + { + "epoch": 2.47, + "grad_norm": 0.6881940960884094, + "learning_rate": 4.55230405236896e-05, + "loss": 3.1089, + "step": 50340 + }, + { + "epoch": 2.47, + "grad_norm": 0.7752044796943665, + "learning_rate": 4.5514887258820686e-05, + "loss": 2.8773, + "step": 50341 + }, + { + "epoch": 2.47, + "grad_norm": 0.7745306491851807, + "learning_rate": 4.550673466421658e-05, + "loss": 2.8716, + "step": 50342 + }, + { + "epoch": 2.47, + "grad_norm": 0.7594026923179626, + "learning_rate": 4.5498582739898705e-05, + "loss": 2.9974, + "step": 50343 + }, + { + "epoch": 2.47, + "grad_norm": 0.7362309694290161, + "learning_rate": 4.549043148588863e-05, + "loss": 2.9818, + "step": 50344 + }, + { + "epoch": 2.47, + "grad_norm": 0.7302587032318115, + "learning_rate": 4.548228090220773e-05, + "loss": 2.8843, + "step": 50345 + }, + { + "epoch": 2.47, + "grad_norm": 0.7178745865821838, + "learning_rate": 4.547413098887755e-05, + "loss": 2.8736, + "step": 50346 + }, + { + "epoch": 2.47, + "grad_norm": 0.753461480140686, + "learning_rate": 4.546598174591951e-05, + "loss": 3.1034, + "step": 50347 + }, + { + "epoch": 2.47, + "grad_norm": 0.7480427026748657, + "learning_rate": 4.545783317335504e-05, + "loss": 3.1151, + "step": 50348 + }, + { + "epoch": 2.47, + "grad_norm": 0.7292770147323608, + "learning_rate": 4.544968527120576e-05, + "loss": 2.8363, + "step": 50349 + }, + { + "epoch": 2.47, + "grad_norm": 0.7527563571929932, + "learning_rate": 4.5441538039492985e-05, + "loss": 2.9367, + "step": 50350 + }, + { + "epoch": 2.47, + "grad_norm": 0.7489085793495178, + "learning_rate": 4.543339147823824e-05, + "loss": 2.9372, + "step": 50351 + }, + { + "epoch": 2.47, + "grad_norm": 0.7443061470985413, + "learning_rate": 4.5425245587462855e-05, + "loss": 2.9903, + "step": 50352 + }, + { + "epoch": 2.47, + "grad_norm": 0.73963463306427, + "learning_rate": 4.5417100367188365e-05, + "loss": 2.977, + "step": 50353 + }, + { + "epoch": 2.47, + "grad_norm": 0.7527986168861389, + "learning_rate": 4.5408955817436344e-05, + "loss": 2.7493, + "step": 50354 + }, + { + "epoch": 2.47, + "grad_norm": 0.7716648578643799, + "learning_rate": 4.540081193822804e-05, + "loss": 2.9532, + "step": 50355 + }, + { + "epoch": 2.47, + "grad_norm": 0.6952173113822937, + "learning_rate": 4.539266872958508e-05, + "loss": 3.0827, + "step": 50356 + }, + { + "epoch": 2.47, + "grad_norm": 0.7592951059341431, + "learning_rate": 4.538452619152884e-05, + "loss": 2.9578, + "step": 50357 + }, + { + "epoch": 2.47, + "grad_norm": 0.7521546483039856, + "learning_rate": 4.537638432408066e-05, + "loss": 2.7848, + "step": 50358 + }, + { + "epoch": 2.47, + "grad_norm": 0.7431459426879883, + "learning_rate": 4.536824312726217e-05, + "loss": 2.8052, + "step": 50359 + }, + { + "epoch": 2.47, + "grad_norm": 0.7520620822906494, + "learning_rate": 4.536010260109465e-05, + "loss": 2.8657, + "step": 50360 + }, + { + "epoch": 2.47, + "grad_norm": 0.7569798827171326, + "learning_rate": 4.535196274559968e-05, + "loss": 2.9279, + "step": 50361 + }, + { + "epoch": 2.47, + "grad_norm": 0.771931529045105, + "learning_rate": 4.5343823560798574e-05, + "loss": 2.8402, + "step": 50362 + }, + { + "epoch": 2.47, + "grad_norm": 0.7233810424804688, + "learning_rate": 4.533568504671282e-05, + "loss": 2.8706, + "step": 50363 + }, + { + "epoch": 2.47, + "grad_norm": 0.6947945952415466, + "learning_rate": 4.53275472033639e-05, + "loss": 2.771, + "step": 50364 + }, + { + "epoch": 2.47, + "grad_norm": 0.7521064281463623, + "learning_rate": 4.531941003077326e-05, + "loss": 2.7515, + "step": 50365 + }, + { + "epoch": 2.47, + "grad_norm": 0.7488537430763245, + "learning_rate": 4.531127352896226e-05, + "loss": 2.7538, + "step": 50366 + }, + { + "epoch": 2.47, + "grad_norm": 0.78842693567276, + "learning_rate": 4.5303137697952266e-05, + "loss": 3.043, + "step": 50367 + }, + { + "epoch": 2.47, + "grad_norm": 0.7508882284164429, + "learning_rate": 4.5295002537764804e-05, + "loss": 2.9645, + "step": 50368 + }, + { + "epoch": 2.47, + "grad_norm": 0.8009318113327026, + "learning_rate": 4.528686804842136e-05, + "loss": 2.9572, + "step": 50369 + }, + { + "epoch": 2.47, + "grad_norm": 0.7622132897377014, + "learning_rate": 4.5278734229943204e-05, + "loss": 2.6743, + "step": 50370 + }, + { + "epoch": 2.47, + "grad_norm": 0.7067914605140686, + "learning_rate": 4.52706010823519e-05, + "loss": 2.8212, + "step": 50371 + }, + { + "epoch": 2.47, + "grad_norm": 0.7103236317634583, + "learning_rate": 4.5262468605668823e-05, + "loss": 2.9323, + "step": 50372 + }, + { + "epoch": 2.47, + "grad_norm": 0.7678020596504211, + "learning_rate": 4.525433679991529e-05, + "loss": 3.0528, + "step": 50373 + }, + { + "epoch": 2.47, + "grad_norm": 0.7323145866394043, + "learning_rate": 4.5246205665112924e-05, + "loss": 2.7942, + "step": 50374 + }, + { + "epoch": 2.47, + "grad_norm": 0.713549792766571, + "learning_rate": 4.52380752012829e-05, + "loss": 2.9964, + "step": 50375 + }, + { + "epoch": 2.47, + "grad_norm": 0.7168814539909363, + "learning_rate": 4.5229945408446865e-05, + "loss": 2.8764, + "step": 50376 + }, + { + "epoch": 2.47, + "grad_norm": 0.7501139044761658, + "learning_rate": 4.522181628662605e-05, + "loss": 2.8077, + "step": 50377 + }, + { + "epoch": 2.47, + "grad_norm": 0.7667758464813232, + "learning_rate": 4.5213687835841983e-05, + "loss": 2.8093, + "step": 50378 + }, + { + "epoch": 2.47, + "grad_norm": 0.7523639798164368, + "learning_rate": 4.520556005611603e-05, + "loss": 2.8804, + "step": 50379 + }, + { + "epoch": 2.47, + "grad_norm": 0.7287237048149109, + "learning_rate": 4.5197432947469536e-05, + "loss": 3.0469, + "step": 50380 + }, + { + "epoch": 2.47, + "grad_norm": 0.7388063073158264, + "learning_rate": 4.518930650992405e-05, + "loss": 2.8228, + "step": 50381 + }, + { + "epoch": 2.47, + "grad_norm": 0.751380980014801, + "learning_rate": 4.518118074350079e-05, + "loss": 2.8182, + "step": 50382 + }, + { + "epoch": 2.47, + "grad_norm": 0.7500567436218262, + "learning_rate": 4.517305564822127e-05, + "loss": 3.1013, + "step": 50383 + }, + { + "epoch": 2.47, + "grad_norm": 0.7376142740249634, + "learning_rate": 4.516493122410697e-05, + "loss": 2.8591, + "step": 50384 + }, + { + "epoch": 2.47, + "grad_norm": 0.6814765334129333, + "learning_rate": 4.515680747117911e-05, + "loss": 2.7155, + "step": 50385 + }, + { + "epoch": 2.47, + "grad_norm": 0.7675392031669617, + "learning_rate": 4.5148684389459244e-05, + "loss": 2.9702, + "step": 50386 + }, + { + "epoch": 2.47, + "grad_norm": 0.7462222576141357, + "learning_rate": 4.514056197896868e-05, + "loss": 2.7139, + "step": 50387 + }, + { + "epoch": 2.47, + "grad_norm": 0.7849350571632385, + "learning_rate": 4.5132440239728774e-05, + "loss": 3.1364, + "step": 50388 + }, + { + "epoch": 2.47, + "grad_norm": 0.7008315324783325, + "learning_rate": 4.512431917176107e-05, + "loss": 3.0296, + "step": 50389 + }, + { + "epoch": 2.47, + "grad_norm": 0.7495970726013184, + "learning_rate": 4.5116198775086756e-05, + "loss": 2.5677, + "step": 50390 + }, + { + "epoch": 2.47, + "grad_norm": 0.732743501663208, + "learning_rate": 4.510807904972737e-05, + "loss": 2.8116, + "step": 50391 + }, + { + "epoch": 2.47, + "grad_norm": 0.7251850962638855, + "learning_rate": 4.509995999570422e-05, + "loss": 2.9006, + "step": 50392 + }, + { + "epoch": 2.47, + "grad_norm": 0.7376589775085449, + "learning_rate": 4.509184161303876e-05, + "loss": 3.0105, + "step": 50393 + }, + { + "epoch": 2.47, + "grad_norm": 0.7272580862045288, + "learning_rate": 4.508372390175235e-05, + "loss": 2.7939, + "step": 50394 + }, + { + "epoch": 2.47, + "grad_norm": 0.7777386903762817, + "learning_rate": 4.5075606861866285e-05, + "loss": 2.7377, + "step": 50395 + }, + { + "epoch": 2.47, + "grad_norm": 0.7213007211685181, + "learning_rate": 4.5067490493402066e-05, + "loss": 2.7687, + "step": 50396 + }, + { + "epoch": 2.47, + "grad_norm": 0.707280158996582, + "learning_rate": 4.505937479638092e-05, + "loss": 2.8246, + "step": 50397 + }, + { + "epoch": 2.47, + "grad_norm": 0.74385666847229, + "learning_rate": 4.505125977082436e-05, + "loss": 2.7228, + "step": 50398 + }, + { + "epoch": 2.47, + "grad_norm": 0.738721489906311, + "learning_rate": 4.5043145416753755e-05, + "loss": 2.8539, + "step": 50399 + }, + { + "epoch": 2.47, + "grad_norm": 0.7245712876319885, + "learning_rate": 4.5035031734190476e-05, + "loss": 3.0549, + "step": 50400 + }, + { + "epoch": 2.47, + "grad_norm": 0.7531771063804626, + "learning_rate": 4.5026918723155826e-05, + "loss": 2.9992, + "step": 50401 + }, + { + "epoch": 2.47, + "grad_norm": 0.7361901998519897, + "learning_rate": 4.501880638367112e-05, + "loss": 2.7009, + "step": 50402 + }, + { + "epoch": 2.47, + "grad_norm": 0.7059527039527893, + "learning_rate": 4.5010694715757826e-05, + "loss": 2.8706, + "step": 50403 + }, + { + "epoch": 2.47, + "grad_norm": 0.7453323602676392, + "learning_rate": 4.500258371943737e-05, + "loss": 2.6934, + "step": 50404 + }, + { + "epoch": 2.47, + "grad_norm": 0.7383242845535278, + "learning_rate": 4.4994473394730944e-05, + "loss": 2.8522, + "step": 50405 + }, + { + "epoch": 2.47, + "grad_norm": 0.723354697227478, + "learning_rate": 4.498636374166007e-05, + "loss": 2.944, + "step": 50406 + }, + { + "epoch": 2.47, + "grad_norm": 0.7948548793792725, + "learning_rate": 4.4978254760245955e-05, + "loss": 2.9672, + "step": 50407 + }, + { + "epoch": 2.47, + "grad_norm": 0.7579402923583984, + "learning_rate": 4.4970146450510124e-05, + "loss": 2.704, + "step": 50408 + }, + { + "epoch": 2.47, + "grad_norm": 0.7572686672210693, + "learning_rate": 4.496203881247385e-05, + "loss": 2.9605, + "step": 50409 + }, + { + "epoch": 2.47, + "grad_norm": 0.7329517006874084, + "learning_rate": 4.49539318461584e-05, + "loss": 2.8146, + "step": 50410 + }, + { + "epoch": 2.47, + "grad_norm": 0.744154691696167, + "learning_rate": 4.49458255515853e-05, + "loss": 2.7126, + "step": 50411 + }, + { + "epoch": 2.47, + "grad_norm": 0.7456027269363403, + "learning_rate": 4.493771992877572e-05, + "loss": 2.9384, + "step": 50412 + }, + { + "epoch": 2.47, + "grad_norm": 0.7537271976470947, + "learning_rate": 4.492961497775109e-05, + "loss": 2.9007, + "step": 50413 + }, + { + "epoch": 2.47, + "grad_norm": 0.7661435008049011, + "learning_rate": 4.4921510698532846e-05, + "loss": 2.7748, + "step": 50414 + }, + { + "epoch": 2.47, + "grad_norm": 0.7212929129600525, + "learning_rate": 4.4913407091142265e-05, + "loss": 3.0509, + "step": 50415 + }, + { + "epoch": 2.47, + "grad_norm": 0.7772074341773987, + "learning_rate": 4.490530415560065e-05, + "loss": 2.6768, + "step": 50416 + }, + { + "epoch": 2.47, + "grad_norm": 0.7107312083244324, + "learning_rate": 4.48972018919293e-05, + "loss": 3.0139, + "step": 50417 + }, + { + "epoch": 2.47, + "grad_norm": 0.7618693709373474, + "learning_rate": 4.488910030014973e-05, + "loss": 2.992, + "step": 50418 + }, + { + "epoch": 2.47, + "grad_norm": 0.7590107321739197, + "learning_rate": 4.4880999380283075e-05, + "loss": 2.7462, + "step": 50419 + }, + { + "epoch": 2.47, + "grad_norm": 0.7364644408226013, + "learning_rate": 4.487289913235077e-05, + "loss": 2.9629, + "step": 50420 + }, + { + "epoch": 2.47, + "grad_norm": 0.726629376411438, + "learning_rate": 4.486479955637422e-05, + "loss": 3.1113, + "step": 50421 + }, + { + "epoch": 2.47, + "grad_norm": 0.7420285940170288, + "learning_rate": 4.4856700652374714e-05, + "loss": 2.9697, + "step": 50422 + }, + { + "epoch": 2.47, + "grad_norm": 0.7878907322883606, + "learning_rate": 4.484860242037352e-05, + "loss": 2.7718, + "step": 50423 + }, + { + "epoch": 2.47, + "grad_norm": 0.7129849791526794, + "learning_rate": 4.484050486039195e-05, + "loss": 3.0591, + "step": 50424 + }, + { + "epoch": 2.47, + "grad_norm": 0.7504441142082214, + "learning_rate": 4.4832407972451354e-05, + "loss": 2.9215, + "step": 50425 + }, + { + "epoch": 2.47, + "grad_norm": 0.7380017042160034, + "learning_rate": 4.4824311756573204e-05, + "loss": 2.9334, + "step": 50426 + }, + { + "epoch": 2.47, + "grad_norm": 0.7509652376174927, + "learning_rate": 4.481621621277861e-05, + "loss": 3.0772, + "step": 50427 + }, + { + "epoch": 2.47, + "grad_norm": 0.7229263186454773, + "learning_rate": 4.4808121341089054e-05, + "loss": 2.8744, + "step": 50428 + }, + { + "epoch": 2.47, + "grad_norm": 0.7127883434295654, + "learning_rate": 4.480002714152575e-05, + "loss": 2.7404, + "step": 50429 + }, + { + "epoch": 2.47, + "grad_norm": 0.77323979139328, + "learning_rate": 4.479193361411012e-05, + "loss": 3.0917, + "step": 50430 + }, + { + "epoch": 2.47, + "grad_norm": 0.7353416085243225, + "learning_rate": 4.478384075886341e-05, + "loss": 2.8433, + "step": 50431 + }, + { + "epoch": 2.47, + "grad_norm": 0.7446988821029663, + "learning_rate": 4.477574857580687e-05, + "loss": 3.1125, + "step": 50432 + }, + { + "epoch": 2.47, + "grad_norm": 0.7732658386230469, + "learning_rate": 4.476765706496198e-05, + "loss": 2.7603, + "step": 50433 + }, + { + "epoch": 2.47, + "grad_norm": 0.7612114548683167, + "learning_rate": 4.475956622634986e-05, + "loss": 3.1181, + "step": 50434 + }, + { + "epoch": 2.47, + "grad_norm": 0.9173521399497986, + "learning_rate": 4.475147605999196e-05, + "loss": 2.8608, + "step": 50435 + }, + { + "epoch": 2.47, + "grad_norm": 0.7881463766098022, + "learning_rate": 4.47433865659096e-05, + "loss": 2.8996, + "step": 50436 + }, + { + "epoch": 2.47, + "grad_norm": 0.7628815174102783, + "learning_rate": 4.473529774412405e-05, + "loss": 2.9975, + "step": 50437 + }, + { + "epoch": 2.47, + "grad_norm": 0.710648238658905, + "learning_rate": 4.472720959465658e-05, + "loss": 3.011, + "step": 50438 + }, + { + "epoch": 2.47, + "grad_norm": 0.705659806728363, + "learning_rate": 4.471912211752845e-05, + "loss": 3.1302, + "step": 50439 + }, + { + "epoch": 2.47, + "grad_norm": 0.7531771659851074, + "learning_rate": 4.471103531276102e-05, + "loss": 2.7541, + "step": 50440 + }, + { + "epoch": 2.47, + "grad_norm": 0.7266751527786255, + "learning_rate": 4.470294918037565e-05, + "loss": 2.8924, + "step": 50441 + }, + { + "epoch": 2.47, + "grad_norm": 0.7597355246543884, + "learning_rate": 4.4694863720393536e-05, + "loss": 2.7383, + "step": 50442 + }, + { + "epoch": 2.47, + "grad_norm": 0.7361400723457336, + "learning_rate": 4.4686778932836043e-05, + "loss": 2.8306, + "step": 50443 + }, + { + "epoch": 2.47, + "grad_norm": 0.7326222062110901, + "learning_rate": 4.467869481772448e-05, + "loss": 2.7957, + "step": 50444 + }, + { + "epoch": 2.47, + "grad_norm": 0.7179027199745178, + "learning_rate": 4.467061137508e-05, + "loss": 2.7548, + "step": 50445 + }, + { + "epoch": 2.47, + "grad_norm": 0.7314981818199158, + "learning_rate": 4.466252860492409e-05, + "loss": 2.7318, + "step": 50446 + }, + { + "epoch": 2.47, + "grad_norm": 0.8546863794326782, + "learning_rate": 4.465444650727785e-05, + "loss": 2.7946, + "step": 50447 + }, + { + "epoch": 2.47, + "grad_norm": 0.7847939729690552, + "learning_rate": 4.4646365082162715e-05, + "loss": 2.8562, + "step": 50448 + }, + { + "epoch": 2.47, + "grad_norm": 0.7865023016929626, + "learning_rate": 4.463828432959986e-05, + "loss": 2.7398, + "step": 50449 + }, + { + "epoch": 2.47, + "grad_norm": 0.7689087390899658, + "learning_rate": 4.463020424961064e-05, + "loss": 2.9209, + "step": 50450 + }, + { + "epoch": 2.47, + "grad_norm": 0.7183415293693542, + "learning_rate": 4.462212484221636e-05, + "loss": 2.8917, + "step": 50451 + }, + { + "epoch": 2.47, + "grad_norm": 0.7046708464622498, + "learning_rate": 4.461404610743824e-05, + "loss": 3.1845, + "step": 50452 + }, + { + "epoch": 2.47, + "grad_norm": 0.7757652401924133, + "learning_rate": 4.4605968045297566e-05, + "loss": 2.8422, + "step": 50453 + }, + { + "epoch": 2.47, + "grad_norm": 0.702682614326477, + "learning_rate": 4.459789065581559e-05, + "loss": 2.8507, + "step": 50454 + }, + { + "epoch": 2.47, + "grad_norm": 0.7367978692054749, + "learning_rate": 4.45898139390136e-05, + "loss": 2.7709, + "step": 50455 + }, + { + "epoch": 2.47, + "grad_norm": 0.7395363450050354, + "learning_rate": 4.458173789491293e-05, + "loss": 2.6215, + "step": 50456 + }, + { + "epoch": 2.47, + "grad_norm": 0.8202866315841675, + "learning_rate": 4.457366252353477e-05, + "loss": 2.9662, + "step": 50457 + }, + { + "epoch": 2.47, + "grad_norm": 0.738206148147583, + "learning_rate": 4.4565587824900475e-05, + "loss": 2.843, + "step": 50458 + }, + { + "epoch": 2.47, + "grad_norm": 0.6997504830360413, + "learning_rate": 4.455751379903129e-05, + "loss": 2.8662, + "step": 50459 + }, + { + "epoch": 2.47, + "grad_norm": 0.7250627875328064, + "learning_rate": 4.454944044594836e-05, + "loss": 2.9364, + "step": 50460 + }, + { + "epoch": 2.47, + "grad_norm": 0.7618042230606079, + "learning_rate": 4.4541367765673105e-05, + "loss": 2.7884, + "step": 50461 + }, + { + "epoch": 2.47, + "grad_norm": 0.7377805113792419, + "learning_rate": 4.453329575822665e-05, + "loss": 2.7566, + "step": 50462 + }, + { + "epoch": 2.47, + "grad_norm": 0.7210468649864197, + "learning_rate": 4.4525224423630434e-05, + "loss": 2.8037, + "step": 50463 + }, + { + "epoch": 2.47, + "grad_norm": 0.7345592975616455, + "learning_rate": 4.451715376190551e-05, + "loss": 2.8834, + "step": 50464 + }, + { + "epoch": 2.47, + "grad_norm": 0.7215968370437622, + "learning_rate": 4.4509083773073226e-05, + "loss": 2.9943, + "step": 50465 + }, + { + "epoch": 2.47, + "grad_norm": 0.7119701504707336, + "learning_rate": 4.4501014457154935e-05, + "loss": 2.8041, + "step": 50466 + }, + { + "epoch": 2.47, + "grad_norm": 0.7098177075386047, + "learning_rate": 4.449294581417182e-05, + "loss": 2.8293, + "step": 50467 + }, + { + "epoch": 2.47, + "grad_norm": 0.7481569647789001, + "learning_rate": 4.448487784414506e-05, + "loss": 2.802, + "step": 50468 + }, + { + "epoch": 2.47, + "grad_norm": 0.7269555330276489, + "learning_rate": 4.447681054709594e-05, + "loss": 2.7692, + "step": 50469 + }, + { + "epoch": 2.47, + "grad_norm": 0.6905206441879272, + "learning_rate": 4.446874392304568e-05, + "loss": 2.6578, + "step": 50470 + }, + { + "epoch": 2.47, + "grad_norm": 0.7605257630348206, + "learning_rate": 4.446067797201569e-05, + "loss": 2.8615, + "step": 50471 + }, + { + "epoch": 2.47, + "grad_norm": 0.7663816213607788, + "learning_rate": 4.445261269402699e-05, + "loss": 2.7005, + "step": 50472 + }, + { + "epoch": 2.47, + "grad_norm": 0.7191436290740967, + "learning_rate": 4.4444548089101e-05, + "loss": 3.0061, + "step": 50473 + }, + { + "epoch": 2.47, + "grad_norm": 0.7263677716255188, + "learning_rate": 4.443648415725889e-05, + "loss": 2.8994, + "step": 50474 + }, + { + "epoch": 2.47, + "grad_norm": 0.7229388952255249, + "learning_rate": 4.442842089852182e-05, + "loss": 2.9245, + "step": 50475 + }, + { + "epoch": 2.47, + "grad_norm": 0.8069564700126648, + "learning_rate": 4.442035831291117e-05, + "loss": 2.7624, + "step": 50476 + }, + { + "epoch": 2.47, + "grad_norm": 0.7363615036010742, + "learning_rate": 4.4412296400448064e-05, + "loss": 2.8038, + "step": 50477 + }, + { + "epoch": 2.47, + "grad_norm": 0.7362646460533142, + "learning_rate": 4.4404235161153846e-05, + "loss": 2.9683, + "step": 50478 + }, + { + "epoch": 2.47, + "grad_norm": 0.7199697494506836, + "learning_rate": 4.439617459504959e-05, + "loss": 2.852, + "step": 50479 + }, + { + "epoch": 2.47, + "grad_norm": 0.7283482551574707, + "learning_rate": 4.43881147021567e-05, + "loss": 2.7507, + "step": 50480 + }, + { + "epoch": 2.47, + "grad_norm": 0.7187907695770264, + "learning_rate": 4.43800554824963e-05, + "loss": 3.07, + "step": 50481 + }, + { + "epoch": 2.47, + "grad_norm": 0.711829423904419, + "learning_rate": 4.4371996936089595e-05, + "loss": 2.7205, + "step": 50482 + }, + { + "epoch": 2.47, + "grad_norm": 0.7250786423683167, + "learning_rate": 4.436393906295792e-05, + "loss": 2.7172, + "step": 50483 + }, + { + "epoch": 2.47, + "grad_norm": 0.8321327567100525, + "learning_rate": 4.4355881863122367e-05, + "loss": 2.9453, + "step": 50484 + }, + { + "epoch": 2.47, + "grad_norm": 0.7267419099807739, + "learning_rate": 4.4347825336604216e-05, + "loss": 2.6185, + "step": 50485 + }, + { + "epoch": 2.47, + "grad_norm": 0.7733017802238464, + "learning_rate": 4.4339769483424783e-05, + "loss": 3.1191, + "step": 50486 + }, + { + "epoch": 2.47, + "grad_norm": 0.75758957862854, + "learning_rate": 4.433171430360508e-05, + "loss": 2.7548, + "step": 50487 + }, + { + "epoch": 2.47, + "grad_norm": 0.7199147939682007, + "learning_rate": 4.432365979716653e-05, + "loss": 2.8798, + "step": 50488 + }, + { + "epoch": 2.47, + "grad_norm": 0.723355233669281, + "learning_rate": 4.431560596413025e-05, + "loss": 2.8854, + "step": 50489 + }, + { + "epoch": 2.47, + "grad_norm": 0.7314903140068054, + "learning_rate": 4.430755280451739e-05, + "loss": 2.8459, + "step": 50490 + }, + { + "epoch": 2.47, + "grad_norm": 0.8310138583183289, + "learning_rate": 4.429950031834932e-05, + "loss": 3.1573, + "step": 50491 + }, + { + "epoch": 2.47, + "grad_norm": 0.728308916091919, + "learning_rate": 4.4291448505647043e-05, + "loss": 3.0318, + "step": 50492 + }, + { + "epoch": 2.47, + "grad_norm": 0.776800274848938, + "learning_rate": 4.4283397366431964e-05, + "loss": 2.8144, + "step": 50493 + }, + { + "epoch": 2.47, + "grad_norm": 0.7904463410377502, + "learning_rate": 4.427534690072516e-05, + "loss": 2.8312, + "step": 50494 + }, + { + "epoch": 2.47, + "grad_norm": 0.734760046005249, + "learning_rate": 4.426729710854793e-05, + "loss": 3.018, + "step": 50495 + }, + { + "epoch": 2.47, + "grad_norm": 0.7029054760932922, + "learning_rate": 4.425924798992141e-05, + "loss": 2.8469, + "step": 50496 + }, + { + "epoch": 2.47, + "grad_norm": 0.7326534390449524, + "learning_rate": 4.4251199544866755e-05, + "loss": 2.8889, + "step": 50497 + }, + { + "epoch": 2.47, + "grad_norm": 0.7710874676704407, + "learning_rate": 4.424315177340528e-05, + "loss": 2.9888, + "step": 50498 + }, + { + "epoch": 2.47, + "grad_norm": 0.7983696460723877, + "learning_rate": 4.423510467555804e-05, + "loss": 3.0389, + "step": 50499 + }, + { + "epoch": 2.47, + "grad_norm": 0.7880064249038696, + "learning_rate": 4.4227058251346416e-05, + "loss": 2.8939, + "step": 50500 + }, + { + "epoch": 2.47, + "grad_norm": 0.7095093131065369, + "learning_rate": 4.421901250079138e-05, + "loss": 2.9274, + "step": 50501 + }, + { + "epoch": 2.47, + "grad_norm": 0.732288658618927, + "learning_rate": 4.4210967423914366e-05, + "loss": 3.0009, + "step": 50502 + }, + { + "epoch": 2.48, + "grad_norm": 0.7357195615768433, + "learning_rate": 4.420292302073638e-05, + "loss": 3.0076, + "step": 50503 + }, + { + "epoch": 2.48, + "grad_norm": 0.6846479773521423, + "learning_rate": 4.419487929127864e-05, + "loss": 2.9429, + "step": 50504 + }, + { + "epoch": 2.48, + "grad_norm": 0.7166612148284912, + "learning_rate": 4.41868362355624e-05, + "loss": 2.7909, + "step": 50505 + }, + { + "epoch": 2.48, + "grad_norm": 0.7130656242370605, + "learning_rate": 4.417879385360876e-05, + "loss": 2.7799, + "step": 50506 + }, + { + "epoch": 2.48, + "grad_norm": 0.7403449416160583, + "learning_rate": 4.417075214543893e-05, + "loss": 3.0642, + "step": 50507 + }, + { + "epoch": 2.48, + "grad_norm": 0.7145542502403259, + "learning_rate": 4.416271111107417e-05, + "loss": 2.9387, + "step": 50508 + }, + { + "epoch": 2.48, + "grad_norm": 0.7323039174079895, + "learning_rate": 4.41546707505355e-05, + "loss": 2.8822, + "step": 50509 + }, + { + "epoch": 2.48, + "grad_norm": 0.7286449074745178, + "learning_rate": 4.414663106384428e-05, + "loss": 3.0583, + "step": 50510 + }, + { + "epoch": 2.48, + "grad_norm": 0.773391842842102, + "learning_rate": 4.4138592051021595e-05, + "loss": 2.8229, + "step": 50511 + }, + { + "epoch": 2.48, + "grad_norm": 0.7383174300193787, + "learning_rate": 4.413055371208854e-05, + "loss": 3.0271, + "step": 50512 + }, + { + "epoch": 2.48, + "grad_norm": 0.7910652160644531, + "learning_rate": 4.4122516047066457e-05, + "loss": 2.7217, + "step": 50513 + }, + { + "epoch": 2.48, + "grad_norm": 0.874261200428009, + "learning_rate": 4.4114479055976316e-05, + "loss": 3.0323, + "step": 50514 + }, + { + "epoch": 2.48, + "grad_norm": 0.7225984930992126, + "learning_rate": 4.41064427388395e-05, + "loss": 3.0949, + "step": 50515 + }, + { + "epoch": 2.48, + "grad_norm": 0.7266954779624939, + "learning_rate": 4.4098407095676956e-05, + "loss": 2.8574, + "step": 50516 + }, + { + "epoch": 2.48, + "grad_norm": 0.7139513492584229, + "learning_rate": 4.409037212651004e-05, + "loss": 2.6369, + "step": 50517 + }, + { + "epoch": 2.48, + "grad_norm": 0.7322560548782349, + "learning_rate": 4.408233783135984e-05, + "loss": 2.8167, + "step": 50518 + }, + { + "epoch": 2.48, + "grad_norm": 0.7208927869796753, + "learning_rate": 4.407430421024746e-05, + "loss": 2.8963, + "step": 50519 + }, + { + "epoch": 2.48, + "grad_norm": 0.7421464323997498, + "learning_rate": 4.406627126319415e-05, + "loss": 2.9859, + "step": 50520 + }, + { + "epoch": 2.48, + "grad_norm": 0.7248895764350891, + "learning_rate": 4.4058238990220975e-05, + "loss": 2.9614, + "step": 50521 + }, + { + "epoch": 2.48, + "grad_norm": 0.7308017611503601, + "learning_rate": 4.405020739134914e-05, + "loss": 2.8708, + "step": 50522 + }, + { + "epoch": 2.48, + "grad_norm": 0.7517200708389282, + "learning_rate": 4.404217646659985e-05, + "loss": 2.8794, + "step": 50523 + }, + { + "epoch": 2.48, + "grad_norm": 0.7376325726509094, + "learning_rate": 4.403414621599421e-05, + "loss": 2.8069, + "step": 50524 + }, + { + "epoch": 2.48, + "grad_norm": 0.7713701128959656, + "learning_rate": 4.4026116639553364e-05, + "loss": 2.7789, + "step": 50525 + }, + { + "epoch": 2.48, + "grad_norm": 0.7685970067977905, + "learning_rate": 4.401808773729842e-05, + "loss": 2.8286, + "step": 50526 + }, + { + "epoch": 2.48, + "grad_norm": 0.753248393535614, + "learning_rate": 4.401005950925054e-05, + "loss": 2.903, + "step": 50527 + }, + { + "epoch": 2.48, + "grad_norm": 0.7071647047996521, + "learning_rate": 4.4002031955430994e-05, + "loss": 2.7273, + "step": 50528 + }, + { + "epoch": 2.48, + "grad_norm": 0.7305782437324524, + "learning_rate": 4.399400507586074e-05, + "loss": 3.0772, + "step": 50529 + }, + { + "epoch": 2.48, + "grad_norm": 0.7409292459487915, + "learning_rate": 4.398597887056109e-05, + "loss": 2.8982, + "step": 50530 + }, + { + "epoch": 2.48, + "grad_norm": 0.7586793303489685, + "learning_rate": 4.397795333955303e-05, + "loss": 3.0227, + "step": 50531 + }, + { + "epoch": 2.48, + "grad_norm": 0.7480147480964661, + "learning_rate": 4.396992848285781e-05, + "loss": 2.5625, + "step": 50532 + }, + { + "epoch": 2.48, + "grad_norm": 0.7570845484733582, + "learning_rate": 4.3961904300496574e-05, + "loss": 2.9181, + "step": 50533 + }, + { + "epoch": 2.48, + "grad_norm": 0.7564980983734131, + "learning_rate": 4.3953880792490285e-05, + "loss": 2.9893, + "step": 50534 + }, + { + "epoch": 2.48, + "grad_norm": 0.7372446060180664, + "learning_rate": 4.394585795886031e-05, + "loss": 2.7992, + "step": 50535 + }, + { + "epoch": 2.48, + "grad_norm": 0.789084792137146, + "learning_rate": 4.3937835799627577e-05, + "loss": 2.991, + "step": 50536 + }, + { + "epoch": 2.48, + "grad_norm": 0.7723736763000488, + "learning_rate": 4.392981431481329e-05, + "loss": 2.9124, + "step": 50537 + }, + { + "epoch": 2.48, + "grad_norm": 0.7522666454315186, + "learning_rate": 4.3921793504438685e-05, + "loss": 2.9372, + "step": 50538 + }, + { + "epoch": 2.48, + "grad_norm": 0.765546977519989, + "learning_rate": 4.391377336852476e-05, + "loss": 2.8857, + "step": 50539 + }, + { + "epoch": 2.48, + "grad_norm": 0.8160582780838013, + "learning_rate": 4.390575390709271e-05, + "loss": 3.0308, + "step": 50540 + }, + { + "epoch": 2.48, + "grad_norm": 0.7867605686187744, + "learning_rate": 4.3897735120163544e-05, + "loss": 2.9236, + "step": 50541 + }, + { + "epoch": 2.48, + "grad_norm": 0.7485517263412476, + "learning_rate": 4.3889717007758416e-05, + "loss": 3.0409, + "step": 50542 + }, + { + "epoch": 2.48, + "grad_norm": 0.7289761900901794, + "learning_rate": 4.388169956989861e-05, + "loss": 3.1161, + "step": 50543 + }, + { + "epoch": 2.48, + "grad_norm": 0.7518919110298157, + "learning_rate": 4.3873682806605004e-05, + "loss": 2.7525, + "step": 50544 + }, + { + "epoch": 2.48, + "grad_norm": 0.758308470249176, + "learning_rate": 4.3865666717898915e-05, + "loss": 2.7692, + "step": 50545 + }, + { + "epoch": 2.48, + "grad_norm": 0.7173359394073486, + "learning_rate": 4.385765130380137e-05, + "loss": 2.7888, + "step": 50546 + }, + { + "epoch": 2.48, + "grad_norm": 0.7279924154281616, + "learning_rate": 4.384963656433337e-05, + "loss": 2.8149, + "step": 50547 + }, + { + "epoch": 2.48, + "grad_norm": 0.754176139831543, + "learning_rate": 4.384162249951624e-05, + "loss": 2.9872, + "step": 50548 + }, + { + "epoch": 2.48, + "grad_norm": 0.7126150727272034, + "learning_rate": 4.383360910937089e-05, + "loss": 2.9256, + "step": 50549 + }, + { + "epoch": 2.48, + "grad_norm": 0.7775080800056458, + "learning_rate": 4.3825596393918586e-05, + "loss": 2.9169, + "step": 50550 + }, + { + "epoch": 2.48, + "grad_norm": 0.7692570686340332, + "learning_rate": 4.381758435318029e-05, + "loss": 2.8355, + "step": 50551 + }, + { + "epoch": 2.48, + "grad_norm": 0.7409884929656982, + "learning_rate": 4.380957298717717e-05, + "loss": 2.9527, + "step": 50552 + }, + { + "epoch": 2.48, + "grad_norm": 0.7356708645820618, + "learning_rate": 4.3801562295930395e-05, + "loss": 2.7726, + "step": 50553 + }, + { + "epoch": 2.48, + "grad_norm": 0.7332990169525146, + "learning_rate": 4.379355227946102e-05, + "loss": 2.7626, + "step": 50554 + }, + { + "epoch": 2.48, + "grad_norm": 0.7492383718490601, + "learning_rate": 4.37855429377901e-05, + "loss": 2.7641, + "step": 50555 + }, + { + "epoch": 2.48, + "grad_norm": 0.7193925976753235, + "learning_rate": 4.377753427093867e-05, + "loss": 2.7948, + "step": 50556 + }, + { + "epoch": 2.48, + "grad_norm": 0.7237381339073181, + "learning_rate": 4.3769526278927935e-05, + "loss": 2.8049, + "step": 50557 + }, + { + "epoch": 2.48, + "grad_norm": 0.7913342118263245, + "learning_rate": 4.376151896177903e-05, + "loss": 2.9314, + "step": 50558 + }, + { + "epoch": 2.48, + "grad_norm": 0.6835290789604187, + "learning_rate": 4.3753512319512876e-05, + "loss": 3.2759, + "step": 50559 + }, + { + "epoch": 2.48, + "grad_norm": 0.7239914536476135, + "learning_rate": 4.3745506352150715e-05, + "loss": 2.7385, + "step": 50560 + }, + { + "epoch": 2.48, + "grad_norm": 0.7153903841972351, + "learning_rate": 4.373750105971361e-05, + "loss": 3.0137, + "step": 50561 + }, + { + "epoch": 2.48, + "grad_norm": 0.7405564785003662, + "learning_rate": 4.37294964422225e-05, + "loss": 2.8258, + "step": 50562 + }, + { + "epoch": 2.48, + "grad_norm": 0.7112692594528198, + "learning_rate": 4.3721492499698674e-05, + "loss": 2.9587, + "step": 50563 + }, + { + "epoch": 2.48, + "grad_norm": 0.7458570003509521, + "learning_rate": 4.3713489232163014e-05, + "loss": 2.8849, + "step": 50564 + }, + { + "epoch": 2.48, + "grad_norm": 0.7017235159873962, + "learning_rate": 4.37054866396368e-05, + "loss": 2.9045, + "step": 50565 + }, + { + "epoch": 2.48, + "grad_norm": 0.7555688619613647, + "learning_rate": 4.3697484722140917e-05, + "loss": 2.9832, + "step": 50566 + }, + { + "epoch": 2.48, + "grad_norm": 0.7248188853263855, + "learning_rate": 4.368948347969655e-05, + "loss": 2.955, + "step": 50567 + }, + { + "epoch": 2.48, + "grad_norm": 0.7523334622383118, + "learning_rate": 4.368148291232492e-05, + "loss": 2.9627, + "step": 50568 + }, + { + "epoch": 2.48, + "grad_norm": 0.7347005009651184, + "learning_rate": 4.3673483020046764e-05, + "loss": 3.0475, + "step": 50569 + }, + { + "epoch": 2.48, + "grad_norm": 0.7141953110694885, + "learning_rate": 4.366548380288337e-05, + "loss": 2.8616, + "step": 50570 + }, + { + "epoch": 2.48, + "grad_norm": 0.7329511046409607, + "learning_rate": 4.365748526085573e-05, + "loss": 2.9409, + "step": 50571 + }, + { + "epoch": 2.48, + "grad_norm": 0.7447880506515503, + "learning_rate": 4.364948739398498e-05, + "loss": 2.8136, + "step": 50572 + }, + { + "epoch": 2.48, + "grad_norm": 0.7083245515823364, + "learning_rate": 4.3641490202292086e-05, + "loss": 2.616, + "step": 50573 + }, + { + "epoch": 2.48, + "grad_norm": 0.7109681367874146, + "learning_rate": 4.363349368579818e-05, + "loss": 3.0928, + "step": 50574 + }, + { + "epoch": 2.48, + "grad_norm": 0.8247482180595398, + "learning_rate": 4.362549784452436e-05, + "loss": 2.8209, + "step": 50575 + }, + { + "epoch": 2.48, + "grad_norm": 0.7620562314987183, + "learning_rate": 4.3617502678491666e-05, + "loss": 3.0653, + "step": 50576 + }, + { + "epoch": 2.48, + "grad_norm": 0.7446070313453674, + "learning_rate": 4.360950818772112e-05, + "loss": 2.795, + "step": 50577 + }, + { + "epoch": 2.48, + "grad_norm": 0.7512825727462769, + "learning_rate": 4.36015143722337e-05, + "loss": 2.978, + "step": 50578 + }, + { + "epoch": 2.48, + "grad_norm": 0.7243783473968506, + "learning_rate": 4.3593521232050564e-05, + "loss": 3.0133, + "step": 50579 + }, + { + "epoch": 2.48, + "grad_norm": 0.7371456027030945, + "learning_rate": 4.358552876719282e-05, + "loss": 2.8551, + "step": 50580 + }, + { + "epoch": 2.48, + "grad_norm": 0.7351969480514526, + "learning_rate": 4.357753697768136e-05, + "loss": 3.0309, + "step": 50581 + }, + { + "epoch": 2.48, + "grad_norm": 0.7740240693092346, + "learning_rate": 4.356954586353739e-05, + "loss": 2.8982, + "step": 50582 + }, + { + "epoch": 2.48, + "grad_norm": 0.7251624464988708, + "learning_rate": 4.356155542478187e-05, + "loss": 2.8026, + "step": 50583 + }, + { + "epoch": 2.48, + "grad_norm": 0.734462559223175, + "learning_rate": 4.3553565661435804e-05, + "loss": 2.9494, + "step": 50584 + }, + { + "epoch": 2.48, + "grad_norm": 0.7287883758544922, + "learning_rate": 4.354557657352039e-05, + "loss": 2.782, + "step": 50585 + }, + { + "epoch": 2.48, + "grad_norm": 0.7678456902503967, + "learning_rate": 4.3537588161056466e-05, + "loss": 2.7814, + "step": 50586 + }, + { + "epoch": 2.48, + "grad_norm": 0.722035825252533, + "learning_rate": 4.352960042406526e-05, + "loss": 2.8701, + "step": 50587 + }, + { + "epoch": 2.48, + "grad_norm": 0.7033929824829102, + "learning_rate": 4.352161336256768e-05, + "loss": 2.9547, + "step": 50588 + }, + { + "epoch": 2.48, + "grad_norm": 0.7072238326072693, + "learning_rate": 4.351362697658477e-05, + "loss": 2.9044, + "step": 50589 + }, + { + "epoch": 2.48, + "grad_norm": 0.7303126454353333, + "learning_rate": 4.350564126613769e-05, + "loss": 2.7136, + "step": 50590 + }, + { + "epoch": 2.48, + "grad_norm": 0.7483873963356018, + "learning_rate": 4.349765623124739e-05, + "loss": 2.8554, + "step": 50591 + }, + { + "epoch": 2.48, + "grad_norm": 0.722395658493042, + "learning_rate": 4.3489671871934915e-05, + "loss": 2.8171, + "step": 50592 + }, + { + "epoch": 2.48, + "grad_norm": 0.71732497215271, + "learning_rate": 4.3481688188221185e-05, + "loss": 2.9246, + "step": 50593 + }, + { + "epoch": 2.48, + "grad_norm": 0.7207403182983398, + "learning_rate": 4.347370518012731e-05, + "loss": 3.0359, + "step": 50594 + }, + { + "epoch": 2.48, + "grad_norm": 0.7453340291976929, + "learning_rate": 4.346572284767442e-05, + "loss": 2.9454, + "step": 50595 + }, + { + "epoch": 2.48, + "grad_norm": 0.7322558164596558, + "learning_rate": 4.345774119088339e-05, + "loss": 3.0738, + "step": 50596 + }, + { + "epoch": 2.48, + "grad_norm": 0.7502358555793762, + "learning_rate": 4.344976020977533e-05, + "loss": 2.9023, + "step": 50597 + }, + { + "epoch": 2.48, + "grad_norm": 0.7384856939315796, + "learning_rate": 4.3441779904371234e-05, + "loss": 2.8681, + "step": 50598 + }, + { + "epoch": 2.48, + "grad_norm": 0.7567711472511292, + "learning_rate": 4.343380027469208e-05, + "loss": 3.0825, + "step": 50599 + }, + { + "epoch": 2.48, + "grad_norm": 0.7449147701263428, + "learning_rate": 4.342582132075895e-05, + "loss": 2.8255, + "step": 50600 + }, + { + "epoch": 2.48, + "grad_norm": 0.7349591255187988, + "learning_rate": 4.341784304259276e-05, + "loss": 2.8695, + "step": 50601 + }, + { + "epoch": 2.48, + "grad_norm": 0.7392820119857788, + "learning_rate": 4.340986544021466e-05, + "loss": 3.0477, + "step": 50602 + }, + { + "epoch": 2.48, + "grad_norm": 0.7246425747871399, + "learning_rate": 4.3401888513645545e-05, + "loss": 2.7809, + "step": 50603 + }, + { + "epoch": 2.48, + "grad_norm": 0.7254135608673096, + "learning_rate": 4.339391226290655e-05, + "loss": 2.8379, + "step": 50604 + }, + { + "epoch": 2.48, + "grad_norm": 0.7831563353538513, + "learning_rate": 4.3385936688018564e-05, + "loss": 2.6841, + "step": 50605 + }, + { + "epoch": 2.48, + "grad_norm": 0.7377023100852966, + "learning_rate": 4.33779617890026e-05, + "loss": 2.8574, + "step": 50606 + }, + { + "epoch": 2.48, + "grad_norm": 0.7295764088630676, + "learning_rate": 4.336998756587975e-05, + "loss": 2.7794, + "step": 50607 + }, + { + "epoch": 2.48, + "grad_norm": 0.7548102736473083, + "learning_rate": 4.336201401867089e-05, + "loss": 2.8338, + "step": 50608 + }, + { + "epoch": 2.48, + "grad_norm": 0.7400689721107483, + "learning_rate": 4.33540411473971e-05, + "loss": 3.0272, + "step": 50609 + }, + { + "epoch": 2.48, + "grad_norm": 0.7324529886245728, + "learning_rate": 4.334606895207944e-05, + "loss": 2.8919, + "step": 50610 + }, + { + "epoch": 2.48, + "grad_norm": 0.7176007628440857, + "learning_rate": 4.333809743273878e-05, + "loss": 2.9282, + "step": 50611 + }, + { + "epoch": 2.48, + "grad_norm": 0.7532137632369995, + "learning_rate": 4.333012658939625e-05, + "loss": 2.8807, + "step": 50612 + }, + { + "epoch": 2.48, + "grad_norm": 0.7730857133865356, + "learning_rate": 4.332215642207276e-05, + "loss": 2.9552, + "step": 50613 + }, + { + "epoch": 2.48, + "grad_norm": 0.7338061332702637, + "learning_rate": 4.3314186930789226e-05, + "loss": 2.8655, + "step": 50614 + }, + { + "epoch": 2.48, + "grad_norm": 0.7536339163780212, + "learning_rate": 4.33062181155668e-05, + "loss": 2.9174, + "step": 50615 + }, + { + "epoch": 2.48, + "grad_norm": 0.7680534720420837, + "learning_rate": 4.3298249976426336e-05, + "loss": 2.8697, + "step": 50616 + }, + { + "epoch": 2.48, + "grad_norm": 0.7779417634010315, + "learning_rate": 4.329028251338897e-05, + "loss": 2.6885, + "step": 50617 + }, + { + "epoch": 2.48, + "grad_norm": 0.7388231754302979, + "learning_rate": 4.3282315726475505e-05, + "loss": 2.8754, + "step": 50618 + }, + { + "epoch": 2.48, + "grad_norm": 0.7245052456855774, + "learning_rate": 4.327434961570707e-05, + "loss": 2.9758, + "step": 50619 + }, + { + "epoch": 2.48, + "grad_norm": 0.7409709692001343, + "learning_rate": 4.3266384181104595e-05, + "loss": 2.9113, + "step": 50620 + }, + { + "epoch": 2.48, + "grad_norm": 0.7566462755203247, + "learning_rate": 4.3258419422688995e-05, + "loss": 3.0348, + "step": 50621 + }, + { + "epoch": 2.48, + "grad_norm": 0.7550995349884033, + "learning_rate": 4.325045534048135e-05, + "loss": 2.9427, + "step": 50622 + }, + { + "epoch": 2.48, + "grad_norm": 0.6849644184112549, + "learning_rate": 4.324249193450254e-05, + "loss": 2.708, + "step": 50623 + }, + { + "epoch": 2.48, + "grad_norm": 0.7459685206413269, + "learning_rate": 4.3234529204773624e-05, + "loss": 2.933, + "step": 50624 + }, + { + "epoch": 2.48, + "grad_norm": 0.753688633441925, + "learning_rate": 4.3226567151315585e-05, + "loss": 2.771, + "step": 50625 + }, + { + "epoch": 2.48, + "grad_norm": 0.7236641645431519, + "learning_rate": 4.321860577414933e-05, + "loss": 2.9226, + "step": 50626 + }, + { + "epoch": 2.48, + "grad_norm": 0.7358285188674927, + "learning_rate": 4.321064507329588e-05, + "loss": 2.8284, + "step": 50627 + }, + { + "epoch": 2.48, + "grad_norm": 0.769458532333374, + "learning_rate": 4.3202685048776085e-05, + "loss": 2.8737, + "step": 50628 + }, + { + "epoch": 2.48, + "grad_norm": 0.721544086933136, + "learning_rate": 4.3194725700611e-05, + "loss": 2.9255, + "step": 50629 + }, + { + "epoch": 2.48, + "grad_norm": 0.7584481239318848, + "learning_rate": 4.3186767028821666e-05, + "loss": 2.8509, + "step": 50630 + }, + { + "epoch": 2.48, + "grad_norm": 0.8179458975791931, + "learning_rate": 4.317880903342887e-05, + "loss": 2.7925, + "step": 50631 + }, + { + "epoch": 2.48, + "grad_norm": 0.7520755529403687, + "learning_rate": 4.317085171445377e-05, + "loss": 2.8761, + "step": 50632 + }, + { + "epoch": 2.48, + "grad_norm": 0.7519469261169434, + "learning_rate": 4.31628950719171e-05, + "loss": 2.8604, + "step": 50633 + }, + { + "epoch": 2.48, + "grad_norm": 0.7431280016899109, + "learning_rate": 4.3154939105840056e-05, + "loss": 2.6998, + "step": 50634 + }, + { + "epoch": 2.48, + "grad_norm": 0.7596693634986877, + "learning_rate": 4.314698381624342e-05, + "loss": 3.019, + "step": 50635 + }, + { + "epoch": 2.48, + "grad_norm": 0.6919360756874084, + "learning_rate": 4.313902920314817e-05, + "loss": 2.8299, + "step": 50636 + }, + { + "epoch": 2.48, + "grad_norm": 0.7272117733955383, + "learning_rate": 4.313107526657532e-05, + "loss": 3.0207, + "step": 50637 + }, + { + "epoch": 2.48, + "grad_norm": 0.7418824434280396, + "learning_rate": 4.312312200654573e-05, + "loss": 2.9491, + "step": 50638 + }, + { + "epoch": 2.48, + "grad_norm": 0.759490966796875, + "learning_rate": 4.3115169423080406e-05, + "loss": 2.7287, + "step": 50639 + }, + { + "epoch": 2.48, + "grad_norm": 0.7105665802955627, + "learning_rate": 4.310721751620034e-05, + "loss": 2.8817, + "step": 50640 + }, + { + "epoch": 2.48, + "grad_norm": 0.6908241510391235, + "learning_rate": 4.309926628592645e-05, + "loss": 2.9844, + "step": 50641 + }, + { + "epoch": 2.48, + "grad_norm": 0.7713239789009094, + "learning_rate": 4.309131573227964e-05, + "loss": 2.9163, + "step": 50642 + }, + { + "epoch": 2.48, + "grad_norm": 0.7205983400344849, + "learning_rate": 4.308336585528077e-05, + "loss": 2.8435, + "step": 50643 + }, + { + "epoch": 2.48, + "grad_norm": 0.7385979294776917, + "learning_rate": 4.307541665495089e-05, + "loss": 2.8786, + "step": 50644 + }, + { + "epoch": 2.48, + "grad_norm": 0.753772497177124, + "learning_rate": 4.306746813131098e-05, + "loss": 2.8506, + "step": 50645 + }, + { + "epoch": 2.48, + "grad_norm": 0.7099587321281433, + "learning_rate": 4.305952028438186e-05, + "loss": 2.9241, + "step": 50646 + }, + { + "epoch": 2.48, + "grad_norm": 0.7350571155548096, + "learning_rate": 4.305157311418457e-05, + "loss": 3.0212, + "step": 50647 + }, + { + "epoch": 2.48, + "grad_norm": 0.7650042176246643, + "learning_rate": 4.304362662073997e-05, + "loss": 3.1281, + "step": 50648 + }, + { + "epoch": 2.48, + "grad_norm": 0.761756181716919, + "learning_rate": 4.303568080406901e-05, + "loss": 2.9067, + "step": 50649 + }, + { + "epoch": 2.48, + "grad_norm": 0.7702816724777222, + "learning_rate": 4.302773566419256e-05, + "loss": 3.1063, + "step": 50650 + }, + { + "epoch": 2.48, + "grad_norm": 0.7798253893852234, + "learning_rate": 4.301979120113159e-05, + "loss": 2.8491, + "step": 50651 + }, + { + "epoch": 2.48, + "grad_norm": 0.7569634318351746, + "learning_rate": 4.301184741490711e-05, + "loss": 2.9733, + "step": 50652 + }, + { + "epoch": 2.48, + "grad_norm": 0.7624911665916443, + "learning_rate": 4.3003904305539896e-05, + "loss": 3.0518, + "step": 50653 + }, + { + "epoch": 2.48, + "grad_norm": 0.7225568294525146, + "learning_rate": 4.2995961873051e-05, + "loss": 2.8497, + "step": 50654 + }, + { + "epoch": 2.48, + "grad_norm": 0.6899399757385254, + "learning_rate": 4.298802011746121e-05, + "loss": 2.7573, + "step": 50655 + }, + { + "epoch": 2.48, + "grad_norm": 0.7465593218803406, + "learning_rate": 4.298007903879157e-05, + "loss": 2.9092, + "step": 50656 + }, + { + "epoch": 2.48, + "grad_norm": 0.7861872315406799, + "learning_rate": 4.297213863706294e-05, + "loss": 2.9574, + "step": 50657 + }, + { + "epoch": 2.48, + "grad_norm": 0.7941358685493469, + "learning_rate": 4.296419891229619e-05, + "loss": 3.0628, + "step": 50658 + }, + { + "epoch": 2.48, + "grad_norm": 0.7803980112075806, + "learning_rate": 4.295625986451231e-05, + "loss": 2.8888, + "step": 50659 + }, + { + "epoch": 2.48, + "grad_norm": 0.7576730847358704, + "learning_rate": 4.294832149373212e-05, + "loss": 2.9702, + "step": 50660 + }, + { + "epoch": 2.48, + "grad_norm": 0.6931632161140442, + "learning_rate": 4.294038379997656e-05, + "loss": 2.9009, + "step": 50661 + }, + { + "epoch": 2.48, + "grad_norm": 0.7237907648086548, + "learning_rate": 4.293244678326666e-05, + "loss": 3.0382, + "step": 50662 + }, + { + "epoch": 2.48, + "grad_norm": 0.7643592953681946, + "learning_rate": 4.292451044362319e-05, + "loss": 3.0474, + "step": 50663 + }, + { + "epoch": 2.48, + "grad_norm": 0.7706605195999146, + "learning_rate": 4.2916574781067116e-05, + "loss": 2.8414, + "step": 50664 + }, + { + "epoch": 2.48, + "grad_norm": 0.7509461045265198, + "learning_rate": 4.2908639795619204e-05, + "loss": 2.8932, + "step": 50665 + }, + { + "epoch": 2.48, + "grad_norm": 0.7358827590942383, + "learning_rate": 4.290070548730048e-05, + "loss": 2.8797, + "step": 50666 + }, + { + "epoch": 2.48, + "grad_norm": 0.7475077509880066, + "learning_rate": 4.289277185613189e-05, + "loss": 2.8333, + "step": 50667 + }, + { + "epoch": 2.48, + "grad_norm": 0.7222772240638733, + "learning_rate": 4.288483890213416e-05, + "loss": 2.7275, + "step": 50668 + }, + { + "epoch": 2.48, + "grad_norm": 0.7573405504226685, + "learning_rate": 4.287690662532839e-05, + "loss": 2.9758, + "step": 50669 + }, + { + "epoch": 2.48, + "grad_norm": 0.7598530054092407, + "learning_rate": 4.286897502573534e-05, + "loss": 3.048, + "step": 50670 + }, + { + "epoch": 2.48, + "grad_norm": 0.7809165716171265, + "learning_rate": 4.286104410337586e-05, + "loss": 3.0113, + "step": 50671 + }, + { + "epoch": 2.48, + "grad_norm": 0.7310870885848999, + "learning_rate": 4.2853113858270974e-05, + "loss": 2.9604, + "step": 50672 + }, + { + "epoch": 2.48, + "grad_norm": 0.7275199294090271, + "learning_rate": 4.2845184290441415e-05, + "loss": 2.9085, + "step": 50673 + }, + { + "epoch": 2.48, + "grad_norm": 0.7422723174095154, + "learning_rate": 4.283725539990821e-05, + "loss": 2.789, + "step": 50674 + }, + { + "epoch": 2.48, + "grad_norm": 0.7516480088233948, + "learning_rate": 4.282932718669214e-05, + "loss": 2.8959, + "step": 50675 + }, + { + "epoch": 2.48, + "grad_norm": 0.7623327374458313, + "learning_rate": 4.2821399650814126e-05, + "loss": 2.8064, + "step": 50676 + }, + { + "epoch": 2.48, + "grad_norm": 0.7669196724891663, + "learning_rate": 4.281347279229511e-05, + "loss": 2.8621, + "step": 50677 + }, + { + "epoch": 2.48, + "grad_norm": 0.726617157459259, + "learning_rate": 4.2805546611155916e-05, + "loss": 2.616, + "step": 50678 + }, + { + "epoch": 2.48, + "grad_norm": 0.7164513468742371, + "learning_rate": 4.279762110741742e-05, + "loss": 2.7553, + "step": 50679 + }, + { + "epoch": 2.48, + "grad_norm": 0.7366183400154114, + "learning_rate": 4.2789696281100417e-05, + "loss": 3.0931, + "step": 50680 + }, + { + "epoch": 2.48, + "grad_norm": 0.7067437171936035, + "learning_rate": 4.278177213222584e-05, + "loss": 2.9771, + "step": 50681 + }, + { + "epoch": 2.48, + "grad_norm": 0.8154669404029846, + "learning_rate": 4.2773848660814656e-05, + "loss": 2.7634, + "step": 50682 + }, + { + "epoch": 2.48, + "grad_norm": 0.7058912515640259, + "learning_rate": 4.2765925866887574e-05, + "loss": 2.9507, + "step": 50683 + }, + { + "epoch": 2.48, + "grad_norm": 0.7300341129302979, + "learning_rate": 4.275800375046561e-05, + "loss": 3.0809, + "step": 50684 + }, + { + "epoch": 2.48, + "grad_norm": 0.7966801524162292, + "learning_rate": 4.275008231156954e-05, + "loss": 2.898, + "step": 50685 + }, + { + "epoch": 2.48, + "grad_norm": 0.7171053290367126, + "learning_rate": 4.27421615502202e-05, + "loss": 3.0457, + "step": 50686 + }, + { + "epoch": 2.48, + "grad_norm": 0.7475465536117554, + "learning_rate": 4.2734241466438554e-05, + "loss": 2.9942, + "step": 50687 + }, + { + "epoch": 2.48, + "grad_norm": 0.738580584526062, + "learning_rate": 4.272632206024533e-05, + "loss": 3.0752, + "step": 50688 + }, + { + "epoch": 2.48, + "grad_norm": 0.7871452569961548, + "learning_rate": 4.2718403331661547e-05, + "loss": 3.007, + "step": 50689 + }, + { + "epoch": 2.48, + "grad_norm": 0.745241105556488, + "learning_rate": 4.2710485280707873e-05, + "loss": 2.8626, + "step": 50690 + }, + { + "epoch": 2.48, + "grad_norm": 0.7506486773490906, + "learning_rate": 4.2702567907405305e-05, + "loss": 2.9813, + "step": 50691 + }, + { + "epoch": 2.48, + "grad_norm": 0.7289913296699524, + "learning_rate": 4.269465121177469e-05, + "loss": 2.7847, + "step": 50692 + }, + { + "epoch": 2.48, + "grad_norm": 0.7110291123390198, + "learning_rate": 4.268673519383687e-05, + "loss": 3.036, + "step": 50693 + }, + { + "epoch": 2.48, + "grad_norm": 0.773071825504303, + "learning_rate": 4.2678819853612646e-05, + "loss": 2.926, + "step": 50694 + }, + { + "epoch": 2.48, + "grad_norm": 0.7590256333351135, + "learning_rate": 4.267090519112285e-05, + "loss": 2.9054, + "step": 50695 + }, + { + "epoch": 2.48, + "grad_norm": 0.7588886022567749, + "learning_rate": 4.266299120638834e-05, + "loss": 2.8203, + "step": 50696 + }, + { + "epoch": 2.48, + "grad_norm": 0.7508629560470581, + "learning_rate": 4.265507789943007e-05, + "loss": 2.8658, + "step": 50697 + }, + { + "epoch": 2.48, + "grad_norm": 0.7774369120597839, + "learning_rate": 4.264716527026871e-05, + "loss": 3.0411, + "step": 50698 + }, + { + "epoch": 2.48, + "grad_norm": 0.7264341115951538, + "learning_rate": 4.263925331892526e-05, + "loss": 2.9546, + "step": 50699 + }, + { + "epoch": 2.48, + "grad_norm": 0.7322808504104614, + "learning_rate": 4.2631342045420523e-05, + "loss": 2.8151, + "step": 50700 + }, + { + "epoch": 2.48, + "grad_norm": 0.7444933652877808, + "learning_rate": 4.2623431449775194e-05, + "loss": 2.7928, + "step": 50701 + }, + { + "epoch": 2.48, + "grad_norm": 0.7420282959938049, + "learning_rate": 4.261552153201028e-05, + "loss": 2.8152, + "step": 50702 + }, + { + "epoch": 2.48, + "grad_norm": 0.7798295021057129, + "learning_rate": 4.2607612292146496e-05, + "loss": 2.8232, + "step": 50703 + }, + { + "epoch": 2.48, + "grad_norm": 0.7416540384292603, + "learning_rate": 4.259970373020477e-05, + "loss": 2.9673, + "step": 50704 + }, + { + "epoch": 2.48, + "grad_norm": 0.6930776238441467, + "learning_rate": 4.2591795846205844e-05, + "loss": 2.8342, + "step": 50705 + }, + { + "epoch": 2.48, + "grad_norm": 0.7511429786682129, + "learning_rate": 4.2583888640170674e-05, + "loss": 2.8983, + "step": 50706 + }, + { + "epoch": 2.49, + "grad_norm": 0.7975016236305237, + "learning_rate": 4.257598211211997e-05, + "loss": 2.861, + "step": 50707 + }, + { + "epoch": 2.49, + "grad_norm": 0.7396607995033264, + "learning_rate": 4.256807626207451e-05, + "loss": 2.6752, + "step": 50708 + }, + { + "epoch": 2.49, + "grad_norm": 0.7415522336959839, + "learning_rate": 4.256017109005526e-05, + "loss": 2.7366, + "step": 50709 + }, + { + "epoch": 2.49, + "grad_norm": 0.7048690915107727, + "learning_rate": 4.2552266596082926e-05, + "loss": 2.9888, + "step": 50710 + }, + { + "epoch": 2.49, + "grad_norm": 0.787742555141449, + "learning_rate": 4.2544362780178366e-05, + "loss": 2.7388, + "step": 50711 + }, + { + "epoch": 2.49, + "grad_norm": 0.8210556507110596, + "learning_rate": 4.253645964236249e-05, + "loss": 2.6602, + "step": 50712 + }, + { + "epoch": 2.49, + "grad_norm": 0.7322614789009094, + "learning_rate": 4.252855718265594e-05, + "loss": 2.8288, + "step": 50713 + }, + { + "epoch": 2.49, + "grad_norm": 0.7463746070861816, + "learning_rate": 4.252065540107972e-05, + "loss": 2.9142, + "step": 50714 + }, + { + "epoch": 2.49, + "grad_norm": 0.7110394835472107, + "learning_rate": 4.25127542976545e-05, + "loss": 2.7829, + "step": 50715 + }, + { + "epoch": 2.49, + "grad_norm": 0.7307591438293457, + "learning_rate": 4.250485387240109e-05, + "loss": 3.1279, + "step": 50716 + }, + { + "epoch": 2.49, + "grad_norm": 0.7428984045982361, + "learning_rate": 4.249695412534039e-05, + "loss": 2.9116, + "step": 50717 + }, + { + "epoch": 2.49, + "grad_norm": 0.7223466634750366, + "learning_rate": 4.24890550564931e-05, + "loss": 2.9612, + "step": 50718 + }, + { + "epoch": 2.49, + "grad_norm": 0.7388598322868347, + "learning_rate": 4.2481156665880154e-05, + "loss": 3.0812, + "step": 50719 + }, + { + "epoch": 2.49, + "grad_norm": 0.7969038486480713, + "learning_rate": 4.247325895352219e-05, + "loss": 2.5446, + "step": 50720 + }, + { + "epoch": 2.49, + "grad_norm": 0.7594483494758606, + "learning_rate": 4.2465361919440165e-05, + "loss": 2.7923, + "step": 50721 + }, + { + "epoch": 2.49, + "grad_norm": 0.7183117270469666, + "learning_rate": 4.2457465563654824e-05, + "loss": 2.7408, + "step": 50722 + }, + { + "epoch": 2.49, + "grad_norm": 0.7206244468688965, + "learning_rate": 4.244956988618692e-05, + "loss": 3.0161, + "step": 50723 + }, + { + "epoch": 2.49, + "grad_norm": 0.781730592250824, + "learning_rate": 4.24416748870573e-05, + "loss": 2.9401, + "step": 50724 + }, + { + "epoch": 2.49, + "grad_norm": 0.7326677441596985, + "learning_rate": 4.243378056628668e-05, + "loss": 2.9432, + "step": 50725 + }, + { + "epoch": 2.49, + "grad_norm": 0.7561776041984558, + "learning_rate": 4.242588692389595e-05, + "loss": 2.7459, + "step": 50726 + }, + { + "epoch": 2.49, + "grad_norm": 0.7506656646728516, + "learning_rate": 4.241799395990588e-05, + "loss": 2.7, + "step": 50727 + }, + { + "epoch": 2.49, + "grad_norm": 0.7250350713729858, + "learning_rate": 4.241010167433727e-05, + "loss": 2.8735, + "step": 50728 + }, + { + "epoch": 2.49, + "grad_norm": 0.7116358876228333, + "learning_rate": 4.24022100672109e-05, + "loss": 2.829, + "step": 50729 + }, + { + "epoch": 2.49, + "grad_norm": 0.7642216682434082, + "learning_rate": 4.239431913854744e-05, + "loss": 2.8395, + "step": 50730 + }, + { + "epoch": 2.49, + "grad_norm": 0.7662079334259033, + "learning_rate": 4.238642888836782e-05, + "loss": 2.9226, + "step": 50731 + }, + { + "epoch": 2.49, + "grad_norm": 0.7560353875160217, + "learning_rate": 4.237853931669272e-05, + "loss": 2.8221, + "step": 50732 + }, + { + "epoch": 2.49, + "grad_norm": 0.7635805010795593, + "learning_rate": 4.237065042354295e-05, + "loss": 2.9538, + "step": 50733 + }, + { + "epoch": 2.49, + "grad_norm": 0.750133752822876, + "learning_rate": 4.236276220893937e-05, + "loss": 3.0837, + "step": 50734 + }, + { + "epoch": 2.49, + "grad_norm": 0.7176488041877747, + "learning_rate": 4.2354874672902664e-05, + "loss": 2.914, + "step": 50735 + }, + { + "epoch": 2.49, + "grad_norm": 0.7769431471824646, + "learning_rate": 4.234698781545367e-05, + "loss": 3.0444, + "step": 50736 + }, + { + "epoch": 2.49, + "grad_norm": 0.7291898727416992, + "learning_rate": 4.2339101636613116e-05, + "loss": 2.7426, + "step": 50737 + }, + { + "epoch": 2.49, + "grad_norm": 0.7305982112884521, + "learning_rate": 4.2331216136401716e-05, + "loss": 2.8365, + "step": 50738 + }, + { + "epoch": 2.49, + "grad_norm": 0.7421076893806458, + "learning_rate": 4.232333131484038e-05, + "loss": 2.9467, + "step": 50739 + }, + { + "epoch": 2.49, + "grad_norm": 0.7870148420333862, + "learning_rate": 4.2315447171949735e-05, + "loss": 2.9623, + "step": 50740 + }, + { + "epoch": 2.49, + "grad_norm": 0.7365769147872925, + "learning_rate": 4.230756370775066e-05, + "loss": 3.0139, + "step": 50741 + }, + { + "epoch": 2.49, + "grad_norm": 0.7033193707466125, + "learning_rate": 4.2299680922263835e-05, + "loss": 2.8967, + "step": 50742 + }, + { + "epoch": 2.49, + "grad_norm": 0.7226850390434265, + "learning_rate": 4.229179881551008e-05, + "loss": 2.9138, + "step": 50743 + }, + { + "epoch": 2.49, + "grad_norm": 0.763777494430542, + "learning_rate": 4.228391738751015e-05, + "loss": 2.7401, + "step": 50744 + }, + { + "epoch": 2.49, + "grad_norm": 0.7105336785316467, + "learning_rate": 4.227603663828472e-05, + "loss": 3.0466, + "step": 50745 + }, + { + "epoch": 2.49, + "grad_norm": 0.7253413200378418, + "learning_rate": 4.226815656785467e-05, + "loss": 2.976, + "step": 50746 + }, + { + "epoch": 2.49, + "grad_norm": 0.7211529612541199, + "learning_rate": 4.226027717624063e-05, + "loss": 2.8548, + "step": 50747 + }, + { + "epoch": 2.49, + "grad_norm": 0.7225125432014465, + "learning_rate": 4.225239846346341e-05, + "loss": 2.841, + "step": 50748 + }, + { + "epoch": 2.49, + "grad_norm": 0.7354315519332886, + "learning_rate": 4.224452042954386e-05, + "loss": 2.6947, + "step": 50749 + }, + { + "epoch": 2.49, + "grad_norm": 0.7653220891952515, + "learning_rate": 4.223664307450263e-05, + "loss": 2.9488, + "step": 50750 + }, + { + "epoch": 2.49, + "grad_norm": 0.7226547598838806, + "learning_rate": 4.2228766398360434e-05, + "loss": 2.9566, + "step": 50751 + }, + { + "epoch": 2.49, + "grad_norm": 0.7502910494804382, + "learning_rate": 4.222089040113803e-05, + "loss": 2.8887, + "step": 50752 + }, + { + "epoch": 2.49, + "grad_norm": 0.7303741574287415, + "learning_rate": 4.221301508285619e-05, + "loss": 3.1261, + "step": 50753 + }, + { + "epoch": 2.49, + "grad_norm": 0.7288039326667786, + "learning_rate": 4.2205140443535714e-05, + "loss": 2.8225, + "step": 50754 + }, + { + "epoch": 2.49, + "grad_norm": 0.7314246296882629, + "learning_rate": 4.21972664831972e-05, + "loss": 2.9364, + "step": 50755 + }, + { + "epoch": 2.49, + "grad_norm": 0.7128660678863525, + "learning_rate": 4.218939320186158e-05, + "loss": 2.7165, + "step": 50756 + }, + { + "epoch": 2.49, + "grad_norm": 0.7295336127281189, + "learning_rate": 4.218152059954939e-05, + "loss": 2.7854, + "step": 50757 + }, + { + "epoch": 2.49, + "grad_norm": 0.7478935122489929, + "learning_rate": 4.2173648676281524e-05, + "loss": 2.8468, + "step": 50758 + }, + { + "epoch": 2.49, + "grad_norm": 0.7409915328025818, + "learning_rate": 4.216577743207863e-05, + "loss": 2.678, + "step": 50759 + }, + { + "epoch": 2.49, + "grad_norm": 0.742350161075592, + "learning_rate": 4.2157906866961425e-05, + "loss": 2.8212, + "step": 50760 + }, + { + "epoch": 2.49, + "grad_norm": 0.7968260645866394, + "learning_rate": 4.215003698095072e-05, + "loss": 2.9907, + "step": 50761 + }, + { + "epoch": 2.49, + "grad_norm": 0.712583601474762, + "learning_rate": 4.2142167774067104e-05, + "loss": 3.0309, + "step": 50762 + }, + { + "epoch": 2.49, + "grad_norm": 0.7087548971176147, + "learning_rate": 4.213429924633143e-05, + "loss": 2.7685, + "step": 50763 + }, + { + "epoch": 2.49, + "grad_norm": 0.7460187673568726, + "learning_rate": 4.212643139776444e-05, + "loss": 2.9495, + "step": 50764 + }, + { + "epoch": 2.49, + "grad_norm": 0.8229163885116577, + "learning_rate": 4.211856422838679e-05, + "loss": 2.9066, + "step": 50765 + }, + { + "epoch": 2.49, + "grad_norm": 0.7447080612182617, + "learning_rate": 4.211069773821922e-05, + "loss": 3.06, + "step": 50766 + }, + { + "epoch": 2.49, + "grad_norm": 0.712235689163208, + "learning_rate": 4.2102831927282364e-05, + "loss": 2.8577, + "step": 50767 + }, + { + "epoch": 2.49, + "grad_norm": 0.7422438859939575, + "learning_rate": 4.209496679559703e-05, + "loss": 2.9714, + "step": 50768 + }, + { + "epoch": 2.49, + "grad_norm": 0.7196982502937317, + "learning_rate": 4.2087102343183995e-05, + "loss": 2.5705, + "step": 50769 + }, + { + "epoch": 2.49, + "grad_norm": 0.7336692214012146, + "learning_rate": 4.207923857006379e-05, + "loss": 2.9921, + "step": 50770 + }, + { + "epoch": 2.49, + "grad_norm": 0.8158669471740723, + "learning_rate": 4.2071375476257316e-05, + "loss": 2.8008, + "step": 50771 + }, + { + "epoch": 2.49, + "grad_norm": 0.7556344270706177, + "learning_rate": 4.2063513061785236e-05, + "loss": 2.9787, + "step": 50772 + }, + { + "epoch": 2.49, + "grad_norm": 0.7272917628288269, + "learning_rate": 4.2055651326668126e-05, + "loss": 2.8565, + "step": 50773 + }, + { + "epoch": 2.49, + "grad_norm": 0.7603312730789185, + "learning_rate": 4.204779027092684e-05, + "loss": 2.9942, + "step": 50774 + }, + { + "epoch": 2.49, + "grad_norm": 0.728800892829895, + "learning_rate": 4.203992989458199e-05, + "loss": 2.9436, + "step": 50775 + }, + { + "epoch": 2.49, + "grad_norm": 0.7117544412612915, + "learning_rate": 4.2032070197654364e-05, + "loss": 2.9086, + "step": 50776 + }, + { + "epoch": 2.49, + "grad_norm": 0.7658325433731079, + "learning_rate": 4.2024211180164545e-05, + "loss": 2.9249, + "step": 50777 + }, + { + "epoch": 2.49, + "grad_norm": 0.6998578906059265, + "learning_rate": 4.201635284213335e-05, + "loss": 2.9473, + "step": 50778 + }, + { + "epoch": 2.49, + "grad_norm": 0.7814579606056213, + "learning_rate": 4.200849518358146e-05, + "loss": 2.9595, + "step": 50779 + }, + { + "epoch": 2.49, + "grad_norm": 0.7035899758338928, + "learning_rate": 4.200063820452956e-05, + "loss": 2.8746, + "step": 50780 + }, + { + "epoch": 2.49, + "grad_norm": 0.7148592472076416, + "learning_rate": 4.1992781904998295e-05, + "loss": 2.9529, + "step": 50781 + }, + { + "epoch": 2.49, + "grad_norm": 0.7354755401611328, + "learning_rate": 4.1984926285008355e-05, + "loss": 2.8767, + "step": 50782 + }, + { + "epoch": 2.49, + "grad_norm": 0.723908543586731, + "learning_rate": 4.197707134458046e-05, + "loss": 2.9528, + "step": 50783 + }, + { + "epoch": 2.49, + "grad_norm": 0.7492967844009399, + "learning_rate": 4.196921708373535e-05, + "loss": 3.0366, + "step": 50784 + }, + { + "epoch": 2.49, + "grad_norm": 0.7645527720451355, + "learning_rate": 4.1961363502493615e-05, + "loss": 2.9825, + "step": 50785 + }, + { + "epoch": 2.49, + "grad_norm": 0.7469053268432617, + "learning_rate": 4.1953510600876036e-05, + "loss": 2.858, + "step": 50786 + }, + { + "epoch": 2.49, + "grad_norm": 0.7710843682289124, + "learning_rate": 4.194565837890327e-05, + "loss": 2.8289, + "step": 50787 + }, + { + "epoch": 2.49, + "grad_norm": 0.7673295140266418, + "learning_rate": 4.193780683659589e-05, + "loss": 3.0426, + "step": 50788 + }, + { + "epoch": 2.49, + "grad_norm": 0.7491339445114136, + "learning_rate": 4.192995597397475e-05, + "loss": 3.077, + "step": 50789 + }, + { + "epoch": 2.49, + "grad_norm": 0.756820023059845, + "learning_rate": 4.192210579106037e-05, + "loss": 3.0091, + "step": 50790 + }, + { + "epoch": 2.49, + "grad_norm": 0.7435303926467896, + "learning_rate": 4.1914256287873525e-05, + "loss": 2.8404, + "step": 50791 + }, + { + "epoch": 2.49, + "grad_norm": 0.7386417388916016, + "learning_rate": 4.190640746443484e-05, + "loss": 2.8662, + "step": 50792 + }, + { + "epoch": 2.49, + "grad_norm": 0.7664363980293274, + "learning_rate": 4.189855932076497e-05, + "loss": 3.0039, + "step": 50793 + }, + { + "epoch": 2.49, + "grad_norm": 0.7032740116119385, + "learning_rate": 4.1890711856884786e-05, + "loss": 2.7883, + "step": 50794 + }, + { + "epoch": 2.49, + "grad_norm": 0.7009146809577942, + "learning_rate": 4.188286507281464e-05, + "loss": 2.8372, + "step": 50795 + }, + { + "epoch": 2.49, + "grad_norm": 0.7520615458488464, + "learning_rate": 4.187501896857542e-05, + "loss": 2.9782, + "step": 50796 + }, + { + "epoch": 2.49, + "grad_norm": 0.726274847984314, + "learning_rate": 4.1867173544187685e-05, + "loss": 2.7904, + "step": 50797 + }, + { + "epoch": 2.49, + "grad_norm": 0.7836074233055115, + "learning_rate": 4.18593287996721e-05, + "loss": 3.0073, + "step": 50798 + }, + { + "epoch": 2.49, + "grad_norm": 0.7328587770462036, + "learning_rate": 4.1851484735049425e-05, + "loss": 2.7654, + "step": 50799 + }, + { + "epoch": 2.49, + "grad_norm": 0.7041208744049072, + "learning_rate": 4.1843641350340205e-05, + "loss": 3.1183, + "step": 50800 + }, + { + "epoch": 2.49, + "grad_norm": 0.7885406017303467, + "learning_rate": 4.183579864556523e-05, + "loss": 3.0173, + "step": 50801 + }, + { + "epoch": 2.49, + "grad_norm": 0.7367026805877686, + "learning_rate": 4.182795662074505e-05, + "loss": 2.8478, + "step": 50802 + }, + { + "epoch": 2.49, + "grad_norm": 0.7012369632720947, + "learning_rate": 4.182011527590031e-05, + "loss": 2.9023, + "step": 50803 + }, + { + "epoch": 2.49, + "grad_norm": 0.8011801838874817, + "learning_rate": 4.1812274611051765e-05, + "loss": 2.8189, + "step": 50804 + }, + { + "epoch": 2.49, + "grad_norm": 0.7242926359176636, + "learning_rate": 4.1804434626219896e-05, + "loss": 2.8217, + "step": 50805 + }, + { + "epoch": 2.49, + "grad_norm": 0.6962056159973145, + "learning_rate": 4.179659532142555e-05, + "loss": 2.7622, + "step": 50806 + }, + { + "epoch": 2.49, + "grad_norm": 0.728546142578125, + "learning_rate": 4.1788756696689216e-05, + "loss": 3.0023, + "step": 50807 + }, + { + "epoch": 2.49, + "grad_norm": 0.7366439700126648, + "learning_rate": 4.1780918752031644e-05, + "loss": 2.9501, + "step": 50808 + }, + { + "epoch": 2.49, + "grad_norm": 0.7231490612030029, + "learning_rate": 4.177308148747345e-05, + "loss": 2.9292, + "step": 50809 + }, + { + "epoch": 2.49, + "grad_norm": 0.7084015011787415, + "learning_rate": 4.176524490303521e-05, + "loss": 2.782, + "step": 50810 + }, + { + "epoch": 2.49, + "grad_norm": 0.7416843175888062, + "learning_rate": 4.175740899873768e-05, + "loss": 2.898, + "step": 50811 + }, + { + "epoch": 2.49, + "grad_norm": 0.7785187363624573, + "learning_rate": 4.1749573774601354e-05, + "loss": 3.0888, + "step": 50812 + }, + { + "epoch": 2.49, + "grad_norm": 0.7746663689613342, + "learning_rate": 4.174173923064703e-05, + "loss": 2.8559, + "step": 50813 + }, + { + "epoch": 2.49, + "grad_norm": 0.7415159344673157, + "learning_rate": 4.173390536689517e-05, + "loss": 2.9723, + "step": 50814 + }, + { + "epoch": 2.49, + "grad_norm": 1.2300888299942017, + "learning_rate": 4.172607218336652e-05, + "loss": 2.9255, + "step": 50815 + }, + { + "epoch": 2.49, + "grad_norm": 0.7681926488876343, + "learning_rate": 4.171823968008177e-05, + "loss": 2.791, + "step": 50816 + }, + { + "epoch": 2.49, + "grad_norm": 0.7152865529060364, + "learning_rate": 4.171040785706146e-05, + "loss": 2.6684, + "step": 50817 + }, + { + "epoch": 2.49, + "grad_norm": 0.7356628179550171, + "learning_rate": 4.1702576714326205e-05, + "loss": 2.873, + "step": 50818 + }, + { + "epoch": 2.49, + "grad_norm": 0.7573530673980713, + "learning_rate": 4.1694746251896606e-05, + "loss": 2.8615, + "step": 50819 + }, + { + "epoch": 2.49, + "grad_norm": 0.7217455506324768, + "learning_rate": 4.168691646979333e-05, + "loss": 2.9256, + "step": 50820 + }, + { + "epoch": 2.49, + "grad_norm": 0.7954509854316711, + "learning_rate": 4.1679087368037066e-05, + "loss": 2.9018, + "step": 50821 + }, + { + "epoch": 2.49, + "grad_norm": 0.7794967293739319, + "learning_rate": 4.16712589466483e-05, + "loss": 2.9567, + "step": 50822 + }, + { + "epoch": 2.49, + "grad_norm": 0.719433605670929, + "learning_rate": 4.166343120564781e-05, + "loss": 2.7478, + "step": 50823 + }, + { + "epoch": 2.49, + "grad_norm": 0.7607887387275696, + "learning_rate": 4.1655604145056085e-05, + "loss": 2.9903, + "step": 50824 + }, + { + "epoch": 2.49, + "grad_norm": 0.7895941734313965, + "learning_rate": 4.1647777764893745e-05, + "loss": 2.8344, + "step": 50825 + }, + { + "epoch": 2.49, + "grad_norm": 0.7224075794219971, + "learning_rate": 4.163995206518149e-05, + "loss": 3.0332, + "step": 50826 + }, + { + "epoch": 2.49, + "grad_norm": 0.7128440737724304, + "learning_rate": 4.1632127045939826e-05, + "loss": 2.6762, + "step": 50827 + }, + { + "epoch": 2.49, + "grad_norm": 0.7446962594985962, + "learning_rate": 4.162430270718948e-05, + "loss": 2.9153, + "step": 50828 + }, + { + "epoch": 2.49, + "grad_norm": 0.7283123731613159, + "learning_rate": 4.161647904895092e-05, + "loss": 2.8714, + "step": 50829 + }, + { + "epoch": 2.49, + "grad_norm": 0.7481231093406677, + "learning_rate": 4.1608656071244897e-05, + "loss": 2.9547, + "step": 50830 + }, + { + "epoch": 2.49, + "grad_norm": 0.7285983562469482, + "learning_rate": 4.1600833774091945e-05, + "loss": 2.8458, + "step": 50831 + }, + { + "epoch": 2.49, + "grad_norm": 0.7572833299636841, + "learning_rate": 4.159301215751259e-05, + "loss": 2.9878, + "step": 50832 + }, + { + "epoch": 2.49, + "grad_norm": 0.748000979423523, + "learning_rate": 4.1585191221527595e-05, + "loss": 2.9091, + "step": 50833 + }, + { + "epoch": 2.49, + "grad_norm": 0.7040547132492065, + "learning_rate": 4.157737096615742e-05, + "loss": 2.8364, + "step": 50834 + }, + { + "epoch": 2.49, + "grad_norm": 0.7504925727844238, + "learning_rate": 4.1569551391422695e-05, + "loss": 2.8602, + "step": 50835 + }, + { + "epoch": 2.49, + "grad_norm": 0.7645800113677979, + "learning_rate": 4.15617324973441e-05, + "loss": 2.9762, + "step": 50836 + }, + { + "epoch": 2.49, + "grad_norm": 0.7223212122917175, + "learning_rate": 4.1553914283942094e-05, + "loss": 3.0629, + "step": 50837 + }, + { + "epoch": 2.49, + "grad_norm": 0.7501276135444641, + "learning_rate": 4.154609675123741e-05, + "loss": 2.8408, + "step": 50838 + }, + { + "epoch": 2.49, + "grad_norm": 0.7350544929504395, + "learning_rate": 4.153827989925055e-05, + "loss": 2.9287, + "step": 50839 + }, + { + "epoch": 2.49, + "grad_norm": 0.7663587331771851, + "learning_rate": 4.153046372800208e-05, + "loss": 2.9415, + "step": 50840 + }, + { + "epoch": 2.49, + "grad_norm": 0.8215747475624084, + "learning_rate": 4.1522648237512666e-05, + "loss": 2.8608, + "step": 50841 + }, + { + "epoch": 2.49, + "grad_norm": 0.7529155015945435, + "learning_rate": 4.151483342780282e-05, + "loss": 2.9685, + "step": 50842 + }, + { + "epoch": 2.49, + "grad_norm": 0.7362189888954163, + "learning_rate": 4.1507019298893186e-05, + "loss": 2.8494, + "step": 50843 + }, + { + "epoch": 2.49, + "grad_norm": 0.7676265835762024, + "learning_rate": 4.149920585080429e-05, + "loss": 2.8628, + "step": 50844 + }, + { + "epoch": 2.49, + "grad_norm": 0.7176138758659363, + "learning_rate": 4.1491393083556766e-05, + "loss": 3.0375, + "step": 50845 + }, + { + "epoch": 2.49, + "grad_norm": 0.7533155083656311, + "learning_rate": 4.1483580997171186e-05, + "loss": 2.8913, + "step": 50846 + }, + { + "epoch": 2.49, + "grad_norm": 0.7027636170387268, + "learning_rate": 4.147576959166802e-05, + "loss": 2.9655, + "step": 50847 + }, + { + "epoch": 2.49, + "grad_norm": 0.7691273093223572, + "learning_rate": 4.146795886706798e-05, + "loss": 2.9198, + "step": 50848 + }, + { + "epoch": 2.49, + "grad_norm": 0.7978218793869019, + "learning_rate": 4.146014882339154e-05, + "loss": 2.9319, + "step": 50849 + }, + { + "epoch": 2.49, + "grad_norm": 0.7753750085830688, + "learning_rate": 4.145233946065929e-05, + "loss": 2.8632, + "step": 50850 + }, + { + "epoch": 2.49, + "grad_norm": 0.7168684601783752, + "learning_rate": 4.144453077889189e-05, + "loss": 3.0417, + "step": 50851 + }, + { + "epoch": 2.49, + "grad_norm": 0.730107307434082, + "learning_rate": 4.143672277810983e-05, + "loss": 2.9331, + "step": 50852 + }, + { + "epoch": 2.49, + "grad_norm": 0.7644616365432739, + "learning_rate": 4.142891545833369e-05, + "loss": 2.7204, + "step": 50853 + }, + { + "epoch": 2.49, + "grad_norm": 0.7631624937057495, + "learning_rate": 4.142110881958398e-05, + "loss": 3.1044, + "step": 50854 + }, + { + "epoch": 2.49, + "grad_norm": 0.7302795648574829, + "learning_rate": 4.1413302861881256e-05, + "loss": 2.8666, + "step": 50855 + }, + { + "epoch": 2.49, + "grad_norm": 0.7175984382629395, + "learning_rate": 4.140549758524624e-05, + "loss": 2.8372, + "step": 50856 + }, + { + "epoch": 2.49, + "grad_norm": 0.7876809239387512, + "learning_rate": 4.139769298969927e-05, + "loss": 2.9315, + "step": 50857 + }, + { + "epoch": 2.49, + "grad_norm": 0.7984057068824768, + "learning_rate": 4.1389889075261105e-05, + "loss": 2.756, + "step": 50858 + }, + { + "epoch": 2.49, + "grad_norm": 0.8823890686035156, + "learning_rate": 4.138208584195213e-05, + "loss": 2.8131, + "step": 50859 + }, + { + "epoch": 2.49, + "grad_norm": 0.7620226144790649, + "learning_rate": 4.137428328979302e-05, + "loss": 3.0526, + "step": 50860 + }, + { + "epoch": 2.49, + "grad_norm": 0.7054426670074463, + "learning_rate": 4.1366481418804266e-05, + "loss": 3.0109, + "step": 50861 + }, + { + "epoch": 2.49, + "grad_norm": 0.7472173571586609, + "learning_rate": 4.135868022900638e-05, + "loss": 2.9188, + "step": 50862 + }, + { + "epoch": 2.49, + "grad_norm": 0.7145726084709167, + "learning_rate": 4.1350879720420014e-05, + "loss": 2.9618, + "step": 50863 + }, + { + "epoch": 2.49, + "grad_norm": 0.7612037062644958, + "learning_rate": 4.1343079893065586e-05, + "loss": 2.781, + "step": 50864 + }, + { + "epoch": 2.49, + "grad_norm": 0.7213705778121948, + "learning_rate": 4.133528074696372e-05, + "loss": 2.8342, + "step": 50865 + }, + { + "epoch": 2.49, + "grad_norm": 0.8010908365249634, + "learning_rate": 4.132748228213498e-05, + "loss": 2.7692, + "step": 50866 + }, + { + "epoch": 2.49, + "grad_norm": 0.7797710299491882, + "learning_rate": 4.1319684498599906e-05, + "loss": 3.0265, + "step": 50867 + }, + { + "epoch": 2.49, + "grad_norm": 0.7466559410095215, + "learning_rate": 4.131188739637896e-05, + "loss": 2.8454, + "step": 50868 + }, + { + "epoch": 2.49, + "grad_norm": 0.7062391638755798, + "learning_rate": 4.130409097549268e-05, + "loss": 2.9389, + "step": 50869 + }, + { + "epoch": 2.49, + "grad_norm": 0.7370114326477051, + "learning_rate": 4.1296295235961627e-05, + "loss": 2.7059, + "step": 50870 + }, + { + "epoch": 2.49, + "grad_norm": 0.7448911070823669, + "learning_rate": 4.128850017780641e-05, + "loss": 3.0902, + "step": 50871 + }, + { + "epoch": 2.49, + "grad_norm": 0.7794041633605957, + "learning_rate": 4.128070580104742e-05, + "loss": 3.0043, + "step": 50872 + }, + { + "epoch": 2.49, + "grad_norm": 0.7436469197273254, + "learning_rate": 4.127291210570536e-05, + "loss": 2.7554, + "step": 50873 + }, + { + "epoch": 2.49, + "grad_norm": 0.8578379154205322, + "learning_rate": 4.12651190918006e-05, + "loss": 2.766, + "step": 50874 + }, + { + "epoch": 2.49, + "grad_norm": 0.7621721029281616, + "learning_rate": 4.12573267593537e-05, + "loss": 2.7681, + "step": 50875 + }, + { + "epoch": 2.49, + "grad_norm": 0.6907230019569397, + "learning_rate": 4.1249535108385246e-05, + "loss": 2.9351, + "step": 50876 + }, + { + "epoch": 2.49, + "grad_norm": 0.7239318490028381, + "learning_rate": 4.1241744138915676e-05, + "loss": 2.8717, + "step": 50877 + }, + { + "epoch": 2.49, + "grad_norm": 0.7087956070899963, + "learning_rate": 4.1233953850965616e-05, + "loss": 2.7214, + "step": 50878 + }, + { + "epoch": 2.49, + "grad_norm": 0.7738022804260254, + "learning_rate": 4.122616424455545e-05, + "loss": 2.937, + "step": 50879 + }, + { + "epoch": 2.49, + "grad_norm": 0.7342925071716309, + "learning_rate": 4.121837531970575e-05, + "loss": 2.8623, + "step": 50880 + }, + { + "epoch": 2.49, + "grad_norm": 0.7471789121627808, + "learning_rate": 4.121058707643712e-05, + "loss": 2.9401, + "step": 50881 + }, + { + "epoch": 2.49, + "grad_norm": 0.7654462456703186, + "learning_rate": 4.120279951476999e-05, + "loss": 2.8794, + "step": 50882 + }, + { + "epoch": 2.49, + "grad_norm": 0.6907823085784912, + "learning_rate": 4.119501263472489e-05, + "loss": 2.8218, + "step": 50883 + }, + { + "epoch": 2.49, + "grad_norm": 0.7497276067733765, + "learning_rate": 4.1187226436322265e-05, + "loss": 2.9644, + "step": 50884 + }, + { + "epoch": 2.49, + "grad_norm": 0.6869165301322937, + "learning_rate": 4.117944091958273e-05, + "loss": 3.0914, + "step": 50885 + }, + { + "epoch": 2.49, + "grad_norm": 0.7467650175094604, + "learning_rate": 4.117165608452667e-05, + "loss": 2.7657, + "step": 50886 + }, + { + "epoch": 2.49, + "grad_norm": 0.7433682084083557, + "learning_rate": 4.1163871931174674e-05, + "loss": 2.8362, + "step": 50887 + }, + { + "epoch": 2.49, + "grad_norm": 0.7507684826850891, + "learning_rate": 4.1156088459547254e-05, + "loss": 2.918, + "step": 50888 + }, + { + "epoch": 2.49, + "grad_norm": 0.7188553810119629, + "learning_rate": 4.114830566966489e-05, + "loss": 2.9672, + "step": 50889 + }, + { + "epoch": 2.49, + "grad_norm": 0.7091402411460876, + "learning_rate": 4.114052356154811e-05, + "loss": 2.8625, + "step": 50890 + }, + { + "epoch": 2.49, + "grad_norm": 0.7486855983734131, + "learning_rate": 4.113274213521726e-05, + "loss": 2.8445, + "step": 50891 + }, + { + "epoch": 2.49, + "grad_norm": 0.7630000114440918, + "learning_rate": 4.112496139069298e-05, + "loss": 2.8836, + "step": 50892 + }, + { + "epoch": 2.49, + "grad_norm": 0.8397104740142822, + "learning_rate": 4.11171813279958e-05, + "loss": 2.9828, + "step": 50893 + }, + { + "epoch": 2.49, + "grad_norm": 0.7878146767616272, + "learning_rate": 4.110940194714603e-05, + "loss": 3.0389, + "step": 50894 + }, + { + "epoch": 2.49, + "grad_norm": 0.744600236415863, + "learning_rate": 4.1101623248164386e-05, + "loss": 2.859, + "step": 50895 + }, + { + "epoch": 2.49, + "grad_norm": 0.7375232577323914, + "learning_rate": 4.109384523107122e-05, + "loss": 3.0262, + "step": 50896 + }, + { + "epoch": 2.49, + "grad_norm": 0.7445172071456909, + "learning_rate": 4.108606789588695e-05, + "loss": 2.8573, + "step": 50897 + }, + { + "epoch": 2.49, + "grad_norm": 0.7916731834411621, + "learning_rate": 4.107829124263227e-05, + "loss": 2.5277, + "step": 50898 + }, + { + "epoch": 2.49, + "grad_norm": 0.7740074396133423, + "learning_rate": 4.107051527132741e-05, + "loss": 2.8962, + "step": 50899 + }, + { + "epoch": 2.49, + "grad_norm": 0.7415305376052856, + "learning_rate": 4.10627399819931e-05, + "loss": 3.0435, + "step": 50900 + }, + { + "epoch": 2.49, + "grad_norm": 0.7645403146743774, + "learning_rate": 4.105496537464958e-05, + "loss": 3.0199, + "step": 50901 + }, + { + "epoch": 2.49, + "grad_norm": 0.7634096741676331, + "learning_rate": 4.104719144931749e-05, + "loss": 2.9406, + "step": 50902 + }, + { + "epoch": 2.49, + "grad_norm": 0.7687584161758423, + "learning_rate": 4.103941820601731e-05, + "loss": 2.9617, + "step": 50903 + }, + { + "epoch": 2.49, + "grad_norm": 0.7485208511352539, + "learning_rate": 4.1031645644769494e-05, + "loss": 2.8768, + "step": 50904 + }, + { + "epoch": 2.49, + "grad_norm": 0.7247251868247986, + "learning_rate": 4.102387376559444e-05, + "loss": 2.923, + "step": 50905 + }, + { + "epoch": 2.49, + "grad_norm": 0.8152522444725037, + "learning_rate": 4.101610256851263e-05, + "loss": 2.8562, + "step": 50906 + }, + { + "epoch": 2.49, + "grad_norm": 0.7078531384468079, + "learning_rate": 4.100833205354452e-05, + "loss": 3.0635, + "step": 50907 + }, + { + "epoch": 2.49, + "grad_norm": 0.7330393195152283, + "learning_rate": 4.100056222071073e-05, + "loss": 2.6665, + "step": 50908 + }, + { + "epoch": 2.49, + "grad_norm": 0.7819045186042786, + "learning_rate": 4.099279307003154e-05, + "loss": 3.0064, + "step": 50909 + }, + { + "epoch": 2.49, + "grad_norm": 0.8102908730506897, + "learning_rate": 4.098502460152753e-05, + "loss": 3.0131, + "step": 50910 + }, + { + "epoch": 2.5, + "grad_norm": 0.7458033561706543, + "learning_rate": 4.097725681521912e-05, + "loss": 2.8156, + "step": 50911 + }, + { + "epoch": 2.5, + "grad_norm": 0.750956654548645, + "learning_rate": 4.0969489711126726e-05, + "loss": 3.1724, + "step": 50912 + }, + { + "epoch": 2.5, + "grad_norm": 0.7348149418830872, + "learning_rate": 4.09617232892709e-05, + "loss": 2.8127, + "step": 50913 + }, + { + "epoch": 2.5, + "grad_norm": 0.7550942897796631, + "learning_rate": 4.095395754967196e-05, + "loss": 2.8972, + "step": 50914 + }, + { + "epoch": 2.5, + "grad_norm": 0.7148580551147461, + "learning_rate": 4.0946192492350514e-05, + "loss": 2.8255, + "step": 50915 + }, + { + "epoch": 2.5, + "grad_norm": 0.7225046157836914, + "learning_rate": 4.0938428117326895e-05, + "loss": 2.7282, + "step": 50916 + }, + { + "epoch": 2.5, + "grad_norm": 0.7666391134262085, + "learning_rate": 4.093066442462157e-05, + "loss": 2.7273, + "step": 50917 + }, + { + "epoch": 2.5, + "grad_norm": 0.746823787689209, + "learning_rate": 4.092290141425511e-05, + "loss": 2.9481, + "step": 50918 + }, + { + "epoch": 2.5, + "grad_norm": 0.7512428164482117, + "learning_rate": 4.091513908624784e-05, + "loss": 2.8239, + "step": 50919 + }, + { + "epoch": 2.5, + "grad_norm": 0.7290512919425964, + "learning_rate": 4.090737744062023e-05, + "loss": 2.7134, + "step": 50920 + }, + { + "epoch": 2.5, + "grad_norm": 0.7385398149490356, + "learning_rate": 4.089961647739268e-05, + "loss": 3.15, + "step": 50921 + }, + { + "epoch": 2.5, + "grad_norm": 0.7204491496086121, + "learning_rate": 4.089185619658566e-05, + "loss": 2.7842, + "step": 50922 + }, + { + "epoch": 2.5, + "grad_norm": 0.7641138434410095, + "learning_rate": 4.088409659821971e-05, + "loss": 2.9864, + "step": 50923 + }, + { + "epoch": 2.5, + "grad_norm": 0.7317721843719482, + "learning_rate": 4.087633768231511e-05, + "loss": 2.883, + "step": 50924 + }, + { + "epoch": 2.5, + "grad_norm": 0.7656816244125366, + "learning_rate": 4.0868579448892415e-05, + "loss": 2.813, + "step": 50925 + }, + { + "epoch": 2.5, + "grad_norm": 0.7583822011947632, + "learning_rate": 4.086082189797204e-05, + "loss": 2.9214, + "step": 50926 + }, + { + "epoch": 2.5, + "grad_norm": 0.7554618716239929, + "learning_rate": 4.085306502957427e-05, + "loss": 2.8041, + "step": 50927 + }, + { + "epoch": 2.5, + "grad_norm": 0.7482122778892517, + "learning_rate": 4.0845308843719756e-05, + "loss": 2.8939, + "step": 50928 + }, + { + "epoch": 2.5, + "grad_norm": 0.8274185657501221, + "learning_rate": 4.0837553340428784e-05, + "loss": 2.7364, + "step": 50929 + }, + { + "epoch": 2.5, + "grad_norm": 0.7026564478874207, + "learning_rate": 4.082979851972183e-05, + "loss": 2.7929, + "step": 50930 + }, + { + "epoch": 2.5, + "grad_norm": 0.8461477160453796, + "learning_rate": 4.082204438161929e-05, + "loss": 2.9104, + "step": 50931 + }, + { + "epoch": 2.5, + "grad_norm": 0.7340132594108582, + "learning_rate": 4.0814290926141636e-05, + "loss": 2.814, + "step": 50932 + }, + { + "epoch": 2.5, + "grad_norm": 0.7171696424484253, + "learning_rate": 4.0806538153309284e-05, + "loss": 2.979, + "step": 50933 + }, + { + "epoch": 2.5, + "grad_norm": 0.7311767935752869, + "learning_rate": 4.079878606314253e-05, + "loss": 2.9879, + "step": 50934 + }, + { + "epoch": 2.5, + "grad_norm": 0.7532653212547302, + "learning_rate": 4.079103465566198e-05, + "loss": 2.8224, + "step": 50935 + }, + { + "epoch": 2.5, + "grad_norm": 0.7105627059936523, + "learning_rate": 4.078328393088789e-05, + "loss": 2.803, + "step": 50936 + }, + { + "epoch": 2.5, + "grad_norm": 0.7665572166442871, + "learning_rate": 4.077553388884074e-05, + "loss": 3.1236, + "step": 50937 + }, + { + "epoch": 2.5, + "grad_norm": 0.7285357713699341, + "learning_rate": 4.076778452954101e-05, + "loss": 2.8497, + "step": 50938 + }, + { + "epoch": 2.5, + "grad_norm": 0.7463434338569641, + "learning_rate": 4.0760035853009e-05, + "loss": 2.943, + "step": 50939 + }, + { + "epoch": 2.5, + "grad_norm": 0.7962722778320312, + "learning_rate": 4.075228785926521e-05, + "loss": 3.0252, + "step": 50940 + }, + { + "epoch": 2.5, + "grad_norm": 0.7439616322517395, + "learning_rate": 4.0744540548329996e-05, + "loss": 2.759, + "step": 50941 + }, + { + "epoch": 2.5, + "grad_norm": 0.7312211990356445, + "learning_rate": 4.073679392022371e-05, + "loss": 2.6245, + "step": 50942 + }, + { + "epoch": 2.5, + "grad_norm": 0.7195311188697815, + "learning_rate": 4.072904797496686e-05, + "loss": 3.0517, + "step": 50943 + }, + { + "epoch": 2.5, + "grad_norm": 0.6905331015586853, + "learning_rate": 4.072130271257975e-05, + "loss": 2.9166, + "step": 50944 + }, + { + "epoch": 2.5, + "grad_norm": 0.7037015557289124, + "learning_rate": 4.071355813308291e-05, + "loss": 2.9205, + "step": 50945 + }, + { + "epoch": 2.5, + "grad_norm": 0.7677614688873291, + "learning_rate": 4.070581423649657e-05, + "loss": 2.7968, + "step": 50946 + }, + { + "epoch": 2.5, + "grad_norm": 0.7849667072296143, + "learning_rate": 4.069807102284127e-05, + "loss": 2.8588, + "step": 50947 + }, + { + "epoch": 2.5, + "grad_norm": 0.7227040529251099, + "learning_rate": 4.0690328492137366e-05, + "loss": 2.7918, + "step": 50948 + }, + { + "epoch": 2.5, + "grad_norm": 0.8007916212081909, + "learning_rate": 4.068258664440518e-05, + "loss": 3.1101, + "step": 50949 + }, + { + "epoch": 2.5, + "grad_norm": 0.7214465141296387, + "learning_rate": 4.0674845479665185e-05, + "loss": 2.6496, + "step": 50950 + }, + { + "epoch": 2.5, + "grad_norm": 0.7386294603347778, + "learning_rate": 4.066710499793771e-05, + "loss": 2.728, + "step": 50951 + }, + { + "epoch": 2.5, + "grad_norm": 0.7470434904098511, + "learning_rate": 4.065936519924315e-05, + "loss": 2.9676, + "step": 50952 + }, + { + "epoch": 2.5, + "grad_norm": 0.7561085820198059, + "learning_rate": 4.065162608360198e-05, + "loss": 2.8916, + "step": 50953 + }, + { + "epoch": 2.5, + "grad_norm": 0.7485124468803406, + "learning_rate": 4.0643887651034514e-05, + "loss": 2.6446, + "step": 50954 + }, + { + "epoch": 2.5, + "grad_norm": 0.7447924017906189, + "learning_rate": 4.063614990156112e-05, + "loss": 2.8693, + "step": 50955 + }, + { + "epoch": 2.5, + "grad_norm": 0.729457437992096, + "learning_rate": 4.062841283520213e-05, + "loss": 2.7354, + "step": 50956 + }, + { + "epoch": 2.5, + "grad_norm": 0.7371611595153809, + "learning_rate": 4.062067645197802e-05, + "loss": 3.0109, + "step": 50957 + }, + { + "epoch": 2.5, + "grad_norm": 0.7636083960533142, + "learning_rate": 4.061294075190914e-05, + "loss": 2.6421, + "step": 50958 + }, + { + "epoch": 2.5, + "grad_norm": 0.8881843090057373, + "learning_rate": 4.060520573501581e-05, + "loss": 2.8784, + "step": 50959 + }, + { + "epoch": 2.5, + "grad_norm": 0.7290064692497253, + "learning_rate": 4.059747140131855e-05, + "loss": 3.0156, + "step": 50960 + }, + { + "epoch": 2.5, + "grad_norm": 0.7382261157035828, + "learning_rate": 4.0589737750837506e-05, + "loss": 2.779, + "step": 50961 + }, + { + "epoch": 2.5, + "grad_norm": 0.7628456354141235, + "learning_rate": 4.058200478359326e-05, + "loss": 2.9542, + "step": 50962 + }, + { + "epoch": 2.5, + "grad_norm": 0.7667835354804993, + "learning_rate": 4.05742724996061e-05, + "loss": 3.0102, + "step": 50963 + }, + { + "epoch": 2.5, + "grad_norm": 0.7994858026504517, + "learning_rate": 4.0566540898896275e-05, + "loss": 2.8177, + "step": 50964 + }, + { + "epoch": 2.5, + "grad_norm": 0.7050232887268066, + "learning_rate": 4.0558809981484364e-05, + "loss": 2.9967, + "step": 50965 + }, + { + "epoch": 2.5, + "grad_norm": 0.7449204325675964, + "learning_rate": 4.055107974739053e-05, + "loss": 2.8754, + "step": 50966 + }, + { + "epoch": 2.5, + "grad_norm": 0.7724292874336243, + "learning_rate": 4.054335019663527e-05, + "loss": 2.6654, + "step": 50967 + }, + { + "epoch": 2.5, + "grad_norm": 0.7795135378837585, + "learning_rate": 4.0535621329238857e-05, + "loss": 2.7718, + "step": 50968 + }, + { + "epoch": 2.5, + "grad_norm": 0.6958447098731995, + "learning_rate": 4.052789314522172e-05, + "loss": 2.8641, + "step": 50969 + }, + { + "epoch": 2.5, + "grad_norm": 0.7487135529518127, + "learning_rate": 4.05201656446042e-05, + "loss": 2.8888, + "step": 50970 + }, + { + "epoch": 2.5, + "grad_norm": 0.7429445385932922, + "learning_rate": 4.051243882740652e-05, + "loss": 2.8184, + "step": 50971 + }, + { + "epoch": 2.5, + "grad_norm": 0.7123727798461914, + "learning_rate": 4.050471269364926e-05, + "loss": 3.1401, + "step": 50972 + }, + { + "epoch": 2.5, + "grad_norm": 0.7284667491912842, + "learning_rate": 4.049698724335254e-05, + "loss": 2.7846, + "step": 50973 + }, + { + "epoch": 2.5, + "grad_norm": 0.7588906288146973, + "learning_rate": 4.04892624765368e-05, + "loss": 2.6116, + "step": 50974 + }, + { + "epoch": 2.5, + "grad_norm": 0.7456479072570801, + "learning_rate": 4.0481538393222516e-05, + "loss": 2.9805, + "step": 50975 + }, + { + "epoch": 2.5, + "grad_norm": 0.7160006761550903, + "learning_rate": 4.047381499342986e-05, + "loss": 2.7069, + "step": 50976 + }, + { + "epoch": 2.5, + "grad_norm": 0.7530381083488464, + "learning_rate": 4.0466092277179274e-05, + "loss": 2.8844, + "step": 50977 + }, + { + "epoch": 2.5, + "grad_norm": 0.7419160604476929, + "learning_rate": 4.0458370244490954e-05, + "loss": 2.5117, + "step": 50978 + }, + { + "epoch": 2.5, + "grad_norm": 0.7657301425933838, + "learning_rate": 4.045064889538536e-05, + "loss": 2.5939, + "step": 50979 + }, + { + "epoch": 2.5, + "grad_norm": 0.7660251259803772, + "learning_rate": 4.044292822988284e-05, + "loss": 2.7927, + "step": 50980 + }, + { + "epoch": 2.5, + "grad_norm": 0.8668144941329956, + "learning_rate": 4.043520824800368e-05, + "loss": 2.9519, + "step": 50981 + }, + { + "epoch": 2.5, + "grad_norm": 0.7728575468063354, + "learning_rate": 4.042748894976826e-05, + "loss": 3.0094, + "step": 50982 + }, + { + "epoch": 2.5, + "grad_norm": 0.7155167460441589, + "learning_rate": 4.04197703351968e-05, + "loss": 2.6734, + "step": 50983 + }, + { + "epoch": 2.5, + "grad_norm": 0.7189810276031494, + "learning_rate": 4.041205240430981e-05, + "loss": 2.9716, + "step": 50984 + }, + { + "epoch": 2.5, + "grad_norm": 0.728173553943634, + "learning_rate": 4.040433515712749e-05, + "loss": 2.8533, + "step": 50985 + }, + { + "epoch": 2.5, + "grad_norm": 0.7641592025756836, + "learning_rate": 4.039661859367014e-05, + "loss": 3.3096, + "step": 50986 + }, + { + "epoch": 2.5, + "grad_norm": 0.7122716903686523, + "learning_rate": 4.0388902713958184e-05, + "loss": 2.7429, + "step": 50987 + }, + { + "epoch": 2.5, + "grad_norm": 0.7600113153457642, + "learning_rate": 4.038118751801184e-05, + "loss": 2.947, + "step": 50988 + }, + { + "epoch": 2.5, + "grad_norm": 0.7478308081626892, + "learning_rate": 4.0373473005851454e-05, + "loss": 2.8651, + "step": 50989 + }, + { + "epoch": 2.5, + "grad_norm": 0.7460908889770508, + "learning_rate": 4.036575917749748e-05, + "loss": 2.9738, + "step": 50990 + }, + { + "epoch": 2.5, + "grad_norm": 0.7135355472564697, + "learning_rate": 4.03580460329701e-05, + "loss": 2.7612, + "step": 50991 + }, + { + "epoch": 2.5, + "grad_norm": 0.743721604347229, + "learning_rate": 4.0350333572289675e-05, + "loss": 2.7419, + "step": 50992 + }, + { + "epoch": 2.5, + "grad_norm": 0.737909197807312, + "learning_rate": 4.034262179547644e-05, + "loss": 2.8635, + "step": 50993 + }, + { + "epoch": 2.5, + "grad_norm": 0.7796975374221802, + "learning_rate": 4.033491070255075e-05, + "loss": 2.8157, + "step": 50994 + }, + { + "epoch": 2.5, + "grad_norm": 0.7635996341705322, + "learning_rate": 4.032720029353302e-05, + "loss": 2.777, + "step": 50995 + }, + { + "epoch": 2.5, + "grad_norm": 0.7595275640487671, + "learning_rate": 4.031949056844338e-05, + "loss": 2.8294, + "step": 50996 + }, + { + "epoch": 2.5, + "grad_norm": 0.7468879222869873, + "learning_rate": 4.03117815273023e-05, + "loss": 2.8661, + "step": 50997 + }, + { + "epoch": 2.5, + "grad_norm": 0.7194596529006958, + "learning_rate": 4.0304073170130036e-05, + "loss": 3.0833, + "step": 50998 + }, + { + "epoch": 2.5, + "grad_norm": 0.7229351997375488, + "learning_rate": 4.029636549694677e-05, + "loss": 3.0329, + "step": 50999 + }, + { + "epoch": 2.5, + "grad_norm": 0.7696372270584106, + "learning_rate": 4.028865850777295e-05, + "loss": 3.0217, + "step": 51000 + }, + { + "epoch": 2.5, + "grad_norm": 0.730970561504364, + "learning_rate": 4.028095220262877e-05, + "loss": 2.8736, + "step": 51001 + }, + { + "epoch": 2.5, + "grad_norm": 0.7224681973457336, + "learning_rate": 4.0273246581534636e-05, + "loss": 2.857, + "step": 51002 + }, + { + "epoch": 2.5, + "grad_norm": 0.7172154188156128, + "learning_rate": 4.026554164451073e-05, + "loss": 2.9542, + "step": 51003 + }, + { + "epoch": 2.5, + "grad_norm": 0.7283074259757996, + "learning_rate": 4.025783739157738e-05, + "loss": 2.8613, + "step": 51004 + }, + { + "epoch": 2.5, + "grad_norm": 0.7916626930236816, + "learning_rate": 4.0250133822755004e-05, + "loss": 3.0261, + "step": 51005 + }, + { + "epoch": 2.5, + "grad_norm": 0.751471757888794, + "learning_rate": 4.0242430938063776e-05, + "loss": 2.9181, + "step": 51006 + }, + { + "epoch": 2.5, + "grad_norm": 0.7159141898155212, + "learning_rate": 4.023472873752398e-05, + "loss": 2.8968, + "step": 51007 + }, + { + "epoch": 2.5, + "grad_norm": 0.769489049911499, + "learning_rate": 4.022702722115584e-05, + "loss": 2.9992, + "step": 51008 + }, + { + "epoch": 2.5, + "grad_norm": 0.7926540970802307, + "learning_rate": 4.0219326388979736e-05, + "loss": 2.8284, + "step": 51009 + }, + { + "epoch": 2.5, + "grad_norm": 0.7663864493370056, + "learning_rate": 4.021162624101598e-05, + "loss": 2.996, + "step": 51010 + }, + { + "epoch": 2.5, + "grad_norm": 0.8369364142417908, + "learning_rate": 4.020392677728473e-05, + "loss": 2.7815, + "step": 51011 + }, + { + "epoch": 2.5, + "grad_norm": 0.7207963466644287, + "learning_rate": 4.019622799780644e-05, + "loss": 3.0512, + "step": 51012 + }, + { + "epoch": 2.5, + "grad_norm": 0.826167106628418, + "learning_rate": 4.018852990260125e-05, + "loss": 2.9079, + "step": 51013 + }, + { + "epoch": 2.5, + "grad_norm": 0.7820410132408142, + "learning_rate": 4.018083249168942e-05, + "loss": 3.0379, + "step": 51014 + }, + { + "epoch": 2.5, + "grad_norm": 0.7185248136520386, + "learning_rate": 4.0173135765091325e-05, + "loss": 2.8409, + "step": 51015 + }, + { + "epoch": 2.5, + "grad_norm": 0.7619501948356628, + "learning_rate": 4.016543972282715e-05, + "loss": 2.7795, + "step": 51016 + }, + { + "epoch": 2.5, + "grad_norm": 0.7843639254570007, + "learning_rate": 4.0157744364917254e-05, + "loss": 2.994, + "step": 51017 + }, + { + "epoch": 2.5, + "grad_norm": 0.7171763777732849, + "learning_rate": 4.015004969138178e-05, + "loss": 2.8102, + "step": 51018 + }, + { + "epoch": 2.5, + "grad_norm": 0.7367191910743713, + "learning_rate": 4.014235570224105e-05, + "loss": 2.9537, + "step": 51019 + }, + { + "epoch": 2.5, + "grad_norm": 0.7253427505493164, + "learning_rate": 4.013466239751544e-05, + "loss": 2.9189, + "step": 51020 + }, + { + "epoch": 2.5, + "grad_norm": 0.7575175166130066, + "learning_rate": 4.0126969777225105e-05, + "loss": 2.8425, + "step": 51021 + }, + { + "epoch": 2.5, + "grad_norm": 0.7350121140480042, + "learning_rate": 4.0119277841390296e-05, + "loss": 2.9173, + "step": 51022 + }, + { + "epoch": 2.5, + "grad_norm": 0.7644230723381042, + "learning_rate": 4.0111586590031264e-05, + "loss": 2.844, + "step": 51023 + }, + { + "epoch": 2.5, + "grad_norm": 0.7468816041946411, + "learning_rate": 4.010389602316829e-05, + "loss": 3.0427, + "step": 51024 + }, + { + "epoch": 2.5, + "grad_norm": 0.7907638549804688, + "learning_rate": 4.0096206140821695e-05, + "loss": 2.8312, + "step": 51025 + }, + { + "epoch": 2.5, + "grad_norm": 0.7258133888244629, + "learning_rate": 4.00885169430116e-05, + "loss": 2.8525, + "step": 51026 + }, + { + "epoch": 2.5, + "grad_norm": 0.7306505441665649, + "learning_rate": 4.00808284297584e-05, + "loss": 2.9856, + "step": 51027 + }, + { + "epoch": 2.5, + "grad_norm": 0.7286311388015747, + "learning_rate": 4.007314060108228e-05, + "loss": 3.0388, + "step": 51028 + }, + { + "epoch": 2.5, + "grad_norm": 0.7742034792900085, + "learning_rate": 4.006545345700343e-05, + "loss": 2.9805, + "step": 51029 + }, + { + "epoch": 2.5, + "grad_norm": 0.7489369511604309, + "learning_rate": 4.005776699754217e-05, + "loss": 2.8488, + "step": 51030 + }, + { + "epoch": 2.5, + "grad_norm": 0.7013357877731323, + "learning_rate": 4.00500812227187e-05, + "loss": 2.9688, + "step": 51031 + }, + { + "epoch": 2.5, + "grad_norm": 0.7368605136871338, + "learning_rate": 4.0042396132553355e-05, + "loss": 2.7995, + "step": 51032 + }, + { + "epoch": 2.5, + "grad_norm": 0.7846304178237915, + "learning_rate": 4.003471172706623e-05, + "loss": 2.9969, + "step": 51033 + }, + { + "epoch": 2.5, + "grad_norm": 0.7510572075843811, + "learning_rate": 4.00270280062777e-05, + "loss": 2.8879, + "step": 51034 + }, + { + "epoch": 2.5, + "grad_norm": 0.7406590580940247, + "learning_rate": 4.0019344970207976e-05, + "loss": 2.8439, + "step": 51035 + }, + { + "epoch": 2.5, + "grad_norm": 0.7419261336326599, + "learning_rate": 4.001166261887716e-05, + "loss": 2.7751, + "step": 51036 + }, + { + "epoch": 2.5, + "grad_norm": 0.7146857976913452, + "learning_rate": 4.000398095230569e-05, + "loss": 3.0476, + "step": 51037 + }, + { + "epoch": 2.5, + "grad_norm": 0.7753526568412781, + "learning_rate": 3.999629997051359e-05, + "loss": 2.6755, + "step": 51038 + }, + { + "epoch": 2.5, + "grad_norm": 0.7410012483596802, + "learning_rate": 3.998861967352131e-05, + "loss": 2.756, + "step": 51039 + }, + { + "epoch": 2.5, + "grad_norm": 0.7166079878807068, + "learning_rate": 3.998094006134885e-05, + "loss": 2.9648, + "step": 51040 + }, + { + "epoch": 2.5, + "grad_norm": 0.7411850094795227, + "learning_rate": 3.997326113401659e-05, + "loss": 2.8957, + "step": 51041 + }, + { + "epoch": 2.5, + "grad_norm": 0.7261947989463806, + "learning_rate": 3.996558289154479e-05, + "loss": 2.9451, + "step": 51042 + }, + { + "epoch": 2.5, + "grad_norm": 0.7302013635635376, + "learning_rate": 3.9957905333953574e-05, + "loss": 3.0935, + "step": 51043 + }, + { + "epoch": 2.5, + "grad_norm": 0.7443523406982422, + "learning_rate": 3.99502284612632e-05, + "loss": 3.1287, + "step": 51044 + }, + { + "epoch": 2.5, + "grad_norm": 0.7418161630630493, + "learning_rate": 3.994255227349381e-05, + "loss": 2.9928, + "step": 51045 + }, + { + "epoch": 2.5, + "grad_norm": 0.7650214433670044, + "learning_rate": 3.9934876770665694e-05, + "loss": 2.4832, + "step": 51046 + }, + { + "epoch": 2.5, + "grad_norm": 0.7540137767791748, + "learning_rate": 3.9927201952799094e-05, + "loss": 2.8556, + "step": 51047 + }, + { + "epoch": 2.5, + "grad_norm": 0.7868987321853638, + "learning_rate": 3.991952781991417e-05, + "loss": 3.0147, + "step": 51048 + }, + { + "epoch": 2.5, + "grad_norm": 0.6718930006027222, + "learning_rate": 3.9911854372031196e-05, + "loss": 2.7966, + "step": 51049 + }, + { + "epoch": 2.5, + "grad_norm": 0.722568690776825, + "learning_rate": 3.990418160917036e-05, + "loss": 2.8758, + "step": 51050 + }, + { + "epoch": 2.5, + "grad_norm": 0.7367124557495117, + "learning_rate": 3.989650953135178e-05, + "loss": 2.8975, + "step": 51051 + }, + { + "epoch": 2.5, + "grad_norm": 0.7118638157844543, + "learning_rate": 3.988883813859578e-05, + "loss": 2.8865, + "step": 51052 + }, + { + "epoch": 2.5, + "grad_norm": 0.7357069849967957, + "learning_rate": 3.98811674309225e-05, + "loss": 3.1935, + "step": 51053 + }, + { + "epoch": 2.5, + "grad_norm": 0.7383196949958801, + "learning_rate": 3.9873497408352194e-05, + "loss": 3.1177, + "step": 51054 + }, + { + "epoch": 2.5, + "grad_norm": 0.7383975982666016, + "learning_rate": 3.986582807090498e-05, + "loss": 2.8114, + "step": 51055 + }, + { + "epoch": 2.5, + "grad_norm": 0.7209492325782776, + "learning_rate": 3.9858159418601174e-05, + "loss": 2.8342, + "step": 51056 + }, + { + "epoch": 2.5, + "grad_norm": 0.7390090823173523, + "learning_rate": 3.98504914514609e-05, + "loss": 2.7422, + "step": 51057 + }, + { + "epoch": 2.5, + "grad_norm": 0.7286245226860046, + "learning_rate": 3.984282416950433e-05, + "loss": 2.655, + "step": 51058 + }, + { + "epoch": 2.5, + "grad_norm": 0.7232625484466553, + "learning_rate": 3.983515757275172e-05, + "loss": 3.0018, + "step": 51059 + }, + { + "epoch": 2.5, + "grad_norm": 0.7462209463119507, + "learning_rate": 3.9827491661223185e-05, + "loss": 2.7941, + "step": 51060 + }, + { + "epoch": 2.5, + "grad_norm": 0.7977537512779236, + "learning_rate": 3.981982643493898e-05, + "loss": 2.9345, + "step": 51061 + }, + { + "epoch": 2.5, + "grad_norm": 0.7521597743034363, + "learning_rate": 3.9812161893919356e-05, + "loss": 2.7811, + "step": 51062 + }, + { + "epoch": 2.5, + "grad_norm": 0.7260125279426575, + "learning_rate": 3.980449803818433e-05, + "loss": 2.8128, + "step": 51063 + }, + { + "epoch": 2.5, + "grad_norm": 0.7464675307273865, + "learning_rate": 3.9796834867754244e-05, + "loss": 3.1766, + "step": 51064 + }, + { + "epoch": 2.5, + "grad_norm": 0.71709144115448, + "learning_rate": 3.9789172382649226e-05, + "loss": 2.724, + "step": 51065 + }, + { + "epoch": 2.5, + "grad_norm": 0.7842869162559509, + "learning_rate": 3.978151058288939e-05, + "loss": 2.8037, + "step": 51066 + }, + { + "epoch": 2.5, + "grad_norm": 0.7320528626441956, + "learning_rate": 3.9773849468495044e-05, + "loss": 2.8024, + "step": 51067 + }, + { + "epoch": 2.5, + "grad_norm": 0.7437505125999451, + "learning_rate": 3.976618903948622e-05, + "loss": 2.9678, + "step": 51068 + }, + { + "epoch": 2.5, + "grad_norm": 0.7431596517562866, + "learning_rate": 3.9758529295883226e-05, + "loss": 2.9204, + "step": 51069 + }, + { + "epoch": 2.5, + "grad_norm": 0.7559123635292053, + "learning_rate": 3.9750870237706146e-05, + "loss": 2.9171, + "step": 51070 + }, + { + "epoch": 2.5, + "grad_norm": 0.7823855876922607, + "learning_rate": 3.974321186497523e-05, + "loss": 2.9853, + "step": 51071 + }, + { + "epoch": 2.5, + "grad_norm": 0.7621415853500366, + "learning_rate": 3.97355541777106e-05, + "loss": 2.9533, + "step": 51072 + }, + { + "epoch": 2.5, + "grad_norm": 0.7120107412338257, + "learning_rate": 3.9727897175932377e-05, + "loss": 2.8572, + "step": 51073 + }, + { + "epoch": 2.5, + "grad_norm": 0.7418361306190491, + "learning_rate": 3.972024085966087e-05, + "loss": 2.9235, + "step": 51074 + }, + { + "epoch": 2.5, + "grad_norm": 0.7263070940971375, + "learning_rate": 3.9712585228916064e-05, + "loss": 2.7967, + "step": 51075 + }, + { + "epoch": 2.5, + "grad_norm": 0.7198815941810608, + "learning_rate": 3.970493028371824e-05, + "loss": 3.0169, + "step": 51076 + }, + { + "epoch": 2.5, + "grad_norm": 0.7565320134162903, + "learning_rate": 3.9697276024087555e-05, + "loss": 2.8488, + "step": 51077 + }, + { + "epoch": 2.5, + "grad_norm": 0.7032371759414673, + "learning_rate": 3.968962245004419e-05, + "loss": 3.0858, + "step": 51078 + }, + { + "epoch": 2.5, + "grad_norm": 0.7568981051445007, + "learning_rate": 3.968196956160826e-05, + "loss": 3.0574, + "step": 51079 + }, + { + "epoch": 2.5, + "grad_norm": 0.7816786170005798, + "learning_rate": 3.967431735879982e-05, + "loss": 2.8361, + "step": 51080 + }, + { + "epoch": 2.5, + "grad_norm": 0.7889500260353088, + "learning_rate": 3.966666584163916e-05, + "loss": 2.8982, + "step": 51081 + }, + { + "epoch": 2.5, + "grad_norm": 0.8262700438499451, + "learning_rate": 3.965901501014648e-05, + "loss": 2.9173, + "step": 51082 + }, + { + "epoch": 2.5, + "grad_norm": 0.7360769510269165, + "learning_rate": 3.965136486434174e-05, + "loss": 2.8857, + "step": 51083 + }, + { + "epoch": 2.5, + "grad_norm": 0.7041099071502686, + "learning_rate": 3.964371540424529e-05, + "loss": 2.8569, + "step": 51084 + }, + { + "epoch": 2.5, + "grad_norm": 0.7271643280982971, + "learning_rate": 3.963606662987712e-05, + "loss": 2.841, + "step": 51085 + }, + { + "epoch": 2.5, + "grad_norm": 0.7795175313949585, + "learning_rate": 3.9628418541257534e-05, + "loss": 2.9807, + "step": 51086 + }, + { + "epoch": 2.5, + "grad_norm": 0.7626325488090515, + "learning_rate": 3.962077113840656e-05, + "loss": 2.9521, + "step": 51087 + }, + { + "epoch": 2.5, + "grad_norm": 0.7495712637901306, + "learning_rate": 3.9613124421344275e-05, + "loss": 2.8875, + "step": 51088 + }, + { + "epoch": 2.5, + "grad_norm": 0.7421875, + "learning_rate": 3.960547839009101e-05, + "loss": 2.8907, + "step": 51089 + }, + { + "epoch": 2.5, + "grad_norm": 0.7435591816902161, + "learning_rate": 3.959783304466673e-05, + "loss": 2.9217, + "step": 51090 + }, + { + "epoch": 2.5, + "grad_norm": 0.7713919878005981, + "learning_rate": 3.959018838509162e-05, + "loss": 2.7192, + "step": 51091 + }, + { + "epoch": 2.5, + "grad_norm": 0.7549655437469482, + "learning_rate": 3.958254441138592e-05, + "loss": 2.9587, + "step": 51092 + }, + { + "epoch": 2.5, + "grad_norm": 0.7387896776199341, + "learning_rate": 3.9574901123569655e-05, + "loss": 2.9832, + "step": 51093 + }, + { + "epoch": 2.5, + "grad_norm": 0.7287462949752808, + "learning_rate": 3.956725852166302e-05, + "loss": 2.7358, + "step": 51094 + }, + { + "epoch": 2.5, + "grad_norm": 0.7182101011276245, + "learning_rate": 3.9559616605686e-05, + "loss": 2.9946, + "step": 51095 + }, + { + "epoch": 2.5, + "grad_norm": 0.7613502740859985, + "learning_rate": 3.955197537565885e-05, + "loss": 2.8062, + "step": 51096 + }, + { + "epoch": 2.5, + "grad_norm": 0.7491579055786133, + "learning_rate": 3.954433483160174e-05, + "loss": 2.9498, + "step": 51097 + }, + { + "epoch": 2.5, + "grad_norm": 0.7342544794082642, + "learning_rate": 3.953669497353467e-05, + "loss": 2.814, + "step": 51098 + }, + { + "epoch": 2.5, + "grad_norm": 0.7425305843353271, + "learning_rate": 3.952905580147787e-05, + "loss": 2.8258, + "step": 51099 + }, + { + "epoch": 2.5, + "grad_norm": 0.73570716381073, + "learning_rate": 3.952141731545141e-05, + "loss": 2.7309, + "step": 51100 + }, + { + "epoch": 2.5, + "grad_norm": 0.7669528722763062, + "learning_rate": 3.951377951547533e-05, + "loss": 2.9622, + "step": 51101 + }, + { + "epoch": 2.5, + "grad_norm": 0.8032728433609009, + "learning_rate": 3.950614240156992e-05, + "loss": 2.6455, + "step": 51102 + }, + { + "epoch": 2.5, + "grad_norm": 0.7362860441207886, + "learning_rate": 3.949850597375509e-05, + "loss": 2.8745, + "step": 51103 + }, + { + "epoch": 2.5, + "grad_norm": 0.7232388854026794, + "learning_rate": 3.9490870232051164e-05, + "loss": 2.8219, + "step": 51104 + }, + { + "epoch": 2.5, + "grad_norm": 0.7235947847366333, + "learning_rate": 3.948323517647806e-05, + "loss": 2.9228, + "step": 51105 + }, + { + "epoch": 2.5, + "grad_norm": 0.7498955726623535, + "learning_rate": 3.9475600807055986e-05, + "loss": 3.1506, + "step": 51106 + }, + { + "epoch": 2.5, + "grad_norm": 0.7524616122245789, + "learning_rate": 3.94679671238051e-05, + "loss": 2.9171, + "step": 51107 + }, + { + "epoch": 2.5, + "grad_norm": 0.707608163356781, + "learning_rate": 3.9460334126745455e-05, + "loss": 2.8342, + "step": 51108 + }, + { + "epoch": 2.5, + "grad_norm": 0.7663477659225464, + "learning_rate": 3.945270181589716e-05, + "loss": 2.8276, + "step": 51109 + }, + { + "epoch": 2.5, + "grad_norm": 0.7328577041625977, + "learning_rate": 3.94450701912802e-05, + "loss": 2.7313, + "step": 51110 + }, + { + "epoch": 2.5, + "grad_norm": 0.7494615912437439, + "learning_rate": 3.9437439252914805e-05, + "loss": 2.828, + "step": 51111 + }, + { + "epoch": 2.5, + "grad_norm": 0.7223064303398132, + "learning_rate": 3.9429809000821145e-05, + "loss": 2.8811, + "step": 51112 + }, + { + "epoch": 2.5, + "grad_norm": 0.746543288230896, + "learning_rate": 3.9422179435019106e-05, + "loss": 2.8726, + "step": 51113 + }, + { + "epoch": 2.5, + "grad_norm": 0.7731015682220459, + "learning_rate": 3.941455055552898e-05, + "loss": 2.7396, + "step": 51114 + }, + { + "epoch": 2.51, + "grad_norm": 0.7490095496177673, + "learning_rate": 3.9406922362370775e-05, + "loss": 2.8321, + "step": 51115 + }, + { + "epoch": 2.51, + "grad_norm": 0.7323755621910095, + "learning_rate": 3.9399294855564545e-05, + "loss": 2.8937, + "step": 51116 + }, + { + "epoch": 2.51, + "grad_norm": 0.7344057559967041, + "learning_rate": 3.9391668035130444e-05, + "loss": 2.7432, + "step": 51117 + }, + { + "epoch": 2.51, + "grad_norm": 0.744623601436615, + "learning_rate": 3.938404190108848e-05, + "loss": 2.8554, + "step": 51118 + }, + { + "epoch": 2.51, + "grad_norm": 0.7350231409072876, + "learning_rate": 3.937641645345888e-05, + "loss": 2.9645, + "step": 51119 + }, + { + "epoch": 2.51, + "grad_norm": 0.7532517910003662, + "learning_rate": 3.936879169226156e-05, + "loss": 2.8945, + "step": 51120 + }, + { + "epoch": 2.51, + "grad_norm": 0.7580457329750061, + "learning_rate": 3.936116761751676e-05, + "loss": 2.9476, + "step": 51121 + }, + { + "epoch": 2.51, + "grad_norm": 0.7381550073623657, + "learning_rate": 3.935354422924445e-05, + "loss": 2.6959, + "step": 51122 + }, + { + "epoch": 2.51, + "grad_norm": 0.7573136687278748, + "learning_rate": 3.93459215274647e-05, + "loss": 2.9742, + "step": 51123 + }, + { + "epoch": 2.51, + "grad_norm": 0.7315176129341125, + "learning_rate": 3.933829951219769e-05, + "loss": 3.0369, + "step": 51124 + }, + { + "epoch": 2.51, + "grad_norm": 0.8132463693618774, + "learning_rate": 3.9330678183463385e-05, + "loss": 2.9298, + "step": 51125 + }, + { + "epoch": 2.51, + "grad_norm": 0.7474046945571899, + "learning_rate": 3.932305754128196e-05, + "loss": 2.8276, + "step": 51126 + }, + { + "epoch": 2.51, + "grad_norm": 0.7960267663002014, + "learning_rate": 3.9315437585673356e-05, + "loss": 2.8031, + "step": 51127 + }, + { + "epoch": 2.51, + "grad_norm": 0.7314231395721436, + "learning_rate": 3.9307818316657744e-05, + "loss": 2.7175, + "step": 51128 + }, + { + "epoch": 2.51, + "grad_norm": 0.7347849011421204, + "learning_rate": 3.930019973425523e-05, + "loss": 2.9909, + "step": 51129 + }, + { + "epoch": 2.51, + "grad_norm": 0.7521358728408813, + "learning_rate": 3.92925818384858e-05, + "loss": 2.9316, + "step": 51130 + }, + { + "epoch": 2.51, + "grad_norm": 0.7546024322509766, + "learning_rate": 3.9284964629369545e-05, + "loss": 3.0779, + "step": 51131 + }, + { + "epoch": 2.51, + "grad_norm": 0.7242218255996704, + "learning_rate": 3.927734810692643e-05, + "loss": 3.0528, + "step": 51132 + }, + { + "epoch": 2.51, + "grad_norm": 0.7637183666229248, + "learning_rate": 3.9269732271176624e-05, + "loss": 2.8383, + "step": 51133 + }, + { + "epoch": 2.51, + "grad_norm": 0.7487627267837524, + "learning_rate": 3.926211712214024e-05, + "loss": 3.0692, + "step": 51134 + }, + { + "epoch": 2.51, + "grad_norm": 0.7518205642700195, + "learning_rate": 3.925450265983718e-05, + "loss": 2.7788, + "step": 51135 + }, + { + "epoch": 2.51, + "grad_norm": 0.7417492270469666, + "learning_rate": 3.924688888428765e-05, + "loss": 2.9289, + "step": 51136 + }, + { + "epoch": 2.51, + "grad_norm": 0.7587374448776245, + "learning_rate": 3.923927579551163e-05, + "loss": 3.0257, + "step": 51137 + }, + { + "epoch": 2.51, + "grad_norm": 0.7488877177238464, + "learning_rate": 3.923166339352912e-05, + "loss": 2.7045, + "step": 51138 + }, + { + "epoch": 2.51, + "grad_norm": 0.743396520614624, + "learning_rate": 3.9224051678360256e-05, + "loss": 2.9128, + "step": 51139 + }, + { + "epoch": 2.51, + "grad_norm": 0.739387571811676, + "learning_rate": 3.921644065002502e-05, + "loss": 2.8156, + "step": 51140 + }, + { + "epoch": 2.51, + "grad_norm": 0.771299421787262, + "learning_rate": 3.920883030854355e-05, + "loss": 2.7683, + "step": 51141 + }, + { + "epoch": 2.51, + "grad_norm": 0.7252259850502014, + "learning_rate": 3.9201220653935735e-05, + "loss": 2.9843, + "step": 51142 + }, + { + "epoch": 2.51, + "grad_norm": 0.7235883474349976, + "learning_rate": 3.919361168622174e-05, + "loss": 2.9401, + "step": 51143 + }, + { + "epoch": 2.51, + "grad_norm": 0.73777174949646, + "learning_rate": 3.918600340542163e-05, + "loss": 2.9255, + "step": 51144 + }, + { + "epoch": 2.51, + "grad_norm": 0.7344832420349121, + "learning_rate": 3.9178395811555396e-05, + "loss": 2.7266, + "step": 51145 + }, + { + "epoch": 2.51, + "grad_norm": 0.7370460033416748, + "learning_rate": 3.917078890464309e-05, + "loss": 2.7727, + "step": 51146 + }, + { + "epoch": 2.51, + "grad_norm": 0.7238647937774658, + "learning_rate": 3.916318268470466e-05, + "loss": 2.9441, + "step": 51147 + }, + { + "epoch": 2.51, + "grad_norm": 0.7683873176574707, + "learning_rate": 3.9155577151760195e-05, + "loss": 2.7424, + "step": 51148 + }, + { + "epoch": 2.51, + "grad_norm": 0.7333675026893616, + "learning_rate": 3.914797230582981e-05, + "loss": 2.9641, + "step": 51149 + }, + { + "epoch": 2.51, + "grad_norm": 0.789627730846405, + "learning_rate": 3.914036814693339e-05, + "loss": 2.6447, + "step": 51150 + }, + { + "epoch": 2.51, + "grad_norm": 0.7240229845046997, + "learning_rate": 3.913276467509111e-05, + "loss": 2.7824, + "step": 51151 + }, + { + "epoch": 2.51, + "grad_norm": 0.7708064317703247, + "learning_rate": 3.91251618903229e-05, + "loss": 2.9563, + "step": 51152 + }, + { + "epoch": 2.51, + "grad_norm": 0.7075161337852478, + "learning_rate": 3.9117559792648764e-05, + "loss": 2.9594, + "step": 51153 + }, + { + "epoch": 2.51, + "grad_norm": 0.8024198412895203, + "learning_rate": 3.910995838208883e-05, + "loss": 2.9412, + "step": 51154 + }, + { + "epoch": 2.51, + "grad_norm": 0.7190766334533691, + "learning_rate": 3.910235765866302e-05, + "loss": 3.0936, + "step": 51155 + }, + { + "epoch": 2.51, + "grad_norm": 0.7699955701828003, + "learning_rate": 3.9094757622391407e-05, + "loss": 3.0168, + "step": 51156 + }, + { + "epoch": 2.51, + "grad_norm": 0.7096518874168396, + "learning_rate": 3.908715827329394e-05, + "loss": 2.9694, + "step": 51157 + }, + { + "epoch": 2.51, + "grad_norm": 0.7942549586296082, + "learning_rate": 3.907955961139078e-05, + "loss": 2.8103, + "step": 51158 + }, + { + "epoch": 2.51, + "grad_norm": 0.7407145500183105, + "learning_rate": 3.90719616367018e-05, + "loss": 2.7899, + "step": 51159 + }, + { + "epoch": 2.51, + "grad_norm": 0.7580335736274719, + "learning_rate": 3.9064364349247025e-05, + "loss": 2.9164, + "step": 51160 + }, + { + "epoch": 2.51, + "grad_norm": 0.7507249116897583, + "learning_rate": 3.9056767749046534e-05, + "loss": 2.795, + "step": 51161 + }, + { + "epoch": 2.51, + "grad_norm": 0.7054608464241028, + "learning_rate": 3.9049171836120255e-05, + "loss": 3.0047, + "step": 51162 + }, + { + "epoch": 2.51, + "grad_norm": 0.8000778555870056, + "learning_rate": 3.9041576610488226e-05, + "loss": 2.8907, + "step": 51163 + }, + { + "epoch": 2.51, + "grad_norm": 0.7749502062797546, + "learning_rate": 3.903398207217053e-05, + "loss": 2.9108, + "step": 51164 + }, + { + "epoch": 2.51, + "grad_norm": 0.7396555542945862, + "learning_rate": 3.902638822118703e-05, + "loss": 2.8869, + "step": 51165 + }, + { + "epoch": 2.51, + "grad_norm": 0.7471228241920471, + "learning_rate": 3.9018795057557874e-05, + "loss": 2.7746, + "step": 51166 + }, + { + "epoch": 2.51, + "grad_norm": 0.7587604522705078, + "learning_rate": 3.901120258130297e-05, + "loss": 2.8944, + "step": 51167 + }, + { + "epoch": 2.51, + "grad_norm": 0.729351818561554, + "learning_rate": 3.900361079244228e-05, + "loss": 3.1015, + "step": 51168 + }, + { + "epoch": 2.51, + "grad_norm": 0.7310697436332703, + "learning_rate": 3.899601969099591e-05, + "loss": 2.7458, + "step": 51169 + }, + { + "epoch": 2.51, + "grad_norm": 0.7379913330078125, + "learning_rate": 3.898842927698371e-05, + "loss": 2.7591, + "step": 51170 + }, + { + "epoch": 2.51, + "grad_norm": 0.8379819393157959, + "learning_rate": 3.8980839550425846e-05, + "loss": 2.8154, + "step": 51171 + }, + { + "epoch": 2.51, + "grad_norm": 0.7568002343177795, + "learning_rate": 3.8973250511342127e-05, + "loss": 2.9693, + "step": 51172 + }, + { + "epoch": 2.51, + "grad_norm": 0.7051762938499451, + "learning_rate": 3.89656621597527e-05, + "loss": 2.9149, + "step": 51173 + }, + { + "epoch": 2.51, + "grad_norm": 0.7339165210723877, + "learning_rate": 3.895807449567752e-05, + "loss": 2.9697, + "step": 51174 + }, + { + "epoch": 2.51, + "grad_norm": 0.7606975436210632, + "learning_rate": 3.895048751913643e-05, + "loss": 2.8402, + "step": 51175 + }, + { + "epoch": 2.51, + "grad_norm": 0.7482293248176575, + "learning_rate": 3.894290123014959e-05, + "loss": 3.0584, + "step": 51176 + }, + { + "epoch": 2.51, + "grad_norm": 0.727558970451355, + "learning_rate": 3.8935315628736816e-05, + "loss": 2.9286, + "step": 51177 + }, + { + "epoch": 2.51, + "grad_norm": 0.7616907954216003, + "learning_rate": 3.892773071491823e-05, + "loss": 2.9106, + "step": 51178 + }, + { + "epoch": 2.51, + "grad_norm": 0.7025411128997803, + "learning_rate": 3.89201464887138e-05, + "loss": 2.7646, + "step": 51179 + }, + { + "epoch": 2.51, + "grad_norm": 0.7629191279411316, + "learning_rate": 3.891256295014342e-05, + "loss": 3.0237, + "step": 51180 + }, + { + "epoch": 2.51, + "grad_norm": 0.776381254196167, + "learning_rate": 3.8904980099227145e-05, + "loss": 3.0137, + "step": 51181 + }, + { + "epoch": 2.51, + "grad_norm": 0.7896276116371155, + "learning_rate": 3.8897397935984854e-05, + "loss": 2.7922, + "step": 51182 + }, + { + "epoch": 2.51, + "grad_norm": 0.6885989904403687, + "learning_rate": 3.888981646043653e-05, + "loss": 2.9651, + "step": 51183 + }, + { + "epoch": 2.51, + "grad_norm": 0.7584837675094604, + "learning_rate": 3.888223567260226e-05, + "loss": 3.0849, + "step": 51184 + }, + { + "epoch": 2.51, + "grad_norm": 0.7259736061096191, + "learning_rate": 3.887465557250186e-05, + "loss": 2.7324, + "step": 51185 + }, + { + "epoch": 2.51, + "grad_norm": 0.7147319912910461, + "learning_rate": 3.886707616015541e-05, + "loss": 2.9832, + "step": 51186 + }, + { + "epoch": 2.51, + "grad_norm": 0.7638548016548157, + "learning_rate": 3.8859497435582764e-05, + "loss": 2.7991, + "step": 51187 + }, + { + "epoch": 2.51, + "grad_norm": 0.7534319162368774, + "learning_rate": 3.885191939880404e-05, + "loss": 2.9444, + "step": 51188 + }, + { + "epoch": 2.51, + "grad_norm": 0.7515311241149902, + "learning_rate": 3.8844342049839065e-05, + "loss": 2.7472, + "step": 51189 + }, + { + "epoch": 2.51, + "grad_norm": 0.7729167342185974, + "learning_rate": 3.88367653887078e-05, + "loss": 2.9857, + "step": 51190 + }, + { + "epoch": 2.51, + "grad_norm": 0.735460102558136, + "learning_rate": 3.8829189415430285e-05, + "loss": 2.9408, + "step": 51191 + }, + { + "epoch": 2.51, + "grad_norm": 0.7101142406463623, + "learning_rate": 3.882161413002636e-05, + "loss": 3.033, + "step": 51192 + }, + { + "epoch": 2.51, + "grad_norm": 0.7463287115097046, + "learning_rate": 3.8814039532516005e-05, + "loss": 2.7537, + "step": 51193 + }, + { + "epoch": 2.51, + "grad_norm": 0.8116194009780884, + "learning_rate": 3.880646562291931e-05, + "loss": 3.1006, + "step": 51194 + }, + { + "epoch": 2.51, + "grad_norm": 0.7226468324661255, + "learning_rate": 3.87988924012561e-05, + "loss": 2.9244, + "step": 51195 + }, + { + "epoch": 2.51, + "grad_norm": 0.7452834248542786, + "learning_rate": 3.8791319867546345e-05, + "loss": 2.8646, + "step": 51196 + }, + { + "epoch": 2.51, + "grad_norm": 0.78560471534729, + "learning_rate": 3.878374802180994e-05, + "loss": 2.895, + "step": 51197 + }, + { + "epoch": 2.51, + "grad_norm": 0.7502028942108154, + "learning_rate": 3.8776176864066886e-05, + "loss": 2.9412, + "step": 51198 + }, + { + "epoch": 2.51, + "grad_norm": 0.753572940826416, + "learning_rate": 3.8768606394337086e-05, + "loss": 2.994, + "step": 51199 + }, + { + "epoch": 2.51, + "grad_norm": 0.7555680871009827, + "learning_rate": 3.876103661264048e-05, + "loss": 2.7353, + "step": 51200 + }, + { + "epoch": 2.51, + "grad_norm": 0.7278770804405212, + "learning_rate": 3.8753467518997094e-05, + "loss": 2.8839, + "step": 51201 + }, + { + "epoch": 2.51, + "grad_norm": 0.7377351522445679, + "learning_rate": 3.87458991134268e-05, + "loss": 2.8566, + "step": 51202 + }, + { + "epoch": 2.51, + "grad_norm": 0.7519879341125488, + "learning_rate": 3.873833139594953e-05, + "loss": 2.9622, + "step": 51203 + }, + { + "epoch": 2.51, + "grad_norm": 0.7733878493309021, + "learning_rate": 3.873076436658512e-05, + "loss": 2.902, + "step": 51204 + }, + { + "epoch": 2.51, + "grad_norm": 0.7269855737686157, + "learning_rate": 3.872319802535363e-05, + "loss": 3.0837, + "step": 51205 + }, + { + "epoch": 2.51, + "grad_norm": 0.7522500157356262, + "learning_rate": 3.8715632372275005e-05, + "loss": 2.8483, + "step": 51206 + }, + { + "epoch": 2.51, + "grad_norm": 0.7667732834815979, + "learning_rate": 3.870806740736907e-05, + "loss": 2.8965, + "step": 51207 + }, + { + "epoch": 2.51, + "grad_norm": 0.7667376399040222, + "learning_rate": 3.8700503130655836e-05, + "loss": 2.6717, + "step": 51208 + }, + { + "epoch": 2.51, + "grad_norm": 0.7345377206802368, + "learning_rate": 3.869293954215516e-05, + "loss": 2.9421, + "step": 51209 + }, + { + "epoch": 2.51, + "grad_norm": 0.8869428038597107, + "learning_rate": 3.868537664188701e-05, + "loss": 2.8926, + "step": 51210 + }, + { + "epoch": 2.51, + "grad_norm": 0.725968062877655, + "learning_rate": 3.8677814429871324e-05, + "loss": 2.996, + "step": 51211 + }, + { + "epoch": 2.51, + "grad_norm": 0.805472195148468, + "learning_rate": 3.8670252906127905e-05, + "loss": 2.8731, + "step": 51212 + }, + { + "epoch": 2.51, + "grad_norm": 0.7062275409698486, + "learning_rate": 3.866269207067681e-05, + "loss": 2.8816, + "step": 51213 + }, + { + "epoch": 2.51, + "grad_norm": 0.7009338736534119, + "learning_rate": 3.865513192353783e-05, + "loss": 2.8552, + "step": 51214 + }, + { + "epoch": 2.51, + "grad_norm": 0.7251269817352295, + "learning_rate": 3.864757246473094e-05, + "loss": 2.9754, + "step": 51215 + }, + { + "epoch": 2.51, + "grad_norm": 0.7357146739959717, + "learning_rate": 3.864001369427608e-05, + "loss": 3.0632, + "step": 51216 + }, + { + "epoch": 2.51, + "grad_norm": 0.7163044810295105, + "learning_rate": 3.863245561219316e-05, + "loss": 2.9741, + "step": 51217 + }, + { + "epoch": 2.51, + "grad_norm": 0.7487192153930664, + "learning_rate": 3.862489821850204e-05, + "loss": 2.9866, + "step": 51218 + }, + { + "epoch": 2.51, + "grad_norm": 0.727894127368927, + "learning_rate": 3.8617341513222554e-05, + "loss": 2.9668, + "step": 51219 + }, + { + "epoch": 2.51, + "grad_norm": 0.7058404088020325, + "learning_rate": 3.8609785496374714e-05, + "loss": 2.8872, + "step": 51220 + }, + { + "epoch": 2.51, + "grad_norm": 0.7830812931060791, + "learning_rate": 3.8602230167978434e-05, + "loss": 2.5873, + "step": 51221 + }, + { + "epoch": 2.51, + "grad_norm": 0.7016585469245911, + "learning_rate": 3.859467552805353e-05, + "loss": 2.812, + "step": 51222 + }, + { + "epoch": 2.51, + "grad_norm": 0.7951351404190063, + "learning_rate": 3.8587121576619996e-05, + "loss": 2.7951, + "step": 51223 + }, + { + "epoch": 2.51, + "grad_norm": 0.7332335114479065, + "learning_rate": 3.8579568313697674e-05, + "loss": 3.1449, + "step": 51224 + }, + { + "epoch": 2.51, + "grad_norm": 0.7628679275512695, + "learning_rate": 3.8572015739306386e-05, + "loss": 2.8698, + "step": 51225 + }, + { + "epoch": 2.51, + "grad_norm": 0.7135177850723267, + "learning_rate": 3.8564463853466175e-05, + "loss": 2.8874, + "step": 51226 + }, + { + "epoch": 2.51, + "grad_norm": 0.7211930751800537, + "learning_rate": 3.855691265619677e-05, + "loss": 2.8293, + "step": 51227 + }, + { + "epoch": 2.51, + "grad_norm": 0.7237968444824219, + "learning_rate": 3.854936214751822e-05, + "loss": 2.8799, + "step": 51228 + }, + { + "epoch": 2.51, + "grad_norm": 0.7675337791442871, + "learning_rate": 3.854181232745026e-05, + "loss": 2.9046, + "step": 51229 + }, + { + "epoch": 2.51, + "grad_norm": 0.7495302557945251, + "learning_rate": 3.853426319601286e-05, + "loss": 3.2686, + "step": 51230 + }, + { + "epoch": 2.51, + "grad_norm": 0.7274502515792847, + "learning_rate": 3.852671475322593e-05, + "loss": 2.9237, + "step": 51231 + }, + { + "epoch": 2.51, + "grad_norm": 0.7208870053291321, + "learning_rate": 3.851916699910932e-05, + "loss": 3.0848, + "step": 51232 + }, + { + "epoch": 2.51, + "grad_norm": 0.7488343715667725, + "learning_rate": 3.851161993368291e-05, + "loss": 2.658, + "step": 51233 + }, + { + "epoch": 2.51, + "grad_norm": 0.749234139919281, + "learning_rate": 3.850407355696653e-05, + "loss": 2.8975, + "step": 51234 + }, + { + "epoch": 2.51, + "grad_norm": 0.7662747502326965, + "learning_rate": 3.849652786898005e-05, + "loss": 2.8386, + "step": 51235 + }, + { + "epoch": 2.51, + "grad_norm": 0.779507040977478, + "learning_rate": 3.848898286974347e-05, + "loss": 2.7845, + "step": 51236 + }, + { + "epoch": 2.51, + "grad_norm": 0.7223421931266785, + "learning_rate": 3.848143855927652e-05, + "loss": 2.8635, + "step": 51237 + }, + { + "epoch": 2.51, + "grad_norm": 0.7322688102722168, + "learning_rate": 3.84738949375992e-05, + "loss": 3.0542, + "step": 51238 + }, + { + "epoch": 2.51, + "grad_norm": 0.7349992990493774, + "learning_rate": 3.846635200473129e-05, + "loss": 2.9684, + "step": 51239 + }, + { + "epoch": 2.51, + "grad_norm": 0.7478016018867493, + "learning_rate": 3.845880976069261e-05, + "loss": 3.0442, + "step": 51240 + }, + { + "epoch": 2.51, + "grad_norm": 0.7727739214897156, + "learning_rate": 3.845126820550317e-05, + "loss": 2.7984, + "step": 51241 + }, + { + "epoch": 2.51, + "grad_norm": 0.7362459301948547, + "learning_rate": 3.8443727339182695e-05, + "loss": 2.8293, + "step": 51242 + }, + { + "epoch": 2.51, + "grad_norm": 0.7158995866775513, + "learning_rate": 3.843618716175116e-05, + "loss": 2.8788, + "step": 51243 + }, + { + "epoch": 2.51, + "grad_norm": 0.7135204672813416, + "learning_rate": 3.8428647673228294e-05, + "loss": 2.933, + "step": 51244 + }, + { + "epoch": 2.51, + "grad_norm": 0.7790452241897583, + "learning_rate": 3.842110887363404e-05, + "loss": 2.9307, + "step": 51245 + }, + { + "epoch": 2.51, + "grad_norm": 0.7330326437950134, + "learning_rate": 3.8413570762988325e-05, + "loss": 2.6545, + "step": 51246 + }, + { + "epoch": 2.51, + "grad_norm": 0.7503302693367004, + "learning_rate": 3.8406033341310884e-05, + "loss": 2.8702, + "step": 51247 + }, + { + "epoch": 2.51, + "grad_norm": 0.8307878375053406, + "learning_rate": 3.8398496608621644e-05, + "loss": 3.0346, + "step": 51248 + }, + { + "epoch": 2.51, + "grad_norm": 0.7150816917419434, + "learning_rate": 3.839096056494032e-05, + "loss": 2.9994, + "step": 51249 + }, + { + "epoch": 2.51, + "grad_norm": 0.7854032516479492, + "learning_rate": 3.8383425210286864e-05, + "loss": 2.895, + "step": 51250 + }, + { + "epoch": 2.51, + "grad_norm": 0.7631329298019409, + "learning_rate": 3.8375890544681195e-05, + "loss": 2.9736, + "step": 51251 + }, + { + "epoch": 2.51, + "grad_norm": 0.7520675659179688, + "learning_rate": 3.8368356568143036e-05, + "loss": 3.052, + "step": 51252 + }, + { + "epoch": 2.51, + "grad_norm": 0.7277513742446899, + "learning_rate": 3.836082328069229e-05, + "loss": 2.9834, + "step": 51253 + }, + { + "epoch": 2.51, + "grad_norm": 0.7684758901596069, + "learning_rate": 3.8353290682348814e-05, + "loss": 2.7606, + "step": 51254 + }, + { + "epoch": 2.51, + "grad_norm": 0.7762331366539001, + "learning_rate": 3.834575877313233e-05, + "loss": 3.0253, + "step": 51255 + }, + { + "epoch": 2.51, + "grad_norm": 0.7357913851737976, + "learning_rate": 3.833822755306285e-05, + "loss": 3.0781, + "step": 51256 + }, + { + "epoch": 2.51, + "grad_norm": 0.7422073483467102, + "learning_rate": 3.833069702216003e-05, + "loss": 2.9321, + "step": 51257 + }, + { + "epoch": 2.51, + "grad_norm": 0.7310648560523987, + "learning_rate": 3.832316718044388e-05, + "loss": 2.8499, + "step": 51258 + }, + { + "epoch": 2.51, + "grad_norm": 0.6889393925666809, + "learning_rate": 3.831563802793409e-05, + "loss": 3.0701, + "step": 51259 + }, + { + "epoch": 2.51, + "grad_norm": 1.1530731916427612, + "learning_rate": 3.830810956465058e-05, + "loss": 3.111, + "step": 51260 + }, + { + "epoch": 2.51, + "grad_norm": 0.7462144494056702, + "learning_rate": 3.830058179061316e-05, + "loss": 2.8284, + "step": 51261 + }, + { + "epoch": 2.51, + "grad_norm": 0.7419637441635132, + "learning_rate": 3.829305470584158e-05, + "loss": 3.0065, + "step": 51262 + }, + { + "epoch": 2.51, + "grad_norm": 0.7542158365249634, + "learning_rate": 3.828552831035581e-05, + "loss": 2.8055, + "step": 51263 + }, + { + "epoch": 2.51, + "grad_norm": 0.7965158224105835, + "learning_rate": 3.82780026041755e-05, + "loss": 2.7436, + "step": 51264 + }, + { + "epoch": 2.51, + "grad_norm": 0.743840217590332, + "learning_rate": 3.8270477587320566e-05, + "loss": 2.7356, + "step": 51265 + }, + { + "epoch": 2.51, + "grad_norm": 0.7825320959091187, + "learning_rate": 3.826295325981091e-05, + "loss": 2.7954, + "step": 51266 + }, + { + "epoch": 2.51, + "grad_norm": 0.7897096276283264, + "learning_rate": 3.825542962166619e-05, + "loss": 2.9194, + "step": 51267 + }, + { + "epoch": 2.51, + "grad_norm": 0.7811319828033447, + "learning_rate": 3.8247906672906345e-05, + "loss": 2.7623, + "step": 51268 + }, + { + "epoch": 2.51, + "grad_norm": 0.7193589806556702, + "learning_rate": 3.824038441355114e-05, + "loss": 2.8256, + "step": 51269 + }, + { + "epoch": 2.51, + "grad_norm": 0.7284131050109863, + "learning_rate": 3.823286284362031e-05, + "loss": 2.8813, + "step": 51270 + }, + { + "epoch": 2.51, + "grad_norm": 0.7225155830383301, + "learning_rate": 3.822534196313383e-05, + "loss": 2.9283, + "step": 51271 + }, + { + "epoch": 2.51, + "grad_norm": 0.7368038296699524, + "learning_rate": 3.8217821772111354e-05, + "loss": 2.7434, + "step": 51272 + }, + { + "epoch": 2.51, + "grad_norm": 0.7491246461868286, + "learning_rate": 3.821030227057279e-05, + "loss": 2.9343, + "step": 51273 + }, + { + "epoch": 2.51, + "grad_norm": 0.7268712520599365, + "learning_rate": 3.820278345853788e-05, + "loss": 2.9003, + "step": 51274 + }, + { + "epoch": 2.51, + "grad_norm": 0.7777917981147766, + "learning_rate": 3.81952653360265e-05, + "loss": 3.0695, + "step": 51275 + }, + { + "epoch": 2.51, + "grad_norm": 0.6771141886711121, + "learning_rate": 3.818774790305842e-05, + "loss": 2.813, + "step": 51276 + }, + { + "epoch": 2.51, + "grad_norm": 0.7285410165786743, + "learning_rate": 3.818023115965334e-05, + "loss": 2.9205, + "step": 51277 + }, + { + "epoch": 2.51, + "grad_norm": 0.7287654876708984, + "learning_rate": 3.817271510583122e-05, + "loss": 2.9855, + "step": 51278 + }, + { + "epoch": 2.51, + "grad_norm": 0.7238658666610718, + "learning_rate": 3.816519974161171e-05, + "loss": 2.6708, + "step": 51279 + }, + { + "epoch": 2.51, + "grad_norm": 0.756721556186676, + "learning_rate": 3.815768506701472e-05, + "loss": 2.866, + "step": 51280 + }, + { + "epoch": 2.51, + "grad_norm": 0.7326313853263855, + "learning_rate": 3.8150171082059976e-05, + "loss": 2.8672, + "step": 51281 + }, + { + "epoch": 2.51, + "grad_norm": 0.7391651272773743, + "learning_rate": 3.8142657786767324e-05, + "loss": 2.893, + "step": 51282 + }, + { + "epoch": 2.51, + "grad_norm": 0.7335715889930725, + "learning_rate": 3.813514518115649e-05, + "loss": 2.8543, + "step": 51283 + }, + { + "epoch": 2.51, + "grad_norm": 0.7516505718231201, + "learning_rate": 3.812763326524725e-05, + "loss": 3.074, + "step": 51284 + }, + { + "epoch": 2.51, + "grad_norm": 0.6861124038696289, + "learning_rate": 3.812012203905949e-05, + "loss": 2.9163, + "step": 51285 + }, + { + "epoch": 2.51, + "grad_norm": 0.784110426902771, + "learning_rate": 3.8112611502612886e-05, + "loss": 2.7284, + "step": 51286 + }, + { + "epoch": 2.51, + "grad_norm": 0.7559488415718079, + "learning_rate": 3.810510165592724e-05, + "loss": 3.0254, + "step": 51287 + }, + { + "epoch": 2.51, + "grad_norm": 0.7138981223106384, + "learning_rate": 3.8097592499022424e-05, + "loss": 2.9743, + "step": 51288 + }, + { + "epoch": 2.51, + "grad_norm": 0.7892126441001892, + "learning_rate": 3.809008403191806e-05, + "loss": 2.8434, + "step": 51289 + }, + { + "epoch": 2.51, + "grad_norm": 0.7221898436546326, + "learning_rate": 3.808257625463409e-05, + "loss": 3.0125, + "step": 51290 + }, + { + "epoch": 2.51, + "grad_norm": 0.745223879814148, + "learning_rate": 3.807506916719021e-05, + "loss": 2.8629, + "step": 51291 + }, + { + "epoch": 2.51, + "grad_norm": 0.7798271775245667, + "learning_rate": 3.8067562769606096e-05, + "loss": 2.8282, + "step": 51292 + }, + { + "epoch": 2.51, + "grad_norm": 0.7572090029716492, + "learning_rate": 3.8060057061901704e-05, + "loss": 2.9673, + "step": 51293 + }, + { + "epoch": 2.51, + "grad_norm": 0.7191266417503357, + "learning_rate": 3.8052552044096655e-05, + "loss": 2.8256, + "step": 51294 + }, + { + "epoch": 2.51, + "grad_norm": 0.7357266545295715, + "learning_rate": 3.804504771621082e-05, + "loss": 2.7246, + "step": 51295 + }, + { + "epoch": 2.51, + "grad_norm": 0.742739737033844, + "learning_rate": 3.803754407826386e-05, + "loss": 3.0129, + "step": 51296 + }, + { + "epoch": 2.51, + "grad_norm": 0.7276623249053955, + "learning_rate": 3.8030041130275625e-05, + "loss": 2.9153, + "step": 51297 + }, + { + "epoch": 2.51, + "grad_norm": 0.7444546222686768, + "learning_rate": 3.80225388722659e-05, + "loss": 2.9379, + "step": 51298 + }, + { + "epoch": 2.51, + "grad_norm": 0.727475643157959, + "learning_rate": 3.801503730425429e-05, + "loss": 2.8814, + "step": 51299 + }, + { + "epoch": 2.51, + "grad_norm": 0.714273989200592, + "learning_rate": 3.80075364262607e-05, + "loss": 2.8059, + "step": 51300 + }, + { + "epoch": 2.51, + "grad_norm": 0.7392892837524414, + "learning_rate": 3.80000362383048e-05, + "loss": 2.6062, + "step": 51301 + }, + { + "epoch": 2.51, + "grad_norm": 0.7788336277008057, + "learning_rate": 3.799253674040638e-05, + "loss": 2.6913, + "step": 51302 + }, + { + "epoch": 2.51, + "grad_norm": 0.76091468334198, + "learning_rate": 3.798503793258525e-05, + "loss": 2.9406, + "step": 51303 + }, + { + "epoch": 2.51, + "grad_norm": 0.8225963115692139, + "learning_rate": 3.7977539814861105e-05, + "loss": 2.7359, + "step": 51304 + }, + { + "epoch": 2.51, + "grad_norm": 0.7406080365180969, + "learning_rate": 3.797004238725372e-05, + "loss": 3.0082, + "step": 51305 + }, + { + "epoch": 2.51, + "grad_norm": 0.7237011194229126, + "learning_rate": 3.7962545649782715e-05, + "loss": 2.7815, + "step": 51306 + }, + { + "epoch": 2.51, + "grad_norm": 0.7329822778701782, + "learning_rate": 3.7955049602467944e-05, + "loss": 2.8379, + "step": 51307 + }, + { + "epoch": 2.51, + "grad_norm": 0.7319344282150269, + "learning_rate": 3.794755424532919e-05, + "loss": 2.7743, + "step": 51308 + }, + { + "epoch": 2.51, + "grad_norm": 0.7189127206802368, + "learning_rate": 3.794005957838611e-05, + "loss": 2.8306, + "step": 51309 + }, + { + "epoch": 2.51, + "grad_norm": 0.7663130760192871, + "learning_rate": 3.7932565601658535e-05, + "loss": 2.7893, + "step": 51310 + }, + { + "epoch": 2.51, + "grad_norm": 0.7873907089233398, + "learning_rate": 3.792507231516607e-05, + "loss": 2.7739, + "step": 51311 + }, + { + "epoch": 2.51, + "grad_norm": 0.7518800497055054, + "learning_rate": 3.791757971892859e-05, + "loss": 2.6856, + "step": 51312 + }, + { + "epoch": 2.51, + "grad_norm": 0.7844192981719971, + "learning_rate": 3.791008781296578e-05, + "loss": 2.8565, + "step": 51313 + }, + { + "epoch": 2.51, + "grad_norm": 0.8025628924369812, + "learning_rate": 3.79025965972973e-05, + "loss": 2.7685, + "step": 51314 + }, + { + "epoch": 2.51, + "grad_norm": 0.7403668165206909, + "learning_rate": 3.789510607194298e-05, + "loss": 2.9046, + "step": 51315 + }, + { + "epoch": 2.51, + "grad_norm": 0.7729617953300476, + "learning_rate": 3.788761623692246e-05, + "loss": 3.0336, + "step": 51316 + }, + { + "epoch": 2.51, + "grad_norm": 0.7558656930923462, + "learning_rate": 3.7880127092255505e-05, + "loss": 2.9393, + "step": 51317 + }, + { + "epoch": 2.51, + "grad_norm": 0.7228742837905884, + "learning_rate": 3.787263863796195e-05, + "loss": 3.109, + "step": 51318 + }, + { + "epoch": 2.52, + "grad_norm": 0.7505278587341309, + "learning_rate": 3.786515087406137e-05, + "loss": 3.0069, + "step": 51319 + }, + { + "epoch": 2.52, + "grad_norm": 0.6907845735549927, + "learning_rate": 3.785766380057354e-05, + "loss": 2.9476, + "step": 51320 + }, + { + "epoch": 2.52, + "grad_norm": 0.7361884713172913, + "learning_rate": 3.7850177417518135e-05, + "loss": 2.8551, + "step": 51321 + }, + { + "epoch": 2.52, + "grad_norm": 0.7802038192749023, + "learning_rate": 3.7842691724914884e-05, + "loss": 3.0691, + "step": 51322 + }, + { + "epoch": 2.52, + "grad_norm": 0.7692674398422241, + "learning_rate": 3.783520672278362e-05, + "loss": 2.9041, + "step": 51323 + }, + { + "epoch": 2.52, + "grad_norm": 0.7283543348312378, + "learning_rate": 3.782772241114388e-05, + "loss": 2.753, + "step": 51324 + }, + { + "epoch": 2.52, + "grad_norm": 0.7569485306739807, + "learning_rate": 3.7820238790015526e-05, + "loss": 3.0877, + "step": 51325 + }, + { + "epoch": 2.52, + "grad_norm": 0.8705662488937378, + "learning_rate": 3.781275585941823e-05, + "loss": 2.8212, + "step": 51326 + }, + { + "epoch": 2.52, + "grad_norm": 0.7248303890228271, + "learning_rate": 3.780527361937159e-05, + "loss": 2.9386, + "step": 51327 + }, + { + "epoch": 2.52, + "grad_norm": 0.7549064755439758, + "learning_rate": 3.779779206989548e-05, + "loss": 2.8461, + "step": 51328 + }, + { + "epoch": 2.52, + "grad_norm": 0.7630690932273865, + "learning_rate": 3.7790311211009426e-05, + "loss": 2.9711, + "step": 51329 + }, + { + "epoch": 2.52, + "grad_norm": 0.7439910769462585, + "learning_rate": 3.778283104273334e-05, + "loss": 2.9077, + "step": 51330 + }, + { + "epoch": 2.52, + "grad_norm": 0.7504767179489136, + "learning_rate": 3.777535156508671e-05, + "loss": 2.7569, + "step": 51331 + }, + { + "epoch": 2.52, + "grad_norm": 0.7491384148597717, + "learning_rate": 3.776787277808935e-05, + "loss": 2.7003, + "step": 51332 + }, + { + "epoch": 2.52, + "grad_norm": 0.770693302154541, + "learning_rate": 3.7760394681761006e-05, + "loss": 2.9364, + "step": 51333 + }, + { + "epoch": 2.52, + "grad_norm": 0.7890000343322754, + "learning_rate": 3.775291727612134e-05, + "loss": 3.0071, + "step": 51334 + }, + { + "epoch": 2.52, + "grad_norm": 0.711444616317749, + "learning_rate": 3.774544056119e-05, + "loss": 2.8482, + "step": 51335 + }, + { + "epoch": 2.52, + "grad_norm": 0.7917689085006714, + "learning_rate": 3.773796453698663e-05, + "loss": 2.8997, + "step": 51336 + }, + { + "epoch": 2.52, + "grad_norm": 0.6970062255859375, + "learning_rate": 3.773048920353099e-05, + "loss": 2.9283, + "step": 51337 + }, + { + "epoch": 2.52, + "grad_norm": 0.7180047631263733, + "learning_rate": 3.7723014560842825e-05, + "loss": 2.8176, + "step": 51338 + }, + { + "epoch": 2.52, + "grad_norm": 0.743811309337616, + "learning_rate": 3.771554060894172e-05, + "loss": 3.1697, + "step": 51339 + }, + { + "epoch": 2.52, + "grad_norm": 0.7111158967018127, + "learning_rate": 3.7708067347847424e-05, + "loss": 2.7527, + "step": 51340 + }, + { + "epoch": 2.52, + "grad_norm": 0.7135360240936279, + "learning_rate": 3.770059477757966e-05, + "loss": 2.7748, + "step": 51341 + }, + { + "epoch": 2.52, + "grad_norm": 0.767493724822998, + "learning_rate": 3.7693122898157945e-05, + "loss": 3.1682, + "step": 51342 + }, + { + "epoch": 2.52, + "grad_norm": 0.7691839337348938, + "learning_rate": 3.7685651709602126e-05, + "loss": 3.0069, + "step": 51343 + }, + { + "epoch": 2.52, + "grad_norm": 0.7307931780815125, + "learning_rate": 3.767818121193176e-05, + "loss": 2.963, + "step": 51344 + }, + { + "epoch": 2.52, + "grad_norm": 0.7612271308898926, + "learning_rate": 3.767071140516665e-05, + "loss": 2.7677, + "step": 51345 + }, + { + "epoch": 2.52, + "grad_norm": 0.7013478875160217, + "learning_rate": 3.766324228932633e-05, + "loss": 2.7819, + "step": 51346 + }, + { + "epoch": 2.52, + "grad_norm": 0.7199133038520813, + "learning_rate": 3.765577386443055e-05, + "loss": 2.7997, + "step": 51347 + }, + { + "epoch": 2.52, + "grad_norm": 0.7004144787788391, + "learning_rate": 3.764830613049902e-05, + "loss": 2.9246, + "step": 51348 + }, + { + "epoch": 2.52, + "grad_norm": 0.7983659505844116, + "learning_rate": 3.7640839087551386e-05, + "loss": 3.0463, + "step": 51349 + }, + { + "epoch": 2.52, + "grad_norm": 0.7310093641281128, + "learning_rate": 3.763337273560728e-05, + "loss": 2.8928, + "step": 51350 + }, + { + "epoch": 2.52, + "grad_norm": 0.7364878058433533, + "learning_rate": 3.762590707468631e-05, + "loss": 2.8745, + "step": 51351 + }, + { + "epoch": 2.52, + "grad_norm": 0.7587478756904602, + "learning_rate": 3.7618442104808266e-05, + "loss": 2.9023, + "step": 51352 + }, + { + "epoch": 2.52, + "grad_norm": 0.7633194327354431, + "learning_rate": 3.761097782599266e-05, + "loss": 2.8162, + "step": 51353 + }, + { + "epoch": 2.52, + "grad_norm": 0.7305599451065063, + "learning_rate": 3.760351423825928e-05, + "loss": 2.9469, + "step": 51354 + }, + { + "epoch": 2.52, + "grad_norm": 0.7444048523902893, + "learning_rate": 3.759605134162781e-05, + "loss": 3.1084, + "step": 51355 + }, + { + "epoch": 2.52, + "grad_norm": 0.7268261909484863, + "learning_rate": 3.758858913611783e-05, + "loss": 2.8725, + "step": 51356 + }, + { + "epoch": 2.52, + "grad_norm": 0.7650936841964722, + "learning_rate": 3.7581127621748996e-05, + "loss": 2.9345, + "step": 51357 + }, + { + "epoch": 2.52, + "grad_norm": 0.7452462315559387, + "learning_rate": 3.757366679854089e-05, + "loss": 2.8995, + "step": 51358 + }, + { + "epoch": 2.52, + "grad_norm": 0.7164917588233948, + "learning_rate": 3.756620666651327e-05, + "loss": 2.8016, + "step": 51359 + }, + { + "epoch": 2.52, + "grad_norm": 0.7461237907409668, + "learning_rate": 3.75587472256858e-05, + "loss": 2.7738, + "step": 51360 + }, + { + "epoch": 2.52, + "grad_norm": 0.7313634753227234, + "learning_rate": 3.755128847607802e-05, + "loss": 2.7154, + "step": 51361 + }, + { + "epoch": 2.52, + "grad_norm": 0.7086673378944397, + "learning_rate": 3.754383041770971e-05, + "loss": 3.009, + "step": 51362 + }, + { + "epoch": 2.52, + "grad_norm": 0.7295812368392944, + "learning_rate": 3.753637305060044e-05, + "loss": 2.7669, + "step": 51363 + }, + { + "epoch": 2.52, + "grad_norm": 0.7354058027267456, + "learning_rate": 3.752891637476978e-05, + "loss": 3.046, + "step": 51364 + }, + { + "epoch": 2.52, + "grad_norm": 0.7307260632514954, + "learning_rate": 3.752146039023753e-05, + "loss": 2.8995, + "step": 51365 + }, + { + "epoch": 2.52, + "grad_norm": 0.7760553359985352, + "learning_rate": 3.7514005097023125e-05, + "loss": 3.0448, + "step": 51366 + }, + { + "epoch": 2.52, + "grad_norm": 0.8252313733100891, + "learning_rate": 3.750655049514644e-05, + "loss": 2.8753, + "step": 51367 + }, + { + "epoch": 2.52, + "grad_norm": 0.7388860583305359, + "learning_rate": 3.7499096584626854e-05, + "loss": 2.8755, + "step": 51368 + }, + { + "epoch": 2.52, + "grad_norm": 0.7460933327674866, + "learning_rate": 3.7491643365484156e-05, + "loss": 3.0417, + "step": 51369 + }, + { + "epoch": 2.52, + "grad_norm": 0.7309091091156006, + "learning_rate": 3.748419083773804e-05, + "loss": 3.0214, + "step": 51370 + }, + { + "epoch": 2.52, + "grad_norm": 0.7197194695472717, + "learning_rate": 3.7476739001408006e-05, + "loss": 2.9985, + "step": 51371 + }, + { + "epoch": 2.52, + "grad_norm": 0.7402889132499695, + "learning_rate": 3.7469287856513755e-05, + "loss": 3.0035, + "step": 51372 + }, + { + "epoch": 2.52, + "grad_norm": 0.7771908640861511, + "learning_rate": 3.746183740307476e-05, + "loss": 3.1793, + "step": 51373 + }, + { + "epoch": 2.52, + "grad_norm": 0.7580812573432922, + "learning_rate": 3.7454387641110804e-05, + "loss": 2.6364, + "step": 51374 + }, + { + "epoch": 2.52, + "grad_norm": 0.7813465595245361, + "learning_rate": 3.7446938570641515e-05, + "loss": 2.84, + "step": 51375 + }, + { + "epoch": 2.52, + "grad_norm": 0.7351908087730408, + "learning_rate": 3.7439490191686374e-05, + "loss": 2.9712, + "step": 51376 + }, + { + "epoch": 2.52, + "grad_norm": 0.7609416842460632, + "learning_rate": 3.743204250426516e-05, + "loss": 2.6509, + "step": 51377 + }, + { + "epoch": 2.52, + "grad_norm": 0.7777287364006042, + "learning_rate": 3.742459550839743e-05, + "loss": 3.0221, + "step": 51378 + }, + { + "epoch": 2.52, + "grad_norm": 0.7442467212677002, + "learning_rate": 3.741714920410269e-05, + "loss": 2.8488, + "step": 51379 + }, + { + "epoch": 2.52, + "grad_norm": 0.7483687400817871, + "learning_rate": 3.740970359140074e-05, + "loss": 2.9856, + "step": 51380 + }, + { + "epoch": 2.52, + "grad_norm": 0.7566103935241699, + "learning_rate": 3.740225867031103e-05, + "loss": 2.9011, + "step": 51381 + }, + { + "epoch": 2.52, + "grad_norm": 0.7553043365478516, + "learning_rate": 3.739481444085326e-05, + "loss": 2.8707, + "step": 51382 + }, + { + "epoch": 2.52, + "grad_norm": 0.7112729549407959, + "learning_rate": 3.7387370903046963e-05, + "loss": 2.8141, + "step": 51383 + }, + { + "epoch": 2.52, + "grad_norm": 0.7292284965515137, + "learning_rate": 3.737992805691189e-05, + "loss": 3.0078, + "step": 51384 + }, + { + "epoch": 2.52, + "grad_norm": 0.7168455123901367, + "learning_rate": 3.737248590246749e-05, + "loss": 2.8198, + "step": 51385 + }, + { + "epoch": 2.52, + "grad_norm": 0.7240537405014038, + "learning_rate": 3.736504443973337e-05, + "loss": 2.6507, + "step": 51386 + }, + { + "epoch": 2.52, + "grad_norm": 0.7349284291267395, + "learning_rate": 3.7357603668729264e-05, + "loss": 2.8739, + "step": 51387 + }, + { + "epoch": 2.52, + "grad_norm": 0.7479743957519531, + "learning_rate": 3.7350163589474615e-05, + "loss": 2.7535, + "step": 51388 + }, + { + "epoch": 2.52, + "grad_norm": 0.7725386619567871, + "learning_rate": 3.734272420198907e-05, + "loss": 2.9225, + "step": 51389 + }, + { + "epoch": 2.52, + "grad_norm": 0.7183440923690796, + "learning_rate": 3.733528550629232e-05, + "loss": 2.6772, + "step": 51390 + }, + { + "epoch": 2.52, + "grad_norm": 0.7492996454238892, + "learning_rate": 3.732784750240378e-05, + "loss": 2.9754, + "step": 51391 + }, + { + "epoch": 2.52, + "grad_norm": 0.7240472435951233, + "learning_rate": 3.732041019034324e-05, + "loss": 2.9141, + "step": 51392 + }, + { + "epoch": 2.52, + "grad_norm": 0.7172340750694275, + "learning_rate": 3.731297357013018e-05, + "loss": 2.8972, + "step": 51393 + }, + { + "epoch": 2.52, + "grad_norm": 0.8641431927680969, + "learning_rate": 3.730553764178411e-05, + "loss": 2.8425, + "step": 51394 + }, + { + "epoch": 2.52, + "grad_norm": 0.9124081134796143, + "learning_rate": 3.729810240532476e-05, + "loss": 2.9483, + "step": 51395 + }, + { + "epoch": 2.52, + "grad_norm": 0.7566990256309509, + "learning_rate": 3.7290667860771606e-05, + "loss": 2.8396, + "step": 51396 + }, + { + "epoch": 2.52, + "grad_norm": 0.7051795721054077, + "learning_rate": 3.728323400814434e-05, + "loss": 2.8282, + "step": 51397 + }, + { + "epoch": 2.52, + "grad_norm": 0.7272311449050903, + "learning_rate": 3.72758008474624e-05, + "loss": 2.8787, + "step": 51398 + }, + { + "epoch": 2.52, + "grad_norm": 0.7363135814666748, + "learning_rate": 3.7268368378745486e-05, + "loss": 3.0014, + "step": 51399 + }, + { + "epoch": 2.52, + "grad_norm": 0.7577966451644897, + "learning_rate": 3.7260936602013136e-05, + "loss": 2.8576, + "step": 51400 + }, + { + "epoch": 2.52, + "grad_norm": 0.7400948405265808, + "learning_rate": 3.7253505517284876e-05, + "loss": 2.6637, + "step": 51401 + }, + { + "epoch": 2.52, + "grad_norm": 0.7425705194473267, + "learning_rate": 3.724607512458035e-05, + "loss": 3.0053, + "step": 51402 + }, + { + "epoch": 2.52, + "grad_norm": 0.8094008564949036, + "learning_rate": 3.7238645423919054e-05, + "loss": 2.8479, + "step": 51403 + }, + { + "epoch": 2.52, + "grad_norm": 0.7409995198249817, + "learning_rate": 3.723121641532056e-05, + "loss": 2.8932, + "step": 51404 + }, + { + "epoch": 2.52, + "grad_norm": 0.7713068723678589, + "learning_rate": 3.7223788098804565e-05, + "loss": 2.8011, + "step": 51405 + }, + { + "epoch": 2.52, + "grad_norm": 0.7903899550437927, + "learning_rate": 3.721636047439054e-05, + "loss": 2.9123, + "step": 51406 + }, + { + "epoch": 2.52, + "grad_norm": 0.7373391389846802, + "learning_rate": 3.720893354209804e-05, + "loss": 2.7982, + "step": 51407 + }, + { + "epoch": 2.52, + "grad_norm": 0.720302164554596, + "learning_rate": 3.7201507301946556e-05, + "loss": 2.796, + "step": 51408 + }, + { + "epoch": 2.52, + "grad_norm": 0.7427999973297119, + "learning_rate": 3.719408175395573e-05, + "loss": 2.8756, + "step": 51409 + }, + { + "epoch": 2.52, + "grad_norm": 0.7941123843193054, + "learning_rate": 3.718665689814522e-05, + "loss": 3.0277, + "step": 51410 + }, + { + "epoch": 2.52, + "grad_norm": 0.724496603012085, + "learning_rate": 3.717923273453437e-05, + "loss": 2.8808, + "step": 51411 + }, + { + "epoch": 2.52, + "grad_norm": 0.7771130204200745, + "learning_rate": 3.71718092631429e-05, + "loss": 2.6689, + "step": 51412 + }, + { + "epoch": 2.52, + "grad_norm": 0.7540898323059082, + "learning_rate": 3.716438648399027e-05, + "loss": 3.088, + "step": 51413 + }, + { + "epoch": 2.52, + "grad_norm": 0.7112698554992676, + "learning_rate": 3.715696439709609e-05, + "loss": 2.8278, + "step": 51414 + }, + { + "epoch": 2.52, + "grad_norm": 0.7136939167976379, + "learning_rate": 3.714954300247991e-05, + "loss": 2.9853, + "step": 51415 + }, + { + "epoch": 2.52, + "grad_norm": 0.7295254468917847, + "learning_rate": 3.714212230016118e-05, + "loss": 2.9651, + "step": 51416 + }, + { + "epoch": 2.52, + "grad_norm": 0.7394436597824097, + "learning_rate": 3.713470229015956e-05, + "loss": 2.8072, + "step": 51417 + }, + { + "epoch": 2.52, + "grad_norm": 0.6955799460411072, + "learning_rate": 3.712728297249449e-05, + "loss": 2.9469, + "step": 51418 + }, + { + "epoch": 2.52, + "grad_norm": 0.8046622276306152, + "learning_rate": 3.7119864347185556e-05, + "loss": 3.0347, + "step": 51419 + }, + { + "epoch": 2.52, + "grad_norm": 0.729836642742157, + "learning_rate": 3.7112446414252386e-05, + "loss": 2.9362, + "step": 51420 + }, + { + "epoch": 2.52, + "grad_norm": 0.722966730594635, + "learning_rate": 3.710502917371442e-05, + "loss": 3.0038, + "step": 51421 + }, + { + "epoch": 2.52, + "grad_norm": 0.7136669158935547, + "learning_rate": 3.709761262559124e-05, + "loss": 2.904, + "step": 51422 + }, + { + "epoch": 2.52, + "grad_norm": 0.6891632676124573, + "learning_rate": 3.709019676990228e-05, + "loss": 2.9472, + "step": 51423 + }, + { + "epoch": 2.52, + "grad_norm": 0.7332636117935181, + "learning_rate": 3.708278160666711e-05, + "loss": 3.063, + "step": 51424 + }, + { + "epoch": 2.52, + "grad_norm": 0.7345249056816101, + "learning_rate": 3.707536713590539e-05, + "loss": 2.8518, + "step": 51425 + }, + { + "epoch": 2.52, + "grad_norm": 0.7762575745582581, + "learning_rate": 3.706795335763649e-05, + "loss": 3.0165, + "step": 51426 + }, + { + "epoch": 2.52, + "grad_norm": 0.7118173837661743, + "learning_rate": 3.7060540271880044e-05, + "loss": 2.721, + "step": 51427 + }, + { + "epoch": 2.52, + "grad_norm": 0.841128408908844, + "learning_rate": 3.705312787865557e-05, + "loss": 2.938, + "step": 51428 + }, + { + "epoch": 2.52, + "grad_norm": 0.6889047622680664, + "learning_rate": 3.70457161779825e-05, + "loss": 2.7814, + "step": 51429 + }, + { + "epoch": 2.52, + "grad_norm": 0.7486089468002319, + "learning_rate": 3.7038305169880367e-05, + "loss": 3.0047, + "step": 51430 + }, + { + "epoch": 2.52, + "grad_norm": 0.7171392440795898, + "learning_rate": 3.703089485436872e-05, + "loss": 2.7904, + "step": 51431 + }, + { + "epoch": 2.52, + "grad_norm": 0.7496344447135925, + "learning_rate": 3.702348523146718e-05, + "loss": 3.0611, + "step": 51432 + }, + { + "epoch": 2.52, + "grad_norm": 0.7369357943534851, + "learning_rate": 3.7016076301195084e-05, + "loss": 2.9536, + "step": 51433 + }, + { + "epoch": 2.52, + "grad_norm": 0.7333647012710571, + "learning_rate": 3.70086680635721e-05, + "loss": 3.0673, + "step": 51434 + }, + { + "epoch": 2.52, + "grad_norm": 0.7265602946281433, + "learning_rate": 3.7001260518617604e-05, + "loss": 3.0387, + "step": 51435 + }, + { + "epoch": 2.52, + "grad_norm": 0.7276657223701477, + "learning_rate": 3.6993853666351214e-05, + "loss": 2.9921, + "step": 51436 + }, + { + "epoch": 2.52, + "grad_norm": 0.75041264295578, + "learning_rate": 3.6986447506792414e-05, + "loss": 2.9, + "step": 51437 + }, + { + "epoch": 2.52, + "grad_norm": 0.77069091796875, + "learning_rate": 3.697904203996066e-05, + "loss": 2.801, + "step": 51438 + }, + { + "epoch": 2.52, + "grad_norm": 0.7159631848335266, + "learning_rate": 3.6971637265875495e-05, + "loss": 2.9552, + "step": 51439 + }, + { + "epoch": 2.52, + "grad_norm": 0.7610557079315186, + "learning_rate": 3.6964233184556415e-05, + "loss": 2.9516, + "step": 51440 + }, + { + "epoch": 2.52, + "grad_norm": 0.7705284953117371, + "learning_rate": 3.695682979602289e-05, + "loss": 2.8762, + "step": 51441 + }, + { + "epoch": 2.52, + "grad_norm": 0.7517569065093994, + "learning_rate": 3.694942710029452e-05, + "loss": 2.9491, + "step": 51442 + }, + { + "epoch": 2.52, + "grad_norm": 0.774166464805603, + "learning_rate": 3.6942025097390705e-05, + "loss": 2.8259, + "step": 51443 + }, + { + "epoch": 2.52, + "grad_norm": 0.7174193263053894, + "learning_rate": 3.693462378733101e-05, + "loss": 2.8627, + "step": 51444 + }, + { + "epoch": 2.52, + "grad_norm": 0.7680938839912415, + "learning_rate": 3.692722317013481e-05, + "loss": 2.9821, + "step": 51445 + }, + { + "epoch": 2.52, + "grad_norm": 0.7469550371170044, + "learning_rate": 3.691982324582167e-05, + "loss": 2.8813, + "step": 51446 + }, + { + "epoch": 2.52, + "grad_norm": 0.8036483526229858, + "learning_rate": 3.6912424014411156e-05, + "loss": 2.7214, + "step": 51447 + }, + { + "epoch": 2.52, + "grad_norm": 0.7748443484306335, + "learning_rate": 3.6905025475922634e-05, + "loss": 2.9701, + "step": 51448 + }, + { + "epoch": 2.52, + "grad_norm": 0.7516213059425354, + "learning_rate": 3.689762763037568e-05, + "loss": 3.0216, + "step": 51449 + }, + { + "epoch": 2.52, + "grad_norm": 0.7000873684883118, + "learning_rate": 3.6890230477789754e-05, + "loss": 2.8373, + "step": 51450 + }, + { + "epoch": 2.52, + "grad_norm": 0.7015297412872314, + "learning_rate": 3.688283401818426e-05, + "loss": 2.9047, + "step": 51451 + }, + { + "epoch": 2.52, + "grad_norm": 0.717642068862915, + "learning_rate": 3.6875438251578794e-05, + "loss": 2.8453, + "step": 51452 + }, + { + "epoch": 2.52, + "grad_norm": 0.7319772243499756, + "learning_rate": 3.6868043177992736e-05, + "loss": 2.8335, + "step": 51453 + }, + { + "epoch": 2.52, + "grad_norm": 0.7205098867416382, + "learning_rate": 3.686064879744567e-05, + "loss": 3.0223, + "step": 51454 + }, + { + "epoch": 2.52, + "grad_norm": 0.7658298015594482, + "learning_rate": 3.685325510995695e-05, + "loss": 2.8972, + "step": 51455 + }, + { + "epoch": 2.52, + "grad_norm": 0.7169014811515808, + "learning_rate": 3.684586211554609e-05, + "loss": 3.1171, + "step": 51456 + }, + { + "epoch": 2.52, + "grad_norm": 0.7414019703865051, + "learning_rate": 3.683846981423267e-05, + "loss": 3.0538, + "step": 51457 + }, + { + "epoch": 2.52, + "grad_norm": 0.772877037525177, + "learning_rate": 3.683107820603609e-05, + "loss": 2.8904, + "step": 51458 + }, + { + "epoch": 2.52, + "grad_norm": 0.7601258158683777, + "learning_rate": 3.682368729097579e-05, + "loss": 2.7492, + "step": 51459 + }, + { + "epoch": 2.52, + "grad_norm": 0.7426812052726746, + "learning_rate": 3.681629706907115e-05, + "loss": 2.7345, + "step": 51460 + }, + { + "epoch": 2.52, + "grad_norm": 0.7161027193069458, + "learning_rate": 3.680890754034177e-05, + "loss": 3.0118, + "step": 51461 + }, + { + "epoch": 2.52, + "grad_norm": 0.7432463765144348, + "learning_rate": 3.6801518704807156e-05, + "loss": 2.8489, + "step": 51462 + }, + { + "epoch": 2.52, + "grad_norm": 0.7574390769004822, + "learning_rate": 3.679413056248659e-05, + "loss": 2.8106, + "step": 51463 + }, + { + "epoch": 2.52, + "grad_norm": 0.7752411961555481, + "learning_rate": 3.6786743113399696e-05, + "loss": 2.9405, + "step": 51464 + }, + { + "epoch": 2.52, + "grad_norm": 0.7201191782951355, + "learning_rate": 3.677935635756586e-05, + "loss": 2.943, + "step": 51465 + }, + { + "epoch": 2.52, + "grad_norm": 0.7174774408340454, + "learning_rate": 3.67719702950045e-05, + "loss": 2.8726, + "step": 51466 + }, + { + "epoch": 2.52, + "grad_norm": 0.7147430777549744, + "learning_rate": 3.676458492573515e-05, + "loss": 3.0209, + "step": 51467 + }, + { + "epoch": 2.52, + "grad_norm": 0.757570743560791, + "learning_rate": 3.675720024977718e-05, + "loss": 3.0395, + "step": 51468 + }, + { + "epoch": 2.52, + "grad_norm": 0.6996948719024658, + "learning_rate": 3.674981626715015e-05, + "loss": 2.7247, + "step": 51469 + }, + { + "epoch": 2.52, + "grad_norm": 0.8142804503440857, + "learning_rate": 3.674243297787335e-05, + "loss": 2.7782, + "step": 51470 + }, + { + "epoch": 2.52, + "grad_norm": 0.7229151129722595, + "learning_rate": 3.6735050381966346e-05, + "loss": 2.7468, + "step": 51471 + }, + { + "epoch": 2.52, + "grad_norm": 0.7174829840660095, + "learning_rate": 3.672766847944859e-05, + "loss": 2.9069, + "step": 51472 + }, + { + "epoch": 2.52, + "grad_norm": 0.7757089734077454, + "learning_rate": 3.67202872703395e-05, + "loss": 3.1334, + "step": 51473 + }, + { + "epoch": 2.52, + "grad_norm": 0.7175819873809814, + "learning_rate": 3.6712906754658524e-05, + "loss": 3.0466, + "step": 51474 + }, + { + "epoch": 2.52, + "grad_norm": 0.739388108253479, + "learning_rate": 3.6705526932424975e-05, + "loss": 2.9729, + "step": 51475 + }, + { + "epoch": 2.52, + "grad_norm": 0.7365408539772034, + "learning_rate": 3.669814780365842e-05, + "loss": 2.8922, + "step": 51476 + }, + { + "epoch": 2.52, + "grad_norm": 0.6910611987113953, + "learning_rate": 3.669076936837836e-05, + "loss": 2.7283, + "step": 51477 + }, + { + "epoch": 2.52, + "grad_norm": 0.7462716102600098, + "learning_rate": 3.668339162660402e-05, + "loss": 2.814, + "step": 51478 + }, + { + "epoch": 2.52, + "grad_norm": 0.7815803289413452, + "learning_rate": 3.667601457835505e-05, + "loss": 2.8589, + "step": 51479 + }, + { + "epoch": 2.52, + "grad_norm": 0.7381653189659119, + "learning_rate": 3.666863822365077e-05, + "loss": 2.882, + "step": 51480 + }, + { + "epoch": 2.52, + "grad_norm": 0.7210900783538818, + "learning_rate": 3.6661262562510566e-05, + "loss": 3.0684, + "step": 51481 + }, + { + "epoch": 2.52, + "grad_norm": 0.7968168258666992, + "learning_rate": 3.665388759495398e-05, + "loss": 3.0299, + "step": 51482 + }, + { + "epoch": 2.52, + "grad_norm": 0.7475718855857849, + "learning_rate": 3.664651332100027e-05, + "loss": 2.8383, + "step": 51483 + }, + { + "epoch": 2.52, + "grad_norm": 0.8005061149597168, + "learning_rate": 3.6639139740669055e-05, + "loss": 3.1725, + "step": 51484 + }, + { + "epoch": 2.52, + "grad_norm": 0.7502545118331909, + "learning_rate": 3.663176685397962e-05, + "loss": 2.7869, + "step": 51485 + }, + { + "epoch": 2.52, + "grad_norm": 0.7680295705795288, + "learning_rate": 3.662439466095144e-05, + "loss": 2.8294, + "step": 51486 + }, + { + "epoch": 2.52, + "grad_norm": 0.710451066493988, + "learning_rate": 3.661702316160394e-05, + "loss": 2.7683, + "step": 51487 + }, + { + "epoch": 2.52, + "grad_norm": 0.6979348063468933, + "learning_rate": 3.660965235595644e-05, + "loss": 2.9291, + "step": 51488 + }, + { + "epoch": 2.52, + "grad_norm": 0.7553119659423828, + "learning_rate": 3.660228224402849e-05, + "loss": 2.9579, + "step": 51489 + }, + { + "epoch": 2.52, + "grad_norm": 0.7529929876327515, + "learning_rate": 3.659491282583941e-05, + "loss": 2.8826, + "step": 51490 + }, + { + "epoch": 2.52, + "grad_norm": 0.7481595277786255, + "learning_rate": 3.65875441014086e-05, + "loss": 2.8124, + "step": 51491 + }, + { + "epoch": 2.52, + "grad_norm": 0.7778012156486511, + "learning_rate": 3.6580176070755576e-05, + "loss": 2.7338, + "step": 51492 + }, + { + "epoch": 2.52, + "grad_norm": 0.7474700808525085, + "learning_rate": 3.657280873389959e-05, + "loss": 3.1009, + "step": 51493 + }, + { + "epoch": 2.52, + "grad_norm": 0.7289611101150513, + "learning_rate": 3.656544209086022e-05, + "loss": 3.0016, + "step": 51494 + }, + { + "epoch": 2.52, + "grad_norm": 0.815883219242096, + "learning_rate": 3.6558076141656766e-05, + "loss": 2.9222, + "step": 51495 + }, + { + "epoch": 2.52, + "grad_norm": 0.7379744648933411, + "learning_rate": 3.655071088630855e-05, + "loss": 3.0274, + "step": 51496 + }, + { + "epoch": 2.52, + "grad_norm": 0.7309653162956238, + "learning_rate": 3.654334632483516e-05, + "loss": 2.8887, + "step": 51497 + }, + { + "epoch": 2.52, + "grad_norm": 0.7571339011192322, + "learning_rate": 3.6535982457255817e-05, + "loss": 2.9202, + "step": 51498 + }, + { + "epoch": 2.52, + "grad_norm": 0.7372366786003113, + "learning_rate": 3.6528619283590066e-05, + "loss": 3.1448, + "step": 51499 + }, + { + "epoch": 2.52, + "grad_norm": 0.7438984513282776, + "learning_rate": 3.652125680385712e-05, + "loss": 2.9245, + "step": 51500 + }, + { + "epoch": 2.52, + "grad_norm": 0.7498157024383545, + "learning_rate": 3.651389501807658e-05, + "loss": 3.1159, + "step": 51501 + }, + { + "epoch": 2.52, + "grad_norm": 0.7399702072143555, + "learning_rate": 3.650653392626775e-05, + "loss": 3.0412, + "step": 51502 + }, + { + "epoch": 2.52, + "grad_norm": 0.7499619126319885, + "learning_rate": 3.6499173528449874e-05, + "loss": 2.8588, + "step": 51503 + }, + { + "epoch": 2.52, + "grad_norm": 0.784655749797821, + "learning_rate": 3.649181382464256e-05, + "loss": 3.1072, + "step": 51504 + }, + { + "epoch": 2.52, + "grad_norm": 0.742243230342865, + "learning_rate": 3.648445481486504e-05, + "loss": 3.0878, + "step": 51505 + }, + { + "epoch": 2.52, + "grad_norm": 0.7311636805534363, + "learning_rate": 3.6477096499136735e-05, + "loss": 2.9728, + "step": 51506 + }, + { + "epoch": 2.52, + "grad_norm": 0.7376363277435303, + "learning_rate": 3.646973887747714e-05, + "loss": 2.9388, + "step": 51507 + }, + { + "epoch": 2.52, + "grad_norm": 0.746273934841156, + "learning_rate": 3.646238194990549e-05, + "loss": 2.919, + "step": 51508 + }, + { + "epoch": 2.52, + "grad_norm": 0.7192387580871582, + "learning_rate": 3.645502571644122e-05, + "loss": 2.8624, + "step": 51509 + }, + { + "epoch": 2.52, + "grad_norm": 0.7488130331039429, + "learning_rate": 3.644767017710364e-05, + "loss": 2.7476, + "step": 51510 + }, + { + "epoch": 2.52, + "grad_norm": 0.7002290487289429, + "learning_rate": 3.644031533191223e-05, + "loss": 2.8554, + "step": 51511 + }, + { + "epoch": 2.52, + "grad_norm": 0.7030738592147827, + "learning_rate": 3.6432961180886246e-05, + "loss": 2.9592, + "step": 51512 + }, + { + "epoch": 2.52, + "grad_norm": 0.8064178228378296, + "learning_rate": 3.642560772404514e-05, + "loss": 2.8433, + "step": 51513 + }, + { + "epoch": 2.52, + "grad_norm": 0.7585741877555847, + "learning_rate": 3.641825496140829e-05, + "loss": 2.8685, + "step": 51514 + }, + { + "epoch": 2.52, + "grad_norm": 0.7327173352241516, + "learning_rate": 3.6410902892994953e-05, + "loss": 2.8906, + "step": 51515 + }, + { + "epoch": 2.52, + "grad_norm": 0.7685412168502808, + "learning_rate": 3.640355151882468e-05, + "loss": 2.6577, + "step": 51516 + }, + { + "epoch": 2.52, + "grad_norm": 0.7944349646568298, + "learning_rate": 3.639620083891669e-05, + "loss": 2.9258, + "step": 51517 + }, + { + "epoch": 2.52, + "grad_norm": 0.7449418902397156, + "learning_rate": 3.6388850853290294e-05, + "loss": 2.7102, + "step": 51518 + }, + { + "epoch": 2.52, + "grad_norm": 0.7388597130775452, + "learning_rate": 3.638150156196502e-05, + "loss": 2.9237, + "step": 51519 + }, + { + "epoch": 2.52, + "grad_norm": 0.7266759872436523, + "learning_rate": 3.6374152964960076e-05, + "loss": 2.9021, + "step": 51520 + }, + { + "epoch": 2.52, + "grad_norm": 0.7184749841690063, + "learning_rate": 3.6366805062294956e-05, + "loss": 3.0353, + "step": 51521 + }, + { + "epoch": 2.52, + "grad_norm": 0.7220653891563416, + "learning_rate": 3.6359457853988835e-05, + "loss": 2.8162, + "step": 51522 + }, + { + "epoch": 2.53, + "grad_norm": 0.7334814667701721, + "learning_rate": 3.635211134006124e-05, + "loss": 2.9972, + "step": 51523 + }, + { + "epoch": 2.53, + "grad_norm": 0.7480335831642151, + "learning_rate": 3.634476552053145e-05, + "loss": 2.8407, + "step": 51524 + }, + { + "epoch": 2.53, + "grad_norm": 0.7650389671325684, + "learning_rate": 3.633742039541875e-05, + "loss": 2.8594, + "step": 51525 + }, + { + "epoch": 2.53, + "grad_norm": 0.7087839841842651, + "learning_rate": 3.6330075964742586e-05, + "loss": 2.7335, + "step": 51526 + }, + { + "epoch": 2.53, + "grad_norm": 0.7374259829521179, + "learning_rate": 3.632273222852222e-05, + "loss": 2.954, + "step": 51527 + }, + { + "epoch": 2.53, + "grad_norm": 0.7621423006057739, + "learning_rate": 3.6315389186777e-05, + "loss": 2.9835, + "step": 51528 + }, + { + "epoch": 2.53, + "grad_norm": 0.7097713351249695, + "learning_rate": 3.630804683952637e-05, + "loss": 2.8251, + "step": 51529 + }, + { + "epoch": 2.53, + "grad_norm": 0.7970553636550903, + "learning_rate": 3.6300705186789627e-05, + "loss": 2.8793, + "step": 51530 + }, + { + "epoch": 2.53, + "grad_norm": 0.7120518088340759, + "learning_rate": 3.629336422858604e-05, + "loss": 2.8642, + "step": 51531 + }, + { + "epoch": 2.53, + "grad_norm": 0.731629490852356, + "learning_rate": 3.628602396493494e-05, + "loss": 2.9259, + "step": 51532 + }, + { + "epoch": 2.53, + "grad_norm": 0.7600454688072205, + "learning_rate": 3.6278684395855676e-05, + "loss": 2.7663, + "step": 51533 + }, + { + "epoch": 2.53, + "grad_norm": 0.7035639882087708, + "learning_rate": 3.627134552136769e-05, + "loss": 2.8127, + "step": 51534 + }, + { + "epoch": 2.53, + "grad_norm": 0.7026732563972473, + "learning_rate": 3.6264007341490145e-05, + "loss": 2.805, + "step": 51535 + }, + { + "epoch": 2.53, + "grad_norm": 0.7547394633293152, + "learning_rate": 3.6256669856242516e-05, + "loss": 2.8502, + "step": 51536 + }, + { + "epoch": 2.53, + "grad_norm": 0.7437660694122314, + "learning_rate": 3.624933306564399e-05, + "loss": 2.8254, + "step": 51537 + }, + { + "epoch": 2.53, + "grad_norm": 0.7333990335464478, + "learning_rate": 3.624199696971403e-05, + "loss": 2.9906, + "step": 51538 + }, + { + "epoch": 2.53, + "grad_norm": 0.7599939107894897, + "learning_rate": 3.62346615684719e-05, + "loss": 2.7959, + "step": 51539 + }, + { + "epoch": 2.53, + "grad_norm": 0.8379149436950684, + "learning_rate": 3.622732686193683e-05, + "loss": 2.9625, + "step": 51540 + }, + { + "epoch": 2.53, + "grad_norm": 0.7774891257286072, + "learning_rate": 3.621999285012828e-05, + "loss": 2.7655, + "step": 51541 + }, + { + "epoch": 2.53, + "grad_norm": 0.7588175535202026, + "learning_rate": 3.621265953306545e-05, + "loss": 2.7889, + "step": 51542 + }, + { + "epoch": 2.53, + "grad_norm": 0.7654353380203247, + "learning_rate": 3.62053269107677e-05, + "loss": 2.9883, + "step": 51543 + }, + { + "epoch": 2.53, + "grad_norm": 0.744993269443512, + "learning_rate": 3.619799498325441e-05, + "loss": 2.7745, + "step": 51544 + }, + { + "epoch": 2.53, + "grad_norm": 0.7357940077781677, + "learning_rate": 3.6190663750544833e-05, + "loss": 2.6419, + "step": 51545 + }, + { + "epoch": 2.53, + "grad_norm": 0.737634539604187, + "learning_rate": 3.6183333212658294e-05, + "loss": 2.9255, + "step": 51546 + }, + { + "epoch": 2.53, + "grad_norm": 0.7712468504905701, + "learning_rate": 3.6176003369614e-05, + "loss": 3.0007, + "step": 51547 + }, + { + "epoch": 2.53, + "grad_norm": 0.7073559761047363, + "learning_rate": 3.616867422143135e-05, + "loss": 2.9726, + "step": 51548 + }, + { + "epoch": 2.53, + "grad_norm": 0.7305967807769775, + "learning_rate": 3.616134576812971e-05, + "loss": 2.8264, + "step": 51549 + }, + { + "epoch": 2.53, + "grad_norm": 0.7160578370094299, + "learning_rate": 3.6154018009728216e-05, + "loss": 2.9483, + "step": 51550 + }, + { + "epoch": 2.53, + "grad_norm": 0.7831854820251465, + "learning_rate": 3.614669094624634e-05, + "loss": 2.7699, + "step": 51551 + }, + { + "epoch": 2.53, + "grad_norm": 0.7335165739059448, + "learning_rate": 3.61393645777033e-05, + "loss": 3.0746, + "step": 51552 + }, + { + "epoch": 2.53, + "grad_norm": 0.8146199584007263, + "learning_rate": 3.6132038904118356e-05, + "loss": 2.8505, + "step": 51553 + }, + { + "epoch": 2.53, + "grad_norm": 0.7218393087387085, + "learning_rate": 3.6124713925510854e-05, + "loss": 2.8958, + "step": 51554 + }, + { + "epoch": 2.53, + "grad_norm": 0.7315160036087036, + "learning_rate": 3.6117389641900044e-05, + "loss": 2.7957, + "step": 51555 + }, + { + "epoch": 2.53, + "grad_norm": 0.7376806735992432, + "learning_rate": 3.611006605330527e-05, + "loss": 2.8755, + "step": 51556 + }, + { + "epoch": 2.53, + "grad_norm": 0.8134010434150696, + "learning_rate": 3.610274315974577e-05, + "loss": 2.8359, + "step": 51557 + }, + { + "epoch": 2.53, + "grad_norm": 0.7388611435890198, + "learning_rate": 3.6095420961240854e-05, + "loss": 2.6661, + "step": 51558 + }, + { + "epoch": 2.53, + "grad_norm": 0.7700594663619995, + "learning_rate": 3.608809945780986e-05, + "loss": 2.9496, + "step": 51559 + }, + { + "epoch": 2.53, + "grad_norm": 0.7100481986999512, + "learning_rate": 3.608077864947202e-05, + "loss": 2.7765, + "step": 51560 + }, + { + "epoch": 2.53, + "grad_norm": 0.7788733243942261, + "learning_rate": 3.6073458536246616e-05, + "loss": 3.0668, + "step": 51561 + }, + { + "epoch": 2.53, + "grad_norm": 0.7740787863731384, + "learning_rate": 3.6066139118152894e-05, + "loss": 3.0295, + "step": 51562 + }, + { + "epoch": 2.53, + "grad_norm": 0.7355701327323914, + "learning_rate": 3.6058820395210105e-05, + "loss": 2.6675, + "step": 51563 + }, + { + "epoch": 2.53, + "grad_norm": 0.7832901477813721, + "learning_rate": 3.605150236743771e-05, + "loss": 2.7939, + "step": 51564 + }, + { + "epoch": 2.53, + "grad_norm": 0.725982129573822, + "learning_rate": 3.604418503485476e-05, + "loss": 2.9671, + "step": 51565 + }, + { + "epoch": 2.53, + "grad_norm": 0.7156925797462463, + "learning_rate": 3.603686839748072e-05, + "loss": 2.908, + "step": 51566 + }, + { + "epoch": 2.53, + "grad_norm": 0.801988959312439, + "learning_rate": 3.602955245533473e-05, + "loss": 2.8038, + "step": 51567 + }, + { + "epoch": 2.53, + "grad_norm": 0.7519605159759521, + "learning_rate": 3.602223720843607e-05, + "loss": 2.9654, + "step": 51568 + }, + { + "epoch": 2.53, + "grad_norm": 0.7388525605201721, + "learning_rate": 3.601492265680408e-05, + "loss": 3.1603, + "step": 51569 + }, + { + "epoch": 2.53, + "grad_norm": 0.7522304654121399, + "learning_rate": 3.600760880045792e-05, + "loss": 2.9316, + "step": 51570 + }, + { + "epoch": 2.53, + "grad_norm": 0.7278318405151367, + "learning_rate": 3.6000295639416974e-05, + "loss": 2.791, + "step": 51571 + }, + { + "epoch": 2.53, + "grad_norm": 0.7324314117431641, + "learning_rate": 3.599298317370036e-05, + "loss": 3.0109, + "step": 51572 + }, + { + "epoch": 2.53, + "grad_norm": 0.7239487767219543, + "learning_rate": 3.598567140332747e-05, + "loss": 2.8985, + "step": 51573 + }, + { + "epoch": 2.53, + "grad_norm": 0.731003999710083, + "learning_rate": 3.597836032831754e-05, + "loss": 2.9196, + "step": 51574 + }, + { + "epoch": 2.53, + "grad_norm": 0.746536910533905, + "learning_rate": 3.597104994868979e-05, + "loss": 2.985, + "step": 51575 + }, + { + "epoch": 2.53, + "grad_norm": 0.7338457703590393, + "learning_rate": 3.596374026446348e-05, + "loss": 2.9579, + "step": 51576 + }, + { + "epoch": 2.53, + "grad_norm": 0.8143104314804077, + "learning_rate": 3.595643127565779e-05, + "loss": 2.9791, + "step": 51577 + }, + { + "epoch": 2.53, + "grad_norm": 0.7997439503669739, + "learning_rate": 3.594912298229208e-05, + "loss": 2.8689, + "step": 51578 + }, + { + "epoch": 2.53, + "grad_norm": 0.755618691444397, + "learning_rate": 3.594181538438562e-05, + "loss": 2.9621, + "step": 51579 + }, + { + "epoch": 2.53, + "grad_norm": 0.7650567293167114, + "learning_rate": 3.59345084819575e-05, + "loss": 2.8828, + "step": 51580 + }, + { + "epoch": 2.53, + "grad_norm": 0.7664803266525269, + "learning_rate": 3.5927202275027176e-05, + "loss": 3.0211, + "step": 51581 + }, + { + "epoch": 2.53, + "grad_norm": 0.6953228116035461, + "learning_rate": 3.591989676361373e-05, + "loss": 2.8972, + "step": 51582 + }, + { + "epoch": 2.53, + "grad_norm": 0.7946908473968506, + "learning_rate": 3.591259194773641e-05, + "loss": 2.993, + "step": 51583 + }, + { + "epoch": 2.53, + "grad_norm": 0.716802179813385, + "learning_rate": 3.5905287827414574e-05, + "loss": 2.9496, + "step": 51584 + }, + { + "epoch": 2.53, + "grad_norm": 0.7397177219390869, + "learning_rate": 3.589798440266734e-05, + "loss": 2.8957, + "step": 51585 + }, + { + "epoch": 2.53, + "grad_norm": 0.7066813707351685, + "learning_rate": 3.589068167351401e-05, + "loss": 2.8603, + "step": 51586 + }, + { + "epoch": 2.53, + "grad_norm": 0.7754790186882019, + "learning_rate": 3.588337963997375e-05, + "loss": 3.0405, + "step": 51587 + }, + { + "epoch": 2.53, + "grad_norm": 0.7783805131912231, + "learning_rate": 3.587607830206588e-05, + "loss": 3.0427, + "step": 51588 + }, + { + "epoch": 2.53, + "grad_norm": 0.7516245245933533, + "learning_rate": 3.586877765980961e-05, + "loss": 2.9936, + "step": 51589 + }, + { + "epoch": 2.53, + "grad_norm": 0.7606446146965027, + "learning_rate": 3.5861477713224096e-05, + "loss": 3.0245, + "step": 51590 + }, + { + "epoch": 2.53, + "grad_norm": 0.7204053401947021, + "learning_rate": 3.5854178462328654e-05, + "loss": 2.9596, + "step": 51591 + }, + { + "epoch": 2.53, + "grad_norm": 0.7473350763320923, + "learning_rate": 3.58468799071424e-05, + "loss": 2.8518, + "step": 51592 + }, + { + "epoch": 2.53, + "grad_norm": 0.7622993588447571, + "learning_rate": 3.583958204768472e-05, + "loss": 2.7923, + "step": 51593 + }, + { + "epoch": 2.53, + "grad_norm": 0.7171756029129028, + "learning_rate": 3.583228488397464e-05, + "loss": 2.9466, + "step": 51594 + }, + { + "epoch": 2.53, + "grad_norm": 0.7305697202682495, + "learning_rate": 3.582498841603153e-05, + "loss": 2.7989, + "step": 51595 + }, + { + "epoch": 2.53, + "grad_norm": 0.7551934123039246, + "learning_rate": 3.581769264387459e-05, + "loss": 2.8935, + "step": 51596 + }, + { + "epoch": 2.53, + "grad_norm": 0.7103335857391357, + "learning_rate": 3.581039756752302e-05, + "loss": 2.8689, + "step": 51597 + }, + { + "epoch": 2.53, + "grad_norm": 0.7509092688560486, + "learning_rate": 3.5803103186995985e-05, + "loss": 3.0213, + "step": 51598 + }, + { + "epoch": 2.53, + "grad_norm": 0.7707995772361755, + "learning_rate": 3.57958095023127e-05, + "loss": 2.9597, + "step": 51599 + }, + { + "epoch": 2.53, + "grad_norm": 0.7239026427268982, + "learning_rate": 3.578851651349238e-05, + "loss": 2.9482, + "step": 51600 + }, + { + "epoch": 2.53, + "grad_norm": 0.699456512928009, + "learning_rate": 3.578122422055435e-05, + "loss": 3.0693, + "step": 51601 + }, + { + "epoch": 2.53, + "grad_norm": 0.7883853316307068, + "learning_rate": 3.577393262351766e-05, + "loss": 2.8765, + "step": 51602 + }, + { + "epoch": 2.53, + "grad_norm": 0.7522447109222412, + "learning_rate": 3.576664172240165e-05, + "loss": 2.7847, + "step": 51603 + }, + { + "epoch": 2.53, + "grad_norm": 0.8980352282524109, + "learning_rate": 3.575935151722541e-05, + "loss": 3.0591, + "step": 51604 + }, + { + "epoch": 2.53, + "grad_norm": 0.7412164211273193, + "learning_rate": 3.5752062008008163e-05, + "loss": 2.887, + "step": 51605 + }, + { + "epoch": 2.53, + "grad_norm": 0.7191179990768433, + "learning_rate": 3.574477319476919e-05, + "loss": 3.2386, + "step": 51606 + }, + { + "epoch": 2.53, + "grad_norm": 0.8027909398078918, + "learning_rate": 3.573748507752755e-05, + "loss": 2.7148, + "step": 51607 + }, + { + "epoch": 2.53, + "grad_norm": 0.7330635786056519, + "learning_rate": 3.5730197656302585e-05, + "loss": 2.9588, + "step": 51608 + }, + { + "epoch": 2.53, + "grad_norm": 0.7735836505889893, + "learning_rate": 3.5722910931113345e-05, + "loss": 3.1469, + "step": 51609 + }, + { + "epoch": 2.53, + "grad_norm": 0.7595041990280151, + "learning_rate": 3.5715624901979154e-05, + "loss": 2.7322, + "step": 51610 + }, + { + "epoch": 2.53, + "grad_norm": 0.683493435382843, + "learning_rate": 3.5708339568919164e-05, + "loss": 2.8669, + "step": 51611 + }, + { + "epoch": 2.53, + "grad_norm": 0.7193924188613892, + "learning_rate": 3.570105493195249e-05, + "loss": 2.9912, + "step": 51612 + }, + { + "epoch": 2.53, + "grad_norm": 0.7553349137306213, + "learning_rate": 3.5693770991098417e-05, + "loss": 3.0392, + "step": 51613 + }, + { + "epoch": 2.53, + "grad_norm": 0.7322878837585449, + "learning_rate": 3.5686487746376027e-05, + "loss": 2.989, + "step": 51614 + }, + { + "epoch": 2.53, + "grad_norm": 0.7660918235778809, + "learning_rate": 3.567920519780454e-05, + "loss": 2.9561, + "step": 51615 + }, + { + "epoch": 2.53, + "grad_norm": 0.7109578847885132, + "learning_rate": 3.567192334540324e-05, + "loss": 2.8681, + "step": 51616 + }, + { + "epoch": 2.53, + "grad_norm": 0.743213415145874, + "learning_rate": 3.566464218919115e-05, + "loss": 2.8795, + "step": 51617 + }, + { + "epoch": 2.53, + "grad_norm": 0.7197250723838806, + "learning_rate": 3.565736172918762e-05, + "loss": 2.8877, + "step": 51618 + }, + { + "epoch": 2.53, + "grad_norm": 0.692306399345398, + "learning_rate": 3.565008196541169e-05, + "loss": 2.9454, + "step": 51619 + }, + { + "epoch": 2.53, + "grad_norm": 0.7972225546836853, + "learning_rate": 3.564280289788249e-05, + "loss": 2.827, + "step": 51620 + }, + { + "epoch": 2.53, + "grad_norm": 0.7207708358764648, + "learning_rate": 3.563552452661938e-05, + "loss": 2.9103, + "step": 51621 + }, + { + "epoch": 2.53, + "grad_norm": 0.7996771931648254, + "learning_rate": 3.562824685164132e-05, + "loss": 2.7235, + "step": 51622 + }, + { + "epoch": 2.53, + "grad_norm": 0.6930966973304749, + "learning_rate": 3.5620969872967644e-05, + "loss": 2.818, + "step": 51623 + }, + { + "epoch": 2.53, + "grad_norm": 0.6936909556388855, + "learning_rate": 3.56136935906174e-05, + "loss": 2.8292, + "step": 51624 + }, + { + "epoch": 2.53, + "grad_norm": 0.7523632049560547, + "learning_rate": 3.560641800460987e-05, + "loss": 2.9122, + "step": 51625 + }, + { + "epoch": 2.53, + "grad_norm": 0.7502445578575134, + "learning_rate": 3.5599143114964143e-05, + "loss": 2.8885, + "step": 51626 + }, + { + "epoch": 2.53, + "grad_norm": 0.7347469329833984, + "learning_rate": 3.559186892169934e-05, + "loss": 2.8755, + "step": 51627 + }, + { + "epoch": 2.53, + "grad_norm": 0.7211341857910156, + "learning_rate": 3.558459542483474e-05, + "loss": 3.0235, + "step": 51628 + }, + { + "epoch": 2.53, + "grad_norm": 0.7356860637664795, + "learning_rate": 3.557732262438932e-05, + "loss": 2.7142, + "step": 51629 + }, + { + "epoch": 2.53, + "grad_norm": 0.7704647779464722, + "learning_rate": 3.5570050520382386e-05, + "loss": 2.9018, + "step": 51630 + }, + { + "epoch": 2.53, + "grad_norm": 0.6986088156700134, + "learning_rate": 3.55627791128331e-05, + "loss": 2.9044, + "step": 51631 + }, + { + "epoch": 2.53, + "grad_norm": 0.8135074377059937, + "learning_rate": 3.555550840176057e-05, + "loss": 2.9279, + "step": 51632 + }, + { + "epoch": 2.53, + "grad_norm": 0.7185907959938049, + "learning_rate": 3.554823838718393e-05, + "loss": 2.9699, + "step": 51633 + }, + { + "epoch": 2.53, + "grad_norm": 0.7311407327651978, + "learning_rate": 3.554096906912226e-05, + "loss": 2.9253, + "step": 51634 + }, + { + "epoch": 2.53, + "grad_norm": 0.7427823543548584, + "learning_rate": 3.553370044759476e-05, + "loss": 2.9968, + "step": 51635 + }, + { + "epoch": 2.53, + "grad_norm": 0.7487204074859619, + "learning_rate": 3.5526432522620704e-05, + "loss": 2.8834, + "step": 51636 + }, + { + "epoch": 2.53, + "grad_norm": 0.745222806930542, + "learning_rate": 3.551916529421906e-05, + "loss": 2.8098, + "step": 51637 + }, + { + "epoch": 2.53, + "grad_norm": 0.7326517105102539, + "learning_rate": 3.551189876240905e-05, + "loss": 2.9652, + "step": 51638 + }, + { + "epoch": 2.53, + "grad_norm": 0.8086389899253845, + "learning_rate": 3.5504632927209786e-05, + "loss": 3.0676, + "step": 51639 + }, + { + "epoch": 2.53, + "grad_norm": 0.7548852562904358, + "learning_rate": 3.5497367788640474e-05, + "loss": 3.0023, + "step": 51640 + }, + { + "epoch": 2.53, + "grad_norm": 0.7331960201263428, + "learning_rate": 3.5490103346720166e-05, + "loss": 2.9562, + "step": 51641 + }, + { + "epoch": 2.53, + "grad_norm": 0.7696003913879395, + "learning_rate": 3.548283960146795e-05, + "loss": 2.9089, + "step": 51642 + }, + { + "epoch": 2.53, + "grad_norm": 0.6968817710876465, + "learning_rate": 3.5475576552903105e-05, + "loss": 3.0244, + "step": 51643 + }, + { + "epoch": 2.53, + "grad_norm": 0.7616210579872131, + "learning_rate": 3.5468314201044624e-05, + "loss": 2.6087, + "step": 51644 + }, + { + "epoch": 2.53, + "grad_norm": 0.8356584310531616, + "learning_rate": 3.546105254591172e-05, + "loss": 2.9079, + "step": 51645 + }, + { + "epoch": 2.53, + "grad_norm": 0.7322165369987488, + "learning_rate": 3.5453791587523504e-05, + "loss": 2.9639, + "step": 51646 + }, + { + "epoch": 2.53, + "grad_norm": 0.7493136525154114, + "learning_rate": 3.544653132589911e-05, + "loss": 2.6329, + "step": 51647 + }, + { + "epoch": 2.53, + "grad_norm": 0.7908979654312134, + "learning_rate": 3.5439271761057645e-05, + "loss": 2.7651, + "step": 51648 + }, + { + "epoch": 2.53, + "grad_norm": 0.7793807983398438, + "learning_rate": 3.543201289301816e-05, + "loss": 2.8588, + "step": 51649 + }, + { + "epoch": 2.53, + "grad_norm": 0.7199450135231018, + "learning_rate": 3.542475472179981e-05, + "loss": 3.1165, + "step": 51650 + }, + { + "epoch": 2.53, + "grad_norm": 0.7530863285064697, + "learning_rate": 3.5417497247421844e-05, + "loss": 2.7485, + "step": 51651 + }, + { + "epoch": 2.53, + "grad_norm": 0.7242304086685181, + "learning_rate": 3.541024046990318e-05, + "loss": 3.0574, + "step": 51652 + }, + { + "epoch": 2.53, + "grad_norm": 0.7129125595092773, + "learning_rate": 3.540298438926308e-05, + "loss": 2.7127, + "step": 51653 + }, + { + "epoch": 2.53, + "grad_norm": 0.7528355717658997, + "learning_rate": 3.539572900552061e-05, + "loss": 2.8536, + "step": 51654 + }, + { + "epoch": 2.53, + "grad_norm": 0.7420986890792847, + "learning_rate": 3.53884743186948e-05, + "loss": 2.8166, + "step": 51655 + }, + { + "epoch": 2.53, + "grad_norm": 0.7624697089195251, + "learning_rate": 3.538122032880487e-05, + "loss": 3.1127, + "step": 51656 + }, + { + "epoch": 2.53, + "grad_norm": 0.7528888583183289, + "learning_rate": 3.537396703586983e-05, + "loss": 3.0215, + "step": 51657 + }, + { + "epoch": 2.53, + "grad_norm": 0.7675604224205017, + "learning_rate": 3.5366714439908905e-05, + "loss": 2.966, + "step": 51658 + }, + { + "epoch": 2.53, + "grad_norm": 0.7084840536117554, + "learning_rate": 3.535946254094104e-05, + "loss": 2.7934, + "step": 51659 + }, + { + "epoch": 2.53, + "grad_norm": 0.7773526310920715, + "learning_rate": 3.5352211338985425e-05, + "loss": 2.9984, + "step": 51660 + }, + { + "epoch": 2.53, + "grad_norm": 0.7066696882247925, + "learning_rate": 3.534496083406121e-05, + "loss": 2.8627, + "step": 51661 + }, + { + "epoch": 2.53, + "grad_norm": 0.7332533597946167, + "learning_rate": 3.533771102618744e-05, + "loss": 2.885, + "step": 51662 + }, + { + "epoch": 2.53, + "grad_norm": 0.7373495697975159, + "learning_rate": 3.533046191538321e-05, + "loss": 3.0135, + "step": 51663 + }, + { + "epoch": 2.53, + "grad_norm": 0.7333385944366455, + "learning_rate": 3.532321350166757e-05, + "loss": 2.8784, + "step": 51664 + }, + { + "epoch": 2.53, + "grad_norm": 0.726533055305481, + "learning_rate": 3.531596578505966e-05, + "loss": 2.9567, + "step": 51665 + }, + { + "epoch": 2.53, + "grad_norm": 0.7740032076835632, + "learning_rate": 3.530871876557852e-05, + "loss": 2.9343, + "step": 51666 + }, + { + "epoch": 2.53, + "grad_norm": 0.7073380351066589, + "learning_rate": 3.530147244324327e-05, + "loss": 2.7355, + "step": 51667 + }, + { + "epoch": 2.53, + "grad_norm": 0.7208665013313293, + "learning_rate": 3.5294226818073056e-05, + "loss": 2.9038, + "step": 51668 + }, + { + "epoch": 2.53, + "grad_norm": 0.7397687435150146, + "learning_rate": 3.528698189008692e-05, + "loss": 2.8913, + "step": 51669 + }, + { + "epoch": 2.53, + "grad_norm": 0.7066610455513, + "learning_rate": 3.527973765930394e-05, + "loss": 2.8714, + "step": 51670 + }, + { + "epoch": 2.53, + "grad_norm": 0.7497684359550476, + "learning_rate": 3.527249412574308e-05, + "loss": 2.8729, + "step": 51671 + }, + { + "epoch": 2.53, + "grad_norm": 0.7150473594665527, + "learning_rate": 3.5265251289423544e-05, + "loss": 2.8148, + "step": 51672 + }, + { + "epoch": 2.53, + "grad_norm": 0.7562102675437927, + "learning_rate": 3.525800915036446e-05, + "loss": 2.991, + "step": 51673 + }, + { + "epoch": 2.53, + "grad_norm": 0.7330208420753479, + "learning_rate": 3.525076770858475e-05, + "loss": 2.917, + "step": 51674 + }, + { + "epoch": 2.53, + "grad_norm": 0.7095870971679688, + "learning_rate": 3.5243526964103655e-05, + "loss": 2.8007, + "step": 51675 + }, + { + "epoch": 2.53, + "grad_norm": 0.7798789739608765, + "learning_rate": 3.523628691694006e-05, + "loss": 2.918, + "step": 51676 + }, + { + "epoch": 2.53, + "grad_norm": 0.7631597518920898, + "learning_rate": 3.522904756711322e-05, + "loss": 2.9471, + "step": 51677 + }, + { + "epoch": 2.53, + "grad_norm": 0.8316076993942261, + "learning_rate": 3.522180891464209e-05, + "loss": 3.0965, + "step": 51678 + }, + { + "epoch": 2.53, + "grad_norm": 0.739554762840271, + "learning_rate": 3.521457095954572e-05, + "loss": 2.9548, + "step": 51679 + }, + { + "epoch": 2.53, + "grad_norm": 0.7704421877861023, + "learning_rate": 3.5207333701843255e-05, + "loss": 2.9717, + "step": 51680 + }, + { + "epoch": 2.53, + "grad_norm": 0.7329279780387878, + "learning_rate": 3.520009714155365e-05, + "loss": 2.9602, + "step": 51681 + }, + { + "epoch": 2.53, + "grad_norm": 0.7119458913803101, + "learning_rate": 3.5192861278696025e-05, + "loss": 2.7271, + "step": 51682 + }, + { + "epoch": 2.53, + "grad_norm": 0.7648967504501343, + "learning_rate": 3.51856261132895e-05, + "loss": 2.7971, + "step": 51683 + }, + { + "epoch": 2.53, + "grad_norm": 0.7791581749916077, + "learning_rate": 3.5178391645353085e-05, + "loss": 2.8109, + "step": 51684 + }, + { + "epoch": 2.53, + "grad_norm": 0.7299320697784424, + "learning_rate": 3.517115787490581e-05, + "loss": 2.6825, + "step": 51685 + }, + { + "epoch": 2.53, + "grad_norm": 0.7683125734329224, + "learning_rate": 3.5163924801966684e-05, + "loss": 2.733, + "step": 51686 + }, + { + "epoch": 2.53, + "grad_norm": 0.7981067895889282, + "learning_rate": 3.515669242655482e-05, + "loss": 2.8742, + "step": 51687 + }, + { + "epoch": 2.53, + "grad_norm": 0.7027236819267273, + "learning_rate": 3.5149460748689315e-05, + "loss": 2.9845, + "step": 51688 + }, + { + "epoch": 2.53, + "grad_norm": 0.7060105204582214, + "learning_rate": 3.5142229768389085e-05, + "loss": 2.709, + "step": 51689 + }, + { + "epoch": 2.53, + "grad_norm": 0.7767622470855713, + "learning_rate": 3.513499948567331e-05, + "loss": 2.9245, + "step": 51690 + }, + { + "epoch": 2.53, + "grad_norm": 0.7285616993904114, + "learning_rate": 3.512776990056098e-05, + "loss": 3.0044, + "step": 51691 + }, + { + "epoch": 2.53, + "grad_norm": 0.7438006401062012, + "learning_rate": 3.512054101307107e-05, + "loss": 2.9719, + "step": 51692 + }, + { + "epoch": 2.53, + "grad_norm": 0.7528930306434631, + "learning_rate": 3.511331282322274e-05, + "loss": 3.2001, + "step": 51693 + }, + { + "epoch": 2.53, + "grad_norm": 0.7622228860855103, + "learning_rate": 3.5106085331034873e-05, + "loss": 2.8069, + "step": 51694 + }, + { + "epoch": 2.53, + "grad_norm": 0.8420156836509705, + "learning_rate": 3.509885853652668e-05, + "loss": 2.8725, + "step": 51695 + }, + { + "epoch": 2.53, + "grad_norm": 0.7685613632202148, + "learning_rate": 3.5091632439717065e-05, + "loss": 3.0688, + "step": 51696 + }, + { + "epoch": 2.53, + "grad_norm": 0.7986699342727661, + "learning_rate": 3.5084407040625094e-05, + "loss": 2.8336, + "step": 51697 + }, + { + "epoch": 2.53, + "grad_norm": 0.8661927580833435, + "learning_rate": 3.507718233926989e-05, + "loss": 2.9024, + "step": 51698 + }, + { + "epoch": 2.53, + "grad_norm": 0.8043288588523865, + "learning_rate": 3.506995833567038e-05, + "loss": 2.8473, + "step": 51699 + }, + { + "epoch": 2.53, + "grad_norm": 0.7660233974456787, + "learning_rate": 3.5062735029845634e-05, + "loss": 2.9776, + "step": 51700 + }, + { + "epoch": 2.53, + "grad_norm": 0.8327121734619141, + "learning_rate": 3.505551242181458e-05, + "loss": 2.9838, + "step": 51701 + }, + { + "epoch": 2.53, + "grad_norm": 0.786426842212677, + "learning_rate": 3.5048290511596334e-05, + "loss": 2.7796, + "step": 51702 + }, + { + "epoch": 2.53, + "grad_norm": 0.7408648133277893, + "learning_rate": 3.5041069299209944e-05, + "loss": 3.0313, + "step": 51703 + }, + { + "epoch": 2.53, + "grad_norm": 0.7075411677360535, + "learning_rate": 3.50338487846743e-05, + "loss": 2.8537, + "step": 51704 + }, + { + "epoch": 2.53, + "grad_norm": 0.7919765114784241, + "learning_rate": 3.5026628968008616e-05, + "loss": 3.0262, + "step": 51705 + }, + { + "epoch": 2.53, + "grad_norm": 0.726375937461853, + "learning_rate": 3.501940984923175e-05, + "loss": 2.9436, + "step": 51706 + }, + { + "epoch": 2.53, + "grad_norm": 0.7896806597709656, + "learning_rate": 3.501219142836275e-05, + "loss": 3.012, + "step": 51707 + }, + { + "epoch": 2.53, + "grad_norm": 0.741202175617218, + "learning_rate": 3.500497370542066e-05, + "loss": 2.7639, + "step": 51708 + }, + { + "epoch": 2.53, + "grad_norm": 0.7486560940742493, + "learning_rate": 3.4997756680424404e-05, + "loss": 3.1071, + "step": 51709 + }, + { + "epoch": 2.53, + "grad_norm": 0.7122935652732849, + "learning_rate": 3.4990540353393136e-05, + "loss": 2.9733, + "step": 51710 + }, + { + "epoch": 2.53, + "grad_norm": 0.739435076713562, + "learning_rate": 3.498332472434574e-05, + "loss": 2.9056, + "step": 51711 + }, + { + "epoch": 2.53, + "grad_norm": 0.7047844529151917, + "learning_rate": 3.49761097933013e-05, + "loss": 2.8209, + "step": 51712 + }, + { + "epoch": 2.53, + "grad_norm": 0.7015252113342285, + "learning_rate": 3.4968895560278796e-05, + "loss": 2.9192, + "step": 51713 + }, + { + "epoch": 2.53, + "grad_norm": 0.8801385760307312, + "learning_rate": 3.4961682025297125e-05, + "loss": 3.0679, + "step": 51714 + }, + { + "epoch": 2.53, + "grad_norm": 0.7651318311691284, + "learning_rate": 3.495446918837545e-05, + "loss": 2.8485, + "step": 51715 + }, + { + "epoch": 2.53, + "grad_norm": 0.7404123544692993, + "learning_rate": 3.494725704953265e-05, + "loss": 2.8387, + "step": 51716 + }, + { + "epoch": 2.53, + "grad_norm": 0.7318333387374878, + "learning_rate": 3.494004560878776e-05, + "loss": 2.6693, + "step": 51717 + }, + { + "epoch": 2.53, + "grad_norm": 0.7572652697563171, + "learning_rate": 3.493283486615983e-05, + "loss": 2.8279, + "step": 51718 + }, + { + "epoch": 2.53, + "grad_norm": 0.7449601888656616, + "learning_rate": 3.4925624821667755e-05, + "loss": 2.6999, + "step": 51719 + }, + { + "epoch": 2.53, + "grad_norm": 0.7463000416755676, + "learning_rate": 3.4918415475330606e-05, + "loss": 2.8917, + "step": 51720 + }, + { + "epoch": 2.53, + "grad_norm": 0.7627390623092651, + "learning_rate": 3.491120682716735e-05, + "loss": 3.0798, + "step": 51721 + }, + { + "epoch": 2.53, + "grad_norm": 0.75359708070755, + "learning_rate": 3.490399887719689e-05, + "loss": 2.8438, + "step": 51722 + }, + { + "epoch": 2.53, + "grad_norm": 0.6942808628082275, + "learning_rate": 3.489679162543836e-05, + "loss": 2.7695, + "step": 51723 + }, + { + "epoch": 2.53, + "grad_norm": 0.7942733764648438, + "learning_rate": 3.488958507191057e-05, + "loss": 3.0674, + "step": 51724 + }, + { + "epoch": 2.53, + "grad_norm": 0.7533266544342041, + "learning_rate": 3.488237921663266e-05, + "loss": 3.0008, + "step": 51725 + }, + { + "epoch": 2.53, + "grad_norm": 0.760011613368988, + "learning_rate": 3.48751740596235e-05, + "loss": 2.8886, + "step": 51726 + }, + { + "epoch": 2.54, + "grad_norm": 0.7659701704978943, + "learning_rate": 3.4867969600902135e-05, + "loss": 2.9795, + "step": 51727 + }, + { + "epoch": 2.54, + "grad_norm": 0.7460302710533142, + "learning_rate": 3.4860765840487545e-05, + "loss": 2.9011, + "step": 51728 + }, + { + "epoch": 2.54, + "grad_norm": 0.7115795016288757, + "learning_rate": 3.485356277839859e-05, + "loss": 3.0382, + "step": 51729 + }, + { + "epoch": 2.54, + "grad_norm": 0.7279272675514221, + "learning_rate": 3.4846360414654384e-05, + "loss": 3.0661, + "step": 51730 + }, + { + "epoch": 2.54, + "grad_norm": 0.7116730809211731, + "learning_rate": 3.483915874927378e-05, + "loss": 3.0457, + "step": 51731 + }, + { + "epoch": 2.54, + "grad_norm": 0.7574999928474426, + "learning_rate": 3.4831957782275785e-05, + "loss": 3.053, + "step": 51732 + }, + { + "epoch": 2.54, + "grad_norm": 0.752925455570221, + "learning_rate": 3.4824757513679466e-05, + "loss": 2.9957, + "step": 51733 + }, + { + "epoch": 2.54, + "grad_norm": 0.7213291525840759, + "learning_rate": 3.481755794350367e-05, + "loss": 2.759, + "step": 51734 + }, + { + "epoch": 2.54, + "grad_norm": 0.7119690775871277, + "learning_rate": 3.481035907176741e-05, + "loss": 2.8161, + "step": 51735 + }, + { + "epoch": 2.54, + "grad_norm": 0.6793041229248047, + "learning_rate": 3.480316089848958e-05, + "loss": 2.7194, + "step": 51736 + }, + { + "epoch": 2.54, + "grad_norm": 0.7623587250709534, + "learning_rate": 3.479596342368919e-05, + "loss": 2.6806, + "step": 51737 + }, + { + "epoch": 2.54, + "grad_norm": 0.7153224349021912, + "learning_rate": 3.478876664738522e-05, + "loss": 2.838, + "step": 51738 + }, + { + "epoch": 2.54, + "grad_norm": 0.7242084741592407, + "learning_rate": 3.478157056959657e-05, + "loss": 2.9762, + "step": 51739 + }, + { + "epoch": 2.54, + "grad_norm": 0.7159818410873413, + "learning_rate": 3.477437519034225e-05, + "loss": 2.732, + "step": 51740 + }, + { + "epoch": 2.54, + "grad_norm": 0.7122927904129028, + "learning_rate": 3.4767180509641144e-05, + "loss": 2.8784, + "step": 51741 + }, + { + "epoch": 2.54, + "grad_norm": 0.745762825012207, + "learning_rate": 3.475998652751231e-05, + "loss": 2.8922, + "step": 51742 + }, + { + "epoch": 2.54, + "grad_norm": 0.7402251958847046, + "learning_rate": 3.4752793243974584e-05, + "loss": 3.2264, + "step": 51743 + }, + { + "epoch": 2.54, + "grad_norm": 0.7608932256698608, + "learning_rate": 3.4745600659046936e-05, + "loss": 2.9075, + "step": 51744 + }, + { + "epoch": 2.54, + "grad_norm": 0.7457864284515381, + "learning_rate": 3.473840877274837e-05, + "loss": 2.9522, + "step": 51745 + }, + { + "epoch": 2.54, + "grad_norm": 0.7215996384620667, + "learning_rate": 3.4731217585097716e-05, + "loss": 3.1533, + "step": 51746 + }, + { + "epoch": 2.54, + "grad_norm": 0.7540760636329651, + "learning_rate": 3.472402709611405e-05, + "loss": 3.0193, + "step": 51747 + }, + { + "epoch": 2.54, + "grad_norm": 0.7310689091682434, + "learning_rate": 3.471683730581619e-05, + "loss": 2.9448, + "step": 51748 + }, + { + "epoch": 2.54, + "grad_norm": 0.7167690992355347, + "learning_rate": 3.4709648214223166e-05, + "loss": 2.8068, + "step": 51749 + }, + { + "epoch": 2.54, + "grad_norm": 0.7109416127204895, + "learning_rate": 3.470245982135388e-05, + "loss": 2.8514, + "step": 51750 + }, + { + "epoch": 2.54, + "grad_norm": 0.7370871305465698, + "learning_rate": 3.469527212722719e-05, + "loss": 2.8947, + "step": 51751 + }, + { + "epoch": 2.54, + "grad_norm": 0.7531648278236389, + "learning_rate": 3.468808513186219e-05, + "loss": 2.9873, + "step": 51752 + }, + { + "epoch": 2.54, + "grad_norm": 0.7331883311271667, + "learning_rate": 3.468089883527762e-05, + "loss": 2.7971, + "step": 51753 + }, + { + "epoch": 2.54, + "grad_norm": 0.7884783744812012, + "learning_rate": 3.4673713237492496e-05, + "loss": 2.839, + "step": 51754 + }, + { + "epoch": 2.54, + "grad_norm": 0.7267898321151733, + "learning_rate": 3.4666528338525846e-05, + "loss": 2.8982, + "step": 51755 + }, + { + "epoch": 2.54, + "grad_norm": 0.7529955506324768, + "learning_rate": 3.465934413839645e-05, + "loss": 3.0185, + "step": 51756 + }, + { + "epoch": 2.54, + "grad_norm": 0.7315231561660767, + "learning_rate": 3.4652160637123327e-05, + "loss": 2.8829, + "step": 51757 + }, + { + "epoch": 2.54, + "grad_norm": 0.7301185131072998, + "learning_rate": 3.464497783472523e-05, + "loss": 2.7218, + "step": 51758 + }, + { + "epoch": 2.54, + "grad_norm": 0.7687699198722839, + "learning_rate": 3.463779573122124e-05, + "loss": 3.0069, + "step": 51759 + }, + { + "epoch": 2.54, + "grad_norm": 0.7508442401885986, + "learning_rate": 3.463061432663025e-05, + "loss": 3.0149, + "step": 51760 + }, + { + "epoch": 2.54, + "grad_norm": 0.7293921113014221, + "learning_rate": 3.462343362097113e-05, + "loss": 2.8922, + "step": 51761 + }, + { + "epoch": 2.54, + "grad_norm": 0.7335087656974792, + "learning_rate": 3.4616253614262846e-05, + "loss": 2.7265, + "step": 51762 + }, + { + "epoch": 2.54, + "grad_norm": 0.6986225247383118, + "learning_rate": 3.460907430652421e-05, + "loss": 3.1249, + "step": 51763 + }, + { + "epoch": 2.54, + "grad_norm": 0.7187238931655884, + "learning_rate": 3.4601895697774274e-05, + "loss": 2.8012, + "step": 51764 + }, + { + "epoch": 2.54, + "grad_norm": 0.7392198443412781, + "learning_rate": 3.4594717788031853e-05, + "loss": 2.8605, + "step": 51765 + }, + { + "epoch": 2.54, + "grad_norm": 0.7785656452178955, + "learning_rate": 3.458754057731581e-05, + "loss": 2.8712, + "step": 51766 + }, + { + "epoch": 2.54, + "grad_norm": 0.7567797899246216, + "learning_rate": 3.458036406564518e-05, + "loss": 2.7214, + "step": 51767 + }, + { + "epoch": 2.54, + "grad_norm": 0.8188661932945251, + "learning_rate": 3.4573188253038696e-05, + "loss": 3.044, + "step": 51768 + }, + { + "epoch": 2.54, + "grad_norm": 0.8261837959289551, + "learning_rate": 3.45660131395154e-05, + "loss": 2.8088, + "step": 51769 + }, + { + "epoch": 2.54, + "grad_norm": 0.7429391145706177, + "learning_rate": 3.4558838725094154e-05, + "loss": 2.9782, + "step": 51770 + }, + { + "epoch": 2.54, + "grad_norm": 0.7897228002548218, + "learning_rate": 3.45516650097939e-05, + "loss": 2.9229, + "step": 51771 + }, + { + "epoch": 2.54, + "grad_norm": 0.8158841729164124, + "learning_rate": 3.4544491993633415e-05, + "loss": 3.0301, + "step": 51772 + }, + { + "epoch": 2.54, + "grad_norm": 0.7363582253456116, + "learning_rate": 3.45373196766316e-05, + "loss": 3.0389, + "step": 51773 + }, + { + "epoch": 2.54, + "grad_norm": 0.7058192491531372, + "learning_rate": 3.453014805880744e-05, + "loss": 2.7813, + "step": 51774 + }, + { + "epoch": 2.54, + "grad_norm": 0.7266924977302551, + "learning_rate": 3.452297714017981e-05, + "loss": 2.8937, + "step": 51775 + }, + { + "epoch": 2.54, + "grad_norm": 0.7514150738716125, + "learning_rate": 3.45158069207675e-05, + "loss": 3.0089, + "step": 51776 + }, + { + "epoch": 2.54, + "grad_norm": 0.7654863595962524, + "learning_rate": 3.4508637400589526e-05, + "loss": 2.95, + "step": 51777 + }, + { + "epoch": 2.54, + "grad_norm": 0.7285271286964417, + "learning_rate": 3.45014685796647e-05, + "loss": 2.9363, + "step": 51778 + }, + { + "epoch": 2.54, + "grad_norm": 0.7652084827423096, + "learning_rate": 3.4494300458011856e-05, + "loss": 2.9796, + "step": 51779 + }, + { + "epoch": 2.54, + "grad_norm": 0.7151200771331787, + "learning_rate": 3.448713303564997e-05, + "loss": 3.1283, + "step": 51780 + }, + { + "epoch": 2.54, + "grad_norm": 0.7224323749542236, + "learning_rate": 3.447996631259783e-05, + "loss": 3.054, + "step": 51781 + }, + { + "epoch": 2.54, + "grad_norm": 0.7343427538871765, + "learning_rate": 3.447280028887445e-05, + "loss": 2.9277, + "step": 51782 + }, + { + "epoch": 2.54, + "grad_norm": 0.7346922755241394, + "learning_rate": 3.446563496449851e-05, + "loss": 3.0434, + "step": 51783 + }, + { + "epoch": 2.54, + "grad_norm": 0.7594969868659973, + "learning_rate": 3.445847033948901e-05, + "loss": 2.8904, + "step": 51784 + }, + { + "epoch": 2.54, + "grad_norm": 0.7676674723625183, + "learning_rate": 3.445130641386482e-05, + "loss": 2.7869, + "step": 51785 + }, + { + "epoch": 2.54, + "grad_norm": 0.7546485066413879, + "learning_rate": 3.44441431876448e-05, + "loss": 2.9141, + "step": 51786 + }, + { + "epoch": 2.54, + "grad_norm": 0.7444491982460022, + "learning_rate": 3.4436980660847836e-05, + "loss": 2.9501, + "step": 51787 + }, + { + "epoch": 2.54, + "grad_norm": 0.7077409625053406, + "learning_rate": 3.442981883349264e-05, + "loss": 2.9668, + "step": 51788 + }, + { + "epoch": 2.54, + "grad_norm": 0.7607021331787109, + "learning_rate": 3.442265770559823e-05, + "loss": 2.7839, + "step": 51789 + }, + { + "epoch": 2.54, + "grad_norm": 0.790345311164856, + "learning_rate": 3.441549727718347e-05, + "loss": 2.909, + "step": 51790 + }, + { + "epoch": 2.54, + "grad_norm": 0.7288457155227661, + "learning_rate": 3.440833754826713e-05, + "loss": 2.85, + "step": 51791 + }, + { + "epoch": 2.54, + "grad_norm": 0.7285025119781494, + "learning_rate": 3.4401178518868163e-05, + "loss": 3.0604, + "step": 51792 + }, + { + "epoch": 2.54, + "grad_norm": 0.7310810685157776, + "learning_rate": 3.4394020189005364e-05, + "loss": 2.8663, + "step": 51793 + }, + { + "epoch": 2.54, + "grad_norm": 0.725584089756012, + "learning_rate": 3.438686255869758e-05, + "loss": 3.018, + "step": 51794 + }, + { + "epoch": 2.54, + "grad_norm": 0.7113367319107056, + "learning_rate": 3.43797056279637e-05, + "loss": 2.9908, + "step": 51795 + }, + { + "epoch": 2.54, + "grad_norm": 0.8494749665260315, + "learning_rate": 3.4372549396822526e-05, + "loss": 2.9683, + "step": 51796 + }, + { + "epoch": 2.54, + "grad_norm": 0.7407485246658325, + "learning_rate": 3.436539386529299e-05, + "loss": 2.7897, + "step": 51797 + }, + { + "epoch": 2.54, + "grad_norm": 0.7727249264717102, + "learning_rate": 3.4358239033393806e-05, + "loss": 2.8892, + "step": 51798 + }, + { + "epoch": 2.54, + "grad_norm": 0.8405013084411621, + "learning_rate": 3.43510849011439e-05, + "loss": 3.0491, + "step": 51799 + }, + { + "epoch": 2.54, + "grad_norm": 0.7366344332695007, + "learning_rate": 3.434393146856218e-05, + "loss": 2.8452, + "step": 51800 + }, + { + "epoch": 2.54, + "grad_norm": 0.7424528002738953, + "learning_rate": 3.433677873566743e-05, + "loss": 2.9742, + "step": 51801 + }, + { + "epoch": 2.54, + "grad_norm": 0.721091091632843, + "learning_rate": 3.4329626702478455e-05, + "loss": 2.9626, + "step": 51802 + }, + { + "epoch": 2.54, + "grad_norm": 0.770412027835846, + "learning_rate": 3.432247536901408e-05, + "loss": 2.8381, + "step": 51803 + }, + { + "epoch": 2.54, + "grad_norm": 0.7273968458175659, + "learning_rate": 3.431532473529314e-05, + "loss": 2.6277, + "step": 51804 + }, + { + "epoch": 2.54, + "grad_norm": 0.7612076997756958, + "learning_rate": 3.430817480133458e-05, + "loss": 2.9437, + "step": 51805 + }, + { + "epoch": 2.54, + "grad_norm": 0.7737910151481628, + "learning_rate": 3.4301025567157084e-05, + "loss": 3.02, + "step": 51806 + }, + { + "epoch": 2.54, + "grad_norm": 0.8094823956489563, + "learning_rate": 3.429387703277964e-05, + "loss": 2.9737, + "step": 51807 + }, + { + "epoch": 2.54, + "grad_norm": 0.7228150963783264, + "learning_rate": 3.428672919822096e-05, + "loss": 2.7584, + "step": 51808 + }, + { + "epoch": 2.54, + "grad_norm": 0.7346405982971191, + "learning_rate": 3.4279582063499845e-05, + "loss": 2.948, + "step": 51809 + }, + { + "epoch": 2.54, + "grad_norm": 0.7100589275360107, + "learning_rate": 3.427243562863523e-05, + "loss": 2.7957, + "step": 51810 + }, + { + "epoch": 2.54, + "grad_norm": 0.7395407557487488, + "learning_rate": 3.426528989364584e-05, + "loss": 2.9512, + "step": 51811 + }, + { + "epoch": 2.54, + "grad_norm": 0.7701348066329956, + "learning_rate": 3.425814485855056e-05, + "loss": 3.0815, + "step": 51812 + }, + { + "epoch": 2.54, + "grad_norm": 0.8105944991111755, + "learning_rate": 3.425100052336814e-05, + "loss": 2.8488, + "step": 51813 + }, + { + "epoch": 2.54, + "grad_norm": 0.7398055791854858, + "learning_rate": 3.424385688811751e-05, + "loss": 2.9291, + "step": 51814 + }, + { + "epoch": 2.54, + "grad_norm": 0.7211189866065979, + "learning_rate": 3.4236713952817394e-05, + "loss": 2.9474, + "step": 51815 + }, + { + "epoch": 2.54, + "grad_norm": 0.7564665675163269, + "learning_rate": 3.4229571717486606e-05, + "loss": 2.7423, + "step": 51816 + }, + { + "epoch": 2.54, + "grad_norm": 0.7213353514671326, + "learning_rate": 3.4222430182144015e-05, + "loss": 2.8272, + "step": 51817 + }, + { + "epoch": 2.54, + "grad_norm": 0.7244040966033936, + "learning_rate": 3.42152893468083e-05, + "loss": 2.907, + "step": 51818 + }, + { + "epoch": 2.54, + "grad_norm": 0.6921009421348572, + "learning_rate": 3.420814921149847e-05, + "loss": 2.9494, + "step": 51819 + }, + { + "epoch": 2.54, + "grad_norm": 0.7488122582435608, + "learning_rate": 3.420100977623312e-05, + "loss": 2.8671, + "step": 51820 + }, + { + "epoch": 2.54, + "grad_norm": 0.7758278846740723, + "learning_rate": 3.419387104103118e-05, + "loss": 2.7779, + "step": 51821 + }, + { + "epoch": 2.54, + "grad_norm": 0.7232535481452942, + "learning_rate": 3.41867330059115e-05, + "loss": 2.8373, + "step": 51822 + }, + { + "epoch": 2.54, + "grad_norm": 0.716827392578125, + "learning_rate": 3.41795956708928e-05, + "loss": 2.9449, + "step": 51823 + }, + { + "epoch": 2.54, + "grad_norm": 0.7418515086174011, + "learning_rate": 3.417245903599389e-05, + "loss": 2.9679, + "step": 51824 + }, + { + "epoch": 2.54, + "grad_norm": 0.7049885392189026, + "learning_rate": 3.41653231012335e-05, + "loss": 2.8945, + "step": 51825 + }, + { + "epoch": 2.54, + "grad_norm": 0.757987916469574, + "learning_rate": 3.415818786663047e-05, + "loss": 2.8117, + "step": 51826 + }, + { + "epoch": 2.54, + "grad_norm": 0.7701760530471802, + "learning_rate": 3.415105333220369e-05, + "loss": 2.8238, + "step": 51827 + }, + { + "epoch": 2.54, + "grad_norm": 0.75865238904953, + "learning_rate": 3.414391949797182e-05, + "loss": 3.0257, + "step": 51828 + }, + { + "epoch": 2.54, + "grad_norm": 0.698841392993927, + "learning_rate": 3.413678636395373e-05, + "loss": 2.8413, + "step": 51829 + }, + { + "epoch": 2.54, + "grad_norm": 0.7989115715026855, + "learning_rate": 3.412965393016821e-05, + "loss": 2.8112, + "step": 51830 + }, + { + "epoch": 2.54, + "grad_norm": 0.6878560185432434, + "learning_rate": 3.412252219663392e-05, + "loss": 2.8933, + "step": 51831 + }, + { + "epoch": 2.54, + "grad_norm": 0.7924463748931885, + "learning_rate": 3.411539116336983e-05, + "loss": 2.914, + "step": 51832 + }, + { + "epoch": 2.54, + "grad_norm": 0.7778879404067993, + "learning_rate": 3.410826083039454e-05, + "loss": 2.887, + "step": 51833 + }, + { + "epoch": 2.54, + "grad_norm": 0.7213044762611389, + "learning_rate": 3.4101131197726996e-05, + "loss": 2.6942, + "step": 51834 + }, + { + "epoch": 2.54, + "grad_norm": 0.7068144679069519, + "learning_rate": 3.4094002265385846e-05, + "loss": 2.758, + "step": 51835 + }, + { + "epoch": 2.54, + "grad_norm": 0.7102383375167847, + "learning_rate": 3.4086874033389974e-05, + "loss": 3.0547, + "step": 51836 + }, + { + "epoch": 2.54, + "grad_norm": 0.7071000337600708, + "learning_rate": 3.407974650175811e-05, + "loss": 2.9646, + "step": 51837 + }, + { + "epoch": 2.54, + "grad_norm": 0.7384864687919617, + "learning_rate": 3.4072619670508925e-05, + "loss": 2.7728, + "step": 51838 + }, + { + "epoch": 2.54, + "grad_norm": 0.739606499671936, + "learning_rate": 3.406549353966135e-05, + "loss": 2.8419, + "step": 51839 + }, + { + "epoch": 2.54, + "grad_norm": 0.8046254515647888, + "learning_rate": 3.405836810923406e-05, + "loss": 3.0422, + "step": 51840 + }, + { + "epoch": 2.54, + "grad_norm": 0.7787489891052246, + "learning_rate": 3.405124337924581e-05, + "loss": 3.0144, + "step": 51841 + }, + { + "epoch": 2.54, + "grad_norm": 0.7587060332298279, + "learning_rate": 3.404411934971548e-05, + "loss": 3.011, + "step": 51842 + }, + { + "epoch": 2.54, + "grad_norm": 0.7384954690933228, + "learning_rate": 3.403699602066167e-05, + "loss": 2.9519, + "step": 51843 + }, + { + "epoch": 2.54, + "grad_norm": 0.7247627973556519, + "learning_rate": 3.402987339210331e-05, + "loss": 2.9142, + "step": 51844 + }, + { + "epoch": 2.54, + "grad_norm": 0.7663771510124207, + "learning_rate": 3.402275146405907e-05, + "loss": 2.8239, + "step": 51845 + }, + { + "epoch": 2.54, + "grad_norm": 0.7334901094436646, + "learning_rate": 3.401563023654766e-05, + "loss": 3.017, + "step": 51846 + }, + { + "epoch": 2.54, + "grad_norm": 0.7264374494552612, + "learning_rate": 3.400850970958797e-05, + "loss": 2.8086, + "step": 51847 + }, + { + "epoch": 2.54, + "grad_norm": 0.7451301217079163, + "learning_rate": 3.4001389883198584e-05, + "loss": 2.8351, + "step": 51848 + }, + { + "epoch": 2.54, + "grad_norm": 0.7210157513618469, + "learning_rate": 3.3994270757398415e-05, + "loss": 2.8837, + "step": 51849 + }, + { + "epoch": 2.54, + "grad_norm": 0.7271472811698914, + "learning_rate": 3.398715233220608e-05, + "loss": 2.8446, + "step": 51850 + }, + { + "epoch": 2.54, + "grad_norm": 0.7448669075965881, + "learning_rate": 3.398003460764044e-05, + "loss": 2.896, + "step": 51851 + }, + { + "epoch": 2.54, + "grad_norm": 0.7264134287834167, + "learning_rate": 3.397291758372021e-05, + "loss": 2.7349, + "step": 51852 + }, + { + "epoch": 2.54, + "grad_norm": 0.7169459462165833, + "learning_rate": 3.396580126046403e-05, + "loss": 3.0358, + "step": 51853 + }, + { + "epoch": 2.54, + "grad_norm": 0.7395771145820618, + "learning_rate": 3.395868563789084e-05, + "loss": 2.7717, + "step": 51854 + }, + { + "epoch": 2.54, + "grad_norm": 0.7105684876441956, + "learning_rate": 3.395157071601917e-05, + "loss": 2.9395, + "step": 51855 + }, + { + "epoch": 2.54, + "grad_norm": 0.7309338450431824, + "learning_rate": 3.394445649486785e-05, + "loss": 2.8171, + "step": 51856 + }, + { + "epoch": 2.54, + "grad_norm": 0.7164181470870972, + "learning_rate": 3.393734297445574e-05, + "loss": 2.8214, + "step": 51857 + }, + { + "epoch": 2.54, + "grad_norm": 0.7518510818481445, + "learning_rate": 3.393023015480144e-05, + "loss": 2.8898, + "step": 51858 + }, + { + "epoch": 2.54, + "grad_norm": 0.7415131330490112, + "learning_rate": 3.392311803592368e-05, + "loss": 2.9758, + "step": 51859 + }, + { + "epoch": 2.54, + "grad_norm": 0.746828556060791, + "learning_rate": 3.3916006617841175e-05, + "loss": 2.955, + "step": 51860 + }, + { + "epoch": 2.54, + "grad_norm": 0.7056711316108704, + "learning_rate": 3.390889590057271e-05, + "loss": 2.8957, + "step": 51861 + }, + { + "epoch": 2.54, + "grad_norm": 0.8097109198570251, + "learning_rate": 3.390178588413707e-05, + "loss": 2.9484, + "step": 51862 + }, + { + "epoch": 2.54, + "grad_norm": 0.7409765720367432, + "learning_rate": 3.3894676568552834e-05, + "loss": 2.7206, + "step": 51863 + }, + { + "epoch": 2.54, + "grad_norm": 0.8118706345558167, + "learning_rate": 3.388756795383887e-05, + "loss": 2.8796, + "step": 51864 + }, + { + "epoch": 2.54, + "grad_norm": 0.744475781917572, + "learning_rate": 3.3880460040013815e-05, + "loss": 2.8114, + "step": 51865 + }, + { + "epoch": 2.54, + "grad_norm": 0.7227175831794739, + "learning_rate": 3.387335282709642e-05, + "loss": 3.1192, + "step": 51866 + }, + { + "epoch": 2.54, + "grad_norm": 0.7669481635093689, + "learning_rate": 3.3866246315105415e-05, + "loss": 2.7376, + "step": 51867 + }, + { + "epoch": 2.54, + "grad_norm": 0.7482892274856567, + "learning_rate": 3.3859140504059446e-05, + "loss": 2.8004, + "step": 51868 + }, + { + "epoch": 2.54, + "grad_norm": 0.7517153024673462, + "learning_rate": 3.385203539397733e-05, + "loss": 2.7044, + "step": 51869 + }, + { + "epoch": 2.54, + "grad_norm": 0.7399962544441223, + "learning_rate": 3.384493098487768e-05, + "loss": 3.0025, + "step": 51870 + }, + { + "epoch": 2.54, + "grad_norm": 0.7390328049659729, + "learning_rate": 3.383782727677926e-05, + "loss": 2.8516, + "step": 51871 + }, + { + "epoch": 2.54, + "grad_norm": 0.7233524322509766, + "learning_rate": 3.383072426970085e-05, + "loss": 2.804, + "step": 51872 + }, + { + "epoch": 2.54, + "grad_norm": 0.7317050695419312, + "learning_rate": 3.382362196366106e-05, + "loss": 2.9562, + "step": 51873 + }, + { + "epoch": 2.54, + "grad_norm": 0.7686887383460999, + "learning_rate": 3.3816520358678655e-05, + "loss": 2.997, + "step": 51874 + }, + { + "epoch": 2.54, + "grad_norm": 0.7436739802360535, + "learning_rate": 3.3809419454772214e-05, + "loss": 2.8637, + "step": 51875 + }, + { + "epoch": 2.54, + "grad_norm": 0.701702892780304, + "learning_rate": 3.3802319251960554e-05, + "loss": 2.7465, + "step": 51876 + }, + { + "epoch": 2.54, + "grad_norm": 0.7306755185127258, + "learning_rate": 3.379521975026239e-05, + "loss": 2.7137, + "step": 51877 + }, + { + "epoch": 2.54, + "grad_norm": 0.7624945044517517, + "learning_rate": 3.378812094969635e-05, + "loss": 3.029, + "step": 51878 + }, + { + "epoch": 2.54, + "grad_norm": 0.7094208002090454, + "learning_rate": 3.378102285028121e-05, + "loss": 3.0725, + "step": 51879 + }, + { + "epoch": 2.54, + "grad_norm": 0.7654349207878113, + "learning_rate": 3.377392545203567e-05, + "loss": 3.1119, + "step": 51880 + }, + { + "epoch": 2.54, + "grad_norm": 0.7229974865913391, + "learning_rate": 3.376682875497826e-05, + "loss": 2.8137, + "step": 51881 + }, + { + "epoch": 2.54, + "grad_norm": 0.7172889709472656, + "learning_rate": 3.375973275912784e-05, + "loss": 2.8475, + "step": 51882 + }, + { + "epoch": 2.54, + "grad_norm": 0.7357591986656189, + "learning_rate": 3.375263746450301e-05, + "loss": 2.9436, + "step": 51883 + }, + { + "epoch": 2.54, + "grad_norm": 0.7437008619308472, + "learning_rate": 3.374554287112256e-05, + "loss": 3.1229, + "step": 51884 + }, + { + "epoch": 2.54, + "grad_norm": 0.7146857976913452, + "learning_rate": 3.373844897900504e-05, + "loss": 2.7566, + "step": 51885 + }, + { + "epoch": 2.54, + "grad_norm": 0.7032164335250854, + "learning_rate": 3.3731355788169176e-05, + "loss": 2.8108, + "step": 51886 + }, + { + "epoch": 2.54, + "grad_norm": 0.738189697265625, + "learning_rate": 3.372426329863377e-05, + "loss": 2.9045, + "step": 51887 + }, + { + "epoch": 2.54, + "grad_norm": 0.7533491253852844, + "learning_rate": 3.3717171510417396e-05, + "loss": 2.9161, + "step": 51888 + }, + { + "epoch": 2.54, + "grad_norm": 0.7233903408050537, + "learning_rate": 3.371008042353874e-05, + "loss": 2.9135, + "step": 51889 + }, + { + "epoch": 2.54, + "grad_norm": 0.7648455500602722, + "learning_rate": 3.370299003801643e-05, + "loss": 2.9126, + "step": 51890 + }, + { + "epoch": 2.54, + "grad_norm": 0.742719829082489, + "learning_rate": 3.369590035386921e-05, + "loss": 2.9763, + "step": 51891 + }, + { + "epoch": 2.54, + "grad_norm": 0.7518740892410278, + "learning_rate": 3.368881137111577e-05, + "loss": 3.0087, + "step": 51892 + }, + { + "epoch": 2.54, + "grad_norm": 0.7219116687774658, + "learning_rate": 3.3681723089774714e-05, + "loss": 3.0071, + "step": 51893 + }, + { + "epoch": 2.54, + "grad_norm": 0.7408004403114319, + "learning_rate": 3.367463550986478e-05, + "loss": 2.9956, + "step": 51894 + }, + { + "epoch": 2.54, + "grad_norm": 0.7436817288398743, + "learning_rate": 3.3667548631404604e-05, + "loss": 2.7571, + "step": 51895 + }, + { + "epoch": 2.54, + "grad_norm": 0.6922502517700195, + "learning_rate": 3.366046245441282e-05, + "loss": 2.9126, + "step": 51896 + }, + { + "epoch": 2.54, + "grad_norm": 0.7235960364341736, + "learning_rate": 3.365337697890813e-05, + "loss": 2.8327, + "step": 51897 + }, + { + "epoch": 2.54, + "grad_norm": 0.7371718287467957, + "learning_rate": 3.3646292204909164e-05, + "loss": 2.8076, + "step": 51898 + }, + { + "epoch": 2.54, + "grad_norm": 0.7488693594932556, + "learning_rate": 3.3639208132434635e-05, + "loss": 2.727, + "step": 51899 + }, + { + "epoch": 2.54, + "grad_norm": 0.7194048762321472, + "learning_rate": 3.363212476150313e-05, + "loss": 2.8375, + "step": 51900 + }, + { + "epoch": 2.54, + "grad_norm": 0.7807648777961731, + "learning_rate": 3.3625042092133406e-05, + "loss": 2.9876, + "step": 51901 + }, + { + "epoch": 2.54, + "grad_norm": 0.741208016872406, + "learning_rate": 3.3617960124344e-05, + "loss": 2.77, + "step": 51902 + }, + { + "epoch": 2.54, + "grad_norm": 0.7476096749305725, + "learning_rate": 3.361087885815368e-05, + "loss": 3.1433, + "step": 51903 + }, + { + "epoch": 2.54, + "grad_norm": 0.8125763535499573, + "learning_rate": 3.3603798293581016e-05, + "loss": 2.8455, + "step": 51904 + }, + { + "epoch": 2.54, + "grad_norm": 0.7425322532653809, + "learning_rate": 3.359671843064464e-05, + "loss": 3.098, + "step": 51905 + }, + { + "epoch": 2.54, + "grad_norm": 0.7436474561691284, + "learning_rate": 3.358963926936327e-05, + "loss": 2.9924, + "step": 51906 + }, + { + "epoch": 2.54, + "grad_norm": 0.8901980519294739, + "learning_rate": 3.358256080975548e-05, + "loss": 2.8039, + "step": 51907 + }, + { + "epoch": 2.54, + "grad_norm": 0.7209775447845459, + "learning_rate": 3.357548305183996e-05, + "loss": 3.2191, + "step": 51908 + }, + { + "epoch": 2.54, + "grad_norm": 0.7874775528907776, + "learning_rate": 3.3568405995635393e-05, + "loss": 2.9348, + "step": 51909 + }, + { + "epoch": 2.54, + "grad_norm": 0.7514326572418213, + "learning_rate": 3.3561329641160376e-05, + "loss": 3.0049, + "step": 51910 + }, + { + "epoch": 2.54, + "grad_norm": 0.7351415157318115, + "learning_rate": 3.3554253988433513e-05, + "loss": 2.8308, + "step": 51911 + }, + { + "epoch": 2.54, + "grad_norm": 0.7183605432510376, + "learning_rate": 3.354717903747343e-05, + "loss": 2.9062, + "step": 51912 + }, + { + "epoch": 2.54, + "grad_norm": 1.2949225902557373, + "learning_rate": 3.354010478829881e-05, + "loss": 3.0845, + "step": 51913 + }, + { + "epoch": 2.54, + "grad_norm": 0.7274050116539001, + "learning_rate": 3.35330312409283e-05, + "loss": 3.066, + "step": 51914 + }, + { + "epoch": 2.54, + "grad_norm": 0.7558318376541138, + "learning_rate": 3.352595839538046e-05, + "loss": 2.9214, + "step": 51915 + }, + { + "epoch": 2.54, + "grad_norm": 0.7233791351318359, + "learning_rate": 3.3518886251674004e-05, + "loss": 2.8911, + "step": 51916 + }, + { + "epoch": 2.54, + "grad_norm": 0.7367327809333801, + "learning_rate": 3.351181480982755e-05, + "loss": 2.8778, + "step": 51917 + }, + { + "epoch": 2.54, + "grad_norm": 0.7649989128112793, + "learning_rate": 3.350474406985959e-05, + "loss": 2.802, + "step": 51918 + }, + { + "epoch": 2.54, + "grad_norm": 0.7352694869041443, + "learning_rate": 3.3497674031788935e-05, + "loss": 2.871, + "step": 51919 + }, + { + "epoch": 2.54, + "grad_norm": 0.7183537483215332, + "learning_rate": 3.3490604695634073e-05, + "loss": 2.9789, + "step": 51920 + }, + { + "epoch": 2.54, + "grad_norm": 0.7267533540725708, + "learning_rate": 3.348353606141369e-05, + "loss": 2.9397, + "step": 51921 + }, + { + "epoch": 2.54, + "grad_norm": 0.7344212532043457, + "learning_rate": 3.347646812914633e-05, + "loss": 2.9678, + "step": 51922 + }, + { + "epoch": 2.54, + "grad_norm": 0.7288460731506348, + "learning_rate": 3.346940089885065e-05, + "loss": 3.0565, + "step": 51923 + }, + { + "epoch": 2.54, + "grad_norm": 0.77585369348526, + "learning_rate": 3.346233437054538e-05, + "loss": 2.9148, + "step": 51924 + }, + { + "epoch": 2.54, + "grad_norm": 0.7512287497520447, + "learning_rate": 3.345526854424898e-05, + "loss": 2.9999, + "step": 51925 + }, + { + "epoch": 2.54, + "grad_norm": 0.7456086874008179, + "learning_rate": 3.344820341998012e-05, + "loss": 3.0275, + "step": 51926 + }, + { + "epoch": 2.54, + "grad_norm": 0.7627673149108887, + "learning_rate": 3.344113899775732e-05, + "loss": 2.8022, + "step": 51927 + }, + { + "epoch": 2.54, + "grad_norm": 0.701185405254364, + "learning_rate": 3.343407527759929e-05, + "loss": 2.6771, + "step": 51928 + }, + { + "epoch": 2.54, + "grad_norm": 0.7408918738365173, + "learning_rate": 3.3427012259524623e-05, + "loss": 2.8085, + "step": 51929 + }, + { + "epoch": 2.54, + "grad_norm": 0.7534040212631226, + "learning_rate": 3.341994994355186e-05, + "loss": 3.0257, + "step": 51930 + }, + { + "epoch": 2.55, + "grad_norm": 0.7601995468139648, + "learning_rate": 3.341288832969973e-05, + "loss": 2.8248, + "step": 51931 + }, + { + "epoch": 2.55, + "grad_norm": 0.7661672830581665, + "learning_rate": 3.340582741798671e-05, + "loss": 3.0491, + "step": 51932 + }, + { + "epoch": 2.55, + "grad_norm": 0.7597264051437378, + "learning_rate": 3.339876720843139e-05, + "loss": 2.9227, + "step": 51933 + }, + { + "epoch": 2.55, + "grad_norm": 0.726146936416626, + "learning_rate": 3.3391707701052484e-05, + "loss": 2.941, + "step": 51934 + }, + { + "epoch": 2.55, + "grad_norm": 0.7189964652061462, + "learning_rate": 3.338464889586842e-05, + "loss": 2.6657, + "step": 51935 + }, + { + "epoch": 2.55, + "grad_norm": 0.7691967487335205, + "learning_rate": 3.337759079289797e-05, + "loss": 2.7853, + "step": 51936 + }, + { + "epoch": 2.55, + "grad_norm": 0.7658074498176575, + "learning_rate": 3.3370533392159557e-05, + "loss": 2.9422, + "step": 51937 + }, + { + "epoch": 2.55, + "grad_norm": 0.7648226618766785, + "learning_rate": 3.336347669367191e-05, + "loss": 2.8923, + "step": 51938 + }, + { + "epoch": 2.55, + "grad_norm": 0.7221632599830627, + "learning_rate": 3.3356420697453535e-05, + "loss": 2.7206, + "step": 51939 + }, + { + "epoch": 2.55, + "grad_norm": 0.7435133457183838, + "learning_rate": 3.3349365403522986e-05, + "loss": 2.8739, + "step": 51940 + }, + { + "epoch": 2.55, + "grad_norm": 0.744724690914154, + "learning_rate": 3.3342310811898923e-05, + "loss": 2.8973, + "step": 51941 + }, + { + "epoch": 2.55, + "grad_norm": 0.7191993594169617, + "learning_rate": 3.3335256922599886e-05, + "loss": 2.7973, + "step": 51942 + }, + { + "epoch": 2.55, + "grad_norm": 0.7432810068130493, + "learning_rate": 3.33282037356444e-05, + "loss": 2.9852, + "step": 51943 + }, + { + "epoch": 2.55, + "grad_norm": 0.7368122339248657, + "learning_rate": 3.3321151251051216e-05, + "loss": 3.0071, + "step": 51944 + }, + { + "epoch": 2.55, + "grad_norm": 0.7477759122848511, + "learning_rate": 3.331409946883872e-05, + "loss": 2.988, + "step": 51945 + }, + { + "epoch": 2.55, + "grad_norm": 0.8247474431991577, + "learning_rate": 3.330704838902563e-05, + "loss": 2.8604, + "step": 51946 + }, + { + "epoch": 2.55, + "grad_norm": 0.7637084126472473, + "learning_rate": 3.3299998011630424e-05, + "loss": 2.9437, + "step": 51947 + }, + { + "epoch": 2.55, + "grad_norm": 0.7311221361160278, + "learning_rate": 3.3292948336671635e-05, + "loss": 2.9512, + "step": 51948 + }, + { + "epoch": 2.55, + "grad_norm": 0.7546210885047913, + "learning_rate": 3.328589936416797e-05, + "loss": 2.8564, + "step": 51949 + }, + { + "epoch": 2.55, + "grad_norm": 0.7457635402679443, + "learning_rate": 3.327885109413785e-05, + "loss": 2.8564, + "step": 51950 + }, + { + "epoch": 2.55, + "grad_norm": 0.7271265983581543, + "learning_rate": 3.327180352659996e-05, + "loss": 3.0531, + "step": 51951 + }, + { + "epoch": 2.55, + "grad_norm": 0.7246139049530029, + "learning_rate": 3.326475666157275e-05, + "loss": 2.6776, + "step": 51952 + }, + { + "epoch": 2.55, + "grad_norm": 0.7810348868370056, + "learning_rate": 3.325771049907492e-05, + "loss": 2.8616, + "step": 51953 + }, + { + "epoch": 2.55, + "grad_norm": 0.7812083959579468, + "learning_rate": 3.32506650391249e-05, + "loss": 3.0086, + "step": 51954 + }, + { + "epoch": 2.55, + "grad_norm": 0.7078095078468323, + "learning_rate": 3.3243620281741254e-05, + "loss": 2.9684, + "step": 51955 + }, + { + "epoch": 2.55, + "grad_norm": 0.7382186651229858, + "learning_rate": 3.323657622694264e-05, + "loss": 3.0665, + "step": 51956 + }, + { + "epoch": 2.55, + "grad_norm": 0.7355473041534424, + "learning_rate": 3.322953287474749e-05, + "loss": 2.7563, + "step": 51957 + }, + { + "epoch": 2.55, + "grad_norm": 0.7071644067764282, + "learning_rate": 3.32224902251744e-05, + "loss": 2.8823, + "step": 51958 + }, + { + "epoch": 2.55, + "grad_norm": 0.7337292432785034, + "learning_rate": 3.321544827824196e-05, + "loss": 2.8112, + "step": 51959 + }, + { + "epoch": 2.55, + "grad_norm": 0.7419911026954651, + "learning_rate": 3.320840703396872e-05, + "loss": 2.8555, + "step": 51960 + }, + { + "epoch": 2.55, + "grad_norm": 0.7350503206253052, + "learning_rate": 3.3201366492373195e-05, + "loss": 3.1303, + "step": 51961 + }, + { + "epoch": 2.55, + "grad_norm": 0.7591217160224915, + "learning_rate": 3.3194326653473836e-05, + "loss": 2.8945, + "step": 51962 + }, + { + "epoch": 2.55, + "grad_norm": 0.7995786666870117, + "learning_rate": 3.318728751728929e-05, + "loss": 2.9841, + "step": 51963 + }, + { + "epoch": 2.55, + "grad_norm": 0.7559335231781006, + "learning_rate": 3.318024908383816e-05, + "loss": 2.9551, + "step": 51964 + }, + { + "epoch": 2.55, + "grad_norm": 0.7337844371795654, + "learning_rate": 3.3173211353138816e-05, + "loss": 3.106, + "step": 51965 + }, + { + "epoch": 2.55, + "grad_norm": 0.7507258057594299, + "learning_rate": 3.316617432520995e-05, + "loss": 2.6799, + "step": 51966 + }, + { + "epoch": 2.55, + "grad_norm": 0.7264440059661865, + "learning_rate": 3.315913800006994e-05, + "loss": 2.8526, + "step": 51967 + }, + { + "epoch": 2.55, + "grad_norm": 0.7675163745880127, + "learning_rate": 3.315210237773751e-05, + "loss": 3.0471, + "step": 51968 + }, + { + "epoch": 2.55, + "grad_norm": 0.7322779893875122, + "learning_rate": 3.314506745823108e-05, + "loss": 2.9175, + "step": 51969 + }, + { + "epoch": 2.55, + "grad_norm": 0.7262869477272034, + "learning_rate": 3.31380332415691e-05, + "loss": 2.886, + "step": 51970 + }, + { + "epoch": 2.55, + "grad_norm": 0.7132663726806641, + "learning_rate": 3.313099972777025e-05, + "loss": 2.7285, + "step": 51971 + }, + { + "epoch": 2.55, + "grad_norm": 0.7315619587898254, + "learning_rate": 3.3123966916852915e-05, + "loss": 2.9182, + "step": 51972 + }, + { + "epoch": 2.55, + "grad_norm": 0.7278238534927368, + "learning_rate": 3.3116934808835726e-05, + "loss": 2.7525, + "step": 51973 + }, + { + "epoch": 2.55, + "grad_norm": 0.7661489248275757, + "learning_rate": 3.310990340373719e-05, + "loss": 2.9219, + "step": 51974 + }, + { + "epoch": 2.55, + "grad_norm": 0.7610893845558167, + "learning_rate": 3.3102872701575836e-05, + "loss": 2.8953, + "step": 51975 + }, + { + "epoch": 2.55, + "grad_norm": 0.7103796601295471, + "learning_rate": 3.309584270237013e-05, + "loss": 2.8619, + "step": 51976 + }, + { + "epoch": 2.55, + "grad_norm": 0.733978271484375, + "learning_rate": 3.3088813406138545e-05, + "loss": 2.899, + "step": 51977 + }, + { + "epoch": 2.55, + "grad_norm": 0.7456706762313843, + "learning_rate": 3.308178481289976e-05, + "loss": 2.9013, + "step": 51978 + }, + { + "epoch": 2.55, + "grad_norm": 0.7410678863525391, + "learning_rate": 3.3074756922672085e-05, + "loss": 2.8654, + "step": 51979 + }, + { + "epoch": 2.55, + "grad_norm": 0.7431950569152832, + "learning_rate": 3.3067729735474115e-05, + "loss": 2.8395, + "step": 51980 + }, + { + "epoch": 2.55, + "grad_norm": 0.715957522392273, + "learning_rate": 3.306070325132446e-05, + "loss": 2.9237, + "step": 51981 + }, + { + "epoch": 2.55, + "grad_norm": 0.7735357284545898, + "learning_rate": 3.3053677470241544e-05, + "loss": 2.9148, + "step": 51982 + }, + { + "epoch": 2.55, + "grad_norm": 0.7633320689201355, + "learning_rate": 3.304665239224385e-05, + "loss": 2.6345, + "step": 51983 + }, + { + "epoch": 2.55, + "grad_norm": 0.6995530128479004, + "learning_rate": 3.303962801734984e-05, + "loss": 3.1224, + "step": 51984 + }, + { + "epoch": 2.55, + "grad_norm": 0.7412270903587341, + "learning_rate": 3.303260434557808e-05, + "loss": 2.9119, + "step": 51985 + }, + { + "epoch": 2.55, + "grad_norm": 0.7831922173500061, + "learning_rate": 3.302558137694711e-05, + "loss": 2.9452, + "step": 51986 + }, + { + "epoch": 2.55, + "grad_norm": 0.7183090448379517, + "learning_rate": 3.301855911147533e-05, + "loss": 2.9392, + "step": 51987 + }, + { + "epoch": 2.55, + "grad_norm": 0.7193189263343811, + "learning_rate": 3.3011537549181335e-05, + "loss": 3.1388, + "step": 51988 + }, + { + "epoch": 2.55, + "grad_norm": 0.7489942312240601, + "learning_rate": 3.3004516690083505e-05, + "loss": 2.946, + "step": 51989 + }, + { + "epoch": 2.55, + "grad_norm": 0.7693890333175659, + "learning_rate": 3.2997496534200464e-05, + "loss": 2.8692, + "step": 51990 + }, + { + "epoch": 2.55, + "grad_norm": 0.7755201458930969, + "learning_rate": 3.2990477081550624e-05, + "loss": 2.934, + "step": 51991 + }, + { + "epoch": 2.55, + "grad_norm": 0.7237342596054077, + "learning_rate": 3.2983458332152415e-05, + "loss": 2.7404, + "step": 51992 + }, + { + "epoch": 2.55, + "grad_norm": 0.711622953414917, + "learning_rate": 3.2976440286024474e-05, + "loss": 3.0484, + "step": 51993 + }, + { + "epoch": 2.55, + "grad_norm": 0.7582530975341797, + "learning_rate": 3.2969422943185095e-05, + "loss": 2.8522, + "step": 51994 + }, + { + "epoch": 2.55, + "grad_norm": 0.7546455264091492, + "learning_rate": 3.29624063036529e-05, + "loss": 2.8292, + "step": 51995 + }, + { + "epoch": 2.55, + "grad_norm": 0.7362459301948547, + "learning_rate": 3.29553903674464e-05, + "loss": 2.8576, + "step": 51996 + }, + { + "epoch": 2.55, + "grad_norm": 0.7272038459777832, + "learning_rate": 3.2948375134583984e-05, + "loss": 2.9262, + "step": 51997 + }, + { + "epoch": 2.55, + "grad_norm": 0.787972092628479, + "learning_rate": 3.2941360605084166e-05, + "loss": 2.8167, + "step": 51998 + }, + { + "epoch": 2.55, + "grad_norm": 0.7820271253585815, + "learning_rate": 3.293434677896534e-05, + "loss": 2.9891, + "step": 51999 + }, + { + "epoch": 2.55, + "grad_norm": 0.7015359997749329, + "learning_rate": 3.292733365624605e-05, + "loss": 2.8756, + "step": 52000 + }, + { + "epoch": 2.55, + "grad_norm": 0.7298908233642578, + "learning_rate": 3.292032123694481e-05, + "loss": 2.7706, + "step": 52001 + }, + { + "epoch": 2.55, + "grad_norm": 0.7448387145996094, + "learning_rate": 3.2913309521080026e-05, + "loss": 2.8888, + "step": 52002 + }, + { + "epoch": 2.55, + "grad_norm": 0.7021864056587219, + "learning_rate": 3.2906298508670195e-05, + "loss": 2.9838, + "step": 52003 + }, + { + "epoch": 2.55, + "grad_norm": 0.7519044280052185, + "learning_rate": 3.289928819973381e-05, + "loss": 2.8275, + "step": 52004 + }, + { + "epoch": 2.55, + "grad_norm": 0.7517359256744385, + "learning_rate": 3.289227859428919e-05, + "loss": 2.9263, + "step": 52005 + }, + { + "epoch": 2.55, + "grad_norm": 0.7345372438430786, + "learning_rate": 3.288526969235499e-05, + "loss": 2.9273, + "step": 52006 + }, + { + "epoch": 2.55, + "grad_norm": 0.702253520488739, + "learning_rate": 3.287826149394952e-05, + "loss": 3.0666, + "step": 52007 + }, + { + "epoch": 2.55, + "grad_norm": 0.7254359126091003, + "learning_rate": 3.2871253999091377e-05, + "loss": 2.7956, + "step": 52008 + }, + { + "epoch": 2.55, + "grad_norm": 0.7682890295982361, + "learning_rate": 3.2864247207798864e-05, + "loss": 2.8114, + "step": 52009 + }, + { + "epoch": 2.55, + "grad_norm": 0.718375027179718, + "learning_rate": 3.2857241120090515e-05, + "loss": 2.8538, + "step": 52010 + }, + { + "epoch": 2.55, + "grad_norm": 0.7394582629203796, + "learning_rate": 3.285023573598484e-05, + "loss": 2.9124, + "step": 52011 + }, + { + "epoch": 2.55, + "grad_norm": 0.723473310470581, + "learning_rate": 3.284323105550023e-05, + "loss": 3.0113, + "step": 52012 + }, + { + "epoch": 2.55, + "grad_norm": 0.6897356510162354, + "learning_rate": 3.283622707865513e-05, + "loss": 2.8036, + "step": 52013 + }, + { + "epoch": 2.55, + "grad_norm": 0.7808870077133179, + "learning_rate": 3.282922380546793e-05, + "loss": 2.8074, + "step": 52014 + }, + { + "epoch": 2.55, + "grad_norm": 0.7283383011817932, + "learning_rate": 3.2822221235957145e-05, + "loss": 2.8458, + "step": 52015 + }, + { + "epoch": 2.55, + "grad_norm": 0.7218790650367737, + "learning_rate": 3.2815219370141264e-05, + "loss": 3.0506, + "step": 52016 + }, + { + "epoch": 2.55, + "grad_norm": 0.7908827066421509, + "learning_rate": 3.280821820803861e-05, + "loss": 3.0478, + "step": 52017 + }, + { + "epoch": 2.55, + "grad_norm": 0.7488206028938293, + "learning_rate": 3.280121774966775e-05, + "loss": 2.793, + "step": 52018 + }, + { + "epoch": 2.55, + "grad_norm": 0.7366180419921875, + "learning_rate": 3.2794217995047054e-05, + "loss": 3.0707, + "step": 52019 + }, + { + "epoch": 2.55, + "grad_norm": 0.7356551885604858, + "learning_rate": 3.278721894419494e-05, + "loss": 2.9331, + "step": 52020 + }, + { + "epoch": 2.55, + "grad_norm": 0.7605820298194885, + "learning_rate": 3.278022059712988e-05, + "loss": 3.1874, + "step": 52021 + }, + { + "epoch": 2.55, + "grad_norm": 0.7392238974571228, + "learning_rate": 3.2773222953870236e-05, + "loss": 2.8032, + "step": 52022 + }, + { + "epoch": 2.55, + "grad_norm": 0.7410476803779602, + "learning_rate": 3.2766226014434596e-05, + "loss": 2.8852, + "step": 52023 + }, + { + "epoch": 2.55, + "grad_norm": 0.7377091646194458, + "learning_rate": 3.2759229778841166e-05, + "loss": 2.7027, + "step": 52024 + }, + { + "epoch": 2.55, + "grad_norm": 0.7168262004852295, + "learning_rate": 3.275223424710854e-05, + "loss": 2.813, + "step": 52025 + }, + { + "epoch": 2.55, + "grad_norm": 0.7007832527160645, + "learning_rate": 3.2745239419255145e-05, + "loss": 2.9249, + "step": 52026 + }, + { + "epoch": 2.55, + "grad_norm": 0.7348765134811401, + "learning_rate": 3.273824529529935e-05, + "loss": 2.8679, + "step": 52027 + }, + { + "epoch": 2.55, + "grad_norm": 0.7212089896202087, + "learning_rate": 3.273125187525958e-05, + "loss": 2.7796, + "step": 52028 + }, + { + "epoch": 2.55, + "grad_norm": 0.7870141863822937, + "learning_rate": 3.2724259159154186e-05, + "loss": 2.965, + "step": 52029 + }, + { + "epoch": 2.55, + "grad_norm": 0.728225588798523, + "learning_rate": 3.2717267147001656e-05, + "loss": 2.9082, + "step": 52030 + }, + { + "epoch": 2.55, + "grad_norm": 0.7556633353233337, + "learning_rate": 3.271027583882048e-05, + "loss": 2.9904, + "step": 52031 + }, + { + "epoch": 2.55, + "grad_norm": 0.7504242658615112, + "learning_rate": 3.270328523462893e-05, + "loss": 2.9538, + "step": 52032 + }, + { + "epoch": 2.55, + "grad_norm": 0.7282172441482544, + "learning_rate": 3.2696295334445566e-05, + "loss": 2.9926, + "step": 52033 + }, + { + "epoch": 2.55, + "grad_norm": 0.7356157302856445, + "learning_rate": 3.2689306138288675e-05, + "loss": 2.938, + "step": 52034 + }, + { + "epoch": 2.55, + "grad_norm": 0.6893876194953918, + "learning_rate": 3.268231764617667e-05, + "loss": 2.9002, + "step": 52035 + }, + { + "epoch": 2.55, + "grad_norm": 0.7842584848403931, + "learning_rate": 3.267532985812808e-05, + "loss": 2.8638, + "step": 52036 + }, + { + "epoch": 2.55, + "grad_norm": 0.7505773901939392, + "learning_rate": 3.2668342774161105e-05, + "loss": 3.0113, + "step": 52037 + }, + { + "epoch": 2.55, + "grad_norm": 0.7909178733825684, + "learning_rate": 3.266135639429438e-05, + "loss": 2.8813, + "step": 52038 + }, + { + "epoch": 2.55, + "grad_norm": 0.7430846095085144, + "learning_rate": 3.265437071854609e-05, + "loss": 3.109, + "step": 52039 + }, + { + "epoch": 2.55, + "grad_norm": 0.7298345565795898, + "learning_rate": 3.264738574693481e-05, + "loss": 3.0871, + "step": 52040 + }, + { + "epoch": 2.55, + "grad_norm": 0.7301103472709656, + "learning_rate": 3.264040147947889e-05, + "loss": 2.9725, + "step": 52041 + }, + { + "epoch": 2.55, + "grad_norm": 0.7341718077659607, + "learning_rate": 3.26334179161966e-05, + "loss": 2.6133, + "step": 52042 + }, + { + "epoch": 2.55, + "grad_norm": 0.7384917140007019, + "learning_rate": 3.262643505710651e-05, + "loss": 2.7426, + "step": 52043 + }, + { + "epoch": 2.55, + "grad_norm": 0.754947304725647, + "learning_rate": 3.2619452902226885e-05, + "loss": 2.7363, + "step": 52044 + }, + { + "epoch": 2.55, + "grad_norm": 0.7241181135177612, + "learning_rate": 3.261247145157613e-05, + "loss": 2.871, + "step": 52045 + }, + { + "epoch": 2.55, + "grad_norm": 0.7246091365814209, + "learning_rate": 3.2605490705172734e-05, + "loss": 2.969, + "step": 52046 + }, + { + "epoch": 2.55, + "grad_norm": 0.7573584914207458, + "learning_rate": 3.259851066303498e-05, + "loss": 2.9414, + "step": 52047 + }, + { + "epoch": 2.55, + "grad_norm": 0.7820131182670593, + "learning_rate": 3.259153132518133e-05, + "loss": 3.2345, + "step": 52048 + }, + { + "epoch": 2.55, + "grad_norm": 0.7728943824768066, + "learning_rate": 3.2584552691630125e-05, + "loss": 2.8005, + "step": 52049 + }, + { + "epoch": 2.55, + "grad_norm": 0.7590402960777283, + "learning_rate": 3.257757476239966e-05, + "loss": 2.6722, + "step": 52050 + }, + { + "epoch": 2.55, + "grad_norm": 0.7413563132286072, + "learning_rate": 3.257059753750848e-05, + "loss": 2.722, + "step": 52051 + }, + { + "epoch": 2.55, + "grad_norm": 0.7250611782073975, + "learning_rate": 3.256362101697484e-05, + "loss": 2.6977, + "step": 52052 + }, + { + "epoch": 2.55, + "grad_norm": 0.7582236528396606, + "learning_rate": 3.255664520081719e-05, + "loss": 3.023, + "step": 52053 + }, + { + "epoch": 2.55, + "grad_norm": 0.7816905975341797, + "learning_rate": 3.254967008905378e-05, + "loss": 2.8675, + "step": 52054 + }, + { + "epoch": 2.55, + "grad_norm": 0.7680097818374634, + "learning_rate": 3.254269568170317e-05, + "loss": 2.8494, + "step": 52055 + }, + { + "epoch": 2.55, + "grad_norm": 0.7169177532196045, + "learning_rate": 3.25357219787836e-05, + "loss": 2.951, + "step": 52056 + }, + { + "epoch": 2.55, + "grad_norm": 0.7729882597923279, + "learning_rate": 3.25287489803134e-05, + "loss": 2.806, + "step": 52057 + }, + { + "epoch": 2.55, + "grad_norm": 0.7043116688728333, + "learning_rate": 3.252177668631109e-05, + "loss": 3.1595, + "step": 52058 + }, + { + "epoch": 2.55, + "grad_norm": 0.8605642318725586, + "learning_rate": 3.2514805096794835e-05, + "loss": 2.8124, + "step": 52059 + }, + { + "epoch": 2.55, + "grad_norm": 0.7150072455406189, + "learning_rate": 3.2507834211783214e-05, + "loss": 2.7993, + "step": 52060 + }, + { + "epoch": 2.55, + "grad_norm": 0.7066315412521362, + "learning_rate": 3.250086403129439e-05, + "loss": 2.9328, + "step": 52061 + }, + { + "epoch": 2.55, + "grad_norm": 0.7706637978553772, + "learning_rate": 3.2493894555346866e-05, + "loss": 3.0378, + "step": 52062 + }, + { + "epoch": 2.55, + "grad_norm": 0.7277361750602722, + "learning_rate": 3.248692578395898e-05, + "loss": 2.8421, + "step": 52063 + }, + { + "epoch": 2.55, + "grad_norm": 0.7525270581245422, + "learning_rate": 3.2479957717148966e-05, + "loss": 2.9334, + "step": 52064 + }, + { + "epoch": 2.55, + "grad_norm": 0.7777685523033142, + "learning_rate": 3.247299035493529e-05, + "loss": 2.6474, + "step": 52065 + }, + { + "epoch": 2.55, + "grad_norm": 0.7585991621017456, + "learning_rate": 3.246602369733622e-05, + "loss": 2.9298, + "step": 52066 + }, + { + "epoch": 2.55, + "grad_norm": 0.7407344579696655, + "learning_rate": 3.24590577443702e-05, + "loss": 2.8826, + "step": 52067 + }, + { + "epoch": 2.55, + "grad_norm": 0.7381274700164795, + "learning_rate": 3.245209249605555e-05, + "loss": 2.8998, + "step": 52068 + }, + { + "epoch": 2.55, + "grad_norm": 0.7720596790313721, + "learning_rate": 3.2445127952410554e-05, + "loss": 3.0174, + "step": 52069 + }, + { + "epoch": 2.55, + "grad_norm": 0.7466363310813904, + "learning_rate": 3.243816411345364e-05, + "loss": 2.9214, + "step": 52070 + }, + { + "epoch": 2.55, + "grad_norm": 0.7581253051757812, + "learning_rate": 3.243120097920311e-05, + "loss": 2.9456, + "step": 52071 + }, + { + "epoch": 2.55, + "grad_norm": 0.7453657388687134, + "learning_rate": 3.2424238549677265e-05, + "loss": 2.8891, + "step": 52072 + }, + { + "epoch": 2.55, + "grad_norm": 0.7070062756538391, + "learning_rate": 3.2417276824894514e-05, + "loss": 3.0115, + "step": 52073 + }, + { + "epoch": 2.55, + "grad_norm": 0.7681336402893066, + "learning_rate": 3.2410315804873114e-05, + "loss": 2.6473, + "step": 52074 + }, + { + "epoch": 2.55, + "grad_norm": 0.7477194666862488, + "learning_rate": 3.240335548963151e-05, + "loss": 2.9022, + "step": 52075 + }, + { + "epoch": 2.55, + "grad_norm": 0.7516164183616638, + "learning_rate": 3.23963958791879e-05, + "loss": 2.9424, + "step": 52076 + }, + { + "epoch": 2.55, + "grad_norm": 0.7572428584098816, + "learning_rate": 3.2389436973560754e-05, + "loss": 2.8975, + "step": 52077 + }, + { + "epoch": 2.55, + "grad_norm": 0.7532944679260254, + "learning_rate": 3.238247877276831e-05, + "loss": 2.9374, + "step": 52078 + }, + { + "epoch": 2.55, + "grad_norm": 0.7349500060081482, + "learning_rate": 3.2375521276828866e-05, + "loss": 2.7757, + "step": 52079 + }, + { + "epoch": 2.55, + "grad_norm": 0.7738845944404602, + "learning_rate": 3.236856448576085e-05, + "loss": 3.076, + "step": 52080 + }, + { + "epoch": 2.55, + "grad_norm": 0.7966225743293762, + "learning_rate": 3.2361608399582454e-05, + "loss": 2.8035, + "step": 52081 + }, + { + "epoch": 2.55, + "grad_norm": 0.6876529455184937, + "learning_rate": 3.235465301831208e-05, + "loss": 2.9115, + "step": 52082 + }, + { + "epoch": 2.55, + "grad_norm": 0.7187408208847046, + "learning_rate": 3.2347698341968095e-05, + "loss": 2.8788, + "step": 52083 + }, + { + "epoch": 2.55, + "grad_norm": 0.7126637697219849, + "learning_rate": 3.2340744370568783e-05, + "loss": 3.1122, + "step": 52084 + }, + { + "epoch": 2.55, + "grad_norm": 0.7737832069396973, + "learning_rate": 3.233379110413242e-05, + "loss": 2.8319, + "step": 52085 + }, + { + "epoch": 2.55, + "grad_norm": 0.7817880511283875, + "learning_rate": 3.23268385426773e-05, + "loss": 2.8349, + "step": 52086 + }, + { + "epoch": 2.55, + "grad_norm": 0.7609059810638428, + "learning_rate": 3.231988668622174e-05, + "loss": 3.1345, + "step": 52087 + }, + { + "epoch": 2.55, + "grad_norm": 0.7383360266685486, + "learning_rate": 3.2312935534784154e-05, + "loss": 2.9601, + "step": 52088 + }, + { + "epoch": 2.55, + "grad_norm": 0.7559726238250732, + "learning_rate": 3.2305985088382736e-05, + "loss": 2.8189, + "step": 52089 + }, + { + "epoch": 2.55, + "grad_norm": 0.7402336001396179, + "learning_rate": 3.2299035347035896e-05, + "loss": 2.9232, + "step": 52090 + }, + { + "epoch": 2.55, + "grad_norm": 0.8144344687461853, + "learning_rate": 3.229208631076179e-05, + "loss": 2.8724, + "step": 52091 + }, + { + "epoch": 2.55, + "grad_norm": 0.7357531785964966, + "learning_rate": 3.22851379795789e-05, + "loss": 2.8621, + "step": 52092 + }, + { + "epoch": 2.55, + "grad_norm": 0.7645347714424133, + "learning_rate": 3.227819035350542e-05, + "loss": 2.7017, + "step": 52093 + }, + { + "epoch": 2.55, + "grad_norm": 0.7534157037734985, + "learning_rate": 3.227124343255959e-05, + "loss": 2.8386, + "step": 52094 + }, + { + "epoch": 2.55, + "grad_norm": 0.7897367477416992, + "learning_rate": 3.226429721675984e-05, + "loss": 2.8818, + "step": 52095 + }, + { + "epoch": 2.55, + "grad_norm": 0.7672426700592041, + "learning_rate": 3.225735170612435e-05, + "loss": 2.8568, + "step": 52096 + }, + { + "epoch": 2.55, + "grad_norm": 0.7439735531806946, + "learning_rate": 3.2250406900671466e-05, + "loss": 2.8941, + "step": 52097 + }, + { + "epoch": 2.55, + "grad_norm": 0.7530331611633301, + "learning_rate": 3.224346280041955e-05, + "loss": 2.8021, + "step": 52098 + }, + { + "epoch": 2.55, + "grad_norm": 0.7255603671073914, + "learning_rate": 3.223651940538682e-05, + "loss": 3.0742, + "step": 52099 + }, + { + "epoch": 2.55, + "grad_norm": 0.7502477765083313, + "learning_rate": 3.2229576715591555e-05, + "loss": 2.8602, + "step": 52100 + }, + { + "epoch": 2.55, + "grad_norm": 0.8245431184768677, + "learning_rate": 3.222263473105198e-05, + "loss": 2.7466, + "step": 52101 + }, + { + "epoch": 2.55, + "grad_norm": 0.7484453320503235, + "learning_rate": 3.2215693451786474e-05, + "loss": 2.9278, + "step": 52102 + }, + { + "epoch": 2.55, + "grad_norm": 0.720234215259552, + "learning_rate": 3.2208752877813357e-05, + "loss": 3.0219, + "step": 52103 + }, + { + "epoch": 2.55, + "grad_norm": 0.7335119843482971, + "learning_rate": 3.220181300915079e-05, + "loss": 2.795, + "step": 52104 + }, + { + "epoch": 2.55, + "grad_norm": 0.7557951807975769, + "learning_rate": 3.219487384581715e-05, + "loss": 3.0479, + "step": 52105 + }, + { + "epoch": 2.55, + "grad_norm": 0.7810825705528259, + "learning_rate": 3.218793538783069e-05, + "loss": 2.9214, + "step": 52106 + }, + { + "epoch": 2.55, + "grad_norm": 0.7902493476867676, + "learning_rate": 3.218099763520959e-05, + "loss": 2.8729, + "step": 52107 + }, + { + "epoch": 2.55, + "grad_norm": 0.8319610953330994, + "learning_rate": 3.217406058797225e-05, + "loss": 2.9528, + "step": 52108 + }, + { + "epoch": 2.55, + "grad_norm": 0.7634714841842651, + "learning_rate": 3.2167124246136876e-05, + "loss": 2.9904, + "step": 52109 + }, + { + "epoch": 2.55, + "grad_norm": 0.7566140294075012, + "learning_rate": 3.216018860972176e-05, + "loss": 2.822, + "step": 52110 + }, + { + "epoch": 2.55, + "grad_norm": 0.8258447051048279, + "learning_rate": 3.215325367874512e-05, + "loss": 2.7601, + "step": 52111 + }, + { + "epoch": 2.55, + "grad_norm": 0.7029900550842285, + "learning_rate": 3.2146319453225266e-05, + "loss": 3.0402, + "step": 52112 + }, + { + "epoch": 2.55, + "grad_norm": 0.7495381832122803, + "learning_rate": 3.2139385933180526e-05, + "loss": 2.8941, + "step": 52113 + }, + { + "epoch": 2.55, + "grad_norm": 0.7451898455619812, + "learning_rate": 3.213245311862909e-05, + "loss": 2.9559, + "step": 52114 + }, + { + "epoch": 2.55, + "grad_norm": 0.7116692662239075, + "learning_rate": 3.212552100958919e-05, + "loss": 3.0828, + "step": 52115 + }, + { + "epoch": 2.55, + "grad_norm": 0.7679808735847473, + "learning_rate": 3.21185896060791e-05, + "loss": 2.9068, + "step": 52116 + }, + { + "epoch": 2.55, + "grad_norm": 0.7367801070213318, + "learning_rate": 3.2111658908117065e-05, + "loss": 2.9342, + "step": 52117 + }, + { + "epoch": 2.55, + "grad_norm": 0.7176463007926941, + "learning_rate": 3.210472891572143e-05, + "loss": 3.1503, + "step": 52118 + }, + { + "epoch": 2.55, + "grad_norm": 0.6963474750518799, + "learning_rate": 3.2097799628910295e-05, + "loss": 2.9381, + "step": 52119 + }, + { + "epoch": 2.55, + "grad_norm": 0.7654833197593689, + "learning_rate": 3.209087104770209e-05, + "loss": 2.8578, + "step": 52120 + }, + { + "epoch": 2.55, + "grad_norm": 0.7469300627708435, + "learning_rate": 3.208394317211496e-05, + "loss": 3.1074, + "step": 52121 + }, + { + "epoch": 2.55, + "grad_norm": 0.702953577041626, + "learning_rate": 3.20770160021671e-05, + "loss": 2.6675, + "step": 52122 + }, + { + "epoch": 2.55, + "grad_norm": 0.7122648358345032, + "learning_rate": 3.207008953787688e-05, + "loss": 2.9066, + "step": 52123 + }, + { + "epoch": 2.55, + "grad_norm": 0.7659664750099182, + "learning_rate": 3.206316377926244e-05, + "loss": 2.8832, + "step": 52124 + }, + { + "epoch": 2.55, + "grad_norm": 0.747031033039093, + "learning_rate": 3.20562387263421e-05, + "loss": 2.965, + "step": 52125 + }, + { + "epoch": 2.55, + "grad_norm": 0.7918295860290527, + "learning_rate": 3.204931437913398e-05, + "loss": 2.8316, + "step": 52126 + }, + { + "epoch": 2.55, + "grad_norm": 0.7406182289123535, + "learning_rate": 3.204239073765641e-05, + "loss": 2.9045, + "step": 52127 + }, + { + "epoch": 2.55, + "grad_norm": 0.7530829906463623, + "learning_rate": 3.2035467801927696e-05, + "loss": 2.9757, + "step": 52128 + }, + { + "epoch": 2.55, + "grad_norm": 0.7679747343063354, + "learning_rate": 3.2028545571965994e-05, + "loss": 2.9212, + "step": 52129 + }, + { + "epoch": 2.55, + "grad_norm": 0.7454062104225159, + "learning_rate": 3.202162404778949e-05, + "loss": 2.9671, + "step": 52130 + }, + { + "epoch": 2.55, + "grad_norm": 0.7858791947364807, + "learning_rate": 3.2014703229416426e-05, + "loss": 2.8775, + "step": 52131 + }, + { + "epoch": 2.55, + "grad_norm": 0.7846336364746094, + "learning_rate": 3.2007783116865104e-05, + "loss": 2.9201, + "step": 52132 + }, + { + "epoch": 2.55, + "grad_norm": 0.7320681810379028, + "learning_rate": 3.2000863710153635e-05, + "loss": 3.0948, + "step": 52133 + }, + { + "epoch": 2.55, + "grad_norm": 0.8238586783409119, + "learning_rate": 3.1993945009300336e-05, + "loss": 2.9597, + "step": 52134 + }, + { + "epoch": 2.56, + "grad_norm": 0.7886196970939636, + "learning_rate": 3.198702701432346e-05, + "loss": 2.9869, + "step": 52135 + }, + { + "epoch": 2.56, + "grad_norm": 0.7563168406486511, + "learning_rate": 3.19801097252412e-05, + "loss": 2.9357, + "step": 52136 + }, + { + "epoch": 2.56, + "grad_norm": 0.8196958899497986, + "learning_rate": 3.197319314207169e-05, + "loss": 2.9284, + "step": 52137 + }, + { + "epoch": 2.56, + "grad_norm": 0.7236234545707703, + "learning_rate": 3.196627726483321e-05, + "loss": 2.8384, + "step": 52138 + }, + { + "epoch": 2.56, + "grad_norm": 0.773149847984314, + "learning_rate": 3.195936209354395e-05, + "loss": 2.8404, + "step": 52139 + }, + { + "epoch": 2.56, + "grad_norm": 0.7744463682174683, + "learning_rate": 3.1952447628222176e-05, + "loss": 2.7944, + "step": 52140 + }, + { + "epoch": 2.56, + "grad_norm": 0.7418228387832642, + "learning_rate": 3.194553386888604e-05, + "loss": 2.9338, + "step": 52141 + }, + { + "epoch": 2.56, + "grad_norm": 0.7621162533760071, + "learning_rate": 3.193862081555383e-05, + "loss": 2.9538, + "step": 52142 + }, + { + "epoch": 2.56, + "grad_norm": 0.7875955104827881, + "learning_rate": 3.193170846824369e-05, + "loss": 2.9326, + "step": 52143 + }, + { + "epoch": 2.56, + "grad_norm": 0.7359634637832642, + "learning_rate": 3.192479682697381e-05, + "loss": 2.7486, + "step": 52144 + }, + { + "epoch": 2.56, + "grad_norm": 0.7574135661125183, + "learning_rate": 3.191788589176244e-05, + "loss": 2.8255, + "step": 52145 + }, + { + "epoch": 2.56, + "grad_norm": 0.7023094296455383, + "learning_rate": 3.1910975662627734e-05, + "loss": 2.9397, + "step": 52146 + }, + { + "epoch": 2.56, + "grad_norm": 0.7390046119689941, + "learning_rate": 3.190406613958798e-05, + "loss": 3.0639, + "step": 52147 + }, + { + "epoch": 2.56, + "grad_norm": 0.7670930027961731, + "learning_rate": 3.189715732266126e-05, + "loss": 2.6807, + "step": 52148 + }, + { + "epoch": 2.56, + "grad_norm": 0.711762011051178, + "learning_rate": 3.1890249211865826e-05, + "loss": 2.7934, + "step": 52149 + }, + { + "epoch": 2.56, + "grad_norm": 0.7005589008331299, + "learning_rate": 3.1883341807219965e-05, + "loss": 2.9111, + "step": 52150 + }, + { + "epoch": 2.56, + "grad_norm": 0.7569049596786499, + "learning_rate": 3.1876435108741725e-05, + "loss": 2.8733, + "step": 52151 + }, + { + "epoch": 2.56, + "grad_norm": 0.8212657570838928, + "learning_rate": 3.18695291164494e-05, + "loss": 2.9394, + "step": 52152 + }, + { + "epoch": 2.56, + "grad_norm": 0.7402049899101257, + "learning_rate": 3.186262383036107e-05, + "loss": 2.8861, + "step": 52153 + }, + { + "epoch": 2.56, + "grad_norm": 0.7410562038421631, + "learning_rate": 3.185571925049495e-05, + "loss": 3.0182, + "step": 52154 + }, + { + "epoch": 2.56, + "grad_norm": 0.7571350336074829, + "learning_rate": 3.1848815376869366e-05, + "loss": 2.7717, + "step": 52155 + }, + { + "epoch": 2.56, + "grad_norm": 0.7215419411659241, + "learning_rate": 3.1841912209502294e-05, + "loss": 2.86, + "step": 52156 + }, + { + "epoch": 2.56, + "grad_norm": 0.751708984375, + "learning_rate": 3.18350097484121e-05, + "loss": 3.1578, + "step": 52157 + }, + { + "epoch": 2.56, + "grad_norm": 0.7202328443527222, + "learning_rate": 3.182810799361689e-05, + "loss": 2.8002, + "step": 52158 + }, + { + "epoch": 2.56, + "grad_norm": 0.7052213549613953, + "learning_rate": 3.182120694513476e-05, + "loss": 3.1323, + "step": 52159 + }, + { + "epoch": 2.56, + "grad_norm": 0.7351638674736023, + "learning_rate": 3.181430660298402e-05, + "loss": 2.8612, + "step": 52160 + }, + { + "epoch": 2.56, + "grad_norm": 0.7632426619529724, + "learning_rate": 3.180740696718269e-05, + "loss": 3.0659, + "step": 52161 + }, + { + "epoch": 2.56, + "grad_norm": 0.7189815640449524, + "learning_rate": 3.180050803774913e-05, + "loss": 3.0106, + "step": 52162 + }, + { + "epoch": 2.56, + "grad_norm": 0.7511869668960571, + "learning_rate": 3.179360981470138e-05, + "loss": 3.0049, + "step": 52163 + }, + { + "epoch": 2.56, + "grad_norm": 0.7162873148918152, + "learning_rate": 3.178671229805767e-05, + "loss": 2.9714, + "step": 52164 + }, + { + "epoch": 2.56, + "grad_norm": 0.8090042471885681, + "learning_rate": 3.177981548783615e-05, + "loss": 2.873, + "step": 52165 + }, + { + "epoch": 2.56, + "grad_norm": 0.7131547927856445, + "learning_rate": 3.1772919384054897e-05, + "loss": 2.6244, + "step": 52166 + }, + { + "epoch": 2.56, + "grad_norm": 0.7170288562774658, + "learning_rate": 3.176602398673224e-05, + "loss": 3.1226, + "step": 52167 + }, + { + "epoch": 2.56, + "grad_norm": 0.7279961705207825, + "learning_rate": 3.1759129295886184e-05, + "loss": 2.9755, + "step": 52168 + }, + { + "epoch": 2.56, + "grad_norm": 0.7459250688552856, + "learning_rate": 3.175223531153497e-05, + "loss": 2.7423, + "step": 52169 + }, + { + "epoch": 2.56, + "grad_norm": 0.806098461151123, + "learning_rate": 3.1745342033696764e-05, + "loss": 2.8613, + "step": 52170 + }, + { + "epoch": 2.56, + "grad_norm": 0.7358162999153137, + "learning_rate": 3.1738449462389656e-05, + "loss": 2.8704, + "step": 52171 + }, + { + "epoch": 2.56, + "grad_norm": 0.7452125549316406, + "learning_rate": 3.173155759763194e-05, + "loss": 2.9032, + "step": 52172 + }, + { + "epoch": 2.56, + "grad_norm": 0.765661358833313, + "learning_rate": 3.1724666439441626e-05, + "loss": 2.9674, + "step": 52173 + }, + { + "epoch": 2.56, + "grad_norm": 0.7364770770072937, + "learning_rate": 3.171777598783687e-05, + "loss": 2.7907, + "step": 52174 + }, + { + "epoch": 2.56, + "grad_norm": 0.740664005279541, + "learning_rate": 3.171088624283592e-05, + "loss": 2.8336, + "step": 52175 + }, + { + "epoch": 2.56, + "grad_norm": 0.7158206701278687, + "learning_rate": 3.17039972044568e-05, + "loss": 2.734, + "step": 52176 + }, + { + "epoch": 2.56, + "grad_norm": 0.7403131127357483, + "learning_rate": 3.169710887271779e-05, + "loss": 2.9151, + "step": 52177 + }, + { + "epoch": 2.56, + "grad_norm": 0.7874448299407959, + "learning_rate": 3.169022124763688e-05, + "loss": 2.9587, + "step": 52178 + }, + { + "epoch": 2.56, + "grad_norm": 0.7513279318809509, + "learning_rate": 3.168333432923236e-05, + "loss": 2.9218, + "step": 52179 + }, + { + "epoch": 2.56, + "grad_norm": 0.715566873550415, + "learning_rate": 3.16764481175223e-05, + "loss": 2.8593, + "step": 52180 + }, + { + "epoch": 2.56, + "grad_norm": 0.7262329459190369, + "learning_rate": 3.166956261252477e-05, + "loss": 2.9149, + "step": 52181 + }, + { + "epoch": 2.56, + "grad_norm": 0.7445135712623596, + "learning_rate": 3.166267781425804e-05, + "loss": 2.9653, + "step": 52182 + }, + { + "epoch": 2.56, + "grad_norm": 0.731885552406311, + "learning_rate": 3.165579372274011e-05, + "loss": 2.8945, + "step": 52183 + }, + { + "epoch": 2.56, + "grad_norm": 0.7854867577552795, + "learning_rate": 3.164891033798919e-05, + "loss": 2.7433, + "step": 52184 + }, + { + "epoch": 2.56, + "grad_norm": 0.7866995334625244, + "learning_rate": 3.164202766002343e-05, + "loss": 2.8823, + "step": 52185 + }, + { + "epoch": 2.56, + "grad_norm": 0.7712637782096863, + "learning_rate": 3.1635145688860916e-05, + "loss": 2.944, + "step": 52186 + }, + { + "epoch": 2.56, + "grad_norm": 0.758851170539856, + "learning_rate": 3.162826442451981e-05, + "loss": 2.9999, + "step": 52187 + }, + { + "epoch": 2.56, + "grad_norm": 0.7671577334403992, + "learning_rate": 3.162138386701812e-05, + "loss": 2.9154, + "step": 52188 + }, + { + "epoch": 2.56, + "grad_norm": 0.7265655398368835, + "learning_rate": 3.161450401637407e-05, + "loss": 2.9974, + "step": 52189 + }, + { + "epoch": 2.56, + "grad_norm": 0.7680802941322327, + "learning_rate": 3.160762487260582e-05, + "loss": 2.9132, + "step": 52190 + }, + { + "epoch": 2.56, + "grad_norm": 0.7444314360618591, + "learning_rate": 3.160074643573137e-05, + "loss": 2.8905, + "step": 52191 + }, + { + "epoch": 2.56, + "grad_norm": 0.7568455934524536, + "learning_rate": 3.1593868705768955e-05, + "loss": 2.7307, + "step": 52192 + }, + { + "epoch": 2.56, + "grad_norm": 0.7822867631912231, + "learning_rate": 3.1586991682736585e-05, + "loss": 2.93, + "step": 52193 + }, + { + "epoch": 2.56, + "grad_norm": 0.7543140053749084, + "learning_rate": 3.158011536665248e-05, + "loss": 3.1412, + "step": 52194 + }, + { + "epoch": 2.56, + "grad_norm": 0.853643000125885, + "learning_rate": 3.15732397575347e-05, + "loss": 2.9038, + "step": 52195 + }, + { + "epoch": 2.56, + "grad_norm": 0.8402360677719116, + "learning_rate": 3.1566364855401285e-05, + "loss": 2.9202, + "step": 52196 + }, + { + "epoch": 2.56, + "grad_norm": 0.7252039313316345, + "learning_rate": 3.155949066027046e-05, + "loss": 2.977, + "step": 52197 + }, + { + "epoch": 2.56, + "grad_norm": 0.7317278385162354, + "learning_rate": 3.155261717216021e-05, + "loss": 2.7395, + "step": 52198 + }, + { + "epoch": 2.56, + "grad_norm": 0.7297314405441284, + "learning_rate": 3.154574439108869e-05, + "loss": 2.8092, + "step": 52199 + }, + { + "epoch": 2.56, + "grad_norm": 0.7250798940658569, + "learning_rate": 3.153887231707408e-05, + "loss": 3.1066, + "step": 52200 + }, + { + "epoch": 2.56, + "grad_norm": 0.7466351389884949, + "learning_rate": 3.153200095013444e-05, + "loss": 3.1056, + "step": 52201 + }, + { + "epoch": 2.56, + "grad_norm": 0.7196610569953918, + "learning_rate": 3.1525130290287816e-05, + "loss": 2.9067, + "step": 52202 + }, + { + "epoch": 2.56, + "grad_norm": 0.7877478003501892, + "learning_rate": 3.151826033755229e-05, + "loss": 2.6556, + "step": 52203 + }, + { + "epoch": 2.56, + "grad_norm": 0.7279103398323059, + "learning_rate": 3.151139109194596e-05, + "loss": 2.9329, + "step": 52204 + }, + { + "epoch": 2.56, + "grad_norm": 0.7403162717819214, + "learning_rate": 3.1504522553487035e-05, + "loss": 2.9713, + "step": 52205 + }, + { + "epoch": 2.56, + "grad_norm": 0.748703122138977, + "learning_rate": 3.1497654722193466e-05, + "loss": 2.7022, + "step": 52206 + }, + { + "epoch": 2.56, + "grad_norm": 0.7111794948577881, + "learning_rate": 3.149078759808348e-05, + "loss": 3.1579, + "step": 52207 + }, + { + "epoch": 2.56, + "grad_norm": 0.7396597266197205, + "learning_rate": 3.148392118117505e-05, + "loss": 2.6688, + "step": 52208 + }, + { + "epoch": 2.56, + "grad_norm": 0.7661385536193848, + "learning_rate": 3.1477055471486315e-05, + "loss": 3.2509, + "step": 52209 + }, + { + "epoch": 2.56, + "grad_norm": 0.7724836468696594, + "learning_rate": 3.1470190469035274e-05, + "loss": 3.0427, + "step": 52210 + }, + { + "epoch": 2.56, + "grad_norm": 0.7351310849189758, + "learning_rate": 3.146332617384009e-05, + "loss": 2.949, + "step": 52211 + }, + { + "epoch": 2.56, + "grad_norm": 0.7441316246986389, + "learning_rate": 3.145646258591884e-05, + "loss": 2.9306, + "step": 52212 + }, + { + "epoch": 2.56, + "grad_norm": 0.7312101125717163, + "learning_rate": 3.1449599705289554e-05, + "loss": 2.9497, + "step": 52213 + }, + { + "epoch": 2.56, + "grad_norm": 0.733696699142456, + "learning_rate": 3.144273753197041e-05, + "loss": 2.8247, + "step": 52214 + }, + { + "epoch": 2.56, + "grad_norm": 0.7087571620941162, + "learning_rate": 3.143587606597933e-05, + "loss": 2.8334, + "step": 52215 + }, + { + "epoch": 2.56, + "grad_norm": 0.7312347292900085, + "learning_rate": 3.142901530733453e-05, + "loss": 2.869, + "step": 52216 + }, + { + "epoch": 2.56, + "grad_norm": 0.6981326341629028, + "learning_rate": 3.1422155256054024e-05, + "loss": 2.8639, + "step": 52217 + }, + { + "epoch": 2.56, + "grad_norm": 0.7454235553741455, + "learning_rate": 3.141529591215578e-05, + "loss": 3.1351, + "step": 52218 + }, + { + "epoch": 2.56, + "grad_norm": 0.7256028652191162, + "learning_rate": 3.1408437275658074e-05, + "loss": 2.776, + "step": 52219 + }, + { + "epoch": 2.56, + "grad_norm": 0.7506396174430847, + "learning_rate": 3.1401579346578753e-05, + "loss": 2.7559, + "step": 52220 + }, + { + "epoch": 2.56, + "grad_norm": 0.7295669913291931, + "learning_rate": 3.139472212493598e-05, + "loss": 2.8567, + "step": 52221 + }, + { + "epoch": 2.56, + "grad_norm": 0.731708288192749, + "learning_rate": 3.1387865610747864e-05, + "loss": 2.7866, + "step": 52222 + }, + { + "epoch": 2.56, + "grad_norm": 0.718250036239624, + "learning_rate": 3.138100980403243e-05, + "loss": 2.9393, + "step": 52223 + }, + { + "epoch": 2.56, + "grad_norm": 0.7222898006439209, + "learning_rate": 3.137415470480773e-05, + "loss": 2.9184, + "step": 52224 + }, + { + "epoch": 2.56, + "grad_norm": 0.7631383538246155, + "learning_rate": 3.136730031309175e-05, + "loss": 2.9492, + "step": 52225 + }, + { + "epoch": 2.56, + "grad_norm": 0.7158065438270569, + "learning_rate": 3.13604466289026e-05, + "loss": 3.0133, + "step": 52226 + }, + { + "epoch": 2.56, + "grad_norm": 0.7890329360961914, + "learning_rate": 3.135359365225838e-05, + "loss": 3.0817, + "step": 52227 + }, + { + "epoch": 2.56, + "grad_norm": 0.7782434225082397, + "learning_rate": 3.134674138317703e-05, + "loss": 3.0138, + "step": 52228 + }, + { + "epoch": 2.56, + "grad_norm": 0.7466015815734863, + "learning_rate": 3.133988982167671e-05, + "loss": 2.823, + "step": 52229 + }, + { + "epoch": 2.56, + "grad_norm": 0.712049663066864, + "learning_rate": 3.1333038967775404e-05, + "loss": 2.9232, + "step": 52230 + }, + { + "epoch": 2.56, + "grad_norm": 0.7094315886497498, + "learning_rate": 3.1326188821491196e-05, + "loss": 2.7197, + "step": 52231 + }, + { + "epoch": 2.56, + "grad_norm": 0.7306052446365356, + "learning_rate": 3.131933938284207e-05, + "loss": 2.9825, + "step": 52232 + }, + { + "epoch": 2.56, + "grad_norm": 0.6810746788978577, + "learning_rate": 3.131249065184608e-05, + "loss": 2.7697, + "step": 52233 + }, + { + "epoch": 2.56, + "grad_norm": 0.748337984085083, + "learning_rate": 3.130564262852131e-05, + "loss": 2.9846, + "step": 52234 + }, + { + "epoch": 2.56, + "grad_norm": 0.7128395438194275, + "learning_rate": 3.1298795312885716e-05, + "loss": 3.0067, + "step": 52235 + }, + { + "epoch": 2.56, + "grad_norm": 0.7342303395271301, + "learning_rate": 3.1291948704957416e-05, + "loss": 3.1391, + "step": 52236 + }, + { + "epoch": 2.56, + "grad_norm": 0.7613341808319092, + "learning_rate": 3.1285102804754424e-05, + "loss": 2.9238, + "step": 52237 + }, + { + "epoch": 2.56, + "grad_norm": 0.8025845289230347, + "learning_rate": 3.1278257612294765e-05, + "loss": 3.0679, + "step": 52238 + }, + { + "epoch": 2.56, + "grad_norm": 0.7101073265075684, + "learning_rate": 3.127141312759649e-05, + "loss": 2.7353, + "step": 52239 + }, + { + "epoch": 2.56, + "grad_norm": 0.7481630444526672, + "learning_rate": 3.126456935067748e-05, + "loss": 2.8775, + "step": 52240 + }, + { + "epoch": 2.56, + "grad_norm": 0.747065544128418, + "learning_rate": 3.125772628155593e-05, + "loss": 2.9329, + "step": 52241 + }, + { + "epoch": 2.56, + "grad_norm": 0.7435370683670044, + "learning_rate": 3.125088392024985e-05, + "loss": 2.9153, + "step": 52242 + }, + { + "epoch": 2.56, + "grad_norm": 0.781072199344635, + "learning_rate": 3.124404226677712e-05, + "loss": 2.9379, + "step": 52243 + }, + { + "epoch": 2.56, + "grad_norm": 0.7462438344955444, + "learning_rate": 3.123720132115598e-05, + "loss": 2.897, + "step": 52244 + }, + { + "epoch": 2.56, + "grad_norm": 0.7858874797821045, + "learning_rate": 3.12303610834043e-05, + "loss": 2.8322, + "step": 52245 + }, + { + "epoch": 2.56, + "grad_norm": 0.7304945588111877, + "learning_rate": 3.1223521553540064e-05, + "loss": 2.8908, + "step": 52246 + }, + { + "epoch": 2.56, + "grad_norm": 0.7903547286987305, + "learning_rate": 3.12166827315814e-05, + "loss": 2.8174, + "step": 52247 + }, + { + "epoch": 2.56, + "grad_norm": 0.7469950914382935, + "learning_rate": 3.120984461754622e-05, + "loss": 2.9731, + "step": 52248 + }, + { + "epoch": 2.56, + "grad_norm": 0.7844982743263245, + "learning_rate": 3.120300721145261e-05, + "loss": 2.8559, + "step": 52249 + }, + { + "epoch": 2.56, + "grad_norm": 0.7452136278152466, + "learning_rate": 3.119617051331853e-05, + "loss": 3.1268, + "step": 52250 + }, + { + "epoch": 2.56, + "grad_norm": 0.7502493262290955, + "learning_rate": 3.1189334523161954e-05, + "loss": 2.8521, + "step": 52251 + }, + { + "epoch": 2.56, + "grad_norm": 0.6964902877807617, + "learning_rate": 3.1182499241001045e-05, + "loss": 2.8879, + "step": 52252 + }, + { + "epoch": 2.56, + "grad_norm": 0.7556952834129333, + "learning_rate": 3.1175664666853686e-05, + "loss": 2.8724, + "step": 52253 + }, + { + "epoch": 2.56, + "grad_norm": 0.6863505244255066, + "learning_rate": 3.116883080073789e-05, + "loss": 2.9686, + "step": 52254 + }, + { + "epoch": 2.56, + "grad_norm": 0.7049617767333984, + "learning_rate": 3.1161997642671574e-05, + "loss": 2.9675, + "step": 52255 + }, + { + "epoch": 2.56, + "grad_norm": 0.7351288199424744, + "learning_rate": 3.115516519267284e-05, + "loss": 2.7977, + "step": 52256 + }, + { + "epoch": 2.56, + "grad_norm": 0.7706930637359619, + "learning_rate": 3.114833345075972e-05, + "loss": 2.9956, + "step": 52257 + }, + { + "epoch": 2.56, + "grad_norm": 0.771697998046875, + "learning_rate": 3.114150241695008e-05, + "loss": 2.7364, + "step": 52258 + }, + { + "epoch": 2.56, + "grad_norm": 0.7425963878631592, + "learning_rate": 3.1134672091262035e-05, + "loss": 2.963, + "step": 52259 + }, + { + "epoch": 2.56, + "grad_norm": 0.7799915075302124, + "learning_rate": 3.11278424737135e-05, + "loss": 2.9248, + "step": 52260 + }, + { + "epoch": 2.56, + "grad_norm": 0.7096208930015564, + "learning_rate": 3.112101356432247e-05, + "loss": 2.7819, + "step": 52261 + }, + { + "epoch": 2.56, + "grad_norm": 0.747329831123352, + "learning_rate": 3.111418536310699e-05, + "loss": 2.8839, + "step": 52262 + }, + { + "epoch": 2.56, + "grad_norm": 0.7840654253959656, + "learning_rate": 3.110735787008492e-05, + "loss": 2.8895, + "step": 52263 + }, + { + "epoch": 2.56, + "grad_norm": 0.7264786958694458, + "learning_rate": 3.1100531085274394e-05, + "loss": 2.8788, + "step": 52264 + }, + { + "epoch": 2.56, + "grad_norm": 0.7728217840194702, + "learning_rate": 3.1093705008693245e-05, + "loss": 2.9602, + "step": 52265 + }, + { + "epoch": 2.56, + "grad_norm": 0.7420971393585205, + "learning_rate": 3.1086879640359596e-05, + "loss": 2.8934, + "step": 52266 + }, + { + "epoch": 2.56, + "grad_norm": 0.714773952960968, + "learning_rate": 3.108005498029136e-05, + "loss": 2.8666, + "step": 52267 + }, + { + "epoch": 2.56, + "grad_norm": 0.7264063954353333, + "learning_rate": 3.107323102850642e-05, + "loss": 2.8107, + "step": 52268 + }, + { + "epoch": 2.56, + "grad_norm": 0.7227268815040588, + "learning_rate": 3.1066407785022895e-05, + "loss": 2.8292, + "step": 52269 + }, + { + "epoch": 2.56, + "grad_norm": 0.7567482590675354, + "learning_rate": 3.1059585249858675e-05, + "loss": 2.8256, + "step": 52270 + }, + { + "epoch": 2.56, + "grad_norm": 0.7323058247566223, + "learning_rate": 3.1052763423031714e-05, + "loss": 2.9365, + "step": 52271 + }, + { + "epoch": 2.56, + "grad_norm": 0.7168264389038086, + "learning_rate": 3.104594230456009e-05, + "loss": 3.0581, + "step": 52272 + }, + { + "epoch": 2.56, + "grad_norm": 0.7276723980903625, + "learning_rate": 3.1039121894461625e-05, + "loss": 2.7855, + "step": 52273 + }, + { + "epoch": 2.56, + "grad_norm": 0.7067778706550598, + "learning_rate": 3.103230219275441e-05, + "loss": 3.0388, + "step": 52274 + }, + { + "epoch": 2.56, + "grad_norm": 0.7861632704734802, + "learning_rate": 3.1025483199456327e-05, + "loss": 2.954, + "step": 52275 + }, + { + "epoch": 2.56, + "grad_norm": 0.6696493625640869, + "learning_rate": 3.1018664914585326e-05, + "loss": 2.9169, + "step": 52276 + }, + { + "epoch": 2.56, + "grad_norm": 0.7829317450523376, + "learning_rate": 3.1011847338159454e-05, + "loss": 2.9342, + "step": 52277 + }, + { + "epoch": 2.56, + "grad_norm": 0.7882497310638428, + "learning_rate": 3.1005030470196514e-05, + "loss": 2.8775, + "step": 52278 + }, + { + "epoch": 2.56, + "grad_norm": 0.7552138566970825, + "learning_rate": 3.099821431071464e-05, + "loss": 2.9103, + "step": 52279 + }, + { + "epoch": 2.56, + "grad_norm": 0.7698383927345276, + "learning_rate": 3.099139885973166e-05, + "loss": 2.8783, + "step": 52280 + }, + { + "epoch": 2.56, + "grad_norm": 0.782609760761261, + "learning_rate": 3.09845841172656e-05, + "loss": 2.975, + "step": 52281 + }, + { + "epoch": 2.56, + "grad_norm": 0.7387914061546326, + "learning_rate": 3.0977770083334366e-05, + "loss": 2.8243, + "step": 52282 + }, + { + "epoch": 2.56, + "grad_norm": 0.7554448843002319, + "learning_rate": 3.097095675795588e-05, + "loss": 2.9946, + "step": 52283 + }, + { + "epoch": 2.56, + "grad_norm": 0.7315139770507812, + "learning_rate": 3.096414414114813e-05, + "loss": 2.9418, + "step": 52284 + }, + { + "epoch": 2.56, + "grad_norm": 0.7560511827468872, + "learning_rate": 3.0957332232929036e-05, + "loss": 3.0498, + "step": 52285 + }, + { + "epoch": 2.56, + "grad_norm": 0.7678272724151611, + "learning_rate": 3.095052103331651e-05, + "loss": 2.7948, + "step": 52286 + }, + { + "epoch": 2.56, + "grad_norm": 0.7785819172859192, + "learning_rate": 3.0943710542328613e-05, + "loss": 2.7721, + "step": 52287 + }, + { + "epoch": 2.56, + "grad_norm": 0.7176530361175537, + "learning_rate": 3.093690075998323e-05, + "loss": 2.9313, + "step": 52288 + }, + { + "epoch": 2.56, + "grad_norm": 0.7264245748519897, + "learning_rate": 3.0930091686298206e-05, + "loss": 2.7374, + "step": 52289 + }, + { + "epoch": 2.56, + "grad_norm": 0.7617298364639282, + "learning_rate": 3.092328332129154e-05, + "loss": 2.8061, + "step": 52290 + }, + { + "epoch": 2.56, + "grad_norm": 0.7552651166915894, + "learning_rate": 3.0916475664981165e-05, + "loss": 2.9829, + "step": 52291 + }, + { + "epoch": 2.56, + "grad_norm": 0.7785048484802246, + "learning_rate": 3.0909668717384986e-05, + "loss": 2.8478, + "step": 52292 + }, + { + "epoch": 2.56, + "grad_norm": 0.719987154006958, + "learning_rate": 3.090286247852094e-05, + "loss": 2.8228, + "step": 52293 + }, + { + "epoch": 2.56, + "grad_norm": 0.735836386680603, + "learning_rate": 3.089605694840702e-05, + "loss": 2.9333, + "step": 52294 + }, + { + "epoch": 2.56, + "grad_norm": 0.763145923614502, + "learning_rate": 3.0889252127061045e-05, + "loss": 3.0371, + "step": 52295 + }, + { + "epoch": 2.56, + "grad_norm": 0.7010157108306885, + "learning_rate": 3.088244801450104e-05, + "loss": 3.0665, + "step": 52296 + }, + { + "epoch": 2.56, + "grad_norm": 0.7367874979972839, + "learning_rate": 3.0875644610744873e-05, + "loss": 2.93, + "step": 52297 + }, + { + "epoch": 2.56, + "grad_norm": 0.7362725138664246, + "learning_rate": 3.0868841915810414e-05, + "loss": 2.8234, + "step": 52298 + }, + { + "epoch": 2.56, + "grad_norm": 0.6777426600456238, + "learning_rate": 3.086203992971568e-05, + "loss": 2.9562, + "step": 52299 + }, + { + "epoch": 2.56, + "grad_norm": 0.7489884495735168, + "learning_rate": 3.085523865247851e-05, + "loss": 2.8091, + "step": 52300 + }, + { + "epoch": 2.56, + "grad_norm": 0.7176619172096252, + "learning_rate": 3.084843808411687e-05, + "loss": 3.1135, + "step": 52301 + }, + { + "epoch": 2.56, + "grad_norm": 0.7485775351524353, + "learning_rate": 3.084163822464857e-05, + "loss": 2.8253, + "step": 52302 + }, + { + "epoch": 2.56, + "grad_norm": 0.7570214867591858, + "learning_rate": 3.08348390740917e-05, + "loss": 2.7535, + "step": 52303 + }, + { + "epoch": 2.56, + "grad_norm": 0.7418150305747986, + "learning_rate": 3.082804063246401e-05, + "loss": 3.0901, + "step": 52304 + }, + { + "epoch": 2.56, + "grad_norm": 0.6917791962623596, + "learning_rate": 3.0821242899783425e-05, + "loss": 2.8208, + "step": 52305 + }, + { + "epoch": 2.56, + "grad_norm": 0.7729413509368896, + "learning_rate": 3.081444587606796e-05, + "loss": 2.76, + "step": 52306 + }, + { + "epoch": 2.56, + "grad_norm": 0.7332080006599426, + "learning_rate": 3.0807649561335335e-05, + "loss": 3.0991, + "step": 52307 + }, + { + "epoch": 2.56, + "grad_norm": 0.7556661367416382, + "learning_rate": 3.08008539556036e-05, + "loss": 2.9618, + "step": 52308 + }, + { + "epoch": 2.56, + "grad_norm": 0.7616263031959534, + "learning_rate": 3.079405905889064e-05, + "loss": 2.7836, + "step": 52309 + }, + { + "epoch": 2.56, + "grad_norm": 0.7406206727027893, + "learning_rate": 3.078726487121431e-05, + "loss": 2.9042, + "step": 52310 + }, + { + "epoch": 2.56, + "grad_norm": 0.7701447010040283, + "learning_rate": 3.078047139259253e-05, + "loss": 3.0468, + "step": 52311 + }, + { + "epoch": 2.56, + "grad_norm": 0.7630977630615234, + "learning_rate": 3.077367862304312e-05, + "loss": 2.8137, + "step": 52312 + }, + { + "epoch": 2.56, + "grad_norm": 0.7166601419448853, + "learning_rate": 3.0766886562584024e-05, + "loss": 2.9846, + "step": 52313 + }, + { + "epoch": 2.56, + "grad_norm": 0.7290651202201843, + "learning_rate": 3.07600952112332e-05, + "loss": 2.9109, + "step": 52314 + }, + { + "epoch": 2.56, + "grad_norm": 0.7534510493278503, + "learning_rate": 3.075330456900841e-05, + "loss": 3.0675, + "step": 52315 + }, + { + "epoch": 2.56, + "grad_norm": 0.7178420424461365, + "learning_rate": 3.074651463592762e-05, + "loss": 2.9887, + "step": 52316 + }, + { + "epoch": 2.56, + "grad_norm": 0.7508209347724915, + "learning_rate": 3.0739725412008665e-05, + "loss": 2.9242, + "step": 52317 + }, + { + "epoch": 2.56, + "grad_norm": 0.7520042061805725, + "learning_rate": 3.073293689726952e-05, + "loss": 2.9403, + "step": 52318 + }, + { + "epoch": 2.56, + "grad_norm": 0.7215971946716309, + "learning_rate": 3.072614909172798e-05, + "loss": 3.038, + "step": 52319 + }, + { + "epoch": 2.56, + "grad_norm": 0.761830747127533, + "learning_rate": 3.071936199540189e-05, + "loss": 2.9039, + "step": 52320 + }, + { + "epoch": 2.56, + "grad_norm": 0.7258598208427429, + "learning_rate": 3.0712575608309206e-05, + "loss": 2.8936, + "step": 52321 + }, + { + "epoch": 2.56, + "grad_norm": 0.7424689531326294, + "learning_rate": 3.070578993046775e-05, + "loss": 2.8634, + "step": 52322 + }, + { + "epoch": 2.56, + "grad_norm": 0.7346766591072083, + "learning_rate": 3.06990049618954e-05, + "loss": 3.0474, + "step": 52323 + }, + { + "epoch": 2.56, + "grad_norm": 0.7245206236839294, + "learning_rate": 3.0692220702610115e-05, + "loss": 2.8944, + "step": 52324 + }, + { + "epoch": 2.56, + "grad_norm": 0.7064627408981323, + "learning_rate": 3.0685437152629676e-05, + "loss": 2.9548, + "step": 52325 + }, + { + "epoch": 2.56, + "grad_norm": 0.7573798894882202, + "learning_rate": 3.0678654311971974e-05, + "loss": 2.772, + "step": 52326 + }, + { + "epoch": 2.56, + "grad_norm": 0.6997696161270142, + "learning_rate": 3.0671872180654825e-05, + "loss": 2.7829, + "step": 52327 + }, + { + "epoch": 2.56, + "grad_norm": 0.7369567155838013, + "learning_rate": 3.066509075869611e-05, + "loss": 2.819, + "step": 52328 + }, + { + "epoch": 2.56, + "grad_norm": 0.7621482014656067, + "learning_rate": 3.065831004611375e-05, + "loss": 2.9502, + "step": 52329 + }, + { + "epoch": 2.56, + "grad_norm": 0.7581804394721985, + "learning_rate": 3.065153004292554e-05, + "loss": 2.7999, + "step": 52330 + }, + { + "epoch": 2.56, + "grad_norm": 0.7452824711799622, + "learning_rate": 3.064475074914939e-05, + "loss": 3.0601, + "step": 52331 + }, + { + "epoch": 2.56, + "grad_norm": 0.7450490593910217, + "learning_rate": 3.063797216480316e-05, + "loss": 2.7313, + "step": 52332 + }, + { + "epoch": 2.56, + "grad_norm": 0.7387819290161133, + "learning_rate": 3.063119428990459e-05, + "loss": 3.134, + "step": 52333 + }, + { + "epoch": 2.56, + "grad_norm": 0.7497106790542603, + "learning_rate": 3.062441712447171e-05, + "loss": 2.9456, + "step": 52334 + }, + { + "epoch": 2.56, + "grad_norm": 0.7112284898757935, + "learning_rate": 3.061764066852217e-05, + "loss": 2.6568, + "step": 52335 + }, + { + "epoch": 2.56, + "grad_norm": 0.753256618976593, + "learning_rate": 3.0610864922073985e-05, + "loss": 2.8336, + "step": 52336 + }, + { + "epoch": 2.56, + "grad_norm": 0.6917237639427185, + "learning_rate": 3.0604089885144914e-05, + "loss": 2.8997, + "step": 52337 + }, + { + "epoch": 2.56, + "grad_norm": 0.7741192579269409, + "learning_rate": 3.05973155577528e-05, + "loss": 2.942, + "step": 52338 + }, + { + "epoch": 2.57, + "grad_norm": 0.7585358619689941, + "learning_rate": 3.059054193991557e-05, + "loss": 2.755, + "step": 52339 + }, + { + "epoch": 2.57, + "grad_norm": 0.7429476380348206, + "learning_rate": 3.058376903165097e-05, + "loss": 3.0445, + "step": 52340 + }, + { + "epoch": 2.57, + "grad_norm": 0.7957443594932556, + "learning_rate": 3.057699683297693e-05, + "loss": 3.0225, + "step": 52341 + }, + { + "epoch": 2.57, + "grad_norm": 0.7200005650520325, + "learning_rate": 3.057022534391113e-05, + "loss": 3.0441, + "step": 52342 + }, + { + "epoch": 2.57, + "grad_norm": 0.7461026310920715, + "learning_rate": 3.056345456447151e-05, + "loss": 2.8421, + "step": 52343 + }, + { + "epoch": 2.57, + "grad_norm": 0.7428950071334839, + "learning_rate": 3.055668449467598e-05, + "loss": 2.6936, + "step": 52344 + }, + { + "epoch": 2.57, + "grad_norm": 0.7124583721160889, + "learning_rate": 3.05499151345422e-05, + "loss": 3.1548, + "step": 52345 + }, + { + "epoch": 2.57, + "grad_norm": 0.7571352124214172, + "learning_rate": 3.054314648408815e-05, + "loss": 2.9257, + "step": 52346 + }, + { + "epoch": 2.57, + "grad_norm": 0.7435238361358643, + "learning_rate": 3.0536378543331616e-05, + "loss": 2.7556, + "step": 52347 + }, + { + "epoch": 2.57, + "grad_norm": 0.733097493648529, + "learning_rate": 3.052961131229032e-05, + "loss": 2.807, + "step": 52348 + }, + { + "epoch": 2.57, + "grad_norm": 0.7678351998329163, + "learning_rate": 3.052284479098225e-05, + "loss": 2.7884, + "step": 52349 + }, + { + "epoch": 2.57, + "grad_norm": 0.7466233372688293, + "learning_rate": 3.051607897942504e-05, + "loss": 2.9157, + "step": 52350 + }, + { + "epoch": 2.57, + "grad_norm": 0.7528746724128723, + "learning_rate": 3.050931387763671e-05, + "loss": 2.8802, + "step": 52351 + }, + { + "epoch": 2.57, + "grad_norm": 0.7836788296699524, + "learning_rate": 3.0502549485634908e-05, + "loss": 2.6929, + "step": 52352 + }, + { + "epoch": 2.57, + "grad_norm": 0.7055717706680298, + "learning_rate": 3.0495785803437557e-05, + "loss": 2.9488, + "step": 52353 + }, + { + "epoch": 2.57, + "grad_norm": 0.7400059103965759, + "learning_rate": 3.0489022831062472e-05, + "loss": 2.755, + "step": 52354 + }, + { + "epoch": 2.57, + "grad_norm": 0.7582606077194214, + "learning_rate": 3.048226056852744e-05, + "loss": 2.8012, + "step": 52355 + }, + { + "epoch": 2.57, + "grad_norm": 0.751268208026886, + "learning_rate": 3.0475499015850247e-05, + "loss": 2.6739, + "step": 52356 + }, + { + "epoch": 2.57, + "grad_norm": 0.741468071937561, + "learning_rate": 3.0468738173048678e-05, + "loss": 2.9144, + "step": 52357 + }, + { + "epoch": 2.57, + "grad_norm": 0.6897956132888794, + "learning_rate": 3.0461978040140557e-05, + "loss": 3.1279, + "step": 52358 + }, + { + "epoch": 2.57, + "grad_norm": 0.7293424606323242, + "learning_rate": 3.0455218617143796e-05, + "loss": 3.0598, + "step": 52359 + }, + { + "epoch": 2.57, + "grad_norm": 0.8046936988830566, + "learning_rate": 3.044845990407605e-05, + "loss": 2.8674, + "step": 52360 + }, + { + "epoch": 2.57, + "grad_norm": 0.7003205418586731, + "learning_rate": 3.044170190095524e-05, + "loss": 2.8303, + "step": 52361 + }, + { + "epoch": 2.57, + "grad_norm": 0.7055088877677917, + "learning_rate": 3.0434944607799116e-05, + "loss": 2.8384, + "step": 52362 + }, + { + "epoch": 2.57, + "grad_norm": 0.7319667935371399, + "learning_rate": 3.04281880246254e-05, + "loss": 2.9059, + "step": 52363 + }, + { + "epoch": 2.57, + "grad_norm": 0.7617000937461853, + "learning_rate": 3.042143215145201e-05, + "loss": 2.8728, + "step": 52364 + }, + { + "epoch": 2.57, + "grad_norm": 0.7651677131652832, + "learning_rate": 3.041467698829666e-05, + "loss": 2.963, + "step": 52365 + }, + { + "epoch": 2.57, + "grad_norm": 0.7394811511039734, + "learning_rate": 3.0407922535177176e-05, + "loss": 2.7771, + "step": 52366 + }, + { + "epoch": 2.57, + "grad_norm": 0.7691500782966614, + "learning_rate": 3.040116879211131e-05, + "loss": 2.866, + "step": 52367 + }, + { + "epoch": 2.57, + "grad_norm": 0.7116948962211609, + "learning_rate": 3.039441575911694e-05, + "loss": 2.9602, + "step": 52368 + }, + { + "epoch": 2.57, + "grad_norm": 0.7746070623397827, + "learning_rate": 3.0387663436211796e-05, + "loss": 2.8734, + "step": 52369 + }, + { + "epoch": 2.57, + "grad_norm": 0.743488073348999, + "learning_rate": 3.038091182341359e-05, + "loss": 2.7054, + "step": 52370 + }, + { + "epoch": 2.57, + "grad_norm": 0.7217757105827332, + "learning_rate": 3.0374160920740242e-05, + "loss": 2.8834, + "step": 52371 + }, + { + "epoch": 2.57, + "grad_norm": 0.7398301362991333, + "learning_rate": 3.0367410728209408e-05, + "loss": 3.0355, + "step": 52372 + }, + { + "epoch": 2.57, + "grad_norm": 0.7479260563850403, + "learning_rate": 3.036066124583897e-05, + "loss": 2.9872, + "step": 52373 + }, + { + "epoch": 2.57, + "grad_norm": 0.7478479743003845, + "learning_rate": 3.0353912473646615e-05, + "loss": 2.9438, + "step": 52374 + }, + { + "epoch": 2.57, + "grad_norm": 0.7435922026634216, + "learning_rate": 3.0347164411650126e-05, + "loss": 2.8484, + "step": 52375 + }, + { + "epoch": 2.57, + "grad_norm": 0.7409965991973877, + "learning_rate": 3.0340417059867396e-05, + "loss": 2.9877, + "step": 52376 + }, + { + "epoch": 2.57, + "grad_norm": 0.748325526714325, + "learning_rate": 3.0333670418316103e-05, + "loss": 2.9524, + "step": 52377 + }, + { + "epoch": 2.57, + "grad_norm": 0.7367417216300964, + "learning_rate": 3.0326924487014005e-05, + "loss": 2.951, + "step": 52378 + }, + { + "epoch": 2.57, + "grad_norm": 0.753197431564331, + "learning_rate": 3.032017926597885e-05, + "loss": 3.1165, + "step": 52379 + }, + { + "epoch": 2.57, + "grad_norm": 0.7246016263961792, + "learning_rate": 3.031343475522846e-05, + "loss": 2.8368, + "step": 52380 + }, + { + "epoch": 2.57, + "grad_norm": 0.7062520980834961, + "learning_rate": 3.0306690954780622e-05, + "loss": 2.9998, + "step": 52381 + }, + { + "epoch": 2.57, + "grad_norm": 0.73405522108078, + "learning_rate": 3.0299947864652984e-05, + "loss": 2.8911, + "step": 52382 + }, + { + "epoch": 2.57, + "grad_norm": 0.7310712337493896, + "learning_rate": 3.0293205484863435e-05, + "loss": 2.8359, + "step": 52383 + }, + { + "epoch": 2.57, + "grad_norm": 0.759092390537262, + "learning_rate": 3.0286463815429695e-05, + "loss": 3.0487, + "step": 52384 + }, + { + "epoch": 2.57, + "grad_norm": 0.7694963812828064, + "learning_rate": 3.0279722856369447e-05, + "loss": 2.7858, + "step": 52385 + }, + { + "epoch": 2.57, + "grad_norm": 0.7476155161857605, + "learning_rate": 3.0272982607700546e-05, + "loss": 3.0141, + "step": 52386 + }, + { + "epoch": 2.57, + "grad_norm": 0.7267019152641296, + "learning_rate": 3.0266243069440645e-05, + "loss": 2.9373, + "step": 52387 + }, + { + "epoch": 2.57, + "grad_norm": 0.7420546412467957, + "learning_rate": 3.0259504241607625e-05, + "loss": 3.0561, + "step": 52388 + }, + { + "epoch": 2.57, + "grad_norm": 0.7914209961891174, + "learning_rate": 3.0252766124219076e-05, + "loss": 3.0165, + "step": 52389 + }, + { + "epoch": 2.57, + "grad_norm": 0.7075768113136292, + "learning_rate": 3.024602871729288e-05, + "loss": 3.1116, + "step": 52390 + }, + { + "epoch": 2.57, + "grad_norm": 0.7824947237968445, + "learning_rate": 3.023929202084676e-05, + "loss": 2.9569, + "step": 52391 + }, + { + "epoch": 2.57, + "grad_norm": 0.7449918985366821, + "learning_rate": 3.0232556034898336e-05, + "loss": 2.8483, + "step": 52392 + }, + { + "epoch": 2.57, + "grad_norm": 0.7132568359375, + "learning_rate": 3.0225820759465525e-05, + "loss": 2.9501, + "step": 52393 + }, + { + "epoch": 2.57, + "grad_norm": 0.8174427151679993, + "learning_rate": 3.0219086194565913e-05, + "loss": 2.925, + "step": 52394 + }, + { + "epoch": 2.57, + "grad_norm": 0.7686358690261841, + "learning_rate": 3.0212352340217283e-05, + "loss": 2.7436, + "step": 52395 + }, + { + "epoch": 2.57, + "grad_norm": 0.7603972554206848, + "learning_rate": 3.020561919643749e-05, + "loss": 2.8535, + "step": 52396 + }, + { + "epoch": 2.57, + "grad_norm": 0.7688196301460266, + "learning_rate": 3.0198886763244124e-05, + "loss": 2.8584, + "step": 52397 + }, + { + "epoch": 2.57, + "grad_norm": 0.7407582402229309, + "learning_rate": 3.0192155040655e-05, + "loss": 2.8672, + "step": 52398 + }, + { + "epoch": 2.57, + "grad_norm": 0.7440164685249329, + "learning_rate": 3.01854240286878e-05, + "loss": 2.8893, + "step": 52399 + }, + { + "epoch": 2.57, + "grad_norm": 0.768831193447113, + "learning_rate": 3.0178693727360247e-05, + "loss": 2.7461, + "step": 52400 + }, + { + "epoch": 2.57, + "grad_norm": 0.7303265333175659, + "learning_rate": 3.0171964136690098e-05, + "loss": 2.874, + "step": 52401 + }, + { + "epoch": 2.57, + "grad_norm": 0.7380855083465576, + "learning_rate": 3.016523525669503e-05, + "loss": 2.6449, + "step": 52402 + }, + { + "epoch": 2.57, + "grad_norm": 0.7118068933486938, + "learning_rate": 3.015850708739287e-05, + "loss": 2.978, + "step": 52403 + }, + { + "epoch": 2.57, + "grad_norm": 0.7120081782341003, + "learning_rate": 3.0151779628801198e-05, + "loss": 2.8128, + "step": 52404 + }, + { + "epoch": 2.57, + "grad_norm": 0.7590805292129517, + "learning_rate": 3.0145052880937903e-05, + "loss": 2.8198, + "step": 52405 + }, + { + "epoch": 2.57, + "grad_norm": 0.7395599484443665, + "learning_rate": 3.013832684382057e-05, + "loss": 2.7616, + "step": 52406 + }, + { + "epoch": 2.57, + "grad_norm": 0.7489631772041321, + "learning_rate": 3.0131601517466885e-05, + "loss": 2.9875, + "step": 52407 + }, + { + "epoch": 2.57, + "grad_norm": 0.7233737707138062, + "learning_rate": 3.012487690189467e-05, + "loss": 2.8304, + "step": 52408 + }, + { + "epoch": 2.57, + "grad_norm": 0.808874785900116, + "learning_rate": 3.011815299712157e-05, + "loss": 2.8516, + "step": 52409 + }, + { + "epoch": 2.57, + "grad_norm": 0.7949105501174927, + "learning_rate": 3.0111429803165277e-05, + "loss": 2.6781, + "step": 52410 + }, + { + "epoch": 2.57, + "grad_norm": 0.7281270027160645, + "learning_rate": 3.0104707320043643e-05, + "loss": 2.8587, + "step": 52411 + }, + { + "epoch": 2.57, + "grad_norm": 0.7180661559104919, + "learning_rate": 3.0097985547774216e-05, + "loss": 2.7965, + "step": 52412 + }, + { + "epoch": 2.57, + "grad_norm": 0.7658478021621704, + "learning_rate": 3.0091264486374788e-05, + "loss": 2.6224, + "step": 52413 + }, + { + "epoch": 2.57, + "grad_norm": 0.7765020132064819, + "learning_rate": 3.0084544135862943e-05, + "loss": 3.065, + "step": 52414 + }, + { + "epoch": 2.57, + "grad_norm": 0.7255092263221741, + "learning_rate": 3.0077824496256496e-05, + "loss": 2.8975, + "step": 52415 + }, + { + "epoch": 2.57, + "grad_norm": 0.7271882891654968, + "learning_rate": 3.007110556757314e-05, + "loss": 2.7939, + "step": 52416 + }, + { + "epoch": 2.57, + "grad_norm": 0.7662795782089233, + "learning_rate": 3.0064387349830486e-05, + "loss": 2.9394, + "step": 52417 + }, + { + "epoch": 2.57, + "grad_norm": 0.7720481753349304, + "learning_rate": 3.005766984304636e-05, + "loss": 3.0693, + "step": 52418 + }, + { + "epoch": 2.57, + "grad_norm": 0.7715625762939453, + "learning_rate": 3.005095304723831e-05, + "loss": 2.8276, + "step": 52419 + }, + { + "epoch": 2.57, + "grad_norm": 0.7382410764694214, + "learning_rate": 3.0044236962424162e-05, + "loss": 2.9642, + "step": 52420 + }, + { + "epoch": 2.57, + "grad_norm": 0.7307043075561523, + "learning_rate": 3.0037521588621527e-05, + "loss": 2.7719, + "step": 52421 + }, + { + "epoch": 2.57, + "grad_norm": 0.7969446182250977, + "learning_rate": 3.0030806925848062e-05, + "loss": 2.9994, + "step": 52422 + }, + { + "epoch": 2.57, + "grad_norm": 0.7565550208091736, + "learning_rate": 3.0024092974121517e-05, + "loss": 2.9169, + "step": 52423 + }, + { + "epoch": 2.57, + "grad_norm": 0.7802337408065796, + "learning_rate": 3.0017379733459545e-05, + "loss": 2.9471, + "step": 52424 + }, + { + "epoch": 2.57, + "grad_norm": 0.7368726134300232, + "learning_rate": 3.00106672038798e-05, + "loss": 2.9638, + "step": 52425 + }, + { + "epoch": 2.57, + "grad_norm": 0.7419968247413635, + "learning_rate": 3.000395538540007e-05, + "loss": 2.9303, + "step": 52426 + }, + { + "epoch": 2.57, + "grad_norm": 0.7627372741699219, + "learning_rate": 2.9997244278037967e-05, + "loss": 3.1326, + "step": 52427 + }, + { + "epoch": 2.57, + "grad_norm": 0.7172695994377136, + "learning_rate": 2.999053388181115e-05, + "loss": 2.9077, + "step": 52428 + }, + { + "epoch": 2.57, + "grad_norm": 0.7625597715377808, + "learning_rate": 2.9983824196737237e-05, + "loss": 2.6087, + "step": 52429 + }, + { + "epoch": 2.57, + "grad_norm": 0.7408584952354431, + "learning_rate": 2.997711522283398e-05, + "loss": 3.0352, + "step": 52430 + }, + { + "epoch": 2.57, + "grad_norm": 0.7761721611022949, + "learning_rate": 2.9970406960119096e-05, + "loss": 2.7375, + "step": 52431 + }, + { + "epoch": 2.57, + "grad_norm": 0.7523345351219177, + "learning_rate": 2.996369940861011e-05, + "loss": 3.0537, + "step": 52432 + }, + { + "epoch": 2.57, + "grad_norm": 0.7538020610809326, + "learning_rate": 2.9956992568324833e-05, + "loss": 2.8131, + "step": 52433 + }, + { + "epoch": 2.57, + "grad_norm": 0.7494543790817261, + "learning_rate": 2.995028643928089e-05, + "loss": 2.9724, + "step": 52434 + }, + { + "epoch": 2.57, + "grad_norm": 0.8455016016960144, + "learning_rate": 2.9943581021495833e-05, + "loss": 2.8156, + "step": 52435 + }, + { + "epoch": 2.57, + "grad_norm": 0.7785493731498718, + "learning_rate": 2.9936876314987513e-05, + "loss": 2.874, + "step": 52436 + }, + { + "epoch": 2.57, + "grad_norm": 0.740427553653717, + "learning_rate": 2.9930172319773383e-05, + "loss": 2.8312, + "step": 52437 + }, + { + "epoch": 2.57, + "grad_norm": 0.7532118558883667, + "learning_rate": 2.9923469035871294e-05, + "loss": 2.6936, + "step": 52438 + }, + { + "epoch": 2.57, + "grad_norm": 0.7429023385047913, + "learning_rate": 2.9916766463298736e-05, + "loss": 2.7284, + "step": 52439 + }, + { + "epoch": 2.57, + "grad_norm": 0.7911916971206665, + "learning_rate": 2.9910064602073426e-05, + "loss": 2.9946, + "step": 52440 + }, + { + "epoch": 2.57, + "grad_norm": 0.7958114743232727, + "learning_rate": 2.990336345221308e-05, + "loss": 2.8705, + "step": 52441 + }, + { + "epoch": 2.57, + "grad_norm": 0.7775363326072693, + "learning_rate": 2.9896663013735324e-05, + "loss": 2.8921, + "step": 52442 + }, + { + "epoch": 2.57, + "grad_norm": 0.7311971187591553, + "learning_rate": 2.9889963286657737e-05, + "loss": 2.9018, + "step": 52443 + }, + { + "epoch": 2.57, + "grad_norm": 0.7621795535087585, + "learning_rate": 2.9883264270997977e-05, + "loss": 3.0524, + "step": 52444 + }, + { + "epoch": 2.57, + "grad_norm": 0.7280380725860596, + "learning_rate": 2.987656596677376e-05, + "loss": 2.9503, + "step": 52445 + }, + { + "epoch": 2.57, + "grad_norm": 0.7853645086288452, + "learning_rate": 2.9869868374002637e-05, + "loss": 2.7609, + "step": 52446 + }, + { + "epoch": 2.57, + "grad_norm": 0.7341321706771851, + "learning_rate": 2.9863171492702263e-05, + "loss": 2.7723, + "step": 52447 + }, + { + "epoch": 2.57, + "grad_norm": 0.7179156541824341, + "learning_rate": 2.9856475322890396e-05, + "loss": 2.89, + "step": 52448 + }, + { + "epoch": 2.57, + "grad_norm": 0.7569622993469238, + "learning_rate": 2.9849779864584577e-05, + "loss": 2.9834, + "step": 52449 + }, + { + "epoch": 2.57, + "grad_norm": 0.7575824856758118, + "learning_rate": 2.9843085117802433e-05, + "loss": 2.9927, + "step": 52450 + }, + { + "epoch": 2.57, + "grad_norm": 0.7787556052207947, + "learning_rate": 2.983639108256155e-05, + "loss": 2.9165, + "step": 52451 + }, + { + "epoch": 2.57, + "grad_norm": 0.7417843341827393, + "learning_rate": 2.9829697758879645e-05, + "loss": 3.0732, + "step": 52452 + }, + { + "epoch": 2.57, + "grad_norm": 0.756346583366394, + "learning_rate": 2.9823005146774336e-05, + "loss": 3.1536, + "step": 52453 + }, + { + "epoch": 2.57, + "grad_norm": 0.7447629570960999, + "learning_rate": 2.981631324626321e-05, + "loss": 2.9677, + "step": 52454 + }, + { + "epoch": 2.57, + "grad_norm": 0.7455997467041016, + "learning_rate": 2.9809622057363992e-05, + "loss": 2.8583, + "step": 52455 + }, + { + "epoch": 2.57, + "grad_norm": 0.7502301335334778, + "learning_rate": 2.9802931580094126e-05, + "loss": 2.9962, + "step": 52456 + }, + { + "epoch": 2.57, + "grad_norm": 0.7398320436477661, + "learning_rate": 2.9796241814471432e-05, + "loss": 3.0755, + "step": 52457 + }, + { + "epoch": 2.57, + "grad_norm": 0.8331220149993896, + "learning_rate": 2.9789552760513403e-05, + "loss": 2.8654, + "step": 52458 + }, + { + "epoch": 2.57, + "grad_norm": 0.711363673210144, + "learning_rate": 2.978286441823765e-05, + "loss": 3.0171, + "step": 52459 + }, + { + "epoch": 2.57, + "grad_norm": 0.7725111246109009, + "learning_rate": 2.97761767876619e-05, + "loss": 2.9687, + "step": 52460 + }, + { + "epoch": 2.57, + "grad_norm": 0.7222541570663452, + "learning_rate": 2.9769489868803598e-05, + "loss": 2.7685, + "step": 52461 + }, + { + "epoch": 2.57, + "grad_norm": 0.7226037979125977, + "learning_rate": 2.9762803661680502e-05, + "loss": 2.8108, + "step": 52462 + }, + { + "epoch": 2.57, + "grad_norm": 0.7615007162094116, + "learning_rate": 2.9756118166310193e-05, + "loss": 2.9072, + "step": 52463 + }, + { + "epoch": 2.57, + "grad_norm": 0.7283841967582703, + "learning_rate": 2.9749433382710265e-05, + "loss": 2.8891, + "step": 52464 + }, + { + "epoch": 2.57, + "grad_norm": 0.7138139605522156, + "learning_rate": 2.9742749310898327e-05, + "loss": 2.6945, + "step": 52465 + }, + { + "epoch": 2.57, + "grad_norm": 0.7099862694740295, + "learning_rate": 2.9736065950891908e-05, + "loss": 2.9704, + "step": 52466 + }, + { + "epoch": 2.57, + "grad_norm": 0.6979156136512756, + "learning_rate": 2.9729383302708688e-05, + "loss": 2.8211, + "step": 52467 + }, + { + "epoch": 2.57, + "grad_norm": 0.734841525554657, + "learning_rate": 2.972270136636632e-05, + "loss": 3.0207, + "step": 52468 + }, + { + "epoch": 2.57, + "grad_norm": 0.7860302329063416, + "learning_rate": 2.971602014188229e-05, + "loss": 2.7577, + "step": 52469 + }, + { + "epoch": 2.57, + "grad_norm": 0.7355887293815613, + "learning_rate": 2.9709339629274285e-05, + "loss": 3.0183, + "step": 52470 + }, + { + "epoch": 2.57, + "grad_norm": 0.737175464630127, + "learning_rate": 2.970265982855986e-05, + "loss": 2.9608, + "step": 52471 + }, + { + "epoch": 2.57, + "grad_norm": 0.7097364664077759, + "learning_rate": 2.9695980739756564e-05, + "loss": 2.9803, + "step": 52472 + }, + { + "epoch": 2.57, + "grad_norm": 0.7723646759986877, + "learning_rate": 2.9689302362882084e-05, + "loss": 2.8259, + "step": 52473 + }, + { + "epoch": 2.57, + "grad_norm": 0.7352739572525024, + "learning_rate": 2.9682624697953873e-05, + "loss": 2.9392, + "step": 52474 + }, + { + "epoch": 2.57, + "grad_norm": 0.7245145440101624, + "learning_rate": 2.9675947744989716e-05, + "loss": 2.7403, + "step": 52475 + }, + { + "epoch": 2.57, + "grad_norm": 0.7283601760864258, + "learning_rate": 2.9669271504006997e-05, + "loss": 2.7105, + "step": 52476 + }, + { + "epoch": 2.57, + "grad_norm": 0.7669225335121155, + "learning_rate": 2.9662595975023406e-05, + "loss": 2.8408, + "step": 52477 + }, + { + "epoch": 2.57, + "grad_norm": 0.7420752048492432, + "learning_rate": 2.965592115805656e-05, + "loss": 2.8891, + "step": 52478 + }, + { + "epoch": 2.57, + "grad_norm": 0.7681543827056885, + "learning_rate": 2.964924705312398e-05, + "loss": 2.959, + "step": 52479 + }, + { + "epoch": 2.57, + "grad_norm": 0.8160313367843628, + "learning_rate": 2.9642573660243252e-05, + "loss": 3.0549, + "step": 52480 + }, + { + "epoch": 2.57, + "grad_norm": 0.7001559138298035, + "learning_rate": 2.9635900979431927e-05, + "loss": 2.8459, + "step": 52481 + }, + { + "epoch": 2.57, + "grad_norm": 0.7368727326393127, + "learning_rate": 2.9629229010707588e-05, + "loss": 2.7764, + "step": 52482 + }, + { + "epoch": 2.57, + "grad_norm": 0.7512417435646057, + "learning_rate": 2.962255775408786e-05, + "loss": 2.8818, + "step": 52483 + }, + { + "epoch": 2.57, + "grad_norm": 0.7434666752815247, + "learning_rate": 2.9615887209590263e-05, + "loss": 2.8293, + "step": 52484 + }, + { + "epoch": 2.57, + "grad_norm": 0.7331339120864868, + "learning_rate": 2.960921737723241e-05, + "loss": 2.8604, + "step": 52485 + }, + { + "epoch": 2.57, + "grad_norm": 0.7475157976150513, + "learning_rate": 2.9602548257031854e-05, + "loss": 2.8809, + "step": 52486 + }, + { + "epoch": 2.57, + "grad_norm": 0.7474045753479004, + "learning_rate": 2.959587984900609e-05, + "loss": 3.0424, + "step": 52487 + }, + { + "epoch": 2.57, + "grad_norm": 0.7134113311767578, + "learning_rate": 2.9589212153172794e-05, + "loss": 2.8692, + "step": 52488 + }, + { + "epoch": 2.57, + "grad_norm": 0.7441120147705078, + "learning_rate": 2.958254516954939e-05, + "loss": 2.7825, + "step": 52489 + }, + { + "epoch": 2.57, + "grad_norm": 0.7090118527412415, + "learning_rate": 2.9575878898153594e-05, + "loss": 2.9669, + "step": 52490 + }, + { + "epoch": 2.57, + "grad_norm": 0.7831944227218628, + "learning_rate": 2.956921333900286e-05, + "loss": 2.934, + "step": 52491 + }, + { + "epoch": 2.57, + "grad_norm": 0.7588694095611572, + "learning_rate": 2.9562548492114814e-05, + "loss": 2.9626, + "step": 52492 + }, + { + "epoch": 2.57, + "grad_norm": 0.7268178462982178, + "learning_rate": 2.955588435750693e-05, + "loss": 2.8937, + "step": 52493 + }, + { + "epoch": 2.57, + "grad_norm": 0.7331435084342957, + "learning_rate": 2.954922093519677e-05, + "loss": 2.879, + "step": 52494 + }, + { + "epoch": 2.57, + "grad_norm": 0.7241976261138916, + "learning_rate": 2.9542558225201984e-05, + "loss": 2.7338, + "step": 52495 + }, + { + "epoch": 2.57, + "grad_norm": 0.7318156957626343, + "learning_rate": 2.953589622753999e-05, + "loss": 2.8821, + "step": 52496 + }, + { + "epoch": 2.57, + "grad_norm": 0.7937421202659607, + "learning_rate": 2.9529234942228376e-05, + "loss": 2.8645, + "step": 52497 + }, + { + "epoch": 2.57, + "grad_norm": 0.7748422622680664, + "learning_rate": 2.952257436928476e-05, + "loss": 2.8881, + "step": 52498 + }, + { + "epoch": 2.57, + "grad_norm": 0.7224046587944031, + "learning_rate": 2.951591450872659e-05, + "loss": 2.9246, + "step": 52499 + }, + { + "epoch": 2.57, + "grad_norm": 0.7512742877006531, + "learning_rate": 2.9509255360571493e-05, + "loss": 2.9546, + "step": 52500 + }, + { + "epoch": 2.57, + "grad_norm": 0.7295030951499939, + "learning_rate": 2.9502596924836952e-05, + "loss": 2.9228, + "step": 52501 + }, + { + "epoch": 2.57, + "grad_norm": 0.7399193644523621, + "learning_rate": 2.949593920154045e-05, + "loss": 2.7575, + "step": 52502 + }, + { + "epoch": 2.57, + "grad_norm": 0.7585479021072388, + "learning_rate": 2.948928219069968e-05, + "loss": 2.8837, + "step": 52503 + }, + { + "epoch": 2.57, + "grad_norm": 0.8097811341285706, + "learning_rate": 2.948262589233199e-05, + "loss": 2.9717, + "step": 52504 + }, + { + "epoch": 2.57, + "grad_norm": 0.7251771688461304, + "learning_rate": 2.9475970306455065e-05, + "loss": 2.9217, + "step": 52505 + }, + { + "epoch": 2.57, + "grad_norm": 0.7347126603126526, + "learning_rate": 2.9469315433086327e-05, + "loss": 2.8441, + "step": 52506 + }, + { + "epoch": 2.57, + "grad_norm": 0.7708790302276611, + "learning_rate": 2.9462661272243394e-05, + "loss": 2.7953, + "step": 52507 + }, + { + "epoch": 2.57, + "grad_norm": 0.7497082948684692, + "learning_rate": 2.9456007823943716e-05, + "loss": 2.8825, + "step": 52508 + }, + { + "epoch": 2.57, + "grad_norm": 0.7288712859153748, + "learning_rate": 2.9449355088204817e-05, + "loss": 2.8716, + "step": 52509 + }, + { + "epoch": 2.57, + "grad_norm": 0.745967447757721, + "learning_rate": 2.9442703065044314e-05, + "loss": 2.9418, + "step": 52510 + }, + { + "epoch": 2.57, + "grad_norm": 0.7650946378707886, + "learning_rate": 2.9436051754479595e-05, + "loss": 3.0354, + "step": 52511 + }, + { + "epoch": 2.57, + "grad_norm": 0.7825707793235779, + "learning_rate": 2.9429401156528242e-05, + "loss": 2.9016, + "step": 52512 + }, + { + "epoch": 2.57, + "grad_norm": 0.775109052658081, + "learning_rate": 2.9422751271207845e-05, + "loss": 2.8788, + "step": 52513 + }, + { + "epoch": 2.57, + "grad_norm": 0.7160129547119141, + "learning_rate": 2.941610209853582e-05, + "loss": 2.8551, + "step": 52514 + }, + { + "epoch": 2.57, + "grad_norm": 0.7564792037010193, + "learning_rate": 2.9409453638529755e-05, + "loss": 2.9526, + "step": 52515 + }, + { + "epoch": 2.57, + "grad_norm": 0.7617117166519165, + "learning_rate": 2.9402805891207006e-05, + "loss": 2.9431, + "step": 52516 + }, + { + "epoch": 2.57, + "grad_norm": 0.7508881688117981, + "learning_rate": 2.9396158856585216e-05, + "loss": 2.9826, + "step": 52517 + }, + { + "epoch": 2.57, + "grad_norm": 0.7413908839225769, + "learning_rate": 2.9389512534681914e-05, + "loss": 2.6746, + "step": 52518 + }, + { + "epoch": 2.57, + "grad_norm": 0.749812126159668, + "learning_rate": 2.938286692551448e-05, + "loss": 2.9919, + "step": 52519 + }, + { + "epoch": 2.57, + "grad_norm": 0.7722899317741394, + "learning_rate": 2.937622202910057e-05, + "loss": 3.0812, + "step": 52520 + }, + { + "epoch": 2.57, + "grad_norm": 0.7824468016624451, + "learning_rate": 2.936957784545757e-05, + "loss": 2.9144, + "step": 52521 + }, + { + "epoch": 2.57, + "grad_norm": 0.7449400424957275, + "learning_rate": 2.936293437460303e-05, + "loss": 3.1251, + "step": 52522 + }, + { + "epoch": 2.57, + "grad_norm": 0.7934387922286987, + "learning_rate": 2.9356291616554473e-05, + "loss": 2.9064, + "step": 52523 + }, + { + "epoch": 2.57, + "grad_norm": 0.7479174733161926, + "learning_rate": 2.934964957132928e-05, + "loss": 3.1097, + "step": 52524 + }, + { + "epoch": 2.57, + "grad_norm": 0.7839178442955017, + "learning_rate": 2.934300823894511e-05, + "loss": 2.9528, + "step": 52525 + }, + { + "epoch": 2.57, + "grad_norm": 0.7495297789573669, + "learning_rate": 2.9336367619419276e-05, + "loss": 2.9153, + "step": 52526 + }, + { + "epoch": 2.57, + "grad_norm": 0.7545944452285767, + "learning_rate": 2.9329727712769435e-05, + "loss": 2.9136, + "step": 52527 + }, + { + "epoch": 2.57, + "grad_norm": 0.7327878475189209, + "learning_rate": 2.9323088519012938e-05, + "loss": 3.0669, + "step": 52528 + }, + { + "epoch": 2.57, + "grad_norm": 0.7432766556739807, + "learning_rate": 2.9316450038167372e-05, + "loss": 3.0313, + "step": 52529 + }, + { + "epoch": 2.57, + "grad_norm": 0.7075245380401611, + "learning_rate": 2.930981227025022e-05, + "loss": 2.9219, + "step": 52530 + }, + { + "epoch": 2.57, + "grad_norm": 0.740532398223877, + "learning_rate": 2.930317521527884e-05, + "loss": 2.7855, + "step": 52531 + }, + { + "epoch": 2.57, + "grad_norm": 0.7468576431274414, + "learning_rate": 2.9296538873270847e-05, + "loss": 2.8628, + "step": 52532 + }, + { + "epoch": 2.57, + "grad_norm": 0.7329097390174866, + "learning_rate": 2.9289903244243662e-05, + "loss": 2.8902, + "step": 52533 + }, + { + "epoch": 2.57, + "grad_norm": 0.7266642451286316, + "learning_rate": 2.9283268328214737e-05, + "loss": 3.0384, + "step": 52534 + }, + { + "epoch": 2.57, + "grad_norm": 0.7914463877677917, + "learning_rate": 2.9276634125201627e-05, + "loss": 3.044, + "step": 52535 + }, + { + "epoch": 2.57, + "grad_norm": 0.7478669285774231, + "learning_rate": 2.927000063522178e-05, + "loss": 2.7363, + "step": 52536 + }, + { + "epoch": 2.57, + "grad_norm": 0.7697821855545044, + "learning_rate": 2.9263367858292653e-05, + "loss": 2.6884, + "step": 52537 + }, + { + "epoch": 2.57, + "grad_norm": 0.7177620530128479, + "learning_rate": 2.9256735794431662e-05, + "loss": 3.0663, + "step": 52538 + }, + { + "epoch": 2.57, + "grad_norm": 0.7495757937431335, + "learning_rate": 2.9250104443656295e-05, + "loss": 2.8574, + "step": 52539 + }, + { + "epoch": 2.57, + "grad_norm": 0.7898780703544617, + "learning_rate": 2.924347380598414e-05, + "loss": 3.1215, + "step": 52540 + }, + { + "epoch": 2.57, + "grad_norm": 0.7186577320098877, + "learning_rate": 2.9236843881432482e-05, + "loss": 2.8895, + "step": 52541 + }, + { + "epoch": 2.57, + "grad_norm": 0.7339335680007935, + "learning_rate": 2.9230214670018936e-05, + "loss": 3.0893, + "step": 52542 + }, + { + "epoch": 2.58, + "grad_norm": 0.7203998565673828, + "learning_rate": 2.9223586171760827e-05, + "loss": 2.7479, + "step": 52543 + }, + { + "epoch": 2.58, + "grad_norm": 0.7361055612564087, + "learning_rate": 2.9216958386675738e-05, + "loss": 2.9828, + "step": 52544 + }, + { + "epoch": 2.58, + "grad_norm": 0.7605053186416626, + "learning_rate": 2.9210331314781088e-05, + "loss": 2.8262, + "step": 52545 + }, + { + "epoch": 2.58, + "grad_norm": 0.7568164467811584, + "learning_rate": 2.920370495609423e-05, + "loss": 2.8678, + "step": 52546 + }, + { + "epoch": 2.58, + "grad_norm": 0.703392744064331, + "learning_rate": 2.9197079310632787e-05, + "loss": 2.7076, + "step": 52547 + }, + { + "epoch": 2.58, + "grad_norm": 0.7546159625053406, + "learning_rate": 2.9190454378414007e-05, + "loss": 3.0159, + "step": 52548 + }, + { + "epoch": 2.58, + "grad_norm": 0.7149426937103271, + "learning_rate": 2.9183830159455513e-05, + "loss": 2.7881, + "step": 52549 + }, + { + "epoch": 2.58, + "grad_norm": 0.7450437545776367, + "learning_rate": 2.9177206653774722e-05, + "loss": 2.8648, + "step": 52550 + }, + { + "epoch": 2.58, + "grad_norm": 0.7561696171760559, + "learning_rate": 2.917058386138902e-05, + "loss": 2.7633, + "step": 52551 + }, + { + "epoch": 2.58, + "grad_norm": 0.7118037939071655, + "learning_rate": 2.9163961782315924e-05, + "loss": 2.9854, + "step": 52552 + }, + { + "epoch": 2.58, + "grad_norm": 0.7305014133453369, + "learning_rate": 2.9157340416572727e-05, + "loss": 2.6679, + "step": 52553 + }, + { + "epoch": 2.58, + "grad_norm": 0.7449045181274414, + "learning_rate": 2.915071976417701e-05, + "loss": 2.8047, + "step": 52554 + }, + { + "epoch": 2.58, + "grad_norm": 0.7418058514595032, + "learning_rate": 2.9144099825146194e-05, + "loss": 2.8945, + "step": 52555 + }, + { + "epoch": 2.58, + "grad_norm": 0.7265969514846802, + "learning_rate": 2.9137480599497664e-05, + "loss": 3.0225, + "step": 52556 + }, + { + "epoch": 2.58, + "grad_norm": 0.7401600480079651, + "learning_rate": 2.913086208724894e-05, + "loss": 2.9199, + "step": 52557 + }, + { + "epoch": 2.58, + "grad_norm": 0.7653848528862, + "learning_rate": 2.912424428841731e-05, + "loss": 2.897, + "step": 52558 + }, + { + "epoch": 2.58, + "grad_norm": 0.7716215252876282, + "learning_rate": 2.9117627203020354e-05, + "loss": 3.0183, + "step": 52559 + }, + { + "epoch": 2.58, + "grad_norm": 0.7675553560256958, + "learning_rate": 2.9111010831075467e-05, + "loss": 2.9278, + "step": 52560 + }, + { + "epoch": 2.58, + "grad_norm": 0.8107246160507202, + "learning_rate": 2.910439517259996e-05, + "loss": 2.7924, + "step": 52561 + }, + { + "epoch": 2.58, + "grad_norm": 0.7437394261360168, + "learning_rate": 2.909778022761139e-05, + "loss": 2.9086, + "step": 52562 + }, + { + "epoch": 2.58, + "grad_norm": 0.7718828320503235, + "learning_rate": 2.909116599612711e-05, + "loss": 2.9937, + "step": 52563 + }, + { + "epoch": 2.58, + "grad_norm": 0.7451728582382202, + "learning_rate": 2.9084552478164537e-05, + "loss": 2.7001, + "step": 52564 + }, + { + "epoch": 2.58, + "grad_norm": 0.7395138144493103, + "learning_rate": 2.9077939673741158e-05, + "loss": 3.1498, + "step": 52565 + }, + { + "epoch": 2.58, + "grad_norm": 0.7220340967178345, + "learning_rate": 2.907132758287436e-05, + "loss": 3.1024, + "step": 52566 + }, + { + "epoch": 2.58, + "grad_norm": 0.7452659010887146, + "learning_rate": 2.9064716205581527e-05, + "loss": 3.0528, + "step": 52567 + }, + { + "epoch": 2.58, + "grad_norm": 0.7208335995674133, + "learning_rate": 2.905810554188005e-05, + "loss": 2.9171, + "step": 52568 + }, + { + "epoch": 2.58, + "grad_norm": 0.7047619223594666, + "learning_rate": 2.9051495591787376e-05, + "loss": 2.9405, + "step": 52569 + }, + { + "epoch": 2.58, + "grad_norm": 0.7219128608703613, + "learning_rate": 2.9044886355320996e-05, + "loss": 2.9738, + "step": 52570 + }, + { + "epoch": 2.58, + "grad_norm": 0.7547860145568848, + "learning_rate": 2.9038277832498157e-05, + "loss": 2.9313, + "step": 52571 + }, + { + "epoch": 2.58, + "grad_norm": 0.7108772397041321, + "learning_rate": 2.9031670023336418e-05, + "loss": 3.0976, + "step": 52572 + }, + { + "epoch": 2.58, + "grad_norm": 0.7670025825500488, + "learning_rate": 2.9025062927853094e-05, + "loss": 2.9123, + "step": 52573 + }, + { + "epoch": 2.58, + "grad_norm": 0.7210864424705505, + "learning_rate": 2.901845654606554e-05, + "loss": 2.8945, + "step": 52574 + }, + { + "epoch": 2.58, + "grad_norm": 0.7419854998588562, + "learning_rate": 2.9011850877991306e-05, + "loss": 3.1037, + "step": 52575 + }, + { + "epoch": 2.58, + "grad_norm": 0.7315715551376343, + "learning_rate": 2.9005245923647614e-05, + "loss": 2.96, + "step": 52576 + }, + { + "epoch": 2.58, + "grad_norm": 0.7764679789543152, + "learning_rate": 2.8998641683052047e-05, + "loss": 2.8197, + "step": 52577 + }, + { + "epoch": 2.58, + "grad_norm": 0.7059036493301392, + "learning_rate": 2.899203815622183e-05, + "loss": 2.9622, + "step": 52578 + }, + { + "epoch": 2.58, + "grad_norm": 0.8265849947929382, + "learning_rate": 2.8985435343174414e-05, + "loss": 2.9801, + "step": 52579 + }, + { + "epoch": 2.58, + "grad_norm": 0.8718390464782715, + "learning_rate": 2.897883324392728e-05, + "loss": 2.6337, + "step": 52580 + }, + { + "epoch": 2.58, + "grad_norm": 0.7311578989028931, + "learning_rate": 2.897223185849772e-05, + "loss": 2.6949, + "step": 52581 + }, + { + "epoch": 2.58, + "grad_norm": 0.7237415909767151, + "learning_rate": 2.896563118690315e-05, + "loss": 2.9607, + "step": 52582 + }, + { + "epoch": 2.58, + "grad_norm": 0.730317234992981, + "learning_rate": 2.895903122916089e-05, + "loss": 2.8626, + "step": 52583 + }, + { + "epoch": 2.58, + "grad_norm": 0.7898061871528625, + "learning_rate": 2.895243198528836e-05, + "loss": 2.8689, + "step": 52584 + }, + { + "epoch": 2.58, + "grad_norm": 0.7148504853248596, + "learning_rate": 2.8945833455303045e-05, + "loss": 2.8797, + "step": 52585 + }, + { + "epoch": 2.58, + "grad_norm": 0.79593425989151, + "learning_rate": 2.893923563922217e-05, + "loss": 2.9757, + "step": 52586 + }, + { + "epoch": 2.58, + "grad_norm": 0.7755904793739319, + "learning_rate": 2.8932638537063245e-05, + "loss": 2.9006, + "step": 52587 + }, + { + "epoch": 2.58, + "grad_norm": 0.7300575375556946, + "learning_rate": 2.8926042148843566e-05, + "loss": 2.7823, + "step": 52588 + }, + { + "epoch": 2.58, + "grad_norm": 0.7470569014549255, + "learning_rate": 2.8919446474580477e-05, + "loss": 2.8235, + "step": 52589 + }, + { + "epoch": 2.58, + "grad_norm": 0.7756972908973694, + "learning_rate": 2.8912851514291468e-05, + "loss": 2.8028, + "step": 52590 + }, + { + "epoch": 2.58, + "grad_norm": 0.7556537389755249, + "learning_rate": 2.890625726799376e-05, + "loss": 2.7269, + "step": 52591 + }, + { + "epoch": 2.58, + "grad_norm": 0.7390879392623901, + "learning_rate": 2.889966373570487e-05, + "loss": 2.8715, + "step": 52592 + }, + { + "epoch": 2.58, + "grad_norm": 0.7211794853210449, + "learning_rate": 2.889307091744202e-05, + "loss": 2.8193, + "step": 52593 + }, + { + "epoch": 2.58, + "grad_norm": 0.7417266964912415, + "learning_rate": 2.888647881322269e-05, + "loss": 2.8811, + "step": 52594 + }, + { + "epoch": 2.58, + "grad_norm": 0.738362193107605, + "learning_rate": 2.887988742306424e-05, + "loss": 2.9706, + "step": 52595 + }, + { + "epoch": 2.58, + "grad_norm": 0.7565693855285645, + "learning_rate": 2.8873296746983888e-05, + "loss": 2.7883, + "step": 52596 + }, + { + "epoch": 2.58, + "grad_norm": 0.786318838596344, + "learning_rate": 2.8866706784999184e-05, + "loss": 2.8329, + "step": 52597 + }, + { + "epoch": 2.58, + "grad_norm": 0.7326686978340149, + "learning_rate": 2.8860117537127314e-05, + "loss": 2.9177, + "step": 52598 + }, + { + "epoch": 2.58, + "grad_norm": 0.741766095161438, + "learning_rate": 2.8853529003385767e-05, + "loss": 2.8992, + "step": 52599 + }, + { + "epoch": 2.58, + "grad_norm": 0.7155507206916809, + "learning_rate": 2.8846941183791793e-05, + "loss": 2.9925, + "step": 52600 + }, + { + "epoch": 2.58, + "grad_norm": 0.7600530385971069, + "learning_rate": 2.8840354078362782e-05, + "loss": 2.8607, + "step": 52601 + }, + { + "epoch": 2.58, + "grad_norm": 0.7748110294342041, + "learning_rate": 2.8833767687116184e-05, + "loss": 2.8176, + "step": 52602 + }, + { + "epoch": 2.58, + "grad_norm": 0.7540781497955322, + "learning_rate": 2.8827182010069217e-05, + "loss": 2.8926, + "step": 52603 + }, + { + "epoch": 2.58, + "grad_norm": 0.7902787327766418, + "learning_rate": 2.882059704723927e-05, + "loss": 3.0199, + "step": 52604 + }, + { + "epoch": 2.58, + "grad_norm": 0.7752910852432251, + "learning_rate": 2.8814012798643627e-05, + "loss": 2.8814, + "step": 52605 + }, + { + "epoch": 2.58, + "grad_norm": 0.7527629137039185, + "learning_rate": 2.8807429264299708e-05, + "loss": 2.6384, + "step": 52606 + }, + { + "epoch": 2.58, + "grad_norm": 0.7904459834098816, + "learning_rate": 2.8800846444224867e-05, + "loss": 2.8519, + "step": 52607 + }, + { + "epoch": 2.58, + "grad_norm": 0.6968057155609131, + "learning_rate": 2.8794264338436325e-05, + "loss": 2.8082, + "step": 52608 + }, + { + "epoch": 2.58, + "grad_norm": 0.7696243524551392, + "learning_rate": 2.8787682946951596e-05, + "loss": 2.8339, + "step": 52609 + }, + { + "epoch": 2.58, + "grad_norm": 0.8164247870445251, + "learning_rate": 2.878110226978787e-05, + "loss": 2.82, + "step": 52610 + }, + { + "epoch": 2.58, + "grad_norm": 0.7112769484519958, + "learning_rate": 2.87745223069625e-05, + "loss": 2.7388, + "step": 52611 + }, + { + "epoch": 2.58, + "grad_norm": 0.7222334146499634, + "learning_rate": 2.876794305849287e-05, + "loss": 2.8311, + "step": 52612 + }, + { + "epoch": 2.58, + "grad_norm": 0.7716138362884521, + "learning_rate": 2.8761364524396234e-05, + "loss": 2.9988, + "step": 52613 + }, + { + "epoch": 2.58, + "grad_norm": 0.6985487341880798, + "learning_rate": 2.8754786704690012e-05, + "loss": 2.7424, + "step": 52614 + }, + { + "epoch": 2.58, + "grad_norm": 0.7328038811683655, + "learning_rate": 2.874820959939146e-05, + "loss": 2.8344, + "step": 52615 + }, + { + "epoch": 2.58, + "grad_norm": 0.7214050889015198, + "learning_rate": 2.8741633208517922e-05, + "loss": 2.8049, + "step": 52616 + }, + { + "epoch": 2.58, + "grad_norm": 0.7892204523086548, + "learning_rate": 2.873505753208676e-05, + "loss": 2.8323, + "step": 52617 + }, + { + "epoch": 2.58, + "grad_norm": 0.743575930595398, + "learning_rate": 2.8728482570115185e-05, + "loss": 2.9289, + "step": 52618 + }, + { + "epoch": 2.58, + "grad_norm": 0.7116771936416626, + "learning_rate": 2.8721908322620625e-05, + "loss": 2.827, + "step": 52619 + }, + { + "epoch": 2.58, + "grad_norm": 0.7374192476272583, + "learning_rate": 2.871533478962029e-05, + "loss": 3.0038, + "step": 52620 + }, + { + "epoch": 2.58, + "grad_norm": 0.7295014262199402, + "learning_rate": 2.8708761971131577e-05, + "loss": 2.6753, + "step": 52621 + }, + { + "epoch": 2.58, + "grad_norm": 0.7758492827415466, + "learning_rate": 2.87021898671718e-05, + "loss": 3.0055, + "step": 52622 + }, + { + "epoch": 2.58, + "grad_norm": 0.7198835015296936, + "learning_rate": 2.869561847775821e-05, + "loss": 2.8754, + "step": 52623 + }, + { + "epoch": 2.58, + "grad_norm": 0.7312856316566467, + "learning_rate": 2.8689047802908193e-05, + "loss": 2.7471, + "step": 52624 + }, + { + "epoch": 2.58, + "grad_norm": 0.7765367031097412, + "learning_rate": 2.8682477842639007e-05, + "loss": 2.952, + "step": 52625 + }, + { + "epoch": 2.58, + "grad_norm": 0.7989129424095154, + "learning_rate": 2.86759085969679e-05, + "loss": 2.8156, + "step": 52626 + }, + { + "epoch": 2.58, + "grad_norm": 0.7187029719352722, + "learning_rate": 2.8669340065912295e-05, + "loss": 3.0734, + "step": 52627 + }, + { + "epoch": 2.58, + "grad_norm": 0.7583007216453552, + "learning_rate": 2.8662772249489373e-05, + "loss": 2.8671, + "step": 52628 + }, + { + "epoch": 2.58, + "grad_norm": 0.7283254265785217, + "learning_rate": 2.865620514771656e-05, + "loss": 2.874, + "step": 52629 + }, + { + "epoch": 2.58, + "grad_norm": 0.764652669429779, + "learning_rate": 2.8649638760611004e-05, + "loss": 2.9456, + "step": 52630 + }, + { + "epoch": 2.58, + "grad_norm": 0.7280563116073608, + "learning_rate": 2.8643073088190126e-05, + "loss": 3.0906, + "step": 52631 + }, + { + "epoch": 2.58, + "grad_norm": 0.7654474973678589, + "learning_rate": 2.863650813047118e-05, + "loss": 3.0266, + "step": 52632 + }, + { + "epoch": 2.58, + "grad_norm": 0.7615987062454224, + "learning_rate": 2.8629943887471418e-05, + "loss": 2.9073, + "step": 52633 + }, + { + "epoch": 2.58, + "grad_norm": 0.7482545375823975, + "learning_rate": 2.8623380359208194e-05, + "loss": 2.8876, + "step": 52634 + }, + { + "epoch": 2.58, + "grad_norm": 0.7490363121032715, + "learning_rate": 2.861681754569869e-05, + "loss": 2.7844, + "step": 52635 + }, + { + "epoch": 2.58, + "grad_norm": 0.7606791853904724, + "learning_rate": 2.861025544696026e-05, + "loss": 2.9984, + "step": 52636 + }, + { + "epoch": 2.58, + "grad_norm": 0.7316489815711975, + "learning_rate": 2.860369406301026e-05, + "loss": 2.8326, + "step": 52637 + }, + { + "epoch": 2.58, + "grad_norm": 0.7653703093528748, + "learning_rate": 2.859713339386588e-05, + "loss": 2.8809, + "step": 52638 + }, + { + "epoch": 2.58, + "grad_norm": 0.7543324828147888, + "learning_rate": 2.8590573439544427e-05, + "loss": 2.7399, + "step": 52639 + }, + { + "epoch": 2.58, + "grad_norm": 0.739680290222168, + "learning_rate": 2.8584014200063132e-05, + "loss": 2.6263, + "step": 52640 + }, + { + "epoch": 2.58, + "grad_norm": 0.7400057315826416, + "learning_rate": 2.8577455675439277e-05, + "loss": 2.9609, + "step": 52641 + }, + { + "epoch": 2.58, + "grad_norm": 0.7001802325248718, + "learning_rate": 2.8570897865690247e-05, + "loss": 2.8452, + "step": 52642 + }, + { + "epoch": 2.58, + "grad_norm": 0.7247951030731201, + "learning_rate": 2.856434077083316e-05, + "loss": 2.8749, + "step": 52643 + }, + { + "epoch": 2.58, + "grad_norm": 0.7461290955543518, + "learning_rate": 2.855778439088544e-05, + "loss": 2.8568, + "step": 52644 + }, + { + "epoch": 2.58, + "grad_norm": 0.7752999067306519, + "learning_rate": 2.8551228725864205e-05, + "loss": 2.9557, + "step": 52645 + }, + { + "epoch": 2.58, + "grad_norm": 0.7620311379432678, + "learning_rate": 2.8544673775786842e-05, + "loss": 2.9193, + "step": 52646 + }, + { + "epoch": 2.58, + "grad_norm": 0.7228102087974548, + "learning_rate": 2.853811954067057e-05, + "loss": 2.7064, + "step": 52647 + }, + { + "epoch": 2.58, + "grad_norm": 0.7312363386154175, + "learning_rate": 2.8531566020532604e-05, + "loss": 2.8944, + "step": 52648 + }, + { + "epoch": 2.58, + "grad_norm": 0.7253252267837524, + "learning_rate": 2.8525013215390268e-05, + "loss": 3.0867, + "step": 52649 + }, + { + "epoch": 2.58, + "grad_norm": 0.7094355225563049, + "learning_rate": 2.8518461125260782e-05, + "loss": 3.0171, + "step": 52650 + }, + { + "epoch": 2.58, + "grad_norm": 0.7288451194763184, + "learning_rate": 2.85119097501614e-05, + "loss": 2.8495, + "step": 52651 + }, + { + "epoch": 2.58, + "grad_norm": 0.7485461235046387, + "learning_rate": 2.850535909010947e-05, + "loss": 2.9792, + "step": 52652 + }, + { + "epoch": 2.58, + "grad_norm": 0.7915205359458923, + "learning_rate": 2.8498809145122182e-05, + "loss": 2.8687, + "step": 52653 + }, + { + "epoch": 2.58, + "grad_norm": 0.8087054491043091, + "learning_rate": 2.8492259915216754e-05, + "loss": 2.746, + "step": 52654 + }, + { + "epoch": 2.58, + "grad_norm": 0.7752242088317871, + "learning_rate": 2.848571140041044e-05, + "loss": 2.8696, + "step": 52655 + }, + { + "epoch": 2.58, + "grad_norm": 0.7321742177009583, + "learning_rate": 2.8479163600720458e-05, + "loss": 3.0578, + "step": 52656 + }, + { + "epoch": 2.58, + "grad_norm": 0.7605643272399902, + "learning_rate": 2.847261651616419e-05, + "loss": 2.7562, + "step": 52657 + }, + { + "epoch": 2.58, + "grad_norm": 0.7781768441200256, + "learning_rate": 2.8466070146758734e-05, + "loss": 2.7825, + "step": 52658 + }, + { + "epoch": 2.58, + "grad_norm": 0.7603253722190857, + "learning_rate": 2.845952449252147e-05, + "loss": 2.8124, + "step": 52659 + }, + { + "epoch": 2.58, + "grad_norm": 0.750426173210144, + "learning_rate": 2.845297955346951e-05, + "loss": 3.084, + "step": 52660 + }, + { + "epoch": 2.58, + "grad_norm": 0.765870213508606, + "learning_rate": 2.8446435329620122e-05, + "loss": 2.8798, + "step": 52661 + }, + { + "epoch": 2.58, + "grad_norm": 0.7285685539245605, + "learning_rate": 2.843989182099061e-05, + "loss": 2.8378, + "step": 52662 + }, + { + "epoch": 2.58, + "grad_norm": 0.7301684617996216, + "learning_rate": 2.8433349027598107e-05, + "loss": 2.9406, + "step": 52663 + }, + { + "epoch": 2.58, + "grad_norm": 0.7690022587776184, + "learning_rate": 2.8426806949459956e-05, + "loss": 2.8652, + "step": 52664 + }, + { + "epoch": 2.58, + "grad_norm": 0.766257107257843, + "learning_rate": 2.842026558659325e-05, + "loss": 2.9153, + "step": 52665 + }, + { + "epoch": 2.58, + "grad_norm": 0.7352449893951416, + "learning_rate": 2.8413724939015336e-05, + "loss": 3.1021, + "step": 52666 + }, + { + "epoch": 2.58, + "grad_norm": 0.777466893196106, + "learning_rate": 2.840718500674344e-05, + "loss": 2.8008, + "step": 52667 + }, + { + "epoch": 2.58, + "grad_norm": 0.8289501667022705, + "learning_rate": 2.840064578979474e-05, + "loss": 2.8083, + "step": 52668 + }, + { + "epoch": 2.58, + "grad_norm": 0.7426512837409973, + "learning_rate": 2.8394107288186496e-05, + "loss": 2.7525, + "step": 52669 + }, + { + "epoch": 2.58, + "grad_norm": 0.7094964981079102, + "learning_rate": 2.8387569501935825e-05, + "loss": 3.0038, + "step": 52670 + }, + { + "epoch": 2.58, + "grad_norm": 0.7811385989189148, + "learning_rate": 2.8381032431060013e-05, + "loss": 2.8479, + "step": 52671 + }, + { + "epoch": 2.58, + "grad_norm": 0.7416138052940369, + "learning_rate": 2.837449607557635e-05, + "loss": 3.1491, + "step": 52672 + }, + { + "epoch": 2.58, + "grad_norm": 0.7839840650558472, + "learning_rate": 2.8367960435501946e-05, + "loss": 2.6479, + "step": 52673 + }, + { + "epoch": 2.58, + "grad_norm": 0.78972327709198, + "learning_rate": 2.8361425510854096e-05, + "loss": 2.8504, + "step": 52674 + }, + { + "epoch": 2.58, + "grad_norm": 0.7591714262962341, + "learning_rate": 2.8354891301649985e-05, + "loss": 3.0018, + "step": 52675 + }, + { + "epoch": 2.58, + "grad_norm": 0.7825368046760559, + "learning_rate": 2.8348357807906764e-05, + "loss": 2.7301, + "step": 52676 + }, + { + "epoch": 2.58, + "grad_norm": 0.727197527885437, + "learning_rate": 2.8341825029641717e-05, + "loss": 2.9514, + "step": 52677 + }, + { + "epoch": 2.58, + "grad_norm": 0.7893258333206177, + "learning_rate": 2.8335292966872003e-05, + "loss": 2.8807, + "step": 52678 + }, + { + "epoch": 2.58, + "grad_norm": 0.7999505996704102, + "learning_rate": 2.832876161961487e-05, + "loss": 2.8625, + "step": 52679 + }, + { + "epoch": 2.58, + "grad_norm": 0.7148997187614441, + "learning_rate": 2.832223098788744e-05, + "loss": 2.9241, + "step": 52680 + }, + { + "epoch": 2.58, + "grad_norm": 0.7615914344787598, + "learning_rate": 2.831570107170703e-05, + "loss": 2.8288, + "step": 52681 + }, + { + "epoch": 2.58, + "grad_norm": 0.7838194370269775, + "learning_rate": 2.830917187109073e-05, + "loss": 2.7449, + "step": 52682 + }, + { + "epoch": 2.58, + "grad_norm": 0.7049852609634399, + "learning_rate": 2.8302643386055855e-05, + "loss": 2.8416, + "step": 52683 + }, + { + "epoch": 2.58, + "grad_norm": 0.7759522199630737, + "learning_rate": 2.8296115616619497e-05, + "loss": 2.8045, + "step": 52684 + }, + { + "epoch": 2.58, + "grad_norm": 0.726335883140564, + "learning_rate": 2.8289588562798836e-05, + "loss": 2.8608, + "step": 52685 + }, + { + "epoch": 2.58, + "grad_norm": 0.7436689138412476, + "learning_rate": 2.8283062224611165e-05, + "loss": 2.8539, + "step": 52686 + }, + { + "epoch": 2.58, + "grad_norm": 0.7301530838012695, + "learning_rate": 2.827653660207353e-05, + "loss": 2.9732, + "step": 52687 + }, + { + "epoch": 2.58, + "grad_norm": 0.7957533001899719, + "learning_rate": 2.8270011695203222e-05, + "loss": 2.867, + "step": 52688 + }, + { + "epoch": 2.58, + "grad_norm": 0.6973696947097778, + "learning_rate": 2.826348750401749e-05, + "loss": 2.9357, + "step": 52689 + }, + { + "epoch": 2.58, + "grad_norm": 0.8173954486846924, + "learning_rate": 2.8256964028533425e-05, + "loss": 2.7172, + "step": 52690 + }, + { + "epoch": 2.58, + "grad_norm": 0.7530957460403442, + "learning_rate": 2.8250441268768174e-05, + "loss": 3.071, + "step": 52691 + }, + { + "epoch": 2.58, + "grad_norm": 0.7341398596763611, + "learning_rate": 2.8243919224738964e-05, + "loss": 2.9068, + "step": 52692 + }, + { + "epoch": 2.58, + "grad_norm": 0.7384122014045715, + "learning_rate": 2.8237397896462944e-05, + "loss": 2.7069, + "step": 52693 + }, + { + "epoch": 2.58, + "grad_norm": 0.7298815250396729, + "learning_rate": 2.8230877283957365e-05, + "loss": 3.116, + "step": 52694 + }, + { + "epoch": 2.58, + "grad_norm": 0.7248790860176086, + "learning_rate": 2.8224357387239316e-05, + "loss": 2.8794, + "step": 52695 + }, + { + "epoch": 2.58, + "grad_norm": 0.739068329334259, + "learning_rate": 2.821783820632605e-05, + "loss": 3.0346, + "step": 52696 + }, + { + "epoch": 2.58, + "grad_norm": 0.7384850382804871, + "learning_rate": 2.8211319741234684e-05, + "loss": 2.8443, + "step": 52697 + }, + { + "epoch": 2.58, + "grad_norm": 0.7337626218795776, + "learning_rate": 2.820480199198234e-05, + "loss": 2.7513, + "step": 52698 + }, + { + "epoch": 2.58, + "grad_norm": 0.7873049378395081, + "learning_rate": 2.8198284958586302e-05, + "loss": 2.8253, + "step": 52699 + }, + { + "epoch": 2.58, + "grad_norm": 0.7321355938911438, + "learning_rate": 2.8191768641063628e-05, + "loss": 2.7804, + "step": 52700 + }, + { + "epoch": 2.58, + "grad_norm": 0.7380569577217102, + "learning_rate": 2.8185253039431566e-05, + "loss": 2.9944, + "step": 52701 + }, + { + "epoch": 2.58, + "grad_norm": 0.8345293402671814, + "learning_rate": 2.8178738153707202e-05, + "loss": 3.1125, + "step": 52702 + }, + { + "epoch": 2.58, + "grad_norm": 0.779485821723938, + "learning_rate": 2.8172223983907692e-05, + "loss": 2.6968, + "step": 52703 + }, + { + "epoch": 2.58, + "grad_norm": 0.7306099534034729, + "learning_rate": 2.8165710530050322e-05, + "loss": 2.9204, + "step": 52704 + }, + { + "epoch": 2.58, + "grad_norm": 0.8526567816734314, + "learning_rate": 2.8159197792152112e-05, + "loss": 2.9036, + "step": 52705 + }, + { + "epoch": 2.58, + "grad_norm": 0.7224339246749878, + "learning_rate": 2.815268577023031e-05, + "loss": 3.0588, + "step": 52706 + }, + { + "epoch": 2.58, + "grad_norm": 0.7223872542381287, + "learning_rate": 2.8146174464301907e-05, + "loss": 3.0034, + "step": 52707 + }, + { + "epoch": 2.58, + "grad_norm": 0.7458459138870239, + "learning_rate": 2.813966387438419e-05, + "loss": 3.0477, + "step": 52708 + }, + { + "epoch": 2.58, + "grad_norm": 0.7503674626350403, + "learning_rate": 2.813315400049434e-05, + "loss": 2.8695, + "step": 52709 + }, + { + "epoch": 2.58, + "grad_norm": 0.7620118260383606, + "learning_rate": 2.8126644842649382e-05, + "loss": 2.8405, + "step": 52710 + }, + { + "epoch": 2.58, + "grad_norm": 0.7874787449836731, + "learning_rate": 2.8120136400866566e-05, + "loss": 2.8848, + "step": 52711 + }, + { + "epoch": 2.58, + "grad_norm": 0.7471845746040344, + "learning_rate": 2.811362867516298e-05, + "loss": 2.8329, + "step": 52712 + }, + { + "epoch": 2.58, + "grad_norm": 0.8011273741722107, + "learning_rate": 2.8107121665555743e-05, + "loss": 2.7753, + "step": 52713 + }, + { + "epoch": 2.58, + "grad_norm": 0.6914375424385071, + "learning_rate": 2.8100615372062042e-05, + "loss": 2.9165, + "step": 52714 + }, + { + "epoch": 2.58, + "grad_norm": 0.7664171457290649, + "learning_rate": 2.809410979469896e-05, + "loss": 2.8735, + "step": 52715 + }, + { + "epoch": 2.58, + "grad_norm": 0.7507244348526001, + "learning_rate": 2.808760493348372e-05, + "loss": 2.8179, + "step": 52716 + }, + { + "epoch": 2.58, + "grad_norm": 0.7342262864112854, + "learning_rate": 2.808110078843331e-05, + "loss": 2.9128, + "step": 52717 + }, + { + "epoch": 2.58, + "grad_norm": 0.7676442265510559, + "learning_rate": 2.8074597359565045e-05, + "loss": 2.9506, + "step": 52718 + }, + { + "epoch": 2.58, + "grad_norm": 0.7457807064056396, + "learning_rate": 2.8068094646895945e-05, + "loss": 3.0169, + "step": 52719 + }, + { + "epoch": 2.58, + "grad_norm": 0.700971782207489, + "learning_rate": 2.8061592650443065e-05, + "loss": 2.9474, + "step": 52720 + }, + { + "epoch": 2.58, + "grad_norm": 0.7742531895637512, + "learning_rate": 2.8055091370223692e-05, + "loss": 2.8221, + "step": 52721 + }, + { + "epoch": 2.58, + "grad_norm": 0.7480669617652893, + "learning_rate": 2.8048590806254813e-05, + "loss": 2.7798, + "step": 52722 + }, + { + "epoch": 2.58, + "grad_norm": 0.7456562519073486, + "learning_rate": 2.804209095855361e-05, + "loss": 2.9058, + "step": 52723 + }, + { + "epoch": 2.58, + "grad_norm": 0.865938127040863, + "learning_rate": 2.803559182713727e-05, + "loss": 2.8553, + "step": 52724 + }, + { + "epoch": 2.58, + "grad_norm": 0.7843738794326782, + "learning_rate": 2.8029093412022785e-05, + "loss": 2.737, + "step": 52725 + }, + { + "epoch": 2.58, + "grad_norm": 0.7294140458106995, + "learning_rate": 2.802259571322737e-05, + "loss": 2.752, + "step": 52726 + }, + { + "epoch": 2.58, + "grad_norm": 0.7814497351646423, + "learning_rate": 2.801609873076808e-05, + "loss": 3.033, + "step": 52727 + }, + { + "epoch": 2.58, + "grad_norm": 0.7317684888839722, + "learning_rate": 2.8009602464661996e-05, + "loss": 2.8929, + "step": 52728 + }, + { + "epoch": 2.58, + "grad_norm": 0.7592427134513855, + "learning_rate": 2.8003106914926344e-05, + "loss": 2.9463, + "step": 52729 + }, + { + "epoch": 2.58, + "grad_norm": 1.053802728652954, + "learning_rate": 2.7996612081578074e-05, + "loss": 2.9645, + "step": 52730 + }, + { + "epoch": 2.58, + "grad_norm": 0.7302281856536865, + "learning_rate": 2.7990117964634475e-05, + "loss": 2.8003, + "step": 52731 + }, + { + "epoch": 2.58, + "grad_norm": 0.7234871983528137, + "learning_rate": 2.7983624564112494e-05, + "loss": 2.8953, + "step": 52732 + }, + { + "epoch": 2.58, + "grad_norm": 0.7112483382225037, + "learning_rate": 2.7977131880029324e-05, + "loss": 2.7117, + "step": 52733 + }, + { + "epoch": 2.58, + "grad_norm": 0.7166719436645508, + "learning_rate": 2.7970639912402047e-05, + "loss": 2.9582, + "step": 52734 + }, + { + "epoch": 2.58, + "grad_norm": 0.7226663827896118, + "learning_rate": 2.796414866124769e-05, + "loss": 2.8281, + "step": 52735 + }, + { + "epoch": 2.58, + "grad_norm": 0.7304128408432007, + "learning_rate": 2.7957658126583494e-05, + "loss": 3.0941, + "step": 52736 + }, + { + "epoch": 2.58, + "grad_norm": 0.7455897331237793, + "learning_rate": 2.7951168308426387e-05, + "loss": 2.9331, + "step": 52737 + }, + { + "epoch": 2.58, + "grad_norm": 0.7342404127120972, + "learning_rate": 2.7944679206793553e-05, + "loss": 2.9336, + "step": 52738 + }, + { + "epoch": 2.58, + "grad_norm": 0.7524769902229309, + "learning_rate": 2.7938190821702145e-05, + "loss": 2.8062, + "step": 52739 + }, + { + "epoch": 2.58, + "grad_norm": 0.7616720199584961, + "learning_rate": 2.7931703153169182e-05, + "loss": 2.9295, + "step": 52740 + }, + { + "epoch": 2.58, + "grad_norm": 0.7344446182250977, + "learning_rate": 2.792521620121172e-05, + "loss": 2.8753, + "step": 52741 + }, + { + "epoch": 2.58, + "grad_norm": 0.7952720522880554, + "learning_rate": 2.7918729965846876e-05, + "loss": 2.8517, + "step": 52742 + }, + { + "epoch": 2.58, + "grad_norm": 0.7229446768760681, + "learning_rate": 2.79122444470917e-05, + "loss": 2.7266, + "step": 52743 + }, + { + "epoch": 2.58, + "grad_norm": 0.751518964767456, + "learning_rate": 2.7905759644963355e-05, + "loss": 2.9713, + "step": 52744 + }, + { + "epoch": 2.58, + "grad_norm": 0.7525866627693176, + "learning_rate": 2.7899275559478818e-05, + "loss": 2.9393, + "step": 52745 + }, + { + "epoch": 2.58, + "grad_norm": 0.752153217792511, + "learning_rate": 2.7892792190655312e-05, + "loss": 2.991, + "step": 52746 + }, + { + "epoch": 2.59, + "grad_norm": 0.7784674167633057, + "learning_rate": 2.788630953850972e-05, + "loss": 2.9634, + "step": 52747 + }, + { + "epoch": 2.59, + "grad_norm": 0.7399433255195618, + "learning_rate": 2.78798276030593e-05, + "loss": 2.9404, + "step": 52748 + }, + { + "epoch": 2.59, + "grad_norm": 0.7343171834945679, + "learning_rate": 2.787334638432104e-05, + "loss": 3.0185, + "step": 52749 + }, + { + "epoch": 2.59, + "grad_norm": 0.7647814750671387, + "learning_rate": 2.7866865882311985e-05, + "loss": 3.0713, + "step": 52750 + }, + { + "epoch": 2.59, + "grad_norm": 0.7639553546905518, + "learning_rate": 2.7860386097049257e-05, + "loss": 2.974, + "step": 52751 + }, + { + "epoch": 2.59, + "grad_norm": 0.7180127501487732, + "learning_rate": 2.785390702854985e-05, + "loss": 2.7201, + "step": 52752 + }, + { + "epoch": 2.59, + "grad_norm": 0.7458502054214478, + "learning_rate": 2.7847428676830876e-05, + "loss": 2.8863, + "step": 52753 + }, + { + "epoch": 2.59, + "grad_norm": 0.7427824139595032, + "learning_rate": 2.7840951041909455e-05, + "loss": 2.755, + "step": 52754 + }, + { + "epoch": 2.59, + "grad_norm": 0.7325022220611572, + "learning_rate": 2.7834474123802574e-05, + "loss": 3.1051, + "step": 52755 + }, + { + "epoch": 2.59, + "grad_norm": 0.8776165246963501, + "learning_rate": 2.7827997922527324e-05, + "loss": 2.9282, + "step": 52756 + }, + { + "epoch": 2.59, + "grad_norm": 0.7489161491394043, + "learning_rate": 2.7821522438100685e-05, + "loss": 3.0858, + "step": 52757 + }, + { + "epoch": 2.59, + "grad_norm": 0.7696153521537781, + "learning_rate": 2.781504767053985e-05, + "loss": 2.8448, + "step": 52758 + }, + { + "epoch": 2.59, + "grad_norm": 0.7301738858222961, + "learning_rate": 2.780857361986173e-05, + "loss": 3.0967, + "step": 52759 + }, + { + "epoch": 2.59, + "grad_norm": 0.7664462924003601, + "learning_rate": 2.780210028608345e-05, + "loss": 2.7462, + "step": 52760 + }, + { + "epoch": 2.59, + "grad_norm": 0.7320300340652466, + "learning_rate": 2.77956276692221e-05, + "loss": 3.0839, + "step": 52761 + }, + { + "epoch": 2.59, + "grad_norm": 0.7702510356903076, + "learning_rate": 2.7789155769294657e-05, + "loss": 3.0114, + "step": 52762 + }, + { + "epoch": 2.59, + "grad_norm": 0.7481235861778259, + "learning_rate": 2.7782684586318216e-05, + "loss": 2.9768, + "step": 52763 + }, + { + "epoch": 2.59, + "grad_norm": 0.779035747051239, + "learning_rate": 2.7776214120309726e-05, + "loss": 3.0398, + "step": 52764 + }, + { + "epoch": 2.59, + "grad_norm": 0.7233691215515137, + "learning_rate": 2.7769744371286308e-05, + "loss": 2.8899, + "step": 52765 + }, + { + "epoch": 2.59, + "grad_norm": 0.7301437854766846, + "learning_rate": 2.7763275339265044e-05, + "loss": 2.8502, + "step": 52766 + }, + { + "epoch": 2.59, + "grad_norm": 0.7371978759765625, + "learning_rate": 2.7756807024262863e-05, + "loss": 2.8388, + "step": 52767 + }, + { + "epoch": 2.59, + "grad_norm": 0.7594659328460693, + "learning_rate": 2.7750339426296908e-05, + "loss": 2.7252, + "step": 52768 + }, + { + "epoch": 2.59, + "grad_norm": 0.7644268870353699, + "learning_rate": 2.774387254538414e-05, + "loss": 2.8623, + "step": 52769 + }, + { + "epoch": 2.59, + "grad_norm": 0.7400708794593811, + "learning_rate": 2.7737406381541637e-05, + "loss": 2.8575, + "step": 52770 + }, + { + "epoch": 2.59, + "grad_norm": 0.7719500064849854, + "learning_rate": 2.7730940934786426e-05, + "loss": 2.852, + "step": 52771 + }, + { + "epoch": 2.59, + "grad_norm": 0.7356804609298706, + "learning_rate": 2.7724476205135425e-05, + "loss": 2.996, + "step": 52772 + }, + { + "epoch": 2.59, + "grad_norm": 0.7251750230789185, + "learning_rate": 2.7718012192605854e-05, + "loss": 2.897, + "step": 52773 + }, + { + "epoch": 2.59, + "grad_norm": 0.7100973129272461, + "learning_rate": 2.7711548897214562e-05, + "loss": 2.9647, + "step": 52774 + }, + { + "epoch": 2.59, + "grad_norm": 0.7529676556587219, + "learning_rate": 2.770508631897864e-05, + "loss": 3.0783, + "step": 52775 + }, + { + "epoch": 2.59, + "grad_norm": 0.7607330083847046, + "learning_rate": 2.7698624457915175e-05, + "loss": 2.8693, + "step": 52776 + }, + { + "epoch": 2.59, + "grad_norm": 0.7323965430259705, + "learning_rate": 2.769216331404115e-05, + "loss": 2.9267, + "step": 52777 + }, + { + "epoch": 2.59, + "grad_norm": 0.7149645686149597, + "learning_rate": 2.768570288737352e-05, + "loss": 2.8751, + "step": 52778 + }, + { + "epoch": 2.59, + "grad_norm": 0.8055224418640137, + "learning_rate": 2.7679243177929332e-05, + "loss": 2.9824, + "step": 52779 + }, + { + "epoch": 2.59, + "grad_norm": 0.7920107841491699, + "learning_rate": 2.7672784185725584e-05, + "loss": 3.1636, + "step": 52780 + }, + { + "epoch": 2.59, + "grad_norm": 0.6878994107246399, + "learning_rate": 2.766632591077935e-05, + "loss": 2.9685, + "step": 52781 + }, + { + "epoch": 2.59, + "grad_norm": 0.7483735084533691, + "learning_rate": 2.7659868353107563e-05, + "loss": 2.7888, + "step": 52782 + }, + { + "epoch": 2.59, + "grad_norm": 0.7039675116539001, + "learning_rate": 2.765341151272733e-05, + "loss": 2.8918, + "step": 52783 + }, + { + "epoch": 2.59, + "grad_norm": 0.752712070941925, + "learning_rate": 2.7646955389655544e-05, + "loss": 3.069, + "step": 52784 + }, + { + "epoch": 2.59, + "grad_norm": 0.7546181082725525, + "learning_rate": 2.7640499983909326e-05, + "loss": 2.7278, + "step": 52785 + }, + { + "epoch": 2.59, + "grad_norm": 0.7798104286193848, + "learning_rate": 2.763404529550559e-05, + "loss": 2.8005, + "step": 52786 + }, + { + "epoch": 2.59, + "grad_norm": 0.6820088028907776, + "learning_rate": 2.7627591324461295e-05, + "loss": 2.9015, + "step": 52787 + }, + { + "epoch": 2.59, + "grad_norm": 0.7711160182952881, + "learning_rate": 2.762113807079359e-05, + "loss": 2.7493, + "step": 52788 + }, + { + "epoch": 2.59, + "grad_norm": 0.77363121509552, + "learning_rate": 2.7614685534519332e-05, + "loss": 2.8504, + "step": 52789 + }, + { + "epoch": 2.59, + "grad_norm": 0.7659868597984314, + "learning_rate": 2.760823371565557e-05, + "loss": 2.6641, + "step": 52790 + }, + { + "epoch": 2.59, + "grad_norm": 0.7323107719421387, + "learning_rate": 2.7601782614219326e-05, + "loss": 2.7258, + "step": 52791 + }, + { + "epoch": 2.59, + "grad_norm": 0.7489632368087769, + "learning_rate": 2.7595332230227584e-05, + "loss": 3.0457, + "step": 52792 + }, + { + "epoch": 2.59, + "grad_norm": 0.7547075748443604, + "learning_rate": 2.75888825636973e-05, + "loss": 2.9536, + "step": 52793 + }, + { + "epoch": 2.59, + "grad_norm": 0.7597982883453369, + "learning_rate": 2.758243361464543e-05, + "loss": 2.7106, + "step": 52794 + }, + { + "epoch": 2.59, + "grad_norm": 0.7556941509246826, + "learning_rate": 2.7575985383089018e-05, + "loss": 2.908, + "step": 52795 + }, + { + "epoch": 2.59, + "grad_norm": 0.7499327659606934, + "learning_rate": 2.7569537869045055e-05, + "loss": 3.0735, + "step": 52796 + }, + { + "epoch": 2.59, + "grad_norm": 0.780232310295105, + "learning_rate": 2.7563091072530465e-05, + "loss": 2.8993, + "step": 52797 + }, + { + "epoch": 2.59, + "grad_norm": 0.7690110206604004, + "learning_rate": 2.7556644993562295e-05, + "loss": 2.6908, + "step": 52798 + }, + { + "epoch": 2.59, + "grad_norm": 0.7913340926170349, + "learning_rate": 2.75501996321575e-05, + "loss": 2.8206, + "step": 52799 + }, + { + "epoch": 2.59, + "grad_norm": 0.7513294219970703, + "learning_rate": 2.7543754988333e-05, + "loss": 2.8581, + "step": 52800 + }, + { + "epoch": 2.59, + "grad_norm": 0.7517967820167542, + "learning_rate": 2.753731106210585e-05, + "loss": 2.8545, + "step": 52801 + }, + { + "epoch": 2.59, + "grad_norm": 0.7514926195144653, + "learning_rate": 2.7530867853492933e-05, + "loss": 3.0491, + "step": 52802 + }, + { + "epoch": 2.59, + "grad_norm": 0.7327825427055359, + "learning_rate": 2.7524425362511337e-05, + "loss": 2.9297, + "step": 52803 + }, + { + "epoch": 2.59, + "grad_norm": 0.7828764915466309, + "learning_rate": 2.7517983589177917e-05, + "loss": 2.8612, + "step": 52804 + }, + { + "epoch": 2.59, + "grad_norm": 0.7338088154792786, + "learning_rate": 2.751154253350969e-05, + "loss": 3.1315, + "step": 52805 + }, + { + "epoch": 2.59, + "grad_norm": 0.7811881303787231, + "learning_rate": 2.7505102195523675e-05, + "loss": 2.6917, + "step": 52806 + }, + { + "epoch": 2.59, + "grad_norm": 0.7544569969177246, + "learning_rate": 2.7498662575236762e-05, + "loss": 2.8917, + "step": 52807 + }, + { + "epoch": 2.59, + "grad_norm": 0.7520846128463745, + "learning_rate": 2.7492223672665937e-05, + "loss": 2.9407, + "step": 52808 + }, + { + "epoch": 2.59, + "grad_norm": 0.7595059871673584, + "learning_rate": 2.7485785487828083e-05, + "loss": 3.0334, + "step": 52809 + }, + { + "epoch": 2.59, + "grad_norm": 0.7427202463150024, + "learning_rate": 2.7479348020740223e-05, + "loss": 2.8506, + "step": 52810 + }, + { + "epoch": 2.59, + "grad_norm": 0.7459549903869629, + "learning_rate": 2.7472911271419374e-05, + "loss": 2.9055, + "step": 52811 + }, + { + "epoch": 2.59, + "grad_norm": 0.7440264225006104, + "learning_rate": 2.746647523988239e-05, + "loss": 2.9919, + "step": 52812 + }, + { + "epoch": 2.59, + "grad_norm": 0.7285496592521667, + "learning_rate": 2.7460039926146293e-05, + "loss": 3.0071, + "step": 52813 + }, + { + "epoch": 2.59, + "grad_norm": 0.7452382445335388, + "learning_rate": 2.7453605330228e-05, + "loss": 2.8223, + "step": 52814 + }, + { + "epoch": 2.59, + "grad_norm": 0.8756013512611389, + "learning_rate": 2.74471714521444e-05, + "loss": 2.8339, + "step": 52815 + }, + { + "epoch": 2.59, + "grad_norm": 0.8711315393447876, + "learning_rate": 2.744073829191258e-05, + "loss": 3.0862, + "step": 52816 + }, + { + "epoch": 2.59, + "grad_norm": 0.7606200575828552, + "learning_rate": 2.743430584954932e-05, + "loss": 2.7368, + "step": 52817 + }, + { + "epoch": 2.59, + "grad_norm": 0.7606265544891357, + "learning_rate": 2.7427874125071714e-05, + "loss": 2.8173, + "step": 52818 + }, + { + "epoch": 2.59, + "grad_norm": 0.7671118378639221, + "learning_rate": 2.7421443118496544e-05, + "loss": 3.0363, + "step": 52819 + }, + { + "epoch": 2.59, + "grad_norm": 0.7543085813522339, + "learning_rate": 2.7415012829840933e-05, + "loss": 2.8662, + "step": 52820 + }, + { + "epoch": 2.59, + "grad_norm": 0.7956048846244812, + "learning_rate": 2.7408583259121697e-05, + "loss": 2.999, + "step": 52821 + }, + { + "epoch": 2.59, + "grad_norm": 0.7940734028816223, + "learning_rate": 2.7402154406355726e-05, + "loss": 2.9409, + "step": 52822 + }, + { + "epoch": 2.59, + "grad_norm": 0.7047262191772461, + "learning_rate": 2.739572627156007e-05, + "loss": 2.9213, + "step": 52823 + }, + { + "epoch": 2.59, + "grad_norm": 0.7310606241226196, + "learning_rate": 2.7389298854751586e-05, + "loss": 2.7791, + "step": 52824 + }, + { + "epoch": 2.59, + "grad_norm": 0.7745860815048218, + "learning_rate": 2.738287215594719e-05, + "loss": 3.0216, + "step": 52825 + }, + { + "epoch": 2.59, + "grad_norm": 0.7691651582717896, + "learning_rate": 2.7376446175163903e-05, + "loss": 2.6237, + "step": 52826 + }, + { + "epoch": 2.59, + "grad_norm": 0.7258844971656799, + "learning_rate": 2.7370020912418543e-05, + "loss": 2.9344, + "step": 52827 + }, + { + "epoch": 2.59, + "grad_norm": 0.7222638130187988, + "learning_rate": 2.7363596367728135e-05, + "loss": 2.5812, + "step": 52828 + }, + { + "epoch": 2.59, + "grad_norm": 0.7643725872039795, + "learning_rate": 2.735717254110953e-05, + "loss": 3.0799, + "step": 52829 + }, + { + "epoch": 2.59, + "grad_norm": 0.7751563191413879, + "learning_rate": 2.735074943257961e-05, + "loss": 2.8096, + "step": 52830 + }, + { + "epoch": 2.59, + "grad_norm": 0.7652669548988342, + "learning_rate": 2.73443270421554e-05, + "loss": 2.9164, + "step": 52831 + }, + { + "epoch": 2.59, + "grad_norm": 0.7439308166503906, + "learning_rate": 2.733790536985372e-05, + "loss": 2.983, + "step": 52832 + }, + { + "epoch": 2.59, + "grad_norm": 0.7255779504776001, + "learning_rate": 2.7331484415691584e-05, + "loss": 2.8595, + "step": 52833 + }, + { + "epoch": 2.59, + "grad_norm": 0.7431380152702332, + "learning_rate": 2.7325064179685753e-05, + "loss": 2.9898, + "step": 52834 + }, + { + "epoch": 2.59, + "grad_norm": 0.7448784708976746, + "learning_rate": 2.7318644661853307e-05, + "loss": 2.9715, + "step": 52835 + }, + { + "epoch": 2.59, + "grad_norm": 0.7783845067024231, + "learning_rate": 2.7312225862211067e-05, + "loss": 2.908, + "step": 52836 + }, + { + "epoch": 2.59, + "grad_norm": 0.7135199308395386, + "learning_rate": 2.7305807780775856e-05, + "loss": 2.8114, + "step": 52837 + }, + { + "epoch": 2.59, + "grad_norm": 0.7589677572250366, + "learning_rate": 2.729939041756476e-05, + "loss": 2.9255, + "step": 52838 + }, + { + "epoch": 2.59, + "grad_norm": 0.7436524033546448, + "learning_rate": 2.7292973772594527e-05, + "loss": 2.9792, + "step": 52839 + }, + { + "epoch": 2.59, + "grad_norm": 0.786425769329071, + "learning_rate": 2.7286557845882185e-05, + "loss": 2.6977, + "step": 52840 + }, + { + "epoch": 2.59, + "grad_norm": 0.7813881039619446, + "learning_rate": 2.728014263744448e-05, + "loss": 3.0427, + "step": 52841 + }, + { + "epoch": 2.59, + "grad_norm": 0.7063042521476746, + "learning_rate": 2.7273728147298466e-05, + "loss": 2.9036, + "step": 52842 + }, + { + "epoch": 2.59, + "grad_norm": 0.7354524731636047, + "learning_rate": 2.7267314375460968e-05, + "loss": 2.9452, + "step": 52843 + }, + { + "epoch": 2.59, + "grad_norm": 0.7480248212814331, + "learning_rate": 2.72609013219488e-05, + "loss": 2.9096, + "step": 52844 + }, + { + "epoch": 2.59, + "grad_norm": 0.7572676539421082, + "learning_rate": 2.725448898677899e-05, + "loss": 2.8905, + "step": 52845 + }, + { + "epoch": 2.59, + "grad_norm": 0.7566613554954529, + "learning_rate": 2.7248077369968312e-05, + "loss": 2.8314, + "step": 52846 + }, + { + "epoch": 2.59, + "grad_norm": 0.733538806438446, + "learning_rate": 2.724166647153373e-05, + "loss": 2.9796, + "step": 52847 + }, + { + "epoch": 2.59, + "grad_norm": 0.7428073287010193, + "learning_rate": 2.7235256291492124e-05, + "loss": 2.7486, + "step": 52848 + }, + { + "epoch": 2.59, + "grad_norm": 0.7868484258651733, + "learning_rate": 2.7228846829860318e-05, + "loss": 2.7314, + "step": 52849 + }, + { + "epoch": 2.59, + "grad_norm": 0.7082564830780029, + "learning_rate": 2.7222438086655295e-05, + "loss": 3.042, + "step": 52850 + }, + { + "epoch": 2.59, + "grad_norm": 0.7359206676483154, + "learning_rate": 2.721603006189388e-05, + "loss": 3.0088, + "step": 52851 + }, + { + "epoch": 2.59, + "grad_norm": 0.7429801821708679, + "learning_rate": 2.7209622755592852e-05, + "loss": 2.9179, + "step": 52852 + }, + { + "epoch": 2.59, + "grad_norm": 0.7138932347297668, + "learning_rate": 2.720321616776927e-05, + "loss": 2.8191, + "step": 52853 + }, + { + "epoch": 2.59, + "grad_norm": 0.7627942562103271, + "learning_rate": 2.7196810298439852e-05, + "loss": 2.9606, + "step": 52854 + }, + { + "epoch": 2.59, + "grad_norm": 0.7788562178611755, + "learning_rate": 2.7190405147621587e-05, + "loss": 2.7794, + "step": 52855 + }, + { + "epoch": 2.59, + "grad_norm": 0.7393107414245605, + "learning_rate": 2.7184000715331222e-05, + "loss": 2.879, + "step": 52856 + }, + { + "epoch": 2.59, + "grad_norm": 0.7325177192687988, + "learning_rate": 2.7177597001585782e-05, + "loss": 2.9611, + "step": 52857 + }, + { + "epoch": 2.59, + "grad_norm": 0.7476690411567688, + "learning_rate": 2.717119400640202e-05, + "loss": 2.9348, + "step": 52858 + }, + { + "epoch": 2.59, + "grad_norm": 0.7651125192642212, + "learning_rate": 2.7164791729796753e-05, + "loss": 2.7011, + "step": 52859 + }, + { + "epoch": 2.59, + "grad_norm": 0.7937335968017578, + "learning_rate": 2.7158390171787003e-05, + "loss": 3.0789, + "step": 52860 + }, + { + "epoch": 2.59, + "grad_norm": 0.7635074257850647, + "learning_rate": 2.715198933238949e-05, + "loss": 2.9107, + "step": 52861 + }, + { + "epoch": 2.59, + "grad_norm": 0.7641463875770569, + "learning_rate": 2.7145589211621132e-05, + "loss": 2.8345, + "step": 52862 + }, + { + "epoch": 2.59, + "grad_norm": 0.8104754090309143, + "learning_rate": 2.713918980949882e-05, + "loss": 2.9759, + "step": 52863 + }, + { + "epoch": 2.59, + "grad_norm": 0.7333548069000244, + "learning_rate": 2.7132791126039367e-05, + "loss": 3.0388, + "step": 52864 + }, + { + "epoch": 2.59, + "grad_norm": 0.752964437007904, + "learning_rate": 2.712639316125963e-05, + "loss": 2.9652, + "step": 52865 + }, + { + "epoch": 2.59, + "grad_norm": 0.73786860704422, + "learning_rate": 2.7119995915176396e-05, + "loss": 3.0213, + "step": 52866 + }, + { + "epoch": 2.59, + "grad_norm": 0.7262935638427734, + "learning_rate": 2.7113599387806582e-05, + "loss": 2.8607, + "step": 52867 + }, + { + "epoch": 2.59, + "grad_norm": 0.7377870678901672, + "learning_rate": 2.710720357916708e-05, + "loss": 2.9051, + "step": 52868 + }, + { + "epoch": 2.59, + "grad_norm": 0.7554672956466675, + "learning_rate": 2.7100808489274638e-05, + "loss": 2.7477, + "step": 52869 + }, + { + "epoch": 2.59, + "grad_norm": 0.7196999192237854, + "learning_rate": 2.7094414118146213e-05, + "loss": 2.8733, + "step": 52870 + }, + { + "epoch": 2.59, + "grad_norm": 0.7686446309089661, + "learning_rate": 2.708802046579849e-05, + "loss": 3.0189, + "step": 52871 + }, + { + "epoch": 2.59, + "grad_norm": 0.7374565601348877, + "learning_rate": 2.7081627532248486e-05, + "loss": 3.0224, + "step": 52872 + }, + { + "epoch": 2.59, + "grad_norm": 0.727088212966919, + "learning_rate": 2.7075235317512923e-05, + "loss": 2.7337, + "step": 52873 + }, + { + "epoch": 2.59, + "grad_norm": 0.7013735175132751, + "learning_rate": 2.706884382160862e-05, + "loss": 2.9789, + "step": 52874 + }, + { + "epoch": 2.59, + "grad_norm": 0.7275285124778748, + "learning_rate": 2.7062453044552536e-05, + "loss": 2.843, + "step": 52875 + }, + { + "epoch": 2.59, + "grad_norm": 0.7339413166046143, + "learning_rate": 2.705606298636135e-05, + "loss": 2.786, + "step": 52876 + }, + { + "epoch": 2.59, + "grad_norm": 0.8053582906723022, + "learning_rate": 2.704967364705195e-05, + "loss": 3.011, + "step": 52877 + }, + { + "epoch": 2.59, + "grad_norm": 0.770816445350647, + "learning_rate": 2.7043285026641258e-05, + "loss": 3.0154, + "step": 52878 + }, + { + "epoch": 2.59, + "grad_norm": 0.7276818752288818, + "learning_rate": 2.7036897125145995e-05, + "loss": 3.0039, + "step": 52879 + }, + { + "epoch": 2.59, + "grad_norm": 0.7705271244049072, + "learning_rate": 2.7030509942583013e-05, + "loss": 2.8089, + "step": 52880 + }, + { + "epoch": 2.59, + "grad_norm": 0.764750063419342, + "learning_rate": 2.7024123478969095e-05, + "loss": 2.9019, + "step": 52881 + }, + { + "epoch": 2.59, + "grad_norm": 0.8144281506538391, + "learning_rate": 2.70177377343211e-05, + "loss": 2.7056, + "step": 52882 + }, + { + "epoch": 2.59, + "grad_norm": 0.7661029696464539, + "learning_rate": 2.7011352708655908e-05, + "loss": 2.907, + "step": 52883 + }, + { + "epoch": 2.59, + "grad_norm": 0.7692121267318726, + "learning_rate": 2.700496840199021e-05, + "loss": 2.8668, + "step": 52884 + }, + { + "epoch": 2.59, + "grad_norm": 0.7304009199142456, + "learning_rate": 2.699858481434096e-05, + "loss": 2.9795, + "step": 52885 + }, + { + "epoch": 2.59, + "grad_norm": 0.7058482766151428, + "learning_rate": 2.6992201945724878e-05, + "loss": 2.8171, + "step": 52886 + }, + { + "epoch": 2.59, + "grad_norm": 0.8239343762397766, + "learning_rate": 2.6985819796158746e-05, + "loss": 2.7458, + "step": 52887 + }, + { + "epoch": 2.59, + "grad_norm": 0.76352459192276, + "learning_rate": 2.6979438365659456e-05, + "loss": 2.9991, + "step": 52888 + }, + { + "epoch": 2.59, + "grad_norm": 0.7338518500328064, + "learning_rate": 2.6973057654243756e-05, + "loss": 2.8587, + "step": 52889 + }, + { + "epoch": 2.59, + "grad_norm": 0.7867611050605774, + "learning_rate": 2.6966677661928504e-05, + "loss": 2.6982, + "step": 52890 + }, + { + "epoch": 2.59, + "grad_norm": 0.7336326837539673, + "learning_rate": 2.696029838873045e-05, + "loss": 2.8797, + "step": 52891 + }, + { + "epoch": 2.59, + "grad_norm": 0.7487276196479797, + "learning_rate": 2.6953919834666417e-05, + "loss": 2.8777, + "step": 52892 + }, + { + "epoch": 2.59, + "grad_norm": 0.7201322913169861, + "learning_rate": 2.6947541999753253e-05, + "loss": 2.806, + "step": 52893 + }, + { + "epoch": 2.59, + "grad_norm": 0.731619119644165, + "learning_rate": 2.6941164884007715e-05, + "loss": 2.782, + "step": 52894 + }, + { + "epoch": 2.59, + "grad_norm": 0.7519465684890747, + "learning_rate": 2.6934788487446623e-05, + "loss": 2.7115, + "step": 52895 + }, + { + "epoch": 2.59, + "grad_norm": 0.7223137021064758, + "learning_rate": 2.6928412810086665e-05, + "loss": 2.8679, + "step": 52896 + }, + { + "epoch": 2.59, + "grad_norm": 0.7320907711982727, + "learning_rate": 2.692203785194472e-05, + "loss": 3.1397, + "step": 52897 + }, + { + "epoch": 2.59, + "grad_norm": 0.7210008502006531, + "learning_rate": 2.6915663613037618e-05, + "loss": 2.9665, + "step": 52898 + }, + { + "epoch": 2.59, + "grad_norm": 0.6909995675086975, + "learning_rate": 2.690929009338204e-05, + "loss": 2.9763, + "step": 52899 + }, + { + "epoch": 2.59, + "grad_norm": 0.7912375926971436, + "learning_rate": 2.6902917292994906e-05, + "loss": 2.9634, + "step": 52900 + }, + { + "epoch": 2.59, + "grad_norm": 0.7833209037780762, + "learning_rate": 2.6896545211892938e-05, + "loss": 2.7593, + "step": 52901 + }, + { + "epoch": 2.59, + "grad_norm": 0.7575777769088745, + "learning_rate": 2.6890173850092855e-05, + "loss": 2.8503, + "step": 52902 + }, + { + "epoch": 2.59, + "grad_norm": 0.7618677616119385, + "learning_rate": 2.6883803207611543e-05, + "loss": 3.023, + "step": 52903 + }, + { + "epoch": 2.59, + "grad_norm": 0.7287328839302063, + "learning_rate": 2.6877433284465656e-05, + "loss": 2.6533, + "step": 52904 + }, + { + "epoch": 2.59, + "grad_norm": 0.7309133410453796, + "learning_rate": 2.687106408067211e-05, + "loss": 2.9655, + "step": 52905 + }, + { + "epoch": 2.59, + "grad_norm": 0.7655085921287537, + "learning_rate": 2.6864695596247566e-05, + "loss": 2.9034, + "step": 52906 + }, + { + "epoch": 2.59, + "grad_norm": 0.7788105010986328, + "learning_rate": 2.685832783120887e-05, + "loss": 2.8593, + "step": 52907 + }, + { + "epoch": 2.59, + "grad_norm": 0.7491944432258606, + "learning_rate": 2.6851960785572813e-05, + "loss": 2.7122, + "step": 52908 + }, + { + "epoch": 2.59, + "grad_norm": 0.7422747015953064, + "learning_rate": 2.684559445935611e-05, + "loss": 2.8433, + "step": 52909 + }, + { + "epoch": 2.59, + "grad_norm": 0.7405474781990051, + "learning_rate": 2.6839228852575523e-05, + "loss": 2.984, + "step": 52910 + }, + { + "epoch": 2.59, + "grad_norm": 0.719196617603302, + "learning_rate": 2.683286396524783e-05, + "loss": 2.7548, + "step": 52911 + }, + { + "epoch": 2.59, + "grad_norm": 0.7382709980010986, + "learning_rate": 2.6826499797389822e-05, + "loss": 2.9205, + "step": 52912 + }, + { + "epoch": 2.59, + "grad_norm": 0.7906863689422607, + "learning_rate": 2.6820136349018184e-05, + "loss": 2.7707, + "step": 52913 + }, + { + "epoch": 2.59, + "grad_norm": 0.753178596496582, + "learning_rate": 2.681377362014977e-05, + "loss": 2.8353, + "step": 52914 + }, + { + "epoch": 2.59, + "grad_norm": 0.7814869284629822, + "learning_rate": 2.68074116108013e-05, + "loss": 2.7692, + "step": 52915 + }, + { + "epoch": 2.59, + "grad_norm": 0.7554951310157776, + "learning_rate": 2.680105032098956e-05, + "loss": 2.6514, + "step": 52916 + }, + { + "epoch": 2.59, + "grad_norm": 0.7147135138511658, + "learning_rate": 2.67946897507313e-05, + "loss": 3.0395, + "step": 52917 + }, + { + "epoch": 2.59, + "grad_norm": 0.7095446586608887, + "learning_rate": 2.6788329900043147e-05, + "loss": 3.0561, + "step": 52918 + }, + { + "epoch": 2.59, + "grad_norm": 0.7295847535133362, + "learning_rate": 2.6781970768941985e-05, + "loss": 3.0686, + "step": 52919 + }, + { + "epoch": 2.59, + "grad_norm": 0.7395632863044739, + "learning_rate": 2.6775612357444564e-05, + "loss": 3.1226, + "step": 52920 + }, + { + "epoch": 2.59, + "grad_norm": 0.7481321096420288, + "learning_rate": 2.6769254665567575e-05, + "loss": 2.9356, + "step": 52921 + }, + { + "epoch": 2.59, + "grad_norm": 0.7256965637207031, + "learning_rate": 2.67628976933278e-05, + "loss": 2.8715, + "step": 52922 + }, + { + "epoch": 2.59, + "grad_norm": 0.7288814783096313, + "learning_rate": 2.6756541440741996e-05, + "loss": 3.017, + "step": 52923 + }, + { + "epoch": 2.59, + "grad_norm": 0.7406113743782043, + "learning_rate": 2.675018590782678e-05, + "loss": 3.1521, + "step": 52924 + }, + { + "epoch": 2.59, + "grad_norm": 0.7255039811134338, + "learning_rate": 2.6743831094599077e-05, + "loss": 2.865, + "step": 52925 + }, + { + "epoch": 2.59, + "grad_norm": 0.7836463451385498, + "learning_rate": 2.673747700107547e-05, + "loss": 2.8227, + "step": 52926 + }, + { + "epoch": 2.59, + "grad_norm": 0.7792579531669617, + "learning_rate": 2.673112362727281e-05, + "loss": 2.8882, + "step": 52927 + }, + { + "epoch": 2.59, + "grad_norm": 0.7477447986602783, + "learning_rate": 2.6724770973207754e-05, + "loss": 3.0428, + "step": 52928 + }, + { + "epoch": 2.59, + "grad_norm": 0.7685944437980652, + "learning_rate": 2.671841903889702e-05, + "loss": 2.9894, + "step": 52929 + }, + { + "epoch": 2.59, + "grad_norm": 0.7276458740234375, + "learning_rate": 2.671206782435743e-05, + "loss": 2.9397, + "step": 52930 + }, + { + "epoch": 2.59, + "grad_norm": 0.766247034072876, + "learning_rate": 2.670571732960567e-05, + "loss": 3.0083, + "step": 52931 + }, + { + "epoch": 2.59, + "grad_norm": 0.759260356426239, + "learning_rate": 2.669936755465846e-05, + "loss": 2.7322, + "step": 52932 + }, + { + "epoch": 2.59, + "grad_norm": 0.7431995272636414, + "learning_rate": 2.6693018499532483e-05, + "loss": 2.9465, + "step": 52933 + }, + { + "epoch": 2.59, + "grad_norm": 0.744221568107605, + "learning_rate": 2.668667016424446e-05, + "loss": 3.0348, + "step": 52934 + }, + { + "epoch": 2.59, + "grad_norm": 0.8303017616271973, + "learning_rate": 2.6680322548811216e-05, + "loss": 2.8454, + "step": 52935 + }, + { + "epoch": 2.59, + "grad_norm": 0.7433030605316162, + "learning_rate": 2.6673975653249336e-05, + "loss": 2.8534, + "step": 52936 + }, + { + "epoch": 2.59, + "grad_norm": 0.7234331965446472, + "learning_rate": 2.666762947757567e-05, + "loss": 2.7294, + "step": 52937 + }, + { + "epoch": 2.59, + "grad_norm": 0.7342804074287415, + "learning_rate": 2.666128402180684e-05, + "loss": 3.0371, + "step": 52938 + }, + { + "epoch": 2.59, + "grad_norm": 0.7552002668380737, + "learning_rate": 2.6654939285959564e-05, + "loss": 3.075, + "step": 52939 + }, + { + "epoch": 2.59, + "grad_norm": 0.6919820308685303, + "learning_rate": 2.6648595270050598e-05, + "loss": 2.9163, + "step": 52940 + }, + { + "epoch": 2.59, + "grad_norm": 0.7407262921333313, + "learning_rate": 2.6642251974096597e-05, + "loss": 2.9234, + "step": 52941 + }, + { + "epoch": 2.59, + "grad_norm": 0.7760119438171387, + "learning_rate": 2.663590939811434e-05, + "loss": 3.0161, + "step": 52942 + }, + { + "epoch": 2.59, + "grad_norm": 0.7739540338516235, + "learning_rate": 2.6629567542120422e-05, + "loss": 2.8185, + "step": 52943 + }, + { + "epoch": 2.59, + "grad_norm": 0.7417166829109192, + "learning_rate": 2.662322640613166e-05, + "loss": 2.893, + "step": 52944 + }, + { + "epoch": 2.59, + "grad_norm": 0.7252646088600159, + "learning_rate": 2.661688599016474e-05, + "loss": 2.8558, + "step": 52945 + }, + { + "epoch": 2.59, + "grad_norm": 0.7560940980911255, + "learning_rate": 2.6610546294236245e-05, + "loss": 2.9693, + "step": 52946 + }, + { + "epoch": 2.59, + "grad_norm": 0.7531810402870178, + "learning_rate": 2.6604207318363003e-05, + "loss": 3.0296, + "step": 52947 + }, + { + "epoch": 2.59, + "grad_norm": 0.7631690502166748, + "learning_rate": 2.6597869062561627e-05, + "loss": 3.013, + "step": 52948 + }, + { + "epoch": 2.59, + "grad_norm": 0.7429794669151306, + "learning_rate": 2.659153152684884e-05, + "loss": 2.7711, + "step": 52949 + }, + { + "epoch": 2.59, + "grad_norm": 0.7256251573562622, + "learning_rate": 2.6585194711241397e-05, + "loss": 3.0047, + "step": 52950 + }, + { + "epoch": 2.6, + "grad_norm": 0.7361646890640259, + "learning_rate": 2.6578858615755882e-05, + "loss": 3.0397, + "step": 52951 + }, + { + "epoch": 2.6, + "grad_norm": 0.7536941766738892, + "learning_rate": 2.6572523240409082e-05, + "loss": 2.8646, + "step": 52952 + }, + { + "epoch": 2.6, + "grad_norm": 0.7746854424476624, + "learning_rate": 2.6566188585217652e-05, + "loss": 2.9317, + "step": 52953 + }, + { + "epoch": 2.6, + "grad_norm": 0.7454334497451782, + "learning_rate": 2.6559854650198174e-05, + "loss": 2.8275, + "step": 52954 + }, + { + "epoch": 2.6, + "grad_norm": 0.7651907205581665, + "learning_rate": 2.6553521435367474e-05, + "loss": 2.7026, + "step": 52955 + }, + { + "epoch": 2.6, + "grad_norm": 0.7534736394882202, + "learning_rate": 2.6547188940742136e-05, + "loss": 2.9386, + "step": 52956 + }, + { + "epoch": 2.6, + "grad_norm": 0.7591540217399597, + "learning_rate": 2.6540857166338915e-05, + "loss": 2.9647, + "step": 52957 + }, + { + "epoch": 2.6, + "grad_norm": 0.8195220232009888, + "learning_rate": 2.6534526112174392e-05, + "loss": 2.8516, + "step": 52958 + }, + { + "epoch": 2.6, + "grad_norm": 0.8042005300521851, + "learning_rate": 2.652819577826536e-05, + "loss": 2.8863, + "step": 52959 + }, + { + "epoch": 2.6, + "grad_norm": 0.7308369278907776, + "learning_rate": 2.6521866164628437e-05, + "loss": 3.1317, + "step": 52960 + }, + { + "epoch": 2.6, + "grad_norm": 0.7242509126663208, + "learning_rate": 2.6515537271280206e-05, + "loss": 2.8885, + "step": 52961 + }, + { + "epoch": 2.6, + "grad_norm": 0.7605120539665222, + "learning_rate": 2.6509209098237494e-05, + "loss": 3.0218, + "step": 52962 + }, + { + "epoch": 2.6, + "grad_norm": 0.729441225528717, + "learning_rate": 2.6502881645516848e-05, + "loss": 2.9623, + "step": 52963 + }, + { + "epoch": 2.6, + "grad_norm": 0.7368941903114319, + "learning_rate": 2.649655491313496e-05, + "loss": 3.0673, + "step": 52964 + }, + { + "epoch": 2.6, + "grad_norm": 0.7427717447280884, + "learning_rate": 2.6490228901108546e-05, + "loss": 2.8994, + "step": 52965 + }, + { + "epoch": 2.6, + "grad_norm": 0.7409499883651733, + "learning_rate": 2.648390360945426e-05, + "loss": 2.772, + "step": 52966 + }, + { + "epoch": 2.6, + "grad_norm": 0.7534723877906799, + "learning_rate": 2.6477579038188722e-05, + "loss": 3.1012, + "step": 52967 + }, + { + "epoch": 2.6, + "grad_norm": 0.7700952887535095, + "learning_rate": 2.647125518732852e-05, + "loss": 2.9597, + "step": 52968 + }, + { + "epoch": 2.6, + "grad_norm": 0.718622088432312, + "learning_rate": 2.646493205689044e-05, + "loss": 2.9911, + "step": 52969 + }, + { + "epoch": 2.6, + "grad_norm": 0.7912169694900513, + "learning_rate": 2.6458609646891106e-05, + "loss": 3.0782, + "step": 52970 + }, + { + "epoch": 2.6, + "grad_norm": 0.766054630279541, + "learning_rate": 2.6452287957347095e-05, + "loss": 3.1029, + "step": 52971 + }, + { + "epoch": 2.6, + "grad_norm": 0.816627025604248, + "learning_rate": 2.644596698827517e-05, + "loss": 2.7479, + "step": 52972 + }, + { + "epoch": 2.6, + "grad_norm": 0.779098391532898, + "learning_rate": 2.6439646739691876e-05, + "loss": 2.7438, + "step": 52973 + }, + { + "epoch": 2.6, + "grad_norm": 0.7731761932373047, + "learning_rate": 2.6433327211613974e-05, + "loss": 2.9227, + "step": 52974 + }, + { + "epoch": 2.6, + "grad_norm": 0.7547659277915955, + "learning_rate": 2.6427008404058014e-05, + "loss": 2.8468, + "step": 52975 + }, + { + "epoch": 2.6, + "grad_norm": 0.7428092956542969, + "learning_rate": 2.6420690317040616e-05, + "loss": 2.9101, + "step": 52976 + }, + { + "epoch": 2.6, + "grad_norm": 0.7234500050544739, + "learning_rate": 2.6414372950578533e-05, + "loss": 2.7825, + "step": 52977 + }, + { + "epoch": 2.6, + "grad_norm": 0.7860729098320007, + "learning_rate": 2.6408056304688253e-05, + "loss": 2.832, + "step": 52978 + }, + { + "epoch": 2.6, + "grad_norm": 0.7206407189369202, + "learning_rate": 2.6401740379386526e-05, + "loss": 2.6774, + "step": 52979 + }, + { + "epoch": 2.6, + "grad_norm": 0.715305507183075, + "learning_rate": 2.639542517469001e-05, + "loss": 2.9311, + "step": 52980 + }, + { + "epoch": 2.6, + "grad_norm": 0.7753638625144958, + "learning_rate": 2.6389110690615288e-05, + "loss": 2.929, + "step": 52981 + }, + { + "epoch": 2.6, + "grad_norm": 0.7320675253868103, + "learning_rate": 2.6382796927178984e-05, + "loss": 2.7352, + "step": 52982 + }, + { + "epoch": 2.6, + "grad_norm": 0.7664274573326111, + "learning_rate": 2.6376483884397682e-05, + "loss": 2.772, + "step": 52983 + }, + { + "epoch": 2.6, + "grad_norm": 0.723810076713562, + "learning_rate": 2.6370171562288068e-05, + "loss": 2.9876, + "step": 52984 + }, + { + "epoch": 2.6, + "grad_norm": 0.7204124927520752, + "learning_rate": 2.6363859960866828e-05, + "loss": 2.9197, + "step": 52985 + }, + { + "epoch": 2.6, + "grad_norm": 0.748243510723114, + "learning_rate": 2.6357549080150452e-05, + "loss": 2.8726, + "step": 52986 + }, + { + "epoch": 2.6, + "grad_norm": 0.7590616941452026, + "learning_rate": 2.635123892015566e-05, + "loss": 2.8701, + "step": 52987 + }, + { + "epoch": 2.6, + "grad_norm": 0.7760512828826904, + "learning_rate": 2.6344929480899068e-05, + "loss": 2.7923, + "step": 52988 + }, + { + "epoch": 2.6, + "grad_norm": 0.7821771502494812, + "learning_rate": 2.6338620762397266e-05, + "loss": 2.9296, + "step": 52989 + }, + { + "epoch": 2.6, + "grad_norm": 0.7788903117179871, + "learning_rate": 2.6332312764666808e-05, + "loss": 2.7867, + "step": 52990 + }, + { + "epoch": 2.6, + "grad_norm": 0.7191170454025269, + "learning_rate": 2.6326005487724345e-05, + "loss": 3.0973, + "step": 52991 + }, + { + "epoch": 2.6, + "grad_norm": 0.7111521363258362, + "learning_rate": 2.6319698931586598e-05, + "loss": 2.9587, + "step": 52992 + }, + { + "epoch": 2.6, + "grad_norm": 0.742542028427124, + "learning_rate": 2.6313393096270053e-05, + "loss": 2.8454, + "step": 52993 + }, + { + "epoch": 2.6, + "grad_norm": 0.7643880844116211, + "learning_rate": 2.6307087981791365e-05, + "loss": 2.9678, + "step": 52994 + }, + { + "epoch": 2.6, + "grad_norm": 0.7396053075790405, + "learning_rate": 2.630078358816712e-05, + "loss": 2.7297, + "step": 52995 + }, + { + "epoch": 2.6, + "grad_norm": 0.7279162406921387, + "learning_rate": 2.6294479915413968e-05, + "loss": 2.7479, + "step": 52996 + }, + { + "epoch": 2.6, + "grad_norm": 0.7429473996162415, + "learning_rate": 2.62881769635485e-05, + "loss": 2.9331, + "step": 52997 + }, + { + "epoch": 2.6, + "grad_norm": 0.7297216057777405, + "learning_rate": 2.6281874732587238e-05, + "loss": 2.9241, + "step": 52998 + }, + { + "epoch": 2.6, + "grad_norm": 0.7150318026542664, + "learning_rate": 2.6275573222546898e-05, + "loss": 2.9074, + "step": 52999 + }, + { + "epoch": 2.6, + "grad_norm": 0.7341070175170898, + "learning_rate": 2.6269272433443933e-05, + "loss": 2.8459, + "step": 53000 + }, + { + "epoch": 2.6, + "grad_norm": 0.741511881351471, + "learning_rate": 2.6262972365295067e-05, + "loss": 3.1279, + "step": 53001 + }, + { + "epoch": 2.6, + "grad_norm": 0.7074028849601746, + "learning_rate": 2.625667301811688e-05, + "loss": 2.8032, + "step": 53002 + }, + { + "epoch": 2.6, + "grad_norm": 0.7410276532173157, + "learning_rate": 2.6250374391925933e-05, + "loss": 2.6586, + "step": 53003 + }, + { + "epoch": 2.6, + "grad_norm": 0.7217987775802612, + "learning_rate": 2.624407648673881e-05, + "loss": 2.6289, + "step": 53004 + }, + { + "epoch": 2.6, + "grad_norm": 0.7761947512626648, + "learning_rate": 2.623777930257206e-05, + "loss": 2.86, + "step": 53005 + }, + { + "epoch": 2.6, + "grad_norm": 0.7420976161956787, + "learning_rate": 2.6231482839442307e-05, + "loss": 3.0115, + "step": 53006 + }, + { + "epoch": 2.6, + "grad_norm": 0.7364739179611206, + "learning_rate": 2.6225187097366206e-05, + "loss": 2.8002, + "step": 53007 + }, + { + "epoch": 2.6, + "grad_norm": 0.7955524325370789, + "learning_rate": 2.621889207636021e-05, + "loss": 2.9859, + "step": 53008 + }, + { + "epoch": 2.6, + "grad_norm": 0.7260406613349915, + "learning_rate": 2.6212597776441034e-05, + "loss": 2.896, + "step": 53009 + }, + { + "epoch": 2.6, + "grad_norm": 0.7395499348640442, + "learning_rate": 2.6206304197625107e-05, + "loss": 2.9315, + "step": 53010 + }, + { + "epoch": 2.6, + "grad_norm": 0.7385746836662292, + "learning_rate": 2.6200011339929138e-05, + "loss": 2.8443, + "step": 53011 + }, + { + "epoch": 2.6, + "grad_norm": 0.7338225245475769, + "learning_rate": 2.6193719203369658e-05, + "loss": 2.9998, + "step": 53012 + }, + { + "epoch": 2.6, + "grad_norm": 0.7368943691253662, + "learning_rate": 2.6187427787963143e-05, + "loss": 2.812, + "step": 53013 + }, + { + "epoch": 2.6, + "grad_norm": 0.7941758036613464, + "learning_rate": 2.6181137093726358e-05, + "loss": 2.964, + "step": 53014 + }, + { + "epoch": 2.6, + "grad_norm": 0.8088509440422058, + "learning_rate": 2.6174847120675646e-05, + "loss": 3.0882, + "step": 53015 + }, + { + "epoch": 2.6, + "grad_norm": 0.7890652418136597, + "learning_rate": 2.6168557868827733e-05, + "loss": 2.8184, + "step": 53016 + }, + { + "epoch": 2.6, + "grad_norm": 0.7560929656028748, + "learning_rate": 2.6162269338199172e-05, + "loss": 2.9436, + "step": 53017 + }, + { + "epoch": 2.6, + "grad_norm": 0.7155969738960266, + "learning_rate": 2.615598152880648e-05, + "loss": 2.9308, + "step": 53018 + }, + { + "epoch": 2.6, + "grad_norm": 0.7594807147979736, + "learning_rate": 2.6149694440666246e-05, + "loss": 2.8756, + "step": 53019 + }, + { + "epoch": 2.6, + "grad_norm": 0.7847410440444946, + "learning_rate": 2.6143408073794957e-05, + "loss": 3.0293, + "step": 53020 + }, + { + "epoch": 2.6, + "grad_norm": 0.8248916864395142, + "learning_rate": 2.6137122428209234e-05, + "loss": 2.7244, + "step": 53021 + }, + { + "epoch": 2.6, + "grad_norm": 0.7598782777786255, + "learning_rate": 2.6130837503925662e-05, + "loss": 2.9428, + "step": 53022 + }, + { + "epoch": 2.6, + "grad_norm": 0.8169916868209839, + "learning_rate": 2.6124553300960725e-05, + "loss": 2.9181, + "step": 53023 + }, + { + "epoch": 2.6, + "grad_norm": 0.7497507333755493, + "learning_rate": 2.611826981933105e-05, + "loss": 2.7606, + "step": 53024 + }, + { + "epoch": 2.6, + "grad_norm": 0.7234208583831787, + "learning_rate": 2.611198705905312e-05, + "loss": 2.9675, + "step": 53025 + }, + { + "epoch": 2.6, + "grad_norm": 0.6970089077949524, + "learning_rate": 2.6105705020143487e-05, + "loss": 2.9544, + "step": 53026 + }, + { + "epoch": 2.6, + "grad_norm": 0.702825129032135, + "learning_rate": 2.6099423702618738e-05, + "loss": 2.8923, + "step": 53027 + }, + { + "epoch": 2.6, + "grad_norm": 0.7164490222930908, + "learning_rate": 2.6093143106495362e-05, + "loss": 3.0565, + "step": 53028 + }, + { + "epoch": 2.6, + "grad_norm": 0.8021504878997803, + "learning_rate": 2.6086863231789978e-05, + "loss": 2.9865, + "step": 53029 + }, + { + "epoch": 2.6, + "grad_norm": 0.7775247097015381, + "learning_rate": 2.608058407851904e-05, + "loss": 2.9305, + "step": 53030 + }, + { + "epoch": 2.6, + "grad_norm": 0.903372049331665, + "learning_rate": 2.60743056466991e-05, + "loss": 2.945, + "step": 53031 + }, + { + "epoch": 2.6, + "grad_norm": 0.7477425932884216, + "learning_rate": 2.6068027936346814e-05, + "loss": 2.6972, + "step": 53032 + }, + { + "epoch": 2.6, + "grad_norm": 0.7447633147239685, + "learning_rate": 2.6061750947478598e-05, + "loss": 3.0102, + "step": 53033 + }, + { + "epoch": 2.6, + "grad_norm": 0.7586161494255066, + "learning_rate": 2.605547468011101e-05, + "loss": 2.9697, + "step": 53034 + }, + { + "epoch": 2.6, + "grad_norm": 0.7298742532730103, + "learning_rate": 2.6049199134260535e-05, + "loss": 2.9407, + "step": 53035 + }, + { + "epoch": 2.6, + "grad_norm": 0.7363532185554504, + "learning_rate": 2.6042924309943724e-05, + "loss": 3.0353, + "step": 53036 + }, + { + "epoch": 2.6, + "grad_norm": 0.7362156510353088, + "learning_rate": 2.6036650207177202e-05, + "loss": 2.9521, + "step": 53037 + }, + { + "epoch": 2.6, + "grad_norm": 0.7948529720306396, + "learning_rate": 2.603037682597735e-05, + "loss": 2.9741, + "step": 53038 + }, + { + "epoch": 2.6, + "grad_norm": 0.7342039942741394, + "learning_rate": 2.6024104166360826e-05, + "loss": 2.9473, + "step": 53039 + }, + { + "epoch": 2.6, + "grad_norm": 0.7215148210525513, + "learning_rate": 2.601783222834408e-05, + "loss": 3.0339, + "step": 53040 + }, + { + "epoch": 2.6, + "grad_norm": 0.7647704482078552, + "learning_rate": 2.601156101194357e-05, + "loss": 2.7878, + "step": 53041 + }, + { + "epoch": 2.6, + "grad_norm": 0.7760405540466309, + "learning_rate": 2.6005290517175948e-05, + "loss": 2.7535, + "step": 53042 + }, + { + "epoch": 2.6, + "grad_norm": 0.7145057916641235, + "learning_rate": 2.5999020744057598e-05, + "loss": 2.9127, + "step": 53043 + }, + { + "epoch": 2.6, + "grad_norm": 0.7269275784492493, + "learning_rate": 2.599275169260514e-05, + "loss": 2.6882, + "step": 53044 + }, + { + "epoch": 2.6, + "grad_norm": 0.7115113735198975, + "learning_rate": 2.5986483362835e-05, + "loss": 2.9195, + "step": 53045 + }, + { + "epoch": 2.6, + "grad_norm": 0.7383373379707336, + "learning_rate": 2.5980215754763788e-05, + "loss": 2.9078, + "step": 53046 + }, + { + "epoch": 2.6, + "grad_norm": 0.6950742602348328, + "learning_rate": 2.5973948868407936e-05, + "loss": 2.8095, + "step": 53047 + }, + { + "epoch": 2.6, + "grad_norm": 0.7306373715400696, + "learning_rate": 2.596768270378392e-05, + "loss": 2.9741, + "step": 53048 + }, + { + "epoch": 2.6, + "grad_norm": 0.7336162328720093, + "learning_rate": 2.5961417260908335e-05, + "loss": 2.9961, + "step": 53049 + }, + { + "epoch": 2.6, + "grad_norm": 0.7189518213272095, + "learning_rate": 2.5955152539797598e-05, + "loss": 3.0896, + "step": 53050 + }, + { + "epoch": 2.6, + "grad_norm": 0.8078373670578003, + "learning_rate": 2.5948888540468226e-05, + "loss": 2.5814, + "step": 53051 + }, + { + "epoch": 2.6, + "grad_norm": 0.7885643839836121, + "learning_rate": 2.594262526293681e-05, + "loss": 3.1085, + "step": 53052 + }, + { + "epoch": 2.6, + "grad_norm": 0.7464584112167358, + "learning_rate": 2.5936362707219704e-05, + "loss": 2.8979, + "step": 53053 + }, + { + "epoch": 2.6, + "grad_norm": 0.7252410650253296, + "learning_rate": 2.593010087333356e-05, + "loss": 2.8299, + "step": 53054 + }, + { + "epoch": 2.6, + "grad_norm": 0.7533923387527466, + "learning_rate": 2.5923839761294762e-05, + "loss": 2.694, + "step": 53055 + }, + { + "epoch": 2.6, + "grad_norm": 0.700678825378418, + "learning_rate": 2.5917579371119768e-05, + "loss": 2.9615, + "step": 53056 + }, + { + "epoch": 2.6, + "grad_norm": 0.7296587824821472, + "learning_rate": 2.591131970282516e-05, + "loss": 2.9133, + "step": 53057 + }, + { + "epoch": 2.6, + "grad_norm": 0.7433834671974182, + "learning_rate": 2.5905060756427365e-05, + "loss": 2.9297, + "step": 53058 + }, + { + "epoch": 2.6, + "grad_norm": 0.7818628549575806, + "learning_rate": 2.589880253194293e-05, + "loss": 2.846, + "step": 53059 + }, + { + "epoch": 2.6, + "grad_norm": 0.775662362575531, + "learning_rate": 2.5892545029388245e-05, + "loss": 2.7622, + "step": 53060 + }, + { + "epoch": 2.6, + "grad_norm": 0.7537034153938293, + "learning_rate": 2.5886288248779896e-05, + "loss": 3.0378, + "step": 53061 + }, + { + "epoch": 2.6, + "grad_norm": 0.7831193804740906, + "learning_rate": 2.5880032190134303e-05, + "loss": 2.782, + "step": 53062 + }, + { + "epoch": 2.6, + "grad_norm": 0.7730798125267029, + "learning_rate": 2.587377685346792e-05, + "loss": 2.9905, + "step": 53063 + }, + { + "epoch": 2.6, + "grad_norm": 0.7518443465232849, + "learning_rate": 2.5867522238797266e-05, + "loss": 2.8506, + "step": 53064 + }, + { + "epoch": 2.6, + "grad_norm": 0.7517948150634766, + "learning_rate": 2.5861268346138796e-05, + "loss": 2.8146, + "step": 53065 + }, + { + "epoch": 2.6, + "grad_norm": 0.7461619973182678, + "learning_rate": 2.5855015175508998e-05, + "loss": 2.994, + "step": 53066 + }, + { + "epoch": 2.6, + "grad_norm": 0.7383305430412292, + "learning_rate": 2.5848762726924287e-05, + "loss": 2.8297, + "step": 53067 + }, + { + "epoch": 2.6, + "grad_norm": 0.7097854614257812, + "learning_rate": 2.584251100040122e-05, + "loss": 2.6958, + "step": 53068 + }, + { + "epoch": 2.6, + "grad_norm": 0.7683972120285034, + "learning_rate": 2.5836259995956254e-05, + "loss": 2.7007, + "step": 53069 + }, + { + "epoch": 2.6, + "grad_norm": 0.7571167945861816, + "learning_rate": 2.583000971360574e-05, + "loss": 2.9229, + "step": 53070 + }, + { + "epoch": 2.6, + "grad_norm": 0.7330532670021057, + "learning_rate": 2.5823760153366258e-05, + "loss": 2.9509, + "step": 53071 + }, + { + "epoch": 2.6, + "grad_norm": 0.7204066514968872, + "learning_rate": 2.581751131525417e-05, + "loss": 2.9927, + "step": 53072 + }, + { + "epoch": 2.6, + "grad_norm": 0.7530893683433533, + "learning_rate": 2.5811263199285992e-05, + "loss": 2.9785, + "step": 53073 + }, + { + "epoch": 2.6, + "grad_norm": 0.7310274839401245, + "learning_rate": 2.5805015805478245e-05, + "loss": 2.8125, + "step": 53074 + }, + { + "epoch": 2.6, + "grad_norm": 0.7056111097335815, + "learning_rate": 2.579876913384725e-05, + "loss": 2.8229, + "step": 53075 + }, + { + "epoch": 2.6, + "grad_norm": 0.7395758628845215, + "learning_rate": 2.5792523184409597e-05, + "loss": 3.0898, + "step": 53076 + }, + { + "epoch": 2.6, + "grad_norm": 0.7391893267631531, + "learning_rate": 2.578627795718167e-05, + "loss": 2.7922, + "step": 53077 + }, + { + "epoch": 2.6, + "grad_norm": 0.719342052936554, + "learning_rate": 2.5780033452179815e-05, + "loss": 2.8224, + "step": 53078 + }, + { + "epoch": 2.6, + "grad_norm": 0.7190227508544922, + "learning_rate": 2.5773789669420662e-05, + "loss": 2.9418, + "step": 53079 + }, + { + "epoch": 2.6, + "grad_norm": 0.7570742964744568, + "learning_rate": 2.576754660892053e-05, + "loss": 2.5958, + "step": 53080 + }, + { + "epoch": 2.6, + "grad_norm": 0.7037019729614258, + "learning_rate": 2.5761304270695936e-05, + "loss": 2.9506, + "step": 53081 + }, + { + "epoch": 2.6, + "grad_norm": 0.7258500456809998, + "learning_rate": 2.5755062654763237e-05, + "loss": 2.6689, + "step": 53082 + }, + { + "epoch": 2.6, + "grad_norm": 0.7900306582450867, + "learning_rate": 2.5748821761138982e-05, + "loss": 3.0713, + "step": 53083 + }, + { + "epoch": 2.6, + "grad_norm": 0.7837575078010559, + "learning_rate": 2.5742581589839563e-05, + "loss": 2.656, + "step": 53084 + }, + { + "epoch": 2.6, + "grad_norm": 0.715522050857544, + "learning_rate": 2.5736342140881326e-05, + "loss": 2.9993, + "step": 53085 + }, + { + "epoch": 2.6, + "grad_norm": 0.7533067464828491, + "learning_rate": 2.5730103414280834e-05, + "loss": 3.0117, + "step": 53086 + }, + { + "epoch": 2.6, + "grad_norm": 0.7553693652153015, + "learning_rate": 2.5723865410054434e-05, + "loss": 2.9827, + "step": 53087 + }, + { + "epoch": 2.6, + "grad_norm": 0.7062312364578247, + "learning_rate": 2.5717628128218548e-05, + "loss": 3.022, + "step": 53088 + }, + { + "epoch": 2.6, + "grad_norm": 0.7795845866203308, + "learning_rate": 2.571139156878973e-05, + "loss": 2.8414, + "step": 53089 + }, + { + "epoch": 2.6, + "grad_norm": 0.7023152709007263, + "learning_rate": 2.5705155731784333e-05, + "loss": 2.8888, + "step": 53090 + }, + { + "epoch": 2.6, + "grad_norm": 0.7439648509025574, + "learning_rate": 2.5698920617218743e-05, + "loss": 2.8256, + "step": 53091 + }, + { + "epoch": 2.6, + "grad_norm": 0.7769492268562317, + "learning_rate": 2.569268622510935e-05, + "loss": 2.7845, + "step": 53092 + }, + { + "epoch": 2.6, + "grad_norm": 0.7483662962913513, + "learning_rate": 2.5686452555472637e-05, + "loss": 2.8814, + "step": 53093 + }, + { + "epoch": 2.6, + "grad_norm": 0.7647069096565247, + "learning_rate": 2.568021960832506e-05, + "loss": 2.6949, + "step": 53094 + }, + { + "epoch": 2.6, + "grad_norm": 0.7527300715446472, + "learning_rate": 2.5673987383682936e-05, + "loss": 3.0339, + "step": 53095 + }, + { + "epoch": 2.6, + "grad_norm": 0.7335328459739685, + "learning_rate": 2.566775588156279e-05, + "loss": 2.869, + "step": 53096 + }, + { + "epoch": 2.6, + "grad_norm": 0.7456409335136414, + "learning_rate": 2.566152510198094e-05, + "loss": 2.9675, + "step": 53097 + }, + { + "epoch": 2.6, + "grad_norm": 0.7429218888282776, + "learning_rate": 2.5655295044953873e-05, + "loss": 2.9004, + "step": 53098 + }, + { + "epoch": 2.6, + "grad_norm": 0.7677924036979675, + "learning_rate": 2.5649065710497974e-05, + "loss": 2.9512, + "step": 53099 + }, + { + "epoch": 2.6, + "grad_norm": 0.7493651509284973, + "learning_rate": 2.5642837098629566e-05, + "loss": 2.9513, + "step": 53100 + }, + { + "epoch": 2.6, + "grad_norm": 0.7118934392929077, + "learning_rate": 2.5636609209365168e-05, + "loss": 2.6457, + "step": 53101 + }, + { + "epoch": 2.6, + "grad_norm": 0.8193805813789368, + "learning_rate": 2.5630382042721098e-05, + "loss": 2.668, + "step": 53102 + }, + { + "epoch": 2.6, + "grad_norm": 0.6974309086799622, + "learning_rate": 2.5624155598713815e-05, + "loss": 2.8641, + "step": 53103 + }, + { + "epoch": 2.6, + "grad_norm": 0.7470758557319641, + "learning_rate": 2.5617929877359732e-05, + "loss": 2.9929, + "step": 53104 + }, + { + "epoch": 2.6, + "grad_norm": 0.7814863324165344, + "learning_rate": 2.5611704878675244e-05, + "loss": 2.8942, + "step": 53105 + }, + { + "epoch": 2.6, + "grad_norm": 0.7379601001739502, + "learning_rate": 2.5605480602676698e-05, + "loss": 3.0074, + "step": 53106 + }, + { + "epoch": 2.6, + "grad_norm": 0.7781730890274048, + "learning_rate": 2.5599257049380452e-05, + "loss": 2.663, + "step": 53107 + }, + { + "epoch": 2.6, + "grad_norm": 0.7469813227653503, + "learning_rate": 2.5593034218802956e-05, + "loss": 2.8231, + "step": 53108 + }, + { + "epoch": 2.6, + "grad_norm": 0.7671642303466797, + "learning_rate": 2.5586812110960665e-05, + "loss": 2.9087, + "step": 53109 + }, + { + "epoch": 2.6, + "grad_norm": 0.7788114547729492, + "learning_rate": 2.558059072586983e-05, + "loss": 2.7514, + "step": 53110 + }, + { + "epoch": 2.6, + "grad_norm": 0.7065609693527222, + "learning_rate": 2.5574370063546944e-05, + "loss": 2.991, + "step": 53111 + }, + { + "epoch": 2.6, + "grad_norm": 0.725106418132782, + "learning_rate": 2.5568150124008324e-05, + "loss": 2.8381, + "step": 53112 + }, + { + "epoch": 2.6, + "grad_norm": 0.7204715013504028, + "learning_rate": 2.5561930907270456e-05, + "loss": 2.9109, + "step": 53113 + }, + { + "epoch": 2.6, + "grad_norm": 0.7312618494033813, + "learning_rate": 2.5555712413349594e-05, + "loss": 3.0618, + "step": 53114 + }, + { + "epoch": 2.6, + "grad_norm": 0.7476996779441833, + "learning_rate": 2.554949464226216e-05, + "loss": 2.8415, + "step": 53115 + }, + { + "epoch": 2.6, + "grad_norm": 0.763667106628418, + "learning_rate": 2.554327759402457e-05, + "loss": 2.8231, + "step": 53116 + }, + { + "epoch": 2.6, + "grad_norm": 0.759936511516571, + "learning_rate": 2.5537061268653114e-05, + "loss": 2.825, + "step": 53117 + }, + { + "epoch": 2.6, + "grad_norm": 0.7484183311462402, + "learning_rate": 2.5530845666164212e-05, + "loss": 2.7023, + "step": 53118 + }, + { + "epoch": 2.6, + "grad_norm": 0.676468014717102, + "learning_rate": 2.5524630786574285e-05, + "loss": 3.0118, + "step": 53119 + }, + { + "epoch": 2.6, + "grad_norm": 0.7524356842041016, + "learning_rate": 2.5518416629899687e-05, + "loss": 2.7162, + "step": 53120 + }, + { + "epoch": 2.6, + "grad_norm": 0.7328376173973083, + "learning_rate": 2.5512203196156735e-05, + "loss": 2.9075, + "step": 53121 + }, + { + "epoch": 2.6, + "grad_norm": 0.7236394286155701, + "learning_rate": 2.550599048536175e-05, + "loss": 2.8896, + "step": 53122 + }, + { + "epoch": 2.6, + "grad_norm": 0.7498464584350586, + "learning_rate": 2.5499778497531188e-05, + "loss": 2.8998, + "step": 53123 + }, + { + "epoch": 2.6, + "grad_norm": 0.7795760631561279, + "learning_rate": 2.54935672326814e-05, + "loss": 2.6818, + "step": 53124 + }, + { + "epoch": 2.6, + "grad_norm": 0.7325364351272583, + "learning_rate": 2.5487356690828707e-05, + "loss": 2.8785, + "step": 53125 + }, + { + "epoch": 2.6, + "grad_norm": 0.7392412424087524, + "learning_rate": 2.54811468719895e-05, + "loss": 2.9473, + "step": 53126 + }, + { + "epoch": 2.6, + "grad_norm": 0.7106963992118835, + "learning_rate": 2.547493777618016e-05, + "loss": 2.9328, + "step": 53127 + }, + { + "epoch": 2.6, + "grad_norm": 0.7459375262260437, + "learning_rate": 2.546872940341691e-05, + "loss": 2.8673, + "step": 53128 + }, + { + "epoch": 2.6, + "grad_norm": 0.7813073992729187, + "learning_rate": 2.546252175371627e-05, + "loss": 2.8378, + "step": 53129 + }, + { + "epoch": 2.6, + "grad_norm": 0.7099730372428894, + "learning_rate": 2.545631482709446e-05, + "loss": 2.8736, + "step": 53130 + }, + { + "epoch": 2.6, + "grad_norm": 0.739006757736206, + "learning_rate": 2.5450108623567933e-05, + "loss": 2.7253, + "step": 53131 + }, + { + "epoch": 2.6, + "grad_norm": 0.7848395109176636, + "learning_rate": 2.5443903143152912e-05, + "loss": 2.7427, + "step": 53132 + }, + { + "epoch": 2.6, + "grad_norm": 0.7411284446716309, + "learning_rate": 2.543769838586581e-05, + "loss": 2.8126, + "step": 53133 + }, + { + "epoch": 2.6, + "grad_norm": 0.8115738034248352, + "learning_rate": 2.5431494351723058e-05, + "loss": 3.1662, + "step": 53134 + }, + { + "epoch": 2.6, + "grad_norm": 0.7761831879615784, + "learning_rate": 2.542529104074087e-05, + "loss": 2.9151, + "step": 53135 + }, + { + "epoch": 2.6, + "grad_norm": 0.7265602350234985, + "learning_rate": 2.541908845293563e-05, + "loss": 3.0349, + "step": 53136 + }, + { + "epoch": 2.6, + "grad_norm": 0.721573531627655, + "learning_rate": 2.5412886588323634e-05, + "loss": 2.9599, + "step": 53137 + }, + { + "epoch": 2.6, + "grad_norm": 0.7648131251335144, + "learning_rate": 2.5406685446921228e-05, + "loss": 2.9716, + "step": 53138 + }, + { + "epoch": 2.6, + "grad_norm": 0.7431436777114868, + "learning_rate": 2.5400485028744833e-05, + "loss": 2.8647, + "step": 53139 + }, + { + "epoch": 2.6, + "grad_norm": 0.7635769844055176, + "learning_rate": 2.5394285333810637e-05, + "loss": 3.0528, + "step": 53140 + }, + { + "epoch": 2.6, + "grad_norm": 0.7431437373161316, + "learning_rate": 2.538808636213513e-05, + "loss": 3.0272, + "step": 53141 + }, + { + "epoch": 2.6, + "grad_norm": 0.735590934753418, + "learning_rate": 2.5381888113734528e-05, + "loss": 2.8362, + "step": 53142 + }, + { + "epoch": 2.6, + "grad_norm": 0.7066057920455933, + "learning_rate": 2.537569058862512e-05, + "loss": 3.017, + "step": 53143 + }, + { + "epoch": 2.6, + "grad_norm": 0.7370970845222473, + "learning_rate": 2.5369493786823358e-05, + "loss": 2.9394, + "step": 53144 + }, + { + "epoch": 2.6, + "grad_norm": 0.721082866191864, + "learning_rate": 2.5363297708345465e-05, + "loss": 2.8916, + "step": 53145 + }, + { + "epoch": 2.6, + "grad_norm": 0.7915112376213074, + "learning_rate": 2.5357102353207827e-05, + "loss": 2.8815, + "step": 53146 + }, + { + "epoch": 2.6, + "grad_norm": 0.7461155652999878, + "learning_rate": 2.5350907721426663e-05, + "loss": 2.838, + "step": 53147 + }, + { + "epoch": 2.6, + "grad_norm": 0.7456420660018921, + "learning_rate": 2.5344713813018425e-05, + "loss": 2.8849, + "step": 53148 + }, + { + "epoch": 2.6, + "grad_norm": 0.7861379384994507, + "learning_rate": 2.5338520627999337e-05, + "loss": 2.6795, + "step": 53149 + }, + { + "epoch": 2.6, + "grad_norm": 0.7006232738494873, + "learning_rate": 2.533232816638565e-05, + "loss": 3.0351, + "step": 53150 + }, + { + "epoch": 2.6, + "grad_norm": 0.7393845319747925, + "learning_rate": 2.5326136428193823e-05, + "loss": 2.7669, + "step": 53151 + }, + { + "epoch": 2.6, + "grad_norm": 0.7685011029243469, + "learning_rate": 2.531994541344007e-05, + "loss": 2.8029, + "step": 53152 + }, + { + "epoch": 2.6, + "grad_norm": 0.7489452958106995, + "learning_rate": 2.531375512214071e-05, + "loss": 3.1667, + "step": 53153 + }, + { + "epoch": 2.6, + "grad_norm": 0.7370985150337219, + "learning_rate": 2.5307565554312038e-05, + "loss": 2.8493, + "step": 53154 + }, + { + "epoch": 2.6, + "grad_norm": 0.7306023240089417, + "learning_rate": 2.5301376709970368e-05, + "loss": 2.9711, + "step": 53155 + }, + { + "epoch": 2.61, + "grad_norm": 0.7589641809463501, + "learning_rate": 2.5295188589132054e-05, + "loss": 2.9249, + "step": 53156 + }, + { + "epoch": 2.61, + "grad_norm": 0.7181271314620972, + "learning_rate": 2.5289001191813353e-05, + "loss": 2.8004, + "step": 53157 + }, + { + "epoch": 2.61, + "grad_norm": 0.765256941318512, + "learning_rate": 2.5282814518030547e-05, + "loss": 2.964, + "step": 53158 + }, + { + "epoch": 2.61, + "grad_norm": 0.6980361342430115, + "learning_rate": 2.5276628567799894e-05, + "loss": 2.819, + "step": 53159 + }, + { + "epoch": 2.61, + "grad_norm": 0.6991076469421387, + "learning_rate": 2.5270443341137714e-05, + "loss": 3.0081, + "step": 53160 + }, + { + "epoch": 2.61, + "grad_norm": 0.8100741505622864, + "learning_rate": 2.526425883806039e-05, + "loss": 2.9064, + "step": 53161 + }, + { + "epoch": 2.61, + "grad_norm": 0.7279528975486755, + "learning_rate": 2.5258075058584047e-05, + "loss": 3.0166, + "step": 53162 + }, + { + "epoch": 2.61, + "grad_norm": 0.7336760759353638, + "learning_rate": 2.5251892002725138e-05, + "loss": 2.7742, + "step": 53163 + }, + { + "epoch": 2.61, + "grad_norm": 0.7446197271347046, + "learning_rate": 2.524570967049988e-05, + "loss": 3.0359, + "step": 53164 + }, + { + "epoch": 2.61, + "grad_norm": 0.7815460562705994, + "learning_rate": 2.5239528061924462e-05, + "loss": 3.029, + "step": 53165 + }, + { + "epoch": 2.61, + "grad_norm": 0.8095081448554993, + "learning_rate": 2.5233347177015307e-05, + "loss": 2.8958, + "step": 53166 + }, + { + "epoch": 2.61, + "grad_norm": 0.7752934694290161, + "learning_rate": 2.5227167015788597e-05, + "loss": 2.8065, + "step": 53167 + }, + { + "epoch": 2.61, + "grad_norm": 0.7434821724891663, + "learning_rate": 2.5220987578260688e-05, + "loss": 2.9032, + "step": 53168 + }, + { + "epoch": 2.61, + "grad_norm": 0.7885562181472778, + "learning_rate": 2.5214808864447765e-05, + "loss": 3.0296, + "step": 53169 + }, + { + "epoch": 2.61, + "grad_norm": 0.7058457136154175, + "learning_rate": 2.520863087436622e-05, + "loss": 2.7885, + "step": 53170 + }, + { + "epoch": 2.61, + "grad_norm": 0.7025486826896667, + "learning_rate": 2.5202453608032236e-05, + "loss": 2.8992, + "step": 53171 + }, + { + "epoch": 2.61, + "grad_norm": 0.7743384838104248, + "learning_rate": 2.5196277065462066e-05, + "loss": 2.8032, + "step": 53172 + }, + { + "epoch": 2.61, + "grad_norm": 0.7513494491577148, + "learning_rate": 2.5190101246672068e-05, + "loss": 2.8837, + "step": 53173 + }, + { + "epoch": 2.61, + "grad_norm": 0.7396803498268127, + "learning_rate": 2.5183926151678392e-05, + "loss": 2.8922, + "step": 53174 + }, + { + "epoch": 2.61, + "grad_norm": 0.7196879386901855, + "learning_rate": 2.5177751780497358e-05, + "loss": 3.0125, + "step": 53175 + }, + { + "epoch": 2.61, + "grad_norm": 0.7186749577522278, + "learning_rate": 2.517157813314532e-05, + "loss": 2.7121, + "step": 53176 + }, + { + "epoch": 2.61, + "grad_norm": 0.7624224424362183, + "learning_rate": 2.5165405209638366e-05, + "loss": 2.9514, + "step": 53177 + }, + { + "epoch": 2.61, + "grad_norm": 0.7520177364349365, + "learning_rate": 2.5159233009992917e-05, + "loss": 3.0939, + "step": 53178 + }, + { + "epoch": 2.61, + "grad_norm": 0.7938098907470703, + "learning_rate": 2.5153061534225128e-05, + "loss": 2.8881, + "step": 53179 + }, + { + "epoch": 2.61, + "grad_norm": 0.7314983606338501, + "learning_rate": 2.514689078235125e-05, + "loss": 2.8824, + "step": 53180 + }, + { + "epoch": 2.61, + "grad_norm": 0.7542241811752319, + "learning_rate": 2.5140720754387634e-05, + "loss": 2.9394, + "step": 53181 + }, + { + "epoch": 2.61, + "grad_norm": 0.7400225400924683, + "learning_rate": 2.513455145035037e-05, + "loss": 2.7797, + "step": 53182 + }, + { + "epoch": 2.61, + "grad_norm": 0.747697651386261, + "learning_rate": 2.5128382870255882e-05, + "loss": 2.9337, + "step": 53183 + }, + { + "epoch": 2.61, + "grad_norm": 0.8165611624717712, + "learning_rate": 2.5122215014120252e-05, + "loss": 2.7655, + "step": 53184 + }, + { + "epoch": 2.61, + "grad_norm": 0.7541787624359131, + "learning_rate": 2.5116047881959866e-05, + "loss": 2.9598, + "step": 53185 + }, + { + "epoch": 2.61, + "grad_norm": 0.7399340271949768, + "learning_rate": 2.5109881473790914e-05, + "loss": 2.9438, + "step": 53186 + }, + { + "epoch": 2.61, + "grad_norm": 0.7630729079246521, + "learning_rate": 2.5103715789629585e-05, + "loss": 2.9433, + "step": 53187 + }, + { + "epoch": 2.61, + "grad_norm": 0.7590308785438538, + "learning_rate": 2.509755082949223e-05, + "loss": 3.0062, + "step": 53188 + }, + { + "epoch": 2.61, + "grad_norm": 0.7435056567192078, + "learning_rate": 2.5091386593394935e-05, + "loss": 2.7753, + "step": 53189 + }, + { + "epoch": 2.61, + "grad_norm": 0.7512586712837219, + "learning_rate": 2.5085223081354023e-05, + "loss": 2.9262, + "step": 53190 + }, + { + "epoch": 2.61, + "grad_norm": 0.723823070526123, + "learning_rate": 2.507906029338581e-05, + "loss": 2.9895, + "step": 53191 + }, + { + "epoch": 2.61, + "grad_norm": 0.743118166923523, + "learning_rate": 2.5072898229506423e-05, + "loss": 2.8739, + "step": 53192 + }, + { + "epoch": 2.61, + "grad_norm": 0.7139184474945068, + "learning_rate": 2.506673688973211e-05, + "loss": 2.7876, + "step": 53193 + }, + { + "epoch": 2.61, + "grad_norm": 0.8011366724967957, + "learning_rate": 2.5060576274079024e-05, + "loss": 3.0433, + "step": 53194 + }, + { + "epoch": 2.61, + "grad_norm": 0.7116770148277283, + "learning_rate": 2.505441638256349e-05, + "loss": 3.1634, + "step": 53195 + }, + { + "epoch": 2.61, + "grad_norm": 0.7735740542411804, + "learning_rate": 2.5048257215201762e-05, + "loss": 2.8701, + "step": 53196 + }, + { + "epoch": 2.61, + "grad_norm": 0.7631982564926147, + "learning_rate": 2.5042098772009955e-05, + "loss": 2.9832, + "step": 53197 + }, + { + "epoch": 2.61, + "grad_norm": 0.7225585579872131, + "learning_rate": 2.5035941053004392e-05, + "loss": 3.0262, + "step": 53198 + }, + { + "epoch": 2.61, + "grad_norm": 0.744941234588623, + "learning_rate": 2.5029784058201197e-05, + "loss": 2.9931, + "step": 53199 + }, + { + "epoch": 2.61, + "grad_norm": 0.7380294799804688, + "learning_rate": 2.5023627787616685e-05, + "loss": 2.8191, + "step": 53200 + }, + { + "epoch": 2.61, + "grad_norm": 0.6965009570121765, + "learning_rate": 2.501747224126701e-05, + "loss": 2.9667, + "step": 53201 + }, + { + "epoch": 2.61, + "grad_norm": 0.767867922782898, + "learning_rate": 2.501131741916833e-05, + "loss": 2.8884, + "step": 53202 + }, + { + "epoch": 2.61, + "grad_norm": 0.7580838203430176, + "learning_rate": 2.5005163321336996e-05, + "loss": 2.9514, + "step": 53203 + }, + { + "epoch": 2.61, + "grad_norm": 0.7149063944816589, + "learning_rate": 2.499900994778906e-05, + "loss": 2.8192, + "step": 53204 + }, + { + "epoch": 2.61, + "grad_norm": 0.771682858467102, + "learning_rate": 2.4992857298540813e-05, + "loss": 2.8842, + "step": 53205 + }, + { + "epoch": 2.61, + "grad_norm": 0.7764263153076172, + "learning_rate": 2.498670537360854e-05, + "loss": 2.8959, + "step": 53206 + }, + { + "epoch": 2.61, + "grad_norm": 0.7236824631690979, + "learning_rate": 2.4980554173008326e-05, + "loss": 2.844, + "step": 53207 + }, + { + "epoch": 2.61, + "grad_norm": 0.7100843191146851, + "learning_rate": 2.4974403696756394e-05, + "loss": 2.8599, + "step": 53208 + }, + { + "epoch": 2.61, + "grad_norm": 0.7423948049545288, + "learning_rate": 2.4968253944868933e-05, + "loss": 2.8558, + "step": 53209 + }, + { + "epoch": 2.61, + "grad_norm": 0.698421061038971, + "learning_rate": 2.4962104917362124e-05, + "loss": 2.9021, + "step": 53210 + }, + { + "epoch": 2.61, + "grad_norm": 0.7575321793556213, + "learning_rate": 2.495595661425226e-05, + "loss": 3.1162, + "step": 53211 + }, + { + "epoch": 2.61, + "grad_norm": 0.710386335849762, + "learning_rate": 2.4949809035555425e-05, + "loss": 2.816, + "step": 53212 + }, + { + "epoch": 2.61, + "grad_norm": 0.7282596826553345, + "learning_rate": 2.4943662181287937e-05, + "loss": 2.9855, + "step": 53213 + }, + { + "epoch": 2.61, + "grad_norm": 0.7500553727149963, + "learning_rate": 2.4937516051465888e-05, + "loss": 2.7933, + "step": 53214 + }, + { + "epoch": 2.61, + "grad_norm": 0.8065747022628784, + "learning_rate": 2.493137064610543e-05, + "loss": 2.8999, + "step": 53215 + }, + { + "epoch": 2.61, + "grad_norm": 0.7221754789352417, + "learning_rate": 2.492522596522285e-05, + "loss": 2.7116, + "step": 53216 + }, + { + "epoch": 2.61, + "grad_norm": 0.7535700798034668, + "learning_rate": 2.4919082008834236e-05, + "loss": 2.842, + "step": 53217 + }, + { + "epoch": 2.61, + "grad_norm": 0.7541929483413696, + "learning_rate": 2.4912938776955872e-05, + "loss": 2.9126, + "step": 53218 + }, + { + "epoch": 2.61, + "grad_norm": 0.7476422190666199, + "learning_rate": 2.4906796269603847e-05, + "loss": 2.9692, + "step": 53219 + }, + { + "epoch": 2.61, + "grad_norm": 0.7244073152542114, + "learning_rate": 2.4900654486794346e-05, + "loss": 2.8498, + "step": 53220 + }, + { + "epoch": 2.61, + "grad_norm": 0.7529792189598083, + "learning_rate": 2.489451342854366e-05, + "loss": 3.0183, + "step": 53221 + }, + { + "epoch": 2.61, + "grad_norm": 0.7540156841278076, + "learning_rate": 2.4888373094867876e-05, + "loss": 2.6817, + "step": 53222 + }, + { + "epoch": 2.61, + "grad_norm": 0.7746258974075317, + "learning_rate": 2.4882233485783144e-05, + "loss": 3.0357, + "step": 53223 + }, + { + "epoch": 2.61, + "grad_norm": 0.7871257662773132, + "learning_rate": 2.487609460130565e-05, + "loss": 2.6511, + "step": 53224 + }, + { + "epoch": 2.61, + "grad_norm": 0.7155481576919556, + "learning_rate": 2.4869956441451587e-05, + "loss": 3.1273, + "step": 53225 + }, + { + "epoch": 2.61, + "grad_norm": 0.7250311970710754, + "learning_rate": 2.4863819006237072e-05, + "loss": 2.9099, + "step": 53226 + }, + { + "epoch": 2.61, + "grad_norm": 0.7574231028556824, + "learning_rate": 2.4857682295678326e-05, + "loss": 3.0626, + "step": 53227 + }, + { + "epoch": 2.61, + "grad_norm": 0.7355062961578369, + "learning_rate": 2.4851546309791536e-05, + "loss": 2.9971, + "step": 53228 + }, + { + "epoch": 2.61, + "grad_norm": 0.7345170974731445, + "learning_rate": 2.484541104859282e-05, + "loss": 3.0486, + "step": 53229 + }, + { + "epoch": 2.61, + "grad_norm": 0.6943804621696472, + "learning_rate": 2.4839276512098304e-05, + "loss": 2.799, + "step": 53230 + }, + { + "epoch": 2.61, + "grad_norm": 0.7673830389976501, + "learning_rate": 2.483314270032417e-05, + "loss": 2.7265, + "step": 53231 + }, + { + "epoch": 2.61, + "grad_norm": 0.7196083068847656, + "learning_rate": 2.4827009613286575e-05, + "loss": 2.816, + "step": 53232 + }, + { + "epoch": 2.61, + "grad_norm": 0.734291136264801, + "learning_rate": 2.4820877251001702e-05, + "loss": 2.8073, + "step": 53233 + }, + { + "epoch": 2.61, + "grad_norm": 0.7641286253929138, + "learning_rate": 2.4814745613485643e-05, + "loss": 3.1057, + "step": 53234 + }, + { + "epoch": 2.61, + "grad_norm": 0.7599497437477112, + "learning_rate": 2.4808614700754647e-05, + "loss": 2.8137, + "step": 53235 + }, + { + "epoch": 2.61, + "grad_norm": 0.7263058423995972, + "learning_rate": 2.480248451282474e-05, + "loss": 2.8649, + "step": 53236 + }, + { + "epoch": 2.61, + "grad_norm": 0.7398238182067871, + "learning_rate": 2.479635504971217e-05, + "loss": 2.7207, + "step": 53237 + }, + { + "epoch": 2.61, + "grad_norm": 0.7094271779060364, + "learning_rate": 2.4790226311433025e-05, + "loss": 2.9685, + "step": 53238 + }, + { + "epoch": 2.61, + "grad_norm": 0.7355355620384216, + "learning_rate": 2.478409829800343e-05, + "loss": 2.8561, + "step": 53239 + }, + { + "epoch": 2.61, + "grad_norm": 0.747589111328125, + "learning_rate": 2.4777971009439602e-05, + "loss": 2.9834, + "step": 53240 + }, + { + "epoch": 2.61, + "grad_norm": 0.7668625116348267, + "learning_rate": 2.477184444575756e-05, + "loss": 2.7875, + "step": 53241 + }, + { + "epoch": 2.61, + "grad_norm": 0.7156922221183777, + "learning_rate": 2.476571860697353e-05, + "loss": 2.7711, + "step": 53242 + }, + { + "epoch": 2.61, + "grad_norm": 0.7356650233268738, + "learning_rate": 2.475959349310369e-05, + "loss": 2.8436, + "step": 53243 + }, + { + "epoch": 2.61, + "grad_norm": 0.7467832565307617, + "learning_rate": 2.47534691041641e-05, + "loss": 2.7338, + "step": 53244 + }, + { + "epoch": 2.61, + "grad_norm": 0.7915269732475281, + "learning_rate": 2.4747345440170885e-05, + "loss": 2.8669, + "step": 53245 + }, + { + "epoch": 2.61, + "grad_norm": 0.7523009181022644, + "learning_rate": 2.4741222501140155e-05, + "loss": 2.8385, + "step": 53246 + }, + { + "epoch": 2.61, + "grad_norm": 0.8412205576896667, + "learning_rate": 2.4735100287088038e-05, + "loss": 3.0301, + "step": 53247 + }, + { + "epoch": 2.61, + "grad_norm": 0.7007306814193726, + "learning_rate": 2.4728978798030787e-05, + "loss": 3.0305, + "step": 53248 + }, + { + "epoch": 2.61, + "grad_norm": 0.7333875298500061, + "learning_rate": 2.472285803398435e-05, + "loss": 2.9612, + "step": 53249 + }, + { + "epoch": 2.61, + "grad_norm": 0.8363428711891174, + "learning_rate": 2.4716737994964986e-05, + "loss": 2.8032, + "step": 53250 + }, + { + "epoch": 2.61, + "grad_norm": 0.7706584334373474, + "learning_rate": 2.4710618680988746e-05, + "loss": 3.0626, + "step": 53251 + }, + { + "epoch": 2.61, + "grad_norm": 0.7772002816200256, + "learning_rate": 2.4704500092071723e-05, + "loss": 2.8498, + "step": 53252 + }, + { + "epoch": 2.61, + "grad_norm": 0.7295860648155212, + "learning_rate": 2.4698382228230095e-05, + "loss": 2.9828, + "step": 53253 + }, + { + "epoch": 2.61, + "grad_norm": 0.763123631477356, + "learning_rate": 2.4692265089479922e-05, + "loss": 2.9188, + "step": 53254 + }, + { + "epoch": 2.61, + "grad_norm": 0.8055310845375061, + "learning_rate": 2.468614867583736e-05, + "loss": 2.993, + "step": 53255 + }, + { + "epoch": 2.61, + "grad_norm": 0.7372824549674988, + "learning_rate": 2.4680032987318454e-05, + "loss": 2.5725, + "step": 53256 + }, + { + "epoch": 2.61, + "grad_norm": 0.8814607262611389, + "learning_rate": 2.4673918023939364e-05, + "loss": 2.8363, + "step": 53257 + }, + { + "epoch": 2.61, + "grad_norm": 0.7216055393218994, + "learning_rate": 2.4667803785716245e-05, + "loss": 2.7199, + "step": 53258 + }, + { + "epoch": 2.61, + "grad_norm": 0.7859689593315125, + "learning_rate": 2.4661690272665114e-05, + "loss": 2.9632, + "step": 53259 + }, + { + "epoch": 2.61, + "grad_norm": 0.7562228441238403, + "learning_rate": 2.4655577484802125e-05, + "loss": 2.9153, + "step": 53260 + }, + { + "epoch": 2.61, + "grad_norm": 0.7715989351272583, + "learning_rate": 2.46494654221433e-05, + "loss": 2.8636, + "step": 53261 + }, + { + "epoch": 2.61, + "grad_norm": 0.7629818916320801, + "learning_rate": 2.464335408470479e-05, + "loss": 3.0001, + "step": 53262 + }, + { + "epoch": 2.61, + "grad_norm": 0.8690528273582458, + "learning_rate": 2.4637243472502723e-05, + "loss": 2.9002, + "step": 53263 + }, + { + "epoch": 2.61, + "grad_norm": 0.7542612552642822, + "learning_rate": 2.4631133585553144e-05, + "loss": 2.8384, + "step": 53264 + }, + { + "epoch": 2.61, + "grad_norm": 0.7296197414398193, + "learning_rate": 2.462502442387221e-05, + "loss": 2.9057, + "step": 53265 + }, + { + "epoch": 2.61, + "grad_norm": 0.7588483095169067, + "learning_rate": 2.461891598747594e-05, + "loss": 2.5271, + "step": 53266 + }, + { + "epoch": 2.61, + "grad_norm": 0.750971794128418, + "learning_rate": 2.461280827638039e-05, + "loss": 2.8642, + "step": 53267 + }, + { + "epoch": 2.61, + "grad_norm": 0.7146446704864502, + "learning_rate": 2.4606701290601783e-05, + "loss": 2.8347, + "step": 53268 + }, + { + "epoch": 2.61, + "grad_norm": 0.755339503288269, + "learning_rate": 2.4600595030156066e-05, + "loss": 2.8039, + "step": 53269 + }, + { + "epoch": 2.61, + "grad_norm": 0.7227151393890381, + "learning_rate": 2.4594489495059434e-05, + "loss": 3.0266, + "step": 53270 + }, + { + "epoch": 2.61, + "grad_norm": 0.7845844626426697, + "learning_rate": 2.4588384685327865e-05, + "loss": 2.7491, + "step": 53271 + }, + { + "epoch": 2.61, + "grad_norm": 0.7608388066291809, + "learning_rate": 2.4582280600977522e-05, + "loss": 3.0839, + "step": 53272 + }, + { + "epoch": 2.61, + "grad_norm": 0.7561764121055603, + "learning_rate": 2.4576177242024453e-05, + "loss": 3.1059, + "step": 53273 + }, + { + "epoch": 2.61, + "grad_norm": 0.7092791199684143, + "learning_rate": 2.457007460848468e-05, + "loss": 2.9953, + "step": 53274 + }, + { + "epoch": 2.61, + "grad_norm": 0.7406812906265259, + "learning_rate": 2.4563972700374357e-05, + "loss": 3.1362, + "step": 53275 + }, + { + "epoch": 2.61, + "grad_norm": 0.7598647475242615, + "learning_rate": 2.4557871517709503e-05, + "loss": 2.6158, + "step": 53276 + }, + { + "epoch": 2.61, + "grad_norm": 0.7269623279571533, + "learning_rate": 2.4551771060506177e-05, + "loss": 2.9399, + "step": 53277 + }, + { + "epoch": 2.61, + "grad_norm": 0.7937368154525757, + "learning_rate": 2.4545671328780526e-05, + "loss": 2.7327, + "step": 53278 + }, + { + "epoch": 2.61, + "grad_norm": 0.7488956451416016, + "learning_rate": 2.453957232254854e-05, + "loss": 2.8383, + "step": 53279 + }, + { + "epoch": 2.61, + "grad_norm": 0.7417857646942139, + "learning_rate": 2.4533474041826338e-05, + "loss": 2.8689, + "step": 53280 + }, + { + "epoch": 2.61, + "grad_norm": 0.7532116770744324, + "learning_rate": 2.4527376486629945e-05, + "loss": 2.963, + "step": 53281 + }, + { + "epoch": 2.61, + "grad_norm": 0.7646198868751526, + "learning_rate": 2.452127965697538e-05, + "loss": 3.0262, + "step": 53282 + }, + { + "epoch": 2.61, + "grad_norm": 0.7706944346427917, + "learning_rate": 2.4515183552878824e-05, + "loss": 2.781, + "step": 53283 + }, + { + "epoch": 2.61, + "grad_norm": 0.7329809665679932, + "learning_rate": 2.450908817435617e-05, + "loss": 2.8377, + "step": 53284 + }, + { + "epoch": 2.61, + "grad_norm": 0.7508072257041931, + "learning_rate": 2.4502993521423643e-05, + "loss": 2.8186, + "step": 53285 + }, + { + "epoch": 2.61, + "grad_norm": 0.755642831325531, + "learning_rate": 2.4496899594097118e-05, + "loss": 2.888, + "step": 53286 + }, + { + "epoch": 2.61, + "grad_norm": 0.7248871922492981, + "learning_rate": 2.4490806392392826e-05, + "loss": 2.7755, + "step": 53287 + }, + { + "epoch": 2.61, + "grad_norm": 0.7461181879043579, + "learning_rate": 2.4484713916326715e-05, + "loss": 2.8476, + "step": 53288 + }, + { + "epoch": 2.61, + "grad_norm": 0.7649577856063843, + "learning_rate": 2.4478622165914774e-05, + "loss": 2.784, + "step": 53289 + }, + { + "epoch": 2.61, + "grad_norm": 0.7114865779876709, + "learning_rate": 2.447253114117319e-05, + "loss": 2.9854, + "step": 53290 + }, + { + "epoch": 2.61, + "grad_norm": 0.7199516892433167, + "learning_rate": 2.4466440842117886e-05, + "loss": 2.9036, + "step": 53291 + }, + { + "epoch": 2.61, + "grad_norm": 0.7920954823493958, + "learning_rate": 2.4460351268764943e-05, + "loss": 2.7358, + "step": 53292 + }, + { + "epoch": 2.61, + "grad_norm": 0.762523889541626, + "learning_rate": 2.4454262421130456e-05, + "loss": 2.8051, + "step": 53293 + }, + { + "epoch": 2.61, + "grad_norm": 0.7442542910575867, + "learning_rate": 2.4448174299230404e-05, + "loss": 2.7057, + "step": 53294 + }, + { + "epoch": 2.61, + "grad_norm": 0.7322526574134827, + "learning_rate": 2.4442086903080815e-05, + "loss": 2.954, + "step": 53295 + }, + { + "epoch": 2.61, + "grad_norm": 0.7498273849487305, + "learning_rate": 2.4436000232697706e-05, + "loss": 2.9617, + "step": 53296 + }, + { + "epoch": 2.61, + "grad_norm": 0.8006744384765625, + "learning_rate": 2.4429914288097162e-05, + "loss": 2.8717, + "step": 53297 + }, + { + "epoch": 2.61, + "grad_norm": 0.7188971638679504, + "learning_rate": 2.4423829069295207e-05, + "loss": 2.9337, + "step": 53298 + }, + { + "epoch": 2.61, + "grad_norm": 0.728274941444397, + "learning_rate": 2.4417744576307796e-05, + "loss": 2.8373, + "step": 53299 + }, + { + "epoch": 2.61, + "grad_norm": 0.7777703404426575, + "learning_rate": 2.441166080915108e-05, + "loss": 2.7575, + "step": 53300 + }, + { + "epoch": 2.61, + "grad_norm": 0.8000141978263855, + "learning_rate": 2.4405577767840946e-05, + "loss": 3.0339, + "step": 53301 + }, + { + "epoch": 2.61, + "grad_norm": 0.749862551689148, + "learning_rate": 2.4399495452393547e-05, + "loss": 2.854, + "step": 53302 + }, + { + "epoch": 2.61, + "grad_norm": 0.7590504884719849, + "learning_rate": 2.439341386282484e-05, + "loss": 2.7715, + "step": 53303 + }, + { + "epoch": 2.61, + "grad_norm": 0.7074437737464905, + "learning_rate": 2.4387332999150778e-05, + "loss": 2.9909, + "step": 53304 + }, + { + "epoch": 2.61, + "grad_norm": 0.7344616651535034, + "learning_rate": 2.4381252861387478e-05, + "loss": 3.0727, + "step": 53305 + }, + { + "epoch": 2.61, + "grad_norm": 0.7661343812942505, + "learning_rate": 2.43751734495509e-05, + "loss": 2.9525, + "step": 53306 + }, + { + "epoch": 2.61, + "grad_norm": 0.7796639204025269, + "learning_rate": 2.4369094763657093e-05, + "loss": 2.9349, + "step": 53307 + }, + { + "epoch": 2.61, + "grad_norm": 0.749068021774292, + "learning_rate": 2.4363016803721978e-05, + "loss": 3.0218, + "step": 53308 + }, + { + "epoch": 2.61, + "grad_norm": 0.7495163083076477, + "learning_rate": 2.4356939569761713e-05, + "loss": 2.7993, + "step": 53309 + }, + { + "epoch": 2.61, + "grad_norm": 0.7630246877670288, + "learning_rate": 2.435086306179218e-05, + "loss": 2.8242, + "step": 53310 + }, + { + "epoch": 2.61, + "grad_norm": 0.7312656044960022, + "learning_rate": 2.43447872798294e-05, + "loss": 2.8078, + "step": 53311 + }, + { + "epoch": 2.61, + "grad_norm": 0.7347916960716248, + "learning_rate": 2.4338712223889467e-05, + "loss": 2.7272, + "step": 53312 + }, + { + "epoch": 2.61, + "grad_norm": 0.7420660853385925, + "learning_rate": 2.4332637893988228e-05, + "loss": 3.0838, + "step": 53313 + }, + { + "epoch": 2.61, + "grad_norm": 0.7539196014404297, + "learning_rate": 2.4326564290141802e-05, + "loss": 2.8889, + "step": 53314 + }, + { + "epoch": 2.61, + "grad_norm": 0.7311657071113586, + "learning_rate": 2.4320491412366183e-05, + "loss": 2.8895, + "step": 53315 + }, + { + "epoch": 2.61, + "grad_norm": 0.7519007921218872, + "learning_rate": 2.431441926067732e-05, + "loss": 2.7742, + "step": 53316 + }, + { + "epoch": 2.61, + "grad_norm": 0.7469418048858643, + "learning_rate": 2.4308347835091234e-05, + "loss": 2.7949, + "step": 53317 + }, + { + "epoch": 2.61, + "grad_norm": 0.7581943273544312, + "learning_rate": 2.4302277135623883e-05, + "loss": 2.8109, + "step": 53318 + }, + { + "epoch": 2.61, + "grad_norm": 0.7415329813957214, + "learning_rate": 2.4296207162291213e-05, + "loss": 3.1124, + "step": 53319 + }, + { + "epoch": 2.61, + "grad_norm": 0.784935712814331, + "learning_rate": 2.4290137915109386e-05, + "loss": 2.8482, + "step": 53320 + }, + { + "epoch": 2.61, + "grad_norm": 0.6955008506774902, + "learning_rate": 2.4284069394094187e-05, + "loss": 2.8681, + "step": 53321 + }, + { + "epoch": 2.61, + "grad_norm": 0.7275387644767761, + "learning_rate": 2.4278001599261732e-05, + "loss": 2.9916, + "step": 53322 + }, + { + "epoch": 2.61, + "grad_norm": 0.7459524273872375, + "learning_rate": 2.4271934530627913e-05, + "loss": 2.751, + "step": 53323 + }, + { + "epoch": 2.61, + "grad_norm": 0.7747052311897278, + "learning_rate": 2.4265868188208813e-05, + "loss": 2.9919, + "step": 53324 + }, + { + "epoch": 2.61, + "grad_norm": 0.7346550822257996, + "learning_rate": 2.425980257202036e-05, + "loss": 2.6747, + "step": 53325 + }, + { + "epoch": 2.61, + "grad_norm": 0.7680572271347046, + "learning_rate": 2.4253737682078433e-05, + "loss": 2.7634, + "step": 53326 + }, + { + "epoch": 2.61, + "grad_norm": 0.7524115443229675, + "learning_rate": 2.424767351839919e-05, + "loss": 3.0331, + "step": 53327 + }, + { + "epoch": 2.61, + "grad_norm": 0.7559760808944702, + "learning_rate": 2.424161008099842e-05, + "loss": 2.8906, + "step": 53328 + }, + { + "epoch": 2.61, + "grad_norm": 0.7822624444961548, + "learning_rate": 2.4235547369892174e-05, + "loss": 2.6455, + "step": 53329 + }, + { + "epoch": 2.61, + "grad_norm": 0.6873143315315247, + "learning_rate": 2.4229485385096504e-05, + "loss": 2.9405, + "step": 53330 + }, + { + "epoch": 2.61, + "grad_norm": 0.7661668658256531, + "learning_rate": 2.4223424126627267e-05, + "loss": 2.9843, + "step": 53331 + }, + { + "epoch": 2.61, + "grad_norm": 0.7725754380226135, + "learning_rate": 2.4217363594500484e-05, + "loss": 2.8333, + "step": 53332 + }, + { + "epoch": 2.61, + "grad_norm": 0.7318544983863831, + "learning_rate": 2.421130378873204e-05, + "loss": 2.8713, + "step": 53333 + }, + { + "epoch": 2.61, + "grad_norm": 0.7380740642547607, + "learning_rate": 2.4205244709337923e-05, + "loss": 3.1337, + "step": 53334 + }, + { + "epoch": 2.61, + "grad_norm": 0.7293857932090759, + "learning_rate": 2.4199186356334156e-05, + "loss": 2.7604, + "step": 53335 + }, + { + "epoch": 2.61, + "grad_norm": 0.8121163845062256, + "learning_rate": 2.419312872973662e-05, + "loss": 2.935, + "step": 53336 + }, + { + "epoch": 2.61, + "grad_norm": 0.7472822070121765, + "learning_rate": 2.4187071829561376e-05, + "loss": 2.801, + "step": 53337 + }, + { + "epoch": 2.61, + "grad_norm": 0.7153192758560181, + "learning_rate": 2.418101565582421e-05, + "loss": 2.7313, + "step": 53338 + }, + { + "epoch": 2.61, + "grad_norm": 0.7651809453964233, + "learning_rate": 2.4174960208541238e-05, + "loss": 3.0331, + "step": 53339 + }, + { + "epoch": 2.61, + "grad_norm": 0.7362973093986511, + "learning_rate": 2.4168905487728353e-05, + "loss": 2.9481, + "step": 53340 + }, + { + "epoch": 2.61, + "grad_norm": 0.8571172952651978, + "learning_rate": 2.4162851493401403e-05, + "loss": 2.7162, + "step": 53341 + }, + { + "epoch": 2.61, + "grad_norm": 0.766938328742981, + "learning_rate": 2.4156798225576478e-05, + "loss": 3.0625, + "step": 53342 + }, + { + "epoch": 2.61, + "grad_norm": 0.7803354859352112, + "learning_rate": 2.4150745684269436e-05, + "loss": 2.8998, + "step": 53343 + }, + { + "epoch": 2.61, + "grad_norm": 0.7079799771308899, + "learning_rate": 2.414469386949619e-05, + "loss": 2.8232, + "step": 53344 + }, + { + "epoch": 2.61, + "grad_norm": 0.7367957830429077, + "learning_rate": 2.413864278127283e-05, + "loss": 2.5717, + "step": 53345 + }, + { + "epoch": 2.61, + "grad_norm": 0.7332743406295776, + "learning_rate": 2.4132592419615184e-05, + "loss": 2.8145, + "step": 53346 + }, + { + "epoch": 2.61, + "grad_norm": 0.724522590637207, + "learning_rate": 2.412654278453916e-05, + "loss": 3.0007, + "step": 53347 + }, + { + "epoch": 2.61, + "grad_norm": 0.7491048574447632, + "learning_rate": 2.412049387606072e-05, + "loss": 2.6494, + "step": 53348 + }, + { + "epoch": 2.61, + "grad_norm": 0.7689740061759949, + "learning_rate": 2.4114445694195783e-05, + "loss": 2.8103, + "step": 53349 + }, + { + "epoch": 2.61, + "grad_norm": 0.8025519251823425, + "learning_rate": 2.410839823896037e-05, + "loss": 2.7399, + "step": 53350 + }, + { + "epoch": 2.61, + "grad_norm": 0.7537023425102234, + "learning_rate": 2.4102351510370267e-05, + "loss": 2.8394, + "step": 53351 + }, + { + "epoch": 2.61, + "grad_norm": 0.7922477722167969, + "learning_rate": 2.409630550844156e-05, + "loss": 2.9793, + "step": 53352 + }, + { + "epoch": 2.61, + "grad_norm": 0.77238529920578, + "learning_rate": 2.409026023319004e-05, + "loss": 2.83, + "step": 53353 + }, + { + "epoch": 2.61, + "grad_norm": 0.7189983129501343, + "learning_rate": 2.408421568463166e-05, + "loss": 3.0147, + "step": 53354 + }, + { + "epoch": 2.61, + "grad_norm": 0.7614080905914307, + "learning_rate": 2.4078171862782403e-05, + "loss": 2.7063, + "step": 53355 + }, + { + "epoch": 2.61, + "grad_norm": 0.749724805355072, + "learning_rate": 2.4072128767658062e-05, + "loss": 2.812, + "step": 53356 + }, + { + "epoch": 2.61, + "grad_norm": 0.7478910684585571, + "learning_rate": 2.4066086399274687e-05, + "loss": 2.8834, + "step": 53357 + }, + { + "epoch": 2.61, + "grad_norm": 0.8119695782661438, + "learning_rate": 2.4060044757648102e-05, + "loss": 3.0316, + "step": 53358 + }, + { + "epoch": 2.61, + "grad_norm": 0.7431463003158569, + "learning_rate": 2.4054003842794257e-05, + "loss": 2.8187, + "step": 53359 + }, + { + "epoch": 2.62, + "grad_norm": 0.7376152276992798, + "learning_rate": 2.4047963654729075e-05, + "loss": 2.7177, + "step": 53360 + }, + { + "epoch": 2.62, + "grad_norm": 0.7031460404396057, + "learning_rate": 2.4041924193468475e-05, + "loss": 2.753, + "step": 53361 + }, + { + "epoch": 2.62, + "grad_norm": 0.7614917755126953, + "learning_rate": 2.4035885459028347e-05, + "loss": 2.8834, + "step": 53362 + }, + { + "epoch": 2.62, + "grad_norm": 0.7040819525718689, + "learning_rate": 2.4029847451424512e-05, + "loss": 2.9195, + "step": 53363 + }, + { + "epoch": 2.62, + "grad_norm": 0.7586194276809692, + "learning_rate": 2.4023810170672954e-05, + "loss": 3.0339, + "step": 53364 + }, + { + "epoch": 2.62, + "grad_norm": 0.7714332938194275, + "learning_rate": 2.4017773616789628e-05, + "loss": 2.9253, + "step": 53365 + }, + { + "epoch": 2.62, + "grad_norm": 0.7194476127624512, + "learning_rate": 2.401173778979032e-05, + "loss": 2.8969, + "step": 53366 + }, + { + "epoch": 2.62, + "grad_norm": 0.7926198840141296, + "learning_rate": 2.400570268969102e-05, + "loss": 3.1839, + "step": 53367 + }, + { + "epoch": 2.62, + "grad_norm": 0.7163087129592896, + "learning_rate": 2.3999668316507582e-05, + "loss": 2.7451, + "step": 53368 + }, + { + "epoch": 2.62, + "grad_norm": 0.7153766751289368, + "learning_rate": 2.3993634670255856e-05, + "loss": 2.8174, + "step": 53369 + }, + { + "epoch": 2.62, + "grad_norm": 0.7822369933128357, + "learning_rate": 2.3987601750951835e-05, + "loss": 2.7719, + "step": 53370 + }, + { + "epoch": 2.62, + "grad_norm": 0.7908663749694824, + "learning_rate": 2.3981569558611303e-05, + "loss": 2.7651, + "step": 53371 + }, + { + "epoch": 2.62, + "grad_norm": 0.7336825132369995, + "learning_rate": 2.3975538093250245e-05, + "loss": 2.8495, + "step": 53372 + }, + { + "epoch": 2.62, + "grad_norm": 0.8331197500228882, + "learning_rate": 2.396950735488442e-05, + "loss": 2.895, + "step": 53373 + }, + { + "epoch": 2.62, + "grad_norm": 0.7461316585540771, + "learning_rate": 2.396347734352988e-05, + "loss": 2.8857, + "step": 53374 + }, + { + "epoch": 2.62, + "grad_norm": 0.7184967994689941, + "learning_rate": 2.395744805920238e-05, + "loss": 2.9948, + "step": 53375 + }, + { + "epoch": 2.62, + "grad_norm": 0.8150850534439087, + "learning_rate": 2.39514195019178e-05, + "loss": 2.8987, + "step": 53376 + }, + { + "epoch": 2.62, + "grad_norm": 0.750407338142395, + "learning_rate": 2.3945391671692138e-05, + "loss": 3.0958, + "step": 53377 + }, + { + "epoch": 2.62, + "grad_norm": 0.7473511099815369, + "learning_rate": 2.3939364568541075e-05, + "loss": 2.9058, + "step": 53378 + }, + { + "epoch": 2.62, + "grad_norm": 0.7648722529411316, + "learning_rate": 2.39333381924807e-05, + "loss": 2.9031, + "step": 53379 + }, + { + "epoch": 2.62, + "grad_norm": 0.7389246821403503, + "learning_rate": 2.39273125435267e-05, + "loss": 3.0294, + "step": 53380 + }, + { + "epoch": 2.62, + "grad_norm": 0.7998449206352234, + "learning_rate": 2.392128762169503e-05, + "loss": 2.9568, + "step": 53381 + }, + { + "epoch": 2.62, + "grad_norm": 0.7724308371543884, + "learning_rate": 2.3915263427001608e-05, + "loss": 2.7814, + "step": 53382 + }, + { + "epoch": 2.62, + "grad_norm": 0.7729584574699402, + "learning_rate": 2.3909239959462258e-05, + "loss": 2.9317, + "step": 53383 + }, + { + "epoch": 2.62, + "grad_norm": 0.7403950095176697, + "learning_rate": 2.390321721909283e-05, + "loss": 2.8143, + "step": 53384 + }, + { + "epoch": 2.62, + "grad_norm": 0.7252776026725769, + "learning_rate": 2.3897195205909148e-05, + "loss": 2.7213, + "step": 53385 + }, + { + "epoch": 2.62, + "grad_norm": 0.7848640084266663, + "learning_rate": 2.389117391992713e-05, + "loss": 2.8572, + "step": 53386 + }, + { + "epoch": 2.62, + "grad_norm": 0.7389460206031799, + "learning_rate": 2.3885153361162636e-05, + "loss": 2.7936, + "step": 53387 + }, + { + "epoch": 2.62, + "grad_norm": 0.7464485764503479, + "learning_rate": 2.3879133529631477e-05, + "loss": 2.8893, + "step": 53388 + }, + { + "epoch": 2.62, + "grad_norm": 0.7308782339096069, + "learning_rate": 2.3873114425349584e-05, + "loss": 2.8139, + "step": 53389 + }, + { + "epoch": 2.62, + "grad_norm": 0.7255467772483826, + "learning_rate": 2.38670960483328e-05, + "loss": 3.0575, + "step": 53390 + }, + { + "epoch": 2.62, + "grad_norm": 0.7490675449371338, + "learning_rate": 2.3861078398596857e-05, + "loss": 2.8725, + "step": 53391 + }, + { + "epoch": 2.62, + "grad_norm": 0.7539382576942444, + "learning_rate": 2.3855061476157733e-05, + "loss": 2.81, + "step": 53392 + }, + { + "epoch": 2.62, + "grad_norm": 0.7484925389289856, + "learning_rate": 2.3849045281031188e-05, + "loss": 2.9626, + "step": 53393 + }, + { + "epoch": 2.62, + "grad_norm": 0.7690110802650452, + "learning_rate": 2.3843029813233173e-05, + "loss": 3.0192, + "step": 53394 + }, + { + "epoch": 2.62, + "grad_norm": 0.697595477104187, + "learning_rate": 2.383701507277941e-05, + "loss": 3.0619, + "step": 53395 + }, + { + "epoch": 2.62, + "grad_norm": 0.7498865723609924, + "learning_rate": 2.3831001059685884e-05, + "loss": 2.7544, + "step": 53396 + }, + { + "epoch": 2.62, + "grad_norm": 0.6930327415466309, + "learning_rate": 2.382498777396832e-05, + "loss": 2.9071, + "step": 53397 + }, + { + "epoch": 2.62, + "grad_norm": 0.7832930088043213, + "learning_rate": 2.3818975215642532e-05, + "loss": 2.8844, + "step": 53398 + }, + { + "epoch": 2.62, + "grad_norm": 0.7338464856147766, + "learning_rate": 2.381296338472448e-05, + "loss": 2.8968, + "step": 53399 + }, + { + "epoch": 2.62, + "grad_norm": 0.7855767607688904, + "learning_rate": 2.3806952281229853e-05, + "loss": 2.6963, + "step": 53400 + }, + { + "epoch": 2.62, + "grad_norm": 0.7518692016601562, + "learning_rate": 2.3800941905174563e-05, + "loss": 2.9422, + "step": 53401 + }, + { + "epoch": 2.62, + "grad_norm": 0.7118070721626282, + "learning_rate": 2.3794932256574505e-05, + "loss": 2.9071, + "step": 53402 + }, + { + "epoch": 2.62, + "grad_norm": 0.8377758264541626, + "learning_rate": 2.3788923335445398e-05, + "loss": 3.1013, + "step": 53403 + }, + { + "epoch": 2.62, + "grad_norm": 0.7513577342033386, + "learning_rate": 2.3782915141803127e-05, + "loss": 3.0399, + "step": 53404 + }, + { + "epoch": 2.62, + "grad_norm": 0.796913206577301, + "learning_rate": 2.3776907675663513e-05, + "loss": 2.7059, + "step": 53405 + }, + { + "epoch": 2.62, + "grad_norm": 0.7465709447860718, + "learning_rate": 2.3770900937042314e-05, + "loss": 2.7208, + "step": 53406 + }, + { + "epoch": 2.62, + "grad_norm": 0.7374803423881531, + "learning_rate": 2.376489492595548e-05, + "loss": 2.7431, + "step": 53407 + }, + { + "epoch": 2.62, + "grad_norm": 0.7477961182594299, + "learning_rate": 2.3758889642418665e-05, + "loss": 2.7817, + "step": 53408 + }, + { + "epoch": 2.62, + "grad_norm": 0.8048363327980042, + "learning_rate": 2.3752885086447827e-05, + "loss": 2.9601, + "step": 53409 + }, + { + "epoch": 2.62, + "grad_norm": 0.7863703370094299, + "learning_rate": 2.374688125805868e-05, + "loss": 2.7946, + "step": 53410 + }, + { + "epoch": 2.62, + "grad_norm": 0.7128931283950806, + "learning_rate": 2.3740878157267118e-05, + "loss": 2.9408, + "step": 53411 + }, + { + "epoch": 2.62, + "grad_norm": 0.7168523073196411, + "learning_rate": 2.3734875784088926e-05, + "loss": 2.8748, + "step": 53412 + }, + { + "epoch": 2.62, + "grad_norm": 0.7598782181739807, + "learning_rate": 2.3728874138539856e-05, + "loss": 2.8996, + "step": 53413 + }, + { + "epoch": 2.62, + "grad_norm": 0.7533124089241028, + "learning_rate": 2.3722873220635795e-05, + "loss": 2.7102, + "step": 53414 + }, + { + "epoch": 2.62, + "grad_norm": 0.7148170471191406, + "learning_rate": 2.3716873030392504e-05, + "loss": 2.9118, + "step": 53415 + }, + { + "epoch": 2.62, + "grad_norm": 0.7307083606719971, + "learning_rate": 2.371087356782576e-05, + "loss": 2.8594, + "step": 53416 + }, + { + "epoch": 2.62, + "grad_norm": 0.7332124710083008, + "learning_rate": 2.3704874832951458e-05, + "loss": 3.065, + "step": 53417 + }, + { + "epoch": 2.62, + "grad_norm": 0.7090801000595093, + "learning_rate": 2.369887682578535e-05, + "loss": 2.8158, + "step": 53418 + }, + { + "epoch": 2.62, + "grad_norm": 0.7521113157272339, + "learning_rate": 2.3692879546343224e-05, + "loss": 2.92, + "step": 53419 + }, + { + "epoch": 2.62, + "grad_norm": 0.7190861105918884, + "learning_rate": 2.368688299464083e-05, + "loss": 2.8477, + "step": 53420 + }, + { + "epoch": 2.62, + "grad_norm": 0.7460111975669861, + "learning_rate": 2.3680887170693995e-05, + "loss": 2.887, + "step": 53421 + }, + { + "epoch": 2.62, + "grad_norm": 0.7339314222335815, + "learning_rate": 2.3674892074518602e-05, + "loss": 2.8122, + "step": 53422 + }, + { + "epoch": 2.62, + "grad_norm": 0.7279649972915649, + "learning_rate": 2.366889770613031e-05, + "loss": 2.9055, + "step": 53423 + }, + { + "epoch": 2.62, + "grad_norm": 0.7642202973365784, + "learning_rate": 2.3662904065544996e-05, + "loss": 2.8235, + "step": 53424 + }, + { + "epoch": 2.62, + "grad_norm": 0.726921021938324, + "learning_rate": 2.3656911152778357e-05, + "loss": 2.9702, + "step": 53425 + }, + { + "epoch": 2.62, + "grad_norm": 0.7702478170394897, + "learning_rate": 2.3650918967846276e-05, + "loss": 2.729, + "step": 53426 + }, + { + "epoch": 2.62, + "grad_norm": 0.7660560011863708, + "learning_rate": 2.364492751076451e-05, + "loss": 3.0538, + "step": 53427 + }, + { + "epoch": 2.62, + "grad_norm": 0.7796090245246887, + "learning_rate": 2.3638936781548745e-05, + "loss": 2.9503, + "step": 53428 + }, + { + "epoch": 2.62, + "grad_norm": 0.7895865440368652, + "learning_rate": 2.36329467802149e-05, + "loss": 2.9029, + "step": 53429 + }, + { + "epoch": 2.62, + "grad_norm": 0.7662897109985352, + "learning_rate": 2.362695750677863e-05, + "loss": 2.8069, + "step": 53430 + }, + { + "epoch": 2.62, + "grad_norm": 0.7575547695159912, + "learning_rate": 2.362096896125576e-05, + "loss": 3.0073, + "step": 53431 + }, + { + "epoch": 2.62, + "grad_norm": 0.753297746181488, + "learning_rate": 2.3614981143662136e-05, + "loss": 2.9955, + "step": 53432 + }, + { + "epoch": 2.62, + "grad_norm": 0.7785336971282959, + "learning_rate": 2.3608994054013452e-05, + "loss": 2.8248, + "step": 53433 + }, + { + "epoch": 2.62, + "grad_norm": 0.7514358758926392, + "learning_rate": 2.3603007692325493e-05, + "loss": 2.9667, + "step": 53434 + }, + { + "epoch": 2.62, + "grad_norm": 0.7518543004989624, + "learning_rate": 2.3597022058613946e-05, + "loss": 3.0098, + "step": 53435 + }, + { + "epoch": 2.62, + "grad_norm": 0.8257125616073608, + "learning_rate": 2.3591037152894664e-05, + "loss": 2.9063, + "step": 53436 + }, + { + "epoch": 2.62, + "grad_norm": 0.7336806654930115, + "learning_rate": 2.358505297518344e-05, + "loss": 2.9125, + "step": 53437 + }, + { + "epoch": 2.62, + "grad_norm": 0.7743903398513794, + "learning_rate": 2.3579069525495918e-05, + "loss": 2.756, + "step": 53438 + }, + { + "epoch": 2.62, + "grad_norm": 0.7176278233528137, + "learning_rate": 2.3573086803848028e-05, + "loss": 2.8686, + "step": 53439 + }, + { + "epoch": 2.62, + "grad_norm": 0.7256504893302917, + "learning_rate": 2.3567104810255323e-05, + "loss": 2.8576, + "step": 53440 + }, + { + "epoch": 2.62, + "grad_norm": 0.7542241215705872, + "learning_rate": 2.356112354473375e-05, + "loss": 2.9628, + "step": 53441 + }, + { + "epoch": 2.62, + "grad_norm": 0.7405779957771301, + "learning_rate": 2.355514300729897e-05, + "loss": 2.891, + "step": 53442 + }, + { + "epoch": 2.62, + "grad_norm": 0.7476978898048401, + "learning_rate": 2.354916319796667e-05, + "loss": 2.7838, + "step": 53443 + }, + { + "epoch": 2.62, + "grad_norm": 0.7352883219718933, + "learning_rate": 2.3543184116752766e-05, + "loss": 2.9383, + "step": 53444 + }, + { + "epoch": 2.62, + "grad_norm": 0.766408383846283, + "learning_rate": 2.3537205763672816e-05, + "loss": 3.1493, + "step": 53445 + }, + { + "epoch": 2.62, + "grad_norm": 0.7481442093849182, + "learning_rate": 2.3531228138742675e-05, + "loss": 2.8828, + "step": 53446 + }, + { + "epoch": 2.62, + "grad_norm": 0.7440130710601807, + "learning_rate": 2.352525124197813e-05, + "loss": 2.9028, + "step": 53447 + }, + { + "epoch": 2.62, + "grad_norm": 0.7087607383728027, + "learning_rate": 2.351927507339486e-05, + "loss": 2.7698, + "step": 53448 + }, + { + "epoch": 2.62, + "grad_norm": 0.7708470225334167, + "learning_rate": 2.3513299633008597e-05, + "loss": 2.8684, + "step": 53449 + }, + { + "epoch": 2.62, + "grad_norm": 0.7144055962562561, + "learning_rate": 2.3507324920835057e-05, + "loss": 2.8738, + "step": 53450 + }, + { + "epoch": 2.62, + "grad_norm": 0.7633524537086487, + "learning_rate": 2.3501350936889995e-05, + "loss": 2.8275, + "step": 53451 + }, + { + "epoch": 2.62, + "grad_norm": 0.6785310506820679, + "learning_rate": 2.349537768118923e-05, + "loss": 2.8683, + "step": 53452 + }, + { + "epoch": 2.62, + "grad_norm": 0.7202582359313965, + "learning_rate": 2.348940515374835e-05, + "loss": 2.7919, + "step": 53453 + }, + { + "epoch": 2.62, + "grad_norm": 0.7880463004112244, + "learning_rate": 2.3483433354583214e-05, + "loss": 2.7937, + "step": 53454 + }, + { + "epoch": 2.62, + "grad_norm": 0.7895649075508118, + "learning_rate": 2.3477462283709535e-05, + "loss": 2.97, + "step": 53455 + }, + { + "epoch": 2.62, + "grad_norm": 0.741170346736908, + "learning_rate": 2.3471491941142972e-05, + "loss": 2.8845, + "step": 53456 + }, + { + "epoch": 2.62, + "grad_norm": 0.7909066081047058, + "learning_rate": 2.3465522326899245e-05, + "loss": 2.7116, + "step": 53457 + }, + { + "epoch": 2.62, + "grad_norm": 0.7080055475234985, + "learning_rate": 2.3459553440994107e-05, + "loss": 2.9053, + "step": 53458 + }, + { + "epoch": 2.62, + "grad_norm": 0.7602565884590149, + "learning_rate": 2.345358528344331e-05, + "loss": 2.9799, + "step": 53459 + }, + { + "epoch": 2.62, + "grad_norm": 0.7593586444854736, + "learning_rate": 2.344761785426251e-05, + "loss": 3.0791, + "step": 53460 + }, + { + "epoch": 2.62, + "grad_norm": 0.760124146938324, + "learning_rate": 2.3441651153467532e-05, + "loss": 2.8708, + "step": 53461 + }, + { + "epoch": 2.62, + "grad_norm": 0.725273847579956, + "learning_rate": 2.343568518107396e-05, + "loss": 2.902, + "step": 53462 + }, + { + "epoch": 2.62, + "grad_norm": 0.7621003985404968, + "learning_rate": 2.3429719937097614e-05, + "loss": 2.8048, + "step": 53463 + }, + { + "epoch": 2.62, + "grad_norm": 0.7457227110862732, + "learning_rate": 2.342375542155418e-05, + "loss": 3.0858, + "step": 53464 + }, + { + "epoch": 2.62, + "grad_norm": 0.7379422187805176, + "learning_rate": 2.3417791634459248e-05, + "loss": 2.7287, + "step": 53465 + }, + { + "epoch": 2.62, + "grad_norm": 0.7121053338050842, + "learning_rate": 2.3411828575828707e-05, + "loss": 2.7895, + "step": 53466 + }, + { + "epoch": 2.62, + "grad_norm": 0.7786771655082703, + "learning_rate": 2.340586624567814e-05, + "loss": 3.047, + "step": 53467 + }, + { + "epoch": 2.62, + "grad_norm": 0.7441116571426392, + "learning_rate": 2.339990464402327e-05, + "loss": 2.9813, + "step": 53468 + }, + { + "epoch": 2.62, + "grad_norm": 0.7259562611579895, + "learning_rate": 2.339394377087985e-05, + "loss": 2.8765, + "step": 53469 + }, + { + "epoch": 2.62, + "grad_norm": 0.7485330700874329, + "learning_rate": 2.3387983626263575e-05, + "loss": 2.7566, + "step": 53470 + }, + { + "epoch": 2.62, + "grad_norm": 0.7584130764007568, + "learning_rate": 2.338202421019012e-05, + "loss": 3.0932, + "step": 53471 + }, + { + "epoch": 2.62, + "grad_norm": 0.7374604940414429, + "learning_rate": 2.3376065522675113e-05, + "loss": 3.0193, + "step": 53472 + }, + { + "epoch": 2.62, + "grad_norm": 0.7430775761604309, + "learning_rate": 2.337010756373431e-05, + "loss": 3.0866, + "step": 53473 + }, + { + "epoch": 2.62, + "grad_norm": 0.7617211937904358, + "learning_rate": 2.336415033338346e-05, + "loss": 2.6868, + "step": 53474 + }, + { + "epoch": 2.62, + "grad_norm": 0.7515766620635986, + "learning_rate": 2.335819383163815e-05, + "loss": 2.9058, + "step": 53475 + }, + { + "epoch": 2.62, + "grad_norm": 0.7782991528511047, + "learning_rate": 2.335223805851417e-05, + "loss": 2.9492, + "step": 53476 + }, + { + "epoch": 2.62, + "grad_norm": 0.7339718341827393, + "learning_rate": 2.3346283014027144e-05, + "loss": 3.0604, + "step": 53477 + }, + { + "epoch": 2.62, + "grad_norm": 0.7309208512306213, + "learning_rate": 2.334032869819272e-05, + "loss": 2.9403, + "step": 53478 + }, + { + "epoch": 2.62, + "grad_norm": 0.7868647575378418, + "learning_rate": 2.3334375111026693e-05, + "loss": 2.9261, + "step": 53479 + }, + { + "epoch": 2.62, + "grad_norm": 0.7272360920906067, + "learning_rate": 2.332842225254461e-05, + "loss": 2.7392, + "step": 53480 + }, + { + "epoch": 2.62, + "grad_norm": 0.719589114189148, + "learning_rate": 2.332247012276226e-05, + "loss": 2.952, + "step": 53481 + }, + { + "epoch": 2.62, + "grad_norm": 0.7240522503852844, + "learning_rate": 2.331651872169523e-05, + "loss": 2.7877, + "step": 53482 + }, + { + "epoch": 2.62, + "grad_norm": 0.7426586747169495, + "learning_rate": 2.3310568049359245e-05, + "loss": 2.9101, + "step": 53483 + }, + { + "epoch": 2.62, + "grad_norm": 0.7182551026344299, + "learning_rate": 2.3304618105770024e-05, + "loss": 3.0309, + "step": 53484 + }, + { + "epoch": 2.62, + "grad_norm": 0.7897670269012451, + "learning_rate": 2.3298668890943183e-05, + "loss": 2.8458, + "step": 53485 + }, + { + "epoch": 2.62, + "grad_norm": 0.7790352702140808, + "learning_rate": 2.3292720404894416e-05, + "loss": 2.6728, + "step": 53486 + }, + { + "epoch": 2.62, + "grad_norm": 0.7496531009674072, + "learning_rate": 2.3286772647639307e-05, + "loss": 3.0795, + "step": 53487 + }, + { + "epoch": 2.62, + "grad_norm": 0.7485788464546204, + "learning_rate": 2.3280825619193577e-05, + "loss": 2.8998, + "step": 53488 + }, + { + "epoch": 2.62, + "grad_norm": 0.7316558361053467, + "learning_rate": 2.3274879319572946e-05, + "loss": 2.9014, + "step": 53489 + }, + { + "epoch": 2.62, + "grad_norm": 0.766089141368866, + "learning_rate": 2.3268933748792972e-05, + "loss": 2.9425, + "step": 53490 + }, + { + "epoch": 2.62, + "grad_norm": 0.7590373754501343, + "learning_rate": 2.326298890686944e-05, + "loss": 2.9089, + "step": 53491 + }, + { + "epoch": 2.62, + "grad_norm": 0.7415793538093567, + "learning_rate": 2.3257044793817904e-05, + "loss": 2.9765, + "step": 53492 + }, + { + "epoch": 2.62, + "grad_norm": 0.7299401760101318, + "learning_rate": 2.3251101409654015e-05, + "loss": 2.9363, + "step": 53493 + }, + { + "epoch": 2.62, + "grad_norm": 0.714733898639679, + "learning_rate": 2.3245158754393534e-05, + "loss": 2.7415, + "step": 53494 + }, + { + "epoch": 2.62, + "grad_norm": 0.7463199496269226, + "learning_rate": 2.3239216828051975e-05, + "loss": 2.841, + "step": 53495 + }, + { + "epoch": 2.62, + "grad_norm": 0.7127196788787842, + "learning_rate": 2.3233275630645098e-05, + "loss": 2.7024, + "step": 53496 + }, + { + "epoch": 2.62, + "grad_norm": 0.7690715193748474, + "learning_rate": 2.3227335162188486e-05, + "loss": 2.9423, + "step": 53497 + }, + { + "epoch": 2.62, + "grad_norm": 0.7808687090873718, + "learning_rate": 2.322139542269783e-05, + "loss": 3.0039, + "step": 53498 + }, + { + "epoch": 2.62, + "grad_norm": 0.7283824682235718, + "learning_rate": 2.321545641218878e-05, + "loss": 2.8473, + "step": 53499 + }, + { + "epoch": 2.62, + "grad_norm": 0.762187659740448, + "learning_rate": 2.3209518130676863e-05, + "loss": 2.9331, + "step": 53500 + }, + { + "epoch": 2.62, + "grad_norm": 0.7350523471832275, + "learning_rate": 2.3203580578177894e-05, + "loss": 3.0007, + "step": 53501 + }, + { + "epoch": 2.62, + "grad_norm": 0.7983888983726501, + "learning_rate": 2.319764375470733e-05, + "loss": 2.7056, + "step": 53502 + }, + { + "epoch": 2.62, + "grad_norm": 0.697699248790741, + "learning_rate": 2.319170766028092e-05, + "loss": 2.7097, + "step": 53503 + }, + { + "epoch": 2.62, + "grad_norm": 0.7238744497299194, + "learning_rate": 2.318577229491433e-05, + "loss": 2.9083, + "step": 53504 + }, + { + "epoch": 2.62, + "grad_norm": 0.7800061106681824, + "learning_rate": 2.31798376586231e-05, + "loss": 2.7768, + "step": 53505 + }, + { + "epoch": 2.62, + "grad_norm": 0.7480177283287048, + "learning_rate": 2.317390375142296e-05, + "loss": 2.7823, + "step": 53506 + }, + { + "epoch": 2.62, + "grad_norm": 0.7896789312362671, + "learning_rate": 2.3167970573329463e-05, + "loss": 2.7216, + "step": 53507 + }, + { + "epoch": 2.62, + "grad_norm": 0.7196285724639893, + "learning_rate": 2.3162038124358196e-05, + "loss": 2.8438, + "step": 53508 + }, + { + "epoch": 2.62, + "grad_norm": 0.8125433325767517, + "learning_rate": 2.315610640452491e-05, + "loss": 2.9548, + "step": 53509 + }, + { + "epoch": 2.62, + "grad_norm": 0.8360536694526672, + "learning_rate": 2.3150175413845094e-05, + "loss": 2.8399, + "step": 53510 + }, + { + "epoch": 2.62, + "grad_norm": 0.7761578559875488, + "learning_rate": 2.3144245152334506e-05, + "loss": 2.8066, + "step": 53511 + }, + { + "epoch": 2.62, + "grad_norm": 0.7385191917419434, + "learning_rate": 2.3138315620008628e-05, + "loss": 2.8548, + "step": 53512 + }, + { + "epoch": 2.62, + "grad_norm": 0.7106783390045166, + "learning_rate": 2.313238681688322e-05, + "loss": 2.7003, + "step": 53513 + }, + { + "epoch": 2.62, + "grad_norm": 0.7793312668800354, + "learning_rate": 2.3126458742973796e-05, + "loss": 2.7662, + "step": 53514 + }, + { + "epoch": 2.62, + "grad_norm": 0.7528009414672852, + "learning_rate": 2.312053139829595e-05, + "loss": 3.0059, + "step": 53515 + }, + { + "epoch": 2.62, + "grad_norm": 0.7911593317985535, + "learning_rate": 2.31146047828654e-05, + "loss": 2.8624, + "step": 53516 + }, + { + "epoch": 2.62, + "grad_norm": 0.7517163157463074, + "learning_rate": 2.310867889669763e-05, + "loss": 2.8081, + "step": 53517 + }, + { + "epoch": 2.62, + "grad_norm": 0.7046359777450562, + "learning_rate": 2.31027537398083e-05, + "loss": 2.8519, + "step": 53518 + }, + { + "epoch": 2.62, + "grad_norm": 0.7641608715057373, + "learning_rate": 2.3096829312213093e-05, + "loss": 2.8228, + "step": 53519 + }, + { + "epoch": 2.62, + "grad_norm": 0.7242162227630615, + "learning_rate": 2.3090905613927535e-05, + "loss": 2.9697, + "step": 53520 + }, + { + "epoch": 2.62, + "grad_norm": 0.7436241507530212, + "learning_rate": 2.308498264496724e-05, + "loss": 2.9085, + "step": 53521 + }, + { + "epoch": 2.62, + "grad_norm": 0.7740703225135803, + "learning_rate": 2.307906040534777e-05, + "loss": 2.9504, + "step": 53522 + }, + { + "epoch": 2.62, + "grad_norm": 0.7288923859596252, + "learning_rate": 2.3073138895084742e-05, + "loss": 2.8711, + "step": 53523 + }, + { + "epoch": 2.62, + "grad_norm": 0.7444291114807129, + "learning_rate": 2.3067218114193842e-05, + "loss": 3.0138, + "step": 53524 + }, + { + "epoch": 2.62, + "grad_norm": 0.7635215520858765, + "learning_rate": 2.3061298062690525e-05, + "loss": 2.8761, + "step": 53525 + }, + { + "epoch": 2.62, + "grad_norm": 0.790448784828186, + "learning_rate": 2.3055378740590514e-05, + "loss": 2.8546, + "step": 53526 + }, + { + "epoch": 2.62, + "grad_norm": 0.7522045969963074, + "learning_rate": 2.3049460147909294e-05, + "loss": 2.9095, + "step": 53527 + }, + { + "epoch": 2.62, + "grad_norm": 0.7432355880737305, + "learning_rate": 2.304354228466252e-05, + "loss": 3.1744, + "step": 53528 + }, + { + "epoch": 2.62, + "grad_norm": 0.759417712688446, + "learning_rate": 2.303762515086578e-05, + "loss": 2.956, + "step": 53529 + }, + { + "epoch": 2.62, + "grad_norm": 0.7882241606712341, + "learning_rate": 2.3031708746534562e-05, + "loss": 2.8042, + "step": 53530 + }, + { + "epoch": 2.62, + "grad_norm": 0.7573877573013306, + "learning_rate": 2.3025793071684585e-05, + "loss": 2.7942, + "step": 53531 + }, + { + "epoch": 2.62, + "grad_norm": 0.8343915343284607, + "learning_rate": 2.3019878126331304e-05, + "loss": 2.9512, + "step": 53532 + }, + { + "epoch": 2.62, + "grad_norm": 0.7674956917762756, + "learning_rate": 2.301396391049034e-05, + "loss": 2.8093, + "step": 53533 + }, + { + "epoch": 2.62, + "grad_norm": 0.714229941368103, + "learning_rate": 2.3008050424177384e-05, + "loss": 2.9264, + "step": 53534 + }, + { + "epoch": 2.62, + "grad_norm": 0.780677318572998, + "learning_rate": 2.3002137667407884e-05, + "loss": 3.0235, + "step": 53535 + }, + { + "epoch": 2.62, + "grad_norm": 0.7305729985237122, + "learning_rate": 2.2996225640197463e-05, + "loss": 2.7816, + "step": 53536 + }, + { + "epoch": 2.62, + "grad_norm": 0.773148775100708, + "learning_rate": 2.2990314342561612e-05, + "loss": 3.1407, + "step": 53537 + }, + { + "epoch": 2.62, + "grad_norm": 0.7465836405754089, + "learning_rate": 2.298440377451598e-05, + "loss": 2.9568, + "step": 53538 + }, + { + "epoch": 2.62, + "grad_norm": 0.8020625710487366, + "learning_rate": 2.2978493936076125e-05, + "loss": 2.9573, + "step": 53539 + }, + { + "epoch": 2.62, + "grad_norm": 0.7247298955917358, + "learning_rate": 2.2972584827257568e-05, + "loss": 2.9254, + "step": 53540 + }, + { + "epoch": 2.62, + "grad_norm": 0.7181655168533325, + "learning_rate": 2.296667644807596e-05, + "loss": 2.895, + "step": 53541 + }, + { + "epoch": 2.62, + "grad_norm": 0.7766932249069214, + "learning_rate": 2.2960768798546824e-05, + "loss": 2.7412, + "step": 53542 + }, + { + "epoch": 2.62, + "grad_norm": 0.7357663512229919, + "learning_rate": 2.2954861878685684e-05, + "loss": 2.7664, + "step": 53543 + }, + { + "epoch": 2.62, + "grad_norm": 0.7322977185249329, + "learning_rate": 2.2948955688508085e-05, + "loss": 2.931, + "step": 53544 + }, + { + "epoch": 2.62, + "grad_norm": 0.7766660451889038, + "learning_rate": 2.294305022802959e-05, + "loss": 2.9752, + "step": 53545 + }, + { + "epoch": 2.62, + "grad_norm": 0.7280550003051758, + "learning_rate": 2.293714549726585e-05, + "loss": 2.6769, + "step": 53546 + }, + { + "epoch": 2.62, + "grad_norm": 0.726729154586792, + "learning_rate": 2.293124149623232e-05, + "loss": 3.118, + "step": 53547 + }, + { + "epoch": 2.62, + "grad_norm": 0.750844419002533, + "learning_rate": 2.2925338224944588e-05, + "loss": 2.9707, + "step": 53548 + }, + { + "epoch": 2.62, + "grad_norm": 0.7047064304351807, + "learning_rate": 2.2919435683418175e-05, + "loss": 2.7327, + "step": 53549 + }, + { + "epoch": 2.62, + "grad_norm": 0.7407240867614746, + "learning_rate": 2.2913533871668667e-05, + "loss": 2.7221, + "step": 53550 + }, + { + "epoch": 2.62, + "grad_norm": 0.7109093070030212, + "learning_rate": 2.2907632789711617e-05, + "loss": 3.0363, + "step": 53551 + }, + { + "epoch": 2.62, + "grad_norm": 0.7659242749214172, + "learning_rate": 2.2901732437562447e-05, + "loss": 2.7527, + "step": 53552 + }, + { + "epoch": 2.62, + "grad_norm": 0.7269027829170227, + "learning_rate": 2.2895832815236848e-05, + "loss": 2.7867, + "step": 53553 + }, + { + "epoch": 2.62, + "grad_norm": 0.739622950553894, + "learning_rate": 2.288993392275027e-05, + "loss": 2.8092, + "step": 53554 + }, + { + "epoch": 2.62, + "grad_norm": 0.7424209713935852, + "learning_rate": 2.2884035760118235e-05, + "loss": 2.8659, + "step": 53555 + }, + { + "epoch": 2.62, + "grad_norm": 0.7513794302940369, + "learning_rate": 2.28781383273564e-05, + "loss": 2.9606, + "step": 53556 + }, + { + "epoch": 2.62, + "grad_norm": 0.7400728464126587, + "learning_rate": 2.287224162448018e-05, + "loss": 2.8118, + "step": 53557 + }, + { + "epoch": 2.62, + "grad_norm": 0.7726714015007019, + "learning_rate": 2.2866345651505168e-05, + "loss": 3.0265, + "step": 53558 + }, + { + "epoch": 2.62, + "grad_norm": 0.7855329513549805, + "learning_rate": 2.2860450408446818e-05, + "loss": 2.9305, + "step": 53559 + }, + { + "epoch": 2.62, + "grad_norm": 0.7705275416374207, + "learning_rate": 2.2854555895320713e-05, + "loss": 2.7834, + "step": 53560 + }, + { + "epoch": 2.62, + "grad_norm": 0.7320125699043274, + "learning_rate": 2.2848662112142412e-05, + "loss": 2.9377, + "step": 53561 + }, + { + "epoch": 2.62, + "grad_norm": 0.7487272024154663, + "learning_rate": 2.2842769058927336e-05, + "loss": 3.0346, + "step": 53562 + }, + { + "epoch": 2.62, + "grad_norm": 0.7231678366661072, + "learning_rate": 2.2836876735691102e-05, + "loss": 2.7278, + "step": 53563 + }, + { + "epoch": 2.63, + "grad_norm": 0.8296177387237549, + "learning_rate": 2.2830985142449164e-05, + "loss": 2.9324, + "step": 53564 + }, + { + "epoch": 2.63, + "grad_norm": 0.7314631342887878, + "learning_rate": 2.2825094279217116e-05, + "loss": 2.9992, + "step": 53565 + }, + { + "epoch": 2.63, + "grad_norm": 0.7270010113716125, + "learning_rate": 2.281920414601044e-05, + "loss": 2.9121, + "step": 53566 + }, + { + "epoch": 2.63, + "grad_norm": 0.7274070382118225, + "learning_rate": 2.2813314742844556e-05, + "loss": 3.0206, + "step": 53567 + }, + { + "epoch": 2.63, + "grad_norm": 0.7393171787261963, + "learning_rate": 2.2807426069735124e-05, + "loss": 2.9025, + "step": 53568 + }, + { + "epoch": 2.63, + "grad_norm": 0.7126226425170898, + "learning_rate": 2.2801538126697527e-05, + "loss": 2.7961, + "step": 53569 + }, + { + "epoch": 2.63, + "grad_norm": 0.7575780153274536, + "learning_rate": 2.2795650913747355e-05, + "loss": 3.0125, + "step": 53570 + }, + { + "epoch": 2.63, + "grad_norm": 0.7629463076591492, + "learning_rate": 2.2789764430900126e-05, + "loss": 2.834, + "step": 53571 + }, + { + "epoch": 2.63, + "grad_norm": 0.7294546961784363, + "learning_rate": 2.2783878678171295e-05, + "loss": 2.7511, + "step": 53572 + }, + { + "epoch": 2.63, + "grad_norm": 0.7730554938316345, + "learning_rate": 2.2777993655576388e-05, + "loss": 2.8679, + "step": 53573 + }, + { + "epoch": 2.63, + "grad_norm": 0.7737736105918884, + "learning_rate": 2.277210936313082e-05, + "loss": 2.808, + "step": 53574 + }, + { + "epoch": 2.63, + "grad_norm": 0.7201272249221802, + "learning_rate": 2.276622580085018e-05, + "loss": 3.0931, + "step": 53575 + }, + { + "epoch": 2.63, + "grad_norm": 0.7674169540405273, + "learning_rate": 2.2760342968750024e-05, + "loss": 2.6019, + "step": 53576 + }, + { + "epoch": 2.63, + "grad_norm": 0.6896553039550781, + "learning_rate": 2.275446086684567e-05, + "loss": 2.7181, + "step": 53577 + }, + { + "epoch": 2.63, + "grad_norm": 0.7537925839424133, + "learning_rate": 2.2748579495152808e-05, + "loss": 3.0731, + "step": 53578 + }, + { + "epoch": 2.63, + "grad_norm": 0.7853711247444153, + "learning_rate": 2.274269885368679e-05, + "loss": 3.0916, + "step": 53579 + }, + { + "epoch": 2.63, + "grad_norm": 0.8356667160987854, + "learning_rate": 2.273681894246311e-05, + "loss": 2.9315, + "step": 53580 + }, + { + "epoch": 2.63, + "grad_norm": 0.7744783759117126, + "learning_rate": 2.2730939761497347e-05, + "loss": 2.62, + "step": 53581 + }, + { + "epoch": 2.63, + "grad_norm": 0.7801983952522278, + "learning_rate": 2.272506131080486e-05, + "loss": 2.8224, + "step": 53582 + }, + { + "epoch": 2.63, + "grad_norm": 0.737760066986084, + "learning_rate": 2.2719183590401237e-05, + "loss": 3.0579, + "step": 53583 + }, + { + "epoch": 2.63, + "grad_norm": 0.7177762389183044, + "learning_rate": 2.2713306600301894e-05, + "loss": 2.8598, + "step": 53584 + }, + { + "epoch": 2.63, + "grad_norm": 0.7965755462646484, + "learning_rate": 2.2707430340522326e-05, + "loss": 2.925, + "step": 53585 + }, + { + "epoch": 2.63, + "grad_norm": 0.7102624177932739, + "learning_rate": 2.2701554811078083e-05, + "loss": 2.8057, + "step": 53586 + }, + { + "epoch": 2.63, + "grad_norm": 0.8156418800354004, + "learning_rate": 2.2695680011984583e-05, + "loss": 2.8195, + "step": 53587 + }, + { + "epoch": 2.63, + "grad_norm": 0.7690003514289856, + "learning_rate": 2.2689805943257257e-05, + "loss": 2.8399, + "step": 53588 + }, + { + "epoch": 2.63, + "grad_norm": 0.7723907828330994, + "learning_rate": 2.268393260491158e-05, + "loss": 3.0257, + "step": 53589 + }, + { + "epoch": 2.63, + "grad_norm": 0.7428565621376038, + "learning_rate": 2.267805999696308e-05, + "loss": 2.9596, + "step": 53590 + }, + { + "epoch": 2.63, + "grad_norm": 0.7959504127502441, + "learning_rate": 2.2672188119427214e-05, + "loss": 2.9929, + "step": 53591 + }, + { + "epoch": 2.63, + "grad_norm": 0.7445618510246277, + "learning_rate": 2.2666316972319397e-05, + "loss": 2.9063, + "step": 53592 + }, + { + "epoch": 2.63, + "grad_norm": 0.7014729976654053, + "learning_rate": 2.266044655565519e-05, + "loss": 2.7315, + "step": 53593 + }, + { + "epoch": 2.63, + "grad_norm": 0.7331639528274536, + "learning_rate": 2.2654576869449968e-05, + "loss": 3.0117, + "step": 53594 + }, + { + "epoch": 2.63, + "grad_norm": 0.7533635497093201, + "learning_rate": 2.2648707913719165e-05, + "loss": 2.9198, + "step": 53595 + }, + { + "epoch": 2.63, + "grad_norm": 0.7588867545127869, + "learning_rate": 2.2642839688478364e-05, + "loss": 2.9348, + "step": 53596 + }, + { + "epoch": 2.63, + "grad_norm": 0.669175922870636, + "learning_rate": 2.2636972193742887e-05, + "loss": 2.9407, + "step": 53597 + }, + { + "epoch": 2.63, + "grad_norm": 0.7316837310791016, + "learning_rate": 2.2631105429528284e-05, + "loss": 2.952, + "step": 53598 + }, + { + "epoch": 2.63, + "grad_norm": 0.767562210559845, + "learning_rate": 2.2625239395849916e-05, + "loss": 2.943, + "step": 53599 + }, + { + "epoch": 2.63, + "grad_norm": 0.7083771228790283, + "learning_rate": 2.261937409272333e-05, + "loss": 2.9877, + "step": 53600 + }, + { + "epoch": 2.63, + "grad_norm": 0.7206432223320007, + "learning_rate": 2.2613509520163954e-05, + "loss": 2.9687, + "step": 53601 + }, + { + "epoch": 2.63, + "grad_norm": 0.7313346862792969, + "learning_rate": 2.2607645678187138e-05, + "loss": 2.8454, + "step": 53602 + }, + { + "epoch": 2.63, + "grad_norm": 0.713829517364502, + "learning_rate": 2.260178256680847e-05, + "loss": 3.1016, + "step": 53603 + }, + { + "epoch": 2.63, + "grad_norm": 0.695851743221283, + "learning_rate": 2.2595920186043238e-05, + "loss": 2.5597, + "step": 53604 + }, + { + "epoch": 2.63, + "grad_norm": 0.7535444498062134, + "learning_rate": 2.2590058535906996e-05, + "loss": 2.924, + "step": 53605 + }, + { + "epoch": 2.63, + "grad_norm": 0.7128457427024841, + "learning_rate": 2.25841976164152e-05, + "loss": 2.8773, + "step": 53606 + }, + { + "epoch": 2.63, + "grad_norm": 0.7772616147994995, + "learning_rate": 2.2578337427583172e-05, + "loss": 3.1488, + "step": 53607 + }, + { + "epoch": 2.63, + "grad_norm": 0.727367103099823, + "learning_rate": 2.2572477969426462e-05, + "loss": 2.798, + "step": 53608 + }, + { + "epoch": 2.63, + "grad_norm": 0.7210254073143005, + "learning_rate": 2.256661924196046e-05, + "loss": 3.0219, + "step": 53609 + }, + { + "epoch": 2.63, + "grad_norm": 0.7165561318397522, + "learning_rate": 2.256076124520052e-05, + "loss": 2.7132, + "step": 53610 + }, + { + "epoch": 2.63, + "grad_norm": 0.7372732162475586, + "learning_rate": 2.255490397916223e-05, + "loss": 2.8915, + "step": 53611 + }, + { + "epoch": 2.63, + "grad_norm": 0.7609841227531433, + "learning_rate": 2.2549047443860845e-05, + "loss": 3.088, + "step": 53612 + }, + { + "epoch": 2.63, + "grad_norm": 0.744179904460907, + "learning_rate": 2.2543191639311954e-05, + "loss": 3.0776, + "step": 53613 + }, + { + "epoch": 2.63, + "grad_norm": 0.7504658102989197, + "learning_rate": 2.2537336565530807e-05, + "loss": 2.9734, + "step": 53614 + }, + { + "epoch": 2.63, + "grad_norm": 0.6780826449394226, + "learning_rate": 2.2531482222532994e-05, + "loss": 2.9569, + "step": 53615 + }, + { + "epoch": 2.63, + "grad_norm": 0.7250926494598389, + "learning_rate": 2.2525628610333835e-05, + "loss": 2.8901, + "step": 53616 + }, + { + "epoch": 2.63, + "grad_norm": 0.7274686694145203, + "learning_rate": 2.2519775728948754e-05, + "loss": 2.8393, + "step": 53617 + }, + { + "epoch": 2.63, + "grad_norm": 0.7236843109130859, + "learning_rate": 2.25139235783932e-05, + "loss": 2.6181, + "step": 53618 + }, + { + "epoch": 2.63, + "grad_norm": 0.7012773752212524, + "learning_rate": 2.2508072158682535e-05, + "loss": 2.9166, + "step": 53619 + }, + { + "epoch": 2.63, + "grad_norm": 0.7341805696487427, + "learning_rate": 2.2502221469832273e-05, + "loss": 2.7775, + "step": 53620 + }, + { + "epoch": 2.63, + "grad_norm": 0.7430617809295654, + "learning_rate": 2.249637151185767e-05, + "loss": 2.943, + "step": 53621 + }, + { + "epoch": 2.63, + "grad_norm": 0.7874128222465515, + "learning_rate": 2.249052228477428e-05, + "loss": 2.9523, + "step": 53622 + }, + { + "epoch": 2.63, + "grad_norm": 0.8702890276908875, + "learning_rate": 2.248467378859743e-05, + "loss": 2.6836, + "step": 53623 + }, + { + "epoch": 2.63, + "grad_norm": 0.7678395509719849, + "learning_rate": 2.24788260233425e-05, + "loss": 2.9465, + "step": 53624 + }, + { + "epoch": 2.63, + "grad_norm": 0.7276766300201416, + "learning_rate": 2.247297898902498e-05, + "loss": 2.8315, + "step": 53625 + }, + { + "epoch": 2.63, + "grad_norm": 0.7727230191230774, + "learning_rate": 2.2467132685660196e-05, + "loss": 3.0265, + "step": 53626 + }, + { + "epoch": 2.63, + "grad_norm": 0.7668293714523315, + "learning_rate": 2.246128711326356e-05, + "loss": 2.9094, + "step": 53627 + }, + { + "epoch": 2.63, + "grad_norm": 0.7255916595458984, + "learning_rate": 2.2455442271850532e-05, + "loss": 2.8169, + "step": 53628 + }, + { + "epoch": 2.63, + "grad_norm": 0.7411938905715942, + "learning_rate": 2.24495981614364e-05, + "loss": 2.9508, + "step": 53629 + }, + { + "epoch": 2.63, + "grad_norm": 0.7330008745193481, + "learning_rate": 2.2443754782036648e-05, + "loss": 2.8352, + "step": 53630 + }, + { + "epoch": 2.63, + "grad_norm": 0.7218151092529297, + "learning_rate": 2.2437912133666634e-05, + "loss": 2.706, + "step": 53631 + }, + { + "epoch": 2.63, + "grad_norm": 0.7262304425239563, + "learning_rate": 2.2432070216341715e-05, + "loss": 2.8294, + "step": 53632 + }, + { + "epoch": 2.63, + "grad_norm": 0.7365819215774536, + "learning_rate": 2.2426229030077335e-05, + "loss": 2.846, + "step": 53633 + }, + { + "epoch": 2.63, + "grad_norm": 0.7333903312683105, + "learning_rate": 2.2420388574888793e-05, + "loss": 3.0362, + "step": 53634 + }, + { + "epoch": 2.63, + "grad_norm": 0.7654792666435242, + "learning_rate": 2.241454885079157e-05, + "loss": 2.7503, + "step": 53635 + }, + { + "epoch": 2.63, + "grad_norm": 0.7742012739181519, + "learning_rate": 2.2408709857800988e-05, + "loss": 2.7352, + "step": 53636 + }, + { + "epoch": 2.63, + "grad_norm": 0.7555630803108215, + "learning_rate": 2.240287159593247e-05, + "loss": 2.9539, + "step": 53637 + }, + { + "epoch": 2.63, + "grad_norm": 0.7577449083328247, + "learning_rate": 2.2397034065201336e-05, + "loss": 2.9425, + "step": 53638 + }, + { + "epoch": 2.63, + "grad_norm": 0.7749025821685791, + "learning_rate": 2.2391197265622975e-05, + "loss": 3.015, + "step": 53639 + }, + { + "epoch": 2.63, + "grad_norm": 0.7856140732765198, + "learning_rate": 2.2385361197212805e-05, + "loss": 2.7987, + "step": 53640 + }, + { + "epoch": 2.63, + "grad_norm": 0.7463555932044983, + "learning_rate": 2.2379525859986148e-05, + "loss": 2.8118, + "step": 53641 + }, + { + "epoch": 2.63, + "grad_norm": 0.7431473135948181, + "learning_rate": 2.2373691253958358e-05, + "loss": 2.8837, + "step": 53642 + }, + { + "epoch": 2.63, + "grad_norm": 0.7310684323310852, + "learning_rate": 2.2367857379144892e-05, + "loss": 2.8888, + "step": 53643 + }, + { + "epoch": 2.63, + "grad_norm": 0.7498796582221985, + "learning_rate": 2.236202423556107e-05, + "loss": 2.9496, + "step": 53644 + }, + { + "epoch": 2.63, + "grad_norm": 0.740679919719696, + "learning_rate": 2.2356191823222214e-05, + "loss": 3.0156, + "step": 53645 + }, + { + "epoch": 2.63, + "grad_norm": 0.7141246199607849, + "learning_rate": 2.235036014214371e-05, + "loss": 2.8782, + "step": 53646 + }, + { + "epoch": 2.63, + "grad_norm": 0.7410897612571716, + "learning_rate": 2.2344529192340876e-05, + "loss": 2.6084, + "step": 53647 + }, + { + "epoch": 2.63, + "grad_norm": 0.7512717247009277, + "learning_rate": 2.2338698973829207e-05, + "loss": 2.8707, + "step": 53648 + }, + { + "epoch": 2.63, + "grad_norm": 0.7618011832237244, + "learning_rate": 2.2332869486623884e-05, + "loss": 2.8932, + "step": 53649 + }, + { + "epoch": 2.63, + "grad_norm": 0.8862194418907166, + "learning_rate": 2.2327040730740397e-05, + "loss": 2.8414, + "step": 53650 + }, + { + "epoch": 2.63, + "grad_norm": 0.7280387878417969, + "learning_rate": 2.2321212706194037e-05, + "loss": 2.7621, + "step": 53651 + }, + { + "epoch": 2.63, + "grad_norm": 0.714809000492096, + "learning_rate": 2.2315385413000152e-05, + "loss": 2.623, + "step": 53652 + }, + { + "epoch": 2.63, + "grad_norm": 0.7935085892677307, + "learning_rate": 2.2309558851174138e-05, + "loss": 2.7519, + "step": 53653 + }, + { + "epoch": 2.63, + "grad_norm": 0.736251711845398, + "learning_rate": 2.230373302073124e-05, + "loss": 2.8681, + "step": 53654 + }, + { + "epoch": 2.63, + "grad_norm": 0.7445536851882935, + "learning_rate": 2.2297907921686953e-05, + "loss": 3.1613, + "step": 53655 + }, + { + "epoch": 2.63, + "grad_norm": 0.7414441108703613, + "learning_rate": 2.2292083554056427e-05, + "loss": 2.8987, + "step": 53656 + }, + { + "epoch": 2.63, + "grad_norm": 0.7777810096740723, + "learning_rate": 2.2286259917855152e-05, + "loss": 2.6787, + "step": 53657 + }, + { + "epoch": 2.63, + "grad_norm": 0.7500730156898499, + "learning_rate": 2.2280437013098452e-05, + "loss": 2.768, + "step": 53658 + }, + { + "epoch": 2.63, + "grad_norm": 0.7526910901069641, + "learning_rate": 2.2274614839801607e-05, + "loss": 2.9552, + "step": 53659 + }, + { + "epoch": 2.63, + "grad_norm": 0.7558931112289429, + "learning_rate": 2.2268793397980013e-05, + "loss": 3.0392, + "step": 53660 + }, + { + "epoch": 2.63, + "grad_norm": 0.7288579344749451, + "learning_rate": 2.2262972687648916e-05, + "loss": 2.9854, + "step": 53661 + }, + { + "epoch": 2.63, + "grad_norm": 0.7054191827774048, + "learning_rate": 2.225715270882368e-05, + "loss": 2.9694, + "step": 53662 + }, + { + "epoch": 2.63, + "grad_norm": 0.7429760098457336, + "learning_rate": 2.2251333461519717e-05, + "loss": 2.8739, + "step": 53663 + }, + { + "epoch": 2.63, + "grad_norm": 0.8079425692558289, + "learning_rate": 2.224551494575222e-05, + "loss": 2.8049, + "step": 53664 + }, + { + "epoch": 2.63, + "grad_norm": 0.7400147318840027, + "learning_rate": 2.2239697161536642e-05, + "loss": 2.9006, + "step": 53665 + }, + { + "epoch": 2.63, + "grad_norm": 0.7143925428390503, + "learning_rate": 2.223388010888817e-05, + "loss": 2.6057, + "step": 53666 + }, + { + "epoch": 2.63, + "grad_norm": 0.7217711806297302, + "learning_rate": 2.2228063787822292e-05, + "loss": 2.896, + "step": 53667 + }, + { + "epoch": 2.63, + "grad_norm": 0.8171392679214478, + "learning_rate": 2.2222248198354198e-05, + "loss": 3.0967, + "step": 53668 + }, + { + "epoch": 2.63, + "grad_norm": 0.7449054718017578, + "learning_rate": 2.221643334049924e-05, + "loss": 2.9392, + "step": 53669 + }, + { + "epoch": 2.63, + "grad_norm": 0.7087463140487671, + "learning_rate": 2.221061921427274e-05, + "loss": 2.9487, + "step": 53670 + }, + { + "epoch": 2.63, + "grad_norm": 0.7898861169815063, + "learning_rate": 2.2204805819689987e-05, + "loss": 2.8475, + "step": 53671 + }, + { + "epoch": 2.63, + "grad_norm": 0.762144923210144, + "learning_rate": 2.21989931567663e-05, + "loss": 2.948, + "step": 53672 + }, + { + "epoch": 2.63, + "grad_norm": 0.7354418635368347, + "learning_rate": 2.2193181225517065e-05, + "loss": 2.6357, + "step": 53673 + }, + { + "epoch": 2.63, + "grad_norm": 0.7770733833312988, + "learning_rate": 2.2187370025957506e-05, + "loss": 2.9122, + "step": 53674 + }, + { + "epoch": 2.63, + "grad_norm": 0.7444289922714233, + "learning_rate": 2.2181559558102947e-05, + "loss": 2.8764, + "step": 53675 + }, + { + "epoch": 2.63, + "grad_norm": 0.7131602764129639, + "learning_rate": 2.217574982196867e-05, + "loss": 2.8945, + "step": 53676 + }, + { + "epoch": 2.63, + "grad_norm": 0.8043651580810547, + "learning_rate": 2.2169940817569965e-05, + "loss": 2.7772, + "step": 53677 + }, + { + "epoch": 2.63, + "grad_norm": 0.8861671686172485, + "learning_rate": 2.216413254492222e-05, + "loss": 2.8719, + "step": 53678 + }, + { + "epoch": 2.63, + "grad_norm": 0.782197892665863, + "learning_rate": 2.2158325004040655e-05, + "loss": 2.927, + "step": 53679 + }, + { + "epoch": 2.63, + "grad_norm": 0.7546136379241943, + "learning_rate": 2.2152518194940626e-05, + "loss": 2.9038, + "step": 53680 + }, + { + "epoch": 2.63, + "grad_norm": 0.7524154186248779, + "learning_rate": 2.2146712117637388e-05, + "loss": 2.8427, + "step": 53681 + }, + { + "epoch": 2.63, + "grad_norm": 0.7260668277740479, + "learning_rate": 2.2140906772146195e-05, + "loss": 2.9093, + "step": 53682 + }, + { + "epoch": 2.63, + "grad_norm": 0.7640723586082458, + "learning_rate": 2.2135102158482432e-05, + "loss": 2.7328, + "step": 53683 + }, + { + "epoch": 2.63, + "grad_norm": 0.7455698251724243, + "learning_rate": 2.2129298276661255e-05, + "loss": 2.8891, + "step": 53684 + }, + { + "epoch": 2.63, + "grad_norm": 0.7532725930213928, + "learning_rate": 2.212349512669812e-05, + "loss": 2.9091, + "step": 53685 + }, + { + "epoch": 2.63, + "grad_norm": 0.7638143301010132, + "learning_rate": 2.2117692708608148e-05, + "loss": 3.1224, + "step": 53686 + }, + { + "epoch": 2.63, + "grad_norm": 0.7833783626556396, + "learning_rate": 2.211189102240669e-05, + "loss": 2.9242, + "step": 53687 + }, + { + "epoch": 2.63, + "grad_norm": 0.7878255844116211, + "learning_rate": 2.2106090068109107e-05, + "loss": 2.9914, + "step": 53688 + }, + { + "epoch": 2.63, + "grad_norm": 0.7649726867675781, + "learning_rate": 2.210028984573058e-05, + "loss": 2.6449, + "step": 53689 + }, + { + "epoch": 2.63, + "grad_norm": 0.7419236898422241, + "learning_rate": 2.2094490355286397e-05, + "loss": 3.0778, + "step": 53690 + }, + { + "epoch": 2.63, + "grad_norm": 0.7762550115585327, + "learning_rate": 2.2088691596791784e-05, + "loss": 2.869, + "step": 53691 + }, + { + "epoch": 2.63, + "grad_norm": 0.7269838452339172, + "learning_rate": 2.2082893570262163e-05, + "loss": 2.8345, + "step": 53692 + }, + { + "epoch": 2.63, + "grad_norm": 0.7062630653381348, + "learning_rate": 2.2077096275712648e-05, + "loss": 2.7138, + "step": 53693 + }, + { + "epoch": 2.63, + "grad_norm": 0.7006743550300598, + "learning_rate": 2.2071299713158564e-05, + "loss": 2.9197, + "step": 53694 + }, + { + "epoch": 2.63, + "grad_norm": 0.738308846950531, + "learning_rate": 2.206550388261523e-05, + "loss": 2.6824, + "step": 53695 + }, + { + "epoch": 2.63, + "grad_norm": 0.8569402694702148, + "learning_rate": 2.2059708784097874e-05, + "loss": 3.0809, + "step": 53696 + }, + { + "epoch": 2.63, + "grad_norm": 0.7651105523109436, + "learning_rate": 2.2053914417621776e-05, + "loss": 2.6283, + "step": 53697 + }, + { + "epoch": 2.63, + "grad_norm": 0.7134808897972107, + "learning_rate": 2.2048120783202128e-05, + "loss": 2.8568, + "step": 53698 + }, + { + "epoch": 2.63, + "grad_norm": 0.7216563820838928, + "learning_rate": 2.2042327880854215e-05, + "loss": 2.7479, + "step": 53699 + }, + { + "epoch": 2.63, + "grad_norm": 0.7495296597480774, + "learning_rate": 2.2036535710593362e-05, + "loss": 3.0511, + "step": 53700 + }, + { + "epoch": 2.63, + "grad_norm": 0.7457597255706787, + "learning_rate": 2.2030744272434754e-05, + "loss": 2.8722, + "step": 53701 + }, + { + "epoch": 2.63, + "grad_norm": 0.76603764295578, + "learning_rate": 2.2024953566393678e-05, + "loss": 3.0893, + "step": 53702 + }, + { + "epoch": 2.63, + "grad_norm": 0.7615640163421631, + "learning_rate": 2.201916359248542e-05, + "loss": 3.0252, + "step": 53703 + }, + { + "epoch": 2.63, + "grad_norm": 0.765140950679779, + "learning_rate": 2.201337435072511e-05, + "loss": 2.9289, + "step": 53704 + }, + { + "epoch": 2.63, + "grad_norm": 0.7220942974090576, + "learning_rate": 2.200758584112813e-05, + "loss": 2.8011, + "step": 53705 + }, + { + "epoch": 2.63, + "grad_norm": 0.7161285281181335, + "learning_rate": 2.2001798063709632e-05, + "loss": 2.7574, + "step": 53706 + }, + { + "epoch": 2.63, + "grad_norm": 0.7688199281692505, + "learning_rate": 2.1996011018484936e-05, + "loss": 3.0325, + "step": 53707 + }, + { + "epoch": 2.63, + "grad_norm": 0.7416340708732605, + "learning_rate": 2.199022470546917e-05, + "loss": 2.8929, + "step": 53708 + }, + { + "epoch": 2.63, + "grad_norm": 0.7346240282058716, + "learning_rate": 2.198443912467769e-05, + "loss": 2.9675, + "step": 53709 + }, + { + "epoch": 2.63, + "grad_norm": 0.7140522599220276, + "learning_rate": 2.1978654276125708e-05, + "loss": 2.9636, + "step": 53710 + }, + { + "epoch": 2.63, + "grad_norm": 0.761841356754303, + "learning_rate": 2.1972870159828482e-05, + "loss": 2.8097, + "step": 53711 + }, + { + "epoch": 2.63, + "grad_norm": 0.7178420424461365, + "learning_rate": 2.196708677580117e-05, + "loss": 2.9565, + "step": 53712 + }, + { + "epoch": 2.63, + "grad_norm": 0.7384170889854431, + "learning_rate": 2.196130412405902e-05, + "loss": 2.9932, + "step": 53713 + }, + { + "epoch": 2.63, + "grad_norm": 0.742639422416687, + "learning_rate": 2.1955522204617258e-05, + "loss": 2.967, + "step": 53714 + }, + { + "epoch": 2.63, + "grad_norm": 0.7487492561340332, + "learning_rate": 2.1949741017491206e-05, + "loss": 2.8679, + "step": 53715 + }, + { + "epoch": 2.63, + "grad_norm": 0.7340525984764099, + "learning_rate": 2.1943960562695983e-05, + "loss": 2.7455, + "step": 53716 + }, + { + "epoch": 2.63, + "grad_norm": 0.7590996623039246, + "learning_rate": 2.193818084024691e-05, + "loss": 2.8737, + "step": 53717 + }, + { + "epoch": 2.63, + "grad_norm": 0.7017925977706909, + "learning_rate": 2.1932401850159142e-05, + "loss": 2.7985, + "step": 53718 + }, + { + "epoch": 2.63, + "grad_norm": 0.7645330429077148, + "learning_rate": 2.1926623592447834e-05, + "loss": 2.939, + "step": 53719 + }, + { + "epoch": 2.63, + "grad_norm": 0.7296159863471985, + "learning_rate": 2.1920846067128372e-05, + "loss": 2.9681, + "step": 53720 + }, + { + "epoch": 2.63, + "grad_norm": 0.7412862181663513, + "learning_rate": 2.1915069274215812e-05, + "loss": 2.8342, + "step": 53721 + }, + { + "epoch": 2.63, + "grad_norm": 0.7657079100608826, + "learning_rate": 2.190929321372551e-05, + "loss": 3.0628, + "step": 53722 + }, + { + "epoch": 2.63, + "grad_norm": 0.7571131587028503, + "learning_rate": 2.1903517885672552e-05, + "loss": 2.7669, + "step": 53723 + }, + { + "epoch": 2.63, + "grad_norm": 0.7139904499053955, + "learning_rate": 2.1897743290072257e-05, + "loss": 2.8135, + "step": 53724 + }, + { + "epoch": 2.63, + "grad_norm": 0.7659240365028381, + "learning_rate": 2.1891969426939783e-05, + "loss": 2.8425, + "step": 53725 + }, + { + "epoch": 2.63, + "grad_norm": 0.7518433928489685, + "learning_rate": 2.188619629629028e-05, + "loss": 2.9292, + "step": 53726 + }, + { + "epoch": 2.63, + "grad_norm": 0.8315146565437317, + "learning_rate": 2.1880423898139078e-05, + "loss": 2.921, + "step": 53727 + }, + { + "epoch": 2.63, + "grad_norm": 0.7377954125404358, + "learning_rate": 2.1874652232501255e-05, + "loss": 2.7456, + "step": 53728 + }, + { + "epoch": 2.63, + "grad_norm": 0.7297799587249756, + "learning_rate": 2.186888129939207e-05, + "loss": 2.5994, + "step": 53729 + }, + { + "epoch": 2.63, + "grad_norm": 0.7472618818283081, + "learning_rate": 2.186311109882678e-05, + "loss": 2.7483, + "step": 53730 + }, + { + "epoch": 2.63, + "grad_norm": 0.7658689618110657, + "learning_rate": 2.1857341630820503e-05, + "loss": 2.6907, + "step": 53731 + }, + { + "epoch": 2.63, + "grad_norm": 0.7699265480041504, + "learning_rate": 2.185157289538846e-05, + "loss": 2.9277, + "step": 53732 + }, + { + "epoch": 2.63, + "grad_norm": 0.8159839510917664, + "learning_rate": 2.1845804892545872e-05, + "loss": 2.8331, + "step": 53733 + }, + { + "epoch": 2.63, + "grad_norm": 0.7433233857154846, + "learning_rate": 2.1840037622307827e-05, + "loss": 2.7793, + "step": 53734 + }, + { + "epoch": 2.63, + "grad_norm": 0.7243334054946899, + "learning_rate": 2.183427108468968e-05, + "loss": 2.8757, + "step": 53735 + }, + { + "epoch": 2.63, + "grad_norm": 0.7512285709381104, + "learning_rate": 2.1828505279706454e-05, + "loss": 2.91, + "step": 53736 + }, + { + "epoch": 2.63, + "grad_norm": 0.761920690536499, + "learning_rate": 2.182274020737347e-05, + "loss": 2.9553, + "step": 53737 + }, + { + "epoch": 2.63, + "grad_norm": 0.7395731210708618, + "learning_rate": 2.181697586770578e-05, + "loss": 2.8619, + "step": 53738 + }, + { + "epoch": 2.63, + "grad_norm": 0.7716213464736938, + "learning_rate": 2.18112122607187e-05, + "loss": 2.9312, + "step": 53739 + }, + { + "epoch": 2.63, + "grad_norm": 0.7732451558113098, + "learning_rate": 2.1805449386427364e-05, + "loss": 2.8891, + "step": 53740 + }, + { + "epoch": 2.63, + "grad_norm": 0.7086454629898071, + "learning_rate": 2.1799687244846852e-05, + "loss": 3.0304, + "step": 53741 + }, + { + "epoch": 2.63, + "grad_norm": 0.7271209359169006, + "learning_rate": 2.1793925835992486e-05, + "loss": 2.7164, + "step": 53742 + }, + { + "epoch": 2.63, + "grad_norm": 0.7848542332649231, + "learning_rate": 2.1788165159879356e-05, + "loss": 2.7783, + "step": 53743 + }, + { + "epoch": 2.63, + "grad_norm": 0.7181041240692139, + "learning_rate": 2.1782405216522615e-05, + "loss": 2.8273, + "step": 53744 + }, + { + "epoch": 2.63, + "grad_norm": 0.7394196391105652, + "learning_rate": 2.177664600593755e-05, + "loss": 2.9158, + "step": 53745 + }, + { + "epoch": 2.63, + "grad_norm": 0.7684555053710938, + "learning_rate": 2.1770887528139245e-05, + "loss": 2.8852, + "step": 53746 + }, + { + "epoch": 2.63, + "grad_norm": 0.733661413192749, + "learning_rate": 2.1765129783142897e-05, + "loss": 2.8123, + "step": 53747 + }, + { + "epoch": 2.63, + "grad_norm": 0.7537463307380676, + "learning_rate": 2.1759372770963558e-05, + "loss": 2.9477, + "step": 53748 + }, + { + "epoch": 2.63, + "grad_norm": 0.752604067325592, + "learning_rate": 2.175361649161651e-05, + "loss": 2.8565, + "step": 53749 + }, + { + "epoch": 2.63, + "grad_norm": 0.8050605654716492, + "learning_rate": 2.1747860945116946e-05, + "loss": 2.6993, + "step": 53750 + }, + { + "epoch": 2.63, + "grad_norm": 0.7391548156738281, + "learning_rate": 2.1742106131479886e-05, + "loss": 2.7675, + "step": 53751 + }, + { + "epoch": 2.63, + "grad_norm": 0.7486521005630493, + "learning_rate": 2.173635205072065e-05, + "loss": 2.7678, + "step": 53752 + }, + { + "epoch": 2.63, + "grad_norm": 0.7553504705429077, + "learning_rate": 2.173059870285423e-05, + "loss": 2.8386, + "step": 53753 + }, + { + "epoch": 2.63, + "grad_norm": 0.8861485123634338, + "learning_rate": 2.1724846087895943e-05, + "loss": 2.651, + "step": 53754 + }, + { + "epoch": 2.63, + "grad_norm": 0.7568973898887634, + "learning_rate": 2.171909420586081e-05, + "loss": 2.9888, + "step": 53755 + }, + { + "epoch": 2.63, + "grad_norm": 0.7912908792495728, + "learning_rate": 2.171334305676402e-05, + "loss": 2.9017, + "step": 53756 + }, + { + "epoch": 2.63, + "grad_norm": 0.7801294326782227, + "learning_rate": 2.1707592640620762e-05, + "loss": 2.8552, + "step": 53757 + }, + { + "epoch": 2.63, + "grad_norm": 0.749573290348053, + "learning_rate": 2.1701842957446092e-05, + "loss": 2.9134, + "step": 53758 + }, + { + "epoch": 2.63, + "grad_norm": 0.778415322303772, + "learning_rate": 2.1696094007255194e-05, + "loss": 2.8841, + "step": 53759 + }, + { + "epoch": 2.63, + "grad_norm": 0.7514265179634094, + "learning_rate": 2.169034579006329e-05, + "loss": 3.0026, + "step": 53760 + }, + { + "epoch": 2.63, + "grad_norm": 0.7774766683578491, + "learning_rate": 2.168459830588547e-05, + "loss": 2.7802, + "step": 53761 + }, + { + "epoch": 2.63, + "grad_norm": 0.7230890393257141, + "learning_rate": 2.167885155473682e-05, + "loss": 2.9125, + "step": 53762 + }, + { + "epoch": 2.63, + "grad_norm": 0.7347655892372131, + "learning_rate": 2.1673105536632497e-05, + "loss": 3.0178, + "step": 53763 + }, + { + "epoch": 2.63, + "grad_norm": 0.7173894643783569, + "learning_rate": 2.166736025158762e-05, + "loss": 3.0613, + "step": 53764 + }, + { + "epoch": 2.63, + "grad_norm": 0.7474822402000427, + "learning_rate": 2.1661615699617407e-05, + "loss": 2.8945, + "step": 53765 + }, + { + "epoch": 2.63, + "grad_norm": 0.7168195247650146, + "learning_rate": 2.1655871880736887e-05, + "loss": 2.8247, + "step": 53766 + }, + { + "epoch": 2.63, + "grad_norm": 0.7810943722724915, + "learning_rate": 2.1650128794961276e-05, + "loss": 2.7536, + "step": 53767 + }, + { + "epoch": 2.64, + "grad_norm": 0.7393297553062439, + "learning_rate": 2.1644386442305627e-05, + "loss": 2.957, + "step": 53768 + }, + { + "epoch": 2.64, + "grad_norm": 0.758391797542572, + "learning_rate": 2.1638644822785166e-05, + "loss": 2.6468, + "step": 53769 + }, + { + "epoch": 2.64, + "grad_norm": 0.764182448387146, + "learning_rate": 2.1632903936414848e-05, + "loss": 2.9961, + "step": 53770 + }, + { + "epoch": 2.64, + "grad_norm": 0.7735395431518555, + "learning_rate": 2.162716378320989e-05, + "loss": 2.9979, + "step": 53771 + }, + { + "epoch": 2.64, + "grad_norm": 0.7840405702590942, + "learning_rate": 2.162142436318548e-05, + "loss": 2.9104, + "step": 53772 + }, + { + "epoch": 2.64, + "grad_norm": 0.7420419454574585, + "learning_rate": 2.1615685676356577e-05, + "loss": 3.1512, + "step": 53773 + }, + { + "epoch": 2.64, + "grad_norm": 0.7456751465797424, + "learning_rate": 2.160994772273846e-05, + "loss": 2.9972, + "step": 53774 + }, + { + "epoch": 2.64, + "grad_norm": 0.7378044128417969, + "learning_rate": 2.16042105023461e-05, + "loss": 2.7904, + "step": 53775 + }, + { + "epoch": 2.64, + "grad_norm": 0.744476318359375, + "learning_rate": 2.1598474015194733e-05, + "loss": 2.807, + "step": 53776 + }, + { + "epoch": 2.64, + "grad_norm": 0.7326436638832092, + "learning_rate": 2.1592738261299393e-05, + "loss": 2.8704, + "step": 53777 + }, + { + "epoch": 2.64, + "grad_norm": 0.752892792224884, + "learning_rate": 2.158700324067516e-05, + "loss": 2.941, + "step": 53778 + }, + { + "epoch": 2.64, + "grad_norm": 0.7676821947097778, + "learning_rate": 2.1581268953337227e-05, + "loss": 2.8962, + "step": 53779 + }, + { + "epoch": 2.64, + "grad_norm": 0.788492739200592, + "learning_rate": 2.157553539930058e-05, + "loss": 3.0261, + "step": 53780 + }, + { + "epoch": 2.64, + "grad_norm": 0.7496923804283142, + "learning_rate": 2.1569802578580407e-05, + "loss": 3.1289, + "step": 53781 + }, + { + "epoch": 2.64, + "grad_norm": 0.7280400395393372, + "learning_rate": 2.156407049119183e-05, + "loss": 3.0553, + "step": 53782 + }, + { + "epoch": 2.64, + "grad_norm": 0.8010007739067078, + "learning_rate": 2.1558339137149904e-05, + "loss": 2.7921, + "step": 53783 + }, + { + "epoch": 2.64, + "grad_norm": 0.7570450305938721, + "learning_rate": 2.155260851646975e-05, + "loss": 3.0598, + "step": 53784 + }, + { + "epoch": 2.64, + "grad_norm": 0.7340518832206726, + "learning_rate": 2.1546878629166355e-05, + "loss": 2.8535, + "step": 53785 + }, + { + "epoch": 2.64, + "grad_norm": 0.7815223336219788, + "learning_rate": 2.1541149475254903e-05, + "loss": 3.059, + "step": 53786 + }, + { + "epoch": 2.64, + "grad_norm": 0.7372803688049316, + "learning_rate": 2.1535421054750558e-05, + "loss": 3.0539, + "step": 53787 + }, + { + "epoch": 2.64, + "grad_norm": 0.7437090277671814, + "learning_rate": 2.1529693367668233e-05, + "loss": 3.0135, + "step": 53788 + }, + { + "epoch": 2.64, + "grad_norm": 0.7286654710769653, + "learning_rate": 2.152396641402315e-05, + "loss": 2.7342, + "step": 53789 + }, + { + "epoch": 2.64, + "grad_norm": 0.7460008263587952, + "learning_rate": 2.1518240193830306e-05, + "loss": 3.0216, + "step": 53790 + }, + { + "epoch": 2.64, + "grad_norm": 0.7398113012313843, + "learning_rate": 2.151251470710488e-05, + "loss": 3.0744, + "step": 53791 + }, + { + "epoch": 2.64, + "grad_norm": 0.7812997698783875, + "learning_rate": 2.150678995386189e-05, + "loss": 2.8752, + "step": 53792 + }, + { + "epoch": 2.64, + "grad_norm": 0.7865747809410095, + "learning_rate": 2.1501065934116368e-05, + "loss": 2.8237, + "step": 53793 + }, + { + "epoch": 2.64, + "grad_norm": 0.7350339293479919, + "learning_rate": 2.1495342647883496e-05, + "loss": 2.929, + "step": 53794 + }, + { + "epoch": 2.64, + "grad_norm": 0.7692163586616516, + "learning_rate": 2.148962009517823e-05, + "loss": 2.9889, + "step": 53795 + }, + { + "epoch": 2.64, + "grad_norm": 0.7704609036445618, + "learning_rate": 2.1483898276015688e-05, + "loss": 2.9512, + "step": 53796 + }, + { + "epoch": 2.64, + "grad_norm": 0.7423082590103149, + "learning_rate": 2.147817719041103e-05, + "loss": 2.7966, + "step": 53797 + }, + { + "epoch": 2.64, + "grad_norm": 0.7098038196563721, + "learning_rate": 2.147245683837927e-05, + "loss": 3.0031, + "step": 53798 + }, + { + "epoch": 2.64, + "grad_norm": 0.7264339923858643, + "learning_rate": 2.1466737219935405e-05, + "loss": 2.8246, + "step": 53799 + }, + { + "epoch": 2.64, + "grad_norm": 0.7618200778961182, + "learning_rate": 2.1461018335094548e-05, + "loss": 3.1037, + "step": 53800 + }, + { + "epoch": 2.64, + "grad_norm": 0.742719292640686, + "learning_rate": 2.1455300183871728e-05, + "loss": 2.9129, + "step": 53801 + }, + { + "epoch": 2.64, + "grad_norm": 0.8027572631835938, + "learning_rate": 2.1449582766282125e-05, + "loss": 3.0597, + "step": 53802 + }, + { + "epoch": 2.64, + "grad_norm": 0.7896417379379272, + "learning_rate": 2.1443866082340632e-05, + "loss": 3.015, + "step": 53803 + }, + { + "epoch": 2.64, + "grad_norm": 0.7622906565666199, + "learning_rate": 2.1438150132062436e-05, + "loss": 3.051, + "step": 53804 + }, + { + "epoch": 2.64, + "grad_norm": 0.7213104367256165, + "learning_rate": 2.143243491546256e-05, + "loss": 2.6144, + "step": 53805 + }, + { + "epoch": 2.64, + "grad_norm": 0.754871666431427, + "learning_rate": 2.142672043255599e-05, + "loss": 2.8604, + "step": 53806 + }, + { + "epoch": 2.64, + "grad_norm": 0.7541650533676147, + "learning_rate": 2.1421006683357844e-05, + "loss": 2.7727, + "step": 53807 + }, + { + "epoch": 2.64, + "grad_norm": 0.7695738673210144, + "learning_rate": 2.141529366788315e-05, + "loss": 2.9745, + "step": 53808 + }, + { + "epoch": 2.64, + "grad_norm": 0.7393126487731934, + "learning_rate": 2.140958138614699e-05, + "loss": 2.868, + "step": 53809 + }, + { + "epoch": 2.64, + "grad_norm": 0.7608216404914856, + "learning_rate": 2.140386983816429e-05, + "loss": 2.8814, + "step": 53810 + }, + { + "epoch": 2.64, + "grad_norm": 0.7869523763656616, + "learning_rate": 2.1398159023950235e-05, + "loss": 2.9099, + "step": 53811 + }, + { + "epoch": 2.64, + "grad_norm": 0.7133737206459045, + "learning_rate": 2.1392448943519812e-05, + "loss": 2.9117, + "step": 53812 + }, + { + "epoch": 2.64, + "grad_norm": 0.7602190375328064, + "learning_rate": 2.1386739596888116e-05, + "loss": 2.8612, + "step": 53813 + }, + { + "epoch": 2.64, + "grad_norm": 0.7913804650306702, + "learning_rate": 2.138103098407009e-05, + "loss": 2.9442, + "step": 53814 + }, + { + "epoch": 2.64, + "grad_norm": 0.7600404620170593, + "learning_rate": 2.1375323105080768e-05, + "loss": 2.8422, + "step": 53815 + }, + { + "epoch": 2.64, + "grad_norm": 0.7508252263069153, + "learning_rate": 2.1369615959935192e-05, + "loss": 2.6938, + "step": 53816 + }, + { + "epoch": 2.64, + "grad_norm": 0.7605404853820801, + "learning_rate": 2.1363909548648527e-05, + "loss": 2.6986, + "step": 53817 + }, + { + "epoch": 2.64, + "grad_norm": 0.7708483934402466, + "learning_rate": 2.1358203871235623e-05, + "loss": 2.7698, + "step": 53818 + }, + { + "epoch": 2.64, + "grad_norm": 0.7216460704803467, + "learning_rate": 2.1352498927711603e-05, + "loss": 2.9233, + "step": 53819 + }, + { + "epoch": 2.64, + "grad_norm": 0.7428836226463318, + "learning_rate": 2.134679471809152e-05, + "loss": 2.8189, + "step": 53820 + }, + { + "epoch": 2.64, + "grad_norm": 0.757232666015625, + "learning_rate": 2.1341091242390296e-05, + "loss": 2.8876, + "step": 53821 + }, + { + "epoch": 2.64, + "grad_norm": 0.7275024652481079, + "learning_rate": 2.133538850062305e-05, + "loss": 2.7804, + "step": 53822 + }, + { + "epoch": 2.64, + "grad_norm": 0.7810689210891724, + "learning_rate": 2.132968649280471e-05, + "loss": 2.8834, + "step": 53823 + }, + { + "epoch": 2.64, + "grad_norm": 0.758514404296875, + "learning_rate": 2.132398521895039e-05, + "loss": 2.9051, + "step": 53824 + }, + { + "epoch": 2.64, + "grad_norm": 0.7726489305496216, + "learning_rate": 2.1318284679075016e-05, + "loss": 3.0325, + "step": 53825 + }, + { + "epoch": 2.64, + "grad_norm": 0.7153679728507996, + "learning_rate": 2.1312584873193705e-05, + "loss": 2.9691, + "step": 53826 + }, + { + "epoch": 2.64, + "grad_norm": 0.7290853261947632, + "learning_rate": 2.1306885801321384e-05, + "loss": 3.0509, + "step": 53827 + }, + { + "epoch": 2.64, + "grad_norm": 0.7138325572013855, + "learning_rate": 2.1301187463473067e-05, + "loss": 2.9116, + "step": 53828 + }, + { + "epoch": 2.64, + "grad_norm": 0.7441121339797974, + "learning_rate": 2.1295489859663817e-05, + "loss": 2.9767, + "step": 53829 + }, + { + "epoch": 2.64, + "grad_norm": 0.7585853934288025, + "learning_rate": 2.1289792989908583e-05, + "loss": 2.8873, + "step": 53830 + }, + { + "epoch": 2.64, + "grad_norm": 0.7859900593757629, + "learning_rate": 2.1284096854222388e-05, + "loss": 2.9058, + "step": 53831 + }, + { + "epoch": 2.64, + "grad_norm": 0.762042224407196, + "learning_rate": 2.1278401452620287e-05, + "loss": 2.8423, + "step": 53832 + }, + { + "epoch": 2.64, + "grad_norm": 0.7395182251930237, + "learning_rate": 2.1272706785117167e-05, + "loss": 2.7781, + "step": 53833 + }, + { + "epoch": 2.64, + "grad_norm": 0.7580464482307434, + "learning_rate": 2.126701285172815e-05, + "loss": 2.6739, + "step": 53834 + }, + { + "epoch": 2.64, + "grad_norm": 0.7301293015480042, + "learning_rate": 2.126131965246819e-05, + "loss": 2.7645, + "step": 53835 + }, + { + "epoch": 2.64, + "grad_norm": 0.7253274321556091, + "learning_rate": 2.1255627187352244e-05, + "loss": 2.7732, + "step": 53836 + }, + { + "epoch": 2.64, + "grad_norm": 0.7815554738044739, + "learning_rate": 2.124993545639533e-05, + "loss": 2.714, + "step": 53837 + }, + { + "epoch": 2.64, + "grad_norm": 0.7133461236953735, + "learning_rate": 2.1244244459612436e-05, + "loss": 2.8703, + "step": 53838 + }, + { + "epoch": 2.64, + "grad_norm": 0.7465064525604248, + "learning_rate": 2.1238554197018587e-05, + "loss": 2.9164, + "step": 53839 + }, + { + "epoch": 2.64, + "grad_norm": 0.7770967483520508, + "learning_rate": 2.123286466862867e-05, + "loss": 2.7855, + "step": 53840 + }, + { + "epoch": 2.64, + "grad_norm": 0.7370714545249939, + "learning_rate": 2.12271758744578e-05, + "loss": 2.7894, + "step": 53841 + }, + { + "epoch": 2.64, + "grad_norm": 0.747188925743103, + "learning_rate": 2.122148781452091e-05, + "loss": 3.0637, + "step": 53842 + }, + { + "epoch": 2.64, + "grad_norm": 0.747463047504425, + "learning_rate": 2.1215800488832913e-05, + "loss": 2.6843, + "step": 53843 + }, + { + "epoch": 2.64, + "grad_norm": 0.7289397120475769, + "learning_rate": 2.1210113897408898e-05, + "loss": 2.9257, + "step": 53844 + }, + { + "epoch": 2.64, + "grad_norm": 0.7071889042854309, + "learning_rate": 2.120442804026372e-05, + "loss": 2.7178, + "step": 53845 + }, + { + "epoch": 2.64, + "grad_norm": 0.7718331813812256, + "learning_rate": 2.119874291741247e-05, + "loss": 3.0457, + "step": 53846 + }, + { + "epoch": 2.64, + "grad_norm": 0.7785387635231018, + "learning_rate": 2.1193058528870066e-05, + "loss": 2.9793, + "step": 53847 + }, + { + "epoch": 2.64, + "grad_norm": 0.7537055015563965, + "learning_rate": 2.11873748746515e-05, + "loss": 2.723, + "step": 53848 + }, + { + "epoch": 2.64, + "grad_norm": 0.7677419185638428, + "learning_rate": 2.118169195477175e-05, + "loss": 2.7947, + "step": 53849 + }, + { + "epoch": 2.64, + "grad_norm": 0.7995899319648743, + "learning_rate": 2.1176009769245683e-05, + "loss": 3.0872, + "step": 53850 + }, + { + "epoch": 2.64, + "grad_norm": 0.688861608505249, + "learning_rate": 2.1170328318088447e-05, + "loss": 2.8195, + "step": 53851 + }, + { + "epoch": 2.64, + "grad_norm": 0.8810485005378723, + "learning_rate": 2.11646476013148e-05, + "loss": 2.9762, + "step": 53852 + }, + { + "epoch": 2.64, + "grad_norm": 0.7384539842605591, + "learning_rate": 2.1158967618939858e-05, + "loss": 2.8733, + "step": 53853 + }, + { + "epoch": 2.64, + "grad_norm": 0.7322173118591309, + "learning_rate": 2.115328837097855e-05, + "loss": 2.847, + "step": 53854 + }, + { + "epoch": 2.64, + "grad_norm": 0.8136117458343506, + "learning_rate": 2.114760985744576e-05, + "loss": 2.916, + "step": 53855 + }, + { + "epoch": 2.64, + "grad_norm": 0.7917467951774597, + "learning_rate": 2.114193207835657e-05, + "loss": 2.8955, + "step": 53856 + }, + { + "epoch": 2.64, + "grad_norm": 0.7615936994552612, + "learning_rate": 2.1136255033725846e-05, + "loss": 3.037, + "step": 53857 + }, + { + "epoch": 2.64, + "grad_norm": 0.7316914200782776, + "learning_rate": 2.1130578723568538e-05, + "loss": 3.0313, + "step": 53858 + }, + { + "epoch": 2.64, + "grad_norm": 0.705625593662262, + "learning_rate": 2.112490314789963e-05, + "loss": 3.0341, + "step": 53859 + }, + { + "epoch": 2.64, + "grad_norm": 0.7249335646629333, + "learning_rate": 2.1119228306734014e-05, + "loss": 3.0307, + "step": 53860 + }, + { + "epoch": 2.64, + "grad_norm": 0.7347077131271362, + "learning_rate": 2.1113554200086748e-05, + "loss": 2.8507, + "step": 53861 + }, + { + "epoch": 2.64, + "grad_norm": 0.734248161315918, + "learning_rate": 2.1107880827972646e-05, + "loss": 2.9392, + "step": 53862 + }, + { + "epoch": 2.64, + "grad_norm": 0.7667978405952454, + "learning_rate": 2.1102208190406767e-05, + "loss": 2.7851, + "step": 53863 + }, + { + "epoch": 2.64, + "grad_norm": 0.7558903098106384, + "learning_rate": 2.1096536287403997e-05, + "loss": 2.7192, + "step": 53864 + }, + { + "epoch": 2.64, + "grad_norm": 0.8619865775108337, + "learning_rate": 2.1090865118979194e-05, + "loss": 2.8567, + "step": 53865 + }, + { + "epoch": 2.64, + "grad_norm": 0.7786526083946228, + "learning_rate": 2.1085194685147478e-05, + "loss": 2.949, + "step": 53866 + }, + { + "epoch": 2.64, + "grad_norm": 0.7425805330276489, + "learning_rate": 2.1079524985923602e-05, + "loss": 2.7537, + "step": 53867 + }, + { + "epoch": 2.64, + "grad_norm": 0.6905446648597717, + "learning_rate": 2.1073856021322555e-05, + "loss": 2.6834, + "step": 53868 + }, + { + "epoch": 2.64, + "grad_norm": 0.7856373190879822, + "learning_rate": 2.1068187791359393e-05, + "loss": 2.9288, + "step": 53869 + }, + { + "epoch": 2.64, + "grad_norm": 0.7731509804725647, + "learning_rate": 2.1062520296048903e-05, + "loss": 2.9401, + "step": 53870 + }, + { + "epoch": 2.64, + "grad_norm": 0.7262278199195862, + "learning_rate": 2.105685353540607e-05, + "loss": 2.8644, + "step": 53871 + }, + { + "epoch": 2.64, + "grad_norm": 0.730444610118866, + "learning_rate": 2.105118750944572e-05, + "loss": 3.1441, + "step": 53872 + }, + { + "epoch": 2.64, + "grad_norm": 0.7428088188171387, + "learning_rate": 2.1045522218182908e-05, + "loss": 2.9372, + "step": 53873 + }, + { + "epoch": 2.64, + "grad_norm": 0.7640175819396973, + "learning_rate": 2.103985766163252e-05, + "loss": 3.0142, + "step": 53874 + }, + { + "epoch": 2.64, + "grad_norm": 0.8164355158805847, + "learning_rate": 2.103419383980941e-05, + "loss": 2.9677, + "step": 53875 + }, + { + "epoch": 2.64, + "grad_norm": 0.7192038893699646, + "learning_rate": 2.10285307527286e-05, + "loss": 2.8549, + "step": 53876 + }, + { + "epoch": 2.64, + "grad_norm": 0.7759872078895569, + "learning_rate": 2.1022868400404913e-05, + "loss": 2.9487, + "step": 53877 + }, + { + "epoch": 2.64, + "grad_norm": 0.7276328802108765, + "learning_rate": 2.1017206782853334e-05, + "loss": 2.7763, + "step": 53878 + }, + { + "epoch": 2.64, + "grad_norm": 0.7632603049278259, + "learning_rate": 2.1011545900088757e-05, + "loss": 2.974, + "step": 53879 + }, + { + "epoch": 2.64, + "grad_norm": 0.7106670141220093, + "learning_rate": 2.1005885752126027e-05, + "loss": 2.8494, + "step": 53880 + }, + { + "epoch": 2.64, + "grad_norm": 0.7603490948677063, + "learning_rate": 2.1000226338980143e-05, + "loss": 2.8562, + "step": 53881 + }, + { + "epoch": 2.64, + "grad_norm": 0.7455410361289978, + "learning_rate": 2.0994567660665916e-05, + "loss": 2.8456, + "step": 53882 + }, + { + "epoch": 2.64, + "grad_norm": 0.7279428243637085, + "learning_rate": 2.098890971719831e-05, + "loss": 3.1957, + "step": 53883 + }, + { + "epoch": 2.64, + "grad_norm": 0.8473446369171143, + "learning_rate": 2.098325250859224e-05, + "loss": 2.9781, + "step": 53884 + }, + { + "epoch": 2.64, + "grad_norm": 0.7343671321868896, + "learning_rate": 2.0977596034862632e-05, + "loss": 2.9825, + "step": 53885 + }, + { + "epoch": 2.64, + "grad_norm": 0.7839120030403137, + "learning_rate": 2.0971940296024304e-05, + "loss": 3.0148, + "step": 53886 + }, + { + "epoch": 2.64, + "grad_norm": 0.7506952285766602, + "learning_rate": 2.0966285292092144e-05, + "loss": 3.0424, + "step": 53887 + }, + { + "epoch": 2.64, + "grad_norm": 0.7599936723709106, + "learning_rate": 2.0960631023081077e-05, + "loss": 2.9168, + "step": 53888 + }, + { + "epoch": 2.64, + "grad_norm": 0.7371647953987122, + "learning_rate": 2.0954977489006085e-05, + "loss": 2.5458, + "step": 53889 + }, + { + "epoch": 2.64, + "grad_norm": 0.7845785021781921, + "learning_rate": 2.0949324689881896e-05, + "loss": 2.9261, + "step": 53890 + }, + { + "epoch": 2.64, + "grad_norm": 0.7766330242156982, + "learning_rate": 2.094367262572356e-05, + "loss": 2.9509, + "step": 53891 + }, + { + "epoch": 2.64, + "grad_norm": 0.7818014025688171, + "learning_rate": 2.0938021296545804e-05, + "loss": 2.9393, + "step": 53892 + }, + { + "epoch": 2.64, + "grad_norm": 0.7388964891433716, + "learning_rate": 2.0932370702363678e-05, + "loss": 2.8543, + "step": 53893 + }, + { + "epoch": 2.64, + "grad_norm": 0.7346888780593872, + "learning_rate": 2.0926720843191936e-05, + "loss": 2.8711, + "step": 53894 + }, + { + "epoch": 2.64, + "grad_norm": 0.7887476086616516, + "learning_rate": 2.0921071719045503e-05, + "loss": 2.9683, + "step": 53895 + }, + { + "epoch": 2.64, + "grad_norm": 0.7444952130317688, + "learning_rate": 2.091542332993926e-05, + "loss": 2.6975, + "step": 53896 + }, + { + "epoch": 2.64, + "grad_norm": 0.7575118541717529, + "learning_rate": 2.0909775675888073e-05, + "loss": 2.5881, + "step": 53897 + }, + { + "epoch": 2.64, + "grad_norm": 0.7286080121994019, + "learning_rate": 2.090412875690679e-05, + "loss": 2.7892, + "step": 53898 + }, + { + "epoch": 2.64, + "grad_norm": 0.7232897281646729, + "learning_rate": 2.08984825730104e-05, + "loss": 2.9935, + "step": 53899 + }, + { + "epoch": 2.64, + "grad_norm": 0.7478100061416626, + "learning_rate": 2.0892837124213657e-05, + "loss": 2.9702, + "step": 53900 + }, + { + "epoch": 2.64, + "grad_norm": 0.7384991645812988, + "learning_rate": 2.0887192410531483e-05, + "loss": 2.8308, + "step": 53901 + }, + { + "epoch": 2.64, + "grad_norm": 0.7648786902427673, + "learning_rate": 2.08815484319787e-05, + "loss": 2.9977, + "step": 53902 + }, + { + "epoch": 2.64, + "grad_norm": 0.7234745621681213, + "learning_rate": 2.0875905188570163e-05, + "loss": 2.9809, + "step": 53903 + }, + { + "epoch": 2.64, + "grad_norm": 0.7378318905830383, + "learning_rate": 2.087026268032086e-05, + "loss": 2.9467, + "step": 53904 + }, + { + "epoch": 2.64, + "grad_norm": 0.7225425243377686, + "learning_rate": 2.086462090724548e-05, + "loss": 2.7495, + "step": 53905 + }, + { + "epoch": 2.64, + "grad_norm": 0.744601309299469, + "learning_rate": 2.085897986935904e-05, + "loss": 2.9628, + "step": 53906 + }, + { + "epoch": 2.64, + "grad_norm": 0.7627322673797607, + "learning_rate": 2.085333956667633e-05, + "loss": 2.8884, + "step": 53907 + }, + { + "epoch": 2.64, + "grad_norm": 0.7031084299087524, + "learning_rate": 2.084769999921214e-05, + "loss": 3.1328, + "step": 53908 + }, + { + "epoch": 2.64, + "grad_norm": 0.8040719032287598, + "learning_rate": 2.0842061166981427e-05, + "loss": 2.9711, + "step": 53909 + }, + { + "epoch": 2.64, + "grad_norm": 0.733121931552887, + "learning_rate": 2.0836423069998974e-05, + "loss": 2.9283, + "step": 53910 + }, + { + "epoch": 2.64, + "grad_norm": 0.7206590175628662, + "learning_rate": 2.0830785708279673e-05, + "loss": 2.8658, + "step": 53911 + }, + { + "epoch": 2.64, + "grad_norm": 0.7238807678222656, + "learning_rate": 2.082514908183831e-05, + "loss": 2.9172, + "step": 53912 + }, + { + "epoch": 2.64, + "grad_norm": 0.7584816217422485, + "learning_rate": 2.081951319068981e-05, + "loss": 2.977, + "step": 53913 + }, + { + "epoch": 2.64, + "grad_norm": 0.8075535297393799, + "learning_rate": 2.081387803484902e-05, + "loss": 2.8249, + "step": 53914 + }, + { + "epoch": 2.64, + "grad_norm": 0.7531391382217407, + "learning_rate": 2.0808243614330733e-05, + "loss": 2.8588, + "step": 53915 + }, + { + "epoch": 2.64, + "grad_norm": 0.7435611486434937, + "learning_rate": 2.0802609929149804e-05, + "loss": 2.9846, + "step": 53916 + }, + { + "epoch": 2.64, + "grad_norm": 0.7779226303100586, + "learning_rate": 2.0796976979321024e-05, + "loss": 2.9811, + "step": 53917 + }, + { + "epoch": 2.64, + "grad_norm": 0.7348940372467041, + "learning_rate": 2.079134476485924e-05, + "loss": 2.8087, + "step": 53918 + }, + { + "epoch": 2.64, + "grad_norm": 0.7662265300750732, + "learning_rate": 2.0785713285779415e-05, + "loss": 2.6446, + "step": 53919 + }, + { + "epoch": 2.64, + "grad_norm": 0.8061109185218811, + "learning_rate": 2.07800825420962e-05, + "loss": 2.6772, + "step": 53920 + }, + { + "epoch": 2.64, + "grad_norm": 0.744640052318573, + "learning_rate": 2.0774452533824583e-05, + "loss": 3.0171, + "step": 53921 + }, + { + "epoch": 2.64, + "grad_norm": 0.7543174624443054, + "learning_rate": 2.0768823260979318e-05, + "loss": 2.7892, + "step": 53922 + }, + { + "epoch": 2.64, + "grad_norm": 0.7616002559661865, + "learning_rate": 2.0763194723575193e-05, + "loss": 2.8543, + "step": 53923 + }, + { + "epoch": 2.64, + "grad_norm": 0.725988507270813, + "learning_rate": 2.07575669216271e-05, + "loss": 2.82, + "step": 53924 + }, + { + "epoch": 2.64, + "grad_norm": 0.7437060475349426, + "learning_rate": 2.075193985514979e-05, + "loss": 2.7369, + "step": 53925 + }, + { + "epoch": 2.64, + "grad_norm": 0.7315745949745178, + "learning_rate": 2.0746313524158188e-05, + "loss": 2.7923, + "step": 53926 + }, + { + "epoch": 2.64, + "grad_norm": 0.7858138680458069, + "learning_rate": 2.0740687928666978e-05, + "loss": 2.9287, + "step": 53927 + }, + { + "epoch": 2.64, + "grad_norm": 0.7877421975135803, + "learning_rate": 2.073506306869115e-05, + "loss": 2.9113, + "step": 53928 + }, + { + "epoch": 2.64, + "grad_norm": 0.8002856969833374, + "learning_rate": 2.072943894424536e-05, + "loss": 2.9444, + "step": 53929 + }, + { + "epoch": 2.64, + "grad_norm": 0.7211002111434937, + "learning_rate": 2.0723815555344457e-05, + "loss": 2.7723, + "step": 53930 + }, + { + "epoch": 2.64, + "grad_norm": 0.7559928894042969, + "learning_rate": 2.0718192902003338e-05, + "loss": 2.9722, + "step": 53931 + }, + { + "epoch": 2.64, + "grad_norm": 0.7671013474464417, + "learning_rate": 2.0712570984236686e-05, + "loss": 2.9676, + "step": 53932 + }, + { + "epoch": 2.64, + "grad_norm": 0.7174298167228699, + "learning_rate": 2.070694980205946e-05, + "loss": 2.8203, + "step": 53933 + }, + { + "epoch": 2.64, + "grad_norm": 0.7641894817352295, + "learning_rate": 2.0701329355486274e-05, + "loss": 2.8446, + "step": 53934 + }, + { + "epoch": 2.64, + "grad_norm": 0.6962066292762756, + "learning_rate": 2.0695709644532056e-05, + "loss": 2.7518, + "step": 53935 + }, + { + "epoch": 2.64, + "grad_norm": 0.74459308385849, + "learning_rate": 2.0690090669211658e-05, + "loss": 2.8212, + "step": 53936 + }, + { + "epoch": 2.64, + "grad_norm": 0.7321491837501526, + "learning_rate": 2.0684472429539767e-05, + "loss": 2.8309, + "step": 53937 + }, + { + "epoch": 2.64, + "grad_norm": 0.739220917224884, + "learning_rate": 2.067885492553124e-05, + "loss": 3.009, + "step": 53938 + }, + { + "epoch": 2.64, + "grad_norm": 0.7226603627204895, + "learning_rate": 2.06732381572008e-05, + "loss": 2.7967, + "step": 53939 + }, + { + "epoch": 2.64, + "grad_norm": 0.7862400412559509, + "learning_rate": 2.06676221245633e-05, + "loss": 2.827, + "step": 53940 + }, + { + "epoch": 2.64, + "grad_norm": 0.7336839437484741, + "learning_rate": 2.0662006827633558e-05, + "loss": 2.9242, + "step": 53941 + }, + { + "epoch": 2.64, + "grad_norm": 0.7695524096488953, + "learning_rate": 2.06563922664263e-05, + "loss": 2.8123, + "step": 53942 + }, + { + "epoch": 2.64, + "grad_norm": 0.7591333389282227, + "learning_rate": 2.0650778440956383e-05, + "loss": 2.9191, + "step": 53943 + }, + { + "epoch": 2.64, + "grad_norm": 0.7534220218658447, + "learning_rate": 2.0645165351238557e-05, + "loss": 2.9842, + "step": 53944 + }, + { + "epoch": 2.64, + "grad_norm": 0.7719517946243286, + "learning_rate": 2.0639552997287546e-05, + "loss": 2.8474, + "step": 53945 + }, + { + "epoch": 2.64, + "grad_norm": 0.7812724113464355, + "learning_rate": 2.0633941379118237e-05, + "loss": 2.7268, + "step": 53946 + }, + { + "epoch": 2.64, + "grad_norm": 0.7152418494224548, + "learning_rate": 2.0628330496745317e-05, + "loss": 2.833, + "step": 53947 + }, + { + "epoch": 2.64, + "grad_norm": 0.7359592914581299, + "learning_rate": 2.0622720350183674e-05, + "loss": 2.8466, + "step": 53948 + }, + { + "epoch": 2.64, + "grad_norm": 0.7347102761268616, + "learning_rate": 2.0617110939447966e-05, + "loss": 3.0976, + "step": 53949 + }, + { + "epoch": 2.64, + "grad_norm": 0.7281254529953003, + "learning_rate": 2.0611502264553048e-05, + "loss": 3.0755, + "step": 53950 + }, + { + "epoch": 2.64, + "grad_norm": 0.7379975914955139, + "learning_rate": 2.0605894325513673e-05, + "loss": 2.7204, + "step": 53951 + }, + { + "epoch": 2.64, + "grad_norm": 0.7058905959129333, + "learning_rate": 2.0600287122344562e-05, + "loss": 2.7165, + "step": 53952 + }, + { + "epoch": 2.64, + "grad_norm": 0.7494770884513855, + "learning_rate": 2.0594680655060536e-05, + "loss": 2.9196, + "step": 53953 + }, + { + "epoch": 2.64, + "grad_norm": 0.7461589574813843, + "learning_rate": 2.0589074923676352e-05, + "loss": 2.7942, + "step": 53954 + }, + { + "epoch": 2.64, + "grad_norm": 0.8033571839332581, + "learning_rate": 2.058346992820673e-05, + "loss": 2.6223, + "step": 53955 + }, + { + "epoch": 2.64, + "grad_norm": 0.7271087169647217, + "learning_rate": 2.0577865668666527e-05, + "loss": 3.1622, + "step": 53956 + }, + { + "epoch": 2.64, + "grad_norm": 0.7702967524528503, + "learning_rate": 2.0572262145070427e-05, + "loss": 2.827, + "step": 53957 + }, + { + "epoch": 2.64, + "grad_norm": 0.7806286811828613, + "learning_rate": 2.056665935743326e-05, + "loss": 3.1208, + "step": 53958 + }, + { + "epoch": 2.64, + "grad_norm": 0.7661564350128174, + "learning_rate": 2.0561057305769703e-05, + "loss": 2.7039, + "step": 53959 + }, + { + "epoch": 2.64, + "grad_norm": 0.7610711455345154, + "learning_rate": 2.0555455990094515e-05, + "loss": 2.873, + "step": 53960 + }, + { + "epoch": 2.64, + "grad_norm": 0.7484396696090698, + "learning_rate": 2.054985541042252e-05, + "loss": 2.7665, + "step": 53961 + }, + { + "epoch": 2.64, + "grad_norm": 0.734775722026825, + "learning_rate": 2.0544255566768376e-05, + "loss": 3.0531, + "step": 53962 + }, + { + "epoch": 2.64, + "grad_norm": 0.728737473487854, + "learning_rate": 2.0538656459146928e-05, + "loss": 3.0091, + "step": 53963 + }, + { + "epoch": 2.64, + "grad_norm": 0.8020430207252502, + "learning_rate": 2.0533058087572805e-05, + "loss": 3.1086, + "step": 53964 + }, + { + "epoch": 2.64, + "grad_norm": 0.7418114542961121, + "learning_rate": 2.0527460452060895e-05, + "loss": 2.8423, + "step": 53965 + }, + { + "epoch": 2.64, + "grad_norm": 0.7765441536903381, + "learning_rate": 2.0521863552625884e-05, + "loss": 2.9005, + "step": 53966 + }, + { + "epoch": 2.64, + "grad_norm": 0.7442047595977783, + "learning_rate": 2.0516267389282426e-05, + "loss": 2.8706, + "step": 53967 + }, + { + "epoch": 2.64, + "grad_norm": 0.7826739549636841, + "learning_rate": 2.051067196204538e-05, + "loss": 2.861, + "step": 53968 + }, + { + "epoch": 2.64, + "grad_norm": 0.7431984543800354, + "learning_rate": 2.0505077270929392e-05, + "loss": 2.8538, + "step": 53969 + }, + { + "epoch": 2.64, + "grad_norm": 0.7477771043777466, + "learning_rate": 2.0499483315949262e-05, + "loss": 2.9119, + "step": 53970 + }, + { + "epoch": 2.64, + "grad_norm": 0.772100031375885, + "learning_rate": 2.0493890097119703e-05, + "loss": 2.8383, + "step": 53971 + }, + { + "epoch": 2.65, + "grad_norm": 0.7262080311775208, + "learning_rate": 2.0488297614455474e-05, + "loss": 2.9591, + "step": 53972 + }, + { + "epoch": 2.65, + "grad_norm": 0.8009024262428284, + "learning_rate": 2.0482705867971293e-05, + "loss": 3.0581, + "step": 53973 + }, + { + "epoch": 2.65, + "grad_norm": 0.7679193019866943, + "learning_rate": 2.0477114857681787e-05, + "loss": 2.7829, + "step": 53974 + }, + { + "epoch": 2.65, + "grad_norm": 0.7369647026062012, + "learning_rate": 2.047152458360177e-05, + "loss": 3.0353, + "step": 53975 + }, + { + "epoch": 2.65, + "grad_norm": 0.7575830817222595, + "learning_rate": 2.046593504574604e-05, + "loss": 2.9625, + "step": 53976 + }, + { + "epoch": 2.65, + "grad_norm": 0.7336440682411194, + "learning_rate": 2.0460346244129177e-05, + "loss": 2.8509, + "step": 53977 + }, + { + "epoch": 2.65, + "grad_norm": 0.7487339377403259, + "learning_rate": 2.0454758178766007e-05, + "loss": 2.6161, + "step": 53978 + }, + { + "epoch": 2.65, + "grad_norm": 0.7532362341880798, + "learning_rate": 2.044917084967115e-05, + "loss": 2.8774, + "step": 53979 + }, + { + "epoch": 2.65, + "grad_norm": 0.7453672885894775, + "learning_rate": 2.0443584256859426e-05, + "loss": 2.9005, + "step": 53980 + }, + { + "epoch": 2.65, + "grad_norm": 0.750306248664856, + "learning_rate": 2.0437998400345522e-05, + "loss": 3.0054, + "step": 53981 + }, + { + "epoch": 2.65, + "grad_norm": 0.7489728927612305, + "learning_rate": 2.0432413280144035e-05, + "loss": 3.005, + "step": 53982 + }, + { + "epoch": 2.65, + "grad_norm": 0.7205755710601807, + "learning_rate": 2.0426828896269843e-05, + "loss": 2.9092, + "step": 53983 + }, + { + "epoch": 2.65, + "grad_norm": 0.7374038696289062, + "learning_rate": 2.042124524873754e-05, + "loss": 2.7873, + "step": 53984 + }, + { + "epoch": 2.65, + "grad_norm": 0.7091577649116516, + "learning_rate": 2.041566233756188e-05, + "loss": 2.8018, + "step": 53985 + }, + { + "epoch": 2.65, + "grad_norm": 0.7290316820144653, + "learning_rate": 2.041008016275758e-05, + "loss": 2.9189, + "step": 53986 + }, + { + "epoch": 2.65, + "grad_norm": 0.7330735921859741, + "learning_rate": 2.0404498724339336e-05, + "loss": 2.9191, + "step": 53987 + }, + { + "epoch": 2.65, + "grad_norm": 0.7329016923904419, + "learning_rate": 2.0398918022321796e-05, + "loss": 3.0506, + "step": 53988 + }, + { + "epoch": 2.65, + "grad_norm": 0.764037013053894, + "learning_rate": 2.039333805671969e-05, + "loss": 2.8124, + "step": 53989 + }, + { + "epoch": 2.65, + "grad_norm": 0.7291437983512878, + "learning_rate": 2.0387758827547696e-05, + "loss": 2.7463, + "step": 53990 + }, + { + "epoch": 2.65, + "grad_norm": 0.7491414546966553, + "learning_rate": 2.038218033482061e-05, + "loss": 2.7817, + "step": 53991 + }, + { + "epoch": 2.65, + "grad_norm": 0.7548601031303406, + "learning_rate": 2.037660257855298e-05, + "loss": 2.7962, + "step": 53992 + }, + { + "epoch": 2.65, + "grad_norm": 0.7716947197914124, + "learning_rate": 2.0371025558759602e-05, + "loss": 2.8948, + "step": 53993 + }, + { + "epoch": 2.65, + "grad_norm": 0.7344406247138977, + "learning_rate": 2.036544927545509e-05, + "loss": 2.9549, + "step": 53994 + }, + { + "epoch": 2.65, + "grad_norm": 0.7470876574516296, + "learning_rate": 2.035987372865421e-05, + "loss": 3.075, + "step": 53995 + }, + { + "epoch": 2.65, + "grad_norm": 0.7501749992370605, + "learning_rate": 2.0354298918371638e-05, + "loss": 2.5949, + "step": 53996 + }, + { + "epoch": 2.65, + "grad_norm": 0.7505838871002197, + "learning_rate": 2.0348724844621932e-05, + "loss": 3.097, + "step": 53997 + }, + { + "epoch": 2.65, + "grad_norm": 0.7241684198379517, + "learning_rate": 2.034315150741992e-05, + "loss": 2.8974, + "step": 53998 + }, + { + "epoch": 2.65, + "grad_norm": 0.7406726479530334, + "learning_rate": 2.0337578906780185e-05, + "loss": 2.8339, + "step": 53999 + }, + { + "epoch": 2.65, + "grad_norm": 0.7717121839523315, + "learning_rate": 2.0332007042717447e-05, + "loss": 3.0096, + "step": 54000 + }, + { + "epoch": 2.65, + "grad_norm": 0.7408150434494019, + "learning_rate": 2.032643591524643e-05, + "loss": 2.9066, + "step": 54001 + }, + { + "epoch": 2.65, + "grad_norm": 0.7498103380203247, + "learning_rate": 2.0320865524381758e-05, + "loss": 2.8442, + "step": 54002 + }, + { + "epoch": 2.65, + "grad_norm": 0.7494451999664307, + "learning_rate": 2.0315295870138114e-05, + "loss": 2.6605, + "step": 54003 + }, + { + "epoch": 2.65, + "grad_norm": 0.7431146502494812, + "learning_rate": 2.0309726952530092e-05, + "loss": 2.8177, + "step": 54004 + }, + { + "epoch": 2.65, + "grad_norm": 0.7047357559204102, + "learning_rate": 2.0304158771572442e-05, + "loss": 2.879, + "step": 54005 + }, + { + "epoch": 2.65, + "grad_norm": 0.7716295123100281, + "learning_rate": 2.029859132727979e-05, + "loss": 3.1973, + "step": 54006 + }, + { + "epoch": 2.65, + "grad_norm": 0.7560664415359497, + "learning_rate": 2.029302461966682e-05, + "loss": 3.088, + "step": 54007 + }, + { + "epoch": 2.65, + "grad_norm": 0.7658318877220154, + "learning_rate": 2.0287458648748257e-05, + "loss": 2.7865, + "step": 54008 + }, + { + "epoch": 2.65, + "grad_norm": 0.7343339323997498, + "learning_rate": 2.0281893414538653e-05, + "loss": 2.7412, + "step": 54009 + }, + { + "epoch": 2.65, + "grad_norm": 0.7271828651428223, + "learning_rate": 2.027632891705273e-05, + "loss": 2.8022, + "step": 54010 + }, + { + "epoch": 2.65, + "grad_norm": 0.7835471034049988, + "learning_rate": 2.027076515630508e-05, + "loss": 2.8315, + "step": 54011 + }, + { + "epoch": 2.65, + "grad_norm": 0.7581990361213684, + "learning_rate": 2.0265202132310388e-05, + "loss": 2.8508, + "step": 54012 + }, + { + "epoch": 2.65, + "grad_norm": 0.7217426896095276, + "learning_rate": 2.0259639845083374e-05, + "loss": 3.0623, + "step": 54013 + }, + { + "epoch": 2.65, + "grad_norm": 0.7215791344642639, + "learning_rate": 2.0254078294638598e-05, + "loss": 2.8275, + "step": 54014 + }, + { + "epoch": 2.65, + "grad_norm": 0.7520722150802612, + "learning_rate": 2.0248517480990777e-05, + "loss": 2.8071, + "step": 54015 + }, + { + "epoch": 2.65, + "grad_norm": 0.7443130016326904, + "learning_rate": 2.024295740415447e-05, + "loss": 2.849, + "step": 54016 + }, + { + "epoch": 2.65, + "grad_norm": 0.7317799925804138, + "learning_rate": 2.0237398064144394e-05, + "loss": 2.8952, + "step": 54017 + }, + { + "epoch": 2.65, + "grad_norm": 0.7501115798950195, + "learning_rate": 2.0231839460975206e-05, + "loss": 3.1212, + "step": 54018 + }, + { + "epoch": 2.65, + "grad_norm": 0.7234840393066406, + "learning_rate": 2.0226281594661464e-05, + "loss": 3.0704, + "step": 54019 + }, + { + "epoch": 2.65, + "grad_norm": 0.7871589064598083, + "learning_rate": 2.0220724465217884e-05, + "loss": 2.9676, + "step": 54020 + }, + { + "epoch": 2.65, + "grad_norm": 0.7186853885650635, + "learning_rate": 2.0215168072659027e-05, + "loss": 3.0373, + "step": 54021 + }, + { + "epoch": 2.65, + "grad_norm": 0.7855047583580017, + "learning_rate": 2.0209612416999578e-05, + "loss": 2.8772, + "step": 54022 + }, + { + "epoch": 2.65, + "grad_norm": 0.7410944104194641, + "learning_rate": 2.0204057498254223e-05, + "loss": 2.7401, + "step": 54023 + }, + { + "epoch": 2.65, + "grad_norm": 0.7117852568626404, + "learning_rate": 2.019850331643752e-05, + "loss": 2.8494, + "step": 54024 + }, + { + "epoch": 2.65, + "grad_norm": 0.7461109757423401, + "learning_rate": 2.019294987156409e-05, + "loss": 2.9391, + "step": 54025 + }, + { + "epoch": 2.65, + "grad_norm": 0.769916296005249, + "learning_rate": 2.0187397163648555e-05, + "loss": 2.9171, + "step": 54026 + }, + { + "epoch": 2.65, + "grad_norm": 0.7795796990394592, + "learning_rate": 2.018184519270557e-05, + "loss": 2.7288, + "step": 54027 + }, + { + "epoch": 2.65, + "grad_norm": 0.7641432285308838, + "learning_rate": 2.017629395874979e-05, + "loss": 2.8498, + "step": 54028 + }, + { + "epoch": 2.65, + "grad_norm": 0.7332988977432251, + "learning_rate": 2.0170743461795736e-05, + "loss": 2.8082, + "step": 54029 + }, + { + "epoch": 2.65, + "grad_norm": 0.758630096912384, + "learning_rate": 2.0165193701858163e-05, + "loss": 2.8566, + "step": 54030 + }, + { + "epoch": 2.65, + "grad_norm": 0.7000086307525635, + "learning_rate": 2.015964467895159e-05, + "loss": 2.8143, + "step": 54031 + }, + { + "epoch": 2.65, + "grad_norm": 0.7220397591590881, + "learning_rate": 2.0154096393090647e-05, + "loss": 2.7702, + "step": 54032 + }, + { + "epoch": 2.65, + "grad_norm": 0.7732095718383789, + "learning_rate": 2.0148548844289946e-05, + "loss": 2.875, + "step": 54033 + }, + { + "epoch": 2.65, + "grad_norm": 0.7628332376480103, + "learning_rate": 2.0143002032564116e-05, + "loss": 2.7058, + "step": 54034 + }, + { + "epoch": 2.65, + "grad_norm": 0.7517362833023071, + "learning_rate": 2.0137455957927773e-05, + "loss": 2.7692, + "step": 54035 + }, + { + "epoch": 2.65, + "grad_norm": 0.7574434280395508, + "learning_rate": 2.0131910620395476e-05, + "loss": 2.7894, + "step": 54036 + }, + { + "epoch": 2.65, + "grad_norm": 0.7853326797485352, + "learning_rate": 2.0126366019981878e-05, + "loss": 2.861, + "step": 54037 + }, + { + "epoch": 2.65, + "grad_norm": 0.7405124306678772, + "learning_rate": 2.01208221567016e-05, + "loss": 2.9171, + "step": 54038 + }, + { + "epoch": 2.65, + "grad_norm": 0.7227654457092285, + "learning_rate": 2.0115279030569232e-05, + "loss": 2.873, + "step": 54039 + }, + { + "epoch": 2.65, + "grad_norm": 0.7326869368553162, + "learning_rate": 2.010973664159933e-05, + "loss": 2.8694, + "step": 54040 + }, + { + "epoch": 2.65, + "grad_norm": 0.8230959177017212, + "learning_rate": 2.0104194989806476e-05, + "loss": 2.8616, + "step": 54041 + }, + { + "epoch": 2.65, + "grad_norm": 0.7183669209480286, + "learning_rate": 2.00986540752053e-05, + "loss": 2.9716, + "step": 54042 + }, + { + "epoch": 2.65, + "grad_norm": 0.7086861729621887, + "learning_rate": 2.0093113897810453e-05, + "loss": 2.7061, + "step": 54043 + }, + { + "epoch": 2.65, + "grad_norm": 0.7914672493934631, + "learning_rate": 2.0087574457636424e-05, + "loss": 2.8385, + "step": 54044 + }, + { + "epoch": 2.65, + "grad_norm": 0.7433429956436157, + "learning_rate": 2.00820357546979e-05, + "loss": 2.8681, + "step": 54045 + }, + { + "epoch": 2.65, + "grad_norm": 0.7644243240356445, + "learning_rate": 2.0076497789009437e-05, + "loss": 3.195, + "step": 54046 + }, + { + "epoch": 2.65, + "grad_norm": 0.76406329870224, + "learning_rate": 2.007096056058556e-05, + "loss": 2.6896, + "step": 54047 + }, + { + "epoch": 2.65, + "grad_norm": 0.7108646035194397, + "learning_rate": 2.0065424069440915e-05, + "loss": 2.6872, + "step": 54048 + }, + { + "epoch": 2.65, + "grad_norm": 0.7588872313499451, + "learning_rate": 2.0059888315590068e-05, + "loss": 3.081, + "step": 54049 + }, + { + "epoch": 2.65, + "grad_norm": 0.7359708547592163, + "learning_rate": 2.0054353299047598e-05, + "loss": 2.7153, + "step": 54050 + }, + { + "epoch": 2.65, + "grad_norm": 0.7428367733955383, + "learning_rate": 2.0048819019828068e-05, + "loss": 2.935, + "step": 54051 + }, + { + "epoch": 2.65, + "grad_norm": 0.7662042379379272, + "learning_rate": 2.0043285477946124e-05, + "loss": 3.0226, + "step": 54052 + }, + { + "epoch": 2.65, + "grad_norm": 0.7269078493118286, + "learning_rate": 2.0037752673416264e-05, + "loss": 2.9673, + "step": 54053 + }, + { + "epoch": 2.65, + "grad_norm": 0.7469284534454346, + "learning_rate": 2.0032220606253037e-05, + "loss": 2.9737, + "step": 54054 + }, + { + "epoch": 2.65, + "grad_norm": 0.7175904512405396, + "learning_rate": 2.002668927647113e-05, + "loss": 2.8703, + "step": 54055 + }, + { + "epoch": 2.65, + "grad_norm": 0.7442259192466736, + "learning_rate": 2.002115868408497e-05, + "loss": 2.9262, + "step": 54056 + }, + { + "epoch": 2.65, + "grad_norm": 0.7313393354415894, + "learning_rate": 2.001562882910921e-05, + "loss": 2.946, + "step": 54057 + }, + { + "epoch": 2.65, + "grad_norm": 0.7050204873085022, + "learning_rate": 2.0010099711558468e-05, + "loss": 3.024, + "step": 54058 + }, + { + "epoch": 2.65, + "grad_norm": 0.7282868027687073, + "learning_rate": 2.000457133144714e-05, + "loss": 2.8238, + "step": 54059 + }, + { + "epoch": 2.65, + "grad_norm": 0.7803441882133484, + "learning_rate": 1.9999043688789975e-05, + "loss": 2.9185, + "step": 54060 + }, + { + "epoch": 2.65, + "grad_norm": 0.7263804078102112, + "learning_rate": 1.9993516783601426e-05, + "loss": 2.9197, + "step": 54061 + }, + { + "epoch": 2.65, + "grad_norm": 0.7453001141548157, + "learning_rate": 1.998799061589602e-05, + "loss": 2.7994, + "step": 54062 + }, + { + "epoch": 2.65, + "grad_norm": 0.7197867631912231, + "learning_rate": 1.998246518568841e-05, + "loss": 3.0516, + "step": 54063 + }, + { + "epoch": 2.65, + "grad_norm": 0.7825305461883545, + "learning_rate": 1.997694049299302e-05, + "loss": 2.9953, + "step": 54064 + }, + { + "epoch": 2.65, + "grad_norm": 0.7243884801864624, + "learning_rate": 1.997141653782457e-05, + "loss": 2.828, + "step": 54065 + }, + { + "epoch": 2.65, + "grad_norm": 0.7629521489143372, + "learning_rate": 1.9965893320197447e-05, + "loss": 2.8634, + "step": 54066 + }, + { + "epoch": 2.65, + "grad_norm": 0.7828942537307739, + "learning_rate": 1.9960370840126306e-05, + "loss": 3.0588, + "step": 54067 + }, + { + "epoch": 2.65, + "grad_norm": 0.7585510611534119, + "learning_rate": 1.9954849097625668e-05, + "loss": 2.7942, + "step": 54068 + }, + { + "epoch": 2.65, + "grad_norm": 0.7742347717285156, + "learning_rate": 1.9949328092709994e-05, + "loss": 3.0219, + "step": 54069 + }, + { + "epoch": 2.65, + "grad_norm": 0.736878514289856, + "learning_rate": 1.9943807825393965e-05, + "loss": 2.9672, + "step": 54070 + }, + { + "epoch": 2.65, + "grad_norm": 0.7557036280632019, + "learning_rate": 1.9938288295691974e-05, + "loss": 3.0019, + "step": 54071 + }, + { + "epoch": 2.65, + "grad_norm": 0.7458971738815308, + "learning_rate": 1.993276950361864e-05, + "loss": 2.762, + "step": 54072 + }, + { + "epoch": 2.65, + "grad_norm": 0.7320734858512878, + "learning_rate": 1.9927251449188552e-05, + "loss": 3.0947, + "step": 54073 + }, + { + "epoch": 2.65, + "grad_norm": 0.7172092199325562, + "learning_rate": 1.992173413241617e-05, + "loss": 2.8381, + "step": 54074 + }, + { + "epoch": 2.65, + "grad_norm": 0.707459568977356, + "learning_rate": 1.9916217553316037e-05, + "loss": 2.9832, + "step": 54075 + }, + { + "epoch": 2.65, + "grad_norm": 0.7637947797775269, + "learning_rate": 1.9910701711902656e-05, + "loss": 3.0281, + "step": 54076 + }, + { + "epoch": 2.65, + "grad_norm": 0.7895542979240417, + "learning_rate": 1.990518660819057e-05, + "loss": 3.0742, + "step": 54077 + }, + { + "epoch": 2.65, + "grad_norm": 0.707626223564148, + "learning_rate": 1.9899672242194376e-05, + "loss": 2.8941, + "step": 54078 + }, + { + "epoch": 2.65, + "grad_norm": 0.743711531162262, + "learning_rate": 1.989415861392849e-05, + "loss": 3.0584, + "step": 54079 + }, + { + "epoch": 2.65, + "grad_norm": 0.7679846286773682, + "learning_rate": 1.9888645723407504e-05, + "loss": 2.858, + "step": 54080 + }, + { + "epoch": 2.65, + "grad_norm": 0.7698391675949097, + "learning_rate": 1.9883133570645903e-05, + "loss": 2.6755, + "step": 54081 + }, + { + "epoch": 2.65, + "grad_norm": 0.7134325504302979, + "learning_rate": 1.9877622155658246e-05, + "loss": 2.8351, + "step": 54082 + }, + { + "epoch": 2.65, + "grad_norm": 0.7812037467956543, + "learning_rate": 1.9872111478459018e-05, + "loss": 2.9758, + "step": 54083 + }, + { + "epoch": 2.65, + "grad_norm": 0.7417611479759216, + "learning_rate": 1.9866601539062677e-05, + "loss": 2.8881, + "step": 54084 + }, + { + "epoch": 2.65, + "grad_norm": 0.8070379495620728, + "learning_rate": 1.9861092337483876e-05, + "loss": 2.8155, + "step": 54085 + }, + { + "epoch": 2.65, + "grad_norm": 0.7638986706733704, + "learning_rate": 1.985558387373697e-05, + "loss": 2.7645, + "step": 54086 + }, + { + "epoch": 2.65, + "grad_norm": 0.6919122934341431, + "learning_rate": 1.9850076147836613e-05, + "loss": 2.8189, + "step": 54087 + }, + { + "epoch": 2.65, + "grad_norm": 0.7425715923309326, + "learning_rate": 1.9844569159797165e-05, + "loss": 2.8075, + "step": 54088 + }, + { + "epoch": 2.65, + "grad_norm": 0.7360454201698303, + "learning_rate": 1.9839062909633274e-05, + "loss": 2.898, + "step": 54089 + }, + { + "epoch": 2.65, + "grad_norm": 0.7551466226577759, + "learning_rate": 1.9833557397359366e-05, + "loss": 2.9805, + "step": 54090 + }, + { + "epoch": 2.65, + "grad_norm": 0.7435944080352783, + "learning_rate": 1.98280526229899e-05, + "loss": 2.9451, + "step": 54091 + }, + { + "epoch": 2.65, + "grad_norm": 0.7457761168479919, + "learning_rate": 1.9822548586539487e-05, + "loss": 2.7013, + "step": 54092 + }, + { + "epoch": 2.65, + "grad_norm": 0.8180214762687683, + "learning_rate": 1.981704528802249e-05, + "loss": 3.0459, + "step": 54093 + }, + { + "epoch": 2.65, + "grad_norm": 0.7294940948486328, + "learning_rate": 1.9811542727453467e-05, + "loss": 2.8207, + "step": 54094 + }, + { + "epoch": 2.65, + "grad_norm": 0.742396354675293, + "learning_rate": 1.9806040904846997e-05, + "loss": 2.9505, + "step": 54095 + }, + { + "epoch": 2.65, + "grad_norm": 0.7712141275405884, + "learning_rate": 1.9800539820217474e-05, + "loss": 2.8266, + "step": 54096 + }, + { + "epoch": 2.65, + "grad_norm": 0.7579017877578735, + "learning_rate": 1.979503947357939e-05, + "loss": 2.7108, + "step": 54097 + }, + { + "epoch": 2.65, + "grad_norm": 0.7335031628608704, + "learning_rate": 1.9789539864947224e-05, + "loss": 2.7703, + "step": 54098 + }, + { + "epoch": 2.65, + "grad_norm": 0.7949396967887878, + "learning_rate": 1.9784040994335472e-05, + "loss": 2.902, + "step": 54099 + }, + { + "epoch": 2.65, + "grad_norm": 0.7702135443687439, + "learning_rate": 1.977854286175865e-05, + "loss": 2.8729, + "step": 54100 + }, + { + "epoch": 2.65, + "grad_norm": 0.7160282731056213, + "learning_rate": 1.9773045467231185e-05, + "loss": 2.9078, + "step": 54101 + }, + { + "epoch": 2.65, + "grad_norm": 0.7243478298187256, + "learning_rate": 1.9767548810767663e-05, + "loss": 2.9189, + "step": 54102 + }, + { + "epoch": 2.65, + "grad_norm": 0.7236966490745544, + "learning_rate": 1.9762052892382407e-05, + "loss": 3.0583, + "step": 54103 + }, + { + "epoch": 2.65, + "grad_norm": 0.7192164063453674, + "learning_rate": 1.9756557712090037e-05, + "loss": 2.7622, + "step": 54104 + }, + { + "epoch": 2.65, + "grad_norm": 0.7658694386482239, + "learning_rate": 1.9751063269904945e-05, + "loss": 2.9541, + "step": 54105 + }, + { + "epoch": 2.65, + "grad_norm": 0.7122344374656677, + "learning_rate": 1.9745569565841578e-05, + "loss": 2.9305, + "step": 54106 + }, + { + "epoch": 2.65, + "grad_norm": 0.7958580851554871, + "learning_rate": 1.9740076599914468e-05, + "loss": 2.9211, + "step": 54107 + }, + { + "epoch": 2.65, + "grad_norm": 0.7178252339363098, + "learning_rate": 1.973458437213803e-05, + "loss": 2.8776, + "step": 54108 + }, + { + "epoch": 2.65, + "grad_norm": 0.7540079951286316, + "learning_rate": 1.9729092882526788e-05, + "loss": 2.8094, + "step": 54109 + }, + { + "epoch": 2.65, + "grad_norm": 0.766581118106842, + "learning_rate": 1.9723602131095195e-05, + "loss": 2.7469, + "step": 54110 + }, + { + "epoch": 2.65, + "grad_norm": 0.7104828953742981, + "learning_rate": 1.9718112117857675e-05, + "loss": 2.7799, + "step": 54111 + }, + { + "epoch": 2.65, + "grad_norm": 0.7695900201797485, + "learning_rate": 1.9712622842828718e-05, + "loss": 3.009, + "step": 54112 + }, + { + "epoch": 2.65, + "grad_norm": 0.7981126308441162, + "learning_rate": 1.9707134306022744e-05, + "loss": 2.7478, + "step": 54113 + }, + { + "epoch": 2.65, + "grad_norm": 0.714474618434906, + "learning_rate": 1.9701646507454237e-05, + "loss": 2.9426, + "step": 54114 + }, + { + "epoch": 2.65, + "grad_norm": 0.7519267201423645, + "learning_rate": 1.9696159447137662e-05, + "loss": 2.8552, + "step": 54115 + }, + { + "epoch": 2.65, + "grad_norm": 0.7607603669166565, + "learning_rate": 1.969067312508743e-05, + "loss": 2.8663, + "step": 54116 + }, + { + "epoch": 2.65, + "grad_norm": 0.7680455446243286, + "learning_rate": 1.968518754131807e-05, + "loss": 2.8828, + "step": 54117 + }, + { + "epoch": 2.65, + "grad_norm": 0.7606989145278931, + "learning_rate": 1.9679702695843935e-05, + "loss": 2.7714, + "step": 54118 + }, + { + "epoch": 2.65, + "grad_norm": 0.7711759805679321, + "learning_rate": 1.9674218588679548e-05, + "loss": 2.8402, + "step": 54119 + }, + { + "epoch": 2.65, + "grad_norm": 0.7324842214584351, + "learning_rate": 1.9668735219839327e-05, + "loss": 2.7645, + "step": 54120 + }, + { + "epoch": 2.65, + "grad_norm": 0.7444130182266235, + "learning_rate": 1.966325258933763e-05, + "loss": 2.8317, + "step": 54121 + }, + { + "epoch": 2.65, + "grad_norm": 0.7366726398468018, + "learning_rate": 1.965777069718908e-05, + "loss": 2.8117, + "step": 54122 + }, + { + "epoch": 2.65, + "grad_norm": 0.7467571496963501, + "learning_rate": 1.9652289543407927e-05, + "loss": 3.1177, + "step": 54123 + }, + { + "epoch": 2.65, + "grad_norm": 0.7339689135551453, + "learning_rate": 1.9646809128008668e-05, + "loss": 2.7936, + "step": 54124 + }, + { + "epoch": 2.65, + "grad_norm": 0.812752902507782, + "learning_rate": 1.964132945100585e-05, + "loss": 2.9288, + "step": 54125 + }, + { + "epoch": 2.65, + "grad_norm": 0.777722954750061, + "learning_rate": 1.9635850512413797e-05, + "loss": 2.8443, + "step": 54126 + }, + { + "epoch": 2.65, + "grad_norm": 0.8094779253005981, + "learning_rate": 1.9630372312246933e-05, + "loss": 2.9391, + "step": 54127 + }, + { + "epoch": 2.65, + "grad_norm": 0.7842881083488464, + "learning_rate": 1.962489485051968e-05, + "loss": 2.8474, + "step": 54128 + }, + { + "epoch": 2.65, + "grad_norm": 0.7789468765258789, + "learning_rate": 1.9619418127246524e-05, + "loss": 2.8385, + "step": 54129 + }, + { + "epoch": 2.65, + "grad_norm": 0.8131735920906067, + "learning_rate": 1.961394214244185e-05, + "loss": 3.0392, + "step": 54130 + }, + { + "epoch": 2.65, + "grad_norm": 0.7453126907348633, + "learning_rate": 1.960846689612009e-05, + "loss": 2.8532, + "step": 54131 + }, + { + "epoch": 2.65, + "grad_norm": 0.7671250104904175, + "learning_rate": 1.960299238829569e-05, + "loss": 2.914, + "step": 54132 + }, + { + "epoch": 2.65, + "grad_norm": 0.7735947370529175, + "learning_rate": 1.9597518618983044e-05, + "loss": 2.8704, + "step": 54133 + }, + { + "epoch": 2.65, + "grad_norm": 0.762312650680542, + "learning_rate": 1.9592045588196536e-05, + "loss": 2.7795, + "step": 54134 + }, + { + "epoch": 2.65, + "grad_norm": 0.7702953815460205, + "learning_rate": 1.9586573295950658e-05, + "loss": 2.7177, + "step": 54135 + }, + { + "epoch": 2.65, + "grad_norm": 0.7665062546730042, + "learning_rate": 1.958110174225973e-05, + "loss": 3.0213, + "step": 54136 + }, + { + "epoch": 2.65, + "grad_norm": 0.7448751926422119, + "learning_rate": 1.9575630927138274e-05, + "loss": 2.8374, + "step": 54137 + }, + { + "epoch": 2.65, + "grad_norm": 0.7393977642059326, + "learning_rate": 1.957016085060058e-05, + "loss": 2.8929, + "step": 54138 + }, + { + "epoch": 2.65, + "grad_norm": 0.7659404873847961, + "learning_rate": 1.9564691512661102e-05, + "loss": 2.7063, + "step": 54139 + }, + { + "epoch": 2.65, + "grad_norm": 0.7388503551483154, + "learning_rate": 1.9559222913334293e-05, + "loss": 2.922, + "step": 54140 + }, + { + "epoch": 2.65, + "grad_norm": 0.7335487604141235, + "learning_rate": 1.9553755052634545e-05, + "loss": 2.8808, + "step": 54141 + }, + { + "epoch": 2.65, + "grad_norm": 0.7967286705970764, + "learning_rate": 1.954828793057621e-05, + "loss": 2.7343, + "step": 54142 + }, + { + "epoch": 2.65, + "grad_norm": 0.7063218951225281, + "learning_rate": 1.954282154717368e-05, + "loss": 3.0011, + "step": 54143 + }, + { + "epoch": 2.65, + "grad_norm": 0.7302457690238953, + "learning_rate": 1.9537355902441408e-05, + "loss": 3.0176, + "step": 54144 + }, + { + "epoch": 2.65, + "grad_norm": 0.739165723323822, + "learning_rate": 1.9531890996393785e-05, + "loss": 2.6946, + "step": 54145 + }, + { + "epoch": 2.65, + "grad_norm": 0.7565047144889832, + "learning_rate": 1.9526426829045126e-05, + "loss": 2.8469, + "step": 54146 + }, + { + "epoch": 2.65, + "grad_norm": 0.7552198171615601, + "learning_rate": 1.9520963400409927e-05, + "loss": 2.8518, + "step": 54147 + }, + { + "epoch": 2.65, + "grad_norm": 0.7470796704292297, + "learning_rate": 1.9515500710502542e-05, + "loss": 2.8107, + "step": 54148 + }, + { + "epoch": 2.65, + "grad_norm": 0.764629602432251, + "learning_rate": 1.9510038759337287e-05, + "loss": 2.8671, + "step": 54149 + }, + { + "epoch": 2.65, + "grad_norm": 0.7533672451972961, + "learning_rate": 1.950457754692869e-05, + "loss": 2.9093, + "step": 54150 + }, + { + "epoch": 2.65, + "grad_norm": 0.7616235613822937, + "learning_rate": 1.949911707329097e-05, + "loss": 2.8767, + "step": 54151 + }, + { + "epoch": 2.65, + "grad_norm": 0.7351246476173401, + "learning_rate": 1.949365733843865e-05, + "loss": 2.6712, + "step": 54152 + }, + { + "epoch": 2.65, + "grad_norm": 0.80738765001297, + "learning_rate": 1.948819834238602e-05, + "loss": 2.7892, + "step": 54153 + }, + { + "epoch": 2.65, + "grad_norm": 0.7314664721488953, + "learning_rate": 1.948274008514753e-05, + "loss": 2.8074, + "step": 54154 + }, + { + "epoch": 2.65, + "grad_norm": 0.7750166058540344, + "learning_rate": 1.9477282566737505e-05, + "loss": 2.7988, + "step": 54155 + }, + { + "epoch": 2.65, + "grad_norm": 0.739399790763855, + "learning_rate": 1.9471825787170304e-05, + "loss": 2.8234, + "step": 54156 + }, + { + "epoch": 2.65, + "grad_norm": 0.7659290432929993, + "learning_rate": 1.946636974646034e-05, + "loss": 2.828, + "step": 54157 + }, + { + "epoch": 2.65, + "grad_norm": 0.7248683571815491, + "learning_rate": 1.9460914444621943e-05, + "loss": 2.9416, + "step": 54158 + }, + { + "epoch": 2.65, + "grad_norm": 0.7538133859634399, + "learning_rate": 1.9455459881669566e-05, + "loss": 3.1345, + "step": 54159 + }, + { + "epoch": 2.65, + "grad_norm": 0.7005477547645569, + "learning_rate": 1.945000605761743e-05, + "loss": 2.7204, + "step": 54160 + }, + { + "epoch": 2.65, + "grad_norm": 0.7675425410270691, + "learning_rate": 1.944455297248002e-05, + "loss": 2.7766, + "step": 54161 + }, + { + "epoch": 2.65, + "grad_norm": 0.7757595777511597, + "learning_rate": 1.9439100626271697e-05, + "loss": 2.87, + "step": 54162 + }, + { + "epoch": 2.65, + "grad_norm": 0.7428346276283264, + "learning_rate": 1.943364901900678e-05, + "loss": 2.8913, + "step": 54163 + }, + { + "epoch": 2.65, + "grad_norm": 0.74378901720047, + "learning_rate": 1.942819815069966e-05, + "loss": 2.7817, + "step": 54164 + }, + { + "epoch": 2.65, + "grad_norm": 0.7840437889099121, + "learning_rate": 1.942274802136462e-05, + "loss": 2.8597, + "step": 54165 + }, + { + "epoch": 2.65, + "grad_norm": 0.735499382019043, + "learning_rate": 1.941729863101602e-05, + "loss": 2.672, + "step": 54166 + }, + { + "epoch": 2.65, + "grad_norm": 0.7412136793136597, + "learning_rate": 1.941184997966835e-05, + "loss": 2.9249, + "step": 54167 + }, + { + "epoch": 2.65, + "grad_norm": 0.7294532656669617, + "learning_rate": 1.9406402067335792e-05, + "loss": 2.917, + "step": 54168 + }, + { + "epoch": 2.65, + "grad_norm": 0.7440246343612671, + "learning_rate": 1.9400954894032837e-05, + "loss": 2.9341, + "step": 54169 + }, + { + "epoch": 2.65, + "grad_norm": 0.7838345766067505, + "learning_rate": 1.9395508459773745e-05, + "loss": 2.9521, + "step": 54170 + }, + { + "epoch": 2.65, + "grad_norm": 0.7340986728668213, + "learning_rate": 1.9390062764572865e-05, + "loss": 3.0219, + "step": 54171 + }, + { + "epoch": 2.65, + "grad_norm": 0.7282890677452087, + "learning_rate": 1.938461780844456e-05, + "loss": 2.8559, + "step": 54172 + }, + { + "epoch": 2.65, + "grad_norm": 0.7802963256835938, + "learning_rate": 1.9379173591403142e-05, + "loss": 3.0019, + "step": 54173 + }, + { + "epoch": 2.65, + "grad_norm": 0.7121721506118774, + "learning_rate": 1.9373730113463037e-05, + "loss": 3.0312, + "step": 54174 + }, + { + "epoch": 2.65, + "grad_norm": 0.7408917546272278, + "learning_rate": 1.9368287374638437e-05, + "loss": 3.026, + "step": 54175 + }, + { + "epoch": 2.66, + "grad_norm": 0.7899402976036072, + "learning_rate": 1.9362845374943823e-05, + "loss": 2.8091, + "step": 54176 + }, + { + "epoch": 2.66, + "grad_norm": 0.7442259192466736, + "learning_rate": 1.935740411439346e-05, + "loss": 2.894, + "step": 54177 + }, + { + "epoch": 2.66, + "grad_norm": 0.8089560270309448, + "learning_rate": 1.9351963593001664e-05, + "loss": 2.7595, + "step": 54178 + }, + { + "epoch": 2.66, + "grad_norm": 0.7835502624511719, + "learning_rate": 1.9346523810782788e-05, + "loss": 2.8599, + "step": 54179 + }, + { + "epoch": 2.66, + "grad_norm": 0.7364153265953064, + "learning_rate": 1.9341084767751158e-05, + "loss": 2.6572, + "step": 54180 + }, + { + "epoch": 2.66, + "grad_norm": 1.0555264949798584, + "learning_rate": 1.933564646392106e-05, + "loss": 2.8671, + "step": 54181 + }, + { + "epoch": 2.66, + "grad_norm": 0.7606764435768127, + "learning_rate": 1.933020889930692e-05, + "loss": 2.974, + "step": 54182 + }, + { + "epoch": 2.66, + "grad_norm": 0.7183963656425476, + "learning_rate": 1.9324772073922922e-05, + "loss": 2.8173, + "step": 54183 + }, + { + "epoch": 2.66, + "grad_norm": 0.7525346875190735, + "learning_rate": 1.9319335987783523e-05, + "loss": 2.8566, + "step": 54184 + }, + { + "epoch": 2.66, + "grad_norm": 0.7471001744270325, + "learning_rate": 1.9313900640902978e-05, + "loss": 2.8165, + "step": 54185 + }, + { + "epoch": 2.66, + "grad_norm": 0.7534440159797668, + "learning_rate": 1.9308466033295544e-05, + "loss": 2.922, + "step": 54186 + }, + { + "epoch": 2.66, + "grad_norm": 0.7355985045433044, + "learning_rate": 1.930303216497564e-05, + "loss": 2.7781, + "step": 54187 + }, + { + "epoch": 2.66, + "grad_norm": 0.7623375058174133, + "learning_rate": 1.9297599035957457e-05, + "loss": 2.872, + "step": 54188 + }, + { + "epoch": 2.66, + "grad_norm": 0.7745537161827087, + "learning_rate": 1.9292166646255447e-05, + "loss": 2.624, + "step": 54189 + }, + { + "epoch": 2.66, + "grad_norm": 0.7441405653953552, + "learning_rate": 1.92867349958838e-05, + "loss": 2.7594, + "step": 54190 + }, + { + "epoch": 2.66, + "grad_norm": 0.7160558104515076, + "learning_rate": 1.9281304084856942e-05, + "loss": 2.8772, + "step": 54191 + }, + { + "epoch": 2.66, + "grad_norm": 0.7814651727676392, + "learning_rate": 1.9275873913189055e-05, + "loss": 2.7706, + "step": 54192 + }, + { + "epoch": 2.66, + "grad_norm": 0.7084113359451294, + "learning_rate": 1.9270444480894463e-05, + "loss": 3.0227, + "step": 54193 + }, + { + "epoch": 2.66, + "grad_norm": 0.8035101890563965, + "learning_rate": 1.9265015787987558e-05, + "loss": 2.9222, + "step": 54194 + }, + { + "epoch": 2.66, + "grad_norm": 0.7509192824363708, + "learning_rate": 1.9259587834482493e-05, + "loss": 3.0245, + "step": 54195 + }, + { + "epoch": 2.66, + "grad_norm": 0.7465216517448425, + "learning_rate": 1.9254160620393656e-05, + "loss": 2.9304, + "step": 54196 + }, + { + "epoch": 2.66, + "grad_norm": 0.7854889035224915, + "learning_rate": 1.92487341457354e-05, + "loss": 2.6707, + "step": 54197 + }, + { + "epoch": 2.66, + "grad_norm": 0.7187995314598083, + "learning_rate": 1.924330841052192e-05, + "loss": 2.899, + "step": 54198 + }, + { + "epoch": 2.66, + "grad_norm": 0.7536890506744385, + "learning_rate": 1.923788341476753e-05, + "loss": 2.8897, + "step": 54199 + }, + { + "epoch": 2.66, + "grad_norm": 0.7636274695396423, + "learning_rate": 1.9232459158486492e-05, + "loss": 2.8353, + "step": 54200 + }, + { + "epoch": 2.66, + "grad_norm": 0.7261160612106323, + "learning_rate": 1.9227035641693123e-05, + "loss": 2.7868, + "step": 54201 + }, + { + "epoch": 2.66, + "grad_norm": 0.7559409141540527, + "learning_rate": 1.9221612864401716e-05, + "loss": 3.1095, + "step": 54202 + }, + { + "epoch": 2.66, + "grad_norm": 0.8080114722251892, + "learning_rate": 1.9216190826626555e-05, + "loss": 3.0153, + "step": 54203 + }, + { + "epoch": 2.66, + "grad_norm": 0.7144877314567566, + "learning_rate": 1.9210769528381898e-05, + "loss": 3.012, + "step": 54204 + }, + { + "epoch": 2.66, + "grad_norm": 0.700718343257904, + "learning_rate": 1.9205348969682032e-05, + "loss": 2.8758, + "step": 54205 + }, + { + "epoch": 2.66, + "grad_norm": 0.7655426263809204, + "learning_rate": 1.919992915054125e-05, + "loss": 3.0361, + "step": 54206 + }, + { + "epoch": 2.66, + "grad_norm": 0.7685981392860413, + "learning_rate": 1.9194510070973834e-05, + "loss": 2.9705, + "step": 54207 + }, + { + "epoch": 2.66, + "grad_norm": 0.7193219661712646, + "learning_rate": 1.9189091730994012e-05, + "loss": 2.799, + "step": 54208 + }, + { + "epoch": 2.66, + "grad_norm": 0.7027367353439331, + "learning_rate": 1.91836741306161e-05, + "loss": 2.9511, + "step": 54209 + }, + { + "epoch": 2.66, + "grad_norm": 0.744439423084259, + "learning_rate": 1.917825726985429e-05, + "loss": 2.788, + "step": 54210 + }, + { + "epoch": 2.66, + "grad_norm": 0.7260327339172363, + "learning_rate": 1.9172841148722905e-05, + "loss": 2.7213, + "step": 54211 + }, + { + "epoch": 2.66, + "grad_norm": 0.7319767475128174, + "learning_rate": 1.9167425767236268e-05, + "loss": 3.0538, + "step": 54212 + }, + { + "epoch": 2.66, + "grad_norm": 0.7671077251434326, + "learning_rate": 1.9162011125408595e-05, + "loss": 2.8832, + "step": 54213 + }, + { + "epoch": 2.66, + "grad_norm": 0.8228315711021423, + "learning_rate": 1.915659722325411e-05, + "loss": 2.8539, + "step": 54214 + }, + { + "epoch": 2.66, + "grad_norm": 0.7222673296928406, + "learning_rate": 1.9151184060787072e-05, + "loss": 2.9277, + "step": 54215 + }, + { + "epoch": 2.66, + "grad_norm": 0.7548918724060059, + "learning_rate": 1.9145771638021767e-05, + "loss": 2.8345, + "step": 54216 + }, + { + "epoch": 2.66, + "grad_norm": 0.7558291554450989, + "learning_rate": 1.9140359954972485e-05, + "loss": 2.799, + "step": 54217 + }, + { + "epoch": 2.66, + "grad_norm": 0.7385692000389099, + "learning_rate": 1.913494901165341e-05, + "loss": 2.9751, + "step": 54218 + }, + { + "epoch": 2.66, + "grad_norm": 0.7456986904144287, + "learning_rate": 1.9129538808078838e-05, + "loss": 3.0317, + "step": 54219 + }, + { + "epoch": 2.66, + "grad_norm": 0.7515402436256409, + "learning_rate": 1.9124129344262984e-05, + "loss": 3.0887, + "step": 54220 + }, + { + "epoch": 2.66, + "grad_norm": 0.7937130928039551, + "learning_rate": 1.9118720620220172e-05, + "loss": 2.9111, + "step": 54221 + }, + { + "epoch": 2.66, + "grad_norm": 0.8028191924095154, + "learning_rate": 1.9113312635964593e-05, + "loss": 2.9427, + "step": 54222 + }, + { + "epoch": 2.66, + "grad_norm": 0.7735779881477356, + "learning_rate": 1.9107905391510437e-05, + "loss": 2.6901, + "step": 54223 + }, + { + "epoch": 2.66, + "grad_norm": 0.7522846460342407, + "learning_rate": 1.910249888687202e-05, + "loss": 2.7894, + "step": 54224 + }, + { + "epoch": 2.66, + "grad_norm": 0.7548792362213135, + "learning_rate": 1.9097093122063534e-05, + "loss": 2.8319, + "step": 54225 + }, + { + "epoch": 2.66, + "grad_norm": 0.7677998542785645, + "learning_rate": 1.9091688097099266e-05, + "loss": 2.9, + "step": 54226 + }, + { + "epoch": 2.66, + "grad_norm": 0.752396821975708, + "learning_rate": 1.908628381199344e-05, + "loss": 3.1708, + "step": 54227 + }, + { + "epoch": 2.66, + "grad_norm": 0.7591312527656555, + "learning_rate": 1.9080880266760312e-05, + "loss": 2.8663, + "step": 54228 + }, + { + "epoch": 2.66, + "grad_norm": 0.7363387942314148, + "learning_rate": 1.9075477461414067e-05, + "loss": 2.9872, + "step": 54229 + }, + { + "epoch": 2.66, + "grad_norm": 0.7834699153900146, + "learning_rate": 1.90700753959689e-05, + "loss": 2.6909, + "step": 54230 + }, + { + "epoch": 2.66, + "grad_norm": 0.7395800352096558, + "learning_rate": 1.906467407043909e-05, + "loss": 2.9373, + "step": 54231 + }, + { + "epoch": 2.66, + "grad_norm": 0.7479794025421143, + "learning_rate": 1.90592734848389e-05, + "loss": 2.7438, + "step": 54232 + }, + { + "epoch": 2.66, + "grad_norm": 0.7837538719177246, + "learning_rate": 1.9053873639182484e-05, + "loss": 2.7192, + "step": 54233 + }, + { + "epoch": 2.66, + "grad_norm": 0.8077936768531799, + "learning_rate": 1.9048474533484126e-05, + "loss": 2.7285, + "step": 54234 + }, + { + "epoch": 2.66, + "grad_norm": 0.7308048605918884, + "learning_rate": 1.9043076167758055e-05, + "loss": 3.0173, + "step": 54235 + }, + { + "epoch": 2.66, + "grad_norm": 0.7778046131134033, + "learning_rate": 1.9037678542018418e-05, + "loss": 2.8399, + "step": 54236 + }, + { + "epoch": 2.66, + "grad_norm": 0.7745692133903503, + "learning_rate": 1.9032281656279412e-05, + "loss": 2.9058, + "step": 54237 + }, + { + "epoch": 2.66, + "grad_norm": 0.7720127105712891, + "learning_rate": 1.9026885510555324e-05, + "loss": 2.9293, + "step": 54238 + }, + { + "epoch": 2.66, + "grad_norm": 0.7536343932151794, + "learning_rate": 1.902149010486037e-05, + "loss": 2.8993, + "step": 54239 + }, + { + "epoch": 2.66, + "grad_norm": 0.8127491474151611, + "learning_rate": 1.901609543920868e-05, + "loss": 2.9735, + "step": 54240 + }, + { + "epoch": 2.66, + "grad_norm": 0.7343817949295044, + "learning_rate": 1.9010701513614603e-05, + "loss": 2.9745, + "step": 54241 + }, + { + "epoch": 2.66, + "grad_norm": 0.7082345485687256, + "learning_rate": 1.9005308328092196e-05, + "loss": 3.0584, + "step": 54242 + }, + { + "epoch": 2.66, + "grad_norm": 0.7429682612419128, + "learning_rate": 1.8999915882655746e-05, + "loss": 2.7716, + "step": 54243 + }, + { + "epoch": 2.66, + "grad_norm": 0.7653400897979736, + "learning_rate": 1.899452417731948e-05, + "loss": 2.8173, + "step": 54244 + }, + { + "epoch": 2.66, + "grad_norm": 0.7767915725708008, + "learning_rate": 1.8989133212097485e-05, + "loss": 2.7113, + "step": 54245 + }, + { + "epoch": 2.66, + "grad_norm": 0.7101877331733704, + "learning_rate": 1.8983742987004113e-05, + "loss": 2.7319, + "step": 54246 + }, + { + "epoch": 2.66, + "grad_norm": 0.7132937908172607, + "learning_rate": 1.8978353502053388e-05, + "loss": 2.6421, + "step": 54247 + }, + { + "epoch": 2.66, + "grad_norm": 0.7609696984291077, + "learning_rate": 1.8972964757259633e-05, + "loss": 3.0005, + "step": 54248 + }, + { + "epoch": 2.66, + "grad_norm": 0.7206118106842041, + "learning_rate": 1.8967576752637036e-05, + "loss": 3.1412, + "step": 54249 + }, + { + "epoch": 2.66, + "grad_norm": 0.7709072828292847, + "learning_rate": 1.8962189488199754e-05, + "loss": 2.8475, + "step": 54250 + }, + { + "epoch": 2.66, + "grad_norm": 0.7561743259429932, + "learning_rate": 1.895680296396197e-05, + "loss": 2.7725, + "step": 54251 + }, + { + "epoch": 2.66, + "grad_norm": 0.7318551540374756, + "learning_rate": 1.8951417179937844e-05, + "loss": 2.8117, + "step": 54252 + }, + { + "epoch": 2.66, + "grad_norm": 0.7357078790664673, + "learning_rate": 1.89460321361416e-05, + "loss": 2.6899, + "step": 54253 + }, + { + "epoch": 2.66, + "grad_norm": 0.7411123514175415, + "learning_rate": 1.8940647832587452e-05, + "loss": 2.9539, + "step": 54254 + }, + { + "epoch": 2.66, + "grad_norm": 0.7445243000984192, + "learning_rate": 1.8935264269289497e-05, + "loss": 2.8118, + "step": 54255 + }, + { + "epoch": 2.66, + "grad_norm": 0.7355711460113525, + "learning_rate": 1.892988144626202e-05, + "loss": 2.8894, + "step": 54256 + }, + { + "epoch": 2.66, + "grad_norm": 0.7821559906005859, + "learning_rate": 1.892449936351914e-05, + "loss": 2.8614, + "step": 54257 + }, + { + "epoch": 2.66, + "grad_norm": 0.7362728714942932, + "learning_rate": 1.8919118021074985e-05, + "loss": 3.1394, + "step": 54258 + }, + { + "epoch": 2.66, + "grad_norm": 0.7443636655807495, + "learning_rate": 1.891373741894384e-05, + "loss": 2.877, + "step": 54259 + }, + { + "epoch": 2.66, + "grad_norm": 0.745068371295929, + "learning_rate": 1.8908357557139732e-05, + "loss": 2.8646, + "step": 54260 + }, + { + "epoch": 2.66, + "grad_norm": 0.7140920162200928, + "learning_rate": 1.890297843567701e-05, + "loss": 2.768, + "step": 54261 + }, + { + "epoch": 2.66, + "grad_norm": 0.7033805847167969, + "learning_rate": 1.889760005456966e-05, + "loss": 2.8209, + "step": 54262 + }, + { + "epoch": 2.66, + "grad_norm": 0.7535343766212463, + "learning_rate": 1.8892222413831948e-05, + "loss": 2.92, + "step": 54263 + }, + { + "epoch": 2.66, + "grad_norm": 0.7832685112953186, + "learning_rate": 1.8886845513478054e-05, + "loss": 2.8499, + "step": 54264 + }, + { + "epoch": 2.66, + "grad_norm": 0.7332696914672852, + "learning_rate": 1.8881469353522107e-05, + "loss": 3.1087, + "step": 54265 + }, + { + "epoch": 2.66, + "grad_norm": 0.727424144744873, + "learning_rate": 1.8876093933978285e-05, + "loss": 3.0913, + "step": 54266 + }, + { + "epoch": 2.66, + "grad_norm": 0.7173473834991455, + "learning_rate": 1.8870719254860688e-05, + "loss": 3.0397, + "step": 54267 + }, + { + "epoch": 2.66, + "grad_norm": 0.7304912805557251, + "learning_rate": 1.88653453161835e-05, + "loss": 2.8918, + "step": 54268 + }, + { + "epoch": 2.66, + "grad_norm": 0.8100420832633972, + "learning_rate": 1.885997211796094e-05, + "loss": 2.7616, + "step": 54269 + }, + { + "epoch": 2.66, + "grad_norm": 0.8588573336601257, + "learning_rate": 1.885459966020704e-05, + "loss": 2.7817, + "step": 54270 + }, + { + "epoch": 2.66, + "grad_norm": 0.7639005780220032, + "learning_rate": 1.8849227942936074e-05, + "loss": 3.0441, + "step": 54271 + }, + { + "epoch": 2.66, + "grad_norm": 0.7512505054473877, + "learning_rate": 1.884385696616214e-05, + "loss": 2.9208, + "step": 54272 + }, + { + "epoch": 2.66, + "grad_norm": 0.7321940660476685, + "learning_rate": 1.8838486729899327e-05, + "loss": 2.8442, + "step": 54273 + }, + { + "epoch": 2.66, + "grad_norm": 0.7625482678413391, + "learning_rate": 1.8833117234161886e-05, + "loss": 2.9836, + "step": 54274 + }, + { + "epoch": 2.66, + "grad_norm": 0.7443647384643555, + "learning_rate": 1.8827748478963844e-05, + "loss": 2.8452, + "step": 54275 + }, + { + "epoch": 2.66, + "grad_norm": 0.7935992479324341, + "learning_rate": 1.882238046431942e-05, + "loss": 2.7888, + "step": 54276 + }, + { + "epoch": 2.66, + "grad_norm": 0.7821605205535889, + "learning_rate": 1.8817013190242734e-05, + "loss": 2.7943, + "step": 54277 + }, + { + "epoch": 2.66, + "grad_norm": 0.7425271272659302, + "learning_rate": 1.8811646656747948e-05, + "loss": 3.0639, + "step": 54278 + }, + { + "epoch": 2.66, + "grad_norm": 0.702383816242218, + "learning_rate": 1.8806280863849145e-05, + "loss": 2.6989, + "step": 54279 + }, + { + "epoch": 2.66, + "grad_norm": 0.7176997661590576, + "learning_rate": 1.880091581156048e-05, + "loss": 2.9766, + "step": 54280 + }, + { + "epoch": 2.66, + "grad_norm": 0.7591765522956848, + "learning_rate": 1.879555149989611e-05, + "loss": 2.663, + "step": 54281 + }, + { + "epoch": 2.66, + "grad_norm": 0.7460857033729553, + "learning_rate": 1.8790187928870092e-05, + "loss": 2.9084, + "step": 54282 + }, + { + "epoch": 2.66, + "grad_norm": 0.7594284415245056, + "learning_rate": 1.878482509849658e-05, + "loss": 2.7448, + "step": 54283 + }, + { + "epoch": 2.66, + "grad_norm": 0.7766778469085693, + "learning_rate": 1.8779463008789797e-05, + "loss": 2.9685, + "step": 54284 + }, + { + "epoch": 2.66, + "grad_norm": 0.7691056132316589, + "learning_rate": 1.877410165976373e-05, + "loss": 3.0542, + "step": 54285 + }, + { + "epoch": 2.66, + "grad_norm": 0.7344174385070801, + "learning_rate": 1.8768741051432567e-05, + "loss": 3.0607, + "step": 54286 + }, + { + "epoch": 2.66, + "grad_norm": 0.7203677892684937, + "learning_rate": 1.8763381183810468e-05, + "loss": 2.8125, + "step": 54287 + }, + { + "epoch": 2.66, + "grad_norm": 0.7336849570274353, + "learning_rate": 1.8758022056911414e-05, + "loss": 2.8562, + "step": 54288 + }, + { + "epoch": 2.66, + "grad_norm": 0.7745854258537292, + "learning_rate": 1.875266367074967e-05, + "loss": 2.9174, + "step": 54289 + }, + { + "epoch": 2.66, + "grad_norm": 0.7709341049194336, + "learning_rate": 1.8747306025339247e-05, + "loss": 2.6841, + "step": 54290 + }, + { + "epoch": 2.66, + "grad_norm": 0.8152937293052673, + "learning_rate": 1.874194912069431e-05, + "loss": 2.9381, + "step": 54291 + }, + { + "epoch": 2.66, + "grad_norm": 0.7402138710021973, + "learning_rate": 1.873659295682891e-05, + "loss": 2.6456, + "step": 54292 + }, + { + "epoch": 2.66, + "grad_norm": 0.7518551349639893, + "learning_rate": 1.8731237533757272e-05, + "loss": 2.8282, + "step": 54293 + }, + { + "epoch": 2.66, + "grad_norm": 0.7548336386680603, + "learning_rate": 1.872588285149338e-05, + "loss": 2.7088, + "step": 54294 + }, + { + "epoch": 2.66, + "grad_norm": 0.7288030982017517, + "learning_rate": 1.8720528910051356e-05, + "loss": 2.9081, + "step": 54295 + }, + { + "epoch": 2.66, + "grad_norm": 0.7580604553222656, + "learning_rate": 1.8715175709445362e-05, + "loss": 2.9925, + "step": 54296 + }, + { + "epoch": 2.66, + "grad_norm": 0.7717550992965698, + "learning_rate": 1.8709823249689414e-05, + "loss": 2.8237, + "step": 54297 + }, + { + "epoch": 2.66, + "grad_norm": 0.706786572933197, + "learning_rate": 1.870447153079767e-05, + "loss": 2.9454, + "step": 54298 + }, + { + "epoch": 2.66, + "grad_norm": 0.7198885083198547, + "learning_rate": 1.8699120552784252e-05, + "loss": 2.728, + "step": 54299 + }, + { + "epoch": 2.66, + "grad_norm": 0.7398534417152405, + "learning_rate": 1.8693770315663215e-05, + "loss": 3.0013, + "step": 54300 + }, + { + "epoch": 2.66, + "grad_norm": 0.7463058829307556, + "learning_rate": 1.8688420819448645e-05, + "loss": 3.0623, + "step": 54301 + }, + { + "epoch": 2.66, + "grad_norm": 0.7530163526535034, + "learning_rate": 1.86830720641546e-05, + "loss": 2.8843, + "step": 54302 + }, + { + "epoch": 2.66, + "grad_norm": 0.7228906750679016, + "learning_rate": 1.8677724049795174e-05, + "loss": 3.0058, + "step": 54303 + }, + { + "epoch": 2.66, + "grad_norm": 0.759463906288147, + "learning_rate": 1.8672376776384547e-05, + "loss": 2.649, + "step": 54304 + }, + { + "epoch": 2.66, + "grad_norm": 0.7761176824569702, + "learning_rate": 1.8667030243936708e-05, + "loss": 2.94, + "step": 54305 + }, + { + "epoch": 2.66, + "grad_norm": 0.7421549558639526, + "learning_rate": 1.8661684452465787e-05, + "loss": 2.8615, + "step": 54306 + }, + { + "epoch": 2.66, + "grad_norm": 0.7677056193351746, + "learning_rate": 1.8656339401985797e-05, + "loss": 2.7058, + "step": 54307 + }, + { + "epoch": 2.66, + "grad_norm": 0.7426798939704895, + "learning_rate": 1.8650995092510934e-05, + "loss": 2.8782, + "step": 54308 + }, + { + "epoch": 2.66, + "grad_norm": 0.7160890698432922, + "learning_rate": 1.8645651524055184e-05, + "loss": 2.8999, + "step": 54309 + }, + { + "epoch": 2.66, + "grad_norm": 0.7567172646522522, + "learning_rate": 1.86403086966326e-05, + "loss": 2.9189, + "step": 54310 + }, + { + "epoch": 2.66, + "grad_norm": 0.7990744113922119, + "learning_rate": 1.863496661025734e-05, + "loss": 2.9437, + "step": 54311 + }, + { + "epoch": 2.66, + "grad_norm": 0.7247997522354126, + "learning_rate": 1.862962526494336e-05, + "loss": 2.8656, + "step": 54312 + }, + { + "epoch": 2.66, + "grad_norm": 0.7634393572807312, + "learning_rate": 1.8624284660704815e-05, + "loss": 3.0048, + "step": 54313 + }, + { + "epoch": 2.66, + "grad_norm": 0.7473614811897278, + "learning_rate": 1.8618944797555824e-05, + "loss": 3.1732, + "step": 54314 + }, + { + "epoch": 2.66, + "grad_norm": 0.7102734446525574, + "learning_rate": 1.8613605675510345e-05, + "loss": 2.7127, + "step": 54315 + }, + { + "epoch": 2.66, + "grad_norm": 0.7932694554328918, + "learning_rate": 1.860826729458247e-05, + "loss": 2.9487, + "step": 54316 + }, + { + "epoch": 2.66, + "grad_norm": 0.7079724669456482, + "learning_rate": 1.8602929654786247e-05, + "loss": 2.798, + "step": 54317 + }, + { + "epoch": 2.66, + "grad_norm": 0.7377166748046875, + "learning_rate": 1.859759275613577e-05, + "loss": 2.9455, + "step": 54318 + }, + { + "epoch": 2.66, + "grad_norm": 0.8409765958786011, + "learning_rate": 1.859225659864506e-05, + "loss": 2.8922, + "step": 54319 + }, + { + "epoch": 2.66, + "grad_norm": 0.7548701167106628, + "learning_rate": 1.8586921182328173e-05, + "loss": 2.875, + "step": 54320 + }, + { + "epoch": 2.66, + "grad_norm": 0.7892856001853943, + "learning_rate": 1.858158650719923e-05, + "loss": 3.0358, + "step": 54321 + }, + { + "epoch": 2.66, + "grad_norm": 0.7447632551193237, + "learning_rate": 1.8576252573272155e-05, + "loss": 3.0135, + "step": 54322 + }, + { + "epoch": 2.66, + "grad_norm": 0.7343568205833435, + "learning_rate": 1.85709193805612e-05, + "loss": 2.9174, + "step": 54323 + }, + { + "epoch": 2.66, + "grad_norm": 0.7546588778495789, + "learning_rate": 1.8565586929080156e-05, + "loss": 3.1323, + "step": 54324 + }, + { + "epoch": 2.66, + "grad_norm": 0.7940450310707092, + "learning_rate": 1.8560255218843213e-05, + "loss": 2.7931, + "step": 54325 + }, + { + "epoch": 2.66, + "grad_norm": 0.7970552444458008, + "learning_rate": 1.8554924249864423e-05, + "loss": 3.0955, + "step": 54326 + }, + { + "epoch": 2.66, + "grad_norm": 0.7288427352905273, + "learning_rate": 1.8549594022157775e-05, + "loss": 2.8776, + "step": 54327 + }, + { + "epoch": 2.66, + "grad_norm": 0.8218358159065247, + "learning_rate": 1.8544264535737364e-05, + "loss": 2.8839, + "step": 54328 + }, + { + "epoch": 2.66, + "grad_norm": 0.7489563226699829, + "learning_rate": 1.8538935790617136e-05, + "loss": 2.6361, + "step": 54329 + }, + { + "epoch": 2.66, + "grad_norm": 0.7439214587211609, + "learning_rate": 1.8533607786811257e-05, + "loss": 2.7068, + "step": 54330 + }, + { + "epoch": 2.66, + "grad_norm": 0.7576811909675598, + "learning_rate": 1.8528280524333637e-05, + "loss": 2.7559, + "step": 54331 + }, + { + "epoch": 2.66, + "grad_norm": 0.7353628873825073, + "learning_rate": 1.8522954003198342e-05, + "loss": 2.9782, + "step": 54332 + }, + { + "epoch": 2.66, + "grad_norm": 0.7459601163864136, + "learning_rate": 1.851762822341949e-05, + "loss": 2.9133, + "step": 54333 + }, + { + "epoch": 2.66, + "grad_norm": 0.7878472208976746, + "learning_rate": 1.8512303185010936e-05, + "loss": 2.8641, + "step": 54334 + }, + { + "epoch": 2.66, + "grad_norm": 0.7898440957069397, + "learning_rate": 1.8506978887986835e-05, + "loss": 2.9113, + "step": 54335 + }, + { + "epoch": 2.66, + "grad_norm": 0.7459374666213989, + "learning_rate": 1.850165533236121e-05, + "loss": 3.0018, + "step": 54336 + }, + { + "epoch": 2.66, + "grad_norm": 0.7519286870956421, + "learning_rate": 1.849633251814805e-05, + "loss": 2.7782, + "step": 54337 + }, + { + "epoch": 2.66, + "grad_norm": 0.7953622341156006, + "learning_rate": 1.8491010445361376e-05, + "loss": 2.8511, + "step": 54338 + }, + { + "epoch": 2.66, + "grad_norm": 0.7520739436149597, + "learning_rate": 1.8485689114015146e-05, + "loss": 2.993, + "step": 54339 + }, + { + "epoch": 2.66, + "grad_norm": 0.7880058884620667, + "learning_rate": 1.8480368524123445e-05, + "loss": 2.6661, + "step": 54340 + }, + { + "epoch": 2.66, + "grad_norm": 0.7255048751831055, + "learning_rate": 1.84750486757003e-05, + "loss": 2.8142, + "step": 54341 + }, + { + "epoch": 2.66, + "grad_norm": 0.7529579997062683, + "learning_rate": 1.8469729568759662e-05, + "loss": 2.6264, + "step": 54342 + }, + { + "epoch": 2.66, + "grad_norm": 0.7077673077583313, + "learning_rate": 1.8464411203315655e-05, + "loss": 3.071, + "step": 54343 + }, + { + "epoch": 2.66, + "grad_norm": 0.7397609949111938, + "learning_rate": 1.84590935793821e-05, + "loss": 2.9625, + "step": 54344 + }, + { + "epoch": 2.66, + "grad_norm": 0.7642784714698792, + "learning_rate": 1.8453776696973187e-05, + "loss": 2.8608, + "step": 54345 + }, + { + "epoch": 2.66, + "grad_norm": 0.7922868728637695, + "learning_rate": 1.844846055610284e-05, + "loss": 2.6326, + "step": 54346 + }, + { + "epoch": 2.66, + "grad_norm": 0.7658751606941223, + "learning_rate": 1.8443145156785012e-05, + "loss": 2.8669, + "step": 54347 + }, + { + "epoch": 2.66, + "grad_norm": 0.7865391969680786, + "learning_rate": 1.8437830499033822e-05, + "loss": 2.7268, + "step": 54348 + }, + { + "epoch": 2.66, + "grad_norm": 0.7619051337242126, + "learning_rate": 1.843251658286313e-05, + "loss": 2.6806, + "step": 54349 + }, + { + "epoch": 2.66, + "grad_norm": 0.7652508616447449, + "learning_rate": 1.8427203408286994e-05, + "loss": 2.8923, + "step": 54350 + }, + { + "epoch": 2.66, + "grad_norm": 0.718989372253418, + "learning_rate": 1.8421890975319465e-05, + "loss": 3.2317, + "step": 54351 + }, + { + "epoch": 2.66, + "grad_norm": 0.757768988609314, + "learning_rate": 1.84165792839745e-05, + "loss": 2.825, + "step": 54352 + }, + { + "epoch": 2.66, + "grad_norm": 0.7357819676399231, + "learning_rate": 1.841126833426605e-05, + "loss": 3.0808, + "step": 54353 + }, + { + "epoch": 2.66, + "grad_norm": 0.7870209217071533, + "learning_rate": 1.840595812620811e-05, + "loss": 2.8403, + "step": 54354 + }, + { + "epoch": 2.66, + "grad_norm": 0.7332582473754883, + "learning_rate": 1.8400648659814664e-05, + "loss": 2.9084, + "step": 54355 + }, + { + "epoch": 2.66, + "grad_norm": 0.7562378644943237, + "learning_rate": 1.8395339935099772e-05, + "loss": 2.9404, + "step": 54356 + }, + { + "epoch": 2.66, + "grad_norm": 0.7957108020782471, + "learning_rate": 1.8390031952077287e-05, + "loss": 2.9112, + "step": 54357 + }, + { + "epoch": 2.66, + "grad_norm": 0.7108379006385803, + "learning_rate": 1.8384724710761334e-05, + "loss": 2.8825, + "step": 54358 + }, + { + "epoch": 2.66, + "grad_norm": 0.7623836398124695, + "learning_rate": 1.8379418211165796e-05, + "loss": 3.0071, + "step": 54359 + }, + { + "epoch": 2.66, + "grad_norm": 0.7817762494087219, + "learning_rate": 1.8374112453304634e-05, + "loss": 2.6759, + "step": 54360 + }, + { + "epoch": 2.66, + "grad_norm": 0.8142674565315247, + "learning_rate": 1.83688074371919e-05, + "loss": 2.9088, + "step": 54361 + }, + { + "epoch": 2.66, + "grad_norm": 0.7520138621330261, + "learning_rate": 1.836350316284152e-05, + "loss": 2.876, + "step": 54362 + }, + { + "epoch": 2.66, + "grad_norm": 0.7290118336677551, + "learning_rate": 1.835819963026748e-05, + "loss": 2.8468, + "step": 54363 + }, + { + "epoch": 2.66, + "grad_norm": 0.7808724641799927, + "learning_rate": 1.83528968394837e-05, + "loss": 2.9203, + "step": 54364 + }, + { + "epoch": 2.66, + "grad_norm": 0.7678559422492981, + "learning_rate": 1.834759479050417e-05, + "loss": 2.9192, + "step": 54365 + }, + { + "epoch": 2.66, + "grad_norm": 0.7148266434669495, + "learning_rate": 1.834229348334292e-05, + "loss": 2.7605, + "step": 54366 + }, + { + "epoch": 2.66, + "grad_norm": 0.714754045009613, + "learning_rate": 1.8336992918013894e-05, + "loss": 2.8237, + "step": 54367 + }, + { + "epoch": 2.66, + "grad_norm": 0.7212704420089722, + "learning_rate": 1.8331693094530985e-05, + "loss": 2.8664, + "step": 54368 + }, + { + "epoch": 2.66, + "grad_norm": 0.7763723731040955, + "learning_rate": 1.8326394012908153e-05, + "loss": 2.7022, + "step": 54369 + }, + { + "epoch": 2.66, + "grad_norm": 0.7871028780937195, + "learning_rate": 1.8321095673159414e-05, + "loss": 2.9555, + "step": 54370 + }, + { + "epoch": 2.66, + "grad_norm": 0.7615554928779602, + "learning_rate": 1.8315798075298728e-05, + "loss": 2.8821, + "step": 54371 + }, + { + "epoch": 2.66, + "grad_norm": 0.7369310259819031, + "learning_rate": 1.8310501219339947e-05, + "loss": 2.7315, + "step": 54372 + }, + { + "epoch": 2.66, + "grad_norm": 0.7455233931541443, + "learning_rate": 1.8305205105297162e-05, + "loss": 3.1036, + "step": 54373 + }, + { + "epoch": 2.66, + "grad_norm": 0.7491878867149353, + "learning_rate": 1.829990973318426e-05, + "loss": 2.8756, + "step": 54374 + }, + { + "epoch": 2.66, + "grad_norm": 0.8159949779510498, + "learning_rate": 1.829461510301513e-05, + "loss": 2.8182, + "step": 54375 + }, + { + "epoch": 2.66, + "grad_norm": 0.7366239428520203, + "learning_rate": 1.828932121480383e-05, + "loss": 2.8917, + "step": 54376 + }, + { + "epoch": 2.66, + "grad_norm": 0.7913699150085449, + "learning_rate": 1.8284028068564183e-05, + "loss": 3.1585, + "step": 54377 + }, + { + "epoch": 2.66, + "grad_norm": 0.7340931296348572, + "learning_rate": 1.8278735664310208e-05, + "loss": 2.9116, + "step": 54378 + }, + { + "epoch": 2.66, + "grad_norm": 0.7172591686248779, + "learning_rate": 1.8273444002055828e-05, + "loss": 2.9317, + "step": 54379 + }, + { + "epoch": 2.67, + "grad_norm": 0.7502052783966064, + "learning_rate": 1.8268153081814994e-05, + "loss": 2.9903, + "step": 54380 + }, + { + "epoch": 2.67, + "grad_norm": 0.7538665533065796, + "learning_rate": 1.8262862903601606e-05, + "loss": 3.0005, + "step": 54381 + }, + { + "epoch": 2.67, + "grad_norm": 0.7647122740745544, + "learning_rate": 1.8257573467429608e-05, + "loss": 2.9253, + "step": 54382 + }, + { + "epoch": 2.67, + "grad_norm": 0.7346746921539307, + "learning_rate": 1.8252284773312964e-05, + "loss": 2.8669, + "step": 54383 + }, + { + "epoch": 2.67, + "grad_norm": 0.7537041306495667, + "learning_rate": 1.8246996821265557e-05, + "loss": 2.7086, + "step": 54384 + }, + { + "epoch": 2.67, + "grad_norm": 0.74997878074646, + "learning_rate": 1.8241709611301313e-05, + "loss": 2.7979, + "step": 54385 + }, + { + "epoch": 2.67, + "grad_norm": 0.7360642552375793, + "learning_rate": 1.823642314343422e-05, + "loss": 2.7866, + "step": 54386 + }, + { + "epoch": 2.67, + "grad_norm": 0.7375506162643433, + "learning_rate": 1.8231137417678133e-05, + "loss": 2.8911, + "step": 54387 + }, + { + "epoch": 2.67, + "grad_norm": 0.7365680932998657, + "learning_rate": 1.8225852434047038e-05, + "loss": 2.8581, + "step": 54388 + }, + { + "epoch": 2.67, + "grad_norm": 0.7538241147994995, + "learning_rate": 1.822056819255483e-05, + "loss": 2.9333, + "step": 54389 + }, + { + "epoch": 2.67, + "grad_norm": 0.7558910846710205, + "learning_rate": 1.821528469321536e-05, + "loss": 2.7096, + "step": 54390 + }, + { + "epoch": 2.67, + "grad_norm": 0.7916548252105713, + "learning_rate": 1.8210001936042652e-05, + "loss": 2.9807, + "step": 54391 + }, + { + "epoch": 2.67, + "grad_norm": 1.1039891242980957, + "learning_rate": 1.8204719921050492e-05, + "loss": 3.1819, + "step": 54392 + }, + { + "epoch": 2.67, + "grad_norm": 0.734824538230896, + "learning_rate": 1.819943864825294e-05, + "loss": 2.6976, + "step": 54393 + }, + { + "epoch": 2.67, + "grad_norm": 0.7892003655433655, + "learning_rate": 1.819415811766378e-05, + "loss": 3.0551, + "step": 54394 + }, + { + "epoch": 2.67, + "grad_norm": 0.7752202749252319, + "learning_rate": 1.818887832929704e-05, + "loss": 2.8143, + "step": 54395 + }, + { + "epoch": 2.67, + "grad_norm": 0.7521406412124634, + "learning_rate": 1.818359928316654e-05, + "loss": 2.8164, + "step": 54396 + }, + { + "epoch": 2.67, + "grad_norm": 0.7605895400047302, + "learning_rate": 1.8178320979286164e-05, + "loss": 2.945, + "step": 54397 + }, + { + "epoch": 2.67, + "grad_norm": 0.8226302862167358, + "learning_rate": 1.8173043417669908e-05, + "loss": 2.8654, + "step": 54398 + }, + { + "epoch": 2.67, + "grad_norm": 0.7271564602851868, + "learning_rate": 1.8167766598331558e-05, + "loss": 2.9806, + "step": 54399 + }, + { + "epoch": 2.67, + "grad_norm": 0.7447865009307861, + "learning_rate": 1.81624905212851e-05, + "loss": 2.9206, + "step": 54400 + }, + { + "epoch": 2.67, + "grad_norm": 0.7446723580360413, + "learning_rate": 1.815721518654436e-05, + "loss": 3.0564, + "step": 54401 + }, + { + "epoch": 2.67, + "grad_norm": 0.7353099584579468, + "learning_rate": 1.815194059412336e-05, + "loss": 3.0469, + "step": 54402 + }, + { + "epoch": 2.67, + "grad_norm": 0.7726219296455383, + "learning_rate": 1.8146666744035852e-05, + "loss": 2.9872, + "step": 54403 + }, + { + "epoch": 2.67, + "grad_norm": 0.7748692035675049, + "learning_rate": 1.8141393636295764e-05, + "loss": 2.8217, + "step": 54404 + }, + { + "epoch": 2.67, + "grad_norm": 0.7315913438796997, + "learning_rate": 1.813612127091705e-05, + "loss": 2.871, + "step": 54405 + }, + { + "epoch": 2.67, + "grad_norm": 0.7552880644798279, + "learning_rate": 1.8130849647913492e-05, + "loss": 2.8242, + "step": 54406 + }, + { + "epoch": 2.67, + "grad_norm": 0.726218581199646, + "learning_rate": 1.812557876729902e-05, + "loss": 2.8563, + "step": 54407 + }, + { + "epoch": 2.67, + "grad_norm": 0.7617534399032593, + "learning_rate": 1.8120308629087587e-05, + "loss": 2.8983, + "step": 54408 + }, + { + "epoch": 2.67, + "grad_norm": 0.7186073064804077, + "learning_rate": 1.811503923329295e-05, + "loss": 2.9901, + "step": 54409 + }, + { + "epoch": 2.67, + "grad_norm": 0.7599871754646301, + "learning_rate": 1.810977057992913e-05, + "loss": 2.8198, + "step": 54410 + }, + { + "epoch": 2.67, + "grad_norm": 0.7000196576118469, + "learning_rate": 1.810450266900988e-05, + "loss": 2.902, + "step": 54411 + }, + { + "epoch": 2.67, + "grad_norm": 0.7205071449279785, + "learning_rate": 1.8099235500549092e-05, + "loss": 2.6609, + "step": 54412 + }, + { + "epoch": 2.67, + "grad_norm": 0.7560635209083557, + "learning_rate": 1.809396907456072e-05, + "loss": 2.936, + "step": 54413 + }, + { + "epoch": 2.67, + "grad_norm": 0.7410910129547119, + "learning_rate": 1.8088703391058523e-05, + "loss": 2.9418, + "step": 54414 + }, + { + "epoch": 2.67, + "grad_norm": 0.7488538026809692, + "learning_rate": 1.8083438450056487e-05, + "loss": 2.6645, + "step": 54415 + }, + { + "epoch": 2.67, + "grad_norm": 0.7629244923591614, + "learning_rate": 1.8078174251568333e-05, + "loss": 2.7683, + "step": 54416 + }, + { + "epoch": 2.67, + "grad_norm": 0.7645202279090881, + "learning_rate": 1.8072910795608088e-05, + "loss": 2.9679, + "step": 54417 + }, + { + "epoch": 2.67, + "grad_norm": 0.7587352395057678, + "learning_rate": 1.8067648082189536e-05, + "loss": 2.8207, + "step": 54418 + }, + { + "epoch": 2.67, + "grad_norm": 0.7425807118415833, + "learning_rate": 1.80623861113265e-05, + "loss": 3.1311, + "step": 54419 + }, + { + "epoch": 2.67, + "grad_norm": 0.7521542906761169, + "learning_rate": 1.8057124883032904e-05, + "loss": 3.1478, + "step": 54420 + }, + { + "epoch": 2.67, + "grad_norm": 0.7295096516609192, + "learning_rate": 1.805186439732257e-05, + "loss": 3.0066, + "step": 54421 + }, + { + "epoch": 2.67, + "grad_norm": 0.7017486095428467, + "learning_rate": 1.8046604654209317e-05, + "loss": 2.8773, + "step": 54422 + }, + { + "epoch": 2.67, + "grad_norm": 0.7578108310699463, + "learning_rate": 1.8041345653707106e-05, + "loss": 2.8673, + "step": 54423 + }, + { + "epoch": 2.67, + "grad_norm": 0.771789014339447, + "learning_rate": 1.8036087395829758e-05, + "loss": 2.9348, + "step": 54424 + }, + { + "epoch": 2.67, + "grad_norm": 0.7727000117301941, + "learning_rate": 1.8030829880591057e-05, + "loss": 2.6452, + "step": 54425 + }, + { + "epoch": 2.67, + "grad_norm": 0.7393551468849182, + "learning_rate": 1.802557310800483e-05, + "loss": 3.0339, + "step": 54426 + }, + { + "epoch": 2.67, + "grad_norm": 0.7382111549377441, + "learning_rate": 1.8020317078085e-05, + "loss": 2.8841, + "step": 54427 + }, + { + "epoch": 2.67, + "grad_norm": 0.7548832297325134, + "learning_rate": 1.801506179084542e-05, + "loss": 3.0882, + "step": 54428 + }, + { + "epoch": 2.67, + "grad_norm": 0.747711718082428, + "learning_rate": 1.800980724629988e-05, + "loss": 2.8361, + "step": 54429 + }, + { + "epoch": 2.67, + "grad_norm": 0.7639087438583374, + "learning_rate": 1.8004553444462265e-05, + "loss": 2.8952, + "step": 54430 + }, + { + "epoch": 2.67, + "grad_norm": 0.7807071208953857, + "learning_rate": 1.799930038534634e-05, + "loss": 2.8251, + "step": 54431 + }, + { + "epoch": 2.67, + "grad_norm": 0.7129517197608948, + "learning_rate": 1.7994048068966018e-05, + "loss": 2.9591, + "step": 54432 + }, + { + "epoch": 2.67, + "grad_norm": 0.7340091466903687, + "learning_rate": 1.7988796495335088e-05, + "loss": 2.899, + "step": 54433 + }, + { + "epoch": 2.67, + "grad_norm": 0.7638095617294312, + "learning_rate": 1.798354566446738e-05, + "loss": 2.9115, + "step": 54434 + }, + { + "epoch": 2.67, + "grad_norm": 0.7314512729644775, + "learning_rate": 1.797829557637678e-05, + "loss": 3.0898, + "step": 54435 + }, + { + "epoch": 2.67, + "grad_norm": 0.7057316899299622, + "learning_rate": 1.7973046231077037e-05, + "loss": 2.7938, + "step": 54436 + }, + { + "epoch": 2.67, + "grad_norm": 0.7399982213973999, + "learning_rate": 1.796779762858198e-05, + "loss": 2.8381, + "step": 54437 + }, + { + "epoch": 2.67, + "grad_norm": 0.7212401032447815, + "learning_rate": 1.7962549768905532e-05, + "loss": 2.5719, + "step": 54438 + }, + { + "epoch": 2.67, + "grad_norm": 0.7532590627670288, + "learning_rate": 1.7957302652061444e-05, + "loss": 2.8661, + "step": 54439 + }, + { + "epoch": 2.67, + "grad_norm": 0.7178873419761658, + "learning_rate": 1.7952056278063542e-05, + "loss": 2.9399, + "step": 54440 + }, + { + "epoch": 2.67, + "grad_norm": 0.7503253221511841, + "learning_rate": 1.794681064692558e-05, + "loss": 3.0511, + "step": 54441 + }, + { + "epoch": 2.67, + "grad_norm": 0.7387188673019409, + "learning_rate": 1.794156575866148e-05, + "loss": 2.8939, + "step": 54442 + }, + { + "epoch": 2.67, + "grad_norm": 0.7133545279502869, + "learning_rate": 1.793632161328503e-05, + "loss": 2.8469, + "step": 54443 + }, + { + "epoch": 2.67, + "grad_norm": 0.7400594353675842, + "learning_rate": 1.7931078210809958e-05, + "loss": 2.8812, + "step": 54444 + }, + { + "epoch": 2.67, + "grad_norm": 0.7527070641517639, + "learning_rate": 1.792583555125021e-05, + "loss": 2.89, + "step": 54445 + }, + { + "epoch": 2.67, + "grad_norm": 0.7184710502624512, + "learning_rate": 1.7920593634619483e-05, + "loss": 3.0791, + "step": 54446 + }, + { + "epoch": 2.67, + "grad_norm": 0.7595903873443604, + "learning_rate": 1.7915352460931664e-05, + "loss": 2.9072, + "step": 54447 + }, + { + "epoch": 2.67, + "grad_norm": 0.7046235799789429, + "learning_rate": 1.7910112030200542e-05, + "loss": 2.66, + "step": 54448 + }, + { + "epoch": 2.67, + "grad_norm": 0.728029727935791, + "learning_rate": 1.7904872342439803e-05, + "loss": 2.9842, + "step": 54449 + }, + { + "epoch": 2.67, + "grad_norm": 0.7607099413871765, + "learning_rate": 1.7899633397663406e-05, + "loss": 2.8981, + "step": 54450 + }, + { + "epoch": 2.67, + "grad_norm": 0.7469753623008728, + "learning_rate": 1.789439519588507e-05, + "loss": 2.8449, + "step": 54451 + }, + { + "epoch": 2.67, + "grad_norm": 0.7470847368240356, + "learning_rate": 1.7889157737118554e-05, + "loss": 3.0769, + "step": 54452 + }, + { + "epoch": 2.67, + "grad_norm": 0.7539829611778259, + "learning_rate": 1.7883921021377777e-05, + "loss": 2.7504, + "step": 54453 + }, + { + "epoch": 2.67, + "grad_norm": 0.7649708986282349, + "learning_rate": 1.7878685048676467e-05, + "loss": 2.929, + "step": 54454 + }, + { + "epoch": 2.67, + "grad_norm": 0.7718356847763062, + "learning_rate": 1.7873449819028372e-05, + "loss": 3.0847, + "step": 54455 + }, + { + "epoch": 2.67, + "grad_norm": 0.7208101153373718, + "learning_rate": 1.786821533244729e-05, + "loss": 3.0591, + "step": 54456 + }, + { + "epoch": 2.67, + "grad_norm": 0.7297426462173462, + "learning_rate": 1.7862981588947032e-05, + "loss": 2.8899, + "step": 54457 + }, + { + "epoch": 2.67, + "grad_norm": 0.7583280801773071, + "learning_rate": 1.785774858854143e-05, + "loss": 2.958, + "step": 54458 + }, + { + "epoch": 2.67, + "grad_norm": 0.7504193186759949, + "learning_rate": 1.785251633124417e-05, + "loss": 2.8244, + "step": 54459 + }, + { + "epoch": 2.67, + "grad_norm": 0.78138267993927, + "learning_rate": 1.7847284817069107e-05, + "loss": 2.8839, + "step": 54460 + }, + { + "epoch": 2.67, + "grad_norm": 0.7497425675392151, + "learning_rate": 1.784205404603003e-05, + "loss": 2.8106, + "step": 54461 + }, + { + "epoch": 2.67, + "grad_norm": 0.8312497735023499, + "learning_rate": 1.7836824018140628e-05, + "loss": 2.8887, + "step": 54462 + }, + { + "epoch": 2.67, + "grad_norm": 0.7753313779830933, + "learning_rate": 1.7831594733414757e-05, + "loss": 2.8713, + "step": 54463 + }, + { + "epoch": 2.67, + "grad_norm": 0.7584692239761353, + "learning_rate": 1.7826366191866105e-05, + "loss": 2.8697, + "step": 54464 + }, + { + "epoch": 2.67, + "grad_norm": 0.799094021320343, + "learning_rate": 1.782113839350856e-05, + "loss": 2.9074, + "step": 54465 + }, + { + "epoch": 2.67, + "grad_norm": 0.7895060777664185, + "learning_rate": 1.781591133835578e-05, + "loss": 2.7601, + "step": 54466 + }, + { + "epoch": 2.67, + "grad_norm": 0.7699868083000183, + "learning_rate": 1.7810685026421588e-05, + "loss": 2.8981, + "step": 54467 + }, + { + "epoch": 2.67, + "grad_norm": 0.7355355620384216, + "learning_rate": 1.7805459457719805e-05, + "loss": 2.941, + "step": 54468 + }, + { + "epoch": 2.67, + "grad_norm": 0.7582590579986572, + "learning_rate": 1.780023463226412e-05, + "loss": 2.6234, + "step": 54469 + }, + { + "epoch": 2.67, + "grad_norm": 0.7793235182762146, + "learning_rate": 1.779501055006829e-05, + "loss": 2.7773, + "step": 54470 + }, + { + "epoch": 2.67, + "grad_norm": 0.7583927512168884, + "learning_rate": 1.7789787211146067e-05, + "loss": 2.8734, + "step": 54471 + }, + { + "epoch": 2.67, + "grad_norm": 0.7218800187110901, + "learning_rate": 1.7784564615511244e-05, + "loss": 2.9254, + "step": 54472 + }, + { + "epoch": 2.67, + "grad_norm": 0.7587677240371704, + "learning_rate": 1.7779342763177572e-05, + "loss": 2.8399, + "step": 54473 + }, + { + "epoch": 2.67, + "grad_norm": 0.7498154640197754, + "learning_rate": 1.7774121654158745e-05, + "loss": 2.8976, + "step": 54474 + }, + { + "epoch": 2.67, + "grad_norm": 0.7345309257507324, + "learning_rate": 1.7768901288468652e-05, + "loss": 2.9342, + "step": 54475 + }, + { + "epoch": 2.67, + "grad_norm": 0.758578360080719, + "learning_rate": 1.7763681666120942e-05, + "loss": 2.8475, + "step": 54476 + }, + { + "epoch": 2.67, + "grad_norm": 0.7470417618751526, + "learning_rate": 1.7758462787129346e-05, + "loss": 2.9, + "step": 54477 + }, + { + "epoch": 2.67, + "grad_norm": 0.750644326210022, + "learning_rate": 1.7753244651507648e-05, + "loss": 2.934, + "step": 54478 + }, + { + "epoch": 2.67, + "grad_norm": 0.7721165418624878, + "learning_rate": 1.774802725926954e-05, + "loss": 2.9427, + "step": 54479 + }, + { + "epoch": 2.67, + "grad_norm": 0.7746807336807251, + "learning_rate": 1.774281061042887e-05, + "loss": 3.035, + "step": 54480 + }, + { + "epoch": 2.67, + "grad_norm": 0.7496808767318726, + "learning_rate": 1.773759470499927e-05, + "loss": 3.0492, + "step": 54481 + }, + { + "epoch": 2.67, + "grad_norm": 0.7302466630935669, + "learning_rate": 1.773237954299459e-05, + "loss": 2.8497, + "step": 54482 + }, + { + "epoch": 2.67, + "grad_norm": 0.7932860851287842, + "learning_rate": 1.7727165124428455e-05, + "loss": 2.9827, + "step": 54483 + }, + { + "epoch": 2.67, + "grad_norm": 0.7311192154884338, + "learning_rate": 1.7721951449314654e-05, + "loss": 3.0511, + "step": 54484 + }, + { + "epoch": 2.67, + "grad_norm": 0.7317575812339783, + "learning_rate": 1.7716738517666906e-05, + "loss": 2.745, + "step": 54485 + }, + { + "epoch": 2.67, + "grad_norm": 0.7372452020645142, + "learning_rate": 1.7711526329498906e-05, + "loss": 2.8255, + "step": 54486 + }, + { + "epoch": 2.67, + "grad_norm": 0.7596213221549988, + "learning_rate": 1.770631488482447e-05, + "loss": 2.8587, + "step": 54487 + }, + { + "epoch": 2.67, + "grad_norm": 0.7744680643081665, + "learning_rate": 1.770110418365722e-05, + "loss": 2.7446, + "step": 54488 + }, + { + "epoch": 2.67, + "grad_norm": 0.7649564743041992, + "learning_rate": 1.7695894226010953e-05, + "loss": 2.7356, + "step": 54489 + }, + { + "epoch": 2.67, + "grad_norm": 0.7747321724891663, + "learning_rate": 1.7690685011899385e-05, + "loss": 2.8946, + "step": 54490 + }, + { + "epoch": 2.67, + "grad_norm": 0.7615075707435608, + "learning_rate": 1.7685476541336242e-05, + "loss": 2.9659, + "step": 54491 + }, + { + "epoch": 2.67, + "grad_norm": 0.7637103796005249, + "learning_rate": 1.768026881433521e-05, + "loss": 2.7237, + "step": 54492 + }, + { + "epoch": 2.67, + "grad_norm": 0.7580251693725586, + "learning_rate": 1.7675061830909976e-05, + "loss": 3.0487, + "step": 54493 + }, + { + "epoch": 2.67, + "grad_norm": 0.7343547940254211, + "learning_rate": 1.76698555910743e-05, + "loss": 2.7701, + "step": 54494 + }, + { + "epoch": 2.67, + "grad_norm": 0.7214182615280151, + "learning_rate": 1.7664650094841937e-05, + "loss": 2.8685, + "step": 54495 + }, + { + "epoch": 2.67, + "grad_norm": 0.7279322147369385, + "learning_rate": 1.7659445342226473e-05, + "loss": 2.869, + "step": 54496 + }, + { + "epoch": 2.67, + "grad_norm": 0.7043575048446655, + "learning_rate": 1.765424133324177e-05, + "loss": 3.072, + "step": 54497 + }, + { + "epoch": 2.67, + "grad_norm": 0.7192384600639343, + "learning_rate": 1.7649038067901445e-05, + "loss": 2.9503, + "step": 54498 + }, + { + "epoch": 2.67, + "grad_norm": 0.7618083357810974, + "learning_rate": 1.764383554621919e-05, + "loss": 2.9244, + "step": 54499 + }, + { + "epoch": 2.67, + "grad_norm": 0.7713229656219482, + "learning_rate": 1.7638633768208753e-05, + "loss": 2.8477, + "step": 54500 + }, + { + "epoch": 2.67, + "grad_norm": 0.761009931564331, + "learning_rate": 1.763343273388377e-05, + "loss": 2.7269, + "step": 54501 + }, + { + "epoch": 2.67, + "grad_norm": 0.7590358257293701, + "learning_rate": 1.7628232443258016e-05, + "loss": 2.6942, + "step": 54502 + }, + { + "epoch": 2.67, + "grad_norm": 0.7583818435668945, + "learning_rate": 1.7623032896345123e-05, + "loss": 2.8209, + "step": 54503 + }, + { + "epoch": 2.67, + "grad_norm": 0.7613120675086975, + "learning_rate": 1.7617834093158878e-05, + "loss": 2.9961, + "step": 54504 + }, + { + "epoch": 2.67, + "grad_norm": 0.7189532518386841, + "learning_rate": 1.76126360337129e-05, + "loss": 3.0106, + "step": 54505 + }, + { + "epoch": 2.67, + "grad_norm": 0.7215641736984253, + "learning_rate": 1.7607438718020848e-05, + "loss": 2.9489, + "step": 54506 + }, + { + "epoch": 2.67, + "grad_norm": 0.7565929889678955, + "learning_rate": 1.760224214609648e-05, + "loss": 2.9464, + "step": 54507 + }, + { + "epoch": 2.67, + "grad_norm": 0.8237677216529846, + "learning_rate": 1.7597046317953444e-05, + "loss": 3.1464, + "step": 54508 + }, + { + "epoch": 2.67, + "grad_norm": 0.7600015997886658, + "learning_rate": 1.7591851233605405e-05, + "loss": 2.7613, + "step": 54509 + }, + { + "epoch": 2.67, + "grad_norm": 0.7842651009559631, + "learning_rate": 1.7586656893066144e-05, + "loss": 2.8621, + "step": 54510 + }, + { + "epoch": 2.67, + "grad_norm": 0.7692767977714539, + "learning_rate": 1.7581463296349252e-05, + "loss": 2.752, + "step": 54511 + }, + { + "epoch": 2.67, + "grad_norm": 0.7494772672653198, + "learning_rate": 1.7576270443468454e-05, + "loss": 2.8582, + "step": 54512 + }, + { + "epoch": 2.67, + "grad_norm": 0.742486298084259, + "learning_rate": 1.7571078334437405e-05, + "loss": 2.8898, + "step": 54513 + }, + { + "epoch": 2.67, + "grad_norm": 0.7653214335441589, + "learning_rate": 1.7565886969269728e-05, + "loss": 3.1248, + "step": 54514 + }, + { + "epoch": 2.67, + "grad_norm": 0.7274088859558105, + "learning_rate": 1.756069634797921e-05, + "loss": 2.7404, + "step": 54515 + }, + { + "epoch": 2.67, + "grad_norm": 0.7633993625640869, + "learning_rate": 1.7555506470579404e-05, + "loss": 2.8211, + "step": 54516 + }, + { + "epoch": 2.67, + "grad_norm": 0.7414184808731079, + "learning_rate": 1.7550317337084107e-05, + "loss": 2.8421, + "step": 54517 + }, + { + "epoch": 2.67, + "grad_norm": 0.7275792956352234, + "learning_rate": 1.7545128947506836e-05, + "loss": 2.9229, + "step": 54518 + }, + { + "epoch": 2.67, + "grad_norm": 0.7488593459129333, + "learning_rate": 1.7539941301861414e-05, + "loss": 2.8447, + "step": 54519 + }, + { + "epoch": 2.67, + "grad_norm": 0.8635866641998291, + "learning_rate": 1.7534754400161398e-05, + "loss": 2.927, + "step": 54520 + }, + { + "epoch": 2.67, + "grad_norm": 0.7082469463348389, + "learning_rate": 1.7529568242420445e-05, + "loss": 2.8341, + "step": 54521 + }, + { + "epoch": 2.67, + "grad_norm": 0.7854346632957458, + "learning_rate": 1.7524382828652307e-05, + "loss": 3.1416, + "step": 54522 + }, + { + "epoch": 2.67, + "grad_norm": 0.7766318321228027, + "learning_rate": 1.7519198158870507e-05, + "loss": 3.0129, + "step": 54523 + }, + { + "epoch": 2.67, + "grad_norm": 0.7670121788978577, + "learning_rate": 1.7514014233088804e-05, + "loss": 2.9533, + "step": 54524 + }, + { + "epoch": 2.67, + "grad_norm": 0.7497897148132324, + "learning_rate": 1.7508831051320848e-05, + "loss": 2.9596, + "step": 54525 + }, + { + "epoch": 2.67, + "grad_norm": 0.7375215888023376, + "learning_rate": 1.7503648613580266e-05, + "loss": 3.1485, + "step": 54526 + }, + { + "epoch": 2.67, + "grad_norm": 0.7439850568771362, + "learning_rate": 1.7498466919880717e-05, + "loss": 2.9282, + "step": 54527 + }, + { + "epoch": 2.67, + "grad_norm": 0.7792314887046814, + "learning_rate": 1.7493285970235783e-05, + "loss": 3.0288, + "step": 54528 + }, + { + "epoch": 2.67, + "grad_norm": 0.7756353616714478, + "learning_rate": 1.7488105764659156e-05, + "loss": 2.7858, + "step": 54529 + }, + { + "epoch": 2.67, + "grad_norm": 0.7475228905677795, + "learning_rate": 1.7482926303164557e-05, + "loss": 2.8076, + "step": 54530 + }, + { + "epoch": 2.67, + "grad_norm": 0.7505745887756348, + "learning_rate": 1.7477747585765478e-05, + "loss": 2.9298, + "step": 54531 + }, + { + "epoch": 2.67, + "grad_norm": 0.7236295342445374, + "learning_rate": 1.7472569612475705e-05, + "loss": 2.9612, + "step": 54532 + }, + { + "epoch": 2.67, + "grad_norm": 0.7206342816352844, + "learning_rate": 1.7467392383308765e-05, + "loss": 2.8051, + "step": 54533 + }, + { + "epoch": 2.67, + "grad_norm": 0.7881414294242859, + "learning_rate": 1.7462215898278408e-05, + "loss": 3.0171, + "step": 54534 + }, + { + "epoch": 2.67, + "grad_norm": 0.7353194952011108, + "learning_rate": 1.745704015739816e-05, + "loss": 2.9517, + "step": 54535 + }, + { + "epoch": 2.67, + "grad_norm": 0.7909258008003235, + "learning_rate": 1.7451865160681678e-05, + "loss": 2.8948, + "step": 54536 + }, + { + "epoch": 2.67, + "grad_norm": 0.7583587765693665, + "learning_rate": 1.7446690908142613e-05, + "loss": 2.7776, + "step": 54537 + }, + { + "epoch": 2.67, + "grad_norm": 0.7082347273826599, + "learning_rate": 1.744151739979456e-05, + "loss": 2.8779, + "step": 54538 + }, + { + "epoch": 2.67, + "grad_norm": 0.7351507544517517, + "learning_rate": 1.7436344635651167e-05, + "loss": 3.0366, + "step": 54539 + }, + { + "epoch": 2.67, + "grad_norm": 0.7974845170974731, + "learning_rate": 1.743117261572613e-05, + "loss": 2.8264, + "step": 54540 + }, + { + "epoch": 2.67, + "grad_norm": 0.736272394657135, + "learning_rate": 1.742600134003297e-05, + "loss": 3.0235, + "step": 54541 + }, + { + "epoch": 2.67, + "grad_norm": 0.8139263391494751, + "learning_rate": 1.7420830808585373e-05, + "loss": 2.8582, + "step": 54542 + }, + { + "epoch": 2.67, + "grad_norm": 0.7663759589195251, + "learning_rate": 1.7415661021396865e-05, + "loss": 2.965, + "step": 54543 + }, + { + "epoch": 2.67, + "grad_norm": 0.7662950754165649, + "learning_rate": 1.74104919784811e-05, + "loss": 2.7665, + "step": 54544 + }, + { + "epoch": 2.67, + "grad_norm": 0.7392333149909973, + "learning_rate": 1.7405323679851768e-05, + "loss": 2.9475, + "step": 54545 + }, + { + "epoch": 2.67, + "grad_norm": 0.7623760104179382, + "learning_rate": 1.740015612552239e-05, + "loss": 2.9041, + "step": 54546 + }, + { + "epoch": 2.67, + "grad_norm": 0.7544460296630859, + "learning_rate": 1.739498931550669e-05, + "loss": 2.6681, + "step": 54547 + }, + { + "epoch": 2.67, + "grad_norm": 0.7664691209793091, + "learning_rate": 1.7389823249818124e-05, + "loss": 2.902, + "step": 54548 + }, + { + "epoch": 2.67, + "grad_norm": 0.7272012829780579, + "learning_rate": 1.7384657928470448e-05, + "loss": 2.7784, + "step": 54549 + }, + { + "epoch": 2.67, + "grad_norm": 0.747775137424469, + "learning_rate": 1.7379493351477114e-05, + "loss": 2.6218, + "step": 54550 + }, + { + "epoch": 2.67, + "grad_norm": 0.7551100850105286, + "learning_rate": 1.7374329518851814e-05, + "loss": 2.5569, + "step": 54551 + }, + { + "epoch": 2.67, + "grad_norm": 0.7571572661399841, + "learning_rate": 1.7369166430608172e-05, + "loss": 2.973, + "step": 54552 + }, + { + "epoch": 2.67, + "grad_norm": 0.7804607152938843, + "learning_rate": 1.7364004086759707e-05, + "loss": 2.9752, + "step": 54553 + }, + { + "epoch": 2.67, + "grad_norm": 0.739106297492981, + "learning_rate": 1.7358842487320113e-05, + "loss": 2.9756, + "step": 54554 + }, + { + "epoch": 2.67, + "grad_norm": 0.7254763841629028, + "learning_rate": 1.7353681632302873e-05, + "loss": 3.0763, + "step": 54555 + }, + { + "epoch": 2.67, + "grad_norm": 0.7539757490158081, + "learning_rate": 1.7348521521721713e-05, + "loss": 2.7454, + "step": 54556 + }, + { + "epoch": 2.67, + "grad_norm": 0.7677503228187561, + "learning_rate": 1.7343362155590123e-05, + "loss": 2.9946, + "step": 54557 + }, + { + "epoch": 2.67, + "grad_norm": 0.7571077346801758, + "learning_rate": 1.733820353392169e-05, + "loss": 3.0087, + "step": 54558 + }, + { + "epoch": 2.67, + "grad_norm": 0.7669923901557922, + "learning_rate": 1.733304565673007e-05, + "loss": 2.9143, + "step": 54559 + }, + { + "epoch": 2.67, + "grad_norm": 0.73554927110672, + "learning_rate": 1.7327888524028755e-05, + "loss": 2.9558, + "step": 54560 + }, + { + "epoch": 2.67, + "grad_norm": 0.7662096619606018, + "learning_rate": 1.7322732135831396e-05, + "loss": 2.7881, + "step": 54561 + }, + { + "epoch": 2.67, + "grad_norm": 0.7233344912528992, + "learning_rate": 1.7317576492151587e-05, + "loss": 2.831, + "step": 54562 + }, + { + "epoch": 2.67, + "grad_norm": 0.7593091726303101, + "learning_rate": 1.731242159300288e-05, + "loss": 2.9025, + "step": 54563 + }, + { + "epoch": 2.67, + "grad_norm": 0.7726497650146484, + "learning_rate": 1.7307267438398865e-05, + "loss": 2.8003, + "step": 54564 + }, + { + "epoch": 2.67, + "grad_norm": 0.765876829624176, + "learning_rate": 1.7302114028353032e-05, + "loss": 2.9848, + "step": 54565 + }, + { + "epoch": 2.67, + "grad_norm": 0.7233142852783203, + "learning_rate": 1.7296961362879036e-05, + "loss": 2.9807, + "step": 54566 + }, + { + "epoch": 2.67, + "grad_norm": 0.7348734140396118, + "learning_rate": 1.72918094419905e-05, + "loss": 2.7916, + "step": 54567 + }, + { + "epoch": 2.67, + "grad_norm": 0.769679069519043, + "learning_rate": 1.728665826570088e-05, + "loss": 2.8728, + "step": 54568 + }, + { + "epoch": 2.67, + "grad_norm": 0.7657221555709839, + "learning_rate": 1.7281507834023832e-05, + "loss": 2.8739, + "step": 54569 + }, + { + "epoch": 2.67, + "grad_norm": 0.7933992743492126, + "learning_rate": 1.727635814697281e-05, + "loss": 2.812, + "step": 54570 + }, + { + "epoch": 2.67, + "grad_norm": 0.7689788937568665, + "learning_rate": 1.727120920456151e-05, + "loss": 2.8414, + "step": 54571 + }, + { + "epoch": 2.67, + "grad_norm": 0.7207183837890625, + "learning_rate": 1.7266061006803444e-05, + "loss": 2.7137, + "step": 54572 + }, + { + "epoch": 2.67, + "grad_norm": 0.7436949610710144, + "learning_rate": 1.7260913553712107e-05, + "loss": 2.9456, + "step": 54573 + }, + { + "epoch": 2.67, + "grad_norm": 0.7527655363082886, + "learning_rate": 1.7255766845301157e-05, + "loss": 2.8905, + "step": 54574 + }, + { + "epoch": 2.67, + "grad_norm": 0.7638602256774902, + "learning_rate": 1.7250620881584077e-05, + "loss": 2.7258, + "step": 54575 + }, + { + "epoch": 2.67, + "grad_norm": 0.7570220232009888, + "learning_rate": 1.7245475662574428e-05, + "loss": 2.6924, + "step": 54576 + }, + { + "epoch": 2.67, + "grad_norm": 0.7669765949249268, + "learning_rate": 1.724033118828583e-05, + "loss": 2.797, + "step": 54577 + }, + { + "epoch": 2.67, + "grad_norm": 0.7875416278839111, + "learning_rate": 1.723518745873177e-05, + "loss": 2.7294, + "step": 54578 + }, + { + "epoch": 2.67, + "grad_norm": 0.7687140703201294, + "learning_rate": 1.723004447392581e-05, + "loss": 2.9238, + "step": 54579 + }, + { + "epoch": 2.67, + "grad_norm": 0.7571902275085449, + "learning_rate": 1.7224902233881433e-05, + "loss": 2.9232, + "step": 54580 + }, + { + "epoch": 2.67, + "grad_norm": 0.7242570519447327, + "learning_rate": 1.7219760738612264e-05, + "loss": 2.9942, + "step": 54581 + }, + { + "epoch": 2.67, + "grad_norm": 0.7527267336845398, + "learning_rate": 1.721461998813186e-05, + "loss": 2.8456, + "step": 54582 + }, + { + "epoch": 2.67, + "grad_norm": 0.7353515028953552, + "learning_rate": 1.720947998245368e-05, + "loss": 2.8948, + "step": 54583 + }, + { + "epoch": 2.68, + "grad_norm": 0.7328376770019531, + "learning_rate": 1.7204340721591335e-05, + "loss": 2.8345, + "step": 54584 + }, + { + "epoch": 2.68, + "grad_norm": 0.7656261920928955, + "learning_rate": 1.719920220555836e-05, + "loss": 2.8635, + "step": 54585 + }, + { + "epoch": 2.68, + "grad_norm": 0.7430219650268555, + "learning_rate": 1.7194064434368206e-05, + "loss": 2.8203, + "step": 54586 + }, + { + "epoch": 2.68, + "grad_norm": 0.7474393844604492, + "learning_rate": 1.718892740803449e-05, + "loss": 2.9497, + "step": 54587 + }, + { + "epoch": 2.68, + "grad_norm": 0.75533527135849, + "learning_rate": 1.7183791126570678e-05, + "loss": 2.9873, + "step": 54588 + }, + { + "epoch": 2.68, + "grad_norm": 0.776290237903595, + "learning_rate": 1.7178655589990387e-05, + "loss": 2.8499, + "step": 54589 + }, + { + "epoch": 2.68, + "grad_norm": 0.743800699710846, + "learning_rate": 1.717352079830704e-05, + "loss": 2.7576, + "step": 54590 + }, + { + "epoch": 2.68, + "grad_norm": 0.7225151658058167, + "learning_rate": 1.7168386751534192e-05, + "loss": 2.8802, + "step": 54591 + }, + { + "epoch": 2.68, + "grad_norm": 0.7379876971244812, + "learning_rate": 1.716325344968543e-05, + "loss": 2.721, + "step": 54592 + }, + { + "epoch": 2.68, + "grad_norm": 0.7471773028373718, + "learning_rate": 1.715812089277422e-05, + "loss": 2.7912, + "step": 54593 + }, + { + "epoch": 2.68, + "grad_norm": 0.7949302792549133, + "learning_rate": 1.7152989080814106e-05, + "loss": 2.771, + "step": 54594 + }, + { + "epoch": 2.68, + "grad_norm": 0.7417853474617004, + "learning_rate": 1.7147858013818517e-05, + "loss": 2.923, + "step": 54595 + }, + { + "epoch": 2.68, + "grad_norm": 0.7578975558280945, + "learning_rate": 1.714272769180104e-05, + "loss": 2.8479, + "step": 54596 + }, + { + "epoch": 2.68, + "grad_norm": 0.77840256690979, + "learning_rate": 1.7137598114775232e-05, + "loss": 2.7247, + "step": 54597 + }, + { + "epoch": 2.68, + "grad_norm": 0.7270179986953735, + "learning_rate": 1.7132469282754513e-05, + "loss": 2.7254, + "step": 54598 + }, + { + "epoch": 2.68, + "grad_norm": 0.7670302391052246, + "learning_rate": 1.7127341195752475e-05, + "loss": 2.8518, + "step": 54599 + }, + { + "epoch": 2.68, + "grad_norm": 0.7435368895530701, + "learning_rate": 1.7122213853782572e-05, + "loss": 2.9362, + "step": 54600 + }, + { + "epoch": 2.68, + "grad_norm": 0.7616455554962158, + "learning_rate": 1.7117087256858262e-05, + "loss": 2.7466, + "step": 54601 + }, + { + "epoch": 2.68, + "grad_norm": 0.7222978472709656, + "learning_rate": 1.711196140499317e-05, + "loss": 2.826, + "step": 54602 + }, + { + "epoch": 2.68, + "grad_norm": 0.7720111012458801, + "learning_rate": 1.7106836298200675e-05, + "loss": 2.9449, + "step": 54603 + }, + { + "epoch": 2.68, + "grad_norm": 0.7425702214241028, + "learning_rate": 1.7101711936494378e-05, + "loss": 3.0421, + "step": 54604 + }, + { + "epoch": 2.68, + "grad_norm": 0.7751484513282776, + "learning_rate": 1.709658831988766e-05, + "loss": 3.0768, + "step": 54605 + }, + { + "epoch": 2.68, + "grad_norm": 0.731267511844635, + "learning_rate": 1.7091465448394148e-05, + "loss": 2.7284, + "step": 54606 + }, + { + "epoch": 2.68, + "grad_norm": 0.7741591930389404, + "learning_rate": 1.7086343322027263e-05, + "loss": 2.9372, + "step": 54607 + }, + { + "epoch": 2.68, + "grad_norm": 0.7876058220863342, + "learning_rate": 1.708122194080046e-05, + "loss": 2.9196, + "step": 54608 + }, + { + "epoch": 2.68, + "grad_norm": 0.7788753509521484, + "learning_rate": 1.7076101304727296e-05, + "loss": 2.8865, + "step": 54609 + }, + { + "epoch": 2.68, + "grad_norm": 0.7326127290725708, + "learning_rate": 1.7070981413821227e-05, + "loss": 2.9553, + "step": 54610 + }, + { + "epoch": 2.68, + "grad_norm": 0.7484868764877319, + "learning_rate": 1.706586226809571e-05, + "loss": 2.9617, + "step": 54611 + }, + { + "epoch": 2.68, + "grad_norm": 0.7754936814308167, + "learning_rate": 1.706074386756433e-05, + "loss": 2.8784, + "step": 54612 + }, + { + "epoch": 2.68, + "grad_norm": 0.8197011947631836, + "learning_rate": 1.7055626212240413e-05, + "loss": 2.9881, + "step": 54613 + }, + { + "epoch": 2.68, + "grad_norm": 0.7415262460708618, + "learning_rate": 1.7050509302137584e-05, + "loss": 2.9983, + "step": 54614 + }, + { + "epoch": 2.68, + "grad_norm": 0.7331694960594177, + "learning_rate": 1.704539313726929e-05, + "loss": 2.9561, + "step": 54615 + }, + { + "epoch": 2.68, + "grad_norm": 0.7985134720802307, + "learning_rate": 1.70402777176489e-05, + "loss": 2.9792, + "step": 54616 + }, + { + "epoch": 2.68, + "grad_norm": 0.7267589569091797, + "learning_rate": 1.7035163043290022e-05, + "loss": 2.8766, + "step": 54617 + }, + { + "epoch": 2.68, + "grad_norm": 0.7310107946395874, + "learning_rate": 1.703004911420599e-05, + "loss": 2.987, + "step": 54618 + }, + { + "epoch": 2.68, + "grad_norm": 0.7549310326576233, + "learning_rate": 1.702493593041042e-05, + "loss": 2.8447, + "step": 54619 + }, + { + "epoch": 2.68, + "grad_norm": 0.7123493552207947, + "learning_rate": 1.7019823491916673e-05, + "loss": 3.1187, + "step": 54620 + }, + { + "epoch": 2.68, + "grad_norm": 0.7951881885528564, + "learning_rate": 1.7014711798738268e-05, + "loss": 2.8816, + "step": 54621 + }, + { + "epoch": 2.68, + "grad_norm": 0.725439190864563, + "learning_rate": 1.7009600850888693e-05, + "loss": 2.9719, + "step": 54622 + }, + { + "epoch": 2.68, + "grad_norm": 0.7655175924301147, + "learning_rate": 1.7004490648381274e-05, + "loss": 2.8131, + "step": 54623 + }, + { + "epoch": 2.68, + "grad_norm": 0.7330092191696167, + "learning_rate": 1.6999381191229634e-05, + "loss": 2.9509, + "step": 54624 + }, + { + "epoch": 2.68, + "grad_norm": 0.7745311856269836, + "learning_rate": 1.6994272479447124e-05, + "loss": 2.7781, + "step": 54625 + }, + { + "epoch": 2.68, + "grad_norm": 0.7374331951141357, + "learning_rate": 1.6989164513047238e-05, + "loss": 3.0763, + "step": 54626 + }, + { + "epoch": 2.68, + "grad_norm": 0.7071889042854309, + "learning_rate": 1.698405729204343e-05, + "loss": 2.7063, + "step": 54627 + }, + { + "epoch": 2.68, + "grad_norm": 0.7430211901664734, + "learning_rate": 1.6978950816449153e-05, + "loss": 3.0131, + "step": 54628 + }, + { + "epoch": 2.68, + "grad_norm": 0.746117889881134, + "learning_rate": 1.697384508627787e-05, + "loss": 2.7566, + "step": 54629 + }, + { + "epoch": 2.68, + "grad_norm": 0.7581526041030884, + "learning_rate": 1.6968740101542965e-05, + "loss": 3.0198, + "step": 54630 + }, + { + "epoch": 2.68, + "grad_norm": 0.7774915099143982, + "learning_rate": 1.6963635862257996e-05, + "loss": 3.039, + "step": 54631 + }, + { + "epoch": 2.68, + "grad_norm": 0.7348177433013916, + "learning_rate": 1.695853236843625e-05, + "loss": 3.1106, + "step": 54632 + }, + { + "epoch": 2.68, + "grad_norm": 0.7624771595001221, + "learning_rate": 1.6953429620091284e-05, + "loss": 3.0076, + "step": 54633 + }, + { + "epoch": 2.68, + "grad_norm": 0.7285212874412537, + "learning_rate": 1.6948327617236557e-05, + "loss": 2.7591, + "step": 54634 + }, + { + "epoch": 2.68, + "grad_norm": 0.8262482285499573, + "learning_rate": 1.6943226359885385e-05, + "loss": 3.0077, + "step": 54635 + }, + { + "epoch": 2.68, + "grad_norm": 0.7384718656539917, + "learning_rate": 1.6938125848051363e-05, + "loss": 2.8514, + "step": 54636 + }, + { + "epoch": 2.68, + "grad_norm": 0.7086389064788818, + "learning_rate": 1.6933026081747814e-05, + "loss": 2.9698, + "step": 54637 + }, + { + "epoch": 2.68, + "grad_norm": 0.7307590246200562, + "learning_rate": 1.6927927060988156e-05, + "loss": 2.8567, + "step": 54638 + }, + { + "epoch": 2.68, + "grad_norm": 0.7535133957862854, + "learning_rate": 1.6922828785785912e-05, + "loss": 2.821, + "step": 54639 + }, + { + "epoch": 2.68, + "grad_norm": 0.7423950433731079, + "learning_rate": 1.6917731256154443e-05, + "loss": 2.8587, + "step": 54640 + }, + { + "epoch": 2.68, + "grad_norm": 0.7522428035736084, + "learning_rate": 1.69126344721072e-05, + "loss": 2.9274, + "step": 54641 + }, + { + "epoch": 2.68, + "grad_norm": 0.7286722660064697, + "learning_rate": 1.6907538433657543e-05, + "loss": 2.7226, + "step": 54642 + }, + { + "epoch": 2.68, + "grad_norm": 0.7772533297538757, + "learning_rate": 1.690244314081902e-05, + "loss": 2.9504, + "step": 54643 + }, + { + "epoch": 2.68, + "grad_norm": 0.7469460964202881, + "learning_rate": 1.6897348593604965e-05, + "loss": 2.8693, + "step": 54644 + }, + { + "epoch": 2.68, + "grad_norm": 0.7319382429122925, + "learning_rate": 1.6892254792028757e-05, + "loss": 2.8529, + "step": 54645 + }, + { + "epoch": 2.68, + "grad_norm": 0.7492722272872925, + "learning_rate": 1.6887161736103893e-05, + "loss": 2.8104, + "step": 54646 + }, + { + "epoch": 2.68, + "grad_norm": 0.7667667865753174, + "learning_rate": 1.688206942584376e-05, + "loss": 2.8156, + "step": 54647 + }, + { + "epoch": 2.68, + "grad_norm": 0.6894829869270325, + "learning_rate": 1.6876977861261744e-05, + "loss": 2.8207, + "step": 54648 + }, + { + "epoch": 2.68, + "grad_norm": 0.7800548672676086, + "learning_rate": 1.6871887042371334e-05, + "loss": 2.9097, + "step": 54649 + }, + { + "epoch": 2.68, + "grad_norm": 0.7656291127204895, + "learning_rate": 1.6866796969185827e-05, + "loss": 2.7117, + "step": 54650 + }, + { + "epoch": 2.68, + "grad_norm": 0.7687351107597351, + "learning_rate": 1.686170764171877e-05, + "loss": 2.8556, + "step": 54651 + }, + { + "epoch": 2.68, + "grad_norm": 0.7354574203491211, + "learning_rate": 1.6856619059983423e-05, + "loss": 2.8432, + "step": 54652 + }, + { + "epoch": 2.68, + "grad_norm": 0.7067791819572449, + "learning_rate": 1.685153122399321e-05, + "loss": 2.976, + "step": 54653 + }, + { + "epoch": 2.68, + "grad_norm": 0.7321593165397644, + "learning_rate": 1.684644413376165e-05, + "loss": 2.8863, + "step": 54654 + }, + { + "epoch": 2.68, + "grad_norm": 0.7532573342323303, + "learning_rate": 1.6841357789302e-05, + "loss": 2.8874, + "step": 54655 + }, + { + "epoch": 2.68, + "grad_norm": 0.7408707141876221, + "learning_rate": 1.6836272190627787e-05, + "loss": 3.2351, + "step": 54656 + }, + { + "epoch": 2.68, + "grad_norm": 0.7293885946273804, + "learning_rate": 1.683118733775226e-05, + "loss": 2.8992, + "step": 54657 + }, + { + "epoch": 2.68, + "grad_norm": 0.7179583311080933, + "learning_rate": 1.682610323068895e-05, + "loss": 2.8923, + "step": 54658 + }, + { + "epoch": 2.68, + "grad_norm": 0.7808348536491394, + "learning_rate": 1.6821019869451167e-05, + "loss": 3.0609, + "step": 54659 + }, + { + "epoch": 2.68, + "grad_norm": 0.7644177675247192, + "learning_rate": 1.6815937254052314e-05, + "loss": 2.9049, + "step": 54660 + }, + { + "epoch": 2.68, + "grad_norm": 0.7662442326545715, + "learning_rate": 1.6810855384505804e-05, + "loss": 2.8631, + "step": 54661 + }, + { + "epoch": 2.68, + "grad_norm": 0.7450445890426636, + "learning_rate": 1.6805774260824968e-05, + "loss": 2.835, + "step": 54662 + }, + { + "epoch": 2.68, + "grad_norm": 0.7103903889656067, + "learning_rate": 1.6800693883023185e-05, + "loss": 2.7432, + "step": 54663 + }, + { + "epoch": 2.68, + "grad_norm": 0.7254341244697571, + "learning_rate": 1.6795614251113953e-05, + "loss": 3.0087, + "step": 54664 + }, + { + "epoch": 2.68, + "grad_norm": 0.7645226716995239, + "learning_rate": 1.6790535365110557e-05, + "loss": 2.6737, + "step": 54665 + }, + { + "epoch": 2.68, + "grad_norm": 0.7188624143600464, + "learning_rate": 1.6785457225026387e-05, + "loss": 2.6871, + "step": 54666 + }, + { + "epoch": 2.68, + "grad_norm": 0.7680506110191345, + "learning_rate": 1.678037983087477e-05, + "loss": 2.8942, + "step": 54667 + }, + { + "epoch": 2.68, + "grad_norm": 0.7478795051574707, + "learning_rate": 1.677530318266912e-05, + "loss": 2.8025, + "step": 54668 + }, + { + "epoch": 2.68, + "grad_norm": 0.7576417922973633, + "learning_rate": 1.6770227280422866e-05, + "loss": 2.8098, + "step": 54669 + }, + { + "epoch": 2.68, + "grad_norm": 0.7802990078926086, + "learning_rate": 1.6765152124149262e-05, + "loss": 2.8422, + "step": 54670 + }, + { + "epoch": 2.68, + "grad_norm": 0.7456069588661194, + "learning_rate": 1.6760077713861797e-05, + "loss": 2.8779, + "step": 54671 + }, + { + "epoch": 2.68, + "grad_norm": 0.7156590819358826, + "learning_rate": 1.675500404957376e-05, + "loss": 2.8347, + "step": 54672 + }, + { + "epoch": 2.68, + "grad_norm": 0.7358556389808655, + "learning_rate": 1.6749931131298545e-05, + "loss": 3.0632, + "step": 54673 + }, + { + "epoch": 2.68, + "grad_norm": 0.7578690648078918, + "learning_rate": 1.6744858959049502e-05, + "loss": 2.8503, + "step": 54674 + }, + { + "epoch": 2.68, + "grad_norm": 0.7345317602157593, + "learning_rate": 1.6739787532839922e-05, + "loss": 2.9099, + "step": 54675 + }, + { + "epoch": 2.68, + "grad_norm": 0.7033917903900146, + "learning_rate": 1.6734716852683295e-05, + "loss": 2.906, + "step": 54676 + }, + { + "epoch": 2.68, + "grad_norm": 0.6990805268287659, + "learning_rate": 1.6729646918592875e-05, + "loss": 2.9649, + "step": 54677 + }, + { + "epoch": 2.68, + "grad_norm": 0.7403311729431152, + "learning_rate": 1.6724577730582024e-05, + "loss": 2.6907, + "step": 54678 + }, + { + "epoch": 2.68, + "grad_norm": 0.770260214805603, + "learning_rate": 1.671950928866419e-05, + "loss": 2.8517, + "step": 54679 + }, + { + "epoch": 2.68, + "grad_norm": 0.7441731691360474, + "learning_rate": 1.6714441592852633e-05, + "loss": 3.0706, + "step": 54680 + }, + { + "epoch": 2.68, + "grad_norm": 0.7643983960151672, + "learning_rate": 1.670937464316071e-05, + "loss": 2.9851, + "step": 54681 + }, + { + "epoch": 2.68, + "grad_norm": 0.7585827708244324, + "learning_rate": 1.6704308439601745e-05, + "loss": 2.8638, + "step": 54682 + }, + { + "epoch": 2.68, + "grad_norm": 0.7482290863990784, + "learning_rate": 1.6699242982189088e-05, + "loss": 2.8541, + "step": 54683 + }, + { + "epoch": 2.68, + "grad_norm": 0.7718914151191711, + "learning_rate": 1.6694178270936167e-05, + "loss": 2.9846, + "step": 54684 + }, + { + "epoch": 2.68, + "grad_norm": 0.8131400346755981, + "learning_rate": 1.66891143058562e-05, + "loss": 2.7676, + "step": 54685 + }, + { + "epoch": 2.68, + "grad_norm": 0.7344086170196533, + "learning_rate": 1.6684051086962648e-05, + "loss": 2.9111, + "step": 54686 + }, + { + "epoch": 2.68, + "grad_norm": 0.7355520725250244, + "learning_rate": 1.6678988614268762e-05, + "loss": 3.0156, + "step": 54687 + }, + { + "epoch": 2.68, + "grad_norm": 0.7679204940795898, + "learning_rate": 1.667392688778787e-05, + "loss": 2.7147, + "step": 54688 + }, + { + "epoch": 2.68, + "grad_norm": 0.7418351173400879, + "learning_rate": 1.6668865907533356e-05, + "loss": 2.9869, + "step": 54689 + }, + { + "epoch": 2.68, + "grad_norm": 0.7173027992248535, + "learning_rate": 1.666380567351845e-05, + "loss": 2.8766, + "step": 54690 + }, + { + "epoch": 2.68, + "grad_norm": 0.7610350251197815, + "learning_rate": 1.6658746185756632e-05, + "loss": 3.1118, + "step": 54691 + }, + { + "epoch": 2.68, + "grad_norm": 0.7524661421775818, + "learning_rate": 1.66536874442611e-05, + "loss": 2.8634, + "step": 54692 + }, + { + "epoch": 2.68, + "grad_norm": 0.7599107623100281, + "learning_rate": 1.6648629449045204e-05, + "loss": 2.6746, + "step": 54693 + }, + { + "epoch": 2.68, + "grad_norm": 0.7229357361793518, + "learning_rate": 1.6643572200122336e-05, + "loss": 2.9954, + "step": 54694 + }, + { + "epoch": 2.68, + "grad_norm": 0.8016541600227356, + "learning_rate": 1.663851569750578e-05, + "loss": 2.8605, + "step": 54695 + }, + { + "epoch": 2.68, + "grad_norm": 0.7787773609161377, + "learning_rate": 1.663345994120884e-05, + "loss": 3.0242, + "step": 54696 + }, + { + "epoch": 2.68, + "grad_norm": 0.768181562423706, + "learning_rate": 1.6628404931244755e-05, + "loss": 2.7763, + "step": 54697 + }, + { + "epoch": 2.68, + "grad_norm": 0.7640612721443176, + "learning_rate": 1.6623350667626955e-05, + "loss": 2.9869, + "step": 54698 + }, + { + "epoch": 2.68, + "grad_norm": 0.7562863230705261, + "learning_rate": 1.6618297150368733e-05, + "loss": 2.7484, + "step": 54699 + }, + { + "epoch": 2.68, + "grad_norm": 0.755027711391449, + "learning_rate": 1.6613244379483336e-05, + "loss": 2.9133, + "step": 54700 + }, + { + "epoch": 2.68, + "grad_norm": 0.7754169702529907, + "learning_rate": 1.6608192354984162e-05, + "loss": 3.0882, + "step": 54701 + }, + { + "epoch": 2.68, + "grad_norm": 0.7277271151542664, + "learning_rate": 1.6603141076884495e-05, + "loss": 2.8785, + "step": 54702 + }, + { + "epoch": 2.68, + "grad_norm": 0.7051646113395691, + "learning_rate": 1.659809054519753e-05, + "loss": 2.8702, + "step": 54703 + }, + { + "epoch": 2.68, + "grad_norm": 0.7471274137496948, + "learning_rate": 1.6593040759936715e-05, + "loss": 2.8466, + "step": 54704 + }, + { + "epoch": 2.68, + "grad_norm": 0.7694349884986877, + "learning_rate": 1.6587991721115245e-05, + "loss": 2.6614, + "step": 54705 + }, + { + "epoch": 2.68, + "grad_norm": 0.7603945732116699, + "learning_rate": 1.658294342874651e-05, + "loss": 2.8695, + "step": 54706 + }, + { + "epoch": 2.68, + "grad_norm": 0.7334092259407043, + "learning_rate": 1.6577895882843693e-05, + "loss": 2.7742, + "step": 54707 + }, + { + "epoch": 2.68, + "grad_norm": 0.7664968371391296, + "learning_rate": 1.6572849083420225e-05, + "loss": 2.977, + "step": 54708 + }, + { + "epoch": 2.68, + "grad_norm": 0.7536723613739014, + "learning_rate": 1.6567803030489292e-05, + "loss": 2.8594, + "step": 54709 + }, + { + "epoch": 2.68, + "grad_norm": 0.7267215251922607, + "learning_rate": 1.656275772406418e-05, + "loss": 2.8231, + "step": 54710 + }, + { + "epoch": 2.68, + "grad_norm": 0.7520238161087036, + "learning_rate": 1.6557713164158283e-05, + "loss": 2.8955, + "step": 54711 + }, + { + "epoch": 2.68, + "grad_norm": 0.7576550245285034, + "learning_rate": 1.6552669350784753e-05, + "loss": 2.8592, + "step": 54712 + }, + { + "epoch": 2.68, + "grad_norm": 0.7513378262519836, + "learning_rate": 1.6547626283957016e-05, + "loss": 2.9951, + "step": 54713 + }, + { + "epoch": 2.68, + "grad_norm": 0.7380017638206482, + "learning_rate": 1.6542583963688195e-05, + "loss": 2.9589, + "step": 54714 + }, + { + "epoch": 2.68, + "grad_norm": 0.7709711194038391, + "learning_rate": 1.6537542389991676e-05, + "loss": 2.7643, + "step": 54715 + }, + { + "epoch": 2.68, + "grad_norm": 0.7406794428825378, + "learning_rate": 1.6532501562880784e-05, + "loss": 3.0594, + "step": 54716 + }, + { + "epoch": 2.68, + "grad_norm": 0.7626711130142212, + "learning_rate": 1.652746148236871e-05, + "loss": 2.7846, + "step": 54717 + }, + { + "epoch": 2.68, + "grad_norm": 0.7286007404327393, + "learning_rate": 1.6522422148468738e-05, + "loss": 3.0798, + "step": 54718 + }, + { + "epoch": 2.68, + "grad_norm": 0.740604817867279, + "learning_rate": 1.65173835611941e-05, + "loss": 2.8224, + "step": 54719 + }, + { + "epoch": 2.68, + "grad_norm": 0.8206152319908142, + "learning_rate": 1.651234572055814e-05, + "loss": 2.9789, + "step": 54720 + }, + { + "epoch": 2.68, + "grad_norm": 0.7075120210647583, + "learning_rate": 1.6507308626574122e-05, + "loss": 2.9661, + "step": 54721 + }, + { + "epoch": 2.68, + "grad_norm": 0.775321364402771, + "learning_rate": 1.650227227925527e-05, + "loss": 2.9403, + "step": 54722 + }, + { + "epoch": 2.68, + "grad_norm": 0.7323336005210876, + "learning_rate": 1.64972366786149e-05, + "loss": 2.9916, + "step": 54723 + }, + { + "epoch": 2.68, + "grad_norm": 0.7589771151542664, + "learning_rate": 1.6492201824666274e-05, + "loss": 2.7981, + "step": 54724 + }, + { + "epoch": 2.68, + "grad_norm": 0.7782710194587708, + "learning_rate": 1.6487167717422544e-05, + "loss": 2.9767, + "step": 54725 + }, + { + "epoch": 2.68, + "grad_norm": 0.7076034545898438, + "learning_rate": 1.6482134356897138e-05, + "loss": 2.6416, + "step": 54726 + }, + { + "epoch": 2.68, + "grad_norm": 0.7385815382003784, + "learning_rate": 1.6477101743103138e-05, + "loss": 2.7569, + "step": 54727 + }, + { + "epoch": 2.68, + "grad_norm": 0.7414993047714233, + "learning_rate": 1.6472069876053973e-05, + "loss": 2.775, + "step": 54728 + }, + { + "epoch": 2.68, + "grad_norm": 0.7520781755447388, + "learning_rate": 1.646703875576273e-05, + "loss": 2.914, + "step": 54729 + }, + { + "epoch": 2.68, + "grad_norm": 0.7415902614593506, + "learning_rate": 1.64620083822428e-05, + "loss": 2.8865, + "step": 54730 + }, + { + "epoch": 2.68, + "grad_norm": 0.7842666506767273, + "learning_rate": 1.6456978755507366e-05, + "loss": 3.0898, + "step": 54731 + }, + { + "epoch": 2.68, + "grad_norm": 0.7630308866500854, + "learning_rate": 1.6451949875569658e-05, + "loss": 2.8664, + "step": 54732 + }, + { + "epoch": 2.68, + "grad_norm": 0.7267240285873413, + "learning_rate": 1.6446921742442963e-05, + "loss": 2.9599, + "step": 54733 + }, + { + "epoch": 2.68, + "grad_norm": 0.7458666563034058, + "learning_rate": 1.6441894356140474e-05, + "loss": 2.6751, + "step": 54734 + }, + { + "epoch": 2.68, + "grad_norm": 0.7139987349510193, + "learning_rate": 1.6436867716675473e-05, + "loss": 2.7982, + "step": 54735 + }, + { + "epoch": 2.68, + "grad_norm": 0.7688502073287964, + "learning_rate": 1.643184182406122e-05, + "loss": 2.8371, + "step": 54736 + }, + { + "epoch": 2.68, + "grad_norm": 0.7603595852851868, + "learning_rate": 1.6426816678310873e-05, + "loss": 3.0501, + "step": 54737 + }, + { + "epoch": 2.68, + "grad_norm": 0.7536769509315491, + "learning_rate": 1.6421792279437752e-05, + "loss": 2.9334, + "step": 54738 + }, + { + "epoch": 2.68, + "grad_norm": 0.7776528000831604, + "learning_rate": 1.641676862745508e-05, + "loss": 2.8698, + "step": 54739 + }, + { + "epoch": 2.68, + "grad_norm": 0.7579948902130127, + "learning_rate": 1.6411745722376014e-05, + "loss": 2.7414, + "step": 54740 + }, + { + "epoch": 2.68, + "grad_norm": 0.7683404684066772, + "learning_rate": 1.6406723564213843e-05, + "loss": 2.9881, + "step": 54741 + }, + { + "epoch": 2.68, + "grad_norm": 0.7273842692375183, + "learning_rate": 1.640170215298179e-05, + "loss": 2.7239, + "step": 54742 + }, + { + "epoch": 2.68, + "grad_norm": 0.7698961496353149, + "learning_rate": 1.6396681488693077e-05, + "loss": 2.672, + "step": 54743 + }, + { + "epoch": 2.68, + "grad_norm": 0.7344798445701599, + "learning_rate": 1.639166157136089e-05, + "loss": 2.9347, + "step": 54744 + }, + { + "epoch": 2.68, + "grad_norm": 0.7438050508499146, + "learning_rate": 1.638664240099856e-05, + "loss": 2.9031, + "step": 54745 + }, + { + "epoch": 2.68, + "grad_norm": 0.7391680479049683, + "learning_rate": 1.6381623977619206e-05, + "loss": 3.0264, + "step": 54746 + }, + { + "epoch": 2.68, + "grad_norm": 0.7523117661476135, + "learning_rate": 1.6376606301236017e-05, + "loss": 2.9136, + "step": 54747 + }, + { + "epoch": 2.68, + "grad_norm": 0.7371009588241577, + "learning_rate": 1.6371589371862315e-05, + "loss": 3.0831, + "step": 54748 + }, + { + "epoch": 2.68, + "grad_norm": 0.718420147895813, + "learning_rate": 1.6366573189511224e-05, + "loss": 2.6528, + "step": 54749 + }, + { + "epoch": 2.68, + "grad_norm": 0.7344409227371216, + "learning_rate": 1.6361557754195997e-05, + "loss": 2.9695, + "step": 54750 + }, + { + "epoch": 2.68, + "grad_norm": 0.7529169321060181, + "learning_rate": 1.6356543065929862e-05, + "loss": 2.8281, + "step": 54751 + }, + { + "epoch": 2.68, + "grad_norm": 0.7365326881408691, + "learning_rate": 1.6351529124726005e-05, + "loss": 2.85, + "step": 54752 + }, + { + "epoch": 2.68, + "grad_norm": 0.7705813050270081, + "learning_rate": 1.6346515930597648e-05, + "loss": 3.0134, + "step": 54753 + }, + { + "epoch": 2.68, + "grad_norm": 0.7423902750015259, + "learning_rate": 1.634150348355795e-05, + "loss": 2.9457, + "step": 54754 + }, + { + "epoch": 2.68, + "grad_norm": 0.7638031244277954, + "learning_rate": 1.6336491783620097e-05, + "loss": 2.8813, + "step": 54755 + }, + { + "epoch": 2.68, + "grad_norm": 0.7596058249473572, + "learning_rate": 1.6331480830797416e-05, + "loss": 2.9061, + "step": 54756 + }, + { + "epoch": 2.68, + "grad_norm": 0.7403856515884399, + "learning_rate": 1.632647062510296e-05, + "loss": 3.0641, + "step": 54757 + }, + { + "epoch": 2.68, + "grad_norm": 0.7983343005180359, + "learning_rate": 1.632146116655002e-05, + "loss": 2.8901, + "step": 54758 + }, + { + "epoch": 2.68, + "grad_norm": 0.7619073987007141, + "learning_rate": 1.631645245515172e-05, + "loss": 2.8923, + "step": 54759 + }, + { + "epoch": 2.68, + "grad_norm": 0.7451263666152954, + "learning_rate": 1.6311444490921344e-05, + "loss": 2.9146, + "step": 54760 + }, + { + "epoch": 2.68, + "grad_norm": 0.7346713542938232, + "learning_rate": 1.6306437273872018e-05, + "loss": 2.8883, + "step": 54761 + }, + { + "epoch": 2.68, + "grad_norm": 0.7482584118843079, + "learning_rate": 1.6301430804016902e-05, + "loss": 2.8736, + "step": 54762 + }, + { + "epoch": 2.68, + "grad_norm": 0.7432484030723572, + "learning_rate": 1.6296425081369248e-05, + "loss": 2.9137, + "step": 54763 + }, + { + "epoch": 2.68, + "grad_norm": 0.7410925030708313, + "learning_rate": 1.6291420105942178e-05, + "loss": 2.7335, + "step": 54764 + }, + { + "epoch": 2.68, + "grad_norm": 0.7755419015884399, + "learning_rate": 1.6286415877748915e-05, + "loss": 2.6624, + "step": 54765 + }, + { + "epoch": 2.68, + "grad_norm": 0.7352432608604431, + "learning_rate": 1.628141239680265e-05, + "loss": 2.8728, + "step": 54766 + }, + { + "epoch": 2.68, + "grad_norm": 0.7407886385917664, + "learning_rate": 1.627640966311654e-05, + "loss": 2.9961, + "step": 54767 + }, + { + "epoch": 2.68, + "grad_norm": 0.7732663750648499, + "learning_rate": 1.6271407676703773e-05, + "loss": 2.774, + "step": 54768 + }, + { + "epoch": 2.68, + "grad_norm": 0.8005004525184631, + "learning_rate": 1.6266406437577506e-05, + "loss": 3.0023, + "step": 54769 + }, + { + "epoch": 2.68, + "grad_norm": 0.7332467436790466, + "learning_rate": 1.626140594575086e-05, + "loss": 3.1547, + "step": 54770 + }, + { + "epoch": 2.68, + "grad_norm": 0.7251038551330566, + "learning_rate": 1.625640620123716e-05, + "loss": 2.958, + "step": 54771 + }, + { + "epoch": 2.68, + "grad_norm": 0.7420737743377686, + "learning_rate": 1.6251407204049394e-05, + "loss": 2.9342, + "step": 54772 + }, + { + "epoch": 2.68, + "grad_norm": 0.7631248831748962, + "learning_rate": 1.6246408954200883e-05, + "loss": 2.882, + "step": 54773 + }, + { + "epoch": 2.68, + "grad_norm": 0.7106769680976868, + "learning_rate": 1.6241411451704656e-05, + "loss": 2.754, + "step": 54774 + }, + { + "epoch": 2.68, + "grad_norm": 0.77427077293396, + "learning_rate": 1.6236414696573994e-05, + "loss": 3.0064, + "step": 54775 + }, + { + "epoch": 2.68, + "grad_norm": 0.7603651285171509, + "learning_rate": 1.6231418688821997e-05, + "loss": 2.6856, + "step": 54776 + }, + { + "epoch": 2.68, + "grad_norm": 0.7737959623336792, + "learning_rate": 1.622642342846181e-05, + "loss": 2.7616, + "step": 54777 + }, + { + "epoch": 2.68, + "grad_norm": 0.7463186979293823, + "learning_rate": 1.622142891550663e-05, + "loss": 2.7421, + "step": 54778 + }, + { + "epoch": 2.68, + "grad_norm": 0.7298126816749573, + "learning_rate": 1.6216435149969574e-05, + "loss": 2.9434, + "step": 54779 + }, + { + "epoch": 2.68, + "grad_norm": 0.7219801545143127, + "learning_rate": 1.6211442131863772e-05, + "loss": 2.9801, + "step": 54780 + }, + { + "epoch": 2.68, + "grad_norm": 0.7264379858970642, + "learning_rate": 1.620644986120251e-05, + "loss": 2.9004, + "step": 54781 + }, + { + "epoch": 2.68, + "grad_norm": 0.7708029747009277, + "learning_rate": 1.6201458337998807e-05, + "loss": 2.9669, + "step": 54782 + }, + { + "epoch": 2.68, + "grad_norm": 0.7728287577629089, + "learning_rate": 1.6196467562265857e-05, + "loss": 2.7538, + "step": 54783 + }, + { + "epoch": 2.68, + "grad_norm": 0.7423531413078308, + "learning_rate": 1.619147753401675e-05, + "loss": 2.9775, + "step": 54784 + }, + { + "epoch": 2.68, + "grad_norm": 0.7586064338684082, + "learning_rate": 1.6186488253264707e-05, + "loss": 3.0552, + "step": 54785 + }, + { + "epoch": 2.68, + "grad_norm": 0.7348181009292603, + "learning_rate": 1.6181499720022784e-05, + "loss": 2.7265, + "step": 54786 + }, + { + "epoch": 2.68, + "grad_norm": 0.7246251702308655, + "learning_rate": 1.617651193430417e-05, + "loss": 2.7787, + "step": 54787 + }, + { + "epoch": 2.69, + "grad_norm": 0.7123945355415344, + "learning_rate": 1.6171524896122056e-05, + "loss": 2.9102, + "step": 54788 + }, + { + "epoch": 2.69, + "grad_norm": 0.7603493332862854, + "learning_rate": 1.6166538605489497e-05, + "loss": 2.6584, + "step": 54789 + }, + { + "epoch": 2.69, + "grad_norm": 0.7190308570861816, + "learning_rate": 1.6161553062419687e-05, + "loss": 2.8735, + "step": 54790 + }, + { + "epoch": 2.69, + "grad_norm": 0.7607088088989258, + "learning_rate": 1.615656826692564e-05, + "loss": 2.9559, + "step": 54791 + }, + { + "epoch": 2.69, + "grad_norm": 0.7394618391990662, + "learning_rate": 1.6151584219020585e-05, + "loss": 2.9037, + "step": 54792 + }, + { + "epoch": 2.69, + "grad_norm": 0.7496686577796936, + "learning_rate": 1.6146600918717676e-05, + "loss": 2.8412, + "step": 54793 + }, + { + "epoch": 2.69, + "grad_norm": 0.7344440817832947, + "learning_rate": 1.6141618366029974e-05, + "loss": 2.8324, + "step": 54794 + }, + { + "epoch": 2.69, + "grad_norm": 0.7307662963867188, + "learning_rate": 1.6136636560970628e-05, + "loss": 2.7831, + "step": 54795 + }, + { + "epoch": 2.69, + "grad_norm": 0.7606189250946045, + "learning_rate": 1.61316555035527e-05, + "loss": 2.9232, + "step": 54796 + }, + { + "epoch": 2.69, + "grad_norm": 0.7663336992263794, + "learning_rate": 1.6126675193789406e-05, + "loss": 2.7469, + "step": 54797 + }, + { + "epoch": 2.69, + "grad_norm": 0.7470273375511169, + "learning_rate": 1.6121695631693843e-05, + "loss": 2.819, + "step": 54798 + }, + { + "epoch": 2.69, + "grad_norm": 0.7757609486579895, + "learning_rate": 1.611671681727903e-05, + "loss": 2.9269, + "step": 54799 + }, + { + "epoch": 2.69, + "grad_norm": 0.6888236999511719, + "learning_rate": 1.6111738750558223e-05, + "loss": 2.8834, + "step": 54800 + }, + { + "epoch": 2.69, + "grad_norm": 0.7634980082511902, + "learning_rate": 1.610676143154438e-05, + "loss": 3.0092, + "step": 54801 + }, + { + "epoch": 2.69, + "grad_norm": 0.7457658648490906, + "learning_rate": 1.6101784860250722e-05, + "loss": 2.8261, + "step": 54802 + }, + { + "epoch": 2.69, + "grad_norm": 0.7501567602157593, + "learning_rate": 1.6096809036690373e-05, + "loss": 2.6977, + "step": 54803 + }, + { + "epoch": 2.69, + "grad_norm": 0.771567165851593, + "learning_rate": 1.6091833960876355e-05, + "loss": 3.0102, + "step": 54804 + }, + { + "epoch": 2.69, + "grad_norm": 0.7606413960456848, + "learning_rate": 1.6086859632821826e-05, + "loss": 3.0169, + "step": 54805 + }, + { + "epoch": 2.69, + "grad_norm": 0.7576195001602173, + "learning_rate": 1.608188605253984e-05, + "loss": 2.7713, + "step": 54806 + }, + { + "epoch": 2.69, + "grad_norm": 0.7587432861328125, + "learning_rate": 1.6076913220043487e-05, + "loss": 2.7597, + "step": 54807 + }, + { + "epoch": 2.69, + "grad_norm": 0.7228325605392456, + "learning_rate": 1.607194113534599e-05, + "loss": 3.0178, + "step": 54808 + }, + { + "epoch": 2.69, + "grad_norm": 0.7396401166915894, + "learning_rate": 1.6066969798460306e-05, + "loss": 2.9906, + "step": 54809 + }, + { + "epoch": 2.69, + "grad_norm": 0.744340181350708, + "learning_rate": 1.606199920939959e-05, + "loss": 2.8594, + "step": 54810 + }, + { + "epoch": 2.69, + "grad_norm": 0.7660406827926636, + "learning_rate": 1.6057029368176933e-05, + "loss": 2.9305, + "step": 54811 + }, + { + "epoch": 2.69, + "grad_norm": 0.7232922911643982, + "learning_rate": 1.6052060274805388e-05, + "loss": 2.6945, + "step": 54812 + }, + { + "epoch": 2.69, + "grad_norm": 0.7410985827445984, + "learning_rate": 1.6047091929298118e-05, + "loss": 2.826, + "step": 54813 + }, + { + "epoch": 2.69, + "grad_norm": 0.7533641457557678, + "learning_rate": 1.6042124331668106e-05, + "loss": 2.9354, + "step": 54814 + }, + { + "epoch": 2.69, + "grad_norm": 0.8169004321098328, + "learning_rate": 1.6037157481928542e-05, + "loss": 2.8995, + "step": 54815 + }, + { + "epoch": 2.69, + "grad_norm": 0.7725902199745178, + "learning_rate": 1.6032191380092385e-05, + "loss": 2.8986, + "step": 54816 + }, + { + "epoch": 2.69, + "grad_norm": 0.7658239603042603, + "learning_rate": 1.6027226026172822e-05, + "loss": 2.7877, + "step": 54817 + }, + { + "epoch": 2.69, + "grad_norm": 0.7260845899581909, + "learning_rate": 1.6022261420182915e-05, + "loss": 2.7825, + "step": 54818 + }, + { + "epoch": 2.69, + "grad_norm": 0.7706580758094788, + "learning_rate": 1.601729756213571e-05, + "loss": 2.6989, + "step": 54819 + }, + { + "epoch": 2.69, + "grad_norm": 0.7238854169845581, + "learning_rate": 1.6012334452044305e-05, + "loss": 2.9405, + "step": 54820 + }, + { + "epoch": 2.69, + "grad_norm": 0.8242838382720947, + "learning_rate": 1.600737208992172e-05, + "loss": 2.927, + "step": 54821 + }, + { + "epoch": 2.69, + "grad_norm": 0.7421150803565979, + "learning_rate": 1.6002410475781048e-05, + "loss": 2.8967, + "step": 54822 + }, + { + "epoch": 2.69, + "grad_norm": 0.7626473903656006, + "learning_rate": 1.5997449609635404e-05, + "loss": 2.8926, + "step": 54823 + }, + { + "epoch": 2.69, + "grad_norm": 0.7503937482833862, + "learning_rate": 1.5992489491497783e-05, + "loss": 2.6612, + "step": 54824 + }, + { + "epoch": 2.69, + "grad_norm": 0.8117401599884033, + "learning_rate": 1.5987530121381344e-05, + "loss": 2.6266, + "step": 54825 + }, + { + "epoch": 2.69, + "grad_norm": 0.7335777282714844, + "learning_rate": 1.59825714992991e-05, + "loss": 2.9657, + "step": 54826 + }, + { + "epoch": 2.69, + "grad_norm": 0.8089480400085449, + "learning_rate": 1.5977613625264018e-05, + "loss": 2.5981, + "step": 54827 + }, + { + "epoch": 2.69, + "grad_norm": 0.7682913541793823, + "learning_rate": 1.5972656499289317e-05, + "loss": 2.8463, + "step": 54828 + }, + { + "epoch": 2.69, + "grad_norm": 0.7823068499565125, + "learning_rate": 1.596770012138795e-05, + "loss": 3.0189, + "step": 54829 + }, + { + "epoch": 2.69, + "grad_norm": 0.7197554707527161, + "learning_rate": 1.596274449157301e-05, + "loss": 2.952, + "step": 54830 + }, + { + "epoch": 2.69, + "grad_norm": 0.8423383831977844, + "learning_rate": 1.5957789609857486e-05, + "loss": 2.9469, + "step": 54831 + }, + { + "epoch": 2.69, + "grad_norm": 0.742664635181427, + "learning_rate": 1.5952835476254534e-05, + "loss": 2.9605, + "step": 54832 + }, + { + "epoch": 2.69, + "grad_norm": 0.7450295090675354, + "learning_rate": 1.5947882090777142e-05, + "loss": 3.0918, + "step": 54833 + }, + { + "epoch": 2.69, + "grad_norm": 0.7666531801223755, + "learning_rate": 1.5942929453438335e-05, + "loss": 2.7747, + "step": 54834 + }, + { + "epoch": 2.69, + "grad_norm": 0.7433580756187439, + "learning_rate": 1.5937977564251238e-05, + "loss": 2.8439, + "step": 54835 + }, + { + "epoch": 2.69, + "grad_norm": 0.7616721391677856, + "learning_rate": 1.5933026423228766e-05, + "loss": 2.9655, + "step": 54836 + }, + { + "epoch": 2.69, + "grad_norm": 0.7969316840171814, + "learning_rate": 1.5928076030384053e-05, + "loss": 2.7916, + "step": 54837 + }, + { + "epoch": 2.69, + "grad_norm": 0.7451574206352234, + "learning_rate": 1.5923126385730145e-05, + "loss": 2.7925, + "step": 54838 + }, + { + "epoch": 2.69, + "grad_norm": 0.7240291833877563, + "learning_rate": 1.5918177489280003e-05, + "loss": 3.0301, + "step": 54839 + }, + { + "epoch": 2.69, + "grad_norm": 0.7478523254394531, + "learning_rate": 1.5913229341046752e-05, + "loss": 2.9261, + "step": 54840 + }, + { + "epoch": 2.69, + "grad_norm": 0.715190589427948, + "learning_rate": 1.590828194104341e-05, + "loss": 2.8159, + "step": 54841 + }, + { + "epoch": 2.69, + "grad_norm": 0.7498929500579834, + "learning_rate": 1.5903335289282903e-05, + "loss": 2.826, + "step": 54842 + }, + { + "epoch": 2.69, + "grad_norm": 0.7427136301994324, + "learning_rate": 1.5898389385778388e-05, + "loss": 3.1507, + "step": 54843 + }, + { + "epoch": 2.69, + "grad_norm": 0.7392913699150085, + "learning_rate": 1.589344423054282e-05, + "loss": 2.8675, + "step": 54844 + }, + { + "epoch": 2.69, + "grad_norm": 0.7416170835494995, + "learning_rate": 1.588849982358925e-05, + "loss": 2.9029, + "step": 54845 + }, + { + "epoch": 2.69, + "grad_norm": 0.767384946346283, + "learning_rate": 1.588355616493068e-05, + "loss": 2.8468, + "step": 54846 + }, + { + "epoch": 2.69, + "grad_norm": 0.8128290176391602, + "learning_rate": 1.5878613254580153e-05, + "loss": 2.8221, + "step": 54847 + }, + { + "epoch": 2.69, + "grad_norm": 0.7421417832374573, + "learning_rate": 1.58736710925507e-05, + "loss": 2.8124, + "step": 54848 + }, + { + "epoch": 2.69, + "grad_norm": 0.7256699800491333, + "learning_rate": 1.5868729678855276e-05, + "loss": 2.8736, + "step": 54849 + }, + { + "epoch": 2.69, + "grad_norm": 0.7774531841278076, + "learning_rate": 1.5863789013506966e-05, + "loss": 3.0403, + "step": 54850 + }, + { + "epoch": 2.69, + "grad_norm": 0.7691476941108704, + "learning_rate": 1.5858849096518732e-05, + "loss": 2.8938, + "step": 54851 + }, + { + "epoch": 2.69, + "grad_norm": 0.8459373712539673, + "learning_rate": 1.5853909927903596e-05, + "loss": 2.6245, + "step": 54852 + }, + { + "epoch": 2.69, + "grad_norm": 0.7290185689926147, + "learning_rate": 1.5848971507674612e-05, + "loss": 2.8709, + "step": 54853 + }, + { + "epoch": 2.69, + "grad_norm": 0.7624875903129578, + "learning_rate": 1.584403383584474e-05, + "loss": 2.9471, + "step": 54854 + }, + { + "epoch": 2.69, + "grad_norm": 0.7604098320007324, + "learning_rate": 1.583909691242703e-05, + "loss": 2.7269, + "step": 54855 + }, + { + "epoch": 2.69, + "grad_norm": 0.7130759954452515, + "learning_rate": 1.5834160737434377e-05, + "loss": 2.8143, + "step": 54856 + }, + { + "epoch": 2.69, + "grad_norm": 0.7458640933036804, + "learning_rate": 1.5829225310879868e-05, + "loss": 2.9933, + "step": 54857 + }, + { + "epoch": 2.69, + "grad_norm": 0.7971216440200806, + "learning_rate": 1.5824290632776525e-05, + "loss": 2.9175, + "step": 54858 + }, + { + "epoch": 2.69, + "grad_norm": 0.743493378162384, + "learning_rate": 1.5819356703137275e-05, + "loss": 2.7029, + "step": 54859 + }, + { + "epoch": 2.69, + "grad_norm": 0.7640153765678406, + "learning_rate": 1.5814423521975172e-05, + "loss": 2.829, + "step": 54860 + }, + { + "epoch": 2.69, + "grad_norm": 0.7117670774459839, + "learning_rate": 1.5809491089303172e-05, + "loss": 2.9363, + "step": 54861 + }, + { + "epoch": 2.69, + "grad_norm": 0.7635127305984497, + "learning_rate": 1.58045594051343e-05, + "loss": 2.9184, + "step": 54862 + }, + { + "epoch": 2.69, + "grad_norm": 0.7638167142868042, + "learning_rate": 1.579962846948154e-05, + "loss": 2.9043, + "step": 54863 + }, + { + "epoch": 2.69, + "grad_norm": 0.7500227093696594, + "learning_rate": 1.579469828235782e-05, + "loss": 2.999, + "step": 54864 + }, + { + "epoch": 2.69, + "grad_norm": 0.75905442237854, + "learning_rate": 1.5789768843776195e-05, + "loss": 2.7478, + "step": 54865 + }, + { + "epoch": 2.69, + "grad_norm": 0.7460845112800598, + "learning_rate": 1.578484015374959e-05, + "loss": 2.9451, + "step": 54866 + }, + { + "epoch": 2.69, + "grad_norm": 0.7751177549362183, + "learning_rate": 1.577991221229106e-05, + "loss": 2.8939, + "step": 54867 + }, + { + "epoch": 2.69, + "grad_norm": 0.7179179191589355, + "learning_rate": 1.577498501941349e-05, + "loss": 2.9286, + "step": 54868 + }, + { + "epoch": 2.69, + "grad_norm": 0.7725952863693237, + "learning_rate": 1.5770058575129972e-05, + "loss": 2.89, + "step": 54869 + }, + { + "epoch": 2.69, + "grad_norm": 0.7968480587005615, + "learning_rate": 1.57651328794534e-05, + "loss": 2.9152, + "step": 54870 + }, + { + "epoch": 2.69, + "grad_norm": 0.7715826630592346, + "learning_rate": 1.576020793239673e-05, + "loss": 2.9582, + "step": 54871 + }, + { + "epoch": 2.69, + "grad_norm": 0.7265507578849792, + "learning_rate": 1.575528373397301e-05, + "loss": 2.8741, + "step": 54872 + }, + { + "epoch": 2.69, + "grad_norm": 0.7969720363616943, + "learning_rate": 1.5750360284195107e-05, + "loss": 3.063, + "step": 54873 + }, + { + "epoch": 2.69, + "grad_norm": 0.7510287761688232, + "learning_rate": 1.574543758307607e-05, + "loss": 2.8564, + "step": 54874 + }, + { + "epoch": 2.69, + "grad_norm": 0.7693957090377808, + "learning_rate": 1.574051563062889e-05, + "loss": 2.9187, + "step": 54875 + }, + { + "epoch": 2.69, + "grad_norm": 0.8029640316963196, + "learning_rate": 1.573559442686646e-05, + "loss": 2.7687, + "step": 54876 + }, + { + "epoch": 2.69, + "grad_norm": 0.7065871953964233, + "learning_rate": 1.5730673971801833e-05, + "loss": 2.8054, + "step": 54877 + }, + { + "epoch": 2.69, + "grad_norm": 0.7508147358894348, + "learning_rate": 1.5725754265447798e-05, + "loss": 3.1097, + "step": 54878 + }, + { + "epoch": 2.69, + "grad_norm": 0.7067500352859497, + "learning_rate": 1.5720835307817448e-05, + "loss": 3.004, + "step": 54879 + }, + { + "epoch": 2.69, + "grad_norm": 0.7348147034645081, + "learning_rate": 1.5715917098923737e-05, + "loss": 3.0635, + "step": 54880 + }, + { + "epoch": 2.69, + "grad_norm": 0.7430323362350464, + "learning_rate": 1.571099963877952e-05, + "loss": 3.0832, + "step": 54881 + }, + { + "epoch": 2.69, + "grad_norm": 0.7505277395248413, + "learning_rate": 1.570608292739789e-05, + "loss": 3.0346, + "step": 54882 + }, + { + "epoch": 2.69, + "grad_norm": 0.7366390824317932, + "learning_rate": 1.5701166964791667e-05, + "loss": 3.0246, + "step": 54883 + }, + { + "epoch": 2.69, + "grad_norm": 0.7447764277458191, + "learning_rate": 1.569625175097391e-05, + "loss": 2.9516, + "step": 54884 + }, + { + "epoch": 2.69, + "grad_norm": 0.7516607046127319, + "learning_rate": 1.5691337285957504e-05, + "loss": 2.9391, + "step": 54885 + }, + { + "epoch": 2.69, + "grad_norm": 0.7200809121131897, + "learning_rate": 1.568642356975538e-05, + "loss": 3.0, + "step": 54886 + }, + { + "epoch": 2.69, + "grad_norm": 0.7547498941421509, + "learning_rate": 1.568151060238052e-05, + "loss": 2.9319, + "step": 54887 + }, + { + "epoch": 2.69, + "grad_norm": 0.7287806868553162, + "learning_rate": 1.5676598383845817e-05, + "loss": 2.8652, + "step": 54888 + }, + { + "epoch": 2.69, + "grad_norm": 0.756786048412323, + "learning_rate": 1.5671686914164194e-05, + "loss": 3.0127, + "step": 54889 + }, + { + "epoch": 2.69, + "grad_norm": 0.7761844992637634, + "learning_rate": 1.5666776193348706e-05, + "loss": 2.817, + "step": 54890 + }, + { + "epoch": 2.69, + "grad_norm": 0.7763392925262451, + "learning_rate": 1.566186622141221e-05, + "loss": 2.9292, + "step": 54891 + }, + { + "epoch": 2.69, + "grad_norm": 0.7687212824821472, + "learning_rate": 1.5656956998367596e-05, + "loss": 3.0882, + "step": 54892 + }, + { + "epoch": 2.69, + "grad_norm": 0.7391537427902222, + "learning_rate": 1.5652048524227857e-05, + "loss": 3.0195, + "step": 54893 + }, + { + "epoch": 2.69, + "grad_norm": 0.7586456537246704, + "learning_rate": 1.564714079900584e-05, + "loss": 2.8656, + "step": 54894 + }, + { + "epoch": 2.69, + "grad_norm": 0.7237671613693237, + "learning_rate": 1.5642233822714612e-05, + "loss": 3.0369, + "step": 54895 + }, + { + "epoch": 2.69, + "grad_norm": 0.7231841683387756, + "learning_rate": 1.5637327595366954e-05, + "loss": 2.9005, + "step": 54896 + }, + { + "epoch": 2.69, + "grad_norm": 0.7463599443435669, + "learning_rate": 1.5632422116975863e-05, + "loss": 2.8008, + "step": 54897 + }, + { + "epoch": 2.69, + "grad_norm": 0.7573348879814148, + "learning_rate": 1.5627517387554256e-05, + "loss": 2.8954, + "step": 54898 + }, + { + "epoch": 2.69, + "grad_norm": 0.7449231147766113, + "learning_rate": 1.562261340711506e-05, + "loss": 2.8526, + "step": 54899 + }, + { + "epoch": 2.69, + "grad_norm": 0.7895654439926147, + "learning_rate": 1.5617710175671162e-05, + "loss": 2.8813, + "step": 54900 + }, + { + "epoch": 2.69, + "grad_norm": 0.7453767657279968, + "learning_rate": 1.561280769323545e-05, + "loss": 2.8936, + "step": 54901 + }, + { + "epoch": 2.69, + "grad_norm": 0.7513246536254883, + "learning_rate": 1.5607905959820887e-05, + "loss": 2.9529, + "step": 54902 + }, + { + "epoch": 2.69, + "grad_norm": 0.752334475517273, + "learning_rate": 1.5603004975440357e-05, + "loss": 2.9262, + "step": 54903 + }, + { + "epoch": 2.69, + "grad_norm": 0.7545332312583923, + "learning_rate": 1.5598104740106788e-05, + "loss": 2.6894, + "step": 54904 + }, + { + "epoch": 2.69, + "grad_norm": 0.7315773963928223, + "learning_rate": 1.5593205253833096e-05, + "loss": 3.0838, + "step": 54905 + }, + { + "epoch": 2.69, + "grad_norm": 0.7586761713027954, + "learning_rate": 1.5588306516632175e-05, + "loss": 2.8601, + "step": 54906 + }, + { + "epoch": 2.69, + "grad_norm": 0.7025976181030273, + "learning_rate": 1.5583408528516916e-05, + "loss": 2.8994, + "step": 54907 + }, + { + "epoch": 2.69, + "grad_norm": 0.7706955075263977, + "learning_rate": 1.5578511289500174e-05, + "loss": 2.8067, + "step": 54908 + }, + { + "epoch": 2.69, + "grad_norm": 0.7359408736228943, + "learning_rate": 1.5573614799594902e-05, + "loss": 2.7985, + "step": 54909 + }, + { + "epoch": 2.69, + "grad_norm": 0.7297497391700745, + "learning_rate": 1.556871905881403e-05, + "loss": 2.9224, + "step": 54910 + }, + { + "epoch": 2.69, + "grad_norm": 0.7188913822174072, + "learning_rate": 1.556382406717037e-05, + "loss": 2.9255, + "step": 54911 + }, + { + "epoch": 2.69, + "grad_norm": 0.7384754419326782, + "learning_rate": 1.5558929824676892e-05, + "loss": 2.882, + "step": 54912 + }, + { + "epoch": 2.69, + "grad_norm": 0.7281101942062378, + "learning_rate": 1.5554036331346444e-05, + "loss": 3.1627, + "step": 54913 + }, + { + "epoch": 2.69, + "grad_norm": 0.7926753163337708, + "learning_rate": 1.5549143587191914e-05, + "loss": 3.0256, + "step": 54914 + }, + { + "epoch": 2.69, + "grad_norm": 0.7407711148262024, + "learning_rate": 1.5544251592226198e-05, + "loss": 3.0445, + "step": 54915 + }, + { + "epoch": 2.69, + "grad_norm": 0.7497155666351318, + "learning_rate": 1.553936034646218e-05, + "loss": 3.0041, + "step": 54916 + }, + { + "epoch": 2.69, + "grad_norm": 0.7246167063713074, + "learning_rate": 1.5534469849912755e-05, + "loss": 2.8765, + "step": 54917 + }, + { + "epoch": 2.69, + "grad_norm": 0.7616339325904846, + "learning_rate": 1.5529580102590743e-05, + "loss": 2.975, + "step": 54918 + }, + { + "epoch": 2.69, + "grad_norm": 0.7942472696304321, + "learning_rate": 1.552469110450907e-05, + "loss": 2.9639, + "step": 54919 + }, + { + "epoch": 2.69, + "grad_norm": 0.7623060345649719, + "learning_rate": 1.5519802855680685e-05, + "loss": 2.798, + "step": 54920 + }, + { + "epoch": 2.69, + "grad_norm": 0.7489006519317627, + "learning_rate": 1.551491535611835e-05, + "loss": 2.7875, + "step": 54921 + }, + { + "epoch": 2.69, + "grad_norm": 0.7411266565322876, + "learning_rate": 1.5510028605834988e-05, + "loss": 2.9106, + "step": 54922 + }, + { + "epoch": 2.69, + "grad_norm": 0.7480968832969666, + "learning_rate": 1.5505142604843422e-05, + "loss": 2.9138, + "step": 54923 + }, + { + "epoch": 2.69, + "grad_norm": 0.7965890765190125, + "learning_rate": 1.5500257353156574e-05, + "loss": 2.8865, + "step": 54924 + }, + { + "epoch": 2.69, + "grad_norm": 0.7516406774520874, + "learning_rate": 1.5495372850787337e-05, + "loss": 2.8225, + "step": 54925 + }, + { + "epoch": 2.69, + "grad_norm": 0.8011462092399597, + "learning_rate": 1.549048909774846e-05, + "loss": 2.7632, + "step": 54926 + }, + { + "epoch": 2.69, + "grad_norm": 0.7487853169441223, + "learning_rate": 1.548560609405294e-05, + "loss": 2.9072, + "step": 54927 + }, + { + "epoch": 2.69, + "grad_norm": 0.7621021866798401, + "learning_rate": 1.5480723839713592e-05, + "loss": 2.98, + "step": 54928 + }, + { + "epoch": 2.69, + "grad_norm": 0.7990534901618958, + "learning_rate": 1.5475842334743216e-05, + "loss": 2.9085, + "step": 54929 + }, + { + "epoch": 2.69, + "grad_norm": 0.7323390245437622, + "learning_rate": 1.5470961579154727e-05, + "loss": 2.9067, + "step": 54930 + }, + { + "epoch": 2.69, + "grad_norm": 0.7541958093643188, + "learning_rate": 1.5466081572960952e-05, + "loss": 2.6722, + "step": 54931 + }, + { + "epoch": 2.69, + "grad_norm": 0.7375277876853943, + "learning_rate": 1.5461202316174783e-05, + "loss": 3.0888, + "step": 54932 + }, + { + "epoch": 2.69, + "grad_norm": 0.7689357399940491, + "learning_rate": 1.5456323808809033e-05, + "loss": 2.816, + "step": 54933 + }, + { + "epoch": 2.69, + "grad_norm": 0.7837892174720764, + "learning_rate": 1.545144605087657e-05, + "loss": 2.7925, + "step": 54934 + }, + { + "epoch": 2.69, + "grad_norm": 0.7377240061759949, + "learning_rate": 1.5446569042390246e-05, + "loss": 2.8422, + "step": 54935 + }, + { + "epoch": 2.69, + "grad_norm": 0.716027021408081, + "learning_rate": 1.544169278336288e-05, + "loss": 2.9464, + "step": 54936 + }, + { + "epoch": 2.69, + "grad_norm": 0.7693297266960144, + "learning_rate": 1.5436817273807335e-05, + "loss": 2.8894, + "step": 54937 + }, + { + "epoch": 2.69, + "grad_norm": 0.766839861869812, + "learning_rate": 1.5431942513736428e-05, + "loss": 2.981, + "step": 54938 + }, + { + "epoch": 2.69, + "grad_norm": 0.7598301768302917, + "learning_rate": 1.5427068503163052e-05, + "loss": 3.0225, + "step": 54939 + }, + { + "epoch": 2.69, + "grad_norm": 0.7385877966880798, + "learning_rate": 1.5422195242099966e-05, + "loss": 2.9501, + "step": 54940 + }, + { + "epoch": 2.69, + "grad_norm": 0.7691483497619629, + "learning_rate": 1.5417322730560055e-05, + "loss": 2.7536, + "step": 54941 + }, + { + "epoch": 2.69, + "grad_norm": 0.7527814507484436, + "learning_rate": 1.541245096855621e-05, + "loss": 2.7586, + "step": 54942 + }, + { + "epoch": 2.69, + "grad_norm": 0.7725242376327515, + "learning_rate": 1.5407579956101155e-05, + "loss": 2.7225, + "step": 54943 + }, + { + "epoch": 2.69, + "grad_norm": 0.7793623208999634, + "learning_rate": 1.5402709693207816e-05, + "loss": 2.852, + "step": 54944 + }, + { + "epoch": 2.69, + "grad_norm": 0.7739753127098083, + "learning_rate": 1.5397840179888876e-05, + "loss": 2.8733, + "step": 54945 + }, + { + "epoch": 2.69, + "grad_norm": 0.7654193043708801, + "learning_rate": 1.5392971416157296e-05, + "loss": 2.7769, + "step": 54946 + }, + { + "epoch": 2.69, + "grad_norm": 0.7343190908432007, + "learning_rate": 1.5388103402025863e-05, + "loss": 3.0919, + "step": 54947 + }, + { + "epoch": 2.69, + "grad_norm": 0.7587972283363342, + "learning_rate": 1.538323613750737e-05, + "loss": 3.0448, + "step": 54948 + }, + { + "epoch": 2.69, + "grad_norm": 0.7687961459159851, + "learning_rate": 1.537836962261467e-05, + "loss": 2.8649, + "step": 54949 + }, + { + "epoch": 2.69, + "grad_norm": 0.7500821948051453, + "learning_rate": 1.5373503857360592e-05, + "loss": 2.7358, + "step": 54950 + }, + { + "epoch": 2.69, + "grad_norm": 0.7614321708679199, + "learning_rate": 1.5368638841757885e-05, + "loss": 3.0014, + "step": 54951 + }, + { + "epoch": 2.69, + "grad_norm": 0.7479397058486938, + "learning_rate": 1.5363774575819444e-05, + "loss": 2.8803, + "step": 54952 + }, + { + "epoch": 2.69, + "grad_norm": 0.7300918102264404, + "learning_rate": 1.5358911059557986e-05, + "loss": 2.969, + "step": 54953 + }, + { + "epoch": 2.69, + "grad_norm": 0.7309346199035645, + "learning_rate": 1.5354048292986444e-05, + "loss": 2.9086, + "step": 54954 + }, + { + "epoch": 2.69, + "grad_norm": 0.7472579479217529, + "learning_rate": 1.5349186276117498e-05, + "loss": 2.7324, + "step": 54955 + }, + { + "epoch": 2.69, + "grad_norm": 0.7331668734550476, + "learning_rate": 1.5344325008964075e-05, + "loss": 2.9536, + "step": 54956 + }, + { + "epoch": 2.69, + "grad_norm": 0.7437717318534851, + "learning_rate": 1.53394644915389e-05, + "loss": 2.7823, + "step": 54957 + }, + { + "epoch": 2.69, + "grad_norm": 0.7608617544174194, + "learning_rate": 1.5334604723854727e-05, + "loss": 2.9898, + "step": 54958 + }, + { + "epoch": 2.69, + "grad_norm": 0.8812674880027771, + "learning_rate": 1.5329745705924512e-05, + "loss": 2.9538, + "step": 54959 + }, + { + "epoch": 2.69, + "grad_norm": 0.7427992224693298, + "learning_rate": 1.532488743776088e-05, + "loss": 2.6872, + "step": 54960 + }, + { + "epoch": 2.69, + "grad_norm": 0.7801377773284912, + "learning_rate": 1.532002991937672e-05, + "loss": 2.7277, + "step": 54961 + }, + { + "epoch": 2.69, + "grad_norm": 0.7416569590568542, + "learning_rate": 1.5315173150784853e-05, + "loss": 2.9374, + "step": 54962 + }, + { + "epoch": 2.69, + "grad_norm": 0.7589424252510071, + "learning_rate": 1.5310317131998007e-05, + "loss": 2.9005, + "step": 54963 + }, + { + "epoch": 2.69, + "grad_norm": 0.7642767429351807, + "learning_rate": 1.5305461863029e-05, + "loss": 2.8343, + "step": 54964 + }, + { + "epoch": 2.69, + "grad_norm": 0.7222141027450562, + "learning_rate": 1.530060734389066e-05, + "loss": 2.8669, + "step": 54965 + }, + { + "epoch": 2.69, + "grad_norm": 0.8169352412223816, + "learning_rate": 1.529575357459567e-05, + "loss": 2.7694, + "step": 54966 + }, + { + "epoch": 2.69, + "grad_norm": 0.7054728269577026, + "learning_rate": 1.5290900555156893e-05, + "loss": 2.8031, + "step": 54967 + }, + { + "epoch": 2.69, + "grad_norm": 0.7217891812324524, + "learning_rate": 1.5286048285587082e-05, + "loss": 2.7711, + "step": 54968 + }, + { + "epoch": 2.69, + "grad_norm": 0.7295551300048828, + "learning_rate": 1.528119676589903e-05, + "loss": 2.9962, + "step": 54969 + }, + { + "epoch": 2.69, + "grad_norm": 0.7057387232780457, + "learning_rate": 1.527634599610549e-05, + "loss": 3.0983, + "step": 54970 + }, + { + "epoch": 2.69, + "grad_norm": 0.779369056224823, + "learning_rate": 1.5271495976219316e-05, + "loss": 2.9026, + "step": 54971 + }, + { + "epoch": 2.69, + "grad_norm": 0.7789977788925171, + "learning_rate": 1.5266646706253205e-05, + "loss": 2.9408, + "step": 54972 + }, + { + "epoch": 2.69, + "grad_norm": 0.7889942526817322, + "learning_rate": 1.526179818621991e-05, + "loss": 2.7538, + "step": 54973 + }, + { + "epoch": 2.69, + "grad_norm": 0.7462673187255859, + "learning_rate": 1.5256950416132286e-05, + "loss": 2.8916, + "step": 54974 + }, + { + "epoch": 2.69, + "grad_norm": 0.7497276067733765, + "learning_rate": 1.5252103396003023e-05, + "loss": 2.956, + "step": 54975 + }, + { + "epoch": 2.69, + "grad_norm": 0.7257047891616821, + "learning_rate": 1.524725712584488e-05, + "loss": 2.831, + "step": 54976 + }, + { + "epoch": 2.69, + "grad_norm": 0.7067046761512756, + "learning_rate": 1.5242411605670746e-05, + "loss": 2.7964, + "step": 54977 + }, + { + "epoch": 2.69, + "grad_norm": 0.7375363111495972, + "learning_rate": 1.5237566835493275e-05, + "loss": 2.8754, + "step": 54978 + }, + { + "epoch": 2.69, + "grad_norm": 0.7760704159736633, + "learning_rate": 1.523272281532526e-05, + "loss": 2.8453, + "step": 54979 + }, + { + "epoch": 2.69, + "grad_norm": 0.772099494934082, + "learning_rate": 1.5227879545179388e-05, + "loss": 2.8503, + "step": 54980 + }, + { + "epoch": 2.69, + "grad_norm": 0.7476798295974731, + "learning_rate": 1.5223037025068485e-05, + "loss": 2.9263, + "step": 54981 + }, + { + "epoch": 2.69, + "grad_norm": 0.7684528827667236, + "learning_rate": 1.521819525500534e-05, + "loss": 2.8822, + "step": 54982 + }, + { + "epoch": 2.69, + "grad_norm": 0.7638944387435913, + "learning_rate": 1.5213354235002606e-05, + "loss": 2.865, + "step": 54983 + }, + { + "epoch": 2.69, + "grad_norm": 0.7224180102348328, + "learning_rate": 1.5208513965073144e-05, + "loss": 2.8717, + "step": 54984 + }, + { + "epoch": 2.69, + "grad_norm": 0.8036991953849792, + "learning_rate": 1.5203674445229608e-05, + "loss": 2.8207, + "step": 54985 + }, + { + "epoch": 2.69, + "grad_norm": 0.701972246170044, + "learning_rate": 1.5198835675484821e-05, + "loss": 2.6852, + "step": 54986 + }, + { + "epoch": 2.69, + "grad_norm": 0.7487140893936157, + "learning_rate": 1.5193997655851508e-05, + "loss": 2.9361, + "step": 54987 + }, + { + "epoch": 2.69, + "grad_norm": 0.7961825132369995, + "learning_rate": 1.5189160386342324e-05, + "loss": 2.8853, + "step": 54988 + }, + { + "epoch": 2.69, + "grad_norm": 0.7591695785522461, + "learning_rate": 1.5184323866970127e-05, + "loss": 2.8568, + "step": 54989 + }, + { + "epoch": 2.69, + "grad_norm": 0.8069265484809875, + "learning_rate": 1.5179488097747571e-05, + "loss": 2.7135, + "step": 54990 + }, + { + "epoch": 2.69, + "grad_norm": 0.729277491569519, + "learning_rate": 1.5174653078687449e-05, + "loss": 2.9049, + "step": 54991 + }, + { + "epoch": 2.7, + "grad_norm": 0.7785135507583618, + "learning_rate": 1.516981880980248e-05, + "loss": 2.9743, + "step": 54992 + }, + { + "epoch": 2.7, + "grad_norm": 0.7566962242126465, + "learning_rate": 1.5164985291105392e-05, + "loss": 2.7836, + "step": 54993 + }, + { + "epoch": 2.7, + "grad_norm": 0.7411288022994995, + "learning_rate": 1.5160152522608937e-05, + "loss": 2.9298, + "step": 54994 + }, + { + "epoch": 2.7, + "grad_norm": 0.7546302676200867, + "learning_rate": 1.5155320504325773e-05, + "loss": 3.0254, + "step": 54995 + }, + { + "epoch": 2.7, + "grad_norm": 0.7514198422431946, + "learning_rate": 1.5150489236268692e-05, + "loss": 2.8508, + "step": 54996 + }, + { + "epoch": 2.7, + "grad_norm": 0.7835859060287476, + "learning_rate": 1.5145658718450415e-05, + "loss": 2.929, + "step": 54997 + }, + { + "epoch": 2.7, + "grad_norm": 0.7201144695281982, + "learning_rate": 1.5140828950883633e-05, + "loss": 2.9676, + "step": 54998 + }, + { + "epoch": 2.7, + "grad_norm": 0.7485184073448181, + "learning_rate": 1.51359999335811e-05, + "loss": 3.0914, + "step": 54999 + }, + { + "epoch": 2.7, + "grad_norm": 0.7460274696350098, + "learning_rate": 1.5131171666555507e-05, + "loss": 2.8109, + "step": 55000 + }, + { + "epoch": 2.7, + "grad_norm": 0.7447788715362549, + "learning_rate": 1.5126344149819613e-05, + "loss": 2.808, + "step": 55001 + }, + { + "epoch": 2.7, + "grad_norm": 0.7432894706726074, + "learning_rate": 1.5121517383386073e-05, + "loss": 2.8884, + "step": 55002 + }, + { + "epoch": 2.7, + "grad_norm": 0.7191660404205322, + "learning_rate": 1.5116691367267642e-05, + "loss": 2.7407, + "step": 55003 + }, + { + "epoch": 2.7, + "grad_norm": 0.7479361295700073, + "learning_rate": 1.511186610147701e-05, + "loss": 2.7467, + "step": 55004 + }, + { + "epoch": 2.7, + "grad_norm": 0.7603451609611511, + "learning_rate": 1.5107041586026902e-05, + "loss": 3.1482, + "step": 55005 + }, + { + "epoch": 2.7, + "grad_norm": 0.7904460430145264, + "learning_rate": 1.5102217820929974e-05, + "loss": 2.9543, + "step": 55006 + }, + { + "epoch": 2.7, + "grad_norm": 0.7052236795425415, + "learning_rate": 1.5097394806199048e-05, + "loss": 3.0005, + "step": 55007 + }, + { + "epoch": 2.7, + "grad_norm": 0.824068546295166, + "learning_rate": 1.5092572541846748e-05, + "loss": 2.8321, + "step": 55008 + }, + { + "epoch": 2.7, + "grad_norm": 0.7605258822441101, + "learning_rate": 1.5087751027885764e-05, + "loss": 2.7439, + "step": 55009 + }, + { + "epoch": 2.7, + "grad_norm": 0.7721036076545715, + "learning_rate": 1.5082930264328786e-05, + "loss": 2.9184, + "step": 55010 + }, + { + "epoch": 2.7, + "grad_norm": 0.7376681566238403, + "learning_rate": 1.5078110251188535e-05, + "loss": 2.7824, + "step": 55011 + }, + { + "epoch": 2.7, + "grad_norm": 0.7896484732627869, + "learning_rate": 1.5073290988477737e-05, + "loss": 2.9285, + "step": 55012 + }, + { + "epoch": 2.7, + "grad_norm": 0.7744725346565247, + "learning_rate": 1.5068472476209048e-05, + "loss": 2.919, + "step": 55013 + }, + { + "epoch": 2.7, + "grad_norm": 0.7636879086494446, + "learning_rate": 1.5063654714395157e-05, + "loss": 2.7188, + "step": 55014 + }, + { + "epoch": 2.7, + "grad_norm": 0.7519050240516663, + "learning_rate": 1.5058837703048788e-05, + "loss": 2.9375, + "step": 55015 + }, + { + "epoch": 2.7, + "grad_norm": 0.7617154121398926, + "learning_rate": 1.505402144218263e-05, + "loss": 2.8453, + "step": 55016 + }, + { + "epoch": 2.7, + "grad_norm": 0.7616671919822693, + "learning_rate": 1.5049205931809238e-05, + "loss": 2.9701, + "step": 55017 + }, + { + "epoch": 2.7, + "grad_norm": 0.7466253042221069, + "learning_rate": 1.5044391171941439e-05, + "loss": 2.8328, + "step": 55018 + }, + { + "epoch": 2.7, + "grad_norm": 0.7612420320510864, + "learning_rate": 1.5039577162591921e-05, + "loss": 2.83, + "step": 55019 + }, + { + "epoch": 2.7, + "grad_norm": 0.7458781003952026, + "learning_rate": 1.5034763903773239e-05, + "loss": 2.8543, + "step": 55020 + }, + { + "epoch": 2.7, + "grad_norm": 0.7177784442901611, + "learning_rate": 1.5029951395498219e-05, + "loss": 2.9089, + "step": 55021 + }, + { + "epoch": 2.7, + "grad_norm": 0.7244158983230591, + "learning_rate": 1.5025139637779415e-05, + "loss": 2.7326, + "step": 55022 + }, + { + "epoch": 2.7, + "grad_norm": 0.7530938386917114, + "learning_rate": 1.5020328630629586e-05, + "loss": 2.9726, + "step": 55023 + }, + { + "epoch": 2.7, + "grad_norm": 0.7853320240974426, + "learning_rate": 1.5015518374061352e-05, + "loss": 2.8278, + "step": 55024 + }, + { + "epoch": 2.7, + "grad_norm": 0.7918479442596436, + "learning_rate": 1.5010708868087373e-05, + "loss": 2.889, + "step": 55025 + }, + { + "epoch": 2.7, + "grad_norm": 0.748648464679718, + "learning_rate": 1.5005900112720371e-05, + "loss": 3.0377, + "step": 55026 + }, + { + "epoch": 2.7, + "grad_norm": 0.755605161190033, + "learning_rate": 1.5001092107972968e-05, + "loss": 2.9121, + "step": 55027 + }, + { + "epoch": 2.7, + "grad_norm": 0.7698187828063965, + "learning_rate": 1.4996284853857787e-05, + "loss": 2.7703, + "step": 55028 + }, + { + "epoch": 2.7, + "grad_norm": 0.683066189289093, + "learning_rate": 1.499147835038762e-05, + "loss": 2.8249, + "step": 55029 + }, + { + "epoch": 2.7, + "grad_norm": 0.7477896809577942, + "learning_rate": 1.4986672597575056e-05, + "loss": 2.9249, + "step": 55030 + }, + { + "epoch": 2.7, + "grad_norm": 0.7523871064186096, + "learning_rate": 1.4981867595432718e-05, + "loss": 2.9331, + "step": 55031 + }, + { + "epoch": 2.7, + "grad_norm": 0.7780340909957886, + "learning_rate": 1.4977063343973261e-05, + "loss": 2.859, + "step": 55032 + }, + { + "epoch": 2.7, + "grad_norm": 0.7800566554069519, + "learning_rate": 1.4972259843209344e-05, + "loss": 2.6747, + "step": 55033 + }, + { + "epoch": 2.7, + "grad_norm": 0.7878885269165039, + "learning_rate": 1.496745709315369e-05, + "loss": 3.0291, + "step": 55034 + }, + { + "epoch": 2.7, + "grad_norm": 0.7875646352767944, + "learning_rate": 1.4962655093818887e-05, + "loss": 2.8538, + "step": 55035 + }, + { + "epoch": 2.7, + "grad_norm": 0.7212032079696655, + "learning_rate": 1.4957853845217594e-05, + "loss": 2.7787, + "step": 55036 + }, + { + "epoch": 2.7, + "grad_norm": 0.7758206725120544, + "learning_rate": 1.4953053347362498e-05, + "loss": 2.8677, + "step": 55037 + }, + { + "epoch": 2.7, + "grad_norm": 0.7322551012039185, + "learning_rate": 1.4948253600266124e-05, + "loss": 3.0312, + "step": 55038 + }, + { + "epoch": 2.7, + "grad_norm": 0.7753440141677856, + "learning_rate": 1.494345460394123e-05, + "loss": 2.9871, + "step": 55039 + }, + { + "epoch": 2.7, + "grad_norm": 0.7763801217079163, + "learning_rate": 1.4938656358400402e-05, + "loss": 2.8968, + "step": 55040 + }, + { + "epoch": 2.7, + "grad_norm": 0.7767189741134644, + "learning_rate": 1.4933858863656334e-05, + "loss": 2.7693, + "step": 55041 + }, + { + "epoch": 2.7, + "grad_norm": 0.749442458152771, + "learning_rate": 1.4929062119721546e-05, + "loss": 2.8442, + "step": 55042 + }, + { + "epoch": 2.7, + "grad_norm": 0.7398672103881836, + "learning_rate": 1.4924266126608763e-05, + "loss": 2.7577, + "step": 55043 + }, + { + "epoch": 2.7, + "grad_norm": 0.7189925909042358, + "learning_rate": 1.4919470884330642e-05, + "loss": 3.0464, + "step": 55044 + }, + { + "epoch": 2.7, + "grad_norm": 0.758366584777832, + "learning_rate": 1.4914676392899772e-05, + "loss": 3.0093, + "step": 55045 + }, + { + "epoch": 2.7, + "grad_norm": 0.7701453566551208, + "learning_rate": 1.4909882652328775e-05, + "loss": 3.0331, + "step": 55046 + }, + { + "epoch": 2.7, + "grad_norm": 0.7704036831855774, + "learning_rate": 1.4905089662630243e-05, + "loss": 2.8136, + "step": 55047 + }, + { + "epoch": 2.7, + "grad_norm": 0.781955897808075, + "learning_rate": 1.4900297423816832e-05, + "loss": 3.0821, + "step": 55048 + }, + { + "epoch": 2.7, + "grad_norm": 0.7298367619514465, + "learning_rate": 1.489550593590123e-05, + "loss": 3.0272, + "step": 55049 + }, + { + "epoch": 2.7, + "grad_norm": 0.7215245366096497, + "learning_rate": 1.489071519889593e-05, + "loss": 2.8109, + "step": 55050 + }, + { + "epoch": 2.7, + "grad_norm": 0.7225742340087891, + "learning_rate": 1.4885925212813688e-05, + "loss": 2.7193, + "step": 55051 + }, + { + "epoch": 2.7, + "grad_norm": 0.7542920708656311, + "learning_rate": 1.4881135977667025e-05, + "loss": 2.8853, + "step": 55052 + }, + { + "epoch": 2.7, + "grad_norm": 0.7337054014205933, + "learning_rate": 1.4876347493468532e-05, + "loss": 2.9647, + "step": 55053 + }, + { + "epoch": 2.7, + "grad_norm": 0.7353600859642029, + "learning_rate": 1.48715597602309e-05, + "loss": 2.9208, + "step": 55054 + }, + { + "epoch": 2.7, + "grad_norm": 0.7551136016845703, + "learning_rate": 1.4866772777966685e-05, + "loss": 2.9143, + "step": 55055 + }, + { + "epoch": 2.7, + "grad_norm": 0.7284271717071533, + "learning_rate": 1.4861986546688543e-05, + "loss": 2.7553, + "step": 55056 + }, + { + "epoch": 2.7, + "grad_norm": 0.769647479057312, + "learning_rate": 1.485720106640903e-05, + "loss": 2.7599, + "step": 55057 + }, + { + "epoch": 2.7, + "grad_norm": 0.793817937374115, + "learning_rate": 1.485241633714077e-05, + "loss": 2.8884, + "step": 55058 + }, + { + "epoch": 2.7, + "grad_norm": 0.7645768523216248, + "learning_rate": 1.4847632358896422e-05, + "loss": 3.0118, + "step": 55059 + }, + { + "epoch": 2.7, + "grad_norm": 0.7738844156265259, + "learning_rate": 1.4842849131688472e-05, + "loss": 3.0922, + "step": 55060 + }, + { + "epoch": 2.7, + "grad_norm": 0.7235080003738403, + "learning_rate": 1.4838066655529613e-05, + "loss": 2.8157, + "step": 55061 + }, + { + "epoch": 2.7, + "grad_norm": 0.7338597774505615, + "learning_rate": 1.4833284930432366e-05, + "loss": 3.099, + "step": 55062 + }, + { + "epoch": 2.7, + "grad_norm": 0.7195146679878235, + "learning_rate": 1.4828503956409355e-05, + "loss": 2.7882, + "step": 55063 + }, + { + "epoch": 2.7, + "grad_norm": 0.7406694889068604, + "learning_rate": 1.4823723733473237e-05, + "loss": 2.6486, + "step": 55064 + }, + { + "epoch": 2.7, + "grad_norm": 0.7771344184875488, + "learning_rate": 1.48189442616365e-05, + "loss": 3.0643, + "step": 55065 + }, + { + "epoch": 2.7, + "grad_norm": 0.7858314514160156, + "learning_rate": 1.4814165540911805e-05, + "loss": 2.9493, + "step": 55066 + }, + { + "epoch": 2.7, + "grad_norm": 0.8330277800559998, + "learning_rate": 1.4809387571311738e-05, + "loss": 2.9517, + "step": 55067 + }, + { + "epoch": 2.7, + "grad_norm": 0.7209185361862183, + "learning_rate": 1.4804610352848823e-05, + "loss": 2.8796, + "step": 55068 + }, + { + "epoch": 2.7, + "grad_norm": 0.7240145802497864, + "learning_rate": 1.4799833885535684e-05, + "loss": 2.7944, + "step": 55069 + }, + { + "epoch": 2.7, + "grad_norm": 0.7134544253349304, + "learning_rate": 1.4795058169384877e-05, + "loss": 2.7025, + "step": 55070 + }, + { + "epoch": 2.7, + "grad_norm": 0.7314766049385071, + "learning_rate": 1.4790283204409059e-05, + "loss": 2.7722, + "step": 55071 + }, + { + "epoch": 2.7, + "grad_norm": 0.7572324275970459, + "learning_rate": 1.4785508990620654e-05, + "loss": 3.037, + "step": 55072 + }, + { + "epoch": 2.7, + "grad_norm": 0.7572822570800781, + "learning_rate": 1.4780735528032418e-05, + "loss": 2.9088, + "step": 55073 + }, + { + "epoch": 2.7, + "grad_norm": 0.7391690611839294, + "learning_rate": 1.4775962816656806e-05, + "loss": 2.8931, + "step": 55074 + }, + { + "epoch": 2.7, + "grad_norm": 0.7284706830978394, + "learning_rate": 1.4771190856506377e-05, + "loss": 3.114, + "step": 55075 + }, + { + "epoch": 2.7, + "grad_norm": 0.7849894165992737, + "learning_rate": 1.4766419647593785e-05, + "loss": 2.9319, + "step": 55076 + }, + { + "epoch": 2.7, + "grad_norm": 0.7333627939224243, + "learning_rate": 1.476164918993149e-05, + "loss": 3.0572, + "step": 55077 + }, + { + "epoch": 2.7, + "grad_norm": 0.7046851515769958, + "learning_rate": 1.4756879483532146e-05, + "loss": 2.9404, + "step": 55078 + }, + { + "epoch": 2.7, + "grad_norm": 0.7099255919456482, + "learning_rate": 1.4752110528408312e-05, + "loss": 2.7765, + "step": 55079 + }, + { + "epoch": 2.7, + "grad_norm": 0.7698432803153992, + "learning_rate": 1.4747342324572509e-05, + "loss": 2.8692, + "step": 55080 + }, + { + "epoch": 2.7, + "grad_norm": 0.7886145114898682, + "learning_rate": 1.4742574872037327e-05, + "loss": 2.9755, + "step": 55081 + }, + { + "epoch": 2.7, + "grad_norm": 0.7380675077438354, + "learning_rate": 1.4737808170815258e-05, + "loss": 2.6219, + "step": 55082 + }, + { + "epoch": 2.7, + "grad_norm": 0.80084228515625, + "learning_rate": 1.4733042220918923e-05, + "loss": 2.9648, + "step": 55083 + }, + { + "epoch": 2.7, + "grad_norm": 0.7491824626922607, + "learning_rate": 1.4728277022360846e-05, + "loss": 2.7953, + "step": 55084 + }, + { + "epoch": 2.7, + "grad_norm": 0.7680736184120178, + "learning_rate": 1.4723512575153584e-05, + "loss": 2.9031, + "step": 55085 + }, + { + "epoch": 2.7, + "grad_norm": 0.76555997133255, + "learning_rate": 1.4718748879309727e-05, + "loss": 2.863, + "step": 55086 + }, + { + "epoch": 2.7, + "grad_norm": 0.7460500597953796, + "learning_rate": 1.4713985934841731e-05, + "loss": 2.8605, + "step": 55087 + }, + { + "epoch": 2.7, + "grad_norm": 0.7447842359542847, + "learning_rate": 1.4709223741762221e-05, + "loss": 2.8689, + "step": 55088 + }, + { + "epoch": 2.7, + "grad_norm": 0.7283835411071777, + "learning_rate": 1.470446230008372e-05, + "loss": 2.7695, + "step": 55089 + }, + { + "epoch": 2.7, + "grad_norm": 0.775287926197052, + "learning_rate": 1.4699701609818714e-05, + "loss": 2.8405, + "step": 55090 + }, + { + "epoch": 2.7, + "grad_norm": 0.7444217801094055, + "learning_rate": 1.4694941670979832e-05, + "loss": 2.9038, + "step": 55091 + }, + { + "epoch": 2.7, + "grad_norm": 0.7645972371101379, + "learning_rate": 1.4690182483579527e-05, + "loss": 2.7941, + "step": 55092 + }, + { + "epoch": 2.7, + "grad_norm": 0.7570917010307312, + "learning_rate": 1.468542404763039e-05, + "loss": 2.9379, + "step": 55093 + }, + { + "epoch": 2.7, + "grad_norm": 0.7789246439933777, + "learning_rate": 1.4680666363144944e-05, + "loss": 3.1158, + "step": 55094 + }, + { + "epoch": 2.7, + "grad_norm": 0.7554366588592529, + "learning_rate": 1.4675909430135713e-05, + "loss": 2.782, + "step": 55095 + }, + { + "epoch": 2.7, + "grad_norm": 0.7414656281471252, + "learning_rate": 1.4671153248615253e-05, + "loss": 2.6158, + "step": 55096 + }, + { + "epoch": 2.7, + "grad_norm": 0.7360018491744995, + "learning_rate": 1.4666397818596021e-05, + "loss": 2.9317, + "step": 55097 + }, + { + "epoch": 2.7, + "grad_norm": 0.740975022315979, + "learning_rate": 1.4661643140090606e-05, + "loss": 2.8232, + "step": 55098 + }, + { + "epoch": 2.7, + "grad_norm": 0.7482433319091797, + "learning_rate": 1.4656889213111467e-05, + "loss": 2.8816, + "step": 55099 + }, + { + "epoch": 2.7, + "grad_norm": 0.7271791696548462, + "learning_rate": 1.4652136037671158e-05, + "loss": 2.9859, + "step": 55100 + }, + { + "epoch": 2.7, + "grad_norm": 0.7431482076644897, + "learning_rate": 1.464738361378227e-05, + "loss": 2.9572, + "step": 55101 + }, + { + "epoch": 2.7, + "grad_norm": 0.7354775071144104, + "learning_rate": 1.4642631941457194e-05, + "loss": 3.016, + "step": 55102 + }, + { + "epoch": 2.7, + "grad_norm": 0.7678640484809875, + "learning_rate": 1.463788102070862e-05, + "loss": 2.709, + "step": 55103 + }, + { + "epoch": 2.7, + "grad_norm": 0.7526733875274658, + "learning_rate": 1.4633130851548835e-05, + "loss": 2.9976, + "step": 55104 + }, + { + "epoch": 2.7, + "grad_norm": 0.740717887878418, + "learning_rate": 1.4628381433990466e-05, + "loss": 2.8201, + "step": 55105 + }, + { + "epoch": 2.7, + "grad_norm": 0.7562867999076843, + "learning_rate": 1.4623632768046101e-05, + "loss": 2.9115, + "step": 55106 + }, + { + "epoch": 2.7, + "grad_norm": 0.7273349761962891, + "learning_rate": 1.4618884853728063e-05, + "loss": 2.9555, + "step": 55107 + }, + { + "epoch": 2.7, + "grad_norm": 0.7348599433898926, + "learning_rate": 1.4614137691049044e-05, + "loss": 2.7613, + "step": 55108 + }, + { + "epoch": 2.7, + "grad_norm": 0.7380170226097107, + "learning_rate": 1.46093912800214e-05, + "loss": 2.8046, + "step": 55109 + }, + { + "epoch": 2.7, + "grad_norm": 0.7288892269134521, + "learning_rate": 1.4604645620657751e-05, + "loss": 2.7613, + "step": 55110 + }, + { + "epoch": 2.7, + "grad_norm": 0.7518560290336609, + "learning_rate": 1.4599900712970524e-05, + "loss": 2.834, + "step": 55111 + }, + { + "epoch": 2.7, + "grad_norm": 0.792517900466919, + "learning_rate": 1.459515655697221e-05, + "loss": 3.1014, + "step": 55112 + }, + { + "epoch": 2.7, + "grad_norm": 0.7188156247138977, + "learning_rate": 1.4590413152675362e-05, + "loss": 3.0312, + "step": 55113 + }, + { + "epoch": 2.7, + "grad_norm": 0.7061635255813599, + "learning_rate": 1.4585670500092405e-05, + "loss": 2.9461, + "step": 55114 + }, + { + "epoch": 2.7, + "grad_norm": 0.7597672939300537, + "learning_rate": 1.458092859923583e-05, + "loss": 2.8916, + "step": 55115 + }, + { + "epoch": 2.7, + "grad_norm": 0.7571054697036743, + "learning_rate": 1.4576187450118227e-05, + "loss": 3.2875, + "step": 55116 + }, + { + "epoch": 2.7, + "grad_norm": 0.7508683204650879, + "learning_rate": 1.4571447052752017e-05, + "loss": 3.0029, + "step": 55117 + }, + { + "epoch": 2.7, + "grad_norm": 0.7968552708625793, + "learning_rate": 1.4566707407149691e-05, + "loss": 2.9155, + "step": 55118 + }, + { + "epoch": 2.7, + "grad_norm": 0.7436490654945374, + "learning_rate": 1.4561968513323641e-05, + "loss": 2.799, + "step": 55119 + }, + { + "epoch": 2.7, + "grad_norm": 0.6988527178764343, + "learning_rate": 1.4557230371286488e-05, + "loss": 2.8147, + "step": 55120 + }, + { + "epoch": 2.7, + "grad_norm": 0.7517215609550476, + "learning_rate": 1.4552492981050656e-05, + "loss": 2.9274, + "step": 55121 + }, + { + "epoch": 2.7, + "grad_norm": 0.7683263421058655, + "learning_rate": 1.4547756342628603e-05, + "loss": 2.8543, + "step": 55122 + }, + { + "epoch": 2.7, + "grad_norm": 0.7408250570297241, + "learning_rate": 1.454302045603285e-05, + "loss": 2.9393, + "step": 55123 + }, + { + "epoch": 2.7, + "grad_norm": 0.737571656703949, + "learning_rate": 1.4538285321275823e-05, + "loss": 2.8324, + "step": 55124 + }, + { + "epoch": 2.7, + "grad_norm": 0.7951434850692749, + "learning_rate": 1.4533550938370008e-05, + "loss": 2.7856, + "step": 55125 + }, + { + "epoch": 2.7, + "grad_norm": 0.7539615631103516, + "learning_rate": 1.4528817307327933e-05, + "loss": 2.8728, + "step": 55126 + }, + { + "epoch": 2.7, + "grad_norm": 0.7781716585159302, + "learning_rate": 1.4524084428161953e-05, + "loss": 2.78, + "step": 55127 + }, + { + "epoch": 2.7, + "grad_norm": 0.7636751532554626, + "learning_rate": 1.4519352300884623e-05, + "loss": 2.9116, + "step": 55128 + }, + { + "epoch": 2.7, + "grad_norm": 0.7897639274597168, + "learning_rate": 1.451462092550837e-05, + "loss": 2.8244, + "step": 55129 + }, + { + "epoch": 2.7, + "grad_norm": 0.7303542494773865, + "learning_rate": 1.4509890302045612e-05, + "loss": 2.9801, + "step": 55130 + }, + { + "epoch": 2.7, + "grad_norm": 0.7917243838310242, + "learning_rate": 1.4505160430508944e-05, + "loss": 2.9172, + "step": 55131 + }, + { + "epoch": 2.7, + "grad_norm": 0.7893748879432678, + "learning_rate": 1.450043131091072e-05, + "loss": 3.0641, + "step": 55132 + }, + { + "epoch": 2.7, + "grad_norm": 0.7223455309867859, + "learning_rate": 1.4495702943263432e-05, + "loss": 2.8755, + "step": 55133 + }, + { + "epoch": 2.7, + "grad_norm": 0.7736308574676514, + "learning_rate": 1.4490975327579436e-05, + "loss": 3.098, + "step": 55134 + }, + { + "epoch": 2.7, + "grad_norm": 0.7550448179244995, + "learning_rate": 1.4486248463871287e-05, + "loss": 2.8793, + "step": 55135 + }, + { + "epoch": 2.7, + "grad_norm": 0.7876956462860107, + "learning_rate": 1.4481522352151475e-05, + "loss": 2.8375, + "step": 55136 + }, + { + "epoch": 2.7, + "grad_norm": 0.7860339283943176, + "learning_rate": 1.4476796992432327e-05, + "loss": 3.0902, + "step": 55137 + }, + { + "epoch": 2.7, + "grad_norm": 0.7922260761260986, + "learning_rate": 1.4472072384726363e-05, + "loss": 2.7761, + "step": 55138 + }, + { + "epoch": 2.7, + "grad_norm": 0.7500098347663879, + "learning_rate": 1.4467348529046041e-05, + "loss": 2.9636, + "step": 55139 + }, + { + "epoch": 2.7, + "grad_norm": 0.7248625159263611, + "learning_rate": 1.4462625425403718e-05, + "loss": 2.9241, + "step": 55140 + }, + { + "epoch": 2.7, + "grad_norm": 0.7339884638786316, + "learning_rate": 1.4457903073811916e-05, + "loss": 2.9563, + "step": 55141 + }, + { + "epoch": 2.7, + "grad_norm": 0.7312856316566467, + "learning_rate": 1.4453181474283026e-05, + "loss": 2.8138, + "step": 55142 + }, + { + "epoch": 2.7, + "grad_norm": 0.7703109979629517, + "learning_rate": 1.4448460626829505e-05, + "loss": 2.9011, + "step": 55143 + }, + { + "epoch": 2.7, + "grad_norm": 0.7444011569023132, + "learning_rate": 1.4443740531463777e-05, + "loss": 2.8922, + "step": 55144 + }, + { + "epoch": 2.7, + "grad_norm": 0.7480001449584961, + "learning_rate": 1.4439021188198264e-05, + "loss": 2.7758, + "step": 55145 + }, + { + "epoch": 2.7, + "grad_norm": 0.7840633988380432, + "learning_rate": 1.4434302597045454e-05, + "loss": 3.0501, + "step": 55146 + }, + { + "epoch": 2.7, + "grad_norm": 0.8444885015487671, + "learning_rate": 1.4429584758017743e-05, + "loss": 2.9258, + "step": 55147 + }, + { + "epoch": 2.7, + "grad_norm": 0.7418254017829895, + "learning_rate": 1.442486767112755e-05, + "loss": 2.9051, + "step": 55148 + }, + { + "epoch": 2.7, + "grad_norm": 0.75757896900177, + "learning_rate": 1.4420151336387231e-05, + "loss": 2.8969, + "step": 55149 + }, + { + "epoch": 2.7, + "grad_norm": 0.7526350021362305, + "learning_rate": 1.4415435753809313e-05, + "loss": 2.8936, + "step": 55150 + }, + { + "epoch": 2.7, + "grad_norm": 0.7957975268363953, + "learning_rate": 1.4410720923406183e-05, + "loss": 2.763, + "step": 55151 + }, + { + "epoch": 2.7, + "grad_norm": 0.7414007186889648, + "learning_rate": 1.4406006845190233e-05, + "loss": 2.7035, + "step": 55152 + }, + { + "epoch": 2.7, + "grad_norm": 0.7431672215461731, + "learning_rate": 1.4401293519173918e-05, + "loss": 2.8199, + "step": 55153 + }, + { + "epoch": 2.7, + "grad_norm": 0.7612506151199341, + "learning_rate": 1.439658094536963e-05, + "loss": 2.7203, + "step": 55154 + }, + { + "epoch": 2.7, + "grad_norm": 0.7056882977485657, + "learning_rate": 1.4391869123789756e-05, + "loss": 2.8934, + "step": 55155 + }, + { + "epoch": 2.7, + "grad_norm": 0.7619946002960205, + "learning_rate": 1.4387158054446756e-05, + "loss": 2.7752, + "step": 55156 + }, + { + "epoch": 2.7, + "grad_norm": 0.7698616981506348, + "learning_rate": 1.4382447737352987e-05, + "loss": 2.8134, + "step": 55157 + }, + { + "epoch": 2.7, + "grad_norm": 0.8463789224624634, + "learning_rate": 1.4377738172520903e-05, + "loss": 2.8462, + "step": 55158 + }, + { + "epoch": 2.7, + "grad_norm": 0.7743161916732788, + "learning_rate": 1.4373029359962862e-05, + "loss": 2.8833, + "step": 55159 + }, + { + "epoch": 2.7, + "grad_norm": 0.7336245775222778, + "learning_rate": 1.4368321299691355e-05, + "loss": 2.9437, + "step": 55160 + }, + { + "epoch": 2.7, + "grad_norm": 0.7166826128959656, + "learning_rate": 1.436361399171867e-05, + "loss": 3.0139, + "step": 55161 + }, + { + "epoch": 2.7, + "grad_norm": 0.7472900748252869, + "learning_rate": 1.4358907436057266e-05, + "loss": 2.816, + "step": 55162 + }, + { + "epoch": 2.7, + "grad_norm": 0.7503640055656433, + "learning_rate": 1.435420163271953e-05, + "loss": 2.651, + "step": 55163 + }, + { + "epoch": 2.7, + "grad_norm": 0.7015929222106934, + "learning_rate": 1.4349496581717823e-05, + "loss": 2.8712, + "step": 55164 + }, + { + "epoch": 2.7, + "grad_norm": 0.7873293161392212, + "learning_rate": 1.4344792283064565e-05, + "loss": 2.7243, + "step": 55165 + }, + { + "epoch": 2.7, + "grad_norm": 0.7600153684616089, + "learning_rate": 1.4340088736772214e-05, + "loss": 3.0151, + "step": 55166 + }, + { + "epoch": 2.7, + "grad_norm": 0.8037029504776001, + "learning_rate": 1.4335385942853028e-05, + "loss": 2.9545, + "step": 55167 + }, + { + "epoch": 2.7, + "grad_norm": 0.7690410017967224, + "learning_rate": 1.4330683901319496e-05, + "loss": 2.8564, + "step": 55168 + }, + { + "epoch": 2.7, + "grad_norm": 0.7575297355651855, + "learning_rate": 1.4325982612183973e-05, + "loss": 2.8627, + "step": 55169 + }, + { + "epoch": 2.7, + "grad_norm": 0.7149128913879395, + "learning_rate": 1.4321282075458784e-05, + "loss": 2.8766, + "step": 55170 + }, + { + "epoch": 2.7, + "grad_norm": 0.7330735325813293, + "learning_rate": 1.431658229115642e-05, + "loss": 2.8966, + "step": 55171 + }, + { + "epoch": 2.7, + "grad_norm": 0.725845456123352, + "learning_rate": 1.4311883259289136e-05, + "loss": 2.7962, + "step": 55172 + }, + { + "epoch": 2.7, + "grad_norm": 0.712296724319458, + "learning_rate": 1.4307184979869391e-05, + "loss": 2.8348, + "step": 55173 + }, + { + "epoch": 2.7, + "grad_norm": 0.8258801102638245, + "learning_rate": 1.4302487452909539e-05, + "loss": 2.9634, + "step": 55174 + }, + { + "epoch": 2.7, + "grad_norm": 0.7133929133415222, + "learning_rate": 1.4297790678421972e-05, + "loss": 2.7334, + "step": 55175 + }, + { + "epoch": 2.7, + "grad_norm": 0.7767179012298584, + "learning_rate": 1.4293094656419046e-05, + "loss": 2.8227, + "step": 55176 + }, + { + "epoch": 2.7, + "grad_norm": 0.7630220651626587, + "learning_rate": 1.4288399386913086e-05, + "loss": 2.9548, + "step": 55177 + }, + { + "epoch": 2.7, + "grad_norm": 0.756858766078949, + "learning_rate": 1.4283704869916512e-05, + "loss": 3.0724, + "step": 55178 + }, + { + "epoch": 2.7, + "grad_norm": 0.734491765499115, + "learning_rate": 1.4279011105441651e-05, + "loss": 2.8499, + "step": 55179 + }, + { + "epoch": 2.7, + "grad_norm": 0.7555728554725647, + "learning_rate": 1.4274318093500925e-05, + "loss": 3.0742, + "step": 55180 + }, + { + "epoch": 2.7, + "grad_norm": 0.779068648815155, + "learning_rate": 1.4269625834106625e-05, + "loss": 3.0113, + "step": 55181 + }, + { + "epoch": 2.7, + "grad_norm": 0.7292513847351074, + "learning_rate": 1.4264934327271172e-05, + "loss": 2.9021, + "step": 55182 + }, + { + "epoch": 2.7, + "grad_norm": 0.7384148240089417, + "learning_rate": 1.4260243573006891e-05, + "loss": 2.6843, + "step": 55183 + }, + { + "epoch": 2.7, + "grad_norm": 0.72471022605896, + "learning_rate": 1.4255553571326106e-05, + "loss": 3.1414, + "step": 55184 + }, + { + "epoch": 2.7, + "grad_norm": 0.761789083480835, + "learning_rate": 1.425086432224124e-05, + "loss": 3.0294, + "step": 55185 + }, + { + "epoch": 2.7, + "grad_norm": 0.7743604779243469, + "learning_rate": 1.424617582576455e-05, + "loss": 2.8922, + "step": 55186 + }, + { + "epoch": 2.7, + "grad_norm": 0.7332843542098999, + "learning_rate": 1.4241488081908426e-05, + "loss": 2.8659, + "step": 55187 + }, + { + "epoch": 2.7, + "grad_norm": 0.739586353302002, + "learning_rate": 1.423680109068529e-05, + "loss": 2.7717, + "step": 55188 + }, + { + "epoch": 2.7, + "grad_norm": 0.7435079216957092, + "learning_rate": 1.4232114852107368e-05, + "loss": 2.8082, + "step": 55189 + }, + { + "epoch": 2.7, + "grad_norm": 0.7537873983383179, + "learning_rate": 1.4227429366187115e-05, + "loss": 2.7587, + "step": 55190 + }, + { + "epoch": 2.7, + "grad_norm": 0.7387575507164001, + "learning_rate": 1.4222744632936822e-05, + "loss": 2.9726, + "step": 55191 + }, + { + "epoch": 2.7, + "grad_norm": 0.762256920337677, + "learning_rate": 1.421806065236878e-05, + "loss": 2.7777, + "step": 55192 + }, + { + "epoch": 2.7, + "grad_norm": 0.7154507637023926, + "learning_rate": 1.4213377424495375e-05, + "loss": 2.9309, + "step": 55193 + }, + { + "epoch": 2.7, + "grad_norm": 0.7694316506385803, + "learning_rate": 1.4208694949328903e-05, + "loss": 2.4981, + "step": 55194 + }, + { + "epoch": 2.7, + "grad_norm": 0.7170578241348267, + "learning_rate": 1.4204013226881784e-05, + "loss": 3.0198, + "step": 55195 + }, + { + "epoch": 2.71, + "grad_norm": 0.7894667387008667, + "learning_rate": 1.419933225716624e-05, + "loss": 2.8454, + "step": 55196 + }, + { + "epoch": 2.71, + "grad_norm": 0.7934728860855103, + "learning_rate": 1.4194652040194698e-05, + "loss": 3.0578, + "step": 55197 + }, + { + "epoch": 2.71, + "grad_norm": 0.7518163919448853, + "learning_rate": 1.4189972575979448e-05, + "loss": 2.7094, + "step": 55198 + }, + { + "epoch": 2.71, + "grad_norm": 0.7911694645881653, + "learning_rate": 1.4185293864532743e-05, + "loss": 3.0337, + "step": 55199 + }, + { + "epoch": 2.71, + "grad_norm": 0.7032492756843567, + "learning_rate": 1.418061590586701e-05, + "loss": 2.9355, + "step": 55200 + }, + { + "epoch": 2.71, + "grad_norm": 0.7293274402618408, + "learning_rate": 1.4175938699994504e-05, + "loss": 2.9049, + "step": 55201 + }, + { + "epoch": 2.71, + "grad_norm": 0.7434591054916382, + "learning_rate": 1.417126224692755e-05, + "loss": 2.8035, + "step": 55202 + }, + { + "epoch": 2.71, + "grad_norm": 0.7389995455741882, + "learning_rate": 1.4166586546678538e-05, + "loss": 2.8651, + "step": 55203 + }, + { + "epoch": 2.71, + "grad_norm": 0.7393417358398438, + "learning_rate": 1.416191159925969e-05, + "loss": 2.8286, + "step": 55204 + }, + { + "epoch": 2.71, + "grad_norm": 0.7257364988327026, + "learning_rate": 1.415723740468343e-05, + "loss": 2.9278, + "step": 55205 + }, + { + "epoch": 2.71, + "grad_norm": 0.8316163420677185, + "learning_rate": 1.4152563962961916e-05, + "loss": 2.9318, + "step": 55206 + }, + { + "epoch": 2.71, + "grad_norm": 0.718009889125824, + "learning_rate": 1.4147891274107503e-05, + "loss": 3.0043, + "step": 55207 + }, + { + "epoch": 2.71, + "grad_norm": 0.7439994812011719, + "learning_rate": 1.4143219338132616e-05, + "loss": 2.8376, + "step": 55208 + }, + { + "epoch": 2.71, + "grad_norm": 0.7357783317565918, + "learning_rate": 1.4138548155049411e-05, + "loss": 2.8897, + "step": 55209 + }, + { + "epoch": 2.71, + "grad_norm": 0.7854210734367371, + "learning_rate": 1.4133877724870312e-05, + "loss": 2.7374, + "step": 55210 + }, + { + "epoch": 2.71, + "grad_norm": 0.7327690124511719, + "learning_rate": 1.412920804760751e-05, + "loss": 2.7055, + "step": 55211 + }, + { + "epoch": 2.71, + "grad_norm": 0.784988284111023, + "learning_rate": 1.4124539123273392e-05, + "loss": 2.7917, + "step": 55212 + }, + { + "epoch": 2.71, + "grad_norm": 0.7121521830558777, + "learning_rate": 1.4119870951880252e-05, + "loss": 2.8236, + "step": 55213 + }, + { + "epoch": 2.71, + "grad_norm": 0.7981328368186951, + "learning_rate": 1.4115203533440278e-05, + "loss": 3.0959, + "step": 55214 + }, + { + "epoch": 2.71, + "grad_norm": 0.7848892211914062, + "learning_rate": 1.4110536867965894e-05, + "loss": 2.7955, + "step": 55215 + }, + { + "epoch": 2.71, + "grad_norm": 0.7660860419273376, + "learning_rate": 1.410587095546929e-05, + "loss": 3.0071, + "step": 55216 + }, + { + "epoch": 2.71, + "grad_norm": 0.7169570326805115, + "learning_rate": 1.4101205795962788e-05, + "loss": 2.6379, + "step": 55217 + }, + { + "epoch": 2.71, + "grad_norm": 0.7584700584411621, + "learning_rate": 1.409654138945875e-05, + "loss": 2.9686, + "step": 55218 + }, + { + "epoch": 2.71, + "grad_norm": 0.7803013920783997, + "learning_rate": 1.4091877735969393e-05, + "loss": 2.8314, + "step": 55219 + }, + { + "epoch": 2.71, + "grad_norm": 0.7412531971931458, + "learning_rate": 1.408721483550701e-05, + "loss": 2.9973, + "step": 55220 + }, + { + "epoch": 2.71, + "grad_norm": 0.8721296191215515, + "learning_rate": 1.4082552688083827e-05, + "loss": 2.917, + "step": 55221 + }, + { + "epoch": 2.71, + "grad_norm": 0.7726236581802368, + "learning_rate": 1.4077891293712162e-05, + "loss": 2.9424, + "step": 55222 + }, + { + "epoch": 2.71, + "grad_norm": 0.761952817440033, + "learning_rate": 1.4073230652404378e-05, + "loss": 2.6168, + "step": 55223 + }, + { + "epoch": 2.71, + "grad_norm": 0.7393391132354736, + "learning_rate": 1.4068570764172626e-05, + "loss": 2.9725, + "step": 55224 + }, + { + "epoch": 2.71, + "grad_norm": 0.7465383410453796, + "learning_rate": 1.4063911629029268e-05, + "loss": 2.8849, + "step": 55225 + }, + { + "epoch": 2.71, + "grad_norm": 0.7391934394836426, + "learning_rate": 1.405925324698649e-05, + "loss": 2.9616, + "step": 55226 + }, + { + "epoch": 2.71, + "grad_norm": 0.7626303434371948, + "learning_rate": 1.4054595618056652e-05, + "loss": 2.9186, + "step": 55227 + }, + { + "epoch": 2.71, + "grad_norm": 0.7824798226356506, + "learning_rate": 1.4049938742251976e-05, + "loss": 3.2266, + "step": 55228 + }, + { + "epoch": 2.71, + "grad_norm": 0.7267372012138367, + "learning_rate": 1.4045282619584685e-05, + "loss": 2.7209, + "step": 55229 + }, + { + "epoch": 2.71, + "grad_norm": 0.7535328269004822, + "learning_rate": 1.4040627250067138e-05, + "loss": 2.9377, + "step": 55230 + }, + { + "epoch": 2.71, + "grad_norm": 0.7468425035476685, + "learning_rate": 1.403597263371149e-05, + "loss": 2.8851, + "step": 55231 + }, + { + "epoch": 2.71, + "grad_norm": 0.7248378396034241, + "learning_rate": 1.4031318770530065e-05, + "loss": 2.6145, + "step": 55232 + }, + { + "epoch": 2.71, + "grad_norm": 0.7613323926925659, + "learning_rate": 1.4026665660535152e-05, + "loss": 2.9701, + "step": 55233 + }, + { + "epoch": 2.71, + "grad_norm": 0.7856761813163757, + "learning_rate": 1.4022013303738977e-05, + "loss": 2.8771, + "step": 55234 + }, + { + "epoch": 2.71, + "grad_norm": 0.7455305457115173, + "learning_rate": 1.4017361700153762e-05, + "loss": 2.9148, + "step": 55235 + }, + { + "epoch": 2.71, + "grad_norm": 0.7511041760444641, + "learning_rate": 1.4012710849791731e-05, + "loss": 2.6375, + "step": 55236 + }, + { + "epoch": 2.71, + "grad_norm": 0.7273489832878113, + "learning_rate": 1.4008060752665206e-05, + "loss": 2.7796, + "step": 55237 + }, + { + "epoch": 2.71, + "grad_norm": 0.744505763053894, + "learning_rate": 1.4003411408786414e-05, + "loss": 3.0129, + "step": 55238 + }, + { + "epoch": 2.71, + "grad_norm": 0.7265143394470215, + "learning_rate": 1.3998762818167575e-05, + "loss": 2.7688, + "step": 55239 + }, + { + "epoch": 2.71, + "grad_norm": 0.8545629978179932, + "learning_rate": 1.3994114980821014e-05, + "loss": 2.9477, + "step": 55240 + }, + { + "epoch": 2.71, + "grad_norm": 0.7982408404350281, + "learning_rate": 1.3989467896758887e-05, + "loss": 2.9287, + "step": 55241 + }, + { + "epoch": 2.71, + "grad_norm": 0.7544611692428589, + "learning_rate": 1.398482156599342e-05, + "loss": 2.8994, + "step": 55242 + }, + { + "epoch": 2.71, + "grad_norm": 0.8148059248924255, + "learning_rate": 1.3980175988536935e-05, + "loss": 2.81, + "step": 55243 + }, + { + "epoch": 2.71, + "grad_norm": 0.7339653372764587, + "learning_rate": 1.3975531164401555e-05, + "loss": 2.9793, + "step": 55244 + }, + { + "epoch": 2.71, + "grad_norm": 0.7554219961166382, + "learning_rate": 1.397088709359967e-05, + "loss": 3.0484, + "step": 55245 + }, + { + "epoch": 2.71, + "grad_norm": 0.8348968625068665, + "learning_rate": 1.3966243776143338e-05, + "loss": 3.0106, + "step": 55246 + }, + { + "epoch": 2.71, + "grad_norm": 0.7876124382019043, + "learning_rate": 1.3961601212044881e-05, + "loss": 2.8469, + "step": 55247 + }, + { + "epoch": 2.71, + "grad_norm": 0.7836396098136902, + "learning_rate": 1.3956959401316558e-05, + "loss": 2.7991, + "step": 55248 + }, + { + "epoch": 2.71, + "grad_norm": 0.7819403409957886, + "learning_rate": 1.3952318343970559e-05, + "loss": 3.033, + "step": 55249 + }, + { + "epoch": 2.71, + "grad_norm": 0.7380668520927429, + "learning_rate": 1.3947678040019105e-05, + "loss": 2.8926, + "step": 55250 + }, + { + "epoch": 2.71, + "grad_norm": 0.7316756248474121, + "learning_rate": 1.3943038489474356e-05, + "loss": 2.9032, + "step": 55251 + }, + { + "epoch": 2.71, + "grad_norm": 0.7308788299560547, + "learning_rate": 1.3938399692348667e-05, + "loss": 2.911, + "step": 55252 + }, + { + "epoch": 2.71, + "grad_norm": 0.7654592990875244, + "learning_rate": 1.393376164865413e-05, + "loss": 2.6837, + "step": 55253 + }, + { + "epoch": 2.71, + "grad_norm": 0.7962247133255005, + "learning_rate": 1.3929124358402999e-05, + "loss": 2.9718, + "step": 55254 + }, + { + "epoch": 2.71, + "grad_norm": 0.7708144187927246, + "learning_rate": 1.3924487821607534e-05, + "loss": 2.8453, + "step": 55255 + }, + { + "epoch": 2.71, + "grad_norm": 0.7123581767082214, + "learning_rate": 1.3919852038279888e-05, + "loss": 2.7315, + "step": 55256 + }, + { + "epoch": 2.71, + "grad_norm": 0.7534207105636597, + "learning_rate": 1.3915217008432355e-05, + "loss": 3.0169, + "step": 55257 + }, + { + "epoch": 2.71, + "grad_norm": 0.7725705504417419, + "learning_rate": 1.3910582732076991e-05, + "loss": 2.8719, + "step": 55258 + }, + { + "epoch": 2.71, + "grad_norm": 0.7859570384025574, + "learning_rate": 1.3905949209226119e-05, + "loss": 2.9936, + "step": 55259 + }, + { + "epoch": 2.71, + "grad_norm": 0.7947921752929688, + "learning_rate": 1.3901316439891962e-05, + "loss": 2.7604, + "step": 55260 + }, + { + "epoch": 2.71, + "grad_norm": 0.7809910774230957, + "learning_rate": 1.3896684424086613e-05, + "loss": 2.9163, + "step": 55261 + }, + { + "epoch": 2.71, + "grad_norm": 0.7197073698043823, + "learning_rate": 1.3892053161822392e-05, + "loss": 2.9279, + "step": 55262 + }, + { + "epoch": 2.71, + "grad_norm": 0.7367950677871704, + "learning_rate": 1.3887422653111424e-05, + "loss": 2.7356, + "step": 55263 + }, + { + "epoch": 2.71, + "grad_norm": 0.7174615263938904, + "learning_rate": 1.3882792897965899e-05, + "loss": 3.0581, + "step": 55264 + }, + { + "epoch": 2.71, + "grad_norm": 0.7707169055938721, + "learning_rate": 1.3878163896398076e-05, + "loss": 2.6856, + "step": 55265 + }, + { + "epoch": 2.71, + "grad_norm": 0.7517051100730896, + "learning_rate": 1.3873535648420043e-05, + "loss": 2.8619, + "step": 55266 + }, + { + "epoch": 2.71, + "grad_norm": 0.7613734602928162, + "learning_rate": 1.3868908154044089e-05, + "loss": 2.7424, + "step": 55267 + }, + { + "epoch": 2.71, + "grad_norm": 0.7365747690200806, + "learning_rate": 1.386428141328234e-05, + "loss": 3.0878, + "step": 55268 + }, + { + "epoch": 2.71, + "grad_norm": 0.7309760451316833, + "learning_rate": 1.385965542614702e-05, + "loss": 2.7943, + "step": 55269 + }, + { + "epoch": 2.71, + "grad_norm": 0.7792554497718811, + "learning_rate": 1.3855030192650318e-05, + "loss": 2.9993, + "step": 55270 + }, + { + "epoch": 2.71, + "grad_norm": 0.7756322622299194, + "learning_rate": 1.385040571280439e-05, + "loss": 2.5423, + "step": 55271 + }, + { + "epoch": 2.71, + "grad_norm": 0.7910360097885132, + "learning_rate": 1.3845781986621463e-05, + "loss": 3.0719, + "step": 55272 + }, + { + "epoch": 2.71, + "grad_norm": 0.7808008790016174, + "learning_rate": 1.384115901411359e-05, + "loss": 2.9894, + "step": 55273 + }, + { + "epoch": 2.71, + "grad_norm": 0.7794164419174194, + "learning_rate": 1.3836536795293063e-05, + "loss": 2.6242, + "step": 55274 + }, + { + "epoch": 2.71, + "grad_norm": 0.7962504625320435, + "learning_rate": 1.3831915330172038e-05, + "loss": 2.9159, + "step": 55275 + }, + { + "epoch": 2.71, + "grad_norm": 0.7393701076507568, + "learning_rate": 1.382729461876264e-05, + "loss": 2.8513, + "step": 55276 + }, + { + "epoch": 2.71, + "grad_norm": 0.7692285776138306, + "learning_rate": 1.3822674661077126e-05, + "loss": 2.7925, + "step": 55277 + }, + { + "epoch": 2.71, + "grad_norm": 0.721187949180603, + "learning_rate": 1.3818055457127586e-05, + "loss": 3.0527, + "step": 55278 + }, + { + "epoch": 2.71, + "grad_norm": 0.7725158929824829, + "learning_rate": 1.3813437006926209e-05, + "loss": 2.9564, + "step": 55279 + }, + { + "epoch": 2.71, + "grad_norm": 0.7646429538726807, + "learning_rate": 1.3808819310485186e-05, + "loss": 2.8219, + "step": 55280 + }, + { + "epoch": 2.71, + "grad_norm": 0.7827332019805908, + "learning_rate": 1.3804202367816608e-05, + "loss": 2.9961, + "step": 55281 + }, + { + "epoch": 2.71, + "grad_norm": 0.7357837557792664, + "learning_rate": 1.3799586178932697e-05, + "loss": 2.9484, + "step": 55282 + }, + { + "epoch": 2.71, + "grad_norm": 0.7550036311149597, + "learning_rate": 1.3794970743845612e-05, + "loss": 2.9146, + "step": 55283 + }, + { + "epoch": 2.71, + "grad_norm": 0.7835947275161743, + "learning_rate": 1.3790356062567475e-05, + "loss": 2.8771, + "step": 55284 + }, + { + "epoch": 2.71, + "grad_norm": 0.7576488852500916, + "learning_rate": 1.378574213511051e-05, + "loss": 2.6269, + "step": 55285 + }, + { + "epoch": 2.71, + "grad_norm": 0.7808263301849365, + "learning_rate": 1.3781128961486742e-05, + "loss": 2.813, + "step": 55286 + }, + { + "epoch": 2.71, + "grad_norm": 0.7787500619888306, + "learning_rate": 1.377651654170846e-05, + "loss": 2.8559, + "step": 55287 + }, + { + "epoch": 2.71, + "grad_norm": 0.8117019534111023, + "learning_rate": 1.3771904875787687e-05, + "loss": 2.8457, + "step": 55288 + }, + { + "epoch": 2.71, + "grad_norm": 0.712166965007782, + "learning_rate": 1.3767293963736647e-05, + "loss": 2.8873, + "step": 55289 + }, + { + "epoch": 2.71, + "grad_norm": 0.7200855016708374, + "learning_rate": 1.3762683805567498e-05, + "loss": 2.8848, + "step": 55290 + }, + { + "epoch": 2.71, + "grad_norm": 0.7672809958457947, + "learning_rate": 1.3758074401292296e-05, + "loss": 2.8106, + "step": 55291 + }, + { + "epoch": 2.71, + "grad_norm": 0.7226176261901855, + "learning_rate": 1.3753465750923297e-05, + "loss": 2.8904, + "step": 55292 + }, + { + "epoch": 2.71, + "grad_norm": 0.740872323513031, + "learning_rate": 1.3748857854472594e-05, + "loss": 2.9025, + "step": 55293 + }, + { + "epoch": 2.71, + "grad_norm": 0.7510790824890137, + "learning_rate": 1.3744250711952242e-05, + "loss": 2.9526, + "step": 55294 + }, + { + "epoch": 2.71, + "grad_norm": 0.7161646485328674, + "learning_rate": 1.37396443233745e-05, + "loss": 3.074, + "step": 55295 + }, + { + "epoch": 2.71, + "grad_norm": 0.7294967770576477, + "learning_rate": 1.3735038688751387e-05, + "loss": 2.8639, + "step": 55296 + }, + { + "epoch": 2.71, + "grad_norm": 0.7264935970306396, + "learning_rate": 1.3730433808095165e-05, + "loss": 2.8598, + "step": 55297 + }, + { + "epoch": 2.71, + "grad_norm": 0.8006035685539246, + "learning_rate": 1.3725829681417822e-05, + "loss": 2.9085, + "step": 55298 + }, + { + "epoch": 2.71, + "grad_norm": 0.7237944602966309, + "learning_rate": 1.372122630873158e-05, + "loss": 2.7831, + "step": 55299 + }, + { + "epoch": 2.71, + "grad_norm": 0.7685567140579224, + "learning_rate": 1.3716623690048534e-05, + "loss": 3.0923, + "step": 55300 + }, + { + "epoch": 2.71, + "grad_norm": 0.8011992573738098, + "learning_rate": 1.371202182538077e-05, + "loss": 2.9499, + "step": 55301 + }, + { + "epoch": 2.71, + "grad_norm": 0.7467533349990845, + "learning_rate": 1.370742071474048e-05, + "loss": 3.0305, + "step": 55302 + }, + { + "epoch": 2.71, + "grad_norm": 0.7417289614677429, + "learning_rate": 1.3702820358139721e-05, + "loss": 2.7992, + "step": 55303 + }, + { + "epoch": 2.71, + "grad_norm": 0.7307183742523193, + "learning_rate": 1.3698220755590616e-05, + "loss": 2.9265, + "step": 55304 + }, + { + "epoch": 2.71, + "grad_norm": 0.7761890888214111, + "learning_rate": 1.3693621907105357e-05, + "loss": 2.7917, + "step": 55305 + }, + { + "epoch": 2.71, + "grad_norm": 0.7544965744018555, + "learning_rate": 1.3689023812695999e-05, + "loss": 2.9144, + "step": 55306 + }, + { + "epoch": 2.71, + "grad_norm": 0.8068240284919739, + "learning_rate": 1.3684426472374632e-05, + "loss": 2.7519, + "step": 55307 + }, + { + "epoch": 2.71, + "grad_norm": 0.7504112124443054, + "learning_rate": 1.3679829886153349e-05, + "loss": 2.7891, + "step": 55308 + }, + { + "epoch": 2.71, + "grad_norm": 0.7085170149803162, + "learning_rate": 1.367523405404427e-05, + "loss": 2.6752, + "step": 55309 + }, + { + "epoch": 2.71, + "grad_norm": 0.7529444694519043, + "learning_rate": 1.3670638976059589e-05, + "loss": 3.0175, + "step": 55310 + }, + { + "epoch": 2.71, + "grad_norm": 0.7292396426200867, + "learning_rate": 1.3666044652211295e-05, + "loss": 2.9474, + "step": 55311 + }, + { + "epoch": 2.71, + "grad_norm": 0.787841260433197, + "learning_rate": 1.3661451082511577e-05, + "loss": 2.6969, + "step": 55312 + }, + { + "epoch": 2.71, + "grad_norm": 0.7327471375465393, + "learning_rate": 1.3656858266972427e-05, + "loss": 2.9287, + "step": 55313 + }, + { + "epoch": 2.71, + "grad_norm": 0.7643541693687439, + "learning_rate": 1.3652266205606065e-05, + "loss": 2.8279, + "step": 55314 + }, + { + "epoch": 2.71, + "grad_norm": 0.6762557625770569, + "learning_rate": 1.364767489842452e-05, + "loss": 2.8347, + "step": 55315 + }, + { + "epoch": 2.71, + "grad_norm": 0.726814866065979, + "learning_rate": 1.3643084345439847e-05, + "loss": 2.7628, + "step": 55316 + }, + { + "epoch": 2.71, + "grad_norm": 0.770945131778717, + "learning_rate": 1.3638494546664236e-05, + "loss": 2.8228, + "step": 55317 + }, + { + "epoch": 2.71, + "grad_norm": 0.7129945158958435, + "learning_rate": 1.3633905502109676e-05, + "loss": 2.8855, + "step": 55318 + }, + { + "epoch": 2.71, + "grad_norm": 0.7419125437736511, + "learning_rate": 1.362931721178826e-05, + "loss": 2.8758, + "step": 55319 + }, + { + "epoch": 2.71, + "grad_norm": 0.7454127669334412, + "learning_rate": 1.3624729675712176e-05, + "loss": 2.9655, + "step": 55320 + }, + { + "epoch": 2.71, + "grad_norm": 0.7969364523887634, + "learning_rate": 1.362014289389345e-05, + "loss": 3.003, + "step": 55321 + }, + { + "epoch": 2.71, + "grad_norm": 0.7323818206787109, + "learning_rate": 1.3615556866344135e-05, + "loss": 2.8169, + "step": 55322 + }, + { + "epoch": 2.71, + "grad_norm": 0.7280770540237427, + "learning_rate": 1.3610971593076293e-05, + "loss": 2.7861, + "step": 55323 + }, + { + "epoch": 2.71, + "grad_norm": 0.808602511882782, + "learning_rate": 1.3606387074102042e-05, + "loss": 2.9433, + "step": 55324 + }, + { + "epoch": 2.71, + "grad_norm": 0.717870831489563, + "learning_rate": 1.360180330943348e-05, + "loss": 2.9154, + "step": 55325 + }, + { + "epoch": 2.71, + "grad_norm": 0.7248071432113647, + "learning_rate": 1.3597220299082623e-05, + "loss": 2.9233, + "step": 55326 + }, + { + "epoch": 2.71, + "grad_norm": 0.7266684174537659, + "learning_rate": 1.3592638043061598e-05, + "loss": 3.1069, + "step": 55327 + }, + { + "epoch": 2.71, + "grad_norm": 0.7501939535140991, + "learning_rate": 1.3588056541382397e-05, + "loss": 2.9581, + "step": 55328 + }, + { + "epoch": 2.71, + "grad_norm": 0.7233149409294128, + "learning_rate": 1.3583475794057207e-05, + "loss": 2.8132, + "step": 55329 + }, + { + "epoch": 2.71, + "grad_norm": 0.8018295168876648, + "learning_rate": 1.3578895801097989e-05, + "loss": 3.065, + "step": 55330 + }, + { + "epoch": 2.71, + "grad_norm": 0.7703627347946167, + "learning_rate": 1.3574316562516796e-05, + "loss": 2.7958, + "step": 55331 + }, + { + "epoch": 2.71, + "grad_norm": 0.7846996188163757, + "learning_rate": 1.3569738078325788e-05, + "loss": 2.72, + "step": 55332 + }, + { + "epoch": 2.71, + "grad_norm": 0.8439915180206299, + "learning_rate": 1.3565160348536918e-05, + "loss": 2.673, + "step": 55333 + }, + { + "epoch": 2.71, + "grad_norm": 0.7853070497512817, + "learning_rate": 1.3560583373162315e-05, + "loss": 2.8157, + "step": 55334 + }, + { + "epoch": 2.71, + "grad_norm": 0.7770476937294006, + "learning_rate": 1.3556007152213966e-05, + "loss": 2.9171, + "step": 55335 + }, + { + "epoch": 2.71, + "grad_norm": 0.7779586315155029, + "learning_rate": 1.3551431685704028e-05, + "loss": 2.9688, + "step": 55336 + }, + { + "epoch": 2.71, + "grad_norm": 0.7522245049476624, + "learning_rate": 1.3546856973644493e-05, + "loss": 2.7268, + "step": 55337 + }, + { + "epoch": 2.71, + "grad_norm": 0.7922734022140503, + "learning_rate": 1.354228301604735e-05, + "loss": 2.7946, + "step": 55338 + }, + { + "epoch": 2.71, + "grad_norm": 0.7396979331970215, + "learning_rate": 1.3537709812924757e-05, + "loss": 2.9228, + "step": 55339 + }, + { + "epoch": 2.71, + "grad_norm": 0.7690479755401611, + "learning_rate": 1.3533137364288671e-05, + "loss": 2.8392, + "step": 55340 + }, + { + "epoch": 2.71, + "grad_norm": 0.8230964541435242, + "learning_rate": 1.3528565670151149e-05, + "loss": 2.7559, + "step": 55341 + }, + { + "epoch": 2.71, + "grad_norm": 0.7428261637687683, + "learning_rate": 1.3523994730524312e-05, + "loss": 2.8789, + "step": 55342 + }, + { + "epoch": 2.71, + "grad_norm": 0.7513474822044373, + "learning_rate": 1.351942454542012e-05, + "loss": 2.711, + "step": 55343 + }, + { + "epoch": 2.71, + "grad_norm": 0.7638099193572998, + "learning_rate": 1.351485511485063e-05, + "loss": 2.6492, + "step": 55344 + }, + { + "epoch": 2.71, + "grad_norm": 0.7367132306098938, + "learning_rate": 1.3510286438827866e-05, + "loss": 2.936, + "step": 55345 + }, + { + "epoch": 2.71, + "grad_norm": 0.7802484035491943, + "learning_rate": 1.3505718517363851e-05, + "loss": 2.9882, + "step": 55346 + }, + { + "epoch": 2.71, + "grad_norm": 0.7542453408241272, + "learning_rate": 1.3501151350470673e-05, + "loss": 2.8119, + "step": 55347 + }, + { + "epoch": 2.71, + "grad_norm": 0.79583340883255, + "learning_rate": 1.3496584938160292e-05, + "loss": 2.8344, + "step": 55348 + }, + { + "epoch": 2.71, + "grad_norm": 0.7419581413269043, + "learning_rate": 1.3492019280444798e-05, + "loss": 2.6465, + "step": 55349 + }, + { + "epoch": 2.71, + "grad_norm": 0.7605627775192261, + "learning_rate": 1.348745437733615e-05, + "loss": 2.7418, + "step": 55350 + }, + { + "epoch": 2.71, + "grad_norm": 0.7467504143714905, + "learning_rate": 1.3482890228846466e-05, + "loss": 2.8421, + "step": 55351 + }, + { + "epoch": 2.71, + "grad_norm": 0.7766313552856445, + "learning_rate": 1.3478326834987673e-05, + "loss": 2.8587, + "step": 55352 + }, + { + "epoch": 2.71, + "grad_norm": 0.7753242254257202, + "learning_rate": 1.3473764195771797e-05, + "loss": 2.7824, + "step": 55353 + }, + { + "epoch": 2.71, + "grad_norm": 0.7845016121864319, + "learning_rate": 1.3469202311210925e-05, + "loss": 2.9468, + "step": 55354 + }, + { + "epoch": 2.71, + "grad_norm": 0.7222416400909424, + "learning_rate": 1.3464641181317015e-05, + "loss": 3.0865, + "step": 55355 + }, + { + "epoch": 2.71, + "grad_norm": 0.7512432932853699, + "learning_rate": 1.3460080806102057e-05, + "loss": 2.6878, + "step": 55356 + }, + { + "epoch": 2.71, + "grad_norm": 0.7436035871505737, + "learning_rate": 1.3455521185578177e-05, + "loss": 2.7787, + "step": 55357 + }, + { + "epoch": 2.71, + "grad_norm": 0.7184932827949524, + "learning_rate": 1.3450962319757263e-05, + "loss": 2.8446, + "step": 55358 + }, + { + "epoch": 2.71, + "grad_norm": 0.748602569103241, + "learning_rate": 1.3446404208651406e-05, + "loss": 2.948, + "step": 55359 + }, + { + "epoch": 2.71, + "grad_norm": 0.8889462947845459, + "learning_rate": 1.344184685227253e-05, + "loss": 2.9219, + "step": 55360 + }, + { + "epoch": 2.71, + "grad_norm": 0.7768132090568542, + "learning_rate": 1.343729025063266e-05, + "loss": 2.8009, + "step": 55361 + }, + { + "epoch": 2.71, + "grad_norm": 0.727043628692627, + "learning_rate": 1.343273440374385e-05, + "loss": 2.7721, + "step": 55362 + }, + { + "epoch": 2.71, + "grad_norm": 0.7611899375915527, + "learning_rate": 1.3428179311618058e-05, + "loss": 2.8881, + "step": 55363 + }, + { + "epoch": 2.71, + "grad_norm": 0.7601465582847595, + "learning_rate": 1.3423624974267312e-05, + "loss": 2.8482, + "step": 55364 + }, + { + "epoch": 2.71, + "grad_norm": 0.7595193386077881, + "learning_rate": 1.3419071391703596e-05, + "loss": 2.8653, + "step": 55365 + }, + { + "epoch": 2.71, + "grad_norm": 0.8344078660011292, + "learning_rate": 1.3414518563938837e-05, + "loss": 2.969, + "step": 55366 + }, + { + "epoch": 2.71, + "grad_norm": 0.7677531242370605, + "learning_rate": 1.3409966490985125e-05, + "loss": 2.7756, + "step": 55367 + }, + { + "epoch": 2.71, + "grad_norm": 0.7744829654693604, + "learning_rate": 1.3405415172854384e-05, + "loss": 2.9537, + "step": 55368 + }, + { + "epoch": 2.71, + "grad_norm": 0.7642647624015808, + "learning_rate": 1.3400864609558637e-05, + "loss": 2.6661, + "step": 55369 + }, + { + "epoch": 2.71, + "grad_norm": 0.7297582626342773, + "learning_rate": 1.3396314801109808e-05, + "loss": 2.977, + "step": 55370 + }, + { + "epoch": 2.71, + "grad_norm": 0.7670851945877075, + "learning_rate": 1.3391765747519956e-05, + "loss": 2.9117, + "step": 55371 + }, + { + "epoch": 2.71, + "grad_norm": 0.7472463250160217, + "learning_rate": 1.3387217448801069e-05, + "loss": 2.8681, + "step": 55372 + }, + { + "epoch": 2.71, + "grad_norm": 0.7477490305900574, + "learning_rate": 1.3382669904965105e-05, + "loss": 2.8268, + "step": 55373 + }, + { + "epoch": 2.71, + "grad_norm": 0.7499014735221863, + "learning_rate": 1.3378123116023987e-05, + "loss": 2.9298, + "step": 55374 + }, + { + "epoch": 2.71, + "grad_norm": 0.748065710067749, + "learning_rate": 1.3373577081989707e-05, + "loss": 2.9597, + "step": 55375 + }, + { + "epoch": 2.71, + "grad_norm": 0.7136443257331848, + "learning_rate": 1.3369031802874286e-05, + "loss": 2.8244, + "step": 55376 + }, + { + "epoch": 2.71, + "grad_norm": 0.7487014532089233, + "learning_rate": 1.3364487278689684e-05, + "loss": 2.7234, + "step": 55377 + }, + { + "epoch": 2.71, + "grad_norm": 0.7109510898590088, + "learning_rate": 1.3359943509447858e-05, + "loss": 3.0295, + "step": 55378 + }, + { + "epoch": 2.71, + "grad_norm": 0.7548745274543762, + "learning_rate": 1.3355400495160762e-05, + "loss": 2.9223, + "step": 55379 + }, + { + "epoch": 2.71, + "grad_norm": 0.7589823007583618, + "learning_rate": 1.3350858235840423e-05, + "loss": 2.8757, + "step": 55380 + }, + { + "epoch": 2.71, + "grad_norm": 0.8094698190689087, + "learning_rate": 1.3346316731498697e-05, + "loss": 2.9904, + "step": 55381 + }, + { + "epoch": 2.71, + "grad_norm": 0.7765593528747559, + "learning_rate": 1.334177598214764e-05, + "loss": 2.9588, + "step": 55382 + }, + { + "epoch": 2.71, + "grad_norm": 0.7499661445617676, + "learning_rate": 1.3337235987799144e-05, + "loss": 2.8294, + "step": 55383 + }, + { + "epoch": 2.71, + "grad_norm": 0.7597896456718445, + "learning_rate": 1.3332696748465232e-05, + "loss": 2.8076, + "step": 55384 + }, + { + "epoch": 2.71, + "grad_norm": 0.7389174103736877, + "learning_rate": 1.3328158264157762e-05, + "loss": 3.1259, + "step": 55385 + }, + { + "epoch": 2.71, + "grad_norm": 0.7625584006309509, + "learning_rate": 1.3323620534888823e-05, + "loss": 2.8586, + "step": 55386 + }, + { + "epoch": 2.71, + "grad_norm": 0.7467266917228699, + "learning_rate": 1.3319083560670308e-05, + "loss": 3.0767, + "step": 55387 + }, + { + "epoch": 2.71, + "grad_norm": 0.7760413885116577, + "learning_rate": 1.3314547341514104e-05, + "loss": 3.0268, + "step": 55388 + }, + { + "epoch": 2.71, + "grad_norm": 0.7491160035133362, + "learning_rate": 1.3310011877432236e-05, + "loss": 2.9444, + "step": 55389 + }, + { + "epoch": 2.71, + "grad_norm": 0.8000679612159729, + "learning_rate": 1.3305477168436596e-05, + "loss": 2.9765, + "step": 55390 + }, + { + "epoch": 2.71, + "grad_norm": 1.0552611351013184, + "learning_rate": 1.3300943214539138e-05, + "loss": 2.9166, + "step": 55391 + }, + { + "epoch": 2.71, + "grad_norm": 0.7722744345664978, + "learning_rate": 1.3296410015751858e-05, + "loss": 2.8614, + "step": 55392 + }, + { + "epoch": 2.71, + "grad_norm": 0.7588552832603455, + "learning_rate": 1.329187757208664e-05, + "loss": 2.7327, + "step": 55393 + }, + { + "epoch": 2.71, + "grad_norm": 0.7379563450813293, + "learning_rate": 1.3287345883555445e-05, + "loss": 2.8572, + "step": 55394 + }, + { + "epoch": 2.71, + "grad_norm": 0.7401456236839294, + "learning_rate": 1.328281495017023e-05, + "loss": 2.8642, + "step": 55395 + }, + { + "epoch": 2.71, + "grad_norm": 0.7446950078010559, + "learning_rate": 1.3278284771942849e-05, + "loss": 2.8721, + "step": 55396 + }, + { + "epoch": 2.71, + "grad_norm": 0.788352370262146, + "learning_rate": 1.3273755348885329e-05, + "loss": 2.9431, + "step": 55397 + }, + { + "epoch": 2.71, + "grad_norm": 0.8451722264289856, + "learning_rate": 1.3269226681009525e-05, + "loss": 2.822, + "step": 55398 + }, + { + "epoch": 2.71, + "grad_norm": 0.7008662223815918, + "learning_rate": 1.3264698768327432e-05, + "loss": 3.1527, + "step": 55399 + }, + { + "epoch": 2.72, + "grad_norm": 0.7523859143257141, + "learning_rate": 1.3260171610850867e-05, + "loss": 2.9673, + "step": 55400 + }, + { + "epoch": 2.72, + "grad_norm": 0.7550097703933716, + "learning_rate": 1.3255645208591893e-05, + "loss": 2.9408, + "step": 55401 + }, + { + "epoch": 2.72, + "grad_norm": 0.7374807596206665, + "learning_rate": 1.3251119561562362e-05, + "loss": 2.815, + "step": 55402 + }, + { + "epoch": 2.72, + "grad_norm": 0.7401943802833557, + "learning_rate": 1.3246594669774169e-05, + "loss": 2.8424, + "step": 55403 + }, + { + "epoch": 2.72, + "grad_norm": 0.7593181729316711, + "learning_rate": 1.3242070533239302e-05, + "loss": 2.9994, + "step": 55404 + }, + { + "epoch": 2.72, + "grad_norm": 0.7926674485206604, + "learning_rate": 1.3237547151969552e-05, + "loss": 2.9654, + "step": 55405 + }, + { + "epoch": 2.72, + "grad_norm": 0.7539723515510559, + "learning_rate": 1.323302452597701e-05, + "loss": 2.8782, + "step": 55406 + }, + { + "epoch": 2.72, + "grad_norm": 0.776358962059021, + "learning_rate": 1.3228502655273398e-05, + "loss": 3.0188, + "step": 55407 + }, + { + "epoch": 2.72, + "grad_norm": 0.7997563481330872, + "learning_rate": 1.322398153987081e-05, + "loss": 2.9002, + "step": 55408 + }, + { + "epoch": 2.72, + "grad_norm": 0.7761156558990479, + "learning_rate": 1.3219461179781033e-05, + "loss": 2.8597, + "step": 55409 + }, + { + "epoch": 2.72, + "grad_norm": 0.7830085158348083, + "learning_rate": 1.3214941575015958e-05, + "loss": 2.8994, + "step": 55410 + }, + { + "epoch": 2.72, + "grad_norm": 0.7592552304267883, + "learning_rate": 1.3210422725587577e-05, + "loss": 2.755, + "step": 55411 + }, + { + "epoch": 2.72, + "grad_norm": 0.716784656047821, + "learning_rate": 1.3205904631507714e-05, + "loss": 2.8458, + "step": 55412 + }, + { + "epoch": 2.72, + "grad_norm": 0.7735759615898132, + "learning_rate": 1.3201387292788324e-05, + "loss": 2.9543, + "step": 55413 + }, + { + "epoch": 2.72, + "grad_norm": 0.7621521949768066, + "learning_rate": 1.3196870709441298e-05, + "loss": 3.0033, + "step": 55414 + }, + { + "epoch": 2.72, + "grad_norm": 0.7094641923904419, + "learning_rate": 1.3192354881478496e-05, + "loss": 3.1288, + "step": 55415 + }, + { + "epoch": 2.72, + "grad_norm": 0.7761197090148926, + "learning_rate": 1.3187839808911871e-05, + "loss": 3.1512, + "step": 55416 + }, + { + "epoch": 2.72, + "grad_norm": 0.8005975484848022, + "learning_rate": 1.318332549175325e-05, + "loss": 2.9146, + "step": 55417 + }, + { + "epoch": 2.72, + "grad_norm": 0.7150023579597473, + "learning_rate": 1.3178811930014554e-05, + "loss": 2.6611, + "step": 55418 + }, + { + "epoch": 2.72, + "grad_norm": 0.7599121332168579, + "learning_rate": 1.3174299123707677e-05, + "loss": 2.8705, + "step": 55419 + }, + { + "epoch": 2.72, + "grad_norm": 0.8056515455245972, + "learning_rate": 1.3169787072844473e-05, + "loss": 2.8577, + "step": 55420 + }, + { + "epoch": 2.72, + "grad_norm": 0.7697814106941223, + "learning_rate": 1.3165275777436867e-05, + "loss": 3.1584, + "step": 55421 + }, + { + "epoch": 2.72, + "grad_norm": 0.7207685112953186, + "learning_rate": 1.3160765237496685e-05, + "loss": 3.1243, + "step": 55422 + }, + { + "epoch": 2.72, + "grad_norm": 0.7401747107505798, + "learning_rate": 1.3156255453035914e-05, + "loss": 3.0593, + "step": 55423 + }, + { + "epoch": 2.72, + "grad_norm": 0.8038659691810608, + "learning_rate": 1.3151746424066312e-05, + "loss": 2.9738, + "step": 55424 + }, + { + "epoch": 2.72, + "grad_norm": 0.7309437990188599, + "learning_rate": 1.3147238150599804e-05, + "loss": 2.9488, + "step": 55425 + }, + { + "epoch": 2.72, + "grad_norm": 0.752806544303894, + "learning_rate": 1.3142730632648313e-05, + "loss": 2.7383, + "step": 55426 + }, + { + "epoch": 2.72, + "grad_norm": 0.72652268409729, + "learning_rate": 1.3138223870223597e-05, + "loss": 2.7769, + "step": 55427 + }, + { + "epoch": 2.72, + "grad_norm": 0.75226891040802, + "learning_rate": 1.3133717863337612e-05, + "loss": 3.0681, + "step": 55428 + }, + { + "epoch": 2.72, + "grad_norm": 0.7673428654670715, + "learning_rate": 1.3129212612002215e-05, + "loss": 2.7021, + "step": 55429 + }, + { + "epoch": 2.72, + "grad_norm": 0.7407193779945374, + "learning_rate": 1.3124708116229233e-05, + "loss": 2.9799, + "step": 55430 + }, + { + "epoch": 2.72, + "grad_norm": 0.7485948801040649, + "learning_rate": 1.3120204376030652e-05, + "loss": 3.0959, + "step": 55431 + }, + { + "epoch": 2.72, + "grad_norm": 0.7666661739349365, + "learning_rate": 1.3115701391418165e-05, + "loss": 2.7309, + "step": 55432 + }, + { + "epoch": 2.72, + "grad_norm": 0.7030153870582581, + "learning_rate": 1.3111199162403697e-05, + "loss": 2.716, + "step": 55433 + }, + { + "epoch": 2.72, + "grad_norm": 0.6960538625717163, + "learning_rate": 1.3106697688999169e-05, + "loss": 2.7599, + "step": 55434 + }, + { + "epoch": 2.72, + "grad_norm": 0.6722820997238159, + "learning_rate": 1.310219697121634e-05, + "loss": 2.941, + "step": 55435 + }, + { + "epoch": 2.72, + "grad_norm": 0.7488369941711426, + "learning_rate": 1.3097697009067132e-05, + "loss": 2.8376, + "step": 55436 + }, + { + "epoch": 2.72, + "grad_norm": 0.7796913981437683, + "learning_rate": 1.3093197802563338e-05, + "loss": 2.9606, + "step": 55437 + }, + { + "epoch": 2.72, + "grad_norm": 0.6959371566772461, + "learning_rate": 1.3088699351716914e-05, + "loss": 2.9329, + "step": 55438 + }, + { + "epoch": 2.72, + "grad_norm": 0.7195696234703064, + "learning_rate": 1.3084201656539617e-05, + "loss": 2.8452, + "step": 55439 + }, + { + "epoch": 2.72, + "grad_norm": 0.7293813824653625, + "learning_rate": 1.3079704717043272e-05, + "loss": 2.7457, + "step": 55440 + }, + { + "epoch": 2.72, + "grad_norm": 0.7376397252082825, + "learning_rate": 1.3075208533239834e-05, + "loss": 2.6492, + "step": 55441 + }, + { + "epoch": 2.72, + "grad_norm": 0.7778947353363037, + "learning_rate": 1.3070713105140996e-05, + "loss": 2.8703, + "step": 55442 + }, + { + "epoch": 2.72, + "grad_norm": 0.7461794018745422, + "learning_rate": 1.3066218432758712e-05, + "loss": 2.8688, + "step": 55443 + }, + { + "epoch": 2.72, + "grad_norm": 0.7236195802688599, + "learning_rate": 1.306172451610481e-05, + "loss": 2.9004, + "step": 55444 + }, + { + "epoch": 2.72, + "grad_norm": 0.8100160360336304, + "learning_rate": 1.305723135519111e-05, + "loss": 2.7097, + "step": 55445 + }, + { + "epoch": 2.72, + "grad_norm": 0.7482362389564514, + "learning_rate": 1.305273895002944e-05, + "loss": 2.6586, + "step": 55446 + }, + { + "epoch": 2.72, + "grad_norm": 0.7682929635047913, + "learning_rate": 1.3048247300631587e-05, + "loss": 2.9453, + "step": 55447 + }, + { + "epoch": 2.72, + "grad_norm": 0.7298302054405212, + "learning_rate": 1.304375640700941e-05, + "loss": 2.8216, + "step": 55448 + }, + { + "epoch": 2.72, + "grad_norm": 0.7340006828308105, + "learning_rate": 1.303926626917483e-05, + "loss": 2.8007, + "step": 55449 + }, + { + "epoch": 2.72, + "grad_norm": 0.7444607615470886, + "learning_rate": 1.3034776887139509e-05, + "loss": 2.9086, + "step": 55450 + }, + { + "epoch": 2.72, + "grad_norm": 0.7466793060302734, + "learning_rate": 1.3030288260915433e-05, + "loss": 3.0851, + "step": 55451 + }, + { + "epoch": 2.72, + "grad_norm": 0.7335530519485474, + "learning_rate": 1.3025800390514296e-05, + "loss": 2.8182, + "step": 55452 + }, + { + "epoch": 2.72, + "grad_norm": 0.7638733983039856, + "learning_rate": 1.302131327594802e-05, + "loss": 2.7318, + "step": 55453 + }, + { + "epoch": 2.72, + "grad_norm": 0.8187944293022156, + "learning_rate": 1.3016826917228362e-05, + "loss": 3.0425, + "step": 55454 + }, + { + "epoch": 2.72, + "grad_norm": 0.7817355990409851, + "learning_rate": 1.3012341314367147e-05, + "loss": 2.9438, + "step": 55455 + }, + { + "epoch": 2.72, + "grad_norm": 0.7346907258033752, + "learning_rate": 1.3007856467376198e-05, + "loss": 3.1035, + "step": 55456 + }, + { + "epoch": 2.72, + "grad_norm": 0.7391301989555359, + "learning_rate": 1.3003372376267273e-05, + "loss": 3.0594, + "step": 55457 + }, + { + "epoch": 2.72, + "grad_norm": 0.7475512027740479, + "learning_rate": 1.2998889041052263e-05, + "loss": 3.0051, + "step": 55458 + }, + { + "epoch": 2.72, + "grad_norm": 0.7690040469169617, + "learning_rate": 1.2994406461742957e-05, + "loss": 2.7683, + "step": 55459 + }, + { + "epoch": 2.72, + "grad_norm": 0.7919315099716187, + "learning_rate": 1.298992463835118e-05, + "loss": 2.8209, + "step": 55460 + }, + { + "epoch": 2.72, + "grad_norm": 0.7413168549537659, + "learning_rate": 1.2985443570888687e-05, + "loss": 2.9064, + "step": 55461 + }, + { + "epoch": 2.72, + "grad_norm": 0.7446701526641846, + "learning_rate": 1.2980963259367238e-05, + "loss": 2.9471, + "step": 55462 + }, + { + "epoch": 2.72, + "grad_norm": 0.7528586983680725, + "learning_rate": 1.2976483703798724e-05, + "loss": 2.7709, + "step": 55463 + }, + { + "epoch": 2.72, + "grad_norm": 0.7157964706420898, + "learning_rate": 1.2972004904194965e-05, + "loss": 3.0465, + "step": 55464 + }, + { + "epoch": 2.72, + "grad_norm": 0.7323057651519775, + "learning_rate": 1.2967526860567623e-05, + "loss": 2.8482, + "step": 55465 + }, + { + "epoch": 2.72, + "grad_norm": 0.7675830721855164, + "learning_rate": 1.2963049572928652e-05, + "loss": 2.8903, + "step": 55466 + }, + { + "epoch": 2.72, + "grad_norm": 0.7248068451881409, + "learning_rate": 1.2958573041289744e-05, + "loss": 2.8582, + "step": 55467 + }, + { + "epoch": 2.72, + "grad_norm": 0.8281282186508179, + "learning_rate": 1.2954097265662689e-05, + "loss": 3.0083, + "step": 55468 + }, + { + "epoch": 2.72, + "grad_norm": 0.7674946784973145, + "learning_rate": 1.2949622246059311e-05, + "loss": 2.9369, + "step": 55469 + }, + { + "epoch": 2.72, + "grad_norm": 0.7439644932746887, + "learning_rate": 1.2945147982491367e-05, + "loss": 2.9113, + "step": 55470 + }, + { + "epoch": 2.72, + "grad_norm": 0.7129039764404297, + "learning_rate": 1.2940674474970713e-05, + "loss": 2.8134, + "step": 55471 + }, + { + "epoch": 2.72, + "grad_norm": 0.8135375380516052, + "learning_rate": 1.293620172350901e-05, + "loss": 2.8027, + "step": 55472 + }, + { + "epoch": 2.72, + "grad_norm": 0.735965371131897, + "learning_rate": 1.2931729728118112e-05, + "loss": 2.8844, + "step": 55473 + }, + { + "epoch": 2.72, + "grad_norm": 0.7808622717857361, + "learning_rate": 1.2927258488809845e-05, + "loss": 3.1059, + "step": 55474 + }, + { + "epoch": 2.72, + "grad_norm": 0.6862757205963135, + "learning_rate": 1.2922788005595896e-05, + "loss": 2.6411, + "step": 55475 + }, + { + "epoch": 2.72, + "grad_norm": 0.7670363187789917, + "learning_rate": 1.2918318278488093e-05, + "loss": 2.93, + "step": 55476 + }, + { + "epoch": 2.72, + "grad_norm": 0.8030156493186951, + "learning_rate": 1.2913849307498158e-05, + "loss": 2.9131, + "step": 55477 + }, + { + "epoch": 2.72, + "grad_norm": 0.7516763806343079, + "learning_rate": 1.2909381092637883e-05, + "loss": 2.9154, + "step": 55478 + }, + { + "epoch": 2.72, + "grad_norm": 0.7685103416442871, + "learning_rate": 1.290491363391909e-05, + "loss": 2.8171, + "step": 55479 + }, + { + "epoch": 2.72, + "grad_norm": 0.7892995476722717, + "learning_rate": 1.290044693135347e-05, + "loss": 2.8467, + "step": 55480 + }, + { + "epoch": 2.72, + "grad_norm": 0.7564141154289246, + "learning_rate": 1.2895980984952847e-05, + "loss": 2.9793, + "step": 55481 + }, + { + "epoch": 2.72, + "grad_norm": 0.7547937631607056, + "learning_rate": 1.2891515794728979e-05, + "loss": 2.723, + "step": 55482 + }, + { + "epoch": 2.72, + "grad_norm": 0.7510333061218262, + "learning_rate": 1.2887051360693524e-05, + "loss": 2.8364, + "step": 55483 + }, + { + "epoch": 2.72, + "grad_norm": 0.738362193107605, + "learning_rate": 1.2882587682858403e-05, + "loss": 3.0928, + "step": 55484 + }, + { + "epoch": 2.72, + "grad_norm": 0.7294523119926453, + "learning_rate": 1.2878124761235208e-05, + "loss": 2.8948, + "step": 55485 + }, + { + "epoch": 2.72, + "grad_norm": 0.7492228746414185, + "learning_rate": 1.2873662595835833e-05, + "loss": 2.9309, + "step": 55486 + }, + { + "epoch": 2.72, + "grad_norm": 0.7402193546295166, + "learning_rate": 1.286920118667193e-05, + "loss": 2.9529, + "step": 55487 + }, + { + "epoch": 2.72, + "grad_norm": 0.708809494972229, + "learning_rate": 1.2864740533755357e-05, + "loss": 2.8814, + "step": 55488 + }, + { + "epoch": 2.72, + "grad_norm": 0.749814510345459, + "learning_rate": 1.2860280637097776e-05, + "loss": 3.1164, + "step": 55489 + }, + { + "epoch": 2.72, + "grad_norm": 0.7216050624847412, + "learning_rate": 1.2855821496710905e-05, + "loss": 3.0579, + "step": 55490 + }, + { + "epoch": 2.72, + "grad_norm": 0.7654860019683838, + "learning_rate": 1.2851363112606572e-05, + "loss": 2.9416, + "step": 55491 + }, + { + "epoch": 2.72, + "grad_norm": 0.7191634178161621, + "learning_rate": 1.2846905484796466e-05, + "loss": 2.8358, + "step": 55492 + }, + { + "epoch": 2.72, + "grad_norm": 0.7747024893760681, + "learning_rate": 1.2842448613292378e-05, + "loss": 2.883, + "step": 55493 + }, + { + "epoch": 2.72, + "grad_norm": 0.7576825022697449, + "learning_rate": 1.2837992498105998e-05, + "loss": 2.858, + "step": 55494 + }, + { + "epoch": 2.72, + "grad_norm": 0.7463220953941345, + "learning_rate": 1.283353713924905e-05, + "loss": 2.8142, + "step": 55495 + }, + { + "epoch": 2.72, + "grad_norm": 0.7193440198898315, + "learning_rate": 1.282908253673336e-05, + "loss": 2.7727, + "step": 55496 + }, + { + "epoch": 2.72, + "grad_norm": 0.7219351530075073, + "learning_rate": 1.2824628690570582e-05, + "loss": 2.8686, + "step": 55497 + }, + { + "epoch": 2.72, + "grad_norm": 0.7690739035606384, + "learning_rate": 1.2820175600772475e-05, + "loss": 3.065, + "step": 55498 + }, + { + "epoch": 2.72, + "grad_norm": 0.7162508964538574, + "learning_rate": 1.281572326735073e-05, + "loss": 2.8442, + "step": 55499 + }, + { + "epoch": 2.72, + "grad_norm": 0.7276342511177063, + "learning_rate": 1.281127169031707e-05, + "loss": 3.0282, + "step": 55500 + }, + { + "epoch": 2.72, + "grad_norm": 0.7825616002082825, + "learning_rate": 1.2806820869683288e-05, + "loss": 2.9913, + "step": 55501 + }, + { + "epoch": 2.72, + "grad_norm": 0.742638111114502, + "learning_rate": 1.280237080546107e-05, + "loss": 2.76, + "step": 55502 + }, + { + "epoch": 2.72, + "grad_norm": 0.750740647315979, + "learning_rate": 1.2797921497662145e-05, + "loss": 3.0382, + "step": 55503 + }, + { + "epoch": 2.72, + "grad_norm": 0.7699446082115173, + "learning_rate": 1.2793472946298233e-05, + "loss": 2.8473, + "step": 55504 + }, + { + "epoch": 2.72, + "grad_norm": 0.7575497627258301, + "learning_rate": 1.2789025151380994e-05, + "loss": 2.777, + "step": 55505 + }, + { + "epoch": 2.72, + "grad_norm": 0.9555964469909668, + "learning_rate": 1.2784578112922217e-05, + "loss": 2.9411, + "step": 55506 + }, + { + "epoch": 2.72, + "grad_norm": 0.785099983215332, + "learning_rate": 1.278013183093356e-05, + "loss": 2.6507, + "step": 55507 + }, + { + "epoch": 2.72, + "grad_norm": 0.7826633453369141, + "learning_rate": 1.277568630542678e-05, + "loss": 2.8064, + "step": 55508 + }, + { + "epoch": 2.72, + "grad_norm": 0.745834469795227, + "learning_rate": 1.2771241536413534e-05, + "loss": 2.7561, + "step": 55509 + }, + { + "epoch": 2.72, + "grad_norm": 0.7851356267929077, + "learning_rate": 1.2766797523905614e-05, + "loss": 2.7275, + "step": 55510 + }, + { + "epoch": 2.72, + "grad_norm": 0.747675895690918, + "learning_rate": 1.2762354267914643e-05, + "loss": 2.7524, + "step": 55511 + }, + { + "epoch": 2.72, + "grad_norm": 0.6990408897399902, + "learning_rate": 1.2757911768452344e-05, + "loss": 2.7738, + "step": 55512 + }, + { + "epoch": 2.72, + "grad_norm": 0.7718357443809509, + "learning_rate": 1.2753470025530444e-05, + "loss": 2.9192, + "step": 55513 + }, + { + "epoch": 2.72, + "grad_norm": 0.7508569359779358, + "learning_rate": 1.2749029039160597e-05, + "loss": 2.9562, + "step": 55514 + }, + { + "epoch": 2.72, + "grad_norm": 0.777606189250946, + "learning_rate": 1.2744588809354495e-05, + "loss": 3.0925, + "step": 55515 + }, + { + "epoch": 2.72, + "grad_norm": 0.7705445885658264, + "learning_rate": 1.2740149336123928e-05, + "loss": 2.9461, + "step": 55516 + }, + { + "epoch": 2.72, + "grad_norm": 0.7426809072494507, + "learning_rate": 1.2735710619480488e-05, + "loss": 2.9199, + "step": 55517 + }, + { + "epoch": 2.72, + "grad_norm": 0.7444571852684021, + "learning_rate": 1.2731272659435932e-05, + "loss": 2.8854, + "step": 55518 + }, + { + "epoch": 2.72, + "grad_norm": 0.7581663131713867, + "learning_rate": 1.2726835456001916e-05, + "loss": 2.9637, + "step": 55519 + }, + { + "epoch": 2.72, + "grad_norm": 0.77213054895401, + "learning_rate": 1.2722399009190098e-05, + "loss": 2.7211, + "step": 55520 + }, + { + "epoch": 2.72, + "grad_norm": 0.7387886643409729, + "learning_rate": 1.2717963319012203e-05, + "loss": 2.9042, + "step": 55521 + }, + { + "epoch": 2.72, + "grad_norm": 0.7408025860786438, + "learning_rate": 1.2713528385479888e-05, + "loss": 2.9561, + "step": 55522 + }, + { + "epoch": 2.72, + "grad_norm": 0.756806492805481, + "learning_rate": 1.2709094208604908e-05, + "loss": 2.6417, + "step": 55523 + }, + { + "epoch": 2.72, + "grad_norm": 0.750133752822876, + "learning_rate": 1.2704660788398824e-05, + "loss": 3.0107, + "step": 55524 + }, + { + "epoch": 2.72, + "grad_norm": 0.7915104627609253, + "learning_rate": 1.270022812487339e-05, + "loss": 2.6029, + "step": 55525 + }, + { + "epoch": 2.72, + "grad_norm": 0.7304241061210632, + "learning_rate": 1.26957962180403e-05, + "loss": 2.8826, + "step": 55526 + }, + { + "epoch": 2.72, + "grad_norm": 0.8078506588935852, + "learning_rate": 1.269136506791114e-05, + "loss": 2.917, + "step": 55527 + }, + { + "epoch": 2.72, + "grad_norm": 0.7720636129379272, + "learning_rate": 1.268693467449764e-05, + "loss": 2.9377, + "step": 55528 + }, + { + "epoch": 2.72, + "grad_norm": 0.7359655499458313, + "learning_rate": 1.2682505037811452e-05, + "loss": 2.8316, + "step": 55529 + }, + { + "epoch": 2.72, + "grad_norm": 0.7408030033111572, + "learning_rate": 1.2678076157864237e-05, + "loss": 3.0292, + "step": 55530 + }, + { + "epoch": 2.72, + "grad_norm": 0.7604274749755859, + "learning_rate": 1.2673648034667749e-05, + "loss": 2.8554, + "step": 55531 + }, + { + "epoch": 2.72, + "grad_norm": 0.7211507558822632, + "learning_rate": 1.2669220668233481e-05, + "loss": 2.9626, + "step": 55532 + }, + { + "epoch": 2.72, + "grad_norm": 0.7372564077377319, + "learning_rate": 1.266479405857329e-05, + "loss": 3.081, + "step": 55533 + }, + { + "epoch": 2.72, + "grad_norm": 0.7603108882904053, + "learning_rate": 1.2660368205698667e-05, + "loss": 2.7713, + "step": 55534 + }, + { + "epoch": 2.72, + "grad_norm": 0.7562865018844604, + "learning_rate": 1.2655943109621337e-05, + "loss": 2.7846, + "step": 55535 + }, + { + "epoch": 2.72, + "grad_norm": 0.7241320013999939, + "learning_rate": 1.2651518770352954e-05, + "loss": 2.8448, + "step": 55536 + }, + { + "epoch": 2.72, + "grad_norm": 0.7706020474433899, + "learning_rate": 1.2647095187905177e-05, + "loss": 3.0305, + "step": 55537 + }, + { + "epoch": 2.72, + "grad_norm": 0.7894367575645447, + "learning_rate": 1.2642672362289663e-05, + "loss": 2.6124, + "step": 55538 + }, + { + "epoch": 2.72, + "grad_norm": 0.7493867874145508, + "learning_rate": 1.2638250293518004e-05, + "loss": 2.8271, + "step": 55539 + }, + { + "epoch": 2.72, + "grad_norm": 0.781067967414856, + "learning_rate": 1.2633828981601923e-05, + "loss": 3.0305, + "step": 55540 + }, + { + "epoch": 2.72, + "grad_norm": 0.7739070653915405, + "learning_rate": 1.2629408426553044e-05, + "loss": 2.9046, + "step": 55541 + }, + { + "epoch": 2.72, + "grad_norm": 0.7560089230537415, + "learning_rate": 1.262498862838296e-05, + "loss": 2.9307, + "step": 55542 + }, + { + "epoch": 2.72, + "grad_norm": 0.7880851030349731, + "learning_rate": 1.262056958710339e-05, + "loss": 2.7827, + "step": 55543 + }, + { + "epoch": 2.72, + "grad_norm": 0.7638664841651917, + "learning_rate": 1.2616151302725896e-05, + "loss": 2.9314, + "step": 55544 + }, + { + "epoch": 2.72, + "grad_norm": 0.7615971565246582, + "learning_rate": 1.2611733775262133e-05, + "loss": 2.8241, + "step": 55545 + }, + { + "epoch": 2.72, + "grad_norm": 0.7791383266448975, + "learning_rate": 1.2607317004723827e-05, + "loss": 2.9721, + "step": 55546 + }, + { + "epoch": 2.72, + "grad_norm": 0.7734250426292419, + "learning_rate": 1.26029009911225e-05, + "loss": 3.0511, + "step": 55547 + }, + { + "epoch": 2.72, + "grad_norm": 0.7525005340576172, + "learning_rate": 1.2598485734469843e-05, + "loss": 3.016, + "step": 55548 + }, + { + "epoch": 2.72, + "grad_norm": 0.7330466508865356, + "learning_rate": 1.2594071234777413e-05, + "loss": 2.9917, + "step": 55549 + }, + { + "epoch": 2.72, + "grad_norm": 0.7011294364929199, + "learning_rate": 1.2589657492056904e-05, + "loss": 2.9246, + "step": 55550 + }, + { + "epoch": 2.72, + "grad_norm": 0.7659233212471008, + "learning_rate": 1.2585244506319936e-05, + "loss": 2.9226, + "step": 55551 + }, + { + "epoch": 2.72, + "grad_norm": 0.7099422812461853, + "learning_rate": 1.2580832277578102e-05, + "loss": 2.7514, + "step": 55552 + }, + { + "epoch": 2.72, + "grad_norm": 0.7204059958457947, + "learning_rate": 1.257642080584309e-05, + "loss": 2.8493, + "step": 55553 + }, + { + "epoch": 2.72, + "grad_norm": 0.719089150428772, + "learning_rate": 1.2572010091126395e-05, + "loss": 2.8074, + "step": 55554 + }, + { + "epoch": 2.72, + "grad_norm": 0.7707274556159973, + "learning_rate": 1.256760013343977e-05, + "loss": 2.8689, + "step": 55555 + }, + { + "epoch": 2.72, + "grad_norm": 0.7827194333076477, + "learning_rate": 1.2563190932794775e-05, + "loss": 2.8556, + "step": 55556 + }, + { + "epoch": 2.72, + "grad_norm": 0.724220335483551, + "learning_rate": 1.2558782489202967e-05, + "loss": 2.8183, + "step": 55557 + }, + { + "epoch": 2.72, + "grad_norm": 0.7621506452560425, + "learning_rate": 1.255437480267607e-05, + "loss": 2.8624, + "step": 55558 + }, + { + "epoch": 2.72, + "grad_norm": 0.7910413146018982, + "learning_rate": 1.2549967873225542e-05, + "loss": 2.9232, + "step": 55559 + }, + { + "epoch": 2.72, + "grad_norm": 0.7383802533149719, + "learning_rate": 1.2545561700863105e-05, + "loss": 2.8597, + "step": 55560 + }, + { + "epoch": 2.72, + "grad_norm": 0.7192094326019287, + "learning_rate": 1.2541156285600385e-05, + "loss": 2.9147, + "step": 55561 + }, + { + "epoch": 2.72, + "grad_norm": 0.7950126528739929, + "learning_rate": 1.2536751627448938e-05, + "loss": 2.9614, + "step": 55562 + }, + { + "epoch": 2.72, + "grad_norm": 0.7479918599128723, + "learning_rate": 1.2532347726420355e-05, + "loss": 3.1158, + "step": 55563 + }, + { + "epoch": 2.72, + "grad_norm": 0.7508048415184021, + "learning_rate": 1.2527944582526196e-05, + "loss": 3.1672, + "step": 55564 + }, + { + "epoch": 2.72, + "grad_norm": 0.7449748516082764, + "learning_rate": 1.2523542195778147e-05, + "loss": 2.9723, + "step": 55565 + }, + { + "epoch": 2.72, + "grad_norm": 0.731986939907074, + "learning_rate": 1.2519140566187735e-05, + "loss": 3.0262, + "step": 55566 + }, + { + "epoch": 2.72, + "grad_norm": 0.7509217262268066, + "learning_rate": 1.2514739693766585e-05, + "loss": 3.002, + "step": 55567 + }, + { + "epoch": 2.72, + "grad_norm": 0.8176628947257996, + "learning_rate": 1.2510339578526286e-05, + "loss": 2.9815, + "step": 55568 + }, + { + "epoch": 2.72, + "grad_norm": 0.7532127499580383, + "learning_rate": 1.250594022047846e-05, + "loss": 2.8316, + "step": 55569 + }, + { + "epoch": 2.72, + "grad_norm": 0.7925772070884705, + "learning_rate": 1.2501541619634636e-05, + "loss": 2.817, + "step": 55570 + }, + { + "epoch": 2.72, + "grad_norm": 0.7416542768478394, + "learning_rate": 1.2497143776006402e-05, + "loss": 3.0308, + "step": 55571 + }, + { + "epoch": 2.72, + "grad_norm": 0.7497258186340332, + "learning_rate": 1.2492746689605349e-05, + "loss": 2.9368, + "step": 55572 + }, + { + "epoch": 2.72, + "grad_norm": 0.7755299210548401, + "learning_rate": 1.24883503604431e-05, + "loss": 2.9622, + "step": 55573 + }, + { + "epoch": 2.72, + "grad_norm": 0.772011399269104, + "learning_rate": 1.248395478853118e-05, + "loss": 2.8229, + "step": 55574 + }, + { + "epoch": 2.72, + "grad_norm": 0.7620413899421692, + "learning_rate": 1.2479559973881248e-05, + "loss": 2.9222, + "step": 55575 + }, + { + "epoch": 2.72, + "grad_norm": 0.7677061557769775, + "learning_rate": 1.2475165916504726e-05, + "loss": 3.0159, + "step": 55576 + }, + { + "epoch": 2.72, + "grad_norm": 0.72245854139328, + "learning_rate": 1.2470772616413371e-05, + "loss": 2.8573, + "step": 55577 + }, + { + "epoch": 2.72, + "grad_norm": 0.7207547426223755, + "learning_rate": 1.2466380073618642e-05, + "loss": 2.7114, + "step": 55578 + }, + { + "epoch": 2.72, + "grad_norm": 0.7339671850204468, + "learning_rate": 1.2461988288132097e-05, + "loss": 2.9585, + "step": 55579 + }, + { + "epoch": 2.72, + "grad_norm": 0.7937641143798828, + "learning_rate": 1.2457597259965358e-05, + "loss": 2.8606, + "step": 55580 + }, + { + "epoch": 2.72, + "grad_norm": 0.7468560338020325, + "learning_rate": 1.2453206989129949e-05, + "loss": 2.8659, + "step": 55581 + }, + { + "epoch": 2.72, + "grad_norm": 0.7348960638046265, + "learning_rate": 1.2448817475637462e-05, + "loss": 2.9035, + "step": 55582 + }, + { + "epoch": 2.72, + "grad_norm": 0.7143893241882324, + "learning_rate": 1.2444428719499488e-05, + "loss": 2.892, + "step": 55583 + }, + { + "epoch": 2.72, + "grad_norm": 0.787746012210846, + "learning_rate": 1.2440040720727518e-05, + "loss": 2.8582, + "step": 55584 + }, + { + "epoch": 2.72, + "grad_norm": 0.7754120826721191, + "learning_rate": 1.2435653479333173e-05, + "loss": 2.696, + "step": 55585 + }, + { + "epoch": 2.72, + "grad_norm": 0.7287518382072449, + "learning_rate": 1.2431266995327916e-05, + "loss": 2.9811, + "step": 55586 + }, + { + "epoch": 2.72, + "grad_norm": 0.7545494437217712, + "learning_rate": 1.2426881268723365e-05, + "loss": 2.7484, + "step": 55587 + }, + { + "epoch": 2.72, + "grad_norm": 0.7642377614974976, + "learning_rate": 1.2422496299531081e-05, + "loss": 2.8666, + "step": 55588 + }, + { + "epoch": 2.72, + "grad_norm": 0.7176851034164429, + "learning_rate": 1.2418112087762588e-05, + "loss": 2.8306, + "step": 55589 + }, + { + "epoch": 2.72, + "grad_norm": 0.7618530988693237, + "learning_rate": 1.2413728633429476e-05, + "loss": 2.8173, + "step": 55590 + }, + { + "epoch": 2.72, + "grad_norm": 0.7074477076530457, + "learning_rate": 1.2409345936543235e-05, + "loss": 2.847, + "step": 55591 + }, + { + "epoch": 2.72, + "grad_norm": 0.7396611571311951, + "learning_rate": 1.2404963997115391e-05, + "loss": 2.8979, + "step": 55592 + }, + { + "epoch": 2.72, + "grad_norm": 0.7340872287750244, + "learning_rate": 1.24005828151576e-05, + "loss": 2.9145, + "step": 55593 + }, + { + "epoch": 2.72, + "grad_norm": 0.7579055428504944, + "learning_rate": 1.2396202390681254e-05, + "loss": 2.8307, + "step": 55594 + }, + { + "epoch": 2.72, + "grad_norm": 0.7400532364845276, + "learning_rate": 1.2391822723698009e-05, + "loss": 2.8784, + "step": 55595 + }, + { + "epoch": 2.72, + "grad_norm": 0.7686823606491089, + "learning_rate": 1.238744381421929e-05, + "loss": 2.8043, + "step": 55596 + }, + { + "epoch": 2.72, + "grad_norm": 0.7867351770401001, + "learning_rate": 1.2383065662256719e-05, + "loss": 2.8821, + "step": 55597 + }, + { + "epoch": 2.72, + "grad_norm": 0.7557711005210876, + "learning_rate": 1.2378688267821824e-05, + "loss": 2.6193, + "step": 55598 + }, + { + "epoch": 2.72, + "grad_norm": 0.7336000204086304, + "learning_rate": 1.2374311630926126e-05, + "loss": 2.7775, + "step": 55599 + }, + { + "epoch": 2.72, + "grad_norm": 0.8032825589179993, + "learning_rate": 1.2369935751581117e-05, + "loss": 3.0363, + "step": 55600 + }, + { + "epoch": 2.72, + "grad_norm": 0.7513951063156128, + "learning_rate": 1.2365560629798322e-05, + "loss": 2.9107, + "step": 55601 + }, + { + "epoch": 2.72, + "grad_norm": 0.7409695386886597, + "learning_rate": 1.2361186265589262e-05, + "loss": 2.9644, + "step": 55602 + }, + { + "epoch": 2.72, + "grad_norm": 0.7681583166122437, + "learning_rate": 1.2356812658965532e-05, + "loss": 2.9018, + "step": 55603 + }, + { + "epoch": 2.73, + "grad_norm": 0.745311439037323, + "learning_rate": 1.2352439809938585e-05, + "loss": 2.8465, + "step": 55604 + }, + { + "epoch": 2.73, + "grad_norm": 0.7664110660552979, + "learning_rate": 1.2348067718519949e-05, + "loss": 3.0799, + "step": 55605 + }, + { + "epoch": 2.73, + "grad_norm": 0.7889949679374695, + "learning_rate": 1.234369638472118e-05, + "loss": 2.9645, + "step": 55606 + }, + { + "epoch": 2.73, + "grad_norm": 0.7036034464836121, + "learning_rate": 1.233932580855367e-05, + "loss": 2.9552, + "step": 55607 + }, + { + "epoch": 2.73, + "grad_norm": 0.726105272769928, + "learning_rate": 1.2334955990029105e-05, + "loss": 3.079, + "step": 55608 + }, + { + "epoch": 2.73, + "grad_norm": 0.741583526134491, + "learning_rate": 1.2330586929158814e-05, + "loss": 2.8413, + "step": 55609 + }, + { + "epoch": 2.73, + "grad_norm": 0.7955318689346313, + "learning_rate": 1.2326218625954454e-05, + "loss": 2.9633, + "step": 55610 + }, + { + "epoch": 2.73, + "grad_norm": 0.874595046043396, + "learning_rate": 1.2321851080427448e-05, + "loss": 2.8424, + "step": 55611 + }, + { + "epoch": 2.73, + "grad_norm": 0.7419061064720154, + "learning_rate": 1.2317484292589352e-05, + "loss": 2.7988, + "step": 55612 + }, + { + "epoch": 2.73, + "grad_norm": 0.8153803944587708, + "learning_rate": 1.2313118262451627e-05, + "loss": 2.7867, + "step": 55613 + }, + { + "epoch": 2.73, + "grad_norm": 0.7266865968704224, + "learning_rate": 1.2308752990025761e-05, + "loss": 2.5696, + "step": 55614 + }, + { + "epoch": 2.73, + "grad_norm": 0.7737489938735962, + "learning_rate": 1.230438847532328e-05, + "loss": 3.1006, + "step": 55615 + }, + { + "epoch": 2.73, + "grad_norm": 0.762378990650177, + "learning_rate": 1.2300024718355673e-05, + "loss": 3.0235, + "step": 55616 + }, + { + "epoch": 2.73, + "grad_norm": 0.710425078868866, + "learning_rate": 1.2295661719134397e-05, + "loss": 2.9223, + "step": 55617 + }, + { + "epoch": 2.73, + "grad_norm": 0.7368878722190857, + "learning_rate": 1.2291299477671047e-05, + "loss": 2.8503, + "step": 55618 + }, + { + "epoch": 2.73, + "grad_norm": 0.7075875997543335, + "learning_rate": 1.2286937993976976e-05, + "loss": 2.8905, + "step": 55619 + }, + { + "epoch": 2.73, + "grad_norm": 0.7213077545166016, + "learning_rate": 1.2282577268063777e-05, + "loss": 2.8651, + "step": 55620 + }, + { + "epoch": 2.73, + "grad_norm": 0.7512006163597107, + "learning_rate": 1.2278217299942939e-05, + "loss": 2.6974, + "step": 55621 + }, + { + "epoch": 2.73, + "grad_norm": 0.7457561492919922, + "learning_rate": 1.2273858089625821e-05, + "loss": 3.0884, + "step": 55622 + }, + { + "epoch": 2.73, + "grad_norm": 0.8021249771118164, + "learning_rate": 1.2269499637124047e-05, + "loss": 2.9176, + "step": 55623 + }, + { + "epoch": 2.73, + "grad_norm": 0.7707574963569641, + "learning_rate": 1.2265141942448975e-05, + "loss": 2.8524, + "step": 55624 + }, + { + "epoch": 2.73, + "grad_norm": 0.7147628664970398, + "learning_rate": 1.2260785005612195e-05, + "loss": 2.8479, + "step": 55625 + }, + { + "epoch": 2.73, + "grad_norm": 0.7359575629234314, + "learning_rate": 1.2256428826625098e-05, + "loss": 2.7975, + "step": 55626 + }, + { + "epoch": 2.73, + "grad_norm": 0.7357754707336426, + "learning_rate": 1.2252073405499241e-05, + "loss": 2.9908, + "step": 55627 + }, + { + "epoch": 2.73, + "grad_norm": 0.8136209845542908, + "learning_rate": 1.2247718742246016e-05, + "loss": 2.988, + "step": 55628 + }, + { + "epoch": 2.73, + "grad_norm": 0.7402750253677368, + "learning_rate": 1.2243364836876912e-05, + "loss": 2.8614, + "step": 55629 + }, + { + "epoch": 2.73, + "grad_norm": 0.7957865595817566, + "learning_rate": 1.2239011689403422e-05, + "loss": 2.7521, + "step": 55630 + }, + { + "epoch": 2.73, + "grad_norm": 0.7921202182769775, + "learning_rate": 1.2234659299836969e-05, + "loss": 2.8699, + "step": 55631 + }, + { + "epoch": 2.73, + "grad_norm": 0.7857522964477539, + "learning_rate": 1.223030766818901e-05, + "loss": 2.8791, + "step": 55632 + }, + { + "epoch": 2.73, + "grad_norm": 0.7415589094161987, + "learning_rate": 1.2225956794471103e-05, + "loss": 2.8202, + "step": 55633 + }, + { + "epoch": 2.73, + "grad_norm": 0.7329033017158508, + "learning_rate": 1.222160667869464e-05, + "loss": 2.8956, + "step": 55634 + }, + { + "epoch": 2.73, + "grad_norm": 0.7467180490493774, + "learning_rate": 1.2217257320871076e-05, + "loss": 2.7925, + "step": 55635 + }, + { + "epoch": 2.73, + "grad_norm": 0.841463029384613, + "learning_rate": 1.2212908721011837e-05, + "loss": 2.9325, + "step": 55636 + }, + { + "epoch": 2.73, + "grad_norm": 0.7936882376670837, + "learning_rate": 1.2208560879128416e-05, + "loss": 2.9749, + "step": 55637 + }, + { + "epoch": 2.73, + "grad_norm": 0.7346864938735962, + "learning_rate": 1.2204213795232298e-05, + "loss": 2.9747, + "step": 55638 + }, + { + "epoch": 2.73, + "grad_norm": 0.7802562713623047, + "learning_rate": 1.2199867469334845e-05, + "loss": 2.9534, + "step": 55639 + }, + { + "epoch": 2.73, + "grad_norm": 0.7328370213508606, + "learning_rate": 1.2195521901447614e-05, + "loss": 2.9276, + "step": 55640 + }, + { + "epoch": 2.73, + "grad_norm": 0.7416670918464661, + "learning_rate": 1.2191177091581927e-05, + "loss": 2.9684, + "step": 55641 + }, + { + "epoch": 2.73, + "grad_norm": 0.7478400468826294, + "learning_rate": 1.2186833039749344e-05, + "loss": 3.1439, + "step": 55642 + }, + { + "epoch": 2.73, + "grad_norm": 0.7259265780448914, + "learning_rate": 1.218248974596122e-05, + "loss": 2.9068, + "step": 55643 + }, + { + "epoch": 2.73, + "grad_norm": 0.7286504507064819, + "learning_rate": 1.2178147210229017e-05, + "loss": 2.7912, + "step": 55644 + }, + { + "epoch": 2.73, + "grad_norm": 0.804137647151947, + "learning_rate": 1.2173805432564221e-05, + "loss": 3.0854, + "step": 55645 + }, + { + "epoch": 2.73, + "grad_norm": 0.7600886821746826, + "learning_rate": 1.2169464412978192e-05, + "loss": 3.1028, + "step": 55646 + }, + { + "epoch": 2.73, + "grad_norm": 0.7712217569351196, + "learning_rate": 1.2165124151482419e-05, + "loss": 3.0731, + "step": 55647 + }, + { + "epoch": 2.73, + "grad_norm": 0.7367176413536072, + "learning_rate": 1.2160784648088295e-05, + "loss": 2.9482, + "step": 55648 + }, + { + "epoch": 2.73, + "grad_norm": 0.7509863376617432, + "learning_rate": 1.2156445902807278e-05, + "loss": 2.8918, + "step": 55649 + }, + { + "epoch": 2.73, + "grad_norm": 0.7302809357643127, + "learning_rate": 1.2152107915650821e-05, + "loss": 3.0575, + "step": 55650 + }, + { + "epoch": 2.73, + "grad_norm": 0.7326139211654663, + "learning_rate": 1.2147770686630254e-05, + "loss": 2.9015, + "step": 55651 + }, + { + "epoch": 2.73, + "grad_norm": 0.722706139087677, + "learning_rate": 1.2143434215757097e-05, + "loss": 2.8909, + "step": 55652 + }, + { + "epoch": 2.73, + "grad_norm": 0.7505283355712891, + "learning_rate": 1.213909850304271e-05, + "loss": 2.7945, + "step": 55653 + }, + { + "epoch": 2.73, + "grad_norm": 0.7195035815238953, + "learning_rate": 1.213476354849855e-05, + "loss": 2.7821, + "step": 55654 + }, + { + "epoch": 2.73, + "grad_norm": 0.7279053330421448, + "learning_rate": 1.2130429352136039e-05, + "loss": 2.8279, + "step": 55655 + }, + { + "epoch": 2.73, + "grad_norm": 0.7183561325073242, + "learning_rate": 1.2126095913966538e-05, + "loss": 2.8743, + "step": 55656 + }, + { + "epoch": 2.73, + "grad_norm": 0.7786692976951599, + "learning_rate": 1.2121763234001536e-05, + "loss": 2.9192, + "step": 55657 + }, + { + "epoch": 2.73, + "grad_norm": 0.7574261426925659, + "learning_rate": 1.211743131225239e-05, + "loss": 2.8047, + "step": 55658 + }, + { + "epoch": 2.73, + "grad_norm": 0.7224125862121582, + "learning_rate": 1.2113100148730526e-05, + "loss": 3.0287, + "step": 55659 + }, + { + "epoch": 2.73, + "grad_norm": 0.7393490076065063, + "learning_rate": 1.2108769743447366e-05, + "loss": 2.8423, + "step": 55660 + }, + { + "epoch": 2.73, + "grad_norm": 0.746552586555481, + "learning_rate": 1.2104440096414269e-05, + "loss": 2.7179, + "step": 55661 + }, + { + "epoch": 2.73, + "grad_norm": 0.7337487936019897, + "learning_rate": 1.2100111207642693e-05, + "loss": 2.864, + "step": 55662 + }, + { + "epoch": 2.73, + "grad_norm": 0.7381230592727661, + "learning_rate": 1.2095783077143994e-05, + "loss": 2.9342, + "step": 55663 + }, + { + "epoch": 2.73, + "grad_norm": 0.7539899945259094, + "learning_rate": 1.2091455704929631e-05, + "loss": 2.8978, + "step": 55664 + }, + { + "epoch": 2.73, + "grad_norm": 0.8014904856681824, + "learning_rate": 1.208712909101096e-05, + "loss": 2.7139, + "step": 55665 + }, + { + "epoch": 2.73, + "grad_norm": 0.7007734775543213, + "learning_rate": 1.2082803235399374e-05, + "loss": 2.9909, + "step": 55666 + }, + { + "epoch": 2.73, + "grad_norm": 0.7687050700187683, + "learning_rate": 1.2078478138106262e-05, + "loss": 2.9276, + "step": 55667 + }, + { + "epoch": 2.73, + "grad_norm": 0.7429559230804443, + "learning_rate": 1.2074153799143016e-05, + "loss": 2.8966, + "step": 55668 + }, + { + "epoch": 2.73, + "grad_norm": 0.7226108312606812, + "learning_rate": 1.2069830218521025e-05, + "loss": 2.9716, + "step": 55669 + }, + { + "epoch": 2.73, + "grad_norm": 0.7323675751686096, + "learning_rate": 1.206550739625175e-05, + "loss": 2.9101, + "step": 55670 + }, + { + "epoch": 2.73, + "grad_norm": 0.7690631151199341, + "learning_rate": 1.2061185332346478e-05, + "loss": 2.8661, + "step": 55671 + }, + { + "epoch": 2.73, + "grad_norm": 0.8044900894165039, + "learning_rate": 1.2056864026816671e-05, + "loss": 2.8463, + "step": 55672 + }, + { + "epoch": 2.73, + "grad_norm": 0.7190569043159485, + "learning_rate": 1.2052543479673582e-05, + "loss": 3.0036, + "step": 55673 + }, + { + "epoch": 2.73, + "grad_norm": 0.7784371376037598, + "learning_rate": 1.2048223690928704e-05, + "loss": 2.9049, + "step": 55674 + }, + { + "epoch": 2.73, + "grad_norm": 0.7170578241348267, + "learning_rate": 1.2043904660593429e-05, + "loss": 2.8486, + "step": 55675 + }, + { + "epoch": 2.73, + "grad_norm": 0.7676547765731812, + "learning_rate": 1.2039586388679046e-05, + "loss": 2.9257, + "step": 55676 + }, + { + "epoch": 2.73, + "grad_norm": 0.7858740091323853, + "learning_rate": 1.2035268875197014e-05, + "loss": 2.7855, + "step": 55677 + }, + { + "epoch": 2.73, + "grad_norm": 0.7509922981262207, + "learning_rate": 1.2030952120158621e-05, + "loss": 3.0946, + "step": 55678 + }, + { + "epoch": 2.73, + "grad_norm": 0.7644749283790588, + "learning_rate": 1.2026636123575329e-05, + "loss": 2.892, + "step": 55679 + }, + { + "epoch": 2.73, + "grad_norm": 0.7936258912086487, + "learning_rate": 1.2022320885458458e-05, + "loss": 2.7818, + "step": 55680 + }, + { + "epoch": 2.73, + "grad_norm": 0.7399991750717163, + "learning_rate": 1.2018006405819335e-05, + "loss": 3.1416, + "step": 55681 + }, + { + "epoch": 2.73, + "grad_norm": 0.7325205206871033, + "learning_rate": 1.2013692684669386e-05, + "loss": 2.7667, + "step": 55682 + }, + { + "epoch": 2.73, + "grad_norm": 0.7739435434341431, + "learning_rate": 1.2009379722019896e-05, + "loss": 2.9389, + "step": 55683 + }, + { + "epoch": 2.73, + "grad_norm": 0.769698441028595, + "learning_rate": 1.2005067517882294e-05, + "loss": 2.8377, + "step": 55684 + }, + { + "epoch": 2.73, + "grad_norm": 0.752373456954956, + "learning_rate": 1.2000756072267936e-05, + "loss": 2.7622, + "step": 55685 + }, + { + "epoch": 2.73, + "grad_norm": 0.7731847763061523, + "learning_rate": 1.1996445385188181e-05, + "loss": 3.0762, + "step": 55686 + }, + { + "epoch": 2.73, + "grad_norm": 0.7705298662185669, + "learning_rate": 1.199213545665435e-05, + "loss": 2.8453, + "step": 55687 + }, + { + "epoch": 2.73, + "grad_norm": 0.7608867287635803, + "learning_rate": 1.1987826286677771e-05, + "loss": 3.0201, + "step": 55688 + }, + { + "epoch": 2.73, + "grad_norm": 0.7582928538322449, + "learning_rate": 1.1983517875269832e-05, + "loss": 2.9205, + "step": 55689 + }, + { + "epoch": 2.73, + "grad_norm": 0.7286206483840942, + "learning_rate": 1.1979210222441925e-05, + "loss": 2.9747, + "step": 55690 + }, + { + "epoch": 2.73, + "grad_norm": 0.7775378227233887, + "learning_rate": 1.1974903328205276e-05, + "loss": 2.792, + "step": 55691 + }, + { + "epoch": 2.73, + "grad_norm": 0.7497450709342957, + "learning_rate": 1.1970597192571374e-05, + "loss": 2.9682, + "step": 55692 + }, + { + "epoch": 2.73, + "grad_norm": 0.7265482544898987, + "learning_rate": 1.1966291815551477e-05, + "loss": 2.9189, + "step": 55693 + }, + { + "epoch": 2.73, + "grad_norm": 0.7717911601066589, + "learning_rate": 1.1961987197156876e-05, + "loss": 3.1288, + "step": 55694 + }, + { + "epoch": 2.73, + "grad_norm": 0.803691565990448, + "learning_rate": 1.1957683337399026e-05, + "loss": 2.978, + "step": 55695 + }, + { + "epoch": 2.73, + "grad_norm": 0.7337557673454285, + "learning_rate": 1.1953380236289156e-05, + "loss": 2.7589, + "step": 55696 + }, + { + "epoch": 2.73, + "grad_norm": 0.7882424592971802, + "learning_rate": 1.1949077893838688e-05, + "loss": 2.9169, + "step": 55697 + }, + { + "epoch": 2.73, + "grad_norm": 0.7700529098510742, + "learning_rate": 1.194477631005888e-05, + "loss": 2.88, + "step": 55698 + }, + { + "epoch": 2.73, + "grad_norm": 0.7207964658737183, + "learning_rate": 1.1940475484961087e-05, + "loss": 2.7995, + "step": 55699 + }, + { + "epoch": 2.73, + "grad_norm": 0.7262753844261169, + "learning_rate": 1.193617541855667e-05, + "loss": 2.9706, + "step": 55700 + }, + { + "epoch": 2.73, + "grad_norm": 0.7570127844810486, + "learning_rate": 1.1931876110856953e-05, + "loss": 2.9817, + "step": 55701 + }, + { + "epoch": 2.73, + "grad_norm": 0.7883241176605225, + "learning_rate": 1.1927577561873225e-05, + "loss": 2.7953, + "step": 55702 + }, + { + "epoch": 2.73, + "grad_norm": 0.773369550704956, + "learning_rate": 1.1923279771616778e-05, + "loss": 2.8493, + "step": 55703 + }, + { + "epoch": 2.73, + "grad_norm": 0.7068461179733276, + "learning_rate": 1.1918982740098971e-05, + "loss": 3.0174, + "step": 55704 + }, + { + "epoch": 2.73, + "grad_norm": 0.7062078714370728, + "learning_rate": 1.1914686467331158e-05, + "loss": 2.6588, + "step": 55705 + }, + { + "epoch": 2.73, + "grad_norm": 0.7688685059547424, + "learning_rate": 1.1910390953324567e-05, + "loss": 2.9203, + "step": 55706 + }, + { + "epoch": 2.73, + "grad_norm": 0.7777459621429443, + "learning_rate": 1.190609619809062e-05, + "loss": 2.9701, + "step": 55707 + }, + { + "epoch": 2.73, + "grad_norm": 0.7545894384384155, + "learning_rate": 1.1901802201640542e-05, + "loss": 2.8712, + "step": 55708 + }, + { + "epoch": 2.73, + "grad_norm": 0.7288576364517212, + "learning_rate": 1.1897508963985659e-05, + "loss": 2.7988, + "step": 55709 + }, + { + "epoch": 2.73, + "grad_norm": 0.806057333946228, + "learning_rate": 1.1893216485137325e-05, + "loss": 2.8109, + "step": 55710 + }, + { + "epoch": 2.73, + "grad_norm": 0.7674176692962646, + "learning_rate": 1.18889247651068e-05, + "loss": 2.8256, + "step": 55711 + }, + { + "epoch": 2.73, + "grad_norm": 0.7585336565971375, + "learning_rate": 1.1884633803905408e-05, + "loss": 2.8867, + "step": 55712 + }, + { + "epoch": 2.73, + "grad_norm": 0.7617801427841187, + "learning_rate": 1.1880343601544406e-05, + "loss": 2.9724, + "step": 55713 + }, + { + "epoch": 2.73, + "grad_norm": 0.814563512802124, + "learning_rate": 1.1876054158035152e-05, + "loss": 2.8553, + "step": 55714 + }, + { + "epoch": 2.73, + "grad_norm": 0.7293022871017456, + "learning_rate": 1.187176547338894e-05, + "loss": 2.9574, + "step": 55715 + }, + { + "epoch": 2.73, + "grad_norm": 0.7652244567871094, + "learning_rate": 1.1867477547616988e-05, + "loss": 2.9344, + "step": 55716 + }, + { + "epoch": 2.73, + "grad_norm": 0.7348045706748962, + "learning_rate": 1.1863190380730725e-05, + "loss": 3.0233, + "step": 55717 + }, + { + "epoch": 2.73, + "grad_norm": 0.6973322033882141, + "learning_rate": 1.1858903972741308e-05, + "loss": 2.9189, + "step": 55718 + }, + { + "epoch": 2.73, + "grad_norm": 0.7177212238311768, + "learning_rate": 1.1854618323660092e-05, + "loss": 2.5284, + "step": 55719 + }, + { + "epoch": 2.73, + "grad_norm": 0.7871966361999512, + "learning_rate": 1.1850333433498372e-05, + "loss": 3.0959, + "step": 55720 + }, + { + "epoch": 2.73, + "grad_norm": 0.7772104144096375, + "learning_rate": 1.1846049302267369e-05, + "loss": 2.8529, + "step": 55721 + }, + { + "epoch": 2.73, + "grad_norm": 0.7406902313232422, + "learning_rate": 1.1841765929978476e-05, + "loss": 2.917, + "step": 55722 + }, + { + "epoch": 2.73, + "grad_norm": 0.7036511301994324, + "learning_rate": 1.1837483316642915e-05, + "loss": 3.015, + "step": 55723 + }, + { + "epoch": 2.73, + "grad_norm": 0.7832501530647278, + "learning_rate": 1.183320146227198e-05, + "loss": 2.8413, + "step": 55724 + }, + { + "epoch": 2.73, + "grad_norm": 0.7509165406227112, + "learning_rate": 1.182892036687686e-05, + "loss": 2.9039, + "step": 55725 + }, + { + "epoch": 2.73, + "grad_norm": 0.7857611179351807, + "learning_rate": 1.1824640030468945e-05, + "loss": 3.0673, + "step": 55726 + }, + { + "epoch": 2.73, + "grad_norm": 0.7503923177719116, + "learning_rate": 1.1820360453059463e-05, + "loss": 2.8432, + "step": 55727 + }, + { + "epoch": 2.73, + "grad_norm": 0.7694743275642395, + "learning_rate": 1.1816081634659668e-05, + "loss": 2.9747, + "step": 55728 + }, + { + "epoch": 2.73, + "grad_norm": 0.7916954755783081, + "learning_rate": 1.1811803575280921e-05, + "loss": 2.7779, + "step": 55729 + }, + { + "epoch": 2.73, + "grad_norm": 0.8194757699966431, + "learning_rate": 1.1807526274934375e-05, + "loss": 2.7108, + "step": 55730 + }, + { + "epoch": 2.73, + "grad_norm": 0.7391253113746643, + "learning_rate": 1.1803249733631326e-05, + "loss": 3.1041, + "step": 55731 + }, + { + "epoch": 2.73, + "grad_norm": 0.7366408705711365, + "learning_rate": 1.1798973951383094e-05, + "loss": 2.9705, + "step": 55732 + }, + { + "epoch": 2.73, + "grad_norm": 0.7290082573890686, + "learning_rate": 1.179469892820084e-05, + "loss": 2.6135, + "step": 55733 + }, + { + "epoch": 2.73, + "grad_norm": 0.7617344856262207, + "learning_rate": 1.1790424664095954e-05, + "loss": 3.099, + "step": 55734 + }, + { + "epoch": 2.73, + "grad_norm": 0.7532030940055847, + "learning_rate": 1.178615115907956e-05, + "loss": 2.7876, + "step": 55735 + }, + { + "epoch": 2.73, + "grad_norm": 0.7590699195861816, + "learning_rate": 1.1781878413163015e-05, + "loss": 2.9633, + "step": 55736 + }, + { + "epoch": 2.73, + "grad_norm": 0.7126039862632751, + "learning_rate": 1.1777606426357544e-05, + "loss": 2.9342, + "step": 55737 + }, + { + "epoch": 2.73, + "grad_norm": 0.7936747074127197, + "learning_rate": 1.1773335198674338e-05, + "loss": 2.7536, + "step": 55738 + }, + { + "epoch": 2.73, + "grad_norm": 0.7752701044082642, + "learning_rate": 1.1769064730124756e-05, + "loss": 3.0024, + "step": 55739 + }, + { + "epoch": 2.73, + "grad_norm": 0.7142334580421448, + "learning_rate": 1.176479502071992e-05, + "loss": 3.15, + "step": 55740 + }, + { + "epoch": 2.73, + "grad_norm": 0.7375084757804871, + "learning_rate": 1.1760526070471154e-05, + "loss": 2.8894, + "step": 55741 + }, + { + "epoch": 2.73, + "grad_norm": 0.7870582342147827, + "learning_rate": 1.1756257879389752e-05, + "loss": 2.8652, + "step": 55742 + }, + { + "epoch": 2.73, + "grad_norm": 0.7322790026664734, + "learning_rate": 1.1751990447486836e-05, + "loss": 3.0524, + "step": 55743 + }, + { + "epoch": 2.73, + "grad_norm": 0.7427972555160522, + "learning_rate": 1.1747723774773733e-05, + "loss": 2.6524, + "step": 55744 + }, + { + "epoch": 2.73, + "grad_norm": 0.766864538192749, + "learning_rate": 1.1743457861261662e-05, + "loss": 2.7924, + "step": 55745 + }, + { + "epoch": 2.73, + "grad_norm": 0.729448139667511, + "learning_rate": 1.1739192706961786e-05, + "loss": 2.9511, + "step": 55746 + }, + { + "epoch": 2.73, + "grad_norm": 0.7730950117111206, + "learning_rate": 1.1734928311885461e-05, + "loss": 2.9649, + "step": 55747 + }, + { + "epoch": 2.73, + "grad_norm": 0.7573014497756958, + "learning_rate": 1.173066467604381e-05, + "loss": 2.9444, + "step": 55748 + }, + { + "epoch": 2.73, + "grad_norm": 0.768448531627655, + "learning_rate": 1.172640179944816e-05, + "loss": 2.8058, + "step": 55749 + }, + { + "epoch": 2.73, + "grad_norm": 0.7829905152320862, + "learning_rate": 1.1722139682109633e-05, + "loss": 2.6185, + "step": 55750 + }, + { + "epoch": 2.73, + "grad_norm": 0.7292187809944153, + "learning_rate": 1.1717878324039554e-05, + "loss": 2.919, + "step": 55751 + }, + { + "epoch": 2.73, + "grad_norm": 0.7217176556587219, + "learning_rate": 1.171361772524908e-05, + "loss": 2.7037, + "step": 55752 + }, + { + "epoch": 2.73, + "grad_norm": 0.7316499948501587, + "learning_rate": 1.1709357885749437e-05, + "loss": 2.9193, + "step": 55753 + }, + { + "epoch": 2.73, + "grad_norm": 0.748083233833313, + "learning_rate": 1.1705098805551916e-05, + "loss": 2.8312, + "step": 55754 + }, + { + "epoch": 2.73, + "grad_norm": 0.7519057393074036, + "learning_rate": 1.1700840484667639e-05, + "loss": 2.8975, + "step": 55755 + }, + { + "epoch": 2.73, + "grad_norm": 0.7586154937744141, + "learning_rate": 1.1696582923107866e-05, + "loss": 2.8254, + "step": 55756 + }, + { + "epoch": 2.73, + "grad_norm": 0.7953873872756958, + "learning_rate": 1.1692326120883822e-05, + "loss": 2.7867, + "step": 55757 + }, + { + "epoch": 2.73, + "grad_norm": 0.733325183391571, + "learning_rate": 1.1688070078006695e-05, + "loss": 2.8756, + "step": 55758 + }, + { + "epoch": 2.73, + "grad_norm": 0.7922911047935486, + "learning_rate": 1.1683814794487744e-05, + "loss": 3.1538, + "step": 55759 + }, + { + "epoch": 2.73, + "grad_norm": 0.7304345369338989, + "learning_rate": 1.1679560270338095e-05, + "loss": 3.0398, + "step": 55760 + }, + { + "epoch": 2.73, + "grad_norm": 0.8140565752983093, + "learning_rate": 1.1675306505568972e-05, + "loss": 2.7469, + "step": 55761 + }, + { + "epoch": 2.73, + "grad_norm": 0.7204803824424744, + "learning_rate": 1.1671053500191663e-05, + "loss": 2.8597, + "step": 55762 + }, + { + "epoch": 2.73, + "grad_norm": 0.7560591101646423, + "learning_rate": 1.1666801254217262e-05, + "loss": 2.7704, + "step": 55763 + }, + { + "epoch": 2.73, + "grad_norm": 0.7345255017280579, + "learning_rate": 1.1662549767657059e-05, + "loss": 2.8837, + "step": 55764 + }, + { + "epoch": 2.73, + "grad_norm": 0.7856339812278748, + "learning_rate": 1.1658299040522145e-05, + "loss": 2.748, + "step": 55765 + }, + { + "epoch": 2.73, + "grad_norm": 0.7450258135795593, + "learning_rate": 1.1654049072823845e-05, + "loss": 2.8668, + "step": 55766 + }, + { + "epoch": 2.73, + "grad_norm": 0.751708984375, + "learning_rate": 1.1649799864573284e-05, + "loss": 2.9728, + "step": 55767 + }, + { + "epoch": 2.73, + "grad_norm": 0.7992194890975952, + "learning_rate": 1.1645551415781618e-05, + "loss": 2.9086, + "step": 55768 + }, + { + "epoch": 2.73, + "grad_norm": 0.7067645192146301, + "learning_rate": 1.1641303726460106e-05, + "loss": 2.8046, + "step": 55769 + }, + { + "epoch": 2.73, + "grad_norm": 0.7081477046012878, + "learning_rate": 1.1637056796619871e-05, + "loss": 3.1195, + "step": 55770 + }, + { + "epoch": 2.73, + "grad_norm": 0.733273983001709, + "learning_rate": 1.1632810626272104e-05, + "loss": 2.7449, + "step": 55771 + }, + { + "epoch": 2.73, + "grad_norm": 0.7776265740394592, + "learning_rate": 1.1628565215428099e-05, + "loss": 2.8875, + "step": 55772 + }, + { + "epoch": 2.73, + "grad_norm": 0.7776868939399719, + "learning_rate": 1.1624320564098911e-05, + "loss": 2.5992, + "step": 55773 + }, + { + "epoch": 2.73, + "grad_norm": 0.783747136592865, + "learning_rate": 1.1620076672295797e-05, + "loss": 2.9476, + "step": 55774 + }, + { + "epoch": 2.73, + "grad_norm": 0.7857709527015686, + "learning_rate": 1.161583354002985e-05, + "loss": 3.107, + "step": 55775 + }, + { + "epoch": 2.73, + "grad_norm": 0.729192316532135, + "learning_rate": 1.1611591167312295e-05, + "loss": 2.7004, + "step": 55776 + }, + { + "epoch": 2.73, + "grad_norm": 0.7840185165405273, + "learning_rate": 1.1607349554154355e-05, + "loss": 3.0569, + "step": 55777 + }, + { + "epoch": 2.73, + "grad_norm": 0.7330710887908936, + "learning_rate": 1.1603108700567121e-05, + "loss": 2.8325, + "step": 55778 + }, + { + "epoch": 2.73, + "grad_norm": 0.7933141589164734, + "learning_rate": 1.1598868606561817e-05, + "loss": 2.984, + "step": 55779 + }, + { + "epoch": 2.73, + "grad_norm": 0.7475676536560059, + "learning_rate": 1.1594629272149568e-05, + "loss": 2.7584, + "step": 55780 + }, + { + "epoch": 2.73, + "grad_norm": 0.7124994397163391, + "learning_rate": 1.1590390697341567e-05, + "loss": 2.9443, + "step": 55781 + }, + { + "epoch": 2.73, + "grad_norm": 0.7357338666915894, + "learning_rate": 1.1586152882149002e-05, + "loss": 3.0484, + "step": 55782 + }, + { + "epoch": 2.73, + "grad_norm": 0.7515398859977722, + "learning_rate": 1.1581915826582966e-05, + "loss": 3.0142, + "step": 55783 + }, + { + "epoch": 2.73, + "grad_norm": 0.8320146203041077, + "learning_rate": 1.1577679530654681e-05, + "loss": 2.9811, + "step": 55784 + }, + { + "epoch": 2.73, + "grad_norm": 0.7434693574905396, + "learning_rate": 1.1573443994375243e-05, + "loss": 2.9766, + "step": 55785 + }, + { + "epoch": 2.73, + "grad_norm": 0.7687200307846069, + "learning_rate": 1.1569209217755871e-05, + "loss": 2.8711, + "step": 55786 + }, + { + "epoch": 2.73, + "grad_norm": 0.7312271595001221, + "learning_rate": 1.1564975200807692e-05, + "loss": 2.8606, + "step": 55787 + }, + { + "epoch": 2.73, + "grad_norm": 0.77076256275177, + "learning_rate": 1.1560741943541862e-05, + "loss": 2.7525, + "step": 55788 + }, + { + "epoch": 2.73, + "grad_norm": 0.7475319504737854, + "learning_rate": 1.1556509445969542e-05, + "loss": 2.8676, + "step": 55789 + }, + { + "epoch": 2.73, + "grad_norm": 0.7481474876403809, + "learning_rate": 1.155227770810182e-05, + "loss": 2.707, + "step": 55790 + }, + { + "epoch": 2.73, + "grad_norm": 0.7318161725997925, + "learning_rate": 1.154804672994989e-05, + "loss": 2.7252, + "step": 55791 + }, + { + "epoch": 2.73, + "grad_norm": 0.7487803101539612, + "learning_rate": 1.154381651152494e-05, + "loss": 2.6743, + "step": 55792 + }, + { + "epoch": 2.73, + "grad_norm": 0.7442032694816589, + "learning_rate": 1.1539587052837995e-05, + "loss": 2.8245, + "step": 55793 + }, + { + "epoch": 2.73, + "grad_norm": 0.7299758195877075, + "learning_rate": 1.1535358353900314e-05, + "loss": 2.7304, + "step": 55794 + }, + { + "epoch": 2.73, + "grad_norm": 0.7423626780509949, + "learning_rate": 1.1531130414722988e-05, + "loss": 2.8714, + "step": 55795 + }, + { + "epoch": 2.73, + "grad_norm": 0.7554141879081726, + "learning_rate": 1.1526903235317142e-05, + "loss": 2.9592, + "step": 55796 + }, + { + "epoch": 2.73, + "grad_norm": 0.7774835824966431, + "learning_rate": 1.1522676815693899e-05, + "loss": 2.9311, + "step": 55797 + }, + { + "epoch": 2.73, + "grad_norm": 0.7268118262290955, + "learning_rate": 1.1518451155864383e-05, + "loss": 2.9979, + "step": 55798 + }, + { + "epoch": 2.73, + "grad_norm": 0.7733696103096008, + "learning_rate": 1.1514226255839786e-05, + "loss": 2.7952, + "step": 55799 + }, + { + "epoch": 2.73, + "grad_norm": 0.7510420680046082, + "learning_rate": 1.1510002115631167e-05, + "loss": 2.9407, + "step": 55800 + }, + { + "epoch": 2.73, + "grad_norm": 0.7454401254653931, + "learning_rate": 1.1505778735249715e-05, + "loss": 2.8337, + "step": 55801 + }, + { + "epoch": 2.73, + "grad_norm": 0.7559973001480103, + "learning_rate": 1.1501556114706489e-05, + "loss": 2.7546, + "step": 55802 + }, + { + "epoch": 2.73, + "grad_norm": 0.7908011674880981, + "learning_rate": 1.149733425401268e-05, + "loss": 2.935, + "step": 55803 + }, + { + "epoch": 2.73, + "grad_norm": 0.7167772650718689, + "learning_rate": 1.1493113153179345e-05, + "loss": 2.8562, + "step": 55804 + }, + { + "epoch": 2.73, + "grad_norm": 0.8508129119873047, + "learning_rate": 1.1488892812217609e-05, + "loss": 2.9246, + "step": 55805 + }, + { + "epoch": 2.73, + "grad_norm": 0.7644029855728149, + "learning_rate": 1.148467323113863e-05, + "loss": 2.9711, + "step": 55806 + }, + { + "epoch": 2.73, + "grad_norm": 0.7415115237236023, + "learning_rate": 1.1480454409953466e-05, + "loss": 2.8969, + "step": 55807 + }, + { + "epoch": 2.74, + "grad_norm": 0.7756478190422058, + "learning_rate": 1.1476236348673273e-05, + "loss": 3.0643, + "step": 55808 + }, + { + "epoch": 2.74, + "grad_norm": 0.7457125782966614, + "learning_rate": 1.1472019047309177e-05, + "loss": 2.823, + "step": 55809 + }, + { + "epoch": 2.74, + "grad_norm": 0.763870358467102, + "learning_rate": 1.1467802505872236e-05, + "loss": 3.125, + "step": 55810 + }, + { + "epoch": 2.74, + "grad_norm": 0.8004308342933655, + "learning_rate": 1.1463586724373574e-05, + "loss": 2.7832, + "step": 55811 + }, + { + "epoch": 2.74, + "grad_norm": 0.7365848422050476, + "learning_rate": 1.1459371702824283e-05, + "loss": 2.9564, + "step": 55812 + }, + { + "epoch": 2.74, + "grad_norm": 0.746489942073822, + "learning_rate": 1.1455157441235452e-05, + "loss": 2.8939, + "step": 55813 + }, + { + "epoch": 2.74, + "grad_norm": 0.7295230031013489, + "learning_rate": 1.1450943939618273e-05, + "loss": 2.8244, + "step": 55814 + }, + { + "epoch": 2.74, + "grad_norm": 0.7783901691436768, + "learning_rate": 1.1446731197983706e-05, + "loss": 3.006, + "step": 55815 + }, + { + "epoch": 2.74, + "grad_norm": 0.7580480575561523, + "learning_rate": 1.1442519216342972e-05, + "loss": 2.9875, + "step": 55816 + }, + { + "epoch": 2.74, + "grad_norm": 0.7256256341934204, + "learning_rate": 1.1438307994707097e-05, + "loss": 3.119, + "step": 55817 + }, + { + "epoch": 2.74, + "grad_norm": 0.7361882925033569, + "learning_rate": 1.1434097533087139e-05, + "loss": 2.7991, + "step": 55818 + }, + { + "epoch": 2.74, + "grad_norm": 0.7718648910522461, + "learning_rate": 1.1429887831494289e-05, + "loss": 3.0094, + "step": 55819 + }, + { + "epoch": 2.74, + "grad_norm": 0.7573488354682922, + "learning_rate": 1.1425678889939538e-05, + "loss": 2.8855, + "step": 55820 + }, + { + "epoch": 2.74, + "grad_norm": 0.7509820461273193, + "learning_rate": 1.1421470708434044e-05, + "loss": 3.0757, + "step": 55821 + }, + { + "epoch": 2.74, + "grad_norm": 0.7302854657173157, + "learning_rate": 1.1417263286988798e-05, + "loss": 3.0187, + "step": 55822 + }, + { + "epoch": 2.74, + "grad_norm": 0.7011269927024841, + "learning_rate": 1.1413056625614991e-05, + "loss": 2.9904, + "step": 55823 + }, + { + "epoch": 2.74, + "grad_norm": 0.8346005082130432, + "learning_rate": 1.1408850724323648e-05, + "loss": 3.1022, + "step": 55824 + }, + { + "epoch": 2.74, + "grad_norm": 0.7266502380371094, + "learning_rate": 1.140464558312586e-05, + "loss": 2.9047, + "step": 55825 + }, + { + "epoch": 2.74, + "grad_norm": 0.7775142788887024, + "learning_rate": 1.1400441202032717e-05, + "loss": 2.9262, + "step": 55826 + }, + { + "epoch": 2.74, + "grad_norm": 0.7800171971321106, + "learning_rate": 1.1396237581055212e-05, + "loss": 2.7697, + "step": 55827 + }, + { + "epoch": 2.74, + "grad_norm": 0.7560826539993286, + "learning_rate": 1.1392034720204468e-05, + "loss": 2.7127, + "step": 55828 + }, + { + "epoch": 2.74, + "grad_norm": 0.7661461234092712, + "learning_rate": 1.138783261949161e-05, + "loss": 2.9436, + "step": 55829 + }, + { + "epoch": 2.74, + "grad_norm": 0.7515476942062378, + "learning_rate": 1.138363127892763e-05, + "loss": 2.8215, + "step": 55830 + }, + { + "epoch": 2.74, + "grad_norm": 0.7410834431648254, + "learning_rate": 1.137943069852365e-05, + "loss": 2.9039, + "step": 55831 + }, + { + "epoch": 2.74, + "grad_norm": 0.7249680161476135, + "learning_rate": 1.1375230878290697e-05, + "loss": 2.8317, + "step": 55832 + }, + { + "epoch": 2.74, + "grad_norm": 0.7653201818466187, + "learning_rate": 1.1371031818239795e-05, + "loss": 2.7605, + "step": 55833 + }, + { + "epoch": 2.74, + "grad_norm": 0.7415288090705872, + "learning_rate": 1.1366833518382102e-05, + "loss": 3.0566, + "step": 55834 + }, + { + "epoch": 2.74, + "grad_norm": 0.7455683946609497, + "learning_rate": 1.1362635978728574e-05, + "loss": 2.8293, + "step": 55835 + }, + { + "epoch": 2.74, + "grad_norm": 0.7558901906013489, + "learning_rate": 1.1358439199290337e-05, + "loss": 2.8531, + "step": 55836 + }, + { + "epoch": 2.74, + "grad_norm": 0.6817432641983032, + "learning_rate": 1.1354243180078415e-05, + "loss": 3.1373, + "step": 55837 + }, + { + "epoch": 2.74, + "grad_norm": 0.7174025774002075, + "learning_rate": 1.1350047921103866e-05, + "loss": 2.8915, + "step": 55838 + }, + { + "epoch": 2.74, + "grad_norm": 0.759171187877655, + "learning_rate": 1.1345853422377749e-05, + "loss": 2.965, + "step": 55839 + }, + { + "epoch": 2.74, + "grad_norm": 0.7580920457839966, + "learning_rate": 1.1341659683911086e-05, + "loss": 3.0421, + "step": 55840 + }, + { + "epoch": 2.74, + "grad_norm": 0.7607132792472839, + "learning_rate": 1.133746670571497e-05, + "loss": 2.8414, + "step": 55841 + }, + { + "epoch": 2.74, + "grad_norm": 0.7223249077796936, + "learning_rate": 1.133327448780036e-05, + "loss": 3.2551, + "step": 55842 + }, + { + "epoch": 2.74, + "grad_norm": 0.7324064373970032, + "learning_rate": 1.1329083030178376e-05, + "loss": 2.8549, + "step": 55843 + }, + { + "epoch": 2.74, + "grad_norm": 0.8532474040985107, + "learning_rate": 1.1324892332860047e-05, + "loss": 2.9252, + "step": 55844 + }, + { + "epoch": 2.74, + "grad_norm": 0.7820866703987122, + "learning_rate": 1.1320702395856362e-05, + "loss": 2.8606, + "step": 55845 + }, + { + "epoch": 2.74, + "grad_norm": 0.7099783420562744, + "learning_rate": 1.1316513219178413e-05, + "loss": 2.785, + "step": 55846 + }, + { + "epoch": 2.74, + "grad_norm": 0.7845707535743713, + "learning_rate": 1.1312324802837225e-05, + "loss": 2.8446, + "step": 55847 + }, + { + "epoch": 2.74, + "grad_norm": 0.7959213852882385, + "learning_rate": 1.1308137146843787e-05, + "loss": 2.9116, + "step": 55848 + }, + { + "epoch": 2.74, + "grad_norm": 0.780271589756012, + "learning_rate": 1.130395025120916e-05, + "loss": 2.7504, + "step": 55849 + }, + { + "epoch": 2.74, + "grad_norm": 0.7800261378288269, + "learning_rate": 1.1299764115944365e-05, + "loss": 3.0029, + "step": 55850 + }, + { + "epoch": 2.74, + "grad_norm": 0.7617143392562866, + "learning_rate": 1.1295578741060463e-05, + "loss": 2.8185, + "step": 55851 + }, + { + "epoch": 2.74, + "grad_norm": 0.7690078616142273, + "learning_rate": 1.1291394126568376e-05, + "loss": 2.8685, + "step": 55852 + }, + { + "epoch": 2.74, + "grad_norm": 0.7068555355072021, + "learning_rate": 1.1287210272479264e-05, + "loss": 2.9051, + "step": 55853 + }, + { + "epoch": 2.74, + "grad_norm": 0.7279587388038635, + "learning_rate": 1.1283027178804082e-05, + "loss": 2.832, + "step": 55854 + }, + { + "epoch": 2.74, + "grad_norm": 0.7110167145729065, + "learning_rate": 1.1278844845553792e-05, + "loss": 2.6555, + "step": 55855 + }, + { + "epoch": 2.74, + "grad_norm": 0.7727231383323669, + "learning_rate": 1.1274663272739482e-05, + "loss": 2.8709, + "step": 55856 + }, + { + "epoch": 2.74, + "grad_norm": 0.7116361856460571, + "learning_rate": 1.127048246037211e-05, + "loss": 3.0165, + "step": 55857 + }, + { + "epoch": 2.74, + "grad_norm": 0.7671318054199219, + "learning_rate": 1.1266302408462734e-05, + "loss": 2.8943, + "step": 55858 + }, + { + "epoch": 2.74, + "grad_norm": 0.7194930911064148, + "learning_rate": 1.1262123117022382e-05, + "loss": 2.8822, + "step": 55859 + }, + { + "epoch": 2.74, + "grad_norm": 0.7566820979118347, + "learning_rate": 1.1257944586061974e-05, + "loss": 2.7942, + "step": 55860 + }, + { + "epoch": 2.74, + "grad_norm": 0.717607855796814, + "learning_rate": 1.1253766815592669e-05, + "loss": 2.8937, + "step": 55861 + }, + { + "epoch": 2.74, + "grad_norm": 0.7694738507270813, + "learning_rate": 1.1249589805625292e-05, + "loss": 2.8219, + "step": 55862 + }, + { + "epoch": 2.74, + "grad_norm": 0.729606032371521, + "learning_rate": 1.1245413556170901e-05, + "loss": 2.5149, + "step": 55863 + }, + { + "epoch": 2.74, + "grad_norm": 0.7487224340438843, + "learning_rate": 1.1241238067240588e-05, + "loss": 2.6698, + "step": 55864 + }, + { + "epoch": 2.74, + "grad_norm": 0.7842611074447632, + "learning_rate": 1.1237063338845209e-05, + "loss": 2.8821, + "step": 55865 + }, + { + "epoch": 2.74, + "grad_norm": 0.7518981695175171, + "learning_rate": 1.123288937099589e-05, + "loss": 2.8636, + "step": 55866 + }, + { + "epoch": 2.74, + "grad_norm": 0.7385653257369995, + "learning_rate": 1.1228716163703556e-05, + "loss": 2.8735, + "step": 55867 + }, + { + "epoch": 2.74, + "grad_norm": 0.7072750329971313, + "learning_rate": 1.1224543716979195e-05, + "loss": 2.8221, + "step": 55868 + }, + { + "epoch": 2.74, + "grad_norm": 0.7266203165054321, + "learning_rate": 1.1220372030833836e-05, + "loss": 3.007, + "step": 55869 + }, + { + "epoch": 2.74, + "grad_norm": 0.7844318151473999, + "learning_rate": 1.12162011052784e-05, + "loss": 2.8807, + "step": 55870 + }, + { + "epoch": 2.74, + "grad_norm": 0.866727352142334, + "learning_rate": 1.121203094032398e-05, + "loss": 2.8678, + "step": 55871 + }, + { + "epoch": 2.74, + "grad_norm": 0.7634037137031555, + "learning_rate": 1.12078615359814e-05, + "loss": 2.876, + "step": 55872 + }, + { + "epoch": 2.74, + "grad_norm": 0.72465580701828, + "learning_rate": 1.1203692892261785e-05, + "loss": 2.9704, + "step": 55873 + }, + { + "epoch": 2.74, + "grad_norm": 0.7288641929626465, + "learning_rate": 1.1199525009176059e-05, + "loss": 2.9168, + "step": 55874 + }, + { + "epoch": 2.74, + "grad_norm": 0.7715079188346863, + "learning_rate": 1.1195357886735213e-05, + "loss": 2.7081, + "step": 55875 + }, + { + "epoch": 2.74, + "grad_norm": 0.7418809533119202, + "learning_rate": 1.1191191524950239e-05, + "loss": 2.8808, + "step": 55876 + }, + { + "epoch": 2.74, + "grad_norm": 0.7965562343597412, + "learning_rate": 1.1187025923832027e-05, + "loss": 3.0647, + "step": 55877 + }, + { + "epoch": 2.74, + "grad_norm": 0.6859378218650818, + "learning_rate": 1.1182861083391636e-05, + "loss": 2.8672, + "step": 55878 + }, + { + "epoch": 2.74, + "grad_norm": 0.7590106129646301, + "learning_rate": 1.1178697003639958e-05, + "loss": 2.8654, + "step": 55879 + }, + { + "epoch": 2.74, + "grad_norm": 0.7052371501922607, + "learning_rate": 1.117453368458805e-05, + "loss": 2.7231, + "step": 55880 + }, + { + "epoch": 2.74, + "grad_norm": 0.7554574608802795, + "learning_rate": 1.1170371126246835e-05, + "loss": 2.8292, + "step": 55881 + }, + { + "epoch": 2.74, + "grad_norm": 0.7286297678947449, + "learning_rate": 1.116620932862724e-05, + "loss": 3.073, + "step": 55882 + }, + { + "epoch": 2.74, + "grad_norm": 0.7669945359230042, + "learning_rate": 1.1162048291740321e-05, + "loss": 3.0026, + "step": 55883 + }, + { + "epoch": 2.74, + "grad_norm": 0.7861215472221375, + "learning_rate": 1.1157888015596972e-05, + "loss": 2.9076, + "step": 55884 + }, + { + "epoch": 2.74, + "grad_norm": 0.7036914825439453, + "learning_rate": 1.1153728500208115e-05, + "loss": 2.9243, + "step": 55885 + }, + { + "epoch": 2.74, + "grad_norm": 0.7453054785728455, + "learning_rate": 1.1149569745584774e-05, + "loss": 2.9301, + "step": 55886 + }, + { + "epoch": 2.74, + "grad_norm": 0.7406743764877319, + "learning_rate": 1.1145411751737843e-05, + "loss": 3.0202, + "step": 55887 + }, + { + "epoch": 2.74, + "grad_norm": 0.7263259291648865, + "learning_rate": 1.1141254518678344e-05, + "loss": 3.0923, + "step": 55888 + }, + { + "epoch": 2.74, + "grad_norm": 0.7786001563072205, + "learning_rate": 1.113709804641717e-05, + "loss": 3.0409, + "step": 55889 + }, + { + "epoch": 2.74, + "grad_norm": 0.757474958896637, + "learning_rate": 1.1132942334965311e-05, + "loss": 2.7983, + "step": 55890 + }, + { + "epoch": 2.74, + "grad_norm": 0.7858365774154663, + "learning_rate": 1.1128787384333692e-05, + "loss": 3.0745, + "step": 55891 + }, + { + "epoch": 2.74, + "grad_norm": 0.7730826735496521, + "learning_rate": 1.1124633194533206e-05, + "loss": 2.7281, + "step": 55892 + }, + { + "epoch": 2.74, + "grad_norm": 0.7334781885147095, + "learning_rate": 1.1120479765574875e-05, + "loss": 2.8114, + "step": 55893 + }, + { + "epoch": 2.74, + "grad_norm": 0.7341852784156799, + "learning_rate": 1.111632709746959e-05, + "loss": 2.957, + "step": 55894 + }, + { + "epoch": 2.74, + "grad_norm": 0.7085254788398743, + "learning_rate": 1.1112175190228279e-05, + "loss": 2.9096, + "step": 55895 + }, + { + "epoch": 2.74, + "grad_norm": 0.7142402529716492, + "learning_rate": 1.1108024043861963e-05, + "loss": 3.1765, + "step": 55896 + }, + { + "epoch": 2.74, + "grad_norm": 0.7631176710128784, + "learning_rate": 1.1103873658381502e-05, + "loss": 2.923, + "step": 55897 + }, + { + "epoch": 2.74, + "grad_norm": 0.8920885920524597, + "learning_rate": 1.109972403379782e-05, + "loss": 3.0669, + "step": 55898 + }, + { + "epoch": 2.74, + "grad_norm": 0.7416764497756958, + "learning_rate": 1.109557517012184e-05, + "loss": 2.7871, + "step": 55899 + }, + { + "epoch": 2.74, + "grad_norm": 0.8150529265403748, + "learning_rate": 1.1091427067364523e-05, + "loss": 2.9976, + "step": 55900 + }, + { + "epoch": 2.74, + "grad_norm": 0.7230525612831116, + "learning_rate": 1.1087279725536824e-05, + "loss": 2.9518, + "step": 55901 + }, + { + "epoch": 2.74, + "grad_norm": 0.762174665927887, + "learning_rate": 1.1083133144649569e-05, + "loss": 3.2105, + "step": 55902 + }, + { + "epoch": 2.74, + "grad_norm": 0.7683262228965759, + "learning_rate": 1.1078987324713817e-05, + "loss": 2.7405, + "step": 55903 + }, + { + "epoch": 2.74, + "grad_norm": 0.7210031151771545, + "learning_rate": 1.1074842265740325e-05, + "loss": 3.0056, + "step": 55904 + }, + { + "epoch": 2.74, + "grad_norm": 0.7431241273880005, + "learning_rate": 1.1070697967740149e-05, + "loss": 2.7492, + "step": 55905 + }, + { + "epoch": 2.74, + "grad_norm": 0.7543492317199707, + "learning_rate": 1.106655443072415e-05, + "loss": 2.8373, + "step": 55906 + }, + { + "epoch": 2.74, + "grad_norm": 0.7546671628952026, + "learning_rate": 1.1062411654703218e-05, + "loss": 2.8681, + "step": 55907 + }, + { + "epoch": 2.74, + "grad_norm": 0.7264155745506287, + "learning_rate": 1.105826963968831e-05, + "loss": 2.7632, + "step": 55908 + }, + { + "epoch": 2.74, + "grad_norm": 0.7544650435447693, + "learning_rate": 1.1054128385690254e-05, + "loss": 2.8906, + "step": 55909 + }, + { + "epoch": 2.74, + "grad_norm": 0.7051803469657898, + "learning_rate": 1.1049987892720036e-05, + "loss": 2.8865, + "step": 55910 + }, + { + "epoch": 2.74, + "grad_norm": 0.7877517938613892, + "learning_rate": 1.1045848160788585e-05, + "loss": 2.9269, + "step": 55911 + }, + { + "epoch": 2.74, + "grad_norm": 0.7090002298355103, + "learning_rate": 1.1041709189906756e-05, + "loss": 3.0367, + "step": 55912 + }, + { + "epoch": 2.74, + "grad_norm": 0.7225410342216492, + "learning_rate": 1.1037570980085442e-05, + "loss": 2.7446, + "step": 55913 + }, + { + "epoch": 2.74, + "grad_norm": 0.7321907877922058, + "learning_rate": 1.1033433531335533e-05, + "loss": 2.8573, + "step": 55914 + }, + { + "epoch": 2.74, + "grad_norm": 0.7515150904655457, + "learning_rate": 1.1029296843667923e-05, + "loss": 2.9509, + "step": 55915 + }, + { + "epoch": 2.74, + "grad_norm": 0.7628140449523926, + "learning_rate": 1.10251609170936e-05, + "loss": 3.1318, + "step": 55916 + }, + { + "epoch": 2.74, + "grad_norm": 0.7710598111152649, + "learning_rate": 1.1021025751623324e-05, + "loss": 2.8181, + "step": 55917 + }, + { + "epoch": 2.74, + "grad_norm": 0.7168538570404053, + "learning_rate": 1.1016891347268087e-05, + "loss": 2.9143, + "step": 55918 + }, + { + "epoch": 2.74, + "grad_norm": 0.7750139236450195, + "learning_rate": 1.1012757704038778e-05, + "loss": 2.9098, + "step": 55919 + }, + { + "epoch": 2.74, + "grad_norm": 0.8607103824615479, + "learning_rate": 1.1008624821946188e-05, + "loss": 2.8608, + "step": 55920 + }, + { + "epoch": 2.74, + "grad_norm": 0.7440026998519897, + "learning_rate": 1.1004492701001278e-05, + "loss": 2.8417, + "step": 55921 + }, + { + "epoch": 2.74, + "grad_norm": 0.7108123898506165, + "learning_rate": 1.1000361341214902e-05, + "loss": 2.9667, + "step": 55922 + }, + { + "epoch": 2.74, + "grad_norm": 0.7313125729560852, + "learning_rate": 1.099623074259799e-05, + "loss": 2.6806, + "step": 55923 + }, + { + "epoch": 2.74, + "grad_norm": 0.8081473708152771, + "learning_rate": 1.099210090516136e-05, + "loss": 2.704, + "step": 55924 + }, + { + "epoch": 2.74, + "grad_norm": 0.7507513761520386, + "learning_rate": 1.098797182891591e-05, + "loss": 2.9803, + "step": 55925 + }, + { + "epoch": 2.74, + "grad_norm": 0.770849347114563, + "learning_rate": 1.0983843513872559e-05, + "loss": 3.0487, + "step": 55926 + }, + { + "epoch": 2.74, + "grad_norm": 0.7244208455085754, + "learning_rate": 1.0979715960042135e-05, + "loss": 2.879, + "step": 55927 + }, + { + "epoch": 2.74, + "grad_norm": 0.7546032667160034, + "learning_rate": 1.0975589167435527e-05, + "loss": 3.0663, + "step": 55928 + }, + { + "epoch": 2.74, + "grad_norm": 0.8063235878944397, + "learning_rate": 1.097146313606353e-05, + "loss": 2.9658, + "step": 55929 + }, + { + "epoch": 2.74, + "grad_norm": 0.714979887008667, + "learning_rate": 1.0967337865937098e-05, + "loss": 2.9623, + "step": 55930 + }, + { + "epoch": 2.74, + "grad_norm": 0.7428481578826904, + "learning_rate": 1.096321335706709e-05, + "loss": 2.9092, + "step": 55931 + }, + { + "epoch": 2.74, + "grad_norm": 0.7758767008781433, + "learning_rate": 1.0959089609464334e-05, + "loss": 2.7544, + "step": 55932 + }, + { + "epoch": 2.74, + "grad_norm": 0.7635376453399658, + "learning_rate": 1.0954966623139748e-05, + "loss": 2.9114, + "step": 55933 + }, + { + "epoch": 2.74, + "grad_norm": 0.7332720160484314, + "learning_rate": 1.095084439810413e-05, + "loss": 2.836, + "step": 55934 + }, + { + "epoch": 2.74, + "grad_norm": 0.7916608452796936, + "learning_rate": 1.0946722934368334e-05, + "loss": 2.814, + "step": 55935 + }, + { + "epoch": 2.74, + "grad_norm": 0.7521407008171082, + "learning_rate": 1.0942602231943287e-05, + "loss": 2.8053, + "step": 55936 + }, + { + "epoch": 2.74, + "grad_norm": 0.8015977144241333, + "learning_rate": 1.0938482290839745e-05, + "loss": 2.9304, + "step": 55937 + }, + { + "epoch": 2.74, + "grad_norm": 0.7351771593093872, + "learning_rate": 1.0934363111068668e-05, + "loss": 2.9438, + "step": 55938 + }, + { + "epoch": 2.74, + "grad_norm": 0.7315554022789001, + "learning_rate": 1.0930244692640777e-05, + "loss": 2.8405, + "step": 55939 + }, + { + "epoch": 2.74, + "grad_norm": 0.7486281394958496, + "learning_rate": 1.0926127035567067e-05, + "loss": 2.9316, + "step": 55940 + }, + { + "epoch": 2.74, + "grad_norm": 0.721809446811676, + "learning_rate": 1.0922010139858262e-05, + "loss": 2.9926, + "step": 55941 + }, + { + "epoch": 2.74, + "grad_norm": 0.7924941778182983, + "learning_rate": 1.0917894005525251e-05, + "loss": 2.8259, + "step": 55942 + }, + { + "epoch": 2.74, + "grad_norm": 0.7480098009109497, + "learning_rate": 1.0913778632578862e-05, + "loss": 2.9488, + "step": 55943 + }, + { + "epoch": 2.74, + "grad_norm": 0.7476303577423096, + "learning_rate": 1.090966402102995e-05, + "loss": 2.8346, + "step": 55944 + }, + { + "epoch": 2.74, + "grad_norm": 0.7705007195472717, + "learning_rate": 1.0905550170889344e-05, + "loss": 2.9332, + "step": 55945 + }, + { + "epoch": 2.74, + "grad_norm": 0.745185375213623, + "learning_rate": 1.0901437082167896e-05, + "loss": 2.8909, + "step": 55946 + }, + { + "epoch": 2.74, + "grad_norm": 0.7382268905639648, + "learning_rate": 1.0897324754876402e-05, + "loss": 2.8413, + "step": 55947 + }, + { + "epoch": 2.74, + "grad_norm": 0.8054601550102234, + "learning_rate": 1.0893213189025751e-05, + "loss": 2.7561, + "step": 55948 + }, + { + "epoch": 2.74, + "grad_norm": 0.7425912022590637, + "learning_rate": 1.0889102384626702e-05, + "loss": 2.9528, + "step": 55949 + }, + { + "epoch": 2.74, + "grad_norm": 0.7311367988586426, + "learning_rate": 1.0884992341690113e-05, + "loss": 2.9087, + "step": 55950 + }, + { + "epoch": 2.74, + "grad_norm": 0.752752959728241, + "learning_rate": 1.0880883060226875e-05, + "loss": 3.0692, + "step": 55951 + }, + { + "epoch": 2.74, + "grad_norm": 0.74880051612854, + "learning_rate": 1.087677454024768e-05, + "loss": 2.8871, + "step": 55952 + }, + { + "epoch": 2.74, + "grad_norm": 0.776472806930542, + "learning_rate": 1.087266678176345e-05, + "loss": 2.823, + "step": 55953 + }, + { + "epoch": 2.74, + "grad_norm": 0.7268320322036743, + "learning_rate": 1.0868559784784947e-05, + "loss": 2.6937, + "step": 55954 + }, + { + "epoch": 2.74, + "grad_norm": 0.7472983598709106, + "learning_rate": 1.086445354932306e-05, + "loss": 3.2036, + "step": 55955 + }, + { + "epoch": 2.74, + "grad_norm": 0.7563039064407349, + "learning_rate": 1.0860348075388514e-05, + "loss": 2.9643, + "step": 55956 + }, + { + "epoch": 2.74, + "grad_norm": 0.72945237159729, + "learning_rate": 1.0856243362992167e-05, + "loss": 2.9227, + "step": 55957 + }, + { + "epoch": 2.74, + "grad_norm": 0.7762035727500916, + "learning_rate": 1.0852139412144877e-05, + "loss": 2.5892, + "step": 55958 + }, + { + "epoch": 2.74, + "grad_norm": 0.7469337582588196, + "learning_rate": 1.0848036222857337e-05, + "loss": 3.0108, + "step": 55959 + }, + { + "epoch": 2.74, + "grad_norm": 0.6988431215286255, + "learning_rate": 1.084393379514047e-05, + "loss": 3.0516, + "step": 55960 + }, + { + "epoch": 2.74, + "grad_norm": 0.7568329572677612, + "learning_rate": 1.0839832129005e-05, + "loss": 2.9372, + "step": 55961 + }, + { + "epoch": 2.74, + "grad_norm": 0.7815048098564148, + "learning_rate": 1.0835731224461786e-05, + "loss": 2.7753, + "step": 55962 + }, + { + "epoch": 2.74, + "grad_norm": 0.7493932843208313, + "learning_rate": 1.0831631081521618e-05, + "loss": 2.9575, + "step": 55963 + }, + { + "epoch": 2.74, + "grad_norm": 0.7615856528282166, + "learning_rate": 1.0827531700195224e-05, + "loss": 2.6857, + "step": 55964 + }, + { + "epoch": 2.74, + "grad_norm": 0.889343798160553, + "learning_rate": 1.0823433080493526e-05, + "loss": 2.6804, + "step": 55965 + }, + { + "epoch": 2.74, + "grad_norm": 0.7530947923660278, + "learning_rate": 1.0819335222427183e-05, + "loss": 2.8449, + "step": 55966 + }, + { + "epoch": 2.74, + "grad_norm": 0.7794963121414185, + "learning_rate": 1.0815238126007086e-05, + "loss": 2.8913, + "step": 55967 + }, + { + "epoch": 2.74, + "grad_norm": 0.7236289381980896, + "learning_rate": 1.0811141791244026e-05, + "loss": 2.8246, + "step": 55968 + }, + { + "epoch": 2.74, + "grad_norm": 0.7335292100906372, + "learning_rate": 1.0807046218148729e-05, + "loss": 2.9881, + "step": 55969 + }, + { + "epoch": 2.74, + "grad_norm": 0.7387674450874329, + "learning_rate": 1.0802951406732018e-05, + "loss": 2.9562, + "step": 55970 + }, + { + "epoch": 2.74, + "grad_norm": 0.754050076007843, + "learning_rate": 1.0798857357004719e-05, + "loss": 2.7057, + "step": 55971 + }, + { + "epoch": 2.74, + "grad_norm": 0.7682558298110962, + "learning_rate": 1.0794764068977524e-05, + "loss": 2.7377, + "step": 55972 + }, + { + "epoch": 2.74, + "grad_norm": 0.7760759592056274, + "learning_rate": 1.0790671542661289e-05, + "loss": 2.8588, + "step": 55973 + }, + { + "epoch": 2.74, + "grad_norm": 0.74440997838974, + "learning_rate": 1.0786579778066739e-05, + "loss": 2.9655, + "step": 55974 + }, + { + "epoch": 2.74, + "grad_norm": 0.7802349925041199, + "learning_rate": 1.07824887752047e-05, + "loss": 2.9066, + "step": 55975 + }, + { + "epoch": 2.74, + "grad_norm": 0.7338876724243164, + "learning_rate": 1.0778398534085898e-05, + "loss": 3.0789, + "step": 55976 + }, + { + "epoch": 2.74, + "grad_norm": 0.7808542251586914, + "learning_rate": 1.0774309054721187e-05, + "loss": 2.7526, + "step": 55977 + }, + { + "epoch": 2.74, + "grad_norm": 0.7372686862945557, + "learning_rate": 1.077022033712126e-05, + "loss": 2.9052, + "step": 55978 + }, + { + "epoch": 2.74, + "grad_norm": 0.8141536116600037, + "learning_rate": 1.0766132381296876e-05, + "loss": 2.8219, + "step": 55979 + }, + { + "epoch": 2.74, + "grad_norm": 0.7406730651855469, + "learning_rate": 1.076204518725886e-05, + "loss": 2.9798, + "step": 55980 + }, + { + "epoch": 2.74, + "grad_norm": 0.7653099298477173, + "learning_rate": 1.0757958755017937e-05, + "loss": 2.756, + "step": 55981 + }, + { + "epoch": 2.74, + "grad_norm": 0.7098361849784851, + "learning_rate": 1.0753873084584896e-05, + "loss": 2.767, + "step": 55982 + }, + { + "epoch": 2.74, + "grad_norm": 0.7831395268440247, + "learning_rate": 1.0749788175970497e-05, + "loss": 2.6796, + "step": 55983 + }, + { + "epoch": 2.74, + "grad_norm": 0.79066401720047, + "learning_rate": 1.0745704029185464e-05, + "loss": 2.7994, + "step": 55984 + }, + { + "epoch": 2.74, + "grad_norm": 0.7196535468101501, + "learning_rate": 1.0741620644240622e-05, + "loss": 2.7152, + "step": 55985 + }, + { + "epoch": 2.74, + "grad_norm": 0.8042973875999451, + "learning_rate": 1.0737538021146663e-05, + "loss": 2.9082, + "step": 55986 + }, + { + "epoch": 2.74, + "grad_norm": 0.8539407849311829, + "learning_rate": 1.073345615991431e-05, + "loss": 3.0031, + "step": 55987 + }, + { + "epoch": 2.74, + "grad_norm": 0.7349897027015686, + "learning_rate": 1.0729375060554424e-05, + "loss": 2.8356, + "step": 55988 + }, + { + "epoch": 2.74, + "grad_norm": 0.7442134022712708, + "learning_rate": 1.072529472307766e-05, + "loss": 2.9911, + "step": 55989 + }, + { + "epoch": 2.74, + "grad_norm": 0.742074191570282, + "learning_rate": 1.072121514749481e-05, + "loss": 2.9064, + "step": 55990 + }, + { + "epoch": 2.74, + "grad_norm": 0.7211217880249023, + "learning_rate": 1.07171363338166e-05, + "loss": 2.8363, + "step": 55991 + }, + { + "epoch": 2.74, + "grad_norm": 0.7274338006973267, + "learning_rate": 1.0713058282053788e-05, + "loss": 2.9167, + "step": 55992 + }, + { + "epoch": 2.74, + "grad_norm": 0.7637966871261597, + "learning_rate": 1.0708980992217131e-05, + "loss": 3.002, + "step": 55993 + }, + { + "epoch": 2.74, + "grad_norm": 0.7385900616645813, + "learning_rate": 1.0704904464317287e-05, + "loss": 2.7543, + "step": 55994 + }, + { + "epoch": 2.74, + "grad_norm": 0.7418367862701416, + "learning_rate": 1.0700828698365117e-05, + "loss": 2.9536, + "step": 55995 + }, + { + "epoch": 2.74, + "grad_norm": 0.7280617356300354, + "learning_rate": 1.0696753694371207e-05, + "loss": 2.6508, + "step": 55996 + }, + { + "epoch": 2.74, + "grad_norm": 0.7581601738929749, + "learning_rate": 1.069267945234642e-05, + "loss": 3.0828, + "step": 55997 + }, + { + "epoch": 2.74, + "grad_norm": 0.7517957091331482, + "learning_rate": 1.0688605972301444e-05, + "loss": 2.8601, + "step": 55998 + }, + { + "epoch": 2.74, + "grad_norm": 0.7825131416320801, + "learning_rate": 1.0684533254246974e-05, + "loss": 2.9613, + "step": 55999 + }, + { + "epoch": 2.74, + "grad_norm": 0.7765049934387207, + "learning_rate": 1.0680461298193799e-05, + "loss": 2.908, + "step": 56000 + }, + { + "epoch": 2.74, + "grad_norm": 0.7475263476371765, + "learning_rate": 1.0676390104152577e-05, + "loss": 2.9421, + "step": 56001 + }, + { + "epoch": 2.74, + "grad_norm": 0.7581568360328674, + "learning_rate": 1.0672319672134033e-05, + "loss": 2.7795, + "step": 56002 + }, + { + "epoch": 2.74, + "grad_norm": 0.7449358105659485, + "learning_rate": 1.0668250002148993e-05, + "loss": 2.8471, + "step": 56003 + }, + { + "epoch": 2.74, + "grad_norm": 0.7291288375854492, + "learning_rate": 1.0664181094208012e-05, + "loss": 3.0978, + "step": 56004 + }, + { + "epoch": 2.74, + "grad_norm": 0.7702457308769226, + "learning_rate": 1.0660112948321953e-05, + "loss": 2.9775, + "step": 56005 + }, + { + "epoch": 2.74, + "grad_norm": 0.738068699836731, + "learning_rate": 1.065604556450147e-05, + "loss": 2.9591, + "step": 56006 + }, + { + "epoch": 2.74, + "grad_norm": 0.7106137871742249, + "learning_rate": 1.0651978942757256e-05, + "loss": 2.7704, + "step": 56007 + }, + { + "epoch": 2.74, + "grad_norm": 0.7228193879127502, + "learning_rate": 1.0647913083100101e-05, + "loss": 2.8967, + "step": 56008 + }, + { + "epoch": 2.74, + "grad_norm": 0.728057324886322, + "learning_rate": 1.0643847985540566e-05, + "loss": 2.8884, + "step": 56009 + }, + { + "epoch": 2.74, + "grad_norm": 0.8260751366615295, + "learning_rate": 1.0639783650089506e-05, + "loss": 3.0548, + "step": 56010 + }, + { + "epoch": 2.74, + "grad_norm": 0.7846874594688416, + "learning_rate": 1.0635720076757548e-05, + "loss": 2.8825, + "step": 56011 + }, + { + "epoch": 2.75, + "grad_norm": 0.7633529901504517, + "learning_rate": 1.0631657265555382e-05, + "loss": 2.8856, + "step": 56012 + }, + { + "epoch": 2.75, + "grad_norm": 0.7845501899719238, + "learning_rate": 1.0627595216493801e-05, + "loss": 2.9424, + "step": 56013 + }, + { + "epoch": 2.75, + "grad_norm": 0.7749620676040649, + "learning_rate": 1.0623533929583461e-05, + "loss": 2.994, + "step": 56014 + }, + { + "epoch": 2.75, + "grad_norm": 0.7692545056343079, + "learning_rate": 1.0619473404834988e-05, + "loss": 3.0687, + "step": 56015 + }, + { + "epoch": 2.75, + "grad_norm": 0.7629958391189575, + "learning_rate": 1.0615413642259141e-05, + "loss": 3.0601, + "step": 56016 + }, + { + "epoch": 2.75, + "grad_norm": 0.7727113962173462, + "learning_rate": 1.0611354641866576e-05, + "loss": 2.9892, + "step": 56017 + }, + { + "epoch": 2.75, + "grad_norm": 0.7810448408126831, + "learning_rate": 1.0607296403668052e-05, + "loss": 3.0302, + "step": 56018 + }, + { + "epoch": 2.75, + "grad_norm": 0.7546478509902954, + "learning_rate": 1.0603238927674197e-05, + "loss": 3.0782, + "step": 56019 + }, + { + "epoch": 2.75, + "grad_norm": 0.7549473643302917, + "learning_rate": 1.0599182213895729e-05, + "loss": 2.8536, + "step": 56020 + }, + { + "epoch": 2.75, + "grad_norm": 0.712246298789978, + "learning_rate": 1.0595126262343313e-05, + "loss": 2.8927, + "step": 56021 + }, + { + "epoch": 2.75, + "grad_norm": 0.7353459000587463, + "learning_rate": 1.0591071073027601e-05, + "loss": 2.7289, + "step": 56022 + }, + { + "epoch": 2.75, + "grad_norm": 0.7388055920600891, + "learning_rate": 1.0587016645959356e-05, + "loss": 2.7817, + "step": 56023 + }, + { + "epoch": 2.75, + "grad_norm": 0.7375566959381104, + "learning_rate": 1.0582962981149167e-05, + "loss": 2.7332, + "step": 56024 + }, + { + "epoch": 2.75, + "grad_norm": 0.7719786167144775, + "learning_rate": 1.0578910078607794e-05, + "loss": 2.8114, + "step": 56025 + }, + { + "epoch": 2.75, + "grad_norm": 0.7156137824058533, + "learning_rate": 1.057485793834586e-05, + "loss": 2.8983, + "step": 56026 + }, + { + "epoch": 2.75, + "grad_norm": 0.7468981742858887, + "learning_rate": 1.0570806560374023e-05, + "loss": 2.8096, + "step": 56027 + }, + { + "epoch": 2.75, + "grad_norm": 0.7423037886619568, + "learning_rate": 1.0566755944703009e-05, + "loss": 2.9614, + "step": 56028 + }, + { + "epoch": 2.75, + "grad_norm": 0.7885465621948242, + "learning_rate": 1.0562706091343475e-05, + "loss": 2.8188, + "step": 56029 + }, + { + "epoch": 2.75, + "grad_norm": 0.7959953546524048, + "learning_rate": 1.055865700030608e-05, + "loss": 2.7377, + "step": 56030 + }, + { + "epoch": 2.75, + "grad_norm": 0.7130370736122131, + "learning_rate": 1.0554608671601417e-05, + "loss": 2.8816, + "step": 56031 + }, + { + "epoch": 2.75, + "grad_norm": 0.724517822265625, + "learning_rate": 1.055056110524024e-05, + "loss": 2.898, + "step": 56032 + }, + { + "epoch": 2.75, + "grad_norm": 0.7943875789642334, + "learning_rate": 1.0546514301233178e-05, + "loss": 2.956, + "step": 56033 + }, + { + "epoch": 2.75, + "grad_norm": 0.7979520559310913, + "learning_rate": 1.0542468259590853e-05, + "loss": 2.7335, + "step": 56034 + }, + { + "epoch": 2.75, + "grad_norm": 0.7505634427070618, + "learning_rate": 1.0538422980324025e-05, + "loss": 2.8819, + "step": 56035 + }, + { + "epoch": 2.75, + "grad_norm": 0.761986255645752, + "learning_rate": 1.0534378463443283e-05, + "loss": 2.7005, + "step": 56036 + }, + { + "epoch": 2.75, + "grad_norm": 0.7043732404708862, + "learning_rate": 1.0530334708959255e-05, + "loss": 2.8908, + "step": 56037 + }, + { + "epoch": 2.75, + "grad_norm": 0.7133161425590515, + "learning_rate": 1.0526291716882596e-05, + "loss": 2.9596, + "step": 56038 + }, + { + "epoch": 2.75, + "grad_norm": 0.7343462109565735, + "learning_rate": 1.0522249487223966e-05, + "loss": 2.8832, + "step": 56039 + }, + { + "epoch": 2.75, + "grad_norm": 0.7378050684928894, + "learning_rate": 1.0518208019994056e-05, + "loss": 2.9357, + "step": 56040 + }, + { + "epoch": 2.75, + "grad_norm": 0.7832443714141846, + "learning_rate": 1.0514167315203458e-05, + "loss": 3.0828, + "step": 56041 + }, + { + "epoch": 2.75, + "grad_norm": 0.715339183807373, + "learning_rate": 1.0510127372862831e-05, + "loss": 2.8073, + "step": 56042 + }, + { + "epoch": 2.75, + "grad_norm": 0.7633505463600159, + "learning_rate": 1.0506088192982831e-05, + "loss": 2.9054, + "step": 56043 + }, + { + "epoch": 2.75, + "grad_norm": 0.7171920537948608, + "learning_rate": 1.050204977557405e-05, + "loss": 2.8434, + "step": 56044 + }, + { + "epoch": 2.75, + "grad_norm": 0.7434439063072205, + "learning_rate": 1.0498012120647215e-05, + "loss": 2.8091, + "step": 56045 + }, + { + "epoch": 2.75, + "grad_norm": 0.7520685791969299, + "learning_rate": 1.0493975228212814e-05, + "loss": 2.7975, + "step": 56046 + }, + { + "epoch": 2.75, + "grad_norm": 0.7988944053649902, + "learning_rate": 1.048993909828164e-05, + "loss": 2.6339, + "step": 56047 + }, + { + "epoch": 2.75, + "grad_norm": 0.7427051663398743, + "learning_rate": 1.048590373086422e-05, + "loss": 2.7419, + "step": 56048 + }, + { + "epoch": 2.75, + "grad_norm": 0.7187119126319885, + "learning_rate": 1.0481869125971176e-05, + "loss": 2.964, + "step": 56049 + }, + { + "epoch": 2.75, + "grad_norm": 0.7609080672264099, + "learning_rate": 1.0477835283613233e-05, + "loss": 2.6344, + "step": 56050 + }, + { + "epoch": 2.75, + "grad_norm": 0.7719628810882568, + "learning_rate": 1.0473802203800918e-05, + "loss": 2.8794, + "step": 56051 + }, + { + "epoch": 2.75, + "grad_norm": 0.807428240776062, + "learning_rate": 1.0469769886544921e-05, + "loss": 2.9451, + "step": 56052 + }, + { + "epoch": 2.75, + "grad_norm": 0.7161225080490112, + "learning_rate": 1.0465738331855767e-05, + "loss": 2.7363, + "step": 56053 + }, + { + "epoch": 2.75, + "grad_norm": 0.7533316612243652, + "learning_rate": 1.0461707539744179e-05, + "loss": 2.9044, + "step": 56054 + }, + { + "epoch": 2.75, + "grad_norm": 0.7733919620513916, + "learning_rate": 1.045767751022072e-05, + "loss": 3.0109, + "step": 56055 + }, + { + "epoch": 2.75, + "grad_norm": 0.7934540510177612, + "learning_rate": 1.0453648243296008e-05, + "loss": 2.6885, + "step": 56056 + }, + { + "epoch": 2.75, + "grad_norm": 0.75040203332901, + "learning_rate": 1.0449619738980675e-05, + "loss": 3.0411, + "step": 56057 + }, + { + "epoch": 2.75, + "grad_norm": 0.7220397591590881, + "learning_rate": 1.0445591997285341e-05, + "loss": 2.8238, + "step": 56058 + }, + { + "epoch": 2.75, + "grad_norm": 0.7255467772483826, + "learning_rate": 1.0441565018220532e-05, + "loss": 3.0506, + "step": 56059 + }, + { + "epoch": 2.75, + "grad_norm": 0.7547920346260071, + "learning_rate": 1.0437538801796973e-05, + "loss": 3.0475, + "step": 56060 + }, + { + "epoch": 2.75, + "grad_norm": 0.7408540844917297, + "learning_rate": 1.0433513348025158e-05, + "loss": 2.8226, + "step": 56061 + }, + { + "epoch": 2.75, + "grad_norm": 0.7349470853805542, + "learning_rate": 1.0429488656915807e-05, + "loss": 2.9271, + "step": 56062 + }, + { + "epoch": 2.75, + "grad_norm": 0.7243402600288391, + "learning_rate": 1.0425464728479382e-05, + "loss": 2.8937, + "step": 56063 + }, + { + "epoch": 2.75, + "grad_norm": 0.7938183546066284, + "learning_rate": 1.0421441562726607e-05, + "loss": 2.8097, + "step": 56064 + }, + { + "epoch": 2.75, + "grad_norm": 0.7476469278335571, + "learning_rate": 1.0417419159668006e-05, + "loss": 2.5328, + "step": 56065 + }, + { + "epoch": 2.75, + "grad_norm": 0.7038164734840393, + "learning_rate": 1.041339751931417e-05, + "loss": 2.7658, + "step": 56066 + }, + { + "epoch": 2.75, + "grad_norm": 0.6968178749084473, + "learning_rate": 1.0409376641675759e-05, + "loss": 2.946, + "step": 56067 + }, + { + "epoch": 2.75, + "grad_norm": 0.7435596585273743, + "learning_rate": 1.0405356526763264e-05, + "loss": 2.8976, + "step": 56068 + }, + { + "epoch": 2.75, + "grad_norm": 0.7599115371704102, + "learning_rate": 1.0401337174587344e-05, + "loss": 2.6582, + "step": 56069 + }, + { + "epoch": 2.75, + "grad_norm": 0.7440100312232971, + "learning_rate": 1.0397318585158587e-05, + "loss": 2.7613, + "step": 56070 + }, + { + "epoch": 2.75, + "grad_norm": 0.7963297963142395, + "learning_rate": 1.0393300758487522e-05, + "loss": 2.7282, + "step": 56071 + }, + { + "epoch": 2.75, + "grad_norm": 0.746436357498169, + "learning_rate": 1.0389283694584805e-05, + "loss": 2.777, + "step": 56072 + }, + { + "epoch": 2.75, + "grad_norm": 0.7619957327842712, + "learning_rate": 1.0385267393460994e-05, + "loss": 2.6955, + "step": 56073 + }, + { + "epoch": 2.75, + "grad_norm": 0.7717376947402954, + "learning_rate": 1.0381251855126582e-05, + "loss": 3.0108, + "step": 56074 + }, + { + "epoch": 2.75, + "grad_norm": 0.7113671898841858, + "learning_rate": 1.037723707959226e-05, + "loss": 2.7965, + "step": 56075 + }, + { + "epoch": 2.75, + "grad_norm": 0.7118061184883118, + "learning_rate": 1.0373223066868552e-05, + "loss": 2.8349, + "step": 56076 + }, + { + "epoch": 2.75, + "grad_norm": 0.7409781217575073, + "learning_rate": 1.0369209816966051e-05, + "loss": 2.7353, + "step": 56077 + }, + { + "epoch": 2.75, + "grad_norm": 0.7771452069282532, + "learning_rate": 1.0365197329895281e-05, + "loss": 2.8024, + "step": 56078 + }, + { + "epoch": 2.75, + "grad_norm": 0.787254810333252, + "learning_rate": 1.0361185605666866e-05, + "loss": 2.937, + "step": 56079 + }, + { + "epoch": 2.75, + "grad_norm": 0.7171097993850708, + "learning_rate": 1.0357174644291332e-05, + "loss": 2.9785, + "step": 56080 + }, + { + "epoch": 2.75, + "grad_norm": 0.7196240425109863, + "learning_rate": 1.0353164445779238e-05, + "loss": 2.988, + "step": 56081 + }, + { + "epoch": 2.75, + "grad_norm": 0.718910813331604, + "learning_rate": 1.0349155010141208e-05, + "loss": 2.6988, + "step": 56082 + }, + { + "epoch": 2.75, + "grad_norm": 0.7512689828872681, + "learning_rate": 1.03451463373877e-05, + "loss": 3.0488, + "step": 56083 + }, + { + "epoch": 2.75, + "grad_norm": 0.7203948497772217, + "learning_rate": 1.0341138427529339e-05, + "loss": 2.8411, + "step": 56084 + }, + { + "epoch": 2.75, + "grad_norm": 0.7465165257453918, + "learning_rate": 1.0337131280576716e-05, + "loss": 2.9288, + "step": 56085 + }, + { + "epoch": 2.75, + "grad_norm": 0.7512971758842468, + "learning_rate": 1.0333124896540324e-05, + "loss": 2.981, + "step": 56086 + }, + { + "epoch": 2.75, + "grad_norm": 0.7287713885307312, + "learning_rate": 1.0329119275430785e-05, + "loss": 2.9989, + "step": 56087 + }, + { + "epoch": 2.75, + "grad_norm": 0.7980208992958069, + "learning_rate": 1.0325114417258529e-05, + "loss": 2.9048, + "step": 56088 + }, + { + "epoch": 2.75, + "grad_norm": 0.6989551782608032, + "learning_rate": 1.0321110322034175e-05, + "loss": 2.8285, + "step": 56089 + }, + { + "epoch": 2.75, + "grad_norm": 0.7511205077171326, + "learning_rate": 1.0317106989768287e-05, + "loss": 2.8994, + "step": 56090 + }, + { + "epoch": 2.75, + "grad_norm": 0.7218790650367737, + "learning_rate": 1.0313104420471386e-05, + "loss": 2.7294, + "step": 56091 + }, + { + "epoch": 2.75, + "grad_norm": 0.7532414197921753, + "learning_rate": 1.030910261415403e-05, + "loss": 2.9913, + "step": 56092 + }, + { + "epoch": 2.75, + "grad_norm": 0.7547132968902588, + "learning_rate": 1.0305101570826713e-05, + "loss": 2.8585, + "step": 56093 + }, + { + "epoch": 2.75, + "grad_norm": 0.7735399603843689, + "learning_rate": 1.0301101290500025e-05, + "loss": 3.0481, + "step": 56094 + }, + { + "epoch": 2.75, + "grad_norm": 0.728217363357544, + "learning_rate": 1.0297101773184523e-05, + "loss": 2.887, + "step": 56095 + }, + { + "epoch": 2.75, + "grad_norm": 0.7460715174674988, + "learning_rate": 1.0293103018890637e-05, + "loss": 2.7163, + "step": 56096 + }, + { + "epoch": 2.75, + "grad_norm": 0.7449331283569336, + "learning_rate": 1.0289105027628985e-05, + "loss": 2.903, + "step": 56097 + }, + { + "epoch": 2.75, + "grad_norm": 0.738858699798584, + "learning_rate": 1.0285107799410063e-05, + "loss": 2.7658, + "step": 56098 + }, + { + "epoch": 2.75, + "grad_norm": 0.7564826011657715, + "learning_rate": 1.0281111334244396e-05, + "loss": 2.8346, + "step": 56099 + }, + { + "epoch": 2.75, + "grad_norm": 0.7498881816864014, + "learning_rate": 1.0277115632142574e-05, + "loss": 2.8828, + "step": 56100 + }, + { + "epoch": 2.75, + "grad_norm": 0.7890697717666626, + "learning_rate": 1.0273120693115056e-05, + "loss": 2.8409, + "step": 56101 + }, + { + "epoch": 2.75, + "grad_norm": 0.7382162809371948, + "learning_rate": 1.02691265171724e-05, + "loss": 2.915, + "step": 56102 + }, + { + "epoch": 2.75, + "grad_norm": 0.7483049035072327, + "learning_rate": 1.0265133104325063e-05, + "loss": 2.6881, + "step": 56103 + }, + { + "epoch": 2.75, + "grad_norm": 0.7414224147796631, + "learning_rate": 1.0261140454583572e-05, + "loss": 2.8794, + "step": 56104 + }, + { + "epoch": 2.75, + "grad_norm": 0.7294985055923462, + "learning_rate": 1.0257148567958551e-05, + "loss": 2.8673, + "step": 56105 + }, + { + "epoch": 2.75, + "grad_norm": 0.7478199005126953, + "learning_rate": 1.0253157444460391e-05, + "loss": 3.0047, + "step": 56106 + }, + { + "epoch": 2.75, + "grad_norm": 0.7180767059326172, + "learning_rate": 1.0249167084099685e-05, + "loss": 2.7526, + "step": 56107 + }, + { + "epoch": 2.75, + "grad_norm": 0.7234374284744263, + "learning_rate": 1.0245177486886858e-05, + "loss": 3.1474, + "step": 56108 + }, + { + "epoch": 2.75, + "grad_norm": 0.7626802325248718, + "learning_rate": 1.0241188652832533e-05, + "loss": 2.8545, + "step": 56109 + }, + { + "epoch": 2.75, + "grad_norm": 0.7502772212028503, + "learning_rate": 1.0237200581947136e-05, + "loss": 2.9798, + "step": 56110 + }, + { + "epoch": 2.75, + "grad_norm": 0.7730661034584045, + "learning_rate": 1.0233213274241158e-05, + "loss": 3.0912, + "step": 56111 + }, + { + "epoch": 2.75, + "grad_norm": 0.7321888208389282, + "learning_rate": 1.022922672972516e-05, + "loss": 2.8724, + "step": 56112 + }, + { + "epoch": 2.75, + "grad_norm": 0.7915605902671814, + "learning_rate": 1.0225240948409563e-05, + "loss": 2.7356, + "step": 56113 + }, + { + "epoch": 2.75, + "grad_norm": 0.7326378226280212, + "learning_rate": 1.0221255930304961e-05, + "loss": 2.7905, + "step": 56114 + }, + { + "epoch": 2.75, + "grad_norm": 0.7629287838935852, + "learning_rate": 1.0217271675421779e-05, + "loss": 2.915, + "step": 56115 + }, + { + "epoch": 2.75, + "grad_norm": 0.721251904964447, + "learning_rate": 1.0213288183770574e-05, + "loss": 2.9442, + "step": 56116 + }, + { + "epoch": 2.75, + "grad_norm": 0.8092251420021057, + "learning_rate": 1.0209305455361771e-05, + "loss": 2.9196, + "step": 56117 + }, + { + "epoch": 2.75, + "grad_norm": 0.7478073835372925, + "learning_rate": 1.0205323490205863e-05, + "loss": 2.8916, + "step": 56118 + }, + { + "epoch": 2.75, + "grad_norm": 0.7610535621643066, + "learning_rate": 1.0201342288313374e-05, + "loss": 2.9998, + "step": 56119 + }, + { + "epoch": 2.75, + "grad_norm": 0.7558162212371826, + "learning_rate": 1.0197361849694763e-05, + "loss": 2.7903, + "step": 56120 + }, + { + "epoch": 2.75, + "grad_norm": 0.7537033557891846, + "learning_rate": 1.0193382174360554e-05, + "loss": 2.9119, + "step": 56121 + }, + { + "epoch": 2.75, + "grad_norm": 0.7669283747673035, + "learning_rate": 1.0189403262321205e-05, + "loss": 2.9549, + "step": 56122 + }, + { + "epoch": 2.75, + "grad_norm": 0.7572535872459412, + "learning_rate": 1.0185425113587175e-05, + "loss": 3.1026, + "step": 56123 + }, + { + "epoch": 2.75, + "grad_norm": 0.7835591435432434, + "learning_rate": 1.018144772816899e-05, + "loss": 2.9209, + "step": 56124 + }, + { + "epoch": 2.75, + "grad_norm": 0.8195697665214539, + "learning_rate": 1.017747110607704e-05, + "loss": 2.9498, + "step": 56125 + }, + { + "epoch": 2.75, + "grad_norm": 0.7439544796943665, + "learning_rate": 1.0173495247321884e-05, + "loss": 2.8571, + "step": 56126 + }, + { + "epoch": 2.75, + "grad_norm": 0.7486630082130432, + "learning_rate": 1.016952015191398e-05, + "loss": 2.938, + "step": 56127 + }, + { + "epoch": 2.75, + "grad_norm": 0.7225660085678101, + "learning_rate": 1.0165545819863751e-05, + "loss": 2.8997, + "step": 56128 + }, + { + "epoch": 2.75, + "grad_norm": 0.7393267750740051, + "learning_rate": 1.0161572251181692e-05, + "loss": 2.9516, + "step": 56129 + }, + { + "epoch": 2.75, + "grad_norm": 0.7707771062850952, + "learning_rate": 1.0157599445878296e-05, + "loss": 2.9941, + "step": 56130 + }, + { + "epoch": 2.75, + "grad_norm": 0.8051019906997681, + "learning_rate": 1.0153627403963983e-05, + "loss": 2.863, + "step": 56131 + }, + { + "epoch": 2.75, + "grad_norm": 0.7474145293235779, + "learning_rate": 1.014965612544928e-05, + "loss": 2.6743, + "step": 56132 + }, + { + "epoch": 2.75, + "grad_norm": 0.7893514633178711, + "learning_rate": 1.0145685610344544e-05, + "loss": 2.8199, + "step": 56133 + }, + { + "epoch": 2.75, + "grad_norm": 0.7323321104049683, + "learning_rate": 1.0141715858660305e-05, + "loss": 2.9556, + "step": 56134 + }, + { + "epoch": 2.75, + "grad_norm": 0.7719254493713379, + "learning_rate": 1.0137746870406983e-05, + "loss": 2.7592, + "step": 56135 + }, + { + "epoch": 2.75, + "grad_norm": 0.7276996970176697, + "learning_rate": 1.013377864559507e-05, + "loss": 2.6894, + "step": 56136 + }, + { + "epoch": 2.75, + "grad_norm": 0.750486433506012, + "learning_rate": 1.0129811184234993e-05, + "loss": 2.9752, + "step": 56137 + }, + { + "epoch": 2.75, + "grad_norm": 0.7667336463928223, + "learning_rate": 1.0125844486337242e-05, + "loss": 2.9073, + "step": 56138 + }, + { + "epoch": 2.75, + "grad_norm": 0.74593186378479, + "learning_rate": 1.012187855191221e-05, + "loss": 2.8232, + "step": 56139 + }, + { + "epoch": 2.75, + "grad_norm": 0.7445417046546936, + "learning_rate": 1.0117913380970355e-05, + "loss": 2.8888, + "step": 56140 + }, + { + "epoch": 2.75, + "grad_norm": 0.7170975208282471, + "learning_rate": 1.0113948973522101e-05, + "loss": 3.0145, + "step": 56141 + }, + { + "epoch": 2.75, + "grad_norm": 0.7595486640930176, + "learning_rate": 1.010998532957794e-05, + "loss": 2.8864, + "step": 56142 + }, + { + "epoch": 2.75, + "grad_norm": 0.7461397051811218, + "learning_rate": 1.0106022449148266e-05, + "loss": 2.9398, + "step": 56143 + }, + { + "epoch": 2.75, + "grad_norm": 0.7885101437568665, + "learning_rate": 1.01020603322436e-05, + "loss": 2.6727, + "step": 56144 + }, + { + "epoch": 2.75, + "grad_norm": 0.7303288578987122, + "learning_rate": 1.009809897887427e-05, + "loss": 2.9738, + "step": 56145 + }, + { + "epoch": 2.75, + "grad_norm": 0.7623375654220581, + "learning_rate": 1.0094138389050733e-05, + "loss": 3.0115, + "step": 56146 + }, + { + "epoch": 2.75, + "grad_norm": 0.9273803234100342, + "learning_rate": 1.009017856278348e-05, + "loss": 2.7699, + "step": 56147 + }, + { + "epoch": 2.75, + "grad_norm": 0.7486945390701294, + "learning_rate": 1.008621950008287e-05, + "loss": 3.0087, + "step": 56148 + }, + { + "epoch": 2.75, + "grad_norm": 0.7255769968032837, + "learning_rate": 1.0082261200959397e-05, + "loss": 2.9543, + "step": 56149 + }, + { + "epoch": 2.75, + "grad_norm": 0.7860700488090515, + "learning_rate": 1.0078303665423448e-05, + "loss": 2.8486, + "step": 56150 + }, + { + "epoch": 2.75, + "grad_norm": 0.7196409106254578, + "learning_rate": 1.0074346893485418e-05, + "loss": 2.7253, + "step": 56151 + }, + { + "epoch": 2.75, + "grad_norm": 0.7964147329330444, + "learning_rate": 1.00703908851558e-05, + "loss": 2.9744, + "step": 56152 + }, + { + "epoch": 2.75, + "grad_norm": 0.7458089590072632, + "learning_rate": 1.0066435640444947e-05, + "loss": 2.7989, + "step": 56153 + }, + { + "epoch": 2.75, + "grad_norm": 0.7688996195793152, + "learning_rate": 1.0062481159363356e-05, + "loss": 3.0695, + "step": 56154 + }, + { + "epoch": 2.75, + "grad_norm": 0.7817563414573669, + "learning_rate": 1.0058527441921316e-05, + "loss": 2.6904, + "step": 56155 + }, + { + "epoch": 2.75, + "grad_norm": 0.6953161358833313, + "learning_rate": 1.0054574488129318e-05, + "loss": 2.7915, + "step": 56156 + }, + { + "epoch": 2.75, + "grad_norm": 0.7510791420936584, + "learning_rate": 1.0050622297997824e-05, + "loss": 2.8286, + "step": 56157 + }, + { + "epoch": 2.75, + "grad_norm": 0.7296958565711975, + "learning_rate": 1.0046670871537154e-05, + "loss": 3.0983, + "step": 56158 + }, + { + "epoch": 2.75, + "grad_norm": 0.7343621850013733, + "learning_rate": 1.0042720208757771e-05, + "loss": 2.971, + "step": 56159 + }, + { + "epoch": 2.75, + "grad_norm": 0.7544857263565063, + "learning_rate": 1.0038770309670064e-05, + "loss": 2.8618, + "step": 56160 + }, + { + "epoch": 2.75, + "grad_norm": 0.7312347292900085, + "learning_rate": 1.0034821174284391e-05, + "loss": 2.8683, + "step": 56161 + }, + { + "epoch": 2.75, + "grad_norm": 0.723470151424408, + "learning_rate": 1.0030872802611245e-05, + "loss": 3.1332, + "step": 56162 + }, + { + "epoch": 2.75, + "grad_norm": 0.7472372055053711, + "learning_rate": 1.0026925194660918e-05, + "loss": 2.6761, + "step": 56163 + }, + { + "epoch": 2.75, + "grad_norm": 0.7491852641105652, + "learning_rate": 1.00229783504439e-05, + "loss": 2.791, + "step": 56164 + }, + { + "epoch": 2.75, + "grad_norm": 0.7424899339675903, + "learning_rate": 1.001903226997055e-05, + "loss": 2.9423, + "step": 56165 + }, + { + "epoch": 2.75, + "grad_norm": 0.7745535373687744, + "learning_rate": 1.0015086953251262e-05, + "loss": 2.9281, + "step": 56166 + }, + { + "epoch": 2.75, + "grad_norm": 0.8208742141723633, + "learning_rate": 1.0011142400296424e-05, + "loss": 2.8546, + "step": 56167 + }, + { + "epoch": 2.75, + "grad_norm": 0.8146986365318298, + "learning_rate": 1.0007198611116397e-05, + "loss": 2.9061, + "step": 56168 + }, + { + "epoch": 2.75, + "grad_norm": 0.7721244096755981, + "learning_rate": 1.0003255585721636e-05, + "loss": 3.1029, + "step": 56169 + }, + { + "epoch": 2.75, + "grad_norm": 0.709169328212738, + "learning_rate": 9.999313324122471e-06, + "loss": 2.8192, + "step": 56170 + }, + { + "epoch": 2.75, + "grad_norm": 0.7202749848365784, + "learning_rate": 9.99537182632929e-06, + "loss": 2.6902, + "step": 56171 + }, + { + "epoch": 2.75, + "grad_norm": 0.7772741913795471, + "learning_rate": 9.99143109235252e-06, + "loss": 3.0664, + "step": 56172 + }, + { + "epoch": 2.75, + "grad_norm": 0.7416417002677917, + "learning_rate": 9.987491122202484e-06, + "loss": 3.0866, + "step": 56173 + }, + { + "epoch": 2.75, + "grad_norm": 0.7932614088058472, + "learning_rate": 9.983551915889609e-06, + "loss": 2.6837, + "step": 56174 + }, + { + "epoch": 2.75, + "grad_norm": 0.7134155035018921, + "learning_rate": 9.97961347342422e-06, + "loss": 2.6047, + "step": 56175 + }, + { + "epoch": 2.75, + "grad_norm": 0.736059308052063, + "learning_rate": 9.975675794816707e-06, + "loss": 2.8903, + "step": 56176 + }, + { + "epoch": 2.75, + "grad_norm": 0.7584584951400757, + "learning_rate": 9.971738880077495e-06, + "loss": 2.9494, + "step": 56177 + }, + { + "epoch": 2.75, + "grad_norm": 0.6984370946884155, + "learning_rate": 9.967802729216844e-06, + "loss": 2.7811, + "step": 56178 + }, + { + "epoch": 2.75, + "grad_norm": 0.7537134885787964, + "learning_rate": 9.963867342245213e-06, + "loss": 2.9652, + "step": 56179 + }, + { + "epoch": 2.75, + "grad_norm": 0.7283973693847656, + "learning_rate": 9.959932719172924e-06, + "loss": 2.8149, + "step": 56180 + }, + { + "epoch": 2.75, + "grad_norm": 0.7228909730911255, + "learning_rate": 9.955998860010372e-06, + "loss": 3.0026, + "step": 56181 + }, + { + "epoch": 2.75, + "grad_norm": 0.8229911923408508, + "learning_rate": 9.952065764767913e-06, + "loss": 2.7358, + "step": 56182 + }, + { + "epoch": 2.75, + "grad_norm": 0.768810510635376, + "learning_rate": 9.94813343345584e-06, + "loss": 2.9345, + "step": 56183 + }, + { + "epoch": 2.75, + "grad_norm": 0.7733370065689087, + "learning_rate": 9.944201866084612e-06, + "loss": 2.9988, + "step": 56184 + }, + { + "epoch": 2.75, + "grad_norm": 0.7900514602661133, + "learning_rate": 9.940271062664484e-06, + "loss": 2.9167, + "step": 56185 + }, + { + "epoch": 2.75, + "grad_norm": 0.797058641910553, + "learning_rate": 9.936341023205918e-06, + "loss": 2.7364, + "step": 56186 + }, + { + "epoch": 2.75, + "grad_norm": 0.766906201839447, + "learning_rate": 9.932411747719171e-06, + "loss": 2.899, + "step": 56187 + }, + { + "epoch": 2.75, + "grad_norm": 0.7349963784217834, + "learning_rate": 9.928483236214634e-06, + "loss": 2.7932, + "step": 56188 + }, + { + "epoch": 2.75, + "grad_norm": 0.808515191078186, + "learning_rate": 9.924555488702667e-06, + "loss": 2.8549, + "step": 56189 + }, + { + "epoch": 2.75, + "grad_norm": 0.7727739810943604, + "learning_rate": 9.920628505193529e-06, + "loss": 2.8072, + "step": 56190 + }, + { + "epoch": 2.75, + "grad_norm": 0.761241614818573, + "learning_rate": 9.916702285697708e-06, + "loss": 2.6368, + "step": 56191 + }, + { + "epoch": 2.75, + "grad_norm": 0.7611913681030273, + "learning_rate": 9.9127768302254e-06, + "loss": 2.9305, + "step": 56192 + }, + { + "epoch": 2.75, + "grad_norm": 0.7446513772010803, + "learning_rate": 9.908852138787027e-06, + "loss": 2.8921, + "step": 56193 + }, + { + "epoch": 2.75, + "grad_norm": 0.7649831771850586, + "learning_rate": 9.904928211392915e-06, + "loss": 2.7833, + "step": 56194 + }, + { + "epoch": 2.75, + "grad_norm": 0.7408809065818787, + "learning_rate": 9.90100504805339e-06, + "loss": 2.9076, + "step": 56195 + }, + { + "epoch": 2.75, + "grad_norm": 0.7453528642654419, + "learning_rate": 9.897082648778809e-06, + "loss": 2.7996, + "step": 56196 + }, + { + "epoch": 2.75, + "grad_norm": 0.7997094392776489, + "learning_rate": 9.893161013579499e-06, + "loss": 2.9272, + "step": 56197 + }, + { + "epoch": 2.75, + "grad_norm": 0.7552264928817749, + "learning_rate": 9.889240142465715e-06, + "loss": 2.9482, + "step": 56198 + }, + { + "epoch": 2.75, + "grad_norm": 0.7569772005081177, + "learning_rate": 9.885320035447853e-06, + "loss": 2.9521, + "step": 56199 + }, + { + "epoch": 2.75, + "grad_norm": 0.7403649091720581, + "learning_rate": 9.881400692536235e-06, + "loss": 2.9042, + "step": 56200 + }, + { + "epoch": 2.75, + "grad_norm": 0.7069177627563477, + "learning_rate": 9.877482113741186e-06, + "loss": 2.686, + "step": 56201 + }, + { + "epoch": 2.75, + "grad_norm": 0.7120211124420166, + "learning_rate": 9.873564299072968e-06, + "loss": 2.6873, + "step": 56202 + }, + { + "epoch": 2.75, + "grad_norm": 0.7966209650039673, + "learning_rate": 9.869647248542001e-06, + "loss": 2.9283, + "step": 56203 + }, + { + "epoch": 2.75, + "grad_norm": 0.6995888352394104, + "learning_rate": 9.865730962158546e-06, + "loss": 3.067, + "step": 56204 + }, + { + "epoch": 2.75, + "grad_norm": 0.7135257124900818, + "learning_rate": 9.861815439932864e-06, + "loss": 2.9348, + "step": 56205 + }, + { + "epoch": 2.75, + "grad_norm": 0.8033514022827148, + "learning_rate": 9.857900681875374e-06, + "loss": 2.8451, + "step": 56206 + }, + { + "epoch": 2.75, + "grad_norm": 0.8119677901268005, + "learning_rate": 9.853986687996273e-06, + "loss": 2.8265, + "step": 56207 + }, + { + "epoch": 2.75, + "grad_norm": 0.7657623291015625, + "learning_rate": 9.850073458305952e-06, + "loss": 2.9276, + "step": 56208 + }, + { + "epoch": 2.75, + "grad_norm": 0.7248706817626953, + "learning_rate": 9.846160992814734e-06, + "loss": 2.9206, + "step": 56209 + }, + { + "epoch": 2.75, + "grad_norm": 0.7554598450660706, + "learning_rate": 9.842249291532878e-06, + "loss": 2.9856, + "step": 56210 + }, + { + "epoch": 2.75, + "grad_norm": 0.789419949054718, + "learning_rate": 9.838338354470675e-06, + "loss": 3.0536, + "step": 56211 + }, + { + "epoch": 2.75, + "grad_norm": 0.7719467282295227, + "learning_rate": 9.834428181638487e-06, + "loss": 2.8967, + "step": 56212 + }, + { + "epoch": 2.75, + "grad_norm": 0.7833728194236755, + "learning_rate": 9.830518773046536e-06, + "loss": 2.6544, + "step": 56213 + }, + { + "epoch": 2.75, + "grad_norm": 0.7524414658546448, + "learning_rate": 9.826610128705148e-06, + "loss": 2.7656, + "step": 56214 + }, + { + "epoch": 2.75, + "grad_norm": 0.7353940606117249, + "learning_rate": 9.822702248624614e-06, + "loss": 2.8325, + "step": 56215 + }, + { + "epoch": 2.76, + "grad_norm": 0.7165583372116089, + "learning_rate": 9.818795132815294e-06, + "loss": 2.8988, + "step": 56216 + }, + { + "epoch": 2.76, + "grad_norm": 0.6943876147270203, + "learning_rate": 9.814888781287378e-06, + "loss": 2.8794, + "step": 56217 + }, + { + "epoch": 2.76, + "grad_norm": 0.7502477765083313, + "learning_rate": 9.810983194051193e-06, + "loss": 2.8726, + "step": 56218 + }, + { + "epoch": 2.76, + "grad_norm": 0.7636768817901611, + "learning_rate": 9.80707837111706e-06, + "loss": 2.9083, + "step": 56219 + }, + { + "epoch": 2.76, + "grad_norm": 0.7751126885414124, + "learning_rate": 9.80317431249521e-06, + "loss": 2.9759, + "step": 56220 + }, + { + "epoch": 2.76, + "grad_norm": 0.7540944218635559, + "learning_rate": 9.799271018195964e-06, + "loss": 2.8304, + "step": 56221 + }, + { + "epoch": 2.76, + "grad_norm": 0.7522205114364624, + "learning_rate": 9.795368488229583e-06, + "loss": 2.9295, + "step": 56222 + }, + { + "epoch": 2.76, + "grad_norm": 0.7293267846107483, + "learning_rate": 9.791466722606323e-06, + "loss": 3.1562, + "step": 56223 + }, + { + "epoch": 2.76, + "grad_norm": 0.7595233917236328, + "learning_rate": 9.78756572133651e-06, + "loss": 2.8134, + "step": 56224 + }, + { + "epoch": 2.76, + "grad_norm": 0.7647051215171814, + "learning_rate": 9.783665484430436e-06, + "loss": 2.8663, + "step": 56225 + }, + { + "epoch": 2.76, + "grad_norm": 0.7567137479782104, + "learning_rate": 9.779766011898293e-06, + "loss": 2.9385, + "step": 56226 + }, + { + "epoch": 2.76, + "grad_norm": 0.7509698271751404, + "learning_rate": 9.775867303750372e-06, + "loss": 3.0103, + "step": 56227 + }, + { + "epoch": 2.76, + "grad_norm": 0.7402850389480591, + "learning_rate": 9.771969359996967e-06, + "loss": 2.8077, + "step": 56228 + }, + { + "epoch": 2.76, + "grad_norm": 0.7876865267753601, + "learning_rate": 9.768072180648401e-06, + "loss": 2.8664, + "step": 56229 + }, + { + "epoch": 2.76, + "grad_norm": 0.7342726588249207, + "learning_rate": 9.7641757657148e-06, + "loss": 3.1046, + "step": 56230 + }, + { + "epoch": 2.76, + "grad_norm": 0.8054348826408386, + "learning_rate": 9.760280115206553e-06, + "loss": 3.0441, + "step": 56231 + }, + { + "epoch": 2.76, + "grad_norm": 0.7675466537475586, + "learning_rate": 9.756385229133823e-06, + "loss": 2.9284, + "step": 56232 + }, + { + "epoch": 2.76, + "grad_norm": 0.7613694071769714, + "learning_rate": 9.752491107506966e-06, + "loss": 2.923, + "step": 56233 + }, + { + "epoch": 2.76, + "grad_norm": 0.7603533267974854, + "learning_rate": 9.748597750336207e-06, + "loss": 2.7409, + "step": 56234 + }, + { + "epoch": 2.76, + "grad_norm": 0.783909261226654, + "learning_rate": 9.744705157631705e-06, + "loss": 2.7609, + "step": 56235 + }, + { + "epoch": 2.76, + "grad_norm": 0.7282220721244812, + "learning_rate": 9.740813329403851e-06, + "loss": 2.8911, + "step": 56236 + }, + { + "epoch": 2.76, + "grad_norm": 0.7373564839363098, + "learning_rate": 9.736922265662805e-06, + "loss": 2.8638, + "step": 56237 + }, + { + "epoch": 2.76, + "grad_norm": 0.7997520565986633, + "learning_rate": 9.733031966418826e-06, + "loss": 2.9606, + "step": 56238 + }, + { + "epoch": 2.76, + "grad_norm": 0.7475751638412476, + "learning_rate": 9.729142431682201e-06, + "loss": 2.7857, + "step": 56239 + }, + { + "epoch": 2.76, + "grad_norm": 0.7757993936538696, + "learning_rate": 9.725253661463161e-06, + "loss": 2.6651, + "step": 56240 + }, + { + "epoch": 2.76, + "grad_norm": 0.7897785902023315, + "learning_rate": 9.721365655771962e-06, + "loss": 2.7296, + "step": 56241 + }, + { + "epoch": 2.76, + "grad_norm": 0.7303987145423889, + "learning_rate": 9.717478414618763e-06, + "loss": 2.9488, + "step": 56242 + }, + { + "epoch": 2.76, + "grad_norm": 0.7232341170310974, + "learning_rate": 9.713591938013854e-06, + "loss": 2.8354, + "step": 56243 + }, + { + "epoch": 2.76, + "grad_norm": 0.7750096321105957, + "learning_rate": 9.709706225967529e-06, + "loss": 3.0524, + "step": 56244 + }, + { + "epoch": 2.76, + "grad_norm": 0.7706706523895264, + "learning_rate": 9.705821278489912e-06, + "loss": 2.8424, + "step": 56245 + }, + { + "epoch": 2.76, + "grad_norm": 0.8001003265380859, + "learning_rate": 9.701937095591362e-06, + "loss": 2.8951, + "step": 56246 + }, + { + "epoch": 2.76, + "grad_norm": 0.7747387290000916, + "learning_rate": 9.698053677282037e-06, + "loss": 2.9802, + "step": 56247 + }, + { + "epoch": 2.76, + "grad_norm": 0.7447709441184998, + "learning_rate": 9.694171023572095e-06, + "loss": 2.7805, + "step": 56248 + }, + { + "epoch": 2.76, + "grad_norm": 0.7555181384086609, + "learning_rate": 9.690289134471896e-06, + "loss": 2.9242, + "step": 56249 + }, + { + "epoch": 2.76, + "grad_norm": 0.7425608038902283, + "learning_rate": 9.686408009991565e-06, + "loss": 2.7431, + "step": 56250 + }, + { + "epoch": 2.76, + "grad_norm": 0.7470229864120483, + "learning_rate": 9.682527650141426e-06, + "loss": 2.8323, + "step": 56251 + }, + { + "epoch": 2.76, + "grad_norm": 0.7438477873802185, + "learning_rate": 9.678648054931571e-06, + "loss": 2.9314, + "step": 56252 + }, + { + "epoch": 2.76, + "grad_norm": 0.7268569469451904, + "learning_rate": 9.674769224372292e-06, + "loss": 2.9639, + "step": 56253 + }, + { + "epoch": 2.76, + "grad_norm": 0.7507839798927307, + "learning_rate": 9.670891158473814e-06, + "loss": 2.7267, + "step": 56254 + }, + { + "epoch": 2.76, + "grad_norm": 0.7674872875213623, + "learning_rate": 9.667013857246363e-06, + "loss": 2.801, + "step": 56255 + }, + { + "epoch": 2.76, + "grad_norm": 0.7318063974380493, + "learning_rate": 9.663137320700098e-06, + "loss": 2.9449, + "step": 56256 + }, + { + "epoch": 2.76, + "grad_norm": 0.7555484771728516, + "learning_rate": 9.659261548845243e-06, + "loss": 2.8206, + "step": 56257 + }, + { + "epoch": 2.76, + "grad_norm": 0.8074846267700195, + "learning_rate": 9.655386541691989e-06, + "loss": 2.9654, + "step": 56258 + }, + { + "epoch": 2.76, + "grad_norm": 0.7630916833877563, + "learning_rate": 9.65151229925063e-06, + "loss": 2.931, + "step": 56259 + }, + { + "epoch": 2.76, + "grad_norm": 0.7579756379127502, + "learning_rate": 9.647638821531256e-06, + "loss": 3.0187, + "step": 56260 + }, + { + "epoch": 2.76, + "grad_norm": 0.7585728764533997, + "learning_rate": 9.64376610854416e-06, + "loss": 2.6944, + "step": 56261 + }, + { + "epoch": 2.76, + "grad_norm": 0.7346169352531433, + "learning_rate": 9.639894160299499e-06, + "loss": 2.5751, + "step": 56262 + }, + { + "epoch": 2.76, + "grad_norm": 0.7276310920715332, + "learning_rate": 9.636022976807467e-06, + "loss": 2.8546, + "step": 56263 + }, + { + "epoch": 2.76, + "grad_norm": 0.7806369066238403, + "learning_rate": 9.632152558078287e-06, + "loss": 2.9523, + "step": 56264 + }, + { + "epoch": 2.76, + "grad_norm": 0.7958837747573853, + "learning_rate": 9.628282904122087e-06, + "loss": 2.7377, + "step": 56265 + }, + { + "epoch": 2.76, + "grad_norm": 0.7492504715919495, + "learning_rate": 9.624414014949156e-06, + "loss": 3.0474, + "step": 56266 + }, + { + "epoch": 2.76, + "grad_norm": 0.7660962343215942, + "learning_rate": 9.620545890569587e-06, + "loss": 3.0414, + "step": 56267 + }, + { + "epoch": 2.76, + "grad_norm": 0.775435745716095, + "learning_rate": 9.616678530993671e-06, + "loss": 3.2265, + "step": 56268 + }, + { + "epoch": 2.76, + "grad_norm": 0.7225391268730164, + "learning_rate": 9.6128119362315e-06, + "loss": 2.8305, + "step": 56269 + }, + { + "epoch": 2.76, + "grad_norm": 0.7277866005897522, + "learning_rate": 9.608946106293302e-06, + "loss": 2.715, + "step": 56270 + }, + { + "epoch": 2.76, + "grad_norm": 0.7221185564994812, + "learning_rate": 9.605081041189266e-06, + "loss": 2.7211, + "step": 56271 + }, + { + "epoch": 2.76, + "grad_norm": 0.7964680194854736, + "learning_rate": 9.60121674092955e-06, + "loss": 2.9876, + "step": 56272 + }, + { + "epoch": 2.76, + "grad_norm": 0.7531980276107788, + "learning_rate": 9.59735320552435e-06, + "loss": 2.7597, + "step": 56273 + }, + { + "epoch": 2.76, + "grad_norm": 0.74334716796875, + "learning_rate": 9.593490434983787e-06, + "loss": 2.8664, + "step": 56274 + }, + { + "epoch": 2.76, + "grad_norm": 0.7619105577468872, + "learning_rate": 9.58962842931812e-06, + "loss": 2.6577, + "step": 56275 + }, + { + "epoch": 2.76, + "grad_norm": 0.8216807842254639, + "learning_rate": 9.585767188537474e-06, + "loss": 2.9393, + "step": 56276 + }, + { + "epoch": 2.76, + "grad_norm": 0.7856898903846741, + "learning_rate": 9.581906712652043e-06, + "loss": 2.8374, + "step": 56277 + }, + { + "epoch": 2.76, + "grad_norm": 0.7424861788749695, + "learning_rate": 9.578047001671984e-06, + "loss": 2.8822, + "step": 56278 + }, + { + "epoch": 2.76, + "grad_norm": 0.758293092250824, + "learning_rate": 9.574188055607423e-06, + "loss": 2.972, + "step": 56279 + }, + { + "epoch": 2.76, + "grad_norm": 0.7339947819709778, + "learning_rate": 9.570329874468553e-06, + "loss": 3.113, + "step": 56280 + }, + { + "epoch": 2.76, + "grad_norm": 0.7537075281143188, + "learning_rate": 9.566472458265594e-06, + "loss": 2.7329, + "step": 56281 + }, + { + "epoch": 2.76, + "grad_norm": 0.7673822045326233, + "learning_rate": 9.562615807008577e-06, + "loss": 3.0186, + "step": 56282 + }, + { + "epoch": 2.76, + "grad_norm": 0.7142574787139893, + "learning_rate": 9.558759920707792e-06, + "loss": 2.6684, + "step": 56283 + }, + { + "epoch": 2.76, + "grad_norm": 0.7520612478256226, + "learning_rate": 9.55490479937333e-06, + "loss": 2.6922, + "step": 56284 + }, + { + "epoch": 2.76, + "grad_norm": 0.7886697053909302, + "learning_rate": 9.551050443015318e-06, + "loss": 2.95, + "step": 56285 + }, + { + "epoch": 2.76, + "grad_norm": 0.7925397157669067, + "learning_rate": 9.54719685164398e-06, + "loss": 2.953, + "step": 56286 + }, + { + "epoch": 2.76, + "grad_norm": 0.7573472261428833, + "learning_rate": 9.543344025269406e-06, + "loss": 2.975, + "step": 56287 + }, + { + "epoch": 2.76, + "grad_norm": 0.7683876752853394, + "learning_rate": 9.539491963901757e-06, + "loss": 2.8721, + "step": 56288 + }, + { + "epoch": 2.76, + "grad_norm": 0.7631601095199585, + "learning_rate": 9.535640667551192e-06, + "loss": 3.1299, + "step": 56289 + }, + { + "epoch": 2.76, + "grad_norm": 0.7261856198310852, + "learning_rate": 9.531790136227869e-06, + "loss": 2.7541, + "step": 56290 + }, + { + "epoch": 2.76, + "grad_norm": 0.7447924017906189, + "learning_rate": 9.527940369941911e-06, + "loss": 2.9447, + "step": 56291 + }, + { + "epoch": 2.76, + "grad_norm": 0.7883365154266357, + "learning_rate": 9.524091368703413e-06, + "loss": 2.8184, + "step": 56292 + }, + { + "epoch": 2.76, + "grad_norm": 0.7671542763710022, + "learning_rate": 9.520243132522597e-06, + "loss": 2.9793, + "step": 56293 + }, + { + "epoch": 2.76, + "grad_norm": 0.7295643091201782, + "learning_rate": 9.516395661409526e-06, + "loss": 3.0381, + "step": 56294 + }, + { + "epoch": 2.76, + "grad_norm": 0.7493358850479126, + "learning_rate": 9.512548955374355e-06, + "loss": 2.9604, + "step": 56295 + }, + { + "epoch": 2.76, + "grad_norm": 0.7209811806678772, + "learning_rate": 9.508703014427278e-06, + "loss": 3.0627, + "step": 56296 + }, + { + "epoch": 2.76, + "grad_norm": 0.7741036415100098, + "learning_rate": 9.504857838578317e-06, + "loss": 2.9489, + "step": 56297 + }, + { + "epoch": 2.76, + "grad_norm": 0.7175572514533997, + "learning_rate": 9.501013427837666e-06, + "loss": 2.8748, + "step": 56298 + }, + { + "epoch": 2.76, + "grad_norm": 0.7129703760147095, + "learning_rate": 9.497169782215485e-06, + "loss": 2.9452, + "step": 56299 + }, + { + "epoch": 2.76, + "grad_norm": 0.7331249117851257, + "learning_rate": 9.493326901721798e-06, + "loss": 2.7358, + "step": 56300 + }, + { + "epoch": 2.76, + "grad_norm": 0.738678514957428, + "learning_rate": 9.489484786366797e-06, + "loss": 2.7248, + "step": 56301 + }, + { + "epoch": 2.76, + "grad_norm": 0.7397850751876831, + "learning_rate": 9.485643436160572e-06, + "loss": 2.9859, + "step": 56302 + }, + { + "epoch": 2.76, + "grad_norm": 0.7723121643066406, + "learning_rate": 9.48180285111325e-06, + "loss": 3.1972, + "step": 56303 + }, + { + "epoch": 2.76, + "grad_norm": 0.7921167612075806, + "learning_rate": 9.477963031234958e-06, + "loss": 2.9502, + "step": 56304 + }, + { + "epoch": 2.76, + "grad_norm": 0.7509664297103882, + "learning_rate": 9.474123976535819e-06, + "loss": 2.9147, + "step": 56305 + }, + { + "epoch": 2.76, + "grad_norm": 0.8044818043708801, + "learning_rate": 9.470285687025925e-06, + "loss": 3.1336, + "step": 56306 + }, + { + "epoch": 2.76, + "grad_norm": 0.7625141143798828, + "learning_rate": 9.466448162715335e-06, + "loss": 2.882, + "step": 56307 + }, + { + "epoch": 2.76, + "grad_norm": 0.7402327656745911, + "learning_rate": 9.46261140361424e-06, + "loss": 2.8615, + "step": 56308 + }, + { + "epoch": 2.76, + "grad_norm": 0.7257512807846069, + "learning_rate": 9.4587754097327e-06, + "loss": 3.0802, + "step": 56309 + }, + { + "epoch": 2.76, + "grad_norm": 0.7531664371490479, + "learning_rate": 9.454940181080839e-06, + "loss": 2.9292, + "step": 56310 + }, + { + "epoch": 2.76, + "grad_norm": 0.7305158972740173, + "learning_rate": 9.45110571766875e-06, + "loss": 2.9042, + "step": 56311 + }, + { + "epoch": 2.76, + "grad_norm": 0.7849349975585938, + "learning_rate": 9.447272019506524e-06, + "loss": 2.812, + "step": 56312 + }, + { + "epoch": 2.76, + "grad_norm": 0.7790784239768982, + "learning_rate": 9.443439086604287e-06, + "loss": 2.886, + "step": 56313 + }, + { + "epoch": 2.76, + "grad_norm": 0.7468773126602173, + "learning_rate": 9.439606918972097e-06, + "loss": 2.9739, + "step": 56314 + }, + { + "epoch": 2.76, + "grad_norm": 0.7422334551811218, + "learning_rate": 9.435775516620049e-06, + "loss": 2.983, + "step": 56315 + }, + { + "epoch": 2.76, + "grad_norm": 0.7719339728355408, + "learning_rate": 9.431944879558263e-06, + "loss": 2.9528, + "step": 56316 + }, + { + "epoch": 2.76, + "grad_norm": 0.7582297325134277, + "learning_rate": 9.428115007796799e-06, + "loss": 2.8616, + "step": 56317 + }, + { + "epoch": 2.76, + "grad_norm": 0.7723546624183655, + "learning_rate": 9.424285901345785e-06, + "loss": 2.8218, + "step": 56318 + }, + { + "epoch": 2.76, + "grad_norm": 0.7500159740447998, + "learning_rate": 9.420457560215245e-06, + "loss": 2.9789, + "step": 56319 + }, + { + "epoch": 2.76, + "grad_norm": 0.7486183047294617, + "learning_rate": 9.416629984415335e-06, + "loss": 2.8109, + "step": 56320 + }, + { + "epoch": 2.76, + "grad_norm": 0.7632740139961243, + "learning_rate": 9.412803173956085e-06, + "loss": 2.9913, + "step": 56321 + }, + { + "epoch": 2.76, + "grad_norm": 0.7619692087173462, + "learning_rate": 9.408977128847584e-06, + "loss": 3.0314, + "step": 56322 + }, + { + "epoch": 2.76, + "grad_norm": 0.7509559392929077, + "learning_rate": 9.405151849099923e-06, + "loss": 2.7083, + "step": 56323 + }, + { + "epoch": 2.76, + "grad_norm": 0.7356145977973938, + "learning_rate": 9.40132733472313e-06, + "loss": 2.7204, + "step": 56324 + }, + { + "epoch": 2.76, + "grad_norm": 0.7453984618186951, + "learning_rate": 9.397503585727328e-06, + "loss": 2.845, + "step": 56325 + }, + { + "epoch": 2.76, + "grad_norm": 0.7576645612716675, + "learning_rate": 9.393680602122578e-06, + "loss": 2.8996, + "step": 56326 + }, + { + "epoch": 2.76, + "grad_norm": 0.7218955755233765, + "learning_rate": 9.389858383918969e-06, + "loss": 2.8574, + "step": 56327 + }, + { + "epoch": 2.76, + "grad_norm": 0.7386839389801025, + "learning_rate": 9.386036931126529e-06, + "loss": 2.9469, + "step": 56328 + }, + { + "epoch": 2.76, + "grad_norm": 0.7222349047660828, + "learning_rate": 9.382216243755313e-06, + "loss": 2.7775, + "step": 56329 + }, + { + "epoch": 2.76, + "grad_norm": 0.7363526225090027, + "learning_rate": 9.378396321815419e-06, + "loss": 2.9486, + "step": 56330 + }, + { + "epoch": 2.76, + "grad_norm": 0.7235442996025085, + "learning_rate": 9.374577165316932e-06, + "loss": 3.037, + "step": 56331 + }, + { + "epoch": 2.76, + "grad_norm": 0.7777265310287476, + "learning_rate": 9.37075877426985e-06, + "loss": 2.9327, + "step": 56332 + }, + { + "epoch": 2.76, + "grad_norm": 0.7499440908432007, + "learning_rate": 9.36694114868426e-06, + "loss": 3.0098, + "step": 56333 + }, + { + "epoch": 2.76, + "grad_norm": 0.7149835824966431, + "learning_rate": 9.363124288570189e-06, + "loss": 2.913, + "step": 56334 + }, + { + "epoch": 2.76, + "grad_norm": 0.7982455492019653, + "learning_rate": 9.359308193937765e-06, + "loss": 3.024, + "step": 56335 + }, + { + "epoch": 2.76, + "grad_norm": 0.7221076488494873, + "learning_rate": 9.355492864796977e-06, + "loss": 2.7995, + "step": 56336 + }, + { + "epoch": 2.76, + "grad_norm": 0.6967481970787048, + "learning_rate": 9.35167830115785e-06, + "loss": 3.0547, + "step": 56337 + }, + { + "epoch": 2.76, + "grad_norm": 0.7800359129905701, + "learning_rate": 9.347864503030511e-06, + "loss": 2.9932, + "step": 56338 + }, + { + "epoch": 2.76, + "grad_norm": 0.7424789071083069, + "learning_rate": 9.344051470424884e-06, + "loss": 2.8447, + "step": 56339 + }, + { + "epoch": 2.76, + "grad_norm": 0.7663801908493042, + "learning_rate": 9.340239203351129e-06, + "loss": 2.9393, + "step": 56340 + }, + { + "epoch": 2.76, + "grad_norm": 0.7594396471977234, + "learning_rate": 9.336427701819272e-06, + "loss": 2.8301, + "step": 56341 + }, + { + "epoch": 2.76, + "grad_norm": 0.7261060476303101, + "learning_rate": 9.332616965839335e-06, + "loss": 2.8903, + "step": 56342 + }, + { + "epoch": 2.76, + "grad_norm": 0.7838093042373657, + "learning_rate": 9.328806995421311e-06, + "loss": 2.7609, + "step": 56343 + }, + { + "epoch": 2.76, + "grad_norm": 0.7690815925598145, + "learning_rate": 9.324997790575262e-06, + "loss": 2.8137, + "step": 56344 + }, + { + "epoch": 2.76, + "grad_norm": 0.7286287546157837, + "learning_rate": 9.321189351311275e-06, + "loss": 2.7991, + "step": 56345 + }, + { + "epoch": 2.76, + "grad_norm": 0.7787858843803406, + "learning_rate": 9.31738167763928e-06, + "loss": 3.0519, + "step": 56346 + }, + { + "epoch": 2.76, + "grad_norm": 0.7564954161643982, + "learning_rate": 9.313574769569366e-06, + "loss": 2.8195, + "step": 56347 + }, + { + "epoch": 2.76, + "grad_norm": 0.7487925887107849, + "learning_rate": 9.309768627111558e-06, + "loss": 2.8282, + "step": 56348 + }, + { + "epoch": 2.76, + "grad_norm": 0.7704711556434631, + "learning_rate": 9.305963250275916e-06, + "loss": 2.811, + "step": 56349 + }, + { + "epoch": 2.76, + "grad_norm": 0.7544106841087341, + "learning_rate": 9.302158639072399e-06, + "loss": 3.0789, + "step": 56350 + }, + { + "epoch": 2.76, + "grad_norm": 0.7106571793556213, + "learning_rate": 9.298354793510998e-06, + "loss": 2.7505, + "step": 56351 + }, + { + "epoch": 2.76, + "grad_norm": 0.7845272421836853, + "learning_rate": 9.294551713601805e-06, + "loss": 2.916, + "step": 56352 + }, + { + "epoch": 2.76, + "grad_norm": 0.7282042503356934, + "learning_rate": 9.290749399354847e-06, + "loss": 3.0183, + "step": 56353 + }, + { + "epoch": 2.76, + "grad_norm": 0.7017480134963989, + "learning_rate": 9.28694785078008e-06, + "loss": 2.8869, + "step": 56354 + }, + { + "epoch": 2.76, + "grad_norm": 0.7535594701766968, + "learning_rate": 9.283147067887564e-06, + "loss": 2.8366, + "step": 56355 + }, + { + "epoch": 2.76, + "grad_norm": 0.7693657279014587, + "learning_rate": 9.279347050687291e-06, + "loss": 2.8678, + "step": 56356 + }, + { + "epoch": 2.76, + "grad_norm": 0.7219848036766052, + "learning_rate": 9.275547799189253e-06, + "loss": 2.8987, + "step": 56357 + }, + { + "epoch": 2.76, + "grad_norm": 0.7361370325088501, + "learning_rate": 9.271749313403509e-06, + "loss": 2.8727, + "step": 56358 + }, + { + "epoch": 2.76, + "grad_norm": 0.7325665950775146, + "learning_rate": 9.26795159333995e-06, + "loss": 2.8354, + "step": 56359 + }, + { + "epoch": 2.76, + "grad_norm": 0.7923938035964966, + "learning_rate": 9.264154639008736e-06, + "loss": 2.6835, + "step": 56360 + }, + { + "epoch": 2.76, + "grad_norm": 0.7122077345848083, + "learning_rate": 9.260358450419724e-06, + "loss": 2.8346, + "step": 56361 + }, + { + "epoch": 2.76, + "grad_norm": 0.7342128753662109, + "learning_rate": 9.256563027582975e-06, + "loss": 2.8962, + "step": 56362 + }, + { + "epoch": 2.76, + "grad_norm": 0.7379400730133057, + "learning_rate": 9.252768370508513e-06, + "loss": 2.8967, + "step": 56363 + }, + { + "epoch": 2.76, + "grad_norm": 0.7097499966621399, + "learning_rate": 9.248974479206328e-06, + "loss": 2.8588, + "step": 56364 + }, + { + "epoch": 2.76, + "grad_norm": 0.7544580101966858, + "learning_rate": 9.245181353686348e-06, + "loss": 2.9067, + "step": 56365 + }, + { + "epoch": 2.76, + "grad_norm": 0.7791709899902344, + "learning_rate": 9.241388993958598e-06, + "loss": 2.976, + "step": 56366 + }, + { + "epoch": 2.76, + "grad_norm": 0.8830820918083191, + "learning_rate": 9.23759740003307e-06, + "loss": 3.0506, + "step": 56367 + }, + { + "epoch": 2.76, + "grad_norm": 0.7556170225143433, + "learning_rate": 9.233806571919788e-06, + "loss": 2.7625, + "step": 56368 + }, + { + "epoch": 2.76, + "grad_norm": 0.7081811428070068, + "learning_rate": 9.230016509628679e-06, + "loss": 2.8684, + "step": 56369 + }, + { + "epoch": 2.76, + "grad_norm": 0.7261504530906677, + "learning_rate": 9.226227213169735e-06, + "loss": 2.8581, + "step": 56370 + }, + { + "epoch": 2.76, + "grad_norm": 0.768895149230957, + "learning_rate": 9.22243868255298e-06, + "loss": 2.9572, + "step": 56371 + }, + { + "epoch": 2.76, + "grad_norm": 0.7371829152107239, + "learning_rate": 9.218650917788339e-06, + "loss": 2.7464, + "step": 56372 + }, + { + "epoch": 2.76, + "grad_norm": 0.7369073033332825, + "learning_rate": 9.214863918885806e-06, + "loss": 2.6433, + "step": 56373 + }, + { + "epoch": 2.76, + "grad_norm": 0.7547151446342468, + "learning_rate": 9.211077685855372e-06, + "loss": 2.8662, + "step": 56374 + }, + { + "epoch": 2.76, + "grad_norm": 0.7417933344841003, + "learning_rate": 9.207292218706996e-06, + "loss": 3.0114, + "step": 56375 + }, + { + "epoch": 2.76, + "grad_norm": 0.7300156354904175, + "learning_rate": 9.203507517450603e-06, + "loss": 3.0027, + "step": 56376 + }, + { + "epoch": 2.76, + "grad_norm": 0.7735124230384827, + "learning_rate": 9.199723582096253e-06, + "loss": 2.8641, + "step": 56377 + }, + { + "epoch": 2.76, + "grad_norm": 0.7115468382835388, + "learning_rate": 9.195940412653868e-06, + "loss": 2.8722, + "step": 56378 + }, + { + "epoch": 2.76, + "grad_norm": 0.7530733942985535, + "learning_rate": 9.19215800913341e-06, + "loss": 2.9117, + "step": 56379 + }, + { + "epoch": 2.76, + "grad_norm": 0.74192875623703, + "learning_rate": 9.188376371544837e-06, + "loss": 2.8185, + "step": 56380 + }, + { + "epoch": 2.76, + "grad_norm": 0.7729009389877319, + "learning_rate": 9.184595499898073e-06, + "loss": 2.9038, + "step": 56381 + }, + { + "epoch": 2.76, + "grad_norm": 0.7162991166114807, + "learning_rate": 9.180815394203145e-06, + "loss": 3.1287, + "step": 56382 + }, + { + "epoch": 2.76, + "grad_norm": 0.7535189390182495, + "learning_rate": 9.177036054470011e-06, + "loss": 2.8736, + "step": 56383 + }, + { + "epoch": 2.76, + "grad_norm": 0.7650796175003052, + "learning_rate": 9.17325748070853e-06, + "loss": 3.0441, + "step": 56384 + }, + { + "epoch": 2.76, + "grad_norm": 0.7328662872314453, + "learning_rate": 9.169479672928759e-06, + "loss": 2.8645, + "step": 56385 + }, + { + "epoch": 2.76, + "grad_norm": 0.7376196384429932, + "learning_rate": 9.165702631140625e-06, + "loss": 2.7676, + "step": 56386 + }, + { + "epoch": 2.76, + "grad_norm": 0.7664865255355835, + "learning_rate": 9.161926355354022e-06, + "loss": 3.0326, + "step": 56387 + }, + { + "epoch": 2.76, + "grad_norm": 0.7669900059700012, + "learning_rate": 9.158150845578971e-06, + "loss": 2.8375, + "step": 56388 + }, + { + "epoch": 2.76, + "grad_norm": 0.7463096976280212, + "learning_rate": 9.154376101825333e-06, + "loss": 2.9926, + "step": 56389 + }, + { + "epoch": 2.76, + "grad_norm": 0.7658016085624695, + "learning_rate": 9.1506021241031e-06, + "loss": 2.7995, + "step": 56390 + }, + { + "epoch": 2.76, + "grad_norm": 0.7736062407493591, + "learning_rate": 9.146828912422199e-06, + "loss": 2.858, + "step": 56391 + }, + { + "epoch": 2.76, + "grad_norm": 0.7192699313163757, + "learning_rate": 9.14305646679262e-06, + "loss": 2.8394, + "step": 56392 + }, + { + "epoch": 2.76, + "grad_norm": 0.7871600985527039, + "learning_rate": 9.139284787224254e-06, + "loss": 2.9598, + "step": 56393 + }, + { + "epoch": 2.76, + "grad_norm": 0.7646672129631042, + "learning_rate": 9.135513873726996e-06, + "loss": 2.9829, + "step": 56394 + }, + { + "epoch": 2.76, + "grad_norm": 0.7649540305137634, + "learning_rate": 9.131743726310835e-06, + "loss": 3.1376, + "step": 56395 + }, + { + "epoch": 2.76, + "grad_norm": 0.7846724987030029, + "learning_rate": 9.127974344985634e-06, + "loss": 2.955, + "step": 56396 + }, + { + "epoch": 2.76, + "grad_norm": 0.7587399482727051, + "learning_rate": 9.124205729761414e-06, + "loss": 2.7941, + "step": 56397 + }, + { + "epoch": 2.76, + "grad_norm": 0.7872162461280823, + "learning_rate": 9.120437880648068e-06, + "loss": 2.9246, + "step": 56398 + }, + { + "epoch": 2.76, + "grad_norm": 0.7765682339668274, + "learning_rate": 9.116670797655456e-06, + "loss": 2.6701, + "step": 56399 + }, + { + "epoch": 2.76, + "grad_norm": 0.7208048701286316, + "learning_rate": 9.112904480793604e-06, + "loss": 2.915, + "step": 56400 + }, + { + "epoch": 2.76, + "grad_norm": 0.7376406192779541, + "learning_rate": 9.109138930072401e-06, + "loss": 3.0416, + "step": 56401 + }, + { + "epoch": 2.76, + "grad_norm": 0.7587045431137085, + "learning_rate": 9.105374145501676e-06, + "loss": 3.0432, + "step": 56402 + }, + { + "epoch": 2.76, + "grad_norm": 0.7954713106155396, + "learning_rate": 9.101610127091453e-06, + "loss": 2.8843, + "step": 56403 + }, + { + "epoch": 2.76, + "grad_norm": 0.783104419708252, + "learning_rate": 9.097846874851588e-06, + "loss": 3.0071, + "step": 56404 + }, + { + "epoch": 2.76, + "grad_norm": 0.7557005286216736, + "learning_rate": 9.094084388792012e-06, + "loss": 2.8479, + "step": 56405 + }, + { + "epoch": 2.76, + "grad_norm": 0.738347053527832, + "learning_rate": 9.090322668922612e-06, + "loss": 3.0269, + "step": 56406 + }, + { + "epoch": 2.76, + "grad_norm": 0.7137772440910339, + "learning_rate": 9.086561715253349e-06, + "loss": 2.7575, + "step": 56407 + }, + { + "epoch": 2.76, + "grad_norm": 0.7108136415481567, + "learning_rate": 9.082801527794115e-06, + "loss": 3.1663, + "step": 56408 + }, + { + "epoch": 2.76, + "grad_norm": 0.773616373538971, + "learning_rate": 9.079042106554734e-06, + "loss": 2.8878, + "step": 56409 + }, + { + "epoch": 2.76, + "grad_norm": 0.7863094806671143, + "learning_rate": 9.075283451545202e-06, + "loss": 3.0319, + "step": 56410 + }, + { + "epoch": 2.76, + "grad_norm": 0.7484992146492004, + "learning_rate": 9.071525562775373e-06, + "loss": 2.9823, + "step": 56411 + }, + { + "epoch": 2.76, + "grad_norm": 0.7688180804252625, + "learning_rate": 9.067768440255141e-06, + "loss": 2.727, + "step": 56412 + }, + { + "epoch": 2.76, + "grad_norm": 0.7612563967704773, + "learning_rate": 9.064012083994433e-06, + "loss": 2.9456, + "step": 56413 + }, + { + "epoch": 2.76, + "grad_norm": 0.7126185297966003, + "learning_rate": 9.060256494003105e-06, + "loss": 2.7921, + "step": 56414 + }, + { + "epoch": 2.76, + "grad_norm": 0.7291039824485779, + "learning_rate": 9.056501670291149e-06, + "loss": 2.8641, + "step": 56415 + }, + { + "epoch": 2.76, + "grad_norm": 0.7190093398094177, + "learning_rate": 9.052747612868294e-06, + "loss": 3.0766, + "step": 56416 + }, + { + "epoch": 2.76, + "grad_norm": 0.7553203701972961, + "learning_rate": 9.04899432174453e-06, + "loss": 2.7905, + "step": 56417 + }, + { + "epoch": 2.76, + "grad_norm": 0.7836436629295349, + "learning_rate": 9.045241796929713e-06, + "loss": 2.6367, + "step": 56418 + }, + { + "epoch": 2.76, + "grad_norm": 0.7655666470527649, + "learning_rate": 9.041490038433741e-06, + "loss": 2.9534, + "step": 56419 + }, + { + "epoch": 2.77, + "grad_norm": 0.7657229900360107, + "learning_rate": 9.037739046266501e-06, + "loss": 3.0535, + "step": 56420 + }, + { + "epoch": 2.77, + "grad_norm": 0.7775158882141113, + "learning_rate": 9.033988820437821e-06, + "loss": 2.8456, + "step": 56421 + }, + { + "epoch": 2.77, + "grad_norm": 0.773569643497467, + "learning_rate": 9.030239360957658e-06, + "loss": 2.8731, + "step": 56422 + }, + { + "epoch": 2.77, + "grad_norm": 0.7964552640914917, + "learning_rate": 9.02649066783584e-06, + "loss": 2.8055, + "step": 56423 + }, + { + "epoch": 2.77, + "grad_norm": 0.7493492960929871, + "learning_rate": 9.022742741082223e-06, + "loss": 2.7148, + "step": 56424 + }, + { + "epoch": 2.77, + "grad_norm": 0.7704905867576599, + "learning_rate": 9.018995580706733e-06, + "loss": 2.834, + "step": 56425 + }, + { + "epoch": 2.77, + "grad_norm": 0.773236095905304, + "learning_rate": 9.015249186719164e-06, + "loss": 2.7859, + "step": 56426 + }, + { + "epoch": 2.77, + "grad_norm": 0.7138835787773132, + "learning_rate": 9.011503559129474e-06, + "loss": 3.0637, + "step": 56427 + }, + { + "epoch": 2.77, + "grad_norm": 0.7717339396476746, + "learning_rate": 9.007758697947453e-06, + "loss": 3.0914, + "step": 56428 + }, + { + "epoch": 2.77, + "grad_norm": 0.7018489837646484, + "learning_rate": 9.004014603182996e-06, + "loss": 2.8645, + "step": 56429 + }, + { + "epoch": 2.77, + "grad_norm": 0.7404462099075317, + "learning_rate": 9.000271274845994e-06, + "loss": 2.8071, + "step": 56430 + }, + { + "epoch": 2.77, + "grad_norm": 0.74892258644104, + "learning_rate": 8.996528712946238e-06, + "loss": 3.0029, + "step": 56431 + }, + { + "epoch": 2.77, + "grad_norm": 0.8299066424369812, + "learning_rate": 8.992786917493622e-06, + "loss": 2.8981, + "step": 56432 + }, + { + "epoch": 2.77, + "grad_norm": 0.7275087237358093, + "learning_rate": 8.989045888498003e-06, + "loss": 2.8402, + "step": 56433 + }, + { + "epoch": 2.77, + "grad_norm": 0.73467618227005, + "learning_rate": 8.985305625969209e-06, + "loss": 2.5677, + "step": 56434 + }, + { + "epoch": 2.77, + "grad_norm": 0.7989902496337891, + "learning_rate": 8.981566129917162e-06, + "loss": 2.9348, + "step": 56435 + }, + { + "epoch": 2.77, + "grad_norm": 0.7334526777267456, + "learning_rate": 8.977827400351623e-06, + "loss": 2.8775, + "step": 56436 + }, + { + "epoch": 2.77, + "grad_norm": 0.7814991474151611, + "learning_rate": 8.974089437282517e-06, + "loss": 3.0359, + "step": 56437 + }, + { + "epoch": 2.77, + "grad_norm": 0.7895364165306091, + "learning_rate": 8.970352240719637e-06, + "loss": 3.0237, + "step": 56438 + }, + { + "epoch": 2.77, + "grad_norm": 0.7760238647460938, + "learning_rate": 8.966615810672805e-06, + "loss": 2.7975, + "step": 56439 + }, + { + "epoch": 2.77, + "grad_norm": 0.795097827911377, + "learning_rate": 8.962880147151951e-06, + "loss": 2.7911, + "step": 56440 + }, + { + "epoch": 2.77, + "grad_norm": 0.7412228584289551, + "learning_rate": 8.959145250166799e-06, + "loss": 2.7482, + "step": 56441 + }, + { + "epoch": 2.77, + "grad_norm": 0.7412148118019104, + "learning_rate": 8.955411119727307e-06, + "loss": 2.8579, + "step": 56442 + }, + { + "epoch": 2.77, + "grad_norm": 0.740685224533081, + "learning_rate": 8.951677755843201e-06, + "loss": 2.8466, + "step": 56443 + }, + { + "epoch": 2.77, + "grad_norm": 0.7448912262916565, + "learning_rate": 8.947945158524372e-06, + "loss": 2.7933, + "step": 56444 + }, + { + "epoch": 2.77, + "grad_norm": 0.7429138422012329, + "learning_rate": 8.94421332778068e-06, + "loss": 2.8862, + "step": 56445 + }, + { + "epoch": 2.77, + "grad_norm": 0.7522229552268982, + "learning_rate": 8.940482263621884e-06, + "loss": 2.8816, + "step": 56446 + }, + { + "epoch": 2.77, + "grad_norm": 0.7623513340950012, + "learning_rate": 8.936751966057842e-06, + "loss": 2.8082, + "step": 56447 + }, + { + "epoch": 2.77, + "grad_norm": 0.7645907402038574, + "learning_rate": 8.93302243509838e-06, + "loss": 2.9118, + "step": 56448 + }, + { + "epoch": 2.77, + "grad_norm": 0.7380853891372681, + "learning_rate": 8.92929367075329e-06, + "loss": 2.8021, + "step": 56449 + }, + { + "epoch": 2.77, + "grad_norm": 0.714244544506073, + "learning_rate": 8.925565673032465e-06, + "loss": 3.1272, + "step": 56450 + }, + { + "epoch": 2.77, + "grad_norm": 0.7148142457008362, + "learning_rate": 8.921838441945695e-06, + "loss": 2.8177, + "step": 56451 + }, + { + "epoch": 2.77, + "grad_norm": 0.7204702496528625, + "learning_rate": 8.918111977502772e-06, + "loss": 2.8349, + "step": 56452 + }, + { + "epoch": 2.77, + "grad_norm": 0.7750880718231201, + "learning_rate": 8.914386279713492e-06, + "loss": 3.0569, + "step": 56453 + }, + { + "epoch": 2.77, + "grad_norm": 0.7485977411270142, + "learning_rate": 8.910661348587711e-06, + "loss": 2.8497, + "step": 56454 + }, + { + "epoch": 2.77, + "grad_norm": 0.7468576431274414, + "learning_rate": 8.90693718413522e-06, + "loss": 2.9756, + "step": 56455 + }, + { + "epoch": 2.77, + "grad_norm": 0.7802996635437012, + "learning_rate": 8.903213786365847e-06, + "loss": 2.9238, + "step": 56456 + }, + { + "epoch": 2.77, + "grad_norm": 0.7729799747467041, + "learning_rate": 8.899491155289385e-06, + "loss": 2.8883, + "step": 56457 + }, + { + "epoch": 2.77, + "grad_norm": 0.7460412383079529, + "learning_rate": 8.895769290915621e-06, + "loss": 2.7785, + "step": 56458 + }, + { + "epoch": 2.77, + "grad_norm": 0.8159653544425964, + "learning_rate": 8.89204819325442e-06, + "loss": 2.9526, + "step": 56459 + }, + { + "epoch": 2.77, + "grad_norm": 0.7661879658699036, + "learning_rate": 8.888327862315536e-06, + "loss": 2.9368, + "step": 56460 + }, + { + "epoch": 2.77, + "grad_norm": 0.769864022731781, + "learning_rate": 8.884608298108731e-06, + "loss": 2.9032, + "step": 56461 + }, + { + "epoch": 2.77, + "grad_norm": 0.7339568734169006, + "learning_rate": 8.880889500643861e-06, + "loss": 2.8907, + "step": 56462 + }, + { + "epoch": 2.77, + "grad_norm": 0.7468345761299133, + "learning_rate": 8.877171469930689e-06, + "loss": 2.7929, + "step": 56463 + }, + { + "epoch": 2.77, + "grad_norm": 0.7140334844589233, + "learning_rate": 8.873454205979002e-06, + "loss": 2.695, + "step": 56464 + }, + { + "epoch": 2.77, + "grad_norm": 0.7177031636238098, + "learning_rate": 8.869737708798663e-06, + "loss": 2.9442, + "step": 56465 + }, + { + "epoch": 2.77, + "grad_norm": 0.7148962616920471, + "learning_rate": 8.866021978399396e-06, + "loss": 2.8238, + "step": 56466 + }, + { + "epoch": 2.77, + "grad_norm": 0.771285891532898, + "learning_rate": 8.862307014790992e-06, + "loss": 2.9251, + "step": 56467 + }, + { + "epoch": 2.77, + "grad_norm": 0.7293102741241455, + "learning_rate": 8.858592817983212e-06, + "loss": 3.0754, + "step": 56468 + }, + { + "epoch": 2.77, + "grad_norm": 0.776573657989502, + "learning_rate": 8.85487938798588e-06, + "loss": 2.9431, + "step": 56469 + }, + { + "epoch": 2.77, + "grad_norm": 0.7331776022911072, + "learning_rate": 8.85116672480879e-06, + "loss": 3.0407, + "step": 56470 + }, + { + "epoch": 2.77, + "grad_norm": 0.7505592703819275, + "learning_rate": 8.847454828461632e-06, + "loss": 2.9197, + "step": 56471 + }, + { + "epoch": 2.77, + "grad_norm": 0.7655132412910461, + "learning_rate": 8.843743698954297e-06, + "loss": 3.0714, + "step": 56472 + }, + { + "epoch": 2.77, + "grad_norm": 0.7211343050003052, + "learning_rate": 8.840033336296516e-06, + "loss": 2.8666, + "step": 56473 + }, + { + "epoch": 2.77, + "grad_norm": 0.7068425416946411, + "learning_rate": 8.836323740498009e-06, + "loss": 2.8191, + "step": 56474 + }, + { + "epoch": 2.77, + "grad_norm": 0.6717082262039185, + "learning_rate": 8.832614911568636e-06, + "loss": 2.8159, + "step": 56475 + }, + { + "epoch": 2.77, + "grad_norm": 0.7441225051879883, + "learning_rate": 8.828906849518092e-06, + "loss": 2.8976, + "step": 56476 + }, + { + "epoch": 2.77, + "grad_norm": 0.7472232580184937, + "learning_rate": 8.825199554356166e-06, + "loss": 2.7381, + "step": 56477 + }, + { + "epoch": 2.77, + "grad_norm": 0.7638667225837708, + "learning_rate": 8.821493026092619e-06, + "loss": 2.8284, + "step": 56478 + }, + { + "epoch": 2.77, + "grad_norm": 0.7200614213943481, + "learning_rate": 8.817787264737242e-06, + "loss": 2.9348, + "step": 56479 + }, + { + "epoch": 2.77, + "grad_norm": 0.7430014610290527, + "learning_rate": 8.81408227029976e-06, + "loss": 2.9065, + "step": 56480 + }, + { + "epoch": 2.77, + "grad_norm": 0.7274750471115112, + "learning_rate": 8.810378042789968e-06, + "loss": 2.7577, + "step": 56481 + }, + { + "epoch": 2.77, + "grad_norm": 0.7740485072135925, + "learning_rate": 8.80667458221762e-06, + "loss": 2.9349, + "step": 56482 + }, + { + "epoch": 2.77, + "grad_norm": 0.7242769598960876, + "learning_rate": 8.802971888592381e-06, + "loss": 2.9105, + "step": 56483 + }, + { + "epoch": 2.77, + "grad_norm": 0.7862998843193054, + "learning_rate": 8.799269961924104e-06, + "loss": 2.8458, + "step": 56484 + }, + { + "epoch": 2.77, + "grad_norm": 0.7596186399459839, + "learning_rate": 8.79556880222252e-06, + "loss": 2.8756, + "step": 56485 + }, + { + "epoch": 2.77, + "grad_norm": 0.7429347634315491, + "learning_rate": 8.791868409497316e-06, + "loss": 3.0002, + "step": 56486 + }, + { + "epoch": 2.77, + "grad_norm": 0.7703331708908081, + "learning_rate": 8.788168783758354e-06, + "loss": 2.9832, + "step": 56487 + }, + { + "epoch": 2.77, + "grad_norm": 0.7370572090148926, + "learning_rate": 8.784469925015292e-06, + "loss": 2.7631, + "step": 56488 + }, + { + "epoch": 2.77, + "grad_norm": 0.7434906959533691, + "learning_rate": 8.780771833277855e-06, + "loss": 2.8704, + "step": 56489 + }, + { + "epoch": 2.77, + "grad_norm": 0.7217363119125366, + "learning_rate": 8.777074508555871e-06, + "loss": 2.8909, + "step": 56490 + }, + { + "epoch": 2.77, + "grad_norm": 0.7547829747200012, + "learning_rate": 8.773377950858995e-06, + "loss": 2.8094, + "step": 56491 + }, + { + "epoch": 2.77, + "grad_norm": 0.7625556588172913, + "learning_rate": 8.769682160196989e-06, + "loss": 2.9916, + "step": 56492 + }, + { + "epoch": 2.77, + "grad_norm": 0.7367002964019775, + "learning_rate": 8.765987136579577e-06, + "loss": 2.7937, + "step": 56493 + }, + { + "epoch": 2.77, + "grad_norm": 0.7374213337898254, + "learning_rate": 8.762292880016553e-06, + "loss": 2.9097, + "step": 56494 + }, + { + "epoch": 2.77, + "grad_norm": 0.8057516813278198, + "learning_rate": 8.758599390517607e-06, + "loss": 2.7651, + "step": 56495 + }, + { + "epoch": 2.77, + "grad_norm": 0.7465009093284607, + "learning_rate": 8.754906668092398e-06, + "loss": 2.9281, + "step": 56496 + }, + { + "epoch": 2.77, + "grad_norm": 0.7300065755844116, + "learning_rate": 8.751214712750754e-06, + "loss": 2.9257, + "step": 56497 + }, + { + "epoch": 2.77, + "grad_norm": 0.7591533064842224, + "learning_rate": 8.747523524502332e-06, + "loss": 3.0157, + "step": 56498 + }, + { + "epoch": 2.77, + "grad_norm": 0.7155272960662842, + "learning_rate": 8.743833103356923e-06, + "loss": 2.881, + "step": 56499 + }, + { + "epoch": 2.77, + "grad_norm": 0.7301709651947021, + "learning_rate": 8.740143449324155e-06, + "loss": 3.1471, + "step": 56500 + }, + { + "epoch": 2.77, + "grad_norm": 0.7165641784667969, + "learning_rate": 8.736454562413786e-06, + "loss": 2.7683, + "step": 56501 + }, + { + "epoch": 2.77, + "grad_norm": 0.7362425923347473, + "learning_rate": 8.732766442635607e-06, + "loss": 2.9066, + "step": 56502 + }, + { + "epoch": 2.77, + "grad_norm": 0.7725816965103149, + "learning_rate": 8.729079089999246e-06, + "loss": 2.9066, + "step": 56503 + }, + { + "epoch": 2.77, + "grad_norm": 0.7778238654136658, + "learning_rate": 8.725392504514429e-06, + "loss": 2.8673, + "step": 56504 + }, + { + "epoch": 2.77, + "grad_norm": 0.7198028564453125, + "learning_rate": 8.721706686190843e-06, + "loss": 2.9211, + "step": 56505 + }, + { + "epoch": 2.77, + "grad_norm": 0.7817999720573425, + "learning_rate": 8.71802163503822e-06, + "loss": 2.8808, + "step": 56506 + }, + { + "epoch": 2.77, + "grad_norm": 0.7423765063285828, + "learning_rate": 8.714337351066314e-06, + "loss": 2.7144, + "step": 56507 + }, + { + "epoch": 2.77, + "grad_norm": 0.7737480998039246, + "learning_rate": 8.710653834284753e-06, + "loss": 2.9229, + "step": 56508 + }, + { + "epoch": 2.77, + "grad_norm": 0.7494272589683533, + "learning_rate": 8.706971084703295e-06, + "loss": 2.9935, + "step": 56509 + }, + { + "epoch": 2.77, + "grad_norm": 0.7207717895507812, + "learning_rate": 8.7032891023316e-06, + "loss": 3.0677, + "step": 56510 + }, + { + "epoch": 2.77, + "grad_norm": 0.7812270522117615, + "learning_rate": 8.699607887179361e-06, + "loss": 2.7982, + "step": 56511 + }, + { + "epoch": 2.77, + "grad_norm": 0.8005214929580688, + "learning_rate": 8.695927439256334e-06, + "loss": 2.6689, + "step": 56512 + }, + { + "epoch": 2.77, + "grad_norm": 0.7496790885925293, + "learning_rate": 8.692247758572146e-06, + "loss": 2.8616, + "step": 56513 + }, + { + "epoch": 2.77, + "grad_norm": 0.7499210238456726, + "learning_rate": 8.688568845136523e-06, + "loss": 2.803, + "step": 56514 + }, + { + "epoch": 2.77, + "grad_norm": 0.7113456130027771, + "learning_rate": 8.684890698959125e-06, + "loss": 2.8449, + "step": 56515 + }, + { + "epoch": 2.77, + "grad_norm": 0.7013689279556274, + "learning_rate": 8.681213320049674e-06, + "loss": 2.8038, + "step": 56516 + }, + { + "epoch": 2.77, + "grad_norm": 0.7134240865707397, + "learning_rate": 8.677536708417865e-06, + "loss": 2.8203, + "step": 56517 + }, + { + "epoch": 2.77, + "grad_norm": 0.7750184535980225, + "learning_rate": 8.673860864073323e-06, + "loss": 2.7887, + "step": 56518 + }, + { + "epoch": 2.77, + "grad_norm": 0.7498697638511658, + "learning_rate": 8.670185787025807e-06, + "loss": 3.0101, + "step": 56519 + }, + { + "epoch": 2.77, + "grad_norm": 0.7713249921798706, + "learning_rate": 8.66651147728491e-06, + "loss": 2.7569, + "step": 56520 + }, + { + "epoch": 2.77, + "grad_norm": 0.7590934038162231, + "learning_rate": 8.662837934860322e-06, + "loss": 2.8114, + "step": 56521 + }, + { + "epoch": 2.77, + "grad_norm": 0.729499101638794, + "learning_rate": 8.659165159761838e-06, + "loss": 2.8747, + "step": 56522 + }, + { + "epoch": 2.77, + "grad_norm": 0.7570987939834595, + "learning_rate": 8.655493151998983e-06, + "loss": 3.1679, + "step": 56523 + }, + { + "epoch": 2.77, + "grad_norm": 0.7177196145057678, + "learning_rate": 8.651821911581513e-06, + "loss": 2.9119, + "step": 56524 + }, + { + "epoch": 2.77, + "grad_norm": 0.7343611121177673, + "learning_rate": 8.648151438519091e-06, + "loss": 3.0303, + "step": 56525 + }, + { + "epoch": 2.77, + "grad_norm": 0.7677000164985657, + "learning_rate": 8.644481732821306e-06, + "loss": 2.9359, + "step": 56526 + }, + { + "epoch": 2.77, + "grad_norm": 0.7537042498588562, + "learning_rate": 8.640812794497953e-06, + "loss": 2.7783, + "step": 56527 + }, + { + "epoch": 2.77, + "grad_norm": 0.7486345171928406, + "learning_rate": 8.637144623558557e-06, + "loss": 2.7331, + "step": 56528 + }, + { + "epoch": 2.77, + "grad_norm": 0.7860642671585083, + "learning_rate": 8.633477220012908e-06, + "loss": 2.9114, + "step": 56529 + }, + { + "epoch": 2.77, + "grad_norm": 0.7259573340415955, + "learning_rate": 8.629810583870567e-06, + "loss": 2.9674, + "step": 56530 + }, + { + "epoch": 2.77, + "grad_norm": 0.7208335995674133, + "learning_rate": 8.62614471514126e-06, + "loss": 2.7895, + "step": 56531 + }, + { + "epoch": 2.77, + "grad_norm": 0.7486009001731873, + "learning_rate": 8.62247961383461e-06, + "loss": 2.948, + "step": 56532 + }, + { + "epoch": 2.77, + "grad_norm": 0.7948107719421387, + "learning_rate": 8.618815279960244e-06, + "loss": 2.8621, + "step": 56533 + }, + { + "epoch": 2.77, + "grad_norm": 0.7149216532707214, + "learning_rate": 8.615151713527857e-06, + "loss": 2.8832, + "step": 56534 + }, + { + "epoch": 2.77, + "grad_norm": 0.7751069068908691, + "learning_rate": 8.611488914547071e-06, + "loss": 2.9773, + "step": 56535 + }, + { + "epoch": 2.77, + "grad_norm": 0.7153540849685669, + "learning_rate": 8.607826883027547e-06, + "loss": 2.687, + "step": 56536 + }, + { + "epoch": 2.77, + "grad_norm": 0.7932222485542297, + "learning_rate": 8.604165618978942e-06, + "loss": 3.1045, + "step": 56537 + }, + { + "epoch": 2.77, + "grad_norm": 0.7211452126502991, + "learning_rate": 8.60050512241085e-06, + "loss": 2.8193, + "step": 56538 + }, + { + "epoch": 2.77, + "grad_norm": 0.7320334911346436, + "learning_rate": 8.596845393332997e-06, + "loss": 2.969, + "step": 56539 + }, + { + "epoch": 2.77, + "grad_norm": 0.7381606101989746, + "learning_rate": 8.593186431754973e-06, + "loss": 2.72, + "step": 56540 + }, + { + "epoch": 2.77, + "grad_norm": 0.7109737992286682, + "learning_rate": 8.589528237686372e-06, + "loss": 2.9596, + "step": 56541 + }, + { + "epoch": 2.77, + "grad_norm": 0.7556430101394653, + "learning_rate": 8.585870811136885e-06, + "loss": 3.0741, + "step": 56542 + }, + { + "epoch": 2.77, + "grad_norm": 0.7720571160316467, + "learning_rate": 8.58221415211614e-06, + "loss": 3.1015, + "step": 56543 + }, + { + "epoch": 2.77, + "grad_norm": 0.7691468000411987, + "learning_rate": 8.57855826063376e-06, + "loss": 2.952, + "step": 56544 + }, + { + "epoch": 2.77, + "grad_norm": 0.8022409677505493, + "learning_rate": 8.57490313669934e-06, + "loss": 2.8529, + "step": 56545 + }, + { + "epoch": 2.77, + "grad_norm": 0.7315953969955444, + "learning_rate": 8.57124878032257e-06, + "loss": 2.7401, + "step": 56546 + }, + { + "epoch": 2.77, + "grad_norm": 0.7592272162437439, + "learning_rate": 8.567595191513044e-06, + "loss": 2.9078, + "step": 56547 + }, + { + "epoch": 2.77, + "grad_norm": 0.7328471541404724, + "learning_rate": 8.563942370280386e-06, + "loss": 2.691, + "step": 56548 + }, + { + "epoch": 2.77, + "grad_norm": 0.7198687195777893, + "learning_rate": 8.560290316634222e-06, + "loss": 2.7965, + "step": 56549 + }, + { + "epoch": 2.77, + "grad_norm": 0.7822607159614563, + "learning_rate": 8.556639030584112e-06, + "loss": 2.9445, + "step": 56550 + }, + { + "epoch": 2.77, + "grad_norm": 0.789264976978302, + "learning_rate": 8.552988512139747e-06, + "loss": 2.85, + "step": 56551 + }, + { + "epoch": 2.77, + "grad_norm": 0.7752166390419006, + "learning_rate": 8.549338761310753e-06, + "loss": 2.541, + "step": 56552 + }, + { + "epoch": 2.77, + "grad_norm": 0.7067755460739136, + "learning_rate": 8.545689778106724e-06, + "loss": 2.9171, + "step": 56553 + }, + { + "epoch": 2.77, + "grad_norm": 0.7491674423217773, + "learning_rate": 8.54204156253725e-06, + "loss": 2.7476, + "step": 56554 + }, + { + "epoch": 2.77, + "grad_norm": 0.8012225031852722, + "learning_rate": 8.53839411461189e-06, + "loss": 2.9535, + "step": 56555 + }, + { + "epoch": 2.77, + "grad_norm": 0.7741601467132568, + "learning_rate": 8.534747434340306e-06, + "loss": 2.8989, + "step": 56556 + }, + { + "epoch": 2.77, + "grad_norm": 0.735129714012146, + "learning_rate": 8.531101521732153e-06, + "loss": 2.8754, + "step": 56557 + }, + { + "epoch": 2.77, + "grad_norm": 0.730719268321991, + "learning_rate": 8.527456376796959e-06, + "loss": 2.8079, + "step": 56558 + }, + { + "epoch": 2.77, + "grad_norm": 0.8632533550262451, + "learning_rate": 8.523811999544383e-06, + "loss": 2.8242, + "step": 56559 + }, + { + "epoch": 2.77, + "grad_norm": 0.7945080399513245, + "learning_rate": 8.52016838998395e-06, + "loss": 2.7674, + "step": 56560 + }, + { + "epoch": 2.77, + "grad_norm": 0.7971088290214539, + "learning_rate": 8.516525548125319e-06, + "loss": 2.7071, + "step": 56561 + }, + { + "epoch": 2.77, + "grad_norm": 0.7828199863433838, + "learning_rate": 8.512883473978083e-06, + "loss": 2.7697, + "step": 56562 + }, + { + "epoch": 2.77, + "grad_norm": 0.7509476542472839, + "learning_rate": 8.509242167551767e-06, + "loss": 2.7219, + "step": 56563 + }, + { + "epoch": 2.77, + "grad_norm": 0.7151806950569153, + "learning_rate": 8.505601628856062e-06, + "loss": 3.0501, + "step": 56564 + }, + { + "epoch": 2.77, + "grad_norm": 0.7569729089736938, + "learning_rate": 8.50196185790043e-06, + "loss": 2.9743, + "step": 56565 + }, + { + "epoch": 2.77, + "grad_norm": 0.7372108101844788, + "learning_rate": 8.498322854694562e-06, + "loss": 2.8322, + "step": 56566 + }, + { + "epoch": 2.77, + "grad_norm": 0.7561090588569641, + "learning_rate": 8.494684619248049e-06, + "loss": 2.9245, + "step": 56567 + }, + { + "epoch": 2.77, + "grad_norm": 0.7554269433021545, + "learning_rate": 8.491047151570418e-06, + "loss": 2.9634, + "step": 56568 + }, + { + "epoch": 2.77, + "grad_norm": 0.7970679402351379, + "learning_rate": 8.487410451671261e-06, + "loss": 2.8352, + "step": 56569 + }, + { + "epoch": 2.77, + "grad_norm": 0.7189784049987793, + "learning_rate": 8.483774519560137e-06, + "loss": 2.9002, + "step": 56570 + }, + { + "epoch": 2.77, + "grad_norm": 0.768104612827301, + "learning_rate": 8.480139355246674e-06, + "loss": 3.1014, + "step": 56571 + }, + { + "epoch": 2.77, + "grad_norm": 0.7429171800613403, + "learning_rate": 8.476504958740427e-06, + "loss": 2.9454, + "step": 56572 + }, + { + "epoch": 2.77, + "grad_norm": 0.7118484973907471, + "learning_rate": 8.472871330050923e-06, + "loss": 3.0559, + "step": 56573 + }, + { + "epoch": 2.77, + "grad_norm": 0.7167723178863525, + "learning_rate": 8.46923846918779e-06, + "loss": 2.8801, + "step": 56574 + }, + { + "epoch": 2.77, + "grad_norm": 0.749420702457428, + "learning_rate": 8.465606376160617e-06, + "loss": 2.8208, + "step": 56575 + }, + { + "epoch": 2.77, + "grad_norm": 0.8014590740203857, + "learning_rate": 8.461975050978898e-06, + "loss": 2.9306, + "step": 56576 + }, + { + "epoch": 2.77, + "grad_norm": 0.855155348777771, + "learning_rate": 8.458344493652225e-06, + "loss": 2.9504, + "step": 56577 + }, + { + "epoch": 2.77, + "grad_norm": 0.7464328408241272, + "learning_rate": 8.454714704190125e-06, + "loss": 2.8316, + "step": 56578 + }, + { + "epoch": 2.77, + "grad_norm": 0.7837406992912292, + "learning_rate": 8.451085682602255e-06, + "loss": 2.8898, + "step": 56579 + }, + { + "epoch": 2.77, + "grad_norm": 0.7430059313774109, + "learning_rate": 8.447457428898108e-06, + "loss": 2.8781, + "step": 56580 + }, + { + "epoch": 2.77, + "grad_norm": 0.7241303324699402, + "learning_rate": 8.443829943087244e-06, + "loss": 3.1459, + "step": 56581 + }, + { + "epoch": 2.77, + "grad_norm": 0.7481164932250977, + "learning_rate": 8.440203225179187e-06, + "loss": 2.721, + "step": 56582 + }, + { + "epoch": 2.77, + "grad_norm": 0.766409158706665, + "learning_rate": 8.436577275183564e-06, + "loss": 3.0709, + "step": 56583 + }, + { + "epoch": 2.77, + "grad_norm": 0.7170672416687012, + "learning_rate": 8.432952093109901e-06, + "loss": 3.1805, + "step": 56584 + }, + { + "epoch": 2.77, + "grad_norm": 0.7190765142440796, + "learning_rate": 8.429327678967656e-06, + "loss": 2.8354, + "step": 56585 + }, + { + "epoch": 2.77, + "grad_norm": 0.783243715763092, + "learning_rate": 8.425704032766523e-06, + "loss": 2.836, + "step": 56586 + }, + { + "epoch": 2.77, + "grad_norm": 0.716397762298584, + "learning_rate": 8.422081154515925e-06, + "loss": 2.8644, + "step": 56587 + }, + { + "epoch": 2.77, + "grad_norm": 0.766613781452179, + "learning_rate": 8.418459044225456e-06, + "loss": 2.8583, + "step": 56588 + }, + { + "epoch": 2.77, + "grad_norm": 0.7538684606552124, + "learning_rate": 8.414837701904642e-06, + "loss": 2.8837, + "step": 56589 + }, + { + "epoch": 2.77, + "grad_norm": 0.7688279151916504, + "learning_rate": 8.411217127563075e-06, + "loss": 2.9859, + "step": 56590 + }, + { + "epoch": 2.77, + "grad_norm": 0.8095577359199524, + "learning_rate": 8.407597321210214e-06, + "loss": 2.7778, + "step": 56591 + }, + { + "epoch": 2.77, + "grad_norm": 0.7337608337402344, + "learning_rate": 8.403978282855617e-06, + "loss": 2.8053, + "step": 56592 + }, + { + "epoch": 2.77, + "grad_norm": 0.7341989278793335, + "learning_rate": 8.40036001250881e-06, + "loss": 2.9954, + "step": 56593 + }, + { + "epoch": 2.77, + "grad_norm": 0.7581641674041748, + "learning_rate": 8.396742510179388e-06, + "loss": 3.1145, + "step": 56594 + }, + { + "epoch": 2.77, + "grad_norm": 0.7279971241950989, + "learning_rate": 8.393125775876775e-06, + "loss": 2.8688, + "step": 56595 + }, + { + "epoch": 2.77, + "grad_norm": 0.7769571542739868, + "learning_rate": 8.389509809610562e-06, + "loss": 2.6323, + "step": 56596 + }, + { + "epoch": 2.77, + "grad_norm": 0.7689619660377502, + "learning_rate": 8.385894611390276e-06, + "loss": 2.814, + "step": 56597 + }, + { + "epoch": 2.77, + "grad_norm": 0.7330824732780457, + "learning_rate": 8.38228018122541e-06, + "loss": 2.7816, + "step": 56598 + }, + { + "epoch": 2.77, + "grad_norm": 0.7108086943626404, + "learning_rate": 8.378666519125487e-06, + "loss": 2.8276, + "step": 56599 + }, + { + "epoch": 2.77, + "grad_norm": 0.7666386365890503, + "learning_rate": 8.375053625100037e-06, + "loss": 2.9543, + "step": 56600 + }, + { + "epoch": 2.77, + "grad_norm": 0.7531747817993164, + "learning_rate": 8.371441499158582e-06, + "loss": 3.0132, + "step": 56601 + }, + { + "epoch": 2.77, + "grad_norm": 0.7575362920761108, + "learning_rate": 8.367830141310583e-06, + "loss": 3.0232, + "step": 56602 + }, + { + "epoch": 2.77, + "grad_norm": 0.741112232208252, + "learning_rate": 8.364219551565631e-06, + "loss": 2.7388, + "step": 56603 + }, + { + "epoch": 2.77, + "grad_norm": 0.7264926433563232, + "learning_rate": 8.360609729933187e-06, + "loss": 2.987, + "step": 56604 + }, + { + "epoch": 2.77, + "grad_norm": 0.8191945552825928, + "learning_rate": 8.357000676422808e-06, + "loss": 2.8198, + "step": 56605 + }, + { + "epoch": 2.77, + "grad_norm": 0.8269351720809937, + "learning_rate": 8.353392391043922e-06, + "loss": 2.8031, + "step": 56606 + }, + { + "epoch": 2.77, + "grad_norm": 0.7800099849700928, + "learning_rate": 8.349784873806087e-06, + "loss": 2.8467, + "step": 56607 + }, + { + "epoch": 2.77, + "grad_norm": 0.7215859293937683, + "learning_rate": 8.34617812471876e-06, + "loss": 2.7812, + "step": 56608 + }, + { + "epoch": 2.77, + "grad_norm": 0.8046889901161194, + "learning_rate": 8.342572143791504e-06, + "loss": 2.8322, + "step": 56609 + }, + { + "epoch": 2.77, + "grad_norm": 0.753553569316864, + "learning_rate": 8.338966931033741e-06, + "loss": 2.8108, + "step": 56610 + }, + { + "epoch": 2.77, + "grad_norm": 0.7166416049003601, + "learning_rate": 8.335362486455066e-06, + "loss": 2.8806, + "step": 56611 + }, + { + "epoch": 2.77, + "grad_norm": 0.7240424156188965, + "learning_rate": 8.331758810064903e-06, + "loss": 3.0349, + "step": 56612 + }, + { + "epoch": 2.77, + "grad_norm": 0.7516683340072632, + "learning_rate": 8.328155901872713e-06, + "loss": 2.9413, + "step": 56613 + }, + { + "epoch": 2.77, + "grad_norm": 0.7496482729911804, + "learning_rate": 8.324553761888053e-06, + "loss": 2.8857, + "step": 56614 + }, + { + "epoch": 2.77, + "grad_norm": 0.7219851016998291, + "learning_rate": 8.320952390120383e-06, + "loss": 2.9247, + "step": 56615 + }, + { + "epoch": 2.77, + "grad_norm": 0.7659447193145752, + "learning_rate": 8.317351786579196e-06, + "loss": 2.9269, + "step": 56616 + }, + { + "epoch": 2.77, + "grad_norm": 0.7846766114234924, + "learning_rate": 8.31375195127395e-06, + "loss": 2.7706, + "step": 56617 + }, + { + "epoch": 2.77, + "grad_norm": 0.8673431277275085, + "learning_rate": 8.310152884214173e-06, + "loss": 3.0868, + "step": 56618 + }, + { + "epoch": 2.77, + "grad_norm": 0.8626456260681152, + "learning_rate": 8.30655458540932e-06, + "loss": 2.8726, + "step": 56619 + }, + { + "epoch": 2.77, + "grad_norm": 0.7804566621780396, + "learning_rate": 8.30295705486882e-06, + "loss": 2.7799, + "step": 56620 + }, + { + "epoch": 2.77, + "grad_norm": 0.8007548451423645, + "learning_rate": 8.29936029260223e-06, + "loss": 2.9062, + "step": 56621 + }, + { + "epoch": 2.77, + "grad_norm": 0.7293591499328613, + "learning_rate": 8.295764298618946e-06, + "loss": 2.8577, + "step": 56622 + }, + { + "epoch": 2.77, + "grad_norm": 0.8285486698150635, + "learning_rate": 8.29216907292849e-06, + "loss": 2.9366, + "step": 56623 + }, + { + "epoch": 2.78, + "grad_norm": 0.7310425043106079, + "learning_rate": 8.288574615540355e-06, + "loss": 2.931, + "step": 56624 + }, + { + "epoch": 2.78, + "grad_norm": 0.7126893401145935, + "learning_rate": 8.284980926463936e-06, + "loss": 2.839, + "step": 56625 + }, + { + "epoch": 2.78, + "grad_norm": 0.7340129017829895, + "learning_rate": 8.28138800570879e-06, + "loss": 2.925, + "step": 56626 + }, + { + "epoch": 2.78, + "grad_norm": 0.7461667060852051, + "learning_rate": 8.277795853284308e-06, + "loss": 2.7116, + "step": 56627 + }, + { + "epoch": 2.78, + "grad_norm": 0.7524672150611877, + "learning_rate": 8.274204469199919e-06, + "loss": 3.0097, + "step": 56628 + }, + { + "epoch": 2.78, + "grad_norm": 0.7391806244850159, + "learning_rate": 8.27061385346518e-06, + "loss": 2.9233, + "step": 56629 + }, + { + "epoch": 2.78, + "grad_norm": 0.7427461743354797, + "learning_rate": 8.267024006089484e-06, + "loss": 2.8764, + "step": 56630 + }, + { + "epoch": 2.78, + "grad_norm": 0.7367878556251526, + "learning_rate": 8.263434927082291e-06, + "loss": 2.9975, + "step": 56631 + }, + { + "epoch": 2.78, + "grad_norm": 0.7743518948554993, + "learning_rate": 8.259846616453058e-06, + "loss": 2.8299, + "step": 56632 + }, + { + "epoch": 2.78, + "grad_norm": 0.9217862486839294, + "learning_rate": 8.25625907421128e-06, + "loss": 2.9932, + "step": 56633 + }, + { + "epoch": 2.78, + "grad_norm": 0.7349005937576294, + "learning_rate": 8.25267230036638e-06, + "loss": 2.9098, + "step": 56634 + }, + { + "epoch": 2.78, + "grad_norm": 0.7806947827339172, + "learning_rate": 8.24908629492772e-06, + "loss": 2.6354, + "step": 56635 + }, + { + "epoch": 2.78, + "grad_norm": 0.7729756832122803, + "learning_rate": 8.24550105790489e-06, + "loss": 2.8451, + "step": 56636 + }, + { + "epoch": 2.78, + "grad_norm": 0.7258002758026123, + "learning_rate": 8.241916589307219e-06, + "loss": 2.9029, + "step": 56637 + }, + { + "epoch": 2.78, + "grad_norm": 0.7425938248634338, + "learning_rate": 8.238332889144194e-06, + "loss": 3.0206, + "step": 56638 + }, + { + "epoch": 2.78, + "grad_norm": 0.7938366532325745, + "learning_rate": 8.234749957425247e-06, + "loss": 3.0195, + "step": 56639 + }, + { + "epoch": 2.78, + "grad_norm": 0.7334127426147461, + "learning_rate": 8.231167794159832e-06, + "loss": 2.958, + "step": 56640 + }, + { + "epoch": 2.78, + "grad_norm": 0.7349879145622253, + "learning_rate": 8.22758639935741e-06, + "loss": 2.8197, + "step": 56641 + }, + { + "epoch": 2.78, + "grad_norm": 0.7187401056289673, + "learning_rate": 8.22400577302731e-06, + "loss": 2.9082, + "step": 56642 + }, + { + "epoch": 2.78, + "grad_norm": 0.7203710079193115, + "learning_rate": 8.220425915179018e-06, + "loss": 2.7315, + "step": 56643 + }, + { + "epoch": 2.78, + "grad_norm": 0.7575718760490417, + "learning_rate": 8.216846825821999e-06, + "loss": 2.829, + "step": 56644 + }, + { + "epoch": 2.78, + "grad_norm": 0.7450301647186279, + "learning_rate": 8.213268504965643e-06, + "loss": 2.7181, + "step": 56645 + }, + { + "epoch": 2.78, + "grad_norm": 0.8067429065704346, + "learning_rate": 8.209690952619407e-06, + "loss": 2.8633, + "step": 56646 + }, + { + "epoch": 2.78, + "grad_norm": 0.795412540435791, + "learning_rate": 8.206114168792622e-06, + "loss": 2.8272, + "step": 56647 + }, + { + "epoch": 2.78, + "grad_norm": 0.8101593852043152, + "learning_rate": 8.202538153494842e-06, + "loss": 3.12, + "step": 56648 + }, + { + "epoch": 2.78, + "grad_norm": 0.7850880026817322, + "learning_rate": 8.19896290673543e-06, + "loss": 3.0746, + "step": 56649 + }, + { + "epoch": 2.78, + "grad_norm": 0.7382371425628662, + "learning_rate": 8.195388428523741e-06, + "loss": 2.7615, + "step": 56650 + }, + { + "epoch": 2.78, + "grad_norm": 0.7291812896728516, + "learning_rate": 8.191814718869271e-06, + "loss": 2.6933, + "step": 56651 + }, + { + "epoch": 2.78, + "grad_norm": 0.7332150936126709, + "learning_rate": 8.18824177778138e-06, + "loss": 2.9268, + "step": 56652 + }, + { + "epoch": 2.78, + "grad_norm": 0.7187190055847168, + "learning_rate": 8.18466960526949e-06, + "loss": 2.8866, + "step": 56653 + }, + { + "epoch": 2.78, + "grad_norm": 0.7524594664573669, + "learning_rate": 8.181098201343028e-06, + "loss": 3.1315, + "step": 56654 + }, + { + "epoch": 2.78, + "grad_norm": 0.7417961955070496, + "learning_rate": 8.177527566011422e-06, + "loss": 2.8357, + "step": 56655 + }, + { + "epoch": 2.78, + "grad_norm": 0.7433927059173584, + "learning_rate": 8.173957699284029e-06, + "loss": 2.8197, + "step": 56656 + }, + { + "epoch": 2.78, + "grad_norm": 0.7373107671737671, + "learning_rate": 8.170388601170275e-06, + "loss": 2.847, + "step": 56657 + }, + { + "epoch": 2.78, + "grad_norm": 0.7926795482635498, + "learning_rate": 8.166820271679552e-06, + "loss": 2.6819, + "step": 56658 + }, + { + "epoch": 2.78, + "grad_norm": 0.7191329598426819, + "learning_rate": 8.163252710821222e-06, + "loss": 2.9689, + "step": 56659 + }, + { + "epoch": 2.78, + "grad_norm": 0.820548415184021, + "learning_rate": 8.15968591860474e-06, + "loss": 2.8692, + "step": 56660 + }, + { + "epoch": 2.78, + "grad_norm": 0.7487797141075134, + "learning_rate": 8.156119895039503e-06, + "loss": 3.0475, + "step": 56661 + }, + { + "epoch": 2.78, + "grad_norm": 0.78996741771698, + "learning_rate": 8.152554640134834e-06, + "loss": 3.1179, + "step": 56662 + }, + { + "epoch": 2.78, + "grad_norm": 0.7686530351638794, + "learning_rate": 8.148990153900225e-06, + "loss": 2.9168, + "step": 56663 + }, + { + "epoch": 2.78, + "grad_norm": 0.8370349407196045, + "learning_rate": 8.145426436345004e-06, + "loss": 2.891, + "step": 56664 + }, + { + "epoch": 2.78, + "grad_norm": 0.756450355052948, + "learning_rate": 8.141863487478528e-06, + "loss": 2.8712, + "step": 56665 + }, + { + "epoch": 2.78, + "grad_norm": 0.7915993332862854, + "learning_rate": 8.138301307310224e-06, + "loss": 2.7854, + "step": 56666 + }, + { + "epoch": 2.78, + "grad_norm": 0.7291852235794067, + "learning_rate": 8.134739895849452e-06, + "loss": 2.91, + "step": 56667 + }, + { + "epoch": 2.78, + "grad_norm": 0.7370338439941406, + "learning_rate": 8.131179253105635e-06, + "loss": 2.8508, + "step": 56668 + }, + { + "epoch": 2.78, + "grad_norm": 0.7495514750480652, + "learning_rate": 8.127619379088102e-06, + "loss": 3.1002, + "step": 56669 + }, + { + "epoch": 2.78, + "grad_norm": 0.6904610395431519, + "learning_rate": 8.124060273806242e-06, + "loss": 2.8653, + "step": 56670 + }, + { + "epoch": 2.78, + "grad_norm": 0.7376260757446289, + "learning_rate": 8.120501937269453e-06, + "loss": 2.8539, + "step": 56671 + }, + { + "epoch": 2.78, + "grad_norm": 0.7773568034172058, + "learning_rate": 8.116944369487088e-06, + "loss": 2.6758, + "step": 56672 + }, + { + "epoch": 2.78, + "grad_norm": 0.8228160738945007, + "learning_rate": 8.11338757046851e-06, + "loss": 2.7239, + "step": 56673 + }, + { + "epoch": 2.78, + "grad_norm": 0.7620078921318054, + "learning_rate": 8.109831540223077e-06, + "loss": 2.8486, + "step": 56674 + }, + { + "epoch": 2.78, + "grad_norm": 0.7939218282699585, + "learning_rate": 8.106276278760183e-06, + "loss": 2.8879, + "step": 56675 + }, + { + "epoch": 2.78, + "grad_norm": 0.7919220924377441, + "learning_rate": 8.102721786089183e-06, + "loss": 2.9234, + "step": 56676 + }, + { + "epoch": 2.78, + "grad_norm": 0.7480902671813965, + "learning_rate": 8.099168062219474e-06, + "loss": 2.7817, + "step": 56677 + }, + { + "epoch": 2.78, + "grad_norm": 0.7824358344078064, + "learning_rate": 8.095615107160347e-06, + "loss": 2.8962, + "step": 56678 + }, + { + "epoch": 2.78, + "grad_norm": 0.7388315200805664, + "learning_rate": 8.09206292092116e-06, + "loss": 2.9825, + "step": 56679 + }, + { + "epoch": 2.78, + "grad_norm": 0.749704122543335, + "learning_rate": 8.088511503511342e-06, + "loss": 2.8423, + "step": 56680 + }, + { + "epoch": 2.78, + "grad_norm": 0.7328793406486511, + "learning_rate": 8.084960854940181e-06, + "loss": 2.9085, + "step": 56681 + }, + { + "epoch": 2.78, + "grad_norm": 0.7136931419372559, + "learning_rate": 8.081410975217073e-06, + "loss": 2.9693, + "step": 56682 + }, + { + "epoch": 2.78, + "grad_norm": 0.7360411286354065, + "learning_rate": 8.077861864351343e-06, + "loss": 2.7465, + "step": 56683 + }, + { + "epoch": 2.78, + "grad_norm": 0.7628071308135986, + "learning_rate": 8.074313522352315e-06, + "loss": 2.7308, + "step": 56684 + }, + { + "epoch": 2.78, + "grad_norm": 0.72130286693573, + "learning_rate": 8.070765949229419e-06, + "loss": 2.6445, + "step": 56685 + }, + { + "epoch": 2.78, + "grad_norm": 0.7512112259864807, + "learning_rate": 8.06721914499191e-06, + "loss": 2.8764, + "step": 56686 + }, + { + "epoch": 2.78, + "grad_norm": 0.7311689853668213, + "learning_rate": 8.063673109649148e-06, + "loss": 3.1255, + "step": 56687 + }, + { + "epoch": 2.78, + "grad_norm": 0.806206226348877, + "learning_rate": 8.060127843210528e-06, + "loss": 2.7643, + "step": 56688 + }, + { + "epoch": 2.78, + "grad_norm": 0.7583916187286377, + "learning_rate": 8.056583345685308e-06, + "loss": 3.0265, + "step": 56689 + }, + { + "epoch": 2.78, + "grad_norm": 0.7623338103294373, + "learning_rate": 8.053039617082846e-06, + "loss": 2.8745, + "step": 56690 + }, + { + "epoch": 2.78, + "grad_norm": 0.7402061820030212, + "learning_rate": 8.049496657412535e-06, + "loss": 2.9607, + "step": 56691 + }, + { + "epoch": 2.78, + "grad_norm": 0.7634629607200623, + "learning_rate": 8.045954466683668e-06, + "loss": 3.1609, + "step": 56692 + }, + { + "epoch": 2.78, + "grad_norm": 0.7831898927688599, + "learning_rate": 8.042413044905572e-06, + "loss": 2.6583, + "step": 56693 + }, + { + "epoch": 2.78, + "grad_norm": 0.7754256129264832, + "learning_rate": 8.038872392087537e-06, + "loss": 2.5636, + "step": 56694 + }, + { + "epoch": 2.78, + "grad_norm": 0.7307144999504089, + "learning_rate": 8.035332508238923e-06, + "loss": 2.8217, + "step": 56695 + }, + { + "epoch": 2.78, + "grad_norm": 0.7803258895874023, + "learning_rate": 8.031793393369057e-06, + "loss": 2.9664, + "step": 56696 + }, + { + "epoch": 2.78, + "grad_norm": 0.7296281456947327, + "learning_rate": 8.028255047487265e-06, + "loss": 2.8883, + "step": 56697 + }, + { + "epoch": 2.78, + "grad_norm": 0.7694997191429138, + "learning_rate": 8.02471747060287e-06, + "loss": 2.7386, + "step": 56698 + }, + { + "epoch": 2.78, + "grad_norm": 0.7115613222122192, + "learning_rate": 8.021180662725169e-06, + "loss": 2.8441, + "step": 56699 + }, + { + "epoch": 2.78, + "grad_norm": 0.7649816274642944, + "learning_rate": 8.017644623863484e-06, + "loss": 2.9336, + "step": 56700 + }, + { + "epoch": 2.78, + "grad_norm": 0.7806012630462646, + "learning_rate": 8.014109354027142e-06, + "loss": 2.9071, + "step": 56701 + }, + { + "epoch": 2.78, + "grad_norm": 0.7786357402801514, + "learning_rate": 8.010574853225405e-06, + "loss": 2.8436, + "step": 56702 + }, + { + "epoch": 2.78, + "grad_norm": 0.7318533658981323, + "learning_rate": 8.00704112146766e-06, + "loss": 2.9391, + "step": 56703 + }, + { + "epoch": 2.78, + "grad_norm": 0.7530311942100525, + "learning_rate": 8.003508158763138e-06, + "loss": 3.1211, + "step": 56704 + }, + { + "epoch": 2.78, + "grad_norm": 0.7505061626434326, + "learning_rate": 7.999975965121164e-06, + "loss": 3.1264, + "step": 56705 + }, + { + "epoch": 2.78, + "grad_norm": 0.7486022114753723, + "learning_rate": 7.996444540551128e-06, + "loss": 2.7554, + "step": 56706 + }, + { + "epoch": 2.78, + "grad_norm": 0.7808429002761841, + "learning_rate": 7.992913885062224e-06, + "loss": 2.8135, + "step": 56707 + }, + { + "epoch": 2.78, + "grad_norm": 0.7680838704109192, + "learning_rate": 7.989383998663812e-06, + "loss": 2.9188, + "step": 56708 + }, + { + "epoch": 2.78, + "grad_norm": 0.7538081407546997, + "learning_rate": 7.985854881365117e-06, + "loss": 2.7569, + "step": 56709 + }, + { + "epoch": 2.78, + "grad_norm": 0.7408961057662964, + "learning_rate": 7.982326533175466e-06, + "loss": 2.9055, + "step": 56710 + }, + { + "epoch": 2.78, + "grad_norm": 0.7339892983436584, + "learning_rate": 7.978798954104216e-06, + "loss": 2.8456, + "step": 56711 + }, + { + "epoch": 2.78, + "grad_norm": 0.7620311975479126, + "learning_rate": 7.975272144160594e-06, + "loss": 2.9099, + "step": 56712 + }, + { + "epoch": 2.78, + "grad_norm": 0.7628583908081055, + "learning_rate": 7.971746103353926e-06, + "loss": 2.7379, + "step": 56713 + }, + { + "epoch": 2.78, + "grad_norm": 0.7465094327926636, + "learning_rate": 7.968220831693473e-06, + "loss": 2.7516, + "step": 56714 + }, + { + "epoch": 2.78, + "grad_norm": 0.7672311067581177, + "learning_rate": 7.964696329188492e-06, + "loss": 2.8941, + "step": 56715 + }, + { + "epoch": 2.78, + "grad_norm": 0.7001157402992249, + "learning_rate": 7.96117259584831e-06, + "loss": 2.779, + "step": 56716 + }, + { + "epoch": 2.78, + "grad_norm": 0.7452120780944824, + "learning_rate": 7.957649631682183e-06, + "loss": 3.0356, + "step": 56717 + }, + { + "epoch": 2.78, + "grad_norm": 0.7730103731155396, + "learning_rate": 7.954127436699441e-06, + "loss": 2.8371, + "step": 56718 + }, + { + "epoch": 2.78, + "grad_norm": 0.7829596996307373, + "learning_rate": 7.950606010909277e-06, + "loss": 2.8766, + "step": 56719 + }, + { + "epoch": 2.78, + "grad_norm": 0.733018696308136, + "learning_rate": 7.947085354321047e-06, + "loss": 2.9807, + "step": 56720 + }, + { + "epoch": 2.78, + "grad_norm": 0.7372322678565979, + "learning_rate": 7.94356546694398e-06, + "loss": 2.8052, + "step": 56721 + }, + { + "epoch": 2.78, + "grad_norm": 0.7484538555145264, + "learning_rate": 7.940046348787332e-06, + "loss": 2.9547, + "step": 56722 + }, + { + "epoch": 2.78, + "grad_norm": 0.7925270795822144, + "learning_rate": 7.936527999860432e-06, + "loss": 2.9496, + "step": 56723 + }, + { + "epoch": 2.78, + "grad_norm": 0.7163591980934143, + "learning_rate": 7.93301042017247e-06, + "loss": 2.7463, + "step": 56724 + }, + { + "epoch": 2.78, + "grad_norm": 0.777164101600647, + "learning_rate": 7.92949360973274e-06, + "loss": 2.9165, + "step": 56725 + }, + { + "epoch": 2.78, + "grad_norm": 0.7506938576698303, + "learning_rate": 7.925977568550568e-06, + "loss": 2.9781, + "step": 56726 + }, + { + "epoch": 2.78, + "grad_norm": 0.7980271577835083, + "learning_rate": 7.922462296635112e-06, + "loss": 2.6755, + "step": 56727 + }, + { + "epoch": 2.78, + "grad_norm": 0.7719228863716125, + "learning_rate": 7.9189477939957e-06, + "loss": 2.7433, + "step": 56728 + }, + { + "epoch": 2.78, + "grad_norm": 0.7837870121002197, + "learning_rate": 7.91543406064159e-06, + "loss": 2.8218, + "step": 56729 + }, + { + "epoch": 2.78, + "grad_norm": 0.7866212725639343, + "learning_rate": 7.911921096581975e-06, + "loss": 2.895, + "step": 56730 + }, + { + "epoch": 2.78, + "grad_norm": 0.7204878926277161, + "learning_rate": 7.908408901826147e-06, + "loss": 2.9522, + "step": 56731 + }, + { + "epoch": 2.78, + "grad_norm": 0.7222285270690918, + "learning_rate": 7.904897476383365e-06, + "loss": 2.9197, + "step": 56732 + }, + { + "epoch": 2.78, + "grad_norm": 0.7325133085250854, + "learning_rate": 7.90138682026289e-06, + "loss": 2.6879, + "step": 56733 + }, + { + "epoch": 2.78, + "grad_norm": 0.7699395418167114, + "learning_rate": 7.897876933473912e-06, + "loss": 2.8361, + "step": 56734 + }, + { + "epoch": 2.78, + "grad_norm": 0.7754160165786743, + "learning_rate": 7.894367816025726e-06, + "loss": 2.81, + "step": 56735 + }, + { + "epoch": 2.78, + "grad_norm": 0.7943594455718994, + "learning_rate": 7.89085946792759e-06, + "loss": 2.8848, + "step": 56736 + }, + { + "epoch": 2.78, + "grad_norm": 0.7334774136543274, + "learning_rate": 7.887351889188665e-06, + "loss": 2.7652, + "step": 56737 + }, + { + "epoch": 2.78, + "grad_norm": 0.7373555898666382, + "learning_rate": 7.883845079818273e-06, + "loss": 3.0028, + "step": 56738 + }, + { + "epoch": 2.78, + "grad_norm": 0.7737788558006287, + "learning_rate": 7.880339039825578e-06, + "loss": 2.7939, + "step": 56739 + }, + { + "epoch": 2.78, + "grad_norm": 0.7455878257751465, + "learning_rate": 7.87683376921987e-06, + "loss": 2.8291, + "step": 56740 + }, + { + "epoch": 2.78, + "grad_norm": 0.7712538242340088, + "learning_rate": 7.873329268010343e-06, + "loss": 3.0044, + "step": 56741 + }, + { + "epoch": 2.78, + "grad_norm": 0.7128819227218628, + "learning_rate": 7.869825536206254e-06, + "loss": 2.7565, + "step": 56742 + }, + { + "epoch": 2.78, + "grad_norm": 0.72657710313797, + "learning_rate": 7.866322573816864e-06, + "loss": 2.8434, + "step": 56743 + }, + { + "epoch": 2.78, + "grad_norm": 0.7716096639633179, + "learning_rate": 7.862820380851299e-06, + "loss": 2.961, + "step": 56744 + }, + { + "epoch": 2.78, + "grad_norm": 0.7939549684524536, + "learning_rate": 7.859318957318884e-06, + "loss": 2.9316, + "step": 56745 + }, + { + "epoch": 2.78, + "grad_norm": 0.7576471567153931, + "learning_rate": 7.855818303228779e-06, + "loss": 2.9361, + "step": 56746 + }, + { + "epoch": 2.78, + "grad_norm": 0.7495378255844116, + "learning_rate": 7.852318418590208e-06, + "loss": 2.9346, + "step": 56747 + }, + { + "epoch": 2.78, + "grad_norm": 0.7718560099601746, + "learning_rate": 7.848819303412435e-06, + "loss": 2.8327, + "step": 56748 + }, + { + "epoch": 2.78, + "grad_norm": 0.7532568573951721, + "learning_rate": 7.845320957704614e-06, + "loss": 2.7433, + "step": 56749 + }, + { + "epoch": 2.78, + "grad_norm": 0.7588929533958435, + "learning_rate": 7.841823381476043e-06, + "loss": 2.962, + "step": 56750 + }, + { + "epoch": 2.78, + "grad_norm": 0.7554545998573303, + "learning_rate": 7.838326574735875e-06, + "loss": 3.0306, + "step": 56751 + }, + { + "epoch": 2.78, + "grad_norm": 0.7670568227767944, + "learning_rate": 7.834830537493275e-06, + "loss": 2.9727, + "step": 56752 + }, + { + "epoch": 2.78, + "grad_norm": 0.7550936341285706, + "learning_rate": 7.831335269757532e-06, + "loss": 2.9098, + "step": 56753 + }, + { + "epoch": 2.78, + "grad_norm": 0.7789241075515747, + "learning_rate": 7.827840771537808e-06, + "loss": 2.7227, + "step": 56754 + }, + { + "epoch": 2.78, + "grad_norm": 0.7854425311088562, + "learning_rate": 7.82434704284336e-06, + "loss": 2.8727, + "step": 56755 + }, + { + "epoch": 2.78, + "grad_norm": 0.7839749455451965, + "learning_rate": 7.820854083683314e-06, + "loss": 2.9157, + "step": 56756 + }, + { + "epoch": 2.78, + "grad_norm": 0.7156407833099365, + "learning_rate": 7.81736189406693e-06, + "loss": 3.0155, + "step": 56757 + }, + { + "epoch": 2.78, + "grad_norm": 0.7155882716178894, + "learning_rate": 7.813870474003403e-06, + "loss": 2.8871, + "step": 56758 + }, + { + "epoch": 2.78, + "grad_norm": 0.7472594380378723, + "learning_rate": 7.810379823501855e-06, + "loss": 3.0024, + "step": 56759 + }, + { + "epoch": 2.78, + "grad_norm": 0.7841302156448364, + "learning_rate": 7.806889942571582e-06, + "loss": 2.7603, + "step": 56760 + }, + { + "epoch": 2.78, + "grad_norm": 0.7454701662063599, + "learning_rate": 7.803400831221707e-06, + "loss": 2.7723, + "step": 56761 + }, + { + "epoch": 2.78, + "grad_norm": 0.7371299862861633, + "learning_rate": 7.799912489461423e-06, + "loss": 2.9579, + "step": 56762 + }, + { + "epoch": 2.78, + "grad_norm": 0.7872686386108398, + "learning_rate": 7.796424917299959e-06, + "loss": 2.9897, + "step": 56763 + }, + { + "epoch": 2.78, + "grad_norm": 0.7827180624008179, + "learning_rate": 7.792938114746439e-06, + "loss": 3.0709, + "step": 56764 + }, + { + "epoch": 2.78, + "grad_norm": 0.7246116399765015, + "learning_rate": 7.78945208181012e-06, + "loss": 2.8398, + "step": 56765 + }, + { + "epoch": 2.78, + "grad_norm": 0.7627786993980408, + "learning_rate": 7.785966818500166e-06, + "loss": 3.0078, + "step": 56766 + }, + { + "epoch": 2.78, + "grad_norm": 0.7191957831382751, + "learning_rate": 7.7824823248257e-06, + "loss": 3.119, + "step": 56767 + }, + { + "epoch": 2.78, + "grad_norm": 0.7956440448760986, + "learning_rate": 7.778998600795949e-06, + "loss": 3.0264, + "step": 56768 + }, + { + "epoch": 2.78, + "grad_norm": 0.7572888135910034, + "learning_rate": 7.775515646420071e-06, + "loss": 2.9061, + "step": 56769 + }, + { + "epoch": 2.78, + "grad_norm": 0.7518506646156311, + "learning_rate": 7.772033461707261e-06, + "loss": 2.9363, + "step": 56770 + }, + { + "epoch": 2.78, + "grad_norm": 0.7562054991722107, + "learning_rate": 7.768552046666642e-06, + "loss": 2.9044, + "step": 56771 + }, + { + "epoch": 2.78, + "grad_norm": 0.7331022024154663, + "learning_rate": 7.765071401307443e-06, + "loss": 2.8371, + "step": 56772 + }, + { + "epoch": 2.78, + "grad_norm": 0.7433456778526306, + "learning_rate": 7.761591525638822e-06, + "loss": 2.8736, + "step": 56773 + }, + { + "epoch": 2.78, + "grad_norm": 0.7637978196144104, + "learning_rate": 7.75811241966987e-06, + "loss": 2.9288, + "step": 56774 + }, + { + "epoch": 2.78, + "grad_norm": 0.7655687928199768, + "learning_rate": 7.75463408340985e-06, + "loss": 2.8614, + "step": 56775 + }, + { + "epoch": 2.78, + "grad_norm": 0.7311526536941528, + "learning_rate": 7.751156516867884e-06, + "loss": 2.7973, + "step": 56776 + }, + { + "epoch": 2.78, + "grad_norm": 0.7516604661941528, + "learning_rate": 7.747679720053069e-06, + "loss": 2.8696, + "step": 56777 + }, + { + "epoch": 2.78, + "grad_norm": 0.7627346515655518, + "learning_rate": 7.744203692974692e-06, + "loss": 2.7741, + "step": 56778 + }, + { + "epoch": 2.78, + "grad_norm": 0.8112263679504395, + "learning_rate": 7.740728435641818e-06, + "loss": 2.8013, + "step": 56779 + }, + { + "epoch": 2.78, + "grad_norm": 0.7233673930168152, + "learning_rate": 7.737253948063638e-06, + "loss": 3.0573, + "step": 56780 + }, + { + "epoch": 2.78, + "grad_norm": 0.7762109637260437, + "learning_rate": 7.733780230249243e-06, + "loss": 2.8076, + "step": 56781 + }, + { + "epoch": 2.78, + "grad_norm": 0.7639904618263245, + "learning_rate": 7.730307282207794e-06, + "loss": 2.7387, + "step": 56782 + }, + { + "epoch": 2.78, + "grad_norm": 0.7313291430473328, + "learning_rate": 7.726835103948548e-06, + "loss": 2.7993, + "step": 56783 + }, + { + "epoch": 2.78, + "grad_norm": 0.7252292633056641, + "learning_rate": 7.7233636954805e-06, + "loss": 2.8332, + "step": 56784 + }, + { + "epoch": 2.78, + "grad_norm": 0.7580043077468872, + "learning_rate": 7.719893056812908e-06, + "loss": 2.9502, + "step": 56785 + }, + { + "epoch": 2.78, + "grad_norm": 0.7498751878738403, + "learning_rate": 7.716423187954835e-06, + "loss": 2.8644, + "step": 56786 + }, + { + "epoch": 2.78, + "grad_norm": 0.7402743697166443, + "learning_rate": 7.712954088915469e-06, + "loss": 2.7173, + "step": 56787 + }, + { + "epoch": 2.78, + "grad_norm": 0.7868022322654724, + "learning_rate": 7.709485759703971e-06, + "loss": 2.769, + "step": 56788 + }, + { + "epoch": 2.78, + "grad_norm": 0.762657880783081, + "learning_rate": 7.706018200329367e-06, + "loss": 2.8656, + "step": 56789 + }, + { + "epoch": 2.78, + "grad_norm": 0.8196261525154114, + "learning_rate": 7.702551410800884e-06, + "loss": 2.8877, + "step": 56790 + }, + { + "epoch": 2.78, + "grad_norm": 0.7575928568840027, + "learning_rate": 7.699085391127614e-06, + "loss": 2.9055, + "step": 56791 + }, + { + "epoch": 2.78, + "grad_norm": 0.7336806654930115, + "learning_rate": 7.695620141318715e-06, + "loss": 2.9307, + "step": 56792 + }, + { + "epoch": 2.78, + "grad_norm": 0.7909324765205383, + "learning_rate": 7.692155661383314e-06, + "loss": 2.8882, + "step": 56793 + }, + { + "epoch": 2.78, + "grad_norm": 0.778021514415741, + "learning_rate": 7.688691951330506e-06, + "loss": 2.8321, + "step": 56794 + }, + { + "epoch": 2.78, + "grad_norm": 0.7695522904396057, + "learning_rate": 7.685229011169447e-06, + "loss": 2.7739, + "step": 56795 + }, + { + "epoch": 2.78, + "grad_norm": 0.7543647289276123, + "learning_rate": 7.681766840909197e-06, + "loss": 2.9972, + "step": 56796 + }, + { + "epoch": 2.78, + "grad_norm": 0.7531213164329529, + "learning_rate": 7.678305440558884e-06, + "loss": 2.674, + "step": 56797 + }, + { + "epoch": 2.78, + "grad_norm": 0.7810092568397522, + "learning_rate": 7.674844810127734e-06, + "loss": 2.8329, + "step": 56798 + }, + { + "epoch": 2.78, + "grad_norm": 0.7242538332939148, + "learning_rate": 7.671384949624736e-06, + "loss": 2.9462, + "step": 56799 + }, + { + "epoch": 2.78, + "grad_norm": 0.7586749792098999, + "learning_rate": 7.667925859059087e-06, + "loss": 2.7034, + "step": 56800 + }, + { + "epoch": 2.78, + "grad_norm": 0.757546067237854, + "learning_rate": 7.664467538439845e-06, + "loss": 2.7994, + "step": 56801 + }, + { + "epoch": 2.78, + "grad_norm": 0.7679392695426941, + "learning_rate": 7.6610099877761e-06, + "loss": 2.8346, + "step": 56802 + }, + { + "epoch": 2.78, + "grad_norm": 0.7457258701324463, + "learning_rate": 7.657553207077049e-06, + "loss": 2.8294, + "step": 56803 + }, + { + "epoch": 2.78, + "grad_norm": 0.734887957572937, + "learning_rate": 7.654097196351716e-06, + "loss": 2.8035, + "step": 56804 + }, + { + "epoch": 2.78, + "grad_norm": 0.7640114426612854, + "learning_rate": 7.650641955609227e-06, + "loss": 2.8279, + "step": 56805 + }, + { + "epoch": 2.78, + "grad_norm": 0.7584049105644226, + "learning_rate": 7.647187484858674e-06, + "loss": 3.168, + "step": 56806 + }, + { + "epoch": 2.78, + "grad_norm": 0.7257455587387085, + "learning_rate": 7.643733784109151e-06, + "loss": 2.6585, + "step": 56807 + }, + { + "epoch": 2.78, + "grad_norm": 0.7480292916297913, + "learning_rate": 7.640280853369784e-06, + "loss": 2.7445, + "step": 56808 + }, + { + "epoch": 2.78, + "grad_norm": 0.7356035709381104, + "learning_rate": 7.6368286926497e-06, + "loss": 2.9441, + "step": 56809 + }, + { + "epoch": 2.78, + "grad_norm": 0.719250500202179, + "learning_rate": 7.633377301957889e-06, + "loss": 2.8578, + "step": 56810 + }, + { + "epoch": 2.78, + "grad_norm": 0.7773850560188293, + "learning_rate": 7.629926681303512e-06, + "loss": 2.7369, + "step": 56811 + }, + { + "epoch": 2.78, + "grad_norm": 0.7469416856765747, + "learning_rate": 7.626476830695627e-06, + "loss": 2.9075, + "step": 56812 + }, + { + "epoch": 2.78, + "grad_norm": 0.768189549446106, + "learning_rate": 7.623027750143329e-06, + "loss": 2.776, + "step": 56813 + }, + { + "epoch": 2.78, + "grad_norm": 0.7666859030723572, + "learning_rate": 7.61957943965571e-06, + "loss": 2.8001, + "step": 56814 + }, + { + "epoch": 2.78, + "grad_norm": 0.734056293964386, + "learning_rate": 7.616131899241862e-06, + "loss": 2.8107, + "step": 56815 + }, + { + "epoch": 2.78, + "grad_norm": 0.7309668064117432, + "learning_rate": 7.612685128910878e-06, + "loss": 3.0441, + "step": 56816 + }, + { + "epoch": 2.78, + "grad_norm": 0.7235714793205261, + "learning_rate": 7.609239128671785e-06, + "loss": 2.8479, + "step": 56817 + }, + { + "epoch": 2.78, + "grad_norm": 0.8136651515960693, + "learning_rate": 7.6057938985336746e-06, + "loss": 2.8428, + "step": 56818 + }, + { + "epoch": 2.78, + "grad_norm": 0.7454987168312073, + "learning_rate": 7.602349438505606e-06, + "loss": 2.7822, + "step": 56819 + }, + { + "epoch": 2.78, + "grad_norm": 0.775428831577301, + "learning_rate": 7.59890574859674e-06, + "loss": 2.9205, + "step": 56820 + }, + { + "epoch": 2.78, + "grad_norm": 0.82244873046875, + "learning_rate": 7.595462828816035e-06, + "loss": 2.9713, + "step": 56821 + }, + { + "epoch": 2.78, + "grad_norm": 0.7343226075172424, + "learning_rate": 7.5920206791726165e-06, + "loss": 2.7359, + "step": 56822 + }, + { + "epoch": 2.78, + "grad_norm": 0.7130812406539917, + "learning_rate": 7.588579299675579e-06, + "loss": 3.0358, + "step": 56823 + }, + { + "epoch": 2.78, + "grad_norm": 0.752113401889801, + "learning_rate": 7.585138690333881e-06, + "loss": 2.9996, + "step": 56824 + }, + { + "epoch": 2.78, + "grad_norm": 0.7490484118461609, + "learning_rate": 7.581698851156715e-06, + "loss": 2.9933, + "step": 56825 + }, + { + "epoch": 2.78, + "grad_norm": 0.7558857202529907, + "learning_rate": 7.578259782153006e-06, + "loss": 2.846, + "step": 56826 + }, + { + "epoch": 2.78, + "grad_norm": 0.7821259498596191, + "learning_rate": 7.57482148333195e-06, + "loss": 2.5186, + "step": 56827 + }, + { + "epoch": 2.79, + "grad_norm": 0.7697935700416565, + "learning_rate": 7.571383954702504e-06, + "loss": 2.9788, + "step": 56828 + }, + { + "epoch": 2.79, + "grad_norm": 0.7339929342269897, + "learning_rate": 7.567947196273727e-06, + "loss": 3.0044, + "step": 56829 + }, + { + "epoch": 2.79, + "grad_norm": 0.724521815776825, + "learning_rate": 7.5645112080547464e-06, + "loss": 2.8909, + "step": 56830 + }, + { + "epoch": 2.79, + "grad_norm": 0.7704073786735535, + "learning_rate": 7.561075990054555e-06, + "loss": 3.2447, + "step": 56831 + }, + { + "epoch": 2.79, + "grad_norm": 0.7758163213729858, + "learning_rate": 7.5576415422822115e-06, + "loss": 2.9757, + "step": 56832 + }, + { + "epoch": 2.79, + "grad_norm": 0.7906506657600403, + "learning_rate": 7.554207864746742e-06, + "loss": 2.8299, + "step": 56833 + }, + { + "epoch": 2.79, + "grad_norm": 0.7277704477310181, + "learning_rate": 7.550774957457173e-06, + "loss": 2.7329, + "step": 56834 + }, + { + "epoch": 2.79, + "grad_norm": 0.7903933525085449, + "learning_rate": 7.54734282042263e-06, + "loss": 2.7854, + "step": 56835 + }, + { + "epoch": 2.79, + "grad_norm": 0.7526589632034302, + "learning_rate": 7.543911453652074e-06, + "loss": 2.937, + "step": 56836 + }, + { + "epoch": 2.79, + "grad_norm": 0.790673017501831, + "learning_rate": 7.540480857154596e-06, + "loss": 3.0866, + "step": 56837 + }, + { + "epoch": 2.79, + "grad_norm": 0.7465153336524963, + "learning_rate": 7.537051030939223e-06, + "loss": 2.8482, + "step": 56838 + }, + { + "epoch": 2.79, + "grad_norm": 0.7059990167617798, + "learning_rate": 7.5336219750149475e-06, + "loss": 2.9167, + "step": 56839 + }, + { + "epoch": 2.79, + "grad_norm": 0.7449349164962769, + "learning_rate": 7.530193689390828e-06, + "loss": 2.6752, + "step": 56840 + }, + { + "epoch": 2.79, + "grad_norm": 0.7419530153274536, + "learning_rate": 7.52676617407586e-06, + "loss": 2.9918, + "step": 56841 + }, + { + "epoch": 2.79, + "grad_norm": 0.7866668105125427, + "learning_rate": 7.5233394290791334e-06, + "loss": 2.8204, + "step": 56842 + }, + { + "epoch": 2.79, + "grad_norm": 0.8241013288497925, + "learning_rate": 7.519913454409643e-06, + "loss": 2.8006, + "step": 56843 + }, + { + "epoch": 2.79, + "grad_norm": 0.793013334274292, + "learning_rate": 7.516488250076414e-06, + "loss": 2.8997, + "step": 56844 + }, + { + "epoch": 2.79, + "grad_norm": 0.7368309497833252, + "learning_rate": 7.513063816088472e-06, + "loss": 2.9748, + "step": 56845 + }, + { + "epoch": 2.79, + "grad_norm": 0.7466829419136047, + "learning_rate": 7.50964015245481e-06, + "loss": 2.785, + "step": 56846 + }, + { + "epoch": 2.79, + "grad_norm": 0.7598251104354858, + "learning_rate": 7.506217259184488e-06, + "loss": 2.7236, + "step": 56847 + }, + { + "epoch": 2.79, + "grad_norm": 0.7561330199241638, + "learning_rate": 7.502795136286465e-06, + "loss": 2.999, + "step": 56848 + }, + { + "epoch": 2.79, + "grad_norm": 0.7700143456459045, + "learning_rate": 7.499373783769768e-06, + "loss": 2.8617, + "step": 56849 + }, + { + "epoch": 2.79, + "grad_norm": 0.7483510971069336, + "learning_rate": 7.49595320164349e-06, + "loss": 2.8559, + "step": 56850 + }, + { + "epoch": 2.79, + "grad_norm": 0.7109506726264954, + "learning_rate": 7.4925333899165216e-06, + "loss": 2.8932, + "step": 56851 + }, + { + "epoch": 2.79, + "grad_norm": 0.7281533479690552, + "learning_rate": 7.489114348597991e-06, + "loss": 3.0285, + "step": 56852 + }, + { + "epoch": 2.79, + "grad_norm": 0.7932998538017273, + "learning_rate": 7.48569607769679e-06, + "loss": 2.87, + "step": 56853 + }, + { + "epoch": 2.79, + "grad_norm": 0.7219394445419312, + "learning_rate": 7.4822785772219786e-06, + "loss": 2.8928, + "step": 56854 + }, + { + "epoch": 2.79, + "grad_norm": 0.7718573808670044, + "learning_rate": 7.478861847182582e-06, + "loss": 2.8527, + "step": 56855 + }, + { + "epoch": 2.79, + "grad_norm": 0.7460842728614807, + "learning_rate": 7.475445887587528e-06, + "loss": 2.8612, + "step": 56856 + }, + { + "epoch": 2.79, + "grad_norm": 0.7437963485717773, + "learning_rate": 7.472030698445875e-06, + "loss": 2.8628, + "step": 56857 + }, + { + "epoch": 2.79, + "grad_norm": 0.7210949659347534, + "learning_rate": 7.468616279766548e-06, + "loss": 2.7437, + "step": 56858 + }, + { + "epoch": 2.79, + "grad_norm": 0.6923820376396179, + "learning_rate": 7.465202631558642e-06, + "loss": 2.7774, + "step": 56859 + }, + { + "epoch": 2.79, + "grad_norm": 0.7626507878303528, + "learning_rate": 7.461789753831082e-06, + "loss": 2.8715, + "step": 56860 + }, + { + "epoch": 2.79, + "grad_norm": 0.7916741371154785, + "learning_rate": 7.458377646592828e-06, + "loss": 2.9131, + "step": 56861 + }, + { + "epoch": 2.79, + "grad_norm": 0.7709150314331055, + "learning_rate": 7.454966309852972e-06, + "loss": 2.7865, + "step": 56862 + }, + { + "epoch": 2.79, + "grad_norm": 0.7656967639923096, + "learning_rate": 7.4515557436203746e-06, + "loss": 2.9982, + "step": 56863 + }, + { + "epoch": 2.79, + "grad_norm": 0.7165960669517517, + "learning_rate": 7.448145947904094e-06, + "loss": 2.8934, + "step": 56864 + }, + { + "epoch": 2.79, + "grad_norm": 0.7551760673522949, + "learning_rate": 7.444736922713091e-06, + "loss": 2.9164, + "step": 56865 + }, + { + "epoch": 2.79, + "grad_norm": 0.7682600021362305, + "learning_rate": 7.441328668056357e-06, + "loss": 2.9947, + "step": 56866 + }, + { + "epoch": 2.79, + "grad_norm": 0.7713602781295776, + "learning_rate": 7.4379211839428856e-06, + "loss": 2.8302, + "step": 56867 + }, + { + "epoch": 2.79, + "grad_norm": 0.7858055830001831, + "learning_rate": 7.434514470381603e-06, + "loss": 2.838, + "step": 56868 + }, + { + "epoch": 2.79, + "grad_norm": 0.7591269016265869, + "learning_rate": 7.431108527381469e-06, + "loss": 2.764, + "step": 56869 + }, + { + "epoch": 2.79, + "grad_norm": 0.7732019424438477, + "learning_rate": 7.427703354951542e-06, + "loss": 3.1269, + "step": 56870 + }, + { + "epoch": 2.79, + "grad_norm": 0.7430728077888489, + "learning_rate": 7.424298953100716e-06, + "loss": 2.8704, + "step": 56871 + }, + { + "epoch": 2.79, + "grad_norm": 0.7464427947998047, + "learning_rate": 7.420895321837983e-06, + "loss": 2.9217, + "step": 56872 + }, + { + "epoch": 2.79, + "grad_norm": 0.7523966431617737, + "learning_rate": 7.41749246117227e-06, + "loss": 2.8089, + "step": 56873 + }, + { + "epoch": 2.79, + "grad_norm": 0.7557439804077148, + "learning_rate": 7.414090371112636e-06, + "loss": 2.7274, + "step": 56874 + }, + { + "epoch": 2.79, + "grad_norm": 0.7846986651420593, + "learning_rate": 7.41068905166794e-06, + "loss": 2.8855, + "step": 56875 + }, + { + "epoch": 2.79, + "grad_norm": 0.7404799461364746, + "learning_rate": 7.4072885028471755e-06, + "loss": 2.7038, + "step": 56876 + }, + { + "epoch": 2.79, + "grad_norm": 0.7172025442123413, + "learning_rate": 7.403888724659334e-06, + "loss": 2.9229, + "step": 56877 + }, + { + "epoch": 2.79, + "grad_norm": 0.7421782612800598, + "learning_rate": 7.400489717113278e-06, + "loss": 2.8372, + "step": 56878 + }, + { + "epoch": 2.79, + "grad_norm": 0.7487596869468689, + "learning_rate": 7.397091480218065e-06, + "loss": 2.6678, + "step": 56879 + }, + { + "epoch": 2.79, + "grad_norm": 0.753854513168335, + "learning_rate": 7.393694013982587e-06, + "loss": 2.9184, + "step": 56880 + }, + { + "epoch": 2.79, + "grad_norm": 0.7490181922912598, + "learning_rate": 7.390297318415839e-06, + "loss": 2.9672, + "step": 56881 + }, + { + "epoch": 2.79, + "grad_norm": 0.7417166233062744, + "learning_rate": 7.386901393526712e-06, + "loss": 2.9964, + "step": 56882 + }, + { + "epoch": 2.79, + "grad_norm": 0.7824398875236511, + "learning_rate": 7.383506239324166e-06, + "loss": 2.9129, + "step": 56883 + }, + { + "epoch": 2.79, + "grad_norm": 0.7355906367301941, + "learning_rate": 7.380111855817128e-06, + "loss": 2.8434, + "step": 56884 + }, + { + "epoch": 2.79, + "grad_norm": 0.7218039035797119, + "learning_rate": 7.3767182430145895e-06, + "loss": 2.8935, + "step": 56885 + }, + { + "epoch": 2.79, + "grad_norm": 0.7684163451194763, + "learning_rate": 7.373325400925445e-06, + "loss": 2.976, + "step": 56886 + }, + { + "epoch": 2.79, + "grad_norm": 0.7165760397911072, + "learning_rate": 7.3699333295586526e-06, + "loss": 2.6787, + "step": 56887 + }, + { + "epoch": 2.79, + "grad_norm": 0.7151129841804504, + "learning_rate": 7.366542028923139e-06, + "loss": 2.7651, + "step": 56888 + }, + { + "epoch": 2.79, + "grad_norm": 0.7509085536003113, + "learning_rate": 7.3631514990278305e-06, + "loss": 3.0883, + "step": 56889 + }, + { + "epoch": 2.79, + "grad_norm": 0.7478203773498535, + "learning_rate": 7.359761739881686e-06, + "loss": 3.0038, + "step": 56890 + }, + { + "epoch": 2.79, + "grad_norm": 0.77152019739151, + "learning_rate": 7.3563727514935665e-06, + "loss": 2.804, + "step": 56891 + }, + { + "epoch": 2.79, + "grad_norm": 0.7167760729789734, + "learning_rate": 7.352984533872464e-06, + "loss": 3.1426, + "step": 56892 + }, + { + "epoch": 2.79, + "grad_norm": 0.7409500479698181, + "learning_rate": 7.34959708702727e-06, + "loss": 2.9238, + "step": 56893 + }, + { + "epoch": 2.79, + "grad_norm": 0.8209675550460815, + "learning_rate": 7.346210410966946e-06, + "loss": 2.8103, + "step": 56894 + }, + { + "epoch": 2.79, + "grad_norm": 0.7210500836372375, + "learning_rate": 7.342824505700351e-06, + "loss": 2.6596, + "step": 56895 + }, + { + "epoch": 2.79, + "grad_norm": 0.7498226761817932, + "learning_rate": 7.339439371236444e-06, + "loss": 2.8775, + "step": 56896 + }, + { + "epoch": 2.79, + "grad_norm": 0.7758424282073975, + "learning_rate": 7.336055007584152e-06, + "loss": 2.7742, + "step": 56897 + }, + { + "epoch": 2.79, + "grad_norm": 0.7447173595428467, + "learning_rate": 7.3326714147523e-06, + "loss": 2.8494, + "step": 56898 + }, + { + "epoch": 2.79, + "grad_norm": 0.837697446346283, + "learning_rate": 7.329288592749916e-06, + "loss": 2.8712, + "step": 56899 + }, + { + "epoch": 2.79, + "grad_norm": 0.7390157580375671, + "learning_rate": 7.3259065415858575e-06, + "loss": 2.8616, + "step": 56900 + }, + { + "epoch": 2.79, + "grad_norm": 0.735798180103302, + "learning_rate": 7.322525261269019e-06, + "loss": 2.9517, + "step": 56901 + }, + { + "epoch": 2.79, + "grad_norm": 0.7630640268325806, + "learning_rate": 7.319144751808325e-06, + "loss": 2.9783, + "step": 56902 + }, + { + "epoch": 2.79, + "grad_norm": 0.7421290278434753, + "learning_rate": 7.315765013212705e-06, + "loss": 2.8817, + "step": 56903 + }, + { + "epoch": 2.79, + "grad_norm": 0.7206147313117981, + "learning_rate": 7.312386045491015e-06, + "loss": 2.7915, + "step": 56904 + }, + { + "epoch": 2.79, + "grad_norm": 0.7598541975021362, + "learning_rate": 7.309007848652149e-06, + "loss": 2.647, + "step": 56905 + }, + { + "epoch": 2.79, + "grad_norm": 0.7555434703826904, + "learning_rate": 7.305630422705033e-06, + "loss": 2.7925, + "step": 56906 + }, + { + "epoch": 2.79, + "grad_norm": 0.7854859232902527, + "learning_rate": 7.3022537676585944e-06, + "loss": 2.8326, + "step": 56907 + }, + { + "epoch": 2.79, + "grad_norm": 0.7405829429626465, + "learning_rate": 7.298877883521626e-06, + "loss": 2.8358, + "step": 56908 + }, + { + "epoch": 2.79, + "grad_norm": 0.7246938347816467, + "learning_rate": 7.295502770303152e-06, + "loss": 2.6278, + "step": 56909 + }, + { + "epoch": 2.79, + "grad_norm": 0.7304655313491821, + "learning_rate": 7.292128428011934e-06, + "loss": 2.824, + "step": 56910 + }, + { + "epoch": 2.79, + "grad_norm": 0.7219448685646057, + "learning_rate": 7.288754856656964e-06, + "loss": 3.1004, + "step": 56911 + }, + { + "epoch": 2.79, + "grad_norm": 0.7531955242156982, + "learning_rate": 7.285382056247069e-06, + "loss": 2.8265, + "step": 56912 + }, + { + "epoch": 2.79, + "grad_norm": 0.7347313761711121, + "learning_rate": 7.282010026791107e-06, + "loss": 2.758, + "step": 56913 + }, + { + "epoch": 2.79, + "grad_norm": 0.740077555179596, + "learning_rate": 7.27863876829804e-06, + "loss": 2.9192, + "step": 56914 + }, + { + "epoch": 2.79, + "grad_norm": 0.7309461832046509, + "learning_rate": 7.275268280776659e-06, + "loss": 2.8396, + "step": 56915 + }, + { + "epoch": 2.79, + "grad_norm": 0.7398518919944763, + "learning_rate": 7.2718985642358894e-06, + "loss": 2.9457, + "step": 56916 + }, + { + "epoch": 2.79, + "grad_norm": 0.7491543292999268, + "learning_rate": 7.268529618684627e-06, + "loss": 2.9195, + "step": 56917 + }, + { + "epoch": 2.79, + "grad_norm": 0.7731919884681702, + "learning_rate": 7.265161444131729e-06, + "loss": 3.1226, + "step": 56918 + }, + { + "epoch": 2.79, + "grad_norm": 0.802014946937561, + "learning_rate": 7.261794040586055e-06, + "loss": 2.9917, + "step": 56919 + }, + { + "epoch": 2.79, + "grad_norm": 0.7600812315940857, + "learning_rate": 7.258427408056467e-06, + "loss": 2.9308, + "step": 56920 + }, + { + "epoch": 2.79, + "grad_norm": 0.7489023804664612, + "learning_rate": 7.255061546551821e-06, + "loss": 3.0007, + "step": 56921 + }, + { + "epoch": 2.79, + "grad_norm": 0.7434468269348145, + "learning_rate": 7.251696456081046e-06, + "loss": 2.9466, + "step": 56922 + }, + { + "epoch": 2.79, + "grad_norm": 0.7413731813430786, + "learning_rate": 7.248332136652901e-06, + "loss": 3.0299, + "step": 56923 + }, + { + "epoch": 2.79, + "grad_norm": 0.7570599913597107, + "learning_rate": 7.2449685882763784e-06, + "loss": 2.9096, + "step": 56924 + }, + { + "epoch": 2.79, + "grad_norm": 0.7353522777557373, + "learning_rate": 7.241605810960238e-06, + "loss": 2.7581, + "step": 56925 + }, + { + "epoch": 2.79, + "grad_norm": 0.7793547511100769, + "learning_rate": 7.2382438047133395e-06, + "loss": 3.1442, + "step": 56926 + }, + { + "epoch": 2.79, + "grad_norm": 0.7294006943702698, + "learning_rate": 7.234882569544609e-06, + "loss": 3.0892, + "step": 56927 + }, + { + "epoch": 2.79, + "grad_norm": 0.7296475172042847, + "learning_rate": 7.231522105462806e-06, + "loss": 2.8816, + "step": 56928 + }, + { + "epoch": 2.79, + "grad_norm": 0.8307911157608032, + "learning_rate": 7.22816241247689e-06, + "loss": 2.9092, + "step": 56929 + }, + { + "epoch": 2.79, + "grad_norm": 0.744428277015686, + "learning_rate": 7.224803490595587e-06, + "loss": 2.8779, + "step": 56930 + }, + { + "epoch": 2.79, + "grad_norm": 0.758772075176239, + "learning_rate": 7.221445339827825e-06, + "loss": 2.9967, + "step": 56931 + }, + { + "epoch": 2.79, + "grad_norm": 0.7177093029022217, + "learning_rate": 7.218087960182428e-06, + "loss": 2.8032, + "step": 56932 + }, + { + "epoch": 2.79, + "grad_norm": 0.7423251867294312, + "learning_rate": 7.21473135166829e-06, + "loss": 2.9131, + "step": 56933 + }, + { + "epoch": 2.79, + "grad_norm": 0.7697576880455017, + "learning_rate": 7.211375514294171e-06, + "loss": 3.128, + "step": 56934 + }, + { + "epoch": 2.79, + "grad_norm": 0.7272599935531616, + "learning_rate": 7.208020448068897e-06, + "loss": 2.6826, + "step": 56935 + }, + { + "epoch": 2.79, + "grad_norm": 0.7582986354827881, + "learning_rate": 7.204666153001393e-06, + "loss": 2.9562, + "step": 56936 + }, + { + "epoch": 2.79, + "grad_norm": 0.757316529750824, + "learning_rate": 7.201312629100453e-06, + "loss": 2.9547, + "step": 56937 + }, + { + "epoch": 2.79, + "grad_norm": 0.7660689949989319, + "learning_rate": 7.197959876374871e-06, + "loss": 2.8556, + "step": 56938 + }, + { + "epoch": 2.79, + "grad_norm": 0.7317332625389099, + "learning_rate": 7.194607894833537e-06, + "loss": 2.8184, + "step": 56939 + }, + { + "epoch": 2.79, + "grad_norm": 0.7949168682098389, + "learning_rate": 7.1912566844852475e-06, + "loss": 3.0257, + "step": 56940 + }, + { + "epoch": 2.79, + "grad_norm": 0.7534281611442566, + "learning_rate": 7.187906245338826e-06, + "loss": 2.8538, + "step": 56941 + }, + { + "epoch": 2.79, + "grad_norm": 0.7638459205627441, + "learning_rate": 7.184556577403134e-06, + "loss": 2.8792, + "step": 56942 + }, + { + "epoch": 2.79, + "grad_norm": 0.761776864528656, + "learning_rate": 7.181207680686929e-06, + "loss": 2.9767, + "step": 56943 + }, + { + "epoch": 2.79, + "grad_norm": 0.7768969535827637, + "learning_rate": 7.1778595551991055e-06, + "loss": 2.87, + "step": 56944 + }, + { + "epoch": 2.79, + "grad_norm": 0.7668014168739319, + "learning_rate": 7.1745122009484235e-06, + "loss": 2.8332, + "step": 56945 + }, + { + "epoch": 2.79, + "grad_norm": 0.7426735162734985, + "learning_rate": 7.171165617943741e-06, + "loss": 2.8732, + "step": 56946 + }, + { + "epoch": 2.79, + "grad_norm": 0.7256327867507935, + "learning_rate": 7.167819806193853e-06, + "loss": 2.9715, + "step": 56947 + }, + { + "epoch": 2.79, + "grad_norm": 0.8112422227859497, + "learning_rate": 7.164474765707518e-06, + "loss": 2.9562, + "step": 56948 + }, + { + "epoch": 2.79, + "grad_norm": 0.7298985719680786, + "learning_rate": 7.161130496493661e-06, + "loss": 2.8423, + "step": 56949 + }, + { + "epoch": 2.79, + "grad_norm": 0.7860491275787354, + "learning_rate": 7.157786998560977e-06, + "loss": 2.8787, + "step": 56950 + }, + { + "epoch": 2.79, + "grad_norm": 0.815058708190918, + "learning_rate": 7.154444271918358e-06, + "loss": 2.8567, + "step": 56951 + }, + { + "epoch": 2.79, + "grad_norm": 0.7936457395553589, + "learning_rate": 7.151102316574564e-06, + "loss": 2.978, + "step": 56952 + }, + { + "epoch": 2.79, + "grad_norm": 0.7552315592765808, + "learning_rate": 7.147761132538387e-06, + "loss": 2.8227, + "step": 56953 + }, + { + "epoch": 2.79, + "grad_norm": 0.7764397263526917, + "learning_rate": 7.144420719818689e-06, + "loss": 3.0444, + "step": 56954 + }, + { + "epoch": 2.79, + "grad_norm": 0.7704004049301147, + "learning_rate": 7.141081078424227e-06, + "loss": 2.8896, + "step": 56955 + }, + { + "epoch": 2.79, + "grad_norm": 0.749974250793457, + "learning_rate": 7.137742208363762e-06, + "loss": 2.7538, + "step": 56956 + }, + { + "epoch": 2.79, + "grad_norm": 0.7289097309112549, + "learning_rate": 7.1344041096461525e-06, + "loss": 2.9296, + "step": 56957 + }, + { + "epoch": 2.79, + "grad_norm": 0.7585612535476685, + "learning_rate": 7.131066782280126e-06, + "loss": 3.0644, + "step": 56958 + }, + { + "epoch": 2.79, + "grad_norm": 0.7690691947937012, + "learning_rate": 7.127730226274509e-06, + "loss": 3.0722, + "step": 56959 + }, + { + "epoch": 2.79, + "grad_norm": 0.7478421330451965, + "learning_rate": 7.124394441638092e-06, + "loss": 2.7564, + "step": 56960 + }, + { + "epoch": 2.79, + "grad_norm": 0.7582880258560181, + "learning_rate": 7.121059428379672e-06, + "loss": 2.7356, + "step": 56961 + }, + { + "epoch": 2.79, + "grad_norm": 0.7731198668479919, + "learning_rate": 7.117725186508006e-06, + "loss": 2.8882, + "step": 56962 + }, + { + "epoch": 2.79, + "grad_norm": 0.7167331576347351, + "learning_rate": 7.114391716031887e-06, + "loss": 2.9934, + "step": 56963 + }, + { + "epoch": 2.79, + "grad_norm": 0.7596493363380432, + "learning_rate": 7.111059016960108e-06, + "loss": 3.0592, + "step": 56964 + }, + { + "epoch": 2.79, + "grad_norm": 0.7319817543029785, + "learning_rate": 7.107727089301396e-06, + "loss": 3.0243, + "step": 56965 + }, + { + "epoch": 2.79, + "grad_norm": 0.7279293537139893, + "learning_rate": 7.1043959330645774e-06, + "loss": 2.96, + "step": 56966 + }, + { + "epoch": 2.79, + "grad_norm": 0.7954239249229431, + "learning_rate": 7.101065548258411e-06, + "loss": 2.9018, + "step": 56967 + }, + { + "epoch": 2.79, + "grad_norm": 0.7724497318267822, + "learning_rate": 7.097735934891657e-06, + "loss": 3.1738, + "step": 56968 + }, + { + "epoch": 2.79, + "grad_norm": 0.753320574760437, + "learning_rate": 7.094407092973142e-06, + "loss": 2.981, + "step": 56969 + }, + { + "epoch": 2.79, + "grad_norm": 0.8049317002296448, + "learning_rate": 7.091079022511559e-06, + "loss": 2.9705, + "step": 56970 + }, + { + "epoch": 2.79, + "grad_norm": 0.7446392774581909, + "learning_rate": 7.0877517235157e-06, + "loss": 2.6321, + "step": 56971 + }, + { + "epoch": 2.79, + "grad_norm": 0.7473111152648926, + "learning_rate": 7.0844251959943255e-06, + "loss": 2.8549, + "step": 56972 + }, + { + "epoch": 2.79, + "grad_norm": 0.7095595598220825, + "learning_rate": 7.081099439956195e-06, + "loss": 2.7745, + "step": 56973 + }, + { + "epoch": 2.79, + "grad_norm": 0.7316811084747314, + "learning_rate": 7.077774455410102e-06, + "loss": 2.6908, + "step": 56974 + }, + { + "epoch": 2.79, + "grad_norm": 0.7425003051757812, + "learning_rate": 7.074450242364738e-06, + "loss": 2.8079, + "step": 56975 + }, + { + "epoch": 2.79, + "grad_norm": 0.7391384840011597, + "learning_rate": 7.07112680082893e-06, + "loss": 2.8774, + "step": 56976 + }, + { + "epoch": 2.79, + "grad_norm": 0.763640820980072, + "learning_rate": 7.067804130811405e-06, + "loss": 2.8268, + "step": 56977 + }, + { + "epoch": 2.79, + "grad_norm": 0.7273032069206238, + "learning_rate": 7.06448223232089e-06, + "loss": 2.7182, + "step": 56978 + }, + { + "epoch": 2.79, + "grad_norm": 0.7752225399017334, + "learning_rate": 7.061161105366176e-06, + "loss": 2.8235, + "step": 56979 + }, + { + "epoch": 2.79, + "grad_norm": 0.755729615688324, + "learning_rate": 7.057840749955956e-06, + "loss": 2.8752, + "step": 56980 + }, + { + "epoch": 2.79, + "grad_norm": 0.7277241349220276, + "learning_rate": 7.054521166099026e-06, + "loss": 2.7167, + "step": 56981 + }, + { + "epoch": 2.79, + "grad_norm": 0.7544986605644226, + "learning_rate": 7.0512023538040756e-06, + "loss": 2.9328, + "step": 56982 + }, + { + "epoch": 2.79, + "grad_norm": 0.7631686925888062, + "learning_rate": 7.047884313079899e-06, + "loss": 2.9111, + "step": 56983 + }, + { + "epoch": 2.79, + "grad_norm": 0.7494498491287231, + "learning_rate": 7.044567043935257e-06, + "loss": 2.8915, + "step": 56984 + }, + { + "epoch": 2.79, + "grad_norm": 0.7393820285797119, + "learning_rate": 7.0412505463787734e-06, + "loss": 3.0258, + "step": 56985 + }, + { + "epoch": 2.79, + "grad_norm": 0.7384428381919861, + "learning_rate": 7.037934820419311e-06, + "loss": 2.8008, + "step": 56986 + }, + { + "epoch": 2.79, + "grad_norm": 0.7311258316040039, + "learning_rate": 7.034619866065494e-06, + "loss": 2.9212, + "step": 56987 + }, + { + "epoch": 2.79, + "grad_norm": 0.7373533248901367, + "learning_rate": 7.0313056833261495e-06, + "loss": 2.8112, + "step": 56988 + }, + { + "epoch": 2.79, + "grad_norm": 0.7468560338020325, + "learning_rate": 7.0279922722099365e-06, + "loss": 2.9788, + "step": 56989 + }, + { + "epoch": 2.79, + "grad_norm": 0.7494136691093445, + "learning_rate": 7.024679632725616e-06, + "loss": 2.9388, + "step": 56990 + }, + { + "epoch": 2.79, + "grad_norm": 0.731553852558136, + "learning_rate": 7.0213677648819135e-06, + "loss": 2.7017, + "step": 56991 + }, + { + "epoch": 2.79, + "grad_norm": 0.8054585456848145, + "learning_rate": 7.018056668687555e-06, + "loss": 2.8854, + "step": 56992 + }, + { + "epoch": 2.79, + "grad_norm": 0.7437418103218079, + "learning_rate": 7.014746344151234e-06, + "loss": 2.8213, + "step": 56993 + }, + { + "epoch": 2.79, + "grad_norm": 0.7302021980285645, + "learning_rate": 7.01143679128171e-06, + "loss": 2.8911, + "step": 56994 + }, + { + "epoch": 2.79, + "grad_norm": 0.7534018158912659, + "learning_rate": 7.00812801008761e-06, + "loss": 2.8335, + "step": 56995 + }, + { + "epoch": 2.79, + "grad_norm": 0.7726359367370605, + "learning_rate": 7.004820000577793e-06, + "loss": 2.9413, + "step": 56996 + }, + { + "epoch": 2.79, + "grad_norm": 0.748771071434021, + "learning_rate": 7.0015127627608195e-06, + "loss": 2.7926, + "step": 56997 + }, + { + "epoch": 2.79, + "grad_norm": 0.7633996605873108, + "learning_rate": 6.998206296645515e-06, + "loss": 2.8268, + "step": 56998 + }, + { + "epoch": 2.79, + "grad_norm": 0.7452352643013, + "learning_rate": 6.994900602240572e-06, + "loss": 2.7504, + "step": 56999 + }, + { + "epoch": 2.79, + "grad_norm": 0.7363735437393188, + "learning_rate": 6.991595679554618e-06, + "loss": 3.0628, + "step": 57000 + }, + { + "epoch": 2.79, + "grad_norm": 0.7469814419746399, + "learning_rate": 6.988291528596446e-06, + "loss": 3.0343, + "step": 57001 + }, + { + "epoch": 2.79, + "grad_norm": 0.7548006176948547, + "learning_rate": 6.984988149374682e-06, + "loss": 3.1684, + "step": 57002 + }, + { + "epoch": 2.79, + "grad_norm": 0.7495371699333191, + "learning_rate": 6.981685541898086e-06, + "loss": 2.9532, + "step": 57003 + }, + { + "epoch": 2.79, + "grad_norm": 0.7482824325561523, + "learning_rate": 6.97838370617535e-06, + "loss": 3.0097, + "step": 57004 + }, + { + "epoch": 2.79, + "grad_norm": 0.7665938138961792, + "learning_rate": 6.975082642215168e-06, + "loss": 2.7349, + "step": 57005 + }, + { + "epoch": 2.79, + "grad_norm": 0.7205333113670349, + "learning_rate": 6.9717823500262e-06, + "loss": 2.8713, + "step": 57006 + }, + { + "epoch": 2.79, + "grad_norm": 0.7368008494377136, + "learning_rate": 6.9684828296171725e-06, + "loss": 2.7659, + "step": 57007 + }, + { + "epoch": 2.79, + "grad_norm": 0.7547926306724548, + "learning_rate": 6.965184080996744e-06, + "loss": 2.9229, + "step": 57008 + }, + { + "epoch": 2.79, + "grad_norm": 0.7362197041511536, + "learning_rate": 6.9618861041736415e-06, + "loss": 2.6904, + "step": 57009 + }, + { + "epoch": 2.79, + "grad_norm": 0.7418313026428223, + "learning_rate": 6.958588899156525e-06, + "loss": 2.8835, + "step": 57010 + }, + { + "epoch": 2.79, + "grad_norm": 0.7631124258041382, + "learning_rate": 6.955292465954121e-06, + "loss": 2.7645, + "step": 57011 + }, + { + "epoch": 2.79, + "grad_norm": 0.7663556337356567, + "learning_rate": 6.951996804575055e-06, + "loss": 2.9622, + "step": 57012 + }, + { + "epoch": 2.79, + "grad_norm": 0.7453634738922119, + "learning_rate": 6.948701915028054e-06, + "loss": 2.8942, + "step": 57013 + }, + { + "epoch": 2.79, + "grad_norm": 0.7672542333602905, + "learning_rate": 6.9454077973217444e-06, + "loss": 2.9699, + "step": 57014 + }, + { + "epoch": 2.79, + "grad_norm": 0.7669967412948608, + "learning_rate": 6.9421144514648195e-06, + "loss": 2.8731, + "step": 57015 + }, + { + "epoch": 2.79, + "grad_norm": 0.7587398886680603, + "learning_rate": 6.938821877466005e-06, + "loss": 2.8788, + "step": 57016 + }, + { + "epoch": 2.79, + "grad_norm": 0.7381514310836792, + "learning_rate": 6.935530075333895e-06, + "loss": 2.9416, + "step": 57017 + }, + { + "epoch": 2.79, + "grad_norm": 0.7473018765449524, + "learning_rate": 6.932239045077215e-06, + "loss": 3.012, + "step": 57018 + }, + { + "epoch": 2.79, + "grad_norm": 0.7571165561676025, + "learning_rate": 6.928948786704624e-06, + "loss": 2.8817, + "step": 57019 + }, + { + "epoch": 2.79, + "grad_norm": 0.7279015183448792, + "learning_rate": 6.925659300224784e-06, + "loss": 3.0053, + "step": 57020 + }, + { + "epoch": 2.79, + "grad_norm": 0.7720736861228943, + "learning_rate": 6.9223705856463525e-06, + "loss": 2.7265, + "step": 57021 + }, + { + "epoch": 2.79, + "grad_norm": 0.7333340048789978, + "learning_rate": 6.919082642977991e-06, + "loss": 2.9004, + "step": 57022 + }, + { + "epoch": 2.79, + "grad_norm": 0.7707780599594116, + "learning_rate": 6.915795472228325e-06, + "loss": 3.0141, + "step": 57023 + }, + { + "epoch": 2.79, + "grad_norm": 0.7591186165809631, + "learning_rate": 6.912509073406114e-06, + "loss": 2.719, + "step": 57024 + }, + { + "epoch": 2.79, + "grad_norm": 0.7694956064224243, + "learning_rate": 6.909223446519885e-06, + "loss": 2.7744, + "step": 57025 + }, + { + "epoch": 2.79, + "grad_norm": 0.7224785089492798, + "learning_rate": 6.9059385915783974e-06, + "loss": 2.9739, + "step": 57026 + }, + { + "epoch": 2.79, + "grad_norm": 0.8058514595031738, + "learning_rate": 6.9026545085902445e-06, + "loss": 3.1169, + "step": 57027 + }, + { + "epoch": 2.79, + "grad_norm": 0.7608110308647156, + "learning_rate": 6.899371197564085e-06, + "loss": 2.9059, + "step": 57028 + }, + { + "epoch": 2.79, + "grad_norm": 0.7489415407180786, + "learning_rate": 6.896088658508581e-06, + "loss": 2.8504, + "step": 57029 + }, + { + "epoch": 2.79, + "grad_norm": 0.802652895450592, + "learning_rate": 6.892806891432356e-06, + "loss": 2.8233, + "step": 57030 + }, + { + "epoch": 2.79, + "grad_norm": 0.7284513711929321, + "learning_rate": 6.889525896344106e-06, + "loss": 2.678, + "step": 57031 + }, + { + "epoch": 2.8, + "grad_norm": 0.7275854349136353, + "learning_rate": 6.886245673252389e-06, + "loss": 3.0578, + "step": 57032 + }, + { + "epoch": 2.8, + "grad_norm": 0.7676070332527161, + "learning_rate": 6.882966222165898e-06, + "loss": 2.9871, + "step": 57033 + }, + { + "epoch": 2.8, + "grad_norm": 0.7430452108383179, + "learning_rate": 6.879687543093293e-06, + "loss": 2.8993, + "step": 57034 + }, + { + "epoch": 2.8, + "grad_norm": 0.740665853023529, + "learning_rate": 6.876409636043168e-06, + "loss": 2.8878, + "step": 57035 + }, + { + "epoch": 2.8, + "grad_norm": 0.744522750377655, + "learning_rate": 6.8731325010241815e-06, + "loss": 3.1277, + "step": 57036 + }, + { + "epoch": 2.8, + "grad_norm": 0.7291746735572815, + "learning_rate": 6.869856138044927e-06, + "loss": 3.0474, + "step": 57037 + }, + { + "epoch": 2.8, + "grad_norm": 0.7282450795173645, + "learning_rate": 6.866580547114031e-06, + "loss": 2.6989, + "step": 57038 + }, + { + "epoch": 2.8, + "grad_norm": 0.7871297597885132, + "learning_rate": 6.863305728240187e-06, + "loss": 2.9547, + "step": 57039 + }, + { + "epoch": 2.8, + "grad_norm": 0.7655256390571594, + "learning_rate": 6.860031681431954e-06, + "loss": 2.9997, + "step": 57040 + }, + { + "epoch": 2.8, + "grad_norm": 0.7627614736557007, + "learning_rate": 6.8567584066980265e-06, + "loss": 2.9494, + "step": 57041 + }, + { + "epoch": 2.8, + "grad_norm": 0.7453312873840332, + "learning_rate": 6.853485904046962e-06, + "loss": 3.21, + "step": 57042 + }, + { + "epoch": 2.8, + "grad_norm": 0.7098878622055054, + "learning_rate": 6.850214173487389e-06, + "loss": 2.8033, + "step": 57043 + }, + { + "epoch": 2.8, + "grad_norm": 0.79835045337677, + "learning_rate": 6.846943215027934e-06, + "loss": 2.8388, + "step": 57044 + }, + { + "epoch": 2.8, + "grad_norm": 0.7601944804191589, + "learning_rate": 6.843673028677188e-06, + "loss": 3.1851, + "step": 57045 + }, + { + "epoch": 2.8, + "grad_norm": 0.826819896697998, + "learning_rate": 6.840403614443846e-06, + "loss": 2.7549, + "step": 57046 + }, + { + "epoch": 2.8, + "grad_norm": 0.7625347971916199, + "learning_rate": 6.8371349723364e-06, + "loss": 2.9151, + "step": 57047 + }, + { + "epoch": 2.8, + "grad_norm": 0.7577785849571228, + "learning_rate": 6.833867102363544e-06, + "loss": 2.7609, + "step": 57048 + }, + { + "epoch": 2.8, + "grad_norm": 0.7831384539604187, + "learning_rate": 6.830600004533903e-06, + "loss": 2.9857, + "step": 57049 + }, + { + "epoch": 2.8, + "grad_norm": 0.7404026985168457, + "learning_rate": 6.827333678855973e-06, + "loss": 2.9262, + "step": 57050 + }, + { + "epoch": 2.8, + "grad_norm": 0.7409828305244446, + "learning_rate": 6.824068125338478e-06, + "loss": 2.8237, + "step": 57051 + }, + { + "epoch": 2.8, + "grad_norm": 0.7415507435798645, + "learning_rate": 6.820803343989911e-06, + "loss": 2.8341, + "step": 57052 + }, + { + "epoch": 2.8, + "grad_norm": 0.7397423982620239, + "learning_rate": 6.817539334818967e-06, + "loss": 3.0505, + "step": 57053 + }, + { + "epoch": 2.8, + "grad_norm": 0.7370979189872742, + "learning_rate": 6.81427609783417e-06, + "loss": 2.8316, + "step": 57054 + }, + { + "epoch": 2.8, + "grad_norm": 0.7267847657203674, + "learning_rate": 6.8110136330441485e-06, + "loss": 2.8226, + "step": 57055 + }, + { + "epoch": 2.8, + "grad_norm": 0.7403401136398315, + "learning_rate": 6.807751940457529e-06, + "loss": 3.0404, + "step": 57056 + }, + { + "epoch": 2.8, + "grad_norm": 0.770000696182251, + "learning_rate": 6.804491020082836e-06, + "loss": 3.0793, + "step": 57057 + }, + { + "epoch": 2.8, + "grad_norm": 0.7584140300750732, + "learning_rate": 6.801230871928698e-06, + "loss": 2.8472, + "step": 57058 + }, + { + "epoch": 2.8, + "grad_norm": 0.7541932463645935, + "learning_rate": 6.7979714960036735e-06, + "loss": 2.9838, + "step": 57059 + }, + { + "epoch": 2.8, + "grad_norm": 0.749034583568573, + "learning_rate": 6.794712892316356e-06, + "loss": 2.8473, + "step": 57060 + }, + { + "epoch": 2.8, + "grad_norm": 0.7517021298408508, + "learning_rate": 6.791455060875372e-06, + "loss": 2.8163, + "step": 57061 + }, + { + "epoch": 2.8, + "grad_norm": 0.746054470539093, + "learning_rate": 6.788198001689249e-06, + "loss": 3.0738, + "step": 57062 + }, + { + "epoch": 2.8, + "grad_norm": 0.7522591352462769, + "learning_rate": 6.784941714766579e-06, + "loss": 2.8944, + "step": 57063 + }, + { + "epoch": 2.8, + "grad_norm": 0.7277589440345764, + "learning_rate": 6.781686200115987e-06, + "loss": 2.8983, + "step": 57064 + }, + { + "epoch": 2.8, + "grad_norm": 0.7314507961273193, + "learning_rate": 6.7784314577459364e-06, + "loss": 2.8121, + "step": 57065 + }, + { + "epoch": 2.8, + "grad_norm": 0.7789071202278137, + "learning_rate": 6.7751774876650845e-06, + "loss": 3.0653, + "step": 57066 + }, + { + "epoch": 2.8, + "grad_norm": 0.8103259205818176, + "learning_rate": 6.7719242898819915e-06, + "loss": 2.6541, + "step": 57067 + }, + { + "epoch": 2.8, + "grad_norm": 0.7770789265632629, + "learning_rate": 6.768671864405218e-06, + "loss": 3.0381, + "step": 57068 + }, + { + "epoch": 2.8, + "grad_norm": 0.7559372186660767, + "learning_rate": 6.765420211243322e-06, + "loss": 2.9393, + "step": 57069 + }, + { + "epoch": 2.8, + "grad_norm": 0.722319483757019, + "learning_rate": 6.7621693304048984e-06, + "loss": 3.1307, + "step": 57070 + }, + { + "epoch": 2.8, + "grad_norm": 0.7697762846946716, + "learning_rate": 6.758919221898473e-06, + "loss": 2.8636, + "step": 57071 + }, + { + "epoch": 2.8, + "grad_norm": 0.7274612188339233, + "learning_rate": 6.755669885732607e-06, + "loss": 2.9749, + "step": 57072 + }, + { + "epoch": 2.8, + "grad_norm": 0.7847182750701904, + "learning_rate": 6.7524213219158905e-06, + "loss": 2.8848, + "step": 57073 + }, + { + "epoch": 2.8, + "grad_norm": 0.7678075432777405, + "learning_rate": 6.749173530456819e-06, + "loss": 2.7446, + "step": 57074 + }, + { + "epoch": 2.8, + "grad_norm": 0.8545045852661133, + "learning_rate": 6.745926511363986e-06, + "loss": 2.9995, + "step": 57075 + }, + { + "epoch": 2.8, + "grad_norm": 0.7134187817573547, + "learning_rate": 6.742680264645983e-06, + "loss": 2.8353, + "step": 57076 + }, + { + "epoch": 2.8, + "grad_norm": 0.8113738298416138, + "learning_rate": 6.7394347903113045e-06, + "loss": 3.1161, + "step": 57077 + }, + { + "epoch": 2.8, + "grad_norm": 0.7588936686515808, + "learning_rate": 6.736190088368543e-06, + "loss": 2.9629, + "step": 57078 + }, + { + "epoch": 2.8, + "grad_norm": 0.7868394255638123, + "learning_rate": 6.732946158826224e-06, + "loss": 2.9781, + "step": 57079 + }, + { + "epoch": 2.8, + "grad_norm": 0.7183898091316223, + "learning_rate": 6.72970300169281e-06, + "loss": 2.8361, + "step": 57080 + }, + { + "epoch": 2.8, + "grad_norm": 0.7224221229553223, + "learning_rate": 6.726460616976992e-06, + "loss": 2.7046, + "step": 57081 + }, + { + "epoch": 2.8, + "grad_norm": 0.724663257598877, + "learning_rate": 6.723219004687197e-06, + "loss": 2.8677, + "step": 57082 + }, + { + "epoch": 2.8, + "grad_norm": 0.7192972302436829, + "learning_rate": 6.7199781648320185e-06, + "loss": 2.7882, + "step": 57083 + }, + { + "epoch": 2.8, + "grad_norm": 0.7668144106864929, + "learning_rate": 6.716738097419949e-06, + "loss": 2.9125, + "step": 57084 + }, + { + "epoch": 2.8, + "grad_norm": 0.7557151317596436, + "learning_rate": 6.713498802459583e-06, + "loss": 2.6406, + "step": 57085 + }, + { + "epoch": 2.8, + "grad_norm": 0.7683967351913452, + "learning_rate": 6.710260279959412e-06, + "loss": 2.9452, + "step": 57086 + }, + { + "epoch": 2.8, + "grad_norm": 0.7353421449661255, + "learning_rate": 6.707022529927964e-06, + "loss": 2.8917, + "step": 57087 + }, + { + "epoch": 2.8, + "grad_norm": 0.7499251961708069, + "learning_rate": 6.703785552373764e-06, + "loss": 2.8837, + "step": 57088 + }, + { + "epoch": 2.8, + "grad_norm": 0.7639588713645935, + "learning_rate": 6.700549347305339e-06, + "loss": 2.915, + "step": 57089 + }, + { + "epoch": 2.8, + "grad_norm": 0.776925265789032, + "learning_rate": 6.697313914731217e-06, + "loss": 2.9083, + "step": 57090 + }, + { + "epoch": 2.8, + "grad_norm": 0.755670428276062, + "learning_rate": 6.694079254659957e-06, + "loss": 2.8506, + "step": 57091 + }, + { + "epoch": 2.8, + "grad_norm": 0.789338231086731, + "learning_rate": 6.690845367100017e-06, + "loss": 2.8634, + "step": 57092 + }, + { + "epoch": 2.8, + "grad_norm": 0.7923612594604492, + "learning_rate": 6.68761225205996e-06, + "loss": 2.8226, + "step": 57093 + }, + { + "epoch": 2.8, + "grad_norm": 0.731760561466217, + "learning_rate": 6.6843799095482766e-06, + "loss": 2.9201, + "step": 57094 + }, + { + "epoch": 2.8, + "grad_norm": 0.7637943029403687, + "learning_rate": 6.681148339573461e-06, + "loss": 2.7399, + "step": 57095 + }, + { + "epoch": 2.8, + "grad_norm": 0.759672224521637, + "learning_rate": 6.677917542144107e-06, + "loss": 3.0211, + "step": 57096 + }, + { + "epoch": 2.8, + "grad_norm": 0.7676330208778381, + "learning_rate": 6.674687517268606e-06, + "loss": 2.6788, + "step": 57097 + }, + { + "epoch": 2.8, + "grad_norm": 0.7366164326667786, + "learning_rate": 6.671458264955554e-06, + "loss": 2.8141, + "step": 57098 + }, + { + "epoch": 2.8, + "grad_norm": 0.7516197562217712, + "learning_rate": 6.668229785213441e-06, + "loss": 3.1885, + "step": 57099 + }, + { + "epoch": 2.8, + "grad_norm": 0.7431179285049438, + "learning_rate": 6.6650020780507295e-06, + "loss": 2.7607, + "step": 57100 + }, + { + "epoch": 2.8, + "grad_norm": 0.7496405243873596, + "learning_rate": 6.661775143475978e-06, + "loss": 3.0291, + "step": 57101 + }, + { + "epoch": 2.8, + "grad_norm": 0.7139589786529541, + "learning_rate": 6.658548981497647e-06, + "loss": 2.7426, + "step": 57102 + }, + { + "epoch": 2.8, + "grad_norm": 0.7403964400291443, + "learning_rate": 6.655323592124262e-06, + "loss": 2.9492, + "step": 57103 + }, + { + "epoch": 2.8, + "grad_norm": 0.7503599524497986, + "learning_rate": 6.652098975364251e-06, + "loss": 2.7976, + "step": 57104 + }, + { + "epoch": 2.8, + "grad_norm": 0.7594204545021057, + "learning_rate": 6.648875131226173e-06, + "loss": 2.8069, + "step": 57105 + }, + { + "epoch": 2.8, + "grad_norm": 0.7432661652565002, + "learning_rate": 6.645652059718521e-06, + "loss": 2.7111, + "step": 57106 + }, + { + "epoch": 2.8, + "grad_norm": 0.7444175481796265, + "learning_rate": 6.642429760849754e-06, + "loss": 2.7686, + "step": 57107 + }, + { + "epoch": 2.8, + "grad_norm": 0.7241867780685425, + "learning_rate": 6.639208234628402e-06, + "loss": 2.8685, + "step": 57108 + }, + { + "epoch": 2.8, + "grad_norm": 0.8182801008224487, + "learning_rate": 6.635987481062854e-06, + "loss": 2.8802, + "step": 57109 + }, + { + "epoch": 2.8, + "grad_norm": 0.7803454995155334, + "learning_rate": 6.632767500161673e-06, + "loss": 2.9425, + "step": 57110 + }, + { + "epoch": 2.8, + "grad_norm": 0.7847126126289368, + "learning_rate": 6.629548291933351e-06, + "loss": 2.8971, + "step": 57111 + }, + { + "epoch": 2.8, + "grad_norm": 0.7423437833786011, + "learning_rate": 6.626329856386314e-06, + "loss": 2.8664, + "step": 57112 + }, + { + "epoch": 2.8, + "grad_norm": 0.7208489775657654, + "learning_rate": 6.62311219352909e-06, + "loss": 2.8964, + "step": 57113 + }, + { + "epoch": 2.8, + "grad_norm": 0.7785851955413818, + "learning_rate": 6.619895303370071e-06, + "loss": 2.9177, + "step": 57114 + }, + { + "epoch": 2.8, + "grad_norm": 0.7598791122436523, + "learning_rate": 6.61667918591785e-06, + "loss": 2.933, + "step": 57115 + }, + { + "epoch": 2.8, + "grad_norm": 0.7468469142913818, + "learning_rate": 6.613463841180822e-06, + "loss": 3.0654, + "step": 57116 + }, + { + "epoch": 2.8, + "grad_norm": 0.7566969394683838, + "learning_rate": 6.610249269167445e-06, + "loss": 2.978, + "step": 57117 + }, + { + "epoch": 2.8, + "grad_norm": 0.7173053622245789, + "learning_rate": 6.607035469886213e-06, + "loss": 2.6706, + "step": 57118 + }, + { + "epoch": 2.8, + "grad_norm": 0.7708240747451782, + "learning_rate": 6.603822443345586e-06, + "loss": 2.9867, + "step": 57119 + }, + { + "epoch": 2.8, + "grad_norm": 0.7301391959190369, + "learning_rate": 6.60061018955399e-06, + "loss": 2.9992, + "step": 57120 + }, + { + "epoch": 2.8, + "grad_norm": 0.7776297330856323, + "learning_rate": 6.597398708519986e-06, + "loss": 2.88, + "step": 57121 + }, + { + "epoch": 2.8, + "grad_norm": 0.7679793834686279, + "learning_rate": 6.594188000251932e-06, + "loss": 2.8798, + "step": 57122 + }, + { + "epoch": 2.8, + "grad_norm": 0.740397036075592, + "learning_rate": 6.590978064758356e-06, + "loss": 2.9748, + "step": 57123 + }, + { + "epoch": 2.8, + "grad_norm": 0.7653552293777466, + "learning_rate": 6.587768902047618e-06, + "loss": 2.9602, + "step": 57124 + }, + { + "epoch": 2.8, + "grad_norm": 0.70607590675354, + "learning_rate": 6.584560512128278e-06, + "loss": 3.0756, + "step": 57125 + }, + { + "epoch": 2.8, + "grad_norm": 0.7225512266159058, + "learning_rate": 6.581352895008696e-06, + "loss": 2.8843, + "step": 57126 + }, + { + "epoch": 2.8, + "grad_norm": 0.7174662351608276, + "learning_rate": 6.578146050697364e-06, + "loss": 2.8845, + "step": 57127 + }, + { + "epoch": 2.8, + "grad_norm": 0.7353945374488831, + "learning_rate": 6.574939979202776e-06, + "loss": 2.7851, + "step": 57128 + }, + { + "epoch": 2.8, + "grad_norm": 0.7632526159286499, + "learning_rate": 6.571734680533292e-06, + "loss": 2.8001, + "step": 57129 + }, + { + "epoch": 2.8, + "grad_norm": 0.7371880412101746, + "learning_rate": 6.568530154697404e-06, + "loss": 2.8652, + "step": 57130 + }, + { + "epoch": 2.8, + "grad_norm": 0.7216587066650391, + "learning_rate": 6.5653264017035075e-06, + "loss": 2.9515, + "step": 57131 + }, + { + "epoch": 2.8, + "grad_norm": 0.7436114549636841, + "learning_rate": 6.562123421560062e-06, + "loss": 2.7013, + "step": 57132 + }, + { + "epoch": 2.8, + "grad_norm": 0.7178574800491333, + "learning_rate": 6.5589212142755255e-06, + "loss": 2.8224, + "step": 57133 + }, + { + "epoch": 2.8, + "grad_norm": 0.723756730556488, + "learning_rate": 6.555719779858293e-06, + "loss": 2.9817, + "step": 57134 + }, + { + "epoch": 2.8, + "grad_norm": 0.7632118463516235, + "learning_rate": 6.552519118316857e-06, + "loss": 2.7139, + "step": 57135 + }, + { + "epoch": 2.8, + "grad_norm": 0.7216606140136719, + "learning_rate": 6.549319229659611e-06, + "loss": 2.9235, + "step": 57136 + }, + { + "epoch": 2.8, + "grad_norm": 0.7410150766372681, + "learning_rate": 6.546120113894981e-06, + "loss": 2.6225, + "step": 57137 + }, + { + "epoch": 2.8, + "grad_norm": 0.7452429533004761, + "learning_rate": 6.542921771031395e-06, + "loss": 2.729, + "step": 57138 + }, + { + "epoch": 2.8, + "grad_norm": 0.7587102651596069, + "learning_rate": 6.5397242010772454e-06, + "loss": 2.7456, + "step": 57139 + }, + { + "epoch": 2.8, + "grad_norm": 0.7959299087524414, + "learning_rate": 6.536527404041025e-06, + "loss": 3.0694, + "step": 57140 + }, + { + "epoch": 2.8, + "grad_norm": 0.7172296643257141, + "learning_rate": 6.533331379931061e-06, + "loss": 2.7504, + "step": 57141 + }, + { + "epoch": 2.8, + "grad_norm": 0.773472785949707, + "learning_rate": 6.5301361287558475e-06, + "loss": 2.8911, + "step": 57142 + }, + { + "epoch": 2.8, + "grad_norm": 0.7361921072006226, + "learning_rate": 6.526941650523776e-06, + "loss": 3.1896, + "step": 57143 + }, + { + "epoch": 2.8, + "grad_norm": 0.7535603046417236, + "learning_rate": 6.523747945243307e-06, + "loss": 2.7142, + "step": 57144 + }, + { + "epoch": 2.8, + "grad_norm": 0.7661078572273254, + "learning_rate": 6.520555012922768e-06, + "loss": 2.8727, + "step": 57145 + }, + { + "epoch": 2.8, + "grad_norm": 0.8038497567176819, + "learning_rate": 6.5173628535705846e-06, + "loss": 2.9623, + "step": 57146 + }, + { + "epoch": 2.8, + "grad_norm": 0.7339990139007568, + "learning_rate": 6.514171467195184e-06, + "loss": 2.9381, + "step": 57147 + }, + { + "epoch": 2.8, + "grad_norm": 0.7659273743629456, + "learning_rate": 6.510980853805026e-06, + "loss": 2.6611, + "step": 57148 + }, + { + "epoch": 2.8, + "grad_norm": 0.7171333432197571, + "learning_rate": 6.507791013408403e-06, + "loss": 2.8451, + "step": 57149 + }, + { + "epoch": 2.8, + "grad_norm": 0.7424218058586121, + "learning_rate": 6.50460194601381e-06, + "loss": 2.8667, + "step": 57150 + }, + { + "epoch": 2.8, + "grad_norm": 0.7660609483718872, + "learning_rate": 6.501413651629605e-06, + "loss": 2.9709, + "step": 57151 + }, + { + "epoch": 2.8, + "grad_norm": 0.7458734512329102, + "learning_rate": 6.498226130264183e-06, + "loss": 2.931, + "step": 57152 + }, + { + "epoch": 2.8, + "grad_norm": 0.7954019904136658, + "learning_rate": 6.495039381925971e-06, + "loss": 2.6553, + "step": 57153 + }, + { + "epoch": 2.8, + "grad_norm": 0.760521650314331, + "learning_rate": 6.491853406623293e-06, + "loss": 2.7921, + "step": 57154 + }, + { + "epoch": 2.8, + "grad_norm": 0.7642186284065247, + "learning_rate": 6.488668204364644e-06, + "loss": 2.8834, + "step": 57155 + }, + { + "epoch": 2.8, + "grad_norm": 0.7433639764785767, + "learning_rate": 6.485483775158317e-06, + "loss": 3.1153, + "step": 57156 + }, + { + "epoch": 2.8, + "grad_norm": 0.7498500943183899, + "learning_rate": 6.482300119012706e-06, + "loss": 2.9402, + "step": 57157 + }, + { + "epoch": 2.8, + "grad_norm": 0.7769145965576172, + "learning_rate": 6.479117235936304e-06, + "loss": 2.9213, + "step": 57158 + }, + { + "epoch": 2.8, + "grad_norm": 0.7752602100372314, + "learning_rate": 6.475935125937404e-06, + "loss": 3.0248, + "step": 57159 + }, + { + "epoch": 2.8, + "grad_norm": 0.7823769450187683, + "learning_rate": 6.472753789024365e-06, + "loss": 3.0033, + "step": 57160 + }, + { + "epoch": 2.8, + "grad_norm": 0.7235212922096252, + "learning_rate": 6.469573225205615e-06, + "loss": 2.8306, + "step": 57161 + }, + { + "epoch": 2.8, + "grad_norm": 0.7436525225639343, + "learning_rate": 6.466393434489514e-06, + "loss": 2.8062, + "step": 57162 + }, + { + "epoch": 2.8, + "grad_norm": 0.716906726360321, + "learning_rate": 6.463214416884455e-06, + "loss": 2.84, + "step": 57163 + }, + { + "epoch": 2.8, + "grad_norm": 0.7609844207763672, + "learning_rate": 6.460036172398764e-06, + "loss": 2.8809, + "step": 57164 + }, + { + "epoch": 2.8, + "grad_norm": 0.7516579031944275, + "learning_rate": 6.4568587010409015e-06, + "loss": 2.942, + "step": 57165 + }, + { + "epoch": 2.8, + "grad_norm": 0.7477497458457947, + "learning_rate": 6.453682002819161e-06, + "loss": 2.815, + "step": 57166 + }, + { + "epoch": 2.8, + "grad_norm": 0.7754057049751282, + "learning_rate": 6.450506077741901e-06, + "loss": 2.8088, + "step": 57167 + }, + { + "epoch": 2.8, + "grad_norm": 0.7623452544212341, + "learning_rate": 6.447330925817518e-06, + "loss": 2.8484, + "step": 57168 + }, + { + "epoch": 2.8, + "grad_norm": 0.7394402623176575, + "learning_rate": 6.444156547054369e-06, + "loss": 2.9405, + "step": 57169 + }, + { + "epoch": 2.8, + "grad_norm": 0.779207706451416, + "learning_rate": 6.440982941460848e-06, + "loss": 3.1458, + "step": 57170 + }, + { + "epoch": 2.8, + "grad_norm": 0.7456086874008179, + "learning_rate": 6.437810109045249e-06, + "loss": 3.0439, + "step": 57171 + }, + { + "epoch": 2.8, + "grad_norm": 0.7450244426727295, + "learning_rate": 6.434638049815966e-06, + "loss": 2.8157, + "step": 57172 + }, + { + "epoch": 2.8, + "grad_norm": 0.7540341019630432, + "learning_rate": 6.4314667637813565e-06, + "loss": 3.0942, + "step": 57173 + }, + { + "epoch": 2.8, + "grad_norm": 0.7208600044250488, + "learning_rate": 6.428296250949749e-06, + "loss": 2.9982, + "step": 57174 + }, + { + "epoch": 2.8, + "grad_norm": 0.7499627470970154, + "learning_rate": 6.425126511329537e-06, + "loss": 2.8532, + "step": 57175 + }, + { + "epoch": 2.8, + "grad_norm": 0.7473828196525574, + "learning_rate": 6.421957544929013e-06, + "loss": 2.9155, + "step": 57176 + }, + { + "epoch": 2.8, + "grad_norm": 0.7623142004013062, + "learning_rate": 6.41878935175657e-06, + "loss": 2.984, + "step": 57177 + }, + { + "epoch": 2.8, + "grad_norm": 0.7398785352706909, + "learning_rate": 6.415621931820536e-06, + "loss": 2.7673, + "step": 57178 + }, + { + "epoch": 2.8, + "grad_norm": 0.7367839813232422, + "learning_rate": 6.412455285129236e-06, + "loss": 3.085, + "step": 57179 + }, + { + "epoch": 2.8, + "grad_norm": 0.7085956335067749, + "learning_rate": 6.409289411691065e-06, + "loss": 2.8524, + "step": 57180 + }, + { + "epoch": 2.8, + "grad_norm": 0.741699755191803, + "learning_rate": 6.406124311514316e-06, + "loss": 2.7711, + "step": 57181 + }, + { + "epoch": 2.8, + "grad_norm": 0.7420092225074768, + "learning_rate": 6.402959984607281e-06, + "loss": 3.0438, + "step": 57182 + }, + { + "epoch": 2.8, + "grad_norm": 0.7586853504180908, + "learning_rate": 6.3997964309783875e-06, + "loss": 2.9436, + "step": 57183 + }, + { + "epoch": 2.8, + "grad_norm": 0.7290069460868835, + "learning_rate": 6.396633650635929e-06, + "loss": 2.8053, + "step": 57184 + }, + { + "epoch": 2.8, + "grad_norm": 0.7516114115715027, + "learning_rate": 6.393471643588233e-06, + "loss": 2.7453, + "step": 57185 + }, + { + "epoch": 2.8, + "grad_norm": 0.7613343596458435, + "learning_rate": 6.390310409843591e-06, + "loss": 2.723, + "step": 57186 + }, + { + "epoch": 2.8, + "grad_norm": 0.6871271729469299, + "learning_rate": 6.387149949410398e-06, + "loss": 2.9468, + "step": 57187 + }, + { + "epoch": 2.8, + "grad_norm": 0.7292281985282898, + "learning_rate": 6.383990262296945e-06, + "loss": 3.0544, + "step": 57188 + }, + { + "epoch": 2.8, + "grad_norm": 0.7564751505851746, + "learning_rate": 6.380831348511528e-06, + "loss": 2.873, + "step": 57189 + }, + { + "epoch": 2.8, + "grad_norm": 0.7569683194160461, + "learning_rate": 6.377673208062539e-06, + "loss": 2.8391, + "step": 57190 + }, + { + "epoch": 2.8, + "grad_norm": 0.7515416741371155, + "learning_rate": 6.374515840958206e-06, + "loss": 2.9058, + "step": 57191 + }, + { + "epoch": 2.8, + "grad_norm": 0.7558345794677734, + "learning_rate": 6.371359247206887e-06, + "loss": 2.8219, + "step": 57192 + }, + { + "epoch": 2.8, + "grad_norm": 0.7959738969802856, + "learning_rate": 6.368203426816909e-06, + "loss": 2.8152, + "step": 57193 + }, + { + "epoch": 2.8, + "grad_norm": 0.7814188599586487, + "learning_rate": 6.365048379796567e-06, + "loss": 3.0779, + "step": 57194 + }, + { + "epoch": 2.8, + "grad_norm": 0.7512837648391724, + "learning_rate": 6.361894106154253e-06, + "loss": 2.8264, + "step": 57195 + }, + { + "epoch": 2.8, + "grad_norm": 0.7334195375442505, + "learning_rate": 6.358740605898094e-06, + "loss": 2.921, + "step": 57196 + }, + { + "epoch": 2.8, + "grad_norm": 0.7542886137962341, + "learning_rate": 6.35558787903655e-06, + "loss": 2.8447, + "step": 57197 + }, + { + "epoch": 2.8, + "grad_norm": 0.7210304737091064, + "learning_rate": 6.352435925577848e-06, + "loss": 2.9945, + "step": 57198 + }, + { + "epoch": 2.8, + "grad_norm": 0.767892062664032, + "learning_rate": 6.349284745530347e-06, + "loss": 2.9185, + "step": 57199 + }, + { + "epoch": 2.8, + "grad_norm": 0.7845554351806641, + "learning_rate": 6.3461343389023086e-06, + "loss": 3.0824, + "step": 57200 + }, + { + "epoch": 2.8, + "grad_norm": 0.7469670176506042, + "learning_rate": 6.342984705701992e-06, + "loss": 2.7764, + "step": 57201 + }, + { + "epoch": 2.8, + "grad_norm": 0.7826364636421204, + "learning_rate": 6.339835845937791e-06, + "loss": 2.9049, + "step": 57202 + }, + { + "epoch": 2.8, + "grad_norm": 0.7540563344955444, + "learning_rate": 6.336687759617965e-06, + "loss": 2.9289, + "step": 57203 + }, + { + "epoch": 2.8, + "grad_norm": 0.7304716110229492, + "learning_rate": 6.33354044675074e-06, + "loss": 2.9662, + "step": 57204 + }, + { + "epoch": 2.8, + "grad_norm": 0.7389193773269653, + "learning_rate": 6.330393907344478e-06, + "loss": 2.9829, + "step": 57205 + }, + { + "epoch": 2.8, + "grad_norm": 0.8157293200492859, + "learning_rate": 6.3272481414074036e-06, + "loss": 3.1955, + "step": 57206 + }, + { + "epoch": 2.8, + "grad_norm": 0.775015652179718, + "learning_rate": 6.324103148947879e-06, + "loss": 2.8361, + "step": 57207 + }, + { + "epoch": 2.8, + "grad_norm": 0.7417725920677185, + "learning_rate": 6.320958929974096e-06, + "loss": 3.1331, + "step": 57208 + }, + { + "epoch": 2.8, + "grad_norm": 0.7489861845970154, + "learning_rate": 6.3178154844944484e-06, + "loss": 2.947, + "step": 57209 + }, + { + "epoch": 2.8, + "grad_norm": 0.7145342826843262, + "learning_rate": 6.31467281251713e-06, + "loss": 3.0041, + "step": 57210 + }, + { + "epoch": 2.8, + "grad_norm": 0.7301230430603027, + "learning_rate": 6.311530914050433e-06, + "loss": 2.8497, + "step": 57211 + }, + { + "epoch": 2.8, + "grad_norm": 0.8062174916267395, + "learning_rate": 6.3083897891026855e-06, + "loss": 2.843, + "step": 57212 + }, + { + "epoch": 2.8, + "grad_norm": 0.7669421434402466, + "learning_rate": 6.305249437682047e-06, + "loss": 2.9091, + "step": 57213 + }, + { + "epoch": 2.8, + "grad_norm": 0.7261912822723389, + "learning_rate": 6.302109859796878e-06, + "loss": 2.7274, + "step": 57214 + }, + { + "epoch": 2.8, + "grad_norm": 0.7557966709136963, + "learning_rate": 6.298971055455471e-06, + "loss": 2.7214, + "step": 57215 + }, + { + "epoch": 2.8, + "grad_norm": 0.776716947555542, + "learning_rate": 6.29583302466602e-06, + "loss": 3.1463, + "step": 57216 + }, + { + "epoch": 2.8, + "grad_norm": 0.7491167783737183, + "learning_rate": 6.292695767436817e-06, + "loss": 2.8682, + "step": 57217 + }, + { + "epoch": 2.8, + "grad_norm": 0.7415738701820374, + "learning_rate": 6.289559283776158e-06, + "loss": 2.9238, + "step": 57218 + }, + { + "epoch": 2.8, + "grad_norm": 0.731604278087616, + "learning_rate": 6.286423573692234e-06, + "loss": 2.946, + "step": 57219 + }, + { + "epoch": 2.8, + "grad_norm": 0.7397477626800537, + "learning_rate": 6.283288637193373e-06, + "loss": 2.7806, + "step": 57220 + }, + { + "epoch": 2.8, + "grad_norm": 0.761749804019928, + "learning_rate": 6.280154474287769e-06, + "loss": 2.791, + "step": 57221 + }, + { + "epoch": 2.8, + "grad_norm": 0.7503309845924377, + "learning_rate": 6.277021084983747e-06, + "loss": 2.8343, + "step": 57222 + }, + { + "epoch": 2.8, + "grad_norm": 0.7354109883308411, + "learning_rate": 6.273888469289501e-06, + "loss": 2.7524, + "step": 57223 + }, + { + "epoch": 2.8, + "grad_norm": 0.7324250936508179, + "learning_rate": 6.270756627213325e-06, + "loss": 2.9752, + "step": 57224 + }, + { + "epoch": 2.8, + "grad_norm": 0.7345625162124634, + "learning_rate": 6.267625558763445e-06, + "loss": 2.7768, + "step": 57225 + }, + { + "epoch": 2.8, + "grad_norm": 0.7356343269348145, + "learning_rate": 6.264495263948088e-06, + "loss": 2.7994, + "step": 57226 + }, + { + "epoch": 2.8, + "grad_norm": 0.72329181432724, + "learning_rate": 6.261365742775548e-06, + "loss": 2.7858, + "step": 57227 + }, + { + "epoch": 2.8, + "grad_norm": 0.7713742852210999, + "learning_rate": 6.258236995254018e-06, + "loss": 2.6734, + "step": 57228 + }, + { + "epoch": 2.8, + "grad_norm": 0.7680825591087341, + "learning_rate": 6.255109021391724e-06, + "loss": 3.1817, + "step": 57229 + }, + { + "epoch": 2.8, + "grad_norm": 0.7381764650344849, + "learning_rate": 6.251981821196994e-06, + "loss": 2.9703, + "step": 57230 + }, + { + "epoch": 2.8, + "grad_norm": 0.7881075739860535, + "learning_rate": 6.24885539467802e-06, + "loss": 2.8687, + "step": 57231 + }, + { + "epoch": 2.8, + "grad_norm": 0.7882162928581238, + "learning_rate": 6.245729741843031e-06, + "loss": 2.9799, + "step": 57232 + }, + { + "epoch": 2.8, + "grad_norm": 0.8076366782188416, + "learning_rate": 6.242604862700218e-06, + "loss": 2.7061, + "step": 57233 + }, + { + "epoch": 2.8, + "grad_norm": 0.7945789694786072, + "learning_rate": 6.239480757257842e-06, + "loss": 2.9682, + "step": 57234 + }, + { + "epoch": 2.8, + "grad_norm": 0.7510719895362854, + "learning_rate": 6.236357425524164e-06, + "loss": 3.0232, + "step": 57235 + }, + { + "epoch": 2.8, + "grad_norm": 0.7661550045013428, + "learning_rate": 6.233234867507375e-06, + "loss": 2.9934, + "step": 57236 + }, + { + "epoch": 2.81, + "grad_norm": 0.7666568160057068, + "learning_rate": 6.2301130832157046e-06, + "loss": 2.8113, + "step": 57237 + }, + { + "epoch": 2.81, + "grad_norm": 0.7529629468917847, + "learning_rate": 6.226992072657377e-06, + "loss": 2.7529, + "step": 57238 + }, + { + "epoch": 2.81, + "grad_norm": 0.7184775471687317, + "learning_rate": 6.223871835840655e-06, + "loss": 2.7139, + "step": 57239 + }, + { + "epoch": 2.81, + "grad_norm": 0.7889968752861023, + "learning_rate": 6.220752372773696e-06, + "loss": 3.0028, + "step": 57240 + }, + { + "epoch": 2.81, + "grad_norm": 0.7780910134315491, + "learning_rate": 6.217633683464696e-06, + "loss": 2.9095, + "step": 57241 + }, + { + "epoch": 2.81, + "grad_norm": 0.7787454724311829, + "learning_rate": 6.214515767921946e-06, + "loss": 2.9341, + "step": 57242 + }, + { + "epoch": 2.81, + "grad_norm": 0.7292056083679199, + "learning_rate": 6.211398626153607e-06, + "loss": 2.8772, + "step": 57243 + }, + { + "epoch": 2.81, + "grad_norm": 0.7210398316383362, + "learning_rate": 6.208282258167907e-06, + "loss": 2.7536, + "step": 57244 + }, + { + "epoch": 2.81, + "grad_norm": 0.7634195685386658, + "learning_rate": 6.205166663973038e-06, + "loss": 2.9861, + "step": 57245 + }, + { + "epoch": 2.81, + "grad_norm": 0.7329500317573547, + "learning_rate": 6.202051843577261e-06, + "loss": 3.0233, + "step": 57246 + }, + { + "epoch": 2.81, + "grad_norm": 0.7299970388412476, + "learning_rate": 6.198937796988734e-06, + "loss": 2.5595, + "step": 57247 + }, + { + "epoch": 2.81, + "grad_norm": 0.7431080937385559, + "learning_rate": 6.19582452421562e-06, + "loss": 2.9949, + "step": 57248 + }, + { + "epoch": 2.81, + "grad_norm": 0.7681297063827515, + "learning_rate": 6.1927120252661444e-06, + "loss": 3.1091, + "step": 57249 + }, + { + "epoch": 2.81, + "grad_norm": 0.7236172556877136, + "learning_rate": 6.189600300148567e-06, + "loss": 2.9795, + "step": 57250 + }, + { + "epoch": 2.81, + "grad_norm": 0.7395146489143372, + "learning_rate": 6.186489348871016e-06, + "loss": 3.0194, + "step": 57251 + }, + { + "epoch": 2.81, + "grad_norm": 0.7280433773994446, + "learning_rate": 6.183379171441716e-06, + "loss": 2.8707, + "step": 57252 + }, + { + "epoch": 2.81, + "grad_norm": 0.7451938390731812, + "learning_rate": 6.180269767868862e-06, + "loss": 2.9683, + "step": 57253 + }, + { + "epoch": 2.81, + "grad_norm": 0.7515122890472412, + "learning_rate": 6.177161138160614e-06, + "loss": 2.8751, + "step": 57254 + }, + { + "epoch": 2.81, + "grad_norm": 0.7310147881507874, + "learning_rate": 6.174053282325198e-06, + "loss": 2.7883, + "step": 57255 + }, + { + "epoch": 2.81, + "grad_norm": 0.7602556943893433, + "learning_rate": 6.170946200370741e-06, + "loss": 3.0935, + "step": 57256 + }, + { + "epoch": 2.81, + "grad_norm": 0.7617813348770142, + "learning_rate": 6.1678398923055035e-06, + "loss": 3.003, + "step": 57257 + }, + { + "epoch": 2.81, + "grad_norm": 0.7423365116119385, + "learning_rate": 6.164734358137579e-06, + "loss": 2.8405, + "step": 57258 + }, + { + "epoch": 2.81, + "grad_norm": 0.730692446231842, + "learning_rate": 6.161629597875195e-06, + "loss": 2.8546, + "step": 57259 + }, + { + "epoch": 2.81, + "grad_norm": 0.7807807922363281, + "learning_rate": 6.158525611526577e-06, + "loss": 2.837, + "step": 57260 + }, + { + "epoch": 2.81, + "grad_norm": 0.7491143345832825, + "learning_rate": 6.155422399099852e-06, + "loss": 3.056, + "step": 57261 + }, + { + "epoch": 2.81, + "grad_norm": 0.7525192499160767, + "learning_rate": 6.1523199606031806e-06, + "loss": 2.7682, + "step": 57262 + }, + { + "epoch": 2.81, + "grad_norm": 0.8082860708236694, + "learning_rate": 6.149218296044722e-06, + "loss": 2.7799, + "step": 57263 + }, + { + "epoch": 2.81, + "grad_norm": 0.7616897821426392, + "learning_rate": 6.146117405432638e-06, + "loss": 2.6663, + "step": 57264 + }, + { + "epoch": 2.81, + "grad_norm": 0.784067690372467, + "learning_rate": 6.143017288775187e-06, + "loss": 3.032, + "step": 57265 + }, + { + "epoch": 2.81, + "grad_norm": 0.773425817489624, + "learning_rate": 6.139917946080464e-06, + "loss": 2.8674, + "step": 57266 + }, + { + "epoch": 2.81, + "grad_norm": 0.7454070448875427, + "learning_rate": 6.136819377356628e-06, + "loss": 2.6727, + "step": 57267 + }, + { + "epoch": 2.81, + "grad_norm": 0.7858648896217346, + "learning_rate": 6.133721582611906e-06, + "loss": 2.9446, + "step": 57268 + }, + { + "epoch": 2.81, + "grad_norm": 0.7687196135520935, + "learning_rate": 6.130624561854325e-06, + "loss": 2.6409, + "step": 57269 + }, + { + "epoch": 2.81, + "grad_norm": 0.7145859599113464, + "learning_rate": 6.127528315092178e-06, + "loss": 3.2014, + "step": 57270 + }, + { + "epoch": 2.81, + "grad_norm": 0.7361599802970886, + "learning_rate": 6.1244328423335265e-06, + "loss": 3.015, + "step": 57271 + }, + { + "epoch": 2.81, + "grad_norm": 0.7314938902854919, + "learning_rate": 6.121338143586596e-06, + "loss": 3.0502, + "step": 57272 + }, + { + "epoch": 2.81, + "grad_norm": 0.7757865786552429, + "learning_rate": 6.11824421885948e-06, + "loss": 2.767, + "step": 57273 + }, + { + "epoch": 2.81, + "grad_norm": 0.7310301661491394, + "learning_rate": 6.115151068160373e-06, + "loss": 2.7884, + "step": 57274 + }, + { + "epoch": 2.81, + "grad_norm": 0.748052179813385, + "learning_rate": 6.112058691497401e-06, + "loss": 2.7668, + "step": 57275 + }, + { + "epoch": 2.81, + "grad_norm": 0.7008647918701172, + "learning_rate": 6.108967088878691e-06, + "loss": 2.8638, + "step": 57276 + }, + { + "epoch": 2.81, + "grad_norm": 0.7348527312278748, + "learning_rate": 6.105876260312404e-06, + "loss": 2.8012, + "step": 57277 + }, + { + "epoch": 2.81, + "grad_norm": 0.7351037859916687, + "learning_rate": 6.102786205806664e-06, + "loss": 2.9923, + "step": 57278 + }, + { + "epoch": 2.81, + "grad_norm": 0.7870645523071289, + "learning_rate": 6.099696925369635e-06, + "loss": 2.9804, + "step": 57279 + }, + { + "epoch": 2.81, + "grad_norm": 0.7273508310317993, + "learning_rate": 6.096608419009441e-06, + "loss": 2.9093, + "step": 57280 + }, + { + "epoch": 2.81, + "grad_norm": 0.7449030876159668, + "learning_rate": 6.09352068673421e-06, + "loss": 3.0118, + "step": 57281 + }, + { + "epoch": 2.81, + "grad_norm": 0.7714568972587585, + "learning_rate": 6.090433728552102e-06, + "loss": 3.0912, + "step": 57282 + }, + { + "epoch": 2.81, + "grad_norm": 0.7424775958061218, + "learning_rate": 6.08734754447121e-06, + "loss": 2.9796, + "step": 57283 + }, + { + "epoch": 2.81, + "grad_norm": 0.7018720507621765, + "learning_rate": 6.084262134499696e-06, + "loss": 2.7236, + "step": 57284 + }, + { + "epoch": 2.81, + "grad_norm": 0.8280545473098755, + "learning_rate": 6.081177498645651e-06, + "loss": 2.8014, + "step": 57285 + }, + { + "epoch": 2.81, + "grad_norm": 0.7586017847061157, + "learning_rate": 6.078093636917236e-06, + "loss": 2.9029, + "step": 57286 + }, + { + "epoch": 2.81, + "grad_norm": 0.7319121956825256, + "learning_rate": 6.075010549322545e-06, + "loss": 2.954, + "step": 57287 + }, + { + "epoch": 2.81, + "grad_norm": 0.7442706823348999, + "learning_rate": 6.071928235869705e-06, + "loss": 2.993, + "step": 57288 + }, + { + "epoch": 2.81, + "grad_norm": 0.7516229152679443, + "learning_rate": 6.068846696566842e-06, + "loss": 2.8055, + "step": 57289 + }, + { + "epoch": 2.81, + "grad_norm": 0.7649211883544922, + "learning_rate": 6.065765931422084e-06, + "loss": 2.8407, + "step": 57290 + }, + { + "epoch": 2.81, + "grad_norm": 0.7471636533737183, + "learning_rate": 6.062685940443523e-06, + "loss": 2.7969, + "step": 57291 + }, + { + "epoch": 2.81, + "grad_norm": 0.7413542866706848, + "learning_rate": 6.0596067236392874e-06, + "loss": 2.9349, + "step": 57292 + }, + { + "epoch": 2.81, + "grad_norm": 0.7625101804733276, + "learning_rate": 6.056528281017436e-06, + "loss": 3.0211, + "step": 57293 + }, + { + "epoch": 2.81, + "grad_norm": 0.7629085779190063, + "learning_rate": 6.0534506125861635e-06, + "loss": 2.8523, + "step": 57294 + }, + { + "epoch": 2.81, + "grad_norm": 0.7582331895828247, + "learning_rate": 6.050373718353496e-06, + "loss": 2.83, + "step": 57295 + }, + { + "epoch": 2.81, + "grad_norm": 0.7564645409584045, + "learning_rate": 6.047297598327561e-06, + "loss": 2.7993, + "step": 57296 + }, + { + "epoch": 2.81, + "grad_norm": 0.7481827735900879, + "learning_rate": 6.044222252516551e-06, + "loss": 2.9593, + "step": 57297 + }, + { + "epoch": 2.81, + "grad_norm": 0.7227786183357239, + "learning_rate": 6.041147680928427e-06, + "loss": 3.0073, + "step": 57298 + }, + { + "epoch": 2.81, + "grad_norm": 0.7628014087677002, + "learning_rate": 6.038073883571348e-06, + "loss": 3.0438, + "step": 57299 + }, + { + "epoch": 2.81, + "grad_norm": 0.7520178556442261, + "learning_rate": 6.035000860453409e-06, + "loss": 2.8691, + "step": 57300 + }, + { + "epoch": 2.81, + "grad_norm": 0.7767812013626099, + "learning_rate": 6.031928611582704e-06, + "loss": 2.8563, + "step": 57301 + }, + { + "epoch": 2.81, + "grad_norm": 0.738016664981842, + "learning_rate": 6.028857136967325e-06, + "loss": 2.9511, + "step": 57302 + }, + { + "epoch": 2.81, + "grad_norm": 0.7443080544471741, + "learning_rate": 6.0257864366153655e-06, + "loss": 2.7384, + "step": 57303 + }, + { + "epoch": 2.81, + "grad_norm": 0.7690123915672302, + "learning_rate": 6.02271651053492e-06, + "loss": 2.7267, + "step": 57304 + }, + { + "epoch": 2.81, + "grad_norm": 0.757183313369751, + "learning_rate": 6.019647358734048e-06, + "loss": 2.8482, + "step": 57305 + }, + { + "epoch": 2.81, + "grad_norm": 0.7601922750473022, + "learning_rate": 6.0165789812208435e-06, + "loss": 2.9199, + "step": 57306 + }, + { + "epoch": 2.81, + "grad_norm": 0.7741249799728394, + "learning_rate": 6.0135113780034e-06, + "loss": 3.0289, + "step": 57307 + }, + { + "epoch": 2.81, + "grad_norm": 0.7413105368614197, + "learning_rate": 6.010444549089777e-06, + "loss": 2.9368, + "step": 57308 + }, + { + "epoch": 2.81, + "grad_norm": 0.763995349407196, + "learning_rate": 6.007378494488102e-06, + "loss": 2.8091, + "step": 57309 + }, + { + "epoch": 2.81, + "grad_norm": 0.742984414100647, + "learning_rate": 6.004313214206369e-06, + "loss": 2.9815, + "step": 57310 + }, + { + "epoch": 2.81, + "grad_norm": 0.7137545347213745, + "learning_rate": 6.001248708252704e-06, + "loss": 2.8202, + "step": 57311 + }, + { + "epoch": 2.81, + "grad_norm": 0.74273282289505, + "learning_rate": 5.998184976635201e-06, + "loss": 2.8075, + "step": 57312 + }, + { + "epoch": 2.81, + "grad_norm": 0.7143562436103821, + "learning_rate": 5.995122019361853e-06, + "loss": 2.8815, + "step": 57313 + }, + { + "epoch": 2.81, + "grad_norm": 0.713367760181427, + "learning_rate": 5.992059836440788e-06, + "loss": 2.8194, + "step": 57314 + }, + { + "epoch": 2.81, + "grad_norm": 0.6914703845977783, + "learning_rate": 5.988998427880032e-06, + "loss": 2.8552, + "step": 57315 + }, + { + "epoch": 2.81, + "grad_norm": 0.7266353368759155, + "learning_rate": 5.985937793687679e-06, + "loss": 2.7779, + "step": 57316 + }, + { + "epoch": 2.81, + "grad_norm": 0.7659033536911011, + "learning_rate": 5.982877933871822e-06, + "loss": 3.0615, + "step": 57317 + }, + { + "epoch": 2.81, + "grad_norm": 0.740502119064331, + "learning_rate": 5.979818848440421e-06, + "loss": 2.8445, + "step": 57318 + }, + { + "epoch": 2.81, + "grad_norm": 0.7166577577590942, + "learning_rate": 5.976760537401637e-06, + "loss": 2.8114, + "step": 57319 + }, + { + "epoch": 2.81, + "grad_norm": 0.7707402110099792, + "learning_rate": 5.973703000763497e-06, + "loss": 2.8491, + "step": 57320 + }, + { + "epoch": 2.81, + "grad_norm": 0.7468615174293518, + "learning_rate": 5.9706462385339936e-06, + "loss": 2.8557, + "step": 57321 + }, + { + "epoch": 2.81, + "grad_norm": 0.7503215670585632, + "learning_rate": 5.967590250721222e-06, + "loss": 2.9284, + "step": 57322 + }, + { + "epoch": 2.81, + "grad_norm": 0.7697805166244507, + "learning_rate": 5.96453503733324e-06, + "loss": 2.8139, + "step": 57323 + }, + { + "epoch": 2.81, + "grad_norm": 0.7666358351707458, + "learning_rate": 5.96148059837811e-06, + "loss": 3.2273, + "step": 57324 + }, + { + "epoch": 2.81, + "grad_norm": 1.1106570959091187, + "learning_rate": 5.958426933863791e-06, + "loss": 2.9179, + "step": 57325 + }, + { + "epoch": 2.81, + "grad_norm": 0.7379875183105469, + "learning_rate": 5.955374043798444e-06, + "loss": 2.8579, + "step": 57326 + }, + { + "epoch": 2.81, + "grad_norm": 0.7157080173492432, + "learning_rate": 5.952321928190029e-06, + "loss": 2.6449, + "step": 57327 + }, + { + "epoch": 2.81, + "grad_norm": 0.7337937355041504, + "learning_rate": 5.949270587046606e-06, + "loss": 2.6561, + "step": 57328 + }, + { + "epoch": 2.81, + "grad_norm": 0.7767958641052246, + "learning_rate": 5.946220020376235e-06, + "loss": 2.7252, + "step": 57329 + }, + { + "epoch": 2.81, + "grad_norm": 0.699264645576477, + "learning_rate": 5.943170228186878e-06, + "loss": 2.7687, + "step": 57330 + }, + { + "epoch": 2.81, + "grad_norm": 0.769777238368988, + "learning_rate": 5.940121210486659e-06, + "loss": 3.0783, + "step": 57331 + }, + { + "epoch": 2.81, + "grad_norm": 0.7534322738647461, + "learning_rate": 5.93707296728354e-06, + "loss": 2.7708, + "step": 57332 + }, + { + "epoch": 2.81, + "grad_norm": 0.7632072567939758, + "learning_rate": 5.934025498585615e-06, + "loss": 2.759, + "step": 57333 + }, + { + "epoch": 2.81, + "grad_norm": 0.7482007741928101, + "learning_rate": 5.930978804400876e-06, + "loss": 3.1279, + "step": 57334 + }, + { + "epoch": 2.81, + "grad_norm": 0.7381764054298401, + "learning_rate": 5.927932884737318e-06, + "loss": 2.8895, + "step": 57335 + }, + { + "epoch": 2.81, + "grad_norm": 0.7939985394477844, + "learning_rate": 5.924887739602968e-06, + "loss": 2.8252, + "step": 57336 + }, + { + "epoch": 2.81, + "grad_norm": 0.7845645546913147, + "learning_rate": 5.921843369005885e-06, + "loss": 3.0684, + "step": 57337 + }, + { + "epoch": 2.81, + "grad_norm": 0.7730729579925537, + "learning_rate": 5.918799772954064e-06, + "loss": 3.0756, + "step": 57338 + }, + { + "epoch": 2.81, + "grad_norm": 0.7449852824211121, + "learning_rate": 5.9157569514555635e-06, + "loss": 3.0701, + "step": 57339 + }, + { + "epoch": 2.81, + "grad_norm": 0.7435621023178101, + "learning_rate": 5.912714904518312e-06, + "loss": 2.9434, + "step": 57340 + }, + { + "epoch": 2.81, + "grad_norm": 0.7603535652160645, + "learning_rate": 5.909673632150436e-06, + "loss": 3.0084, + "step": 57341 + }, + { + "epoch": 2.81, + "grad_norm": 0.7573125958442688, + "learning_rate": 5.9066331343598285e-06, + "loss": 2.795, + "step": 57342 + }, + { + "epoch": 2.81, + "grad_norm": 0.7497143745422363, + "learning_rate": 5.90359341115455e-06, + "loss": 2.8122, + "step": 57343 + }, + { + "epoch": 2.81, + "grad_norm": 0.7319371104240417, + "learning_rate": 5.900554462542662e-06, + "loss": 2.9278, + "step": 57344 + }, + { + "epoch": 2.81, + "grad_norm": 0.8178033828735352, + "learning_rate": 5.897516288532056e-06, + "loss": 2.8397, + "step": 57345 + }, + { + "epoch": 2.81, + "grad_norm": 0.8565611243247986, + "learning_rate": 5.894478889130794e-06, + "loss": 2.8193, + "step": 57346 + }, + { + "epoch": 2.81, + "grad_norm": 0.744626522064209, + "learning_rate": 5.8914422643468685e-06, + "loss": 2.9301, + "step": 57347 + }, + { + "epoch": 2.81, + "grad_norm": 0.7861790060997009, + "learning_rate": 5.88840641418834e-06, + "loss": 2.8523, + "step": 57348 + }, + { + "epoch": 2.81, + "grad_norm": 0.7643438577651978, + "learning_rate": 5.885371338663103e-06, + "loss": 2.8766, + "step": 57349 + }, + { + "epoch": 2.81, + "grad_norm": 0.8093913197517395, + "learning_rate": 5.882337037779183e-06, + "loss": 3.0652, + "step": 57350 + }, + { + "epoch": 2.81, + "grad_norm": 0.7430794835090637, + "learning_rate": 5.879303511544576e-06, + "loss": 2.8223, + "step": 57351 + }, + { + "epoch": 2.81, + "grad_norm": 0.8071761131286621, + "learning_rate": 5.876270759967305e-06, + "loss": 2.9618, + "step": 57352 + }, + { + "epoch": 2.81, + "grad_norm": 0.7300369143486023, + "learning_rate": 5.873238783055334e-06, + "loss": 2.9846, + "step": 57353 + }, + { + "epoch": 2.81, + "grad_norm": 0.7400018572807312, + "learning_rate": 5.870207580816655e-06, + "loss": 2.6373, + "step": 57354 + }, + { + "epoch": 2.81, + "grad_norm": 0.7521073818206787, + "learning_rate": 5.8671771532592284e-06, + "loss": 2.9761, + "step": 57355 + }, + { + "epoch": 2.81, + "grad_norm": 0.7645672559738159, + "learning_rate": 5.864147500391047e-06, + "loss": 2.6966, + "step": 57356 + }, + { + "epoch": 2.81, + "grad_norm": 0.7233006358146667, + "learning_rate": 5.861118622220073e-06, + "loss": 2.8905, + "step": 57357 + }, + { + "epoch": 2.81, + "grad_norm": 0.7672364115715027, + "learning_rate": 5.858090518754332e-06, + "loss": 2.6379, + "step": 57358 + }, + { + "epoch": 2.81, + "grad_norm": 0.7448053359985352, + "learning_rate": 5.855063190001785e-06, + "loss": 2.89, + "step": 57359 + }, + { + "epoch": 2.81, + "grad_norm": 0.7966521978378296, + "learning_rate": 5.8520366359703585e-06, + "loss": 2.9453, + "step": 57360 + }, + { + "epoch": 2.81, + "grad_norm": 0.851170003414154, + "learning_rate": 5.849010856668079e-06, + "loss": 2.8829, + "step": 57361 + }, + { + "epoch": 2.81, + "grad_norm": 0.8057568073272705, + "learning_rate": 5.845985852102874e-06, + "loss": 2.6775, + "step": 57362 + }, + { + "epoch": 2.81, + "grad_norm": 0.7434202432632446, + "learning_rate": 5.842961622282738e-06, + "loss": 2.8152, + "step": 57363 + }, + { + "epoch": 2.81, + "grad_norm": 0.846804678440094, + "learning_rate": 5.839938167215663e-06, + "loss": 3.005, + "step": 57364 + }, + { + "epoch": 2.81, + "grad_norm": 0.7839913368225098, + "learning_rate": 5.836915486909544e-06, + "loss": 2.8143, + "step": 57365 + }, + { + "epoch": 2.81, + "grad_norm": 0.7269675731658936, + "learning_rate": 5.833893581372373e-06, + "loss": 2.9256, + "step": 57366 + }, + { + "epoch": 2.81, + "grad_norm": 0.7645124793052673, + "learning_rate": 5.830872450612112e-06, + "loss": 2.9203, + "step": 57367 + }, + { + "epoch": 2.81, + "grad_norm": 0.7519145011901855, + "learning_rate": 5.82785209463672e-06, + "loss": 2.7965, + "step": 57368 + }, + { + "epoch": 2.81, + "grad_norm": 0.7483285665512085, + "learning_rate": 5.824832513454192e-06, + "loss": 2.703, + "step": 57369 + }, + { + "epoch": 2.81, + "grad_norm": 0.767500638961792, + "learning_rate": 5.82181370707242e-06, + "loss": 2.6382, + "step": 57370 + }, + { + "epoch": 2.81, + "grad_norm": 0.7527577877044678, + "learning_rate": 5.8187956754993655e-06, + "loss": 2.8592, + "step": 57371 + }, + { + "epoch": 2.81, + "grad_norm": 0.7957414984703064, + "learning_rate": 5.815778418742989e-06, + "loss": 2.7785, + "step": 57372 + }, + { + "epoch": 2.81, + "grad_norm": 0.7185649871826172, + "learning_rate": 5.812761936811217e-06, + "loss": 2.8294, + "step": 57373 + }, + { + "epoch": 2.81, + "grad_norm": 0.7688578367233276, + "learning_rate": 5.8097462297120425e-06, + "loss": 2.8467, + "step": 57374 + }, + { + "epoch": 2.81, + "grad_norm": 0.7571620941162109, + "learning_rate": 5.806731297453326e-06, + "loss": 2.8394, + "step": 57375 + }, + { + "epoch": 2.81, + "grad_norm": 0.7824880480766296, + "learning_rate": 5.803717140043129e-06, + "loss": 2.8248, + "step": 57376 + }, + { + "epoch": 2.81, + "grad_norm": 0.7680121064186096, + "learning_rate": 5.800703757489311e-06, + "loss": 3.0829, + "step": 57377 + }, + { + "epoch": 2.81, + "grad_norm": 0.7361317276954651, + "learning_rate": 5.7976911497997655e-06, + "loss": 2.8745, + "step": 57378 + }, + { + "epoch": 2.81, + "grad_norm": 0.7746545076370239, + "learning_rate": 5.79467931698252e-06, + "loss": 2.9367, + "step": 57379 + }, + { + "epoch": 2.81, + "grad_norm": 0.7424500584602356, + "learning_rate": 5.791668259045468e-06, + "loss": 2.7415, + "step": 57380 + }, + { + "epoch": 2.81, + "grad_norm": 0.9100118279457092, + "learning_rate": 5.788657975996536e-06, + "loss": 3.0618, + "step": 57381 + }, + { + "epoch": 2.81, + "grad_norm": 0.768410325050354, + "learning_rate": 5.785648467843651e-06, + "loss": 2.8411, + "step": 57382 + }, + { + "epoch": 2.81, + "grad_norm": 0.7742539644241333, + "learning_rate": 5.782639734594741e-06, + "loss": 2.8681, + "step": 57383 + }, + { + "epoch": 2.81, + "grad_norm": 0.7879564762115479, + "learning_rate": 5.779631776257765e-06, + "loss": 2.8448, + "step": 57384 + }, + { + "epoch": 2.81, + "grad_norm": 0.7167067527770996, + "learning_rate": 5.776624592840584e-06, + "loss": 2.8608, + "step": 57385 + }, + { + "epoch": 2.81, + "grad_norm": 0.7402490377426147, + "learning_rate": 5.773618184351192e-06, + "loss": 3.1005, + "step": 57386 + }, + { + "epoch": 2.81, + "grad_norm": 0.7294678092002869, + "learning_rate": 5.770612550797416e-06, + "loss": 2.8117, + "step": 57387 + }, + { + "epoch": 2.81, + "grad_norm": 0.7386787533760071, + "learning_rate": 5.767607692187215e-06, + "loss": 2.7278, + "step": 57388 + }, + { + "epoch": 2.81, + "grad_norm": 0.7458301782608032, + "learning_rate": 5.7646036085285505e-06, + "loss": 2.8675, + "step": 57389 + }, + { + "epoch": 2.81, + "grad_norm": 0.7428733706474304, + "learning_rate": 5.76160029982925e-06, + "loss": 2.8125, + "step": 57390 + }, + { + "epoch": 2.81, + "grad_norm": 0.7372967004776001, + "learning_rate": 5.758597766097306e-06, + "loss": 3.0284, + "step": 57391 + }, + { + "epoch": 2.81, + "grad_norm": 0.7986836433410645, + "learning_rate": 5.755596007340579e-06, + "loss": 2.8643, + "step": 57392 + }, + { + "epoch": 2.81, + "grad_norm": 0.7363787293434143, + "learning_rate": 5.752595023566964e-06, + "loss": 3.0305, + "step": 57393 + }, + { + "epoch": 2.81, + "grad_norm": 0.7744364738464355, + "learning_rate": 5.7495948147844194e-06, + "loss": 2.8855, + "step": 57394 + }, + { + "epoch": 2.81, + "grad_norm": 0.7604561448097229, + "learning_rate": 5.746595381000774e-06, + "loss": 2.8601, + "step": 57395 + }, + { + "epoch": 2.81, + "grad_norm": 0.7391270399093628, + "learning_rate": 5.74359672222402e-06, + "loss": 2.9664, + "step": 57396 + }, + { + "epoch": 2.81, + "grad_norm": 0.7320667505264282, + "learning_rate": 5.740598838461951e-06, + "loss": 3.0343, + "step": 57397 + }, + { + "epoch": 2.81, + "grad_norm": 0.7917554378509521, + "learning_rate": 5.73760172972253e-06, + "loss": 2.9968, + "step": 57398 + }, + { + "epoch": 2.81, + "grad_norm": 0.772669792175293, + "learning_rate": 5.734605396013647e-06, + "loss": 2.9576, + "step": 57399 + }, + { + "epoch": 2.81, + "grad_norm": 0.7252935767173767, + "learning_rate": 5.731609837343132e-06, + "loss": 3.0033, + "step": 57400 + }, + { + "epoch": 2.81, + "grad_norm": 0.7601112127304077, + "learning_rate": 5.728615053718977e-06, + "loss": 3.03, + "step": 57401 + }, + { + "epoch": 2.81, + "grad_norm": 0.7780544757843018, + "learning_rate": 5.725621045148976e-06, + "loss": 2.9666, + "step": 57402 + }, + { + "epoch": 2.81, + "grad_norm": 0.760009229183197, + "learning_rate": 5.72262781164109e-06, + "loss": 2.9728, + "step": 57403 + }, + { + "epoch": 2.81, + "grad_norm": 0.7839109301567078, + "learning_rate": 5.719635353203145e-06, + "loss": 3.0657, + "step": 57404 + }, + { + "epoch": 2.81, + "grad_norm": 0.7375196218490601, + "learning_rate": 5.716643669843035e-06, + "loss": 2.8473, + "step": 57405 + }, + { + "epoch": 2.81, + "grad_norm": 0.7924519777297974, + "learning_rate": 5.713652761568688e-06, + "loss": 2.8607, + "step": 57406 + }, + { + "epoch": 2.81, + "grad_norm": 0.7020806074142456, + "learning_rate": 5.71066262838793e-06, + "loss": 2.9978, + "step": 57407 + }, + { + "epoch": 2.81, + "grad_norm": 0.744614839553833, + "learning_rate": 5.707673270308655e-06, + "loss": 3.1015, + "step": 57408 + }, + { + "epoch": 2.81, + "grad_norm": 0.7562986612319946, + "learning_rate": 5.704684687338723e-06, + "loss": 2.7133, + "step": 57409 + }, + { + "epoch": 2.81, + "grad_norm": 0.7199798226356506, + "learning_rate": 5.7016968794859955e-06, + "loss": 2.8839, + "step": 57410 + }, + { + "epoch": 2.81, + "grad_norm": 0.7654950022697449, + "learning_rate": 5.698709846758364e-06, + "loss": 2.9377, + "step": 57411 + }, + { + "epoch": 2.81, + "grad_norm": 0.737543523311615, + "learning_rate": 5.695723589163692e-06, + "loss": 2.7927, + "step": 57412 + }, + { + "epoch": 2.81, + "grad_norm": 0.7589178681373596, + "learning_rate": 5.692738106709871e-06, + "loss": 2.9463, + "step": 57413 + }, + { + "epoch": 2.81, + "grad_norm": 0.7573418021202087, + "learning_rate": 5.689753399404729e-06, + "loss": 2.8789, + "step": 57414 + }, + { + "epoch": 2.81, + "grad_norm": 0.7789357900619507, + "learning_rate": 5.686769467256125e-06, + "loss": 2.8112, + "step": 57415 + }, + { + "epoch": 2.81, + "grad_norm": 0.7354355454444885, + "learning_rate": 5.683786310271921e-06, + "loss": 2.6794, + "step": 57416 + }, + { + "epoch": 2.81, + "grad_norm": 0.7149415016174316, + "learning_rate": 5.680803928459976e-06, + "loss": 2.7977, + "step": 57417 + }, + { + "epoch": 2.81, + "grad_norm": 0.7695300579071045, + "learning_rate": 5.677822321828152e-06, + "loss": 2.9736, + "step": 57418 + }, + { + "epoch": 2.81, + "grad_norm": 0.7486740350723267, + "learning_rate": 5.674841490384341e-06, + "loss": 2.9578, + "step": 57419 + }, + { + "epoch": 2.81, + "grad_norm": 0.8076779842376709, + "learning_rate": 5.671861434136304e-06, + "loss": 2.9316, + "step": 57420 + }, + { + "epoch": 2.81, + "grad_norm": 0.766984224319458, + "learning_rate": 5.668882153091969e-06, + "loss": 2.9472, + "step": 57421 + }, + { + "epoch": 2.81, + "grad_norm": 0.7838339805603027, + "learning_rate": 5.665903647259162e-06, + "loss": 2.7034, + "step": 57422 + }, + { + "epoch": 2.81, + "grad_norm": 0.764181911945343, + "learning_rate": 5.662925916645677e-06, + "loss": 2.7349, + "step": 57423 + }, + { + "epoch": 2.81, + "grad_norm": 0.7412382960319519, + "learning_rate": 5.659948961259442e-06, + "loss": 3.072, + "step": 57424 + }, + { + "epoch": 2.81, + "grad_norm": 0.7331834435462952, + "learning_rate": 5.656972781108216e-06, + "loss": 2.9661, + "step": 57425 + }, + { + "epoch": 2.81, + "grad_norm": 0.7448732852935791, + "learning_rate": 5.653997376199925e-06, + "loss": 2.9843, + "step": 57426 + }, + { + "epoch": 2.81, + "grad_norm": 0.739306628704071, + "learning_rate": 5.6510227465423e-06, + "loss": 2.8614, + "step": 57427 + }, + { + "epoch": 2.81, + "grad_norm": 0.8040658235549927, + "learning_rate": 5.6480488921432645e-06, + "loss": 2.9276, + "step": 57428 + }, + { + "epoch": 2.81, + "grad_norm": 0.7358850240707397, + "learning_rate": 5.645075813010647e-06, + "loss": 2.9018, + "step": 57429 + }, + { + "epoch": 2.81, + "grad_norm": 0.7447613477706909, + "learning_rate": 5.642103509152207e-06, + "loss": 2.9166, + "step": 57430 + }, + { + "epoch": 2.81, + "grad_norm": 0.8018415570259094, + "learning_rate": 5.6391319805758396e-06, + "loss": 2.8888, + "step": 57431 + }, + { + "epoch": 2.81, + "grad_norm": 0.7493090033531189, + "learning_rate": 5.6361612272893045e-06, + "loss": 2.8736, + "step": 57432 + }, + { + "epoch": 2.81, + "grad_norm": 0.770797073841095, + "learning_rate": 5.633191249300529e-06, + "loss": 2.8206, + "step": 57433 + }, + { + "epoch": 2.81, + "grad_norm": 0.7289892435073853, + "learning_rate": 5.630222046617205e-06, + "loss": 2.8424, + "step": 57434 + }, + { + "epoch": 2.81, + "grad_norm": 0.7826147079467773, + "learning_rate": 5.627253619247296e-06, + "loss": 2.9898, + "step": 57435 + }, + { + "epoch": 2.81, + "grad_norm": 0.7384172081947327, + "learning_rate": 5.624285967198528e-06, + "loss": 2.7443, + "step": 57436 + }, + { + "epoch": 2.81, + "grad_norm": 0.7605634927749634, + "learning_rate": 5.621319090478693e-06, + "loss": 2.8757, + "step": 57437 + }, + { + "epoch": 2.81, + "grad_norm": 0.7587273120880127, + "learning_rate": 5.6183529890957205e-06, + "loss": 2.8125, + "step": 57438 + }, + { + "epoch": 2.81, + "grad_norm": 0.6983641982078552, + "learning_rate": 5.615387663057302e-06, + "loss": 2.8265, + "step": 57439 + }, + { + "epoch": 2.81, + "grad_norm": 0.7505119442939758, + "learning_rate": 5.612423112371267e-06, + "loss": 3.1734, + "step": 57440 + }, + { + "epoch": 2.82, + "grad_norm": 0.7802063822746277, + "learning_rate": 5.609459337045507e-06, + "loss": 3.1378, + "step": 57441 + }, + { + "epoch": 2.82, + "grad_norm": 0.7453617453575134, + "learning_rate": 5.606496337087751e-06, + "loss": 2.9801, + "step": 57442 + }, + { + "epoch": 2.82, + "grad_norm": 0.7429621815681458, + "learning_rate": 5.603534112505859e-06, + "loss": 3.0071, + "step": 57443 + }, + { + "epoch": 2.82, + "grad_norm": 0.7750768661499023, + "learning_rate": 5.60057266330759e-06, + "loss": 2.9811, + "step": 57444 + }, + { + "epoch": 2.82, + "grad_norm": 0.767142117023468, + "learning_rate": 5.5976119895007385e-06, + "loss": 2.6831, + "step": 57445 + }, + { + "epoch": 2.82, + "grad_norm": 0.8097765445709229, + "learning_rate": 5.594652091093133e-06, + "loss": 3.0158, + "step": 57446 + }, + { + "epoch": 2.82, + "grad_norm": 0.735511064529419, + "learning_rate": 5.591692968092532e-06, + "loss": 2.8753, + "step": 57447 + }, + { + "epoch": 2.82, + "grad_norm": 0.778668224811554, + "learning_rate": 5.588734620506763e-06, + "loss": 2.624, + "step": 57448 + }, + { + "epoch": 2.82, + "grad_norm": 0.7237783670425415, + "learning_rate": 5.585777048343587e-06, + "loss": 2.9627, + "step": 57449 + }, + { + "epoch": 2.82, + "grad_norm": 0.7732894420623779, + "learning_rate": 5.582820251610864e-06, + "loss": 2.9355, + "step": 57450 + }, + { + "epoch": 2.82, + "grad_norm": 0.7555185556411743, + "learning_rate": 5.5798642303163225e-06, + "loss": 2.781, + "step": 57451 + }, + { + "epoch": 2.82, + "grad_norm": 0.7874925136566162, + "learning_rate": 5.576908984467721e-06, + "loss": 2.9256, + "step": 57452 + }, + { + "epoch": 2.82, + "grad_norm": 0.7143713235855103, + "learning_rate": 5.573954514072887e-06, + "loss": 2.8541, + "step": 57453 + }, + { + "epoch": 2.82, + "grad_norm": 0.7951681613922119, + "learning_rate": 5.571000819139582e-06, + "loss": 2.9457, + "step": 57454 + }, + { + "epoch": 2.82, + "grad_norm": 0.7409695982933044, + "learning_rate": 5.568047899675599e-06, + "loss": 3.0258, + "step": 57455 + }, + { + "epoch": 2.82, + "grad_norm": 0.7536093592643738, + "learning_rate": 5.565095755688731e-06, + "loss": 2.9839, + "step": 57456 + }, + { + "epoch": 2.82, + "grad_norm": 0.7845450639724731, + "learning_rate": 5.56214438718674e-06, + "loss": 2.8603, + "step": 57457 + }, + { + "epoch": 2.82, + "grad_norm": 0.7714955806732178, + "learning_rate": 5.559193794177419e-06, + "loss": 3.0423, + "step": 57458 + }, + { + "epoch": 2.82, + "grad_norm": 0.7862733602523804, + "learning_rate": 5.556243976668462e-06, + "loss": 2.7072, + "step": 57459 + }, + { + "epoch": 2.82, + "grad_norm": 0.7352171540260315, + "learning_rate": 5.553294934667696e-06, + "loss": 3.0502, + "step": 57460 + }, + { + "epoch": 2.82, + "grad_norm": 0.7902106642723083, + "learning_rate": 5.5503466681829146e-06, + "loss": 2.8126, + "step": 57461 + }, + { + "epoch": 2.82, + "grad_norm": 0.7630064487457275, + "learning_rate": 5.547399177221812e-06, + "loss": 2.8373, + "step": 57462 + }, + { + "epoch": 2.82, + "grad_norm": 0.7489845156669617, + "learning_rate": 5.544452461792248e-06, + "loss": 2.9392, + "step": 57463 + }, + { + "epoch": 2.82, + "grad_norm": 0.7420606017112732, + "learning_rate": 5.541506521901884e-06, + "loss": 2.9887, + "step": 57464 + }, + { + "epoch": 2.82, + "grad_norm": 0.7396260499954224, + "learning_rate": 5.538561357558546e-06, + "loss": 2.5488, + "step": 57465 + }, + { + "epoch": 2.82, + "grad_norm": 0.7430559992790222, + "learning_rate": 5.535616968769963e-06, + "loss": 2.9337, + "step": 57466 + }, + { + "epoch": 2.82, + "grad_norm": 0.7690236568450928, + "learning_rate": 5.53267335554386e-06, + "loss": 2.9588, + "step": 57467 + }, + { + "epoch": 2.82, + "grad_norm": 0.733445405960083, + "learning_rate": 5.529730517888064e-06, + "loss": 2.8503, + "step": 57468 + }, + { + "epoch": 2.82, + "grad_norm": 0.7059511542320251, + "learning_rate": 5.526788455810271e-06, + "loss": 2.9426, + "step": 57469 + }, + { + "epoch": 2.82, + "grad_norm": 0.7520898580551147, + "learning_rate": 5.52384716931824e-06, + "loss": 2.9003, + "step": 57470 + }, + { + "epoch": 2.82, + "grad_norm": 0.8237969875335693, + "learning_rate": 5.520906658419733e-06, + "loss": 2.8435, + "step": 57471 + }, + { + "epoch": 2.82, + "grad_norm": 0.7647461891174316, + "learning_rate": 5.517966923122508e-06, + "loss": 2.9031, + "step": 57472 + }, + { + "epoch": 2.82, + "grad_norm": 0.7231850624084473, + "learning_rate": 5.51502796343426e-06, + "loss": 2.7187, + "step": 57473 + }, + { + "epoch": 2.82, + "grad_norm": 0.6980883479118347, + "learning_rate": 5.512089779362716e-06, + "loss": 2.8957, + "step": 57474 + }, + { + "epoch": 2.82, + "grad_norm": 0.7455374598503113, + "learning_rate": 5.50915237091567e-06, + "loss": 2.9012, + "step": 57475 + }, + { + "epoch": 2.82, + "grad_norm": 0.7731330990791321, + "learning_rate": 5.506215738100883e-06, + "loss": 3.0828, + "step": 57476 + }, + { + "epoch": 2.82, + "grad_norm": 0.7092301249504089, + "learning_rate": 5.503279880926015e-06, + "loss": 2.9492, + "step": 57477 + }, + { + "epoch": 2.82, + "grad_norm": 0.7573080062866211, + "learning_rate": 5.5003447993988255e-06, + "loss": 2.8745, + "step": 57478 + }, + { + "epoch": 2.82, + "grad_norm": 0.7752591371536255, + "learning_rate": 5.497410493527077e-06, + "loss": 2.9229, + "step": 57479 + }, + { + "epoch": 2.82, + "grad_norm": 0.7782385945320129, + "learning_rate": 5.4944769633184616e-06, + "loss": 3.0516, + "step": 57480 + }, + { + "epoch": 2.82, + "grad_norm": 0.7580782175064087, + "learning_rate": 5.491544208780707e-06, + "loss": 2.9722, + "step": 57481 + }, + { + "epoch": 2.82, + "grad_norm": 0.749438464641571, + "learning_rate": 5.488612229921541e-06, + "loss": 2.9657, + "step": 57482 + }, + { + "epoch": 2.82, + "grad_norm": 0.7620990872383118, + "learning_rate": 5.485681026748723e-06, + "loss": 2.8918, + "step": 57483 + }, + { + "epoch": 2.82, + "grad_norm": 0.7435450553894043, + "learning_rate": 5.48275059926988e-06, + "loss": 2.8467, + "step": 57484 + }, + { + "epoch": 2.82, + "grad_norm": 0.7541936635971069, + "learning_rate": 5.4798209474928075e-06, + "loss": 2.6854, + "step": 57485 + }, + { + "epoch": 2.82, + "grad_norm": 0.7550050020217896, + "learning_rate": 5.4768920714252315e-06, + "loss": 2.6602, + "step": 57486 + }, + { + "epoch": 2.82, + "grad_norm": 0.7599036693572998, + "learning_rate": 5.473963971074846e-06, + "loss": 2.8434, + "step": 57487 + }, + { + "epoch": 2.82, + "grad_norm": 0.714177668094635, + "learning_rate": 5.471036646449378e-06, + "loss": 2.844, + "step": 57488 + }, + { + "epoch": 2.82, + "grad_norm": 0.7540480494499207, + "learning_rate": 5.468110097556455e-06, + "loss": 2.8204, + "step": 57489 + }, + { + "epoch": 2.82, + "grad_norm": 0.7340836524963379, + "learning_rate": 5.4651843244038705e-06, + "loss": 2.8181, + "step": 57490 + }, + { + "epoch": 2.82, + "grad_norm": 0.7581061124801636, + "learning_rate": 5.462259326999319e-06, + "loss": 2.8814, + "step": 57491 + }, + { + "epoch": 2.82, + "grad_norm": 0.834175169467926, + "learning_rate": 5.45933510535046e-06, + "loss": 2.9842, + "step": 57492 + }, + { + "epoch": 2.82, + "grad_norm": 0.7358565330505371, + "learning_rate": 5.456411659465054e-06, + "loss": 2.8919, + "step": 57493 + }, + { + "epoch": 2.82, + "grad_norm": 0.7109126448631287, + "learning_rate": 5.453488989350762e-06, + "loss": 2.7427, + "step": 57494 + }, + { + "epoch": 2.82, + "grad_norm": 0.724243700504303, + "learning_rate": 5.450567095015312e-06, + "loss": 2.5712, + "step": 57495 + }, + { + "epoch": 2.82, + "grad_norm": 0.7592142224311829, + "learning_rate": 5.447645976466364e-06, + "loss": 2.9613, + "step": 57496 + }, + { + "epoch": 2.82, + "grad_norm": 0.7279868125915527, + "learning_rate": 5.444725633711611e-06, + "loss": 2.8802, + "step": 57497 + }, + { + "epoch": 2.82, + "grad_norm": 0.7003878355026245, + "learning_rate": 5.4418060667588135e-06, + "loss": 3.1777, + "step": 57498 + }, + { + "epoch": 2.82, + "grad_norm": 0.7382853031158447, + "learning_rate": 5.438887275615566e-06, + "loss": 2.8485, + "step": 57499 + }, + { + "epoch": 2.82, + "grad_norm": 0.7092037200927734, + "learning_rate": 5.43596926028963e-06, + "loss": 2.8656, + "step": 57500 + }, + { + "epoch": 2.82, + "grad_norm": 0.756615936756134, + "learning_rate": 5.433052020788631e-06, + "loss": 3.0391, + "step": 57501 + }, + { + "epoch": 2.82, + "grad_norm": 0.7301920056343079, + "learning_rate": 5.4301355571202965e-06, + "loss": 2.8625, + "step": 57502 + }, + { + "epoch": 2.82, + "grad_norm": 0.753453254699707, + "learning_rate": 5.427219869292288e-06, + "loss": 2.8667, + "step": 57503 + }, + { + "epoch": 2.82, + "grad_norm": 0.8032575845718384, + "learning_rate": 5.424304957312298e-06, + "loss": 2.8938, + "step": 57504 + }, + { + "epoch": 2.82, + "grad_norm": 0.7729468941688538, + "learning_rate": 5.421390821187988e-06, + "loss": 3.0574, + "step": 57505 + }, + { + "epoch": 2.82, + "grad_norm": 0.7835939526557922, + "learning_rate": 5.418477460927051e-06, + "loss": 2.7424, + "step": 57506 + }, + { + "epoch": 2.82, + "grad_norm": 0.7130961418151855, + "learning_rate": 5.415564876537149e-06, + "loss": 2.6947, + "step": 57507 + }, + { + "epoch": 2.82, + "grad_norm": 0.7707082033157349, + "learning_rate": 5.412653068025941e-06, + "loss": 2.8117, + "step": 57508 + }, + { + "epoch": 2.82, + "grad_norm": 0.7709498405456543, + "learning_rate": 5.409742035401154e-06, + "loss": 2.7995, + "step": 57509 + }, + { + "epoch": 2.82, + "grad_norm": 0.7126640677452087, + "learning_rate": 5.406831778670351e-06, + "loss": 2.8213, + "step": 57510 + }, + { + "epoch": 2.82, + "grad_norm": 0.7614896893501282, + "learning_rate": 5.403922297841323e-06, + "loss": 2.8555, + "step": 57511 + }, + { + "epoch": 2.82, + "grad_norm": 0.7267091870307922, + "learning_rate": 5.4010135929216e-06, + "loss": 2.9359, + "step": 57512 + }, + { + "epoch": 2.82, + "grad_norm": 0.7336955070495605, + "learning_rate": 5.398105663918972e-06, + "loss": 2.9989, + "step": 57513 + }, + { + "epoch": 2.82, + "grad_norm": 0.7180055975914001, + "learning_rate": 5.39519851084097e-06, + "loss": 3.036, + "step": 57514 + }, + { + "epoch": 2.82, + "grad_norm": 0.8044130206108093, + "learning_rate": 5.392292133695386e-06, + "loss": 2.8144, + "step": 57515 + }, + { + "epoch": 2.82, + "grad_norm": 0.7591328024864197, + "learning_rate": 5.389386532489781e-06, + "loss": 2.801, + "step": 57516 + }, + { + "epoch": 2.82, + "grad_norm": 0.7301998138427734, + "learning_rate": 5.386481707231816e-06, + "loss": 2.7639, + "step": 57517 + }, + { + "epoch": 2.82, + "grad_norm": 0.7751278281211853, + "learning_rate": 5.3835776579291835e-06, + "loss": 2.7864, + "step": 57518 + }, + { + "epoch": 2.82, + "grad_norm": 0.7999823689460754, + "learning_rate": 5.3806743845894784e-06, + "loss": 2.9398, + "step": 57519 + }, + { + "epoch": 2.82, + "grad_norm": 0.7624409794807434, + "learning_rate": 5.377771887220394e-06, + "loss": 2.5767, + "step": 57520 + }, + { + "epoch": 2.82, + "grad_norm": 0.8037823438644409, + "learning_rate": 5.374870165829559e-06, + "loss": 2.9523, + "step": 57521 + }, + { + "epoch": 2.82, + "grad_norm": 0.7656051516532898, + "learning_rate": 5.371969220424599e-06, + "loss": 2.8153, + "step": 57522 + }, + { + "epoch": 2.82, + "grad_norm": 0.7563663125038147, + "learning_rate": 5.369069051013208e-06, + "loss": 2.8219, + "step": 57523 + }, + { + "epoch": 2.82, + "grad_norm": 0.7026633620262146, + "learning_rate": 5.366169657602948e-06, + "loss": 2.8431, + "step": 57524 + }, + { + "epoch": 2.82, + "grad_norm": 0.7382919192314148, + "learning_rate": 5.363271040201511e-06, + "loss": 2.9779, + "step": 57525 + }, + { + "epoch": 2.82, + "grad_norm": 1.0166808366775513, + "learning_rate": 5.360373198816492e-06, + "loss": 2.9759, + "step": 57526 + }, + { + "epoch": 2.82, + "grad_norm": 0.7511675357818604, + "learning_rate": 5.357476133455552e-06, + "loss": 2.8831, + "step": 57527 + }, + { + "epoch": 2.82, + "grad_norm": 0.7667033076286316, + "learning_rate": 5.354579844126317e-06, + "loss": 2.8983, + "step": 57528 + }, + { + "epoch": 2.82, + "grad_norm": 0.7955288290977478, + "learning_rate": 5.351684330836414e-06, + "loss": 2.9765, + "step": 57529 + }, + { + "epoch": 2.82, + "grad_norm": 0.7648731470108032, + "learning_rate": 5.348789593593505e-06, + "loss": 2.9163, + "step": 57530 + }, + { + "epoch": 2.82, + "grad_norm": 0.8392326235771179, + "learning_rate": 5.3458956324051504e-06, + "loss": 2.8243, + "step": 57531 + }, + { + "epoch": 2.82, + "grad_norm": 0.7558670043945312, + "learning_rate": 5.343002447278977e-06, + "loss": 3.025, + "step": 57532 + }, + { + "epoch": 2.82, + "grad_norm": 0.7797101736068726, + "learning_rate": 5.340110038222644e-06, + "loss": 3.0272, + "step": 57533 + }, + { + "epoch": 2.82, + "grad_norm": 0.7675113081932068, + "learning_rate": 5.337218405243748e-06, + "loss": 3.2485, + "step": 57534 + }, + { + "epoch": 2.82, + "grad_norm": 0.7596341371536255, + "learning_rate": 5.334327548349948e-06, + "loss": 3.0808, + "step": 57535 + }, + { + "epoch": 2.82, + "grad_norm": 0.7368146777153015, + "learning_rate": 5.3314374675487714e-06, + "loss": 2.724, + "step": 57536 + }, + { + "epoch": 2.82, + "grad_norm": 0.7727293968200684, + "learning_rate": 5.328548162847879e-06, + "loss": 2.7899, + "step": 57537 + }, + { + "epoch": 2.82, + "grad_norm": 0.71891850233078, + "learning_rate": 5.325659634254931e-06, + "loss": 2.8625, + "step": 57538 + }, + { + "epoch": 2.82, + "grad_norm": 0.7327159643173218, + "learning_rate": 5.322771881777421e-06, + "loss": 2.9382, + "step": 57539 + }, + { + "epoch": 2.82, + "grad_norm": 0.7162185311317444, + "learning_rate": 5.319884905423077e-06, + "loss": 2.7911, + "step": 57540 + }, + { + "epoch": 2.82, + "grad_norm": 0.740092396736145, + "learning_rate": 5.316998705199393e-06, + "loss": 2.9277, + "step": 57541 + }, + { + "epoch": 2.82, + "grad_norm": 0.771077036857605, + "learning_rate": 5.31411328111403e-06, + "loss": 2.7363, + "step": 57542 + }, + { + "epoch": 2.82, + "grad_norm": 0.728895902633667, + "learning_rate": 5.31122863317458e-06, + "loss": 2.8725, + "step": 57543 + }, + { + "epoch": 2.82, + "grad_norm": 0.7603982090950012, + "learning_rate": 5.308344761388639e-06, + "loss": 2.8479, + "step": 57544 + }, + { + "epoch": 2.82, + "grad_norm": 0.7482584118843079, + "learning_rate": 5.305461665763833e-06, + "loss": 2.876, + "step": 57545 + }, + { + "epoch": 2.82, + "grad_norm": 0.7513059377670288, + "learning_rate": 5.302579346307723e-06, + "loss": 2.9574, + "step": 57546 + }, + { + "epoch": 2.82, + "grad_norm": 0.7721797227859497, + "learning_rate": 5.299697803027869e-06, + "loss": 3.2145, + "step": 57547 + }, + { + "epoch": 2.82, + "grad_norm": 0.7654960751533508, + "learning_rate": 5.296817035931933e-06, + "loss": 3.0258, + "step": 57548 + }, + { + "epoch": 2.82, + "grad_norm": 0.793851912021637, + "learning_rate": 5.293937045027441e-06, + "loss": 2.9649, + "step": 57549 + }, + { + "epoch": 2.82, + "grad_norm": 0.7595252394676208, + "learning_rate": 5.2910578303220205e-06, + "loss": 2.8733, + "step": 57550 + }, + { + "epoch": 2.82, + "grad_norm": 0.7790116667747498, + "learning_rate": 5.2881793918232e-06, + "loss": 2.889, + "step": 57551 + }, + { + "epoch": 2.82, + "grad_norm": 0.74222332239151, + "learning_rate": 5.285301729538638e-06, + "loss": 2.9103, + "step": 57552 + }, + { + "epoch": 2.82, + "grad_norm": 0.793062150478363, + "learning_rate": 5.282424843475863e-06, + "loss": 2.8367, + "step": 57553 + }, + { + "epoch": 2.82, + "grad_norm": 0.7489209175109863, + "learning_rate": 5.279548733642436e-06, + "loss": 2.8782, + "step": 57554 + }, + { + "epoch": 2.82, + "grad_norm": 0.7790075540542603, + "learning_rate": 5.276673400045983e-06, + "loss": 2.9005, + "step": 57555 + }, + { + "epoch": 2.82, + "grad_norm": 0.7113046646118164, + "learning_rate": 5.273798842694033e-06, + "loss": 2.7339, + "step": 57556 + }, + { + "epoch": 2.82, + "grad_norm": 0.7641512751579285, + "learning_rate": 5.2709250615941445e-06, + "loss": 2.8315, + "step": 57557 + }, + { + "epoch": 2.82, + "grad_norm": 0.7478774189949036, + "learning_rate": 5.26805205675398e-06, + "loss": 2.7826, + "step": 57558 + }, + { + "epoch": 2.82, + "grad_norm": 0.7339596748352051, + "learning_rate": 5.265179828181032e-06, + "loss": 2.9666, + "step": 57559 + }, + { + "epoch": 2.82, + "grad_norm": 0.7537974715232849, + "learning_rate": 5.2623083758828624e-06, + "loss": 2.9265, + "step": 57560 + }, + { + "epoch": 2.82, + "grad_norm": 0.7144533395767212, + "learning_rate": 5.259437699867064e-06, + "loss": 2.8843, + "step": 57561 + }, + { + "epoch": 2.82, + "grad_norm": 0.7479895949363708, + "learning_rate": 5.256567800141165e-06, + "loss": 2.9927, + "step": 57562 + }, + { + "epoch": 2.82, + "grad_norm": 0.7054181098937988, + "learning_rate": 5.253698676712759e-06, + "loss": 2.8793, + "step": 57563 + }, + { + "epoch": 2.82, + "grad_norm": 0.7824179530143738, + "learning_rate": 5.250830329589339e-06, + "loss": 2.8978, + "step": 57564 + }, + { + "epoch": 2.82, + "grad_norm": 0.7174307703971863, + "learning_rate": 5.2479627587785675e-06, + "loss": 2.8663, + "step": 57565 + }, + { + "epoch": 2.82, + "grad_norm": 0.7782198786735535, + "learning_rate": 5.245095964287904e-06, + "loss": 2.8694, + "step": 57566 + }, + { + "epoch": 2.82, + "grad_norm": 0.7361534833908081, + "learning_rate": 5.242229946124943e-06, + "loss": 2.8386, + "step": 57567 + }, + { + "epoch": 2.82, + "grad_norm": 0.7740105986595154, + "learning_rate": 5.239364704297211e-06, + "loss": 2.7616, + "step": 57568 + }, + { + "epoch": 2.82, + "grad_norm": 0.7601125240325928, + "learning_rate": 5.236500238812269e-06, + "loss": 3.121, + "step": 57569 + }, + { + "epoch": 2.82, + "grad_norm": 0.7358449697494507, + "learning_rate": 5.233636549677678e-06, + "loss": 2.883, + "step": 57570 + }, + { + "epoch": 2.82, + "grad_norm": 0.7319931387901306, + "learning_rate": 5.230773636900898e-06, + "loss": 2.9506, + "step": 57571 + }, + { + "epoch": 2.82, + "grad_norm": 0.7816628217697144, + "learning_rate": 5.22791150048959e-06, + "loss": 2.8939, + "step": 57572 + }, + { + "epoch": 2.82, + "grad_norm": 0.7447727918624878, + "learning_rate": 5.225050140451215e-06, + "loss": 2.9117, + "step": 57573 + }, + { + "epoch": 2.82, + "grad_norm": 0.7741653919219971, + "learning_rate": 5.222189556793332e-06, + "loss": 2.7785, + "step": 57574 + }, + { + "epoch": 2.82, + "grad_norm": 0.7777344584465027, + "learning_rate": 5.219329749523471e-06, + "loss": 2.7844, + "step": 57575 + }, + { + "epoch": 2.82, + "grad_norm": 0.7560518980026245, + "learning_rate": 5.216470718649157e-06, + "loss": 3.047, + "step": 57576 + }, + { + "epoch": 2.82, + "grad_norm": 0.7396130561828613, + "learning_rate": 5.213612464177918e-06, + "loss": 2.8337, + "step": 57577 + }, + { + "epoch": 2.82, + "grad_norm": 0.7314813137054443, + "learning_rate": 5.210754986117316e-06, + "loss": 2.9577, + "step": 57578 + }, + { + "epoch": 2.82, + "grad_norm": 0.7825134992599487, + "learning_rate": 5.207898284474809e-06, + "loss": 2.8162, + "step": 57579 + }, + { + "epoch": 2.82, + "grad_norm": 0.787757933139801, + "learning_rate": 5.205042359257994e-06, + "loss": 3.0029, + "step": 57580 + }, + { + "epoch": 2.82, + "grad_norm": 0.7091009616851807, + "learning_rate": 5.202187210474396e-06, + "loss": 2.8445, + "step": 57581 + }, + { + "epoch": 2.82, + "grad_norm": 0.7412608861923218, + "learning_rate": 5.199332838131443e-06, + "loss": 2.9387, + "step": 57582 + }, + { + "epoch": 2.82, + "grad_norm": 0.7515140771865845, + "learning_rate": 5.196479242236762e-06, + "loss": 2.7867, + "step": 57583 + }, + { + "epoch": 2.82, + "grad_norm": 0.7683895826339722, + "learning_rate": 5.1936264227977806e-06, + "loss": 3.0592, + "step": 57584 + }, + { + "epoch": 2.82, + "grad_norm": 0.7576361298561096, + "learning_rate": 5.1907743798220606e-06, + "loss": 2.7596, + "step": 57585 + }, + { + "epoch": 2.82, + "grad_norm": 0.7240979075431824, + "learning_rate": 5.187923113317094e-06, + "loss": 3.0416, + "step": 57586 + }, + { + "epoch": 2.82, + "grad_norm": 0.7744309306144714, + "learning_rate": 5.1850726232904095e-06, + "loss": 2.9025, + "step": 57587 + }, + { + "epoch": 2.82, + "grad_norm": 0.7236972451210022, + "learning_rate": 5.182222909749501e-06, + "loss": 2.9219, + "step": 57588 + }, + { + "epoch": 2.82, + "grad_norm": 0.7626051306724548, + "learning_rate": 5.179373972701895e-06, + "loss": 3.0846, + "step": 57589 + }, + { + "epoch": 2.82, + "grad_norm": 0.7822689414024353, + "learning_rate": 5.176525812155085e-06, + "loss": 2.8351, + "step": 57590 + }, + { + "epoch": 2.82, + "grad_norm": 0.7402531504631042, + "learning_rate": 5.173678428116534e-06, + "loss": 2.8016, + "step": 57591 + }, + { + "epoch": 2.82, + "grad_norm": 0.8098210692405701, + "learning_rate": 5.170831820593802e-06, + "loss": 3.0032, + "step": 57592 + }, + { + "epoch": 2.82, + "grad_norm": 0.7769423723220825, + "learning_rate": 5.167985989594348e-06, + "loss": 2.9816, + "step": 57593 + }, + { + "epoch": 2.82, + "grad_norm": 0.7293099761009216, + "learning_rate": 5.1651409351256665e-06, + "loss": 2.828, + "step": 57594 + }, + { + "epoch": 2.82, + "grad_norm": 0.7770448923110962, + "learning_rate": 5.162296657195253e-06, + "loss": 2.9258, + "step": 57595 + }, + { + "epoch": 2.82, + "grad_norm": 0.7682694792747498, + "learning_rate": 5.159453155810633e-06, + "loss": 2.7589, + "step": 57596 + }, + { + "epoch": 2.82, + "grad_norm": 0.7171539068222046, + "learning_rate": 5.156610430979302e-06, + "loss": 2.7792, + "step": 57597 + }, + { + "epoch": 2.82, + "grad_norm": 0.7691710591316223, + "learning_rate": 5.1537684827086535e-06, + "loss": 2.9293, + "step": 57598 + }, + { + "epoch": 2.82, + "grad_norm": 0.7626630663871765, + "learning_rate": 5.1509273110062475e-06, + "loss": 2.9575, + "step": 57599 + }, + { + "epoch": 2.82, + "grad_norm": 0.7831852436065674, + "learning_rate": 5.148086915879579e-06, + "loss": 2.7166, + "step": 57600 + }, + { + "epoch": 2.82, + "grad_norm": 0.734420895576477, + "learning_rate": 5.145247297336075e-06, + "loss": 2.883, + "step": 57601 + }, + { + "epoch": 2.82, + "grad_norm": 0.7334008812904358, + "learning_rate": 5.142408455383262e-06, + "loss": 3.034, + "step": 57602 + }, + { + "epoch": 2.82, + "grad_norm": 0.7297717928886414, + "learning_rate": 5.139570390028602e-06, + "loss": 2.6785, + "step": 57603 + }, + { + "epoch": 2.82, + "grad_norm": 0.7485255002975464, + "learning_rate": 5.136733101279556e-06, + "loss": 3.0351, + "step": 57604 + }, + { + "epoch": 2.82, + "grad_norm": 0.7516413927078247, + "learning_rate": 5.133896589143616e-06, + "loss": 2.8879, + "step": 57605 + }, + { + "epoch": 2.82, + "grad_norm": 0.8280989527702332, + "learning_rate": 5.1310608536282105e-06, + "loss": 2.9283, + "step": 57606 + }, + { + "epoch": 2.82, + "grad_norm": 0.7576739192008972, + "learning_rate": 5.128225894740867e-06, + "loss": 2.8524, + "step": 57607 + }, + { + "epoch": 2.82, + "grad_norm": 0.7196022868156433, + "learning_rate": 5.125391712489047e-06, + "loss": 2.8059, + "step": 57608 + }, + { + "epoch": 2.82, + "grad_norm": 0.7515164017677307, + "learning_rate": 5.122558306880143e-06, + "loss": 2.6472, + "step": 57609 + }, + { + "epoch": 2.82, + "grad_norm": 0.7482687830924988, + "learning_rate": 5.119725677921715e-06, + "loss": 3.0543, + "step": 57610 + }, + { + "epoch": 2.82, + "grad_norm": 0.7849657535552979, + "learning_rate": 5.116893825621194e-06, + "loss": 2.604, + "step": 57611 + }, + { + "epoch": 2.82, + "grad_norm": 0.737377941608429, + "learning_rate": 5.114062749986003e-06, + "loss": 2.8361, + "step": 57612 + }, + { + "epoch": 2.82, + "grad_norm": 0.7955058217048645, + "learning_rate": 5.1112324510236055e-06, + "loss": 2.9009, + "step": 57613 + }, + { + "epoch": 2.82, + "grad_norm": 0.7713934183120728, + "learning_rate": 5.108402928741462e-06, + "loss": 2.5417, + "step": 57614 + }, + { + "epoch": 2.82, + "grad_norm": 0.7756964564323425, + "learning_rate": 5.105574183147065e-06, + "loss": 2.8647, + "step": 57615 + }, + { + "epoch": 2.82, + "grad_norm": 0.7626116871833801, + "learning_rate": 5.10274621424781e-06, + "loss": 2.8613, + "step": 57616 + }, + { + "epoch": 2.82, + "grad_norm": 0.7126087546348572, + "learning_rate": 5.099919022051157e-06, + "loss": 2.7191, + "step": 57617 + }, + { + "epoch": 2.82, + "grad_norm": 0.7495712637901306, + "learning_rate": 5.097092606564601e-06, + "loss": 2.7508, + "step": 57618 + }, + { + "epoch": 2.82, + "grad_norm": 0.743903398513794, + "learning_rate": 5.094266967795502e-06, + "loss": 2.9745, + "step": 57619 + }, + { + "epoch": 2.82, + "grad_norm": 0.7743878960609436, + "learning_rate": 5.091442105751387e-06, + "loss": 2.6665, + "step": 57620 + }, + { + "epoch": 2.82, + "grad_norm": 0.7580164074897766, + "learning_rate": 5.0886180204396165e-06, + "loss": 2.9658, + "step": 57621 + }, + { + "epoch": 2.82, + "grad_norm": 0.7179275155067444, + "learning_rate": 5.085794711867719e-06, + "loss": 2.9423, + "step": 57622 + }, + { + "epoch": 2.82, + "grad_norm": 0.7473773956298828, + "learning_rate": 5.0829721800430544e-06, + "loss": 2.9537, + "step": 57623 + }, + { + "epoch": 2.82, + "grad_norm": 0.7446900606155396, + "learning_rate": 5.080150424973051e-06, + "loss": 2.8643, + "step": 57624 + }, + { + "epoch": 2.82, + "grad_norm": 0.7130290865898132, + "learning_rate": 5.077329446665235e-06, + "loss": 2.8381, + "step": 57625 + }, + { + "epoch": 2.82, + "grad_norm": 0.7973248362541199, + "learning_rate": 5.0745092451269345e-06, + "loss": 2.7452, + "step": 57626 + }, + { + "epoch": 2.82, + "grad_norm": 0.7995738387107849, + "learning_rate": 5.071689820365643e-06, + "loss": 2.9036, + "step": 57627 + }, + { + "epoch": 2.82, + "grad_norm": 0.8131319880485535, + "learning_rate": 5.068871172388755e-06, + "loss": 2.9542, + "step": 57628 + }, + { + "epoch": 2.82, + "grad_norm": 0.7483770251274109, + "learning_rate": 5.066053301203665e-06, + "loss": 2.8858, + "step": 57629 + }, + { + "epoch": 2.82, + "grad_norm": 0.7863949537277222, + "learning_rate": 5.0632362068178665e-06, + "loss": 2.9802, + "step": 57630 + }, + { + "epoch": 2.82, + "grad_norm": 0.7554341554641724, + "learning_rate": 5.0604198892387205e-06, + "loss": 3.0817, + "step": 57631 + }, + { + "epoch": 2.82, + "grad_norm": 0.7444338798522949, + "learning_rate": 5.057604348473687e-06, + "loss": 3.0471, + "step": 57632 + }, + { + "epoch": 2.82, + "grad_norm": 0.8026116490364075, + "learning_rate": 5.054789584530161e-06, + "loss": 2.6147, + "step": 57633 + }, + { + "epoch": 2.82, + "grad_norm": 0.7201833128929138, + "learning_rate": 5.051975597415536e-06, + "loss": 2.7799, + "step": 57634 + }, + { + "epoch": 2.82, + "grad_norm": 0.7888780236244202, + "learning_rate": 5.0491623871372396e-06, + "loss": 3.0041, + "step": 57635 + }, + { + "epoch": 2.82, + "grad_norm": 0.7324066162109375, + "learning_rate": 5.0463499537027e-06, + "loss": 2.7168, + "step": 57636 + }, + { + "epoch": 2.82, + "grad_norm": 0.7467460632324219, + "learning_rate": 5.043538297119309e-06, + "loss": 2.7614, + "step": 57637 + }, + { + "epoch": 2.82, + "grad_norm": 0.8163952827453613, + "learning_rate": 5.040727417394464e-06, + "loss": 3.0423, + "step": 57638 + }, + { + "epoch": 2.82, + "grad_norm": 0.788611888885498, + "learning_rate": 5.037917314535589e-06, + "loss": 3.0102, + "step": 57639 + }, + { + "epoch": 2.82, + "grad_norm": 0.7810007929801941, + "learning_rate": 5.03510798855008e-06, + "loss": 2.9274, + "step": 57640 + }, + { + "epoch": 2.82, + "grad_norm": 0.8209431171417236, + "learning_rate": 5.032299439445297e-06, + "loss": 2.7132, + "step": 57641 + }, + { + "epoch": 2.82, + "grad_norm": 0.7978459596633911, + "learning_rate": 5.0294916672287025e-06, + "loss": 2.7835, + "step": 57642 + }, + { + "epoch": 2.82, + "grad_norm": 0.7896865010261536, + "learning_rate": 5.026684671907655e-06, + "loss": 2.7713, + "step": 57643 + }, + { + "epoch": 2.82, + "grad_norm": 0.7331417202949524, + "learning_rate": 5.023878453489516e-06, + "loss": 2.8776, + "step": 57644 + }, + { + "epoch": 2.83, + "grad_norm": 0.7624796628952026, + "learning_rate": 5.0210730119817465e-06, + "loss": 2.9129, + "step": 57645 + }, + { + "epoch": 2.83, + "grad_norm": 0.7527821660041809, + "learning_rate": 5.018268347391674e-06, + "loss": 2.9413, + "step": 57646 + }, + { + "epoch": 2.83, + "grad_norm": 0.7554188966751099, + "learning_rate": 5.015464459726759e-06, + "loss": 3.1033, + "step": 57647 + }, + { + "epoch": 2.83, + "grad_norm": 0.7641181349754333, + "learning_rate": 5.012661348994329e-06, + "loss": 2.8095, + "step": 57648 + }, + { + "epoch": 2.83, + "grad_norm": 0.7661232352256775, + "learning_rate": 5.009859015201745e-06, + "loss": 2.9625, + "step": 57649 + }, + { + "epoch": 2.83, + "grad_norm": 0.7631051540374756, + "learning_rate": 5.0070574583564335e-06, + "loss": 2.8323, + "step": 57650 + }, + { + "epoch": 2.83, + "grad_norm": 0.8133715391159058, + "learning_rate": 5.004256678465757e-06, + "loss": 2.9095, + "step": 57651 + }, + { + "epoch": 2.83, + "grad_norm": 0.7536367774009705, + "learning_rate": 5.001456675537108e-06, + "loss": 2.8894, + "step": 57652 + }, + { + "epoch": 2.83, + "grad_norm": 0.7896614074707031, + "learning_rate": 4.998657449577848e-06, + "loss": 3.0761, + "step": 57653 + }, + { + "epoch": 2.83, + "grad_norm": 0.7261320352554321, + "learning_rate": 4.995859000595337e-06, + "loss": 2.9384, + "step": 57654 + }, + { + "epoch": 2.83, + "grad_norm": 0.7240200638771057, + "learning_rate": 4.99306132859697e-06, + "loss": 2.6961, + "step": 57655 + }, + { + "epoch": 2.83, + "grad_norm": 0.7582331895828247, + "learning_rate": 4.990264433590108e-06, + "loss": 2.7058, + "step": 57656 + }, + { + "epoch": 2.83, + "grad_norm": 0.7294660210609436, + "learning_rate": 4.98746831558211e-06, + "loss": 2.9779, + "step": 57657 + }, + { + "epoch": 2.83, + "grad_norm": 0.752255916595459, + "learning_rate": 4.984672974580306e-06, + "loss": 2.8786, + "step": 57658 + }, + { + "epoch": 2.83, + "grad_norm": 0.751939594745636, + "learning_rate": 4.981878410592122e-06, + "loss": 2.8781, + "step": 57659 + }, + { + "epoch": 2.83, + "grad_norm": 0.7334133982658386, + "learning_rate": 4.979084623624918e-06, + "loss": 2.9869, + "step": 57660 + }, + { + "epoch": 2.83, + "grad_norm": 0.7954682111740112, + "learning_rate": 4.976291613686023e-06, + "loss": 3.1021, + "step": 57661 + }, + { + "epoch": 2.83, + "grad_norm": 0.7579163312911987, + "learning_rate": 4.973499380782797e-06, + "loss": 2.9179, + "step": 57662 + }, + { + "epoch": 2.83, + "grad_norm": 0.743008553981781, + "learning_rate": 4.970707924922568e-06, + "loss": 2.8985, + "step": 57663 + }, + { + "epoch": 2.83, + "grad_norm": 0.7410622835159302, + "learning_rate": 4.9679172461127295e-06, + "loss": 3.0076, + "step": 57664 + }, + { + "epoch": 2.83, + "grad_norm": 0.725355327129364, + "learning_rate": 4.965127344360608e-06, + "loss": 2.8972, + "step": 57665 + }, + { + "epoch": 2.83, + "grad_norm": 0.7321346998214722, + "learning_rate": 4.962338219673567e-06, + "loss": 3.1501, + "step": 57666 + }, + { + "epoch": 2.83, + "grad_norm": 0.7218225002288818, + "learning_rate": 4.9595498720589655e-06, + "loss": 2.7632, + "step": 57667 + }, + { + "epoch": 2.83, + "grad_norm": 0.7458818554878235, + "learning_rate": 4.9567623015240975e-06, + "loss": 3.0234, + "step": 57668 + }, + { + "epoch": 2.83, + "grad_norm": 0.7871835231781006, + "learning_rate": 4.9539755080763575e-06, + "loss": 2.8553, + "step": 57669 + }, + { + "epoch": 2.83, + "grad_norm": 0.9065406322479248, + "learning_rate": 4.95118949172304e-06, + "loss": 2.8418, + "step": 57670 + }, + { + "epoch": 2.83, + "grad_norm": 0.7578248977661133, + "learning_rate": 4.948404252471539e-06, + "loss": 2.8822, + "step": 57671 + }, + { + "epoch": 2.83, + "grad_norm": 0.734098494052887, + "learning_rate": 4.945619790329147e-06, + "loss": 2.7124, + "step": 57672 + }, + { + "epoch": 2.83, + "grad_norm": 0.7335187196731567, + "learning_rate": 4.942836105303161e-06, + "loss": 2.9535, + "step": 57673 + }, + { + "epoch": 2.83, + "grad_norm": 0.7521882057189941, + "learning_rate": 4.940053197401006e-06, + "loss": 2.7288, + "step": 57674 + }, + { + "epoch": 2.83, + "grad_norm": 0.8029778003692627, + "learning_rate": 4.937271066629944e-06, + "loss": 2.9228, + "step": 57675 + }, + { + "epoch": 2.83, + "grad_norm": 0.8154861330986023, + "learning_rate": 4.934489712997369e-06, + "loss": 3.0219, + "step": 57676 + }, + { + "epoch": 2.83, + "grad_norm": 0.7279578447341919, + "learning_rate": 4.931709136510542e-06, + "loss": 2.9194, + "step": 57677 + }, + { + "epoch": 2.83, + "grad_norm": 0.7490201592445374, + "learning_rate": 4.9289293371767905e-06, + "loss": 2.8508, + "step": 57678 + }, + { + "epoch": 2.83, + "grad_norm": 0.8128371834754944, + "learning_rate": 4.926150315003441e-06, + "loss": 2.6265, + "step": 57679 + }, + { + "epoch": 2.83, + "grad_norm": 0.7674548029899597, + "learning_rate": 4.923372069997822e-06, + "loss": 2.8969, + "step": 57680 + }, + { + "epoch": 2.83, + "grad_norm": 0.7656154036521912, + "learning_rate": 4.920594602167261e-06, + "loss": 2.8899, + "step": 57681 + }, + { + "epoch": 2.83, + "grad_norm": 0.733749270439148, + "learning_rate": 4.917817911519084e-06, + "loss": 2.8042, + "step": 57682 + }, + { + "epoch": 2.83, + "grad_norm": 0.7814816832542419, + "learning_rate": 4.915041998060587e-06, + "loss": 2.9326, + "step": 57683 + }, + { + "epoch": 2.83, + "grad_norm": 0.7335500121116638, + "learning_rate": 4.912266861799097e-06, + "loss": 2.7703, + "step": 57684 + }, + { + "epoch": 2.83, + "grad_norm": 0.7337460517883301, + "learning_rate": 4.909492502741841e-06, + "loss": 3.0456, + "step": 57685 + }, + { + "epoch": 2.83, + "grad_norm": 0.7310239672660828, + "learning_rate": 4.906718920896213e-06, + "loss": 2.8439, + "step": 57686 + }, + { + "epoch": 2.83, + "grad_norm": 0.7263157963752747, + "learning_rate": 4.903946116269508e-06, + "loss": 3.001, + "step": 57687 + }, + { + "epoch": 2.83, + "grad_norm": 0.7620536088943481, + "learning_rate": 4.90117408886902e-06, + "loss": 3.0321, + "step": 57688 + }, + { + "epoch": 2.83, + "grad_norm": 0.7526038289070129, + "learning_rate": 4.898402838702043e-06, + "loss": 2.9153, + "step": 57689 + }, + { + "epoch": 2.83, + "grad_norm": 0.7336723804473877, + "learning_rate": 4.89563236577587e-06, + "loss": 3.0666, + "step": 57690 + }, + { + "epoch": 2.83, + "grad_norm": 0.7981674671173096, + "learning_rate": 4.89286267009783e-06, + "loss": 2.7095, + "step": 57691 + }, + { + "epoch": 2.83, + "grad_norm": 0.7469791173934937, + "learning_rate": 4.890093751675183e-06, + "loss": 2.9837, + "step": 57692 + }, + { + "epoch": 2.83, + "grad_norm": 0.7180920243263245, + "learning_rate": 4.887325610515225e-06, + "loss": 2.9881, + "step": 57693 + }, + { + "epoch": 2.83, + "grad_norm": 0.7620313763618469, + "learning_rate": 4.884558246625248e-06, + "loss": 2.7828, + "step": 57694 + }, + { + "epoch": 2.83, + "grad_norm": 0.7135055661201477, + "learning_rate": 4.8817916600125465e-06, + "loss": 3.048, + "step": 57695 + }, + { + "epoch": 2.83, + "grad_norm": 0.7366616129875183, + "learning_rate": 4.879025850684415e-06, + "loss": 2.9772, + "step": 57696 + }, + { + "epoch": 2.83, + "grad_norm": 0.745126485824585, + "learning_rate": 4.876260818648181e-06, + "loss": 2.8637, + "step": 57697 + }, + { + "epoch": 2.83, + "grad_norm": 0.7213049530982971, + "learning_rate": 4.873496563911039e-06, + "loss": 2.9629, + "step": 57698 + }, + { + "epoch": 2.83, + "grad_norm": 0.7770732641220093, + "learning_rate": 4.8707330864803166e-06, + "loss": 2.9155, + "step": 57699 + }, + { + "epoch": 2.83, + "grad_norm": 0.773853063583374, + "learning_rate": 4.8679703863632735e-06, + "loss": 2.8713, + "step": 57700 + }, + { + "epoch": 2.83, + "grad_norm": 0.7769290804862976, + "learning_rate": 4.865208463567172e-06, + "loss": 2.8884, + "step": 57701 + }, + { + "epoch": 2.83, + "grad_norm": 0.7928144335746765, + "learning_rate": 4.8624473180993385e-06, + "loss": 2.7892, + "step": 57702 + }, + { + "epoch": 2.83, + "grad_norm": 0.7701162695884705, + "learning_rate": 4.859686949967034e-06, + "loss": 2.9104, + "step": 57703 + }, + { + "epoch": 2.83, + "grad_norm": 0.7658120393753052, + "learning_rate": 4.856927359177487e-06, + "loss": 2.825, + "step": 57704 + }, + { + "epoch": 2.83, + "grad_norm": 0.7138972282409668, + "learning_rate": 4.854168545738024e-06, + "loss": 3.0144, + "step": 57705 + }, + { + "epoch": 2.83, + "grad_norm": 0.7636241316795349, + "learning_rate": 4.8514105096558396e-06, + "loss": 2.7451, + "step": 57706 + }, + { + "epoch": 2.83, + "grad_norm": 0.7346954941749573, + "learning_rate": 4.848653250938261e-06, + "loss": 2.9506, + "step": 57707 + }, + { + "epoch": 2.83, + "grad_norm": 0.7780959010124207, + "learning_rate": 4.8458967695925166e-06, + "loss": 2.8082, + "step": 57708 + }, + { + "epoch": 2.83, + "grad_norm": 0.7654991149902344, + "learning_rate": 4.843141065625899e-06, + "loss": 2.7958, + "step": 57709 + }, + { + "epoch": 2.83, + "grad_norm": 0.7330126166343689, + "learning_rate": 4.840386139045604e-06, + "loss": 2.9405, + "step": 57710 + }, + { + "epoch": 2.83, + "grad_norm": 0.7495510578155518, + "learning_rate": 4.837631989858959e-06, + "loss": 2.8339, + "step": 57711 + }, + { + "epoch": 2.83, + "grad_norm": 0.7610169053077698, + "learning_rate": 4.83487861807319e-06, + "loss": 2.6812, + "step": 57712 + }, + { + "epoch": 2.83, + "grad_norm": 0.711486279964447, + "learning_rate": 4.832126023695526e-06, + "loss": 2.9422, + "step": 57713 + }, + { + "epoch": 2.83, + "grad_norm": 0.7498459815979004, + "learning_rate": 4.829374206733261e-06, + "loss": 2.8501, + "step": 57714 + }, + { + "epoch": 2.83, + "grad_norm": 0.755424439907074, + "learning_rate": 4.826623167193588e-06, + "loss": 2.9558, + "step": 57715 + }, + { + "epoch": 2.83, + "grad_norm": 0.744842529296875, + "learning_rate": 4.82387290508377e-06, + "loss": 2.8703, + "step": 57716 + }, + { + "epoch": 2.83, + "grad_norm": 0.7416000962257385, + "learning_rate": 4.821123420411099e-06, + "loss": 2.9342, + "step": 57717 + }, + { + "epoch": 2.83, + "grad_norm": 0.782110869884491, + "learning_rate": 4.818374713182771e-06, + "loss": 2.8408, + "step": 57718 + }, + { + "epoch": 2.83, + "grad_norm": 0.7619156241416931, + "learning_rate": 4.815626783406046e-06, + "loss": 2.8473, + "step": 57719 + }, + { + "epoch": 2.83, + "grad_norm": 0.7477062940597534, + "learning_rate": 4.812879631088151e-06, + "loss": 3.0257, + "step": 57720 + }, + { + "epoch": 2.83, + "grad_norm": 0.7299256920814514, + "learning_rate": 4.810133256236282e-06, + "loss": 3.0029, + "step": 57721 + }, + { + "epoch": 2.83, + "grad_norm": 0.7986459136009216, + "learning_rate": 4.807387658857764e-06, + "loss": 2.4808, + "step": 57722 + }, + { + "epoch": 2.83, + "grad_norm": 0.7746559381484985, + "learning_rate": 4.804642838959727e-06, + "loss": 3.0949, + "step": 57723 + }, + { + "epoch": 2.83, + "grad_norm": 0.8096112012863159, + "learning_rate": 4.8018987965494635e-06, + "loss": 2.8608, + "step": 57724 + }, + { + "epoch": 2.83, + "grad_norm": 0.7485320568084717, + "learning_rate": 4.799155531634203e-06, + "loss": 2.904, + "step": 57725 + }, + { + "epoch": 2.83, + "grad_norm": 0.7414878010749817, + "learning_rate": 4.796413044221136e-06, + "loss": 2.7812, + "step": 57726 + }, + { + "epoch": 2.83, + "grad_norm": 0.7604761123657227, + "learning_rate": 4.793671334317528e-06, + "loss": 2.8507, + "step": 57727 + }, + { + "epoch": 2.83, + "grad_norm": 0.7408043146133423, + "learning_rate": 4.7909304019305705e-06, + "loss": 3.0301, + "step": 57728 + }, + { + "epoch": 2.83, + "grad_norm": 0.8125025033950806, + "learning_rate": 4.788190247067492e-06, + "loss": 3.0556, + "step": 57729 + }, + { + "epoch": 2.83, + "grad_norm": 0.7148528099060059, + "learning_rate": 4.785450869735452e-06, + "loss": 2.9549, + "step": 57730 + }, + { + "epoch": 2.83, + "grad_norm": 0.7859530448913574, + "learning_rate": 4.782712269941747e-06, + "loss": 2.904, + "step": 57731 + }, + { + "epoch": 2.83, + "grad_norm": 0.814154863357544, + "learning_rate": 4.7799744476936015e-06, + "loss": 2.845, + "step": 57732 + }, + { + "epoch": 2.83, + "grad_norm": 0.7224166393280029, + "learning_rate": 4.777237402998147e-06, + "loss": 2.8495, + "step": 57733 + }, + { + "epoch": 2.83, + "grad_norm": 0.7980074286460876, + "learning_rate": 4.7745011358626406e-06, + "loss": 2.9546, + "step": 57734 + }, + { + "epoch": 2.83, + "grad_norm": 0.7275083661079407, + "learning_rate": 4.7717656462943125e-06, + "loss": 2.9702, + "step": 57735 + }, + { + "epoch": 2.83, + "grad_norm": 0.7441866993904114, + "learning_rate": 4.769030934300289e-06, + "loss": 2.918, + "step": 57736 + }, + { + "epoch": 2.83, + "grad_norm": 0.8218494653701782, + "learning_rate": 4.766296999887864e-06, + "loss": 3.025, + "step": 57737 + }, + { + "epoch": 2.83, + "grad_norm": 0.7137600779533386, + "learning_rate": 4.763563843064167e-06, + "loss": 2.8481, + "step": 57738 + }, + { + "epoch": 2.83, + "grad_norm": 0.8052471876144409, + "learning_rate": 4.760831463836423e-06, + "loss": 2.8723, + "step": 57739 + }, + { + "epoch": 2.83, + "grad_norm": 0.7791904807090759, + "learning_rate": 4.7580998622118285e-06, + "loss": 2.9187, + "step": 57740 + }, + { + "epoch": 2.83, + "grad_norm": 0.7228248119354248, + "learning_rate": 4.755369038197576e-06, + "loss": 3.0401, + "step": 57741 + }, + { + "epoch": 2.83, + "grad_norm": 0.796574056148529, + "learning_rate": 4.752638991800895e-06, + "loss": 2.9358, + "step": 57742 + }, + { + "epoch": 2.83, + "grad_norm": 0.7155166268348694, + "learning_rate": 4.749909723028911e-06, + "loss": 2.9773, + "step": 57743 + }, + { + "epoch": 2.83, + "grad_norm": 0.731598973274231, + "learning_rate": 4.747181231888852e-06, + "loss": 2.8988, + "step": 57744 + }, + { + "epoch": 2.83, + "grad_norm": 0.7063519954681396, + "learning_rate": 4.74445351838788e-06, + "loss": 2.8026, + "step": 57745 + }, + { + "epoch": 2.83, + "grad_norm": 0.7575142979621887, + "learning_rate": 4.741726582533189e-06, + "loss": 2.9443, + "step": 57746 + }, + { + "epoch": 2.83, + "grad_norm": 0.7412130832672119, + "learning_rate": 4.739000424331973e-06, + "loss": 3.0455, + "step": 57747 + }, + { + "epoch": 2.83, + "grad_norm": 0.700270414352417, + "learning_rate": 4.736275043791393e-06, + "loss": 2.9013, + "step": 57748 + }, + { + "epoch": 2.83, + "grad_norm": 0.8509933352470398, + "learning_rate": 4.733550440918676e-06, + "loss": 2.6319, + "step": 57749 + }, + { + "epoch": 2.83, + "grad_norm": 0.7422335147857666, + "learning_rate": 4.730826615720951e-06, + "loss": 2.8367, + "step": 57750 + }, + { + "epoch": 2.83, + "grad_norm": 0.7345046401023865, + "learning_rate": 4.7281035682054105e-06, + "loss": 3.0125, + "step": 57751 + }, + { + "epoch": 2.83, + "grad_norm": 0.7160439491271973, + "learning_rate": 4.725381298379216e-06, + "loss": 2.8065, + "step": 57752 + }, + { + "epoch": 2.83, + "grad_norm": 0.7227357625961304, + "learning_rate": 4.722659806249496e-06, + "loss": 2.978, + "step": 57753 + }, + { + "epoch": 2.83, + "grad_norm": 0.7689091563224792, + "learning_rate": 4.71993909182351e-06, + "loss": 2.9034, + "step": 57754 + }, + { + "epoch": 2.83, + "grad_norm": 0.7610191106796265, + "learning_rate": 4.717219155108354e-06, + "loss": 2.8094, + "step": 57755 + }, + { + "epoch": 2.83, + "grad_norm": 0.777751088142395, + "learning_rate": 4.714499996111254e-06, + "loss": 2.7979, + "step": 57756 + }, + { + "epoch": 2.83, + "grad_norm": 0.8778126239776611, + "learning_rate": 4.711781614839305e-06, + "loss": 2.9273, + "step": 57757 + }, + { + "epoch": 2.83, + "grad_norm": 0.7235648036003113, + "learning_rate": 4.709064011299668e-06, + "loss": 2.98, + "step": 57758 + }, + { + "epoch": 2.83, + "grad_norm": 0.7848798036575317, + "learning_rate": 4.70634718549957e-06, + "loss": 2.9884, + "step": 57759 + }, + { + "epoch": 2.83, + "grad_norm": 0.7237803936004639, + "learning_rate": 4.703631137446107e-06, + "loss": 2.9722, + "step": 57760 + }, + { + "epoch": 2.83, + "grad_norm": 0.7664092183113098, + "learning_rate": 4.700915867146438e-06, + "loss": 2.7523, + "step": 57761 + }, + { + "epoch": 2.83, + "grad_norm": 0.7664266228675842, + "learning_rate": 4.698201374607724e-06, + "loss": 2.7736, + "step": 57762 + }, + { + "epoch": 2.83, + "grad_norm": 0.7319455742835999, + "learning_rate": 4.695487659837127e-06, + "loss": 2.956, + "step": 57763 + }, + { + "epoch": 2.83, + "grad_norm": 0.7283288836479187, + "learning_rate": 4.692774722841807e-06, + "loss": 2.8592, + "step": 57764 + }, + { + "epoch": 2.83, + "grad_norm": 0.7501567602157593, + "learning_rate": 4.69006256362886e-06, + "loss": 2.7725, + "step": 57765 + }, + { + "epoch": 2.83, + "grad_norm": 0.8033075332641602, + "learning_rate": 4.687351182205446e-06, + "loss": 2.627, + "step": 57766 + }, + { + "epoch": 2.83, + "grad_norm": 0.7505348324775696, + "learning_rate": 4.684640578578725e-06, + "loss": 2.9283, + "step": 57767 + }, + { + "epoch": 2.83, + "grad_norm": 0.7713310718536377, + "learning_rate": 4.681930752755791e-06, + "loss": 2.8058, + "step": 57768 + }, + { + "epoch": 2.83, + "grad_norm": 0.7473810315132141, + "learning_rate": 4.679221704743874e-06, + "loss": 3.0927, + "step": 57769 + }, + { + "epoch": 2.83, + "grad_norm": 0.7242414951324463, + "learning_rate": 4.67651343455e-06, + "loss": 2.848, + "step": 57770 + }, + { + "epoch": 2.83, + "grad_norm": 0.7761023640632629, + "learning_rate": 4.673805942181397e-06, + "loss": 3.0277, + "step": 57771 + }, + { + "epoch": 2.83, + "grad_norm": 0.7542916536331177, + "learning_rate": 4.671099227645159e-06, + "loss": 2.9206, + "step": 57772 + }, + { + "epoch": 2.83, + "grad_norm": 0.7616918087005615, + "learning_rate": 4.668393290948347e-06, + "loss": 3.0023, + "step": 57773 + }, + { + "epoch": 2.83, + "grad_norm": 0.7441499829292297, + "learning_rate": 4.665688132098222e-06, + "loss": 2.8658, + "step": 57774 + }, + { + "epoch": 2.83, + "grad_norm": 0.736674964427948, + "learning_rate": 4.662983751101779e-06, + "loss": 2.8954, + "step": 57775 + }, + { + "epoch": 2.83, + "grad_norm": 0.7363643050193787, + "learning_rate": 4.660280147966211e-06, + "loss": 3.0603, + "step": 57776 + }, + { + "epoch": 2.83, + "grad_norm": 0.7268117070198059, + "learning_rate": 4.657577322698647e-06, + "loss": 2.9473, + "step": 57777 + }, + { + "epoch": 2.83, + "grad_norm": 0.7369086742401123, + "learning_rate": 4.65487527530618e-06, + "loss": 2.6568, + "step": 57778 + }, + { + "epoch": 2.83, + "grad_norm": 0.7631237506866455, + "learning_rate": 4.652174005795939e-06, + "loss": 3.0302, + "step": 57779 + }, + { + "epoch": 2.83, + "grad_norm": 0.7441269755363464, + "learning_rate": 4.649473514174984e-06, + "loss": 2.884, + "step": 57780 + }, + { + "epoch": 2.83, + "grad_norm": 0.7539127469062805, + "learning_rate": 4.646773800450543e-06, + "loss": 2.6154, + "step": 57781 + }, + { + "epoch": 2.83, + "grad_norm": 0.7788946628570557, + "learning_rate": 4.644074864629576e-06, + "loss": 2.783, + "step": 57782 + }, + { + "epoch": 2.83, + "grad_norm": 0.7208992838859558, + "learning_rate": 4.641376706719313e-06, + "loss": 2.7661, + "step": 57783 + }, + { + "epoch": 2.83, + "grad_norm": 0.7645790576934814, + "learning_rate": 4.6386793267268465e-06, + "loss": 2.8266, + "step": 57784 + }, + { + "epoch": 2.83, + "grad_norm": 0.7291238307952881, + "learning_rate": 4.635982724659237e-06, + "loss": 2.8439, + "step": 57785 + }, + { + "epoch": 2.83, + "grad_norm": 0.7457494139671326, + "learning_rate": 4.633286900523614e-06, + "loss": 2.883, + "step": 57786 + }, + { + "epoch": 2.83, + "grad_norm": 0.7361904382705688, + "learning_rate": 4.630591854327037e-06, + "loss": 3.0172, + "step": 57787 + }, + { + "epoch": 2.83, + "grad_norm": 0.7620532512664795, + "learning_rate": 4.627897586076667e-06, + "loss": 2.8486, + "step": 57788 + }, + { + "epoch": 2.83, + "grad_norm": 0.7753808498382568, + "learning_rate": 4.625204095779566e-06, + "loss": 2.9601, + "step": 57789 + }, + { + "epoch": 2.83, + "grad_norm": 0.7386515736579895, + "learning_rate": 4.6225113834427955e-06, + "loss": 2.8754, + "step": 57790 + }, + { + "epoch": 2.83, + "grad_norm": 0.7125532627105713, + "learning_rate": 4.619819449073548e-06, + "loss": 2.8305, + "step": 57791 + }, + { + "epoch": 2.83, + "grad_norm": 0.7690471410751343, + "learning_rate": 4.617128292678784e-06, + "loss": 2.8357, + "step": 57792 + }, + { + "epoch": 2.83, + "grad_norm": 0.717765212059021, + "learning_rate": 4.614437914265701e-06, + "loss": 2.9072, + "step": 57793 + }, + { + "epoch": 2.83, + "grad_norm": 0.7456265091896057, + "learning_rate": 4.611748313841324e-06, + "loss": 2.7729, + "step": 57794 + }, + { + "epoch": 2.83, + "grad_norm": 0.7556934952735901, + "learning_rate": 4.6090594914127475e-06, + "loss": 3.028, + "step": 57795 + }, + { + "epoch": 2.83, + "grad_norm": 0.76350337266922, + "learning_rate": 4.6063714469871005e-06, + "loss": 2.9979, + "step": 57796 + }, + { + "epoch": 2.83, + "grad_norm": 0.7897090911865234, + "learning_rate": 4.6036841805713765e-06, + "loss": 2.8443, + "step": 57797 + }, + { + "epoch": 2.83, + "grad_norm": 0.7721547484397888, + "learning_rate": 4.6009976921726695e-06, + "loss": 3.1241, + "step": 57798 + }, + { + "epoch": 2.83, + "grad_norm": 0.7226487398147583, + "learning_rate": 4.598311981798142e-06, + "loss": 2.8602, + "step": 57799 + }, + { + "epoch": 2.83, + "grad_norm": 0.7703262567520142, + "learning_rate": 4.59562704945482e-06, + "loss": 2.8151, + "step": 57800 + }, + { + "epoch": 2.83, + "grad_norm": 0.7636512517929077, + "learning_rate": 4.592942895149732e-06, + "loss": 2.9814, + "step": 57801 + }, + { + "epoch": 2.83, + "grad_norm": 0.7318775057792664, + "learning_rate": 4.590259518889971e-06, + "loss": 2.7938, + "step": 57802 + }, + { + "epoch": 2.83, + "grad_norm": 0.7615649700164795, + "learning_rate": 4.587576920682601e-06, + "loss": 2.8404, + "step": 57803 + }, + { + "epoch": 2.83, + "grad_norm": 0.7781710624694824, + "learning_rate": 4.584895100534747e-06, + "loss": 3.0723, + "step": 57804 + }, + { + "epoch": 2.83, + "grad_norm": 0.7660056948661804, + "learning_rate": 4.582214058453404e-06, + "loss": 2.945, + "step": 57805 + }, + { + "epoch": 2.83, + "grad_norm": 0.7571398019790649, + "learning_rate": 4.579533794445667e-06, + "loss": 2.8892, + "step": 57806 + }, + { + "epoch": 2.83, + "grad_norm": 0.793786346912384, + "learning_rate": 4.576854308518563e-06, + "loss": 3.0665, + "step": 57807 + }, + { + "epoch": 2.83, + "grad_norm": 0.7984482645988464, + "learning_rate": 4.574175600679186e-06, + "loss": 2.8812, + "step": 57808 + }, + { + "epoch": 2.83, + "grad_norm": 0.7446393966674805, + "learning_rate": 4.571497670934565e-06, + "loss": 2.9753, + "step": 57809 + }, + { + "epoch": 2.83, + "grad_norm": 0.7223838567733765, + "learning_rate": 4.56882051929176e-06, + "loss": 2.9849, + "step": 57810 + }, + { + "epoch": 2.83, + "grad_norm": 0.7122632265090942, + "learning_rate": 4.566144145757833e-06, + "loss": 3.0396, + "step": 57811 + }, + { + "epoch": 2.83, + "grad_norm": 0.7451797723770142, + "learning_rate": 4.56346855033981e-06, + "loss": 2.9919, + "step": 57812 + }, + { + "epoch": 2.83, + "grad_norm": 0.7426049113273621, + "learning_rate": 4.560793733044754e-06, + "loss": 2.8361, + "step": 57813 + }, + { + "epoch": 2.83, + "grad_norm": 0.7814031839370728, + "learning_rate": 4.558119693879725e-06, + "loss": 2.8659, + "step": 57814 + }, + { + "epoch": 2.83, + "grad_norm": 0.7301146388053894, + "learning_rate": 4.55544643285175e-06, + "loss": 2.7683, + "step": 57815 + }, + { + "epoch": 2.83, + "grad_norm": 0.7373456954956055, + "learning_rate": 4.552773949967892e-06, + "loss": 3.0187, + "step": 57816 + }, + { + "epoch": 2.83, + "grad_norm": 0.7450728416442871, + "learning_rate": 4.5501022452351095e-06, + "loss": 2.842, + "step": 57817 + }, + { + "epoch": 2.83, + "grad_norm": 0.7419819831848145, + "learning_rate": 4.5474313186605326e-06, + "loss": 2.803, + "step": 57818 + }, + { + "epoch": 2.83, + "grad_norm": 0.7292543649673462, + "learning_rate": 4.544761170251154e-06, + "loss": 3.1129, + "step": 57819 + }, + { + "epoch": 2.83, + "grad_norm": 0.7770516276359558, + "learning_rate": 4.542091800014003e-06, + "loss": 2.9207, + "step": 57820 + }, + { + "epoch": 2.83, + "grad_norm": 0.7825976610183716, + "learning_rate": 4.5394232079561385e-06, + "loss": 3.0597, + "step": 57821 + }, + { + "epoch": 2.83, + "grad_norm": 0.7760852575302124, + "learning_rate": 4.53675539408459e-06, + "loss": 2.6327, + "step": 57822 + }, + { + "epoch": 2.83, + "grad_norm": 0.7495847344398499, + "learning_rate": 4.534088358406318e-06, + "loss": 2.6761, + "step": 57823 + }, + { + "epoch": 2.83, + "grad_norm": 0.7523453235626221, + "learning_rate": 4.531422100928417e-06, + "loss": 2.7543, + "step": 57824 + }, + { + "epoch": 2.83, + "grad_norm": 0.7610305547714233, + "learning_rate": 4.528756621657881e-06, + "loss": 2.9113, + "step": 57825 + }, + { + "epoch": 2.83, + "grad_norm": 0.7986166477203369, + "learning_rate": 4.526091920601737e-06, + "loss": 2.74, + "step": 57826 + }, + { + "epoch": 2.83, + "grad_norm": 0.7516844272613525, + "learning_rate": 4.523427997766982e-06, + "loss": 2.9315, + "step": 57827 + }, + { + "epoch": 2.83, + "grad_norm": 0.8079379796981812, + "learning_rate": 4.520764853160674e-06, + "loss": 3.0084, + "step": 57828 + }, + { + "epoch": 2.83, + "grad_norm": 0.8045254349708557, + "learning_rate": 4.518102486789843e-06, + "loss": 2.6265, + "step": 57829 + }, + { + "epoch": 2.83, + "grad_norm": 0.7560080289840698, + "learning_rate": 4.515440898661382e-06, + "loss": 2.8943, + "step": 57830 + }, + { + "epoch": 2.83, + "grad_norm": 0.7986272573471069, + "learning_rate": 4.512780088782453e-06, + "loss": 2.8917, + "step": 57831 + }, + { + "epoch": 2.83, + "grad_norm": 0.7806861400604248, + "learning_rate": 4.510120057159949e-06, + "loss": 2.9683, + "step": 57832 + }, + { + "epoch": 2.83, + "grad_norm": 0.7481813430786133, + "learning_rate": 4.507460803800933e-06, + "loss": 2.7959, + "step": 57833 + }, + { + "epoch": 2.83, + "grad_norm": 0.8359472155570984, + "learning_rate": 4.504802328712398e-06, + "loss": 2.8192, + "step": 57834 + }, + { + "epoch": 2.83, + "grad_norm": 0.7175796031951904, + "learning_rate": 4.502144631901339e-06, + "loss": 3.0029, + "step": 57835 + }, + { + "epoch": 2.83, + "grad_norm": 0.7417463660240173, + "learning_rate": 4.499487713374783e-06, + "loss": 3.0287, + "step": 57836 + }, + { + "epoch": 2.83, + "grad_norm": 0.7876279950141907, + "learning_rate": 4.496831573139692e-06, + "loss": 2.8647, + "step": 57837 + }, + { + "epoch": 2.83, + "grad_norm": 0.7579513788223267, + "learning_rate": 4.494176211203093e-06, + "loss": 3.0034, + "step": 57838 + }, + { + "epoch": 2.83, + "grad_norm": 0.7461996078491211, + "learning_rate": 4.491521627571948e-06, + "loss": 2.7366, + "step": 57839 + }, + { + "epoch": 2.83, + "grad_norm": 0.753563404083252, + "learning_rate": 4.48886782225325e-06, + "loss": 3.0483, + "step": 57840 + }, + { + "epoch": 2.83, + "grad_norm": 0.7457968592643738, + "learning_rate": 4.4862147952539954e-06, + "loss": 2.7279, + "step": 57841 + }, + { + "epoch": 2.83, + "grad_norm": 0.7886890769004822, + "learning_rate": 4.4835625465812095e-06, + "loss": 3.0078, + "step": 57842 + }, + { + "epoch": 2.83, + "grad_norm": 0.7757450342178345, + "learning_rate": 4.4809110762418225e-06, + "loss": 2.752, + "step": 57843 + }, + { + "epoch": 2.83, + "grad_norm": 0.731127917766571, + "learning_rate": 4.47826038424286e-06, + "loss": 2.929, + "step": 57844 + }, + { + "epoch": 2.83, + "grad_norm": 0.7428131103515625, + "learning_rate": 4.475610470591284e-06, + "loss": 2.9674, + "step": 57845 + }, + { + "epoch": 2.83, + "grad_norm": 0.8183236718177795, + "learning_rate": 4.472961335294056e-06, + "loss": 2.9806, + "step": 57846 + }, + { + "epoch": 2.83, + "grad_norm": 0.7529081702232361, + "learning_rate": 4.47031297835817e-06, + "loss": 2.944, + "step": 57847 + }, + { + "epoch": 2.83, + "grad_norm": 0.7067267894744873, + "learning_rate": 4.467665399790654e-06, + "loss": 2.9451, + "step": 57848 + }, + { + "epoch": 2.84, + "grad_norm": 0.793649435043335, + "learning_rate": 4.465018599598369e-06, + "loss": 2.9698, + "step": 57849 + }, + { + "epoch": 2.84, + "grad_norm": 0.7690135836601257, + "learning_rate": 4.462372577788376e-06, + "loss": 2.6836, + "step": 57850 + }, + { + "epoch": 2.84, + "grad_norm": 0.7430137991905212, + "learning_rate": 4.459727334367635e-06, + "loss": 2.9179, + "step": 57851 + }, + { + "epoch": 2.84, + "grad_norm": 0.7567710876464844, + "learning_rate": 4.457082869343076e-06, + "loss": 3.0389, + "step": 57852 + }, + { + "epoch": 2.84, + "grad_norm": 0.7652338743209839, + "learning_rate": 4.454439182721692e-06, + "loss": 2.9754, + "step": 57853 + }, + { + "epoch": 2.84, + "grad_norm": 0.7574499845504761, + "learning_rate": 4.451796274510411e-06, + "loss": 2.7802, + "step": 57854 + }, + { + "epoch": 2.84, + "grad_norm": 0.7804326415061951, + "learning_rate": 4.449154144716227e-06, + "loss": 2.8233, + "step": 57855 + }, + { + "epoch": 2.84, + "grad_norm": 0.777418315410614, + "learning_rate": 4.446512793346102e-06, + "loss": 2.8151, + "step": 57856 + }, + { + "epoch": 2.84, + "grad_norm": 0.7421988248825073, + "learning_rate": 4.443872220406963e-06, + "loss": 2.8111, + "step": 57857 + }, + { + "epoch": 2.84, + "grad_norm": 0.758773922920227, + "learning_rate": 4.441232425905805e-06, + "loss": 3.0517, + "step": 57858 + }, + { + "epoch": 2.84, + "grad_norm": 0.8324280381202698, + "learning_rate": 4.438593409849556e-06, + "loss": 3.0362, + "step": 57859 + }, + { + "epoch": 2.84, + "grad_norm": 0.7193976044654846, + "learning_rate": 4.435955172245176e-06, + "loss": 2.7782, + "step": 57860 + }, + { + "epoch": 2.84, + "grad_norm": 0.7793514132499695, + "learning_rate": 4.433317713099593e-06, + "loss": 2.9797, + "step": 57861 + }, + { + "epoch": 2.84, + "grad_norm": 0.7374630570411682, + "learning_rate": 4.43068103241977e-06, + "loss": 2.6941, + "step": 57862 + }, + { + "epoch": 2.84, + "grad_norm": 0.7534642219543457, + "learning_rate": 4.428045130212665e-06, + "loss": 2.9008, + "step": 57863 + }, + { + "epoch": 2.84, + "grad_norm": 0.7688164114952087, + "learning_rate": 4.425410006485175e-06, + "loss": 3.0181, + "step": 57864 + }, + { + "epoch": 2.84, + "grad_norm": 0.7269670963287354, + "learning_rate": 4.4227756612442936e-06, + "loss": 2.9657, + "step": 57865 + }, + { + "epoch": 2.84, + "grad_norm": 0.7430902123451233, + "learning_rate": 4.420142094496948e-06, + "loss": 2.9997, + "step": 57866 + }, + { + "epoch": 2.84, + "grad_norm": 0.7281354069709778, + "learning_rate": 4.417509306250033e-06, + "loss": 2.9365, + "step": 57867 + }, + { + "epoch": 2.84, + "grad_norm": 0.7742909789085388, + "learning_rate": 4.414877296510544e-06, + "loss": 2.9117, + "step": 57868 + }, + { + "epoch": 2.84, + "grad_norm": 0.7314521670341492, + "learning_rate": 4.412246065285341e-06, + "loss": 2.8858, + "step": 57869 + }, + { + "epoch": 2.84, + "grad_norm": 0.7391765713691711, + "learning_rate": 4.409615612581418e-06, + "loss": 3.1135, + "step": 57870 + }, + { + "epoch": 2.84, + "grad_norm": 0.7244545221328735, + "learning_rate": 4.406985938405672e-06, + "loss": 2.8085, + "step": 57871 + }, + { + "epoch": 2.84, + "grad_norm": 0.7341313362121582, + "learning_rate": 4.404357042765028e-06, + "loss": 2.8818, + "step": 57872 + }, + { + "epoch": 2.84, + "grad_norm": 0.7704923152923584, + "learning_rate": 4.4017289256664476e-06, + "loss": 2.8435, + "step": 57873 + }, + { + "epoch": 2.84, + "grad_norm": 0.8078938722610474, + "learning_rate": 4.399101587116827e-06, + "loss": 3.1185, + "step": 57874 + }, + { + "epoch": 2.84, + "grad_norm": 0.7866788506507874, + "learning_rate": 4.396475027123059e-06, + "loss": 2.8965, + "step": 57875 + }, + { + "epoch": 2.84, + "grad_norm": 0.7199814915657043, + "learning_rate": 4.393849245692105e-06, + "loss": 2.9652, + "step": 57876 + }, + { + "epoch": 2.84, + "grad_norm": 0.7180720567703247, + "learning_rate": 4.391224242830827e-06, + "loss": 2.8224, + "step": 57877 + }, + { + "epoch": 2.84, + "grad_norm": 0.731706976890564, + "learning_rate": 4.388600018546218e-06, + "loss": 2.8166, + "step": 57878 + }, + { + "epoch": 2.84, + "grad_norm": 0.7510423064231873, + "learning_rate": 4.385976572845107e-06, + "loss": 2.8502, + "step": 57879 + }, + { + "epoch": 2.84, + "grad_norm": 0.7061409950256348, + "learning_rate": 4.383353905734488e-06, + "loss": 2.8161, + "step": 57880 + }, + { + "epoch": 2.84, + "grad_norm": 0.756230354309082, + "learning_rate": 4.380732017221189e-06, + "loss": 2.8982, + "step": 57881 + }, + { + "epoch": 2.84, + "grad_norm": 0.7549283504486084, + "learning_rate": 4.378110907312138e-06, + "loss": 2.9815, + "step": 57882 + }, + { + "epoch": 2.84, + "grad_norm": 0.7772846817970276, + "learning_rate": 4.375490576014296e-06, + "loss": 2.7412, + "step": 57883 + }, + { + "epoch": 2.84, + "grad_norm": 0.7680914998054504, + "learning_rate": 4.372871023334457e-06, + "loss": 2.6235, + "step": 57884 + }, + { + "epoch": 2.84, + "grad_norm": 0.7407665848731995, + "learning_rate": 4.370252249279615e-06, + "loss": 2.8474, + "step": 57885 + }, + { + "epoch": 2.84, + "grad_norm": 0.7720359563827515, + "learning_rate": 4.367634253856633e-06, + "loss": 2.807, + "step": 57886 + }, + { + "epoch": 2.84, + "grad_norm": 0.7055439352989197, + "learning_rate": 4.365017037072405e-06, + "loss": 2.6587, + "step": 57887 + }, + { + "epoch": 2.84, + "grad_norm": 0.8091383576393127, + "learning_rate": 4.362400598933824e-06, + "loss": 2.8048, + "step": 57888 + }, + { + "epoch": 2.84, + "grad_norm": 0.7102215886116028, + "learning_rate": 4.3597849394477855e-06, + "loss": 2.9037, + "step": 57889 + }, + { + "epoch": 2.84, + "grad_norm": 0.7323922514915466, + "learning_rate": 4.357170058621151e-06, + "loss": 2.7498, + "step": 57890 + }, + { + "epoch": 2.84, + "grad_norm": 0.7890856266021729, + "learning_rate": 4.354555956460881e-06, + "loss": 2.8841, + "step": 57891 + }, + { + "epoch": 2.84, + "grad_norm": 0.782171905040741, + "learning_rate": 4.3519426329737705e-06, + "loss": 2.9299, + "step": 57892 + }, + { + "epoch": 2.84, + "grad_norm": 0.7346777319908142, + "learning_rate": 4.349330088166781e-06, + "loss": 2.9818, + "step": 57893 + }, + { + "epoch": 2.84, + "grad_norm": 0.7390797138214111, + "learning_rate": 4.346718322046705e-06, + "loss": 2.9289, + "step": 57894 + }, + { + "epoch": 2.84, + "grad_norm": 0.710119366645813, + "learning_rate": 4.344107334620539e-06, + "loss": 2.9441, + "step": 57895 + }, + { + "epoch": 2.84, + "grad_norm": 0.8123124241828918, + "learning_rate": 4.341497125895044e-06, + "loss": 3.1647, + "step": 57896 + }, + { + "epoch": 2.84, + "grad_norm": 0.7103381156921387, + "learning_rate": 4.33888769587718e-06, + "loss": 2.8699, + "step": 57897 + }, + { + "epoch": 2.84, + "grad_norm": 0.740925669670105, + "learning_rate": 4.336279044573776e-06, + "loss": 2.9577, + "step": 57898 + }, + { + "epoch": 2.84, + "grad_norm": 0.7418505549430847, + "learning_rate": 4.33367117199166e-06, + "loss": 2.951, + "step": 57899 + }, + { + "epoch": 2.84, + "grad_norm": 0.7685325145721436, + "learning_rate": 4.331064078137791e-06, + "loss": 2.8862, + "step": 57900 + }, + { + "epoch": 2.84, + "grad_norm": 0.787038266658783, + "learning_rate": 4.328457763019033e-06, + "loss": 2.7146, + "step": 57901 + }, + { + "epoch": 2.84, + "grad_norm": 0.7468493580818176, + "learning_rate": 4.325852226642179e-06, + "loss": 2.8801, + "step": 57902 + }, + { + "epoch": 2.84, + "grad_norm": 0.7677332162857056, + "learning_rate": 4.323247469014124e-06, + "loss": 2.8442, + "step": 57903 + }, + { + "epoch": 2.84, + "grad_norm": 0.7528624534606934, + "learning_rate": 4.320643490141729e-06, + "loss": 2.6777, + "step": 57904 + }, + { + "epoch": 2.84, + "grad_norm": 0.7097703218460083, + "learning_rate": 4.318040290031888e-06, + "loss": 2.7669, + "step": 57905 + }, + { + "epoch": 2.84, + "grad_norm": 0.6983256936073303, + "learning_rate": 4.315437868691363e-06, + "loss": 2.9228, + "step": 57906 + }, + { + "epoch": 2.84, + "grad_norm": 0.8027907013893127, + "learning_rate": 4.312836226127115e-06, + "loss": 2.9804, + "step": 57907 + }, + { + "epoch": 2.84, + "grad_norm": 0.7508549690246582, + "learning_rate": 4.3102353623459376e-06, + "loss": 2.9204, + "step": 57908 + }, + { + "epoch": 2.84, + "grad_norm": 0.7843279242515564, + "learning_rate": 4.3076352773546925e-06, + "loss": 2.8192, + "step": 57909 + }, + { + "epoch": 2.84, + "grad_norm": 0.7532942295074463, + "learning_rate": 4.305035971160242e-06, + "loss": 2.9188, + "step": 57910 + }, + { + "epoch": 2.84, + "grad_norm": 0.7582915425300598, + "learning_rate": 4.3024374437694135e-06, + "loss": 2.9539, + "step": 57911 + }, + { + "epoch": 2.84, + "grad_norm": 0.7695998549461365, + "learning_rate": 4.299839695189034e-06, + "loss": 2.9009, + "step": 57912 + }, + { + "epoch": 2.84, + "grad_norm": 0.739365816116333, + "learning_rate": 4.2972427254259974e-06, + "loss": 2.8835, + "step": 57913 + }, + { + "epoch": 2.84, + "grad_norm": 0.745867133140564, + "learning_rate": 4.294646534487068e-06, + "loss": 2.8355, + "step": 57914 + }, + { + "epoch": 2.84, + "grad_norm": 0.7075350880622864, + "learning_rate": 4.2920511223791695e-06, + "loss": 2.8435, + "step": 57915 + }, + { + "epoch": 2.84, + "grad_norm": 0.8138481974601746, + "learning_rate": 4.2894564891091e-06, + "loss": 3.016, + "step": 57916 + }, + { + "epoch": 2.84, + "grad_norm": 0.9259201288223267, + "learning_rate": 4.286862634683685e-06, + "loss": 2.7868, + "step": 57917 + }, + { + "epoch": 2.84, + "grad_norm": 0.7305136322975159, + "learning_rate": 4.284269559109787e-06, + "loss": 2.81, + "step": 57918 + }, + { + "epoch": 2.84, + "grad_norm": 0.7330555319786072, + "learning_rate": 4.2816772623941675e-06, + "loss": 2.6985, + "step": 57919 + }, + { + "epoch": 2.84, + "grad_norm": 0.7787506580352783, + "learning_rate": 4.27908574454372e-06, + "loss": 2.8231, + "step": 57920 + }, + { + "epoch": 2.84, + "grad_norm": 0.7437611222267151, + "learning_rate": 4.2764950055652395e-06, + "loss": 2.8805, + "step": 57921 + }, + { + "epoch": 2.84, + "grad_norm": 0.7861391305923462, + "learning_rate": 4.273905045465553e-06, + "loss": 2.9361, + "step": 57922 + }, + { + "epoch": 2.84, + "grad_norm": 0.7396217584609985, + "learning_rate": 4.27131586425149e-06, + "loss": 2.7683, + "step": 57923 + }, + { + "epoch": 2.84, + "grad_norm": 0.7936074137687683, + "learning_rate": 4.268727461929877e-06, + "loss": 2.794, + "step": 57924 + }, + { + "epoch": 2.84, + "grad_norm": 0.7759957313537598, + "learning_rate": 4.2661398385075426e-06, + "loss": 2.7761, + "step": 57925 + }, + { + "epoch": 2.84, + "grad_norm": 0.7496296167373657, + "learning_rate": 4.263552993991248e-06, + "loss": 2.6912, + "step": 57926 + }, + { + "epoch": 2.84, + "grad_norm": 0.7550476789474487, + "learning_rate": 4.2609669283878215e-06, + "loss": 2.8715, + "step": 57927 + }, + { + "epoch": 2.84, + "grad_norm": 0.7111081480979919, + "learning_rate": 4.258381641704123e-06, + "loss": 2.7266, + "step": 57928 + }, + { + "epoch": 2.84, + "grad_norm": 0.7496201992034912, + "learning_rate": 4.255797133946915e-06, + "loss": 2.8366, + "step": 57929 + }, + { + "epoch": 2.84, + "grad_norm": 0.802704393863678, + "learning_rate": 4.253213405123057e-06, + "loss": 2.9926, + "step": 57930 + }, + { + "epoch": 2.84, + "grad_norm": 0.7297253608703613, + "learning_rate": 4.250630455239312e-06, + "loss": 3.123, + "step": 57931 + }, + { + "epoch": 2.84, + "grad_norm": 0.71900874376297, + "learning_rate": 4.248048284302441e-06, + "loss": 2.8253, + "step": 57932 + }, + { + "epoch": 2.84, + "grad_norm": 0.7305431962013245, + "learning_rate": 4.245466892319338e-06, + "loss": 2.8313, + "step": 57933 + }, + { + "epoch": 2.84, + "grad_norm": 0.7327029705047607, + "learning_rate": 4.242886279296731e-06, + "loss": 2.9302, + "step": 57934 + }, + { + "epoch": 2.84, + "grad_norm": 0.7928759455680847, + "learning_rate": 4.240306445241448e-06, + "loss": 2.8044, + "step": 57935 + }, + { + "epoch": 2.84, + "grad_norm": 0.731713056564331, + "learning_rate": 4.237727390160251e-06, + "loss": 3.0344, + "step": 57936 + }, + { + "epoch": 2.84, + "grad_norm": 0.736788809299469, + "learning_rate": 4.235149114059966e-06, + "loss": 2.8996, + "step": 57937 + }, + { + "epoch": 2.84, + "grad_norm": 0.7987184524536133, + "learning_rate": 4.232571616947422e-06, + "loss": 2.8778, + "step": 57938 + }, + { + "epoch": 2.84, + "grad_norm": 0.7894659638404846, + "learning_rate": 4.229994898829348e-06, + "loss": 2.9336, + "step": 57939 + }, + { + "epoch": 2.84, + "grad_norm": 0.7339268922805786, + "learning_rate": 4.227418959712536e-06, + "loss": 2.855, + "step": 57940 + }, + { + "epoch": 2.84, + "grad_norm": 0.7853479385375977, + "learning_rate": 4.224843799603783e-06, + "loss": 2.9414, + "step": 57941 + }, + { + "epoch": 2.84, + "grad_norm": 0.8146184086799622, + "learning_rate": 4.2222694185098495e-06, + "loss": 2.8615, + "step": 57942 + }, + { + "epoch": 2.84, + "grad_norm": 0.825524091720581, + "learning_rate": 4.2196958164375625e-06, + "loss": 2.9887, + "step": 57943 + }, + { + "epoch": 2.84, + "grad_norm": 0.7087740898132324, + "learning_rate": 4.2171229933936515e-06, + "loss": 2.742, + "step": 57944 + }, + { + "epoch": 2.84, + "grad_norm": 0.7632535099983215, + "learning_rate": 4.21455094938491e-06, + "loss": 2.7973, + "step": 57945 + }, + { + "epoch": 2.84, + "grad_norm": 0.7354146838188171, + "learning_rate": 4.211979684418165e-06, + "loss": 3.1476, + "step": 57946 + }, + { + "epoch": 2.84, + "grad_norm": 0.7568386793136597, + "learning_rate": 4.20940919850008e-06, + "loss": 2.8029, + "step": 57947 + }, + { + "epoch": 2.84, + "grad_norm": 0.7767660617828369, + "learning_rate": 4.206839491637515e-06, + "loss": 2.9156, + "step": 57948 + }, + { + "epoch": 2.84, + "grad_norm": 0.7283722162246704, + "learning_rate": 4.204270563837198e-06, + "loss": 2.8352, + "step": 57949 + }, + { + "epoch": 2.84, + "grad_norm": 0.7674680948257446, + "learning_rate": 4.201702415105923e-06, + "loss": 2.6455, + "step": 57950 + }, + { + "epoch": 2.84, + "grad_norm": 0.751236081123352, + "learning_rate": 4.199135045450386e-06, + "loss": 2.8124, + "step": 57951 + }, + { + "epoch": 2.84, + "grad_norm": 0.7168905735015869, + "learning_rate": 4.196568454877447e-06, + "loss": 2.817, + "step": 57952 + }, + { + "epoch": 2.84, + "grad_norm": 0.7038781046867371, + "learning_rate": 4.194002643393834e-06, + "loss": 2.5922, + "step": 57953 + }, + { + "epoch": 2.84, + "grad_norm": 0.790746808052063, + "learning_rate": 4.191437611006243e-06, + "loss": 2.7898, + "step": 57954 + }, + { + "epoch": 2.84, + "grad_norm": 0.7640130519866943, + "learning_rate": 4.188873357721501e-06, + "loss": 2.9048, + "step": 57955 + }, + { + "epoch": 2.84, + "grad_norm": 0.7775148749351501, + "learning_rate": 4.186309883546335e-06, + "loss": 2.9071, + "step": 57956 + }, + { + "epoch": 2.84, + "grad_norm": 0.7022102475166321, + "learning_rate": 4.183747188487474e-06, + "loss": 2.9208, + "step": 57957 + }, + { + "epoch": 2.84, + "grad_norm": 0.8010618686676025, + "learning_rate": 4.181185272551712e-06, + "loss": 2.9989, + "step": 57958 + }, + { + "epoch": 2.84, + "grad_norm": 0.7519500851631165, + "learning_rate": 4.178624135745779e-06, + "loss": 2.8834, + "step": 57959 + }, + { + "epoch": 2.84, + "grad_norm": 0.745065450668335, + "learning_rate": 4.176063778076433e-06, + "loss": 2.8246, + "step": 57960 + }, + { + "epoch": 2.84, + "grad_norm": 0.7399493455886841, + "learning_rate": 4.17350419955037e-06, + "loss": 2.997, + "step": 57961 + }, + { + "epoch": 2.84, + "grad_norm": 0.7513877153396606, + "learning_rate": 4.170945400174386e-06, + "loss": 2.8798, + "step": 57962 + }, + { + "epoch": 2.84, + "grad_norm": 0.7985098958015442, + "learning_rate": 4.168387379955207e-06, + "loss": 2.7076, + "step": 57963 + }, + { + "epoch": 2.84, + "grad_norm": 0.7976999878883362, + "learning_rate": 4.1658301388995285e-06, + "loss": 2.992, + "step": 57964 + }, + { + "epoch": 2.84, + "grad_norm": 0.7618657946586609, + "learning_rate": 4.163273677014145e-06, + "loss": 2.8952, + "step": 57965 + }, + { + "epoch": 2.84, + "grad_norm": 0.9525282979011536, + "learning_rate": 4.1607179943057176e-06, + "loss": 2.7617, + "step": 57966 + }, + { + "epoch": 2.84, + "grad_norm": 0.7083631753921509, + "learning_rate": 4.1581630907810745e-06, + "loss": 2.9582, + "step": 57967 + }, + { + "epoch": 2.84, + "grad_norm": 0.7903333902359009, + "learning_rate": 4.155608966446911e-06, + "loss": 2.9069, + "step": 57968 + }, + { + "epoch": 2.84, + "grad_norm": 0.7692322134971619, + "learning_rate": 4.153055621309886e-06, + "loss": 2.8151, + "step": 57969 + }, + { + "epoch": 2.84, + "grad_norm": 0.8629139065742493, + "learning_rate": 4.150503055376797e-06, + "loss": 2.9081, + "step": 57970 + }, + { + "epoch": 2.84, + "grad_norm": 0.7383148074150085, + "learning_rate": 4.147951268654337e-06, + "loss": 2.7084, + "step": 57971 + }, + { + "epoch": 2.84, + "grad_norm": 0.7752810716629028, + "learning_rate": 4.145400261149235e-06, + "loss": 2.7297, + "step": 57972 + }, + { + "epoch": 2.84, + "grad_norm": 0.7584665417671204, + "learning_rate": 4.142850032868217e-06, + "loss": 2.7295, + "step": 57973 + }, + { + "epoch": 2.84, + "grad_norm": 0.7689148783683777, + "learning_rate": 4.140300583817979e-06, + "loss": 2.9014, + "step": 57974 + }, + { + "epoch": 2.84, + "grad_norm": 0.7338578104972839, + "learning_rate": 4.137751914005283e-06, + "loss": 2.6482, + "step": 57975 + }, + { + "epoch": 2.84, + "grad_norm": 0.7252892255783081, + "learning_rate": 4.135204023436822e-06, + "loss": 3.0192, + "step": 57976 + }, + { + "epoch": 2.84, + "grad_norm": 0.7320716977119446, + "learning_rate": 4.132656912119226e-06, + "loss": 2.8298, + "step": 57977 + }, + { + "epoch": 2.84, + "grad_norm": 0.7503402829170227, + "learning_rate": 4.130110580059321e-06, + "loss": 2.7203, + "step": 57978 + }, + { + "epoch": 2.84, + "grad_norm": 0.756325364112854, + "learning_rate": 4.127565027263735e-06, + "loss": 2.9428, + "step": 57979 + }, + { + "epoch": 2.84, + "grad_norm": 0.7448549270629883, + "learning_rate": 4.125020253739231e-06, + "loss": 2.8511, + "step": 57980 + }, + { + "epoch": 2.84, + "grad_norm": 0.7655499577522278, + "learning_rate": 4.122476259492469e-06, + "loss": 2.8666, + "step": 57981 + }, + { + "epoch": 2.84, + "grad_norm": 0.7265483736991882, + "learning_rate": 4.119933044530177e-06, + "loss": 2.8987, + "step": 57982 + }, + { + "epoch": 2.84, + "grad_norm": 0.7448196411132812, + "learning_rate": 4.117390608859017e-06, + "loss": 2.792, + "step": 57983 + }, + { + "epoch": 2.84, + "grad_norm": 0.7171865105628967, + "learning_rate": 4.114848952485683e-06, + "loss": 3.0098, + "step": 57984 + }, + { + "epoch": 2.84, + "grad_norm": 0.8641087412834167, + "learning_rate": 4.112308075416937e-06, + "loss": 2.7329, + "step": 57985 + }, + { + "epoch": 2.84, + "grad_norm": 0.7868609428405762, + "learning_rate": 4.109767977659406e-06, + "loss": 2.8346, + "step": 57986 + }, + { + "epoch": 2.84, + "grad_norm": 0.7726017236709595, + "learning_rate": 4.107228659219819e-06, + "loss": 2.8714, + "step": 57987 + }, + { + "epoch": 2.84, + "grad_norm": 0.7346259951591492, + "learning_rate": 4.104690120104803e-06, + "loss": 2.9884, + "step": 57988 + }, + { + "epoch": 2.84, + "grad_norm": 0.7412421107292175, + "learning_rate": 4.102152360321121e-06, + "loss": 2.9378, + "step": 57989 + }, + { + "epoch": 2.84, + "grad_norm": 0.7275198101997375, + "learning_rate": 4.099615379875432e-06, + "loss": 2.8435, + "step": 57990 + }, + { + "epoch": 2.84, + "grad_norm": 0.7367424368858337, + "learning_rate": 4.097079178774399e-06, + "loss": 2.8953, + "step": 57991 + }, + { + "epoch": 2.84, + "grad_norm": 0.7552502751350403, + "learning_rate": 4.094543757024715e-06, + "loss": 2.7933, + "step": 57992 + }, + { + "epoch": 2.84, + "grad_norm": 0.7371522188186646, + "learning_rate": 4.09200911463301e-06, + "loss": 3.0965, + "step": 57993 + }, + { + "epoch": 2.84, + "grad_norm": 0.785754919052124, + "learning_rate": 4.089475251606045e-06, + "loss": 2.8669, + "step": 57994 + }, + { + "epoch": 2.84, + "grad_norm": 0.793599009513855, + "learning_rate": 4.086942167950446e-06, + "loss": 2.6429, + "step": 57995 + }, + { + "epoch": 2.84, + "grad_norm": 0.7560648918151855, + "learning_rate": 4.084409863672877e-06, + "loss": 3.0258, + "step": 57996 + }, + { + "epoch": 2.84, + "grad_norm": 0.7576151490211487, + "learning_rate": 4.081878338780065e-06, + "loss": 2.9662, + "step": 57997 + }, + { + "epoch": 2.84, + "grad_norm": 0.7509180903434753, + "learning_rate": 4.079347593278604e-06, + "loss": 2.7924, + "step": 57998 + }, + { + "epoch": 2.84, + "grad_norm": 0.7320362329483032, + "learning_rate": 4.076817627175189e-06, + "loss": 3.1727, + "step": 57999 + }, + { + "epoch": 2.84, + "grad_norm": 0.7936714887619019, + "learning_rate": 4.074288440476514e-06, + "loss": 2.9039, + "step": 58000 + }, + { + "epoch": 2.84, + "grad_norm": 0.7300601005554199, + "learning_rate": 4.071760033189175e-06, + "loss": 2.7854, + "step": 58001 + }, + { + "epoch": 2.84, + "grad_norm": 0.8125167489051819, + "learning_rate": 4.0692324053199e-06, + "loss": 2.8679, + "step": 58002 + }, + { + "epoch": 2.84, + "grad_norm": 0.768974244594574, + "learning_rate": 4.066705556875283e-06, + "loss": 2.8779, + "step": 58003 + }, + { + "epoch": 2.84, + "grad_norm": 0.8136953711509705, + "learning_rate": 4.0641794878620184e-06, + "loss": 2.9216, + "step": 58004 + }, + { + "epoch": 2.84, + "grad_norm": 0.7406480312347412, + "learning_rate": 4.061654198286768e-06, + "loss": 2.853, + "step": 58005 + }, + { + "epoch": 2.84, + "grad_norm": 0.7325662970542908, + "learning_rate": 4.059129688156127e-06, + "loss": 2.8341, + "step": 58006 + }, + { + "epoch": 2.84, + "grad_norm": 0.7513902187347412, + "learning_rate": 4.056605957476822e-06, + "loss": 3.0636, + "step": 58007 + }, + { + "epoch": 2.84, + "grad_norm": 0.7709435224533081, + "learning_rate": 4.0540830062554485e-06, + "loss": 2.8569, + "step": 58008 + }, + { + "epoch": 2.84, + "grad_norm": 0.7840178608894348, + "learning_rate": 4.051560834498668e-06, + "loss": 2.9059, + "step": 58009 + }, + { + "epoch": 2.84, + "grad_norm": 0.7339916825294495, + "learning_rate": 4.049039442213109e-06, + "loss": 3.0242, + "step": 58010 + }, + { + "epoch": 2.84, + "grad_norm": 0.7548692226409912, + "learning_rate": 4.046518829405432e-06, + "loss": 2.9171, + "step": 58011 + }, + { + "epoch": 2.84, + "grad_norm": 0.7953229546546936, + "learning_rate": 4.0439989960822985e-06, + "loss": 3.0365, + "step": 58012 + }, + { + "epoch": 2.84, + "grad_norm": 0.7593258023262024, + "learning_rate": 4.04147994225027e-06, + "loss": 2.9056, + "step": 58013 + }, + { + "epoch": 2.84, + "grad_norm": 0.7055078744888306, + "learning_rate": 4.0389616679160074e-06, + "loss": 2.7517, + "step": 58014 + }, + { + "epoch": 2.84, + "grad_norm": 0.7557649612426758, + "learning_rate": 4.036444173086206e-06, + "loss": 2.7834, + "step": 58015 + }, + { + "epoch": 2.84, + "grad_norm": 0.7741184234619141, + "learning_rate": 4.0339274577674275e-06, + "loss": 2.812, + "step": 58016 + }, + { + "epoch": 2.84, + "grad_norm": 0.7468675374984741, + "learning_rate": 4.031411521966332e-06, + "loss": 3.0378, + "step": 58017 + }, + { + "epoch": 2.84, + "grad_norm": 0.7847301959991455, + "learning_rate": 4.028896365689516e-06, + "loss": 2.9056, + "step": 58018 + }, + { + "epoch": 2.84, + "grad_norm": 0.7446919083595276, + "learning_rate": 4.026381988943672e-06, + "loss": 2.8941, + "step": 58019 + }, + { + "epoch": 2.84, + "grad_norm": 0.7204757332801819, + "learning_rate": 4.0238683917353634e-06, + "loss": 2.852, + "step": 58020 + }, + { + "epoch": 2.84, + "grad_norm": 0.7593426704406738, + "learning_rate": 4.021355574071217e-06, + "loss": 2.8235, + "step": 58021 + }, + { + "epoch": 2.84, + "grad_norm": 0.7240512371063232, + "learning_rate": 4.018843535957861e-06, + "loss": 2.7313, + "step": 58022 + }, + { + "epoch": 2.84, + "grad_norm": 0.7632853388786316, + "learning_rate": 4.01633227740189e-06, + "loss": 2.9329, + "step": 58023 + }, + { + "epoch": 2.84, + "grad_norm": 0.753861665725708, + "learning_rate": 4.013821798409933e-06, + "loss": 2.913, + "step": 58024 + }, + { + "epoch": 2.84, + "grad_norm": 0.8168571591377258, + "learning_rate": 4.0113120989886176e-06, + "loss": 2.8373, + "step": 58025 + }, + { + "epoch": 2.84, + "grad_norm": 0.7445526719093323, + "learning_rate": 4.008803179144571e-06, + "loss": 2.951, + "step": 58026 + }, + { + "epoch": 2.84, + "grad_norm": 0.7250993251800537, + "learning_rate": 4.006295038884355e-06, + "loss": 3.1706, + "step": 58027 + }, + { + "epoch": 2.84, + "grad_norm": 0.7294281721115112, + "learning_rate": 4.003787678214565e-06, + "loss": 2.9983, + "step": 58028 + }, + { + "epoch": 2.84, + "grad_norm": 0.8010278344154358, + "learning_rate": 4.0012810971418285e-06, + "loss": 2.7596, + "step": 58029 + }, + { + "epoch": 2.84, + "grad_norm": 0.7343051433563232, + "learning_rate": 3.998775295672773e-06, + "loss": 2.9007, + "step": 58030 + }, + { + "epoch": 2.84, + "grad_norm": 0.7521501779556274, + "learning_rate": 3.9962702738139955e-06, + "loss": 2.9278, + "step": 58031 + }, + { + "epoch": 2.84, + "grad_norm": 0.7345162630081177, + "learning_rate": 3.993766031572055e-06, + "loss": 3.1938, + "step": 58032 + }, + { + "epoch": 2.84, + "grad_norm": 0.7414243817329407, + "learning_rate": 3.9912625689535795e-06, + "loss": 2.9453, + "step": 58033 + }, + { + "epoch": 2.84, + "grad_norm": 0.7303183674812317, + "learning_rate": 3.988759885965131e-06, + "loss": 2.8327, + "step": 58034 + }, + { + "epoch": 2.84, + "grad_norm": 0.7680356502532959, + "learning_rate": 3.986257982613339e-06, + "loss": 2.9224, + "step": 58035 + }, + { + "epoch": 2.84, + "grad_norm": 0.7452159523963928, + "learning_rate": 3.983756858904763e-06, + "loss": 2.9883, + "step": 58036 + }, + { + "epoch": 2.84, + "grad_norm": 0.7528615593910217, + "learning_rate": 3.981256514845999e-06, + "loss": 2.7967, + "step": 58037 + }, + { + "epoch": 2.84, + "grad_norm": 0.7940719723701477, + "learning_rate": 3.97875695044364e-06, + "loss": 2.8682, + "step": 58038 + }, + { + "epoch": 2.84, + "grad_norm": 0.7146586179733276, + "learning_rate": 3.9762581657042496e-06, + "loss": 3.025, + "step": 58039 + }, + { + "epoch": 2.84, + "grad_norm": 0.7448782920837402, + "learning_rate": 3.973760160634453e-06, + "loss": 2.828, + "step": 58040 + }, + { + "epoch": 2.84, + "grad_norm": 0.7157915830612183, + "learning_rate": 3.971262935240782e-06, + "loss": 2.9029, + "step": 58041 + }, + { + "epoch": 2.84, + "grad_norm": 0.824224591255188, + "learning_rate": 3.968766489529862e-06, + "loss": 2.7699, + "step": 58042 + }, + { + "epoch": 2.84, + "grad_norm": 0.7506048679351807, + "learning_rate": 3.966270823508189e-06, + "loss": 2.8328, + "step": 58043 + }, + { + "epoch": 2.84, + "grad_norm": 0.7083612084388733, + "learning_rate": 3.963775937182423e-06, + "loss": 2.8861, + "step": 58044 + }, + { + "epoch": 2.84, + "grad_norm": 0.7859985828399658, + "learning_rate": 3.9612818305590934e-06, + "loss": 2.6817, + "step": 58045 + }, + { + "epoch": 2.84, + "grad_norm": 0.7196922898292542, + "learning_rate": 3.958788503644761e-06, + "loss": 3.0316, + "step": 58046 + }, + { + "epoch": 2.84, + "grad_norm": 0.7509509325027466, + "learning_rate": 3.95629595644602e-06, + "loss": 2.9547, + "step": 58047 + }, + { + "epoch": 2.84, + "grad_norm": 0.7594537138938904, + "learning_rate": 3.953804188969401e-06, + "loss": 3.013, + "step": 58048 + }, + { + "epoch": 2.84, + "grad_norm": 0.7385445237159729, + "learning_rate": 3.951313201221495e-06, + "loss": 2.6765, + "step": 58049 + }, + { + "epoch": 2.84, + "grad_norm": 0.7691807746887207, + "learning_rate": 3.9488229932088665e-06, + "loss": 2.778, + "step": 58050 + }, + { + "epoch": 2.84, + "grad_norm": 0.8221585154533386, + "learning_rate": 3.946333564938042e-06, + "loss": 2.7925, + "step": 58051 + }, + { + "epoch": 2.84, + "grad_norm": 0.7647941708564758, + "learning_rate": 3.943844916415617e-06, + "loss": 2.9116, + "step": 58052 + }, + { + "epoch": 2.85, + "grad_norm": 0.7364143133163452, + "learning_rate": 3.941357047648119e-06, + "loss": 2.872, + "step": 58053 + }, + { + "epoch": 2.85, + "grad_norm": 0.7513567805290222, + "learning_rate": 3.9388699586421104e-06, + "loss": 2.9102, + "step": 58054 + }, + { + "epoch": 2.85, + "grad_norm": 0.8263747692108154, + "learning_rate": 3.936383649404151e-06, + "loss": 3.1531, + "step": 58055 + }, + { + "epoch": 2.85, + "grad_norm": 0.7924360632896423, + "learning_rate": 3.93389811994077e-06, + "loss": 3.0245, + "step": 58056 + }, + { + "epoch": 2.85, + "grad_norm": 0.7682927250862122, + "learning_rate": 3.931413370258529e-06, + "loss": 2.8791, + "step": 58057 + }, + { + "epoch": 2.85, + "grad_norm": 0.7378009557723999, + "learning_rate": 3.928929400363922e-06, + "loss": 2.8736, + "step": 58058 + }, + { + "epoch": 2.85, + "grad_norm": 0.742211103439331, + "learning_rate": 3.926446210263579e-06, + "loss": 2.6486, + "step": 58059 + }, + { + "epoch": 2.85, + "grad_norm": 0.7150681614875793, + "learning_rate": 3.923963799963992e-06, + "loss": 3.034, + "step": 58060 + }, + { + "epoch": 2.85, + "grad_norm": 0.726213276386261, + "learning_rate": 3.92148216947169e-06, + "loss": 2.9354, + "step": 58061 + }, + { + "epoch": 2.85, + "grad_norm": 0.7555464506149292, + "learning_rate": 3.919001318793235e-06, + "loss": 2.6879, + "step": 58062 + }, + { + "epoch": 2.85, + "grad_norm": 0.7286418080329895, + "learning_rate": 3.916521247935156e-06, + "loss": 2.8126, + "step": 58063 + }, + { + "epoch": 2.85, + "grad_norm": 0.7420178651809692, + "learning_rate": 3.914041956903979e-06, + "loss": 2.7675, + "step": 58064 + }, + { + "epoch": 2.85, + "grad_norm": 0.7674248814582825, + "learning_rate": 3.9115634457062e-06, + "loss": 3.0303, + "step": 58065 + }, + { + "epoch": 2.85, + "grad_norm": 0.7217745184898376, + "learning_rate": 3.909085714348381e-06, + "loss": 2.9029, + "step": 58066 + }, + { + "epoch": 2.85, + "grad_norm": 0.8204011917114258, + "learning_rate": 3.906608762837082e-06, + "loss": 2.9336, + "step": 58067 + }, + { + "epoch": 2.85, + "grad_norm": 0.744408369064331, + "learning_rate": 3.904132591178766e-06, + "loss": 2.9035, + "step": 58068 + }, + { + "epoch": 2.85, + "grad_norm": 0.7998400926589966, + "learning_rate": 3.901657199379993e-06, + "loss": 2.8445, + "step": 58069 + }, + { + "epoch": 2.85, + "grad_norm": 0.7387979030609131, + "learning_rate": 3.899182587447258e-06, + "loss": 2.7588, + "step": 58070 + }, + { + "epoch": 2.85, + "grad_norm": 0.7533370852470398, + "learning_rate": 3.89670875538709e-06, + "loss": 2.8385, + "step": 58071 + }, + { + "epoch": 2.85, + "grad_norm": 0.7749383449554443, + "learning_rate": 3.8942357032060166e-06, + "loss": 2.8858, + "step": 58072 + }, + { + "epoch": 2.85, + "grad_norm": 0.7244678735733032, + "learning_rate": 3.891763430910533e-06, + "loss": 2.9102, + "step": 58073 + }, + { + "epoch": 2.85, + "grad_norm": 0.7526236176490784, + "learning_rate": 3.8892919385071664e-06, + "loss": 2.7091, + "step": 58074 + }, + { + "epoch": 2.85, + "grad_norm": 0.7498911619186401, + "learning_rate": 3.886821226002379e-06, + "loss": 2.7809, + "step": 58075 + }, + { + "epoch": 2.85, + "grad_norm": 0.8182533383369446, + "learning_rate": 3.884351293402766e-06, + "loss": 2.8096, + "step": 58076 + }, + { + "epoch": 2.85, + "grad_norm": 0.7636764645576477, + "learning_rate": 3.8818821407147875e-06, + "loss": 2.8166, + "step": 58077 + }, + { + "epoch": 2.85, + "grad_norm": 0.7577216625213623, + "learning_rate": 3.879413767944905e-06, + "loss": 3.0238, + "step": 58078 + }, + { + "epoch": 2.85, + "grad_norm": 0.7836326360702515, + "learning_rate": 3.876946175099682e-06, + "loss": 2.8152, + "step": 58079 + }, + { + "epoch": 2.85, + "grad_norm": 0.7404009699821472, + "learning_rate": 3.874479362185545e-06, + "loss": 2.831, + "step": 58080 + }, + { + "epoch": 2.85, + "grad_norm": 0.7890130877494812, + "learning_rate": 3.872013329209089e-06, + "loss": 2.7029, + "step": 58081 + }, + { + "epoch": 2.85, + "grad_norm": 0.821405827999115, + "learning_rate": 3.869548076176743e-06, + "loss": 2.731, + "step": 58082 + }, + { + "epoch": 2.85, + "grad_norm": 0.7491986155509949, + "learning_rate": 3.867083603095e-06, + "loss": 2.6854, + "step": 58083 + }, + { + "epoch": 2.85, + "grad_norm": 0.7356759309768677, + "learning_rate": 3.86461990997039e-06, + "loss": 3.0514, + "step": 58084 + }, + { + "epoch": 2.85, + "grad_norm": 0.7587080597877502, + "learning_rate": 3.862156996809407e-06, + "loss": 3.1083, + "step": 58085 + }, + { + "epoch": 2.85, + "grad_norm": 0.7740174531936646, + "learning_rate": 3.859694863618445e-06, + "loss": 2.9711, + "step": 58086 + }, + { + "epoch": 2.85, + "grad_norm": 0.761070191860199, + "learning_rate": 3.857233510404101e-06, + "loss": 3.0597, + "step": 58087 + }, + { + "epoch": 2.85, + "grad_norm": 0.7623142004013062, + "learning_rate": 3.854772937172768e-06, + "loss": 2.7424, + "step": 58088 + }, + { + "epoch": 2.85, + "grad_norm": 0.7792325615882874, + "learning_rate": 3.852313143931007e-06, + "loss": 2.8965, + "step": 58089 + }, + { + "epoch": 2.85, + "grad_norm": 0.7377564311027527, + "learning_rate": 3.849854130685248e-06, + "loss": 2.9561, + "step": 58090 + }, + { + "epoch": 2.85, + "grad_norm": 0.7853538393974304, + "learning_rate": 3.847395897441985e-06, + "loss": 3.1712, + "step": 58091 + }, + { + "epoch": 2.85, + "grad_norm": 0.7288347482681274, + "learning_rate": 3.844938444207679e-06, + "loss": 2.7964, + "step": 58092 + }, + { + "epoch": 2.85, + "grad_norm": 0.7623826265335083, + "learning_rate": 3.842481770988792e-06, + "loss": 2.8275, + "step": 58093 + }, + { + "epoch": 2.85, + "grad_norm": 0.7762326598167419, + "learning_rate": 3.840025877791852e-06, + "loss": 2.961, + "step": 58094 + }, + { + "epoch": 2.85, + "grad_norm": 0.7291229963302612, + "learning_rate": 3.837570764623254e-06, + "loss": 2.9422, + "step": 58095 + }, + { + "epoch": 2.85, + "grad_norm": 0.7547798752784729, + "learning_rate": 3.8351164314894935e-06, + "loss": 2.7757, + "step": 58096 + }, + { + "epoch": 2.85, + "grad_norm": 0.7603956460952759, + "learning_rate": 3.8326628783970306e-06, + "loss": 3.2117, + "step": 58097 + }, + { + "epoch": 2.85, + "grad_norm": 0.7306252121925354, + "learning_rate": 3.8302101053523605e-06, + "loss": 2.7616, + "step": 58098 + }, + { + "epoch": 2.85, + "grad_norm": 0.7425011992454529, + "learning_rate": 3.8277581123619114e-06, + "loss": 2.9913, + "step": 58099 + }, + { + "epoch": 2.85, + "grad_norm": 0.8286489248275757, + "learning_rate": 3.825306899432145e-06, + "loss": 2.7551, + "step": 58100 + }, + { + "epoch": 2.85, + "grad_norm": 0.7279012799263, + "learning_rate": 3.8228564665695235e-06, + "loss": 2.9012, + "step": 58101 + }, + { + "epoch": 2.85, + "grad_norm": 0.713568925857544, + "learning_rate": 3.820406813780508e-06, + "loss": 2.9402, + "step": 58102 + }, + { + "epoch": 2.85, + "grad_norm": 0.7257249355316162, + "learning_rate": 3.8179579410715255e-06, + "loss": 2.6947, + "step": 58103 + }, + { + "epoch": 2.85, + "grad_norm": 0.7878603935241699, + "learning_rate": 3.815509848449039e-06, + "loss": 2.8945, + "step": 58104 + }, + { + "epoch": 2.85, + "grad_norm": 0.7164974808692932, + "learning_rate": 3.8130625359194756e-06, + "loss": 3.0317, + "step": 58105 + }, + { + "epoch": 2.85, + "grad_norm": 0.731782853603363, + "learning_rate": 3.810616003489364e-06, + "loss": 2.9602, + "step": 58106 + }, + { + "epoch": 2.85, + "grad_norm": 0.7645469903945923, + "learning_rate": 3.808170251165066e-06, + "loss": 2.8414, + "step": 58107 + }, + { + "epoch": 2.85, + "grad_norm": 0.7678006887435913, + "learning_rate": 3.805725278953009e-06, + "loss": 2.8792, + "step": 58108 + }, + { + "epoch": 2.85, + "grad_norm": 0.727119505405426, + "learning_rate": 3.8032810868596888e-06, + "loss": 3.0004, + "step": 58109 + }, + { + "epoch": 2.85, + "grad_norm": 0.8259202837944031, + "learning_rate": 3.8008376748915324e-06, + "loss": 2.7632, + "step": 58110 + }, + { + "epoch": 2.85, + "grad_norm": 0.7696822285652161, + "learning_rate": 3.798395043054936e-06, + "loss": 2.7003, + "step": 58111 + }, + { + "epoch": 2.85, + "grad_norm": 0.753521203994751, + "learning_rate": 3.7959531913563932e-06, + "loss": 2.9795, + "step": 58112 + }, + { + "epoch": 2.85, + "grad_norm": 0.7405487298965454, + "learning_rate": 3.7935121198023e-06, + "loss": 2.8325, + "step": 58113 + }, + { + "epoch": 2.85, + "grad_norm": 0.7650240063667297, + "learning_rate": 3.7910718283990837e-06, + "loss": 3.0367, + "step": 58114 + }, + { + "epoch": 2.85, + "grad_norm": 0.7302567958831787, + "learning_rate": 3.788632317153173e-06, + "loss": 2.7721, + "step": 58115 + }, + { + "epoch": 2.85, + "grad_norm": 0.7484518885612488, + "learning_rate": 3.786193586070996e-06, + "loss": 2.791, + "step": 58116 + }, + { + "epoch": 2.85, + "grad_norm": 0.7105563282966614, + "learning_rate": 3.7837556351589802e-06, + "loss": 2.9578, + "step": 58117 + }, + { + "epoch": 2.85, + "grad_norm": 0.7681797742843628, + "learning_rate": 3.7813184644235217e-06, + "loss": 3.0148, + "step": 58118 + }, + { + "epoch": 2.85, + "grad_norm": 0.7496532797813416, + "learning_rate": 3.7788820738711145e-06, + "loss": 3.0347, + "step": 58119 + }, + { + "epoch": 2.85, + "grad_norm": 0.7610386610031128, + "learning_rate": 3.7764464635080873e-06, + "loss": 2.8305, + "step": 58120 + }, + { + "epoch": 2.85, + "grad_norm": 0.7656939625740051, + "learning_rate": 3.774011633340901e-06, + "loss": 2.881, + "step": 58121 + }, + { + "epoch": 2.85, + "grad_norm": 0.7909092307090759, + "learning_rate": 3.771577583375951e-06, + "loss": 2.9376, + "step": 58122 + }, + { + "epoch": 2.85, + "grad_norm": 0.7618745565414429, + "learning_rate": 3.7691443136196653e-06, + "loss": 2.592, + "step": 58123 + }, + { + "epoch": 2.85, + "grad_norm": 0.7755120992660522, + "learning_rate": 3.766711824078439e-06, + "loss": 2.8602, + "step": 58124 + }, + { + "epoch": 2.85, + "grad_norm": 0.7302433252334595, + "learning_rate": 3.7642801147586664e-06, + "loss": 2.7635, + "step": 58125 + }, + { + "epoch": 2.85, + "grad_norm": 0.7398534417152405, + "learning_rate": 3.7618491856667764e-06, + "loss": 2.9394, + "step": 58126 + }, + { + "epoch": 2.85, + "grad_norm": 0.7428982853889465, + "learning_rate": 3.7594190368091638e-06, + "loss": 2.8178, + "step": 58127 + }, + { + "epoch": 2.85, + "grad_norm": 0.7572886347770691, + "learning_rate": 3.756989668192256e-06, + "loss": 2.7583, + "step": 58128 + }, + { + "epoch": 2.85, + "grad_norm": 0.7500163912773132, + "learning_rate": 3.7545610798224156e-06, + "loss": 2.7213, + "step": 58129 + }, + { + "epoch": 2.85, + "grad_norm": 0.7317116260528564, + "learning_rate": 3.7521332717060372e-06, + "loss": 2.727, + "step": 58130 + }, + { + "epoch": 2.85, + "grad_norm": 0.7358356714248657, + "learning_rate": 3.749706243849482e-06, + "loss": 2.8328, + "step": 58131 + }, + { + "epoch": 2.85, + "grad_norm": 0.7284991145133972, + "learning_rate": 3.747279996259245e-06, + "loss": 2.8797, + "step": 58132 + }, + { + "epoch": 2.85, + "grad_norm": 0.7599244713783264, + "learning_rate": 3.744854528941621e-06, + "loss": 2.8693, + "step": 58133 + }, + { + "epoch": 2.85, + "grad_norm": 0.791732907295227, + "learning_rate": 3.742429841903072e-06, + "loss": 2.6926, + "step": 58134 + }, + { + "epoch": 2.85, + "grad_norm": 0.7750145792961121, + "learning_rate": 3.740005935149959e-06, + "loss": 3.0105, + "step": 58135 + }, + { + "epoch": 2.85, + "grad_norm": 0.7046689391136169, + "learning_rate": 3.7375828086886106e-06, + "loss": 2.9164, + "step": 58136 + }, + { + "epoch": 2.85, + "grad_norm": 0.729168176651001, + "learning_rate": 3.735160462525455e-06, + "loss": 3.0215, + "step": 58137 + }, + { + "epoch": 2.85, + "grad_norm": 0.7688641548156738, + "learning_rate": 3.732738896666887e-06, + "loss": 2.8869, + "step": 58138 + }, + { + "epoch": 2.85, + "grad_norm": 0.7332656383514404, + "learning_rate": 3.730318111119268e-06, + "loss": 2.721, + "step": 58139 + }, + { + "epoch": 2.85, + "grad_norm": 0.7618536353111267, + "learning_rate": 3.72789810588896e-06, + "loss": 2.9654, + "step": 58140 + }, + { + "epoch": 2.85, + "grad_norm": 0.7432697415351868, + "learning_rate": 3.7254788809823577e-06, + "loss": 2.8195, + "step": 58141 + }, + { + "epoch": 2.85, + "grad_norm": 0.7419174313545227, + "learning_rate": 3.7230604364057894e-06, + "loss": 2.9438, + "step": 58142 + }, + { + "epoch": 2.85, + "grad_norm": 0.7732837200164795, + "learning_rate": 3.7206427721657162e-06, + "loss": 2.9231, + "step": 58143 + }, + { + "epoch": 2.85, + "grad_norm": 0.7398931980133057, + "learning_rate": 3.7182258882684333e-06, + "loss": 2.7001, + "step": 58144 + }, + { + "epoch": 2.85, + "grad_norm": 0.7343015670776367, + "learning_rate": 3.715809784720303e-06, + "loss": 2.7543, + "step": 58145 + }, + { + "epoch": 2.85, + "grad_norm": 0.7587280869483948, + "learning_rate": 3.713394461527719e-06, + "loss": 2.8208, + "step": 58146 + }, + { + "epoch": 2.85, + "grad_norm": 0.74312824010849, + "learning_rate": 3.7109799186970437e-06, + "loss": 2.7435, + "step": 58147 + }, + { + "epoch": 2.85, + "grad_norm": 0.7362629175186157, + "learning_rate": 3.708566156234605e-06, + "loss": 2.8137, + "step": 58148 + }, + { + "epoch": 2.85, + "grad_norm": 0.7702608108520508, + "learning_rate": 3.7061531741467975e-06, + "loss": 2.9806, + "step": 58149 + }, + { + "epoch": 2.85, + "grad_norm": 0.7515386343002319, + "learning_rate": 3.7037409724399504e-06, + "loss": 2.8289, + "step": 58150 + }, + { + "epoch": 2.85, + "grad_norm": 0.762421190738678, + "learning_rate": 3.7013295511204576e-06, + "loss": 2.8378, + "step": 58151 + }, + { + "epoch": 2.85, + "grad_norm": 0.7281259298324585, + "learning_rate": 3.6989189101945817e-06, + "loss": 2.8135, + "step": 58152 + }, + { + "epoch": 2.85, + "grad_norm": 0.782008171081543, + "learning_rate": 3.69650904966875e-06, + "loss": 3.1768, + "step": 58153 + }, + { + "epoch": 2.85, + "grad_norm": 0.7807068824768066, + "learning_rate": 3.6940999695493247e-06, + "loss": 2.9561, + "step": 58154 + }, + { + "epoch": 2.85, + "grad_norm": 0.8265916705131531, + "learning_rate": 3.691691669842567e-06, + "loss": 2.907, + "step": 58155 + }, + { + "epoch": 2.85, + "grad_norm": 0.7538347244262695, + "learning_rate": 3.689284150554905e-06, + "loss": 2.9961, + "step": 58156 + }, + { + "epoch": 2.85, + "grad_norm": 0.7269682288169861, + "learning_rate": 3.6868774116926346e-06, + "loss": 2.894, + "step": 58157 + }, + { + "epoch": 2.85, + "grad_norm": 0.7205626964569092, + "learning_rate": 3.6844714532620836e-06, + "loss": 2.9546, + "step": 58158 + }, + { + "epoch": 2.85, + "grad_norm": 0.7215340733528137, + "learning_rate": 3.6820662752696127e-06, + "loss": 2.8961, + "step": 58159 + }, + { + "epoch": 2.85, + "grad_norm": 0.7290651202201843, + "learning_rate": 3.6796618777215516e-06, + "loss": 2.9625, + "step": 58160 + }, + { + "epoch": 2.85, + "grad_norm": 0.7407297492027283, + "learning_rate": 3.6772582606242605e-06, + "loss": 2.9215, + "step": 58161 + }, + { + "epoch": 2.85, + "grad_norm": 0.8198636770248413, + "learning_rate": 3.6748554239840023e-06, + "loss": 2.954, + "step": 58162 + }, + { + "epoch": 2.85, + "grad_norm": 0.7506884932518005, + "learning_rate": 3.6724533678071376e-06, + "loss": 2.9087, + "step": 58163 + }, + { + "epoch": 2.85, + "grad_norm": 0.7431238889694214, + "learning_rate": 3.670052092100062e-06, + "loss": 2.8626, + "step": 58164 + }, + { + "epoch": 2.85, + "grad_norm": 0.7417303323745728, + "learning_rate": 3.6676515968690035e-06, + "loss": 3.2512, + "step": 58165 + }, + { + "epoch": 2.85, + "grad_norm": 0.7299365997314453, + "learning_rate": 3.6652518821203236e-06, + "loss": 2.8178, + "step": 58166 + }, + { + "epoch": 2.85, + "grad_norm": 0.7602360248565674, + "learning_rate": 3.6628529478603176e-06, + "loss": 2.8408, + "step": 58167 + }, + { + "epoch": 2.85, + "grad_norm": 0.7500886917114258, + "learning_rate": 3.6604547940953468e-06, + "loss": 2.9354, + "step": 58168 + }, + { + "epoch": 2.85, + "grad_norm": 0.740761399269104, + "learning_rate": 3.6580574208317058e-06, + "loss": 3.0434, + "step": 58169 + }, + { + "epoch": 2.85, + "grad_norm": 0.7828353643417358, + "learning_rate": 3.6556608280757237e-06, + "loss": 2.9932, + "step": 58170 + }, + { + "epoch": 2.85, + "grad_norm": 0.8029820322990417, + "learning_rate": 3.6532650158336952e-06, + "loss": 2.9728, + "step": 58171 + }, + { + "epoch": 2.85, + "grad_norm": 0.8066024780273438, + "learning_rate": 3.650869984111915e-06, + "loss": 2.7727, + "step": 58172 + }, + { + "epoch": 2.85, + "grad_norm": 0.7479804158210754, + "learning_rate": 3.6484757329167115e-06, + "loss": 2.7737, + "step": 58173 + }, + { + "epoch": 2.85, + "grad_norm": 0.7437139749526978, + "learning_rate": 3.6460822622544126e-06, + "loss": 2.8592, + "step": 58174 + }, + { + "epoch": 2.85, + "grad_norm": 0.7401143312454224, + "learning_rate": 3.6436895721312476e-06, + "loss": 2.7352, + "step": 58175 + }, + { + "epoch": 2.85, + "grad_norm": 0.7213879823684692, + "learning_rate": 3.6412976625536105e-06, + "loss": 3.0583, + "step": 58176 + }, + { + "epoch": 2.85, + "grad_norm": 0.7483210563659668, + "learning_rate": 3.6389065335277632e-06, + "loss": 2.8794, + "step": 58177 + }, + { + "epoch": 2.85, + "grad_norm": 0.7725558280944824, + "learning_rate": 3.636516185059968e-06, + "loss": 2.8978, + "step": 58178 + }, + { + "epoch": 2.85, + "grad_norm": 0.7521719336509705, + "learning_rate": 3.6341266171566185e-06, + "loss": 2.8083, + "step": 58179 + }, + { + "epoch": 2.85, + "grad_norm": 0.7107489705085754, + "learning_rate": 3.6317378298238774e-06, + "loss": 2.9908, + "step": 58180 + }, + { + "epoch": 2.85, + "grad_norm": 0.7555409669876099, + "learning_rate": 3.6293498230681397e-06, + "loss": 2.6925, + "step": 58181 + }, + { + "epoch": 2.85, + "grad_norm": 0.7039787769317627, + "learning_rate": 3.6269625968955997e-06, + "loss": 2.9181, + "step": 58182 + }, + { + "epoch": 2.85, + "grad_norm": 0.7815123200416565, + "learning_rate": 3.6245761513126525e-06, + "loss": 2.9034, + "step": 58183 + }, + { + "epoch": 2.85, + "grad_norm": 0.8096544146537781, + "learning_rate": 3.622190486325527e-06, + "loss": 2.9037, + "step": 58184 + }, + { + "epoch": 2.85, + "grad_norm": 0.7660767436027527, + "learning_rate": 3.6198056019404843e-06, + "loss": 2.7394, + "step": 58185 + }, + { + "epoch": 2.85, + "grad_norm": 0.7837504744529724, + "learning_rate": 3.617421498163886e-06, + "loss": 2.9246, + "step": 58186 + }, + { + "epoch": 2.85, + "grad_norm": 0.7592591643333435, + "learning_rate": 3.6150381750019276e-06, + "loss": 2.9072, + "step": 58187 + }, + { + "epoch": 2.85, + "grad_norm": 0.7563009858131409, + "learning_rate": 3.6126556324609034e-06, + "loss": 2.9115, + "step": 58188 + }, + { + "epoch": 2.85, + "grad_norm": 0.7264691591262817, + "learning_rate": 3.6102738705471425e-06, + "loss": 2.9466, + "step": 58189 + }, + { + "epoch": 2.85, + "grad_norm": 0.6998595595359802, + "learning_rate": 3.6078928892668392e-06, + "loss": 3.0228, + "step": 58190 + }, + { + "epoch": 2.85, + "grad_norm": 0.7344692945480347, + "learning_rate": 3.6055126886263554e-06, + "loss": 2.8275, + "step": 58191 + }, + { + "epoch": 2.85, + "grad_norm": 0.7686775326728821, + "learning_rate": 3.6031332686318525e-06, + "loss": 2.7886, + "step": 58192 + }, + { + "epoch": 2.85, + "grad_norm": 0.8161377906799316, + "learning_rate": 3.600754629289693e-06, + "loss": 2.9067, + "step": 58193 + }, + { + "epoch": 2.85, + "grad_norm": 0.7631113529205322, + "learning_rate": 3.598376770606104e-06, + "loss": 2.8216, + "step": 58194 + }, + { + "epoch": 2.85, + "grad_norm": 0.7431334853172302, + "learning_rate": 3.5959996925873145e-06, + "loss": 2.821, + "step": 58195 + }, + { + "epoch": 2.85, + "grad_norm": 0.7493770718574524, + "learning_rate": 3.593623395239653e-06, + "loss": 2.7067, + "step": 58196 + }, + { + "epoch": 2.85, + "grad_norm": 0.7191861271858215, + "learning_rate": 3.5912478785693144e-06, + "loss": 2.8691, + "step": 58197 + }, + { + "epoch": 2.85, + "grad_norm": 0.8084812760353088, + "learning_rate": 3.5888731425825603e-06, + "loss": 3.0337, + "step": 58198 + }, + { + "epoch": 2.85, + "grad_norm": 0.7551262974739075, + "learning_rate": 3.5864991872857186e-06, + "loss": 2.9638, + "step": 58199 + }, + { + "epoch": 2.85, + "grad_norm": 0.7352034449577332, + "learning_rate": 3.5841260126849515e-06, + "loss": 2.9625, + "step": 58200 + }, + { + "epoch": 2.85, + "grad_norm": 0.7375224232673645, + "learning_rate": 3.581753618786587e-06, + "loss": 2.8832, + "step": 58201 + }, + { + "epoch": 2.85, + "grad_norm": 0.7833859920501709, + "learning_rate": 3.5793820055968535e-06, + "loss": 2.7742, + "step": 58202 + }, + { + "epoch": 2.85, + "grad_norm": 0.7902634143829346, + "learning_rate": 3.577011173121913e-06, + "loss": 3.0064, + "step": 58203 + }, + { + "epoch": 2.85, + "grad_norm": 0.7929283380508423, + "learning_rate": 3.574641121368127e-06, + "loss": 3.0257, + "step": 58204 + }, + { + "epoch": 2.85, + "grad_norm": 0.7383139729499817, + "learning_rate": 3.5722718503416904e-06, + "loss": 2.9243, + "step": 58205 + }, + { + "epoch": 2.85, + "grad_norm": 0.7614343762397766, + "learning_rate": 3.5699033600488313e-06, + "loss": 3.0933, + "step": 58206 + }, + { + "epoch": 2.85, + "grad_norm": 0.7375363707542419, + "learning_rate": 3.5675356504957786e-06, + "loss": 2.8334, + "step": 58207 + }, + { + "epoch": 2.85, + "grad_norm": 0.7111465930938721, + "learning_rate": 3.5651687216887938e-06, + "loss": 2.6562, + "step": 58208 + }, + { + "epoch": 2.85, + "grad_norm": 0.7754095196723938, + "learning_rate": 3.5628025736341383e-06, + "loss": 2.7742, + "step": 58209 + }, + { + "epoch": 2.85, + "grad_norm": 0.7329739332199097, + "learning_rate": 3.5604372063379737e-06, + "loss": 2.8782, + "step": 58210 + }, + { + "epoch": 2.85, + "grad_norm": 0.7588919401168823, + "learning_rate": 3.5580726198065958e-06, + "loss": 2.7934, + "step": 58211 + }, + { + "epoch": 2.85, + "grad_norm": 0.7436473369598389, + "learning_rate": 3.5557088140461655e-06, + "loss": 2.8764, + "step": 58212 + }, + { + "epoch": 2.85, + "grad_norm": 0.7899497151374817, + "learning_rate": 3.553345789062978e-06, + "loss": 2.7362, + "step": 58213 + }, + { + "epoch": 2.85, + "grad_norm": 0.7157718539237976, + "learning_rate": 3.550983544863195e-06, + "loss": 3.0378, + "step": 58214 + }, + { + "epoch": 2.85, + "grad_norm": 0.7522872686386108, + "learning_rate": 3.548622081453112e-06, + "loss": 2.7286, + "step": 58215 + }, + { + "epoch": 2.85, + "grad_norm": 0.7334341406822205, + "learning_rate": 3.5462613988388898e-06, + "loss": 2.8558, + "step": 58216 + }, + { + "epoch": 2.85, + "grad_norm": 0.7479165196418762, + "learning_rate": 3.543901497026724e-06, + "loss": 2.782, + "step": 58217 + }, + { + "epoch": 2.85, + "grad_norm": 0.744875967502594, + "learning_rate": 3.54154237602291e-06, + "loss": 2.9049, + "step": 58218 + }, + { + "epoch": 2.85, + "grad_norm": 0.7260187864303589, + "learning_rate": 3.539184035833609e-06, + "loss": 2.8262, + "step": 58219 + }, + { + "epoch": 2.85, + "grad_norm": 0.7581130266189575, + "learning_rate": 3.5368264764650157e-06, + "loss": 2.9337, + "step": 58220 + }, + { + "epoch": 2.85, + "grad_norm": 0.745539128780365, + "learning_rate": 3.534469697923392e-06, + "loss": 3.1823, + "step": 58221 + }, + { + "epoch": 2.85, + "grad_norm": 0.7698248028755188, + "learning_rate": 3.5321137002149e-06, + "loss": 2.9272, + "step": 58222 + }, + { + "epoch": 2.85, + "grad_norm": 0.7561793923377991, + "learning_rate": 3.5297584833458015e-06, + "loss": 2.6776, + "step": 58223 + }, + { + "epoch": 2.85, + "grad_norm": 0.7512466311454773, + "learning_rate": 3.5274040473222578e-06, + "loss": 2.94, + "step": 58224 + }, + { + "epoch": 2.85, + "grad_norm": 0.7217196226119995, + "learning_rate": 3.5250503921504305e-06, + "loss": 2.9191, + "step": 58225 + }, + { + "epoch": 2.85, + "grad_norm": 0.7432135343551636, + "learning_rate": 3.5226975178366144e-06, + "loss": 3.0536, + "step": 58226 + }, + { + "epoch": 2.85, + "grad_norm": 0.7715436220169067, + "learning_rate": 3.5203454243869056e-06, + "loss": 2.866, + "step": 58227 + }, + { + "epoch": 2.85, + "grad_norm": 0.765325129032135, + "learning_rate": 3.517994111807565e-06, + "loss": 2.8093, + "step": 58228 + }, + { + "epoch": 2.85, + "grad_norm": 0.7218096256256104, + "learning_rate": 3.5156435801047877e-06, + "loss": 2.7881, + "step": 58229 + }, + { + "epoch": 2.85, + "grad_norm": 0.7781936526298523, + "learning_rate": 3.5132938292847355e-06, + "loss": 2.9954, + "step": 58230 + }, + { + "epoch": 2.85, + "grad_norm": 0.759574294090271, + "learning_rate": 3.510944859353604e-06, + "loss": 2.7673, + "step": 58231 + }, + { + "epoch": 2.85, + "grad_norm": 0.723047137260437, + "learning_rate": 3.508596670317587e-06, + "loss": 2.9904, + "step": 58232 + }, + { + "epoch": 2.85, + "grad_norm": 0.7782467007637024, + "learning_rate": 3.506249262182881e-06, + "loss": 2.993, + "step": 58233 + }, + { + "epoch": 2.85, + "grad_norm": 0.7288681864738464, + "learning_rate": 3.5039026349556133e-06, + "loss": 2.8057, + "step": 58234 + }, + { + "epoch": 2.85, + "grad_norm": 0.7624087333679199, + "learning_rate": 3.5015567886420125e-06, + "loss": 2.8476, + "step": 58235 + }, + { + "epoch": 2.85, + "grad_norm": 0.7774996757507324, + "learning_rate": 3.499211723248274e-06, + "loss": 2.8622, + "step": 58236 + }, + { + "epoch": 2.85, + "grad_norm": 0.7889575958251953, + "learning_rate": 3.4968674387805597e-06, + "loss": 2.9525, + "step": 58237 + }, + { + "epoch": 2.85, + "grad_norm": 0.8073905110359192, + "learning_rate": 3.4945239352449974e-06, + "loss": 2.9227, + "step": 58238 + }, + { + "epoch": 2.85, + "grad_norm": 0.781495988368988, + "learning_rate": 3.4921812126478153e-06, + "loss": 3.019, + "step": 58239 + }, + { + "epoch": 2.85, + "grad_norm": 0.7901045680046082, + "learning_rate": 3.489839270995143e-06, + "loss": 2.8305, + "step": 58240 + }, + { + "epoch": 2.85, + "grad_norm": 0.8189294934272766, + "learning_rate": 3.4874981102932074e-06, + "loss": 2.8926, + "step": 58241 + }, + { + "epoch": 2.85, + "grad_norm": 0.7915399670600891, + "learning_rate": 3.4851577305481047e-06, + "loss": 2.8993, + "step": 58242 + }, + { + "epoch": 2.85, + "grad_norm": 0.7558654546737671, + "learning_rate": 3.4828181317660296e-06, + "loss": 2.8288, + "step": 58243 + }, + { + "epoch": 2.85, + "grad_norm": 0.7701000571250916, + "learning_rate": 3.480479313953144e-06, + "loss": 2.6183, + "step": 58244 + }, + { + "epoch": 2.85, + "grad_norm": 0.7547457814216614, + "learning_rate": 3.4781412771156424e-06, + "loss": 2.9023, + "step": 58245 + }, + { + "epoch": 2.85, + "grad_norm": 0.7245228290557861, + "learning_rate": 3.4758040212596206e-06, + "loss": 2.8595, + "step": 58246 + }, + { + "epoch": 2.85, + "grad_norm": 0.7754863500595093, + "learning_rate": 3.473467546391273e-06, + "loss": 2.9257, + "step": 58247 + }, + { + "epoch": 2.85, + "grad_norm": 0.7499856948852539, + "learning_rate": 3.471131852516762e-06, + "loss": 2.9872, + "step": 58248 + }, + { + "epoch": 2.85, + "grad_norm": 0.7747548222541809, + "learning_rate": 3.468796939642182e-06, + "loss": 2.9552, + "step": 58249 + }, + { + "epoch": 2.85, + "grad_norm": 0.8232293128967285, + "learning_rate": 3.466462807773729e-06, + "loss": 3.0993, + "step": 58250 + }, + { + "epoch": 2.85, + "grad_norm": 0.7535151839256287, + "learning_rate": 3.464129456917564e-06, + "loss": 3.1768, + "step": 58251 + }, + { + "epoch": 2.85, + "grad_norm": 0.7690832018852234, + "learning_rate": 3.4617968870798154e-06, + "loss": 2.5916, + "step": 58252 + }, + { + "epoch": 2.85, + "grad_norm": 0.7519869208335876, + "learning_rate": 3.459465098266645e-06, + "loss": 2.8841, + "step": 58253 + }, + { + "epoch": 2.85, + "grad_norm": 0.7166727185249329, + "learning_rate": 3.457134090484115e-06, + "loss": 2.9734, + "step": 58254 + }, + { + "epoch": 2.85, + "grad_norm": 0.742997944355011, + "learning_rate": 3.4548038637384536e-06, + "loss": 2.9862, + "step": 58255 + }, + { + "epoch": 2.85, + "grad_norm": 0.7113354802131653, + "learning_rate": 3.452474418035789e-06, + "loss": 2.8711, + "step": 58256 + }, + { + "epoch": 2.86, + "grad_norm": 0.785720944404602, + "learning_rate": 3.450145753382183e-06, + "loss": 2.9458, + "step": 58257 + }, + { + "epoch": 2.86, + "grad_norm": 0.7967292666435242, + "learning_rate": 3.4478178697838644e-06, + "loss": 2.5159, + "step": 58258 + }, + { + "epoch": 2.86, + "grad_norm": 0.7993360757827759, + "learning_rate": 3.445490767246928e-06, + "loss": 2.8102, + "step": 58259 + }, + { + "epoch": 2.86, + "grad_norm": 0.7960979342460632, + "learning_rate": 3.4431644457774356e-06, + "loss": 3.063, + "step": 58260 + }, + { + "epoch": 2.86, + "grad_norm": 0.7577957510948181, + "learning_rate": 3.4408389053816486e-06, + "loss": 2.733, + "step": 58261 + }, + { + "epoch": 2.86, + "grad_norm": 0.8428319692611694, + "learning_rate": 3.438514146065563e-06, + "loss": 2.8959, + "step": 58262 + }, + { + "epoch": 2.86, + "grad_norm": 0.740744411945343, + "learning_rate": 3.4361901678353734e-06, + "loss": 3.0609, + "step": 58263 + }, + { + "epoch": 2.86, + "grad_norm": 0.7100527882575989, + "learning_rate": 3.4338669706971743e-06, + "loss": 2.7642, + "step": 58264 + }, + { + "epoch": 2.86, + "grad_norm": 0.8007273077964783, + "learning_rate": 3.4315445546570953e-06, + "loss": 2.9573, + "step": 58265 + }, + { + "epoch": 2.86, + "grad_norm": 0.7537129521369934, + "learning_rate": 3.429222919721297e-06, + "loss": 2.8178, + "step": 58266 + }, + { + "epoch": 2.86, + "grad_norm": 0.8160079717636108, + "learning_rate": 3.4269020658958093e-06, + "loss": 2.7717, + "step": 58267 + }, + { + "epoch": 2.86, + "grad_norm": 0.7378734350204468, + "learning_rate": 3.4245819931867924e-06, + "loss": 2.7693, + "step": 58268 + }, + { + "epoch": 2.86, + "grad_norm": 0.7598668932914734, + "learning_rate": 3.422262701600342e-06, + "loss": 3.0711, + "step": 58269 + }, + { + "epoch": 2.86, + "grad_norm": 0.7291812300682068, + "learning_rate": 3.4199441911425874e-06, + "loss": 2.9735, + "step": 58270 + }, + { + "epoch": 2.86, + "grad_norm": 0.8131197094917297, + "learning_rate": 3.4176264618196224e-06, + "loss": 2.922, + "step": 58271 + }, + { + "epoch": 2.86, + "grad_norm": 0.7432801127433777, + "learning_rate": 3.415309513637543e-06, + "loss": 2.7121, + "step": 58272 + }, + { + "epoch": 2.86, + "grad_norm": 0.7480484843254089, + "learning_rate": 3.412993346602444e-06, + "loss": 3.0394, + "step": 58273 + }, + { + "epoch": 2.86, + "grad_norm": 0.7310391664505005, + "learning_rate": 3.410677960720487e-06, + "loss": 3.0235, + "step": 58274 + }, + { + "epoch": 2.86, + "grad_norm": 0.7628144025802612, + "learning_rate": 3.4083633559976676e-06, + "loss": 2.85, + "step": 58275 + }, + { + "epoch": 2.86, + "grad_norm": 0.7469602227210999, + "learning_rate": 3.4060495324401803e-06, + "loss": 2.8982, + "step": 58276 + }, + { + "epoch": 2.86, + "grad_norm": 0.8014663457870483, + "learning_rate": 3.4037364900540544e-06, + "loss": 2.9782, + "step": 58277 + }, + { + "epoch": 2.86, + "grad_norm": 0.7652974724769592, + "learning_rate": 3.4014242288454176e-06, + "loss": 2.901, + "step": 58278 + }, + { + "epoch": 2.86, + "grad_norm": 0.7459307909011841, + "learning_rate": 3.3991127488203318e-06, + "loss": 2.9058, + "step": 58279 + }, + { + "epoch": 2.86, + "grad_norm": 0.8366508483886719, + "learning_rate": 3.3968020499849256e-06, + "loss": 2.9857, + "step": 58280 + }, + { + "epoch": 2.86, + "grad_norm": 0.7131357192993164, + "learning_rate": 3.3944921323452613e-06, + "loss": 2.8848, + "step": 58281 + }, + { + "epoch": 2.86, + "grad_norm": 0.7664152383804321, + "learning_rate": 3.3921829959074e-06, + "loss": 2.9115, + "step": 58282 + }, + { + "epoch": 2.86, + "grad_norm": 0.7806337475776672, + "learning_rate": 3.3898746406774703e-06, + "loss": 2.8592, + "step": 58283 + }, + { + "epoch": 2.86, + "grad_norm": 0.7475855350494385, + "learning_rate": 3.3875670666615007e-06, + "loss": 2.9584, + "step": 58284 + }, + { + "epoch": 2.86, + "grad_norm": 0.754424512386322, + "learning_rate": 3.3852602738655866e-06, + "loss": 2.8993, + "step": 58285 + }, + { + "epoch": 2.86, + "grad_norm": 0.7555055022239685, + "learning_rate": 3.3829542622958563e-06, + "loss": 3.0222, + "step": 58286 + }, + { + "epoch": 2.86, + "grad_norm": 0.7311819791793823, + "learning_rate": 3.3806490319583047e-06, + "loss": 2.8419, + "step": 58287 + }, + { + "epoch": 2.86, + "grad_norm": 0.7578787803649902, + "learning_rate": 3.3783445828590605e-06, + "loss": 2.871, + "step": 58288 + }, + { + "epoch": 2.86, + "grad_norm": 0.9047726392745972, + "learning_rate": 3.3760409150041524e-06, + "loss": 2.8261, + "step": 58289 + }, + { + "epoch": 2.86, + "grad_norm": 0.7717176079750061, + "learning_rate": 3.373738028399675e-06, + "loss": 2.8182, + "step": 58290 + }, + { + "epoch": 2.86, + "grad_norm": 0.7483989000320435, + "learning_rate": 3.3714359230516906e-06, + "loss": 2.8682, + "step": 58291 + }, + { + "epoch": 2.86, + "grad_norm": 0.8349034786224365, + "learning_rate": 3.369134598966228e-06, + "loss": 2.9128, + "step": 58292 + }, + { + "epoch": 2.86, + "grad_norm": 0.7524193525314331, + "learning_rate": 3.3668340561494145e-06, + "loss": 2.8352, + "step": 58293 + }, + { + "epoch": 2.86, + "grad_norm": 0.7450447082519531, + "learning_rate": 3.364534294607246e-06, + "loss": 3.0256, + "step": 58294 + }, + { + "epoch": 2.86, + "grad_norm": 0.7853724360466003, + "learning_rate": 3.362235314345818e-06, + "loss": 2.8471, + "step": 58295 + }, + { + "epoch": 2.86, + "grad_norm": 0.8045984506607056, + "learning_rate": 3.3599371153711917e-06, + "loss": 2.9097, + "step": 58296 + }, + { + "epoch": 2.86, + "grad_norm": 0.7121866941452026, + "learning_rate": 3.3576396976893626e-06, + "loss": 2.7478, + "step": 58297 + }, + { + "epoch": 2.86, + "grad_norm": 0.7629473805427551, + "learning_rate": 3.355343061306459e-06, + "loss": 2.8422, + "step": 58298 + }, + { + "epoch": 2.86, + "grad_norm": 0.7671418786048889, + "learning_rate": 3.353047206228443e-06, + "loss": 3.1245, + "step": 58299 + }, + { + "epoch": 2.86, + "grad_norm": 0.7228366136550903, + "learning_rate": 3.350752132461443e-06, + "loss": 3.0361, + "step": 58300 + }, + { + "epoch": 2.86, + "grad_norm": 0.7080792188644409, + "learning_rate": 3.3484578400114537e-06, + "loss": 3.03, + "step": 58301 + }, + { + "epoch": 2.86, + "grad_norm": 0.7466286420822144, + "learning_rate": 3.3461643288845375e-06, + "loss": 2.765, + "step": 58302 + }, + { + "epoch": 2.86, + "grad_norm": 0.7471815943717957, + "learning_rate": 3.343871599086756e-06, + "loss": 2.9213, + "step": 58303 + }, + { + "epoch": 2.86, + "grad_norm": 0.7371976375579834, + "learning_rate": 3.3415796506241045e-06, + "loss": 2.9685, + "step": 58304 + }, + { + "epoch": 2.86, + "grad_norm": 0.6997452974319458, + "learning_rate": 3.3392884835026444e-06, + "loss": 2.9548, + "step": 58305 + }, + { + "epoch": 2.86, + "grad_norm": 0.801490068435669, + "learning_rate": 3.336998097728405e-06, + "loss": 3.1318, + "step": 58306 + }, + { + "epoch": 2.86, + "grad_norm": 0.7446818947792053, + "learning_rate": 3.3347084933074143e-06, + "loss": 2.9129, + "step": 58307 + }, + { + "epoch": 2.86, + "grad_norm": 0.7398616075515747, + "learning_rate": 3.3324196702457006e-06, + "loss": 2.9168, + "step": 58308 + }, + { + "epoch": 2.86, + "grad_norm": 0.7449392676353455, + "learning_rate": 3.3301316285493263e-06, + "loss": 3.0701, + "step": 58309 + }, + { + "epoch": 2.86, + "grad_norm": 0.7428542375564575, + "learning_rate": 3.327844368224253e-06, + "loss": 2.6963, + "step": 58310 + }, + { + "epoch": 2.86, + "grad_norm": 0.7714420557022095, + "learning_rate": 3.3255578892765753e-06, + "loss": 2.8949, + "step": 58311 + }, + { + "epoch": 2.86, + "grad_norm": 0.7456600069999695, + "learning_rate": 3.323272191712256e-06, + "loss": 2.841, + "step": 58312 + }, + { + "epoch": 2.86, + "grad_norm": 0.7867556810379028, + "learning_rate": 3.3209872755373566e-06, + "loss": 2.8416, + "step": 58313 + }, + { + "epoch": 2.86, + "grad_norm": 0.7435100674629211, + "learning_rate": 3.318703140757872e-06, + "loss": 2.958, + "step": 58314 + }, + { + "epoch": 2.86, + "grad_norm": 0.752841591835022, + "learning_rate": 3.3164197873798316e-06, + "loss": 3.1411, + "step": 58315 + }, + { + "epoch": 2.86, + "grad_norm": 0.7200860381126404, + "learning_rate": 3.3141372154092293e-06, + "loss": 2.8908, + "step": 58316 + }, + { + "epoch": 2.86, + "grad_norm": 0.6982949376106262, + "learning_rate": 3.311855424852128e-06, + "loss": 2.5464, + "step": 58317 + }, + { + "epoch": 2.86, + "grad_norm": 0.7470500469207764, + "learning_rate": 3.309574415714489e-06, + "loss": 2.7845, + "step": 58318 + }, + { + "epoch": 2.86, + "grad_norm": 0.7575530409812927, + "learning_rate": 3.3072941880023075e-06, + "loss": 2.935, + "step": 58319 + }, + { + "epoch": 2.86, + "grad_norm": 0.7366733551025391, + "learning_rate": 3.305014741721612e-06, + "loss": 3.0839, + "step": 58320 + }, + { + "epoch": 2.86, + "grad_norm": 0.7596519589424133, + "learning_rate": 3.3027360768784318e-06, + "loss": 2.7866, + "step": 58321 + }, + { + "epoch": 2.86, + "grad_norm": 0.7350767254829407, + "learning_rate": 3.3004581934787276e-06, + "loss": 2.8478, + "step": 58322 + }, + { + "epoch": 2.86, + "grad_norm": 0.730754554271698, + "learning_rate": 3.298181091528529e-06, + "loss": 2.8066, + "step": 58323 + }, + { + "epoch": 2.86, + "grad_norm": 0.7694946527481079, + "learning_rate": 3.295904771033797e-06, + "loss": 2.6976, + "step": 58324 + }, + { + "epoch": 2.86, + "grad_norm": 0.7357961535453796, + "learning_rate": 3.293629232000561e-06, + "loss": 2.8541, + "step": 58325 + }, + { + "epoch": 2.86, + "grad_norm": 0.7392043471336365, + "learning_rate": 3.2913544744348152e-06, + "loss": 2.7927, + "step": 58326 + }, + { + "epoch": 2.86, + "grad_norm": 0.7261336445808411, + "learning_rate": 3.289080498342522e-06, + "loss": 2.9183, + "step": 58327 + }, + { + "epoch": 2.86, + "grad_norm": 0.7417194843292236, + "learning_rate": 3.2868073037297104e-06, + "loss": 2.9671, + "step": 58328 + }, + { + "epoch": 2.86, + "grad_norm": 0.7913023829460144, + "learning_rate": 3.2845348906023084e-06, + "loss": 3.0921, + "step": 58329 + }, + { + "epoch": 2.86, + "grad_norm": 0.7364912033081055, + "learning_rate": 3.2822632589663776e-06, + "loss": 2.9238, + "step": 58330 + }, + { + "epoch": 2.86, + "grad_norm": 0.7688376903533936, + "learning_rate": 3.279992408827847e-06, + "loss": 2.9304, + "step": 58331 + }, + { + "epoch": 2.86, + "grad_norm": 0.7457802295684814, + "learning_rate": 3.277722340192712e-06, + "loss": 3.1047, + "step": 58332 + }, + { + "epoch": 2.86, + "grad_norm": 0.74704509973526, + "learning_rate": 3.2754530530669343e-06, + "loss": 2.9558, + "step": 58333 + }, + { + "epoch": 2.86, + "grad_norm": 0.7239862680435181, + "learning_rate": 3.2731845474565086e-06, + "loss": 2.8178, + "step": 58334 + }, + { + "epoch": 2.86, + "grad_norm": 0.7056124806404114, + "learning_rate": 3.270916823367431e-06, + "loss": 3.0095, + "step": 58335 + }, + { + "epoch": 2.86, + "grad_norm": 0.7435981035232544, + "learning_rate": 3.2686498808056296e-06, + "loss": 2.7761, + "step": 58336 + }, + { + "epoch": 2.86, + "grad_norm": 0.7090030908584595, + "learning_rate": 3.266383719777099e-06, + "loss": 2.9752, + "step": 58337 + }, + { + "epoch": 2.86, + "grad_norm": 0.8070139288902283, + "learning_rate": 3.2641183402878356e-06, + "loss": 2.9494, + "step": 58338 + }, + { + "epoch": 2.86, + "grad_norm": 0.7761786580085754, + "learning_rate": 3.2618537423437672e-06, + "loss": 2.7368, + "step": 58339 + }, + { + "epoch": 2.86, + "grad_norm": 0.7421362400054932, + "learning_rate": 3.259589925950856e-06, + "loss": 2.8487, + "step": 58340 + }, + { + "epoch": 2.86, + "grad_norm": 0.8992583155632019, + "learning_rate": 3.257326891115064e-06, + "loss": 2.9035, + "step": 58341 + }, + { + "epoch": 2.86, + "grad_norm": 0.7621567249298096, + "learning_rate": 3.255064637842386e-06, + "loss": 2.8351, + "step": 58342 + }, + { + "epoch": 2.86, + "grad_norm": 0.7418533563613892, + "learning_rate": 3.252803166138751e-06, + "loss": 3.0166, + "step": 58343 + }, + { + "epoch": 2.86, + "grad_norm": 0.7442709803581238, + "learning_rate": 3.2505424760101207e-06, + "loss": 2.6756, + "step": 58344 + }, + { + "epoch": 2.86, + "grad_norm": 0.7631102800369263, + "learning_rate": 3.248282567462457e-06, + "loss": 3.0295, + "step": 58345 + }, + { + "epoch": 2.86, + "grad_norm": 0.8174268007278442, + "learning_rate": 3.246023440501688e-06, + "loss": 2.9048, + "step": 58346 + }, + { + "epoch": 2.86, + "grad_norm": 0.736268162727356, + "learning_rate": 3.243765095133777e-06, + "loss": 2.951, + "step": 58347 + }, + { + "epoch": 2.86, + "grad_norm": 0.7341980934143066, + "learning_rate": 3.2415075313647177e-06, + "loss": 2.8567, + "step": 58348 + }, + { + "epoch": 2.86, + "grad_norm": 0.7812278866767883, + "learning_rate": 3.239250749200373e-06, + "loss": 2.882, + "step": 58349 + }, + { + "epoch": 2.86, + "grad_norm": 0.7396385073661804, + "learning_rate": 3.2369947486467707e-06, + "loss": 3.0547, + "step": 58350 + }, + { + "epoch": 2.86, + "grad_norm": 0.7556043267250061, + "learning_rate": 3.2347395297097732e-06, + "loss": 2.7976, + "step": 58351 + }, + { + "epoch": 2.86, + "grad_norm": 0.7392325401306152, + "learning_rate": 3.2324850923953426e-06, + "loss": 2.9214, + "step": 58352 + }, + { + "epoch": 2.86, + "grad_norm": 0.7273942232131958, + "learning_rate": 3.230231436709474e-06, + "loss": 2.8526, + "step": 58353 + }, + { + "epoch": 2.86, + "grad_norm": 0.7350383996963501, + "learning_rate": 3.2279785626580622e-06, + "loss": 3.0958, + "step": 58354 + }, + { + "epoch": 2.86, + "grad_norm": 0.7365357875823975, + "learning_rate": 3.2257264702470363e-06, + "loss": 2.8979, + "step": 58355 + }, + { + "epoch": 2.86, + "grad_norm": 0.742205023765564, + "learning_rate": 3.223475159482325e-06, + "loss": 2.8678, + "step": 58356 + }, + { + "epoch": 2.86, + "grad_norm": 0.7722266912460327, + "learning_rate": 3.2212246303698565e-06, + "loss": 2.7893, + "step": 58357 + }, + { + "epoch": 2.86, + "grad_norm": 0.7623163461685181, + "learning_rate": 3.2189748829155593e-06, + "loss": 2.8126, + "step": 58358 + }, + { + "epoch": 2.86, + "grad_norm": 0.794621467590332, + "learning_rate": 3.2167259171253625e-06, + "loss": 2.8671, + "step": 58359 + }, + { + "epoch": 2.86, + "grad_norm": 0.737347424030304, + "learning_rate": 3.2144777330052274e-06, + "loss": 2.8197, + "step": 58360 + }, + { + "epoch": 2.86, + "grad_norm": 0.7127636075019836, + "learning_rate": 3.2122303305610162e-06, + "loss": 2.803, + "step": 58361 + }, + { + "epoch": 2.86, + "grad_norm": 0.7416470646858215, + "learning_rate": 3.2099837097986914e-06, + "loss": 2.7396, + "step": 58362 + }, + { + "epoch": 2.86, + "grad_norm": 0.7706217765808105, + "learning_rate": 3.2077378707241142e-06, + "loss": 2.7705, + "step": 58363 + }, + { + "epoch": 2.86, + "grad_norm": 0.7568953037261963, + "learning_rate": 3.2054928133432467e-06, + "loss": 2.9229, + "step": 58364 + }, + { + "epoch": 2.86, + "grad_norm": 0.7913509607315063, + "learning_rate": 3.2032485376620175e-06, + "loss": 2.9663, + "step": 58365 + }, + { + "epoch": 2.86, + "grad_norm": 0.777898371219635, + "learning_rate": 3.2010050436862887e-06, + "loss": 2.7852, + "step": 58366 + }, + { + "epoch": 2.86, + "grad_norm": 0.7000056505203247, + "learning_rate": 3.198762331421989e-06, + "loss": 2.7677, + "step": 58367 + }, + { + "epoch": 2.86, + "grad_norm": 0.7845345139503479, + "learning_rate": 3.19652040087508e-06, + "loss": 2.7989, + "step": 58368 + }, + { + "epoch": 2.86, + "grad_norm": 0.7394914627075195, + "learning_rate": 3.194279252051357e-06, + "loss": 2.9253, + "step": 58369 + }, + { + "epoch": 2.86, + "grad_norm": 0.741482675075531, + "learning_rate": 3.1920388849568157e-06, + "loss": 3.07, + "step": 58370 + }, + { + "epoch": 2.86, + "grad_norm": 0.7681283950805664, + "learning_rate": 3.1897992995973175e-06, + "loss": 2.9483, + "step": 58371 + }, + { + "epoch": 2.86, + "grad_norm": 0.7793736457824707, + "learning_rate": 3.187560495978758e-06, + "loss": 2.5742, + "step": 58372 + }, + { + "epoch": 2.86, + "grad_norm": 0.7607495784759521, + "learning_rate": 3.1853224741070326e-06, + "loss": 3.0333, + "step": 58373 + }, + { + "epoch": 2.86, + "grad_norm": 0.7457079887390137, + "learning_rate": 3.1830852339880366e-06, + "loss": 2.9459, + "step": 58374 + }, + { + "epoch": 2.86, + "grad_norm": 0.7789652347564697, + "learning_rate": 3.180848775627698e-06, + "loss": 2.9568, + "step": 58375 + }, + { + "epoch": 2.86, + "grad_norm": 0.7540808916091919, + "learning_rate": 3.178613099031879e-06, + "loss": 2.8091, + "step": 58376 + }, + { + "epoch": 2.86, + "grad_norm": 0.7654391527175903, + "learning_rate": 3.176378204206442e-06, + "loss": 2.8447, + "step": 58377 + }, + { + "epoch": 2.86, + "grad_norm": 0.7442813515663147, + "learning_rate": 3.174144091157316e-06, + "loss": 2.8316, + "step": 58378 + }, + { + "epoch": 2.86, + "grad_norm": 0.7185924053192139, + "learning_rate": 3.1719107598903615e-06, + "loss": 2.8791, + "step": 58379 + }, + { + "epoch": 2.86, + "grad_norm": 0.7041293978691101, + "learning_rate": 3.169678210411475e-06, + "loss": 2.8301, + "step": 58380 + }, + { + "epoch": 2.86, + "grad_norm": 0.8128548860549927, + "learning_rate": 3.167446442726518e-06, + "loss": 2.9627, + "step": 58381 + }, + { + "epoch": 2.86, + "grad_norm": 0.7193200588226318, + "learning_rate": 3.165215456841419e-06, + "loss": 2.8726, + "step": 58382 + }, + { + "epoch": 2.86, + "grad_norm": 0.7626433372497559, + "learning_rate": 3.162985252761974e-06, + "loss": 2.8444, + "step": 58383 + }, + { + "epoch": 2.86, + "grad_norm": 0.7526370286941528, + "learning_rate": 3.1607558304941107e-06, + "loss": 2.6661, + "step": 58384 + }, + { + "epoch": 2.86, + "grad_norm": 0.7516125440597534, + "learning_rate": 3.1585271900436913e-06, + "loss": 2.9895, + "step": 58385 + }, + { + "epoch": 2.86, + "grad_norm": 0.7663487792015076, + "learning_rate": 3.1562993314165784e-06, + "loss": 2.832, + "step": 58386 + }, + { + "epoch": 2.86, + "grad_norm": 0.7314614653587341, + "learning_rate": 3.154072254618667e-06, + "loss": 3.0411, + "step": 58387 + }, + { + "epoch": 2.86, + "grad_norm": 0.7240493893623352, + "learning_rate": 3.151845959655752e-06, + "loss": 2.9518, + "step": 58388 + }, + { + "epoch": 2.86, + "grad_norm": 0.7454181909561157, + "learning_rate": 3.1496204465337626e-06, + "loss": 2.7987, + "step": 58389 + }, + { + "epoch": 2.86, + "grad_norm": 0.6941863894462585, + "learning_rate": 3.1473957152585936e-06, + "loss": 3.1225, + "step": 58390 + }, + { + "epoch": 2.86, + "grad_norm": 0.7873305678367615, + "learning_rate": 3.1451717658360075e-06, + "loss": 2.7987, + "step": 58391 + }, + { + "epoch": 2.86, + "grad_norm": 0.7635713219642639, + "learning_rate": 3.1429485982719326e-06, + "loss": 2.6695, + "step": 58392 + }, + { + "epoch": 2.86, + "grad_norm": 0.74675452709198, + "learning_rate": 3.1407262125721645e-06, + "loss": 2.7484, + "step": 58393 + }, + { + "epoch": 2.86, + "grad_norm": 0.7379273176193237, + "learning_rate": 3.138504608742598e-06, + "loss": 2.9954, + "step": 58394 + }, + { + "epoch": 2.86, + "grad_norm": 0.7270514965057373, + "learning_rate": 3.1362837867890957e-06, + "loss": 3.2038, + "step": 58395 + }, + { + "epoch": 2.86, + "grad_norm": 0.7507408261299133, + "learning_rate": 3.134063746717486e-06, + "loss": 2.8354, + "step": 58396 + }, + { + "epoch": 2.86, + "grad_norm": 0.7894320487976074, + "learning_rate": 3.13184448853363e-06, + "loss": 2.9427, + "step": 58397 + }, + { + "epoch": 2.86, + "grad_norm": 0.7546813488006592, + "learning_rate": 3.129626012243358e-06, + "loss": 2.8778, + "step": 58398 + }, + { + "epoch": 2.86, + "grad_norm": 0.7203192114830017, + "learning_rate": 3.127408317852498e-06, + "loss": 2.9069, + "step": 58399 + }, + { + "epoch": 2.86, + "grad_norm": 0.7543355226516724, + "learning_rate": 3.125191405366911e-06, + "loss": 2.9173, + "step": 58400 + }, + { + "epoch": 2.86, + "grad_norm": 0.7346734404563904, + "learning_rate": 3.1229752747924606e-06, + "loss": 2.8988, + "step": 58401 + }, + { + "epoch": 2.86, + "grad_norm": 0.8272573351860046, + "learning_rate": 3.1207599261349416e-06, + "loss": 2.8884, + "step": 58402 + }, + { + "epoch": 2.86, + "grad_norm": 0.7785396575927734, + "learning_rate": 3.1185453594001818e-06, + "loss": 2.7603, + "step": 58403 + }, + { + "epoch": 2.86, + "grad_norm": 0.7275567650794983, + "learning_rate": 3.1163315745940777e-06, + "loss": 2.8534, + "step": 58404 + }, + { + "epoch": 2.86, + "grad_norm": 0.7752771377563477, + "learning_rate": 3.114118571722424e-06, + "loss": 2.741, + "step": 58405 + }, + { + "epoch": 2.86, + "grad_norm": 0.7439063191413879, + "learning_rate": 3.111906350791016e-06, + "loss": 3.0018, + "step": 58406 + }, + { + "epoch": 2.86, + "grad_norm": 0.7584403157234192, + "learning_rate": 3.109694911805716e-06, + "loss": 3.0572, + "step": 58407 + }, + { + "epoch": 2.86, + "grad_norm": 0.7162322998046875, + "learning_rate": 3.1074842547723523e-06, + "loss": 2.9417, + "step": 58408 + }, + { + "epoch": 2.86, + "grad_norm": 0.7319763898849487, + "learning_rate": 3.1052743796967207e-06, + "loss": 3.1582, + "step": 58409 + }, + { + "epoch": 2.86, + "grad_norm": 0.825232744216919, + "learning_rate": 3.1030652865846828e-06, + "loss": 2.8801, + "step": 58410 + }, + { + "epoch": 2.86, + "grad_norm": 0.7098069787025452, + "learning_rate": 3.100856975442001e-06, + "loss": 2.9849, + "step": 58411 + }, + { + "epoch": 2.86, + "grad_norm": 0.754648745059967, + "learning_rate": 3.0986494462745372e-06, + "loss": 3.0793, + "step": 58412 + }, + { + "epoch": 2.86, + "grad_norm": 0.7526900768280029, + "learning_rate": 3.0964426990880865e-06, + "loss": 2.9493, + "step": 58413 + }, + { + "epoch": 2.86, + "grad_norm": 0.7582096457481384, + "learning_rate": 3.0942367338884775e-06, + "loss": 2.8612, + "step": 58414 + }, + { + "epoch": 2.86, + "grad_norm": 0.7545999884605408, + "learning_rate": 3.0920315506815063e-06, + "loss": 2.8995, + "step": 58415 + }, + { + "epoch": 2.86, + "grad_norm": 0.7414794564247131, + "learning_rate": 3.0898271494729677e-06, + "loss": 2.8369, + "step": 58416 + }, + { + "epoch": 2.86, + "grad_norm": 0.8262287378311157, + "learning_rate": 3.08762353026869e-06, + "loss": 2.9537, + "step": 58417 + }, + { + "epoch": 2.86, + "grad_norm": 0.7526206374168396, + "learning_rate": 3.0854206930744696e-06, + "loss": 2.8687, + "step": 58418 + }, + { + "epoch": 2.86, + "grad_norm": 0.7402799725532532, + "learning_rate": 3.0832186378961343e-06, + "loss": 2.8745, + "step": 58419 + }, + { + "epoch": 2.86, + "grad_norm": 0.784649133682251, + "learning_rate": 3.0810173647394463e-06, + "loss": 2.6984, + "step": 58420 + }, + { + "epoch": 2.86, + "grad_norm": 0.7722074389457703, + "learning_rate": 3.0788168736102014e-06, + "loss": 2.8043, + "step": 58421 + }, + { + "epoch": 2.86, + "grad_norm": 0.8019753694534302, + "learning_rate": 3.0766171645141946e-06, + "loss": 2.8064, + "step": 58422 + }, + { + "epoch": 2.86, + "grad_norm": 0.7800315618515015, + "learning_rate": 3.0744182374572548e-06, + "loss": 2.8841, + "step": 58423 + }, + { + "epoch": 2.86, + "grad_norm": 0.7406907081604004, + "learning_rate": 3.0722200924451434e-06, + "loss": 2.9577, + "step": 58424 + }, + { + "epoch": 2.86, + "grad_norm": 0.7482556104660034, + "learning_rate": 3.0700227294836897e-06, + "loss": 2.8487, + "step": 58425 + }, + { + "epoch": 2.86, + "grad_norm": 0.7338652610778809, + "learning_rate": 3.0678261485785892e-06, + "loss": 3.0761, + "step": 58426 + }, + { + "epoch": 2.86, + "grad_norm": 0.7629140019416809, + "learning_rate": 3.0656303497357373e-06, + "loss": 2.8937, + "step": 58427 + }, + { + "epoch": 2.86, + "grad_norm": 0.7479938864707947, + "learning_rate": 3.0634353329608954e-06, + "loss": 2.6075, + "step": 58428 + }, + { + "epoch": 2.86, + "grad_norm": 0.793538510799408, + "learning_rate": 3.061241098259759e-06, + "loss": 3.0803, + "step": 58429 + }, + { + "epoch": 2.86, + "grad_norm": 0.7822150588035583, + "learning_rate": 3.059047645638191e-06, + "loss": 2.6345, + "step": 58430 + }, + { + "epoch": 2.86, + "grad_norm": 0.7450809478759766, + "learning_rate": 3.056854975101952e-06, + "loss": 2.9367, + "step": 58431 + }, + { + "epoch": 2.86, + "grad_norm": 0.7621826529502869, + "learning_rate": 3.0546630866568057e-06, + "loss": 3.0372, + "step": 58432 + }, + { + "epoch": 2.86, + "grad_norm": 0.7780464887619019, + "learning_rate": 3.052471980308513e-06, + "loss": 2.9461, + "step": 58433 + }, + { + "epoch": 2.86, + "grad_norm": 0.7118364572525024, + "learning_rate": 3.0502816560628695e-06, + "loss": 2.8477, + "step": 58434 + }, + { + "epoch": 2.86, + "grad_norm": 0.7661958336830139, + "learning_rate": 3.0480921139256378e-06, + "loss": 2.9218, + "step": 58435 + }, + { + "epoch": 2.86, + "grad_norm": 0.7564506530761719, + "learning_rate": 3.045903353902579e-06, + "loss": 2.9434, + "step": 58436 + }, + { + "epoch": 2.86, + "grad_norm": 0.7816385626792908, + "learning_rate": 3.043715375999456e-06, + "loss": 2.8841, + "step": 58437 + }, + { + "epoch": 2.86, + "grad_norm": 0.7606683373451233, + "learning_rate": 3.0415281802220304e-06, + "loss": 2.7296, + "step": 58438 + }, + { + "epoch": 2.86, + "grad_norm": 0.7658647298812866, + "learning_rate": 3.0393417665760976e-06, + "loss": 3.0555, + "step": 58439 + }, + { + "epoch": 2.86, + "grad_norm": 0.7768723964691162, + "learning_rate": 3.0371561350673537e-06, + "loss": 2.8682, + "step": 58440 + }, + { + "epoch": 2.86, + "grad_norm": 0.7276012301445007, + "learning_rate": 3.0349712857016263e-06, + "loss": 2.7861, + "step": 58441 + }, + { + "epoch": 2.86, + "grad_norm": 0.7483054995536804, + "learning_rate": 3.0327872184846113e-06, + "loss": 2.8149, + "step": 58442 + }, + { + "epoch": 2.86, + "grad_norm": 0.7850237488746643, + "learning_rate": 3.030603933422071e-06, + "loss": 2.8655, + "step": 58443 + }, + { + "epoch": 2.86, + "grad_norm": 0.764310359954834, + "learning_rate": 3.0284214305197675e-06, + "loss": 3.0662, + "step": 58444 + }, + { + "epoch": 2.86, + "grad_norm": 0.7962012887001038, + "learning_rate": 3.0262397097834956e-06, + "loss": 2.8438, + "step": 58445 + }, + { + "epoch": 2.86, + "grad_norm": 0.789114773273468, + "learning_rate": 3.0240587712188848e-06, + "loss": 3.0808, + "step": 58446 + }, + { + "epoch": 2.86, + "grad_norm": 0.7645512819290161, + "learning_rate": 3.0218786148317965e-06, + "loss": 2.8729, + "step": 58447 + }, + { + "epoch": 2.86, + "grad_norm": 0.7275797724723816, + "learning_rate": 3.019699240627893e-06, + "loss": 3.0125, + "step": 58448 + }, + { + "epoch": 2.86, + "grad_norm": 0.7716579437255859, + "learning_rate": 3.01752064861297e-06, + "loss": 2.9713, + "step": 58449 + }, + { + "epoch": 2.86, + "grad_norm": 0.781461775302887, + "learning_rate": 3.015342838792756e-06, + "loss": 2.8235, + "step": 58450 + }, + { + "epoch": 2.86, + "grad_norm": 0.7181127667427063, + "learning_rate": 3.0131658111729463e-06, + "loss": 2.9488, + "step": 58451 + }, + { + "epoch": 2.86, + "grad_norm": 0.8236247897148132, + "learning_rate": 3.0109895657593365e-06, + "loss": 2.9258, + "step": 58452 + }, + { + "epoch": 2.86, + "grad_norm": 0.7421223521232605, + "learning_rate": 3.0088141025575887e-06, + "loss": 2.8039, + "step": 58453 + }, + { + "epoch": 2.86, + "grad_norm": 0.7448740601539612, + "learning_rate": 3.0066394215734645e-06, + "loss": 2.7816, + "step": 58454 + }, + { + "epoch": 2.86, + "grad_norm": 0.751390814781189, + "learning_rate": 3.0044655228126937e-06, + "loss": 2.8791, + "step": 58455 + }, + { + "epoch": 2.86, + "grad_norm": 0.7451471090316772, + "learning_rate": 3.0022924062810373e-06, + "loss": 2.8559, + "step": 58456 + }, + { + "epoch": 2.86, + "grad_norm": 0.7556320428848267, + "learning_rate": 3.0001200719841578e-06, + "loss": 2.661, + "step": 58457 + }, + { + "epoch": 2.86, + "grad_norm": 0.7909460067749023, + "learning_rate": 2.9979485199278175e-06, + "loss": 3.1855, + "step": 58458 + }, + { + "epoch": 2.86, + "grad_norm": 0.7238598465919495, + "learning_rate": 2.9957777501177115e-06, + "loss": 2.8134, + "step": 58459 + }, + { + "epoch": 2.86, + "grad_norm": 0.7427256107330322, + "learning_rate": 2.9936077625595356e-06, + "loss": 3.0505, + "step": 58460 + }, + { + "epoch": 2.87, + "grad_norm": 0.7333883047103882, + "learning_rate": 2.9914385572590848e-06, + "loss": 2.9026, + "step": 58461 + }, + { + "epoch": 2.87, + "grad_norm": 0.8171818256378174, + "learning_rate": 2.9892701342220215e-06, + "loss": 2.8903, + "step": 58462 + }, + { + "epoch": 2.87, + "grad_norm": 0.7717416286468506, + "learning_rate": 2.987102493454041e-06, + "loss": 2.8525, + "step": 58463 + }, + { + "epoch": 2.87, + "grad_norm": 0.7565333247184753, + "learning_rate": 2.9849356349608724e-06, + "loss": 2.8967, + "step": 58464 + }, + { + "epoch": 2.87, + "grad_norm": 0.7647886872291565, + "learning_rate": 2.9827695587482434e-06, + "loss": 2.9653, + "step": 58465 + }, + { + "epoch": 2.87, + "grad_norm": 0.7452661395072937, + "learning_rate": 2.9806042648217843e-06, + "loss": 2.8538, + "step": 58466 + }, + { + "epoch": 2.87, + "grad_norm": 0.7374056577682495, + "learning_rate": 2.9784397531872894e-06, + "loss": 2.9242, + "step": 58467 + }, + { + "epoch": 2.87, + "grad_norm": 0.7226657271385193, + "learning_rate": 2.9762760238504212e-06, + "loss": 2.7999, + "step": 58468 + }, + { + "epoch": 2.87, + "grad_norm": 0.7068235874176025, + "learning_rate": 2.9741130768168752e-06, + "loss": 2.7401, + "step": 58469 + }, + { + "epoch": 2.87, + "grad_norm": 0.7041826248168945, + "learning_rate": 2.971950912092347e-06, + "loss": 2.8399, + "step": 58470 + }, + { + "epoch": 2.87, + "grad_norm": 0.7928381562232971, + "learning_rate": 2.9697895296825645e-06, + "loss": 2.8014, + "step": 58471 + }, + { + "epoch": 2.87, + "grad_norm": 0.7238459587097168, + "learning_rate": 2.9676289295931577e-06, + "loss": 2.9021, + "step": 58472 + }, + { + "epoch": 2.87, + "grad_norm": 0.7136267423629761, + "learning_rate": 2.9654691118298544e-06, + "loss": 2.937, + "step": 58473 + }, + { + "epoch": 2.87, + "grad_norm": 0.7531981468200684, + "learning_rate": 2.963310076398351e-06, + "loss": 2.7484, + "step": 58474 + }, + { + "epoch": 2.87, + "grad_norm": 0.7254064083099365, + "learning_rate": 2.9611518233043084e-06, + "loss": 2.8743, + "step": 58475 + }, + { + "epoch": 2.87, + "grad_norm": 0.74024498462677, + "learning_rate": 2.9589943525534566e-06, + "loss": 2.8673, + "step": 58476 + }, + { + "epoch": 2.87, + "grad_norm": 0.7722803354263306, + "learning_rate": 2.9568376641514237e-06, + "loss": 3.0154, + "step": 58477 + }, + { + "epoch": 2.87, + "grad_norm": 0.7294988036155701, + "learning_rate": 2.9546817581039385e-06, + "loss": 2.8068, + "step": 58478 + }, + { + "epoch": 2.87, + "grad_norm": 0.7921958565711975, + "learning_rate": 2.95252663441663e-06, + "loss": 3.0001, + "step": 58479 + }, + { + "epoch": 2.87, + "grad_norm": 0.7250039577484131, + "learning_rate": 2.9503722930951933e-06, + "loss": 2.9304, + "step": 58480 + }, + { + "epoch": 2.87, + "grad_norm": 0.8294291496276855, + "learning_rate": 2.948218734145291e-06, + "loss": 2.7402, + "step": 58481 + }, + { + "epoch": 2.87, + "grad_norm": 0.7494039535522461, + "learning_rate": 2.946065957572652e-06, + "loss": 2.7678, + "step": 58482 + }, + { + "epoch": 2.87, + "grad_norm": 0.8283053040504456, + "learning_rate": 2.9439139633828714e-06, + "loss": 2.863, + "step": 58483 + }, + { + "epoch": 2.87, + "grad_norm": 0.772268533706665, + "learning_rate": 2.941762751581678e-06, + "loss": 2.7249, + "step": 58484 + }, + { + "epoch": 2.87, + "grad_norm": 0.7302533388137817, + "learning_rate": 2.939612322174734e-06, + "loss": 2.8957, + "step": 58485 + }, + { + "epoch": 2.87, + "grad_norm": 0.787757933139801, + "learning_rate": 2.9374626751676347e-06, + "loss": 2.9369, + "step": 58486 + }, + { + "epoch": 2.87, + "grad_norm": 0.7752677798271179, + "learning_rate": 2.935313810566109e-06, + "loss": 2.8947, + "step": 58487 + }, + { + "epoch": 2.87, + "grad_norm": 0.7386978268623352, + "learning_rate": 2.933165728375786e-06, + "loss": 2.7775, + "step": 58488 + }, + { + "epoch": 2.87, + "grad_norm": 0.7692047953605652, + "learning_rate": 2.9310184286023276e-06, + "loss": 3.0412, + "step": 58489 + }, + { + "epoch": 2.87, + "grad_norm": 0.7471635937690735, + "learning_rate": 2.928871911251396e-06, + "loss": 2.7332, + "step": 58490 + }, + { + "epoch": 2.87, + "grad_norm": 0.7705933451652527, + "learning_rate": 2.926726176328653e-06, + "loss": 3.0526, + "step": 58491 + }, + { + "epoch": 2.87, + "grad_norm": 0.7803830504417419, + "learning_rate": 2.9245812238397282e-06, + "loss": 2.9832, + "step": 58492 + }, + { + "epoch": 2.87, + "grad_norm": 0.7428114414215088, + "learning_rate": 2.9224370537903163e-06, + "loss": 2.7619, + "step": 58493 + }, + { + "epoch": 2.87, + "grad_norm": 0.7303468585014343, + "learning_rate": 2.9202936661860134e-06, + "loss": 2.7892, + "step": 58494 + }, + { + "epoch": 2.87, + "grad_norm": 0.7733170390129089, + "learning_rate": 2.918151061032481e-06, + "loss": 2.8862, + "step": 58495 + }, + { + "epoch": 2.87, + "grad_norm": 0.7337782979011536, + "learning_rate": 2.9160092383353483e-06, + "loss": 2.9276, + "step": 58496 + }, + { + "epoch": 2.87, + "grad_norm": 0.7292686700820923, + "learning_rate": 2.9138681981002775e-06, + "loss": 3.0101, + "step": 58497 + }, + { + "epoch": 2.87, + "grad_norm": 0.7879104018211365, + "learning_rate": 2.9117279403329307e-06, + "loss": 2.9083, + "step": 58498 + }, + { + "epoch": 2.87, + "grad_norm": 0.7869991660118103, + "learning_rate": 2.9095884650389034e-06, + "loss": 2.912, + "step": 58499 + }, + { + "epoch": 2.87, + "grad_norm": 0.7370448112487793, + "learning_rate": 2.9074497722238242e-06, + "loss": 3.0133, + "step": 58500 + }, + { + "epoch": 2.87, + "grad_norm": 0.7162608504295349, + "learning_rate": 2.9053118618933556e-06, + "loss": 3.022, + "step": 58501 + }, + { + "epoch": 2.87, + "grad_norm": 0.7638165950775146, + "learning_rate": 2.9031747340531264e-06, + "loss": 2.965, + "step": 58502 + }, + { + "epoch": 2.87, + "grad_norm": 0.731220543384552, + "learning_rate": 2.9010383887087317e-06, + "loss": 3.0144, + "step": 58503 + }, + { + "epoch": 2.87, + "grad_norm": 0.778514564037323, + "learning_rate": 2.8989028258658674e-06, + "loss": 2.801, + "step": 58504 + }, + { + "epoch": 2.87, + "grad_norm": 0.8099600076675415, + "learning_rate": 2.8967680455300956e-06, + "loss": 3.0025, + "step": 58505 + }, + { + "epoch": 2.87, + "grad_norm": 0.7909402847290039, + "learning_rate": 2.8946340477070452e-06, + "loss": 2.8844, + "step": 58506 + }, + { + "epoch": 2.87, + "grad_norm": 0.7233417630195618, + "learning_rate": 2.892500832402378e-06, + "loss": 2.7969, + "step": 58507 + }, + { + "epoch": 2.87, + "grad_norm": 0.7570949792861938, + "learning_rate": 2.8903683996216562e-06, + "loss": 2.9808, + "step": 58508 + }, + { + "epoch": 2.87, + "grad_norm": 0.7265278100967407, + "learning_rate": 2.8882367493705425e-06, + "loss": 2.9376, + "step": 58509 + }, + { + "epoch": 2.87, + "grad_norm": 0.7586358189582825, + "learning_rate": 2.886105881654599e-06, + "loss": 2.8498, + "step": 58510 + }, + { + "epoch": 2.87, + "grad_norm": 0.7601178884506226, + "learning_rate": 2.8839757964794873e-06, + "loss": 2.9384, + "step": 58511 + }, + { + "epoch": 2.87, + "grad_norm": 0.7448052167892456, + "learning_rate": 2.8818464938508366e-06, + "loss": 2.997, + "step": 58512 + }, + { + "epoch": 2.87, + "grad_norm": 0.7501569986343384, + "learning_rate": 2.879717973774176e-06, + "loss": 2.7712, + "step": 58513 + }, + { + "epoch": 2.87, + "grad_norm": 0.7544119954109192, + "learning_rate": 2.8775902362552004e-06, + "loss": 2.9557, + "step": 58514 + }, + { + "epoch": 2.87, + "grad_norm": 0.7905291318893433, + "learning_rate": 2.875463281299439e-06, + "loss": 3.0345, + "step": 58515 + }, + { + "epoch": 2.87, + "grad_norm": 0.7742028832435608, + "learning_rate": 2.8733371089125213e-06, + "loss": 2.854, + "step": 58516 + }, + { + "epoch": 2.87, + "grad_norm": 0.748437762260437, + "learning_rate": 2.871211719100075e-06, + "loss": 2.7507, + "step": 58517 + }, + { + "epoch": 2.87, + "grad_norm": 0.7571814060211182, + "learning_rate": 2.8690871118676295e-06, + "loss": 2.6589, + "step": 58518 + }, + { + "epoch": 2.87, + "grad_norm": 0.7534952759742737, + "learning_rate": 2.8669632872208805e-06, + "loss": 2.8444, + "step": 58519 + }, + { + "epoch": 2.87, + "grad_norm": 0.710351824760437, + "learning_rate": 2.864840245165323e-06, + "loss": 2.7474, + "step": 58520 + }, + { + "epoch": 2.87, + "grad_norm": 0.7488248944282532, + "learning_rate": 2.8627179857066195e-06, + "loss": 2.7427, + "step": 58521 + }, + { + "epoch": 2.87, + "grad_norm": 0.7320175766944885, + "learning_rate": 2.8605965088503324e-06, + "loss": 2.8392, + "step": 58522 + }, + { + "epoch": 2.87, + "grad_norm": 0.738500714302063, + "learning_rate": 2.8584758146020236e-06, + "loss": 2.6562, + "step": 58523 + }, + { + "epoch": 2.87, + "grad_norm": 0.7765610218048096, + "learning_rate": 2.856355902967322e-06, + "loss": 3.1511, + "step": 58524 + }, + { + "epoch": 2.87, + "grad_norm": 0.7138228416442871, + "learning_rate": 2.85423677395179e-06, + "loss": 2.9996, + "step": 58525 + }, + { + "epoch": 2.87, + "grad_norm": 0.7560122013092041, + "learning_rate": 2.8521184275609897e-06, + "loss": 2.9072, + "step": 58526 + }, + { + "epoch": 2.87, + "grad_norm": 0.7315943241119385, + "learning_rate": 2.85000086380055e-06, + "loss": 2.8984, + "step": 58527 + }, + { + "epoch": 2.87, + "grad_norm": 0.7746604084968567, + "learning_rate": 2.8478840826759996e-06, + "loss": 2.7893, + "step": 58528 + }, + { + "epoch": 2.87, + "grad_norm": 0.7341941595077515, + "learning_rate": 2.8457680841929344e-06, + "loss": 2.7045, + "step": 58529 + }, + { + "epoch": 2.87, + "grad_norm": 0.7307849526405334, + "learning_rate": 2.843652868356949e-06, + "loss": 2.7645, + "step": 58530 + }, + { + "epoch": 2.87, + "grad_norm": 0.7367571592330933, + "learning_rate": 2.841538435173607e-06, + "loss": 2.8463, + "step": 58531 + }, + { + "epoch": 2.87, + "grad_norm": 0.7715376615524292, + "learning_rate": 2.8394247846484363e-06, + "loss": 2.8366, + "step": 58532 + }, + { + "epoch": 2.87, + "grad_norm": 0.7914366722106934, + "learning_rate": 2.8373119167869994e-06, + "loss": 2.777, + "step": 58533 + }, + { + "epoch": 2.87, + "grad_norm": 0.7575352191925049, + "learning_rate": 2.835199831594959e-06, + "loss": 3.0308, + "step": 58534 + }, + { + "epoch": 2.87, + "grad_norm": 0.7674602270126343, + "learning_rate": 2.8330885290777426e-06, + "loss": 2.7934, + "step": 58535 + }, + { + "epoch": 2.87, + "grad_norm": 0.7100291848182678, + "learning_rate": 2.830978009241047e-06, + "loss": 2.8665, + "step": 58536 + }, + { + "epoch": 2.87, + "grad_norm": 0.7356333136558533, + "learning_rate": 2.828868272090334e-06, + "loss": 2.9221, + "step": 58537 + }, + { + "epoch": 2.87, + "grad_norm": 0.7471137642860413, + "learning_rate": 2.8267593176311665e-06, + "loss": 3.0157, + "step": 58538 + }, + { + "epoch": 2.87, + "grad_norm": 0.7663685083389282, + "learning_rate": 2.8246511458691722e-06, + "loss": 2.8722, + "step": 58539 + }, + { + "epoch": 2.87, + "grad_norm": 0.7581278085708618, + "learning_rate": 2.8225437568098143e-06, + "loss": 2.776, + "step": 58540 + }, + { + "epoch": 2.87, + "grad_norm": 0.7169347405433655, + "learning_rate": 2.820437150458721e-06, + "loss": 2.5599, + "step": 58541 + }, + { + "epoch": 2.87, + "grad_norm": 0.7184317111968994, + "learning_rate": 2.818331326821355e-06, + "loss": 2.8356, + "step": 58542 + }, + { + "epoch": 2.87, + "grad_norm": 0.7458834648132324, + "learning_rate": 2.8162262859033447e-06, + "loss": 2.8992, + "step": 58543 + }, + { + "epoch": 2.87, + "grad_norm": 0.7352158427238464, + "learning_rate": 2.8141220277102196e-06, + "loss": 2.8397, + "step": 58544 + }, + { + "epoch": 2.87, + "grad_norm": 0.716495156288147, + "learning_rate": 2.8120185522474415e-06, + "loss": 2.9937, + "step": 58545 + }, + { + "epoch": 2.87, + "grad_norm": 0.7433305978775024, + "learning_rate": 2.809915859520673e-06, + "loss": 3.0178, + "step": 58546 + }, + { + "epoch": 2.87, + "grad_norm": 0.7520804405212402, + "learning_rate": 2.8078139495353423e-06, + "loss": 2.777, + "step": 58547 + }, + { + "epoch": 2.87, + "grad_norm": 0.728420078754425, + "learning_rate": 2.8057128222970458e-06, + "loss": 3.1996, + "step": 58548 + }, + { + "epoch": 2.87, + "grad_norm": 0.816613495349884, + "learning_rate": 2.8036124778113123e-06, + "loss": 3.1029, + "step": 58549 + }, + { + "epoch": 2.87, + "grad_norm": 0.7988955974578857, + "learning_rate": 2.8015129160836704e-06, + "loss": 2.818, + "step": 58550 + }, + { + "epoch": 2.87, + "grad_norm": 0.7273043394088745, + "learning_rate": 2.7994141371196487e-06, + "loss": 2.9679, + "step": 58551 + }, + { + "epoch": 2.87, + "grad_norm": 0.7395277619361877, + "learning_rate": 2.797316140924777e-06, + "loss": 2.86, + "step": 58552 + }, + { + "epoch": 2.87, + "grad_norm": 0.7469648122787476, + "learning_rate": 2.79521892750455e-06, + "loss": 2.8988, + "step": 58553 + }, + { + "epoch": 2.87, + "grad_norm": 0.789919376373291, + "learning_rate": 2.7931224968645304e-06, + "loss": 2.7315, + "step": 58554 + }, + { + "epoch": 2.87, + "grad_norm": 0.7301388382911682, + "learning_rate": 2.7910268490102473e-06, + "loss": 3.0406, + "step": 58555 + }, + { + "epoch": 2.87, + "grad_norm": 0.7471961975097656, + "learning_rate": 2.7889319839471956e-06, + "loss": 2.7793, + "step": 58556 + }, + { + "epoch": 2.87, + "grad_norm": 0.8272915482521057, + "learning_rate": 2.7868379016808717e-06, + "loss": 2.7463, + "step": 58557 + }, + { + "epoch": 2.87, + "grad_norm": 0.7283830046653748, + "learning_rate": 2.7847446022168375e-06, + "loss": 2.9614, + "step": 58558 + }, + { + "epoch": 2.87, + "grad_norm": 0.697257399559021, + "learning_rate": 2.782652085560588e-06, + "loss": 2.6746, + "step": 58559 + }, + { + "epoch": 2.87, + "grad_norm": 0.7829978466033936, + "learning_rate": 2.7805603517176203e-06, + "loss": 3.048, + "step": 58560 + }, + { + "epoch": 2.87, + "grad_norm": 0.7517426609992981, + "learning_rate": 2.7784694006934618e-06, + "loss": 2.9577, + "step": 58561 + }, + { + "epoch": 2.87, + "grad_norm": 0.8072444796562195, + "learning_rate": 2.7763792324936083e-06, + "loss": 2.8453, + "step": 58562 + }, + { + "epoch": 2.87, + "grad_norm": 0.7768639326095581, + "learning_rate": 2.7742898471235562e-06, + "loss": 2.779, + "step": 58563 + }, + { + "epoch": 2.87, + "grad_norm": 0.7537323832511902, + "learning_rate": 2.7722012445888676e-06, + "loss": 2.6229, + "step": 58564 + }, + { + "epoch": 2.87, + "grad_norm": 0.7503150701522827, + "learning_rate": 2.7701134248949707e-06, + "loss": 2.7897, + "step": 58565 + }, + { + "epoch": 2.87, + "grad_norm": 0.7244771122932434, + "learning_rate": 2.7680263880473953e-06, + "loss": 2.7409, + "step": 58566 + }, + { + "epoch": 2.87, + "grad_norm": 0.7575398087501526, + "learning_rate": 2.765940134051636e-06, + "loss": 3.0002, + "step": 58567 + }, + { + "epoch": 2.87, + "grad_norm": 0.8123334646224976, + "learning_rate": 2.7638546629131566e-06, + "loss": 3.0256, + "step": 58568 + }, + { + "epoch": 2.87, + "grad_norm": 0.8360390663146973, + "learning_rate": 2.7617699746375178e-06, + "loss": 3.0575, + "step": 58569 + }, + { + "epoch": 2.87, + "grad_norm": 0.7276893854141235, + "learning_rate": 2.7596860692301492e-06, + "loss": 2.9831, + "step": 58570 + }, + { + "epoch": 2.87, + "grad_norm": 0.7142115831375122, + "learning_rate": 2.75760294669658e-06, + "loss": 3.0064, + "step": 58571 + }, + { + "epoch": 2.87, + "grad_norm": 0.7129017114639282, + "learning_rate": 2.755520607042272e-06, + "loss": 2.8834, + "step": 58572 + }, + { + "epoch": 2.87, + "grad_norm": 0.7552473545074463, + "learning_rate": 2.753439050272721e-06, + "loss": 2.7786, + "step": 58573 + }, + { + "epoch": 2.87, + "grad_norm": 0.7475464940071106, + "learning_rate": 2.751358276393423e-06, + "loss": 2.9663, + "step": 58574 + }, + { + "epoch": 2.87, + "grad_norm": 0.7509301900863647, + "learning_rate": 2.749278285409806e-06, + "loss": 2.6761, + "step": 58575 + }, + { + "epoch": 2.87, + "grad_norm": 0.7471511960029602, + "learning_rate": 2.7471990773274e-06, + "loss": 2.9407, + "step": 58576 + }, + { + "epoch": 2.87, + "grad_norm": 0.7476446628570557, + "learning_rate": 2.7451206521516334e-06, + "loss": 2.8716, + "step": 58577 + }, + { + "epoch": 2.87, + "grad_norm": 0.7277437448501587, + "learning_rate": 2.743043009888035e-06, + "loss": 3.0029, + "step": 58578 + }, + { + "epoch": 2.87, + "grad_norm": 0.7496187686920166, + "learning_rate": 2.7409661505420675e-06, + "loss": 2.9366, + "step": 58579 + }, + { + "epoch": 2.87, + "grad_norm": 0.7967445254325867, + "learning_rate": 2.738890074119193e-06, + "loss": 2.8992, + "step": 58580 + }, + { + "epoch": 2.87, + "grad_norm": 0.7687461376190186, + "learning_rate": 2.736814780624874e-06, + "loss": 2.6873, + "step": 58581 + }, + { + "epoch": 2.87, + "grad_norm": 0.7759029269218445, + "learning_rate": 2.7347402700645393e-06, + "loss": 3.1086, + "step": 58582 + }, + { + "epoch": 2.87, + "grad_norm": 0.8095799684524536, + "learning_rate": 2.7326665424436844e-06, + "loss": 2.8247, + "step": 58583 + }, + { + "epoch": 2.87, + "grad_norm": 0.7569171786308289, + "learning_rate": 2.730593597767805e-06, + "loss": 2.8072, + "step": 58584 + }, + { + "epoch": 2.87, + "grad_norm": 0.7547518014907837, + "learning_rate": 2.72852143604233e-06, + "loss": 2.8641, + "step": 58585 + }, + { + "epoch": 2.87, + "grad_norm": 0.7172838449478149, + "learning_rate": 2.7264500572727222e-06, + "loss": 2.9092, + "step": 58586 + }, + { + "epoch": 2.87, + "grad_norm": 0.7331339120864868, + "learning_rate": 2.724379461464443e-06, + "loss": 3.0988, + "step": 58587 + }, + { + "epoch": 2.87, + "grad_norm": 0.7577721476554871, + "learning_rate": 2.722309648622889e-06, + "loss": 2.5406, + "step": 58588 + }, + { + "epoch": 2.87, + "grad_norm": 0.7274854183197021, + "learning_rate": 2.7202406187535887e-06, + "loss": 2.9189, + "step": 58589 + }, + { + "epoch": 2.87, + "grad_norm": 0.7488541007041931, + "learning_rate": 2.7181723718619707e-06, + "loss": 2.788, + "step": 58590 + }, + { + "epoch": 2.87, + "grad_norm": 0.7572417855262756, + "learning_rate": 2.7161049079534646e-06, + "loss": 2.7814, + "step": 58591 + }, + { + "epoch": 2.87, + "grad_norm": 0.7595935463905334, + "learning_rate": 2.7140382270334994e-06, + "loss": 2.8215, + "step": 58592 + }, + { + "epoch": 2.87, + "grad_norm": 0.7744547724723816, + "learning_rate": 2.71197232910757e-06, + "loss": 2.7036, + "step": 58593 + }, + { + "epoch": 2.87, + "grad_norm": 0.7563602924346924, + "learning_rate": 2.709907214181106e-06, + "loss": 2.7836, + "step": 58594 + }, + { + "epoch": 2.87, + "grad_norm": 0.7460562586784363, + "learning_rate": 2.7078428822595034e-06, + "loss": 2.9322, + "step": 58595 + }, + { + "epoch": 2.87, + "grad_norm": 0.7538842558860779, + "learning_rate": 2.705779333348257e-06, + "loss": 2.8706, + "step": 58596 + }, + { + "epoch": 2.87, + "grad_norm": 0.7640366554260254, + "learning_rate": 2.703716567452763e-06, + "loss": 2.8659, + "step": 58597 + }, + { + "epoch": 2.87, + "grad_norm": 0.7012739777565002, + "learning_rate": 2.7016545845784497e-06, + "loss": 2.9261, + "step": 58598 + }, + { + "epoch": 2.87, + "grad_norm": 0.7518980503082275, + "learning_rate": 2.6995933847307473e-06, + "loss": 2.8051, + "step": 58599 + }, + { + "epoch": 2.87, + "grad_norm": 0.7138656973838806, + "learning_rate": 2.697532967915117e-06, + "loss": 2.7991, + "step": 58600 + }, + { + "epoch": 2.87, + "grad_norm": 0.7334188222885132, + "learning_rate": 2.6954733341369884e-06, + "loss": 2.888, + "step": 58601 + }, + { + "epoch": 2.87, + "grad_norm": 0.7425467371940613, + "learning_rate": 2.6934144834017567e-06, + "loss": 2.5813, + "step": 58602 + }, + { + "epoch": 2.87, + "grad_norm": 0.7227640151977539, + "learning_rate": 2.6913564157148515e-06, + "loss": 2.7079, + "step": 58603 + }, + { + "epoch": 2.87, + "grad_norm": 0.7601994276046753, + "learning_rate": 2.689299131081668e-06, + "loss": 2.9336, + "step": 58604 + }, + { + "epoch": 2.87, + "grad_norm": 0.7707348465919495, + "learning_rate": 2.6872426295076686e-06, + "loss": 3.0102, + "step": 58605 + }, + { + "epoch": 2.87, + "grad_norm": 0.7580685019493103, + "learning_rate": 2.685186910998283e-06, + "loss": 3.0894, + "step": 58606 + }, + { + "epoch": 2.87, + "grad_norm": 0.7323053479194641, + "learning_rate": 2.6831319755588387e-06, + "loss": 3.0489, + "step": 58607 + }, + { + "epoch": 2.87, + "grad_norm": 0.7686645984649658, + "learning_rate": 2.681077823194866e-06, + "loss": 2.8456, + "step": 58608 + }, + { + "epoch": 2.87, + "grad_norm": 0.7553392052650452, + "learning_rate": 2.6790244539116935e-06, + "loss": 2.83, + "step": 58609 + }, + { + "epoch": 2.87, + "grad_norm": 0.7549324631690979, + "learning_rate": 2.6769718677147167e-06, + "loss": 3.0199, + "step": 58610 + }, + { + "epoch": 2.87, + "grad_norm": 0.758166491985321, + "learning_rate": 2.6749200646093982e-06, + "loss": 2.9844, + "step": 58611 + }, + { + "epoch": 2.87, + "grad_norm": 0.7355962991714478, + "learning_rate": 2.6728690446011336e-06, + "loss": 3.0423, + "step": 58612 + }, + { + "epoch": 2.87, + "grad_norm": 0.7758812308311462, + "learning_rate": 2.6708188076952854e-06, + "loss": 2.8497, + "step": 58613 + }, + { + "epoch": 2.87, + "grad_norm": 0.7655168175697327, + "learning_rate": 2.668769353897282e-06, + "loss": 2.911, + "step": 58614 + }, + { + "epoch": 2.87, + "grad_norm": 0.731803834438324, + "learning_rate": 2.6667206832125197e-06, + "loss": 2.5736, + "step": 58615 + }, + { + "epoch": 2.87, + "grad_norm": 0.727342426776886, + "learning_rate": 2.6646727956463943e-06, + "loss": 3.0545, + "step": 58616 + }, + { + "epoch": 2.87, + "grad_norm": 0.7434524297714233, + "learning_rate": 2.6626256912043343e-06, + "loss": 3.0437, + "step": 58617 + }, + { + "epoch": 2.87, + "grad_norm": 0.805879533290863, + "learning_rate": 2.660579369891669e-06, + "loss": 2.9437, + "step": 58618 + }, + { + "epoch": 2.87, + "grad_norm": 0.76210618019104, + "learning_rate": 2.658533831713794e-06, + "loss": 2.8384, + "step": 58619 + }, + { + "epoch": 2.87, + "grad_norm": 0.7613382935523987, + "learning_rate": 2.6564890766761047e-06, + "loss": 2.7502, + "step": 58620 + }, + { + "epoch": 2.87, + "grad_norm": 0.7467081546783447, + "learning_rate": 2.654445104784031e-06, + "loss": 2.6637, + "step": 58621 + }, + { + "epoch": 2.87, + "grad_norm": 0.7348057627677917, + "learning_rate": 2.652401916042901e-06, + "loss": 2.9316, + "step": 58622 + }, + { + "epoch": 2.87, + "grad_norm": 0.8229193687438965, + "learning_rate": 2.6503595104581444e-06, + "loss": 3.0189, + "step": 58623 + }, + { + "epoch": 2.87, + "grad_norm": 0.741585910320282, + "learning_rate": 2.648317888035123e-06, + "loss": 2.9296, + "step": 58624 + }, + { + "epoch": 2.87, + "grad_norm": 0.7843425869941711, + "learning_rate": 2.6462770487791663e-06, + "loss": 2.562, + "step": 58625 + }, + { + "epoch": 2.87, + "grad_norm": 0.7439048290252686, + "learning_rate": 2.644236992695703e-06, + "loss": 3.0224, + "step": 58626 + }, + { + "epoch": 2.87, + "grad_norm": 0.7619640827178955, + "learning_rate": 2.6421977197900956e-06, + "loss": 2.9144, + "step": 58627 + }, + { + "epoch": 2.87, + "grad_norm": 0.7499315142631531, + "learning_rate": 2.64015923006774e-06, + "loss": 2.5829, + "step": 58628 + }, + { + "epoch": 2.87, + "grad_norm": 0.7854250073432922, + "learning_rate": 2.6381215235339315e-06, + "loss": 2.7211, + "step": 58629 + }, + { + "epoch": 2.87, + "grad_norm": 0.7892012000083923, + "learning_rate": 2.6360846001940994e-06, + "loss": 2.8919, + "step": 58630 + }, + { + "epoch": 2.87, + "grad_norm": 0.7646967768669128, + "learning_rate": 2.634048460053573e-06, + "loss": 3.0164, + "step": 58631 + }, + { + "epoch": 2.87, + "grad_norm": 0.7222882509231567, + "learning_rate": 2.6320131031177805e-06, + "loss": 2.8806, + "step": 58632 + }, + { + "epoch": 2.87, + "grad_norm": 0.7930301427841187, + "learning_rate": 2.6299785293920184e-06, + "loss": 2.9666, + "step": 58633 + }, + { + "epoch": 2.87, + "grad_norm": 0.775130569934845, + "learning_rate": 2.627944738881649e-06, + "loss": 2.9753, + "step": 58634 + }, + { + "epoch": 2.87, + "grad_norm": 0.7208459377288818, + "learning_rate": 2.625911731592034e-06, + "loss": 3.2084, + "step": 58635 + }, + { + "epoch": 2.87, + "grad_norm": 0.7527031898498535, + "learning_rate": 2.62387950752857e-06, + "loss": 2.8473, + "step": 58636 + }, + { + "epoch": 2.87, + "grad_norm": 0.7546898722648621, + "learning_rate": 2.621848066696519e-06, + "loss": 2.799, + "step": 58637 + }, + { + "epoch": 2.87, + "grad_norm": 0.7190507054328918, + "learning_rate": 2.6198174091013435e-06, + "loss": 2.8375, + "step": 58638 + }, + { + "epoch": 2.87, + "grad_norm": 0.7207900285720825, + "learning_rate": 2.617787534748339e-06, + "loss": 2.6189, + "step": 58639 + }, + { + "epoch": 2.87, + "grad_norm": 0.7409490346908569, + "learning_rate": 2.6157584436428016e-06, + "loss": 2.9473, + "step": 58640 + }, + { + "epoch": 2.87, + "grad_norm": 0.7580714225769043, + "learning_rate": 2.6137301357901596e-06, + "loss": 2.8106, + "step": 58641 + }, + { + "epoch": 2.87, + "grad_norm": 0.792218804359436, + "learning_rate": 2.6117026111957097e-06, + "loss": 2.783, + "step": 58642 + }, + { + "epoch": 2.87, + "grad_norm": 0.7418773770332336, + "learning_rate": 2.6096758698648137e-06, + "loss": 3.0417, + "step": 58643 + }, + { + "epoch": 2.87, + "grad_norm": 0.7960341572761536, + "learning_rate": 2.607649911802767e-06, + "loss": 2.8398, + "step": 58644 + }, + { + "epoch": 2.87, + "grad_norm": 0.7470495700836182, + "learning_rate": 2.605624737014966e-06, + "loss": 3.0263, + "step": 58645 + }, + { + "epoch": 2.87, + "grad_norm": 0.7760054469108582, + "learning_rate": 2.60360034550674e-06, + "loss": 2.8589, + "step": 58646 + }, + { + "epoch": 2.87, + "grad_norm": 0.7305024266242981, + "learning_rate": 2.6015767372833507e-06, + "loss": 2.8561, + "step": 58647 + }, + { + "epoch": 2.87, + "grad_norm": 0.753297746181488, + "learning_rate": 2.599553912350194e-06, + "loss": 2.9018, + "step": 58648 + }, + { + "epoch": 2.87, + "grad_norm": 0.7611088156700134, + "learning_rate": 2.5975318707125325e-06, + "loss": 2.9396, + "step": 58649 + }, + { + "epoch": 2.87, + "grad_norm": 0.768486499786377, + "learning_rate": 2.595510612375762e-06, + "loss": 2.6987, + "step": 58650 + }, + { + "epoch": 2.87, + "grad_norm": 0.8263199925422668, + "learning_rate": 2.593490137345211e-06, + "loss": 2.8593, + "step": 58651 + }, + { + "epoch": 2.87, + "grad_norm": 0.7508795857429504, + "learning_rate": 2.5914704456261428e-06, + "loss": 2.9293, + "step": 58652 + }, + { + "epoch": 2.87, + "grad_norm": 0.790963888168335, + "learning_rate": 2.5894515372238854e-06, + "loss": 2.7323, + "step": 58653 + }, + { + "epoch": 2.87, + "grad_norm": 0.7731552124023438, + "learning_rate": 2.5874334121438356e-06, + "loss": 2.994, + "step": 58654 + }, + { + "epoch": 2.87, + "grad_norm": 0.7483537197113037, + "learning_rate": 2.5854160703911885e-06, + "loss": 2.8272, + "step": 58655 + }, + { + "epoch": 2.87, + "grad_norm": 0.749880850315094, + "learning_rate": 2.5833995119713403e-06, + "loss": 2.9543, + "step": 58656 + }, + { + "epoch": 2.87, + "grad_norm": 0.7379136681556702, + "learning_rate": 2.581383736889553e-06, + "loss": 2.9406, + "step": 58657 + }, + { + "epoch": 2.87, + "grad_norm": 0.753685474395752, + "learning_rate": 2.5793687451512223e-06, + "loss": 2.8851, + "step": 58658 + }, + { + "epoch": 2.87, + "grad_norm": 0.7053732872009277, + "learning_rate": 2.5773545367615444e-06, + "loss": 2.9854, + "step": 58659 + }, + { + "epoch": 2.87, + "grad_norm": 0.7595216035842896, + "learning_rate": 2.5753411117258817e-06, + "loss": 2.8745, + "step": 58660 + }, + { + "epoch": 2.87, + "grad_norm": 0.7526916861534119, + "learning_rate": 2.5733284700495295e-06, + "loss": 2.9997, + "step": 58661 + }, + { + "epoch": 2.87, + "grad_norm": 0.7947744131088257, + "learning_rate": 2.5713166117377836e-06, + "loss": 2.8262, + "step": 58662 + }, + { + "epoch": 2.87, + "grad_norm": 0.7724664211273193, + "learning_rate": 2.5693055367959737e-06, + "loss": 2.7696, + "step": 58663 + }, + { + "epoch": 2.87, + "grad_norm": 0.7890022397041321, + "learning_rate": 2.5672952452293617e-06, + "loss": 2.8872, + "step": 58664 + }, + { + "epoch": 2.88, + "grad_norm": 0.7602983713150024, + "learning_rate": 2.5652857370432432e-06, + "loss": 2.9562, + "step": 58665 + }, + { + "epoch": 2.88, + "grad_norm": 0.7163755297660828, + "learning_rate": 2.5632770122429148e-06, + "loss": 2.8898, + "step": 58666 + }, + { + "epoch": 2.88, + "grad_norm": 0.7645668983459473, + "learning_rate": 2.5612690708337046e-06, + "loss": 2.8835, + "step": 58667 + }, + { + "epoch": 2.88, + "grad_norm": 0.7817904949188232, + "learning_rate": 2.5592619128208424e-06, + "loss": 3.0378, + "step": 58668 + }, + { + "epoch": 2.88, + "grad_norm": 0.7333517670631409, + "learning_rate": 2.5572555382096236e-06, + "loss": 2.9638, + "step": 58669 + }, + { + "epoch": 2.88, + "grad_norm": 0.7220652103424072, + "learning_rate": 2.555249947005378e-06, + "loss": 2.7169, + "step": 58670 + }, + { + "epoch": 2.88, + "grad_norm": 0.7610824108123779, + "learning_rate": 2.5532451392133667e-06, + "loss": 2.8406, + "step": 58671 + }, + { + "epoch": 2.88, + "grad_norm": 0.8026726245880127, + "learning_rate": 2.55124111483882e-06, + "loss": 3.1763, + "step": 58672 + }, + { + "epoch": 2.88, + "grad_norm": 0.7766146659851074, + "learning_rate": 2.5492378738871e-06, + "loss": 2.7902, + "step": 58673 + }, + { + "epoch": 2.88, + "grad_norm": 0.7230070233345032, + "learning_rate": 2.5472354163634357e-06, + "loss": 2.7006, + "step": 58674 + }, + { + "epoch": 2.88, + "grad_norm": 0.7731808423995972, + "learning_rate": 2.5452337422730894e-06, + "loss": 3.0314, + "step": 58675 + }, + { + "epoch": 2.88, + "grad_norm": 0.7285119891166687, + "learning_rate": 2.5432328516213574e-06, + "loss": 2.8947, + "step": 58676 + }, + { + "epoch": 2.88, + "grad_norm": 0.746861457824707, + "learning_rate": 2.5412327444135017e-06, + "loss": 2.9094, + "step": 58677 + }, + { + "epoch": 2.88, + "grad_norm": 0.7949832677841187, + "learning_rate": 2.539233420654818e-06, + "loss": 2.8981, + "step": 58678 + }, + { + "epoch": 2.88, + "grad_norm": 0.7782156467437744, + "learning_rate": 2.5372348803505026e-06, + "loss": 2.8062, + "step": 58679 + }, + { + "epoch": 2.88, + "grad_norm": 0.7405202984809875, + "learning_rate": 2.5352371235058845e-06, + "loss": 2.943, + "step": 58680 + }, + { + "epoch": 2.88, + "grad_norm": 0.7959614396095276, + "learning_rate": 2.5332401501262256e-06, + "loss": 2.7178, + "step": 58681 + }, + { + "epoch": 2.88, + "grad_norm": 0.7623839974403381, + "learning_rate": 2.5312439602167555e-06, + "loss": 2.9395, + "step": 58682 + }, + { + "epoch": 2.88, + "grad_norm": 0.7368699312210083, + "learning_rate": 2.529248553782737e-06, + "loss": 2.8478, + "step": 58683 + }, + { + "epoch": 2.88, + "grad_norm": 0.7328764796257019, + "learning_rate": 2.527253930829398e-06, + "loss": 2.7829, + "step": 58684 + }, + { + "epoch": 2.88, + "grad_norm": 0.7400158643722534, + "learning_rate": 2.525260091362036e-06, + "loss": 2.8827, + "step": 58685 + }, + { + "epoch": 2.88, + "grad_norm": 0.7456002831459045, + "learning_rate": 2.523267035385912e-06, + "loss": 2.8261, + "step": 58686 + }, + { + "epoch": 2.88, + "grad_norm": 0.726157009601593, + "learning_rate": 2.5212747629062224e-06, + "loss": 2.7118, + "step": 58687 + }, + { + "epoch": 2.88, + "grad_norm": 0.7584595680236816, + "learning_rate": 2.519283273928263e-06, + "loss": 2.9325, + "step": 58688 + }, + { + "epoch": 2.88, + "grad_norm": 0.8131120204925537, + "learning_rate": 2.5172925684572633e-06, + "loss": 3.0036, + "step": 58689 + }, + { + "epoch": 2.88, + "grad_norm": 0.760067880153656, + "learning_rate": 2.5153026464984517e-06, + "loss": 2.9375, + "step": 58690 + }, + { + "epoch": 2.88, + "grad_norm": 0.7592241764068604, + "learning_rate": 2.513313508057091e-06, + "loss": 2.9265, + "step": 58691 + }, + { + "epoch": 2.88, + "grad_norm": 0.800504744052887, + "learning_rate": 2.5113251531383773e-06, + "loss": 2.8072, + "step": 58692 + }, + { + "epoch": 2.88, + "grad_norm": 0.7303928732872009, + "learning_rate": 2.509337581747606e-06, + "loss": 2.8293, + "step": 58693 + }, + { + "epoch": 2.88, + "grad_norm": 0.8083999156951904, + "learning_rate": 2.5073507938900064e-06, + "loss": 2.6546, + "step": 58694 + }, + { + "epoch": 2.88, + "grad_norm": 0.7412149310112, + "learning_rate": 2.505364789570774e-06, + "loss": 2.9867, + "step": 58695 + }, + { + "epoch": 2.88, + "grad_norm": 0.7698215246200562, + "learning_rate": 2.5033795687951387e-06, + "loss": 3.118, + "step": 58696 + }, + { + "epoch": 2.88, + "grad_norm": 0.77435702085495, + "learning_rate": 2.501395131568362e-06, + "loss": 3.0678, + "step": 58697 + }, + { + "epoch": 2.88, + "grad_norm": 0.7428318858146667, + "learning_rate": 2.499411477895674e-06, + "loss": 2.9809, + "step": 58698 + }, + { + "epoch": 2.88, + "grad_norm": 0.7360471487045288, + "learning_rate": 2.497428607782237e-06, + "loss": 2.9411, + "step": 58699 + }, + { + "epoch": 2.88, + "grad_norm": 0.7438157796859741, + "learning_rate": 2.4954465212333465e-06, + "loss": 2.7105, + "step": 58700 + }, + { + "epoch": 2.88, + "grad_norm": 0.6961084604263306, + "learning_rate": 2.493465218254198e-06, + "loss": 2.7633, + "step": 58701 + }, + { + "epoch": 2.88, + "grad_norm": 0.7811095118522644, + "learning_rate": 2.491484698849988e-06, + "loss": 2.8801, + "step": 58702 + }, + { + "epoch": 2.88, + "grad_norm": 0.7979738116264343, + "learning_rate": 2.489504963025979e-06, + "loss": 2.9504, + "step": 58703 + }, + { + "epoch": 2.88, + "grad_norm": 0.8025102615356445, + "learning_rate": 2.487526010787333e-06, + "loss": 2.6882, + "step": 58704 + }, + { + "epoch": 2.88, + "grad_norm": 0.771865725517273, + "learning_rate": 2.4855478421393126e-06, + "loss": 2.8791, + "step": 58705 + }, + { + "epoch": 2.88, + "grad_norm": 0.7456363439559937, + "learning_rate": 2.4835704570870806e-06, + "loss": 2.6352, + "step": 58706 + }, + { + "epoch": 2.88, + "grad_norm": 0.7263805866241455, + "learning_rate": 2.481593855635833e-06, + "loss": 2.8166, + "step": 58707 + }, + { + "epoch": 2.88, + "grad_norm": 0.7499218583106995, + "learning_rate": 2.479618037790865e-06, + "loss": 2.9985, + "step": 58708 + }, + { + "epoch": 2.88, + "grad_norm": 0.724136471748352, + "learning_rate": 2.4776430035572723e-06, + "loss": 2.9608, + "step": 58709 + }, + { + "epoch": 2.88, + "grad_norm": 0.7608489990234375, + "learning_rate": 2.475668752940352e-06, + "loss": 2.6573, + "step": 58710 + }, + { + "epoch": 2.88, + "grad_norm": 0.7402869462966919, + "learning_rate": 2.4736952859452653e-06, + "loss": 3.0642, + "step": 58711 + }, + { + "epoch": 2.88, + "grad_norm": 0.7280595302581787, + "learning_rate": 2.471722602577175e-06, + "loss": 2.778, + "step": 58712 + }, + { + "epoch": 2.88, + "grad_norm": 0.7287282943725586, + "learning_rate": 2.469750702841311e-06, + "loss": 2.8341, + "step": 58713 + }, + { + "epoch": 2.88, + "grad_norm": 0.7806359529495239, + "learning_rate": 2.4677795867428684e-06, + "loss": 3.1066, + "step": 58714 + }, + { + "epoch": 2.88, + "grad_norm": 0.7894913554191589, + "learning_rate": 2.4658092542870434e-06, + "loss": 3.0778, + "step": 58715 + }, + { + "epoch": 2.88, + "grad_norm": 0.7530649304389954, + "learning_rate": 2.4638397054789984e-06, + "loss": 2.9428, + "step": 58716 + }, + { + "epoch": 2.88, + "grad_norm": 0.7815943360328674, + "learning_rate": 2.461870940323929e-06, + "loss": 3.0238, + "step": 58717 + }, + { + "epoch": 2.88, + "grad_norm": 0.7556338310241699, + "learning_rate": 2.459902958827065e-06, + "loss": 2.8127, + "step": 58718 + }, + { + "epoch": 2.88, + "grad_norm": 0.7679249048233032, + "learning_rate": 2.4579357609935344e-06, + "loss": 2.9174, + "step": 58719 + }, + { + "epoch": 2.88, + "grad_norm": 0.8016083836555481, + "learning_rate": 2.455969346828568e-06, + "loss": 2.8592, + "step": 58720 + }, + { + "epoch": 2.88, + "grad_norm": 0.7322315573692322, + "learning_rate": 2.4540037163372606e-06, + "loss": 3.0443, + "step": 58721 + }, + { + "epoch": 2.88, + "grad_norm": 0.7427347898483276, + "learning_rate": 2.452038869524875e-06, + "loss": 2.7349, + "step": 58722 + }, + { + "epoch": 2.88, + "grad_norm": 0.7253776788711548, + "learning_rate": 2.45007480639654e-06, + "loss": 2.9283, + "step": 58723 + }, + { + "epoch": 2.88, + "grad_norm": 0.7481330037117004, + "learning_rate": 2.4481115269574526e-06, + "loss": 2.8383, + "step": 58724 + }, + { + "epoch": 2.88, + "grad_norm": 0.7602149248123169, + "learning_rate": 2.446149031212774e-06, + "loss": 2.7756, + "step": 58725 + }, + { + "epoch": 2.88, + "grad_norm": 0.7619256377220154, + "learning_rate": 2.4441873191676677e-06, + "loss": 3.0721, + "step": 58726 + }, + { + "epoch": 2.88, + "grad_norm": 0.7644228935241699, + "learning_rate": 2.4422263908272953e-06, + "loss": 2.8749, + "step": 58727 + }, + { + "epoch": 2.88, + "grad_norm": 0.7094855904579163, + "learning_rate": 2.4402662461968538e-06, + "loss": 3.0339, + "step": 58728 + }, + { + "epoch": 2.88, + "grad_norm": 0.7785953283309937, + "learning_rate": 2.4383068852814715e-06, + "loss": 2.9188, + "step": 58729 + }, + { + "epoch": 2.88, + "grad_norm": 0.7782620191574097, + "learning_rate": 2.4363483080863446e-06, + "loss": 2.7007, + "step": 58730 + }, + { + "epoch": 2.88, + "grad_norm": 0.7648835778236389, + "learning_rate": 2.434390514616602e-06, + "loss": 3.0109, + "step": 58731 + }, + { + "epoch": 2.88, + "grad_norm": 0.7509472966194153, + "learning_rate": 2.4324335048773734e-06, + "loss": 2.8961, + "step": 58732 + }, + { + "epoch": 2.88, + "grad_norm": 0.7752648591995239, + "learning_rate": 2.430477278873888e-06, + "loss": 2.8416, + "step": 58733 + }, + { + "epoch": 2.88, + "grad_norm": 0.775607168674469, + "learning_rate": 2.428521836611241e-06, + "loss": 2.7793, + "step": 58734 + }, + { + "epoch": 2.88, + "grad_norm": 0.7678676247596741, + "learning_rate": 2.426567178094596e-06, + "loss": 2.8429, + "step": 58735 + }, + { + "epoch": 2.88, + "grad_norm": 0.7633538246154785, + "learning_rate": 2.4246133033291148e-06, + "loss": 2.7533, + "step": 58736 + }, + { + "epoch": 2.88, + "grad_norm": 0.7499951124191284, + "learning_rate": 2.4226602123199267e-06, + "loss": 3.0187, + "step": 58737 + }, + { + "epoch": 2.88, + "grad_norm": 0.8302484750747681, + "learning_rate": 2.420707905072161e-06, + "loss": 2.9427, + "step": 58738 + }, + { + "epoch": 2.88, + "grad_norm": 0.7577258348464966, + "learning_rate": 2.418756381591014e-06, + "loss": 2.967, + "step": 58739 + }, + { + "epoch": 2.88, + "grad_norm": 0.7333148121833801, + "learning_rate": 2.416805641881581e-06, + "loss": 2.869, + "step": 58740 + }, + { + "epoch": 2.88, + "grad_norm": 0.8242916464805603, + "learning_rate": 2.4148556859490242e-06, + "loss": 2.9112, + "step": 58741 + }, + { + "epoch": 2.88, + "grad_norm": 0.7202953696250916, + "learning_rate": 2.4129065137984404e-06, + "loss": 2.6746, + "step": 58742 + }, + { + "epoch": 2.88, + "grad_norm": 0.7955062389373779, + "learning_rate": 2.410958125434992e-06, + "loss": 2.7812, + "step": 58743 + }, + { + "epoch": 2.88, + "grad_norm": 0.8054195642471313, + "learning_rate": 2.4090105208638078e-06, + "loss": 3.0059, + "step": 58744 + }, + { + "epoch": 2.88, + "grad_norm": 0.7250577807426453, + "learning_rate": 2.4070637000900174e-06, + "loss": 2.8463, + "step": 58745 + }, + { + "epoch": 2.88, + "grad_norm": 0.6978896260261536, + "learning_rate": 2.4051176631187497e-06, + "loss": 2.8534, + "step": 58746 + }, + { + "epoch": 2.88, + "grad_norm": 0.794734001159668, + "learning_rate": 2.403172409955134e-06, + "loss": 2.6841, + "step": 58747 + }, + { + "epoch": 2.88, + "grad_norm": 0.7843784093856812, + "learning_rate": 2.4012279406043e-06, + "loss": 2.7228, + "step": 58748 + }, + { + "epoch": 2.88, + "grad_norm": 0.7873679399490356, + "learning_rate": 2.3992842550713433e-06, + "loss": 2.8963, + "step": 58749 + }, + { + "epoch": 2.88, + "grad_norm": 0.7977191209793091, + "learning_rate": 2.397341353361393e-06, + "loss": 2.8242, + "step": 58750 + }, + { + "epoch": 2.88, + "grad_norm": 0.7681131958961487, + "learning_rate": 2.395399235479578e-06, + "loss": 2.8686, + "step": 58751 + }, + { + "epoch": 2.88, + "grad_norm": 0.7318379282951355, + "learning_rate": 2.393457901430995e-06, + "loss": 2.9615, + "step": 58752 + }, + { + "epoch": 2.88, + "grad_norm": 0.7517814040184021, + "learning_rate": 2.391517351220773e-06, + "loss": 2.8989, + "step": 58753 + }, + { + "epoch": 2.88, + "grad_norm": 0.7598768472671509, + "learning_rate": 2.3895775848540413e-06, + "loss": 2.9079, + "step": 58754 + }, + { + "epoch": 2.88, + "grad_norm": 0.7105078101158142, + "learning_rate": 2.387638602335862e-06, + "loss": 2.8294, + "step": 58755 + }, + { + "epoch": 2.88, + "grad_norm": 0.7269749641418457, + "learning_rate": 2.385700403671398e-06, + "loss": 2.7675, + "step": 58756 + }, + { + "epoch": 2.88, + "grad_norm": 0.7759285569190979, + "learning_rate": 2.383762988865678e-06, + "loss": 2.9059, + "step": 58757 + }, + { + "epoch": 2.88, + "grad_norm": 0.74515700340271, + "learning_rate": 2.381826357923866e-06, + "loss": 2.7272, + "step": 58758 + }, + { + "epoch": 2.88, + "grad_norm": 0.7521591186523438, + "learning_rate": 2.3798905108510567e-06, + "loss": 3.0477, + "step": 58759 + }, + { + "epoch": 2.88, + "grad_norm": 0.7557122707366943, + "learning_rate": 2.3779554476523134e-06, + "loss": 2.7349, + "step": 58760 + }, + { + "epoch": 2.88, + "grad_norm": 0.7690261602401733, + "learning_rate": 2.3760211683327644e-06, + "loss": 2.8652, + "step": 58761 + }, + { + "epoch": 2.88, + "grad_norm": 0.780361533164978, + "learning_rate": 2.374087672897507e-06, + "loss": 2.8442, + "step": 58762 + }, + { + "epoch": 2.88, + "grad_norm": 0.7434288263320923, + "learning_rate": 2.3721549613516355e-06, + "loss": 2.88, + "step": 58763 + }, + { + "epoch": 2.88, + "grad_norm": 0.8056708574295044, + "learning_rate": 2.3702230337002135e-06, + "loss": 2.9113, + "step": 58764 + }, + { + "epoch": 2.88, + "grad_norm": 0.8034791946411133, + "learning_rate": 2.368291889948337e-06, + "loss": 2.7607, + "step": 58765 + }, + { + "epoch": 2.88, + "grad_norm": 0.7488557696342468, + "learning_rate": 2.3663615301011018e-06, + "loss": 2.7314, + "step": 58766 + }, + { + "epoch": 2.88, + "grad_norm": 0.7275140881538391, + "learning_rate": 2.364431954163604e-06, + "loss": 2.7287, + "step": 58767 + }, + { + "epoch": 2.88, + "grad_norm": 0.7067748308181763, + "learning_rate": 2.3625031621409053e-06, + "loss": 2.8275, + "step": 58768 + }, + { + "epoch": 2.88, + "grad_norm": 0.7348287105560303, + "learning_rate": 2.3605751540380692e-06, + "loss": 2.9979, + "step": 58769 + }, + { + "epoch": 2.88, + "grad_norm": 0.7248157262802124, + "learning_rate": 2.3586479298602245e-06, + "loss": 2.8282, + "step": 58770 + }, + { + "epoch": 2.88, + "grad_norm": 0.7620055079460144, + "learning_rate": 2.3567214896124007e-06, + "loss": 2.7483, + "step": 58771 + }, + { + "epoch": 2.88, + "grad_norm": 0.7315212488174438, + "learning_rate": 2.3547958332997276e-06, + "loss": 2.7802, + "step": 58772 + }, + { + "epoch": 2.88, + "grad_norm": 0.7276974320411682, + "learning_rate": 2.3528709609272e-06, + "loss": 2.9266, + "step": 58773 + }, + { + "epoch": 2.88, + "grad_norm": 0.7391351461410522, + "learning_rate": 2.3509468724999146e-06, + "loss": 2.9255, + "step": 58774 + }, + { + "epoch": 2.88, + "grad_norm": 0.8038753271102905, + "learning_rate": 2.3490235680230006e-06, + "loss": 2.572, + "step": 58775 + }, + { + "epoch": 2.88, + "grad_norm": 0.7655202150344849, + "learning_rate": 2.3471010475014207e-06, + "loss": 2.8412, + "step": 58776 + }, + { + "epoch": 2.88, + "grad_norm": 0.7686148881912231, + "learning_rate": 2.345179310940337e-06, + "loss": 3.075, + "step": 58777 + }, + { + "epoch": 2.88, + "grad_norm": 0.7740659117698669, + "learning_rate": 2.3432583583447462e-06, + "loss": 2.96, + "step": 58778 + }, + { + "epoch": 2.88, + "grad_norm": 0.6870248913764954, + "learning_rate": 2.34133818971971e-06, + "loss": 2.7546, + "step": 58779 + }, + { + "epoch": 2.88, + "grad_norm": 0.7673691511154175, + "learning_rate": 2.3394188050703587e-06, + "loss": 2.7017, + "step": 58780 + }, + { + "epoch": 2.88, + "grad_norm": 0.7266324162483215, + "learning_rate": 2.337500204401621e-06, + "loss": 2.9339, + "step": 58781 + }, + { + "epoch": 2.88, + "grad_norm": 0.7386431694030762, + "learning_rate": 2.335582387718693e-06, + "loss": 2.8721, + "step": 58782 + }, + { + "epoch": 2.88, + "grad_norm": 0.7468572854995728, + "learning_rate": 2.3336653550265037e-06, + "loss": 2.9022, + "step": 58783 + }, + { + "epoch": 2.88, + "grad_norm": 0.7643420100212097, + "learning_rate": 2.331749106330183e-06, + "loss": 3.2001, + "step": 58784 + }, + { + "epoch": 2.88, + "grad_norm": 0.7512932419776917, + "learning_rate": 2.3298336416347263e-06, + "loss": 2.9548, + "step": 58785 + }, + { + "epoch": 2.88, + "grad_norm": 0.7336491942405701, + "learning_rate": 2.3279189609451964e-06, + "loss": 2.8733, + "step": 58786 + }, + { + "epoch": 2.88, + "grad_norm": 0.7166445851325989, + "learning_rate": 2.326005064266656e-06, + "loss": 2.8924, + "step": 58787 + }, + { + "epoch": 2.88, + "grad_norm": 0.8269556760787964, + "learning_rate": 2.3240919516041014e-06, + "loss": 2.9191, + "step": 58788 + }, + { + "epoch": 2.88, + "grad_norm": 0.7374542355537415, + "learning_rate": 2.322179622962628e-06, + "loss": 2.9622, + "step": 58789 + }, + { + "epoch": 2.88, + "grad_norm": 0.7649866342544556, + "learning_rate": 2.3202680783472315e-06, + "loss": 2.8605, + "step": 58790 + }, + { + "epoch": 2.88, + "grad_norm": 0.7545380592346191, + "learning_rate": 2.3183573177629757e-06, + "loss": 2.9829, + "step": 58791 + }, + { + "epoch": 2.88, + "grad_norm": 0.7572214007377625, + "learning_rate": 2.316447341214889e-06, + "loss": 2.9821, + "step": 58792 + }, + { + "epoch": 2.88, + "grad_norm": 0.7295137047767639, + "learning_rate": 2.3145381487079674e-06, + "loss": 2.9972, + "step": 58793 + }, + { + "epoch": 2.88, + "grad_norm": 0.7803434133529663, + "learning_rate": 2.3126297402472404e-06, + "loss": 2.8749, + "step": 58794 + }, + { + "epoch": 2.88, + "grad_norm": 0.7383474707603455, + "learning_rate": 2.310722115837771e-06, + "loss": 2.8434, + "step": 58795 + }, + { + "epoch": 2.88, + "grad_norm": 0.8103850483894348, + "learning_rate": 2.308815275484588e-06, + "loss": 2.4542, + "step": 58796 + }, + { + "epoch": 2.88, + "grad_norm": 0.7786707282066345, + "learning_rate": 2.3069092191926873e-06, + "loss": 3.1745, + "step": 58797 + }, + { + "epoch": 2.88, + "grad_norm": 0.7713927626609802, + "learning_rate": 2.3050039469670656e-06, + "loss": 2.824, + "step": 58798 + }, + { + "epoch": 2.88, + "grad_norm": 0.7706761956214905, + "learning_rate": 2.3030994588128183e-06, + "loss": 2.8247, + "step": 58799 + }, + { + "epoch": 2.88, + "grad_norm": 0.7205535173416138, + "learning_rate": 2.301195754734875e-06, + "loss": 2.8544, + "step": 58800 + }, + { + "epoch": 2.88, + "grad_norm": 0.7274318933486938, + "learning_rate": 2.299292834738298e-06, + "loss": 2.9493, + "step": 58801 + }, + { + "epoch": 2.88, + "grad_norm": 0.7716490030288696, + "learning_rate": 2.2973906988281167e-06, + "loss": 2.9651, + "step": 58802 + }, + { + "epoch": 2.88, + "grad_norm": 0.7678560614585876, + "learning_rate": 2.295489347009294e-06, + "loss": 2.9299, + "step": 58803 + }, + { + "epoch": 2.88, + "grad_norm": 0.7848706245422363, + "learning_rate": 2.2935887792868257e-06, + "loss": 2.8571, + "step": 58804 + }, + { + "epoch": 2.88, + "grad_norm": 0.7634941339492798, + "learning_rate": 2.291688995665808e-06, + "loss": 2.9207, + "step": 58805 + }, + { + "epoch": 2.88, + "grad_norm": 0.7758709788322449, + "learning_rate": 2.2897899961511367e-06, + "loss": 2.8365, + "step": 58806 + }, + { + "epoch": 2.88, + "grad_norm": 0.7400304675102234, + "learning_rate": 2.287891780747908e-06, + "loss": 2.952, + "step": 58807 + }, + { + "epoch": 2.88, + "grad_norm": 0.7655829787254333, + "learning_rate": 2.285994349461051e-06, + "loss": 2.6617, + "step": 58808 + }, + { + "epoch": 2.88, + "grad_norm": 0.738251268863678, + "learning_rate": 2.284097702295562e-06, + "loss": 2.9595, + "step": 58809 + }, + { + "epoch": 2.88, + "grad_norm": 0.7782675623893738, + "learning_rate": 2.2822018392565035e-06, + "loss": 2.7474, + "step": 58810 + }, + { + "epoch": 2.88, + "grad_norm": 0.7073926329612732, + "learning_rate": 2.2803067603488378e-06, + "loss": 2.8823, + "step": 58811 + }, + { + "epoch": 2.88, + "grad_norm": 0.8041476607322693, + "learning_rate": 2.278412465577528e-06, + "loss": 2.7922, + "step": 58812 + }, + { + "epoch": 2.88, + "grad_norm": 0.7299349308013916, + "learning_rate": 2.2765189549475703e-06, + "loss": 3.0036, + "step": 58813 + }, + { + "epoch": 2.88, + "grad_norm": 0.7464435696601868, + "learning_rate": 2.2746262284639607e-06, + "loss": 2.8143, + "step": 58814 + }, + { + "epoch": 2.88, + "grad_norm": 0.7710967659950256, + "learning_rate": 2.2727342861317275e-06, + "loss": 2.8977, + "step": 58815 + }, + { + "epoch": 2.88, + "grad_norm": 0.7730777263641357, + "learning_rate": 2.270843127955768e-06, + "loss": 2.773, + "step": 58816 + }, + { + "epoch": 2.88, + "grad_norm": 0.7543210983276367, + "learning_rate": 2.2689527539411445e-06, + "loss": 2.9257, + "step": 58817 + }, + { + "epoch": 2.88, + "grad_norm": 0.7473542094230652, + "learning_rate": 2.2670631640927527e-06, + "loss": 2.7792, + "step": 58818 + }, + { + "epoch": 2.88, + "grad_norm": 0.7523601055145264, + "learning_rate": 2.2651743584156555e-06, + "loss": 2.9197, + "step": 58819 + }, + { + "epoch": 2.88, + "grad_norm": 0.750109851360321, + "learning_rate": 2.2632863369147824e-06, + "loss": 3.0966, + "step": 58820 + }, + { + "epoch": 2.88, + "grad_norm": 0.7598704099655151, + "learning_rate": 2.261399099595129e-06, + "loss": 2.9957, + "step": 58821 + }, + { + "epoch": 2.88, + "grad_norm": 0.7527625560760498, + "learning_rate": 2.2595126464616254e-06, + "loss": 2.9311, + "step": 58822 + }, + { + "epoch": 2.88, + "grad_norm": 0.7135601043701172, + "learning_rate": 2.257626977519267e-06, + "loss": 2.9232, + "step": 58823 + }, + { + "epoch": 2.88, + "grad_norm": 0.7807445526123047, + "learning_rate": 2.2557420927730163e-06, + "loss": 2.9763, + "step": 58824 + }, + { + "epoch": 2.88, + "grad_norm": 0.7743086814880371, + "learning_rate": 2.2538579922278364e-06, + "loss": 3.0103, + "step": 58825 + }, + { + "epoch": 2.88, + "grad_norm": 0.7185844779014587, + "learning_rate": 2.2519746758886904e-06, + "loss": 2.9725, + "step": 58826 + }, + { + "epoch": 2.88, + "grad_norm": 0.7576574683189392, + "learning_rate": 2.2500921437605736e-06, + "loss": 2.8631, + "step": 58827 + }, + { + "epoch": 2.88, + "grad_norm": 0.7718368172645569, + "learning_rate": 2.2482103958483823e-06, + "loss": 3.0295, + "step": 58828 + }, + { + "epoch": 2.88, + "grad_norm": 0.7165724635124207, + "learning_rate": 2.246329432157079e-06, + "loss": 2.7588, + "step": 58829 + }, + { + "epoch": 2.88, + "grad_norm": 0.7765059471130371, + "learning_rate": 2.2444492526916935e-06, + "loss": 2.832, + "step": 58830 + }, + { + "epoch": 2.88, + "grad_norm": 0.7383131980895996, + "learning_rate": 2.2425698574570884e-06, + "loss": 2.8534, + "step": 58831 + }, + { + "epoch": 2.88, + "grad_norm": 0.7530897259712219, + "learning_rate": 2.2406912464582594e-06, + "loss": 2.5967, + "step": 58832 + }, + { + "epoch": 2.88, + "grad_norm": 0.7495601773262024, + "learning_rate": 2.238813419700136e-06, + "loss": 2.8738, + "step": 58833 + }, + { + "epoch": 2.88, + "grad_norm": 0.7114120125770569, + "learning_rate": 2.236936377187681e-06, + "loss": 2.8921, + "step": 58834 + }, + { + "epoch": 2.88, + "grad_norm": 0.8221262693405151, + "learning_rate": 2.2350601189258243e-06, + "loss": 2.7301, + "step": 58835 + }, + { + "epoch": 2.88, + "grad_norm": 0.7930930852890015, + "learning_rate": 2.233184644919528e-06, + "loss": 2.8669, + "step": 58836 + }, + { + "epoch": 2.88, + "grad_norm": 0.7594324350357056, + "learning_rate": 2.2313099551737213e-06, + "loss": 2.8567, + "step": 58837 + }, + { + "epoch": 2.88, + "grad_norm": 0.733824610710144, + "learning_rate": 2.2294360496933007e-06, + "loss": 2.9, + "step": 58838 + }, + { + "epoch": 2.88, + "grad_norm": 0.7296682000160217, + "learning_rate": 2.2275629284832618e-06, + "loss": 2.8983, + "step": 58839 + }, + { + "epoch": 2.88, + "grad_norm": 0.7384452819824219, + "learning_rate": 2.2256905915485013e-06, + "loss": 2.8469, + "step": 58840 + }, + { + "epoch": 2.88, + "grad_norm": 0.7113006114959717, + "learning_rate": 2.2238190388939815e-06, + "loss": 2.9981, + "step": 58841 + }, + { + "epoch": 2.88, + "grad_norm": 0.7444003224372864, + "learning_rate": 2.2219482705246317e-06, + "loss": 2.7707, + "step": 58842 + }, + { + "epoch": 2.88, + "grad_norm": 0.7594712972640991, + "learning_rate": 2.220078286445315e-06, + "loss": 2.9082, + "step": 58843 + }, + { + "epoch": 2.88, + "grad_norm": 0.7738532423973083, + "learning_rate": 2.2182090866610604e-06, + "loss": 2.8424, + "step": 58844 + }, + { + "epoch": 2.88, + "grad_norm": 0.7955849170684814, + "learning_rate": 2.2163406711766975e-06, + "loss": 2.9444, + "step": 58845 + }, + { + "epoch": 2.88, + "grad_norm": 0.7212498188018799, + "learning_rate": 2.2144730399971554e-06, + "loss": 2.9951, + "step": 58846 + }, + { + "epoch": 2.88, + "grad_norm": 0.7770746350288391, + "learning_rate": 2.2126061931274307e-06, + "loss": 2.9731, + "step": 58847 + }, + { + "epoch": 2.88, + "grad_norm": 0.7356753945350647, + "learning_rate": 2.210740130572353e-06, + "loss": 2.8022, + "step": 58848 + }, + { + "epoch": 2.88, + "grad_norm": 0.7566166520118713, + "learning_rate": 2.208874852336917e-06, + "loss": 2.7831, + "step": 58849 + }, + { + "epoch": 2.88, + "grad_norm": 0.7645593881607056, + "learning_rate": 2.2070103584259867e-06, + "loss": 2.8821, + "step": 58850 + }, + { + "epoch": 2.88, + "grad_norm": 0.729720413684845, + "learning_rate": 2.2051466488444246e-06, + "loss": 3.0565, + "step": 58851 + }, + { + "epoch": 2.88, + "grad_norm": 0.7721166610717773, + "learning_rate": 2.20328372359726e-06, + "loss": 2.9014, + "step": 58852 + }, + { + "epoch": 2.88, + "grad_norm": 0.7474196553230286, + "learning_rate": 2.201421582689289e-06, + "loss": 2.9263, + "step": 58853 + }, + { + "epoch": 2.88, + "grad_norm": 0.7505802512168884, + "learning_rate": 2.1995602261255074e-06, + "loss": 2.9056, + "step": 58854 + }, + { + "epoch": 2.88, + "grad_norm": 0.7443969249725342, + "learning_rate": 2.1976996539107117e-06, + "loss": 3.0748, + "step": 58855 + }, + { + "epoch": 2.88, + "grad_norm": 0.7555038332939148, + "learning_rate": 2.1958398660498976e-06, + "loss": 2.9408, + "step": 58856 + }, + { + "epoch": 2.88, + "grad_norm": 0.7873604893684387, + "learning_rate": 2.193980862547928e-06, + "loss": 2.9474, + "step": 58857 + }, + { + "epoch": 2.88, + "grad_norm": 0.7833068370819092, + "learning_rate": 2.1921226434096995e-06, + "loss": 2.9086, + "step": 58858 + }, + { + "epoch": 2.88, + "grad_norm": 0.7554247379302979, + "learning_rate": 2.1902652086401075e-06, + "loss": 2.8782, + "step": 58859 + }, + { + "epoch": 2.88, + "grad_norm": 0.7960243821144104, + "learning_rate": 2.188408558244048e-06, + "loss": 2.8816, + "step": 58860 + }, + { + "epoch": 2.88, + "grad_norm": 0.8111169338226318, + "learning_rate": 2.1865526922263842e-06, + "loss": 2.7859, + "step": 58861 + }, + { + "epoch": 2.88, + "grad_norm": 0.800476610660553, + "learning_rate": 2.1846976105920456e-06, + "loss": 2.8443, + "step": 58862 + }, + { + "epoch": 2.88, + "grad_norm": 0.7481632828712463, + "learning_rate": 2.1828433133458613e-06, + "loss": 2.7816, + "step": 58863 + }, + { + "epoch": 2.88, + "grad_norm": 0.7516404986381531, + "learning_rate": 2.180989800492794e-06, + "loss": 2.7836, + "step": 58864 + }, + { + "epoch": 2.88, + "grad_norm": 0.7697651982307434, + "learning_rate": 2.1791370720376733e-06, + "loss": 2.772, + "step": 58865 + }, + { + "epoch": 2.88, + "grad_norm": 0.7722952365875244, + "learning_rate": 2.177285127985362e-06, + "loss": 2.7993, + "step": 58866 + }, + { + "epoch": 2.88, + "grad_norm": 0.7531412839889526, + "learning_rate": 2.1754339683407897e-06, + "loss": 2.6921, + "step": 58867 + }, + { + "epoch": 2.88, + "grad_norm": 0.7838771939277649, + "learning_rate": 2.1735835931088187e-06, + "loss": 3.0806, + "step": 58868 + }, + { + "epoch": 2.89, + "grad_norm": 0.7424795031547546, + "learning_rate": 2.1717340022942785e-06, + "loss": 2.8725, + "step": 58869 + }, + { + "epoch": 2.89, + "grad_norm": 0.6980968713760376, + "learning_rate": 2.169885195902099e-06, + "loss": 2.9858, + "step": 58870 + }, + { + "epoch": 2.89, + "grad_norm": 0.7613954544067383, + "learning_rate": 2.1680371739371093e-06, + "loss": 3.0112, + "step": 58871 + }, + { + "epoch": 2.89, + "grad_norm": 0.7577490210533142, + "learning_rate": 2.166189936404206e-06, + "loss": 3.0586, + "step": 58872 + }, + { + "epoch": 2.89, + "grad_norm": 0.7090437412261963, + "learning_rate": 2.1643434833082176e-06, + "loss": 2.9531, + "step": 58873 + }, + { + "epoch": 2.89, + "grad_norm": 0.7910329699516296, + "learning_rate": 2.1624978146540407e-06, + "loss": 2.6705, + "step": 58874 + }, + { + "epoch": 2.89, + "grad_norm": 0.7528069019317627, + "learning_rate": 2.1606529304465046e-06, + "loss": 3.0284, + "step": 58875 + }, + { + "epoch": 2.89, + "grad_norm": 0.759083092212677, + "learning_rate": 2.158808830690506e-06, + "loss": 2.9894, + "step": 58876 + }, + { + "epoch": 2.89, + "grad_norm": 0.764187753200531, + "learning_rate": 2.156965515390907e-06, + "loss": 2.8765, + "step": 58877 + }, + { + "epoch": 2.89, + "grad_norm": 0.7952934503555298, + "learning_rate": 2.155122984552504e-06, + "loss": 2.8857, + "step": 58878 + }, + { + "epoch": 2.89, + "grad_norm": 0.751152753829956, + "learning_rate": 2.1532812381801932e-06, + "loss": 2.8625, + "step": 58879 + }, + { + "epoch": 2.89, + "grad_norm": 0.7216384410858154, + "learning_rate": 2.1514402762788376e-06, + "loss": 3.1293, + "step": 58880 + }, + { + "epoch": 2.89, + "grad_norm": 0.7260209321975708, + "learning_rate": 2.1496000988532327e-06, + "loss": 2.8408, + "step": 58881 + }, + { + "epoch": 2.89, + "grad_norm": 0.756483793258667, + "learning_rate": 2.147760705908308e-06, + "loss": 2.9624, + "step": 58882 + }, + { + "epoch": 2.89, + "grad_norm": 0.8064687848091125, + "learning_rate": 2.1459220974487933e-06, + "loss": 2.9032, + "step": 58883 + }, + { + "epoch": 2.89, + "grad_norm": 0.8111887574195862, + "learning_rate": 2.1440842734796514e-06, + "loss": 2.8218, + "step": 58884 + }, + { + "epoch": 2.89, + "grad_norm": 0.7237271070480347, + "learning_rate": 2.142247234005645e-06, + "loss": 2.879, + "step": 58885 + }, + { + "epoch": 2.89, + "grad_norm": 0.699209451675415, + "learning_rate": 2.140410979031637e-06, + "loss": 2.7129, + "step": 58886 + }, + { + "epoch": 2.89, + "grad_norm": 0.7380729913711548, + "learning_rate": 2.13857550856249e-06, + "loss": 2.9703, + "step": 58887 + }, + { + "epoch": 2.89, + "grad_norm": 0.7376232147216797, + "learning_rate": 2.136740822602967e-06, + "loss": 2.9186, + "step": 58888 + }, + { + "epoch": 2.89, + "grad_norm": 0.7537232637405396, + "learning_rate": 2.1349069211579637e-06, + "loss": 3.0193, + "step": 58889 + }, + { + "epoch": 2.89, + "grad_norm": 0.7629387378692627, + "learning_rate": 2.133073804232277e-06, + "loss": 2.8616, + "step": 58890 + }, + { + "epoch": 2.89, + "grad_norm": 0.7387884259223938, + "learning_rate": 2.1312414718307693e-06, + "loss": 2.8796, + "step": 58891 + }, + { + "epoch": 2.89, + "grad_norm": 0.701157808303833, + "learning_rate": 2.129409923958236e-06, + "loss": 2.829, + "step": 58892 + }, + { + "epoch": 2.89, + "grad_norm": 0.8402901291847229, + "learning_rate": 2.127579160619508e-06, + "loss": 2.6893, + "step": 58893 + }, + { + "epoch": 2.89, + "grad_norm": 0.732913076877594, + "learning_rate": 2.125749181819414e-06, + "loss": 2.7791, + "step": 58894 + }, + { + "epoch": 2.89, + "grad_norm": 0.7669530510902405, + "learning_rate": 2.1239199875627834e-06, + "loss": 2.8068, + "step": 58895 + }, + { + "epoch": 2.89, + "grad_norm": 0.7334358096122742, + "learning_rate": 2.122091577854379e-06, + "loss": 2.9818, + "step": 58896 + }, + { + "epoch": 2.89, + "grad_norm": 0.785017192363739, + "learning_rate": 2.1202639526990973e-06, + "loss": 2.8513, + "step": 58897 + }, + { + "epoch": 2.89, + "grad_norm": 0.7877793312072754, + "learning_rate": 2.1184371121017006e-06, + "loss": 2.8978, + "step": 58898 + }, + { + "epoch": 2.89, + "grad_norm": 0.7523209452629089, + "learning_rate": 2.116611056067019e-06, + "loss": 2.6516, + "step": 58899 + }, + { + "epoch": 2.89, + "grad_norm": 0.7709547877311707, + "learning_rate": 2.114785784599815e-06, + "loss": 2.7821, + "step": 58900 + }, + { + "epoch": 2.89, + "grad_norm": 0.7097203731536865, + "learning_rate": 2.1129612977049847e-06, + "loss": 2.9419, + "step": 58901 + }, + { + "epoch": 2.89, + "grad_norm": 0.7327682375907898, + "learning_rate": 2.1111375953872576e-06, + "loss": 2.8539, + "step": 58902 + }, + { + "epoch": 2.89, + "grad_norm": 0.7576892375946045, + "learning_rate": 2.1093146776514637e-06, + "loss": 2.9477, + "step": 58903 + }, + { + "epoch": 2.89, + "grad_norm": 0.7579607367515564, + "learning_rate": 2.1074925445024314e-06, + "loss": 2.7468, + "step": 58904 + }, + { + "epoch": 2.89, + "grad_norm": 0.7820842862129211, + "learning_rate": 2.1056711959449248e-06, + "loss": 2.9472, + "step": 58905 + }, + { + "epoch": 2.89, + "grad_norm": 0.7186954021453857, + "learning_rate": 2.1038506319837056e-06, + "loss": 2.9416, + "step": 58906 + }, + { + "epoch": 2.89, + "grad_norm": 0.740797758102417, + "learning_rate": 2.102030852623671e-06, + "loss": 3.0116, + "step": 58907 + }, + { + "epoch": 2.89, + "grad_norm": 0.7477077841758728, + "learning_rate": 2.100211857869549e-06, + "loss": 3.0106, + "step": 58908 + }, + { + "epoch": 2.89, + "grad_norm": 0.7214314341545105, + "learning_rate": 2.098393647726104e-06, + "loss": 2.8677, + "step": 58909 + }, + { + "epoch": 2.89, + "grad_norm": 0.7203336358070374, + "learning_rate": 2.0965762221981654e-06, + "loss": 3.0648, + "step": 58910 + }, + { + "epoch": 2.89, + "grad_norm": 0.7489813566207886, + "learning_rate": 2.094759581290528e-06, + "loss": 2.7633, + "step": 58911 + }, + { + "epoch": 2.89, + "grad_norm": 0.7301415801048279, + "learning_rate": 2.092943725007956e-06, + "loss": 2.837, + "step": 58912 + }, + { + "epoch": 2.89, + "grad_norm": 0.8440554141998291, + "learning_rate": 2.091128653355245e-06, + "loss": 2.796, + "step": 58913 + }, + { + "epoch": 2.89, + "grad_norm": 0.7419830560684204, + "learning_rate": 2.089314366337158e-06, + "loss": 2.8407, + "step": 58914 + }, + { + "epoch": 2.89, + "grad_norm": 0.7549934387207031, + "learning_rate": 2.0875008639584912e-06, + "loss": 3.0267, + "step": 58915 + }, + { + "epoch": 2.89, + "grad_norm": 0.7184531092643738, + "learning_rate": 2.085688146224007e-06, + "loss": 2.6725, + "step": 58916 + }, + { + "epoch": 2.89, + "grad_norm": 0.7466804385185242, + "learning_rate": 2.0838762131384688e-06, + "loss": 2.9272, + "step": 58917 + }, + { + "epoch": 2.89, + "grad_norm": 0.7794800400733948, + "learning_rate": 2.0820650647066726e-06, + "loss": 2.7772, + "step": 58918 + }, + { + "epoch": 2.89, + "grad_norm": 0.7532921433448792, + "learning_rate": 2.0802547009333814e-06, + "loss": 3.0, + "step": 58919 + }, + { + "epoch": 2.89, + "grad_norm": 0.7411693334579468, + "learning_rate": 2.078445121823358e-06, + "loss": 3.0414, + "step": 58920 + }, + { + "epoch": 2.89, + "grad_norm": 0.7391944527626038, + "learning_rate": 2.076636327381398e-06, + "loss": 2.8976, + "step": 58921 + }, + { + "epoch": 2.89, + "grad_norm": 0.7387174963951111, + "learning_rate": 2.0748283176122317e-06, + "loss": 3.035, + "step": 58922 + }, + { + "epoch": 2.89, + "grad_norm": 0.7118051052093506, + "learning_rate": 2.0730210925206213e-06, + "loss": 2.8181, + "step": 58923 + }, + { + "epoch": 2.89, + "grad_norm": 0.7901133894920349, + "learning_rate": 2.0712146521113636e-06, + "loss": 3.0142, + "step": 58924 + }, + { + "epoch": 2.89, + "grad_norm": 0.7497255206108093, + "learning_rate": 2.0694089963891545e-06, + "loss": 2.7844, + "step": 58925 + }, + { + "epoch": 2.89, + "grad_norm": 0.7415169477462769, + "learning_rate": 2.067604125358824e-06, + "loss": 2.911, + "step": 58926 + }, + { + "epoch": 2.89, + "grad_norm": 0.749427855014801, + "learning_rate": 2.065800039025034e-06, + "loss": 2.9933, + "step": 58927 + }, + { + "epoch": 2.89, + "grad_norm": 0.7294812202453613, + "learning_rate": 2.0639967373926146e-06, + "loss": 2.869, + "step": 58928 + }, + { + "epoch": 2.89, + "grad_norm": 0.7828423380851746, + "learning_rate": 2.0621942204662956e-06, + "loss": 2.7197, + "step": 58929 + }, + { + "epoch": 2.89, + "grad_norm": 0.7263489365577698, + "learning_rate": 2.060392488250806e-06, + "loss": 2.85, + "step": 58930 + }, + { + "epoch": 2.89, + "grad_norm": 0.7711845636367798, + "learning_rate": 2.0585915407509424e-06, + "loss": 3.1033, + "step": 58931 + }, + { + "epoch": 2.89, + "grad_norm": 0.7376527190208435, + "learning_rate": 2.0567913779713675e-06, + "loss": 2.7261, + "step": 58932 + }, + { + "epoch": 2.89, + "grad_norm": 0.7345303893089294, + "learning_rate": 2.0549919999168442e-06, + "loss": 2.7508, + "step": 58933 + }, + { + "epoch": 2.89, + "grad_norm": 0.7842707633972168, + "learning_rate": 2.0531934065921685e-06, + "loss": 2.9854, + "step": 58934 + }, + { + "epoch": 2.89, + "grad_norm": 0.7627545595169067, + "learning_rate": 2.051395598002037e-06, + "loss": 2.6343, + "step": 58935 + }, + { + "epoch": 2.89, + "grad_norm": 0.7491629719734192, + "learning_rate": 2.049598574151179e-06, + "loss": 2.8958, + "step": 58936 + }, + { + "epoch": 2.89, + "grad_norm": 0.7489625811576843, + "learning_rate": 2.047802335044324e-06, + "loss": 2.8476, + "step": 58937 + }, + { + "epoch": 2.89, + "grad_norm": 0.7320138812065125, + "learning_rate": 2.046006880686235e-06, + "loss": 2.9481, + "step": 58938 + }, + { + "epoch": 2.89, + "grad_norm": 0.7232726812362671, + "learning_rate": 2.0442122110816084e-06, + "loss": 2.924, + "step": 58939 + }, + { + "epoch": 2.89, + "grad_norm": 0.7281123399734497, + "learning_rate": 2.0424183262352066e-06, + "loss": 2.9472, + "step": 58940 + }, + { + "epoch": 2.89, + "grad_norm": 0.7523177266120911, + "learning_rate": 2.0406252261517263e-06, + "loss": 2.8302, + "step": 58941 + }, + { + "epoch": 2.89, + "grad_norm": 0.738097608089447, + "learning_rate": 2.0388329108358636e-06, + "loss": 2.8233, + "step": 58942 + }, + { + "epoch": 2.89, + "grad_norm": 0.727954089641571, + "learning_rate": 2.0370413802923813e-06, + "loss": 2.9305, + "step": 58943 + }, + { + "epoch": 2.89, + "grad_norm": 0.7692335844039917, + "learning_rate": 2.035250634526009e-06, + "loss": 2.9345, + "step": 58944 + }, + { + "epoch": 2.89, + "grad_norm": 0.7667322754859924, + "learning_rate": 2.0334606735414426e-06, + "loss": 2.9262, + "step": 58945 + }, + { + "epoch": 2.89, + "grad_norm": 0.8380236625671387, + "learning_rate": 2.0316714973434124e-06, + "loss": 3.0058, + "step": 58946 + }, + { + "epoch": 2.89, + "grad_norm": 0.768031120300293, + "learning_rate": 2.0298831059365807e-06, + "loss": 3.2082, + "step": 58947 + }, + { + "epoch": 2.89, + "grad_norm": 0.773243248462677, + "learning_rate": 2.028095499325677e-06, + "loss": 2.9019, + "step": 58948 + }, + { + "epoch": 2.89, + "grad_norm": 0.7496214509010315, + "learning_rate": 2.026308677515465e-06, + "loss": 2.9999, + "step": 58949 + }, + { + "epoch": 2.89, + "grad_norm": 0.7690075635910034, + "learning_rate": 2.024522640510573e-06, + "loss": 2.6612, + "step": 58950 + }, + { + "epoch": 2.89, + "grad_norm": 0.8003986477851868, + "learning_rate": 2.022737388315765e-06, + "loss": 2.8753, + "step": 58951 + }, + { + "epoch": 2.89, + "grad_norm": 0.7260475158691406, + "learning_rate": 2.020952920935737e-06, + "loss": 3.0065, + "step": 58952 + }, + { + "epoch": 2.89, + "grad_norm": 0.7463489770889282, + "learning_rate": 2.0191692383751514e-06, + "loss": 3.0693, + "step": 58953 + }, + { + "epoch": 2.89, + "grad_norm": 0.7311463356018066, + "learning_rate": 2.0173863406387047e-06, + "loss": 2.9125, + "step": 58954 + }, + { + "epoch": 2.89, + "grad_norm": 0.8149186372756958, + "learning_rate": 2.0156042277311267e-06, + "loss": 3.1956, + "step": 58955 + }, + { + "epoch": 2.89, + "grad_norm": 0.7813419103622437, + "learning_rate": 2.0138228996571136e-06, + "loss": 2.8011, + "step": 58956 + }, + { + "epoch": 2.89, + "grad_norm": 0.7746787071228027, + "learning_rate": 2.0120423564213285e-06, + "loss": 2.9801, + "step": 58957 + }, + { + "epoch": 2.89, + "grad_norm": 0.8212281465530396, + "learning_rate": 2.0102625980285002e-06, + "loss": 2.8556, + "step": 58958 + }, + { + "epoch": 2.89, + "grad_norm": 0.7449041604995728, + "learning_rate": 2.008483624483259e-06, + "loss": 3.0733, + "step": 58959 + }, + { + "epoch": 2.89, + "grad_norm": 0.7603235840797424, + "learning_rate": 2.006705435790368e-06, + "loss": 2.9102, + "step": 58960 + }, + { + "epoch": 2.89, + "grad_norm": 0.7589277625083923, + "learning_rate": 2.0049280319544225e-06, + "loss": 2.7227, + "step": 58961 + }, + { + "epoch": 2.89, + "grad_norm": 0.752943754196167, + "learning_rate": 2.0031514129801527e-06, + "loss": 2.7721, + "step": 58962 + }, + { + "epoch": 2.89, + "grad_norm": 0.768645703792572, + "learning_rate": 2.0013755788722217e-06, + "loss": 2.7773, + "step": 58963 + }, + { + "epoch": 2.89, + "grad_norm": 0.7890374660491943, + "learning_rate": 1.999600529635359e-06, + "loss": 2.7951, + "step": 58964 + }, + { + "epoch": 2.89, + "grad_norm": 0.7113425135612488, + "learning_rate": 1.9978262652741605e-06, + "loss": 2.9277, + "step": 58965 + }, + { + "epoch": 2.89, + "grad_norm": 0.797177255153656, + "learning_rate": 1.996052785793323e-06, + "loss": 2.67, + "step": 58966 + }, + { + "epoch": 2.89, + "grad_norm": 0.7570541501045227, + "learning_rate": 1.9942800911975755e-06, + "loss": 2.9775, + "step": 58967 + }, + { + "epoch": 2.89, + "grad_norm": 0.8025187849998474, + "learning_rate": 1.9925081814915145e-06, + "loss": 2.9072, + "step": 58968 + }, + { + "epoch": 2.89, + "grad_norm": 0.7739835381507874, + "learning_rate": 1.9907370566798363e-06, + "loss": 2.955, + "step": 58969 + }, + { + "epoch": 2.89, + "grad_norm": 0.7543403506278992, + "learning_rate": 1.9889667167672042e-06, + "loss": 2.8965, + "step": 58970 + }, + { + "epoch": 2.89, + "grad_norm": 0.8121760487556458, + "learning_rate": 1.9871971617582804e-06, + "loss": 2.8277, + "step": 58971 + }, + { + "epoch": 2.89, + "grad_norm": 0.8004959225654602, + "learning_rate": 1.9854283916577285e-06, + "loss": 2.7487, + "step": 58972 + }, + { + "epoch": 2.89, + "grad_norm": 0.7454361915588379, + "learning_rate": 1.983660406470211e-06, + "loss": 2.7616, + "step": 58973 + }, + { + "epoch": 2.89, + "grad_norm": 0.7641414999961853, + "learning_rate": 1.9818932062003578e-06, + "loss": 2.9533, + "step": 58974 + }, + { + "epoch": 2.89, + "grad_norm": 0.7749798893928528, + "learning_rate": 1.980126790852865e-06, + "loss": 2.5778, + "step": 58975 + }, + { + "epoch": 2.89, + "grad_norm": 0.7524572610855103, + "learning_rate": 1.978361160432362e-06, + "loss": 2.9732, + "step": 58976 + }, + { + "epoch": 2.89, + "grad_norm": 0.7702212929725647, + "learning_rate": 1.9765963149434793e-06, + "loss": 2.9104, + "step": 58977 + }, + { + "epoch": 2.89, + "grad_norm": 0.802010715007782, + "learning_rate": 1.974832254390879e-06, + "loss": 2.927, + "step": 58978 + }, + { + "epoch": 2.89, + "grad_norm": 0.7283293604850769, + "learning_rate": 1.9730689787792573e-06, + "loss": 3.0435, + "step": 58979 + }, + { + "epoch": 2.89, + "grad_norm": 0.7269130349159241, + "learning_rate": 1.971306488113178e-06, + "loss": 2.8377, + "step": 58980 + }, + { + "epoch": 2.89, + "grad_norm": 0.7806401252746582, + "learning_rate": 1.9695447823973364e-06, + "loss": 2.7224, + "step": 58981 + }, + { + "epoch": 2.89, + "grad_norm": 0.7446566224098206, + "learning_rate": 1.967783861636363e-06, + "loss": 2.9339, + "step": 58982 + }, + { + "epoch": 2.89, + "grad_norm": 0.8244256377220154, + "learning_rate": 1.9660237258348533e-06, + "loss": 2.8912, + "step": 58983 + }, + { + "epoch": 2.89, + "grad_norm": 0.7497897148132324, + "learning_rate": 1.9642643749975373e-06, + "loss": 2.8148, + "step": 58984 + }, + { + "epoch": 2.89, + "grad_norm": 0.8421869874000549, + "learning_rate": 1.9625058091289447e-06, + "loss": 3.0151, + "step": 58985 + }, + { + "epoch": 2.89, + "grad_norm": 0.7372134327888489, + "learning_rate": 1.9607480282337716e-06, + "loss": 2.8176, + "step": 58986 + }, + { + "epoch": 2.89, + "grad_norm": 0.7009263634681702, + "learning_rate": 1.958991032316615e-06, + "loss": 2.819, + "step": 58987 + }, + { + "epoch": 2.89, + "grad_norm": 0.7520594000816345, + "learning_rate": 1.9572348213821365e-06, + "loss": 2.8624, + "step": 58988 + }, + { + "epoch": 2.89, + "grad_norm": 0.7330684661865234, + "learning_rate": 1.9554793954349333e-06, + "loss": 2.7898, + "step": 58989 + }, + { + "epoch": 2.89, + "grad_norm": 0.7308124303817749, + "learning_rate": 1.953724754479635e-06, + "loss": 2.8689, + "step": 58990 + }, + { + "epoch": 2.89, + "grad_norm": 0.7374723553657532, + "learning_rate": 1.9519708985208717e-06, + "loss": 2.7598, + "step": 58991 + }, + { + "epoch": 2.89, + "grad_norm": 0.7027939558029175, + "learning_rate": 1.9502178275632718e-06, + "loss": 2.9557, + "step": 58992 + }, + { + "epoch": 2.89, + "grad_norm": 0.7598909139633179, + "learning_rate": 1.9484655416113993e-06, + "loss": 2.9631, + "step": 58993 + }, + { + "epoch": 2.89, + "grad_norm": 0.7432723641395569, + "learning_rate": 1.94671404066995e-06, + "loss": 3.0194, + "step": 58994 + }, + { + "epoch": 2.89, + "grad_norm": 0.7426236867904663, + "learning_rate": 1.944963324743454e-06, + "loss": 2.9044, + "step": 58995 + }, + { + "epoch": 2.89, + "grad_norm": 0.7190657258033752, + "learning_rate": 1.943213393836607e-06, + "loss": 2.4641, + "step": 58996 + }, + { + "epoch": 2.89, + "grad_norm": 0.7499857544898987, + "learning_rate": 1.9414642479539386e-06, + "loss": 3.0042, + "step": 58997 + }, + { + "epoch": 2.89, + "grad_norm": 0.7538277506828308, + "learning_rate": 1.939715887100113e-06, + "loss": 3.1177, + "step": 58998 + }, + { + "epoch": 2.89, + "grad_norm": 0.7844749093055725, + "learning_rate": 1.937968311279692e-06, + "loss": 2.7512, + "step": 58999 + }, + { + "epoch": 2.89, + "grad_norm": 0.7818543314933777, + "learning_rate": 1.9362215204972718e-06, + "loss": 2.9125, + "step": 59000 + }, + { + "epoch": 2.89, + "grad_norm": 0.7285245656967163, + "learning_rate": 1.9344755147575496e-06, + "loss": 2.6752, + "step": 59001 + }, + { + "epoch": 2.89, + "grad_norm": 0.721545934677124, + "learning_rate": 1.932730294064988e-06, + "loss": 2.9226, + "step": 59002 + }, + { + "epoch": 2.89, + "grad_norm": 0.7440168857574463, + "learning_rate": 1.9309858584242834e-06, + "loss": 2.9274, + "step": 59003 + }, + { + "epoch": 2.89, + "grad_norm": 0.7558407783508301, + "learning_rate": 1.929242207840032e-06, + "loss": 2.8321, + "step": 59004 + }, + { + "epoch": 2.89, + "grad_norm": 0.7752707600593567, + "learning_rate": 1.9274993423167295e-06, + "loss": 2.9114, + "step": 59005 + }, + { + "epoch": 2.89, + "grad_norm": 0.7502283453941345, + "learning_rate": 1.9257572618590732e-06, + "loss": 2.8542, + "step": 59006 + }, + { + "epoch": 2.89, + "grad_norm": 0.7198836207389832, + "learning_rate": 1.9240159664715926e-06, + "loss": 2.6527, + "step": 59007 + }, + { + "epoch": 2.89, + "grad_norm": 0.7290161848068237, + "learning_rate": 1.9222754561588836e-06, + "loss": 2.8487, + "step": 59008 + }, + { + "epoch": 2.89, + "grad_norm": 0.7256250381469727, + "learning_rate": 1.920535730925543e-06, + "loss": 2.6648, + "step": 59009 + }, + { + "epoch": 2.89, + "grad_norm": 0.7327529191970825, + "learning_rate": 1.9187967907761335e-06, + "loss": 2.9445, + "step": 59010 + }, + { + "epoch": 2.89, + "grad_norm": 0.7599136233329773, + "learning_rate": 1.9170586357152518e-06, + "loss": 2.9671, + "step": 59011 + }, + { + "epoch": 2.89, + "grad_norm": 0.7350993752479553, + "learning_rate": 1.9153212657474605e-06, + "loss": 2.881, + "step": 59012 + }, + { + "epoch": 2.89, + "grad_norm": 0.7257763147354126, + "learning_rate": 1.913584680877356e-06, + "loss": 2.8071, + "step": 59013 + }, + { + "epoch": 2.89, + "grad_norm": 0.7590259909629822, + "learning_rate": 1.9118488811095013e-06, + "loss": 2.8561, + "step": 59014 + }, + { + "epoch": 2.89, + "grad_norm": 0.7977140545845032, + "learning_rate": 1.9101138664484593e-06, + "loss": 2.8963, + "step": 59015 + }, + { + "epoch": 2.89, + "grad_norm": 0.734058141708374, + "learning_rate": 1.908379636898827e-06, + "loss": 2.9871, + "step": 59016 + }, + { + "epoch": 2.89, + "grad_norm": 0.7733717560768127, + "learning_rate": 1.9066461924651665e-06, + "loss": 2.7593, + "step": 59017 + }, + { + "epoch": 2.89, + "grad_norm": 0.7138202786445618, + "learning_rate": 1.9049135331520081e-06, + "loss": 2.9411, + "step": 59018 + }, + { + "epoch": 2.89, + "grad_norm": 0.7705768346786499, + "learning_rate": 1.9031816589639148e-06, + "loss": 2.9631, + "step": 59019 + }, + { + "epoch": 2.89, + "grad_norm": 0.7369682192802429, + "learning_rate": 1.9014505699054827e-06, + "loss": 2.8449, + "step": 59020 + }, + { + "epoch": 2.89, + "grad_norm": 0.7815206050872803, + "learning_rate": 1.8997202659812749e-06, + "loss": 2.7891, + "step": 59021 + }, + { + "epoch": 2.89, + "grad_norm": 0.7516481280326843, + "learning_rate": 1.8979907471958212e-06, + "loss": 2.9268, + "step": 59022 + }, + { + "epoch": 2.89, + "grad_norm": 0.8083335161209106, + "learning_rate": 1.8962620135537176e-06, + "loss": 3.0219, + "step": 59023 + }, + { + "epoch": 2.89, + "grad_norm": 0.7490785717964172, + "learning_rate": 1.894534065059461e-06, + "loss": 2.9312, + "step": 59024 + }, + { + "epoch": 2.89, + "grad_norm": 0.6919969916343689, + "learning_rate": 1.8928069017176472e-06, + "loss": 2.8647, + "step": 59025 + }, + { + "epoch": 2.89, + "grad_norm": 0.7737188339233398, + "learning_rate": 1.8910805235328063e-06, + "loss": 2.7855, + "step": 59026 + }, + { + "epoch": 2.89, + "grad_norm": 0.7691394686698914, + "learning_rate": 1.889354930509468e-06, + "loss": 3.0189, + "step": 59027 + }, + { + "epoch": 2.89, + "grad_norm": 0.7691125273704529, + "learning_rate": 1.8876301226522284e-06, + "loss": 2.7889, + "step": 59028 + }, + { + "epoch": 2.89, + "grad_norm": 0.7242297530174255, + "learning_rate": 1.8859060999655507e-06, + "loss": 2.8643, + "step": 59029 + }, + { + "epoch": 2.89, + "grad_norm": 0.7567639946937561, + "learning_rate": 1.8841828624540645e-06, + "loss": 2.7662, + "step": 59030 + }, + { + "epoch": 2.89, + "grad_norm": 0.7616976499557495, + "learning_rate": 1.882460410122233e-06, + "loss": 2.828, + "step": 59031 + }, + { + "epoch": 2.89, + "grad_norm": 0.7366723418235779, + "learning_rate": 1.8807387429746524e-06, + "loss": 2.8366, + "step": 59032 + }, + { + "epoch": 2.89, + "grad_norm": 0.7073797583580017, + "learning_rate": 1.8790178610158525e-06, + "loss": 2.9282, + "step": 59033 + }, + { + "epoch": 2.89, + "grad_norm": 0.7266165614128113, + "learning_rate": 1.8772977642502963e-06, + "loss": 3.0461, + "step": 59034 + }, + { + "epoch": 2.89, + "grad_norm": 0.7223436832427979, + "learning_rate": 1.87557845268258e-06, + "loss": 2.7378, + "step": 59035 + }, + { + "epoch": 2.89, + "grad_norm": 0.8283471465110779, + "learning_rate": 1.8738599263172338e-06, + "loss": 2.844, + "step": 59036 + }, + { + "epoch": 2.89, + "grad_norm": 0.7933211326599121, + "learning_rate": 1.8721421851587537e-06, + "loss": 2.9484, + "step": 59037 + }, + { + "epoch": 2.89, + "grad_norm": 0.7600369453430176, + "learning_rate": 1.8704252292116696e-06, + "loss": 2.9175, + "step": 59038 + }, + { + "epoch": 2.89, + "grad_norm": 0.788422703742981, + "learning_rate": 1.8687090584805108e-06, + "loss": 2.7721, + "step": 59039 + }, + { + "epoch": 2.89, + "grad_norm": 0.7650947570800781, + "learning_rate": 1.8669936729698076e-06, + "loss": 2.7601, + "step": 59040 + }, + { + "epoch": 2.89, + "grad_norm": 0.748820960521698, + "learning_rate": 1.865279072684056e-06, + "loss": 2.8145, + "step": 59041 + }, + { + "epoch": 2.89, + "grad_norm": 0.7527621388435364, + "learning_rate": 1.8635652576277527e-06, + "loss": 2.9823, + "step": 59042 + }, + { + "epoch": 2.89, + "grad_norm": 0.7002708315849304, + "learning_rate": 1.8618522278054938e-06, + "loss": 2.8998, + "step": 59043 + }, + { + "epoch": 2.89, + "grad_norm": 0.7159339785575867, + "learning_rate": 1.860139983221709e-06, + "loss": 2.6089, + "step": 59044 + }, + { + "epoch": 2.89, + "grad_norm": 0.737927258014679, + "learning_rate": 1.8584285238809281e-06, + "loss": 2.9601, + "step": 59045 + }, + { + "epoch": 2.89, + "grad_norm": 0.7862527370452881, + "learning_rate": 1.8567178497876811e-06, + "loss": 2.7792, + "step": 59046 + }, + { + "epoch": 2.89, + "grad_norm": 0.7486521005630493, + "learning_rate": 1.8550079609464974e-06, + "loss": 2.7664, + "step": 59047 + }, + { + "epoch": 2.89, + "grad_norm": 0.7823449969291687, + "learning_rate": 1.8532988573618068e-06, + "loss": 2.8471, + "step": 59048 + }, + { + "epoch": 2.89, + "grad_norm": 0.7296319603919983, + "learning_rate": 1.8515905390381392e-06, + "loss": 2.7969, + "step": 59049 + }, + { + "epoch": 2.89, + "grad_norm": 0.8135117888450623, + "learning_rate": 1.8498830059799908e-06, + "loss": 2.8586, + "step": 59050 + }, + { + "epoch": 2.89, + "grad_norm": 0.6989290714263916, + "learning_rate": 1.8481762581918913e-06, + "loss": 2.6872, + "step": 59051 + }, + { + "epoch": 2.89, + "grad_norm": 0.7613128423690796, + "learning_rate": 1.8464702956783372e-06, + "loss": 2.995, + "step": 59052 + }, + { + "epoch": 2.89, + "grad_norm": 0.721194863319397, + "learning_rate": 1.8447651184437584e-06, + "loss": 2.7919, + "step": 59053 + }, + { + "epoch": 2.89, + "grad_norm": 0.726525068283081, + "learning_rate": 1.8430607264927178e-06, + "loss": 2.972, + "step": 59054 + }, + { + "epoch": 2.89, + "grad_norm": 0.7626053690910339, + "learning_rate": 1.8413571198296784e-06, + "loss": 2.8219, + "step": 59055 + }, + { + "epoch": 2.89, + "grad_norm": 0.7047833204269409, + "learning_rate": 1.8396542984591035e-06, + "loss": 2.7473, + "step": 59056 + }, + { + "epoch": 2.89, + "grad_norm": 0.7405434846878052, + "learning_rate": 1.8379522623854892e-06, + "loss": 2.7802, + "step": 59057 + }, + { + "epoch": 2.89, + "grad_norm": 0.7458202242851257, + "learning_rate": 1.8362510116133323e-06, + "loss": 3.0058, + "step": 59058 + }, + { + "epoch": 2.89, + "grad_norm": 0.7674630880355835, + "learning_rate": 1.8345505461471287e-06, + "loss": 2.9578, + "step": 59059 + }, + { + "epoch": 2.89, + "grad_norm": 0.7461906671524048, + "learning_rate": 1.8328508659913087e-06, + "loss": 2.8953, + "step": 59060 + }, + { + "epoch": 2.89, + "grad_norm": 0.745392382144928, + "learning_rate": 1.831151971150402e-06, + "loss": 2.8253, + "step": 59061 + }, + { + "epoch": 2.89, + "grad_norm": 0.7358865141868591, + "learning_rate": 1.829453861628838e-06, + "loss": 2.9244, + "step": 59062 + }, + { + "epoch": 2.89, + "grad_norm": 0.7252581715583801, + "learning_rate": 1.8277565374311132e-06, + "loss": 3.0891, + "step": 59063 + }, + { + "epoch": 2.89, + "grad_norm": 0.7202037572860718, + "learning_rate": 1.8260599985616908e-06, + "loss": 2.5241, + "step": 59064 + }, + { + "epoch": 2.89, + "grad_norm": 0.7681000232696533, + "learning_rate": 1.824364245025034e-06, + "loss": 2.9659, + "step": 59065 + }, + { + "epoch": 2.89, + "grad_norm": 0.7250438928604126, + "learning_rate": 1.822669276825639e-06, + "loss": 2.7523, + "step": 59066 + }, + { + "epoch": 2.89, + "grad_norm": 0.7999386787414551, + "learning_rate": 1.8209750939679357e-06, + "loss": 2.7248, + "step": 59067 + }, + { + "epoch": 2.89, + "grad_norm": 0.8048508167266846, + "learning_rate": 1.8192816964563873e-06, + "loss": 2.6697, + "step": 59068 + }, + { + "epoch": 2.89, + "grad_norm": 0.7457453012466431, + "learning_rate": 1.81758908429549e-06, + "loss": 2.8159, + "step": 59069 + }, + { + "epoch": 2.89, + "grad_norm": 0.7637097239494324, + "learning_rate": 1.8158972574896403e-06, + "loss": 2.748, + "step": 59070 + }, + { + "epoch": 2.89, + "grad_norm": 0.7343689799308777, + "learning_rate": 1.814206216043368e-06, + "loss": 2.8942, + "step": 59071 + }, + { + "epoch": 2.89, + "grad_norm": 0.7599355578422546, + "learning_rate": 1.8125159599610694e-06, + "loss": 2.9799, + "step": 59072 + }, + { + "epoch": 2.9, + "grad_norm": 0.7494969964027405, + "learning_rate": 1.8108264892472412e-06, + "loss": 2.8893, + "step": 59073 + }, + { + "epoch": 2.9, + "grad_norm": 0.7399177551269531, + "learning_rate": 1.80913780390628e-06, + "loss": 2.8731, + "step": 59074 + }, + { + "epoch": 2.9, + "grad_norm": 0.7720394134521484, + "learning_rate": 1.8074499039426815e-06, + "loss": 2.7274, + "step": 59075 + }, + { + "epoch": 2.9, + "grad_norm": 0.7690413594245911, + "learning_rate": 1.8057627893608428e-06, + "loss": 2.7685, + "step": 59076 + }, + { + "epoch": 2.9, + "grad_norm": 0.7422636151313782, + "learning_rate": 1.8040764601652601e-06, + "loss": 2.884, + "step": 59077 + }, + { + "epoch": 2.9, + "grad_norm": 0.7094586491584778, + "learning_rate": 1.8023909163603634e-06, + "loss": 2.8035, + "step": 59078 + }, + { + "epoch": 2.9, + "grad_norm": 0.7637078166007996, + "learning_rate": 1.8007061579505488e-06, + "loss": 2.9295, + "step": 59079 + }, + { + "epoch": 2.9, + "grad_norm": 0.7474511861801147, + "learning_rate": 1.7990221849402797e-06, + "loss": 2.9187, + "step": 59080 + }, + { + "epoch": 2.9, + "grad_norm": 0.7161756157875061, + "learning_rate": 1.797338997334019e-06, + "loss": 2.7149, + "step": 59081 + }, + { + "epoch": 2.9, + "grad_norm": 0.7648656368255615, + "learning_rate": 1.7956565951361635e-06, + "loss": 2.979, + "step": 59082 + }, + { + "epoch": 2.9, + "grad_norm": 0.8284847736358643, + "learning_rate": 1.7939749783511758e-06, + "loss": 3.0267, + "step": 59083 + }, + { + "epoch": 2.9, + "grad_norm": 0.7770487070083618, + "learning_rate": 1.7922941469834528e-06, + "loss": 2.8505, + "step": 59084 + }, + { + "epoch": 2.9, + "grad_norm": 0.7366689443588257, + "learning_rate": 1.7906141010374576e-06, + "loss": 2.9935, + "step": 59085 + }, + { + "epoch": 2.9, + "grad_norm": 0.7799773812294006, + "learning_rate": 1.788934840517553e-06, + "loss": 2.8051, + "step": 59086 + }, + { + "epoch": 2.9, + "grad_norm": 0.7727737426757812, + "learning_rate": 1.7872563654282357e-06, + "loss": 3.0461, + "step": 59087 + }, + { + "epoch": 2.9, + "grad_norm": 0.7719646692276001, + "learning_rate": 1.7855786757738687e-06, + "loss": 2.8494, + "step": 59088 + }, + { + "epoch": 2.9, + "grad_norm": 0.7487027049064636, + "learning_rate": 1.7839017715589155e-06, + "loss": 2.9247, + "step": 59089 + }, + { + "epoch": 2.9, + "grad_norm": 0.7525886297225952, + "learning_rate": 1.7822256527877721e-06, + "loss": 2.9167, + "step": 59090 + }, + { + "epoch": 2.9, + "grad_norm": 0.7553564310073853, + "learning_rate": 1.7805503194648684e-06, + "loss": 2.7595, + "step": 59091 + }, + { + "epoch": 2.9, + "grad_norm": 0.8537721633911133, + "learning_rate": 1.778875771594568e-06, + "loss": 2.9898, + "step": 59092 + }, + { + "epoch": 2.9, + "grad_norm": 0.7500604391098022, + "learning_rate": 1.7772020091813333e-06, + "loss": 2.9763, + "step": 59093 + }, + { + "epoch": 2.9, + "grad_norm": 0.7806376814842224, + "learning_rate": 1.775529032229528e-06, + "loss": 2.7487, + "step": 59094 + }, + { + "epoch": 2.9, + "grad_norm": 0.7601209282875061, + "learning_rate": 1.7738568407436149e-06, + "loss": 2.9419, + "step": 59095 + }, + { + "epoch": 2.9, + "grad_norm": 0.7778134346008301, + "learning_rate": 1.7721854347279573e-06, + "loss": 2.723, + "step": 59096 + }, + { + "epoch": 2.9, + "grad_norm": 0.759236216545105, + "learning_rate": 1.7705148141869853e-06, + "loss": 3.0486, + "step": 59097 + }, + { + "epoch": 2.9, + "grad_norm": 0.8189762234687805, + "learning_rate": 1.7688449791250614e-06, + "loss": 2.8238, + "step": 59098 + }, + { + "epoch": 2.9, + "grad_norm": 0.7347763776779175, + "learning_rate": 1.7671759295465826e-06, + "loss": 2.8377, + "step": 59099 + }, + { + "epoch": 2.9, + "grad_norm": 0.7606347799301147, + "learning_rate": 1.7655076654560119e-06, + "loss": 2.7916, + "step": 59100 + }, + { + "epoch": 2.9, + "grad_norm": 0.7474910020828247, + "learning_rate": 1.7638401868576791e-06, + "loss": 2.9414, + "step": 59101 + }, + { + "epoch": 2.9, + "grad_norm": 0.7264478206634521, + "learning_rate": 1.7621734937559806e-06, + "loss": 2.8281, + "step": 59102 + }, + { + "epoch": 2.9, + "grad_norm": 0.7830918431282043, + "learning_rate": 1.7605075861553464e-06, + "loss": 2.9221, + "step": 59103 + }, + { + "epoch": 2.9, + "grad_norm": 0.78126060962677, + "learning_rate": 1.7588424640601062e-06, + "loss": 2.9048, + "step": 59104 + }, + { + "epoch": 2.9, + "grad_norm": 0.7438468933105469, + "learning_rate": 1.7571781274746898e-06, + "loss": 2.7915, + "step": 59105 + }, + { + "epoch": 2.9, + "grad_norm": 0.8499040007591248, + "learning_rate": 1.755514576403494e-06, + "loss": 2.8397, + "step": 59106 + }, + { + "epoch": 2.9, + "grad_norm": 0.7694287300109863, + "learning_rate": 1.7538518108508481e-06, + "loss": 2.8528, + "step": 59107 + }, + { + "epoch": 2.9, + "grad_norm": 0.7242769598960876, + "learning_rate": 1.752189830821149e-06, + "loss": 2.9493, + "step": 59108 + }, + { + "epoch": 2.9, + "grad_norm": 0.7706876397132874, + "learning_rate": 1.750528636318793e-06, + "loss": 2.7071, + "step": 59109 + }, + { + "epoch": 2.9, + "grad_norm": 0.7799972295761108, + "learning_rate": 1.7488682273481769e-06, + "loss": 2.9786, + "step": 59110 + }, + { + "epoch": 2.9, + "grad_norm": 0.7658340930938721, + "learning_rate": 1.7472086039135968e-06, + "loss": 2.991, + "step": 59111 + }, + { + "epoch": 2.9, + "grad_norm": 0.7521490454673767, + "learning_rate": 1.7455497660194828e-06, + "loss": 2.8154, + "step": 59112 + }, + { + "epoch": 2.9, + "grad_norm": 0.7272855639457703, + "learning_rate": 1.7438917136702312e-06, + "loss": 2.84, + "step": 59113 + }, + { + "epoch": 2.9, + "grad_norm": 0.7962722182273865, + "learning_rate": 1.7422344468701055e-06, + "loss": 3.0468, + "step": 59114 + }, + { + "epoch": 2.9, + "grad_norm": 0.770363986492157, + "learning_rate": 1.7405779656235686e-06, + "loss": 2.8827, + "step": 59115 + }, + { + "epoch": 2.9, + "grad_norm": 0.7188537120819092, + "learning_rate": 1.7389222699349503e-06, + "loss": 2.8618, + "step": 59116 + }, + { + "epoch": 2.9, + "grad_norm": 0.7084076404571533, + "learning_rate": 1.7372673598085808e-06, + "loss": 2.8814, + "step": 59117 + }, + { + "epoch": 2.9, + "grad_norm": 0.7412644028663635, + "learning_rate": 1.7356132352488562e-06, + "loss": 2.8781, + "step": 59118 + }, + { + "epoch": 2.9, + "grad_norm": 0.7647395730018616, + "learning_rate": 1.733959896260173e-06, + "loss": 2.8254, + "step": 59119 + }, + { + "epoch": 2.9, + "grad_norm": 0.7721149921417236, + "learning_rate": 1.7323073428467948e-06, + "loss": 2.8524, + "step": 59120 + }, + { + "epoch": 2.9, + "grad_norm": 0.7480499148368835, + "learning_rate": 1.730655575013118e-06, + "loss": 2.8018, + "step": 59121 + }, + { + "epoch": 2.9, + "grad_norm": 0.7373523116111755, + "learning_rate": 1.7290045927634721e-06, + "loss": 2.8485, + "step": 59122 + }, + { + "epoch": 2.9, + "grad_norm": 0.7831674218177795, + "learning_rate": 1.727354396102254e-06, + "loss": 3.0131, + "step": 59123 + }, + { + "epoch": 2.9, + "grad_norm": 0.7761872410774231, + "learning_rate": 1.72570498503376e-06, + "loss": 2.8869, + "step": 59124 + }, + { + "epoch": 2.9, + "grad_norm": 0.7596009969711304, + "learning_rate": 1.7240563595623868e-06, + "loss": 2.6972, + "step": 59125 + }, + { + "epoch": 2.9, + "grad_norm": 0.734153151512146, + "learning_rate": 1.7224085196923977e-06, + "loss": 2.7864, + "step": 59126 + }, + { + "epoch": 2.9, + "grad_norm": 0.7314983606338501, + "learning_rate": 1.720761465428222e-06, + "loss": 2.9053, + "step": 59127 + }, + { + "epoch": 2.9, + "grad_norm": 0.7744563817977905, + "learning_rate": 1.7191151967741568e-06, + "loss": 2.905, + "step": 59128 + }, + { + "epoch": 2.9, + "grad_norm": 0.7810060381889343, + "learning_rate": 1.7174697137344983e-06, + "loss": 2.9127, + "step": 59129 + }, + { + "epoch": 2.9, + "grad_norm": 0.7507235407829285, + "learning_rate": 1.7158250163136432e-06, + "loss": 2.8359, + "step": 59130 + }, + { + "epoch": 2.9, + "grad_norm": 0.7610126733779907, + "learning_rate": 1.7141811045158883e-06, + "loss": 2.6502, + "step": 59131 + }, + { + "epoch": 2.9, + "grad_norm": 0.7299541234970093, + "learning_rate": 1.712537978345596e-06, + "loss": 2.7764, + "step": 59132 + }, + { + "epoch": 2.9, + "grad_norm": 0.805625319480896, + "learning_rate": 1.7108956378070637e-06, + "loss": 2.8168, + "step": 59133 + }, + { + "epoch": 2.9, + "grad_norm": 0.7253496646881104, + "learning_rate": 1.7092540829046208e-06, + "loss": 2.8982, + "step": 59134 + }, + { + "epoch": 2.9, + "grad_norm": 0.8161607384681702, + "learning_rate": 1.7076133136425974e-06, + "loss": 2.846, + "step": 59135 + }, + { + "epoch": 2.9, + "grad_norm": 0.7335495948791504, + "learning_rate": 1.7059733300253232e-06, + "loss": 2.9378, + "step": 59136 + }, + { + "epoch": 2.9, + "grad_norm": 0.7281621098518372, + "learning_rate": 1.7043341320570613e-06, + "loss": 2.8579, + "step": 59137 + }, + { + "epoch": 2.9, + "grad_norm": 0.7644967436790466, + "learning_rate": 1.702695719742242e-06, + "loss": 2.8762, + "step": 59138 + }, + { + "epoch": 2.9, + "grad_norm": 0.7520266771316528, + "learning_rate": 1.7010580930850614e-06, + "loss": 2.8353, + "step": 59139 + }, + { + "epoch": 2.9, + "grad_norm": 0.7870677709579468, + "learning_rate": 1.6994212520899164e-06, + "loss": 2.9208, + "step": 59140 + }, + { + "epoch": 2.9, + "grad_norm": 0.7785607576370239, + "learning_rate": 1.69778519676107e-06, + "loss": 3.0277, + "step": 59141 + }, + { + "epoch": 2.9, + "grad_norm": 0.7652883529663086, + "learning_rate": 1.6961499271028521e-06, + "loss": 2.9603, + "step": 59142 + }, + { + "epoch": 2.9, + "grad_norm": 0.7231030464172363, + "learning_rate": 1.6945154431195596e-06, + "loss": 2.5437, + "step": 59143 + }, + { + "epoch": 2.9, + "grad_norm": 0.7375117540359497, + "learning_rate": 1.6928817448155218e-06, + "loss": 2.6275, + "step": 59144 + }, + { + "epoch": 2.9, + "grad_norm": 0.7867879867553711, + "learning_rate": 1.6912488321950023e-06, + "loss": 2.8344, + "step": 59145 + }, + { + "epoch": 2.9, + "grad_norm": 0.8134239912033081, + "learning_rate": 1.689616705262331e-06, + "loss": 2.8047, + "step": 59146 + }, + { + "epoch": 2.9, + "grad_norm": 0.7559427618980408, + "learning_rate": 1.6879853640218044e-06, + "loss": 2.6349, + "step": 59147 + }, + { + "epoch": 2.9, + "grad_norm": 0.7431520223617554, + "learning_rate": 1.686354808477719e-06, + "loss": 2.9635, + "step": 59148 + }, + { + "epoch": 2.9, + "grad_norm": 0.7744369506835938, + "learning_rate": 1.684725038634338e-06, + "loss": 2.972, + "step": 59149 + }, + { + "epoch": 2.9, + "grad_norm": 0.769906222820282, + "learning_rate": 1.6830960544959915e-06, + "loss": 2.7403, + "step": 59150 + }, + { + "epoch": 2.9, + "grad_norm": 0.772413432598114, + "learning_rate": 1.681467856066976e-06, + "loss": 2.9986, + "step": 59151 + }, + { + "epoch": 2.9, + "grad_norm": 0.7344859838485718, + "learning_rate": 1.6798404433515545e-06, + "loss": 2.8915, + "step": 59152 + }, + { + "epoch": 2.9, + "grad_norm": 0.7600609660148621, + "learning_rate": 1.6782138163539903e-06, + "loss": 2.6796, + "step": 59153 + }, + { + "epoch": 2.9, + "grad_norm": 0.7428922653198242, + "learning_rate": 1.6765879750786138e-06, + "loss": 3.1002, + "step": 59154 + }, + { + "epoch": 2.9, + "grad_norm": 0.7847473621368408, + "learning_rate": 1.6749629195296877e-06, + "loss": 2.7684, + "step": 59155 + }, + { + "epoch": 2.9, + "grad_norm": 0.7154970169067383, + "learning_rate": 1.6733386497115086e-06, + "loss": 2.9555, + "step": 59156 + }, + { + "epoch": 2.9, + "grad_norm": 0.8147268295288086, + "learning_rate": 1.6717151656283734e-06, + "loss": 2.8499, + "step": 59157 + }, + { + "epoch": 2.9, + "grad_norm": 0.7564356923103333, + "learning_rate": 1.6700924672844783e-06, + "loss": 2.8353, + "step": 59158 + }, + { + "epoch": 2.9, + "grad_norm": 0.7662482261657715, + "learning_rate": 1.6684705546841537e-06, + "loss": 2.9374, + "step": 59159 + }, + { + "epoch": 2.9, + "grad_norm": 0.7386744022369385, + "learning_rate": 1.6668494278316624e-06, + "loss": 2.749, + "step": 59160 + }, + { + "epoch": 2.9, + "grad_norm": 0.7575583457946777, + "learning_rate": 1.665229086731268e-06, + "loss": 3.0314, + "step": 59161 + }, + { + "epoch": 2.9, + "grad_norm": 0.7565872073173523, + "learning_rate": 1.6636095313872666e-06, + "loss": 2.854, + "step": 59162 + }, + { + "epoch": 2.9, + "grad_norm": 0.7207484245300293, + "learning_rate": 1.6619907618038887e-06, + "loss": 2.8848, + "step": 59163 + }, + { + "epoch": 2.9, + "grad_norm": 0.7588822245597839, + "learning_rate": 1.660372777985397e-06, + "loss": 2.7857, + "step": 59164 + }, + { + "epoch": 2.9, + "grad_norm": 0.7523000836372375, + "learning_rate": 1.6587555799360885e-06, + "loss": 2.9389, + "step": 59165 + }, + { + "epoch": 2.9, + "grad_norm": 0.7837392687797546, + "learning_rate": 1.657139167660193e-06, + "loss": 2.7966, + "step": 59166 + }, + { + "epoch": 2.9, + "grad_norm": 0.7007728815078735, + "learning_rate": 1.655523541161974e-06, + "loss": 2.8785, + "step": 59167 + }, + { + "epoch": 2.9, + "grad_norm": 0.7474599480628967, + "learning_rate": 1.653908700445694e-06, + "loss": 2.8779, + "step": 59168 + }, + { + "epoch": 2.9, + "grad_norm": 0.8092578649520874, + "learning_rate": 1.6522946455155839e-06, + "loss": 2.7932, + "step": 59169 + }, + { + "epoch": 2.9, + "grad_norm": 0.7371110916137695, + "learning_rate": 1.6506813763759064e-06, + "loss": 2.8497, + "step": 59170 + }, + { + "epoch": 2.9, + "grad_norm": 0.7597226500511169, + "learning_rate": 1.6490688930309581e-06, + "loss": 2.9094, + "step": 59171 + }, + { + "epoch": 2.9, + "grad_norm": 0.7393887639045715, + "learning_rate": 1.6474571954849358e-06, + "loss": 2.8139, + "step": 59172 + }, + { + "epoch": 2.9, + "grad_norm": 0.7919648885726929, + "learning_rate": 1.6458462837420693e-06, + "loss": 2.8186, + "step": 59173 + }, + { + "epoch": 2.9, + "grad_norm": 0.7349705696105957, + "learning_rate": 1.6442361578066221e-06, + "loss": 2.7144, + "step": 59174 + }, + { + "epoch": 2.9, + "grad_norm": 0.7724583745002747, + "learning_rate": 1.6426268176828239e-06, + "loss": 2.7673, + "step": 59175 + }, + { + "epoch": 2.9, + "grad_norm": 0.7804393172264099, + "learning_rate": 1.641018263374938e-06, + "loss": 2.8634, + "step": 59176 + }, + { + "epoch": 2.9, + "grad_norm": 0.8103668689727783, + "learning_rate": 1.6394104948872277e-06, + "loss": 2.7951, + "step": 59177 + }, + { + "epoch": 2.9, + "grad_norm": 0.7489713430404663, + "learning_rate": 1.6378035122238565e-06, + "loss": 3.0172, + "step": 59178 + }, + { + "epoch": 2.9, + "grad_norm": 0.7497350573539734, + "learning_rate": 1.6361973153890873e-06, + "loss": 2.7531, + "step": 59179 + }, + { + "epoch": 2.9, + "grad_norm": 0.7449610829353333, + "learning_rate": 1.6345919043871835e-06, + "loss": 2.9164, + "step": 59180 + }, + { + "epoch": 2.9, + "grad_norm": 0.7574251294136047, + "learning_rate": 1.6329872792223087e-06, + "loss": 2.7664, + "step": 59181 + }, + { + "epoch": 2.9, + "grad_norm": 0.7449432611465454, + "learning_rate": 1.6313834398987258e-06, + "loss": 2.9572, + "step": 59182 + }, + { + "epoch": 2.9, + "grad_norm": 0.7493433356285095, + "learning_rate": 1.629780386420665e-06, + "loss": 2.8492, + "step": 59183 + }, + { + "epoch": 2.9, + "grad_norm": 0.721378743648529, + "learning_rate": 1.6281781187923226e-06, + "loss": 2.8584, + "step": 59184 + }, + { + "epoch": 2.9, + "grad_norm": 0.705966055393219, + "learning_rate": 1.6265766370179623e-06, + "loss": 2.7012, + "step": 59185 + }, + { + "epoch": 2.9, + "grad_norm": 0.7675456404685974, + "learning_rate": 1.6249759411017805e-06, + "loss": 2.7796, + "step": 59186 + }, + { + "epoch": 2.9, + "grad_norm": 0.7658663988113403, + "learning_rate": 1.623376031048007e-06, + "loss": 2.6216, + "step": 59187 + }, + { + "epoch": 2.9, + "grad_norm": 0.7195037603378296, + "learning_rate": 1.6217769068607722e-06, + "loss": 2.9578, + "step": 59188 + }, + { + "epoch": 2.9, + "grad_norm": 0.7412412762641907, + "learning_rate": 1.6201785685444058e-06, + "loss": 2.7394, + "step": 59189 + }, + { + "epoch": 2.9, + "grad_norm": 0.7388929724693298, + "learning_rate": 1.6185810161030378e-06, + "loss": 2.9017, + "step": 59190 + }, + { + "epoch": 2.9, + "grad_norm": 0.8015137910842896, + "learning_rate": 1.616984249540898e-06, + "loss": 2.7832, + "step": 59191 + }, + { + "epoch": 2.9, + "grad_norm": 0.733881413936615, + "learning_rate": 1.6153882688622167e-06, + "loss": 2.8523, + "step": 59192 + }, + { + "epoch": 2.9, + "grad_norm": 0.7632375359535217, + "learning_rate": 1.6137930740711901e-06, + "loss": 2.7231, + "step": 59193 + }, + { + "epoch": 2.9, + "grad_norm": 0.7348890900611877, + "learning_rate": 1.6121986651719821e-06, + "loss": 2.8821, + "step": 59194 + }, + { + "epoch": 2.9, + "grad_norm": 0.729511559009552, + "learning_rate": 1.610605042168822e-06, + "loss": 2.9414, + "step": 59195 + }, + { + "epoch": 2.9, + "grad_norm": 0.7740534543991089, + "learning_rate": 1.609012205065907e-06, + "loss": 2.7154, + "step": 59196 + }, + { + "epoch": 2.9, + "grad_norm": 0.7699583768844604, + "learning_rate": 1.6074201538674335e-06, + "loss": 2.8821, + "step": 59197 + }, + { + "epoch": 2.9, + "grad_norm": 0.7296841144561768, + "learning_rate": 1.605828888577565e-06, + "loss": 2.6342, + "step": 59198 + }, + { + "epoch": 2.9, + "grad_norm": 0.7484259009361267, + "learning_rate": 1.6042384092005312e-06, + "loss": 2.9587, + "step": 59199 + }, + { + "epoch": 2.9, + "grad_norm": 0.7556095719337463, + "learning_rate": 1.6026487157404954e-06, + "loss": 2.8836, + "step": 59200 + }, + { + "epoch": 2.9, + "grad_norm": 0.7266550064086914, + "learning_rate": 1.6010598082016545e-06, + "loss": 2.7881, + "step": 59201 + }, + { + "epoch": 2.9, + "grad_norm": 0.745608925819397, + "learning_rate": 1.5994716865882384e-06, + "loss": 3.12, + "step": 59202 + }, + { + "epoch": 2.9, + "grad_norm": 0.7120503783226013, + "learning_rate": 1.5978843509043103e-06, + "loss": 3.1001, + "step": 59203 + }, + { + "epoch": 2.9, + "grad_norm": 0.7117788195610046, + "learning_rate": 1.5962978011541672e-06, + "loss": 3.027, + "step": 59204 + }, + { + "epoch": 2.9, + "grad_norm": 0.7518680691719055, + "learning_rate": 1.5947120373419719e-06, + "loss": 2.94, + "step": 59205 + }, + { + "epoch": 2.9, + "grad_norm": 0.731158435344696, + "learning_rate": 1.5931270594718215e-06, + "loss": 2.7666, + "step": 59206 + }, + { + "epoch": 2.9, + "grad_norm": 0.7914332151412964, + "learning_rate": 1.5915428675479792e-06, + "loss": 2.8187, + "step": 59207 + }, + { + "epoch": 2.9, + "grad_norm": 0.7586977481842041, + "learning_rate": 1.5899594615746081e-06, + "loss": 2.8927, + "step": 59208 + }, + { + "epoch": 2.9, + "grad_norm": 0.7375707030296326, + "learning_rate": 1.588376841555805e-06, + "loss": 2.7874, + "step": 59209 + }, + { + "epoch": 2.9, + "grad_norm": 0.7598499655723572, + "learning_rate": 1.5867950074958001e-06, + "loss": 2.8557, + "step": 59210 + }, + { + "epoch": 2.9, + "grad_norm": 0.759164571762085, + "learning_rate": 1.5852139593987566e-06, + "loss": 2.8819, + "step": 59211 + }, + { + "epoch": 2.9, + "grad_norm": 0.7925102710723877, + "learning_rate": 1.583633697268838e-06, + "loss": 2.7934, + "step": 59212 + }, + { + "epoch": 2.9, + "grad_norm": 0.7286711931228638, + "learning_rate": 1.582054221110174e-06, + "loss": 2.9716, + "step": 59213 + }, + { + "epoch": 2.9, + "grad_norm": 0.7402360439300537, + "learning_rate": 1.5804755309269612e-06, + "loss": 2.8322, + "step": 59214 + }, + { + "epoch": 2.9, + "grad_norm": 0.7518147230148315, + "learning_rate": 1.57889762672333e-06, + "loss": 2.8124, + "step": 59215 + }, + { + "epoch": 2.9, + "grad_norm": 0.7689666748046875, + "learning_rate": 1.5773205085034435e-06, + "loss": 2.7527, + "step": 59216 + }, + { + "epoch": 2.9, + "grad_norm": 0.746090829372406, + "learning_rate": 1.5757441762714984e-06, + "loss": 2.9052, + "step": 59217 + }, + { + "epoch": 2.9, + "grad_norm": 0.7587335109710693, + "learning_rate": 1.5741686300315581e-06, + "loss": 2.7332, + "step": 59218 + }, + { + "epoch": 2.9, + "grad_norm": 0.7321277260780334, + "learning_rate": 1.5725938697878525e-06, + "loss": 3.0402, + "step": 59219 + }, + { + "epoch": 2.9, + "grad_norm": 0.774876058101654, + "learning_rate": 1.5710198955445119e-06, + "loss": 2.9392, + "step": 59220 + }, + { + "epoch": 2.9, + "grad_norm": 0.7118234634399414, + "learning_rate": 1.5694467073056661e-06, + "loss": 2.7393, + "step": 59221 + }, + { + "epoch": 2.9, + "grad_norm": 0.8065067529678345, + "learning_rate": 1.5678743050754784e-06, + "loss": 2.8755, + "step": 59222 + }, + { + "epoch": 2.9, + "grad_norm": 0.815037190914154, + "learning_rate": 1.5663026888580454e-06, + "loss": 2.7749, + "step": 59223 + }, + { + "epoch": 2.9, + "grad_norm": 0.7758423089981079, + "learning_rate": 1.564731858657531e-06, + "loss": 2.9076, + "step": 59224 + }, + { + "epoch": 2.9, + "grad_norm": 0.7859812378883362, + "learning_rate": 1.5631618144780977e-06, + "loss": 3.0618, + "step": 59225 + }, + { + "epoch": 2.9, + "grad_norm": 0.7478824257850647, + "learning_rate": 1.5615925563238763e-06, + "loss": 2.906, + "step": 59226 + }, + { + "epoch": 2.9, + "grad_norm": 0.7796872854232788, + "learning_rate": 1.5600240841989632e-06, + "loss": 2.8319, + "step": 59227 + }, + { + "epoch": 2.9, + "grad_norm": 0.7634370923042297, + "learning_rate": 1.5584563981075215e-06, + "loss": 2.7921, + "step": 59228 + }, + { + "epoch": 2.9, + "grad_norm": 0.7156879901885986, + "learning_rate": 1.5568894980536484e-06, + "loss": 2.8655, + "step": 59229 + }, + { + "epoch": 2.9, + "grad_norm": 0.7343173027038574, + "learning_rate": 1.5553233840415068e-06, + "loss": 3.1111, + "step": 59230 + }, + { + "epoch": 2.9, + "grad_norm": 0.7309796810150146, + "learning_rate": 1.5537580560751938e-06, + "loss": 2.6718, + "step": 59231 + }, + { + "epoch": 2.9, + "grad_norm": 0.8148305416107178, + "learning_rate": 1.5521935141588726e-06, + "loss": 2.8729, + "step": 59232 + }, + { + "epoch": 2.9, + "grad_norm": 0.7874312400817871, + "learning_rate": 1.5506297582966066e-06, + "loss": 2.889, + "step": 59233 + }, + { + "epoch": 2.9, + "grad_norm": 0.7957163453102112, + "learning_rate": 1.549066788492559e-06, + "loss": 2.9847, + "step": 59234 + }, + { + "epoch": 2.9, + "grad_norm": 0.7534181475639343, + "learning_rate": 1.5475046047508265e-06, + "loss": 2.8653, + "step": 59235 + }, + { + "epoch": 2.9, + "grad_norm": 0.7766826748847961, + "learning_rate": 1.5459432070755395e-06, + "loss": 2.8438, + "step": 59236 + }, + { + "epoch": 2.9, + "grad_norm": 0.7522812485694885, + "learning_rate": 1.5443825954707945e-06, + "loss": 2.7315, + "step": 59237 + }, + { + "epoch": 2.9, + "grad_norm": 0.7763208746910095, + "learning_rate": 1.5428227699407213e-06, + "loss": 2.9692, + "step": 59238 + }, + { + "epoch": 2.9, + "grad_norm": 0.773130476474762, + "learning_rate": 1.5412637304893838e-06, + "loss": 2.821, + "step": 59239 + }, + { + "epoch": 2.9, + "grad_norm": 0.7237739562988281, + "learning_rate": 1.539705477120945e-06, + "loss": 2.8297, + "step": 59240 + }, + { + "epoch": 2.9, + "grad_norm": 0.7151420712471008, + "learning_rate": 1.538148009839435e-06, + "loss": 2.758, + "step": 59241 + }, + { + "epoch": 2.9, + "grad_norm": 0.7747194170951843, + "learning_rate": 1.5365913286490506e-06, + "loss": 2.7573, + "step": 59242 + }, + { + "epoch": 2.9, + "grad_norm": 0.7364190220832825, + "learning_rate": 1.5350354335538217e-06, + "loss": 3.0683, + "step": 59243 + }, + { + "epoch": 2.9, + "grad_norm": 0.6902226209640503, + "learning_rate": 1.533480324557912e-06, + "loss": 2.7821, + "step": 59244 + }, + { + "epoch": 2.9, + "grad_norm": 0.7629439234733582, + "learning_rate": 1.5319260016653178e-06, + "loss": 2.8934, + "step": 59245 + }, + { + "epoch": 2.9, + "grad_norm": 0.7819312214851379, + "learning_rate": 1.530372464880203e-06, + "loss": 2.9913, + "step": 59246 + }, + { + "epoch": 2.9, + "grad_norm": 0.7529870867729187, + "learning_rate": 1.5288197142066638e-06, + "loss": 2.7037, + "step": 59247 + }, + { + "epoch": 2.9, + "grad_norm": 0.7887903451919556, + "learning_rate": 1.5272677496487307e-06, + "loss": 2.9547, + "step": 59248 + }, + { + "epoch": 2.9, + "grad_norm": 0.7321487665176392, + "learning_rate": 1.5257165712106e-06, + "loss": 2.9963, + "step": 59249 + }, + { + "epoch": 2.9, + "grad_norm": 0.7392261028289795, + "learning_rate": 1.5241661788962357e-06, + "loss": 2.9604, + "step": 59250 + }, + { + "epoch": 2.9, + "grad_norm": 0.7557159662246704, + "learning_rate": 1.5226165727098005e-06, + "loss": 2.964, + "step": 59251 + }, + { + "epoch": 2.9, + "grad_norm": 0.7541767358779907, + "learning_rate": 1.521067752655325e-06, + "loss": 3.0179, + "step": 59252 + }, + { + "epoch": 2.9, + "grad_norm": 0.7616634368896484, + "learning_rate": 1.519519718736939e-06, + "loss": 2.7647, + "step": 59253 + }, + { + "epoch": 2.9, + "grad_norm": 0.7685297727584839, + "learning_rate": 1.5179724709586727e-06, + "loss": 2.9063, + "step": 59254 + }, + { + "epoch": 2.9, + "grad_norm": 0.7325605154037476, + "learning_rate": 1.5164260093246228e-06, + "loss": 3.0109, + "step": 59255 + }, + { + "epoch": 2.9, + "grad_norm": 0.7943716645240784, + "learning_rate": 1.514880333838886e-06, + "loss": 2.7804, + "step": 59256 + }, + { + "epoch": 2.9, + "grad_norm": 0.7822527885437012, + "learning_rate": 1.5133354445054923e-06, + "loss": 3.128, + "step": 59257 + }, + { + "epoch": 2.9, + "grad_norm": 0.7548781037330627, + "learning_rate": 1.5117913413285388e-06, + "loss": 2.8991, + "step": 59258 + }, + { + "epoch": 2.9, + "grad_norm": 0.769331157207489, + "learning_rate": 1.5102480243120885e-06, + "loss": 3.0113, + "step": 59259 + }, + { + "epoch": 2.9, + "grad_norm": 0.772138237953186, + "learning_rate": 1.5087054934601716e-06, + "loss": 2.8884, + "step": 59260 + }, + { + "epoch": 2.9, + "grad_norm": 0.7810102105140686, + "learning_rate": 1.507163748776885e-06, + "loss": 2.8167, + "step": 59261 + }, + { + "epoch": 2.9, + "grad_norm": 0.7660080194473267, + "learning_rate": 1.505622790266292e-06, + "loss": 2.8325, + "step": 59262 + }, + { + "epoch": 2.9, + "grad_norm": 0.7257230281829834, + "learning_rate": 1.504082617932456e-06, + "loss": 3.0289, + "step": 59263 + }, + { + "epoch": 2.9, + "grad_norm": 0.7466867566108704, + "learning_rate": 1.5025432317794073e-06, + "loss": 2.8704, + "step": 59264 + }, + { + "epoch": 2.9, + "grad_norm": 0.7224690914154053, + "learning_rate": 1.501004631811209e-06, + "loss": 2.7124, + "step": 59265 + }, + { + "epoch": 2.9, + "grad_norm": 0.7552944421768188, + "learning_rate": 1.4994668180319248e-06, + "loss": 3.1567, + "step": 59266 + }, + { + "epoch": 2.9, + "grad_norm": 0.8041907548904419, + "learning_rate": 1.4979297904455844e-06, + "loss": 2.9041, + "step": 59267 + }, + { + "epoch": 2.9, + "grad_norm": 0.7628178596496582, + "learning_rate": 1.4963935490562516e-06, + "loss": 2.8904, + "step": 59268 + }, + { + "epoch": 2.9, + "grad_norm": 0.8382160663604736, + "learning_rate": 1.49485809386799e-06, + "loss": 2.7538, + "step": 59269 + }, + { + "epoch": 2.9, + "grad_norm": 0.7363545298576355, + "learning_rate": 1.4933234248847958e-06, + "loss": 2.8377, + "step": 59270 + }, + { + "epoch": 2.9, + "grad_norm": 0.7144303321838379, + "learning_rate": 1.491789542110766e-06, + "loss": 2.7339, + "step": 59271 + }, + { + "epoch": 2.9, + "grad_norm": 0.8086472749710083, + "learning_rate": 1.4902564455498978e-06, + "loss": 2.9529, + "step": 59272 + }, + { + "epoch": 2.9, + "grad_norm": 0.7652781009674072, + "learning_rate": 1.488724135206254e-06, + "loss": 2.8901, + "step": 59273 + }, + { + "epoch": 2.9, + "grad_norm": 0.7442313432693481, + "learning_rate": 1.487192611083865e-06, + "loss": 3.0013, + "step": 59274 + }, + { + "epoch": 2.9, + "grad_norm": 0.7640436887741089, + "learning_rate": 1.4856618731867277e-06, + "loss": 2.8635, + "step": 59275 + }, + { + "epoch": 2.9, + "grad_norm": 0.7747520208358765, + "learning_rate": 1.4841319215189385e-06, + "loss": 2.9427, + "step": 59276 + }, + { + "epoch": 2.91, + "grad_norm": 0.7614535689353943, + "learning_rate": 1.482602756084461e-06, + "loss": 2.713, + "step": 59277 + }, + { + "epoch": 2.91, + "grad_norm": 0.7723492980003357, + "learning_rate": 1.4810743768873923e-06, + "loss": 2.9073, + "step": 59278 + }, + { + "epoch": 2.91, + "grad_norm": 0.7300555109977722, + "learning_rate": 1.4795467839317288e-06, + "loss": 2.9138, + "step": 59279 + }, + { + "epoch": 2.91, + "grad_norm": 0.7472530603408813, + "learning_rate": 1.4780199772214674e-06, + "loss": 2.831, + "step": 59280 + }, + { + "epoch": 2.91, + "grad_norm": 0.7535228729248047, + "learning_rate": 1.4764939567606382e-06, + "loss": 2.9556, + "step": 59281 + }, + { + "epoch": 2.91, + "grad_norm": 0.7726250886917114, + "learning_rate": 1.4749687225533047e-06, + "loss": 2.7113, + "step": 59282 + }, + { + "epoch": 2.91, + "grad_norm": 0.7266640663146973, + "learning_rate": 1.4734442746034303e-06, + "loss": 2.8155, + "step": 59283 + }, + { + "epoch": 2.91, + "grad_norm": 0.7508488297462463, + "learning_rate": 1.471920612915045e-06, + "loss": 2.8604, + "step": 59284 + }, + { + "epoch": 2.91, + "grad_norm": 0.777144193649292, + "learning_rate": 1.4703977374921794e-06, + "loss": 2.6361, + "step": 59285 + }, + { + "epoch": 2.91, + "grad_norm": 0.7734659910202026, + "learning_rate": 1.4688756483388297e-06, + "loss": 2.8084, + "step": 59286 + }, + { + "epoch": 2.91, + "grad_norm": 0.7497792840003967, + "learning_rate": 1.4673543454590264e-06, + "loss": 2.8345, + "step": 59287 + }, + { + "epoch": 2.91, + "grad_norm": 0.753085732460022, + "learning_rate": 1.4658338288567329e-06, + "loss": 2.9196, + "step": 59288 + }, + { + "epoch": 2.91, + "grad_norm": 0.7959982752799988, + "learning_rate": 1.4643140985359792e-06, + "loss": 2.8826, + "step": 59289 + }, + { + "epoch": 2.91, + "grad_norm": 0.7402199506759644, + "learning_rate": 1.462795154500762e-06, + "loss": 2.8387, + "step": 59290 + }, + { + "epoch": 2.91, + "grad_norm": 0.7226002216339111, + "learning_rate": 1.461276996755112e-06, + "loss": 3.0343, + "step": 59291 + }, + { + "epoch": 2.91, + "grad_norm": 0.7589437365531921, + "learning_rate": 1.459759625302992e-06, + "loss": 2.9562, + "step": 59292 + }, + { + "epoch": 2.91, + "grad_norm": 0.7250106930732727, + "learning_rate": 1.458243040148399e-06, + "loss": 2.8674, + "step": 59293 + }, + { + "epoch": 2.91, + "grad_norm": 0.7297521233558655, + "learning_rate": 1.4567272412953633e-06, + "loss": 3.1333, + "step": 59294 + }, + { + "epoch": 2.91, + "grad_norm": 0.729756236076355, + "learning_rate": 1.4552122287478485e-06, + "loss": 2.7591, + "step": 59295 + }, + { + "epoch": 2.91, + "grad_norm": 0.7715573906898499, + "learning_rate": 1.4536980025098178e-06, + "loss": 3.0325, + "step": 59296 + }, + { + "epoch": 2.91, + "grad_norm": 0.7817413806915283, + "learning_rate": 1.4521845625853347e-06, + "loss": 3.0629, + "step": 59297 + }, + { + "epoch": 2.91, + "grad_norm": 0.7538647651672363, + "learning_rate": 1.4506719089782958e-06, + "loss": 2.9156, + "step": 59298 + }, + { + "epoch": 2.91, + "grad_norm": 0.7621957063674927, + "learning_rate": 1.4491600416927652e-06, + "loss": 3.0193, + "step": 59299 + }, + { + "epoch": 2.91, + "grad_norm": 0.7765817642211914, + "learning_rate": 1.4476489607326725e-06, + "loss": 3.0129, + "step": 59300 + }, + { + "epoch": 2.91, + "grad_norm": 0.7483084797859192, + "learning_rate": 1.4461386661020146e-06, + "loss": 2.8993, + "step": 59301 + }, + { + "epoch": 2.91, + "grad_norm": 0.7720745801925659, + "learning_rate": 1.4446291578047886e-06, + "loss": 3.0589, + "step": 59302 + }, + { + "epoch": 2.91, + "grad_norm": 0.7810670137405396, + "learning_rate": 1.4431204358449244e-06, + "loss": 3.0455, + "step": 59303 + }, + { + "epoch": 2.91, + "grad_norm": 0.7453815937042236, + "learning_rate": 1.441612500226419e-06, + "loss": 2.8832, + "step": 59304 + }, + { + "epoch": 2.91, + "grad_norm": 0.7267022132873535, + "learning_rate": 1.440105350953269e-06, + "loss": 2.8214, + "step": 59305 + }, + { + "epoch": 2.91, + "grad_norm": 0.7542246580123901, + "learning_rate": 1.438598988029438e-06, + "loss": 2.7414, + "step": 59306 + }, + { + "epoch": 2.91, + "grad_norm": 0.7183325290679932, + "learning_rate": 1.4370934114588228e-06, + "loss": 2.7841, + "step": 59307 + }, + { + "epoch": 2.91, + "grad_norm": 0.9062812924385071, + "learning_rate": 1.435588621245487e-06, + "loss": 2.7707, + "step": 59308 + }, + { + "epoch": 2.91, + "grad_norm": 0.7738885283470154, + "learning_rate": 1.4340846173933606e-06, + "loss": 2.9159, + "step": 59309 + }, + { + "epoch": 2.91, + "grad_norm": 0.7519917488098145, + "learning_rate": 1.432581399906374e-06, + "loss": 3.0687, + "step": 59310 + }, + { + "epoch": 2.91, + "grad_norm": 0.7735452055931091, + "learning_rate": 1.4310789687885238e-06, + "loss": 2.6822, + "step": 59311 + }, + { + "epoch": 2.91, + "grad_norm": 0.767966091632843, + "learning_rate": 1.4295773240437068e-06, + "loss": 2.9608, + "step": 59312 + }, + { + "epoch": 2.91, + "grad_norm": 0.7551448941230774, + "learning_rate": 1.4280764656759535e-06, + "loss": 2.9766, + "step": 59313 + }, + { + "epoch": 2.91, + "grad_norm": 0.785728394985199, + "learning_rate": 1.426576393689194e-06, + "loss": 3.004, + "step": 59314 + }, + { + "epoch": 2.91, + "grad_norm": 0.7843324542045593, + "learning_rate": 1.425077108087358e-06, + "loss": 2.6908, + "step": 59315 + }, + { + "epoch": 2.91, + "grad_norm": 0.717927098274231, + "learning_rate": 1.4235786088744094e-06, + "loss": 3.0576, + "step": 59316 + }, + { + "epoch": 2.91, + "grad_norm": 0.7239921689033508, + "learning_rate": 1.4220808960542784e-06, + "loss": 3.0828, + "step": 59317 + }, + { + "epoch": 2.91, + "grad_norm": 0.7748315334320068, + "learning_rate": 1.4205839696309618e-06, + "loss": 2.8963, + "step": 59318 + }, + { + "epoch": 2.91, + "grad_norm": 0.7714270353317261, + "learning_rate": 1.419087829608323e-06, + "loss": 2.9912, + "step": 59319 + }, + { + "epoch": 2.91, + "grad_norm": 0.7787078619003296, + "learning_rate": 1.4175924759903589e-06, + "loss": 2.7653, + "step": 59320 + }, + { + "epoch": 2.91, + "grad_norm": 0.7504279613494873, + "learning_rate": 1.4160979087809999e-06, + "loss": 2.7674, + "step": 59321 + }, + { + "epoch": 2.91, + "grad_norm": 0.7364410161972046, + "learning_rate": 1.4146041279841759e-06, + "loss": 3.0725, + "step": 59322 + }, + { + "epoch": 2.91, + "grad_norm": 0.757007896900177, + "learning_rate": 1.413111133603817e-06, + "loss": 2.8924, + "step": 59323 + }, + { + "epoch": 2.91, + "grad_norm": 0.7655906677246094, + "learning_rate": 1.4116189256438538e-06, + "loss": 2.8821, + "step": 59324 + }, + { + "epoch": 2.91, + "grad_norm": 0.7602041363716125, + "learning_rate": 1.410127504108216e-06, + "loss": 2.6149, + "step": 59325 + }, + { + "epoch": 2.91, + "grad_norm": 0.7541115283966064, + "learning_rate": 1.4086368690008675e-06, + "loss": 2.7603, + "step": 59326 + }, + { + "epoch": 2.91, + "grad_norm": 0.7562124133110046, + "learning_rate": 1.4071470203256718e-06, + "loss": 2.9612, + "step": 59327 + }, + { + "epoch": 2.91, + "grad_norm": 0.7242172956466675, + "learning_rate": 1.4056579580865923e-06, + "loss": 2.708, + "step": 59328 + }, + { + "epoch": 2.91, + "grad_norm": 0.7529643774032593, + "learning_rate": 1.4041696822875592e-06, + "loss": 2.7851, + "step": 59329 + }, + { + "epoch": 2.91, + "grad_norm": 0.6918560266494751, + "learning_rate": 1.4026821929324695e-06, + "loss": 2.9248, + "step": 59330 + }, + { + "epoch": 2.91, + "grad_norm": 0.7533132433891296, + "learning_rate": 1.4011954900252532e-06, + "loss": 2.9044, + "step": 59331 + }, + { + "epoch": 2.91, + "grad_norm": 0.7325230240821838, + "learning_rate": 1.3997095735698071e-06, + "loss": 2.7677, + "step": 59332 + }, + { + "epoch": 2.91, + "grad_norm": 0.6999819278717041, + "learning_rate": 1.3982244435700617e-06, + "loss": 2.7991, + "step": 59333 + }, + { + "epoch": 2.91, + "grad_norm": 0.7240120768547058, + "learning_rate": 1.3967401000299472e-06, + "loss": 2.9905, + "step": 59334 + }, + { + "epoch": 2.91, + "grad_norm": 0.7363443374633789, + "learning_rate": 1.3952565429533268e-06, + "loss": 2.7389, + "step": 59335 + }, + { + "epoch": 2.91, + "grad_norm": 0.7331762909889221, + "learning_rate": 1.3937737723441644e-06, + "loss": 2.6583, + "step": 59336 + }, + { + "epoch": 2.91, + "grad_norm": 0.7030410766601562, + "learning_rate": 1.39229178820629e-06, + "loss": 2.8128, + "step": 59337 + }, + { + "epoch": 2.91, + "grad_norm": 0.7303512692451477, + "learning_rate": 1.3908105905436672e-06, + "loss": 2.8716, + "step": 59338 + }, + { + "epoch": 2.91, + "grad_norm": 0.7628925442695618, + "learning_rate": 1.389330179360193e-06, + "loss": 2.9677, + "step": 59339 + }, + { + "epoch": 2.91, + "grad_norm": 0.7441055178642273, + "learning_rate": 1.387850554659764e-06, + "loss": 2.8874, + "step": 59340 + }, + { + "epoch": 2.91, + "grad_norm": 0.7332181334495544, + "learning_rate": 1.386371716446244e-06, + "loss": 2.8901, + "step": 59341 + }, + { + "epoch": 2.91, + "grad_norm": 0.7477689385414124, + "learning_rate": 1.384893664723563e-06, + "loss": 2.9624, + "step": 59342 + }, + { + "epoch": 2.91, + "grad_norm": 0.7767438888549805, + "learning_rate": 1.3834163994955848e-06, + "loss": 2.825, + "step": 59343 + }, + { + "epoch": 2.91, + "grad_norm": 0.7393341660499573, + "learning_rate": 1.3819399207662396e-06, + "loss": 2.8816, + "step": 59344 + }, + { + "epoch": 2.91, + "grad_norm": 0.7812148928642273, + "learning_rate": 1.3804642285393908e-06, + "loss": 2.8379, + "step": 59345 + }, + { + "epoch": 2.91, + "grad_norm": 0.7248178124427795, + "learning_rate": 1.3789893228189352e-06, + "loss": 3.0399, + "step": 59346 + }, + { + "epoch": 2.91, + "grad_norm": 0.7927574515342712, + "learning_rate": 1.3775152036087367e-06, + "loss": 3.02, + "step": 59347 + }, + { + "epoch": 2.91, + "grad_norm": 0.7457157969474792, + "learning_rate": 1.3760418709126919e-06, + "loss": 3.0966, + "step": 59348 + }, + { + "epoch": 2.91, + "grad_norm": 0.6921155452728271, + "learning_rate": 1.3745693247346978e-06, + "loss": 2.9358, + "step": 59349 + }, + { + "epoch": 2.91, + "grad_norm": 0.7026863098144531, + "learning_rate": 1.3730975650785847e-06, + "loss": 2.6633, + "step": 59350 + }, + { + "epoch": 2.91, + "grad_norm": 0.7563304901123047, + "learning_rate": 1.3716265919482827e-06, + "loss": 2.8672, + "step": 59351 + }, + { + "epoch": 2.91, + "grad_norm": 0.7299355268478394, + "learning_rate": 1.370156405347622e-06, + "loss": 2.9305, + "step": 59352 + }, + { + "epoch": 2.91, + "grad_norm": 0.7180936336517334, + "learning_rate": 1.3686870052805332e-06, + "loss": 3.0106, + "step": 59353 + }, + { + "epoch": 2.91, + "grad_norm": 0.8197305798530579, + "learning_rate": 1.3672183917508461e-06, + "loss": 2.9598, + "step": 59354 + }, + { + "epoch": 2.91, + "grad_norm": 0.7417368292808533, + "learning_rate": 1.3657505647624245e-06, + "loss": 2.879, + "step": 59355 + }, + { + "epoch": 2.91, + "grad_norm": 0.7476125955581665, + "learning_rate": 1.364283524319132e-06, + "loss": 2.804, + "step": 59356 + }, + { + "epoch": 2.91, + "grad_norm": 0.7566918134689331, + "learning_rate": 1.362817270424832e-06, + "loss": 2.9898, + "step": 59357 + }, + { + "epoch": 2.91, + "grad_norm": 0.7430882453918457, + "learning_rate": 1.3613518030834215e-06, + "loss": 2.7312, + "step": 59358 + }, + { + "epoch": 2.91, + "grad_norm": 0.7406376004219055, + "learning_rate": 1.3598871222987307e-06, + "loss": 2.9305, + "step": 59359 + }, + { + "epoch": 2.91, + "grad_norm": 0.7154626846313477, + "learning_rate": 1.3584232280746231e-06, + "loss": 2.8426, + "step": 59360 + }, + { + "epoch": 2.91, + "grad_norm": 0.7587353587150574, + "learning_rate": 1.3569601204149628e-06, + "loss": 2.8959, + "step": 59361 + }, + { + "epoch": 2.91, + "grad_norm": 0.7413969039916992, + "learning_rate": 1.3554977993235794e-06, + "loss": 2.8985, + "step": 59362 + }, + { + "epoch": 2.91, + "grad_norm": 0.6970677971839905, + "learning_rate": 1.354036264804337e-06, + "loss": 2.7983, + "step": 59363 + }, + { + "epoch": 2.91, + "grad_norm": 0.7779316902160645, + "learning_rate": 1.3525755168610985e-06, + "loss": 2.8532, + "step": 59364 + }, + { + "epoch": 2.91, + "grad_norm": 0.7617775201797485, + "learning_rate": 1.3511155554976948e-06, + "loss": 2.9153, + "step": 59365 + }, + { + "epoch": 2.91, + "grad_norm": 0.7746046781539917, + "learning_rate": 1.3496563807179895e-06, + "loss": 3.0756, + "step": 59366 + }, + { + "epoch": 2.91, + "grad_norm": 0.7900286316871643, + "learning_rate": 1.3481979925257792e-06, + "loss": 2.6014, + "step": 59367 + }, + { + "epoch": 2.91, + "grad_norm": 0.7511681914329529, + "learning_rate": 1.3467403909249608e-06, + "loss": 2.8521, + "step": 59368 + }, + { + "epoch": 2.91, + "grad_norm": 0.7791048884391785, + "learning_rate": 1.3452835759193648e-06, + "loss": 2.9309, + "step": 59369 + }, + { + "epoch": 2.91, + "grad_norm": 0.7215959429740906, + "learning_rate": 1.343827547512788e-06, + "loss": 3.0088, + "step": 59370 + }, + { + "epoch": 2.91, + "grad_norm": 0.7659755349159241, + "learning_rate": 1.3423723057091274e-06, + "loss": 2.7044, + "step": 59371 + }, + { + "epoch": 2.91, + "grad_norm": 0.7261947393417358, + "learning_rate": 1.3409178505121465e-06, + "loss": 2.7411, + "step": 59372 + }, + { + "epoch": 2.91, + "grad_norm": 0.7190525531768799, + "learning_rate": 1.339464181925709e-06, + "loss": 2.938, + "step": 59373 + }, + { + "epoch": 2.91, + "grad_norm": 0.7865150570869446, + "learning_rate": 1.3380112999536786e-06, + "loss": 3.0685, + "step": 59374 + }, + { + "epoch": 2.91, + "grad_norm": 0.7611143589019775, + "learning_rate": 1.3365592045998186e-06, + "loss": 2.7982, + "step": 59375 + }, + { + "epoch": 2.91, + "grad_norm": 0.758155107498169, + "learning_rate": 1.3351078958679929e-06, + "loss": 2.8102, + "step": 59376 + }, + { + "epoch": 2.91, + "grad_norm": 0.7684991955757141, + "learning_rate": 1.3336573737619981e-06, + "loss": 2.8202, + "step": 59377 + }, + { + "epoch": 2.91, + "grad_norm": 0.72589510679245, + "learning_rate": 1.3322076382856984e-06, + "loss": 2.9137, + "step": 59378 + }, + { + "epoch": 2.91, + "grad_norm": 0.7442227005958557, + "learning_rate": 1.3307586894428568e-06, + "loss": 2.8947, + "step": 59379 + }, + { + "epoch": 2.91, + "grad_norm": 0.8001748919487, + "learning_rate": 1.3293105272373372e-06, + "loss": 2.8781, + "step": 59380 + }, + { + "epoch": 2.91, + "grad_norm": 0.77564537525177, + "learning_rate": 1.3278631516729366e-06, + "loss": 2.9313, + "step": 59381 + }, + { + "epoch": 2.91, + "grad_norm": 0.7333300709724426, + "learning_rate": 1.3264165627534518e-06, + "loss": 2.889, + "step": 59382 + }, + { + "epoch": 2.91, + "grad_norm": 0.7711548805236816, + "learning_rate": 1.3249707604827131e-06, + "loss": 2.8388, + "step": 59383 + }, + { + "epoch": 2.91, + "grad_norm": 0.7647919058799744, + "learning_rate": 1.3235257448644843e-06, + "loss": 2.7657, + "step": 59384 + }, + { + "epoch": 2.91, + "grad_norm": 0.7608219981193542, + "learning_rate": 1.3220815159026288e-06, + "loss": 2.745, + "step": 59385 + }, + { + "epoch": 2.91, + "grad_norm": 0.7038655877113342, + "learning_rate": 1.3206380736009437e-06, + "loss": 2.6167, + "step": 59386 + }, + { + "epoch": 2.91, + "grad_norm": 0.7791205048561096, + "learning_rate": 1.3191954179632258e-06, + "loss": 3.025, + "step": 59387 + }, + { + "epoch": 2.91, + "grad_norm": 0.7197181582450867, + "learning_rate": 1.3177535489932388e-06, + "loss": 3.1071, + "step": 59388 + }, + { + "epoch": 2.91, + "grad_norm": 0.7548245191574097, + "learning_rate": 1.3163124666948132e-06, + "loss": 2.7108, + "step": 59389 + }, + { + "epoch": 2.91, + "grad_norm": 0.7533859014511108, + "learning_rate": 1.3148721710717459e-06, + "loss": 3.086, + "step": 59390 + }, + { + "epoch": 2.91, + "grad_norm": 0.7900393009185791, + "learning_rate": 1.3134326621278002e-06, + "loss": 2.8453, + "step": 59391 + }, + { + "epoch": 2.91, + "grad_norm": 0.7366978526115417, + "learning_rate": 1.3119939398668068e-06, + "loss": 3.0731, + "step": 59392 + }, + { + "epoch": 2.91, + "grad_norm": 0.7741217613220215, + "learning_rate": 1.3105560042925623e-06, + "loss": 2.9944, + "step": 59393 + }, + { + "epoch": 2.91, + "grad_norm": 0.7335111498832703, + "learning_rate": 1.3091188554087971e-06, + "loss": 2.7327, + "step": 59394 + }, + { + "epoch": 2.91, + "grad_norm": 0.7100944519042969, + "learning_rate": 1.3076824932193086e-06, + "loss": 2.8545, + "step": 59395 + }, + { + "epoch": 2.91, + "grad_norm": 0.7313215136528015, + "learning_rate": 1.3062469177279267e-06, + "loss": 2.8571, + "step": 59396 + }, + { + "epoch": 2.91, + "grad_norm": 0.7661868929862976, + "learning_rate": 1.304812128938415e-06, + "loss": 3.0734, + "step": 59397 + }, + { + "epoch": 2.91, + "grad_norm": 0.7975308895111084, + "learning_rate": 1.3033781268545374e-06, + "loss": 2.8728, + "step": 59398 + }, + { + "epoch": 2.91, + "grad_norm": 0.7336994409561157, + "learning_rate": 1.3019449114800573e-06, + "loss": 3.1197, + "step": 59399 + }, + { + "epoch": 2.91, + "grad_norm": 0.7279320359230042, + "learning_rate": 1.3005124828188052e-06, + "loss": 2.7903, + "step": 59400 + }, + { + "epoch": 2.91, + "grad_norm": 0.7265657186508179, + "learning_rate": 1.299080840874478e-06, + "loss": 3.0084, + "step": 59401 + }, + { + "epoch": 2.91, + "grad_norm": 0.7682440876960754, + "learning_rate": 1.2976499856509059e-06, + "loss": 2.8712, + "step": 59402 + }, + { + "epoch": 2.91, + "grad_norm": 0.7393474578857422, + "learning_rate": 1.2962199171518528e-06, + "loss": 2.8719, + "step": 59403 + }, + { + "epoch": 2.91, + "grad_norm": 0.7249324917793274, + "learning_rate": 1.2947906353810489e-06, + "loss": 2.9401, + "step": 59404 + }, + { + "epoch": 2.91, + "grad_norm": 0.7434907555580139, + "learning_rate": 1.293362140342258e-06, + "loss": 2.9189, + "step": 59405 + }, + { + "epoch": 2.91, + "grad_norm": 0.766069233417511, + "learning_rate": 1.29193443203931e-06, + "loss": 3.0494, + "step": 59406 + }, + { + "epoch": 2.91, + "grad_norm": 0.7799139618873596, + "learning_rate": 1.2905075104759022e-06, + "loss": 2.7501, + "step": 59407 + }, + { + "epoch": 2.91, + "grad_norm": 0.7259254455566406, + "learning_rate": 1.2890813756557983e-06, + "loss": 2.9274, + "step": 59408 + }, + { + "epoch": 2.91, + "grad_norm": 0.8382498621940613, + "learning_rate": 1.2876560275827619e-06, + "loss": 3.0196, + "step": 59409 + }, + { + "epoch": 2.91, + "grad_norm": 0.7565951943397522, + "learning_rate": 1.2862314662605565e-06, + "loss": 3.0605, + "step": 59410 + }, + { + "epoch": 2.91, + "grad_norm": 0.7980879545211792, + "learning_rate": 1.2848076916929462e-06, + "loss": 2.7056, + "step": 59411 + }, + { + "epoch": 2.91, + "grad_norm": 0.8130079507827759, + "learning_rate": 1.283384703883661e-06, + "loss": 2.6706, + "step": 59412 + }, + { + "epoch": 2.91, + "grad_norm": 0.7338296175003052, + "learning_rate": 1.2819625028364312e-06, + "loss": 2.8004, + "step": 59413 + }, + { + "epoch": 2.91, + "grad_norm": 0.7697415351867676, + "learning_rate": 1.2805410885550205e-06, + "loss": 2.8974, + "step": 59414 + }, + { + "epoch": 2.91, + "grad_norm": 0.7453495860099792, + "learning_rate": 1.2791204610431927e-06, + "loss": 3.0127, + "step": 59415 + }, + { + "epoch": 2.91, + "grad_norm": 0.7462559342384338, + "learning_rate": 1.277700620304678e-06, + "loss": 2.7122, + "step": 59416 + }, + { + "epoch": 2.91, + "grad_norm": 0.7518516778945923, + "learning_rate": 1.2762815663431735e-06, + "loss": 2.7392, + "step": 59417 + }, + { + "epoch": 2.91, + "grad_norm": 0.7591348886489868, + "learning_rate": 1.2748632991625096e-06, + "loss": 3.0127, + "step": 59418 + }, + { + "epoch": 2.91, + "grad_norm": 0.7799931764602661, + "learning_rate": 1.2734458187663166e-06, + "loss": 2.912, + "step": 59419 + }, + { + "epoch": 2.91, + "grad_norm": 0.7421641945838928, + "learning_rate": 1.2720291251583913e-06, + "loss": 2.6909, + "step": 59420 + }, + { + "epoch": 2.91, + "grad_norm": 0.7436515092849731, + "learning_rate": 1.2706132183424644e-06, + "loss": 2.7775, + "step": 59421 + }, + { + "epoch": 2.91, + "grad_norm": 0.7163355350494385, + "learning_rate": 1.2691980983222327e-06, + "loss": 2.8094, + "step": 59422 + }, + { + "epoch": 2.91, + "grad_norm": 0.7329570651054382, + "learning_rate": 1.26778376510146e-06, + "loss": 2.9062, + "step": 59423 + }, + { + "epoch": 2.91, + "grad_norm": 0.8021432161331177, + "learning_rate": 1.26637021868381e-06, + "loss": 3.0601, + "step": 59424 + }, + { + "epoch": 2.91, + "grad_norm": 0.7619763016700745, + "learning_rate": 1.2649574590730794e-06, + "loss": 2.9556, + "step": 59425 + }, + { + "epoch": 2.91, + "grad_norm": 0.7866322994232178, + "learning_rate": 1.2635454862729656e-06, + "loss": 2.7857, + "step": 59426 + }, + { + "epoch": 2.91, + "grad_norm": 0.7534366250038147, + "learning_rate": 1.2621343002871654e-06, + "loss": 2.8646, + "step": 59427 + }, + { + "epoch": 2.91, + "grad_norm": 0.7509728074073792, + "learning_rate": 1.2607239011194092e-06, + "loss": 2.8734, + "step": 59428 + }, + { + "epoch": 2.91, + "grad_norm": 0.7655785083770752, + "learning_rate": 1.2593142887734276e-06, + "loss": 2.8618, + "step": 59429 + }, + { + "epoch": 2.91, + "grad_norm": 0.759252667427063, + "learning_rate": 1.2579054632529173e-06, + "loss": 3.0101, + "step": 59430 + }, + { + "epoch": 2.91, + "grad_norm": 0.7218188047409058, + "learning_rate": 1.2564974245615756e-06, + "loss": 2.8201, + "step": 59431 + }, + { + "epoch": 2.91, + "grad_norm": 0.7249210476875305, + "learning_rate": 1.2550901727031326e-06, + "loss": 2.7672, + "step": 59432 + }, + { + "epoch": 2.91, + "grad_norm": 0.7824299335479736, + "learning_rate": 1.2536837076812856e-06, + "loss": 2.8039, + "step": 59433 + }, + { + "epoch": 2.91, + "grad_norm": 0.731982946395874, + "learning_rate": 1.2522780294997315e-06, + "loss": 2.997, + "step": 59434 + }, + { + "epoch": 2.91, + "grad_norm": 0.7653811573982239, + "learning_rate": 1.2508731381622006e-06, + "loss": 3.0187, + "step": 59435 + }, + { + "epoch": 2.91, + "grad_norm": 0.7498577833175659, + "learning_rate": 1.2494690336723568e-06, + "loss": 2.7194, + "step": 59436 + }, + { + "epoch": 2.91, + "grad_norm": 0.7105596661567688, + "learning_rate": 1.2480657160339303e-06, + "loss": 2.8219, + "step": 59437 + }, + { + "epoch": 2.91, + "grad_norm": 0.7923392653465271, + "learning_rate": 1.2466631852506182e-06, + "loss": 2.8113, + "step": 59438 + }, + { + "epoch": 2.91, + "grad_norm": 0.725375235080719, + "learning_rate": 1.2452614413260509e-06, + "loss": 2.9244, + "step": 59439 + }, + { + "epoch": 2.91, + "grad_norm": 0.7582253217697144, + "learning_rate": 1.2438604842640253e-06, + "loss": 2.7547, + "step": 59440 + }, + { + "epoch": 2.91, + "grad_norm": 0.7434356808662415, + "learning_rate": 1.242460314068139e-06, + "loss": 2.8055, + "step": 59441 + }, + { + "epoch": 2.91, + "grad_norm": 0.7622381448745728, + "learning_rate": 1.2410609307421216e-06, + "loss": 2.8985, + "step": 59442 + }, + { + "epoch": 2.91, + "grad_norm": 0.8195493817329407, + "learning_rate": 1.2396623342896373e-06, + "loss": 2.8076, + "step": 59443 + }, + { + "epoch": 2.91, + "grad_norm": 0.7499621510505676, + "learning_rate": 1.2382645247144163e-06, + "loss": 2.7527, + "step": 59444 + }, + { + "epoch": 2.91, + "grad_norm": 0.7483765482902527, + "learning_rate": 1.2368675020200891e-06, + "loss": 2.9304, + "step": 59445 + }, + { + "epoch": 2.91, + "grad_norm": 0.7556241154670715, + "learning_rate": 1.2354712662103528e-06, + "loss": 2.9852, + "step": 59446 + }, + { + "epoch": 2.91, + "grad_norm": 0.7922104597091675, + "learning_rate": 1.2340758172889043e-06, + "loss": 2.9386, + "step": 59447 + }, + { + "epoch": 2.91, + "grad_norm": 0.7229467034339905, + "learning_rate": 1.232681155259374e-06, + "loss": 2.897, + "step": 59448 + }, + { + "epoch": 2.91, + "grad_norm": 0.7952060699462891, + "learning_rate": 1.2312872801254926e-06, + "loss": 3.0477, + "step": 59449 + }, + { + "epoch": 2.91, + "grad_norm": 0.8594965934753418, + "learning_rate": 1.2298941918908566e-06, + "loss": 2.9014, + "step": 59450 + }, + { + "epoch": 2.91, + "grad_norm": 0.7167303562164307, + "learning_rate": 1.2285018905592303e-06, + "loss": 2.8262, + "step": 59451 + }, + { + "epoch": 2.91, + "grad_norm": 0.7530221343040466, + "learning_rate": 1.2271103761342105e-06, + "loss": 2.9817, + "step": 59452 + }, + { + "epoch": 2.91, + "grad_norm": 0.7742487192153931, + "learning_rate": 1.225719648619461e-06, + "loss": 2.8349, + "step": 59453 + }, + { + "epoch": 2.91, + "grad_norm": 0.7406141757965088, + "learning_rate": 1.2243297080186786e-06, + "loss": 2.9665, + "step": 59454 + }, + { + "epoch": 2.91, + "grad_norm": 0.7695170044898987, + "learning_rate": 1.2229405543355274e-06, + "loss": 2.9037, + "step": 59455 + }, + { + "epoch": 2.91, + "grad_norm": 0.7280238270759583, + "learning_rate": 1.221552187573671e-06, + "loss": 2.9269, + "step": 59456 + }, + { + "epoch": 2.91, + "grad_norm": 0.730404257774353, + "learning_rate": 1.2201646077367067e-06, + "loss": 2.8521, + "step": 59457 + }, + { + "epoch": 2.91, + "grad_norm": 0.7469308376312256, + "learning_rate": 1.2187778148283312e-06, + "loss": 2.7984, + "step": 59458 + }, + { + "epoch": 2.91, + "grad_norm": 0.7723640203475952, + "learning_rate": 1.217391808852175e-06, + "loss": 2.9174, + "step": 59459 + }, + { + "epoch": 2.91, + "grad_norm": 0.8354815244674683, + "learning_rate": 1.2160065898119352e-06, + "loss": 2.7031, + "step": 59460 + }, + { + "epoch": 2.91, + "grad_norm": 0.7310277819633484, + "learning_rate": 1.2146221577112426e-06, + "loss": 2.8573, + "step": 59461 + }, + { + "epoch": 2.91, + "grad_norm": 0.761574923992157, + "learning_rate": 1.2132385125537269e-06, + "loss": 2.947, + "step": 59462 + }, + { + "epoch": 2.91, + "grad_norm": 0.8336168527603149, + "learning_rate": 1.2118556543430192e-06, + "loss": 2.9162, + "step": 59463 + }, + { + "epoch": 2.91, + "grad_norm": 0.7921255230903625, + "learning_rate": 1.2104735830827827e-06, + "loss": 2.8752, + "step": 59464 + }, + { + "epoch": 2.91, + "grad_norm": 0.7641667723655701, + "learning_rate": 1.2090922987766816e-06, + "loss": 2.8504, + "step": 59465 + }, + { + "epoch": 2.91, + "grad_norm": 0.7452090382575989, + "learning_rate": 1.2077118014282794e-06, + "loss": 2.8647, + "step": 59466 + }, + { + "epoch": 2.91, + "grad_norm": 0.751675546169281, + "learning_rate": 1.2063320910413067e-06, + "loss": 2.8153, + "step": 59467 + }, + { + "epoch": 2.91, + "grad_norm": 0.7424731254577637, + "learning_rate": 1.204953167619327e-06, + "loss": 3.0888, + "step": 59468 + }, + { + "epoch": 2.91, + "grad_norm": 0.8056619167327881, + "learning_rate": 1.2035750311660042e-06, + "loss": 2.851, + "step": 59469 + }, + { + "epoch": 2.91, + "grad_norm": 0.8469576239585876, + "learning_rate": 1.2021976816849687e-06, + "loss": 2.8366, + "step": 59470 + }, + { + "epoch": 2.91, + "grad_norm": 0.7873582243919373, + "learning_rate": 1.2008211191798177e-06, + "loss": 2.9648, + "step": 59471 + }, + { + "epoch": 2.91, + "grad_norm": 0.7305603623390198, + "learning_rate": 1.1994453436542152e-06, + "loss": 2.845, + "step": 59472 + }, + { + "epoch": 2.91, + "grad_norm": 0.7844927906990051, + "learning_rate": 1.198070355111791e-06, + "loss": 2.9788, + "step": 59473 + }, + { + "epoch": 2.91, + "grad_norm": 0.7984238862991333, + "learning_rate": 1.1966961535561094e-06, + "loss": 2.9127, + "step": 59474 + }, + { + "epoch": 2.91, + "grad_norm": 0.7224608659744263, + "learning_rate": 1.1953227389908339e-06, + "loss": 2.9176, + "step": 59475 + }, + { + "epoch": 2.91, + "grad_norm": 0.7413891553878784, + "learning_rate": 1.1939501114195616e-06, + "loss": 2.6806, + "step": 59476 + }, + { + "epoch": 2.91, + "grad_norm": 0.8167316317558289, + "learning_rate": 1.192578270845923e-06, + "loss": 3.0055, + "step": 59477 + }, + { + "epoch": 2.91, + "grad_norm": 0.7244284749031067, + "learning_rate": 1.1912072172735488e-06, + "loss": 2.8573, + "step": 59478 + }, + { + "epoch": 2.91, + "grad_norm": 0.7515481114387512, + "learning_rate": 1.1898369507060024e-06, + "loss": 2.9796, + "step": 59479 + }, + { + "epoch": 2.91, + "grad_norm": 0.7766403555870056, + "learning_rate": 1.1884674711469477e-06, + "loss": 2.9782, + "step": 59480 + }, + { + "epoch": 2.92, + "grad_norm": 0.7261124849319458, + "learning_rate": 1.1870987785999154e-06, + "loss": 2.7586, + "step": 59481 + }, + { + "epoch": 2.92, + "grad_norm": 0.7415921092033386, + "learning_rate": 1.1857308730685688e-06, + "loss": 3.0122, + "step": 59482 + }, + { + "epoch": 2.92, + "grad_norm": 0.7948103547096252, + "learning_rate": 1.1843637545565388e-06, + "loss": 3.0193, + "step": 59483 + }, + { + "epoch": 2.92, + "grad_norm": 0.7435404062271118, + "learning_rate": 1.1829974230673557e-06, + "loss": 2.734, + "step": 59484 + }, + { + "epoch": 2.92, + "grad_norm": 0.7635899186134338, + "learning_rate": 1.1816318786046497e-06, + "loss": 2.918, + "step": 59485 + }, + { + "epoch": 2.92, + "grad_norm": 0.7679666876792908, + "learning_rate": 1.1802671211720182e-06, + "loss": 2.9842, + "step": 59486 + }, + { + "epoch": 2.92, + "grad_norm": 0.7481985092163086, + "learning_rate": 1.1789031507730584e-06, + "loss": 2.8345, + "step": 59487 + }, + { + "epoch": 2.92, + "grad_norm": 0.7324191927909851, + "learning_rate": 1.1775399674113672e-06, + "loss": 2.9773, + "step": 59488 + }, + { + "epoch": 2.92, + "grad_norm": 0.7569427490234375, + "learning_rate": 1.1761775710905086e-06, + "loss": 2.8726, + "step": 59489 + }, + { + "epoch": 2.92, + "grad_norm": 0.7332199811935425, + "learning_rate": 1.174815961814113e-06, + "loss": 2.9145, + "step": 59490 + }, + { + "epoch": 2.92, + "grad_norm": 0.7667021155357361, + "learning_rate": 1.1734551395857107e-06, + "loss": 2.8075, + "step": 59491 + }, + { + "epoch": 2.92, + "grad_norm": 0.7241703271865845, + "learning_rate": 1.1720951044089323e-06, + "loss": 2.8421, + "step": 59492 + }, + { + "epoch": 2.92, + "grad_norm": 0.807370662689209, + "learning_rate": 1.1707358562873414e-06, + "loss": 2.9485, + "step": 59493 + }, + { + "epoch": 2.92, + "grad_norm": 0.7350105047225952, + "learning_rate": 1.1693773952245355e-06, + "loss": 3.1253, + "step": 59494 + }, + { + "epoch": 2.92, + "grad_norm": 0.8460718989372253, + "learning_rate": 1.1680197212240783e-06, + "loss": 2.8301, + "step": 59495 + }, + { + "epoch": 2.92, + "grad_norm": 0.7515153884887695, + "learning_rate": 1.1666628342895e-06, + "loss": 2.8304, + "step": 59496 + }, + { + "epoch": 2.92, + "grad_norm": 0.718813955783844, + "learning_rate": 1.1653067344244649e-06, + "loss": 2.9743, + "step": 59497 + }, + { + "epoch": 2.92, + "grad_norm": 0.7459069490432739, + "learning_rate": 1.163951421632503e-06, + "loss": 2.8279, + "step": 59498 + }, + { + "epoch": 2.92, + "grad_norm": 0.7909800410270691, + "learning_rate": 1.1625968959171782e-06, + "loss": 3.0892, + "step": 59499 + }, + { + "epoch": 2.92, + "grad_norm": 0.7388293743133545, + "learning_rate": 1.1612431572820547e-06, + "loss": 2.8286, + "step": 59500 + }, + { + "epoch": 2.92, + "grad_norm": 0.7680931091308594, + "learning_rate": 1.1598902057306958e-06, + "loss": 2.8482, + "step": 59501 + }, + { + "epoch": 2.92, + "grad_norm": 0.7312542200088501, + "learning_rate": 1.1585380412666657e-06, + "loss": 2.8631, + "step": 59502 + }, + { + "epoch": 2.92, + "grad_norm": 0.7777981758117676, + "learning_rate": 1.157186663893528e-06, + "loss": 2.8585, + "step": 59503 + }, + { + "epoch": 2.92, + "grad_norm": 0.7600824236869812, + "learning_rate": 1.1558360736148797e-06, + "loss": 2.6733, + "step": 59504 + }, + { + "epoch": 2.92, + "grad_norm": 0.7767745852470398, + "learning_rate": 1.1544862704342184e-06, + "loss": 2.8698, + "step": 59505 + }, + { + "epoch": 2.92, + "grad_norm": 0.7815971970558167, + "learning_rate": 1.1531372543551409e-06, + "loss": 2.8926, + "step": 59506 + }, + { + "epoch": 2.92, + "grad_norm": 0.7735201716423035, + "learning_rate": 1.151789025381178e-06, + "loss": 2.8795, + "step": 59507 + }, + { + "epoch": 2.92, + "grad_norm": 0.7491256594657898, + "learning_rate": 1.150441583515893e-06, + "loss": 2.9923, + "step": 59508 + }, + { + "epoch": 2.92, + "grad_norm": 0.7839617133140564, + "learning_rate": 1.1490949287628503e-06, + "loss": 3.0807, + "step": 59509 + }, + { + "epoch": 2.92, + "grad_norm": 0.7311417460441589, + "learning_rate": 1.1477490611255468e-06, + "loss": 3.0293, + "step": 59510 + }, + { + "epoch": 2.92, + "grad_norm": 0.7277370691299438, + "learning_rate": 1.1464039806075464e-06, + "loss": 2.8279, + "step": 59511 + }, + { + "epoch": 2.92, + "grad_norm": 0.7706161141395569, + "learning_rate": 1.1450596872124463e-06, + "loss": 2.918, + "step": 59512 + }, + { + "epoch": 2.92, + "grad_norm": 0.7380478978157043, + "learning_rate": 1.14371618094371e-06, + "loss": 2.7686, + "step": 59513 + }, + { + "epoch": 2.92, + "grad_norm": 0.7525132894515991, + "learning_rate": 1.1423734618049352e-06, + "loss": 2.6612, + "step": 59514 + }, + { + "epoch": 2.92, + "grad_norm": 0.7478514909744263, + "learning_rate": 1.1410315297996187e-06, + "loss": 3.1156, + "step": 59515 + }, + { + "epoch": 2.92, + "grad_norm": 0.7614080309867859, + "learning_rate": 1.139690384931291e-06, + "loss": 2.9505, + "step": 59516 + }, + { + "epoch": 2.92, + "grad_norm": 0.7383436560630798, + "learning_rate": 1.1383500272035163e-06, + "loss": 2.9673, + "step": 59517 + }, + { + "epoch": 2.92, + "grad_norm": 0.7430320382118225, + "learning_rate": 1.1370104566197913e-06, + "loss": 2.9268, + "step": 59518 + }, + { + "epoch": 2.92, + "grad_norm": 0.7891849279403687, + "learning_rate": 1.1356716731836802e-06, + "loss": 2.8843, + "step": 59519 + }, + { + "epoch": 2.92, + "grad_norm": 0.7453582286834717, + "learning_rate": 1.1343336768987132e-06, + "loss": 2.9038, + "step": 59520 + }, + { + "epoch": 2.92, + "grad_norm": 0.7106955051422119, + "learning_rate": 1.1329964677683878e-06, + "loss": 2.5896, + "step": 59521 + }, + { + "epoch": 2.92, + "grad_norm": 0.807589054107666, + "learning_rate": 1.1316600457962011e-06, + "loss": 2.8919, + "step": 59522 + }, + { + "epoch": 2.92, + "grad_norm": 0.7693127989768982, + "learning_rate": 1.1303244109857168e-06, + "loss": 3.0337, + "step": 59523 + }, + { + "epoch": 2.92, + "grad_norm": 0.747882068157196, + "learning_rate": 1.1289895633404656e-06, + "loss": 3.0611, + "step": 59524 + }, + { + "epoch": 2.92, + "grad_norm": 0.7988489270210266, + "learning_rate": 1.127655502863911e-06, + "loss": 2.8766, + "step": 59525 + }, + { + "epoch": 2.92, + "grad_norm": 0.7369797229766846, + "learning_rate": 1.1263222295595842e-06, + "loss": 2.8861, + "step": 59526 + }, + { + "epoch": 2.92, + "grad_norm": 0.7736074924468994, + "learning_rate": 1.124989743431015e-06, + "loss": 2.7278, + "step": 59527 + }, + { + "epoch": 2.92, + "grad_norm": 0.7230579257011414, + "learning_rate": 1.1236580444817345e-06, + "loss": 2.8485, + "step": 59528 + }, + { + "epoch": 2.92, + "grad_norm": 0.7344254851341248, + "learning_rate": 1.1223271327151728e-06, + "loss": 3.0521, + "step": 59529 + }, + { + "epoch": 2.92, + "grad_norm": 0.7863304615020752, + "learning_rate": 1.1209970081348941e-06, + "loss": 2.9299, + "step": 59530 + }, + { + "epoch": 2.92, + "grad_norm": 0.7944401502609253, + "learning_rate": 1.1196676707443953e-06, + "loss": 2.8285, + "step": 59531 + }, + { + "epoch": 2.92, + "grad_norm": 0.7780875563621521, + "learning_rate": 1.1183391205471737e-06, + "loss": 2.8872, + "step": 59532 + }, + { + "epoch": 2.92, + "grad_norm": 0.7192845344543457, + "learning_rate": 1.1170113575467266e-06, + "loss": 2.7766, + "step": 59533 + }, + { + "epoch": 2.92, + "grad_norm": 0.7729279398918152, + "learning_rate": 1.1156843817465178e-06, + "loss": 2.7897, + "step": 59534 + }, + { + "epoch": 2.92, + "grad_norm": 0.7623705267906189, + "learning_rate": 1.1143581931501111e-06, + "loss": 2.882, + "step": 59535 + }, + { + "epoch": 2.92, + "grad_norm": 0.7770814895629883, + "learning_rate": 1.1130327917609704e-06, + "loss": 3.0556, + "step": 59536 + }, + { + "epoch": 2.92, + "grad_norm": 0.7293316721916199, + "learning_rate": 1.1117081775825598e-06, + "loss": 2.7412, + "step": 59537 + }, + { + "epoch": 2.92, + "grad_norm": 0.7572553157806396, + "learning_rate": 1.1103843506184096e-06, + "loss": 2.7117, + "step": 59538 + }, + { + "epoch": 2.92, + "grad_norm": 0.7205670475959778, + "learning_rate": 1.1090613108719505e-06, + "loss": 2.7834, + "step": 59539 + }, + { + "epoch": 2.92, + "grad_norm": 0.7953411936759949, + "learning_rate": 1.1077390583467126e-06, + "loss": 3.0259, + "step": 59540 + }, + { + "epoch": 2.92, + "grad_norm": 0.7684890031814575, + "learning_rate": 1.1064175930461604e-06, + "loss": 2.7954, + "step": 59541 + }, + { + "epoch": 2.92, + "grad_norm": 0.7398133873939514, + "learning_rate": 1.1050969149738243e-06, + "loss": 3.0277, + "step": 59542 + }, + { + "epoch": 2.92, + "grad_norm": 0.7606955170631409, + "learning_rate": 1.1037770241331013e-06, + "loss": 2.8503, + "step": 59543 + }, + { + "epoch": 2.92, + "grad_norm": 0.7371569871902466, + "learning_rate": 1.1024579205275218e-06, + "loss": 2.629, + "step": 59544 + }, + { + "epoch": 2.92, + "grad_norm": 0.7682278156280518, + "learning_rate": 1.101139604160517e-06, + "loss": 3.0296, + "step": 59545 + }, + { + "epoch": 2.92, + "grad_norm": 0.8034104704856873, + "learning_rate": 1.0998220750356169e-06, + "loss": 2.9309, + "step": 59546 + }, + { + "epoch": 2.92, + "grad_norm": 0.7584922909736633, + "learning_rate": 1.098505333156252e-06, + "loss": 2.854, + "step": 59547 + }, + { + "epoch": 2.92, + "grad_norm": 0.7490361928939819, + "learning_rate": 1.0971893785258868e-06, + "loss": 3.0433, + "step": 59548 + }, + { + "epoch": 2.92, + "grad_norm": 0.7814920544624329, + "learning_rate": 1.0958742111480179e-06, + "loss": 2.7952, + "step": 59549 + }, + { + "epoch": 2.92, + "grad_norm": 0.7534535527229309, + "learning_rate": 1.094559831026076e-06, + "loss": 2.8577, + "step": 59550 + }, + { + "epoch": 2.92, + "grad_norm": 0.722969114780426, + "learning_rate": 1.0932462381635254e-06, + "loss": 2.7215, + "step": 59551 + }, + { + "epoch": 2.92, + "grad_norm": 0.7845919728279114, + "learning_rate": 1.0919334325638629e-06, + "loss": 2.7127, + "step": 59552 + }, + { + "epoch": 2.92, + "grad_norm": 0.8240389227867126, + "learning_rate": 1.0906214142305191e-06, + "loss": 2.735, + "step": 59553 + }, + { + "epoch": 2.92, + "grad_norm": 0.7913576364517212, + "learning_rate": 1.0893101831669249e-06, + "loss": 2.9265, + "step": 59554 + }, + { + "epoch": 2.92, + "grad_norm": 0.716317892074585, + "learning_rate": 1.0879997393766105e-06, + "loss": 2.9504, + "step": 59555 + }, + { + "epoch": 2.92, + "grad_norm": 0.7530320882797241, + "learning_rate": 1.08669008286294e-06, + "loss": 2.8871, + "step": 59556 + }, + { + "epoch": 2.92, + "grad_norm": 0.7626606822013855, + "learning_rate": 1.0853812136294104e-06, + "loss": 3.0875, + "step": 59557 + }, + { + "epoch": 2.92, + "grad_norm": 0.7616795301437378, + "learning_rate": 1.084073131679486e-06, + "loss": 2.6925, + "step": 59558 + }, + { + "epoch": 2.92, + "grad_norm": 0.7134222388267517, + "learning_rate": 1.0827658370165304e-06, + "loss": 2.921, + "step": 59559 + }, + { + "epoch": 2.92, + "grad_norm": 0.7397261261940002, + "learning_rate": 1.0814593296441077e-06, + "loss": 2.9264, + "step": 59560 + }, + { + "epoch": 2.92, + "grad_norm": 0.7546602487564087, + "learning_rate": 1.0801536095655482e-06, + "loss": 3.096, + "step": 59561 + }, + { + "epoch": 2.92, + "grad_norm": 0.7439787983894348, + "learning_rate": 1.0788486767843497e-06, + "loss": 2.8096, + "step": 59562 + }, + { + "epoch": 2.92, + "grad_norm": 0.7837499380111694, + "learning_rate": 1.0775445313039088e-06, + "loss": 2.8704, + "step": 59563 + }, + { + "epoch": 2.92, + "grad_norm": 0.7200571894645691, + "learning_rate": 1.0762411731277232e-06, + "loss": 2.9517, + "step": 59564 + }, + { + "epoch": 2.92, + "grad_norm": 0.7449080348014832, + "learning_rate": 1.0749386022591899e-06, + "loss": 2.9333, + "step": 59565 + }, + { + "epoch": 2.92, + "grad_norm": 0.7231045365333557, + "learning_rate": 1.0736368187017064e-06, + "loss": 2.7141, + "step": 59566 + }, + { + "epoch": 2.92, + "grad_norm": 0.8130992650985718, + "learning_rate": 1.0723358224587698e-06, + "loss": 2.8928, + "step": 59567 + }, + { + "epoch": 2.92, + "grad_norm": 0.7753105759620667, + "learning_rate": 1.071035613533744e-06, + "loss": 2.8285, + "step": 59568 + }, + { + "epoch": 2.92, + "grad_norm": 0.7972942590713501, + "learning_rate": 1.0697361919300928e-06, + "loss": 2.8247, + "step": 59569 + }, + { + "epoch": 2.92, + "grad_norm": 0.710925817489624, + "learning_rate": 1.068437557651214e-06, + "loss": 2.8817, + "step": 59570 + }, + { + "epoch": 2.92, + "grad_norm": 0.7548450827598572, + "learning_rate": 1.0671397107005708e-06, + "loss": 2.7388, + "step": 59571 + }, + { + "epoch": 2.92, + "grad_norm": 0.7482631802558899, + "learning_rate": 1.0658426510815276e-06, + "loss": 2.7826, + "step": 59572 + }, + { + "epoch": 2.92, + "grad_norm": 0.7267739772796631, + "learning_rate": 1.064546378797515e-06, + "loss": 2.8136, + "step": 59573 + }, + { + "epoch": 2.92, + "grad_norm": 0.7370942831039429, + "learning_rate": 1.0632508938519634e-06, + "loss": 2.8792, + "step": 59574 + }, + { + "epoch": 2.92, + "grad_norm": 0.7242228984832764, + "learning_rate": 1.0619561962482703e-06, + "loss": 2.8447, + "step": 59575 + }, + { + "epoch": 2.92, + "grad_norm": 0.7722229361534119, + "learning_rate": 1.0606622859898662e-06, + "loss": 3.0327, + "step": 59576 + }, + { + "epoch": 2.92, + "grad_norm": 0.7654498219490051, + "learning_rate": 1.0593691630801482e-06, + "loss": 2.9213, + "step": 59577 + }, + { + "epoch": 2.92, + "grad_norm": 0.7603773474693298, + "learning_rate": 1.0580768275225139e-06, + "loss": 3.1623, + "step": 59578 + }, + { + "epoch": 2.92, + "grad_norm": 0.7189480662345886, + "learning_rate": 1.0567852793203602e-06, + "loss": 2.812, + "step": 59579 + }, + { + "epoch": 2.92, + "grad_norm": 0.7826395034790039, + "learning_rate": 1.055494518477118e-06, + "loss": 3.1349, + "step": 59580 + }, + { + "epoch": 2.92, + "grad_norm": 0.7658017873764038, + "learning_rate": 1.0542045449961844e-06, + "loss": 2.6386, + "step": 59581 + }, + { + "epoch": 2.92, + "grad_norm": 0.7522681951522827, + "learning_rate": 1.0529153588809235e-06, + "loss": 2.8757, + "step": 59582 + }, + { + "epoch": 2.92, + "grad_norm": 0.7336506843566895, + "learning_rate": 1.0516269601347326e-06, + "loss": 2.8125, + "step": 59583 + }, + { + "epoch": 2.92, + "grad_norm": 0.7140693068504333, + "learning_rate": 1.0503393487610423e-06, + "loss": 2.9674, + "step": 59584 + }, + { + "epoch": 2.92, + "grad_norm": 0.7803701758384705, + "learning_rate": 1.0490525247632165e-06, + "loss": 3.1645, + "step": 59585 + }, + { + "epoch": 2.92, + "grad_norm": 0.7349176406860352, + "learning_rate": 1.0477664881446857e-06, + "loss": 2.7669, + "step": 59586 + }, + { + "epoch": 2.92, + "grad_norm": 0.7476712465286255, + "learning_rate": 1.0464812389087806e-06, + "loss": 2.765, + "step": 59587 + }, + { + "epoch": 2.92, + "grad_norm": 0.764438271522522, + "learning_rate": 1.0451967770588987e-06, + "loss": 2.971, + "step": 59588 + }, + { + "epoch": 2.92, + "grad_norm": 0.7536563873291016, + "learning_rate": 1.043913102598437e-06, + "loss": 2.9274, + "step": 59589 + }, + { + "epoch": 2.92, + "grad_norm": 0.728675365447998, + "learning_rate": 1.0426302155307597e-06, + "loss": 2.7336, + "step": 59590 + }, + { + "epoch": 2.92, + "grad_norm": 0.7635043859481812, + "learning_rate": 1.041348115859264e-06, + "loss": 2.8814, + "step": 59591 + }, + { + "epoch": 2.92, + "grad_norm": 0.7409845590591431, + "learning_rate": 1.0400668035873472e-06, + "loss": 2.8674, + "step": 59592 + }, + { + "epoch": 2.92, + "grad_norm": 0.707777202129364, + "learning_rate": 1.03878627871834e-06, + "loss": 2.8546, + "step": 59593 + }, + { + "epoch": 2.92, + "grad_norm": 0.7888078689575195, + "learning_rate": 1.0375065412556393e-06, + "loss": 3.0258, + "step": 59594 + }, + { + "epoch": 2.92, + "grad_norm": 0.7485946416854858, + "learning_rate": 1.0362275912026096e-06, + "loss": 2.9028, + "step": 59595 + }, + { + "epoch": 2.92, + "grad_norm": 0.7295870780944824, + "learning_rate": 1.0349494285626147e-06, + "loss": 3.0208, + "step": 59596 + }, + { + "epoch": 2.92, + "grad_norm": 0.7414456009864807, + "learning_rate": 1.0336720533390187e-06, + "loss": 2.789, + "step": 59597 + }, + { + "epoch": 2.92, + "grad_norm": 0.7335713505744934, + "learning_rate": 1.0323954655352185e-06, + "loss": 3.0562, + "step": 59598 + }, + { + "epoch": 2.92, + "grad_norm": 0.759948194026947, + "learning_rate": 1.0311196651545118e-06, + "loss": 2.9113, + "step": 59599 + }, + { + "epoch": 2.92, + "grad_norm": 0.7210058569908142, + "learning_rate": 1.029844652200329e-06, + "loss": 3.0582, + "step": 59600 + }, + { + "epoch": 2.92, + "grad_norm": 0.736574113368988, + "learning_rate": 1.028570426676001e-06, + "loss": 2.7906, + "step": 59601 + }, + { + "epoch": 2.92, + "grad_norm": 0.7816761136054993, + "learning_rate": 1.0272969885848915e-06, + "loss": 2.9032, + "step": 59602 + }, + { + "epoch": 2.92, + "grad_norm": 0.8219104409217834, + "learning_rate": 1.0260243379302979e-06, + "loss": 3.044, + "step": 59603 + }, + { + "epoch": 2.92, + "grad_norm": 0.7174280285835266, + "learning_rate": 1.024752474715651e-06, + "loss": 2.9151, + "step": 59604 + }, + { + "epoch": 2.92, + "grad_norm": 0.7537269592285156, + "learning_rate": 1.023481398944248e-06, + "loss": 2.9196, + "step": 59605 + }, + { + "epoch": 2.92, + "grad_norm": 0.7246042490005493, + "learning_rate": 1.022211110619453e-06, + "loss": 2.7453, + "step": 59606 + }, + { + "epoch": 2.92, + "grad_norm": 0.7886763215065002, + "learning_rate": 1.0209416097446632e-06, + "loss": 2.8192, + "step": 59607 + }, + { + "epoch": 2.92, + "grad_norm": 0.7710185050964355, + "learning_rate": 1.0196728963231427e-06, + "loss": 3.0203, + "step": 59608 + }, + { + "epoch": 2.92, + "grad_norm": 0.704352080821991, + "learning_rate": 1.018404970358222e-06, + "loss": 2.8758, + "step": 59609 + }, + { + "epoch": 2.92, + "grad_norm": 0.7111163139343262, + "learning_rate": 1.0171378318533318e-06, + "loss": 2.7523, + "step": 59610 + }, + { + "epoch": 2.92, + "grad_norm": 0.7470084428787231, + "learning_rate": 1.0158714808117362e-06, + "loss": 2.7751, + "step": 59611 + }, + { + "epoch": 2.92, + "grad_norm": 0.7764518857002258, + "learning_rate": 1.0146059172368327e-06, + "loss": 2.9253, + "step": 59612 + }, + { + "epoch": 2.92, + "grad_norm": 0.7668911218643188, + "learning_rate": 1.0133411411318849e-06, + "loss": 3.1101, + "step": 59613 + }, + { + "epoch": 2.92, + "grad_norm": 0.7347301840782166, + "learning_rate": 1.012077152500257e-06, + "loss": 2.9405, + "step": 59614 + }, + { + "epoch": 2.92, + "grad_norm": 0.8694640398025513, + "learning_rate": 1.0108139513452795e-06, + "loss": 2.8178, + "step": 59615 + }, + { + "epoch": 2.92, + "grad_norm": 0.7998950481414795, + "learning_rate": 1.0095515376702833e-06, + "loss": 2.9485, + "step": 59616 + }, + { + "epoch": 2.92, + "grad_norm": 0.7624552249908447, + "learning_rate": 1.0082899114785659e-06, + "loss": 3.0671, + "step": 59617 + }, + { + "epoch": 2.92, + "grad_norm": 0.7087655067443848, + "learning_rate": 1.0070290727734909e-06, + "loss": 2.8172, + "step": 59618 + }, + { + "epoch": 2.92, + "grad_norm": 0.7436306476593018, + "learning_rate": 1.0057690215583558e-06, + "loss": 2.9391, + "step": 59619 + }, + { + "epoch": 2.92, + "grad_norm": 0.7129530906677246, + "learning_rate": 1.004509757836458e-06, + "loss": 2.7657, + "step": 59620 + }, + { + "epoch": 2.92, + "grad_norm": 0.7246747016906738, + "learning_rate": 1.0032512816111616e-06, + "loss": 3.0554, + "step": 59621 + }, + { + "epoch": 2.92, + "grad_norm": 0.7818371057510376, + "learning_rate": 1.0019935928857637e-06, + "loss": 2.7496, + "step": 59622 + }, + { + "epoch": 2.92, + "grad_norm": 0.754097580909729, + "learning_rate": 1.0007366916635618e-06, + "loss": 2.7842, + "step": 59623 + }, + { + "epoch": 2.92, + "grad_norm": 0.7770971655845642, + "learning_rate": 9.994805779478865e-07, + "loss": 2.9256, + "step": 59624 + }, + { + "epoch": 2.92, + "grad_norm": 0.7818499803543091, + "learning_rate": 9.982252517420352e-07, + "loss": 3.0954, + "step": 59625 + }, + { + "epoch": 2.92, + "grad_norm": 0.7363375425338745, + "learning_rate": 9.969707130493055e-07, + "loss": 3.0082, + "step": 59626 + }, + { + "epoch": 2.92, + "grad_norm": 0.7672472596168518, + "learning_rate": 9.957169618729944e-07, + "loss": 2.9003, + "step": 59627 + }, + { + "epoch": 2.92, + "grad_norm": 0.7299450039863586, + "learning_rate": 9.944639982164326e-07, + "loss": 3.1018, + "step": 59628 + }, + { + "epoch": 2.92, + "grad_norm": 0.788252592086792, + "learning_rate": 9.932118220829177e-07, + "loss": 2.7675, + "step": 59629 + }, + { + "epoch": 2.92, + "grad_norm": 0.760606050491333, + "learning_rate": 9.91960433475747e-07, + "loss": 2.8997, + "step": 59630 + }, + { + "epoch": 2.92, + "grad_norm": 0.7685892581939697, + "learning_rate": 9.907098323981845e-07, + "loss": 2.9651, + "step": 59631 + }, + { + "epoch": 2.92, + "grad_norm": 0.7383074760437012, + "learning_rate": 9.894600188535606e-07, + "loss": 2.9564, + "step": 59632 + }, + { + "epoch": 2.92, + "grad_norm": 0.7410311698913574, + "learning_rate": 9.882109928451398e-07, + "loss": 2.6135, + "step": 59633 + }, + { + "epoch": 2.92, + "grad_norm": 0.7539855241775513, + "learning_rate": 9.869627543762527e-07, + "loss": 3.037, + "step": 59634 + }, + { + "epoch": 2.92, + "grad_norm": 0.7680968046188354, + "learning_rate": 9.857153034501297e-07, + "loss": 2.7394, + "step": 59635 + }, + { + "epoch": 2.92, + "grad_norm": 0.779704213142395, + "learning_rate": 9.84468640070102e-07, + "loss": 2.7469, + "step": 59636 + }, + { + "epoch": 2.92, + "grad_norm": 0.7694498896598816, + "learning_rate": 9.83222764239433e-07, + "loss": 3.0253, + "step": 59637 + }, + { + "epoch": 2.92, + "grad_norm": 0.8191694617271423, + "learning_rate": 9.819776759614206e-07, + "loss": 3.0428, + "step": 59638 + }, + { + "epoch": 2.92, + "grad_norm": 0.722665548324585, + "learning_rate": 9.807333752392953e-07, + "loss": 2.944, + "step": 59639 + }, + { + "epoch": 2.92, + "grad_norm": 0.7409653067588806, + "learning_rate": 9.79489862076388e-07, + "loss": 2.8522, + "step": 59640 + }, + { + "epoch": 2.92, + "grad_norm": 0.7272565364837646, + "learning_rate": 9.782471364759625e-07, + "loss": 2.7308, + "step": 59641 + }, + { + "epoch": 2.92, + "grad_norm": 0.766656756401062, + "learning_rate": 9.770051984413164e-07, + "loss": 2.7798, + "step": 59642 + }, + { + "epoch": 2.92, + "grad_norm": 0.7929888367652893, + "learning_rate": 9.757640479756468e-07, + "loss": 2.9093, + "step": 59643 + }, + { + "epoch": 2.92, + "grad_norm": 0.7375426888465881, + "learning_rate": 9.745236850822846e-07, + "loss": 2.6784, + "step": 59644 + }, + { + "epoch": 2.92, + "grad_norm": 0.7349624037742615, + "learning_rate": 9.732841097644605e-07, + "loss": 2.7962, + "step": 59645 + }, + { + "epoch": 2.92, + "grad_norm": 0.7367823719978333, + "learning_rate": 9.72045322025472e-07, + "loss": 2.975, + "step": 59646 + }, + { + "epoch": 2.92, + "grad_norm": 0.7537251114845276, + "learning_rate": 9.70807321868583e-07, + "loss": 2.8237, + "step": 59647 + }, + { + "epoch": 2.92, + "grad_norm": 0.7877868413925171, + "learning_rate": 9.695701092970243e-07, + "loss": 2.7918, + "step": 59648 + }, + { + "epoch": 2.92, + "grad_norm": 0.7532494068145752, + "learning_rate": 9.683336843140598e-07, + "loss": 2.8354, + "step": 59649 + }, + { + "epoch": 2.92, + "grad_norm": 0.7658734321594238, + "learning_rate": 9.67098046922954e-07, + "loss": 2.9192, + "step": 59650 + }, + { + "epoch": 2.92, + "grad_norm": 0.7385966777801514, + "learning_rate": 9.658631971269704e-07, + "loss": 2.8873, + "step": 59651 + }, + { + "epoch": 2.92, + "grad_norm": 0.7445787787437439, + "learning_rate": 9.6462913492934e-07, + "loss": 2.6811, + "step": 59652 + }, + { + "epoch": 2.92, + "grad_norm": 0.7418826818466187, + "learning_rate": 9.63395860333327e-07, + "loss": 2.8437, + "step": 59653 + }, + { + "epoch": 2.92, + "grad_norm": 0.7458414435386658, + "learning_rate": 9.621633733421953e-07, + "loss": 2.7761, + "step": 59654 + }, + { + "epoch": 2.92, + "grad_norm": 0.7673033475875854, + "learning_rate": 9.609316739591754e-07, + "loss": 2.9863, + "step": 59655 + }, + { + "epoch": 2.92, + "grad_norm": 0.7251089811325073, + "learning_rate": 9.597007621874985e-07, + "loss": 3.0085, + "step": 59656 + }, + { + "epoch": 2.92, + "grad_norm": 0.7382851243019104, + "learning_rate": 9.584706380304286e-07, + "loss": 2.6704, + "step": 59657 + }, + { + "epoch": 2.92, + "grad_norm": 0.7357835173606873, + "learning_rate": 9.572413014911628e-07, + "loss": 2.9542, + "step": 59658 + }, + { + "epoch": 2.92, + "grad_norm": 0.7621243596076965, + "learning_rate": 9.560127525730321e-07, + "loss": 2.9173, + "step": 59659 + }, + { + "epoch": 2.92, + "grad_norm": 0.8021424412727356, + "learning_rate": 9.547849912791673e-07, + "loss": 2.9653, + "step": 59660 + }, + { + "epoch": 2.92, + "grad_norm": 0.7782572507858276, + "learning_rate": 9.535580176128654e-07, + "loss": 2.9197, + "step": 59661 + }, + { + "epoch": 2.92, + "grad_norm": 0.7518317103385925, + "learning_rate": 9.523318315773242e-07, + "loss": 2.8542, + "step": 59662 + }, + { + "epoch": 2.92, + "grad_norm": 0.7433437705039978, + "learning_rate": 9.511064331757745e-07, + "loss": 2.9361, + "step": 59663 + }, + { + "epoch": 2.92, + "grad_norm": 0.7323741912841797, + "learning_rate": 9.498818224114802e-07, + "loss": 2.8354, + "step": 59664 + }, + { + "epoch": 2.92, + "grad_norm": 0.7862437963485718, + "learning_rate": 9.486579992876386e-07, + "loss": 2.9296, + "step": 59665 + }, + { + "epoch": 2.92, + "grad_norm": 0.729877769947052, + "learning_rate": 9.474349638074808e-07, + "loss": 2.6364, + "step": 59666 + }, + { + "epoch": 2.92, + "grad_norm": 0.7364502549171448, + "learning_rate": 9.462127159742372e-07, + "loss": 2.9278, + "step": 59667 + }, + { + "epoch": 2.92, + "grad_norm": 0.748285174369812, + "learning_rate": 9.449912557911055e-07, + "loss": 2.8261, + "step": 59668 + }, + { + "epoch": 2.92, + "grad_norm": 0.7488347887992859, + "learning_rate": 9.437705832613163e-07, + "loss": 3.0538, + "step": 59669 + }, + { + "epoch": 2.92, + "grad_norm": 0.7590417265892029, + "learning_rate": 9.425506983880671e-07, + "loss": 2.8349, + "step": 59670 + }, + { + "epoch": 2.92, + "grad_norm": 0.7655706405639648, + "learning_rate": 9.413316011745886e-07, + "loss": 2.8614, + "step": 59671 + }, + { + "epoch": 2.92, + "grad_norm": 0.7407287955284119, + "learning_rate": 9.401132916240784e-07, + "loss": 3.0396, + "step": 59672 + }, + { + "epoch": 2.92, + "grad_norm": 0.7478367686271667, + "learning_rate": 9.388957697397669e-07, + "loss": 2.9103, + "step": 59673 + }, + { + "epoch": 2.92, + "grad_norm": 0.8015658259391785, + "learning_rate": 9.37679035524852e-07, + "loss": 2.7994, + "step": 59674 + }, + { + "epoch": 2.92, + "grad_norm": 0.748598039150238, + "learning_rate": 9.364630889825309e-07, + "loss": 2.7964, + "step": 59675 + }, + { + "epoch": 2.92, + "grad_norm": 0.7474225759506226, + "learning_rate": 9.352479301160009e-07, + "loss": 2.935, + "step": 59676 + }, + { + "epoch": 2.92, + "grad_norm": 0.7985930442810059, + "learning_rate": 9.340335589284931e-07, + "loss": 2.9996, + "step": 59677 + }, + { + "epoch": 2.92, + "grad_norm": 0.739578127861023, + "learning_rate": 9.328199754231713e-07, + "loss": 2.8553, + "step": 59678 + }, + { + "epoch": 2.92, + "grad_norm": 0.745047926902771, + "learning_rate": 9.316071796032331e-07, + "loss": 2.7658, + "step": 59679 + }, + { + "epoch": 2.92, + "grad_norm": 0.7639679312705994, + "learning_rate": 9.303951714719093e-07, + "loss": 2.7573, + "step": 59680 + }, + { + "epoch": 2.92, + "grad_norm": 0.8055102229118347, + "learning_rate": 9.29183951032364e-07, + "loss": 2.7784, + "step": 59681 + }, + { + "epoch": 2.92, + "grad_norm": 0.717705488204956, + "learning_rate": 9.279735182877613e-07, + "loss": 2.8754, + "step": 59682 + }, + { + "epoch": 2.92, + "grad_norm": 0.7145783305168152, + "learning_rate": 9.267638732413651e-07, + "loss": 2.9277, + "step": 59683 + }, + { + "epoch": 2.92, + "grad_norm": 0.7795868515968323, + "learning_rate": 9.255550158962733e-07, + "loss": 2.8612, + "step": 59684 + }, + { + "epoch": 2.93, + "grad_norm": 0.7477320432662964, + "learning_rate": 9.243469462557163e-07, + "loss": 2.8429, + "step": 59685 + }, + { + "epoch": 2.93, + "grad_norm": 0.7685227990150452, + "learning_rate": 9.231396643228917e-07, + "loss": 2.7086, + "step": 59686 + }, + { + "epoch": 2.93, + "grad_norm": 0.742152214050293, + "learning_rate": 9.219331701009635e-07, + "loss": 2.9458, + "step": 59687 + }, + { + "epoch": 2.93, + "grad_norm": 0.7278732657432556, + "learning_rate": 9.207274635930961e-07, + "loss": 2.8947, + "step": 59688 + }, + { + "epoch": 2.93, + "grad_norm": 0.7534875273704529, + "learning_rate": 9.195225448024535e-07, + "loss": 2.974, + "step": 59689 + }, + { + "epoch": 2.93, + "grad_norm": 0.726291835308075, + "learning_rate": 9.183184137322664e-07, + "loss": 2.9126, + "step": 59690 + }, + { + "epoch": 2.93, + "grad_norm": 0.746126651763916, + "learning_rate": 9.171150703856323e-07, + "loss": 2.9024, + "step": 59691 + }, + { + "epoch": 2.93, + "grad_norm": 0.7456505298614502, + "learning_rate": 9.15912514765782e-07, + "loss": 3.0167, + "step": 59692 + }, + { + "epoch": 2.93, + "grad_norm": 0.7747290134429932, + "learning_rate": 9.147107468758464e-07, + "loss": 2.956, + "step": 59693 + }, + { + "epoch": 2.93, + "grad_norm": 0.7368700504302979, + "learning_rate": 9.135097667189895e-07, + "loss": 2.9151, + "step": 59694 + }, + { + "epoch": 2.93, + "grad_norm": 0.7123827934265137, + "learning_rate": 9.123095742984088e-07, + "loss": 3.1211, + "step": 59695 + }, + { + "epoch": 2.93, + "grad_norm": 0.7461838722229004, + "learning_rate": 9.111101696172351e-07, + "loss": 2.9004, + "step": 59696 + }, + { + "epoch": 2.93, + "grad_norm": 0.7076005935668945, + "learning_rate": 9.099115526786327e-07, + "loss": 2.8209, + "step": 59697 + }, + { + "epoch": 2.93, + "grad_norm": 0.7806337475776672, + "learning_rate": 9.087137234857656e-07, + "loss": 2.6983, + "step": 59698 + }, + { + "epoch": 2.93, + "grad_norm": 0.7522020936012268, + "learning_rate": 9.075166820417978e-07, + "loss": 2.8614, + "step": 59699 + }, + { + "epoch": 2.93, + "grad_norm": 0.7688117623329163, + "learning_rate": 9.063204283498604e-07, + "loss": 2.8533, + "step": 59700 + }, + { + "epoch": 2.93, + "grad_norm": 0.7308453917503357, + "learning_rate": 9.051249624131174e-07, + "loss": 2.8086, + "step": 59701 + }, + { + "epoch": 2.93, + "grad_norm": 0.7483466267585754, + "learning_rate": 9.039302842346996e-07, + "loss": 2.9024, + "step": 59702 + }, + { + "epoch": 2.93, + "grad_norm": 0.7657617926597595, + "learning_rate": 9.027363938178045e-07, + "loss": 2.7308, + "step": 59703 + }, + { + "epoch": 2.93, + "grad_norm": 0.7511147856712341, + "learning_rate": 9.015432911654962e-07, + "loss": 2.9577, + "step": 59704 + }, + { + "epoch": 2.93, + "grad_norm": 0.7356677651405334, + "learning_rate": 9.003509762809724e-07, + "loss": 2.991, + "step": 59705 + }, + { + "epoch": 2.93, + "grad_norm": 0.7413914203643799, + "learning_rate": 8.991594491673637e-07, + "loss": 3.2518, + "step": 59706 + }, + { + "epoch": 2.93, + "grad_norm": 0.7317723631858826, + "learning_rate": 8.97968709827801e-07, + "loss": 2.9233, + "step": 59707 + }, + { + "epoch": 2.93, + "grad_norm": 0.7266475558280945, + "learning_rate": 8.967787582654484e-07, + "loss": 2.9267, + "step": 59708 + }, + { + "epoch": 2.93, + "grad_norm": 0.725741446018219, + "learning_rate": 8.955895944834035e-07, + "loss": 2.8691, + "step": 59709 + }, + { + "epoch": 2.93, + "grad_norm": 0.7397601008415222, + "learning_rate": 8.94401218484797e-07, + "loss": 2.9947, + "step": 59710 + }, + { + "epoch": 2.93, + "grad_norm": 0.735898494720459, + "learning_rate": 8.932136302727933e-07, + "loss": 2.7562, + "step": 59711 + }, + { + "epoch": 2.93, + "grad_norm": 0.7711181044578552, + "learning_rate": 8.920268298504896e-07, + "loss": 2.9557, + "step": 59712 + }, + { + "epoch": 2.93, + "grad_norm": 0.8324527740478516, + "learning_rate": 8.908408172210169e-07, + "loss": 2.8636, + "step": 59713 + }, + { + "epoch": 2.93, + "grad_norm": 0.7208682894706726, + "learning_rate": 8.896555923875393e-07, + "loss": 2.7489, + "step": 59714 + }, + { + "epoch": 2.93, + "grad_norm": 0.7793273329734802, + "learning_rate": 8.884711553530876e-07, + "loss": 2.862, + "step": 59715 + }, + { + "epoch": 2.93, + "grad_norm": 0.7881961464881897, + "learning_rate": 8.872875061208928e-07, + "loss": 2.8456, + "step": 59716 + }, + { + "epoch": 2.93, + "grad_norm": 0.7748516798019409, + "learning_rate": 8.861046446939857e-07, + "loss": 3.0182, + "step": 59717 + }, + { + "epoch": 2.93, + "grad_norm": 0.7874419093132019, + "learning_rate": 8.849225710755304e-07, + "loss": 2.9961, + "step": 59718 + }, + { + "epoch": 2.93, + "grad_norm": 0.7328853011131287, + "learning_rate": 8.83741285268591e-07, + "loss": 3.0178, + "step": 59719 + }, + { + "epoch": 2.93, + "grad_norm": 0.7477407455444336, + "learning_rate": 8.825607872763319e-07, + "loss": 2.914, + "step": 59720 + }, + { + "epoch": 2.93, + "grad_norm": 0.7703258991241455, + "learning_rate": 8.813810771018503e-07, + "loss": 3.1145, + "step": 59721 + }, + { + "epoch": 2.93, + "grad_norm": 0.7548531889915466, + "learning_rate": 8.802021547482441e-07, + "loss": 3.0983, + "step": 59722 + }, + { + "epoch": 2.93, + "grad_norm": 0.7501577734947205, + "learning_rate": 8.790240202186105e-07, + "loss": 2.8297, + "step": 59723 + }, + { + "epoch": 2.93, + "grad_norm": 0.8373538255691528, + "learning_rate": 8.778466735160472e-07, + "loss": 2.8036, + "step": 59724 + }, + { + "epoch": 2.93, + "grad_norm": 0.7723232507705688, + "learning_rate": 8.76670114643685e-07, + "loss": 2.98, + "step": 59725 + }, + { + "epoch": 2.93, + "grad_norm": 0.7283118963241577, + "learning_rate": 8.754943436046214e-07, + "loss": 2.9201, + "step": 59726 + }, + { + "epoch": 2.93, + "grad_norm": 0.7659836411476135, + "learning_rate": 8.743193604019205e-07, + "loss": 2.7259, + "step": 59727 + }, + { + "epoch": 2.93, + "grad_norm": 0.7766382098197937, + "learning_rate": 8.731451650386801e-07, + "loss": 2.8198, + "step": 59728 + }, + { + "epoch": 2.93, + "grad_norm": 0.7227720618247986, + "learning_rate": 8.719717575180308e-07, + "loss": 3.1198, + "step": 59729 + }, + { + "epoch": 2.93, + "grad_norm": 0.7672495245933533, + "learning_rate": 8.707991378430034e-07, + "loss": 2.8466, + "step": 59730 + }, + { + "epoch": 2.93, + "grad_norm": 0.754241406917572, + "learning_rate": 8.696273060167624e-07, + "loss": 2.6828, + "step": 59731 + }, + { + "epoch": 2.93, + "grad_norm": 0.7628318667411804, + "learning_rate": 8.684562620423385e-07, + "loss": 3.0211, + "step": 59732 + }, + { + "epoch": 2.93, + "grad_norm": 0.788650393486023, + "learning_rate": 8.672860059228293e-07, + "loss": 2.8669, + "step": 59733 + }, + { + "epoch": 2.93, + "grad_norm": 0.7480198740959167, + "learning_rate": 8.661165376613321e-07, + "loss": 2.9814, + "step": 59734 + }, + { + "epoch": 2.93, + "grad_norm": 0.7865045666694641, + "learning_rate": 8.649478572609114e-07, + "loss": 2.9745, + "step": 59735 + }, + { + "epoch": 2.93, + "grad_norm": 0.7339010238647461, + "learning_rate": 8.637799647246646e-07, + "loss": 2.803, + "step": 59736 + }, + { + "epoch": 2.93, + "grad_norm": 0.6831942200660706, + "learning_rate": 8.626128600556226e-07, + "loss": 3.0673, + "step": 59737 + }, + { + "epoch": 2.93, + "grad_norm": 0.7489328980445862, + "learning_rate": 8.614465432568829e-07, + "loss": 2.8345, + "step": 59738 + }, + { + "epoch": 2.93, + "grad_norm": 0.8042427897453308, + "learning_rate": 8.602810143315431e-07, + "loss": 2.8714, + "step": 59739 + }, + { + "epoch": 2.93, + "grad_norm": 0.8107337355613708, + "learning_rate": 8.591162732826339e-07, + "loss": 2.9028, + "step": 59740 + }, + { + "epoch": 2.93, + "grad_norm": 0.7713976502418518, + "learning_rate": 8.579523201132866e-07, + "loss": 2.9202, + "step": 59741 + }, + { + "epoch": 2.93, + "grad_norm": 0.8133545517921448, + "learning_rate": 8.567891548264649e-07, + "loss": 2.8562, + "step": 59742 + }, + { + "epoch": 2.93, + "grad_norm": 0.8308982253074646, + "learning_rate": 8.556267774253001e-07, + "loss": 2.7826, + "step": 59743 + }, + { + "epoch": 2.93, + "grad_norm": 0.756340503692627, + "learning_rate": 8.544651879128894e-07, + "loss": 2.8316, + "step": 59744 + }, + { + "epoch": 2.93, + "grad_norm": 0.7788317203521729, + "learning_rate": 8.533043862921973e-07, + "loss": 2.8322, + "step": 59745 + }, + { + "epoch": 2.93, + "grad_norm": 0.735393226146698, + "learning_rate": 8.521443725663546e-07, + "loss": 2.8576, + "step": 59746 + }, + { + "epoch": 2.93, + "grad_norm": 0.7246791124343872, + "learning_rate": 8.509851467383588e-07, + "loss": 3.0481, + "step": 59747 + }, + { + "epoch": 2.93, + "grad_norm": 0.7521356344223022, + "learning_rate": 8.498267088113409e-07, + "loss": 2.7354, + "step": 59748 + }, + { + "epoch": 2.93, + "grad_norm": 0.803340494632721, + "learning_rate": 8.48669058788265e-07, + "loss": 2.8304, + "step": 59749 + }, + { + "epoch": 2.93, + "grad_norm": 0.7315862774848938, + "learning_rate": 8.47512196672262e-07, + "loss": 2.8125, + "step": 59750 + }, + { + "epoch": 2.93, + "grad_norm": 0.7708979845046997, + "learning_rate": 8.463561224662962e-07, + "loss": 2.8332, + "step": 59751 + }, + { + "epoch": 2.93, + "grad_norm": 0.7568145990371704, + "learning_rate": 8.452008361734985e-07, + "loss": 2.7078, + "step": 59752 + }, + { + "epoch": 2.93, + "grad_norm": 0.7380116581916809, + "learning_rate": 8.440463377968332e-07, + "loss": 2.9881, + "step": 59753 + }, + { + "epoch": 2.93, + "grad_norm": 0.7458478212356567, + "learning_rate": 8.428926273393977e-07, + "loss": 2.9401, + "step": 59754 + }, + { + "epoch": 2.93, + "grad_norm": 0.7849690914154053, + "learning_rate": 8.417397048041897e-07, + "loss": 2.8784, + "step": 59755 + }, + { + "epoch": 2.93, + "grad_norm": 0.8106399178504944, + "learning_rate": 8.405875701942732e-07, + "loss": 2.7235, + "step": 59756 + }, + { + "epoch": 2.93, + "grad_norm": 0.7472584247589111, + "learning_rate": 8.394362235126795e-07, + "loss": 3.1152, + "step": 59757 + }, + { + "epoch": 2.93, + "grad_norm": 0.7843793630599976, + "learning_rate": 8.382856647624392e-07, + "loss": 2.928, + "step": 59758 + }, + { + "epoch": 2.93, + "grad_norm": 0.7767054438591003, + "learning_rate": 8.371358939465833e-07, + "loss": 2.9102, + "step": 59759 + }, + { + "epoch": 2.93, + "grad_norm": 0.7358002066612244, + "learning_rate": 8.359869110681095e-07, + "loss": 2.8749, + "step": 59760 + }, + { + "epoch": 2.93, + "grad_norm": 0.768272340297699, + "learning_rate": 8.348387161301151e-07, + "loss": 2.8866, + "step": 59761 + }, + { + "epoch": 2.93, + "grad_norm": 0.7254506945610046, + "learning_rate": 8.336913091355313e-07, + "loss": 2.7688, + "step": 59762 + }, + { + "epoch": 2.93, + "grad_norm": 0.7313871383666992, + "learning_rate": 8.325446900874555e-07, + "loss": 2.6363, + "step": 59763 + }, + { + "epoch": 2.93, + "grad_norm": 0.793598473072052, + "learning_rate": 8.313988589888853e-07, + "loss": 2.9504, + "step": 59764 + }, + { + "epoch": 2.93, + "grad_norm": 0.73553466796875, + "learning_rate": 8.302538158428185e-07, + "loss": 2.9293, + "step": 59765 + }, + { + "epoch": 2.93, + "grad_norm": 0.765264093875885, + "learning_rate": 8.29109560652319e-07, + "loss": 3.1213, + "step": 59766 + }, + { + "epoch": 2.93, + "grad_norm": 0.7166268229484558, + "learning_rate": 8.279660934203181e-07, + "loss": 2.8594, + "step": 59767 + }, + { + "epoch": 2.93, + "grad_norm": 0.772975742816925, + "learning_rate": 8.268234141499131e-07, + "loss": 2.7602, + "step": 59768 + }, + { + "epoch": 2.93, + "grad_norm": 0.73553466796875, + "learning_rate": 8.256815228440683e-07, + "loss": 2.8789, + "step": 59769 + }, + { + "epoch": 2.93, + "grad_norm": 0.7122424244880676, + "learning_rate": 8.245404195058147e-07, + "loss": 2.7645, + "step": 59770 + }, + { + "epoch": 2.93, + "grad_norm": 0.6854143738746643, + "learning_rate": 8.234001041381166e-07, + "loss": 2.9399, + "step": 59771 + }, + { + "epoch": 2.93, + "grad_norm": 0.7698032855987549, + "learning_rate": 8.222605767440383e-07, + "loss": 3.0868, + "step": 59772 + }, + { + "epoch": 2.93, + "grad_norm": 0.7176466584205627, + "learning_rate": 8.211218373265105e-07, + "loss": 2.824, + "step": 59773 + }, + { + "epoch": 2.93, + "grad_norm": 0.7282741665840149, + "learning_rate": 8.199838858885977e-07, + "loss": 2.9953, + "step": 59774 + }, + { + "epoch": 2.93, + "grad_norm": 0.8401013612747192, + "learning_rate": 8.18846722433264e-07, + "loss": 2.7525, + "step": 59775 + }, + { + "epoch": 2.93, + "grad_norm": 0.7844291925430298, + "learning_rate": 8.177103469635071e-07, + "loss": 2.899, + "step": 59776 + }, + { + "epoch": 2.93, + "grad_norm": 0.7597390413284302, + "learning_rate": 8.165747594823247e-07, + "loss": 2.7735, + "step": 59777 + }, + { + "epoch": 2.93, + "grad_norm": 0.7756280899047852, + "learning_rate": 8.154399599926809e-07, + "loss": 2.8091, + "step": 59778 + }, + { + "epoch": 2.93, + "grad_norm": 0.7339857816696167, + "learning_rate": 8.143059484976066e-07, + "loss": 2.9528, + "step": 59779 + }, + { + "epoch": 2.93, + "grad_norm": 0.764804482460022, + "learning_rate": 8.131727250000663e-07, + "loss": 2.8768, + "step": 59780 + }, + { + "epoch": 2.93, + "grad_norm": 0.8054780960083008, + "learning_rate": 8.120402895030908e-07, + "loss": 2.5517, + "step": 59781 + }, + { + "epoch": 2.93, + "grad_norm": 0.7895975708961487, + "learning_rate": 8.109086420095779e-07, + "loss": 2.8077, + "step": 59782 + }, + { + "epoch": 2.93, + "grad_norm": 0.7449101805686951, + "learning_rate": 8.097777825225582e-07, + "loss": 2.8612, + "step": 59783 + }, + { + "epoch": 2.93, + "grad_norm": 0.7465765476226807, + "learning_rate": 8.086477110449962e-07, + "loss": 2.7231, + "step": 59784 + }, + { + "epoch": 2.93, + "grad_norm": 0.7886302471160889, + "learning_rate": 8.075184275798896e-07, + "loss": 2.8589, + "step": 59785 + }, + { + "epoch": 2.93, + "grad_norm": 0.7831330895423889, + "learning_rate": 8.063899321302025e-07, + "loss": 2.7825, + "step": 59786 + }, + { + "epoch": 2.93, + "grad_norm": 0.7341232895851135, + "learning_rate": 8.052622246988993e-07, + "loss": 2.8403, + "step": 59787 + }, + { + "epoch": 2.93, + "grad_norm": 0.7725659608840942, + "learning_rate": 8.041353052889443e-07, + "loss": 3.0116, + "step": 59788 + }, + { + "epoch": 2.93, + "grad_norm": 0.7446717023849487, + "learning_rate": 8.030091739033018e-07, + "loss": 2.9361, + "step": 59789 + }, + { + "epoch": 2.93, + "grad_norm": 0.7172529697418213, + "learning_rate": 8.018838305450026e-07, + "loss": 2.827, + "step": 59790 + }, + { + "epoch": 2.93, + "grad_norm": 0.7514923810958862, + "learning_rate": 8.007592752169112e-07, + "loss": 2.8141, + "step": 59791 + }, + { + "epoch": 2.93, + "grad_norm": 0.7799364924430847, + "learning_rate": 7.996355079220585e-07, + "loss": 2.9459, + "step": 59792 + }, + { + "epoch": 2.93, + "grad_norm": 0.801102876663208, + "learning_rate": 7.985125286633753e-07, + "loss": 2.6798, + "step": 59793 + }, + { + "epoch": 2.93, + "grad_norm": 0.7336550354957581, + "learning_rate": 7.973903374438262e-07, + "loss": 3.0554, + "step": 59794 + }, + { + "epoch": 2.93, + "grad_norm": 0.7697311043739319, + "learning_rate": 7.962689342663753e-07, + "loss": 2.8884, + "step": 59795 + }, + { + "epoch": 2.93, + "grad_norm": 0.7298890948295593, + "learning_rate": 7.951483191339869e-07, + "loss": 2.6709, + "step": 59796 + }, + { + "epoch": 2.93, + "grad_norm": 0.7298159003257751, + "learning_rate": 7.940284920495588e-07, + "loss": 2.8218, + "step": 59797 + }, + { + "epoch": 2.93, + "grad_norm": 0.7589902281761169, + "learning_rate": 7.929094530161217e-07, + "loss": 2.8853, + "step": 59798 + }, + { + "epoch": 2.93, + "grad_norm": 0.7164919972419739, + "learning_rate": 7.917912020365402e-07, + "loss": 2.8939, + "step": 59799 + }, + { + "epoch": 2.93, + "grad_norm": 0.7424035668373108, + "learning_rate": 7.906737391137785e-07, + "loss": 2.9849, + "step": 59800 + }, + { + "epoch": 2.93, + "grad_norm": 0.7874431014060974, + "learning_rate": 7.895570642508342e-07, + "loss": 2.6638, + "step": 59801 + }, + { + "epoch": 2.93, + "grad_norm": 0.7526593804359436, + "learning_rate": 7.88441177450605e-07, + "loss": 2.9343, + "step": 59802 + }, + { + "epoch": 2.93, + "grad_norm": 0.7278794050216675, + "learning_rate": 7.873260787160551e-07, + "loss": 2.8957, + "step": 59803 + }, + { + "epoch": 2.93, + "grad_norm": 0.7460549473762512, + "learning_rate": 7.862117680500823e-07, + "loss": 2.8864, + "step": 59804 + }, + { + "epoch": 2.93, + "grad_norm": 0.7620648145675659, + "learning_rate": 7.850982454556509e-07, + "loss": 2.8314, + "step": 59805 + }, + { + "epoch": 2.93, + "grad_norm": 0.727797269821167, + "learning_rate": 7.839855109356585e-07, + "loss": 2.9278, + "step": 59806 + }, + { + "epoch": 2.93, + "grad_norm": 0.7091959714889526, + "learning_rate": 7.828735644931028e-07, + "loss": 2.9797, + "step": 59807 + }, + { + "epoch": 2.93, + "grad_norm": 0.7799173593521118, + "learning_rate": 7.81762406130848e-07, + "loss": 2.9589, + "step": 59808 + }, + { + "epoch": 2.93, + "grad_norm": 0.7307581901550293, + "learning_rate": 7.806520358518587e-07, + "loss": 2.7646, + "step": 59809 + }, + { + "epoch": 2.93, + "grad_norm": 0.8110495805740356, + "learning_rate": 7.795424536590322e-07, + "loss": 2.9862, + "step": 59810 + }, + { + "epoch": 2.93, + "grad_norm": 0.7516154646873474, + "learning_rate": 7.784336595552998e-07, + "loss": 3.0308, + "step": 59811 + }, + { + "epoch": 2.93, + "grad_norm": 0.7222862839698792, + "learning_rate": 7.773256535436257e-07, + "loss": 2.7501, + "step": 59812 + }, + { + "epoch": 2.93, + "grad_norm": 0.7540051937103271, + "learning_rate": 7.762184356268408e-07, + "loss": 2.8516, + "step": 59813 + }, + { + "epoch": 2.93, + "grad_norm": 0.7463213205337524, + "learning_rate": 7.751120058079429e-07, + "loss": 2.9547, + "step": 59814 + }, + { + "epoch": 2.93, + "grad_norm": 0.7319959402084351, + "learning_rate": 7.740063640897964e-07, + "loss": 2.8742, + "step": 59815 + }, + { + "epoch": 2.93, + "grad_norm": 0.7268669605255127, + "learning_rate": 7.729015104753322e-07, + "loss": 2.9301, + "step": 59816 + }, + { + "epoch": 2.93, + "grad_norm": 0.7571326494216919, + "learning_rate": 7.71797444967448e-07, + "loss": 2.7898, + "step": 59817 + }, + { + "epoch": 2.93, + "grad_norm": 0.7747343182563782, + "learning_rate": 7.706941675690747e-07, + "loss": 3.0083, + "step": 59818 + }, + { + "epoch": 2.93, + "grad_norm": 0.7545844912528992, + "learning_rate": 7.695916782831102e-07, + "loss": 2.9409, + "step": 59819 + }, + { + "epoch": 2.93, + "grad_norm": 0.7999106049537659, + "learning_rate": 7.68489977112452e-07, + "loss": 2.7857, + "step": 59820 + }, + { + "epoch": 2.93, + "grad_norm": 0.7516934275627136, + "learning_rate": 7.673890640599978e-07, + "loss": 2.9531, + "step": 59821 + }, + { + "epoch": 2.93, + "grad_norm": 0.73682701587677, + "learning_rate": 7.662889391286453e-07, + "loss": 2.9481, + "step": 59822 + }, + { + "epoch": 2.93, + "grad_norm": 0.7407304048538208, + "learning_rate": 7.651896023213255e-07, + "loss": 2.8385, + "step": 59823 + }, + { + "epoch": 2.93, + "grad_norm": 0.8045844435691833, + "learning_rate": 7.640910536408695e-07, + "loss": 2.9524, + "step": 59824 + }, + { + "epoch": 2.93, + "grad_norm": 0.7355616092681885, + "learning_rate": 7.629932930902416e-07, + "loss": 2.7532, + "step": 59825 + }, + { + "epoch": 2.93, + "grad_norm": 0.8066126704216003, + "learning_rate": 7.618963206722728e-07, + "loss": 2.9467, + "step": 59826 + }, + { + "epoch": 2.93, + "grad_norm": 0.8547759652137756, + "learning_rate": 7.608001363899274e-07, + "loss": 2.8454, + "step": 59827 + }, + { + "epoch": 2.93, + "grad_norm": 0.77346271276474, + "learning_rate": 7.597047402460033e-07, + "loss": 2.6539, + "step": 59828 + }, + { + "epoch": 2.93, + "grad_norm": 0.7750498652458191, + "learning_rate": 7.586101322434313e-07, + "loss": 2.8954, + "step": 59829 + }, + { + "epoch": 2.93, + "grad_norm": 0.7115117907524109, + "learning_rate": 7.575163123850758e-07, + "loss": 2.933, + "step": 59830 + }, + { + "epoch": 2.93, + "grad_norm": 0.7660773992538452, + "learning_rate": 7.564232806738679e-07, + "loss": 3.1174, + "step": 59831 + }, + { + "epoch": 2.93, + "grad_norm": 0.7857538461685181, + "learning_rate": 7.553310371126053e-07, + "loss": 2.6361, + "step": 59832 + }, + { + "epoch": 2.93, + "grad_norm": 0.789605438709259, + "learning_rate": 7.542395817042523e-07, + "loss": 2.6682, + "step": 59833 + }, + { + "epoch": 2.93, + "grad_norm": 0.7484418153762817, + "learning_rate": 7.5314891445164e-07, + "loss": 2.6815, + "step": 59834 + }, + { + "epoch": 2.93, + "grad_norm": 0.7630303502082825, + "learning_rate": 7.520590353575994e-07, + "loss": 2.6285, + "step": 59835 + }, + { + "epoch": 2.93, + "grad_norm": 0.8174473643302917, + "learning_rate": 7.509699444250949e-07, + "loss": 2.7668, + "step": 59836 + }, + { + "epoch": 2.93, + "grad_norm": 0.791915774345398, + "learning_rate": 7.498816416569242e-07, + "loss": 2.9532, + "step": 59837 + }, + { + "epoch": 2.93, + "grad_norm": 0.7464105486869812, + "learning_rate": 7.487941270559516e-07, + "loss": 3.0629, + "step": 59838 + }, + { + "epoch": 2.93, + "grad_norm": 0.7361699938774109, + "learning_rate": 7.47707400625075e-07, + "loss": 2.7977, + "step": 59839 + }, + { + "epoch": 2.93, + "grad_norm": 0.7796388268470764, + "learning_rate": 7.466214623671585e-07, + "loss": 2.9098, + "step": 59840 + }, + { + "epoch": 2.93, + "grad_norm": 0.7234344482421875, + "learning_rate": 7.455363122850333e-07, + "loss": 2.918, + "step": 59841 + }, + { + "epoch": 2.93, + "grad_norm": 0.7764577269554138, + "learning_rate": 7.444519503815971e-07, + "loss": 2.8202, + "step": 59842 + }, + { + "epoch": 2.93, + "grad_norm": 0.7591822147369385, + "learning_rate": 7.43368376659681e-07, + "loss": 2.964, + "step": 59843 + }, + { + "epoch": 2.93, + "grad_norm": 0.6883577704429626, + "learning_rate": 7.422855911221159e-07, + "loss": 2.7551, + "step": 59844 + }, + { + "epoch": 2.93, + "grad_norm": 0.7411786913871765, + "learning_rate": 7.412035937717665e-07, + "loss": 2.8426, + "step": 59845 + }, + { + "epoch": 2.93, + "grad_norm": 0.7492934465408325, + "learning_rate": 7.4012238461153e-07, + "loss": 2.8528, + "step": 59846 + }, + { + "epoch": 2.93, + "grad_norm": 0.7484312057495117, + "learning_rate": 7.390419636442047e-07, + "loss": 2.831, + "step": 59847 + }, + { + "epoch": 2.93, + "grad_norm": 0.7504763007164001, + "learning_rate": 7.379623308726213e-07, + "loss": 2.8901, + "step": 59848 + }, + { + "epoch": 2.93, + "grad_norm": 1.2700088024139404, + "learning_rate": 7.368834862997109e-07, + "loss": 2.9359, + "step": 59849 + }, + { + "epoch": 2.93, + "grad_norm": 0.7634099125862122, + "learning_rate": 7.358054299282045e-07, + "loss": 2.8977, + "step": 59850 + }, + { + "epoch": 2.93, + "grad_norm": 0.7338115572929382, + "learning_rate": 7.347281617610001e-07, + "loss": 2.7591, + "step": 59851 + }, + { + "epoch": 2.93, + "grad_norm": 0.7483226656913757, + "learning_rate": 7.336516818009286e-07, + "loss": 2.9123, + "step": 59852 + }, + { + "epoch": 2.93, + "grad_norm": 0.7367458343505859, + "learning_rate": 7.32575990050821e-07, + "loss": 2.9619, + "step": 59853 + }, + { + "epoch": 2.93, + "grad_norm": 0.7470386624336243, + "learning_rate": 7.315010865135418e-07, + "loss": 2.7604, + "step": 59854 + }, + { + "epoch": 2.93, + "grad_norm": 0.7699258923530579, + "learning_rate": 7.304269711918553e-07, + "loss": 3.1449, + "step": 59855 + }, + { + "epoch": 2.93, + "grad_norm": 0.7648332118988037, + "learning_rate": 7.293536440886594e-07, + "loss": 2.8605, + "step": 59856 + }, + { + "epoch": 2.93, + "grad_norm": 0.7431793212890625, + "learning_rate": 7.282811052067183e-07, + "loss": 3.0201, + "step": 59857 + }, + { + "epoch": 2.93, + "grad_norm": 0.7206470966339111, + "learning_rate": 7.272093545489299e-07, + "loss": 2.9679, + "step": 59858 + }, + { + "epoch": 2.93, + "grad_norm": 0.7647475600242615, + "learning_rate": 7.261383921180253e-07, + "loss": 3.2728, + "step": 59859 + }, + { + "epoch": 2.93, + "grad_norm": 0.7028852701187134, + "learning_rate": 7.250682179169021e-07, + "loss": 2.8486, + "step": 59860 + }, + { + "epoch": 2.93, + "grad_norm": 0.7771363854408264, + "learning_rate": 7.239988319483581e-07, + "loss": 2.6647, + "step": 59861 + }, + { + "epoch": 2.93, + "grad_norm": 0.802079975605011, + "learning_rate": 7.22930234215191e-07, + "loss": 2.7629, + "step": 59862 + }, + { + "epoch": 2.93, + "grad_norm": 0.7468277812004089, + "learning_rate": 7.21862424720232e-07, + "loss": 2.5919, + "step": 59863 + }, + { + "epoch": 2.93, + "grad_norm": 0.7966805696487427, + "learning_rate": 7.207954034663122e-07, + "loss": 2.8663, + "step": 59864 + }, + { + "epoch": 2.93, + "grad_norm": 0.7449951767921448, + "learning_rate": 7.197291704561959e-07, + "loss": 2.8764, + "step": 59865 + }, + { + "epoch": 2.93, + "grad_norm": 0.7162653803825378, + "learning_rate": 7.186637256927141e-07, + "loss": 2.9887, + "step": 59866 + }, + { + "epoch": 2.93, + "grad_norm": 0.762417733669281, + "learning_rate": 7.175990691786981e-07, + "loss": 2.6655, + "step": 59867 + }, + { + "epoch": 2.93, + "grad_norm": 0.7331001162528992, + "learning_rate": 7.165352009169123e-07, + "loss": 2.7909, + "step": 59868 + }, + { + "epoch": 2.93, + "grad_norm": 0.8558750152587891, + "learning_rate": 7.154721209101544e-07, + "loss": 2.9628, + "step": 59869 + }, + { + "epoch": 2.93, + "grad_norm": 0.7514426112174988, + "learning_rate": 7.144098291612888e-07, + "loss": 2.8311, + "step": 59870 + }, + { + "epoch": 2.93, + "grad_norm": 0.7667511701583862, + "learning_rate": 7.133483256730465e-07, + "loss": 2.8383, + "step": 59871 + }, + { + "epoch": 2.93, + "grad_norm": 0.7563773393630981, + "learning_rate": 7.122876104482256e-07, + "loss": 2.8637, + "step": 59872 + }, + { + "epoch": 2.93, + "grad_norm": 0.7660837173461914, + "learning_rate": 7.112276834896568e-07, + "loss": 2.8503, + "step": 59873 + }, + { + "epoch": 2.93, + "grad_norm": 0.7688391804695129, + "learning_rate": 7.101685448001382e-07, + "loss": 2.9298, + "step": 59874 + }, + { + "epoch": 2.93, + "grad_norm": 0.7841348052024841, + "learning_rate": 7.091101943824007e-07, + "loss": 3.0863, + "step": 59875 + }, + { + "epoch": 2.93, + "grad_norm": 0.748285174369812, + "learning_rate": 7.080526322392754e-07, + "loss": 2.9875, + "step": 59876 + }, + { + "epoch": 2.93, + "grad_norm": 0.7872505784034729, + "learning_rate": 7.069958583735602e-07, + "loss": 3.2147, + "step": 59877 + }, + { + "epoch": 2.93, + "grad_norm": 0.7570214867591858, + "learning_rate": 7.059398727880194e-07, + "loss": 3.0622, + "step": 59878 + }, + { + "epoch": 2.93, + "grad_norm": 0.9395710825920105, + "learning_rate": 7.048846754853843e-07, + "loss": 3.0173, + "step": 59879 + }, + { + "epoch": 2.93, + "grad_norm": 0.7911810874938965, + "learning_rate": 7.038302664685192e-07, + "loss": 2.8858, + "step": 59880 + }, + { + "epoch": 2.93, + "grad_norm": 0.7433484792709351, + "learning_rate": 7.027766457401551e-07, + "loss": 2.9579, + "step": 59881 + }, + { + "epoch": 2.93, + "grad_norm": 0.7412406206130981, + "learning_rate": 7.017238133030901e-07, + "loss": 2.8247, + "step": 59882 + }, + { + "epoch": 2.93, + "grad_norm": 0.742946207523346, + "learning_rate": 7.006717691600883e-07, + "loss": 2.6349, + "step": 59883 + }, + { + "epoch": 2.93, + "grad_norm": 0.7795896530151367, + "learning_rate": 6.99620513313881e-07, + "loss": 2.8827, + "step": 59884 + }, + { + "epoch": 2.93, + "grad_norm": 0.7590773701667786, + "learning_rate": 6.985700457672993e-07, + "loss": 2.8935, + "step": 59885 + }, + { + "epoch": 2.93, + "grad_norm": 0.7604755163192749, + "learning_rate": 6.975203665230744e-07, + "loss": 3.0704, + "step": 59886 + }, + { + "epoch": 2.93, + "grad_norm": 0.7391711473464966, + "learning_rate": 6.964714755839707e-07, + "loss": 2.9104, + "step": 59887 + }, + { + "epoch": 2.93, + "grad_norm": 0.7480840682983398, + "learning_rate": 6.954233729527859e-07, + "loss": 2.9565, + "step": 59888 + }, + { + "epoch": 2.94, + "grad_norm": 0.7480906844139099, + "learning_rate": 6.943760586322178e-07, + "loss": 2.8197, + "step": 59889 + }, + { + "epoch": 2.94, + "grad_norm": 0.7429905533790588, + "learning_rate": 6.933295326250976e-07, + "loss": 2.9929, + "step": 59890 + }, + { + "epoch": 2.94, + "grad_norm": 0.8021376132965088, + "learning_rate": 6.922837949341231e-07, + "loss": 2.9372, + "step": 59891 + }, + { + "epoch": 2.94, + "grad_norm": 0.8098527193069458, + "learning_rate": 6.912388455620588e-07, + "loss": 3.0109, + "step": 59892 + }, + { + "epoch": 2.94, + "grad_norm": 0.7434927225112915, + "learning_rate": 6.901946845117023e-07, + "loss": 2.8893, + "step": 59893 + }, + { + "epoch": 2.94, + "grad_norm": 0.7502673864364624, + "learning_rate": 6.891513117857517e-07, + "loss": 2.8706, + "step": 59894 + }, + { + "epoch": 2.94, + "grad_norm": 0.7409077882766724, + "learning_rate": 6.881087273869712e-07, + "loss": 2.8049, + "step": 59895 + }, + { + "epoch": 2.94, + "grad_norm": 0.7963376045227051, + "learning_rate": 6.870669313180921e-07, + "loss": 2.9638, + "step": 59896 + }, + { + "epoch": 2.94, + "grad_norm": 0.7674747705459595, + "learning_rate": 6.860259235818788e-07, + "loss": 2.8135, + "step": 59897 + }, + { + "epoch": 2.94, + "grad_norm": 0.8309979438781738, + "learning_rate": 6.849857041810958e-07, + "loss": 2.8022, + "step": 59898 + }, + { + "epoch": 2.94, + "grad_norm": 0.7403390407562256, + "learning_rate": 6.839462731184409e-07, + "loss": 2.8496, + "step": 59899 + }, + { + "epoch": 2.94, + "grad_norm": 0.7429600954055786, + "learning_rate": 6.829076303966785e-07, + "loss": 3.2261, + "step": 59900 + }, + { + "epoch": 2.94, + "grad_norm": 0.7548747658729553, + "learning_rate": 6.818697760185065e-07, + "loss": 2.8569, + "step": 59901 + }, + { + "epoch": 2.94, + "grad_norm": 0.7258870005607605, + "learning_rate": 6.808327099867228e-07, + "loss": 2.9454, + "step": 59902 + }, + { + "epoch": 2.94, + "grad_norm": 0.7824847102165222, + "learning_rate": 6.797964323039918e-07, + "loss": 3.0032, + "step": 59903 + }, + { + "epoch": 2.94, + "grad_norm": 0.769087016582489, + "learning_rate": 6.787609429730779e-07, + "loss": 2.9478, + "step": 59904 + }, + { + "epoch": 2.94, + "grad_norm": 0.7909684777259827, + "learning_rate": 6.777262419967122e-07, + "loss": 2.8437, + "step": 59905 + }, + { + "epoch": 2.94, + "grad_norm": 0.7738947868347168, + "learning_rate": 6.766923293775928e-07, + "loss": 2.8174, + "step": 59906 + }, + { + "epoch": 2.94, + "grad_norm": 0.7569680213928223, + "learning_rate": 6.756592051184507e-07, + "loss": 2.7229, + "step": 59907 + }, + { + "epoch": 2.94, + "grad_norm": 0.7944665551185608, + "learning_rate": 6.746268692220502e-07, + "loss": 2.8974, + "step": 59908 + }, + { + "epoch": 2.94, + "grad_norm": 0.7685111165046692, + "learning_rate": 6.735953216910561e-07, + "loss": 2.7728, + "step": 59909 + }, + { + "epoch": 2.94, + "grad_norm": 0.7883465886116028, + "learning_rate": 6.725645625281995e-07, + "loss": 2.8538, + "step": 59910 + }, + { + "epoch": 2.94, + "grad_norm": 0.7175097465515137, + "learning_rate": 6.715345917362113e-07, + "loss": 2.7597, + "step": 59911 + }, + { + "epoch": 2.94, + "grad_norm": 0.8032987713813782, + "learning_rate": 6.705054093177897e-07, + "loss": 2.6333, + "step": 59912 + }, + { + "epoch": 2.94, + "grad_norm": 0.8348165154457092, + "learning_rate": 6.694770152756657e-07, + "loss": 2.7465, + "step": 59913 + }, + { + "epoch": 2.94, + "grad_norm": 0.78910893201828, + "learning_rate": 6.684494096125037e-07, + "loss": 2.8535, + "step": 59914 + }, + { + "epoch": 2.94, + "grad_norm": 0.7422067523002625, + "learning_rate": 6.674225923310683e-07, + "loss": 2.9404, + "step": 59915 + }, + { + "epoch": 2.94, + "grad_norm": 0.8071993589401245, + "learning_rate": 6.66396563434024e-07, + "loss": 3.0353, + "step": 59916 + }, + { + "epoch": 2.94, + "grad_norm": 0.7702639698982239, + "learning_rate": 6.653713229240686e-07, + "loss": 3.0289, + "step": 59917 + }, + { + "epoch": 2.94, + "grad_norm": 0.7507279515266418, + "learning_rate": 6.643468708039335e-07, + "loss": 2.84, + "step": 59918 + }, + { + "epoch": 2.94, + "grad_norm": 0.7871813178062439, + "learning_rate": 6.633232070762828e-07, + "loss": 2.923, + "step": 59919 + }, + { + "epoch": 2.94, + "grad_norm": 0.7500841617584229, + "learning_rate": 6.62300331743848e-07, + "loss": 2.9329, + "step": 59920 + }, + { + "epoch": 2.94, + "grad_norm": 0.7141773104667664, + "learning_rate": 6.612782448092935e-07, + "loss": 3.0867, + "step": 59921 + }, + { + "epoch": 2.94, + "grad_norm": 0.7696955800056458, + "learning_rate": 6.602569462753171e-07, + "loss": 2.8553, + "step": 59922 + }, + { + "epoch": 2.94, + "grad_norm": 0.7573153376579285, + "learning_rate": 6.5923643614465e-07, + "loss": 2.8073, + "step": 59923 + }, + { + "epoch": 2.94, + "grad_norm": 0.7639854550361633, + "learning_rate": 6.582167144198902e-07, + "loss": 2.7228, + "step": 59924 + }, + { + "epoch": 2.94, + "grad_norm": 0.7519087195396423, + "learning_rate": 6.571977811038354e-07, + "loss": 2.9471, + "step": 59925 + }, + { + "epoch": 2.94, + "grad_norm": 0.7760326266288757, + "learning_rate": 6.561796361990834e-07, + "loss": 2.9599, + "step": 59926 + }, + { + "epoch": 2.94, + "grad_norm": 0.791580855846405, + "learning_rate": 6.551622797083322e-07, + "loss": 2.843, + "step": 59927 + }, + { + "epoch": 2.94, + "grad_norm": 0.7381553053855896, + "learning_rate": 6.541457116342797e-07, + "loss": 2.8464, + "step": 59928 + }, + { + "epoch": 2.94, + "grad_norm": 0.7461539506912231, + "learning_rate": 6.531299319796234e-07, + "loss": 2.7977, + "step": 59929 + }, + { + "epoch": 2.94, + "grad_norm": 0.7161452174186707, + "learning_rate": 6.52114940746995e-07, + "loss": 3.0583, + "step": 59930 + }, + { + "epoch": 2.94, + "grad_norm": 0.7718291878700256, + "learning_rate": 6.51100737939092e-07, + "loss": 2.8127, + "step": 59931 + }, + { + "epoch": 2.94, + "grad_norm": 0.8032710552215576, + "learning_rate": 6.500873235585458e-07, + "loss": 2.8657, + "step": 59932 + }, + { + "epoch": 2.94, + "grad_norm": 0.8120545744895935, + "learning_rate": 6.490746976080874e-07, + "loss": 2.8896, + "step": 59933 + }, + { + "epoch": 2.94, + "grad_norm": 0.7528083920478821, + "learning_rate": 6.480628600903482e-07, + "loss": 2.6816, + "step": 59934 + }, + { + "epoch": 2.94, + "grad_norm": 0.7332311868667603, + "learning_rate": 6.470518110080259e-07, + "loss": 2.7482, + "step": 59935 + }, + { + "epoch": 2.94, + "grad_norm": 0.7682661414146423, + "learning_rate": 6.460415503637184e-07, + "loss": 2.9757, + "step": 59936 + }, + { + "epoch": 2.94, + "grad_norm": 0.7852498292922974, + "learning_rate": 6.45032078160157e-07, + "loss": 2.8943, + "step": 59937 + }, + { + "epoch": 2.94, + "grad_norm": 0.7649948596954346, + "learning_rate": 6.440233943999396e-07, + "loss": 3.1078, + "step": 59938 + }, + { + "epoch": 2.94, + "grad_norm": 0.7823342084884644, + "learning_rate": 6.430154990857639e-07, + "loss": 2.9817, + "step": 59939 + }, + { + "epoch": 2.94, + "grad_norm": 0.755197286605835, + "learning_rate": 6.420083922202945e-07, + "loss": 2.9656, + "step": 59940 + }, + { + "epoch": 2.94, + "grad_norm": 0.7865545153617859, + "learning_rate": 6.410020738061294e-07, + "loss": 2.8751, + "step": 59941 + }, + { + "epoch": 2.94, + "grad_norm": 0.7866171598434448, + "learning_rate": 6.399965438459664e-07, + "loss": 2.8668, + "step": 59942 + }, + { + "epoch": 2.94, + "grad_norm": 0.7472968697547913, + "learning_rate": 6.389918023424367e-07, + "loss": 2.7944, + "step": 59943 + }, + { + "epoch": 2.94, + "grad_norm": 0.8500422835350037, + "learning_rate": 6.379878492982049e-07, + "loss": 2.9968, + "step": 59944 + }, + { + "epoch": 2.94, + "grad_norm": 0.7453992962837219, + "learning_rate": 6.369846847158689e-07, + "loss": 2.8663, + "step": 59945 + }, + { + "epoch": 2.94, + "grad_norm": 0.7211142182350159, + "learning_rate": 6.359823085981264e-07, + "loss": 2.7665, + "step": 59946 + }, + { + "epoch": 2.94, + "grad_norm": 0.7596474885940552, + "learning_rate": 6.349807209476088e-07, + "loss": 2.7561, + "step": 59947 + }, + { + "epoch": 2.94, + "grad_norm": 0.7471470832824707, + "learning_rate": 6.339799217668806e-07, + "loss": 2.7884, + "step": 59948 + }, + { + "epoch": 2.94, + "grad_norm": 0.7682774662971497, + "learning_rate": 6.329799110586731e-07, + "loss": 2.7054, + "step": 59949 + }, + { + "epoch": 2.94, + "grad_norm": 0.7464492321014404, + "learning_rate": 6.31980688825584e-07, + "loss": 2.7138, + "step": 59950 + }, + { + "epoch": 2.94, + "grad_norm": 0.6935986876487732, + "learning_rate": 6.309822550702448e-07, + "loss": 2.8231, + "step": 59951 + }, + { + "epoch": 2.94, + "grad_norm": 0.7591943740844727, + "learning_rate": 6.299846097952865e-07, + "loss": 2.8893, + "step": 59952 + }, + { + "epoch": 2.94, + "grad_norm": 0.7476579546928406, + "learning_rate": 6.289877530033072e-07, + "loss": 2.9504, + "step": 59953 + }, + { + "epoch": 2.94, + "grad_norm": 0.8381856083869934, + "learning_rate": 6.279916846969713e-07, + "loss": 2.689, + "step": 59954 + }, + { + "epoch": 2.94, + "grad_norm": 0.779679536819458, + "learning_rate": 6.2699640487891e-07, + "loss": 2.6562, + "step": 59955 + }, + { + "epoch": 2.94, + "grad_norm": 0.7561830282211304, + "learning_rate": 6.260019135517213e-07, + "loss": 2.7303, + "step": 59956 + }, + { + "epoch": 2.94, + "grad_norm": 0.764441728591919, + "learning_rate": 6.250082107180032e-07, + "loss": 2.9123, + "step": 59957 + }, + { + "epoch": 2.94, + "grad_norm": 0.7792513370513916, + "learning_rate": 6.240152963804202e-07, + "loss": 2.6153, + "step": 59958 + }, + { + "epoch": 2.94, + "grad_norm": 0.7713596820831299, + "learning_rate": 6.230231705415367e-07, + "loss": 3.1653, + "step": 59959 + }, + { + "epoch": 2.94, + "grad_norm": 0.7923070192337036, + "learning_rate": 6.220318332040175e-07, + "loss": 2.753, + "step": 59960 + }, + { + "epoch": 2.94, + "grad_norm": 0.7460241317749023, + "learning_rate": 6.210412843704605e-07, + "loss": 2.9271, + "step": 59961 + }, + { + "epoch": 2.94, + "grad_norm": 0.768636167049408, + "learning_rate": 6.2005152404343e-07, + "loss": 2.8513, + "step": 59962 + }, + { + "epoch": 2.94, + "grad_norm": 0.7382229566574097, + "learning_rate": 6.19062552225591e-07, + "loss": 2.8161, + "step": 59963 + }, + { + "epoch": 2.94, + "grad_norm": 0.7440184354782104, + "learning_rate": 6.180743689195078e-07, + "loss": 2.9522, + "step": 59964 + }, + { + "epoch": 2.94, + "grad_norm": 0.7206867337226868, + "learning_rate": 6.170869741278117e-07, + "loss": 2.7925, + "step": 59965 + }, + { + "epoch": 2.94, + "grad_norm": 0.7849396467208862, + "learning_rate": 6.161003678531006e-07, + "loss": 3.0001, + "step": 59966 + }, + { + "epoch": 2.94, + "grad_norm": 0.7822026610374451, + "learning_rate": 6.151145500979726e-07, + "loss": 3.0031, + "step": 59967 + }, + { + "epoch": 2.94, + "grad_norm": 0.7590682506561279, + "learning_rate": 6.141295208649921e-07, + "loss": 2.9123, + "step": 59968 + }, + { + "epoch": 2.94, + "grad_norm": 0.8161299824714661, + "learning_rate": 6.131452801567571e-07, + "loss": 2.7404, + "step": 59969 + }, + { + "epoch": 2.94, + "grad_norm": 0.7368728518486023, + "learning_rate": 6.121618279758989e-07, + "loss": 2.7845, + "step": 59970 + }, + { + "epoch": 2.94, + "grad_norm": 0.7626436352729797, + "learning_rate": 6.111791643250152e-07, + "loss": 2.8567, + "step": 59971 + }, + { + "epoch": 2.94, + "grad_norm": 0.7329202890396118, + "learning_rate": 6.101972892066376e-07, + "loss": 2.7161, + "step": 59972 + }, + { + "epoch": 2.94, + "grad_norm": 0.7684486508369446, + "learning_rate": 6.092162026233971e-07, + "loss": 2.9794, + "step": 59973 + }, + { + "epoch": 2.94, + "grad_norm": 0.769387423992157, + "learning_rate": 6.082359045778251e-07, + "loss": 2.7875, + "step": 59974 + }, + { + "epoch": 2.94, + "grad_norm": 0.7294189929962158, + "learning_rate": 6.072563950725862e-07, + "loss": 2.8859, + "step": 59975 + }, + { + "epoch": 2.94, + "grad_norm": 0.7302929759025574, + "learning_rate": 6.062776741101783e-07, + "loss": 2.7874, + "step": 59976 + }, + { + "epoch": 2.94, + "grad_norm": 0.7487363815307617, + "learning_rate": 6.052997416932326e-07, + "loss": 2.9571, + "step": 59977 + }, + { + "epoch": 2.94, + "grad_norm": 0.7794578075408936, + "learning_rate": 6.043225978243138e-07, + "loss": 2.9774, + "step": 59978 + }, + { + "epoch": 2.94, + "grad_norm": 0.7639981508255005, + "learning_rate": 6.033462425059865e-07, + "loss": 2.8534, + "step": 59979 + }, + { + "epoch": 2.94, + "grad_norm": 0.7297922968864441, + "learning_rate": 6.023706757408487e-07, + "loss": 2.799, + "step": 59980 + }, + { + "epoch": 2.94, + "grad_norm": 0.7623937726020813, + "learning_rate": 6.013958975313982e-07, + "loss": 2.9303, + "step": 59981 + }, + { + "epoch": 2.94, + "grad_norm": 0.7628301382064819, + "learning_rate": 6.004219078802663e-07, + "loss": 2.7251, + "step": 59982 + }, + { + "epoch": 2.94, + "grad_norm": 0.7557390928268433, + "learning_rate": 5.994487067900178e-07, + "loss": 2.8481, + "step": 59983 + }, + { + "epoch": 2.94, + "grad_norm": 0.7709507346153259, + "learning_rate": 5.984762942631838e-07, + "loss": 2.871, + "step": 59984 + }, + { + "epoch": 2.94, + "grad_norm": 0.718964159488678, + "learning_rate": 5.975046703023623e-07, + "loss": 2.7692, + "step": 59985 + }, + { + "epoch": 2.94, + "grad_norm": 0.7481675744056702, + "learning_rate": 5.965338349100845e-07, + "loss": 2.899, + "step": 59986 + }, + { + "epoch": 2.94, + "grad_norm": 0.7227299809455872, + "learning_rate": 5.955637880889153e-07, + "loss": 2.7444, + "step": 59987 + }, + { + "epoch": 2.94, + "grad_norm": 0.7137525677680969, + "learning_rate": 5.945945298413856e-07, + "loss": 2.8117, + "step": 59988 + }, + { + "epoch": 2.94, + "grad_norm": 0.7731427550315857, + "learning_rate": 5.936260601700937e-07, + "loss": 2.776, + "step": 59989 + }, + { + "epoch": 2.94, + "grad_norm": 0.7331191897392273, + "learning_rate": 5.926583790775707e-07, + "loss": 3.0075, + "step": 59990 + }, + { + "epoch": 2.94, + "grad_norm": 0.7784517407417297, + "learning_rate": 5.916914865663481e-07, + "loss": 2.8882, + "step": 59991 + }, + { + "epoch": 2.94, + "grad_norm": 0.847660481929779, + "learning_rate": 5.907253826390234e-07, + "loss": 2.9467, + "step": 59992 + }, + { + "epoch": 2.94, + "grad_norm": 0.7578979730606079, + "learning_rate": 5.897600672980618e-07, + "loss": 2.915, + "step": 59993 + }, + { + "epoch": 2.94, + "grad_norm": 0.754513144493103, + "learning_rate": 5.88795540546061e-07, + "loss": 2.7695, + "step": 59994 + }, + { + "epoch": 2.94, + "grad_norm": 0.7934916615486145, + "learning_rate": 5.878318023855855e-07, + "loss": 2.6653, + "step": 59995 + }, + { + "epoch": 2.94, + "grad_norm": 0.7949754595756531, + "learning_rate": 5.868688528191002e-07, + "loss": 2.8544, + "step": 59996 + }, + { + "epoch": 2.94, + "grad_norm": 0.7312231659889221, + "learning_rate": 5.859066918492028e-07, + "loss": 2.908, + "step": 59997 + }, + { + "epoch": 2.94, + "grad_norm": 0.7361305952072144, + "learning_rate": 5.849453194783915e-07, + "loss": 2.8526, + "step": 59998 + }, + { + "epoch": 2.94, + "grad_norm": 0.7572147250175476, + "learning_rate": 5.839847357091975e-07, + "loss": 2.8984, + "step": 59999 + }, + { + "epoch": 2.94, + "grad_norm": 0.793131411075592, + "learning_rate": 5.830249405441855e-07, + "loss": 2.9666, + "step": 60000 + }, + { + "epoch": 2.94, + "grad_norm": 0.7692931294441223, + "learning_rate": 5.820659339858536e-07, + "loss": 3.1132, + "step": 60001 + }, + { + "epoch": 2.94, + "grad_norm": 0.7415686249732971, + "learning_rate": 5.811077160367328e-07, + "loss": 2.9689, + "step": 60002 + }, + { + "epoch": 2.94, + "grad_norm": 0.704154372215271, + "learning_rate": 5.801502866993546e-07, + "loss": 2.8167, + "step": 60003 + }, + { + "epoch": 2.94, + "grad_norm": 0.7278680205345154, + "learning_rate": 5.791936459762169e-07, + "loss": 2.8319, + "step": 60004 + }, + { + "epoch": 2.94, + "grad_norm": 0.7455959320068359, + "learning_rate": 5.782377938698845e-07, + "loss": 3.024, + "step": 60005 + }, + { + "epoch": 2.94, + "grad_norm": 0.7692423462867737, + "learning_rate": 5.77282730382822e-07, + "loss": 2.9902, + "step": 60006 + }, + { + "epoch": 2.94, + "grad_norm": 0.7454026937484741, + "learning_rate": 5.76328455517594e-07, + "loss": 2.8497, + "step": 60007 + }, + { + "epoch": 2.94, + "grad_norm": 0.7537476420402527, + "learning_rate": 5.753749692766984e-07, + "loss": 2.8039, + "step": 60008 + }, + { + "epoch": 2.94, + "grad_norm": 0.7972829937934875, + "learning_rate": 5.744222716626334e-07, + "loss": 2.9203, + "step": 60009 + }, + { + "epoch": 2.94, + "grad_norm": 0.8087637424468994, + "learning_rate": 5.734703626778969e-07, + "loss": 2.9301, + "step": 60010 + }, + { + "epoch": 2.94, + "grad_norm": 0.7381844520568848, + "learning_rate": 5.725192423250203e-07, + "loss": 2.8868, + "step": 60011 + }, + { + "epoch": 2.94, + "grad_norm": 0.7417834997177124, + "learning_rate": 5.715689106065346e-07, + "loss": 2.7835, + "step": 60012 + }, + { + "epoch": 2.94, + "grad_norm": 0.7412571310997009, + "learning_rate": 5.706193675248716e-07, + "loss": 3.0335, + "step": 60013 + }, + { + "epoch": 2.94, + "grad_norm": 0.7355944514274597, + "learning_rate": 5.696706130825624e-07, + "loss": 2.7241, + "step": 60014 + }, + { + "epoch": 2.94, + "grad_norm": 0.738745927810669, + "learning_rate": 5.687226472821383e-07, + "loss": 2.7394, + "step": 60015 + }, + { + "epoch": 2.94, + "grad_norm": 0.6923105716705322, + "learning_rate": 5.67775470126064e-07, + "loss": 2.9196, + "step": 60016 + }, + { + "epoch": 2.94, + "grad_norm": 0.7854421734809875, + "learning_rate": 5.668290816168708e-07, + "loss": 2.734, + "step": 60017 + }, + { + "epoch": 2.94, + "grad_norm": 0.8272637128829956, + "learning_rate": 5.658834817569901e-07, + "loss": 2.9645, + "step": 60018 + }, + { + "epoch": 2.94, + "grad_norm": 0.72818922996521, + "learning_rate": 5.649386705489533e-07, + "loss": 2.9149, + "step": 60019 + }, + { + "epoch": 2.94, + "grad_norm": 0.7304298877716064, + "learning_rate": 5.63994647995225e-07, + "loss": 2.7626, + "step": 60020 + }, + { + "epoch": 2.94, + "grad_norm": 0.8027445673942566, + "learning_rate": 5.630514140983366e-07, + "loss": 2.9628, + "step": 60021 + }, + { + "epoch": 2.94, + "grad_norm": 0.719323456287384, + "learning_rate": 5.621089688607194e-07, + "loss": 2.6681, + "step": 60022 + }, + { + "epoch": 2.94, + "grad_norm": 0.806006669998169, + "learning_rate": 5.611673122849048e-07, + "loss": 2.7381, + "step": 60023 + }, + { + "epoch": 2.94, + "grad_norm": 0.7545458078384399, + "learning_rate": 5.60226444373324e-07, + "loss": 2.8362, + "step": 60024 + }, + { + "epoch": 2.94, + "grad_norm": 0.8001763224601746, + "learning_rate": 5.592863651285084e-07, + "loss": 3.0595, + "step": 60025 + }, + { + "epoch": 2.94, + "grad_norm": 0.7519499659538269, + "learning_rate": 5.583470745528562e-07, + "loss": 2.7645, + "step": 60026 + }, + { + "epoch": 2.94, + "grad_norm": 0.7754395604133606, + "learning_rate": 5.574085726489319e-07, + "loss": 2.7446, + "step": 60027 + }, + { + "epoch": 2.94, + "grad_norm": 0.7979375123977661, + "learning_rate": 5.564708594191337e-07, + "loss": 3.0058, + "step": 60028 + }, + { + "epoch": 2.94, + "grad_norm": 0.7920042872428894, + "learning_rate": 5.555339348659926e-07, + "loss": 2.7863, + "step": 60029 + }, + { + "epoch": 2.94, + "grad_norm": 0.7542409300804138, + "learning_rate": 5.54597798991907e-07, + "loss": 2.8707, + "step": 60030 + }, + { + "epoch": 2.94, + "grad_norm": 0.7596069574356079, + "learning_rate": 5.536624517994081e-07, + "loss": 2.9896, + "step": 60031 + }, + { + "epoch": 2.94, + "grad_norm": 0.7304089665412903, + "learning_rate": 5.527278932909274e-07, + "loss": 2.6327, + "step": 60032 + }, + { + "epoch": 2.94, + "grad_norm": 0.7943688631057739, + "learning_rate": 5.517941234689294e-07, + "loss": 2.9162, + "step": 60033 + }, + { + "epoch": 2.94, + "grad_norm": 0.7383855581283569, + "learning_rate": 5.508611423358789e-07, + "loss": 2.8657, + "step": 60034 + }, + { + "epoch": 2.94, + "grad_norm": 0.717839777469635, + "learning_rate": 5.499289498942405e-07, + "loss": 3.0219, + "step": 60035 + }, + { + "epoch": 2.94, + "grad_norm": 0.8198238015174866, + "learning_rate": 5.489975461464458e-07, + "loss": 2.9683, + "step": 60036 + }, + { + "epoch": 2.94, + "grad_norm": 0.7249385118484497, + "learning_rate": 5.480669310949593e-07, + "loss": 2.7602, + "step": 60037 + }, + { + "epoch": 2.94, + "grad_norm": 0.771135687828064, + "learning_rate": 5.471371047422457e-07, + "loss": 3.0002, + "step": 60038 + }, + { + "epoch": 2.94, + "grad_norm": 0.759382426738739, + "learning_rate": 5.462080670907365e-07, + "loss": 2.7144, + "step": 60039 + }, + { + "epoch": 2.94, + "grad_norm": 0.8611246943473816, + "learning_rate": 5.452798181428631e-07, + "loss": 2.7942, + "step": 60040 + }, + { + "epoch": 2.94, + "grad_norm": 0.7444459795951843, + "learning_rate": 5.443523579011233e-07, + "loss": 3.0667, + "step": 60041 + }, + { + "epoch": 2.94, + "grad_norm": 0.7149555683135986, + "learning_rate": 5.434256863679154e-07, + "loss": 2.9428, + "step": 60042 + }, + { + "epoch": 2.94, + "grad_norm": 0.7191066741943359, + "learning_rate": 5.424998035456707e-07, + "loss": 2.6826, + "step": 60043 + }, + { + "epoch": 2.94, + "grad_norm": 0.7511018514633179, + "learning_rate": 5.415747094368872e-07, + "loss": 2.7892, + "step": 60044 + }, + { + "epoch": 2.94, + "grad_norm": 0.7677654027938843, + "learning_rate": 5.40650404043963e-07, + "loss": 2.8827, + "step": 60045 + }, + { + "epoch": 2.94, + "grad_norm": 0.7885034084320068, + "learning_rate": 5.397268873692961e-07, + "loss": 2.6797, + "step": 60046 + }, + { + "epoch": 2.94, + "grad_norm": 0.7618846893310547, + "learning_rate": 5.388041594153847e-07, + "loss": 3.108, + "step": 60047 + }, + { + "epoch": 2.94, + "grad_norm": 0.7315735816955566, + "learning_rate": 5.378822201846267e-07, + "loss": 2.8573, + "step": 60048 + }, + { + "epoch": 2.94, + "grad_norm": 0.7194553017616272, + "learning_rate": 5.369610696794535e-07, + "loss": 2.9314, + "step": 60049 + }, + { + "epoch": 2.94, + "grad_norm": 0.712523341178894, + "learning_rate": 5.360407079022966e-07, + "loss": 2.717, + "step": 60050 + }, + { + "epoch": 2.94, + "grad_norm": 0.725117027759552, + "learning_rate": 5.35121134855554e-07, + "loss": 2.6336, + "step": 60051 + }, + { + "epoch": 2.94, + "grad_norm": 0.7298504114151001, + "learning_rate": 5.342023505416903e-07, + "loss": 2.9929, + "step": 60052 + }, + { + "epoch": 2.94, + "grad_norm": 1.2413877248764038, + "learning_rate": 5.332843549631038e-07, + "loss": 2.911, + "step": 60053 + }, + { + "epoch": 2.94, + "grad_norm": 0.7253249883651733, + "learning_rate": 5.323671481222258e-07, + "loss": 2.8556, + "step": 60054 + }, + { + "epoch": 2.94, + "grad_norm": 0.7611720561981201, + "learning_rate": 5.31450730021421e-07, + "loss": 2.8577, + "step": 60055 + }, + { + "epoch": 2.94, + "grad_norm": 0.7636274695396423, + "learning_rate": 5.305351006631542e-07, + "loss": 2.7728, + "step": 60056 + }, + { + "epoch": 2.94, + "grad_norm": 0.7457626461982727, + "learning_rate": 5.296202600498567e-07, + "loss": 2.8102, + "step": 60057 + }, + { + "epoch": 2.94, + "grad_norm": 0.7568612098693848, + "learning_rate": 5.2870620818386e-07, + "loss": 2.9234, + "step": 60058 + }, + { + "epoch": 2.94, + "grad_norm": 0.7343557476997375, + "learning_rate": 5.277929450676288e-07, + "loss": 2.979, + "step": 60059 + }, + { + "epoch": 2.94, + "grad_norm": 0.7438076734542847, + "learning_rate": 5.268804707035946e-07, + "loss": 3.08, + "step": 60060 + }, + { + "epoch": 2.94, + "grad_norm": 0.7437184453010559, + "learning_rate": 5.259687850940886e-07, + "loss": 2.7556, + "step": 60061 + }, + { + "epoch": 2.94, + "grad_norm": 0.7388575077056885, + "learning_rate": 5.250578882415423e-07, + "loss": 2.9538, + "step": 60062 + }, + { + "epoch": 2.94, + "grad_norm": 0.7527971267700195, + "learning_rate": 5.24147780148354e-07, + "loss": 2.9749, + "step": 60063 + }, + { + "epoch": 2.94, + "grad_norm": 0.7302408814430237, + "learning_rate": 5.232384608169549e-07, + "loss": 2.6886, + "step": 60064 + }, + { + "epoch": 2.94, + "grad_norm": 0.7744280695915222, + "learning_rate": 5.223299302496764e-07, + "loss": 2.6481, + "step": 60065 + }, + { + "epoch": 2.94, + "grad_norm": 0.72216796875, + "learning_rate": 5.2142218844895e-07, + "loss": 2.9079, + "step": 60066 + }, + { + "epoch": 2.94, + "grad_norm": 0.7193196415901184, + "learning_rate": 5.205152354171738e-07, + "loss": 2.98, + "step": 60067 + }, + { + "epoch": 2.94, + "grad_norm": 0.7564364075660706, + "learning_rate": 5.196090711567457e-07, + "loss": 2.8694, + "step": 60068 + }, + { + "epoch": 2.94, + "grad_norm": 0.7751148343086243, + "learning_rate": 5.187036956699975e-07, + "loss": 2.8992, + "step": 60069 + }, + { + "epoch": 2.94, + "grad_norm": 0.7742836475372314, + "learning_rate": 5.177991089593603e-07, + "loss": 2.8661, + "step": 60070 + }, + { + "epoch": 2.94, + "grad_norm": 0.7468364834785461, + "learning_rate": 5.16895311027199e-07, + "loss": 2.8607, + "step": 60071 + }, + { + "epoch": 2.94, + "grad_norm": 0.7487417459487915, + "learning_rate": 5.159923018759115e-07, + "loss": 2.8908, + "step": 60072 + }, + { + "epoch": 2.94, + "grad_norm": 0.7274312376976013, + "learning_rate": 5.150900815078296e-07, + "loss": 2.9371, + "step": 60073 + }, + { + "epoch": 2.94, + "grad_norm": 0.7443316578865051, + "learning_rate": 5.141886499253844e-07, + "loss": 2.9809, + "step": 60074 + }, + { + "epoch": 2.94, + "grad_norm": 0.7913233041763306, + "learning_rate": 5.132880071309408e-07, + "loss": 2.9794, + "step": 60075 + }, + { + "epoch": 2.94, + "grad_norm": 0.763105034828186, + "learning_rate": 5.123881531268637e-07, + "loss": 2.7934, + "step": 60076 + }, + { + "epoch": 2.94, + "grad_norm": 0.7337465882301331, + "learning_rate": 5.114890879155176e-07, + "loss": 2.7711, + "step": 60077 + }, + { + "epoch": 2.94, + "grad_norm": 0.785071849822998, + "learning_rate": 5.105908114992341e-07, + "loss": 2.9069, + "step": 60078 + }, + { + "epoch": 2.94, + "grad_norm": 0.7351061701774597, + "learning_rate": 5.096933238804446e-07, + "loss": 2.5839, + "step": 60079 + }, + { + "epoch": 2.94, + "grad_norm": 0.7270272970199585, + "learning_rate": 5.087966250614806e-07, + "loss": 2.8432, + "step": 60080 + }, + { + "epoch": 2.94, + "grad_norm": 0.8075821399688721, + "learning_rate": 5.079007150447067e-07, + "loss": 2.7532, + "step": 60081 + }, + { + "epoch": 2.94, + "grad_norm": 0.7147870659828186, + "learning_rate": 5.07005593832488e-07, + "loss": 3.1726, + "step": 60082 + }, + { + "epoch": 2.94, + "grad_norm": 0.772592306137085, + "learning_rate": 5.06111261427189e-07, + "loss": 3.0678, + "step": 60083 + }, + { + "epoch": 2.94, + "grad_norm": 0.7498898506164551, + "learning_rate": 5.052177178311412e-07, + "loss": 2.8506, + "step": 60084 + }, + { + "epoch": 2.94, + "grad_norm": 0.7638654708862305, + "learning_rate": 5.043249630467094e-07, + "loss": 3.0761, + "step": 60085 + }, + { + "epoch": 2.94, + "grad_norm": 0.7409616112709045, + "learning_rate": 5.034329970762586e-07, + "loss": 2.9633, + "step": 60086 + }, + { + "epoch": 2.94, + "grad_norm": 0.7529875040054321, + "learning_rate": 5.025418199220866e-07, + "loss": 2.938, + "step": 60087 + }, + { + "epoch": 2.94, + "grad_norm": 0.7796822190284729, + "learning_rate": 5.01651431586625e-07, + "loss": 2.8388, + "step": 60088 + }, + { + "epoch": 2.94, + "grad_norm": 0.7700410485267639, + "learning_rate": 5.007618320721718e-07, + "loss": 2.8562, + "step": 60089 + }, + { + "epoch": 2.94, + "grad_norm": 0.7736303210258484, + "learning_rate": 4.998730213810587e-07, + "loss": 2.8441, + "step": 60090 + }, + { + "epoch": 2.94, + "grad_norm": 0.7461416721343994, + "learning_rate": 4.989849995156503e-07, + "loss": 3.0117, + "step": 60091 + }, + { + "epoch": 2.94, + "grad_norm": 0.7124243974685669, + "learning_rate": 4.980977664782448e-07, + "loss": 3.0132, + "step": 60092 + }, + { + "epoch": 2.95, + "grad_norm": 0.7641018629074097, + "learning_rate": 4.972113222712404e-07, + "loss": 3.0398, + "step": 60093 + }, + { + "epoch": 2.95, + "grad_norm": 0.763166069984436, + "learning_rate": 4.963256668969351e-07, + "loss": 2.9907, + "step": 60094 + }, + { + "epoch": 2.95, + "grad_norm": 0.7739465236663818, + "learning_rate": 4.954408003576271e-07, + "loss": 2.6657, + "step": 60095 + }, + { + "epoch": 2.95, + "grad_norm": 0.7320348024368286, + "learning_rate": 4.945567226557478e-07, + "loss": 2.9288, + "step": 60096 + }, + { + "epoch": 2.95, + "grad_norm": 0.8021144270896912, + "learning_rate": 4.936734337935289e-07, + "loss": 2.7831, + "step": 60097 + }, + { + "epoch": 2.95, + "grad_norm": 0.6665416359901428, + "learning_rate": 4.927909337733349e-07, + "loss": 2.8185, + "step": 60098 + }, + { + "epoch": 2.95, + "grad_norm": 0.737612783908844, + "learning_rate": 4.919092225975307e-07, + "loss": 2.7675, + "step": 60099 + }, + { + "epoch": 2.95, + "grad_norm": 0.7620866894721985, + "learning_rate": 4.91028300268348e-07, + "loss": 2.9269, + "step": 60100 + }, + { + "epoch": 2.95, + "grad_norm": 0.7583200335502625, + "learning_rate": 4.901481667881846e-07, + "loss": 2.9917, + "step": 60101 + }, + { + "epoch": 2.95, + "grad_norm": 0.7368113398551941, + "learning_rate": 4.892688221593055e-07, + "loss": 2.9468, + "step": 60102 + }, + { + "epoch": 2.95, + "grad_norm": 0.754863440990448, + "learning_rate": 4.883902663840422e-07, + "loss": 2.7378, + "step": 60103 + }, + { + "epoch": 2.95, + "grad_norm": 0.793886125087738, + "learning_rate": 4.875124994647595e-07, + "loss": 3.0714, + "step": 60104 + }, + { + "epoch": 2.95, + "grad_norm": 0.7754970192909241, + "learning_rate": 4.866355214036887e-07, + "loss": 2.759, + "step": 60105 + }, + { + "epoch": 2.95, + "grad_norm": 0.7628046870231628, + "learning_rate": 4.85759332203195e-07, + "loss": 2.9111, + "step": 60106 + }, + { + "epoch": 2.95, + "grad_norm": 0.7230570912361145, + "learning_rate": 4.848839318655429e-07, + "loss": 2.9432, + "step": 60107 + }, + { + "epoch": 2.95, + "grad_norm": 0.7568901777267456, + "learning_rate": 4.840093203930973e-07, + "loss": 2.9762, + "step": 60108 + }, + { + "epoch": 2.95, + "grad_norm": 0.772777259349823, + "learning_rate": 4.83135497788123e-07, + "loss": 3.0028, + "step": 60109 + }, + { + "epoch": 2.95, + "grad_norm": 0.7773007750511169, + "learning_rate": 4.822624640529182e-07, + "loss": 2.7886, + "step": 60110 + }, + { + "epoch": 2.95, + "grad_norm": 0.7324758768081665, + "learning_rate": 4.813902191897812e-07, + "loss": 2.8291, + "step": 60111 + }, + { + "epoch": 2.95, + "grad_norm": 0.7719749808311462, + "learning_rate": 4.805187632010432e-07, + "loss": 2.7614, + "step": 60112 + }, + { + "epoch": 2.95, + "grad_norm": 0.7728280425071716, + "learning_rate": 4.796480960889359e-07, + "loss": 2.9761, + "step": 60113 + }, + { + "epoch": 2.95, + "grad_norm": 0.7225713133811951, + "learning_rate": 4.787782178558241e-07, + "loss": 2.757, + "step": 60114 + }, + { + "epoch": 2.95, + "grad_norm": 0.7266071438789368, + "learning_rate": 4.779091285039726e-07, + "loss": 3.0808, + "step": 60115 + }, + { + "epoch": 2.95, + "grad_norm": 0.7436891198158264, + "learning_rate": 4.770408280356464e-07, + "loss": 2.7847, + "step": 60116 + }, + { + "epoch": 2.95, + "grad_norm": 0.7610815167427063, + "learning_rate": 4.761733164531434e-07, + "loss": 3.0614, + "step": 60117 + }, + { + "epoch": 2.95, + "grad_norm": 0.7271731495857239, + "learning_rate": 4.753065937587619e-07, + "loss": 2.7171, + "step": 60118 + }, + { + "epoch": 2.95, + "grad_norm": 0.7084574103355408, + "learning_rate": 4.7444065995480006e-07, + "loss": 2.8051, + "step": 60119 + }, + { + "epoch": 2.95, + "grad_norm": 0.7272463440895081, + "learning_rate": 4.7357551504348944e-07, + "loss": 2.9871, + "step": 60120 + }, + { + "epoch": 2.95, + "grad_norm": 0.7564078569412231, + "learning_rate": 4.727111590271615e-07, + "loss": 3.0835, + "step": 60121 + }, + { + "epoch": 2.95, + "grad_norm": 0.7226554155349731, + "learning_rate": 4.718475919080478e-07, + "loss": 3.0259, + "step": 60122 + }, + { + "epoch": 2.95, + "grad_norm": 0.747903048992157, + "learning_rate": 4.7098481368844645e-07, + "loss": 2.9517, + "step": 60123 + }, + { + "epoch": 2.95, + "grad_norm": 0.7668612599372864, + "learning_rate": 4.7012282437062234e-07, + "loss": 2.9348, + "step": 60124 + }, + { + "epoch": 2.95, + "grad_norm": 0.7929732203483582, + "learning_rate": 4.692616239568403e-07, + "loss": 3.0129, + "step": 60125 + }, + { + "epoch": 2.95, + "grad_norm": 0.7587389945983887, + "learning_rate": 4.6840121244939856e-07, + "loss": 2.7101, + "step": 60126 + }, + { + "epoch": 2.95, + "grad_norm": 0.7391680479049683, + "learning_rate": 4.675415898505286e-07, + "loss": 2.7574, + "step": 60127 + }, + { + "epoch": 2.95, + "grad_norm": 0.827909529209137, + "learning_rate": 4.6668275616249526e-07, + "loss": 2.8657, + "step": 60128 + }, + { + "epoch": 2.95, + "grad_norm": 0.7175806164741516, + "learning_rate": 4.658247113875968e-07, + "loss": 2.9622, + "step": 60129 + }, + { + "epoch": 2.95, + "grad_norm": 0.7418898940086365, + "learning_rate": 4.649674555280647e-07, + "loss": 3.021, + "step": 60130 + }, + { + "epoch": 2.95, + "grad_norm": 0.7666599154472351, + "learning_rate": 4.641109885861638e-07, + "loss": 2.723, + "step": 60131 + }, + { + "epoch": 2.95, + "grad_norm": 0.775105357170105, + "learning_rate": 4.632553105641257e-07, + "loss": 2.871, + "step": 60132 + }, + { + "epoch": 2.95, + "grad_norm": 0.7589561939239502, + "learning_rate": 4.624004214642485e-07, + "loss": 2.9596, + "step": 60133 + }, + { + "epoch": 2.95, + "grad_norm": 0.7122417092323303, + "learning_rate": 4.615463212887638e-07, + "loss": 2.7842, + "step": 60134 + }, + { + "epoch": 2.95, + "grad_norm": 0.7434190511703491, + "learning_rate": 4.6069301003990313e-07, + "loss": 2.9867, + "step": 60135 + }, + { + "epoch": 2.95, + "grad_norm": 0.8169651627540588, + "learning_rate": 4.598404877199646e-07, + "loss": 2.8957, + "step": 60136 + }, + { + "epoch": 2.95, + "grad_norm": 0.7982842922210693, + "learning_rate": 4.5898875433111327e-07, + "loss": 2.9529, + "step": 60137 + }, + { + "epoch": 2.95, + "grad_norm": 0.7457339763641357, + "learning_rate": 4.581378098756472e-07, + "loss": 2.8845, + "step": 60138 + }, + { + "epoch": 2.95, + "grad_norm": 0.7173131704330444, + "learning_rate": 4.5728765435583124e-07, + "loss": 2.9729, + "step": 60139 + }, + { + "epoch": 2.95, + "grad_norm": 0.791340708732605, + "learning_rate": 4.564382877738304e-07, + "loss": 2.6271, + "step": 60140 + }, + { + "epoch": 2.95, + "grad_norm": 0.7958931922912598, + "learning_rate": 4.5558971013194277e-07, + "loss": 2.7756, + "step": 60141 + }, + { + "epoch": 2.95, + "grad_norm": 0.7099641561508179, + "learning_rate": 4.547419214324e-07, + "loss": 2.8837, + "step": 60142 + }, + { + "epoch": 2.95, + "grad_norm": 0.7637771368026733, + "learning_rate": 4.5389492167736685e-07, + "loss": 2.9816, + "step": 60143 + }, + { + "epoch": 2.95, + "grad_norm": 0.7536028027534485, + "learning_rate": 4.5304871086917495e-07, + "loss": 2.8259, + "step": 60144 + }, + { + "epoch": 2.95, + "grad_norm": 0.8726342916488647, + "learning_rate": 4.5220328900998917e-07, + "loss": 3.1647, + "step": 60145 + }, + { + "epoch": 2.95, + "grad_norm": 0.7555391192436218, + "learning_rate": 4.5135865610204103e-07, + "loss": 2.7219, + "step": 60146 + }, + { + "epoch": 2.95, + "grad_norm": 0.7423574328422546, + "learning_rate": 4.5051481214759545e-07, + "loss": 2.8423, + "step": 60147 + }, + { + "epoch": 2.95, + "grad_norm": 0.7736955285072327, + "learning_rate": 4.4967175714881733e-07, + "loss": 2.8912, + "step": 60148 + }, + { + "epoch": 2.95, + "grad_norm": 0.7960270643234253, + "learning_rate": 4.488294911079382e-07, + "loss": 3.2692, + "step": 60149 + }, + { + "epoch": 2.95, + "grad_norm": 0.7632052898406982, + "learning_rate": 4.47988014027223e-07, + "loss": 2.9191, + "step": 60150 + }, + { + "epoch": 2.95, + "grad_norm": 0.7297966480255127, + "learning_rate": 4.471473259088365e-07, + "loss": 2.8145, + "step": 60151 + }, + { + "epoch": 2.95, + "grad_norm": 0.707733154296875, + "learning_rate": 4.463074267550104e-07, + "loss": 2.9402, + "step": 60152 + }, + { + "epoch": 2.95, + "grad_norm": 0.741661012172699, + "learning_rate": 4.454683165679762e-07, + "loss": 2.7414, + "step": 60153 + }, + { + "epoch": 2.95, + "grad_norm": 0.7709798812866211, + "learning_rate": 4.4462999534993214e-07, + "loss": 2.7895, + "step": 60154 + }, + { + "epoch": 2.95, + "grad_norm": 0.7329643964767456, + "learning_rate": 4.4379246310307646e-07, + "loss": 2.8292, + "step": 60155 + }, + { + "epoch": 2.95, + "grad_norm": 0.7478441596031189, + "learning_rate": 4.4295571982964074e-07, + "loss": 3.0081, + "step": 60156 + }, + { + "epoch": 2.95, + "grad_norm": 0.7324329614639282, + "learning_rate": 4.421197655317565e-07, + "loss": 2.9113, + "step": 60157 + }, + { + "epoch": 2.95, + "grad_norm": 0.7825899720191956, + "learning_rate": 4.412846002117221e-07, + "loss": 2.8504, + "step": 60158 + }, + { + "epoch": 2.95, + "grad_norm": 0.7741754055023193, + "learning_rate": 4.4045022387166897e-07, + "loss": 2.8648, + "step": 60159 + }, + { + "epoch": 2.95, + "grad_norm": 0.7594071626663208, + "learning_rate": 4.396166365138287e-07, + "loss": 3.0879, + "step": 60160 + }, + { + "epoch": 2.95, + "grad_norm": 0.7644873261451721, + "learning_rate": 4.3878383814039964e-07, + "loss": 2.9545, + "step": 60161 + }, + { + "epoch": 2.95, + "grad_norm": 0.7412201762199402, + "learning_rate": 4.379518287535466e-07, + "loss": 2.967, + "step": 60162 + }, + { + "epoch": 2.95, + "grad_norm": 0.7475193738937378, + "learning_rate": 4.3712060835550123e-07, + "loss": 2.8789, + "step": 60163 + }, + { + "epoch": 2.95, + "grad_norm": 0.7200668454170227, + "learning_rate": 4.3629017694839505e-07, + "loss": 2.7571, + "step": 60164 + }, + { + "epoch": 2.95, + "grad_norm": 0.7569831013679504, + "learning_rate": 4.354605345344597e-07, + "loss": 2.8523, + "step": 60165 + }, + { + "epoch": 2.95, + "grad_norm": 0.7711689472198486, + "learning_rate": 4.346316811158601e-07, + "loss": 2.9118, + "step": 60166 + }, + { + "epoch": 2.95, + "grad_norm": 0.7586641311645508, + "learning_rate": 4.3380361669479447e-07, + "loss": 2.9045, + "step": 60167 + }, + { + "epoch": 2.95, + "grad_norm": 0.7284442782402039, + "learning_rate": 4.3297634127346106e-07, + "loss": 2.7352, + "step": 60168 + }, + { + "epoch": 2.95, + "grad_norm": 0.8342050909996033, + "learning_rate": 4.3214985485399144e-07, + "loss": 2.7207, + "step": 60169 + }, + { + "epoch": 2.95, + "grad_norm": 0.749860405921936, + "learning_rate": 4.3132415743858394e-07, + "loss": 2.7941, + "step": 60170 + }, + { + "epoch": 2.95, + "grad_norm": 0.7597050070762634, + "learning_rate": 4.3049924902940345e-07, + "loss": 2.8716, + "step": 60171 + }, + { + "epoch": 2.95, + "grad_norm": 0.7795093059539795, + "learning_rate": 4.2967512962864825e-07, + "loss": 2.7667, + "step": 60172 + }, + { + "epoch": 2.95, + "grad_norm": 0.7680733799934387, + "learning_rate": 4.2885179923848323e-07, + "loss": 2.8817, + "step": 60173 + }, + { + "epoch": 2.95, + "grad_norm": 0.780337929725647, + "learning_rate": 4.280292578610733e-07, + "loss": 2.7562, + "step": 60174 + }, + { + "epoch": 2.95, + "grad_norm": 0.7037748694419861, + "learning_rate": 4.272075054985835e-07, + "loss": 2.8465, + "step": 60175 + }, + { + "epoch": 2.95, + "grad_norm": 0.7473371028900146, + "learning_rate": 4.263865421531454e-07, + "loss": 3.058, + "step": 60176 + }, + { + "epoch": 2.95, + "grad_norm": 0.766514778137207, + "learning_rate": 4.255663678269905e-07, + "loss": 3.0962, + "step": 60177 + }, + { + "epoch": 2.95, + "grad_norm": 0.705461859703064, + "learning_rate": 4.2474698252221715e-07, + "loss": 2.9497, + "step": 60178 + }, + { + "epoch": 2.95, + "grad_norm": 0.7880750298500061, + "learning_rate": 4.2392838624099033e-07, + "loss": 2.8343, + "step": 60179 + }, + { + "epoch": 2.95, + "grad_norm": 0.8290342092514038, + "learning_rate": 4.231105789855083e-07, + "loss": 2.8913, + "step": 60180 + }, + { + "epoch": 2.95, + "grad_norm": 0.8501545190811157, + "learning_rate": 4.2229356075786923e-07, + "loss": 2.9309, + "step": 60181 + }, + { + "epoch": 2.95, + "grad_norm": 0.7908932566642761, + "learning_rate": 4.214773315602715e-07, + "loss": 2.9298, + "step": 60182 + }, + { + "epoch": 2.95, + "grad_norm": 0.7290691137313843, + "learning_rate": 4.206618913948467e-07, + "loss": 2.7574, + "step": 60183 + }, + { + "epoch": 2.95, + "grad_norm": 0.7517605423927307, + "learning_rate": 4.198472402637598e-07, + "loss": 2.9407, + "step": 60184 + }, + { + "epoch": 2.95, + "grad_norm": 0.7302461862564087, + "learning_rate": 4.190333781691424e-07, + "loss": 2.9164, + "step": 60185 + }, + { + "epoch": 2.95, + "grad_norm": 0.7860396504402161, + "learning_rate": 4.182203051130928e-07, + "loss": 2.8992, + "step": 60186 + }, + { + "epoch": 2.95, + "grad_norm": 0.7854883670806885, + "learning_rate": 4.174080210978426e-07, + "loss": 3.107, + "step": 60187 + }, + { + "epoch": 2.95, + "grad_norm": 0.7467091083526611, + "learning_rate": 4.165965261254567e-07, + "loss": 2.832, + "step": 60188 + }, + { + "epoch": 2.95, + "grad_norm": 0.7421848773956299, + "learning_rate": 4.157858201981001e-07, + "loss": 2.874, + "step": 60189 + }, + { + "epoch": 2.95, + "grad_norm": 0.7487695217132568, + "learning_rate": 4.1497590331790453e-07, + "loss": 2.6692, + "step": 60190 + }, + { + "epoch": 2.95, + "grad_norm": 0.7340686321258545, + "learning_rate": 4.141667754870348e-07, + "loss": 2.8941, + "step": 60191 + }, + { + "epoch": 2.95, + "grad_norm": 0.7664749622344971, + "learning_rate": 4.13358436707556e-07, + "loss": 2.8168, + "step": 60192 + }, + { + "epoch": 2.95, + "grad_norm": 0.7620031833648682, + "learning_rate": 4.1255088698166625e-07, + "loss": 3.0271, + "step": 60193 + }, + { + "epoch": 2.95, + "grad_norm": 0.7645248174667358, + "learning_rate": 4.117441263114307e-07, + "loss": 2.8387, + "step": 60194 + }, + { + "epoch": 2.95, + "grad_norm": 0.7493310570716858, + "learning_rate": 4.1093815469901423e-07, + "loss": 2.8177, + "step": 60195 + }, + { + "epoch": 2.95, + "grad_norm": 0.7595289349555969, + "learning_rate": 4.101329721465485e-07, + "loss": 3.058, + "step": 60196 + }, + { + "epoch": 2.95, + "grad_norm": 0.7337210774421692, + "learning_rate": 4.093285786560985e-07, + "loss": 2.8429, + "step": 60197 + }, + { + "epoch": 2.95, + "grad_norm": 0.7361249327659607, + "learning_rate": 4.085249742298624e-07, + "loss": 2.9273, + "step": 60198 + }, + { + "epoch": 2.95, + "grad_norm": 0.7934818863868713, + "learning_rate": 4.077221588699053e-07, + "loss": 2.8338, + "step": 60199 + }, + { + "epoch": 2.95, + "grad_norm": 0.8189910650253296, + "learning_rate": 4.0692013257832556e-07, + "loss": 2.6777, + "step": 60200 + }, + { + "epoch": 2.95, + "grad_norm": 0.777704119682312, + "learning_rate": 4.0611889535728803e-07, + "loss": 2.9845, + "step": 60201 + }, + { + "epoch": 2.95, + "grad_norm": 0.7534101605415344, + "learning_rate": 4.0531844720885774e-07, + "loss": 3.0629, + "step": 60202 + }, + { + "epoch": 2.95, + "grad_norm": 0.7779284119606018, + "learning_rate": 4.0451878813516636e-07, + "loss": 2.8526, + "step": 60203 + }, + { + "epoch": 2.95, + "grad_norm": 0.72135990858078, + "learning_rate": 4.0371991813831215e-07, + "loss": 2.9209, + "step": 60204 + }, + { + "epoch": 2.95, + "grad_norm": 0.7906358242034912, + "learning_rate": 4.0292183722039353e-07, + "loss": 3.1225, + "step": 60205 + }, + { + "epoch": 2.95, + "grad_norm": 0.6974494457244873, + "learning_rate": 4.02124545383542e-07, + "loss": 2.844, + "step": 60206 + }, + { + "epoch": 2.95, + "grad_norm": 0.7315931916236877, + "learning_rate": 4.013280426297893e-07, + "loss": 2.9354, + "step": 60207 + }, + { + "epoch": 2.95, + "grad_norm": 0.759009838104248, + "learning_rate": 4.005323289613338e-07, + "loss": 2.8429, + "step": 60208 + }, + { + "epoch": 2.95, + "grad_norm": 0.7365983128547668, + "learning_rate": 3.9973740438017376e-07, + "loss": 2.7836, + "step": 60209 + }, + { + "epoch": 2.95, + "grad_norm": 0.7780154943466187, + "learning_rate": 3.989432688884742e-07, + "loss": 2.786, + "step": 60210 + }, + { + "epoch": 2.95, + "grad_norm": 0.7027831673622131, + "learning_rate": 3.981499224883e-07, + "loss": 2.8762, + "step": 60211 + }, + { + "epoch": 2.95, + "grad_norm": 0.7347671389579773, + "learning_rate": 3.9735736518171634e-07, + "loss": 2.6765, + "step": 60212 + }, + { + "epoch": 2.95, + "grad_norm": 0.7240535616874695, + "learning_rate": 3.965655969708548e-07, + "loss": 2.9078, + "step": 60213 + }, + { + "epoch": 2.95, + "grad_norm": 0.7486701011657715, + "learning_rate": 3.957746178577803e-07, + "loss": 2.8639, + "step": 60214 + }, + { + "epoch": 2.95, + "grad_norm": 0.7389553785324097, + "learning_rate": 3.94984427844558e-07, + "loss": 2.7024, + "step": 60215 + }, + { + "epoch": 2.95, + "grad_norm": 0.737718403339386, + "learning_rate": 3.941950269333194e-07, + "loss": 3.0266, + "step": 60216 + }, + { + "epoch": 2.95, + "grad_norm": 0.7455750703811646, + "learning_rate": 3.9340641512606295e-07, + "loss": 2.8594, + "step": 60217 + }, + { + "epoch": 2.95, + "grad_norm": 0.7492016553878784, + "learning_rate": 3.9261859242495363e-07, + "loss": 2.9803, + "step": 60218 + }, + { + "epoch": 2.95, + "grad_norm": 0.7766376733779907, + "learning_rate": 3.918315588319898e-07, + "loss": 2.68, + "step": 60219 + }, + { + "epoch": 2.95, + "grad_norm": 0.7816788554191589, + "learning_rate": 3.910453143493364e-07, + "loss": 2.9721, + "step": 60220 + }, + { + "epoch": 2.95, + "grad_norm": 0.7525882720947266, + "learning_rate": 3.902598589789585e-07, + "loss": 2.6368, + "step": 60221 + }, + { + "epoch": 2.95, + "grad_norm": 0.746664822101593, + "learning_rate": 3.8947519272298777e-07, + "loss": 2.7792, + "step": 60222 + }, + { + "epoch": 2.95, + "grad_norm": 0.7470570802688599, + "learning_rate": 3.886913155834892e-07, + "loss": 3.0567, + "step": 60223 + }, + { + "epoch": 2.95, + "grad_norm": 0.7868690490722656, + "learning_rate": 3.8790822756249453e-07, + "loss": 2.7597, + "step": 60224 + }, + { + "epoch": 2.95, + "grad_norm": 0.8167101144790649, + "learning_rate": 3.87125928662102e-07, + "loss": 2.9416, + "step": 60225 + }, + { + "epoch": 2.95, + "grad_norm": 0.7606449127197266, + "learning_rate": 3.863444188843767e-07, + "loss": 2.9537, + "step": 60226 + }, + { + "epoch": 2.95, + "grad_norm": 0.7369158864021301, + "learning_rate": 3.8556369823135036e-07, + "loss": 2.8176, + "step": 60227 + }, + { + "epoch": 2.95, + "grad_norm": 0.7974393963813782, + "learning_rate": 3.847837667050546e-07, + "loss": 2.9026, + "step": 60228 + }, + { + "epoch": 2.95, + "grad_norm": 0.7653765082359314, + "learning_rate": 3.8400462430762113e-07, + "loss": 2.6229, + "step": 60229 + }, + { + "epoch": 2.95, + "grad_norm": 0.7237482070922852, + "learning_rate": 3.8322627104101497e-07, + "loss": 2.9362, + "step": 60230 + }, + { + "epoch": 2.95, + "grad_norm": 0.7616258263587952, + "learning_rate": 3.8244870690736785e-07, + "loss": 2.702, + "step": 60231 + }, + { + "epoch": 2.95, + "grad_norm": 0.750545859336853, + "learning_rate": 3.8167193190864477e-07, + "loss": 2.8297, + "step": 60232 + }, + { + "epoch": 2.95, + "grad_norm": 0.7552297711372375, + "learning_rate": 3.808959460469774e-07, + "loss": 2.7552, + "step": 60233 + }, + { + "epoch": 2.95, + "grad_norm": 0.7343993782997131, + "learning_rate": 3.801207493243308e-07, + "loss": 2.9365, + "step": 60234 + }, + { + "epoch": 2.95, + "grad_norm": 0.8061990737915039, + "learning_rate": 3.793463417428033e-07, + "loss": 2.784, + "step": 60235 + }, + { + "epoch": 2.95, + "grad_norm": 0.7554232478141785, + "learning_rate": 3.7857272330442667e-07, + "loss": 2.8843, + "step": 60236 + }, + { + "epoch": 2.95, + "grad_norm": 0.8099880814552307, + "learning_rate": 3.7779989401119925e-07, + "loss": 2.8103, + "step": 60237 + }, + { + "epoch": 2.95, + "grad_norm": 0.7447828054428101, + "learning_rate": 3.770278538652194e-07, + "loss": 2.6748, + "step": 60238 + }, + { + "epoch": 2.95, + "grad_norm": 0.7810490727424622, + "learning_rate": 3.7625660286845215e-07, + "loss": 2.7598, + "step": 60239 + }, + { + "epoch": 2.95, + "grad_norm": 0.8725793361663818, + "learning_rate": 3.754861410229626e-07, + "loss": 2.9231, + "step": 60240 + }, + { + "epoch": 2.95, + "grad_norm": 0.7612780332565308, + "learning_rate": 3.747164683307824e-07, + "loss": 2.8793, + "step": 60241 + }, + { + "epoch": 2.95, + "grad_norm": 0.7492809295654297, + "learning_rate": 3.7394758479394326e-07, + "loss": 2.8316, + "step": 60242 + }, + { + "epoch": 2.95, + "grad_norm": 0.7240146398544312, + "learning_rate": 3.731794904144769e-07, + "loss": 3.118, + "step": 60243 + }, + { + "epoch": 2.95, + "grad_norm": 0.7847809791564941, + "learning_rate": 3.7241218519434845e-07, + "loss": 2.8566, + "step": 60244 + }, + { + "epoch": 2.95, + "grad_norm": 0.7559931874275208, + "learning_rate": 3.7164566913565617e-07, + "loss": 2.8552, + "step": 60245 + }, + { + "epoch": 2.95, + "grad_norm": 0.7063673734664917, + "learning_rate": 3.7087994224036524e-07, + "loss": 2.8538, + "step": 60246 + }, + { + "epoch": 2.95, + "grad_norm": 0.7377515435218811, + "learning_rate": 3.7011500451054055e-07, + "loss": 2.9346, + "step": 60247 + }, + { + "epoch": 2.95, + "grad_norm": 0.7750132083892822, + "learning_rate": 3.693508559481473e-07, + "loss": 2.8041, + "step": 60248 + }, + { + "epoch": 2.95, + "grad_norm": 0.7176759243011475, + "learning_rate": 3.6858749655521714e-07, + "loss": 2.7881, + "step": 60249 + }, + { + "epoch": 2.95, + "grad_norm": 0.834689199924469, + "learning_rate": 3.678249263337818e-07, + "loss": 2.8782, + "step": 60250 + }, + { + "epoch": 2.95, + "grad_norm": 0.7550405263900757, + "learning_rate": 3.6706314528580637e-07, + "loss": 3.157, + "step": 60251 + }, + { + "epoch": 2.95, + "grad_norm": 0.7085062861442566, + "learning_rate": 3.663021534133226e-07, + "loss": 2.9406, + "step": 60252 + }, + { + "epoch": 2.95, + "grad_norm": 0.7482970952987671, + "learning_rate": 3.655419507183288e-07, + "loss": 3.069, + "step": 60253 + }, + { + "epoch": 2.95, + "grad_norm": 0.7461667656898499, + "learning_rate": 3.647825372028568e-07, + "loss": 2.9194, + "step": 60254 + }, + { + "epoch": 2.95, + "grad_norm": 0.7867854237556458, + "learning_rate": 3.640239128688716e-07, + "loss": 2.7486, + "step": 60255 + }, + { + "epoch": 2.95, + "grad_norm": 0.7498422861099243, + "learning_rate": 3.6326607771837156e-07, + "loss": 2.6522, + "step": 60256 + }, + { + "epoch": 2.95, + "grad_norm": 0.75998854637146, + "learning_rate": 3.625090317533552e-07, + "loss": 2.9697, + "step": 60257 + }, + { + "epoch": 2.95, + "grad_norm": 0.7807403802871704, + "learning_rate": 3.617527749758542e-07, + "loss": 2.9362, + "step": 60258 + }, + { + "epoch": 2.95, + "grad_norm": 0.773429811000824, + "learning_rate": 3.609973073878003e-07, + "loss": 2.9117, + "step": 60259 + }, + { + "epoch": 2.95, + "grad_norm": 0.76775062084198, + "learning_rate": 3.602426289912252e-07, + "loss": 3.1239, + "step": 60260 + }, + { + "epoch": 2.95, + "grad_norm": 0.7371132969856262, + "learning_rate": 3.594887397880941e-07, + "loss": 2.8399, + "step": 60261 + }, + { + "epoch": 2.95, + "grad_norm": 0.747292697429657, + "learning_rate": 3.5873563978040534e-07, + "loss": 2.7784, + "step": 60262 + }, + { + "epoch": 2.95, + "grad_norm": 0.8072085976600647, + "learning_rate": 3.5798332897015727e-07, + "loss": 2.8365, + "step": 60263 + }, + { + "epoch": 2.95, + "grad_norm": 0.7212876677513123, + "learning_rate": 3.57231807359315e-07, + "loss": 2.7879, + "step": 60264 + }, + { + "epoch": 2.95, + "grad_norm": 0.7406131029129028, + "learning_rate": 3.564810749498437e-07, + "loss": 2.7748, + "step": 60265 + }, + { + "epoch": 2.95, + "grad_norm": 0.7893876433372498, + "learning_rate": 3.5573113174374167e-07, + "loss": 2.9208, + "step": 60266 + }, + { + "epoch": 2.95, + "grad_norm": 0.7580415606498718, + "learning_rate": 3.549819777429408e-07, + "loss": 2.708, + "step": 60267 + }, + { + "epoch": 2.95, + "grad_norm": 0.7362975478172302, + "learning_rate": 3.5423361294950603e-07, + "loss": 2.9144, + "step": 60268 + }, + { + "epoch": 2.95, + "grad_norm": 0.7409021854400635, + "learning_rate": 3.5348603736530254e-07, + "loss": 2.9298, + "step": 60269 + }, + { + "epoch": 2.95, + "grad_norm": 0.7252013683319092, + "learning_rate": 3.527392509923621e-07, + "loss": 3.0379, + "step": 60270 + }, + { + "epoch": 2.95, + "grad_norm": 0.8025727272033691, + "learning_rate": 3.5199325383264976e-07, + "loss": 2.7628, + "step": 60271 + }, + { + "epoch": 2.95, + "grad_norm": 0.775205135345459, + "learning_rate": 3.512480458881306e-07, + "loss": 2.9703, + "step": 60272 + }, + { + "epoch": 2.95, + "grad_norm": 0.7527696490287781, + "learning_rate": 3.505036271607364e-07, + "loss": 2.7959, + "step": 60273 + }, + { + "epoch": 2.95, + "grad_norm": 0.732417106628418, + "learning_rate": 3.4975999765243233e-07, + "loss": 2.8526, + "step": 60274 + }, + { + "epoch": 2.95, + "grad_norm": 0.7460353374481201, + "learning_rate": 3.490171573652167e-07, + "loss": 2.827, + "step": 60275 + }, + { + "epoch": 2.95, + "grad_norm": 0.785345733165741, + "learning_rate": 3.482751063010214e-07, + "loss": 2.8966, + "step": 60276 + }, + { + "epoch": 2.95, + "grad_norm": 0.7670798301696777, + "learning_rate": 3.4753384446177813e-07, + "loss": 3.0136, + "step": 60277 + }, + { + "epoch": 2.95, + "grad_norm": 0.7101062536239624, + "learning_rate": 3.467933718494853e-07, + "loss": 2.7194, + "step": 60278 + }, + { + "epoch": 2.95, + "grad_norm": 0.7125613689422607, + "learning_rate": 3.4605368846607473e-07, + "loss": 2.9547, + "step": 60279 + }, + { + "epoch": 2.95, + "grad_norm": 0.7298769950866699, + "learning_rate": 3.453147943134782e-07, + "loss": 2.9709, + "step": 60280 + }, + { + "epoch": 2.95, + "grad_norm": 0.7311604619026184, + "learning_rate": 3.4457668939362747e-07, + "loss": 2.7934, + "step": 60281 + }, + { + "epoch": 2.95, + "grad_norm": 0.7600909471511841, + "learning_rate": 3.43839373708521e-07, + "loss": 3.0138, + "step": 60282 + }, + { + "epoch": 2.95, + "grad_norm": 0.9641423225402832, + "learning_rate": 3.431028472600572e-07, + "loss": 2.7382, + "step": 60283 + }, + { + "epoch": 2.95, + "grad_norm": 0.7784249186515808, + "learning_rate": 3.423671100502012e-07, + "loss": 2.7052, + "step": 60284 + }, + { + "epoch": 2.95, + "grad_norm": 0.7559043765068054, + "learning_rate": 3.4163216208088485e-07, + "loss": 2.7005, + "step": 60285 + }, + { + "epoch": 2.95, + "grad_norm": 0.7473515272140503, + "learning_rate": 3.408980033540398e-07, + "loss": 2.9085, + "step": 60286 + }, + { + "epoch": 2.95, + "grad_norm": 0.7658305168151855, + "learning_rate": 3.401646338715647e-07, + "loss": 2.7508, + "step": 60287 + }, + { + "epoch": 2.95, + "grad_norm": 0.7382382154464722, + "learning_rate": 3.394320536354578e-07, + "loss": 2.912, + "step": 60288 + }, + { + "epoch": 2.95, + "grad_norm": 0.762175440788269, + "learning_rate": 3.387002626476176e-07, + "loss": 3.04, + "step": 60289 + }, + { + "epoch": 2.95, + "grad_norm": 0.7681100964546204, + "learning_rate": 3.379692609099427e-07, + "loss": 2.9685, + "step": 60290 + }, + { + "epoch": 2.95, + "grad_norm": 0.7338895797729492, + "learning_rate": 3.372390484244314e-07, + "loss": 2.8639, + "step": 60291 + }, + { + "epoch": 2.95, + "grad_norm": 0.7698550224304199, + "learning_rate": 3.3650962519291557e-07, + "loss": 2.8677, + "step": 60292 + }, + { + "epoch": 2.95, + "grad_norm": 0.7156473398208618, + "learning_rate": 3.3578099121739365e-07, + "loss": 2.9148, + "step": 60293 + }, + { + "epoch": 2.95, + "grad_norm": 0.7647261023521423, + "learning_rate": 3.350531464997308e-07, + "loss": 2.7391, + "step": 60294 + }, + { + "epoch": 2.95, + "grad_norm": 0.7407509088516235, + "learning_rate": 3.3432609104189214e-07, + "loss": 2.776, + "step": 60295 + }, + { + "epoch": 2.95, + "grad_norm": 0.713916540145874, + "learning_rate": 3.335998248457428e-07, + "loss": 2.6472, + "step": 60296 + }, + { + "epoch": 2.96, + "grad_norm": 0.7673091888427734, + "learning_rate": 3.328743479132145e-07, + "loss": 2.8011, + "step": 60297 + }, + { + "epoch": 2.96, + "grad_norm": 0.7312276363372803, + "learning_rate": 3.3214966024623926e-07, + "loss": 2.7984, + "step": 60298 + }, + { + "epoch": 2.96, + "grad_norm": 0.7208895087242126, + "learning_rate": 3.3142576184668204e-07, + "loss": 2.8252, + "step": 60299 + }, + { + "epoch": 2.96, + "grad_norm": 0.7697820663452148, + "learning_rate": 3.30702652716508e-07, + "loss": 2.7858, + "step": 60300 + }, + { + "epoch": 2.96, + "grad_norm": 0.7742111682891846, + "learning_rate": 3.299803328575823e-07, + "loss": 2.994, + "step": 60301 + }, + { + "epoch": 2.96, + "grad_norm": 0.7125436663627625, + "learning_rate": 3.2925880227183677e-07, + "loss": 2.9656, + "step": 60302 + }, + { + "epoch": 2.96, + "grad_norm": 0.7392163872718811, + "learning_rate": 3.2853806096113655e-07, + "loss": 2.8482, + "step": 60303 + }, + { + "epoch": 2.96, + "grad_norm": 0.77128005027771, + "learning_rate": 3.2781810892738016e-07, + "loss": 3.0908, + "step": 60304 + }, + { + "epoch": 2.96, + "grad_norm": 0.7209950685501099, + "learning_rate": 3.27098946172466e-07, + "loss": 2.9793, + "step": 60305 + }, + { + "epoch": 2.96, + "grad_norm": 0.7680009007453918, + "learning_rate": 3.2638057269832594e-07, + "loss": 2.9299, + "step": 60306 + }, + { + "epoch": 2.96, + "grad_norm": 0.7300169467926025, + "learning_rate": 3.2566298850682513e-07, + "loss": 2.9231, + "step": 60307 + }, + { + "epoch": 2.96, + "grad_norm": 0.7227309942245483, + "learning_rate": 3.249461935998621e-07, + "loss": 2.7252, + "step": 60308 + }, + { + "epoch": 2.96, + "grad_norm": 0.7580020427703857, + "learning_rate": 3.242301879793019e-07, + "loss": 2.8591, + "step": 60309 + }, + { + "epoch": 2.96, + "grad_norm": 0.7477824687957764, + "learning_rate": 3.235149716470431e-07, + "loss": 2.9945, + "step": 60310 + }, + { + "epoch": 2.96, + "grad_norm": 0.7373795509338379, + "learning_rate": 3.2280054460498414e-07, + "loss": 2.882, + "step": 60311 + }, + { + "epoch": 2.96, + "grad_norm": 0.766094982624054, + "learning_rate": 3.22086906854957e-07, + "loss": 2.8065, + "step": 60312 + }, + { + "epoch": 2.96, + "grad_norm": 0.7640054225921631, + "learning_rate": 3.213740583989266e-07, + "loss": 2.9561, + "step": 60313 + }, + { + "epoch": 2.96, + "grad_norm": 0.7449299097061157, + "learning_rate": 3.206619992386916e-07, + "loss": 3.0813, + "step": 60314 + }, + { + "epoch": 2.96, + "grad_norm": 0.7240998148918152, + "learning_rate": 3.1995072937618385e-07, + "loss": 3.1171, + "step": 60315 + }, + { + "epoch": 2.96, + "grad_norm": 0.756550133228302, + "learning_rate": 3.1924024881323505e-07, + "loss": 2.882, + "step": 60316 + }, + { + "epoch": 2.96, + "grad_norm": 0.7245644330978394, + "learning_rate": 3.1853055755174386e-07, + "loss": 3.0473, + "step": 60317 + }, + { + "epoch": 2.96, + "grad_norm": 0.7418863773345947, + "learning_rate": 3.17821655593542e-07, + "loss": 2.9368, + "step": 60318 + }, + { + "epoch": 2.96, + "grad_norm": 0.7780527472496033, + "learning_rate": 3.1711354294056137e-07, + "loss": 2.8981, + "step": 60319 + }, + { + "epoch": 2.96, + "grad_norm": 0.6818873286247253, + "learning_rate": 3.1640621959460046e-07, + "loss": 2.7333, + "step": 60320 + }, + { + "epoch": 2.96, + "grad_norm": 0.7510486245155334, + "learning_rate": 3.1569968555755777e-07, + "loss": 2.8713, + "step": 60321 + }, + { + "epoch": 2.96, + "grad_norm": 0.7911639213562012, + "learning_rate": 3.1499394083129845e-07, + "loss": 2.9481, + "step": 60322 + }, + { + "epoch": 2.96, + "grad_norm": 0.779403030872345, + "learning_rate": 3.1428898541765444e-07, + "loss": 2.9089, + "step": 60323 + }, + { + "epoch": 2.96, + "grad_norm": 0.7606043815612793, + "learning_rate": 3.135848193184909e-07, + "loss": 2.7561, + "step": 60324 + }, + { + "epoch": 2.96, + "grad_norm": 0.7843817472457886, + "learning_rate": 3.128814425357062e-07, + "loss": 2.6959, + "step": 60325 + }, + { + "epoch": 2.96, + "grad_norm": 0.7207468152046204, + "learning_rate": 3.1217885507106576e-07, + "loss": 2.6606, + "step": 60326 + }, + { + "epoch": 2.96, + "grad_norm": 0.753822386264801, + "learning_rate": 3.1147705692650126e-07, + "loss": 2.8586, + "step": 60327 + }, + { + "epoch": 2.96, + "grad_norm": 0.7353950142860413, + "learning_rate": 3.107760481038113e-07, + "loss": 2.9383, + "step": 60328 + }, + { + "epoch": 2.96, + "grad_norm": 0.7305023074150085, + "learning_rate": 3.1007582860489433e-07, + "loss": 2.8052, + "step": 60329 + }, + { + "epoch": 2.96, + "grad_norm": 0.716469943523407, + "learning_rate": 3.0937639843154893e-07, + "loss": 2.8077, + "step": 60330 + }, + { + "epoch": 2.96, + "grad_norm": 0.78750079870224, + "learning_rate": 3.0867775758560697e-07, + "loss": 2.8025, + "step": 60331 + }, + { + "epoch": 2.96, + "grad_norm": 0.7497711777687073, + "learning_rate": 3.079799060689336e-07, + "loss": 3.0581, + "step": 60332 + }, + { + "epoch": 2.96, + "grad_norm": 0.7364689111709595, + "learning_rate": 3.072828438833941e-07, + "loss": 2.6442, + "step": 60333 + }, + { + "epoch": 2.96, + "grad_norm": 0.8130603432655334, + "learning_rate": 3.065865710307536e-07, + "loss": 3.0189, + "step": 60334 + }, + { + "epoch": 2.96, + "grad_norm": 0.7645628452301025, + "learning_rate": 3.058910875129106e-07, + "loss": 2.8934, + "step": 60335 + }, + { + "epoch": 2.96, + "grad_norm": 0.759429395198822, + "learning_rate": 3.051963933316637e-07, + "loss": 2.8775, + "step": 60336 + }, + { + "epoch": 2.96, + "grad_norm": 0.7373213171958923, + "learning_rate": 3.04502488488878e-07, + "loss": 2.9541, + "step": 60337 + }, + { + "epoch": 2.96, + "grad_norm": 0.7441393136978149, + "learning_rate": 3.038093729863522e-07, + "loss": 2.9521, + "step": 60338 + }, + { + "epoch": 2.96, + "grad_norm": 0.7805241346359253, + "learning_rate": 3.0311704682588477e-07, + "loss": 2.8231, + "step": 60339 + }, + { + "epoch": 2.96, + "grad_norm": 0.7624837160110474, + "learning_rate": 3.024255100093409e-07, + "loss": 2.7851, + "step": 60340 + }, + { + "epoch": 2.96, + "grad_norm": 0.800739049911499, + "learning_rate": 3.0173476253851915e-07, + "loss": 2.8968, + "step": 60341 + }, + { + "epoch": 2.96, + "grad_norm": 0.7413939833641052, + "learning_rate": 3.0104480441528465e-07, + "loss": 2.7343, + "step": 60342 + }, + { + "epoch": 2.96, + "grad_norm": 0.7429508566856384, + "learning_rate": 3.0035563564140275e-07, + "loss": 3.1652, + "step": 60343 + }, + { + "epoch": 2.96, + "grad_norm": 0.7063744068145752, + "learning_rate": 2.9966725621870527e-07, + "loss": 2.7918, + "step": 60344 + }, + { + "epoch": 2.96, + "grad_norm": 0.8287339806556702, + "learning_rate": 2.989796661489907e-07, + "loss": 2.8146, + "step": 60345 + }, + { + "epoch": 2.96, + "grad_norm": 0.7384853959083557, + "learning_rate": 2.98292865434091e-07, + "loss": 3.0369, + "step": 60346 + }, + { + "epoch": 2.96, + "grad_norm": 0.7586947679519653, + "learning_rate": 2.976068540758381e-07, + "loss": 3.0617, + "step": 60347 + }, + { + "epoch": 2.96, + "grad_norm": 0.7649457454681396, + "learning_rate": 2.9692163207596375e-07, + "loss": 2.9689, + "step": 60348 + }, + { + "epoch": 2.96, + "grad_norm": 0.8336549401283264, + "learning_rate": 2.9623719943633326e-07, + "loss": 3.0257, + "step": 60349 + }, + { + "epoch": 2.96, + "grad_norm": 0.767746090888977, + "learning_rate": 2.955535561587452e-07, + "loss": 2.9512, + "step": 60350 + }, + { + "epoch": 2.96, + "grad_norm": 0.737592339515686, + "learning_rate": 2.948707022449981e-07, + "loss": 3.0337, + "step": 60351 + }, + { + "epoch": 2.96, + "grad_norm": 0.7662484049797058, + "learning_rate": 2.941886376968905e-07, + "loss": 3.0496, + "step": 60352 + }, + { + "epoch": 2.96, + "grad_norm": 0.7627364993095398, + "learning_rate": 2.9350736251618765e-07, + "loss": 2.8868, + "step": 60353 + }, + { + "epoch": 2.96, + "grad_norm": 0.7466115951538086, + "learning_rate": 2.9282687670468817e-07, + "loss": 3.1975, + "step": 60354 + }, + { + "epoch": 2.96, + "grad_norm": 0.7700613141059875, + "learning_rate": 2.921471802642572e-07, + "loss": 2.9977, + "step": 60355 + }, + { + "epoch": 2.96, + "grad_norm": 0.7370407581329346, + "learning_rate": 2.9146827319659337e-07, + "loss": 2.7743, + "step": 60356 + }, + { + "epoch": 2.96, + "grad_norm": 0.7551794648170471, + "learning_rate": 2.907901555035286e-07, + "loss": 2.8775, + "step": 60357 + }, + { + "epoch": 2.96, + "grad_norm": 0.7633122801780701, + "learning_rate": 2.90112827186828e-07, + "loss": 3.004, + "step": 60358 + }, + { + "epoch": 2.96, + "grad_norm": 0.7845262289047241, + "learning_rate": 2.8943628824832365e-07, + "loss": 2.8964, + "step": 60359 + }, + { + "epoch": 2.96, + "grad_norm": 0.7491711378097534, + "learning_rate": 2.8876053868974734e-07, + "loss": 2.9076, + "step": 60360 + }, + { + "epoch": 2.96, + "grad_norm": 0.7596331238746643, + "learning_rate": 2.880855785128644e-07, + "loss": 2.9886, + "step": 60361 + }, + { + "epoch": 2.96, + "grad_norm": 0.7640942931175232, + "learning_rate": 2.8741140771953996e-07, + "loss": 2.879, + "step": 60362 + }, + { + "epoch": 2.96, + "grad_norm": 0.7326937317848206, + "learning_rate": 2.867380263114394e-07, + "loss": 3.1048, + "step": 60363 + }, + { + "epoch": 2.96, + "grad_norm": 0.7499207854270935, + "learning_rate": 2.860654342904278e-07, + "loss": 2.9645, + "step": 60364 + }, + { + "epoch": 2.96, + "grad_norm": 0.8236726522445679, + "learning_rate": 2.853936316582039e-07, + "loss": 2.878, + "step": 60365 + }, + { + "epoch": 2.96, + "grad_norm": 0.7456938624382019, + "learning_rate": 2.847226184165996e-07, + "loss": 2.9094, + "step": 60366 + }, + { + "epoch": 2.96, + "grad_norm": 0.7293276786804199, + "learning_rate": 2.8405239456734675e-07, + "loss": 2.7994, + "step": 60367 + }, + { + "epoch": 2.96, + "grad_norm": 0.7146532535552979, + "learning_rate": 2.833829601122439e-07, + "loss": 2.8819, + "step": 60368 + }, + { + "epoch": 2.96, + "grad_norm": 0.784432053565979, + "learning_rate": 2.827143150529898e-07, + "loss": 2.6586, + "step": 60369 + }, + { + "epoch": 2.96, + "grad_norm": 0.7396759986877441, + "learning_rate": 2.8204645939141623e-07, + "loss": 2.786, + "step": 60370 + }, + { + "epoch": 2.96, + "grad_norm": 0.7426009774208069, + "learning_rate": 2.813793931292219e-07, + "loss": 2.9405, + "step": 60371 + }, + { + "epoch": 2.96, + "grad_norm": 0.7698811292648315, + "learning_rate": 2.8071311626820526e-07, + "loss": 2.6749, + "step": 60372 + }, + { + "epoch": 2.96, + "grad_norm": 0.8511273860931396, + "learning_rate": 2.8004762881009833e-07, + "loss": 2.877, + "step": 60373 + }, + { + "epoch": 2.96, + "grad_norm": 0.7467965483665466, + "learning_rate": 2.793829307566664e-07, + "loss": 2.8784, + "step": 60374 + }, + { + "epoch": 2.96, + "grad_norm": 0.7436198592185974, + "learning_rate": 2.7871902210967467e-07, + "loss": 3.0372, + "step": 60375 + }, + { + "epoch": 2.96, + "grad_norm": 0.7805924415588379, + "learning_rate": 2.780559028708551e-07, + "loss": 3.0216, + "step": 60376 + }, + { + "epoch": 2.96, + "grad_norm": 0.7895100712776184, + "learning_rate": 2.7739357304193963e-07, + "loss": 2.9302, + "step": 60377 + }, + { + "epoch": 2.96, + "grad_norm": 0.7379295229911804, + "learning_rate": 2.7673203262469355e-07, + "loss": 2.8006, + "step": 60378 + }, + { + "epoch": 2.96, + "grad_norm": 0.7495266199111938, + "learning_rate": 2.760712816208155e-07, + "loss": 2.8561, + "step": 60379 + }, + { + "epoch": 2.96, + "grad_norm": 0.7267303466796875, + "learning_rate": 2.754113200321373e-07, + "loss": 2.9655, + "step": 60380 + }, + { + "epoch": 2.96, + "grad_norm": 0.743676483631134, + "learning_rate": 2.7475214786032427e-07, + "loss": 2.9309, + "step": 60381 + }, + { + "epoch": 2.96, + "grad_norm": 0.7842069864273071, + "learning_rate": 2.7409376510710847e-07, + "loss": 2.9816, + "step": 60382 + }, + { + "epoch": 2.96, + "grad_norm": 0.7070533037185669, + "learning_rate": 2.7343617177428835e-07, + "loss": 2.7144, + "step": 60383 + }, + { + "epoch": 2.96, + "grad_norm": 0.7402557134628296, + "learning_rate": 2.7277936786352925e-07, + "loss": 3.0865, + "step": 60384 + }, + { + "epoch": 2.96, + "grad_norm": 0.7583931684494019, + "learning_rate": 2.7212335337656323e-07, + "loss": 2.7086, + "step": 60385 + }, + { + "epoch": 2.96, + "grad_norm": 0.7335100173950195, + "learning_rate": 2.714681283151554e-07, + "loss": 2.8412, + "step": 60386 + }, + { + "epoch": 2.96, + "grad_norm": 0.7490924596786499, + "learning_rate": 2.708136926810378e-07, + "loss": 2.7074, + "step": 60387 + }, + { + "epoch": 2.96, + "grad_norm": 0.7380550503730774, + "learning_rate": 2.7016004647590905e-07, + "loss": 2.6201, + "step": 60388 + }, + { + "epoch": 2.96, + "grad_norm": 0.7886814475059509, + "learning_rate": 2.695071897014678e-07, + "loss": 3.0093, + "step": 60389 + }, + { + "epoch": 2.96, + "grad_norm": 0.7457057237625122, + "learning_rate": 2.688551223594793e-07, + "loss": 2.8093, + "step": 60390 + }, + { + "epoch": 2.96, + "grad_norm": 0.7019795179367065, + "learning_rate": 2.682038444516421e-07, + "loss": 2.7947, + "step": 60391 + }, + { + "epoch": 2.96, + "grad_norm": 0.7453329563140869, + "learning_rate": 2.675533559796883e-07, + "loss": 2.8498, + "step": 60392 + }, + { + "epoch": 2.96, + "grad_norm": 0.7653692364692688, + "learning_rate": 2.6690365694528317e-07, + "loss": 2.88, + "step": 60393 + }, + { + "epoch": 2.96, + "grad_norm": 0.7203850746154785, + "learning_rate": 2.662547473501919e-07, + "loss": 2.7761, + "step": 60394 + }, + { + "epoch": 2.96, + "grad_norm": 0.7470075488090515, + "learning_rate": 2.656066271960799e-07, + "loss": 2.8994, + "step": 60395 + }, + { + "epoch": 2.96, + "grad_norm": 0.765425980091095, + "learning_rate": 2.649592964846792e-07, + "loss": 2.7976, + "step": 60396 + }, + { + "epoch": 2.96, + "grad_norm": 0.7347993850708008, + "learning_rate": 2.6431275521772156e-07, + "loss": 2.7715, + "step": 60397 + }, + { + "epoch": 2.96, + "grad_norm": 0.7307003736495972, + "learning_rate": 2.6366700339683913e-07, + "loss": 3.1203, + "step": 60398 + }, + { + "epoch": 2.96, + "grad_norm": 0.7308559417724609, + "learning_rate": 2.630220410237971e-07, + "loss": 2.9197, + "step": 60399 + }, + { + "epoch": 2.96, + "grad_norm": 0.7948404550552368, + "learning_rate": 2.6237786810026085e-07, + "loss": 2.9589, + "step": 60400 + }, + { + "epoch": 2.96, + "grad_norm": 0.7453910112380981, + "learning_rate": 2.61734484627929e-07, + "loss": 2.9592, + "step": 60401 + }, + { + "epoch": 2.96, + "grad_norm": 0.7211994528770447, + "learning_rate": 2.610918906085002e-07, + "loss": 3.1642, + "step": 60402 + }, + { + "epoch": 2.96, + "grad_norm": 0.7621414065361023, + "learning_rate": 2.604500860436731e-07, + "loss": 2.7984, + "step": 60403 + }, + { + "epoch": 2.96, + "grad_norm": 0.722687840461731, + "learning_rate": 2.5980907093514635e-07, + "loss": 3.0943, + "step": 60404 + }, + { + "epoch": 2.96, + "grad_norm": 0.7349399924278259, + "learning_rate": 2.5916884528458523e-07, + "loss": 2.7707, + "step": 60405 + }, + { + "epoch": 2.96, + "grad_norm": 0.7808841466903687, + "learning_rate": 2.5852940909368846e-07, + "loss": 2.7073, + "step": 60406 + }, + { + "epoch": 2.96, + "grad_norm": 0.7772266268730164, + "learning_rate": 2.578907623641546e-07, + "loss": 2.9222, + "step": 60407 + }, + { + "epoch": 2.96, + "grad_norm": 0.7723431587219238, + "learning_rate": 2.572529050976491e-07, + "loss": 2.8101, + "step": 60408 + }, + { + "epoch": 2.96, + "grad_norm": 0.7670159935951233, + "learning_rate": 2.5661583729583713e-07, + "loss": 2.8798, + "step": 60409 + }, + { + "epoch": 2.96, + "grad_norm": 0.780825138092041, + "learning_rate": 2.5597955896041744e-07, + "loss": 2.6376, + "step": 60410 + }, + { + "epoch": 2.96, + "grad_norm": 0.7525875568389893, + "learning_rate": 2.5534407009305536e-07, + "loss": 2.8498, + "step": 60411 + }, + { + "epoch": 2.96, + "grad_norm": 0.7023805975914001, + "learning_rate": 2.547093706954162e-07, + "loss": 2.9365, + "step": 60412 + }, + { + "epoch": 2.96, + "grad_norm": 0.7754830121994019, + "learning_rate": 2.540754607691986e-07, + "loss": 2.9444, + "step": 60413 + }, + { + "epoch": 2.96, + "grad_norm": 0.7659097909927368, + "learning_rate": 2.534423403160679e-07, + "loss": 2.9425, + "step": 60414 + }, + { + "epoch": 2.96, + "grad_norm": 0.8276565074920654, + "learning_rate": 2.5281000933768946e-07, + "loss": 2.9218, + "step": 60415 + }, + { + "epoch": 2.96, + "grad_norm": 0.778061032295227, + "learning_rate": 2.521784678356953e-07, + "loss": 3.0158, + "step": 60416 + }, + { + "epoch": 2.96, + "grad_norm": 0.7523320317268372, + "learning_rate": 2.515477158118173e-07, + "loss": 2.7615, + "step": 60417 + }, + { + "epoch": 2.96, + "grad_norm": 0.8072141408920288, + "learning_rate": 2.50917753267621e-07, + "loss": 2.6866, + "step": 60418 + }, + { + "epoch": 2.96, + "grad_norm": 0.7441281676292419, + "learning_rate": 2.502885802048715e-07, + "loss": 2.6342, + "step": 60419 + }, + { + "epoch": 2.96, + "grad_norm": 0.7458040714263916, + "learning_rate": 2.4966019662513435e-07, + "loss": 2.9695, + "step": 60420 + }, + { + "epoch": 2.96, + "grad_norm": 0.7723721265792847, + "learning_rate": 2.490326025301082e-07, + "loss": 2.8109, + "step": 60421 + }, + { + "epoch": 2.96, + "grad_norm": 0.7250966429710388, + "learning_rate": 2.484057979214582e-07, + "loss": 3.1312, + "step": 60422 + }, + { + "epoch": 2.96, + "grad_norm": 0.7306153178215027, + "learning_rate": 2.4777978280081657e-07, + "loss": 2.9188, + "step": 60423 + }, + { + "epoch": 2.96, + "grad_norm": 0.7456044554710388, + "learning_rate": 2.4715455716984853e-07, + "loss": 2.8441, + "step": 60424 + }, + { + "epoch": 2.96, + "grad_norm": 0.7753781080245972, + "learning_rate": 2.4653012103015293e-07, + "loss": 2.8846, + "step": 60425 + }, + { + "epoch": 2.96, + "grad_norm": 0.6862159371376038, + "learning_rate": 2.4590647438342823e-07, + "loss": 2.8749, + "step": 60426 + }, + { + "epoch": 2.96, + "grad_norm": 0.7013908624649048, + "learning_rate": 2.452836172313066e-07, + "loss": 2.9398, + "step": 60427 + }, + { + "epoch": 2.96, + "grad_norm": 0.7356927990913391, + "learning_rate": 2.4466154957538676e-07, + "loss": 2.9671, + "step": 60428 + }, + { + "epoch": 2.96, + "grad_norm": 0.7475108504295349, + "learning_rate": 2.4404027141736726e-07, + "loss": 2.9092, + "step": 60429 + }, + { + "epoch": 2.96, + "grad_norm": 0.7366794943809509, + "learning_rate": 2.4341978275884686e-07, + "loss": 2.8098, + "step": 60430 + }, + { + "epoch": 2.96, + "grad_norm": 0.7213911414146423, + "learning_rate": 2.428000836014576e-07, + "loss": 3.0121, + "step": 60431 + }, + { + "epoch": 2.96, + "grad_norm": 0.7313321232795715, + "learning_rate": 2.4218117394686487e-07, + "loss": 2.7566, + "step": 60432 + }, + { + "epoch": 2.96, + "grad_norm": 0.7520000338554382, + "learning_rate": 2.415630537966673e-07, + "loss": 2.8234, + "step": 60433 + }, + { + "epoch": 2.96, + "grad_norm": 0.7432150840759277, + "learning_rate": 2.4094572315253025e-07, + "loss": 3.0233, + "step": 60434 + }, + { + "epoch": 2.96, + "grad_norm": 0.7823925018310547, + "learning_rate": 2.403291820160191e-07, + "loss": 2.9009, + "step": 60435 + }, + { + "epoch": 2.96, + "grad_norm": 0.7720388770103455, + "learning_rate": 2.397134303887993e-07, + "loss": 2.8322, + "step": 60436 + }, + { + "epoch": 2.96, + "grad_norm": 0.7479499578475952, + "learning_rate": 2.390984682724695e-07, + "loss": 2.8643, + "step": 60437 + }, + { + "epoch": 2.96, + "grad_norm": 0.7411769032478333, + "learning_rate": 2.384842956686617e-07, + "loss": 2.8443, + "step": 60438 + }, + { + "epoch": 2.96, + "grad_norm": 0.7253438830375671, + "learning_rate": 2.3787091257904122e-07, + "loss": 3.2558, + "step": 60439 + }, + { + "epoch": 2.96, + "grad_norm": 0.7320099472999573, + "learning_rate": 2.3725831900514026e-07, + "loss": 2.7542, + "step": 60440 + }, + { + "epoch": 2.96, + "grad_norm": 0.7726000547409058, + "learning_rate": 2.3664651494862407e-07, + "loss": 3.1502, + "step": 60441 + }, + { + "epoch": 2.96, + "grad_norm": 0.7863835096359253, + "learning_rate": 2.3603550041109142e-07, + "loss": 2.9089, + "step": 60442 + }, + { + "epoch": 2.96, + "grad_norm": 0.7607067823410034, + "learning_rate": 2.3542527539414102e-07, + "loss": 2.8335, + "step": 60443 + }, + { + "epoch": 2.96, + "grad_norm": 0.7399455308914185, + "learning_rate": 2.348158398994049e-07, + "loss": 2.9403, + "step": 60444 + }, + { + "epoch": 2.96, + "grad_norm": 0.7538642883300781, + "learning_rate": 2.3420719392844844e-07, + "loss": 2.9633, + "step": 60445 + }, + { + "epoch": 2.96, + "grad_norm": 0.7989528775215149, + "learning_rate": 2.335993374829037e-07, + "loss": 2.8171, + "step": 60446 + }, + { + "epoch": 2.96, + "grad_norm": 0.7250673770904541, + "learning_rate": 2.3299227056436942e-07, + "loss": 2.9535, + "step": 60447 + }, + { + "epoch": 2.96, + "grad_norm": 0.7597386240959167, + "learning_rate": 2.323859931744776e-07, + "loss": 2.8269, + "step": 60448 + }, + { + "epoch": 2.96, + "grad_norm": 0.7532151937484741, + "learning_rate": 2.3178050531476034e-07, + "loss": 2.8806, + "step": 60449 + }, + { + "epoch": 2.96, + "grad_norm": 0.7555036544799805, + "learning_rate": 2.3117580698681638e-07, + "loss": 3.0434, + "step": 60450 + }, + { + "epoch": 2.96, + "grad_norm": 0.7302855253219604, + "learning_rate": 2.3057189819231103e-07, + "loss": 2.9344, + "step": 60451 + }, + { + "epoch": 2.96, + "grad_norm": 0.715503454208374, + "learning_rate": 2.2996877893274313e-07, + "loss": 2.769, + "step": 60452 + }, + { + "epoch": 2.96, + "grad_norm": 0.770855188369751, + "learning_rate": 2.2936644920977798e-07, + "loss": 2.8592, + "step": 60453 + }, + { + "epoch": 2.96, + "grad_norm": 0.7369658350944519, + "learning_rate": 2.2876490902498102e-07, + "loss": 2.8822, + "step": 60454 + }, + { + "epoch": 2.96, + "grad_norm": 0.7520059943199158, + "learning_rate": 2.2816415837988432e-07, + "loss": 2.9528, + "step": 60455 + }, + { + "epoch": 2.96, + "grad_norm": 0.7750919461250305, + "learning_rate": 2.2756419727611997e-07, + "loss": 2.8362, + "step": 60456 + }, + { + "epoch": 2.96, + "grad_norm": 0.7011710405349731, + "learning_rate": 2.2696502571528664e-07, + "loss": 2.9554, + "step": 60457 + }, + { + "epoch": 2.96, + "grad_norm": 0.7666123509407043, + "learning_rate": 2.2636664369891643e-07, + "loss": 2.6842, + "step": 60458 + }, + { + "epoch": 2.96, + "grad_norm": 0.7649023532867432, + "learning_rate": 2.2576905122860812e-07, + "loss": 2.9592, + "step": 60459 + }, + { + "epoch": 2.96, + "grad_norm": 0.7662659287452698, + "learning_rate": 2.2517224830592707e-07, + "loss": 3.0518, + "step": 60460 + }, + { + "epoch": 2.96, + "grad_norm": 0.7530821561813354, + "learning_rate": 2.245762349324387e-07, + "loss": 2.9532, + "step": 60461 + }, + { + "epoch": 2.96, + "grad_norm": 0.7605764865875244, + "learning_rate": 2.2398101110974176e-07, + "loss": 2.7051, + "step": 60462 + }, + { + "epoch": 2.96, + "grad_norm": 0.760724663734436, + "learning_rate": 2.2338657683936834e-07, + "loss": 2.9107, + "step": 60463 + }, + { + "epoch": 2.96, + "grad_norm": 0.744367241859436, + "learning_rate": 2.2279293212291715e-07, + "loss": 2.7573, + "step": 60464 + }, + { + "epoch": 2.96, + "grad_norm": 0.7295060157775879, + "learning_rate": 2.2220007696195364e-07, + "loss": 2.9798, + "step": 60465 + }, + { + "epoch": 2.96, + "grad_norm": 0.7500439286231995, + "learning_rate": 2.2160801135797658e-07, + "loss": 2.9385, + "step": 60466 + }, + { + "epoch": 2.96, + "grad_norm": 0.7618116140365601, + "learning_rate": 2.21016735312618e-07, + "loss": 3.0254, + "step": 60467 + }, + { + "epoch": 2.96, + "grad_norm": 0.7346891760826111, + "learning_rate": 2.2042624882741e-07, + "loss": 2.9205, + "step": 60468 + }, + { + "epoch": 2.96, + "grad_norm": 0.7819466590881348, + "learning_rate": 2.1983655190391804e-07, + "loss": 2.892, + "step": 60469 + }, + { + "epoch": 2.96, + "grad_norm": 0.7500108480453491, + "learning_rate": 2.1924764454364084e-07, + "loss": 2.7599, + "step": 60470 + }, + { + "epoch": 2.96, + "grad_norm": 0.7795835137367249, + "learning_rate": 2.1865952674821051e-07, + "loss": 2.8849, + "step": 60471 + }, + { + "epoch": 2.96, + "grad_norm": 0.7483556270599365, + "learning_rate": 2.1807219851912582e-07, + "loss": 2.7881, + "step": 60472 + }, + { + "epoch": 2.96, + "grad_norm": 0.7419742345809937, + "learning_rate": 2.1748565985791888e-07, + "loss": 3.0436, + "step": 60473 + }, + { + "epoch": 2.96, + "grad_norm": 0.7780374884605408, + "learning_rate": 2.168999107661884e-07, + "loss": 2.9787, + "step": 60474 + }, + { + "epoch": 2.96, + "grad_norm": 0.7674023509025574, + "learning_rate": 2.163149512454665e-07, + "loss": 3.093, + "step": 60475 + }, + { + "epoch": 2.96, + "grad_norm": 0.7375624775886536, + "learning_rate": 2.1573078129725196e-07, + "loss": 2.7765, + "step": 60476 + }, + { + "epoch": 2.96, + "grad_norm": 0.7869064211845398, + "learning_rate": 2.1514740092311023e-07, + "loss": 2.8789, + "step": 60477 + }, + { + "epoch": 2.96, + "grad_norm": 0.7125802040100098, + "learning_rate": 2.1456481012454006e-07, + "loss": 2.7467, + "step": 60478 + }, + { + "epoch": 2.96, + "grad_norm": 0.7873156666755676, + "learning_rate": 2.1398300890317354e-07, + "loss": 2.6907, + "step": 60479 + }, + { + "epoch": 2.96, + "grad_norm": 0.7791264653205872, + "learning_rate": 2.1340199726044282e-07, + "loss": 2.898, + "step": 60480 + }, + { + "epoch": 2.96, + "grad_norm": 0.7683501839637756, + "learning_rate": 2.1282177519791333e-07, + "loss": 2.8212, + "step": 60481 + }, + { + "epoch": 2.96, + "grad_norm": 0.7381917834281921, + "learning_rate": 2.1224234271715045e-07, + "loss": 2.8748, + "step": 60482 + }, + { + "epoch": 2.96, + "grad_norm": 0.7684051394462585, + "learning_rate": 2.1166369981961973e-07, + "loss": 2.9751, + "step": 60483 + }, + { + "epoch": 2.96, + "grad_norm": 0.7232508659362793, + "learning_rate": 2.1108584650688652e-07, + "loss": 2.8718, + "step": 60484 + }, + { + "epoch": 2.96, + "grad_norm": 0.7385919094085693, + "learning_rate": 2.1050878278041637e-07, + "loss": 3.0691, + "step": 60485 + }, + { + "epoch": 2.96, + "grad_norm": 0.7586545944213867, + "learning_rate": 2.09932508641808e-07, + "loss": 2.9754, + "step": 60486 + }, + { + "epoch": 2.96, + "grad_norm": 0.7664059996604919, + "learning_rate": 2.0935702409252686e-07, + "loss": 2.9511, + "step": 60487 + }, + { + "epoch": 2.96, + "grad_norm": 0.7628438472747803, + "learning_rate": 2.0878232913413839e-07, + "loss": 2.8217, + "step": 60488 + }, + { + "epoch": 2.96, + "grad_norm": 0.765557587146759, + "learning_rate": 2.082084237680748e-07, + "loss": 2.8077, + "step": 60489 + }, + { + "epoch": 2.96, + "grad_norm": 0.7436291575431824, + "learning_rate": 2.0763530799593474e-07, + "loss": 2.8652, + "step": 60490 + }, + { + "epoch": 2.96, + "grad_norm": 0.7807187438011169, + "learning_rate": 2.070629818191505e-07, + "loss": 2.835, + "step": 60491 + }, + { + "epoch": 2.96, + "grad_norm": 0.7364583611488342, + "learning_rate": 2.064914452392874e-07, + "loss": 2.8279, + "step": 60492 + }, + { + "epoch": 2.96, + "grad_norm": 0.8078753352165222, + "learning_rate": 2.05920698257811e-07, + "loss": 2.6383, + "step": 60493 + }, + { + "epoch": 2.96, + "grad_norm": 0.7190173268318176, + "learning_rate": 2.0535074087625337e-07, + "loss": 2.8113, + "step": 60494 + }, + { + "epoch": 2.96, + "grad_norm": 0.7492725253105164, + "learning_rate": 2.047815730961133e-07, + "loss": 2.8811, + "step": 60495 + }, + { + "epoch": 2.96, + "grad_norm": 0.7870933413505554, + "learning_rate": 2.0421319491888966e-07, + "loss": 3.0474, + "step": 60496 + }, + { + "epoch": 2.96, + "grad_norm": 0.7799807786941528, + "learning_rate": 2.0364560634604787e-07, + "loss": 2.8235, + "step": 60497 + }, + { + "epoch": 2.96, + "grad_norm": 0.7293304800987244, + "learning_rate": 2.030788073791201e-07, + "loss": 2.9151, + "step": 60498 + }, + { + "epoch": 2.96, + "grad_norm": 0.7670882344245911, + "learning_rate": 2.0251279801957177e-07, + "loss": 2.7792, + "step": 60499 + }, + { + "epoch": 2.96, + "grad_norm": 0.7522982358932495, + "learning_rate": 2.0194757826893503e-07, + "loss": 2.9497, + "step": 60500 + }, + { + "epoch": 2.97, + "grad_norm": 0.811562716960907, + "learning_rate": 2.013831481286421e-07, + "loss": 3.0968, + "step": 60501 + }, + { + "epoch": 2.97, + "grad_norm": 0.7589870095252991, + "learning_rate": 2.0081950760022502e-07, + "loss": 2.9288, + "step": 60502 + }, + { + "epoch": 2.97, + "grad_norm": 0.7747796177864075, + "learning_rate": 2.0025665668514935e-07, + "loss": 2.8051, + "step": 60503 + }, + { + "epoch": 2.97, + "grad_norm": 0.7064770460128784, + "learning_rate": 1.9969459538488052e-07, + "loss": 3.077, + "step": 60504 + }, + { + "epoch": 2.97, + "grad_norm": 0.784367024898529, + "learning_rate": 1.991333237009507e-07, + "loss": 2.9097, + "step": 60505 + }, + { + "epoch": 2.97, + "grad_norm": 0.7294931411743164, + "learning_rate": 1.9857284163479203e-07, + "loss": 2.8172, + "step": 60506 + }, + { + "epoch": 2.97, + "grad_norm": 0.7817466259002686, + "learning_rate": 1.9801314918787003e-07, + "loss": 2.8352, + "step": 60507 + }, + { + "epoch": 2.97, + "grad_norm": 0.790314257144928, + "learning_rate": 1.974542463617168e-07, + "loss": 2.8238, + "step": 60508 + }, + { + "epoch": 2.97, + "grad_norm": 0.833737313747406, + "learning_rate": 1.9689613315776454e-07, + "loss": 2.7384, + "step": 60509 + }, + { + "epoch": 2.97, + "grad_norm": 0.7663082480430603, + "learning_rate": 1.963388095774787e-07, + "loss": 2.7838, + "step": 60510 + }, + { + "epoch": 2.97, + "grad_norm": 0.7980212569236755, + "learning_rate": 1.9578227562235816e-07, + "loss": 3.0168, + "step": 60511 + }, + { + "epoch": 2.97, + "grad_norm": 0.7827617526054382, + "learning_rate": 1.9522653129383504e-07, + "loss": 2.9978, + "step": 60512 + }, + { + "epoch": 2.97, + "grad_norm": 0.7780979871749878, + "learning_rate": 1.9467157659337486e-07, + "loss": 2.8574, + "step": 60513 + }, + { + "epoch": 2.97, + "grad_norm": 0.716793954372406, + "learning_rate": 1.941174115224764e-07, + "loss": 2.8676, + "step": 60514 + }, + { + "epoch": 2.97, + "grad_norm": 0.7576918601989746, + "learning_rate": 1.9356403608257187e-07, + "loss": 2.9088, + "step": 60515 + }, + { + "epoch": 2.97, + "grad_norm": 0.725763201713562, + "learning_rate": 1.9301145027509346e-07, + "loss": 2.8202, + "step": 60516 + }, + { + "epoch": 2.97, + "grad_norm": 0.7284558415412903, + "learning_rate": 1.9245965410153996e-07, + "loss": 2.9063, + "step": 60517 + }, + { + "epoch": 2.97, + "grad_norm": 0.729720950126648, + "learning_rate": 1.9190864756334355e-07, + "loss": 2.6978, + "step": 60518 + }, + { + "epoch": 2.97, + "grad_norm": 0.7677045464515686, + "learning_rate": 1.9135843066196976e-07, + "loss": 2.9292, + "step": 60519 + }, + { + "epoch": 2.97, + "grad_norm": 0.7284916639328003, + "learning_rate": 1.9080900339885075e-07, + "loss": 3.099, + "step": 60520 + }, + { + "epoch": 2.97, + "grad_norm": 0.7096062898635864, + "learning_rate": 1.90260365775452e-07, + "loss": 2.7886, + "step": 60521 + }, + { + "epoch": 2.97, + "grad_norm": 0.7152775526046753, + "learning_rate": 1.8971251779320574e-07, + "loss": 2.8109, + "step": 60522 + }, + { + "epoch": 2.97, + "grad_norm": 0.7443001866340637, + "learning_rate": 1.8916545945354412e-07, + "loss": 2.9791, + "step": 60523 + }, + { + "epoch": 2.97, + "grad_norm": 0.7410553693771362, + "learning_rate": 1.8861919075793263e-07, + "loss": 2.8849, + "step": 60524 + }, + { + "epoch": 2.97, + "grad_norm": 0.6965992450714111, + "learning_rate": 1.880737117078035e-07, + "loss": 2.9143, + "step": 60525 + }, + { + "epoch": 2.97, + "grad_norm": 0.7935270667076111, + "learning_rate": 1.8752902230458888e-07, + "loss": 2.9024, + "step": 60526 + }, + { + "epoch": 2.97, + "grad_norm": 0.725361168384552, + "learning_rate": 1.8698512254968766e-07, + "loss": 2.9605, + "step": 60527 + }, + { + "epoch": 2.97, + "grad_norm": 0.745184063911438, + "learning_rate": 1.8644201244459866e-07, + "loss": 2.8531, + "step": 60528 + }, + { + "epoch": 2.97, + "grad_norm": 0.7021890878677368, + "learning_rate": 1.8589969199072074e-07, + "loss": 3.0086, + "step": 60529 + }, + { + "epoch": 2.97, + "grad_norm": 0.7811173796653748, + "learning_rate": 1.853581611894861e-07, + "loss": 2.9315, + "step": 60530 + }, + { + "epoch": 2.97, + "grad_norm": 0.7329429984092712, + "learning_rate": 1.848174200422936e-07, + "loss": 2.9401, + "step": 60531 + }, + { + "epoch": 2.97, + "grad_norm": 0.7532490491867065, + "learning_rate": 1.8427746855060877e-07, + "loss": 2.6845, + "step": 60532 + }, + { + "epoch": 2.97, + "grad_norm": 0.7217630743980408, + "learning_rate": 1.8373830671583045e-07, + "loss": 2.6904, + "step": 60533 + }, + { + "epoch": 2.97, + "grad_norm": 0.772171676158905, + "learning_rate": 1.8319993453939087e-07, + "loss": 2.8626, + "step": 60534 + }, + { + "epoch": 2.97, + "grad_norm": 0.6981392502784729, + "learning_rate": 1.8266235202272217e-07, + "loss": 3.0572, + "step": 60535 + }, + { + "epoch": 2.97, + "grad_norm": 0.7536487579345703, + "learning_rate": 1.8212555916718997e-07, + "loss": 3.0188, + "step": 60536 + }, + { + "epoch": 2.97, + "grad_norm": 0.7514350414276123, + "learning_rate": 1.815895559742264e-07, + "loss": 2.8476, + "step": 60537 + }, + { + "epoch": 2.97, + "grad_norm": 0.7271486520767212, + "learning_rate": 1.8105434244529705e-07, + "loss": 2.8713, + "step": 60538 + }, + { + "epoch": 2.97, + "grad_norm": 0.8349723815917969, + "learning_rate": 1.805199185817341e-07, + "loss": 2.9083, + "step": 60539 + }, + { + "epoch": 2.97, + "grad_norm": 0.7469309568405151, + "learning_rate": 1.7998628438500306e-07, + "loss": 2.9315, + "step": 60540 + }, + { + "epoch": 2.97, + "grad_norm": 0.7527533769607544, + "learning_rate": 1.7945343985646955e-07, + "loss": 3.0218, + "step": 60541 + }, + { + "epoch": 2.97, + "grad_norm": 0.713559091091156, + "learning_rate": 1.789213849975657e-07, + "loss": 2.7901, + "step": 60542 + }, + { + "epoch": 2.97, + "grad_norm": 0.7423522472381592, + "learning_rate": 1.7839011980965712e-07, + "loss": 2.95, + "step": 60543 + }, + { + "epoch": 2.97, + "grad_norm": 0.7502104640007019, + "learning_rate": 1.7785964429420928e-07, + "loss": 2.8922, + "step": 60544 + }, + { + "epoch": 2.97, + "grad_norm": 0.7708683609962463, + "learning_rate": 1.7732995845255448e-07, + "loss": 2.8545, + "step": 60545 + }, + { + "epoch": 2.97, + "grad_norm": 0.7669790983200073, + "learning_rate": 1.7680106228609158e-07, + "loss": 3.0175, + "step": 60546 + }, + { + "epoch": 2.97, + "grad_norm": 0.7533451318740845, + "learning_rate": 1.7627295579625277e-07, + "loss": 2.8942, + "step": 60547 + }, + { + "epoch": 2.97, + "grad_norm": 0.7395743131637573, + "learning_rate": 1.7574563898440364e-07, + "loss": 2.8993, + "step": 60548 + }, + { + "epoch": 2.97, + "grad_norm": 0.777221143245697, + "learning_rate": 1.7521911185194305e-07, + "loss": 2.8378, + "step": 60549 + }, + { + "epoch": 2.97, + "grad_norm": 0.7704327702522278, + "learning_rate": 1.7469337440026986e-07, + "loss": 2.7663, + "step": 60550 + }, + { + "epoch": 2.97, + "grad_norm": 0.7657343149185181, + "learning_rate": 1.741684266307164e-07, + "loss": 2.8244, + "step": 60551 + }, + { + "epoch": 2.97, + "grad_norm": 0.7424664497375488, + "learning_rate": 1.736442685447148e-07, + "loss": 2.9918, + "step": 60552 + }, + { + "epoch": 2.97, + "grad_norm": 0.7621636390686035, + "learning_rate": 1.7312090014363068e-07, + "loss": 2.7005, + "step": 60553 + }, + { + "epoch": 2.97, + "grad_norm": 0.8045515418052673, + "learning_rate": 1.7259832142886286e-07, + "loss": 2.9842, + "step": 60554 + }, + { + "epoch": 2.97, + "grad_norm": 0.7256165742874146, + "learning_rate": 1.7207653240174369e-07, + "loss": 2.92, + "step": 60555 + }, + { + "epoch": 2.97, + "grad_norm": 0.7653000354766846, + "learning_rate": 1.7155553306367197e-07, + "loss": 2.6685, + "step": 60556 + }, + { + "epoch": 2.97, + "grad_norm": 0.7511091828346252, + "learning_rate": 1.7103532341604665e-07, + "loss": 2.8257, + "step": 60557 + }, + { + "epoch": 2.97, + "grad_norm": 0.7537457346916199, + "learning_rate": 1.7051590346019994e-07, + "loss": 2.954, + "step": 60558 + }, + { + "epoch": 2.97, + "grad_norm": 0.7920700311660767, + "learning_rate": 1.6999727319749745e-07, + "loss": 2.7778, + "step": 60559 + }, + { + "epoch": 2.97, + "grad_norm": 0.7353650331497192, + "learning_rate": 1.6947943262930475e-07, + "loss": 2.7668, + "step": 60560 + }, + { + "epoch": 2.97, + "grad_norm": 0.68696129322052, + "learning_rate": 1.689623817570207e-07, + "loss": 2.8671, + "step": 60561 + }, + { + "epoch": 2.97, + "grad_norm": 0.7720268368721008, + "learning_rate": 1.684461205819776e-07, + "loss": 2.7461, + "step": 60562 + }, + { + "epoch": 2.97, + "grad_norm": 0.787878692150116, + "learning_rate": 1.679306491055743e-07, + "loss": 2.6584, + "step": 60563 + }, + { + "epoch": 2.97, + "grad_norm": 0.7417284846305847, + "learning_rate": 1.674159673291098e-07, + "loss": 2.8816, + "step": 60564 + }, + { + "epoch": 2.97, + "grad_norm": 0.7512297034263611, + "learning_rate": 1.669020752539829e-07, + "loss": 2.8187, + "step": 60565 + }, + { + "epoch": 2.97, + "grad_norm": 0.7542157769203186, + "learning_rate": 1.6638897288152598e-07, + "loss": 2.9312, + "step": 60566 + }, + { + "epoch": 2.97, + "grad_norm": 0.7483489513397217, + "learning_rate": 1.6587666021307123e-07, + "loss": 2.934, + "step": 60567 + }, + { + "epoch": 2.97, + "grad_norm": 0.7624621987342834, + "learning_rate": 1.6536513725001754e-07, + "loss": 2.9115, + "step": 60568 + }, + { + "epoch": 2.97, + "grad_norm": 0.7325168251991272, + "learning_rate": 1.648544039936972e-07, + "loss": 2.9444, + "step": 60569 + }, + { + "epoch": 2.97, + "grad_norm": 0.693103015422821, + "learning_rate": 1.6434446044540916e-07, + "loss": 2.6785, + "step": 60570 + }, + { + "epoch": 2.97, + "grad_norm": 0.7448264360427856, + "learning_rate": 1.638353066065856e-07, + "loss": 2.8162, + "step": 60571 + }, + { + "epoch": 2.97, + "grad_norm": 0.7847685217857361, + "learning_rate": 1.633269424784589e-07, + "loss": 2.9311, + "step": 60572 + }, + { + "epoch": 2.97, + "grad_norm": 0.7694564461708069, + "learning_rate": 1.628193680624612e-07, + "loss": 2.8487, + "step": 60573 + }, + { + "epoch": 2.97, + "grad_norm": 0.7881042957305908, + "learning_rate": 1.623125833598915e-07, + "loss": 2.7117, + "step": 60574 + }, + { + "epoch": 2.97, + "grad_norm": 0.7773247957229614, + "learning_rate": 1.6180658837208204e-07, + "loss": 3.0141, + "step": 60575 + }, + { + "epoch": 2.97, + "grad_norm": 0.7404429912567139, + "learning_rate": 1.613013831003651e-07, + "loss": 2.8635, + "step": 60576 + }, + { + "epoch": 2.97, + "grad_norm": 0.7321801781654358, + "learning_rate": 1.6079696754607298e-07, + "loss": 2.7727, + "step": 60577 + }, + { + "epoch": 2.97, + "grad_norm": 0.7246294617652893, + "learning_rate": 1.6029334171053786e-07, + "loss": 3.004, + "step": 60578 + }, + { + "epoch": 2.97, + "grad_norm": 0.7736529111862183, + "learning_rate": 1.597905055950921e-07, + "loss": 2.9635, + "step": 60579 + }, + { + "epoch": 2.97, + "grad_norm": 0.746509850025177, + "learning_rate": 1.592884592010346e-07, + "loss": 2.8717, + "step": 60580 + }, + { + "epoch": 2.97, + "grad_norm": 0.7547891736030579, + "learning_rate": 1.587872025296977e-07, + "loss": 3.016, + "step": 60581 + }, + { + "epoch": 2.97, + "grad_norm": 0.7664916515350342, + "learning_rate": 1.5828673558244687e-07, + "loss": 2.9747, + "step": 60582 + }, + { + "epoch": 2.97, + "grad_norm": 0.7931352853775024, + "learning_rate": 1.5778705836054784e-07, + "loss": 2.8982, + "step": 60583 + }, + { + "epoch": 2.97, + "grad_norm": 0.7977129817008972, + "learning_rate": 1.5728817086533286e-07, + "loss": 3.0114, + "step": 60584 + }, + { + "epoch": 2.97, + "grad_norm": 0.7142460346221924, + "learning_rate": 1.5679007309810088e-07, + "loss": 2.9114, + "step": 60585 + }, + { + "epoch": 2.97, + "grad_norm": 0.7245585918426514, + "learning_rate": 1.5629276506018418e-07, + "loss": 2.7651, + "step": 60586 + }, + { + "epoch": 2.97, + "grad_norm": 0.742992639541626, + "learning_rate": 1.557962467528817e-07, + "loss": 2.851, + "step": 60587 + }, + { + "epoch": 2.97, + "grad_norm": 0.7510650753974915, + "learning_rate": 1.5530051817752576e-07, + "loss": 2.9322, + "step": 60588 + }, + { + "epoch": 2.97, + "grad_norm": 0.7841531038284302, + "learning_rate": 1.5480557933538195e-07, + "loss": 3.0033, + "step": 60589 + }, + { + "epoch": 2.97, + "grad_norm": 0.7130355834960938, + "learning_rate": 1.5431143022778257e-07, + "loss": 2.8831, + "step": 60590 + }, + { + "epoch": 2.97, + "grad_norm": 0.7567395567893982, + "learning_rate": 1.538180708560266e-07, + "loss": 3.1319, + "step": 60591 + }, + { + "epoch": 2.97, + "grad_norm": 0.7231013178825378, + "learning_rate": 1.5332550122141295e-07, + "loss": 2.9103, + "step": 60592 + }, + { + "epoch": 2.97, + "grad_norm": 0.7480908036231995, + "learning_rate": 1.528337213252073e-07, + "loss": 2.7543, + "step": 60593 + }, + { + "epoch": 2.97, + "grad_norm": 0.7661453485488892, + "learning_rate": 1.5234273116874197e-07, + "loss": 2.8553, + "step": 60594 + }, + { + "epoch": 2.97, + "grad_norm": 0.7607757449150085, + "learning_rate": 1.5185253075331584e-07, + "loss": 3.1001, + "step": 60595 + }, + { + "epoch": 2.97, + "grad_norm": 0.7835237383842468, + "learning_rate": 1.5136312008019458e-07, + "loss": 2.6377, + "step": 60596 + }, + { + "epoch": 2.97, + "grad_norm": 0.7366989254951477, + "learning_rate": 1.5087449915064387e-07, + "loss": 2.9224, + "step": 60597 + }, + { + "epoch": 2.97, + "grad_norm": 0.7715787887573242, + "learning_rate": 1.5038666796602926e-07, + "loss": 2.8592, + "step": 60598 + }, + { + "epoch": 2.97, + "grad_norm": 0.7971885800361633, + "learning_rate": 1.4989962652754983e-07, + "loss": 2.7836, + "step": 60599 + }, + { + "epoch": 2.97, + "grad_norm": 0.770579993724823, + "learning_rate": 1.4941337483657113e-07, + "loss": 2.8284, + "step": 60600 + }, + { + "epoch": 2.97, + "grad_norm": 0.7559013962745667, + "learning_rate": 1.489279128942922e-07, + "loss": 2.7782, + "step": 60601 + }, + { + "epoch": 2.97, + "grad_norm": 0.737792432308197, + "learning_rate": 1.4844324070204528e-07, + "loss": 2.8605, + "step": 60602 + }, + { + "epoch": 2.97, + "grad_norm": 0.726133406162262, + "learning_rate": 1.4795935826109606e-07, + "loss": 3.0003, + "step": 60603 + }, + { + "epoch": 2.97, + "grad_norm": 0.7365215420722961, + "learning_rate": 1.4747626557271018e-07, + "loss": 2.8262, + "step": 60604 + }, + { + "epoch": 2.97, + "grad_norm": 0.7618759274482727, + "learning_rate": 1.4699396263815332e-07, + "loss": 2.8041, + "step": 60605 + }, + { + "epoch": 2.97, + "grad_norm": 0.7196887731552124, + "learning_rate": 1.4651244945869113e-07, + "loss": 2.7637, + "step": 60606 + }, + { + "epoch": 2.97, + "grad_norm": 0.7778249382972717, + "learning_rate": 1.4603172603565583e-07, + "loss": 2.9692, + "step": 60607 + }, + { + "epoch": 2.97, + "grad_norm": 0.8094426989555359, + "learning_rate": 1.455517923702465e-07, + "loss": 2.815, + "step": 60608 + }, + { + "epoch": 2.97, + "grad_norm": 0.706639289855957, + "learning_rate": 1.450726484637288e-07, + "loss": 2.6698, + "step": 60609 + }, + { + "epoch": 2.97, + "grad_norm": 0.6968713998794556, + "learning_rate": 1.4459429431736835e-07, + "loss": 3.1378, + "step": 60610 + }, + { + "epoch": 2.97, + "grad_norm": 0.7427403926849365, + "learning_rate": 1.4411672993246415e-07, + "loss": 2.8479, + "step": 60611 + }, + { + "epoch": 2.97, + "grad_norm": 0.7301240563392639, + "learning_rate": 1.4363995531024853e-07, + "loss": 2.8367, + "step": 60612 + }, + { + "epoch": 2.97, + "grad_norm": 0.7708475589752197, + "learning_rate": 1.4316397045198712e-07, + "loss": 2.9659, + "step": 60613 + }, + { + "epoch": 2.97, + "grad_norm": 0.7885755300521851, + "learning_rate": 1.426887753589123e-07, + "loss": 2.8787, + "step": 60614 + }, + { + "epoch": 2.97, + "grad_norm": 0.73357093334198, + "learning_rate": 1.4221437003228974e-07, + "loss": 2.7029, + "step": 60615 + }, + { + "epoch": 2.97, + "grad_norm": 0.7780753374099731, + "learning_rate": 1.4174075447335177e-07, + "loss": 2.9124, + "step": 60616 + }, + { + "epoch": 2.97, + "grad_norm": 0.7349255084991455, + "learning_rate": 1.41267928683364e-07, + "loss": 2.872, + "step": 60617 + }, + { + "epoch": 2.97, + "grad_norm": 0.7401142120361328, + "learning_rate": 1.4079589266355885e-07, + "loss": 2.8839, + "step": 60618 + }, + { + "epoch": 2.97, + "grad_norm": 0.7998162508010864, + "learning_rate": 1.403246464152019e-07, + "loss": 2.874, + "step": 60619 + }, + { + "epoch": 2.97, + "grad_norm": 0.7657843232154846, + "learning_rate": 1.3985418993952558e-07, + "loss": 2.8038, + "step": 60620 + }, + { + "epoch": 2.97, + "grad_norm": 0.7320613861083984, + "learning_rate": 1.3938452323776217e-07, + "loss": 2.9106, + "step": 60621 + }, + { + "epoch": 2.97, + "grad_norm": 0.7517797350883484, + "learning_rate": 1.3891564631114404e-07, + "loss": 3.0539, + "step": 60622 + }, + { + "epoch": 2.97, + "grad_norm": 0.7234874963760376, + "learning_rate": 1.3844755916090355e-07, + "loss": 2.8527, + "step": 60623 + }, + { + "epoch": 2.97, + "grad_norm": 0.7384620904922485, + "learning_rate": 1.3798026178830634e-07, + "loss": 2.853, + "step": 60624 + }, + { + "epoch": 2.97, + "grad_norm": 0.7372459173202515, + "learning_rate": 1.3751375419455145e-07, + "loss": 2.9442, + "step": 60625 + }, + { + "epoch": 2.97, + "grad_norm": 0.7490679025650024, + "learning_rate": 1.3704803638087125e-07, + "loss": 2.808, + "step": 60626 + }, + { + "epoch": 2.97, + "grad_norm": 0.7829398512840271, + "learning_rate": 1.3658310834849807e-07, + "loss": 2.9018, + "step": 60627 + }, + { + "epoch": 2.97, + "grad_norm": 0.7897228002548218, + "learning_rate": 1.3611897009866425e-07, + "loss": 2.5938, + "step": 60628 + }, + { + "epoch": 2.97, + "grad_norm": 0.7101447582244873, + "learning_rate": 1.3565562163256882e-07, + "loss": 2.9465, + "step": 60629 + }, + { + "epoch": 2.97, + "grad_norm": 0.7695879340171814, + "learning_rate": 1.3519306295144416e-07, + "loss": 2.8979, + "step": 60630 + }, + { + "epoch": 2.97, + "grad_norm": 0.752558708190918, + "learning_rate": 1.347312940565226e-07, + "loss": 3.0605, + "step": 60631 + }, + { + "epoch": 2.97, + "grad_norm": 0.7636677026748657, + "learning_rate": 1.342703149490032e-07, + "loss": 2.7709, + "step": 60632 + }, + { + "epoch": 2.97, + "grad_norm": 0.7717976570129395, + "learning_rate": 1.3381012563008497e-07, + "loss": 2.8797, + "step": 60633 + }, + { + "epoch": 2.97, + "grad_norm": 0.7932553291320801, + "learning_rate": 1.3335072610103359e-07, + "loss": 2.944, + "step": 60634 + }, + { + "epoch": 2.97, + "grad_norm": 0.7431058883666992, + "learning_rate": 1.3289211636301478e-07, + "loss": 2.8666, + "step": 60635 + }, + { + "epoch": 2.97, + "grad_norm": 0.7574412226676941, + "learning_rate": 1.324342964172276e-07, + "loss": 3.0864, + "step": 60636 + }, + { + "epoch": 2.97, + "grad_norm": 0.7627542018890381, + "learning_rate": 1.3197726626493764e-07, + "loss": 2.9098, + "step": 60637 + }, + { + "epoch": 2.97, + "grad_norm": 0.7745662331581116, + "learning_rate": 1.315210259072774e-07, + "loss": 2.9873, + "step": 60638 + }, + { + "epoch": 2.97, + "grad_norm": 0.7328659296035767, + "learning_rate": 1.3106557534547923e-07, + "loss": 2.9549, + "step": 60639 + }, + { + "epoch": 2.97, + "grad_norm": 0.778195321559906, + "learning_rate": 1.3061091458077543e-07, + "loss": 2.9793, + "step": 60640 + }, + { + "epoch": 2.97, + "grad_norm": 0.7515281438827515, + "learning_rate": 1.3015704361429846e-07, + "loss": 2.9819, + "step": 60641 + }, + { + "epoch": 2.97, + "grad_norm": 0.7318151593208313, + "learning_rate": 1.2970396244728065e-07, + "loss": 2.7252, + "step": 60642 + }, + { + "epoch": 2.97, + "grad_norm": 0.7786916494369507, + "learning_rate": 1.2925167108092105e-07, + "loss": 2.9772, + "step": 60643 + }, + { + "epoch": 2.97, + "grad_norm": 0.7459295392036438, + "learning_rate": 1.288001695164187e-07, + "loss": 2.9008, + "step": 60644 + }, + { + "epoch": 2.97, + "grad_norm": 0.7117395401000977, + "learning_rate": 1.2834945775490601e-07, + "loss": 3.1014, + "step": 60645 + }, + { + "epoch": 2.97, + "grad_norm": 0.7417547702789307, + "learning_rate": 1.2789953579764866e-07, + "loss": 3.09, + "step": 60646 + }, + { + "epoch": 2.97, + "grad_norm": 0.8106539249420166, + "learning_rate": 1.2745040364574576e-07, + "loss": 3.0429, + "step": 60647 + }, + { + "epoch": 2.97, + "grad_norm": 0.7398137450218201, + "learning_rate": 1.2700206130046297e-07, + "loss": 3.0541, + "step": 60648 + }, + { + "epoch": 2.97, + "grad_norm": 0.7454041242599487, + "learning_rate": 1.265545087629327e-07, + "loss": 2.9233, + "step": 60649 + }, + { + "epoch": 2.97, + "grad_norm": 0.7723972201347351, + "learning_rate": 1.2610774603435403e-07, + "loss": 3.01, + "step": 60650 + }, + { + "epoch": 2.97, + "grad_norm": 0.7595714926719666, + "learning_rate": 1.2566177311589264e-07, + "loss": 3.0519, + "step": 60651 + }, + { + "epoch": 2.97, + "grad_norm": 0.7491887807846069, + "learning_rate": 1.2521659000871433e-07, + "loss": 2.7545, + "step": 60652 + }, + { + "epoch": 2.97, + "grad_norm": 0.7535812258720398, + "learning_rate": 1.2477219671401807e-07, + "loss": 2.9838, + "step": 60653 + }, + { + "epoch": 2.97, + "grad_norm": 0.7194299101829529, + "learning_rate": 1.2432859323296962e-07, + "loss": 2.8807, + "step": 60654 + }, + { + "epoch": 2.97, + "grad_norm": 0.7006655335426331, + "learning_rate": 1.2388577956670143e-07, + "loss": 2.7205, + "step": 60655 + }, + { + "epoch": 2.97, + "grad_norm": 0.7317060232162476, + "learning_rate": 1.2344375571641251e-07, + "loss": 2.8463, + "step": 60656 + }, + { + "epoch": 2.97, + "grad_norm": 0.761465311050415, + "learning_rate": 1.2300252168326862e-07, + "loss": 2.9015, + "step": 60657 + }, + { + "epoch": 2.97, + "grad_norm": 0.7406482696533203, + "learning_rate": 1.225620774684355e-07, + "loss": 2.8137, + "step": 60658 + }, + { + "epoch": 2.97, + "grad_norm": 0.7317981123924255, + "learning_rate": 1.2212242307304554e-07, + "loss": 2.7945, + "step": 60659 + }, + { + "epoch": 2.97, + "grad_norm": 0.7996775507926941, + "learning_rate": 1.216835584982645e-07, + "loss": 2.9023, + "step": 60660 + }, + { + "epoch": 2.97, + "grad_norm": 0.7539355158805847, + "learning_rate": 1.212454837452581e-07, + "loss": 3.0568, + "step": 60661 + }, + { + "epoch": 2.97, + "grad_norm": 0.7302529215812683, + "learning_rate": 1.2080819881519211e-07, + "loss": 2.985, + "step": 60662 + }, + { + "epoch": 2.97, + "grad_norm": 0.6857694387435913, + "learning_rate": 1.203717037091989e-07, + "loss": 2.8932, + "step": 60663 + }, + { + "epoch": 2.97, + "grad_norm": 0.7334001660346985, + "learning_rate": 1.1993599842844426e-07, + "loss": 2.8093, + "step": 60664 + }, + { + "epoch": 2.97, + "grad_norm": 0.7142249345779419, + "learning_rate": 1.1950108297406058e-07, + "loss": 2.8508, + "step": 60665 + }, + { + "epoch": 2.97, + "grad_norm": 0.7907318472862244, + "learning_rate": 1.1906695734718031e-07, + "loss": 2.9828, + "step": 60666 + }, + { + "epoch": 2.97, + "grad_norm": 0.8050836324691772, + "learning_rate": 1.1863362154896916e-07, + "loss": 2.9504, + "step": 60667 + }, + { + "epoch": 2.97, + "grad_norm": 0.7420530915260315, + "learning_rate": 1.1820107558055958e-07, + "loss": 2.9651, + "step": 60668 + }, + { + "epoch": 2.97, + "grad_norm": 0.7614012360572815, + "learning_rate": 1.177693194431173e-07, + "loss": 2.9209, + "step": 60669 + }, + { + "epoch": 2.97, + "grad_norm": 0.7007195949554443, + "learning_rate": 1.1733835313774143e-07, + "loss": 2.9823, + "step": 60670 + }, + { + "epoch": 2.97, + "grad_norm": 0.7247536778450012, + "learning_rate": 1.169081766655977e-07, + "loss": 2.8296, + "step": 60671 + }, + { + "epoch": 2.97, + "grad_norm": 0.7323319315910339, + "learning_rate": 1.1647879002778526e-07, + "loss": 2.6532, + "step": 60672 + }, + { + "epoch": 2.97, + "grad_norm": 0.7220818996429443, + "learning_rate": 1.1605019322546982e-07, + "loss": 2.8848, + "step": 60673 + }, + { + "epoch": 2.97, + "grad_norm": 0.7621148824691772, + "learning_rate": 1.1562238625975051e-07, + "loss": 2.9724, + "step": 60674 + }, + { + "epoch": 2.97, + "grad_norm": 0.7443565130233765, + "learning_rate": 1.1519536913175976e-07, + "loss": 3.0197, + "step": 60675 + }, + { + "epoch": 2.97, + "grad_norm": 0.7332092523574829, + "learning_rate": 1.1476914184262997e-07, + "loss": 2.8808, + "step": 60676 + }, + { + "epoch": 2.97, + "grad_norm": 0.7590422630310059, + "learning_rate": 1.1434370439352691e-07, + "loss": 2.9004, + "step": 60677 + }, + { + "epoch": 2.97, + "grad_norm": 0.7833712697029114, + "learning_rate": 1.1391905678548308e-07, + "loss": 2.8703, + "step": 60678 + }, + { + "epoch": 2.97, + "grad_norm": 0.7459626197814941, + "learning_rate": 1.134951990196975e-07, + "loss": 2.9135, + "step": 60679 + }, + { + "epoch": 2.97, + "grad_norm": 0.7762752175331116, + "learning_rate": 1.130721310972027e-07, + "loss": 2.8371, + "step": 60680 + }, + { + "epoch": 2.97, + "grad_norm": 0.7452636361122131, + "learning_rate": 1.1264985301919771e-07, + "loss": 2.8966, + "step": 60681 + }, + { + "epoch": 2.97, + "grad_norm": 0.766267716884613, + "learning_rate": 1.1222836478674834e-07, + "loss": 2.8336, + "step": 60682 + }, + { + "epoch": 2.97, + "grad_norm": 0.7369273900985718, + "learning_rate": 1.1180766640098704e-07, + "loss": 2.8865, + "step": 60683 + }, + { + "epoch": 2.97, + "grad_norm": 0.7170395255088806, + "learning_rate": 1.1138775786301291e-07, + "loss": 2.7637, + "step": 60684 + }, + { + "epoch": 2.97, + "grad_norm": 0.7494436502456665, + "learning_rate": 1.1096863917389175e-07, + "loss": 3.0208, + "step": 60685 + }, + { + "epoch": 2.97, + "grad_norm": 0.7833617329597473, + "learning_rate": 1.1055031033478934e-07, + "loss": 2.8597, + "step": 60686 + }, + { + "epoch": 2.97, + "grad_norm": 0.7761126160621643, + "learning_rate": 1.1013277134680476e-07, + "loss": 2.6651, + "step": 60687 + }, + { + "epoch": 2.97, + "grad_norm": 0.7350229024887085, + "learning_rate": 1.0971602221097054e-07, + "loss": 2.9107, + "step": 60688 + }, + { + "epoch": 2.97, + "grad_norm": 0.7333272695541382, + "learning_rate": 1.093000629284524e-07, + "loss": 2.8596, + "step": 60689 + }, + { + "epoch": 2.97, + "grad_norm": 0.8076415061950684, + "learning_rate": 1.0888489350034946e-07, + "loss": 2.7711, + "step": 60690 + }, + { + "epoch": 2.97, + "grad_norm": 0.7704684734344482, + "learning_rate": 1.0847051392769424e-07, + "loss": 2.7579, + "step": 60691 + }, + { + "epoch": 2.97, + "grad_norm": 0.7633429169654846, + "learning_rate": 1.0805692421161915e-07, + "loss": 2.75, + "step": 60692 + }, + { + "epoch": 2.97, + "grad_norm": 0.7828161716461182, + "learning_rate": 1.0764412435319004e-07, + "loss": 2.8519, + "step": 60693 + }, + { + "epoch": 2.97, + "grad_norm": 0.8346174955368042, + "learning_rate": 1.0723211435353929e-07, + "loss": 2.9347, + "step": 60694 + }, + { + "epoch": 2.97, + "grad_norm": 0.783769965171814, + "learning_rate": 1.0682089421369944e-07, + "loss": 2.8745, + "step": 60695 + }, + { + "epoch": 2.97, + "grad_norm": 0.7348896861076355, + "learning_rate": 1.0641046393480291e-07, + "loss": 2.9594, + "step": 60696 + }, + { + "epoch": 2.97, + "grad_norm": 0.7758384346961975, + "learning_rate": 1.060008235178822e-07, + "loss": 2.971, + "step": 60697 + }, + { + "epoch": 2.97, + "grad_norm": 0.7444601058959961, + "learning_rate": 1.0559197296406974e-07, + "loss": 2.8435, + "step": 60698 + }, + { + "epoch": 2.97, + "grad_norm": 0.7654844522476196, + "learning_rate": 1.0518391227439804e-07, + "loss": 2.7562, + "step": 60699 + }, + { + "epoch": 2.97, + "grad_norm": 0.7547435760498047, + "learning_rate": 1.0477664144993292e-07, + "loss": 2.9196, + "step": 60700 + }, + { + "epoch": 2.97, + "grad_norm": 0.7515742778778076, + "learning_rate": 1.043701604918068e-07, + "loss": 2.9464, + "step": 60701 + }, + { + "epoch": 2.97, + "grad_norm": 0.7705299854278564, + "learning_rate": 1.0396446940101888e-07, + "loss": 2.8524, + "step": 60702 + }, + { + "epoch": 2.97, + "grad_norm": 0.7585739493370056, + "learning_rate": 1.0355956817870159e-07, + "loss": 3.0153, + "step": 60703 + }, + { + "epoch": 2.97, + "grad_norm": 0.7425493001937866, + "learning_rate": 1.0315545682585413e-07, + "loss": 2.9251, + "step": 60704 + }, + { + "epoch": 2.98, + "grad_norm": 0.7635918855667114, + "learning_rate": 1.0275213534360893e-07, + "loss": 2.8483, + "step": 60705 + }, + { + "epoch": 2.98, + "grad_norm": 0.8116132616996765, + "learning_rate": 1.0234960373299849e-07, + "loss": 2.6353, + "step": 60706 + }, + { + "epoch": 2.98, + "grad_norm": 0.7795093059539795, + "learning_rate": 1.0194786199508865e-07, + "loss": 2.9259, + "step": 60707 + }, + { + "epoch": 2.98, + "grad_norm": 0.7156280279159546, + "learning_rate": 1.0154691013091187e-07, + "loss": 2.8244, + "step": 60708 + }, + { + "epoch": 2.98, + "grad_norm": 0.7367256879806519, + "learning_rate": 1.0114674814156731e-07, + "loss": 2.8971, + "step": 60709 + }, + { + "epoch": 2.98, + "grad_norm": 0.762520968914032, + "learning_rate": 1.0074737602805416e-07, + "loss": 2.7293, + "step": 60710 + }, + { + "epoch": 2.98, + "grad_norm": 0.7359768748283386, + "learning_rate": 1.0034879379147154e-07, + "loss": 2.8846, + "step": 60711 + }, + { + "epoch": 2.98, + "grad_norm": 0.7226070761680603, + "learning_rate": 9.995100143285195e-08, + "loss": 3.0858, + "step": 60712 + }, + { + "epoch": 2.98, + "grad_norm": 0.7261221408843994, + "learning_rate": 9.955399895326121e-08, + "loss": 2.9844, + "step": 60713 + }, + { + "epoch": 2.98, + "grad_norm": 0.7427242398262024, + "learning_rate": 9.915778635369854e-08, + "loss": 2.7948, + "step": 60714 + }, + { + "epoch": 2.98, + "grad_norm": 0.771078884601593, + "learning_rate": 9.876236363526303e-08, + "loss": 2.7511, + "step": 60715 + }, + { + "epoch": 2.98, + "grad_norm": 0.7642545104026794, + "learning_rate": 9.836773079895389e-08, + "loss": 2.8683, + "step": 60716 + }, + { + "epoch": 2.98, + "grad_norm": 0.7525956034660339, + "learning_rate": 9.797388784583692e-08, + "loss": 2.9183, + "step": 60717 + }, + { + "epoch": 2.98, + "grad_norm": 0.7317299246788025, + "learning_rate": 9.758083477694468e-08, + "loss": 2.7959, + "step": 60718 + }, + { + "epoch": 2.98, + "grad_norm": 0.7813636064529419, + "learning_rate": 9.71885715932763e-08, + "loss": 2.9581, + "step": 60719 + }, + { + "epoch": 2.98, + "grad_norm": 0.7140172719955444, + "learning_rate": 9.679709829593096e-08, + "loss": 2.9738, + "step": 60720 + }, + { + "epoch": 2.98, + "grad_norm": 0.7551311254501343, + "learning_rate": 9.640641488587453e-08, + "loss": 2.7313, + "step": 60721 + }, + { + "epoch": 2.98, + "grad_norm": 0.7514950037002563, + "learning_rate": 9.601652136420612e-08, + "loss": 2.9285, + "step": 60722 + }, + { + "epoch": 2.98, + "grad_norm": 0.7382096648216248, + "learning_rate": 9.562741773185833e-08, + "loss": 3.1155, + "step": 60723 + }, + { + "epoch": 2.98, + "grad_norm": 0.7395986318588257, + "learning_rate": 9.523910398996359e-08, + "loss": 2.9011, + "step": 60724 + }, + { + "epoch": 2.98, + "grad_norm": 0.742736279964447, + "learning_rate": 9.485158013945449e-08, + "loss": 2.9065, + "step": 60725 + }, + { + "epoch": 2.98, + "grad_norm": 0.7521002888679504, + "learning_rate": 9.446484618139682e-08, + "loss": 2.7803, + "step": 60726 + }, + { + "epoch": 2.98, + "grad_norm": 0.7703086733818054, + "learning_rate": 9.407890211682312e-08, + "loss": 2.8006, + "step": 60727 + }, + { + "epoch": 2.98, + "grad_norm": 0.7542996406555176, + "learning_rate": 9.369374794669926e-08, + "loss": 2.8211, + "step": 60728 + }, + { + "epoch": 2.98, + "grad_norm": 0.7727019190788269, + "learning_rate": 9.330938367209107e-08, + "loss": 2.8717, + "step": 60729 + }, + { + "epoch": 2.98, + "grad_norm": 0.7284082174301147, + "learning_rate": 9.292580929396443e-08, + "loss": 2.7599, + "step": 60730 + }, + { + "epoch": 2.98, + "grad_norm": 0.7392890453338623, + "learning_rate": 9.254302481338516e-08, + "loss": 2.9226, + "step": 60731 + }, + { + "epoch": 2.98, + "grad_norm": 0.7863142490386963, + "learning_rate": 9.216103023128584e-08, + "loss": 2.753, + "step": 60732 + }, + { + "epoch": 2.98, + "grad_norm": 0.7285134196281433, + "learning_rate": 9.177982554873231e-08, + "loss": 3.173, + "step": 60733 + }, + { + "epoch": 2.98, + "grad_norm": 0.7787915468215942, + "learning_rate": 9.139941076672374e-08, + "loss": 2.8017, + "step": 60734 + }, + { + "epoch": 2.98, + "grad_norm": 0.77415531873703, + "learning_rate": 9.101978588625935e-08, + "loss": 2.6794, + "step": 60735 + }, + { + "epoch": 2.98, + "grad_norm": 0.7192097902297974, + "learning_rate": 9.064095090830504e-08, + "loss": 2.8903, + "step": 60736 + }, + { + "epoch": 2.98, + "grad_norm": 0.755970299243927, + "learning_rate": 9.026290583389329e-08, + "loss": 3.218, + "step": 60737 + }, + { + "epoch": 2.98, + "grad_norm": 0.7275537848472595, + "learning_rate": 8.988565066402332e-08, + "loss": 2.9964, + "step": 60738 + }, + { + "epoch": 2.98, + "grad_norm": 0.8885321617126465, + "learning_rate": 8.950918539966102e-08, + "loss": 2.9317, + "step": 60739 + }, + { + "epoch": 2.98, + "grad_norm": 0.7572287321090698, + "learning_rate": 8.913351004180558e-08, + "loss": 3.1145, + "step": 60740 + }, + { + "epoch": 2.98, + "grad_norm": 0.7967362999916077, + "learning_rate": 8.875862459145622e-08, + "loss": 2.6688, + "step": 60741 + }, + { + "epoch": 2.98, + "grad_norm": 0.7556520700454712, + "learning_rate": 8.838452904961213e-08, + "loss": 2.9846, + "step": 60742 + }, + { + "epoch": 2.98, + "grad_norm": 0.6918498873710632, + "learning_rate": 8.801122341723921e-08, + "loss": 2.8424, + "step": 60743 + }, + { + "epoch": 2.98, + "grad_norm": 0.7346342206001282, + "learning_rate": 8.763870769533666e-08, + "loss": 2.6878, + "step": 60744 + }, + { + "epoch": 2.98, + "grad_norm": 0.7678686380386353, + "learning_rate": 8.726698188483705e-08, + "loss": 2.9052, + "step": 60745 + }, + { + "epoch": 2.98, + "grad_norm": 0.7246375679969788, + "learning_rate": 8.689604598680621e-08, + "loss": 2.956, + "step": 60746 + }, + { + "epoch": 2.98, + "grad_norm": 0.7658019065856934, + "learning_rate": 8.652590000214343e-08, + "loss": 2.8864, + "step": 60747 + }, + { + "epoch": 2.98, + "grad_norm": 0.7121071219444275, + "learning_rate": 8.61565439318479e-08, + "loss": 2.8179, + "step": 60748 + }, + { + "epoch": 2.98, + "grad_norm": 0.767242431640625, + "learning_rate": 8.578797777688551e-08, + "loss": 2.8497, + "step": 60749 + }, + { + "epoch": 2.98, + "grad_norm": 0.7547940611839294, + "learning_rate": 8.542020153828877e-08, + "loss": 2.9006, + "step": 60750 + }, + { + "epoch": 2.98, + "grad_norm": 0.7395439743995667, + "learning_rate": 8.505321521692366e-08, + "loss": 3.0071, + "step": 60751 + }, + { + "epoch": 2.98, + "grad_norm": 0.7863125801086426, + "learning_rate": 8.468701881382267e-08, + "loss": 3.1082, + "step": 60752 + }, + { + "epoch": 2.98, + "grad_norm": 0.7661166191101074, + "learning_rate": 8.432161232995172e-08, + "loss": 2.8509, + "step": 60753 + }, + { + "epoch": 2.98, + "grad_norm": 0.7914922833442688, + "learning_rate": 8.395699576624338e-08, + "loss": 2.7245, + "step": 60754 + }, + { + "epoch": 2.98, + "grad_norm": 0.7542526125907898, + "learning_rate": 8.359316912366355e-08, + "loss": 2.7699, + "step": 60755 + }, + { + "epoch": 2.98, + "grad_norm": 0.7157665491104126, + "learning_rate": 8.323013240321141e-08, + "loss": 2.9214, + "step": 60756 + }, + { + "epoch": 2.98, + "grad_norm": 0.7522284388542175, + "learning_rate": 8.286788560578628e-08, + "loss": 2.7608, + "step": 60757 + }, + { + "epoch": 2.98, + "grad_norm": 0.7462372779846191, + "learning_rate": 8.250642873235402e-08, + "loss": 2.8285, + "step": 60758 + }, + { + "epoch": 2.98, + "grad_norm": 0.7638521790504456, + "learning_rate": 8.214576178391386e-08, + "loss": 3.0214, + "step": 60759 + }, + { + "epoch": 2.98, + "grad_norm": 0.7532297968864441, + "learning_rate": 8.178588476136505e-08, + "loss": 2.8109, + "step": 60760 + }, + { + "epoch": 2.98, + "grad_norm": 0.7298749089241028, + "learning_rate": 8.14267976656735e-08, + "loss": 2.7234, + "step": 60761 + }, + { + "epoch": 2.98, + "grad_norm": 0.7774704694747925, + "learning_rate": 8.10685004977718e-08, + "loss": 2.7798, + "step": 60762 + }, + { + "epoch": 2.98, + "grad_norm": 0.756490170955658, + "learning_rate": 8.071099325862584e-08, + "loss": 2.8804, + "step": 60763 + }, + { + "epoch": 2.98, + "grad_norm": 0.7454630732536316, + "learning_rate": 8.035427594916822e-08, + "loss": 2.839, + "step": 60764 + }, + { + "epoch": 2.98, + "grad_norm": 0.7813576459884644, + "learning_rate": 7.99983485703315e-08, + "loss": 2.8881, + "step": 60765 + }, + { + "epoch": 2.98, + "grad_norm": 0.7779887318611145, + "learning_rate": 7.964321112304828e-08, + "loss": 2.9005, + "step": 60766 + }, + { + "epoch": 2.98, + "grad_norm": 0.7656338214874268, + "learning_rate": 7.928886360828446e-08, + "loss": 3.1341, + "step": 60767 + }, + { + "epoch": 2.98, + "grad_norm": 0.7824438810348511, + "learning_rate": 7.893530602693931e-08, + "loss": 2.8625, + "step": 60768 + }, + { + "epoch": 2.98, + "grad_norm": 0.779429018497467, + "learning_rate": 7.858253837997874e-08, + "loss": 2.8873, + "step": 60769 + }, + { + "epoch": 2.98, + "grad_norm": 0.7629287242889404, + "learning_rate": 7.823056066830202e-08, + "loss": 2.9589, + "step": 60770 + }, + { + "epoch": 2.98, + "grad_norm": 0.7588117718696594, + "learning_rate": 7.787937289284174e-08, + "loss": 2.9203, + "step": 60771 + }, + { + "epoch": 2.98, + "grad_norm": 0.7689938545227051, + "learning_rate": 7.752897505449718e-08, + "loss": 2.9673, + "step": 60772 + }, + { + "epoch": 2.98, + "grad_norm": 0.7638649344444275, + "learning_rate": 7.717936715426753e-08, + "loss": 2.893, + "step": 60773 + }, + { + "epoch": 2.98, + "grad_norm": 0.7803759574890137, + "learning_rate": 7.683054919298549e-08, + "loss": 2.8572, + "step": 60774 + }, + { + "epoch": 2.98, + "grad_norm": 0.7284832000732422, + "learning_rate": 7.648252117165021e-08, + "loss": 2.7997, + "step": 60775 + }, + { + "epoch": 2.98, + "grad_norm": 0.7143325209617615, + "learning_rate": 7.61352830910944e-08, + "loss": 2.8185, + "step": 60776 + }, + { + "epoch": 2.98, + "grad_norm": 0.8019963502883911, + "learning_rate": 7.578883495231725e-08, + "loss": 2.6413, + "step": 60777 + }, + { + "epoch": 2.98, + "grad_norm": 0.7633552551269531, + "learning_rate": 7.544317675615142e-08, + "loss": 2.8431, + "step": 60778 + }, + { + "epoch": 2.98, + "grad_norm": 0.7556493878364563, + "learning_rate": 7.50983085035961e-08, + "loss": 2.885, + "step": 60779 + }, + { + "epoch": 2.98, + "grad_norm": 0.7214013338088989, + "learning_rate": 7.475423019548399e-08, + "loss": 2.959, + "step": 60780 + }, + { + "epoch": 2.98, + "grad_norm": 0.7588320970535278, + "learning_rate": 7.441094183271435e-08, + "loss": 2.9404, + "step": 60781 + }, + { + "epoch": 2.98, + "grad_norm": 0.7851195335388184, + "learning_rate": 7.406844341628638e-08, + "loss": 2.9118, + "step": 60782 + }, + { + "epoch": 2.98, + "grad_norm": 0.7186158299446106, + "learning_rate": 7.372673494699943e-08, + "loss": 2.8767, + "step": 60783 + }, + { + "epoch": 2.98, + "grad_norm": 0.7348068952560425, + "learning_rate": 7.338581642578611e-08, + "loss": 2.8697, + "step": 60784 + }, + { + "epoch": 2.98, + "grad_norm": 0.7313429713249207, + "learning_rate": 7.304568785357901e-08, + "loss": 2.8737, + "step": 60785 + }, + { + "epoch": 2.98, + "grad_norm": 0.7902020812034607, + "learning_rate": 7.270634923124408e-08, + "loss": 2.9459, + "step": 60786 + }, + { + "epoch": 2.98, + "grad_norm": 0.7413740158081055, + "learning_rate": 7.236780055968061e-08, + "loss": 2.8579, + "step": 60787 + }, + { + "epoch": 2.98, + "grad_norm": 0.7744763493537903, + "learning_rate": 7.203004183975459e-08, + "loss": 2.8673, + "step": 60788 + }, + { + "epoch": 2.98, + "grad_norm": 0.7383904457092285, + "learning_rate": 7.169307307239857e-08, + "loss": 2.976, + "step": 60789 + }, + { + "epoch": 2.98, + "grad_norm": 0.7870457768440247, + "learning_rate": 7.135689425847857e-08, + "loss": 2.8683, + "step": 60790 + }, + { + "epoch": 2.98, + "grad_norm": 0.7270851135253906, + "learning_rate": 7.102150539889384e-08, + "loss": 2.9019, + "step": 60791 + }, + { + "epoch": 2.98, + "grad_norm": 0.7581509947776794, + "learning_rate": 7.068690649451036e-08, + "loss": 2.8886, + "step": 60792 + }, + { + "epoch": 2.98, + "grad_norm": 0.7382477521896362, + "learning_rate": 7.035309754619411e-08, + "loss": 2.9169, + "step": 60793 + }, + { + "epoch": 2.98, + "grad_norm": 0.7643795013427734, + "learning_rate": 7.002007855487768e-08, + "loss": 2.9914, + "step": 60794 + }, + { + "epoch": 2.98, + "grad_norm": 0.7508678436279297, + "learning_rate": 6.968784952136042e-08, + "loss": 2.7102, + "step": 60795 + }, + { + "epoch": 2.98, + "grad_norm": 0.7405849099159241, + "learning_rate": 6.935641044660823e-08, + "loss": 2.7978, + "step": 60796 + }, + { + "epoch": 2.98, + "grad_norm": 0.7330188155174255, + "learning_rate": 6.902576133142047e-08, + "loss": 2.8835, + "step": 60797 + }, + { + "epoch": 2.98, + "grad_norm": 0.7529171109199524, + "learning_rate": 6.86959021766964e-08, + "loss": 2.7461, + "step": 60798 + }, + { + "epoch": 2.98, + "grad_norm": 0.7461307048797607, + "learning_rate": 6.836683298333534e-08, + "loss": 3.0741, + "step": 60799 + }, + { + "epoch": 2.98, + "grad_norm": 0.755508542060852, + "learning_rate": 6.803855375213663e-08, + "loss": 2.6922, + "step": 60800 + }, + { + "epoch": 2.98, + "grad_norm": 0.8063923120498657, + "learning_rate": 6.771106448399955e-08, + "loss": 2.7744, + "step": 60801 + }, + { + "epoch": 2.98, + "grad_norm": 0.7900208234786987, + "learning_rate": 6.738436517982338e-08, + "loss": 2.8385, + "step": 60802 + }, + { + "epoch": 2.98, + "grad_norm": 0.7695721387863159, + "learning_rate": 6.705845584040748e-08, + "loss": 2.9264, + "step": 60803 + }, + { + "epoch": 2.98, + "grad_norm": 0.7548896670341492, + "learning_rate": 6.673333646665113e-08, + "loss": 2.9377, + "step": 60804 + }, + { + "epoch": 2.98, + "grad_norm": 0.7459438443183899, + "learning_rate": 6.640900705938701e-08, + "loss": 2.9739, + "step": 60805 + }, + { + "epoch": 2.98, + "grad_norm": 0.7769594788551331, + "learning_rate": 6.608546761948108e-08, + "loss": 2.9764, + "step": 60806 + }, + { + "epoch": 2.98, + "grad_norm": 0.7430137395858765, + "learning_rate": 6.576271814776601e-08, + "loss": 2.8683, + "step": 60807 + }, + { + "epoch": 2.98, + "grad_norm": 0.7870651483535767, + "learning_rate": 6.544075864514109e-08, + "loss": 2.7308, + "step": 60808 + }, + { + "epoch": 2.98, + "grad_norm": 0.7871001958847046, + "learning_rate": 6.511958911240567e-08, + "loss": 2.9413, + "step": 60809 + }, + { + "epoch": 2.98, + "grad_norm": 0.7236528992652893, + "learning_rate": 6.479920955042573e-08, + "loss": 2.7938, + "step": 60810 + }, + { + "epoch": 2.98, + "grad_norm": 0.8005480170249939, + "learning_rate": 6.447961996003392e-08, + "loss": 2.7432, + "step": 60811 + }, + { + "epoch": 2.98, + "grad_norm": 0.7919958233833313, + "learning_rate": 6.416082034206294e-08, + "loss": 2.8455, + "step": 60812 + }, + { + "epoch": 2.98, + "grad_norm": 0.8858550190925598, + "learning_rate": 6.384281069737873e-08, + "loss": 2.9674, + "step": 60813 + }, + { + "epoch": 2.98, + "grad_norm": 0.7702856659889221, + "learning_rate": 6.352559102681399e-08, + "loss": 2.6873, + "step": 60814 + }, + { + "epoch": 2.98, + "grad_norm": 0.7305554747581482, + "learning_rate": 6.320916133120135e-08, + "loss": 2.8038, + "step": 60815 + }, + { + "epoch": 2.98, + "grad_norm": 0.818423330783844, + "learning_rate": 6.289352161137351e-08, + "loss": 2.7824, + "step": 60816 + }, + { + "epoch": 2.98, + "grad_norm": 0.7536776661872864, + "learning_rate": 6.257867186816312e-08, + "loss": 2.9083, + "step": 60817 + }, + { + "epoch": 2.98, + "grad_norm": 0.7545386552810669, + "learning_rate": 6.226461210240286e-08, + "loss": 3.1492, + "step": 60818 + }, + { + "epoch": 2.98, + "grad_norm": 0.7835391163825989, + "learning_rate": 6.195134231489207e-08, + "loss": 3.1372, + "step": 60819 + }, + { + "epoch": 2.98, + "grad_norm": 0.7253636717796326, + "learning_rate": 6.163886250649675e-08, + "loss": 3.0669, + "step": 60820 + }, + { + "epoch": 2.98, + "grad_norm": 0.7546964287757874, + "learning_rate": 6.132717267801624e-08, + "loss": 2.9428, + "step": 60821 + }, + { + "epoch": 2.98, + "grad_norm": 0.7309494614601135, + "learning_rate": 6.10162728302499e-08, + "loss": 2.8188, + "step": 60822 + }, + { + "epoch": 2.98, + "grad_norm": 0.7510556578636169, + "learning_rate": 6.070616296406372e-08, + "loss": 2.7979, + "step": 60823 + }, + { + "epoch": 2.98, + "grad_norm": 0.7674130201339722, + "learning_rate": 6.039684308025705e-08, + "loss": 2.7607, + "step": 60824 + }, + { + "epoch": 2.98, + "grad_norm": 0.7727053761482239, + "learning_rate": 6.008831317962926e-08, + "loss": 2.949, + "step": 60825 + }, + { + "epoch": 2.98, + "grad_norm": 0.7896957993507385, + "learning_rate": 5.9780573263013e-08, + "loss": 2.7591, + "step": 60826 + }, + { + "epoch": 2.98, + "grad_norm": 0.7202666997909546, + "learning_rate": 5.947362333120764e-08, + "loss": 2.9216, + "step": 60827 + }, + { + "epoch": 2.98, + "grad_norm": 0.7396377921104431, + "learning_rate": 5.916746338501255e-08, + "loss": 2.9348, + "step": 60828 + }, + { + "epoch": 2.98, + "grad_norm": 0.8150199055671692, + "learning_rate": 5.886209342526038e-08, + "loss": 2.7997, + "step": 60829 + }, + { + "epoch": 2.98, + "grad_norm": 0.7385109066963196, + "learning_rate": 5.8557513452750506e-08, + "loss": 2.8084, + "step": 60830 + }, + { + "epoch": 2.98, + "grad_norm": 0.739365816116333, + "learning_rate": 5.8253723468248966e-08, + "loss": 2.8535, + "step": 60831 + }, + { + "epoch": 2.98, + "grad_norm": 0.7645565271377563, + "learning_rate": 5.795072347258844e-08, + "loss": 2.9378, + "step": 60832 + }, + { + "epoch": 2.98, + "grad_norm": 0.7741032838821411, + "learning_rate": 5.764851346656829e-08, + "loss": 2.9119, + "step": 60833 + }, + { + "epoch": 2.98, + "grad_norm": 0.7111135125160217, + "learning_rate": 5.734709345098787e-08, + "loss": 2.8008, + "step": 60834 + }, + { + "epoch": 2.98, + "grad_norm": 0.7414789199829102, + "learning_rate": 5.704646342661323e-08, + "loss": 2.8277, + "step": 60835 + }, + { + "epoch": 2.98, + "grad_norm": 0.7525594830513, + "learning_rate": 5.674662339427705e-08, + "loss": 2.9346, + "step": 60836 + }, + { + "epoch": 2.98, + "grad_norm": 0.704296886920929, + "learning_rate": 5.644757335471206e-08, + "loss": 2.8076, + "step": 60837 + }, + { + "epoch": 2.98, + "grad_norm": 0.793901264667511, + "learning_rate": 5.614931330875094e-08, + "loss": 2.8382, + "step": 60838 + }, + { + "epoch": 2.98, + "grad_norm": 0.7604386210441589, + "learning_rate": 5.585184325715975e-08, + "loss": 2.8852, + "step": 60839 + }, + { + "epoch": 2.98, + "grad_norm": 0.7613590955734253, + "learning_rate": 5.555516320073783e-08, + "loss": 2.7332, + "step": 60840 + }, + { + "epoch": 2.98, + "grad_norm": 0.7492443323135376, + "learning_rate": 5.5259273140284554e-08, + "loss": 2.9632, + "step": 60841 + }, + { + "epoch": 2.98, + "grad_norm": 0.7389005422592163, + "learning_rate": 5.4964173076499364e-08, + "loss": 2.99, + "step": 60842 + }, + { + "epoch": 2.98, + "grad_norm": 0.7318647503852844, + "learning_rate": 5.466986301024823e-08, + "loss": 2.8089, + "step": 60843 + }, + { + "epoch": 2.98, + "grad_norm": 0.7469201683998108, + "learning_rate": 5.4376342942263895e-08, + "loss": 2.7664, + "step": 60844 + }, + { + "epoch": 2.98, + "grad_norm": 0.7827994227409363, + "learning_rate": 5.4083612873345726e-08, + "loss": 2.7729, + "step": 60845 + }, + { + "epoch": 2.98, + "grad_norm": 0.7702500224113464, + "learning_rate": 5.3791672804226474e-08, + "loss": 2.8159, + "step": 60846 + }, + { + "epoch": 2.98, + "grad_norm": 0.7574726343154907, + "learning_rate": 5.350052273567218e-08, + "loss": 2.8351, + "step": 60847 + }, + { + "epoch": 2.98, + "grad_norm": 0.7728380560874939, + "learning_rate": 5.321016266851552e-08, + "loss": 3.0487, + "step": 60848 + }, + { + "epoch": 2.98, + "grad_norm": 0.7415326833724976, + "learning_rate": 5.2920592603455934e-08, + "loss": 2.5731, + "step": 60849 + }, + { + "epoch": 2.98, + "grad_norm": 0.7318070530891418, + "learning_rate": 5.263181254125948e-08, + "loss": 2.7188, + "step": 60850 + }, + { + "epoch": 2.98, + "grad_norm": 0.8076432943344116, + "learning_rate": 5.2343822482725504e-08, + "loss": 2.7616, + "step": 60851 + }, + { + "epoch": 2.98, + "grad_norm": 0.7778842449188232, + "learning_rate": 5.205662242858677e-08, + "loss": 2.9999, + "step": 60852 + }, + { + "epoch": 2.98, + "grad_norm": 0.7371053695678711, + "learning_rate": 5.1770212379609324e-08, + "loss": 2.8389, + "step": 60853 + }, + { + "epoch": 2.98, + "grad_norm": 0.732855498790741, + "learning_rate": 5.148459233652591e-08, + "loss": 2.8063, + "step": 60854 + }, + { + "epoch": 2.98, + "grad_norm": 0.7521831393241882, + "learning_rate": 5.119976230013589e-08, + "loss": 2.8495, + "step": 60855 + }, + { + "epoch": 2.98, + "grad_norm": 0.7384352087974548, + "learning_rate": 5.0915722271138715e-08, + "loss": 2.8919, + "step": 60856 + }, + { + "epoch": 2.98, + "grad_norm": 0.762461245059967, + "learning_rate": 5.0632472250300424e-08, + "loss": 2.8156, + "step": 60857 + }, + { + "epoch": 2.98, + "grad_norm": 0.774097740650177, + "learning_rate": 5.035001223838708e-08, + "loss": 2.8582, + "step": 60858 + }, + { + "epoch": 2.98, + "grad_norm": 0.758370578289032, + "learning_rate": 5.0068342236098124e-08, + "loss": 2.7938, + "step": 60859 + }, + { + "epoch": 2.98, + "grad_norm": 0.7590804100036621, + "learning_rate": 4.9787462244232914e-08, + "loss": 2.8904, + "step": 60860 + }, + { + "epoch": 2.98, + "grad_norm": 0.7966086268424988, + "learning_rate": 4.950737226349089e-08, + "loss": 3.0966, + "step": 60861 + }, + { + "epoch": 2.98, + "grad_norm": 0.7693399786949158, + "learning_rate": 4.9228072294604795e-08, + "loss": 2.9406, + "step": 60862 + }, + { + "epoch": 2.98, + "grad_norm": 0.7839014530181885, + "learning_rate": 4.8949562338340684e-08, + "loss": 2.771, + "step": 60863 + }, + { + "epoch": 2.98, + "grad_norm": 0.7283673882484436, + "learning_rate": 4.867184239543131e-08, + "loss": 2.9914, + "step": 60864 + }, + { + "epoch": 2.98, + "grad_norm": 0.7859243750572205, + "learning_rate": 4.839491246657612e-08, + "loss": 3.0732, + "step": 60865 + }, + { + "epoch": 2.98, + "grad_norm": 0.7411079406738281, + "learning_rate": 4.811877255250785e-08, + "loss": 3.0077, + "step": 60866 + }, + { + "epoch": 2.98, + "grad_norm": 0.7194960117340088, + "learning_rate": 4.7843422653992546e-08, + "loss": 2.9794, + "step": 60867 + }, + { + "epoch": 2.98, + "grad_norm": 0.7908714413642883, + "learning_rate": 4.756886277169636e-08, + "loss": 2.8156, + "step": 60868 + }, + { + "epoch": 2.98, + "grad_norm": 0.7900540828704834, + "learning_rate": 4.729509290641864e-08, + "loss": 2.8043, + "step": 60869 + }, + { + "epoch": 2.98, + "grad_norm": 0.8169203400611877, + "learning_rate": 4.702211305882553e-08, + "loss": 2.8463, + "step": 60870 + }, + { + "epoch": 2.98, + "grad_norm": 0.7235054969787598, + "learning_rate": 4.674992322961646e-08, + "loss": 2.7804, + "step": 60871 + }, + { + "epoch": 2.98, + "grad_norm": 0.7315838932991028, + "learning_rate": 4.64785234195908e-08, + "loss": 2.9179, + "step": 60872 + }, + { + "epoch": 2.98, + "grad_norm": 0.7634678483009338, + "learning_rate": 4.620791362938137e-08, + "loss": 2.7995, + "step": 60873 + }, + { + "epoch": 2.98, + "grad_norm": 0.7515854835510254, + "learning_rate": 4.593809385975422e-08, + "loss": 2.7075, + "step": 60874 + }, + { + "epoch": 2.98, + "grad_norm": 0.746206521987915, + "learning_rate": 4.56690641113755e-08, + "loss": 2.7542, + "step": 60875 + }, + { + "epoch": 2.98, + "grad_norm": 0.7026474475860596, + "learning_rate": 4.540082438497794e-08, + "loss": 2.9067, + "step": 60876 + }, + { + "epoch": 2.98, + "grad_norm": 0.7854546308517456, + "learning_rate": 4.51333746812943e-08, + "loss": 2.8615, + "step": 60877 + }, + { + "epoch": 2.98, + "grad_norm": 0.8138797879219055, + "learning_rate": 4.4866715000990703e-08, + "loss": 2.6686, + "step": 60878 + }, + { + "epoch": 2.98, + "grad_norm": 0.7269437313079834, + "learning_rate": 4.46008453447666e-08, + "loss": 2.9791, + "step": 60879 + }, + { + "epoch": 2.98, + "grad_norm": 0.7314205765724182, + "learning_rate": 4.4335765713321427e-08, + "loss": 3.0126, + "step": 60880 + }, + { + "epoch": 2.98, + "grad_norm": 0.7562000155448914, + "learning_rate": 4.407147610742123e-08, + "loss": 2.915, + "step": 60881 + }, + { + "epoch": 2.98, + "grad_norm": 0.7194030284881592, + "learning_rate": 4.3807976527665544e-08, + "loss": 2.7785, + "step": 60882 + }, + { + "epoch": 2.98, + "grad_norm": 0.7603053450584412, + "learning_rate": 4.3545266974820417e-08, + "loss": 2.7567, + "step": 60883 + }, + { + "epoch": 2.98, + "grad_norm": 0.7493316531181335, + "learning_rate": 4.328334744951867e-08, + "loss": 3.0762, + "step": 60884 + }, + { + "epoch": 2.98, + "grad_norm": 0.7748812437057495, + "learning_rate": 4.3022217952493055e-08, + "loss": 2.9854, + "step": 60885 + }, + { + "epoch": 2.98, + "grad_norm": 0.7610214352607727, + "learning_rate": 4.2761878484443014e-08, + "loss": 2.941, + "step": 60886 + }, + { + "epoch": 2.98, + "grad_norm": 0.7458277940750122, + "learning_rate": 4.250232904600137e-08, + "loss": 2.921, + "step": 60887 + }, + { + "epoch": 2.98, + "grad_norm": 0.7483231425285339, + "learning_rate": 4.224356963786757e-08, + "loss": 2.8299, + "step": 60888 + }, + { + "epoch": 2.98, + "grad_norm": 0.738000214099884, + "learning_rate": 4.1985600260774354e-08, + "loss": 3.1012, + "step": 60889 + }, + { + "epoch": 2.98, + "grad_norm": 0.7348437309265137, + "learning_rate": 4.1728420915354555e-08, + "loss": 2.8678, + "step": 60890 + }, + { + "epoch": 2.98, + "grad_norm": 0.7560616135597229, + "learning_rate": 4.14720316022743e-08, + "loss": 3.0925, + "step": 60891 + }, + { + "epoch": 2.98, + "grad_norm": 0.7360788583755493, + "learning_rate": 4.121643232223304e-08, + "loss": 2.85, + "step": 60892 + }, + { + "epoch": 2.98, + "grad_norm": 0.7723987698554993, + "learning_rate": 4.0961623075930206e-08, + "loss": 2.9986, + "step": 60893 + }, + { + "epoch": 2.98, + "grad_norm": 0.7306909561157227, + "learning_rate": 4.070760386396532e-08, + "loss": 2.8415, + "step": 60894 + }, + { + "epoch": 2.98, + "grad_norm": 0.7794232368469238, + "learning_rate": 4.045437468707113e-08, + "loss": 2.8476, + "step": 60895 + }, + { + "epoch": 2.98, + "grad_norm": 0.7389131784439087, + "learning_rate": 4.020193554588047e-08, + "loss": 2.9256, + "step": 60896 + }, + { + "epoch": 2.98, + "grad_norm": 0.7388023734092712, + "learning_rate": 3.9950286441092773e-08, + "loss": 3.1202, + "step": 60897 + }, + { + "epoch": 2.98, + "grad_norm": 0.7390792369842529, + "learning_rate": 3.9699427373307554e-08, + "loss": 2.8996, + "step": 60898 + }, + { + "epoch": 2.98, + "grad_norm": 0.7606891989707947, + "learning_rate": 3.9449358343257574e-08, + "loss": 3.14, + "step": 60899 + }, + { + "epoch": 2.98, + "grad_norm": 0.7230342626571655, + "learning_rate": 3.920007935157565e-08, + "loss": 2.8227, + "step": 60900 + }, + { + "epoch": 2.98, + "grad_norm": 0.7700236439704895, + "learning_rate": 3.8951590398894614e-08, + "loss": 2.8222, + "step": 60901 + }, + { + "epoch": 2.98, + "grad_norm": 0.7606983780860901, + "learning_rate": 3.87038914858806e-08, + "loss": 2.849, + "step": 60902 + }, + { + "epoch": 2.98, + "grad_norm": 0.7816833853721619, + "learning_rate": 3.845698261319974e-08, + "loss": 2.6993, + "step": 60903 + }, + { + "epoch": 2.98, + "grad_norm": 0.768175482749939, + "learning_rate": 3.8210863781484856e-08, + "loss": 2.8003, + "step": 60904 + }, + { + "epoch": 2.98, + "grad_norm": 0.7766599655151367, + "learning_rate": 3.796553499140209e-08, + "loss": 3.0448, + "step": 60905 + }, + { + "epoch": 2.98, + "grad_norm": 0.7720254063606262, + "learning_rate": 3.7720996243584265e-08, + "loss": 2.8401, + "step": 60906 + }, + { + "epoch": 2.98, + "grad_norm": 0.7150791883468628, + "learning_rate": 3.747724753869752e-08, + "loss": 2.7285, + "step": 60907 + }, + { + "epoch": 2.98, + "grad_norm": 0.7499209046363831, + "learning_rate": 3.723428887734137e-08, + "loss": 2.8719, + "step": 60908 + }, + { + "epoch": 2.99, + "grad_norm": 0.7711845636367798, + "learning_rate": 3.6992120260215254e-08, + "loss": 2.8742, + "step": 60909 + }, + { + "epoch": 2.99, + "grad_norm": 0.7651515007019043, + "learning_rate": 3.6750741687885385e-08, + "loss": 2.8158, + "step": 60910 + }, + { + "epoch": 2.99, + "grad_norm": 0.7973775267601013, + "learning_rate": 3.651015316105121e-08, + "loss": 3.1713, + "step": 60911 + }, + { + "epoch": 2.99, + "grad_norm": 0.8048893213272095, + "learning_rate": 3.627035468031225e-08, + "loss": 3.031, + "step": 60912 + }, + { + "epoch": 2.99, + "grad_norm": 0.7640774250030518, + "learning_rate": 3.603134624630133e-08, + "loss": 2.8489, + "step": 60913 + }, + { + "epoch": 2.99, + "grad_norm": 0.7189072370529175, + "learning_rate": 3.579312785965127e-08, + "loss": 2.9895, + "step": 60914 + }, + { + "epoch": 2.99, + "grad_norm": 0.7827427387237549, + "learning_rate": 3.5555699521028214e-08, + "loss": 3.075, + "step": 60915 + }, + { + "epoch": 2.99, + "grad_norm": 0.7645692229270935, + "learning_rate": 3.531906123099837e-08, + "loss": 2.8066, + "step": 60916 + }, + { + "epoch": 2.99, + "grad_norm": 0.7669183611869812, + "learning_rate": 3.5083212990194564e-08, + "loss": 2.787, + "step": 60917 + }, + { + "epoch": 2.99, + "grad_norm": 0.7498992085456848, + "learning_rate": 3.4848154799249627e-08, + "loss": 2.9104, + "step": 60918 + }, + { + "epoch": 2.99, + "grad_norm": 0.761599600315094, + "learning_rate": 3.4613886658796383e-08, + "loss": 3.0023, + "step": 60919 + }, + { + "epoch": 2.99, + "grad_norm": 0.8176621794700623, + "learning_rate": 3.438040856946767e-08, + "loss": 2.9239, + "step": 60920 + }, + { + "epoch": 2.99, + "grad_norm": 0.7003955245018005, + "learning_rate": 3.414772053182968e-08, + "loss": 2.9555, + "step": 60921 + }, + { + "epoch": 2.99, + "grad_norm": 0.7190863490104675, + "learning_rate": 3.391582254651526e-08, + "loss": 2.682, + "step": 60922 + }, + { + "epoch": 2.99, + "grad_norm": 0.7530226707458496, + "learning_rate": 3.368471461412392e-08, + "loss": 2.7699, + "step": 60923 + }, + { + "epoch": 2.99, + "grad_norm": 0.7347545027732849, + "learning_rate": 3.345439673528849e-08, + "loss": 2.9471, + "step": 60924 + }, + { + "epoch": 2.99, + "grad_norm": 0.7659326791763306, + "learning_rate": 3.322486891060849e-08, + "loss": 2.7471, + "step": 60925 + }, + { + "epoch": 2.99, + "grad_norm": 0.7832285165786743, + "learning_rate": 3.299613114068345e-08, + "loss": 2.9932, + "step": 60926 + }, + { + "epoch": 2.99, + "grad_norm": 0.7939525842666626, + "learning_rate": 3.276818342611287e-08, + "loss": 2.9293, + "step": 60927 + }, + { + "epoch": 2.99, + "grad_norm": 0.7634779810905457, + "learning_rate": 3.254102576749629e-08, + "loss": 2.8537, + "step": 60928 + }, + { + "epoch": 2.99, + "grad_norm": 0.7426297664642334, + "learning_rate": 3.231465816546652e-08, + "loss": 2.8445, + "step": 60929 + }, + { + "epoch": 2.99, + "grad_norm": 0.7596210837364197, + "learning_rate": 3.2089080620556483e-08, + "loss": 2.7078, + "step": 60930 + }, + { + "epoch": 2.99, + "grad_norm": 0.743432879447937, + "learning_rate": 3.1864293133399e-08, + "loss": 2.8809, + "step": 60931 + }, + { + "epoch": 2.99, + "grad_norm": 0.7715290784835815, + "learning_rate": 3.1640295704593586e-08, + "loss": 3.0125, + "step": 60932 + }, + { + "epoch": 2.99, + "grad_norm": 0.7802637815475464, + "learning_rate": 3.141708833470646e-08, + "loss": 2.9165, + "step": 60933 + }, + { + "epoch": 2.99, + "grad_norm": 0.8331025242805481, + "learning_rate": 3.119467102433715e-08, + "loss": 2.8806, + "step": 60934 + }, + { + "epoch": 2.99, + "grad_norm": 0.7259206771850586, + "learning_rate": 3.097304377408516e-08, + "loss": 2.8832, + "step": 60935 + }, + { + "epoch": 2.99, + "grad_norm": 0.7143881916999817, + "learning_rate": 3.0752206584516714e-08, + "loss": 2.9913, + "step": 60936 + }, + { + "epoch": 2.99, + "grad_norm": 0.7747646570205688, + "learning_rate": 3.053215945619802e-08, + "loss": 2.8695, + "step": 60937 + }, + { + "epoch": 2.99, + "grad_norm": 0.7718039155006409, + "learning_rate": 3.031290238972861e-08, + "loss": 3.0108, + "step": 60938 + }, + { + "epoch": 2.99, + "grad_norm": 0.7405833601951599, + "learning_rate": 3.0094435385707994e-08, + "loss": 2.7529, + "step": 60939 + }, + { + "epoch": 2.99, + "grad_norm": 0.7925773859024048, + "learning_rate": 2.987675844466908e-08, + "loss": 2.6726, + "step": 60940 + }, + { + "epoch": 2.99, + "grad_norm": 0.7442820072174072, + "learning_rate": 2.9659871567211392e-08, + "loss": 2.8788, + "step": 60941 + }, + { + "epoch": 2.99, + "grad_norm": 0.7715669870376587, + "learning_rate": 2.9443774753901138e-08, + "loss": 2.6613, + "step": 60942 + }, + { + "epoch": 2.99, + "grad_norm": 0.7934419512748718, + "learning_rate": 2.922846800530454e-08, + "loss": 2.6129, + "step": 60943 + }, + { + "epoch": 2.99, + "grad_norm": 0.7691748738288879, + "learning_rate": 2.9013951321987804e-08, + "loss": 2.9847, + "step": 60944 + }, + { + "epoch": 2.99, + "grad_norm": 0.8082806468009949, + "learning_rate": 2.880022470451715e-08, + "loss": 2.9681, + "step": 60945 + }, + { + "epoch": 2.99, + "grad_norm": 0.7494584918022156, + "learning_rate": 2.8587288153458788e-08, + "loss": 2.8827, + "step": 60946 + }, + { + "epoch": 2.99, + "grad_norm": 0.7465475797653198, + "learning_rate": 2.8375141669345624e-08, + "loss": 2.9779, + "step": 60947 + }, + { + "epoch": 2.99, + "grad_norm": 0.7119285464286804, + "learning_rate": 2.816378525281049e-08, + "loss": 2.8423, + "step": 60948 + }, + { + "epoch": 2.99, + "grad_norm": 0.7398651838302612, + "learning_rate": 2.7953218904319674e-08, + "loss": 2.6514, + "step": 60949 + }, + { + "epoch": 2.99, + "grad_norm": 0.7157944440841675, + "learning_rate": 2.7743442624506008e-08, + "loss": 2.8982, + "step": 60950 + }, + { + "epoch": 2.99, + "grad_norm": 0.7688831686973572, + "learning_rate": 2.753445641383578e-08, + "loss": 3.0429, + "step": 60951 + }, + { + "epoch": 2.99, + "grad_norm": 0.7500354647636414, + "learning_rate": 2.7326260272941825e-08, + "loss": 3.1836, + "step": 60952 + }, + { + "epoch": 2.99, + "grad_norm": 0.7020663619041443, + "learning_rate": 2.7118854202357044e-08, + "loss": 2.8623, + "step": 60953 + }, + { + "epoch": 2.99, + "grad_norm": 0.7152649164199829, + "learning_rate": 2.6912238202581038e-08, + "loss": 2.8368, + "step": 60954 + }, + { + "epoch": 2.99, + "grad_norm": 0.7284510731697083, + "learning_rate": 2.6706412274180022e-08, + "loss": 3.0018, + "step": 60955 + }, + { + "epoch": 2.99, + "grad_norm": 0.7411639094352722, + "learning_rate": 2.650137641772021e-08, + "loss": 2.9187, + "step": 60956 + }, + { + "epoch": 2.99, + "grad_norm": 0.77473384141922, + "learning_rate": 2.629713063373451e-08, + "loss": 3.0106, + "step": 60957 + }, + { + "epoch": 2.99, + "grad_norm": 0.729745626449585, + "learning_rate": 2.6093674922722518e-08, + "loss": 2.9179, + "step": 60958 + }, + { + "epoch": 2.99, + "grad_norm": 0.7914096117019653, + "learning_rate": 2.589100928528376e-08, + "loss": 2.9409, + "step": 60959 + }, + { + "epoch": 2.99, + "grad_norm": 0.7265556454658508, + "learning_rate": 2.5689133721884526e-08, + "loss": 2.8163, + "step": 60960 + }, + { + "epoch": 2.99, + "grad_norm": 0.7446621656417847, + "learning_rate": 2.5488048233124337e-08, + "loss": 2.7302, + "step": 60961 + }, + { + "epoch": 2.99, + "grad_norm": 0.7492318153381348, + "learning_rate": 2.5287752819469488e-08, + "loss": 2.8235, + "step": 60962 + }, + { + "epoch": 2.99, + "grad_norm": 0.8737034797668457, + "learning_rate": 2.508824748148619e-08, + "loss": 2.7881, + "step": 60963 + }, + { + "epoch": 2.99, + "grad_norm": 0.7302938103675842, + "learning_rate": 2.4889532219674047e-08, + "loss": 2.8977, + "step": 60964 + }, + { + "epoch": 2.99, + "grad_norm": 0.7328683137893677, + "learning_rate": 2.4691607034565963e-08, + "loss": 3.0039, + "step": 60965 + }, + { + "epoch": 2.99, + "grad_norm": 0.8015636801719666, + "learning_rate": 2.4494471926728155e-08, + "loss": 2.8645, + "step": 60966 + }, + { + "epoch": 2.99, + "grad_norm": 0.7779310941696167, + "learning_rate": 2.4298126896593607e-08, + "loss": 2.8827, + "step": 60967 + }, + { + "epoch": 2.99, + "grad_norm": 0.7854167222976685, + "learning_rate": 2.4102571944761838e-08, + "loss": 3.0293, + "step": 60968 + }, + { + "epoch": 2.99, + "grad_norm": 0.7807804346084595, + "learning_rate": 2.3907807071699148e-08, + "loss": 2.6994, + "step": 60969 + }, + { + "epoch": 2.99, + "grad_norm": 0.7262849807739258, + "learning_rate": 2.371383227793844e-08, + "loss": 3.0185, + "step": 60970 + }, + { + "epoch": 2.99, + "grad_norm": 0.7822308540344238, + "learning_rate": 2.352064756397931e-08, + "loss": 2.7, + "step": 60971 + }, + { + "epoch": 2.99, + "grad_norm": 0.7708417773246765, + "learning_rate": 2.3328252930321366e-08, + "loss": 3.0354, + "step": 60972 + }, + { + "epoch": 2.99, + "grad_norm": 0.7239246964454651, + "learning_rate": 2.3136648377497512e-08, + "loss": 2.9021, + "step": 60973 + }, + { + "epoch": 2.99, + "grad_norm": 0.7216629981994629, + "learning_rate": 2.294583390600735e-08, + "loss": 2.932, + "step": 60974 + }, + { + "epoch": 2.99, + "grad_norm": 0.7417978048324585, + "learning_rate": 2.2755809516350475e-08, + "loss": 3.0051, + "step": 60975 + }, + { + "epoch": 2.99, + "grad_norm": 0.7490457892417908, + "learning_rate": 2.2566575208993187e-08, + "loss": 2.5854, + "step": 60976 + }, + { + "epoch": 2.99, + "grad_norm": 0.7109230756759644, + "learning_rate": 2.2378130984501695e-08, + "loss": 2.6226, + "step": 60977 + }, + { + "epoch": 2.99, + "grad_norm": 0.7185120582580566, + "learning_rate": 2.2190476843308992e-08, + "loss": 2.9134, + "step": 60978 + }, + { + "epoch": 2.99, + "grad_norm": 0.7175775170326233, + "learning_rate": 2.2003612785947977e-08, + "loss": 2.7592, + "step": 60979 + }, + { + "epoch": 2.99, + "grad_norm": 0.7656946182250977, + "learning_rate": 2.181753881288495e-08, + "loss": 3.0205, + "step": 60980 + }, + { + "epoch": 2.99, + "grad_norm": 0.7306361198425293, + "learning_rate": 2.1632254924619507e-08, + "loss": 2.7981, + "step": 60981 + }, + { + "epoch": 2.99, + "grad_norm": 0.8056076169013977, + "learning_rate": 2.1447761121651254e-08, + "loss": 3.096, + "step": 60982 + }, + { + "epoch": 2.99, + "grad_norm": 0.7194265127182007, + "learning_rate": 2.1264057404446476e-08, + "loss": 2.9004, + "step": 60983 + }, + { + "epoch": 2.99, + "grad_norm": 0.7339944243431091, + "learning_rate": 2.108114377353809e-08, + "loss": 2.9615, + "step": 60984 + }, + { + "epoch": 2.99, + "grad_norm": 0.7202855348587036, + "learning_rate": 2.089902022932577e-08, + "loss": 3.0398, + "step": 60985 + }, + { + "epoch": 2.99, + "grad_norm": 0.7589322328567505, + "learning_rate": 2.0717686772342422e-08, + "loss": 3.0222, + "step": 60986 + }, + { + "epoch": 2.99, + "grad_norm": 0.7830872535705566, + "learning_rate": 2.0537143403087653e-08, + "loss": 2.7202, + "step": 60987 + }, + { + "epoch": 2.99, + "grad_norm": 0.7454860210418701, + "learning_rate": 2.0357390121961138e-08, + "loss": 2.89, + "step": 60988 + }, + { + "epoch": 2.99, + "grad_norm": 0.7104257941246033, + "learning_rate": 2.017842692952909e-08, + "loss": 2.9121, + "step": 60989 + }, + { + "epoch": 2.99, + "grad_norm": 0.7989444136619568, + "learning_rate": 2.0000253826157885e-08, + "loss": 2.8338, + "step": 60990 + }, + { + "epoch": 2.99, + "grad_norm": 0.7443826794624329, + "learning_rate": 1.9822870812413738e-08, + "loss": 2.7031, + "step": 60991 + }, + { + "epoch": 2.99, + "grad_norm": 0.7373210191726685, + "learning_rate": 1.9646277888729635e-08, + "loss": 2.858, + "step": 60992 + }, + { + "epoch": 2.99, + "grad_norm": 0.7563875317573547, + "learning_rate": 1.9470475055538558e-08, + "loss": 2.8906, + "step": 60993 + }, + { + "epoch": 2.99, + "grad_norm": 0.7365525960922241, + "learning_rate": 1.9295462313340116e-08, + "loss": 2.7371, + "step": 60994 + }, + { + "epoch": 2.99, + "grad_norm": 0.8209661841392517, + "learning_rate": 1.9121239662600595e-08, + "loss": 2.8336, + "step": 60995 + }, + { + "epoch": 2.99, + "grad_norm": 0.7196007370948792, + "learning_rate": 1.894780710371968e-08, + "loss": 3.0186, + "step": 60996 + }, + { + "epoch": 2.99, + "grad_norm": 0.7250151634216309, + "learning_rate": 1.8775164637230278e-08, + "loss": 2.6314, + "step": 60997 + }, + { + "epoch": 2.99, + "grad_norm": 0.7289470434188843, + "learning_rate": 1.8603312263565374e-08, + "loss": 3.0658, + "step": 60998 + }, + { + "epoch": 2.99, + "grad_norm": 0.7411735653877258, + "learning_rate": 1.8432249983157954e-08, + "loss": 2.9024, + "step": 60999 + }, + { + "epoch": 2.99, + "grad_norm": 0.7378327250480652, + "learning_rate": 1.8261977796441007e-08, + "loss": 2.8826, + "step": 61000 + }, + { + "epoch": 2.99, + "grad_norm": 0.8322823643684387, + "learning_rate": 1.8092495703914134e-08, + "loss": 2.8086, + "step": 61001 + }, + { + "epoch": 2.99, + "grad_norm": 0.7469660043716431, + "learning_rate": 1.792380370601032e-08, + "loss": 2.8937, + "step": 61002 + }, + { + "epoch": 2.99, + "grad_norm": 0.6969466209411621, + "learning_rate": 1.7755901803129246e-08, + "loss": 2.7511, + "step": 61003 + }, + { + "epoch": 2.99, + "grad_norm": 0.7883122563362122, + "learning_rate": 1.7588789995770515e-08, + "loss": 2.7053, + "step": 61004 + }, + { + "epoch": 2.99, + "grad_norm": 0.7031104564666748, + "learning_rate": 1.7422468284333802e-08, + "loss": 2.7567, + "step": 61005 + }, + { + "epoch": 2.99, + "grad_norm": 0.7445712089538574, + "learning_rate": 1.7256936669285402e-08, + "loss": 2.9512, + "step": 61006 + }, + { + "epoch": 2.99, + "grad_norm": 0.7441263794898987, + "learning_rate": 1.709219515105831e-08, + "loss": 3.0192, + "step": 61007 + }, + { + "epoch": 2.99, + "grad_norm": 0.7398879528045654, + "learning_rate": 1.6928243730052194e-08, + "loss": 2.6434, + "step": 61008 + }, + { + "epoch": 2.99, + "grad_norm": 0.8084988594055176, + "learning_rate": 1.676508240676666e-08, + "loss": 2.6561, + "step": 61009 + }, + { + "epoch": 2.99, + "grad_norm": 0.7381051778793335, + "learning_rate": 1.6602711181534778e-08, + "loss": 3.1026, + "step": 61010 + }, + { + "epoch": 2.99, + "grad_norm": 0.77137690782547, + "learning_rate": 1.6441130054889452e-08, + "loss": 2.9732, + "step": 61011 + }, + { + "epoch": 2.99, + "grad_norm": 0.7231003046035767, + "learning_rate": 1.6280339027163747e-08, + "loss": 2.6597, + "step": 61012 + }, + { + "epoch": 2.99, + "grad_norm": 0.740614116191864, + "learning_rate": 1.612033809882396e-08, + "loss": 2.9157, + "step": 61013 + }, + { + "epoch": 2.99, + "grad_norm": 0.7222123146057129, + "learning_rate": 1.5961127270336382e-08, + "loss": 2.9297, + "step": 61014 + }, + { + "epoch": 2.99, + "grad_norm": 0.7570497989654541, + "learning_rate": 1.5802706542034083e-08, + "loss": 2.7988, + "step": 61015 + }, + { + "epoch": 2.99, + "grad_norm": 0.7413751482963562, + "learning_rate": 1.5645075914383352e-08, + "loss": 2.7553, + "step": 61016 + }, + { + "epoch": 2.99, + "grad_norm": 0.715862512588501, + "learning_rate": 1.5488235387783877e-08, + "loss": 3.0391, + "step": 61017 + }, + { + "epoch": 2.99, + "grad_norm": 0.782159686088562, + "learning_rate": 1.5332184962668638e-08, + "loss": 2.856, + "step": 61018 + }, + { + "epoch": 2.99, + "grad_norm": 0.7552375197410583, + "learning_rate": 1.517692463940401e-08, + "loss": 2.7686, + "step": 61019 + }, + { + "epoch": 2.99, + "grad_norm": 0.7790356874465942, + "learning_rate": 1.5022454418456285e-08, + "loss": 3.0508, + "step": 61020 + }, + { + "epoch": 2.99, + "grad_norm": 0.7251024842262268, + "learning_rate": 1.486877430022515e-08, + "loss": 2.9912, + "step": 61021 + }, + { + "epoch": 2.99, + "grad_norm": 0.7824087142944336, + "learning_rate": 1.4715884285076974e-08, + "loss": 2.7077, + "step": 61022 + }, + { + "epoch": 2.99, + "grad_norm": 0.7169434428215027, + "learning_rate": 1.4563784373411435e-08, + "loss": 2.813, + "step": 61023 + }, + { + "epoch": 2.99, + "grad_norm": 0.7796780467033386, + "learning_rate": 1.441247456569483e-08, + "loss": 2.8747, + "step": 61024 + }, + { + "epoch": 2.99, + "grad_norm": 0.756627082824707, + "learning_rate": 1.4261954862260228e-08, + "loss": 2.8956, + "step": 61025 + }, + { + "epoch": 2.99, + "grad_norm": 0.7208350300788879, + "learning_rate": 1.4112225263507304e-08, + "loss": 2.7472, + "step": 61026 + }, + { + "epoch": 2.99, + "grad_norm": 0.7269884347915649, + "learning_rate": 1.3963285769869049e-08, + "loss": 2.8458, + "step": 61027 + }, + { + "epoch": 2.99, + "grad_norm": 0.7311272621154785, + "learning_rate": 1.3815136381711833e-08, + "loss": 2.8329, + "step": 61028 + }, + { + "epoch": 2.99, + "grad_norm": 0.7485470175743103, + "learning_rate": 1.366777709943534e-08, + "loss": 3.0685, + "step": 61029 + }, + { + "epoch": 2.99, + "grad_norm": 0.7784717679023743, + "learning_rate": 1.3521207923439247e-08, + "loss": 2.9271, + "step": 61030 + }, + { + "epoch": 2.99, + "grad_norm": 0.7870509624481201, + "learning_rate": 1.3375428854089931e-08, + "loss": 2.8426, + "step": 61031 + }, + { + "epoch": 2.99, + "grad_norm": 0.7719137072563171, + "learning_rate": 1.3230439891753763e-08, + "loss": 2.9635, + "step": 61032 + }, + { + "epoch": 2.99, + "grad_norm": 0.7624124884605408, + "learning_rate": 1.3086241036863732e-08, + "loss": 3.0412, + "step": 61033 + }, + { + "epoch": 2.99, + "grad_norm": 0.7434390187263489, + "learning_rate": 1.2942832289752903e-08, + "loss": 2.7478, + "step": 61034 + }, + { + "epoch": 2.99, + "grad_norm": 0.7709497213363647, + "learning_rate": 1.2800213650820957e-08, + "loss": 2.8803, + "step": 61035 + }, + { + "epoch": 2.99, + "grad_norm": 0.7721742987632751, + "learning_rate": 1.2658385120434266e-08, + "loss": 2.8181, + "step": 61036 + }, + { + "epoch": 2.99, + "grad_norm": 0.7293633222579956, + "learning_rate": 1.2517346698959207e-08, + "loss": 2.9901, + "step": 61037 + }, + { + "epoch": 2.99, + "grad_norm": 0.7672916650772095, + "learning_rate": 1.2377098386828766e-08, + "loss": 2.7232, + "step": 61038 + }, + { + "epoch": 2.99, + "grad_norm": 0.7433645725250244, + "learning_rate": 1.2237640184309394e-08, + "loss": 2.8997, + "step": 61039 + }, + { + "epoch": 2.99, + "grad_norm": 0.7337106466293335, + "learning_rate": 1.209897209183408e-08, + "loss": 2.7652, + "step": 61040 + }, + { + "epoch": 2.99, + "grad_norm": 0.74492347240448, + "learning_rate": 1.1961094109769199e-08, + "loss": 3.0415, + "step": 61041 + }, + { + "epoch": 2.99, + "grad_norm": 0.751065731048584, + "learning_rate": 1.1824006238481121e-08, + "loss": 2.6755, + "step": 61042 + }, + { + "epoch": 2.99, + "grad_norm": 0.7474420666694641, + "learning_rate": 1.1687708478302915e-08, + "loss": 2.7026, + "step": 61043 + }, + { + "epoch": 2.99, + "grad_norm": 0.7618042826652527, + "learning_rate": 1.1552200829600955e-08, + "loss": 2.8862, + "step": 61044 + }, + { + "epoch": 2.99, + "grad_norm": 0.7556697130203247, + "learning_rate": 1.1417483292708308e-08, + "loss": 2.9045, + "step": 61045 + }, + { + "epoch": 2.99, + "grad_norm": 0.7072124481201172, + "learning_rate": 1.1283555868057958e-08, + "loss": 2.8852, + "step": 61046 + }, + { + "epoch": 2.99, + "grad_norm": 0.7252053618431091, + "learning_rate": 1.1150418555916362e-08, + "loss": 2.6886, + "step": 61047 + }, + { + "epoch": 2.99, + "grad_norm": 0.7440884709358215, + "learning_rate": 1.1018071356683201e-08, + "loss": 2.9452, + "step": 61048 + }, + { + "epoch": 2.99, + "grad_norm": 0.7469719052314758, + "learning_rate": 1.0886514270691538e-08, + "loss": 3.1659, + "step": 61049 + }, + { + "epoch": 2.99, + "grad_norm": 0.771772563457489, + "learning_rate": 1.0755747298307748e-08, + "loss": 2.9415, + "step": 61050 + }, + { + "epoch": 2.99, + "grad_norm": 0.7458013892173767, + "learning_rate": 1.0625770439831594e-08, + "loss": 2.8189, + "step": 61051 + }, + { + "epoch": 2.99, + "grad_norm": 0.749146044254303, + "learning_rate": 1.0496583695629447e-08, + "loss": 2.8342, + "step": 61052 + }, + { + "epoch": 2.99, + "grad_norm": 0.7614862322807312, + "learning_rate": 1.0368187066067679e-08, + "loss": 2.9507, + "step": 61053 + }, + { + "epoch": 2.99, + "grad_norm": 0.7297611832618713, + "learning_rate": 1.0240580551479361e-08, + "loss": 2.8405, + "step": 61054 + }, + { + "epoch": 2.99, + "grad_norm": 0.764214277267456, + "learning_rate": 1.0113764152130944e-08, + "loss": 2.7654, + "step": 61055 + }, + { + "epoch": 2.99, + "grad_norm": 0.7660698890686035, + "learning_rate": 9.987737868455415e-09, + "loss": 2.8231, + "step": 61056 + }, + { + "epoch": 2.99, + "grad_norm": 0.7836101055145264, + "learning_rate": 9.862501700719227e-09, + "loss": 2.6716, + "step": 61057 + }, + { + "epoch": 2.99, + "grad_norm": 0.7371546626091003, + "learning_rate": 9.73805564925545e-09, + "loss": 2.7436, + "step": 61058 + }, + { + "epoch": 2.99, + "grad_norm": 0.7891651391983032, + "learning_rate": 9.614399714430455e-09, + "loss": 2.951, + "step": 61059 + }, + { + "epoch": 2.99, + "grad_norm": 0.7586222290992737, + "learning_rate": 9.491533896544002e-09, + "loss": 2.8016, + "step": 61060 + }, + { + "epoch": 2.99, + "grad_norm": 0.7199023365974426, + "learning_rate": 9.369458195929158e-09, + "loss": 2.7525, + "step": 61061 + }, + { + "epoch": 2.99, + "grad_norm": 0.7390625476837158, + "learning_rate": 9.248172612885686e-09, + "loss": 3.1196, + "step": 61062 + }, + { + "epoch": 2.99, + "grad_norm": 0.7458911538124084, + "learning_rate": 9.127677147746649e-09, + "loss": 2.8515, + "step": 61063 + }, + { + "epoch": 2.99, + "grad_norm": 0.7433958053588867, + "learning_rate": 9.007971800845115e-09, + "loss": 2.6842, + "step": 61064 + }, + { + "epoch": 2.99, + "grad_norm": 0.7755656242370605, + "learning_rate": 8.889056572480847e-09, + "loss": 2.8729, + "step": 61065 + }, + { + "epoch": 2.99, + "grad_norm": 0.730906069278717, + "learning_rate": 8.7709314629536e-09, + "loss": 2.811, + "step": 61066 + }, + { + "epoch": 2.99, + "grad_norm": 0.7480701804161072, + "learning_rate": 8.653596472596447e-09, + "loss": 2.8485, + "step": 61067 + }, + { + "epoch": 2.99, + "grad_norm": 0.7432823777198792, + "learning_rate": 8.537051601709144e-09, + "loss": 2.8094, + "step": 61068 + }, + { + "epoch": 2.99, + "grad_norm": 0.796856701374054, + "learning_rate": 8.421296850591452e-09, + "loss": 2.7903, + "step": 61069 + }, + { + "epoch": 2.99, + "grad_norm": 0.7544580698013306, + "learning_rate": 8.306332219576439e-09, + "loss": 2.7556, + "step": 61070 + }, + { + "epoch": 2.99, + "grad_norm": 0.7667417526245117, + "learning_rate": 8.192157708930558e-09, + "loss": 2.8409, + "step": 61071 + }, + { + "epoch": 2.99, + "grad_norm": 0.7658982872962952, + "learning_rate": 8.078773318986875e-09, + "loss": 2.8028, + "step": 61072 + }, + { + "epoch": 2.99, + "grad_norm": 0.7310402989387512, + "learning_rate": 7.966179050045152e-09, + "loss": 2.904, + "step": 61073 + }, + { + "epoch": 2.99, + "grad_norm": 0.7373026609420776, + "learning_rate": 7.854374902338533e-09, + "loss": 3.0008, + "step": 61074 + }, + { + "epoch": 2.99, + "grad_norm": 0.732184886932373, + "learning_rate": 7.743360876233395e-09, + "loss": 2.732, + "step": 61075 + }, + { + "epoch": 2.99, + "grad_norm": 0.7509750723838806, + "learning_rate": 7.633136972029497e-09, + "loss": 2.9609, + "step": 61076 + }, + { + "epoch": 2.99, + "grad_norm": 0.8037346601486206, + "learning_rate": 7.523703189926677e-09, + "loss": 2.8011, + "step": 61077 + }, + { + "epoch": 2.99, + "grad_norm": 0.7447194457054138, + "learning_rate": 7.415059530324619e-09, + "loss": 3.1165, + "step": 61078 + }, + { + "epoch": 2.99, + "grad_norm": 0.7197108268737793, + "learning_rate": 7.30720599342316e-09, + "loss": 2.8209, + "step": 61079 + }, + { + "epoch": 2.99, + "grad_norm": 0.7522173523902893, + "learning_rate": 7.200142579555368e-09, + "loss": 3.2279, + "step": 61080 + }, + { + "epoch": 2.99, + "grad_norm": 0.7585819363594055, + "learning_rate": 7.093869288987697e-09, + "loss": 2.8398, + "step": 61081 + }, + { + "epoch": 2.99, + "grad_norm": 0.741621732711792, + "learning_rate": 6.9883861219866e-09, + "loss": 2.9234, + "step": 61082 + }, + { + "epoch": 2.99, + "grad_norm": 0.758293092250824, + "learning_rate": 6.883693078851838e-09, + "loss": 2.9133, + "step": 61083 + }, + { + "epoch": 2.99, + "grad_norm": 0.7666619420051575, + "learning_rate": 6.779790159849863e-09, + "loss": 2.7775, + "step": 61084 + }, + { + "epoch": 2.99, + "grad_norm": 0.7205373644828796, + "learning_rate": 6.67667736524713e-09, + "loss": 3.0212, + "step": 61085 + }, + { + "epoch": 2.99, + "grad_norm": 0.7747965455055237, + "learning_rate": 6.574354695343398e-09, + "loss": 2.7993, + "step": 61086 + }, + { + "epoch": 2.99, + "grad_norm": 0.7606662511825562, + "learning_rate": 6.472822150371815e-09, + "loss": 2.7932, + "step": 61087 + }, + { + "epoch": 2.99, + "grad_norm": 0.7349268198013306, + "learning_rate": 6.372079730598833e-09, + "loss": 2.9177, + "step": 61088 + }, + { + "epoch": 2.99, + "grad_norm": 0.7886388897895813, + "learning_rate": 6.272127436324215e-09, + "loss": 2.8516, + "step": 61089 + }, + { + "epoch": 2.99, + "grad_norm": 0.733219563961029, + "learning_rate": 6.172965267814411e-09, + "loss": 2.7334, + "step": 61090 + }, + { + "epoch": 2.99, + "grad_norm": 0.8257836699485779, + "learning_rate": 6.074593225302571e-09, + "loss": 2.939, + "step": 61091 + }, + { + "epoch": 2.99, + "grad_norm": 0.7202447056770325, + "learning_rate": 5.9770113090218396e-09, + "loss": 2.8128, + "step": 61092 + }, + { + "epoch": 2.99, + "grad_norm": 0.7605451941490173, + "learning_rate": 5.880219519305285e-09, + "loss": 2.9223, + "step": 61093 + }, + { + "epoch": 2.99, + "grad_norm": 0.7240636348724365, + "learning_rate": 5.784217856352746e-09, + "loss": 3.0476, + "step": 61094 + }, + { + "epoch": 2.99, + "grad_norm": 0.7533585429191589, + "learning_rate": 5.6890063203973715e-09, + "loss": 2.95, + "step": 61095 + }, + { + "epoch": 2.99, + "grad_norm": 0.724474310874939, + "learning_rate": 5.594584911772227e-09, + "loss": 3.0003, + "step": 61096 + }, + { + "epoch": 2.99, + "grad_norm": 0.7521463632583618, + "learning_rate": 5.5009536306438455e-09, + "loss": 2.5749, + "step": 61097 + }, + { + "epoch": 2.99, + "grad_norm": 0.7722815275192261, + "learning_rate": 5.408112477278681e-09, + "loss": 2.865, + "step": 61098 + }, + { + "epoch": 2.99, + "grad_norm": 0.776710033416748, + "learning_rate": 5.316061451976494e-09, + "loss": 2.9713, + "step": 61099 + }, + { + "epoch": 2.99, + "grad_norm": 0.7619010806083679, + "learning_rate": 5.2248005549038184e-09, + "loss": 3.0012, + "step": 61100 + }, + { + "epoch": 2.99, + "grad_norm": 0.7714282870292664, + "learning_rate": 5.134329786327107e-09, + "loss": 2.6743, + "step": 61101 + }, + { + "epoch": 2.99, + "grad_norm": 0.707321286201477, + "learning_rate": 5.044649146512814e-09, + "loss": 2.9208, + "step": 61102 + }, + { + "epoch": 2.99, + "grad_norm": 0.7693123817443848, + "learning_rate": 4.955758635694085e-09, + "loss": 2.8076, + "step": 61103 + }, + { + "epoch": 2.99, + "grad_norm": 0.7465909123420715, + "learning_rate": 4.867658254037454e-09, + "loss": 2.8324, + "step": 61104 + }, + { + "epoch": 2.99, + "grad_norm": 0.732196033000946, + "learning_rate": 4.780348001842682e-09, + "loss": 2.9903, + "step": 61105 + }, + { + "epoch": 2.99, + "grad_norm": 0.7602418065071106, + "learning_rate": 4.693827879342915e-09, + "loss": 2.9082, + "step": 61106 + }, + { + "epoch": 2.99, + "grad_norm": 0.7443466782569885, + "learning_rate": 4.608097886737994e-09, + "loss": 2.9654, + "step": 61107 + }, + { + "epoch": 2.99, + "grad_norm": 0.7329009175300598, + "learning_rate": 4.523158024227758e-09, + "loss": 2.9078, + "step": 61108 + }, + { + "epoch": 2.99, + "grad_norm": 0.8062663078308105, + "learning_rate": 4.439008292111967e-09, + "loss": 2.8382, + "step": 61109 + }, + { + "epoch": 2.99, + "grad_norm": 0.7804901003837585, + "learning_rate": 4.355648690557157e-09, + "loss": 2.9814, + "step": 61110 + }, + { + "epoch": 2.99, + "grad_norm": 0.7507225871086121, + "learning_rate": 4.273079219763165e-09, + "loss": 2.7954, + "step": 61111 + }, + { + "epoch": 2.99, + "grad_norm": 0.7602078914642334, + "learning_rate": 4.191299879996446e-09, + "loss": 2.7327, + "step": 61112 + }, + { + "epoch": 3.0, + "grad_norm": 0.7694182991981506, + "learning_rate": 4.110310671490147e-09, + "loss": 2.9445, + "step": 61113 + }, + { + "epoch": 3.0, + "grad_norm": 0.7346636652946472, + "learning_rate": 4.030111594377494e-09, + "loss": 2.5756, + "step": 61114 + }, + { + "epoch": 3.0, + "grad_norm": 0.7528190612792969, + "learning_rate": 3.950702648924942e-09, + "loss": 2.7316, + "step": 61115 + }, + { + "epoch": 3.0, + "grad_norm": 0.7616682052612305, + "learning_rate": 3.872083835332329e-09, + "loss": 2.8147, + "step": 61116 + }, + { + "epoch": 3.0, + "grad_norm": 0.7209632396697998, + "learning_rate": 3.794255153799497e-09, + "loss": 2.9357, + "step": 61117 + }, + { + "epoch": 3.0, + "grad_norm": 0.7706297039985657, + "learning_rate": 3.717216604559592e-09, + "loss": 2.9647, + "step": 61118 + }, + { + "epoch": 3.0, + "grad_norm": 0.7737950682640076, + "learning_rate": 3.6409681877791475e-09, + "loss": 2.6065, + "step": 61119 + }, + { + "epoch": 3.0, + "grad_norm": 0.7609835863113403, + "learning_rate": 3.5655099036580036e-09, + "loss": 2.9106, + "step": 61120 + }, + { + "epoch": 3.0, + "grad_norm": 0.774637758731842, + "learning_rate": 3.4908417524293075e-09, + "loss": 2.862, + "step": 61121 + }, + { + "epoch": 3.0, + "grad_norm": 0.8165029883384705, + "learning_rate": 3.4169637342595922e-09, + "loss": 2.9376, + "step": 61122 + }, + { + "epoch": 3.0, + "grad_norm": 0.817268967628479, + "learning_rate": 3.3438758493486984e-09, + "loss": 3.0128, + "step": 61123 + }, + { + "epoch": 3.0, + "grad_norm": 0.7474488615989685, + "learning_rate": 3.2715780979297723e-09, + "loss": 2.8821, + "step": 61124 + }, + { + "epoch": 3.0, + "grad_norm": 0.72201007604599, + "learning_rate": 3.200070480136041e-09, + "loss": 3.1782, + "step": 61125 + }, + { + "epoch": 3.0, + "grad_norm": 0.762576699256897, + "learning_rate": 3.1293529961673445e-09, + "loss": 2.8246, + "step": 61126 + }, + { + "epoch": 3.0, + "grad_norm": 0.8136159777641296, + "learning_rate": 3.0594256462235234e-09, + "loss": 2.9168, + "step": 61127 + }, + { + "epoch": 3.0, + "grad_norm": 0.7495198249816895, + "learning_rate": 2.9902884304711106e-09, + "loss": 2.7297, + "step": 61128 + }, + { + "epoch": 3.0, + "grad_norm": 0.7615594267845154, + "learning_rate": 2.921941349143253e-09, + "loss": 2.8838, + "step": 61129 + }, + { + "epoch": 3.0, + "grad_norm": 0.7518624067306519, + "learning_rate": 2.854384402373178e-09, + "loss": 3.0472, + "step": 61130 + }, + { + "epoch": 3.0, + "grad_norm": 0.7348370552062988, + "learning_rate": 2.7876175903274177e-09, + "loss": 2.8619, + "step": 61131 + }, + { + "epoch": 3.0, + "grad_norm": 0.7087485790252686, + "learning_rate": 2.7216409132058136e-09, + "loss": 3.0459, + "step": 61132 + }, + { + "epoch": 3.0, + "grad_norm": 0.8339840769767761, + "learning_rate": 2.6564543712082053e-09, + "loss": 2.7792, + "step": 61133 + }, + { + "epoch": 3.0, + "grad_norm": 0.7480267882347107, + "learning_rate": 2.5920579644678195e-09, + "loss": 2.9, + "step": 61134 + }, + { + "epoch": 3.0, + "grad_norm": 0.7389785647392273, + "learning_rate": 2.52845169315119e-09, + "loss": 2.9488, + "step": 61135 + }, + { + "epoch": 3.0, + "grad_norm": 0.7411351799964905, + "learning_rate": 2.4656355574581565e-09, + "loss": 2.8339, + "step": 61136 + }, + { + "epoch": 3.0, + "grad_norm": 0.8139830231666565, + "learning_rate": 2.4036095575219462e-09, + "loss": 2.8145, + "step": 61137 + }, + { + "epoch": 3.0, + "grad_norm": 0.7364872097969055, + "learning_rate": 2.342373693509092e-09, + "loss": 2.7267, + "step": 61138 + }, + { + "epoch": 3.0, + "grad_norm": 0.7832111120223999, + "learning_rate": 2.281927965586128e-09, + "loss": 2.8519, + "step": 61139 + }, + { + "epoch": 3.0, + "grad_norm": 0.7762227058410645, + "learning_rate": 2.2222723739528935e-09, + "loss": 2.7967, + "step": 61140 + }, + { + "epoch": 3.0, + "grad_norm": 0.7120994329452515, + "learning_rate": 2.1634069187093096e-09, + "loss": 2.878, + "step": 61141 + }, + { + "epoch": 3.0, + "grad_norm": 0.7865113019943237, + "learning_rate": 2.105331600021909e-09, + "loss": 2.6527, + "step": 61142 + }, + { + "epoch": 3.0, + "grad_norm": 0.7547078728675842, + "learning_rate": 2.0480464180572252e-09, + "loss": 3.0927, + "step": 61143 + }, + { + "epoch": 3.0, + "grad_norm": 0.7572701573371887, + "learning_rate": 1.991551372981792e-09, + "loss": 2.9788, + "step": 61144 + }, + { + "epoch": 3.0, + "grad_norm": 0.7272855639457703, + "learning_rate": 1.9358464648955295e-09, + "loss": 2.9688, + "step": 61145 + }, + { + "epoch": 3.0, + "grad_norm": 0.8515251278877258, + "learning_rate": 1.8809316939982775e-09, + "loss": 2.8112, + "step": 61146 + }, + { + "epoch": 3.0, + "grad_norm": 0.7926790714263916, + "learning_rate": 1.8268070603899565e-09, + "loss": 2.7903, + "step": 61147 + }, + { + "epoch": 3.0, + "grad_norm": 0.795349657535553, + "learning_rate": 1.7734725642370994e-09, + "loss": 2.7626, + "step": 61148 + }, + { + "epoch": 3.0, + "grad_norm": 0.7533791661262512, + "learning_rate": 1.7209282057062401e-09, + "loss": 2.8706, + "step": 61149 + }, + { + "epoch": 3.0, + "grad_norm": 0.7342902421951294, + "learning_rate": 1.6691739848972985e-09, + "loss": 2.7107, + "step": 61150 + }, + { + "epoch": 3.0, + "grad_norm": 0.7694773077964783, + "learning_rate": 1.6182099019435013e-09, + "loss": 2.9749, + "step": 61151 + }, + { + "epoch": 3.0, + "grad_norm": 0.7363452315330505, + "learning_rate": 1.5680359569780754e-09, + "loss": 2.7131, + "step": 61152 + }, + { + "epoch": 3.0, + "grad_norm": 0.6961015462875366, + "learning_rate": 1.5186521501675541e-09, + "loss": 2.7004, + "step": 61153 + }, + { + "epoch": 3.0, + "grad_norm": 0.7018082141876221, + "learning_rate": 1.4700584816118576e-09, + "loss": 2.9433, + "step": 61154 + }, + { + "epoch": 3.0, + "grad_norm": 0.6965725421905518, + "learning_rate": 1.4222549514442127e-09, + "loss": 3.0143, + "step": 61155 + }, + { + "epoch": 3.0, + "grad_norm": 0.7362546324729919, + "learning_rate": 1.3752415597978462e-09, + "loss": 2.652, + "step": 61156 + }, + { + "epoch": 3.0, + "grad_norm": 0.7468457818031311, + "learning_rate": 1.3290183068059845e-09, + "loss": 2.9477, + "step": 61157 + }, + { + "epoch": 3.0, + "grad_norm": 0.764543890953064, + "learning_rate": 1.283585192568548e-09, + "loss": 2.9168, + "step": 61158 + }, + { + "epoch": 3.0, + "grad_norm": 0.7274712920188904, + "learning_rate": 1.2389422172187636e-09, + "loss": 2.7423, + "step": 61159 + }, + { + "epoch": 3.0, + "grad_norm": 0.8610571026802063, + "learning_rate": 1.1950893808565509e-09, + "loss": 2.8983, + "step": 61160 + }, + { + "epoch": 3.0, + "grad_norm": 0.7543455362319946, + "learning_rate": 1.1520266836151371e-09, + "loss": 3.0349, + "step": 61161 + }, + { + "epoch": 3.0, + "grad_norm": 0.7443333864212036, + "learning_rate": 1.1097541256277487e-09, + "loss": 2.9159, + "step": 61162 + }, + { + "epoch": 3.0, + "grad_norm": 0.7243948578834534, + "learning_rate": 1.0682717069609993e-09, + "loss": 2.8332, + "step": 61163 + }, + { + "epoch": 3.0, + "grad_norm": 0.7481058239936829, + "learning_rate": 1.0275794277481154e-09, + "loss": 3.0435, + "step": 61164 + }, + { + "epoch": 3.0, + "grad_norm": 0.7074810862541199, + "learning_rate": 9.876772880890171e-10, + "loss": 2.8396, + "step": 61165 + }, + { + "epoch": 3.0, + "grad_norm": 0.743200421333313, + "learning_rate": 9.485652880836248e-10, + "loss": 2.8234, + "step": 61166 + }, + { + "epoch": 3.0, + "grad_norm": 0.7826438546180725, + "learning_rate": 9.102434278318582e-10, + "loss": 2.8359, + "step": 61167 + }, + { + "epoch": 3.0, + "grad_norm": 0.7499393820762634, + "learning_rate": 8.72711707500251e-10, + "loss": 2.7631, + "step": 61168 + }, + { + "epoch": 3.0, + "grad_norm": 0.7657136917114258, + "learning_rate": 8.359701270888031e-10, + "loss": 2.7475, + "step": 61169 + }, + { + "epoch": 3.0, + "grad_norm": 0.72947758436203, + "learning_rate": 8.000186867307413e-10, + "loss": 2.9276, + "step": 61170 + }, + { + "epoch": 3.0, + "grad_norm": 0.7392745018005371, + "learning_rate": 7.648573865592921e-10, + "loss": 2.9282, + "step": 61171 + }, + { + "epoch": 3.0, + "grad_norm": 0.7172539830207825, + "learning_rate": 7.304862266410693e-10, + "loss": 2.8242, + "step": 61172 + }, + { + "epoch": 3.0, + "grad_norm": 0.7763723731040955, + "learning_rate": 6.969052070426862e-10, + "loss": 3.1106, + "step": 61173 + }, + { + "epoch": 3.0, + "grad_norm": 0.7424915432929993, + "learning_rate": 6.641143278973693e-10, + "loss": 3.0323, + "step": 61174 + }, + { + "epoch": 3.0, + "grad_norm": 0.7958371043205261, + "learning_rate": 6.321135892384254e-10, + "loss": 2.972, + "step": 61175 + }, + { + "epoch": 3.0, + "grad_norm": 0.7552845478057861, + "learning_rate": 6.009029911990815e-10, + "loss": 2.764, + "step": 61176 + }, + { + "epoch": 3.0, + "grad_norm": 0.7244768142700195, + "learning_rate": 5.70482533812644e-10, + "loss": 3.1299, + "step": 61177 + }, + { + "epoch": 3.0, + "grad_norm": 0.7536823153495789, + "learning_rate": 5.408522172123397e-10, + "loss": 2.8117, + "step": 61178 + }, + { + "epoch": 3.0, + "grad_norm": 0.7438461780548096, + "learning_rate": 5.120120414314754e-10, + "loss": 2.8251, + "step": 61179 + }, + { + "epoch": 3.0, + "grad_norm": 0.767284095287323, + "learning_rate": 4.839620066032778e-10, + "loss": 2.6252, + "step": 61180 + }, + { + "epoch": 3.0, + "grad_norm": 0.755458652973175, + "learning_rate": 4.5670211272774705e-10, + "loss": 2.9643, + "step": 61181 + }, + { + "epoch": 3.0, + "grad_norm": 0.7305696606636047, + "learning_rate": 4.3023235993810966e-10, + "loss": 3.0228, + "step": 61182 + }, + { + "epoch": 3.0, + "grad_norm": 0.7119986414909363, + "learning_rate": 4.0455274826767246e-10, + "loss": 2.8719, + "step": 61183 + }, + { + "epoch": 3.0, + "grad_norm": 0.7614222764968872, + "learning_rate": 3.7966327778304883e-10, + "loss": 2.9814, + "step": 61184 + }, + { + "epoch": 3.0, + "grad_norm": 0.8122992515563965, + "learning_rate": 3.555639485841588e-10, + "loss": 2.9432, + "step": 61185 + }, + { + "epoch": 3.0, + "grad_norm": 0.7403243780136108, + "learning_rate": 3.322547607043091e-10, + "loss": 2.8608, + "step": 61186 + }, + { + "epoch": 3.0, + "grad_norm": 0.7525769472122192, + "learning_rate": 3.0973571421011313e-10, + "loss": 2.9349, + "step": 61187 + }, + { + "epoch": 3.0, + "grad_norm": 0.7958530187606812, + "learning_rate": 2.880068091681842e-10, + "loss": 3.0496, + "step": 61188 + }, + { + "epoch": 3.0, + "grad_norm": 0.7971166968345642, + "learning_rate": 2.67068045611829e-10, + "loss": 3.0409, + "step": 61189 + }, + { + "epoch": 3.0, + "grad_norm": 0.7553383708000183, + "learning_rate": 2.469194236409677e-10, + "loss": 2.8486, + "step": 61190 + }, + { + "epoch": 3.0, + "grad_norm": 0.7261549830436707, + "learning_rate": 2.2756094325560025e-10, + "loss": 2.9742, + "step": 61191 + }, + { + "epoch": 3.0, + "grad_norm": 0.7534186840057373, + "learning_rate": 2.089926045556467e-10, + "loss": 2.5935, + "step": 61192 + }, + { + "epoch": 3.0, + "grad_norm": 0.7727741003036499, + "learning_rate": 1.9121440757441376e-10, + "loss": 2.9467, + "step": 61193 + }, + { + "epoch": 3.0, + "grad_norm": 0.7001724243164062, + "learning_rate": 1.742263523452081e-10, + "loss": 2.7758, + "step": 61194 + }, + { + "epoch": 3.0, + "grad_norm": 0.7367728352546692, + "learning_rate": 1.5802843893464312e-10, + "loss": 2.7828, + "step": 61195 + }, + { + "epoch": 3.0, + "grad_norm": 0.7262899875640869, + "learning_rate": 1.4262066734271881e-10, + "loss": 2.8854, + "step": 61196 + }, + { + "epoch": 3.0, + "grad_norm": 0.7722108364105225, + "learning_rate": 1.2800303766935526e-10, + "loss": 2.9377, + "step": 61197 + }, + { + "epoch": 3.0, + "grad_norm": 0.774972677230835, + "learning_rate": 1.1417554991455247e-10, + "loss": 2.8263, + "step": 61198 + }, + { + "epoch": 3.0, + "grad_norm": 0.7445690035820007, + "learning_rate": 1.011382041116171e-10, + "loss": 2.9051, + "step": 61199 + }, + { + "epoch": 3.0, + "grad_norm": 0.7341079115867615, + "learning_rate": 8.889100032716257e-11, + "loss": 2.925, + "step": 61200 + }, + { + "epoch": 3.0, + "grad_norm": 0.7602269649505615, + "learning_rate": 7.743393852788216e-11, + "loss": 2.9037, + "step": 61201 + }, + { + "epoch": 3.0, + "grad_norm": 0.7879922986030579, + "learning_rate": 6.676701884700264e-11, + "loss": 2.7574, + "step": 61202 + }, + { + "epoch": 3.0, + "grad_norm": 0.7422546148300171, + "learning_rate": 5.689024121791064e-11, + "loss": 2.8568, + "step": 61203 + }, + { + "epoch": 3.0, + "grad_norm": 0.7315881252288818, + "learning_rate": 4.7803605707219525e-11, + "loss": 2.8086, + "step": 61204 + }, + { + "epoch": 3.0, + "grad_norm": 0.7382916212081909, + "learning_rate": 3.9507112348236e-11, + "loss": 2.7754, + "step": 61205 + }, + { + "epoch": 3.0, + "grad_norm": 0.8105321526527405, + "learning_rate": 3.2000761107653375e-11, + "loss": 2.7572, + "step": 61206 + }, + { + "epoch": 3.0, + "grad_norm": 0.7308658361434937, + "learning_rate": 2.5284552085391706e-11, + "loss": 2.9413, + "step": 61207 + }, + { + "epoch": 3.0, + "grad_norm": 0.8163118362426758, + "learning_rate": 1.9358485248144318e-11, + "loss": 2.8004, + "step": 61208 + }, + { + "epoch": 3.0, + "grad_norm": 0.7237694263458252, + "learning_rate": 1.4222560629217894e-11, + "loss": 2.8694, + "step": 61209 + }, + { + "epoch": 3.0, + "grad_norm": 0.7595729231834412, + "learning_rate": 9.876778261919127e-12, + "loss": 2.8303, + "step": 61210 + }, + { + "epoch": 3.0, + "grad_norm": 0.7996221780776978, + "learning_rate": 6.3211380796346356e-12, + "loss": 3.0761, + "step": 61211 + }, + { + "epoch": 3.0, + "grad_norm": 0.793536901473999, + "learning_rate": 3.5556401822844914e-12, + "loss": 2.7739, + "step": 61212 + }, + { + "epoch": 3.0, + "grad_norm": 0.7907678484916687, + "learning_rate": 1.5802845365620042e-12, + "loss": 2.7948, + "step": 61213 + }, + { + "epoch": 3.0, + "grad_norm": 0.7259610891342163, + "learning_rate": 3.9507114246717374e-13, + "loss": 2.8744, + "step": 61214 + }, + { + "epoch": 3.0, + "grad_norm": 0.8341996669769287, + "learning_rate": 0.0, + "loss": 3.0732, + "step": 61215 + }, + { + "epoch": 3.0, + "step": 61215, + "total_flos": 1.1279892023790797e+17, + "train_loss": 3.1076881288265, + "train_runtime": 9220.7966, + "train_samples_per_second": 424.877, + "train_steps_per_second": 6.639 + } + ], + "logging_steps": 1.0, + "max_steps": 61215, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5000, + "total_flos": 1.1279892023790797e+17, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}